/src/gmp-6.2.1/mpn/mul_fft.c
Line | Count | Source |
1 | | /* Schoenhage's fast multiplication modulo 2^N+1. |
2 | | |
3 | | Contributed by Paul Zimmermann. |
4 | | |
5 | | THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY |
6 | | SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST |
7 | | GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. |
8 | | |
9 | | Copyright 1998-2010, 2012, 2013, 2018, 2020 Free Software Foundation, Inc. |
10 | | |
11 | | This file is part of the GNU MP Library. |
12 | | |
13 | | The GNU MP Library is free software; you can redistribute it and/or modify |
14 | | it under the terms of either: |
15 | | |
16 | | * the GNU Lesser General Public License as published by the Free |
17 | | Software Foundation; either version 3 of the License, or (at your |
18 | | option) any later version. |
19 | | |
20 | | or |
21 | | |
22 | | * the GNU General Public License as published by the Free Software |
23 | | Foundation; either version 2 of the License, or (at your option) any |
24 | | later version. |
25 | | |
26 | | or both in parallel, as here. |
27 | | |
28 | | The GNU MP Library is distributed in the hope that it will be useful, but |
29 | | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
30 | | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
31 | | for more details. |
32 | | |
33 | | You should have received copies of the GNU General Public License and the |
34 | | GNU Lesser General Public License along with the GNU MP Library. If not, |
35 | | see https://www.gnu.org/licenses/. */ |
36 | | |
37 | | |
38 | | /* References: |
39 | | |
40 | | Schnelle Multiplikation grosser Zahlen, by Arnold Schoenhage and Volker |
41 | | Strassen, Computing 7, p. 281-292, 1971. |
42 | | |
43 | | Asymptotically fast algorithms for the numerical multiplication and division |
44 | | of polynomials with complex coefficients, by Arnold Schoenhage, Computer |
45 | | Algebra, EUROCAM'82, LNCS 144, p. 3-15, 1982. |
46 | | |
47 | | Tapes versus Pointers, a study in implementing fast algorithms, by Arnold |
48 | | Schoenhage, Bulletin of the EATCS, 30, p. 23-32, 1986. |
49 | | |
50 | | TODO: |
51 | | |
52 | | Implement some of the tricks published at ISSAC'2007 by Gaudry, Kruppa, and |
53 | | Zimmermann. |
54 | | |
55 | | It might be possible to avoid a small number of MPN_COPYs by using a |
56 | | rotating temporary or two. |
57 | | |
58 | | Cleanup and simplify the code! |
59 | | */ |
60 | | |
61 | | #ifdef TRACE |
62 | | #undef TRACE |
63 | | #define TRACE(x) x |
64 | | #include <stdio.h> |
65 | | #else |
66 | | #define TRACE(x) |
67 | | #endif |
68 | | |
69 | | #include "gmp-impl.h" |
70 | | |
71 | | #ifdef WANT_ADDSUB |
72 | | #include "generic/add_n_sub_n.c" |
73 | | #define HAVE_NATIVE_mpn_add_n_sub_n 1 |
74 | | #endif |
75 | | |
76 | | static mp_limb_t mpn_mul_fft_internal (mp_ptr, mp_size_t, int, mp_ptr *, |
77 | | mp_ptr *, mp_ptr, mp_ptr, mp_size_t, |
78 | | mp_size_t, mp_size_t, int **, mp_ptr, int); |
79 | | static void mpn_mul_fft_decompose (mp_ptr, mp_ptr *, mp_size_t, mp_size_t, mp_srcptr, |
80 | | mp_size_t, mp_size_t, mp_size_t, mp_ptr); |
81 | | |
82 | | |
83 | | /* Find the best k to use for a mod 2^(m*GMP_NUMB_BITS)+1 FFT for m >= n. |
84 | | We have sqr=0 for a multiply, sqr=1 for a square. |
85 | | There are three generations of this code; we keep the old ones as long as |
86 | | some gmp-mparam.h files have not been updated. */ |
87 | | |
88 | | |
89 | | /*****************************************************************************/ |
90 | | |
91 | | #if TUNE_PROGRAM_BUILD || (defined (MUL_FFT_TABLE3) && defined (SQR_FFT_TABLE3)) |
92 | | |
93 | | #ifndef FFT_TABLE3_SIZE /* When tuning this is defined in gmp-impl.h */ |
94 | | #if defined (MUL_FFT_TABLE3_SIZE) && defined (SQR_FFT_TABLE3_SIZE) |
95 | | #if MUL_FFT_TABLE3_SIZE > SQR_FFT_TABLE3_SIZE |
96 | | #define FFT_TABLE3_SIZE MUL_FFT_TABLE3_SIZE |
97 | | #else |
98 | | #define FFT_TABLE3_SIZE SQR_FFT_TABLE3_SIZE |
99 | | #endif |
100 | | #endif |
101 | | #endif |
102 | | |
103 | | #ifndef FFT_TABLE3_SIZE |
104 | | #define FFT_TABLE3_SIZE 200 |
105 | | #endif |
106 | | |
107 | | FFT_TABLE_ATTRS struct fft_table_nk mpn_fft_table3[2][FFT_TABLE3_SIZE] = |
108 | | { |
109 | | MUL_FFT_TABLE3, |
110 | | SQR_FFT_TABLE3 |
111 | | }; |
112 | | |
113 | | int |
114 | | mpn_fft_best_k (mp_size_t n, int sqr) |
115 | 0 | { |
116 | 0 | const struct fft_table_nk *fft_tab, *tab; |
117 | 0 | mp_size_t tab_n, thres; |
118 | 0 | int last_k; |
119 | |
120 | 0 | fft_tab = mpn_fft_table3[sqr]; |
121 | 0 | last_k = fft_tab->k; |
122 | 0 | for (tab = fft_tab + 1; ; tab++) |
123 | 0 | { |
124 | 0 | tab_n = tab->n; |
125 | 0 | thres = tab_n << last_k; |
126 | 0 | if (n <= thres) |
127 | 0 | break; |
128 | 0 | last_k = tab->k; |
129 | 0 | } |
130 | 0 | return last_k; |
131 | 0 | } |
132 | | |
133 | | #define MPN_FFT_BEST_READY 1 |
134 | | #endif |
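To see how the threshold walk in mpn_fft_best_k behaves, here is a standalone sketch in plain C. The toy_nk table and its threshold values are hypothetical stand-ins for fft_table_nk and mpn_fft_table3, not values from any real gmp-mparam.h.

#include <stdio.h>

struct toy_nk { unsigned long n; int k; };

/* Same walk as mpn_fft_best_k above: advance while the operand size
   exceeds tab->n << last_k, keeping the previous entry's k. */
static int
toy_best_k (unsigned long n, const struct toy_nk *tab)
{
  int last_k = tab->k;
  for (tab++; ; tab++)
    {
      unsigned long thres = tab->n << last_k;
      if (n <= thres)
        break;
      last_k = tab->k;
    }
  return last_k;
}

int
main (void)
{
  /* hypothetical thresholds: k=4 up to 400<<4 limbs, then k=5, then k=6 */
  static const struct toy_nk tab[] =
    { {0, 4}, {400, 5}, {1600, 6}, {(unsigned long) -1, 6} };
  printf ("%d %d\n", toy_best_k (5000, tab), toy_best_k (10000, tab));
  /* prints "4 5": 5000 <= 400<<4 = 6400, while 10000 exceeds it */
  return 0;
}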
135 | | |
136 | | /*****************************************************************************/ |
137 | | |
138 | | #if ! defined (MPN_FFT_BEST_READY) |
139 | | FFT_TABLE_ATTRS mp_size_t mpn_fft_table[2][MPN_FFT_TABLE_SIZE] = |
140 | | { |
141 | | MUL_FFT_TABLE, |
142 | | SQR_FFT_TABLE |
143 | | }; |
144 | | |
145 | | int |
146 | | mpn_fft_best_k (mp_size_t n, int sqr) |
147 | | { |
148 | | int i; |
149 | | |
150 | | for (i = 0; mpn_fft_table[sqr][i] != 0; i++) |
151 | | if (n < mpn_fft_table[sqr][i]) |
152 | | return i + FFT_FIRST_K; |
153 | | |
154 | | /* treat 4*last as one further entry */ |
155 | | if (i == 0 || n < 4 * mpn_fft_table[sqr][i - 1]) |
156 | | return i + FFT_FIRST_K; |
157 | | else |
158 | | return i + FFT_FIRST_K + 1; |
159 | | } |
160 | | #endif |
161 | | |
162 | | /*****************************************************************************/ |
163 | | |
164 | | |
165 | | /* Returns the smallest possible number of limbs >= pl for an FFT of size 2^k, |
166 | | i.e. smallest multiple of 2^k >= pl. |
167 | | |
168 | | Don't declare static: needed by tuneup. |
169 | | */ |
170 | | |
171 | | mp_size_t |
172 | | mpn_fft_next_size (mp_size_t pl, int k) |
173 | 0 | { |
174 | 0 | pl = 1 + ((pl - 1) >> k); /* ceil (pl/2^k) */ |
175 | 0 | return pl << k; |
176 | 0 | } |
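A quick standalone check of the rounding (an illustrative sketch, not part of this file):

#include <stdio.h>

/* same arithmetic as mpn_fft_next_size above */
static long
next_size (long pl, int k)
{
  pl = 1 + ((pl - 1) >> k);   /* ceil (pl/2^k) */
  return pl << k;
}

int
main (void)
{
  printf ("%ld %ld\n", next_size (1000, 4), next_size (1024, 4));
  /* prints "1008 1024": 1008 = 63 * 16, and 1024 is already a multiple */
  return 0;
}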
177 | | |
178 | | |
179 | | /* Initialize l[i][j] with bitrev(j) */ |
180 | | static void |
181 | | mpn_fft_initl (int **l, int k) |
182 | 0 | { |
183 | 0 | int i, j, K; |
184 | 0 | int *li; |
185 | |
186 | 0 | l[0][0] = 0; |
187 | 0 | for (i = 1, K = 1; i <= k; i++, K *= 2) |
188 | 0 | { |
189 | 0 | li = l[i]; |
190 | 0 | for (j = 0; j < K; j++) |
191 | 0 | { |
192 | 0 | li[j] = 2 * l[i - 1][j]; |
193 | 0 | li[K + j] = 1 + li[j]; |
194 | 0 | } |
195 | 0 | } |
196 | 0 | } |
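The l[i] tables are bit-reversal permutations: l[k][j] is j with its k low bits reversed. A standalone run of the same recurrence, flattened for k = 3 (illustrative sketch):

#include <stdio.h>

int
main (void)
{
  int prev[8] = {0}, cur[8];
  int i, j, K;
  for (i = 1, K = 1; i <= 3; i++, K *= 2)
    {
      for (j = 0; j < K; j++)
        {
          cur[j] = 2 * prev[j];      /* li[j] = 2 * l[i-1][j] */
          cur[K + j] = 1 + cur[j];   /* li[K+j] = 1 + li[j]   */
        }
      for (j = 0; j < 2 * K; j++)
        prev[j] = cur[j];
    }
  for (j = 0; j < 8; j++)
    printf ("%d ", prev[j]);   /* 0 4 2 6 1 5 3 7 = bitrev(j) on 3 bits */
  putchar ('\n');
  return 0;
}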
197 | | |
198 | | |
199 | | /* r <- a*2^d mod 2^(n*GMP_NUMB_BITS)+1 with a = {a, n+1} |
200 | | Assumes a is semi-normalized, i.e. a[n] <= 1. |
201 | | r and a must have n+1 limbs, and not overlap. |
202 | | */ |
203 | | static void |
204 | | mpn_fft_mul_2exp_modF (mp_ptr r, mp_srcptr a, mp_bitcnt_t d, mp_size_t n) |
205 | 0 | { |
206 | 0 | unsigned int sh; |
207 | 0 | mp_size_t m; |
208 | 0 | mp_limb_t cc, rd; |
209 | |
210 | 0 | sh = d % GMP_NUMB_BITS; |
211 | 0 | m = d / GMP_NUMB_BITS; |
212 | |
213 | 0 | if (m >= n) /* negate */ |
214 | 0 | { |
215 | | /* r[0..m-1] <-- lshift(a[n-m]..a[n-1], sh) |
216 | | r[m..n-1] <-- -lshift(a[0]..a[n-m-1], sh) */ |
217 | |
218 | 0 | m -= n; |
219 | 0 | if (sh != 0) |
220 | 0 | { |
221 | | /* no out shift below since a[n] <= 1 */ |
222 | 0 | mpn_lshift (r, a + n - m, m + 1, sh); |
223 | 0 | rd = r[m]; |
224 | 0 | cc = mpn_lshiftc (r + m, a, n - m, sh); |
225 | 0 | } |
226 | 0 | else |
227 | 0 | { |
228 | 0 | MPN_COPY (r, a + n - m, m); |
229 | 0 | rd = a[n]; |
230 | 0 | mpn_com (r + m, a, n - m); |
231 | 0 | cc = 0; |
232 | 0 | } |
233 | | |
234 | | /* add cc to r[0], and add rd to r[m] */ |
235 | | |
236 | | /* now add 1 in r[m], subtract 1 in r[n], i.e. add 1 in r[0] */ |
237 | |
238 | 0 | r[n] = 0; |
239 | | /* cc < 2^sh <= 2^(GMP_NUMB_BITS-1) thus no overflow here */ |
240 | 0 | cc++; |
241 | 0 | mpn_incr_u (r, cc); |
242 | |
243 | 0 | rd++; |
244 | | /* rd might overflow when sh=GMP_NUMB_BITS-1 */ |
245 | 0 | cc = (rd == 0) ? 1 : rd; |
246 | 0 | r = r + m + (rd == 0); |
247 | 0 | mpn_incr_u (r, cc); |
248 | 0 | } |
249 | 0 | else |
250 | 0 | { |
251 | | /* r[0..m-1] <-- -lshift(a[n-m]..a[n-1], sh) |
252 | | r[m..n-1] <-- lshift(a[0]..a[n-m-1], sh) */ |
253 | 0 | if (sh != 0) |
254 | 0 | { |
255 | | /* no out bits below since a[n] <= 1 */ |
256 | 0 | mpn_lshiftc (r, a + n - m, m + 1, sh); |
257 | 0 | rd = ~r[m]; |
258 | | /* {r, m+1} = {a+n-m, m+1} << sh */ |
259 | 0 | cc = mpn_lshift (r + m, a, n - m, sh); /* {r+m, n-m} = {a, n-m}<<sh */ |
260 | 0 | } |
261 | 0 | else |
262 | 0 | { |
263 | | /* r[m] is not used below, but we save a test for m=0 */ |
264 | 0 | mpn_com (r, a + n - m, m + 1); |
265 | 0 | rd = a[n]; |
266 | 0 | MPN_COPY (r + m, a, n - m); |
267 | 0 | cc = 0; |
268 | 0 | } |
269 | | |
270 | | /* now complement {r, m}, subtract cc from r[0], subtract rd from r[m] */ |
271 | | |
272 | | /* if m=0 we just have r[0]=a[n] << sh */ |
273 | 0 | if (m != 0) |
274 | 0 | { |
275 | | /* now add 1 in r[0], subtract 1 in r[m] */ |
276 | 0 | if (cc-- == 0) /* then add 1 to r[0] */ |
277 | 0 | cc = mpn_add_1 (r, r, n, CNST_LIMB(1)); |
278 | 0 | cc = mpn_sub_1 (r, r, m, cc) + 1; |
279 | | /* add 1 to cc instead of rd since rd might overflow */ |
280 | 0 | } |
281 | | |
282 | | /* now subtract cc and rd from r[m..n] */ |
283 | |
284 | 0 | r[n] = -mpn_sub_1 (r + m, r + m, n - m, cc); |
285 | 0 | r[n] -= mpn_sub_1 (r + m, r + m, n - m, rd); |
286 | 0 | if (r[n] & GMP_LIMB_HIGHBIT) |
287 | 0 | r[n] = mpn_add_1 (r, r, n, CNST_LIMB(1)); |
288 | 0 | } |
289 | 0 | } |
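The wrap-and-negate logic above rests on 2^N == -1 (mod 2^N+1): bits shifted past position N come back negated. A scaled-down check with N = 8, modulus 257 (illustrative sketch):

#include <stdio.h>

int
main (void)
{
  unsigned long a = 200, d;
  for (d = 0; d <= 16; d += 8)
    printf ("200 * 2^%lu mod 257 = %lu\n", d, (a << d) % 257);
  /* prints 200, then 57 (= -200 mod 257), then 200 again: a shift of
     N bits negates, a shift of 2N bits is the identity */
  return 0;
}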
290 | | |
291 | | #if HAVE_NATIVE_mpn_add_n_sub_n |
292 | | static inline void |
293 | | mpn_fft_add_sub_modF (mp_ptr A0, mp_ptr Ai, mp_srcptr tp, mp_size_t n) |
294 | | { |
295 | | mp_limb_t cyas, c, x; |
296 | | |
297 | | cyas = mpn_add_n_sub_n (A0, Ai, A0, tp, n); |
298 | | |
299 | | c = A0[n] - tp[n] - (cyas & 1); |
300 | | x = (-c) & -((c & GMP_LIMB_HIGHBIT) != 0); |
301 | | Ai[n] = x + c; |
302 | | MPN_INCR_U (Ai, n + 1, x); |
303 | | |
304 | | c = A0[n] + tp[n] + (cyas >> 1); |
305 | | x = (c - 1) & -(c != 0); |
306 | | A0[n] = c - x; |
307 | | MPN_DECR_U (A0, n + 1, x); |
308 | | } |
309 | | |
310 | | #else /* ! HAVE_NATIVE_mpn_add_n_sub_n */ |
311 | | |
312 | | /* r <- a+b mod 2^(n*GMP_NUMB_BITS)+1. |
313 | | Assumes a and b are semi-normalized. |
314 | | */ |
315 | | static inline void |
316 | | mpn_fft_add_modF (mp_ptr r, mp_srcptr a, mp_srcptr b, mp_size_t n) |
317 | 0 | { |
318 | 0 | mp_limb_t c, x; |
319 | |
320 | 0 | c = a[n] + b[n] + mpn_add_n (r, a, b, n); |
321 | | /* 0 <= c <= 3 */ |
322 | |
323 | 0 | #if 1 |
324 | | /* GCC 4.1 outsmarts most expressions here, and generates a 50% branch. The |
325 | | result is slower code, of course. But the following outsmarts GCC. */ |
326 | 0 | x = (c - 1) & -(c != 0); |
327 | 0 | r[n] = c - x; |
328 | 0 | MPN_DECR_U (r, n + 1, x); |
329 | 0 | #endif |
330 | | #if 0 |
331 | | if (c > 1) |
332 | | { |
333 | | r[n] = 1; /* r[n] - c = 1 */ |
334 | | MPN_DECR_U (r, n + 1, c - 1); |
335 | | } |
336 | | else |
337 | | { |
338 | | r[n] = c; |
339 | | } |
340 | | #endif |
341 | 0 | } |
342 | | |
343 | | /* r <- a-b mod 2^(n*GMP_NUMB_BITS)+1. |
344 | | Assumes a and b are semi-normalized. |
345 | | */ |
346 | | static inline void |
347 | | mpn_fft_sub_modF (mp_ptr r, mp_srcptr a, mp_srcptr b, mp_size_t n) |
348 | 0 | { |
349 | 0 | mp_limb_t c, x; |
350 | |
351 | 0 | c = a[n] - b[n] - mpn_sub_n (r, a, b, n); |
352 | | /* -2 <= c <= 1 */ |
353 | |
354 | 0 | #if 1 |
355 | | /* GCC 4.1 outsmarts most expressions here, and generates a 50% branch. The |
356 | | result is slower code, of course. But the following outsmarts GCC. */ |
357 | 0 | x = (-c) & -((c & GMP_LIMB_HIGHBIT) != 0); |
358 | 0 | r[n] = x + c; |
359 | 0 | MPN_INCR_U (r, n + 1, x); |
360 | 0 | #endif |
361 | | #if 0 |
362 | | if ((c & GMP_LIMB_HIGHBIT) != 0) |
363 | | { |
364 | | r[n] = 0; |
365 | | MPN_INCR_U (r, n + 1, -c); |
366 | | } |
367 | | else |
368 | | { |
369 | | r[n] = c; |
370 | | } |
371 | | #endif |
372 | 0 | } |
373 | | #endif /* HAVE_NATIVE_mpn_add_n_sub_n */ |
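Both helpers fold the carry limb c back into the residue without branching. The mapping computed by x = (c - 1) & -(c != 0) in the addition case can be tabulated in isolation (illustrative sketch):

#include <stdio.h>

int
main (void)
{
  unsigned long c;
  for (c = 0; c <= 3; c++)
    {
      unsigned long x = (c - 1) & -(c != 0);
      printf ("c=%lu: r[n]=%lu, MPN_DECR_U by %lu\n", c, c - x, x);
    }
  /* c=0: (0,0)  c=1: (1,0)  c=2: (1,1)  c=3: (1,2).  Lowering r[n] by x
     and decrementing the low limbs by x subtracts x*(2^N+1), a multiple
     of the modulus, so the residue is unchanged. */
  return 0;
}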
374 | | |
375 | | /* input: A[0] ... A[inc*(K-1)] are residues mod 2^N+1 where |
376 | | N=n*GMP_NUMB_BITS, and 2^omega is a primitive root mod 2^N+1 |
377 | | output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1 */ |
378 | | |
379 | | static void |
380 | | mpn_fft_fft (mp_ptr *Ap, mp_size_t K, int **ll, |
381 | | mp_size_t omega, mp_size_t n, mp_size_t inc, mp_ptr tp) |
382 | 0 | { |
383 | 0 | if (K == 2) |
384 | 0 | { |
385 | 0 | mp_limb_t cy; |
386 | | #if HAVE_NATIVE_mpn_add_n_sub_n |
387 | | cy = mpn_add_n_sub_n (Ap[0], Ap[inc], Ap[0], Ap[inc], n + 1) & 1; |
388 | | #else |
389 | 0 | MPN_COPY (tp, Ap[0], n + 1); |
390 | 0 | mpn_add_n (Ap[0], Ap[0], Ap[inc], n + 1); |
391 | 0 | cy = mpn_sub_n (Ap[inc], tp, Ap[inc], n + 1); |
392 | 0 | #endif |
393 | 0 | if (Ap[0][n] > 1) /* can be 2 or 3 */ |
394 | 0 | Ap[0][n] = 1 - mpn_sub_1 (Ap[0], Ap[0], n, Ap[0][n] - 1); |
395 | 0 | if (cy) /* Ap[inc][n] can be -1 or -2 */ |
396 | 0 | Ap[inc][n] = mpn_add_1 (Ap[inc], Ap[inc], n, ~Ap[inc][n] + 1); |
397 | 0 | } |
398 | 0 | else |
399 | 0 | { |
400 | 0 | mp_size_t j, K2 = K >> 1; |
401 | 0 | int *lk = *ll; |
402 | |
403 | 0 | mpn_fft_fft (Ap, K2, ll-1, 2 * omega, n, inc * 2, tp); |
404 | 0 | mpn_fft_fft (Ap+inc, K2, ll-1, 2 * omega, n, inc * 2, tp); |
405 | | /* A[2*j*inc] <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc] |
406 | | A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */ |
407 | 0 | for (j = 0; j < K2; j++, lk += 2, Ap += 2 * inc) |
408 | 0 | { |
409 | | /* Ap[inc] <- Ap[0] + Ap[inc] * 2^(lk[1] * omega) |
410 | | Ap[0] <- Ap[0] + Ap[inc] * 2^(lk[0] * omega) */ |
411 | 0 | mpn_fft_mul_2exp_modF (tp, Ap[inc], lk[0] * omega, n); |
412 | | #if HAVE_NATIVE_mpn_add_n_sub_n |
413 | | mpn_fft_add_sub_modF (Ap[0], Ap[inc], tp, n); |
414 | | #else |
415 | 0 | mpn_fft_sub_modF (Ap[inc], Ap[0], tp, n); |
416 | 0 | mpn_fft_add_modF (Ap[0], Ap[0], tp, n); |
417 | 0 | #endif |
418 | 0 | } |
419 | 0 | } |
420 | 0 | } |
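The transform relies on 2 having order 2N modulo 2^N+1, so 2^omega with omega = 2N/K is a primitive K-th root of unity. A check for N = 8, K = 4, omega = 4 (illustrative sketch):

#include <stdio.h>

int
main (void)
{
  unsigned long w = 1ul << 4, x = 1;   /* w = 2^omega = 16 */
  int i;
  for (i = 1; i <= 4; i++)
    {
      x = (x * w) % 257;               /* modulus 2^8 + 1 */
      printf ("16^%d mod 257 = %lu\n", i, x);
    }
  /* prints 16, 256, 241, 1: 16^2 == -1 and 16^4 == 1, so 16 is a
     primitive 4th root of unity, as a size-4 transform requires */
  return 0;
}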
421 | | |
422 | | /* input: A[0] ... A[inc*(K-1)] are residues mod 2^N+1 where |
423 | | N=n*GMP_NUMB_BITS, and 2^omega is a primitive root mod 2^N+1 |
424 | | output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1 |
425 | | tp must have space for 2*(n+1) limbs. |
426 | | */ |
427 | | |
428 | | |
429 | | /* Given ap[0..n] with ap[n]<=1, reduce it modulo 2^(n*GMP_NUMB_BITS)+1, |
430 | | by subtracting that modulus if necessary. |
431 | | |
432 | | If ap[0..n] is exactly 2^(n*GMP_NUMB_BITS) then mpn_sub_1 produces a |
433 | | borrow and the limbs must be zeroed out again. This will occur very |
434 | | infrequently. */ |
435 | | |
436 | | static inline void |
437 | | mpn_fft_normalize (mp_ptr ap, mp_size_t n) |
438 | 0 | { |
439 | 0 | if (ap[n] != 0) |
440 | 0 | { |
441 | 0 | MPN_DECR_U (ap, n + 1, CNST_LIMB(1)); |
442 | 0 | if (ap[n] == 0) |
443 | 0 | { |
444 | | /* This happens with very low probability; we have yet to trigger it, |
445 | | and thereby make sure this code is correct. */ |
446 | 0 | MPN_ZERO (ap, n); |
447 | 0 | ap[n] = 1; |
448 | 0 | } |
449 | 0 | else |
450 | 0 | ap[n] = 0; |
451 | 0 | } |
452 | 0 | } |
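A one-limb analogue of this reduction with N = 8, modulus 257 (illustrative sketch; the v == 2^8 case mirrors the low-probability branch above):

#include <stdio.h>

/* reduce a semi-normalized v = hi*2^8 + lo, hi <= 1, into [0, 2^8] */
static unsigned
normalize257 (unsigned v)
{
  if (v >> 8)              /* ap[n] != 0 */
    {
      v -= 1;              /* MPN_DECR_U: 2^8 + lo == lo - 1 (mod 257) */
      if (!(v >> 8))       /* the borrow consumed the top limb: v was 2^8 */
        v = 256;           /* -1 mod 257 is represented as 2^8 itself */
      else
        v &= 0xff;         /* clear ap[n] */
    }
  return v;
}

int
main (void)
{
  printf ("%u %u %u\n",
          normalize257 (298),    /* 256 + 42 -> 41 */
          normalize257 (256),    /* 2^8 == -1 -> stays 256 */
          normalize257 (200));   /* already reduced */
  return 0;
}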
453 | | |
454 | | /* a[i] <- a[i]*b[i] mod 2^(n*GMP_NUMB_BITS)+1 for 0 <= i < K */ |
455 | | static void |
456 | | mpn_fft_mul_modF_K (mp_ptr *ap, mp_ptr *bp, mp_size_t n, mp_size_t K) |
457 | 0 | { |
458 | 0 | int i; |
459 | 0 | int sqr = (ap == bp); |
460 | 0 | TMP_DECL; |
461 | |
462 | 0 | TMP_MARK; |
463 | |
464 | 0 | if (n >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD)) |
465 | 0 | { |
466 | 0 | mp_size_t K2, nprime2, Nprime2, M2, maxLK, l, Mp2; |
467 | 0 | int k; |
468 | 0 | int **fft_l, *tmp; |
469 | 0 | mp_ptr *Ap, *Bp, A, B, T; |
470 | |
471 | 0 | k = mpn_fft_best_k (n, sqr); |
472 | 0 | K2 = (mp_size_t) 1 << k; |
473 | 0 | ASSERT_ALWAYS((n & (K2 - 1)) == 0); |
474 | 0 | maxLK = (K2 > GMP_NUMB_BITS) ? K2 : GMP_NUMB_BITS; |
475 | 0 | M2 = n * GMP_NUMB_BITS >> k; |
476 | 0 | l = n >> k; |
477 | 0 | Nprime2 = ((2 * M2 + k + 2 + maxLK) / maxLK) * maxLK; |
478 | | /* Nprime2 = ceil((2*M2+k+3)/maxLK)*maxLK */ |
479 | 0 | nprime2 = Nprime2 / GMP_NUMB_BITS; |
480 | | |
481 | | /* we should ensure that nprime2 is a multiple of the next K */ |
482 | 0 | if (nprime2 >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD)) |
483 | 0 | { |
484 | 0 | mp_size_t K3; |
485 | 0 | for (;;) |
486 | 0 | { |
487 | 0 | K3 = (mp_size_t) 1 << mpn_fft_best_k (nprime2, sqr); |
488 | 0 | if ((nprime2 & (K3 - 1)) == 0) |
489 | 0 | break; |
490 | 0 | nprime2 = (nprime2 + K3 - 1) & -K3; |
491 | 0 | Nprime2 = nprime2 * GMP_LIMB_BITS; |
492 | | /* warning: since nprime2 changed, K3 may change too! */ |
493 | 0 | } |
494 | 0 | } |
495 | 0 | ASSERT_ALWAYS(nprime2 < n); /* otherwise we'll loop */ |
496 | | |
497 | 0 | Mp2 = Nprime2 >> k; |
498 | |
499 | 0 | Ap = TMP_BALLOC_MP_PTRS (K2); |
500 | 0 | Bp = TMP_BALLOC_MP_PTRS (K2); |
501 | 0 | A = TMP_BALLOC_LIMBS (2 * (nprime2 + 1) << k); |
502 | 0 | T = TMP_BALLOC_LIMBS (2 * (nprime2 + 1)); |
503 | 0 | B = A + ((nprime2 + 1) << k); |
504 | 0 | fft_l = TMP_BALLOC_TYPE (k + 1, int *); |
505 | 0 | tmp = TMP_BALLOC_TYPE ((size_t) 2 << k, int); |
506 | 0 | for (i = 0; i <= k; i++) |
507 | 0 | { |
508 | 0 | fft_l[i] = tmp; |
509 | 0 | tmp += (mp_size_t) 1 << i; |
510 | 0 | } |
511 | |
512 | 0 | mpn_fft_initl (fft_l, k); |
513 | |
514 | 0 | TRACE (printf ("recurse: %ldx%ld limbs -> %ld times %ldx%ld (%1.2f)\n", n, |
515 | 0 | n, K2, nprime2, nprime2, 2.0*(double)n/nprime2/K2)); |
516 | 0 | for (i = 0; i < K; i++, ap++, bp++) |
517 | 0 | { |
518 | 0 | mp_limb_t cy; |
519 | 0 | mpn_fft_normalize (*ap, n); |
520 | 0 | if (!sqr) |
521 | 0 | mpn_fft_normalize (*bp, n); |
522 | |
523 | 0 | mpn_mul_fft_decompose (A, Ap, K2, nprime2, *ap, (l << k) + 1, l, Mp2, T); |
524 | 0 | if (!sqr) |
525 | 0 | mpn_mul_fft_decompose (B, Bp, K2, nprime2, *bp, (l << k) + 1, l, Mp2, T); |
526 | |
527 | 0 | cy = mpn_mul_fft_internal (*ap, n, k, Ap, Bp, A, B, nprime2, |
528 | 0 | l, Mp2, fft_l, T, sqr); |
529 | 0 | (*ap)[n] = cy; |
530 | 0 | } |
531 | 0 | } |
532 | 0 | else |
533 | 0 | { |
534 | 0 | mp_ptr a, b, tp, tpn; |
535 | 0 | mp_limb_t cc; |
536 | 0 | mp_size_t n2 = 2 * n; |
537 | 0 | tp = TMP_BALLOC_LIMBS (n2); |
538 | 0 | tpn = tp + n; |
539 | 0 | TRACE (printf (" mpn_mul_n %ld of %ld limbs\n", K, n)); |
540 | 0 | for (i = 0; i < K; i++) |
541 | 0 | { |
542 | 0 | a = *ap++; |
543 | 0 | b = *bp++; |
544 | 0 | if (sqr) |
545 | 0 | mpn_sqr (tp, a, n); |
546 | 0 | else |
547 | 0 | mpn_mul_n (tp, b, a, n); |
548 | 0 | if (a[n] != 0) |
549 | 0 | cc = mpn_add_n (tpn, tpn, b, n); |
550 | 0 | else |
551 | 0 | cc = 0; |
552 | 0 | if (b[n] != 0) |
553 | 0 | cc += mpn_add_n (tpn, tpn, a, n) + a[n]; |
554 | 0 | if (cc != 0) |
555 | 0 | { |
556 | 0 | cc = mpn_add_1 (tp, tp, n2, cc); |
557 | | /* If mpn_add_1 gives a carry (cc != 0), |
558 | | the result (tp) is at most GMP_NUMB_MAX - 1, |
559 | | so the following addition can't overflow. |
560 | | */ |
561 | 0 | tp[0] += cc; |
562 | 0 | } |
563 | 0 | a[n] = mpn_sub_n (a, tp, tpn, n) && mpn_add_1 (a, a, n, CNST_LIMB(1)); |
564 | 0 | } |
565 | 0 | } |
566 | 0 | TMP_FREE; |
567 | 0 | } |
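The final line of the non-recursive branch uses the identity hi*2^N + lo == lo - hi (mod 2^N+1) to fold the double-length product {tp, 2n}. Scaled down to N = 8 (illustrative sketch):

#include <stdio.h>

int
main (void)
{
  unsigned long tp = 200ul * 123;       /* double-length product, 24600 */
  long lo = tp & 0xff, hi = tp >> 8;
  long r = lo - hi;                     /* the mpn_sub_n (a, tp, tpn, n) step */
  if (r < 0)
    r += 257;                           /* the conditional mpn_add_1 fixup */
  printf ("%ld %lu\n", r, tp % 257);    /* both print 185 */
  return 0;
}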
568 | | |
569 | | |
570 | | /* input: A^[l[k][0]] A^[l[k][1]] ... A^[l[k][K-1]] |
571 | | output: K*A[0] K*A[K-1] ... K*A[1]. |
572 | | Assumes the Ap[] are pseudo-normalized, i.e. 0 <= Ap[][n] <= 1. |
573 | | This condition is also fulfilled at exit. |
574 | | */ |
575 | | static void |
576 | | mpn_fft_fftinv (mp_ptr *Ap, mp_size_t K, mp_size_t omega, mp_size_t n, mp_ptr tp) |
577 | 0 | { |
578 | 0 | if (K == 2) |
579 | 0 | { |
580 | 0 | mp_limb_t cy; |
581 | | #if HAVE_NATIVE_mpn_add_n_sub_n |
582 | | cy = mpn_add_n_sub_n (Ap[0], Ap[1], Ap[0], Ap[1], n + 1) & 1; |
583 | | #else |
584 | 0 | MPN_COPY (tp, Ap[0], n + 1); |
585 | 0 | mpn_add_n (Ap[0], Ap[0], Ap[1], n + 1); |
586 | 0 | cy = mpn_sub_n (Ap[1], tp, Ap[1], n + 1); |
587 | 0 | #endif |
588 | 0 | if (Ap[0][n] > 1) /* can be 2 or 3 */ |
589 | 0 | Ap[0][n] = 1 - mpn_sub_1 (Ap[0], Ap[0], n, Ap[0][n] - 1); |
590 | 0 | if (cy) /* Ap[1][n] can be -1 or -2 */ |
591 | 0 | Ap[1][n] = mpn_add_1 (Ap[1], Ap[1], n, ~Ap[1][n] + 1); |
592 | 0 | } |
593 | 0 | else |
594 | 0 | { |
595 | 0 | mp_size_t j, K2 = K >> 1; |
596 | |
597 | 0 | mpn_fft_fftinv (Ap, K2, 2 * omega, n, tp); |
598 | 0 | mpn_fft_fftinv (Ap + K2, K2, 2 * omega, n, tp); |
599 | | /* A[j] <- A[j] + omega^j A[j+K/2] |
600 | | A[j+K/2] <- A[j] + omega^(j+K/2) A[j+K/2] */ |
601 | 0 | for (j = 0; j < K2; j++, Ap++) |
602 | 0 | { |
603 | | /* Ap[K2] <- Ap[0] + Ap[K2] * 2^((j + K2) * omega) |
604 | | Ap[0] <- Ap[0] + Ap[K2] * 2^(j * omega) */ |
605 | 0 | mpn_fft_mul_2exp_modF (tp, Ap[K2], j * omega, n); |
606 | | #if HAVE_NATIVE_mpn_add_n_sub_n |
607 | | mpn_fft_add_sub_modF (Ap[0], Ap[K2], tp, n); |
608 | | #else |
609 | 0 | mpn_fft_sub_modF (Ap[K2], Ap[0], tp, n); |
610 | 0 | mpn_fft_add_modF (Ap[0], Ap[0], tp, n); |
611 | 0 | #endif |
612 | 0 | } |
613 | 0 | } |
614 | 0 | } |
615 | | |
616 | | |
617 | | /* R <- A/2^k mod 2^(n*GMP_NUMB_BITS)+1 */ |
618 | | static void |
619 | | mpn_fft_div_2exp_modF (mp_ptr r, mp_srcptr a, mp_bitcnt_t k, mp_size_t n) |
620 | 0 | { |
621 | 0 | mp_bitcnt_t i; |
622 | |
623 | 0 | ASSERT (r != a); |
624 | 0 | i = (mp_bitcnt_t) 2 * n * GMP_NUMB_BITS - k; |
625 | 0 | mpn_fft_mul_2exp_modF (r, a, i, n); |
626 | | /* 1/2^k = 2^(2nL-k) mod 2^(n*GMP_NUMB_BITS)+1 */ |
627 | | /* normalize so that R < 2^(n*GMP_NUMB_BITS)+1 */ |
628 | 0 | mpn_fft_normalize (r, n); |
629 | 0 | } |
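Division by 2^k is exact here because 2^(2N) == 1 (mod 2^N+1), so 1/2^k = 2^(2N-k). With N = 8, the inverse of 2 is 2^15 mod 257 (illustrative sketch):

#include <stdio.h>

int
main (void)
{
  unsigned long inv2 = (1ul << 15) % 257;        /* 2^(2N-1) for N = 8 */
  printf ("%lu %lu\n", inv2, (2 * inv2) % 257);  /* prints "129 1" */
  return 0;
}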
630 | | |
631 | | |
632 | | /* {rp,n} <- {ap,an} mod 2^(n*GMP_NUMB_BITS)+1, n <= an <= 3*n. |
633 | | Returns carry out, i.e. 1 iff {ap,an} = -1 mod 2^(n*GMP_NUMB_BITS)+1, |
634 | | then {rp,n}=0. |
635 | | */ |
636 | | static mp_size_t |
637 | | mpn_fft_norm_modF (mp_ptr rp, mp_size_t n, mp_ptr ap, mp_size_t an) |
638 | 0 | { |
639 | 0 | mp_size_t l, m, rpn; |
640 | 0 | mp_limb_t cc; |
641 | |
|
642 | 0 | ASSERT ((n <= an) && (an <= 3 * n)); |
643 | 0 | m = an - 2 * n; |
644 | 0 | if (m > 0) |
645 | 0 | { |
646 | 0 | l = n; |
647 | | /* add {ap, m} and {ap+2n, m} in {rp, m} */ |
648 | 0 | cc = mpn_add_n (rp, ap, ap + 2 * n, m); |
649 | | /* copy {ap+m, n-m} to {rp+m, n-m} */ |
650 | 0 | rpn = mpn_add_1 (rp + m, ap + m, n - m, cc); |
651 | 0 | } |
652 | 0 | else |
653 | 0 | { |
654 | 0 | l = an - n; /* l <= n */ |
655 | 0 | MPN_COPY (rp, ap, n); |
656 | 0 | rpn = 0; |
657 | 0 | } |
658 | | |
659 | | /* remains to subtract {ap+n, l} from {rp, n+1} */ |
660 | 0 | cc = mpn_sub_n (rp, rp, ap + n, l); |
661 | 0 | rpn -= mpn_sub_1 (rp + l, rp + l, n - l, cc); |
662 | 0 | if (rpn < 0) /* necessarily rpn = -1 */ |
663 | 0 | rpn = mpn_add_1 (rp, rp, n, CNST_LIMB(1)); |
664 | 0 | return rpn; |
665 | 0 | } |
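The chunk-wise add and subtract above amount to the alternating fold a0 - a1 + a2 that 2^N == -1 induces on N-bit chunks. With 8-bit chunks (illustrative sketch):

#include <stdio.h>

int
main (void)
{
  unsigned long a = 0x030201;   /* chunks a2=3, a1=2, a0=1, i.e. 197121 */
  long folded = (long) (a & 0xff)
              - (long) ((a >> 8) & 0xff)
              + (long) ((a >> 16) & 0xff);
  if (folded < 0)
    folded += 257;              /* the final mpn_add_1 fixup */
  printf ("%ld %lu\n", folded, a % 257);   /* both print 2 */
  return 0;
}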
666 | | |
667 | | /* store in A[0..nprime] the first M bits from {n, nl}, |
668 | | in A[nprime+1..] the following M bits, ... |
669 | | Assumes M is a multiple of GMP_NUMB_BITS (M = l * GMP_NUMB_BITS). |
670 | | T must have space for at least (nprime + 1) limbs. |
671 | | We must have nl <= 2*K*l. |
672 | | */ |
673 | | static void |
674 | | mpn_mul_fft_decompose (mp_ptr A, mp_ptr *Ap, mp_size_t K, mp_size_t nprime, |
675 | | mp_srcptr n, mp_size_t nl, mp_size_t l, mp_size_t Mp, |
676 | | mp_ptr T) |
677 | 0 | { |
678 | 0 | mp_size_t i, j; |
679 | 0 | mp_ptr tmp; |
680 | 0 | mp_size_t Kl = K * l; |
681 | 0 | TMP_DECL; |
682 | 0 | TMP_MARK; |
683 | |
684 | 0 | if (nl > Kl) /* normalize {n, nl} mod 2^(Kl*GMP_NUMB_BITS)+1 */ |
685 | 0 | { |
686 | 0 | mp_size_t dif = nl - Kl; |
687 | 0 | mp_limb_signed_t cy; |
688 | |
689 | 0 | tmp = TMP_BALLOC_LIMBS(Kl + 1); |
690 | |
691 | 0 | if (dif > Kl) |
692 | 0 | { |
693 | 0 | int subp = 0; |
694 | |
695 | 0 | cy = mpn_sub_n (tmp, n, n + Kl, Kl); |
696 | 0 | n += 2 * Kl; |
697 | 0 | dif -= Kl; |
698 | | |
699 | | /* now dif > 0 */ |
700 | 0 | while (dif > Kl) |
701 | 0 | { |
702 | 0 | if (subp) |
703 | 0 | cy += mpn_sub_n (tmp, tmp, n, Kl); |
704 | 0 | else |
705 | 0 | cy -= mpn_add_n (tmp, tmp, n, Kl); |
706 | 0 | subp ^= 1; |
707 | 0 | n += Kl; |
708 | 0 | dif -= Kl; |
709 | 0 | } |
710 | | /* now dif <= Kl */ |
711 | 0 | if (subp) |
712 | 0 | cy += mpn_sub (tmp, tmp, Kl, n, dif); |
713 | 0 | else |
714 | 0 | cy -= mpn_add (tmp, tmp, Kl, n, dif); |
715 | 0 | if (cy >= 0) |
716 | 0 | cy = mpn_add_1 (tmp, tmp, Kl, cy); |
717 | 0 | else |
718 | 0 | cy = mpn_sub_1 (tmp, tmp, Kl, -cy); |
719 | 0 | } |
720 | 0 | else /* dif <= Kl, i.e. nl <= 2 * Kl */ |
721 | 0 | { |
722 | 0 | cy = mpn_sub (tmp, n, Kl, n + Kl, dif); |
723 | 0 | cy = mpn_add_1 (tmp, tmp, Kl, cy); |
724 | 0 | } |
725 | 0 | tmp[Kl] = cy; |
726 | 0 | nl = Kl + 1; |
727 | 0 | n = tmp; |
728 | 0 | } |
729 | 0 | for (i = 0; i < K; i++) |
730 | 0 | { |
731 | 0 | Ap[i] = A; |
732 | | /* store the next M bits of n into A[0..nprime] */ |
733 | 0 | if (nl > 0) /* nl is the number of remaining limbs */ |
734 | 0 | { |
735 | 0 | j = (l <= nl && i < K - 1) ? l : nl; /* store j next limbs */ |
736 | 0 | nl -= j; |
737 | 0 | MPN_COPY (T, n, j); |
738 | 0 | MPN_ZERO (T + j, nprime + 1 - j); |
739 | 0 | n += l; |
740 | 0 | mpn_fft_mul_2exp_modF (A, T, i * Mp, nprime); |
741 | 0 | } |
742 | 0 | else |
743 | 0 | MPN_ZERO (A, nprime + 1); |
744 | 0 | A += nprime + 1; |
745 | 0 | } |
746 | 0 | ASSERT_ALWAYS (nl == 0); |
747 | 0 | TMP_FREE; |
748 | 0 | } |
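Setting aside the 2^(i*Mp) weighting, the decomposition is a little-endian split into K chunks of l limbs each. With one-byte limbs and K = 4 (illustrative sketch):

#include <stdio.h>

int
main (void)
{
  unsigned long n = 0xdeadbeeful;
  int i;
  for (i = 0; i < 4; i++)
    printf ("piece %d = 0x%02lx\n", i, (n >> (8 * i)) & 0xff);
  /* ef, be, ad, de: the order the MPN_COPY loop above produces; the
     real code additionally rotates piece i by i*Mp bits mod 2^N'+1 */
  return 0;
}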
749 | | |
750 | | /* op <- n*m mod 2^N+1 with fft of size 2^k where N=pl*GMP_NUMB_BITS |
751 | | op is pl limbs, its high bit is returned. |
752 | | One must have pl = mpn_fft_next_size (pl, k). |
753 | | T must have space for 2 * (nprime + 1) limbs. |
754 | | */ |
755 | | |
756 | | static mp_limb_t |
757 | | mpn_mul_fft_internal (mp_ptr op, mp_size_t pl, int k, |
758 | | mp_ptr *Ap, mp_ptr *Bp, mp_ptr A, mp_ptr B, |
759 | | mp_size_t nprime, mp_size_t l, mp_size_t Mp, |
760 | | int **fft_l, mp_ptr T, int sqr) |
761 | 0 | { |
762 | 0 | mp_size_t K, i, pla, lo, sh, j; |
763 | 0 | mp_ptr p; |
764 | 0 | mp_limb_t cc; |
765 | |
766 | 0 | K = (mp_size_t) 1 << k; |
767 | | |
768 | | /* direct fft's */ |
769 | 0 | mpn_fft_fft (Ap, K, fft_l + k, 2 * Mp, nprime, 1, T); |
770 | 0 | if (!sqr) |
771 | 0 | mpn_fft_fft (Bp, K, fft_l + k, 2 * Mp, nprime, 1, T); |
772 | | |
773 | | /* term to term multiplications */ |
774 | 0 | mpn_fft_mul_modF_K (Ap, sqr ? Ap : Bp, nprime, K); |
775 | | |
776 | | /* inverse fft's */ |
777 | 0 | mpn_fft_fftinv (Ap, K, 2 * Mp, nprime, T); |
778 | | |
779 | | /* division of terms after inverse fft */ |
780 | 0 | Bp[0] = T + nprime + 1; |
781 | 0 | mpn_fft_div_2exp_modF (Bp[0], Ap[0], k, nprime); |
782 | 0 | for (i = 1; i < K; i++) |
783 | 0 | { |
784 | 0 | Bp[i] = Ap[i - 1]; |
785 | 0 | mpn_fft_div_2exp_modF (Bp[i], Ap[i], k + (K - i) * Mp, nprime); |
786 | 0 | } |
787 | | |
788 | | /* addition of terms in result p */ |
789 | 0 | MPN_ZERO (T, nprime + 1); |
790 | 0 | pla = l * (K - 1) + nprime + 1; /* number of required limbs for p */ |
791 | 0 | p = B; /* B has K*(n' + 1) limbs, which is >= pla, i.e. enough */ |
792 | 0 | MPN_ZERO (p, pla); |
793 | 0 | cc = 0; /* will accumulate the (signed) carry at p[pla] */ |
794 | 0 | for (i = K - 1, lo = l * i + nprime,sh = l * i; i >= 0; i--,lo -= l,sh -= l) |
795 | 0 | { |
796 | 0 | mp_ptr n = p + sh; |
797 | |
798 | 0 | j = (K - i) & (K - 1); |
799 | |
800 | 0 | if (mpn_add_n (n, n, Bp[j], nprime + 1)) |
801 | 0 | cc += mpn_add_1 (n + nprime + 1, n + nprime + 1, |
802 | 0 | pla - sh - nprime - 1, CNST_LIMB(1)); |
803 | 0 | T[2 * l] = i + 1; /* T = (i + 1)*2^(2*M) */ |
804 | 0 | if (mpn_cmp (Bp[j], T, nprime + 1) > 0) |
805 | 0 | { /* subtract 2^N'+1 */ |
806 | 0 | cc -= mpn_sub_1 (n, n, pla - sh, CNST_LIMB(1)); |
807 | 0 | cc -= mpn_sub_1 (p + lo, p + lo, pla - lo, CNST_LIMB(1)); |
808 | 0 | } |
809 | 0 | } |
810 | 0 | if (cc == -CNST_LIMB(1)) |
811 | 0 | { |
812 | 0 | if ((cc = mpn_add_1 (p + pla - pl, p + pla - pl, pl, CNST_LIMB(1)))) |
813 | 0 | { |
814 | | /* p[pla-pl]...p[pla-1] are all zero */ |
815 | 0 | mpn_sub_1 (p + pla - pl - 1, p + pla - pl - 1, pl + 1, CNST_LIMB(1)); |
816 | 0 | mpn_sub_1 (p + pla - 1, p + pla - 1, 1, CNST_LIMB(1)); |
817 | 0 | } |
818 | 0 | } |
819 | 0 | else if (cc == 1) |
820 | 0 | { |
821 | 0 | if (pla >= 2 * pl) |
822 | 0 | { |
823 | 0 | while ((cc = mpn_add_1 (p + pla - 2 * pl, p + pla - 2 * pl, 2 * pl, cc))) |
824 | 0 | ; |
825 | 0 | } |
826 | 0 | else |
827 | 0 | { |
828 | 0 | cc = mpn_sub_1 (p + pla - pl, p + pla - pl, pl, cc); |
829 | 0 | ASSERT (cc == 0); |
830 | 0 | } |
831 | 0 | } |
832 | 0 | else |
833 | 0 | ASSERT (cc == 0); |
834 | | |
835 | | /* here p < 2^(2M) [K 2^(M(K-1)) + (K-1) 2^(M(K-2)) + ... ] |
836 | | < K 2^(2M) [2^(M(K-1)) + 2^(M(K-2)) + ... ] |
837 | | < K 2^(2M) 2^(M(K-1))*2 = 2^(M*K+M+k+1) */ |
838 | 0 | return mpn_fft_norm_modF (op, pl, p, pla); |
839 | 0 | } |
840 | | |
841 | | /* return the lcm of a and 2^k */ |
842 | | static mp_bitcnt_t |
843 | | mpn_mul_fft_lcm (mp_bitcnt_t a, int k) |
844 | 0 | { |
845 | 0 | mp_bitcnt_t l = k; |
846 | |
847 | 0 | while (a % 2 == 0 && k > 0) |
848 | 0 | { |
849 | 0 | a >>= 1; |
850 | 0 | k --; |
851 | 0 | } |
852 | 0 | return a << l; |
853 | 0 | } |
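A standalone copy of this computation (illustrative sketch): the loop strips min(v2(a), k) factors of two from a, and the shift by the saved l = k restores them, giving a * 2^k / gcd(a, 2^k).

#include <stdio.h>

static unsigned long
lcm_pow2 (unsigned long a, int k)
{
  int l = k;
  while (a % 2 == 0 && k > 0)
    {
      a >>= 1;
      k--;
    }
  return a << l;
}

int
main (void)
{
  printf ("%lu %lu %lu\n",
          lcm_pow2 (64, 4),    /* 64: lcm(64, 16) */
          lcm_pow2 (12, 3),    /* 24: lcm(12, 8)  */
          lcm_pow2 (7, 2));    /* 28: lcm(7, 4)   */
  return 0;
}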
854 | | |
855 | | |
856 | | mp_limb_t |
857 | | mpn_mul_fft (mp_ptr op, mp_size_t pl, |
858 | | mp_srcptr n, mp_size_t nl, |
859 | | mp_srcptr m, mp_size_t ml, |
860 | | int k) |
861 | 0 | { |
862 | 0 | int i; |
863 | 0 | mp_size_t K, maxLK; |
864 | 0 | mp_size_t N, Nprime, nprime, M, Mp, l; |
865 | 0 | mp_ptr *Ap, *Bp, A, T, B; |
866 | 0 | int **fft_l, *tmp; |
867 | 0 | int sqr = (n == m && nl == ml); |
868 | 0 | mp_limb_t h; |
869 | 0 | TMP_DECL; |
870 | |
871 | 0 | TRACE (printf ("\nmpn_mul_fft pl=%ld nl=%ld ml=%ld k=%d\n", pl, nl, ml, k)); |
872 | 0 | ASSERT_ALWAYS (mpn_fft_next_size (pl, k) == pl); |
873 | | |
874 | 0 | TMP_MARK; |
875 | 0 | N = pl * GMP_NUMB_BITS; |
876 | 0 | fft_l = TMP_BALLOC_TYPE (k + 1, int *); |
877 | 0 | tmp = TMP_BALLOC_TYPE ((size_t) 2 << k, int); |
878 | 0 | for (i = 0; i <= k; i++) |
879 | 0 | { |
880 | 0 | fft_l[i] = tmp; |
881 | 0 | tmp += (mp_size_t) 1 << i; |
882 | 0 | } |
883 | |
884 | 0 | mpn_fft_initl (fft_l, k); |
885 | 0 | K = (mp_size_t) 1 << k; |
886 | 0 | M = N >> k; /* N = 2^k M */ |
887 | 0 | l = 1 + (M - 1) / GMP_NUMB_BITS; |
888 | 0 | maxLK = mpn_mul_fft_lcm (GMP_NUMB_BITS, k); /* lcm (GMP_NUMB_BITS, 2^k) */ |
889 | |
890 | 0 | Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK; |
891 | | /* Nprime = ceil((2*M+k+3)/maxLK)*maxLK; */ |
892 | 0 | nprime = Nprime / GMP_NUMB_BITS; |
893 | 0 | TRACE (printf ("N=%ld K=%ld, M=%ld, l=%ld, maxLK=%ld, Np=%ld, np=%ld\n", |
894 | 0 | N, K, M, l, maxLK, Nprime, nprime)); |
895 | | /* we should ensure that recursively, nprime is a multiple of the next K */ |
896 | 0 | if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD)) |
897 | 0 | { |
898 | 0 | mp_size_t K2; |
899 | 0 | for (;;) |
900 | 0 | { |
901 | 0 | K2 = (mp_size_t) 1 << mpn_fft_best_k (nprime, sqr); |
902 | 0 | if ((nprime & (K2 - 1)) == 0) |
903 | 0 | break; |
904 | 0 | nprime = (nprime + K2 - 1) & -K2; |
905 | 0 | Nprime = nprime * GMP_LIMB_BITS; |
906 | | /* warning: since nprime changed, K2 may change too! */ |
907 | 0 | } |
908 | 0 | TRACE (printf ("new maxLK=%ld, Np=%ld, np=%ld\n", maxLK, Nprime, nprime)); |
909 | 0 | } |
910 | 0 | ASSERT_ALWAYS (nprime < pl); /* otherwise we'll loop */ |
911 | | |
912 | 0 | T = TMP_BALLOC_LIMBS (2 * (nprime + 1)); |
913 | 0 | Mp = Nprime >> k; |
914 | |
915 | 0 | TRACE (printf ("%ldx%ld limbs -> %ld times %ldx%ld limbs (%1.2f)\n", |
916 | 0 | pl, pl, K, nprime, nprime, 2.0 * (double) N / Nprime / K); |
917 | 0 | printf (" temp space %ld\n", 2 * K * (nprime + 1))); |
918 | |
919 | 0 | A = TMP_BALLOC_LIMBS (K * (nprime + 1)); |
920 | 0 | Ap = TMP_BALLOC_MP_PTRS (K); |
921 | 0 | mpn_mul_fft_decompose (A, Ap, K, nprime, n, nl, l, Mp, T); |
922 | 0 | if (sqr) |
923 | 0 | { |
924 | 0 | mp_size_t pla; |
925 | 0 | pla = l * (K - 1) + nprime + 1; /* number of required limbs for p */ |
926 | 0 | B = TMP_BALLOC_LIMBS (pla); |
927 | 0 | Bp = TMP_BALLOC_MP_PTRS (K); |
928 | 0 | } |
929 | 0 | else |
930 | 0 | { |
931 | 0 | B = TMP_BALLOC_LIMBS (K * (nprime + 1)); |
932 | 0 | Bp = TMP_BALLOC_MP_PTRS (K); |
933 | 0 | mpn_mul_fft_decompose (B, Bp, K, nprime, m, ml, l, Mp, T); |
934 | 0 | } |
935 | 0 | h = mpn_mul_fft_internal (op, pl, k, Ap, Bp, A, B, nprime, l, Mp, fft_l, T, sqr); |
936 | |
937 | 0 | TMP_FREE; |
938 | 0 | return h; |
939 | 0 | } |
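How a caller inside the library might drive this entry point, per the constraints stated above (a hedged sketch, not a documented interface; toy_fft_mul is a hypothetical helper, and this only builds inside a GMP source tree where gmp-impl.h is available):

#include "gmp-impl.h"   /* internal header; declares the mpn_* calls used */

/* {rp, pl} = {up, un} * {vp, vn} mod 2^(pl*GMP_NUMB_BITS)+1, with
   pl >= un + vn so the residue is the exact product */
static void
toy_fft_mul (mp_ptr rp, mp_srcptr up, mp_size_t un,
             mp_srcptr vp, mp_size_t vn)
{
  int k = mpn_fft_best_k (un + vn, 0);             /* 0: not a square */
  mp_size_t pl = mpn_fft_next_size (un + vn, k);   /* multiple of 2^k */
  mp_limb_t top = mpn_mul_fft (rp, pl, up, un, vp, vn, k);
  (void) top;   /* returned high limb; expected 0 when un + vn <= pl */
}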
940 | | |
941 | | #if WANT_OLD_FFT_FULL |
942 | | /* multiply {n, nl} by {m, ml}, and put the result in {op, nl+ml} */ |
943 | | void |
944 | | mpn_mul_fft_full (mp_ptr op, |
945 | | mp_srcptr n, mp_size_t nl, |
946 | | mp_srcptr m, mp_size_t ml) |
947 | | { |
948 | | mp_ptr pad_op; |
949 | | mp_size_t pl, pl2, pl3, l; |
950 | | mp_size_t cc, c2, oldcc; |
951 | | int k2, k3; |
952 | | int sqr = (n == m && nl == ml); |
953 | | |
954 | | pl = nl + ml; /* total number of limbs of the result */ |
955 | | |
956 | | /* perform a fft mod 2^(2N)+1 and one mod 2^(3N)+1. |
957 | | We must have pl3 = 3/2 * pl2, with pl2 a multiple of 2^k2, and |
958 | | pl3 a multiple of 2^k3. Since k3 >= k2, both are multiples of 2^k2, |
959 | | and pl2 must be an even multiple of 2^k2. Thus (pl2,pl3) = |
960 | | (2*j*2^k2,3*j*2^k2), which works for 3*j <= pl/2^k2 <= 5*j. |
961 | | We need that consecutive intervals overlap, i.e. 5*j >= 3*(j+1), |
962 | | which requires j>=2. Thus this scheme requires pl >= 6 * 2^FFT_FIRST_K. */ |
963 | | |
964 | | /* ASSERT_ALWAYS(pl >= 6 * (1 << FFT_FIRST_K)); */ |
965 | | |
966 | | pl2 = (2 * pl - 1) / 5; /* ceil (2pl/5) - 1 */ |
967 | | do |
968 | | { |
969 | | pl2++; |
970 | | k2 = mpn_fft_best_k (pl2, sqr); /* best fft size for pl2 limbs */ |
971 | | pl2 = mpn_fft_next_size (pl2, k2); |
972 | | pl3 = 3 * pl2 / 2; /* since k>=FFT_FIRST_K=4, pl2 is a multiple of 2^4, |
973 | | thus pl2 / 2 is exact */ |
974 | | k3 = mpn_fft_best_k (pl3, sqr); |
975 | | } |
976 | | while (mpn_fft_next_size (pl3, k3) != pl3); |
977 | | |
978 | | TRACE (printf ("mpn_mul_fft_full nl=%ld ml=%ld -> pl2=%ld pl3=%ld k=%d\n", |
979 | | nl, ml, pl2, pl3, k2)); |
980 | | |
981 | | ASSERT_ALWAYS(pl3 <= pl); |
982 | | cc = mpn_mul_fft (op, pl3, n, nl, m, ml, k3); /* mu */ |
983 | | ASSERT(cc == 0); |
984 | | pad_op = __GMP_ALLOCATE_FUNC_LIMBS (pl2); |
985 | | cc = mpn_mul_fft (pad_op, pl2, n, nl, m, ml, k2); /* lambda */ |
986 | | cc = -cc + mpn_sub_n (pad_op, pad_op, op, pl2); /* lambda - low(mu) */ |
987 | | /* 0 <= cc <= 1 */ |
988 | | ASSERT(0 <= cc && cc <= 1); |
989 | | l = pl3 - pl2; /* l = pl2 / 2 since pl3 = 3/2 * pl2 */ |
990 | | c2 = mpn_add_n (pad_op, pad_op, op + pl2, l); |
991 | | cc = mpn_add_1 (pad_op + l, pad_op + l, l, (mp_limb_t) c2) - cc; |
992 | | ASSERT(-1 <= cc && cc <= 1); |
993 | | if (cc < 0) |
994 | | cc = mpn_add_1 (pad_op, pad_op, pl2, (mp_limb_t) -cc); |
995 | | ASSERT(0 <= cc && cc <= 1); |
996 | | /* now lambda-mu = {pad_op, pl2} - cc mod 2^(pl2*GMP_NUMB_BITS)+1 */ |
997 | | oldcc = cc; |
998 | | #if HAVE_NATIVE_mpn_add_n_sub_n |
999 | | c2 = mpn_add_n_sub_n (pad_op + l, pad_op, pad_op, pad_op + l, l); |
1000 | | cc += c2 >> 1; /* carry out from high <- low + high */ |
1001 | | c2 = c2 & 1; /* borrow out from low <- low - high */ |
1002 | | #else |
1003 | | { |
1004 | | mp_ptr tmp; |
1005 | | TMP_DECL; |
1006 | | |
1007 | | TMP_MARK; |
1008 | | tmp = TMP_BALLOC_LIMBS (l); |
1009 | | MPN_COPY (tmp, pad_op, l); |
1010 | | c2 = mpn_sub_n (pad_op, pad_op, pad_op + l, l); |
1011 | | cc += mpn_add_n (pad_op + l, tmp, pad_op + l, l); |
1012 | | TMP_FREE; |
1013 | | } |
1014 | | #endif |
1015 | | c2 += oldcc; |
1016 | | /* first normalize {pad_op, pl2} before dividing by 2: c2 is the borrow |
1017 | | at pad_op + l, cc is the carry at pad_op + pl2 */ |
1018 | | /* 0 <= cc <= 2 */ |
1019 | | cc -= mpn_sub_1 (pad_op + l, pad_op + l, l, (mp_limb_t) c2); |
1020 | | /* -1 <= cc <= 2 */ |
1021 | | if (cc > 0) |
1022 | | cc = -mpn_sub_1 (pad_op, pad_op, pl2, (mp_limb_t) cc); |
1023 | | /* now -1 <= cc <= 0 */ |
1024 | | if (cc < 0) |
1025 | | cc = mpn_add_1 (pad_op, pad_op, pl2, (mp_limb_t) -cc); |
1026 | | /* now {pad_op, pl2} is normalized, with 0 <= cc <= 1 */ |
1027 | | if (pad_op[0] & 1) /* if odd, add 2^(pl2*GMP_NUMB_BITS)+1 */ |
1028 | | cc += 1 + mpn_add_1 (pad_op, pad_op, pl2, CNST_LIMB(1)); |
1029 | | /* now 0 <= cc <= 2, but cc=2 cannot occur since it would give a carry |
1030 | | out below */ |
1031 | | mpn_rshift (pad_op, pad_op, pl2, 1); /* divide by two */ |
1032 | | if (cc) /* then cc=1 */ |
1033 | | pad_op [pl2 - 1] |= (mp_limb_t) 1 << (GMP_NUMB_BITS - 1); |
1034 | | /* now {pad_op,pl2}-cc = (lambda-mu)/(1-2^(l*GMP_NUMB_BITS)) |
1035 | | mod 2^(pl2*GMP_NUMB_BITS) + 1 */ |
1036 | | c2 = mpn_add_n (op, op, pad_op, pl2); /* no need to add cc (is 0) */ |
1037 | | /* since pl2+pl3 >= pl, necessarily the extra limbs (including cc) are zero */ |
1038 | | MPN_COPY (op + pl3, pad_op, pl - pl3); |
1039 | | ASSERT_MPN_ZERO_P (pad_op + pl - pl3, pl2 + pl3 - pl); |
1040 | | __GMP_FREE_FUNC_LIMBS (pad_op, pl2); |
1041 | | /* since the final result has at most pl limbs, no carry out below */ |
1042 | | mpn_add_1 (op + pl2, op + pl2, pl - pl2, (mp_limb_t) c2); |
1043 | | } |
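The reconstruction works because the exact product is determined by its residues modulo 2^(2N)+1 and 2^(3N)+1, which are coprime. A scaled-down check with 2^4+1 = 17 and 2^6+1 = 65 (illustrative sketch; the code above reconstructs algebraically, this demo searches by brute force):

#include <stdio.h>

int
main (void)
{
  unsigned p = 999;                  /* the "product", < 17 * 65 = 1105 */
  unsigned lambda = p % 17, mu = p % 65, x;
  for (x = 0; x < 17 * 65; x++)      /* brute-force CRT reconstruction */
    if (x % 17 == lambda && x % 65 == mu)
      break;
  printf ("%u\n", x);                /* prints 999 */
  return 0;
}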
1044 | | #endif |