Line | Count | Source (jump to first uncovered line) |
1 | | /* mpn_mul -- Multiply two natural numbers. |
2 | | |
3 | | Contributed to the GNU project by Torbjorn Granlund. |
4 | | |
5 | | Copyright 1991, 1993, 1994, 1996, 1997, 1999-2003, 2005-2007, 2009, 2010, 2012, |
6 | | 2014, 2019 Free Software Foundation, Inc. |
7 | | |
8 | | This file is part of the GNU MP Library. |
9 | | |
10 | | The GNU MP Library is free software; you can redistribute it and/or modify |
11 | | it under the terms of either: |
12 | | |
13 | | * the GNU Lesser General Public License as published by the Free |
14 | | Software Foundation; either version 3 of the License, or (at your |
15 | | option) any later version. |
16 | | |
17 | | or |
18 | | |
19 | | * the GNU General Public License as published by the Free Software |
20 | | Foundation; either version 2 of the License, or (at your option) any |
21 | | later version. |
22 | | |
23 | | or both in parallel, as here. |
24 | | |
25 | | The GNU MP Library is distributed in the hope that it will be useful, but |
26 | | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
27 | | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
28 | | for more details. |
29 | | |
30 | | You should have received copies of the GNU General Public License and the |
31 | | GNU Lesser General Public License along with the GNU MP Library. If not, |
32 | | see https://www.gnu.org/licenses/. */ |
33 | | |
34 | | #include "gmp-impl.h" |
35 | | |
36 | | |
37 | | #ifndef MUL_BASECASE_MAX_UN |
38 | 0 | #define MUL_BASECASE_MAX_UN 500 |
39 | | #endif |
40 | | |
41 | | /* Areas where the different toom algorithms can be called (extracted |
42 | | from the t-toom*.c files, and ignoring small constant offsets): |
43 | | |
44 | | 1/6 1/5 1/4 4/13 1/3 3/8 2/5 5/11 1/2 3/5 2/3 3/4 4/5 1 vn/un |
45 | | 4/7 6/7 |
46 | | 6/11 |
47 | | |--------------------| toom22 (small) |
48 | | || toom22 (large) |
49 | | |xxxx| toom22 called |
50 | | |-------------------------------------| toom32 |
51 | | |xxxxxxxxxxxxxxxx| | toom32 called |
52 | | |------------| toom33 |
53 | | |x| toom33 called |
54 | | |---------------------------------| | toom42 |
55 | | |xxxxxxxxxxxxxxxxxxxxxxxx| | toom42 called |
56 | | |--------------------| toom43 |
57 | | |xxxxxxxxxx| toom43 called |
58 | | |-----------------------------| toom52 (unused) |
59 | | |--------| toom44 |
60 | | |xxxxxxxx| toom44 called |
61 | | |--------------------| | toom53 |
62 | | |xxxxxx| toom53 called |
63 | | |-------------------------| toom62 (unused) |
64 | | |----------------| toom54 (unused) |
65 | | |--------------------| toom63 |
66 | | |xxxxxxxxx| | toom63 called |
67 | | |---------------------------------| toom6h |
68 | | |xxxxxxxx| toom6h called |
69 | | |-------------------------| toom8h (32 bit) |
70 | | |------------------------------------------| toom8h (64 bit) |
71 | | |xxxxxxxx| toom8h called |
72 | | */ |
73 | | |
74 | | #define TOOM33_OK(an,bn) (6 + 2 * an < 3 * bn) |
75 | 0 | #define TOOM44_OK(an,bn) (12 + 3 * an < 4 * bn) |
76 | | |
77 | | /* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v |
78 | | (pointed to by VP, with VN limbs), and store the result at PRODP. The |
79 | | result is UN + VN limbs. Return the most significant limb of the result. |
80 | | |
81 | | NOTE: The space pointed to by PRODP is overwritten before finished with U |
82 | | and V, so overlap is an error. |
83 | | |
84 | | Argument constraints: |
85 | | 1. UN >= VN. |
86 | | 2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from |
87 | | the multiplier and the multiplicand. */ |
88 | | |
89 | | /* |
90 | | * The cutoff lines in the toomX2 and toomX3 code are now exactly between the |
91 | | ideal lines of the surrounding algorithms. Is that optimal? |
92 | | |
93 | | * The toomX3 code now uses a structure similar to the one of toomX2, except |
94 | | that it loops longer in the unbalanced case. The result is that the |
95 | | remaining area might have un < vn. Should we fix the toomX2 code in a |
96 | | similar way? |
97 | | |
98 | | * The toomX3 code is used for the largest non-FFT unbalanced operands. It |
99 | | therefore calls mpn_mul recursively for certain cases. |
100 | | |
101 | | * Allocate static temp space using THRESHOLD variables (except for toom44 |
102 | | when !WANT_FFT). That way, we can typically have no TMP_ALLOC at all. |
103 | | |
104 | | * We sort ToomX2 algorithms together, assuming the toom22, toom32, toom42 |
105 | | have the same vn threshold. This is not true, we should actually use |
106 | | mul_basecase for slightly larger operands for toom32 than for toom22, and |
107 | | even larger for toom42. |
108 | | |
109 | | * That problem is even more prevalent for toomX3. We therefore use special |
110 | | THRESHOLD variables there. |
111 | | */ |
112 | | |
113 | | mp_limb_t |
114 | | mpn_mul (mp_ptr prodp, |
115 | | mp_srcptr up, mp_size_t un, |
116 | | mp_srcptr vp, mp_size_t vn) |
117 | 0 | { |
118 | 0 | ASSERT (un >= vn); |
119 | 0 | ASSERT (vn >= 1); |
120 | 0 | ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un)); |
121 | 0 | ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn)); |
122 | |
|
123 | 0 | if (BELOW_THRESHOLD (un, MUL_TOOM22_THRESHOLD)) |
124 | 0 | { |
125 | | /* When un (and thus vn) is below the toom22 range, do mul_basecase. |
126 | | Test un and not vn here not to thwart the un >> vn code below. |
127 | | This special case is not necessary, but cuts the overhead for the |
128 | | smallest operands. */ |
129 | 0 | mpn_mul_basecase (prodp, up, un, vp, vn); |
130 | 0 | } |
131 | 0 | else if (un == vn) |
132 | 0 | { |
133 | 0 | mpn_mul_n (prodp, up, vp, un); |
134 | 0 | } |
135 | 0 | else if (vn < MUL_TOOM22_THRESHOLD) |
136 | 0 | { /* plain schoolbook multiplication */ |
137 | | |
138 | | /* Unless un is very large, or else if have an applicable mpn_mul_N, |
139 | | perform basecase multiply directly. */ |
140 | 0 | if (un <= MUL_BASECASE_MAX_UN |
141 | | #if HAVE_NATIVE_mpn_mul_2 |
142 | | || vn <= 2 |
143 | | #else |
144 | 0 | || vn == 1 |
145 | 0 | #endif |
146 | 0 | ) |
147 | 0 | mpn_mul_basecase (prodp, up, un, vp, vn); |
148 | 0 | else |
149 | 0 | { |
150 | | /* We have un >> MUL_BASECASE_MAX_UN > vn. For better memory |
151 | | locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply |
152 | | these pieces with the vp[] operand. After each such partial |
153 | | multiplication (but the last) we copy the most significant vn |
154 | | limbs into a temporary buffer since that part would otherwise be |
155 | | overwritten by the next multiplication. After the next |
156 | | multiplication, we add it back. This illustrates the situation: |
157 | | |
158 | | -->vn<-- |
159 | | | |<------- un ------->| |
160 | | _____________________| |
161 | | X /| |
162 | | /XX__________________/ | |
163 | | _____________________ | |
164 | | X / | |
165 | | /XX__________________/ | |
166 | | _____________________ | |
167 | | / / | |
168 | | /____________________/ | |
169 | | ================================================================== |
170 | | |
171 | | The parts marked with X are the parts whose sums are copied into |
172 | | the temporary buffer. */ |
173 | |
|
174 | 0 | mp_limb_t tp[MUL_TOOM22_THRESHOLD_LIMIT]; |
175 | 0 | mp_limb_t cy; |
176 | 0 | ASSERT (MUL_TOOM22_THRESHOLD <= MUL_TOOM22_THRESHOLD_LIMIT); |
177 | |
|
178 | 0 | mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn); |
179 | 0 | prodp += MUL_BASECASE_MAX_UN; |
180 | 0 | MPN_COPY (tp, prodp, vn); /* preserve high triangle */ |
181 | 0 | up += MUL_BASECASE_MAX_UN; |
182 | 0 | un -= MUL_BASECASE_MAX_UN; |
183 | 0 | while (un > MUL_BASECASE_MAX_UN) |
184 | 0 | { |
185 | 0 | mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn); |
186 | 0 | cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */ |
187 | 0 | mpn_incr_u (prodp + vn, cy); |
188 | 0 | prodp += MUL_BASECASE_MAX_UN; |
189 | 0 | MPN_COPY (tp, prodp, vn); /* preserve high triangle */ |
190 | 0 | up += MUL_BASECASE_MAX_UN; |
191 | 0 | un -= MUL_BASECASE_MAX_UN; |
192 | 0 | } |
193 | 0 | if (un > vn) |
194 | 0 | { |
195 | 0 | mpn_mul_basecase (prodp, up, un, vp, vn); |
196 | 0 | } |
197 | 0 | else |
198 | 0 | { |
199 | 0 | ASSERT (un > 0); |
200 | 0 | mpn_mul_basecase (prodp, vp, vn, up, un); |
201 | 0 | } |
202 | 0 | cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */ |
203 | 0 | mpn_incr_u (prodp + vn, cy); |
204 | 0 | } |
205 | 0 | } |
206 | 0 | else if (BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD)) |
207 | 0 | { |
208 | | /* Use ToomX2 variants */ |
209 | 0 | mp_ptr scratch; |
210 | 0 | TMP_SDECL; TMP_SMARK; |
211 | |
|
212 | 0 | #define ITCH_TOOMX2 (9 * vn / 2 + GMP_NUMB_BITS * 2) |
213 | 0 | scratch = TMP_SALLOC_LIMBS (ITCH_TOOMX2); |
214 | 0 | ASSERT (mpn_toom22_mul_itch ((5*vn-1)/4, vn) <= ITCH_TOOMX2); /* 5vn/2+ */ |
215 | 0 | ASSERT (mpn_toom32_mul_itch ((7*vn-1)/4, vn) <= ITCH_TOOMX2); /* 7vn/6+ */ |
216 | 0 | ASSERT (mpn_toom42_mul_itch (3 * vn - 1, vn) <= ITCH_TOOMX2); /* 9vn/2+ */ |
217 | 0 | #undef ITCH_TOOMX2 |
218 | | |
219 | | /* FIXME: This condition (repeated in the loop below) leaves from a vn*vn |
220 | | square to a (3vn-1)*vn rectangle. Leaving such a rectangle is hardly |
221 | | wise; we would get better balance by slightly moving the bound. We |
222 | | will sometimes end up with un < vn, like in the X3 arm below. */ |
223 | 0 | if (un >= 3 * vn) |
224 | 0 | { |
225 | 0 | mp_limb_t cy; |
226 | 0 | mp_ptr ws; |
227 | | |
228 | | /* The maximum ws usage is for the mpn_mul result. */ |
229 | 0 | ws = TMP_SALLOC_LIMBS (4 * vn); |
230 | |
|
231 | 0 | mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch); |
232 | 0 | un -= 2 * vn; |
233 | 0 | up += 2 * vn; |
234 | 0 | prodp += 2 * vn; |
235 | |
|
236 | 0 | while (un >= 3 * vn) |
237 | 0 | { |
238 | 0 | mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch); |
239 | 0 | un -= 2 * vn; |
240 | 0 | up += 2 * vn; |
241 | 0 | cy = mpn_add_n (prodp, prodp, ws, vn); |
242 | 0 | MPN_COPY (prodp + vn, ws + vn, 2 * vn); |
243 | 0 | mpn_incr_u (prodp + vn, cy); |
244 | 0 | prodp += 2 * vn; |
245 | 0 | } |
246 | | |
247 | | /* vn <= un < 3vn */ |
248 | |
|
249 | 0 | if (4 * un < 5 * vn) |
250 | 0 | mpn_toom22_mul (ws, up, un, vp, vn, scratch); |
251 | 0 | else if (4 * un < 7 * vn) |
252 | 0 | mpn_toom32_mul (ws, up, un, vp, vn, scratch); |
253 | 0 | else |
254 | 0 | mpn_toom42_mul (ws, up, un, vp, vn, scratch); |
255 | |
|
256 | 0 | cy = mpn_add_n (prodp, prodp, ws, vn); |
257 | 0 | MPN_COPY (prodp + vn, ws + vn, un); |
258 | 0 | mpn_incr_u (prodp + vn, cy); |
259 | 0 | } |
260 | 0 | else |
261 | 0 | { |
262 | 0 | if (4 * un < 5 * vn) |
263 | 0 | mpn_toom22_mul (prodp, up, un, vp, vn, scratch); |
264 | 0 | else if (4 * un < 7 * vn) |
265 | 0 | mpn_toom32_mul (prodp, up, un, vp, vn, scratch); |
266 | 0 | else |
267 | 0 | mpn_toom42_mul (prodp, up, un, vp, vn, scratch); |
268 | 0 | } |
269 | 0 | TMP_SFREE; |
270 | 0 | } |
271 | 0 | else if (BELOW_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) || |
272 | 0 | BELOW_THRESHOLD (3 * vn, MUL_FFT_THRESHOLD)) |
273 | 0 | { |
274 | | /* Handle the largest operands that are not in the FFT range. The 2nd |
275 | | condition makes very unbalanced operands avoid the FFT code (except |
276 | | perhaps as coefficient products of the Toom code. */ |
277 | |
|
278 | 0 | if (BELOW_THRESHOLD (vn, MUL_TOOM44_THRESHOLD) || !TOOM44_OK (un, vn)) |
279 | 0 | { |
280 | | /* Use ToomX3 variants */ |
281 | 0 | mp_ptr scratch; |
282 | 0 | TMP_DECL; TMP_MARK; |
283 | |
|
284 | 0 | #define ITCH_TOOMX3 (4 * vn + GMP_NUMB_BITS) |
285 | 0 | scratch = TMP_ALLOC_LIMBS (ITCH_TOOMX3); |
286 | 0 | ASSERT (mpn_toom33_mul_itch ((7*vn-1)/6, vn) <= ITCH_TOOMX3); /* 7vn/2+ */ |
287 | 0 | ASSERT (mpn_toom43_mul_itch ((3*vn-1)/2, vn) <= ITCH_TOOMX3); /* 9vn/4+ */ |
288 | 0 | ASSERT (mpn_toom32_mul_itch ((7*vn-1)/4, vn) <= ITCH_TOOMX3); /* 7vn/6+ */ |
289 | 0 | ASSERT (mpn_toom53_mul_itch ((11*vn-1)/6, vn) <= ITCH_TOOMX3); /* 11vn/3+ */ |
290 | 0 | ASSERT (mpn_toom42_mul_itch ((5*vn-1)/2, vn) <= ITCH_TOOMX3); /* 15vn/4+ */ |
291 | 0 | ASSERT (mpn_toom63_mul_itch ((5*vn-1)/2, vn) <= ITCH_TOOMX3); /* 15vn/4+ */ |
292 | 0 | #undef ITCH_TOOMX3 |
293 | |
|
294 | 0 | if (2 * un >= 5 * vn) |
295 | 0 | { |
296 | 0 | mp_limb_t cy; |
297 | 0 | mp_ptr ws; |
298 | | |
299 | | /* The maximum ws usage is for the mpn_mul result. */ |
300 | 0 | ws = TMP_ALLOC_LIMBS (7 * vn >> 1); |
301 | |
|
302 | 0 | if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD)) |
303 | 0 | mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch); |
304 | 0 | else |
305 | 0 | mpn_toom63_mul (prodp, up, 2 * vn, vp, vn, scratch); |
306 | 0 | un -= 2 * vn; |
307 | 0 | up += 2 * vn; |
308 | 0 | prodp += 2 * vn; |
309 | |
|
310 | 0 | while (2 * un >= 5 * vn) /* un >= 2.5vn */ |
311 | 0 | { |
312 | 0 | if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD)) |
313 | 0 | mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch); |
314 | 0 | else |
315 | 0 | mpn_toom63_mul (ws, up, 2 * vn, vp, vn, scratch); |
316 | 0 | un -= 2 * vn; |
317 | 0 | up += 2 * vn; |
318 | 0 | cy = mpn_add_n (prodp, prodp, ws, vn); |
319 | 0 | MPN_COPY (prodp + vn, ws + vn, 2 * vn); |
320 | 0 | mpn_incr_u (prodp + vn, cy); |
321 | 0 | prodp += 2 * vn; |
322 | 0 | } |
323 | | |
324 | | /* vn / 2 <= un < 2.5vn */ |
325 | |
|
326 | 0 | if (un < vn) |
327 | 0 | mpn_mul (ws, vp, vn, up, un); |
328 | 0 | else |
329 | 0 | mpn_mul (ws, up, un, vp, vn); |
330 | |
|
331 | 0 | cy = mpn_add_n (prodp, prodp, ws, vn); |
332 | 0 | MPN_COPY (prodp + vn, ws + vn, un); |
333 | 0 | mpn_incr_u (prodp + vn, cy); |
334 | 0 | } |
335 | 0 | else |
336 | 0 | { |
337 | 0 | if (6 * un < 7 * vn) |
338 | 0 | mpn_toom33_mul (prodp, up, un, vp, vn, scratch); |
339 | 0 | else if (2 * un < 3 * vn) |
340 | 0 | { |
341 | 0 | if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM43_THRESHOLD)) |
342 | 0 | mpn_toom32_mul (prodp, up, un, vp, vn, scratch); |
343 | 0 | else |
344 | 0 | mpn_toom43_mul (prodp, up, un, vp, vn, scratch); |
345 | 0 | } |
346 | 0 | else if (6 * un < 11 * vn) |
347 | 0 | { |
348 | 0 | if (4 * un < 7 * vn) |
349 | 0 | { |
350 | 0 | if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM53_THRESHOLD)) |
351 | 0 | mpn_toom32_mul (prodp, up, un, vp, vn, scratch); |
352 | 0 | else |
353 | 0 | mpn_toom53_mul (prodp, up, un, vp, vn, scratch); |
354 | 0 | } |
355 | 0 | else |
356 | 0 | { |
357 | 0 | if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM53_THRESHOLD)) |
358 | 0 | mpn_toom42_mul (prodp, up, un, vp, vn, scratch); |
359 | 0 | else |
360 | 0 | mpn_toom53_mul (prodp, up, un, vp, vn, scratch); |
361 | 0 | } |
362 | 0 | } |
363 | 0 | else |
364 | 0 | { |
365 | 0 | if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD)) |
366 | 0 | mpn_toom42_mul (prodp, up, un, vp, vn, scratch); |
367 | 0 | else |
368 | 0 | mpn_toom63_mul (prodp, up, un, vp, vn, scratch); |
369 | 0 | } |
370 | 0 | } |
371 | 0 | TMP_FREE; |
372 | 0 | } |
373 | 0 | else |
374 | 0 | { |
375 | 0 | mp_ptr scratch; |
376 | 0 | TMP_DECL; TMP_MARK; |
377 | |
|
378 | 0 | if (BELOW_THRESHOLD (vn, MUL_TOOM6H_THRESHOLD)) |
379 | 0 | { |
380 | 0 | scratch = TMP_SALLOC_LIMBS (mpn_toom44_mul_itch (un, vn)); |
381 | 0 | mpn_toom44_mul (prodp, up, un, vp, vn, scratch); |
382 | 0 | } |
383 | 0 | else if (BELOW_THRESHOLD (vn, MUL_TOOM8H_THRESHOLD)) |
384 | 0 | { |
385 | 0 | scratch = TMP_SALLOC_LIMBS (mpn_toom6h_mul_itch (un, vn)); |
386 | 0 | mpn_toom6h_mul (prodp, up, un, vp, vn, scratch); |
387 | 0 | } |
388 | 0 | else |
389 | 0 | { |
390 | 0 | scratch = TMP_ALLOC_LIMBS (mpn_toom8h_mul_itch (un, vn)); |
391 | 0 | mpn_toom8h_mul (prodp, up, un, vp, vn, scratch); |
392 | 0 | } |
393 | 0 | TMP_FREE; |
394 | 0 | } |
395 | 0 | } |
396 | 0 | else |
397 | 0 | { |
398 | 0 | if (un >= 8 * vn) |
399 | 0 | { |
400 | 0 | mp_limb_t cy; |
401 | 0 | mp_ptr ws; |
402 | 0 | TMP_DECL; TMP_MARK; |
403 | | |
404 | | /* The maximum ws usage is for the mpn_mul result. */ |
405 | 0 | ws = TMP_BALLOC_LIMBS (9 * vn >> 1); |
406 | |
|
407 | 0 | mpn_fft_mul (prodp, up, 3 * vn, vp, vn); |
408 | 0 | un -= 3 * vn; |
409 | 0 | up += 3 * vn; |
410 | 0 | prodp += 3 * vn; |
411 | |
|
412 | 0 | while (2 * un >= 7 * vn) /* un >= 3.5vn */ |
413 | 0 | { |
414 | 0 | mpn_fft_mul (ws, up, 3 * vn, vp, vn); |
415 | 0 | un -= 3 * vn; |
416 | 0 | up += 3 * vn; |
417 | 0 | cy = mpn_add_n (prodp, prodp, ws, vn); |
418 | 0 | MPN_COPY (prodp + vn, ws + vn, 3 * vn); |
419 | 0 | mpn_incr_u (prodp + vn, cy); |
420 | 0 | prodp += 3 * vn; |
421 | 0 | } |
422 | | |
423 | | /* vn / 2 <= un < 3.5vn */ |
424 | |
|
425 | 0 | if (un < vn) |
426 | 0 | mpn_mul (ws, vp, vn, up, un); |
427 | 0 | else |
428 | 0 | mpn_mul (ws, up, un, vp, vn); |
429 | |
|
430 | 0 | cy = mpn_add_n (prodp, prodp, ws, vn); |
431 | 0 | MPN_COPY (prodp + vn, ws + vn, un); |
432 | 0 | mpn_incr_u (prodp + vn, cy); |
433 | |
|
434 | 0 | TMP_FREE; |
435 | 0 | } |
436 | 0 | else |
437 | 0 | mpn_fft_mul (prodp, up, un, vp, vn); |
438 | 0 | } |
439 | |
|
440 | 0 | return prodp[un + vn - 1]; /* historic */ |
441 | 0 | } |