/src/gmp/mpn/toom33_mul.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* mpn_toom33_mul -- Multiply {ap,an} and {p,bn} where an and bn are close in |
2 | | size. Or more accurately, bn <= an < (3/2)bn. |
3 | | |
4 | | Contributed to the GNU project by Torbjorn Granlund. |
5 | | Additional improvements by Marco Bodrato. |
6 | | |
7 | | THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY |
8 | | SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST |
9 | | GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. |
10 | | |
11 | | Copyright 2006-2008, 2010, 2012, 2015, 2021 Free Software Foundation, Inc. |
12 | | |
13 | | This file is part of the GNU MP Library. |
14 | | |
15 | | The GNU MP Library is free software; you can redistribute it and/or modify |
16 | | it under the terms of either: |
17 | | |
18 | | * the GNU Lesser General Public License as published by the Free |
19 | | Software Foundation; either version 3 of the License, or (at your |
20 | | option) any later version. |
21 | | |
22 | | or |
23 | | |
24 | | * the GNU General Public License as published by the Free Software |
25 | | Foundation; either version 2 of the License, or (at your option) any |
26 | | later version. |
27 | | |
28 | | or both in parallel, as here. |
29 | | |
30 | | The GNU MP Library is distributed in the hope that it will be useful, but |
31 | | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
32 | | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
33 | | for more details. |
34 | | |
35 | | You should have received copies of the GNU General Public License and the |
36 | | GNU Lesser General Public License along with the GNU MP Library. If not, |
37 | | see https://www.gnu.org/licenses/. */ |
38 | | |
39 | | |
40 | | #include "gmp-impl.h" |
41 | | |
42 | | /* Evaluate in: -1, 0, +1, +2, +inf |
43 | | |
44 | | <-s--><--n--><--n--> |
45 | | ____ ______ ______ |
46 | | |_a2_|___a1_|___a0_| |
47 | | |b2_|___b1_|___b0_| |
48 | | <-t-><--n--><--n--> |
49 | | |
50 | | v0 = a0 * b0 # A(0)*B(0) |
51 | | v1 = (a0+ a1+ a2)*(b0+ b1+ b2) # A(1)*B(1) ah <= 2 bh <= 2 |
52 | | vm1 = (a0- a1+ a2)*(b0- b1+ b2) # A(-1)*B(-1) |ah| <= 1 bh <= 1 |
53 | | v2 = (a0+2a1+4a2)*(b0+2b1+4b2) # A(2)*B(2) ah <= 6 bh <= 6 |
54 | | vinf= a2 * b2 # A(inf)*B(inf) |
55 | | */ |
56 | | |
57 | | #if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY |
58 | | #define MAYBE_mul_basecase 1 |
59 | | #define MAYBE_mul_toom33 1 |
60 | | #else |
61 | | #define MAYBE_mul_basecase \ |
62 | 0 | (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD) |
63 | | #define MAYBE_mul_toom33 \ |
64 | 0 | (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD) |
65 | | #endif |
66 | | |
67 | | /* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced |
68 | | multiplication at the infinity point. We may have |
69 | | MAYBE_mul_basecase == 0, and still get s just below |
70 | | MUL_TOOM22_THRESHOLD. If MUL_TOOM33_THRESHOLD == 7, we can even get |
71 | | s == 1 and mpn_toom22_mul will crash. |
72 | | */ |
73 | | |
74 | | #define TOOM33_MUL_N_REC(p, a, b, n, ws) \ |
75 | 0 | do { \ |
76 | 0 | if (MAYBE_mul_basecase \ |
77 | 0 | && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) \ |
78 | 0 | mpn_mul_basecase (p, a, n, b, n); \ |
79 | 0 | else if (! MAYBE_mul_toom33 \ |
80 | 0 | || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD)) \ |
81 | 0 | mpn_toom22_mul (p, a, n, b, n, ws); \ |
82 | 0 | else \ |
83 | 0 | mpn_toom33_mul (p, a, n, b, n, ws); \ |
84 | 0 | } while (0) |
85 | | |
86 | | void |
87 | | mpn_toom33_mul (mp_ptr pp, |
88 | | mp_srcptr ap, mp_size_t an, |
89 | | mp_srcptr bp, mp_size_t bn, |
90 | | mp_ptr scratch) |
91 | 0 | { |
92 | 0 | const int __gmpn_cpuvec_initialized = 1; |
93 | 0 | mp_size_t n, s, t; |
94 | 0 | int vm1_neg; |
95 | 0 | mp_limb_t cy, vinf0; |
96 | 0 | mp_ptr gp; |
97 | 0 | mp_ptr as1, asm1, as2; |
98 | 0 | mp_ptr bs1, bsm1, bs2; |
99 | |
|
100 | 0 | #define a0 ap |
101 | 0 | #define a1 (ap + n) |
102 | 0 | #define a2 (ap + 2*n) |
103 | 0 | #define b0 bp |
104 | 0 | #define b1 (bp + n) |
105 | 0 | #define b2 (bp + 2*n) |
106 | |
|
107 | 0 | n = (an + 2) / (size_t) 3; |
108 | |
|
109 | 0 | s = an - 2 * n; |
110 | 0 | t = bn - 2 * n; |
111 | |
|
112 | 0 | ASSERT (an >= bn); |
113 | |
|
114 | 0 | ASSERT (0 < s && s <= n); |
115 | 0 | ASSERT (0 < t && t <= n); |
116 | |
|
117 | 0 | as1 = scratch + 4 * n + 4; |
118 | 0 | asm1 = scratch + 2 * n + 2; |
119 | 0 | as2 = pp + n + 1; |
120 | |
|
121 | 0 | bs1 = pp; |
122 | 0 | bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */ |
123 | 0 | bs2 = pp + 2 * n + 2; |
124 | |
|
125 | 0 | gp = scratch; |
126 | |
|
127 | 0 | vm1_neg = 0; |
128 | | |
129 | | /* Compute as1 and asm1. */ |
130 | 0 | cy = mpn_add (gp, a0, n, a2, s); |
131 | | #if HAVE_NATIVE_mpn_add_n_sub_n |
132 | | if (cy == 0 && mpn_cmp (gp, a1, n) < 0) |
133 | | { |
134 | | cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n); |
135 | | as1[n] = cy >> 1; |
136 | | asm1[n] = 0; |
137 | | vm1_neg = 1; |
138 | | } |
139 | | else |
140 | | { |
141 | | mp_limb_t cy2; |
142 | | cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n); |
143 | | as1[n] = cy + (cy2 >> 1); |
144 | | asm1[n] = cy - (cy2 & 1); |
145 | | } |
146 | | #else |
147 | 0 | as1[n] = cy + mpn_add_n (as1, gp, a1, n); |
148 | 0 | if (cy == 0 && mpn_cmp (gp, a1, n) < 0) |
149 | 0 | { |
150 | 0 | mpn_sub_n (asm1, a1, gp, n); |
151 | 0 | asm1[n] = 0; |
152 | 0 | vm1_neg = 1; |
153 | 0 | } |
154 | 0 | else |
155 | 0 | { |
156 | 0 | cy -= mpn_sub_n (asm1, gp, a1, n); |
157 | 0 | asm1[n] = cy; |
158 | 0 | } |
159 | 0 | #endif |
160 | | |
161 | | /* Compute as2. */ |
162 | 0 | #if HAVE_NATIVE_mpn_rsblsh1_n |
163 | 0 | cy = mpn_add_n (as2, a2, as1, s); |
164 | 0 | if (s != n) |
165 | 0 | cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy); |
166 | 0 | cy += as1[n]; |
167 | 0 | cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n); |
168 | | #else |
169 | | #if HAVE_NATIVE_mpn_addlsh1_n |
170 | | cy = mpn_addlsh1_n (as2, a1, a2, s); |
171 | | if (s != n) |
172 | | cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy); |
173 | | cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n); |
174 | | #else |
175 | | cy = mpn_add_n (as2, a2, as1, s); |
176 | | if (s != n) |
177 | | cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy); |
178 | | cy += as1[n]; |
179 | | cy = 2 * cy + mpn_lshift (as2, as2, n, 1); |
180 | | cy -= mpn_sub_n (as2, as2, a0, n); |
181 | | #endif |
182 | | #endif |
183 | 0 | as2[n] = cy; |
184 | | |
185 | | /* Compute bs1 and bsm1. */ |
186 | 0 | cy = mpn_add (gp, b0, n, b2, t); |
187 | | #if HAVE_NATIVE_mpn_add_n_sub_n |
188 | | if (cy == 0 && mpn_cmp (gp, b1, n) < 0) |
189 | | { |
190 | | cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n); |
191 | | bs1[n] = cy >> 1; |
192 | | bsm1[n] = 0; |
193 | | vm1_neg ^= 1; |
194 | | } |
195 | | else |
196 | | { |
197 | | mp_limb_t cy2; |
198 | | cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n); |
199 | | bs1[n] = cy + (cy2 >> 1); |
200 | | bsm1[n] = cy - (cy2 & 1); |
201 | | } |
202 | | #else |
203 | 0 | bs1[n] = cy + mpn_add_n (bs1, gp, b1, n); |
204 | 0 | if (cy == 0 && mpn_cmp (gp, b1, n) < 0) |
205 | 0 | { |
206 | 0 | mpn_sub_n (bsm1, b1, gp, n); |
207 | 0 | bsm1[n] = 0; |
208 | 0 | vm1_neg ^= 1; |
209 | 0 | } |
210 | 0 | else |
211 | 0 | { |
212 | 0 | cy -= mpn_sub_n (bsm1, gp, b1, n); |
213 | 0 | bsm1[n] = cy; |
214 | 0 | } |
215 | 0 | #endif |
216 | | |
217 | | /* Compute bs2. */ |
218 | 0 | #if HAVE_NATIVE_mpn_rsblsh1_n |
219 | 0 | cy = mpn_add_n (bs2, b2, bs1, t); |
220 | 0 | if (t != n) |
221 | 0 | cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy); |
222 | 0 | cy += bs1[n]; |
223 | 0 | cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n); |
224 | | #else |
225 | | #if HAVE_NATIVE_mpn_addlsh1_n |
226 | | cy = mpn_addlsh1_n (bs2, b1, b2, t); |
227 | | if (t != n) |
228 | | cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy); |
229 | | cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n); |
230 | | #else |
231 | | cy = mpn_add_n (bs2, bs1, b2, t); |
232 | | if (t != n) |
233 | | cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy); |
234 | | cy += bs1[n]; |
235 | | cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1); |
236 | | cy -= mpn_sub_n (bs2, bs2, b0, n); |
237 | | #endif |
238 | | #endif |
239 | 0 | bs2[n] = cy; |
240 | |
|
241 | 0 | ASSERT (as1[n] <= 2); |
242 | 0 | ASSERT (bs1[n] <= 2); |
243 | 0 | ASSERT (asm1[n] <= 1); |
244 | 0 | ASSERT (bsm1[n] <= 1); |
245 | 0 | ASSERT (as2[n] <= 6); |
246 | 0 | ASSERT (bs2[n] <= 6); |
247 | |
|
248 | 0 | #define v0 pp /* 2n */ |
249 | 0 | #define v1 (pp + 2 * n) /* 2n+1 */ |
250 | 0 | #define vinf (pp + 4 * n) /* s+t */ |
251 | 0 | #define vm1 scratch /* 2n+1 */ |
252 | 0 | #define v2 (scratch + 2 * n + 1) /* 2n+2 */ |
253 | 0 | #define scratch_out (scratch + 5 * n + 5) |
254 | | |
255 | | /* vm1, 2n+1 limbs */ |
256 | | #ifdef SMALLER_RECURSION |
257 | | TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out); |
258 | | cy = 0; |
259 | | if (asm1[n] != 0) |
260 | | cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n); |
261 | | if (bsm1[n] != 0) |
262 | | cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n); |
263 | | vm1[2 * n] = cy; |
264 | | #else |
265 | 0 | vm1[2 * n] = 0; |
266 | 0 | TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + (bsm1[n] | asm1[n]), scratch_out); |
267 | 0 | #endif |
268 | |
|
269 | 0 | TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out); /* v2, 2n+1 limbs */ |
270 | | |
271 | | /* vinf, s+t limbs */ |
272 | 0 | if (s > t) mpn_mul (vinf, a2, s, b2, t); |
273 | 0 | else TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out); |
274 | |
|
275 | 0 | vinf0 = vinf[0]; /* v1 overlaps with this */ |
276 | |
|
277 | | #ifdef SMALLER_RECURSION |
278 | | /* v1, 2n+1 limbs */ |
279 | | TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out); |
280 | | if (as1[n] == 1) |
281 | | { |
282 | | cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n); |
283 | | } |
284 | | else if (as1[n] != 0) |
285 | | { |
286 | | #if HAVE_NATIVE_mpn_addlsh1_n_ip1 |
287 | | cy = 2 * bs1[n] + mpn_addlsh1_n_ip1 (v1 + n, bs1, n); |
288 | | #else |
289 | | cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2)); |
290 | | #endif |
291 | | } |
292 | | else |
293 | | cy = 0; |
294 | | if (bs1[n] == 1) |
295 | | { |
296 | | cy += mpn_add_n (v1 + n, v1 + n, as1, n); |
297 | | } |
298 | | else if (bs1[n] != 0) |
299 | | { |
300 | | #if HAVE_NATIVE_mpn_addlsh1_n_ip1 |
301 | | cy += mpn_addlsh1_n_ip1 (v1 + n, as1, n); |
302 | | #else |
303 | | cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2)); |
304 | | #endif |
305 | | } |
306 | | v1[2 * n] = cy; |
307 | | #else |
308 | 0 | cy = vinf[1]; |
309 | 0 | TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out); |
310 | 0 | vinf[1] = cy; |
311 | 0 | #endif |
312 | |
|
313 | 0 | TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out); /* v0, 2n limbs */ |
314 | |
|
315 | 0 | mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0); |
316 | 0 | } |