/src/gmp/mpn/toom_interpolate_12pts.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* Interpolation for the algorithm Toom-Cook 6.5-way. |
2 | | |
3 | | Contributed to the GNU project by Marco Bodrato. |
4 | | |
5 | | THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY |
6 | | SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST |
7 | | GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. |
8 | | |
9 | | Copyright 2009, 2010, 2012, 2015, 2020 Free Software Foundation, Inc. |
10 | | |
11 | | This file is part of the GNU MP Library. |
12 | | |
13 | | The GNU MP Library is free software; you can redistribute it and/or modify |
14 | | it under the terms of either: |
15 | | |
16 | | * the GNU Lesser General Public License as published by the Free |
17 | | Software Foundation; either version 3 of the License, or (at your |
18 | | option) any later version. |
19 | | |
20 | | or |
21 | | |
22 | | * the GNU General Public License as published by the Free Software |
23 | | Foundation; either version 2 of the License, or (at your option) any |
24 | | later version. |
25 | | |
26 | | or both in parallel, as here. |
27 | | |
28 | | The GNU MP Library is distributed in the hope that it will be useful, but |
29 | | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
30 | | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
31 | | for more details. |
32 | | |
33 | | You should have received copies of the GNU General Public License and the |
34 | | GNU Lesser General Public License along with the GNU MP Library. If not, |
35 | | see https://www.gnu.org/licenses/. */ |
36 | | |
37 | | |
38 | | #include "gmp-impl.h" |
39 | | |
40 | | |
41 | | #if GMP_NUMB_BITS < 21 |
42 | | #error Not implemented: Both sublsh_n(,,,20) should be corrected. |
43 | | #endif |
44 | | |
45 | | #if GMP_NUMB_BITS < 16 |
46 | | #error Not implemented: divexact_by42525 needs splitting. |
47 | | #endif |
48 | | |
49 | | #if GMP_NUMB_BITS < 12 |
50 | | #error Not implemented: Hard to adapt... |
51 | | #endif |
52 | | |
53 | | |
54 | | /* FIXME: tuneup should decide the best variant */ |
55 | | #ifndef AORSMUL_FASTER_AORS_AORSLSH |
56 | | #define AORSMUL_FASTER_AORS_AORSLSH 1 |
57 | | #endif |
58 | | #ifndef AORSMUL_FASTER_AORS_2AORSLSH |
59 | | #define AORSMUL_FASTER_AORS_2AORSLSH 1 |
60 | | #endif |
61 | | #ifndef AORSMUL_FASTER_2AORSLSH |
62 | | #define AORSMUL_FASTER_2AORSLSH 1 |
63 | | #endif |
64 | | #ifndef AORSMUL_FASTER_3AORSLSH |
65 | | #define AORSMUL_FASTER_3AORSLSH 1 |
66 | | #endif |
67 | | |
68 | | |
69 | | #if HAVE_NATIVE_mpn_sublsh_n |
70 | | #define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s) |
71 | | #else |
72 | | static mp_limb_t |
73 | | DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws) |
74 | 0 | { |
75 | | #if USE_MUL_1 && 0 |
76 | | return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s)); |
77 | | #else |
78 | 0 | mp_limb_t __cy; |
79 | 0 | __cy = mpn_lshift(ws,src,n,s); |
80 | 0 | return __cy + mpn_sub_n(dst,dst,ws,n); |
81 | 0 | #endif |
82 | 0 | } |
83 | | #endif |
84 | | |
85 | | #if HAVE_NATIVE_mpn_addlsh_n |
86 | | #define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s) |
87 | | #else |
88 | | #if !defined (AORSMUL_FASTER_2AORSLSH) && !defined (AORSMUL_FASTER_AORS_2AORSLSH) |
89 | | static mp_limb_t |
90 | | DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws) |
91 | | { |
92 | | #if USE_MUL_1 && 0 |
93 | | return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s)); |
94 | | #else |
95 | | mp_limb_t __cy; |
96 | | __cy = mpn_lshift(ws,src,n,s); |
97 | | return __cy + mpn_add_n(dst,dst,ws,n); |
98 | | #endif |
99 | | } |
100 | | #endif |
101 | | #endif |
102 | | |
103 | | #if HAVE_NATIVE_mpn_subrsh |
104 | | #define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s) |
105 | | #else |
106 | | /* FIXME: This is not a correct definition, it assumes no carry */ |
107 | 0 | #define DO_mpn_subrsh(dst,nd,src,ns,s,ws) \ |
108 | 0 | do { \ |
109 | 0 | mp_limb_t __cy; \ |
110 | 0 | MPN_DECR_U (dst, nd, src[0] >> s); \ |
111 | 0 | __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws); \ |
112 | 0 | MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy); \ |
113 | 0 | } while (0) |
114 | | #endif |
115 | | |
116 | | |
117 | | #define BINVERT_9 \ |
118 | 0 | ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39) |
119 | | |
120 | | #define BINVERT_255 \ |
121 | | (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8))) |
122 | | |
123 | | /* FIXME: find some more general expressions for 2835^-1, 42525^-1 */ |
124 | | #if GMP_LIMB_BITS == 32 |
125 | | #define BINVERT_2835 (GMP_NUMB_MASK & CNST_LIMB(0x53E3771B)) |
126 | | #define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0x9F314C35)) |
127 | | #else |
128 | | #if GMP_LIMB_BITS == 64 |
129 | 0 | #define BINVERT_2835 (GMP_NUMB_MASK & CNST_LIMB(0x938CC70553E3771B)) |
130 | 0 | #define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0xE7B40D449F314C35)) |
131 | | #endif |
132 | | #endif |
133 | | |
134 | | #ifndef mpn_divexact_by255 |
135 | | #if GMP_NUMB_BITS % 8 == 0 |
136 | | #define mpn_divexact_by255(dst,src,size) \ |
137 | 0 | (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255))) |
138 | | #else |
139 | | #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 |
140 | | #define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0) |
141 | | #else |
142 | | #define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255)) |
143 | | #endif |
144 | | #endif |
145 | | #endif |
146 | | |
147 | | #ifndef mpn_divexact_by9x4 |
148 | | #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 |
149 | 0 | #define mpn_divexact_by9x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,2) |
150 | | #else |
151 | | #define mpn_divexact_by9x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<2) |
152 | | #endif |
153 | | #endif |
154 | | |
155 | | #ifndef mpn_divexact_by42525 |
156 | | #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525) |
157 | 0 | #define mpn_divexact_by42525(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,0) |
158 | | #else |
159 | | #define mpn_divexact_by42525(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525)) |
160 | | #endif |
161 | | #endif |
162 | | |
163 | | #ifndef mpn_divexact_by2835x4 |
164 | | #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835) |
165 | 0 | #define mpn_divexact_by2835x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,2) |
166 | | #else |
167 | | #define mpn_divexact_by2835x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<2) |
168 | | #endif |
169 | | #endif |
170 | | |
171 | | /* Interpolation for Toom-6.5 (or Toom-6), using the evaluation |
172 | | points: infinity(6.5 only), +-4, +-2, +-1, +-1/4, +-1/2, 0. More precisely, |
173 | | we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of |
174 | | degree 11 (or 10), given the 12 (rsp. 11) values: |
175 | | |
176 | | r0 = limit at infinity of f(x) / x^11, |
177 | | r1 = f(4),f(-4), |
178 | | r2 = f(2),f(-2), |
179 | | r3 = f(1),f(-1), |
180 | | r4 = f(1/4),f(-1/4), |
181 | | r5 = f(1/2),f(-1/2), |
182 | | r6 = f(0). |
183 | | |
184 | | All couples of the form f(n),f(-n) must be already mixed with |
185 | | toom_couple_handling(f(n),...,f(-n),...) |
186 | | |
187 | | The result is stored in {pp, spt + 7*n (or 6*n)}. |
188 | | At entry, r6 is stored at {pp, 2n}, |
189 | | r4 is stored at {pp + 3n, 3n + 1}. |
190 | | r2 is stored at {pp + 7n, 3n + 1}. |
191 | | r0 is stored at {pp +11n, spt}. |
192 | | |
193 | | The other values are 3n+1 limbs each (with most significant limbs small). |
194 | | |
195 | | Negative intermediate results are stored two-complemented. |
196 | | Inputs are destroyed. |
197 | | */ |
198 | | |
199 | | void |
200 | | mpn_toom_interpolate_12pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5, |
201 | | mp_size_t n, mp_size_t spt, int half, mp_ptr wsi) |
202 | 0 | { |
203 | 0 | mp_limb_t cy; |
204 | 0 | mp_size_t n3; |
205 | 0 | mp_size_t n3p1; |
206 | 0 | n3 = 3 * n; |
207 | 0 | n3p1 = n3 + 1; |
208 | |
|
209 | 0 | #define r4 (pp + n3) /* 3n+1 */ |
210 | 0 | #define r2 (pp + 7 * n) /* 3n+1 */ |
211 | 0 | #define r0 (pp +11 * n) /* s+t <= 2*n */ |
212 | | |
213 | | /******************************* interpolation *****************************/ |
214 | 0 | if (half != 0) { |
215 | 0 | cy = mpn_sub_n (r3, r3, r0, spt); |
216 | 0 | MPN_DECR_U (r3 + spt, n3p1 - spt, cy); |
217 | |
|
218 | 0 | cy = DO_mpn_sublsh_n (r2, r0, spt, 10, wsi); |
219 | 0 | MPN_DECR_U (r2 + spt, n3p1 - spt, cy); |
220 | 0 | DO_mpn_subrsh(r5, n3p1, r0, spt, 2, wsi); |
221 | |
|
222 | 0 | cy = DO_mpn_sublsh_n (r1, r0, spt, 20, wsi); |
223 | 0 | MPN_DECR_U (r1 + spt, n3p1 - spt, cy); |
224 | 0 | DO_mpn_subrsh(r4, n3p1, r0, spt, 4, wsi); |
225 | 0 | }; |
226 | |
|
227 | 0 | r4[n3] -= DO_mpn_sublsh_n (r4 + n, pp, 2 * n, 20, wsi); |
228 | 0 | DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 4, wsi); |
229 | |
|
230 | | #if HAVE_NATIVE_mpn_add_n_sub_n |
231 | | mpn_add_n_sub_n (r1, r4, r4, r1, n3p1); |
232 | | #else |
233 | 0 | ASSERT_NOCARRY(mpn_add_n (wsi, r1, r4, n3p1)); |
234 | 0 | mpn_sub_n (r4, r4, r1, n3p1); /* can be negative */ |
235 | 0 | MP_PTR_SWAP(r1, wsi); |
236 | 0 | #endif |
237 | |
|
238 | 0 | r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 10, wsi); |
239 | 0 | DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 2, wsi); |
240 | |
|
241 | | #if HAVE_NATIVE_mpn_add_n_sub_n |
242 | | mpn_add_n_sub_n (r2, r5, r5, r2, n3p1); |
243 | | #else |
244 | 0 | mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */ |
245 | 0 | ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1)); |
246 | 0 | MP_PTR_SWAP(r5, wsi); |
247 | 0 | #endif |
248 | |
|
249 | 0 | r3[n3] -= mpn_sub_n (r3+n, r3+n, pp, 2 * n); |
250 | |
|
251 | 0 | #if AORSMUL_FASTER_AORS_AORSLSH |
252 | 0 | mpn_submul_1 (r4, r5, n3p1, 257); /* can be negative */ |
253 | | #else |
254 | | mpn_sub_n (r4, r4, r5, n3p1); /* can be negative */ |
255 | | DO_mpn_sublsh_n (r4, r5, n3p1, 8, wsi); /* can be negative */ |
256 | | #endif |
257 | | /* A division by 2835x4 follows. Warning: the operand can be negative! */ |
258 | 0 | mpn_divexact_by2835x4(r4, r4, n3p1); |
259 | 0 | if ((r4[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0) |
260 | 0 | r4[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2)); |
261 | |
|
262 | 0 | #if AORSMUL_FASTER_2AORSLSH |
263 | 0 | mpn_addmul_1 (r5, r4, n3p1, 60); /* can be negative */ |
264 | | #else |
265 | | DO_mpn_sublsh_n (r5, r4, n3p1, 2, wsi); /* can be negative */ |
266 | | DO_mpn_addlsh_n (r5, r4, n3p1, 6, wsi); /* can give a carry */ |
267 | | #endif |
268 | 0 | mpn_divexact_by255(r5, r5, n3p1); |
269 | |
|
270 | 0 | ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r3, n3p1, 5, wsi)); |
271 | |
|
272 | 0 | #if AORSMUL_FASTER_3AORSLSH |
273 | 0 | ASSERT_NOCARRY(mpn_submul_1 (r1, r2, n3p1, 100)); |
274 | | #else |
275 | | ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 6, wsi)); |
276 | | ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 5, wsi)); |
277 | | ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 2, wsi)); |
278 | | #endif |
279 | 0 | ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r3, n3p1, 9, wsi)); |
280 | 0 | mpn_divexact_by42525(r1, r1, n3p1); |
281 | |
|
282 | 0 | #if AORSMUL_FASTER_AORS_2AORSLSH |
283 | 0 | ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 225)); |
284 | | #else |
285 | | ASSERT_NOCARRY(mpn_sub_n (r2, r2, r1, n3p1)); |
286 | | ASSERT_NOCARRY(DO_mpn_addlsh_n (r2, r1, n3p1, 5, wsi)); |
287 | | ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r1, n3p1, 8, wsi)); |
288 | | #endif |
289 | 0 | mpn_divexact_by9x4(r2, r2, n3p1); |
290 | |
|
291 | 0 | ASSERT_NOCARRY(mpn_sub_n (r3, r3, r2, n3p1)); |
292 | |
|
293 | 0 | #ifdef HAVE_NATIVE_mpn_rsh1sub_n |
294 | 0 | mpn_rsh1sub_n (r4, r2, r4, n3p1); |
295 | 0 | r4 [n3p1 - 1] &= GMP_NUMB_MASK >> 1; |
296 | | #else |
297 | | mpn_sub_n (r4, r2, r4, n3p1); |
298 | | ASSERT_NOCARRY(mpn_rshift(r4, r4, n3p1, 1)); |
299 | | #endif |
300 | 0 | ASSERT_NOCARRY(mpn_sub_n (r2, r2, r4, n3p1)); |
301 | |
|
302 | 0 | #ifdef HAVE_NATIVE_mpn_rsh1add_n |
303 | 0 | mpn_rsh1add_n (r5, r5, r1, n3p1); |
304 | 0 | r5 [n3p1 - 1] &= GMP_NUMB_MASK >> 1; |
305 | | #else |
306 | | mpn_add_n (r5, r5, r1, n3p1); |
307 | | ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1)); |
308 | | #endif |
309 | | |
310 | | /* last interpolation steps... */ |
311 | 0 | ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1)); |
312 | 0 | ASSERT_NOCARRY(mpn_sub_n (r1, r1, r5, n3p1)); |
313 | | /* ... could be mixed with recomposition |
314 | | ||H-r5|M-r5|L-r5| ||H-r1|M-r1|L-r1| |
315 | | */ |
316 | | |
317 | | /***************************** recomposition *******************************/ |
318 | | /* |
319 | | pp[] prior to operations: |
320 | | |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp |
321 | | |
322 | | summation scheme for remaining operations: |
323 | | |__12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp |
324 | | |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp |
325 | | ||H r1|M r1|L r1| ||H r3|M r3|L r3| ||H_r5|M_r5|L_r5| |
326 | | */ |
327 | |
|
328 | 0 | cy = mpn_add_n (pp + n, pp + n, r5, n); |
329 | 0 | cy = mpn_add_1 (pp + 2 * n, r5 + n, n, cy); |
330 | 0 | #if HAVE_NATIVE_mpn_add_nc |
331 | 0 | cy = r5[n3] + mpn_add_nc(pp + n3, pp + n3, r5 + 2 * n, n, cy); |
332 | | #else |
333 | | MPN_INCR_U (r5 + 2 * n, n + 1, cy); |
334 | | cy = r5[n3] + mpn_add_n (pp + n3, pp + n3, r5 + 2 * n, n); |
335 | | #endif |
336 | 0 | MPN_INCR_U (pp + n3 + n, 2 * n + 1, cy); |
337 | |
|
338 | 0 | pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r3, n); |
339 | 0 | cy = mpn_add_1 (pp + 2 * n3, r3 + n, n, pp[2 * n3]); |
340 | 0 | #if HAVE_NATIVE_mpn_add_nc |
341 | 0 | cy = r3[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r3 + 2 * n, n, cy); |
342 | | #else |
343 | | MPN_INCR_U (r3 + 2 * n, n + 1, cy); |
344 | | cy = r3[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r3 + 2 * n, n); |
345 | | #endif |
346 | 0 | MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy); |
347 | |
|
348 | 0 | pp[10*n]+=mpn_add_n (pp + 9 * n, pp + 9 * n, r1, n); |
349 | 0 | if (half) { |
350 | 0 | cy = mpn_add_1 (pp + 10 * n, r1 + n, n, pp[10 * n]); |
351 | 0 | #if HAVE_NATIVE_mpn_add_nc |
352 | 0 | if (LIKELY (spt > n)) { |
353 | 0 | cy = r1[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, n, cy); |
354 | 0 | MPN_INCR_U (pp + 4 * n3, spt - n, cy); |
355 | 0 | } else { |
356 | 0 | ASSERT_NOCARRY(mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt, cy)); |
357 | 0 | } |
358 | | #else |
359 | | MPN_INCR_U (r1 + 2 * n, n + 1, cy); |
360 | | if (LIKELY (spt > n)) { |
361 | | cy = r1[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, n); |
362 | | MPN_INCR_U (pp + 4 * n3, spt - n, cy); |
363 | | } else { |
364 | | ASSERT_NOCARRY(mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt)); |
365 | | } |
366 | | #endif |
367 | 0 | } else { |
368 | 0 | ASSERT_NOCARRY(mpn_add_1 (pp + 10 * n, r1 + n, spt, pp[10 * n])); |
369 | 0 | } |
370 | |
|
371 | 0 | #undef r0 |
372 | 0 | #undef r2 |
373 | 0 | #undef r4 |
374 | 0 | } |