/src/gmp/mpn/toom_interpolate_16pts.c
/* Interpolation for the algorithm Toom-Cook 8.5-way.

   Contributed to the GNU project by Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2009, 2010, 2012, 2015, 2020 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"


#if GMP_NUMB_BITS < 29
#error Not implemented: Both sublsh_n(,,,28) should be corrected; r2 and r5 need one more LIMB.
#endif

#if GMP_NUMB_BITS < 28
#error Not implemented: divexact_by188513325 and _by182712915 will not work.
#endif


/* FIXME: tuneup should decide the best variant */
#ifndef AORSMUL_FASTER_AORS_AORSLSH
#define AORSMUL_FASTER_AORS_AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_AORS_2AORSLSH
#define AORSMUL_FASTER_AORS_2AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_2AORSLSH
#define AORSMUL_FASTER_2AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_3AORSLSH
#define AORSMUL_FASTER_3AORSLSH 1
#endif

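/* DO_mpn_sublsh_n(dst,src,n,s,ws) computes {dst,n} -= ({src,n} << s) and
   returns the borrow out of the top limb (the bits shifted out of the top
   of src plus the borrow of the subtraction).  Without a native
   mpn_sublsh_n, the shifted operand is formed in the scratch space ws.  */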
#if HAVE_NATIVE_mpn_sublsh_n
#define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
#else
static mp_limb_t
DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
{
#if USE_MUL_1 && 0
  return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
#else
  mp_limb_t __cy;
  __cy = mpn_lshift(ws,src,n,s);
  return __cy + mpn_sub_n(dst,dst,ws,n);
#endif
}
#endif

#if HAVE_NATIVE_mpn_addlsh_n
#define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
#else
#if !defined (AORSMUL_FASTER_2AORSLSH) && !defined (AORSMUL_FASTER_AORS_2AORSLSH)
static mp_limb_t
DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
{
#if USE_MUL_1 && 0
  return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
#else
  mp_limb_t __cy;
  __cy = mpn_lshift(ws,src,n,s);
  return __cy + mpn_add_n(dst,dst,ws,n);
#endif
}
#endif
#endif

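/* DO_mpn_subrsh(dst,nd,src,ns,s,ws) subtracts {src,ns} >> s (for
   0 < s < GMP_NUMB_BITS) from {dst,nd}: src[0] contributes its high bits
   src[0] >> s, and {src+1,ns-1} enters shifted left by GMP_NUMB_BITS - s.
   As the FIXME below notes, the fallback does not return a borrow, so the
   result must stay non-negative.  */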
#if HAVE_NATIVE_mpn_subrsh
#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
#else
/* FIXME: This is not a correct definition, it assumes no carry */
#define DO_mpn_subrsh(dst,nd,src,ns,s,ws)                                 \
do {                                                                      \
  mp_limb_t __cy;                                                         \
  MPN_DECR_U (dst, nd, src[0] >> s);                                      \
  __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws);   \
  MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy);                           \
} while (0)
#endif

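/* The interpolation below requires shifts by 42 bits: the values at +-8
   carry a scale factor of 8^14 = 2^42.  When limbs are narrower than 43
   bits this is not a valid single mpn shift count, so the shift is split
   into a whole-limb pointer offset (BIT_CORRECTION) plus a shift by
   42 - CORRECTION_BITS bits.  */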
#if GMP_NUMB_BITS < 43
#define BIT_CORRECTION 1
#define CORRECTION_BITS GMP_NUMB_BITS
#else
#define BIT_CORRECTION 0
#define CORRECTION_BITS 0
#endif

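/* The BINVERT_* values below are inverses of the corresponding odd numbers
   modulo 2^GMP_NUMB_BITS, as used by the binary exact divisions
   (mpn_pi1_bdiv_q_1).  E.g., with full 64-bit limbs BINVERT_9 evaluates to
   0x8E38E38E38E38E39, and indeed 9 * 0x8E38E38E38E38E39 = 1 (mod 2^64).  */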
#define BINVERT_9 \
  ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)

#define BINVERT_255 \
  (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))

/* FIXME: find some more general expressions for inverses */
#if GMP_LIMB_BITS == 32
#define BINVERT_2835 (GMP_NUMB_MASK & CNST_LIMB(0x53E3771B))
#define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0x9F314C35))
#define BINVERT_182712915 (GMP_NUMB_MASK & CNST_LIMB(0x550659DB))
#define BINVERT_188513325 (GMP_NUMB_MASK & CNST_LIMB(0xFBC333A5))
#define BINVERT_255x182712915L (GMP_NUMB_MASK & CNST_LIMB(0x6FC4CB25))
#define BINVERT_255x188513325L (GMP_NUMB_MASK & CNST_LIMB(0x6864275B))
#if GMP_NAIL_BITS == 0
#define BINVERT_255x182712915H CNST_LIMB(0x1B649A07)
#define BINVERT_255x188513325H CNST_LIMB(0x06DB993A)
#else /* GMP_NAIL_BITS != 0 */
#define BINVERT_255x182712915H \
  (GMP_NUMB_MASK & CNST_LIMB((0x1B649A07<<GMP_NAIL_BITS) | (0x6FC4CB25>>GMP_NUMB_BITS)))
#define BINVERT_255x188513325H \
  (GMP_NUMB_MASK & CNST_LIMB((0x06DB993A<<GMP_NAIL_BITS) | (0x6864275B>>GMP_NUMB_BITS)))
#endif
#else
#if GMP_LIMB_BITS == 64
#define BINVERT_2835 (GMP_NUMB_MASK & CNST_LIMB(0x938CC70553E3771B))
#define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0xE7B40D449F314C35))
#define BINVERT_255x182712915 (GMP_NUMB_MASK & CNST_LIMB(0x1B649A076FC4CB25))
#define BINVERT_255x188513325 (GMP_NUMB_MASK & CNST_LIMB(0x06DB993A6864275B))
#endif
#endif

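/* Exact division by 255.  When GMP_NUMB_BITS is a multiple of 8, 255
   divides 2^GMP_NUMB_BITS - 1, so mpn_bdiv_dbm1 (division by a divisor of
   B - 1) can be used; otherwise fall back to a binary exact division with
   the precomputed inverse, or to plain mpn_divexact_1.  */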
#ifndef mpn_divexact_by255
#if GMP_NUMB_BITS % 8 == 0
#define mpn_divexact_by255(dst,src,size) \
  (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
#else
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
#define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
#else
#define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
#endif
#endif
#endif

#ifndef mpn_divexact_by255x4
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
#define mpn_divexact_by255x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,2)
#else
#define mpn_divexact_by255x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255)<<2)
#endif
#endif

#ifndef mpn_divexact_by9x16
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
#define mpn_divexact_by9x16(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,4)
#else
#define mpn_divexact_by9x16(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<4)
#endif
#endif

#ifndef mpn_divexact_by42525x16
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
#define mpn_divexact_by42525x16(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,4)
#else
#define mpn_divexact_by42525x16(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525)<<4)
#endif
#endif

#ifndef mpn_divexact_by2835x64
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
#define mpn_divexact_by2835x64(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,6)
#else
#define mpn_divexact_by2835x64(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<6)
#endif
#endif

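/* 255 * 182712915 = 46591793325 and 255 * 188513325 = 48070897875 need 36
   bits each, so with narrower limbs the combined divisor does not fit in
   one limb and the exact division is performed in two steps, one factor at
   a time.  */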
#ifndef mpn_divexact_by255x182712915
#if GMP_NUMB_BITS < 36
#if HAVE_NATIVE_mpn_bdiv_q_2_pi2 && defined(BINVERT_255x182712915H)
/* FIXME: use mpn_bdiv_q_2_pi2 */
#endif
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_182712915)
#define mpn_divexact_by255x182712915(dst,src,size)                            \
  do {                                                                        \
    mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(182712915),BINVERT_182712915,0);  \
    mpn_divexact_by255(dst,dst,size);                                         \
  } while(0)
#else
#define mpn_divexact_by255x182712915(dst,src,size)       \
  do {                                                   \
    mpn_divexact_1(dst,src,size,CNST_LIMB(182712915));   \
    mpn_divexact_by255(dst,dst,size);                    \
  } while(0)
#endif
#else /* GMP_NUMB_BITS > 35 */
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_255x182712915)
#define mpn_divexact_by255x182712915(dst,src,size) \
  mpn_pi1_bdiv_q_1(dst,src,size,255*CNST_LIMB(182712915),BINVERT_255x182712915,0)
#else
#define mpn_divexact_by255x182712915(dst,src,size) mpn_divexact_1(dst,src,size,255*CNST_LIMB(182712915))
#endif
#endif /* GMP_NUMB_BITS >?< 36 */
#endif

#ifndef mpn_divexact_by255x188513325
#if GMP_NUMB_BITS < 36
#if HAVE_NATIVE_mpn_bdiv_q_1_pi2 && defined(BINVERT_255x188513325H)
/* FIXME: use mpn_bdiv_q_1_pi2 */
#endif
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_188513325)
#define mpn_divexact_by255x188513325(dst,src,size)                            \
  do {                                                                        \
    mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(188513325),BINVERT_188513325,0);  \
    mpn_divexact_by255(dst,dst,size);                                         \
  } while(0)
#else
#define mpn_divexact_by255x188513325(dst,src,size)       \
  do {                                                   \
    mpn_divexact_1(dst,src,size,CNST_LIMB(188513325));   \
    mpn_divexact_by255(dst,dst,size);                    \
  } while(0)
#endif
#else /* GMP_NUMB_BITS > 35 */
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_255x188513325)
#define mpn_divexact_by255x188513325(dst,src,size) \
  mpn_pi1_bdiv_q_1(dst,src,size,255*CNST_LIMB(188513325),BINVERT_255x188513325,0)
#else
#define mpn_divexact_by255x188513325(dst,src,size) mpn_divexact_1(dst,src,size,255*CNST_LIMB(188513325))
#endif
#endif /* GMP_NUMB_BITS >?< 36 */
#endif

/* Interpolation for Toom-8.5 (or Toom-8), using the evaluation
   points: infinity (8.5 only), +-8, +-4, +-2, +-1, +-1/4, +-1/2,
   +-1/8, 0.  More precisely, we want to compute
   f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 15 (or
   14), given the 16 (resp. 15) values:

     r0 = limit at infinity of f(x) / x^15,
     r1 = f(8), f(-8),
     r2 = f(4), f(-4),
     r3 = f(2), f(-2),
     r4 = f(1), f(-1),
     r5 = f(1/4), f(-1/4),
     r6 = f(1/2), f(-1/2),
     r7 = f(1/8), f(-1/8),
     r8 = f(0).

   All pairs of the form f(k), f(-k) must already be mixed with
   toom_couple_handling(f(k), ..., f(-k), ...).

   The result is stored in {pp, spt + 7*n (or 8*n)}.
   At entry, r8 is stored at {pp, 2n},
   r6 is stored at {pp + 3n, 3n + 1},
   r4 is stored at {pp + 7n, 3n + 1},
   r2 is stored at {pp + 11n, 3n + 1},
   r0 is stored at {pp + 15n, spt}.

   The other values are 3n+1 limbs each (with most significant limbs small).

   Negative intermediate results are stored in two's complement.
   Inputs are destroyed.
*/
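/* As an illustration of the mixing: r4 essentially combines f(1) and f(-1),
   from which (f(1) + f(-1))/2 = f0 + f2 + ... + f14 (even coefficients) and
   (f(1) - f(-1))/2 = f1 + f3 + ... + f15 (odd coefficients) are recovered;
   the other pairs behave the same way up to powers of 2.  */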

void
mpn_toom_interpolate_16pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5, mp_ptr r7,
                            mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
{
  mp_limb_t cy;
  mp_size_t n3;
  mp_size_t n3p1;
  n3 = 3 * n;
  n3p1 = n3 + 1;

#define r6 (pp + n3)      /* 3n+1 */
#define r4 (pp + 7 * n)   /* 3n+1 */
#define r2 (pp + 11 * n)  /* 3n+1 */
#define r0 (pp + 15 * n)  /* s+t <= 2*n */

  ASSERT (spt <= 2 * n);
  /******************************* interpolation *****************************/
  if (half != 0) {
    cy = mpn_sub_n (r4, r4, r0, spt);
    MPN_DECR_U (r4 + spt, n3p1 - spt, cy);

    cy = DO_mpn_sublsh_n (r3, r0, spt, 14, wsi);
    MPN_DECR_U (r3 + spt, n3p1 - spt, cy);
    DO_mpn_subrsh (r6, n3p1, r0, spt, 2, wsi);

    cy = DO_mpn_sublsh_n (r2, r0, spt, 28, wsi);
    MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
    DO_mpn_subrsh (r5, n3p1, r0, spt, 4, wsi);

    cy = DO_mpn_sublsh_n (r1 + BIT_CORRECTION, r0, spt, 42 - CORRECTION_BITS, wsi);
#if BIT_CORRECTION
    cy = mpn_sub_1 (r1 + spt + BIT_CORRECTION, r1 + spt + BIT_CORRECTION,
                    n3p1 - spt - BIT_CORRECTION, cy);
    ASSERT (BIT_CORRECTION > 0 || cy == 0);
    /* FIXME: assumes r7[n3p1] is writable (it is if r5 follows). */
    cy = r7[n3p1];
    r7[n3p1] = 0x80;
#else
    MPN_DECR_U (r1 + spt + BIT_CORRECTION, n3p1 - spt - BIT_CORRECTION, cy);
#endif
    DO_mpn_subrsh (r7, n3p1 + BIT_CORRECTION, r0, spt, 6, wsi);
#if BIT_CORRECTION
    /* FIXME: assumes r7[n3p1] is writable. */
    ASSERT (BIT_CORRECTION > 0 || r7[n3p1] == 0x80);
    r7[n3p1] = cy;
#endif
  }

  r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 28, wsi);
  DO_mpn_subrsh (r2 + n, 2 * n + 1, pp, 2 * n, 4, wsi);

#if HAVE_NATIVE_mpn_add_n_sub_n
  mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
#else
  mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
  ASSERT_NOCARRY (mpn_add_n (r2, r2, r5, n3p1));
  MP_PTR_SWAP (r5, wsi);
#endif
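  /* The block above is a butterfly: r2 and r5 are replaced by their sum and
     difference, separating even from odd contributions.  The generic path
     builds the (possibly negative, two's complement) difference in wsi and
     swaps pointers instead of copying limbs.  */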

  r6[n3] -= DO_mpn_sublsh_n (r6 + n, pp, 2 * n, 14, wsi);
  DO_mpn_subrsh (r3 + n, 2 * n + 1, pp, 2 * n, 2, wsi);

#if HAVE_NATIVE_mpn_add_n_sub_n
  mpn_add_n_sub_n (r3, r6, r6, r3, n3p1);
#else
  ASSERT_NOCARRY (mpn_add_n (wsi, r3, r6, n3p1));
  mpn_sub_n (r6, r6, r3, n3p1); /* can be negative */
  MP_PTR_SWAP (r3, wsi);
#endif

  cy = DO_mpn_sublsh_n (r7 + n + BIT_CORRECTION, pp, 2 * n, 42 - CORRECTION_BITS, wsi);
#if BIT_CORRECTION
  MPN_DECR_U (r1 + n, 2 * n + 1, pp[0] >> 6);
  cy = DO_mpn_sublsh_n (r1 + n, pp + 1, 2 * n - 1, GMP_NUMB_BITS - 6, wsi);
  cy = mpn_sub_1 (r1 + 3 * n - 1, r1 + 3 * n - 1, 2, cy);
  ASSERT (BIT_CORRECTION > 0 || cy != 0);
#else
  r7[n3] -= cy;
  DO_mpn_subrsh (r1 + n, 2 * n + 1, pp, 2 * n, 6, wsi);
#endif

#if HAVE_NATIVE_mpn_add_n_sub_n
  mpn_add_n_sub_n (r1, r7, r7, r1, n3p1);
#else
  mpn_sub_n (wsi, r7, r1, n3p1); /* can be negative */
  mpn_add_n (r1, r1, r7, n3p1); /* if BIT_CORRECTION != 0, can give a carry. */
  MP_PTR_SWAP (r7, wsi);
#endif

  r4[n3] -= mpn_sub_n (r4 + n, r4 + n, pp, 2 * n);

#if AORSMUL_FASTER_2AORSLSH
  mpn_submul_1 (r5, r6, n3p1, 1028); /* can be negative */
#else
  DO_mpn_sublsh_n (r5, r6, n3p1,  2, wsi); /* can be negative */
  DO_mpn_sublsh_n (r5, r6, n3p1, 10, wsi); /* can be negative */
#endif
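  /* 1028 = 2^2 + 2^10, so the single submul_1 above is equivalent to the
     two shifted subtractions of the fallback branch.  */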

  mpn_submul_1 (r7, r5, n3p1, 1300); /* can be negative */
#if AORSMUL_FASTER_3AORSLSH
  mpn_submul_1 (r7, r6, n3p1, 1052688); /* can be negative */
#else
  DO_mpn_sublsh_n (r7, r6, n3p1,  4, wsi); /* can be negative */
  DO_mpn_sublsh_n (r7, r6, n3p1, 12, wsi); /* can be negative */
  DO_mpn_sublsh_n (r7, r6, n3p1, 20, wsi); /* can be negative */
#endif
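  /* Likewise 1052688 = 2^4 + 2^12 + 2^20 matches the three shifted
     subtractions of the fallback branch.  */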
  mpn_divexact_by255x188513325 (r7, r7, n3p1);

  mpn_submul_1 (r5, r7, n3p1, 12567555); /* can be negative */
  /* A division by 2835x64 follows. Warning: the operand can be negative! */
  mpn_divexact_by2835x64 (r5, r5, n3p1);
  if ((r5[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS - 7))) != 0)
    r5[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS - 6));
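  /* The divisor includes the factor 64 = 2^6, so a negative (two's
     complement) quotient comes back without its 6 highest bits; when the
     remaining top bits show a negative value, the sign is extended by hand.
     The same fix-up follows the division by 255x4 (2 bits) below.  */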

#if AORSMUL_FASTER_AORS_AORSLSH
  mpn_submul_1 (r6, r7, n3p1, 4095); /* can be negative */
#else
  mpn_add_n (r6, r6, r7, n3p1); /* can give a carry */
  DO_mpn_sublsh_n (r6, r7, n3p1, 12, wsi); /* can be negative */
#endif
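  /* 4095 = 2^12 - 1, so subtracting 4095*r7 amounts to adding r7 and
     subtracting r7 << 12, as in the fallback branch.  */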
#if AORSMUL_FASTER_2AORSLSH
  mpn_addmul_1 (r6, r5, n3p1, 240); /* can be negative */
#else
  DO_mpn_addlsh_n (r6, r5, n3p1, 8, wsi); /* can give a carry */
  DO_mpn_sublsh_n (r6, r5, n3p1, 4, wsi); /* can be negative */
#endif
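  /* 240 = 2^8 - 2^4, matching the addlsh/sublsh pair of the fallback.  */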
  /* A division by 255x4 follows. Warning: the operand can be negative! */
  mpn_divexact_by255x4 (r6, r6, n3p1);
  if ((r6[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS - 3))) != 0)
    r6[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS - 2));

  ASSERT_NOCARRY (DO_mpn_sublsh_n (r3, r4, n3p1, 7, wsi));

  ASSERT_NOCARRY (DO_mpn_sublsh_n (r2, r4, n3p1, 13, wsi));
  ASSERT_NOCARRY (mpn_submul_1 (r2, r3, n3p1, 400));

  /* If GMP_NUMB_BITS < 42 next operations on r1 can give a carry! */
  DO_mpn_sublsh_n (r1, r4, n3p1, 19, wsi);
  mpn_submul_1 (r1, r2, n3p1, 1428);
  mpn_submul_1 (r1, r3, n3p1, 112896);
  mpn_divexact_by255x182712915 (r1, r1, n3p1);

  ASSERT_NOCARRY (mpn_submul_1 (r2, r1, n3p1, 15181425));
  mpn_divexact_by42525x16 (r2, r2, n3p1);

#if AORSMUL_FASTER_AORS_2AORSLSH
  ASSERT_NOCARRY (mpn_submul_1 (r3, r1, n3p1, 3969));
#else
  ASSERT_NOCARRY (mpn_sub_n (r3, r3, r1, n3p1));
  ASSERT_NOCARRY (DO_mpn_addlsh_n (r3, r1, n3p1, 7, wsi));
  ASSERT_NOCARRY (DO_mpn_sublsh_n (r3, r1, n3p1, 12, wsi));
#endif
  ASSERT_NOCARRY (mpn_submul_1 (r3, r2, n3p1, 900));
  mpn_divexact_by9x16 (r3, r3, n3p1);

  ASSERT_NOCARRY (mpn_sub_n (r4, r4, r1, n3p1));
  ASSERT_NOCARRY (mpn_sub_n (r4, r4, r3, n3p1));
  ASSERT_NOCARRY (mpn_sub_n (r4, r4, r2, n3p1));

#ifdef HAVE_NATIVE_mpn_rsh1add_n
  mpn_rsh1add_n (r6, r2, r6, n3p1);
  r6[n3p1 - 1] &= GMP_NUMB_MASK >> 1;
#else
  mpn_add_n (r6, r2, r6, n3p1);
  ASSERT_NOCARRY (mpn_rshift (r6, r6, n3p1, 1));
#endif
  ASSERT_NOCARRY (mpn_sub_n (r2, r2, r6, n3p1));

#ifdef HAVE_NATIVE_mpn_rsh1sub_n
  mpn_rsh1sub_n (r5, r3, r5, n3p1);
  r5[n3p1 - 1] &= GMP_NUMB_MASK >> 1;
#else
  mpn_sub_n (r5, r3, r5, n3p1);
  ASSERT_NOCARRY (mpn_rshift (r5, r5, n3p1, 1));
#endif
  ASSERT_NOCARRY (mpn_sub_n (r3, r3, r5, n3p1));

#ifdef HAVE_NATIVE_mpn_rsh1add_n
  mpn_rsh1add_n (r7, r1, r7, n3p1);
  r7[n3p1 - 1] &= GMP_NUMB_MASK >> 1;
#else
  mpn_add_n (r7, r1, r7, n3p1);
  ASSERT_NOCARRY (mpn_rshift (r7, r7, n3p1, 1));
#endif
  ASSERT_NOCARRY (mpn_sub_n (r1, r1, r7, n3p1));

  /* last interpolation steps... */
  /* ... could be mixed with recomposition
	||H-r7|M-r7|L-r7|		||H-r5|M-r5|L-r5|
  */

  /***************************** recomposition *******************************/
  /*
    pp[] prior to operations:
    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|___||H r6|M r6|L r6|____|H_r8|L r8|pp

    summation scheme for remaining operations:
    |__16|n_15|n_14|n_13|n_12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|___||H r6|M r6|L r6|____|H_r8|L r8|pp
     ||H r1|M r1|L r1|   ||H r3|M r3|L r3|   ||H_r5|M_r5|L_r5|   ||H r7|M r7|L r7|
  */
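  /* Each remaining value r7, r5, r3, r1 is 3n+1 limbs long and is added
     into pp at offset n, 5n, 9n and 13n respectively, with each block's
     carry propagated into the higher limbs of pp.  */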

  cy = mpn_add_n (pp + n, pp + n, r7, n);
  cy = mpn_add_1 (pp + 2 * n, r7 + n, n, cy);
#if HAVE_NATIVE_mpn_add_nc
  cy = r7[n3] + mpn_add_nc (pp + n3, pp + n3, r7 + 2 * n, n, cy);
#else
  MPN_INCR_U (r7 + 2 * n, n + 1, cy);
  cy = r7[n3] + mpn_add_n (pp + n3, pp + n3, r7 + 2 * n, n);
#endif
  MPN_INCR_U (pp + 4 * n, 2 * n + 1, cy);

  pp[2 * n3] += mpn_add_n (pp + 5 * n, pp + 5 * n, r5, n);
  cy = mpn_add_1 (pp + 2 * n3, r5 + n, n, pp[2 * n3]);
#if HAVE_NATIVE_mpn_add_nc
  cy = r5[n3] + mpn_add_nc (pp + 7 * n, pp + 7 * n, r5 + 2 * n, n, cy);
#else
  MPN_INCR_U (r5 + 2 * n, n + 1, cy);
  cy = r5[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r5 + 2 * n, n);
#endif
  MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);

  pp[10 * n] += mpn_add_n (pp + 9 * n, pp + 9 * n, r3, n);
  cy = mpn_add_1 (pp + 10 * n, r3 + n, n, pp[10 * n]);
#if HAVE_NATIVE_mpn_add_nc
  cy = r3[n3] + mpn_add_nc (pp + 11 * n, pp + 11 * n, r3 + 2 * n, n, cy);
#else
  MPN_INCR_U (r3 + 2 * n, n + 1, cy);
  cy = r3[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r3 + 2 * n, n);
#endif
  MPN_INCR_U (pp + 12 * n, 2 * n + 1, cy);

  pp[14 * n] += mpn_add_n (pp + 13 * n, pp + 13 * n, r1, n);
  if (half) {
    cy = mpn_add_1 (pp + 14 * n, r1 + n, n, pp[14 * n]);
#if HAVE_NATIVE_mpn_add_nc
    if (LIKELY (spt > n)) {
      cy = r1[n3] + mpn_add_nc (pp + 15 * n, pp + 15 * n, r1 + 2 * n, n, cy);
      MPN_INCR_U (pp + 16 * n, spt - n, cy);
    } else {
      ASSERT_NOCARRY (mpn_add_nc (pp + 15 * n, pp + 15 * n, r1 + 2 * n, spt, cy));
    }
#else
    MPN_INCR_U (r1 + 2 * n, n + 1, cy);
    if (LIKELY (spt > n)) {
      cy = r1[n3] + mpn_add_n (pp + 15 * n, pp + 15 * n, r1 + 2 * n, n);
      MPN_INCR_U (pp + 16 * n, spt - n, cy);
    } else {
      ASSERT_NOCARRY (mpn_add_n (pp + 15 * n, pp + 15 * n, r1 + 2 * n, spt));
    }
#endif
  } else {
    ASSERT_NOCARRY (mpn_add_1 (pp + 14 * n, r1 + n, spt, pp[14 * n]));
  }

#undef r0
#undef r2
#undef r4
#undef r6
}