/src/gmp/mpn/toom_interpolate_7pts.c
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /* mpn_toom_interpolate_7pts -- Interpolate for toom44, 53, 62.  | 
2  |  |  | 
3  |  |    Contributed to the GNU project by Niels Möller.  | 
4  |  |    Improvements by Marco Bodrato.  | 
5  |  |  | 
6  |  |    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY  | 
7  |  |    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST  | 
8  |  |    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.  | 
9  |  |  | 
10  |  | Copyright 2006, 2007, 2009, 2014, 2015 Free Software Foundation, Inc.  | 
11  |  |  | 
12  |  | This file is part of the GNU MP Library.  | 
13  |  |  | 
14  |  | The GNU MP Library is free software; you can redistribute it and/or modify  | 
15  |  | it under the terms of either:  | 
16  |  |  | 
17  |  |   * the GNU Lesser General Public License as published by the Free  | 
18  |  |     Software Foundation; either version 3 of the License, or (at your  | 
19  |  |     option) any later version.  | 
20  |  |  | 
21  |  | or  | 
22  |  |  | 
23  |  |   * the GNU General Public License as published by the Free Software  | 
24  |  |     Foundation; either version 2 of the License, or (at your option) any  | 
25  |  |     later version.  | 
26  |  |  | 
27  |  | or both in parallel, as here.  | 
28  |  |  | 
29  |  | The GNU MP Library is distributed in the hope that it will be useful, but  | 
30  |  | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY  | 
31  |  | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License  | 
32  |  | for more details.  | 
33  |  |  | 
34  |  | You should have received copies of the GNU General Public License and the  | 
35  |  | GNU Lesser General Public License along with the GNU MP Library.  If not,  | 
36  |  | see https://www.gnu.org/licenses/.  */  | 
37  |  |  | 
38  |  | #include "gmp-impl.h"  | 
39  |  |  | 
40  |  | #define BINVERT_3 MODLIMB_INVERSE_3  | 
41  |  |  | 
42  |  | #define BINVERT_9 \  | 
43  | 0  |   ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)  | 
44  |  |  | 
45  |  | #define BINVERT_15 \  | 
46  |  |   ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 4)) / 15) * 14 * 16 & GMP_NUMB_MAX) + 15)  | 
47  |  |  | 
48  |  | /* For the various mpn_divexact_byN here, fall back to using either  | 
49  |  |    mpn_pi1_bdiv_q_1 or mpn_divexact_1.  The former has less overhead and is  | 
50  |  |    many faster if it is native.  For now, since mpn_divexact_1 is native on  | 
51  |  |    several platforms where mpn_pi1_bdiv_q_1 does not yet exist, do not use  | 
52  |  |    mpn_pi1_bdiv_q_1 unconditionally.  FIXME.  */  | 
53  |  |  | 
54  |  | /* For odd divisors, mpn_divexact_1 works fine with two's complement. */  | 
55  |  | #ifndef mpn_divexact_by3  | 
56  |  | #if HAVE_NATIVE_mpn_pi1_bdiv_q_1  | 
57  |  | #define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0)  | 
58  |  | #else  | 
59  |  | #define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3)  | 
60  |  | #endif  | 
61  |  | #endif  | 
62  |  |  | 
63  |  | #ifndef mpn_divexact_by9  | 
64  |  | #if HAVE_NATIVE_mpn_pi1_bdiv_q_1  | 
65  | 0  | #define mpn_divexact_by9(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,9,BINVERT_9,0)  | 
66  |  | #else  | 
67  |  | #define mpn_divexact_by9(dst,src,size) mpn_divexact_1(dst,src,size,9)  | 
68  |  | #endif  | 
69  |  | #endif  | 
70  |  |  | 
71  |  | #ifndef mpn_divexact_by15  | 
72  |  | #if HAVE_NATIVE_mpn_pi1_bdiv_q_1  | 
73  |  | #define mpn_divexact_by15(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,15,BINVERT_15,0)  | 
74  |  | #else  | 
75  |  | #define mpn_divexact_by15(dst,src,size) mpn_divexact_1(dst,src,size,15)  | 
76  |  | #endif  | 
77  |  | #endif  | 
78  |  |  | 
79  |  | /* Interpolation for toom4, using the evaluation points 0, infinity,  | 
80  |  |    1, -1, 2, -2, 1/2. More precisely, we want to compute  | 
81  |  |    f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 6, given the  | 
82  |  |    seven values  | 
83  |  |  | 
84  |  |      w0 = f(0),  | 
85  |  |      w1 = f(-2),  | 
86  |  |      w2 = f(1),  | 
87  |  |      w3 = f(-1),  | 
88  |  |      w4 = f(2)  | 
89  |  |      w5 = 64 * f(1/2)  | 
90  |  |      w6 = limit at infinity of f(x) / x^6,  | 
91  |  |  | 
92  |  |    The result is 6*n + w6n limbs. At entry, w0 is stored at {rp, 2n }, | 
93  |  |    w2 is stored at { rp + 2n, 2n+1 }, and w6 is stored at { rp + 6n, | 
94  |  |    w6n }. The other values are 2n + 1 limbs each (with most  | 
95  |  |    significant limbs small). f(-1) and f(-1/2) may be negative, signs  | 
96  |  |    determined by the flag bits. Inputs are destroyed.  | 
97  |  |  | 
98  |  |    Needs (2*n + 1) limbs of temporary storage.  | 
99  |  | */  | 
100  |  |  | 
101  |  | void  | 
102  |  | mpn_toom_interpolate_7pts (mp_ptr rp, mp_size_t n, enum toom7_flags flags,  | 
103  |  |          mp_ptr w1, mp_ptr w3, mp_ptr w4, mp_ptr w5,  | 
104  |  |          mp_size_t w6n, mp_ptr tp)  | 
105  | 0  | { | 
106  | 0  |   mp_size_t m;  | 
107  | 0  |   mp_limb_t cy;  | 
108  |  | 
  | 
109  | 0  |   m = 2*n + 1;  | 
110  | 0  | #define w0 rp  | 
111  | 0  | #define w2 (rp + 2*n)  | 
112  | 0  | #define w6 (rp + 6*n)  | 
113  |  | 
  | 
114  | 0  |   ASSERT (w6n > 0);  | 
115  | 0  |   ASSERT (w6n <= 2*n);  | 
116  |  |  | 
117  |  |   /* Using formulas similar to Marco Bodrato's  | 
118  |  |  | 
119  |  |      W5 = W5 + W4  | 
120  |  |      W1 =(W4 - W1)/2  | 
121  |  |      W4 = W4 - W0  | 
122  |  |      W4 =(W4 - W1)/4 - W6*16  | 
123  |  |      W3 =(W2 - W3)/2  | 
124  |  |      W2 = W2 - W3  | 
125  |  |  | 
126  |  |      W5 = W5 - W2*65      May be negative.  | 
127  |  |      W2 = W2 - W6 - W0  | 
128  |  |      W5 =(W5 + W2*45)/2   Now >= 0 again.  | 
129  |  |      W4 =(W4 - W2)/3  | 
130  |  |      W2 = W2 - W4  | 
131  |  |  | 
132  |  |      W1 = W5 - W1         May be negative.  | 
133  |  |      W5 =(W5 - W3*8)/9  | 
134  |  |      W3 = W3 - W5  | 
135  |  |      W1 =(W1/15 + W5)/2   Now >= 0 again.  | 
136  |  |      W5 = W5 - W1  | 
137  |  |  | 
138  |  |      where W0 = f(0), W1 = f(-2), W2 = f(1), W3 = f(-1),  | 
139  |  |      W4 = f(2), W5 = f(1/2), W6 = f(oo),  | 
140  |  |  | 
141  |  |      Note that most intermediate results are positive; the ones that  | 
142  |  |      may be negative are represented in two's complement. We must  | 
143  |  |      never shift right a value that may be negative, since that would  | 
144  |  |      invalidate the sign bit. On the other hand, divexact by odd  | 
145  |  |      numbers work fine with two's complement.  | 
146  |  |   */  | 
147  |  | 
  | 
148  | 0  |   mpn_add_n (w5, w5, w4, m);  | 
149  | 0  |   if (flags & toom7_w1_neg)  | 
150  | 0  |     { | 
151  | 0  | #ifdef HAVE_NATIVE_mpn_rsh1add_n  | 
152  | 0  |       mpn_rsh1add_n (w1, w1, w4, m);  | 
153  |  | #else  | 
154  |  |       mpn_add_n (w1, w1, w4, m);  ASSERT (!(w1[0] & 1));  | 
155  |  |       mpn_rshift (w1, w1, m, 1);  | 
156  |  | #endif  | 
157  | 0  |     }  | 
158  | 0  |   else  | 
159  | 0  |     { | 
160  | 0  | #ifdef HAVE_NATIVE_mpn_rsh1sub_n  | 
161  | 0  |       mpn_rsh1sub_n (w1, w4, w1, m);  | 
162  |  | #else  | 
163  |  |       mpn_sub_n (w1, w4, w1, m);  ASSERT (!(w1[0] & 1));  | 
164  |  |       mpn_rshift (w1, w1, m, 1);  | 
165  |  | #endif  | 
166  | 0  |     }  | 
167  | 0  |   mpn_sub (w4, w4, m, w0, 2*n);  | 
168  | 0  |   mpn_sub_n (w4, w4, w1, m);  ASSERT (!(w4[0] & 3));  | 
169  | 0  |   mpn_rshift (w4, w4, m, 2); /* w4>=0 */  | 
170  |  | 
  | 
171  | 0  |   tp[w6n] = mpn_lshift (tp, w6, w6n, 4);  | 
172  | 0  |   mpn_sub (w4, w4, m, tp, w6n+1);  | 
173  |  | 
  | 
174  | 0  |   if (flags & toom7_w3_neg)  | 
175  | 0  |     { | 
176  | 0  | #ifdef HAVE_NATIVE_mpn_rsh1add_n  | 
177  | 0  |       mpn_rsh1add_n (w3, w3, w2, m);  | 
178  |  | #else  | 
179  |  |       mpn_add_n (w3, w3, w2, m);  ASSERT (!(w3[0] & 1));  | 
180  |  |       mpn_rshift (w3, w3, m, 1);  | 
181  |  | #endif  | 
182  | 0  |     }  | 
183  | 0  |   else  | 
184  | 0  |     { | 
185  | 0  | #ifdef HAVE_NATIVE_mpn_rsh1sub_n  | 
186  | 0  |       mpn_rsh1sub_n (w3, w2, w3, m);  | 
187  |  | #else  | 
188  |  |       mpn_sub_n (w3, w2, w3, m);  ASSERT (!(w3[0] & 1));  | 
189  |  |       mpn_rshift (w3, w3, m, 1);  | 
190  |  | #endif  | 
191  | 0  |     }  | 
192  |  | 
  | 
193  | 0  |   mpn_sub_n (w2, w2, w3, m);  | 
194  |  | 
  | 
195  | 0  |   mpn_submul_1 (w5, w2, m, 65);  | 
196  | 0  |   mpn_sub (w2, w2, m, w6, w6n);  | 
197  | 0  |   mpn_sub (w2, w2, m, w0, 2*n);  | 
198  |  | 
  | 
199  | 0  |   mpn_addmul_1 (w5, w2, m, 45);  ASSERT (!(w5[0] & 1));  | 
200  | 0  |   mpn_rshift (w5, w5, m, 1);  | 
201  | 0  |   mpn_sub_n (w4, w4, w2, m);  | 
202  |  | 
  | 
203  | 0  |   mpn_divexact_by3 (w4, w4, m);  | 
204  | 0  |   mpn_sub_n (w2, w2, w4, m);  | 
205  |  | 
  | 
206  | 0  |   mpn_sub_n (w1, w5, w1, m);  | 
207  | 0  |   mpn_lshift (tp, w3, m, 3);  | 
208  | 0  |   mpn_sub_n (w5, w5, tp, m);  | 
209  | 0  |   mpn_divexact_by9 (w5, w5, m);  | 
210  | 0  |   mpn_sub_n (w3, w3, w5, m);  | 
211  |  | 
  | 
212  | 0  |   mpn_divexact_by15 (w1, w1, m);  | 
213  | 0  | #ifdef HAVE_NATIVE_mpn_rsh1add_n  | 
214  | 0  |   mpn_rsh1add_n (w1, w1, w5, m);  | 
215  | 0  |   w1[m - 1] &= GMP_NUMB_MASK >> 1;  | 
216  |  | #else  | 
217  |  |   mpn_add_n (w1, w1, w5, m);  ASSERT (!(w1[0] & 1));  | 
218  |  |   mpn_rshift (w1, w1, m, 1); /* w1>=0 now */  | 
219  |  | #endif  | 
220  |  | 
  | 
221  | 0  |   mpn_sub_n (w5, w5, w1, m);  | 
222  |  |  | 
223  |  |   /* These bounds are valid for the 4x4 polynomial product of toom44,  | 
224  |  |    * and they are conservative for toom53 and toom62. */  | 
225  | 0  |   ASSERT (w1[2*n] < 2);  | 
226  | 0  |   ASSERT (w2[2*n] < 3);  | 
227  | 0  |   ASSERT (w3[2*n] < 4);  | 
228  | 0  |   ASSERT (w4[2*n] < 3);  | 
229  | 0  |   ASSERT (w5[2*n] < 2);  | 
230  |  |  | 
231  |  |   /* Addition chain. Note carries and the 2n'th limbs that need to be  | 
232  |  |    * added in.  | 
233  |  |    *  | 
234  |  |    * Special care is needed for w2[2n] and the corresponding carry,  | 
235  |  |    * since the "simple" way of adding it all together would overwrite  | 
236  |  |    * the limb at wp[2*n] and rp[4*n] (same location) with the sum of  | 
237  |  |    * the high half of w3 and the low half of w4.  | 
238  |  |    *  | 
239  |  |    *         7    6    5    4    3    2    1    0  | 
240  |  |    *    |    |    |    |    |    |    |    |    |  | 
241  |  |    *                  ||w3 (2n+1)|  | 
242  |  |    *             ||w4 (2n+1)|  | 
243  |  |    *        ||w5 (2n+1)|        ||w1 (2n+1)|  | 
244  |  |    *  + | w6 (w6n)|        ||w2 (2n+1)| w0 (2n) |  (share storage with r)  | 
245  |  |    *  -----------------------------------------------  | 
246  |  |    *  r |    |    |    |    |    |    |    |    |  | 
247  |  |    *        c7   c6   c5   c4   c3                 Carries to propagate  | 
248  |  |    */  | 
249  |  | 
  | 
250  | 0  |   cy = mpn_add_n (rp + n, rp + n, w1, m);  | 
251  | 0  |   MPN_INCR_U (w2 + n + 1, n , cy);  | 
252  | 0  |   cy = mpn_add_n (rp + 3*n, rp + 3*n, w3, n);  | 
253  | 0  |   MPN_INCR_U (w3 + n, n + 1, w2[2*n] + cy);  | 
254  | 0  |   cy = mpn_add_n (rp + 4*n, w3 + n, w4, n);  | 
255  | 0  |   MPN_INCR_U (w4 + n, n + 1, w3[2*n] + cy);  | 
256  | 0  |   cy = mpn_add_n (rp + 5*n, w4 + n, w5, n);  | 
257  | 0  |   MPN_INCR_U (w5 + n, n + 1, w4[2*n] + cy);  | 
258  | 0  |   if (w6n > n + 1)  | 
259  | 0  |     { | 
260  | 0  |       cy = mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, n + 1);  | 
261  | 0  |       MPN_INCR_U (rp + 7*n + 1, w6n - n - 1, cy);  | 
262  | 0  |     }  | 
263  | 0  |   else  | 
264  | 0  |     { | 
265  | 0  |       ASSERT_NOCARRY (mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, w6n));  | 
266  |  | #if WANT_ASSERT  | 
267  |  |       { | 
268  |  |   mp_size_t i;  | 
269  |  |   for (i = w6n; i <= n; i++)  | 
270  |  |     ASSERT (w5[n + i] == 0);  | 
271  |  |       }  | 
272  |  | #endif  | 
273  | 0  |     }  | 
274  | 0  | }  |