/src/gmp/mpn/toom_interpolate_7pts.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* mpn_toom_interpolate_7pts -- Interpolate for toom44, 53, 62. |
2 | | |
3 | | Contributed to the GNU project by Niels Möller. |
4 | | Improvements by Marco Bodrato. |
5 | | |
6 | | THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY |
7 | | SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST |
8 | | GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. |
9 | | |
10 | | Copyright 2006, 2007, 2009, 2014, 2015 Free Software Foundation, Inc. |
11 | | |
12 | | This file is part of the GNU MP Library. |
13 | | |
14 | | The GNU MP Library is free software; you can redistribute it and/or modify |
15 | | it under the terms of either: |
16 | | |
17 | | * the GNU Lesser General Public License as published by the Free |
18 | | Software Foundation; either version 3 of the License, or (at your |
19 | | option) any later version. |
20 | | |
21 | | or |
22 | | |
23 | | * the GNU General Public License as published by the Free Software |
24 | | Foundation; either version 2 of the License, or (at your option) any |
25 | | later version. |
26 | | |
27 | | or both in parallel, as here. |
28 | | |
29 | | The GNU MP Library is distributed in the hope that it will be useful, but |
30 | | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
31 | | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
32 | | for more details. |
33 | | |
34 | | You should have received copies of the GNU General Public License and the |
35 | | GNU Lesser General Public License along with the GNU MP Library. If not, |
36 | | see https://www.gnu.org/licenses/. */ |
37 | | |
38 | | #include "gmp-impl.h" |
39 | | |
40 | | #define BINVERT_3 MODLIMB_INVERSE_3 |
41 | | |
42 | | #define BINVERT_9 \ |
43 | 0 | ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39) |
44 | | |
45 | | #define BINVERT_15 \ |
46 | | ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 4)) / 15) * 14 * 16 & GMP_NUMB_MAX) + 15) |
47 | | |
48 | | /* For the various mpn_divexact_byN here, fall back to using either |
49 | | mpn_pi1_bdiv_q_1 or mpn_divexact_1. The former has less overhead and is |
50 | | many faster if it is native. For now, since mpn_divexact_1 is native on |
51 | | several platforms where mpn_pi1_bdiv_q_1 does not yet exist, do not use |
52 | | mpn_pi1_bdiv_q_1 unconditionally. FIXME. */ |
53 | | |
54 | | /* For odd divisors, mpn_divexact_1 works fine with two's complement. */ |
55 | | #ifndef mpn_divexact_by3 |
56 | | #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 |
57 | | #define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0) |
58 | | #else |
59 | | #define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3) |
60 | | #endif |
61 | | #endif |
62 | | |
63 | | #ifndef mpn_divexact_by9 |
64 | | #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 |
65 | 0 | #define mpn_divexact_by9(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,9,BINVERT_9,0) |
66 | | #else |
67 | | #define mpn_divexact_by9(dst,src,size) mpn_divexact_1(dst,src,size,9) |
68 | | #endif |
69 | | #endif |
70 | | |
71 | | #ifndef mpn_divexact_by15 |
72 | | #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 |
73 | | #define mpn_divexact_by15(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,15,BINVERT_15,0) |
74 | | #else |
75 | | #define mpn_divexact_by15(dst,src,size) mpn_divexact_1(dst,src,size,15) |
76 | | #endif |
77 | | #endif |
78 | | |
79 | | /* Interpolation for toom4, using the evaluation points 0, infinity, |
80 | | 1, -1, 2, -2, 1/2. More precisely, we want to compute |
81 | | f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 6, given the |
82 | | seven values |
83 | | |
84 | | w0 = f(0), |
85 | | w1 = f(-2), |
86 | | w2 = f(1), |
87 | | w3 = f(-1), |
88 | | w4 = f(2) |
89 | | w5 = 64 * f(1/2) |
90 | | w6 = limit at infinity of f(x) / x^6, |
91 | | |
92 | | The result is 6*n + w6n limbs. At entry, w0 is stored at {rp, 2n }, |
93 | | w2 is stored at { rp + 2n, 2n+1 }, and w6 is stored at { rp + 6n, |
94 | | w6n }. The other values are 2n + 1 limbs each (with most |
95 | | significant limbs small). f(-1) and f(-1/2) may be negative, signs |
96 | | determined by the flag bits. Inputs are destroyed. |
97 | | |
98 | | Needs (2*n + 1) limbs of temporary storage. |
99 | | */ |
100 | | |
101 | | void |
102 | | mpn_toom_interpolate_7pts (mp_ptr rp, mp_size_t n, enum toom7_flags flags, |
103 | | mp_ptr w1, mp_ptr w3, mp_ptr w4, mp_ptr w5, |
104 | | mp_size_t w6n, mp_ptr tp) |
105 | 0 | { |
106 | 0 | mp_size_t m; |
107 | 0 | mp_limb_t cy; |
108 | |
|
109 | 0 | m = 2*n + 1; |
110 | 0 | #define w0 rp |
111 | 0 | #define w2 (rp + 2*n) |
112 | 0 | #define w6 (rp + 6*n) |
113 | |
|
114 | 0 | ASSERT (w6n > 0); |
115 | 0 | ASSERT (w6n <= 2*n); |
116 | | |
117 | | /* Using formulas similar to Marco Bodrato's |
118 | | |
119 | | W5 = W5 + W4 |
120 | | W1 =(W4 - W1)/2 |
121 | | W4 = W4 - W0 |
122 | | W4 =(W4 - W1)/4 - W6*16 |
123 | | W3 =(W2 - W3)/2 |
124 | | W2 = W2 - W3 |
125 | | |
126 | | W5 = W5 - W2*65 May be negative. |
127 | | W2 = W2 - W6 - W0 |
128 | | W5 =(W5 + W2*45)/2 Now >= 0 again. |
129 | | W4 =(W4 - W2)/3 |
130 | | W2 = W2 - W4 |
131 | | |
132 | | W1 = W5 - W1 May be negative. |
133 | | W5 =(W5 - W3*8)/9 |
134 | | W3 = W3 - W5 |
135 | | W1 =(W1/15 + W5)/2 Now >= 0 again. |
136 | | W5 = W5 - W1 |
137 | | |
138 | | where W0 = f(0), W1 = f(-2), W2 = f(1), W3 = f(-1), |
139 | | W4 = f(2), W5 = f(1/2), W6 = f(oo), |
140 | | |
141 | | Note that most intermediate results are positive; the ones that |
142 | | may be negative are represented in two's complement. We must |
143 | | never shift right a value that may be negative, since that would |
144 | | invalidate the sign bit. On the other hand, divexact by odd |
145 | | numbers work fine with two's complement. |
146 | | */ |
147 | |
|
148 | 0 | mpn_add_n (w5, w5, w4, m); |
149 | 0 | if (flags & toom7_w1_neg) |
150 | 0 | { |
151 | 0 | #ifdef HAVE_NATIVE_mpn_rsh1add_n |
152 | 0 | mpn_rsh1add_n (w1, w1, w4, m); |
153 | | #else |
154 | | mpn_add_n (w1, w1, w4, m); ASSERT (!(w1[0] & 1)); |
155 | | mpn_rshift (w1, w1, m, 1); |
156 | | #endif |
157 | 0 | } |
158 | 0 | else |
159 | 0 | { |
160 | 0 | #ifdef HAVE_NATIVE_mpn_rsh1sub_n |
161 | 0 | mpn_rsh1sub_n (w1, w4, w1, m); |
162 | | #else |
163 | | mpn_sub_n (w1, w4, w1, m); ASSERT (!(w1[0] & 1)); |
164 | | mpn_rshift (w1, w1, m, 1); |
165 | | #endif |
166 | 0 | } |
167 | 0 | mpn_sub (w4, w4, m, w0, 2*n); |
168 | 0 | mpn_sub_n (w4, w4, w1, m); ASSERT (!(w4[0] & 3)); |
169 | 0 | mpn_rshift (w4, w4, m, 2); /* w4>=0 */ |
170 | |
|
171 | 0 | tp[w6n] = mpn_lshift (tp, w6, w6n, 4); |
172 | 0 | mpn_sub (w4, w4, m, tp, w6n+1); |
173 | |
|
174 | 0 | if (flags & toom7_w3_neg) |
175 | 0 | { |
176 | 0 | #ifdef HAVE_NATIVE_mpn_rsh1add_n |
177 | 0 | mpn_rsh1add_n (w3, w3, w2, m); |
178 | | #else |
179 | | mpn_add_n (w3, w3, w2, m); ASSERT (!(w3[0] & 1)); |
180 | | mpn_rshift (w3, w3, m, 1); |
181 | | #endif |
182 | 0 | } |
183 | 0 | else |
184 | 0 | { |
185 | 0 | #ifdef HAVE_NATIVE_mpn_rsh1sub_n |
186 | 0 | mpn_rsh1sub_n (w3, w2, w3, m); |
187 | | #else |
188 | | mpn_sub_n (w3, w2, w3, m); ASSERT (!(w3[0] & 1)); |
189 | | mpn_rshift (w3, w3, m, 1); |
190 | | #endif |
191 | 0 | } |
192 | |
|
193 | 0 | mpn_sub_n (w2, w2, w3, m); |
194 | |
|
195 | 0 | mpn_submul_1 (w5, w2, m, 65); |
196 | 0 | mpn_sub (w2, w2, m, w6, w6n); |
197 | 0 | mpn_sub (w2, w2, m, w0, 2*n); |
198 | |
|
199 | 0 | mpn_addmul_1 (w5, w2, m, 45); ASSERT (!(w5[0] & 1)); |
200 | 0 | mpn_rshift (w5, w5, m, 1); |
201 | 0 | mpn_sub_n (w4, w4, w2, m); |
202 | |
|
203 | 0 | mpn_divexact_by3 (w4, w4, m); |
204 | 0 | mpn_sub_n (w2, w2, w4, m); |
205 | |
|
206 | 0 | mpn_sub_n (w1, w5, w1, m); |
207 | 0 | mpn_lshift (tp, w3, m, 3); |
208 | 0 | mpn_sub_n (w5, w5, tp, m); |
209 | 0 | mpn_divexact_by9 (w5, w5, m); |
210 | 0 | mpn_sub_n (w3, w3, w5, m); |
211 | |
|
212 | 0 | mpn_divexact_by15 (w1, w1, m); |
213 | 0 | #ifdef HAVE_NATIVE_mpn_rsh1add_n |
214 | 0 | mpn_rsh1add_n (w1, w1, w5, m); |
215 | 0 | w1[m - 1] &= GMP_NUMB_MASK >> 1; |
216 | | #else |
217 | | mpn_add_n (w1, w1, w5, m); ASSERT (!(w1[0] & 1)); |
218 | | mpn_rshift (w1, w1, m, 1); /* w1>=0 now */ |
219 | | #endif |
220 | |
|
221 | 0 | mpn_sub_n (w5, w5, w1, m); |
222 | | |
223 | | /* These bounds are valid for the 4x4 polynomial product of toom44, |
224 | | * and they are conservative for toom53 and toom62. */ |
225 | 0 | ASSERT (w1[2*n] < 2); |
226 | 0 | ASSERT (w2[2*n] < 3); |
227 | 0 | ASSERT (w3[2*n] < 4); |
228 | 0 | ASSERT (w4[2*n] < 3); |
229 | 0 | ASSERT (w5[2*n] < 2); |
230 | | |
231 | | /* Addition chain. Note carries and the 2n'th limbs that need to be |
232 | | * added in. |
233 | | * |
234 | | * Special care is needed for w2[2n] and the corresponding carry, |
235 | | * since the "simple" way of adding it all together would overwrite |
236 | | * the limb at wp[2*n] and rp[4*n] (same location) with the sum of |
237 | | * the high half of w3 and the low half of w4. |
238 | | * |
239 | | * 7 6 5 4 3 2 1 0 |
240 | | * | | | | | | | | | |
241 | | * ||w3 (2n+1)| |
242 | | * ||w4 (2n+1)| |
243 | | * ||w5 (2n+1)| ||w1 (2n+1)| |
244 | | * + | w6 (w6n)| ||w2 (2n+1)| w0 (2n) | (share storage with r) |
245 | | * ----------------------------------------------- |
246 | | * r | | | | | | | | | |
247 | | * c7 c6 c5 c4 c3 Carries to propagate |
248 | | */ |
249 | |
|
250 | 0 | cy = mpn_add_n (rp + n, rp + n, w1, m); |
251 | 0 | MPN_INCR_U (w2 + n + 1, n , cy); |
252 | 0 | cy = mpn_add_n (rp + 3*n, rp + 3*n, w3, n); |
253 | 0 | MPN_INCR_U (w3 + n, n + 1, w2[2*n] + cy); |
254 | 0 | cy = mpn_add_n (rp + 4*n, w3 + n, w4, n); |
255 | 0 | MPN_INCR_U (w4 + n, n + 1, w3[2*n] + cy); |
256 | 0 | cy = mpn_add_n (rp + 5*n, w4 + n, w5, n); |
257 | 0 | MPN_INCR_U (w5 + n, n + 1, w4[2*n] + cy); |
258 | 0 | if (w6n > n + 1) |
259 | 0 | { |
260 | 0 | cy = mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, n + 1); |
261 | 0 | MPN_INCR_U (rp + 7*n + 1, w6n - n - 1, cy); |
262 | 0 | } |
263 | 0 | else |
264 | 0 | { |
265 | 0 | ASSERT_NOCARRY (mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, w6n)); |
266 | | #if WANT_ASSERT |
267 | | { |
268 | | mp_size_t i; |
269 | | for (i = w6n; i <= n; i++) |
270 | | ASSERT (w5[n + i] == 0); |
271 | | } |
272 | | #endif |
273 | 0 | } |
274 | 0 | } |