/src/gmp/mpn/toom_interpolate_6pts.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* mpn_toom_interpolate_6pts -- Interpolate for toom43, 52 |
2 | | |
3 | | Contributed to the GNU project by Marco Bodrato. |
4 | | |
5 | | THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY |
6 | | SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST |
7 | | GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. |
8 | | |
9 | | Copyright 2009, 2010, 2012 Free Software Foundation, Inc. |
10 | | |
11 | | This file is part of the GNU MP Library. |
12 | | |
13 | | The GNU MP Library is free software; you can redistribute it and/or modify |
14 | | it under the terms of either: |
15 | | |
16 | | * the GNU Lesser General Public License as published by the Free |
17 | | Software Foundation; either version 3 of the License, or (at your |
18 | | option) any later version. |
19 | | |
20 | | or |
21 | | |
22 | | * the GNU General Public License as published by the Free Software |
23 | | Foundation; either version 2 of the License, or (at your option) any |
24 | | later version. |
25 | | |
26 | | or both in parallel, as here. |
27 | | |
28 | | The GNU MP Library is distributed in the hope that it will be useful, but |
29 | | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
30 | | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
31 | | for more details. |
32 | | |
33 | | You should have received copies of the GNU General Public License and the |
34 | | GNU Lesser General Public License along with the GNU MP Library. If not, |
35 | | see https://www.gnu.org/licenses/. */ |
36 | | |
37 | | #include "gmp-impl.h" |
38 | | |
39 | | #define BINVERT_3 MODLIMB_INVERSE_3 |
40 | | |
41 | | /* For odd divisors, mpn_divexact_1 works fine with two's complement. */ |
42 | | #ifndef mpn_divexact_by3 |
43 | | #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 |
44 | | #define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0) |
45 | | #else |
46 | | #define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3) |
47 | | #endif |
48 | | #endif |
49 | | |
50 | | /* Interpolation for Toom-3.5, using the evaluation points: infinity, |
51 | | 1, -1, 2, -2. More precisely, we want to compute |
52 | | f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 5, given the |
53 | | six values |
54 | | |
55 | | w5 = f(0), |
56 | | w4 = f(-1), |
57 | | w3 = f(1) |
58 | | w2 = f(-2), |
59 | | w1 = f(2), |
60 | | w0 = limit at infinity of f(x) / x^5, |
61 | | |
62 | | The result is stored in {pp, 5*n + w0n}. At entry, w5 is stored at |
63 | | {pp, 2n}, w3 is stored at {pp + 2n, 2n+1}, and w0 is stored at |
64 | | {pp + 5n, w0n}. The other values are 2n + 1 limbs each (with most |
65 | | significant limbs small). f(-1) and f(-2) may be negative, signs |
66 | | determined by the flag bits. All intermediate results are positive. |
67 | | Inputs are destroyed. |
68 | | |
69 | | Interpolation sequence was taken from the paper: "Integer and |
70 | | Polynomial Multiplication: Towards Optimal Toom-Cook Matrices". |
71 | | Some slight variations were introduced: adaptation to "gmp |
72 | | instruction set", and a final saving of an operation by interlacing |
73 | | interpolation and recomposition phases. |
74 | | */ |
75 | | |
76 | | void |
77 | | mpn_toom_interpolate_6pts (mp_ptr pp, mp_size_t n, enum toom6_flags flags, |
78 | | mp_ptr w4, mp_ptr w2, mp_ptr w1, |
79 | | mp_size_t w0n) |
80 | 0 | { |
81 | 0 | mp_limb_t cy; |
82 | | /* cy6 can be stored in w1[2*n], cy4 in w4[0], embankment in w2[0] */ |
83 | 0 | mp_limb_t cy4, cy6, embankment; |
84 | |
|
85 | 0 | ASSERT( n > 0 ); |
86 | 0 | ASSERT( 2*n >= w0n && w0n > 0 ); |
87 | |
|
88 | 0 | #define w5 pp /* 2n */ |
89 | 0 | #define w3 (pp + 2 * n) /* 2n+1 */ |
90 | 0 | #define w0 (pp + 5 * n) /* w0n */ |
91 | | |
92 | | /* Interpolate with sequence: |
93 | | W2 =(W1 - W2)>>2 |
94 | | W1 =(W1 - W5)>>1 |
95 | | W1 =(W1 - W2)>>1 |
96 | | W4 =(W3 - W4)>>1 |
97 | | W2 =(W2 - W4)/3 |
98 | | W3 = W3 - W4 - W5 |
99 | | W1 =(W1 - W3)/3 |
100 | | // Last steps are mixed with recomposition... |
101 | | W2 = W2 - W0<<2 |
102 | | W4 = W4 - W2 |
103 | | W3 = W3 - W1 |
104 | | W2 = W2 - W0 |
105 | | */ |
106 | | |
107 | | /* W2 =(W1 - W2)>>2 */ |
108 | 0 | if (flags & toom6_vm2_neg) |
109 | 0 | mpn_add_n (w2, w1, w2, 2 * n + 1); |
110 | 0 | else |
111 | 0 | mpn_sub_n (w2, w1, w2, 2 * n + 1); |
112 | 0 | mpn_rshift (w2, w2, 2 * n + 1, 2); |
113 | | |
114 | | /* W1 =(W1 - W5)>>1 */ |
115 | 0 | w1[2*n] -= mpn_sub_n (w1, w1, w5, 2*n); |
116 | 0 | mpn_rshift (w1, w1, 2 * n + 1, 1); |
117 | | |
118 | | /* W1 =(W1 - W2)>>1 */ |
119 | 0 | #if HAVE_NATIVE_mpn_rsh1sub_n |
120 | 0 | mpn_rsh1sub_n (w1, w1, w2, 2 * n + 1); |
121 | | #else |
122 | | mpn_sub_n (w1, w1, w2, 2 * n + 1); |
123 | | mpn_rshift (w1, w1, 2 * n + 1, 1); |
124 | | #endif |
125 | | |
126 | | /* W4 =(W3 - W4)>>1 */ |
127 | 0 | if (flags & toom6_vm1_neg) |
128 | 0 | { |
129 | 0 | #if HAVE_NATIVE_mpn_rsh1add_n |
130 | 0 | mpn_rsh1add_n (w4, w3, w4, 2 * n + 1); |
131 | | #else |
132 | | mpn_add_n (w4, w3, w4, 2 * n + 1); |
133 | | mpn_rshift (w4, w4, 2 * n + 1, 1); |
134 | | #endif |
135 | 0 | } |
136 | 0 | else |
137 | 0 | { |
138 | 0 | #if HAVE_NATIVE_mpn_rsh1sub_n |
139 | 0 | mpn_rsh1sub_n (w4, w3, w4, 2 * n + 1); |
140 | | #else |
141 | | mpn_sub_n (w4, w3, w4, 2 * n + 1); |
142 | | mpn_rshift (w4, w4, 2 * n + 1, 1); |
143 | | #endif |
144 | 0 | } |
145 | | |
146 | | /* W2 =(W2 - W4)/3 */ |
147 | 0 | mpn_sub_n (w2, w2, w4, 2 * n + 1); |
148 | 0 | mpn_divexact_by3 (w2, w2, 2 * n + 1); |
149 | | |
150 | | /* W3 = W3 - W4 - W5 */ |
151 | 0 | mpn_sub_n (w3, w3, w4, 2 * n + 1); |
152 | 0 | w3[2 * n] -= mpn_sub_n (w3, w3, w5, 2 * n); |
153 | | |
154 | | /* W1 =(W1 - W3)/3 */ |
155 | 0 | mpn_sub_n (w1, w1, w3, 2 * n + 1); |
156 | 0 | mpn_divexact_by3 (w1, w1, 2 * n + 1); |
157 | | |
158 | | /* |
159 | | [1 0 0 0 0 0; |
160 | | 0 1 0 0 0 0; |
161 | | 1 0 1 0 0 0; |
162 | | 0 1 0 1 0 0; |
163 | | 1 0 1 0 1 0; |
164 | | 0 0 0 0 0 1] |
165 | | |
166 | | pp[] prior to operations: |
167 | | |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__| |
168 | | |
169 | | summation scheme for remaining operations: |
170 | | |______________5|n_____4|n_____3|n_____2|n______|n______|pp |
171 | | |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__| |
172 | | || H w4 | L w4 | |
173 | | || H w2 | L w2 | |
174 | | || H w1 | L w1 | |
175 | | ||-H w1 |-L w1 | |
176 | | |-H w0 |-L w0 ||-H w2 |-L w2 | |
177 | | */ |
178 | 0 | cy = mpn_add_n (pp + n, pp + n, w4, 2 * n + 1); |
179 | 0 | MPN_INCR_U (pp + 3 * n + 1, n, cy); |
180 | | |
181 | | /* W2 -= W0<<2 */ |
182 | | #if HAVE_NATIVE_mpn_sublsh_n || HAVE_NATIVE_mpn_sublsh2_n_ip1 |
183 | | #if HAVE_NATIVE_mpn_sublsh2_n_ip1 |
184 | | cy = mpn_sublsh2_n_ip1 (w2, w0, w0n); |
185 | | #else |
186 | | cy = mpn_sublsh_n (w2, w2, w0, w0n, 2); |
187 | | #endif |
188 | | #else |
189 | | /* {W4,2*n+1} is now free and can be overwritten. */ |
190 | 0 | cy = mpn_lshift(w4, w0, w0n, 2); |
191 | 0 | cy+= mpn_sub_n(w2, w2, w4, w0n); |
192 | 0 | #endif |
193 | 0 | MPN_DECR_U (w2 + w0n, 2 * n + 1 - w0n, cy); |
194 | | |
195 | | /* W4L = W4L - W2L */ |
196 | 0 | cy = mpn_sub_n (pp + n, pp + n, w2, n); |
197 | 0 | MPN_DECR_U (w3, 2 * n + 1, cy); |
198 | | |
199 | | /* W3H = W3H + W2L */ |
200 | 0 | cy4 = w3[2 * n] + mpn_add_n (pp + 3 * n, pp + 3 * n, w2, n); |
201 | | /* W1L + W2H */ |
202 | 0 | cy = w2[2 * n] + mpn_add_n (pp + 4 * n, w1, w2 + n, n); |
203 | 0 | MPN_INCR_U (w1 + n, n + 1, cy); |
204 | | |
205 | | /* W0 = W0 + W1H */ |
206 | 0 | if (LIKELY (w0n > n)) |
207 | 0 | cy6 = w1[2 * n] + mpn_add_n (w0, w0, w1 + n, n); |
208 | 0 | else |
209 | 0 | cy6 = mpn_add_n (w0, w0, w1 + n, w0n); |
210 | | |
211 | | /* |
212 | | summation scheme for the next operation: |
213 | | |...____5|n_____4|n_____3|n_____2|n______|n______|pp |
214 | | |...w0___|_w1_w2_|_H w3__|_L w3__|_H w5__|_L w5__| |
215 | | ...-w0___|-w1_w2 | |
216 | | */ |
217 | | /* if(LIKELY(w0n>n)) the two operands below DO overlap! */ |
218 | 0 | cy = mpn_sub_n (pp + 2 * n, pp + 2 * n, pp + 4 * n, n + w0n); |
219 | | |
220 | | /* embankment is a "dirty trick" to avoid carry/borrow propagation |
221 | | beyond allocated memory */ |
222 | 0 | embankment = w0[w0n - 1] - 1; |
223 | 0 | w0[w0n - 1] = 1; |
224 | 0 | if (LIKELY (w0n > n)) { |
225 | 0 | if (cy4 > cy6) |
226 | 0 | MPN_INCR_U (pp + 4 * n, w0n + n, cy4 - cy6); |
227 | 0 | else |
228 | 0 | MPN_DECR_U (pp + 4 * n, w0n + n, cy6 - cy4); |
229 | 0 | MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy); |
230 | 0 | MPN_INCR_U (w0 + n, w0n - n, cy6); |
231 | 0 | } else { |
232 | 0 | MPN_INCR_U (pp + 4 * n, w0n + n, cy4); |
233 | 0 | MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy + cy6); |
234 | 0 | } |
235 | 0 | w0[w0n - 1] += embankment; |
236 | |
|
237 | 0 | #undef w5 |
238 | 0 | #undef w3 |
239 | 0 | #undef w0 |
240 | |
|
241 | 0 | } |