/src/gmp/mpn/matrix22_mul.c
/* matrix22_mul.c.

   Contributed by Niels Möller and Marco Bodrato.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2003-2005, 2008, 2009 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"

/* Multiply {ap,an} by {bp,bn} and write the an + bn limb product to rp.
   mpn_mul requires its first operand to be at least as long as its second,
   so swap the operands when needed.  */
#define MUL(rp, ap, an, bp, bn) do {		\
    if (an >= bn)				\
      mpn_mul (rp, ap, an, bp, bn);		\
    else					\
      mpn_mul (rp, bp, bn, ap, an);		\
  } while (0)

/* Inputs are unsigned.  Computes {rp,n} = |{ap,n} - {bp,n}| and returns
   1 if the difference is negative (ap < bp), otherwise 0.  */
static int
abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n)
{
  int c;
  MPN_CMP (c, ap, bp, n);
  if (c >= 0)
    {
      mpn_sub_n (rp, ap, bp, n);
      return 0;
    }
  else
    {
      mpn_sub_n (rp, bp, ap, n);
      return 1;
    }
}

/* Adds two numbers given in sign/magnitude form; a sign of 1 means
   negative.  Computes {rp,n} = |(-1)^as {ap,n} + (-1)^bs {bp,n}| and
   returns the sign of the result.  When the signs agree, the caller
   must ensure that the sum does not carry out of n limbs.  */
static int
add_signed_n (mp_ptr rp,
	      mp_srcptr ap, int as, mp_srcptr bp, int bs, mp_size_t n)
{
  if (as != bs)
    return as ^ abs_sub_n (rp, ap, bp, n);
  else
    {
      ASSERT_NOCARRY (mpn_add_n (rp, ap, bp, n));
      return as;
    }
}
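
/* Illustrative sketch (not part of GMP): the helpers above implement
   sign/magnitude arithmetic, where a separate int carries the sign (1 means
   negative) and the limb array carries the magnitude.  A single-word
   analogue, with hypothetical names, might look as follows; it is kept
   inside #if 0 so it is never compiled.  */
#if 0
static int
abs_sub_1 (unsigned long *rp, unsigned long a, unsigned long b)
{
  /* *rp = |a - b|; return 1 if the true difference a - b is negative.  */
  if (a >= b)
    {
      *rp = a - b;
      return 0;
    }
  *rp = b - a;
  return 1;
}

static int
add_signed_1 (unsigned long *rp, unsigned long a, int as, unsigned long b, int bs)
{
  /* *rp = |(-1)^as a + (-1)^bs b|; return the sign of that sum.  */
  if (as != bs)
    return as ^ abs_sub_1 (rp, a, b);	/* opposite signs: subtract magnitudes */
  *rp = a + b;				/* same sign: add magnitudes, keep sign */
  return as;
}
#endif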

/* Scratch space, in limbs, needed by mpn_matrix22_mul to multiply a
   matrix of rn-limb elements by a matrix of mn-limb elements.  */
mp_size_t
mpn_matrix22_mul_itch (mp_size_t rn, mp_size_t mn)
{
  if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)
      || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
    return 3*rn + 2*mn;
  else
    return 3*(rn + mn) + 5;
}
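
/* A minimal caller sketch (an assumption for illustration, not code from
   GMP): scratch space of mpn_matrix22_mul_itch (rn, mn) limbs is allocated
   with the gmp-impl.h TMP_* allocators and passed as tp.  The wrapper name
   example_matrix22_mul is hypothetical; r0..r3 must each have room for
   rn + mn + 1 result limbs.  Kept inside #if 0 so it is never compiled.  */
#if 0
static void
example_matrix22_mul (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
		      mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3,
		      mp_size_t mn)
{
  mp_ptr tp;
  TMP_DECL;
  TMP_MARK;
  tp = TMP_ALLOC_LIMBS (mpn_matrix22_mul_itch (rn, mn));
  mpn_matrix22_mul (r0, r1, r2, r3, rn, m0, m1, m2, m3, mn, tp);
  TMP_FREE;
}
#endif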

/* Algorithm:

    / s0 \   /  1  0  0  0 \  / r0 \
    | s1 |   |  0  1  0  1 |  | r1 |
    | s2 |   |  0  0 -1  1 |  | r2 |
    | s3 | = |  0  1 -1  1 |  \ r3 /
    | s4 |   | -1  1 -1  1 |
    | s5 |   |  0  1  0  0 |
    \ s6 /   \  0  0  1  0 /

    / t0 \   /  1  0  0  0 \  / m0 \
    | t1 |   |  0  1  0  1 |  | m1 |
    | t2 |   |  0  0 -1  1 |  | m2 |
    | t3 | = |  0  1 -1  1 |  \ m3 /
    | t4 |   | -1  1 -1  1 |
    | t5 |   |  0  1  0  0 |
    \ t6 /   \  0  0  1  0 /

   Note: the two matrices above are identical, but s_i and t_i appear in
   the same product only for i < 4; see "A Strassen-like Matrix
   Multiplication suited for squaring and higher power computation" by
   M. Bodrato, in Proceedings of ISSAC 2010.

    / r0 \   / 1  0  0  0  0  1  0 \  / s0*t0 \
    | r1 | = | 0  0 -1  1 -1  1  0 |  | s1*t1 |
    | r2 |   | 0  1  0 -1  0 -1 -1 |  | s2*t2 |
    \ r3 /   \ 0  1  1 -1  0 -1  0 /  | s3*t3 |
                                       | s4*t5 |
                                       | s5*t6 |
                                       \ s6*t4 /

   The scheduling uses two temporaries U0 and U1 to store products, and
   two, S0 and T0, to store combinations of entries of the two
   operands.
*/
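
/* A worked single-word check of the scheme above (an assumption for
   illustration, not part of GMP): with signed machine words standing in
   for the sign/magnitude limb arrays, the seven products reproduce
   R = R * M.  The function name check_strassen_scheme is hypothetical;
   kept inside #if 0 so it is never compiled.  */
#if 0
#include <assert.h>

static void
check_strassen_scheme (long r0, long r1, long r2, long r3,
		       long m0, long m1, long m2, long m3)
{
  /* The linear combinations s0..s6 and t0..t6 defined above.  */
  long s0 = r0,                  t0 = m0;
  long s1 = r1 + r3,             t1 = m1 + m3;
  long s2 = r3 - r2,             t2 = m3 - m2;
  long s3 = r1 - r2 + r3,        t3 = m1 - m2 + m3;
  long s4 = -r0 + r1 - r2 + r3,  t4 = -m0 + m1 - m2 + m3;
  long s5 = r1,                  t5 = m1;
  long s6 = r2,                  t6 = m2;

  /* The seven products, in the order they appear in the output matrix.  */
  long p0 = s0 * t0, p1 = s1 * t1, p2 = s2 * t2, p3 = s3 * t3;
  long p4 = s4 * t5, p5 = s5 * t6, p6 = s6 * t4;

  /* Recombination rows (1 0 0 0 0 1 0), (0 0 -1 1 -1 1 0),
     (0 1 0 -1 0 -1 -1) and (0 1 1 -1 0 -1 0) give R * M.  */
  assert (p0 + p5             == r0 * m0 + r1 * m2);
  assert (-p2 + p3 - p4 + p5  == r0 * m1 + r1 * m3);
  assert (p1 - p3 - p5 - p6   == r2 * m0 + r3 * m2);
  assert (p1 + p2 - p3 - p5   == r2 * m1 + r3 * m3);
}
#endif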

/* Computes R = R * M, where the elements of R = (r0, r1; r2, r3) are
 * rn-limb numbers and the elements of M = (m0, m1; m2, m3) are mn-limb
 * numbers.
 *
 * Resulting elements are of size up to rn + mn + 1 limbs.
 *
 * Temporary storage: 3 rn + 3 mn + 5 limbs.  */
static void
mpn_matrix22_mul_strassen (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
			   mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
			   mp_ptr tp)
{
  mp_ptr s0, t0, u0, u1;
  int r1s, r3s, s0s, t0s, u1s;
  s0 = tp; tp += rn + 1;
  t0 = tp; tp += mn + 1;
  u0 = tp; tp += rn + mn + 1;
  u1 = tp;			/* rn + mn + 2 */

  MUL (u0, r1, rn, m2, mn);		/* u5 = s5 * t6 */
  r3s = abs_sub_n (r3, r3, r2, rn);	/* r3 - r2 */
  if (r3s)
    {
      r1s = abs_sub_n (r1, r1, r3, rn);
      r1[rn] = 0;
    }
  else
    {
      r1[rn] = mpn_add_n (r1, r1, r3, rn);
      r1s = 0;			/* r1 - r2 + r3 */
    }
  if (r1s)
    {
      s0[rn] = mpn_add_n (s0, r1, r0, rn);
      s0s = 0;
    }
  else if (r1[rn] != 0)
    {
      s0[rn] = r1[rn] - mpn_sub_n (s0, r1, r0, rn);
      s0s = 1;			/* s4 = -r0 + r1 - r2 + r3 */
				/* Reverse sign! */
    }
  else
    {
      s0s = abs_sub_n (s0, r0, r1, rn);
      s0[rn] = 0;
    }
  MUL (u1, r0, rn, m0, mn);	/* u0 = s0 * t0 */
  r0[rn+mn] = mpn_add_n (r0, u0, u1, rn + mn);
  ASSERT (r0[rn+mn] < 2);	/* u0 + u5 */

  t0s = abs_sub_n (t0, m3, m2, mn);
  u1s = r3s^t0s^1;		/* Reverse sign! */
  MUL (u1, r3, rn, t0, mn);	/* u2 = s2 * t2 */
  u1[rn+mn] = 0;
  if (t0s)
    {
      t0s = abs_sub_n (t0, m1, t0, mn);
      t0[mn] = 0;
    }
  else
    {
      t0[mn] = mpn_add_n (t0, t0, m1, mn);
    }

  /* FIXME: Could be simplified if we had space for rn + mn + 2 limbs
     at r3.  I'd expect that for matrices of random size, the high
     words t0[mn] and r1[rn] are non-zero with a pretty small
     probability.  If that can be confirmed this should be done as an
     unconditional rn x (mn+1) followed by an if (UNLIKELY (r1[rn]))
     add_n. */
  if (t0[mn] != 0)
    {
      MUL (r3, r1, rn, t0, mn + 1);	/* u3 = s3 * t3 */
      ASSERT (r1[rn] < 2);
      if (r1[rn] != 0)
	mpn_add_n (r3 + rn, r3 + rn, t0, mn + 1);
    }
  else
    {
      MUL (r3, r1, rn + 1, t0, mn);
    }

  ASSERT (r3[rn+mn] < 4);

  u0[rn+mn] = 0;
  if (r1s^t0s)
    {
      r3s = abs_sub_n (r3, u0, r3, rn + mn + 1);
    }
  else
    {
      ASSERT_NOCARRY (mpn_add_n (r3, r3, u0, rn + mn + 1));
      r3s = 0;			/* u3 + u5 */
    }

  if (t0s)
    {
      t0[mn] = mpn_add_n (t0, t0, m0, mn);
    }
  else if (t0[mn] != 0)
    {
      t0[mn] -= mpn_sub_n (t0, t0, m0, mn);
    }
  else
    {
      t0s = abs_sub_n (t0, t0, m0, mn);
    }
  MUL (u0, r2, rn, t0, mn + 1);	/* u6 = s6 * t4 */
  ASSERT (u0[rn+mn] < 2);
  if (r1s)
    {
      ASSERT_NOCARRY (mpn_sub_n (r1, r2, r1, rn));
    }
  else
    {
      r1[rn] += mpn_add_n (r1, r1, r2, rn);
    }
  rn++;
  t0s = add_signed_n (r2, r3, r3s, u0, t0s, rn + mn);
					/* u3 + u5 + u6 */
  ASSERT (r2[rn+mn-1] < 4);
  r3s = add_signed_n (r3, r3, r3s, u1, u1s, rn + mn);
					/* -u2 + u3 + u5 */
  ASSERT (r3[rn+mn-1] < 3);
  MUL (u0, s0, rn, m1, mn);	/* u4 = s4 * t5 */
  ASSERT (u0[rn+mn-1] < 2);
  t0[mn] = mpn_add_n (t0, m3, m1, mn);
  MUL (u1, r1, rn, t0, mn + 1);	/* u1 = s1 * t1 */
  mn += rn;
  ASSERT (u1[mn-1] < 4);
  ASSERT (u1[mn] == 0);
  ASSERT_NOCARRY (add_signed_n (r1, r3, r3s, u0, s0s, mn));
					/* -u2 + u3 - u4 + u5 */
  ASSERT (r1[mn-1] < 2);
  if (r3s)
    {
      ASSERT_NOCARRY (mpn_add_n (r3, u1, r3, mn));
    }
  else
    {
      ASSERT_NOCARRY (mpn_sub_n (r3, u1, r3, mn));
					/* u1 + u2 - u3 - u5 */
    }
  ASSERT (r3[mn-1] < 2);
  if (t0s)
    {
      ASSERT_NOCARRY (mpn_add_n (r2, u1, r2, mn));
    }
  else
    {
      ASSERT_NOCARRY (mpn_sub_n (r2, u1, r2, mn));
					/* u1 - u3 - u5 - u6 */
    }
  ASSERT (r2[mn-1] < 2);
}

void
mpn_matrix22_mul (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
		  mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
		  mp_ptr tp)
{
  if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)
      || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
    {
      mp_ptr p0, p1;
      unsigned i;

      /* Temporary storage: 3 rn + 2 mn */
      p0 = tp + rn;
      p1 = p0 + rn + mn;

      /* Basecase: schoolbook 2x2 product.  Each pass of the loop computes
	 one row of R * M, first (r0, r1), then (r2, r3).  */
      for (i = 0; i < 2; i++)
	{
	  MPN_COPY (tp, r0, rn);

	  if (rn >= mn)
	    {
	      mpn_mul (p0, r0, rn, m0, mn);
	      mpn_mul (p1, r1, rn, m3, mn);
	      mpn_mul (r0, r1, rn, m2, mn);
	      mpn_mul (r1, tp, rn, m1, mn);
	    }
	  else
	    {
	      mpn_mul (p0, m0, mn, r0, rn);
	      mpn_mul (p1, m3, mn, r1, rn);
	      mpn_mul (r0, m2, mn, r1, rn);
	      mpn_mul (r1, m1, mn, tp, rn);
	    }
	  r0[rn+mn] = mpn_add_n (r0, r0, p0, rn + mn);
	  r1[rn+mn] = mpn_add_n (r1, r1, p1, rn + mn);

	  r0 = r2; r1 = r3;
	}
    }
  else
    mpn_matrix22_mul_strassen (r0, r1, r2, r3, rn,
			       m0, m1, m2, m3, mn, tp);
}