/src/gmp-6.2.1/mpn/matrix22_mul.c
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /* matrix22_mul.c.  | 
2  |  |  | 
3  |  |    Contributed by Niels Möller and Marco Bodrato.  | 
4  |  |  | 
5  |  |    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY  | 
6  |  |    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST  | 
7  |  |    GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.  | 
8  |  |  | 
9  |  | Copyright 2003-2005, 2008, 2009 Free Software Foundation, Inc.  | 
10  |  |  | 
11  |  | This file is part of the GNU MP Library.  | 
12  |  |  | 
13  |  | The GNU MP Library is free software; you can redistribute it and/or modify  | 
14  |  | it under the terms of either:  | 
15  |  |  | 
16  |  |   * the GNU Lesser General Public License as published by the Free  | 
17  |  |     Software Foundation; either version 3 of the License, or (at your  | 
18  |  |     option) any later version.  | 
19  |  |  | 
20  |  | or  | 
21  |  |  | 
22  |  |   * the GNU General Public License as published by the Free Software  | 
23  |  |     Foundation; either version 2 of the License, or (at your option) any  | 
24  |  |     later version.  | 
25  |  |  | 
26  |  | or both in parallel, as here.  | 
27  |  |  | 
28  |  | The GNU MP Library is distributed in the hope that it will be useful, but  | 
29  |  | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY  | 
30  |  | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License  | 
31  |  | for more details.  | 
32  |  |  | 
33  |  | You should have received copies of the GNU General Public License and the  | 
34  |  | GNU Lesser General Public License along with the GNU MP Library.  If not,  | 
35  |  | see https://www.gnu.org/licenses/.  */  | 
36  |  |  | 
37  |  | #include "gmp-impl.h"  | 
38  |  | #include "longlong.h"  | 
39  |  |  | 
/* Multiply {ap,an} by {bp,bn}, writing the an+bn limb product to rp.
   mpn_mul requires its first operand to be at least as long as its
   second, so swap the operands when bn > an.  */
#define MUL(rp, ap, an, bp, bn) do {		\
  if (an >= bn)					\
    mpn_mul (rp, ap, an, bp, bn);		\
  else						\
    mpn_mul (rp, bp, bn, ap, an);		\
} while (0)
46  |  |  | 
47  |  | /* Inputs are unsigned. */  | 
48  |  | static int  | 
49  |  | abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n)  | 
50  | 0  | { | 
51  | 0  |   int c;  | 
52  | 0  |   MPN_CMP (c, ap, bp, n);  | 
53  | 0  |   if (c >= 0)  | 
54  | 0  |     { | 
55  | 0  |       mpn_sub_n (rp, ap, bp, n);  | 
56  | 0  |       return 0;  | 
57  | 0  |     }  | 
58  | 0  |   else  | 
59  | 0  |     { | 
60  | 0  |       mpn_sub_n (rp, bp, ap, n);  | 
61  | 0  |       return 1;  | 
62  | 0  |     }  | 
63  | 0  | }  | 
64  |  |  | 
65  |  | static int  | 
66  |  | add_signed_n (mp_ptr rp,  | 
67  |  |         mp_srcptr ap, int as, mp_srcptr bp, int bs, mp_size_t n)  | 
68  | 0  | { | 
69  | 0  |   if (as != bs)  | 
70  | 0  |     return as ^ abs_sub_n (rp, ap, bp, n);  | 
71  | 0  |   else  | 
72  | 0  |     { | 
73  | 0  |       ASSERT_NOCARRY (mpn_add_n (rp, ap, bp, n));  | 
74  | 0  |       return as;  | 
75  | 0  |     }  | 
76  | 0  | }  | 
77  |  |  | 
78  |  | mp_size_t  | 
79  |  | mpn_matrix22_mul_itch (mp_size_t rn, mp_size_t mn)  | 
80  | 0  | { | 
81  | 0  |   if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)  | 
82  | 0  |       || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))  | 
83  | 0  |     return 3*rn + 2*mn;  | 
84  | 0  |   else  | 
85  | 0  |     return 3*(rn + mn) + 5;  | 
86  | 0  | }  | 
87  |  |  | 
88  |  | /* Algorithm:  | 
89  |  |  | 
90  |  |     / s0 \   /  1  0  0  0 \ / r0 \  | 
91  |  |     | s1 |   |  0  1  0  1 | | r1 |  | 
92  |  |     | s2 |   |  0  0 -1  1 | | r2 |  | 
93  |  |     | s3 | = |  0  1 -1  1 | \ r3 /  | 
94  |  |     | s4 |   | -1  1 -1  1 |  | 
95  |  |     | s5 |   |  0  1  0  0 |  | 
96  |  |     \ s6 /   \  0  0  1  0 /  | 
97  |  |  | 
98  |  |     / t0 \   /  1  0  0  0 \ / m0 \  | 
99  |  |     | t1 |   |  0  1  0  1 | | m1 |  | 
100  |  |     | t2 |   |  0  0 -1  1 | | m2 |  | 
101  |  |     | t3 | = |  0  1 -1  1 | \ m3 /  | 
102  |  |     | t4 |   | -1  1 -1  1 |  | 
103  |  |     | t5 |   |  0  1  0  0 |  | 
104  |  |     \ t6 /   \  0  0  1  0 /  | 
105  |  |  | 
106  |  |   Note: the two matrices above are the same, but s_i and t_i are used  | 
107  |  |   in the same product, only for i<4, see "A Strassen-like Matrix  | 
108  |  |   Multiplication suited for squaring and higher power computation" by  | 
109  |  |   M. Bodrato, in Proceedings of ISSAC 2010.  | 
110  |  |  | 
111  |  |     / r0 \   / 1 0  0  0  0  1  0 \ / s0*t0 \  | 
112  |  |     | r1 | = | 0 0 -1  1 -1  1  0 | | s1*t1 |  | 
113  |  |     | r2 |   | 0 1  0 -1  0 -1 -1 | | s2*t2 |  | 
114  |  |     \ r3 /   \ 0 1  1 -1  0 -1  0 / | s3*t3 |  | 
115  |  |             | s4*t5 |  | 
116  |  |             | s5*t6 |  | 
117  |  |             \ s6*t4 /  | 
118  |  |  | 
119  |  |   The scheduling uses two temporaries U0 and U1 to store products, and  | 
120  |  |   two, S0 and T0, to store combinations of entries of the two  | 
121  |  |   operands.  | 
122  |  | */  | 
123  |  |  | 
124  |  | /* Computes R = R * M. Elements are numbers R = (r0, r1; r2, r3).  | 
125  |  |  *  | 
126  |  |  * Resulting elements are of size up to rn + mn + 1.  | 
127  |  |  *  | 
128  |  |  * Temporary storage: 3 rn + 3 mn + 5. */  | 
static void
mpn_matrix22_mul_strassen (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
			   mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
			   mp_ptr tp)
{
  mp_ptr s0, t0, u0, u1;
  /* Sign bits (0 = non-negative, 1 = negative) tracked alongside the
     unsigned limb arrays; see abs_sub_n/add_signed_n.  */
  int r1s, r3s, s0s, t0s, u1s;

  /* Carve scratch out of tp: S0, T0 hold signed operand combinations
     (one guard limb each); U0, U1 hold products.  */
  s0 = tp; tp += rn + 1;
  t0 = tp; tp += mn + 1;
  u0 = tp; tp += rn + mn + 1;
  u1 = tp; /* rn + mn + 2 */

  /* The s_i/t_i/u_i below refer to the linear combinations in the
     algorithm comment above; R's entries are overwritten in place as
     they become dead.  */
  MUL (u0, r1, rn, m2, mn);		/* u5 = s5 * t6 */
  r3s = abs_sub_n (r3, r3, r2, rn);	/* r3 - r2 */
  if (r3s)
    {
      r1s = abs_sub_n (r1, r1, r3, rn);
      r1[rn] = 0;
    }
  else
    {
      r1[rn] = mpn_add_n (r1, r1, r3, rn);
      r1s = 0;				/* r1 - r2 + r3  */
    }
  if (r1s)
    {
      s0[rn] = mpn_add_n (s0, r1, r0, rn);
      s0s = 0;
    }
  else if (r1[rn] != 0)
    {
      s0[rn] = r1[rn] - mpn_sub_n (s0, r1, r0, rn);
      s0s = 1;				/* s4 = -r0 + r1 - r2 + r3 */
					/* Reverse sign! */
    }
  else
    {
      s0s = abs_sub_n (s0, r0, r1, rn);
      s0[rn] = 0;
    }
  MUL (u1, r0, rn, m0, mn);		/* u0 = s0 * t0 */
  r0[rn+mn] = mpn_add_n (r0, u0, u1, rn + mn);
  ASSERT (r0[rn+mn] < 2);		/* u0 + u5 */

  t0s = abs_sub_n (t0, m3, m2, mn);
  /* u2's sign is stored reversed, hence the extra ^1.  */
  u1s = r3s^t0s^1;			/* Reverse sign! */
  MUL (u1, r3, rn, t0, mn);		/* u2 = s2 * t2 */
  u1[rn+mn] = 0;
  if (t0s)
    {
      t0s = abs_sub_n (t0, m1, t0, mn);
      t0[mn] = 0;
    }
  else
    {
      t0[mn] = mpn_add_n (t0, t0, m1, mn);
    }

  /* FIXME: Could be simplified if we had space for rn + mn + 2 limbs
     at r3. I'd expect that for matrices of random size, the high
     words t0[mn] and r1[rn] are non-zero with a pretty small
     probability. If that can be confirmed this should be done as an
     unconditional rn x (mn+1) followed by an if (UNLIKELY (r1[rn]))
     add_n. */
  if (t0[mn] != 0)
    {
      MUL (r3, r1, rn, t0, mn + 1);	/* u3 = s3 * t3 */
      ASSERT (r1[rn] < 2);
      if (r1[rn] != 0)
	mpn_add_n (r3 + rn, r3 + rn, t0, mn + 1);
    }
  else
    {
      MUL (r3, r1, rn + 1, t0, mn);
    }

  ASSERT (r3[rn+mn] < 4);

  u0[rn+mn] = 0;
  if (r1s^t0s)
    {
      /* Signs differ: |u5| - |u3|, tracking the resulting sign.  */
      r3s = abs_sub_n (r3, u0, r3, rn + mn + 1);
    }
  else
    {
      ASSERT_NOCARRY (mpn_add_n (r3, r3, u0, rn + mn + 1));
      r3s = 0;				/* u3 + u5 */
    }

  if (t0s)
    {
      t0[mn] = mpn_add_n (t0, t0, m0, mn);
    }
  else if (t0[mn] != 0)
    {
      t0[mn] -= mpn_sub_n (t0, t0, m0, mn);
    }
  else
    {
      t0s = abs_sub_n (t0, t0, m0, mn);
    }
  MUL (u0, r2, rn, t0, mn + 1);		/* u6 = s6 * t4 */
  ASSERT (u0[rn+mn] < 2);
  if (r1s)
    {
      ASSERT_NOCARRY (mpn_sub_n (r1, r2, r1, rn));
    }
  else
    {
      r1[rn] += mpn_add_n (r1, r1, r2, rn);
    }
  /* From here on the working size includes the guard limb.  */
  rn++;
  t0s = add_signed_n (r2, r3, r3s, u0, t0s, rn + mn);
					/* u3 + u5 + u6 */
  ASSERT (r2[rn+mn-1] < 4);
  r3s = add_signed_n (r3, r3, r3s, u1, u1s, rn + mn);
					/* -u2 + u3 + u5  */
  ASSERT (r3[rn+mn-1] < 3);
  MUL (u0, s0, rn, m1, mn);		/* u4 = s4 * t5 */
  ASSERT (u0[rn+mn-1] < 2);
  t0[mn] = mpn_add_n (t0, m3, m1, mn);
  MUL (u1, r1, rn, t0, mn + 1);		/* u1 = s1 * t1 */
  /* mn now holds the full result size rn + mn.  */
  mn += rn;
  ASSERT (u1[mn-1] < 4);
  ASSERT (u1[mn] == 0);
  ASSERT_NOCARRY (add_signed_n (r1, r3, r3s, u0, s0s, mn));
					/* -u2 + u3 - u4 + u5  */
  ASSERT (r1[mn-1] < 2);
  if (r3s)
    {
      ASSERT_NOCARRY (mpn_add_n (r3, u1, r3, mn));
    }
  else
    {
      ASSERT_NOCARRY (mpn_sub_n (r3, u1, r3, mn));
					/* u1 + u2 - u3 - u5  */
    }
  ASSERT (r3[mn-1] < 2);
  if (t0s)
    {
      ASSERT_NOCARRY (mpn_add_n (r2, u1, r2, mn));
    }
  else
    {
      ASSERT_NOCARRY (mpn_sub_n (r2, u1, r2, mn));
					/* u1 - u3 - u5 - u6  */
    }
  ASSERT (r2[mn-1] < 2);
}
278  |  |  | 
279  |  | void  | 
280  |  | mpn_matrix22_mul (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,  | 
281  |  |       mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,  | 
282  |  |       mp_ptr tp)  | 
283  | 0  | { | 
284  | 0  |   if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)  | 
285  | 0  |       || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))  | 
286  | 0  |     { | 
287  | 0  |       mp_ptr p0, p1;  | 
288  | 0  |       unsigned i;  | 
289  |  |  | 
290  |  |       /* Temporary storage: 3 rn + 2 mn */  | 
291  | 0  |       p0 = tp + rn;  | 
292  | 0  |       p1 = p0 + rn + mn;  | 
293  |  | 
  | 
294  | 0  |       for (i = 0; i < 2; i++)  | 
295  | 0  |   { | 
296  | 0  |     MPN_COPY (tp, r0, rn);  | 
297  |  |  | 
298  | 0  |     if (rn >= mn)  | 
299  | 0  |       { | 
300  | 0  |         mpn_mul (p0, r0, rn, m0, mn);  | 
301  | 0  |         mpn_mul (p1, r1, rn, m3, mn);  | 
302  | 0  |         mpn_mul (r0, r1, rn, m2, mn);  | 
303  | 0  |         mpn_mul (r1, tp, rn, m1, mn);  | 
304  | 0  |       }  | 
305  | 0  |     else  | 
306  | 0  |       { | 
307  | 0  |         mpn_mul (p0, m0, mn, r0, rn);  | 
308  | 0  |         mpn_mul (p1, m3, mn, r1, rn);  | 
309  | 0  |         mpn_mul (r0, m2, mn, r1, rn);  | 
310  | 0  |         mpn_mul (r1, m1, mn, tp, rn);  | 
311  | 0  |       }  | 
312  | 0  |     r0[rn+mn] = mpn_add_n (r0, r0, p0, rn + mn);  | 
313  | 0  |     r1[rn+mn] = mpn_add_n (r1, r1, p1, rn + mn);  | 
314  |  | 
  | 
315  | 0  |     r0 = r2; r1 = r3;  | 
316  | 0  |   }  | 
317  | 0  |     }  | 
318  | 0  |   else  | 
319  | 0  |     mpn_matrix22_mul_strassen (r0, r1, r2, r3, rn,  | 
320  | 0  |              m0, m1, m2, m3, mn, tp);  | 
321  | 0  | }  |