/src/gmp/mpn/toom43_mul.c
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /* mpn_toom43_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 4/3 | 
2  |  |    times as large as bn.  Or more accurately, bn < an < 2 bn.  | 
3  |  |  | 
4  |  |    Contributed to the GNU project by Marco Bodrato.  | 
5  |  |  | 
6  |  |    The idea of applying toom to unbalanced multiplication is due to Marco  | 
7  |  |    Bodrato and Alberto Zanoni.  | 
8  |  |  | 
9  |  |    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY  | 
10  |  |    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST  | 
11  |  |    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.  | 
12  |  |  | 
13  |  | Copyright 2009, 2020 Free Software Foundation, Inc.  | 
14  |  |  | 
15  |  | This file is part of the GNU MP Library.  | 
16  |  |  | 
17  |  | The GNU MP Library is free software; you can redistribute it and/or modify  | 
18  |  | it under the terms of either:  | 
19  |  |  | 
20  |  |   * the GNU Lesser General Public License as published by the Free  | 
21  |  |     Software Foundation; either version 3 of the License, or (at your  | 
22  |  |     option) any later version.  | 
23  |  |  | 
24  |  | or  | 
25  |  |  | 
26  |  |   * the GNU General Public License as published by the Free Software  | 
27  |  |     Foundation; either version 2 of the License, or (at your option) any  | 
28  |  |     later version.  | 
29  |  |  | 
30  |  | or both in parallel, as here.  | 
31  |  |  | 
32  |  | The GNU MP Library is distributed in the hope that it will be useful, but  | 
33  |  | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY  | 
34  |  | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License  | 
35  |  | for more details.  | 
36  |  |  | 
37  |  | You should have received copies of the GNU General Public License and the  | 
38  |  | GNU Lesser General Public License along with the GNU MP Library.  If not,  | 
39  |  | see https://www.gnu.org/licenses/.  */  | 
40  |  |  | 
41  |  |  | 
42  |  | #include "gmp-impl.h"  | 
43  |  |  | 
44  |  | /* Evaluate in: -2, -1, 0, +1, +2, +inf  | 
45  |  |  | 
46  |  |   <-s-><--n--><--n--><--n-->  | 
47  |  |    ___ ______ ______ ______  | 
48  |  |   |a3_|___a2_|___a1_|___a0_|  | 
49  |  |   |_b2_|___b1_|___b0_|  | 
50  |  |   <-t--><--n--><--n-->  | 
51  |  |  | 
52  |  |   v0  =  a0             * b0          #   A(0)*B(0)  | 
53  |  |   v1  = (a0+ a1+ a2+ a3)*(b0+ b1+ b2) #   A(1)*B(1)      ah  <= 3  bh <= 2  | 
54  |  |   vm1 = (a0- a1+ a2- a3)*(b0- b1+ b2) #  A(-1)*B(-1)    |ah| <= 1 |bh|<= 1  | 
55  |  |   v2  = (a0+2a1+4a2+8a3)*(b0+2b1+4b2) #   A(2)*B(2)      ah  <= 14 bh <= 6  | 
56  |  |   vm2 = (a0-2a1+4a2-8a3)*(b0-2b1+4b2) #  A(-2)*B(-2)    |ah| <= 9 |bh|<= 4  | 
57  |  |   vinf=              a3 *         b2  # A(inf)*B(inf)  | 
58  |  | */  | 
59  |  |  | 
60  |  | void  | 
61  |  | mpn_toom43_mul (mp_ptr pp,  | 
62  |  |     mp_srcptr ap, mp_size_t an,  | 
63  |  |     mp_srcptr bp, mp_size_t bn, mp_ptr scratch)  | 
64  | 0  | { | 
65  | 0  |   mp_size_t n, s, t;  | 
66  | 0  |   enum toom6_flags flags;  | 
67  | 0  |   mp_limb_t cy;  | 
68  |  | 
  | 
69  | 0  | #define a0  ap  | 
70  | 0  | #define a1  (ap + n)  | 
71  | 0  | #define a2  (ap + 2 * n)  | 
72  | 0  | #define a3  (ap + 3 * n)  | 
73  | 0  | #define b0  bp  | 
74  | 0  | #define b1  (bp + n)  | 
75  | 0  | #define b2  (bp + 2 * n)  | 
76  |  | 
  | 
77  | 0  |   n = 1 + (3 * an >= 4 * bn ? (an - 1) >> 2 : (bn - 1) / (size_t) 3);  | 
78  |  | 
  | 
79  | 0  |   s = an - 3 * n;  | 
80  | 0  |   t = bn - 2 * n;  | 
81  |  | 
  | 
82  | 0  |   ASSERT (0 < s && s <= n);  | 
83  | 0  |   ASSERT (0 < t && t <= n);  | 
84  |  |  | 
85  |  |   /* This is true whenever an >= 25 or bn >= 19, I think. It  | 
86  |  |      guarantees that we can fit 5 values of size n+1 in the product  | 
87  |  |      area. */  | 
88  | 0  |   ASSERT (s+t >= 5);  | 
89  |  | 
  | 
90  | 0  | #define v0    pp        /* 2n */  | 
91  | 0  | #define vm1   (scratch)        /* 2n+1 */  | 
92  | 0  | #define v1    (pp + 2*n)      /* 2n+1 */  | 
93  | 0  | #define vm2   (scratch + 2 * n + 1)    /* 2n+1 */  | 
94  | 0  | #define v2    (scratch + 4 * n + 2)    /* 2n+1 */  | 
95  | 0  | #define vinf  (pp + 5 * n)      /* s+t */  | 
96  | 0  | #define bs1    pp        /* n+1 */  | 
97  | 0  | #define bsm1  (scratch + 2 * n + 2)    /* n+1 */  | 
98  | 0  | #define asm1  (scratch + 3 * n + 3)    /* n+1 */  | 
99  | 0  | #define asm2  (scratch + 4 * n + 4)    /* n+1 */  | 
100  | 0  | #define bsm2  (pp + n + 1)      /* n+1 */  | 
101  | 0  | #define bs2   (pp + 2 * n + 2)      /* n+1 */  | 
102  | 0  | #define as2   (pp + 3 * n + 3)      /* n+1 */  | 
103  | 0  | #define as1   (pp + 4 * n + 4)      /* n+1 */  | 
104  |  |  | 
105  |  |   /* Total sccratch need is 6 * n + 3 + 1; we allocate one extra  | 
106  |  |      limb, because products will overwrite 2n+2 limbs. */  | 
107  |  | 
  | 
108  | 0  | #define a0a2  scratch  | 
109  | 0  | #define b0b2  scratch  | 
110  | 0  | #define a1a3  asm1  | 
111  | 0  | #define b1d   bsm1  | 
112  |  |  | 
113  |  |   /* Compute as2 and asm2.  */  | 
114  | 0  |   flags = (enum toom6_flags) (toom6_vm2_neg & mpn_toom_eval_dgr3_pm2 (as2, asm2, ap, n, s, a1a3));  | 
115  |  |  | 
116  |  |   /* Compute bs2 and bsm2.  */  | 
117  | 0  |   b1d[n] = mpn_lshift (b1d, b1, n, 1);      /*       2b1      */  | 
118  | 0  | #if HAVE_NATIVE_mpn_addlsh2_n  | 
119  | 0  |   cy = mpn_addlsh2_n (b0b2, b0, b2, t);     /*  4b2      + b0 */  | 
120  |  | #else  | 
121  |  |   cy  = mpn_lshift (b0b2, b2, t, 2);      /*  4b2           */  | 
122  |  |   cy += mpn_add_n (b0b2, b0b2, b0, t);      /*  4b2      + b0 */  | 
123  |  | #endif  | 
124  | 0  |   if (t != n)  | 
125  | 0  |     cy = mpn_add_1 (b0b2 + t, b0 + t, n - t, cy);  | 
126  | 0  |   b0b2[n] = cy;  | 
127  |  | 
  | 
128  |  | #if HAVE_NATIVE_mpn_add_n_sub_n  | 
129  |  |   if (mpn_cmp (b0b2, b1d, n+1) < 0)  | 
130  |  |     { | 
131  |  |       mpn_add_n_sub_n (bs2, bsm2, b1d, b0b2, n+1);  | 
132  |  |       flags = (enum toom6_flags) (flags ^ toom6_vm2_neg);  | 
133  |  |     }  | 
134  |  |   else  | 
135  |  |     { | 
136  |  |       mpn_add_n_sub_n (bs2, bsm2, b0b2, b1d, n+1);  | 
137  |  |     }  | 
138  |  | #else  | 
139  | 0  |   mpn_add_n (bs2, b0b2, b1d, n+1);  | 
140  | 0  |   if (mpn_cmp (b0b2, b1d, n+1) < 0)  | 
141  | 0  |     { | 
142  | 0  |       mpn_sub_n (bsm2, b1d, b0b2, n+1);  | 
143  | 0  |       flags = (enum toom6_flags) (flags ^ toom6_vm2_neg);  | 
144  | 0  |     }  | 
145  | 0  |   else  | 
146  | 0  |     { | 
147  | 0  |       mpn_sub_n (bsm2, b0b2, b1d, n+1);  | 
148  | 0  |     }  | 
149  | 0  | #endif  | 
150  |  |  | 
151  |  |   /* Compute as1 and asm1.  */  | 
152  | 0  |   flags = (enum toom6_flags) (flags ^ (toom6_vm1_neg & mpn_toom_eval_dgr3_pm1 (as1, asm1, ap, n, s, a0a2)));  | 
153  |  |  | 
154  |  |   /* Compute bs1 and bsm1.  */  | 
155  | 0  |   bsm1[n] = mpn_add (bsm1, b0, n, b2, t);  | 
156  |  | #if HAVE_NATIVE_mpn_add_n_sub_n  | 
157  |  |   if (bsm1[n] == 0 && mpn_cmp (bsm1, b1, n) < 0)  | 
158  |  |     { | 
159  |  |       cy = mpn_add_n_sub_n (bs1, bsm1, b1, bsm1, n);  | 
160  |  |       bs1[n] = cy >> 1;  | 
161  |  |       flags = (enum toom6_flags) (flags ^ toom6_vm1_neg);  | 
162  |  |     }  | 
163  |  |   else  | 
164  |  |     { | 
165  |  |       cy = mpn_add_n_sub_n (bs1, bsm1, bsm1, b1, n);  | 
166  |  |       bs1[n] = bsm1[n] + (cy >> 1);  | 
167  |  |       bsm1[n]-= cy & 1;  | 
168  |  |     }  | 
169  |  | #else  | 
170  | 0  |   bs1[n] = bsm1[n] + mpn_add_n (bs1, bsm1, b1, n);  | 
171  | 0  |   if (bsm1[n] == 0 && mpn_cmp (bsm1, b1, n) < 0)  | 
172  | 0  |     { | 
173  | 0  |       mpn_sub_n (bsm1, b1, bsm1, n);  | 
174  | 0  |       flags = (enum toom6_flags) (flags ^ toom6_vm1_neg);  | 
175  | 0  |     }  | 
176  | 0  |   else  | 
177  | 0  |     { | 
178  | 0  |       bsm1[n] -= mpn_sub_n (bsm1, bsm1, b1, n);  | 
179  | 0  |     }  | 
180  | 0  | #endif  | 
181  |  | 
  | 
182  | 0  |   ASSERT (as1[n] <= 3);  | 
183  | 0  |   ASSERT (bs1[n] <= 2);  | 
184  | 0  |   ASSERT (asm1[n] <= 1);  | 
185  | 0  |   ASSERT (bsm1[n] <= 1);  | 
186  | 0  |   ASSERT (as2[n] <=14);  | 
187  | 0  |   ASSERT (bs2[n] <= 6);  | 
188  | 0  |   ASSERT (asm2[n] <= 9);  | 
189  | 0  |   ASSERT (bsm2[n] <= 4);  | 
190  |  |  | 
191  |  |   /* vm1, 2n+1 limbs */  | 
192  | 0  |   vm1[2*n] = 0;  | 
193  | 0  |   mpn_mul_n (vm1, asm1, bsm1, n + (asm1[n] | bsm1[n]));  /* W4 */  | 
194  |  |  | 
195  |  |   /* vm2, 2n+1 limbs */  | 
196  | 0  |   mpn_mul_n (vm2, asm2, bsm2, n+1);  /* W2 */  | 
197  |  |  | 
198  |  |   /* v2, 2n+1 limbs */  | 
199  | 0  |   mpn_mul_n (v2, as2, bs2, n+1);  /* W1 */  | 
200  |  |  | 
201  |  |   /* v1, 2n+1 limbs */  | 
202  | 0  |   mpn_mul_n (v1, as1, bs1, n+1);  /* W3 */  | 
203  |  |  | 
204  |  |   /* vinf, s+t limbs */   /* W0 */  | 
205  | 0  |   if (s > t)  mpn_mul (vinf, a3, s, b2, t);  | 
206  | 0  |   else        mpn_mul (vinf, b2, t, a3, s);  | 
207  |  |  | 
208  |  |   /* v0, 2n limbs */  | 
209  | 0  |   mpn_mul_n (v0, ap, bp, n);  /* W5 */  | 
210  |  | 
  | 
211  | 0  |   mpn_toom_interpolate_6pts (pp, n, flags, vm1, vm2, v2, t + s);  | 
212  |  | 
  | 
213  | 0  | #undef v0  | 
214  | 0  | #undef vm1  | 
215  | 0  | #undef v1  | 
216  | 0  | #undef vm2  | 
217  | 0  | #undef v2  | 
218  | 0  | #undef vinf  | 
219  | 0  | #undef bs1  | 
220  | 0  | #undef bs2  | 
221  | 0  | #undef bsm1  | 
222  | 0  | #undef bsm2  | 
223  | 0  | #undef asm1  | 
224  | 0  | #undef asm2  | 
225  |  | /* #undef as1 */  | 
226  |  | /* #undef as2 */  | 
227  | 0  | #undef a0a2  | 
228  | 0  | #undef b0b2  | 
229  | 0  | #undef a1a3  | 
230  | 0  | #undef b1d  | 
231  | 0  | #undef a0  | 
232  | 0  | #undef a1  | 
233  | 0  | #undef a2  | 
234  | 0  | #undef a3  | 
235  | 0  | #undef b0  | 
236  | 0  | #undef b1  | 
237  | 0  | #undef b2  | 
238  | 0  | }  |