/* mpn_mu_div_qr, mpn_preinv_mu_div_qr.

   Compute Q = floor(N / D) and R = N-QD.  N is nn limbs and D is dn limbs and
   must be normalized, and Q must be nn-dn limbs.  The requirement that Q is
   nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us to
   let N be unmodified during the operation.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 2005-2007, 2009, 2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


/*
   The idea of the algorithm used herein is to compute a smaller inverted value
   than used in the standard Barrett algorithm, and thus save time in the
   Newton iterations, and pay just a small price when using the inverted value
   for developing quotient bits.  This algorithm was presented at ICMS 2006.
*/
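
/* Sketch of the block step (illustrative only; the real loop is in
   mpn_preinv_mu_div_qr below).  Let B = 2^GMP_NUMB_BITS and let I be an
   'in'-limb approximation of B^(2*in)/D1 - B^in, where D1 denotes the most
   significant 'in' limbs of D, so that B^in + I ~= B^(2*in)/D1 with the
   inverse's most significant bit kept implicit.  From the top 'in' limbs R1
   of the current partial remainder, a block of quotient limbs is developed
   roughly as

     Q_block = floor (R1 * (B^in + I) / B^in) = R1 + floor (R1 * I / B^in),

   which is why the loop below adds R1 back in after the mulhi step.  */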

/* CAUTION: This code and the code in mu_divappr_q.c should be edited in sync.

 Things to work on:

  * This isn't optimal when the quotient isn't needed, as it might take a lot
    of space.  The computation is always needed, though, so there is no time
    to save with special code.

  * The itch/scratch scheme perhaps isn't such a good idea as it once seemed,
    as demonstrated by the fact that the mpn_invertappr function's scratch
    needs mean that we need to keep a large allocation long after it is
    needed.  Things are made worse by mpn_mul_fft not accepting any scratch
    parameter, which means we'll have a large memory hole while in
    mpn_mul_fft.  In general, a peak scratch need at the beginning of a
    function isn't well handled by the itch/scratch scheme.
*/

#ifdef STAT
#undef STAT
#define STAT(x) x
#else
#define STAT(x)
#endif

#include <stdlib.h>		/* for NULL */
#include "gmp-impl.h"

/* FIXME: The MU_DIV_QR_SKEW_THRESHOLD was not analysed properly.  It gives a
   speedup according to old measurements, but does the decision mechanism
   really make sense?  It seems as if the ratio between dn and qn might be
   what we really should be checking.  */
#ifndef MU_DIV_QR_SKEW_THRESHOLD
#define MU_DIV_QR_SKEW_THRESHOLD 100
#endif

#ifdef CHECK				/* FIXME: Enable in minithres */
#undef  MU_DIV_QR_SKEW_THRESHOLD
#define MU_DIV_QR_SKEW_THRESHOLD 1
#endif

static mp_limb_t mpn_mu_div_qr2 (mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t, mp_ptr);
static mp_size_t mpn_mu_div_qr_choose_in (mp_size_t, mp_size_t, int);

mp_limb_t
mpn_mu_div_qr (mp_ptr qp,
               mp_ptr rp,
               mp_srcptr np,
               mp_size_t nn,
               mp_srcptr dp,
               mp_size_t dn,
               mp_ptr scratch)
{
  mp_size_t qn;
  mp_limb_t cy, qh;

  qn = nn - dn;
  if (qn + MU_DIV_QR_SKEW_THRESHOLD < dn)
    {
      /* |______________|_ign_first__|   dividend                         nn
              |_______|_ign_first__|   divisor                          dn

              |______|                 quotient (prel)                  qn

               |___________________|   quotient * ignored-divisor-part  dn-1
      */

      /* Compute a preliminary quotient and a partial remainder by dividing
         the most significant limbs of each operand.  */
      qh = mpn_mu_div_qr2 (qp, rp + nn - (2 * qn + 1),
                           np + nn - (2 * qn + 1), 2 * qn + 1,
                           dp + dn - (qn + 1), qn + 1,
                           scratch);

      /* Multiply the quotient by the divisor limbs ignored above.  */
      if (dn - (qn + 1) > qn)
        mpn_mul (scratch, dp, dn - (qn + 1), qp, qn);  /* prod is dn-1 limbs */
      else
        mpn_mul (scratch, qp, qn, dp, dn - (qn + 1));  /* prod is dn-1 limbs */

      if (qh)
        cy = mpn_add_n (scratch + qn, scratch + qn, dp, dn - (qn + 1));
      else
        cy = 0;
      scratch[dn - 1] = cy;

      cy = mpn_sub_n (rp, np, scratch, nn - (2 * qn + 1));
      cy = mpn_sub_nc (rp + nn - (2 * qn + 1),
                       rp + nn - (2 * qn + 1),
                       scratch + nn - (2 * qn + 1),
                       qn + 1, cy);
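      /* A borrow out of the full subtraction means the preliminary quotient
         was one too large; compensate by decrementing Q and adding D back
         into R once.  */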
      if (cy)
        {
          qh -= mpn_sub_1 (qp, qp, qn, 1);
          mpn_add_n (rp, rp, dp, dn);
        }
    }
  else
    {
      qh = mpn_mu_div_qr2 (qp, rp, np, nn, dp, dn, scratch);
    }

  return qh;
}

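/* Compute the full quotient and remainder: form an approximate inverse of
   the most significant limbs of D, then let mpn_preinv_mu_div_qr develop
   the quotient block by block.  */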
static mp_limb_t
mpn_mu_div_qr2 (mp_ptr qp,
                mp_ptr rp,
                mp_srcptr np,
                mp_size_t nn,
                mp_srcptr dp,
                mp_size_t dn,
                mp_ptr scratch)
{
  mp_size_t qn, in;
  mp_limb_t cy, qh;
  mp_ptr ip, tp;

  ASSERT (dn > 1);

  qn = nn - dn;

  /* Compute the inverse size.  */
  in = mpn_mu_div_qr_choose_in (qn, dn, 0);
  ASSERT (in <= dn);

#if 1
  /* This alternative inverse computation method gets slightly more accurate
     results.  FIXMEs: (1) Temp allocation needs not analysed (2) itch function
     not adapted (3) mpn_invertappr scratch needs not met.  */
  ip = scratch;
  tp = scratch + in + 1;

  /* compute an approximate inverse on (in+1) limbs */
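  /* When dn == in the whole divisor participates: prepend a nonzero low limb
     so that an (in+1)-limb operand can be inverted, then drop the lowest
     inverse limb.  Otherwise invert the top in+1 limbs of D rounded up by
     one unit; should that rounding carry out, those limbs were all ones and
     ip = 0 (i.e. B^in together with the implicit msb) serves as the
     inverse.  */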
  if (dn == in)
    {
      MPN_COPY (tp + 1, dp, in);
      tp[0] = 1;
      mpn_invertappr (ip, tp, in + 1, tp + in + 1);
      MPN_COPY_INCR (ip, ip + 1, in);
    }
  else
    {
      cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1);
      if (UNLIKELY (cy != 0))
        MPN_ZERO (ip, in);
      else
        {
          mpn_invertappr (ip, tp, in + 1, tp + in + 1);
          MPN_COPY_INCR (ip, ip + 1, in);
        }
    }
#else
  /* This older inverse computation method gets slightly worse results than
     the one above.  */
  ip = scratch;
  tp = scratch + in;

  /* Compute inverse of D to in+1 limbs, then round to 'in' limbs.  Ideally
     the inversion function should do this automatically.  */
  if (dn == in)
    {
      tp[in + 1] = 0;
      MPN_COPY (tp + in + 2, dp, in);
      mpn_invertappr (tp, tp + in + 1, in + 1, NULL);
    }
  else
    {
      mpn_invertappr (tp, dp + dn - (in + 1), in + 1, NULL);
    }
  cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
  if (UNLIKELY (cy != 0))
    MPN_ZERO (tp + 1, in);
  MPN_COPY (ip, tp + 1, in);
#endif

  qh = mpn_preinv_mu_div_qr (qp, rp, np, nn, dp, dn, ip, in, scratch + in);

  return qh;
}

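/* Divide using a precomputed 'in'-limb approximate inverse ip of the most
   significant limbs of D, the inverse's most significant bit being implicit.
   scratch must hold mpn_preinv_mu_div_qr_itch (nn, dn, in) limbs.  */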
mp_limb_t
mpn_preinv_mu_div_qr (mp_ptr qp,
                      mp_ptr rp,
                      mp_srcptr np,
                      mp_size_t nn,
                      mp_srcptr dp,
                      mp_size_t dn,
                      mp_srcptr ip,
                      mp_size_t in,
                      mp_ptr scratch)
{
  mp_size_t qn;
  mp_limb_t cy, cx, qh;
  mp_limb_t r;
  mp_size_t tn, wn;

#define tp           scratch
#define scratch_out  (scratch + tn)

  qn = nn - dn;

  np += qn;
  qp += qn;

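  /* Develop a possible high quotient limb: if the most significant dn limbs
     of N are >= D, take qh = 1 and subtract D from them up front.  */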
  qh = mpn_cmp (np, dp, dn) >= 0;
  if (qh != 0)
    mpn_sub_n (rp, np, dp, dn);
  else
    MPN_COPY_INCR (rp, np, dn);

  /* if (qn == 0) */		/* The while below handles this case */
  /*   return qh; */		/* Degenerate use.  Should we allow this? */

  while (qn > 0)
    {
      if (qn < in)
        {
          ip += in - qn;
          in = qn;
        }
      np -= in;
      qp -= in;

      /* Compute the next block of quotient limbs by multiplying the inverse I
         by the upper part of the partial remainder R.  */
      mpn_mul_n (tp, rp + dn - in, ip, in);		/* mulhi */
      cy = mpn_add_n (qp, tp + in, rp + dn - in, in);	/* I's msb implicit */
      ASSERT_ALWAYS (cy == 0);

      qn -= in;

      /* Compute the product of the quotient block and the divisor D, to be
         subtracted from the partial remainder combined with new limbs from
         the dividend N.  We only really need the low dn+1 limbs.  */

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
        mpn_mul (tp, dp, dn, qp, in);	/* dn+in limbs, high 'in' cancels */
      else
        {
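          /* mpn_mulmod_bnm1 yields D*Q mod (B^tn - 1).  The true product has
             dn+in limbs, so its top wn limbs wrapped around and were added
             into the low limbs.  Those wrapped limbs essentially agree with
             the top of R (the high 'in' product limbs cancel against it), so
             subtract them back out, cx and cy repairing the borrow at the
             seam.  */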
          tn = mpn_mulmod_bnm1_next_size (dn + 1);
          mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
          wn = dn + in - tn;			/* number of wrapped limbs */
          if (wn > 0)
            {
              cy = mpn_sub_n (tp, tp, rp + dn - wn, wn);
              cy = mpn_sub_1 (tp + wn, tp + wn, tn - wn, cy);
              cx = mpn_cmp (rp + dn - in, tp + dn, tn - dn) < 0;
              ASSERT_ALWAYS (cx >= cy);
              mpn_incr_u (tp, cx - cy);
            }
        }

      r = rp[dn - in] - tp[dn];

      /* Subtract the product from the partial remainder combined with new
         limbs from the dividend N, generating a new partial remainder R.  */
      if (dn != in)
        {
          cy = mpn_sub_n (tp, np, tp, in);	/* get next 'in' limbs from N */
          cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy);
          MPN_COPY (rp, tp, dn);		/* FIXME: try to avoid this */
        }
      else
        {
          cy = mpn_sub_n (rp, np, tp, in);	/* get next 'in' limbs from N */
        }

      STAT (int i; int err = 0;
            static int errarr[5]; static int err_rec; static int tot);

      /* Check the remainder R and adjust the quotient as needed.  */
      r -= cy;
      while (r != 0)
        {
          /* We loop 0 times with about 69% probability, 1 time with about 31%
             probability, 2 times with about 0.6% probability, if inverse is
             computed as recommended.  */
          mpn_incr_u (qp, 1);
          cy = mpn_sub_n (rp, rp, dp, dn);
          r -= cy;
          STAT (err++);
        }
      if (mpn_cmp (rp, dp, dn) >= 0)
        {
          /* This is executed with about 76% probability.  */
          mpn_incr_u (qp, 1);
          cy = mpn_sub_n (rp, rp, dp, dn);
          STAT (err++);
        }

      STAT (
            tot++;
            errarr[err]++;
            if (err > err_rec)
              err_rec = err;
            if (tot % 0x10000 == 0)
              {
                for (i = 0; i <= err_rec; i++)
                  printf ("  %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot);
                printf ("\n");
              }
            );
    }

  return qh;
}

/* In case k=0 (automatic choice), we distinguish 3 cases:
   (a) dn < qn:          in = ceil(qn / ceil(qn/dn))
   (b) dn/3 < qn <= dn:  in = ceil(qn / 2)
   (c) qn <= dn/3:       in = qn
   In all cases we have in <= dn.
 */
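/* For example, with k = 0, qn = 10, and dn = 4 we are in case (a):
   b = ceil(10/4) = 3 blocks and in = ceil(10/3) = 4 <= dn, so the quotient
   is developed in blocks of 4, 4, and 2 limbs.  */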
static mp_size_t
mpn_mu_div_qr_choose_in (mp_size_t qn, mp_size_t dn, int k)
{
  mp_size_t in;

  if (k == 0)
    {
      mp_size_t b;
      if (qn > dn)
        {
          /* Compute an inverse size that is a nice partition of the
             quotient.  */
          b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
          in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
        }
      else if (3 * qn > dn)
        {
          in = (qn - 1) / 2 + 1;	/* b = 2 */
        }
      else
        {
          in = (qn - 1) / 1 + 1;	/* b = 1 */
        }
    }
  else
    {
      mp_size_t xn;
      xn = MIN (dn, qn);
      in = (xn - 1) / k + 1;
    }

  return in;
}

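/* Scratch need of mpn_mu_div_qr: 'in' limbs remain reserved for the inverse,
   followed by the larger of the inversion and preinv division needs (in
   practice the latter, as asserted below).  */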
mp_size_t
mpn_mu_div_qr_itch (mp_size_t nn, mp_size_t dn, int mua_k)
{
  mp_size_t in = mpn_mu_div_qr_choose_in (nn - dn, dn, mua_k);
  mp_size_t itch_preinv = mpn_preinv_mu_div_qr_itch (nn, dn, in);
  mp_size_t itch_invapp = mpn_invertappr_itch (in + 1) + in + 2; /* 3in + 4 */

  ASSERT (itch_preinv >= itch_invapp);
  return in + MAX (itch_invapp, itch_preinv);
}

mp_size_t
mpn_preinv_mu_div_qr_itch (mp_size_t nn, mp_size_t dn, mp_size_t in)
{
  mp_size_t itch_local = mpn_mulmod_bnm1_next_size (dn + 1);
  mp_size_t itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in);

  return itch_local + itch_out;
}
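
/* Usage sketch (illustrative, not part of the library): dividing an nn-limb
   N by a normalized dn-limb D, nn >= dn > 1, with qp providing nn-dn limbs
   and rp providing dn limbs:

     mp_limb_t qh;
     mp_ptr scratch;
     TMP_DECL;

     TMP_MARK;
     scratch = TMP_ALLOC_LIMBS (mpn_mu_div_qr_itch (nn, dn, 0));
     qh = mpn_mu_div_qr (qp, rp, np, nn, dp, dn, scratch);
     TMP_FREE;

   On return the quotient is qh * B^(nn-dn) + {qp, nn-dn} and the remainder
   is {rp, dn}.  */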