/src/gmp/mpn/invertappr.c
/* mpn_invertappr and helper functions.  Compute I such that
   floor((B^{2n}-1)/U) - 1 <= I + B^n <= floor((B^{2n}-1)/U).

   Contributed to the GNU project by Marco Bodrato.

   The algorithm used here was inspired by ApproximateReciprocal from "Modern
   Computer Arithmetic", by Richard P. Brent and Paul Zimmermann.  Special
   thanks to Paul Zimmermann for his very valuable suggestions on all the
   theoretical aspects during the work on this code.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright (C) 2007, 2009, 2010, 2012, 2015, 2016 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"
/* FIXME: The iterative version splits the operand into two slightly
   unbalanced parts; using log_2 (or counting the bits) underestimates the
   maximum number of iterations.  */

#if TUNE_PROGRAM_BUILD
#define NPOWS \
 ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)))
#define MAYBE_dcpi1_divappr   1
#else
#define NPOWS \
 ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)) - LOG2C (INV_NEWTON_THRESHOLD))
#define MAYBE_dcpi1_divappr \
  (INV_NEWTON_THRESHOLD < DC_DIVAPPR_Q_THRESHOLD)
#if (INV_NEWTON_THRESHOLD > INV_MULMOD_BNM1_THRESHOLD) && \
    (INV_APPR_THRESHOLD > INV_MULMOD_BNM1_THRESHOLD)
#undef  INV_MULMOD_BNM1_THRESHOLD
#define INV_MULMOD_BNM1_THRESHOLD 0 /* always when Newton */
#endif
#endif

/* All three functions mpn{,_bc,_ni}_invertappr (ip, dp, n, scratch) take
   the strictly normalised value {dp,n} (i.e., the most significant bit must
   be set) as an input, and compute {ip,n}: the approximate reciprocal of
   {dp,n}.

   Let e = mpn*_invertappr (ip, dp, n, scratch) be the returned value; the
   following conditions are satisfied by the output:
     0 <= e <= 1;
     {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1+e) .
   I.e. e=0 means that the result {ip,n} equals the one given by mpn_invert;
        e=1 means that the result _may_ be one less than expected.

   The _bc version returns e=1 most of the time.
   The _ni version should return e=0 most of the time; only about 1% of
   possible random inputs should give e=1.

   When the strict result is needed, i.e., e=0 in the relation above:
     {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1) ;
   the function mpn_invert (ip, dp, n, scratch) should be used instead.  */

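/* Toy example of the relation above (illustrative only, with B = 10 for
   readability; in the code B is the limb base 2^GMP_NUMB_BITS): take n = 2
   and the normalised {dp,n} = 50, i.e. the fraction 0.50.  Then
     floor ((B^4 - 1) / 50) = floor (9999 / 50) = 199 ,
   so {ip,n} = 99 together with e = 0 satisfies
     50 * (100 + 99) = 9950 < 10000 <= 50 * (100 + 99 + 1) = 10000 .  */
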
/* Maximum scratch needed by this branch (at xp): 2*n */
static mp_limb_t
mpn_bc_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr xp)
{
  ASSERT (n > 0);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, xp, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, xp, mpn_invertappr_itch(n)));

  /* Compute a base value of n limbs. */
  if (n == 1)
    invert_limb (*ip, *dp);
  else {
    /* n > 1 here */
    MPN_FILL (xp, n, GMP_NUMB_MAX);
    mpn_com (xp + n, dp, n);

    /* Now xp contains B^2n - {dp,n}*B^n - 1 */

    /* FIXME: if mpn_*pi1_divappr_q handles n==2, use it! */
    if (n == 2) {
      mpn_divrem_2 (ip, 0, xp, 4, dp);
    } else {
      gmp_pi1_t inv;
      invert_pi1 (inv, dp[n-1], dp[n-2]);
      if (! MAYBE_dcpi1_divappr
          || BELOW_THRESHOLD (n, DC_DIVAPPR_Q_THRESHOLD))
        mpn_sbpi1_divappr_q (ip, xp, 2 * n, dp, n, inv.inv32);
      else
        mpn_dcpi1_divappr_q (ip, xp, 2 * n, dp, n, &inv);
      MPN_DECR_U (ip, n, CNST_LIMB (1));
      return 1;
    }
  }
  return 0;
}
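
/* A note on the base case above (a reading of the code, not normative): for
   n == 1, invert_limb gives ip[0] = floor ((B^2 - 1) / d) - B, exactly the I
   of the formula at the top of this file.  For n > 1 the code divides
   B^{2n} - {dp,n}*B^n - 1 by {dp,n} with a divappr function; since the
   approximate quotient may be one unit too large for this input, one is
   subtracted and e = 1 is returned: the result may be one less than the
   optimal value.  */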

/* mpn_ni_invertappr: computes the approximate reciprocal using Newton's
   iterations (at least one).

   Inspired by Algorithm "ApproximateReciprocal", published in "Modern
   Computer Arithmetic" by Richard P. Brent and Paul Zimmermann, algorithm
   3.5, page 121 in version 0.4 of the book.

   Some adaptations were introduced, to allow a product mod B^m-1 and to
   return the value e.

   We introduced a correction such that "the value of B^{n+h}-T computed at
   step 8 cannot exceed B^n-1" (the book reads "2B^n-1").

   Maximum scratch needed by this branch is <= 2*n, but 3*rn limbs must also
   fit in the scratch, i.e. 3*rn <= 2*n: we require n > 4.

   We use a wrapped product modulo B^m-1.  NOTE: is there any normalisation
   problem for the [0] class?  There shouldn't be: we compute
   2*|A*X_h - B^{n+h}| < B^m-1.  We may get [0] if and only if
   A*X_h = B^{n+h}.  This can happen only if A = B^n/2, but that implies
   X_h = 2*B^h - 1, i.e. A*X_h = B^{n+h} - A, so we get into the "negative"
   branch, where X_h is not incremented (because A < B^n).

   FIXME: the scratch needed by mulmod_bnm1 does not currently fit in the
   scratch area; it is allocated separately.  */
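
/* For reference, the classical Newton step that the loop below implements
   (in a truncated, possibly wrapped form): writing D = 0.{dp,n} for the
   divisor and X = 1.{ip,rn} for the current approximation,
     X' = X + X*(1 - D*X)
   roughly doubles the number of correct limbs at each pass; the code keeps
   only the most significant limbs of each product.  */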

mp_limb_t
mpn_ni_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
{
  mp_limb_t cy;
  mp_size_t rn, mn;
  mp_size_t sizes[NPOWS], *sizp;
  mp_ptr tp;
  TMP_DECL;
#define xp scratch

  ASSERT (n > 4);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));

  /* Compute the computation precisions from highest to lowest, leaving the
     base case size in 'rn'.  */
  sizp = sizes;
  rn = n;
  do {
    *sizp = rn;
    rn = (rn >> 1) + 1;
    ++sizp;
  } while (ABOVE_THRESHOLD (rn, INV_NEWTON_THRESHOLD));
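  /* Example (with a hypothetical INV_NEWTON_THRESHOLD of 170): for n = 1000
     the loop above stores sizes[] = {1000, 501, 251} and leaves rn = 126;
     the base case is then computed on 126 limbs, and the Newton loop below
     rebuilds the inverse through 251, 501 and finally 1000 limbs.  */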

  /* We seek the inverse of 0.{dp,n}; we compute it as 1.{ip,n}.  */
  dp += n;
  ip += n;

  /* Compute a base value of rn limbs. */
  mpn_bc_invertappr (ip - rn, dp - rn, rn, scratch);

  TMP_MARK;

  if (ABOVE_THRESHOLD (n, INV_MULMOD_BNM1_THRESHOLD))
    {
      mn = mpn_mulmod_bnm1_next_size (n + 1);
      tp = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (mn, n, (n >> 1) + 1));
    }
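
  /* The scratch for the wrapped products is sized once, for the largest
     (i.e. the final) iteration; this seems intended to avoid a separate
     TMP allocation at each pass.  */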
  /* Use Newton's iterations to get the desired precision.  */

  while (1) {
    n = *--sizp;
    /*
      v    n  v
      +----+--+
      ^ rn ^
    */

    /* Compute i_jd.  */
    if (BELOW_THRESHOLD (n, INV_MULMOD_BNM1_THRESHOLD)
        || ((mn = mpn_mulmod_bnm1_next_size (n + 1)) > (n + rn))) {
      /* FIXME: We only need {xp,n+1}.  */
      mpn_mul (xp, dp - n, n, ip - rn, rn);
      mpn_add_n (xp + rn, xp + rn, dp - n, n - rn + 1);
      cy = CNST_LIMB (1); /* Remember we truncated, mod B^(n+1).  */
      /* We computed (truncated) {xp,n+1} <- 1.{ip,rn} * 0.{dp,n} */
    } else { /* Use B^mn-1 wraparound */
      mpn_mulmod_bnm1 (xp, mn, dp - n, n, ip - rn, rn, tp);
      /* We computed {xp,mn} <- {ip,rn} * {dp,n} mod (B^mn-1) */
      /* We know that 2*|ip*dp + dp*B^rn - B^{rn+n}| < B^mn-1 */
      /* Add dp*B^rn mod (B^mn-1) */
      ASSERT (n >= mn - rn);
      cy = mpn_add_n (xp + rn, xp + rn, dp - n, mn - rn);
      cy = mpn_add_nc (xp, xp, dp - (n - (mn - rn)), n - (mn - rn), cy);
      /* Subtract B^{rn+n}, maybe only compensate the carry.  */
      xp[mn] = CNST_LIMB (1); /* set a limit for DECR_U */
      MPN_DECR_U (xp + rn + n - mn, 2 * mn + 1 - rn - n, CNST_LIMB (1) - cy);
      MPN_DECR_U (xp, mn, CNST_LIMB (1) - xp[mn]); /* if DECR_U eroded xp[mn] */
      cy = CNST_LIMB (0); /* Remember we are working mod B^mn-1.  */
    }

    if (xp[n] < CNST_LIMB (2)) { /* "positive" residue class */
      cy = xp[n]; /* 0 <= cy <= 1 here. */
#if HAVE_NATIVE_mpn_sublsh1_n
      if (cy++) {
        if (mpn_cmp (xp, dp - n, n) > 0) {
          mp_limb_t chk;
          chk = mpn_sublsh1_n (xp, xp, dp - n, n);
          ASSERT (chk == xp[n]);
          ++cy;
        } else
          ASSERT_CARRY (mpn_sub_n (xp, xp, dp - n, n));
      }
#else /* no mpn_sublsh1_n */
      if (cy++ && ! mpn_sub_n (xp, xp, dp - n, n)) {
        ASSERT_CARRY (mpn_sub_n (xp, xp, dp - n, n));
        ++cy;
      }
#endif
      /* 1 <= cy <= 3 here. */
#if HAVE_NATIVE_mpn_rsblsh1_n
      if (mpn_cmp (xp, dp - n, n) > 0) {
        ASSERT_NOCARRY (mpn_rsblsh1_n (xp + n, xp, dp - n, n));
        ++cy;
      } else
        ASSERT_NOCARRY (mpn_sub_nc (xp + 2 * n - rn, dp - rn, xp + n - rn, rn, mpn_cmp (xp, dp - n, n - rn) > 0));
#else /* no mpn_rsblsh1_n */
      if (mpn_cmp (xp, dp - n, n) > 0) {
        ASSERT_NOCARRY (mpn_sub_n (xp, xp, dp - n, n));
        ++cy;
      }
      ASSERT_NOCARRY (mpn_sub_nc (xp + 2 * n - rn, dp - rn, xp + n - rn, rn, mpn_cmp (xp, dp - n, n - rn) > 0));
#endif
      MPN_DECR_U (ip - rn, rn, cy); /* 1 <= cy <= 4 here. */
    } else { /* "negative" residue class */
      ASSERT (xp[n] >= GMP_NUMB_MAX - CNST_LIMB (1));
      MPN_DECR_U (xp, n + 1, cy);
      if (xp[n] != GMP_NUMB_MAX) {
        MPN_INCR_U (ip - rn, rn, CNST_LIMB (1));
        ASSERT_CARRY (mpn_add_n (xp, xp, dp - n, n));
      }
      mpn_com (xp + 2 * n - rn, xp + n - rn, rn);
    }

    /* Compute x_ju_j.  FIXME: We need {xp+rn,rn}, mulhi?  */
    mpn_mul_n (xp, xp + 2 * n - rn, ip - rn, rn);
    cy = mpn_add_n (xp + rn, xp + rn, xp + 2 * n - rn, 2 * rn - n);
    cy = mpn_add_nc (ip - n, xp + 3 * rn - n, xp + n + rn, n - rn, cy);
    MPN_INCR_U (ip - rn, rn, cy);
    if (sizp == sizes) { /* Get out of the loop */
      /* Check for possible carry propagation from below. */
      cy = xp[3 * rn - n - 1] > GMP_NUMB_MAX - CNST_LIMB (7); /* Be conservative. */
      /* cy = mpn_add_1 (xp + rn, xp + rn, 2*rn - n, 4); */
      break;
    }
    rn = n;
  }
  TMP_FREE;

  return cy;
#undef xp
}

mp_limb_t
mpn_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
{
  ASSERT (n > 0);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));

  if (BELOW_THRESHOLD (n, INV_NEWTON_THRESHOLD))
    return mpn_bc_invertappr (ip, dp, n, scratch);
  else
    return mpn_ni_invertappr (ip, dp, n, scratch);
}
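
/* Usage sketch (illustrative, not part of GMP): a caller holding a
   normalised divisor {dp,n} could obtain the approximate inverse with
   something along these lines, using the usual mpn TMP conventions:

     mp_limb_t e;
     mp_ptr ip, scratch;
     TMP_DECL;
     TMP_MARK;
     ip = TMP_ALLOC_LIMBS (n);
     scratch = TMP_ALLOC_LIMBS (mpn_invertappr_itch (n));
     ASSERT (dp[n - 1] & GMP_NUMB_HIGHBIT);
     e = mpn_invertappr (ip, dp, n, scratch);
     TMP_FREE;

   afterwards {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1+e).  */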