Coverage Report

Created: 2024-11-21 07:03

/src/libgmp/mpn/invertappr.c
All executable lines in this file have execution count 0 (the file is uncovered); per-line counts are omitted below.
/* mpn_invertappr and helper functions.  Compute I such that
   floor((B^{2n}-1)/U) - 1 <= I + B^n <= floor((B^{2n}-1)/U).

   Contributed to the GNU project by Marco Bodrato.

   The algorithm used here was inspired by ApproximateReciprocal from "Modern
   Computer Arithmetic", by Richard P. Brent and Paul Zimmermann.  Special
   thanks to Paul Zimmermann for his very valuable suggestions on all the
   theoretical aspects during the work on this code.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright (C) 2007, 2009, 2010, 2012, 2015, 2016 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"

/* FIXME: The iterative version splits the operand into two slightly
   unbalanced parts; using log_2 (or counting the bits) underestimates the
   maximum number of iterations.  */

#if TUNE_PROGRAM_BUILD
#define NPOWS \
 ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)))
#define MAYBE_dcpi1_divappr   1
#else
#define NPOWS \
 ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)) - LOG2C (INV_NEWTON_THRESHOLD))
#define MAYBE_dcpi1_divappr \
  (INV_NEWTON_THRESHOLD < DC_DIVAPPR_Q_THRESHOLD)
#if (INV_NEWTON_THRESHOLD > INV_MULMOD_BNM1_THRESHOLD) && \
    (INV_APPR_THRESHOLD > INV_MULMOD_BNM1_THRESHOLD)
#undef  INV_MULMOD_BNM1_THRESHOLD
#define INV_MULMOD_BNM1_THRESHOLD 0 /* always when Newton */
#endif
#endif
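
/* Illustration (the threshold value here is an assumption, not a tuned
   constant): on a 64-bit host sizeof(mp_size_t) == 8 > 6, so the minuend is
   48; with, say, INV_NEWTON_THRESHOLD = 200 we get LOG2C (200) == 8 and
   NPOWS == 40.  Since each Newton step roughly halves the operand size,
   NPOWS entries comfortably bound the depth of the sizes[] stack below.  */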

/* All three functions mpn{,_bc,_ni}_invertappr (ip, dp, n, scratch) take the
   strictly normalised value {dp,n} (i.e., the most significant bit must be
   set) as input, and compute {ip,n}: the approximate reciprocal of {dp,n}.

   Let e = mpn*_invertappr (ip, dp, n, scratch) be the returned value; the
   following conditions are satisfied by the output:
     0 <= e <= 1;
     {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1+e) .
   I.e. e=0 means that the result {ip,n} equals the one given by mpn_invert;
   e=1 means that the result _may_ be one less than expected.

   The _bc version returns e=1 most of the time.
   The _ni version should return e=0 most of the time; only about 1% of
   possible random inputs should give e=1.

   When the strict result is needed, i.e., e=0 in the relation above:
     {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1) ;
   the function mpn_invert (ip, dp, n, scratch) should be used instead.  */
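
/* A minimal usage sketch (illustrative only: example_invertappr is not part
   of GMP, and these internal functions are reachable only inside a GMP build
   where gmp-impl.h is available).  The scratch size comes from
   mpn_invertappr_itch, and the divisor must be normalised.  */
#if 0	/* illustrative sketch, not compiled */
static void
example_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n)
{
  mp_ptr scratch;
  mp_limb_t e;
  TMP_DECL;

  TMP_MARK;
  scratch = TMP_ALLOC_LIMBS (mpn_invertappr_itch (n));
  ASSERT (dp[n - 1] & GMP_NUMB_HIGHBIT);	/* top bit must be set */
  e = mpn_invertappr (ip, dp, n, scratch);	/* e is 0 or 1, see above */
  TMP_FREE;
}
#endif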

/* Maximum scratch needed by this branch (at xp): 2*n */
static mp_limb_t
mpn_bc_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr xp)
{
  ASSERT (n > 0);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, xp, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, xp, mpn_invertappr_itch(n)));

  /* Compute a base value of n limbs. */
  if (n == 1)
    invert_limb (*ip, *dp);
  else {
    /* n > 1 here */
    MPN_FILL (xp, n, GMP_NUMB_MAX);
    mpn_com (xp + n, dp, n);

    /* Now xp contains B^2n - {dp,n}*B^n - 1 */
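    /* Why the two lines above produce that value: limbwise complement gives
       ~x = B-1-x, so the low n limbs hold B^n-1 and the high n limbs hold
       B^n-1-{dp,n}; the whole 2n-limb number is therefore
       (B^n-1-{dp,n})*B^n + (B^n-1) = B^{2n} - {dp,n}*B^n - 1.  */
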
    /* FIXME: if mpn_*pi1_divappr_q handles n==2, use it! */
    if (n == 2) {
      mpn_divrem_2 (ip, 0, xp, 4, dp);
    } else {
      gmp_pi1_t inv;
      invert_pi1 (inv, dp[n-1], dp[n-2]);
      if (! MAYBE_dcpi1_divappr
          || BELOW_THRESHOLD (n, DC_DIVAPPR_Q_THRESHOLD))
        mpn_sbpi1_divappr_q (ip, xp, 2 * n, dp, n, inv.inv32);
      else
        mpn_dcpi1_divappr_q (ip, xp, 2 * n, dp, n, &inv);
      MPN_DECR_U (ip, n, CNST_LIMB (1));
      return 1;
    }
  }
  return 0;
}
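
/* For intuition only: with a 64-bit limb and a compiler providing unsigned
   __int128 (an assumption about the compiler, not a GMP requirement), the
   n == 1 base case above computes exactly floor((B^2-1)/d) - B, B = 2^64.  */
#if 0	/* illustrative sketch, not compiled */
#include <stdint.h>
static uint64_t
toy_invert_limb (uint64_t d)	/* d must have its top bit set */
{
  /* (B^2-1)/d lies in [B+1, 2B-1] for B/2 <= d < B, so the truncating
     cast to uint64_t subtracts exactly B.  */
  return (uint64_t) (~(unsigned __int128) 0 / d);
}
#endif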

/* mpn_ni_invertappr: computes the approximate reciprocal using Newton's
   iterations (at least one).

   Inspired by Algorithm "ApproximateReciprocal", published in "Modern
   Computer Arithmetic" by Richard P. Brent and Paul Zimmermann, algorithm
   3.5, page 121 in version 0.4 of the book.

   Some adaptations were introduced, to allow the product mod B^m-1 and to
   return the value e.

   We introduced a correction in such a way that "the value of
   B^{n+h}-T computed at step 8 cannot exceed B^n-1" (the book reads
   "2B^n-1").

   Maximum scratch needed by this branch <= 2*n, but we have to fit 3*rn
   limbs in the scratch, i.e. 3*rn <= 2*n: we require n > 4.

   We use a wrapped product modulo B^m-1.  NOTE: is there any normalisation
   problem for the [0] class?  It shouldn't: we compute 2*|A*X_h - B^{n+h}| <
   B^m-1.  We may get [0] if and only if we get A*X_h = B^{n+h}.  This can
   happen only if A = B^n/2, but this implies X_h = B^h*2-1, i.e., A*X_h =
   B^{n+h} - A; then we get into the "negative" branch, where X_h is not
   incremented (because A < B^n).

   FIXME: the scratch for mulmod_bnm1 does not currently fit in the scratch;
   it is allocated separately.  */

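/* Recap of the idea (a loose sketch, not the book's exact step numbering):
   Newton's iteration for a reciprocal is X' = X + X*(1 - D*X).  Each pass of
   the loop below therefore forms the residual D*X_h (either by a plain
   product or by a wrapped product mod B^m-1), turns it into the defect
   B^{n+rn} - D*X_h, and multiplies that defect by X_h again, extending the
   approximation from rn to n correct limbs.  */
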
mp_limb_t
mpn_ni_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
{
  mp_limb_t cy;
  mp_size_t rn, mn;
  mp_size_t sizes[NPOWS], *sizp;
  mp_ptr tp;
  TMP_DECL;
#define xp scratch

  ASSERT (n > 4);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));

  /* Compute the computation precisions from highest to lowest, leaving the
     base case size in 'rn'.  */
  sizp = sizes;
  rn = n;
  do {
    *sizp = rn;
    rn = (rn >> 1) + 1;
    ++sizp;
  } while (ABOVE_THRESHOLD (rn, INV_NEWTON_THRESHOLD));
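
  /* Worked example (the threshold value is illustrative, not tuned): with
     n = 1000 and INV_NEWTON_THRESHOLD = 200, the loop above stores
     sizes[] = {1000, 501, 251} and leaves rn = 126 for the base case; the
     while (1) loop below then walks the sizes back up: 251, 501, 1000.  */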

  /* We seek the inverse of 0.{dp,n}; we compute it as 1.{ip,n}.  */
  dp += n;
  ip += n;

  /* Compute a base value of rn limbs. */
  mpn_bc_invertappr (ip - rn, dp - rn, rn, scratch);

  TMP_MARK;

  if (ABOVE_THRESHOLD (n, INV_MULMOD_BNM1_THRESHOLD))
    {
      mn = mpn_mulmod_bnm1_next_size (n + 1);
      tp = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (mn, n, (n >> 1) + 1));
    }
  /* Use Newton's iterations to get the desired precision.  */

  while (1) {
    n = *--sizp;
    /*
      v    n  v
      +----+--+
      ^ rn ^
    */

    /* Compute i_jd.  */
    if (BELOW_THRESHOLD (n, INV_MULMOD_BNM1_THRESHOLD)
        || ((mn = mpn_mulmod_bnm1_next_size (n + 1)) > (n + rn))) {
      /* FIXME: We only need {xp,n+1}.  */
      mpn_mul (xp, dp - n, n, ip - rn, rn);
      mpn_add_n (xp + rn, xp + rn, dp - n, n - rn + 1);
      cy = CNST_LIMB(1); /* Remember we truncated, mod B^(n+1) */
      /* We computed (truncated) {xp,n+1} <- 1.{ip,rn} * 0.{dp,n} */
    } else { /* Use B^mn-1 wraparound */
      mpn_mulmod_bnm1 (xp, mn, dp - n, n, ip - rn, rn, tp);
      /* We computed {xp,mn} <- {ip,rn} * {dp,n} mod (B^mn-1) */
      /* We know that 2*|ip*dp + dp*B^rn - B^{rn+n}| < B^mn-1 */
      /* Add dp*B^rn mod (B^mn-1) */
      ASSERT (n >= mn - rn);
      cy = mpn_add_n (xp + rn, xp + rn, dp - n, mn - rn);
      cy = mpn_add_nc (xp, xp, dp - (n - (mn - rn)), n - (mn - rn), cy);
      /* Subtract B^{rn+n}, maybe only compensate the carry.  */
      xp[mn] = CNST_LIMB (1); /* set a limit for DECR_U */
      MPN_DECR_U (xp + rn + n - mn, 2 * mn + 1 - rn - n, CNST_LIMB (1) - cy);
      MPN_DECR_U (xp, mn, CNST_LIMB (1) - xp[mn]); /* if DECR_U eroded xp[mn] */
      cy = CNST_LIMB(0); /* Remember we are working mod B^mn-1 */
    }
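
    /* The branches below classify the residue left in xp: xp[n] <= 1 marks
       the "positive" class (D*X_h at or just above B^{rn+n}), while xp[n]
       near GMP_NUMB_MAX marks the "negative" class (just below).  Either
       way, the top rn limbs of the defect |B^{rn+n} - D*X_h| end up at
       xp + 2*n - rn, ready for the second product.  */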
    if (xp[n] < CNST_LIMB (2)) { /* "positive" residue class */
      cy = xp[n]; /* 0 <= cy <= 1 here. */
#if HAVE_NATIVE_mpn_sublsh1_n
      if (cy++) {
        if (mpn_cmp (xp, dp - n, n) > 0) {
          mp_limb_t chk;
          chk = mpn_sublsh1_n (xp, xp, dp - n, n);
          ASSERT (chk == xp[n]);
          ++cy;
        } else
          ASSERT_CARRY (mpn_sub_n (xp, xp, dp - n, n));
      }
#else /* no mpn_sublsh1_n */
      if (cy++ && !mpn_sub_n (xp, xp, dp - n, n)) {
        ASSERT_CARRY (mpn_sub_n (xp, xp, dp - n, n));
        ++cy;
      }
#endif
      /* 1 <= cy <= 3 here. */
#if HAVE_NATIVE_mpn_rsblsh1_n
      if (mpn_cmp (xp, dp - n, n) > 0) {
        ASSERT_NOCARRY (mpn_rsblsh1_n (xp + n, xp, dp - n, n));
        ++cy;
      } else
        ASSERT_NOCARRY (mpn_sub_nc (xp + 2 * n - rn, dp - rn, xp + n - rn, rn, mpn_cmp (xp, dp - n, n - rn) > 0));
#else /* no mpn_rsblsh1_n */
      if (mpn_cmp (xp, dp - n, n) > 0) {
        ASSERT_NOCARRY (mpn_sub_n (xp, xp, dp - n, n));
        ++cy;
      }
      ASSERT_NOCARRY (mpn_sub_nc (xp + 2 * n - rn, dp - rn, xp + n - rn, rn, mpn_cmp (xp, dp - n, n - rn) > 0));
#endif
      MPN_DECR_U (ip - rn, rn, cy); /* 1 <= cy <= 4 here. */
    } else { /* "negative" residue class */
      ASSERT (xp[n] >= GMP_NUMB_MAX - CNST_LIMB(1));
      MPN_DECR_U (xp, n + 1, cy);
      if (xp[n] != GMP_NUMB_MAX) {
        MPN_INCR_U (ip - rn, rn, CNST_LIMB (1));
        ASSERT_CARRY (mpn_add_n (xp, xp, dp - n, n));
      }
      mpn_com (xp + 2 * n - rn, xp + n - rn, rn);
    }

    /* Compute x_ju_j.  FIXME: We need {xp+rn,rn}, mulhi? */
    mpn_mul_n (xp, xp + 2 * n - rn, ip - rn, rn);
    cy = mpn_add_n (xp + rn, xp + rn, xp + 2 * n - rn, 2 * rn - n);
    cy = mpn_add_nc (ip - n, xp + 3 * rn - n, xp + n + rn, n - rn, cy);
    MPN_INCR_U (ip - rn, rn, cy);
    if (sizp == sizes) { /* Get out of the cycle */
      /* Check for possible carry propagation from below. */
      cy = xp[3 * rn - n - 1] > GMP_NUMB_MAX - CNST_LIMB (7); /* Be conservative. */
      /*    cy = mpn_add_1 (xp + rn, xp + rn, 2*rn - n, 4); */
      break;
    }
    rn = n;
  }
  TMP_FREE;

  return cy;
#undef xp
}

mp_limb_t
mpn_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
{
  ASSERT (n > 0);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));

  if (BELOW_THRESHOLD (n, INV_NEWTON_THRESHOLD))
    return mpn_bc_invertappr (ip, dp, n, scratch);
  else
    return mpn_ni_invertappr (ip, dp, n, scratch);
}
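
/* A sketch of how one could check the output relation from within the GMP
   build tree (illustrative only: check_invertappr is not a GMP function).
   It verifies {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1+e) by
   forming the 2n-limb product and watching the carries.  */
#if 0	/* illustrative sketch, not compiled */
static void
check_invertappr (mp_srcptr ip, mp_srcptr dp, mp_size_t n, mp_limb_t e)
{
  mp_ptr pp;
  mp_limb_t cy;
  mp_size_t k;
  TMP_DECL;

  TMP_MARK;
  pp = TMP_ALLOC_LIMBS (2 * n);
  mpn_mul_n (pp, ip, dp, n);			/* {ip,n} * {dp,n} */
  cy = mpn_add_n (pp + n, pp + n, dp, n);	/* add {dp,n} * B^n */
  ASSERT (cy == 0);	/* left inequality: product stays below B^{2n} */
  cy = 0;
  for (k = 0; k <= (mp_size_t) e; k++)		/* add {dp,n}, 1+e times */
    cy |= mpn_add (pp, pp, 2 * n, dp, n);
  ASSERT (cy != 0);	/* right inequality: now we reach B^{2n} */
  TMP_FREE;
}
#endif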