Coverage Report

Created: 2024-11-21 07:03

/src/libgmp/mpn/invertappr.c
All executable lines in this file have execution count 0 (the file is uncovered); per-line counts are omitted below.
/* mpn_invertappr and helper functions.  Compute I such that
   floor((B^{2n}-1)/U) - 1 <= I + B^n <= floor((B^{2n}-1)/U).

   Contributed to the GNU project by Marco Bodrato.

   The algorithm used here was inspired by ApproximateReciprocal from "Modern
   Computer Arithmetic", by Richard P. Brent and Paul Zimmermann.  Special
   thanks to Paul Zimmermann for his very valuable suggestions on all the
   theoretical aspects during the work on this code.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright (C) 2007, 2009, 2010, 2012, 2015, 2016 Free Software
Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"

/* FIXME: The iterative version splits the operand into two slightly
   unbalanced parts; using log_2 (or counting the bits) underestimates the
   maximum number of iterations.  */

#if TUNE_PROGRAM_BUILD
#define NPOWS \
 ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)))
#define MAYBE_dcpi1_divappr   1
#else
#define NPOWS \
 ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)) - LOG2C (INV_NEWTON_THRESHOLD))
#define MAYBE_dcpi1_divappr \
  (INV_NEWTON_THRESHOLD < DC_DIVAPPR_Q_THRESHOLD)
#if (INV_NEWTON_THRESHOLD > INV_MULMOD_BNM1_THRESHOLD) && \
    (INV_APPR_THRESHOLD > INV_MULMOD_BNM1_THRESHOLD)
#undef  INV_MULMOD_BNM1_THRESHOLD
#define INV_MULMOD_BNM1_THRESHOLD 0 /* always when Newton */
#endif
#endif
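
/* Illustration (the threshold value here is an assumption, not a tuned
   constant): on a 64-bit host sizeof(mp_size_t) == 8 > 6, so the minuend is
   48; with, say, INV_NEWTON_THRESHOLD = 200 we get LOG2C (200) == 8 and
   NPOWS == 40.  Since each Newton step roughly halves the operand size,
   NPOWS entries comfortably bound the depth of the sizes[] stack below.  */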

/* All three functions mpn{,_bc,_ni}_invertappr (ip, dp, n, scratch) take the
   strictly normalised value {dp,n} (i.e., the most significant bit must be
   set) as input, and compute {ip,n}: the approximate reciprocal of {dp,n}.

   Let e = mpn*_invertappr (ip, dp, n, scratch) be the returned value; the
   following conditions are satisfied by the output:
     0 <= e <= 1;
     {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1+e) .
   I.e. e=0 means that the result {ip,n} equals the one given by mpn_invert;
   e=1 means that the result _may_ be one less than expected.

   The _bc version returns e=1 most of the time.
   The _ni version should return e=0 most of the time; only about 1% of
   possible random inputs should give e=1.

   When the strict result is needed, i.e., e=0 in the relation above:
     {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1) ;
   the function mpn_invert (ip, dp, n, scratch) should be used instead.  */
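
/* A minimal usage sketch (illustrative only: example_invertappr is not part
   of GMP, and these internal functions are reachable only inside a GMP build
   where gmp-impl.h is available).  The scratch size comes from
   mpn_invertappr_itch, and the divisor must be normalised.  */
#if 0	/* illustrative sketch, not compiled */
static void
example_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n)
{
  mp_ptr scratch;
  mp_limb_t e;
  TMP_DECL;

  TMP_MARK;
  scratch = TMP_ALLOC_LIMBS (mpn_invertappr_itch (n));
  ASSERT (dp[n - 1] & GMP_NUMB_HIGHBIT);	/* top bit must be set */
  e = mpn_invertappr (ip, dp, n, scratch);	/* e is 0 or 1, see above */
  TMP_FREE;
}
#endif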

/* Maximum scratch needed by this branch (at xp): 2*n */
static mp_limb_t
mpn_bc_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr xp)
{
  ASSERT (n > 0);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, xp, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, xp, mpn_invertappr_itch(n)));

  /* Compute a base value of n limbs. */
  if (n == 1)
    invert_limb (*ip, *dp);
  else {
    /* n > 1 here */
    MPN_FILL (xp, n, GMP_NUMB_MAX);
    mpn_com (xp + n, dp, n);

    /* Now xp contains B^2n - {dp,n}*B^n - 1 */
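    /* Why the two lines above produce that value: limbwise complement gives
       ~x = B-1-x, so the low n limbs hold B^n-1 and the high n limbs hold
       B^n-1-{dp,n}; the whole 2n-limb number is therefore
       (B^n-1-{dp,n})*B^n + (B^n-1) = B^{2n} - {dp,n}*B^n - 1.  */
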
    /* FIXME: if mpn_*pi1_divappr_q handles n==2, use it! */
    if (n == 2) {
      mpn_divrem_2 (ip, 0, xp, 4, dp);
    } else {
      gmp_pi1_t inv;
      invert_pi1 (inv, dp[n-1], dp[n-2]);
      if (! MAYBE_dcpi1_divappr
          || BELOW_THRESHOLD (n, DC_DIVAPPR_Q_THRESHOLD))
        mpn_sbpi1_divappr_q (ip, xp, 2 * n, dp, n, inv.inv32);
      else
        mpn_dcpi1_divappr_q (ip, xp, 2 * n, dp, n, &inv);
      MPN_DECR_U (ip, n, CNST_LIMB (1));
      return 1;
    }
  }
  return 0;
}
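
/* For intuition only: with a 64-bit limb and a compiler providing unsigned
   __int128 (an assumption about the compiler, not a GMP requirement), the
   n == 1 base case above computes exactly floor((B^2-1)/d) - B, B = 2^64.  */
#if 0	/* illustrative sketch, not compiled */
#include <stdint.h>
static uint64_t
toy_invert_limb (uint64_t d)	/* d must have its top bit set */
{
  /* (B^2-1)/d lies in [B+1, 2B-1] for B/2 <= d < B, so the truncating
     cast to uint64_t subtracts exactly B.  */
  return (uint64_t) (~(unsigned __int128) 0 / d);
}
#endif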

/* mpn_ni_invertappr: computes the approximate reciprocal using Newton's
   iterations (at least one).

   Inspired by Algorithm "ApproximateReciprocal", published in "Modern
   Computer Arithmetic" by Richard P. Brent and Paul Zimmermann, algorithm
   3.5, page 121 in version 0.4 of the book.

   Some adaptations were introduced, to allow the product mod B^m-1 and to
   return the value e.

   We introduced a correction in such a way that "the value of
   B^{n+h}-T computed at step 8 cannot exceed B^n-1" (the book reads
   "2B^n-1").

   Maximum scratch needed by this branch <= 2*n, but we have to fit 3*rn
   limbs in the scratch, i.e. 3*rn <= 2*n: we require n > 4.

   We use a wrapped product modulo B^m-1.  NOTE: is there any normalisation
   problem for the [0] class?  It shouldn't: we compute 2*|A*X_h - B^{n+h}| <
   B^m-1.  We may get [0] if and only if we get A*X_h = B^{n+h}.  This can
   happen only if A = B^n/2, but this implies X_h = B^h*2-1, i.e., A*X_h =
   B^{n+h} - A; then we get into the "negative" branch, where X_h is not
   incremented (because A < B^n).

   FIXME: the scratch for mulmod_bnm1 does not currently fit in the scratch;
   it is allocated separately.  */

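/* Recap of the idea (a loose sketch, not the book's exact step numbering):
   Newton's iteration for a reciprocal is X' = X + X*(1 - D*X).  Each pass of
   the loop below therefore forms the residual D*X_h (either by a plain
   product or by a wrapped product mod B^m-1), turns it into the defect
   B^{n+rn} - D*X_h, and multiplies that defect by X_h again, extending the
   approximation from rn to n correct limbs.  */
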
mp_limb_t
mpn_ni_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
{
  mp_limb_t cy;
  mp_size_t rn, mn;
  mp_size_t sizes[NPOWS], *sizp;
  mp_ptr tp;
  TMP_DECL;
#define xp scratch

  ASSERT (n > 4);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));

  /* Compute the computation precisions from highest to lowest, leaving the
     base case size in 'rn'.  */
  sizp = sizes;
  rn = n;
  do {
    *sizp = rn;
    rn = (rn >> 1) + 1;
    ++sizp;
  } while (ABOVE_THRESHOLD (rn, INV_NEWTON_THRESHOLD));
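
  /* Worked example (the threshold value is illustrative, not tuned): with
     n = 1000 and INV_NEWTON_THRESHOLD = 200, the loop above stores
     sizes[] = {1000, 501, 251} and leaves rn = 126 for the base case; the
     while (1) loop below then walks the sizes back up: 251, 501, 1000.  */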

  /* We seek the inverse of 0.{dp,n}; we compute it as 1.{ip,n}.  */
  dp += n;
  ip += n;

  /* Compute a base value of rn limbs. */
  mpn_bc_invertappr (ip - rn, dp - rn, rn, scratch);

  TMP_MARK;

  if (ABOVE_THRESHOLD (n, INV_MULMOD_BNM1_THRESHOLD))
    {
      mn = mpn_mulmod_bnm1_next_size (n + 1);
      tp = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (mn, n, (n >> 1) + 1));
    }
  /* Use Newton's iterations to get the desired precision.  */

  while (1) {
    n = *--sizp;
    /*
      v    n  v
      +----+--+
      ^ rn ^
    */

    /* Compute i_jd.  */
    if (BELOW_THRESHOLD (n, INV_MULMOD_BNM1_THRESHOLD)
        || ((mn = mpn_mulmod_bnm1_next_size (n + 1)) > (n + rn))) {
      /* FIXME: We only need {xp,n+1}.  */
      mpn_mul (xp, dp - n, n, ip - rn, rn);
      mpn_add_n (xp + rn, xp + rn, dp - n, n - rn + 1);
      cy = CNST_LIMB(1); /* Remember we truncated, mod B^(n+1) */
      /* We computed (truncated) {xp,n+1} <- 1.{ip,rn} * 0.{dp,n} */
    } else { /* Use B^mn-1 wraparound */
      mpn_mulmod_bnm1 (xp, mn, dp - n, n, ip - rn, rn, tp);
      /* We computed {xp,mn} <- {ip,rn} * {dp,n} mod (B^mn-1) */
      /* We know that 2*|ip*dp + dp*B^rn - B^{rn+n}| < B^mn-1 */
      /* Add dp*B^rn mod (B^mn-1) */
      ASSERT (n >= mn - rn);
      cy = mpn_add_n (xp + rn, xp + rn, dp - n, mn - rn);
      cy = mpn_add_nc (xp, xp, dp - (n - (mn - rn)), n - (mn - rn), cy);
      /* Subtract B^{rn+n}, maybe only compensate the carry.  */
      xp[mn] = CNST_LIMB (1); /* set a limit for DECR_U */
      MPN_DECR_U (xp + rn + n - mn, 2 * mn + 1 - rn - n, CNST_LIMB (1) - cy);
      MPN_DECR_U (xp, mn, CNST_LIMB (1) - xp[mn]); /* if DECR_U eroded xp[mn] */
      cy = CNST_LIMB(0); /* Remember we are working mod B^mn-1 */
    }
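
    /* The branches below classify the residue left in xp: xp[n] <= 1 marks
       the "positive" class (D*X_h at or just above B^{rn+n}), while xp[n]
       near GMP_NUMB_MAX marks the "negative" class (just below).  Either
       way, the top rn limbs of the defect |B^{rn+n} - D*X_h| end up at
       xp + 2*n - rn, ready for the second product.  */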
    if (xp[n] < CNST_LIMB (2)) { /* "positive" residue class */
      cy = xp[n]; /* 0 <= cy <= 1 here. */
#if HAVE_NATIVE_mpn_sublsh1_n
      if (cy++) {
        if (mpn_cmp (xp, dp - n, n) > 0) {
          mp_limb_t chk;
          chk = mpn_sublsh1_n (xp, xp, dp - n, n);
          ASSERT (chk == xp[n]);
          ++cy;
        } else
          ASSERT_CARRY (mpn_sub_n (xp, xp, dp - n, n));
      }
#else /* no mpn_sublsh1_n */
      if (cy++ && !mpn_sub_n (xp, xp, dp - n, n)) {
        ASSERT_CARRY (mpn_sub_n (xp, xp, dp - n, n));
        ++cy;
      }
#endif
      /* 1 <= cy <= 3 here. */
#if HAVE_NATIVE_mpn_rsblsh1_n
      if (mpn_cmp (xp, dp - n, n) > 0) {
        ASSERT_NOCARRY (mpn_rsblsh1_n (xp + n, xp, dp - n, n));
        ++cy;
      } else
        ASSERT_NOCARRY (mpn_sub_nc (xp + 2 * n - rn, dp - rn, xp + n - rn, rn, mpn_cmp (xp, dp - n, n - rn) > 0));
#else /* no mpn_rsblsh1_n */
      if (mpn_cmp (xp, dp - n, n) > 0) {
        ASSERT_NOCARRY (mpn_sub_n (xp, xp, dp - n, n));
        ++cy;
      }
      ASSERT_NOCARRY (mpn_sub_nc (xp + 2 * n - rn, dp - rn, xp + n - rn, rn, mpn_cmp (xp, dp - n, n - rn) > 0));
#endif
      MPN_DECR_U (ip - rn, rn, cy); /* 1 <= cy <= 4 here. */
    } else { /* "negative" residue class */
      ASSERT (xp[n] >= GMP_NUMB_MAX - CNST_LIMB(1));
      MPN_DECR_U (xp, n + 1, cy);
      if (xp[n] != GMP_NUMB_MAX) {
        MPN_INCR_U (ip - rn, rn, CNST_LIMB (1));
        ASSERT_CARRY (mpn_add_n (xp, xp, dp - n, n));
      }
      mpn_com (xp + 2 * n - rn, xp + n - rn, rn);
    }

    /* Compute x_ju_j.  FIXME: We need {xp+rn,rn}, mulhi? */
    mpn_mul_n (xp, xp + 2 * n - rn, ip - rn, rn);
    cy = mpn_add_n (xp + rn, xp + rn, xp + 2 * n - rn, 2 * rn - n);
    cy = mpn_add_nc (ip - n, xp + 3 * rn - n, xp + n + rn, n - rn, cy);
    MPN_INCR_U (ip - rn, rn, cy);
    if (sizp == sizes) { /* Get out of the cycle */
      /* Check for possible carry propagation from below. */
      cy = xp[3 * rn - n - 1] > GMP_NUMB_MAX - CNST_LIMB (7); /* Be conservative. */
      /*    cy = mpn_add_1 (xp + rn, xp + rn, 2*rn - n, 4); */
      break;
    }
    rn = n;
  }
  TMP_FREE;

  return cy;
#undef xp
}

mp_limb_t
mpn_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
{
  ASSERT (n > 0);
  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
  ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
  ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));

  if (BELOW_THRESHOLD (n, INV_NEWTON_THRESHOLD))
    return mpn_bc_invertappr (ip, dp, n, scratch);
  else
    return mpn_ni_invertappr (ip, dp, n, scratch);
}
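
/* A sketch of how one could check the output relation from within the GMP
   build tree (illustrative only: check_invertappr is not a GMP function).
   It verifies {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1+e) by
   forming the 2n-limb product and watching the carries.  */
#if 0	/* illustrative sketch, not compiled */
static void
check_invertappr (mp_srcptr ip, mp_srcptr dp, mp_size_t n, mp_limb_t e)
{
  mp_ptr pp;
  mp_limb_t cy;
  mp_size_t k;
  TMP_DECL;

  TMP_MARK;
  pp = TMP_ALLOC_LIMBS (2 * n);
  mpn_mul_n (pp, ip, dp, n);			/* {ip,n} * {dp,n} */
  cy = mpn_add_n (pp + n, pp + n, dp, n);	/* add {dp,n} * B^n */
  ASSERT (cy == 0);	/* left inequality: product stays below B^{2n} */
  cy = 0;
  for (k = 0; k <= (mp_size_t) e; k++)		/* add {dp,n}, 1+e times */
    cy |= mpn_add (pp, pp, 2 * n, dp, n);
  ASSERT (cy != 0);	/* right inequality: now we reach B^{2n} */
  TMP_FREE;
}
#endif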