/src/gmp/mpn/mu_bdiv_qr.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* mpn_mu_bdiv_qr(qp,rp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^qn, |
2 | | where qn = nn-dn, storing the result in {qp,qn}. Overlap allowed between Q |
3 | | and N; all other overlap disallowed. |
4 | | |
5 | | Contributed to the GNU project by Torbjorn Granlund. |
6 | | |
7 | | THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY |
8 | | SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST |
9 | | GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. |
10 | | |
11 | | Copyright 2005-2007, 2009, 2010, 2012, 2017 Free Software Foundation, Inc. |
12 | | |
13 | | This file is part of the GNU MP Library. |
14 | | |
15 | | The GNU MP Library is free software; you can redistribute it and/or modify |
16 | | it under the terms of either: |
17 | | |
18 | | * the GNU Lesser General Public License as published by the Free |
19 | | Software Foundation; either version 3 of the License, or (at your |
20 | | option) any later version. |
21 | | |
22 | | or |
23 | | |
24 | | * the GNU General Public License as published by the Free Software |
25 | | Foundation; either version 2 of the License, or (at your option) any |
26 | | later version. |
27 | | |
28 | | or both in parallel, as here. |
29 | | |
30 | | The GNU MP Library is distributed in the hope that it will be useful, but |
31 | | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
32 | | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
33 | | for more details. |
34 | | |
35 | | You should have received copies of the GNU General Public License and the |
36 | | GNU Lesser General Public License along with the GNU MP Library. If not, |
37 | | see https://www.gnu.org/licenses/. */ |
38 | | |
39 | | |
40 | | /* |
41 | | The idea of the algorithm used herein is to compute a smaller inverted value |
42 | | than used in the standard Barrett algorithm, and thus save time in the |
43 | | Newton iterations, and pay just a small price when using the inverted value |
44 | | for developing quotient bits. This algorithm was presented at ICMS 2006. |
45 | | */ |
46 | | |
47 | | #include "gmp-impl.h" |
48 | | |
49 | | |
50 | | /* N = {np,nn} |
51 | | D = {dp,dn} |
52 | | |
53 | | Requirements: N >= D |
54 | | D >= 1 |
55 | | D odd |
56 | | dn >= 2 |
57 | | nn >= 2 |
58 | | scratch space as determined by mpn_mu_bdiv_qr_itch(nn,dn). |
59 | | |
60 | | Write quotient to Q = {qp,nn-dn}. |
61 | | |
62 | | FIXME: When iterating, perhaps do the small step before loop, not after. |
63 | | FIXME: Try to avoid the scalar divisions when computing inverse size. |
64 | | FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible. In |
65 | | particular, when dn==in, tp and rp could use the same space. |
66 | | */ |
/* Core routine: computes Q' and R' with U - Q' D = R' B^{qn}, i.e. the
   quotient of a *subtractive* Hensel division.  The public wrapper
   mpn_mu_bdiv_qr below converts this to the additive convention
   U + Q D = R B^{qn}.  Returns the high borrow/carry limb of R'.  */
static mp_limb_t
mpn_mu_bdiv_qr_old (mp_ptr qp,
		    mp_ptr rp,
		    mp_srcptr np, mp_size_t nn,
		    mp_srcptr dp, mp_size_t dn,
		    mp_ptr scratch)
{
  mp_size_t qn;			/* quotient limbs still to produce */
  mp_size_t in;			/* inverse size = block size per iteration */
  mp_limb_t cy, c0;		/* running borrow; wraparound borrow */
  mp_size_t tn, wn;		/* mulmod_bnm1 size; number of wrapped limbs */

  qn = nn - dn;

  ASSERT (dn >= 2);
  ASSERT (qn >= 2);

  if (qn > dn)
    {
      mp_size_t b;

      /* |_______________________|   dividend
			|________|   divisor  */

#define ip           scratch		/* in */
#define tp           (scratch + in)	/* dn+in or next_size(dn) or rest >= binvert_itch(in) */
#define scratch_out  (scratch + in + tn)/* mulmod_bnm1_itch(next_size(dn)) */

      /* Compute an inverse size that is a nice partition of the quotient. */
      b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
      in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */

      /* Some notes on allocation:

	 When in = dn, R dies when mpn_mullo returns, if in < dn the low in
	 limbs of R dies at that point.  We could save memory by letting T live
	 just under R, and let the upper part of T expand into R.  These changes
	 should reduce itch to perhaps 3dn.
       */

      /* ip <- D^{-1} mod B^in (D odd, so the inverse exists); tp is used as
	 scratch by mpn_binvert here, before its life as the product area.  */
      mpn_binvert (ip, dp, in, tp);

      MPN_COPY (rp, np, dn);	/* R <- low dn limbs of U */
      np += dn;
      cy = 0;

      /* Produce in quotient limbs per iteration until < in remain.  */
      while (qn > in)
	{
	  /* Next quotient block: qp <- R * D^{-1} mod B^in.  */
	  mpn_mullo_n (qp, rp, ip, in);

	  if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	    mpn_mul (tp, dp, dn, qp, in);	/* mulhi, need tp[dn+in-1...in] */
	  else
	    {
	      /* Multiply mod B^tn - 1; the product's low wn limbs wrapped
		 around, so subtract the known low part (== R) to undo it.  */
	      tn = mpn_mulmod_bnm1_next_size (dn);
	      mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
	      wn = dn + in - tn;		/* number of wrapped limbs */
	      if (wn > 0)
		{
		  c0 = mpn_sub_n (tp + tn, tp, rp, wn);
		  mpn_decr_u (tp + wn, c0);
		}
	    }

	  qp += in;
	  qn -= in;

	  if (dn != in)
	    {
	      /* Subtract tp[dn-1...in] from partial remainder. */
	      cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
	      if (cy == 2)
		{
		  /* Two pending borrows; mpn_sub_nc below takes a carry-in of
		     at most 1, so fold one borrow into tp and keep one.  */
		  mpn_incr_u (tp + dn, 1);
		  cy = 1;
		}
	    }
	  /* Subtract tp[dn+in-1...dn] from dividend. */
	  cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy);
	  np += in;
	}

      /* Generate last qn limbs. */
      mpn_mullo_n (qp, rp, ip, qn);

      if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, dn, qp, qn);	/* mulhi, need tp[qn+in-1...in] */
      else
	{
	  /* Same wraparound correction as in the loop, with qn limbs.  */
	  tn = mpn_mulmod_bnm1_next_size (dn);
	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out);
	  wn = dn + qn - tn;		/* number of wrapped limbs */
	  if (wn > 0)
	    {
	      c0 = mpn_sub_n (tp + tn, tp, rp, wn);
	      mpn_decr_u (tp + wn, c0);
	    }
	}

      if (dn != qn)
	{
	  cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn);
	  if (cy == 2)
	    {
	      /* As above: normalize two borrows down to one.  */
	      mpn_incr_u (tp + dn, 1);
	      cy = 1;
	    }
	}
      return mpn_sub_nc (rp + dn - qn, np, tp + dn, qn, cy);

#undef ip
#undef tp
#undef scratch_out
    }
  else
    {
      /* |_______________________|   dividend
		 |________________|   divisor  */

#define ip           scratch		/* in */
#define tp           (scratch + in)	/* dn+in or next_size(dn) or rest >= binvert_itch(in) */
#define scratch_out  (scratch + in + tn)/* mulmod_bnm1_itch(next_size(dn)) */

      /* Compute half-sized inverse. */
      in = qn - (qn >> 1);	/* ceil(qn/2): two steps cover all qn limbs */

      mpn_binvert (ip, dp, in, tp);

      mpn_mullo_n (qp, np, ip, in);		/* low `in' quotient limbs */

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, dn, qp, in);		/* mulhigh */
      else
	{
	  /* Wraparound correction; the known low part here is U itself.  */
	  tn = mpn_mulmod_bnm1_next_size (dn);
	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
	  wn = dn + in - tn;			/* number of wrapped limbs */
	  if (wn > 0)
	    {
	      c0 = mpn_sub_n (tp + tn, tp, np, wn);
	      mpn_decr_u (tp + wn, c0);
	    }
	}

      qp += in;
      qn -= in;

      cy = mpn_sub_n (rp, np + in, tp + in, dn);
      mpn_mullo_n (qp, rp, ip, qn);		/* high qn quotient limbs */

      if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, dn, qp, qn);		/* mulhigh */
      else
	{
	  tn = mpn_mulmod_bnm1_next_size (dn);
	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out);
	  wn = dn + qn - tn;			/* number of wrapped limbs */
	  if (wn > 0)
	    {
	      c0 = mpn_sub_n (tp + tn, tp, rp, wn);
	      mpn_decr_u (tp + wn, c0);
	    }
	}

      cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn);
      if (cy == 2)
	{
	  /* Normalize two pending borrows down to one (see first branch).  */
	  mpn_incr_u (tp + dn, 1);
	  cy = 1;
	}
      return mpn_sub_nc (rp + dn - qn, np + dn + in, tp + dn, qn, cy);

#undef ip
#undef tp
#undef scratch_out
    }
}
244 | | |
245 | | mp_limb_t |
246 | | mpn_mu_bdiv_qr (mp_ptr qp, |
247 | | mp_ptr rp, |
248 | | mp_srcptr np, mp_size_t nn, |
249 | | mp_srcptr dp, mp_size_t dn, |
250 | | mp_ptr scratch) |
251 | 0 | { |
252 | 0 | mp_limb_t cy = mpn_mu_bdiv_qr_old (qp, rp, np, nn, dp, dn, scratch); |
253 | | |
254 | | /* R' B^{qn} = U - Q' D |
255 | | * |
256 | | * Q = B^{qn} - Q' (assuming Q' != 0) |
257 | | * |
258 | | * R B^{qn} = U + Q D = U + B^{qn} D - Q' D |
259 | | * = B^{qn} D + R' |
260 | | */ |
261 | |
|
262 | 0 | if (UNLIKELY (!mpn_neg (qp, qp, nn - dn))) |
263 | 0 | { |
264 | | /* Zero quotient. */ |
265 | 0 | ASSERT (cy == 0); |
266 | 0 | return 0; |
267 | 0 | } |
268 | 0 | else |
269 | 0 | { |
270 | 0 | mp_limb_t cy2 = mpn_add_n (rp, rp, dp, dn); |
271 | 0 | ASSERT (cy2 >= cy); |
272 | |
|
273 | 0 | return cy2 - cy; |
274 | 0 | } |
275 | 0 | } |
276 | | |
277 | | |
278 | | mp_size_t |
279 | | mpn_mu_bdiv_qr_itch (mp_size_t nn, mp_size_t dn) |
280 | 0 | { |
281 | 0 | mp_size_t qn, in, tn, itch_binvert, itch_out, itches; |
282 | 0 | mp_size_t b; |
283 | |
|
284 | 0 | ASSERT_ALWAYS (DC_BDIV_Q_THRESHOLD < MU_BDIV_Q_THRESHOLD); |
285 | | |
286 | 0 | qn = nn - dn; |
287 | |
|
288 | 0 | if (qn > dn) |
289 | 0 | { |
290 | 0 | b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */ |
291 | 0 | in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */ |
292 | 0 | } |
293 | 0 | else |
294 | 0 | { |
295 | 0 | in = qn - (qn >> 1); |
296 | 0 | } |
297 | |
|
298 | 0 | if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) |
299 | 0 | { |
300 | 0 | tn = dn + in; |
301 | 0 | itch_out = 0; |
302 | 0 | } |
303 | 0 | else |
304 | 0 | { |
305 | 0 | tn = mpn_mulmod_bnm1_next_size (dn); |
306 | 0 | itch_out = mpn_mulmod_bnm1_itch (tn, dn, in); |
307 | 0 | } |
308 | |
|
309 | 0 | itch_binvert = mpn_binvert_itch (in); |
310 | 0 | itches = tn + itch_out; |
311 | 0 | return in + MAX (itches, itch_binvert); |
312 | 0 | } |