/* mpn_mu_div_qr, mpn_preinv_mu_div_qr.

   Compute Q = floor(N / D) and R = N-QD.  N is nn limbs and D is dn limbs and
   must be normalized, and Q must be nn-dn limbs.  The requirement that Q is
   nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us to
   let N be unmodified during the operation.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

   Copyright 2005-2007, 2009, 2010 Free Software Foundation, Inc.

   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or modify
   it under the terms of either:

     * the GNU Lesser General Public License as published by the Free
       Software Foundation; either version 3 of the License, or (at your
       option) any later version.

   or

     * the GNU General Public License as published by the Free Software
       Foundation; either version 2 of the License, or (at your option) any
       later version.

   or both in parallel, as here.

   The GNU MP Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   You should have received copies of the GNU General Public License and the
   GNU Lesser General Public License along with the GNU MP Library.  If not,
   see https://www.gnu.org/licenses/.  */


/*
   The idea of the algorithm used herein is to compute a smaller inverted value
   than used in the standard Barrett algorithm, and thus save time in the
   Newton iterations, and pay just a small price when using the inverted value
   for developing quotient bits.  This algorithm was presented at ICMS 2006.
*/
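
/* Informal reading aid (added commentary, not part of the original source):
   let B = 2^GMP_NUMB_BITS.  The in-limb value I computed by mpn_mu_div_qr2
   below approximates B^(2*in)/D1 - B^in, where D1 is the value of the in
   most significant limbs of D (one extra limb is used for accuracy); the
   most significant bit of the inverse, B^in, is implicit.  Each iteration of
   the main loop in mpn_preinv_mu_div_qr then develops a block of in quotient
   limbs from the in high limbs r of the partial remainder, roughly as

     q = r + floor (r * I / B^in)    (the "mulhi" and add in the main loop),

   after which q*D is subtracted from the partial remainder, combined with
   the next in limbs of N, and at most a few corrective steps of +-D are
   needed.  */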

/* CAUTION: This code and the code in mu_divappr_q.c should be edited in sync.

   Things to work on:

   * This isn't optimal when the quotient isn't needed, as the quotient then
     might take a lot of space.  There is no time to save with special code,
     though, since the computation itself is always needed.

   * The itch/scratch scheme perhaps isn't such a good idea as it once
     seemed, as demonstrated by the fact that the mpn_invertappr function's
     scratch needs mean that we need to keep a large allocation long after it
     is needed.  Things are made worse by mpn_mul_fft not accepting any
     scratch parameter, which means we'll have a large memory hole while in
     mpn_mul_fft.  In general, a peak scratch need at the beginning of a
     function isn't handled well by the itch/scratch scheme.
*/

#ifdef STAT
#undef STAT
#define STAT(x) x
#else
#define STAT(x)
#endif

#include <stdlib.h>		/* for NULL */
#include "gmp-impl.h"


/* FIXME: The MU_DIV_QR_SKEW_THRESHOLD was not analysed properly.  It gives a
   speedup according to old measurements, but does the decision mechanism
   really make sense?  It seems like the ratio between dn and qn is what we
   really should be checking.  */
#ifndef MU_DIV_QR_SKEW_THRESHOLD
#define MU_DIV_QR_SKEW_THRESHOLD 100
#endif

#ifdef CHECK				/* FIXME: Enable in minithres */
#undef MU_DIV_QR_SKEW_THRESHOLD
#define MU_DIV_QR_SKEW_THRESHOLD 1
#endif


static mp_limb_t mpn_mu_div_qr2 (mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t, mp_ptr);
static mp_size_t mpn_mu_div_qr_choose_in (mp_size_t, mp_size_t, int);


mp_limb_t
mpn_mu_div_qr (mp_ptr qp,
	       mp_ptr rp,
	       mp_srcptr np,
	       mp_size_t nn,
	       mp_srcptr dp,
	       mp_size_t dn,
	       mp_ptr scratch)
{
  mp_size_t qn;
  mp_limb_t cy, qh;

  qn = nn - dn;
  if (qn + MU_DIV_QR_SKEW_THRESHOLD < dn)
    {
      /* |______________|_ign_first__|   dividend                           nn
	 |_______|_ign_first__|          divisor                            dn

	 |______|                        quotient (prel)                    qn

	 |___________________|           quotient * ignored-divisor-part  dn-1
      */
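
      /* Hypothetical size example (added illustration, not from the original
	 source): nn = 1000, dn = 950 gives qn = 50, and this skew path is
	 taken since 50 + MU_DIV_QR_SKEW_THRESHOLD < 950.  The high
	 2*50+1 = 101 limbs of N are divided by the high 51 limbs of D for a
	 preliminary 50-limb quotient; the 899 ignored divisor limbs are then
	 accounted for by the multiply and wide subtraction below.  */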

      /* Compute a preliminary quotient and a partial remainder by dividing
	 the most significant limbs of each operand.  */
      qh = mpn_mu_div_qr2 (qp, rp + nn - (2 * qn + 1),
			   np + nn - (2 * qn + 1), 2 * qn + 1,
			   dp + dn - (qn + 1), qn + 1,
			   scratch);

      /* Multiply the quotient by the divisor limbs ignored above.  */
      if (dn - (qn + 1) > qn)
	mpn_mul (scratch, dp, dn - (qn + 1), qp, qn);  /* prod is dn-1 limbs */
      else
	mpn_mul (scratch, qp, qn, dp, dn - (qn + 1));  /* prod is dn-1 limbs */

      if (qh)
	cy = mpn_add_n (scratch + qn, scratch + qn, dp, dn - (qn + 1));
      else
	cy = 0;
      scratch[dn - 1] = cy;

      cy = mpn_sub_n (rp, np, scratch, nn - (2 * qn + 1));
      cy = mpn_sub_nc (rp + nn - (2 * qn + 1),
		       rp + nn - (2 * qn + 1),
		       scratch + nn - (2 * qn + 1),
		       qn + 1, cy);
      if (cy)
	{
	  qh -= mpn_sub_1 (qp, qp, qn, 1);
	  mpn_add_n (rp, rp, dp, dn);
	}
    }
  else
    {
      qh = mpn_mu_div_qr2 (qp, rp, np, nn, dp, dn, scratch);
    }

  return qh;
}

static mp_limb_t
mpn_mu_div_qr2 (mp_ptr qp,
		mp_ptr rp,
		mp_srcptr np,
		mp_size_t nn,
		mp_srcptr dp,
		mp_size_t dn,
		mp_ptr scratch)
{
  mp_size_t qn, in;
  mp_limb_t cy, qh;
  mp_ptr ip, tp;

  ASSERT (dn > 1);

  qn = nn - dn;

  /* Compute the inverse size.  */
  in = mpn_mu_div_qr_choose_in (qn, dn, 0);
  ASSERT (in <= dn);

#if 1
  /* This alternative inverse computation method gets slightly more accurate
     results.  FIXMEs: (1) temp allocation needs not analysed, (2) itch
     function not adapted, (3) mpn_invertappr scratch needs not met.  */
  ip = scratch;
  tp = scratch + in + 1;

  /* Compute an approximate inverse on in+1 limbs, then round it to 'in'
     limbs by dropping the least significant limb of the result.  */
  if (dn == in)
    {
      /* D is used in full; extend it with a low limb of 1 (rounding the
	 inverted operand up, like the mpn_add_1 below, so that the computed
	 inverse is not an overestimate).  */
      MPN_COPY (tp + 1, dp, in);
      tp[0] = 1;
      mpn_invertappr (ip, tp, in + 1, tp + in + 1);
      MPN_COPY_INCR (ip, ip + 1, in);
    }
  else
    {
      /* Round the in+1 most significant limbs of D up by 1; if that
	 overflows, the inverse is exactly B^in, i.e., all explicit limbs
	 are zero.  */
      cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1);
      if (UNLIKELY (cy != 0))
	MPN_ZERO (ip, in);
      else
	{
	  mpn_invertappr (ip, tp, in + 1, tp + in + 1);
	  MPN_COPY_INCR (ip, ip + 1, in);
	}
    }
#else
  /* This older inverse computation method gets slightly worse results than
     the one above.  */
  ip = scratch;
  tp = scratch + in;

  /* Compute an inverse of D to in+1 limbs, then round it to 'in' limbs.
     Ideally the inversion function should do this automatically.  */
  if (dn == in)
    {
      tp[in + 1] = 0;
      MPN_COPY (tp + in + 2, dp, in);
      mpn_invertappr (tp, tp + in + 1, in + 1, NULL);
    }
  else
    {
      mpn_invertappr (tp, dp + dn - (in + 1), in + 1, NULL);
    }
  cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
  if (UNLIKELY (cy != 0))
    MPN_ZERO (tp + 1, in);
  MPN_COPY (ip, tp + 1, in);
#endif

  qh = mpn_preinv_mu_div_qr (qp, rp, np, nn, dp, dn, ip, in, scratch + in);

  return qh;
}

mp_limb_t
mpn_preinv_mu_div_qr (mp_ptr qp,
		      mp_ptr rp,
		      mp_srcptr np,
		      mp_size_t nn,
		      mp_srcptr dp,
		      mp_size_t dn,
		      mp_srcptr ip,
		      mp_size_t in,
		      mp_ptr scratch)
{
  mp_size_t qn;
  mp_limb_t cy, cx, qh;
  mp_limb_t r;
  mp_size_t tn, wn;

#define tp           scratch
#define scratch_out  (scratch + tn)

  qn = nn - dn;

  np += qn;
  qp += qn;

  /* Set the most significant quotient limb: qh = 1 iff the high dn limbs of
     N are >= D; reduce those limbs into the initial partial remainder R.  */
  qh = mpn_cmp (np, dp, dn) >= 0;
  if (qh != 0)
    mpn_sub_n (rp, np, dp, dn);
  else
    MPN_COPY_INCR (rp, np, dn);

  /* if (qn == 0) */			/* The while below handles this case */
  /*   return qh; */			/* Degenerate use.  Should we allow this? */

  while (qn > 0)
    {
      if (qn < in)
	{
	  ip += in - qn;
	  in = qn;
	}
      np -= in;
      qp -= in;

      /* Compute the next block of quotient limbs by multiplying the inverse
	 I by the upper part of the partial remainder R.  */
      mpn_mul_n (tp, rp + dn - in, ip, in);		/* mulhi */
      cy = mpn_add_n (qp, tp + in, rp + dn - in, in);	/* I's msb implicit */
      ASSERT_ALWAYS (cy == 0);

      qn -= in;

      /* Compute the product of the quotient block and the divisor D, to be
	 subtracted from the partial remainder combined with new limbs from
	 the dividend N.  We only really need the low dn+1 limbs.  */
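
      /* Informal note (added commentary, not from the original source): the
	 mpn_mulmod_bnm1 path below computes q*D modulo B^tn - 1 with
	 tn >= dn + 1, so the high wn = dn + in - tn limbs of the full
	 product wrap around into the low limbs.  Since those high limbs of
	 q*D nearly cancel against the top of the current partial remainder,
	 they are recovered from rp, with cx and cy fixing up the possible
	 borrow.  */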

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, dn, qp, in);	/* dn+in limbs, high 'in' cancels */
      else
	{
	  tn = mpn_mulmod_bnm1_next_size (dn + 1);
	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
	  wn = dn + in - tn;		/* number of wrapped limbs */
	  if (wn > 0)
	    {
	      cy = mpn_sub_n (tp, tp, rp + dn - wn, wn);
	      cy = mpn_sub_1 (tp + wn, tp + wn, tn - wn, cy);
	      cx = mpn_cmp (rp + dn - in, tp + dn, tn - dn) < 0;
	      ASSERT_ALWAYS (cx >= cy);
	      mpn_incr_u (tp, cx - cy);
	    }
	}

      r = rp[dn - in] - tp[dn];

      /* Subtract the product from the partial remainder combined with new
	 limbs from the dividend N, generating a new partial remainder R.  */
      if (dn != in)
	{
	  cy = mpn_sub_n (tp, np, tp, in);	/* get next 'in' limbs from N */
	  cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy);
	  MPN_COPY (rp, tp, dn);		/* FIXME: try to avoid this */
	}
      else
	{
	  cy = mpn_sub_n (rp, np, tp, in);	/* get next 'in' limbs from N */
	}

      STAT (int i; int err = 0;
	    static int errarr[5]; static int err_rec; static int tot);

      /* Check the remainder R and adjust the quotient as needed.  */
      r -= cy;
      while (r != 0)
	{
	  /* We loop 0 times with about 69% probability, 1 time with about
	     31% probability, and 2 times with about 0.6% probability, if the
	     inverse is computed as recommended.  */
	  mpn_incr_u (qp, 1);
	  cy = mpn_sub_n (rp, rp, dp, dn);
	  r -= cy;
	  STAT (err++);
	}
      if (mpn_cmp (rp, dp, dn) >= 0)
	{
	  /* This is executed with about 76% probability.  */
	  mpn_incr_u (qp, 1);
	  cy = mpn_sub_n (rp, rp, dp, dn);
	  STAT (err++);
	}

      STAT (
	    tot++;
	    errarr[err]++;
	    if (err > err_rec)
	      err_rec = err;
	    if (tot % 0x10000 == 0)
	      {
		for (i = 0; i <= err_rec; i++)
		  printf ("  %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot);
		printf ("\n");
	      }
	    );
    }

  return qh;
}

/* In case k=0 (automatic choice), we distinguish 3 cases:
   (a) dn < qn:          in = ceil(qn / ceil(qn/dn))
   (b) dn/3 < qn <= dn:  in = ceil(qn / 2)
   (c) qn < dn/3:        in = qn
   In all cases we have in <= dn.
 */
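/* Worked example (added illustration, not from the original source):
   qn = 10, dn = 4 is case (a): b = ceil(10/4) = 3 blocks, in = ceil(10/3) = 4;
   qn = 3, dn = 4 is case (b): in = ceil(3/2) = 2;
   qn = 1, dn = 4 is case (c): in = 1.  */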
static mp_size_t
mpn_mu_div_qr_choose_in (mp_size_t qn, mp_size_t dn, int k)
{
  mp_size_t in;

  if (k == 0)
    {
      mp_size_t b;
      if (qn > dn)
	{
	  /* Compute an inverse size that is a nice partition of the
	     quotient.  */
	  b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
	  in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
	}
      else if (3 * qn > dn)
	{
	  in = (qn - 1) / 2 + 1;	/* b = 2 */
	}
      else
	{
	  in = (qn - 1) / 1 + 1;	/* b = 1 */
	}
    }
  else
    {
      mp_size_t xn;
      xn = MIN (dn, qn);
      in = (xn - 1) / k + 1;
    }

  return in;
}

mp_size_t
mpn_mu_div_qr_itch (mp_size_t nn, mp_size_t dn, int mua_k)
{
  mp_size_t in = mpn_mu_div_qr_choose_in (nn - dn, dn, mua_k);
  mp_size_t itch_preinv = mpn_preinv_mu_div_qr_itch (nn, dn, in);
  mp_size_t itch_invapp = mpn_invertappr_itch (in + 1) + in + 2;  /* 3in + 4 */

  ASSERT (itch_preinv >= itch_invapp);
  return in + MAX (itch_invapp, itch_preinv);
}

mp_size_t
mpn_preinv_mu_div_qr_itch (mp_size_t nn, mp_size_t dn, mp_size_t in)
{
  mp_size_t itch_local = mpn_mulmod_bnm1_next_size (dn + 1);
  mp_size_t itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in);

  return itch_local + itch_out;
}
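
/* A minimal usage sketch (hypothetical, added for illustration; not part of
   GMP): divide an nn-limb N by a normalized dn-limb D, obtaining the scratch
   space via mpn_mu_div_qr_itch.  The TMP_* allocation macros are those from
   gmp-impl.h.  Guarded out of compilation since it is only an example.  */
#ifdef MU_DIV_QR_USAGE_SKETCH
static mp_limb_t
mu_div_qr_example (mp_ptr qp, mp_ptr rp,
		   mp_srcptr np, mp_size_t nn,
		   mp_srcptr dp, mp_size_t dn)
{
  mp_limb_t qh;
  mp_ptr scratch;
  TMP_DECL;

  ASSERT (nn > dn && dn > 1);
  ASSERT ((dp[dn - 1] & GMP_NUMB_HIGHBIT) != 0);  /* D must be normalized */

  TMP_MARK;
  scratch = TMP_ALLOC_LIMBS (mpn_mu_div_qr_itch (nn, dn, 0));
  qh = mpn_mu_div_qr (qp, rp, np, nn, dp, dn, scratch);  /* Q is nn-dn limbs */
  TMP_FREE;
  return qh;		/* most significant quotient limb, 0 or 1 */
}
#endif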
417 | 0 | } |