/src/gmp-6.2.1/mpz/bin_uiui.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* mpz_bin_uiui - compute n over k. |
2 | | |
3 | | Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. |
4 | | |
5 | | Copyright 2010-2012, 2015-2018 Free Software Foundation, Inc. |
6 | | |
7 | | This file is part of the GNU MP Library. |
8 | | |
9 | | The GNU MP Library is free software; you can redistribute it and/or modify |
10 | | it under the terms of either: |
11 | | |
12 | | * the GNU Lesser General Public License as published by the Free |
13 | | Software Foundation; either version 3 of the License, or (at your |
14 | | option) any later version. |
15 | | |
16 | | or |
17 | | |
18 | | * the GNU General Public License as published by the Free Software |
19 | | Foundation; either version 2 of the License, or (at your option) any |
20 | | later version. |
21 | | |
22 | | or both in parallel, as here. |
23 | | |
24 | | The GNU MP Library is distributed in the hope that it will be useful, but |
25 | | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
26 | | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
27 | | for more details. |
28 | | |
29 | | You should have received copies of the GNU General Public License and the |
30 | | GNU Lesser General Public License along with the GNU MP Library. If not, |
31 | | see https://www.gnu.org/licenses/. */ |
32 | | |
33 | | #include "gmp-impl.h" |
34 | | #include "longlong.h" |
35 | | |
36 | | #ifndef BIN_GOETGHELUCK_THRESHOLD |
37 | | #define BIN_GOETGHELUCK_THRESHOLD 512 |
38 | | #endif |
39 | | #ifndef BIN_UIUI_ENABLE_SMALLDC |
40 | 0 | #define BIN_UIUI_ENABLE_SMALLDC 1 |
41 | | #endif |
42 | | #ifndef BIN_UIUI_RECURSIVE_SMALLDC |
43 | 0 | #define BIN_UIUI_RECURSIVE_SMALLDC (GMP_NUMB_BITS > 32) |
44 | | #endif |
45 | | |
46 | | /* Algorithm: |
47 | | |
48 | | Accumulate chunks of factors first limb-by-limb (using one of mul0-mul8) |
49 | | which are then accumulated into mpn numbers. The first inner loop |
50 | | accumulates divisor factors, the 2nd inner loop accumulates exactly the same |
51 | | number of dividend factors. We avoid accumulating more for the divisor, |
52 | | even with its smaller factors, since we else cannot guarantee divisibility. |
53 | | |
54 | | Since we know each division will yield an integer, we compute the quotient |
55 | | using Hensel norm: If the quotient is limited by 2^t, we compute A / B mod |
56 | | 2^t. |
57 | | |
58 | | Improvements: |
59 | | |
60 | | (1) An obvious improvement to this code would be to compute mod 2^t |
61 | | everywhere. Unfortunately, we cannot determine t beforehand, unless we |
62 | | invoke some approximation, such as Stirling's formula. Of course, we don't |
63 | | need t to be tight. However, it is not clear that this would help much, |
64 | | our numbers are kept reasonably small already. |
65 | | |
66 | | (2) Compute nmax/kmax semi-accurately, without scalar division or a loop. |
67 | | Extracting the 3 msb, then doing a table lookup using cnt*8+msb as index, |
68 | | would make it both reasonably accurate and fast. (We could use a table |
69 | | stored into a limb, perhaps.) The table should take the removed factors of |
70 | | 2 into account (those done on-the-fly in mulN). |
71 | | |
72 | | (3) The first time in the loop we compute the odd part of a |
73 | | factorial in kp, we might use oddfac_1 for this task. |
74 | | */ |
75 | | |
76 | | /* This threshold determines how large divisor to accumulate before we call |
77 | | bdiv. Perhaps we should never call bdiv, and accumulate all we are told, |
78 | | since we are just basecase code anyway? Presumably, this depends on the |
79 | | relative speed of the asymptotically fast code and this code. */ |
80 | 0 | #define SOME_THRESHOLD 20 |
81 | | |
82 | | /* Multiply-into-limb functions. These remove factors of 2 on-the-fly. FIXME: |
83 | | All versions of MAXFACS don't take this 2 removal into account now, meaning |
84 | | that then, shifting just adds some overhead. (We remove factors from the |
85 | | completed limb anyway.) */ |
86 | | |
87 | | static mp_limb_t |
88 | | mul1 (mp_limb_t m) |
89 | 0 | { |
90 | 0 | return m; |
91 | 0 | } |
92 | | |
93 | | static mp_limb_t |
94 | | mul2 (mp_limb_t m) |
95 | 0 | { |
96 | | /* We need to shift before multiplying, to avoid an overflow. */ |
97 | 0 | mp_limb_t m01 = (m | 1) * ((m + 1) >> 1); |
98 | 0 | return m01; |
99 | 0 | } |
100 | | |
101 | | static mp_limb_t |
102 | | mul3 (mp_limb_t m) |
103 | 0 | { |
104 | 0 | mp_limb_t m01 = (m + 0) * (m + 1) >> 1; |
105 | 0 | mp_limb_t m2 = (m + 2); |
106 | 0 | return m01 * m2; |
107 | 0 | } |
108 | | |
109 | | static mp_limb_t |
110 | | mul4 (mp_limb_t m) |
111 | 0 | { |
112 | 0 | mp_limb_t m03 = (m + 0) * (m + 3) >> 1; |
113 | 0 | return m03 * (m03 + 1); /* mul2 (m03) ? */ |
114 | 0 | } |
115 | | |
116 | | static mp_limb_t |
117 | | mul5 (mp_limb_t m) |
118 | 0 | { |
119 | 0 | mp_limb_t m03 = (m + 0) * (m + 3) >> 1; |
120 | 0 | mp_limb_t m034 = m03 * (m + 4); |
121 | 0 | return (m03 + 1) * m034; |
122 | 0 | } |
123 | | |
124 | | static mp_limb_t |
125 | | mul6 (mp_limb_t m) |
126 | 0 | { |
127 | 0 | mp_limb_t m05 = (m + 0) * (m + 5); |
128 | 0 | mp_limb_t m1234 = (m05 + 5) * (m05 + 5) >> 3; |
129 | 0 | return m1234 * (m05 >> 1); |
130 | 0 | } |
131 | | |
132 | | static mp_limb_t |
133 | | mul7 (mp_limb_t m) |
134 | 0 | { |
135 | 0 | mp_limb_t m05 = (m + 0) * (m + 5); |
136 | 0 | mp_limb_t m1234 = (m05 + 5) * (m05 + 5) >> 3; |
137 | 0 | mp_limb_t m056 = m05 * (m + 6) >> 1; |
138 | 0 | return m1234 * m056; |
139 | 0 | } |
140 | | |
141 | | static mp_limb_t |
142 | | mul8 (mp_limb_t m) |
143 | 0 | { |
144 | 0 | mp_limb_t m07 = (m + 0) * (m + 7); |
145 | 0 | mp_limb_t m0257 = m07 * (m07 + 10) >> 3; |
146 | 0 | mp_limb_t m1346 = m07 + 9 + m0257; |
147 | 0 | return m0257 * m1346; |
148 | 0 | } |
149 | | |
150 | | /* |
151 | | static mp_limb_t |
152 | | mul9 (mp_limb_t m) |
153 | | { |
154 | | return (m + 8) * mul8 (m) ; |
155 | | } |
156 | | |
157 | | static mp_limb_t |
158 | | mul10 (mp_limb_t m) |
159 | | { |
160 | | mp_limb_t m09 = (m + 0) * (m + 9); |
161 | | mp_limb_t m18 = (m09 >> 1) + 4; |
162 | | mp_limb_t m0369 = m09 * (m09 + 18) >> 3; |
163 | | mp_limb_t m2457 = m09 * 2 + 35 + m0369; |
164 | | return ((m0369 * m2457) >> 1) * m18; |
165 | | } |
166 | | */ |
167 | | |
168 | | typedef mp_limb_t (* mulfunc_t) (mp_limb_t); |
169 | | |
170 | | static const mulfunc_t mulfunc[] = {mul1,mul2,mul3,mul4,mul5,mul6,mul7,mul8 /* ,mul9,mul10 */}; |
171 | | #define M (numberof(mulfunc)) |
172 | | |
173 | | /* Number of factors-of-2 removed by the corresponding mulN function. */ |
174 | | static const unsigned char tcnttab[] = {0, 1, 1, 2, 2, 4, 4, 6 /*,6,8*/}; |
175 | | |
176 | | #if 1 |
177 | | /* This variant is inaccurate but share the code with other functions. */ |
178 | | #define MAXFACS(max,l) \ |
179 | 0 | do { \ |
180 | 0 | (max) = log_n_max (l); \ |
181 | 0 | } while (0) |
182 | | #else |
183 | | |
184 | | /* This variant is exact(?) but uses a loop. It takes the 2 removal |
185 | | of mulN into account. */ |
186 | | static const unsigned long ftab[] = |
187 | | #if GMP_NUMB_BITS == 64 |
188 | | /* 1 to 8 factors per iteration */ |
189 | | {CNST_LIMB(0xffffffffffffffff),CNST_LIMB(0x16a09e667),0x32cbfc,0x16a08,0x24c0,0xa11,0x345,0x1ab /*,0xe9,0x8e */}; |
190 | | #endif |
191 | | #if GMP_NUMB_BITS == 32 |
192 | | /* 1 to 7 factors per iteration */ |
193 | | {0xffffffff,0x16a09,0x7ff,0x168,0x6f,0x3d,0x20 /* ,0x17 */}; |
194 | | #endif |
195 | | |
196 | | #define MAXFACS(max,l) \ |
197 | | do { \ |
198 | | int __i; \ |
199 | | for (__i = numberof (ftab) - 1; l > ftab[__i]; __i--) \ |
200 | | ; \ |
201 | | (max) = __i + 1; \ |
202 | | } while (0) |
203 | | #endif |
204 | | |
205 | | /* Entry i contains (i!/2^t)^(-1) where t is chosen such that the parenthesis |
206 | | is an odd integer. */ |
207 | | static const mp_limb_t facinv[] = { ONE_LIMB_ODD_FACTORIAL_INVERSES_TABLE }; |
208 | | |
209 | | static void |
210 | | mpz_bdiv_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k) |
211 | 0 | { |
212 | 0 | unsigned nmax, kmax, nmaxnow, numfac; |
213 | 0 | mp_ptr np, kp; |
214 | 0 | mp_size_t nn, kn, alloc; |
215 | 0 | mp_limb_t i, j, t, iii, jjj, cy, dinv; |
216 | 0 | int cnt; |
217 | 0 | mp_size_t maxn; |
218 | 0 | TMP_DECL; |
219 | |
|
220 | 0 | ASSERT (k > ODD_FACTORIAL_TABLE_LIMIT); |
221 | 0 | TMP_MARK; |
222 | |
|
223 | 0 | maxn = 1 + n / GMP_NUMB_BITS; /* absolutely largest result size (limbs) */ |
224 | | |
225 | | /* FIXME: This allocation might be insufficient, but is usually way too |
226 | | large. */ |
227 | 0 | alloc = SOME_THRESHOLD - 1 + MAX (3 * maxn / 2, SOME_THRESHOLD); |
228 | 0 | alloc = MIN (alloc, (mp_size_t) k) + 1; |
229 | 0 | TMP_ALLOC_LIMBS_2 (np, alloc, kp, SOME_THRESHOLD + 1); |
230 | |
|
231 | 0 | MAXFACS (nmax, n); |
232 | 0 | ASSERT (nmax <= M); |
233 | 0 | MAXFACS (kmax, k); |
234 | 0 | ASSERT (kmax <= M); |
235 | 0 | ASSERT (k >= M); |
236 | | |
237 | 0 | i = n - k + 1; |
238 | |
|
239 | 0 | np[0] = 1; nn = 1; |
240 | |
|
241 | 0 | numfac = 1; |
242 | 0 | j = ODD_FACTORIAL_TABLE_LIMIT + 1; |
243 | 0 | jjj = ODD_FACTORIAL_TABLE_MAX; |
244 | 0 | ASSERT (__gmp_oddfac_table[ODD_FACTORIAL_TABLE_LIMIT] == ODD_FACTORIAL_TABLE_MAX); |
245 | | |
246 | 0 | while (1) |
247 | 0 | { |
248 | 0 | kp[0] = jjj; /* store new factors */ |
249 | 0 | kn = 1; |
250 | 0 | t = k - j + 1; |
251 | 0 | kmax = MIN (kmax, t); |
252 | |
|
253 | 0 | while (kmax != 0 && kn < SOME_THRESHOLD) |
254 | 0 | { |
255 | 0 | jjj = mulfunc[kmax - 1] (j); |
256 | 0 | j += kmax; /* number of factors used */ |
257 | 0 | count_trailing_zeros (cnt, jjj); /* count low zeros */ |
258 | 0 | jjj >>= cnt; /* remove remaining low zeros */ |
259 | 0 | cy = mpn_mul_1 (kp, kp, kn, jjj); /* accumulate new factors */ |
260 | 0 | kp[kn] = cy; |
261 | 0 | kn += cy != 0; |
262 | 0 | t = k - j + 1; |
263 | 0 | kmax = MIN (kmax, t); |
264 | 0 | } |
265 | 0 | numfac = j - numfac; |
266 | |
|
267 | 0 | while (numfac != 0) |
268 | 0 | { |
269 | 0 | nmaxnow = MIN (nmax, numfac); |
270 | 0 | iii = mulfunc[nmaxnow - 1] (i); |
271 | 0 | i += nmaxnow; /* number of factors used */ |
272 | 0 | count_trailing_zeros (cnt, iii); /* count low zeros */ |
273 | 0 | iii >>= cnt; /* remove remaining low zeros */ |
274 | 0 | cy = mpn_mul_1 (np, np, nn, iii); /* accumulate new factors */ |
275 | 0 | np[nn] = cy; |
276 | 0 | nn += cy != 0; |
277 | 0 | numfac -= nmaxnow; |
278 | 0 | } |
279 | | |
280 | 0 | ASSERT (nn < alloc); |
281 | | |
282 | 0 | binvert_limb (dinv, kp[0]); |
283 | 0 | nn += (np[nn - 1] >= kp[kn - 1]); |
284 | 0 | nn -= kn; |
285 | 0 | mpn_sbpi1_bdiv_q (np, np, nn, kp, MIN(kn,nn), -dinv); |
286 | 0 | mpn_neg (np, np, nn); |
287 | |
|
288 | 0 | if (kmax == 0) |
289 | 0 | break; |
290 | 0 | numfac = j; |
291 | |
|
292 | 0 | jjj = mulfunc[kmax - 1] (j); |
293 | 0 | j += kmax; /* number of factors used */ |
294 | 0 | count_trailing_zeros (cnt, jjj); /* count low zeros */ |
295 | 0 | jjj >>= cnt; /* remove remaining low zeros */ |
296 | 0 | } |
297 | | |
298 | | /* Put back the right number of factors of 2. */ |
299 | 0 | popc_limb (cnt, n - k); |
300 | 0 | popc_limb (j, k); |
301 | 0 | cnt += j; |
302 | 0 | popc_limb (j, n); |
303 | 0 | cnt -= j; |
304 | 0 | if (cnt != 0) |
305 | 0 | { |
306 | 0 | ASSERT (cnt < GMP_NUMB_BITS); /* can happen, but not for intended use */ |
307 | 0 | cy = mpn_lshift (np, np, nn, cnt); |
308 | 0 | np[nn] = cy; |
309 | 0 | nn += cy != 0; |
310 | 0 | } |
311 | | |
312 | 0 | nn -= np[nn - 1] == 0; /* normalisation */ |
313 | |
|
314 | 0 | kp = MPZ_NEWALLOC (r, nn); |
315 | 0 | SIZ(r) = nn; |
316 | 0 | MPN_COPY (kp, np, nn); |
317 | 0 | TMP_FREE; |
318 | 0 | } |
319 | | |
320 | | static void |
321 | | mpz_smallk_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k) |
322 | 0 | { |
323 | 0 | unsigned nmax, numfac; |
324 | 0 | mp_ptr rp; |
325 | 0 | mp_size_t rn, alloc; |
326 | 0 | mp_limb_t i, iii, cy; |
327 | 0 | unsigned i2cnt, cnt; |
328 | |
|
329 | 0 | MAXFACS (nmax, n); |
330 | 0 | nmax = MIN (nmax, M); |
331 | |
|
332 | 0 | i = n - k + 1; |
333 | |
|
334 | 0 | i2cnt = __gmp_fac2cnt_table[k / 2 - 1]; /* low zeros count */ |
335 | 0 | if (nmax >= k) |
336 | 0 | { |
337 | 0 | MPZ_NEWALLOC (r, 1) [0] = mulfunc[k - 1] (i) * facinv[k - 2] >> |
338 | 0 | (i2cnt - tcnttab[k - 1]); |
339 | 0 | SIZ(r) = 1; |
340 | 0 | return; |
341 | 0 | } |
342 | | |
343 | 0 | count_leading_zeros (cnt, (mp_limb_t) n); |
344 | 0 | cnt = GMP_LIMB_BITS - cnt; |
345 | 0 | alloc = cnt * k / GMP_NUMB_BITS + 3; /* FIXME: ensure rounding is enough. */ |
346 | 0 | rp = MPZ_NEWALLOC (r, alloc); |
347 | |
|
348 | 0 | rp[0] = mulfunc[nmax - 1] (i); |
349 | 0 | rn = 1; |
350 | 0 | i += nmax; /* number of factors used */ |
351 | 0 | i2cnt -= tcnttab[nmax - 1]; /* low zeros count */ |
352 | 0 | numfac = k - nmax; |
353 | 0 | do |
354 | 0 | { |
355 | 0 | nmax = MIN (nmax, numfac); |
356 | 0 | iii = mulfunc[nmax - 1] (i); |
357 | 0 | i += nmax; /* number of factors used */ |
358 | 0 | i2cnt -= tcnttab[nmax - 1]; /* update low zeros count */ |
359 | 0 | cy = mpn_mul_1 (rp, rp, rn, iii); /* accumulate new factors */ |
360 | 0 | rp[rn] = cy; |
361 | 0 | rn += cy != 0; |
362 | 0 | numfac -= nmax; |
363 | 0 | } while (numfac != 0); |
364 | |
|
365 | 0 | ASSERT (rn < alloc); |
366 | | |
367 | 0 | mpn_pi1_bdiv_q_1 (rp, rp, rn, __gmp_oddfac_table[k], facinv[k - 2], i2cnt); |
368 | | /* A two-fold, branch-free normalisation is possible :*/ |
369 | | /* rn -= rp[rn - 1] == 0; */ |
370 | | /* rn -= rp[rn - 1] == 0; */ |
371 | 0 | MPN_NORMALIZE_NOT_ZERO (rp, rn); |
372 | | |
373 | 0 | SIZ(r) = rn; |
374 | 0 | } |
375 | | |
376 | | /* Algorithm: |
377 | | |
378 | | Plain and simply multiply things together. |
379 | | |
380 | | We tabulate factorials (k!/2^t)^(-1) mod B (where t is chosen such |
381 | | that k!/2^t is odd). |
382 | | |
383 | | */ |
384 | | |
385 | | static mp_limb_t |
386 | | bc_bin_uiui (unsigned int n, unsigned int k) |
387 | 0 | { |
388 | 0 | return ((__gmp_oddfac_table[n] * facinv[k - 2] * facinv[n - k - 2]) |
389 | 0 | << (__gmp_fac2cnt_table[n / 2 - 1] - __gmp_fac2cnt_table[k / 2 - 1] - __gmp_fac2cnt_table[(n-k) / 2 - 1])) |
390 | 0 | & GMP_NUMB_MASK; |
391 | 0 | } |
392 | | |
393 | | /* Algorithm: |
394 | | |
395 | | Recursively exploit the relation |
396 | | bin(n,k) = bin(n,k>>1)*bin(n-k>>1,k-k>>1)/bin(k,k>>1) . |
397 | | |
398 | | Values for binomial(k,k>>1) that fit in a limb are precomputed |
399 | | (with inverses). |
400 | | */ |
401 | | |
402 | | /* bin2kk[i - ODD_CENTRAL_BINOMIAL_OFFSET] = |
403 | | binomial(i*2,i)/2^t (where t is chosen so that it is odd). */ |
404 | | static const mp_limb_t bin2kk[] = { ONE_LIMB_ODD_CENTRAL_BINOMIAL_TABLE }; |
405 | | |
406 | | /* bin2kkinv[i] = bin2kk[i]^-1 mod B */ |
407 | | static const mp_limb_t bin2kkinv[] = { ONE_LIMB_ODD_CENTRAL_BINOMIAL_INVERSE_TABLE }; |
408 | | |
409 | | /* bin2kk[i] = binomial((i+MIN_S)*2,i+MIN_S)/2^t. This table contains the t values. */ |
410 | | static const unsigned char fac2bin[] = { CENTRAL_BINOMIAL_2FAC_TABLE }; |
411 | | |
412 | | static void |
413 | | mpz_smallkdc_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k) |
414 | 0 | { |
415 | 0 | mp_ptr rp; |
416 | 0 | mp_size_t rn; |
417 | 0 | unsigned long int hk; |
418 | |
|
419 | 0 | hk = k >> 1; |
420 | |
|
421 | 0 | if ((! BIN_UIUI_RECURSIVE_SMALLDC) || hk <= ODD_FACTORIAL_TABLE_LIMIT) |
422 | 0 | mpz_smallk_bin_uiui (r, n, hk); |
423 | 0 | else |
424 | 0 | mpz_smallkdc_bin_uiui (r, n, hk); |
425 | 0 | k -= hk; |
426 | 0 | n -= hk; |
427 | 0 | if (n <= ODD_FACTORIAL_EXTTABLE_LIMIT) { |
428 | 0 | mp_limb_t cy; |
429 | 0 | rn = SIZ (r); |
430 | 0 | rp = MPZ_REALLOC (r, rn + 1); |
431 | 0 | cy = mpn_mul_1 (rp, rp, rn, bc_bin_uiui (n, k)); |
432 | 0 | rp [rn] = cy; |
433 | 0 | rn += cy != 0; |
434 | 0 | } else { |
435 | 0 | mp_limb_t buffer[ODD_CENTRAL_BINOMIAL_TABLE_LIMIT + 3]; |
436 | 0 | mpz_t t; |
437 | |
|
438 | 0 | ALLOC (t) = ODD_CENTRAL_BINOMIAL_TABLE_LIMIT + 3; |
439 | 0 | PTR (t) = buffer; |
440 | 0 | if ((! BIN_UIUI_RECURSIVE_SMALLDC) || k <= ODD_FACTORIAL_TABLE_LIMIT) |
441 | 0 | mpz_smallk_bin_uiui (t, n, k); |
442 | 0 | else |
443 | 0 | mpz_smallkdc_bin_uiui (t, n, k); |
444 | 0 | mpz_mul (r, r, t); |
445 | 0 | rp = PTR (r); |
446 | 0 | rn = SIZ (r); |
447 | 0 | } |
448 | |
|
449 | 0 | mpn_pi1_bdiv_q_1 (rp, rp, rn, bin2kk[k - ODD_CENTRAL_BINOMIAL_OFFSET], |
450 | 0 | bin2kkinv[k - ODD_CENTRAL_BINOMIAL_OFFSET], |
451 | 0 | fac2bin[k - ODD_CENTRAL_BINOMIAL_OFFSET] - (k != hk)); |
452 | | /* A two-fold, branch-free normalisation is possible :*/ |
453 | | /* rn -= rp[rn - 1] == 0; */ |
454 | | /* rn -= rp[rn - 1] == 0; */ |
455 | 0 | MPN_NORMALIZE_NOT_ZERO (rp, rn); |
456 | | |
457 | 0 | SIZ(r) = rn; |
458 | 0 | } |
459 | | |
460 | | /* mpz_goetgheluck_bin_uiui(RESULT, N, K) -- Set RESULT to binomial(N,K). |
461 | | * |
462 | | * Contributed to the GNU project by Marco Bodrato. |
463 | | * |
464 | | * Implementation of the algorithm by P. Goetgheluck, "Computing |
465 | | * Binomial Coefficients", The American Mathematical Monthly, Vol. 94, |
466 | | * No. 4 (April 1987), pp. 360-365. |
467 | | * |
468 | | * Acknowledgment: Peter Luschny did spot the slowness of the previous |
469 | | * code and suggested the reference. |
470 | | */ |
471 | | |
472 | | /* TODO: Remove duplicated constants / macros / static functions... |
473 | | */ |
474 | | |
475 | | /*************************************************************/ |
476 | | /* Section macros: common macros, for swing/fac/bin (&sieve) */ |
477 | | /*************************************************************/ |
478 | | |
479 | | #define FACTOR_LIST_APPEND(PR, MAX_PR, VEC, I) \ |
480 | 0 | if ((PR) > (MAX_PR)) { \ |
481 | 0 | (VEC)[(I)++] = (PR); \ |
482 | 0 | (PR) = 1; \ |
483 | 0 | } |
484 | | |
485 | | #define FACTOR_LIST_STORE(P, PR, MAX_PR, VEC, I) \ |
486 | 0 | do { \ |
487 | 0 | if ((PR) > (MAX_PR)) { \ |
488 | 0 | (VEC)[(I)++] = (PR); \ |
489 | 0 | (PR) = (P); \ |
490 | 0 | } else \ |
491 | 0 | (PR) *= (P); \ |
492 | 0 | } while (0) |
493 | | |
494 | | #define LOOP_ON_SIEVE_CONTINUE(prime,end,sieve) \ |
495 | 0 | __max_i = (end); \ |
496 | 0 | \ |
497 | 0 | do { \ |
498 | 0 | ++__i; \ |
499 | 0 | if (((sieve)[__index] & __mask) == 0) \ |
500 | 0 | { \ |
501 | 0 | mp_limb_t prime; \ |
502 | 0 | prime = id_to_n(__i) |
503 | | |
504 | | #define LOOP_ON_SIEVE_BEGIN(prime,start,end,off,sieve) \ |
505 | 0 | do { \ |
506 | 0 | mp_limb_t __mask, __index, __max_i, __i; \ |
507 | 0 | \ |
508 | 0 | __i = (start)-(off); \ |
509 | 0 | __index = __i / GMP_LIMB_BITS; \ |
510 | 0 | __mask = CNST_LIMB(1) << (__i % GMP_LIMB_BITS); \ |
511 | 0 | __i += (off); \ |
512 | 0 | \ |
513 | 0 | LOOP_ON_SIEVE_CONTINUE(prime,end,sieve) |
514 | | |
515 | | #define LOOP_ON_SIEVE_STOP \ |
516 | 0 | } \ |
517 | 0 | __mask = __mask << 1 | __mask >> (GMP_LIMB_BITS-1); \ |
518 | 0 | __index += __mask & 1; \ |
519 | 0 | } while (__i <= __max_i) |
520 | | |
521 | | #define LOOP_ON_SIEVE_END \ |
522 | 0 | LOOP_ON_SIEVE_STOP; \ |
523 | 0 | } while (0) |
524 | | |
525 | | /*********************************************************/ |
526 | | /* Section sieve: sieving functions and tools for primes */ |
527 | | /*********************************************************/ |
528 | | |
529 | | #if WANT_ASSERT |
530 | | static mp_limb_t |
531 | 0 | bit_to_n (mp_limb_t bit) { return (bit*3+4)|1; } |
532 | | #endif |
533 | | |
534 | | /* id_to_n (x) = bit_to_n (x-1) = (id*3+1)|1*/ |
535 | | static mp_limb_t |
536 | 0 | id_to_n (mp_limb_t id) { return id*3+1+(id&1); } |
537 | | |
538 | | /* n_to_bit (n) = ((n-1)&(-CNST_LIMB(2)))/3U-1 */ |
539 | | static mp_limb_t |
540 | 0 | n_to_bit (mp_limb_t n) { return ((n-5)|1)/3U; } |
541 | | |
542 | | static mp_size_t |
543 | 0 | primesieve_size (mp_limb_t n) { return n_to_bit(n) / GMP_LIMB_BITS + 1; } |
544 | | |
545 | | /*********************************************************/ |
546 | | /* Section binomial: fast binomial implementation */ |
547 | | /*********************************************************/ |
548 | | |
549 | | #define COUNT_A_PRIME(P, N, K, PR, MAX_PR, VEC, I) \ |
550 | 0 | do { \ |
551 | 0 | mp_limb_t __a, __b, __prime, __ma,__mb; \ |
552 | 0 | __prime = (P); \ |
553 | 0 | __a = (N); __b = (K); __mb = 0; \ |
554 | 0 | FACTOR_LIST_APPEND(PR, MAX_PR, VEC, I); \ |
555 | 0 | do { \ |
556 | 0 | __mb += __b % __prime; __b /= __prime; \ |
557 | 0 | __ma = __a % __prime; __a /= __prime; \ |
558 | 0 | if (__ma < __mb) { \ |
559 | 0 | __mb = 1; (PR) *= __prime; \ |
560 | 0 | } else __mb = 0; \ |
561 | 0 | } while (__a >= __prime); \ |
562 | 0 | } while (0) |
563 | | |
564 | | #define SH_COUNT_A_PRIME(P, N, K, PR, MAX_PR, VEC, I) \ |
565 | 0 | do { \ |
566 | 0 | mp_limb_t __prime; \ |
567 | 0 | __prime = (P); \ |
568 | 0 | if (((N) % __prime) < ((K) % __prime)) { \ |
569 | 0 | FACTOR_LIST_STORE (__prime, PR, MAX_PR, VEC, I); \ |
570 | 0 | } \ |
571 | 0 | } while (0) |
572 | | |
573 | | /* Returns an approximation of the sqare root of x. |
574 | | * It gives: |
575 | | * limb_apprsqrt (x) ^ 2 <= x < (limb_apprsqrt (x)+1) ^ 2 |
576 | | * or |
577 | | * x <= limb_apprsqrt (x) ^ 2 <= x * 9/8 |
578 | | */ |
579 | | static mp_limb_t |
580 | | limb_apprsqrt (mp_limb_t x) |
581 | 0 | { |
582 | 0 | int s; |
583 | |
|
584 | 0 | ASSERT (x > 2); |
585 | 0 | count_leading_zeros (s, x); |
586 | 0 | s = (GMP_LIMB_BITS - s) >> 1; |
587 | 0 | return ((CNST_LIMB(1) << s) + (x >> s)) >> 1; |
588 | 0 | } |
589 | | |
590 | | static void |
591 | | mpz_goetgheluck_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k) |
592 | 0 | { |
593 | 0 | mp_limb_t *sieve, *factors, count; |
594 | 0 | mp_limb_t prod, max_prod; |
595 | 0 | mp_size_t j; |
596 | 0 | TMP_DECL; |
597 | |
|
598 | 0 | ASSERT (BIN_GOETGHELUCK_THRESHOLD >= 13); |
599 | 0 | ASSERT (n >= 25); |
600 | | |
601 | 0 | TMP_MARK; |
602 | 0 | sieve = TMP_ALLOC_LIMBS (primesieve_size (n)); |
603 | |
|
604 | 0 | count = gmp_primesieve (sieve, n) + 1; |
605 | 0 | factors = TMP_ALLOC_LIMBS (count / log_n_max (n) + 1); |
606 | |
|
607 | 0 | max_prod = GMP_NUMB_MAX / n; |
608 | | |
609 | | /* Handle primes = 2, 3 separately. */ |
610 | 0 | popc_limb (count, n - k); |
611 | 0 | popc_limb (j, k); |
612 | 0 | count += j; |
613 | 0 | popc_limb (j, n); |
614 | 0 | count -= j; |
615 | 0 | prod = CNST_LIMB(1) << count; |
616 | |
|
617 | 0 | j = 0; |
618 | 0 | COUNT_A_PRIME (3, n, k, prod, max_prod, factors, j); |
619 | | |
620 | | /* Accumulate prime factors from 5 to n/2 */ |
621 | 0 | { |
622 | 0 | mp_limb_t s; |
623 | |
|
624 | 0 | s = limb_apprsqrt(n); |
625 | 0 | s = n_to_bit (s); |
626 | 0 | ASSERT (bit_to_n (s+1) * bit_to_n (s+1) > n); |
627 | 0 | ASSERT (s <= n_to_bit (n >> 1)); |
628 | 0 | LOOP_ON_SIEVE_BEGIN (prime, n_to_bit (5), s, 0,sieve); |
629 | 0 | COUNT_A_PRIME (prime, n, k, prod, max_prod, factors, j); |
630 | 0 | LOOP_ON_SIEVE_STOP; |
631 | |
|
632 | 0 | ASSERT (max_prod <= GMP_NUMB_MAX / 2); |
633 | 0 | max_prod <<= 1; |
634 | |
|
635 | 0 | LOOP_ON_SIEVE_CONTINUE (prime, n_to_bit (n >> 1),sieve); |
636 | 0 | SH_COUNT_A_PRIME (prime, n, k, prod, max_prod, factors, j); |
637 | 0 | LOOP_ON_SIEVE_END; |
638 | | |
639 | 0 | max_prod >>= 1; |
640 | 0 | } |
641 | | |
642 | | /* Store primes from (n-k)+1 to n */ |
643 | 0 | ASSERT (n_to_bit (n - k) < n_to_bit (n)); |
644 | | |
645 | 0 | LOOP_ON_SIEVE_BEGIN (prime, n_to_bit (n - k) + 1, n_to_bit (n), 0,sieve); |
646 | 0 | FACTOR_LIST_STORE (prime, prod, max_prod, factors, j); |
647 | 0 | LOOP_ON_SIEVE_END; |
648 | |
|
649 | 0 | if (LIKELY (j != 0)) |
650 | 0 | { |
651 | 0 | factors[j++] = prod; |
652 | 0 | mpz_prodlimbs (r, factors, j); |
653 | 0 | } |
654 | 0 | else |
655 | 0 | { |
656 | 0 | MPZ_NEWALLOC (r, 1)[0] = prod; |
657 | 0 | SIZ (r) = 1; |
658 | 0 | } |
659 | 0 | TMP_FREE; |
660 | 0 | } |
661 | | |
662 | | #undef COUNT_A_PRIME |
663 | | #undef SH_COUNT_A_PRIME |
664 | | #undef LOOP_ON_SIEVE_END |
665 | | #undef LOOP_ON_SIEVE_STOP |
666 | | #undef LOOP_ON_SIEVE_BEGIN |
667 | | #undef LOOP_ON_SIEVE_CONTINUE |
668 | | |
669 | | /*********************************************************/ |
670 | | /* End of implementation of Goetgheluck's algorithm */ |
671 | | /*********************************************************/ |
672 | | |
673 | | void |
674 | | mpz_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k) |
675 | 0 | { |
676 | 0 | if (UNLIKELY (n < k)) { |
677 | 0 | SIZ (r) = 0; |
678 | | #if BITS_PER_ULONG > GMP_NUMB_BITS |
679 | | } else if (UNLIKELY (n > GMP_NUMB_MAX)) { |
680 | | mpz_t tmp; |
681 | | |
682 | | mpz_init_set_ui (tmp, n); |
683 | | mpz_bin_ui (r, tmp, k); |
684 | | mpz_clear (tmp); |
685 | | #endif |
686 | 0 | } else { |
687 | 0 | ASSERT (n <= GMP_NUMB_MAX); |
688 | | /* Rewrite bin(n,k) as bin(n,n-k) if that is smaller. */ |
689 | 0 | k = MIN (k, n - k); |
690 | 0 | if (k < 2) { |
691 | 0 | MPZ_NEWALLOC (r, 1)[0] = k ? n : 1; /* 1 + ((-k) & (n-1)); */ |
692 | 0 | SIZ(r) = 1; |
693 | 0 | } else if (n <= ODD_FACTORIAL_EXTTABLE_LIMIT) { /* k >= 2, n >= 4 */ |
694 | 0 | MPZ_NEWALLOC (r, 1)[0] = bc_bin_uiui (n, k); |
695 | 0 | SIZ(r) = 1; |
696 | 0 | } else if (k <= ODD_FACTORIAL_TABLE_LIMIT) |
697 | 0 | mpz_smallk_bin_uiui (r, n, k); |
698 | 0 | else if (BIN_UIUI_ENABLE_SMALLDC && |
699 | 0 | k <= (BIN_UIUI_RECURSIVE_SMALLDC ? ODD_CENTRAL_BINOMIAL_TABLE_LIMIT : ODD_FACTORIAL_TABLE_LIMIT)* 2) |
700 | 0 | mpz_smallkdc_bin_uiui (r, n, k); |
701 | 0 | else if (ABOVE_THRESHOLD (k, BIN_GOETGHELUCK_THRESHOLD) && |
702 | 0 | k > (n >> 4)) /* k > ODD_FACTORIAL_TABLE_LIMIT */ |
703 | 0 | mpz_goetgheluck_bin_uiui (r, n, k); |
704 | 0 | else |
705 | 0 | mpz_bdiv_bin_uiui (r, n, k); |
706 | 0 | } |
707 | 0 | } |