/src/gmp-6.2.1/mpz/bin_uiui.c
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /* mpz_bin_uiui - compute n over k.  | 
2  |  |  | 
3  |  | Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.  | 
4  |  |  | 
5  |  | Copyright 2010-2012, 2015-2018 Free Software Foundation, Inc.  | 
6  |  |  | 
7  |  | This file is part of the GNU MP Library.  | 
8  |  |  | 
9  |  | The GNU MP Library is free software; you can redistribute it and/or modify  | 
10  |  | it under the terms of either:  | 
11  |  |  | 
12  |  |   * the GNU Lesser General Public License as published by the Free  | 
13  |  |     Software Foundation; either version 3 of the License, or (at your  | 
14  |  |     option) any later version.  | 
15  |  |  | 
16  |  | or  | 
17  |  |  | 
18  |  |   * the GNU General Public License as published by the Free Software  | 
19  |  |     Foundation; either version 2 of the License, or (at your option) any  | 
20  |  |     later version.  | 
21  |  |  | 
22  |  | or both in parallel, as here.  | 
23  |  |  | 
24  |  | The GNU MP Library is distributed in the hope that it will be useful, but  | 
25  |  | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY  | 
26  |  | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License  | 
27  |  | for more details.  | 
28  |  |  | 
29  |  | You should have received copies of the GNU General Public License and the  | 
30  |  | GNU Lesser General Public License along with the GNU MP Library.  If not,  | 
31  |  | see https://www.gnu.org/licenses/.  */  | 
32  |  |  | 
33  |  | #include "gmp-impl.h"  | 
34  |  | #include "longlong.h"  | 
35  |  |  | 
36  |  | #ifndef BIN_GOETGHELUCK_THRESHOLD  | 
37  |  | #define BIN_GOETGHELUCK_THRESHOLD  512  | 
38  |  | #endif  | 
39  |  | #ifndef BIN_UIUI_ENABLE_SMALLDC  | 
40  | 0  | #define BIN_UIUI_ENABLE_SMALLDC    1  | 
41  |  | #endif  | 
42  |  | #ifndef BIN_UIUI_RECURSIVE_SMALLDC  | 
43  | 0  | #define BIN_UIUI_RECURSIVE_SMALLDC (GMP_NUMB_BITS > 32)  | 
44  |  | #endif  | 
45  |  |  | 
46  |  | /* Algorithm:  | 
47  |  |  | 
48  |  |    Accumulate chunks of factors first limb-by-limb (using one of mul0-mul8)  | 
49  |  |    which are then accumulated into mpn numbers.  The first inner loop  | 
50  |  |    accumulates divisor factors, the 2nd inner loop accumulates exactly the same  | 
51  |  |    number of dividend factors.  We avoid accumulating more for the divisor,  | 
52  |  |    even with its smaller factors, since we else cannot guarantee divisibility.  | 
53  |  |  | 
54  |  |    Since we know each division will yield an integer, we compute the quotient  | 
55  |  |    using Hensel norm: If the quotient is limited by 2^t, we compute A / B mod  | 
56  |  |    2^t.  | 
57  |  |  | 
58  |  |    Improvements:  | 
59  |  |  | 
60  |  |    (1) An obvious improvement to this code would be to compute mod 2^t  | 
61  |  |    everywhere.  Unfortunately, we cannot determine t beforehand, unless we  | 
62  |  |    invoke some approximation, such as Stirling's formula.  Of course, we don't  | 
63  |  |    need t to be tight.  However, it is not clear that this would help much,  | 
64  |  |    our numbers are kept reasonably small already.  | 
65  |  |  | 
66  |  |    (2) Compute nmax/kmax semi-accurately, without scalar division or a loop.  | 
67  |  |    Extracting the 3 msb, then doing a table lookup using cnt*8+msb as index,  | 
68  |  |    would make it both reasonably accurate and fast.  (We could use a table  | 
69  |  |    stored into a limb, perhaps.)  The table should take the removed factors of  | 
70  |  |    2 into account (those done on-the-fly in mulN).  | 
71  |  |  | 
72  |  |    (3) The first time in the loop we compute the odd part of a  | 
73  |  |    factorial in kp, we might use oddfac_1 for this task.  | 
74  |  |  */  | 
75  |  |  | 
76  |  | /* This threshold determines how large divisor to accumulate before we call  | 
77  |  |    bdiv.  Perhaps we should never call bdiv, and accumulate all we are told,  | 
78  |  |    since we are just basecase code anyway?  Presumably, this depends on the  | 
79  |  |    relative speed of the asymptotically fast code and this code.  */  | 
80  | 0  | #define SOME_THRESHOLD 20  | 
81  |  |  | 
82  |  | /* Multiply-into-limb functions.  These remove factors of 2 on-the-fly.  FIXME:  | 
83  |  |    All versions of MAXFACS don't take this 2 removal into account now, meaning  | 
84  |  |    that then, shifting just adds some overhead.  (We remove factors from the  | 
85  |  |    completed limb anyway.)  */  | 
86  |  |  | 
87  |  | static mp_limb_t  | 
88  |  | mul1 (mp_limb_t m)  | 
89  | 0  | { | 
90  | 0  |   return m;  | 
91  | 0  | }  | 
92  |  |  | 
93  |  | static mp_limb_t  | 
94  |  | mul2 (mp_limb_t m)  | 
95  | 0  | { | 
96  |  |   /* We need to shift before multiplying, to avoid an overflow. */  | 
97  | 0  |   mp_limb_t m01 = (m | 1) * ((m + 1) >> 1);  | 
98  | 0  |   return m01;  | 
99  | 0  | }  | 
100  |  |  | 
101  |  | static mp_limb_t  | 
102  |  | mul3 (mp_limb_t m)  | 
103  | 0  | { | 
104  | 0  |   mp_limb_t m01 = (m + 0) * (m + 1) >> 1;  | 
105  | 0  |   mp_limb_t m2 = (m + 2);  | 
106  | 0  |   return m01 * m2;  | 
107  | 0  | }  | 
108  |  |  | 
109  |  | static mp_limb_t  | 
110  |  | mul4 (mp_limb_t m)  | 
111  | 0  | { | 
112  | 0  |   mp_limb_t m03 = (m + 0) * (m + 3) >> 1;  | 
113  | 0  |   return m03 * (m03 + 1); /* mul2 (m03) ? */  | 
114  | 0  | }  | 
115  |  |  | 
116  |  | static mp_limb_t  | 
117  |  | mul5 (mp_limb_t m)  | 
118  | 0  | { | 
119  | 0  |   mp_limb_t m03 = (m + 0) * (m + 3) >> 1;  | 
120  | 0  |   mp_limb_t m034 = m03 * (m + 4);  | 
121  | 0  |   return (m03 + 1) * m034;  | 
122  | 0  | }  | 
123  |  |  | 
124  |  | static mp_limb_t  | 
125  |  | mul6 (mp_limb_t m)  | 
126  | 0  | { | 
127  | 0  |   mp_limb_t m05 = (m + 0) * (m + 5);  | 
128  | 0  |   mp_limb_t m1234 = (m05 + 5) * (m05 + 5) >> 3;  | 
129  | 0  |   return m1234 * (m05 >> 1);  | 
130  | 0  | }  | 
131  |  |  | 
132  |  | static mp_limb_t  | 
133  |  | mul7 (mp_limb_t m)  | 
134  | 0  | { | 
135  | 0  |   mp_limb_t m05 = (m + 0) * (m + 5);  | 
136  | 0  |   mp_limb_t m1234 = (m05 + 5) * (m05 + 5) >> 3;  | 
137  | 0  |   mp_limb_t m056 = m05 * (m + 6) >> 1;  | 
138  | 0  |   return m1234 * m056;  | 
139  | 0  | }  | 
140  |  |  | 
141  |  | static mp_limb_t  | 
142  |  | mul8 (mp_limb_t m)  | 
143  | 0  | { | 
144  | 0  |   mp_limb_t m07 = (m + 0) * (m + 7);  | 
145  | 0  |   mp_limb_t m0257 = m07 * (m07 + 10) >> 3;  | 
146  | 0  |   mp_limb_t m1346 = m07 + 9 + m0257;  | 
147  | 0  |   return m0257 * m1346;  | 
148  | 0  | }  | 
149  |  |  | 
150  |  | /*  | 
151  |  | static mp_limb_t  | 
152  |  | mul9 (mp_limb_t m)  | 
153  |  | { | 
154  |  |   return (m + 8) * mul8 (m) ;  | 
155  |  | }  | 
156  |  |  | 
157  |  | static mp_limb_t  | 
158  |  | mul10 (mp_limb_t m)  | 
159  |  | { | 
160  |  |   mp_limb_t m09 = (m + 0) * (m + 9);  | 
161  |  |   mp_limb_t m18 = (m09 >> 1) + 4;  | 
162  |  |   mp_limb_t m0369 = m09 * (m09 + 18) >> 3;  | 
163  |  |   mp_limb_t m2457 = m09 * 2 + 35 + m0369;  | 
164  |  |   return ((m0369 * m2457) >> 1) * m18;  | 
165  |  | }  | 
166  |  | */  | 
167  |  |  | 
168  |  | typedef mp_limb_t (* mulfunc_t) (mp_limb_t);  | 
169  |  |  | 
170  |  | static const mulfunc_t mulfunc[] = {mul1,mul2,mul3,mul4,mul5,mul6,mul7,mul8 /* ,mul9,mul10 */}; | 
171  |  | #define M (numberof(mulfunc))  | 
172  |  |  | 
173  |  | /* Number of factors-of-2 removed by the corresponding mulN function.  */  | 
174  |  | static const unsigned char tcnttab[] = {0, 1, 1, 2, 2, 4, 4, 6 /*,6,8*/}; | 
175  |  |  | 
176  |  | #if 1  | 
177  |  | /* This variant is inaccurate but share the code with other functions.  */  | 
178  |  | #define MAXFACS(max,l)              \  | 
179  | 0  |   do {                 \ | 
180  | 0  |     (max) = log_n_max (l);            \  | 
181  | 0  |   } while (0)  | 
182  |  | #else  | 
183  |  |  | 
184  |  | /* This variant is exact(?) but uses a loop.  It takes the 2 removal  | 
185  |  |  of mulN into account.  */  | 
186  |  | static const unsigned long ftab[] =  | 
187  |  | #if GMP_NUMB_BITS == 64  | 
188  |  |   /* 1 to 8 factors per iteration */  | 
189  |  |   {CNST_LIMB(0xffffffffffffffff),CNST_LIMB(0x16a09e667),0x32cbfc,0x16a08,0x24c0,0xa11,0x345,0x1ab /*,0xe9,0x8e */}; | 
190  |  | #endif  | 
191  |  | #if GMP_NUMB_BITS == 32  | 
192  |  |   /* 1 to 7 factors per iteration */  | 
193  |  |   {0xffffffff,0x16a09,0x7ff,0x168,0x6f,0x3d,0x20 /* ,0x17 */}; | 
194  |  | #endif  | 
195  |  |  | 
196  |  | #define MAXFACS(max,l)              \  | 
197  |  |   do {                  \ | 
198  |  |     int __i;                \  | 
199  |  |     for (__i = numberof (ftab) - 1; l > ftab[__i]; __i--)   \  | 
200  |  |       ;                 \  | 
201  |  |     (max) = __i + 1;              \  | 
202  |  |   } while (0)  | 
203  |  | #endif  | 
204  |  |  | 
205  |  | /* Entry i contains (i!/2^t)^(-1) where t is chosen such that the parenthesis  | 
206  |  |    is an odd integer. */  | 
207  |  | static const mp_limb_t facinv[] = { ONE_LIMB_ODD_FACTORIAL_INVERSES_TABLE }; | 
208  |  |  | 
209  |  | static void  | 
210  |  | mpz_bdiv_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)  | 
211  | 0  | { | 
212  | 0  |   unsigned nmax, kmax, nmaxnow, numfac;  | 
213  | 0  |   mp_ptr np, kp;  | 
214  | 0  |   mp_size_t nn, kn, alloc;  | 
215  | 0  |   mp_limb_t i, j, t, iii, jjj, cy, dinv;  | 
216  | 0  |   int cnt;  | 
217  | 0  |   mp_size_t maxn;  | 
218  | 0  |   TMP_DECL;  | 
219  |  | 
  | 
220  | 0  |   ASSERT (k > ODD_FACTORIAL_TABLE_LIMIT);  | 
221  | 0  |   TMP_MARK;  | 
222  |  | 
  | 
223  | 0  |   maxn = 1 + n / GMP_NUMB_BITS;    /* absolutely largest result size (limbs) */  | 
224  |  |  | 
225  |  |   /* FIXME: This allocation might be insufficient, but is usually way too  | 
226  |  |      large.  */  | 
227  | 0  |   alloc = SOME_THRESHOLD - 1 + MAX (3 * maxn / 2, SOME_THRESHOLD);  | 
228  | 0  |   alloc = MIN (alloc, (mp_size_t) k) + 1;  | 
229  | 0  |   TMP_ALLOC_LIMBS_2 (np, alloc, kp, SOME_THRESHOLD + 1);  | 
230  |  | 
  | 
231  | 0  |   MAXFACS (nmax, n);  | 
232  | 0  |   ASSERT (nmax <= M);  | 
233  | 0  |   MAXFACS (kmax, k);  | 
234  | 0  |   ASSERT (kmax <= M);  | 
235  | 0  |   ASSERT (k >= M);  | 
236  |  |  | 
237  | 0  |   i = n - k + 1;  | 
238  |  | 
  | 
239  | 0  |   np[0] = 1; nn = 1;  | 
240  |  | 
  | 
241  | 0  |   numfac = 1;  | 
242  | 0  |   j = ODD_FACTORIAL_TABLE_LIMIT + 1;  | 
243  | 0  |   jjj = ODD_FACTORIAL_TABLE_MAX;  | 
244  | 0  |   ASSERT (__gmp_oddfac_table[ODD_FACTORIAL_TABLE_LIMIT] == ODD_FACTORIAL_TABLE_MAX);  | 
245  |  |  | 
246  | 0  |   while (1)  | 
247  | 0  |     { | 
248  | 0  |       kp[0] = jjj;        /* store new factors */  | 
249  | 0  |       kn = 1;  | 
250  | 0  |       t = k - j + 1;  | 
251  | 0  |       kmax = MIN (kmax, t);  | 
252  |  | 
  | 
253  | 0  |       while (kmax != 0 && kn < SOME_THRESHOLD)  | 
254  | 0  |   { | 
255  | 0  |     jjj = mulfunc[kmax - 1] (j);  | 
256  | 0  |     j += kmax;        /* number of factors used */  | 
257  | 0  |     count_trailing_zeros (cnt, jjj);  /* count low zeros */  | 
258  | 0  |     jjj >>= cnt;        /* remove remaining low zeros */  | 
259  | 0  |     cy = mpn_mul_1 (kp, kp, kn, jjj); /* accumulate new factors */  | 
260  | 0  |     kp[kn] = cy;  | 
261  | 0  |     kn += cy != 0;  | 
262  | 0  |     t = k - j + 1;  | 
263  | 0  |     kmax = MIN (kmax, t);  | 
264  | 0  |   }  | 
265  | 0  |       numfac = j - numfac;  | 
266  |  | 
  | 
267  | 0  |       while (numfac != 0)  | 
268  | 0  |   { | 
269  | 0  |     nmaxnow = MIN (nmax, numfac);  | 
270  | 0  |     iii = mulfunc[nmaxnow - 1] (i);  | 
271  | 0  |     i += nmaxnow;       /* number of factors used */  | 
272  | 0  |     count_trailing_zeros (cnt, iii);  /* count low zeros */  | 
273  | 0  |     iii >>= cnt;        /* remove remaining low zeros */  | 
274  | 0  |     cy = mpn_mul_1 (np, np, nn, iii); /* accumulate new factors */  | 
275  | 0  |     np[nn] = cy;  | 
276  | 0  |     nn += cy != 0;  | 
277  | 0  |     numfac -= nmaxnow;  | 
278  | 0  |   }  | 
279  |  |  | 
280  | 0  |       ASSERT (nn < alloc);  | 
281  |  |  | 
282  | 0  |       binvert_limb (dinv, kp[0]);  | 
283  | 0  |       nn += (np[nn - 1] >= kp[kn - 1]);  | 
284  | 0  |       nn -= kn;  | 
285  | 0  |       mpn_sbpi1_bdiv_q (np, np, nn, kp, MIN(kn,nn), -dinv);  | 
286  | 0  |       mpn_neg (np, np, nn);  | 
287  |  | 
  | 
288  | 0  |       if (kmax == 0)  | 
289  | 0  |   break;  | 
290  | 0  |       numfac = j;  | 
291  |  | 
  | 
292  | 0  |       jjj = mulfunc[kmax - 1] (j);  | 
293  | 0  |       j += kmax;        /* number of factors used */  | 
294  | 0  |       count_trailing_zeros (cnt, jjj);    /* count low zeros */  | 
295  | 0  |       jjj >>= cnt;        /* remove remaining low zeros */  | 
296  | 0  |     }  | 
297  |  |  | 
298  |  |   /* Put back the right number of factors of 2.  */  | 
299  | 0  |   popc_limb (cnt, n - k);  | 
300  | 0  |   popc_limb (j, k);  | 
301  | 0  |   cnt += j;  | 
302  | 0  |   popc_limb (j, n);  | 
303  | 0  |   cnt -= j;  | 
304  | 0  |   if (cnt != 0)  | 
305  | 0  |     { | 
306  | 0  |       ASSERT (cnt < GMP_NUMB_BITS); /* can happen, but not for intended use */  | 
307  | 0  |       cy = mpn_lshift (np, np, nn, cnt);  | 
308  | 0  |       np[nn] = cy;  | 
309  | 0  |       nn += cy != 0;  | 
310  | 0  |     }  | 
311  |  |  | 
312  | 0  |   nn -= np[nn - 1] == 0;  /* normalisation */  | 
313  |  | 
  | 
314  | 0  |   kp = MPZ_NEWALLOC (r, nn);  | 
315  | 0  |   SIZ(r) = nn;  | 
316  | 0  |   MPN_COPY (kp, np, nn);  | 
317  | 0  |   TMP_FREE;  | 
318  | 0  | }  | 
319  |  |  | 
320  |  | static void  | 
321  |  | mpz_smallk_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)  | 
322  | 0  | { | 
323  | 0  |   unsigned nmax, numfac;  | 
324  | 0  |   mp_ptr rp;  | 
325  | 0  |   mp_size_t rn, alloc;  | 
326  | 0  |   mp_limb_t i, iii, cy;  | 
327  | 0  |   unsigned i2cnt, cnt;  | 
328  |  | 
  | 
329  | 0  |   MAXFACS (nmax, n);  | 
330  | 0  |   nmax = MIN (nmax, M);  | 
331  |  | 
  | 
332  | 0  |   i = n - k + 1;  | 
333  |  | 
  | 
334  | 0  |   i2cnt = __gmp_fac2cnt_table[k / 2 - 1];   /* low zeros count */  | 
335  | 0  |   if (nmax >= k)  | 
336  | 0  |     { | 
337  | 0  |       MPZ_NEWALLOC (r, 1) [0] = mulfunc[k - 1] (i) * facinv[k - 2] >>  | 
338  | 0  |   (i2cnt - tcnttab[k - 1]);  | 
339  | 0  |       SIZ(r) = 1;  | 
340  | 0  |       return;  | 
341  | 0  |     }  | 
342  |  |  | 
343  | 0  |   count_leading_zeros (cnt, (mp_limb_t) n);  | 
344  | 0  |   cnt = GMP_LIMB_BITS - cnt;  | 
345  | 0  |   alloc = cnt * k / GMP_NUMB_BITS + 3; /* FIXME: ensure rounding is enough. */  | 
346  | 0  |   rp = MPZ_NEWALLOC (r, alloc);  | 
347  |  | 
  | 
348  | 0  |   rp[0] = mulfunc[nmax - 1] (i);  | 
349  | 0  |   rn = 1;  | 
350  | 0  |   i += nmax;        /* number of factors used */  | 
351  | 0  |   i2cnt -= tcnttab[nmax - 1];   /* low zeros count */  | 
352  | 0  |   numfac = k - nmax;  | 
353  | 0  |   do  | 
354  | 0  |     { | 
355  | 0  |       nmax = MIN (nmax, numfac);  | 
356  | 0  |       iii = mulfunc[nmax - 1] (i);  | 
357  | 0  |       i += nmax;      /* number of factors used */  | 
358  | 0  |       i2cnt -= tcnttab[nmax - 1]; /* update low zeros count */  | 
359  | 0  |       cy = mpn_mul_1 (rp, rp, rn, iii); /* accumulate new factors */  | 
360  | 0  |       rp[rn] = cy;  | 
361  | 0  |       rn += cy != 0;  | 
362  | 0  |       numfac -= nmax;  | 
363  | 0  |     } while (numfac != 0);  | 
364  |  | 
  | 
365  | 0  |   ASSERT (rn < alloc);  | 
366  |  |  | 
367  | 0  |   mpn_pi1_bdiv_q_1 (rp, rp, rn, __gmp_oddfac_table[k], facinv[k - 2], i2cnt);  | 
368  |  |   /* A two-fold, branch-free normalisation is possible :*/  | 
369  |  |   /* rn -= rp[rn - 1] == 0; */  | 
370  |  |   /* rn -= rp[rn - 1] == 0; */  | 
371  | 0  |   MPN_NORMALIZE_NOT_ZERO (rp, rn);  | 
372  |  |  | 
373  | 0  |   SIZ(r) = rn;  | 
374  | 0  | }  | 
375  |  |  | 
376  |  | /* Algorithm:  | 
377  |  |  | 
378  |  |    Plain and simply multiply things together.  | 
379  |  |  | 
380  |  |    We tabulate factorials (k!/2^t)^(-1) mod B (where t is chosen such  | 
381  |  |    that k!/2^t is odd).  | 
382  |  |  | 
383  |  | */  | 
384  |  |  | 
385  |  | static mp_limb_t  | 
386  |  | bc_bin_uiui (unsigned int n, unsigned int k)  | 
387  | 0  | { | 
388  | 0  |   return ((__gmp_oddfac_table[n] * facinv[k - 2] * facinv[n - k - 2])  | 
389  | 0  |     << (__gmp_fac2cnt_table[n / 2 - 1] - __gmp_fac2cnt_table[k / 2 - 1] - __gmp_fac2cnt_table[(n-k) / 2 - 1]))  | 
390  | 0  |     & GMP_NUMB_MASK;  | 
391  | 0  | }  | 
392  |  |  | 
393  |  | /* Algorithm:  | 
394  |  |  | 
395  |  |    Recursively exploit the relation  | 
396  |  |    bin(n,k) = bin(n,k>>1)*bin(n-k>>1,k-k>>1)/bin(k,k>>1) .  | 
397  |  |  | 
398  |  |    Values for binomial(k,k>>1) that fit in a limb are precomputed  | 
399  |  |    (with inverses).  | 
400  |  | */  | 
401  |  |  | 
402  |  | /* bin2kk[i - ODD_CENTRAL_BINOMIAL_OFFSET] =  | 
403  |  |    binomial(i*2,i)/2^t (where t is chosen so that it is odd). */  | 
404  |  | static const mp_limb_t bin2kk[] = { ONE_LIMB_ODD_CENTRAL_BINOMIAL_TABLE }; | 
405  |  |  | 
406  |  | /* bin2kkinv[i] = bin2kk[i]^-1 mod B */  | 
407  |  | static const mp_limb_t bin2kkinv[] = { ONE_LIMB_ODD_CENTRAL_BINOMIAL_INVERSE_TABLE }; | 
408  |  |  | 
409  |  | /* bin2kk[i] = binomial((i+MIN_S)*2,i+MIN_S)/2^t. This table contains the t values. */  | 
410  |  | static const unsigned char fac2bin[] = { CENTRAL_BINOMIAL_2FAC_TABLE }; | 
411  |  |  | 
412  |  | static void  | 
413  |  | mpz_smallkdc_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)  | 
414  | 0  | { | 
415  | 0  |   mp_ptr rp;  | 
416  | 0  |   mp_size_t rn;  | 
417  | 0  |   unsigned long int hk;  | 
418  |  | 
  | 
419  | 0  |   hk = k >> 1;  | 
420  |  | 
  | 
421  | 0  |   if ((! BIN_UIUI_RECURSIVE_SMALLDC) || hk <= ODD_FACTORIAL_TABLE_LIMIT)  | 
422  | 0  |     mpz_smallk_bin_uiui (r, n, hk);  | 
423  | 0  |   else  | 
424  | 0  |     mpz_smallkdc_bin_uiui (r, n, hk);  | 
425  | 0  |   k -= hk;  | 
426  | 0  |   n -= hk;  | 
427  | 0  |   if (n <= ODD_FACTORIAL_EXTTABLE_LIMIT) { | 
428  | 0  |     mp_limb_t cy;  | 
429  | 0  |     rn = SIZ (r);  | 
430  | 0  |     rp = MPZ_REALLOC (r, rn + 1);  | 
431  | 0  |     cy = mpn_mul_1 (rp, rp, rn, bc_bin_uiui (n, k));  | 
432  | 0  |     rp [rn] = cy;  | 
433  | 0  |     rn += cy != 0;  | 
434  | 0  |   } else { | 
435  | 0  |     mp_limb_t buffer[ODD_CENTRAL_BINOMIAL_TABLE_LIMIT + 3];  | 
436  | 0  |     mpz_t t;  | 
437  |  | 
  | 
438  | 0  |     ALLOC (t) = ODD_CENTRAL_BINOMIAL_TABLE_LIMIT + 3;  | 
439  | 0  |     PTR (t) = buffer;  | 
440  | 0  |     if ((! BIN_UIUI_RECURSIVE_SMALLDC) || k <= ODD_FACTORIAL_TABLE_LIMIT)  | 
441  | 0  |       mpz_smallk_bin_uiui (t, n, k);  | 
442  | 0  |     else  | 
443  | 0  |       mpz_smallkdc_bin_uiui (t, n, k);  | 
444  | 0  |     mpz_mul (r, r, t);  | 
445  | 0  |     rp = PTR (r);  | 
446  | 0  |     rn = SIZ (r);  | 
447  | 0  |   }  | 
448  |  | 
  | 
449  | 0  |   mpn_pi1_bdiv_q_1 (rp, rp, rn, bin2kk[k - ODD_CENTRAL_BINOMIAL_OFFSET],  | 
450  | 0  |         bin2kkinv[k - ODD_CENTRAL_BINOMIAL_OFFSET],  | 
451  | 0  |         fac2bin[k - ODD_CENTRAL_BINOMIAL_OFFSET] - (k != hk));  | 
452  |  |   /* A two-fold, branch-free normalisation is possible :*/  | 
453  |  |   /* rn -= rp[rn - 1] == 0; */  | 
454  |  |   /* rn -= rp[rn - 1] == 0; */  | 
455  | 0  |   MPN_NORMALIZE_NOT_ZERO (rp, rn);  | 
456  |  |  | 
457  | 0  |   SIZ(r) = rn;  | 
458  | 0  | }  | 
459  |  |  | 
460  |  | /* mpz_goetgheluck_bin_uiui(RESULT, N, K) -- Set RESULT to binomial(N,K).  | 
461  |  |  *  | 
462  |  |  * Contributed to the GNU project by Marco Bodrato.  | 
463  |  |  *  | 
464  |  |  * Implementation of the algorithm by P. Goetgheluck, "Computing  | 
465  |  |  * Binomial Coefficients", The American Mathematical Monthly, Vol. 94,  | 
466  |  |  * No. 4 (April 1987), pp. 360-365.  | 
467  |  |  *  | 
468  |  |  * Acknowledgment: Peter Luschny did spot the slowness of the previous  | 
469  |  |  * code and suggested the reference.  | 
470  |  |  */  | 
471  |  |  | 
472  |  | /* TODO: Remove duplicated constants / macros / static functions...  | 
473  |  |  */  | 
474  |  |  | 
475  |  | /*************************************************************/  | 
476  |  | /* Section macros: common macros, for swing/fac/bin (&sieve) */  | 
477  |  | /*************************************************************/  | 
478  |  |  | 
479  |  | #define FACTOR_LIST_APPEND(PR, MAX_PR, VEC, I)      \  | 
480  | 0  |   if ((PR) > (MAX_PR)) {         \ | 
481  | 0  |     (VEC)[(I)++] = (PR);          \  | 
482  | 0  |     (PR) = 1;             \  | 
483  | 0  |   }  | 
484  |  |  | 
485  |  | #define FACTOR_LIST_STORE(P, PR, MAX_PR, VEC, I)    \  | 
486  | 0  |   do {               \ | 
487  | 0  |     if ((PR) > (MAX_PR)) {         \ | 
488  | 0  |       (VEC)[(I)++] = (PR);          \  | 
489  | 0  |       (PR) = (P);           \  | 
490  | 0  |     } else             \  | 
491  | 0  |       (PR) *= (P);           \  | 
492  | 0  |   } while (0)  | 
493  |  |  | 
494  |  | #define LOOP_ON_SIEVE_CONTINUE(prime,end,sieve)     \  | 
495  | 0  |     __max_i = (end);            \  | 
496  | 0  |                 \  | 
497  | 0  |     do {             \ | 
498  | 0  |       ++__i;              \  | 
499  | 0  |       if (((sieve)[__index] & __mask) == 0)     \  | 
500  | 0  |   {             \ | 
501  | 0  |     mp_limb_t prime;          \  | 
502  | 0  |     prime = id_to_n(__i)  | 
503  |  |  | 
504  |  | #define LOOP_ON_SIEVE_BEGIN(prime,start,end,off,sieve)    \  | 
505  | 0  |   do {               \ | 
506  | 0  |     mp_limb_t __mask, __index, __max_i, __i;      \  | 
507  | 0  |                 \  | 
508  | 0  |     __i = (start)-(off);          \  | 
509  | 0  |     __index = __i / GMP_LIMB_BITS;       \  | 
510  | 0  |     __mask = CNST_LIMB(1) << (__i % GMP_LIMB_BITS);    \  | 
511  | 0  |     __i += (off);           \  | 
512  | 0  |                 \  | 
513  | 0  |     LOOP_ON_SIEVE_CONTINUE(prime,end,sieve)  | 
514  |  |  | 
515  |  | #define LOOP_ON_SIEVE_STOP          \  | 
516  | 0  |   }              \  | 
517  | 0  |       __mask = __mask << 1 | __mask >> (GMP_LIMB_BITS-1);  \  | 
518  | 0  |       __index += __mask & 1;          \  | 
519  | 0  |     }  while (__i <= __max_i)  | 
520  |  |  | 
521  |  | #define LOOP_ON_SIEVE_END         \  | 
522  | 0  |     LOOP_ON_SIEVE_STOP;           \  | 
523  | 0  |   } while (0)  | 
524  |  |  | 
525  |  | /*********************************************************/  | 
526  |  | /* Section sieve: sieving functions and tools for primes */  | 
527  |  | /*********************************************************/  | 
528  |  |  | 
529  |  | #if WANT_ASSERT  | 
530  |  | static mp_limb_t  | 
531  | 0  | bit_to_n (mp_limb_t bit) { return (bit*3+4)|1; } | 
532  |  | #endif  | 
533  |  |  | 
534  |  | /* id_to_n (x) = bit_to_n (x-1) = (id*3+1)|1*/  | 
535  |  | static mp_limb_t  | 
536  | 0  | id_to_n  (mp_limb_t id)  { return id*3+1+(id&1); } | 
537  |  |  | 
538  |  | /* n_to_bit (n) = ((n-1)&(-CNST_LIMB(2)))/3U-1 */  | 
539  |  | static mp_limb_t  | 
540  | 0  | n_to_bit (mp_limb_t n) { return ((n-5)|1)/3U; } | 
541  |  |  | 
542  |  | static mp_size_t  | 
543  | 0  | primesieve_size (mp_limb_t n) { return n_to_bit(n) / GMP_LIMB_BITS + 1; } | 
544  |  |  | 
545  |  | /*********************************************************/  | 
546  |  | /* Section binomial: fast binomial implementation        */  | 
547  |  | /*********************************************************/  | 
548  |  |  | 
549  |  | #define COUNT_A_PRIME(P, N, K, PR, MAX_PR, VEC, I)  \  | 
550  | 0  |   do {             \ | 
551  | 0  |     mp_limb_t __a, __b, __prime, __ma,__mb;   \  | 
552  | 0  |     __prime = (P);          \  | 
553  | 0  |     __a = (N); __b = (K); __mb = 0;     \  | 
554  | 0  |     FACTOR_LIST_APPEND(PR, MAX_PR, VEC, I);   \  | 
555  | 0  |     do {           \ | 
556  | 0  |       __mb += __b % __prime; __b /= __prime;    \  | 
557  | 0  |       __ma = __a % __prime; __a /= __prime;   \  | 
558  | 0  |       if (__ma < __mb) {       \ | 
559  | 0  |         __mb = 1; (PR) *= __prime;      \  | 
560  | 0  |       } else  __mb = 0;         \  | 
561  | 0  |     } while (__a >= __prime);        \  | 
562  | 0  |   } while (0)  | 
563  |  |  | 
564  |  | #define SH_COUNT_A_PRIME(P, N, K, PR, MAX_PR, VEC, I) \  | 
565  | 0  |   do {             \ | 
566  | 0  |     mp_limb_t __prime;          \  | 
567  | 0  |     __prime = (P);          \  | 
568  | 0  |     if (((N) % __prime) < ((K) % __prime)) {   \ | 
569  | 0  |       FACTOR_LIST_STORE (__prime, PR, MAX_PR, VEC, I); \  | 
570  | 0  |     }             \  | 
571  | 0  |   } while (0)  | 
572  |  |  | 
573  |  | /* Returns an approximation of the sqare root of x.  | 
574  |  |  * It gives:  | 
575  |  |  *   limb_apprsqrt (x) ^ 2 <= x < (limb_apprsqrt (x)+1) ^ 2  | 
576  |  |  * or  | 
577  |  |  *   x <= limb_apprsqrt (x) ^ 2 <= x * 9/8  | 
578  |  |  */  | 
579  |  | static mp_limb_t  | 
580  |  | limb_apprsqrt (mp_limb_t x)  | 
581  | 0  | { | 
582  | 0  |   int s;  | 
583  |  | 
  | 
584  | 0  |   ASSERT (x > 2);  | 
585  | 0  |   count_leading_zeros (s, x);  | 
586  | 0  |   s = (GMP_LIMB_BITS - s) >> 1;  | 
587  | 0  |   return ((CNST_LIMB(1) << s) + (x >> s)) >> 1;  | 
588  | 0  | }  | 
589  |  |  | 
590  |  | static void  | 
591  |  | mpz_goetgheluck_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)  | 
592  | 0  | { | 
593  | 0  |   mp_limb_t *sieve, *factors, count;  | 
594  | 0  |   mp_limb_t prod, max_prod;  | 
595  | 0  |   mp_size_t j;  | 
596  | 0  |   TMP_DECL;  | 
597  |  | 
  | 
598  | 0  |   ASSERT (BIN_GOETGHELUCK_THRESHOLD >= 13);  | 
599  | 0  |   ASSERT (n >= 25);  | 
600  |  |  | 
601  | 0  |   TMP_MARK;  | 
602  | 0  |   sieve = TMP_ALLOC_LIMBS (primesieve_size (n));  | 
603  |  | 
  | 
604  | 0  |   count = gmp_primesieve (sieve, n) + 1;  | 
605  | 0  |   factors = TMP_ALLOC_LIMBS (count / log_n_max (n) + 1);  | 
606  |  | 
  | 
607  | 0  |   max_prod = GMP_NUMB_MAX / n;  | 
608  |  |  | 
609  |  |   /* Handle primes = 2, 3 separately. */  | 
610  | 0  |   popc_limb (count, n - k);  | 
611  | 0  |   popc_limb (j, k);  | 
612  | 0  |   count += j;  | 
613  | 0  |   popc_limb (j, n);  | 
614  | 0  |   count -= j;  | 
615  | 0  |   prod = CNST_LIMB(1) << count;  | 
616  |  | 
  | 
617  | 0  |   j = 0;  | 
618  | 0  |   COUNT_A_PRIME (3, n, k, prod, max_prod, factors, j);  | 
619  |  |  | 
620  |  |   /* Accumulate prime factors from 5 to n/2 */  | 
621  | 0  |     { | 
622  | 0  |       mp_limb_t s;  | 
623  |  | 
  | 
624  | 0  |       s = limb_apprsqrt(n);  | 
625  | 0  |       s = n_to_bit (s);  | 
626  | 0  |       ASSERT (bit_to_n (s+1) * bit_to_n (s+1) > n);  | 
627  | 0  |       ASSERT (s <= n_to_bit (n >> 1));  | 
628  | 0  |       LOOP_ON_SIEVE_BEGIN (prime, n_to_bit (5), s, 0,sieve);  | 
629  | 0  |       COUNT_A_PRIME (prime, n, k, prod, max_prod, factors, j);  | 
630  | 0  |       LOOP_ON_SIEVE_STOP;  | 
631  |  | 
  | 
632  | 0  |       ASSERT (max_prod <= GMP_NUMB_MAX / 2);  | 
633  | 0  |       max_prod <<= 1;  | 
634  |  | 
  | 
635  | 0  |       LOOP_ON_SIEVE_CONTINUE (prime, n_to_bit (n >> 1),sieve);  | 
636  | 0  |       SH_COUNT_A_PRIME (prime, n, k, prod, max_prod, factors, j);  | 
637  | 0  |       LOOP_ON_SIEVE_END;  | 
638  |  |  | 
639  | 0  |       max_prod >>= 1;  | 
640  | 0  |     }  | 
641  |  |  | 
642  |  |   /* Store primes from (n-k)+1 to n */  | 
643  | 0  |   ASSERT (n_to_bit (n - k) < n_to_bit (n));  | 
644  |  |  | 
645  | 0  |   LOOP_ON_SIEVE_BEGIN (prime, n_to_bit (n - k) + 1, n_to_bit (n), 0,sieve);  | 
646  | 0  |   FACTOR_LIST_STORE (prime, prod, max_prod, factors, j);  | 
647  | 0  |   LOOP_ON_SIEVE_END;  | 
648  |  | 
  | 
649  | 0  |   if (LIKELY (j != 0))  | 
650  | 0  |     { | 
651  | 0  |       factors[j++] = prod;  | 
652  | 0  |       mpz_prodlimbs (r, factors, j);  | 
653  | 0  |     }  | 
654  | 0  |   else  | 
655  | 0  |     { | 
656  | 0  |       MPZ_NEWALLOC (r, 1)[0] = prod;  | 
657  | 0  |       SIZ (r) = 1;  | 
658  | 0  |     }  | 
659  | 0  |   TMP_FREE;  | 
660  | 0  | }  | 
661  |  |  | 
662  |  | #undef COUNT_A_PRIME  | 
663  |  | #undef SH_COUNT_A_PRIME  | 
664  |  | #undef LOOP_ON_SIEVE_END  | 
665  |  | #undef LOOP_ON_SIEVE_STOP  | 
666  |  | #undef LOOP_ON_SIEVE_BEGIN  | 
667  |  | #undef LOOP_ON_SIEVE_CONTINUE  | 
668  |  |  | 
669  |  | /*********************************************************/  | 
670  |  | /* End of implementation of Goetgheluck's algorithm      */  | 
671  |  | /*********************************************************/  | 
672  |  |  | 
673  |  | void  | 
674  |  | mpz_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)  | 
675  | 0  | { | 
676  | 0  |   if (UNLIKELY (n < k)) { | 
677  | 0  |     SIZ (r) = 0;  | 
678  |  | #if BITS_PER_ULONG > GMP_NUMB_BITS  | 
679  |  |   } else if (UNLIKELY (n > GMP_NUMB_MAX)) { | 
680  |  |     mpz_t tmp;  | 
681  |  |  | 
682  |  |     mpz_init_set_ui (tmp, n);  | 
683  |  |     mpz_bin_ui (r, tmp, k);  | 
684  |  |     mpz_clear (tmp);  | 
685  |  | #endif  | 
686  | 0  |   } else { | 
687  | 0  |     ASSERT (n <= GMP_NUMB_MAX);  | 
688  |  |     /* Rewrite bin(n,k) as bin(n,n-k) if that is smaller. */  | 
689  | 0  |     k = MIN (k, n - k);  | 
690  | 0  |     if (k < 2) { | 
691  | 0  |       MPZ_NEWALLOC (r, 1)[0] = k ? n : 1; /* 1 + ((-k) & (n-1)); */  | 
692  | 0  |       SIZ(r) = 1;  | 
693  | 0  |     } else if (n <= ODD_FACTORIAL_EXTTABLE_LIMIT) { /* k >= 2, n >= 4 */ | 
694  | 0  |       MPZ_NEWALLOC (r, 1)[0] = bc_bin_uiui (n, k);  | 
695  | 0  |       SIZ(r) = 1;  | 
696  | 0  |     } else if (k <= ODD_FACTORIAL_TABLE_LIMIT)  | 
697  | 0  |       mpz_smallk_bin_uiui (r, n, k);  | 
698  | 0  |     else if (BIN_UIUI_ENABLE_SMALLDC &&  | 
699  | 0  |        k <= (BIN_UIUI_RECURSIVE_SMALLDC ? ODD_CENTRAL_BINOMIAL_TABLE_LIMIT : ODD_FACTORIAL_TABLE_LIMIT)* 2)  | 
700  | 0  |       mpz_smallkdc_bin_uiui (r, n, k);  | 
701  | 0  |     else if (ABOVE_THRESHOLD (k, BIN_GOETGHELUCK_THRESHOLD) &&  | 
702  | 0  |        k > (n >> 4)) /* k > ODD_FACTORIAL_TABLE_LIMIT */  | 
703  | 0  |       mpz_goetgheluck_bin_uiui (r, n, k);  | 
704  | 0  |     else  | 
705  | 0  |       mpz_bdiv_bin_uiui (r, n, k);  | 
706  | 0  |   }  | 
707  | 0  | }  |