/src/gmp/mpn/generic/hgcd2-div.h
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /* hgcd2-div.h  | 
2  |  |  | 
3  |  |    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY  | 
4  |  |    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST  | 
5  |  |    GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.  | 
6  |  |  | 
7  |  | Copyright 1996, 1998, 2000-2004, 2008, 2012, 2019, 2020 Free Software  | 
8  |  | Foundation, Inc.  | 
9  |  |  | 
10  |  | This file is part of the GNU MP Library.  | 
11  |  |  | 
12  |  | The GNU MP Library is free software; you can redistribute it and/or modify  | 
13  |  | it under the terms of either:  | 
14  |  |  | 
15  |  |   * the GNU Lesser General Public License as published by the Free  | 
16  |  |     Software Foundation; either version 3 of the License, or (at your  | 
17  |  |     option) any later version.  | 
18  |  |  | 
19  |  | or  | 
20  |  |  | 
21  |  |   * the GNU General Public License as published by the Free Software  | 
22  |  |     Foundation; either version 2 of the License, or (at your option) any  | 
23  |  |     later version.  | 
24  |  |  | 
25  |  | or both in parallel, as here.  | 
26  |  |  | 
27  |  | The GNU MP Library is distributed in the hope that it will be useful, but  | 
28  |  | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY  | 
29  |  | or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License  | 
30  |  | for more details.  | 
31  |  |  | 
32  |  | You should have received copies of the GNU General Public License and the  | 
33  |  | GNU Lesser General Public License along with the GNU MP Library.  If not,  | 
34  |  | see https://www.gnu.org/licenses/.  */  | 
35  |  |  | 
36  |  | #include "gmp-impl.h"  | 
37  |  | #include "longlong.h"  | 
38  |  |  | 
39  |  | #ifndef HGCD2_DIV1_METHOD  | 
40  |  | #define HGCD2_DIV1_METHOD 3  | 
41  |  | #endif  | 
42  |  |  | 
43  |  | #ifndef HGCD2_DIV2_METHOD  | 
44  |  | #define HGCD2_DIV2_METHOD 2  | 
45  |  | #endif  | 
46  |  |  | 
47  |  | #if HAVE_NATIVE_mpn_div_11  | 
48  |  |  | 
49  |  | #define div1 mpn_div_11  | 
50  |  | /* Single-limb division optimized for small quotients.  | 
51  |  |    Returned value holds d0 = r, d1 = q. */  | 
52  |  | mp_double_limb_t div1 (mp_limb_t, mp_limb_t);  | 
53  |  |  | 
54  |  | #elif HGCD2_DIV1_METHOD == 1  | 
55  |  |  | 
56  |  | static inline mp_double_limb_t  | 
57  |  | div1 (mp_limb_t n0, mp_limb_t d0)  | 
58  |  | { | 
59  |  |   mp_double_limb_t res;  | 
60  |  |   res.d1 = n0 / d0;  | 
61  |  |   res.d0 = n0 - res.d1 * d0;  | 
62  |  |  | 
63  |  |   return res;  | 
64  |  | }  | 
65  |  |  | 
66  |  | #elif HGCD2_DIV1_METHOD == 2  | 
67  |  |  | 
68  |  | static mp_double_limb_t  | 
69  |  | div1 (mp_limb_t n0, mp_limb_t d0)  | 
70  |  | { | 
71  |  |   mp_double_limb_t res;  | 
72  |  |   int ncnt, dcnt, cnt;  | 
73  |  |   mp_limb_t q;  | 
74  |  |   mp_limb_t mask;  | 
75  |  |  | 
76  |  |   ASSERT (n0 >= d0);  | 
77  |  |  | 
78  |  |   count_leading_zeros (ncnt, n0);  | 
79  |  |   count_leading_zeros (dcnt, d0);  | 
80  |  |   cnt = dcnt - ncnt;  | 
81  |  |  | 
82  |  |   d0 <<= cnt;  | 
83  |  |  | 
84  |  |   q = -(mp_limb_t) (n0 >= d0);  | 
85  |  |   n0 -= d0 & q;  | 
86  |  |   d0 >>= 1;  | 
87  |  |   q = -q;  | 
88  |  |  | 
89  |  |   while (--cnt >= 0)  | 
90  |  |     { | 
91  |  |       mask = -(mp_limb_t) (n0 >= d0);  | 
92  |  |       n0 -= d0 & mask;  | 
93  |  |       d0 >>= 1;  | 
94  |  |       q = (q << 1) - mask;  | 
95  |  |     }  | 
96  |  |  | 
97  |  |   res.d0 = n0;  | 
98  |  |   res.d1 = q;  | 
99  |  |   return res;  | 
100  |  | }  | 
101  |  |  | 
102  |  | #elif HGCD2_DIV1_METHOD == 3  | 
103  |  |  | 
104  |  | static inline mp_double_limb_t  | 
105  |  | div1 (mp_limb_t n0, mp_limb_t d0)  | 
106  |  | { | 
107  |  |   mp_double_limb_t res;  | 
108  |  |   if (UNLIKELY ((d0 >> (GMP_LIMB_BITS - 3)) != 0)  | 
109  |  |       || UNLIKELY (n0 >= (d0 << 3)))  | 
110  |  |     { | 
111  |  |       res.d1 = n0 / d0;  | 
112  |  |       res.d0 = n0 - res.d1 * d0;  | 
113  |  |     }  | 
114  |  |   else  | 
115  |  |     { | 
116  |  |       mp_limb_t q, mask;  | 
117  |  |  | 
118  |  |       d0 <<= 2;  | 
119  |  |  | 
120  |  |       mask = -(mp_limb_t) (n0 >= d0);  | 
121  |  |       n0 -= d0 & mask;  | 
122  |  |       q = 4 & mask;  | 
123  |  |  | 
124  |  |       d0 >>= 1;  | 
125  |  |       mask = -(mp_limb_t) (n0 >= d0);  | 
126  |  |       n0 -= d0 & mask;  | 
127  |  |       q += 2 & mask;  | 
128  |  |  | 
129  |  |       d0 >>= 1;  | 
130  |  |       mask = -(mp_limb_t) (n0 >= d0);  | 
131  |  |       n0 -= d0 & mask;  | 
132  |  |       q -= mask;  | 
133  |  |  | 
134  |  |       res.d0 = n0;  | 
135  |  |       res.d1 = q;  | 
136  |  |     }  | 
137  |  |   return res;  | 
138  |  | }  | 
139  |  |  | 
140  |  | #elif HGCD2_DIV1_METHOD == 4  | 
141  |  |  | 
142  |  | /* Table quotients.  We extract the NBITS most significant bits of the  | 
143  |  |    numerator limb, and the corresponding bits from the divisor limb, and use  | 
144  |  |    these to form an index into the table.  This method is probably only useful  | 
145  |  |    for short pipelines with slow multiplication.  | 
146  |  |  | 
147  |  |    Possible improvements:  | 
148  |  |  | 
149  |  |    * Perhaps extract the highest NBITS of the divisor instead of the same bits  | 
150  |  |      as from the numerator.  That would require another count_leading_zeros,  | 
151  |  |      and a post-multiply shift of the quotient.  | 
152  |  |  | 
153  |  |    * Compress tables?  Their values are tiny, and there are lots of zero  | 
154  |  |      entries (which are never used).  | 
155  |  |  | 
156  |  |    * Round the table entries more cleverly?  | 
157  |  | */  | 
158  |  |  | 
159  |  | #ifndef NBITS  | 
160  |  | #define NBITS 5  | 
161  |  | #endif  | 
162  |  |  | 
163  |  | #if NBITS == 5  | 
164  |  | /* This needs full division about 13.2% of the time. */  | 
165  |  | static const unsigned char tab[512] = { | 
166  |  | 17, 9, 5,4,3,2,2,2,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
167  |  | 18, 9, 6,4,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
168  |  | 19,10, 6,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
169  |  | 20,10, 6,5,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,  | 
170  |  | 21,11, 7,5,4,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,  | 
171  |  | 22,11, 7,5,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,  | 
172  |  | 23,12, 7,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,  | 
173  |  | 24,12, 8,6,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,  | 
174  |  | 25,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,  | 
175  |  | 26,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,  | 
176  |  | 27,14, 9,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,  | 
177  |  | 28,14, 9,7,5,4,3,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,  | 
178  |  | 29,15,10,7,5,4,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,  | 
179  |  | 30,15,10,7,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,  | 
180  |  | 31,16,10,7,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,  | 
181  |  | 32,16,11,8,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1  | 
182  |  | };  | 
183  |  | #elif NBITS == 6  | 
184  |  | /* This needs full division about 9.8% of the time. */  | 
185  |  | static const unsigned char tab[2048] = { | 
186  |  | 33,17,11, 8, 6, 5,4,4,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  | 
187  |  |  1, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
188  |  | 34,17,11, 8, 6, 5,4,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  | 
189  |  |  1, 1, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
190  |  | 35,18,12, 9, 7, 5,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  | 
191  |  |  1, 1, 1, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
192  |  | 36,18,12, 9, 7, 6,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  | 
193  |  |  1, 1, 1, 1, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
194  |  | 37,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  | 
195  |  |  1, 1, 1, 1, 1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
196  |  | 38,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  | 
197  |  |  1, 1, 1, 1, 1, 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
198  |  | 39,20,13,10, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,  | 
199  |  |  1, 1, 1, 1, 1, 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
200  |  | 40,20,14,10, 8, 6,5,5,4,3,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,  | 
201  |  |  1, 1, 1, 1, 1, 1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
202  |  | 41,21,14,10, 8, 6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,  | 
203  |  |  1, 1, 1, 1, 1, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
204  |  | 42,21,14,10, 8, 7,6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,  | 
205  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
206  |  | 43,22,15,11, 8, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,  | 
207  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
208  |  | 44,22,15,11, 9, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,  | 
209  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
210  |  | 45,23,15,11, 9, 7,6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,  | 
211  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
212  |  | 46,23,16,11, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,  | 
213  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
214  |  | 47,24,16,12, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,  | 
215  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
216  |  | 48,24,16,12, 9, 8,6,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,  | 
217  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
218  |  | 49,25,17,12,10, 8,7,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,  | 
219  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
220  |  | 50,25,17,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,  | 
221  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
222  |  | 51,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,  | 
223  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,  | 
224  |  | 52,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,  | 
225  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,  | 
226  |  | 53,27,18,13,10, 9,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,  | 
227  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,  | 
228  |  | 54,27,19,14,11, 9,7,6,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,  | 
229  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,  | 
230  |  | 55,28,19,14,11, 9,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,  | 
231  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,  | 
232  |  | 56,28,19,14,11, 9,8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,  | 
233  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,  | 
234  |  | 57,29,20,14,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,1,  | 
235  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,  | 
236  |  | 58,29,20,15,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,  | 
237  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,  | 
238  |  | 59,30,20,15,12,10,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,  | 
239  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,  | 
240  |  | 60,30,21,15,12,10,8,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,  | 
241  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,  | 
242  |  | 61,31,21,15,12,10,8,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,  | 
243  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,  | 
244  |  | 62,31,22,16,12,10,9,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,  | 
245  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,  | 
246  |  | 63,32,22,16,13,10,9,7,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,1,  | 
247  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,  | 
248  |  | 64,32,22,16,13,10,9,8,7,6,5,5,4,4,4,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,  | 
249  |  |  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1  | 
250  |  | };  | 
251  |  | #else  | 
252  |  | #error No table for provided NBITS  | 
253  |  | #endif  | 
254  |  |  | 
255  |  | /* Doing tabp with a #define makes compiler warnings about pointing outside an  | 
256  |  |    object go away.  We used to define this as a variable.  It is not clear if  | 
257  |  |    e.g.  (vector[100] - 10) + 10 is well- defined as per the C standard;  | 
258  |  |    (vector[100] + 10) - 10 surely is and there is no sequence point so the  | 
259  |  |    expressions should be equivalent.  To make this safe, we might want to  | 
260  |  |    define tabp as a macro with the index as an argument.  Depending on the  | 
261  |  |    platform, relocs might allow for assembly-time or linker-time resolution to  | 
262  |  |    take place. */  | 
263  |  | #define tabp (tab - (1 << (NBITS - 1) << NBITS))  | 
264  |  |  | 
265  |  | static inline mp_double_limb_t  | 
266  |  | div1 (mp_limb_t n0, mp_limb_t d0)  | 
267  |  | { | 
268  |  |   int ncnt;  | 
269  |  |   size_t nbi, dbi;  | 
270  |  |   mp_limb_t q0;  | 
271  |  |   mp_limb_t r0;  | 
272  |  |   mp_limb_t mask;  | 
273  |  |   mp_double_limb_t res;  | 
274  |  |  | 
275  |  |   ASSERT (n0 >= d0);    /* Actually only msb position is critical. */  | 
276  |  |  | 
277  |  |   count_leading_zeros (ncnt, n0);  | 
278  |  |   nbi = n0 << ncnt >> (GMP_LIMB_BITS - NBITS);  | 
279  |  |   dbi = d0 << ncnt >> (GMP_LIMB_BITS - NBITS);  | 
280  |  |  | 
281  |  |   q0 = tabp[(nbi << NBITS) + dbi];  | 
282  |  |   r0 = n0 - q0 * d0;  | 
283  |  |   mask = -(mp_limb_t) (r0 >= d0);  | 
284  |  |   q0 -= mask;  | 
285  |  |   r0 -= d0 & mask;  | 
286  |  |  | 
287  |  |   if (UNLIKELY (r0 >= d0))  | 
288  |  |     { | 
289  |  |       q0 = n0 / d0;  | 
290  |  |       r0 = n0 - q0 * d0;  | 
291  |  |     }  | 
292  |  |  | 
293  |  |   res.d1 = q0;  | 
294  |  |   res.d0 = r0;  | 
295  |  |   return res;  | 
296  |  | }  | 
297  |  |  | 
298  |  | #elif HGCD2_DIV1_METHOD == 5  | 
299  |  |  | 
300  |  | /* Table inverses of divisors.  We don't bother with suppressing the msb from  | 
301  |  |    the tables.  We index with the NBITS most significant divisor bits,  | 
302  |  |    including the always-set highest bit, but use addressing trickery via tabp  | 
303  |  |    to suppress it.  | 
304  |  |  | 
305  |  |    Possible improvements:  | 
306  |  |  | 
307  |  |    * Do first multiply using 32-bit operations on 64-bit computers.  At least  | 
308  |  |      on most Arm64 cores, that uses 3 times less resources.  It also saves on  | 
309  |  |      many x86-64 processors.  | 
310  |  | */  | 
311  |  |  | 
312  |  | #ifndef NBITS  | 
313  | 0  | #define NBITS 7  | 
314  |  | #endif  | 
315  |  |  | 
316  |  | #if NBITS == 5  | 
317  |  | /* This needs full division about 1.63% of the time. */  | 
318  |  | static const unsigned char tab[16] = { | 
319  |  |  63, 59, 55, 52, 50, 47, 45, 43, 41, 39, 38, 36, 35, 34, 33, 32  | 
320  |  | };  | 
321  |  | #elif NBITS == 6  | 
322  |  | /* This needs full division about 0.93% of the time. */  | 
323  |  | static const unsigned char tab[32] = { | 
324  |  | 127,123,119,116,112,109,106,104,101, 98, 96, 94, 92, 90, 88, 86,  | 
325  |  |  84, 82, 80, 79, 77, 76, 74, 73, 72, 70, 69, 68, 67, 66, 65, 64  | 
326  |  | };  | 
327  |  | #elif NBITS == 7  | 
328  |  | /* This needs full division about 0.49% of the time. */  | 
329  |  | static const unsigned char tab[64] = { | 
330  |  | 255,251,247,243,239,236,233,229,226,223,220,217,214,211,209,206,  | 
331  |  | 203,201,198,196,194,191,189,187,185,183,181,179,177,175,173,171,  | 
332  |  | 169,167,166,164,162,161,159,158,156,155,153,152,150,149,147,146,  | 
333  |  | 145,143,142,141,140,139,137,136,135,134,133,132,131,130,129,128  | 
334  |  | };  | 
335  |  | #elif NBITS == 8  | 
336  |  | /* This needs full division about 0.26% of the time. */  | 
337  |  | static const unsigned short tab[128] = { | 
338  |  | 511,507,503,499,495,491,488,484,480,477,473,470,467,463,460,457,  | 
339  |  | 454,450,447,444,441,438,435,433,430,427,424,421,419,416,413,411,  | 
340  |  | 408,406,403,401,398,396,393,391,389,386,384,382,380,377,375,373,  | 
341  |  | 371,369,367,365,363,361,359,357,355,353,351,349,347,345,343,342,  | 
342  |  | 340,338,336,335,333,331,329,328,326,325,323,321,320,318,317,315,  | 
343  |  | 314,312,311,309,308,306,305,303,302,301,299,298,296,295,294,292,  | 
344  |  | 291,290,288,287,286,285,283,282,281,280,279,277,276,275,274,273,  | 
345  |  | 272,270,269,268,267,266,265,264,263,262,261,260,259,258,257,256  | 
346  |  | };  | 
347  |  | #else  | 
348  |  | #error No table for provided NBITS  | 
349  |  | #endif  | 
350  |  |  | 
351  |  | /* Doing tabp with a #define makes compiler warnings about pointing outside an  | 
352  |  |    object go away.  We used to define this as a variable.  It is not clear if  | 
353  |  |    e.g.  (vector[100] - 10) + 10 is well- defined as per the C standard;  | 
354  |  |    (vector[100] + 10) - 10 surely is and there is no sequence point so the  | 
355  |  |    expressions should be equivalent.  To make this safe, we might want to  | 
356  |  |    define tabp as a macro with the index as an argument.  Depending on the  | 
357  |  |    platform, relocs might allow for assembly-time or linker-time resolution to  | 
358  |  |    take place. */  | 
359  | 0  | #define tabp (tab - (1 << (NBITS - 1)))  | 
360  |  |  | 
361  |  | static inline mp_double_limb_t  | 
362  |  | div1 (mp_limb_t n0, mp_limb_t d0)  | 
363  | 0  | { | 
364  | 0  |   int ncnt, dcnt;  | 
365  | 0  |   size_t dbi;  | 
366  | 0  |   mp_limb_t inv;  | 
367  | 0  |   mp_limb_t q0;  | 
368  | 0  |   mp_limb_t r0;  | 
369  | 0  |   mp_limb_t mask;  | 
370  | 0  |   mp_double_limb_t res;  | 
371  |  | 
  | 
372  | 0  |   count_leading_zeros (ncnt, n0);  | 
373  | 0  |   count_leading_zeros (dcnt, d0);  | 
374  |  | 
  | 
375  | 0  |   dbi = d0 << dcnt >> (GMP_LIMB_BITS - NBITS);  | 
376  | 0  |   inv = tabp[dbi];  | 
377  | 0  |   q0 = ((n0 << ncnt) >> (NBITS + 1)) * inv >> (GMP_LIMB_BITS - 1 + ncnt - dcnt);  | 
378  | 0  |   r0 = n0 - q0 * d0;  | 
379  | 0  |   mask = -(mp_limb_t) (r0 >= d0);  | 
380  | 0  |   q0 -= mask;  | 
381  | 0  |   r0 -= d0 & mask;  | 
382  |  | 
  | 
383  | 0  |   if (UNLIKELY (r0 >= d0))  | 
384  | 0  |     { | 
385  | 0  |       q0 = n0 / d0;  | 
386  | 0  |       r0 = n0 - q0 * d0;  | 
387  | 0  |     }  | 
388  |  | 
  | 
389  | 0  |   res.d1 = q0;  | 
390  | 0  |   res.d0 = r0;  | 
391  | 0  |   return res;  | 
392  | 0  | }  | 
393  |  |  | 
394  |  | #else  | 
395  |  | #error Unknown HGCD2_DIV1_METHOD  | 
396  |  | #endif  | 
397  |  |  | 
398  |  | #if HAVE_NATIVE_mpn_div_22  | 
399  |  |  | 
400  |  | #define div2 mpn_div_22  | 
401  |  | /* Two-limb division optimized for small quotients.  */  | 
402  |  | mp_limb_t div2 (mp_ptr, mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t);  | 
403  |  |  | 
404  |  | #elif HGCD2_DIV2_METHOD == 1  | 
405  |  |  | 
406  |  | static mp_limb_t  | 
407  |  | div2 (mp_ptr rp,  | 
408  |  |       mp_limb_t n1, mp_limb_t n0,  | 
409  |  |       mp_limb_t d1, mp_limb_t d0)  | 
410  |  | { | 
411  |  |   mp_double_limb_t rq = div1 (n1, d1);  | 
412  |  |   if (UNLIKELY (rq.d1 > d1))  | 
413  |  |     { | 
414  |  |       mp_limb_t n2, q, t1, t0;  | 
415  |  |       int c;  | 
416  |  |  | 
417  |  |       /* Normalize */  | 
418  |  |       count_leading_zeros (c, d1);  | 
419  |  |       ASSERT (c > 0);  | 
420  |  |  | 
421  |  |       n2 = n1 >> (GMP_LIMB_BITS - c);  | 
422  |  |       n1 = (n1 << c) | (n0 >> (GMP_LIMB_BITS - c));  | 
423  |  |       n0 <<= c;  | 
424  |  |       d1 = (d1 << c) | (d0 >> (GMP_LIMB_BITS - c));  | 
425  |  |       d0 <<= c;  | 
426  |  |  | 
427  |  |       udiv_qrnnd (q, n1, n2, n1, d1);  | 
428  |  |       umul_ppmm (t1, t0, q, d0);  | 
429  |  |       if (t1 > n1 || (t1 == n1 && t0 > n0))  | 
430  |  |   { | 
431  |  |     ASSERT (q > 0);  | 
432  |  |     q--;  | 
433  |  |     sub_ddmmss (t1, t0, t1, t0, d1, d0);  | 
434  |  |   }  | 
435  |  |       sub_ddmmss (n1, n0, n1, n0, t1, t0);  | 
436  |  |  | 
437  |  |       /* Undo normalization */  | 
438  |  |       rp[0] = (n0 >> c) | (n1 << (GMP_LIMB_BITS - c));  | 
439  |  |       rp[1] = n1 >> c;  | 
440  |  |  | 
441  |  |       return q;  | 
442  |  |     }  | 
443  |  |   else  | 
444  |  |     { | 
445  |  |       mp_limb_t q, t1, t0;  | 
446  |  |       n1 = rq.d0;  | 
447  |  |       q = rq.d1;  | 
448  |  |       umul_ppmm (t1, t0, q, d0);  | 
449  |  |       if (UNLIKELY (t1 >= n1) && (t1 > n1 || t0 > n0))  | 
450  |  |   { | 
451  |  |     ASSERT (q > 0);  | 
452  |  |     q--;  | 
453  |  |     sub_ddmmss (t1, t0, t1, t0, d1, d0);  | 
454  |  |   }  | 
455  |  |       sub_ddmmss (rp[1], rp[0], n1, n0, t1, t0);  | 
456  |  |       return q;  | 
457  |  |     }  | 
458  |  | }  | 
459  |  |  | 
460  |  | #elif HGCD2_DIV2_METHOD == 2  | 
461  |  |  | 
462  |  | /* Bit-wise div2. Relies on fast count_leading_zeros. */  | 
463  |  | static mp_limb_t  | 
464  |  | div2 (mp_ptr rp,  | 
465  |  |       mp_limb_t n1, mp_limb_t n0,  | 
466  |  |       mp_limb_t d1, mp_limb_t d0)  | 
467  | 0  | { | 
468  | 0  |   mp_limb_t q = 0;  | 
469  | 0  |   int ncnt;  | 
470  | 0  |   int dcnt;  | 
471  |  | 
  | 
472  | 0  |   count_leading_zeros (ncnt, n1);  | 
473  | 0  |   count_leading_zeros (dcnt, d1);  | 
474  | 0  |   dcnt -= ncnt;  | 
475  |  | 
  | 
476  | 0  |   d1 = (d1 << dcnt) + (d0 >> 1 >> (GMP_LIMB_BITS - 1 - dcnt));  | 
477  | 0  |   d0 <<= dcnt;  | 
478  |  | 
  | 
479  | 0  |   do  | 
480  | 0  |     { | 
481  | 0  |       mp_limb_t mask;  | 
482  | 0  |       q <<= 1;  | 
483  | 0  |       if (UNLIKELY (n1 == d1))  | 
484  | 0  |   mask = -(n0 >= d0);  | 
485  | 0  |       else  | 
486  | 0  |   mask = -(n1 > d1);  | 
487  |  | 
  | 
488  | 0  |       q -= mask;  | 
489  |  | 
  | 
490  | 0  |       sub_ddmmss (n1, n0, n1, n0, mask & d1, mask & d0);  | 
491  |  | 
  | 
492  | 0  |       d0 = (d1 << (GMP_LIMB_BITS - 1)) | (d0 >> 1);  | 
493  | 0  |       d1 = d1 >> 1;  | 
494  | 0  |     }  | 
495  | 0  |   while (dcnt--);  | 
496  |  | 
  | 
497  | 0  |   rp[0] = n0;  | 
498  | 0  |   rp[1] = n1;  | 
499  |  | 
  | 
500  | 0  |   return q;  | 
501  | 0  | }  | 
502  |  | #else  | 
503  |  | #error Unknown HGCD2_DIV2_METHOD  | 
504  |  | #endif  |