/src/gmp-6.2.1/mpn/hgcd2.c
Line | Count | Source |
1 | | /* hgcd2.c |
2 | | |
3 | | THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY |
4 | | SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST |
5 | | GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. |
6 | | |
7 | | Copyright 1996, 1998, 2000-2004, 2008, 2012, 2019 Free Software Foundation, |
8 | | Inc. |
9 | | |
10 | | This file is part of the GNU MP Library. |
11 | | |
12 | | The GNU MP Library is free software; you can redistribute it and/or modify |
13 | | it under the terms of either: |
14 | | |
15 | | * the GNU Lesser General Public License as published by the Free |
16 | | Software Foundation; either version 3 of the License, or (at your |
17 | | option) any later version. |
18 | | |
19 | | or |
20 | | |
21 | | * the GNU General Public License as published by the Free Software |
22 | | Foundation; either version 2 of the License, or (at your option) any |
23 | | later version. |
24 | | |
25 | | or both in parallel, as here. |
26 | | |
27 | | The GNU MP Library is distributed in the hope that it will be useful, but |
28 | | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
29 | | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
30 | | for more details. |
31 | | |
32 | | You should have received copies of the GNU General Public License and the |
33 | | GNU Lesser General Public License along with the GNU MP Library. If not, |
34 | | see https://www.gnu.org/licenses/. */ |
35 | | |
36 | | #include "gmp-impl.h" |
37 | | #include "longlong.h" |
38 | | |
39 | | #ifndef HGCD2_DIV1_METHOD |
40 | | #define HGCD2_DIV1_METHOD 3 |
41 | | #endif |
42 | | |
43 | | #ifndef HGCD2_DIV2_METHOD |
44 | | #define HGCD2_DIV2_METHOD 2 |
45 | | #endif |
46 | | |
47 | | #if GMP_NAIL_BITS != 0 |
48 | | #error Nails not implemented |
49 | | #endif |
50 | | |
51 | | #if HAVE_NATIVE_mpn_div_11 |
52 | | |
53 | | #define div1 mpn_div_11 |
54 | | /* Single-limb division optimized for small quotients. |
55 | | Returned value holds d0 = r, d1 = q. */ |
56 | | mp_double_limb_t div1 (mp_limb_t, mp_limb_t); |
57 | | |
58 | | #elif HGCD2_DIV1_METHOD == 1 |
59 | | |
60 | | static inline mp_double_limb_t |
61 | | div1 (mp_limb_t n0, mp_limb_t d0) |
62 | | { |
63 | | mp_double_limb_t res; |
64 | | res.d1 = n0 / d0; |
65 | | res.d0 = n0 - res.d1 * d0; |
66 | | |
67 | | return res; |
68 | | } |
69 | | |
70 | | #elif HGCD2_DIV1_METHOD == 2 |
71 | | |
72 | | static mp_double_limb_t |
73 | | div1 (mp_limb_t n0, mp_limb_t d0) |
74 | | { |
75 | | mp_double_limb_t res; |
76 | | int ncnt, dcnt, cnt; |
77 | | mp_limb_t q; |
78 | | mp_limb_t mask; |
79 | | |
80 | | ASSERT (n0 >= d0); |
81 | | |
82 | | count_leading_zeros (ncnt, n0); |
83 | | count_leading_zeros (dcnt, d0); |
84 | | cnt = dcnt - ncnt; |
85 | | |
86 | | d0 <<= cnt; |
87 | | |
88 | | q = -(mp_limb_t) (n0 >= d0); |
89 | | n0 -= d0 & q; |
90 | | d0 >>= 1; |
91 | | q = -q; |
92 | | |
93 | | while (--cnt >= 0) |
94 | | { |
95 | | mask = -(mp_limb_t) (n0 >= d0); |
96 | | n0 -= d0 & mask; |
97 | | d0 >>= 1; |
98 | | q = (q << 1) - mask; |
99 | | } |
100 | | |
101 | | res.d0 = n0; |
102 | | res.d1 = q; |
103 | | return res; |
104 | | } |
105 | | |
106 | | #elif HGCD2_DIV1_METHOD == 3 |
107 | | |
108 | | static inline mp_double_limb_t |
109 | | div1 (mp_limb_t n0, mp_limb_t d0) |
110 | 610k | { |
111 | 610k | mp_double_limb_t res; |
112 | 610k | if (UNLIKELY ((d0 >> (GMP_LIMB_BITS - 3)) != 0) |
113 | 610k | || UNLIKELY (n0 >= (d0 << 3))) |
114 | 165k | { |
115 | 165k | res.d1 = n0 / d0; |
116 | 165k | res.d0 = n0 - res.d1 * d0; |
117 | 165k | } |
118 | 444k | else |
119 | 444k | { |
120 | 444k | mp_limb_t q, mask; |
121 | | |
122 | 444k | d0 <<= 2; |
123 | | |
124 | 444k | mask = -(mp_limb_t) (n0 >= d0); |
125 | 444k | n0 -= d0 & mask; |
126 | 444k | q = 4 & mask; |
127 | | |
128 | 444k | d0 >>= 1; |
129 | 444k | mask = -(mp_limb_t) (n0 >= d0); |
130 | 444k | n0 -= d0 & mask; |
131 | 444k | q += 2 & mask; |
132 | | |
133 | 444k | d0 >>= 1; |
134 | 444k | mask = -(mp_limb_t) (n0 >= d0); |
135 | 444k | n0 -= d0 & mask; |
136 | 444k | q -= mask; |
137 | | |
138 | 444k | res.d0 = n0; |
139 | 444k | res.d1 = q; |
140 | 444k | } |
141 | 610k | return res; |
142 | 610k | } |
143 | | |
144 | | #elif HGCD2_DIV1_METHOD == 4 |
145 | | |
146 | | /* Table quotients. We extract the NBITS most significant bits of the |
147 | | numerator limb, and the corresponding bits from the divisor limb, and use |
148 | | these to form an index into the table. This method is probably only useful |
149 | | for short pipelines with slow multiplication. |
150 | | |
151 | | Possible improvements: |
152 | | |
153 | | * Perhaps extract the highest NBITS of the divisor instead of the same bits |
154 | | as from the numerator. That would require another count_leading_zeros, |
155 | | and a post-multiply shift of the quotient. |
156 | | |
157 | | * Compress tables? Their values are tiny, and there are lots of zero |
158 | | entries (which are never used). |
159 | | |
160 | | * Round the table entries more cleverly? |
161 | | */ |
162 | | |
163 | | #ifndef NBITS |
164 | | #define NBITS 5 |
165 | | #endif |
166 | | |
167 | | #if NBITS == 5 |
168 | | /* This needs full division about 13.2% of the time. */ |
169 | | static const unsigned char tab[512] = { |
170 | | 17, 9, 5,4,3,2,2,2,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
171 | | 18, 9, 6,4,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
172 | | 19,10, 6,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0, |
173 | | 20,10, 6,5,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0, |
174 | | 21,11, 7,5,4,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0, |
175 | | 22,11, 7,5,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0, |
176 | | 23,12, 7,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0, |
177 | | 24,12, 8,6,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0, |
178 | | 25,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0, |
179 | | 26,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0, |
180 | | 27,14, 9,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, |
181 | | 28,14, 9,7,5,4,3,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0, |
182 | | 29,15,10,7,5,4,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0, |
183 | | 30,15,10,7,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0, |
184 | | 31,16,10,7,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0, |
185 | | 32,16,11,8,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 |
186 | | }; |
187 | | #elif NBITS == 6 |
188 | | /* This needs full division about 9.8% of the time. */ |
189 | | static const unsigned char tab[2048] = { |
190 | | 33,17,11, 8, 6, 5,4,4,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
191 | | 1, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
192 | | 34,17,11, 8, 6, 5,4,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
193 | | 1, 1, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
194 | | 35,18,12, 9, 7, 5,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
195 | | 1, 1, 1, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
196 | | 36,18,12, 9, 7, 6,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
197 | | 1, 1, 1, 1, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
198 | | 37,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
199 | | 1, 1, 1, 1, 1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
200 | | 38,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
201 | | 1, 1, 1, 1, 1, 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
202 | | 39,20,13,10, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1, |
203 | | 1, 1, 1, 1, 1, 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
204 | | 40,20,14,10, 8, 6,5,5,4,3,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1, |
205 | | 1, 1, 1, 1, 1, 1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
206 | | 41,21,14,10, 8, 6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1, |
207 | | 1, 1, 1, 1, 1, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
208 | | 42,21,14,10, 8, 7,6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1, |
209 | | 1, 1, 1, 1, 1, 1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
210 | | 43,22,15,11, 8, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1, |
211 | | 1, 1, 1, 1, 1, 1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
212 | | 44,22,15,11, 9, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1, |
213 | | 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
214 | | 45,23,15,11, 9, 7,6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1, |
215 | | 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
216 | | 46,23,16,11, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1, |
217 | | 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
218 | | 47,24,16,12, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1, |
219 | | 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
220 | | 48,24,16,12, 9, 8,6,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1, |
221 | | 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
222 | | 49,25,17,12,10, 8,7,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1, |
223 | | 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
224 | | 50,25,17,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1, |
225 | | 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
226 | | 51,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1, |
227 | | 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0, |
228 | | 52,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1, |
229 | | 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0, |
230 | | 53,27,18,13,10, 9,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1, |
231 | | 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0, |
232 | | 54,27,19,14,11, 9,7,6,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1, |
233 | | 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0, |
234 | | 55,28,19,14,11, 9,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1, |
235 | | 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0, |
236 | | 56,28,19,14,11, 9,8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1, |
237 | | 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0, |
238 | | 57,29,20,14,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,1, |
239 | | 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0, |
240 | | 58,29,20,15,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1, |
241 | | 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0, |
242 | | 59,30,20,15,12,10,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1, |
243 | | 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, |
244 | | 60,30,21,15,12,10,8,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1, |
245 | | 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0, |
246 | | 61,31,21,15,12,10,8,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1, |
247 | | 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0, |
248 | | 62,31,22,16,12,10,9,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1, |
249 | | 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0, |
250 | | 63,32,22,16,13,10,9,7,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,1, |
251 | | 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0, |
252 | | 64,32,22,16,13,10,9,8,7,6,5,5,4,4,4,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1, |
253 | | 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 |
254 | | }; |
255 | | #else |
256 | | #error No table for provided NBITS |
257 | | #endif |
258 | | |
259 | | /* Doing tabp with a #define makes compiler warnings about pointing outside an |
260 | | object go away. We used to define this as a variable. It is not clear if |
261 | | e.g. (vector[100] - 10) + 10 is well- defined as per the C standard; |
262 | | (vector[100] + 10) - 10 surely is and there is no sequence point so the |
263 | | expressions should be equivalent. To make this safe, we might want to |
264 | | define tabp as a macro with the index as an argument. Depending on the |
265 | | platform, relocs might allow for assembly-time or linker-time resolution to |
266 | | take place. */ |
267 | | #define tabp (tab - (1 << (NBITS - 1) << NBITS)) |
268 | | |
269 | | static inline mp_double_limb_t |
270 | | div1 (mp_limb_t n0, mp_limb_t d0) |
271 | | { |
272 | | int ncnt; |
273 | | size_t nbi, dbi; |
274 | | mp_limb_t q0; |
275 | | mp_limb_t r0; |
276 | | mp_limb_t mask; |
277 | | mp_double_limb_t res; |
278 | | |
279 | | ASSERT (n0 >= d0); /* Actually only msb position is critical. */ |
280 | | |
281 | | count_leading_zeros (ncnt, n0); |
282 | | nbi = n0 << ncnt >> (GMP_LIMB_BITS - NBITS); |
283 | | dbi = d0 << ncnt >> (GMP_LIMB_BITS - NBITS); |
284 | | |
285 | | q0 = tabp[(nbi << NBITS) + dbi]; |
286 | | r0 = n0 - q0 * d0; |
287 | | mask = -(mp_limb_t) (r0 >= d0); |
288 | | q0 -= mask; |
289 | | r0 -= d0 & mask; |
290 | | |
291 | | if (UNLIKELY (r0 >= d0)) |
292 | | { |
293 | | q0 = n0 / d0; |
294 | | r0 = n0 - q0 * d0; |
295 | | } |
296 | | |
297 | | res.d1 = q0; |
298 | | res.d0 = r0; |
299 | | return res; |
300 | | } |
301 | | |
302 | | #elif HGCD2_DIV1_METHOD == 5 |
303 | | |
304 | | /* Table inverses of divisors. We don't bother with suppressing the msb from |
305 | | the tables. We index with the NBITS most significant divisor bits, |
306 | | including the always-set highest bit, but use addressing trickery via tabp |
307 | | to suppress it. |
308 | | |
309 | | Possible improvements: |
310 | | |
311 | | * Do first multiply using 32-bit operations on 64-bit computers. At least |
312 | | on most Arm64 cores, that uses 3 times less resources. It also saves on |
313 | | many x86-64 processors. |
314 | | */ |
315 | | |
316 | | #ifndef NBITS |
317 | | #define NBITS 7 |
318 | | #endif |
319 | | |
320 | | #if NBITS == 5 |
321 | | /* This needs full division about 1.63% of the time. */ |
322 | | static const unsigned char tab[16] = { |
323 | | 63, 59, 55, 52, 50, 47, 45, 43, 41, 39, 38, 36, 35, 34, 33, 32 |
324 | | }; |
325 | | #elif NBITS == 6 |
326 | | /* This needs full division about 0.93% of the time. */ |
327 | | static const unsigned char tab[32] = { |
328 | | 127,123,119,116,112,109,106,104,101, 98, 96, 94, 92, 90, 88, 86, |
329 | | 84, 82, 80, 79, 77, 76, 74, 73, 72, 70, 69, 68, 67, 66, 65, 64 |
330 | | }; |
331 | | #elif NBITS == 7 |
332 | | /* This needs full division about 0.49% of the time. */ |
333 | | static const unsigned char tab[64] = { |
334 | | 255,251,247,243,239,236,233,229,226,223,220,217,214,211,209,206, |
335 | | 203,201,198,196,194,191,189,187,185,183,181,179,177,175,173,171, |
336 | | 169,167,166,164,162,161,159,158,156,155,153,152,150,149,147,146, |
337 | | 145,143,142,141,140,139,137,136,135,134,133,132,131,130,129,128 |
338 | | }; |
339 | | #elif NBITS == 8 |
340 | | /* This needs full division about 0.26% of the time. */ |
341 | | static const unsigned short tab[128] = { |
342 | | 511,507,503,499,495,491,488,484,480,477,473,470,467,463,460,457, |
343 | | 454,450,447,444,441,438,435,433,430,427,424,421,419,416,413,411, |
344 | | 408,406,403,401,398,396,393,391,389,386,384,382,380,377,375,373, |
345 | | 371,369,367,365,363,361,359,357,355,353,351,349,347,345,343,342, |
346 | | 340,338,336,335,333,331,329,328,326,325,323,321,320,318,317,315, |
347 | | 314,312,311,309,308,306,305,303,302,301,299,298,296,295,294,292, |
348 | | 291,290,288,287,286,285,283,282,281,280,279,277,276,275,274,273, |
349 | | 272,270,269,268,267,266,265,264,263,262,261,260,259,258,257,256 |
350 | | }; |
351 | | #else |
352 | | #error No table for provided NBITS |
353 | | #endif |
354 | | |
355 | | /* Doing tabp with a #define makes compiler warnings about pointing outside an |
356 | | object go away. We used to define this as a variable. It is not clear if |
357 | | e.g. (vector[100] - 10) + 10 is well- defined as per the C standard; |
358 | | (vector[100] + 10) - 10 surely is and there is no sequence point so the |
359 | | expressions should be equivalent. To make this safe, we might want to |
360 | | define tabp as a macro with the index as an argument. Depending on the |
361 | | platform, relocs might allow for assembly-time or linker-time resolution to |
362 | | take place. */ |
363 | | #define tabp (tab - (1 << (NBITS - 1))) |
364 | | |
365 | | static inline mp_double_limb_t |
366 | | div1 (mp_limb_t n0, mp_limb_t d0) |
367 | | { |
368 | | int ncnt, dcnt; |
369 | | size_t dbi; |
370 | | mp_limb_t inv; |
371 | | mp_limb_t q0; |
372 | | mp_limb_t r0; |
373 | | mp_limb_t mask; |
374 | | mp_double_limb_t res; |
375 | | |
376 | | count_leading_zeros (ncnt, n0); |
377 | | count_leading_zeros (dcnt, d0); |
378 | | |
379 | | dbi = d0 << dcnt >> (GMP_LIMB_BITS - NBITS); |
380 | | inv = tabp[dbi]; |
381 | | q0 = ((n0 << ncnt) >> (NBITS + 1)) * inv >> (GMP_LIMB_BITS - 1 + ncnt - dcnt); |
382 | | r0 = n0 - q0 * d0; |
383 | | mask = -(mp_limb_t) (r0 >= d0); |
384 | | q0 -= mask; |
385 | | r0 -= d0 & mask; |
386 | | |
387 | | if (UNLIKELY (r0 >= d0)) |
388 | | { |
389 | | q0 = n0 / d0; |
390 | | r0 = n0 - q0 * d0; |
391 | | } |
392 | | |
393 | | res.d1 = q0; |
394 | | res.d0 = r0; |
395 | | return res; |
396 | | } |
397 | | |
398 | | #else |
399 | | #error Unknown HGCD2_DIV1_METHOD |
400 | | #endif |
401 | | |
402 | | #if HAVE_NATIVE_mpn_div_22 |
403 | | |
404 | | #define div2 mpn_div_22 |
405 | | /* Two-limb division optimized for small quotients. */ |
406 | | mp_limb_t div2 (mp_ptr, mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t); |
407 | | |
408 | | #elif HGCD2_DIV2_METHOD == 1 |
409 | | |
410 | | static mp_limb_t |
411 | | div2 (mp_ptr rp, |
412 | | mp_limb_t n1, mp_limb_t n0, |
413 | | mp_limb_t d1, mp_limb_t d0) |
414 | | { |
415 | | mp_double_limb_t rq = div1 (n1, d1); |
416 | | if (UNLIKELY (rq.d1 > d1)) |
417 | | { |
418 | | mp_limb_t n2, q, t1, t0; |
419 | | int c; |
420 | | |
421 | | /* Normalize */ |
422 | | count_leading_zeros (c, d1); |
423 | | ASSERT (c > 0); |
424 | | |
425 | | n2 = n1 >> (GMP_LIMB_BITS - c); |
426 | | n1 = (n1 << c) | (n0 >> (GMP_LIMB_BITS - c)); |
427 | | n0 <<= c; |
428 | | d1 = (d1 << c) | (d0 >> (GMP_LIMB_BITS - c)); |
429 | | d0 <<= c; |
430 | | |
431 | | udiv_qrnnd (q, n1, n2, n1, d1); |
432 | | umul_ppmm (t1, t0, q, d0); |
433 | | if (t1 > n1 || (t1 == n1 && t0 > n0)) |
434 | | { |
435 | | ASSERT (q > 0); |
436 | | q--; |
437 | | sub_ddmmss (t1, t0, t1, t0, d1, d0); |
438 | | } |
439 | | sub_ddmmss (n1, n0, n1, n0, t1, t0); |
440 | | |
441 | | /* Undo normalization */ |
442 | | rp[0] = (n0 >> c) | (n1 << (GMP_LIMB_BITS - c)); |
443 | | rp[1] = n1 >> c; |
444 | | |
445 | | return q; |
446 | | } |
447 | | else |
448 | | { |
449 | | mp_limb_t q, t1, t0; |
450 | | n1 = rq.d0; |
451 | | q = rq.d1; |
452 | | umul_ppmm (t1, t0, q, d0); |
453 | | if (UNLIKELY (t1 >= n1) && (t1 > n1 || t0 > n0)) |
454 | | { |
455 | | ASSERT (q > 0); |
456 | | q--; |
457 | | sub_ddmmss (t1, t0, t1, t0, d1, d0); |
458 | | } |
459 | | sub_ddmmss (rp[1], rp[0], n1, n0, t1, t0); |
460 | | return q; |
461 | | } |
462 | | } |
463 | | |
464 | | #elif HGCD2_DIV2_METHOD == 2 |
465 | | |
466 | | /* Bit-wise div2. Relies on fast count_leading_zeros. */ |
467 | | static mp_limb_t |
468 | | div2 (mp_ptr rp, |
469 | | mp_limb_t n1, mp_limb_t n0, |
470 | | mp_limb_t d1, mp_limb_t d0) |
471 | 701k | { |
472 | 701k | mp_limb_t q = 0; |
473 | 701k | int ncnt; |
474 | 701k | int dcnt; |
475 | | |
476 | 701k | count_leading_zeros (ncnt, n1); |
477 | 701k | count_leading_zeros (dcnt, d1); |
478 | 701k | dcnt -= ncnt; |
479 | | |
480 | 701k | d1 = (d1 << dcnt) + (d0 >> 1 >> (GMP_LIMB_BITS - 1 - dcnt)); |
481 | 701k | d0 <<= dcnt; |
482 | | |
483 | 701k | do |
484 | 2.34M | { |
485 | 2.34M | mp_limb_t mask; |
486 | 2.34M | q <<= 1; |
487 | 2.34M | if (UNLIKELY (n1 == d1)) |
488 | 1.54k | mask = -(n0 >= d0); |
489 | 2.34M | else |
490 | 2.34M | mask = -(n1 > d1); |
491 | | |
492 | 2.34M | q -= mask; |
493 | | |
494 | 2.34M | sub_ddmmss (n1, n0, n1, n0, mask & d1, mask & d0); |
495 | | |
496 | 2.34M | d0 = (d1 << (GMP_LIMB_BITS - 1)) | (d0 >> 1); |
497 | 2.34M | d1 = d1 >> 1; |
498 | 2.34M | } |
499 | 2.34M | while (dcnt--); |
500 | | |
501 | 701k | rp[0] = n0; |
502 | 701k | rp[1] = n1; |
503 | | |
504 | 701k | return q; |
505 | 701k | } |
506 | | #else |
507 | | #error Unknown HGCD2_DIV2_METHOD |
508 | | #endif |
509 | | |
510 | | /* Reduces a,b until |a-b| (almost) fits in one limb + 1 bit. Constructs |
511 | | matrix M. Returns 1 if we make progress, i.e. can perform at least |
512 | | one subtraction. Otherwise returns zero. */ |
513 | | |
514 | | /* FIXME: Possible optimizations: |
515 | | |
516 | | The div2 function starts with checking the most significant bit of |
517 | | the numerator. We can maintained normalized operands here, call |
518 | | hgcd with normalized operands only, which should make the code |
519 | | simpler and possibly faster. |
520 | | |
521 | | Experiment with table lookups on the most significant bits. |
522 | | |
523 | | This function is also a candidate for assembler implementation. |
524 | | */ |
525 | | int |
526 | | mpn_hgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl, |
527 | | struct hgcd_matrix1 *M) |
528 | 69.6k | { |
529 | 69.6k | mp_limb_t u00, u01, u10, u11; |
530 | | |
531 | 69.6k | if (ah < 2 || bh < 2) |
532 | 487 | return 0; |
533 | | |
534 | 69.2k | if (ah > bh || (ah == bh && al > bl)) |
535 | 33.6k | { |
536 | 33.6k | sub_ddmmss (ah, al, ah, al, bh, bl); |
537 | 33.6k | if (ah < 2) |
538 | 892 | return 0; |
539 | | |
540 | 32.7k | u00 = u01 = u11 = 1; |
541 | 32.7k | u10 = 0; |
542 | 32.7k | } |
543 | 35.5k | else |
544 | 35.5k | { |
545 | 35.5k | sub_ddmmss (bh, bl, bh, bl, ah, al); |
546 | 35.5k | if (bh < 2) |
547 | 2.11k | return 0; |
548 | | |
549 | 33.4k | u00 = u10 = u11 = 1; |
550 | 33.4k | u01 = 0; |
551 | 33.4k | } |
552 | | |
553 | 66.1k | if (ah < bh) |
554 | 33.2k | goto subtract_a; |
555 | | |
556 | 32.9k | for (;;) |
557 | 615k | { |
558 | 615k | ASSERT (ah >= bh); |
559 | 615k | if (ah == bh) |
560 | 738 | goto done; |
561 | | |
562 | 614k | if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2))) |
563 | 31.9k | { |
564 | 31.9k | ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2)); |
565 | 31.9k | bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2)); |
566 | | |
567 | 31.9k | break; |
568 | 31.9k | } |
569 | | |
570 | | /* Subtract a -= q b, and multiply M from the right by (1 q ; 0 |
571 | | 1), affecting the second column of M. */ |
572 | 582k | ASSERT (ah > bh); |
573 | 582k | sub_ddmmss (ah, al, ah, al, bh, bl); |
574 | | |
575 | 582k | if (ah < 2) |
576 | 31 | goto done; |
577 | | |
578 | 582k | if (ah <= bh) |
579 | 232k | { |
580 | | /* Use q = 1 */ |
581 | 232k | u01 += u00; |
582 | 232k | u11 += u10; |
583 | 232k | } |
584 | 349k | else |
585 | 349k | { |
586 | 349k | mp_limb_t r[2]; |
587 | 349k | mp_limb_t q = div2 (r, ah, al, bh, bl); |
588 | 349k | al = r[0]; ah = r[1]; |
589 | 349k | if (ah < 2) |
590 | 449 | { |
591 | | /* A is too small, but q is correct. */ |
592 | 449 | u01 += q * u00; |
593 | 449 | u11 += q * u10; |
594 | 449 | goto done; |
595 | 449 | } |
596 | 349k | q++; |
597 | 349k | u01 += q * u00; |
598 | 349k | u11 += q * u10; |
599 | 349k | } |
600 | 615k | subtract_a: |
601 | 615k | ASSERT (bh >= ah); |
602 | 615k | if (ah == bh) |
603 | 662 | goto done; |
604 | | |
605 | 614k | if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2))) |
606 | 31.8k | { |
607 | 31.8k | ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2)); |
608 | 31.8k | bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2)); |
609 | | |
610 | 31.8k | goto subtract_a1; |
611 | 31.8k | } |
612 | | |
613 | | /* Subtract b -= q a, and multiply M from the right by (1 0 ; q |
614 | | 1), affecting the first column of M. */ |
615 | 582k | sub_ddmmss (bh, bl, bh, bl, ah, al); |
616 | | |
617 | 582k | if (bh < 2) |
618 | 34 | goto done; |
619 | | |
620 | 582k | if (bh <= ah) |
621 | 231k | { |
622 | | /* Use q = 1 */ |
623 | 231k | u00 += u01; |
624 | 231k | u10 += u11; |
625 | 231k | } |
626 | 351k | else |
627 | 351k | { |
628 | 351k | mp_limb_t r[2]; |
629 | 351k | mp_limb_t q = div2 (r, bh, bl, ah, al); |
630 | 351k | bl = r[0]; bh = r[1]; |
631 | 351k | if (bh < 2) |
632 | 511 | { |
633 | | /* B is too small, but q is correct. */ |
634 | 511 | u00 += q * u01; |
635 | 511 | u10 += q * u11; |
636 | 511 | goto done; |
637 | 511 | } |
638 | 351k | q++; |
639 | 351k | u00 += q * u01; |
640 | 351k | u10 += q * u11; |
641 | 351k | } |
642 | 582k | } |
643 | | |
644 | | /* NOTE: Since we discard the least significant half limb, we don't get a |
645 | | truly maximal M (corresponding to |a - b| < 2^{GMP_LIMB_BITS +1}). */ |
646 | | /* Single precision loop */ |
647 | 31.9k | for (;;) |
648 | 537k | { |
649 | 537k | ASSERT (ah >= bh); |
650 | | |
651 | 537k | ah -= bh; |
652 | 537k | if (ah < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1))) |
653 | 14.5k | break; |
654 | | |
655 | 523k | if (ah <= bh) |
656 | 218k | { |
657 | | /* Use q = 1 */ |
658 | 218k | u01 += u00; |
659 | 218k | u11 += u10; |
660 | 218k | } |
661 | 304k | else |
662 | 304k | { |
663 | 304k | mp_double_limb_t rq = div1 (ah, bh); |
664 | 304k | mp_limb_t q = rq.d1; |
665 | 304k | ah = rq.d0; |
666 | | |
667 | 304k | if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1))) |
668 | 17.4k | { |
669 | | /* A is too small, but q is correct. */ |
670 | 17.4k | u01 += q * u00; |
671 | 17.4k | u11 += q * u10; |
672 | 17.4k | break; |
673 | 17.4k | } |
674 | 287k | q++; |
675 | 287k | u01 += q * u00; |
676 | 287k | u11 += q * u10; |
677 | 287k | } |
678 | 537k | subtract_a1: |
679 | 537k | ASSERT (bh >= ah); |
680 | | |
681 | 537k | bh -= ah; |
682 | 537k | if (bh < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1))) |
683 | 14.4k | break; |
684 | | |
685 | 523k | if (bh <= ah) |
686 | 218k | { |
687 | | /* Use q = 1 */ |
688 | 218k | u00 += u01; |
689 | 218k | u10 += u11; |
690 | 218k | } |
691 | 305k | else |
692 | 305k | { |
693 | 305k | mp_double_limb_t rq = div1 (bh, ah); |
694 | 305k | mp_limb_t q = rq.d1; |
695 | 305k | bh = rq.d0; |
696 | | |
697 | 305k | if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1))) |
698 | 17.4k | { |
699 | | /* B is too small, but q is correct. */ |
700 | 17.4k | u00 += q * u01; |
701 | 17.4k | u10 += q * u11; |
702 | 17.4k | break; |
703 | 17.4k | } |
704 | 287k | q++; |
705 | 287k | u00 += q * u01; |
706 | 287k | u10 += q * u11; |
707 | 287k | } |
708 | 523k | } |
709 | | |
710 | 66.1k | done: |
711 | 66.1k | M->u[0][0] = u00; M->u[0][1] = u01; |
712 | 66.1k | M->u[1][0] = u10; M->u[1][1] = u11; |
713 | | |
714 | 66.1k | return 1; |
715 | 31.9k | } |
716 | | |
717 | | /* Sets (r;b) = (a;b) M, with M = (u00, u01; u10, u11). Vector must |
718 | | * have space for n + 1 limbs. Uses three buffers to avoid a copy*/ |
719 | | mp_size_t |
720 | | mpn_hgcd_mul_matrix1_vector (const struct hgcd_matrix1 *M, |
721 | | mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n) |
722 | 59.8k | { |
723 | 59.8k | mp_limb_t ah, bh; |
724 | | |
725 | | /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as |
726 | | |
727 | | r = u00 * a |
728 | | r += u10 * b |
729 | | b *= u11 |
730 | | b += u01 * a |
731 | | */ |
732 | | |
733 | 59.8k | #if HAVE_NATIVE_mpn_addaddmul_1msb0 |
734 | 59.8k | ah = mpn_addaddmul_1msb0 (rp, ap, bp, n, M->u[0][0], M->u[1][0]); |
735 | 59.8k | bh = mpn_addaddmul_1msb0 (bp, bp, ap, n, M->u[1][1], M->u[0][1]); |
736 | | #else |
737 | | ah = mpn_mul_1 (rp, ap, n, M->u[0][0]); |
738 | | ah += mpn_addmul_1 (rp, bp, n, M->u[1][0]); |
739 | | |
740 | | bh = mpn_mul_1 (bp, bp, n, M->u[1][1]); |
741 | | bh += mpn_addmul_1 (bp, ap, n, M->u[0][1]); |
742 | | #endif |
743 | 59.8k | rp[n] = ah; |
744 | 59.8k | bp[n] = bh; |
745 | | |
746 | 59.8k | n += (ah | bh) > 0; |
747 | 59.8k | return n; |
748 | 59.8k | } |