/src/gmp-6.2.1/mpn/hgcd2.c

Line	Count	Source
1		/* hgcd2.c
2
3		THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
4		SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
5		GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
6
7		Copyright 1996, 1998, 2000-2004, 2008, 2012, 2019 Free Software Foundation,
8		Inc.
9
10		This file is part of the GNU MP Library.
11
12		The GNU MP Library is free software; you can redistribute it and/or modify
13		it under the terms of either:
14
15		* the GNU Lesser General Public License as published by the Free
16		Software Foundation; either version 3 of the License, or (at your
17		option) any later version.
18
19		or
20
21		* the GNU General Public License as published by the Free Software
22		Foundation; either version 2 of the License, or (at your option) any
23		later version.
24
25		or both in parallel, as here.
26
27		The GNU MP Library is distributed in the hope that it will be useful, but
28		WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
29		or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
30		for more details.
31
32		You should have received copies of the GNU General Public License and the
33		GNU Lesser General Public License along with the GNU MP Library. If not,
34		see https://www.gnu.org/licenses/. */
35
36		#include "gmp-impl.h"
37		#include "longlong.h"
38
39		#ifndef HGCD2_DIV1_METHOD
40		#define HGCD2_DIV1_METHOD 3
41		#endif
42
43		#ifndef HGCD2_DIV2_METHOD
44		#define HGCD2_DIV2_METHOD 2
45		#endif
46
47		#if GMP_NAIL_BITS != 0
48		#error Nails not implemented
49		#endif
50
51		#if HAVE_NATIVE_mpn_div_11
52
53		#define div1 mpn_div_11
54		/* Single-limb division optimized for small quotients.
55		Returned value holds d0 = r, d1 = q. */
56		mp_double_limb_t div1 (mp_limb_t, mp_limb_t);
57
58		#elif HGCD2_DIV1_METHOD == 1
59
60		static inline mp_double_limb_t
61		div1 (mp_limb_t n0, mp_limb_t d0)
62		{
63		mp_double_limb_t res;
64		res.d1 = n0 / d0;
65		res.d0 = n0 - res.d1 * d0;
66
67		return res;
68		}
69
70		#elif HGCD2_DIV1_METHOD == 2
71
72		static mp_double_limb_t
73		div1 (mp_limb_t n0, mp_limb_t d0)
74		{
75		mp_double_limb_t res;
76		int ncnt, dcnt, cnt;
77		mp_limb_t q;
78		mp_limb_t mask;
79
80		ASSERT (n0 >= d0);
81
82		count_leading_zeros (ncnt, n0);
83		count_leading_zeros (dcnt, d0);
84		cnt = dcnt - ncnt;
85
86		d0 <<= cnt;
87
88		q = -(mp_limb_t) (n0 >= d0);
89		n0 -= d0 & q;
90		d0 >>= 1;
91		q = -q;
92
93		while (--cnt >= 0)
94		{
95		mask = -(mp_limb_t) (n0 >= d0);
96		n0 -= d0 & mask;
97		d0 >>= 1;
98		q = (q << 1) - mask;
99		}
100
101		res.d0 = n0;
102		res.d1 = q;
103		return res;
104		}
105
106		#elif HGCD2_DIV1_METHOD == 3
107
108		static inline mp_double_limb_t
109		div1 (mp_limb_t n0, mp_limb_t d0)
110	250k	{
111	250k	mp_double_limb_t res;
112	250k	if (UNLIKELY ((d0 >> (GMP_LIMB_BITS - 3)) != 0)
113	250k	\|\| UNLIKELY (n0 >= (d0 << 3)))
114	67.5k	{
115	67.5k	res.d1 = n0 / d0;
116	67.5k	res.d0 = n0 - res.d1 * d0;
117	67.5k	}
118	183k	else
119	183k	{
120	183k	mp_limb_t q, mask;
121
122	183k	d0 <<= 2;
123
124	183k	mask = -(mp_limb_t) (n0 >= d0);
125	183k	n0 -= d0 & mask;
126	183k	q = 4 & mask;
127
128	183k	d0 >>= 1;
129	183k	mask = -(mp_limb_t) (n0 >= d0);
130	183k	n0 -= d0 & mask;
131	183k	q += 2 & mask;
132
133	183k	d0 >>= 1;
134	183k	mask = -(mp_limb_t) (n0 >= d0);
135	183k	n0 -= d0 & mask;
136	183k	q -= mask;
137
138	183k	res.d0 = n0;
139	183k	res.d1 = q;
140	183k	}
141	250k	return res;
142	250k	}
143
144		#elif HGCD2_DIV1_METHOD == 4
145
146		/* Table quotients. We extract the NBITS most significant bits of the
147		numerator limb, and the corresponding bits from the divisor limb, and use
148		these to form an index into the table. This method is probably only useful
149		for short pipelines with slow multiplication.
150
151		Possible improvements:
152
153		* Perhaps extract the highest NBITS of the divisor instead of the same bits
154		as from the numerator. That would require another count_leading_zeros,
155		and a post-multiply shift of the quotient.
156
157		* Compress tables? Their values are tiny, and there are lots of zero
158		entries (which are never used).
159
160		* Round the table entries more cleverly?
161		*/
162
163		#ifndef NBITS
164		#define NBITS 5
165		#endif
166
167		#if NBITS == 5
168		/* This needs full division about 13.2% of the time. */
169		static const unsigned char tab[512] = {
170		17, 9, 5,4,3,2,2,2,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
171		18, 9, 6,4,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
172		19,10, 6,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
173		20,10, 6,5,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
174		21,11, 7,5,4,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
175		22,11, 7,5,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
176		23,12, 7,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
177		24,12, 8,6,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
178		25,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
179		26,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
180		27,14, 9,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
181		28,14, 9,7,5,4,3,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
182		29,15,10,7,5,4,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
183		30,15,10,7,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
184		31,16,10,7,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
185		32,16,11,8,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
186		};
187		#elif NBITS == 6
188		/* This needs full division about 9.8% of the time. */
189		static const unsigned char tab[2048] = {
190		33,17,11, 8, 6, 5,4,4,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
191		1, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
192		34,17,11, 8, 6, 5,4,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
193		1, 1, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
194		35,18,12, 9, 7, 5,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
195		1, 1, 1, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
196		36,18,12, 9, 7, 6,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
197		1, 1, 1, 1, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
198		37,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
199		1, 1, 1, 1, 1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
200		38,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
201		1, 1, 1, 1, 1, 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
202		39,20,13,10, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
203		1, 1, 1, 1, 1, 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
204		40,20,14,10, 8, 6,5,5,4,3,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
205		1, 1, 1, 1, 1, 1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
206		41,21,14,10, 8, 6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
207		1, 1, 1, 1, 1, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
208		42,21,14,10, 8, 7,6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
209		1, 1, 1, 1, 1, 1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
210		43,22,15,11, 8, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
211		1, 1, 1, 1, 1, 1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
212		44,22,15,11, 9, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
213		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
214		45,23,15,11, 9, 7,6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
215		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
216		46,23,16,11, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
217		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
218		47,24,16,12, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
219		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
220		48,24,16,12, 9, 8,6,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
221		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
222		49,25,17,12,10, 8,7,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
223		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
224		50,25,17,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
225		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
226		51,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
227		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
228		52,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
229		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
230		53,27,18,13,10, 9,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
231		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
232		54,27,19,14,11, 9,7,6,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
233		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
234		55,28,19,14,11, 9,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
235		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
236		56,28,19,14,11, 9,8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
237		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
238		57,29,20,14,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,1,
239		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
240		58,29,20,15,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,
241		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
242		59,30,20,15,12,10,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
243		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
244		60,30,21,15,12,10,8,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
245		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
246		61,31,21,15,12,10,8,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
247		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
248		62,31,22,16,12,10,9,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
249		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
250		63,32,22,16,13,10,9,7,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,1,
251		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
252		64,32,22,16,13,10,9,8,7,6,5,5,4,4,4,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,
253		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
254		};
255		#else
256		#error No table for provided NBITS
257		#endif
258
259		/* Doing tabp with a #define makes compiler warnings about pointing outside an
260		object go away. We used to define this as a variable. It is not clear if
261		e.g. (vector[100] - 10) + 10 is well- defined as per the C standard;
262		(vector[100] + 10) - 10 surely is and there is no sequence point so the
263		expressions should be equivalent. To make this safe, we might want to
264		define tabp as a macro with the index as an argument. Depending on the
265		platform, relocs might allow for assembly-time or linker-time resolution to
266		take place. */
267		#define tabp (tab - (1 << (NBITS - 1) << NBITS))
268
269		static inline mp_double_limb_t
270		div1 (mp_limb_t n0, mp_limb_t d0)
271		{
272		int ncnt;
273		size_t nbi, dbi;
274		mp_limb_t q0;
275		mp_limb_t r0;
276		mp_limb_t mask;
277		mp_double_limb_t res;
278
279		ASSERT (n0 >= d0); /* Actually only msb position is critical. */
280
281		count_leading_zeros (ncnt, n0);
282		nbi = n0 << ncnt >> (GMP_LIMB_BITS - NBITS);
283		dbi = d0 << ncnt >> (GMP_LIMB_BITS - NBITS);
284
285		q0 = tabp[(nbi << NBITS) + dbi];
286		r0 = n0 - q0 * d0;
287		mask = -(mp_limb_t) (r0 >= d0);
288		q0 -= mask;
289		r0 -= d0 & mask;
290
291		if (UNLIKELY (r0 >= d0))
292		{
293		q0 = n0 / d0;
294		r0 = n0 - q0 * d0;
295		}
296
297		res.d1 = q0;
298		res.d0 = r0;
299		return res;
300		}
301
302		#elif HGCD2_DIV1_METHOD == 5
303
304		/* Table inverses of divisors. We don't bother with suppressing the msb from
305		the tables. We index with the NBITS most significant divisor bits,
306		including the always-set highest bit, but use addressing trickery via tabp
307		to suppress it.
308
309		Possible improvements:
310
311		* Do first multiply using 32-bit operations on 64-bit computers. At least
312		on most Arm64 cores, that uses 3 times less resources. It also saves on
313		many x86-64 processors.
314		*/
315
316		#ifndef NBITS
317		#define NBITS 7
318		#endif
319
320		#if NBITS == 5
321		/* This needs full division about 1.63% of the time. */
322		static const unsigned char tab[16] = {
323		63, 59, 55, 52, 50, 47, 45, 43, 41, 39, 38, 36, 35, 34, 33, 32
324		};
325		#elif NBITS == 6
326		/* This needs full division about 0.93% of the time. */
327		static const unsigned char tab[32] = {
328		127,123,119,116,112,109,106,104,101, 98, 96, 94, 92, 90, 88, 86,
329		84, 82, 80, 79, 77, 76, 74, 73, 72, 70, 69, 68, 67, 66, 65, 64
330		};
331		#elif NBITS == 7
332		/* This needs full division about 0.49% of the time. */
333		static const unsigned char tab[64] = {
334		255,251,247,243,239,236,233,229,226,223,220,217,214,211,209,206,
335		203,201,198,196,194,191,189,187,185,183,181,179,177,175,173,171,
336		169,167,166,164,162,161,159,158,156,155,153,152,150,149,147,146,
337		145,143,142,141,140,139,137,136,135,134,133,132,131,130,129,128
338		};
339		#elif NBITS == 8
340		/* This needs full division about 0.26% of the time. */
341		static const unsigned short tab[128] = {
342		511,507,503,499,495,491,488,484,480,477,473,470,467,463,460,457,
343		454,450,447,444,441,438,435,433,430,427,424,421,419,416,413,411,
344		408,406,403,401,398,396,393,391,389,386,384,382,380,377,375,373,
345		371,369,367,365,363,361,359,357,355,353,351,349,347,345,343,342,
346		340,338,336,335,333,331,329,328,326,325,323,321,320,318,317,315,
347		314,312,311,309,308,306,305,303,302,301,299,298,296,295,294,292,
348		291,290,288,287,286,285,283,282,281,280,279,277,276,275,274,273,
349		272,270,269,268,267,266,265,264,263,262,261,260,259,258,257,256
350		};
351		#else
352		#error No table for provided NBITS
353		#endif
354
355		/* Doing tabp with a #define makes compiler warnings about pointing outside an
356		object go away. We used to define this as a variable. It is not clear if
357		e.g. (vector[100] - 10) + 10 is well- defined as per the C standard;
358		(vector[100] + 10) - 10 surely is and there is no sequence point so the
359		expressions should be equivalent. To make this safe, we might want to
360		define tabp as a macro with the index as an argument. Depending on the
361		platform, relocs might allow for assembly-time or linker-time resolution to
362		take place. */
363		#define tabp (tab - (1 << (NBITS - 1)))
364
365		static inline mp_double_limb_t
366		div1 (mp_limb_t n0, mp_limb_t d0)
367		{
368		int ncnt, dcnt;
369		size_t dbi;
370		mp_limb_t inv;
371		mp_limb_t q0;
372		mp_limb_t r0;
373		mp_limb_t mask;
374		mp_double_limb_t res;
375
376		count_leading_zeros (ncnt, n0);
377		count_leading_zeros (dcnt, d0);
378
379		dbi = d0 << dcnt >> (GMP_LIMB_BITS - NBITS);
380		inv = tabp[dbi];
381		q0 = ((n0 << ncnt) >> (NBITS + 1)) * inv >> (GMP_LIMB_BITS - 1 + ncnt - dcnt);
382		r0 = n0 - q0 * d0;
383		mask = -(mp_limb_t) (r0 >= d0);
384		q0 -= mask;
385		r0 -= d0 & mask;
386
387		if (UNLIKELY (r0 >= d0))
388		{
389		q0 = n0 / d0;
390		r0 = n0 - q0 * d0;
391		}
392
393		res.d1 = q0;
394		res.d0 = r0;
395		return res;
396		}
397
398		#else
399		#error Unknown HGCD2_DIV1_METHOD
400		#endif
401
402		#if HAVE_NATIVE_mpn_div_22
403
404		#define div2 mpn_div_22
405		/* Two-limb division optimized for small quotients. */
406		mp_limb_t div2 (mp_ptr, mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t);
407
408		#elif HGCD2_DIV2_METHOD == 1
409
410		static mp_limb_t
411		div2 (mp_ptr rp,
412		mp_limb_t n1, mp_limb_t n0,
413		mp_limb_t d1, mp_limb_t d0)
414		{
415		mp_double_limb_t rq = div1 (n1, d1);
416		if (UNLIKELY (rq.d1 > d1))
417		{
418		mp_limb_t n2, q, t1, t0;
419		int c;
420
421		/* Normalize */
422		count_leading_zeros (c, d1);
423		ASSERT (c > 0);
424
425		n2 = n1 >> (GMP_LIMB_BITS - c);
426		n1 = (n1 << c) \| (n0 >> (GMP_LIMB_BITS - c));
427		n0 <<= c;
428		d1 = (d1 << c) \| (d0 >> (GMP_LIMB_BITS - c));
429		d0 <<= c;
430
431		udiv_qrnnd (q, n1, n2, n1, d1);
432		umul_ppmm (t1, t0, q, d0);
433		if (t1 > n1 \|\| (t1 == n1 && t0 > n0))
434		{
435		ASSERT (q > 0);
436		q--;
437		sub_ddmmss (t1, t0, t1, t0, d1, d0);
438		}
439		sub_ddmmss (n1, n0, n1, n0, t1, t0);
440
441		/* Undo normalization */
442		rp[0] = (n0 >> c) \| (n1 << (GMP_LIMB_BITS - c));
443		rp[1] = n1 >> c;
444
445		return q;
446		}
447		else
448		{
449		mp_limb_t q, t1, t0;
450		n1 = rq.d0;
451		q = rq.d1;
452		umul_ppmm (t1, t0, q, d0);
453		if (UNLIKELY (t1 >= n1) && (t1 > n1 \|\| t0 > n0))
454		{
455		ASSERT (q > 0);
456		q--;
457		sub_ddmmss (t1, t0, t1, t0, d1, d0);
458		}
459		sub_ddmmss (rp[1], rp[0], n1, n0, t1, t0);
460		return q;
461		}
462		}
463
464		#elif HGCD2_DIV2_METHOD == 2
465
466		/* Bit-wise div2. Relies on fast count_leading_zeros. */
467		static mp_limb_t
468		div2 (mp_ptr rp,
469		mp_limb_t n1, mp_limb_t n0,
470		mp_limb_t d1, mp_limb_t d0)
471	292k	{
472	292k	mp_limb_t q = 0;
473	292k	int ncnt;
474	292k	int dcnt;
475
476	292k	count_leading_zeros (ncnt, n1);
477	292k	count_leading_zeros (dcnt, d1);
478	292k	dcnt -= ncnt;
479
480	292k	d1 = (d1 << dcnt) + (d0 >> 1 >> (GMP_LIMB_BITS - 1 - dcnt));
481	292k	d0 <<= dcnt;
482
483	292k	do
484	998k	{
485	998k	mp_limb_t mask;
486	998k	q <<= 1;
487	998k	if (UNLIKELY (n1 == d1))
488	1.06k	mask = -(n0 >= d0);
489	997k	else
490	997k	mask = -(n1 > d1);
491
492	998k	q -= mask;
493
494	998k	sub_ddmmss (n1, n0, n1, n0, mask & d1, mask & d0);
495
496	998k	d0 = (d1 << (GMP_LIMB_BITS - 1)) \| (d0 >> 1);
497	998k	d1 = d1 >> 1;
498	998k	}
499	998k	while (dcnt--);
500
501	292k	rp[0] = n0;
502	292k	rp[1] = n1;
503
504	292k	return q;
505	292k	}
506		#else
507		#error Unknown HGCD2_DIV2_METHOD
508		#endif
509
510		/* Reduces a,b until \|a-b\| (almost) fits in one limb + 1 bit. Constructs
511		matrix M. Returns 1 if we make progress, i.e. can perform at least
512		one subtraction. Otherwise returns zero. */
513
514		/* FIXME: Possible optimizations:
515
516		The div2 function starts with checking the most significant bit of
517		the numerator. We can maintained normalized operands here, call
518		hgcd with normalized operands only, which should make the code
519		simpler and possibly faster.
520
521		Experiment with table lookups on the most significant bits.
522
523		This function is also a candidate for assembler implementation.
524		*/
525		int
526		mpn_hgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl,
527		struct hgcd_matrix1 *M)
528	30.4k	{
529	30.4k	mp_limb_t u00, u01, u10, u11;
530
531	30.4k	if (ah < 2 \|\| bh < 2)
532	395	return 0;
533
534	30.0k	if (ah > bh \|\| (ah == bh && al > bl))
535	14.3k	{
536	14.3k	sub_ddmmss (ah, al, ah, al, bh, bl);
537	14.3k	if (ah < 2)
538	671	return 0;
539
540	13.6k	u00 = u01 = u11 = 1;
541	13.6k	u10 = 0;
542	13.6k	}
543	15.7k	else
544	15.7k	{
545	15.7k	sub_ddmmss (bh, bl, bh, bl, ah, al);
546	15.7k	if (bh < 2)
547	1.27k	return 0;
548
549	14.4k	u00 = u10 = u11 = 1;
550	14.4k	u01 = 0;
551	14.4k	}
552
553	28.0k	if (ah < bh)
554	13.9k	goto subtract_a;
555
556	14.1k	for (;;)
557	257k	{
558	257k	ASSERT (ah >= bh);
559	257k	if (ah == bh)
560	376	goto done;
561
562	257k	if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
563	13.1k	{
564	13.1k	ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
565	13.1k	bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));
566
567	13.1k	break;
568	13.1k	}
569
570		/* Subtract a -= q b, and multiply M from the right by (1 q ; 0
571		1), affecting the second column of M. */
572	243k	ASSERT (ah > bh);
573	243k	sub_ddmmss (ah, al, ah, al, bh, bl);
574
575	243k	if (ah < 2)
576	27	goto done;
577
578	243k	if (ah <= bh)
579	97.5k	{
580		/* Use q = 1 */
581	97.5k	u01 += u00;
582	97.5k	u11 += u10;
583	97.5k	}
584	146k	else
585	146k	{
586	146k	mp_limb_t r[2];
587	146k	mp_limb_t q = div2 (r, ah, al, bh, bl);
588	146k	al = r[0]; ah = r[1];
589	146k	if (ah < 2)
590	332	{
591		/* A is too small, but q is correct. */
592	332	u01 += q * u00;
593	332	u11 += q * u10;
594	332	goto done;
595	332	}
596	146k	q++;
597	146k	u01 += q * u00;
598	146k	u11 += q * u10;
599	146k	}
600	257k	subtract_a:
601	257k	ASSERT (bh >= ah);
602	257k	if (ah == bh)
603	407	goto done;
604
605	257k	if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
606	13.4k	{
607	13.4k	ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
608	13.4k	bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));
609
610	13.4k	goto subtract_a1;
611	13.4k	}
612
613		/* Subtract b -= q a, and multiply M from the right by (1 0 ; q
614		1), affecting the first column of M. */
615	243k	sub_ddmmss (bh, bl, bh, bl, ah, al);
616
617	243k	if (bh < 2)
618	23	goto done;
619
620	243k	if (bh <= ah)
621	97.1k	{
622		/* Use q = 1 */
623	97.1k	u00 += u01;
624	97.1k	u10 += u11;
625	97.1k	}
626	146k	else
627	146k	{
628	146k	mp_limb_t r[2];
629	146k	mp_limb_t q = div2 (r, bh, bl, ah, al);
630	146k	bl = r[0]; bh = r[1];
631	146k	if (bh < 2)
632	335	{
633		/* B is too small, but q is correct. */
634	335	u00 += q * u01;
635	335	u10 += q * u11;
636	335	goto done;
637	335	}
638	146k	q++;
639	146k	u00 += q * u01;
640	146k	u10 += q * u11;
641	146k	}
642	243k	}
643
644		/* NOTE: Since we discard the least significant half limb, we don't get a
645		truly maximal M (corresponding to \|a - b\| < 2^{GMP_LIMB_BITS +1}). */
646		/* Single precision loop */
647	13.1k	for (;;)
648	220k	{
649	220k	ASSERT (ah >= bh);
650
651	220k	ah -= bh;
652	220k	if (ah < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
653	6.00k	break;
654
655	214k	if (ah <= bh)
656	89.0k	{
657		/* Use q = 1 */
658	89.0k	u01 += u00;
659	89.0k	u11 += u10;
660	89.0k	}
661	125k	else
662	125k	{
663	125k	mp_double_limb_t rq = div1 (ah, bh);
664	125k	mp_limb_t q = rq.d1;
665	125k	ah = rq.d0;
666
667	125k	if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
668	7.23k	{
669		/* A is too small, but q is correct. */
670	7.23k	u01 += q * u00;
671	7.23k	u11 += q * u10;
672	7.23k	break;
673	7.23k	}
674	118k	q++;
675	118k	u01 += q * u00;
676	118k	u11 += q * u10;
677	118k	}
678	220k	subtract_a1:
679	220k	ASSERT (bh >= ah);
680
681	220k	bh -= ah;
682	220k	if (bh < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
683	6.10k	break;
684
685	214k	if (bh <= ah)
686	89.9k	{
687		/* Use q = 1 */
688	89.9k	u00 += u01;
689	89.9k	u10 += u11;
690	89.9k	}
691	124k	else
692	124k	{
693	124k	mp_double_limb_t rq = div1 (bh, ah);
694	124k	mp_limb_t q = rq.d1;
695	124k	bh = rq.d0;
696
697	124k	if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
698	7.24k	{
699		/* B is too small, but q is correct. */
700	7.24k	u00 += q * u01;
701	7.24k	u10 += q * u11;
702	7.24k	break;
703	7.24k	}
704	117k	q++;
705	117k	u00 += q * u01;
706	117k	u10 += q * u11;
707	117k	}
708	214k	}
709
710	28.0k	done:
711	28.0k	M->u[0][0] = u00; M->u[0][1] = u01;
712	28.0k	M->u[1][0] = u10; M->u[1][1] = u11;
713
714	28.0k	return 1;
715	13.1k	}
716
717		/* Sets (r;b) = (a;b) M, with M = (u00, u01; u10, u11). Vector must
718		* have space for n + 1 limbs. Uses three buffers to avoid a copy*/
719		mp_size_t
720		mpn_hgcd_mul_matrix1_vector (const struct hgcd_matrix1 *M,
721		mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n)
722	25.1k	{
723	25.1k	mp_limb_t ah, bh;
724
725		/* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as
726
727		r = u00 * a
728		r += u10 * b
729		b *= u11
730		b += u01 * a
731		*/
732
733	25.1k	#if HAVE_NATIVE_mpn_addaddmul_1msb0
734	25.1k	ah = mpn_addaddmul_1msb0 (rp, ap, bp, n, M->u[0][0], M->u[1][0]);
735	25.1k	bh = mpn_addaddmul_1msb0 (bp, bp, ap, n, M->u[1][1], M->u[0][1]);
736		#else
737		ah = mpn_mul_1 (rp, ap, n, M->u[0][0]);
738		ah += mpn_addmul_1 (rp, bp, n, M->u[1][0]);
739
740		bh = mpn_mul_1 (bp, bp, n, M->u[1][1]);
741		bh += mpn_addmul_1 (bp, ap, n, M->u[0][1]);
742		#endif
743	25.1k	rp[n] = ah;
744	25.1k	bp[n] = bh;
745
746	25.1k	n += (ah \| bh) > 0;
747	25.1k	return n;
748	25.1k	}

Coverage Report

Created: 2025-03-09 06:52