/src/gmp-6.2.1/mpn/hgcd2.c

Line	Count	Source
1		/* hgcd2.c
2
3		THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
4		SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
5		GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
6
7		Copyright 1996, 1998, 2000-2004, 2008, 2012, 2019 Free Software Foundation,
8		Inc.
9
10		This file is part of the GNU MP Library.
11
12		The GNU MP Library is free software; you can redistribute it and/or modify
13		it under the terms of either:
14
15		* the GNU Lesser General Public License as published by the Free
16		Software Foundation; either version 3 of the License, or (at your
17		option) any later version.
18
19		or
20
21		* the GNU General Public License as published by the Free Software
22		Foundation; either version 2 of the License, or (at your option) any
23		later version.
24
25		or both in parallel, as here.
26
27		The GNU MP Library is distributed in the hope that it will be useful, but
28		WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
29		or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
30		for more details.
31
32		You should have received copies of the GNU General Public License and the
33		GNU Lesser General Public License along with the GNU MP Library. If not,
34		see https://www.gnu.org/licenses/. */
35
36		#include "gmp-impl.h"
37		#include "longlong.h"
38
39		#ifndef HGCD2_DIV1_METHOD
40		#define HGCD2_DIV1_METHOD 3
41		#endif
42
43		#ifndef HGCD2_DIV2_METHOD
44		#define HGCD2_DIV2_METHOD 2
45		#endif
46
47		#if GMP_NAIL_BITS != 0
48		#error Nails not implemented
49		#endif
50
51		#if HAVE_NATIVE_mpn_div_11
52
53		#define div1 mpn_div_11
54		/* Single-limb division optimized for small quotients.
55		Returned value holds d0 = r, d1 = q. */
56		mp_double_limb_t div1 (mp_limb_t, mp_limb_t);
57
58		#elif HGCD2_DIV1_METHOD == 1
59
60		static inline mp_double_limb_t
61		div1 (mp_limb_t n0, mp_limb_t d0)
62		{
63		mp_double_limb_t res;
64		res.d1 = n0 / d0;
65		res.d0 = n0 - res.d1 * d0;
66
67		return res;
68		}
69
70		#elif HGCD2_DIV1_METHOD == 2
71
72		static mp_double_limb_t
73		div1 (mp_limb_t n0, mp_limb_t d0)
74		{
75		mp_double_limb_t res;
76		int ncnt, dcnt, cnt;
77		mp_limb_t q;
78		mp_limb_t mask;
79
80		ASSERT (n0 >= d0);
81
82		count_leading_zeros (ncnt, n0);
83		count_leading_zeros (dcnt, d0);
84		cnt = dcnt - ncnt;
85
86		d0 <<= cnt;
87
88		q = -(mp_limb_t) (n0 >= d0);
89		n0 -= d0 & q;
90		d0 >>= 1;
91		q = -q;
92
93		while (--cnt >= 0)
94		{
95		mask = -(mp_limb_t) (n0 >= d0);
96		n0 -= d0 & mask;
97		d0 >>= 1;
98		q = (q << 1) - mask;
99		}
100
101		res.d0 = n0;
102		res.d1 = q;
103		return res;
104		}
105
106		#elif HGCD2_DIV1_METHOD == 3
107
108		static inline mp_double_limb_t
109		div1 (mp_limb_t n0, mp_limb_t d0)
110	695k	{
111	695k	mp_double_limb_t res;
112	695k	if (UNLIKELY ((d0 >> (GMP_LIMB_BITS - 3)) != 0)
113	695k	\|\| UNLIKELY (n0 >= (d0 << 3)))
114	188k	{
115	188k	res.d1 = n0 / d0;
116	188k	res.d0 = n0 - res.d1 * d0;
117	188k	}
118	507k	else
119	507k	{
120	507k	mp_limb_t q, mask;
121
122	507k	d0 <<= 2;
123
124	507k	mask = -(mp_limb_t) (n0 >= d0);
125	507k	n0 -= d0 & mask;
126	507k	q = 4 & mask;
127
128	507k	d0 >>= 1;
129	507k	mask = -(mp_limb_t) (n0 >= d0);
130	507k	n0 -= d0 & mask;
131	507k	q += 2 & mask;
132
133	507k	d0 >>= 1;
134	507k	mask = -(mp_limb_t) (n0 >= d0);
135	507k	n0 -= d0 & mask;
136	507k	q -= mask;
137
138	507k	res.d0 = n0;
139	507k	res.d1 = q;
140	507k	}
141	695k	return res;
142	695k	}
143
144		#elif HGCD2_DIV1_METHOD == 4
145
146		/* Table quotients. We extract the NBITS most significant bits of the
147		numerator limb, and the corresponding bits from the divisor limb, and use
148		these to form an index into the table. This method is probably only useful
149		for short pipelines with slow multiplication.
150
151		Possible improvements:
152
153		* Perhaps extract the highest NBITS of the divisor instead of the same bits
154		as from the numerator. That would require another count_leading_zeros,
155		and a post-multiply shift of the quotient.
156
157		* Compress tables? Their values are tiny, and there are lots of zero
158		entries (which are never used).
159
160		* Round the table entries more cleverly?
161		*/
162
163		#ifndef NBITS
164		#define NBITS 5
165		#endif
166
167		#if NBITS == 5
168		/* This needs full division about 13.2% of the time. */
169		static const unsigned char tab[512] = {
170		17, 9, 5,4,3,2,2,2,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
171		18, 9, 6,4,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
172		19,10, 6,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
173		20,10, 6,5,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
174		21,11, 7,5,4,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
175		22,11, 7,5,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
176		23,12, 7,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
177		24,12, 8,6,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
178		25,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
179		26,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
180		27,14, 9,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
181		28,14, 9,7,5,4,3,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
182		29,15,10,7,5,4,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
183		30,15,10,7,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
184		31,16,10,7,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
185		32,16,11,8,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
186		};
187		#elif NBITS == 6
188		/* This needs full division about 9.8% of the time. */
189		static const unsigned char tab[2048] = {
190		33,17,11, 8, 6, 5,4,4,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
191		1, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
192		34,17,11, 8, 6, 5,4,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
193		1, 1, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
194		35,18,12, 9, 7, 5,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
195		1, 1, 1, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
196		36,18,12, 9, 7, 6,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
197		1, 1, 1, 1, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
198		37,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
199		1, 1, 1, 1, 1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
200		38,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
201		1, 1, 1, 1, 1, 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
202		39,20,13,10, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
203		1, 1, 1, 1, 1, 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
204		40,20,14,10, 8, 6,5,5,4,3,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
205		1, 1, 1, 1, 1, 1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
206		41,21,14,10, 8, 6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
207		1, 1, 1, 1, 1, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
208		42,21,14,10, 8, 7,6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
209		1, 1, 1, 1, 1, 1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
210		43,22,15,11, 8, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
211		1, 1, 1, 1, 1, 1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
212		44,22,15,11, 9, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
213		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
214		45,23,15,11, 9, 7,6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
215		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
216		46,23,16,11, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
217		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
218		47,24,16,12, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
219		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
220		48,24,16,12, 9, 8,6,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
221		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
222		49,25,17,12,10, 8,7,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
223		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
224		50,25,17,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
225		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
226		51,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
227		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
228		52,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
229		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
230		53,27,18,13,10, 9,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
231		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
232		54,27,19,14,11, 9,7,6,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
233		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
234		55,28,19,14,11, 9,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
235		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
236		56,28,19,14,11, 9,8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
237		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
238		57,29,20,14,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,1,
239		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
240		58,29,20,15,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,
241		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
242		59,30,20,15,12,10,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
243		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
244		60,30,21,15,12,10,8,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
245		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
246		61,31,21,15,12,10,8,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
247		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
248		62,31,22,16,12,10,9,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
249		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
250		63,32,22,16,13,10,9,7,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,1,
251		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
252		64,32,22,16,13,10,9,8,7,6,5,5,4,4,4,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,
253		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
254		};
255		#else
256		#error No table for provided NBITS
257		#endif
258
259		/* Doing tabp with a #define makes compiler warnings about pointing outside an
260		object go away. We used to define this as a variable. It is not clear if
261		e.g. (vector[100] - 10) + 10 is well- defined as per the C standard;
262		(vector[100] + 10) - 10 surely is and there is no sequence point so the
263		expressions should be equivalent. To make this safe, we might want to
264		define tabp as a macro with the index as an argument. Depending on the
265		platform, relocs might allow for assembly-time or linker-time resolution to
266		take place. */
267		#define tabp (tab - (1 << (NBITS - 1) << NBITS))
268
269		static inline mp_double_limb_t
270		div1 (mp_limb_t n0, mp_limb_t d0)
271		{
272		int ncnt;
273		size_t nbi, dbi;
274		mp_limb_t q0;
275		mp_limb_t r0;
276		mp_limb_t mask;
277		mp_double_limb_t res;
278
279		ASSERT (n0 >= d0); /* Actually only msb position is critical. */
280
281		count_leading_zeros (ncnt, n0);
282		nbi = n0 << ncnt >> (GMP_LIMB_BITS - NBITS);
283		dbi = d0 << ncnt >> (GMP_LIMB_BITS - NBITS);
284
285		q0 = tabp[(nbi << NBITS) + dbi];
286		r0 = n0 - q0 * d0;
287		mask = -(mp_limb_t) (r0 >= d0);
288		q0 -= mask;
289		r0 -= d0 & mask;
290
291		if (UNLIKELY (r0 >= d0))
292		{
293		q0 = n0 / d0;
294		r0 = n0 - q0 * d0;
295		}
296
297		res.d1 = q0;
298		res.d0 = r0;
299		return res;
300		}
301
302		#elif HGCD2_DIV1_METHOD == 5
303
304		/* Table inverses of divisors. We don't bother with suppressing the msb from
305		the tables. We index with the NBITS most significant divisor bits,
306		including the always-set highest bit, but use addressing trickery via tabp
307		to suppress it.
308
309		Possible improvements:
310
311		* Do first multiply using 32-bit operations on 64-bit computers. At least
312		on most Arm64 cores, that uses 3 times less resources. It also saves on
313		many x86-64 processors.
314		*/
315
316		#ifndef NBITS
317		#define NBITS 7
318		#endif
319
320		#if NBITS == 5
321		/* This needs full division about 1.63% of the time. */
322		static const unsigned char tab[16] = {
323		63, 59, 55, 52, 50, 47, 45, 43, 41, 39, 38, 36, 35, 34, 33, 32
324		};
325		#elif NBITS == 6
326		/* This needs full division about 0.93% of the time. */
327		static const unsigned char tab[32] = {
328		127,123,119,116,112,109,106,104,101, 98, 96, 94, 92, 90, 88, 86,
329		84, 82, 80, 79, 77, 76, 74, 73, 72, 70, 69, 68, 67, 66, 65, 64
330		};
331		#elif NBITS == 7
332		/* This needs full division about 0.49% of the time. */
333		static const unsigned char tab[64] = {
334		255,251,247,243,239,236,233,229,226,223,220,217,214,211,209,206,
335		203,201,198,196,194,191,189,187,185,183,181,179,177,175,173,171,
336		169,167,166,164,162,161,159,158,156,155,153,152,150,149,147,146,
337		145,143,142,141,140,139,137,136,135,134,133,132,131,130,129,128
338		};
339		#elif NBITS == 8
340		/* This needs full division about 0.26% of the time. */
341		static const unsigned short tab[128] = {
342		511,507,503,499,495,491,488,484,480,477,473,470,467,463,460,457,
343		454,450,447,444,441,438,435,433,430,427,424,421,419,416,413,411,
344		408,406,403,401,398,396,393,391,389,386,384,382,380,377,375,373,
345		371,369,367,365,363,361,359,357,355,353,351,349,347,345,343,342,
346		340,338,336,335,333,331,329,328,326,325,323,321,320,318,317,315,
347		314,312,311,309,308,306,305,303,302,301,299,298,296,295,294,292,
348		291,290,288,287,286,285,283,282,281,280,279,277,276,275,274,273,
349		272,270,269,268,267,266,265,264,263,262,261,260,259,258,257,256
350		};
351		#else
352		#error No table for provided NBITS
353		#endif
354
355		/* Doing tabp with a #define makes compiler warnings about pointing outside an
356		object go away. We used to define this as a variable. It is not clear if
357		e.g. (vector[100] - 10) + 10 is well- defined as per the C standard;
358		(vector[100] + 10) - 10 surely is and there is no sequence point so the
359		expressions should be equivalent. To make this safe, we might want to
360		define tabp as a macro with the index as an argument. Depending on the
361		platform, relocs might allow for assembly-time or linker-time resolution to
362		take place. */
363		#define tabp (tab - (1 << (NBITS - 1)))
364
365		static inline mp_double_limb_t
366		div1 (mp_limb_t n0, mp_limb_t d0)
367		{
368		int ncnt, dcnt;
369		size_t dbi;
370		mp_limb_t inv;
371		mp_limb_t q0;
372		mp_limb_t r0;
373		mp_limb_t mask;
374		mp_double_limb_t res;
375
376		count_leading_zeros (ncnt, n0);
377		count_leading_zeros (dcnt, d0);
378
379		dbi = d0 << dcnt >> (GMP_LIMB_BITS - NBITS);
380		inv = tabp[dbi];
381		q0 = ((n0 << ncnt) >> (NBITS + 1)) * inv >> (GMP_LIMB_BITS - 1 + ncnt - dcnt);
382		r0 = n0 - q0 * d0;
383		mask = -(mp_limb_t) (r0 >= d0);
384		q0 -= mask;
385		r0 -= d0 & mask;
386
387		if (UNLIKELY (r0 >= d0))
388		{
389		q0 = n0 / d0;
390		r0 = n0 - q0 * d0;
391		}
392
393		res.d1 = q0;
394		res.d0 = r0;
395		return res;
396		}
397
398		#else
399		#error Unknown HGCD2_DIV1_METHOD
400		#endif
401
402		#if HAVE_NATIVE_mpn_div_22
403
404		#define div2 mpn_div_22
405		/* Two-limb division optimized for small quotients. */
406		mp_limb_t div2 (mp_ptr, mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t);
407
408		#elif HGCD2_DIV2_METHOD == 1
409
410		static mp_limb_t
411		div2 (mp_ptr rp,
412		mp_limb_t n1, mp_limb_t n0,
413		mp_limb_t d1, mp_limb_t d0)
414		{
415		mp_double_limb_t rq = div1 (n1, d1);
416		if (UNLIKELY (rq.d1 > d1))
417		{
418		mp_limb_t n2, q, t1, t0;
419		int c;
420
421		/* Normalize */
422		count_leading_zeros (c, d1);
423		ASSERT (c > 0);
424
425		n2 = n1 >> (GMP_LIMB_BITS - c);
426		n1 = (n1 << c) \| (n0 >> (GMP_LIMB_BITS - c));
427		n0 <<= c;
428		d1 = (d1 << c) \| (d0 >> (GMP_LIMB_BITS - c));
429		d0 <<= c;
430
431		udiv_qrnnd (q, n1, n2, n1, d1);
432		umul_ppmm (t1, t0, q, d0);
433		if (t1 > n1 \|\| (t1 == n1 && t0 > n0))
434		{
435		ASSERT (q > 0);
436		q--;
437		sub_ddmmss (t1, t0, t1, t0, d1, d0);
438		}
439		sub_ddmmss (n1, n0, n1, n0, t1, t0);
440
441		/* Undo normalization */
442		rp[0] = (n0 >> c) \| (n1 << (GMP_LIMB_BITS - c));
443		rp[1] = n1 >> c;
444
445		return q;
446		}
447		else
448		{
449		mp_limb_t q, t1, t0;
450		n1 = rq.d0;
451		q = rq.d1;
452		umul_ppmm (t1, t0, q, d0);
453		if (UNLIKELY (t1 >= n1) && (t1 > n1 \|\| t0 > n0))
454		{
455		ASSERT (q > 0);
456		q--;
457		sub_ddmmss (t1, t0, t1, t0, d1, d0);
458		}
459		sub_ddmmss (rp[1], rp[0], n1, n0, t1, t0);
460		return q;
461		}
462		}
463
464		#elif HGCD2_DIV2_METHOD == 2
465
466		/* Bit-wise div2. Relies on fast count_leading_zeros. */
467		static mp_limb_t
468		div2 (mp_ptr rp,
469		mp_limb_t n1, mp_limb_t n0,
470		mp_limb_t d1, mp_limb_t d0)
471	798k	{
472	798k	mp_limb_t q = 0;
473	798k	int ncnt;
474	798k	int dcnt;
475
476	798k	count_leading_zeros (ncnt, n1);
477	798k	count_leading_zeros (dcnt, d1);
478	798k	dcnt -= ncnt;
479
480	798k	d1 = (d1 << dcnt) + (d0 >> 1 >> (GMP_LIMB_BITS - 1 - dcnt));
481	798k	d0 <<= dcnt;
482
483	798k	do
484	2.65M	{
485	2.65M	mp_limb_t mask;
486	2.65M	q <<= 1;
487	2.65M	if (UNLIKELY (n1 == d1))
488	1.32k	mask = -(n0 >= d0);
489	2.65M	else
490	2.65M	mask = -(n1 > d1);
491
492	2.65M	q -= mask;
493
494	2.65M	sub_ddmmss (n1, n0, n1, n0, mask & d1, mask & d0);
495
496	2.65M	d0 = (d1 << (GMP_LIMB_BITS - 1)) \| (d0 >> 1);
497	2.65M	d1 = d1 >> 1;
498	2.65M	}
499	2.65M	while (dcnt--);
500
501	798k	rp[0] = n0;
502	798k	rp[1] = n1;
503
504	798k	return q;
505	798k	}
506		#else
507		#error Unknown HGCD2_DIV2_METHOD
508		#endif
509
510		/* Reduces a,b until \|a-b\| (almost) fits in one limb + 1 bit. Constructs
511		matrix M. Returns 1 if we make progress, i.e. can perform at least
512		one subtraction. Otherwise returns zero. */
513
514		/* FIXME: Possible optimizations:
515
516		The div2 function starts with checking the most significant bit of
517		the numerator. We can maintained normalized operands here, call
518		hgcd with normalized operands only, which should make the code
519		simpler and possibly faster.
520
521		Experiment with table lookups on the most significant bits.
522
523		This function is also a candidate for assembler implementation.
524		*/
525		int
526		mpn_hgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl,
527		struct hgcd_matrix1 *M)
528	77.3k	{
529	77.3k	mp_limb_t u00, u01, u10, u11;
530
531	77.3k	if (ah < 2 \|\| bh < 2)
532	344	return 0;
533
534	77.0k	if (ah > bh \|\| (ah == bh && al > bl))
535	37.5k	{
536	37.5k	sub_ddmmss (ah, al, ah, al, bh, bl);
537	37.5k	if (ah < 2)
538	723	return 0;
539
540	36.8k	u00 = u01 = u11 = 1;
541	36.8k	u10 = 0;
542	36.8k	}
543	39.4k	else
544	39.4k	{
545	39.4k	sub_ddmmss (bh, bl, bh, bl, ah, al);
546	39.4k	if (bh < 2)
547	1.71k	return 0;
548
549	37.7k	u00 = u10 = u11 = 1;
550	37.7k	u01 = 0;
551	37.7k	}
552
553	74.6k	if (ah < bh)
554	37.2k	goto subtract_a;
555
556	37.3k	for (;;)
557	700k	{
558	700k	ASSERT (ah >= bh);
559	700k	if (ah == bh)
560	617	goto done;
561
562	699k	if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
563	36.2k	{
564	36.2k	ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
565	36.2k	bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));
566
567	36.2k	break;
568	36.2k	}
569
570		/* Subtract a -= q b, and multiply M from the right by (1 q ; 0
571		1), affecting the second column of M. */
572	663k	ASSERT (ah > bh);
573	663k	sub_ddmmss (ah, al, ah, al, bh, bl);
574
575	663k	if (ah < 2)
576	31	goto done;
577
578	663k	if (ah <= bh)
579	264k	{
580		/* Use q = 1 */
581	264k	u01 += u00;
582	264k	u11 += u10;
583	264k	}
584	399k	else
585	399k	{
586	399k	mp_limb_t r[2];
587	399k	mp_limb_t q = div2 (r, ah, al, bh, bl);
588	399k	al = r[0]; ah = r[1];
589	399k	if (ah < 2)
590	461	{
591		/* A is too small, but q is correct. */
592	461	u01 += q * u00;
593	461	u11 += q * u10;
594	461	goto done;
595	461	}
596	398k	q++;
597	398k	u01 += q * u00;
598	398k	u11 += q * u10;
599	398k	}
600	700k	subtract_a:
601	700k	ASSERT (bh >= ah);
602	700k	if (ah == bh)
603	560	goto done;
604
605	699k	if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
606	36.2k	{
607	36.2k	ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
608	36.2k	bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));
609
610	36.2k	goto subtract_a1;
611	36.2k	}
612
613		/* Subtract b -= q a, and multiply M from the right by (1 0 ; q
614		1), affecting the first column of M. */
615	663k	sub_ddmmss (bh, bl, bh, bl, ah, al);
616
617	663k	if (bh < 2)
618	9	goto done;
619
620	663k	if (bh <= ah)
621	264k	{
622		/* Use q = 1 */
623	264k	u00 += u01;
624	264k	u10 += u11;
625	264k	}
626	399k	else
627	399k	{
628	399k	mp_limb_t r[2];
629	399k	mp_limb_t q = div2 (r, bh, bl, ah, al);
630	399k	bl = r[0]; bh = r[1];
631	399k	if (bh < 2)
632	391	{
633		/* B is too small, but q is correct. */
634	391	u00 += q * u01;
635	391	u10 += q * u11;
636	391	goto done;
637	391	}
638	398k	q++;
639	398k	u00 += q * u01;
640	398k	u10 += q * u11;
641	398k	}
642	663k	}
643
644		/* NOTE: Since we discard the least significant half limb, we don't get a
645		truly maximal M (corresponding to \|a - b\| < 2^{GMP_LIMB_BITS +1}). */
646		/* Single precision loop */
647	36.2k	for (;;)
648	613k	{
649	613k	ASSERT (ah >= bh);
650
651	613k	ah -= bh;
652	613k	if (ah < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
653	16.6k	break;
654
655	596k	if (ah <= bh)
656	248k	{
657		/* Use q = 1 */
658	248k	u01 += u00;
659	248k	u11 += u10;
660	248k	}
661	348k	else
662	348k	{
663	348k	mp_double_limb_t rq = div1 (ah, bh);
664	348k	mp_limb_t q = rq.d1;
665	348k	ah = rq.d0;
666
667	348k	if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
668	19.6k	{
669		/* A is too small, but q is correct. */
670	19.6k	u01 += q * u00;
671	19.6k	u11 += q * u10;
672	19.6k	break;
673	19.6k	}
674	328k	q++;
675	328k	u01 += q * u00;
676	328k	u11 += q * u10;
677	328k	}
678	613k	subtract_a1:
679	613k	ASSERT (bh >= ah);
680
681	613k	bh -= ah;
682	613k	if (bh < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
683	16.5k	break;
684
685	596k	if (bh <= ah)
686	249k	{
687		/* Use q = 1 */
688	249k	u00 += u01;
689	249k	u10 += u11;
690	249k	}
691	347k	else
692	347k	{
693	347k	mp_double_limb_t rq = div1 (bh, ah);
694	347k	mp_limb_t q = rq.d1;
695	347k	bh = rq.d0;
696
697	347k	if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
698	19.7k	{
699		/* B is too small, but q is correct. */
700	19.7k	u00 += q * u01;
701	19.7k	u10 += q * u11;
702	19.7k	break;
703	19.7k	}
704	327k	q++;
705	327k	u00 += q * u01;
706	327k	u10 += q * u11;
707	327k	}
708	596k	}
709
710	74.6k	done:
711	74.6k	M->u[0][0] = u00; M->u[0][1] = u01;
712	74.6k	M->u[1][0] = u10; M->u[1][1] = u11;
713
714	74.6k	return 1;
715	36.2k	}
716
717		/* Sets (r;b) = (a;b) M, with M = (u00, u01; u10, u11). Vector must
718		* have space for n + 1 limbs. Uses three buffers to avoid a copy*/
719		mp_size_t
720		mpn_hgcd_mul_matrix1_vector (const struct hgcd_matrix1 *M,
721		mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n)
722	72.9k	{
723	72.9k	mp_limb_t ah, bh;
724
725		/* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as
726
727		r = u00 * a
728		r += u10 * b
729		b *= u11
730		b += u01 * a
731		*/
732
733	72.9k	#if HAVE_NATIVE_mpn_addaddmul_1msb0
734	72.9k	ah = mpn_addaddmul_1msb0 (rp, ap, bp, n, M->u[0][0], M->u[1][0]);
735	72.9k	bh = mpn_addaddmul_1msb0 (bp, bp, ap, n, M->u[1][1], M->u[0][1]);
736		#else
737		ah = mpn_mul_1 (rp, ap, n, M->u[0][0]);
738		ah += mpn_addmul_1 (rp, bp, n, M->u[1][0]);
739
740		bh = mpn_mul_1 (bp, bp, n, M->u[1][1]);
741		bh += mpn_addmul_1 (bp, ap, n, M->u[0][1]);
742		#endif
743	72.9k	rp[n] = ah;
744	72.9k	bp[n] = bh;
745
746	72.9k	n += (ah \| bh) > 0;
747	72.9k	return n;
748	72.9k	}

Coverage Report

Created: 2023-02-22 06:39