/src/gmp-6.2.1/mpn/hgcd2.c

Line	Count	Source
1		/* hgcd2.c
2
3		THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
4		SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
5		GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
6
7		Copyright 1996, 1998, 2000-2004, 2008, 2012, 2019 Free Software Foundation,
8		Inc.
9
10		This file is part of the GNU MP Library.
11
12		The GNU MP Library is free software; you can redistribute it and/or modify
13		it under the terms of either:
14
15		* the GNU Lesser General Public License as published by the Free
16		Software Foundation; either version 3 of the License, or (at your
17		option) any later version.
18
19		or
20
21		* the GNU General Public License as published by the Free Software
22		Foundation; either version 2 of the License, or (at your option) any
23		later version.
24
25		or both in parallel, as here.
26
27		The GNU MP Library is distributed in the hope that it will be useful, but
28		WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
29		or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
30		for more details.
31
32		You should have received copies of the GNU General Public License and the
33		GNU Lesser General Public License along with the GNU MP Library. If not,
34		see https://www.gnu.org/licenses/. */
35
36		#include "gmp-impl.h"
37		#include "longlong.h"
38
39		#ifndef HGCD2_DIV1_METHOD
40		#define HGCD2_DIV1_METHOD 3
41		#endif
42
43		#ifndef HGCD2_DIV2_METHOD
44		#define HGCD2_DIV2_METHOD 2
45		#endif
46
47		#if GMP_NAIL_BITS != 0
48		#error Nails not implemented
49		#endif
50
51		#if HAVE_NATIVE_mpn_div_11
52
53		#define div1 mpn_div_11
54		/* Single-limb division optimized for small quotients.
55		Returned value holds d0 = r, d1 = q. */
56		mp_double_limb_t div1 (mp_limb_t, mp_limb_t);
57
58		#elif HGCD2_DIV1_METHOD == 1
59
60		static inline mp_double_limb_t
61		div1 (mp_limb_t n0, mp_limb_t d0)
62		{
63		mp_double_limb_t res;
64		res.d1 = n0 / d0;
65		res.d0 = n0 - res.d1 * d0;
66
67		return res;
68		}
69
70		#elif HGCD2_DIV1_METHOD == 2
71
72		static mp_double_limb_t
73		div1 (mp_limb_t n0, mp_limb_t d0)
74		{
75		mp_double_limb_t res;
76		int ncnt, dcnt, cnt;
77		mp_limb_t q;
78		mp_limb_t mask;
79
80		ASSERT (n0 >= d0);
81
82		count_leading_zeros (ncnt, n0);
83		count_leading_zeros (dcnt, d0);
84		cnt = dcnt - ncnt;
85
86		d0 <<= cnt;
87
88		q = -(mp_limb_t) (n0 >= d0);
89		n0 -= d0 & q;
90		d0 >>= 1;
91		q = -q;
92
93		while (--cnt >= 0)
94		{
95		mask = -(mp_limb_t) (n0 >= d0);
96		n0 -= d0 & mask;
97		d0 >>= 1;
98		q = (q << 1) - mask;
99		}
100
101		res.d0 = n0;
102		res.d1 = q;
103		return res;
104		}
105
106		#elif HGCD2_DIV1_METHOD == 3
107
108		static inline mp_double_limb_t
109		div1 (mp_limb_t n0, mp_limb_t d0)
110	610k	{
111	610k	mp_double_limb_t res;
112	610k	if (UNLIKELY ((d0 >> (GMP_LIMB_BITS - 3)) != 0)
113	610k	\|\| UNLIKELY (n0 >= (d0 << 3)))
114	165k	{
115	165k	res.d1 = n0 / d0;
116	165k	res.d0 = n0 - res.d1 * d0;
117	165k	}
118	444k	else
119	444k	{
120	444k	mp_limb_t q, mask;
121
122	444k	d0 <<= 2;
123
124	444k	mask = -(mp_limb_t) (n0 >= d0);
125	444k	n0 -= d0 & mask;
126	444k	q = 4 & mask;
127
128	444k	d0 >>= 1;
129	444k	mask = -(mp_limb_t) (n0 >= d0);
130	444k	n0 -= d0 & mask;
131	444k	q += 2 & mask;
132
133	444k	d0 >>= 1;
134	444k	mask = -(mp_limb_t) (n0 >= d0);
135	444k	n0 -= d0 & mask;
136	444k	q -= mask;
137
138	444k	res.d0 = n0;
139	444k	res.d1 = q;
140	444k	}
141	610k	return res;
142	610k	}
143
144		#elif HGCD2_DIV1_METHOD == 4
145
146		/* Table quotients. We extract the NBITS most significant bits of the
147		numerator limb, and the corresponding bits from the divisor limb, and use
148		these to form an index into the table. This method is probably only useful
149		for short pipelines with slow multiplication.
150
151		Possible improvements:
152
153		* Perhaps extract the highest NBITS of the divisor instead of the same bits
154		as from the numerator. That would require another count_leading_zeros,
155		and a post-multiply shift of the quotient.
156
157		* Compress tables? Their values are tiny, and there are lots of zero
158		entries (which are never used).
159
160		* Round the table entries more cleverly?
161		*/
162
163		#ifndef NBITS
164		#define NBITS 5
165		#endif
166
167		#if NBITS == 5
168		/* This needs full division about 13.2% of the time. */
169		static const unsigned char tab[512] = {
170		17, 9, 5,4,3,2,2,2,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
171		18, 9, 6,4,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
172		19,10, 6,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
173		20,10, 6,5,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
174		21,11, 7,5,4,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
175		22,11, 7,5,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
176		23,12, 7,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
177		24,12, 8,6,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
178		25,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
179		26,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
180		27,14, 9,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
181		28,14, 9,7,5,4,3,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
182		29,15,10,7,5,4,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
183		30,15,10,7,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
184		31,16,10,7,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
185		32,16,11,8,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
186		};
187		#elif NBITS == 6
188		/* This needs full division about 9.8% of the time. */
189		static const unsigned char tab[2048] = {
190		33,17,11, 8, 6, 5,4,4,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
191		1, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
192		34,17,11, 8, 6, 5,4,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
193		1, 1, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
194		35,18,12, 9, 7, 5,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
195		1, 1, 1, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
196		36,18,12, 9, 7, 6,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
197		1, 1, 1, 1, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
198		37,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
199		1, 1, 1, 1, 1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
200		38,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
201		1, 1, 1, 1, 1, 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
202		39,20,13,10, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
203		1, 1, 1, 1, 1, 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
204		40,20,14,10, 8, 6,5,5,4,3,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
205		1, 1, 1, 1, 1, 1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
206		41,21,14,10, 8, 6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
207		1, 1, 1, 1, 1, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
208		42,21,14,10, 8, 7,6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
209		1, 1, 1, 1, 1, 1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
210		43,22,15,11, 8, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
211		1, 1, 1, 1, 1, 1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
212		44,22,15,11, 9, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
213		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
214		45,23,15,11, 9, 7,6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
215		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
216		46,23,16,11, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
217		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
218		47,24,16,12, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
219		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
220		48,24,16,12, 9, 8,6,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
221		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
222		49,25,17,12,10, 8,7,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
223		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
224		50,25,17,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
225		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
226		51,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
227		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
228		52,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
229		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
230		53,27,18,13,10, 9,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
231		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
232		54,27,19,14,11, 9,7,6,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
233		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
234		55,28,19,14,11, 9,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
235		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
236		56,28,19,14,11, 9,8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
237		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
238		57,29,20,14,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,1,
239		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
240		58,29,20,15,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,
241		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
242		59,30,20,15,12,10,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
243		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
244		60,30,21,15,12,10,8,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
245		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
246		61,31,21,15,12,10,8,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
247		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
248		62,31,22,16,12,10,9,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
249		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
250		63,32,22,16,13,10,9,7,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,1,
251		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
252		64,32,22,16,13,10,9,8,7,6,5,5,4,4,4,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,
253		1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
254		};
255		#else
256		#error No table for provided NBITS
257		#endif
258
259		/* Doing tabp with a #define makes compiler warnings about pointing outside an
260		object go away. We used to define this as a variable. It is not clear if
261		e.g. (vector[100] - 10) + 10 is well- defined as per the C standard;
262		(vector[100] + 10) - 10 surely is and there is no sequence point so the
263		expressions should be equivalent. To make this safe, we might want to
264		define tabp as a macro with the index as an argument. Depending on the
265		platform, relocs might allow for assembly-time or linker-time resolution to
266		take place. */
267		#define tabp (tab - (1 << (NBITS - 1) << NBITS))
268
269		static inline mp_double_limb_t
270		div1 (mp_limb_t n0, mp_limb_t d0)
271		{
272		int ncnt;
273		size_t nbi, dbi;
274		mp_limb_t q0;
275		mp_limb_t r0;
276		mp_limb_t mask;
277		mp_double_limb_t res;
278
279		ASSERT (n0 >= d0); /* Actually only msb position is critical. */
280
281		count_leading_zeros (ncnt, n0);
282		nbi = n0 << ncnt >> (GMP_LIMB_BITS - NBITS);
283		dbi = d0 << ncnt >> (GMP_LIMB_BITS - NBITS);
284
285		q0 = tabp[(nbi << NBITS) + dbi];
286		r0 = n0 - q0 * d0;
287		mask = -(mp_limb_t) (r0 >= d0);
288		q0 -= mask;
289		r0 -= d0 & mask;
290
291		if (UNLIKELY (r0 >= d0))
292		{
293		q0 = n0 / d0;
294		r0 = n0 - q0 * d0;
295		}
296
297		res.d1 = q0;
298		res.d0 = r0;
299		return res;
300		}
301
302		#elif HGCD2_DIV1_METHOD == 5
303
304		/* Table inverses of divisors. We don't bother with suppressing the msb from
305		the tables. We index with the NBITS most significant divisor bits,
306		including the always-set highest bit, but use addressing trickery via tabp
307		to suppress it.
308
309		Possible improvements:
310
311		* Do first multiply using 32-bit operations on 64-bit computers. At least
312		on most Arm64 cores, that uses 3 times less resources. It also saves on
313		many x86-64 processors.
314		*/
315
316		#ifndef NBITS
317		#define NBITS 7
318		#endif
319
320		#if NBITS == 5
321		/* This needs full division about 1.63% of the time. */
322		static const unsigned char tab[16] = {
323		63, 59, 55, 52, 50, 47, 45, 43, 41, 39, 38, 36, 35, 34, 33, 32
324		};
325		#elif NBITS == 6
326		/* This needs full division about 0.93% of the time. */
327		static const unsigned char tab[32] = {
328		127,123,119,116,112,109,106,104,101, 98, 96, 94, 92, 90, 88, 86,
329		84, 82, 80, 79, 77, 76, 74, 73, 72, 70, 69, 68, 67, 66, 65, 64
330		};
331		#elif NBITS == 7
332		/* This needs full division about 0.49% of the time. */
333		static const unsigned char tab[64] = {
334		255,251,247,243,239,236,233,229,226,223,220,217,214,211,209,206,
335		203,201,198,196,194,191,189,187,185,183,181,179,177,175,173,171,
336		169,167,166,164,162,161,159,158,156,155,153,152,150,149,147,146,
337		145,143,142,141,140,139,137,136,135,134,133,132,131,130,129,128
338		};
339		#elif NBITS == 8
340		/* This needs full division about 0.26% of the time. */
341		static const unsigned short tab[128] = {
342		511,507,503,499,495,491,488,484,480,477,473,470,467,463,460,457,
343		454,450,447,444,441,438,435,433,430,427,424,421,419,416,413,411,
344		408,406,403,401,398,396,393,391,389,386,384,382,380,377,375,373,
345		371,369,367,365,363,361,359,357,355,353,351,349,347,345,343,342,
346		340,338,336,335,333,331,329,328,326,325,323,321,320,318,317,315,
347		314,312,311,309,308,306,305,303,302,301,299,298,296,295,294,292,
348		291,290,288,287,286,285,283,282,281,280,279,277,276,275,274,273,
349		272,270,269,268,267,266,265,264,263,262,261,260,259,258,257,256
350		};
351		#else
352		#error No table for provided NBITS
353		#endif
354
355		/* Doing tabp with a #define makes compiler warnings about pointing outside an
356		object go away. We used to define this as a variable. It is not clear if
357		e.g. (vector[100] - 10) + 10 is well- defined as per the C standard;
358		(vector[100] + 10) - 10 surely is and there is no sequence point so the
359		expressions should be equivalent. To make this safe, we might want to
360		define tabp as a macro with the index as an argument. Depending on the
361		platform, relocs might allow for assembly-time or linker-time resolution to
362		take place. */
363		#define tabp (tab - (1 << (NBITS - 1)))
364
365		static inline mp_double_limb_t
366		div1 (mp_limb_t n0, mp_limb_t d0)
367		{
368		int ncnt, dcnt;
369		size_t dbi;
370		mp_limb_t inv;
371		mp_limb_t q0;
372		mp_limb_t r0;
373		mp_limb_t mask;
374		mp_double_limb_t res;
375
376		count_leading_zeros (ncnt, n0);
377		count_leading_zeros (dcnt, d0);
378
379		dbi = d0 << dcnt >> (GMP_LIMB_BITS - NBITS);
380		inv = tabp[dbi];
381		q0 = ((n0 << ncnt) >> (NBITS + 1)) * inv >> (GMP_LIMB_BITS - 1 + ncnt - dcnt);
382		r0 = n0 - q0 * d0;
383		mask = -(mp_limb_t) (r0 >= d0);
384		q0 -= mask;
385		r0 -= d0 & mask;
386
387		if (UNLIKELY (r0 >= d0))
388		{
389		q0 = n0 / d0;
390		r0 = n0 - q0 * d0;
391		}
392
393		res.d1 = q0;
394		res.d0 = r0;
395		return res;
396		}
397
398		#else
399		#error Unknown HGCD2_DIV1_METHOD
400		#endif
401
402		#if HAVE_NATIVE_mpn_div_22
403
404		#define div2 mpn_div_22
405		/* Two-limb division optimized for small quotients. */
406		mp_limb_t div2 (mp_ptr, mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t);
407
408		#elif HGCD2_DIV2_METHOD == 1
409
410		static mp_limb_t
411		div2 (mp_ptr rp,
412		mp_limb_t n1, mp_limb_t n0,
413		mp_limb_t d1, mp_limb_t d0)
414		{
415		mp_double_limb_t rq = div1 (n1, d1);
416		if (UNLIKELY (rq.d1 > d1))
417		{
418		mp_limb_t n2, q, t1, t0;
419		int c;
420
421		/* Normalize */
422		count_leading_zeros (c, d1);
423		ASSERT (c > 0);
424
425		n2 = n1 >> (GMP_LIMB_BITS - c);
426		n1 = (n1 << c) \| (n0 >> (GMP_LIMB_BITS - c));
427		n0 <<= c;
428		d1 = (d1 << c) \| (d0 >> (GMP_LIMB_BITS - c));
429		d0 <<= c;
430
431		udiv_qrnnd (q, n1, n2, n1, d1);
432		umul_ppmm (t1, t0, q, d0);
433		if (t1 > n1 \|\| (t1 == n1 && t0 > n0))
434		{
435		ASSERT (q > 0);
436		q--;
437		sub_ddmmss (t1, t0, t1, t0, d1, d0);
438		}
439		sub_ddmmss (n1, n0, n1, n0, t1, t0);
440
441		/* Undo normalization */
442		rp[0] = (n0 >> c) \| (n1 << (GMP_LIMB_BITS - c));
443		rp[1] = n1 >> c;
444
445		return q;
446		}
447		else
448		{
449		mp_limb_t q, t1, t0;
450		n1 = rq.d0;
451		q = rq.d1;
452		umul_ppmm (t1, t0, q, d0);
453		if (UNLIKELY (t1 >= n1) && (t1 > n1 \|\| t0 > n0))
454		{
455		ASSERT (q > 0);
456		q--;
457		sub_ddmmss (t1, t0, t1, t0, d1, d0);
458		}
459		sub_ddmmss (rp[1], rp[0], n1, n0, t1, t0);
460		return q;
461		}
462		}
463
464		#elif HGCD2_DIV2_METHOD == 2
465
466		/* Bit-wise div2. Relies on fast count_leading_zeros. */
467		static mp_limb_t
468		div2 (mp_ptr rp,
469		mp_limb_t n1, mp_limb_t n0,
470		mp_limb_t d1, mp_limb_t d0)
471	701k	{
472	701k	mp_limb_t q = 0;
473	701k	int ncnt;
474	701k	int dcnt;
475
476	701k	count_leading_zeros (ncnt, n1);
477	701k	count_leading_zeros (dcnt, d1);
478	701k	dcnt -= ncnt;
479
480	701k	d1 = (d1 << dcnt) + (d0 >> 1 >> (GMP_LIMB_BITS - 1 - dcnt));
481	701k	d0 <<= dcnt;
482
483	701k	do
484	2.34M	{
485	2.34M	mp_limb_t mask;
486	2.34M	q <<= 1;
487	2.34M	if (UNLIKELY (n1 == d1))
488	1.54k	mask = -(n0 >= d0);
489	2.34M	else
490	2.34M	mask = -(n1 > d1);
491
492	2.34M	q -= mask;
493
494	2.34M	sub_ddmmss (n1, n0, n1, n0, mask & d1, mask & d0);
495
496	2.34M	d0 = (d1 << (GMP_LIMB_BITS - 1)) \| (d0 >> 1);
497	2.34M	d1 = d1 >> 1;
498	2.34M	}
499	2.34M	while (dcnt--);
500
501	701k	rp[0] = n0;
502	701k	rp[1] = n1;
503
504	701k	return q;
505	701k	}
506		#else
507		#error Unknown HGCD2_DIV2_METHOD
508		#endif
509
510		/* Reduces a,b until \|a-b\| (almost) fits in one limb + 1 bit. Constructs
511		matrix M. Returns 1 if we make progress, i.e. can perform at least
512		one subtraction. Otherwise returns zero. */
513
514		/* FIXME: Possible optimizations:
515
516		The div2 function starts with checking the most significant bit of
517		the numerator. We can maintained normalized operands here, call
518		hgcd with normalized operands only, which should make the code
519		simpler and possibly faster.
520
521		Experiment with table lookups on the most significant bits.
522
523		This function is also a candidate for assembler implementation.
524		*/
525		int
526		mpn_hgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl,
527		struct hgcd_matrix1 *M)
528	69.6k	{
529	69.6k	mp_limb_t u00, u01, u10, u11;
530
531	69.6k	if (ah < 2 \|\| bh < 2)
532	487	return 0;
533
534	69.2k	if (ah > bh \|\| (ah == bh && al > bl))
535	33.6k	{
536	33.6k	sub_ddmmss (ah, al, ah, al, bh, bl);
537	33.6k	if (ah < 2)
538	892	return 0;
539
540	32.7k	u00 = u01 = u11 = 1;
541	32.7k	u10 = 0;
542	32.7k	}
543	35.5k	else
544	35.5k	{
545	35.5k	sub_ddmmss (bh, bl, bh, bl, ah, al);
546	35.5k	if (bh < 2)
547	2.11k	return 0;
548
549	33.4k	u00 = u10 = u11 = 1;
550	33.4k	u01 = 0;
551	33.4k	}
552
553	66.1k	if (ah < bh)
554	33.2k	goto subtract_a;
555
556	32.9k	for (;;)
557	615k	{
558	615k	ASSERT (ah >= bh);
559	615k	if (ah == bh)
560	738	goto done;
561
562	614k	if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
563	31.9k	{
564	31.9k	ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
565	31.9k	bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));
566
567	31.9k	break;
568	31.9k	}
569
570		/* Subtract a -= q b, and multiply M from the right by (1 q ; 0
571		1), affecting the second column of M. */
572	582k	ASSERT (ah > bh);
573	582k	sub_ddmmss (ah, al, ah, al, bh, bl);
574
575	582k	if (ah < 2)
576	31	goto done;
577
578	582k	if (ah <= bh)
579	232k	{
580		/* Use q = 1 */
581	232k	u01 += u00;
582	232k	u11 += u10;
583	232k	}
584	349k	else
585	349k	{
586	349k	mp_limb_t r[2];
587	349k	mp_limb_t q = div2 (r, ah, al, bh, bl);
588	349k	al = r[0]; ah = r[1];
589	349k	if (ah < 2)
590	449	{
591		/* A is too small, but q is correct. */
592	449	u01 += q * u00;
593	449	u11 += q * u10;
594	449	goto done;
595	449	}
596	349k	q++;
597	349k	u01 += q * u00;
598	349k	u11 += q * u10;
599	349k	}
600	615k	subtract_a:
601	615k	ASSERT (bh >= ah);
602	615k	if (ah == bh)
603	662	goto done;
604
605	614k	if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
606	31.8k	{
607	31.8k	ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
608	31.8k	bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));
609
610	31.8k	goto subtract_a1;
611	31.8k	}
612
613		/* Subtract b -= q a, and multiply M from the right by (1 0 ; q
614		1), affecting the first column of M. */
615	582k	sub_ddmmss (bh, bl, bh, bl, ah, al);
616
617	582k	if (bh < 2)
618	34	goto done;
619
620	582k	if (bh <= ah)
621	231k	{
622		/* Use q = 1 */
623	231k	u00 += u01;
624	231k	u10 += u11;
625	231k	}
626	351k	else
627	351k	{
628	351k	mp_limb_t r[2];
629	351k	mp_limb_t q = div2 (r, bh, bl, ah, al);
630	351k	bl = r[0]; bh = r[1];
631	351k	if (bh < 2)
632	511	{
633		/* B is too small, but q is correct. */
634	511	u00 += q * u01;
635	511	u10 += q * u11;
636	511	goto done;
637	511	}
638	351k	q++;
639	351k	u00 += q * u01;
640	351k	u10 += q * u11;
641	351k	}
642	582k	}
643
644		/* NOTE: Since we discard the least significant half limb, we don't get a
645		truly maximal M (corresponding to \|a - b\| < 2^{GMP_LIMB_BITS +1}). */
646		/* Single precision loop */
647	31.9k	for (;;)
648	537k	{
649	537k	ASSERT (ah >= bh);
650
651	537k	ah -= bh;
652	537k	if (ah < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
653	14.5k	break;
654
655	523k	if (ah <= bh)
656	218k	{
657		/* Use q = 1 */
658	218k	u01 += u00;
659	218k	u11 += u10;
660	218k	}
661	304k	else
662	304k	{
663	304k	mp_double_limb_t rq = div1 (ah, bh);
664	304k	mp_limb_t q = rq.d1;
665	304k	ah = rq.d0;
666
667	304k	if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
668	17.4k	{
669		/* A is too small, but q is correct. */
670	17.4k	u01 += q * u00;
671	17.4k	u11 += q * u10;
672	17.4k	break;
673	17.4k	}
674	287k	q++;
675	287k	u01 += q * u00;
676	287k	u11 += q * u10;
677	287k	}
678	537k	subtract_a1:
679	537k	ASSERT (bh >= ah);
680
681	537k	bh -= ah;
682	537k	if (bh < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
683	14.4k	break;
684
685	523k	if (bh <= ah)
686	218k	{
687		/* Use q = 1 */
688	218k	u00 += u01;
689	218k	u10 += u11;
690	218k	}
691	305k	else
692	305k	{
693	305k	mp_double_limb_t rq = div1 (bh, ah);
694	305k	mp_limb_t q = rq.d1;
695	305k	bh = rq.d0;
696
697	305k	if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
698	17.4k	{
699		/* B is too small, but q is correct. */
700	17.4k	u00 += q * u01;
701	17.4k	u10 += q * u11;
702	17.4k	break;
703	17.4k	}
704	287k	q++;
705	287k	u00 += q * u01;
706	287k	u10 += q * u11;
707	287k	}
708	523k	}
709
710	66.1k	done:
711	66.1k	M->u[0][0] = u00; M->u[0][1] = u01;
712	66.1k	M->u[1][0] = u10; M->u[1][1] = u11;
713
714	66.1k	return 1;
715	31.9k	}
716
717		/* Sets (r;b) = (a;b) M, with M = (u00, u01; u10, u11). Vector must
718		* have space for n + 1 limbs. Uses three buffers to avoid a copy*/
719		mp_size_t
720		mpn_hgcd_mul_matrix1_vector (const struct hgcd_matrix1 *M,
721		mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n)
722	59.8k	{
723	59.8k	mp_limb_t ah, bh;
724
725		/* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as
726
727		r = u00 * a
728		r += u10 * b
729		b *= u11
730		b += u01 * a
731		*/
732
733	59.8k	#if HAVE_NATIVE_mpn_addaddmul_1msb0
734	59.8k	ah = mpn_addaddmul_1msb0 (rp, ap, bp, n, M->u[0][0], M->u[1][0]);
735	59.8k	bh = mpn_addaddmul_1msb0 (bp, bp, ap, n, M->u[1][1], M->u[0][1]);
736		#else
737		ah = mpn_mul_1 (rp, ap, n, M->u[0][0]);
738		ah += mpn_addmul_1 (rp, bp, n, M->u[1][0]);
739
740		bh = mpn_mul_1 (bp, bp, n, M->u[1][1]);
741		bh += mpn_addmul_1 (bp, ap, n, M->u[0][1]);
742		#endif
743	59.8k	rp[n] = ah;
744	59.8k	bp[n] = bh;
745
746	59.8k	n += (ah \| bh) > 0;
747	59.8k	return n;
748	59.8k	}

Coverage Report

Created: 2024-06-28 06:19