Coverage Report

Created: 2024-06-28 06:19

/src/gmp-6.2.1/mpn/hgcd2.c
Line
Count
Source
1
/* hgcd2.c
2
3
   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
4
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
5
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
6
7
Copyright 1996, 1998, 2000-2004, 2008, 2012, 2019 Free Software Foundation,
8
Inc.
9
10
This file is part of the GNU MP Library.
11
12
The GNU MP Library is free software; you can redistribute it and/or modify
13
it under the terms of either:
14
15
  * the GNU Lesser General Public License as published by the Free
16
    Software Foundation; either version 3 of the License, or (at your
17
    option) any later version.
18
19
or
20
21
  * the GNU General Public License as published by the Free Software
22
    Foundation; either version 2 of the License, or (at your option) any
23
    later version.
24
25
or both in parallel, as here.
26
27
The GNU MP Library is distributed in the hope that it will be useful, but
28
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
29
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
30
for more details.
31
32
You should have received copies of the GNU General Public License and the
33
GNU Lesser General Public License along with the GNU MP Library.  If not,
34
see https://www.gnu.org/licenses/.  */
35
36
#include "gmp-impl.h"
37
#include "longlong.h"
38
39
#ifndef HGCD2_DIV1_METHOD
40
#define HGCD2_DIV1_METHOD 3
41
#endif
42
43
#ifndef HGCD2_DIV2_METHOD
44
#define HGCD2_DIV2_METHOD 2
45
#endif
46
47
#if GMP_NAIL_BITS != 0
48
#error Nails not implemented
49
#endif
50
51
#if HAVE_NATIVE_mpn_div_11
52
53
#define div1 mpn_div_11
54
/* Single-limb division optimized for small quotients.
55
   Returned value holds d0 = r, d1 = q. */
56
mp_double_limb_t div1 (mp_limb_t, mp_limb_t);
57
58
#elif HGCD2_DIV1_METHOD == 1
59
60
static inline mp_double_limb_t
61
div1 (mp_limb_t n0, mp_limb_t d0)
62
{
63
  mp_double_limb_t res;
64
  res.d1 = n0 / d0;
65
  res.d0 = n0 - res.d1 * d0;
66
67
  return res;
68
}
69
70
#elif HGCD2_DIV1_METHOD == 2
71
72
static mp_double_limb_t
73
div1 (mp_limb_t n0, mp_limb_t d0)
74
{
75
  mp_double_limb_t res;
76
  int ncnt, dcnt, cnt;
77
  mp_limb_t q;
78
  mp_limb_t mask;
79
80
  ASSERT (n0 >= d0);
81
82
  count_leading_zeros (ncnt, n0);
83
  count_leading_zeros (dcnt, d0);
84
  cnt = dcnt - ncnt;
85
86
  d0 <<= cnt;
87
88
  q = -(mp_limb_t) (n0 >= d0);
89
  n0 -= d0 & q;
90
  d0 >>= 1;
91
  q = -q;
92
93
  while (--cnt >= 0)
94
    {
95
      mask = -(mp_limb_t) (n0 >= d0);
96
      n0 -= d0 & mask;
97
      d0 >>= 1;
98
      q = (q << 1) - mask;
99
    }
100
101
  res.d0 = n0;
102
  res.d1 = q;
103
  return res;
104
}
105
106
#elif HGCD2_DIV1_METHOD == 3
107
108
static inline mp_double_limb_t
109
div1 (mp_limb_t n0, mp_limb_t d0)
110
610k
{
111
610k
  mp_double_limb_t res;
112
610k
  if (UNLIKELY ((d0 >> (GMP_LIMB_BITS - 3)) != 0)
113
610k
      || UNLIKELY (n0 >= (d0 << 3)))
114
165k
    {
115
165k
      res.d1 = n0 / d0;
116
165k
      res.d0 = n0 - res.d1 * d0;
117
165k
    }
118
444k
  else
119
444k
    {
120
444k
      mp_limb_t q, mask;
121
122
444k
      d0 <<= 2;
123
124
444k
      mask = -(mp_limb_t) (n0 >= d0);
125
444k
      n0 -= d0 & mask;
126
444k
      q = 4 & mask;
127
128
444k
      d0 >>= 1;
129
444k
      mask = -(mp_limb_t) (n0 >= d0);
130
444k
      n0 -= d0 & mask;
131
444k
      q += 2 & mask;
132
133
444k
      d0 >>= 1;
134
444k
      mask = -(mp_limb_t) (n0 >= d0);
135
444k
      n0 -= d0 & mask;
136
444k
      q -= mask;
137
138
444k
      res.d0 = n0;
139
444k
      res.d1 = q;
140
444k
    }
141
610k
  return res;
142
610k
}
143
144
#elif HGCD2_DIV1_METHOD == 4
145
146
/* Table quotients.  We extract the NBITS most significant bits of the
147
   numerator limb, and the corresponding bits from the divisor limb, and use
148
   these to form an index into the table.  This method is probably only useful
149
   for short pipelines with slow multiplication.
150
151
   Possible improvements:
152
153
   * Perhaps extract the highest NBITS of the divisor instead of the same bits
154
     as from the numerator.  That would require another count_leading_zeros,
155
     and a post-multiply shift of the quotient.
156
157
   * Compress tables?  Their values are tiny, and there are lots of zero
158
     entries (which are never used).
159
160
   * Round the table entries more cleverly?
161
*/
162
163
#ifndef NBITS
164
#define NBITS 5
165
#endif
166
167
#if NBITS == 5
168
/* This needs full division about 13.2% of the time. */
169
static const unsigned char tab[512] = {
170
17, 9, 5,4,3,2,2,2,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
171
18, 9, 6,4,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
172
19,10, 6,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
173
20,10, 6,5,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
174
21,11, 7,5,4,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
175
22,11, 7,5,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
176
23,12, 7,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
177
24,12, 8,6,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
178
25,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
179
26,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
180
27,14, 9,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
181
28,14, 9,7,5,4,3,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
182
29,15,10,7,5,4,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
183
30,15,10,7,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
184
31,16,10,7,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
185
32,16,11,8,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
186
};
187
#elif NBITS == 6
188
/* This needs full division about 9.8% of the time. */
189
static const unsigned char tab[2048] = {
190
33,17,11, 8, 6, 5,4,4,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
191
 1, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
192
34,17,11, 8, 6, 5,4,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
193
 1, 1, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
194
35,18,12, 9, 7, 5,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
195
 1, 1, 1, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
196
36,18,12, 9, 7, 6,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
197
 1, 1, 1, 1, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
198
37,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
199
 1, 1, 1, 1, 1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
200
38,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
201
 1, 1, 1, 1, 1, 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
202
39,20,13,10, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
203
 1, 1, 1, 1, 1, 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
204
40,20,14,10, 8, 6,5,5,4,3,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
205
 1, 1, 1, 1, 1, 1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
206
41,21,14,10, 8, 6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
207
 1, 1, 1, 1, 1, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
208
42,21,14,10, 8, 7,6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
209
 1, 1, 1, 1, 1, 1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
210
43,22,15,11, 8, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
211
 1, 1, 1, 1, 1, 1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
212
44,22,15,11, 9, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
213
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
214
45,23,15,11, 9, 7,6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
215
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
216
46,23,16,11, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
217
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
218
47,24,16,12, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
219
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
220
48,24,16,12, 9, 8,6,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
221
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
222
49,25,17,12,10, 8,7,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
223
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
224
50,25,17,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
225
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
226
51,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
227
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
228
52,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
229
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
230
53,27,18,13,10, 9,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
231
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
232
54,27,19,14,11, 9,7,6,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
233
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
234
55,28,19,14,11, 9,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
235
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
236
56,28,19,14,11, 9,8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
237
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
238
57,29,20,14,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,1,
239
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
240
58,29,20,15,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,
241
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
242
59,30,20,15,12,10,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
243
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
244
60,30,21,15,12,10,8,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
245
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
246
61,31,21,15,12,10,8,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
247
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
248
62,31,22,16,12,10,9,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
249
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
250
63,32,22,16,13,10,9,7,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,1,
251
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
252
64,32,22,16,13,10,9,8,7,6,5,5,4,4,4,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,
253
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
254
};
255
#else
256
#error No table for provided NBITS
257
#endif
258
259
/* Doing tabp with a #define makes compiler warnings about pointing outside an
260
   object go away.  We used to define this as a variable.  It is not clear if
261
   e.g.  (vector[100] - 10) + 10 is well- defined as per the C standard;
262
   (vector[100] + 10) - 10 surely is and there is no sequence point so the
263
   expressions should be equivalent.  To make this safe, we might want to
264
   define tabp as a macro with the index as an argument.  Depending on the
265
   platform, relocs might allow for assembly-time or linker-time resolution to
266
   take place. */
267
#define tabp (tab - (1 << (NBITS - 1) << NBITS))
268
269
static inline mp_double_limb_t
270
div1 (mp_limb_t n0, mp_limb_t d0)
271
{
272
  int ncnt;
273
  size_t nbi, dbi;
274
  mp_limb_t q0;
275
  mp_limb_t r0;
276
  mp_limb_t mask;
277
  mp_double_limb_t res;
278
279
  ASSERT (n0 >= d0);    /* Actually only msb position is critical. */
280
281
  count_leading_zeros (ncnt, n0);
282
  nbi = n0 << ncnt >> (GMP_LIMB_BITS - NBITS);
283
  dbi = d0 << ncnt >> (GMP_LIMB_BITS - NBITS);
284
285
  q0 = tabp[(nbi << NBITS) + dbi];
286
  r0 = n0 - q0 * d0;
287
  mask = -(mp_limb_t) (r0 >= d0);
288
  q0 -= mask;
289
  r0 -= d0 & mask;
290
291
  if (UNLIKELY (r0 >= d0))
292
    {
293
      q0 = n0 / d0;
294
      r0 = n0 - q0 * d0;
295
    }
296
297
  res.d1 = q0;
298
  res.d0 = r0;
299
  return res;
300
}
301
302
#elif HGCD2_DIV1_METHOD == 5
303
304
/* Table inverses of divisors.  We don't bother with suppressing the msb from
305
   the tables.  We index with the NBITS most significant divisor bits,
306
   including the always-set highest bit, but use addressing trickery via tabp
307
   to suppress it.
308
309
   Possible improvements:
310
311
   * Do first multiply using 32-bit operations on 64-bit computers.  At least
312
     on most Arm64 cores, that uses 3 times less resources.  It also saves on
313
     many x86-64 processors.
314
*/
315
316
#ifndef NBITS
317
#define NBITS 7
318
#endif
319
320
#if NBITS == 5
321
/* This needs full division about 1.63% of the time. */
322
static const unsigned char tab[16] = {
323
 63, 59, 55, 52, 50, 47, 45, 43, 41, 39, 38, 36, 35, 34, 33, 32
324
};
325
#elif NBITS == 6
326
/* This needs full division about 0.93% of the time. */
327
static const unsigned char tab[32] = {
328
127,123,119,116,112,109,106,104,101, 98, 96, 94, 92, 90, 88, 86,
329
 84, 82, 80, 79, 77, 76, 74, 73, 72, 70, 69, 68, 67, 66, 65, 64
330
};
331
#elif NBITS == 7
332
/* This needs full division about 0.49% of the time. */
333
static const unsigned char tab[64] = {
334
255,251,247,243,239,236,233,229,226,223,220,217,214,211,209,206,
335
203,201,198,196,194,191,189,187,185,183,181,179,177,175,173,171,
336
169,167,166,164,162,161,159,158,156,155,153,152,150,149,147,146,
337
145,143,142,141,140,139,137,136,135,134,133,132,131,130,129,128
338
};
339
#elif NBITS == 8
340
/* This needs full division about 0.26% of the time. */
341
static const unsigned short tab[128] = {
342
511,507,503,499,495,491,488,484,480,477,473,470,467,463,460,457,
343
454,450,447,444,441,438,435,433,430,427,424,421,419,416,413,411,
344
408,406,403,401,398,396,393,391,389,386,384,382,380,377,375,373,
345
371,369,367,365,363,361,359,357,355,353,351,349,347,345,343,342,
346
340,338,336,335,333,331,329,328,326,325,323,321,320,318,317,315,
347
314,312,311,309,308,306,305,303,302,301,299,298,296,295,294,292,
348
291,290,288,287,286,285,283,282,281,280,279,277,276,275,274,273,
349
272,270,269,268,267,266,265,264,263,262,261,260,259,258,257,256
350
};
351
#else
352
#error No table for provided NBITS
353
#endif
354
355
/* Doing tabp with a #define makes compiler warnings about pointing outside an
356
   object go away.  We used to define this as a variable.  It is not clear if
357
   e.g.  (vector[100] - 10) + 10 is well- defined as per the C standard;
358
   (vector[100] + 10) - 10 surely is and there is no sequence point so the
359
   expressions should be equivalent.  To make this safe, we might want to
360
   define tabp as a macro with the index as an argument.  Depending on the
361
   platform, relocs might allow for assembly-time or linker-time resolution to
362
   take place. */
363
#define tabp (tab - (1 << (NBITS - 1)))
364
365
static inline mp_double_limb_t
366
div1 (mp_limb_t n0, mp_limb_t d0)
367
{
368
  int ncnt, dcnt;
369
  size_t dbi;
370
  mp_limb_t inv;
371
  mp_limb_t q0;
372
  mp_limb_t r0;
373
  mp_limb_t mask;
374
  mp_double_limb_t res;
375
376
  count_leading_zeros (ncnt, n0);
377
  count_leading_zeros (dcnt, d0);
378
379
  dbi = d0 << dcnt >> (GMP_LIMB_BITS - NBITS);
380
  inv = tabp[dbi];
381
  q0 = ((n0 << ncnt) >> (NBITS + 1)) * inv >> (GMP_LIMB_BITS - 1 + ncnt - dcnt);
382
  r0 = n0 - q0 * d0;
383
  mask = -(mp_limb_t) (r0 >= d0);
384
  q0 -= mask;
385
  r0 -= d0 & mask;
386
387
  if (UNLIKELY (r0 >= d0))
388
    {
389
      q0 = n0 / d0;
390
      r0 = n0 - q0 * d0;
391
    }
392
393
  res.d1 = q0;
394
  res.d0 = r0;
395
  return res;
396
}
397
398
#else
399
#error Unknown HGCD2_DIV1_METHOD
400
#endif
401
402
#if HAVE_NATIVE_mpn_div_22
403
404
#define div2 mpn_div_22
405
/* Two-limb division optimized for small quotients.  */
406
mp_limb_t div2 (mp_ptr, mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t);
407
408
#elif HGCD2_DIV2_METHOD == 1
409
410
static mp_limb_t
411
div2 (mp_ptr rp,
412
      mp_limb_t n1, mp_limb_t n0,
413
      mp_limb_t d1, mp_limb_t d0)
414
{
415
  mp_double_limb_t rq = div1 (n1, d1);
416
  if (UNLIKELY (rq.d1 > d1))
417
    {
418
      mp_limb_t n2, q, t1, t0;
419
      int c;
420
421
      /* Normalize */
422
      count_leading_zeros (c, d1);
423
      ASSERT (c > 0);
424
425
      n2 = n1 >> (GMP_LIMB_BITS - c);
426
      n1 = (n1 << c) | (n0 >> (GMP_LIMB_BITS - c));
427
      n0 <<= c;
428
      d1 = (d1 << c) | (d0 >> (GMP_LIMB_BITS - c));
429
      d0 <<= c;
430
431
      udiv_qrnnd (q, n1, n2, n1, d1);
432
      umul_ppmm (t1, t0, q, d0);
433
      if (t1 > n1 || (t1 == n1 && t0 > n0))
434
  {
435
    ASSERT (q > 0);
436
    q--;
437
    sub_ddmmss (t1, t0, t1, t0, d1, d0);
438
  }
439
      sub_ddmmss (n1, n0, n1, n0, t1, t0);
440
441
      /* Undo normalization */
442
      rp[0] = (n0 >> c) | (n1 << (GMP_LIMB_BITS - c));
443
      rp[1] = n1 >> c;
444
445
      return q;
446
    }
447
  else
448
    {
449
      mp_limb_t q, t1, t0;
450
      n1 = rq.d0;
451
      q = rq.d1;
452
      umul_ppmm (t1, t0, q, d0);
453
      if (UNLIKELY (t1 >= n1) && (t1 > n1 || t0 > n0))
454
  {
455
    ASSERT (q > 0);
456
    q--;
457
    sub_ddmmss (t1, t0, t1, t0, d1, d0);
458
  }
459
      sub_ddmmss (rp[1], rp[0], n1, n0, t1, t0);
460
      return q;
461
    }
462
}
463
464
#elif HGCD2_DIV2_METHOD == 2
465
466
/* Bit-wise div2. Relies on fast count_leading_zeros. */
467
static mp_limb_t
468
div2 (mp_ptr rp,
469
      mp_limb_t n1, mp_limb_t n0,
470
      mp_limb_t d1, mp_limb_t d0)
471
701k
{
472
701k
  mp_limb_t q = 0;
473
701k
  int ncnt;
474
701k
  int dcnt;
475
476
701k
  count_leading_zeros (ncnt, n1);
477
701k
  count_leading_zeros (dcnt, d1);
478
701k
  dcnt -= ncnt;
479
480
701k
  d1 = (d1 << dcnt) + (d0 >> 1 >> (GMP_LIMB_BITS - 1 - dcnt));
481
701k
  d0 <<= dcnt;
482
483
701k
  do
484
2.34M
    {
485
2.34M
      mp_limb_t mask;
486
2.34M
      q <<= 1;
487
2.34M
      if (UNLIKELY (n1 == d1))
488
1.54k
  mask = -(n0 >= d0);
489
2.34M
      else
490
2.34M
  mask = -(n1 > d1);
491
492
2.34M
      q -= mask;
493
494
2.34M
      sub_ddmmss (n1, n0, n1, n0, mask & d1, mask & d0);
495
496
2.34M
      d0 = (d1 << (GMP_LIMB_BITS - 1)) | (d0 >> 1);
497
2.34M
      d1 = d1 >> 1;
498
2.34M
    }
499
2.34M
  while (dcnt--);
500
501
701k
  rp[0] = n0;
502
701k
  rp[1] = n1;
503
504
701k
  return q;
505
701k
}
506
#else
507
#error Unknown HGCD2_DIV2_METHOD
508
#endif
509
510
/* Reduces a,b until |a-b| (almost) fits in one limb + 1 bit. Constructs
511
   matrix M. Returns 1 if we make progress, i.e. can perform at least
512
   one subtraction. Otherwise returns zero. */
513
514
/* FIXME: Possible optimizations:
515
516
   The div2 function starts with checking the most significant bit of
517
   the numerator. We can maintained normalized operands here, call
518
   hgcd with normalized operands only, which should make the code
519
   simpler and possibly faster.
520
521
   Experiment with table lookups on the most significant bits.
522
523
   This function is also a candidate for assembler implementation.
524
*/
525
int
526
mpn_hgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl,
527
     struct hgcd_matrix1 *M)
528
69.6k
{
529
69.6k
  mp_limb_t u00, u01, u10, u11;
530
531
69.6k
  if (ah < 2 || bh < 2)
532
487
    return 0;
533
534
69.2k
  if (ah > bh || (ah == bh && al > bl))
535
33.6k
    {
536
33.6k
      sub_ddmmss (ah, al, ah, al, bh, bl);
537
33.6k
      if (ah < 2)
538
892
  return 0;
539
540
32.7k
      u00 = u01 = u11 = 1;
541
32.7k
      u10 = 0;
542
32.7k
    }
543
35.5k
  else
544
35.5k
    {
545
35.5k
      sub_ddmmss (bh, bl, bh, bl, ah, al);
546
35.5k
      if (bh < 2)
547
2.11k
  return 0;
548
549
33.4k
      u00 = u10 = u11 = 1;
550
33.4k
      u01 = 0;
551
33.4k
    }
552
553
66.1k
  if (ah < bh)
554
33.2k
    goto subtract_a;
555
556
32.9k
  for (;;)
557
615k
    {
558
615k
      ASSERT (ah >= bh);
559
615k
      if (ah == bh)
560
738
  goto done;
561
562
614k
      if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
563
31.9k
  {
564
31.9k
    ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
565
31.9k
    bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));
566
567
31.9k
    break;
568
31.9k
  }
569
570
      /* Subtract a -= q b, and multiply M from the right by (1 q ; 0
571
   1), affecting the second column of M. */
572
582k
      ASSERT (ah > bh);
573
582k
      sub_ddmmss (ah, al, ah, al, bh, bl);
574
575
582k
      if (ah < 2)
576
31
  goto done;
577
578
582k
      if (ah <= bh)
579
232k
  {
580
    /* Use q = 1 */
581
232k
    u01 += u00;
582
232k
    u11 += u10;
583
232k
  }
584
349k
      else
585
349k
  {
586
349k
    mp_limb_t r[2];
587
349k
    mp_limb_t q = div2 (r, ah, al, bh, bl);
588
349k
    al = r[0]; ah = r[1];
589
349k
    if (ah < 2)
590
449
      {
591
        /* A is too small, but q is correct. */
592
449
        u01 += q * u00;
593
449
        u11 += q * u10;
594
449
        goto done;
595
449
      }
596
349k
    q++;
597
349k
    u01 += q * u00;
598
349k
    u11 += q * u10;
599
349k
  }
600
615k
    subtract_a:
601
615k
      ASSERT (bh >= ah);
602
615k
      if (ah == bh)
603
662
  goto done;
604
605
614k
      if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
606
31.8k
  {
607
31.8k
    ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
608
31.8k
    bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));
609
610
31.8k
    goto subtract_a1;
611
31.8k
  }
612
613
      /* Subtract b -= q a, and multiply M from the right by (1 0 ; q
614
   1), affecting the first column of M. */
615
582k
      sub_ddmmss (bh, bl, bh, bl, ah, al);
616
617
582k
      if (bh < 2)
618
34
  goto done;
619
620
582k
      if (bh <= ah)
621
231k
  {
622
    /* Use q = 1 */
623
231k
    u00 += u01;
624
231k
    u10 += u11;
625
231k
  }
626
351k
      else
627
351k
  {
628
351k
    mp_limb_t r[2];
629
351k
    mp_limb_t q = div2 (r, bh, bl, ah, al);
630
351k
    bl = r[0]; bh = r[1];
631
351k
    if (bh < 2)
632
511
      {
633
        /* B is too small, but q is correct. */
634
511
        u00 += q * u01;
635
511
        u10 += q * u11;
636
511
        goto done;
637
511
      }
638
351k
    q++;
639
351k
    u00 += q * u01;
640
351k
    u10 += q * u11;
641
351k
  }
642
582k
    }
643
644
  /* NOTE: Since we discard the least significant half limb, we don't get a
645
     truly maximal M (corresponding to |a - b| < 2^{GMP_LIMB_BITS +1}). */
646
  /* Single precision loop */
647
31.9k
  for (;;)
648
537k
    {
649
537k
      ASSERT (ah >= bh);
650
651
537k
      ah -= bh;
652
537k
      if (ah < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
653
14.5k
  break;
654
655
523k
      if (ah <= bh)
656
218k
  {
657
    /* Use q = 1 */
658
218k
    u01 += u00;
659
218k
    u11 += u10;
660
218k
  }
661
304k
      else
662
304k
  {
663
304k
    mp_double_limb_t rq = div1 (ah, bh);
664
304k
    mp_limb_t q = rq.d1;
665
304k
    ah = rq.d0;
666
667
304k
    if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
668
17.4k
      {
669
        /* A is too small, but q is correct. */
670
17.4k
        u01 += q * u00;
671
17.4k
        u11 += q * u10;
672
17.4k
        break;
673
17.4k
      }
674
287k
    q++;
675
287k
    u01 += q * u00;
676
287k
    u11 += q * u10;
677
287k
  }
678
537k
    subtract_a1:
679
537k
      ASSERT (bh >= ah);
680
681
537k
      bh -= ah;
682
537k
      if (bh < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
683
14.4k
  break;
684
685
523k
      if (bh <= ah)
686
218k
  {
687
    /* Use q = 1 */
688
218k
    u00 += u01;
689
218k
    u10 += u11;
690
218k
  }
691
305k
      else
692
305k
  {
693
305k
    mp_double_limb_t rq = div1 (bh, ah);
694
305k
    mp_limb_t q = rq.d1;
695
305k
    bh = rq.d0;
696
697
305k
    if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
698
17.4k
      {
699
        /* B is too small, but q is correct. */
700
17.4k
        u00 += q * u01;
701
17.4k
        u10 += q * u11;
702
17.4k
        break;
703
17.4k
      }
704
287k
    q++;
705
287k
    u00 += q * u01;
706
287k
    u10 += q * u11;
707
287k
  }
708
523k
    }
709
710
66.1k
 done:
711
66.1k
  M->u[0][0] = u00; M->u[0][1] = u01;
712
66.1k
  M->u[1][0] = u10; M->u[1][1] = u11;
713
714
66.1k
  return 1;
715
31.9k
}
716
717
/* Sets (r;b) = (a;b) M, with M = (u00, u01; u10, u11). Vector must
718
 * have space for n + 1 limbs. Uses three buffers to avoid a copy*/
719
mp_size_t
720
mpn_hgcd_mul_matrix1_vector (const struct hgcd_matrix1 *M,
721
           mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n)
722
59.8k
{
723
59.8k
  mp_limb_t ah, bh;
724
725
  /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as
726
727
     r  = u00 * a
728
     r += u10 * b
729
     b *= u11
730
     b += u01 * a
731
  */
732
733
59.8k
#if HAVE_NATIVE_mpn_addaddmul_1msb0
734
59.8k
  ah = mpn_addaddmul_1msb0 (rp, ap, bp, n, M->u[0][0], M->u[1][0]);
735
59.8k
  bh = mpn_addaddmul_1msb0 (bp, bp, ap, n, M->u[1][1], M->u[0][1]);
736
#else
737
  ah =     mpn_mul_1 (rp, ap, n, M->u[0][0]);
738
  ah += mpn_addmul_1 (rp, bp, n, M->u[1][0]);
739
740
  bh =     mpn_mul_1 (bp, bp, n, M->u[1][1]);
741
  bh += mpn_addmul_1 (bp, ap, n, M->u[0][1]);
742
#endif
743
59.8k
  rp[n] = ah;
744
59.8k
  bp[n] = bh;
745
746
59.8k
  n += (ah | bh) > 0;
747
59.8k
  return n;
748
59.8k
}