Coverage Report

Created: 2025-03-09 06:52

/src/gmp-6.2.1/mpn/hgcd2.c
Line
Count
Source
1
/* hgcd2.c
2
3
   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
4
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
5
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
6
7
Copyright 1996, 1998, 2000-2004, 2008, 2012, 2019 Free Software Foundation,
8
Inc.
9
10
This file is part of the GNU MP Library.
11
12
The GNU MP Library is free software; you can redistribute it and/or modify
13
it under the terms of either:
14
15
  * the GNU Lesser General Public License as published by the Free
16
    Software Foundation; either version 3 of the License, or (at your
17
    option) any later version.
18
19
or
20
21
  * the GNU General Public License as published by the Free Software
22
    Foundation; either version 2 of the License, or (at your option) any
23
    later version.
24
25
or both in parallel, as here.
26
27
The GNU MP Library is distributed in the hope that it will be useful, but
28
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
29
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
30
for more details.
31
32
You should have received copies of the GNU General Public License and the
33
GNU Lesser General Public License along with the GNU MP Library.  If not,
34
see https://www.gnu.org/licenses/.  */
35
36
#include "gmp-impl.h"
37
#include "longlong.h"
38
39
#ifndef HGCD2_DIV1_METHOD
40
#define HGCD2_DIV1_METHOD 3
41
#endif
42
43
#ifndef HGCD2_DIV2_METHOD
44
#define HGCD2_DIV2_METHOD 2
45
#endif
46
47
#if GMP_NAIL_BITS != 0
48
#error Nails not implemented
49
#endif
50
51
#if HAVE_NATIVE_mpn_div_11
52
53
#define div1 mpn_div_11
54
/* Single-limb division optimized for small quotients.
55
   Returned value holds d0 = r, d1 = q. */
56
mp_double_limb_t div1 (mp_limb_t, mp_limb_t);
57
58
#elif HGCD2_DIV1_METHOD == 1
59
60
static inline mp_double_limb_t
61
div1 (mp_limb_t n0, mp_limb_t d0)
62
{
63
  mp_double_limb_t res;
64
  res.d1 = n0 / d0;
65
  res.d0 = n0 - res.d1 * d0;
66
67
  return res;
68
}
69
70
#elif HGCD2_DIV1_METHOD == 2
71
72
static mp_double_limb_t
73
div1 (mp_limb_t n0, mp_limb_t d0)
74
{
75
  mp_double_limb_t res;
76
  int ncnt, dcnt, cnt;
77
  mp_limb_t q;
78
  mp_limb_t mask;
79
80
  ASSERT (n0 >= d0);
81
82
  count_leading_zeros (ncnt, n0);
83
  count_leading_zeros (dcnt, d0);
84
  cnt = dcnt - ncnt;
85
86
  d0 <<= cnt;
87
88
  q = -(mp_limb_t) (n0 >= d0);
89
  n0 -= d0 & q;
90
  d0 >>= 1;
91
  q = -q;
92
93
  while (--cnt >= 0)
94
    {
95
      mask = -(mp_limb_t) (n0 >= d0);
96
      n0 -= d0 & mask;
97
      d0 >>= 1;
98
      q = (q << 1) - mask;
99
    }
100
101
  res.d0 = n0;
102
  res.d1 = q;
103
  return res;
104
}
105
106
#elif HGCD2_DIV1_METHOD == 3
107
108
static inline mp_double_limb_t
109
div1 (mp_limb_t n0, mp_limb_t d0)
110
250k
{
111
250k
  mp_double_limb_t res;
112
250k
  if (UNLIKELY ((d0 >> (GMP_LIMB_BITS - 3)) != 0)
113
250k
      || UNLIKELY (n0 >= (d0 << 3)))
114
67.5k
    {
115
67.5k
      res.d1 = n0 / d0;
116
67.5k
      res.d0 = n0 - res.d1 * d0;
117
67.5k
    }
118
183k
  else
119
183k
    {
120
183k
      mp_limb_t q, mask;
121
122
183k
      d0 <<= 2;
123
124
183k
      mask = -(mp_limb_t) (n0 >= d0);
125
183k
      n0 -= d0 & mask;
126
183k
      q = 4 & mask;
127
128
183k
      d0 >>= 1;
129
183k
      mask = -(mp_limb_t) (n0 >= d0);
130
183k
      n0 -= d0 & mask;
131
183k
      q += 2 & mask;
132
133
183k
      d0 >>= 1;
134
183k
      mask = -(mp_limb_t) (n0 >= d0);
135
183k
      n0 -= d0 & mask;
136
183k
      q -= mask;
137
138
183k
      res.d0 = n0;
139
183k
      res.d1 = q;
140
183k
    }
141
250k
  return res;
142
250k
}
143
144
#elif HGCD2_DIV1_METHOD == 4
145
146
/* Table quotients.  We extract the NBITS most significant bits of the
147
   numerator limb, and the corresponding bits from the divisor limb, and use
148
   these to form an index into the table.  This method is probably only useful
149
   for short pipelines with slow multiplication.
150
151
   Possible improvements:
152
153
   * Perhaps extract the highest NBITS of the divisor instead of the same bits
154
     as from the numerator.  That would require another count_leading_zeros,
155
     and a post-multiply shift of the quotient.
156
157
   * Compress tables?  Their values are tiny, and there are lots of zero
158
     entries (which are never used).
159
160
   * Round the table entries more cleverly?
161
*/
162
163
#ifndef NBITS
164
#define NBITS 5
165
#endif
166
167
#if NBITS == 5
168
/* This needs full division about 13.2% of the time. */
169
static const unsigned char tab[512] = {
170
17, 9, 5,4,3,2,2,2,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
171
18, 9, 6,4,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
172
19,10, 6,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
173
20,10, 6,5,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
174
21,11, 7,5,4,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
175
22,11, 7,5,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
176
23,12, 7,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
177
24,12, 8,6,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
178
25,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
179
26,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
180
27,14, 9,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
181
28,14, 9,7,5,4,3,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
182
29,15,10,7,5,4,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
183
30,15,10,7,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
184
31,16,10,7,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
185
32,16,11,8,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
186
};
187
#elif NBITS == 6
188
/* This needs full division about 9.8% of the time. */
189
static const unsigned char tab[2048] = {
190
33,17,11, 8, 6, 5,4,4,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
191
 1, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
192
34,17,11, 8, 6, 5,4,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
193
 1, 1, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
194
35,18,12, 9, 7, 5,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
195
 1, 1, 1, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
196
36,18,12, 9, 7, 6,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
197
 1, 1, 1, 1, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
198
37,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
199
 1, 1, 1, 1, 1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
200
38,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
201
 1, 1, 1, 1, 1, 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
202
39,20,13,10, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
203
 1, 1, 1, 1, 1, 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
204
40,20,14,10, 8, 6,5,5,4,3,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
205
 1, 1, 1, 1, 1, 1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
206
41,21,14,10, 8, 6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
207
 1, 1, 1, 1, 1, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
208
42,21,14,10, 8, 7,6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
209
 1, 1, 1, 1, 1, 1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
210
43,22,15,11, 8, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
211
 1, 1, 1, 1, 1, 1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
212
44,22,15,11, 9, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
213
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
214
45,23,15,11, 9, 7,6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
215
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
216
46,23,16,11, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
217
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
218
47,24,16,12, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
219
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
220
48,24,16,12, 9, 8,6,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
221
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
222
49,25,17,12,10, 8,7,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
223
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
224
50,25,17,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
225
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
226
51,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
227
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
228
52,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
229
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
230
53,27,18,13,10, 9,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
231
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
232
54,27,19,14,11, 9,7,6,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
233
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
234
55,28,19,14,11, 9,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
235
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
236
56,28,19,14,11, 9,8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
237
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
238
57,29,20,14,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,1,
239
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
240
58,29,20,15,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,
241
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
242
59,30,20,15,12,10,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
243
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
244
60,30,21,15,12,10,8,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
245
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
246
61,31,21,15,12,10,8,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
247
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
248
62,31,22,16,12,10,9,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
249
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
250
63,32,22,16,13,10,9,7,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,1,
251
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
252
64,32,22,16,13,10,9,8,7,6,5,5,4,4,4,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,
253
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
254
};
255
#else
256
#error No table for provided NBITS
257
#endif
258
259
/* Doing tabp with a #define makes compiler warnings about pointing outside an
260
   object go away.  We used to define this as a variable.  It is not clear if
261
   e.g.  (vector[100] - 10) + 10 is well- defined as per the C standard;
262
   (vector[100] + 10) - 10 surely is and there is no sequence point so the
263
   expressions should be equivalent.  To make this safe, we might want to
264
   define tabp as a macro with the index as an argument.  Depending on the
265
   platform, relocs might allow for assembly-time or linker-time resolution to
266
   take place. */
267
#define tabp (tab - (1 << (NBITS - 1) << NBITS))
268
269
static inline mp_double_limb_t
270
div1 (mp_limb_t n0, mp_limb_t d0)
271
{
272
  int ncnt;
273
  size_t nbi, dbi;
274
  mp_limb_t q0;
275
  mp_limb_t r0;
276
  mp_limb_t mask;
277
  mp_double_limb_t res;
278
279
  ASSERT (n0 >= d0);    /* Actually only msb position is critical. */
280
281
  count_leading_zeros (ncnt, n0);
282
  nbi = n0 << ncnt >> (GMP_LIMB_BITS - NBITS);
283
  dbi = d0 << ncnt >> (GMP_LIMB_BITS - NBITS);
284
285
  q0 = tabp[(nbi << NBITS) + dbi];
286
  r0 = n0 - q0 * d0;
287
  mask = -(mp_limb_t) (r0 >= d0);
288
  q0 -= mask;
289
  r0 -= d0 & mask;
290
291
  if (UNLIKELY (r0 >= d0))
292
    {
293
      q0 = n0 / d0;
294
      r0 = n0 - q0 * d0;
295
    }
296
297
  res.d1 = q0;
298
  res.d0 = r0;
299
  return res;
300
}
301
302
#elif HGCD2_DIV1_METHOD == 5
303
304
/* Table inverses of divisors.  We don't bother with suppressing the msb from
305
   the tables.  We index with the NBITS most significant divisor bits,
306
   including the always-set highest bit, but use addressing trickery via tabp
307
   to suppress it.
308
309
   Possible improvements:
310
311
   * Do first multiply using 32-bit operations on 64-bit computers.  At least
312
     on most Arm64 cores, that uses 3 times less resources.  It also saves on
313
     many x86-64 processors.
314
*/
315
316
#ifndef NBITS
317
#define NBITS 7
318
#endif
319
320
#if NBITS == 5
321
/* This needs full division about 1.63% of the time. */
322
static const unsigned char tab[16] = {
323
 63, 59, 55, 52, 50, 47, 45, 43, 41, 39, 38, 36, 35, 34, 33, 32
324
};
325
#elif NBITS == 6
326
/* This needs full division about 0.93% of the time. */
327
static const unsigned char tab[32] = {
328
127,123,119,116,112,109,106,104,101, 98, 96, 94, 92, 90, 88, 86,
329
 84, 82, 80, 79, 77, 76, 74, 73, 72, 70, 69, 68, 67, 66, 65, 64
330
};
331
#elif NBITS == 7
332
/* This needs full division about 0.49% of the time. */
333
static const unsigned char tab[64] = {
334
255,251,247,243,239,236,233,229,226,223,220,217,214,211,209,206,
335
203,201,198,196,194,191,189,187,185,183,181,179,177,175,173,171,
336
169,167,166,164,162,161,159,158,156,155,153,152,150,149,147,146,
337
145,143,142,141,140,139,137,136,135,134,133,132,131,130,129,128
338
};
339
#elif NBITS == 8
340
/* This needs full division about 0.26% of the time. */
341
static const unsigned short tab[128] = {
342
511,507,503,499,495,491,488,484,480,477,473,470,467,463,460,457,
343
454,450,447,444,441,438,435,433,430,427,424,421,419,416,413,411,
344
408,406,403,401,398,396,393,391,389,386,384,382,380,377,375,373,
345
371,369,367,365,363,361,359,357,355,353,351,349,347,345,343,342,
346
340,338,336,335,333,331,329,328,326,325,323,321,320,318,317,315,
347
314,312,311,309,308,306,305,303,302,301,299,298,296,295,294,292,
348
291,290,288,287,286,285,283,282,281,280,279,277,276,275,274,273,
349
272,270,269,268,267,266,265,264,263,262,261,260,259,258,257,256
350
};
351
#else
352
#error No table for provided NBITS
353
#endif
354
355
/* Doing tabp with a #define makes compiler warnings about pointing outside an
356
   object go away.  We used to define this as a variable.  It is not clear if
357
   e.g.  (vector[100] - 10) + 10 is well- defined as per the C standard;
358
   (vector[100] + 10) - 10 surely is and there is no sequence point so the
359
   expressions should be equivalent.  To make this safe, we might want to
360
   define tabp as a macro with the index as an argument.  Depending on the
361
   platform, relocs might allow for assembly-time or linker-time resolution to
362
   take place. */
363
#define tabp (tab - (1 << (NBITS - 1)))
364
365
static inline mp_double_limb_t
366
div1 (mp_limb_t n0, mp_limb_t d0)
367
{
368
  int ncnt, dcnt;
369
  size_t dbi;
370
  mp_limb_t inv;
371
  mp_limb_t q0;
372
  mp_limb_t r0;
373
  mp_limb_t mask;
374
  mp_double_limb_t res;
375
376
  count_leading_zeros (ncnt, n0);
377
  count_leading_zeros (dcnt, d0);
378
379
  dbi = d0 << dcnt >> (GMP_LIMB_BITS - NBITS);
380
  inv = tabp[dbi];
381
  q0 = ((n0 << ncnt) >> (NBITS + 1)) * inv >> (GMP_LIMB_BITS - 1 + ncnt - dcnt);
382
  r0 = n0 - q0 * d0;
383
  mask = -(mp_limb_t) (r0 >= d0);
384
  q0 -= mask;
385
  r0 -= d0 & mask;
386
387
  if (UNLIKELY (r0 >= d0))
388
    {
389
      q0 = n0 / d0;
390
      r0 = n0 - q0 * d0;
391
    }
392
393
  res.d1 = q0;
394
  res.d0 = r0;
395
  return res;
396
}
397
398
#else
399
#error Unknown HGCD2_DIV1_METHOD
400
#endif
401
402
#if HAVE_NATIVE_mpn_div_22
403
404
#define div2 mpn_div_22
405
/* Two-limb division optimized for small quotients.  */
406
mp_limb_t div2 (mp_ptr, mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t);
407
408
#elif HGCD2_DIV2_METHOD == 1
409
410
static mp_limb_t
411
div2 (mp_ptr rp,
412
      mp_limb_t n1, mp_limb_t n0,
413
      mp_limb_t d1, mp_limb_t d0)
414
{
415
  mp_double_limb_t rq = div1 (n1, d1);
416
  if (UNLIKELY (rq.d1 > d1))
417
    {
418
      mp_limb_t n2, q, t1, t0;
419
      int c;
420
421
      /* Normalize */
422
      count_leading_zeros (c, d1);
423
      ASSERT (c > 0);
424
425
      n2 = n1 >> (GMP_LIMB_BITS - c);
426
      n1 = (n1 << c) | (n0 >> (GMP_LIMB_BITS - c));
427
      n0 <<= c;
428
      d1 = (d1 << c) | (d0 >> (GMP_LIMB_BITS - c));
429
      d0 <<= c;
430
431
      udiv_qrnnd (q, n1, n2, n1, d1);
432
      umul_ppmm (t1, t0, q, d0);
433
      if (t1 > n1 || (t1 == n1 && t0 > n0))
434
  {
435
    ASSERT (q > 0);
436
    q--;
437
    sub_ddmmss (t1, t0, t1, t0, d1, d0);
438
  }
439
      sub_ddmmss (n1, n0, n1, n0, t1, t0);
440
441
      /* Undo normalization */
442
      rp[0] = (n0 >> c) | (n1 << (GMP_LIMB_BITS - c));
443
      rp[1] = n1 >> c;
444
445
      return q;
446
    }
447
  else
448
    {
449
      mp_limb_t q, t1, t0;
450
      n1 = rq.d0;
451
      q = rq.d1;
452
      umul_ppmm (t1, t0, q, d0);
453
      if (UNLIKELY (t1 >= n1) && (t1 > n1 || t0 > n0))
454
  {
455
    ASSERT (q > 0);
456
    q--;
457
    sub_ddmmss (t1, t0, t1, t0, d1, d0);
458
  }
459
      sub_ddmmss (rp[1], rp[0], n1, n0, t1, t0);
460
      return q;
461
    }
462
}
463
464
#elif HGCD2_DIV2_METHOD == 2
465
466
/* Bit-wise div2. Relies on fast count_leading_zeros. */
467
static mp_limb_t
468
div2 (mp_ptr rp,
469
      mp_limb_t n1, mp_limb_t n0,
470
      mp_limb_t d1, mp_limb_t d0)
471
292k
{
472
292k
  mp_limb_t q = 0;
473
292k
  int ncnt;
474
292k
  int dcnt;
475
476
292k
  count_leading_zeros (ncnt, n1);
477
292k
  count_leading_zeros (dcnt, d1);
478
292k
  dcnt -= ncnt;
479
480
292k
  d1 = (d1 << dcnt) + (d0 >> 1 >> (GMP_LIMB_BITS - 1 - dcnt));
481
292k
  d0 <<= dcnt;
482
483
292k
  do
484
998k
    {
485
998k
      mp_limb_t mask;
486
998k
      q <<= 1;
487
998k
      if (UNLIKELY (n1 == d1))
488
1.06k
  mask = -(n0 >= d0);
489
997k
      else
490
997k
  mask = -(n1 > d1);
491
492
998k
      q -= mask;
493
494
998k
      sub_ddmmss (n1, n0, n1, n0, mask & d1, mask & d0);
495
496
998k
      d0 = (d1 << (GMP_LIMB_BITS - 1)) | (d0 >> 1);
497
998k
      d1 = d1 >> 1;
498
998k
    }
499
998k
  while (dcnt--);
500
501
292k
  rp[0] = n0;
502
292k
  rp[1] = n1;
503
504
292k
  return q;
505
292k
}
506
#else
507
#error Unknown HGCD2_DIV2_METHOD
508
#endif
509
510
/* Reduces a,b until |a-b| (almost) fits in one limb + 1 bit. Constructs
511
   matrix M. Returns 1 if we make progress, i.e. can perform at least
512
   one subtraction. Otherwise returns zero. */
513
514
/* FIXME: Possible optimizations:
515
516
   The div2 function starts with checking the most significant bit of
517
   the numerator. We can maintained normalized operands here, call
518
   hgcd with normalized operands only, which should make the code
519
   simpler and possibly faster.
520
521
   Experiment with table lookups on the most significant bits.
522
523
   This function is also a candidate for assembler implementation.
524
*/
525
int
526
mpn_hgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl,
527
     struct hgcd_matrix1 *M)
528
30.4k
{
529
30.4k
  mp_limb_t u00, u01, u10, u11;
530
531
30.4k
  if (ah < 2 || bh < 2)
532
395
    return 0;
533
534
30.0k
  if (ah > bh || (ah == bh && al > bl))
535
14.3k
    {
536
14.3k
      sub_ddmmss (ah, al, ah, al, bh, bl);
537
14.3k
      if (ah < 2)
538
671
  return 0;
539
540
13.6k
      u00 = u01 = u11 = 1;
541
13.6k
      u10 = 0;
542
13.6k
    }
543
15.7k
  else
544
15.7k
    {
545
15.7k
      sub_ddmmss (bh, bl, bh, bl, ah, al);
546
15.7k
      if (bh < 2)
547
1.27k
  return 0;
548
549
14.4k
      u00 = u10 = u11 = 1;
550
14.4k
      u01 = 0;
551
14.4k
    }
552
553
28.0k
  if (ah < bh)
554
13.9k
    goto subtract_a;
555
556
14.1k
  for (;;)
557
257k
    {
558
257k
      ASSERT (ah >= bh);
559
257k
      if (ah == bh)
560
376
  goto done;
561
562
257k
      if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
563
13.1k
  {
564
13.1k
    ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
565
13.1k
    bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));
566
567
13.1k
    break;
568
13.1k
  }
569
570
      /* Subtract a -= q b, and multiply M from the right by (1 q ; 0
571
   1), affecting the second column of M. */
572
243k
      ASSERT (ah > bh);
573
243k
      sub_ddmmss (ah, al, ah, al, bh, bl);
574
575
243k
      if (ah < 2)
576
27
  goto done;
577
578
243k
      if (ah <= bh)
579
97.5k
  {
580
    /* Use q = 1 */
581
97.5k
    u01 += u00;
582
97.5k
    u11 += u10;
583
97.5k
  }
584
146k
      else
585
146k
  {
586
146k
    mp_limb_t r[2];
587
146k
    mp_limb_t q = div2 (r, ah, al, bh, bl);
588
146k
    al = r[0]; ah = r[1];
589
146k
    if (ah < 2)
590
332
      {
591
        /* A is too small, but q is correct. */
592
332
        u01 += q * u00;
593
332
        u11 += q * u10;
594
332
        goto done;
595
332
      }
596
146k
    q++;
597
146k
    u01 += q * u00;
598
146k
    u11 += q * u10;
599
146k
  }
600
257k
    subtract_a:
601
257k
      ASSERT (bh >= ah);
602
257k
      if (ah == bh)
603
407
  goto done;
604
605
257k
      if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
606
13.4k
  {
607
13.4k
    ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
608
13.4k
    bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));
609
610
13.4k
    goto subtract_a1;
611
13.4k
  }
612
613
      /* Subtract b -= q a, and multiply M from the right by (1 0 ; q
614
   1), affecting the first column of M. */
615
243k
      sub_ddmmss (bh, bl, bh, bl, ah, al);
616
617
243k
      if (bh < 2)
618
23
  goto done;
619
620
243k
      if (bh <= ah)
621
97.1k
  {
622
    /* Use q = 1 */
623
97.1k
    u00 += u01;
624
97.1k
    u10 += u11;
625
97.1k
  }
626
146k
      else
627
146k
  {
628
146k
    mp_limb_t r[2];
629
146k
    mp_limb_t q = div2 (r, bh, bl, ah, al);
630
146k
    bl = r[0]; bh = r[1];
631
146k
    if (bh < 2)
632
335
      {
633
        /* B is too small, but q is correct. */
634
335
        u00 += q * u01;
635
335
        u10 += q * u11;
636
335
        goto done;
637
335
      }
638
146k
    q++;
639
146k
    u00 += q * u01;
640
146k
    u10 += q * u11;
641
146k
  }
642
243k
    }
643
644
  /* NOTE: Since we discard the least significant half limb, we don't get a
645
     truly maximal M (corresponding to |a - b| < 2^{GMP_LIMB_BITS +1}). */
646
  /* Single precision loop */
647
13.1k
  for (;;)
648
220k
    {
649
220k
      ASSERT (ah >= bh);
650
651
220k
      ah -= bh;
652
220k
      if (ah < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
653
6.00k
  break;
654
655
214k
      if (ah <= bh)
656
89.0k
  {
657
    /* Use q = 1 */
658
89.0k
    u01 += u00;
659
89.0k
    u11 += u10;
660
89.0k
  }
661
125k
      else
662
125k
  {
663
125k
    mp_double_limb_t rq = div1 (ah, bh);
664
125k
    mp_limb_t q = rq.d1;
665
125k
    ah = rq.d0;
666
667
125k
    if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
668
7.23k
      {
669
        /* A is too small, but q is correct. */
670
7.23k
        u01 += q * u00;
671
7.23k
        u11 += q * u10;
672
7.23k
        break;
673
7.23k
      }
674
118k
    q++;
675
118k
    u01 += q * u00;
676
118k
    u11 += q * u10;
677
118k
  }
678
220k
    subtract_a1:
679
220k
      ASSERT (bh >= ah);
680
681
220k
      bh -= ah;
682
220k
      if (bh < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
683
6.10k
  break;
684
685
214k
      if (bh <= ah)
686
89.9k
  {
687
    /* Use q = 1 */
688
89.9k
    u00 += u01;
689
89.9k
    u10 += u11;
690
89.9k
  }
691
124k
      else
692
124k
  {
693
124k
    mp_double_limb_t rq = div1 (bh, ah);
694
124k
    mp_limb_t q = rq.d1;
695
124k
    bh = rq.d0;
696
697
124k
    if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
698
7.24k
      {
699
        /* B is too small, but q is correct. */
700
7.24k
        u00 += q * u01;
701
7.24k
        u10 += q * u11;
702
7.24k
        break;
703
7.24k
      }
704
117k
    q++;
705
117k
    u00 += q * u01;
706
117k
    u10 += q * u11;
707
117k
  }
708
214k
    }
709
710
28.0k
 done:
711
28.0k
  M->u[0][0] = u00; M->u[0][1] = u01;
712
28.0k
  M->u[1][0] = u10; M->u[1][1] = u11;
713
714
28.0k
  return 1;
715
13.1k
}
716
717
/* Sets (r;b) = (a;b) M, with M = (u00, u01; u10, u11). Vector must
718
 * have space for n + 1 limbs. Uses three buffers to avoid a copy*/
719
mp_size_t
720
mpn_hgcd_mul_matrix1_vector (const struct hgcd_matrix1 *M,
721
           mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n)
722
25.1k
{
723
25.1k
  mp_limb_t ah, bh;
724
725
  /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as
726
727
     r  = u00 * a
728
     r += u10 * b
729
     b *= u11
730
     b += u01 * a
731
  */
732
733
25.1k
#if HAVE_NATIVE_mpn_addaddmul_1msb0
734
25.1k
  ah = mpn_addaddmul_1msb0 (rp, ap, bp, n, M->u[0][0], M->u[1][0]);
735
25.1k
  bh = mpn_addaddmul_1msb0 (bp, bp, ap, n, M->u[1][1], M->u[0][1]);
736
#else
737
  ah =     mpn_mul_1 (rp, ap, n, M->u[0][0]);
738
  ah += mpn_addmul_1 (rp, bp, n, M->u[1][0]);
739
740
  bh =     mpn_mul_1 (bp, bp, n, M->u[1][1]);
741
  bh += mpn_addmul_1 (bp, ap, n, M->u[0][1]);
742
#endif
743
25.1k
  rp[n] = ah;
744
25.1k
  bp[n] = bh;
745
746
25.1k
  n += (ah | bh) > 0;
747
25.1k
  return n;
748
25.1k
}