Coverage Report

Created: 2024-11-21 07:03

/src/libgmp/mpn/generic/hgcd2-div.h
Line
Count
Source
1
/* hgcd2-div.h
2
3
   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
4
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
5
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
6
7
Copyright 1996, 1998, 2000-2004, 2008, 2012, 2019, 2020 Free Software
8
Foundation, Inc.
9
10
This file is part of the GNU MP Library.
11
12
The GNU MP Library is free software; you can redistribute it and/or modify
13
it under the terms of either:
14
15
  * the GNU Lesser General Public License as published by the Free
16
    Software Foundation; either version 3 of the License, or (at your
17
    option) any later version.
18
19
or
20
21
  * the GNU General Public License as published by the Free Software
22
    Foundation; either version 2 of the License, or (at your option) any
23
    later version.
24
25
or both in parallel, as here.
26
27
The GNU MP Library is distributed in the hope that it will be useful, but
28
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
29
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
30
for more details.
31
32
You should have received copies of the GNU General Public License and the
33
GNU Lesser General Public License along with the GNU MP Library.  If not,
34
see https://www.gnu.org/licenses/.  */
35
36
#include "gmp-impl.h"
37
#include "longlong.h"
38
39
#ifndef HGCD2_DIV1_METHOD
40
#define HGCD2_DIV1_METHOD 3
41
#endif
42
43
#ifndef HGCD2_DIV2_METHOD
44
#define HGCD2_DIV2_METHOD 2
45
#endif
46
47
#if HAVE_NATIVE_mpn_div_11
48
49
#define div1 mpn_div_11
50
/* Single-limb division optimized for small quotients.
51
   Returned value holds d0 = r, d1 = q. */
52
mp_double_limb_t div1 (mp_limb_t, mp_limb_t);
53
54
#elif HGCD2_DIV1_METHOD == 1
55
56
static inline mp_double_limb_t
57
div1 (mp_limb_t n0, mp_limb_t d0)
58
{
59
  mp_double_limb_t res;
60
  res.d1 = n0 / d0;
61
  res.d0 = n0 - res.d1 * d0;
62
63
  return res;
64
}
65
66
#elif HGCD2_DIV1_METHOD == 2
67
68
static mp_double_limb_t
69
div1 (mp_limb_t n0, mp_limb_t d0)
70
{
71
  mp_double_limb_t res;
72
  int ncnt, dcnt, cnt;
73
  mp_limb_t q;
74
  mp_limb_t mask;
75
76
  ASSERT (n0 >= d0);
77
78
  count_leading_zeros (ncnt, n0);
79
  count_leading_zeros (dcnt, d0);
80
  cnt = dcnt - ncnt;
81
82
  d0 <<= cnt;
83
84
  q = -(mp_limb_t) (n0 >= d0);
85
  n0 -= d0 & q;
86
  d0 >>= 1;
87
  q = -q;
88
89
  while (--cnt >= 0)
90
    {
91
      mask = -(mp_limb_t) (n0 >= d0);
92
      n0 -= d0 & mask;
93
      d0 >>= 1;
94
      q = (q << 1) - mask;
95
    }
96
97
  res.d0 = n0;
98
  res.d1 = q;
99
  return res;
100
}
101
102
#elif HGCD2_DIV1_METHOD == 3
103
104
static inline mp_double_limb_t
105
div1 (mp_limb_t n0, mp_limb_t d0)
106
707k
{
107
707k
  mp_double_limb_t res;
108
707k
  if (UNLIKELY ((d0 >> (GMP_LIMB_BITS - 3)) != 0)
109
707k
      || UNLIKELY (n0 >= (d0 << 3)))
110
191k
    {
111
191k
      res.d1 = n0 / d0;
112
191k
      res.d0 = n0 - res.d1 * d0;
113
191k
    }
114
516k
  else
115
516k
    {
116
516k
      mp_limb_t q, mask;
117
118
516k
      d0 <<= 2;
119
120
516k
      mask = -(mp_limb_t) (n0 >= d0);
121
516k
      n0 -= d0 & mask;
122
516k
      q = 4 & mask;
123
124
516k
      d0 >>= 1;
125
516k
      mask = -(mp_limb_t) (n0 >= d0);
126
516k
      n0 -= d0 & mask;
127
516k
      q += 2 & mask;
128
129
516k
      d0 >>= 1;
130
516k
      mask = -(mp_limb_t) (n0 >= d0);
131
516k
      n0 -= d0 & mask;
132
516k
      q -= mask;
133
134
516k
      res.d0 = n0;
135
516k
      res.d1 = q;
136
516k
    }
137
707k
  return res;
138
707k
}
hgcd2.c:div1
Line
Count
Source
106
473k
{
107
473k
  mp_double_limb_t res;
108
473k
  if (UNLIKELY ((d0 >> (GMP_LIMB_BITS - 3)) != 0)
109
473k
      || UNLIKELY (n0 >= (d0 << 3)))
110
128k
    {
111
128k
      res.d1 = n0 / d0;
112
128k
      res.d0 = n0 - res.d1 * d0;
113
128k
    }
114
345k
  else
115
345k
    {
116
345k
      mp_limb_t q, mask;
117
118
345k
      d0 <<= 2;
119
120
345k
      mask = -(mp_limb_t) (n0 >= d0);
121
345k
      n0 -= d0 & mask;
122
345k
      q = 4 & mask;
123
124
345k
      d0 >>= 1;
125
345k
      mask = -(mp_limb_t) (n0 >= d0);
126
345k
      n0 -= d0 & mask;
127
345k
      q += 2 & mask;
128
129
345k
      d0 >>= 1;
130
345k
      mask = -(mp_limb_t) (n0 >= d0);
131
345k
      n0 -= d0 & mask;
132
345k
      q -= mask;
133
134
345k
      res.d0 = n0;
135
345k
      res.d1 = q;
136
345k
    }
137
473k
  return res;
138
473k
}
hgcd2_jacobi.c:div1
Line
Count
Source
106
234k
{
107
234k
  mp_double_limb_t res;
108
234k
  if (UNLIKELY ((d0 >> (GMP_LIMB_BITS - 3)) != 0)
109
234k
      || UNLIKELY (n0 >= (d0 << 3)))
110
63.2k
    {
111
63.2k
      res.d1 = n0 / d0;
112
63.2k
      res.d0 = n0 - res.d1 * d0;
113
63.2k
    }
114
171k
  else
115
171k
    {
116
171k
      mp_limb_t q, mask;
117
118
171k
      d0 <<= 2;
119
120
171k
      mask = -(mp_limb_t) (n0 >= d0);
121
171k
      n0 -= d0 & mask;
122
171k
      q = 4 & mask;
123
124
171k
      d0 >>= 1;
125
171k
      mask = -(mp_limb_t) (n0 >= d0);
126
171k
      n0 -= d0 & mask;
127
171k
      q += 2 & mask;
128
129
171k
      d0 >>= 1;
130
171k
      mask = -(mp_limb_t) (n0 >= d0);
131
171k
      n0 -= d0 & mask;
132
171k
      q -= mask;
133
134
171k
      res.d0 = n0;
135
171k
      res.d1 = q;
136
171k
    }
137
234k
  return res;
138
234k
}
139
140
#elif HGCD2_DIV1_METHOD == 4
141
142
/* Table quotients.  We extract the NBITS most significant bits of the
143
   numerator limb, and the corresponding bits from the divisor limb, and use
144
   these to form an index into the table.  This method is probably only useful
145
   for short pipelines with slow multiplication.
146
147
   Possible improvements:
148
149
   * Perhaps extract the highest NBITS of the divisor instead of the same bits
150
     as from the numerator.  That would require another count_leading_zeros,
151
     and a post-multiply shift of the quotient.
152
153
   * Compress tables?  Their values are tiny, and there are lots of zero
154
     entries (which are never used).
155
156
   * Round the table entries more cleverly?
157
*/
158
159
#ifndef NBITS
160
#define NBITS 5
161
#endif
162
163
#if NBITS == 5
164
/* This needs full division about 13.2% of the time. */
165
static const unsigned char tab[512] = {
166
17, 9, 5,4,3,2,2,2,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
167
18, 9, 6,4,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
168
19,10, 6,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
169
20,10, 6,5,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
170
21,11, 7,5,4,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
171
22,11, 7,5,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
172
23,12, 7,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
173
24,12, 8,6,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
174
25,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
175
26,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
176
27,14, 9,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
177
28,14, 9,7,5,4,3,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
178
29,15,10,7,5,4,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
179
30,15,10,7,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
180
31,16,10,7,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
181
32,16,11,8,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
182
};
183
#elif NBITS == 6
184
/* This needs full division about 9.8% of the time. */
185
static const unsigned char tab[2048] = {
186
33,17,11, 8, 6, 5,4,4,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
187
 1, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
188
34,17,11, 8, 6, 5,4,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
189
 1, 1, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
190
35,18,12, 9, 7, 5,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
191
 1, 1, 1, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
192
36,18,12, 9, 7, 6,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
193
 1, 1, 1, 1, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
194
37,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
195
 1, 1, 1, 1, 1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
196
38,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
197
 1, 1, 1, 1, 1, 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
198
39,20,13,10, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
199
 1, 1, 1, 1, 1, 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
200
40,20,14,10, 8, 6,5,5,4,3,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
201
 1, 1, 1, 1, 1, 1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
202
41,21,14,10, 8, 6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
203
 1, 1, 1, 1, 1, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
204
42,21,14,10, 8, 7,6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
205
 1, 1, 1, 1, 1, 1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
206
43,22,15,11, 8, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
207
 1, 1, 1, 1, 1, 1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
208
44,22,15,11, 9, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
209
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
210
45,23,15,11, 9, 7,6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
211
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
212
46,23,16,11, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
213
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
214
47,24,16,12, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
215
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
216
48,24,16,12, 9, 8,6,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
217
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
218
49,25,17,12,10, 8,7,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
219
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
220
50,25,17,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
221
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
222
51,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
223
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
224
52,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
225
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
226
53,27,18,13,10, 9,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
227
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
228
54,27,19,14,11, 9,7,6,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
229
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
230
55,28,19,14,11, 9,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
231
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
232
56,28,19,14,11, 9,8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
233
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
234
57,29,20,14,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,1,
235
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
236
58,29,20,15,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,
237
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
238
59,30,20,15,12,10,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
239
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
240
60,30,21,15,12,10,8,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
241
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
242
61,31,21,15,12,10,8,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
243
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
244
62,31,22,16,12,10,9,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
245
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
246
63,32,22,16,13,10,9,7,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,1,
247
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
248
64,32,22,16,13,10,9,8,7,6,5,5,4,4,4,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,
249
 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
250
};
251
#else
252
#error No table for provided NBITS
253
#endif
254
255
/* Doing tabp with a #define makes compiler warnings about pointing outside an
256
   object go away.  We used to define this as a variable.  It is not clear if
257
   e.g.  (vector[100] - 10) + 10 is well- defined as per the C standard;
258
   (vector[100] + 10) - 10 surely is and there is no sequence point so the
259
   expressions should be equivalent.  To make this safe, we might want to
260
   define tabp as a macro with the index as an argument.  Depending on the
261
   platform, relocs might allow for assembly-time or linker-time resolution to
262
   take place. */
263
#define tabp (tab - (1 << (NBITS - 1) << NBITS))
264
265
static inline mp_double_limb_t
266
div1 (mp_limb_t n0, mp_limb_t d0)
267
{
268
  int ncnt;
269
  size_t nbi, dbi;
270
  mp_limb_t q0;
271
  mp_limb_t r0;
272
  mp_limb_t mask;
273
  mp_double_limb_t res;
274
275
  ASSERT (n0 >= d0);    /* Actually only msb position is critical. */
276
277
  count_leading_zeros (ncnt, n0);
278
  nbi = n0 << ncnt >> (GMP_LIMB_BITS - NBITS);
279
  dbi = d0 << ncnt >> (GMP_LIMB_BITS - NBITS);
280
281
  q0 = tabp[(nbi << NBITS) + dbi];
282
  r0 = n0 - q0 * d0;
283
  mask = -(mp_limb_t) (r0 >= d0);
284
  q0 -= mask;
285
  r0 -= d0 & mask;
286
287
  if (UNLIKELY (r0 >= d0))
288
    {
289
      q0 = n0 / d0;
290
      r0 = n0 - q0 * d0;
291
    }
292
293
  res.d1 = q0;
294
  res.d0 = r0;
295
  return res;
296
}
297
298
#elif HGCD2_DIV1_METHOD == 5
299
300
/* Table inverses of divisors.  We don't bother with suppressing the msb from
301
   the tables.  We index with the NBITS most significant divisor bits,
302
   including the always-set highest bit, but use addressing trickery via tabp
303
   to suppress it.
304
305
   Possible improvements:
306
307
   * Do first multiply using 32-bit operations on 64-bit computers.  At least
308
     on most Arm64 cores, that uses 3 times less resources.  It also saves on
309
     many x86-64 processors.
310
*/
311
312
#ifndef NBITS
313
#define NBITS 7
314
#endif
315
316
#if NBITS == 5
317
/* This needs full division about 1.63% of the time. */
318
static const unsigned char tab[16] = {
319
 63, 59, 55, 52, 50, 47, 45, 43, 41, 39, 38, 36, 35, 34, 33, 32
320
};
321
#elif NBITS == 6
322
/* This needs full division about 0.93% of the time. */
323
static const unsigned char tab[32] = {
324
127,123,119,116,112,109,106,104,101, 98, 96, 94, 92, 90, 88, 86,
325
 84, 82, 80, 79, 77, 76, 74, 73, 72, 70, 69, 68, 67, 66, 65, 64
326
};
327
#elif NBITS == 7
328
/* This needs full division about 0.49% of the time. */
329
static const unsigned char tab[64] = {
330
255,251,247,243,239,236,233,229,226,223,220,217,214,211,209,206,
331
203,201,198,196,194,191,189,187,185,183,181,179,177,175,173,171,
332
169,167,166,164,162,161,159,158,156,155,153,152,150,149,147,146,
333
145,143,142,141,140,139,137,136,135,134,133,132,131,130,129,128
334
};
335
#elif NBITS == 8
336
/* This needs full division about 0.26% of the time. */
337
static const unsigned short tab[128] = {
338
511,507,503,499,495,491,488,484,480,477,473,470,467,463,460,457,
339
454,450,447,444,441,438,435,433,430,427,424,421,419,416,413,411,
340
408,406,403,401,398,396,393,391,389,386,384,382,380,377,375,373,
341
371,369,367,365,363,361,359,357,355,353,351,349,347,345,343,342,
342
340,338,336,335,333,331,329,328,326,325,323,321,320,318,317,315,
343
314,312,311,309,308,306,305,303,302,301,299,298,296,295,294,292,
344
291,290,288,287,286,285,283,282,281,280,279,277,276,275,274,273,
345
272,270,269,268,267,266,265,264,263,262,261,260,259,258,257,256
346
};
347
#else
348
#error No table for provided NBITS
349
#endif
350
351
/* Doing tabp with a #define makes compiler warnings about pointing outside an
352
   object go away.  We used to define this as a variable.  It is not clear if
353
   e.g.  (vector[100] - 10) + 10 is well- defined as per the C standard;
354
   (vector[100] + 10) - 10 surely is and there is no sequence point so the
355
   expressions should be equivalent.  To make this safe, we might want to
356
   define tabp as a macro with the index as an argument.  Depending on the
357
   platform, relocs might allow for assembly-time or linker-time resolution to
358
   take place. */
359
#define tabp (tab - (1 << (NBITS - 1)))
360
361
static inline mp_double_limb_t
362
div1 (mp_limb_t n0, mp_limb_t d0)
363
{
364
  int ncnt, dcnt;
365
  size_t dbi;
366
  mp_limb_t inv;
367
  mp_limb_t q0;
368
  mp_limb_t r0;
369
  mp_limb_t mask;
370
  mp_double_limb_t res;
371
372
  count_leading_zeros (ncnt, n0);
373
  count_leading_zeros (dcnt, d0);
374
375
  dbi = d0 << dcnt >> (GMP_LIMB_BITS - NBITS);
376
  inv = tabp[dbi];
377
  q0 = ((n0 << ncnt) >> (NBITS + 1)) * inv >> (GMP_LIMB_BITS - 1 + ncnt - dcnt);
378
  r0 = n0 - q0 * d0;
379
  mask = -(mp_limb_t) (r0 >= d0);
380
  q0 -= mask;
381
  r0 -= d0 & mask;
382
383
  if (UNLIKELY (r0 >= d0))
384
    {
385
      q0 = n0 / d0;
386
      r0 = n0 - q0 * d0;
387
    }
388
389
  res.d1 = q0;
390
  res.d0 = r0;
391
  return res;
392
}
393
394
#else
395
#error Unknown HGCD2_DIV1_METHOD
396
#endif
397
398
#if HAVE_NATIVE_mpn_div_22
399
400
#define div2 mpn_div_22
401
/* Two-limb division optimized for small quotients.  */
402
mp_limb_t div2 (mp_ptr, mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t);
403
404
#elif HGCD2_DIV2_METHOD == 1
405
406
static mp_limb_t
407
div2 (mp_ptr rp,
408
      mp_limb_t n1, mp_limb_t n0,
409
      mp_limb_t d1, mp_limb_t d0)
410
{
411
  mp_double_limb_t rq = div1 (n1, d1);
412
  if (UNLIKELY (rq.d1 > d1))
413
    {
414
      mp_limb_t n2, q, t1, t0;
415
      int c;
416
417
      /* Normalize */
418
      count_leading_zeros (c, d1);
419
      ASSERT (c > 0);
420
421
      n2 = n1 >> (GMP_LIMB_BITS - c);
422
      n1 = (n1 << c) | (n0 >> (GMP_LIMB_BITS - c));
423
      n0 <<= c;
424
      d1 = (d1 << c) | (d0 >> (GMP_LIMB_BITS - c));
425
      d0 <<= c;
426
427
      udiv_qrnnd (q, n1, n2, n1, d1);
428
      umul_ppmm (t1, t0, q, d0);
429
      if (t1 > n1 || (t1 == n1 && t0 > n0))
430
  {
431
    ASSERT (q > 0);
432
    q--;
433
    sub_ddmmss (t1, t0, t1, t0, d1, d0);
434
  }
435
      sub_ddmmss (n1, n0, n1, n0, t1, t0);
436
437
      /* Undo normalization */
438
      rp[0] = (n0 >> c) | (n1 << (GMP_LIMB_BITS - c));
439
      rp[1] = n1 >> c;
440
441
      return q;
442
    }
443
  else
444
    {
445
      mp_limb_t q, t1, t0;
446
      n1 = rq.d0;
447
      q = rq.d1;
448
      umul_ppmm (t1, t0, q, d0);
449
      if (UNLIKELY (t1 >= n1) && (t1 > n1 || t0 > n0))
450
  {
451
    ASSERT (q > 0);
452
    q--;
453
    sub_ddmmss (t1, t0, t1, t0, d1, d0);
454
  }
455
      sub_ddmmss (rp[1], rp[0], n1, n0, t1, t0);
456
      return q;
457
    }
458
}
459
460
#elif HGCD2_DIV2_METHOD == 2
461
462
/* Bit-wise div2. Relies on fast count_leading_zeros. */
463
static mp_limb_t
464
div2 (mp_ptr rp,
465
      mp_limb_t n1, mp_limb_t n0,
466
      mp_limb_t d1, mp_limb_t d0)
467
805k
{
468
805k
  mp_limb_t q = 0;
469
805k
  int ncnt;
470
805k
  int dcnt;
471
472
805k
  count_leading_zeros (ncnt, n1);
473
805k
  count_leading_zeros (dcnt, d1);
474
805k
  dcnt -= ncnt;
475
476
805k
  d1 = (d1 << dcnt) + (d0 >> 1 >> (GMP_LIMB_BITS - 1 - dcnt));
477
805k
  d0 <<= dcnt;
478
479
805k
  do
480
2.66M
    {
481
2.66M
      mp_limb_t mask;
482
2.66M
      q <<= 1;
483
2.66M
      if (UNLIKELY (n1 == d1))
484
412
  mask = -(n0 >= d0);
485
2.66M
      else
486
2.66M
  mask = -(n1 > d1);
487
488
2.66M
      q -= mask;
489
490
2.66M
      sub_ddmmss (n1, n0, n1, n0, mask & d1, mask & d0);
491
492
2.66M
      d0 = (d1 << (GMP_LIMB_BITS - 1)) | (d0 >> 1);
493
2.66M
      d1 = d1 >> 1;
494
2.66M
    }
495
2.66M
  while (dcnt--);
496
497
805k
  rp[0] = n0;
498
805k
  rp[1] = n1;
499
500
805k
  return q;
501
805k
}
hgcd2.c:div2
Line
Count
Source
467
540k
{
468
540k
  mp_limb_t q = 0;
469
540k
  int ncnt;
470
540k
  int dcnt;
471
472
540k
  count_leading_zeros (ncnt, n1);
473
540k
  count_leading_zeros (dcnt, d1);
474
540k
  dcnt -= ncnt;
475
476
540k
  d1 = (d1 << dcnt) + (d0 >> 1 >> (GMP_LIMB_BITS - 1 - dcnt));
477
540k
  d0 <<= dcnt;
478
479
540k
  do
480
1.79M
    {
481
1.79M
      mp_limb_t mask;
482
1.79M
      q <<= 1;
483
1.79M
      if (UNLIKELY (n1 == d1))
484
372
  mask = -(n0 >= d0);
485
1.79M
      else
486
1.79M
  mask = -(n1 > d1);
487
488
1.79M
      q -= mask;
489
490
1.79M
      sub_ddmmss (n1, n0, n1, n0, mask & d1, mask & d0);
491
492
1.79M
      d0 = (d1 << (GMP_LIMB_BITS - 1)) | (d0 >> 1);
493
1.79M
      d1 = d1 >> 1;
494
1.79M
    }
495
1.79M
  while (dcnt--);
496
497
540k
  rp[0] = n0;
498
540k
  rp[1] = n1;
499
500
540k
  return q;
501
540k
}
hgcd2_jacobi.c:div2
Line
Count
Source
467
265k
{
468
265k
  mp_limb_t q = 0;
469
265k
  int ncnt;
470
265k
  int dcnt;
471
472
265k
  count_leading_zeros (ncnt, n1);
473
265k
  count_leading_zeros (dcnt, d1);
474
265k
  dcnt -= ncnt;
475
476
265k
  d1 = (d1 << dcnt) + (d0 >> 1 >> (GMP_LIMB_BITS - 1 - dcnt));
477
265k
  d0 <<= dcnt;
478
479
265k
  do
480
875k
    {
481
875k
      mp_limb_t mask;
482
875k
      q <<= 1;
483
875k
      if (UNLIKELY (n1 == d1))
484
40
  mask = -(n0 >= d0);
485
875k
      else
486
875k
  mask = -(n1 > d1);
487
488
875k
      q -= mask;
489
490
875k
      sub_ddmmss (n1, n0, n1, n0, mask & d1, mask & d0);
491
492
875k
      d0 = (d1 << (GMP_LIMB_BITS - 1)) | (d0 >> 1);
493
875k
      d1 = d1 >> 1;
494
875k
    }
495
875k
  while (dcnt--);
496
497
265k
  rp[0] = n0;
498
265k
  rp[1] = n1;
499
500
265k
  return q;
501
265k
}
502
#else
503
#error Unknown HGCD2_DIV2_METHOD
504
#endif