/* mpn_divexact(qp,np,nn,dp,dn) -- Divide N = {np,nn} by D = {dp,dn} storing
   the result in Q = {qp,nn-dn+1} expecting no remainder.  Overlap allowed
   between Q and N; all other overlap disallowed.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.

Copyright 2006, 2007, 2009, 2017 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


#include "gmp-impl.h"
#include "longlong.h"
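
/* Illustration only: a minimal sketch of a direct call to this internal
   entry point, with made-up one-limb operands.  Real code should reach it
   through documented interfaces; the block is disabled from compilation.  */
#if 0
static void
divexact_example (void)
{
  mp_limb_t np[1] = { 91 };	/* N = 91 = 7 * 13 */
  mp_limb_t dp[1] = { 7 };	/* D = 7 */
  mp_limb_t qp[1];		/* Q gets nn - dn + 1 = 1 limb */
  mpn_divexact (qp, np, 1, dp, 1);
  ASSERT_ALWAYS (qp[0] == 13);
}
#endif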

#if 1
void
mpn_divexact (mp_ptr qp,
              mp_srcptr np, mp_size_t nn,
              mp_srcptr dp, mp_size_t dn)
{
  unsigned shift;
  mp_size_t qn;
  mp_ptr tp;
  TMP_DECL;

  ASSERT (dn > 0);
  ASSERT (nn >= dn);
  ASSERT (dp[dn-1] > 0);

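  /* The division being exact, any low zero limbs of D must be matched by
     low zero limbs of N; strip them from both operands.  */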
  while (dp[0] == 0)
    {
      ASSERT (np[0] == 0);
      dp++;
      np++;
      dn--;
      nn--;
    }

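  /* A single-limb divisor goes to a dedicated routine, which selects between
     plain division and 2-adic exact division by a tuned threshold.  */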
  if (dn == 1)
    {
      MPN_DIVREM_OR_DIVEXACT_1 (qp, np, nn, dp[0]);
      return;
    }

  TMP_MARK;

  qn = nn + 1 - dn;
  count_trailing_zeros (shift, dp[0]);

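  /* The 2-adic division below needs an odd divisor.  Shift out the trailing
     zero bits of D; since Q = N/D is exact, N has at least as many trailing
     zero bits, so the matching shift of N drops no information.  */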
  if (shift > 0)
    {
      mp_ptr wp;
      mp_size_t ss;
      ss = (dn > qn) ? qn + 1 : dn;

      tp = TMP_ALLOC_LIMBS (ss);
      mpn_rshift (tp, dp, ss, shift);
      dp = tp;

      /* Since we have excluded dn == 1, we have nn > qn, and we need
	 to shift one limb beyond qn.  */
      wp = TMP_ALLOC_LIMBS (qn + 1);
      mpn_rshift (wp, np, qn + 1, shift);
      np = wp;
    }

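  /* Only the low qn limbs of D can influence Q = N/D mod B^qn, so a longer
     divisor may be truncated.  */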
  if (dn > qn)
    dn = qn;

  tp = TMP_ALLOC_LIMBS (mpn_bdiv_q_itch (qn, dn));
  mpn_bdiv_q (qp, np, qn, dp, dn, tp);
  TMP_FREE;

  /* Since bdiv_q computes -N/D (mod B^{qn}), we must negate now.  */
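  /* Illustration, scaling B^{qn} down to 2^4: N = 6, D = 3 gives
     -N/D = -2 == 14 (mod 16); negating recovers Q = 2.  */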
  mpn_neg (qp, qp, qn);
}

#else

/* We use Jebelean's bidirectional exact division algorithm.  This is
   somewhat naively implemented, with equal quotient parts done by 2-adic
   division and truncating division.  Since 2-adic division is faster, it
   should be used for a larger chunk.

   This code is horrendously ugly, in all sorts of ways.

   * It was hacked without much care or thought, but with a testing program.
   * It handles scratch space frivolously, and furthermore the itch function
     is broken.
   * It doesn't provide any measures for dealing with mu_divappr_q's +3
     error.  We have yet to provoke an error due to this, though.
   * Algorithm selection leaves a lot to be desired.  In particular, the
     choice between DC and MU isn't a point, but we treat it like one.
   * It makes the msb part 1 or 2 limbs larger than the lsb part, in spite
     of the latter being faster.  We should at least reverse this, but
     perhaps we should make the lsb part considerably larger.  (How do we
     tune this?)
*/
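
/* Overview sketch: the quotient Q = N/D has qn = nn - dn + 1 limbs and is
   computed in two halves that overlap in one limb,

       Q:  [ high part, divappr from msb end | low part, bdiv from lsb end ]
          qn-1                            qn0-1                           0

   The low qn0 limbs come exactly from 2-adic division by the low limbs of
   D; the high limbs come from an approximate truncating division by the
   high limbs of D, and the overlap limb is used to correct the
   approximation at the end (see the final mpn_decr_u).  */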

mp_size_t
mpn_divexact_itch (mp_size_t nn, mp_size_t dn)
{
  return nn + dn;		/* FIXME this is not right */
}

void
mpn_divexact (mp_ptr qp,
              mp_srcptr np, mp_size_t nn,
              mp_srcptr dp, mp_size_t dn,
              mp_ptr scratch)
{
  mp_size_t qn;
  mp_size_t nn0, qn0;
  mp_size_t nn1, qn1;
  mp_ptr tp;
  mp_limb_t qml;
  mp_limb_t qh;
  int cnt;
  mp_ptr xdp;
  mp_limb_t di;
  mp_limb_t cy;
  gmp_pi1_t dinv;
  TMP_DECL;

  TMP_MARK;

  qn = nn - dn + 1;

  /* For small divisors, and small quotients, don't use Jebelean's
     algorithm.  */
  if (dn < DIVEXACT_JEB_THRESHOLD || qn < DIVEXACT_JEB_THRESHOLD)
    {
      tp = scratch;
      MPN_COPY (tp, np, qn);
      binvert_limb (di, dp[0]);  di = -di;
      dn = MIN (dn, qn);
      mpn_sbpi1_bdiv_q (qp, tp, qn, dp, dn, di);
      TMP_FREE;
      return;
    }

  qn0 = ((nn - dn) >> 1) + 1;	/* low quotient size */

  /* If the quotient is much larger than the divisor, the bidirectional
     algorithm does not work as currently implemented.  Fall back to plain
     bdiv.  */
  if (qn0 > dn)
    {
      if (BELOW_THRESHOLD (dn, DC_BDIV_Q_THRESHOLD))
	{
	  tp = scratch;
	  MPN_COPY (tp, np, qn);
	  binvert_limb (di, dp[0]);  di = -di;
	  dn = MIN (dn, qn);
	  mpn_sbpi1_bdiv_q (qp, tp, qn, dp, dn, di);
	}
      else if (BELOW_THRESHOLD (dn, MU_BDIV_Q_THRESHOLD))
	{
	  tp = scratch;
	  MPN_COPY (tp, np, qn);
	  binvert_limb (di, dp[0]);  di = -di;
	  mpn_dcpi1_bdiv_q (qp, tp, qn, dp, dn, di);
	}
      else
	{
	  mpn_mu_bdiv_q (qp, np, qn, dp, dn, scratch);
	}
      TMP_FREE;
      return;
    }

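  /* Sizes for the high (truncating) half: it is given the top nn1 dividend
     limbs and the top qn1 divisor limbs, chosen so that its quotient part
     overlaps, by one limb, the low (2-adic) part that produces the low qn0
     quotient limbs.  */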
  nn0 = qn0 + qn0;

  nn1 = nn0 - 1 + ((nn-dn) & 1);
  qn1 = qn0;
  if (LIKELY (qn0 != dn))
    {
      nn1 = nn1 + 1;
      qn1 = qn1 + 1;
      if (UNLIKELY (dp[dn - 1] == 1 && qn1 != dn))
	{
	  /* If the leading divisor limb == 1, i.e. has just one bit, we have
	     to include an extra limb in order to get the needed overlap.  */
	  /* FIXME: Now with the mu_divappr_q function, we should really need
	     more overlap.  That indicates one of two things: (1) The test
	     code is not good.  (2) We actually overlap too much by
	     default.  */
	  nn1 = nn1 + 1;
	  qn1 = qn1 + 1;
	}
    }

  tp = TMP_ALLOC_LIMBS (nn1 + 1);

  count_leading_zeros (cnt, dp[dn - 1]);

  /* Normalize divisor, store into tmp area.  */
  if (cnt != 0)
    {
      xdp = TMP_ALLOC_LIMBS (qn1);
      mpn_lshift (xdp, dp + dn - qn1, qn1, cnt);
    }
  else
    {
      xdp = (mp_ptr) dp + dn - qn1;
    }

  /* Shift dividend according to the divisor normalization.  */
  /* FIXME: We compute too much here for XX_divappr_q, but these functions'
     interfaces want a pointer to the imaginary least significant limb, not
     to the least significant *used* limb.  Of course, we could leave nn1-qn1
     rubbish limbs in the low part, to save some time.  */
  if (cnt != 0)
    {
      cy = mpn_lshift (tp, np + nn - nn1, nn1, cnt);
      if (cy != 0)
	{
	  tp[nn1] = cy;
	  nn1++;
	}
    }
  else
    {
      /* FIXME: This copy is not needed for mpn_mu_divappr_q, except when the
	 mpn_sub_n right before is executed.  */
      MPN_COPY (tp, np + nn - nn1, nn1);
    }

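  /* invert_pi1 precomputes the "3/2" inverse of the two most significant
     (normalized) divisor limbs, as required by the sbpi1/dcpi1 division
     primitives.  */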
  invert_pi1 (dinv, xdp[qn1 - 1], xdp[qn1 - 2]);
  if (BELOW_THRESHOLD (qn1, DC_DIVAPPR_Q_THRESHOLD))
    {
      qp[qn0 - 1 + nn1 - qn1]
	= mpn_sbpi1_divappr_q (qp + qn0 - 1, tp, nn1, xdp, qn1, dinv.inv32);
    }
  else if (BELOW_THRESHOLD (qn1, MU_DIVAPPR_Q_THRESHOLD))
    {
      qp[qn0 - 1 + nn1 - qn1]
	= mpn_dcpi1_divappr_q (qp + qn0 - 1, tp, nn1, xdp, qn1, &dinv);
    }
  else
    {
      /* FIXME: mpn_mu_divappr_q doesn't handle qh != 0.  Work around it with
	 a conditional subtraction here.  */
      qh = mpn_cmp (tp + nn1 - qn1, xdp, qn1) >= 0;
      if (qh)
	mpn_sub_n (tp + nn1 - qn1, tp + nn1 - qn1, xdp, qn1);
      mpn_mu_divappr_q (qp + qn0 - 1, tp, nn1, xdp, qn1, scratch);
      qp[qn0 - 1 + nn1 - qn1] = qh;
    }
  qml = qp[qn0 - 1];

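  /* binvert_limb computes the 2-adic inverse of dp[0] modulo B; the bdiv
     primitives below expect its negation.  (With 64-bit limbs, for example,
     the inverse of 3 is 0xAAAAAAAAAAAAAAAB, since 3 * 0xAAAAAAAAAAAAAAAB =
     2 * 2^64 + 1.)  */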
  binvert_limb (di, dp[0]);  di = -di;

  if (BELOW_THRESHOLD (qn0, DC_BDIV_Q_THRESHOLD))
    {
      MPN_COPY (tp, np, qn0);
      mpn_sbpi1_bdiv_q (qp, tp, qn0, dp, qn0, di);
    }
  else if (BELOW_THRESHOLD (qn0, MU_BDIV_Q_THRESHOLD))
    {
      MPN_COPY (tp, np, qn0);
      mpn_dcpi1_bdiv_q (qp, tp, qn0, dp, qn0, di);
    }
  else
    {
      mpn_mu_bdiv_q (qp, np, qn0, dp, qn0, scratch);
    }

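  /* The 2-adic half just overwrote the overlap limb qp[qn0 - 1] with its
     exact value.  The divappr result may exceed the true quotient slightly;
     if its overlap limb qml wrapped around below the exact value, a spurious
     carry has leaked into the high part, so remove it.  */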
  if (qml < qp[qn0 - 1])
    mpn_decr_u (qp + qn0, 1);

  TMP_FREE;
}
#endif