/src/gmp/mpn/mu_bdiv_qr.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* mpn_mu_bdiv_qr(qp,rp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^qn, |
2 | | where qn = nn-dn, storing the result in {qp,qn}. Overlap allowed between Q |
3 | | and N; all other overlap disallowed. |
4 | | |
5 | | Contributed to the GNU project by Torbjorn Granlund. |
6 | | |
7 | | THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY |
8 | | SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST |
9 | | GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. |
10 | | |
11 | | Copyright 2005-2007, 2009, 2010, 2012, 2017 Free Software Foundation, Inc. |
12 | | |
13 | | This file is part of the GNU MP Library. |
14 | | |
15 | | The GNU MP Library is free software; you can redistribute it and/or modify |
16 | | it under the terms of either: |
17 | | |
18 | | * the GNU Lesser General Public License as published by the Free |
19 | | Software Foundation; either version 3 of the License, or (at your |
20 | | option) any later version. |
21 | | |
22 | | or |
23 | | |
24 | | * the GNU General Public License as published by the Free Software |
25 | | Foundation; either version 2 of the License, or (at your option) any |
26 | | later version. |
27 | | |
28 | | or both in parallel, as here. |
29 | | |
30 | | The GNU MP Library is distributed in the hope that it will be useful, but |
31 | | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
32 | | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
33 | | for more details. |
34 | | |
35 | | You should have received copies of the GNU General Public License and the |
36 | | GNU Lesser General Public License along with the GNU MP Library. If not, |
37 | | see https://www.gnu.org/licenses/. */ |
38 | | |
39 | | |
40 | | /* |
41 | | The idea of the algorithm used herein is to compute a smaller inverted value |
42 | | than used in the standard Barrett algorithm, and thus save time in the |
43 | | Newton iterations, and pay just a small price when using the inverted value |
44 | | for developing quotient bits. This algorithm was presented at ICMS 2006. |
45 | | */ |
46 | | |
47 | | #include "gmp-impl.h" |
48 | | |
49 | | |
50 | | /* N = {np,nn} |
51 | | D = {dp,dn} |
52 | | |
53 | | Requirements: N >= D |
54 | | D >= 1 |
55 | | D odd |
56 | | dn >= 2 |
57 | | nn >= 2 |
58 | | scratch space as determined by mpn_mu_bdiv_qr_itch(nn,dn). |
59 | | |
60 | | Write quotient to Q = {qp,nn-dn}. |
61 | | |
62 | | FIXME: When iterating, perhaps do the small step before loop, not after. |
63 | | FIXME: Try to avoid the scalar divisions when computing inverse size. |
64 | | FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible. In |
65 | | particular, when dn==in, tp and rp could use the same space. |
66 | | */ |
/* Core routine: computes Q' and R' with U - Q' D = R' B^{qn}, i.e. the
   quotient of a *subtractive* Hensel division.  The public wrapper
   mpn_mu_bdiv_qr below converts this to the additive convention
   U + Q D = R B^{qn}.  Returns the high borrow/carry limb of R'.  */
static mp_limb_t
mpn_mu_bdiv_qr_old (mp_ptr qp,
		    mp_ptr rp,
		    mp_srcptr np, mp_size_t nn,
		    mp_srcptr dp, mp_size_t dn,
		    mp_ptr scratch)
{
  mp_size_t qn;			/* quotient limbs still to produce */
  mp_size_t in;			/* inverse size = block size per iteration */
  mp_limb_t cy, c0;		/* running borrow; wraparound borrow */
  mp_size_t tn, wn;		/* mulmod_bnm1 size; number of wrapped limbs */

  qn = nn - dn;

  ASSERT (dn >= 2);
  ASSERT (qn >= 2);

  if (qn > dn)
    {
      mp_size_t b;

      /* |_______________________|   dividend
			|________|   divisor  */

#define ip           scratch		/* in */
#define tp           (scratch + in)	/* dn+in or next_size(dn) or rest >= binvert_itch(in) */
#define scratch_out  (scratch + in + tn)/* mulmod_bnm1_itch(next_size(dn)) */

      /* Compute an inverse size that is a nice partition of the quotient. */
      b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
      in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */

      /* Some notes on allocation:

	 When in = dn, R dies when mpn_mullo returns, if in < dn the low in
	 limbs of R dies at that point.  We could save memory by letting T live
	 just under R, and let the upper part of T expand into R.  These changes
	 should reduce itch to perhaps 3dn.
       */

      /* ip <- D^{-1} mod B^in (D odd, so the inverse exists); tp is used as
	 scratch by mpn_binvert here, before its life as the product area.  */
      mpn_binvert (ip, dp, in, tp);

      MPN_COPY (rp, np, dn);	/* R <- low dn limbs of U */
      np += dn;
      cy = 0;

      /* Produce in quotient limbs per iteration until < in remain.  */
      while (qn > in)
	{
	  /* Next quotient block: qp <- R * D^{-1} mod B^in.  */
	  mpn_mullo_n (qp, rp, ip, in);

	  if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	    mpn_mul (tp, dp, dn, qp, in);	/* mulhi, need tp[dn+in-1...in] */
	  else
	    {
	      /* Multiply mod B^tn - 1; the product's low wn limbs wrapped
		 around, so subtract the known low part (== R) to undo it.  */
	      tn = mpn_mulmod_bnm1_next_size (dn);
	      mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
	      wn = dn + in - tn;		/* number of wrapped limbs */
	      if (wn > 0)
		{
		  c0 = mpn_sub_n (tp + tn, tp, rp, wn);
		  mpn_decr_u (tp + wn, c0);
		}
	    }

	  qp += in;
	  qn -= in;

	  if (dn != in)
	    {
	      /* Subtract tp[dn-1...in] from partial remainder. */
	      cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
	      if (cy == 2)
		{
		  /* Two pending borrows; mpn_sub_nc below takes a carry-in of
		     at most 1, so fold one borrow into tp and keep one.  */
		  mpn_incr_u (tp + dn, 1);
		  cy = 1;
		}
	    }
	  /* Subtract tp[dn+in-1...dn] from dividend. */
	  cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy);
	  np += in;
	}

      /* Generate last qn limbs. */
      mpn_mullo_n (qp, rp, ip, qn);

      if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, dn, qp, qn);	/* mulhi, need tp[qn+in-1...in] */
      else
	{
	  /* Same wraparound correction as in the loop, with qn limbs.  */
	  tn = mpn_mulmod_bnm1_next_size (dn);
	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out);
	  wn = dn + qn - tn;		/* number of wrapped limbs */
	  if (wn > 0)
	    {
	      c0 = mpn_sub_n (tp + tn, tp, rp, wn);
	      mpn_decr_u (tp + wn, c0);
	    }
	}

      if (dn != qn)
	{
	  cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn);
	  if (cy == 2)
	    {
	      /* As above: normalize two borrows down to one.  */
	      mpn_incr_u (tp + dn, 1);
	      cy = 1;
	    }
	}
      return mpn_sub_nc (rp + dn - qn, np, tp + dn, qn, cy);

#undef ip
#undef tp
#undef scratch_out
    }
  else
    {
      /* |_______________________|   dividend
		 |________________|   divisor  */

#define ip           scratch		/* in */
#define tp           (scratch + in)	/* dn+in or next_size(dn) or rest >= binvert_itch(in) */
#define scratch_out  (scratch + in + tn)/* mulmod_bnm1_itch(next_size(dn)) */

      /* Compute half-sized inverse. */
      in = qn - (qn >> 1);	/* ceil(qn/2): two steps cover all qn limbs */

      mpn_binvert (ip, dp, in, tp);

      mpn_mullo_n (qp, np, ip, in);		/* low `in' quotient limbs */

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, dn, qp, in);		/* mulhigh */
      else
	{
	  /* Wraparound correction; the known low part here is U itself.  */
	  tn = mpn_mulmod_bnm1_next_size (dn);
	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
	  wn = dn + in - tn;			/* number of wrapped limbs */
	  if (wn > 0)
	    {
	      c0 = mpn_sub_n (tp + tn, tp, np, wn);
	      mpn_decr_u (tp + wn, c0);
	    }
	}

      qp += in;
      qn -= in;

      cy = mpn_sub_n (rp, np + in, tp + in, dn);
      mpn_mullo_n (qp, rp, ip, qn);		/* high qn quotient limbs */

      if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
	mpn_mul (tp, dp, dn, qp, qn);		/* mulhigh */
      else
	{
	  tn = mpn_mulmod_bnm1_next_size (dn);
	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out);
	  wn = dn + qn - tn;			/* number of wrapped limbs */
	  if (wn > 0)
	    {
	      c0 = mpn_sub_n (tp + tn, tp, rp, wn);
	      mpn_decr_u (tp + wn, c0);
	    }
	}

      cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn);
      if (cy == 2)
	{
	  /* Normalize two pending borrows down to one (see first branch).  */
	  mpn_incr_u (tp + dn, 1);
	  cy = 1;
	}
      return mpn_sub_nc (rp + dn - qn, np + dn + in, tp + dn, qn, cy);

#undef ip
#undef tp
#undef scratch_out
    }
}
244 | | |
245 | | mp_limb_t |
246 | | mpn_mu_bdiv_qr (mp_ptr qp, |
247 | | mp_ptr rp, |
248 | | mp_srcptr np, mp_size_t nn, |
249 | | mp_srcptr dp, mp_size_t dn, |
250 | | mp_ptr scratch) |
251 | 0 | { |
252 | 0 | mp_limb_t cy = mpn_mu_bdiv_qr_old (qp, rp, np, nn, dp, dn, scratch); |
253 | | |
254 | | /* R' B^{qn} = U - Q' D |
255 | | * |
256 | | * Q = B^{qn} - Q' (assuming Q' != 0) |
257 | | * |
258 | | * R B^{qn} = U + Q D = U + B^{qn} D - Q' D |
259 | | * = B^{qn} D + R' |
260 | | */ |
261 | |
|
262 | 0 | if (UNLIKELY (!mpn_neg (qp, qp, nn - dn))) |
263 | 0 | { |
264 | | /* Zero quotient. */ |
265 | 0 | ASSERT (cy == 0); |
266 | 0 | return 0; |
267 | 0 | } |
268 | 0 | else |
269 | 0 | { |
270 | 0 | mp_limb_t cy2 = mpn_add_n (rp, rp, dp, dn); |
271 | 0 | ASSERT (cy2 >= cy); |
272 | |
|
273 | 0 | return cy2 - cy; |
274 | 0 | } |
275 | 0 | } |
276 | | |
277 | | |
278 | | mp_size_t |
279 | | mpn_mu_bdiv_qr_itch (mp_size_t nn, mp_size_t dn) |
280 | 0 | { |
281 | 0 | mp_size_t qn, in, tn, itch_binvert, itch_out, itches; |
282 | 0 | mp_size_t b; |
283 | |
|
284 | 0 | ASSERT_ALWAYS (DC_BDIV_Q_THRESHOLD < MU_BDIV_Q_THRESHOLD); |
285 | | |
286 | 0 | qn = nn - dn; |
287 | |
|
288 | 0 | if (qn > dn) |
289 | 0 | { |
290 | 0 | b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */ |
291 | 0 | in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */ |
292 | 0 | } |
293 | 0 | else |
294 | 0 | { |
295 | 0 | in = qn - (qn >> 1); |
296 | 0 | } |
297 | |
|
298 | 0 | if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) |
299 | 0 | { |
300 | 0 | tn = dn + in; |
301 | 0 | itch_out = 0; |
302 | 0 | } |
303 | 0 | else |
304 | 0 | { |
305 | 0 | tn = mpn_mulmod_bnm1_next_size (dn); |
306 | 0 | itch_out = mpn_mulmod_bnm1_itch (tn, dn, in); |
307 | 0 | } |
308 | |
|
309 | 0 | itch_binvert = mpn_binvert_itch (in); |
310 | 0 | itches = tn + itch_out; |
311 | 0 | return in + MAX (itches, itch_binvert); |
312 | 0 | } |