/src/gmp-6.2.1/mpn/mu_bdiv_q.c
Line | Count | Source |
1 | | /* mpn_mu_bdiv_q(qp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^nn, |
2 | | storing the result in {qp,nn}. Overlap allowed between Q and N; all other |
3 | | overlap disallowed. |
4 | | |
5 | | Contributed to the GNU project by Torbjorn Granlund. |
6 | | |
7 | | THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY |
8 | | SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST |
9 | | GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. |
10 | | |
11 | | Copyright 2005-2007, 2009, 2010, 2017 Free Software Foundation, Inc. |
12 | | |
13 | | This file is part of the GNU MP Library. |
14 | | |
15 | | The GNU MP Library is free software; you can redistribute it and/or modify |
16 | | it under the terms of either: |
17 | | |
18 | | * the GNU Lesser General Public License as published by the Free |
19 | | Software Foundation; either version 3 of the License, or (at your |
20 | | option) any later version. |
21 | | |
22 | | or |
23 | | |
24 | | * the GNU General Public License as published by the Free Software |
25 | | Foundation; either version 2 of the License, or (at your option) any |
26 | | later version. |
27 | | |
28 | | or both in parallel, as here. |
29 | | |
30 | | The GNU MP Library is distributed in the hope that it will be useful, but |
31 | | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
32 | | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
33 | | for more details. |
34 | | |
35 | | You should have received copies of the GNU General Public License and the |
36 | | GNU Lesser General Public License along with the GNU MP Library. If not, |
37 | | see https://www.gnu.org/licenses/. */ |
38 | | |
39 | | |
40 | | /* |
41 | | The idea of the algorithm used herein is to compute a smaller inverted value |
42 | | than used in the standard Barrett algorithm, and thus save time in the |
43 | | Newton iterations, and pay just a small price when using the inverted value |
44 | | for developing quotient bits. This algorithm was presented at ICMS 2006. |
45 | | */ |
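/* Illustrative sketch (not part of the GMP sources): division mod B^nn as
   performed here is Hensel (2-adic) division; for odd D there is a unique Q
   with Q*D == N (mod B^nn).  On a single limb the same idea can be written as
   a Newton iteration that lifts an inverse of d modulo 2^64, followed by one
   multiply; mpn_mu_bdiv_q does this block-wise on multi-limb operands.  The
   name limb_bdiv_q_1 and the use of <stdint.h> are illustrative assumptions
   only.  */

#include <stdint.h>

static uint64_t
limb_bdiv_q_1 (uint64_t n, uint64_t d)	/* d must be odd */
{
  uint64_t di = d;		/* d == 1/d (mod 2^3) for any odd d */
  int i;

  for (i = 0; i < 5; i++)	/* each step doubles precision: 3,6,12,24,48,96 bits */
    di *= 2 - d * di;
  return n * di;		/* q satisfying q*d == n (mod 2^64) */
}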
46 | | |
47 | | #include "gmp-impl.h" |
48 | | |
49 | | |
50 | | /* N = {np,nn} |
51 | | D = {dp,dn} |
52 | | |
53 | | Requirements: N >= D |
54 | | D >= 1 |
55 | | D odd |
56 | | dn >= 2 |
57 | | nn >= 2 |
58 | | scratch space as determined by mpn_mu_bdiv_q_itch(nn,dn). |
59 | | |
60 | | Write quotient to Q = {qp,nn}. |
61 | | |
62 | | FIXME: When iterating, perhaps do the small step before loop, not after. |
63 | | FIXME: Try to avoid the scalar divisions when computing inverse size. |
64 | | FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible. In |
65 | | particular, when dn==in, tp and rp could use the same space. |
66 | | FIXME: Trim final quotient calculation to qn limbs of precision. |
67 | | */ |
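/* Illustrative caller sketch (not part of the GMP sources): the scratch area
   is the caller's responsibility and must hold mpn_mu_bdiv_q_itch(nn,dn)
   limbs.  The sketch assumes the internal TMP_* allocation macros and ASSERT
   from gmp-impl.h; example_mu_bdiv_q is a hypothetical helper, not a GMP
   function.  */

static void
example_mu_bdiv_q (mp_ptr qp,
		   mp_srcptr np, mp_size_t nn,
		   mp_srcptr dp, mp_size_t dn)
{
  mp_ptr scratch;
  TMP_DECL;

  ASSERT (nn >= dn && dn >= 2 && (dp[0] & 1) != 0);	/* cf. requirements above */

  TMP_MARK;
  scratch = TMP_ALLOC_LIMBS (mpn_mu_bdiv_q_itch (nn, dn));
  mpn_mu_bdiv_q (qp, np, nn, dp, dn, scratch);	/* quotient mod B^nn into {qp,nn} */
  TMP_FREE;
}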
68 | | static void |
69 | | mpn_mu_bdiv_q_old (mp_ptr qp, |
70 | | mp_srcptr np, mp_size_t nn, |
71 | | mp_srcptr dp, mp_size_t dn, |
72 | | mp_ptr scratch) |
73 | 0 | { |
74 | 0 | mp_size_t qn; |
75 | 0 | mp_size_t in; |
76 | 0 | int cy, c0; |
77 | 0 | mp_size_t tn, wn; |
78 | |
79 | 0 | qn = nn; |
80 | |
81 | 0 | ASSERT (dn >= 2); |
82 | 0 | ASSERT (qn >= 2); |
83 | | |
84 | 0 | if (qn > dn) |
85 | 0 | { |
86 | 0 | mp_size_t b; |
87 | | |
88 | | /* |_______________________| dividend |
89 | | |________| divisor */ |
90 | |
91 | 0 | #define ip scratch /* in */ |
92 | 0 | #define rp (scratch + in) /* dn or rest >= binvert_itch(in) */ |
93 | 0 | #define tp (scratch + in + dn) /* dn+in or next_size(dn) */ |
94 | 0 | #define scratch_out (scratch + in + dn + tn) /* mulmod_bnm1_itch(next_size(dn)) */ |
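      /* Rough scratch layout behind the defines above; the area at rp also
	 serves as mpn_binvert scratch before the main loop starts:

	 |<- in ->|<- dn ->|<- dn+in or next_size(dn) ->|<- mulmod scratch ->|
	     ip       rp                tp                   scratch_out      */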
95 | | |
96 | | /* Compute an inverse size that is a nice partition of the quotient. */ |
97 | 0 | b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */ |
98 | 0 | in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */ |
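      /* Example: qn = 10, dn = 4 gives b = ceil(10/4) = 3 and in = ceil(10/3) = 4,
	 so the quotient below is produced in chunks of 4 + 4 + 2 limbs.  */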
99 | | |
100 | | /* Some notes on allocation: |
101 | | |
102 | | When in = dn, R dies when mpn_mullo returns; if in < dn, the low in |
103 | | limbs of R die at that point. We could save memory by letting T live |
104 | | just under R, and letting the upper part of T expand into R. These changes |
105 | | should reduce itch to perhaps 3dn. |
106 | | */ |
107 | |
108 | 0 | mpn_binvert (ip, dp, in, rp); |
109 | |
110 | 0 | cy = 0; |
111 | |
112 | 0 | MPN_COPY (rp, np, dn); |
113 | 0 | np += dn; |
114 | 0 | mpn_mullo_n (qp, rp, ip, in); |
115 | 0 | qn -= in; |
116 | |
117 | 0 | while (qn > in) |
118 | 0 | { |
119 | 0 | if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) |
120 | 0 | mpn_mul (tp, dp, dn, qp, in); /* mulhi, need tp[dn+in-1...in] */ |
121 | 0 | else |
122 | 0 | { |
123 | 0 | tn = mpn_mulmod_bnm1_next_size (dn); |
124 | 0 | mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out); |
125 | 0 | wn = dn + in - tn; /* number of wrapped limbs */ |
126 | 0 | if (wn > 0) |
127 | 0 | { |
128 | 0 | c0 = mpn_sub_n (tp + tn, tp, rp, wn); |
129 | 0 | mpn_decr_u (tp + wn, c0); |
130 | 0 | } |
131 | 0 | } |
132 | |
133 | 0 | qp += in; |
134 | 0 | if (dn != in) |
135 | 0 | { |
136 | | /* Subtract tp[dn-1...in] from partial remainder. */ |
137 | 0 | cy += mpn_sub_n (rp, rp + in, tp + in, dn - in); |
138 | 0 | if (cy == 2) |
139 | 0 | { |
140 | 0 | mpn_incr_u (tp + dn, 1); |
141 | 0 | cy = 1; |
142 | 0 | } |
143 | 0 | } |
144 | | /* Subtract tp[dn+in-1...dn] from dividend. */ |
145 | 0 | cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy); |
146 | 0 | np += in; |
147 | 0 | mpn_mullo_n (qp, rp, ip, in); |
148 | 0 | qn -= in; |
149 | 0 | } |
150 | | |
151 | | /* Generate last qn limbs. |
152 | | FIXME: It should be possible to limit precision here, since qn is |
153 | | typically somewhat smaller than dn. No big gains expected. */ |
154 | |
155 | 0 | if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) |
156 | 0 | mpn_mul (tp, dp, dn, qp, in); /* mulhi, need tp[qn+in-1...in] */ |
157 | 0 | else |
158 | 0 | { |
159 | 0 | tn = mpn_mulmod_bnm1_next_size (dn); |
160 | 0 | mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out); |
161 | 0 | wn = dn + in - tn; /* number of wrapped limbs */ |
162 | 0 | if (wn > 0) |
163 | 0 | { |
164 | 0 | c0 = mpn_sub_n (tp + tn, tp, rp, wn); |
165 | 0 | mpn_decr_u (tp + wn, c0); |
166 | 0 | } |
167 | 0 | } |
168 | |
169 | 0 | qp += in; |
170 | 0 | if (dn != in) |
171 | 0 | { |
172 | 0 | cy += mpn_sub_n (rp, rp + in, tp + in, dn - in); |
173 | 0 | if (cy == 2) |
174 | 0 | { |
175 | 0 | mpn_incr_u (tp + dn, 1); |
176 | 0 | cy = 1; |
177 | 0 | } |
178 | 0 | } |
179 | |
180 | 0 | mpn_sub_nc (rp + dn - in, np, tp + dn, qn - (dn - in), cy); |
181 | 0 | mpn_mullo_n (qp, rp, ip, qn); |
182 | |
183 | 0 | #undef ip |
184 | 0 | #undef rp |
185 | 0 | #undef tp |
186 | 0 | #undef scratch_out |
187 | 0 | } |
188 | 0 | else |
189 | 0 | { |
190 | | /* |_______________________| dividend |
191 | | |________________| divisor */ |
192 | |
193 | 0 | #define ip scratch /* in */ |
194 | 0 | #define tp (scratch + in) /* qn+in or next_size(qn) or rest >= binvert_itch(in) */ |
195 | 0 | #define scratch_out (scratch + in + tn) /* mulmod_bnm1_itch(next_size(qn)) */ |
196 | | |
197 | | /* Compute half-sized inverse. */ |
198 | 0 | in = qn - (qn >> 1); |
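      /* in = ceil(qn/2): one mullo against the inverse yields the low `in'
	 quotient limbs; the remaining qn-in limbs follow from a second mullo
	 once the high part of D*Q_low has been subtracted from N below.  */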
199 | |
200 | 0 | mpn_binvert (ip, dp, in, tp); |
201 | |
202 | 0 | mpn_mullo_n (qp, np, ip, in); /* low `in' quotient limbs */ |
203 | |
204 | 0 | if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) |
205 | 0 | mpn_mul (tp, dp, qn, qp, in); /* mulhigh */ |
206 | 0 | else |
207 | 0 | { |
208 | 0 | tn = mpn_mulmod_bnm1_next_size (qn); |
209 | 0 | mpn_mulmod_bnm1 (tp, tn, dp, qn, qp, in, scratch_out); |
210 | 0 | wn = qn + in - tn; /* number of wrapped limbs */ |
211 | 0 | if (wn > 0) |
212 | 0 | { |
213 | 0 | c0 = mpn_cmp (tp, np, wn) < 0; |
214 | 0 | mpn_decr_u (tp + wn, c0); |
215 | 0 | } |
216 | 0 | } |
217 | |
218 | 0 | mpn_sub_n (tp, np + in, tp + in, qn - in); |
219 | 0 | mpn_mullo_n (qp + in, tp, ip, qn - in); /* high qn-in quotient limbs */ |
220 | |
221 | 0 | #undef ip |
222 | 0 | #undef tp |
223 | 0 | #undef scratch_out |
224 | 0 | } |
225 | 0 | } |
226 | | |
227 | | void |
228 | | mpn_mu_bdiv_q (mp_ptr qp, |
229 | | mp_srcptr np, mp_size_t nn, |
230 | | mp_srcptr dp, mp_size_t dn, |
231 | | mp_ptr scratch) |
232 | 0 | { |
233 | 0 | mpn_mu_bdiv_q_old (qp, np, nn, dp, dn, scratch); |
234 | 0 | mpn_neg (qp, qp, nn); |
235 | 0 | } |
236 | | |
237 | | mp_size_t |
238 | | mpn_mu_bdiv_q_itch (mp_size_t nn, mp_size_t dn) |
239 | 0 | { |
240 | 0 | mp_size_t qn, in, tn, itch_binvert, itch_out, itches; |
241 | 0 | mp_size_t b; |
242 | |
243 | 0 | ASSERT_ALWAYS (DC_BDIV_Q_THRESHOLD < MU_BDIV_Q_THRESHOLD); |
244 | | |
245 | 0 | qn = nn; |
246 | |
247 | 0 | if (qn > dn) |
248 | 0 | { |
249 | 0 | b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */ |
250 | 0 | in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */ |
251 | 0 | if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) |
252 | 0 | { |
253 | 0 | tn = dn + in; |
254 | 0 | itch_out = 0; |
255 | 0 | } |
256 | 0 | else |
257 | 0 | { |
258 | 0 | tn = mpn_mulmod_bnm1_next_size (dn); |
259 | 0 | itch_out = mpn_mulmod_bnm1_itch (tn, dn, in); |
260 | 0 | } |
261 | 0 | itches = dn + tn + itch_out; |
262 | 0 | } |
263 | 0 | else |
264 | 0 | { |
265 | 0 | in = qn - (qn >> 1); |
266 | 0 | if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) |
267 | 0 | { |
268 | 0 | tn = qn + in; |
269 | 0 | itch_out = 0; |
270 | 0 | } |
271 | 0 | else |
272 | 0 | { |
273 | 0 | tn = mpn_mulmod_bnm1_next_size (qn); |
274 | 0 | itch_out = mpn_mulmod_bnm1_itch (tn, qn, in); |
275 | 0 | } |
276 | 0 | itches = tn + itch_out; |
277 | 0 | } |
278 | |
279 | 0 | itch_binvert = mpn_binvert_itch (in); |
280 | 0 | return in + MAX (itches, itch_binvert); |
281 | 0 | } |