Line | Count | Source (jump to first uncovered line) |
1 | | /* mpn_modexact_1c_odd -- mpn by limb exact division style remainder. |
2 | | |
3 | | THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST |
4 | | CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN |
5 | | FUTURE GNU MP RELEASES. |
6 | | |
7 | | Copyright 2000-2004 Free Software Foundation, Inc. |
8 | | |
9 | | This file is part of the GNU MP Library. |
10 | | |
11 | | The GNU MP Library is free software; you can redistribute it and/or modify |
12 | | it under the terms of either: |
13 | | |
14 | | * the GNU Lesser General Public License as published by the Free |
15 | | Software Foundation; either version 3 of the License, or (at your |
16 | | option) any later version. |
17 | | |
18 | | or |
19 | | |
20 | | * the GNU General Public License as published by the Free Software |
21 | | Foundation; either version 2 of the License, or (at your option) any |
22 | | later version. |
23 | | |
24 | | or both in parallel, as here. |
25 | | |
26 | | The GNU MP Library is distributed in the hope that it will be useful, but |
27 | | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
28 | | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
29 | | for more details. |
30 | | |
31 | | You should have received copies of the GNU General Public License and the |
32 | | GNU Lesser General Public License along with the GNU MP Library. If not, |
33 | | see https://www.gnu.org/licenses/. */ |
34 | | |
35 | | #include "gmp-impl.h" |
36 | | #include "longlong.h" |
37 | | |
38 | | |
39 | | /* Calculate an r satisfying |
40 | | |
41 | | r*B^k + a - c == q*d |
42 | | |
43 | | where B=2^GMP_LIMB_BITS, a is {src,size}, k is either size or size-1 |
44 | | (the caller won't know which), and q is the quotient (discarded). d must |
45 | | be odd, c can be any limb value. |
46 | | |
47 | | If c<d then r will be in the range 0<=r<d, or if c>=d then 0<=r<=d. |
48 | | |
49 | | This slightly strange function suits the initial Nx1 reduction for GCDs |
50 | | or Jacobi symbols since the factors of 2 in B^k can be ignored, leaving |
51 | | -r == a mod d (by passing c=0). For a GCD the factor of -1 on r can be |
52 | | ignored, or for the Jacobi symbol it can be accounted for. The function |
53 | | also suits divisibility and congruence testing since if r=0 (or r=d) is |
54 | | obtained then a==c mod d. |
55 | | |
56 | | |
57 | | r is a bit like the remainder returned by mpn_divexact_by3c, and is the |
58 | | sort of remainder mpn_divexact_1 might return. Like mpn_divexact_by3c, r |
59 | | represents a borrow, since effectively quotient limbs are chosen so that |
60 | | subtracting that multiple of d from src at each step will produce a zero |
61 | | limb. |
62 | | |
63 | | A long calculation can be done piece by piece from low to high by passing |
64 | | the return value from one part as the carry parameter to the next part. |
65 | | The effective final k becomes anything between size and size-n, if n |
66 | | pieces are used. |
67 | | |
68 | | |
69 | | A similar sort of routine could be constructed based on adding multiples |
70 | | of d at each limb, much like redc in mpz_powm does. Subtracting however |
71 | | has a small advantage that when subtracting to cancel out l there's never |
72 | | a borrow into h, whereas using an addition would put a carry into h |
73 | | depending whether l==0 or l!=0. |
74 | | |
75 | | |
76 | | In terms of efficiency, this function is similar to a mul-by-inverse |
77 | | mpn_mod_1. Both are essentially two multiplies and are best suited to |
78 | | CPUs with low latency multipliers (in comparison to a divide instruction |
79 | | at least.) But modexact has a few less supplementary operations, only |
80 | | needs low part and high part multiplies, and has fewer working quantities |
81 | | (helping CPUs with few registers). |
82 | | |
83 | | |
84 | | In the main loop it will be noted that the new carry (call it r) is the |
85 | | sum of the high product h and any borrow from l=s-c. If c<d then we will |
86 | | have r<d too, for the following reasons. Let q=l*inverse be the quotient |
87 | | limb, so that q*d = B*h + l, where B=2^GMP_NUMB_BITS. Now if h=d-1 then |
88 | | |
89 | | l = q*d - B*(d-1) <= (B-1)*d - B*(d-1) = B-d |
90 | | |
91 | | But if l=s-c produces a borrow when c<d, then l>=B-d+1 and hence will |
92 | | never have h=d-1 and so r=h+borrow <= d-1. |
93 | | |
94 | | When c>=d, on the other hand, h=d-1 can certainly occur together with a |
95 | | borrow, thereby giving only r<=d, as per the function definition above. |
96 | | |
97 | | As a design decision it's left to the caller to check for r=d if it might |
98 | | be passing c>=d. Several applications have c<d initially so the extra |
99 | | test is often unnecessary, for example the GCDs or a plain divisibility |
100 | | d|a test will pass c=0. |
101 | | |
102 | | |
103 | | The special case for size==1 is so that it can be assumed c<=d in the |
104 | | high<=divisor test at the end. c<=d is only guaranteed after at least |
105 | | one iteration of the main loop. There's also a decent chance one % is |
106 | | faster than a binvert_limb, though that will depend on the processor. |
107 | | |
108 | | A CPU specific implementation might want to omit the size==1 code or the |
109 | | high<divisor test. mpn/x86/k6/mode1o.asm for instance finds neither |
110 | | useful. */ |
111 | | |
112 | | |
113 | | mp_limb_t |
114 | | mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, |
115 | | mp_limb_t orig_c) |
116 | 0 | { |
117 | 0 | mp_limb_t s, h, l, inverse, dummy, dmul, ret; |
118 | 0 | mp_limb_t c = orig_c; |
119 | 0 | mp_size_t i; |
120 | |
|
121 | 0 | ASSERT (size >= 1); |
122 | 0 | ASSERT (d & 1); |
123 | 0 | ASSERT_MPN (src, size); |
124 | 0 | ASSERT_LIMB (d); |
125 | 0 | ASSERT_LIMB (c); |
126 | |
|
127 | 0 | if (size == 1) |
128 | 0 | { |
129 | 0 | s = src[0]; |
130 | 0 | if (s > c) |
131 | 0 | { |
132 | 0 | l = s-c; |
133 | 0 | h = l % d; |
134 | 0 | if (h != 0) |
135 | 0 | h = d - h; |
136 | 0 | } |
137 | 0 | else |
138 | 0 | { |
139 | 0 | l = c-s; |
140 | 0 | h = l % d; |
141 | 0 | } |
142 | 0 | return h; |
143 | 0 | } |
144 | | |
145 | | |
146 | 0 | binvert_limb (inverse, d); |
147 | 0 | dmul = d << GMP_NAIL_BITS; |
148 | |
|
149 | 0 | i = 0; |
150 | 0 | do |
151 | 0 | { |
152 | 0 | s = src[i]; |
153 | 0 | SUBC_LIMB (c, l, s, c); |
154 | 0 | l = (l * inverse) & GMP_NUMB_MASK; |
155 | 0 | umul_ppmm (h, dummy, l, dmul); |
156 | 0 | c += h; |
157 | 0 | } |
158 | 0 | while (++i < size-1); |
159 | | |
160 | |
|
161 | 0 | s = src[i]; |
162 | 0 | if (s <= d) |
163 | 0 | { |
164 | | /* With high<=d the final step can be a subtract and addback. If c==0 |
165 | | then the addback will restore to l>=0. If c==d then will get l==d |
166 | | if s==0, but that's ok per the function definition. */ |
167 | |
|
168 | 0 | l = c - s; |
169 | 0 | if (c < s) |
170 | 0 | l += d; |
171 | |
|
172 | 0 | ret = l; |
173 | 0 | } |
174 | 0 | else |
175 | 0 | { |
176 | | /* Can't skip a divide, just do the loop code once more. */ |
177 | |
|
178 | 0 | SUBC_LIMB (c, l, s, c); |
179 | 0 | l = (l * inverse) & GMP_NUMB_MASK; |
180 | 0 | umul_ppmm (h, dummy, l, dmul); |
181 | 0 | c += h; |
182 | 0 | ret = c; |
183 | 0 | } |
184 | |
|
185 | 0 | ASSERT (orig_c < d ? ret < d : ret <= d); |
186 | 0 | return ret; |
187 | 0 | } |
188 | | |
189 | | |
190 | | |
191 | | #if 0 |
192 | | |
193 | | /* The following is an alternate form that might shave one cycle on a |
194 | | superscalar processor since it takes c+=h off the dependent chain, |
195 | | leaving just a low product, high product, and a subtract. |
196 | | |
197 | | This is for CPU specific implementations to consider. A special case for |
198 | | high<divisor and/or size==1 can be added if desired. |
199 | | |
200 | | Notice that c is only ever 0 or 1, since if s-c produces a borrow then |
201 | | x=0xFF..FF and x-h cannot produce a borrow. The c=(x>s) could become |
202 | | c=(x==0xFF..FF) too, if that helped. */ |
203 | | |
204 | | mp_limb_t |
205 | | mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t h) |
206 | | { |
207 | | mp_limb_t s, x, y, inverse, dummy, dmul, c1, c2; |
208 | | mp_limb_t c = 0; |
209 | | mp_size_t i; |
210 | | |
211 | | ASSERT (size >= 1); |
212 | | ASSERT (d & 1); |
213 | | |
214 | | binvert_limb (inverse, d); |
215 | | dmul = d << GMP_NAIL_BITS; |
216 | | |
217 | | for (i = 0; i < size; i++) |
218 | | { |
219 | | ASSERT (c==0 || c==1); |
220 | | |
221 | | s = src[i]; |
222 | | SUBC_LIMB (c1, x, s, c); |
223 | | |
224 | | SUBC_LIMB (c2, y, x, h); |
225 | | c = c1 + c2; |
226 | | |
227 | | y = (y * inverse) & GMP_NUMB_MASK; |
228 | | umul_ppmm (h, dummy, y, dmul); |
229 | | } |
230 | | |
231 | | h += c; |
232 | | return h; |
233 | | } |
234 | | |
235 | | #endif |