/* mpn_sec_powm -- Compute R = U^E mod M.  Secure variant, side-channel silent
   under the assumption that the multiply instruction is side channel silent.

   Contributed to the GNU project by Torbjörn Granlund.

Copyright 2007-2009, 2011-2014, 2018-2019, 2021 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


/*
  BASIC ALGORITHM, Compute U^E mod M, where M < B^n is odd.

  1. T <- (B^n * U) mod M; convert to REDC form

  2. Compute table U^0, U^1, U^2... of floor(log(E))-dependent size

  3. While there are more bits in E
       W <- power left-to-right base-k

  The article "Defeating modexp side-channel attacks with data-independent
  execution traces", https://gmplib.org/~tege/modexp-silent.pdf, has details.


  TODO:

   * Make getbits a macro, thereby allowing it to update the index operand.
     That will simplify the code using getbits.  (Perhaps make getbits'
     sibling getbit then have similar form, for symmetry.)

   * Choose window size without looping.  (Superoptimize or think(tm).)

   * REDC_1_TO_REDC_2_THRESHOLD might actually represent the cutoff between
     redc_1 and redc_n.  On such systems, we will switch to redc_2 causing
     slowdown.
*/

#include "gmp-impl.h"
#include "longlong.h"
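
/* Illustrative sketch (not part of the library, not compiled): the same
   fixed-window, left-to-right exponentiation at the mpz level, without the
   REDC representation and without the side-channel countermeasures used in
   this file (its table lookup and modular reductions are not data
   independent).  The function name ref_powm_fixed_window and the
   caller-supplied window size w are hypothetical.  Note that this sketch lets
   the topmost window be short, whereas the mpn code below takes a full first
   window and lets the last one be short; the structure is otherwise the same.  */
#if 0
#include <stdlib.h>
#include <gmp.h>		/* mpz_t and the mpz_* calls used below */

static void
ref_powm_fixed_window (mpz_t r, const mpz_t b, const mpz_t e, const mpz_t m, int w)
{
  mp_bitcnt_t nbits = mpz_sizeinbase (e, 2);	/* bits in e (e > 0 assumed) */
  mp_bitcnt_t bi = nbits + (w - nbits % w) % w;	/* round up to a multiple of w */
  mpz_t *tab = malloc (((size_t) 1 << w) * sizeof (mpz_t));
  size_t i;

  /* Step 2: precompute tab[i] = b^i mod m for 0 <= i < 2^w.  */
  for (i = 0; i < ((size_t) 1 << w); i++)
    {
      mpz_init (tab[i]);
      if (i == 0)
	mpz_set_ui (tab[i], 1);
      else
	{
	  mpz_mul (tab[i], tab[i - 1], b);
	  mpz_mod (tab[i], tab[i], m);
	}
    }

  /* Step 3: consume e from the most significant end, w bits at a time;
     square w times, then multiply once by the selected table entry.  */
  mpz_set_ui (r, 1);
  while (bi > 0)
    {
      unsigned long win = 0;
      int j;
      for (j = 0; j < w; j++)
	{
	  mpz_mul (r, r, r);
	  mpz_mod (r, r, m);
	  bi--;
	  win = 2 * win + mpz_tstbit (e, bi);	/* bits above the msb read as 0 */
	}
      mpz_mul (r, r, tab[win]);
      mpz_mod (r, r, m);
    }

  for (i = 0; i < ((size_t) 1 << w); i++)
    mpz_clear (tab[i]);
  free (tab);
}
#endif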

#undef MPN_REDC_1_SEC
#if HAVE_NATIVE_mpn_sbpi1_bdiv_r
#define MPN_REDC_1_SEC(rp, up, mp, n, invm) \
  do { \
    mp_limb_t cy; \
    cy = mpn_sbpi1_bdiv_r (up, 2 * n, mp, n, invm); \
    mpn_cnd_sub_n (cy, rp, up + n, mp, n); \
  } while (0)
#else
#define MPN_REDC_1_SEC(rp, up, mp, n, invm) \
  do { \
    mp_limb_t cy; \
    cy = mpn_redc_1 (rp, up, mp, n, invm); \
    mpn_cnd_sub_n (cy, rp, rp, mp, n); \
  } while (0)
#endif

#if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
#undef MPN_REDC_2_SEC
#define MPN_REDC_2_SEC(rp, up, mp, n, mip) \
  do { \
    mp_limb_t cy; \
    cy = mpn_redc_2 (rp, up, mp, n, mip); \
    mpn_cnd_sub_n (cy, rp, rp, mp, n); \
  } while (0)
#else
#define MPN_REDC_2_SEC(rp, up, mp, n, mip) /* empty */
#undef REDC_1_TO_REDC_2_THRESHOLD
#define REDC_1_TO_REDC_2_THRESHOLD  MP_SIZE_T_MAX
#endif
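
/* In brief, both reduction macros perform one Montgomery (REDC) step: given a
   2n-limb value {up,2n}, they leave in {rp,n} an n-limb value congruent to
   {up,2n} * B^-n mod M, using a precomputed inverse of M modulo B (redc_1) or
   modulo B^2 (redc_2).  The mpn_cnd_sub_n call folds in the possible extra
   subtraction of M without branching on the carry, so the executed
   instruction sequence does not depend on the data.  The result is only
   guaranteed to fit in n limbs; full reduction below M happens once, at the
   end of mpn_sec_powm.  */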

/* Define our own mpn squaring function.  We do this since we cannot use a
   native mpn_sqr_basecase over TUNE_SQR_TOOM2_MAX, or a non-native one over
   SQR_TOOM2_THRESHOLD.  This is so because of fixed size stack allocations
   made inside mpn_sqr_basecase.  */

#if ! HAVE_NATIVE_mpn_sqr_basecase
/* The limit of the generic code is SQR_TOOM2_THRESHOLD.  */
#define SQR_BASECASE_LIM  SQR_TOOM2_THRESHOLD
#endif

#if HAVE_NATIVE_mpn_sqr_basecase
#ifdef TUNE_SQR_TOOM2_MAX
/* We slightly abuse TUNE_SQR_TOOM2_MAX here.  If it is set for an assembly
   mpn_sqr_basecase, it comes from SQR_TOOM2_THRESHOLD_MAX in the assembly
   file.  An assembly mpn_sqr_basecase that does not define it should allow
   any size.  */
#define SQR_BASECASE_LIM  SQR_TOOM2_THRESHOLD
#endif
#endif

#ifdef WANT_FAT_BINARY
/* For fat builds, we use SQR_TOOM2_THRESHOLD, which will expand to a read
   from __gmpn_cpuvec.  Perhaps any possible sqr_basecase.asm allows any size,
   and we limit the use unnecessarily.  We cannot tell, so play it safe.
   FIXME.  */
#define SQR_BASECASE_LIM  SQR_TOOM2_THRESHOLD
#endif

#ifndef SQR_BASECASE_LIM
/* If SQR_BASECASE_LIM is now not defined, use mpn_sqr_basecase for any
   operand size.  */
#define SQR_BASECASE_LIM  MP_SIZE_T_MAX
#endif

#define mpn_local_sqr(rp,up,n) \
  do { \
    if (ABOVE_THRESHOLD (n, SQR_BASECASE_THRESHOLD) \
	&& BELOW_THRESHOLD (n, SQR_BASECASE_LIM)) \
      mpn_sqr_basecase (rp, up, n); \
    else \
      mpn_mul_basecase (rp, up, n, up, n); \
  } while (0)

#define getbit(p,bi) \
  ((p[(bi - 1) / GMP_NUMB_BITS] >> (bi - 1) % GMP_NUMB_BITS) & 1)

/* FIXME: Maybe some things would get simpler if all callers ensure
   that bi >= nbits.  As far as I understand, with the current code bi
   < nbits can happen only for the final iteration.  */
static inline mp_limb_t
getbits (const mp_limb_t *p, mp_bitcnt_t bi, int nbits)
{
  int nbits_in_r;
  mp_limb_t r;
  mp_size_t i;

  if (bi < nbits)
    {
      return p[0] & (((mp_limb_t) 1 << bi) - 1);
    }
  else
    {
      bi -= nbits;                      /* bit index of low bit to extract */
      i = bi / GMP_NUMB_BITS;           /* word index of low bit to extract */
      bi %= GMP_NUMB_BITS;              /* bit index in low word */
      r = p[i] >> bi;                   /* extract (low) bits */
      nbits_in_r = GMP_NUMB_BITS - bi;  /* number of bits now in r */
      if (nbits_in_r < nbits)           /* did we get enough bits? */
	r += p[i + 1] << nbits_in_r;    /* prepend bits from higher word */
      return r & (((mp_limb_t) 1 << nbits) - 1);
    }
}
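
/* Worked example (illustrative, not part of the code): the bit index bi
   counts from the least significant end and sits just above the window, so
   getbits (p, bi, nbits) returns bits bi-1 down to bi-nbits of {p}.  With
   p[0] = 0xb5 = 1011 0101 in binary, getbits (p, 8, 3) returns binary 101 = 5
   (bits 7..5), while getbits (p, 2, 3) hits the bi < nbits case and returns
   just the low two bits, binary 01 = 1.  Similarly, getbit (p, bi) reads the
   single bit bi-1, i.e. bi is 1-based.  */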

#ifndef POWM_SEC_TABLE
#if GMP_NUMB_BITS < 50
#define POWM_SEC_TABLE  2,33,96,780,2741
#else
#define POWM_SEC_TABLE  2,130,524,2578
#endif
#endif

#if TUNE_PROGRAM_BUILD
extern int win_size (mp_bitcnt_t);
#else
static inline int
win_size (mp_bitcnt_t enb)
{
  int k;
  /* Find k, such that x[k-1] < enb <= x[k].

     We require that x[k] >= k, then it follows that enb > x[k-1] >=
     k-1, which implies k <= enb.
  */
  static const mp_bitcnt_t x[] = {POWM_SEC_TABLE,~(mp_bitcnt_t)0};
  for (k = 0; enb > x[k++]; )
    ;
  ASSERT (k <= enb);
  return k;
}
#endif
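
/* Worked example (illustrative, not part of the code): with GMP_NUMB_BITS
   >= 50 the default table is 2,130,524,2578 followed by the ~0 sentinel.  A
   512-bit exponent exceeds 2 and 130 but not 524, so win_size returns 3 and
   the power table below gets 2^3 = 8 entries; 2048 bits also exceeds 524 (but
   not 2578) and gives k = 4, and anything above 2578 bits gets the maximum
   k = 5.  */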

/* Convert U to REDC form, U_r = B^n * U mod M.
   Uses scratch space at tp of size 2un + n + 1.  */
static void
redcify (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr mp, mp_size_t n, mp_ptr tp)
{
  MPN_ZERO (tp, n);
  MPN_COPY (tp + n, up, un);

  mpn_sec_div_r (tp, un + n, mp, n, tp + un + n);
  MPN_COPY (rp, tp, n);
}
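
/* In REDC form a residue a is represented by a_r = a * B^n mod M.  The form
   is preserved by the reduction macros above: multiplying two representatives
   and reducing gives (a B^n)(b B^n) B^-n = (a b) B^n mod M, i.e. the
   representative of the product.  Converting back out of the form is one more
   reduction of the representative padded with n zero limbs, which is what
   mpn_sec_powm does at the very end.  */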

static mp_limb_t
sec_binvert_limb (mp_limb_t n)
{
  mp_limb_t inv, t;
  ASSERT ((n & 1) == 1);
  /* 3 + 2 -> 5 */
  inv = n + (((n + 1) << 1) & 0x18);

  t = n * inv;
#if GMP_NUMB_BITS <= 10
  /* 5 x 2 -> 10 */
  inv = 2 * inv - inv * t;
#else /* GMP_NUMB_BITS > 10 */
  /* 5 x 2 + 2 -> 12 */
  inv = 2 * inv - inv * t + ((inv<<10)&-(t&(1<<5)));
#endif /* GMP_NUMB_BITS <= 10 */

  if (GMP_NUMB_BITS > 12)
    {
      t = n * inv - 1;
      if (GMP_NUMB_BITS <= 36)
	{
	  /* 12 x 3 -> 36 */
	  inv += inv * t * (t - 1);
	}
      else /* GMP_NUMB_BITS > 36 */
	{
	  mp_limb_t t2 = t * t;
#if GMP_NUMB_BITS <= 60
	  /* 12 x 5 -> 60 */
	  inv += inv * (t2 + 1) * (t2 - t);
#else /* GMP_NUMB_BITS > 60 */
	  /* 12 x 5 + 4 -> 64 */
	  inv *= (t2 + 1) * (t2 - t) + 1 - ((t<<48)&-(t&(1<<12)));

	  /* 64 -> 128 -> 256 -> ... */
	  for (int todo = (GMP_NUMB_BITS - 1) >> 6; todo != 0; todo >>= 1)
	    inv = 2 * inv - inv * inv * n;
#endif /* GMP_NUMB_BITS <= 60 */
	}
    }

  ASSERT ((inv * n & GMP_NUMB_MASK) == 1);
  return inv & GMP_NUMB_MASK;
}
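
/* Illustrative sketch (not compiled; not the library's binvert_limb): the
   function above compresses the classical Newton/Hensel lifting of an inverse
   modulo a power of 2.  For odd n, n is its own inverse mod 8 (3 correct low
   bits), and whenever n*inv == 1 (mod 2^b) we have n*inv*(2 - n*inv) == 1
   (mod 2^(2b)), so each plain lifting step doubles the number of correct low
   bits.  The name ref_binvert_limb is hypothetical.  */
#if 0
static mp_limb_t
ref_binvert_limb (mp_limb_t n)
{
  mp_limb_t inv = n;		/* correct to 3 bits for odd n */
  int bits;
  for (bits = 3; bits < GMP_NUMB_BITS; bits *= 2)
    inv = 2 * inv - inv * inv * n;	/* inv *= 2 - n*inv */
  return inv & GMP_NUMB_MASK;
}
#endif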

/* {rp, n} <-- {bp, bn} ^ {ep, en} mod {mp, n},
   where en = ceil (enb / GMP_NUMB_BITS)
   Requires that {mp, n} is odd (and hence also mp[0] odd).
   Uses scratch space at tp as defined by mpn_sec_powm_itch.  */
void
mpn_sec_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
	      mp_srcptr ep, mp_bitcnt_t enb,
	      mp_srcptr mp, mp_size_t n, mp_ptr tp)
{
  mp_limb_t ip[2], *mip;
  int windowsize, this_windowsize;
  mp_limb_t expbits;
  mp_ptr pp, this_pp, ps;
  long i;
  int cnd;

  ASSERT (enb > 0);
  ASSERT (n > 0);
  /* The code works for bn = 0, but the defined scratch space is 2 limbs
     greater than we supply, when converting 1 to redc form.  */
  ASSERT (bn > 0);
  ASSERT ((mp[0] & 1) != 0);

  windowsize = win_size (enb);

  mip = ip;
  mip[0] = sec_binvert_limb (mp[0]);
  if (ABOVE_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
    {
      mp_limb_t t, dummy, mip0 = mip[0];

      umul_ppmm (t, dummy, mip0, mp[0]);
      ASSERT (dummy == 1);
      t += mip0 * mp[1];		/* t = (mp * mip0)[1] */

      mip[1] = t * mip0 - 1;		/* ~( - t * mip0) */
    }
  mip[0] = -mip[0];

  pp = tp;
  tp += (n << windowsize);		/* put tp after power table */

  /* Compute pp[0] table entry */
  /* scratch: |   n   | 1 |   n+2   | */
  /*          | pp[0] | 1 | redcify | */
  this_pp = pp;
  this_pp[n] = 1;
  redcify (this_pp, this_pp + n, 1, mp, n, this_pp + n + 1);
  this_pp += n;

  /* Compute pp[1] table entry.  To avoid excessive scratch usage in the
     degenerate situation where B >> M, we let redcify use scratch space which
     will later be used by the pp table (element 2 and up).  */
  /* scratch: |   n   |   n   | bn + n + 1 | */
  /*          | pp[0] | pp[1] |  redcify   | */
  redcify (this_pp, bp, bn, mp, n, this_pp + n);

  /* Precompute powers of b and put them in the temporary area at pp.  */
  /* scratch: |   n   |   n   | ... |                    |    2n   | */
  /*          | pp[0] | pp[1] | ... | pp[2^windowsize-1] | product | */
  ps = pp + n;				/* initially B^1 */
  if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
    {
      for (i = (1 << windowsize) - 2; i > 0; i -= 2)
	{
	  mpn_local_sqr (tp, ps, n);
	  ps += n;
	  this_pp += n;
	  MPN_REDC_1_SEC (this_pp, tp, mp, n, mip[0]);

	  mpn_mul_basecase (tp, this_pp, n, pp + n, n);
	  this_pp += n;
	  MPN_REDC_1_SEC (this_pp, tp, mp, n, mip[0]);
	}
    }
  else
    {
      for (i = (1 << windowsize) - 2; i > 0; i -= 2)
	{
	  mpn_local_sqr (tp, ps, n);
	  ps += n;
	  this_pp += n;
	  MPN_REDC_2_SEC (this_pp, tp, mp, n, mip);

	  mpn_mul_basecase (tp, this_pp, n, pp + n, n);
	  this_pp += n;
	  MPN_REDC_2_SEC (this_pp, tp, mp, n, mip);
	}
    }

  expbits = getbits (ep, enb, windowsize);
  ASSERT_ALWAYS (enb >= windowsize);
  enb -= windowsize;

  mpn_sec_tabselect (rp, pp, n, 1 << windowsize, expbits);

  /* Main exponentiation loop.  */
  /* scratch: |   n   |   n   | ... |                    |    3n-4n     | */
  /*          | pp[0] | pp[1] | ... | pp[2^windowsize-1] | loop scratch | */

#define INNERLOOP \
  while (enb != 0) \
    { \
      expbits = getbits (ep, enb, windowsize); \
      this_windowsize = windowsize; \
      if (enb < windowsize) \
	{ \
	  this_windowsize -= windowsize - enb; \
	  enb = 0; \
	} \
      else \
	enb -= windowsize; \
 \
      do \
	{ \
	  mpn_local_sqr (tp, rp, n); \
	  MPN_REDUCE (rp, tp, mp, n, mip); \
	  this_windowsize--; \
	} \
      while (this_windowsize != 0); \
 \
      mpn_sec_tabselect (tp + 2*n, pp, n, 1 << windowsize, expbits); \
      mpn_mul_basecase (tp, rp, n, tp + 2*n, n); \
 \
      MPN_REDUCE (rp, tp, mp, n, mip); \
    }
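
/* Worked trace (illustrative): suppose enb = 1030 and windowsize = 4 (what
   win_size gives for limbs of 50 or more bits).  The getbits call before the
   loop consumed bits 1029..1026 to pick the initial rp from the table,
   leaving enb = 1026.  Each INNERLOOP round squares rp windowsize times and
   multiplies by the selected table entry; 256 full rounds consume 1024 more
   bits, and the final round sees enb = 2 < windowsize, so it fetches only the
   remaining low 2 bits and performs just this_windowsize = 2 squarings.  */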

  if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
    {
#undef MPN_REDUCE
#define MPN_REDUCE(rp,tp,mp,n,mip)  MPN_REDC_1_SEC (rp, tp, mp, n, mip[0])
      INNERLOOP;
    }
  else
    {
#undef MPN_REDUCE
#define MPN_REDUCE(rp,tp,mp,n,mip)  MPN_REDC_2_SEC (rp, tp, mp, n, mip)
      INNERLOOP;
    }

  MPN_COPY (tp, rp, n);
  MPN_ZERO (tp + n, n);

  if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
    MPN_REDC_1_SEC (rp, tp, mp, n, mip[0]);
  else
    MPN_REDC_2_SEC (rp, tp, mp, n, mip);

  cnd = mpn_sub_n (tp, rp, mp, n);	/* we need just retval */
  mpn_cnd_sub_n (!cnd, rp, rp, mp, n);
}

mp_size_t
mpn_sec_powm_itch (mp_size_t bn, mp_bitcnt_t enb, mp_size_t n)
{
  int windowsize;
  mp_size_t redcify_itch, itch;

  /* FIXME: no more _local/_basecase difference.  */
  /* The top scratch usage will either be when reducing B in the 2nd redcify
     call, or more typically n*2^windowsize + 3n or 4n, in the main loop.  (It
     is 3n or 4n depending on if we use mpn_local_sqr or a native
     mpn_sqr_basecase.  We assume 4n always for now.)  */

  windowsize = win_size (enb);

  /* The 2n term is due to pp[0] and pp[1] at the time of the 2nd redcify
     call, the (bn + n) term is due to redcify's own usage, and the rest is
     due to mpn_sec_div_r's usage when called from redcify.  */
  redcify_itch = (2 * n) + (bn + n) + ((bn + n) + 2 * n + 2);

  /* The n * 2^windowsize term is due to the power table, the 4n term is due
     to scratch needs of squaring/multiplication in the exponentiation loop.  */
  itch = (n << windowsize) + (4 * n);

  return MAX (itch, redcify_itch);
}
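
/* Usage sketch (not compiled; not part of the library).  mpn_sec_powm and
   mpn_sec_powm_itch are public gmp.h entry points; the caller asks _itch for
   the scratch size in limbs and passes a block of that size as tp.  The
   operand values below are arbitrary toy data: {mp, n} must be odd, the
   exponent must be nonzero and fit in enb bits, and rp must have room for n
   limbs.  For these small sizes the redcify term dominates the estimate, so
   (with the default POWM_SEC_TABLE) the returned itch is the 18-limb redcify
   figure rather than the 16-limb main-loop figure.  The function name
   example_sec_powm is hypothetical.  */
#if 0
#include <stdlib.h>
#include <gmp.h>

static void
example_sec_powm (void)
{
  mp_size_t n = 2, bn = 2;
  mp_bitcnt_t enb = 7;			/* exponent fits in 7 bits */
  mp_limb_t bp[2] = { 123, 1 };		/* base, bn limbs */
  mp_limb_t ep[1] = { 65 };		/* exponent, ceil(enb/GMP_NUMB_BITS) limbs */
  mp_limb_t mp[2] = { 0x12345679, 7 };	/* modulus, low limb odd */
  mp_limb_t rp[2];

  mp_size_t itch = mpn_sec_powm_itch (bn, enb, n);
  mp_limb_t *tp = malloc (itch * sizeof (mp_limb_t));

  mpn_sec_powm (rp, bp, bn, ep, enb, mp, n, tp);  /* {rp,n} <- {bp,bn}^{ep} mod {mp,n} */

  free (tp);
}
#endif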