/src/boringssl/crypto/fipsmodule/bn/exponentiation.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) |
2 | | * All rights reserved. |
3 | | * |
4 | | * This package is an SSL implementation written |
5 | | * by Eric Young (eay@cryptsoft.com). |
6 | | * The implementation was written so as to conform with Netscapes SSL. |
7 | | * |
8 | | * This library is free for commercial and non-commercial use as long as |
9 | | * the following conditions are aheared to. The following conditions |
10 | | * apply to all code found in this distribution, be it the RC4, RSA, |
11 | | * lhash, DES, etc., code; not just the SSL code. The SSL documentation |
12 | | * included with this distribution is covered by the same copyright terms |
13 | | * except that the holder is Tim Hudson (tjh@cryptsoft.com). |
14 | | * |
15 | | * Copyright remains Eric Young's, and as such any Copyright notices in |
16 | | * the code are not to be removed. |
17 | | * If this package is used in a product, Eric Young should be given attribution |
18 | | * as the author of the parts of the library used. |
19 | | * This can be in the form of a textual message at program startup or |
20 | | * in documentation (online or textual) provided with the package. |
21 | | * |
22 | | * Redistribution and use in source and binary forms, with or without |
23 | | * modification, are permitted provided that the following conditions |
24 | | * are met: |
25 | | * 1. Redistributions of source code must retain the copyright |
26 | | * notice, this list of conditions and the following disclaimer. |
27 | | * 2. Redistributions in binary form must reproduce the above copyright |
28 | | * notice, this list of conditions and the following disclaimer in the |
29 | | * documentation and/or other materials provided with the distribution. |
30 | | * 3. All advertising materials mentioning features or use of this software |
31 | | * must display the following acknowledgement: |
32 | | * "This product includes cryptographic software written by |
33 | | * Eric Young (eay@cryptsoft.com)" |
34 | | * The word 'cryptographic' can be left out if the rouines from the library |
35 | | * being used are not cryptographic related :-). |
36 | | * 4. If you include any Windows specific code (or a derivative thereof) from |
37 | | * the apps directory (application code) you must include an acknowledgement: |
38 | | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" |
39 | | * |
40 | | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND |
41 | | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
42 | | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
43 | | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
44 | | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
45 | | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
46 | | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
47 | | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
48 | | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
49 | | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
50 | | * SUCH DAMAGE. |
51 | | * |
52 | | * The licence and distribution terms for any publically available version or |
53 | | * derivative of this code cannot be changed. i.e. this code cannot simply be |
54 | | * copied and put under another distribution licence |
55 | | * [including the GNU Public Licence.] |
56 | | */ |
57 | | /* ==================================================================== |
58 | | * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved. |
59 | | * |
60 | | * Redistribution and use in source and binary forms, with or without |
61 | | * modification, are permitted provided that the following conditions |
62 | | * are met: |
63 | | * |
64 | | * 1. Redistributions of source code must retain the above copyright |
65 | | * notice, this list of conditions and the following disclaimer. |
66 | | * |
67 | | * 2. Redistributions in binary form must reproduce the above copyright |
68 | | * notice, this list of conditions and the following disclaimer in |
69 | | * the documentation and/or other materials provided with the |
70 | | * distribution. |
71 | | * |
72 | | * 3. All advertising materials mentioning features or use of this |
73 | | * software must display the following acknowledgment: |
74 | | * "This product includes software developed by the OpenSSL Project |
75 | | * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" |
76 | | * |
77 | | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to |
78 | | * endorse or promote products derived from this software without |
79 | | * prior written permission. For written permission, please contact |
80 | | * openssl-core@openssl.org. |
81 | | * |
82 | | * 5. Products derived from this software may not be called "OpenSSL" |
83 | | * nor may "OpenSSL" appear in their names without prior written |
84 | | * permission of the OpenSSL Project. |
85 | | * |
86 | | * 6. Redistributions of any form whatsoever must retain the following |
87 | | * acknowledgment: |
88 | | * "This product includes software developed by the OpenSSL Project |
89 | | * for use in the OpenSSL Toolkit (http://www.openssl.org/)" |
90 | | * |
91 | | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY |
92 | | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
93 | | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
94 | | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR |
95 | | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
96 | | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
97 | | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
98 | | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
99 | | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, |
100 | | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
101 | | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED |
102 | | * OF THE POSSIBILITY OF SUCH DAMAGE. |
103 | | * ==================================================================== |
104 | | * |
105 | | * This product includes cryptographic software written by Eric Young |
106 | | * (eay@cryptsoft.com). This product includes software written by Tim |
107 | | * Hudson (tjh@cryptsoft.com). */ |
108 | | |
109 | | #include <openssl/bn.h> |
110 | | |
111 | | #include <assert.h> |
112 | | #include <limits.h> |
113 | | #include <stdlib.h> |
114 | | #include <string.h> |
115 | | |
116 | | #include <openssl/err.h> |
117 | | #include <openssl/mem.h> |
118 | | |
119 | | #include "internal.h" |
120 | | #include "rsaz_exp.h" |
121 | | |
122 | | |
123 | 0 | int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) { |
124 | 0 | int i, bits, ret = 0; |
125 | 0 | BIGNUM *v, *rr; |
126 | |
|
127 | 0 | BN_CTX_start(ctx); |
128 | 0 | if (r == a || r == p) { |
129 | 0 | rr = BN_CTX_get(ctx); |
130 | 0 | } else { |
131 | 0 | rr = r; |
132 | 0 | } |
133 | |
|
134 | 0 | v = BN_CTX_get(ctx); |
135 | 0 | if (rr == NULL || v == NULL) { |
136 | 0 | goto err; |
137 | 0 | } |
138 | | |
139 | 0 | if (BN_copy(v, a) == NULL) { |
140 | 0 | goto err; |
141 | 0 | } |
142 | 0 | bits = BN_num_bits(p); |
143 | |
|
144 | 0 | if (BN_is_odd(p)) { |
145 | 0 | if (BN_copy(rr, a) == NULL) { |
146 | 0 | goto err; |
147 | 0 | } |
148 | 0 | } else { |
149 | 0 | if (!BN_one(rr)) { |
150 | 0 | goto err; |
151 | 0 | } |
152 | 0 | } |
153 | | |
154 | 0 | for (i = 1; i < bits; i++) { |
155 | 0 | if (!BN_sqr(v, v, ctx)) { |
156 | 0 | goto err; |
157 | 0 | } |
158 | 0 | if (BN_is_bit_set(p, i)) { |
159 | 0 | if (!BN_mul(rr, rr, v, ctx)) { |
160 | 0 | goto err; |
161 | 0 | } |
162 | 0 | } |
163 | 0 | } |
164 | | |
165 | 0 | if (r != rr && !BN_copy(r, rr)) { |
166 | 0 | goto err; |
167 | 0 | } |
168 | 0 | ret = 1; |
169 | |
|
170 | 0 | err: |
171 | 0 | BN_CTX_end(ctx); |
172 | 0 | return ret; |
173 | 0 | } |
174 | | |
175 | | typedef struct bn_recp_ctx_st { |
176 | | BIGNUM N; // the divisor |
177 | | BIGNUM Nr; // the reciprocal |
178 | | int num_bits; |
179 | | int shift; |
180 | | int flags; |
181 | | } BN_RECP_CTX; |
182 | | |
183 | 1.56k | static void BN_RECP_CTX_init(BN_RECP_CTX *recp) { |
184 | 1.56k | BN_init(&recp->N); |
185 | 1.56k | BN_init(&recp->Nr); |
186 | 1.56k | recp->num_bits = 0; |
187 | 1.56k | recp->shift = 0; |
188 | 1.56k | recp->flags = 0; |
189 | 1.56k | } |
190 | | |
191 | 1.56k | static void BN_RECP_CTX_free(BN_RECP_CTX *recp) { |
192 | 1.56k | if (recp == NULL) { |
193 | 0 | return; |
194 | 0 | } |
195 | | |
196 | 1.56k | BN_free(&recp->N); |
197 | 1.56k | BN_free(&recp->Nr); |
198 | 1.56k | } |
199 | | |
200 | 1.56k | static int BN_RECP_CTX_set(BN_RECP_CTX *recp, const BIGNUM *d, BN_CTX *ctx) { |
201 | 1.56k | if (!BN_copy(&(recp->N), d)) { |
202 | 0 | return 0; |
203 | 0 | } |
204 | 1.56k | BN_zero(&recp->Nr); |
205 | 1.56k | recp->num_bits = BN_num_bits(d); |
206 | 1.56k | recp->shift = 0; |
207 | | |
208 | 1.56k | return 1; |
209 | 1.56k | } |
210 | | |
211 | | // len is the expected size of the result We actually calculate with an extra |
212 | | // word of precision, so we can do faster division if the remainder is not |
213 | | // required. |
214 | | // r := 2^len / m |
215 | 1.48k | static int BN_reciprocal(BIGNUM *r, const BIGNUM *m, int len, BN_CTX *ctx) { |
216 | 1.48k | int ret = -1; |
217 | 1.48k | BIGNUM *t; |
218 | | |
219 | 1.48k | BN_CTX_start(ctx); |
220 | 1.48k | t = BN_CTX_get(ctx); |
221 | 1.48k | if (t == NULL) { |
222 | 0 | goto err; |
223 | 0 | } |
224 | | |
225 | 1.48k | if (!BN_set_bit(t, len)) { |
226 | 0 | goto err; |
227 | 0 | } |
228 | | |
229 | 1.48k | if (!BN_div(r, NULL, t, m, ctx)) { |
230 | 0 | goto err; |
231 | 0 | } |
232 | | |
233 | 1.48k | ret = len; |
234 | | |
235 | 1.48k | err: |
236 | 1.48k | BN_CTX_end(ctx); |
237 | 1.48k | return ret; |
238 | 1.48k | } |
239 | | |
240 | | static int BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, |
241 | 126k | BN_RECP_CTX *recp, BN_CTX *ctx) { |
242 | 126k | int i, j, ret = 0; |
243 | 126k | BIGNUM *a, *b, *d, *r; |
244 | | |
245 | 126k | BN_CTX_start(ctx); |
246 | 126k | a = BN_CTX_get(ctx); |
247 | 126k | b = BN_CTX_get(ctx); |
248 | 126k | if (dv != NULL) { |
249 | 0 | d = dv; |
250 | 126k | } else { |
251 | 126k | d = BN_CTX_get(ctx); |
252 | 126k | } |
253 | | |
254 | 126k | if (rem != NULL) { |
255 | 126k | r = rem; |
256 | 126k | } else { |
257 | 0 | r = BN_CTX_get(ctx); |
258 | 0 | } |
259 | | |
260 | 126k | if (a == NULL || b == NULL || d == NULL || r == NULL) { |
261 | 0 | goto err; |
262 | 0 | } |
263 | | |
264 | 126k | if (BN_ucmp(m, &recp->N) < 0) { |
265 | 37.5k | BN_zero(d); |
266 | 37.5k | if (!BN_copy(r, m)) { |
267 | 0 | goto err; |
268 | 0 | } |
269 | 37.5k | BN_CTX_end(ctx); |
270 | 37.5k | return 1; |
271 | 37.5k | } |
272 | | |
273 | | // We want the remainder |
274 | | // Given input of ABCDEF / ab |
275 | | // we need multiply ABCDEF by 3 digests of the reciprocal of ab |
276 | | |
277 | | // i := max(BN_num_bits(m), 2*BN_num_bits(N)) |
278 | 89.3k | i = BN_num_bits(m); |
279 | 89.3k | j = recp->num_bits << 1; |
280 | 89.3k | if (j > i) { |
281 | 81.2k | i = j; |
282 | 81.2k | } |
283 | | |
284 | | // Nr := round(2^i / N) |
285 | 89.3k | if (i != recp->shift) { |
286 | 1.48k | recp->shift = |
287 | 1.48k | BN_reciprocal(&(recp->Nr), &(recp->N), i, |
288 | 1.48k | ctx); // BN_reciprocal returns i, or -1 for an error |
289 | 1.48k | } |
290 | | |
291 | 89.3k | if (recp->shift == -1) { |
292 | 0 | goto err; |
293 | 0 | } |
294 | | |
295 | | // d := |round(round(m / 2^BN_num_bits(N)) * recp->Nr / 2^(i - |
296 | | // BN_num_bits(N)))| |
297 | | // = |round(round(m / 2^BN_num_bits(N)) * round(2^i / N) / 2^(i - |
298 | | // BN_num_bits(N)))| |
299 | | // <= |(m / 2^BN_num_bits(N)) * (2^i / N) * (2^BN_num_bits(N) / 2^i)| |
300 | | // = |m/N| |
301 | 89.3k | if (!BN_rshift(a, m, recp->num_bits)) { |
302 | 0 | goto err; |
303 | 0 | } |
304 | 89.3k | if (!BN_mul(b, a, &(recp->Nr), ctx)) { |
305 | 0 | goto err; |
306 | 0 | } |
307 | 89.3k | if (!BN_rshift(d, b, i - recp->num_bits)) { |
308 | 0 | goto err; |
309 | 0 | } |
310 | 89.3k | d->neg = 0; |
311 | | |
312 | 89.3k | if (!BN_mul(b, &(recp->N), d, ctx)) { |
313 | 0 | goto err; |
314 | 0 | } |
315 | 89.3k | if (!BN_usub(r, m, b)) { |
316 | 0 | goto err; |
317 | 0 | } |
318 | 89.3k | r->neg = 0; |
319 | | |
320 | 89.3k | j = 0; |
321 | 144k | while (BN_ucmp(r, &(recp->N)) >= 0) { |
322 | 55.1k | if (j++ > 2) { |
323 | 0 | OPENSSL_PUT_ERROR(BN, BN_R_BAD_RECIPROCAL); |
324 | 0 | goto err; |
325 | 0 | } |
326 | 55.1k | if (!BN_usub(r, r, &(recp->N))) { |
327 | 0 | goto err; |
328 | 0 | } |
329 | 55.1k | if (!BN_add_word(d, 1)) { |
330 | 0 | goto err; |
331 | 0 | } |
332 | 55.1k | } |
333 | | |
334 | 89.3k | r->neg = BN_is_zero(r) ? 0 : m->neg; |
335 | 89.3k | d->neg = m->neg ^ recp->N.neg; |
336 | 89.3k | ret = 1; |
337 | | |
338 | 89.3k | err: |
339 | 89.3k | BN_CTX_end(ctx); |
340 | 89.3k | return ret; |
341 | 89.3k | } |
342 | | |
343 | | static int BN_mod_mul_reciprocal(BIGNUM *r, const BIGNUM *x, const BIGNUM *y, |
344 | 126k | BN_RECP_CTX *recp, BN_CTX *ctx) { |
345 | 126k | int ret = 0; |
346 | 126k | BIGNUM *a; |
347 | 126k | const BIGNUM *ca; |
348 | | |
349 | 126k | BN_CTX_start(ctx); |
350 | 126k | a = BN_CTX_get(ctx); |
351 | 126k | if (a == NULL) { |
352 | 0 | goto err; |
353 | 0 | } |
354 | | |
355 | 126k | if (y != NULL) { |
356 | 126k | if (x == y) { |
357 | 105k | if (!BN_sqr(a, x, ctx)) { |
358 | 0 | goto err; |
359 | 0 | } |
360 | 105k | } else { |
361 | 21.2k | if (!BN_mul(a, x, y, ctx)) { |
362 | 0 | goto err; |
363 | 0 | } |
364 | 21.2k | } |
365 | 126k | ca = a; |
366 | 126k | } else { |
367 | 0 | ca = x; // Just do the mod |
368 | 0 | } |
369 | | |
370 | 126k | ret = BN_div_recp(NULL, r, ca, recp, ctx); |
371 | | |
372 | 126k | err: |
373 | 126k | BN_CTX_end(ctx); |
374 | 126k | return ret; |
375 | 126k | } |
376 | | |
377 | | // BN_window_bits_for_exponent_size returns sliding window size for mod_exp with |
378 | | // a |b| bit exponent. |
379 | | // |
380 | | // For window size 'w' (w >= 2) and a random 'b' bits exponent, the number of |
381 | | // multiplications is a constant plus on average |
382 | | // |
383 | | // 2^(w-1) + (b-w)/(w+1); |
384 | | // |
385 | | // here 2^(w-1) is for precomputing the table (we actually need entries only |
386 | | // for windows that have the lowest bit set), and (b-w)/(w+1) is an |
387 | | // approximation for the expected number of w-bit windows, not counting the |
388 | | // first one. |
389 | | // |
390 | | // Thus we should use |
391 | | // |
392 | | // w >= 6 if b > 671 |
393 | | // w = 5 if 671 > b > 239 |
394 | | // w = 4 if 239 > b > 79 |
395 | | // w = 3 if 79 > b > 23 |
396 | | // w <= 2 if 23 > b |
397 | | // |
398 | | // (with draws in between). Very small exponents are often selected |
399 | | // with low Hamming weight, so we use w = 1 for b <= 23. |
400 | 141k | static int BN_window_bits_for_exponent_size(size_t b) { |
401 | 141k | if (b > 671) { |
402 | 532 | return 6; |
403 | 532 | } |
404 | 140k | if (b > 239) { |
405 | 78.3k | return 5; |
406 | 78.3k | } |
407 | 62.2k | if (b > 79) { |
408 | 4.99k | return 4; |
409 | 4.99k | } |
410 | 57.2k | if (b > 23) { |
411 | 869 | return 3; |
412 | 869 | } |
413 | 56.3k | return 1; |
414 | 57.2k | } |
415 | | |
416 | | // TABLE_SIZE is the maximum precomputation table size for *variable* sliding |
417 | | // windows. This must be 2^(max_window - 1), where max_window is the largest |
418 | | // value returned from |BN_window_bits_for_exponent_size|. |
419 | | #define TABLE_SIZE 32 |
420 | | |
421 | | // TABLE_BITS_SMALL is the smallest value returned from |
422 | | // |BN_window_bits_for_exponent_size| when |b| is at most |BN_BITS2| * |
423 | | // |BN_SMALL_MAX_WORDS| words. |
424 | 77.2k | #define TABLE_BITS_SMALL 5 |
425 | | |
426 | | // TABLE_SIZE_SMALL is the same as |TABLE_SIZE|, but when |b| is at most |
427 | | // |BN_BITS2| * |BN_SMALL_MAX_WORDS|. |
428 | | #define TABLE_SIZE_SMALL (1 << (TABLE_BITS_SMALL - 1)) |
429 | | |
430 | | static int mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, |
431 | 1.59k | const BIGNUM *m, BN_CTX *ctx) { |
432 | 1.59k | int i, j, ret = 0, wstart, window; |
433 | 1.59k | int start = 1; |
434 | 1.59k | BIGNUM *aa; |
435 | | // Table of variables obtained from 'ctx' |
436 | 1.59k | BIGNUM *val[TABLE_SIZE]; |
437 | 1.59k | BN_RECP_CTX recp; |
438 | | |
439 | | // This function is only called on even moduli. |
440 | 1.59k | assert(!BN_is_odd(m)); |
441 | | |
442 | 1.59k | int bits = BN_num_bits(p); |
443 | 1.59k | if (bits == 0) { |
444 | 24 | return BN_one(r); |
445 | 24 | } |
446 | | |
447 | 1.56k | BN_RECP_CTX_init(&recp); |
448 | 1.56k | BN_CTX_start(ctx); |
449 | 1.56k | aa = BN_CTX_get(ctx); |
450 | 1.56k | val[0] = BN_CTX_get(ctx); |
451 | 1.56k | if (!aa || !val[0]) { |
452 | 0 | goto err; |
453 | 0 | } |
454 | | |
455 | 1.56k | if (m->neg) { |
456 | | // ignore sign of 'm' |
457 | 0 | if (!BN_copy(aa, m)) { |
458 | 0 | goto err; |
459 | 0 | } |
460 | 0 | aa->neg = 0; |
461 | 0 | if (BN_RECP_CTX_set(&recp, aa, ctx) <= 0) { |
462 | 0 | goto err; |
463 | 0 | } |
464 | 1.56k | } else { |
465 | 1.56k | if (BN_RECP_CTX_set(&recp, m, ctx) <= 0) { |
466 | 0 | goto err; |
467 | 0 | } |
468 | 1.56k | } |
469 | | |
470 | 1.56k | if (!BN_nnmod(val[0], a, m, ctx)) { |
471 | 0 | goto err; // 1 |
472 | 0 | } |
473 | 1.56k | if (BN_is_zero(val[0])) { |
474 | 51 | BN_zero(r); |
475 | 51 | ret = 1; |
476 | 51 | goto err; |
477 | 51 | } |
478 | | |
479 | 1.51k | window = BN_window_bits_for_exponent_size(bits); |
480 | 1.51k | if (window > 1) { |
481 | 392 | if (!BN_mod_mul_reciprocal(aa, val[0], val[0], &recp, ctx)) { |
482 | 0 | goto err; // 2 |
483 | 0 | } |
484 | 392 | j = 1 << (window - 1); |
485 | 3.17k | for (i = 1; i < j; i++) { |
486 | 2.78k | if (((val[i] = BN_CTX_get(ctx)) == NULL) || |
487 | 2.78k | !BN_mod_mul_reciprocal(val[i], val[i - 1], aa, &recp, ctx)) { |
488 | 0 | goto err; |
489 | 0 | } |
490 | 2.78k | } |
491 | 392 | } |
492 | | |
493 | 1.51k | start = 1; // This is used to avoid multiplication etc |
494 | | // when there is only the value '1' in the |
495 | | // buffer. |
496 | 1.51k | wstart = bits - 1; // The top bit of the window |
497 | | |
498 | 1.51k | if (!BN_one(r)) { |
499 | 0 | goto err; |
500 | 0 | } |
501 | | |
502 | 71.6k | for (;;) { |
503 | 71.6k | int wvalue; // The 'value' of the window |
504 | 71.6k | int wend; // The bottom bit of the window |
505 | | |
506 | 71.6k | if (!BN_is_bit_set(p, wstart)) { |
507 | 53.1k | if (!start) { |
508 | 53.1k | if (!BN_mod_mul_reciprocal(r, r, r, &recp, ctx)) { |
509 | 0 | goto err; |
510 | 0 | } |
511 | 53.1k | } |
512 | 53.1k | if (wstart == 0) { |
513 | 438 | break; |
514 | 438 | } |
515 | 52.6k | wstart--; |
516 | 52.6k | continue; |
517 | 53.1k | } |
518 | | |
519 | | // We now have wstart on a 'set' bit, we now need to work out |
520 | | // how bit a window to do. To do this we need to scan |
521 | | // forward until the last set bit before the end of the |
522 | | // window |
523 | 18.5k | wvalue = 1; |
524 | 18.5k | wend = 0; |
525 | 66.3k | for (i = 1; i < window; i++) { |
526 | 48.0k | if (wstart - i < 0) { |
527 | 168 | break; |
528 | 168 | } |
529 | 47.8k | if (BN_is_bit_set(p, wstart - i)) { |
530 | 30.2k | wvalue <<= (i - wend); |
531 | 30.2k | wvalue |= 1; |
532 | 30.2k | wend = i; |
533 | 30.2k | } |
534 | 47.8k | } |
535 | | |
536 | | // wend is the size of the current window |
537 | 18.5k | j = wend + 1; |
538 | | // add the 'bytes above' |
539 | 18.5k | if (!start) { |
540 | 69.1k | for (i = 0; i < j; i++) { |
541 | 52.1k | if (!BN_mod_mul_reciprocal(r, r, r, &recp, ctx)) { |
542 | 0 | goto err; |
543 | 0 | } |
544 | 52.1k | } |
545 | 16.9k | } |
546 | | |
547 | | // wvalue will be an odd number < 2^window |
548 | 18.5k | if (!BN_mod_mul_reciprocal(r, r, val[wvalue >> 1], &recp, ctx)) { |
549 | 0 | goto err; |
550 | 0 | } |
551 | | |
552 | | // move the 'window' down further |
553 | 18.5k | wstart -= wend + 1; |
554 | 18.5k | start = 0; |
555 | 18.5k | if (wstart < 0) { |
556 | 1.08k | break; |
557 | 1.08k | } |
558 | 18.5k | } |
559 | 1.51k | ret = 1; |
560 | | |
561 | 1.56k | err: |
562 | 1.56k | BN_CTX_end(ctx); |
563 | 1.56k | BN_RECP_CTX_free(&recp); |
564 | 1.56k | return ret; |
565 | 1.51k | } |
566 | | |
567 | | int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m, |
568 | 2.81k | BN_CTX *ctx) { |
569 | 2.81k | if (m->neg) { |
570 | 0 | OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); |
571 | 0 | return 0; |
572 | 0 | } |
573 | 2.81k | if (a->neg || BN_ucmp(a, m) >= 0) { |
574 | 2.23k | if (!BN_nnmod(r, a, m, ctx)) { |
575 | 0 | return 0; |
576 | 0 | } |
577 | 2.23k | a = r; |
578 | 2.23k | } |
579 | | |
580 | 2.81k | if (BN_is_odd(m)) { |
581 | 1.22k | return BN_mod_exp_mont(r, a, p, m, ctx, NULL); |
582 | 1.22k | } |
583 | | |
584 | 1.59k | return mod_exp_recp(r, a, p, m, ctx); |
585 | 2.81k | } |
586 | | |
587 | | int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, |
588 | 62.7k | const BIGNUM *m, BN_CTX *ctx, const BN_MONT_CTX *mont) { |
589 | 62.7k | if (!BN_is_odd(m)) { |
590 | 0 | OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS); |
591 | 0 | return 0; |
592 | 0 | } |
593 | 62.7k | if (m->neg) { |
594 | 0 | OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); |
595 | 0 | return 0; |
596 | 0 | } |
597 | | // |a| is secret, but |a < m| is not. |
598 | 62.7k | if (a->neg || constant_time_declassify_int(BN_ucmp(a, m)) >= 0) { |
599 | 0 | OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED); |
600 | 0 | return 0; |
601 | 0 | } |
602 | | |
603 | 62.7k | int bits = BN_num_bits(p); |
604 | 62.7k | if (bits == 0) { |
605 | | // x**0 mod 1 is still zero. |
606 | 318 | if (BN_abs_is_word(m, 1)) { |
607 | 4 | BN_zero(rr); |
608 | 4 | return 1; |
609 | 4 | } |
610 | 314 | return BN_one(rr); |
611 | 318 | } |
612 | | |
613 | 62.4k | int ret = 0; |
614 | 62.4k | BIGNUM *val[TABLE_SIZE]; |
615 | 62.4k | BN_MONT_CTX *new_mont = NULL; |
616 | | |
617 | 62.4k | BN_CTX_start(ctx); |
618 | 62.4k | BIGNUM *r = BN_CTX_get(ctx); |
619 | 62.4k | val[0] = BN_CTX_get(ctx); |
620 | 62.4k | if (r == NULL || val[0] == NULL) { |
621 | 0 | goto err; |
622 | 0 | } |
623 | | |
624 | | // Allocate a montgomery context if it was not supplied by the caller. |
625 | 62.4k | if (mont == NULL) { |
626 | 6.95k | new_mont = BN_MONT_CTX_new_consttime(m, ctx); |
627 | 6.95k | if (new_mont == NULL) { |
628 | 0 | goto err; |
629 | 0 | } |
630 | 6.95k | mont = new_mont; |
631 | 6.95k | } |
632 | | |
633 | | // We exponentiate by looking at sliding windows of the exponent and |
634 | | // precomputing powers of |a|. Windows may be shifted so they always end on a |
635 | | // set bit, so only precompute odd powers. We compute val[i] = a^(2*i + 1) |
636 | | // for i = 0 to 2^(window-1), all in Montgomery form. |
637 | 62.4k | int window = BN_window_bits_for_exponent_size(bits); |
638 | 62.4k | if (!BN_to_montgomery(val[0], a, mont, ctx)) { |
639 | 0 | goto err; |
640 | 0 | } |
641 | 62.4k | if (window > 1) { |
642 | 7.16k | BIGNUM *d = BN_CTX_get(ctx); |
643 | 7.16k | if (d == NULL || |
644 | 7.16k | !BN_mod_mul_montgomery(d, val[0], val[0], mont, ctx)) { |
645 | 0 | goto err; |
646 | 0 | } |
647 | 76.5k | for (int i = 1; i < 1 << (window - 1); i++) { |
648 | 69.3k | val[i] = BN_CTX_get(ctx); |
649 | 69.3k | if (val[i] == NULL || |
650 | 69.3k | !BN_mod_mul_montgomery(val[i], val[i - 1], d, mont, ctx)) { |
651 | 0 | goto err; |
652 | 0 | } |
653 | 69.3k | } |
654 | 7.16k | } |
655 | | |
656 | | // |p| is non-zero, so at least one window is non-zero. To save some |
657 | | // multiplications, defer initializing |r| until then. |
658 | 62.4k | int r_is_one = 1; |
659 | 62.4k | int wstart = bits - 1; // The top bit of the window. |
660 | 1.76M | for (;;) { |
661 | 1.76M | if (!BN_is_bit_set(p, wstart)) { |
662 | 1.36M | if (!r_is_one && !BN_mod_mul_montgomery(r, r, r, mont, ctx)) { |
663 | 0 | goto err; |
664 | 0 | } |
665 | 1.36M | if (wstart == 0) { |
666 | 2.01k | break; |
667 | 2.01k | } |
668 | 1.36M | wstart--; |
669 | 1.36M | continue; |
670 | 1.36M | } |
671 | | |
672 | | // We now have wstart on a set bit. Find the largest window we can use. |
673 | 404k | int wvalue = 1; |
674 | 404k | int wsize = 0; |
675 | 1.25M | for (int i = 1; i < window && i <= wstart; i++) { |
676 | 847k | if (BN_is_bit_set(p, wstart - i)) { |
677 | 724k | wvalue <<= (i - wsize); |
678 | 724k | wvalue |= 1; |
679 | 724k | wsize = i; |
680 | 724k | } |
681 | 847k | } |
682 | | |
683 | | // Shift |r| to the end of the window. |
684 | 404k | if (!r_is_one) { |
685 | 1.42M | for (int i = 0; i < wsize + 1; i++) { |
686 | 1.08M | if (!BN_mod_mul_montgomery(r, r, r, mont, ctx)) { |
687 | 0 | goto err; |
688 | 0 | } |
689 | 1.08M | } |
690 | 342k | } |
691 | | |
692 | 404k | assert(wvalue & 1); |
693 | 404k | assert(wvalue < (1 << window)); |
694 | 404k | if (r_is_one) { |
695 | 62.4k | if (!BN_copy(r, val[wvalue >> 1])) { |
696 | 0 | goto err; |
697 | 0 | } |
698 | 342k | } else if (!BN_mod_mul_montgomery(r, r, val[wvalue >> 1], mont, ctx)) { |
699 | 0 | goto err; |
700 | 0 | } |
701 | | |
702 | 404k | r_is_one = 0; |
703 | 404k | if (wstart == wsize) { |
704 | 60.3k | break; |
705 | 60.3k | } |
706 | 344k | wstart -= wsize + 1; |
707 | 344k | } |
708 | | |
709 | | // |p| is non-zero, so |r_is_one| must be cleared at some point. |
710 | 62.4k | assert(!r_is_one); |
711 | | |
712 | 62.4k | if (!BN_from_montgomery(rr, r, mont, ctx)) { |
713 | 0 | goto err; |
714 | 0 | } |
715 | 62.4k | ret = 1; |
716 | | |
717 | 62.4k | err: |
718 | 62.4k | BN_MONT_CTX_free(new_mont); |
719 | 62.4k | BN_CTX_end(ctx); |
720 | 62.4k | return ret; |
721 | 62.4k | } |
722 | | |
723 | | void bn_mod_exp_mont_small(BN_ULONG *r, const BN_ULONG *a, size_t num, |
724 | | const BN_ULONG *p, size_t num_p, |
725 | 77.2k | const BN_MONT_CTX *mont) { |
726 | 77.2k | if (num != (size_t)mont->N.width || num > BN_SMALL_MAX_WORDS || |
727 | 77.2k | num_p > ((size_t)-1) / BN_BITS2) { |
728 | 0 | abort(); |
729 | 0 | } |
730 | 77.2k | assert(BN_is_odd(&mont->N)); |
731 | | |
732 | | // Count the number of bits in |p|, skipping leading zeros. Note this function |
733 | | // treats |p| as public. |
734 | 77.2k | while (num_p != 0 && p[num_p - 1] == 0) { |
735 | 0 | num_p--; |
736 | 0 | } |
737 | 77.2k | if (num_p == 0) { |
738 | 0 | bn_from_montgomery_small(r, num, mont->RR.d, num, mont); |
739 | 0 | return; |
740 | 0 | } |
741 | 77.2k | size_t bits = BN_num_bits_word(p[num_p - 1]) + (num_p - 1) * BN_BITS2; |
742 | 77.2k | assert(bits != 0); |
743 | | |
744 | | // We exponentiate by looking at sliding windows of the exponent and |
745 | | // precomputing powers of |a|. Windows may be shifted so they always end on a |
746 | | // set bit, so only precompute odd powers. We compute val[i] = a^(2*i + 1) for |
747 | | // i = 0 to 2^(window-1), all in Montgomery form. |
748 | 77.2k | unsigned window = BN_window_bits_for_exponent_size(bits); |
749 | 77.2k | if (window > TABLE_BITS_SMALL) { |
750 | 0 | window = TABLE_BITS_SMALL; // Tolerate excessively large |p|. |
751 | 0 | } |
752 | 77.2k | BN_ULONG val[TABLE_SIZE_SMALL][BN_SMALL_MAX_WORDS]; |
753 | 77.2k | OPENSSL_memcpy(val[0], a, num * sizeof(BN_ULONG)); |
754 | 77.2k | if (window > 1) { |
755 | 77.2k | BN_ULONG d[BN_SMALL_MAX_WORDS]; |
756 | 77.2k | bn_mod_mul_montgomery_small(d, val[0], val[0], num, mont); |
757 | 1.23M | for (unsigned i = 1; i < 1u << (window - 1); i++) { |
758 | 1.15M | bn_mod_mul_montgomery_small(val[i], val[i - 1], d, num, mont); |
759 | 1.15M | } |
760 | 77.2k | } |
761 | | |
762 | | // |p| is non-zero, so at least one window is non-zero. To save some |
763 | | // multiplications, defer initializing |r| until then. |
764 | 77.2k | int r_is_one = 1; |
765 | 77.2k | size_t wstart = bits - 1; // The top bit of the window. |
766 | 10.0M | for (;;) { |
767 | 10.0M | if (!bn_is_bit_set_words(p, num_p, wstart)) { |
768 | 5.04M | if (!r_is_one) { |
769 | 5.04M | bn_mod_mul_montgomery_small(r, r, r, num, mont); |
770 | 5.04M | } |
771 | 5.04M | if (wstart == 0) { |
772 | 0 | break; |
773 | 0 | } |
774 | 5.04M | wstart--; |
775 | 5.04M | continue; |
776 | 5.04M | } |
777 | | |
778 | | // We now have wstart on a set bit. Find the largest window we can use. |
779 | 5.04M | unsigned wvalue = 1; |
780 | 5.04M | unsigned wsize = 0; |
781 | 24.9M | for (unsigned i = 1; i < window && i <= wstart; i++) { |
782 | 19.8M | if (bn_is_bit_set_words(p, num_p, wstart - i)) { |
783 | 19.6M | wvalue <<= (i - wsize); |
784 | 19.6M | wvalue |= 1; |
785 | 19.6M | wsize = i; |
786 | 19.6M | } |
787 | 19.8M | } |
788 | | |
789 | | // Shift |r| to the end of the window. |
790 | 5.04M | if (!r_is_one) { |
791 | 29.2M | for (unsigned i = 0; i < wsize + 1; i++) { |
792 | 24.2M | bn_mod_mul_montgomery_small(r, r, r, num, mont); |
793 | 24.2M | } |
794 | 4.97M | } |
795 | | |
796 | 5.04M | assert(wvalue & 1); |
797 | 5.04M | assert(wvalue < (1u << window)); |
798 | 5.04M | if (r_is_one) { |
799 | 77.2k | OPENSSL_memcpy(r, val[wvalue >> 1], num * sizeof(BN_ULONG)); |
800 | 4.97M | } else { |
801 | 4.97M | bn_mod_mul_montgomery_small(r, r, val[wvalue >> 1], num, mont); |
802 | 4.97M | } |
803 | 5.04M | r_is_one = 0; |
804 | 5.04M | if (wstart == wsize) { |
805 | 77.2k | break; |
806 | 77.2k | } |
807 | 4.97M | wstart -= wsize + 1; |
808 | 4.97M | } |
809 | | |
810 | | // |p| is non-zero, so |r_is_one| must be cleared at some point. |
811 | 77.2k | assert(!r_is_one); |
812 | 77.2k | OPENSSL_cleanse(val, sizeof(val)); |
813 | 77.2k | } |
814 | | |
815 | | void bn_mod_inverse0_prime_mont_small(BN_ULONG *r, const BN_ULONG *a, |
816 | 77.2k | size_t num, const BN_MONT_CTX *mont) { |
817 | 77.2k | if (num != (size_t)mont->N.width || num > BN_SMALL_MAX_WORDS) { |
818 | 0 | abort(); |
819 | 0 | } |
820 | | |
821 | | // Per Fermat's Little Theorem, a^-1 = a^(p-2) (mod p) for p prime. |
822 | 77.2k | BN_ULONG p_minus_two[BN_SMALL_MAX_WORDS]; |
823 | 77.2k | const BN_ULONG *p = mont->N.d; |
824 | 77.2k | OPENSSL_memcpy(p_minus_two, p, num * sizeof(BN_ULONG)); |
825 | 77.2k | if (p_minus_two[0] >= 2) { |
826 | 77.2k | p_minus_two[0] -= 2; |
827 | 77.2k | } else { |
828 | 0 | p_minus_two[0] -= 2; |
829 | 0 | for (size_t i = 1; i < num; i++) { |
830 | 0 | if (p_minus_two[i]-- != 0) { |
831 | 0 | break; |
832 | 0 | } |
833 | 0 | } |
834 | 0 | } |
835 | | |
836 | 77.2k | bn_mod_exp_mont_small(r, a, num, p_minus_two, num, mont); |
837 | 77.2k | } |
838 | | |
839 | | static void copy_to_prebuf(const BIGNUM *b, int top, BN_ULONG *table, int idx, |
840 | 23.7k | int window) { |
841 | 23.7k | int ret = bn_copy_words(table + idx * top, top, b); |
842 | 23.7k | assert(ret); // |b| is guaranteed to fit. |
843 | 23.7k | (void)ret; |
844 | 23.7k | } |
845 | | |
846 | | static int copy_from_prebuf(BIGNUM *b, int top, const BN_ULONG *table, int idx, |
847 | 71.8k | int window) { |
848 | 71.8k | if (!bn_wexpand(b, top)) { |
849 | 0 | return 0; |
850 | 0 | } |
851 | | |
852 | 71.8k | OPENSSL_memset(b->d, 0, sizeof(BN_ULONG) * top); |
853 | 71.8k | const int width = 1 << window; |
854 | 1.02M | for (int i = 0; i < width; i++, table += top) { |
855 | | // Use a value barrier to prevent Clang from adding a branch when |i != idx| |
856 | | // and making this copy not constant time. Clang is still allowed to learn |
857 | | // that |mask| is constant across the inner loop, so this won't inhibit any |
858 | | // vectorization it might do. |
859 | 950k | BN_ULONG mask = value_barrier_w(constant_time_eq_int(i, idx)); |
860 | 8.51M | for (int j = 0; j < top; j++) { |
861 | 7.56M | b->d[j] |= table[j] & mask; |
862 | 7.56M | } |
863 | 950k | } |
864 | | |
865 | 71.8k | b->width = top; |
866 | 71.8k | return 1; |
867 | 71.8k | } |
868 | | |
869 | | // Window sizes optimized for fixed window size modular exponentiation |
870 | | // algorithm (BN_mod_exp_mont_consttime). |
871 | | // |
872 | | // TODO(davidben): These window sizes were originally set for 64-byte cache |
873 | | // lines with a cache-line-dependent constant-time mitigation. They can probably |
874 | | // be revised now that our implementation is no longer cache-time-dependent. |
875 | | #define BN_window_bits_for_ctime_exponent_size(b) \ |
876 | 81.0k | ((b) > 937 ? 6 : (b) > 306 ? 5 : (b) > 89 ? 4 : (b) > 22 ? 3 : 1) |
877 | | #define BN_MAX_MOD_EXP_CTIME_WINDOW (6) |
878 | | |
879 | | // This variant of |BN_mod_exp_mont| uses fixed windows and fixed memory access |
880 | | // patterns to protect secret exponents (cf. the hyper-threading timing attacks |
881 | | // pointed out by Colin Percival, |
882 | | // http://www.daemonology.net/hyperthreading-considered-harmful/) |
883 | | int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, |
884 | | const BIGNUM *m, BN_CTX *ctx, |
885 | 81.0k | const BN_MONT_CTX *mont) { |
886 | 81.0k | int i, ret = 0, wvalue; |
887 | 81.0k | BN_MONT_CTX *new_mont = NULL; |
888 | | |
889 | 81.0k | unsigned char *powerbuf_free = NULL; |
890 | 81.0k | size_t powerbuf_len = 0; |
891 | 81.0k | BN_ULONG *powerbuf = NULL; |
892 | | |
893 | 81.0k | if (!BN_is_odd(m)) { |
894 | 0 | OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS); |
895 | 0 | return 0; |
896 | 0 | } |
897 | 81.0k | if (m->neg) { |
898 | 0 | OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); |
899 | 0 | return 0; |
900 | 0 | } |
901 | 81.0k | if (a->neg || BN_ucmp(a, m) >= 0) { |
902 | 0 | OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED); |
903 | 0 | return 0; |
904 | 0 | } |
905 | | |
906 | | // Use all bits stored in |p|, rather than |BN_num_bits|, so we do not leak |
907 | | // whether the top bits are zero. |
908 | 81.0k | int max_bits = p->width * BN_BITS2; |
909 | 81.0k | int bits = max_bits; |
910 | 81.0k | if (bits == 0) { |
911 | | // x**0 mod 1 is still zero. |
912 | 0 | if (BN_abs_is_word(m, 1)) { |
913 | 0 | BN_zero(rr); |
914 | 0 | return 1; |
915 | 0 | } |
916 | 0 | return BN_one(rr); |
917 | 0 | } |
918 | | |
919 | | // Allocate a montgomery context if it was not supplied by the caller. |
920 | 81.0k | if (mont == NULL) { |
921 | 1.63k | new_mont = BN_MONT_CTX_new_consttime(m, ctx); |
922 | 1.63k | if (new_mont == NULL) { |
923 | 0 | goto err; |
924 | 0 | } |
925 | 1.63k | mont = new_mont; |
926 | 1.63k | } |
927 | | |
928 | | // Use the width in |mont->N|, rather than the copy in |m|. The assembly |
929 | | // implementation assumes it can use |top| to size R. |
930 | 81.0k | int top = mont->N.width; |
931 | | |
932 | 81.0k | #if defined(OPENSSL_BN_ASM_MONT5) || defined(RSAZ_ENABLED) |
933 | | // Share one large stack-allocated buffer between the RSAZ and non-RSAZ code |
934 | | // paths. If we were to use separate static buffers for each then there is |
935 | | // some chance that both large buffers would be allocated on the stack, |
936 | | // causing the stack space requirement to be truly huge (~10KB). |
937 | 81.0k | alignas(MOD_EXP_CTIME_ALIGN) BN_ULONG storage[MOD_EXP_CTIME_STORAGE_LEN]; |
938 | 81.0k | #endif |
939 | 81.0k | #if defined(RSAZ_ENABLED) |
940 | | // If the size of the operands allow it, perform the optimized RSAZ |
941 | | // exponentiation. For further information see crypto/fipsmodule/bn/rsaz_exp.c |
942 | | // and accompanying assembly modules. |
943 | 81.0k | if (a->width == 16 && p->width == 16 && BN_num_bits(m) == 1024 && |
944 | 81.0k | rsaz_avx2_preferred()) { |
945 | 0 | if (!bn_wexpand(rr, 16)) { |
946 | 0 | goto err; |
947 | 0 | } |
948 | 0 | RSAZ_1024_mod_exp_avx2(rr->d, a->d, p->d, m->d, mont->RR.d, mont->n0[0], |
949 | 0 | storage); |
950 | 0 | rr->width = 16; |
951 | 0 | rr->neg = 0; |
952 | 0 | ret = 1; |
953 | 0 | goto err; |
954 | 0 | } |
955 | 81.0k | #endif |
956 | | |
957 | | // Get the window size to use with size of p. |
958 | 81.0k | int window = BN_window_bits_for_ctime_exponent_size(bits); |
959 | 81.0k | assert(window <= BN_MAX_MOD_EXP_CTIME_WINDOW); |
960 | | |
961 | | // Calculating |powerbuf_len| below cannot overflow because of the bound on |
962 | | // Montgomery reduction. |
963 | 81.0k | assert((size_t)top <= BN_MONTGOMERY_MAX_WORDS); |
964 | 81.0k | static_assert( |
965 | 81.0k | BN_MONTGOMERY_MAX_WORDS <= |
966 | 81.0k | INT_MAX / sizeof(BN_ULONG) / ((1 << BN_MAX_MOD_EXP_CTIME_WINDOW) + 3), |
967 | 81.0k | "powerbuf_len may overflow"); |
968 | | |
969 | 81.0k | #if defined(OPENSSL_BN_ASM_MONT5) |
970 | 81.0k | if (window >= 5) { |
971 | 78.5k | window = 5; // ~5% improvement for RSA2048 sign, and even for RSA4096 |
972 | | // Reserve space for the |mont->N| copy. |
973 | 78.5k | powerbuf_len += top * sizeof(mont->N.d[0]); |
974 | 78.5k | } |
975 | 81.0k | #endif |
976 | | |
977 | | // Allocate a buffer large enough to hold all of the pre-computed |
978 | | // powers of |am|, |am| itself, and |tmp|. |
979 | 81.0k | int num_powers = 1 << window; |
980 | 81.0k | powerbuf_len += sizeof(m->d[0]) * top * (num_powers + 2); |
981 | | |
982 | 81.0k | #if defined(OPENSSL_BN_ASM_MONT5) |
983 | 81.0k | if (powerbuf_len <= sizeof(storage)) { |
984 | 81.0k | powerbuf = storage; |
985 | 81.0k | } |
986 | | // |storage| is more than large enough to handle 1024-bit inputs. |
987 | 81.0k | assert(powerbuf != NULL || top * BN_BITS2 > 1024); |
988 | 81.0k | #endif |
989 | 81.0k | if (powerbuf == NULL) { |
990 | 74 | powerbuf_free = OPENSSL_malloc(powerbuf_len + MOD_EXP_CTIME_ALIGN); |
991 | 74 | if (powerbuf_free == NULL) { |
992 | 0 | goto err; |
993 | 0 | } |
994 | 74 | powerbuf = align_pointer(powerbuf_free, MOD_EXP_CTIME_ALIGN); |
995 | 74 | } |
996 | 81.0k | OPENSSL_memset(powerbuf, 0, powerbuf_len); |
997 | | |
998 | | // Place |tmp| and |am| right after powers table. |
999 | 81.0k | BIGNUM tmp, am; |
1000 | 81.0k | tmp.d = powerbuf + top * num_powers; |
1001 | 81.0k | am.d = tmp.d + top; |
1002 | 81.0k | tmp.width = am.width = 0; |
1003 | 81.0k | tmp.dmax = am.dmax = top; |
1004 | 81.0k | tmp.neg = am.neg = 0; |
1005 | 81.0k | tmp.flags = am.flags = BN_FLG_STATIC_DATA; |
1006 | | |
1007 | 81.0k | if (!bn_one_to_montgomery(&tmp, mont, ctx) || |
1008 | 81.0k | !bn_resize_words(&tmp, top)) { |
1009 | 0 | goto err; |
1010 | 0 | } |
1011 | | |
1012 | | // Prepare a^1 in the Montgomery domain. |
1013 | 81.0k | assert(!a->neg); |
1014 | 81.0k | assert(BN_ucmp(a, m) < 0); |
1015 | 81.0k | if (!BN_to_montgomery(&am, a, mont, ctx) || |
1016 | 81.0k | !bn_resize_words(&am, top)) { |
1017 | 0 | goto err; |
1018 | 0 | } |
1019 | | |
1020 | 81.0k | #if defined(OPENSSL_BN_ASM_MONT5) |
1021 | | // This optimization uses ideas from https://eprint.iacr.org/2011/239, |
1022 | | // specifically optimization of cache-timing attack countermeasures, |
1023 | | // pre-computation optimization, and Almost Montgomery Multiplication. |
1024 | | // |
1025 | | // The paper discusses a 4-bit window to optimize 512-bit modular |
1026 | | // exponentiation, used in RSA-1024 with CRT, but RSA-1024 is no longer |
1027 | | // important. |
1028 | | // |
1029 | | // |bn_mul_mont_gather5| and |bn_power5| implement the "almost" reduction |
1030 | | // variant, so the values here may not be fully reduced. They are bounded by R |
1031 | | // (i.e. they fit in |top| words), not |m|. Additionally, we pass these |
1032 | | // "almost" reduced inputs into |bn_mul_mont|, which implements the normal |
1033 | | // reduction variant. Given those inputs, |bn_mul_mont| may not give reduced |
1034 | | // output, but it will still produce "almost" reduced output. |
1035 | | // |
1036 | | // TODO(davidben): Using "almost" reduction complicates analysis of this code, |
1037 | | // and its interaction with other parts of the project. Determine whether this |
1038 | | // is actually necessary for performance. |
1039 | 81.0k | if (window == 5 && top > 1) { |
1040 | | // Copy |mont->N| to improve cache locality. |
1041 | 78.5k | BN_ULONG *np = am.d + top; |
1042 | 1.33M | for (i = 0; i < top; i++) { |
1043 | 1.25M | np[i] = mont->N.d[i]; |
1044 | 1.25M | } |
1045 | | |
1046 | | // Fill |powerbuf| with the first 32 powers of |am|. |
1047 | 78.5k | const BN_ULONG *n0 = mont->n0; |
1048 | 78.5k | bn_scatter5(tmp.d, top, powerbuf, 0); |
1049 | 78.5k | bn_scatter5(am.d, am.width, powerbuf, 1); |
1050 | 78.5k | bn_mul_mont(tmp.d, am.d, am.d, np, n0, top); |
1051 | 78.5k | bn_scatter5(tmp.d, top, powerbuf, 2); |
1052 | | |
1053 | | // Square to compute powers of two. |
1054 | 314k | for (i = 4; i < 32; i *= 2) { |
1055 | 235k | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); |
1056 | 235k | bn_scatter5(tmp.d, top, powerbuf, i); |
1057 | 235k | } |
1058 | | // Compute odd powers |i| based on |i - 1|, then all powers |i * 2^j|. |
1059 | 1.25M | for (i = 3; i < 32; i += 2) { |
1060 | 1.17M | bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1); |
1061 | 1.17M | bn_scatter5(tmp.d, top, powerbuf, i); |
1062 | 2.04M | for (int j = 2 * i; j < 32; j *= 2) { |
1063 | 863k | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); |
1064 | 863k | bn_scatter5(tmp.d, top, powerbuf, j); |
1065 | 863k | } |
1066 | 1.17M | } |
1067 | | |
1068 | 78.5k | bits--; |
1069 | 392k | for (wvalue = 0, i = bits % 5; i >= 0; i--, bits--) { |
1070 | 313k | wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); |
1071 | 313k | } |
1072 | 78.5k | bn_gather5(tmp.d, top, powerbuf, wvalue); |
1073 | | |
1074 | | // At this point |bits| is 4 mod 5 and at least -1. (|bits| is the first bit |
1075 | | // that has not been read yet.) |
1076 | 78.5k | assert(bits >= -1 && (bits == -1 || bits % 5 == 4)); |
1077 | | |
1078 | | // Scan the exponent one window at a time starting from the most |
1079 | | // significant bits. |
1080 | 78.5k | if (top & 7) { |
1081 | 5.98k | while (bits >= 0) { |
1082 | 35.6k | for (wvalue = 0, i = 0; i < 5; i++, bits--) { |
1083 | 29.7k | wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); |
1084 | 29.7k | } |
1085 | | |
1086 | 5.94k | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); |
1087 | 5.94k | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); |
1088 | 5.94k | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); |
1089 | 5.94k | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); |
1090 | 5.94k | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); |
1091 | 5.94k | bn_mul_mont_gather5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue); |
1092 | 5.94k | } |
1093 | 78.4k | } else { |
1094 | 78.4k | const uint8_t *p_bytes = (const uint8_t *)p->d; |
1095 | 78.4k | assert(bits < max_bits); |
1096 | | // |p = 0| has been handled as a special case, so |max_bits| is at least |
1097 | | // one word. |
1098 | 78.4k | assert(max_bits >= 64); |
1099 | | |
1100 | | // If the first bit to be read lands in the last byte, unroll the first |
1101 | | // iteration to avoid reading past the bounds of |p->d|. (After the first |
1102 | | // iteration, we are guaranteed to be past the last byte.) Note |bits| |
1103 | | // here is the top bit, inclusive. |
1104 | 78.4k | if (bits - 4 >= max_bits - 8) { |
1105 | | // Read five bits from |bits-4| through |bits|, inclusive. |
1106 | 12 | wvalue = p_bytes[p->width * BN_BYTES - 1]; |
1107 | 12 | wvalue >>= (bits - 4) & 7; |
1108 | 12 | wvalue &= 0x1f; |
1109 | 12 | bits -= 5; |
1110 | 12 | bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue); |
1111 | 12 | } |
1112 | 16.0M | while (bits >= 0) { |
1113 | | // Read five bits from |bits-4| through |bits|, inclusive. |
1114 | 16.0M | int first_bit = bits - 4; |
1115 | 16.0M | uint16_t val; |
1116 | 16.0M | OPENSSL_memcpy(&val, p_bytes + (first_bit >> 3), sizeof(val)); |
1117 | 16.0M | val >>= first_bit & 7; |
1118 | 16.0M | val &= 0x1f; |
1119 | 16.0M | bits -= 5; |
1120 | 16.0M | bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, val); |
1121 | 16.0M | } |
1122 | 78.4k | } |
1123 | | // The result is now in |tmp| in Montgomery form, but it may not be fully |
1124 | | // reduced. This is within bounds for |BN_from_montgomery| (tmp < R <= m*R) |
1125 | | // so it will, when converting from Montgomery form, produce a fully reduced |
1126 | | // result. |
1127 | | // |
1128 | | // This differs from Figure 2 of the paper, which uses AMM(h, 1) to convert |
1129 | | // from Montgomery form with unreduced output, followed by an extra |
1130 | | // reduction step. In the paper's terminology, we replace steps 9 and 10 |
1131 | | // with MM(h, 1). |
1132 | 78.5k | } else |
1133 | 2.58k | #endif |
1134 | 2.58k | { |
1135 | 2.58k | copy_to_prebuf(&tmp, top, powerbuf, 0, window); |
1136 | 2.58k | copy_to_prebuf(&am, top, powerbuf, 1, window); |
1137 | | |
1138 | | // If the window size is greater than 1, then calculate |
1139 | | // val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1) |
1140 | | // (even powers could instead be computed as (a^(i/2))^2 |
1141 | | // to use the slight performance advantage of sqr over mul). |
1142 | 2.58k | if (window > 1) { |
1143 | 2.58k | if (!BN_mod_mul_montgomery(&tmp, &am, &am, mont, ctx)) { |
1144 | 0 | goto err; |
1145 | 0 | } |
1146 | | |
1147 | 2.58k | copy_to_prebuf(&tmp, top, powerbuf, 2, window); |
1148 | | |
1149 | 18.5k | for (i = 3; i < num_powers; i++) { |
1150 | | // Calculate a^i = a^(i-1) * a |
1151 | 15.9k | if (!BN_mod_mul_montgomery(&tmp, &am, &tmp, mont, ctx)) { |
1152 | 0 | goto err; |
1153 | 0 | } |
1154 | | |
1155 | 15.9k | copy_to_prebuf(&tmp, top, powerbuf, i, window); |
1156 | 15.9k | } |
1157 | 2.58k | } |
1158 | | |
1159 | 2.58k | bits--; |
1160 | 6.02k | for (wvalue = 0, i = bits % window; i >= 0; i--, bits--) { |
1161 | 3.43k | wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); |
1162 | 3.43k | } |
1163 | 2.58k | if (!copy_from_prebuf(&tmp, top, powerbuf, wvalue, window)) { |
1164 | 0 | goto err; |
1165 | 0 | } |
1166 | | |
1167 | | // Scan the exponent one window at a time starting from the most |
1168 | | // significant bits. |
1169 | 71.8k | while (bits >= 0) { |
1170 | 69.2k | wvalue = 0; // The 'value' of the window |
1171 | | |
1172 | | // Scan the window, squaring the result as we go |
1173 | 310k | for (i = 0; i < window; i++, bits--) { |
1174 | 241k | if (!BN_mod_mul_montgomery(&tmp, &tmp, &tmp, mont, ctx)) { |
1175 | 0 | goto err; |
1176 | 0 | } |
1177 | 241k | wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); |
1178 | 241k | } |
1179 | | |
1180 | | // Fetch the appropriate pre-computed value from the pre-buf |
1181 | 69.2k | if (!copy_from_prebuf(&am, top, powerbuf, wvalue, window)) { |
1182 | 0 | goto err; |
1183 | 0 | } |
1184 | | |
1185 | | // Multiply the result into the intermediate result |
1186 | 69.2k | if (!BN_mod_mul_montgomery(&tmp, &tmp, &am, mont, ctx)) { |
1187 | 0 | goto err; |
1188 | 0 | } |
1189 | 69.2k | } |
1190 | 2.58k | } |
1191 | | |
1192 | | // Convert the final result from Montgomery to standard format. If we used the |
1193 | | // |OPENSSL_BN_ASM_MONT5| codepath, |tmp| may not be fully reduced. It is only |
1194 | | // bounded by R rather than |m|. However, that is still within bounds for |
1195 | | // |BN_from_montgomery|, which implements full Montgomery reduction, not |
1196 | | // "almost" Montgomery reduction. |
1197 | 81.0k | if (!BN_from_montgomery(rr, &tmp, mont, ctx)) { |
1198 | 0 | goto err; |
1199 | 0 | } |
1200 | 81.0k | ret = 1; |
1201 | | |
1202 | 81.0k | err: |
1203 | 81.0k | BN_MONT_CTX_free(new_mont); |
1204 | 81.0k | if (powerbuf != NULL && powerbuf_free == NULL) { |
1205 | 81.0k | OPENSSL_cleanse(powerbuf, powerbuf_len); |
1206 | 81.0k | } |
1207 | 81.0k | OPENSSL_free(powerbuf_free); |
1208 | 81.0k | return ret; |
1209 | 81.0k | } |
1210 | | |
1211 | | int BN_mod_exp_mont_word(BIGNUM *rr, BN_ULONG a, const BIGNUM *p, |
1212 | | const BIGNUM *m, BN_CTX *ctx, |
1213 | 0 | const BN_MONT_CTX *mont) { |
1214 | 0 | BIGNUM a_bignum; |
1215 | 0 | BN_init(&a_bignum); |
1216 | |
|
1217 | 0 | int ret = 0; |
1218 | | |
1219 | | // BN_mod_exp_mont requires reduced inputs. |
1220 | 0 | if (bn_minimal_width(m) == 1) { |
1221 | 0 | a %= m->d[0]; |
1222 | 0 | } |
1223 | |
|
1224 | 0 | if (!BN_set_word(&a_bignum, a)) { |
1225 | 0 | OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR); |
1226 | 0 | goto err; |
1227 | 0 | } |
1228 | | |
1229 | 0 | ret = BN_mod_exp_mont(rr, &a_bignum, p, m, ctx, mont); |
1230 | |
|
1231 | 0 | err: |
1232 | 0 | BN_free(&a_bignum); |
1233 | |
|
1234 | 0 | return ret; |
1235 | 0 | } |
1236 | | |
1237 | | #define TABLE_SIZE 32 |
1238 | | |
1239 | | int BN_mod_exp2_mont(BIGNUM *rr, const BIGNUM *a1, const BIGNUM *p1, |
1240 | | const BIGNUM *a2, const BIGNUM *p2, const BIGNUM *m, |
1241 | 0 | BN_CTX *ctx, const BN_MONT_CTX *mont) { |
1242 | 0 | BIGNUM tmp; |
1243 | 0 | BN_init(&tmp); |
1244 | |
|
1245 | 0 | int ret = 0; |
1246 | 0 | BN_MONT_CTX *new_mont = NULL; |
1247 | | |
1248 | | // Allocate a montgomery context if it was not supplied by the caller. |
1249 | 0 | if (mont == NULL) { |
1250 | 0 | new_mont = BN_MONT_CTX_new_for_modulus(m, ctx); |
1251 | 0 | if (new_mont == NULL) { |
1252 | 0 | goto err; |
1253 | 0 | } |
1254 | 0 | mont = new_mont; |
1255 | 0 | } |
1256 | | |
1257 | | // BN_mod_mul_montgomery removes one Montgomery factor, so passing one |
1258 | | // Montgomery-encoded and one non-Montgomery-encoded value gives a |
1259 | | // non-Montgomery-encoded result. |
1260 | 0 | if (!BN_mod_exp_mont(rr, a1, p1, m, ctx, mont) || |
1261 | 0 | !BN_mod_exp_mont(&tmp, a2, p2, m, ctx, mont) || |
1262 | 0 | !BN_to_montgomery(rr, rr, mont, ctx) || |
1263 | 0 | !BN_mod_mul_montgomery(rr, rr, &tmp, mont, ctx)) { |
1264 | 0 | goto err; |
1265 | 0 | } |
1266 | | |
1267 | 0 | ret = 1; |
1268 | |
|
1269 | 0 | err: |
1270 | 0 | BN_MONT_CTX_free(new_mont); |
1271 | 0 | BN_free(&tmp); |
1272 | |
|
1273 | 0 | return ret; |
1274 | 0 | } |