/src/boringssl/crypto/fipsmodule/bn/exponentiation.c.inc
Line | Count | Source (jump to first uncovered line) |
1 | | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) |
2 | | * All rights reserved. |
3 | | * |
4 | | * This package is an SSL implementation written |
5 | | * by Eric Young (eay@cryptsoft.com). |
6 | | * The implementation was written so as to conform with Netscapes SSL. |
7 | | * |
8 | | * This library is free for commercial and non-commercial use as long as |
9 | | * the following conditions are aheared to. The following conditions |
10 | | * apply to all code found in this distribution, be it the RC4, RSA, |
11 | | * lhash, DES, etc., code; not just the SSL code. The SSL documentation |
12 | | * included with this distribution is covered by the same copyright terms |
13 | | * except that the holder is Tim Hudson (tjh@cryptsoft.com). |
14 | | * |
15 | | * Copyright remains Eric Young's, and as such any Copyright notices in |
16 | | * the code are not to be removed. |
17 | | * If this package is used in a product, Eric Young should be given attribution |
18 | | * as the author of the parts of the library used. |
19 | | * This can be in the form of a textual message at program startup or |
20 | | * in documentation (online or textual) provided with the package. |
21 | | * |
22 | | * Redistribution and use in source and binary forms, with or without |
23 | | * modification, are permitted provided that the following conditions |
24 | | * are met: |
25 | | * 1. Redistributions of source code must retain the copyright |
26 | | * notice, this list of conditions and the following disclaimer. |
27 | | * 2. Redistributions in binary form must reproduce the above copyright |
28 | | * notice, this list of conditions and the following disclaimer in the |
29 | | * documentation and/or other materials provided with the distribution. |
30 | | * 3. All advertising materials mentioning features or use of this software |
31 | | * must display the following acknowledgement: |
32 | | * "This product includes cryptographic software written by |
33 | | * Eric Young (eay@cryptsoft.com)" |
34 | | * The word 'cryptographic' can be left out if the rouines from the library |
35 | | * being used are not cryptographic related :-). |
36 | | * 4. If you include any Windows specific code (or a derivative thereof) from |
37 | | * the apps directory (application code) you must include an acknowledgement: |
38 | | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" |
39 | | * |
40 | | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND |
41 | | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
42 | | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
43 | | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
44 | | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
45 | | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
46 | | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
47 | | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
48 | | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
49 | | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
50 | | * SUCH DAMAGE. |
51 | | * |
52 | | * The licence and distribution terms for any publically available version or |
53 | | * derivative of this code cannot be changed. i.e. this code cannot simply be |
54 | | * copied and put under another distribution licence |
55 | | * [including the GNU Public Licence.] |
56 | | */ |
57 | | /* ==================================================================== |
58 | | * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved. |
59 | | * |
60 | | * Redistribution and use in source and binary forms, with or without |
61 | | * modification, are permitted provided that the following conditions |
62 | | * are met: |
63 | | * |
64 | | * 1. Redistributions of source code must retain the above copyright |
65 | | * notice, this list of conditions and the following disclaimer. |
66 | | * |
67 | | * 2. Redistributions in binary form must reproduce the above copyright |
68 | | * notice, this list of conditions and the following disclaimer in |
69 | | * the documentation and/or other materials provided with the |
70 | | * distribution. |
71 | | * |
72 | | * 3. All advertising materials mentioning features or use of this |
73 | | * software must display the following acknowledgment: |
74 | | * "This product includes software developed by the OpenSSL Project |
75 | | * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" |
76 | | * |
77 | | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to |
78 | | * endorse or promote products derived from this software without |
79 | | * prior written permission. For written permission, please contact |
80 | | * openssl-core@openssl.org. |
81 | | * |
82 | | * 5. Products derived from this software may not be called "OpenSSL" |
83 | | * nor may "OpenSSL" appear in their names without prior written |
84 | | * permission of the OpenSSL Project. |
85 | | * |
86 | | * 6. Redistributions of any form whatsoever must retain the following |
87 | | * acknowledgment: |
88 | | * "This product includes software developed by the OpenSSL Project |
89 | | * for use in the OpenSSL Toolkit (http://www.openssl.org/)" |
90 | | * |
91 | | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY |
92 | | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
93 | | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
94 | | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR |
95 | | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
96 | | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
97 | | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
98 | | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
99 | | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, |
100 | | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
101 | | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED |
102 | | * OF THE POSSIBILITY OF SUCH DAMAGE. |
103 | | * ==================================================================== |
104 | | * |
105 | | * This product includes cryptographic software written by Eric Young |
106 | | * (eay@cryptsoft.com). This product includes software written by Tim |
107 | | * Hudson (tjh@cryptsoft.com). */ |
108 | | |
109 | | #include <openssl/bn.h> |
110 | | |
111 | | #include <assert.h> |
112 | | #include <limits.h> |
113 | | #include <stdlib.h> |
114 | | #include <string.h> |
115 | | |
116 | | #include <openssl/err.h> |
117 | | #include <openssl/mem.h> |
118 | | |
119 | | #include "internal.h" |
120 | | #include "rsaz_exp.h" |
121 | | |
122 | | #if defined(OPENSSL_BN_ASM_MONT5) |
123 | | |
124 | | // bn_mul_mont_gather5 multiples loads index |power| of |table|, multiplies it |
125 | | // by |ap| modulo |np|, and stores the result in |rp|. The values are |num| |
126 | | // words long and represented in Montgomery form. |n0| is a pointer to the |
127 | | // corresponding field in |BN_MONT_CTX|. |table| must be aligned to at least |
128 | | // 16 bytes. |power| must be less than 32 and is treated as secret. |
129 | | // |
130 | | // WARNING: This function implements Almost Montgomery Multiplication from |
131 | | // https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced. |
132 | | // However, even if they are fully reduced, the output may not be. |
133 | | static void bn_mul_mont_gather5( |
134 | | BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table, const BN_ULONG *np, |
135 | 36.1k | const BN_ULONG *n0, int num, int power) { |
136 | 36.1k | if (bn_mulx4x_mont_gather5_capable(num)) { |
137 | 330 | bn_mulx4x_mont_gather5(rp, ap, table, np, n0, num, power); |
138 | 35.8k | } else if (bn_mul4x_mont_gather5_capable(num)) { |
139 | 0 | bn_mul4x_mont_gather5(rp, ap, table, np, n0, num, power); |
140 | 35.8k | } else { |
141 | 35.8k | bn_mul_mont_gather5_nohw(rp, ap, table, np, n0, num, power); |
142 | 35.8k | } |
143 | 36.1k | } |
144 | | |
145 | | // bn_power5 squares |ap| five times and multiplies it by the value stored at |
146 | | // index |power| of |table|, modulo |np|. It stores the result in |rp|. The |
147 | | // values are |num| words long and represented in Montgomery form. |n0| is a |
148 | | // pointer to the corresponding field in |BN_MONT_CTX|. |num| must be divisible |
149 | | // by 8. |power| must be less than 32 and is treated as secret. |
150 | | // |
151 | | // WARNING: This function implements Almost Montgomery Multiplication from |
152 | | // https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced. |
153 | | // However, even if they are fully reduced, the output may not be. |
154 | | static void bn_power5(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table, |
155 | | const BN_ULONG *np, const BN_ULONG *n0, int num, |
156 | 4.77k | int power) { |
157 | 4.77k | assert(bn_power5_capable(num)); |
158 | 4.77k | if (bn_powerx5_capable(num)) { |
159 | 4.77k | bn_powerx5(rp, ap, table, np, n0, num, power); |
160 | 4.77k | } else { |
161 | 0 | bn_power5_nohw(rp, ap, table, np, n0, num, power); |
162 | 0 | } |
163 | 4.77k | } |
164 | | |
165 | | #endif // defined(OPENSSL_BN_ASM_MONT5) |
166 | | |
167 | | int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) { |
168 | | int i, bits, ret = 0; |
169 | | BIGNUM *v, *rr; |
170 | | |
171 | | BN_CTX_start(ctx); |
172 | | if (r == a || r == p) { |
173 | | rr = BN_CTX_get(ctx); |
174 | | } else { |
175 | | rr = r; |
176 | | } |
177 | | |
178 | | v = BN_CTX_get(ctx); |
179 | | if (rr == NULL || v == NULL) { |
180 | | goto err; |
181 | | } |
182 | | |
183 | | if (BN_copy(v, a) == NULL) { |
184 | | goto err; |
185 | | } |
186 | | bits = BN_num_bits(p); |
187 | | |
188 | | if (BN_is_odd(p)) { |
189 | | if (BN_copy(rr, a) == NULL) { |
190 | | goto err; |
191 | | } |
192 | | } else { |
193 | | if (!BN_one(rr)) { |
194 | | goto err; |
195 | | } |
196 | | } |
197 | | |
198 | | for (i = 1; i < bits; i++) { |
199 | | if (!BN_sqr(v, v, ctx)) { |
200 | | goto err; |
201 | | } |
202 | | if (BN_is_bit_set(p, i)) { |
203 | | if (!BN_mul(rr, rr, v, ctx)) { |
204 | | goto err; |
205 | | } |
206 | | } |
207 | | } |
208 | | |
209 | | if (r != rr && !BN_copy(r, rr)) { |
210 | | goto err; |
211 | | } |
212 | | ret = 1; |
213 | | |
214 | | err: |
215 | | BN_CTX_end(ctx); |
216 | | return ret; |
217 | | } |
218 | | |
219 | | typedef struct bn_recp_ctx_st { |
220 | | BIGNUM N; // the divisor |
221 | | BIGNUM Nr; // the reciprocal |
222 | | int num_bits; |
223 | | int shift; |
224 | | int flags; |
225 | | } BN_RECP_CTX; |
226 | | |
227 | 193 | static void BN_RECP_CTX_init(BN_RECP_CTX *recp) { |
228 | 193 | BN_init(&recp->N); |
229 | 193 | BN_init(&recp->Nr); |
230 | 193 | recp->num_bits = 0; |
231 | 193 | recp->shift = 0; |
232 | 193 | recp->flags = 0; |
233 | 193 | } |
234 | | |
235 | 193 | static void BN_RECP_CTX_free(BN_RECP_CTX *recp) { |
236 | 193 | if (recp == NULL) { |
237 | 0 | return; |
238 | 0 | } |
239 | | |
240 | 193 | BN_free(&recp->N); |
241 | 193 | BN_free(&recp->Nr); |
242 | 193 | } |
243 | | |
244 | 193 | static int BN_RECP_CTX_set(BN_RECP_CTX *recp, const BIGNUM *d, BN_CTX *ctx) { |
245 | 193 | if (!BN_copy(&(recp->N), d)) { |
246 | 0 | return 0; |
247 | 0 | } |
248 | 193 | BN_zero(&recp->Nr); |
249 | 193 | recp->num_bits = BN_num_bits(d); |
250 | 193 | recp->shift = 0; |
251 | | |
252 | 193 | return 1; |
253 | 193 | } |
254 | | |
255 | | // len is the expected size of the result We actually calculate with an extra |
256 | | // word of precision, so we can do faster division if the remainder is not |
257 | | // required. |
258 | | // r := 2^len / m |
259 | 191 | static int BN_reciprocal(BIGNUM *r, const BIGNUM *m, int len, BN_CTX *ctx) { |
260 | 191 | int ret = -1; |
261 | 191 | BIGNUM *t; |
262 | | |
263 | 191 | BN_CTX_start(ctx); |
264 | 191 | t = BN_CTX_get(ctx); |
265 | 191 | if (t == NULL) { |
266 | 0 | goto err; |
267 | 0 | } |
268 | | |
269 | 191 | if (!BN_set_bit(t, len)) { |
270 | 0 | goto err; |
271 | 0 | } |
272 | | |
273 | 191 | if (!BN_div(r, NULL, t, m, ctx)) { |
274 | 0 | goto err; |
275 | 0 | } |
276 | | |
277 | 191 | ret = len; |
278 | | |
279 | 191 | err: |
280 | 191 | BN_CTX_end(ctx); |
281 | 191 | return ret; |
282 | 191 | } |
283 | | |
284 | | static int BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, |
285 | 345k | BN_RECP_CTX *recp, BN_CTX *ctx) { |
286 | 345k | int i, j, ret = 0; |
287 | 345k | BIGNUM *a, *b, *d, *r; |
288 | | |
289 | 345k | BN_CTX_start(ctx); |
290 | 345k | a = BN_CTX_get(ctx); |
291 | 345k | b = BN_CTX_get(ctx); |
292 | 345k | if (dv != NULL) { |
293 | 0 | d = dv; |
294 | 345k | } else { |
295 | 345k | d = BN_CTX_get(ctx); |
296 | 345k | } |
297 | | |
298 | 345k | if (rem != NULL) { |
299 | 345k | r = rem; |
300 | 345k | } else { |
301 | 0 | r = BN_CTX_get(ctx); |
302 | 0 | } |
303 | | |
304 | 345k | if (a == NULL || b == NULL || d == NULL || r == NULL) { |
305 | 0 | goto err; |
306 | 0 | } |
307 | | |
308 | 345k | if (BN_ucmp(m, &recp->N) < 0) { |
309 | 2.07k | BN_zero(d); |
310 | 2.07k | if (!BN_copy(r, m)) { |
311 | 0 | goto err; |
312 | 0 | } |
313 | 2.07k | BN_CTX_end(ctx); |
314 | 2.07k | return 1; |
315 | 2.07k | } |
316 | | |
317 | | // We want the remainder |
318 | | // Given input of ABCDEF / ab |
319 | | // we need multiply ABCDEF by 3 digests of the reciprocal of ab |
320 | | |
321 | | // i := max(BN_num_bits(m), 2*BN_num_bits(N)) |
322 | 343k | i = BN_num_bits(m); |
323 | 343k | j = recp->num_bits << 1; |
324 | 343k | if (j > i) { |
325 | 298k | i = j; |
326 | 298k | } |
327 | | |
328 | | // Nr := round(2^i / N) |
329 | 343k | if (i != recp->shift) { |
330 | 191 | recp->shift = |
331 | 191 | BN_reciprocal(&(recp->Nr), &(recp->N), i, |
332 | 191 | ctx); // BN_reciprocal returns i, or -1 for an error |
333 | 191 | } |
334 | | |
335 | 343k | if (recp->shift == -1) { |
336 | 0 | goto err; |
337 | 0 | } |
338 | | |
339 | | // d := |round(round(m / 2^BN_num_bits(N)) * recp->Nr / 2^(i - |
340 | | // BN_num_bits(N)))| |
341 | | // = |round(round(m / 2^BN_num_bits(N)) * round(2^i / N) / 2^(i - |
342 | | // BN_num_bits(N)))| |
343 | | // <= |(m / 2^BN_num_bits(N)) * (2^i / N) * (2^BN_num_bits(N) / 2^i)| |
344 | | // = |m/N| |
345 | 343k | if (!BN_rshift(a, m, recp->num_bits)) { |
346 | 0 | goto err; |
347 | 0 | } |
348 | 343k | if (!BN_mul(b, a, &(recp->Nr), ctx)) { |
349 | 0 | goto err; |
350 | 0 | } |
351 | 343k | if (!BN_rshift(d, b, i - recp->num_bits)) { |
352 | 0 | goto err; |
353 | 0 | } |
354 | 343k | d->neg = 0; |
355 | | |
356 | 343k | if (!BN_mul(b, &(recp->N), d, ctx)) { |
357 | 0 | goto err; |
358 | 0 | } |
359 | 343k | if (!BN_usub(r, m, b)) { |
360 | 0 | goto err; |
361 | 0 | } |
362 | 343k | r->neg = 0; |
363 | | |
364 | 343k | j = 0; |
365 | 601k | while (BN_ucmp(r, &(recp->N)) >= 0) { |
366 | 257k | if (j++ > 2) { |
367 | 0 | OPENSSL_PUT_ERROR(BN, BN_R_BAD_RECIPROCAL); |
368 | 0 | goto err; |
369 | 0 | } |
370 | 257k | if (!BN_usub(r, r, &(recp->N))) { |
371 | 0 | goto err; |
372 | 0 | } |
373 | 257k | if (!BN_add_word(d, 1)) { |
374 | 0 | goto err; |
375 | 0 | } |
376 | 257k | } |
377 | | |
378 | 343k | r->neg = BN_is_zero(r) ? 0 : m->neg; |
379 | 343k | d->neg = m->neg ^ recp->N.neg; |
380 | 343k | ret = 1; |
381 | | |
382 | 343k | err: |
383 | 343k | BN_CTX_end(ctx); |
384 | 343k | return ret; |
385 | 343k | } |
386 | | |
387 | | static int BN_mod_mul_reciprocal(BIGNUM *r, const BIGNUM *x, const BIGNUM *y, |
388 | 345k | BN_RECP_CTX *recp, BN_CTX *ctx) { |
389 | 345k | int ret = 0; |
390 | 345k | BIGNUM *a; |
391 | 345k | const BIGNUM *ca; |
392 | | |
393 | 345k | BN_CTX_start(ctx); |
394 | 345k | a = BN_CTX_get(ctx); |
395 | 345k | if (a == NULL) { |
396 | 0 | goto err; |
397 | 0 | } |
398 | | |
399 | 345k | if (y != NULL) { |
400 | 345k | if (x == y) { |
401 | 297k | if (!BN_sqr(a, x, ctx)) { |
402 | 0 | goto err; |
403 | 0 | } |
404 | 297k | } else { |
405 | 47.4k | if (!BN_mul(a, x, y, ctx)) { |
406 | 0 | goto err; |
407 | 0 | } |
408 | 47.4k | } |
409 | 345k | ca = a; |
410 | 345k | } else { |
411 | 0 | ca = x; // Just do the mod |
412 | 0 | } |
413 | | |
414 | 345k | ret = BN_div_recp(NULL, r, ca, recp, ctx); |
415 | | |
416 | 345k | err: |
417 | 345k | BN_CTX_end(ctx); |
418 | 345k | return ret; |
419 | 345k | } |
420 | | |
421 | | // BN_window_bits_for_exponent_size returns sliding window size for mod_exp with |
422 | | // a |b| bit exponent. |
423 | | // |
424 | | // For window size 'w' (w >= 2) and a random 'b' bits exponent, the number of |
425 | | // multiplications is a constant plus on average |
426 | | // |
427 | | // 2^(w-1) + (b-w)/(w+1); |
428 | | // |
429 | | // here 2^(w-1) is for precomputing the table (we actually need entries only |
430 | | // for windows that have the lowest bit set), and (b-w)/(w+1) is an |
431 | | // approximation for the expected number of w-bit windows, not counting the |
432 | | // first one. |
433 | | // |
434 | | // Thus we should use |
435 | | // |
436 | | // w >= 6 if b > 671 |
437 | | // w = 5 if 671 > b > 239 |
438 | | // w = 4 if 239 > b > 79 |
439 | | // w = 3 if 79 > b > 23 |
440 | | // w <= 2 if 23 > b |
441 | | // |
442 | | // (with draws in between). Very small exponents are often selected |
443 | | // with low Hamming weight, so we use w = 1 for b <= 23. |
444 | 380 | static int BN_window_bits_for_exponent_size(size_t b) { |
445 | 380 | if (b > 671) { |
446 | 122 | return 6; |
447 | 122 | } |
448 | 258 | if (b > 239) { |
449 | 79 | return 5; |
450 | 79 | } |
451 | 179 | if (b > 79) { |
452 | 116 | return 4; |
453 | 116 | } |
454 | 63 | if (b > 23) { |
455 | 37 | return 3; |
456 | 37 | } |
457 | 26 | return 1; |
458 | 63 | } |
459 | | |
460 | | // TABLE_SIZE is the maximum precomputation table size for *variable* sliding |
461 | | // windows. This must be 2^(max_window - 1), where max_window is the largest |
462 | | // value returned from |BN_window_bits_for_exponent_size|. |
463 | | #define TABLE_SIZE 32 |
464 | | |
465 | | // TABLE_BITS_SMALL is the smallest value returned from |
466 | | // |BN_window_bits_for_exponent_size| when |b| is at most |BN_BITS2| * |
467 | | // |BN_SMALL_MAX_WORDS| words. |
468 | 48 | #define TABLE_BITS_SMALL 5 |
469 | | |
470 | | // TABLE_SIZE_SMALL is the same as |TABLE_SIZE|, but when |b| is at most |
471 | | // |BN_BITS2| * |BN_SMALL_MAX_WORDS|. |
472 | | #define TABLE_SIZE_SMALL (1 << (TABLE_BITS_SMALL - 1)) |
473 | | |
474 | | static int mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, |
475 | 193 | const BIGNUM *m, BN_CTX *ctx) { |
476 | 193 | int i, j, ret = 0, wstart, window; |
477 | 193 | int start = 1; |
478 | 193 | BIGNUM *aa; |
479 | | // Table of variables obtained from 'ctx' |
480 | 193 | BIGNUM *val[TABLE_SIZE]; |
481 | 193 | BN_RECP_CTX recp; |
482 | | |
483 | | // This function is only called on even moduli. |
484 | 193 | assert(!BN_is_odd(m)); |
485 | | |
486 | 193 | int bits = BN_num_bits(p); |
487 | 193 | if (bits == 0) { |
488 | 0 | return BN_one(r); |
489 | 0 | } |
490 | | |
491 | 193 | BN_RECP_CTX_init(&recp); |
492 | 193 | BN_CTX_start(ctx); |
493 | 193 | aa = BN_CTX_get(ctx); |
494 | 193 | val[0] = BN_CTX_get(ctx); |
495 | 193 | if (!aa || !val[0]) { |
496 | 0 | goto err; |
497 | 0 | } |
498 | | |
499 | 193 | if (m->neg) { |
500 | | // ignore sign of 'm' |
501 | 0 | if (!BN_copy(aa, m)) { |
502 | 0 | goto err; |
503 | 0 | } |
504 | 0 | aa->neg = 0; |
505 | 0 | if (BN_RECP_CTX_set(&recp, aa, ctx) <= 0) { |
506 | 0 | goto err; |
507 | 0 | } |
508 | 193 | } else { |
509 | 193 | if (BN_RECP_CTX_set(&recp, m, ctx) <= 0) { |
510 | 0 | goto err; |
511 | 0 | } |
512 | 193 | } |
513 | | |
514 | 193 | if (!BN_nnmod(val[0], a, m, ctx)) { |
515 | 0 | goto err; // 1 |
516 | 0 | } |
517 | 193 | if (BN_is_zero(val[0])) { |
518 | 2 | BN_zero(r); |
519 | 2 | ret = 1; |
520 | 2 | goto err; |
521 | 2 | } |
522 | | |
523 | 191 | window = BN_window_bits_for_exponent_size(bits); |
524 | 191 | if (window > 1) { |
525 | 187 | if (!BN_mod_mul_reciprocal(aa, val[0], val[0], &recp, ctx)) { |
526 | 0 | goto err; // 2 |
527 | 0 | } |
528 | 187 | j = 1 << (window - 1); |
529 | 4.35k | for (i = 1; i < j; i++) { |
530 | 4.16k | if (((val[i] = BN_CTX_get(ctx)) == NULL) || |
531 | 4.16k | !BN_mod_mul_reciprocal(val[i], val[i - 1], aa, &recp, ctx)) { |
532 | 0 | goto err; |
533 | 0 | } |
534 | 4.16k | } |
535 | 187 | } |
536 | | |
537 | 191 | start = 1; // This is used to avoid multiplication etc |
538 | | // when there is only the value '1' in the |
539 | | // buffer. |
540 | 191 | wstart = bits - 1; // The top bit of the window |
541 | | |
542 | 191 | if (!BN_one(r)) { |
543 | 0 | goto err; |
544 | 0 | } |
545 | | |
546 | 127k | for (;;) { |
547 | 127k | int wvalue; // The 'value' of the window |
548 | 127k | int wend; // The bottom bit of the window |
549 | | |
550 | 127k | if (!BN_is_bit_set(p, wstart)) { |
551 | 84.4k | if (!start) { |
552 | 84.4k | if (!BN_mod_mul_reciprocal(r, r, r, &recp, ctx)) { |
553 | 0 | goto err; |
554 | 0 | } |
555 | 84.4k | } |
556 | 84.4k | if (wstart == 0) { |
557 | 142 | break; |
558 | 142 | } |
559 | 84.3k | wstart--; |
560 | 84.3k | continue; |
561 | 84.4k | } |
562 | | |
563 | | // We now have wstart on a 'set' bit, we now need to work out |
564 | | // how bit a window to do. To do this we need to scan |
565 | | // forward until the last set bit before the end of the |
566 | | // window |
567 | 43.3k | wvalue = 1; |
568 | 43.3k | wend = 0; |
569 | 256k | for (i = 1; i < window; i++) { |
570 | 213k | if (wstart - i < 0) { |
571 | 86 | break; |
572 | 86 | } |
573 | 213k | if (BN_is_bit_set(p, wstart - i)) { |
574 | 105k | wvalue <<= (i - wend); |
575 | 105k | wvalue |= 1; |
576 | 105k | wend = i; |
577 | 105k | } |
578 | 213k | } |
579 | | |
580 | | // wend is the size of the current window |
581 | 43.3k | j = wend + 1; |
582 | | // add the 'bytes above' |
583 | 43.3k | if (!start) { |
584 | 256k | for (i = 0; i < j; i++) { |
585 | 213k | if (!BN_mod_mul_reciprocal(r, r, r, &recp, ctx)) { |
586 | 0 | goto err; |
587 | 0 | } |
588 | 213k | } |
589 | 43.1k | } |
590 | | |
591 | | // wvalue will be an odd number < 2^window |
592 | 43.3k | if (!BN_mod_mul_reciprocal(r, r, val[wvalue >> 1], &recp, ctx)) { |
593 | 0 | goto err; |
594 | 0 | } |
595 | | |
596 | | // move the 'window' down further |
597 | 43.3k | wstart -= wend + 1; |
598 | 43.3k | start = 0; |
599 | 43.3k | if (wstart < 0) { |
600 | 49 | break; |
601 | 49 | } |
602 | 43.3k | } |
603 | 191 | ret = 1; |
604 | | |
605 | 193 | err: |
606 | 193 | BN_CTX_end(ctx); |
607 | 193 | BN_RECP_CTX_free(&recp); |
608 | 193 | return ret; |
609 | 191 | } |
610 | | |
611 | | int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m, |
612 | 275 | BN_CTX *ctx) { |
613 | 275 | if (m->neg) { |
614 | 0 | OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); |
615 | 0 | return 0; |
616 | 0 | } |
617 | 275 | if (a->neg || BN_ucmp(a, m) >= 0) { |
618 | 12 | if (!BN_nnmod(r, a, m, ctx)) { |
619 | 0 | return 0; |
620 | 0 | } |
621 | 12 | a = r; |
622 | 12 | } |
623 | | |
624 | 275 | if (BN_is_odd(m)) { |
625 | 82 | return BN_mod_exp_mont(r, a, p, m, ctx, NULL); |
626 | 82 | } |
627 | | |
628 | 193 | return mod_exp_recp(r, a, p, m, ctx); |
629 | 275 | } |
630 | | |
631 | | int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, |
632 | 187 | const BIGNUM *m, BN_CTX *ctx, const BN_MONT_CTX *mont) { |
633 | 187 | if (!BN_is_odd(m)) { |
634 | 40 | OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS); |
635 | 40 | return 0; |
636 | 40 | } |
637 | 147 | if (m->neg) { |
638 | 0 | OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); |
639 | 0 | return 0; |
640 | 0 | } |
641 | | // |a| is secret, but |a < m| is not. |
642 | 147 | if (a->neg || constant_time_declassify_int(BN_ucmp(a, m)) >= 0) { |
643 | 5 | OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED); |
644 | 5 | return 0; |
645 | 5 | } |
646 | | |
647 | 142 | int bits = BN_num_bits(p); |
648 | 142 | if (bits == 0) { |
649 | | // x**0 mod 1 is still zero. |
650 | 1 | if (BN_abs_is_word(m, 1)) { |
651 | 0 | BN_zero(rr); |
652 | 0 | return 1; |
653 | 0 | } |
654 | 1 | return BN_one(rr); |
655 | 1 | } |
656 | | |
657 | 141 | int ret = 0; |
658 | 141 | BIGNUM *val[TABLE_SIZE]; |
659 | 141 | BN_MONT_CTX *new_mont = NULL; |
660 | | |
661 | 141 | BN_CTX_start(ctx); |
662 | 141 | BIGNUM *r = BN_CTX_get(ctx); |
663 | 141 | val[0] = BN_CTX_get(ctx); |
664 | 141 | if (r == NULL || val[0] == NULL) { |
665 | 0 | goto err; |
666 | 0 | } |
667 | | |
668 | | // Allocate a montgomery context if it was not supplied by the caller. |
669 | 141 | if (mont == NULL) { |
670 | 134 | new_mont = BN_MONT_CTX_new_consttime(m, ctx); |
671 | 134 | if (new_mont == NULL) { |
672 | 0 | goto err; |
673 | 0 | } |
674 | 134 | mont = new_mont; |
675 | 134 | } |
676 | | |
677 | | // We exponentiate by looking at sliding windows of the exponent and |
678 | | // precomputing powers of |a|. Windows may be shifted so they always end on a |
679 | | // set bit, so only precompute odd powers. We compute val[i] = a^(2*i + 1) |
680 | | // for i = 0 to 2^(window-1), all in Montgomery form. |
681 | 141 | int window = BN_window_bits_for_exponent_size(bits); |
682 | 141 | if (!BN_to_montgomery(val[0], a, mont, ctx)) { |
683 | 0 | goto err; |
684 | 0 | } |
685 | 141 | if (window > 1) { |
686 | 119 | BIGNUM *d = BN_CTX_get(ctx); |
687 | 119 | if (d == NULL || |
688 | 119 | !BN_mod_mul_montgomery(d, val[0], val[0], mont, ctx)) { |
689 | 0 | goto err; |
690 | 0 | } |
691 | 1.12k | for (int i = 1; i < 1 << (window - 1); i++) { |
692 | 1.00k | val[i] = BN_CTX_get(ctx); |
693 | 1.00k | if (val[i] == NULL || |
694 | 1.00k | !BN_mod_mul_montgomery(val[i], val[i - 1], d, mont, ctx)) { |
695 | 0 | goto err; |
696 | 0 | } |
697 | 1.00k | } |
698 | 119 | } |
699 | | |
700 | | // |p| is non-zero, so at least one window is non-zero. To save some |
701 | | // multiplications, defer initializing |r| until then. |
702 | 141 | int r_is_one = 1; |
703 | 141 | int wstart = bits - 1; // The top bit of the window. |
704 | 16.1k | for (;;) { |
705 | 16.1k | if (!BN_is_bit_set(p, wstart)) { |
706 | 10.3k | if (!r_is_one && !BN_mod_mul_montgomery(r, r, r, mont, ctx)) { |
707 | 0 | goto err; |
708 | 0 | } |
709 | 10.3k | if (wstart == 0) { |
710 | 25 | break; |
711 | 25 | } |
712 | 10.3k | wstart--; |
713 | 10.3k | continue; |
714 | 10.3k | } |
715 | | |
716 | | // We now have wstart on a set bit. Find the largest window we can use. |
717 | 5.78k | int wvalue = 1; |
718 | 5.78k | int wsize = 0; |
719 | 26.7k | for (int i = 1; i < window && i <= wstart; i++) { |
720 | 20.9k | if (BN_is_bit_set(p, wstart - i)) { |
721 | 11.5k | wvalue <<= (i - wsize); |
722 | 11.5k | wvalue |= 1; |
723 | 11.5k | wsize = i; |
724 | 11.5k | } |
725 | 20.9k | } |
726 | | |
727 | | // Shift |r| to the end of the window. |
728 | 5.78k | if (!r_is_one) { |
729 | 27.1k | for (int i = 0; i < wsize + 1; i++) { |
730 | 21.5k | if (!BN_mod_mul_montgomery(r, r, r, mont, ctx)) { |
731 | 0 | goto err; |
732 | 0 | } |
733 | 21.5k | } |
734 | 5.64k | } |
735 | | |
736 | 5.78k | assert(wvalue & 1); |
737 | 5.78k | assert(wvalue < (1 << window)); |
738 | 5.78k | if (r_is_one) { |
739 | 141 | if (!BN_copy(r, val[wvalue >> 1])) { |
740 | 0 | goto err; |
741 | 0 | } |
742 | 5.64k | } else if (!BN_mod_mul_montgomery(r, r, val[wvalue >> 1], mont, ctx)) { |
743 | 0 | goto err; |
744 | 0 | } |
745 | | |
746 | 5.78k | r_is_one = 0; |
747 | 5.78k | if (wstart == wsize) { |
748 | 116 | break; |
749 | 116 | } |
750 | 5.66k | wstart -= wsize + 1; |
751 | 5.66k | } |
752 | | |
753 | | // |p| is non-zero, so |r_is_one| must be cleared at some point. |
754 | 141 | assert(!r_is_one); |
755 | | |
756 | 141 | if (!BN_from_montgomery(rr, r, mont, ctx)) { |
757 | 0 | goto err; |
758 | 0 | } |
759 | 141 | ret = 1; |
760 | | |
761 | 141 | err: |
762 | 141 | BN_MONT_CTX_free(new_mont); |
763 | 141 | BN_CTX_end(ctx); |
764 | 141 | return ret; |
765 | 141 | } |
766 | | |
767 | | void bn_mod_exp_mont_small(BN_ULONG *r, const BN_ULONG *a, size_t num, |
768 | | const BN_ULONG *p, size_t num_p, |
769 | 48 | const BN_MONT_CTX *mont) { |
770 | 48 | if (num != (size_t)mont->N.width || num > BN_SMALL_MAX_WORDS || |
771 | 48 | num_p > SIZE_MAX / BN_BITS2) { |
772 | 0 | abort(); |
773 | 0 | } |
774 | 48 | assert(BN_is_odd(&mont->N)); |
775 | | |
776 | | // Count the number of bits in |p|, skipping leading zeros. Note this function |
777 | | // treats |p| as public. |
778 | 48 | while (num_p != 0 && p[num_p - 1] == 0) { |
779 | 0 | num_p--; |
780 | 0 | } |
781 | 48 | if (num_p == 0) { |
782 | 0 | bn_from_montgomery_small(r, num, mont->RR.d, num, mont); |
783 | 0 | return; |
784 | 0 | } |
785 | 48 | size_t bits = BN_num_bits_word(p[num_p - 1]) + (num_p - 1) * BN_BITS2; |
786 | 48 | assert(bits != 0); |
787 | | |
788 | | // We exponentiate by looking at sliding windows of the exponent and |
789 | | // precomputing powers of |a|. Windows may be shifted so they always end on a |
790 | | // set bit, so only precompute odd powers. We compute val[i] = a^(2*i + 1) for |
791 | | // i = 0 to 2^(window-1), all in Montgomery form. |
792 | 48 | unsigned window = BN_window_bits_for_exponent_size(bits); |
793 | 48 | if (window > TABLE_BITS_SMALL) { |
794 | 0 | window = TABLE_BITS_SMALL; // Tolerate excessively large |p|. |
795 | 0 | } |
796 | 48 | BN_ULONG val[TABLE_SIZE_SMALL][BN_SMALL_MAX_WORDS]; |
797 | 48 | OPENSSL_memcpy(val[0], a, num * sizeof(BN_ULONG)); |
798 | 48 | if (window > 1) { |
799 | 48 | BN_ULONG d[BN_SMALL_MAX_WORDS]; |
800 | 48 | bn_mod_mul_montgomery_small(d, val[0], val[0], num, mont); |
801 | 768 | for (unsigned i = 1; i < 1u << (window - 1); i++) { |
802 | 720 | bn_mod_mul_montgomery_small(val[i], val[i - 1], d, num, mont); |
803 | 720 | } |
804 | 48 | } |
805 | | |
806 | | // |p| is non-zero, so at least one window is non-zero. To save some |
807 | | // multiplications, defer initializing |r| until then. |
808 | 48 | int r_is_one = 1; |
809 | 48 | size_t wstart = bits - 1; // The top bit of the window. |
810 | 5.85k | for (;;) { |
811 | 5.85k | if (!bn_is_bit_set_words(p, num_p, wstart)) { |
812 | 1.86k | if (!r_is_one) { |
813 | 1.86k | bn_mod_mul_montgomery_small(r, r, r, num, mont); |
814 | 1.86k | } |
815 | 1.86k | if (wstart == 0) { |
816 | 0 | break; |
817 | 0 | } |
818 | 1.86k | wstart--; |
819 | 1.86k | continue; |
820 | 1.86k | } |
821 | | |
822 | | // We now have wstart on a set bit. Find the largest window we can use. |
823 | 3.99k | unsigned wvalue = 1; |
824 | 3.99k | unsigned wsize = 0; |
825 | 19.7k | for (unsigned i = 1; i < window && i <= wstart; i++) { |
826 | 15.7k | if (bn_is_bit_set_words(p, num_p, wstart - i)) { |
827 | 15.4k | wvalue <<= (i - wsize); |
828 | 15.4k | wvalue |= 1; |
829 | 15.4k | wsize = i; |
830 | 15.4k | } |
831 | 15.7k | } |
832 | | |
833 | | // Shift |r| to the end of the window. |
834 | 3.99k | if (!r_is_one) { |
835 | 23.2k | for (unsigned i = 0; i < wsize + 1; i++) { |
836 | 19.3k | bn_mod_mul_montgomery_small(r, r, r, num, mont); |
837 | 19.3k | } |
838 | 3.94k | } |
839 | | |
840 | 3.99k | assert(wvalue & 1); |
841 | 3.99k | assert(wvalue < (1u << window)); |
842 | 3.99k | if (r_is_one) { |
843 | 48 | OPENSSL_memcpy(r, val[wvalue >> 1], num * sizeof(BN_ULONG)); |
844 | 3.94k | } else { |
845 | 3.94k | bn_mod_mul_montgomery_small(r, r, val[wvalue >> 1], num, mont); |
846 | 3.94k | } |
847 | 3.99k | r_is_one = 0; |
848 | 3.99k | if (wstart == wsize) { |
849 | 48 | break; |
850 | 48 | } |
851 | 3.94k | wstart -= wsize + 1; |
852 | 3.94k | } |
853 | | |
854 | | // |p| is non-zero, so |r_is_one| must be cleared at some point. |
855 | 48 | assert(!r_is_one); |
856 | 48 | OPENSSL_cleanse(val, sizeof(val)); |
857 | 48 | } |
858 | | |
859 | | void bn_mod_inverse0_prime_mont_small(BN_ULONG *r, const BN_ULONG *a, |
860 | 48 | size_t num, const BN_MONT_CTX *mont) { |
861 | 48 | if (num != (size_t)mont->N.width || num > BN_SMALL_MAX_WORDS) { |
862 | 0 | abort(); |
863 | 0 | } |
864 | | |
865 | | // Per Fermat's Little Theorem, a^-1 = a^(p-2) (mod p) for p prime. |
866 | 48 | BN_ULONG p_minus_two[BN_SMALL_MAX_WORDS]; |
867 | 48 | const BN_ULONG *p = mont->N.d; |
868 | 48 | OPENSSL_memcpy(p_minus_two, p, num * sizeof(BN_ULONG)); |
869 | 48 | if (p_minus_two[0] >= 2) { |
870 | 48 | p_minus_two[0] -= 2; |
871 | 48 | } else { |
872 | 0 | p_minus_two[0] -= 2; |
873 | 0 | for (size_t i = 1; i < num; i++) { |
874 | 0 | if (p_minus_two[i]-- != 0) { |
875 | 0 | break; |
876 | 0 | } |
877 | 0 | } |
878 | 0 | } |
879 | | |
880 | 48 | bn_mod_exp_mont_small(r, a, num, p_minus_two, num, mont); |
881 | 48 | } |
882 | | |
883 | | static void copy_to_prebuf(const BIGNUM *b, int top, BN_ULONG *table, int idx, |
884 | 94.7k | int window) { |
885 | 94.7k | int ret = bn_copy_words(table + idx * top, top, b); |
886 | 94.7k | assert(ret); // |b| is guaranteed to fit. |
887 | 94.7k | (void)ret; |
888 | 94.7k | } |
889 | | |
890 | | static int copy_from_prebuf(BIGNUM *b, int top, const BN_ULONG *table, int idx, |
891 | 283k | int window) { |
892 | 283k | if (!bn_wexpand(b, top)) { |
893 | 0 | return 0; |
894 | 0 | } |
895 | | |
896 | 283k | OPENSSL_memset(b->d, 0, sizeof(BN_ULONG) * top); |
897 | 283k | const int width = 1 << window; |
898 | 11.2M | for (int i = 0; i < width; i++, table += top) { |
899 | | // Use a value barrier to prevent Clang from adding a branch when |i != idx| |
900 | | // and making this copy not constant time. Clang is still allowed to learn |
901 | | // that |mask| is constant across the inner loop, so this won't inhibit any |
902 | | // vectorization it might do. |
903 | 10.9M | BN_ULONG mask = value_barrier_w(constant_time_eq_int(i, idx)); |
904 | 231M | for (int j = 0; j < top; j++) { |
905 | 220M | b->d[j] |= table[j] & mask; |
906 | 220M | } |
907 | 10.9M | } |
908 | | |
909 | 283k | b->width = top; |
910 | 283k | return 1; |
911 | 283k | } |
912 | | |
913 | | // Window sizes optimized for fixed window size modular exponentiation |
914 | | // algorithm (BN_mod_exp_mont_consttime). |
915 | | // |
916 | | // TODO(davidben): These window sizes were originally set for 64-byte cache |
917 | | // lines with a cache-line-dependent constant-time mitigation. They can probably |
918 | | // be revised now that our implementation is no longer cache-time-dependent. |
919 | | #define BN_window_bits_for_ctime_exponent_size(b) \ |
920 | 4.11k | ((b) > 937 ? 6 : (b) > 306 ? 5 : (b) > 89 ? 4 : (b) > 22 ? 3 : 1) |
921 | | #define BN_MAX_MOD_EXP_CTIME_WINDOW (6) |
922 | | |
923 | | // This variant of |BN_mod_exp_mont| uses fixed windows and fixed memory access |
924 | | // patterns to protect secret exponents (cf. the hyper-threading timing attacks |
925 | | // pointed out by Colin Percival, |
926 | | // http://www.daemonology.net/hyperthreading-considered-harmful/) |
927 | | int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, |
928 | | const BIGNUM *m, BN_CTX *ctx, |
929 | 4.24k | const BN_MONT_CTX *mont) { |
930 | 4.24k | int i, ret = 0, wvalue; |
931 | 4.24k | BN_MONT_CTX *new_mont = NULL; |
932 | | |
933 | 4.24k | unsigned char *powerbuf_free = NULL; |
934 | 4.24k | size_t powerbuf_len = 0; |
935 | 4.24k | BN_ULONG *powerbuf = NULL; |
936 | | |
937 | 4.24k | if (!BN_is_odd(m)) { |
938 | 126 | OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS); |
939 | 126 | return 0; |
940 | 126 | } |
941 | 4.11k | if (m->neg) { |
942 | 0 | OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); |
943 | 0 | return 0; |
944 | 0 | } |
945 | | // |a| is secret, but it is required to be in range, so these comparisons may |
946 | | // be leaked. |
947 | 4.11k | if (a->neg || constant_time_declassify_int(BN_ucmp(a, m) >= 0)) { |
948 | 6 | OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED); |
949 | 6 | return 0; |
950 | 6 | } |
951 | | |
952 | | // Use all bits stored in |p|, rather than |BN_num_bits|, so we do not leak |
953 | | // whether the top bits are zero. |
954 | 4.11k | int max_bits = p->width * BN_BITS2; |
955 | 4.11k | int bits = max_bits; |
956 | 4.11k | if (bits == 0) { |
957 | | // x**0 mod 1 is still zero. |
958 | 0 | if (BN_abs_is_word(m, 1)) { |
959 | 0 | BN_zero(rr); |
960 | 0 | return 1; |
961 | 0 | } |
962 | 0 | return BN_one(rr); |
963 | 0 | } |
964 | | |
965 | | // Allocate a montgomery context if it was not supplied by the caller. |
966 | 4.11k | if (mont == NULL) { |
967 | 116 | new_mont = BN_MONT_CTX_new_consttime(m, ctx); |
968 | 116 | if (new_mont == NULL) { |
969 | 0 | goto err; |
970 | 0 | } |
971 | 116 | mont = new_mont; |
972 | 116 | } |
973 | | |
974 | | // Use the width in |mont->N|, rather than the copy in |m|. The assembly |
975 | | // implementation assumes it can use |top| to size R. |
976 | 4.11k | int top = mont->N.width; |
977 | | |
978 | | #if defined(OPENSSL_BN_ASM_MONT5) || defined(RSAZ_ENABLED) |
979 | | // Share one large stack-allocated buffer between the RSAZ and non-RSAZ code |
980 | | // paths. If we were to use separate static buffers for each then there is |
981 | | // some chance that both large buffers would be allocated on the stack, |
982 | | // causing the stack space requirement to be truly huge (~10KB). |
983 | | alignas(MOD_EXP_CTIME_ALIGN) BN_ULONG storage[MOD_EXP_CTIME_STORAGE_LEN]; |
984 | | #endif |
985 | | #if defined(RSAZ_ENABLED) |
986 | | // If the size of the operands allow it, perform the optimized RSAZ |
987 | | // exponentiation. For further information see crypto/fipsmodule/bn/rsaz_exp.c |
988 | | // and accompanying assembly modules. |
989 | 1.01k | if (a->width == 16 && p->width == 16 && BN_num_bits(m) == 1024 && |
990 | 1.01k | rsaz_avx2_preferred()) { |
991 | 0 | if (!bn_wexpand(rr, 16)) { |
992 | 0 | goto err; |
993 | 0 | } |
994 | 0 | RSAZ_1024_mod_exp_avx2(rr->d, a->d, p->d, m->d, mont->RR.d, mont->n0[0], |
995 | 0 | storage); |
996 | 0 | rr->width = 16; |
997 | 0 | rr->neg = 0; |
998 | 0 | ret = 1; |
999 | 0 | goto err; |
1000 | 0 | } |
1001 | 1.01k | #endif |
1002 | | |
1003 | | // Get the window size to use with size of p. |
1004 | 4.11k | int window = BN_window_bits_for_ctime_exponent_size(bits); |
1005 | 1.01k | assert(window <= BN_MAX_MOD_EXP_CTIME_WINDOW); |
1006 | | |
1007 | | // Calculating |powerbuf_len| below cannot overflow because of the bound on |
1008 | | // Montgomery reduction. |
1009 | 4.11k | assert((size_t)top <= BN_MONTGOMERY_MAX_WORDS); |
1010 | 4.11k | static_assert( |
1011 | 4.11k | BN_MONTGOMERY_MAX_WORDS <= |
1012 | 4.11k | INT_MAX / sizeof(BN_ULONG) / ((1 << BN_MAX_MOD_EXP_CTIME_WINDOW) + 3), |
1013 | 4.11k | "powerbuf_len may overflow"); |
1014 | | |
1015 | | #if defined(OPENSSL_BN_ASM_MONT5) |
1016 | 1.01k | if (window >= 5) { |
1017 | 211 | window = 5; // ~5% improvement for RSA2048 sign, and even for RSA4096 |
1018 | | // Reserve space for the |mont->N| copy. |
1019 | 211 | powerbuf_len += top * sizeof(mont->N.d[0]); |
1020 | 211 | } |
1021 | | #endif |
1022 | | |
1023 | | // Allocate a buffer large enough to hold all of the pre-computed |
1024 | | // powers of |am|, |am| itself, and |tmp|. |
1025 | 4.11k | int num_powers = 1 << window; |
1026 | 4.11k | powerbuf_len += sizeof(m->d[0]) * top * (num_powers + 2); |
1027 | | |
1028 | | #if defined(OPENSSL_BN_ASM_MONT5) |
1029 | 1.01k | if (powerbuf_len <= sizeof(storage)) { |
1030 | 942 | powerbuf = storage; |
1031 | 942 | } |
1032 | | // |storage| is more than large enough to handle 1024-bit inputs. |
1033 | | assert(powerbuf != NULL || top * BN_BITS2 > 1024); |
1034 | 1.01k | #endif |
1035 | 4.11k | if (powerbuf == NULL) { |
1036 | 3.17k | powerbuf_free = OPENSSL_malloc(powerbuf_len + MOD_EXP_CTIME_ALIGN); |
1037 | 3.17k | if (powerbuf_free == NULL) { |
1038 | 0 | goto err; |
1039 | 0 | } |
1040 | 3.17k | powerbuf = align_pointer(powerbuf_free, MOD_EXP_CTIME_ALIGN); |
1041 | 3.17k | } |
1042 | 4.11k | OPENSSL_memset(powerbuf, 0, powerbuf_len); |
1043 | | |
1044 | | // Place |tmp| and |am| right after powers table. |
1045 | 4.11k | BIGNUM tmp, am; |
1046 | 4.11k | tmp.d = powerbuf + top * num_powers; |
1047 | 4.11k | am.d = tmp.d + top; |
1048 | 4.11k | tmp.width = am.width = 0; |
1049 | 4.11k | tmp.dmax = am.dmax = top; |
1050 | 4.11k | tmp.neg = am.neg = 0; |
1051 | 4.11k | tmp.flags = am.flags = BN_FLG_STATIC_DATA; |
1052 | | |
1053 | 4.11k | if (!bn_one_to_montgomery(&tmp, mont, ctx) || |
1054 | 4.11k | !bn_resize_words(&tmp, top)) { |
1055 | 0 | goto err; |
1056 | 0 | } |
1057 | | |
1058 | | // Prepare a^1 in the Montgomery domain. |
1059 | 4.11k | assert(!a->neg); |
1060 | 4.11k | declassify_assert(BN_ucmp(a, m) < 0); |
1061 | 4.11k | if (!BN_to_montgomery(&am, a, mont, ctx) || |
1062 | 4.11k | !bn_resize_words(&am, top)) { |
1063 | 0 | goto err; |
1064 | 0 | } |
1065 | | |
1066 | | #if defined(OPENSSL_BN_ASM_MONT5) |
1067 | | // This optimization uses ideas from https://eprint.iacr.org/2011/239, |
1068 | | // specifically optimization of cache-timing attack countermeasures, |
1069 | | // pre-computation optimization, and Almost Montgomery Multiplication. |
1070 | | // |
1071 | | // The paper discusses a 4-bit window to optimize 512-bit modular |
1072 | | // exponentiation, used in RSA-1024 with CRT, but RSA-1024 is no longer |
1073 | | // important. |
1074 | | // |
1075 | | // |bn_mul_mont_gather5| and |bn_power5| implement the "almost" reduction |
1076 | | // variant, so the values here may not be fully reduced. They are bounded by R |
1077 | | // (i.e. they fit in |top| words), not |m|. Additionally, we pass these |
1078 | | // "almost" reduced inputs into |bn_mul_mont|, which implements the normal |
1079 | | // reduction variant. Given those inputs, |bn_mul_mont| may not give reduced |
1080 | | // output, but it will still produce "almost" reduced output. |
1081 | | // |
1082 | | // TODO(davidben): Using "almost" reduction complicates analysis of this code, |
1083 | | // and its interaction with other parts of the project. Determine whether this |
1084 | | // is actually necessary for performance. |
1085 | 1.01k | if (window == 5 && top > 1) { |
1086 | | // Copy |mont->N| to improve cache locality. |
1087 | 211 | BN_ULONG *np = am.d + top; |
1088 | 6.57k | for (i = 0; i < top; i++) { |
1089 | 6.36k | np[i] = mont->N.d[i]; |
1090 | 6.36k | } |
1091 | | |
1092 | | // Fill |powerbuf| with the first 32 powers of |am|. |
1093 | | const BN_ULONG *n0 = mont->n0; |
1094 | | bn_scatter5(tmp.d, top, powerbuf, 0); |
1095 | | bn_scatter5(am.d, am.width, powerbuf, 1); |
1096 | | bn_mul_mont(tmp.d, am.d, am.d, np, n0, top); |
1097 | | bn_scatter5(tmp.d, top, powerbuf, 2); |
1098 | | |
1099 | | // Square to compute powers of two. |
1100 | 844 | for (i = 4; i < 32; i *= 2) { |
1101 | 633 | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); |
1102 | 633 | bn_scatter5(tmp.d, top, powerbuf, i); |
1103 | 633 | } |
1104 | | // Compute odd powers |i| based on |i - 1|, then all powers |i * 2^j|. |
1105 | 3.37k | for (i = 3; i < 32; i += 2) { |
1106 | 3.16k | bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1); |
1107 | 3.16k | bn_scatter5(tmp.d, top, powerbuf, i); |
1108 | 5.48k | for (int j = 2 * i; j < 32; j *= 2) { |
1109 | 2.32k | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); |
1110 | 2.32k | bn_scatter5(tmp.d, top, powerbuf, j); |
1111 | 2.32k | } |
1112 | 3.16k | } |
1113 | | |
1114 | | bits--; |
1115 | 1.03k | for (wvalue = 0, i = bits % 5; i >= 0; i--, bits--) { |
1116 | 819 | wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); |
1117 | 819 | } |
1118 | | bn_gather5(tmp.d, top, powerbuf, wvalue); |
1119 | | |
1120 | | // At this point |bits| is 4 mod 5 and at least -1. (|bits| is the first bit |
1121 | | // that has not been read yet.) |
1122 | | assert(bits >= -1 && (bits == -1 || bits % 5 == 4)); |
1123 | | |
1124 | | // Scan the exponent one window at a time starting from the most |
1125 | | // significant bits. |
1126 | 211 | if (!bn_power5_capable(top)) { |
1127 | 33.2k | while (bits >= 0) { |
1128 | 198k | for (wvalue = 0, i = 0; i < 5; i++, bits--) { |
1129 | 165k | wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); |
1130 | 165k | } |
1131 | | |
1132 | 33.0k | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); |
1133 | 33.0k | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); |
1134 | 33.0k | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); |
1135 | 33.0k | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); |
1136 | 33.0k | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); |
1137 | 33.0k | bn_mul_mont_gather5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue); |
1138 | 33.0k | } |
1139 | 189 | } else { |
1140 | 22 | const uint8_t *p_bytes = (const uint8_t *)p->d; |
1141 | 22 | assert(bits < max_bits); |
1142 | | // |p = 0| has been handled as a special case, so |max_bits| is at least |
1143 | | // one word. |
1144 | 22 | assert(max_bits >= 64); |
1145 | | |
1146 | | // If the first bit to be read lands in the last byte, unroll the first |
1147 | | // iteration to avoid reading past the bounds of |p->d|. (After the first |
1148 | | // iteration, we are guaranteed to be past the last byte.) Note |bits| |
1149 | | // here is the top bit, inclusive. |
1150 | 22 | if (bits - 4 >= max_bits - 8) { |
1151 | | // Read five bits from |bits-4| through |bits|, inclusive. |
1152 | 21 | wvalue = p_bytes[p->width * BN_BYTES - 1]; |
1153 | 21 | wvalue >>= (bits - 4) & 7; |
1154 | 21 | wvalue &= 0x1f; |
1155 | 21 | bits -= 5; |
1156 | 21 | bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue); |
1157 | 21 | } |
1158 | 4.78k | while (bits >= 0) { |
1159 | | // Read five bits from |bits-4| through |bits|, inclusive. |
1160 | 4.75k | int first_bit = bits - 4; |
1161 | 4.75k | uint16_t val; |
1162 | 4.75k | OPENSSL_memcpy(&val, p_bytes + (first_bit >> 3), sizeof(val)); |
1163 | 4.75k | val >>= first_bit & 7; |
1164 | 4.75k | val &= 0x1f; |
1165 | 4.75k | bits -= 5; |
1166 | 4.75k | bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, val); |
1167 | 4.75k | } |
1168 | 22 | } |
1169 | | // The result is now in |tmp| in Montgomery form, but it may not be fully |
1170 | | // reduced. This is within bounds for |BN_from_montgomery| (tmp < R <= m*R) |
1171 | | // so it will, when converting from Montgomery form, produce a fully reduced |
1172 | | // result. |
1173 | | // |
1174 | | // This differs from Figure 2 of the paper, which uses AMM(h, 1) to convert |
1175 | | // from Montgomery form with unreduced output, followed by an extra |
1176 | | // reduction step. In the paper's terminology, we replace steps 9 and 10 |
1177 | | // with MM(h, 1). |
1178 | 211 | } else |
1179 | 804 | #endif |
1180 | 804 | { |
1181 | 804 | copy_to_prebuf(&tmp, top, powerbuf, 0, window); |
1182 | 804 | copy_to_prebuf(&am, top, powerbuf, 1, window); |
1183 | | |
1184 | | // If the window size is greater than 1, then calculate |
1185 | | // val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1) |
1186 | | // (even powers could instead be computed as (a^(i/2))^2 |
1187 | | // to use the slight performance advantage of sqr over mul). |
1188 | 3.90k | if (window > 1) { |
1189 | 3.90k | if (!BN_mod_mul_montgomery(&tmp, &am, &am, mont, ctx)) { |
1190 | 0 | goto err; |
1191 | 0 | } |
1192 | | |
1193 | 3.90k | copy_to_prebuf(&tmp, top, powerbuf, 2, window); |
1194 | | |
1195 | 86.9k | for (i = 3; i < num_powers; i++) { |
1196 | | // Calculate a^i = a^(i-1) * a |
1197 | 83.0k | if (!BN_mod_mul_montgomery(&tmp, &am, &tmp, mont, ctx)) { |
1198 | 0 | goto err; |
1199 | 0 | } |
1200 | | |
1201 | 83.0k | copy_to_prebuf(&tmp, top, powerbuf, i, window); |
1202 | 83.0k | } |
1203 | 3.90k | } |
1204 | | |
1205 | 3.90k | bits--; |
1206 | 18.7k | for (wvalue = 0, i = bits % window; i >= 0; i--, bits--) { |
1207 | 14.8k | wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); |
1208 | 14.8k | } |
1209 | 3.90k | if (!copy_from_prebuf(&tmp, top, powerbuf, wvalue, window)) { |
1210 | 0 | goto err; |
1211 | 0 | } |
1212 | | |
1213 | | // Scan the exponent one window at a time starting from the most |
1214 | | // significant bits. |
1215 | 283k | while (bits >= 0) { |
1216 | 279k | wvalue = 0; // The 'value' of the window |
1217 | | |
1218 | | // Scan the window, squaring the result as we go |
1219 | 1.66M | for (i = 0; i < window; i++, bits--) { |
1220 | 1.38M | if (!BN_mod_mul_montgomery(&tmp, &tmp, &tmp, mont, ctx)) { |
1221 | 0 | goto err; |
1222 | 0 | } |
1223 | 1.38M | wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); |
1224 | 1.38M | } |
1225 | | |
1226 | | // Fetch the appropriate pre-computed value from the pre-buf |
1227 | 279k | if (!copy_from_prebuf(&am, top, powerbuf, wvalue, window)) { |
1228 | 0 | goto err; |
1229 | 0 | } |
1230 | | |
1231 | | // Multiply the result into the intermediate result |
1232 | 279k | if (!BN_mod_mul_montgomery(&tmp, &tmp, &am, mont, ctx)) { |
1233 | 0 | goto err; |
1234 | 0 | } |
1235 | 279k | } |
1236 | 804 | } |
1237 | | |
1238 | | // Convert the final result from Montgomery to standard format. If we used the |
1239 | | // |OPENSSL_BN_ASM_MONT5| codepath, |tmp| may not be fully reduced. It is only |
1240 | | // bounded by R rather than |m|. However, that is still within bounds for |
1241 | | // |BN_from_montgomery|, which implements full Montgomery reduction, not |
1242 | | // "almost" Montgomery reduction. |
1243 | 4.11k | if (!BN_from_montgomery(rr, &tmp, mont, ctx)) { |
1244 | 0 | goto err; |
1245 | 0 | } |
1246 | 4.11k | ret = 1; |
1247 | | |
1248 | 4.11k | err: |
1249 | 4.11k | BN_MONT_CTX_free(new_mont); |
1250 | 4.11k | if (powerbuf != NULL && powerbuf_free == NULL) { |
1251 | 942 | OPENSSL_cleanse(powerbuf, powerbuf_len); |
1252 | 942 | } |
1253 | 4.11k | OPENSSL_free(powerbuf_free); |
1254 | 4.11k | return ret; |
1255 | 4.11k | } BN_mod_exp_mont_consttime Line | Count | Source | 929 | 3.16k | const BN_MONT_CTX *mont) { | 930 | 3.16k | int i, ret = 0, wvalue; | 931 | 3.16k | BN_MONT_CTX *new_mont = NULL; | 932 | | | 933 | 3.16k | unsigned char *powerbuf_free = NULL; | 934 | 3.16k | size_t powerbuf_len = 0; | 935 | 3.16k | BN_ULONG *powerbuf = NULL; | 936 | | | 937 | 3.16k | if (!BN_is_odd(m)) { | 938 | 62 | OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS); | 939 | 62 | return 0; | 940 | 62 | } | 941 | 3.09k | if (m->neg) { | 942 | 0 | OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); | 943 | 0 | return 0; | 944 | 0 | } | 945 | | // |a| is secret, but it is required to be in range, so these comparisons may | 946 | | // be leaked. | 947 | 3.09k | if (a->neg || constant_time_declassify_int(BN_ucmp(a, m) >= 0)) { | 948 | 2 | OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED); | 949 | 2 | return 0; | 950 | 2 | } | 951 | | | 952 | | // Use all bits stored in |p|, rather than |BN_num_bits|, so we do not leak | 953 | | // whether the top bits are zero. | 954 | 3.09k | int max_bits = p->width * BN_BITS2; | 955 | 3.09k | int bits = max_bits; | 956 | 3.09k | if (bits == 0) { | 957 | | // x**0 mod 1 is still zero. | 958 | 0 | if (BN_abs_is_word(m, 1)) { | 959 | 0 | BN_zero(rr); | 960 | 0 | return 1; | 961 | 0 | } | 962 | 0 | return BN_one(rr); | 963 | 0 | } | 964 | | | 965 | | // Allocate a montgomery context if it was not supplied by the caller. | 966 | 3.09k | if (mont == NULL) { | 967 | 59 | new_mont = BN_MONT_CTX_new_consttime(m, ctx); | 968 | 59 | if (new_mont == NULL) { | 969 | 0 | goto err; | 970 | 0 | } | 971 | 59 | mont = new_mont; | 972 | 59 | } | 973 | | | 974 | | // Use the width in |mont->N|, rather than the copy in |m|. The assembly | 975 | | // implementation assumes it can use |top| to size R. | 976 | 3.09k | int top = mont->N.width; | 977 | | | 978 | | #if defined(OPENSSL_BN_ASM_MONT5) || defined(RSAZ_ENABLED) | 979 | | // Share one large stack-allocated buffer between the RSAZ and non-RSAZ code | 980 | | // paths. If we were to use separate static buffers for each then there is | 981 | | // some chance that both large buffers would be allocated on the stack, | 982 | | // causing the stack space requirement to be truly huge (~10KB). | 983 | | alignas(MOD_EXP_CTIME_ALIGN) BN_ULONG storage[MOD_EXP_CTIME_STORAGE_LEN]; | 984 | | #endif | 985 | | #if defined(RSAZ_ENABLED) | 986 | | // If the size of the operands allow it, perform the optimized RSAZ | 987 | | // exponentiation. For further information see crypto/fipsmodule/bn/rsaz_exp.c | 988 | | // and accompanying assembly modules. | 989 | | if (a->width == 16 && p->width == 16 && BN_num_bits(m) == 1024 && | 990 | | rsaz_avx2_preferred()) { | 991 | | if (!bn_wexpand(rr, 16)) { | 992 | | goto err; | 993 | | } | 994 | | RSAZ_1024_mod_exp_avx2(rr->d, a->d, p->d, m->d, mont->RR.d, mont->n0[0], | 995 | | storage); | 996 | | rr->width = 16; | 997 | | rr->neg = 0; | 998 | | ret = 1; | 999 | | goto err; | 1000 | | } | 1001 | | #endif | 1002 | | | 1003 | | // Get the window size to use with size of p. | 1004 | 3.09k | int window = BN_window_bits_for_ctime_exponent_size(bits); | 1005 | 3.09k | assert(window <= BN_MAX_MOD_EXP_CTIME_WINDOW); | 1006 | | | 1007 | | // Calculating |powerbuf_len| below cannot overflow because of the bound on | 1008 | | // Montgomery reduction. | 1009 | 3.09k | assert((size_t)top <= BN_MONTGOMERY_MAX_WORDS); | 1010 | 3.09k | static_assert( | 1011 | 3.09k | BN_MONTGOMERY_MAX_WORDS <= | 1012 | 3.09k | INT_MAX / sizeof(BN_ULONG) / ((1 << BN_MAX_MOD_EXP_CTIME_WINDOW) + 3), | 1013 | 3.09k | "powerbuf_len may overflow"); | 1014 | | | 1015 | | #if defined(OPENSSL_BN_ASM_MONT5) | 1016 | | if (window >= 5) { | 1017 | | window = 5; // ~5% improvement for RSA2048 sign, and even for RSA4096 | 1018 | | // Reserve space for the |mont->N| copy. | 1019 | | powerbuf_len += top * sizeof(mont->N.d[0]); | 1020 | | } | 1021 | | #endif | 1022 | | | 1023 | | // Allocate a buffer large enough to hold all of the pre-computed | 1024 | | // powers of |am|, |am| itself, and |tmp|. | 1025 | 3.09k | int num_powers = 1 << window; | 1026 | 3.09k | powerbuf_len += sizeof(m->d[0]) * top * (num_powers + 2); | 1027 | | | 1028 | | #if defined(OPENSSL_BN_ASM_MONT5) | 1029 | | if (powerbuf_len <= sizeof(storage)) { | 1030 | | powerbuf = storage; | 1031 | | } | 1032 | | // |storage| is more than large enough to handle 1024-bit inputs. | 1033 | | assert(powerbuf != NULL || top * BN_BITS2 > 1024); | 1034 | | #endif | 1035 | 3.09k | if (powerbuf == NULL) { | 1036 | 3.09k | powerbuf_free = OPENSSL_malloc(powerbuf_len + MOD_EXP_CTIME_ALIGN); | 1037 | 3.09k | if (powerbuf_free == NULL) { | 1038 | 0 | goto err; | 1039 | 0 | } | 1040 | 3.09k | powerbuf = align_pointer(powerbuf_free, MOD_EXP_CTIME_ALIGN); | 1041 | 3.09k | } | 1042 | 3.09k | OPENSSL_memset(powerbuf, 0, powerbuf_len); | 1043 | | | 1044 | | // Place |tmp| and |am| right after powers table. | 1045 | 3.09k | BIGNUM tmp, am; | 1046 | 3.09k | tmp.d = powerbuf + top * num_powers; | 1047 | 3.09k | am.d = tmp.d + top; | 1048 | 3.09k | tmp.width = am.width = 0; | 1049 | 3.09k | tmp.dmax = am.dmax = top; | 1050 | 3.09k | tmp.neg = am.neg = 0; | 1051 | 3.09k | tmp.flags = am.flags = BN_FLG_STATIC_DATA; | 1052 | | | 1053 | 3.09k | if (!bn_one_to_montgomery(&tmp, mont, ctx) || | 1054 | 3.09k | !bn_resize_words(&tmp, top)) { | 1055 | 0 | goto err; | 1056 | 0 | } | 1057 | | | 1058 | | // Prepare a^1 in the Montgomery domain. | 1059 | 3.09k | assert(!a->neg); | 1060 | 3.09k | declassify_assert(BN_ucmp(a, m) < 0); | 1061 | 3.09k | if (!BN_to_montgomery(&am, a, mont, ctx) || | 1062 | 3.09k | !bn_resize_words(&am, top)) { | 1063 | 0 | goto err; | 1064 | 0 | } | 1065 | | | 1066 | | #if defined(OPENSSL_BN_ASM_MONT5) | 1067 | | // This optimization uses ideas from https://eprint.iacr.org/2011/239, | 1068 | | // specifically optimization of cache-timing attack countermeasures, | 1069 | | // pre-computation optimization, and Almost Montgomery Multiplication. | 1070 | | // | 1071 | | // The paper discusses a 4-bit window to optimize 512-bit modular | 1072 | | // exponentiation, used in RSA-1024 with CRT, but RSA-1024 is no longer | 1073 | | // important. | 1074 | | // | 1075 | | // |bn_mul_mont_gather5| and |bn_power5| implement the "almost" reduction | 1076 | | // variant, so the values here may not be fully reduced. They are bounded by R | 1077 | | // (i.e. they fit in |top| words), not |m|. Additionally, we pass these | 1078 | | // "almost" reduced inputs into |bn_mul_mont|, which implements the normal | 1079 | | // reduction variant. Given those inputs, |bn_mul_mont| may not give reduced | 1080 | | // output, but it will still produce "almost" reduced output. | 1081 | | // | 1082 | | // TODO(davidben): Using "almost" reduction complicates analysis of this code, | 1083 | | // and its interaction with other parts of the project. Determine whether this | 1084 | | // is actually necessary for performance. | 1085 | | if (window == 5 && top > 1) { | 1086 | | // Copy |mont->N| to improve cache locality. | 1087 | | BN_ULONG *np = am.d + top; | 1088 | | for (i = 0; i < top; i++) { | 1089 | | np[i] = mont->N.d[i]; | 1090 | | } | 1091 | | | 1092 | | // Fill |powerbuf| with the first 32 powers of |am|. | 1093 | | const BN_ULONG *n0 = mont->n0; | 1094 | | bn_scatter5(tmp.d, top, powerbuf, 0); | 1095 | | bn_scatter5(am.d, am.width, powerbuf, 1); | 1096 | | bn_mul_mont(tmp.d, am.d, am.d, np, n0, top); | 1097 | | bn_scatter5(tmp.d, top, powerbuf, 2); | 1098 | | | 1099 | | // Square to compute powers of two. | 1100 | | for (i = 4; i < 32; i *= 2) { | 1101 | | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); | 1102 | | bn_scatter5(tmp.d, top, powerbuf, i); | 1103 | | } | 1104 | | // Compute odd powers |i| based on |i - 1|, then all powers |i * 2^j|. | 1105 | | for (i = 3; i < 32; i += 2) { | 1106 | | bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1); | 1107 | | bn_scatter5(tmp.d, top, powerbuf, i); | 1108 | | for (int j = 2 * i; j < 32; j *= 2) { | 1109 | | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); | 1110 | | bn_scatter5(tmp.d, top, powerbuf, j); | 1111 | | } | 1112 | | } | 1113 | | | 1114 | | bits--; | 1115 | | for (wvalue = 0, i = bits % 5; i >= 0; i--, bits--) { | 1116 | | wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); | 1117 | | } | 1118 | | bn_gather5(tmp.d, top, powerbuf, wvalue); | 1119 | | | 1120 | | // At this point |bits| is 4 mod 5 and at least -1. (|bits| is the first bit | 1121 | | // that has not been read yet.) | 1122 | | assert(bits >= -1 && (bits == -1 || bits % 5 == 4)); | 1123 | | | 1124 | | // Scan the exponent one window at a time starting from the most | 1125 | | // significant bits. | 1126 | | if (!bn_power5_capable(top)) { | 1127 | | while (bits >= 0) { | 1128 | | for (wvalue = 0, i = 0; i < 5; i++, bits--) { | 1129 | | wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); | 1130 | | } | 1131 | | | 1132 | | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); | 1133 | | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); | 1134 | | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); | 1135 | | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); | 1136 | | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); | 1137 | | bn_mul_mont_gather5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue); | 1138 | | } | 1139 | | } else { | 1140 | | const uint8_t *p_bytes = (const uint8_t *)p->d; | 1141 | | assert(bits < max_bits); | 1142 | | // |p = 0| has been handled as a special case, so |max_bits| is at least | 1143 | | // one word. | 1144 | | assert(max_bits >= 64); | 1145 | | | 1146 | | // If the first bit to be read lands in the last byte, unroll the first | 1147 | | // iteration to avoid reading past the bounds of |p->d|. (After the first | 1148 | | // iteration, we are guaranteed to be past the last byte.) Note |bits| | 1149 | | // here is the top bit, inclusive. | 1150 | | if (bits - 4 >= max_bits - 8) { | 1151 | | // Read five bits from |bits-4| through |bits|, inclusive. | 1152 | | wvalue = p_bytes[p->width * BN_BYTES - 1]; | 1153 | | wvalue >>= (bits - 4) & 7; | 1154 | | wvalue &= 0x1f; | 1155 | | bits -= 5; | 1156 | | bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue); | 1157 | | } | 1158 | | while (bits >= 0) { | 1159 | | // Read five bits from |bits-4| through |bits|, inclusive. | 1160 | | int first_bit = bits - 4; | 1161 | | uint16_t val; | 1162 | | OPENSSL_memcpy(&val, p_bytes + (first_bit >> 3), sizeof(val)); | 1163 | | val >>= first_bit & 7; | 1164 | | val &= 0x1f; | 1165 | | bits -= 5; | 1166 | | bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, val); | 1167 | | } | 1168 | | } | 1169 | | // The result is now in |tmp| in Montgomery form, but it may not be fully | 1170 | | // reduced. This is within bounds for |BN_from_montgomery| (tmp < R <= m*R) | 1171 | | // so it will, when converting from Montgomery form, produce a fully reduced | 1172 | | // result. | 1173 | | // | 1174 | | // This differs from Figure 2 of the paper, which uses AMM(h, 1) to convert | 1175 | | // from Montgomery form with unreduced output, followed by an extra | 1176 | | // reduction step. In the paper's terminology, we replace steps 9 and 10 | 1177 | | // with MM(h, 1). | 1178 | | } else | 1179 | | #endif | 1180 | 3.09k | { | 1181 | 3.09k | copy_to_prebuf(&tmp, top, powerbuf, 0, window); | 1182 | 3.09k | copy_to_prebuf(&am, top, powerbuf, 1, window); | 1183 | | | 1184 | | // If the window size is greater than 1, then calculate | 1185 | | // val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1) | 1186 | | // (even powers could instead be computed as (a^(i/2))^2 | 1187 | | // to use the slight performance advantage of sqr over mul). | 1188 | 3.09k | if (window > 1) { | 1189 | 3.09k | if (!BN_mod_mul_montgomery(&tmp, &am, &am, mont, ctx)) { | 1190 | 0 | goto err; | 1191 | 0 | } | 1192 | | | 1193 | 3.09k | copy_to_prebuf(&tmp, top, powerbuf, 2, window); | 1194 | | | 1195 | 75.7k | for (i = 3; i < num_powers; i++) { | 1196 | | // Calculate a^i = a^(i-1) * a | 1197 | 72.6k | if (!BN_mod_mul_montgomery(&tmp, &am, &tmp, mont, ctx)) { | 1198 | 0 | goto err; | 1199 | 0 | } | 1200 | | | 1201 | 72.6k | copy_to_prebuf(&tmp, top, powerbuf, i, window); | 1202 | 72.6k | } | 1203 | 3.09k | } | 1204 | | | 1205 | 3.09k | bits--; | 1206 | 14.7k | for (wvalue = 0, i = bits % window; i >= 0; i--, bits--) { | 1207 | 11.6k | wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); | 1208 | 11.6k | } | 1209 | 3.09k | if (!copy_from_prebuf(&tmp, top, powerbuf, wvalue, window)) { | 1210 | 0 | goto err; | 1211 | 0 | } | 1212 | | | 1213 | | // Scan the exponent one window at a time starting from the most | 1214 | | // significant bits. | 1215 | 245k | while (bits >= 0) { | 1216 | 242k | wvalue = 0; // The 'value' of the window | 1217 | | | 1218 | | // Scan the window, squaring the result as we go | 1219 | 1.48M | for (i = 0; i < window; i++, bits--) { | 1220 | 1.23M | if (!BN_mod_mul_montgomery(&tmp, &tmp, &tmp, mont, ctx)) { | 1221 | 0 | goto err; | 1222 | 0 | } | 1223 | 1.23M | wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); | 1224 | 1.23M | } | 1225 | | | 1226 | | // Fetch the appropriate pre-computed value from the pre-buf | 1227 | 242k | if (!copy_from_prebuf(&am, top, powerbuf, wvalue, window)) { | 1228 | 0 | goto err; | 1229 | 0 | } | 1230 | | | 1231 | | // Multiply the result into the intermediate result | 1232 | 242k | if (!BN_mod_mul_montgomery(&tmp, &tmp, &am, mont, ctx)) { | 1233 | 0 | goto err; | 1234 | 0 | } | 1235 | 242k | } | 1236 | 3.09k | } | 1237 | | | 1238 | | // Convert the final result from Montgomery to standard format. If we used the | 1239 | | // |OPENSSL_BN_ASM_MONT5| codepath, |tmp| may not be fully reduced. It is only | 1240 | | // bounded by R rather than |m|. However, that is still within bounds for | 1241 | | // |BN_from_montgomery|, which implements full Montgomery reduction, not | 1242 | | // "almost" Montgomery reduction. | 1243 | 3.09k | if (!BN_from_montgomery(rr, &tmp, mont, ctx)) { | 1244 | 0 | goto err; | 1245 | 0 | } | 1246 | 3.09k | ret = 1; | 1247 | | | 1248 | 3.09k | err: | 1249 | 3.09k | BN_MONT_CTX_free(new_mont); | 1250 | 3.09k | if (powerbuf != NULL && powerbuf_free == NULL) { | 1251 | 0 | OPENSSL_cleanse(powerbuf, powerbuf_len); | 1252 | 0 | } | 1253 | 3.09k | OPENSSL_free(powerbuf_free); | 1254 | 3.09k | return ret; | 1255 | 3.09k | } |
BN_mod_exp_mont_consttime Line | Count | Source | 929 | 1.08k | const BN_MONT_CTX *mont) { | 930 | 1.08k | int i, ret = 0, wvalue; | 931 | 1.08k | BN_MONT_CTX *new_mont = NULL; | 932 | | | 933 | 1.08k | unsigned char *powerbuf_free = NULL; | 934 | 1.08k | size_t powerbuf_len = 0; | 935 | 1.08k | BN_ULONG *powerbuf = NULL; | 936 | | | 937 | 1.08k | if (!BN_is_odd(m)) { | 938 | 64 | OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS); | 939 | 64 | return 0; | 940 | 64 | } | 941 | 1.01k | if (m->neg) { | 942 | 0 | OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); | 943 | 0 | return 0; | 944 | 0 | } | 945 | | // |a| is secret, but it is required to be in range, so these comparisons may | 946 | | // be leaked. | 947 | 1.01k | if (a->neg || constant_time_declassify_int(BN_ucmp(a, m) >= 0)) { | 948 | 4 | OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED); | 949 | 4 | return 0; | 950 | 4 | } | 951 | | | 952 | | // Use all bits stored in |p|, rather than |BN_num_bits|, so we do not leak | 953 | | // whether the top bits are zero. | 954 | 1.01k | int max_bits = p->width * BN_BITS2; | 955 | 1.01k | int bits = max_bits; | 956 | 1.01k | if (bits == 0) { | 957 | | // x**0 mod 1 is still zero. | 958 | 0 | if (BN_abs_is_word(m, 1)) { | 959 | 0 | BN_zero(rr); | 960 | 0 | return 1; | 961 | 0 | } | 962 | 0 | return BN_one(rr); | 963 | 0 | } | 964 | | | 965 | | // Allocate a montgomery context if it was not supplied by the caller. | 966 | 1.01k | if (mont == NULL) { | 967 | 57 | new_mont = BN_MONT_CTX_new_consttime(m, ctx); | 968 | 57 | if (new_mont == NULL) { | 969 | 0 | goto err; | 970 | 0 | } | 971 | 57 | mont = new_mont; | 972 | 57 | } | 973 | | | 974 | | // Use the width in |mont->N|, rather than the copy in |m|. The assembly | 975 | | // implementation assumes it can use |top| to size R. | 976 | 1.01k | int top = mont->N.width; | 977 | | | 978 | 1.01k | #if defined(OPENSSL_BN_ASM_MONT5) || defined(RSAZ_ENABLED) | 979 | | // Share one large stack-allocated buffer between the RSAZ and non-RSAZ code | 980 | | // paths. If we were to use separate static buffers for each then there is | 981 | | // some chance that both large buffers would be allocated on the stack, | 982 | | // causing the stack space requirement to be truly huge (~10KB). | 983 | 1.01k | alignas(MOD_EXP_CTIME_ALIGN) BN_ULONG storage[MOD_EXP_CTIME_STORAGE_LEN]; | 984 | 1.01k | #endif | 985 | 1.01k | #if defined(RSAZ_ENABLED) | 986 | | // If the size of the operands allow it, perform the optimized RSAZ | 987 | | // exponentiation. For further information see crypto/fipsmodule/bn/rsaz_exp.c | 988 | | // and accompanying assembly modules. | 989 | 1.01k | if (a->width == 16 && p->width == 16 && BN_num_bits(m) == 1024 && | 990 | 1.01k | rsaz_avx2_preferred()) { | 991 | 0 | if (!bn_wexpand(rr, 16)) { | 992 | 0 | goto err; | 993 | 0 | } | 994 | 0 | RSAZ_1024_mod_exp_avx2(rr->d, a->d, p->d, m->d, mont->RR.d, mont->n0[0], | 995 | 0 | storage); | 996 | 0 | rr->width = 16; | 997 | 0 | rr->neg = 0; | 998 | 0 | ret = 1; | 999 | 0 | goto err; | 1000 | 0 | } | 1001 | 1.01k | #endif | 1002 | | | 1003 | | // Get the window size to use with size of p. | 1004 | 1.01k | int window = BN_window_bits_for_ctime_exponent_size(bits); | 1005 | 1.01k | assert(window <= BN_MAX_MOD_EXP_CTIME_WINDOW); | 1006 | | | 1007 | | // Calculating |powerbuf_len| below cannot overflow because of the bound on | 1008 | | // Montgomery reduction. | 1009 | 1.01k | assert((size_t)top <= BN_MONTGOMERY_MAX_WORDS); | 1010 | 1.01k | static_assert( | 1011 | 1.01k | BN_MONTGOMERY_MAX_WORDS <= | 1012 | 1.01k | INT_MAX / sizeof(BN_ULONG) / ((1 << BN_MAX_MOD_EXP_CTIME_WINDOW) + 3), | 1013 | 1.01k | "powerbuf_len may overflow"); | 1014 | | | 1015 | 1.01k | #if defined(OPENSSL_BN_ASM_MONT5) | 1016 | 1.01k | if (window >= 5) { | 1017 | 211 | window = 5; // ~5% improvement for RSA2048 sign, and even for RSA4096 | 1018 | | // Reserve space for the |mont->N| copy. | 1019 | 211 | powerbuf_len += top * sizeof(mont->N.d[0]); | 1020 | 211 | } | 1021 | 1.01k | #endif | 1022 | | | 1023 | | // Allocate a buffer large enough to hold all of the pre-computed | 1024 | | // powers of |am|, |am| itself, and |tmp|. | 1025 | 1.01k | int num_powers = 1 << window; | 1026 | 1.01k | powerbuf_len += sizeof(m->d[0]) * top * (num_powers + 2); | 1027 | | | 1028 | 1.01k | #if defined(OPENSSL_BN_ASM_MONT5) | 1029 | 1.01k | if (powerbuf_len <= sizeof(storage)) { | 1030 | 942 | powerbuf = storage; | 1031 | 942 | } | 1032 | | // |storage| is more than large enough to handle 1024-bit inputs. | 1033 | 1.01k | assert(powerbuf != NULL || top * BN_BITS2 > 1024); | 1034 | 1.01k | #endif | 1035 | 1.01k | if (powerbuf == NULL) { | 1036 | 73 | powerbuf_free = OPENSSL_malloc(powerbuf_len + MOD_EXP_CTIME_ALIGN); | 1037 | 73 | if (powerbuf_free == NULL) { | 1038 | 0 | goto err; | 1039 | 0 | } | 1040 | 73 | powerbuf = align_pointer(powerbuf_free, MOD_EXP_CTIME_ALIGN); | 1041 | 73 | } | 1042 | 1.01k | OPENSSL_memset(powerbuf, 0, powerbuf_len); | 1043 | | | 1044 | | // Place |tmp| and |am| right after powers table. | 1045 | 1.01k | BIGNUM tmp, am; | 1046 | 1.01k | tmp.d = powerbuf + top * num_powers; | 1047 | 1.01k | am.d = tmp.d + top; | 1048 | 1.01k | tmp.width = am.width = 0; | 1049 | 1.01k | tmp.dmax = am.dmax = top; | 1050 | 1.01k | tmp.neg = am.neg = 0; | 1051 | 1.01k | tmp.flags = am.flags = BN_FLG_STATIC_DATA; | 1052 | | | 1053 | 1.01k | if (!bn_one_to_montgomery(&tmp, mont, ctx) || | 1054 | 1.01k | !bn_resize_words(&tmp, top)) { | 1055 | 0 | goto err; | 1056 | 0 | } | 1057 | | | 1058 | | // Prepare a^1 in the Montgomery domain. | 1059 | 1.01k | assert(!a->neg); | 1060 | 1.01k | declassify_assert(BN_ucmp(a, m) < 0); | 1061 | 1.01k | if (!BN_to_montgomery(&am, a, mont, ctx) || | 1062 | 1.01k | !bn_resize_words(&am, top)) { | 1063 | 0 | goto err; | 1064 | 0 | } | 1065 | | | 1066 | 1.01k | #if defined(OPENSSL_BN_ASM_MONT5) | 1067 | | // This optimization uses ideas from https://eprint.iacr.org/2011/239, | 1068 | | // specifically optimization of cache-timing attack countermeasures, | 1069 | | // pre-computation optimization, and Almost Montgomery Multiplication. | 1070 | | // | 1071 | | // The paper discusses a 4-bit window to optimize 512-bit modular | 1072 | | // exponentiation, used in RSA-1024 with CRT, but RSA-1024 is no longer | 1073 | | // important. | 1074 | | // | 1075 | | // |bn_mul_mont_gather5| and |bn_power5| implement the "almost" reduction | 1076 | | // variant, so the values here may not be fully reduced. They are bounded by R | 1077 | | // (i.e. they fit in |top| words), not |m|. Additionally, we pass these | 1078 | | // "almost" reduced inputs into |bn_mul_mont|, which implements the normal | 1079 | | // reduction variant. Given those inputs, |bn_mul_mont| may not give reduced | 1080 | | // output, but it will still produce "almost" reduced output. | 1081 | | // | 1082 | | // TODO(davidben): Using "almost" reduction complicates analysis of this code, | 1083 | | // and its interaction with other parts of the project. Determine whether this | 1084 | | // is actually necessary for performance. | 1085 | 1.01k | if (window == 5 && top > 1) { | 1086 | | // Copy |mont->N| to improve cache locality. | 1087 | 211 | BN_ULONG *np = am.d + top; | 1088 | 6.57k | for (i = 0; i < top; i++) { | 1089 | 6.36k | np[i] = mont->N.d[i]; | 1090 | 6.36k | } | 1091 | | | 1092 | | // Fill |powerbuf| with the first 32 powers of |am|. | 1093 | 211 | const BN_ULONG *n0 = mont->n0; | 1094 | 211 | bn_scatter5(tmp.d, top, powerbuf, 0); | 1095 | 211 | bn_scatter5(am.d, am.width, powerbuf, 1); | 1096 | 211 | bn_mul_mont(tmp.d, am.d, am.d, np, n0, top); | 1097 | 211 | bn_scatter5(tmp.d, top, powerbuf, 2); | 1098 | | | 1099 | | // Square to compute powers of two. | 1100 | 844 | for (i = 4; i < 32; i *= 2) { | 1101 | 633 | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); | 1102 | 633 | bn_scatter5(tmp.d, top, powerbuf, i); | 1103 | 633 | } | 1104 | | // Compute odd powers |i| based on |i - 1|, then all powers |i * 2^j|. | 1105 | 3.37k | for (i = 3; i < 32; i += 2) { | 1106 | 3.16k | bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1); | 1107 | 3.16k | bn_scatter5(tmp.d, top, powerbuf, i); | 1108 | 5.48k | for (int j = 2 * i; j < 32; j *= 2) { | 1109 | 2.32k | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); | 1110 | 2.32k | bn_scatter5(tmp.d, top, powerbuf, j); | 1111 | 2.32k | } | 1112 | 3.16k | } | 1113 | | | 1114 | 211 | bits--; | 1115 | 1.03k | for (wvalue = 0, i = bits % 5; i >= 0; i--, bits--) { | 1116 | 819 | wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); | 1117 | 819 | } | 1118 | 211 | bn_gather5(tmp.d, top, powerbuf, wvalue); | 1119 | | | 1120 | | // At this point |bits| is 4 mod 5 and at least -1. (|bits| is the first bit | 1121 | | // that has not been read yet.) | 1122 | 211 | assert(bits >= -1 && (bits == -1 || bits % 5 == 4)); | 1123 | | | 1124 | | // Scan the exponent one window at a time starting from the most | 1125 | | // significant bits. | 1126 | 211 | if (!bn_power5_capable(top)) { | 1127 | 33.2k | while (bits >= 0) { | 1128 | 198k | for (wvalue = 0, i = 0; i < 5; i++, bits--) { | 1129 | 165k | wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); | 1130 | 165k | } | 1131 | | | 1132 | 33.0k | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); | 1133 | 33.0k | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); | 1134 | 33.0k | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); | 1135 | 33.0k | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); | 1136 | 33.0k | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); | 1137 | 33.0k | bn_mul_mont_gather5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue); | 1138 | 33.0k | } | 1139 | 189 | } else { | 1140 | 22 | const uint8_t *p_bytes = (const uint8_t *)p->d; | 1141 | 22 | assert(bits < max_bits); | 1142 | | // |p = 0| has been handled as a special case, so |max_bits| is at least | 1143 | | // one word. | 1144 | 22 | assert(max_bits >= 64); | 1145 | | | 1146 | | // If the first bit to be read lands in the last byte, unroll the first | 1147 | | // iteration to avoid reading past the bounds of |p->d|. (After the first | 1148 | | // iteration, we are guaranteed to be past the last byte.) Note |bits| | 1149 | | // here is the top bit, inclusive. | 1150 | 22 | if (bits - 4 >= max_bits - 8) { | 1151 | | // Read five bits from |bits-4| through |bits|, inclusive. | 1152 | 21 | wvalue = p_bytes[p->width * BN_BYTES - 1]; | 1153 | 21 | wvalue >>= (bits - 4) & 7; | 1154 | 21 | wvalue &= 0x1f; | 1155 | 21 | bits -= 5; | 1156 | 21 | bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue); | 1157 | 21 | } | 1158 | 4.78k | while (bits >= 0) { | 1159 | | // Read five bits from |bits-4| through |bits|, inclusive. | 1160 | 4.75k | int first_bit = bits - 4; | 1161 | 4.75k | uint16_t val; | 1162 | 4.75k | OPENSSL_memcpy(&val, p_bytes + (first_bit >> 3), sizeof(val)); | 1163 | 4.75k | val >>= first_bit & 7; | 1164 | 4.75k | val &= 0x1f; | 1165 | 4.75k | bits -= 5; | 1166 | 4.75k | bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, val); | 1167 | 4.75k | } | 1168 | 22 | } | 1169 | | // The result is now in |tmp| in Montgomery form, but it may not be fully | 1170 | | // reduced. This is within bounds for |BN_from_montgomery| (tmp < R <= m*R) | 1171 | | // so it will, when converting from Montgomery form, produce a fully reduced | 1172 | | // result. | 1173 | | // | 1174 | | // This differs from Figure 2 of the paper, which uses AMM(h, 1) to convert | 1175 | | // from Montgomery form with unreduced output, followed by an extra | 1176 | | // reduction step. In the paper's terminology, we replace steps 9 and 10 | 1177 | | // with MM(h, 1). | 1178 | 211 | } else | 1179 | 804 | #endif | 1180 | 804 | { | 1181 | 804 | copy_to_prebuf(&tmp, top, powerbuf, 0, window); | 1182 | 804 | copy_to_prebuf(&am, top, powerbuf, 1, window); | 1183 | | | 1184 | | // If the window size is greater than 1, then calculate | 1185 | | // val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1) | 1186 | | // (even powers could instead be computed as (a^(i/2))^2 | 1187 | | // to use the slight performance advantage of sqr over mul). | 1188 | 804 | if (window > 1) { | 1189 | 804 | if (!BN_mod_mul_montgomery(&tmp, &am, &am, mont, ctx)) { | 1190 | 0 | goto err; | 1191 | 0 | } | 1192 | | | 1193 | 804 | copy_to_prebuf(&tmp, top, powerbuf, 2, window); | 1194 | | | 1195 | 11.2k | for (i = 3; i < num_powers; i++) { | 1196 | | // Calculate a^i = a^(i-1) * a | 1197 | 10.3k | if (!BN_mod_mul_montgomery(&tmp, &am, &tmp, mont, ctx)) { | 1198 | 0 | goto err; | 1199 | 0 | } | 1200 | | | 1201 | 10.3k | copy_to_prebuf(&tmp, top, powerbuf, i, window); | 1202 | 10.3k | } | 1203 | 804 | } | 1204 | | | 1205 | 804 | bits--; | 1206 | 3.99k | for (wvalue = 0, i = bits % window; i >= 0; i--, bits--) { | 1207 | 3.19k | wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); | 1208 | 3.19k | } | 1209 | 804 | if (!copy_from_prebuf(&tmp, top, powerbuf, wvalue, window)) { | 1210 | 0 | goto err; | 1211 | 0 | } | 1212 | | | 1213 | | // Scan the exponent one window at a time starting from the most | 1214 | | // significant bits. | 1215 | 38.2k | while (bits >= 0) { | 1216 | 37.4k | wvalue = 0; // The 'value' of the window | 1217 | | | 1218 | | // Scan the window, squaring the result as we go | 1219 | 187k | for (i = 0; i < window; i++, bits--) { | 1220 | 149k | if (!BN_mod_mul_montgomery(&tmp, &tmp, &tmp, mont, ctx)) { | 1221 | 0 | goto err; | 1222 | 0 | } | 1223 | 149k | wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); | 1224 | 149k | } | 1225 | | | 1226 | | // Fetch the appropriate pre-computed value from the pre-buf | 1227 | 37.4k | if (!copy_from_prebuf(&am, top, powerbuf, wvalue, window)) { | 1228 | 0 | goto err; | 1229 | 0 | } | 1230 | | | 1231 | | // Multiply the result into the intermediate result | 1232 | 37.4k | if (!BN_mod_mul_montgomery(&tmp, &tmp, &am, mont, ctx)) { | 1233 | 0 | goto err; | 1234 | 0 | } | 1235 | 37.4k | } | 1236 | 804 | } | 1237 | | | 1238 | | // Convert the final result from Montgomery to standard format. If we used the | 1239 | | // |OPENSSL_BN_ASM_MONT5| codepath, |tmp| may not be fully reduced. It is only | 1240 | | // bounded by R rather than |m|. However, that is still within bounds for | 1241 | | // |BN_from_montgomery|, which implements full Montgomery reduction, not | 1242 | | // "almost" Montgomery reduction. | 1243 | 1.01k | if (!BN_from_montgomery(rr, &tmp, mont, ctx)) { | 1244 | 0 | goto err; | 1245 | 0 | } | 1246 | 1.01k | ret = 1; | 1247 | | | 1248 | 1.01k | err: | 1249 | 1.01k | BN_MONT_CTX_free(new_mont); | 1250 | 1.01k | if (powerbuf != NULL && powerbuf_free == NULL) { | 1251 | 942 | OPENSSL_cleanse(powerbuf, powerbuf_len); | 1252 | 942 | } | 1253 | 1.01k | OPENSSL_free(powerbuf_free); | 1254 | 1.01k | return ret; | 1255 | 1.01k | } |
|
1256 | | |
1257 | | int BN_mod_exp_mont_word(BIGNUM *rr, BN_ULONG a, const BIGNUM *p, |
1258 | | const BIGNUM *m, BN_CTX *ctx, |
1259 | | const BN_MONT_CTX *mont) { |
1260 | | BIGNUM a_bignum; |
1261 | | BN_init(&a_bignum); |
1262 | | |
1263 | | int ret = 0; |
1264 | | |
1265 | | // BN_mod_exp_mont requires reduced inputs. |
1266 | | if (bn_minimal_width(m) == 1) { |
1267 | | a %= m->d[0]; |
1268 | | } |
1269 | | |
1270 | | if (!BN_set_word(&a_bignum, a)) { |
1271 | | OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR); |
1272 | | goto err; |
1273 | | } |
1274 | | |
1275 | | ret = BN_mod_exp_mont(rr, &a_bignum, p, m, ctx, mont); |
1276 | | |
1277 | | err: |
1278 | | BN_free(&a_bignum); |
1279 | | |
1280 | | return ret; |
1281 | | } |
1282 | | |
1283 | | #define TABLE_SIZE 32 |
1284 | | |
1285 | | int BN_mod_exp2_mont(BIGNUM *rr, const BIGNUM *a1, const BIGNUM *p1, |
1286 | | const BIGNUM *a2, const BIGNUM *p2, const BIGNUM *m, |
1287 | 12 | BN_CTX *ctx, const BN_MONT_CTX *mont) { |
1288 | 12 | BIGNUM tmp; |
1289 | 12 | BN_init(&tmp); |
1290 | | |
1291 | 12 | int ret = 0; |
1292 | 12 | BN_MONT_CTX *new_mont = NULL; |
1293 | | |
1294 | | // Allocate a montgomery context if it was not supplied by the caller. |
1295 | 12 | if (mont == NULL) { |
1296 | 12 | new_mont = BN_MONT_CTX_new_for_modulus(m, ctx); |
1297 | 12 | if (new_mont == NULL) { |
1298 | 11 | goto err; |
1299 | 11 | } |
1300 | 1 | mont = new_mont; |
1301 | 1 | } |
1302 | | |
1303 | | // BN_mod_mul_montgomery removes one Montgomery factor, so passing one |
1304 | | // Montgomery-encoded and one non-Montgomery-encoded value gives a |
1305 | | // non-Montgomery-encoded result. |
1306 | 1 | if (!BN_mod_exp_mont(rr, a1, p1, m, ctx, mont) || |
1307 | 1 | !BN_mod_exp_mont(&tmp, a2, p2, m, ctx, mont) || |
1308 | 1 | !BN_to_montgomery(rr, rr, mont, ctx) || |
1309 | 1 | !BN_mod_mul_montgomery(rr, rr, &tmp, mont, ctx)) { |
1310 | 0 | goto err; |
1311 | 0 | } |
1312 | | |
1313 | 1 | ret = 1; |
1314 | | |
1315 | 12 | err: |
1316 | 12 | BN_MONT_CTX_free(new_mont); |
1317 | 12 | BN_free(&tmp); |
1318 | | |
1319 | 12 | return ret; |
1320 | 1 | } |