/src/boringssl/crypto/fipsmodule/bn/exponentiation.c.inc
Line | Count | Source (jump to first uncovered line) |
1 | | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) |
2 | | * All rights reserved. |
3 | | * |
4 | | * This package is an SSL implementation written |
5 | | * by Eric Young (eay@cryptsoft.com). |
6 | | * The implementation was written so as to conform with Netscapes SSL. |
7 | | * |
8 | | * This library is free for commercial and non-commercial use as long as |
9 | | * the following conditions are aheared to. The following conditions |
10 | | * apply to all code found in this distribution, be it the RC4, RSA, |
11 | | * lhash, DES, etc., code; not just the SSL code. The SSL documentation |
12 | | * included with this distribution is covered by the same copyright terms |
13 | | * except that the holder is Tim Hudson (tjh@cryptsoft.com). |
14 | | * |
15 | | * Copyright remains Eric Young's, and as such any Copyright notices in |
16 | | * the code are not to be removed. |
17 | | * If this package is used in a product, Eric Young should be given attribution |
18 | | * as the author of the parts of the library used. |
19 | | * This can be in the form of a textual message at program startup or |
20 | | * in documentation (online or textual) provided with the package. |
21 | | * |
22 | | * Redistribution and use in source and binary forms, with or without |
23 | | * modification, are permitted provided that the following conditions |
24 | | * are met: |
25 | | * 1. Redistributions of source code must retain the copyright |
26 | | * notice, this list of conditions and the following disclaimer. |
27 | | * 2. Redistributions in binary form must reproduce the above copyright |
28 | | * notice, this list of conditions and the following disclaimer in the |
29 | | * documentation and/or other materials provided with the distribution. |
30 | | * 3. All advertising materials mentioning features or use of this software |
31 | | * must display the following acknowledgement: |
32 | | * "This product includes cryptographic software written by |
33 | | * Eric Young (eay@cryptsoft.com)" |
34 | | * The word 'cryptographic' can be left out if the rouines from the library |
35 | | * being used are not cryptographic related :-). |
36 | | * 4. If you include any Windows specific code (or a derivative thereof) from |
37 | | * the apps directory (application code) you must include an acknowledgement: |
38 | | * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" |
39 | | * |
40 | | * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND |
41 | | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
42 | | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
43 | | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
44 | | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
45 | | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
46 | | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
47 | | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
48 | | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
49 | | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
50 | | * SUCH DAMAGE. |
51 | | * |
52 | | * The licence and distribution terms for any publically available version or |
53 | | * derivative of this code cannot be changed. i.e. this code cannot simply be |
54 | | * copied and put under another distribution licence |
55 | | * [including the GNU Public Licence.] |
56 | | */ |
57 | | /* ==================================================================== |
58 | | * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved. |
59 | | * |
60 | | * Redistribution and use in source and binary forms, with or without |
61 | | * modification, are permitted provided that the following conditions |
62 | | * are met: |
63 | | * |
64 | | * 1. Redistributions of source code must retain the above copyright |
65 | | * notice, this list of conditions and the following disclaimer. |
66 | | * |
67 | | * 2. Redistributions in binary form must reproduce the above copyright |
68 | | * notice, this list of conditions and the following disclaimer in |
69 | | * the documentation and/or other materials provided with the |
70 | | * distribution. |
71 | | * |
72 | | * 3. All advertising materials mentioning features or use of this |
73 | | * software must display the following acknowledgment: |
74 | | * "This product includes software developed by the OpenSSL Project |
75 | | * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" |
76 | | * |
77 | | * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to |
78 | | * endorse or promote products derived from this software without |
79 | | * prior written permission. For written permission, please contact |
80 | | * openssl-core@openssl.org. |
81 | | * |
82 | | * 5. Products derived from this software may not be called "OpenSSL" |
83 | | * nor may "OpenSSL" appear in their names without prior written |
84 | | * permission of the OpenSSL Project. |
85 | | * |
86 | | * 6. Redistributions of any form whatsoever must retain the following |
87 | | * acknowledgment: |
88 | | * "This product includes software developed by the OpenSSL Project |
89 | | * for use in the OpenSSL Toolkit (http://www.openssl.org/)" |
90 | | * |
91 | | * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY |
92 | | * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
93 | | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
94 | | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR |
95 | | * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
96 | | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
97 | | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
98 | | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
99 | | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, |
100 | | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
101 | | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED |
102 | | * OF THE POSSIBILITY OF SUCH DAMAGE. |
103 | | * ==================================================================== |
104 | | * |
105 | | * This product includes cryptographic software written by Eric Young |
106 | | * (eay@cryptsoft.com). This product includes software written by Tim |
107 | | * Hudson (tjh@cryptsoft.com). */ |
108 | | |
109 | | #include <openssl/bn.h> |
110 | | |
111 | | #include <assert.h> |
112 | | #include <limits.h> |
113 | | #include <stdlib.h> |
114 | | #include <string.h> |
115 | | |
116 | | #include <openssl/err.h> |
117 | | #include <openssl/mem.h> |
118 | | |
119 | | #include "internal.h" |
120 | | #include "rsaz_exp.h" |
121 | | |
122 | | #if defined(OPENSSL_BN_ASM_MONT5) |
123 | | |
124 | | // bn_mul_mont_gather5 multiples loads index |power| of |table|, multiplies it |
125 | | // by |ap| modulo |np|, and stores the result in |rp|. The values are |num| |
126 | | // words long and represented in Montgomery form. |n0| is a pointer to the |
127 | | // corresponding field in |BN_MONT_CTX|. |table| must be aligned to at least |
128 | | // 16 bytes. |power| must be less than 32 and is treated as secret. |
129 | | // |
130 | | // WARNING: This function implements Almost Montgomery Multiplication from |
131 | | // https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced. |
132 | | // However, even if they are fully reduced, the output may not be. |
133 | | static void bn_mul_mont_gather5( |
134 | | BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table, const BN_ULONG *np, |
135 | 391k | const BN_ULONG *n0, int num, int power) { |
136 | 391k | if (bn_mulx4x_mont_gather5_capable(num)) { |
137 | 0 | bn_mulx4x_mont_gather5(rp, ap, table, np, n0, num, power); |
138 | 391k | } else if (bn_mul4x_mont_gather5_capable(num)) { |
139 | 0 | bn_mul4x_mont_gather5(rp, ap, table, np, n0, num, power); |
140 | 391k | } else { |
141 | 391k | bn_mul_mont_gather5_nohw(rp, ap, table, np, n0, num, power); |
142 | 391k | } |
143 | 391k | } |
144 | | |
145 | | // bn_power5 squares |ap| five times and multiplies it by the value stored at |
146 | | // index |power| of |table|, modulo |np|. It stores the result in |rp|. The |
147 | | // values are |num| words long and represented in Montgomery form. |n0| is a |
148 | | // pointer to the corresponding field in |BN_MONT_CTX|. |num| must be divisible |
149 | | // by 8. |power| must be less than 32 and is treated as secret. |
150 | | // |
151 | | // WARNING: This function implements Almost Montgomery Multiplication from |
152 | | // https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced. |
153 | | // However, even if they are fully reduced, the output may not be. |
154 | | static void bn_power5(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table, |
155 | | const BN_ULONG *np, const BN_ULONG *n0, int num, |
156 | 0 | int power) { |
157 | 0 | assert(bn_power5_capable(num)); |
158 | 0 | if (bn_powerx5_capable(num)) { |
159 | 0 | bn_powerx5(rp, ap, table, np, n0, num, power); |
160 | 0 | } else { |
161 | 0 | bn_power5_nohw(rp, ap, table, np, n0, num, power); |
162 | 0 | } |
163 | 0 | } |
164 | | |
165 | | #endif // defined(OPENSSL_BN_ASM_MONT5) |
166 | | |
167 | 188 | int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) { |
168 | 188 | int i, bits, ret = 0; |
169 | 188 | BIGNUM *v, *rr; |
170 | | |
171 | 188 | BN_CTX_start(ctx); |
172 | 188 | if (r == a || r == p) { |
173 | 0 | rr = BN_CTX_get(ctx); |
174 | 188 | } else { |
175 | 188 | rr = r; |
176 | 188 | } |
177 | | |
178 | 188 | v = BN_CTX_get(ctx); |
179 | 188 | if (rr == NULL || v == NULL) { |
180 | 0 | goto err; |
181 | 0 | } |
182 | | |
183 | 188 | if (BN_copy(v, a) == NULL) { |
184 | 0 | goto err; |
185 | 0 | } |
186 | 188 | bits = BN_num_bits(p); |
187 | | |
188 | 188 | if (BN_is_odd(p)) { |
189 | 82 | if (BN_copy(rr, a) == NULL) { |
190 | 0 | goto err; |
191 | 0 | } |
192 | 106 | } else { |
193 | 106 | if (!BN_one(rr)) { |
194 | 0 | goto err; |
195 | 0 | } |
196 | 106 | } |
197 | | |
198 | 1.52k | for (i = 1; i < bits; i++) { |
199 | 1.34k | if (!BN_sqr(v, v, ctx)) { |
200 | 0 | goto err; |
201 | 0 | } |
202 | 1.34k | if (BN_is_bit_set(p, i)) { |
203 | 872 | if (!BN_mul(rr, rr, v, ctx)) { |
204 | 0 | goto err; |
205 | 0 | } |
206 | 872 | } |
207 | 1.34k | } |
208 | | |
209 | 188 | if (r != rr && !BN_copy(r, rr)) { |
210 | 0 | goto err; |
211 | 0 | } |
212 | 188 | ret = 1; |
213 | | |
214 | 188 | err: |
215 | 188 | BN_CTX_end(ctx); |
216 | 188 | return ret; |
217 | 188 | } |
218 | | |
219 | | typedef struct bn_recp_ctx_st { |
220 | | BIGNUM N; // the divisor |
221 | | BIGNUM Nr; // the reciprocal |
222 | | int num_bits; |
223 | | int shift; |
224 | | int flags; |
225 | | } BN_RECP_CTX; |
226 | | |
227 | 1.32k | static void BN_RECP_CTX_init(BN_RECP_CTX *recp) { |
228 | 1.32k | BN_init(&recp->N); |
229 | 1.32k | BN_init(&recp->Nr); |
230 | 1.32k | recp->num_bits = 0; |
231 | 1.32k | recp->shift = 0; |
232 | 1.32k | recp->flags = 0; |
233 | 1.32k | } |
234 | | |
235 | 1.32k | static void BN_RECP_CTX_free(BN_RECP_CTX *recp) { |
236 | 1.32k | if (recp == NULL) { |
237 | 0 | return; |
238 | 0 | } |
239 | | |
240 | 1.32k | BN_free(&recp->N); |
241 | 1.32k | BN_free(&recp->Nr); |
242 | 1.32k | } |
243 | | |
244 | 1.32k | static int BN_RECP_CTX_set(BN_RECP_CTX *recp, const BIGNUM *d, BN_CTX *ctx) { |
245 | 1.32k | if (!BN_copy(&(recp->N), d)) { |
246 | 0 | return 0; |
247 | 0 | } |
248 | 1.32k | BN_zero(&recp->Nr); |
249 | 1.32k | recp->num_bits = BN_num_bits(d); |
250 | 1.32k | recp->shift = 0; |
251 | | |
252 | 1.32k | return 1; |
253 | 1.32k | } |
254 | | |
255 | | // len is the expected size of the result We actually calculate with an extra |
256 | | // word of precision, so we can do faster division if the remainder is not |
257 | | // required. |
258 | | // r := 2^len / m |
259 | 1.31k | static int BN_reciprocal(BIGNUM *r, const BIGNUM *m, int len, BN_CTX *ctx) { |
260 | 1.31k | int ret = -1; |
261 | 1.31k | BIGNUM *t; |
262 | | |
263 | 1.31k | BN_CTX_start(ctx); |
264 | 1.31k | t = BN_CTX_get(ctx); |
265 | 1.31k | if (t == NULL) { |
266 | 0 | goto err; |
267 | 0 | } |
268 | | |
269 | 1.31k | if (!BN_set_bit(t, len)) { |
270 | 0 | goto err; |
271 | 0 | } |
272 | | |
273 | 1.31k | if (!BN_div(r, NULL, t, m, ctx)) { |
274 | 0 | goto err; |
275 | 0 | } |
276 | | |
277 | 1.31k | ret = len; |
278 | | |
279 | 1.31k | err: |
280 | 1.31k | BN_CTX_end(ctx); |
281 | 1.31k | return ret; |
282 | 1.31k | } |
283 | | |
284 | | static int BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, |
285 | 391k | BN_RECP_CTX *recp, BN_CTX *ctx) { |
286 | 391k | int i, j, ret = 0; |
287 | 391k | BIGNUM *a, *b, *d, *r; |
288 | | |
289 | 391k | BN_CTX_start(ctx); |
290 | 391k | a = BN_CTX_get(ctx); |
291 | 391k | b = BN_CTX_get(ctx); |
292 | 391k | if (dv != NULL) { |
293 | 0 | d = dv; |
294 | 391k | } else { |
295 | 391k | d = BN_CTX_get(ctx); |
296 | 391k | } |
297 | | |
298 | 391k | if (rem != NULL) { |
299 | 391k | r = rem; |
300 | 391k | } else { |
301 | 0 | r = BN_CTX_get(ctx); |
302 | 0 | } |
303 | | |
304 | 391k | if (a == NULL || b == NULL || d == NULL || r == NULL) { |
305 | 0 | goto err; |
306 | 0 | } |
307 | | |
308 | 391k | if (BN_ucmp(m, &recp->N) < 0) { |
309 | 16.3k | BN_zero(d); |
310 | 16.3k | if (!BN_copy(r, m)) { |
311 | 0 | goto err; |
312 | 0 | } |
313 | 16.3k | BN_CTX_end(ctx); |
314 | 16.3k | return 1; |
315 | 16.3k | } |
316 | | |
317 | | // We want the remainder |
318 | | // Given input of ABCDEF / ab |
319 | | // we need multiply ABCDEF by 3 digests of the reciprocal of ab |
320 | | |
321 | | // i := max(BN_num_bits(m), 2*BN_num_bits(N)) |
322 | 374k | i = BN_num_bits(m); |
323 | 374k | j = recp->num_bits << 1; |
324 | 374k | if (j > i) { |
325 | 319k | i = j; |
326 | 319k | } |
327 | | |
328 | | // Nr := round(2^i / N) |
329 | 374k | if (i != recp->shift) { |
330 | 1.31k | recp->shift = |
331 | 1.31k | BN_reciprocal(&(recp->Nr), &(recp->N), i, |
332 | 1.31k | ctx); // BN_reciprocal returns i, or -1 for an error |
333 | 1.31k | } |
334 | | |
335 | 374k | if (recp->shift == -1) { |
336 | 0 | goto err; |
337 | 0 | } |
338 | | |
339 | | // d := |round(round(m / 2^BN_num_bits(N)) * recp->Nr / 2^(i - |
340 | | // BN_num_bits(N)))| |
341 | | // = |round(round(m / 2^BN_num_bits(N)) * round(2^i / N) / 2^(i - |
342 | | // BN_num_bits(N)))| |
343 | | // <= |(m / 2^BN_num_bits(N)) * (2^i / N) * (2^BN_num_bits(N) / 2^i)| |
344 | | // = |m/N| |
345 | 374k | if (!BN_rshift(a, m, recp->num_bits)) { |
346 | 0 | goto err; |
347 | 0 | } |
348 | 374k | if (!BN_mul(b, a, &(recp->Nr), ctx)) { |
349 | 0 | goto err; |
350 | 0 | } |
351 | 374k | if (!BN_rshift(d, b, i - recp->num_bits)) { |
352 | 0 | goto err; |
353 | 0 | } |
354 | 374k | d->neg = 0; |
355 | | |
356 | 374k | if (!BN_mul(b, &(recp->N), d, ctx)) { |
357 | 0 | goto err; |
358 | 0 | } |
359 | 374k | if (!BN_usub(r, m, b)) { |
360 | 0 | goto err; |
361 | 0 | } |
362 | 374k | r->neg = 0; |
363 | | |
364 | 374k | j = 0; |
365 | 660k | while (BN_ucmp(r, &(recp->N)) >= 0) { |
366 | 285k | if (j++ > 2) { |
367 | 0 | OPENSSL_PUT_ERROR(BN, BN_R_BAD_RECIPROCAL); |
368 | 0 | goto err; |
369 | 0 | } |
370 | 285k | if (!BN_usub(r, r, &(recp->N))) { |
371 | 0 | goto err; |
372 | 0 | } |
373 | 285k | if (!BN_add_word(d, 1)) { |
374 | 0 | goto err; |
375 | 0 | } |
376 | 285k | } |
377 | | |
378 | 374k | r->neg = BN_is_zero(r) ? 0 : m->neg; |
379 | 374k | d->neg = m->neg ^ recp->N.neg; |
380 | 374k | ret = 1; |
381 | | |
382 | 374k | err: |
383 | 374k | BN_CTX_end(ctx); |
384 | 374k | return ret; |
385 | 374k | } |
386 | | |
387 | | static int BN_mod_mul_reciprocal(BIGNUM *r, const BIGNUM *x, const BIGNUM *y, |
388 | 391k | BN_RECP_CTX *recp, BN_CTX *ctx) { |
389 | 391k | int ret = 0; |
390 | 391k | BIGNUM *a; |
391 | 391k | const BIGNUM *ca; |
392 | | |
393 | 391k | BN_CTX_start(ctx); |
394 | 391k | a = BN_CTX_get(ctx); |
395 | 391k | if (a == NULL) { |
396 | 0 | goto err; |
397 | 0 | } |
398 | | |
399 | 391k | if (y != NULL) { |
400 | 391k | if (x == y) { |
401 | 321k | if (!BN_sqr(a, x, ctx)) { |
402 | 0 | goto err; |
403 | 0 | } |
404 | 321k | } else { |
405 | 69.7k | if (!BN_mul(a, x, y, ctx)) { |
406 | 0 | goto err; |
407 | 0 | } |
408 | 69.7k | } |
409 | 391k | ca = a; |
410 | 391k | } else { |
411 | 0 | ca = x; // Just do the mod |
412 | 0 | } |
413 | | |
414 | 391k | ret = BN_div_recp(NULL, r, ca, recp, ctx); |
415 | | |
416 | 391k | err: |
417 | 391k | BN_CTX_end(ctx); |
418 | 391k | return ret; |
419 | 391k | } |
420 | | |
421 | | // BN_window_bits_for_exponent_size returns sliding window size for mod_exp with |
422 | | // a |b| bit exponent. |
423 | | // |
424 | | // For window size 'w' (w >= 2) and a random 'b' bits exponent, the number of |
425 | | // multiplications is a constant plus on average |
426 | | // |
427 | | // 2^(w-1) + (b-w)/(w+1); |
428 | | // |
429 | | // here 2^(w-1) is for precomputing the table (we actually need entries only |
430 | | // for windows that have the lowest bit set), and (b-w)/(w+1) is an |
431 | | // approximation for the expected number of w-bit windows, not counting the |
432 | | // first one. |
433 | | // |
434 | | // Thus we should use |
435 | | // |
436 | | // w >= 6 if b > 671 |
437 | | // w = 5 if 671 > b > 239 |
438 | | // w = 4 if 239 > b > 79 |
439 | | // w = 3 if 79 > b > 23 |
440 | | // w <= 2 if 23 > b |
441 | | // |
442 | | // (with draws in between). Very small exponents are often selected |
443 | | // with low Hamming weight, so we use w = 1 for b <= 23. |
444 | 2.16k | static int BN_window_bits_for_exponent_size(size_t b) { |
445 | 2.16k | if (b > 671) { |
446 | 0 | return 6; |
447 | 0 | } |
448 | 2.16k | if (b > 239) { |
449 | 1.42k | return 5; |
450 | 1.42k | } |
451 | 743 | if (b > 79) { |
452 | 404 | return 4; |
453 | 404 | } |
454 | 339 | if (b > 23) { |
455 | 157 | return 3; |
456 | 157 | } |
457 | 182 | return 1; |
458 | 339 | } |
459 | | |
460 | | // TABLE_SIZE is the maximum precomputation table size for *variable* sliding |
461 | | // windows. This must be 2^(max_window - 1), where max_window is the largest |
462 | | // value returned from |BN_window_bits_for_exponent_size|. |
463 | | #define TABLE_SIZE 32 |
464 | | |
465 | | // TABLE_BITS_SMALL is the smallest value returned from |
466 | | // |BN_window_bits_for_exponent_size| when |b| is at most |BN_BITS2| * |
467 | | // |BN_SMALL_MAX_WORDS| words. |
468 | 0 | #define TABLE_BITS_SMALL 5 |
469 | | |
470 | | // TABLE_SIZE_SMALL is the same as |TABLE_SIZE|, but when |b| is at most |
471 | | // |BN_BITS2| * |BN_SMALL_MAX_WORDS|. |
472 | | #define TABLE_SIZE_SMALL (1 << (TABLE_BITS_SMALL - 1)) |
473 | | |
474 | | static int mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, |
475 | 1.34k | const BIGNUM *m, BN_CTX *ctx) { |
476 | 1.34k | int i, j, ret = 0, wstart, window; |
477 | 1.34k | int start = 1; |
478 | 1.34k | BIGNUM *aa; |
479 | | // Table of variables obtained from 'ctx' |
480 | 1.34k | BIGNUM *val[TABLE_SIZE]; |
481 | 1.34k | BN_RECP_CTX recp; |
482 | | |
483 | | // This function is only called on even moduli. |
484 | 1.34k | assert(!BN_is_odd(m)); |
485 | | |
486 | 1.34k | int bits = BN_num_bits(p); |
487 | 1.34k | if (bits == 0) { |
488 | 14 | return BN_one(r); |
489 | 14 | } |
490 | | |
491 | 1.32k | BN_RECP_CTX_init(&recp); |
492 | 1.32k | BN_CTX_start(ctx); |
493 | 1.32k | aa = BN_CTX_get(ctx); |
494 | 1.32k | val[0] = BN_CTX_get(ctx); |
495 | 1.32k | if (!aa || !val[0]) { |
496 | 0 | goto err; |
497 | 0 | } |
498 | | |
499 | 1.32k | if (m->neg) { |
500 | | // ignore sign of 'm' |
501 | 0 | if (!BN_copy(aa, m)) { |
502 | 0 | goto err; |
503 | 0 | } |
504 | 0 | aa->neg = 0; |
505 | 0 | if (BN_RECP_CTX_set(&recp, aa, ctx) <= 0) { |
506 | 0 | goto err; |
507 | 0 | } |
508 | 1.32k | } else { |
509 | 1.32k | if (BN_RECP_CTX_set(&recp, m, ctx) <= 0) { |
510 | 0 | goto err; |
511 | 0 | } |
512 | 1.32k | } |
513 | | |
514 | 1.32k | if (!BN_nnmod(val[0], a, m, ctx)) { |
515 | 0 | goto err; // 1 |
516 | 0 | } |
517 | 1.32k | if (BN_is_zero(val[0])) { |
518 | 7 | BN_zero(r); |
519 | 7 | ret = 1; |
520 | 7 | goto err; |
521 | 7 | } |
522 | | |
523 | 1.32k | window = BN_window_bits_for_exponent_size(bits); |
524 | 1.32k | if (window > 1) { |
525 | 1.23k | if (!BN_mod_mul_reciprocal(aa, val[0], val[0], &recp, ctx)) { |
526 | 0 | goto err; // 2 |
527 | 0 | } |
528 | 1.23k | j = 1 << (window - 1); |
529 | 16.6k | for (i = 1; i < j; i++) { |
530 | 15.3k | if (((val[i] = BN_CTX_get(ctx)) == NULL) || |
531 | 15.3k | !BN_mod_mul_reciprocal(val[i], val[i - 1], aa, &recp, ctx)) { |
532 | 0 | goto err; |
533 | 0 | } |
534 | 15.3k | } |
535 | 1.23k | } |
536 | | |
537 | 1.32k | start = 1; // This is used to avoid multiplication etc |
538 | | // when there is only the value '1' in the |
539 | | // buffer. |
540 | 1.32k | wstart = bits - 1; // The top bit of the window |
541 | | |
542 | 1.32k | if (!BN_one(r)) { |
543 | 0 | goto err; |
544 | 0 | } |
545 | | |
546 | 170k | for (;;) { |
547 | 170k | int wvalue; // The 'value' of the window |
548 | 170k | int wend; // The bottom bit of the window |
549 | | |
550 | 170k | if (!BN_is_bit_set(p, wstart)) { |
551 | 116k | if (!start) { |
552 | 116k | if (!BN_mod_mul_reciprocal(r, r, r, &recp, ctx)) { |
553 | 0 | goto err; |
554 | 0 | } |
555 | 116k | } |
556 | 116k | if (wstart == 0) { |
557 | 1.02k | break; |
558 | 1.02k | } |
559 | 115k | wstart--; |
560 | 115k | continue; |
561 | 116k | } |
562 | | |
563 | | // We now have wstart on a 'set' bit, we now need to work out |
564 | | // how bit a window to do. To do this we need to scan |
565 | | // forward until the last set bit before the end of the |
566 | | // window |
567 | 54.3k | wvalue = 1; |
568 | 54.3k | wend = 0; |
569 | 258k | for (i = 1; i < window; i++) { |
570 | 204k | if (wstart - i < 0) { |
571 | 428 | break; |
572 | 428 | } |
573 | 203k | if (BN_is_bit_set(p, wstart - i)) { |
574 | 103k | wvalue <<= (i - wend); |
575 | 103k | wvalue |= 1; |
576 | 103k | wend = i; |
577 | 103k | } |
578 | 203k | } |
579 | | |
580 | | // wend is the size of the current window |
581 | 54.3k | j = wend + 1; |
582 | | // add the 'bytes above' |
583 | 54.3k | if (!start) { |
584 | 256k | for (i = 0; i < j; i++) { |
585 | 203k | if (!BN_mod_mul_reciprocal(r, r, r, &recp, ctx)) { |
586 | 0 | goto err; |
587 | 0 | } |
588 | 203k | } |
589 | 53.0k | } |
590 | | |
591 | | // wvalue will be an odd number < 2^window |
592 | 54.3k | if (!BN_mod_mul_reciprocal(r, r, val[wvalue >> 1], &recp, ctx)) { |
593 | 0 | goto err; |
594 | 0 | } |
595 | | |
596 | | // move the 'window' down further |
597 | 54.3k | wstart -= wend + 1; |
598 | 54.3k | start = 0; |
599 | 54.3k | if (wstart < 0) { |
600 | 296 | break; |
601 | 296 | } |
602 | 54.3k | } |
603 | 1.32k | ret = 1; |
604 | | |
605 | 1.32k | err: |
606 | 1.32k | BN_CTX_end(ctx); |
607 | 1.32k | BN_RECP_CTX_free(&recp); |
608 | 1.32k | return ret; |
609 | 1.32k | } |
610 | | |
611 | | int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m, |
612 | 1.61k | BN_CTX *ctx) { |
613 | 1.61k | if (m->neg) { |
614 | 26 | OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); |
615 | 26 | return 0; |
616 | 26 | } |
617 | 1.58k | if (a->neg || BN_ucmp(a, m) >= 0) { |
618 | 957 | if (!BN_nnmod(r, a, m, ctx)) { |
619 | 2 | return 0; |
620 | 2 | } |
621 | 955 | a = r; |
622 | 955 | } |
623 | | |
624 | 1.58k | if (BN_is_odd(m)) { |
625 | 245 | return BN_mod_exp_mont(r, a, p, m, ctx, NULL); |
626 | 245 | } |
627 | | |
628 | 1.34k | return mod_exp_recp(r, a, p, m, ctx); |
629 | 1.58k | } |
630 | | |
631 | | int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, |
632 | 1.56k | const BIGNUM *m, BN_CTX *ctx, const BN_MONT_CTX *mont) { |
633 | 1.56k | if (!BN_is_odd(m)) { |
634 | 324 | OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS); |
635 | 324 | return 0; |
636 | 324 | } |
637 | 1.24k | if (m->neg) { |
638 | 36 | OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); |
639 | 36 | return 0; |
640 | 36 | } |
641 | | // |a| is secret, but |a < m| is not. |
642 | 1.20k | if (a->neg || constant_time_declassify_int(BN_ucmp(a, m)) >= 0) { |
643 | 283 | OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED); |
644 | 283 | return 0; |
645 | 283 | } |
646 | | |
647 | 926 | int bits = BN_num_bits(p); |
648 | 926 | if (bits == 0) { |
649 | | // x**0 mod 1 is still zero. |
650 | 80 | if (BN_abs_is_word(m, 1)) { |
651 | 4 | BN_zero(rr); |
652 | 4 | return 1; |
653 | 4 | } |
654 | 76 | return BN_one(rr); |
655 | 80 | } |
656 | | |
657 | 846 | int ret = 0; |
658 | 846 | BIGNUM *val[TABLE_SIZE]; |
659 | 846 | BN_MONT_CTX *new_mont = NULL; |
660 | | |
661 | 846 | BN_CTX_start(ctx); |
662 | 846 | BIGNUM *r = BN_CTX_get(ctx); |
663 | 846 | val[0] = BN_CTX_get(ctx); |
664 | 846 | if (r == NULL || val[0] == NULL) { |
665 | 0 | goto err; |
666 | 0 | } |
667 | | |
668 | | // Allocate a montgomery context if it was not supplied by the caller. |
669 | 846 | if (mont == NULL) { |
670 | 805 | new_mont = BN_MONT_CTX_new_consttime(m, ctx); |
671 | 805 | if (new_mont == NULL) { |
672 | 0 | goto err; |
673 | 0 | } |
674 | 805 | mont = new_mont; |
675 | 805 | } |
676 | | |
677 | | // We exponentiate by looking at sliding windows of the exponent and |
678 | | // precomputing powers of |a|. Windows may be shifted so they always end on a |
679 | | // set bit, so only precompute odd powers. We compute val[i] = a^(2*i + 1) |
680 | | // for i = 0 to 2^(window-1), all in Montgomery form. |
681 | 846 | int window = BN_window_bits_for_exponent_size(bits); |
682 | 846 | if (!BN_to_montgomery(val[0], a, mont, ctx)) { |
683 | 0 | goto err; |
684 | 0 | } |
685 | 846 | if (window > 1) { |
686 | 753 | BIGNUM *d = BN_CTX_get(ctx); |
687 | 753 | if (d == NULL || |
688 | 753 | !BN_mod_mul_montgomery(d, val[0], val[0], mont, ctx)) { |
689 | 0 | goto err; |
690 | 0 | } |
691 | 10.0k | for (int i = 1; i < 1 << (window - 1); i++) { |
692 | 9.27k | val[i] = BN_CTX_get(ctx); |
693 | 9.27k | if (val[i] == NULL || |
694 | 9.27k | !BN_mod_mul_montgomery(val[i], val[i - 1], d, mont, ctx)) { |
695 | 0 | goto err; |
696 | 0 | } |
697 | 9.27k | } |
698 | 753 | } |
699 | | |
700 | | // |p| is non-zero, so at least one window is non-zero. To save some |
701 | | // multiplications, defer initializing |r| until then. |
702 | 846 | int r_is_one = 1; |
703 | 846 | int wstart = bits - 1; // The top bit of the window. |
704 | 100k | for (;;) { |
705 | 100k | if (!BN_is_bit_set(p, wstart)) { |
706 | 67.9k | if (!r_is_one && !BN_mod_mul_montgomery(r, r, r, mont, ctx)) { |
707 | 0 | goto err; |
708 | 0 | } |
709 | 67.9k | if (wstart == 0) { |
710 | 559 | break; |
711 | 559 | } |
712 | 67.3k | wstart--; |
713 | 67.3k | continue; |
714 | 67.9k | } |
715 | | |
716 | | // We now have wstart on a set bit. Find the largest window we can use. |
717 | 32.7k | int wvalue = 1; |
718 | 32.7k | int wsize = 0; |
719 | 155k | for (int i = 1; i < window && i <= wstart; i++) { |
720 | 122k | if (BN_is_bit_set(p, wstart - i)) { |
721 | 63.6k | wvalue <<= (i - wsize); |
722 | 63.6k | wvalue |= 1; |
723 | 63.6k | wsize = i; |
724 | 63.6k | } |
725 | 122k | } |
726 | | |
727 | | // Shift |r| to the end of the window. |
728 | 32.7k | if (!r_is_one) { |
729 | 155k | for (int i = 0; i < wsize + 1; i++) { |
730 | 123k | if (!BN_mod_mul_montgomery(r, r, r, mont, ctx)) { |
731 | 0 | goto err; |
732 | 0 | } |
733 | 123k | } |
734 | 31.9k | } |
735 | | |
736 | 32.7k | assert(wvalue & 1); |
737 | 32.7k | assert(wvalue < (1 << window)); |
738 | 32.7k | if (r_is_one) { |
739 | 846 | if (!BN_copy(r, val[wvalue >> 1])) { |
740 | 0 | goto err; |
741 | 0 | } |
742 | 31.9k | } else if (!BN_mod_mul_montgomery(r, r, val[wvalue >> 1], mont, ctx)) { |
743 | 0 | goto err; |
744 | 0 | } |
745 | | |
746 | 32.7k | r_is_one = 0; |
747 | 32.7k | if (wstart == wsize) { |
748 | 287 | break; |
749 | 287 | } |
750 | 32.4k | wstart -= wsize + 1; |
751 | 32.4k | } |
752 | | |
753 | | // |p| is non-zero, so |r_is_one| must be cleared at some point. |
754 | 846 | assert(!r_is_one); |
755 | | |
756 | 846 | if (!BN_from_montgomery(rr, r, mont, ctx)) { |
757 | 0 | goto err; |
758 | 0 | } |
759 | 846 | ret = 1; |
760 | | |
761 | 846 | err: |
762 | 846 | BN_MONT_CTX_free(new_mont); |
763 | 846 | BN_CTX_end(ctx); |
764 | 846 | return ret; |
765 | 846 | } |
766 | | |
767 | | void bn_mod_exp_mont_small(BN_ULONG *r, const BN_ULONG *a, size_t num, |
768 | | const BN_ULONG *p, size_t num_p, |
769 | 0 | const BN_MONT_CTX *mont) { |
770 | 0 | if (num != (size_t)mont->N.width || num > BN_SMALL_MAX_WORDS || |
771 | 0 | num_p > SIZE_MAX / BN_BITS2) { |
772 | 0 | abort(); |
773 | 0 | } |
774 | 0 | assert(BN_is_odd(&mont->N)); |
775 | | |
776 | | // Count the number of bits in |p|, skipping leading zeros. Note this function |
777 | | // treats |p| as public. |
778 | 0 | while (num_p != 0 && p[num_p - 1] == 0) { |
779 | 0 | num_p--; |
780 | 0 | } |
781 | 0 | if (num_p == 0) { |
782 | 0 | bn_from_montgomery_small(r, num, mont->RR.d, num, mont); |
783 | 0 | return; |
784 | 0 | } |
785 | 0 | size_t bits = BN_num_bits_word(p[num_p - 1]) + (num_p - 1) * BN_BITS2; |
786 | 0 | assert(bits != 0); |
787 | | |
788 | | // We exponentiate by looking at sliding windows of the exponent and |
789 | | // precomputing powers of |a|. Windows may be shifted so they always end on a |
790 | | // set bit, so only precompute odd powers. We compute val[i] = a^(2*i + 1) for |
791 | | // i = 0 to 2^(window-1), all in Montgomery form. |
792 | 0 | unsigned window = BN_window_bits_for_exponent_size(bits); |
793 | 0 | if (window > TABLE_BITS_SMALL) { |
794 | 0 | window = TABLE_BITS_SMALL; // Tolerate excessively large |p|. |
795 | 0 | } |
796 | 0 | BN_ULONG val[TABLE_SIZE_SMALL][BN_SMALL_MAX_WORDS]; |
797 | 0 | OPENSSL_memcpy(val[0], a, num * sizeof(BN_ULONG)); |
798 | 0 | if (window > 1) { |
799 | 0 | BN_ULONG d[BN_SMALL_MAX_WORDS]; |
800 | 0 | bn_mod_mul_montgomery_small(d, val[0], val[0], num, mont); |
801 | 0 | for (unsigned i = 1; i < 1u << (window - 1); i++) { |
802 | 0 | bn_mod_mul_montgomery_small(val[i], val[i - 1], d, num, mont); |
803 | 0 | } |
804 | 0 | } |
805 | | |
806 | | // |p| is non-zero, so at least one window is non-zero. To save some |
807 | | // multiplications, defer initializing |r| until then. |
808 | 0 | int r_is_one = 1; |
809 | 0 | size_t wstart = bits - 1; // The top bit of the window. |
810 | 0 | for (;;) { |
811 | 0 | if (!bn_is_bit_set_words(p, num_p, wstart)) { |
812 | 0 | if (!r_is_one) { |
813 | 0 | bn_mod_mul_montgomery_small(r, r, r, num, mont); |
814 | 0 | } |
815 | 0 | if (wstart == 0) { |
816 | 0 | break; |
817 | 0 | } |
818 | 0 | wstart--; |
819 | 0 | continue; |
820 | 0 | } |
821 | | |
822 | | // We now have wstart on a set bit. Find the largest window we can use. |
823 | 0 | unsigned wvalue = 1; |
824 | 0 | unsigned wsize = 0; |
825 | 0 | for (unsigned i = 1; i < window && i <= wstart; i++) { |
826 | 0 | if (bn_is_bit_set_words(p, num_p, wstart - i)) { |
827 | 0 | wvalue <<= (i - wsize); |
828 | 0 | wvalue |= 1; |
829 | 0 | wsize = i; |
830 | 0 | } |
831 | 0 | } |
832 | | |
833 | | // Shift |r| to the end of the window. |
834 | 0 | if (!r_is_one) { |
835 | 0 | for (unsigned i = 0; i < wsize + 1; i++) { |
836 | 0 | bn_mod_mul_montgomery_small(r, r, r, num, mont); |
837 | 0 | } |
838 | 0 | } |
839 | |
|
840 | 0 | assert(wvalue & 1); |
841 | 0 | assert(wvalue < (1u << window)); |
842 | 0 | if (r_is_one) { |
843 | 0 | OPENSSL_memcpy(r, val[wvalue >> 1], num * sizeof(BN_ULONG)); |
844 | 0 | } else { |
845 | 0 | bn_mod_mul_montgomery_small(r, r, val[wvalue >> 1], num, mont); |
846 | 0 | } |
847 | 0 | r_is_one = 0; |
848 | 0 | if (wstart == wsize) { |
849 | 0 | break; |
850 | 0 | } |
851 | 0 | wstart -= wsize + 1; |
852 | 0 | } |
853 | | |
854 | | // |p| is non-zero, so |r_is_one| must be cleared at some point. |
855 | 0 | assert(!r_is_one); |
856 | 0 | OPENSSL_cleanse(val, sizeof(val)); |
857 | 0 | } |
858 | | |
859 | | void bn_mod_inverse0_prime_mont_small(BN_ULONG *r, const BN_ULONG *a, |
860 | 0 | size_t num, const BN_MONT_CTX *mont) { |
861 | 0 | if (num != (size_t)mont->N.width || num > BN_SMALL_MAX_WORDS) { |
862 | 0 | abort(); |
863 | 0 | } |
864 | | |
865 | | // Per Fermat's Little Theorem, a^-1 = a^(p-2) (mod p) for p prime. |
866 | 0 | BN_ULONG p_minus_two[BN_SMALL_MAX_WORDS]; |
867 | 0 | const BN_ULONG *p = mont->N.d; |
868 | 0 | OPENSSL_memcpy(p_minus_two, p, num * sizeof(BN_ULONG)); |
869 | 0 | if (p_minus_two[0] >= 2) { |
870 | 0 | p_minus_two[0] -= 2; |
871 | 0 | } else { |
872 | 0 | p_minus_two[0] -= 2; |
873 | 0 | for (size_t i = 1; i < num; i++) { |
874 | 0 | if (p_minus_two[i]-- != 0) { |
875 | 0 | break; |
876 | 0 | } |
877 | 0 | } |
878 | 0 | } |
879 | |
|
880 | 0 | bn_mod_exp_mont_small(r, a, num, p_minus_two, num, mont); |
881 | 0 | } |
882 | | |
883 | | static void copy_to_prebuf(const BIGNUM *b, int top, BN_ULONG *table, int idx, |
884 | 147k | int window) { |
885 | 147k | int ret = bn_copy_words(table + idx * top, top, b); |
886 | 147k | assert(ret); // |b| is guaranteed to fit. |
887 | 147k | (void)ret; |
888 | 147k | } |
889 | | |
890 | | static int copy_from_prebuf(BIGNUM *b, int top, const BN_ULONG *table, int idx, |
891 | 410k | int window) { |
892 | 410k | if (!bn_wexpand(b, top)) { |
893 | 0 | return 0; |
894 | 0 | } |
895 | | |
896 | 410k | OPENSSL_memset(b->d, 0, sizeof(BN_ULONG) * top); |
897 | 410k | const int width = 1 << window; |
898 | 4.62M | for (int i = 0; i < width; i++, table += top) { |
899 | | // Use a value barrier to prevent Clang from adding a branch when |i != idx| |
900 | | // and making this copy not constant time. Clang is still allowed to learn |
901 | | // that |mask| is constant across the inner loop, so this won't inhibit any |
902 | | // vectorization it might do. |
903 | 4.21M | BN_ULONG mask = value_barrier_w(constant_time_eq_int(i, idx)); |
904 | 12.6M | for (int j = 0; j < top; j++) { |
905 | 8.47M | b->d[j] |= table[j] & mask; |
906 | 8.47M | } |
907 | 4.21M | } |
908 | | |
909 | 410k | b->width = top; |
910 | 410k | return 1; |
911 | 410k | } |
912 | | |
913 | | // Window sizes optimized for fixed window size modular exponentiation |
914 | | // algorithm (BN_mod_exp_mont_consttime). |
915 | | // |
916 | | // TODO(davidben): These window sizes were originally set for 64-byte cache |
917 | | // lines with a cache-line-dependent constant-time mitigation. They can probably |
918 | | // be revised now that our implementation is no longer cache-time-dependent. |
919 | | #define BN_window_bits_for_ctime_exponent_size(b) \ |
920 | 20.8k | ((b) > 937 ? 6 : (b) > 306 ? 5 : (b) > 89 ? 4 : (b) > 22 ? 3 : 1) |
921 | | #define BN_MAX_MOD_EXP_CTIME_WINDOW (6) |
922 | | |
923 | | // This variant of |BN_mod_exp_mont| uses fixed windows and fixed memory access |
924 | | // patterns to protect secret exponents (cf. the hyper-threading timing attacks |
925 | | // pointed out by Colin Percival, |
926 | | // http://www.daemonology.net/hyperthreading-considered-harmful/) |
927 | | int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, |
928 | | const BIGNUM *m, BN_CTX *ctx, |
929 | 22.1k | const BN_MONT_CTX *mont) { |
930 | 22.1k | int i, ret = 0, wvalue; |
931 | 22.1k | BN_MONT_CTX *new_mont = NULL; |
932 | | |
933 | 22.1k | unsigned char *powerbuf_free = NULL; |
934 | 22.1k | size_t powerbuf_len = 0; |
935 | 22.1k | BN_ULONG *powerbuf = NULL; |
936 | | |
937 | 22.1k | if (!BN_is_odd(m)) { |
938 | 932 | OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS); |
939 | 932 | return 0; |
940 | 932 | } |
941 | 21.1k | if (m->neg) { |
942 | 59 | OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); |
943 | 59 | return 0; |
944 | 59 | } |
945 | | // |a| is secret, but it is required to be in range, so these comparisons may |
946 | | // be leaked. |
947 | 21.1k | if (a->neg || constant_time_declassify_int(BN_ucmp(a, m) >= 0)) { |
948 | 282 | OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED); |
949 | 282 | return 0; |
950 | 282 | } |
951 | | |
952 | | // Use all bits stored in |p|, rather than |BN_num_bits|, so we do not leak |
953 | | // whether the top bits are zero. |
954 | 20.8k | int max_bits = p->width * BN_BITS2; |
955 | 20.8k | int bits = max_bits; |
956 | 20.8k | if (bits == 0) { |
957 | | // x**0 mod 1 is still zero. |
958 | 39 | if (BN_abs_is_word(m, 1)) { |
959 | 2 | BN_zero(rr); |
960 | 2 | return 1; |
961 | 2 | } |
962 | 37 | return BN_one(rr); |
963 | 39 | } |
964 | | |
965 | | // Allocate a montgomery context if it was not supplied by the caller. |
966 | 20.8k | if (mont == NULL) { |
967 | 876 | new_mont = BN_MONT_CTX_new_consttime(m, ctx); |
968 | 876 | if (new_mont == NULL) { |
969 | 0 | goto err; |
970 | 0 | } |
971 | 876 | mont = new_mont; |
972 | 876 | } |
973 | | |
974 | | // Use the width in |mont->N|, rather than the copy in |m|. The assembly |
975 | | // implementation assumes it can use |top| to size R. |
976 | 20.8k | int top = mont->N.width; |
977 | | |
978 | 20.8k | #if defined(OPENSSL_BN_ASM_MONT5) || defined(RSAZ_ENABLED) |
979 | | // Share one large stack-allocated buffer between the RSAZ and non-RSAZ code |
980 | | // paths. If we were to use separate static buffers for each then there is |
981 | | // some chance that both large buffers would be allocated on the stack, |
982 | | // causing the stack space requirement to be truly huge (~10KB). |
983 | 20.8k | alignas(MOD_EXP_CTIME_ALIGN) BN_ULONG storage[MOD_EXP_CTIME_STORAGE_LEN]; |
984 | 20.8k | #endif |
985 | 20.8k | #if defined(RSAZ_ENABLED) |
986 | | // If the size of the operands allow it, perform the optimized RSAZ |
987 | | // exponentiation. For further information see crypto/fipsmodule/bn/rsaz_exp.c |
988 | | // and accompanying assembly modules. |
989 | 20.8k | if (a->width == 16 && p->width == 16 && BN_num_bits(m) == 1024 && |
990 | 20.8k | rsaz_avx2_preferred()) { |
991 | 0 | if (!bn_wexpand(rr, 16)) { |
992 | 0 | goto err; |
993 | 0 | } |
994 | 0 | RSAZ_1024_mod_exp_avx2(rr->d, a->d, p->d, m->d, mont->RR.d, mont->n0[0], |
995 | 0 | storage); |
996 | 0 | rr->width = 16; |
997 | 0 | rr->neg = 0; |
998 | 0 | ret = 1; |
999 | 0 | goto err; |
1000 | 0 | } |
1001 | 20.8k | #endif |
1002 | | |
1003 | | // Get the window size to use with size of p. |
1004 | 20.8k | int window = BN_window_bits_for_ctime_exponent_size(bits); |
1005 | 20.8k | assert(window <= BN_MAX_MOD_EXP_CTIME_WINDOW); |
1006 | | |
1007 | | // Calculating |powerbuf_len| below cannot overflow because of the bound on |
1008 | | // Montgomery reduction. |
1009 | 20.8k | assert((size_t)top <= BN_MONTGOMERY_MAX_WORDS); |
1010 | 20.8k | static_assert( |
1011 | 20.8k | BN_MONTGOMERY_MAX_WORDS <= |
1012 | 20.8k | INT_MAX / sizeof(BN_ULONG) / ((1 << BN_MAX_MOD_EXP_CTIME_WINDOW) + 3), |
1013 | 20.8k | "powerbuf_len may overflow"); |
1014 | | |
1015 | 20.8k | #if defined(OPENSSL_BN_ASM_MONT5) |
1016 | 20.8k | if (window >= 5) { |
1017 | 4.92k | window = 5; // ~5% improvement for RSA2048 sign, and even for RSA4096 |
1018 | | // Reserve space for the |mont->N| copy. |
1019 | 4.92k | powerbuf_len += top * sizeof(mont->N.d[0]); |
1020 | 4.92k | } |
1021 | 20.8k | #endif |
1022 | | |
1023 | | // Allocate a buffer large enough to hold all of the pre-computed |
1024 | | // powers of |am|, |am| itself, and |tmp|. |
1025 | 20.8k | int num_powers = 1 << window; |
1026 | 20.8k | powerbuf_len += sizeof(m->d[0]) * top * (num_powers + 2); |
1027 | | |
1028 | 20.8k | #if defined(OPENSSL_BN_ASM_MONT5) |
1029 | 20.8k | if (powerbuf_len <= sizeof(storage)) { |
1030 | 20.8k | powerbuf = storage; |
1031 | 20.8k | } |
1032 | | // |storage| is more than large enough to handle 1024-bit inputs. |
1033 | 20.8k | assert(powerbuf != NULL || top * BN_BITS2 > 1024); |
1034 | 20.8k | #endif |
1035 | 20.8k | if (powerbuf == NULL) { |
1036 | 0 | powerbuf_free = OPENSSL_malloc(powerbuf_len + MOD_EXP_CTIME_ALIGN); |
1037 | 0 | if (powerbuf_free == NULL) { |
1038 | 0 | goto err; |
1039 | 0 | } |
1040 | 0 | powerbuf = align_pointer(powerbuf_free, MOD_EXP_CTIME_ALIGN); |
1041 | 0 | } |
1042 | 20.8k | OPENSSL_memset(powerbuf, 0, powerbuf_len); |
1043 | | |
1044 | | // Place |tmp| and |am| right after powers table. |
1045 | 20.8k | BIGNUM tmp, am; |
1046 | 20.8k | tmp.d = powerbuf + top * num_powers; |
1047 | 20.8k | am.d = tmp.d + top; |
1048 | 20.8k | tmp.width = am.width = 0; |
1049 | 20.8k | tmp.dmax = am.dmax = top; |
1050 | 20.8k | tmp.neg = am.neg = 0; |
1051 | 20.8k | tmp.flags = am.flags = BN_FLG_STATIC_DATA; |
1052 | | |
1053 | 20.8k | if (!bn_one_to_montgomery(&tmp, mont, ctx) || |
1054 | 20.8k | !bn_resize_words(&tmp, top)) { |
1055 | 0 | goto err; |
1056 | 0 | } |
1057 | | |
1058 | | // Prepare a^1 in the Montgomery domain. |
1059 | 20.8k | assert(!a->neg); |
1060 | 20.8k | declassify_assert(BN_ucmp(a, m) < 0); |
1061 | 20.8k | if (!BN_to_montgomery(&am, a, mont, ctx) || |
1062 | 20.8k | !bn_resize_words(&am, top)) { |
1063 | 0 | goto err; |
1064 | 0 | } |
1065 | | |
1066 | 20.8k | #if defined(OPENSSL_BN_ASM_MONT5) |
1067 | | // This optimization uses ideas from https://eprint.iacr.org/2011/239, |
1068 | | // specifically optimization of cache-timing attack countermeasures, |
1069 | | // pre-computation optimization, and Almost Montgomery Multiplication. |
1070 | | // |
1071 | | // The paper discusses a 4-bit window to optimize 512-bit modular |
1072 | | // exponentiation, used in RSA-1024 with CRT, but RSA-1024 is no longer |
1073 | | // important. |
1074 | | // |
1075 | | // |bn_mul_mont_gather5| and |bn_power5| implement the "almost" reduction |
1076 | | // variant, so the values here may not be fully reduced. They are bounded by R |
1077 | | // (i.e. they fit in |top| words), not |m|. Additionally, we pass these |
1078 | | // "almost" reduced inputs into |bn_mul_mont|, which implements the normal |
1079 | | // reduction variant. Given those inputs, |bn_mul_mont| may not give reduced |
1080 | | // output, but it will still produce "almost" reduced output. |
1081 | | // |
1082 | | // TODO(davidben): Using "almost" reduction complicates analysis of this code, |
1083 | | // and its interaction with other parts of the project. Determine whether this |
1084 | | // is actually necessary for performance. |
1085 | 20.8k | if (window == 5 && top > 1) { |
1086 | | // Copy |mont->N| to improve cache locality. |
1087 | 4.89k | BN_ULONG *np = am.d + top; |
1088 | 30.0k | for (i = 0; i < top; i++) { |
1089 | 25.1k | np[i] = mont->N.d[i]; |
1090 | 25.1k | } |
1091 | | |
1092 | | // Fill |powerbuf| with the first 32 powers of |am|. |
1093 | 4.89k | const BN_ULONG *n0 = mont->n0; |
1094 | 4.89k | bn_scatter5(tmp.d, top, powerbuf, 0); |
1095 | 4.89k | bn_scatter5(am.d, am.width, powerbuf, 1); |
1096 | 4.89k | bn_mul_mont(tmp.d, am.d, am.d, np, n0, top); |
1097 | 4.89k | bn_scatter5(tmp.d, top, powerbuf, 2); |
1098 | | |
1099 | | // Square to compute powers of two. |
1100 | 19.5k | for (i = 4; i < 32; i *= 2) { |
1101 | 14.6k | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); |
1102 | 14.6k | bn_scatter5(tmp.d, top, powerbuf, i); |
1103 | 14.6k | } |
1104 | | // Compute odd powers |i| based on |i - 1|, then all powers |i * 2^j|. |
1105 | 78.3k | for (i = 3; i < 32; i += 2) { |
1106 | 73.4k | bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1); |
1107 | 73.4k | bn_scatter5(tmp.d, top, powerbuf, i); |
1108 | 127k | for (int j = 2 * i; j < 32; j *= 2) { |
1109 | 53.8k | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); |
1110 | 53.8k | bn_scatter5(tmp.d, top, powerbuf, j); |
1111 | 53.8k | } |
1112 | 73.4k | } |
1113 | | |
1114 | 4.89k | bits--; |
1115 | 28.6k | for (wvalue = 0, i = bits % 5; i >= 0; i--, bits--) { |
1116 | 23.7k | wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); |
1117 | 23.7k | } |
1118 | 4.89k | bn_gather5(tmp.d, top, powerbuf, wvalue); |
1119 | | |
1120 | | // At this point |bits| is 4 mod 5 and at least -1. (|bits| is the first bit |
1121 | | // that has not been read yet.) |
1122 | 4.89k | assert(bits >= -1 && (bits == -1 || bits % 5 == 4)); |
1123 | | |
1124 | | // Scan the exponent one window at a time starting from the most |
1125 | | // significant bits. |
1126 | 4.89k | if (!bn_power5_capable(top)) { |
1127 | 322k | while (bits >= 0) { |
1128 | 1.90M | for (wvalue = 0, i = 0; i < 5; i++, bits--) { |
1129 | 1.58M | wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); |
1130 | 1.58M | } |
1131 | | |
1132 | 317k | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); |
1133 | 317k | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); |
1134 | 317k | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); |
1135 | 317k | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); |
1136 | 317k | bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); |
1137 | 317k | bn_mul_mont_gather5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue); |
1138 | 317k | } |
1139 | 4.89k | } else { |
1140 | 0 | const uint8_t *p_bytes = (const uint8_t *)p->d; |
1141 | 0 | assert(bits < max_bits); |
1142 | | // |p = 0| has been handled as a special case, so |max_bits| is at least |
1143 | | // one word. |
1144 | 0 | assert(max_bits >= 64); |
1145 | | |
1146 | | // If the first bit to be read lands in the last byte, unroll the first |
1147 | | // iteration to avoid reading past the bounds of |p->d|. (After the first |
1148 | | // iteration, we are guaranteed to be past the last byte.) Note |bits| |
1149 | | // here is the top bit, inclusive. |
1150 | 0 | if (bits - 4 >= max_bits - 8) { |
1151 | | // Read five bits from |bits-4| through |bits|, inclusive. |
1152 | 0 | wvalue = p_bytes[p->width * BN_BYTES - 1]; |
1153 | 0 | wvalue >>= (bits - 4) & 7; |
1154 | 0 | wvalue &= 0x1f; |
1155 | 0 | bits -= 5; |
1156 | 0 | bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue); |
1157 | 0 | } |
1158 | 0 | while (bits >= 0) { |
1159 | | // Read five bits from |bits-4| through |bits|, inclusive. |
1160 | 0 | int first_bit = bits - 4; |
1161 | 0 | uint16_t val; |
1162 | 0 | OPENSSL_memcpy(&val, p_bytes + (first_bit >> 3), sizeof(val)); |
1163 | 0 | val >>= first_bit & 7; |
1164 | 0 | val &= 0x1f; |
1165 | 0 | bits -= 5; |
1166 | 0 | bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, val); |
1167 | 0 | } |
1168 | 0 | } |
1169 | | // The result is now in |tmp| in Montgomery form, but it may not be fully |
1170 | | // reduced. This is within bounds for |BN_from_montgomery| (tmp < R <= m*R) |
1171 | | // so it will, when converting from Montgomery form, produce a fully reduced |
1172 | | // result. |
1173 | | // |
1174 | | // This differs from Figure 2 of the paper, which uses AMM(h, 1) to convert |
1175 | | // from Montgomery form with unreduced output, followed by an extra |
1176 | | // reduction step. In the paper's terminology, we replace steps 9 and 10 |
1177 | | // with MM(h, 1). |
1178 | 4.89k | } else |
1179 | 15.9k | #endif |
1180 | 15.9k | { |
1181 | 15.9k | copy_to_prebuf(&tmp, top, powerbuf, 0, window); |
1182 | 15.9k | copy_to_prebuf(&am, top, powerbuf, 1, window); |
1183 | | |
1184 | | // If the window size is greater than 1, then calculate |
1185 | | // val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1) |
1186 | | // (even powers could instead be computed as (a^(i/2))^2 |
1187 | | // to use the slight performance advantage of sqr over mul). |
1188 | 15.9k | if (window > 1) { |
1189 | 15.9k | if (!BN_mod_mul_montgomery(&tmp, &am, &am, mont, ctx)) { |
1190 | 0 | goto err; |
1191 | 0 | } |
1192 | | |
1193 | 15.9k | copy_to_prebuf(&tmp, top, powerbuf, 2, window); |
1194 | | |
1195 | 115k | for (i = 3; i < num_powers; i++) { |
1196 | | // Calculate a^i = a^(i-1) * a |
1197 | 99.3k | if (!BN_mod_mul_montgomery(&tmp, &am, &tmp, mont, ctx)) { |
1198 | 0 | goto err; |
1199 | 0 | } |
1200 | | |
1201 | 99.3k | copy_to_prebuf(&tmp, top, powerbuf, i, window); |
1202 | 99.3k | } |
1203 | 15.9k | } |
1204 | | |
1205 | 15.9k | bits--; |
1206 | 39.0k | for (wvalue = 0, i = bits % window; i >= 0; i--, bits--) { |
1207 | 23.1k | wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); |
1208 | 23.1k | } |
1209 | 15.9k | if (!copy_from_prebuf(&tmp, top, powerbuf, wvalue, window)) { |
1210 | 0 | goto err; |
1211 | 0 | } |
1212 | | |
1213 | | // Scan the exponent one window at a time starting from the most |
1214 | | // significant bits. |
1215 | 410k | while (bits >= 0) { |
1216 | 394k | wvalue = 0; // The 'value' of the window |
1217 | | |
1218 | | // Scan the window, squaring the result as we go |
1219 | 1.68M | for (i = 0; i < window; i++, bits--) { |
1220 | 1.29M | if (!BN_mod_mul_montgomery(&tmp, &tmp, &tmp, mont, ctx)) { |
1221 | 0 | goto err; |
1222 | 0 | } |
1223 | 1.29M | wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); |
1224 | 1.29M | } |
1225 | | |
1226 | | // Fetch the appropriate pre-computed value from the pre-buf |
1227 | 394k | if (!copy_from_prebuf(&am, top, powerbuf, wvalue, window)) { |
1228 | 0 | goto err; |
1229 | 0 | } |
1230 | | |
1231 | | // Multiply the result into the intermediate result |
1232 | 394k | if (!BN_mod_mul_montgomery(&tmp, &tmp, &am, mont, ctx)) { |
1233 | 0 | goto err; |
1234 | 0 | } |
1235 | 394k | } |
1236 | 15.9k | } |
1237 | | |
1238 | | // Convert the final result from Montgomery to standard format. If we used the |
1239 | | // |OPENSSL_BN_ASM_MONT5| codepath, |tmp| may not be fully reduced. It is only |
1240 | | // bounded by R rather than |m|. However, that is still within bounds for |
1241 | | // |BN_from_montgomery|, which implements full Montgomery reduction, not |
1242 | | // "almost" Montgomery reduction. |
1243 | 20.8k | if (!BN_from_montgomery(rr, &tmp, mont, ctx)) { |
1244 | 0 | goto err; |
1245 | 0 | } |
1246 | 20.8k | ret = 1; |
1247 | | |
1248 | 20.8k | err: |
1249 | 20.8k | BN_MONT_CTX_free(new_mont); |
1250 | 20.8k | if (powerbuf != NULL && powerbuf_free == NULL) { |
1251 | 20.8k | OPENSSL_cleanse(powerbuf, powerbuf_len); |
1252 | 20.8k | } |
1253 | 20.8k | OPENSSL_free(powerbuf_free); |
1254 | 20.8k | return ret; |
1255 | 20.8k | } |
1256 | | |
1257 | | int BN_mod_exp_mont_word(BIGNUM *rr, BN_ULONG a, const BIGNUM *p, |
1258 | | const BIGNUM *m, BN_CTX *ctx, |
1259 | | const BN_MONT_CTX *mont) { |
1260 | | BIGNUM a_bignum; |
1261 | | BN_init(&a_bignum); |
1262 | | |
1263 | | int ret = 0; |
1264 | | |
1265 | | // BN_mod_exp_mont requires reduced inputs. |
1266 | | if (bn_minimal_width(m) == 1) { |
1267 | | a %= m->d[0]; |
1268 | | } |
1269 | | |
1270 | | if (!BN_set_word(&a_bignum, a)) { |
1271 | | OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR); |
1272 | | goto err; |
1273 | | } |
1274 | | |
1275 | | ret = BN_mod_exp_mont(rr, &a_bignum, p, m, ctx, mont); |
1276 | | |
1277 | | err: |
1278 | | BN_free(&a_bignum); |
1279 | | |
1280 | | return ret; |
1281 | | } |
1282 | | |
1283 | | #define TABLE_SIZE 32 |
1284 | | |
1285 | | int BN_mod_exp2_mont(BIGNUM *rr, const BIGNUM *a1, const BIGNUM *p1, |
1286 | | const BIGNUM *a2, const BIGNUM *p2, const BIGNUM *m, |
1287 | 0 | BN_CTX *ctx, const BN_MONT_CTX *mont) { |
1288 | 0 | BIGNUM tmp; |
1289 | 0 | BN_init(&tmp); |
1290 | |
|
1291 | 0 | int ret = 0; |
1292 | 0 | BN_MONT_CTX *new_mont = NULL; |
1293 | | |
1294 | | // Allocate a montgomery context if it was not supplied by the caller. |
1295 | 0 | if (mont == NULL) { |
1296 | 0 | new_mont = BN_MONT_CTX_new_for_modulus(m, ctx); |
1297 | 0 | if (new_mont == NULL) { |
1298 | 0 | goto err; |
1299 | 0 | } |
1300 | 0 | mont = new_mont; |
1301 | 0 | } |
1302 | | |
1303 | | // BN_mod_mul_montgomery removes one Montgomery factor, so passing one |
1304 | | // Montgomery-encoded and one non-Montgomery-encoded value gives a |
1305 | | // non-Montgomery-encoded result. |
1306 | 0 | if (!BN_mod_exp_mont(rr, a1, p1, m, ctx, mont) || |
1307 | 0 | !BN_mod_exp_mont(&tmp, a2, p2, m, ctx, mont) || |
1308 | 0 | !BN_to_montgomery(rr, rr, mont, ctx) || |
1309 | 0 | !BN_mod_mul_montgomery(rr, rr, &tmp, mont, ctx)) { |
1310 | 0 | goto err; |
1311 | 0 | } |
1312 | | |
1313 | 0 | ret = 1; |
1314 | |
|
1315 | 0 | err: |
1316 | 0 | BN_MONT_CTX_free(new_mont); |
1317 | 0 | BN_free(&tmp); |
1318 | |
|
1319 | 0 | return ret; |
1320 | 0 | } |