/src/openssl30/crypto/bn/bn_exp.c
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /*  | 
2  |  |  * Copyright 1995-2025 The OpenSSL Project Authors. All Rights Reserved.  | 
3  |  |  *  | 
4  |  |  * Licensed under the Apache License 2.0 (the "License").  You may not use  | 
5  |  |  * this file except in compliance with the License.  You can obtain a copy  | 
6  |  |  * in the file LICENSE in the source distribution or at  | 
7  |  |  * https://www.openssl.org/source/license.html  | 
8  |  |  */  | 
9  |  |  | 
10  |  | #include "internal/cryptlib.h"  | 
11  |  | #include "internal/constant_time.h"  | 
12  |  | #include "bn_local.h"  | 
13  |  |  | 
14  |  | #include <stdlib.h>  | 
15  |  | #ifdef _WIN32  | 
16  |  | # include <malloc.h>  | 
17  |  | # ifndef alloca  | 
18  |  | #  define alloca _alloca  | 
19  |  | # endif  | 
20  |  | #elif defined(__GNUC__)  | 
21  |  | # ifndef alloca  | 
22  |  | #  define alloca(s) __builtin_alloca((s))  | 
23  |  | # endif  | 
24  |  | #elif defined(__sun)  | 
25  |  | # include <alloca.h>  | 
26  |  | #endif  | 
27  |  |  | 
28  |  | #include "rsaz_exp.h"  | 
29  |  |  | 
30  |  | #undef SPARC_T4_MONT  | 
31  |  | #if defined(OPENSSL_BN_ASM_MONT) && (defined(__sparc__) || defined(__sparc))  | 
32  |  | # include "crypto/sparc_arch.h"  | 
33  |  | # define SPARC_T4_MONT  | 
34  |  | #endif  | 
35  |  |  | 
36  |  | /* maximum precomputation table size for *variable* sliding windows */  | 
37  |  | #define TABLE_SIZE      32  | 
38  |  |  | 
39  |  | /*  | 
40  |  |  * Beyond this limit the constant time code is disabled due to  | 
41  |  |  * the possible overflow in the computation of powerbufLen in  | 
42  |  |  * BN_mod_exp_mont_consttime.  | 
43  |  |  * When this limit is exceeded, the computation will be done using  | 
44  |  |  * non-constant time code, but it will take very long.  | 
45  |  |  */  | 
46  | 479k  | #define BN_CONSTTIME_SIZE_LIMIT (INT_MAX / BN_BYTES / 256)  | 
47  |  |  | 
48  |  | /* this one works - simple but works */  | 
49  |  | int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)  | 
50  | 0  | { | 
51  | 0  |     int i, bits, ret = 0;  | 
52  | 0  |     BIGNUM *v, *rr;  | 
53  |  | 
  | 
54  | 0  |     if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0  | 
55  | 0  |             || BN_get_flags(a, BN_FLG_CONSTTIME) != 0) { | 
56  |  |         /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */  | 
57  | 0  |         ERR_raise(ERR_LIB_BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);  | 
58  | 0  |         return 0;  | 
59  | 0  |     }  | 
60  |  |  | 
61  | 0  |     BN_CTX_start(ctx);  | 
62  | 0  |     rr = ((r == a) || (r == p)) ? BN_CTX_get(ctx) : r;  | 
63  | 0  |     v = BN_CTX_get(ctx);  | 
64  | 0  |     if (rr == NULL || v == NULL)  | 
65  | 0  |         goto err;  | 
66  |  |  | 
67  | 0  |     if (BN_copy(v, a) == NULL)  | 
68  | 0  |         goto err;  | 
69  | 0  |     bits = BN_num_bits(p);  | 
70  |  | 
  | 
71  | 0  |     if (BN_is_odd(p)) { | 
72  | 0  |         if (BN_copy(rr, a) == NULL)  | 
73  | 0  |             goto err;  | 
74  | 0  |     } else { | 
75  | 0  |         if (!BN_one(rr))  | 
76  | 0  |             goto err;  | 
77  | 0  |     }  | 
78  |  |  | 
79  | 0  |     for (i = 1; i < bits; i++) { | 
80  | 0  |         if (!BN_sqr(v, v, ctx))  | 
81  | 0  |             goto err;  | 
82  | 0  |         if (BN_is_bit_set(p, i)) { | 
83  | 0  |             if (!BN_mul(rr, rr, v, ctx))  | 
84  | 0  |                 goto err;  | 
85  | 0  |         }  | 
86  | 0  |     }  | 
87  | 0  |     if (r != rr && BN_copy(r, rr) == NULL)  | 
88  | 0  |         goto err;  | 
89  |  |  | 
90  | 0  |     ret = 1;  | 
91  | 0  |  err:  | 
92  | 0  |     BN_CTX_end(ctx);  | 
93  | 0  |     bn_check_top(r);  | 
94  | 0  |     return ret;  | 
95  | 0  | }  | 
96  |  |  | 
97  |  | int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,  | 
98  |  |                BN_CTX *ctx)  | 
99  | 168k  | { | 
100  | 168k  |     int ret;  | 
101  |  |  | 
102  | 168k  |     bn_check_top(a);  | 
103  | 168k  |     bn_check_top(p);  | 
104  | 168k  |     bn_check_top(m);  | 
105  |  |  | 
106  |  |     /*-  | 
107  |  |      * For even modulus  m = 2^k*m_odd, it might make sense to compute  | 
108  |  |      * a^p mod m_odd  and  a^p mod 2^k  separately (with Montgomery  | 
109  |  |      * exponentiation for the odd part), using appropriate exponent  | 
110  |  |      * reductions, and combine the results using the CRT.  | 
111  |  |      *  | 
112  |  |      * For now, we use Montgomery only if the modulus is odd; otherwise,  | 
113  |  |      * exponentiation using the reciprocal-based quick remaindering  | 
114  |  |      * algorithm is used.  | 
115  |  |      *  | 
116  |  |      * (Timing obtained with expspeed.c [computations  a^p mod m  | 
117  |  |      * where  a, p, m  are of the same length: 256, 512, 1024, 2048,  | 
118  |  |      * 4096, 8192 bits], compared to the running time of the  | 
119  |  |      * standard algorithm:  | 
120  |  |      *  | 
121  |  |      *   BN_mod_exp_mont   33 .. 40 %  [AMD K6-2, Linux, debug configuration]  | 
122  |  |      *                     55 .. 77 %  [UltraSparc processor, but  | 
123  |  |      *                                  debug-solaris-sparcv8-gcc conf.]  | 
124  |  |      *  | 
125  |  |      *   BN_mod_exp_recp   50 .. 70 %  [AMD K6-2, Linux, debug configuration]  | 
126  |  |      *                     62 .. 118 % [UltraSparc, debug-solaris-sparcv8-gcc]  | 
127  |  |      *  | 
128  |  |      * On the Sparc, BN_mod_exp_recp was faster than BN_mod_exp_mont  | 
129  |  |      * at 2048 and more bits, but at 512 and 1024 bits, it was  | 
130  |  |      * slower even than the standard algorithm!  | 
131  |  |      *  | 
132  |  |      * "Real" timings [linux-elf, solaris-sparcv9-gcc configurations]  | 
133  |  |      * should be obtained when the new Montgomery reduction code  | 
134  |  |      * has been integrated into OpenSSL.)  | 
135  |  |      */  | 
136  |  |  | 
137  | 168k  | #define MONT_MUL_MOD  | 
138  | 168k  | #define MONT_EXP_WORD  | 
139  | 168k  | #define RECP_MUL_MOD  | 
140  |  |  | 
141  | 168k  | #ifdef MONT_MUL_MOD  | 
142  | 168k  |     if (BN_is_odd(m)) { | 
143  | 160k  | # ifdef MONT_EXP_WORD  | 
144  | 160k  |         if (a->top == 1 && !a->neg  | 
145  | 160k  |             && (BN_get_flags(p, BN_FLG_CONSTTIME) == 0)  | 
146  | 160k  |             && (BN_get_flags(a, BN_FLG_CONSTTIME) == 0)  | 
147  | 160k  |             && (BN_get_flags(m, BN_FLG_CONSTTIME) == 0)) { | 
148  | 51.5k  |             BN_ULONG A = a->d[0];  | 
149  | 51.5k  |             ret = BN_mod_exp_mont_word(r, A, p, m, ctx, NULL);  | 
150  | 51.5k  |         } else  | 
151  | 108k  | # endif  | 
152  | 108k  |             ret = BN_mod_exp_mont(r, a, p, m, ctx, NULL);  | 
153  | 160k  |     } else  | 
154  | 8.38k  | #endif  | 
155  | 8.38k  | #ifdef RECP_MUL_MOD  | 
156  | 8.38k  |     { | 
157  | 8.38k  |         ret = BN_mod_exp_recp(r, a, p, m, ctx);  | 
158  | 8.38k  |     }  | 
159  |  | #else  | 
160  |  |     { | 
161  |  |         ret = BN_mod_exp_simple(r, a, p, m, ctx);  | 
162  |  |     }  | 
163  |  | #endif  | 
164  |  |  | 
165  | 168k  |     bn_check_top(r);  | 
166  | 168k  |     return ret;  | 
167  | 168k  | }  | 
168  |  |  | 
169  |  | int BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,  | 
170  |  |                     const BIGNUM *m, BN_CTX *ctx)  | 
171  | 8.38k  | { | 
172  | 8.38k  |     int i, j, bits, ret = 0, wstart, wend, window, wvalue;  | 
173  | 8.38k  |     int start = 1;  | 
174  | 8.38k  |     BIGNUM *aa;  | 
175  |  |     /* Table of variables obtained from 'ctx' */  | 
176  | 8.38k  |     BIGNUM *val[TABLE_SIZE];  | 
177  | 8.38k  |     BN_RECP_CTX recp;  | 
178  |  |  | 
179  | 8.38k  |     if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0  | 
180  | 8.38k  |             || BN_get_flags(a, BN_FLG_CONSTTIME) != 0  | 
181  | 8.38k  |             || BN_get_flags(m, BN_FLG_CONSTTIME) != 0) { | 
182  |  |         /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */  | 
183  | 32  |         ERR_raise(ERR_LIB_BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);  | 
184  | 32  |         return 0;  | 
185  | 32  |     }  | 
186  |  |  | 
187  | 8.35k  |     bits = BN_num_bits(p);  | 
188  | 8.35k  |     if (bits == 0) { | 
189  |  |         /* x**0 mod 1, or x**0 mod -1 is still zero. */  | 
190  | 291  |         if (BN_abs_is_word(m, 1)) { | 
191  | 0  |             ret = 1;  | 
192  | 0  |             BN_zero(r);  | 
193  | 291  |         } else { | 
194  | 291  |             ret = BN_one(r);  | 
195  | 291  |         }  | 
196  | 291  |         return ret;  | 
197  | 291  |     }  | 
198  |  |  | 
199  | 8.06k  |     BN_RECP_CTX_init(&recp);  | 
200  |  |  | 
201  | 8.06k  |     BN_CTX_start(ctx);  | 
202  | 8.06k  |     aa = BN_CTX_get(ctx);  | 
203  | 8.06k  |     val[0] = BN_CTX_get(ctx);  | 
204  | 8.06k  |     if (val[0] == NULL)  | 
205  | 0  |         goto err;  | 
206  |  |  | 
207  | 8.06k  |     if (m->neg) { | 
208  |  |         /* ignore sign of 'm' */  | 
209  | 2.54k  |         if (!BN_copy(aa, m))  | 
210  | 0  |             goto err;  | 
211  | 2.54k  |         aa->neg = 0;  | 
212  | 2.54k  |         if (BN_RECP_CTX_set(&recp, aa, ctx) <= 0)  | 
213  | 0  |             goto err;  | 
214  | 5.52k  |     } else { | 
215  | 5.52k  |         if (BN_RECP_CTX_set(&recp, m, ctx) <= 0)  | 
216  | 0  |             goto err;  | 
217  | 5.52k  |     }  | 
218  |  |  | 
219  | 8.06k  |     if (!BN_nnmod(val[0], a, m, ctx))  | 
220  | 0  |         goto err;               /* 1 */  | 
221  | 8.06k  |     if (BN_is_zero(val[0])) { | 
222  | 158  |         BN_zero(r);  | 
223  | 158  |         ret = 1;  | 
224  | 158  |         goto err;  | 
225  | 158  |     }  | 
226  |  |  | 
227  | 7.90k  |     window = BN_window_bits_for_exponent_size(bits);  | 
228  | 7.90k  |     if (window > 1) { | 
229  | 1.52k  |         if (!BN_mod_mul_reciprocal(aa, val[0], val[0], &recp, ctx))  | 
230  | 0  |             goto err;           /* 2 */  | 
231  | 1.52k  |         j = 1 << (window - 1);  | 
232  | 12.4k  |         for (i = 1; i < j; i++) { | 
233  | 10.9k  |             if (((val[i] = BN_CTX_get(ctx)) == NULL) ||  | 
234  | 10.9k  |                 !BN_mod_mul_reciprocal(val[i], val[i - 1], aa, &recp, ctx))  | 
235  | 0  |                 goto err;  | 
236  | 10.9k  |         }  | 
237  | 1.52k  |     }  | 
238  |  |  | 
239  | 7.90k  |     start = 1;                  /* This is used to avoid multiplication etc  | 
240  |  |                                  * when there is only the value '1' in the  | 
241  |  |                                  * buffer. */  | 
242  | 7.90k  |     wvalue = 0;                 /* The 'value' of the window */  | 
243  | 7.90k  |     wstart = bits - 1;          /* The top bit of the window */  | 
244  | 7.90k  |     wend = 0;                   /* The bottom bit of the window */  | 
245  |  |  | 
246  | 7.90k  |     if (r == p) { | 
247  | 0  |         BIGNUM *p_dup = BN_CTX_get(ctx);  | 
248  |  | 
  | 
249  | 0  |         if (p_dup == NULL || BN_copy(p_dup, p) == NULL)  | 
250  | 0  |             goto err;  | 
251  | 0  |         p = p_dup;  | 
252  | 0  |     }  | 
253  |  |  | 
254  | 7.90k  |     if (!BN_one(r))  | 
255  | 0  |         goto err;  | 
256  |  |  | 
257  | 241k  |     for (;;) { | 
258  | 241k  |         if (BN_is_bit_set(p, wstart) == 0) { | 
259  | 160k  |             if (!start)  | 
260  | 160k  |                 if (!BN_mod_mul_reciprocal(r, r, r, &recp, ctx))  | 
261  | 0  |                     goto err;  | 
262  | 160k  |             if (wstart == 0)  | 
263  | 3.06k  |                 break;  | 
264  | 157k  |             wstart--;  | 
265  | 157k  |             continue;  | 
266  | 160k  |         }  | 
267  |  |         /*  | 
268  |  |          * We now have wstart on a 'set' bit, we now need to work out how bit  | 
269  |  |          * a window to do.  To do this we need to scan forward until the last  | 
270  |  |          * set bit before the end of the window  | 
271  |  |          */  | 
272  | 80.6k  |         wvalue = 1;  | 
273  | 80.6k  |         wend = 0;  | 
274  | 218k  |         for (i = 1; i < window; i++) { | 
275  | 138k  |             if (wstart - i < 0)  | 
276  | 680  |                 break;  | 
277  | 137k  |             if (BN_is_bit_set(p, wstart - i)) { | 
278  | 85.3k  |                 wvalue <<= (i - wend);  | 
279  | 85.3k  |                 wvalue |= 1;  | 
280  | 85.3k  |                 wend = i;  | 
281  | 85.3k  |             }  | 
282  | 137k  |         }  | 
283  |  |  | 
284  |  |         /* wend is the size of the current window */  | 
285  | 80.6k  |         j = wend + 1;  | 
286  |  |         /* add the 'bytes above' */  | 
287  | 80.6k  |         if (!start)  | 
288  | 245k  |             for (i = 0; i < j; i++) { | 
289  | 173k  |                 if (!BN_mod_mul_reciprocal(r, r, r, &recp, ctx))  | 
290  | 0  |                     goto err;  | 
291  | 173k  |             }  | 
292  |  |  | 
293  |  |         /* wvalue will be an odd number < 2^window */  | 
294  | 80.6k  |         if (!BN_mod_mul_reciprocal(r, r, val[wvalue >> 1], &recp, ctx))  | 
295  | 0  |             goto err;  | 
296  |  |  | 
297  |  |         /* move the 'window' down further */  | 
298  | 80.6k  |         wstart -= wend + 1;  | 
299  | 80.6k  |         wvalue = 0;  | 
300  | 80.6k  |         start = 0;  | 
301  | 80.6k  |         if (wstart < 0)  | 
302  | 4.84k  |             break;  | 
303  | 80.6k  |     }  | 
304  | 7.90k  |     ret = 1;  | 
305  | 8.06k  |  err:  | 
306  | 8.06k  |     BN_CTX_end(ctx);  | 
307  | 8.06k  |     BN_RECP_CTX_free(&recp);  | 
308  | 8.06k  |     bn_check_top(r);  | 
309  | 8.06k  |     return ret;  | 
310  | 7.90k  | }  | 
311  |  |  | 
312  |  | int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,  | 
313  |  |                     const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)  | 
314  | 214k  | { | 
315  | 214k  |     int i, j, bits, ret = 0, wstart, wend, window, wvalue;  | 
316  | 214k  |     int start = 1;  | 
317  | 214k  |     BIGNUM *d, *r;  | 
318  | 214k  |     const BIGNUM *aa;  | 
319  |  |     /* Table of variables obtained from 'ctx' */  | 
320  | 214k  |     BIGNUM *val[TABLE_SIZE];  | 
321  | 214k  |     BN_MONT_CTX *mont = NULL;  | 
322  |  |  | 
323  | 214k  |     bn_check_top(a);  | 
324  | 214k  |     bn_check_top(p);  | 
325  | 214k  |     bn_check_top(m);  | 
326  |  |  | 
327  | 214k  |     if (!BN_is_odd(m)) { | 
328  | 0  |         ERR_raise(ERR_LIB_BN, BN_R_CALLED_WITH_EVEN_MODULUS);  | 
329  | 0  |         return 0;  | 
330  | 0  |     }  | 
331  |  |  | 
332  | 214k  |     if (m->top <= BN_CONSTTIME_SIZE_LIMIT  | 
333  | 214k  |         && (BN_get_flags(p, BN_FLG_CONSTTIME) != 0  | 
334  | 214k  |             || BN_get_flags(a, BN_FLG_CONSTTIME) != 0  | 
335  | 214k  |             || BN_get_flags(m, BN_FLG_CONSTTIME) != 0)) { | 
336  | 31.5k  |         return BN_mod_exp_mont_consttime(rr, a, p, m, ctx, in_mont);  | 
337  | 31.5k  |     }  | 
338  |  |  | 
339  | 183k  |     bits = BN_num_bits(p);  | 
340  | 183k  |     if (bits == 0) { | 
341  |  |         /* x**0 mod 1, or x**0 mod -1 is still zero. */  | 
342  | 648  |         if (BN_abs_is_word(m, 1)) { | 
343  | 12  |             ret = 1;  | 
344  | 12  |             BN_zero(rr);  | 
345  | 636  |         } else { | 
346  | 636  |             ret = BN_one(rr);  | 
347  | 636  |         }  | 
348  | 648  |         return ret;  | 
349  | 648  |     }  | 
350  |  |  | 
351  | 182k  |     BN_CTX_start(ctx);  | 
352  | 182k  |     d = BN_CTX_get(ctx);  | 
353  | 182k  |     r = BN_CTX_get(ctx);  | 
354  | 182k  |     val[0] = BN_CTX_get(ctx);  | 
355  | 182k  |     if (val[0] == NULL)  | 
356  | 0  |         goto err;  | 
357  |  |  | 
358  |  |     /*  | 
359  |  |      * If this is not done, things will break in the montgomery part  | 
360  |  |      */  | 
361  |  |  | 
362  | 182k  |     if (in_mont != NULL)  | 
363  | 76.7k  |         mont = in_mont;  | 
364  | 105k  |     else { | 
365  | 105k  |         if ((mont = BN_MONT_CTX_new()) == NULL)  | 
366  | 0  |             goto err;  | 
367  | 105k  |         if (!BN_MONT_CTX_set(mont, m, ctx))  | 
368  | 0  |             goto err;  | 
369  | 105k  |     }  | 
370  |  |  | 
371  | 182k  |     if (a->neg || BN_ucmp(a, m) >= 0) { | 
372  | 735  |         if (!BN_nnmod(val[0], a, m, ctx))  | 
373  | 0  |             goto err;  | 
374  | 735  |         aa = val[0];  | 
375  | 735  |     } else  | 
376  | 181k  |         aa = a;  | 
377  | 182k  |     if (!bn_to_mont_fixed_top(val[0], aa, mont, ctx))  | 
378  | 0  |         goto err;               /* 1 */  | 
379  |  |  | 
380  | 182k  |     window = BN_window_bits_for_exponent_size(bits);  | 
381  | 182k  |     if (window > 1) { | 
382  | 143k  |         if (!bn_mul_mont_fixed_top(d, val[0], val[0], mont, ctx))  | 
383  | 0  |             goto err;           /* 2 */  | 
384  | 143k  |         j = 1 << (window - 1);  | 
385  | 1.33M  |         for (i = 1; i < j; i++) { | 
386  | 1.18M  |             if (((val[i] = BN_CTX_get(ctx)) == NULL) ||  | 
387  | 1.18M  |                 !bn_mul_mont_fixed_top(val[i], val[i - 1], d, mont, ctx))  | 
388  | 0  |                 goto err;  | 
389  | 1.18M  |         }  | 
390  | 143k  |     }  | 
391  |  |  | 
392  | 182k  |     start = 1;                  /* This is used to avoid multiplication etc  | 
393  |  |                                  * when there is only the value '1' in the  | 
394  |  |                                  * buffer. */  | 
395  | 182k  |     wvalue = 0;                 /* The 'value' of the window */  | 
396  | 182k  |     wstart = bits - 1;          /* The top bit of the window */  | 
397  | 182k  |     wend = 0;                   /* The bottom bit of the window */  | 
398  |  |  | 
399  | 182k  | #if 1                           /* by Shay Gueron's suggestion */  | 
400  | 182k  |     j = m->top;                 /* borrow j */  | 
401  | 182k  |     if (m->d[j - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) { | 
402  | 59.7k  |         if (bn_wexpand(r, j) == NULL)  | 
403  | 0  |             goto err;  | 
404  |  |         /* 2^(top*BN_BITS2) - m */  | 
405  | 59.7k  |         r->d[0] = (0 - m->d[0]) & BN_MASK2;  | 
406  | 617k  |         for (i = 1; i < j; i++)  | 
407  | 557k  |             r->d[i] = (~m->d[i]) & BN_MASK2;  | 
408  | 59.7k  |         r->top = j;  | 
409  | 59.7k  |         r->flags |= BN_FLG_FIXED_TOP;  | 
410  | 59.7k  |     } else  | 
411  | 122k  | #endif  | 
412  | 122k  |     if (!bn_to_mont_fixed_top(r, BN_value_one(), mont, ctx))  | 
413  | 0  |         goto err;  | 
414  | 18.4M  |     for (;;) { | 
415  | 18.4M  |         if (BN_is_bit_set(p, wstart) == 0) { | 
416  | 13.7M  |             if (!start) { | 
417  | 13.7M  |                 if (!bn_mul_mont_fixed_top(r, r, r, mont, ctx))  | 
418  | 0  |                     goto err;  | 
419  | 13.7M  |             }  | 
420  | 13.7M  |             if (wstart == 0)  | 
421  | 62.7k  |                 break;  | 
422  | 13.7M  |             wstart--;  | 
423  | 13.7M  |             continue;  | 
424  | 13.7M  |         }  | 
425  |  |         /*  | 
426  |  |          * We now have wstart on a 'set' bit, we now need to work out how bit  | 
427  |  |          * a window to do.  To do this we need to scan forward until the last  | 
428  |  |          * set bit before the end of the window  | 
429  |  |          */  | 
430  | 4.67M  |         wvalue = 1;  | 
431  | 4.67M  |         wend = 0;  | 
432  | 19.5M  |         for (i = 1; i < window; i++) { | 
433  | 14.9M  |             if (wstart - i < 0)  | 
434  | 77.8k  |                 break;  | 
435  | 14.8M  |             if (BN_is_bit_set(p, wstart - i)) { | 
436  | 12.3M  |                 wvalue <<= (i - wend);  | 
437  | 12.3M  |                 wvalue |= 1;  | 
438  | 12.3M  |                 wend = i;  | 
439  | 12.3M  |             }  | 
440  | 14.8M  |         }  | 
441  |  |  | 
442  |  |         /* wend is the size of the current window */  | 
443  | 4.67M  |         j = wend + 1;  | 
444  |  |         /* add the 'bytes above' */  | 
445  | 4.67M  |         if (!start)  | 
446  | 21.8M  |             for (i = 0; i < j; i++) { | 
447  | 17.3M  |                 if (!bn_mul_mont_fixed_top(r, r, r, mont, ctx))  | 
448  | 0  |                     goto err;  | 
449  | 17.3M  |             }  | 
450  |  |  | 
451  |  |         /* wvalue will be an odd number < 2^window */  | 
452  | 4.67M  |         if (!bn_mul_mont_fixed_top(r, r, val[wvalue >> 1], mont, ctx))  | 
453  | 0  |             goto err;  | 
454  |  |  | 
455  |  |         /* move the 'window' down further */  | 
456  | 4.67M  |         wstart -= wend + 1;  | 
457  | 4.67M  |         wvalue = 0;  | 
458  | 4.67M  |         start = 0;  | 
459  | 4.67M  |         if (wstart < 0)  | 
460  | 119k  |             break;  | 
461  | 4.67M  |     }  | 
462  |  |     /*  | 
463  |  |      * Done with zero-padded intermediate BIGNUMs. Final BN_from_montgomery  | 
464  |  |      * removes padding [if any] and makes return value suitable for public  | 
465  |  |      * API consumer.  | 
466  |  |      */  | 
467  |  | #if defined(SPARC_T4_MONT)  | 
468  |  |     if (OPENSSL_sparcv9cap_P[0] & (SPARCV9_VIS3 | SPARCV9_PREFER_FPU)) { | 
469  |  |         j = mont->N.top;        /* borrow j */  | 
470  |  |         val[0]->d[0] = 1;       /* borrow val[0] */  | 
471  |  |         for (i = 1; i < j; i++)  | 
472  |  |             val[0]->d[i] = 0;  | 
473  |  |         val[0]->top = j;  | 
474  |  |         if (!BN_mod_mul_montgomery(rr, r, val[0], mont, ctx))  | 
475  |  |             goto err;  | 
476  |  |     } else  | 
477  |  | #endif  | 
478  | 182k  |     if (!BN_from_montgomery(rr, r, mont, ctx))  | 
479  | 0  |         goto err;  | 
480  | 182k  |     ret = 1;  | 
481  | 182k  |  err:  | 
482  | 182k  |     if (in_mont == NULL)  | 
483  | 105k  |         BN_MONT_CTX_free(mont);  | 
484  | 182k  |     BN_CTX_end(ctx);  | 
485  | 182k  |     bn_check_top(rr);  | 
486  | 182k  |     return ret;  | 
487  | 182k  | }  | 
488  |  |  | 
489  |  | static BN_ULONG bn_get_bits(const BIGNUM *a, int bitpos)  | 
490  | 519k  | { | 
491  | 519k  |     BN_ULONG ret = 0;  | 
492  | 519k  |     int wordpos;  | 
493  |  |  | 
494  | 519k  |     wordpos = bitpos / BN_BITS2;  | 
495  | 519k  |     bitpos %= BN_BITS2;  | 
496  | 519k  |     if (wordpos >= 0 && wordpos < a->top) { | 
497  | 519k  |         ret = a->d[wordpos] & BN_MASK2;  | 
498  | 519k  |         if (bitpos) { | 
499  | 495k  |             ret >>= bitpos;  | 
500  | 495k  |             if (++wordpos < a->top)  | 
501  | 64.7k  |                 ret |= a->d[wordpos] << (BN_BITS2 - bitpos);  | 
502  | 495k  |         }  | 
503  | 519k  |     }  | 
504  |  |  | 
505  | 519k  |     return ret & BN_MASK2;  | 
506  | 519k  | }  | 
507  |  |  | 
508  |  | /*  | 
509  |  |  * BN_mod_exp_mont_consttime() stores the precomputed powers in a specific  | 
510  |  |  * layout so that accessing any of these table values shows the same access  | 
511  |  |  * pattern as far as cache lines are concerned.  The following functions are  | 
512  |  |  * used to transfer a BIGNUM from/to that table.  | 
513  |  |  */  | 
514  |  |  | 
515  |  | static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top,  | 
516  |  |                                         unsigned char *buf, int idx,  | 
517  |  |                                         int window)  | 
518  | 176k  | { | 
519  | 176k  |     int i, j;  | 
520  | 176k  |     int width = 1 << window;  | 
521  | 176k  |     BN_ULONG *table = (BN_ULONG *)buf;  | 
522  |  |  | 
523  | 176k  |     if (top > b->top)  | 
524  | 0  |         top = b->top;           /* this works because 'buf' is explicitly  | 
525  |  |                                  * zeroed */  | 
526  | 5.61M  |     for (i = 0, j = idx; i < top; i++, j += width) { | 
527  | 5.43M  |         table[j] = b->d[i];  | 
528  | 5.43M  |     }  | 
529  |  |  | 
530  | 176k  |     return 1;  | 
531  | 176k  | }  | 
532  |  |  | 
533  |  | static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top,  | 
534  |  |                                           unsigned char *buf, int idx,  | 
535  |  |                                           int window)  | 
536  | 490k  | { | 
537  | 490k  |     int i, j;  | 
538  | 490k  |     int width = 1 << window;  | 
539  |  |     /*  | 
540  |  |      * We declare table 'volatile' in order to discourage compiler  | 
541  |  |      * from reordering loads from the table. Concern is that if  | 
542  |  |      * reordered in specific manner loads might give away the  | 
543  |  |      * information we are trying to conceal. Some would argue that  | 
544  |  |      * compiler can reorder them anyway, but it can as well be  | 
545  |  |      * argued that doing so would be violation of standard...  | 
546  |  |      */  | 
547  | 490k  |     volatile BN_ULONG *table = (volatile BN_ULONG *)buf;  | 
548  |  |  | 
549  | 490k  |     if (bn_wexpand(b, top) == NULL)  | 
550  | 0  |         return 0;  | 
551  |  |  | 
552  | 490k  |     if (window <= 3) { | 
553  | 8.43M  |         for (i = 0; i < top; i++, table += width) { | 
554  | 8.05M  |             BN_ULONG acc = 0;  | 
555  |  |  | 
556  | 72.4M  |             for (j = 0; j < width; j++) { | 
557  | 64.4M  |                 acc |= table[j] &  | 
558  | 64.4M  |                        ((BN_ULONG)0 - (constant_time_eq_int(j,idx)&1));  | 
559  | 64.4M  |             }  | 
560  |  |  | 
561  | 8.05M  |             b->d[i] = acc;  | 
562  | 8.05M  |         }  | 
563  | 383k  |     } else { | 
564  | 106k  |         int xstride = 1 << (window - 2);  | 
565  | 106k  |         BN_ULONG y0, y1, y2, y3;  | 
566  |  |  | 
567  | 106k  |         i = idx >> (window - 2);        /* equivalent of idx / xstride */  | 
568  | 106k  |         idx &= xstride - 1;             /* equivalent of idx % xstride */  | 
569  |  |  | 
570  | 106k  |         y0 = (BN_ULONG)0 - (constant_time_eq_int(i,0)&1);  | 
571  | 106k  |         y1 = (BN_ULONG)0 - (constant_time_eq_int(i,1)&1);  | 
572  | 106k  |         y2 = (BN_ULONG)0 - (constant_time_eq_int(i,2)&1);  | 
573  | 106k  |         y3 = (BN_ULONG)0 - (constant_time_eq_int(i,3)&1);  | 
574  |  |  | 
575  | 7.43M  |         for (i = 0; i < top; i++, table += width) { | 
576  | 7.33M  |             BN_ULONG acc = 0;  | 
577  |  |  | 
578  | 36.6M  |             for (j = 0; j < xstride; j++) { | 
579  | 29.3M  |                 acc |= ( (table[j + 0 * xstride] & y0) |  | 
580  | 29.3M  |                          (table[j + 1 * xstride] & y1) |  | 
581  | 29.3M  |                          (table[j + 2 * xstride] & y2) |  | 
582  | 29.3M  |                          (table[j + 3 * xstride] & y3) )  | 
583  | 29.3M  |                        & ((BN_ULONG)0 - (constant_time_eq_int(j,idx)&1));  | 
584  | 29.3M  |             }  | 
585  |  |  | 
586  | 7.33M  |             b->d[i] = acc;  | 
587  | 7.33M  |         }  | 
588  | 106k  |     }  | 
589  |  |  | 
590  | 490k  |     b->top = top;  | 
591  | 490k  |     b->flags |= BN_FLG_FIXED_TOP;  | 
592  | 490k  |     return 1;  | 
593  | 490k  | }  | 
594  |  |  | 
595  |  | /*  | 
596  |  |  * Given a pointer value, compute the next address that is a cache line  | 
597  |  |  * multiple.  | 
598  |  |  */  | 
599  |  | #define MOD_EXP_CTIME_ALIGN(x_) \  | 
600  | 49.3k  |         ((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((size_t)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))  | 
601  |  |  | 
602  |  | /*  | 
603  |  |  * This variant of BN_mod_exp_mont() uses fixed windows and the special  | 
604  |  |  * precomputation memory layout to limit data-dependency to a minimum to  | 
605  |  |  * protect secret exponents (cf. the hyper-threading timing attacks pointed  | 
606  |  |  * out by Colin Percival,  | 
607  |  |  * http://www.daemonology.net/hyperthreading-considered-harmful/)  | 
608  |  |  */  | 
609  |  | int bn_mod_exp_mont_fixed_top(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,  | 
610  |  |                               const BIGNUM *m, BN_CTX *ctx,  | 
611  |  |                               BN_MONT_CTX *in_mont)  | 
612  | 49.7k  | { | 
613  | 49.7k  |     int i, bits, ret = 0, window, wvalue, wmask, window0;  | 
614  | 49.7k  |     int top;  | 
615  | 49.7k  |     BN_MONT_CTX *mont = NULL;  | 
616  |  |  | 
617  | 49.7k  |     int numPowers;  | 
618  | 49.7k  |     unsigned char *powerbufFree = NULL;  | 
619  | 49.7k  |     int powerbufLen = 0;  | 
620  | 49.7k  |     unsigned char *powerbuf = NULL;  | 
621  | 49.7k  |     BIGNUM tmp, am;  | 
622  |  | #if defined(SPARC_T4_MONT)  | 
623  |  |     unsigned int t4 = 0;  | 
624  |  | #endif  | 
625  |  |  | 
626  | 49.7k  |     if (!BN_is_odd(m)) { | 
627  | 0  |         ERR_raise(ERR_LIB_BN, BN_R_CALLED_WITH_EVEN_MODULUS);  | 
628  | 0  |         return 0;  | 
629  | 0  |     }  | 
630  |  |  | 
631  | 49.7k  |     top = m->top;  | 
632  |  |  | 
633  | 49.7k  |     if (top > BN_CONSTTIME_SIZE_LIMIT) { | 
634  |  |         /* Prevent overflowing the powerbufLen computation below */  | 
635  | 0  |         return BN_mod_exp_mont(rr, a, p, m, ctx, in_mont);  | 
636  | 0  |     }  | 
637  |  |  | 
638  |  |     /*  | 
639  |  |      * Use all bits stored in |p|, rather than |BN_num_bits|, so we do not leak  | 
640  |  |      * whether the top bits are zero.  | 
641  |  |      */  | 
642  | 49.7k  |     bits = p->top * BN_BITS2;  | 
643  | 49.7k  |     if (bits == 0) { | 
644  |  |         /* x**0 mod 1, or x**0 mod -1 is still zero. */  | 
645  | 30  |         if (BN_abs_is_word(m, 1)) { | 
646  | 6  |             ret = 1;  | 
647  | 6  |             BN_zero(rr);  | 
648  | 24  |         } else { | 
649  | 24  |             ret = BN_one(rr);  | 
650  | 24  |         }  | 
651  | 30  |         return ret;  | 
652  | 30  |     }  | 
653  |  |  | 
654  | 49.7k  |     BN_CTX_start(ctx);  | 
655  |  |  | 
656  |  |     /*  | 
657  |  |      * Allocate a montgomery context if it was not supplied by the caller. If  | 
658  |  |      * this is not done, things will break in the montgomery part.  | 
659  |  |      */  | 
660  | 49.7k  |     if (in_mont != NULL)  | 
661  | 47.4k  |         mont = in_mont;  | 
662  | 2.24k  |     else { | 
663  | 2.24k  |         if ((mont = BN_MONT_CTX_new()) == NULL)  | 
664  | 0  |             goto err;  | 
665  | 2.24k  |         if (!BN_MONT_CTX_set(mont, m, ctx))  | 
666  | 0  |             goto err;  | 
667  | 2.24k  |     }  | 
668  |  |  | 
669  | 49.7k  |     if (a->neg || BN_ucmp(a, m) >= 0) { | 
670  | 336  |         BIGNUM *reduced = BN_CTX_get(ctx);  | 
671  | 336  |         if (reduced == NULL  | 
672  | 336  |             || !BN_nnmod(reduced, a, m, ctx)) { | 
673  | 0  |             goto err;  | 
674  | 0  |         }  | 
675  | 336  |         a = reduced;  | 
676  | 336  |     }  | 
677  |  |  | 
678  | 49.7k  | #ifdef RSAZ_ENABLED  | 
679  |  |     /*  | 
680  |  |      * If the size of the operands allow it, perform the optimized  | 
681  |  |      * RSAZ exponentiation. For further information see  | 
682  |  |      * crypto/bn/rsaz_exp.c and accompanying assembly modules.  | 
683  |  |      */  | 
684  | 49.7k  |     if ((16 == a->top) && (16 == p->top) && (BN_num_bits(m) == 1024)  | 
685  | 49.7k  |         && rsaz_avx2_eligible()) { | 
686  | 0  |         if (NULL == bn_wexpand(rr, 16))  | 
687  | 0  |             goto err;  | 
688  | 0  |         RSAZ_1024_mod_exp_avx2(rr->d, a->d, p->d, m->d, mont->RR.d,  | 
689  | 0  |                                mont->n0[0]);  | 
690  | 0  |         rr->top = 16;  | 
691  | 0  |         rr->neg = 0;  | 
692  | 0  |         bn_correct_top(rr);  | 
693  | 0  |         ret = 1;  | 
694  | 0  |         goto err;  | 
695  | 49.7k  |     } else if ((8 == a->top) && (8 == p->top) && (BN_num_bits(m) == 512)) { | 
696  | 395  |         if (NULL == bn_wexpand(rr, 8))  | 
697  | 0  |             goto err;  | 
698  | 395  |         RSAZ_512_mod_exp(rr->d, a->d, p->d, m->d, mont->n0[0], mont->RR.d);  | 
699  | 395  |         rr->top = 8;  | 
700  | 395  |         rr->neg = 0;  | 
701  | 395  |         bn_correct_top(rr);  | 
702  | 395  |         ret = 1;  | 
703  | 395  |         goto err;  | 
704  | 395  |     }  | 
705  | 49.3k  | #endif  | 
706  |  |  | 
707  |  |     /* Get the window size to use with size of p. */  | 
708  | 49.3k  |     window = BN_window_bits_for_ctime_exponent_size(bits);  | 
709  |  | #if defined(SPARC_T4_MONT)  | 
710  |  |     if (window >= 5 && (top & 15) == 0 && top <= 64 &&  | 
711  |  |         (OPENSSL_sparcv9cap_P[1] & (CFR_MONTMUL | CFR_MONTSQR)) ==  | 
712  |  |         (CFR_MONTMUL | CFR_MONTSQR) && (t4 = OPENSSL_sparcv9cap_P[0]))  | 
713  |  |         window = 5;  | 
714  |  |     else  | 
715  |  | #endif  | 
716  | 49.3k  | #if defined(OPENSSL_BN_ASM_MONT5)  | 
717  | 49.3k  |     if (window >= 5 && top <= BN_SOFT_LIMIT) { | 
718  | 29.5k  |         window = 5;             /* ~5% improvement for RSA2048 sign, and even  | 
719  |  |                                  * for RSA4096 */  | 
720  |  |         /* reserve space for mont->N.d[] copy */  | 
721  | 29.5k  |         powerbufLen += top * sizeof(mont->N.d[0]);  | 
722  | 29.5k  |     }  | 
723  | 49.3k  | #endif  | 
724  | 49.3k  |     (void)0;  | 
725  |  |  | 
726  |  |     /*  | 
727  |  |      * Allocate a buffer large enough to hold all of the pre-computed powers  | 
728  |  |      * of am, am itself and tmp.  | 
729  |  |      */  | 
730  | 49.3k  |     numPowers = 1 << window;  | 
731  | 49.3k  |     powerbufLen += sizeof(m->d[0]) * (top * numPowers +  | 
732  | 49.3k  |                                       ((2 * top) >  | 
733  | 49.3k  |                                        numPowers ? (2 * top) : numPowers));  | 
734  | 49.3k  | #ifdef alloca  | 
735  | 49.3k  |     if (powerbufLen < 3072)  | 
736  | 21.6k  |         powerbufFree =  | 
737  | 21.6k  |             alloca(powerbufLen + MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH);  | 
738  | 27.6k  |     else  | 
739  | 27.6k  | #endif  | 
740  | 27.6k  |         if ((powerbufFree =  | 
741  | 27.6k  |              OPENSSL_malloc(powerbufLen + MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH))  | 
742  | 27.6k  |             == NULL)  | 
743  | 0  |         goto err;  | 
744  |  |  | 
745  | 49.3k  |     powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree);  | 
746  | 49.3k  |     memset(powerbuf, 0, powerbufLen);  | 
747  |  |  | 
748  | 49.3k  | #ifdef alloca  | 
749  | 49.3k  |     if (powerbufLen < 3072)  | 
750  | 21.6k  |         powerbufFree = NULL;  | 
751  | 49.3k  | #endif  | 
752  |  |  | 
753  |  |     /* lay down tmp and am right after powers table */  | 
754  | 49.3k  |     tmp.d = (BN_ULONG *)(powerbuf + sizeof(m->d[0]) * top * numPowers);  | 
755  | 49.3k  |     am.d = tmp.d + top;  | 
756  | 49.3k  |     tmp.top = am.top = 0;  | 
757  | 49.3k  |     tmp.dmax = am.dmax = top;  | 
758  | 49.3k  |     tmp.neg = am.neg = 0;  | 
759  | 49.3k  |     tmp.flags = am.flags = BN_FLG_STATIC_DATA;  | 
760  |  |  | 
761  |  |     /* prepare a^0 in Montgomery domain */  | 
762  | 49.3k  | #if 1                           /* by Shay Gueron's suggestion */  | 
763  | 49.3k  |     if (m->d[top - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) { | 
764  |  |         /* 2^(top*BN_BITS2) - m */  | 
765  | 32.9k  |         tmp.d[0] = (0 - m->d[0]) & BN_MASK2;  | 
766  | 777k  |         for (i = 1; i < top; i++)  | 
767  | 744k  |             tmp.d[i] = (~m->d[i]) & BN_MASK2;  | 
768  | 32.9k  |         tmp.top = top;  | 
769  | 32.9k  |     } else  | 
770  | 16.3k  | #endif  | 
771  | 16.3k  |     if (!bn_to_mont_fixed_top(&tmp, BN_value_one(), mont, ctx))  | 
772  | 0  |         goto err;  | 
773  |  |  | 
774  |  |     /* prepare a^1 in Montgomery domain */  | 
775  | 49.3k  |     if (!bn_to_mont_fixed_top(&am, a, mont, ctx))  | 
776  | 0  |         goto err;  | 
777  |  |  | 
778  | 49.3k  |     if (top > BN_SOFT_LIMIT)  | 
779  | 115  |         goto fallback;  | 
780  |  |  | 
781  |  | #if defined(SPARC_T4_MONT)  | 
782  |  |     if (t4) { | 
783  |  |         typedef int (*bn_pwr5_mont_f) (BN_ULONG *tp, const BN_ULONG *np,  | 
784  |  |                                        const BN_ULONG *n0, const void *table,  | 
785  |  |                                        int power, int bits);  | 
786  |  |         int bn_pwr5_mont_t4_8(BN_ULONG *tp, const BN_ULONG *np,  | 
787  |  |                               const BN_ULONG *n0, const void *table,  | 
788  |  |                               int power, int bits);  | 
789  |  |         int bn_pwr5_mont_t4_16(BN_ULONG *tp, const BN_ULONG *np,  | 
790  |  |                                const BN_ULONG *n0, const void *table,  | 
791  |  |                                int power, int bits);  | 
792  |  |         int bn_pwr5_mont_t4_24(BN_ULONG *tp, const BN_ULONG *np,  | 
793  |  |                                const BN_ULONG *n0, const void *table,  | 
794  |  |                                int power, int bits);  | 
795  |  |         int bn_pwr5_mont_t4_32(BN_ULONG *tp, const BN_ULONG *np,  | 
796  |  |                                const BN_ULONG *n0, const void *table,  | 
797  |  |                                int power, int bits);  | 
798  |  |         static const bn_pwr5_mont_f pwr5_funcs[4] = { | 
799  |  |             bn_pwr5_mont_t4_8, bn_pwr5_mont_t4_16,  | 
800  |  |             bn_pwr5_mont_t4_24, bn_pwr5_mont_t4_32  | 
801  |  |         };  | 
802  |  |         bn_pwr5_mont_f pwr5_worker = pwr5_funcs[top / 16 - 1];  | 
803  |  |  | 
804  |  |         typedef int (*bn_mul_mont_f) (BN_ULONG *rp, const BN_ULONG *ap,  | 
805  |  |                                       const void *bp, const BN_ULONG *np,  | 
806  |  |                                       const BN_ULONG *n0);  | 
807  |  |         int bn_mul_mont_t4_8(BN_ULONG *rp, const BN_ULONG *ap, const void *bp,  | 
808  |  |                              const BN_ULONG *np, const BN_ULONG *n0);  | 
809  |  |         int bn_mul_mont_t4_16(BN_ULONG *rp, const BN_ULONG *ap,  | 
810  |  |                               const void *bp, const BN_ULONG *np,  | 
811  |  |                               const BN_ULONG *n0);  | 
812  |  |         int bn_mul_mont_t4_24(BN_ULONG *rp, const BN_ULONG *ap,  | 
813  |  |                               const void *bp, const BN_ULONG *np,  | 
814  |  |                               const BN_ULONG *n0);  | 
815  |  |         int bn_mul_mont_t4_32(BN_ULONG *rp, const BN_ULONG *ap,  | 
816  |  |                               const void *bp, const BN_ULONG *np,  | 
817  |  |                               const BN_ULONG *n0);  | 
818  |  |         static const bn_mul_mont_f mul_funcs[4] = { | 
819  |  |             bn_mul_mont_t4_8, bn_mul_mont_t4_16,  | 
820  |  |             bn_mul_mont_t4_24, bn_mul_mont_t4_32  | 
821  |  |         };  | 
822  |  |         bn_mul_mont_f mul_worker = mul_funcs[top / 16 - 1];  | 
823  |  |  | 
824  |  |         void bn_mul_mont_vis3(BN_ULONG *rp, const BN_ULONG *ap,  | 
825  |  |                               const void *bp, const BN_ULONG *np,  | 
826  |  |                               const BN_ULONG *n0, int num);  | 
827  |  |         void bn_mul_mont_t4(BN_ULONG *rp, const BN_ULONG *ap,  | 
828  |  |                             const void *bp, const BN_ULONG *np,  | 
829  |  |                             const BN_ULONG *n0, int num);  | 
830  |  |         void bn_mul_mont_gather5_t4(BN_ULONG *rp, const BN_ULONG *ap,  | 
831  |  |                                     const void *table, const BN_ULONG *np,  | 
832  |  |                                     const BN_ULONG *n0, int num, int power);  | 
833  |  |         void bn_flip_n_scatter5_t4(const BN_ULONG *inp, size_t num,  | 
834  |  |                                    void *table, size_t power);  | 
835  |  |         void bn_gather5_t4(BN_ULONG *out, size_t num,  | 
836  |  |                            void *table, size_t power);  | 
837  |  |         void bn_flip_t4(BN_ULONG *dst, BN_ULONG *src, size_t num);  | 
838  |  |  | 
839  |  |         BN_ULONG *np = mont->N.d, *n0 = mont->n0;  | 
840  |  |         int stride = 5 * (6 - (top / 16 - 1)); /* multiple of 5, but less  | 
841  |  |                                                 * than 32 */  | 
842  |  |  | 
843  |  |         /*  | 
844  |  |          * BN_to_montgomery can contaminate words above .top [in  | 
845  |  |          * BN_DEBUG build...  | 
846  |  |          */  | 
847  |  |         for (i = am.top; i < top; i++)  | 
848  |  |             am.d[i] = 0;  | 
849  |  |         for (i = tmp.top; i < top; i++)  | 
850  |  |             tmp.d[i] = 0;  | 
851  |  |  | 
852  |  |         bn_flip_n_scatter5_t4(tmp.d, top, powerbuf, 0);  | 
853  |  |         bn_flip_n_scatter5_t4(am.d, top, powerbuf, 1);  | 
854  |  |         if (!(*mul_worker) (tmp.d, am.d, am.d, np, n0) &&  | 
855  |  |             !(*mul_worker) (tmp.d, am.d, am.d, np, n0))  | 
856  |  |             bn_mul_mont_vis3(tmp.d, am.d, am.d, np, n0, top);  | 
857  |  |         bn_flip_n_scatter5_t4(tmp.d, top, powerbuf, 2);  | 
858  |  |  | 
859  |  |         for (i = 3; i < 32; i++) { | 
860  |  |             /* Calculate a^i = a^(i-1) * a */  | 
861  |  |             if (!(*mul_worker) (tmp.d, tmp.d, am.d, np, n0) &&  | 
862  |  |                 !(*mul_worker) (tmp.d, tmp.d, am.d, np, n0))  | 
863  |  |                 bn_mul_mont_vis3(tmp.d, tmp.d, am.d, np, n0, top);  | 
864  |  |             bn_flip_n_scatter5_t4(tmp.d, top, powerbuf, i);  | 
865  |  |         }  | 
866  |  |  | 
867  |  |         /* switch to 64-bit domain */  | 
868  |  |         np = alloca(top * sizeof(BN_ULONG));  | 
869  |  |         top /= 2;  | 
870  |  |         bn_flip_t4(np, mont->N.d, top);  | 
871  |  |  | 
872  |  |         /*  | 
873  |  |          * The exponent may not have a whole number of fixed-size windows.  | 
874  |  |          * To simplify the main loop, the initial window has between 1 and  | 
875  |  |          * full-window-size bits such that what remains is always a whole  | 
876  |  |          * number of windows  | 
877  |  |          */  | 
878  |  |         window0 = (bits - 1) % 5 + 1;  | 
879  |  |         wmask = (1 << window0) - 1;  | 
880  |  |         bits -= window0;  | 
881  |  |         wvalue = bn_get_bits(p, bits) & wmask;  | 
882  |  |         bn_gather5_t4(tmp.d, top, powerbuf, wvalue);  | 
883  |  |  | 
884  |  |         /*  | 
885  |  |          * Scan the exponent one window at a time starting from the most  | 
886  |  |          * significant bits.  | 
887  |  |          */  | 
888  |  |         while (bits > 0) { | 
889  |  |             if (bits < stride)  | 
890  |  |                 stride = bits;  | 
891  |  |             bits -= stride;  | 
892  |  |             wvalue = bn_get_bits(p, bits);  | 
893  |  |  | 
894  |  |             if ((*pwr5_worker) (tmp.d, np, n0, powerbuf, wvalue, stride))  | 
895  |  |                 continue;  | 
896  |  |             /* retry once and fall back */  | 
897  |  |             if ((*pwr5_worker) (tmp.d, np, n0, powerbuf, wvalue, stride))  | 
898  |  |                 continue;  | 
899  |  |  | 
900  |  |             bits += stride - 5;  | 
901  |  |             wvalue >>= stride - 5;  | 
902  |  |             wvalue &= 31;  | 
903  |  |             bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);  | 
904  |  |             bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);  | 
905  |  |             bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);  | 
906  |  |             bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);  | 
907  |  |             bn_mul_mont_t4(tmp.d, tmp.d, tmp.d, np, n0, top);  | 
908  |  |             bn_mul_mont_gather5_t4(tmp.d, tmp.d, powerbuf, np, n0, top,  | 
909  |  |                                    wvalue);  | 
910  |  |         }  | 
911  |  |  | 
912  |  |         bn_flip_t4(tmp.d, tmp.d, top);  | 
913  |  |         top *= 2;  | 
914  |  |         /* back to 32-bit domain */  | 
915  |  |         tmp.top = top;  | 
916  |  |         bn_correct_top(&tmp);  | 
917  |  |         OPENSSL_cleanse(np, top * sizeof(BN_ULONG));  | 
918  |  |     } else  | 
919  |  | #endif  | 
920  | 49.2k  | #if defined(OPENSSL_BN_ASM_MONT5)  | 
921  | 49.2k  |     if (window == 5 && top > 1) { | 
922  |  |         /*  | 
923  |  |          * This optimization uses ideas from https://eprint.iacr.org/2011/239,  | 
924  |  |          * specifically optimization of cache-timing attack countermeasures,  | 
925  |  |          * pre-computation optimization, and Almost Montgomery Multiplication.  | 
926  |  |          *  | 
927  |  |          * The paper discusses a 4-bit window to optimize 512-bit modular  | 
928  |  |          * exponentiation, used in RSA-1024 with CRT, but RSA-1024 is no longer  | 
929  |  |          * important.  | 
930  |  |          *  | 
931  |  |          * |bn_mul_mont_gather5| and |bn_power5| implement the "almost"  | 
932  |  |          * reduction variant, so the values here may not be fully reduced.  | 
933  |  |          * They are bounded by R (i.e. they fit in |top| words), not |m|.  | 
934  |  |          * Additionally, we pass these "almost" reduced inputs into  | 
935  |  |          * |bn_mul_mont|, which implements the normal reduction variant.  | 
936  |  |          * Given those inputs, |bn_mul_mont| may not give reduced  | 
937  |  |          * output, but it will still produce "almost" reduced output.  | 
938  |  |          */  | 
939  | 29.5k  |         void bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap,  | 
940  | 29.5k  |                                  const void *table, const BN_ULONG *np,  | 
941  | 29.5k  |                                  const BN_ULONG *n0, int num, int power);  | 
942  | 29.5k  |         void bn_scatter5(const BN_ULONG *inp, size_t num,  | 
943  | 29.5k  |                          void *table, size_t power);  | 
944  | 29.5k  |         void bn_gather5(BN_ULONG *out, size_t num, void *table, size_t power);  | 
945  | 29.5k  |         void bn_power5(BN_ULONG *rp, const BN_ULONG *ap,  | 
946  | 29.5k  |                        const void *table, const BN_ULONG *np,  | 
947  | 29.5k  |                        const BN_ULONG *n0, int num, int power);  | 
948  | 29.5k  |         int bn_get_bits5(const BN_ULONG *ap, int off);  | 
949  |  |  | 
950  | 29.5k  |         BN_ULONG *n0 = mont->n0, *np;  | 
951  |  |  | 
952  |  |         /*  | 
953  |  |          * BN_to_montgomery can contaminate words above .top [in  | 
954  |  |          * BN_DEBUG build...  | 
955  |  |          */  | 
956  | 29.5k  |         for (i = am.top; i < top; i++)  | 
957  | 0  |             am.d[i] = 0;  | 
958  | 29.5k  |         for (i = tmp.top; i < top; i++)  | 
959  | 0  |             tmp.d[i] = 0;  | 
960  |  |  | 
961  |  |         /*  | 
962  |  |          * copy mont->N.d[] to improve cache locality  | 
963  |  |          */  | 
964  | 635k  |         for (np = am.d + top, i = 0; i < top; i++)  | 
965  | 605k  |             np[i] = mont->N.d[i];  | 
966  |  |  | 
967  | 29.5k  |         bn_scatter5(tmp.d, top, powerbuf, 0);  | 
968  | 29.5k  |         bn_scatter5(am.d, am.top, powerbuf, 1);  | 
969  | 29.5k  |         bn_mul_mont(tmp.d, am.d, am.d, np, n0, top);  | 
970  | 29.5k  |         bn_scatter5(tmp.d, top, powerbuf, 2);  | 
971  |  |  | 
972  |  | # if 0  | 
973  |  |         for (i = 3; i < 32; i++) { | 
974  |  |             /* Calculate a^i = a^(i-1) * a */  | 
975  |  |             bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);  | 
976  |  |             bn_scatter5(tmp.d, top, powerbuf, i);  | 
977  |  |         }  | 
978  |  | # else  | 
979  |  |         /* same as above, but uses squaring for 1/2 of operations */  | 
980  | 118k  |         for (i = 4; i < 32; i *= 2) { | 
981  | 88.6k  |             bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);  | 
982  | 88.6k  |             bn_scatter5(tmp.d, top, powerbuf, i);  | 
983  | 88.6k  |         }  | 
984  | 118k  |         for (i = 3; i < 8; i += 2) { | 
985  | 88.6k  |             int j;  | 
986  | 88.6k  |             bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);  | 
987  | 88.6k  |             bn_scatter5(tmp.d, top, powerbuf, i);  | 
988  | 295k  |             for (j = 2 * i; j < 32; j *= 2) { | 
989  | 206k  |                 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);  | 
990  | 206k  |                 bn_scatter5(tmp.d, top, powerbuf, j);  | 
991  | 206k  |             }  | 
992  | 88.6k  |         }  | 
993  | 147k  |         for (; i < 16; i += 2) { | 
994  | 118k  |             bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);  | 
995  | 118k  |             bn_scatter5(tmp.d, top, powerbuf, i);  | 
996  | 118k  |             bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);  | 
997  | 118k  |             bn_scatter5(tmp.d, top, powerbuf, 2 * i);  | 
998  | 118k  |         }  | 
999  | 265k  |         for (; i < 32; i += 2) { | 
1000  | 236k  |             bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1);  | 
1001  | 236k  |             bn_scatter5(tmp.d, top, powerbuf, i);  | 
1002  | 236k  |         }  | 
1003  | 29.5k  | # endif  | 
1004  |  |         /*  | 
1005  |  |          * The exponent may not have a whole number of fixed-size windows.  | 
1006  |  |          * To simplify the main loop, the initial window has between 1 and  | 
1007  |  |          * full-window-size bits such that what remains is always a whole  | 
1008  |  |          * number of windows  | 
1009  |  |          */  | 
1010  | 29.5k  |         window0 = (bits - 1) % 5 + 1;  | 
1011  | 29.5k  |         wmask = (1 << window0) - 1;  | 
1012  | 29.5k  |         bits -= window0;  | 
1013  | 29.5k  |         wvalue = bn_get_bits(p, bits) & wmask;  | 
1014  | 29.5k  |         bn_gather5(tmp.d, top, powerbuf, wvalue);  | 
1015  |  |  | 
1016  |  |         /*  | 
1017  |  |          * Scan the exponent one window at a time starting from the most  | 
1018  |  |          * significant bits.  | 
1019  |  |          */  | 
1020  | 29.5k  |         if (top & 7) { | 
1021  | 1.59M  |             while (bits > 0) { | 
1022  | 1.59M  |                 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);  | 
1023  | 1.59M  |                 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);  | 
1024  | 1.59M  |                 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);  | 
1025  | 1.59M  |                 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);  | 
1026  | 1.59M  |                 bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);  | 
1027  | 1.59M  |                 bn_mul_mont_gather5(tmp.d, tmp.d, powerbuf, np, n0, top,  | 
1028  | 1.59M  |                                     bn_get_bits5(p->d, bits -= 5));  | 
1029  | 1.59M  |             }  | 
1030  | 22.6k  |         } else { | 
1031  | 4.69M  |             while (bits > 0) { | 
1032  | 4.67M  |                 bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top,  | 
1033  | 4.67M  |                           bn_get_bits5(p->d, bits -= 5));  | 
1034  | 4.67M  |             }  | 
1035  | 22.6k  |         }  | 
1036  |  |  | 
1037  | 29.5k  |         tmp.top = top;  | 
1038  |  |         /*  | 
1039  |  |          * The result is now in |tmp| in Montgomery form, but it may not be  | 
1040  |  |          * fully reduced. This is within bounds for |BN_from_montgomery|  | 
1041  |  |          * (tmp < R <= m*R) so it will, when converting from Montgomery form,  | 
1042  |  |          * produce a fully reduced result.  | 
1043  |  |          *  | 
1044  |  |          * This differs from Figure 2 of the paper, which uses AMM(h, 1) to  | 
1045  |  |          * convert from Montgomery form with unreduced output, followed by an  | 
1046  |  |          * extra reduction step. In the paper's terminology, we replace  | 
1047  |  |          * steps 9 and 10 with MM(h, 1).  | 
1048  |  |          */  | 
1049  | 29.5k  |     } else  | 
1050  | 19.6k  | #endif  | 
1051  | 19.6k  |     { | 
1052  | 19.7k  |  fallback:  | 
1053  | 19.7k  |         if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, window))  | 
1054  | 0  |             goto err;  | 
1055  | 19.7k  |         if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, window))  | 
1056  | 0  |             goto err;  | 
1057  |  |  | 
1058  |  |         /*  | 
1059  |  |          * If the window size is greater than 1, then calculate  | 
1060  |  |          * val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1) (even  | 
1061  |  |          * powers could instead be computed as (a^(i/2))^2 to use the slight  | 
1062  |  |          * performance advantage of sqr over mul).  | 
1063  |  |          */  | 
1064  | 19.7k  |         if (window > 1) { | 
1065  | 19.7k  |             if (!bn_mul_mont_fixed_top(&tmp, &am, &am, mont, ctx))  | 
1066  | 0  |                 goto err;  | 
1067  | 19.7k  |             if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 2,  | 
1068  | 19.7k  |                                               window))  | 
1069  | 0  |                 goto err;  | 
1070  | 137k  |             for (i = 3; i < numPowers; i++) { | 
1071  |  |                 /* Calculate a^i = a^(i-1) * a */  | 
1072  | 117k  |                 if (!bn_mul_mont_fixed_top(&tmp, &am, &tmp, mont, ctx))  | 
1073  | 0  |                     goto err;  | 
1074  | 117k  |                 if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, i,  | 
1075  | 117k  |                                                   window))  | 
1076  | 0  |                     goto err;  | 
1077  | 117k  |             }  | 
1078  | 19.7k  |         }  | 
1079  |  |  | 
1080  |  |         /*  | 
1081  |  |          * The exponent may not have a whole number of fixed-size windows.  | 
1082  |  |          * To simplify the main loop, the initial window has between 1 and  | 
1083  |  |          * full-window-size bits such that what remains is always a whole  | 
1084  |  |          * number of windows  | 
1085  |  |          */  | 
1086  | 19.7k  |         window0 = (bits - 1) % window + 1;  | 
1087  | 19.7k  |         wmask = (1 << window0) - 1;  | 
1088  | 19.7k  |         bits -= window0;  | 
1089  | 19.7k  |         wvalue = bn_get_bits(p, bits) & wmask;  | 
1090  | 19.7k  |         if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp, top, powerbuf, wvalue,  | 
1091  | 19.7k  |                                             window))  | 
1092  | 0  |             goto err;  | 
1093  |  |  | 
1094  | 19.7k  |         wmask = (1 << window) - 1;  | 
1095  |  |         /*  | 
1096  |  |          * Scan the exponent one window at a time starting from the most  | 
1097  |  |          * significant bits.  | 
1098  |  |          */  | 
1099  | 490k  |         while (bits > 0) { | 
1100  |  |  | 
1101  |  |             /* Square the result window-size times */  | 
1102  | 1.98M  |             for (i = 0; i < window; i++)  | 
1103  | 1.51M  |                 if (!bn_mul_mont_fixed_top(&tmp, &tmp, &tmp, mont, ctx))  | 
1104  | 0  |                     goto err;  | 
1105  |  |  | 
1106  |  |             /*  | 
1107  |  |              * Get a window's worth of bits from the exponent  | 
1108  |  |              * This avoids calling BN_is_bit_set for each bit, which  | 
1109  |  |              * is not only slower but also makes each bit vulnerable to  | 
1110  |  |              * EM (and likely other) side-channel attacks like One&Done  | 
1111  |  |              * (for details see "One&Done: A Single-Decryption EM-Based  | 
1112  |  |              *  Attack on OpenSSL's Constant-Time Blinded RSA" by M. Alam,  | 
1113  |  |              *  H. Khan, M. Dey, N. Sinha, R. Callan, A. Zajic, and  | 
1114  |  |              *  M. Prvulovic, in USENIX Security'18)  | 
1115  |  |              */  | 
1116  | 470k  |             bits -= window;  | 
1117  | 470k  |             wvalue = bn_get_bits(p, bits) & wmask;  | 
1118  |  |             /*  | 
1119  |  |              * Fetch the appropriate pre-computed value from the pre-buf  | 
1120  |  |              */  | 
1121  | 470k  |             if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&am, top, powerbuf, wvalue,  | 
1122  | 470k  |                                                 window))  | 
1123  | 0  |                 goto err;  | 
1124  |  |  | 
1125  |  |             /* Multiply the result into the intermediate result */  | 
1126  | 470k  |             if (!bn_mul_mont_fixed_top(&tmp, &tmp, &am, mont, ctx))  | 
1127  | 0  |                 goto err;  | 
1128  | 470k  |         }  | 
1129  | 19.7k  |     }  | 
1130  |  |  | 
1131  |  |     /*  | 
1132  |  |      * Done with zero-padded intermediate BIGNUMs. Final BN_from_montgomery  | 
1133  |  |      * removes padding [if any] and makes return value suitable for public  | 
1134  |  |      * API consumer.  | 
1135  |  |      */  | 
1136  |  | #if defined(SPARC_T4_MONT)  | 
1137  |  |     if (OPENSSL_sparcv9cap_P[0] & (SPARCV9_VIS3 | SPARCV9_PREFER_FPU)) { | 
1138  |  |         am.d[0] = 1;            /* borrow am */  | 
1139  |  |         for (i = 1; i < top; i++)  | 
1140  |  |             am.d[i] = 0;  | 
1141  |  |         if (!BN_mod_mul_montgomery(rr, &tmp, &am, mont, ctx))  | 
1142  |  |             goto err;  | 
1143  |  |     } else  | 
1144  |  | #endif  | 
1145  | 49.3k  |     if (!bn_from_mont_fixed_top(rr, &tmp, mont, ctx))  | 
1146  | 0  |         goto err;  | 
1147  | 49.3k  |     ret = 1;  | 
1148  | 49.7k  |  err:  | 
1149  | 49.7k  |     if (in_mont == NULL)  | 
1150  | 2.24k  |         BN_MONT_CTX_free(mont);  | 
1151  | 49.7k  |     if (powerbuf != NULL) { | 
1152  | 49.3k  |         OPENSSL_cleanse(powerbuf, powerbufLen);  | 
1153  | 49.3k  |         OPENSSL_free(powerbufFree);  | 
1154  | 49.3k  |     }  | 
1155  | 49.7k  |     BN_CTX_end(ctx);  | 
1156  | 49.7k  |     return ret;  | 
1157  | 49.3k  | }  | 
1158  |  |  | 
1159  |  | int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,  | 
1160  |  |                               const BIGNUM *m, BN_CTX *ctx,  | 
1161  |  |                               BN_MONT_CTX *in_mont)  | 
1162  | 49.2k  | { | 
1163  | 49.2k  |     bn_check_top(a);  | 
1164  | 49.2k  |     bn_check_top(p);  | 
1165  | 49.2k  |     bn_check_top(m);  | 
1166  | 49.2k  |     if (!bn_mod_exp_mont_fixed_top(rr, a, p, m, ctx, in_mont))  | 
1167  | 0  |         return 0;  | 
1168  | 49.2k  |     bn_correct_top(rr);  | 
1169  | 49.2k  |     return 1;  | 
1170  | 49.2k  | }  | 
1171  |  |  | 
1172  |  | int BN_mod_exp_mont_word(BIGNUM *rr, BN_ULONG a, const BIGNUM *p,  | 
1173  |  |                          const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)  | 
1174  | 51.5k  | { | 
1175  | 51.5k  |     BN_MONT_CTX *mont = NULL;  | 
1176  | 51.5k  |     int b, bits, ret = 0;  | 
1177  | 51.5k  |     int r_is_one;  | 
1178  | 51.5k  |     BN_ULONG w, next_w;  | 
1179  | 51.5k  |     BIGNUM *r, *t;  | 
1180  | 51.5k  |     BIGNUM *swap_tmp;  | 
1181  | 51.5k  | #define BN_MOD_MUL_WORD(r, w, m) \  | 
1182  | 4.06M  |                 (BN_mul_word(r, (w)) && \  | 
1183  | 4.06M  |                 (/* BN_ucmp(r, (m)) < 0 ? 1 :*/  \  | 
1184  | 4.06M  |                         (BN_mod(t, r, m, ctx) && (swap_tmp = r, r = t, t = swap_tmp, 1))))  | 
1185  |  |     /*  | 
1186  |  |      * BN_MOD_MUL_WORD is only used with 'w' large, so the BN_ucmp test is  | 
1187  |  |      * probably more overhead than always using BN_mod (which uses BN_copy if  | 
1188  |  |      * a similar test returns true).  | 
1189  |  |      */  | 
1190  |  |     /*  | 
1191  |  |      * We can use BN_mod and do not need BN_nnmod because our accumulator is  | 
1192  |  |      * never negative (the result of BN_mod does not depend on the sign of  | 
1193  |  |      * the modulus).  | 
1194  |  |      */  | 
1195  | 51.5k  | #define BN_TO_MONTGOMERY_WORD(r, w, mont) \  | 
1196  | 51.5k  |                 (BN_set_word(r, (w)) && BN_to_montgomery(r, r, (mont), ctx))  | 
1197  |  |  | 
1198  | 51.5k  |     if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0  | 
1199  | 51.5k  |             || BN_get_flags(m, BN_FLG_CONSTTIME) != 0) { | 
1200  |  |         /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */  | 
1201  | 0  |         ERR_raise(ERR_LIB_BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);  | 
1202  | 0  |         return 0;  | 
1203  | 0  |     }  | 
1204  |  |  | 
1205  | 51.5k  |     bn_check_top(p);  | 
1206  | 51.5k  |     bn_check_top(m);  | 
1207  |  |  | 
1208  | 51.5k  |     if (!BN_is_odd(m)) { | 
1209  | 0  |         ERR_raise(ERR_LIB_BN, BN_R_CALLED_WITH_EVEN_MODULUS);  | 
1210  | 0  |         return 0;  | 
1211  | 0  |     }  | 
1212  | 51.5k  |     if (m->top == 1)  | 
1213  | 1.85k  |         a %= m->d[0];           /* make sure that 'a' is reduced */  | 
1214  |  |  | 
1215  | 51.5k  |     bits = BN_num_bits(p);  | 
1216  | 51.5k  |     if (bits == 0) { | 
1217  |  |         /* x**0 mod 1, or x**0 mod -1 is still zero. */  | 
1218  | 53  |         if (BN_abs_is_word(m, 1)) { | 
1219  | 4  |             ret = 1;  | 
1220  | 4  |             BN_zero(rr);  | 
1221  | 49  |         } else { | 
1222  | 49  |             ret = BN_one(rr);  | 
1223  | 49  |         }  | 
1224  | 53  |         return ret;  | 
1225  | 53  |     }  | 
1226  | 51.5k  |     if (a == 0) { | 
1227  | 4  |         BN_zero(rr);  | 
1228  | 4  |         ret = 1;  | 
1229  | 4  |         return ret;  | 
1230  | 4  |     }  | 
1231  |  |  | 
1232  | 51.5k  |     BN_CTX_start(ctx);  | 
1233  | 51.5k  |     r = BN_CTX_get(ctx);  | 
1234  | 51.5k  |     t = BN_CTX_get(ctx);  | 
1235  | 51.5k  |     if (t == NULL)  | 
1236  | 0  |         goto err;  | 
1237  |  |  | 
1238  | 51.5k  |     if (in_mont != NULL)  | 
1239  | 0  |         mont = in_mont;  | 
1240  | 51.5k  |     else { | 
1241  | 51.5k  |         if ((mont = BN_MONT_CTX_new()) == NULL)  | 
1242  | 0  |             goto err;  | 
1243  | 51.5k  |         if (!BN_MONT_CTX_set(mont, m, ctx))  | 
1244  | 0  |             goto err;  | 
1245  | 51.5k  |     }  | 
1246  |  |  | 
1247  | 51.5k  |     r_is_one = 1;               /* except for Montgomery factor */  | 
1248  |  |  | 
1249  |  |     /* bits-1 >= 0 */  | 
1250  |  |  | 
1251  |  |     /* The result is accumulated in the product r*w. */  | 
1252  | 51.5k  |     w = a;                      /* bit 'bits-1' of 'p' is always set */  | 
1253  | 10.5M  |     for (b = bits - 2; b >= 0; b--) { | 
1254  |  |         /* First, square r*w. */  | 
1255  | 10.5M  |         next_w = w * w;  | 
1256  | 10.5M  |         if ((next_w / w) != w) { /* overflow */ | 
1257  | 3.46M  |             if (r_is_one) { | 
1258  | 46.6k  |                 if (!BN_TO_MONTGOMERY_WORD(r, w, mont))  | 
1259  | 0  |                     goto err;  | 
1260  | 46.6k  |                 r_is_one = 0;  | 
1261  | 3.41M  |             } else { | 
1262  | 3.41M  |                 if (!BN_MOD_MUL_WORD(r, w, m))  | 
1263  | 0  |                     goto err;  | 
1264  | 3.41M  |             }  | 
1265  | 3.46M  |             next_w = 1;  | 
1266  | 3.46M  |         }  | 
1267  | 10.5M  |         w = next_w;  | 
1268  | 10.5M  |         if (!r_is_one) { | 
1269  | 10.4M  |             if (!BN_mod_mul_montgomery(r, r, r, mont, ctx))  | 
1270  | 0  |                 goto err;  | 
1271  | 10.4M  |         }  | 
1272  |  |  | 
1273  |  |         /* Second, multiply r*w by 'a' if exponent bit is set. */  | 
1274  | 10.5M  |         if (BN_is_bit_set(p, b)) { | 
1275  | 7.93M  |             next_w = w * a;  | 
1276  | 7.93M  |             if ((next_w / a) != w) { /* overflow */ | 
1277  | 604k  |                 if (r_is_one) { | 
1278  | 4.58k  |                     if (!BN_TO_MONTGOMERY_WORD(r, w, mont))  | 
1279  | 0  |                         goto err;  | 
1280  | 4.58k  |                     r_is_one = 0;  | 
1281  | 600k  |                 } else { | 
1282  | 600k  |                     if (!BN_MOD_MUL_WORD(r, w, m))  | 
1283  | 0  |                         goto err;  | 
1284  | 600k  |                 }  | 
1285  | 604k  |                 next_w = a;  | 
1286  | 604k  |             }  | 
1287  | 7.93M  |             w = next_w;  | 
1288  | 7.93M  |         }  | 
1289  | 10.5M  |     }  | 
1290  |  |  | 
1291  |  |     /* Finally, set r:=r*w. */  | 
1292  | 51.5k  |     if (w != 1) { | 
1293  | 47.0k  |         if (r_is_one) { | 
1294  | 283  |             if (!BN_TO_MONTGOMERY_WORD(r, w, mont))  | 
1295  | 0  |                 goto err;  | 
1296  | 283  |             r_is_one = 0;  | 
1297  | 46.7k  |         } else { | 
1298  | 46.7k  |             if (!BN_MOD_MUL_WORD(r, w, m))  | 
1299  | 0  |                 goto err;  | 
1300  | 46.7k  |         }  | 
1301  | 47.0k  |     }  | 
1302  |  |  | 
1303  | 51.5k  |     if (r_is_one) {             /* can happen only if a == 1 */ | 
1304  | 30  |         if (!BN_one(rr))  | 
1305  | 0  |             goto err;  | 
1306  | 51.4k  |     } else { | 
1307  | 51.4k  |         if (!BN_from_montgomery(rr, r, mont, ctx))  | 
1308  | 0  |             goto err;  | 
1309  | 51.4k  |     }  | 
1310  | 51.5k  |     ret = 1;  | 
1311  | 51.5k  |  err:  | 
1312  | 51.5k  |     if (in_mont == NULL)  | 
1313  | 51.5k  |         BN_MONT_CTX_free(mont);  | 
1314  | 51.5k  |     BN_CTX_end(ctx);  | 
1315  | 51.5k  |     bn_check_top(rr);  | 
1316  | 51.5k  |     return ret;  | 
1317  | 51.5k  | }  | 
1318  |  |  | 
1319  |  | /* The old fallback, simple version :-) */  | 
1320  |  | int BN_mod_exp_simple(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,  | 
1321  |  |                       const BIGNUM *m, BN_CTX *ctx)  | 
1322  | 11.6k  | { | 
1323  | 11.6k  |     int i, j, bits, ret = 0, wstart, wend, window, wvalue;  | 
1324  | 11.6k  |     int start = 1;  | 
1325  | 11.6k  |     BIGNUM *d;  | 
1326  |  |     /* Table of variables obtained from 'ctx' */  | 
1327  | 11.6k  |     BIGNUM *val[TABLE_SIZE];  | 
1328  |  |  | 
1329  | 11.6k  |     if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0  | 
1330  | 11.6k  |             || BN_get_flags(a, BN_FLG_CONSTTIME) != 0  | 
1331  | 11.6k  |             || BN_get_flags(m, BN_FLG_CONSTTIME) != 0) { | 
1332  |  |         /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */  | 
1333  | 0  |         ERR_raise(ERR_LIB_BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);  | 
1334  | 0  |         return 0;  | 
1335  | 0  |     }  | 
1336  |  |  | 
1337  | 11.6k  |     if (r == m) { | 
1338  | 0  |         ERR_raise(ERR_LIB_BN, ERR_R_PASSED_INVALID_ARGUMENT);  | 
1339  | 0  |         return 0;  | 
1340  | 0  |     }  | 
1341  |  |  | 
1342  | 11.6k  |     bits = BN_num_bits(p);  | 
1343  | 11.6k  |     if (bits == 0) { | 
1344  |  |         /* x**0 mod 1, or x**0 mod -1 is still zero. */  | 
1345  | 695  |         if (BN_abs_is_word(m, 1)) { | 
1346  | 10  |             ret = 1;  | 
1347  | 10  |             BN_zero(r);  | 
1348  | 685  |         } else { | 
1349  | 685  |             ret = BN_one(r);  | 
1350  | 685  |         }  | 
1351  | 695  |         return ret;  | 
1352  | 695  |     }  | 
1353  |  |  | 
1354  | 10.9k  |     BN_CTX_start(ctx);  | 
1355  | 10.9k  |     d = BN_CTX_get(ctx);  | 
1356  | 10.9k  |     val[0] = BN_CTX_get(ctx);  | 
1357  | 10.9k  |     if (val[0] == NULL)  | 
1358  | 0  |         goto err;  | 
1359  |  |  | 
1360  | 10.9k  |     if (!BN_nnmod(val[0], a, m, ctx))  | 
1361  | 0  |         goto err;               /* 1 */  | 
1362  | 10.9k  |     if (BN_is_zero(val[0])) { | 
1363  | 2.85k  |         BN_zero(r);  | 
1364  | 2.85k  |         ret = 1;  | 
1365  | 2.85k  |         goto err;  | 
1366  | 2.85k  |     }  | 
1367  |  |  | 
1368  | 8.08k  |     window = BN_window_bits_for_exponent_size(bits);  | 
1369  | 8.08k  |     if (window > 1) { | 
1370  | 1.25k  |         if (!BN_mod_mul(d, val[0], val[0], m, ctx))  | 
1371  | 0  |             goto err;           /* 2 */  | 
1372  | 1.25k  |         j = 1 << (window - 1);  | 
1373  | 10.4k  |         for (i = 1; i < j; i++) { | 
1374  | 9.16k  |             if (((val[i] = BN_CTX_get(ctx)) == NULL) ||  | 
1375  | 9.16k  |                 !BN_mod_mul(val[i], val[i - 1], d, m, ctx))  | 
1376  | 0  |                 goto err;  | 
1377  | 9.16k  |         }  | 
1378  | 1.25k  |     }  | 
1379  |  |  | 
1380  | 8.08k  |     start = 1;                  /* This is used to avoid multiplication etc  | 
1381  |  |                                  * when there is only the value '1' in the  | 
1382  |  |                                  * buffer. */  | 
1383  | 8.08k  |     wvalue = 0;                 /* The 'value' of the window */  | 
1384  | 8.08k  |     wstart = bits - 1;          /* The top bit of the window */  | 
1385  | 8.08k  |     wend = 0;                   /* The bottom bit of the window */  | 
1386  |  |  | 
1387  | 8.08k  |     if (r == p) { | 
1388  | 0  |         BIGNUM *p_dup = BN_CTX_get(ctx);  | 
1389  |  | 
  | 
1390  | 0  |         if (p_dup == NULL || BN_copy(p_dup, p) == NULL)  | 
1391  | 0  |             goto err;  | 
1392  | 0  |         p = p_dup;  | 
1393  | 0  |     }  | 
1394  |  |  | 
1395  | 8.08k  |     if (!BN_one(r))  | 
1396  | 0  |         goto err;  | 
1397  |  |  | 
1398  | 184k  |     for (;;) { | 
1399  | 184k  |         if (BN_is_bit_set(p, wstart) == 0) { | 
1400  | 102k  |             if (!start)  | 
1401  | 102k  |                 if (!BN_mod_mul(r, r, r, m, ctx))  | 
1402  | 0  |                     goto err;  | 
1403  | 102k  |             if (wstart == 0)  | 
1404  | 2.39k  |                 break;  | 
1405  | 100k  |             wstart--;  | 
1406  | 100k  |             continue;  | 
1407  | 102k  |         }  | 
1408  |  |         /*  | 
1409  |  |          * We now have wstart on a 'set' bit, we now need to work out how bit  | 
1410  |  |          * a window to do.  To do this we need to scan forward until the last  | 
1411  |  |          * set bit before the end of the window  | 
1412  |  |          */  | 
1413  | 81.3k  |         wvalue = 1;  | 
1414  | 81.3k  |         wend = 0;  | 
1415  | 216k  |         for (i = 1; i < window; i++) { | 
1416  | 136k  |             if (wstart - i < 0)  | 
1417  | 662  |                 break;  | 
1418  | 135k  |             if (BN_is_bit_set(p, wstart - i)) { | 
1419  | 93.8k  |                 wvalue <<= (i - wend);  | 
1420  | 93.8k  |                 wvalue |= 1;  | 
1421  | 93.8k  |                 wend = i;  | 
1422  | 93.8k  |             }  | 
1423  | 135k  |         }  | 
1424  |  |  | 
1425  |  |         /* wend is the size of the current window */  | 
1426  | 81.3k  |         j = wend + 1;  | 
1427  |  |         /* add the 'bytes above' */  | 
1428  | 81.3k  |         if (!start)  | 
1429  | 255k  |             for (i = 0; i < j; i++) { | 
1430  | 182k  |                 if (!BN_mod_mul(r, r, r, m, ctx))  | 
1431  | 0  |                     goto err;  | 
1432  | 182k  |             }  | 
1433  |  |  | 
1434  |  |         /* wvalue will be an odd number < 2^window */  | 
1435  | 81.3k  |         if (!BN_mod_mul(r, r, val[wvalue >> 1], m, ctx))  | 
1436  | 0  |             goto err;  | 
1437  |  |  | 
1438  |  |         /* move the 'window' down further */  | 
1439  | 81.3k  |         wstart -= wend + 1;  | 
1440  | 81.3k  |         wvalue = 0;  | 
1441  | 81.3k  |         start = 0;  | 
1442  | 81.3k  |         if (wstart < 0)  | 
1443  | 5.69k  |             break;  | 
1444  | 81.3k  |     }  | 
1445  | 8.08k  |     ret = 1;  | 
1446  | 10.9k  |  err:  | 
1447  | 10.9k  |     BN_CTX_end(ctx);  | 
1448  | 10.9k  |     bn_check_top(r);  | 
1449  | 10.9k  |     return ret;  | 
1450  | 8.08k  | }  | 
1451  |  |  | 
1452  |  | /*  | 
1453  |  |  * This is a variant of modular exponentiation optimization that does  | 
1454  |  |  * parallel 2-primes exponentiation using 256-bit (AVX512VL) AVX512_IFMA ISA  | 
1455  |  |  * in 52-bit binary redundant representation.  | 
1456  |  |  * If such instructions are not available, or input data size is not supported,  | 
1457  |  |  * it falls back to two BN_mod_exp_mont_consttime() calls.  | 
1458  |  |  */  | 
1459  |  | int BN_mod_exp_mont_consttime_x2(BIGNUM *rr1, const BIGNUM *a1, const BIGNUM *p1,  | 
1460  |  |                                  const BIGNUM *m1, BN_MONT_CTX *in_mont1,  | 
1461  |  |                                  BIGNUM *rr2, const BIGNUM *a2, const BIGNUM *p2,  | 
1462  |  |                                  const BIGNUM *m2, BN_MONT_CTX *in_mont2,  | 
1463  |  |                                  BN_CTX *ctx)  | 
1464  | 1.39k  | { | 
1465  | 1.39k  |     int ret = 0;  | 
1466  |  |  | 
1467  | 1.39k  | #ifdef RSAZ_ENABLED  | 
1468  | 1.39k  |     BN_MONT_CTX *mont1 = NULL;  | 
1469  | 1.39k  |     BN_MONT_CTX *mont2 = NULL;  | 
1470  |  |  | 
1471  | 1.39k  |     if (ossl_rsaz_avx512ifma_eligible() &&  | 
1472  | 1.39k  |         ((a1->top == 16) && (p1->top == 16) && (BN_num_bits(m1) == 1024) &&  | 
1473  | 0  |          (a2->top == 16) && (p2->top == 16) && (BN_num_bits(m2) == 1024))) { | 
1474  |  | 
  | 
1475  | 0  |         if (bn_wexpand(rr1, 16) == NULL)  | 
1476  | 0  |             goto err;  | 
1477  | 0  |         if (bn_wexpand(rr2, 16) == NULL)  | 
1478  | 0  |             goto err;  | 
1479  |  |  | 
1480  |  |         /*  Ensure that montgomery contexts are initialized */  | 
1481  | 0  |         if (in_mont1 != NULL) { | 
1482  | 0  |             mont1 = in_mont1;  | 
1483  | 0  |         } else { | 
1484  | 0  |             if ((mont1 = BN_MONT_CTX_new()) == NULL)  | 
1485  | 0  |                 goto err;  | 
1486  | 0  |             if (!BN_MONT_CTX_set(mont1, m1, ctx))  | 
1487  | 0  |                 goto err;  | 
1488  | 0  |         }  | 
1489  | 0  |         if (in_mont2 != NULL) { | 
1490  | 0  |             mont2 = in_mont2;  | 
1491  | 0  |         } else { | 
1492  | 0  |             if ((mont2 = BN_MONT_CTX_new()) == NULL)  | 
1493  | 0  |                 goto err;  | 
1494  | 0  |             if (!BN_MONT_CTX_set(mont2, m2, ctx))  | 
1495  | 0  |                 goto err;  | 
1496  | 0  |         }  | 
1497  |  |  | 
1498  | 0  |         ret = ossl_rsaz_mod_exp_avx512_x2(rr1->d, a1->d, p1->d, m1->d,  | 
1499  | 0  |                                           mont1->RR.d, mont1->n0[0],  | 
1500  | 0  |                                           rr2->d, a2->d, p2->d, m2->d,  | 
1501  | 0  |                                           mont2->RR.d, mont2->n0[0],  | 
1502  | 0  |                                           1024 /* factor bit size */);  | 
1503  |  | 
  | 
1504  | 0  |         rr1->top = 16;  | 
1505  | 0  |         rr1->neg = 0;  | 
1506  | 0  |         bn_correct_top(rr1);  | 
1507  | 0  |         bn_check_top(rr1);  | 
1508  |  | 
  | 
1509  | 0  |         rr2->top = 16;  | 
1510  | 0  |         rr2->neg = 0;  | 
1511  | 0  |         bn_correct_top(rr2);  | 
1512  | 0  |         bn_check_top(rr2);  | 
1513  |  | 
  | 
1514  | 0  |         goto err;  | 
1515  | 0  |     }  | 
1516  | 1.39k  | #endif  | 
1517  |  |  | 
1518  |  |     /* rr1 = a1^p1 mod m1 */  | 
1519  | 1.39k  |     ret = BN_mod_exp_mont_consttime(rr1, a1, p1, m1, ctx, in_mont1);  | 
1520  |  |     /* rr2 = a2^p2 mod m2 */  | 
1521  | 1.39k  |     ret &= BN_mod_exp_mont_consttime(rr2, a2, p2, m2, ctx, in_mont2);  | 
1522  |  |  | 
1523  | 1.39k  | #ifdef RSAZ_ENABLED  | 
1524  | 1.39k  | err:  | 
1525  | 1.39k  |     if (in_mont2 == NULL)  | 
1526  | 0  |         BN_MONT_CTX_free(mont2);  | 
1527  | 1.39k  |     if (in_mont1 == NULL)  | 
1528  | 0  |         BN_MONT_CTX_free(mont1);  | 
1529  | 1.39k  | #endif  | 
1530  |  |  | 
1531  | 1.39k  |     return ret;  | 
1532  | 1.39k  | }  |