/src/openssl32/crypto/bn/rsaz_exp.c
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /*  | 
2  |  |  * Copyright 2013-2023 The OpenSSL Project Authors. All Rights Reserved.  | 
3  |  |  * Copyright (c) 2012, Intel Corporation. All Rights Reserved.  | 
4  |  |  *  | 
5  |  |  * Licensed under the Apache License 2.0 (the "License").  You may not use  | 
6  |  |  * this file except in compliance with the License.  You can obtain a copy  | 
7  |  |  * in the file LICENSE in the source distribution or at  | 
8  |  |  * https://www.openssl.org/source/license.html  | 
9  |  |  *  | 
10  |  |  * Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)  | 
11  |  |  * (1) Intel Corporation, Israel Development Center, Haifa, Israel  | 
12  |  |  * (2) University of Haifa, Israel  | 
13  |  |  */  | 
14  |  |  | 
15  |  | #include <openssl/opensslconf.h>  | 
16  |  | #include "internal/common.h"  | 
17  |  | #include "rsaz_exp.h"  | 
18  |  |  | 
19  |  | #ifndef RSAZ_ENABLED  | 
20  |  | NON_EMPTY_TRANSLATION_UNIT  | 
21  |  | #else  | 
22  |  |  | 
23  |  | /*  | 
24  |  |  * See crypto/bn/asm/rsaz-avx2.pl for further details.  | 
25  |  |  */  | 
26  |  | void rsaz_1024_norm2red_avx2(void *red, const void *norm);  | 
27  |  | void rsaz_1024_mul_avx2(void *ret, const void *a, const void *b,  | 
28  |  |                         const void *n, BN_ULONG k);  | 
29  |  | void rsaz_1024_sqr_avx2(void *ret, const void *a, const void *n, BN_ULONG k,  | 
30  |  |                         int cnt);  | 
31  |  | void rsaz_1024_scatter5_avx2(void *tbl, const void *val, int i);  | 
32  |  | void rsaz_1024_gather5_avx2(void *val, const void *tbl, int i);  | 
33  |  | void rsaz_1024_red2norm_avx2(void *norm, const void *red);  | 
34  |  |  | 
35  |  | #if defined(__SUNPRO_C)  | 
36  |  | # pragma align 64(one,two80)  | 
37  |  | #endif  | 
38  |  |  | 
39  |  | ALIGN64 static const BN_ULONG one[40] = { | 
40  |  |     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
41  |  |     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  | 
42  |  | };  | 
43  |  |  | 
44  |  | ALIGN64 static const BN_ULONG two80[40] = { | 
45  |  |     0, 0, 1 << 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  | 
46  |  |     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  | 
47  |  | };  | 
48  |  |  | 
49  |  | void RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16],  | 
50  |  |                             const BN_ULONG base_norm[16],  | 
51  |  |                             const BN_ULONG exponent[16],  | 
52  |  |                             const BN_ULONG m_norm[16], const BN_ULONG RR[16],  | 
53  |  |                             BN_ULONG k0)  | 
54  | 0  | { | 
55  | 0  |     unsigned char storage[320 * 3 + 32 * 9 * 16 + 64]; /* 5.5KB */  | 
56  | 0  |     unsigned char *p_str = storage + (64 - ((size_t)storage % 64));  | 
57  | 0  |     unsigned char *a_inv, *m, *result;  | 
58  | 0  |     unsigned char *table_s = p_str + 320 * 3;  | 
59  | 0  |     unsigned char *R2 = table_s; /* borrow */  | 
60  | 0  |     int index;  | 
61  | 0  |     int wvalue;  | 
62  | 0  |     BN_ULONG tmp[16];  | 
63  |  | 
  | 
64  | 0  |     if ((((size_t)p_str & 4095) + 320) >> 12) { | 
65  | 0  |         result = p_str;  | 
66  | 0  |         a_inv = p_str + 320;  | 
67  | 0  |         m = p_str + 320 * 2;    /* should not cross page */  | 
68  | 0  |     } else { | 
69  | 0  |         m = p_str;              /* should not cross page */  | 
70  | 0  |         result = p_str + 320;  | 
71  | 0  |         a_inv = p_str + 320 * 2;  | 
72  | 0  |     }  | 
73  |  | 
  | 
74  | 0  |     rsaz_1024_norm2red_avx2(m, m_norm);  | 
75  | 0  |     rsaz_1024_norm2red_avx2(a_inv, base_norm);  | 
76  | 0  |     rsaz_1024_norm2red_avx2(R2, RR);  | 
77  |  | 
  | 
78  | 0  |     rsaz_1024_mul_avx2(R2, R2, R2, m, k0);  | 
79  | 0  |     rsaz_1024_mul_avx2(R2, R2, two80, m, k0);  | 
80  |  |  | 
81  |  |     /* table[0] = 1 */  | 
82  | 0  |     rsaz_1024_mul_avx2(result, R2, one, m, k0);  | 
83  |  |     /* table[1] = a_inv^1 */  | 
84  | 0  |     rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0);  | 
85  |  | 
  | 
86  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 0);  | 
87  | 0  |     rsaz_1024_scatter5_avx2(table_s, a_inv, 1);  | 
88  |  |  | 
89  |  |     /* table[2] = a_inv^2 */  | 
90  | 0  |     rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1);  | 
91  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 2);  | 
92  |  | #if 0  | 
93  |  |     /* this is almost 2x smaller and less than 1% slower */  | 
94  |  |     for (index = 3; index < 32; index++) { | 
95  |  |         rsaz_1024_mul_avx2(result, result, a_inv, m, k0);  | 
96  |  |         rsaz_1024_scatter5_avx2(table_s, result, index);  | 
97  |  |     }  | 
98  |  | #else  | 
99  |  |     /* table[4] = a_inv^4 */  | 
100  | 0  |     rsaz_1024_sqr_avx2(result, result, m, k0, 1);  | 
101  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 4);  | 
102  |  |     /* table[8] = a_inv^8 */  | 
103  | 0  |     rsaz_1024_sqr_avx2(result, result, m, k0, 1);  | 
104  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 8);  | 
105  |  |     /* table[16] = a_inv^16 */  | 
106  | 0  |     rsaz_1024_sqr_avx2(result, result, m, k0, 1);  | 
107  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 16);  | 
108  |  |     /* table[17] = a_inv^17 */  | 
109  | 0  |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0);  | 
110  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 17);  | 
111  |  |  | 
112  |  |     /* table[3] */  | 
113  | 0  |     rsaz_1024_gather5_avx2(result, table_s, 2);  | 
114  | 0  |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0);  | 
115  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 3);  | 
116  |  |     /* table[6] */  | 
117  | 0  |     rsaz_1024_sqr_avx2(result, result, m, k0, 1);  | 
118  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 6);  | 
119  |  |     /* table[12] */  | 
120  | 0  |     rsaz_1024_sqr_avx2(result, result, m, k0, 1);  | 
121  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 12);  | 
122  |  |     /* table[24] */  | 
123  | 0  |     rsaz_1024_sqr_avx2(result, result, m, k0, 1);  | 
124  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 24);  | 
125  |  |     /* table[25] */  | 
126  | 0  |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0);  | 
127  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 25);  | 
128  |  |  | 
129  |  |     /* table[5] */  | 
130  | 0  |     rsaz_1024_gather5_avx2(result, table_s, 4);  | 
131  | 0  |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0);  | 
132  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 5);  | 
133  |  |     /* table[10] */  | 
134  | 0  |     rsaz_1024_sqr_avx2(result, result, m, k0, 1);  | 
135  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 10);  | 
136  |  |     /* table[20] */  | 
137  | 0  |     rsaz_1024_sqr_avx2(result, result, m, k0, 1);  | 
138  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 20);  | 
139  |  |     /* table[21] */  | 
140  | 0  |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0);  | 
141  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 21);  | 
142  |  |  | 
143  |  |     /* table[7] */  | 
144  | 0  |     rsaz_1024_gather5_avx2(result, table_s, 6);  | 
145  | 0  |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0);  | 
146  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 7);  | 
147  |  |     /* table[14] */  | 
148  | 0  |     rsaz_1024_sqr_avx2(result, result, m, k0, 1);  | 
149  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 14);  | 
150  |  |     /* table[28] */  | 
151  | 0  |     rsaz_1024_sqr_avx2(result, result, m, k0, 1);  | 
152  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 28);  | 
153  |  |     /* table[29] */  | 
154  | 0  |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0);  | 
155  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 29);  | 
156  |  |  | 
157  |  |     /* table[9] */  | 
158  | 0  |     rsaz_1024_gather5_avx2(result, table_s, 8);  | 
159  | 0  |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0);  | 
160  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 9);  | 
161  |  |     /* table[18] */  | 
162  | 0  |     rsaz_1024_sqr_avx2(result, result, m, k0, 1);  | 
163  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 18);  | 
164  |  |     /* table[19] */  | 
165  | 0  |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0);  | 
166  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 19);  | 
167  |  |  | 
168  |  |     /* table[11] */  | 
169  | 0  |     rsaz_1024_gather5_avx2(result, table_s, 10);  | 
170  | 0  |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0);  | 
171  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 11);  | 
172  |  |     /* table[22] */  | 
173  | 0  |     rsaz_1024_sqr_avx2(result, result, m, k0, 1);  | 
174  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 22);  | 
175  |  |     /* table[23] */  | 
176  | 0  |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0);  | 
177  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 23);  | 
178  |  |  | 
179  |  |     /* table[13] */  | 
180  | 0  |     rsaz_1024_gather5_avx2(result, table_s, 12);  | 
181  | 0  |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0);  | 
182  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 13);  | 
183  |  |     /* table[26] */  | 
184  | 0  |     rsaz_1024_sqr_avx2(result, result, m, k0, 1);  | 
185  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 26);  | 
186  |  |     /* table[27] */  | 
187  | 0  |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0);  | 
188  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 27);  | 
189  |  |  | 
190  |  |     /* table[15] */  | 
191  | 0  |     rsaz_1024_gather5_avx2(result, table_s, 14);  | 
192  | 0  |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0);  | 
193  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 15);  | 
194  |  |     /* table[30] */  | 
195  | 0  |     rsaz_1024_sqr_avx2(result, result, m, k0, 1);  | 
196  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 30);  | 
197  |  |     /* table[31] */  | 
198  | 0  |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0);  | 
199  | 0  |     rsaz_1024_scatter5_avx2(table_s, result, 31);  | 
200  | 0  | #endif  | 
201  |  |  | 
202  |  |     /* load first window */  | 
203  | 0  |     p_str = (unsigned char *)exponent;  | 
204  | 0  |     wvalue = p_str[127] >> 3;  | 
205  | 0  |     rsaz_1024_gather5_avx2(result, table_s, wvalue);  | 
206  |  | 
  | 
207  | 0  |     index = 1014;  | 
208  |  | 
  | 
209  | 0  |     while (index > -1) {        /* loop for the remaining 127 windows */ | 
210  |  | 
  | 
211  | 0  |         rsaz_1024_sqr_avx2(result, result, m, k0, 5);  | 
212  |  | 
  | 
213  | 0  |         wvalue = (p_str[(index / 8) + 1] << 8) | p_str[index / 8];  | 
214  | 0  |         wvalue = (wvalue >> (index % 8)) & 31;  | 
215  | 0  |         index -= 5;  | 
216  |  | 
  | 
217  | 0  |         rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); /* borrow a_inv */  | 
218  | 0  |         rsaz_1024_mul_avx2(result, result, a_inv, m, k0);  | 
219  | 0  |     }  | 
220  |  |  | 
221  |  |     /* square four times */  | 
222  | 0  |     rsaz_1024_sqr_avx2(result, result, m, k0, 4);  | 
223  |  | 
  | 
224  | 0  |     wvalue = p_str[0] & 15;  | 
225  |  | 
  | 
226  | 0  |     rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); /* borrow a_inv */  | 
227  | 0  |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0);  | 
228  |  |  | 
229  |  |     /* from Montgomery */  | 
230  | 0  |     rsaz_1024_mul_avx2(result, result, one, m, k0);  | 
231  |  | 
  | 
232  | 0  |     rsaz_1024_red2norm_avx2(result_norm, result);  | 
233  |  | 
  | 
234  | 0  |     bn_reduce_once_in_place(result_norm, /*carry=*/0, m_norm, tmp, 16);  | 
235  |  | 
  | 
236  | 0  |     OPENSSL_cleanse(storage, sizeof(storage));  | 
237  | 0  |     OPENSSL_cleanse(tmp, sizeof(tmp));  | 
238  | 0  | }  | 
239  |  |  | 
240  |  | /*  | 
241  |  |  * See crypto/bn/rsaz-x86_64.pl for further details.  | 
242  |  |  */  | 
243  |  | void rsaz_512_mul(void *ret, const void *a, const void *b, const void *n,  | 
244  |  |                   BN_ULONG k);  | 
245  |  | void rsaz_512_mul_scatter4(void *ret, const void *a, const void *n,  | 
246  |  |                            BN_ULONG k, const void *tbl, unsigned int power);  | 
247  |  | void rsaz_512_mul_gather4(void *ret, const void *a, const void *tbl,  | 
248  |  |                           const void *n, BN_ULONG k, unsigned int power);  | 
249  |  | void rsaz_512_mul_by_one(void *ret, const void *a, const void *n, BN_ULONG k);  | 
250  |  | void rsaz_512_sqr(void *ret, const void *a, const void *n, BN_ULONG k,  | 
251  |  |                   int cnt);  | 
252  |  | void rsaz_512_scatter4(void *tbl, const BN_ULONG *val, int power);  | 
253  |  | void rsaz_512_gather4(BN_ULONG *val, const void *tbl, int power);  | 
254  |  |  | 
255  |  | void RSAZ_512_mod_exp(BN_ULONG result[8],  | 
256  |  |                       const BN_ULONG base[8], const BN_ULONG exponent[8],  | 
257  |  |                       const BN_ULONG m[8], BN_ULONG k0, const BN_ULONG RR[8])  | 
258  | 395  | { | 
259  | 395  |     unsigned char storage[16 * 8 * 8 + 64 * 2 + 64]; /* 1.2KB */  | 
260  | 395  |     unsigned char *table = storage + (64 - ((size_t)storage % 64));  | 
261  | 395  |     BN_ULONG *a_inv = (BN_ULONG *)(table + 16 * 8 * 8);  | 
262  | 395  |     BN_ULONG *temp = (BN_ULONG *)(table + 16 * 8 * 8 + 8 * 8);  | 
263  | 395  |     unsigned char *p_str = (unsigned char *)exponent;  | 
264  | 395  |     int index;  | 
265  | 395  |     unsigned int wvalue;  | 
266  | 395  |     BN_ULONG tmp[8];  | 
267  |  |  | 
268  |  |     /* table[0] = 1_inv */  | 
269  | 395  |     temp[0] = 0 - m[0];  | 
270  | 395  |     temp[1] = ~m[1];  | 
271  | 395  |     temp[2] = ~m[2];  | 
272  | 395  |     temp[3] = ~m[3];  | 
273  | 395  |     temp[4] = ~m[4];  | 
274  | 395  |     temp[5] = ~m[5];  | 
275  | 395  |     temp[6] = ~m[6];  | 
276  | 395  |     temp[7] = ~m[7];  | 
277  | 395  |     rsaz_512_scatter4(table, temp, 0);  | 
278  |  |  | 
279  |  |     /* table [1] = a_inv^1 */  | 
280  | 395  |     rsaz_512_mul(a_inv, base, RR, m, k0);  | 
281  | 395  |     rsaz_512_scatter4(table, a_inv, 1);  | 
282  |  |  | 
283  |  |     /* table [2] = a_inv^2 */  | 
284  | 395  |     rsaz_512_sqr(temp, a_inv, m, k0, 1);  | 
285  | 395  |     rsaz_512_scatter4(table, temp, 2);  | 
286  |  |  | 
287  | 5.53k  |     for (index = 3; index < 16; index++)  | 
288  | 5.13k  |         rsaz_512_mul_scatter4(temp, a_inv, m, k0, table, index);  | 
289  |  |  | 
290  |  |     /* load first window */  | 
291  | 395  |     wvalue = p_str[63];  | 
292  |  |  | 
293  | 395  |     rsaz_512_gather4(temp, table, wvalue >> 4);  | 
294  | 395  |     rsaz_512_sqr(temp, temp, m, k0, 4);  | 
295  | 395  |     rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue & 0xf);  | 
296  |  |  | 
297  | 25.2k  |     for (index = 62; index >= 0; index--) { | 
298  | 24.8k  |         wvalue = p_str[index];  | 
299  |  |  | 
300  | 24.8k  |         rsaz_512_sqr(temp, temp, m, k0, 4);  | 
301  | 24.8k  |         rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue >> 4);  | 
302  |  |  | 
303  | 24.8k  |         rsaz_512_sqr(temp, temp, m, k0, 4);  | 
304  | 24.8k  |         rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue & 0x0f);  | 
305  | 24.8k  |     }  | 
306  |  |  | 
307  |  |     /* from Montgomery */  | 
308  | 395  |     rsaz_512_mul_by_one(result, temp, m, k0);  | 
309  |  |  | 
310  | 395  |     bn_reduce_once_in_place(result, /*carry=*/0, m, tmp, 8);  | 
311  |  |  | 
312  | 395  |     OPENSSL_cleanse(storage, sizeof(storage));  | 
313  | 395  |     OPENSSL_cleanse(tmp, sizeof(tmp));  | 
314  | 395  | }  | 
315  |  |  | 
316  |  | #endif  |