/src/openssl30/crypto/bn/rsaz_exp.c
| Line | Count | Source (jump to first uncovered line) | 
| 1 |  | /* | 
| 2 |  |  * Copyright 2013-2022 The OpenSSL Project Authors. All Rights Reserved. | 
| 3 |  |  * Copyright (c) 2012, Intel Corporation. All Rights Reserved. | 
| 4 |  |  * | 
| 5 |  |  * Licensed under the Apache License 2.0 (the "License").  You may not use | 
| 6 |  |  * this file except in compliance with the License.  You can obtain a copy | 
| 7 |  |  * in the file LICENSE in the source distribution or at | 
| 8 |  |  * https://www.openssl.org/source/license.html | 
| 9 |  |  * | 
| 10 |  |  * Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) | 
| 11 |  |  * (1) Intel Corporation, Israel Development Center, Haifa, Israel | 
| 12 |  |  * (2) University of Haifa, Israel | 
| 13 |  |  */ | 
| 14 |  |  | 
| 15 |  | #include <openssl/opensslconf.h> | 
| 16 |  | #include "rsaz_exp.h" | 
| 17 |  |  | 
| 18 |  | #ifndef RSAZ_ENABLED | 
| 19 |  | NON_EMPTY_TRANSLATION_UNIT | 
| 20 |  | #else | 
| 21 |  |  | 
| 22 |  | /* | 
| 23 |  |  * See crypto/bn/asm/rsaz-avx2.pl for further details. | 
| 24 |  |  */ | 
| 25 |  | void rsaz_1024_norm2red_avx2(void *red, const void *norm); | 
| 26 |  | void rsaz_1024_mul_avx2(void *ret, const void *a, const void *b, | 
| 27 |  |                         const void *n, BN_ULONG k); | 
| 28 |  | void rsaz_1024_sqr_avx2(void *ret, const void *a, const void *n, BN_ULONG k, | 
| 29 |  |                         int cnt); | 
| 30 |  | void rsaz_1024_scatter5_avx2(void *tbl, const void *val, int i); | 
| 31 |  | void rsaz_1024_gather5_avx2(void *val, const void *tbl, int i); | 
| 32 |  | void rsaz_1024_red2norm_avx2(void *norm, const void *red); | 
| 33 |  |  | 
| 34 |  | #if defined(__GNUC__) | 
| 35 |  | # define ALIGN64        __attribute__((aligned(64))) | 
| 36 |  | #elif defined(_MSC_VER) | 
| 37 |  | # define ALIGN64        __declspec(align(64)) | 
| 38 |  | #elif defined(__SUNPRO_C) | 
| 39 |  | # define ALIGN64 | 
| 40 |  | # pragma align 64(one,two80) | 
| 41 |  | #else | 
| 42 |  | /* not fatal, might hurt performance a little */ | 
| 43 |  | # define ALIGN64 | 
| 44 |  | #endif | 
| 45 |  |  | 
| 46 |  | ALIGN64 static const BN_ULONG one[40] = { | 
| 47 |  |     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 
| 48 |  |     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | 
| 49 |  | }; | 
| 50 |  |  | 
| 51 |  | ALIGN64 static const BN_ULONG two80[40] = { | 
| 52 |  |     0, 0, 1 << 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 
| 53 |  |     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | 
| 54 |  | }; | 
| 55 |  |  | 
| 56 |  | void RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16], | 
| 57 |  |                             const BN_ULONG base_norm[16], | 
| 58 |  |                             const BN_ULONG exponent[16], | 
| 59 |  |                             const BN_ULONG m_norm[16], const BN_ULONG RR[16], | 
| 60 |  |                             BN_ULONG k0) | 
| 61 | 0 | { | 
| 62 | 0 |     unsigned char storage[320 * 3 + 32 * 9 * 16 + 64]; /* 5.5KB */ | 
| 63 | 0 |     unsigned char *p_str = storage + (64 - ((size_t)storage % 64)); | 
| 64 | 0 |     unsigned char *a_inv, *m, *result; | 
| 65 | 0 |     unsigned char *table_s = p_str + 320 * 3; | 
| 66 | 0 |     unsigned char *R2 = table_s; /* borrow */ | 
| 67 | 0 |     int index; | 
| 68 | 0 |     int wvalue; | 
| 69 | 0 |     BN_ULONG tmp[16]; | 
| 70 |  | 
 | 
| 71 | 0 |     if ((((size_t)p_str & 4095) + 320) >> 12) { | 
| 72 | 0 |         result = p_str; | 
| 73 | 0 |         a_inv = p_str + 320; | 
| 74 | 0 |         m = p_str + 320 * 2;    /* should not cross page */ | 
| 75 | 0 |     } else { | 
| 76 | 0 |         m = p_str;              /* should not cross page */ | 
| 77 | 0 |         result = p_str + 320; | 
| 78 | 0 |         a_inv = p_str + 320 * 2; | 
| 79 | 0 |     } | 
| 80 |  | 
 | 
| 81 | 0 |     rsaz_1024_norm2red_avx2(m, m_norm); | 
| 82 | 0 |     rsaz_1024_norm2red_avx2(a_inv, base_norm); | 
| 83 | 0 |     rsaz_1024_norm2red_avx2(R2, RR); | 
| 84 |  | 
 | 
| 85 | 0 |     rsaz_1024_mul_avx2(R2, R2, R2, m, k0); | 
| 86 | 0 |     rsaz_1024_mul_avx2(R2, R2, two80, m, k0); | 
| 87 |  |  | 
| 88 |  |     /* table[0] = 1 */ | 
| 89 | 0 |     rsaz_1024_mul_avx2(result, R2, one, m, k0); | 
| 90 |  |     /* table[1] = a_inv^1 */ | 
| 91 | 0 |     rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0); | 
| 92 |  | 
 | 
| 93 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 0); | 
| 94 | 0 |     rsaz_1024_scatter5_avx2(table_s, a_inv, 1); | 
| 95 |  |  | 
| 96 |  |     /* table[2] = a_inv^2 */ | 
| 97 | 0 |     rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1); | 
| 98 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 2); | 
| 99 |  | #if 0 | 
| 100 |  |     /* this is almost 2x smaller and less than 1% slower */ | 
| 101 |  |     for (index = 3; index < 32; index++) { | 
| 102 |  |         rsaz_1024_mul_avx2(result, result, a_inv, m, k0); | 
| 103 |  |         rsaz_1024_scatter5_avx2(table_s, result, index); | 
| 104 |  |     } | 
| 105 |  | #else | 
| 106 |  |     /* table[4] = a_inv^4 */ | 
| 107 | 0 |     rsaz_1024_sqr_avx2(result, result, m, k0, 1); | 
| 108 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 4); | 
| 109 |  |     /* table[8] = a_inv^8 */ | 
| 110 | 0 |     rsaz_1024_sqr_avx2(result, result, m, k0, 1); | 
| 111 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 8); | 
| 112 |  |     /* table[16] = a_inv^16 */ | 
| 113 | 0 |     rsaz_1024_sqr_avx2(result, result, m, k0, 1); | 
| 114 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 16); | 
| 115 |  |     /* table[17] = a_inv^17 */ | 
| 116 | 0 |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0); | 
| 117 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 17); | 
| 118 |  |  | 
| 119 |  |     /* table[3] */ | 
| 120 | 0 |     rsaz_1024_gather5_avx2(result, table_s, 2); | 
| 121 | 0 |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0); | 
| 122 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 3); | 
| 123 |  |     /* table[6] */ | 
| 124 | 0 |     rsaz_1024_sqr_avx2(result, result, m, k0, 1); | 
| 125 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 6); | 
| 126 |  |     /* table[12] */ | 
| 127 | 0 |     rsaz_1024_sqr_avx2(result, result, m, k0, 1); | 
| 128 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 12); | 
| 129 |  |     /* table[24] */ | 
| 130 | 0 |     rsaz_1024_sqr_avx2(result, result, m, k0, 1); | 
| 131 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 24); | 
| 132 |  |     /* table[25] */ | 
| 133 | 0 |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0); | 
| 134 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 25); | 
| 135 |  |  | 
| 136 |  |     /* table[5] */ | 
| 137 | 0 |     rsaz_1024_gather5_avx2(result, table_s, 4); | 
| 138 | 0 |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0); | 
| 139 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 5); | 
| 140 |  |     /* table[10] */ | 
| 141 | 0 |     rsaz_1024_sqr_avx2(result, result, m, k0, 1); | 
| 142 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 10); | 
| 143 |  |     /* table[20] */ | 
| 144 | 0 |     rsaz_1024_sqr_avx2(result, result, m, k0, 1); | 
| 145 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 20); | 
| 146 |  |     /* table[21] */ | 
| 147 | 0 |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0); | 
| 148 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 21); | 
| 149 |  |  | 
| 150 |  |     /* table[7] */ | 
| 151 | 0 |     rsaz_1024_gather5_avx2(result, table_s, 6); | 
| 152 | 0 |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0); | 
| 153 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 7); | 
| 154 |  |     /* table[14] */ | 
| 155 | 0 |     rsaz_1024_sqr_avx2(result, result, m, k0, 1); | 
| 156 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 14); | 
| 157 |  |     /* table[28] */ | 
| 158 | 0 |     rsaz_1024_sqr_avx2(result, result, m, k0, 1); | 
| 159 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 28); | 
| 160 |  |     /* table[29] */ | 
| 161 | 0 |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0); | 
| 162 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 29); | 
| 163 |  |  | 
| 164 |  |     /* table[9] */ | 
| 165 | 0 |     rsaz_1024_gather5_avx2(result, table_s, 8); | 
| 166 | 0 |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0); | 
| 167 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 9); | 
| 168 |  |     /* table[18] */ | 
| 169 | 0 |     rsaz_1024_sqr_avx2(result, result, m, k0, 1); | 
| 170 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 18); | 
| 171 |  |     /* table[19] */ | 
| 172 | 0 |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0); | 
| 173 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 19); | 
| 174 |  |  | 
| 175 |  |     /* table[11] */ | 
| 176 | 0 |     rsaz_1024_gather5_avx2(result, table_s, 10); | 
| 177 | 0 |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0); | 
| 178 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 11); | 
| 179 |  |     /* table[22] */ | 
| 180 | 0 |     rsaz_1024_sqr_avx2(result, result, m, k0, 1); | 
| 181 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 22); | 
| 182 |  |     /* table[23] */ | 
| 183 | 0 |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0); | 
| 184 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 23); | 
| 185 |  |  | 
| 186 |  |     /* table[13] */ | 
| 187 | 0 |     rsaz_1024_gather5_avx2(result, table_s, 12); | 
| 188 | 0 |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0); | 
| 189 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 13); | 
| 190 |  |     /* table[26] */ | 
| 191 | 0 |     rsaz_1024_sqr_avx2(result, result, m, k0, 1); | 
| 192 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 26); | 
| 193 |  |     /* table[27] */ | 
| 194 | 0 |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0); | 
| 195 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 27); | 
| 196 |  |  | 
| 197 |  |     /* table[15] */ | 
| 198 | 0 |     rsaz_1024_gather5_avx2(result, table_s, 14); | 
| 199 | 0 |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0); | 
| 200 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 15); | 
| 201 |  |     /* table[30] */ | 
| 202 | 0 |     rsaz_1024_sqr_avx2(result, result, m, k0, 1); | 
| 203 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 30); | 
| 204 |  |     /* table[31] */ | 
| 205 | 0 |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0); | 
| 206 | 0 |     rsaz_1024_scatter5_avx2(table_s, result, 31); | 
| 207 | 0 | #endif | 
| 208 |  |  | 
| 209 |  |     /* load first window */ | 
| 210 | 0 |     p_str = (unsigned char *)exponent; | 
| 211 | 0 |     wvalue = p_str[127] >> 3; | 
| 212 | 0 |     rsaz_1024_gather5_avx2(result, table_s, wvalue); | 
| 213 |  | 
 | 
| 214 | 0 |     index = 1014; | 
| 215 |  | 
 | 
| 216 | 0 |     while (index > -1) {        /* loop for the remaining 203 windows */ | 
| 217 |  | 
 | 
| 218 | 0 |         rsaz_1024_sqr_avx2(result, result, m, k0, 5); | 
| 219 |  | 
 | 
| 220 | 0 |         wvalue = (p_str[(index / 8) + 1] << 8) | p_str[index / 8]; | 
| 221 | 0 |         wvalue = (wvalue >> (index % 8)) & 31; | 
| 222 | 0 |         index -= 5; | 
| 223 |  | 
 | 
| 224 | 0 |         rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); /* borrow a_inv */ | 
| 225 | 0 |         rsaz_1024_mul_avx2(result, result, a_inv, m, k0); | 
| 226 | 0 |     } | 
| 227 |  |  | 
| 228 |  |     /* square four times */ | 
| 229 | 0 |     rsaz_1024_sqr_avx2(result, result, m, k0, 4); | 
| 230 |  | 
 | 
| 231 | 0 |     wvalue = p_str[0] & 15; | 
| 232 |  | 
 | 
| 233 | 0 |     rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); /* borrow a_inv */ | 
| 234 | 0 |     rsaz_1024_mul_avx2(result, result, a_inv, m, k0); | 
| 235 |  |  | 
| 236 |  |     /* from Montgomery */ | 
| 237 | 0 |     rsaz_1024_mul_avx2(result, result, one, m, k0); | 
| 238 |  | 
 | 
| 239 | 0 |     rsaz_1024_red2norm_avx2(result_norm, result); | 
| 240 |  | 
 | 
| 241 | 0 |     bn_reduce_once_in_place(result_norm, /*carry=*/0, m_norm, tmp, 16); | 
| 242 |  | 
 | 
| 243 | 0 |     OPENSSL_cleanse(storage, sizeof(storage)); | 
| 244 | 0 |     OPENSSL_cleanse(tmp, sizeof(tmp)); | 
| 245 | 0 | } | 
| 246 |  |  | 
| 247 |  | /* | 
| 248 |  |  * See crypto/bn/asm/rsaz-x86_64.pl for further details. | 
| 249 |  |  */ | 
| 250 |  | void rsaz_512_mul(void *ret, const void *a, const void *b, const void *n, | 
| 251 |  |                   BN_ULONG k); | 
| 252 |  | void rsaz_512_mul_scatter4(void *ret, const void *a, const void *n, | 
| 253 |  |                            BN_ULONG k, const void *tbl, unsigned int power); | 
| 254 |  | void rsaz_512_mul_gather4(void *ret, const void *a, const void *tbl, | 
| 255 |  |                           const void *n, BN_ULONG k, unsigned int power); | 
| 256 |  | void rsaz_512_mul_by_one(void *ret, const void *a, const void *n, BN_ULONG k); | 
| 257 |  | void rsaz_512_sqr(void *ret, const void *a, const void *n, BN_ULONG k, | 
| 258 |  |                   int cnt); | 
| 259 |  | void rsaz_512_scatter4(void *tbl, const BN_ULONG *val, int power); | 
| 260 |  | void rsaz_512_gather4(BN_ULONG *val, const void *tbl, int power); | 
| 261 |  |  | 
| 262 |  | void RSAZ_512_mod_exp(BN_ULONG result[8], | 
| 263 |  |                       const BN_ULONG base[8], const BN_ULONG exponent[8], | 
| 264 |  |                       const BN_ULONG m[8], BN_ULONG k0, const BN_ULONG RR[8]) | 
| 265 | 196 | { | 
| 266 | 196 |     unsigned char storage[16 * 8 * 8 + 64 * 2 + 64]; /* 1.2KB */ | 
| 267 | 196 |     unsigned char *table = storage + (64 - ((size_t)storage % 64)); | 
| 268 | 196 |     BN_ULONG *a_inv = (BN_ULONG *)(table + 16 * 8 * 8); | 
| 269 | 196 |     BN_ULONG *temp = (BN_ULONG *)(table + 16 * 8 * 8 + 8 * 8); | 
| 270 | 196 |     unsigned char *p_str = (unsigned char *)exponent; | 
| 271 | 196 |     int index; | 
| 272 | 196 |     unsigned int wvalue; | 
| 273 | 196 |     BN_ULONG tmp[8]; | 
| 274 |  |  | 
| 275 |  |     /* table[0] = 1_inv */ | 
| 276 | 196 |     temp[0] = 0 - m[0]; | 
| 277 | 196 |     temp[1] = ~m[1]; | 
| 278 | 196 |     temp[2] = ~m[2]; | 
| 279 | 196 |     temp[3] = ~m[3]; | 
| 280 | 196 |     temp[4] = ~m[4]; | 
| 281 | 196 |     temp[5] = ~m[5]; | 
| 282 | 196 |     temp[6] = ~m[6]; | 
| 283 | 196 |     temp[7] = ~m[7]; | 
| 284 | 196 |     rsaz_512_scatter4(table, temp, 0); | 
| 285 |  |  | 
| 286 |  |     /* table [1] = a_inv^1 */ | 
| 287 | 196 |     rsaz_512_mul(a_inv, base, RR, m, k0); | 
| 288 | 196 |     rsaz_512_scatter4(table, a_inv, 1); | 
| 289 |  |  | 
| 290 |  |     /* table [2] = a_inv^2 */ | 
| 291 | 196 |     rsaz_512_sqr(temp, a_inv, m, k0, 1); | 
| 292 | 196 |     rsaz_512_scatter4(table, temp, 2); | 
| 293 |  |  | 
| 294 | 2.74k |     for (index = 3; index < 16; index++) | 
| 295 | 2.54k |         rsaz_512_mul_scatter4(temp, a_inv, m, k0, table, index); | 
| 296 |  |  | 
| 297 |  |     /* load first window */ | 
| 298 | 196 |     wvalue = p_str[63]; | 
| 299 |  |  | 
| 300 | 196 |     rsaz_512_gather4(temp, table, wvalue >> 4); | 
| 301 | 196 |     rsaz_512_sqr(temp, temp, m, k0, 4); | 
| 302 | 196 |     rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue & 0xf); | 
| 303 |  |  | 
| 304 | 12.5k |     for (index = 62; index >= 0; index--) { | 
| 305 | 12.3k |         wvalue = p_str[index]; | 
| 306 |  |  | 
| 307 | 12.3k |         rsaz_512_sqr(temp, temp, m, k0, 4); | 
| 308 | 12.3k |         rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue >> 4); | 
| 309 |  |  | 
| 310 | 12.3k |         rsaz_512_sqr(temp, temp, m, k0, 4); | 
| 311 | 12.3k |         rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue & 0x0f); | 
| 312 | 12.3k |     } | 
| 313 |  |  | 
| 314 |  |     /* from Montgomery */ | 
| 315 | 196 |     rsaz_512_mul_by_one(result, temp, m, k0); | 
| 316 |  |  | 
| 317 | 196 |     bn_reduce_once_in_place(result, /*carry=*/0, m, tmp, 8); | 
| 318 |  |  | 
| 319 | 196 |     OPENSSL_cleanse(storage, sizeof(storage)); | 
| 320 | 196 |     OPENSSL_cleanse(tmp, sizeof(tmp)); | 
| 321 | 196 | } | 
| 322 |  |  | 
| 323 |  | #endif |