/src/openssl30/crypto/bn/rsaz_exp_x2.c
| Line | Count | Source (jump to first uncovered line) | 
| 1 |  | /* | 
| 2 |  |  * Copyright 2020-2023 The OpenSSL Project Authors. All Rights Reserved. | 
| 3 |  |  * Copyright (c) 2020, Intel Corporation. All Rights Reserved. | 
| 4 |  |  * | 
| 5 |  |  * Licensed under the Apache License 2.0 (the "License").  You may not use | 
| 6 |  |  * this file except in compliance with the License.  You can obtain a copy | 
| 7 |  |  * in the file LICENSE in the source distribution or at | 
| 8 |  |  * https://www.openssl.org/source/license.html | 
| 9 |  |  * | 
| 10 |  |  * | 
| 11 |  |  * Originally written by Ilya Albrekht, Sergey Kirillov and Andrey Matyukov | 
| 12 |  |  * Intel Corporation | 
| 13 |  |  * | 
| 14 |  |  */ | 
| 15 |  |  | 
| 16 |  | #include <openssl/opensslconf.h> | 
| 17 |  | #include <openssl/crypto.h> | 
| 18 |  | #include "rsaz_exp.h" | 
| 19 |  |  | 
| 20 |  | #ifndef RSAZ_ENABLED | 
| 21 |  | NON_EMPTY_TRANSLATION_UNIT | 
| 22 |  | #else | 
| 23 |  | # include <assert.h> | 
| 24 |  | # include <string.h> | 
| 25 |  |  | 
| 26 |  | # if defined(__GNUC__) | 
| 27 | 0 | #  define ALIGN64 __attribute__((aligned(64))) | 
| 28 |  | # elif defined(_MSC_VER) | 
| 29 |  | #  define ALIGN64 __declspec(align(64)) | 
| 30 |  | # else | 
| 31 |  | #  define ALIGN64 | 
| 32 |  | # endif | 
| 33 |  |  | 
/*
 * Advance |ptr| to the next |boundary|-byte aligned address and yield it as
 * an unsigned char pointer. |boundary| must be a power of two because the
 * misalignment is obtained by masking with (|boundary| - 1).
 *
 * An already aligned |ptr| is still advanced by a full |boundary|, so the
 * caller has to over-allocate by |boundary| bytes.
 *
 * Both arguments are evaluated more than once: do not pass expressions with
 * side effects.
 */
# define ALIGN_OF(ptr, boundary) \
    ((unsigned char *)(ptr) + \
     ((boundary) - (((size_t)(ptr)) & ((boundary) - 1))))
| 36 |  |  | 
| 37 |  | /* Internal radix */ | 
| 38 | 0 | # define DIGIT_SIZE (52) | 
| 39 |  | /* 52-bit mask */ | 
| 40 | 0 | # define DIGIT_MASK ((uint64_t)0xFFFFFFFFFFFFF) | 
| 41 |  |  | 
| 42 | 0 | # define BITS2WORD8_SIZE(x)  (((x) + 7) >> 3) | 
| 43 | 0 | # define BITS2WORD64_SIZE(x) (((x) + 63) >> 6) | 
| 44 |  |  | 
| 45 |  | static ossl_inline uint64_t get_digit52(const uint8_t *in, int in_len); | 
| 46 |  | static ossl_inline void put_digit52(uint8_t *out, int out_len, uint64_t digit); | 
| 47 |  | static void to_words52(BN_ULONG *out, int out_len, const BN_ULONG *in, | 
| 48 |  |                        int in_bitsize); | 
| 49 |  | static void from_words52(BN_ULONG *bn_out, int out_bitsize, const BN_ULONG *in); | 
| 50 |  | static ossl_inline void set_bit(BN_ULONG *a, int idx); | 
| 51 |  |  | 
/*
 * Count how many |digit_size|-bit digits are required to represent a
 * |bitsize|-bit value (division of |bitsize| by |digit_size|, rounded up).
 */
static ossl_inline int number_of_digits(int bitsize, int digit_size)
{
    const int rounded_up = bitsize + digit_size - 1;

    return rounded_up / digit_size;
}
| 57 |  |  | 
| 58 |  | typedef void (*AMM52)(BN_ULONG *res, const BN_ULONG *base, | 
| 59 |  |                       const BN_ULONG *exp, const BN_ULONG *m, BN_ULONG k0); | 
| 60 |  | typedef void (*EXP52_x2)(BN_ULONG *res, const BN_ULONG *base, | 
| 61 |  |                          const BN_ULONG *exp[2], const BN_ULONG *m, | 
| 62 |  |                          const BN_ULONG *rr, const BN_ULONG k0[2]); | 
| 63 |  |  | 
| 64 |  | /* | 
| 65 |  |  * For details of the methods declared below please refer to | 
| 66 |  |  *    crypto/bn/asm/rsaz-avx512.pl | 
| 67 |  |  * | 
| 68 |  |  * Naming notes: | 
| 69 |  |  *  amm = Almost Montgomery Multiplication | 
| 70 |  |  *  ams = Almost Montgomery Squaring | 
| 71 |  |  *  52x20 - data represented as array of 20 digits in 52-bit radix | 
| 72 |  |  *  _x1_/_x2_ - 1 or 2 independent inputs/outputs | 
| 73 |  |  *  _256 suffix - uses 256-bit (AVX512VL) registers | 
| 74 |  |  */ | 
| 75 |  |  | 
| 76 |  | /*AMM = Almost Montgomery Multiplication. */ | 
| 77 |  | void ossl_rsaz_amm52x20_x1_256(BN_ULONG *res, const BN_ULONG *base, | 
| 78 |  |                                const BN_ULONG *exp, const BN_ULONG *m, | 
| 79 |  |                                BN_ULONG k0); | 
| 80 |  | static void RSAZ_exp52x20_x2_256(BN_ULONG *res, const BN_ULONG *base, | 
| 81 |  |                                  const BN_ULONG *exp[2], const BN_ULONG *m, | 
| 82 |  |                                  const BN_ULONG *rr, const BN_ULONG k0[2]); | 
| 83 |  | void ossl_rsaz_amm52x20_x2_256(BN_ULONG *out, const BN_ULONG *a, | 
| 84 |  |                                const BN_ULONG *b, const BN_ULONG *m, | 
| 85 |  |                                const BN_ULONG k0[2]); | 
| 86 |  | void ossl_extract_multiplier_2x20_win5(BN_ULONG *red_Y, | 
| 87 |  |                                        const BN_ULONG *red_table, | 
| 88 |  |                                        int red_table_idx, int tbl_idx); | 
| 89 |  |  | 
| 90 |  | /* | 
| 91 |  |  * Dual Montgomery modular exponentiation using prime moduli of the | 
| 92 |  |  * same bit size, optimized with AVX512 ISA. | 
| 93 |  |  * | 
| 94 |  |  * Input and output parameters for each exponentiation are independent and | 
| 95 |  |  * denoted here by index |i|, i = 1..2. | 
| 96 |  |  * | 
| 97 |  |  * Input and output are all in regular 2^64 radix. | 
| 98 |  |  * | 
| 99 |  |  * Each moduli shall be |factor_size| bit size. | 
| 100 |  |  * | 
| 101 |  |  * NOTE: currently only 2x1024 case is supported. | 
| 102 |  |  * | 
| 103 |  |  *  [out] res|i|      - result of modular exponentiation: array of qword values | 
| 104 |  |  *                      in regular (2^64) radix. Size of array shall be enough | 
| 105 |  |  *                      to hold |factor_size| bits. | 
| 106 |  |  *  [in]  base|i|     - base | 
| 107 |  |  *  [in]  exp|i|      - exponent | 
| 108 |  |  *  [in]  m|i|        - moduli | 
| 109 |  |  *  [in]  rr|i|       - Montgomery parameter RR = R^2 mod m|i| | 
| 110 |  |  *  [in]  k0_|i|      - Montgomery parameter k0 = -1/m|i| mod 2^64 | 
| 111 |  |  *  [in]  factor_size - moduli bit size | 
| 112 |  |  * | 
| 113 |  |  * \return 0 in case of failure, | 
| 114 |  |  *         1 in case of success. | 
| 115 |  |  */ | 
| 116 |  | int ossl_rsaz_mod_exp_avx512_x2(BN_ULONG *res1, | 
| 117 |  |                                 const BN_ULONG *base1, | 
| 118 |  |                                 const BN_ULONG *exp1, | 
| 119 |  |                                 const BN_ULONG *m1, | 
| 120 |  |                                 const BN_ULONG *rr1, | 
| 121 |  |                                 BN_ULONG k0_1, | 
| 122 |  |                                 BN_ULONG *res2, | 
| 123 |  |                                 const BN_ULONG *base2, | 
| 124 |  |                                 const BN_ULONG *exp2, | 
| 125 |  |                                 const BN_ULONG *m2, | 
| 126 |  |                                 const BN_ULONG *rr2, | 
| 127 |  |                                 BN_ULONG k0_2, | 
| 128 |  |                                 int factor_size) | 
| 129 | 0 | { | 
| 130 | 0 |     int ret = 0; | 
| 131 |  |  | 
| 132 |  |     /* | 
| 133 |  |      * Number of word-size (BN_ULONG) digits to store exponent in redundant | 
| 134 |  |      * representation. | 
| 135 |  |      */ | 
| 136 | 0 |     int exp_digits = number_of_digits(factor_size + 2, DIGIT_SIZE); | 
| 137 | 0 |     int coeff_pow = 4 * (DIGIT_SIZE * exp_digits - factor_size); | 
| 138 | 0 |     BN_ULONG *base1_red, *m1_red, *rr1_red; | 
| 139 | 0 |     BN_ULONG *base2_red, *m2_red, *rr2_red; | 
| 140 | 0 |     BN_ULONG *coeff_red; | 
| 141 | 0 |     BN_ULONG *storage = NULL; | 
| 142 | 0 |     BN_ULONG *storage_aligned = NULL; | 
| 143 | 0 |     BN_ULONG storage_len_bytes = 7 * exp_digits * sizeof(BN_ULONG); | 
| 144 |  |  | 
| 145 |  |     /* AMM = Almost Montgomery Multiplication */ | 
| 146 | 0 |     AMM52 amm = NULL; | 
| 147 |  |     /* Dual (2-exps in parallel) exponentiation */ | 
| 148 | 0 |     EXP52_x2 exp_x2 = NULL; | 
| 149 |  | 
 | 
| 150 | 0 |     const BN_ULONG *exp[2] = {0}; | 
| 151 | 0 |     BN_ULONG k0[2] = {0}; | 
| 152 |  |  | 
| 153 |  |     /* Only 1024-bit factor size is supported now */ | 
| 154 | 0 |     switch (factor_size) { | 
| 155 | 0 |     case 1024: | 
| 156 | 0 |         amm = ossl_rsaz_amm52x20_x1_256; | 
| 157 | 0 |         exp_x2 = RSAZ_exp52x20_x2_256; | 
| 158 | 0 |         break; | 
| 159 | 0 |     default: | 
| 160 | 0 |         goto err; | 
| 161 | 0 |     } | 
| 162 |  |  | 
| 163 | 0 |     storage = (BN_ULONG *)OPENSSL_malloc(storage_len_bytes + 64); | 
| 164 | 0 |     if (storage == NULL) | 
| 165 | 0 |         goto err; | 
| 166 | 0 |     storage_aligned = (BN_ULONG *)ALIGN_OF(storage, 64); | 
| 167 |  |  | 
| 168 |  |     /* Memory layout for red(undant) representations */ | 
| 169 | 0 |     base1_red = storage_aligned; | 
| 170 | 0 |     base2_red = storage_aligned + 1 * exp_digits; | 
| 171 | 0 |     m1_red    = storage_aligned + 2 * exp_digits; | 
| 172 | 0 |     m2_red    = storage_aligned + 3 * exp_digits; | 
| 173 | 0 |     rr1_red   = storage_aligned + 4 * exp_digits; | 
| 174 | 0 |     rr2_red   = storage_aligned + 5 * exp_digits; | 
| 175 | 0 |     coeff_red = storage_aligned + 6 * exp_digits; | 
| 176 |  |  | 
| 177 |  |     /* Convert base_i, m_i, rr_i, from regular to 52-bit radix */ | 
| 178 | 0 |     to_words52(base1_red, exp_digits, base1, factor_size); | 
| 179 | 0 |     to_words52(base2_red, exp_digits, base2, factor_size); | 
| 180 | 0 |     to_words52(m1_red, exp_digits, m1, factor_size); | 
| 181 | 0 |     to_words52(m2_red, exp_digits, m2, factor_size); | 
| 182 | 0 |     to_words52(rr1_red, exp_digits, rr1, factor_size); | 
| 183 | 0 |     to_words52(rr2_red, exp_digits, rr2, factor_size); | 
| 184 |  |  | 
| 185 |  |     /* | 
| 186 |  |      * Compute target domain Montgomery converters RR' for each modulus | 
| 187 |  |      * based on precomputed original domain's RR. | 
| 188 |  |      * | 
| 189 |  |      * RR -> RR' transformation steps: | 
| 190 |  |      *  (1) coeff = 2^k | 
| 191 |  |      *  (2) t = AMM(RR,RR) = RR^2 / R' mod m | 
| 192 |  |      *  (3) RR' = AMM(t, coeff) = RR^2 * 2^k / R'^2 mod m | 
| 193 |  |      * where | 
| 194 |  |      *  k = 4 * (52 * digits52 - modlen) | 
| 195 |  |      *  R  = 2^(64 * ceil(modlen/64)) mod m | 
| 196 |  |      *  RR = R^2 mod M | 
| 197 |  |      *  R' = 2^(52 * ceil(modlen/52)) mod m | 
| 198 |  |      * | 
| 199 |  |      *  modlen = 1024: k = 64, RR = 2^2048 mod m, RR' = 2^2080 mod m | 
| 200 |  |      */ | 
| 201 | 0 |     memset(coeff_red, 0, exp_digits * sizeof(BN_ULONG)); | 
| 202 |  |     /* (1) in reduced domain representation */ | 
| 203 | 0 |     set_bit(coeff_red, 64 * (int)(coeff_pow / 52) + coeff_pow % 52); | 
| 204 |  | 
 | 
| 205 | 0 |     amm(rr1_red, rr1_red, rr1_red, m1_red, k0_1);     /* (2) for m1 */ | 
| 206 | 0 |     amm(rr1_red, rr1_red, coeff_red, m1_red, k0_1);   /* (3) for m1 */ | 
| 207 |  | 
 | 
| 208 | 0 |     amm(rr2_red, rr2_red, rr2_red, m2_red, k0_2);     /* (2) for m2 */ | 
| 209 | 0 |     amm(rr2_red, rr2_red, coeff_red, m2_red, k0_2);   /* (3) for m2 */ | 
| 210 |  | 
 | 
| 211 | 0 |     exp[0] = exp1; | 
| 212 | 0 |     exp[1] = exp2; | 
| 213 |  | 
 | 
| 214 | 0 |     k0[0] = k0_1; | 
| 215 | 0 |     k0[1] = k0_2; | 
| 216 |  | 
 | 
| 217 | 0 |     exp_x2(rr1_red, base1_red, exp, m1_red, rr1_red, k0); | 
| 218 |  |  | 
| 219 |  |     /* Convert rr_i back to regular radix */ | 
| 220 | 0 |     from_words52(res1, factor_size, rr1_red); | 
| 221 | 0 |     from_words52(res2, factor_size, rr2_red); | 
| 222 |  |  | 
| 223 |  |     /* bn_reduce_once_in_place expects number of BN_ULONG, not bit size */ | 
| 224 | 0 |     factor_size /= sizeof(BN_ULONG) * 8; | 
| 225 |  | 
 | 
| 226 | 0 |     bn_reduce_once_in_place(res1, /*carry=*/0, m1, storage, factor_size); | 
| 227 | 0 |     bn_reduce_once_in_place(res2, /*carry=*/0, m2, storage, factor_size); | 
| 228 |  | 
 | 
| 229 | 0 |     ret = 1; | 
| 230 | 0 | err: | 
| 231 | 0 |     if (storage != NULL) { | 
| 232 | 0 |         OPENSSL_cleanse(storage, storage_len_bytes); | 
| 233 | 0 |         OPENSSL_free(storage); | 
| 234 | 0 |     } | 
| 235 | 0 |     return ret; | 
| 236 | 0 | } | 
| 237 |  |  | 
| 238 |  | /* | 
| 239 |  |  * Dual 1024-bit w-ary modular exponentiation using prime moduli of the same | 
| 240 |  |  * bit size using Almost Montgomery Multiplication, optimized with AVX512_IFMA | 
| 241 |  |  * ISA. | 
| 242 |  |  * | 
| 243 |  |  * The parameter w (window size) = 5. | 
| 244 |  |  * | 
| 245 |  |  *  [out] res      - result of modular exponentiation: 2x20 qword | 
| 246 |  |  *                   values in 2^52 radix. | 
| 247 |  |  *  [in]  base     - base (2x20 qword values in 2^52 radix) | 
| 248 |  |  *  [in]  exp      - array of 2 pointers to 16 qword values in 2^64 radix. | 
| 249 |  |  *                   Exponent is not converted to redundant representation. | 
| 250 |  |  *  [in]  m        - moduli (2x20 qword values in 2^52 radix) | 
| 251 |  |  *  [in]  rr       - Montgomery parameter for 2 moduli: RR = 2^2080 mod m. | 
| 252 |  |  *                   (2x20 qword values in 2^52 radix) | 
| 253 |  |  *  [in]  k0       - Montgomery parameter for 2 moduli: k0 = -1/m mod 2^64 | 
| 254 |  |  * | 
| 255 |  |  * \return (void). | 
| 256 |  |  */ | 
| 257 |  | static void RSAZ_exp52x20_x2_256(BN_ULONG *out,          /* [2][20] */ | 
| 258 |  |                                  const BN_ULONG *base,   /* [2][20] */ | 
| 259 |  |                                  const BN_ULONG *exp[2], /* 2x16    */ | 
| 260 |  |                                  const BN_ULONG *m,      /* [2][20] */ | 
| 261 |  |                                  const BN_ULONG *rr,     /* [2][20] */ | 
| 262 |  |                                  const BN_ULONG k0[2]) | 
| 263 | 0 | { | 
| 264 | 0 | # define BITSIZE_MODULUS (1024) | 
| 265 | 0 | # define EXP_WIN_SIZE (5) | 
| 266 | 0 | # define EXP_WIN_MASK ((1U << EXP_WIN_SIZE) - 1) | 
| 267 |  | /* | 
| 268 |  |  * Number of digits (64-bit words) in redundant representation to handle | 
| 269 |  |  * modulus bits | 
| 270 |  |  */ | 
| 271 | 0 | # define RED_DIGITS (20) | 
| 272 | 0 | # define EXP_DIGITS (16) | 
| 273 | 0 | # define DAMM ossl_rsaz_amm52x20_x2_256 | 
| 274 |  | /* | 
| 275 |  |  * Squaring is done using multiplication now. That can be a subject of | 
| 276 |  |  * optimization in future. | 
| 277 |  |  */ | 
| 278 | 0 | # define DAMS(r,a,m,k0) \ | 
| 279 | 0 |               ossl_rsaz_amm52x20_x2_256((r),(a),(a),(m),(k0)) | 
| 280 |  |  | 
| 281 |  |     /* Allocate stack for red(undant) result Y and multiplier X */ | 
| 282 | 0 |     ALIGN64 BN_ULONG red_Y[2][RED_DIGITS]; | 
| 283 | 0 |     ALIGN64 BN_ULONG red_X[2][RED_DIGITS]; | 
| 284 |  |  | 
| 285 |  |     /* Allocate expanded exponent */ | 
| 286 | 0 |     ALIGN64 BN_ULONG expz[2][EXP_DIGITS + 1]; | 
| 287 |  |  | 
| 288 |  |     /* Pre-computed table of base powers */ | 
| 289 | 0 |     ALIGN64 BN_ULONG red_table[1U << EXP_WIN_SIZE][2][RED_DIGITS]; | 
| 290 |  | 
 | 
| 291 | 0 |     int idx; | 
| 292 |  | 
 | 
| 293 | 0 |     memset(red_Y, 0, sizeof(red_Y)); | 
| 294 | 0 |     memset(red_table, 0, sizeof(red_table)); | 
| 295 | 0 |     memset(red_X, 0, sizeof(red_X)); | 
| 296 |  |  | 
| 297 |  |     /* | 
| 298 |  |      * Compute table of powers base^i, i = 0, ..., (2^EXP_WIN_SIZE) - 1 | 
| 299 |  |      *   table[0] = mont(x^0) = mont(1) | 
| 300 |  |      *   table[1] = mont(x^1) = mont(x) | 
| 301 |  |      */ | 
| 302 | 0 |     red_X[0][0] = 1; | 
| 303 | 0 |     red_X[1][0] = 1; | 
| 304 | 0 |     DAMM(red_table[0][0], (const BN_ULONG*)red_X, rr, m, k0); | 
| 305 | 0 |     DAMM(red_table[1][0], base,  rr, m, k0); | 
| 306 |  | 
 | 
| 307 | 0 |     for (idx = 1; idx < (int)((1U << EXP_WIN_SIZE) / 2); idx++) { | 
| 308 | 0 |         DAMS(red_table[2 * idx + 0][0], red_table[1 * idx][0], m, k0); | 
| 309 | 0 |         DAMM(red_table[2 * idx + 1][0], red_table[2 * idx][0], red_table[1][0], m, k0); | 
| 310 | 0 |     } | 
| 311 |  |  | 
| 312 |  |     /* Copy and expand exponents */ | 
| 313 | 0 |     memcpy(expz[0], exp[0], EXP_DIGITS * sizeof(BN_ULONG)); | 
| 314 | 0 |     expz[0][EXP_DIGITS] = 0; | 
| 315 | 0 |     memcpy(expz[1], exp[1], EXP_DIGITS * sizeof(BN_ULONG)); | 
| 316 | 0 |     expz[1][EXP_DIGITS] = 0; | 
| 317 |  |  | 
| 318 |  |     /* Exponentiation */ | 
| 319 | 0 |     { | 
| 320 | 0 |         const int rem = BITSIZE_MODULUS % EXP_WIN_SIZE; | 
| 321 | 0 |         BN_ULONG table_idx_mask = EXP_WIN_MASK; | 
| 322 |  | 
 | 
| 323 | 0 |         int exp_bit_no = BITSIZE_MODULUS - rem; | 
| 324 | 0 |         int exp_chunk_no = exp_bit_no / 64; | 
| 325 | 0 |         int exp_chunk_shift = exp_bit_no % 64; | 
| 326 |  | 
 | 
| 327 | 0 |         BN_ULONG red_table_idx_0, red_table_idx_1; | 
| 328 |  |  | 
| 329 |  |         /* | 
| 330 |  |          * If rem == 0, then | 
| 331 |  |          *      exp_bit_no = modulus_bitsize - exp_win_size | 
| 332 |  |          * However, this isn't possible because rem is { 1024, 1536, 2048 } % 5 | 
| 333 |  |          * which is { 4, 1, 3 } respectively. | 
| 334 |  |          * | 
| 335 |  |          * If this assertion ever fails the fix above is easy. | 
| 336 |  |          */ | 
| 337 | 0 |         OPENSSL_assert(rem != 0); | 
| 338 |  |  | 
| 339 |  |         /* Process 1-st exp window - just init result */ | 
| 340 | 0 |         red_table_idx_0 = expz[0][exp_chunk_no]; | 
| 341 | 0 |         red_table_idx_1 = expz[1][exp_chunk_no]; | 
| 342 |  |         /* | 
| 343 |  |          * The function operates with fixed moduli sizes divisible by 64, | 
| 344 |  |          * thus table index here is always in supported range [0, EXP_WIN_SIZE). | 
| 345 |  |          */ | 
| 346 | 0 |         red_table_idx_0 >>= exp_chunk_shift; | 
| 347 | 0 |         red_table_idx_1 >>= exp_chunk_shift; | 
| 348 |  | 
 | 
| 349 | 0 |         ossl_extract_multiplier_2x20_win5(red_Y[0], (const BN_ULONG*)red_table, | 
| 350 | 0 |                                           (int)red_table_idx_0, 0); | 
| 351 | 0 |         ossl_extract_multiplier_2x20_win5(red_Y[1], (const BN_ULONG*)red_table, | 
| 352 | 0 |                                           (int)red_table_idx_1, 1); | 
| 353 |  |  | 
| 354 |  |         /* Process other exp windows */ | 
| 355 | 0 |         for (exp_bit_no -= EXP_WIN_SIZE; exp_bit_no >= 0; exp_bit_no -= EXP_WIN_SIZE) { | 
| 356 |  |             /* Extract pre-computed multiplier from the table */ | 
| 357 | 0 |             { | 
| 358 | 0 |                 BN_ULONG T; | 
| 359 |  | 
 | 
| 360 | 0 |                 exp_chunk_no = exp_bit_no / 64; | 
| 361 | 0 |                 exp_chunk_shift = exp_bit_no % 64; | 
| 362 | 0 |                 { | 
| 363 | 0 |                     red_table_idx_0 = expz[0][exp_chunk_no]; | 
| 364 | 0 |                     T = expz[0][exp_chunk_no + 1]; | 
| 365 |  | 
 | 
| 366 | 0 |                     red_table_idx_0 >>= exp_chunk_shift; | 
| 367 |  |                     /* | 
| 368 |  |                      * Get additional bits from then next quadword | 
| 369 |  |                      * when 64-bit boundaries are crossed. | 
| 370 |  |                      */ | 
| 371 | 0 |                     if (exp_chunk_shift > 64 - EXP_WIN_SIZE) { | 
| 372 | 0 |                         T <<= (64 - exp_chunk_shift); | 
| 373 | 0 |                         red_table_idx_0 ^= T; | 
| 374 | 0 |                     } | 
| 375 | 0 |                     red_table_idx_0 &= table_idx_mask; | 
| 376 |  | 
 | 
| 377 | 0 |                     ossl_extract_multiplier_2x20_win5(red_X[0], | 
| 378 | 0 |                                                       (const BN_ULONG*)red_table, | 
| 379 | 0 |                                                       (int)red_table_idx_0, 0); | 
| 380 | 0 |                 } | 
| 381 | 0 |                 { | 
| 382 | 0 |                     red_table_idx_1 = expz[1][exp_chunk_no]; | 
| 383 | 0 |                     T = expz[1][exp_chunk_no + 1]; | 
| 384 |  | 
 | 
| 385 | 0 |                     red_table_idx_1 >>= exp_chunk_shift; | 
| 386 |  |                     /* | 
| 387 |  |                      * Get additional bits from then next quadword | 
| 388 |  |                      * when 64-bit boundaries are crossed. | 
| 389 |  |                      */ | 
| 390 | 0 |                     if (exp_chunk_shift > 64 - EXP_WIN_SIZE) { | 
| 391 | 0 |                         T <<= (64 - exp_chunk_shift); | 
| 392 | 0 |                         red_table_idx_1 ^= T; | 
| 393 | 0 |                     } | 
| 394 | 0 |                     red_table_idx_1 &= table_idx_mask; | 
| 395 |  | 
 | 
| 396 | 0 |                     ossl_extract_multiplier_2x20_win5(red_X[1], | 
| 397 | 0 |                                                       (const BN_ULONG*)red_table, | 
| 398 | 0 |                                                       (int)red_table_idx_1, 1); | 
| 399 | 0 |                 } | 
| 400 | 0 |             } | 
| 401 |  |  | 
| 402 |  |             /* Series of squaring */ | 
| 403 | 0 |             DAMS((BN_ULONG*)red_Y, (const BN_ULONG*)red_Y, m, k0); | 
| 404 | 0 |             DAMS((BN_ULONG*)red_Y, (const BN_ULONG*)red_Y, m, k0); | 
| 405 | 0 |             DAMS((BN_ULONG*)red_Y, (const BN_ULONG*)red_Y, m, k0); | 
| 406 | 0 |             DAMS((BN_ULONG*)red_Y, (const BN_ULONG*)red_Y, m, k0); | 
| 407 | 0 |             DAMS((BN_ULONG*)red_Y, (const BN_ULONG*)red_Y, m, k0); | 
| 408 |  | 
 | 
| 409 | 0 |             DAMM((BN_ULONG*)red_Y, (const BN_ULONG*)red_Y, (const BN_ULONG*)red_X, m, k0); | 
| 410 | 0 |         } | 
| 411 | 0 |     } | 
| 412 |  |  | 
| 413 |  |     /* | 
| 414 |  |      * | 
| 415 |  |      * NB: After the last AMM of exponentiation in Montgomery domain, the result | 
| 416 |  |      * may be 1025-bit, but the conversion out of Montgomery domain performs an | 
| 417 |  |      * AMM(x,1) which guarantees that the final result is less than |m|, so no | 
| 418 |  |      * conditional subtraction is needed here. See "Efficient Software | 
| 419 |  |      * Implementations of Modular Exponentiation" (by Shay Gueron) paper for details. | 
| 420 |  |      */ | 
| 421 |  |  | 
| 422 |  |     /* Convert result back in regular 2^52 domain */ | 
| 423 | 0 |     memset(red_X, 0, sizeof(red_X)); | 
| 424 | 0 |     red_X[0][0] = 1; | 
| 425 | 0 |     red_X[1][0] = 1; | 
| 426 | 0 |     DAMM(out, (const BN_ULONG*)red_Y, (const BN_ULONG*)red_X, m, k0); | 
| 427 |  |  | 
| 428 |  |     /* Clear exponents */ | 
| 429 | 0 |     OPENSSL_cleanse(expz, sizeof(expz)); | 
| 430 | 0 |     OPENSSL_cleanse(red_Y, sizeof(red_Y)); | 
| 431 |  | 
 | 
| 432 | 0 | # undef DAMS | 
| 433 | 0 | # undef DAMM | 
| 434 | 0 | # undef EXP_DIGITS | 
| 435 | 0 | # undef RED_DIGITS | 
| 436 | 0 | # undef EXP_WIN_MASK | 
| 437 | 0 | # undef EXP_WIN_SIZE | 
| 438 | 0 | # undef BITSIZE_MODULUS | 
| 439 | 0 | } | 
| 440 |  |  | 
| 441 |  | static ossl_inline uint64_t get_digit52(const uint8_t *in, int in_len) | 
| 442 | 0 | { | 
| 443 | 0 |     uint64_t digit = 0; | 
| 444 |  | 
 | 
| 445 | 0 |     assert(in != NULL); | 
| 446 |  |  | 
| 447 | 0 |     for (; in_len > 0; in_len--) { | 
| 448 | 0 |         digit <<= 8; | 
| 449 | 0 |         digit += (uint64_t)(in[in_len - 1]); | 
| 450 | 0 |     } | 
| 451 | 0 |     return digit; | 
| 452 | 0 | } | 
| 453 |  |  | 
| 454 |  | /* | 
| 455 |  |  * Convert array of words in regular (base=2^64) representation to array of | 
| 456 |  |  * words in redundant (base=2^52) one. | 
| 457 |  |  */ | 
| 458 |  | static void to_words52(BN_ULONG *out, int out_len, | 
| 459 |  |                        const BN_ULONG *in, int in_bitsize) | 
| 460 | 0 | { | 
| 461 | 0 |     uint8_t *in_str = NULL; | 
| 462 |  | 
 | 
| 463 | 0 |     assert(out != NULL); | 
| 464 | 0 |     assert(in != NULL); | 
| 465 |  |     /* Check destination buffer capacity */ | 
| 466 | 0 |     assert(out_len >= number_of_digits(in_bitsize, DIGIT_SIZE)); | 
| 467 |  |  | 
| 468 | 0 |     in_str = (uint8_t *)in; | 
| 469 |  | 
 | 
| 470 | 0 |     for (; in_bitsize >= (2 * DIGIT_SIZE); in_bitsize -= (2 * DIGIT_SIZE), out += 2) { | 
| 471 | 0 |         uint64_t digit; | 
| 472 |  | 
 | 
| 473 | 0 |         memcpy(&digit, in_str, sizeof(digit)); | 
| 474 | 0 |         out[0] = digit & DIGIT_MASK; | 
| 475 | 0 |         in_str += 6; | 
| 476 | 0 |         memcpy(&digit, in_str, sizeof(digit)); | 
| 477 | 0 |         out[1] = (digit >> 4) & DIGIT_MASK; | 
| 478 | 0 |         in_str += 7; | 
| 479 | 0 |         out_len -= 2; | 
| 480 | 0 |     } | 
| 481 |  | 
 | 
| 482 | 0 |     if (in_bitsize > DIGIT_SIZE) { | 
| 483 | 0 |         uint64_t digit = get_digit52(in_str, 7); | 
| 484 |  | 
 | 
| 485 | 0 |         out[0] = digit & DIGIT_MASK; | 
| 486 | 0 |         in_str += 6; | 
| 487 | 0 |         in_bitsize -= DIGIT_SIZE; | 
| 488 | 0 |         digit = get_digit52(in_str, BITS2WORD8_SIZE(in_bitsize)); | 
| 489 | 0 |         out[1] = digit >> 4; | 
| 490 | 0 |         out += 2; | 
| 491 | 0 |         out_len -= 2; | 
| 492 | 0 |     } else if (in_bitsize > 0) { | 
| 493 | 0 |         out[0] = get_digit52(in_str, BITS2WORD8_SIZE(in_bitsize)); | 
| 494 | 0 |         out++; | 
| 495 | 0 |         out_len--; | 
| 496 | 0 |     } | 
| 497 |  | 
 | 
| 498 | 0 |     while (out_len > 0) { | 
| 499 | 0 |         *out = 0; | 
| 500 | 0 |         out_len--; | 
| 501 | 0 |         out++; | 
| 502 | 0 |     } | 
| 503 | 0 | } | 
| 504 |  |  | 
| 505 |  | static ossl_inline void put_digit52(uint8_t *pStr, int strLen, uint64_t digit) | 
| 506 | 0 | { | 
| 507 | 0 |     assert(pStr != NULL); | 
| 508 |  |  | 
| 509 | 0 |     for (; strLen > 0; strLen--) { | 
| 510 | 0 |         *pStr++ = (uint8_t)(digit & 0xFF); | 
| 511 | 0 |         digit >>= 8; | 
| 512 | 0 |     } | 
| 513 | 0 | } | 
| 514 |  |  | 
| 515 |  | /* | 
| 516 |  |  * Convert array of words in redundant (base=2^52) representation to array of | 
| 517 |  |  * words in regular (base=2^64) one. | 
| 518 |  |  */ | 
| 519 |  | static void from_words52(BN_ULONG *out, int out_bitsize, const BN_ULONG *in) | 
| 520 | 0 | { | 
| 521 | 0 |     int i; | 
| 522 | 0 |     int out_len = BITS2WORD64_SIZE(out_bitsize); | 
| 523 |  | 
 | 
| 524 | 0 |     assert(out != NULL); | 
| 525 | 0 |     assert(in != NULL); | 
| 526 |  |  | 
| 527 | 0 |     for (i = 0; i < out_len; i++) | 
| 528 | 0 |         out[i] = 0; | 
| 529 |  | 
 | 
| 530 | 0 |     { | 
| 531 | 0 |         uint8_t *out_str = (uint8_t *)out; | 
| 532 |  | 
 | 
| 533 | 0 |         for (; out_bitsize >= (2 * DIGIT_SIZE); | 
| 534 | 0 |                out_bitsize -= (2 * DIGIT_SIZE), in += 2) { | 
| 535 | 0 |             uint64_t digit; | 
| 536 |  | 
 | 
| 537 | 0 |             digit = in[0]; | 
| 538 | 0 |             memcpy(out_str, &digit, sizeof(digit)); | 
| 539 | 0 |             out_str += 6; | 
| 540 | 0 |             digit = digit >> 48 | in[1] << 4; | 
| 541 | 0 |             memcpy(out_str, &digit, sizeof(digit)); | 
| 542 | 0 |             out_str += 7; | 
| 543 | 0 |         } | 
| 544 |  | 
 | 
| 545 | 0 |         if (out_bitsize > DIGIT_SIZE) { | 
| 546 | 0 |             put_digit52(out_str, 7, in[0]); | 
| 547 | 0 |             out_str += 6; | 
| 548 | 0 |             out_bitsize -= DIGIT_SIZE; | 
| 549 | 0 |             put_digit52(out_str, BITS2WORD8_SIZE(out_bitsize), | 
| 550 | 0 |                         (in[1] << 4 | in[0] >> 48)); | 
| 551 | 0 |         } else if (out_bitsize) { | 
| 552 | 0 |             put_digit52(out_str, BITS2WORD8_SIZE(out_bitsize), in[0]); | 
| 553 | 0 |         } | 
| 554 | 0 |     } | 
| 555 | 0 | } | 
| 556 |  |  | 
| 557 |  | /* | 
| 558 |  |  * Set bit at index |idx| in the words array |a|. | 
| 559 |  |  * It does not do any boundaries checks, make sure the index is valid before | 
| 560 |  |  * calling the function. | 
| 561 |  |  */ | 
| 562 |  | static ossl_inline void set_bit(BN_ULONG *a, int idx) | 
| 563 | 0 | { | 
| 564 | 0 |     assert(a != NULL); | 
| 565 |  |  | 
| 566 | 0 |     { | 
| 567 | 0 |         int i, j; | 
| 568 |  | 
 | 
| 569 | 0 |         i = idx / BN_BITS2; | 
| 570 | 0 |         j = idx % BN_BITS2; | 
| 571 | 0 |         a[i] |= (((BN_ULONG)1) << j); | 
| 572 | 0 |     } | 
| 573 | 0 | } | 
| 574 |  |  | 
| 575 |  | #endif |