Coverage Report

Created: 2025-12-04 06:33

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/openssl33/crypto/ec/ecp_nistp384.c
Line
Count
Source
1
/*
2
 * Copyright 2023-2025 The OpenSSL Project Authors. All Rights Reserved.
3
 *
4
 * Licensed under the Apache License 2.0 (the "License").  You may not use
5
 * this file except in compliance with the License.  You can obtain a copy
6
 * in the file LICENSE in the source distribution or at
7
 * https://www.openssl.org/source/license.html
8
 */
9
10
/* Copyright 2023 IBM Corp.
11
 *
12
 * Licensed under the Apache License, Version 2.0 (the "License");
13
 *
14
 * you may not use this file except in compliance with the License.
15
 * You may obtain a copy of the License at
16
 *
17
 *     http://www.apache.org/licenses/LICENSE-2.0
18
 *
19
 *  Unless required by applicable law or agreed to in writing, software
20
 *  distributed under the License is distributed on an "AS IS" BASIS,
21
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22
 *  See the License for the specific language governing permissions and
23
 *  limitations under the License.
24
 */
25
26
/*
27
 * Designed for 56-bit limbs by Rohan McLure <rohan.mclure@linux.ibm.com>.
28
 * The layout is based on that of ecp_nistp{224,521}.c, allowing even for asm
29
 * acceleration of felem_{square,mul} as supported in these files.
30
 */
31
32
#include <openssl/e_os2.h>
33
34
#include <string.h>
35
#include <openssl/err.h>
36
#include "ec_local.h"
37
38
#include "internal/numbers.h"
39
40
#ifndef INT128_MAX
41
# error "Your compiler doesn't appear to support 128-bit integer types"
42
#endif
43
44
typedef uint8_t u8;
45
typedef uint64_t u64;
46
47
/*
48
 * The underlying field. P384 operates over GF(2^384-2^128-2^96+2^32-1). We
49
 * can serialize an element of this field into 48 bytes. We call this an
50
 * felem_bytearray.
51
 */
52
53
typedef u8 felem_bytearray[48];
54
55
/*
56
 * These are the parameters of P384, taken from FIPS 186-3, section D.1.2.4.
57
 * These values are big-endian.
58
 */
59
static const felem_bytearray nistp384_curve_params[5] = {
60
  {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* p */
61
   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
62
   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
63
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF},
64
  {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* a = -3 */
65
   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
66
   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
67
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFC},
68
  {0xB3, 0x31, 0x2F, 0xA7, 0xE2, 0x3E, 0xE7, 0xE4, 0x98, 0x8E, 0x05, 0x6B, /* b */
69
   0xE3, 0xF8, 0x2D, 0x19, 0x18, 0x1D, 0x9C, 0x6E, 0xFE, 0x81, 0x41, 0x12,
70
   0x03, 0x14, 0x08, 0x8F, 0x50, 0x13, 0x87, 0x5A, 0xC6, 0x56, 0x39, 0x8D,
71
   0x8A, 0x2E, 0xD1, 0x9D, 0x2A, 0x85, 0xC8, 0xED, 0xD3, 0xEC, 0x2A, 0xEF},
72
  {0xAA, 0x87, 0xCA, 0x22, 0xBE, 0x8B, 0x05, 0x37, 0x8E, 0xB1, 0xC7, 0x1E, /* x */
73
   0xF3, 0x20, 0xAD, 0x74, 0x6E, 0x1D, 0x3B, 0x62, 0x8B, 0xA7, 0x9B, 0x98,
74
   0x59, 0xF7, 0x41, 0xE0, 0x82, 0x54, 0x2A, 0x38, 0x55, 0x02, 0xF2, 0x5D,
75
   0xBF, 0x55, 0x29, 0x6C, 0x3A, 0x54, 0x5E, 0x38, 0x72, 0x76, 0x0A, 0xB7},
76
  {0x36, 0x17, 0xDE, 0x4A, 0x96, 0x26, 0x2C, 0x6F, 0x5D, 0x9E, 0x98, 0xBF, /* y */
77
   0x92, 0x92, 0xDC, 0x29, 0xF8, 0xF4, 0x1D, 0xBD, 0x28, 0x9A, 0x14, 0x7C,
78
   0xE9, 0xDA, 0x31, 0x13, 0xB5, 0xF0, 0xB8, 0xC0, 0x0A, 0x60, 0xB1, 0xCE,
79
   0x1D, 0x7E, 0x81, 0x9D, 0x7A, 0x43, 0x1D, 0x7C, 0x90, 0xEA, 0x0E, 0x5F},
80
};
81
82
/*-
83
 * The representation of field elements.
84
 * ------------------------------------
85
 *
86
 * We represent field elements with seven values. These values are either 64 or
87
 * 128 bits and the field element represented is:
88
 *   v[0]*2^0 + v[1]*2^56 + v[2]*2^112 + ... + v[6]*2^336  (mod p)
89
 * Each of the seven values is called a 'limb'. Since the limbs are spaced only
90
 * 56 bits apart, but are greater than 56 bits in length, the most significant
91
 * bits of each limb overlap with the least significant bits of the next
92
 *
93
 * This representation is considered to be 'redundant' in the sense that
94
 * intermediate values can each contain more than a 56-bit value in each limb.
95
 * Reduction causes all but the final limb to be reduced to contain a value less
96
 * than 2^56, with the final value represented allowed to be larger than 2^384,
97
 * inasmuch as we can be sure that arithmetic overflow remains impossible. The
98
 * reduced value must of course be congruent to the unreduced value.
99
 *
100
 * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
101
 * 'widefelem', featuring enough bits to store the result of a multiplication
102
 * and even some further arithmetic without need for immediate reduction.
103
 */
104
105
691M
#define NLIMBS 7
106
107
typedef uint64_t limb;
108
typedef uint128_t widelimb;
109
typedef limb limb_aX __attribute((__aligned__(1)));
110
typedef limb felem[NLIMBS];
111
typedef widelimb widefelem[2*NLIMBS-1];
112
113
static const limb bottom56bits = 0xffffffffffffff;
114
115
/* Helper functions (de)serialising reduced field elements in little endian */
116
static void bin48_to_felem(felem out, const u8 in[48])
117
31.3k
{
118
31.3k
    memset(out, 0, 56);
119
31.3k
    out[0] = (*((limb *) & in[0])) & bottom56bits;
120
31.3k
    out[1] = (*((limb_aX *) & in[7])) & bottom56bits;
121
31.3k
    out[2] = (*((limb_aX *) & in[14])) & bottom56bits;
122
31.3k
    out[3] = (*((limb_aX *) & in[21])) & bottom56bits;
123
31.3k
    out[4] = (*((limb_aX *) & in[28])) & bottom56bits;
124
31.3k
    out[5] = (*((limb_aX *) & in[35])) & bottom56bits;
125
31.3k
    memmove(&out[6], &in[42], 6);
126
31.3k
}
127
128
static void felem_to_bin48(u8 out[48], const felem in)
129
48.0k
{
130
48.0k
    memset(out, 0, 48);
131
48.0k
    (*((limb *) & out[0]))     |= (in[0] & bottom56bits);
132
48.0k
    (*((limb_aX *) & out[7]))  |= (in[1] & bottom56bits);
133
48.0k
    (*((limb_aX *) & out[14])) |= (in[2] & bottom56bits);
134
48.0k
    (*((limb_aX *) & out[21])) |= (in[3] & bottom56bits);
135
48.0k
    (*((limb_aX *) & out[28])) |= (in[4] & bottom56bits);
136
48.0k
    (*((limb_aX *) & out[35])) |= (in[5] & bottom56bits);
137
48.0k
    memmove(&out[42], &in[6], 6);
138
48.0k
}
139
140
/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
141
static int BN_to_felem(felem out, const BIGNUM *bn)
142
31.3k
{
143
31.3k
    felem_bytearray b_out;
144
31.3k
    int num_bytes;
145
146
31.3k
    if (BN_is_negative(bn)) {
147
0
        ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
148
0
        return 0;
149
0
    }
150
31.3k
    num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
151
31.3k
    if (num_bytes < 0) {
152
0
        ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
153
0
        return 0;
154
0
    }
155
31.3k
    bin48_to_felem(out, b_out);
156
31.3k
    return 1;
157
31.3k
}
158
159
/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
160
static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
161
48.0k
{
162
48.0k
    felem_bytearray b_out;
163
164
48.0k
    felem_to_bin48(b_out, in);
165
48.0k
    return BN_lebin2bn(b_out, sizeof(b_out), out);
166
48.0k
}
167
168
/*-
169
 * Field operations
170
 * ----------------
171
 */
172
173
static void felem_one(felem out)
174
0
{
175
0
    out[0] = 1;
176
0
    memset(&out[1], 0, sizeof(limb) * (NLIMBS-1));
177
0
}
178
179
static void felem_assign(felem out, const felem in)
180
9.60M
{
181
9.60M
    memcpy(out, in, sizeof(felem));
182
9.60M
}
183
184
/* felem_sum64 sets out = out + in. */
185
static void felem_sum64(felem out, const felem in)
186
4.18M
{
187
4.18M
    unsigned int i;
188
189
33.5M
    for (i = 0; i < NLIMBS; i++)
190
29.3M
        out[i] += in[i];
191
4.18M
}
192
193
/* felem_scalar sets out = in * scalar */
194
static void felem_scalar(felem out, const felem in, limb scalar)
195
12.4M
{
196
12.4M
    unsigned int i;
197
198
99.8M
    for (i = 0; i < NLIMBS; i++)
199
87.3M
        out[i] = in[i] * scalar;
200
12.4M
}
201
202
/* felem_scalar64 sets out = out * scalar */
203
static void felem_scalar64(felem out, limb scalar)
204
5.87M
{
205
5.87M
    unsigned int i;
206
207
47.0M
    for (i = 0; i < NLIMBS; i++)
208
41.1M
        out[i] *= scalar;
209
5.87M
}
210
211
/* felem_scalar128 sets out = out * scalar */
212
static void felem_scalar128(widefelem out, limb scalar)
213
1.95M
{
214
1.95M
    unsigned int i;
215
216
27.4M
    for (i = 0; i < 2*NLIMBS-1; i++)
217
25.4M
        out[i] *= scalar;
218
1.95M
}
219
220
/*-
221
 * felem_neg sets |out| to |-in|
222
 * On entry:
223
 *   in[i] < 2^60 - 2^29
224
 * On exit:
225
 *   out[i] < 2^60
226
 */
227
static void felem_neg(felem out, const felem in)
228
178k
{
229
    /*
230
     * In order to prevent underflow, we add a multiple of p before subtracting.
231
     * Use telescopic sums to represent 2^12 * p redundantly with each limb
232
     * of the form 2^60 + ...
233
     */
234
178k
    static const limb two60m52m4 = (((limb) 1) << 60)
235
178k
                                 - (((limb) 1) << 52)
236
178k
                                 - (((limb) 1) << 4);
237
178k
    static const limb two60p44m12 = (((limb) 1) << 60)
238
178k
                                  + (((limb) 1) << 44)
239
178k
                                  - (((limb) 1) << 12);
240
178k
    static const limb two60m28m4 = (((limb) 1) << 60)
241
178k
                                 - (((limb) 1) << 28)
242
178k
                                 - (((limb) 1) << 4);
243
178k
    static const limb two60m4 = (((limb) 1) << 60)
244
178k
                              - (((limb) 1) << 4);
245
246
178k
    out[0] = two60p44m12 - in[0];
247
178k
    out[1] = two60m52m4 - in[1];
248
178k
    out[2] = two60m28m4 - in[2];
249
178k
    out[3] = two60m4 - in[3];
250
178k
    out[4] = two60m4 - in[4];
251
178k
    out[5] = two60m4 - in[5];
252
178k
    out[6] = two60m4 - in[6];
253
178k
}
254
255
#if defined(ECP_NISTP384_ASM)
256
void p384_felem_diff64(felem out, const felem in);
257
void p384_felem_diff128(widefelem out, const widefelem in);
258
void p384_felem_diff_128_64(widefelem out, const felem in);
259
260
# define felem_diff64           p384_felem_diff64
261
# define felem_diff128          p384_felem_diff128
262
# define felem_diff_128_64      p384_felem_diff_128_64
263
264
#else
265
/*-
266
 * felem_diff64 subtracts |in| from |out|
267
 * On entry:
268
 *   in[i] < 2^60 - 2^52 - 2^4
269
 * On exit:
270
 *   out[i] < out_orig[i] + 2^60 + 2^44
271
 */
272
static void felem_diff64(felem out, const felem in)
273
3.29M
{
274
    /*
275
     * In order to prevent underflow, we add a multiple of p before subtracting.
276
     * Use telescopic sums to represent 2^12 * p redundantly with each limb
277
     * of the form 2^60 + ...
278
     */
279
280
3.29M
    static const limb two60m52m4 = (((limb) 1) << 60)
281
3.29M
                                 - (((limb) 1) << 52)
282
3.29M
                                 - (((limb) 1) << 4);
283
3.29M
    static const limb two60p44m12 = (((limb) 1) << 60)
284
3.29M
                                  + (((limb) 1) << 44)
285
3.29M
                                  - (((limb) 1) << 12);
286
3.29M
    static const limb two60m28m4 = (((limb) 1) << 60)
287
3.29M
                                 - (((limb) 1) << 28)
288
3.29M
                                 - (((limb) 1) << 4);
289
3.29M
    static const limb two60m4 = (((limb) 1) << 60)
290
3.29M
                              - (((limb) 1) << 4);
291
292
3.29M
    out[0] += two60p44m12 - in[0];
293
3.29M
    out[1] += two60m52m4 - in[1];
294
3.29M
    out[2] += two60m28m4 - in[2];
295
3.29M
    out[3] += two60m4 - in[3];
296
3.29M
    out[4] += two60m4 - in[4];
297
3.29M
    out[5] += two60m4 - in[5];
298
3.29M
    out[6] += two60m4 - in[6];
299
3.29M
}
300
301
/*
302
 * in[i] < 2^63
303
 * out[i] < out_orig[i] + 2^64 + 2^48
304
 */
305
static void felem_diff_128_64(widefelem out, const felem in)
306
5.55M
{
307
    /*
308
     * In order to prevent underflow, we add a multiple of p before subtracting.
309
     * Use telescopic sums to represent 2^16 * p redundantly with each limb
310
     * of the form 2^64 + ...
311
     */
312
313
5.55M
    static const widelimb two64m56m8 = (((widelimb) 1) << 64)
314
5.55M
                                     - (((widelimb) 1) << 56)
315
5.55M
                                     - (((widelimb) 1) << 8);
316
5.55M
    static const widelimb two64m32m8 = (((widelimb) 1) << 64)
317
5.55M
                                     - (((widelimb) 1) << 32)
318
5.55M
                                     - (((widelimb) 1) << 8);
319
5.55M
    static const widelimb two64m8 = (((widelimb) 1) << 64)
320
5.55M
                                  - (((widelimb) 1) << 8);
321
5.55M
    static const widelimb two64p48m16 = (((widelimb) 1) << 64)
322
5.55M
                                      + (((widelimb) 1) << 48)
323
5.55M
                                      - (((widelimb) 1) << 16);
324
5.55M
    unsigned int i;
325
326
5.55M
    out[0] += two64p48m16;
327
5.55M
    out[1] += two64m56m8;
328
5.55M
    out[2] += two64m32m8;
329
5.55M
    out[3] += two64m8;
330
5.55M
    out[4] += two64m8;
331
5.55M
    out[5] += two64m8;
332
5.55M
    out[6] += two64m8;
333
334
44.4M
    for (i = 0; i < NLIMBS; i++)
335
38.9M
        out[i] -= in[i];
336
5.55M
}
337
338
/*
339
 * in[i] < 2^127 - 2^119 - 2^71
340
 * out[i] < out_orig[i] + 2^127 + 2^111
341
 */
342
static void felem_diff128(widefelem out, const widefelem in)
343
1.95M
{
344
    /*
345
     * In order to prevent underflow, we add a multiple of p before subtracting.
346
     * Use telescopic sums to represent 2^415 * p redundantly with each limb
347
     * of the form 2^127 + ...
348
     */
349
350
1.95M
    static const widelimb two127 = ((widelimb) 1) << 127;
351
1.95M
    static const widelimb two127m71 = (((widelimb) 1) << 127)
352
1.95M
                                    - (((widelimb) 1) << 71);
353
1.95M
    static const widelimb two127p111m79m71 = (((widelimb) 1) << 127)
354
1.95M
                                           + (((widelimb) 1) << 111)
355
1.95M
                                           - (((widelimb) 1) << 79)
356
1.95M
                                           - (((widelimb) 1) << 71);
357
1.95M
    static const widelimb two127m119m71 = (((widelimb) 1) << 127)
358
1.95M
                                        - (((widelimb) 1) << 119)
359
1.95M
                                        - (((widelimb) 1) << 71);
360
1.95M
    static const widelimb two127m95m71 = (((widelimb) 1) << 127)
361
1.95M
                                       - (((widelimb) 1) << 95)
362
1.95M
                                       - (((widelimb) 1) << 71);
363
1.95M
    unsigned int i;
364
365
1.95M
    out[0]  += two127;
366
1.95M
    out[1]  += two127m71;
367
1.95M
    out[2]  += two127m71;
368
1.95M
    out[3]  += two127m71;
369
1.95M
    out[4]  += two127m71;
370
1.95M
    out[5]  += two127m71;
371
1.95M
    out[6]  += two127p111m79m71;
372
1.95M
    out[7]  += two127m119m71;
373
1.95M
    out[8]  += two127m95m71;
374
1.95M
    out[9]  += two127m71;
375
1.95M
    out[10] += two127m71;
376
1.95M
    out[11] += two127m71;
377
1.95M
    out[12] += two127m71;
378
379
27.4M
    for (i = 0; i < 2*NLIMBS-1; i++)
380
25.4M
        out[i] -= in[i];
381
1.95M
}
382
#endif /* ECP_NISTP384_ASM */
383
384
static void felem_square_ref(widefelem out, const felem in)
385
12.0M
{
386
12.0M
    felem inx2;
387
12.0M
    felem_scalar(inx2, in, 2);
388
389
12.0M
    out[0] = ((uint128_t) in[0]) * in[0];
390
391
12.0M
    out[1] = ((uint128_t) in[0]) * inx2[1];
392
393
12.0M
    out[2] = ((uint128_t) in[0]) * inx2[2]
394
12.0M
           + ((uint128_t) in[1]) * in[1];
395
396
12.0M
    out[3] = ((uint128_t) in[0]) * inx2[3]
397
12.0M
           + ((uint128_t) in[1]) * inx2[2];
398
399
12.0M
    out[4] = ((uint128_t) in[0]) * inx2[4]
400
12.0M
           + ((uint128_t) in[1]) * inx2[3]
401
12.0M
           + ((uint128_t) in[2]) * in[2];
402
403
12.0M
    out[5] = ((uint128_t) in[0]) * inx2[5]
404
12.0M
           + ((uint128_t) in[1]) * inx2[4]
405
12.0M
           + ((uint128_t) in[2]) * inx2[3];
406
407
12.0M
    out[6] = ((uint128_t) in[0]) * inx2[6]
408
12.0M
           + ((uint128_t) in[1]) * inx2[5]
409
12.0M
           + ((uint128_t) in[2]) * inx2[4]
410
12.0M
           + ((uint128_t) in[3]) * in[3];
411
412
12.0M
    out[7] = ((uint128_t) in[1]) * inx2[6]
413
12.0M
           + ((uint128_t) in[2]) * inx2[5]
414
12.0M
           + ((uint128_t) in[3]) * inx2[4];
415
416
12.0M
    out[8] = ((uint128_t) in[2]) * inx2[6]
417
12.0M
           + ((uint128_t) in[3]) * inx2[5]
418
12.0M
           + ((uint128_t) in[4]) * in[4];
419
420
12.0M
    out[9] = ((uint128_t) in[3]) * inx2[6]
421
12.0M
           + ((uint128_t) in[4]) * inx2[5];
422
423
12.0M
    out[10] = ((uint128_t) in[4]) * inx2[6]
424
12.0M
            + ((uint128_t) in[5]) * in[5];
425
426
12.0M
    out[11] = ((uint128_t) in[5]) * inx2[6];
427
428
12.0M
    out[12] = ((uint128_t) in[6]) * in[6];
429
12.0M
}
430
431
static void felem_mul_ref(widefelem out, const felem in1, const felem in2)
432
9.72M
{
433
9.72M
    out[0] = ((uint128_t) in1[0]) * in2[0];
434
435
9.72M
    out[1] = ((uint128_t) in1[0]) * in2[1]
436
9.72M
           + ((uint128_t) in1[1]) * in2[0];
437
438
9.72M
    out[2] = ((uint128_t) in1[0]) * in2[2]
439
9.72M
           + ((uint128_t) in1[1]) * in2[1]
440
9.72M
           + ((uint128_t) in1[2]) * in2[0];
441
442
9.72M
    out[3] = ((uint128_t) in1[0]) * in2[3]
443
9.72M
           + ((uint128_t) in1[1]) * in2[2]
444
9.72M
           + ((uint128_t) in1[2]) * in2[1]
445
9.72M
           + ((uint128_t) in1[3]) * in2[0];
446
447
9.72M
    out[4] = ((uint128_t) in1[0]) * in2[4]
448
9.72M
           + ((uint128_t) in1[1]) * in2[3]
449
9.72M
           + ((uint128_t) in1[2]) * in2[2]
450
9.72M
           + ((uint128_t) in1[3]) * in2[1]
451
9.72M
           + ((uint128_t) in1[4]) * in2[0];
452
453
9.72M
    out[5] = ((uint128_t) in1[0]) * in2[5]
454
9.72M
           + ((uint128_t) in1[1]) * in2[4]
455
9.72M
           + ((uint128_t) in1[2]) * in2[3]
456
9.72M
           + ((uint128_t) in1[3]) * in2[2]
457
9.72M
           + ((uint128_t) in1[4]) * in2[1]
458
9.72M
           + ((uint128_t) in1[5]) * in2[0];
459
460
9.72M
    out[6] = ((uint128_t) in1[0]) * in2[6]
461
9.72M
           + ((uint128_t) in1[1]) * in2[5]
462
9.72M
           + ((uint128_t) in1[2]) * in2[4]
463
9.72M
           + ((uint128_t) in1[3]) * in2[3]
464
9.72M
           + ((uint128_t) in1[4]) * in2[2]
465
9.72M
           + ((uint128_t) in1[5]) * in2[1]
466
9.72M
           + ((uint128_t) in1[6]) * in2[0];
467
468
9.72M
    out[7] = ((uint128_t) in1[1]) * in2[6]
469
9.72M
           + ((uint128_t) in1[2]) * in2[5]
470
9.72M
           + ((uint128_t) in1[3]) * in2[4]
471
9.72M
           + ((uint128_t) in1[4]) * in2[3]
472
9.72M
           + ((uint128_t) in1[5]) * in2[2]
473
9.72M
           + ((uint128_t) in1[6]) * in2[1];
474
475
9.72M
    out[8] = ((uint128_t) in1[2]) * in2[6]
476
9.72M
           + ((uint128_t) in1[3]) * in2[5]
477
9.72M
           + ((uint128_t) in1[4]) * in2[4]
478
9.72M
           + ((uint128_t) in1[5]) * in2[3]
479
9.72M
           + ((uint128_t) in1[6]) * in2[2];
480
481
9.72M
    out[9] = ((uint128_t) in1[3]) * in2[6]
482
9.72M
           + ((uint128_t) in1[4]) * in2[5]
483
9.72M
           + ((uint128_t) in1[5]) * in2[4]
484
9.72M
           + ((uint128_t) in1[6]) * in2[3];
485
486
9.72M
    out[10] = ((uint128_t) in1[4]) * in2[6]
487
9.72M
            + ((uint128_t) in1[5]) * in2[5]
488
9.72M
            + ((uint128_t) in1[6]) * in2[4];
489
490
9.72M
    out[11] = ((uint128_t) in1[5]) * in2[6]
491
9.72M
            + ((uint128_t) in1[6]) * in2[5];
492
493
9.72M
    out[12] = ((uint128_t) in1[6]) * in2[6];
494
9.72M
}
495
496
/*-
497
 * Reduce thirteen 128-bit coefficients to seven 64-bit coefficients.
498
 * in[i] < 2^128 - 2^125
499
 * out[i] < 2^56 for i < 6,
500
 * out[6] <= 2^48
501
 *
502
 * The technique in use here stems from the format of the prime modulus:
503
 * P384 = 2^384 - delta
504
 *
505
 * Thus we can reduce numbers of the form (X + 2^384 * Y) by substituting
506
 * them with (X + delta Y), with delta = 2^128 + 2^96 + (-2^32 + 1). These
507
 * coefficients are still quite large, and so we repeatedly apply this
508
 * technique on high-order bits in order to guarantee the desired bounds on
509
 * the size of our output.
510
 *
511
 * The three phases of elimination are as follows:
512
 * [1]: Y = 2^120 (in[12] | in[11] | in[10] | in[9])
513
 * [2]: Y = 2^8 (acc[8] | acc[7])
514
 * [3]: Y = 2^48 (acc[6] >> 48)
515
 * (Where a | b | c | d = (2^56)^3 a + (2^56)^2 b + (2^56) c + d)
516
 */
517
static void felem_reduce_ref(felem out, const widefelem in)
518
19.8M
{
519
    /*
520
     * In order to prevent underflow, we add a multiple of p before subtracting.
521
     * Use telescopic sums to represent 2^76 * p redundantly with each limb
522
     * of the form 2^124 + ...
523
     */
524
19.8M
    static const widelimb two124m68 = (((widelimb) 1) << 124)
525
19.8M
                                    - (((widelimb) 1) << 68);
526
19.8M
    static const widelimb two124m116m68 = (((widelimb) 1) << 124)
527
19.8M
                                        - (((widelimb) 1) << 116)
528
19.8M
                                        - (((widelimb) 1) << 68);
529
19.8M
    static const widelimb two124p108m76 = (((widelimb) 1) << 124)
530
19.8M
                                        + (((widelimb) 1) << 108)
531
19.8M
                                        - (((widelimb) 1) << 76);
532
19.8M
    static const widelimb two124m92m68 = (((widelimb) 1) << 124)
533
19.8M
                                       - (((widelimb) 1) << 92)
534
19.8M
                                       - (((widelimb) 1) << 68);
535
19.8M
    widelimb temp, acc[9];
536
19.8M
    unsigned int i;
537
538
19.8M
    memcpy(acc, in, sizeof(widelimb) * 9);
539
540
19.8M
    acc[0] += two124p108m76;
541
19.8M
    acc[1] += two124m116m68;
542
19.8M
    acc[2] += two124m92m68;
543
19.8M
    acc[3] += two124m68;
544
19.8M
    acc[4] += two124m68;
545
19.8M
    acc[5] += two124m68;
546
19.8M
    acc[6] += two124m68;
547
548
    /* [1]: Eliminate in[9], ..., in[12] */
549
19.8M
    acc[8] += in[12] >> 32;
550
19.8M
    acc[7] += (in[12] & 0xffffffff) << 24;
551
19.8M
    acc[7] += in[12] >> 8;
552
19.8M
    acc[6] += (in[12] & 0xff) << 48;
553
19.8M
    acc[6] -= in[12] >> 16;
554
19.8M
    acc[5] -= (in[12] & 0xffff) << 40;
555
19.8M
    acc[6] += in[12] >> 48;
556
19.8M
    acc[5] += (in[12] & 0xffffffffffff) << 8;
557
558
19.8M
    acc[7] += in[11] >> 32;
559
19.8M
    acc[6] += (in[11] & 0xffffffff) << 24;
560
19.8M
    acc[6] += in[11] >> 8;
561
19.8M
    acc[5] += (in[11] & 0xff) << 48;
562
19.8M
    acc[5] -= in[11] >> 16;
563
19.8M
    acc[4] -= (in[11] & 0xffff) << 40;
564
19.8M
    acc[5] += in[11] >> 48;
565
19.8M
    acc[4] += (in[11] & 0xffffffffffff) << 8;
566
567
19.8M
    acc[6] += in[10] >> 32;
568
19.8M
    acc[5] += (in[10] & 0xffffffff) << 24;
569
19.8M
    acc[5] += in[10] >> 8;
570
19.8M
    acc[4] += (in[10] & 0xff) << 48;
571
19.8M
    acc[4] -= in[10] >> 16;
572
19.8M
    acc[3] -= (in[10] & 0xffff) << 40;
573
19.8M
    acc[4] += in[10] >> 48;
574
19.8M
    acc[3] += (in[10] & 0xffffffffffff) << 8;
575
576
19.8M
    acc[5] += in[9] >> 32;
577
19.8M
    acc[4] += (in[9] & 0xffffffff) << 24;
578
19.8M
    acc[4] += in[9] >> 8;
579
19.8M
    acc[3] += (in[9] & 0xff) << 48;
580
19.8M
    acc[3] -= in[9] >> 16;
581
19.8M
    acc[2] -= (in[9] & 0xffff) << 40;
582
19.8M
    acc[3] += in[9] >> 48;
583
19.8M
    acc[2] += (in[9] & 0xffffffffffff) << 8;
584
585
    /*
586
     * [2]: Eliminate acc[7], acc[8], that is the 7 and eighth limbs, as
587
     * well as the contributions made from eliminating higher limbs.
588
     * acc[7] < in[7] + 2^120 + 2^56 < in[7] + 2^121
589
     * acc[8] < in[8] + 2^96
590
     */
591
19.8M
    acc[4] += acc[8] >> 32;
592
19.8M
    acc[3] += (acc[8] & 0xffffffff) << 24;
593
19.8M
    acc[3] += acc[8] >> 8;
594
19.8M
    acc[2] += (acc[8] & 0xff) << 48;
595
19.8M
    acc[2] -= acc[8] >> 16;
596
19.8M
    acc[1] -= (acc[8] & 0xffff) << 40;
597
19.8M
    acc[2] += acc[8] >> 48;
598
19.8M
    acc[1] += (acc[8] & 0xffffffffffff) << 8;
599
600
19.8M
    acc[3] += acc[7] >> 32;
601
19.8M
    acc[2] += (acc[7] & 0xffffffff) << 24;
602
19.8M
    acc[2] += acc[7] >> 8;
603
19.8M
    acc[1] += (acc[7] & 0xff) << 48;
604
19.8M
    acc[1] -= acc[7] >> 16;
605
19.8M
    acc[0] -= (acc[7] & 0xffff) << 40;
606
19.8M
    acc[1] += acc[7] >> 48;
607
19.8M
    acc[0] += (acc[7] & 0xffffffffffff) << 8;
608
609
    /*-
610
     * acc[k] < in[k] + 2^124 + 2^121 
611
     *        < in[k] + 2^125
612
     *        < 2^128, for k <= 6
613
     */
614
615
    /*
616
     * Carry 4 -> 5 -> 6
617
     * This has the effect of ensuring that these more significant limbs
618
     * will be small in value after eliminating high bits from acc[6].
619
     */
620
19.8M
    acc[5] += acc[4] >> 56;
621
19.8M
    acc[4] &= 0x00ffffffffffffff;
622
623
19.8M
    acc[6] += acc[5] >> 56;
624
19.8M
    acc[5] &= 0x00ffffffffffffff;
625
626
    /*-
627
     * acc[6] < in[6] + 2^124 + 2^121 + 2^72 + 2^16
628
     *        < in[6] + 2^125
629
     *        < 2^128
630
     */
631
632
    /* [3]: Eliminate high bits of acc[6] */
633
19.8M
    temp = acc[6] >> 48;
634
19.8M
    acc[6] &= 0x0000ffffffffffff;
635
    
636
    /* temp < 2^80 */
637
638
19.8M
    acc[3] += temp >> 40;
639
19.8M
    acc[2] += (temp & 0xffffffffff) << 16;
640
19.8M
    acc[2] += temp >> 16;
641
19.8M
    acc[1] += (temp & 0xffff) << 40;
642
19.8M
    acc[1] -= temp >> 24;
643
19.8M
    acc[0] -= (temp & 0xffffff) << 32;
644
19.8M
    acc[0] += temp;
645
646
    /*-
647
     * acc[k] < acc_old[k] + 2^64 + 2^56
648
     *        < in[k] + 2^124 + 2^121 + 2^72 + 2^64 + 2^56 + 2^16 , k < 4
649
     */
650
651
    /* Carry 0 -> 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
652
19.8M
    acc[1] += acc[0] >> 56;   /* acc[1] < acc_old[1] + 2^72 */
653
19.8M
    acc[0] &= 0x00ffffffffffffff;
654
655
19.8M
    acc[2] += acc[1] >> 56;   /* acc[2] < acc_old[2] + 2^72 + 2^16 */
656
19.8M
    acc[1] &= 0x00ffffffffffffff;
657
658
19.8M
    acc[3] += acc[2] >> 56;   /* acc[3] < acc_old[3] + 2^72 + 2^16 */
659
19.8M
    acc[2] &= 0x00ffffffffffffff;
660
661
    /*-
662
     * acc[k] < acc_old[k] + 2^72 + 2^16
663
     *        < in[k] + 2^124 + 2^121 + 2^73 + 2^64 + 2^56 + 2^17
664
     *        < in[k] + 2^125
665
     *        < 2^128 , k < 4
666
     */
667
668
19.8M
    acc[4] += acc[3] >> 56;   /*-
669
                               * acc[4] < acc_old[4] + 2^72 + 2^16
670
                               *        < 2^72 + 2^56 + 2^16
671
                               */
672
19.8M
    acc[3] &= 0x00ffffffffffffff;
673
674
19.8M
    acc[5] += acc[4] >> 56;   /*-
675
                               * acc[5] < acc_old[5] + 2^16 + 1
676
                               *        < 2^56 + 2^16 + 1
677
                               */
678
19.8M
    acc[4] &= 0x00ffffffffffffff;
679
680
19.8M
    acc[6] += acc[5] >> 56;   /* acc[6] < 2^48 + 1 <= 2^48 */
681
19.8M
    acc[5] &= 0x00ffffffffffffff;
682
683
158M
    for (i = 0; i < NLIMBS; i++)
684
138M
        out[i] = acc[i];
685
19.8M
}
686
687
static ossl_inline void felem_square_reduce_ref(felem out, const felem in)
688
7.22M
{
689
7.22M
    widefelem tmp;
690
691
7.22M
    felem_square_ref(tmp, in);
692
7.22M
    felem_reduce_ref(out, tmp);
693
7.22M
}
694
695
static ossl_inline void felem_mul_reduce_ref(felem out, const felem in1, const felem in2)
696
5.86M
{
697
5.86M
    widefelem tmp;
698
699
5.86M
    felem_mul_ref(tmp, in1, in2);
700
5.86M
    felem_reduce_ref(out, tmp);
701
5.86M
}
702
703
#if defined(ECP_NISTP384_ASM)
704
static void felem_square_wrapper(widefelem out, const felem in);
705
static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2);
706
707
static void (*felem_square_p)(widefelem out, const felem in) =
708
    felem_square_wrapper;
709
static void (*felem_mul_p)(widefelem out, const felem in1, const felem in2) =
710
    felem_mul_wrapper;
711
712
static void (*felem_reduce_p)(felem out, const widefelem in) = felem_reduce_ref;
713
714
static void (*felem_square_reduce_p)(felem out, const felem in) =
715
    felem_square_reduce_ref;
716
static void (*felem_mul_reduce_p)(felem out, const felem in1, const felem in2) =
717
    felem_mul_reduce_ref;
718
719
void p384_felem_square(widefelem out, const felem in);
720
void p384_felem_mul(widefelem out, const felem in1, const felem in2);
721
void p384_felem_reduce(felem out, const widefelem in);
722
723
void p384_felem_square_reduce(felem out, const felem in);
724
void p384_felem_mul_reduce(felem out, const felem in1, const felem in2);
725
726
# if defined(_ARCH_PPC64)
727
#  include "crypto/ppc_arch.h"
728
# endif
729
730
static void felem_select(void)
731
{
732
# if defined(_ARCH_PPC64)
733
    if ((OPENSSL_ppccap_P & PPC_MADD300) && (OPENSSL_ppccap_P & PPC_ALTIVEC)) {
734
        felem_square_p = p384_felem_square;
735
        felem_mul_p = p384_felem_mul;
736
        felem_reduce_p = p384_felem_reduce;
737
        felem_square_reduce_p = p384_felem_square_reduce;
738
        felem_mul_reduce_p = p384_felem_mul_reduce;
739
740
        return;
741
    }
742
# endif
743
744
    /* Default */
745
    felem_square_p = felem_square_ref;
746
    felem_mul_p = felem_mul_ref;
747
    felem_reduce_p = felem_reduce_ref;
748
    felem_square_reduce_p = felem_square_reduce_ref;
749
    felem_mul_reduce_p = felem_mul_reduce_ref;
750
}
751
752
static void felem_square_wrapper(widefelem out, const felem in)
753
{
754
    felem_select();
755
    felem_square_p(out, in);
756
}
757
758
static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2)
759
{
760
    felem_select();
761
    felem_mul_p(out, in1, in2);
762
}
763
764
# define felem_square felem_square_p
765
# define felem_mul felem_mul_p
766
# define felem_reduce felem_reduce_p
767
768
# define felem_square_reduce felem_square_reduce_p
769
# define felem_mul_reduce felem_mul_reduce_p
770
#else
771
4.82M
# define felem_square felem_square_ref
772
3.86M
# define felem_mul felem_mul_ref
773
6.72M
# define felem_reduce felem_reduce_ref
774
775
7.22M
# define felem_square_reduce felem_square_reduce_ref
776
5.86M
# define felem_mul_reduce felem_mul_reduce_ref
777
#endif
778
779
/*-
780
 * felem_inv calculates |out| = |in|^{-1}
781
 *
782
 * Based on Fermat's Little Theorem:
783
 *   a^p = a (mod p)
784
 *   a^{p-1} = 1 (mod p)
785
 *   a^{p-2} = a^{-1} (mod p)
786
 */
787
static void felem_inv(felem out, const felem in)
788
8.12k
{
789
8.12k
    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6;
790
8.12k
    unsigned int i = 0;
791
792
8.12k
    felem_square_reduce(ftmp, in);      /* 2^1 */
793
8.12k
    felem_mul_reduce(ftmp, ftmp, in);   /* 2^1 + 2^0 */
794
8.12k
    felem_assign(ftmp2, ftmp);
795
796
8.12k
    felem_square_reduce(ftmp, ftmp);    /* 2^2 + 2^1 */
797
8.12k
    felem_mul_reduce(ftmp, ftmp, in);   /* 2^2 + 2^1 * 2^0 */
798
8.12k
    felem_assign(ftmp3, ftmp);
799
800
32.5k
    for (i = 0; i < 3; i++)
801
24.3k
        felem_square_reduce(ftmp, ftmp); /* 2^5 + 2^4 + 2^3 */
802
8.12k
    felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^5 + 2^4 + 2^3 + 2^2 + 2^1 + 2^0 */
803
8.12k
    felem_assign(ftmp4, ftmp);
804
805
56.8k
    for (i = 0; i < 6; i++)
806
48.7k
        felem_square_reduce(ftmp, ftmp); /* 2^11 + ... + 2^6 */
807
8.12k
    felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^11 + ... + 2^0 */
808
809
32.5k
    for (i = 0; i < 3; i++)
810
24.3k
        felem_square_reduce(ftmp, ftmp); /* 2^14 + ... + 2^3 */
811
8.12k
    felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^14 + ... + 2^0 */
812
8.12k
    felem_assign(ftmp5, ftmp);
813
814
130k
    for (i = 0; i < 15; i++)
815
121k
        felem_square_reduce(ftmp, ftmp); /* 2^29 + ... + 2^15 */
816
8.12k
    felem_mul_reduce(ftmp, ftmp5, ftmp); /* 2^29 + ... + 2^0 */
817
8.12k
    felem_assign(ftmp6, ftmp);
818
819
251k
    for (i = 0; i < 30; i++)
820
243k
        felem_square_reduce(ftmp, ftmp); /* 2^59 + ... + 2^30 */
821
8.12k
    felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^59 + ... + 2^0 */
822
8.12k
    felem_assign(ftmp4, ftmp);
823
824
495k
    for (i = 0; i < 60; i++)
825
487k
        felem_square_reduce(ftmp, ftmp); /* 2^119 + ... + 2^60 */
826
8.12k
    felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^119 + ... + 2^0 */
827
8.12k
    felem_assign(ftmp4, ftmp);
828
829
983k
    for (i = 0; i < 120; i++)
830
975k
      felem_square_reduce(ftmp, ftmp);   /* 2^239 + ... + 2^120 */
831
8.12k
    felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^239 + ... + 2^0 */
832
833
130k
    for (i = 0; i < 15; i++)
834
121k
        felem_square_reduce(ftmp, ftmp); /* 2^254 + ... + 2^15 */
835
8.12k
    felem_mul_reduce(ftmp, ftmp5, ftmp); /* 2^254 + ... + 2^0 */
836
837
260k
    for (i = 0; i < 31; i++)
838
251k
        felem_square_reduce(ftmp, ftmp); /* 2^285 + ... + 2^31 */
839
8.12k
    felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^285 + ... + 2^31 + 2^29 + ... + 2^0 */
840
841
24.3k
    for (i = 0; i < 2; i++)
842
16.2k
        felem_square_reduce(ftmp, ftmp); /* 2^287 + ... + 2^33 + 2^31 + ... + 2^2 */
843
8.12k
    felem_mul_reduce(ftmp, ftmp2, ftmp); /* 2^287 + ... + 2^33 + 2^31 + ... + 2^0 */
844
845
771k
    for (i = 0; i < 94; i++)
846
763k
        felem_square_reduce(ftmp, ftmp); /* 2^381 + ... + 2^127 + 2^125 + ... + 2^94 */
847
8.12k
    felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^381 + ... + 2^127 + 2^125 + ... + 2^94 + 2^29 + ... + 2^0 */
848
849
24.3k
    for (i = 0; i < 2; i++)
850
16.2k
        felem_square_reduce(ftmp, ftmp); /* 2^383 + ... + 2^129 + 2^127 + ... + 2^96 + 2^31 + ... + 2^2 */
851
8.12k
    felem_mul_reduce(ftmp, in, ftmp);    /* 2^383 + ... + 2^129 + 2^127 + ... + 2^96 + 2^31 + ... + 2^2 + 2^0 */
852
853
8.12k
    memcpy(out, ftmp, sizeof(felem));
854
8.12k
}
855
856
/*
857
 * Zero-check: returns a limb with all bits set if |in| == 0 (mod p)
858
 * and 0 otherwise. We know that field elements are reduced to
859
 * 0 < in < 2p, so we only need to check two cases:
860
 * 0 and 2^384 - 2^128 - 2^96 + 2^32 - 1
861
 *   in[k] < 2^56, k < 6
862
 *   in[6] <= 2^48
863
 */
864
static limb felem_is_zero(const felem in)
865
2.50M
{
866
2.50M
    limb zero, p384;
867
868
2.50M
    zero = in[0] | in[1] | in[2] | in[3] | in[4] | in[5] | in[6];
869
2.50M
    zero = ((int64_t) (zero) - 1) >> 63;
870
2.50M
    p384 = (in[0] ^ 0x000000ffffffff) | (in[1] ^ 0xffff0000000000)
871
2.50M
         | (in[2] ^ 0xfffffffffeffff) | (in[3] ^ 0xffffffffffffff)
872
2.50M
         | (in[4] ^ 0xffffffffffffff) | (in[5] ^ 0xffffffffffffff)
873
2.50M
         | (in[6] ^ 0xffffffffffff);
874
2.50M
    p384 = ((int64_t) (p384) - 1) >> 63;
875
876
2.50M
    return (zero | p384);
877
2.50M
}
878
879
static int felem_is_zero_int(const void *in)
880
0
{
881
0
    return (int)(felem_is_zero(in) & ((limb) 1));
882
0
}
883
884
/*-
885
 * felem_contract converts |in| to its unique, minimal representation.
886
 * Assume we've removed all redundant bits.
887
 * On entry:
888
 *   in[k] < 2^56, k < 6
889
 *   in[6] <= 2^48
890
 */
891
static void felem_contract(felem out, const felem in)
892
36.4k
{
893
36.4k
    static const int64_t two56 = ((limb) 1) << 56;
894
895
    /*
896
     * We know for a fact that 0 <= |in| < 2*p, for p = 2^384 - 2^128 - 2^96 + 2^32 - 1
897
     * Perform two successive, idempotent subtractions to reduce if |in| >= p.
898
     */
899
900
36.4k
    int64_t tmp[NLIMBS], cond[5], a;
901
36.4k
    unsigned int i;
902
903
36.4k
    memcpy(tmp, in, sizeof(felem));
904
 
905
    /* Case 1: a = 1 iff |in| >= 2^384 */
906
36.4k
    a = (in[6] >> 48);
907
36.4k
    tmp[0] += a;
908
36.4k
    tmp[0] -= a << 32;
909
36.4k
    tmp[1] += a << 40;
910
36.4k
    tmp[2] += a << 16;
911
36.4k
    tmp[6] &= 0x0000ffffffffffff;
912
913
    /*
914
     * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
915
     * non-zero, so we only need one step
916
     */
917
918
36.4k
    a = tmp[0] >> 63;
919
36.4k
    tmp[0] += a & two56;
920
36.4k
    tmp[1] -= a & 1;
921
922
    /* Carry 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
923
36.4k
    tmp[2] += tmp[1] >> 56;
924
36.4k
    tmp[1] &= 0x00ffffffffffffff;
925
926
36.4k
    tmp[3] += tmp[2] >> 56;
927
36.4k
    tmp[2] &= 0x00ffffffffffffff;
928
929
36.4k
    tmp[4] += tmp[3] >> 56;
930
36.4k
    tmp[3] &= 0x00ffffffffffffff;
931
932
36.4k
    tmp[5] += tmp[4] >> 56;
933
36.4k
    tmp[4] &= 0x00ffffffffffffff;
934
935
36.4k
    tmp[6] += tmp[5] >> 56; /* tmp[6] < 2^48 */
936
36.4k
    tmp[5] &= 0x00ffffffffffffff;
937
938
    /*
939
     * Case 2: a = all ones if p <= |in| < 2^384, 0 otherwise
940
     */
941
942
    /* 0 iff (2^129..2^383) are all one */
943
36.4k
    cond[0] = ((tmp[6] | 0xff000000000000) & tmp[5] & tmp[4] & tmp[3] & (tmp[2] | 0x0000000001ffff)) + 1;
944
    /* 0 iff 2^128 bit is one */
945
36.4k
    cond[1] = (tmp[2] | ~0x00000000010000) + 1;
946
    /* 0 iff (2^96..2^127) bits are all one */
947
36.4k
    cond[2] = ((tmp[2] | 0xffffffffff0000) & (tmp[1] | 0x0000ffffffffff)) + 1;
948
    /* 0 iff (2^32..2^95) bits are all zero */
949
36.4k
    cond[3] = (tmp[1] & ~0xffff0000000000) | (tmp[0] & ~((int64_t) 0x000000ffffffff));
950
    /* 0 iff (2^0..2^31) bits are all one */
951
36.4k
    cond[4] = (tmp[0] | 0xffffff00000000) + 1;
952
953
    /*
954
     * In effect, invert our conditions, so that 0 values become all 1's,
955
     * any non-zero value in the low-order 56 bits becomes all 0's
956
     */
957
218k
    for (i = 0; i < 5; i++)
958
182k
       cond[i] = ((cond[i] & 0x00ffffffffffffff) - 1) >> 63;
959
960
    /*
961
     * The condition for determining whether in is greater than our
962
     * prime is given by the following condition.
963
     */
964
965
    /* First subtract 2^384 - 2^129 cheaply */
966
36.4k
    a = cond[0] & (cond[1] | (cond[2] & (~cond[3] | cond[4])));
967
36.4k
    tmp[6] &= ~a;
968
36.4k
    tmp[5] &= ~a;
969
36.4k
    tmp[4] &= ~a;
970
36.4k
    tmp[3] &= ~a;
971
36.4k
    tmp[2] &= ~a | 0x0000000001ffff;
972
973
    /*
974
     * Subtract 2^128 - 2^96 by
975
     * means of disjoint cases.
976
     */
977
978
    /* subtract 2^128 if that bit is present, and add 2^96 */
979
36.4k
    a = cond[0] & cond[1];
980
36.4k
    tmp[2] &= ~a | 0xfffffffffeffff;
981
36.4k
    tmp[1] += a & ((int64_t) 1 << 40);
982
983
    /* otherwise, clear bits 2^127 .. 2^96  */
984
36.4k
    a = cond[0] & ~cond[1] & (cond[2] & (~cond[3] | cond[4]));
985
36.4k
    tmp[2] &= ~a | 0xffffffffff0000;
986
36.4k
    tmp[1] &= ~a | 0x0000ffffffffff;
987
988
    /* finally, subtract the last 2^32 - 1 */
989
36.4k
    a = cond[0] & (cond[1] | (cond[2] & (~cond[3] | cond[4])));
990
36.4k
    tmp[0] += a & (-((int64_t) 1 << 32) + 1);
991
992
    /*
993
     * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
994
     * non-zero, so we only need one step
995
     */
996
36.4k
    a = tmp[0] >> 63;
997
36.4k
    tmp[0] += a & two56;
998
36.4k
    tmp[1] -= a & 1;
999
1000
    /* Carry 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
1001
36.4k
    tmp[2] += tmp[1] >> 56;
1002
36.4k
    tmp[1] &= 0x00ffffffffffffff;
1003
1004
36.4k
    tmp[3] += tmp[2] >> 56;
1005
36.4k
    tmp[2] &= 0x00ffffffffffffff;
1006
1007
36.4k
    tmp[4] += tmp[3] >> 56;
1008
36.4k
    tmp[3] &= 0x00ffffffffffffff;
1009
1010
36.4k
    tmp[5] += tmp[4] >> 56;
1011
36.4k
    tmp[4] &= 0x00ffffffffffffff;
1012
1013
36.4k
    tmp[6] += tmp[5] >> 56;
1014
36.4k
    tmp[5] &= 0x00ffffffffffffff;
1015
1016
36.4k
    memcpy(out, tmp, sizeof(felem));
1017
36.4k
}
1018
1019
/*-
1020
 * Group operations
1021
 * ----------------
1022
 *
1023
 * Building on top of the field operations we have the operations on the
1024
 * elliptic curve group itself. Points on the curve are represented in Jacobian
1025
 * coordinates
1026
 */
1027
1028
/*-
1029
 * point_double calculates 2*(x_in, y_in, z_in)
1030
 *
1031
 * The method is taken from:
1032
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1033
 *
1034
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1035
 * while x_out == y_in is not (maybe this works, but it's not tested).
1036
 */
1037
static void
1038
point_double(felem x_out, felem y_out, felem z_out,
1039
             const felem x_in, const felem y_in, const felem z_in)
1040
1.33M
{
1041
1.33M
    widefelem tmp, tmp2;
1042
1.33M
    felem delta, gamma, beta, alpha, ftmp, ftmp2;
1043
1044
1.33M
    felem_assign(ftmp, x_in);
1045
1.33M
    felem_assign(ftmp2, x_in);
1046
1047
    /* delta = z^2 */
1048
1.33M
    felem_square_reduce(delta, z_in);     /* delta[i] < 2^56 */
1049
1050
    /* gamma = y^2 */
1051
1.33M
    felem_square_reduce(gamma, y_in);     /* gamma[i] < 2^56 */
1052
1053
    /* beta = x*gamma */
1054
1.33M
    felem_mul_reduce(beta, x_in, gamma);  /* beta[i] < 2^56 */
1055
1056
    /* alpha = 3*(x-delta)*(x+delta) */
1057
1.33M
    felem_diff64(ftmp, delta);            /* ftmp[i] < 2^60 + 2^58 + 2^44 */
1058
1.33M
    felem_sum64(ftmp2, delta);            /* ftmp2[i] < 2^59 */
1059
1.33M
    felem_scalar64(ftmp2, 3);             /* ftmp2[i] < 2^61 */
1060
1.33M
    felem_mul_reduce(alpha, ftmp, ftmp2); /* alpha[i] < 2^56 */
1061
1062
    /* x' = alpha^2 - 8*beta */
1063
1.33M
    felem_square(tmp, alpha);             /* tmp[i] < 2^115 */
1064
1.33M
    felem_assign(ftmp, beta);             /* ftmp[i] < 2^56 */
1065
1.33M
    felem_scalar64(ftmp, 8);              /* ftmp[i] < 2^59 */
1066
1.33M
    felem_diff_128_64(tmp, ftmp);         /* tmp[i] < 2^115 + 2^64 + 2^48 */
1067
1.33M
    felem_reduce(x_out, tmp);             /* x_out[i] < 2^56 */
1068
1069
    /* z' = (y + z)^2 - gamma - delta */
1070
1.33M
    felem_sum64(delta, gamma);     /* delta[i] < 2^57 */
1071
1.33M
    felem_assign(ftmp, y_in);      /* ftmp[i] < 2^56 */
1072
1.33M
    felem_sum64(ftmp, z_in);       /* ftmp[i] < 2^56 */
1073
1.33M
    felem_square(tmp, ftmp);       /* tmp[i] < 2^115 */
1074
1.33M
    felem_diff_128_64(tmp, delta); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1075
1.33M
    felem_reduce(z_out, tmp);      /* z_out[i] < 2^56 */
1076
1077
    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1078
1.33M
    felem_scalar64(beta, 4);       /* beta[i] < 2^58 */
1079
1.33M
    felem_diff64(beta, x_out);     /* beta[i] < 2^60 + 2^58 + 2^44 */
1080
1.33M
    felem_mul(tmp, alpha, beta);   /* tmp[i] < 2^119 */
1081
1.33M
    felem_square(tmp2, gamma);     /* tmp2[i] < 2^115 */
1082
1.33M
    felem_scalar128(tmp2, 8);      /* tmp2[i] < 2^118 */
1083
1.33M
    felem_diff128(tmp, tmp2);      /* tmp[i] < 2^127 + 2^119 + 2^111 */
1084
1.33M
    felem_reduce(y_out, tmp);      /* tmp[i] < 2^56 */
1085
1.33M
}
1086
1087
/* copy_conditional copies in to out iff mask is all ones. */
1088
static void copy_conditional(felem out, const felem in, limb mask)
1089
3.94M
{
1090
3.94M
    unsigned int i;
1091
1092
31.5M
    for (i = 0; i < NLIMBS; i++)
1093
27.5M
        out[i] ^= mask & (in[i] ^ out[i]);
1094
3.94M
}
1095
1096
/*-
1097
 * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1098
 *
1099
 * The method is taken from
1100
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1101
 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1102
 *
1103
 * This function includes a branch for checking whether the two input points
1104
 * are equal (while not equal to the point at infinity). See comment below
1105
 * on constant-time.
1106
 */
1107
static void point_add(felem x3, felem y3, felem z3,
1108
                      const felem x1, const felem y1, const felem z1,
1109
                      const int mixed, const felem x2, const felem y2,
1110
                      const felem z2)
1111
627k
{
1112
627k
    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1113
627k
    widefelem tmp, tmp2;
1114
627k
    limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1115
627k
    limb points_equal;
1116
1117
627k
    z1_is_zero = felem_is_zero(z1);
1118
627k
    z2_is_zero = felem_is_zero(z2);
1119
1120
    /* ftmp = z1z1 = z1**2 */
1121
627k
    felem_square_reduce(ftmp, z1);      /* ftmp[i] < 2^56 */
1122
1123
627k
    if (!mixed) {
1124
        /* ftmp2 = z2z2 = z2**2 */
1125
192k
        felem_square_reduce(ftmp2, z2); /* ftmp2[i] < 2^56 */
1126
1127
        /* u1 = ftmp3 = x1*z2z2 */
1128
192k
        felem_mul_reduce(ftmp3, x1, ftmp2); /* ftmp3[i] < 2^56 */
1129
1130
        /* ftmp5 = z1 + z2 */
1131
192k
        felem_assign(ftmp5, z1);       /* ftmp5[i] < 2^56 */
1132
192k
        felem_sum64(ftmp5, z2);        /* ftmp5[i] < 2^57 */
1133
1134
        /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1135
192k
        felem_square(tmp, ftmp5);      /* tmp[i] < 2^117 */
1136
192k
        felem_diff_128_64(tmp, ftmp);  /* tmp[i] < 2^117 + 2^64 + 2^48 */
1137
192k
        felem_diff_128_64(tmp, ftmp2); /* tmp[i] < 2^117 + 2^65 + 2^49 */
1138
192k
        felem_reduce(ftmp5, tmp);      /* ftmp5[i] < 2^56 */
1139
1140
        /* ftmp2 = z2 * z2z2 */
1141
192k
        felem_mul_reduce(ftmp2, ftmp2, z2); /* ftmp2[i] < 2^56 */
1142
1143
        /* s1 = ftmp6 = y1 * z2**3 */
1144
192k
        felem_mul_reduce(ftmp6, y1, ftmp2); /* ftmp6[i] < 2^56 */
1145
434k
    } else {
1146
        /*
1147
         * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1148
         */
1149
1150
        /* u1 = ftmp3 = x1*z2z2 */
1151
434k
        felem_assign(ftmp3, x1);     /* ftmp3[i] < 2^56 */
1152
1153
        /* ftmp5 = 2*z1z2 */
1154
434k
        felem_scalar(ftmp5, z1, 2);  /* ftmp5[i] < 2^57 */
1155
1156
        /* s1 = ftmp6 = y1 * z2**3 */
1157
434k
        felem_assign(ftmp6, y1);     /* ftmp6[i] < 2^56 */
1158
434k
    }
1159
    /* ftmp3[i] < 2^56, ftmp5[i] < 2^57, ftmp6[i] < 2^56 */
1160
1161
    /* u2 = x2*z1z1 */
1162
627k
    felem_mul(tmp, x2, ftmp);        /* tmp[i] < 2^115 */
1163
1164
    /* h = ftmp4 = u2 - u1 */
1165
627k
    felem_diff_128_64(tmp, ftmp3);   /* tmp[i] < 2^115 + 2^64 + 2^48 */
1166
627k
    felem_reduce(ftmp4, tmp);        /* ftmp[4] < 2^56 */
1167
1168
627k
    x_equal = felem_is_zero(ftmp4);
1169
1170
    /* z_out = ftmp5 * h */
1171
627k
    felem_mul_reduce(z_out, ftmp5, ftmp4);  /* z_out[i] < 2^56 */
1172
1173
    /* ftmp = z1 * z1z1 */
1174
627k
    felem_mul_reduce(ftmp, ftmp, z1);  /* ftmp[i] < 2^56 */
1175
1176
    /* s2 = tmp = y2 * z1**3 */
1177
627k
    felem_mul(tmp, y2, ftmp);      /* tmp[i] < 2^115 */
1178
1179
    /* r = ftmp5 = (s2 - s1)*2 */
1180
627k
    felem_diff_128_64(tmp, ftmp6); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1181
627k
    felem_reduce(ftmp5, tmp);      /* ftmp5[i] < 2^56 */
1182
627k
    y_equal = felem_is_zero(ftmp5);
1183
627k
    felem_scalar64(ftmp5, 2);      /* ftmp5[i] < 2^57 */
1184
1185
    /*
1186
     * The formulae are incorrect if the points are equal, in affine coordinates
1187
     * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
1188
     * happens.
1189
     *
1190
     * We use bitwise operations to avoid potential side-channels introduced by
1191
     * the short-circuiting behaviour of boolean operators.
1192
     *
1193
     * The special case of either point being the point at infinity (z1 and/or
1194
     * z2 are zero), is handled separately later on in this function, so we
1195
     * avoid jumping to point_double here in those special cases.
1196
     *
1197
     * Notice the comment below on the implications of this branching for timing
1198
     * leaks and why it is considered practically irrelevant.
1199
     */
1200
627k
    points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));
1201
1202
627k
    if (points_equal) {
1203
        /*
1204
         * This is obviously not constant-time but it will almost-never happen
1205
         * for ECDH / ECDSA.
1206
         */
1207
0
        point_double(x3, y3, z3, x1, y1, z1);
1208
0
        return;
1209
0
    }
1210
1211
    /* I = ftmp = (2h)**2 */
1212
627k
    felem_assign(ftmp, ftmp4);        /* ftmp[i] < 2^56 */
1213
627k
    felem_scalar64(ftmp, 2);          /* ftmp[i] < 2^57 */
1214
627k
    felem_square_reduce(ftmp, ftmp);  /* ftmp[i] < 2^56 */
1215
1216
    /* J = ftmp2 = h * I */
1217
627k
    felem_mul_reduce(ftmp2, ftmp4, ftmp); /* ftmp2[i] < 2^56 */
1218
1219
    /* V = ftmp4 = U1 * I */
1220
627k
    felem_mul_reduce(ftmp4, ftmp3, ftmp); /* ftmp4[i] < 2^56 */
1221
1222
    /* x_out = r**2 - J - 2V */
1223
627k
    felem_square(tmp, ftmp5);      /* tmp[i] < 2^117 */
1224
627k
    felem_diff_128_64(tmp, ftmp2); /* tmp[i] < 2^117 + 2^64 + 2^48 */
1225
627k
    felem_assign(ftmp3, ftmp4);    /* ftmp3[i] < 2^56 */
1226
627k
    felem_scalar64(ftmp4, 2);      /* ftmp4[i] < 2^57 */
1227
627k
    felem_diff_128_64(tmp, ftmp4); /* tmp[i] < 2^117 + 2^65 + 2^49 */
1228
627k
    felem_reduce(x_out, tmp);      /* x_out[i] < 2^56 */
1229
1230
    /* y_out = r(V-x_out) - 2 * s1 * J */
1231
627k
    felem_diff64(ftmp3, x_out);    /* ftmp3[i] < 2^60 + 2^56 + 2^44 */
1232
627k
    felem_mul(tmp, ftmp5, ftmp3);  /* tmp[i] < 2^116 */
1233
627k
    felem_mul(tmp2, ftmp6, ftmp2); /* tmp2[i] < 2^115 */
1234
627k
    felem_scalar128(tmp2, 2);      /* tmp2[i] < 2^116 */
1235
627k
    felem_diff128(tmp, tmp2);      /* tmp[i] < 2^127 + 2^116 + 2^111 */
1236
627k
    felem_reduce(y_out, tmp);      /* y_out[i] < 2^56 */
1237
1238
627k
    copy_conditional(x_out, x2, z1_is_zero);
1239
627k
    copy_conditional(x_out, x1, z2_is_zero);
1240
627k
    copy_conditional(y_out, y2, z1_is_zero);
1241
627k
    copy_conditional(y_out, y1, z2_is_zero);
1242
627k
    copy_conditional(z_out, z2, z1_is_zero);
1243
627k
    copy_conditional(z_out, z1, z2_is_zero);
1244
627k
    felem_assign(x3, x_out);
1245
627k
    felem_assign(y3, y_out);
1246
627k
    felem_assign(z3, z_out);
1247
627k
}
1248
1249
/*-
1250
 * Base point pre computation
1251
 * --------------------------
1252
 *
1253
 * Two different sorts of precomputed tables are used in the following code.
1254
 * Each contain various points on the curve, where each point is three field
1255
 * elements (x, y, z).
1256
 *
1257
 * For the base point table, z is usually 1 (0 for the point at infinity).
1258
 * This table has 16 elements:
1259
 * index | bits    | point
1260
 * ------+---------+------------------------------
1261
 *     0 | 0 0 0 0 | 0G
1262
 *     1 | 0 0 0 1 | 1G
1263
 *     2 | 0 0 1 0 | 2^95G
1264
 *     3 | 0 0 1 1 | (2^95 + 1)G
1265
 *     4 | 0 1 0 0 | 2^190G
1266
 *     5 | 0 1 0 1 | (2^190 + 1)G
1267
 *     6 | 0 1 1 0 | (2^190 + 2^95)G
1268
 *     7 | 0 1 1 1 | (2^190 + 2^95 + 1)G
1269
 *     8 | 1 0 0 0 | 2^285G
1270
 *     9 | 1 0 0 1 | (2^285 + 1)G
1271
 *    10 | 1 0 1 0 | (2^285 + 2^95)G
1272
 *    11 | 1 0 1 1 | (2^285 + 2^95 + 1)G
1273
 *    12 | 1 1 0 0 | (2^285 + 2^190)G
1274
 *    13 | 1 1 0 1 | (2^285 + 2^190 + 1)G
1275
 *    14 | 1 1 1 0 | (2^285 + 2^190 + 2^95)G
1276
 *    15 | 1 1 1 1 | (2^285 + 2^190 + 2^95 + 1)G
1277
 *
1278
 * The reason for this is so that we can clock bits into four different
1279
 * locations when doing simple scalar multiplies against the base point.
1280
 *
1281
 * Tables for other points have table[i] = iG for i in 0 .. 16.
1282
 */
1283
1284
/* gmul is the table of precomputed base points */
1285
static const felem gmul[16][3] = {
1286
{{0, 0, 0, 0, 0, 0, 0},
1287
 {0, 0, 0, 0, 0, 0, 0},
1288
 {0, 0, 0, 0, 0, 0, 0}},
1289
{{0x00545e3872760ab7, 0x00f25dbf55296c3a, 0x00e082542a385502, 0x008ba79b9859f741,
1290
  0x0020ad746e1d3b62, 0x0005378eb1c71ef3, 0x0000aa87ca22be8b},
1291
 {0x00431d7c90ea0e5f, 0x00b1ce1d7e819d7a, 0x0013b5f0b8c00a60, 0x00289a147ce9da31,
1292
  0x0092dc29f8f41dbd, 0x002c6f5d9e98bf92, 0x00003617de4a9626},
1293
 {1, 0, 0, 0, 0, 0, 0}},
1294
{{0x00024711cc902a90, 0x00acb2e579ab4fe1, 0x00af818a4b4d57b1, 0x00a17c7bec49c3de,
1295
  0x004280482d726a8b, 0x00128dd0f0a90f3b, 0x00004387c1c3fa3c},
1296
 {0x002ce76543cf5c3a, 0x00de6cee5ef58f0a, 0x00403e42fa561ca6, 0x00bc54d6f9cb9731,
1297
  0x007155f925fb4ff1, 0x004a9ce731b7b9bc, 0x00002609076bd7b2},
1298
 {1, 0, 0, 0, 0, 0, 0}},
1299
{{0x00e74c9182f0251d, 0x0039bf54bb111974, 0x00b9d2f2eec511d2, 0x0036b1594eb3a6a4,
1300
  0x00ac3bb82d9d564b, 0x00f9313f4615a100, 0x00006716a9a91b10},
1301
 {0x0046698116e2f15c, 0x00f34347067d3d33, 0x008de4ccfdebd002, 0x00e838c6b8e8c97b,
1302
  0x006faf0798def346, 0x007349794a57563c, 0x00002629e7e6ad84},
1303
 {1, 0, 0, 0, 0, 0, 0}},
1304
{{0x0075300e34fd163b, 0x0092e9db4e8d0ad3, 0x00254be9f625f760, 0x00512c518c72ae68,
1305
  0x009bfcf162bede5a, 0x00bf9341566ce311, 0x0000cd6175bd41cf},
1306
 {0x007dfe52af4ac70f, 0x0002159d2d5c4880, 0x00b504d16f0af8d0, 0x0014585e11f5e64c,
1307
  0x0089c6388e030967, 0x00ffb270cbfa5f71, 0x00009a15d92c3947},
1308
 {1, 0, 0, 0, 0, 0, 0}},
1309
{{0x0033fc1278dc4fe5, 0x00d53088c2caa043, 0x0085558827e2db66, 0x00c192bef387b736,
1310
  0x00df6405a2225f2c, 0x0075205aa90fd91a, 0x0000137e3f12349d},
1311
 {0x00ce5b115efcb07e, 0x00abc3308410deeb, 0x005dc6fc1de39904, 0x00907c1c496f36b4,
1312
  0x0008e6ad3926cbe1, 0x00110747b787928c, 0x0000021b9162eb7e},
1313
 {1, 0, 0, 0, 0, 0, 0}},
1314
{{0x008180042cfa26e1, 0x007b826a96254967, 0x0082473694d6b194, 0x007bd6880a45b589,
1315
  0x00c0a5097072d1a3, 0x0019186555e18b4e, 0x000020278190e5ca},
1316
 {0x00b4bef17de61ac0, 0x009535e3c38ed348, 0x002d4aa8e468ceab, 0x00ef40b431036ad3,
1317
  0x00defd52f4542857, 0x0086edbf98234266, 0x00002025b3a7814d},
1318
 {1, 0, 0, 0, 0, 0, 0}},
1319
{{0x00b238aa97b886be, 0x00ef3192d6dd3a32, 0x0079f9e01fd62df8, 0x00742e890daba6c5,
1320
  0x008e5289144408ce, 0x0073bbcc8e0171a5, 0x0000c4fd329d3b52},
1321
 {0x00c6f64a15ee23e7, 0x00dcfb7b171cad8b, 0x00039f6cbd805867, 0x00de024e428d4562,
1322
  0x00be6a594d7c64c5, 0x0078467b70dbcd64, 0x0000251f2ed7079b},
1323
 {1, 0, 0, 0, 0, 0, 0}},
1324
{{0x000e5cc25fc4b872, 0x005ebf10d31ef4e1, 0x0061e0ebd11e8256, 0x0076e026096f5a27,
1325
  0x0013e6fc44662e9a, 0x0042b00289d3597e, 0x000024f089170d88},
1326
 {0x001604d7e0effbe6, 0x0048d77cba64ec2c, 0x008166b16da19e36, 0x006b0d1a0f28c088,
1327
  0x000259fcd47754fd, 0x00cc643e4d725f9a, 0x00007b10f3c79c14},
1328
 {1, 0, 0, 0, 0, 0, 0}},
1329
{{0x00430155e3b908af, 0x00b801e4fec25226, 0x00b0d4bcfe806d26, 0x009fc4014eb13d37,
1330
  0x0066c94e44ec07e8, 0x00d16adc03874ba2, 0x000030c917a0d2a7},
1331
 {0x00edac9e21eb891c, 0x00ef0fb768102eff, 0x00c088cef272a5f3, 0x00cbf782134e2964,
1332
  0x0001044a7ba9a0e3, 0x00e363f5b194cf3c, 0x00009ce85249e372},
1333
 {1, 0, 0, 0, 0, 0, 0}},
1334
{{0x001dd492dda5a7eb, 0x008fd577be539fd1, 0x002ff4b25a5fc3f1, 0x0074a8a1b64df72f,
1335
  0x002ba3d8c204a76c, 0x009d5cff95c8235a, 0x0000e014b9406e0f},
1336
 {0x008c2e4dbfc98aba, 0x00f30bb89f1a1436, 0x00b46f7aea3e259c, 0x009224454ac02f54,
1337
  0x00906401f5645fa2, 0x003a1d1940eabc77, 0x00007c9351d680e6},
1338
 {1, 0, 0, 0, 0, 0, 0}},
1339
{{0x005a35d872ef967c, 0x0049f1b7884e1987, 0x0059d46d7e31f552, 0x00ceb4869d2d0fb6,
1340
  0x00e8e89eee56802a, 0x0049d806a774aaf2, 0x0000147e2af0ae24},
1341
 {0x005fd1bd852c6e5e, 0x00b674b7b3de6885, 0x003b9ea5eb9b6c08, 0x005c9f03babf3ef7,
1342
  0x00605337fecab3c7, 0x009a3f85b11bbcc8, 0x0000455470f330ec},
1343
 {1, 0, 0, 0, 0, 0, 0}},
1344
{{0x002197ff4d55498d, 0x00383e8916c2d8af, 0x00eb203f34d1c6d2, 0x0080367cbd11b542,
1345
  0x00769b3be864e4f5, 0x0081a8458521c7bb, 0x0000c531b34d3539},
1346
 {0x00e2a3d775fa2e13, 0x00534fc379573844, 0x00ff237d2a8db54a, 0x00d301b2335a8882,
1347
  0x000f75ea96103a80, 0x0018fecb3cdd96fa, 0x0000304bf61e94eb},
1348
 {1, 0, 0, 0, 0, 0, 0}},
1349
{{0x00b2afc332a73dbd, 0x0029a0d5bb007bc5, 0x002d628eb210f577, 0x009f59a36dd05f50,
1350
  0x006d339de4eca613, 0x00c75a71addc86bc, 0x000060384c5ea93c},
1351
 {0x00aa9641c32a30b4, 0x00cc73ae8cce565d, 0x00ec911a4df07f61, 0x00aa4b762ea4b264,
1352
  0x0096d395bb393629, 0x004efacfb7632fe0, 0x00006f252f46fa3f},
1353
 {1, 0, 0, 0, 0, 0, 0}},
1354
{{0x00567eec597c7af6, 0x0059ba6795204413, 0x00816d4e6f01196f, 0x004ae6b3eb57951d,
1355
  0x00420f5abdda2108, 0x003401d1f57ca9d9, 0x0000cf5837b0b67a},
1356
 {0x00eaa64b8aeeabf9, 0x00246ddf16bcb4de, 0x000e7e3c3aecd751, 0x0008449f04fed72e,
1357
  0x00307b67ccf09183, 0x0017108c3556b7b1, 0x0000229b2483b3bf},
1358
 {1, 0, 0, 0, 0, 0, 0}},
1359
{{0x00e7c491a7bb78a1, 0x00eafddd1d3049ab, 0x00352c05e2bc7c98, 0x003d6880c165fa5c,
1360
  0x00b6ac61cc11c97d, 0x00beeb54fcf90ce5, 0x0000dc1f0b455edc},
1361
 {0x002db2e7aee34d60, 0x0073b5f415a2d8c0, 0x00dd84e4193e9a0c, 0x00d02d873467c572,
1362
  0x0018baaeda60aee5, 0x0013fb11d697c61e, 0x000083aafcc3a973},
1363
 {1, 0, 0, 0, 0, 0, 0}}
1364
};
1365
1366
/*
1367
 * select_point selects the |idx|th point from a precomputation table and
1368
 * copies it to out.
1369
 *
1370
 * pre_comp below is of the size provided in |size|.
1371
 */
1372
static void select_point(const limb idx, unsigned int size,
1373
                         const felem pre_comp[][3], felem out[3])
1374
617k
{
1375
617k
    unsigned int i, j;
1376
617k
    limb *outlimbs = &out[0][0];
1377
1378
617k
    memset(out, 0, sizeof(*out) * 3);
1379
1380
10.6M
    for (i = 0; i < size; i++) {
1381
10.0M
        const limb *inlimbs = &pre_comp[i][0][0];
1382
10.0M
        limb mask = i ^ idx;
1383
1384
10.0M
        mask |= mask >> 4;
1385
10.0M
        mask |= mask >> 2;
1386
10.0M
        mask |= mask >> 1;
1387
10.0M
        mask &= 1;
1388
10.0M
        mask--;
1389
221M
        for (j = 0; j < NLIMBS * 3; j++)
1390
211M
            outlimbs[j] |= inlimbs[j] & mask;
1391
10.0M
    }
1392
617k
}
1393
1394
/* get_bit returns the |i|th bit in |in| */
1395
static char get_bit(const felem_bytearray in, int i)
1396
2.77M
{
1397
2.77M
    if (i < 0 || i >= 384)
1398
4.64k
        return 0;
1399
2.77M
    return (in[i >> 3] >> (i & 7)) & 1;
1400
2.77M
}
1401
1402
/*
1403
 * Interleaved point multiplication using precomputed point multiples: The
1404
 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1405
 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1406
 * generator, using certain (large) precomputed multiples in g_pre_comp.
1407
 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1408
 */
1409
static void batch_mul(felem x_out, felem y_out, felem z_out,
1410
                      const felem_bytearray scalars[],
1411
                      const unsigned int num_points, const u8 *g_scalar,
1412
                      const int mixed, const felem pre_comp[][17][3],
1413
                      const felem g_pre_comp[16][3])
1414
6.72k
{
1415
6.72k
    int i, skip;
1416
6.72k
    unsigned int num, gen_mul = (g_scalar != NULL);
1417
6.72k
    felem nq[3], tmp[4];
1418
6.72k
    limb bits;
1419
6.72k
    u8 sign, digit;
1420
1421
    /* set nq to the point at infinity */
1422
6.72k
    memset(nq, 0, sizeof(nq));
1423
1424
    /*
1425
     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1426
     * of the generator (last quarter of rounds) and additions of other
1427
     * points multiples (every 5th round).
1428
     */
1429
6.72k
    skip = 1;                   /* save two point operations in the first
1430
                                 * round */
1431
1.32M
    for (i = (num_points ? 380 : 98); i >= 0; --i) {
1432
        /* double */
1433
1.32M
        if (!skip)
1434
1.31M
            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1435
1436
        /* add multiples of the generator */
1437
1.32M
        if (gen_mul && (i <= 98)) {
1438
438k
            bits = get_bit(g_scalar, i + 285) << 3;
1439
438k
            if (i < 95) {
1440
421k
                bits |= get_bit(g_scalar, i + 190) << 2;
1441
421k
                bits |= get_bit(g_scalar, i + 95) << 1;
1442
421k
                bits |= get_bit(g_scalar, i);
1443
421k
            }
1444
            /* select the point to add, in constant time */
1445
438k
            select_point(bits, 16, g_pre_comp, tmp);
1446
438k
            if (!skip) {
1447
                /* The 1 argument below is for "mixed" */
1448
434k
                point_add(nq[0],  nq[1],  nq[2],
1449
434k
                          nq[0],  nq[1],  nq[2], 1,
1450
434k
                          tmp[0], tmp[1], tmp[2]);
1451
434k
            } else {
1452
4.40k
                memcpy(nq, tmp, 3 * sizeof(felem));
1453
4.40k
                skip = 0;
1454
4.40k
            }
1455
438k
        }
1456
1457
        /* do other additions every 5 doublings */
1458
1.32M
        if (num_points && (i % 5 == 0)) {
1459
            /* loop over all scalars */
1460
357k
            for (num = 0; num < num_points; ++num) {
1461
178k
                bits = get_bit(scalars[num], i + 4) << 5;
1462
178k
                bits |= get_bit(scalars[num], i + 3) << 4;
1463
178k
                bits |= get_bit(scalars[num], i + 2) << 3;
1464
178k
                bits |= get_bit(scalars[num], i + 1) << 2;
1465
178k
                bits |= get_bit(scalars[num], i) << 1;
1466
178k
                bits |= get_bit(scalars[num], i - 1);
1467
178k
                ossl_ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1468
1469
                /*
1470
                 * select the point to add or subtract, in constant time
1471
                 */
1472
178k
                select_point(digit, 17, pre_comp[num], tmp);
1473
178k
                felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1474
                                            * point */
1475
178k
                copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1476
1477
178k
                if (!skip) {
1478
176k
                    point_add(nq[0],  nq[1],  nq[2],
1479
176k
                              nq[0],  nq[1],  nq[2], mixed,
1480
176k
                              tmp[0], tmp[1], tmp[2]);
1481
176k
                } else {
1482
2.32k
                    memcpy(nq, tmp, 3 * sizeof(felem));
1483
2.32k
                    skip = 0;
1484
2.32k
                }
1485
178k
            }
1486
178k
        }
1487
1.32M
    }
1488
6.72k
    felem_assign(x_out, nq[0]);
1489
6.72k
    felem_assign(y_out, nq[1]);
1490
6.72k
    felem_assign(z_out, nq[2]);
1491
6.72k
}
1492
1493
/* Precomputation for the group generator. */
1494
struct nistp384_pre_comp_st {
1495
    felem g_pre_comp[16][3];
1496
    CRYPTO_REF_COUNT references;
1497
};
1498
1499
const EC_METHOD *ossl_ec_GFp_nistp384_method(void)
1500
17.5k
{
1501
17.5k
    static const EC_METHOD ret = {
1502
17.5k
        EC_FLAGS_DEFAULT_OCT,
1503
17.5k
        NID_X9_62_prime_field,
1504
17.5k
        ossl_ec_GFp_nistp384_group_init,
1505
17.5k
        ossl_ec_GFp_simple_group_finish,
1506
17.5k
        ossl_ec_GFp_simple_group_clear_finish,
1507
17.5k
        ossl_ec_GFp_nist_group_copy,
1508
17.5k
        ossl_ec_GFp_nistp384_group_set_curve,
1509
17.5k
        ossl_ec_GFp_simple_group_get_curve,
1510
17.5k
        ossl_ec_GFp_simple_group_get_degree,
1511
17.5k
        ossl_ec_group_simple_order_bits,
1512
17.5k
        ossl_ec_GFp_simple_group_check_discriminant,
1513
17.5k
        ossl_ec_GFp_simple_point_init,
1514
17.5k
        ossl_ec_GFp_simple_point_finish,
1515
17.5k
        ossl_ec_GFp_simple_point_clear_finish,
1516
17.5k
        ossl_ec_GFp_simple_point_copy,
1517
17.5k
        ossl_ec_GFp_simple_point_set_to_infinity,
1518
17.5k
        ossl_ec_GFp_simple_point_set_affine_coordinates,
1519
17.5k
        ossl_ec_GFp_nistp384_point_get_affine_coordinates,
1520
17.5k
        0, /* point_set_compressed_coordinates */
1521
17.5k
        0, /* point2oct */
1522
17.5k
        0, /* oct2point */
1523
17.5k
        ossl_ec_GFp_simple_add,
1524
17.5k
        ossl_ec_GFp_simple_dbl,
1525
17.5k
        ossl_ec_GFp_simple_invert,
1526
17.5k
        ossl_ec_GFp_simple_is_at_infinity,
1527
17.5k
        ossl_ec_GFp_simple_is_on_curve,
1528
17.5k
        ossl_ec_GFp_simple_cmp,
1529
17.5k
        ossl_ec_GFp_simple_make_affine,
1530
17.5k
        ossl_ec_GFp_simple_points_make_affine,
1531
17.5k
        ossl_ec_GFp_nistp384_points_mul,
1532
17.5k
        ossl_ec_GFp_nistp384_precompute_mult,
1533
17.5k
        ossl_ec_GFp_nistp384_have_precompute_mult,
1534
17.5k
        ossl_ec_GFp_nist_field_mul,
1535
17.5k
        ossl_ec_GFp_nist_field_sqr,
1536
17.5k
        0, /* field_div */
1537
17.5k
        ossl_ec_GFp_simple_field_inv,
1538
17.5k
        0, /* field_encode */
1539
17.5k
        0, /* field_decode */
1540
17.5k
        0, /* field_set_to_one */
1541
17.5k
        ossl_ec_key_simple_priv2oct,
1542
17.5k
        ossl_ec_key_simple_oct2priv,
1543
17.5k
        0, /* set private */
1544
17.5k
        ossl_ec_key_simple_generate_key,
1545
17.5k
        ossl_ec_key_simple_check_key,
1546
17.5k
        ossl_ec_key_simple_generate_public_key,
1547
17.5k
        0, /* keycopy */
1548
17.5k
        0, /* keyfinish */
1549
17.5k
        ossl_ecdh_simple_compute_key,
1550
17.5k
        ossl_ecdsa_simple_sign_setup,
1551
17.5k
        ossl_ecdsa_simple_sign_sig,
1552
17.5k
        ossl_ecdsa_simple_verify_sig,
1553
17.5k
        0, /* field_inverse_mod_ord */
1554
17.5k
        0, /* blind_coordinates */
1555
17.5k
        0, /* ladder_pre */
1556
17.5k
        0, /* ladder_step */
1557
17.5k
        0  /* ladder_post */
1558
17.5k
    };
1559
1560
17.5k
    return &ret;
1561
17.5k
}
1562
1563
/******************************************************************************/
1564
/*
1565
 * FUNCTIONS TO MANAGE PRECOMPUTATION
1566
 */
1567
1568
static NISTP384_PRE_COMP *nistp384_pre_comp_new(void)
1569
0
{
1570
0
    NISTP384_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1571
1572
0
    if (ret == NULL)
1573
0
        return ret;
1574
1575
0
    if (!CRYPTO_NEW_REF(&ret->references, 1)) {
1576
0
        OPENSSL_free(ret);
1577
0
        return NULL;
1578
0
    }
1579
0
    return ret;
1580
0
}
1581
1582
NISTP384_PRE_COMP *ossl_ec_nistp384_pre_comp_dup(NISTP384_PRE_COMP *p)
1583
0
{
1584
0
    int i;
1585
1586
0
    if (p != NULL)
1587
0
        CRYPTO_UP_REF(&p->references, &i);
1588
0
    return p;
1589
0
}
1590
1591
void ossl_ec_nistp384_pre_comp_free(NISTP384_PRE_COMP *p)
1592
0
{
1593
0
    int i;
1594
1595
0
    if (p == NULL)
1596
0
        return;
1597
1598
0
    CRYPTO_DOWN_REF(&p->references, &i);
1599
0
    REF_PRINT_COUNT("ossl_ec_nistp384", i, p);
1600
0
    if (i > 0)
1601
0
        return;
1602
0
    REF_ASSERT_ISNT(i < 0);
1603
1604
0
    CRYPTO_FREE_REF(&p->references);
1605
0
    OPENSSL_free(p);
1606
0
}
1607
1608
/******************************************************************************/
1609
/*
1610
 * OPENSSL EC_METHOD FUNCTIONS
1611
 */
1612
1613
int ossl_ec_GFp_nistp384_group_init(EC_GROUP *group)
1614
37.1k
{
1615
37.1k
    int ret;
1616
1617
37.1k
    ret = ossl_ec_GFp_simple_group_init(group);
1618
37.1k
    group->a_is_minus3 = 1;
1619
37.1k
    return ret;
1620
37.1k
}
1621
1622
int ossl_ec_GFp_nistp384_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1623
                                         const BIGNUM *a, const BIGNUM *b,
1624
                                         BN_CTX *ctx)
1625
17.5k
{
1626
17.5k
    int ret = 0;
1627
17.5k
    BIGNUM *curve_p, *curve_a, *curve_b;
1628
17.5k
#ifndef FIPS_MODULE
1629
17.5k
    BN_CTX *new_ctx = NULL;
1630
1631
17.5k
    if (ctx == NULL)
1632
0
        ctx = new_ctx = BN_CTX_new();
1633
17.5k
#endif
1634
17.5k
    if (ctx == NULL)
1635
0
        return 0;
1636
1637
17.5k
    BN_CTX_start(ctx);
1638
17.5k
    curve_p = BN_CTX_get(ctx);
1639
17.5k
    curve_a = BN_CTX_get(ctx);
1640
17.5k
    curve_b = BN_CTX_get(ctx);
1641
17.5k
    if (curve_b == NULL)
1642
0
        goto err;
1643
17.5k
    BN_bin2bn(nistp384_curve_params[0], sizeof(felem_bytearray), curve_p);
1644
17.5k
    BN_bin2bn(nistp384_curve_params[1], sizeof(felem_bytearray), curve_a);
1645
17.5k
    BN_bin2bn(nistp384_curve_params[2], sizeof(felem_bytearray), curve_b);
1646
17.5k
    if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1647
0
        ERR_raise(ERR_LIB_EC, EC_R_WRONG_CURVE_PARAMETERS);
1648
0
        goto err;
1649
0
    }
1650
17.5k
    group->field_mod_func = BN_nist_mod_384;
1651
17.5k
    ret = ossl_ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1652
17.5k
 err:
1653
17.5k
    BN_CTX_end(ctx);
1654
17.5k
#ifndef FIPS_MODULE
1655
17.5k
    BN_CTX_free(new_ctx);
1656
17.5k
#endif
1657
17.5k
    return ret;
1658
17.5k
}
1659
1660
/*
1661
 * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1662
 * (X/Z^2, Y/Z^3)
1663
 */
1664
int ossl_ec_GFp_nistp384_point_get_affine_coordinates(const EC_GROUP *group,
1665
                                                      const EC_POINT *point,
1666
                                                      BIGNUM *x, BIGNUM *y,
1667
                                                      BN_CTX *ctx)
1668
8.12k
{
1669
8.12k
    felem z1, z2, x_in, y_in, x_out, y_out;
1670
8.12k
    widefelem tmp;
1671
1672
8.12k
    if (EC_POINT_is_at_infinity(group, point)) {
1673
0
        ERR_raise(ERR_LIB_EC, EC_R_POINT_AT_INFINITY);
1674
0
        return 0;
1675
0
    }
1676
8.12k
    if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1677
8.12k
        (!BN_to_felem(z1, point->Z)))
1678
0
        return 0;
1679
8.12k
    felem_inv(z2, z1);
1680
8.12k
    felem_square(tmp, z2);
1681
8.12k
    felem_reduce(z1, tmp);
1682
8.12k
    felem_mul(tmp, x_in, z1);
1683
8.12k
    felem_reduce(x_in, tmp);
1684
8.12k
    felem_contract(x_out, x_in);
1685
8.12k
    if (x != NULL) {
1686
8.12k
        if (!felem_to_BN(x, x_out)) {
1687
0
            ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1688
0
            return 0;
1689
0
        }
1690
8.12k
    }
1691
8.12k
    felem_mul(tmp, z1, z2);
1692
8.12k
    felem_reduce(z1, tmp);
1693
8.12k
    felem_mul(tmp, y_in, z1);
1694
8.12k
    felem_reduce(y_in, tmp);
1695
8.12k
    felem_contract(y_out, y_in);
1696
8.12k
    if (y != NULL) {
1697
6.46k
        if (!felem_to_BN(y, y_out)) {
1698
0
            ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1699
0
            return 0;
1700
0
        }
1701
6.46k
    }
1702
8.12k
    return 1;
1703
8.12k
}
1704
1705
/* points below is of size |num|, and tmp_felems is of size |num+1/ */
1706
static void make_points_affine(size_t num, felem points[][3],
1707
                               felem tmp_felems[])
1708
0
{
1709
    /*
1710
     * Runs in constant time, unless an input is the point at infinity (which
1711
     * normally shouldn't happen).
1712
     */
1713
0
    ossl_ec_GFp_nistp_points_make_affine_internal(num,
1714
0
                                                  points,
1715
0
                                                  sizeof(felem),
1716
0
                                                  tmp_felems,
1717
0
                                                  (void (*)(void *))felem_one,
1718
0
                                                  felem_is_zero_int,
1719
0
                                                  (void (*)(void *, const void *))
1720
0
                                                  felem_assign,
1721
0
                                                  (void (*)(void *, const void *))
1722
0
                                                  felem_square_reduce,
1723
0
                                                  (void (*)(void *, const void *, const void*))
1724
0
                                                  felem_mul_reduce,
1725
0
                                                  (void (*)(void *, const void *))
1726
0
                                                  felem_inv,
1727
0
                                                  (void (*)(void *, const void *))
1728
0
                                                  felem_contract);
1729
0
}
1730
1731
/*
1732
 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1733
 * values Result is stored in r (r can equal one of the inputs).
1734
 */
1735
int ossl_ec_GFp_nistp384_points_mul(const EC_GROUP *group, EC_POINT *r,
1736
                                    const BIGNUM *scalar, size_t num,
1737
                                    const EC_POINT *points[],
1738
                                    const BIGNUM *scalars[], BN_CTX *ctx)
1739
6.72k
{
1740
6.72k
    int ret = 0;
1741
6.72k
    int j;
1742
6.72k
    int mixed = 0;
1743
6.72k
    BIGNUM *x, *y, *z, *tmp_scalar;
1744
6.72k
    felem_bytearray g_secret;
1745
6.72k
    felem_bytearray *secrets = NULL;
1746
6.72k
    felem (*pre_comp)[17][3] = NULL;
1747
6.72k
    felem *tmp_felems = NULL;
1748
6.72k
    unsigned int i;
1749
6.72k
    int num_bytes;
1750
6.72k
    int have_pre_comp = 0;
1751
6.72k
    size_t num_points = num;
1752
6.72k
    felem x_in, y_in, z_in, x_out, y_out, z_out;
1753
6.72k
    NISTP384_PRE_COMP *pre = NULL;
1754
6.72k
    felem(*g_pre_comp)[3] = NULL;
1755
6.72k
    EC_POINT *generator = NULL;
1756
6.72k
    const EC_POINT *p = NULL;
1757
6.72k
    const BIGNUM *p_scalar = NULL;
1758
1759
6.72k
    BN_CTX_start(ctx);
1760
6.72k
    x = BN_CTX_get(ctx);
1761
6.72k
    y = BN_CTX_get(ctx);
1762
6.72k
    z = BN_CTX_get(ctx);
1763
6.72k
    tmp_scalar = BN_CTX_get(ctx);
1764
6.72k
    if (tmp_scalar == NULL)
1765
0
        goto err;
1766
1767
6.72k
    if (scalar != NULL) {
1768
4.43k
        pre = group->pre_comp.nistp384;
1769
4.43k
        if (pre)
1770
            /* we have precomputation, try to use it */
1771
0
            g_pre_comp = &pre->g_pre_comp[0];
1772
4.43k
        else
1773
            /* try to use the standard precomputation */
1774
4.43k
            g_pre_comp = (felem(*)[3]) gmul;
1775
4.43k
        generator = EC_POINT_new(group);
1776
4.43k
        if (generator == NULL)
1777
0
            goto err;
1778
        /* get the generator from precomputation */
1779
4.43k
        if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1780
4.43k
            !felem_to_BN(y, g_pre_comp[1][1]) ||
1781
4.43k
            !felem_to_BN(z, g_pre_comp[1][2])) {
1782
0
            ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1783
0
            goto err;
1784
0
        }
1785
4.43k
        if (!ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group,
1786
4.43k
                                                                generator,
1787
4.43k
                                                                x, y, z, ctx))
1788
0
            goto err;
1789
4.43k
        if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1790
            /* precomputation matches generator */
1791
4.43k
            have_pre_comp = 1;
1792
0
        else
1793
            /*
1794
             * we don't have valid precomputation: treat the generator as a
1795
             * random point
1796
             */
1797
0
            num_points++;
1798
4.43k
    }
1799
1800
6.72k
    if (num_points > 0) {
1801
2.32k
        if (num_points >= 2) {
1802
            /*
1803
             * unless we precompute multiples for just one point, converting
1804
             * those into affine form is time well spent
1805
             */
1806
0
            mixed = 1;
1807
0
        }
1808
2.32k
        secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1809
2.32k
        pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1810
2.32k
        if (mixed)
1811
0
            tmp_felems =
1812
0
                OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1));
1813
2.32k
        if ((secrets == NULL) || (pre_comp == NULL)
1814
2.32k
            || (mixed && (tmp_felems == NULL)))
1815
0
            goto err;
1816
1817
        /*
1818
         * we treat NULL scalars as 0, and NULL points as points at infinity,
1819
         * i.e., they contribute nothing to the linear combination
1820
         */
1821
4.64k
        for (i = 0; i < num_points; ++i) {
1822
2.32k
            if (i == num) {
1823
                /*
1824
                 * we didn't have a valid precomputation, so we pick the
1825
                 * generator
1826
                 */
1827
0
                p = EC_GROUP_get0_generator(group);
1828
0
                p_scalar = scalar;
1829
2.32k
            } else {
1830
                /* the i^th point */
1831
2.32k
                p = points[i];
1832
2.32k
                p_scalar = scalars[i];
1833
2.32k
            }
1834
2.32k
            if (p_scalar != NULL && p != NULL) {
1835
                /* reduce scalar to 0 <= scalar < 2^384 */
1836
2.32k
                if ((BN_num_bits(p_scalar) > 384)
1837
2.32k
                    || (BN_is_negative(p_scalar))) {
1838
                    /*
1839
                     * this is an unusual input, and we don't guarantee
1840
                     * constant-timeness
1841
                     */
1842
0
                    if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1843
0
                        ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1844
0
                        goto err;
1845
0
                    }
1846
0
                    num_bytes = BN_bn2lebinpad(tmp_scalar,
1847
0
                                               secrets[i], sizeof(secrets[i]));
1848
2.32k
                } else {
1849
2.32k
                    num_bytes = BN_bn2lebinpad(p_scalar,
1850
2.32k
                                               secrets[i], sizeof(secrets[i]));
1851
2.32k
                }
1852
2.32k
                if (num_bytes < 0) {
1853
0
                    ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1854
0
                    goto err;
1855
0
                }
1856
                /* precompute multiples */
1857
2.32k
                if ((!BN_to_felem(x_out, p->X)) ||
1858
2.32k
                    (!BN_to_felem(y_out, p->Y)) ||
1859
2.32k
                    (!BN_to_felem(z_out, p->Z)))
1860
0
                    goto err;
1861
2.32k
                memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1862
2.32k
                memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1863
2.32k
                memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1864
37.1k
                for (j = 2; j <= 16; ++j) {
1865
34.8k
                    if (j & 1) {
1866
16.2k
                        point_add(pre_comp[i][j][0],     pre_comp[i][j][1],     pre_comp[i][j][2],
1867
16.2k
                                  pre_comp[i][1][0],     pre_comp[i][1][1],     pre_comp[i][1][2], 0,
1868
16.2k
                                  pre_comp[i][j - 1][0], pre_comp[i][j - 1][1], pre_comp[i][j - 1][2]);
1869
18.5k
                    } else {
1870
18.5k
                        point_double(pre_comp[i][j][0],     pre_comp[i][j][1],     pre_comp[i][j][2],
1871
18.5k
                                     pre_comp[i][j / 2][0], pre_comp[i][j / 2][1], pre_comp[i][j / 2][2]);
1872
18.5k
                    }
1873
34.8k
                }
1874
2.32k
            }
1875
2.32k
        }
1876
2.32k
        if (mixed)
1877
0
            make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1878
2.32k
    }
1879
1880
    /* the scalar for the generator */
1881
6.72k
    if (scalar != NULL && have_pre_comp) {
1882
4.43k
        memset(g_secret, 0, sizeof(g_secret));
1883
        /* reduce scalar to 0 <= scalar < 2^384 */
1884
4.43k
        if ((BN_num_bits(scalar) > 384) || (BN_is_negative(scalar))) {
1885
            /*
1886
             * this is an unusual input, and we don't guarantee
1887
             * constant-timeness
1888
             */
1889
55
            if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
1890
0
                ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1891
0
                goto err;
1892
0
            }
1893
55
            num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
1894
4.37k
        } else {
1895
4.37k
            num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
1896
4.37k
        }
1897
        /* do the multiplication with generator precomputation */
1898
4.43k
        batch_mul(x_out, y_out, z_out,
1899
4.43k
                  (const felem_bytearray(*))secrets, num_points,
1900
4.43k
                  g_secret,
1901
4.43k
                  mixed, (const felem(*)[17][3])pre_comp,
1902
4.43k
                  (const felem(*)[3])g_pre_comp);
1903
4.43k
    } else {
1904
        /* do the multiplication without generator precomputation */
1905
2.28k
        batch_mul(x_out, y_out, z_out,
1906
2.28k
                  (const felem_bytearray(*))secrets, num_points,
1907
2.28k
                  NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1908
2.28k
    }
1909
    /* reduce the output to its unique minimal representation */
1910
6.72k
    felem_contract(x_in, x_out);
1911
6.72k
    felem_contract(y_in, y_out);
1912
6.72k
    felem_contract(z_in, z_out);
1913
6.72k
    if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1914
6.72k
        (!felem_to_BN(z, z_in))) {
1915
0
        ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1916
0
        goto err;
1917
0
    }
1918
6.72k
    ret = ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group, r, x, y, z,
1919
6.72k
                                                             ctx);
1920
1921
6.72k
 err:
1922
6.72k
    BN_CTX_end(ctx);
1923
6.72k
    EC_POINT_free(generator);
1924
6.72k
    OPENSSL_free(secrets);
1925
6.72k
    OPENSSL_free(pre_comp);
1926
6.72k
    OPENSSL_free(tmp_felems);
1927
6.72k
    return ret;
1928
6.72k
}
1929
1930
int ossl_ec_GFp_nistp384_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
1931
0
{
1932
0
    int ret = 0;
1933
0
    NISTP384_PRE_COMP *pre = NULL;
1934
0
    int i, j;
1935
0
    BIGNUM *x, *y;
1936
0
    EC_POINT *generator = NULL;
1937
0
    felem tmp_felems[16];
1938
0
#ifndef FIPS_MODULE
1939
0
    BN_CTX *new_ctx = NULL;
1940
0
#endif
1941
1942
    /* throw away old precomputation */
1943
0
    EC_pre_comp_free(group);
1944
1945
0
#ifndef FIPS_MODULE
1946
0
    if (ctx == NULL)
1947
0
        ctx = new_ctx = BN_CTX_new();
1948
0
#endif
1949
0
    if (ctx == NULL)
1950
0
        return 0;
1951
1952
0
    BN_CTX_start(ctx);
1953
0
    x = BN_CTX_get(ctx);
1954
0
    y = BN_CTX_get(ctx);
1955
0
    if (y == NULL)
1956
0
        goto err;
1957
    /* get the generator */
1958
0
    if (group->generator == NULL)
1959
0
        goto err;
1960
0
    generator = EC_POINT_new(group);
1961
0
    if (generator == NULL)
1962
0
        goto err;
1963
0
    BN_bin2bn(nistp384_curve_params[3], sizeof(felem_bytearray), x);
1964
0
    BN_bin2bn(nistp384_curve_params[4], sizeof(felem_bytearray), y);
1965
0
    if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
1966
0
        goto err;
1967
0
    if ((pre = nistp384_pre_comp_new()) == NULL)
1968
0
        goto err;
1969
    /*
1970
     * if the generator is the standard one, use built-in precomputation
1971
     */
1972
0
    if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
1973
0
        memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
1974
0
        goto done;
1975
0
    }
1976
0
    if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) ||
1977
0
        (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) ||
1978
0
        (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
1979
0
        goto err;
1980
    /* compute 2^95*G, 2^190*G, 2^285*G */
1981
0
    for (i = 1; i <= 4; i <<= 1) {
1982
0
        point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2],
1983
0
                     pre->g_pre_comp[i][0],  pre->g_pre_comp[i][1],    pre->g_pre_comp[i][2]);
1984
0
        for (j = 0; j < 94; ++j) {
1985
0
            point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2],
1986
0
                         pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2]);
1987
0
        }
1988
0
    }
1989
    /* g_pre_comp[0] is the point at infinity */
1990
0
    memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
1991
    /* the remaining multiples */
1992
    /* 2^95*G + 2^190*G */
1993
0
    point_add(pre->g_pre_comp[6][0],  pre->g_pre_comp[6][1],  pre->g_pre_comp[6][2],
1994
0
              pre->g_pre_comp[4][0],  pre->g_pre_comp[4][1],  pre->g_pre_comp[4][2], 0,
1995
0
              pre->g_pre_comp[2][0],  pre->g_pre_comp[2][1],  pre->g_pre_comp[2][2]);
1996
    /* 2^95*G + 2^285*G */
1997
0
    point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1], pre->g_pre_comp[10][2],
1998
0
              pre->g_pre_comp[8][0],  pre->g_pre_comp[8][1],  pre->g_pre_comp[8][2], 0,
1999
0
              pre->g_pre_comp[2][0],  pre->g_pre_comp[2][1],  pre->g_pre_comp[2][2]);
2000
    /* 2^190*G + 2^285*G */
2001
0
    point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2002
0
              pre->g_pre_comp[8][0],  pre->g_pre_comp[8][1],  pre->g_pre_comp[8][2], 0,
2003
0
              pre->g_pre_comp[4][0],  pre->g_pre_comp[4][1],  pre->g_pre_comp[4][2]);
2004
    /* 2^95*G + 2^190*G + 2^285*G */
2005
0
    point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1], pre->g_pre_comp[14][2],
2006
0
              pre->g_pre_comp[12][0], pre->g_pre_comp[12][1], pre->g_pre_comp[12][2], 0,
2007
0
              pre->g_pre_comp[2][0],  pre->g_pre_comp[2][1],  pre->g_pre_comp[2][2]);
2008
0
    for (i = 1; i < 8; ++i) {
2009
        /* odd multiples: add G */
2010
0
        point_add(pre->g_pre_comp[2 * i + 1][0], pre->g_pre_comp[2 * i + 1][1], pre->g_pre_comp[2 * i + 1][2],
2011
0
                  pre->g_pre_comp[2 * i][0],     pre->g_pre_comp[2 * i][1],     pre->g_pre_comp[2 * i][2], 0,
2012
0
                  pre->g_pre_comp[1][0],         pre->g_pre_comp[1][1],         pre->g_pre_comp[1][2]);
2013
0
    }
2014
0
    make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2015
2016
0
 done:
2017
0
    SETPRECOMP(group, nistp384, pre);
2018
0
    ret = 1;
2019
0
    pre = NULL;
2020
0
 err:
2021
0
    BN_CTX_end(ctx);
2022
0
    EC_POINT_free(generator);
2023
0
#ifndef FIPS_MODULE
2024
0
    BN_CTX_free(new_ctx);
2025
0
#endif
2026
0
    ossl_ec_nistp384_pre_comp_free(pre);
2027
0
    return ret;
2028
0
}
2029
2030
int ossl_ec_GFp_nistp384_have_precompute_mult(const EC_GROUP *group)
2031
0
{
2032
    return HAVEPRECOMP(group, nistp384);
2033
0
}