Coverage Report

Created: 2026-05-24 07:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/openssl34/crypto/ec/ecp_nistp384.c
Line
Count
Source
1
/*
2
 * Copyright 2023-2025 The OpenSSL Project Authors. All Rights Reserved.
3
 *
4
 * Licensed under the Apache License 2.0 (the "License").  You may not use
5
 * this file except in compliance with the License.  You can obtain a copy
6
 * in the file LICENSE in the source distribution or at
7
 * https://www.openssl.org/source/license.html
8
 */
9
10
/* Copyright 2023 IBM Corp.
11
 *
12
 * Licensed under the Apache License, Version 2.0 (the "License");
13
 *
14
 * you may not use this file except in compliance with the License.
15
 * You may obtain a copy of the License at
16
 *
17
 *     http://www.apache.org/licenses/LICENSE-2.0
18
 *
19
 *  Unless required by applicable law or agreed to in writing, software
20
 *  distributed under the License is distributed on an "AS IS" BASIS,
21
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22
 *  See the License for the specific language governing permissions and
23
 *  limitations under the License.
24
 */
25
26
/*
27
 * Designed for 56-bit limbs by Rohan McLure <rohan.mclure@linux.ibm.com>.
28
 * The layout is based on that of ecp_nistp{224,521}.c, allowing even for asm
29
 * acceleration of felem_{square,mul} as supported in these files.
30
 */
31
32
#include <openssl/e_os2.h>
33
34
#include <string.h>
35
#include <openssl/err.h>
36
#include "ec_local.h"
37
38
#include "internal/numbers.h"
39
40
#ifndef INT128_MAX
41
#error "Your compiler doesn't appear to support 128-bit integer types"
42
#endif
43
44
typedef uint8_t u8;
45
typedef uint64_t u64;
46
47
/*
48
 * The underlying field. P384 operates over GF(2^384-2^128-2^96+2^32-1). We
49
 * can serialize an element of this field into 48 bytes. We call this an
50
 * felem_bytearray.
51
 */
52
53
typedef u8 felem_bytearray[48];
54
55
/*
56
 * These are the parameters of P384, taken from FIPS 186-3, section D.1.2.4.
57
 * These values are big-endian.
58
 */
59
static const felem_bytearray nistp384_curve_params[5] = {
60
    { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* p */
61
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
62
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
63
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF },
64
    { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* a = -3 */
65
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
66
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
67
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFC },
68
    { 0xB3, 0x31, 0x2F, 0xA7, 0xE2, 0x3E, 0xE7, 0xE4, 0x98, 0x8E, 0x05, 0x6B, /* b */
69
        0xE3, 0xF8, 0x2D, 0x19, 0x18, 0x1D, 0x9C, 0x6E, 0xFE, 0x81, 0x41, 0x12,
70
        0x03, 0x14, 0x08, 0x8F, 0x50, 0x13, 0x87, 0x5A, 0xC6, 0x56, 0x39, 0x8D,
71
        0x8A, 0x2E, 0xD1, 0x9D, 0x2A, 0x85, 0xC8, 0xED, 0xD3, 0xEC, 0x2A, 0xEF },
72
    { 0xAA, 0x87, 0xCA, 0x22, 0xBE, 0x8B, 0x05, 0x37, 0x8E, 0xB1, 0xC7, 0x1E, /* x */
73
        0xF3, 0x20, 0xAD, 0x74, 0x6E, 0x1D, 0x3B, 0x62, 0x8B, 0xA7, 0x9B, 0x98,
74
        0x59, 0xF7, 0x41, 0xE0, 0x82, 0x54, 0x2A, 0x38, 0x55, 0x02, 0xF2, 0x5D,
75
        0xBF, 0x55, 0x29, 0x6C, 0x3A, 0x54, 0x5E, 0x38, 0x72, 0x76, 0x0A, 0xB7 },
76
    { 0x36, 0x17, 0xDE, 0x4A, 0x96, 0x26, 0x2C, 0x6F, 0x5D, 0x9E, 0x98, 0xBF, /* y */
77
        0x92, 0x92, 0xDC, 0x29, 0xF8, 0xF4, 0x1D, 0xBD, 0x28, 0x9A, 0x14, 0x7C,
78
        0xE9, 0xDA, 0x31, 0x13, 0xB5, 0xF0, 0xB8, 0xC0, 0x0A, 0x60, 0xB1, 0xCE,
79
        0x1D, 0x7E, 0x81, 0x9D, 0x7A, 0x43, 0x1D, 0x7C, 0x90, 0xEA, 0x0E, 0x5F },
80
};
81
82
/*-
83
 * The representation of field elements.
84
 * ------------------------------------
85
 *
86
 * We represent field elements with seven values. These values are either 64 or
87
 * 128 bits and the field element represented is:
88
 *   v[0]*2^0 + v[1]*2^56 + v[2]*2^112 + ... + v[6]*2^336  (mod p)
89
 * Each of the seven values is called a 'limb'. Since the limbs are spaced only
90
 * 56 bits apart, but are greater than 56 bits in length, the most significant
91
 * bits of each limb overlap with the least significant bits of the next
92
 *
93
 * This representation is considered to be 'redundant' in the sense that
94
 * intermediate values can each contain more than a 56-bit value in each limb.
95
 * Reduction causes all but the final limb to be reduced to contain a value less
96
 * than 2^56, with the final value represented allowed to be larger than 2^384,
97
 * inasmuch as we can be sure that arithmetic overflow remains impossible. The
98
 * reduced value must of course be congruent to the unreduced value.
99
 *
100
 * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
101
 * 'widefelem', featuring enough bits to store the result of a multiplication
102
 * and even some further arithmetic without need for immediate reduction.
103
 */
104
105
734M
#define NLIMBS 7
106
107
typedef uint64_t limb;
108
typedef uint128_t widelimb;
109
typedef limb limb_aX __attribute((__aligned__(1)));
110
typedef limb felem[NLIMBS];
111
typedef widelimb widefelem[2 * NLIMBS - 1];
112
113
static const limb bottom56bits = 0xffffffffffffff;
114
115
/* Helper functions (de)serialising reduced field elements in little endian */
116
static void bin48_to_felem(felem out, const u8 in[48])
117
33.1k
{
118
33.1k
    memset(out, 0, 56);
119
33.1k
    out[0] = (*((limb *)&in[0])) & bottom56bits;
120
33.1k
    out[1] = (*((limb_aX *)&in[7])) & bottom56bits;
121
33.1k
    out[2] = (*((limb_aX *)&in[14])) & bottom56bits;
122
33.1k
    out[3] = (*((limb_aX *)&in[21])) & bottom56bits;
123
33.1k
    out[4] = (*((limb_aX *)&in[28])) & bottom56bits;
124
33.1k
    out[5] = (*((limb_aX *)&in[35])) & bottom56bits;
125
33.1k
    memmove(&out[6], &in[42], 6);
126
33.1k
}
127
128
static void felem_to_bin48(u8 out[48], const felem in)
129
50.1k
{
130
50.1k
    memset(out, 0, 48);
131
50.1k
    (*((limb *)&out[0])) |= (in[0] & bottom56bits);
132
50.1k
    (*((limb_aX *)&out[7])) |= (in[1] & bottom56bits);
133
50.1k
    (*((limb_aX *)&out[14])) |= (in[2] & bottom56bits);
134
50.1k
    (*((limb_aX *)&out[21])) |= (in[3] & bottom56bits);
135
50.1k
    (*((limb_aX *)&out[28])) |= (in[4] & bottom56bits);
136
50.1k
    (*((limb_aX *)&out[35])) |= (in[5] & bottom56bits);
137
50.1k
    memmove(&out[42], &in[6], 6);
138
50.1k
}
139
140
/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
141
static int BN_to_felem(felem out, const BIGNUM *bn)
142
33.1k
{
143
33.1k
    felem_bytearray b_out;
144
33.1k
    int num_bytes;
145
146
33.1k
    if (BN_is_negative(bn)) {
147
0
        ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
148
0
        return 0;
149
0
    }
150
33.1k
    num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
151
33.1k
    if (num_bytes < 0) {
152
0
        ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
153
0
        return 0;
154
0
    }
155
33.1k
    bin48_to_felem(out, b_out);
156
33.1k
    return 1;
157
33.1k
}
158
159
/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
160
static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
161
50.1k
{
162
50.1k
    felem_bytearray b_out;
163
164
50.1k
    felem_to_bin48(b_out, in);
165
50.1k
    return BN_lebin2bn(b_out, sizeof(b_out), out);
166
50.1k
}
167
168
/*-
169
 * Field operations
170
 * ----------------
171
 */
172
173
static void felem_one(felem out)
174
0
{
175
0
    out[0] = 1;
176
0
    memset(&out[1], 0, sizeof(limb) * (NLIMBS - 1));
177
0
}
178
179
static void felem_assign(felem out, const felem in)
180
10.2M
{
181
10.2M
    memcpy(out, in, sizeof(felem));
182
10.2M
}
183
184
/* felem_sum64 sets out = out + in. */
185
static void felem_sum64(felem out, const felem in)
186
4.51M
{
187
4.51M
    unsigned int i;
188
189
36.0M
    for (i = 0; i < NLIMBS; i++)
190
31.5M
        out[i] += in[i];
191
4.51M
}
192
193
/* felem_scalar sets out = in * scalar */
194
static void felem_scalar(felem out, const felem in, limb scalar)
195
13.2M
{
196
13.2M
    unsigned int i;
197
198
106M
    for (i = 0; i < NLIMBS; i++)
199
93.0M
        out[i] = in[i] * scalar;
200
13.2M
}
201
202
/* felem_scalar64 sets out = out * scalar */
203
static void felem_scalar64(felem out, limb scalar)
204
6.28M
{
205
6.28M
    unsigned int i;
206
207
50.2M
    for (i = 0; i < NLIMBS; i++)
208
43.9M
        out[i] *= scalar;
209
6.28M
}
210
211
/* felem_scalar128 sets out = out * scalar */
212
static void felem_scalar128(widefelem out, limb scalar)
213
2.09M
{
214
2.09M
    unsigned int i;
215
216
29.3M
    for (i = 0; i < 2 * NLIMBS - 1; i++)
217
27.2M
        out[i] *= scalar;
218
2.09M
}
219
220
/*-
221
 * felem_neg sets |out| to |-in|
222
 * On entry:
223
 *   in[i] < 2^60 - 2^29
224
 * On exit:
225
 *   out[i] < 2^60
226
 */
227
static void felem_neg(felem out, const felem in)
228
195k
{
229
    /*
230
     * In order to prevent underflow, we add a multiple of p before subtracting.
231
     * Use telescopic sums to represent 2^12 * p redundantly with each limb
232
     * of the form 2^60 + ...
233
     */
234
195k
    static const limb two60m52m4 = (((limb)1) << 60)
235
195k
        - (((limb)1) << 52)
236
195k
        - (((limb)1) << 4);
237
195k
    static const limb two60p44m12 = (((limb)1) << 60)
238
195k
        + (((limb)1) << 44)
239
195k
        - (((limb)1) << 12);
240
195k
    static const limb two60m28m4 = (((limb)1) << 60)
241
195k
        - (((limb)1) << 28)
242
195k
        - (((limb)1) << 4);
243
195k
    static const limb two60m4 = (((limb)1) << 60)
244
195k
        - (((limb)1) << 4);
245
246
195k
    out[0] = two60p44m12 - in[0];
247
195k
    out[1] = two60m52m4 - in[1];
248
195k
    out[2] = two60m28m4 - in[2];
249
195k
    out[3] = two60m4 - in[3];
250
195k
    out[4] = two60m4 - in[4];
251
195k
    out[5] = two60m4 - in[5];
252
195k
    out[6] = two60m4 - in[6];
253
195k
}
254
255
#if defined(ECP_NISTP384_ASM)
256
void p384_felem_diff64(felem out, const felem in);
257
void p384_felem_diff128(widefelem out, const widefelem in);
258
void p384_felem_diff_128_64(widefelem out, const felem in);
259
260
#define felem_diff64 p384_felem_diff64
261
#define felem_diff128 p384_felem_diff128
262
#define felem_diff_128_64 p384_felem_diff_128_64
263
264
#else
265
/*-
266
 * felem_diff64 subtracts |in| from |out|
267
 * On entry:
268
 *   in[i] < 2^60 - 2^52 - 2^4
269
 * On exit:
270
 *   out[i] < out_orig[i] + 2^60 + 2^44
271
 */
272
static void felem_diff64(felem out, const felem in)
273
3.52M
{
274
    /*
275
     * In order to prevent underflow, we add a multiple of p before subtracting.
276
     * Use telescopic sums to represent 2^12 * p redundantly with each limb
277
     * of the form 2^60 + ...
278
     */
279
280
3.52M
    static const limb two60m52m4 = (((limb)1) << 60)
281
3.52M
        - (((limb)1) << 52)
282
3.52M
        - (((limb)1) << 4);
283
3.52M
    static const limb two60p44m12 = (((limb)1) << 60)
284
3.52M
        + (((limb)1) << 44)
285
3.52M
        - (((limb)1) << 12);
286
3.52M
    static const limb two60m28m4 = (((limb)1) << 60)
287
3.52M
        - (((limb)1) << 28)
288
3.52M
        - (((limb)1) << 4);
289
3.52M
    static const limb two60m4 = (((limb)1) << 60)
290
3.52M
        - (((limb)1) << 4);
291
292
3.52M
    out[0] += two60p44m12 - in[0];
293
3.52M
    out[1] += two60m52m4 - in[1];
294
3.52M
    out[2] += two60m28m4 - in[2];
295
3.52M
    out[3] += two60m4 - in[3];
296
3.52M
    out[4] += two60m4 - in[4];
297
3.52M
    out[5] += two60m4 - in[5];
298
3.52M
    out[6] += two60m4 - in[6];
299
3.52M
}
300
301
/*
302
 * in[i] < 2^63
303
 * out[i] < out_orig[i] + 2^64 + 2^48
304
 */
305
static void felem_diff_128_64(widefelem out, const felem in)
306
5.93M
{
307
    /*
308
     * In order to prevent underflow, we add a multiple of p before subtracting.
309
     * Use telescopic sums to represent 2^16 * p redundantly with each limb
310
     * of the form 2^64 + ...
311
     */
312
313
5.93M
    static const widelimb two64m56m8 = (((widelimb)1) << 64)
314
5.93M
        - (((widelimb)1) << 56)
315
5.93M
        - (((widelimb)1) << 8);
316
5.93M
    static const widelimb two64m32m8 = (((widelimb)1) << 64)
317
5.93M
        - (((widelimb)1) << 32)
318
5.93M
        - (((widelimb)1) << 8);
319
5.93M
    static const widelimb two64m8 = (((widelimb)1) << 64)
320
5.93M
        - (((widelimb)1) << 8);
321
5.93M
    static const widelimb two64p48m16 = (((widelimb)1) << 64)
322
5.93M
        + (((widelimb)1) << 48)
323
5.93M
        - (((widelimb)1) << 16);
324
5.93M
    unsigned int i;
325
326
5.93M
    out[0] += two64p48m16;
327
5.93M
    out[1] += two64m56m8;
328
5.93M
    out[2] += two64m32m8;
329
5.93M
    out[3] += two64m8;
330
5.93M
    out[4] += two64m8;
331
5.93M
    out[5] += two64m8;
332
5.93M
    out[6] += two64m8;
333
334
47.4M
    for (i = 0; i < NLIMBS; i++)
335
41.5M
        out[i] -= in[i];
336
5.93M
}
337
338
/*
339
 * in[i] < 2^127 - 2^119 - 2^71
340
 * out[i] < out_orig[i] + 2^127 + 2^111
341
 */
342
static void felem_diff128(widefelem out, const widefelem in)
343
2.09M
{
344
    /*
345
     * In order to prevent underflow, we add a multiple of p before subtracting.
346
     * Use telescopic sums to represent 2^415 * p redundantly with each limb
347
     * of the form 2^127 + ...
348
     */
349
350
2.09M
    static const widelimb two127 = ((widelimb)1) << 127;
351
2.09M
    static const widelimb two127m71 = (((widelimb)1) << 127)
352
2.09M
        - (((widelimb)1) << 71);
353
2.09M
    static const widelimb two127p111m79m71 = (((widelimb)1) << 127)
354
2.09M
        + (((widelimb)1) << 111)
355
2.09M
        - (((widelimb)1) << 79)
356
2.09M
        - (((widelimb)1) << 71);
357
2.09M
    static const widelimb two127m119m71 = (((widelimb)1) << 127)
358
2.09M
        - (((widelimb)1) << 119)
359
2.09M
        - (((widelimb)1) << 71);
360
2.09M
    static const widelimb two127m95m71 = (((widelimb)1) << 127)
361
2.09M
        - (((widelimb)1) << 95)
362
2.09M
        - (((widelimb)1) << 71);
363
2.09M
    unsigned int i;
364
365
2.09M
    out[0] += two127;
366
2.09M
    out[1] += two127m71;
367
2.09M
    out[2] += two127m71;
368
2.09M
    out[3] += two127m71;
369
2.09M
    out[4] += two127m71;
370
2.09M
    out[5] += two127m71;
371
2.09M
    out[6] += two127p111m79m71;
372
2.09M
    out[7] += two127m119m71;
373
2.09M
    out[8] += two127m95m71;
374
2.09M
    out[9] += two127m71;
375
2.09M
    out[10] += two127m71;
376
2.09M
    out[11] += two127m71;
377
2.09M
    out[12] += two127m71;
378
379
29.3M
    for (i = 0; i < 2 * NLIMBS - 1; i++)
380
27.2M
        out[i] -= in[i];
381
2.09M
}
382
#endif /* ECP_NISTP384_ASM */
383
384
static void felem_square_ref(widefelem out, const felem in)
385
12.8M
{
386
12.8M
    felem inx2;
387
12.8M
    felem_scalar(inx2, in, 2);
388
389
12.8M
    out[0] = ((uint128_t)in[0]) * in[0];
390
391
12.8M
    out[1] = ((uint128_t)in[0]) * inx2[1];
392
393
12.8M
    out[2] = ((uint128_t)in[0]) * inx2[2]
394
12.8M
        + ((uint128_t)in[1]) * in[1];
395
396
12.8M
    out[3] = ((uint128_t)in[0]) * inx2[3]
397
12.8M
        + ((uint128_t)in[1]) * inx2[2];
398
399
12.8M
    out[4] = ((uint128_t)in[0]) * inx2[4]
400
12.8M
        + ((uint128_t)in[1]) * inx2[3]
401
12.8M
        + ((uint128_t)in[2]) * in[2];
402
403
12.8M
    out[5] = ((uint128_t)in[0]) * inx2[5]
404
12.8M
        + ((uint128_t)in[1]) * inx2[4]
405
12.8M
        + ((uint128_t)in[2]) * inx2[3];
406
407
12.8M
    out[6] = ((uint128_t)in[0]) * inx2[6]
408
12.8M
        + ((uint128_t)in[1]) * inx2[5]
409
12.8M
        + ((uint128_t)in[2]) * inx2[4]
410
12.8M
        + ((uint128_t)in[3]) * in[3];
411
412
12.8M
    out[7] = ((uint128_t)in[1]) * inx2[6]
413
12.8M
        + ((uint128_t)in[2]) * inx2[5]
414
12.8M
        + ((uint128_t)in[3]) * inx2[4];
415
416
12.8M
    out[8] = ((uint128_t)in[2]) * inx2[6]
417
12.8M
        + ((uint128_t)in[3]) * inx2[5]
418
12.8M
        + ((uint128_t)in[4]) * in[4];
419
420
12.8M
    out[9] = ((uint128_t)in[3]) * inx2[6]
421
12.8M
        + ((uint128_t)in[4]) * inx2[5];
422
423
12.8M
    out[10] = ((uint128_t)in[4]) * inx2[6]
424
12.8M
        + ((uint128_t)in[5]) * in[5];
425
426
12.8M
    out[11] = ((uint128_t)in[5]) * inx2[6];
427
428
12.8M
    out[12] = ((uint128_t)in[6]) * in[6];
429
12.8M
}
430
431
static void felem_mul_ref(widefelem out, const felem in1, const felem in2)
432
10.3M
{
433
10.3M
    out[0] = ((uint128_t)in1[0]) * in2[0];
434
435
10.3M
    out[1] = ((uint128_t)in1[0]) * in2[1]
436
10.3M
        + ((uint128_t)in1[1]) * in2[0];
437
438
10.3M
    out[2] = ((uint128_t)in1[0]) * in2[2]
439
10.3M
        + ((uint128_t)in1[1]) * in2[1]
440
10.3M
        + ((uint128_t)in1[2]) * in2[0];
441
442
10.3M
    out[3] = ((uint128_t)in1[0]) * in2[3]
443
10.3M
        + ((uint128_t)in1[1]) * in2[2]
444
10.3M
        + ((uint128_t)in1[2]) * in2[1]
445
10.3M
        + ((uint128_t)in1[3]) * in2[0];
446
447
10.3M
    out[4] = ((uint128_t)in1[0]) * in2[4]
448
10.3M
        + ((uint128_t)in1[1]) * in2[3]
449
10.3M
        + ((uint128_t)in1[2]) * in2[2]
450
10.3M
        + ((uint128_t)in1[3]) * in2[1]
451
10.3M
        + ((uint128_t)in1[4]) * in2[0];
452
453
10.3M
    out[5] = ((uint128_t)in1[0]) * in2[5]
454
10.3M
        + ((uint128_t)in1[1]) * in2[4]
455
10.3M
        + ((uint128_t)in1[2]) * in2[3]
456
10.3M
        + ((uint128_t)in1[3]) * in2[2]
457
10.3M
        + ((uint128_t)in1[4]) * in2[1]
458
10.3M
        + ((uint128_t)in1[5]) * in2[0];
459
460
10.3M
    out[6] = ((uint128_t)in1[0]) * in2[6]
461
10.3M
        + ((uint128_t)in1[1]) * in2[5]
462
10.3M
        + ((uint128_t)in1[2]) * in2[4]
463
10.3M
        + ((uint128_t)in1[3]) * in2[3]
464
10.3M
        + ((uint128_t)in1[4]) * in2[2]
465
10.3M
        + ((uint128_t)in1[5]) * in2[1]
466
10.3M
        + ((uint128_t)in1[6]) * in2[0];
467
468
10.3M
    out[7] = ((uint128_t)in1[1]) * in2[6]
469
10.3M
        + ((uint128_t)in1[2]) * in2[5]
470
10.3M
        + ((uint128_t)in1[3]) * in2[4]
471
10.3M
        + ((uint128_t)in1[4]) * in2[3]
472
10.3M
        + ((uint128_t)in1[5]) * in2[2]
473
10.3M
        + ((uint128_t)in1[6]) * in2[1];
474
475
10.3M
    out[8] = ((uint128_t)in1[2]) * in2[6]
476
10.3M
        + ((uint128_t)in1[3]) * in2[5]
477
10.3M
        + ((uint128_t)in1[4]) * in2[4]
478
10.3M
        + ((uint128_t)in1[5]) * in2[3]
479
10.3M
        + ((uint128_t)in1[6]) * in2[2];
480
481
10.3M
    out[9] = ((uint128_t)in1[3]) * in2[6]
482
10.3M
        + ((uint128_t)in1[4]) * in2[5]
483
10.3M
        + ((uint128_t)in1[5]) * in2[4]
484
10.3M
        + ((uint128_t)in1[6]) * in2[3];
485
486
10.3M
    out[10] = ((uint128_t)in1[4]) * in2[6]
487
10.3M
        + ((uint128_t)in1[5]) * in2[5]
488
10.3M
        + ((uint128_t)in1[6]) * in2[4];
489
490
10.3M
    out[11] = ((uint128_t)in1[5]) * in2[6]
491
10.3M
        + ((uint128_t)in1[6]) * in2[5];
492
493
10.3M
    out[12] = ((uint128_t)in1[6]) * in2[6];
494
10.3M
}
495
496
/*-
497
 * Reduce thirteen 128-bit coefficients to seven 64-bit coefficients.
498
 * in[i] < 2^128 - 2^125
499
 * out[i] < 2^56 for i < 6,
500
 * out[6] <= 2^48
501
 *
502
 * The technique in use here stems from the format of the prime modulus:
503
 * P384 = 2^384 - delta
504
 *
505
 * Thus we can reduce numbers of the form (X + 2^384 * Y) by substituting
506
 * them with (X + delta Y), with delta = 2^128 + 2^96 + (-2^32 + 1). These
507
 * coefficients are still quite large, and so we repeatedly apply this
508
 * technique on high-order bits in order to guarantee the desired bounds on
509
 * the size of our output.
510
 *
511
 * The three phases of elimination are as follows:
512
 * [1]: Y = 2^120 (in[12] | in[11] | in[10] | in[9])
513
 * [2]: Y = 2^8 (acc[8] | acc[7])
514
 * [3]: Y = 2^48 (acc[6] >> 48)
515
 * (Where a | b | c | d = (2^56)^3 a + (2^56)^2 b + (2^56) c + d)
516
 */
517
static void felem_reduce_ref(felem out, const widefelem in)
518
21.1M
{
519
    /*
520
     * In order to prevent underflow, we add a multiple of p before subtracting.
521
     * Use telescopic sums to represent 2^76 * p redundantly with each limb
522
     * of the form 2^124 + ...
523
     */
524
21.1M
    static const widelimb two124m68 = (((widelimb)1) << 124)
525
21.1M
        - (((widelimb)1) << 68);
526
21.1M
    static const widelimb two124m116m68 = (((widelimb)1) << 124)
527
21.1M
        - (((widelimb)1) << 116)
528
21.1M
        - (((widelimb)1) << 68);
529
21.1M
    static const widelimb two124p108m76 = (((widelimb)1) << 124)
530
21.1M
        + (((widelimb)1) << 108)
531
21.1M
        - (((widelimb)1) << 76);
532
21.1M
    static const widelimb two124m92m68 = (((widelimb)1) << 124)
533
21.1M
        - (((widelimb)1) << 92)
534
21.1M
        - (((widelimb)1) << 68);
535
21.1M
    widelimb temp, acc[9];
536
21.1M
    unsigned int i;
537
538
21.1M
    memcpy(acc, in, sizeof(widelimb) * 9);
539
540
21.1M
    acc[0] += two124p108m76;
541
21.1M
    acc[1] += two124m116m68;
542
21.1M
    acc[2] += two124m92m68;
543
21.1M
    acc[3] += two124m68;
544
21.1M
    acc[4] += two124m68;
545
21.1M
    acc[5] += two124m68;
546
21.1M
    acc[6] += two124m68;
547
548
    /* [1]: Eliminate in[9], ..., in[12] */
549
21.1M
    acc[8] += in[12] >> 32;
550
21.1M
    acc[7] += (in[12] & 0xffffffff) << 24;
551
21.1M
    acc[7] += in[12] >> 8;
552
21.1M
    acc[6] += (in[12] & 0xff) << 48;
553
21.1M
    acc[6] -= in[12] >> 16;
554
21.1M
    acc[5] -= (in[12] & 0xffff) << 40;
555
21.1M
    acc[6] += in[12] >> 48;
556
21.1M
    acc[5] += (in[12] & 0xffffffffffff) << 8;
557
558
21.1M
    acc[7] += in[11] >> 32;
559
21.1M
    acc[6] += (in[11] & 0xffffffff) << 24;
560
21.1M
    acc[6] += in[11] >> 8;
561
21.1M
    acc[5] += (in[11] & 0xff) << 48;
562
21.1M
    acc[5] -= in[11] >> 16;
563
21.1M
    acc[4] -= (in[11] & 0xffff) << 40;
564
21.1M
    acc[5] += in[11] >> 48;
565
21.1M
    acc[4] += (in[11] & 0xffffffffffff) << 8;
566
567
21.1M
    acc[6] += in[10] >> 32;
568
21.1M
    acc[5] += (in[10] & 0xffffffff) << 24;
569
21.1M
    acc[5] += in[10] >> 8;
570
21.1M
    acc[4] += (in[10] & 0xff) << 48;
571
21.1M
    acc[4] -= in[10] >> 16;
572
21.1M
    acc[3] -= (in[10] & 0xffff) << 40;
573
21.1M
    acc[4] += in[10] >> 48;
574
21.1M
    acc[3] += (in[10] & 0xffffffffffff) << 8;
575
576
21.1M
    acc[5] += in[9] >> 32;
577
21.1M
    acc[4] += (in[9] & 0xffffffff) << 24;
578
21.1M
    acc[4] += in[9] >> 8;
579
21.1M
    acc[3] += (in[9] & 0xff) << 48;
580
21.1M
    acc[3] -= in[9] >> 16;
581
21.1M
    acc[2] -= (in[9] & 0xffff) << 40;
582
21.1M
    acc[3] += in[9] >> 48;
583
21.1M
    acc[2] += (in[9] & 0xffffffffffff) << 8;
584
585
    /*
586
     * [2]: Eliminate acc[7], acc[8], that is the 7 and eighth limbs, as
587
     * well as the contributions made from eliminating higher limbs.
588
     * acc[7] < in[7] + 2^120 + 2^56 < in[7] + 2^121
589
     * acc[8] < in[8] + 2^96
590
     */
591
21.1M
    acc[4] += acc[8] >> 32;
592
21.1M
    acc[3] += (acc[8] & 0xffffffff) << 24;
593
21.1M
    acc[3] += acc[8] >> 8;
594
21.1M
    acc[2] += (acc[8] & 0xff) << 48;
595
21.1M
    acc[2] -= acc[8] >> 16;
596
21.1M
    acc[1] -= (acc[8] & 0xffff) << 40;
597
21.1M
    acc[2] += acc[8] >> 48;
598
21.1M
    acc[1] += (acc[8] & 0xffffffffffff) << 8;
599
600
21.1M
    acc[3] += acc[7] >> 32;
601
21.1M
    acc[2] += (acc[7] & 0xffffffff) << 24;
602
21.1M
    acc[2] += acc[7] >> 8;
603
21.1M
    acc[1] += (acc[7] & 0xff) << 48;
604
21.1M
    acc[1] -= acc[7] >> 16;
605
21.1M
    acc[0] -= (acc[7] & 0xffff) << 40;
606
21.1M
    acc[1] += acc[7] >> 48;
607
21.1M
    acc[0] += (acc[7] & 0xffffffffffff) << 8;
608
609
    /*-
610
     * acc[k] < in[k] + 2^124 + 2^121
611
     *        < in[k] + 2^125
612
     *        < 2^128, for k <= 6
613
     */
614
615
    /*
616
     * Carry 4 -> 5 -> 6
617
     * This has the effect of ensuring that these more significant limbs
618
     * will be small in value after eliminating high bits from acc[6].
619
     */
620
21.1M
    acc[5] += acc[4] >> 56;
621
21.1M
    acc[4] &= 0x00ffffffffffffff;
622
623
21.1M
    acc[6] += acc[5] >> 56;
624
21.1M
    acc[5] &= 0x00ffffffffffffff;
625
626
    /*-
627
     * acc[6] < in[6] + 2^124 + 2^121 + 2^72 + 2^16
628
     *        < in[6] + 2^125
629
     *        < 2^128
630
     */
631
632
    /* [3]: Eliminate high bits of acc[6] */
633
21.1M
    temp = acc[6] >> 48;
634
21.1M
    acc[6] &= 0x0000ffffffffffff;
635
636
    /* temp < 2^80 */
637
638
21.1M
    acc[3] += temp >> 40;
639
21.1M
    acc[2] += (temp & 0xffffffffff) << 16;
640
21.1M
    acc[2] += temp >> 16;
641
21.1M
    acc[1] += (temp & 0xffff) << 40;
642
21.1M
    acc[1] -= temp >> 24;
643
21.1M
    acc[0] -= (temp & 0xffffff) << 32;
644
21.1M
    acc[0] += temp;
645
646
    /*-
647
     * acc[k] < acc_old[k] + 2^64 + 2^56
648
     *        < in[k] + 2^124 + 2^121 + 2^72 + 2^64 + 2^56 + 2^16 , k < 4
649
     */
650
651
    /* Carry 0 -> 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
652
21.1M
    acc[1] += acc[0] >> 56; /* acc[1] < acc_old[1] + 2^72 */
653
21.1M
    acc[0] &= 0x00ffffffffffffff;
654
655
21.1M
    acc[2] += acc[1] >> 56; /* acc[2] < acc_old[2] + 2^72 + 2^16 */
656
21.1M
    acc[1] &= 0x00ffffffffffffff;
657
658
21.1M
    acc[3] += acc[2] >> 56; /* acc[3] < acc_old[3] + 2^72 + 2^16 */
659
21.1M
    acc[2] &= 0x00ffffffffffffff;
660
661
    /*-
662
     * acc[k] < acc_old[k] + 2^72 + 2^16
663
     *        < in[k] + 2^124 + 2^121 + 2^73 + 2^64 + 2^56 + 2^17
664
     *        < in[k] + 2^125
665
     *        < 2^128 , k < 4
666
     */
667
668
21.1M
    acc[4] += acc[3] >> 56; /*-
669
                             * acc[4] < acc_old[4] + 2^72 + 2^16
670
                             *        < 2^72 + 2^56 + 2^16
671
                             */
672
21.1M
    acc[3] &= 0x00ffffffffffffff;
673
674
21.1M
    acc[5] += acc[4] >> 56; /*-
675
                             * acc[5] < acc_old[5] + 2^16 + 1
676
                             *        < 2^56 + 2^16 + 1
677
                             */
678
21.1M
    acc[4] &= 0x00ffffffffffffff;
679
680
21.1M
    acc[6] += acc[5] >> 56; /* acc[6] < 2^48 + 1 <= 2^48 */
681
21.1M
    acc[5] &= 0x00ffffffffffffff;
682
683
168M
    for (i = 0; i < NLIMBS; i++)
684
147M
        out[i] = acc[i];
685
21.1M
}
686
687
static ossl_inline void felem_square_reduce_ref(felem out, const felem in)
688
7.66M
{
689
7.66M
    widefelem tmp;
690
691
7.66M
    felem_square_ref(tmp, in);
692
7.66M
    felem_reduce_ref(out, tmp);
693
7.66M
}
694
695
static ossl_inline void felem_mul_reduce_ref(felem out, const felem in1, const felem in2)
696
6.26M
{
697
6.26M
    widefelem tmp;
698
699
6.26M
    felem_mul_ref(tmp, in1, in2);
700
6.26M
    felem_reduce_ref(out, tmp);
701
6.26M
}
702
703
#if defined(ECP_NISTP384_ASM)
704
static void felem_square_wrapper(widefelem out, const felem in);
705
static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2);
706
707
static void (*felem_square_p)(widefelem out, const felem in) = felem_square_wrapper;
708
static void (*felem_mul_p)(widefelem out, const felem in1, const felem in2) = felem_mul_wrapper;
709
710
static void (*felem_reduce_p)(felem out, const widefelem in) = felem_reduce_ref;
711
712
static void (*felem_square_reduce_p)(felem out, const felem in) = felem_square_reduce_ref;
713
static void (*felem_mul_reduce_p)(felem out, const felem in1, const felem in2) = felem_mul_reduce_ref;
714
715
void p384_felem_square(widefelem out, const felem in);
716
void p384_felem_mul(widefelem out, const felem in1, const felem in2);
717
void p384_felem_reduce(felem out, const widefelem in);
718
719
void p384_felem_square_reduce(felem out, const felem in);
720
void p384_felem_mul_reduce(felem out, const felem in1, const felem in2);
721
722
#if defined(_ARCH_PPC64)
723
#include "crypto/ppc_arch.h"
724
#endif
725
726
static void felem_select(void)
727
{
728
#if defined(_ARCH_PPC64)
729
    if ((OPENSSL_ppccap_P & PPC_MADD300) && (OPENSSL_ppccap_P & PPC_ALTIVEC)) {
730
        felem_square_p = p384_felem_square;
731
        felem_mul_p = p384_felem_mul;
732
        felem_reduce_p = p384_felem_reduce;
733
        felem_square_reduce_p = p384_felem_square_reduce;
734
        felem_mul_reduce_p = p384_felem_mul_reduce;
735
736
        return;
737
    }
738
#endif
739
740
    /* Default */
741
    felem_square_p = felem_square_ref;
742
    felem_mul_p = felem_mul_ref;
743
    felem_reduce_p = felem_reduce_ref;
744
    felem_square_reduce_p = felem_square_reduce_ref;
745
    felem_mul_reduce_p = felem_mul_reduce_ref;
746
}
747
748
static void felem_square_wrapper(widefelem out, const felem in)
749
{
750
    felem_select();
751
    felem_square_p(out, in);
752
}
753
754
static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2)
755
{
756
    felem_select();
757
    felem_mul_p(out, in1, in2);
758
}
759
760
#define felem_square felem_square_p
761
#define felem_mul felem_mul_p
762
#define felem_reduce felem_reduce_p
763
764
#define felem_square_reduce felem_square_reduce_p
765
#define felem_mul_reduce felem_mul_reduce_p
766
#else
767
5.18M
#define felem_square felem_square_ref
768
4.10M
#define felem_mul felem_mul_ref
769
7.19M
#define felem_reduce felem_reduce_ref
770
771
7.66M
#define felem_square_reduce felem_square_reduce_ref
772
6.26M
#define felem_mul_reduce felem_mul_reduce_ref
773
#endif
774
775
/*-
776
 * felem_inv calculates |out| = |in|^{-1}
777
 *
778
 * Based on Fermat's Little Theorem:
779
 *   a^p = a (mod p)
780
 *   a^{p-1} = 1 (mod p)
781
 *   a^{p-2} = a^{-1} (mod p)
782
 */
783
static void felem_inv(felem out, const felem in)
784
8.51k
{
785
8.51k
    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6;
786
8.51k
    unsigned int i = 0;
787
788
8.51k
    felem_square_reduce(ftmp, in); /* 2^1 */
789
8.51k
    felem_mul_reduce(ftmp, ftmp, in); /* 2^1 + 2^0 */
790
8.51k
    felem_assign(ftmp2, ftmp);
791
792
8.51k
    felem_square_reduce(ftmp, ftmp); /* 2^2 + 2^1 */
793
8.51k
    felem_mul_reduce(ftmp, ftmp, in); /* 2^2 + 2^1 * 2^0 */
794
8.51k
    felem_assign(ftmp3, ftmp);
795
796
34.0k
    for (i = 0; i < 3; i++)
797
25.5k
        felem_square_reduce(ftmp, ftmp); /* 2^5 + 2^4 + 2^3 */
798
8.51k
    felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^5 + 2^4 + 2^3 + 2^2 + 2^1 + 2^0 */
799
8.51k
    felem_assign(ftmp4, ftmp);
800
801
59.6k
    for (i = 0; i < 6; i++)
802
51.0k
        felem_square_reduce(ftmp, ftmp); /* 2^11 + ... + 2^6 */
803
8.51k
    felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^11 + ... + 2^0 */
804
805
34.0k
    for (i = 0; i < 3; i++)
806
25.5k
        felem_square_reduce(ftmp, ftmp); /* 2^14 + ... + 2^3 */
807
8.51k
    felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^14 + ... + 2^0 */
808
8.51k
    felem_assign(ftmp5, ftmp);
809
810
136k
    for (i = 0; i < 15; i++)
811
127k
        felem_square_reduce(ftmp, ftmp); /* 2^29 + ... + 2^15 */
812
8.51k
    felem_mul_reduce(ftmp, ftmp5, ftmp); /* 2^29 + ... + 2^0 */
813
8.51k
    felem_assign(ftmp6, ftmp);
814
815
263k
    for (i = 0; i < 30; i++)
816
255k
        felem_square_reduce(ftmp, ftmp); /* 2^59 + ... + 2^30 */
817
8.51k
    felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^59 + ... + 2^0 */
818
8.51k
    felem_assign(ftmp4, ftmp);
819
820
519k
    for (i = 0; i < 60; i++)
821
510k
        felem_square_reduce(ftmp, ftmp); /* 2^119 + ... + 2^60 */
822
8.51k
    felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^119 + ... + 2^0 */
823
8.51k
    felem_assign(ftmp4, ftmp);
824
825
1.03M
    for (i = 0; i < 120; i++)
826
1.02M
        felem_square_reduce(ftmp, ftmp); /* 2^239 + ... + 2^120 */
827
8.51k
    felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^239 + ... + 2^0 */
828
829
136k
    for (i = 0; i < 15; i++)
830
127k
        felem_square_reduce(ftmp, ftmp); /* 2^254 + ... + 2^15 */
831
8.51k
    felem_mul_reduce(ftmp, ftmp5, ftmp); /* 2^254 + ... + 2^0 */
832
833
272k
    for (i = 0; i < 31; i++)
834
263k
        felem_square_reduce(ftmp, ftmp); /* 2^285 + ... + 2^31 */
835
8.51k
    felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^285 + ... + 2^31 + 2^29 + ... + 2^0 */
836
837
25.5k
    for (i = 0; i < 2; i++)
838
17.0k
        felem_square_reduce(ftmp, ftmp); /* 2^287 + ... + 2^33 + 2^31 + ... + 2^2 */
839
8.51k
    felem_mul_reduce(ftmp, ftmp2, ftmp); /* 2^287 + ... + 2^33 + 2^31 + ... + 2^0 */
840
841
808k
    for (i = 0; i < 94; i++)
842
800k
        felem_square_reduce(ftmp, ftmp); /* 2^381 + ... + 2^127 + 2^125 + ... + 2^94 */
843
8.51k
    felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^381 + ... + 2^127 + 2^125 + ... + 2^94 + 2^29 + ... + 2^0 */
844
845
25.5k
    for (i = 0; i < 2; i++)
846
17.0k
        felem_square_reduce(ftmp, ftmp); /* 2^383 + ... + 2^129 + 2^127 + ... + 2^96 + 2^31 + ... + 2^2 */
847
8.51k
    felem_mul_reduce(ftmp, in, ftmp); /* 2^383 + ... + 2^129 + 2^127 + ... + 2^96 + 2^31 + ... + 2^2 + 2^0 */
848
849
8.51k
    memcpy(out, ftmp, sizeof(felem));
850
8.51k
}
851
852
/*
853
 * Zero-check: returns a limb with all bits set if |in| == 0 (mod p)
854
 * and 0 otherwise. We know that field elements are reduced to
855
 * 0 < in < 2p, so we only need to check two cases:
856
 * 0 and 2^384 - 2^128 - 2^96 + 2^32 - 1
857
 *   in[k] < 2^56, k < 6
858
 *   in[6] <= 2^48
859
 */
860
static limb felem_is_zero(const felem in)
861
2.64M
{
862
2.64M
    limb zero, p384;
863
864
2.64M
    zero = in[0] | in[1] | in[2] | in[3] | in[4] | in[5] | in[6];
865
2.64M
    zero = ((int64_t)(zero)-1) >> 63;
866
2.64M
    p384 = (in[0] ^ 0x000000ffffffff) | (in[1] ^ 0xffff0000000000)
867
2.64M
        | (in[2] ^ 0xfffffffffeffff) | (in[3] ^ 0xffffffffffffff)
868
2.64M
        | (in[4] ^ 0xffffffffffffff) | (in[5] ^ 0xffffffffffffff)
869
2.64M
        | (in[6] ^ 0xffffffffffff);
870
2.64M
    p384 = ((int64_t)(p384)-1) >> 63;
871
872
2.64M
    return (zero | p384);
873
2.64M
}
874
875
static int felem_is_zero_int(const void *in)
876
0
{
877
0
    return (int)(felem_is_zero(in) & ((limb)1));
878
0
}
879
880
/*-
881
 * felem_contract converts |in| to its unique, minimal representation.
882
 * Assume we've removed all redundant bits.
883
 * On entry:
884
 *   in[k] < 2^56, k < 6
885
 *   in[6] <= 2^48
886
 */
887
static void felem_contract(felem out, const felem in)
888
38.3k
{
889
38.3k
    static const int64_t two56 = ((limb)1) << 56;
890
891
    /*
892
     * We know for a fact that 0 <= |in| < 2*p, for p = 2^384 - 2^128 - 2^96 + 2^32 - 1
893
     * Perform two successive, idempotent subtractions to reduce if |in| >= p.
894
     */
895
896
38.3k
    int64_t tmp[NLIMBS], cond[5], a;
897
38.3k
    unsigned int i;
898
899
38.3k
    memcpy(tmp, in, sizeof(felem));
900
901
    /* Case 1: a = 1 iff |in| >= 2^384 */
902
38.3k
    a = (in[6] >> 48);
903
38.3k
    tmp[0] += a;
904
38.3k
    tmp[0] -= a << 32;
905
38.3k
    tmp[1] += a << 40;
906
38.3k
    tmp[2] += a << 16;
907
38.3k
    tmp[6] &= 0x0000ffffffffffff;
908
909
    /*
910
     * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
911
     * non-zero, so we only need one step
912
     */
913
914
38.3k
    a = tmp[0] >> 63;
915
38.3k
    tmp[0] += a & two56;
916
38.3k
    tmp[1] -= a & 1;
917
918
    /* Carry 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
919
38.3k
    tmp[2] += tmp[1] >> 56;
920
38.3k
    tmp[1] &= 0x00ffffffffffffff;
921
922
38.3k
    tmp[3] += tmp[2] >> 56;
923
38.3k
    tmp[2] &= 0x00ffffffffffffff;
924
925
38.3k
    tmp[4] += tmp[3] >> 56;
926
38.3k
    tmp[3] &= 0x00ffffffffffffff;
927
928
38.3k
    tmp[5] += tmp[4] >> 56;
929
38.3k
    tmp[4] &= 0x00ffffffffffffff;
930
931
38.3k
    tmp[6] += tmp[5] >> 56; /* tmp[6] < 2^48 */
932
38.3k
    tmp[5] &= 0x00ffffffffffffff;
933
934
    /*
935
     * Case 2: a = all ones if p <= |in| < 2^384, 0 otherwise
936
     */
937
938
    /* 0 iff (2^129..2^383) are all one */
939
38.3k
    cond[0] = ((tmp[6] | 0xff000000000000) & tmp[5] & tmp[4] & tmp[3] & (tmp[2] | 0x0000000001ffff)) + 1;
940
    /* 0 iff 2^128 bit is one */
941
38.3k
    cond[1] = (tmp[2] | ~0x00000000010000) + 1;
942
    /* 0 iff (2^96..2^127) bits are all one */
943
38.3k
    cond[2] = ((tmp[2] | 0xffffffffff0000) & (tmp[1] | 0x0000ffffffffff)) + 1;
944
    /* 0 iff (2^32..2^95) bits are all zero */
945
38.3k
    cond[3] = (tmp[1] & ~0xffff0000000000) | (tmp[0] & ~((int64_t)0x000000ffffffff));
946
    /* 0 iff (2^0..2^31) bits are all one */
947
38.3k
    cond[4] = (tmp[0] | 0xffffff00000000) + 1;
948
949
    /*
950
     * In effect, invert our conditions, so that 0 values become all 1's,
951
     * any non-zero value in the low-order 56 bits becomes all 0's
952
     */
953
230k
    for (i = 0; i < 5; i++)
954
191k
        cond[i] = ((cond[i] & 0x00ffffffffffffff) - 1) >> 63;
955
956
    /*
957
     * The condition for determining whether in is greater than our
958
     * prime is given by the following condition.
959
     */
960
961
    /* First subtract 2^384 - 2^129 cheaply */
962
38.3k
    a = cond[0] & (cond[1] | (cond[2] & (~cond[3] | cond[4])));
963
38.3k
    tmp[6] &= ~a;
964
38.3k
    tmp[5] &= ~a;
965
38.3k
    tmp[4] &= ~a;
966
38.3k
    tmp[3] &= ~a;
967
38.3k
    tmp[2] &= ~a | 0x0000000001ffff;
968
969
    /*
970
     * Subtract 2^128 - 2^96 by
971
     * means of disjoint cases.
972
     */
973
974
    /* subtract 2^128 if that bit is present, and add 2^96 */
975
38.3k
    a = cond[0] & cond[1];
976
38.3k
    tmp[2] &= ~a | 0xfffffffffeffff;
977
38.3k
    tmp[1] += a & ((int64_t)1 << 40);
978
979
    /* otherwise, clear bits 2^127 .. 2^96  */
980
38.3k
    a = cond[0] & ~cond[1] & (cond[2] & (~cond[3] | cond[4]));
981
38.3k
    tmp[2] &= ~a | 0xffffffffff0000;
982
38.3k
    tmp[1] &= ~a | 0x0000ffffffffff;
983
984
    /* finally, subtract the last 2^32 - 1 */
985
38.3k
    a = cond[0] & (cond[1] | (cond[2] & (~cond[3] | cond[4])));
986
38.3k
    tmp[0] += a & (-((int64_t)1 << 32) + 1);
987
988
    /*
989
     * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
990
     * non-zero, so we only need one step
991
     */
992
38.3k
    a = tmp[0] >> 63;
993
38.3k
    tmp[0] += a & two56;
994
38.3k
    tmp[1] -= a & 1;
995
996
    /* Carry 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
997
38.3k
    tmp[2] += tmp[1] >> 56;
998
38.3k
    tmp[1] &= 0x00ffffffffffffff;
999
1000
38.3k
    tmp[3] += tmp[2] >> 56;
1001
38.3k
    tmp[2] &= 0x00ffffffffffffff;
1002
1003
38.3k
    tmp[4] += tmp[3] >> 56;
1004
38.3k
    tmp[3] &= 0x00ffffffffffffff;
1005
1006
38.3k
    tmp[5] += tmp[4] >> 56;
1007
38.3k
    tmp[4] &= 0x00ffffffffffffff;
1008
1009
38.3k
    tmp[6] += tmp[5] >> 56;
1010
38.3k
    tmp[5] &= 0x00ffffffffffffff;
1011
1012
38.3k
    memcpy(out, tmp, sizeof(felem));
1013
38.3k
}
1014
1015
/*-
1016
 * Group operations
1017
 * ----------------
1018
 *
1019
 * Building on top of the field operations we have the operations on the
1020
 * elliptic curve group itself. Points on the curve are represented in Jacobian
1021
 * coordinates
1022
 */
1023
1024
/*-
1025
 * point_double calculates 2*(x_in, y_in, z_in)
1026
 *
1027
 * The method is taken from:
1028
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1029
 *
1030
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1031
 * while x_out == y_in is not (maybe this works, but it's not tested).
1032
 */
1033
static void
1034
point_double(felem x_out, felem y_out, felem z_out,
1035
    const felem x_in, const felem y_in, const felem z_in)
1036
1.43M
{
1037
1.43M
    widefelem tmp, tmp2;
1038
1.43M
    felem delta, gamma, beta, alpha, ftmp, ftmp2;
1039
1040
1.43M
    felem_assign(ftmp, x_in);
1041
1.43M
    felem_assign(ftmp2, x_in);
1042
1043
    /* delta = z^2 */
1044
1.43M
    felem_square_reduce(delta, z_in); /* delta[i] < 2^56 */
1045
1046
    /* gamma = y^2 */
1047
1.43M
    felem_square_reduce(gamma, y_in); /* gamma[i] < 2^56 */
1048
1049
    /* beta = x*gamma */
1050
1.43M
    felem_mul_reduce(beta, x_in, gamma); /* beta[i] < 2^56 */
1051
1052
    /* alpha = 3*(x-delta)*(x+delta) */
1053
1.43M
    felem_diff64(ftmp, delta); /* ftmp[i] < 2^60 + 2^58 + 2^44 */
1054
1.43M
    felem_sum64(ftmp2, delta); /* ftmp2[i] < 2^59 */
1055
1.43M
    felem_scalar64(ftmp2, 3); /* ftmp2[i] < 2^61 */
1056
1.43M
    felem_mul_reduce(alpha, ftmp, ftmp2); /* alpha[i] < 2^56 */
1057
1058
    /* x' = alpha^2 - 8*beta */
1059
1.43M
    felem_square(tmp, alpha); /* tmp[i] < 2^115 */
1060
1.43M
    felem_assign(ftmp, beta); /* ftmp[i] < 2^56 */
1061
1.43M
    felem_scalar64(ftmp, 8); /* ftmp[i] < 2^59 */
1062
1.43M
    felem_diff_128_64(tmp, ftmp); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1063
1.43M
    felem_reduce(x_out, tmp); /* x_out[i] < 2^56 */
1064
1065
    /* z' = (y + z)^2 - gamma - delta */
1066
1.43M
    felem_sum64(delta, gamma); /* delta[i] < 2^57 */
1067
1.43M
    felem_assign(ftmp, y_in); /* ftmp[i] < 2^56 */
1068
1.43M
    felem_sum64(ftmp, z_in); /* ftmp[i] < 2^56 */
1069
1.43M
    felem_square(tmp, ftmp); /* tmp[i] < 2^115 */
1070
1.43M
    felem_diff_128_64(tmp, delta); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1071
1.43M
    felem_reduce(z_out, tmp); /* z_out[i] < 2^56 */
1072
1073
    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1074
1.43M
    felem_scalar64(beta, 4); /* beta[i] < 2^58 */
1075
1.43M
    felem_diff64(beta, x_out); /* beta[i] < 2^60 + 2^58 + 2^44 */
1076
1.43M
    felem_mul(tmp, alpha, beta); /* tmp[i] < 2^119 */
1077
1.43M
    felem_square(tmp2, gamma); /* tmp2[i] < 2^115 */
1078
1.43M
    felem_scalar128(tmp2, 8); /* tmp2[i] < 2^118 */
1079
1.43M
    felem_diff128(tmp, tmp2); /* tmp[i] < 2^127 + 2^119 + 2^111 */
1080
1.43M
    felem_reduce(y_out, tmp); /* tmp[i] < 2^56 */
1081
1.43M
}
1082
1083
/* copy_conditional copies in to out iff mask is all ones. */
1084
static void copy_conditional(felem out, const felem in, limb mask)
1085
4.16M
{
1086
4.16M
    unsigned int i;
1087
1088
33.3M
    for (i = 0; i < NLIMBS; i++)
1089
29.1M
        out[i] ^= mask & (in[i] ^ out[i]);
1090
4.16M
}
1091
1092
/*-
1093
 * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1094
 *
1095
 * The method is taken from
1096
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1097
 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1098
 *
1099
 * This function includes a branch for checking whether the two input points
1100
 * are equal (while not equal to the point at infinity). See comment below
1101
 * on constant-time.
1102
 */
1103
static void point_add(felem x3, felem y3, felem z3,
1104
    const felem x1, const felem y1, const felem z1,
1105
    const int mixed, const felem x2, const felem y2,
1106
    const felem z2)
1107
661k
{
1108
661k
    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1109
661k
    widefelem tmp, tmp2;
1110
661k
    limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1111
661k
    limb points_equal;
1112
1113
661k
    z1_is_zero = felem_is_zero(z1);
1114
661k
    z2_is_zero = felem_is_zero(z2);
1115
1116
    /* ftmp = z1z1 = z1**2 */
1117
661k
    felem_square_reduce(ftmp, z1); /* ftmp[i] < 2^56 */
1118
1119
661k
    if (!mixed) {
1120
        /* ftmp2 = z2z2 = z2**2 */
1121
210k
        felem_square_reduce(ftmp2, z2); /* ftmp2[i] < 2^56 */
1122
1123
        /* u1 = ftmp3 = x1*z2z2 */
1124
210k
        felem_mul_reduce(ftmp3, x1, ftmp2); /* ftmp3[i] < 2^56 */
1125
1126
        /* ftmp5 = z1 + z2 */
1127
210k
        felem_assign(ftmp5, z1); /* ftmp5[i] < 2^56 */
1128
210k
        felem_sum64(ftmp5, z2); /* ftmp5[i] < 2^57 */
1129
1130
        /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1131
210k
        felem_square(tmp, ftmp5); /* tmp[i] < 2^117 */
1132
210k
        felem_diff_128_64(tmp, ftmp); /* tmp[i] < 2^117 + 2^64 + 2^48 */
1133
210k
        felem_diff_128_64(tmp, ftmp2); /* tmp[i] < 2^117 + 2^65 + 2^49 */
1134
210k
        felem_reduce(ftmp5, tmp); /* ftmp5[i] < 2^56 */
1135
1136
        /* ftmp2 = z2 * z2z2 */
1137
210k
        felem_mul_reduce(ftmp2, ftmp2, z2); /* ftmp2[i] < 2^56 */
1138
1139
        /* s1 = ftmp6 = y1 * z2**3 */
1140
210k
        felem_mul_reduce(ftmp6, y1, ftmp2); /* ftmp6[i] < 2^56 */
1141
451k
    } else {
1142
        /*
1143
         * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1144
         */
1145
1146
        /* u1 = ftmp3 = x1*z2z2 */
1147
451k
        felem_assign(ftmp3, x1); /* ftmp3[i] < 2^56 */
1148
1149
        /* ftmp5 = 2*z1z2 */
1150
451k
        felem_scalar(ftmp5, z1, 2); /* ftmp5[i] < 2^57 */
1151
1152
        /* s1 = ftmp6 = y1 * z2**3 */
1153
451k
        felem_assign(ftmp6, y1); /* ftmp6[i] < 2^56 */
1154
451k
    }
1155
    /* ftmp3[i] < 2^56, ftmp5[i] < 2^57, ftmp6[i] < 2^56 */
1156
1157
    /* u2 = x2*z1z1 */
1158
661k
    felem_mul(tmp, x2, ftmp); /* tmp[i] < 2^115 */
1159
1160
    /* h = ftmp4 = u2 - u1 */
1161
661k
    felem_diff_128_64(tmp, ftmp3); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1162
661k
    felem_reduce(ftmp4, tmp); /* ftmp[4] < 2^56 */
1163
1164
661k
    x_equal = felem_is_zero(ftmp4);
1165
1166
    /* z_out = ftmp5 * h */
1167
661k
    felem_mul_reduce(z_out, ftmp5, ftmp4); /* z_out[i] < 2^56 */
1168
1169
    /* ftmp = z1 * z1z1 */
1170
661k
    felem_mul_reduce(ftmp, ftmp, z1); /* ftmp[i] < 2^56 */
1171
1172
    /* s2 = tmp = y2 * z1**3 */
1173
661k
    felem_mul(tmp, y2, ftmp); /* tmp[i] < 2^115 */
1174
1175
    /* r = ftmp5 = (s2 - s1)*2 */
1176
661k
    felem_diff_128_64(tmp, ftmp6); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1177
661k
    felem_reduce(ftmp5, tmp); /* ftmp5[i] < 2^56 */
1178
661k
    y_equal = felem_is_zero(ftmp5);
1179
661k
    felem_scalar64(ftmp5, 2); /* ftmp5[i] < 2^57 */
1180
1181
    /*
1182
     * The formulae are incorrect if the points are equal, in affine coordinates
1183
     * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
1184
     * happens.
1185
     *
1186
     * We use bitwise operations to avoid potential side-channels introduced by
1187
     * the short-circuiting behaviour of boolean operators.
1188
     *
1189
     * The special case of either point being the point at infinity (z1 and/or
1190
     * z2 are zero), is handled separately later on in this function, so we
1191
     * avoid jumping to point_double here in those special cases.
1192
     *
1193
     * Notice the comment below on the implications of this branching for timing
1194
     * leaks and why it is considered practically irrelevant.
1195
     */
1196
661k
    points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));
1197
1198
661k
    if (points_equal) {
1199
        /*
1200
         * This is obviously not constant-time but it will almost-never happen
1201
         * for ECDH / ECDSA.
1202
         */
1203
0
        point_double(x3, y3, z3, x1, y1, z1);
1204
0
        return;
1205
0
    }
1206
1207
    /* I = ftmp = (2h)**2 */
1208
661k
    felem_assign(ftmp, ftmp4); /* ftmp[i] < 2^56 */
1209
661k
    felem_scalar64(ftmp, 2); /* ftmp[i] < 2^57 */
1210
661k
    felem_square_reduce(ftmp, ftmp); /* ftmp[i] < 2^56 */
1211
1212
    /* J = ftmp2 = h * I */
1213
661k
    felem_mul_reduce(ftmp2, ftmp4, ftmp); /* ftmp2[i] < 2^56 */
1214
1215
    /* V = ftmp4 = U1 * I */
1216
661k
    felem_mul_reduce(ftmp4, ftmp3, ftmp); /* ftmp4[i] < 2^56 */
1217
1218
    /* x_out = r**2 - J - 2V */
1219
661k
    felem_square(tmp, ftmp5); /* tmp[i] < 2^117 */
1220
661k
    felem_diff_128_64(tmp, ftmp2); /* tmp[i] < 2^117 + 2^64 + 2^48 */
1221
661k
    felem_assign(ftmp3, ftmp4); /* ftmp3[i] < 2^56 */
1222
661k
    felem_scalar64(ftmp4, 2); /* ftmp4[i] < 2^57 */
1223
661k
    felem_diff_128_64(tmp, ftmp4); /* tmp[i] < 2^117 + 2^65 + 2^49 */
1224
661k
    felem_reduce(x_out, tmp); /* x_out[i] < 2^56 */
1225
1226
    /* y_out = r(V-x_out) - 2 * s1 * J */
1227
661k
    felem_diff64(ftmp3, x_out); /* ftmp3[i] < 2^60 + 2^56 + 2^44 */
1228
661k
    felem_mul(tmp, ftmp5, ftmp3); /* tmp[i] < 2^116 */
1229
661k
    felem_mul(tmp2, ftmp6, ftmp2); /* tmp2[i] < 2^115 */
1230
661k
    felem_scalar128(tmp2, 2); /* tmp2[i] < 2^116 */
1231
661k
    felem_diff128(tmp, tmp2); /* tmp[i] < 2^127 + 2^116 + 2^111 */
1232
661k
    felem_reduce(y_out, tmp); /* y_out[i] < 2^56 */
1233
1234
661k
    copy_conditional(x_out, x2, z1_is_zero);
1235
661k
    copy_conditional(x_out, x1, z2_is_zero);
1236
661k
    copy_conditional(y_out, y2, z1_is_zero);
1237
661k
    copy_conditional(y_out, y1, z2_is_zero);
1238
661k
    copy_conditional(z_out, z2, z1_is_zero);
1239
661k
    copy_conditional(z_out, z1, z2_is_zero);
1240
661k
    felem_assign(x3, x_out);
1241
661k
    felem_assign(y3, y_out);
1242
661k
    felem_assign(z3, z_out);
1243
661k
}
1244
1245
/*-
1246
 * Base point pre computation
1247
 * --------------------------
1248
 *
1249
 * Two different sorts of precomputed tables are used in the following code.
1250
 * Each contain various points on the curve, where each point is three field
1251
 * elements (x, y, z).
1252
 *
1253
 * For the base point table, z is usually 1 (0 for the point at infinity).
1254
 * This table has 16 elements:
1255
 * index | bits    | point
1256
 * ------+---------+------------------------------
1257
 *     0 | 0 0 0 0 | 0G
1258
 *     1 | 0 0 0 1 | 1G
1259
 *     2 | 0 0 1 0 | 2^95G
1260
 *     3 | 0 0 1 1 | (2^95 + 1)G
1261
 *     4 | 0 1 0 0 | 2^190G
1262
 *     5 | 0 1 0 1 | (2^190 + 1)G
1263
 *     6 | 0 1 1 0 | (2^190 + 2^95)G
1264
 *     7 | 0 1 1 1 | (2^190 + 2^95 + 1)G
1265
 *     8 | 1 0 0 0 | 2^285G
1266
 *     9 | 1 0 0 1 | (2^285 + 1)G
1267
 *    10 | 1 0 1 0 | (2^285 + 2^95)G
1268
 *    11 | 1 0 1 1 | (2^285 + 2^95 + 1)G
1269
 *    12 | 1 1 0 0 | (2^285 + 2^190)G
1270
 *    13 | 1 1 0 1 | (2^285 + 2^190 + 1)G
1271
 *    14 | 1 1 1 0 | (2^285 + 2^190 + 2^95)G
1272
 *    15 | 1 1 1 1 | (2^285 + 2^190 + 2^95 + 1)G
1273
 *
1274
 * The reason for this is so that we can clock bits into four different
1275
 * locations when doing simple scalar multiplies against the base point.
1276
 *
1277
 * Tables for other points have table[i] = iG for i in 0 .. 16.
1278
 */
1279
1280
/* gmul is the table of precomputed base points */
1281
static const felem gmul[16][3] = {
1282
    { { 0, 0, 0, 0, 0, 0, 0 },
1283
        { 0, 0, 0, 0, 0, 0, 0 },
1284
        { 0, 0, 0, 0, 0, 0, 0 } },
1285
    { { 0x00545e3872760ab7, 0x00f25dbf55296c3a, 0x00e082542a385502, 0x008ba79b9859f741,
1286
          0x0020ad746e1d3b62, 0x0005378eb1c71ef3, 0x0000aa87ca22be8b },
1287
        { 0x00431d7c90ea0e5f, 0x00b1ce1d7e819d7a, 0x0013b5f0b8c00a60, 0x00289a147ce9da31,
1288
            0x0092dc29f8f41dbd, 0x002c6f5d9e98bf92, 0x00003617de4a9626 },
1289
        { 1, 0, 0, 0, 0, 0, 0 } },
1290
    { { 0x00024711cc902a90, 0x00acb2e579ab4fe1, 0x00af818a4b4d57b1, 0x00a17c7bec49c3de,
1291
          0x004280482d726a8b, 0x00128dd0f0a90f3b, 0x00004387c1c3fa3c },
1292
        { 0x002ce76543cf5c3a, 0x00de6cee5ef58f0a, 0x00403e42fa561ca6, 0x00bc54d6f9cb9731,
1293
            0x007155f925fb4ff1, 0x004a9ce731b7b9bc, 0x00002609076bd7b2 },
1294
        { 1, 0, 0, 0, 0, 0, 0 } },
1295
    { { 0x00e74c9182f0251d, 0x0039bf54bb111974, 0x00b9d2f2eec511d2, 0x0036b1594eb3a6a4,
1296
          0x00ac3bb82d9d564b, 0x00f9313f4615a100, 0x00006716a9a91b10 },
1297
        { 0x0046698116e2f15c, 0x00f34347067d3d33, 0x008de4ccfdebd002, 0x00e838c6b8e8c97b,
1298
            0x006faf0798def346, 0x007349794a57563c, 0x00002629e7e6ad84 },
1299
        { 1, 0, 0, 0, 0, 0, 0 } },
1300
    { { 0x0075300e34fd163b, 0x0092e9db4e8d0ad3, 0x00254be9f625f760, 0x00512c518c72ae68,
1301
          0x009bfcf162bede5a, 0x00bf9341566ce311, 0x0000cd6175bd41cf },
1302
        { 0x007dfe52af4ac70f, 0x0002159d2d5c4880, 0x00b504d16f0af8d0, 0x0014585e11f5e64c,
1303
            0x0089c6388e030967, 0x00ffb270cbfa5f71, 0x00009a15d92c3947 },
1304
        { 1, 0, 0, 0, 0, 0, 0 } },
1305
    { { 0x0033fc1278dc4fe5, 0x00d53088c2caa043, 0x0085558827e2db66, 0x00c192bef387b736,
1306
          0x00df6405a2225f2c, 0x0075205aa90fd91a, 0x0000137e3f12349d },
1307
        { 0x00ce5b115efcb07e, 0x00abc3308410deeb, 0x005dc6fc1de39904, 0x00907c1c496f36b4,
1308
            0x0008e6ad3926cbe1, 0x00110747b787928c, 0x0000021b9162eb7e },
1309
        { 1, 0, 0, 0, 0, 0, 0 } },
1310
    { { 0x008180042cfa26e1, 0x007b826a96254967, 0x0082473694d6b194, 0x007bd6880a45b589,
1311
          0x00c0a5097072d1a3, 0x0019186555e18b4e, 0x000020278190e5ca },
1312
        { 0x00b4bef17de61ac0, 0x009535e3c38ed348, 0x002d4aa8e468ceab, 0x00ef40b431036ad3,
1313
            0x00defd52f4542857, 0x0086edbf98234266, 0x00002025b3a7814d },
1314
        { 1, 0, 0, 0, 0, 0, 0 } },
1315
    { { 0x00b238aa97b886be, 0x00ef3192d6dd3a32, 0x0079f9e01fd62df8, 0x00742e890daba6c5,
1316
          0x008e5289144408ce, 0x0073bbcc8e0171a5, 0x0000c4fd329d3b52 },
1317
        { 0x00c6f64a15ee23e7, 0x00dcfb7b171cad8b, 0x00039f6cbd805867, 0x00de024e428d4562,
1318
            0x00be6a594d7c64c5, 0x0078467b70dbcd64, 0x0000251f2ed7079b },
1319
        { 1, 0, 0, 0, 0, 0, 0 } },
1320
    { { 0x000e5cc25fc4b872, 0x005ebf10d31ef4e1, 0x0061e0ebd11e8256, 0x0076e026096f5a27,
1321
          0x0013e6fc44662e9a, 0x0042b00289d3597e, 0x000024f089170d88 },
1322
        { 0x001604d7e0effbe6, 0x0048d77cba64ec2c, 0x008166b16da19e36, 0x006b0d1a0f28c088,
1323
            0x000259fcd47754fd, 0x00cc643e4d725f9a, 0x00007b10f3c79c14 },
1324
        { 1, 0, 0, 0, 0, 0, 0 } },
1325
    { { 0x00430155e3b908af, 0x00b801e4fec25226, 0x00b0d4bcfe806d26, 0x009fc4014eb13d37,
1326
          0x0066c94e44ec07e8, 0x00d16adc03874ba2, 0x000030c917a0d2a7 },
1327
        { 0x00edac9e21eb891c, 0x00ef0fb768102eff, 0x00c088cef272a5f3, 0x00cbf782134e2964,
1328
            0x0001044a7ba9a0e3, 0x00e363f5b194cf3c, 0x00009ce85249e372 },
1329
        { 1, 0, 0, 0, 0, 0, 0 } },
1330
    { { 0x001dd492dda5a7eb, 0x008fd577be539fd1, 0x002ff4b25a5fc3f1, 0x0074a8a1b64df72f,
1331
          0x002ba3d8c204a76c, 0x009d5cff95c8235a, 0x0000e014b9406e0f },
1332
        { 0x008c2e4dbfc98aba, 0x00f30bb89f1a1436, 0x00b46f7aea3e259c, 0x009224454ac02f54,
1333
            0x00906401f5645fa2, 0x003a1d1940eabc77, 0x00007c9351d680e6 },
1334
        { 1, 0, 0, 0, 0, 0, 0 } },
1335
    { { 0x005a35d872ef967c, 0x0049f1b7884e1987, 0x0059d46d7e31f552, 0x00ceb4869d2d0fb6,
1336
          0x00e8e89eee56802a, 0x0049d806a774aaf2, 0x0000147e2af0ae24 },
1337
        { 0x005fd1bd852c6e5e, 0x00b674b7b3de6885, 0x003b9ea5eb9b6c08, 0x005c9f03babf3ef7,
1338
            0x00605337fecab3c7, 0x009a3f85b11bbcc8, 0x0000455470f330ec },
1339
        { 1, 0, 0, 0, 0, 0, 0 } },
1340
    { { 0x002197ff4d55498d, 0x00383e8916c2d8af, 0x00eb203f34d1c6d2, 0x0080367cbd11b542,
1341
          0x00769b3be864e4f5, 0x0081a8458521c7bb, 0x0000c531b34d3539 },
1342
        { 0x00e2a3d775fa2e13, 0x00534fc379573844, 0x00ff237d2a8db54a, 0x00d301b2335a8882,
1343
            0x000f75ea96103a80, 0x0018fecb3cdd96fa, 0x0000304bf61e94eb },
1344
        { 1, 0, 0, 0, 0, 0, 0 } },
1345
    { { 0x00b2afc332a73dbd, 0x0029a0d5bb007bc5, 0x002d628eb210f577, 0x009f59a36dd05f50,
1346
          0x006d339de4eca613, 0x00c75a71addc86bc, 0x000060384c5ea93c },
1347
        { 0x00aa9641c32a30b4, 0x00cc73ae8cce565d, 0x00ec911a4df07f61, 0x00aa4b762ea4b264,
1348
            0x0096d395bb393629, 0x004efacfb7632fe0, 0x00006f252f46fa3f },
1349
        { 1, 0, 0, 0, 0, 0, 0 } },
1350
    { { 0x00567eec597c7af6, 0x0059ba6795204413, 0x00816d4e6f01196f, 0x004ae6b3eb57951d,
1351
          0x00420f5abdda2108, 0x003401d1f57ca9d9, 0x0000cf5837b0b67a },
1352
        { 0x00eaa64b8aeeabf9, 0x00246ddf16bcb4de, 0x000e7e3c3aecd751, 0x0008449f04fed72e,
1353
            0x00307b67ccf09183, 0x0017108c3556b7b1, 0x0000229b2483b3bf },
1354
        { 1, 0, 0, 0, 0, 0, 0 } },
1355
    { { 0x00e7c491a7bb78a1, 0x00eafddd1d3049ab, 0x00352c05e2bc7c98, 0x003d6880c165fa5c,
1356
          0x00b6ac61cc11c97d, 0x00beeb54fcf90ce5, 0x0000dc1f0b455edc },
1357
        { 0x002db2e7aee34d60, 0x0073b5f415a2d8c0, 0x00dd84e4193e9a0c, 0x00d02d873467c572,
1358
            0x0018baaeda60aee5, 0x0013fb11d697c61e, 0x000083aafcc3a973 },
1359
        { 1, 0, 0, 0, 0, 0, 0 } }
1360
};
1361
1362
/*
1363
 * select_point selects the |idx|th point from a precomputation table and
1364
 * copies it to out.
1365
 *
1366
 * pre_comp below is of the size provided in |size|.
1367
 */
1368
static void select_point(const limb idx, unsigned int size,
1369
    const felem pre_comp[][3], felem out[3])
1370
651k
{
1371
651k
    unsigned int i, j;
1372
651k
    limb *outlimbs = &out[0][0];
1373
1374
651k
    memset(out, 0, sizeof(*out) * 3);
1375
1376
11.2M
    for (i = 0; i < size; i++) {
1377
10.6M
        const limb *inlimbs = &pre_comp[i][0][0];
1378
10.6M
        limb mask = i ^ idx;
1379
1380
10.6M
        mask |= mask >> 4;
1381
10.6M
        mask |= mask >> 2;
1382
10.6M
        mask |= mask >> 1;
1383
10.6M
        mask &= 1;
1384
10.6M
        mask--;
1385
233M
        for (j = 0; j < NLIMBS * 3; j++)
1386
222M
            outlimbs[j] |= inlimbs[j] & mask;
1387
10.6M
    }
1388
651k
}
1389
1390
/* get_bit returns the |i|th bit in |in| */
1391
static char get_bit(const felem_bytearray in, int i)
1392
2.94M
{
1393
2.94M
    if (i < 0 || i >= 384)
1394
5.07k
        return 0;
1395
2.93M
    return (in[i >> 3] >> (i & 7)) & 1;
1396
2.94M
}
1397
1398
/*
1399
 * Interleaved point multiplication using precomputed point multiples: The
1400
 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1401
 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1402
 * generator, using certain (large) precomputed multiples in g_pre_comp.
1403
 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1404
 */
1405
static void batch_mul(felem x_out, felem y_out, felem z_out,
1406
    const felem_bytearray scalars[],
1407
    const unsigned int num_points, const u8 *g_scalar,
1408
    const int mixed, const felem pre_comp[][17][3],
1409
    const felem g_pre_comp[16][3])
1410
7.11k
{
1411
7.11k
    int i, skip;
1412
7.11k
    unsigned int num, gen_mul = (g_scalar != NULL);
1413
7.11k
    felem nq[3], tmp[4];
1414
7.11k
    limb bits;
1415
7.11k
    u8 sign, digit;
1416
1417
    /* set nq to the point at infinity */
1418
7.11k
    memset(nq, 0, sizeof(nq));
1419
1420
    /*
1421
     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1422
     * of the generator (last quarter of rounds) and additions of other
1423
     * points multiples (every 5th round).
1424
     */
1425
7.11k
    skip = 1; /* save two point operations in the first
1426
               * round */
1427
1.42M
    for (i = (num_points ? 380 : 98); i >= 0; --i) {
1428
        /* double */
1429
1.41M
        if (!skip)
1430
1.41M
            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1431
1432
        /* add multiples of the generator */
1433
1.41M
        if (gen_mul && (i <= 98)) {
1434
455k
            bits = get_bit(g_scalar, i + 285) << 3;
1435
455k
            if (i < 95) {
1436
437k
                bits |= get_bit(g_scalar, i + 190) << 2;
1437
437k
                bits |= get_bit(g_scalar, i + 95) << 1;
1438
437k
                bits |= get_bit(g_scalar, i);
1439
437k
            }
1440
            /* select the point to add, in constant time */
1441
455k
            select_point(bits, 16, g_pre_comp, tmp);
1442
455k
            if (!skip) {
1443
                /* The 1 argument below is for "mixed" */
1444
451k
                point_add(nq[0], nq[1], nq[2],
1445
451k
                    nq[0], nq[1], nq[2], 1,
1446
451k
                    tmp[0], tmp[1], tmp[2]);
1447
451k
            } else {
1448
4.57k
                memcpy(nq, tmp, 3 * sizeof(felem));
1449
4.57k
                skip = 0;
1450
4.57k
            }
1451
455k
        }
1452
1453
        /* do other additions every 5 doublings */
1454
1.41M
        if (num_points && (i % 5 == 0)) {
1455
            /* loop over all scalars */
1456
391k
            for (num = 0; num < num_points; ++num) {
1457
195k
                bits = get_bit(scalars[num], i + 4) << 5;
1458
195k
                bits |= get_bit(scalars[num], i + 3) << 4;
1459
195k
                bits |= get_bit(scalars[num], i + 2) << 3;
1460
195k
                bits |= get_bit(scalars[num], i + 1) << 2;
1461
195k
                bits |= get_bit(scalars[num], i) << 1;
1462
195k
                bits |= get_bit(scalars[num], i - 1);
1463
195k
                ossl_ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1464
1465
                /*
1466
                 * select the point to add or subtract, in constant time
1467
                 */
1468
195k
                select_point(digit, 17, pre_comp[num], tmp);
1469
195k
                felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1470
                                            * point */
1471
195k
                copy_conditional(tmp[1], tmp[3], (-(limb)sign));
1472
1473
195k
                if (!skip) {
1474
192k
                    point_add(nq[0], nq[1], nq[2],
1475
192k
                        nq[0], nq[1], nq[2], mixed,
1476
192k
                        tmp[0], tmp[1], tmp[2]);
1477
192k
                } else {
1478
2.53k
                    memcpy(nq, tmp, 3 * sizeof(felem));
1479
2.53k
                    skip = 0;
1480
2.53k
                }
1481
195k
            }
1482
195k
        }
1483
1.41M
    }
1484
7.11k
    felem_assign(x_out, nq[0]);
1485
7.11k
    felem_assign(y_out, nq[1]);
1486
7.11k
    felem_assign(z_out, nq[2]);
1487
7.11k
}
1488
1489
/* Precomputation for the group generator. */
1490
struct nistp384_pre_comp_st {
1491
    felem g_pre_comp[16][3];
1492
    CRYPTO_REF_COUNT references;
1493
};
1494
1495
const EC_METHOD *ossl_ec_GFp_nistp384_method(void)
1496
23.7k
{
1497
23.7k
    static const EC_METHOD ret = {
1498
23.7k
        EC_FLAGS_DEFAULT_OCT,
1499
23.7k
        NID_X9_62_prime_field,
1500
23.7k
        ossl_ec_GFp_nistp384_group_init,
1501
23.7k
        ossl_ec_GFp_simple_group_finish,
1502
23.7k
        ossl_ec_GFp_simple_group_clear_finish,
1503
23.7k
        ossl_ec_GFp_nist_group_copy,
1504
23.7k
        ossl_ec_GFp_nistp384_group_set_curve,
1505
23.7k
        ossl_ec_GFp_simple_group_get_curve,
1506
23.7k
        ossl_ec_GFp_simple_group_get_degree,
1507
23.7k
        ossl_ec_group_simple_order_bits,
1508
23.7k
        ossl_ec_GFp_simple_group_check_discriminant,
1509
23.7k
        ossl_ec_GFp_simple_point_init,
1510
23.7k
        ossl_ec_GFp_simple_point_finish,
1511
23.7k
        ossl_ec_GFp_simple_point_clear_finish,
1512
23.7k
        ossl_ec_GFp_simple_point_copy,
1513
23.7k
        ossl_ec_GFp_simple_point_set_to_infinity,
1514
23.7k
        ossl_ec_GFp_simple_point_set_affine_coordinates,
1515
23.7k
        ossl_ec_GFp_nistp384_point_get_affine_coordinates,
1516
23.7k
        0, /* point_set_compressed_coordinates */
1517
23.7k
        0, /* point2oct */
1518
23.7k
        0, /* oct2point */
1519
23.7k
        ossl_ec_GFp_simple_add,
1520
23.7k
        ossl_ec_GFp_simple_dbl,
1521
23.7k
        ossl_ec_GFp_simple_invert,
1522
23.7k
        ossl_ec_GFp_simple_is_at_infinity,
1523
23.7k
        ossl_ec_GFp_simple_is_on_curve,
1524
23.7k
        ossl_ec_GFp_simple_cmp,
1525
23.7k
        ossl_ec_GFp_simple_make_affine,
1526
23.7k
        ossl_ec_GFp_simple_points_make_affine,
1527
23.7k
        ossl_ec_GFp_nistp384_points_mul,
1528
23.7k
        ossl_ec_GFp_nistp384_precompute_mult,
1529
23.7k
        ossl_ec_GFp_nistp384_have_precompute_mult,
1530
23.7k
        ossl_ec_GFp_nist_field_mul,
1531
23.7k
        ossl_ec_GFp_nist_field_sqr,
1532
23.7k
        0, /* field_div */
1533
23.7k
        ossl_ec_GFp_simple_field_inv,
1534
23.7k
        0, /* field_encode */
1535
23.7k
        0, /* field_decode */
1536
23.7k
        0, /* field_set_to_one */
1537
23.7k
        ossl_ec_key_simple_priv2oct,
1538
23.7k
        ossl_ec_key_simple_oct2priv,
1539
23.7k
        0, /* set private */
1540
23.7k
        ossl_ec_key_simple_generate_key,
1541
23.7k
        ossl_ec_key_simple_check_key,
1542
23.7k
        ossl_ec_key_simple_generate_public_key,
1543
23.7k
        0, /* keycopy */
1544
23.7k
        0, /* keyfinish */
1545
23.7k
        ossl_ecdh_simple_compute_key,
1546
23.7k
        ossl_ecdsa_simple_sign_setup,
1547
23.7k
        ossl_ecdsa_simple_sign_sig,
1548
23.7k
        ossl_ecdsa_simple_verify_sig,
1549
23.7k
        0, /* field_inverse_mod_ord */
1550
23.7k
        0, /* blind_coordinates */
1551
23.7k
        0, /* ladder_pre */
1552
23.7k
        0, /* ladder_step */
1553
23.7k
        0 /* ladder_post */
1554
23.7k
    };
1555
1556
23.7k
    return &ret;
1557
23.7k
}
1558
1559
/******************************************************************************/
1560
/*
1561
 * FUNCTIONS TO MANAGE PRECOMPUTATION
1562
 */
1563
1564
static NISTP384_PRE_COMP *nistp384_pre_comp_new(void)
1565
0
{
1566
0
    NISTP384_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1567
1568
0
    if (ret == NULL)
1569
0
        return ret;
1570
1571
0
    if (!CRYPTO_NEW_REF(&ret->references, 1)) {
1572
0
        OPENSSL_free(ret);
1573
0
        return NULL;
1574
0
    }
1575
0
    return ret;
1576
0
}
1577
1578
NISTP384_PRE_COMP *ossl_ec_nistp384_pre_comp_dup(NISTP384_PRE_COMP *p)
1579
0
{
1580
0
    int i;
1581
1582
0
    if (p != NULL)
1583
0
        CRYPTO_UP_REF(&p->references, &i);
1584
0
    return p;
1585
0
}
1586
1587
void ossl_ec_nistp384_pre_comp_free(NISTP384_PRE_COMP *p)
1588
0
{
1589
0
    int i;
1590
1591
0
    if (p == NULL)
1592
0
        return;
1593
1594
0
    CRYPTO_DOWN_REF(&p->references, &i);
1595
0
    REF_PRINT_COUNT("ossl_ec_nistp384", i, p);
1596
0
    if (i > 0)
1597
0
        return;
1598
0
    REF_ASSERT_ISNT(i < 0);
1599
1600
0
    CRYPTO_FREE_REF(&p->references);
1601
0
    OPENSSL_free(p);
1602
0
}
1603
1604
/******************************************************************************/
1605
/*
1606
 * OPENSSL EC_METHOD FUNCTIONS
1607
 */
1608
1609
int ossl_ec_GFp_nistp384_group_init(EC_GROUP *group)
1610
49.5k
{
1611
49.5k
    int ret;
1612
1613
49.5k
    ret = ossl_ec_GFp_simple_group_init(group);
1614
49.5k
    group->a_is_minus3 = 1;
1615
49.5k
    return ret;
1616
49.5k
}
1617
1618
int ossl_ec_GFp_nistp384_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1619
    const BIGNUM *a, const BIGNUM *b,
1620
    BN_CTX *ctx)
1621
23.7k
{
1622
23.7k
    int ret = 0;
1623
23.7k
    BIGNUM *curve_p, *curve_a, *curve_b;
1624
23.7k
#ifndef FIPS_MODULE
1625
23.7k
    BN_CTX *new_ctx = NULL;
1626
1627
23.7k
    if (ctx == NULL)
1628
0
        ctx = new_ctx = BN_CTX_new();
1629
23.7k
#endif
1630
23.7k
    if (ctx == NULL)
1631
0
        return 0;
1632
1633
23.7k
    BN_CTX_start(ctx);
1634
23.7k
    curve_p = BN_CTX_get(ctx);
1635
23.7k
    curve_a = BN_CTX_get(ctx);
1636
23.7k
    curve_b = BN_CTX_get(ctx);
1637
23.7k
    if (curve_b == NULL)
1638
0
        goto err;
1639
23.7k
    BN_bin2bn(nistp384_curve_params[0], sizeof(felem_bytearray), curve_p);
1640
23.7k
    BN_bin2bn(nistp384_curve_params[1], sizeof(felem_bytearray), curve_a);
1641
23.7k
    BN_bin2bn(nistp384_curve_params[2], sizeof(felem_bytearray), curve_b);
1642
23.7k
    if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1643
0
        ERR_raise(ERR_LIB_EC, EC_R_WRONG_CURVE_PARAMETERS);
1644
0
        goto err;
1645
0
    }
1646
23.7k
    group->field_mod_func = BN_nist_mod_384;
1647
23.7k
    ret = ossl_ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1648
23.7k
err:
1649
23.7k
    BN_CTX_end(ctx);
1650
23.7k
#ifndef FIPS_MODULE
1651
23.7k
    BN_CTX_free(new_ctx);
1652
23.7k
#endif
1653
23.7k
    return ret;
1654
23.7k
}
1655
1656
/*
1657
 * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1658
 * (X/Z^2, Y/Z^3)
1659
 */
1660
int ossl_ec_GFp_nistp384_point_get_affine_coordinates(const EC_GROUP *group,
1661
    const EC_POINT *point,
1662
    BIGNUM *x, BIGNUM *y,
1663
    BN_CTX *ctx)
1664
8.51k
{
1665
8.51k
    felem z1, z2, x_in, y_in, x_out, y_out;
1666
8.51k
    widefelem tmp;
1667
1668
8.51k
    if (EC_POINT_is_at_infinity(group, point)) {
1669
0
        ERR_raise(ERR_LIB_EC, EC_R_POINT_AT_INFINITY);
1670
0
        return 0;
1671
0
    }
1672
8.51k
    if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) || (!BN_to_felem(z1, point->Z)))
1673
0
        return 0;
1674
8.51k
    felem_inv(z2, z1);
1675
8.51k
    felem_square(tmp, z2);
1676
8.51k
    felem_reduce(z1, tmp);
1677
8.51k
    felem_mul(tmp, x_in, z1);
1678
8.51k
    felem_reduce(x_in, tmp);
1679
8.51k
    felem_contract(x_out, x_in);
1680
8.51k
    if (x != NULL) {
1681
8.51k
        if (!felem_to_BN(x, x_out)) {
1682
0
            ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1683
0
            return 0;
1684
0
        }
1685
8.51k
    }
1686
8.51k
    felem_mul(tmp, z1, z2);
1687
8.51k
    felem_reduce(z1, tmp);
1688
8.51k
    felem_mul(tmp, y_in, z1);
1689
8.51k
    felem_reduce(y_in, tmp);
1690
8.51k
    felem_contract(y_out, y_in);
1691
8.51k
    if (y != NULL) {
1692
6.50k
        if (!felem_to_BN(y, y_out)) {
1693
0
            ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1694
0
            return 0;
1695
0
        }
1696
6.50k
    }
1697
8.51k
    return 1;
1698
8.51k
}
1699
1700
/* points below is of size |num|, and tmp_felems is of size |num+1/ */
1701
static void make_points_affine(size_t num, felem points[][3],
1702
    felem tmp_felems[])
1703
0
{
1704
    /*
1705
     * Runs in constant time, unless an input is the point at infinity (which
1706
     * normally shouldn't happen).
1707
     */
1708
0
    ossl_ec_GFp_nistp_points_make_affine_internal(num,
1709
0
        points,
1710
0
        sizeof(felem),
1711
0
        tmp_felems,
1712
0
        (void (*)(void *))felem_one,
1713
0
        felem_is_zero_int,
1714
0
        (void (*)(void *, const void *))
1715
0
            felem_assign,
1716
0
        (void (*)(void *, const void *))
1717
0
            felem_square_reduce,
1718
0
        (void (*)(void *, const void *, const void *))
1719
0
            felem_mul_reduce,
1720
0
        (void (*)(void *, const void *))
1721
0
            felem_inv,
1722
0
        (void (*)(void *, const void *))
1723
0
            felem_contract);
1724
0
}
1725
1726
/*
1727
 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1728
 * values Result is stored in r (r can equal one of the inputs).
1729
 */
1730
int ossl_ec_GFp_nistp384_points_mul(const EC_GROUP *group, EC_POINT *r,
1731
    const BIGNUM *scalar, size_t num,
1732
    const EC_POINT *points[],
1733
    const BIGNUM *scalars[], BN_CTX *ctx)
1734
7.11k
{
1735
7.11k
    int ret = 0;
1736
7.11k
    int j;
1737
7.11k
    int mixed = 0;
1738
7.11k
    BIGNUM *x, *y, *z, *tmp_scalar;
1739
7.11k
    felem_bytearray g_secret;
1740
7.11k
    felem_bytearray *secrets = NULL;
1741
7.11k
    felem(*pre_comp)[17][3] = NULL;
1742
7.11k
    felem *tmp_felems = NULL;
1743
7.11k
    unsigned int i;
1744
7.11k
    int num_bytes;
1745
7.11k
    int have_pre_comp = 0;
1746
7.11k
    size_t num_points = num;
1747
7.11k
    felem x_in, y_in, z_in, x_out, y_out, z_out;
1748
7.11k
    NISTP384_PRE_COMP *pre = NULL;
1749
7.11k
    felem(*g_pre_comp)[3] = NULL;
1750
7.11k
    EC_POINT *generator = NULL;
1751
7.11k
    const EC_POINT *p = NULL;
1752
7.11k
    const BIGNUM *p_scalar = NULL;
1753
1754
7.11k
    BN_CTX_start(ctx);
1755
7.11k
    x = BN_CTX_get(ctx);
1756
7.11k
    y = BN_CTX_get(ctx);
1757
7.11k
    z = BN_CTX_get(ctx);
1758
7.11k
    tmp_scalar = BN_CTX_get(ctx);
1759
7.11k
    if (tmp_scalar == NULL)
1760
0
        goto err;
1761
1762
7.11k
    if (scalar != NULL) {
1763
4.60k
        pre = group->pre_comp.nistp384;
1764
4.60k
        if (pre)
1765
            /* we have precomputation, try to use it */
1766
0
            g_pre_comp = &pre->g_pre_comp[0];
1767
4.60k
        else
1768
            /* try to use the standard precomputation */
1769
4.60k
            g_pre_comp = (felem(*)[3])gmul;
1770
4.60k
        generator = EC_POINT_new(group);
1771
4.60k
        if (generator == NULL)
1772
0
            goto err;
1773
        /* get the generator from precomputation */
1774
4.60k
        if (!felem_to_BN(x, g_pre_comp[1][0]) || !felem_to_BN(y, g_pre_comp[1][1]) || !felem_to_BN(z, g_pre_comp[1][2])) {
1775
0
            ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1776
0
            goto err;
1777
0
        }
1778
4.60k
        if (!ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group,
1779
4.60k
                generator,
1780
4.60k
                x, y, z, ctx))
1781
0
            goto err;
1782
4.60k
        if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1783
            /* precomputation matches generator */
1784
4.60k
            have_pre_comp = 1;
1785
0
        else
1786
            /*
1787
             * we don't have valid precomputation: treat the generator as a
1788
             * random point
1789
             */
1790
0
            num_points++;
1791
4.60k
    }
1792
1793
7.11k
    if (num_points > 0) {
1794
2.53k
        if (num_points >= 2) {
1795
            /*
1796
             * unless we precompute multiples for just one point, converting
1797
             * those into affine form is time well spent
1798
             */
1799
0
            mixed = 1;
1800
0
        }
1801
2.53k
        secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1802
2.53k
        pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1803
2.53k
        if (mixed)
1804
0
            tmp_felems = OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1));
1805
2.53k
        if ((secrets == NULL) || (pre_comp == NULL)
1806
2.53k
            || (mixed && (tmp_felems == NULL)))
1807
0
            goto err;
1808
1809
        /*
1810
         * we treat NULL scalars as 0, and NULL points as points at infinity,
1811
         * i.e., they contribute nothing to the linear combination
1812
         */
1813
5.07k
        for (i = 0; i < num_points; ++i) {
1814
2.53k
            if (i == num) {
1815
                /*
1816
                 * we didn't have a valid precomputation, so we pick the
1817
                 * generator
1818
                 */
1819
0
                p = EC_GROUP_get0_generator(group);
1820
0
                p_scalar = scalar;
1821
2.53k
            } else {
1822
                /* the i^th point */
1823
2.53k
                p = points[i];
1824
2.53k
                p_scalar = scalars[i];
1825
2.53k
            }
1826
2.53k
            if (p_scalar != NULL && p != NULL) {
1827
                /* reduce scalar to 0 <= scalar < 2^384 */
1828
2.53k
                if ((BN_num_bits(p_scalar) > 384)
1829
2.53k
                    || (BN_is_negative(p_scalar))) {
1830
                    /*
1831
                     * this is an unusual input, and we don't guarantee
1832
                     * constant-timeness
1833
                     */
1834
0
                    if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1835
0
                        ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1836
0
                        goto err;
1837
0
                    }
1838
0
                    num_bytes = BN_bn2lebinpad(tmp_scalar,
1839
0
                        secrets[i], sizeof(secrets[i]));
1840
2.53k
                } else {
1841
2.53k
                    num_bytes = BN_bn2lebinpad(p_scalar,
1842
2.53k
                        secrets[i], sizeof(secrets[i]));
1843
2.53k
                }
1844
2.53k
                if (num_bytes < 0) {
1845
0
                    ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1846
0
                    goto err;
1847
0
                }
1848
                /* precompute multiples */
1849
2.53k
                if ((!BN_to_felem(x_out, p->X)) || (!BN_to_felem(y_out, p->Y)) || (!BN_to_felem(z_out, p->Z)))
1850
0
                    goto err;
1851
2.53k
                memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1852
2.53k
                memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1853
2.53k
                memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1854
40.6k
                for (j = 2; j <= 16; ++j) {
1855
38.0k
                    if (j & 1) {
1856
17.7k
                        point_add(pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
1857
17.7k
                            pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2], 0,
1858
17.7k
                            pre_comp[i][j - 1][0], pre_comp[i][j - 1][1], pre_comp[i][j - 1][2]);
1859
20.3k
                    } else {
1860
20.3k
                        point_double(pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
1861
20.3k
                            pre_comp[i][j / 2][0], pre_comp[i][j / 2][1], pre_comp[i][j / 2][2]);
1862
20.3k
                    }
1863
38.0k
                }
1864
2.53k
            }
1865
2.53k
        }
1866
2.53k
        if (mixed)
1867
0
            make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1868
2.53k
    }
1869
1870
    /* the scalar for the generator */
1871
7.11k
    if (scalar != NULL && have_pre_comp) {
1872
4.60k
        memset(g_secret, 0, sizeof(g_secret));
1873
        /* reduce scalar to 0 <= scalar < 2^384 */
1874
4.60k
        if ((BN_num_bits(scalar) > 384) || (BN_is_negative(scalar))) {
1875
            /*
1876
             * this is an unusual input, and we don't guarantee
1877
             * constant-timeness
1878
             */
1879
67
            if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
1880
0
                ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1881
0
                goto err;
1882
0
            }
1883
67
            num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
1884
4.53k
        } else {
1885
4.53k
            num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
1886
4.53k
        }
1887
        /* do the multiplication with generator precomputation */
1888
4.60k
        batch_mul(x_out, y_out, z_out,
1889
4.60k
            (const felem_bytearray(*))secrets, num_points,
1890
4.60k
            g_secret,
1891
4.60k
            mixed, (const felem(*)[17][3])pre_comp,
1892
4.60k
            (const felem(*)[3])g_pre_comp);
1893
4.60k
    } else {
1894
        /* do the multiplication without generator precomputation */
1895
2.50k
        batch_mul(x_out, y_out, z_out,
1896
2.50k
            (const felem_bytearray(*))secrets, num_points,
1897
2.50k
            NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1898
2.50k
    }
1899
    /* reduce the output to its unique minimal representation */
1900
7.11k
    felem_contract(x_in, x_out);
1901
7.11k
    felem_contract(y_in, y_out);
1902
7.11k
    felem_contract(z_in, z_out);
1903
7.11k
    if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) || (!felem_to_BN(z, z_in))) {
1904
0
        ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1905
0
        goto err;
1906
0
    }
1907
7.11k
    ret = ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group, r, x, y, z,
1908
7.11k
        ctx);
1909
1910
7.11k
err:
1911
7.11k
    BN_CTX_end(ctx);
1912
7.11k
    EC_POINT_free(generator);
1913
7.11k
    OPENSSL_free(secrets);
1914
7.11k
    OPENSSL_free(pre_comp);
1915
7.11k
    OPENSSL_free(tmp_felems);
1916
7.11k
    return ret;
1917
7.11k
}
1918
1919
int ossl_ec_GFp_nistp384_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
1920
0
{
1921
0
    int ret = 0;
1922
0
    NISTP384_PRE_COMP *pre = NULL;
1923
0
    int i, j;
1924
0
    BIGNUM *x, *y;
1925
0
    EC_POINT *generator = NULL;
1926
0
    felem tmp_felems[16];
1927
0
#ifndef FIPS_MODULE
1928
0
    BN_CTX *new_ctx = NULL;
1929
0
#endif
1930
1931
    /* throw away old precomputation */
1932
0
    EC_pre_comp_free(group);
1933
1934
0
#ifndef FIPS_MODULE
1935
0
    if (ctx == NULL)
1936
0
        ctx = new_ctx = BN_CTX_new();
1937
0
#endif
1938
0
    if (ctx == NULL)
1939
0
        return 0;
1940
1941
0
    BN_CTX_start(ctx);
1942
0
    x = BN_CTX_get(ctx);
1943
0
    y = BN_CTX_get(ctx);
1944
0
    if (y == NULL)
1945
0
        goto err;
1946
    /* get the generator */
1947
0
    if (group->generator == NULL)
1948
0
        goto err;
1949
0
    generator = EC_POINT_new(group);
1950
0
    if (generator == NULL)
1951
0
        goto err;
1952
0
    BN_bin2bn(nistp384_curve_params[3], sizeof(felem_bytearray), x);
1953
0
    BN_bin2bn(nistp384_curve_params[4], sizeof(felem_bytearray), y);
1954
0
    if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
1955
0
        goto err;
1956
0
    if ((pre = nistp384_pre_comp_new()) == NULL)
1957
0
        goto err;
1958
    /*
1959
     * if the generator is the standard one, use built-in precomputation
1960
     */
1961
0
    if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
1962
0
        memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
1963
0
        goto done;
1964
0
    }
1965
0
    if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) || (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) || (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
1966
0
        goto err;
1967
    /* compute 2^95*G, 2^190*G, 2^285*G */
1968
0
    for (i = 1; i <= 4; i <<= 1) {
1969
0
        point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2],
1970
0
            pre->g_pre_comp[i][0], pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
1971
0
        for (j = 0; j < 94; ++j) {
1972
0
            point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2],
1973
0
                pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2]);
1974
0
        }
1975
0
    }
1976
    /* g_pre_comp[0] is the point at infinity */
1977
0
    memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
1978
    /* the remaining multiples */
1979
    /* 2^95*G + 2^190*G */
1980
0
    point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1], pre->g_pre_comp[6][2],
1981
0
        pre->g_pre_comp[4][0], pre->g_pre_comp[4][1], pre->g_pre_comp[4][2], 0,
1982
0
        pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], pre->g_pre_comp[2][2]);
1983
    /* 2^95*G + 2^285*G */
1984
0
    point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1], pre->g_pre_comp[10][2],
1985
0
        pre->g_pre_comp[8][0], pre->g_pre_comp[8][1], pre->g_pre_comp[8][2], 0,
1986
0
        pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], pre->g_pre_comp[2][2]);
1987
    /* 2^190*G + 2^285*G */
1988
0
    point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
1989
0
        pre->g_pre_comp[8][0], pre->g_pre_comp[8][1], pre->g_pre_comp[8][2], 0,
1990
0
        pre->g_pre_comp[4][0], pre->g_pre_comp[4][1], pre->g_pre_comp[4][2]);
1991
    /* 2^95*G + 2^190*G + 2^285*G */
1992
0
    point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1], pre->g_pre_comp[14][2],
1993
0
        pre->g_pre_comp[12][0], pre->g_pre_comp[12][1], pre->g_pre_comp[12][2], 0,
1994
0
        pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], pre->g_pre_comp[2][2]);
1995
0
    for (i = 1; i < 8; ++i) {
1996
        /* odd multiples: add G */
1997
0
        point_add(pre->g_pre_comp[2 * i + 1][0], pre->g_pre_comp[2 * i + 1][1], pre->g_pre_comp[2 * i + 1][2],
1998
0
            pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
1999
0
            pre->g_pre_comp[1][0], pre->g_pre_comp[1][1], pre->g_pre_comp[1][2]);
2000
0
    }
2001
0
    make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2002
2003
0
done:
2004
0
    SETPRECOMP(group, nistp384, pre);
2005
0
    ret = 1;
2006
0
    pre = NULL;
2007
0
err:
2008
0
    BN_CTX_end(ctx);
2009
0
    EC_POINT_free(generator);
2010
0
#ifndef FIPS_MODULE
2011
0
    BN_CTX_free(new_ctx);
2012
0
#endif
2013
0
    ossl_ec_nistp384_pre_comp_free(pre);
2014
0
    return ret;
2015
0
}
2016
2017
int ossl_ec_GFp_nistp384_have_precompute_mult(const EC_GROUP *group)
2018
0
{
2019
    return HAVEPRECOMP(group, nistp384);
2020
0
}