Coverage Report

Created: 2026-05-30 06:56

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/openssl/crypto/ec/ecp_nistp384.c
Line
Count
Source
1
/*
2
 * Copyright 2023-2025 The OpenSSL Project Authors. All Rights Reserved.
3
 *
4
 * Licensed under the Apache License 2.0 (the "License").  You may not use
5
 * this file except in compliance with the License.  You can obtain a copy
6
 * in the file LICENSE in the source distribution or at
7
 * https://www.openssl.org/source/license.html
8
 */
9
10
/* Copyright 2023 IBM Corp.
11
 *
12
 * Licensed under the Apache License, Version 2.0 (the "License");
13
 *
14
 * you may not use this file except in compliance with the License.
15
 * You may obtain a copy of the License at
16
 *
17
 *     http://www.apache.org/licenses/LICENSE-2.0
18
 *
19
 *  Unless required by applicable law or agreed to in writing, software
20
 *  distributed under the License is distributed on an "AS IS" BASIS,
21
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22
 *  See the License for the specific language governing permissions and
23
 *  limitations under the License.
24
 */
25
26
/*
27
 * Designed for 56-bit limbs by Rohan McLure <rohan.mclure@linux.ibm.com>.
28
 * The layout is based on that of ecp_nistp{224,521}.c, allowing even for asm
29
 * acceleration of felem_{square,mul} as supported in these files.
30
 */
31
32
#include <openssl/e_os2.h>
33
34
#include <string.h>
35
#include <openssl/err.h>
36
#include "ec_local.h"
37
38
#include "internal/numbers.h"
39
40
#ifndef INT128_MAX
41
#error "Your compiler doesn't appear to support 128-bit integer types"
42
#endif
43
44
/*
45
 * The underlying field. P384 operates over GF(2^384-2^128-2^96+2^32-1). We
46
 * can serialize an element of this field into 48 bytes. We call this an
47
 * felem_bytearray.
48
 */
49
50
typedef uint8_t felem_bytearray[48];
51
52
/*
53
 * These are the parameters of P384, taken from FIPS 186-3, section D.1.2.4.
54
 * These values are big-endian.
55
 */
56
static const felem_bytearray nistp384_curve_params[5] = {
57
    { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* p */
58
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
59
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
60
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF },
61
    { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* a = -3 */
62
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
63
        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
64
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFC },
65
    { 0xB3, 0x31, 0x2F, 0xA7, 0xE2, 0x3E, 0xE7, 0xE4, 0x98, 0x8E, 0x05, 0x6B, /* b */
66
        0xE3, 0xF8, 0x2D, 0x19, 0x18, 0x1D, 0x9C, 0x6E, 0xFE, 0x81, 0x41, 0x12,
67
        0x03, 0x14, 0x08, 0x8F, 0x50, 0x13, 0x87, 0x5A, 0xC6, 0x56, 0x39, 0x8D,
68
        0x8A, 0x2E, 0xD1, 0x9D, 0x2A, 0x85, 0xC8, 0xED, 0xD3, 0xEC, 0x2A, 0xEF },
69
    { 0xAA, 0x87, 0xCA, 0x22, 0xBE, 0x8B, 0x05, 0x37, 0x8E, 0xB1, 0xC7, 0x1E, /* x */
70
        0xF3, 0x20, 0xAD, 0x74, 0x6E, 0x1D, 0x3B, 0x62, 0x8B, 0xA7, 0x9B, 0x98,
71
        0x59, 0xF7, 0x41, 0xE0, 0x82, 0x54, 0x2A, 0x38, 0x55, 0x02, 0xF2, 0x5D,
72
        0xBF, 0x55, 0x29, 0x6C, 0x3A, 0x54, 0x5E, 0x38, 0x72, 0x76, 0x0A, 0xB7 },
73
    { 0x36, 0x17, 0xDE, 0x4A, 0x96, 0x26, 0x2C, 0x6F, 0x5D, 0x9E, 0x98, 0xBF, /* y */
74
        0x92, 0x92, 0xDC, 0x29, 0xF8, 0xF4, 0x1D, 0xBD, 0x28, 0x9A, 0x14, 0x7C,
75
        0xE9, 0xDA, 0x31, 0x13, 0xB5, 0xF0, 0xB8, 0xC0, 0x0A, 0x60, 0xB1, 0xCE,
76
        0x1D, 0x7E, 0x81, 0x9D, 0x7A, 0x43, 0x1D, 0x7C, 0x90, 0xEA, 0x0E, 0x5F },
77
};
78
79
/*-
80
 * The representation of field elements.
81
 * ------------------------------------
82
 *
83
 * We represent field elements with seven values. These values are either 64 or
84
 * 128 bits and the field element represented is:
85
 *   v[0]*2^0 + v[1]*2^56 + v[2]*2^112 + ... + v[6]*2^336  (mod p)
86
 * Each of the seven values is called a 'limb'. Since the limbs are spaced only
87
 * 56 bits apart, but are greater than 56 bits in length, the most significant
88
 * bits of each limb overlap with the least significant bits of the next
89
 *
90
 * This representation is considered to be 'redundant' in the sense that
91
 * intermediate values can each contain more than a 56-bit value in each limb.
92
 * Reduction causes all but the final limb to be reduced to contain a value less
93
 * than 2^56, with the final value represented allowed to be larger than 2^384,
94
 * inasmuch as we can be sure that arithmetic overflow remains impossible. The
95
 * reduced value must of course be congruent to the unreduced value.
96
 *
97
 * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
98
 * 'widefelem', featuring enough bits to store the result of a multiplication
99
 * and even some further arithmetic without need for immediate reduction.
100
 */
101
102
0
#define NLIMBS 7
103
104
typedef uint64_t limb;
105
typedef uint128_t widelimb;
106
typedef limb limb_aX __attribute((__aligned__(1)));
107
typedef limb felem[NLIMBS];
108
typedef widelimb widefelem[2 * NLIMBS - 1];
109
110
static const limb bottom56bits = 0xffffffffffffff;
111
112
/* Helper functions (de)serialising reduced field elements in little endian */
113
static void bin48_to_felem(felem out, const uint8_t in[48])
114
0
{
115
0
    memset(out, 0, 56);
116
0
    out[0] = (*((limb *)&in[0])) & bottom56bits;
117
0
    out[1] = (*((limb_aX *)&in[7])) & bottom56bits;
118
0
    out[2] = (*((limb_aX *)&in[14])) & bottom56bits;
119
0
    out[3] = (*((limb_aX *)&in[21])) & bottom56bits;
120
0
    out[4] = (*((limb_aX *)&in[28])) & bottom56bits;
121
0
    out[5] = (*((limb_aX *)&in[35])) & bottom56bits;
122
0
    memmove(&out[6], &in[42], 6);
123
0
}
124
125
static void felem_to_bin48(uint8_t out[48], const felem in)
126
0
{
127
0
    memset(out, 0, 48);
128
0
    (*((limb *)&out[0])) |= (in[0] & bottom56bits);
129
0
    (*((limb_aX *)&out[7])) |= (in[1] & bottom56bits);
130
0
    (*((limb_aX *)&out[14])) |= (in[2] & bottom56bits);
131
0
    (*((limb_aX *)&out[21])) |= (in[3] & bottom56bits);
132
0
    (*((limb_aX *)&out[28])) |= (in[4] & bottom56bits);
133
0
    (*((limb_aX *)&out[35])) |= (in[5] & bottom56bits);
134
0
    memmove(&out[42], &in[6], 6);
135
0
}
136
137
/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
138
static int BN_to_felem(felem out, const BIGNUM *bn)
139
0
{
140
0
    felem_bytearray b_out;
141
0
    int num_bytes;
142
143
0
    if (BN_is_negative(bn)) {
144
0
        ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
145
0
        return 0;
146
0
    }
147
0
    num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
148
0
    if (num_bytes < 0) {
149
0
        ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
150
0
        return 0;
151
0
    }
152
0
    bin48_to_felem(out, b_out);
153
0
    return 1;
154
0
}
155
156
/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
157
static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
158
0
{
159
0
    felem_bytearray b_out;
160
161
0
    felem_to_bin48(b_out, in);
162
0
    return BN_lebin2bn(b_out, sizeof(b_out), out);
163
0
}
164
165
/*-
166
 * Field operations
167
 * ----------------
168
 */
169
170
static void felem_one(felem out)
171
0
{
172
0
    out[0] = 1;
173
0
    memset(&out[1], 0, sizeof(limb) * (NLIMBS - 1));
174
0
}
175
176
static void felem_assign(felem out, const felem in)
177
0
{
178
0
    memcpy(out, in, sizeof(felem));
179
0
}
180
181
/* felem_sum64 sets out = out + in. */
182
static void felem_sum64(felem out, const felem in)
183
0
{
184
0
    unsigned int i;
185
186
0
    for (i = 0; i < NLIMBS; i++)
187
0
        out[i] += in[i];
188
0
}
189
190
/* felem_scalar sets out = in * scalar */
191
static void felem_scalar(felem out, const felem in, limb scalar)
192
0
{
193
0
    unsigned int i;
194
195
0
    for (i = 0; i < NLIMBS; i++)
196
0
        out[i] = in[i] * scalar;
197
0
}
198
199
/* felem_scalar64 sets out = out * scalar */
200
static void felem_scalar64(felem out, limb scalar)
201
0
{
202
0
    unsigned int i;
203
204
0
    for (i = 0; i < NLIMBS; i++)
205
0
        out[i] *= scalar;
206
0
}
207
208
/* felem_scalar128 sets out = out * scalar */
209
static void felem_scalar128(widefelem out, limb scalar)
210
0
{
211
0
    unsigned int i;
212
213
0
    for (i = 0; i < 2 * NLIMBS - 1; i++)
214
0
        out[i] *= scalar;
215
0
}
216
217
/*-
218
 * felem_neg sets |out| to |-in|
219
 * On entry:
220
 *   in[i] < 2^60 - 2^29
221
 * On exit:
222
 *   out[i] < 2^60
223
 */
224
static void felem_neg(felem out, const felem in)
225
0
{
226
    /*
227
     * In order to prevent underflow, we add a multiple of p before subtracting.
228
     * Use telescopic sums to represent 2^12 * p redundantly with each limb
229
     * of the form 2^60 + ...
230
     */
231
0
    static const limb two60m52m4 = (((limb)1) << 60)
232
0
        - (((limb)1) << 52)
233
0
        - (((limb)1) << 4);
234
0
    static const limb two60p44m12 = (((limb)1) << 60)
235
0
        + (((limb)1) << 44)
236
0
        - (((limb)1) << 12);
237
0
    static const limb two60m28m4 = (((limb)1) << 60)
238
0
        - (((limb)1) << 28)
239
0
        - (((limb)1) << 4);
240
0
    static const limb two60m4 = (((limb)1) << 60)
241
0
        - (((limb)1) << 4);
242
243
0
    out[0] = two60p44m12 - in[0];
244
0
    out[1] = two60m52m4 - in[1];
245
0
    out[2] = two60m28m4 - in[2];
246
0
    out[3] = two60m4 - in[3];
247
0
    out[4] = two60m4 - in[4];
248
0
    out[5] = two60m4 - in[5];
249
0
    out[6] = two60m4 - in[6];
250
0
}
251
252
#if defined(ECP_NISTP384_ASM)
253
void p384_felem_diff64(felem out, const felem in);
254
void p384_felem_diff128(widefelem out, const widefelem in);
255
void p384_felem_diff_128_64(widefelem out, const felem in);
256
257
#define felem_diff64 p384_felem_diff64
258
#define felem_diff128 p384_felem_diff128
259
#define felem_diff_128_64 p384_felem_diff_128_64
260
261
#else
262
/*-
263
 * felem_diff64 subtracts |in| from |out|
264
 * On entry:
265
 *   in[i] < 2^60 - 2^52 - 2^4
266
 * On exit:
267
 *   out[i] < out_orig[i] + 2^60 + 2^44
268
 */
269
static void felem_diff64(felem out, const felem in)
270
0
{
271
    /*
272
     * In order to prevent underflow, we add a multiple of p before subtracting.
273
     * Use telescopic sums to represent 2^12 * p redundantly with each limb
274
     * of the form 2^60 + ...
275
     */
276
277
0
    static const limb two60m52m4 = (((limb)1) << 60)
278
0
        - (((limb)1) << 52)
279
0
        - (((limb)1) << 4);
280
0
    static const limb two60p44m12 = (((limb)1) << 60)
281
0
        + (((limb)1) << 44)
282
0
        - (((limb)1) << 12);
283
0
    static const limb two60m28m4 = (((limb)1) << 60)
284
0
        - (((limb)1) << 28)
285
0
        - (((limb)1) << 4);
286
0
    static const limb two60m4 = (((limb)1) << 60)
287
0
        - (((limb)1) << 4);
288
289
0
    out[0] += two60p44m12 - in[0];
290
0
    out[1] += two60m52m4 - in[1];
291
0
    out[2] += two60m28m4 - in[2];
292
0
    out[3] += two60m4 - in[3];
293
0
    out[4] += two60m4 - in[4];
294
0
    out[5] += two60m4 - in[5];
295
0
    out[6] += two60m4 - in[6];
296
0
}
297
298
/*
299
 * in[i] < 2^63
300
 * out[i] < out_orig[i] + 2^64 + 2^48
301
 */
302
static void felem_diff_128_64(widefelem out, const felem in)
303
0
{
304
    /*
305
     * In order to prevent underflow, we add a multiple of p before subtracting.
306
     * Use telescopic sums to represent 2^16 * p redundantly with each limb
307
     * of the form 2^64 + ...
308
     */
309
310
0
    static const widelimb two64m56m8 = (((widelimb)1) << 64)
311
0
        - (((widelimb)1) << 56)
312
0
        - (((widelimb)1) << 8);
313
0
    static const widelimb two64m32m8 = (((widelimb)1) << 64)
314
0
        - (((widelimb)1) << 32)
315
0
        - (((widelimb)1) << 8);
316
0
    static const widelimb two64m8 = (((widelimb)1) << 64)
317
0
        - (((widelimb)1) << 8);
318
0
    static const widelimb two64p48m16 = (((widelimb)1) << 64)
319
0
        + (((widelimb)1) << 48)
320
0
        - (((widelimb)1) << 16);
321
0
    unsigned int i;
322
323
0
    out[0] += two64p48m16;
324
0
    out[1] += two64m56m8;
325
0
    out[2] += two64m32m8;
326
0
    out[3] += two64m8;
327
0
    out[4] += two64m8;
328
0
    out[5] += two64m8;
329
0
    out[6] += two64m8;
330
331
0
    for (i = 0; i < NLIMBS; i++)
332
0
        out[i] -= in[i];
333
0
}
334
335
/*
336
 * in[i] < 2^127 - 2^119 - 2^71
337
 * out[i] < out_orig[i] + 2^127 + 2^111
338
 */
339
static void felem_diff128(widefelem out, const widefelem in)
340
0
{
341
    /*
342
     * In order to prevent underflow, we add a multiple of p before subtracting.
343
     * Use telescopic sums to represent 2^415 * p redundantly with each limb
344
     * of the form 2^127 + ...
345
     */
346
347
0
    static const widelimb two127 = ((widelimb)1) << 127;
348
0
    static const widelimb two127m71 = (((widelimb)1) << 127)
349
0
        - (((widelimb)1) << 71);
350
0
    static const widelimb two127p111m79m71 = (((widelimb)1) << 127)
351
0
        + (((widelimb)1) << 111)
352
0
        - (((widelimb)1) << 79)
353
0
        - (((widelimb)1) << 71);
354
0
    static const widelimb two127m119m71 = (((widelimb)1) << 127)
355
0
        - (((widelimb)1) << 119)
356
0
        - (((widelimb)1) << 71);
357
0
    static const widelimb two127m95m71 = (((widelimb)1) << 127)
358
0
        - (((widelimb)1) << 95)
359
0
        - (((widelimb)1) << 71);
360
0
    unsigned int i;
361
362
0
    out[0] += two127;
363
0
    out[1] += two127m71;
364
0
    out[2] += two127m71;
365
0
    out[3] += two127m71;
366
0
    out[4] += two127m71;
367
0
    out[5] += two127m71;
368
0
    out[6] += two127p111m79m71;
369
0
    out[7] += two127m119m71;
370
0
    out[8] += two127m95m71;
371
0
    out[9] += two127m71;
372
0
    out[10] += two127m71;
373
0
    out[11] += two127m71;
374
0
    out[12] += two127m71;
375
376
0
    for (i = 0; i < 2 * NLIMBS - 1; i++)
377
0
        out[i] -= in[i];
378
0
}
379
#endif /* ECP_NISTP384_ASM */
380
381
static void felem_square_ref(widefelem out, const felem in)
382
0
{
383
0
    felem inx2;
384
0
    felem_scalar(inx2, in, 2);
385
386
0
    out[0] = ((uint128_t)in[0]) * in[0];
387
388
0
    out[1] = ((uint128_t)in[0]) * inx2[1];
389
390
0
    out[2] = ((uint128_t)in[0]) * inx2[2]
391
0
        + ((uint128_t)in[1]) * in[1];
392
393
0
    out[3] = ((uint128_t)in[0]) * inx2[3]
394
0
        + ((uint128_t)in[1]) * inx2[2];
395
396
0
    out[4] = ((uint128_t)in[0]) * inx2[4]
397
0
        + ((uint128_t)in[1]) * inx2[3]
398
0
        + ((uint128_t)in[2]) * in[2];
399
400
0
    out[5] = ((uint128_t)in[0]) * inx2[5]
401
0
        + ((uint128_t)in[1]) * inx2[4]
402
0
        + ((uint128_t)in[2]) * inx2[3];
403
404
0
    out[6] = ((uint128_t)in[0]) * inx2[6]
405
0
        + ((uint128_t)in[1]) * inx2[5]
406
0
        + ((uint128_t)in[2]) * inx2[4]
407
0
        + ((uint128_t)in[3]) * in[3];
408
409
0
    out[7] = ((uint128_t)in[1]) * inx2[6]
410
0
        + ((uint128_t)in[2]) * inx2[5]
411
0
        + ((uint128_t)in[3]) * inx2[4];
412
413
0
    out[8] = ((uint128_t)in[2]) * inx2[6]
414
0
        + ((uint128_t)in[3]) * inx2[5]
415
0
        + ((uint128_t)in[4]) * in[4];
416
417
0
    out[9] = ((uint128_t)in[3]) * inx2[6]
418
0
        + ((uint128_t)in[4]) * inx2[5];
419
420
0
    out[10] = ((uint128_t)in[4]) * inx2[6]
421
0
        + ((uint128_t)in[5]) * in[5];
422
423
0
    out[11] = ((uint128_t)in[5]) * inx2[6];
424
425
0
    out[12] = ((uint128_t)in[6]) * in[6];
426
0
}
427
428
static void felem_mul_ref(widefelem out, const felem in1, const felem in2)
429
0
{
430
0
    out[0] = ((uint128_t)in1[0]) * in2[0];
431
432
0
    out[1] = ((uint128_t)in1[0]) * in2[1]
433
0
        + ((uint128_t)in1[1]) * in2[0];
434
435
0
    out[2] = ((uint128_t)in1[0]) * in2[2]
436
0
        + ((uint128_t)in1[1]) * in2[1]
437
0
        + ((uint128_t)in1[2]) * in2[0];
438
439
0
    out[3] = ((uint128_t)in1[0]) * in2[3]
440
0
        + ((uint128_t)in1[1]) * in2[2]
441
0
        + ((uint128_t)in1[2]) * in2[1]
442
0
        + ((uint128_t)in1[3]) * in2[0];
443
444
0
    out[4] = ((uint128_t)in1[0]) * in2[4]
445
0
        + ((uint128_t)in1[1]) * in2[3]
446
0
        + ((uint128_t)in1[2]) * in2[2]
447
0
        + ((uint128_t)in1[3]) * in2[1]
448
0
        + ((uint128_t)in1[4]) * in2[0];
449
450
0
    out[5] = ((uint128_t)in1[0]) * in2[5]
451
0
        + ((uint128_t)in1[1]) * in2[4]
452
0
        + ((uint128_t)in1[2]) * in2[3]
453
0
        + ((uint128_t)in1[3]) * in2[2]
454
0
        + ((uint128_t)in1[4]) * in2[1]
455
0
        + ((uint128_t)in1[5]) * in2[0];
456
457
0
    out[6] = ((uint128_t)in1[0]) * in2[6]
458
0
        + ((uint128_t)in1[1]) * in2[5]
459
0
        + ((uint128_t)in1[2]) * in2[4]
460
0
        + ((uint128_t)in1[3]) * in2[3]
461
0
        + ((uint128_t)in1[4]) * in2[2]
462
0
        + ((uint128_t)in1[5]) * in2[1]
463
0
        + ((uint128_t)in1[6]) * in2[0];
464
465
0
    out[7] = ((uint128_t)in1[1]) * in2[6]
466
0
        + ((uint128_t)in1[2]) * in2[5]
467
0
        + ((uint128_t)in1[3]) * in2[4]
468
0
        + ((uint128_t)in1[4]) * in2[3]
469
0
        + ((uint128_t)in1[5]) * in2[2]
470
0
        + ((uint128_t)in1[6]) * in2[1];
471
472
0
    out[8] = ((uint128_t)in1[2]) * in2[6]
473
0
        + ((uint128_t)in1[3]) * in2[5]
474
0
        + ((uint128_t)in1[4]) * in2[4]
475
0
        + ((uint128_t)in1[5]) * in2[3]
476
0
        + ((uint128_t)in1[6]) * in2[2];
477
478
0
    out[9] = ((uint128_t)in1[3]) * in2[6]
479
0
        + ((uint128_t)in1[4]) * in2[5]
480
0
        + ((uint128_t)in1[5]) * in2[4]
481
0
        + ((uint128_t)in1[6]) * in2[3];
482
483
0
    out[10] = ((uint128_t)in1[4]) * in2[6]
484
0
        + ((uint128_t)in1[5]) * in2[5]
485
0
        + ((uint128_t)in1[6]) * in2[4];
486
487
0
    out[11] = ((uint128_t)in1[5]) * in2[6]
488
0
        + ((uint128_t)in1[6]) * in2[5];
489
490
0
    out[12] = ((uint128_t)in1[6]) * in2[6];
491
0
}
492
493
/*-
494
 * Reduce thirteen 128-bit coefficients to seven 64-bit coefficients.
495
 * in[i] < 2^128 - 2^125
496
 * out[i] < 2^56 for i < 6,
497
 * out[6] <= 2^48
498
 *
499
 * The technique in use here stems from the format of the prime modulus:
500
 * P384 = 2^384 - delta
501
 *
502
 * Thus we can reduce numbers of the form (X + 2^384 * Y) by substituting
503
 * them with (X + delta Y), with delta = 2^128 + 2^96 + (-2^32 + 1). These
504
 * coefficients are still quite large, and so we repeatedly apply this
505
 * technique on high-order bits in order to guarantee the desired bounds on
506
 * the size of our output.
507
 *
508
 * The three phases of elimination are as follows:
509
 * [1]: Y = 2^120 (in[12] | in[11] | in[10] | in[9])
510
 * [2]: Y = 2^8 (acc[8] | acc[7])
511
 * [3]: Y = 2^48 (acc[6] >> 48)
512
 * (Where a | b | c | d = (2^56)^3 a + (2^56)^2 b + (2^56) c + d)
513
 */
514
static void felem_reduce_ref(felem out, const widefelem in)
515
0
{
516
    /*
517
     * In order to prevent underflow, we add a multiple of p before subtracting.
518
     * Use telescopic sums to represent 2^76 * p redundantly with each limb
519
     * of the form 2^124 + ...
520
     */
521
0
    static const widelimb two124m68 = (((widelimb)1) << 124)
522
0
        - (((widelimb)1) << 68);
523
0
    static const widelimb two124m116m68 = (((widelimb)1) << 124)
524
0
        - (((widelimb)1) << 116)
525
0
        - (((widelimb)1) << 68);
526
0
    static const widelimb two124p108m76 = (((widelimb)1) << 124)
527
0
        + (((widelimb)1) << 108)
528
0
        - (((widelimb)1) << 76);
529
0
    static const widelimb two124m92m68 = (((widelimb)1) << 124)
530
0
        - (((widelimb)1) << 92)
531
0
        - (((widelimb)1) << 68);
532
0
    widelimb temp, acc[9];
533
0
    unsigned int i;
534
535
0
    memcpy(acc, in, sizeof(widelimb) * 9);
536
537
0
    acc[0] += two124p108m76;
538
0
    acc[1] += two124m116m68;
539
0
    acc[2] += two124m92m68;
540
0
    acc[3] += two124m68;
541
0
    acc[4] += two124m68;
542
0
    acc[5] += two124m68;
543
0
    acc[6] += two124m68;
544
545
    /* [1]: Eliminate in[9], ..., in[12] */
546
0
    acc[8] += in[12] >> 32;
547
0
    acc[7] += (in[12] & 0xffffffff) << 24;
548
0
    acc[7] += in[12] >> 8;
549
0
    acc[6] += (in[12] & 0xff) << 48;
550
0
    acc[6] -= in[12] >> 16;
551
0
    acc[5] -= (in[12] & 0xffff) << 40;
552
0
    acc[6] += in[12] >> 48;
553
0
    acc[5] += (in[12] & 0xffffffffffff) << 8;
554
555
0
    acc[7] += in[11] >> 32;
556
0
    acc[6] += (in[11] & 0xffffffff) << 24;
557
0
    acc[6] += in[11] >> 8;
558
0
    acc[5] += (in[11] & 0xff) << 48;
559
0
    acc[5] -= in[11] >> 16;
560
0
    acc[4] -= (in[11] & 0xffff) << 40;
561
0
    acc[5] += in[11] >> 48;
562
0
    acc[4] += (in[11] & 0xffffffffffff) << 8;
563
564
0
    acc[6] += in[10] >> 32;
565
0
    acc[5] += (in[10] & 0xffffffff) << 24;
566
0
    acc[5] += in[10] >> 8;
567
0
    acc[4] += (in[10] & 0xff) << 48;
568
0
    acc[4] -= in[10] >> 16;
569
0
    acc[3] -= (in[10] & 0xffff) << 40;
570
0
    acc[4] += in[10] >> 48;
571
0
    acc[3] += (in[10] & 0xffffffffffff) << 8;
572
573
0
    acc[5] += in[9] >> 32;
574
0
    acc[4] += (in[9] & 0xffffffff) << 24;
575
0
    acc[4] += in[9] >> 8;
576
0
    acc[3] += (in[9] & 0xff) << 48;
577
0
    acc[3] -= in[9] >> 16;
578
0
    acc[2] -= (in[9] & 0xffff) << 40;
579
0
    acc[3] += in[9] >> 48;
580
0
    acc[2] += (in[9] & 0xffffffffffff) << 8;
581
582
    /*
583
     * [2]: Eliminate acc[7], acc[8], that is the 7 and eighth limbs, as
584
     * well as the contributions made from eliminating higher limbs.
585
     * acc[7] < in[7] + 2^120 + 2^56 < in[7] + 2^121
586
     * acc[8] < in[8] + 2^96
587
     */
588
0
    acc[4] += acc[8] >> 32;
589
0
    acc[3] += (acc[8] & 0xffffffff) << 24;
590
0
    acc[3] += acc[8] >> 8;
591
0
    acc[2] += (acc[8] & 0xff) << 48;
592
0
    acc[2] -= acc[8] >> 16;
593
0
    acc[1] -= (acc[8] & 0xffff) << 40;
594
0
    acc[2] += acc[8] >> 48;
595
0
    acc[1] += (acc[8] & 0xffffffffffff) << 8;
596
597
0
    acc[3] += acc[7] >> 32;
598
0
    acc[2] += (acc[7] & 0xffffffff) << 24;
599
0
    acc[2] += acc[7] >> 8;
600
0
    acc[1] += (acc[7] & 0xff) << 48;
601
0
    acc[1] -= acc[7] >> 16;
602
0
    acc[0] -= (acc[7] & 0xffff) << 40;
603
0
    acc[1] += acc[7] >> 48;
604
0
    acc[0] += (acc[7] & 0xffffffffffff) << 8;
605
606
    /*-
607
     * acc[k] < in[k] + 2^124 + 2^121
608
     *        < in[k] + 2^125
609
     *        < 2^128, for k <= 6
610
     */
611
612
    /*
613
     * Carry 4 -> 5 -> 6
614
     * This has the effect of ensuring that these more significant limbs
615
     * will be small in value after eliminating high bits from acc[6].
616
     */
617
0
    acc[5] += acc[4] >> 56;
618
0
    acc[4] &= 0x00ffffffffffffff;
619
620
0
    acc[6] += acc[5] >> 56;
621
0
    acc[5] &= 0x00ffffffffffffff;
622
623
    /*-
624
     * acc[6] < in[6] + 2^124 + 2^121 + 2^72 + 2^16
625
     *        < in[6] + 2^125
626
     *        < 2^128
627
     */
628
629
    /* [3]: Eliminate high bits of acc[6] */
630
0
    temp = acc[6] >> 48;
631
0
    acc[6] &= 0x0000ffffffffffff;
632
633
    /* temp < 2^80 */
634
635
0
    acc[3] += temp >> 40;
636
0
    acc[2] += (temp & 0xffffffffff) << 16;
637
0
    acc[2] += temp >> 16;
638
0
    acc[1] += (temp & 0xffff) << 40;
639
0
    acc[1] -= temp >> 24;
640
0
    acc[0] -= (temp & 0xffffff) << 32;
641
0
    acc[0] += temp;
642
643
    /*-
644
     * acc[k] < acc_old[k] + 2^64 + 2^56
645
     *        < in[k] + 2^124 + 2^121 + 2^72 + 2^64 + 2^56 + 2^16 , k < 4
646
     */
647
648
    /* Carry 0 -> 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
649
0
    acc[1] += acc[0] >> 56; /* acc[1] < acc_old[1] + 2^72 */
650
0
    acc[0] &= 0x00ffffffffffffff;
651
652
0
    acc[2] += acc[1] >> 56; /* acc[2] < acc_old[2] + 2^72 + 2^16 */
653
0
    acc[1] &= 0x00ffffffffffffff;
654
655
0
    acc[3] += acc[2] >> 56; /* acc[3] < acc_old[3] + 2^72 + 2^16 */
656
0
    acc[2] &= 0x00ffffffffffffff;
657
658
    /*-
659
     * acc[k] < acc_old[k] + 2^72 + 2^16
660
     *        < in[k] + 2^124 + 2^121 + 2^73 + 2^64 + 2^56 + 2^17
661
     *        < in[k] + 2^125
662
     *        < 2^128 , k < 4
663
     */
664
665
0
    acc[4] += acc[3] >> 56; /*-
666
                             * acc[4] < acc_old[4] + 2^72 + 2^16
667
                             *        < 2^72 + 2^56 + 2^16
668
                             */
669
0
    acc[3] &= 0x00ffffffffffffff;
670
671
0
    acc[5] += acc[4] >> 56; /*-
672
                             * acc[5] < acc_old[5] + 2^16 + 1
673
                             *        < 2^56 + 2^16 + 1
674
                             */
675
0
    acc[4] &= 0x00ffffffffffffff;
676
677
0
    acc[6] += acc[5] >> 56; /* acc[6] < 2^48 + 1 <= 2^48 */
678
0
    acc[5] &= 0x00ffffffffffffff;
679
680
0
    for (i = 0; i < NLIMBS; i++)
681
0
        out[i] = acc[i];
682
0
}
683
684
static ossl_inline void felem_square_reduce_ref(felem out, const felem in)
685
0
{
686
0
    widefelem tmp;
687
688
0
    felem_square_ref(tmp, in);
689
0
    felem_reduce_ref(out, tmp);
690
0
}
691
692
static ossl_inline void felem_mul_reduce_ref(felem out, const felem in1, const felem in2)
693
0
{
694
0
    widefelem tmp;
695
696
0
    felem_mul_ref(tmp, in1, in2);
697
0
    felem_reduce_ref(out, tmp);
698
0
}
699
700
#if defined(ECP_NISTP384_ASM)
701
static void felem_square_wrapper(widefelem out, const felem in);
702
static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2);
703
704
static void (*felem_square_p)(widefelem out, const felem in) = felem_square_wrapper;
705
static void (*felem_mul_p)(widefelem out, const felem in1, const felem in2) = felem_mul_wrapper;
706
707
static void (*felem_reduce_p)(felem out, const widefelem in) = felem_reduce_ref;
708
709
static void (*felem_square_reduce_p)(felem out, const felem in) = felem_square_reduce_ref;
710
static void (*felem_mul_reduce_p)(felem out, const felem in1, const felem in2) = felem_mul_reduce_ref;
711
712
void p384_felem_square(widefelem out, const felem in);
713
void p384_felem_mul(widefelem out, const felem in1, const felem in2);
714
void p384_felem_reduce(felem out, const widefelem in);
715
716
void p384_felem_square_reduce(felem out, const felem in);
717
void p384_felem_mul_reduce(felem out, const felem in1, const felem in2);
718
719
#if defined(_ARCH_PPC64)
720
#include "arch/ppc_arch.h"
721
#endif
722
723
static void felem_select(void)
724
{
725
#if defined(_ARCH_PPC64)
726
    if ((OPENSSL_ppccap_P & PPC_MADD300) && (OPENSSL_ppccap_P & PPC_ALTIVEC)) {
727
        felem_square_p = p384_felem_square;
728
        felem_mul_p = p384_felem_mul;
729
        felem_reduce_p = p384_felem_reduce;
730
        felem_square_reduce_p = p384_felem_square_reduce;
731
        felem_mul_reduce_p = p384_felem_mul_reduce;
732
733
        return;
734
    }
735
#endif
736
737
    /* Default */
738
    felem_square_p = felem_square_ref;
739
    felem_mul_p = felem_mul_ref;
740
    felem_reduce_p = felem_reduce_ref;
741
    felem_square_reduce_p = felem_square_reduce_ref;
742
    felem_mul_reduce_p = felem_mul_reduce_ref;
743
}
744
745
static void felem_square_wrapper(widefelem out, const felem in)
746
{
747
    felem_select();
748
    felem_square_p(out, in);
749
}
750
751
static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2)
752
{
753
    felem_select();
754
    felem_mul_p(out, in1, in2);
755
}
756
757
#define felem_square felem_square_p
758
#define felem_mul felem_mul_p
759
#define felem_reduce felem_reduce_p
760
761
#define felem_square_reduce felem_square_reduce_p
762
#define felem_mul_reduce felem_mul_reduce_p
763
#else
764
0
#define felem_square felem_square_ref
765
0
#define felem_mul felem_mul_ref
766
0
#define felem_reduce felem_reduce_ref
767
768
0
#define felem_square_reduce felem_square_reduce_ref
769
0
#define felem_mul_reduce felem_mul_reduce_ref
770
#endif
771
772
/*-
773
 * felem_inv calculates |out| = |in|^{-1}
774
 *
775
 * Based on Fermat's Little Theorem:
776
 *   a^p = a (mod p)
777
 *   a^{p-1} = 1 (mod p)
778
 *   a^{p-2} = a^{-1} (mod p)
779
 */
780
static void felem_inv(felem out, const felem in)
781
0
{
782
0
    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6;
783
0
    unsigned int i = 0;
784
785
0
    felem_square_reduce(ftmp, in); /* 2^1 */
786
0
    felem_mul_reduce(ftmp, ftmp, in); /* 2^1 + 2^0 */
787
0
    felem_assign(ftmp2, ftmp);
788
789
0
    felem_square_reduce(ftmp, ftmp); /* 2^2 + 2^1 */
790
0
    felem_mul_reduce(ftmp, ftmp, in); /* 2^2 + 2^1 * 2^0 */
791
0
    felem_assign(ftmp3, ftmp);
792
793
0
    for (i = 0; i < 3; i++)
794
0
        felem_square_reduce(ftmp, ftmp); /* 2^5 + 2^4 + 2^3 */
795
0
    felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^5 + 2^4 + 2^3 + 2^2 + 2^1 + 2^0 */
796
0
    felem_assign(ftmp4, ftmp);
797
798
0
    for (i = 0; i < 6; i++)
799
0
        felem_square_reduce(ftmp, ftmp); /* 2^11 + ... + 2^6 */
800
0
    felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^11 + ... + 2^0 */
801
802
0
    for (i = 0; i < 3; i++)
803
0
        felem_square_reduce(ftmp, ftmp); /* 2^14 + ... + 2^3 */
804
0
    felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^14 + ... + 2^0 */
805
0
    felem_assign(ftmp5, ftmp);
806
807
0
    for (i = 0; i < 15; i++)
808
0
        felem_square_reduce(ftmp, ftmp); /* 2^29 + ... + 2^15 */
809
0
    felem_mul_reduce(ftmp, ftmp5, ftmp); /* 2^29 + ... + 2^0 */
810
0
    felem_assign(ftmp6, ftmp);
811
812
0
    for (i = 0; i < 30; i++)
813
0
        felem_square_reduce(ftmp, ftmp); /* 2^59 + ... + 2^30 */
814
0
    felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^59 + ... + 2^0 */
815
0
    felem_assign(ftmp4, ftmp);
816
817
0
    for (i = 0; i < 60; i++)
818
0
        felem_square_reduce(ftmp, ftmp); /* 2^119 + ... + 2^60 */
819
0
    felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^119 + ... + 2^0 */
820
0
    felem_assign(ftmp4, ftmp);
821
822
0
    for (i = 0; i < 120; i++)
823
0
        felem_square_reduce(ftmp, ftmp); /* 2^239 + ... + 2^120 */
824
0
    felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^239 + ... + 2^0 */
825
826
0
    for (i = 0; i < 15; i++)
827
0
        felem_square_reduce(ftmp, ftmp); /* 2^254 + ... + 2^15 */
828
0
    felem_mul_reduce(ftmp, ftmp5, ftmp); /* 2^254 + ... + 2^0 */
829
830
0
    for (i = 0; i < 31; i++)
831
0
        felem_square_reduce(ftmp, ftmp); /* 2^285 + ... + 2^31 */
832
0
    felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^285 + ... + 2^31 + 2^29 + ... + 2^0 */
833
834
0
    for (i = 0; i < 2; i++)
835
0
        felem_square_reduce(ftmp, ftmp); /* 2^287 + ... + 2^33 + 2^31 + ... + 2^2 */
836
0
    felem_mul_reduce(ftmp, ftmp2, ftmp); /* 2^287 + ... + 2^33 + 2^31 + ... + 2^0 */
837
838
0
    for (i = 0; i < 94; i++)
839
0
        felem_square_reduce(ftmp, ftmp); /* 2^381 + ... + 2^127 + 2^125 + ... + 2^94 */
840
0
    felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^381 + ... + 2^127 + 2^125 + ... + 2^94 + 2^29 + ... + 2^0 */
841
842
0
    for (i = 0; i < 2; i++)
843
0
        felem_square_reduce(ftmp, ftmp); /* 2^383 + ... + 2^129 + 2^127 + ... + 2^96 + 2^31 + ... + 2^2 */
844
0
    felem_mul_reduce(ftmp, in, ftmp); /* 2^383 + ... + 2^129 + 2^127 + ... + 2^96 + 2^31 + ... + 2^2 + 2^0 */
845
846
0
    memcpy(out, ftmp, sizeof(felem));
847
0
}
848
849
/*
850
 * Zero-check: returns a limb with all bits set if |in| == 0 (mod p)
851
 * and 0 otherwise. We know that field elements are reduced to
852
 * 0 < in < 2p, so we only need to check two cases:
853
 * 0 and 2^384 - 2^128 - 2^96 + 2^32 - 1
854
 *   in[k] < 2^56, k < 6
855
 *   in[6] <= 2^48
856
 */
857
static limb felem_is_zero(const felem in)
858
0
{
859
0
    limb zero, p384;
860
861
0
    zero = in[0] | in[1] | in[2] | in[3] | in[4] | in[5] | in[6];
862
0
    zero = ((int64_t)(zero)-1) >> 63;
863
0
    p384 = (in[0] ^ 0x000000ffffffff) | (in[1] ^ 0xffff0000000000)
864
0
        | (in[2] ^ 0xfffffffffeffff) | (in[3] ^ 0xffffffffffffff)
865
0
        | (in[4] ^ 0xffffffffffffff) | (in[5] ^ 0xffffffffffffff)
866
0
        | (in[6] ^ 0xffffffffffff);
867
0
    p384 = ((int64_t)(p384)-1) >> 63;
868
869
0
    return (zero | p384);
870
0
}
871
872
static int felem_is_zero_int(const void *in)
873
0
{
874
0
    return (int)(felem_is_zero(in) & ((limb)1));
875
0
}
876
877
/*-
878
 * felem_contract converts |in| to its unique, minimal representation.
879
 * Assume we've removed all redundant bits.
880
 * On entry:
881
 *   in[k] < 2^56, k < 6
882
 *   in[6] <= 2^48
883
 */
884
static void felem_contract(felem out, const felem in)
885
0
{
886
0
    static const int64_t two56 = ((limb)1) << 56;
887
888
    /*
889
     * We know for a fact that 0 <= |in| < 2*p, for p = 2^384 - 2^128 - 2^96 + 2^32 - 1
890
     * Perform two successive, idempotent subtractions to reduce if |in| >= p.
891
     */
892
893
0
    int64_t tmp[NLIMBS], cond[5], a;
894
0
    unsigned int i;
895
896
0
    memcpy(tmp, in, sizeof(felem));
897
898
    /* Case 1: a = 1 iff |in| >= 2^384 */
899
0
    a = (in[6] >> 48);
900
0
    tmp[0] += a;
901
0
    tmp[0] -= a << 32;
902
0
    tmp[1] += a << 40;
903
0
    tmp[2] += a << 16;
904
0
    tmp[6] &= 0x0000ffffffffffff;
905
906
    /*
907
     * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
908
     * non-zero, so we only need one step
909
     */
910
911
0
    a = tmp[0] >> 63;
912
0
    tmp[0] += a & two56;
913
0
    tmp[1] -= a & 1;
914
915
    /* Carry 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
916
0
    tmp[2] += tmp[1] >> 56;
917
0
    tmp[1] &= 0x00ffffffffffffff;
918
919
0
    tmp[3] += tmp[2] >> 56;
920
0
    tmp[2] &= 0x00ffffffffffffff;
921
922
0
    tmp[4] += tmp[3] >> 56;
923
0
    tmp[3] &= 0x00ffffffffffffff;
924
925
0
    tmp[5] += tmp[4] >> 56;
926
0
    tmp[4] &= 0x00ffffffffffffff;
927
928
0
    tmp[6] += tmp[5] >> 56; /* tmp[6] < 2^48 */
929
0
    tmp[5] &= 0x00ffffffffffffff;
930
931
    /*
932
     * Case 2: a = all ones if p <= |in| < 2^384, 0 otherwise
933
     */
934
935
    /* 0 iff (2^129..2^383) are all one */
936
0
    cond[0] = ((tmp[6] | 0xff000000000000) & tmp[5] & tmp[4] & tmp[3] & (tmp[2] | 0x0000000001ffff)) + 1;
937
    /* 0 iff 2^128 bit is one */
938
0
    cond[1] = (tmp[2] | ~0x00000000010000) + 1;
939
    /* 0 iff (2^96..2^127) bits are all one */
940
0
    cond[2] = ((tmp[2] | 0xffffffffff0000) & (tmp[1] | 0x0000ffffffffff)) + 1;
941
    /* 0 iff (2^32..2^95) bits are all zero */
942
0
    cond[3] = (tmp[1] & ~0xffff0000000000) | (tmp[0] & ~((int64_t)0x000000ffffffff));
943
    /* 0 iff (2^0..2^31) bits are all one */
944
0
    cond[4] = (tmp[0] | 0xffffff00000000) + 1;
945
946
    /*
947
     * In effect, invert our conditions, so that 0 values become all 1's,
948
     * any non-zero value in the low-order 56 bits becomes all 0's
949
     */
950
0
    for (i = 0; i < 5; i++)
951
0
        cond[i] = ((cond[i] & 0x00ffffffffffffff) - 1) >> 63;
952
953
    /*
954
     * The condition for determining whether in is greater than our
955
     * prime is given by the following condition.
956
     */
957
958
    /* First subtract 2^384 - 2^129 cheaply */
959
0
    a = cond[0] & (cond[1] | (cond[2] & (~cond[3] | cond[4])));
960
0
    tmp[6] &= ~a;
961
0
    tmp[5] &= ~a;
962
0
    tmp[4] &= ~a;
963
0
    tmp[3] &= ~a;
964
0
    tmp[2] &= ~a | 0x0000000001ffff;
965
966
    /*
967
     * Subtract 2^128 - 2^96 by
968
     * means of disjoint cases.
969
     */
970
971
    /* subtract 2^128 if that bit is present, and add 2^96 */
972
0
    a = cond[0] & cond[1];
973
0
    tmp[2] &= ~a | 0xfffffffffeffff;
974
0
    tmp[1] += a & ((int64_t)1 << 40);
975
976
    /* otherwise, clear bits 2^127 .. 2^96  */
977
0
    a = cond[0] & ~cond[1] & (cond[2] & (~cond[3] | cond[4]));
978
0
    tmp[2] &= ~a | 0xffffffffff0000;
979
0
    tmp[1] &= ~a | 0x0000ffffffffff;
980
981
    /* finally, subtract the last 2^32 - 1 */
982
0
    a = cond[0] & (cond[1] | (cond[2] & (~cond[3] | cond[4])));
983
0
    tmp[0] += a & (-((int64_t)1 << 32) + 1);
984
985
    /*
986
     * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
987
     * non-zero, so we only need one step
988
     */
989
0
    a = tmp[0] >> 63;
990
0
    tmp[0] += a & two56;
991
0
    tmp[1] -= a & 1;
992
993
    /* Carry 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
994
0
    tmp[2] += tmp[1] >> 56;
995
0
    tmp[1] &= 0x00ffffffffffffff;
996
997
0
    tmp[3] += tmp[2] >> 56;
998
0
    tmp[2] &= 0x00ffffffffffffff;
999
1000
0
    tmp[4] += tmp[3] >> 56;
1001
0
    tmp[3] &= 0x00ffffffffffffff;
1002
1003
0
    tmp[5] += tmp[4] >> 56;
1004
0
    tmp[4] &= 0x00ffffffffffffff;
1005
1006
0
    tmp[6] += tmp[5] >> 56;
1007
0
    tmp[5] &= 0x00ffffffffffffff;
1008
1009
0
    memcpy(out, tmp, sizeof(felem));
1010
0
}
1011
1012
/*-
1013
 * Group operations
1014
 * ----------------
1015
 *
1016
 * Building on top of the field operations we have the operations on the
1017
 * elliptic curve group itself. Points on the curve are represented in Jacobian
1018
 * coordinates
1019
 */
1020
1021
/*-
1022
 * point_double calculates 2*(x_in, y_in, z_in)
1023
 *
1024
 * The method is taken from:
1025
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1026
 *
1027
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1028
 * while x_out == y_in is not (maybe this works, but it's not tested).
1029
 */
1030
static void
1031
point_double(felem x_out, felem y_out, felem z_out,
1032
    const felem x_in, const felem y_in, const felem z_in)
1033
0
{
1034
0
    widefelem tmp, tmp2;
1035
0
    felem delta, gamma, beta, alpha, ftmp, ftmp2;
1036
1037
0
    felem_assign(ftmp, x_in);
1038
0
    felem_assign(ftmp2, x_in);
1039
1040
    /* delta = z^2 */
1041
0
    felem_square_reduce(delta, z_in); /* delta[i] < 2^56 */
1042
1043
    /* gamma = y^2 */
1044
0
    felem_square_reduce(gamma, y_in); /* gamma[i] < 2^56 */
1045
1046
    /* beta = x*gamma */
1047
0
    felem_mul_reduce(beta, x_in, gamma); /* beta[i] < 2^56 */
1048
1049
    /* alpha = 3*(x-delta)*(x+delta) */
1050
0
    felem_diff64(ftmp, delta); /* ftmp[i] < 2^60 + 2^58 + 2^44 */
1051
0
    felem_sum64(ftmp2, delta); /* ftmp2[i] < 2^59 */
1052
0
    felem_scalar64(ftmp2, 3); /* ftmp2[i] < 2^61 */
1053
0
    felem_mul_reduce(alpha, ftmp, ftmp2); /* alpha[i] < 2^56 */
1054
1055
    /* x' = alpha^2 - 8*beta */
1056
0
    felem_square(tmp, alpha); /* tmp[i] < 2^115 */
1057
0
    felem_assign(ftmp, beta); /* ftmp[i] < 2^56 */
1058
0
    felem_scalar64(ftmp, 8); /* ftmp[i] < 2^59 */
1059
0
    felem_diff_128_64(tmp, ftmp); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1060
0
    felem_reduce(x_out, tmp); /* x_out[i] < 2^56 */
1061
1062
    /* z' = (y + z)^2 - gamma - delta */
1063
0
    felem_sum64(delta, gamma); /* delta[i] < 2^57 */
1064
0
    felem_assign(ftmp, y_in); /* ftmp[i] < 2^56 */
1065
0
    felem_sum64(ftmp, z_in); /* ftmp[i] < 2^56 */
1066
0
    felem_square(tmp, ftmp); /* tmp[i] < 2^115 */
1067
0
    felem_diff_128_64(tmp, delta); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1068
0
    felem_reduce(z_out, tmp); /* z_out[i] < 2^56 */
1069
1070
    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1071
0
    felem_scalar64(beta, 4); /* beta[i] < 2^58 */
1072
0
    felem_diff64(beta, x_out); /* beta[i] < 2^60 + 2^58 + 2^44 */
1073
0
    felem_mul(tmp, alpha, beta); /* tmp[i] < 2^119 */
1074
0
    felem_square(tmp2, gamma); /* tmp2[i] < 2^115 */
1075
0
    felem_scalar128(tmp2, 8); /* tmp2[i] < 2^118 */
1076
0
    felem_diff128(tmp, tmp2); /* tmp[i] < 2^127 + 2^119 + 2^111 */
1077
0
    felem_reduce(y_out, tmp); /* tmp[i] < 2^56 */
1078
0
}
1079
1080
/* copy_conditional copies in to out iff mask is all ones. */
1081
static void copy_conditional(felem out, const felem in, limb mask)
1082
0
{
1083
0
    unsigned int i;
1084
1085
0
    for (i = 0; i < NLIMBS; i++)
1086
0
        out[i] ^= mask & (in[i] ^ out[i]);
1087
0
}
1088
1089
/*-
1090
 * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1091
 *
1092
 * The method is taken from
1093
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1094
 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1095
 *
1096
 * This function includes a branch for checking whether the two input points
1097
 * are equal (while not equal to the point at infinity). See comment below
1098
 * on constant-time.
1099
 */
1100
static void point_add(felem x3, felem y3, felem z3,
1101
    const felem x1, const felem y1, const felem z1,
1102
    const int mixed, const felem x2, const felem y2,
1103
    const felem z2)
1104
0
{
1105
0
    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1106
0
    widefelem tmp, tmp2;
1107
0
    limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1108
0
    limb points_equal;
1109
1110
0
    z1_is_zero = felem_is_zero(z1);
1111
0
    z2_is_zero = felem_is_zero(z2);
1112
1113
    /* ftmp = z1z1 = z1**2 */
1114
0
    felem_square_reduce(ftmp, z1); /* ftmp[i] < 2^56 */
1115
1116
0
    if (!mixed) {
1117
        /* ftmp2 = z2z2 = z2**2 */
1118
0
        felem_square_reduce(ftmp2, z2); /* ftmp2[i] < 2^56 */
1119
1120
        /* u1 = ftmp3 = x1*z2z2 */
1121
0
        felem_mul_reduce(ftmp3, x1, ftmp2); /* ftmp3[i] < 2^56 */
1122
1123
        /* ftmp5 = z1 + z2 */
1124
0
        felem_assign(ftmp5, z1); /* ftmp5[i] < 2^56 */
1125
0
        felem_sum64(ftmp5, z2); /* ftmp5[i] < 2^57 */
1126
1127
        /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1128
0
        felem_square(tmp, ftmp5); /* tmp[i] < 2^117 */
1129
0
        felem_diff_128_64(tmp, ftmp); /* tmp[i] < 2^117 + 2^64 + 2^48 */
1130
0
        felem_diff_128_64(tmp, ftmp2); /* tmp[i] < 2^117 + 2^65 + 2^49 */
1131
0
        felem_reduce(ftmp5, tmp); /* ftmp5[i] < 2^56 */
1132
1133
        /* ftmp2 = z2 * z2z2 */
1134
0
        felem_mul_reduce(ftmp2, ftmp2, z2); /* ftmp2[i] < 2^56 */
1135
1136
        /* s1 = ftmp6 = y1 * z2**3 */
1137
0
        felem_mul_reduce(ftmp6, y1, ftmp2); /* ftmp6[i] < 2^56 */
1138
0
    } else {
1139
        /*
1140
         * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1141
         */
1142
1143
        /* u1 = ftmp3 = x1*z2z2 */
1144
0
        felem_assign(ftmp3, x1); /* ftmp3[i] < 2^56 */
1145
1146
        /* ftmp5 = 2*z1z2 */
1147
0
        felem_scalar(ftmp5, z1, 2); /* ftmp5[i] < 2^57 */
1148
1149
        /* s1 = ftmp6 = y1 * z2**3 */
1150
0
        felem_assign(ftmp6, y1); /* ftmp6[i] < 2^56 */
1151
0
    }
1152
    /* ftmp3[i] < 2^56, ftmp5[i] < 2^57, ftmp6[i] < 2^56 */
1153
1154
    /* u2 = x2*z1z1 */
1155
0
    felem_mul(tmp, x2, ftmp); /* tmp[i] < 2^115 */
1156
1157
    /* h = ftmp4 = u2 - u1 */
1158
0
    felem_diff_128_64(tmp, ftmp3); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1159
0
    felem_reduce(ftmp4, tmp); /* ftmp[4] < 2^56 */
1160
1161
0
    x_equal = felem_is_zero(ftmp4);
1162
1163
    /* z_out = ftmp5 * h */
1164
0
    felem_mul_reduce(z_out, ftmp5, ftmp4); /* z_out[i] < 2^56 */
1165
1166
    /* ftmp = z1 * z1z1 */
1167
0
    felem_mul_reduce(ftmp, ftmp, z1); /* ftmp[i] < 2^56 */
1168
1169
    /* s2 = tmp = y2 * z1**3 */
1170
0
    felem_mul(tmp, y2, ftmp); /* tmp[i] < 2^115 */
1171
1172
    /* r = ftmp5 = (s2 - s1)*2 */
1173
0
    felem_diff_128_64(tmp, ftmp6); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1174
0
    felem_reduce(ftmp5, tmp); /* ftmp5[i] < 2^56 */
1175
0
    y_equal = felem_is_zero(ftmp5);
1176
0
    felem_scalar64(ftmp5, 2); /* ftmp5[i] < 2^57 */
1177
1178
    /*
1179
     * The formulae are incorrect if the points are equal, in affine coordinates
1180
     * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
1181
     * happens.
1182
     *
1183
     * We use bitwise operations to avoid potential side-channels introduced by
1184
     * the short-circuiting behaviour of boolean operators.
1185
     *
1186
     * The special case of either point being the point at infinity (z1 and/or
1187
     * z2 are zero), is handled separately later on in this function, so we
1188
     * avoid jumping to point_double here in those special cases.
1189
     *
1190
     * Notice the comment below on the implications of this branching for timing
1191
     * leaks and why it is considered practically irrelevant.
1192
     */
1193
0
    points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));
1194
1195
0
    if (points_equal) {
1196
        /*
1197
         * This is obviously not constant-time but it will almost-never happen
1198
         * for ECDH / ECDSA.
1199
         */
1200
0
        point_double(x3, y3, z3, x1, y1, z1);
1201
0
        return;
1202
0
    }
1203
1204
    /* I = ftmp = (2h)**2 */
1205
0
    felem_assign(ftmp, ftmp4); /* ftmp[i] < 2^56 */
1206
0
    felem_scalar64(ftmp, 2); /* ftmp[i] < 2^57 */
1207
0
    felem_square_reduce(ftmp, ftmp); /* ftmp[i] < 2^56 */
1208
1209
    /* J = ftmp2 = h * I */
1210
0
    felem_mul_reduce(ftmp2, ftmp4, ftmp); /* ftmp2[i] < 2^56 */
1211
1212
    /* V = ftmp4 = U1 * I */
1213
0
    felem_mul_reduce(ftmp4, ftmp3, ftmp); /* ftmp4[i] < 2^56 */
1214
1215
    /* x_out = r**2 - J - 2V */
1216
0
    felem_square(tmp, ftmp5); /* tmp[i] < 2^117 */
1217
0
    felem_diff_128_64(tmp, ftmp2); /* tmp[i] < 2^117 + 2^64 + 2^48 */
1218
0
    felem_assign(ftmp3, ftmp4); /* ftmp3[i] < 2^56 */
1219
0
    felem_scalar64(ftmp4, 2); /* ftmp4[i] < 2^57 */
1220
0
    felem_diff_128_64(tmp, ftmp4); /* tmp[i] < 2^117 + 2^65 + 2^49 */
1221
0
    felem_reduce(x_out, tmp); /* x_out[i] < 2^56 */
1222
1223
    /* y_out = r(V-x_out) - 2 * s1 * J */
1224
0
    felem_diff64(ftmp3, x_out); /* ftmp3[i] < 2^60 + 2^56 + 2^44 */
1225
0
    felem_mul(tmp, ftmp5, ftmp3); /* tmp[i] < 2^116 */
1226
0
    felem_mul(tmp2, ftmp6, ftmp2); /* tmp2[i] < 2^115 */
1227
0
    felem_scalar128(tmp2, 2); /* tmp2[i] < 2^116 */
1228
0
    felem_diff128(tmp, tmp2); /* tmp[i] < 2^127 + 2^116 + 2^111 */
1229
0
    felem_reduce(y_out, tmp); /* y_out[i] < 2^56 */
1230
1231
0
    copy_conditional(x_out, x2, z1_is_zero);
1232
0
    copy_conditional(x_out, x1, z2_is_zero);
1233
0
    copy_conditional(y_out, y2, z1_is_zero);
1234
0
    copy_conditional(y_out, y1, z2_is_zero);
1235
0
    copy_conditional(z_out, z2, z1_is_zero);
1236
0
    copy_conditional(z_out, z1, z2_is_zero);
1237
0
    felem_assign(x3, x_out);
1238
0
    felem_assign(y3, y_out);
1239
0
    felem_assign(z3, z_out);
1240
0
}
1241
1242
/*-
1243
 * Base point pre computation
1244
 * --------------------------
1245
 *
1246
 * Two different sorts of precomputed tables are used in the following code.
1247
 * Each contain various points on the curve, where each point is three field
1248
 * elements (x, y, z).
1249
 *
1250
 * For the base point table, z is usually 1 (0 for the point at infinity).
1251
 * This table has 16 elements:
1252
 * index | bits    | point
1253
 * ------+---------+------------------------------
1254
 *     0 | 0 0 0 0 | 0G
1255
 *     1 | 0 0 0 1 | 1G
1256
 *     2 | 0 0 1 0 | 2^95G
1257
 *     3 | 0 0 1 1 | (2^95 + 1)G
1258
 *     4 | 0 1 0 0 | 2^190G
1259
 *     5 | 0 1 0 1 | (2^190 + 1)G
1260
 *     6 | 0 1 1 0 | (2^190 + 2^95)G
1261
 *     7 | 0 1 1 1 | (2^190 + 2^95 + 1)G
1262
 *     8 | 1 0 0 0 | 2^285G
1263
 *     9 | 1 0 0 1 | (2^285 + 1)G
1264
 *    10 | 1 0 1 0 | (2^285 + 2^95)G
1265
 *    11 | 1 0 1 1 | (2^285 + 2^95 + 1)G
1266
 *    12 | 1 1 0 0 | (2^285 + 2^190)G
1267
 *    13 | 1 1 0 1 | (2^285 + 2^190 + 1)G
1268
 *    14 | 1 1 1 0 | (2^285 + 2^190 + 2^95)G
1269
 *    15 | 1 1 1 1 | (2^285 + 2^190 + 2^95 + 1)G
1270
 *
1271
 * The reason for this is so that we can clock bits into four different
1272
 * locations when doing simple scalar multiplies against the base point.
1273
 *
1274
 * Tables for other points have table[i] = iG for i in 0 .. 16.
1275
 */
1276
1277
/* gmul is the table of precomputed base points */
1278
static const felem gmul[16][3] = {
1279
    { { 0, 0, 0, 0, 0, 0, 0 },
1280
        { 0, 0, 0, 0, 0, 0, 0 },
1281
        { 0, 0, 0, 0, 0, 0, 0 } },
1282
    { { 0x00545e3872760ab7, 0x00f25dbf55296c3a, 0x00e082542a385502, 0x008ba79b9859f741,
1283
          0x0020ad746e1d3b62, 0x0005378eb1c71ef3, 0x0000aa87ca22be8b },
1284
        { 0x00431d7c90ea0e5f, 0x00b1ce1d7e819d7a, 0x0013b5f0b8c00a60, 0x00289a147ce9da31,
1285
            0x0092dc29f8f41dbd, 0x002c6f5d9e98bf92, 0x00003617de4a9626 },
1286
        { 1, 0, 0, 0, 0, 0, 0 } },
1287
    { { 0x00024711cc902a90, 0x00acb2e579ab4fe1, 0x00af818a4b4d57b1, 0x00a17c7bec49c3de,
1288
          0x004280482d726a8b, 0x00128dd0f0a90f3b, 0x00004387c1c3fa3c },
1289
        { 0x002ce76543cf5c3a, 0x00de6cee5ef58f0a, 0x00403e42fa561ca6, 0x00bc54d6f9cb9731,
1290
            0x007155f925fb4ff1, 0x004a9ce731b7b9bc, 0x00002609076bd7b2 },
1291
        { 1, 0, 0, 0, 0, 0, 0 } },
1292
    { { 0x00e74c9182f0251d, 0x0039bf54bb111974, 0x00b9d2f2eec511d2, 0x0036b1594eb3a6a4,
1293
          0x00ac3bb82d9d564b, 0x00f9313f4615a100, 0x00006716a9a91b10 },
1294
        { 0x0046698116e2f15c, 0x00f34347067d3d33, 0x008de4ccfdebd002, 0x00e838c6b8e8c97b,
1295
            0x006faf0798def346, 0x007349794a57563c, 0x00002629e7e6ad84 },
1296
        { 1, 0, 0, 0, 0, 0, 0 } },
1297
    { { 0x0075300e34fd163b, 0x0092e9db4e8d0ad3, 0x00254be9f625f760, 0x00512c518c72ae68,
1298
          0x009bfcf162bede5a, 0x00bf9341566ce311, 0x0000cd6175bd41cf },
1299
        { 0x007dfe52af4ac70f, 0x0002159d2d5c4880, 0x00b504d16f0af8d0, 0x0014585e11f5e64c,
1300
            0x0089c6388e030967, 0x00ffb270cbfa5f71, 0x00009a15d92c3947 },
1301
        { 1, 0, 0, 0, 0, 0, 0 } },
1302
    { { 0x0033fc1278dc4fe5, 0x00d53088c2caa043, 0x0085558827e2db66, 0x00c192bef387b736,
1303
          0x00df6405a2225f2c, 0x0075205aa90fd91a, 0x0000137e3f12349d },
1304
        { 0x00ce5b115efcb07e, 0x00abc3308410deeb, 0x005dc6fc1de39904, 0x00907c1c496f36b4,
1305
            0x0008e6ad3926cbe1, 0x00110747b787928c, 0x0000021b9162eb7e },
1306
        { 1, 0, 0, 0, 0, 0, 0 } },
1307
    { { 0x008180042cfa26e1, 0x007b826a96254967, 0x0082473694d6b194, 0x007bd6880a45b589,
1308
          0x00c0a5097072d1a3, 0x0019186555e18b4e, 0x000020278190e5ca },
1309
        { 0x00b4bef17de61ac0, 0x009535e3c38ed348, 0x002d4aa8e468ceab, 0x00ef40b431036ad3,
1310
            0x00defd52f4542857, 0x0086edbf98234266, 0x00002025b3a7814d },
1311
        { 1, 0, 0, 0, 0, 0, 0 } },
1312
    { { 0x00b238aa97b886be, 0x00ef3192d6dd3a32, 0x0079f9e01fd62df8, 0x00742e890daba6c5,
1313
          0x008e5289144408ce, 0x0073bbcc8e0171a5, 0x0000c4fd329d3b52 },
1314
        { 0x00c6f64a15ee23e7, 0x00dcfb7b171cad8b, 0x00039f6cbd805867, 0x00de024e428d4562,
1315
            0x00be6a594d7c64c5, 0x0078467b70dbcd64, 0x0000251f2ed7079b },
1316
        { 1, 0, 0, 0, 0, 0, 0 } },
1317
    { { 0x000e5cc25fc4b872, 0x005ebf10d31ef4e1, 0x0061e0ebd11e8256, 0x0076e026096f5a27,
1318
          0x0013e6fc44662e9a, 0x0042b00289d3597e, 0x000024f089170d88 },
1319
        { 0x001604d7e0effbe6, 0x0048d77cba64ec2c, 0x008166b16da19e36, 0x006b0d1a0f28c088,
1320
            0x000259fcd47754fd, 0x00cc643e4d725f9a, 0x00007b10f3c79c14 },
1321
        { 1, 0, 0, 0, 0, 0, 0 } },
1322
    { { 0x00430155e3b908af, 0x00b801e4fec25226, 0x00b0d4bcfe806d26, 0x009fc4014eb13d37,
1323
          0x0066c94e44ec07e8, 0x00d16adc03874ba2, 0x000030c917a0d2a7 },
1324
        { 0x00edac9e21eb891c, 0x00ef0fb768102eff, 0x00c088cef272a5f3, 0x00cbf782134e2964,
1325
            0x0001044a7ba9a0e3, 0x00e363f5b194cf3c, 0x00009ce85249e372 },
1326
        { 1, 0, 0, 0, 0, 0, 0 } },
1327
    { { 0x001dd492dda5a7eb, 0x008fd577be539fd1, 0x002ff4b25a5fc3f1, 0x0074a8a1b64df72f,
1328
          0x002ba3d8c204a76c, 0x009d5cff95c8235a, 0x0000e014b9406e0f },
1329
        { 0x008c2e4dbfc98aba, 0x00f30bb89f1a1436, 0x00b46f7aea3e259c, 0x009224454ac02f54,
1330
            0x00906401f5645fa2, 0x003a1d1940eabc77, 0x00007c9351d680e6 },
1331
        { 1, 0, 0, 0, 0, 0, 0 } },
1332
    { { 0x005a35d872ef967c, 0x0049f1b7884e1987, 0x0059d46d7e31f552, 0x00ceb4869d2d0fb6,
1333
          0x00e8e89eee56802a, 0x0049d806a774aaf2, 0x0000147e2af0ae24 },
1334
        { 0x005fd1bd852c6e5e, 0x00b674b7b3de6885, 0x003b9ea5eb9b6c08, 0x005c9f03babf3ef7,
1335
            0x00605337fecab3c7, 0x009a3f85b11bbcc8, 0x0000455470f330ec },
1336
        { 1, 0, 0, 0, 0, 0, 0 } },
1337
    { { 0x002197ff4d55498d, 0x00383e8916c2d8af, 0x00eb203f34d1c6d2, 0x0080367cbd11b542,
1338
          0x00769b3be864e4f5, 0x0081a8458521c7bb, 0x0000c531b34d3539 },
1339
        { 0x00e2a3d775fa2e13, 0x00534fc379573844, 0x00ff237d2a8db54a, 0x00d301b2335a8882,
1340
            0x000f75ea96103a80, 0x0018fecb3cdd96fa, 0x0000304bf61e94eb },
1341
        { 1, 0, 0, 0, 0, 0, 0 } },
1342
    { { 0x00b2afc332a73dbd, 0x0029a0d5bb007bc5, 0x002d628eb210f577, 0x009f59a36dd05f50,
1343
          0x006d339de4eca613, 0x00c75a71addc86bc, 0x000060384c5ea93c },
1344
        { 0x00aa9641c32a30b4, 0x00cc73ae8cce565d, 0x00ec911a4df07f61, 0x00aa4b762ea4b264,
1345
            0x0096d395bb393629, 0x004efacfb7632fe0, 0x00006f252f46fa3f },
1346
        { 1, 0, 0, 0, 0, 0, 0 } },
1347
    { { 0x00567eec597c7af6, 0x0059ba6795204413, 0x00816d4e6f01196f, 0x004ae6b3eb57951d,
1348
          0x00420f5abdda2108, 0x003401d1f57ca9d9, 0x0000cf5837b0b67a },
1349
        { 0x00eaa64b8aeeabf9, 0x00246ddf16bcb4de, 0x000e7e3c3aecd751, 0x0008449f04fed72e,
1350
            0x00307b67ccf09183, 0x0017108c3556b7b1, 0x0000229b2483b3bf },
1351
        { 1, 0, 0, 0, 0, 0, 0 } },
1352
    { { 0x00e7c491a7bb78a1, 0x00eafddd1d3049ab, 0x00352c05e2bc7c98, 0x003d6880c165fa5c,
1353
          0x00b6ac61cc11c97d, 0x00beeb54fcf90ce5, 0x0000dc1f0b455edc },
1354
        { 0x002db2e7aee34d60, 0x0073b5f415a2d8c0, 0x00dd84e4193e9a0c, 0x00d02d873467c572,
1355
            0x0018baaeda60aee5, 0x0013fb11d697c61e, 0x000083aafcc3a973 },
1356
        { 1, 0, 0, 0, 0, 0, 0 } }
1357
};
1358
1359
/*
1360
 * select_point selects the |idx|th point from a precomputation table and
1361
 * copies it to out.
1362
 *
1363
 * pre_comp below is of the size provided in |size|.
1364
 */
1365
static void select_point(const limb idx, unsigned int size,
1366
    const felem pre_comp[][3], felem out[3])
1367
0
{
1368
0
    unsigned int i, j;
1369
0
    limb *outlimbs = &out[0][0];
1370
1371
0
    memset(out, 0, sizeof(*out) * 3);
1372
1373
0
    for (i = 0; i < size; i++) {
1374
0
        const limb *inlimbs = &pre_comp[i][0][0];
1375
0
        limb mask = i ^ idx;
1376
1377
0
        mask |= mask >> 4;
1378
0
        mask |= mask >> 2;
1379
0
        mask |= mask >> 1;
1380
0
        mask &= 1;
1381
0
        mask--;
1382
0
        for (j = 0; j < NLIMBS * 3; j++)
1383
0
            outlimbs[j] |= inlimbs[j] & mask;
1384
0
    }
1385
0
}
1386
1387
/* get_bit returns the |i|th bit in |in| */
1388
static char get_bit(const felem_bytearray in, int i)
1389
0
{
1390
0
    if (i < 0 || i >= 384)
1391
0
        return 0;
1392
0
    return (in[i >> 3] >> (i & 7)) & 1;
1393
0
}
1394
1395
/*
1396
 * Interleaved point multiplication using precomputed point multiples: The
1397
 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1398
 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1399
 * generator, using certain (large) precomputed multiples in g_pre_comp.
1400
 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1401
 */
1402
static void batch_mul(felem x_out, felem y_out, felem z_out,
1403
    const felem_bytearray scalars[],
1404
    const unsigned int num_points, const uint8_t *g_scalar,
1405
    const int mixed, const felem pre_comp[][17][3],
1406
    const felem g_pre_comp[16][3])
1407
0
{
1408
0
    int i, skip;
1409
0
    unsigned int num, gen_mul = (g_scalar != NULL);
1410
0
    felem nq[3], tmp[4];
1411
0
    limb bits;
1412
0
    uint8_t sign, digit;
1413
1414
    /* set nq to the point at infinity */
1415
0
    memset(nq, 0, sizeof(nq));
1416
1417
    /*
1418
     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1419
     * of the generator (last quarter of rounds) and additions of other
1420
     * points multiples (every 5th round).
1421
     */
1422
0
    skip = 1; /* save two point operations in the first
1423
               * round */
1424
0
    for (i = (num_points ? 380 : 98); i >= 0; --i) {
1425
        /* double */
1426
0
        if (!skip)
1427
0
            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1428
1429
        /* add multiples of the generator */
1430
0
        if (gen_mul && (i <= 98)) {
1431
0
            bits = get_bit(g_scalar, i + 285) << 3;
1432
0
            if (i < 95) {
1433
0
                bits |= get_bit(g_scalar, i + 190) << 2;
1434
0
                bits |= get_bit(g_scalar, i + 95) << 1;
1435
0
                bits |= get_bit(g_scalar, i);
1436
0
            }
1437
            /* select the point to add, in constant time */
1438
0
            select_point(bits, 16, g_pre_comp, tmp);
1439
0
            if (!skip) {
1440
                /* The 1 argument below is for "mixed" */
1441
0
                point_add(nq[0], nq[1], nq[2],
1442
0
                    nq[0], nq[1], nq[2], 1,
1443
0
                    tmp[0], tmp[1], tmp[2]);
1444
0
            } else {
1445
0
                memcpy(nq, tmp, 3 * sizeof(felem));
1446
0
                skip = 0;
1447
0
            }
1448
0
        }
1449
1450
        /* do other additions every 5 doublings */
1451
0
        if (num_points && (i % 5 == 0)) {
1452
            /* loop over all scalars */
1453
0
            for (num = 0; num < num_points; ++num) {
1454
0
                bits = get_bit(scalars[num], i + 4) << 5;
1455
0
                bits |= get_bit(scalars[num], i + 3) << 4;
1456
0
                bits |= get_bit(scalars[num], i + 2) << 3;
1457
0
                bits |= get_bit(scalars[num], i + 1) << 2;
1458
0
                bits |= get_bit(scalars[num], i) << 1;
1459
0
                bits |= get_bit(scalars[num], i - 1);
1460
0
                ossl_ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1461
1462
                /*
1463
                 * select the point to add or subtract, in constant time
1464
                 */
1465
0
                select_point(digit, 17, pre_comp[num], tmp);
1466
0
                felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1467
                                            * point */
1468
0
                copy_conditional(tmp[1], tmp[3], (-(limb)sign));
1469
1470
0
                if (!skip) {
1471
0
                    point_add(nq[0], nq[1], nq[2],
1472
0
                        nq[0], nq[1], nq[2], mixed,
1473
0
                        tmp[0], tmp[1], tmp[2]);
1474
0
                } else {
1475
0
                    memcpy(nq, tmp, 3 * sizeof(felem));
1476
0
                    skip = 0;
1477
0
                }
1478
0
            }
1479
0
        }
1480
0
    }
1481
0
    felem_assign(x_out, nq[0]);
1482
0
    felem_assign(y_out, nq[1]);
1483
0
    felem_assign(z_out, nq[2]);
1484
0
}
1485
1486
/* Precomputation for the group generator. */
1487
struct nistp384_pre_comp_st {
1488
    felem g_pre_comp[16][3];
1489
    CRYPTO_REF_COUNT references;
1490
};
1491
1492
const EC_METHOD *ossl_ec_GFp_nistp384_method(void)
1493
0
{
1494
0
    static const EC_METHOD ret = {
1495
0
        EC_FLAGS_DEFAULT_OCT,
1496
0
        NID_X9_62_prime_field,
1497
0
        ossl_ec_GFp_nistp384_group_init,
1498
0
        ossl_ec_GFp_simple_group_finish,
1499
0
        ossl_ec_GFp_simple_group_clear_finish,
1500
0
        ossl_ec_GFp_nist_group_copy,
1501
0
        ossl_ec_GFp_nistp384_group_set_curve,
1502
0
        ossl_ec_GFp_simple_group_get_curve,
1503
0
        ossl_ec_GFp_simple_group_get_degree,
1504
0
        ossl_ec_group_simple_order_bits,
1505
0
        ossl_ec_GFp_simple_group_check_discriminant,
1506
0
        ossl_ec_GFp_simple_point_init,
1507
0
        ossl_ec_GFp_simple_point_finish,
1508
0
        ossl_ec_GFp_simple_point_clear_finish,
1509
0
        ossl_ec_GFp_simple_point_copy,
1510
0
        ossl_ec_GFp_simple_point_set_to_infinity,
1511
0
        ossl_ec_GFp_simple_point_set_affine_coordinates,
1512
0
        ossl_ec_GFp_nistp384_point_get_affine_coordinates,
1513
0
        0, /* point_set_compressed_coordinates */
1514
0
        0, /* point2oct */
1515
0
        0, /* oct2point */
1516
0
        ossl_ec_GFp_simple_add,
1517
0
        ossl_ec_GFp_simple_dbl,
1518
0
        ossl_ec_GFp_simple_invert,
1519
0
        ossl_ec_GFp_simple_is_at_infinity,
1520
0
        ossl_ec_GFp_simple_is_on_curve,
1521
0
        ossl_ec_GFp_simple_cmp,
1522
0
        ossl_ec_GFp_simple_make_affine,
1523
0
        ossl_ec_GFp_simple_points_make_affine,
1524
0
        ossl_ec_GFp_nistp384_points_mul,
1525
0
        ossl_ec_GFp_nistp384_precompute_mult,
1526
0
        ossl_ec_GFp_nistp384_have_precompute_mult,
1527
0
        ossl_ec_GFp_nist_field_mul,
1528
0
        ossl_ec_GFp_nist_field_sqr,
1529
0
        0, /* field_div */
1530
0
        ossl_ec_GFp_simple_field_inv,
1531
0
        0, /* field_encode */
1532
0
        0, /* field_decode */
1533
0
        0, /* field_set_to_one */
1534
0
        ossl_ec_key_simple_priv2oct,
1535
0
        ossl_ec_key_simple_oct2priv,
1536
0
        0, /* set private */
1537
0
        ossl_ec_key_simple_generate_key,
1538
0
        ossl_ec_key_simple_check_key,
1539
0
        ossl_ec_key_simple_generate_public_key,
1540
0
        0, /* keycopy */
1541
0
        0, /* keyfinish */
1542
0
        ossl_ecdh_simple_compute_key,
1543
0
        ossl_ecdsa_simple_sign_setup,
1544
0
        ossl_ecdsa_simple_sign_sig,
1545
0
        ossl_ecdsa_simple_verify_sig,
1546
0
        0, /* field_inverse_mod_ord */
1547
0
        0, /* blind_coordinates */
1548
0
        0, /* ladder_pre */
1549
0
        0, /* ladder_step */
1550
0
        0 /* ladder_post */
1551
0
    };
1552
1553
0
    return &ret;
1554
0
}
1555
1556
/******************************************************************************/
1557
/*
1558
 * FUNCTIONS TO MANAGE PRECOMPUTATION
1559
 */
1560
1561
static NISTP384_PRE_COMP *nistp384_pre_comp_new(void)
1562
0
{
1563
0
    NISTP384_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1564
1565
0
    if (ret == NULL)
1566
0
        return ret;
1567
1568
0
    if (!CRYPTO_NEW_REF(&ret->references, 1)) {
1569
0
        OPENSSL_free(ret);
1570
0
        return NULL;
1571
0
    }
1572
0
    return ret;
1573
0
}
1574
1575
NISTP384_PRE_COMP *ossl_ec_nistp384_pre_comp_dup(NISTP384_PRE_COMP *p)
1576
0
{
1577
0
    int i;
1578
1579
0
    if (p != NULL)
1580
0
        CRYPTO_UP_REF(&p->references, &i);
1581
0
    return p;
1582
0
}
1583
1584
void ossl_ec_nistp384_pre_comp_free(NISTP384_PRE_COMP *p)
1585
0
{
1586
0
    int i;
1587
1588
0
    if (p == NULL)
1589
0
        return;
1590
1591
0
    CRYPTO_DOWN_REF(&p->references, &i);
1592
0
    REF_PRINT_COUNT("ossl_ec_nistp384", i, p);
1593
0
    if (i > 0)
1594
0
        return;
1595
0
    REF_ASSERT_ISNT(i < 0);
1596
1597
0
    CRYPTO_FREE_REF(&p->references);
1598
0
    OPENSSL_free(p);
1599
0
}
1600
1601
/******************************************************************************/
1602
/*
1603
 * OPENSSL EC_METHOD FUNCTIONS
1604
 */
1605
1606
int ossl_ec_GFp_nistp384_group_init(EC_GROUP *group)
1607
0
{
1608
0
    int ret;
1609
1610
0
    ret = ossl_ec_GFp_simple_group_init(group);
1611
0
    group->a_is_minus3 = 1;
1612
0
    return ret;
1613
0
}
1614
1615
int ossl_ec_GFp_nistp384_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1616
    const BIGNUM *a, const BIGNUM *b,
1617
    BN_CTX *ctx)
1618
0
{
1619
0
    int ret = 0;
1620
0
    BIGNUM *curve_p, *curve_a, *curve_b;
1621
0
#ifndef FIPS_MODULE
1622
0
    BN_CTX *new_ctx = NULL;
1623
1624
0
    if (ctx == NULL)
1625
0
        ctx = new_ctx = BN_CTX_new();
1626
0
#endif
1627
0
    if (ctx == NULL)
1628
0
        return 0;
1629
1630
0
    BN_CTX_start(ctx);
1631
0
    curve_p = BN_CTX_get(ctx);
1632
0
    curve_a = BN_CTX_get(ctx);
1633
0
    curve_b = BN_CTX_get(ctx);
1634
0
    if (curve_b == NULL)
1635
0
        goto err;
1636
0
    BN_bin2bn(nistp384_curve_params[0], sizeof(felem_bytearray), curve_p);
1637
0
    BN_bin2bn(nistp384_curve_params[1], sizeof(felem_bytearray), curve_a);
1638
0
    BN_bin2bn(nistp384_curve_params[2], sizeof(felem_bytearray), curve_b);
1639
0
    if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1640
0
        ERR_raise(ERR_LIB_EC, EC_R_WRONG_CURVE_PARAMETERS);
1641
0
        goto err;
1642
0
    }
1643
0
    group->field_mod_func = BN_nist_mod_384;
1644
0
    ret = ossl_ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1645
0
err:
1646
0
    BN_CTX_end(ctx);
1647
0
#ifndef FIPS_MODULE
1648
0
    BN_CTX_free(new_ctx);
1649
0
#endif
1650
0
    return ret;
1651
0
}
1652
1653
/*
1654
 * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1655
 * (X/Z^2, Y/Z^3)
1656
 */
1657
int ossl_ec_GFp_nistp384_point_get_affine_coordinates(const EC_GROUP *group,
1658
    const EC_POINT *point,
1659
    BIGNUM *x, BIGNUM *y,
1660
    BN_CTX *ctx)
1661
0
{
1662
0
    felem z1, z2, x_in, y_in, x_out, y_out;
1663
0
    widefelem tmp;
1664
1665
0
    if (EC_POINT_is_at_infinity(group, point)) {
1666
0
        ERR_raise(ERR_LIB_EC, EC_R_POINT_AT_INFINITY);
1667
0
        return 0;
1668
0
    }
1669
0
    if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) || (!BN_to_felem(z1, point->Z)))
1670
0
        return 0;
1671
0
    felem_inv(z2, z1);
1672
0
    felem_square(tmp, z2);
1673
0
    felem_reduce(z1, tmp);
1674
0
    felem_mul(tmp, x_in, z1);
1675
0
    felem_reduce(x_in, tmp);
1676
0
    felem_contract(x_out, x_in);
1677
0
    if (x != NULL) {
1678
0
        if (!felem_to_BN(x, x_out)) {
1679
0
            ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1680
0
            return 0;
1681
0
        }
1682
0
    }
1683
0
    felem_mul(tmp, z1, z2);
1684
0
    felem_reduce(z1, tmp);
1685
0
    felem_mul(tmp, y_in, z1);
1686
0
    felem_reduce(y_in, tmp);
1687
0
    felem_contract(y_out, y_in);
1688
0
    if (y != NULL) {
1689
0
        if (!felem_to_BN(y, y_out)) {
1690
0
            ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1691
0
            return 0;
1692
0
        }
1693
0
    }
1694
0
    return 1;
1695
0
}
1696
1697
/* points below is of size |num|, and tmp_felems is of size |num+1/ */
1698
static void make_points_affine(size_t num, felem points[][3],
1699
    felem tmp_felems[])
1700
0
{
1701
    /*
1702
     * Runs in constant time, unless an input is the point at infinity (which
1703
     * normally shouldn't happen).
1704
     */
1705
0
    ossl_ec_GFp_nistp_points_make_affine_internal(num,
1706
0
        points,
1707
0
        sizeof(felem),
1708
0
        tmp_felems,
1709
0
        (void (*)(void *))felem_one,
1710
0
        felem_is_zero_int,
1711
0
        (void (*)(void *, const void *))
1712
0
            felem_assign,
1713
0
        (void (*)(void *, const void *))
1714
0
            felem_square_reduce,
1715
0
        (void (*)(void *, const void *, const void *))
1716
0
            felem_mul_reduce,
1717
0
        (void (*)(void *, const void *))
1718
0
            felem_inv,
1719
0
        (void (*)(void *, const void *))
1720
0
            felem_contract);
1721
0
}
1722
1723
/*
1724
 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1725
 * values Result is stored in r (r can equal one of the inputs).
1726
 */
1727
int ossl_ec_GFp_nistp384_points_mul(const EC_GROUP *group, EC_POINT *r,
1728
    const BIGNUM *scalar, size_t num,
1729
    const EC_POINT *points[],
1730
    const BIGNUM *scalars[], BN_CTX *ctx)
1731
0
{
1732
0
    int ret = 0;
1733
0
    int j;
1734
0
    int mixed = 0;
1735
0
    BIGNUM *x, *y, *z, *tmp_scalar;
1736
0
    felem_bytearray g_secret;
1737
0
    felem_bytearray *secrets = NULL;
1738
0
    felem(*pre_comp)[17][3] = NULL;
1739
0
    felem *tmp_felems = NULL;
1740
0
    unsigned int i;
1741
0
    int num_bytes;
1742
0
    int have_pre_comp = 0;
1743
0
    size_t num_points = num;
1744
0
    felem x_in, y_in, z_in, x_out, y_out, z_out;
1745
0
    NISTP384_PRE_COMP *pre = NULL;
1746
0
    felem(*g_pre_comp)[3] = NULL;
1747
0
    EC_POINT *generator = NULL;
1748
0
    const EC_POINT *p = NULL;
1749
0
    const BIGNUM *p_scalar = NULL;
1750
1751
0
    BN_CTX_start(ctx);
1752
0
    x = BN_CTX_get(ctx);
1753
0
    y = BN_CTX_get(ctx);
1754
0
    z = BN_CTX_get(ctx);
1755
0
    tmp_scalar = BN_CTX_get(ctx);
1756
0
    if (tmp_scalar == NULL)
1757
0
        goto err;
1758
1759
0
    if (scalar != NULL) {
1760
0
        pre = group->pre_comp.nistp384;
1761
0
        if (pre)
1762
            /* we have precomputation, try to use it */
1763
0
            g_pre_comp = &pre->g_pre_comp[0];
1764
0
        else
1765
            /* try to use the standard precomputation */
1766
0
            g_pre_comp = (felem(*)[3])gmul;
1767
0
        generator = EC_POINT_new(group);
1768
0
        if (generator == NULL)
1769
0
            goto err;
1770
        /* get the generator from precomputation */
1771
0
        if (!felem_to_BN(x, g_pre_comp[1][0]) || !felem_to_BN(y, g_pre_comp[1][1]) || !felem_to_BN(z, g_pre_comp[1][2])) {
1772
0
            ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1773
0
            goto err;
1774
0
        }
1775
0
        if (!ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group,
1776
0
                generator,
1777
0
                x, y, z, ctx))
1778
0
            goto err;
1779
0
        if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1780
            /* precomputation matches generator */
1781
0
            have_pre_comp = 1;
1782
0
        else
1783
            /*
1784
             * we don't have valid precomputation: treat the generator as a
1785
             * random point
1786
             */
1787
0
            num_points++;
1788
0
    }
1789
1790
0
    if (num_points > 0) {
1791
0
        if (num_points >= 2) {
1792
            /*
1793
             * unless we precompute multiples for just one point, converting
1794
             * those into affine form is time well spent
1795
             */
1796
0
            mixed = 1;
1797
0
        }
1798
0
        secrets = OPENSSL_calloc(num_points, sizeof(*secrets));
1799
0
        pre_comp = OPENSSL_calloc(num_points, sizeof(*pre_comp));
1800
0
        if (mixed)
1801
0
            tmp_felems = OPENSSL_malloc_array(num_points * 17 + 1, sizeof(*tmp_felems));
1802
0
        if ((secrets == NULL) || (pre_comp == NULL)
1803
0
            || (mixed && (tmp_felems == NULL)))
1804
0
            goto err;
1805
1806
        /*
1807
         * we treat NULL scalars as 0, and NULL points as points at infinity,
1808
         * i.e., they contribute nothing to the linear combination
1809
         */
1810
0
        for (i = 0; i < num_points; ++i) {
1811
0
            if (i == num) {
1812
                /*
1813
                 * we didn't have a valid precomputation, so we pick the
1814
                 * generator
1815
                 */
1816
0
                p = EC_GROUP_get0_generator(group);
1817
0
                p_scalar = scalar;
1818
0
            } else {
1819
                /* the i^th point */
1820
0
                p = points[i];
1821
0
                p_scalar = scalars[i];
1822
0
            }
1823
0
            if (p_scalar != NULL && p != NULL) {
1824
                /* reduce scalar to 0 <= scalar < 2^384 */
1825
0
                if ((BN_num_bits(p_scalar) > 384)
1826
0
                    || (BN_is_negative(p_scalar))) {
1827
                    /*
1828
                     * this is an unusual input, and we don't guarantee
1829
                     * constant-timeness
1830
                     */
1831
0
                    if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1832
0
                        ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1833
0
                        goto err;
1834
0
                    }
1835
0
                    num_bytes = BN_bn2lebinpad(tmp_scalar,
1836
0
                        secrets[i], sizeof(secrets[i]));
1837
0
                } else {
1838
0
                    num_bytes = BN_bn2lebinpad(p_scalar,
1839
0
                        secrets[i], sizeof(secrets[i]));
1840
0
                }
1841
0
                if (num_bytes < 0) {
1842
0
                    ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1843
0
                    goto err;
1844
0
                }
1845
                /* precompute multiples */
1846
0
                if ((!BN_to_felem(x_out, p->X)) || (!BN_to_felem(y_out, p->Y)) || (!BN_to_felem(z_out, p->Z)))
1847
0
                    goto err;
1848
0
                memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1849
0
                memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1850
0
                memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1851
0
                for (j = 2; j <= 16; ++j) {
1852
0
                    if (j & 1) {
1853
0
                        point_add(pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
1854
0
                            pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2], 0,
1855
0
                            pre_comp[i][j - 1][0], pre_comp[i][j - 1][1], pre_comp[i][j - 1][2]);
1856
0
                    } else {
1857
0
                        point_double(pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
1858
0
                            pre_comp[i][j / 2][0], pre_comp[i][j / 2][1], pre_comp[i][j / 2][2]);
1859
0
                    }
1860
0
                }
1861
0
            }
1862
0
        }
1863
0
        if (mixed)
1864
0
            make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1865
0
    }
1866
1867
    /* the scalar for the generator */
1868
0
    if (scalar != NULL && have_pre_comp) {
1869
0
        memset(g_secret, 0, sizeof(g_secret));
1870
        /* reduce scalar to 0 <= scalar < 2^384 */
1871
0
        if ((BN_num_bits(scalar) > 384) || (BN_is_negative(scalar))) {
1872
            /*
1873
             * this is an unusual input, and we don't guarantee
1874
             * constant-timeness
1875
             */
1876
0
            if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
1877
0
                ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1878
0
                goto err;
1879
0
            }
1880
0
            num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
1881
0
        } else {
1882
0
            num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
1883
0
        }
1884
        /* do the multiplication with generator precomputation */
1885
0
        batch_mul(x_out, y_out, z_out,
1886
0
            (const felem_bytearray(*))secrets, num_points,
1887
0
            g_secret,
1888
0
            mixed, (const felem(*)[17][3])pre_comp,
1889
0
            (const felem(*)[3])g_pre_comp);
1890
0
    } else {
1891
        /* do the multiplication without generator precomputation */
1892
0
        batch_mul(x_out, y_out, z_out,
1893
0
            (const felem_bytearray(*))secrets, num_points,
1894
0
            NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1895
0
    }
1896
    /* reduce the output to its unique minimal representation */
1897
0
    felem_contract(x_in, x_out);
1898
0
    felem_contract(y_in, y_out);
1899
0
    felem_contract(z_in, z_out);
1900
0
    if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) || (!felem_to_BN(z, z_in))) {
1901
0
        ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1902
0
        goto err;
1903
0
    }
1904
0
    ret = ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group, r, x, y, z,
1905
0
        ctx);
1906
1907
0
err:
1908
0
    BN_CTX_end(ctx);
1909
0
    EC_POINT_free(generator);
1910
0
    OPENSSL_free(secrets);
1911
0
    OPENSSL_free(pre_comp);
1912
0
    OPENSSL_free(tmp_felems);
1913
0
    return ret;
1914
0
}
1915
1916
int ossl_ec_GFp_nistp384_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
1917
0
{
1918
0
    int ret = 0;
1919
0
    NISTP384_PRE_COMP *pre = NULL;
1920
0
    int i, j;
1921
0
    BIGNUM *x, *y;
1922
0
    EC_POINT *generator = NULL;
1923
0
    felem tmp_felems[16];
1924
0
#ifndef FIPS_MODULE
1925
0
    BN_CTX *new_ctx = NULL;
1926
0
#endif
1927
1928
    /* throw away old precomputation */
1929
0
    EC_pre_comp_free(group);
1930
1931
0
#ifndef FIPS_MODULE
1932
0
    if (ctx == NULL)
1933
0
        ctx = new_ctx = BN_CTX_new();
1934
0
#endif
1935
0
    if (ctx == NULL)
1936
0
        return 0;
1937
1938
0
    BN_CTX_start(ctx);
1939
0
    x = BN_CTX_get(ctx);
1940
0
    y = BN_CTX_get(ctx);
1941
0
    if (y == NULL)
1942
0
        goto err;
1943
    /* get the generator */
1944
0
    if (group->generator == NULL)
1945
0
        goto err;
1946
0
    generator = EC_POINT_new(group);
1947
0
    if (generator == NULL)
1948
0
        goto err;
1949
0
    BN_bin2bn(nistp384_curve_params[3], sizeof(felem_bytearray), x);
1950
0
    BN_bin2bn(nistp384_curve_params[4], sizeof(felem_bytearray), y);
1951
0
    if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
1952
0
        goto err;
1953
0
    if ((pre = nistp384_pre_comp_new()) == NULL)
1954
0
        goto err;
1955
    /*
1956
     * if the generator is the standard one, use built-in precomputation
1957
     */
1958
0
    if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
1959
0
        memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
1960
0
        goto done;
1961
0
    }
1962
0
    if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) || (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) || (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
1963
0
        goto err;
1964
    /* compute 2^95*G, 2^190*G, 2^285*G */
1965
0
    for (i = 1; i <= 4; i <<= 1) {
1966
0
        point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2],
1967
0
            pre->g_pre_comp[i][0], pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
1968
0
        for (j = 0; j < 94; ++j) {
1969
0
            point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2],
1970
0
                pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2]);
1971
0
        }
1972
0
    }
1973
    /* g_pre_comp[0] is the point at infinity */
1974
0
    memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
1975
    /* the remaining multiples */
1976
    /* 2^95*G + 2^190*G */
1977
0
    point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1], pre->g_pre_comp[6][2],
1978
0
        pre->g_pre_comp[4][0], pre->g_pre_comp[4][1], pre->g_pre_comp[4][2], 0,
1979
0
        pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], pre->g_pre_comp[2][2]);
1980
    /* 2^95*G + 2^285*G */
1981
0
    point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1], pre->g_pre_comp[10][2],
1982
0
        pre->g_pre_comp[8][0], pre->g_pre_comp[8][1], pre->g_pre_comp[8][2], 0,
1983
0
        pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], pre->g_pre_comp[2][2]);
1984
    /* 2^190*G + 2^285*G */
1985
0
    point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
1986
0
        pre->g_pre_comp[8][0], pre->g_pre_comp[8][1], pre->g_pre_comp[8][2], 0,
1987
0
        pre->g_pre_comp[4][0], pre->g_pre_comp[4][1], pre->g_pre_comp[4][2]);
1988
    /* 2^95*G + 2^190*G + 2^285*G */
1989
0
    point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1], pre->g_pre_comp[14][2],
1990
0
        pre->g_pre_comp[12][0], pre->g_pre_comp[12][1], pre->g_pre_comp[12][2], 0,
1991
0
        pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], pre->g_pre_comp[2][2]);
1992
0
    for (i = 1; i < 8; ++i) {
1993
        /* odd multiples: add G */
1994
0
        point_add(pre->g_pre_comp[2 * i + 1][0], pre->g_pre_comp[2 * i + 1][1], pre->g_pre_comp[2 * i + 1][2],
1995
0
            pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
1996
0
            pre->g_pre_comp[1][0], pre->g_pre_comp[1][1], pre->g_pre_comp[1][2]);
1997
0
    }
1998
0
    make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
1999
2000
0
done:
2001
0
    SETPRECOMP(group, nistp384, pre);
2002
0
    ret = 1;
2003
0
    pre = NULL;
2004
0
err:
2005
0
    BN_CTX_end(ctx);
2006
0
    EC_POINT_free(generator);
2007
0
#ifndef FIPS_MODULE
2008
0
    BN_CTX_free(new_ctx);
2009
0
#endif
2010
0
    ossl_ec_nistp384_pre_comp_free(pre);
2011
0
    return ret;
2012
0
}
2013
2014
int ossl_ec_GFp_nistp384_have_precompute_mult(const EC_GROUP *group)
2015
0
{
2016
    return HAVEPRECOMP(group, nistp384);
2017
0
}