Coverage Report

Created: 2024-07-27 06:37

/src/openssl/crypto/ec/ecp_nistp384.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
3
 *
4
 * Licensed under the Apache License 2.0 (the "License").  You may not use
5
 * this file except in compliance with the License.  You can obtain a copy
6
 * in the file LICENSE in the source distribution or at
7
 * https://www.openssl.org/source/license.html
8
 */
9
10
/* Copyright 2023 IBM Corp.
11
 *
12
 * Licensed under the Apache License, Version 2.0 (the "License");
13
 *
14
 * you may not use this file except in compliance with the License.
15
 * You may obtain a copy of the License at
16
 *
17
 *     http://www.apache.org/licenses/LICENSE-2.0
18
 *
19
 *  Unless required by applicable law or agreed to in writing, software
20
 *  distributed under the License is distributed on an "AS IS" BASIS,
21
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22
 *  See the License for the specific language governing permissions and
23
 *  limitations under the License.
24
 */
25
26
/*
27
 * Designed for 56-bit limbs by Rohan McLure <rohan.mclure@linux.ibm.com>.
28
 * The layout is based on that of ecp_nistp{224,521}.c, allowing even for asm
29
 * acceleration of felem_{square,mul} as supported in these files.
30
 */
31
32
#include <openssl/e_os2.h>
33
34
#include <string.h>
35
#include <openssl/err.h>
36
#include "ec_local.h"
37
38
#include "internal/numbers.h"
39
40
#ifndef INT128_MAX
41
# error "Your compiler doesn't appear to support 128-bit integer types"
42
#endif
43
44
typedef uint8_t u8;
45
typedef uint64_t u64;
46
47
/*
48
 * The underlying field. P384 operates over GF(2^384-2^128-2^96+2^32-1). We
49
 * can serialize an element of this field into 48 bytes. We call this an
50
 * felem_bytearray.
51
 */
52
53
typedef u8 felem_bytearray[48];
54
55
/*
56
 * These are the parameters of P384, taken from FIPS 186-3, section D.1.2.4.
57
 * These values are big-endian.
58
 */
59
static const felem_bytearray nistp384_curve_params[5] = {
60
  {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* p */
61
   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
62
   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
63
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF},
64
  {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* a = -3 */
65
   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
66
   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
67
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFC},
68
  {0xB3, 0x31, 0x2F, 0xA7, 0xE2, 0x3E, 0xE7, 0xE4, 0x98, 0x8E, 0x05, 0x6B, /* b */
69
   0xE3, 0xF8, 0x2D, 0x19, 0x18, 0x1D, 0x9C, 0x6E, 0xFE, 0x81, 0x41, 0x12,
70
   0x03, 0x14, 0x08, 0x8F, 0x50, 0x13, 0x87, 0x5A, 0xC6, 0x56, 0x39, 0x8D,
71
   0x8A, 0x2E, 0xD1, 0x9D, 0x2A, 0x85, 0xC8, 0xED, 0xD3, 0xEC, 0x2A, 0xEF},
72
  {0xAA, 0x87, 0xCA, 0x22, 0xBE, 0x8B, 0x05, 0x37, 0x8E, 0xB1, 0xC7, 0x1E, /* x */
73
   0xF3, 0x20, 0xAD, 0x74, 0x6E, 0x1D, 0x3B, 0x62, 0x8B, 0xA7, 0x9B, 0x98,
74
   0x59, 0xF7, 0x41, 0xE0, 0x82, 0x54, 0x2A, 0x38, 0x55, 0x02, 0xF2, 0x5D,
75
   0xBF, 0x55, 0x29, 0x6C, 0x3A, 0x54, 0x5E, 0x38, 0x72, 0x76, 0x0A, 0xB7},
76
  {0x36, 0x17, 0xDE, 0x4A, 0x96, 0x26, 0x2C, 0x6F, 0x5D, 0x9E, 0x98, 0xBF, /* y */
77
   0x92, 0x92, 0xDC, 0x29, 0xF8, 0xF4, 0x1D, 0xBD, 0x28, 0x9A, 0x14, 0x7C,
78
   0xE9, 0xDA, 0x31, 0x13, 0xB5, 0xF0, 0xB8, 0xC0, 0x0A, 0x60, 0xB1, 0xCE,
79
   0x1D, 0x7E, 0x81, 0x9D, 0x7A, 0x43, 0x1D, 0x7C, 0x90, 0xEA, 0x0E, 0x5F},
80
};
81
82
/*-
83
 * The representation of field elements.
84
 * ------------------------------------
85
 *
86
 * We represent field elements with seven values. These values are either 64 or
87
 * 128 bits and the field element represented is:
88
 *   v[0]*2^0 + v[1]*2^56 + v[2]*2^112 + ... + v[6]*2^336  (mod p)
89
 * Each of the seven values is called a 'limb'. Since the limbs are spaced only
90
 * 56 bits apart, but are greater than 56 bits in length, the most significant
91
 * bits of each limb overlap with the least significant bits of the next
92
 *
93
 * This representation is considered to be 'redundant' in the sense that
94
 * intermediate values can each contain more than a 56-bit value in each limb.
95
 * Reduction causes all but the final limb to be reduced to contain a value less
96
 * than 2^56, with the final value represented allowed to be larger than 2^384,
97
 * inasmuch as we can be sure that arithmetic overflow remains impossible. The
98
 * reduced value must of course be congruent to the unreduced value.
99
 *
100
 * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
101
 * 'widefelem', featuring enough bits to store the result of a multiplication
102
 * and even some further arithmetic without need for immediate reduction.
103
 */
104
105
37.6k
#define NLIMBS 7
106
107
typedef uint64_t limb;
108
typedef uint128_t widelimb;
109
typedef limb limb_aX __attribute((__aligned__(1)));
110
typedef limb felem[NLIMBS];
111
typedef widelimb widefelem[2*NLIMBS-1];
112
113
static const limb bottom56bits = 0xffffffffffffff;
114
115
/* Helper functions (de)serialising reduced field elements in little endian */
116
static void bin48_to_felem(felem out, const u8 in[48])
117
18
{
118
18
    memset(out, 0, 56);
119
18
    out[0] = (*((limb *) & in[0])) & bottom56bits;
120
18
    out[1] = (*((limb_aX *) & in[7])) & bottom56bits;
121
18
    out[2] = (*((limb_aX *) & in[14])) & bottom56bits;
122
18
    out[3] = (*((limb_aX *) & in[21])) & bottom56bits;
123
18
    out[4] = (*((limb_aX *) & in[28])) & bottom56bits;
124
18
    out[5] = (*((limb_aX *) & in[35])) & bottom56bits;
125
18
    memmove(&out[6], &in[42], 6);
126
18
}
127
128
static void felem_to_bin48(u8 out[48], const felem in)
129
12
{
130
12
    memset(out, 0, 48);
131
12
    (*((limb *) & out[0]))     |= (in[0] & bottom56bits);
132
12
    (*((limb_aX *) & out[7]))  |= (in[1] & bottom56bits);
133
12
    (*((limb_aX *) & out[14])) |= (in[2] & bottom56bits);
134
12
    (*((limb_aX *) & out[21])) |= (in[3] & bottom56bits);
135
12
    (*((limb_aX *) & out[28])) |= (in[4] & bottom56bits);
136
12
    (*((limb_aX *) & out[35])) |= (in[5] & bottom56bits);
137
12
    memmove(&out[42], &in[6], 6);
138
12
}
139
140
/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
141
static int BN_to_felem(felem out, const BIGNUM *bn)
142
18
{
143
18
    felem_bytearray b_out;
144
18
    int num_bytes;
145
146
18
    if (BN_is_negative(bn)) {
147
0
        ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
148
0
        return 0;
149
0
    }
150
18
    num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
151
18
    if (num_bytes < 0) {
152
0
        ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
153
0
        return 0;
154
0
    }
155
18
    bin48_to_felem(out, b_out);
156
18
    return 1;
157
18
}
158
159
/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
160
static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
161
12
{
162
12
    felem_bytearray b_out;
163
164
12
    felem_to_bin48(b_out, in);
165
12
    return BN_lebin2bn(b_out, sizeof(b_out), out);
166
12
}
167
168
/*-
169
 * Field operations
170
 * ----------------
171
 */
172
173
static void felem_one(felem out)
174
0
{
175
0
    out[0] = 1;
176
0
    memset(&out[1], 0, sizeof(limb) * (NLIMBS-1));
177
0
}
178
179
static void felem_assign(felem out, const felem in)
180
42
{
181
42
    memcpy(out, in, sizeof(felem));
182
42
}
183
184
/* felem_sum64 sets out = out + in. */
185
static void felem_sum64(felem out, const felem in)
186
0
{
187
0
    unsigned int i;
188
189
0
    for (i = 0; i < NLIMBS; i++)
190
0
        out[i] += in[i];
191
0
}
192
193
/* felem_scalar sets out = in * scalar */
194
static void felem_scalar(felem out, const felem in, limb scalar)
195
2.30k
{
196
2.30k
    unsigned int i;
197
198
18.4k
    for (i = 0; i < NLIMBS; i++)
199
16.1k
        out[i] = in[i] * scalar;
200
2.30k
}
201
202
/* felem_scalar64 sets out = out * scalar */
203
static void felem_scalar64(felem out, limb scalar)
204
0
{
205
0
    unsigned int i;
206
207
0
    for (i = 0; i < NLIMBS; i++)
208
0
        out[i] *= scalar;
209
0
}
210
211
/* felem_scalar128 sets out = out * scalar */
212
static void felem_scalar128(widefelem out, limb scalar)
213
0
{
214
0
    unsigned int i;
215
216
0
    for (i = 0; i < 2*NLIMBS-1; i++)
217
0
        out[i] *= scalar;
218
0
}
219
220
/*-
221
 * felem_neg sets |out| to |-in|
222
 * On entry:
223
 *   in[i] < 2^60 - 2^29
224
 * On exit:
225
 *   out[i] < 2^60
226
 */
227
static void felem_neg(felem out, const felem in)
228
0
{
229
    /*
230
     * In order to prevent underflow, we add a multiple of p before subtracting.
231
     * Use telescopic sums to represent 2^12 * p redundantly with each limb
232
     * of the form 2^60 + ...
233
     */
234
0
    static const limb two60m52m4 = (((limb) 1) << 60)
235
0
                                 - (((limb) 1) << 52)
236
0
                                 - (((limb) 1) << 4);
237
0
    static const limb two60p44m12 = (((limb) 1) << 60)
238
0
                                  + (((limb) 1) << 44)
239
0
                                  - (((limb) 1) << 12);
240
0
    static const limb two60m28m4 = (((limb) 1) << 60)
241
0
                                 - (((limb) 1) << 28)
242
0
                                 - (((limb) 1) << 4);
243
0
    static const limb two60m4 = (((limb) 1) << 60)
244
0
                              - (((limb) 1) << 4);
245
246
0
    out[0] = two60p44m12 - in[0];
247
0
    out[1] = two60m52m4 - in[1];
248
0
    out[2] = two60m28m4 - in[2];
249
0
    out[3] = two60m4 - in[3];
250
0
    out[4] = two60m4 - in[4];
251
0
    out[5] = two60m4 - in[5];
252
0
    out[6] = two60m4 - in[6];
253
0
}
254
255
/*-
256
 * felem_diff64 subtracts |in| from |out|
257
 * On entry:
258
 *   in[i] < 2^60 - 2^52 - 2^4
259
 * On exit:
260
 *   out[i] < out_orig[i] + 2^60 + 2^44
261
 */
262
static void felem_diff64(felem out, const felem in)
263
0
{
264
    /*
265
     * In order to prevent underflow, we add a multiple of p before subtracting.
266
     * Use telescopic sums to represent 2^12 * p redundantly with each limb
267
     * of the form 2^60 + ...
268
     */
269
270
0
    static const limb two60m52m4 = (((limb) 1) << 60)
271
0
                                 - (((limb) 1) << 52)
272
0
                                 - (((limb) 1) << 4);
273
0
    static const limb two60p44m12 = (((limb) 1) << 60)
274
0
                                  + (((limb) 1) << 44)
275
0
                                  - (((limb) 1) << 12);
276
0
    static const limb two60m28m4 = (((limb) 1) << 60)
277
0
                                 - (((limb) 1) << 28)
278
0
                                 - (((limb) 1) << 4);
279
0
    static const limb two60m4 = (((limb) 1) << 60)
280
0
                              - (((limb) 1) << 4);
281
282
0
    out[0] += two60p44m12 - in[0];
283
0
    out[1] += two60m52m4 - in[1];
284
0
    out[2] += two60m28m4 - in[2];
285
0
    out[3] += two60m4 - in[3];
286
0
    out[4] += two60m4 - in[4];
287
0
    out[5] += two60m4 - in[5];
288
0
    out[6] += two60m4 - in[6];
289
0
}
290
291
/*
292
 * in[i] < 2^63
293
 * out[i] < out_orig[i] + 2^64 + 2^48
294
 */
295
static void felem_diff_128_64(widefelem out, const felem in)
296
0
{
297
    /*
298
     * In order to prevent underflow, we add a multiple of p before subtracting.
299
     * Use telescopic sums to represent 2^16 * p redundantly with each limb
300
     * of the form 2^64 + ...
301
     */
302
303
0
    static const widelimb two64m56m8 = (((widelimb) 1) << 64)
304
0
                                     - (((widelimb) 1) << 56)
305
0
                                     - (((widelimb) 1) << 8);
306
0
    static const widelimb two64m32m8 = (((widelimb) 1) << 64)
307
0
                                     - (((widelimb) 1) << 32)
308
0
                                     - (((widelimb) 1) << 8);
309
0
    static const widelimb two64m8 = (((widelimb) 1) << 64)
310
0
                                  - (((widelimb) 1) << 8);
311
0
    static const widelimb two64p48m16 = (((widelimb) 1) << 64)
312
0
                                      + (((widelimb) 1) << 48)
313
0
                                      - (((widelimb) 1) << 16);
314
0
    unsigned int i;
315
316
0
    out[0] += two64p48m16;
317
0
    out[1] += two64m56m8;
318
0
    out[2] += two64m32m8;
319
0
    out[3] += two64m8;
320
0
    out[4] += two64m8;
321
0
    out[5] += two64m8;
322
0
    out[6] += two64m8;
323
324
0
    for (i = 0; i < NLIMBS; i++)
325
0
        out[i] -= in[i];
326
0
}
327
328
/*
329
 * in[i] < 2^127 - 2^119 - 2^71
330
 * out[i] < out_orig[i] + 2^127 + 2^111
331
 */
332
static void felem_diff128(widefelem out, const widefelem in)
333
0
{
334
    /*
335
     * In order to prevent underflow, we add a multiple of p before subtracting.
336
     * Use telescopic sums to represent 2^415 * p redundantly with each limb
337
     * of the form 2^127 + ...
338
     */
339
340
0
    static const widelimb two127 = ((widelimb) 1) << 127;
341
0
    static const widelimb two127m71 = (((widelimb) 1) << 127)
342
0
                                    - (((widelimb) 1) << 71);
343
0
    static const widelimb two127p111m79m71 = (((widelimb) 1) << 127)
344
0
                                           + (((widelimb) 1) << 111)
345
0
                                           - (((widelimb) 1) << 79)
346
0
                                           - (((widelimb) 1) << 71);
347
0
    static const widelimb two127m119m71 = (((widelimb) 1) << 127)
348
0
                                        - (((widelimb) 1) << 119)
349
0
                                        - (((widelimb) 1) << 71);
350
0
    static const widelimb two127m95m71 = (((widelimb) 1) << 127)
351
0
                                       - (((widelimb) 1) << 95)
352
0
                                       - (((widelimb) 1) << 71);
353
0
    unsigned int i;
354
355
0
    out[0]  += two127;
356
0
    out[1]  += two127m71;
357
0
    out[2]  += two127m71;
358
0
    out[3]  += two127m71;
359
0
    out[4]  += two127m71;
360
0
    out[5]  += two127m71;
361
0
    out[6]  += two127p111m79m71;
362
0
    out[7]  += two127m119m71;
363
0
    out[8]  += two127m95m71;
364
0
    out[9]  += two127m71;
365
0
    out[10] += two127m71;
366
0
    out[11] += two127m71;
367
0
    out[12] += two127m71;
368
369
0
    for (i = 0; i < 2*NLIMBS-1; i++)
370
0
        out[i] -= in[i];
371
0
}
372
373
static void felem_square_ref(widefelem out, const felem in)
374
2.30k
{
375
2.30k
    felem inx2;
376
2.30k
    felem_scalar(inx2, in, 2);
377
378
2.30k
    out[0] = ((uint128_t) in[0]) * in[0];
379
380
2.30k
    out[1] = ((uint128_t) in[0]) * inx2[1];
381
382
2.30k
    out[2] = ((uint128_t) in[0]) * inx2[2]
383
2.30k
           + ((uint128_t) in[1]) * in[1];
384
385
2.30k
    out[3] = ((uint128_t) in[0]) * inx2[3]
386
2.30k
           + ((uint128_t) in[1]) * inx2[2];
387
388
2.30k
    out[4] = ((uint128_t) in[0]) * inx2[4]
389
2.30k
           + ((uint128_t) in[1]) * inx2[3]
390
2.30k
           + ((uint128_t) in[2]) * in[2];
391
392
2.30k
    out[5] = ((uint128_t) in[0]) * inx2[5]
393
2.30k
           + ((uint128_t) in[1]) * inx2[4]
394
2.30k
           + ((uint128_t) in[2]) * inx2[3];
395
396
2.30k
    out[6] = ((uint128_t) in[0]) * inx2[6]
397
2.30k
           + ((uint128_t) in[1]) * inx2[5]
398
2.30k
           + ((uint128_t) in[2]) * inx2[4]
399
2.30k
           + ((uint128_t) in[3]) * in[3];
400
401
2.30k
    out[7] = ((uint128_t) in[1]) * inx2[6]
402
2.30k
           + ((uint128_t) in[2]) * inx2[5]
403
2.30k
           + ((uint128_t) in[3]) * inx2[4];
404
405
2.30k
    out[8] = ((uint128_t) in[2]) * inx2[6]
406
2.30k
           + ((uint128_t) in[3]) * inx2[5]
407
2.30k
           + ((uint128_t) in[4]) * in[4];
408
409
2.30k
    out[9] = ((uint128_t) in[3]) * inx2[6]
410
2.30k
           + ((uint128_t) in[4]) * inx2[5];
411
412
2.30k
    out[10] = ((uint128_t) in[4]) * inx2[6]
413
2.30k
            + ((uint128_t) in[5]) * in[5];
414
415
2.30k
    out[11] = ((uint128_t) in[5]) * inx2[6];
416
417
2.30k
    out[12] = ((uint128_t) in[6]) * in[6];
418
2.30k
}
419
420
static void felem_mul_ref(widefelem out, const felem in1, const felem in2)
421
102
{
422
102
    out[0] = ((uint128_t) in1[0]) * in2[0];
423
424
102
    out[1] = ((uint128_t) in1[0]) * in2[1]
425
102
           + ((uint128_t) in1[1]) * in2[0];
426
427
102
    out[2] = ((uint128_t) in1[0]) * in2[2]
428
102
           + ((uint128_t) in1[1]) * in2[1]
429
102
           + ((uint128_t) in1[2]) * in2[0];
430
431
102
    out[3] = ((uint128_t) in1[0]) * in2[3]
432
102
           + ((uint128_t) in1[1]) * in2[2]
433
102
           + ((uint128_t) in1[2]) * in2[1]
434
102
           + ((uint128_t) in1[3]) * in2[0];
435
436
102
    out[4] = ((uint128_t) in1[0]) * in2[4]
437
102
           + ((uint128_t) in1[1]) * in2[3]
438
102
           + ((uint128_t) in1[2]) * in2[2]
439
102
           + ((uint128_t) in1[3]) * in2[1]
440
102
           + ((uint128_t) in1[4]) * in2[0];
441
442
102
    out[5] = ((uint128_t) in1[0]) * in2[5]
443
102
           + ((uint128_t) in1[1]) * in2[4]
444
102
           + ((uint128_t) in1[2]) * in2[3]
445
102
           + ((uint128_t) in1[3]) * in2[2]
446
102
           + ((uint128_t) in1[4]) * in2[1]
447
102
           + ((uint128_t) in1[5]) * in2[0];
448
449
102
    out[6] = ((uint128_t) in1[0]) * in2[6]
450
102
           + ((uint128_t) in1[1]) * in2[5]
451
102
           + ((uint128_t) in1[2]) * in2[4]
452
102
           + ((uint128_t) in1[3]) * in2[3]
453
102
           + ((uint128_t) in1[4]) * in2[2]
454
102
           + ((uint128_t) in1[5]) * in2[1]
455
102
           + ((uint128_t) in1[6]) * in2[0];
456
457
102
    out[7] = ((uint128_t) in1[1]) * in2[6]
458
102
           + ((uint128_t) in1[2]) * in2[5]
459
102
           + ((uint128_t) in1[3]) * in2[4]
460
102
           + ((uint128_t) in1[4]) * in2[3]
461
102
           + ((uint128_t) in1[5]) * in2[2]
462
102
           + ((uint128_t) in1[6]) * in2[1];
463
464
102
    out[8] = ((uint128_t) in1[2]) * in2[6]
465
102
           + ((uint128_t) in1[3]) * in2[5]
466
102
           + ((uint128_t) in1[4]) * in2[4]
467
102
           + ((uint128_t) in1[5]) * in2[3]
468
102
           + ((uint128_t) in1[6]) * in2[2];
469
470
102
    out[9] = ((uint128_t) in1[3]) * in2[6]
471
102
           + ((uint128_t) in1[4]) * in2[5]
472
102
           + ((uint128_t) in1[5]) * in2[4]
473
102
           + ((uint128_t) in1[6]) * in2[3];
474
475
102
    out[10] = ((uint128_t) in1[4]) * in2[6]
476
102
            + ((uint128_t) in1[5]) * in2[5]
477
102
            + ((uint128_t) in1[6]) * in2[4];
478
479
102
    out[11] = ((uint128_t) in1[5]) * in2[6]
480
102
            + ((uint128_t) in1[6]) * in2[5];
481
482
102
    out[12] = ((uint128_t) in1[6]) * in2[6];
483
102
}
484
485
/*-
486
 * Reduce thirteen 128-bit coefficients to seven 64-bit coefficients.
487
 * in[i] < 2^128 - 2^125
488
 * out[i] < 2^56 for i < 6,
489
 * out[6] <= 2^48
490
 *
491
 * The technique in use here stems from the format of the prime modulus:
492
 * P384 = 2^384 - delta
493
 *
494
 * Thus we can reduce numbers of the form (X + 2^384 * Y) by substituting
495
 * them with (X + delta Y), with delta = 2^128 + 2^96 + (-2^32 + 1). These
496
 * coefficients are still quite large, and so we repeatedly apply this
497
 * technique on high-order bits in order to guarantee the desired bounds on
498
 * the size of our output.
499
 *
500
 * The three phases of elimination are as follows:
501
 * [1]: Y = 2^120 (in[12] | in[11] | in[10] | in[9])
502
 * [2]: Y = 2^8 (acc[8] | acc[7])
503
 * [3]: Y = 2^48 (acc[6] >> 48)
504
 * (Where a | b | c | d = (2^56)^3 a + (2^56)^2 b + (2^56) c + d)
505
 */
506
static void felem_reduce(felem out, const widefelem in)
507
2.40k
{
508
    /*
509
     * In order to prevent underflow, we add a multiple of p before subtracting.
510
     * Use telescopic sums to represent 2^76 * p redundantly with each limb
511
     * of the form 2^124 + ...
512
     */
513
2.40k
    static const widelimb two124m68 = (((widelimb) 1) << 124)
514
2.40k
                                    - (((widelimb) 1) << 68);
515
2.40k
    static const widelimb two124m116m68 = (((widelimb) 1) << 124)
516
2.40k
                                        - (((widelimb) 1) << 116)
517
2.40k
                                        - (((widelimb) 1) << 68);
518
2.40k
    static const widelimb two124p108m76 = (((widelimb) 1) << 124)
519
2.40k
                                        + (((widelimb) 1) << 108)
520
2.40k
                                        - (((widelimb) 1) << 76);
521
2.40k
    static const widelimb two124m92m68 = (((widelimb) 1) << 124)
522
2.40k
                                       - (((widelimb) 1) << 92)
523
2.40k
                                       - (((widelimb) 1) << 68);
524
2.40k
    widelimb temp, acc[9];
525
2.40k
    unsigned int i;
526
527
2.40k
    memcpy(acc, in, sizeof(widelimb) * 9);
528
529
2.40k
    acc[0] += two124p108m76;
530
2.40k
    acc[1] += two124m116m68;
531
2.40k
    acc[2] += two124m92m68;
532
2.40k
    acc[3] += two124m68;
533
2.40k
    acc[4] += two124m68;
534
2.40k
    acc[5] += two124m68;
535
2.40k
    acc[6] += two124m68;
536
537
    /* [1]: Eliminate in[9], ..., in[12] */
538
2.40k
    acc[8] += in[12] >> 32;
539
2.40k
    acc[7] += (in[12] & 0xffffffff) << 24;
540
2.40k
    acc[7] += in[12] >> 8;
541
2.40k
    acc[6] += (in[12] & 0xff) << 48;
542
2.40k
    acc[6] -= in[12] >> 16;
543
2.40k
    acc[5] -= (in[12] & 0xffff) << 40;
544
2.40k
    acc[6] += in[12] >> 48;
545
2.40k
    acc[5] += (in[12] & 0xffffffffffff) << 8;
546
547
2.40k
    acc[7] += in[11] >> 32;
548
2.40k
    acc[6] += (in[11] & 0xffffffff) << 24;
549
2.40k
    acc[6] += in[11] >> 8;
550
2.40k
    acc[5] += (in[11] & 0xff) << 48;
551
2.40k
    acc[5] -= in[11] >> 16;
552
2.40k
    acc[4] -= (in[11] & 0xffff) << 40;
553
2.40k
    acc[5] += in[11] >> 48;
554
2.40k
    acc[4] += (in[11] & 0xffffffffffff) << 8;
555
556
2.40k
    acc[6] += in[10] >> 32;
557
2.40k
    acc[5] += (in[10] & 0xffffffff) << 24;
558
2.40k
    acc[5] += in[10] >> 8;
559
2.40k
    acc[4] += (in[10] & 0xff) << 48;
560
2.40k
    acc[4] -= in[10] >> 16;
561
2.40k
    acc[3] -= (in[10] & 0xffff) << 40;
562
2.40k
    acc[4] += in[10] >> 48;
563
2.40k
    acc[3] += (in[10] & 0xffffffffffff) << 8;
564
565
2.40k
    acc[5] += in[9] >> 32;
566
2.40k
    acc[4] += (in[9] & 0xffffffff) << 24;
567
2.40k
    acc[4] += in[9] >> 8;
568
2.40k
    acc[3] += (in[9] & 0xff) << 48;
569
2.40k
    acc[3] -= in[9] >> 16;
570
2.40k
    acc[2] -= (in[9] & 0xffff) << 40;
571
2.40k
    acc[3] += in[9] >> 48;
572
2.40k
    acc[2] += (in[9] & 0xffffffffffff) << 8;
573
574
    /*
575
     * [2]: Eliminate acc[7], acc[8], that is the 7 and eighth limbs, as
576
     * well as the contributions made from eliminating higher limbs.
577
     * acc[7] < in[7] + 2^120 + 2^56 < in[7] + 2^121
578
     * acc[8] < in[8] + 2^96
579
     */
580
2.40k
    acc[4] += acc[8] >> 32;
581
2.40k
    acc[3] += (acc[8] & 0xffffffff) << 24;
582
2.40k
    acc[3] += acc[8] >> 8;
583
2.40k
    acc[2] += (acc[8] & 0xff) << 48;
584
2.40k
    acc[2] -= acc[8] >> 16;
585
2.40k
    acc[1] -= (acc[8] & 0xffff) << 40;
586
2.40k
    acc[2] += acc[8] >> 48;
587
2.40k
    acc[1] += (acc[8] & 0xffffffffffff) << 8;
588
589
2.40k
    acc[3] += acc[7] >> 32;
590
2.40k
    acc[2] += (acc[7] & 0xffffffff) << 24;
591
2.40k
    acc[2] += acc[7] >> 8;
592
2.40k
    acc[1] += (acc[7] & 0xff) << 48;
593
2.40k
    acc[1] -= acc[7] >> 16;
594
2.40k
    acc[0] -= (acc[7] & 0xffff) << 40;
595
2.40k
    acc[1] += acc[7] >> 48;
596
2.40k
    acc[0] += (acc[7] & 0xffffffffffff) << 8;
597
598
    /*-
599
     * acc[k] < in[k] + 2^124 + 2^121
600
     *        < in[k] + 2^125
601
     *        < 2^128, for k <= 6
602
     */
603
604
    /*
605
     * Carry 4 -> 5 -> 6
606
     * This has the effect of ensuring that these more significant limbs
607
     * will be small in value after eliminating high bits from acc[6].
608
     */
609
2.40k
    acc[5] += acc[4] >> 56;
610
2.40k
    acc[4] &= 0x00ffffffffffffff;
611
612
2.40k
    acc[6] += acc[5] >> 56;
613
2.40k
    acc[5] &= 0x00ffffffffffffff;
614
615
    /*-
616
     * acc[6] < in[6] + 2^124 + 2^121 + 2^72 + 2^16
617
     *        < in[6] + 2^125
618
     *        < 2^128
619
     */
620
621
    /* [3]: Eliminate high bits of acc[6] */
622
2.40k
    temp = acc[6] >> 48;
623
2.40k
    acc[6] &= 0x0000ffffffffffff;
624
625
    /* temp < 2^80 */
626
627
2.40k
    acc[3] += temp >> 40;
628
2.40k
    acc[2] += (temp & 0xffffffffff) << 16;
629
2.40k
    acc[2] += temp >> 16;
630
2.40k
    acc[1] += (temp & 0xffff) << 40;
631
2.40k
    acc[1] -= temp >> 24;
632
2.40k
    acc[0] -= (temp & 0xffffff) << 32;
633
2.40k
    acc[0] += temp;
634
635
    /*-
636
     * acc[k] < acc_old[k] + 2^64 + 2^56
637
     *        < in[k] + 2^124 + 2^121 + 2^72 + 2^64 + 2^56 + 2^16 , k < 4
638
     */
639
640
    /* Carry 0 -> 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
641
2.40k
    acc[1] += acc[0] >> 56;   /* acc[1] < acc_old[1] + 2^72 */
642
2.40k
    acc[0] &= 0x00ffffffffffffff;
643
644
2.40k
    acc[2] += acc[1] >> 56;   /* acc[2] < acc_old[2] + 2^72 + 2^16 */
645
2.40k
    acc[1] &= 0x00ffffffffffffff;
646
647
2.40k
    acc[3] += acc[2] >> 56;   /* acc[3] < acc_old[3] + 2^72 + 2^16 */
648
2.40k
    acc[2] &= 0x00ffffffffffffff;
649
650
    /*-
651
     * acc[k] < acc_old[k] + 2^72 + 2^16
652
     *        < in[k] + 2^124 + 2^121 + 2^73 + 2^64 + 2^56 + 2^17
653
     *        < in[k] + 2^125
654
     *        < 2^128 , k < 4
655
     */
656
657
2.40k
    acc[4] += acc[3] >> 56;   /*-
658
                               * acc[4] < acc_old[4] + 2^72 + 2^16
659
                               *        < 2^72 + 2^56 + 2^16
660
                               */
661
2.40k
    acc[3] &= 0x00ffffffffffffff;
662
663
2.40k
    acc[5] += acc[4] >> 56;   /*-
664
                               * acc[5] < acc_old[5] + 2^16 + 1
665
                               *        < 2^56 + 2^16 + 1
666
                               */
667
2.40k
    acc[4] &= 0x00ffffffffffffff;
668
669
2.40k
    acc[6] += acc[5] >> 56;   /* acc[6] < 2^48 + 1 <= 2^48 */
670
2.40k
    acc[5] &= 0x00ffffffffffffff;
671
672
19.2k
    for (i = 0; i < NLIMBS; i++)
673
16.8k
        out[i] = acc[i];
674
2.40k
}
675
676
#if defined(ECP_NISTP384_ASM)
677
static void felem_square_wrapper(widefelem out, const felem in);
678
static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2);
679
680
static void (*felem_square_p)(widefelem out, const felem in) =
681
    felem_square_wrapper;
682
static void (*felem_mul_p)(widefelem out, const felem in1, const felem in2) =
683
    felem_mul_wrapper;
684
685
void p384_felem_square(widefelem out, const felem in);
686
void p384_felem_mul(widefelem out, const felem in1, const felem in2);
687
688
# if defined(_ARCH_PPC64)
689
#  include "crypto/ppc_arch.h"
690
# endif
691
692
static void felem_select(void)
693
{
694
# if defined(_ARCH_PPC64)
695
    if ((OPENSSL_ppccap_P & PPC_MADD300) && (OPENSSL_ppccap_P & PPC_ALTIVEC)) {
696
        felem_square_p = p384_felem_square;
697
        felem_mul_p = p384_felem_mul;
698
699
        return;
700
    }
701
# endif
702
703
    /* Default */
704
    felem_square_p = felem_square_ref;
705
    felem_mul_p = felem_mul_ref;
706
}
707
708
static void felem_square_wrapper(widefelem out, const felem in)
709
{
710
    felem_select();
711
    felem_square_p(out, in);
712
}
713
714
static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2)
715
{
716
    felem_select();
717
    felem_mul_p(out, in1, in2);
718
}
719
720
# define felem_square felem_square_p
721
# define felem_mul felem_mul_p
722
#else
723
2.30k
# define felem_square felem_square_ref
724
102
# define felem_mul felem_mul_ref
725
#endif
726
727
static ossl_inline void felem_square_reduce(felem out, const felem in)
728
2.29k
{
729
2.29k
    widefelem tmp;
730
731
2.29k
    felem_square(tmp, in);
732
2.29k
    felem_reduce(out, tmp);
733
2.29k
}
734
735
static ossl_inline void felem_mul_reduce(felem out, const felem in1, const felem in2)
736
84
{
737
84
    widefelem tmp;
738
739
84
    felem_mul(tmp, in1, in2);
740
84
    felem_reduce(out, tmp);
741
84
}
742
743
/*-
744
 * felem_inv calculates |out| = |in|^{-1}
745
 *
746
 * Based on Fermat's Little Theorem:
747
 *   a^p = a (mod p)
748
 *   a^{p-1} = 1 (mod p)
749
 *   a^{p-2} = a^{-1} (mod p)
750
 */
751
static void felem_inv(felem out, const felem in)
752
6
{
753
6
    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6;
754
6
    unsigned int i = 0;
755
756
6
    felem_square_reduce(ftmp, in);      /* 2^1 */
757
6
    felem_mul_reduce(ftmp, ftmp, in);   /* 2^1 + 2^0 */
758
6
    felem_assign(ftmp2, ftmp);
759
760
6
    felem_square_reduce(ftmp, ftmp);    /* 2^2 + 2^1 */
761
6
    felem_mul_reduce(ftmp, ftmp, in);   /* 2^2 + 2^1 * 2^0 */
762
6
    felem_assign(ftmp3, ftmp);
763
764
24
    for (i = 0; i < 3; i++)
765
18
        felem_square_reduce(ftmp, ftmp); /* 2^5 + 2^4 + 2^3 */
766
6
    felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^5 + 2^4 + 2^3 + 2^2 + 2^1 + 2^0 */
767
6
    felem_assign(ftmp4, ftmp);
768
769
42
    for (i = 0; i < 6; i++)
770
36
        felem_square_reduce(ftmp, ftmp); /* 2^11 + ... + 2^6 */
771
6
    felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^11 + ... + 2^0 */
772
773
24
    for (i = 0; i < 3; i++)
774
18
        felem_square_reduce(ftmp, ftmp); /* 2^14 + ... + 2^3 */
775
6
    felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^14 + ... + 2^0 */
776
6
    felem_assign(ftmp5, ftmp);
777
778
96
    for (i = 0; i < 15; i++)
779
90
        felem_square_reduce(ftmp, ftmp); /* 2^29 + ... + 2^15 */
780
6
    felem_mul_reduce(ftmp, ftmp5, ftmp); /* 2^29 + ... + 2^0 */
781
6
    felem_assign(ftmp6, ftmp);
782
783
186
    for (i = 0; i < 30; i++)
784
180
        felem_square_reduce(ftmp, ftmp); /* 2^59 + ... + 2^30 */
785
6
    felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^59 + ... + 2^0 */
786
6
    felem_assign(ftmp4, ftmp);
787
788
366
    for (i = 0; i < 60; i++)
789
360
        felem_square_reduce(ftmp, ftmp); /* 2^119 + ... + 2^60 */
790
6
    felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^119 + ... + 2^0 */
791
6
    felem_assign(ftmp4, ftmp);
792
793
726
    for (i = 0; i < 120; i++)
794
720
      felem_square_reduce(ftmp, ftmp);   /* 2^239 + ... + 2^120 */
795
6
    felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^239 + ... + 2^0 */
796
797
96
    for (i = 0; i < 15; i++)
798
90
        felem_square_reduce(ftmp, ftmp); /* 2^254 + ... + 2^15 */
799
6
    felem_mul_reduce(ftmp, ftmp5, ftmp); /* 2^254 + ... + 2^0 */
800
801
192
    for (i = 0; i < 31; i++)
802
186
        felem_square_reduce(ftmp, ftmp); /* 2^285 + ... + 2^31 */
803
6
    felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^285 + ... + 2^31 + 2^29 + ... + 2^0 */
804
805
18
    for (i = 0; i < 2; i++)
806
12
        felem_square_reduce(ftmp, ftmp); /* 2^287 + ... + 2^33 + 2^31 + ... + 2^2 */
807
6
    felem_mul_reduce(ftmp, ftmp2, ftmp); /* 2^287 + ... + 2^33 + 2^31 + ... + 2^0 */
808
809
570
    for (i = 0; i < 94; i++)
810
564
        felem_square_reduce(ftmp, ftmp); /* 2^381 + ... + 2^127 + 2^125 + ... + 2^94 */
811
6
    felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^381 + ... + 2^127 + 2^125 + ... + 2^94 + 2^29 + ... + 2^0 */
812
813
18
    for (i = 0; i < 2; i++)
814
12
        felem_square_reduce(ftmp, ftmp); /* 2^383 + ... + 2^129 + 2^127 + ... + 2^96 + 2^31 + ... + 2^2 */
815
6
    felem_mul_reduce(ftmp, in, ftmp);    /* 2^383 + ... + 2^129 + 2^127 + ... + 2^96 + 2^31 + ... + 2^2 + 2^0 */
816
817
6
    memcpy(out, ftmp, sizeof(felem));
818
6
}
819
820
/*
821
 * Zero-check: returns a limb with all bits set if |in| == 0 (mod p)
822
 * and 0 otherwise. We know that field elements are reduced to
823
 * 0 < in < 2p, so we only need to check two cases:
824
 * 0 and 2^384 - 2^128 - 2^96 + 2^32 - 1
825
 *   in[k] < 2^56, k < 6
826
 *   in[6] <= 2^48
827
 */
828
static limb felem_is_zero(const felem in)
829
0
{
830
0
    limb zero, p384;
831
832
0
    zero = in[0] | in[1] | in[2] | in[3] | in[4] | in[5] | in[6];
833
0
    zero = ((int64_t) (zero) - 1) >> 63;
834
0
    p384 = (in[0] ^ 0x000000ffffffff) | (in[1] ^ 0xffff0000000000)
835
0
         | (in[2] ^ 0xfffffffffeffff) | (in[3] ^ 0xffffffffffffff)
836
0
         | (in[4] ^ 0xffffffffffffff) | (in[5] ^ 0xffffffffffffff)
837
0
         | (in[6] ^ 0xffffffffffff);
838
0
    p384 = ((int64_t) (p384) - 1) >> 63;
839
840
0
    return (zero | p384);
841
0
}
842
843
static int felem_is_zero_int(const void *in)
844
0
{
845
0
    return (int)(felem_is_zero(in) & ((limb) 1));
846
0
}
847
848
/*-
849
 * felem_contract converts |in| to its unique, minimal representation.
850
 * Assume we've removed all redundant bits.
851
 * On entry:
852
 *   in[k] < 2^56, k < 6
853
 *   in[6] <= 2^48
854
 */
855
static void felem_contract(felem out, const felem in)
856
12
{
857
12
    static const int64_t two56 = ((limb) 1) << 56;
858
859
    /*
860
     * We know for a fact that 0 <= |in| < 2*p, for p = 2^384 - 2^128 - 2^96 + 2^32 - 1
861
     * Perform two successive, idempotent subtractions to reduce if |in| >= p.
862
     */
863
864
12
    int64_t tmp[NLIMBS], cond[5], a;
865
12
    unsigned int i;
866
867
12
    memcpy(tmp, in, sizeof(felem));
868
869
    /* Case 1: a = 1 iff |in| >= 2^384 */
870
12
    a = (in[6] >> 48);
871
12
    tmp[0] += a;
872
12
    tmp[0] -= a << 32;
873
12
    tmp[1] += a << 40;
874
12
    tmp[2] += a << 16;
875
12
    tmp[6] &= 0x0000ffffffffffff;
876
877
    /*
878
     * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
879
     * non-zero, so we only need one step
880
     */
881
882
12
    a = tmp[0] >> 63;
883
12
    tmp[0] += a & two56;
884
12
    tmp[1] -= a & 1;
885
886
    /* Carry 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
887
12
    tmp[2] += tmp[1] >> 56;
888
12
    tmp[1] &= 0x00ffffffffffffff;
889
890
12
    tmp[3] += tmp[2] >> 56;
891
12
    tmp[2] &= 0x00ffffffffffffff;
892
893
12
    tmp[4] += tmp[3] >> 56;
894
12
    tmp[3] &= 0x00ffffffffffffff;
895
896
12
    tmp[5] += tmp[4] >> 56;
897
12
    tmp[4] &= 0x00ffffffffffffff;
898
899
12
    tmp[6] += tmp[5] >> 56; /* tmp[6] < 2^48 */
900
12
    tmp[5] &= 0x00ffffffffffffff;
901
902
    /*
903
     * Case 2: a = all ones if p <= |in| < 2^384, 0 otherwise
904
     */
905
906
    /* 0 iff (2^129..2^383) are all one */
907
12
    cond[0] = ((tmp[6] | 0xff000000000000) & tmp[5] & tmp[4] & tmp[3] & (tmp[2] | 0x0000000001ffff)) + 1;
908
    /* 0 iff 2^128 bit is one */
909
12
    cond[1] = (tmp[2] | ~0x00000000010000) + 1;
910
    /* 0 iff (2^96..2^127) bits are all one */
911
12
    cond[2] = ((tmp[2] | 0xffffffffff0000) & (tmp[1] | 0x0000ffffffffff)) + 1;
912
    /* 0 iff (2^32..2^95) bits are all zero */
913
12
    cond[3] = (tmp[1] & ~0xffff0000000000) | (tmp[0] & ~((int64_t) 0x000000ffffffff));
914
    /* 0 iff (2^0..2^31) bits are all one */
915
12
    cond[4] = (tmp[0] | 0xffffff00000000) + 1;
916
917
    /*
918
     * In effect, invert our conditions, so that 0 values become all 1's,
919
     * any non-zero value in the low-order 56 bits becomes all 0's
920
     */
921
72
    for (i = 0; i < 5; i++)
922
60
       cond[i] = ((cond[i] & 0x00ffffffffffffff) - 1) >> 63;
923
924
    /*
925
     * The condition for determining whether in is greater than our
926
     * prime is given by the following condition.
927
     */
928
929
    /* First subtract 2^384 - 2^129 cheaply */
930
12
    a = cond[0] & (cond[1] | (cond[2] & (~cond[3] | cond[4])));
931
12
    tmp[6] &= ~a;
932
12
    tmp[5] &= ~a;
933
12
    tmp[4] &= ~a;
934
12
    tmp[3] &= ~a;
935
12
    tmp[2] &= ~a | 0x0000000001ffff;
936
937
    /*
938
     * Subtract 2^128 - 2^96 by
939
     * means of disjoint cases.
940
     */
941
942
    /* subtract 2^128 if that bit is present, and add 2^96 */
943
12
    a = cond[0] & cond[1];
944
12
    tmp[2] &= ~a | 0xfffffffffeffff;
945
12
    tmp[1] += a & ((int64_t) 1 << 40);
946
947
    /* otherwise, clear bits 2^127 .. 2^96  */
948
12
    a = cond[0] & ~cond[1] & (cond[2] & (~cond[3] | cond[4]));
949
12
    tmp[2] &= ~a | 0xffffffffff0000;
950
12
    tmp[1] &= ~a | 0x0000ffffffffff;
951
952
    /* finally, subtract the last 2^32 - 1 */
953
12
    a = cond[0] & (cond[1] | (cond[2] & (~cond[3] | cond[4])));
954
12
    tmp[0] += a & (-((int64_t) 1 << 32) + 1);
955
956
    /*
957
     * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
958
     * non-zero, so we only need one step
959
     */
960
12
    a = tmp[0] >> 63;
961
12
    tmp[0] += a & two56;
962
12
    tmp[1] -= a & 1;
963
964
    /* Carry 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
965
12
    tmp[2] += tmp[1] >> 56;
966
12
    tmp[1] &= 0x00ffffffffffffff;
967
968
12
    tmp[3] += tmp[2] >> 56;
969
12
    tmp[2] &= 0x00ffffffffffffff;
970
971
12
    tmp[4] += tmp[3] >> 56;
972
12
    tmp[3] &= 0x00ffffffffffffff;
973
974
12
    tmp[5] += tmp[4] >> 56;
975
12
    tmp[4] &= 0x00ffffffffffffff;
976
977
12
    tmp[6] += tmp[5] >> 56;
978
12
    tmp[5] &= 0x00ffffffffffffff;
979
980
12
    memcpy(out, tmp, sizeof(felem));
981
12
}
982
983
/*-
984
 * Group operations
985
 * ----------------
986
 *
987
 * Building on top of the field operations we have the operations on the
988
 * elliptic curve group itself. Points on the curve are represented in Jacobian
989
 * coordinates
990
 */
991
992
/*-
993
 * point_double calculates 2*(x_in, y_in, z_in)
994
 *
995
 * The method is taken from:
996
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
997
 *
998
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
999
 * while x_out == y_in is not (maybe this works, but it's not tested).
1000
 */
1001
static void
1002
point_double(felem x_out, felem y_out, felem z_out,
1003
             const felem x_in, const felem y_in, const felem z_in)
1004
0
{
1005
0
    widefelem tmp, tmp2;
1006
0
    felem delta, gamma, beta, alpha, ftmp, ftmp2;
1007
1008
0
    felem_assign(ftmp, x_in);
1009
0
    felem_assign(ftmp2, x_in);
1010
1011
    /* delta = z^2 */
1012
0
    felem_square_reduce(delta, z_in);     /* delta[i] < 2^56 */
1013
1014
    /* gamma = y^2 */
1015
0
    felem_square_reduce(gamma, y_in);     /* gamma[i] < 2^56 */
1016
1017
    /* beta = x*gamma */
1018
0
    felem_mul_reduce(beta, x_in, gamma);  /* beta[i] < 2^56 */
1019
1020
    /* alpha = 3*(x-delta)*(x+delta) */
1021
0
    felem_diff64(ftmp, delta);            /* ftmp[i] < 2^60 + 2^58 + 2^44 */
1022
0
    felem_sum64(ftmp2, delta);            /* ftmp2[i] < 2^59 */
1023
0
    felem_scalar64(ftmp2, 3);             /* ftmp2[i] < 2^61 */
1024
0
    felem_mul_reduce(alpha, ftmp, ftmp2); /* alpha[i] < 2^56 */
1025
1026
    /* x' = alpha^2 - 8*beta */
1027
0
    felem_square(tmp, alpha);             /* tmp[i] < 2^115 */
1028
0
    felem_assign(ftmp, beta);             /* ftmp[i] < 2^56 */
1029
0
    felem_scalar64(ftmp, 8);              /* ftmp[i] < 2^59 */
1030
0
    felem_diff_128_64(tmp, ftmp);         /* tmp[i] < 2^115 + 2^64 + 2^48 */
1031
0
    felem_reduce(x_out, tmp);             /* x_out[i] < 2^56 */
1032
1033
    /* z' = (y + z)^2 - gamma - delta */
1034
0
    felem_sum64(delta, gamma);     /* delta[i] < 2^57 */
1035
0
    felem_assign(ftmp, y_in);      /* ftmp[i] < 2^56 */
1036
0
    felem_sum64(ftmp, z_in);       /* ftmp[i] < 2^56 */
1037
0
    felem_square(tmp, ftmp);       /* tmp[i] < 2^115 */
1038
0
    felem_diff_128_64(tmp, delta); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1039
0
    felem_reduce(z_out, tmp);      /* z_out[i] < 2^56 */
1040
1041
    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1042
0
    felem_scalar64(beta, 4);       /* beta[i] < 2^58 */
1043
0
    felem_diff64(beta, x_out);     /* beta[i] < 2^60 + 2^58 + 2^44 */
1044
0
    felem_mul(tmp, alpha, beta);   /* tmp[i] < 2^119 */
1045
0
    felem_square(tmp2, gamma);     /* tmp2[i] < 2^115 */
1046
0
    felem_scalar128(tmp2, 8);      /* tmp2[i] < 2^118 */
1047
0
    felem_diff128(tmp, tmp2);      /* tmp[i] < 2^127 + 2^119 + 2^111 */
1048
0
    felem_reduce(y_out, tmp);      /* tmp[i] < 2^56 */
1049
0
}
1050
1051
/* copy_conditional copies in to out iff mask is all ones. */
1052
static void copy_conditional(felem out, const felem in, limb mask)
1053
0
{
1054
0
    unsigned int i;
1055
1056
0
    for (i = 0; i < NLIMBS; i++)
1057
0
        out[i] ^= mask & (in[i] ^ out[i]);
1058
0
}
1059
1060
/*-
1061
 * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1062
 *
1063
 * The method is taken from
1064
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1065
 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1066
 *
1067
 * This function includes a branch for checking whether the two input points
1068
 * are equal (while not equal to the point at infinity). See comment below
1069
 * on constant-time.
1070
 */
1071
static void point_add(felem x3, felem y3, felem z3,
1072
                      const felem x1, const felem y1, const felem z1,
1073
                      const int mixed, const felem x2, const felem y2,
1074
                      const felem z2)
1075
0
{
1076
0
    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1077
0
    widefelem tmp, tmp2;
1078
0
    limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1079
0
    limb points_equal;
1080
1081
0
    z1_is_zero = felem_is_zero(z1);
1082
0
    z2_is_zero = felem_is_zero(z2);
1083
1084
    /* ftmp = z1z1 = z1**2 */
1085
0
    felem_square_reduce(ftmp, z1);      /* ftmp[i] < 2^56 */
1086
1087
0
    if (!mixed) {
1088
        /* ftmp2 = z2z2 = z2**2 */
1089
0
        felem_square_reduce(ftmp2, z2); /* ftmp2[i] < 2^56 */
1090
1091
        /* u1 = ftmp3 = x1*z2z2 */
1092
0
        felem_mul_reduce(ftmp3, x1, ftmp2); /* ftmp3[i] < 2^56 */
1093
1094
        /* ftmp5 = z1 + z2 */
1095
0
        felem_assign(ftmp5, z1);       /* ftmp5[i] < 2^56 */
1096
0
        felem_sum64(ftmp5, z2);        /* ftmp5[i] < 2^57 */
1097
1098
        /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1099
0
        felem_square(tmp, ftmp5);      /* tmp[i] < 2^117 */
1100
0
        felem_diff_128_64(tmp, ftmp);  /* tmp[i] < 2^117 + 2^64 + 2^48 */
1101
0
        felem_diff_128_64(tmp, ftmp2); /* tmp[i] < 2^117 + 2^65 + 2^49 */
1102
0
        felem_reduce(ftmp5, tmp);      /* ftmp5[i] < 2^56 */
1103
1104
        /* ftmp2 = z2 * z2z2 */
1105
0
        felem_mul_reduce(ftmp2, ftmp2, z2); /* ftmp2[i] < 2^56 */
1106
1107
        /* s1 = ftmp6 = y1 * z2**3 */
1108
0
        felem_mul_reduce(ftmp6, y1, ftmp2); /* ftmp6[i] < 2^56 */
1109
0
    } else {
1110
        /*
1111
         * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1112
         */
1113
1114
        /* u1 = ftmp3 = x1*z2z2 */
1115
0
        felem_assign(ftmp3, x1);     /* ftmp3[i] < 2^56 */
1116
1117
        /* ftmp5 = 2*z1z2 */
1118
0
        felem_scalar(ftmp5, z1, 2);  /* ftmp5[i] < 2^57 */
1119
1120
        /* s1 = ftmp6 = y1 * z2**3 */
1121
0
        felem_assign(ftmp6, y1);     /* ftmp6[i] < 2^56 */
1122
0
    }
1123
    /* ftmp3[i] < 2^56, ftmp5[i] < 2^57, ftmp6[i] < 2^56 */
1124
1125
    /* u2 = x2*z1z1 */
1126
0
    felem_mul(tmp, x2, ftmp);        /* tmp[i] < 2^115 */
1127
1128
    /* h = ftmp4 = u2 - u1 */
1129
0
    felem_diff_128_64(tmp, ftmp3);   /* tmp[i] < 2^115 + 2^64 + 2^48 */
1130
0
    felem_reduce(ftmp4, tmp);        /* ftmp[4] < 2^56 */
1131
1132
0
    x_equal = felem_is_zero(ftmp4);
1133
1134
    /* z_out = ftmp5 * h */
1135
0
    felem_mul_reduce(z_out, ftmp5, ftmp4);  /* z_out[i] < 2^56 */
1136
1137
    /* ftmp = z1 * z1z1 */
1138
0
    felem_mul_reduce(ftmp, ftmp, z1);  /* ftmp[i] < 2^56 */
1139
1140
    /* s2 = tmp = y2 * z1**3 */
1141
0
    felem_mul(tmp, y2, ftmp);      /* tmp[i] < 2^115 */
1142
1143
    /* r = ftmp5 = (s2 - s1)*2 */
1144
0
    felem_diff_128_64(tmp, ftmp6); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1145
0
    felem_reduce(ftmp5, tmp);      /* ftmp5[i] < 2^56 */
1146
0
    y_equal = felem_is_zero(ftmp5);
1147
0
    felem_scalar64(ftmp5, 2);      /* ftmp5[i] < 2^57 */
1148
1149
    /*
1150
     * The formulae are incorrect if the points are equal, in affine coordinates
1151
     * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
1152
     * happens.
1153
     *
1154
     * We use bitwise operations to avoid potential side-channels introduced by
1155
     * the short-circuiting behaviour of boolean operators.
1156
     *
1157
     * The special case of either point being the point at infinity (z1 and/or
1158
     * z2 are zero), is handled separately later on in this function, so we
1159
     * avoid jumping to point_double here in those special cases.
1160
     *
1161
     * Notice the comment below on the implications of this branching for timing
1162
     * leaks and why it is considered practically irrelevant.
1163
     */
1164
0
    points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));
1165
1166
0
    if (points_equal) {
1167
        /*
1168
         * This is obviously not constant-time but it will almost-never happen
1169
         * for ECDH / ECDSA.
1170
         */
1171
0
        point_double(x3, y3, z3, x1, y1, z1);
1172
0
        return;
1173
0
    }
1174
1175
    /* I = ftmp = (2h)**2 */
1176
0
    felem_assign(ftmp, ftmp4);        /* ftmp[i] < 2^56 */
1177
0
    felem_scalar64(ftmp, 2);          /* ftmp[i] < 2^57 */
1178
0
    felem_square_reduce(ftmp, ftmp);  /* ftmp[i] < 2^56 */
1179
1180
    /* J = ftmp2 = h * I */
1181
0
    felem_mul_reduce(ftmp2, ftmp4, ftmp); /* ftmp2[i] < 2^56 */
1182
1183
    /* V = ftmp4 = U1 * I */
1184
0
    felem_mul_reduce(ftmp4, ftmp3, ftmp); /* ftmp4[i] < 2^56 */
1185
1186
    /* x_out = r**2 - J - 2V */
1187
0
    felem_square(tmp, ftmp5);      /* tmp[i] < 2^117 */
1188
0
    felem_diff_128_64(tmp, ftmp2); /* tmp[i] < 2^117 + 2^64 + 2^48 */
1189
0
    felem_assign(ftmp3, ftmp4);    /* ftmp3[i] < 2^56 */
1190
0
    felem_scalar64(ftmp4, 2);      /* ftmp4[i] < 2^57 */
1191
0
    felem_diff_128_64(tmp, ftmp4); /* tmp[i] < 2^117 + 2^65 + 2^49 */
1192
0
    felem_reduce(x_out, tmp);      /* x_out[i] < 2^56 */
1193
1194
    /* y_out = r(V-x_out) - 2 * s1 * J */
1195
0
    felem_diff64(ftmp3, x_out);    /* ftmp3[i] < 2^60 + 2^56 + 2^44 */
1196
0
    felem_mul(tmp, ftmp5, ftmp3);  /* tmp[i] < 2^116 */
1197
0
    felem_mul(tmp2, ftmp6, ftmp2); /* tmp2[i] < 2^115 */
1198
0
    felem_scalar128(tmp2, 2);      /* tmp2[i] < 2^116 */
1199
0
    felem_diff128(tmp, tmp2);      /* tmp[i] < 2^127 + 2^116 + 2^111 */
1200
0
    felem_reduce(y_out, tmp);      /* y_out[i] < 2^56 */
1201
1202
0
    copy_conditional(x_out, x2, z1_is_zero);
1203
0
    copy_conditional(x_out, x1, z2_is_zero);
1204
0
    copy_conditional(y_out, y2, z1_is_zero);
1205
0
    copy_conditional(y_out, y1, z2_is_zero);
1206
0
    copy_conditional(z_out, z2, z1_is_zero);
1207
0
    copy_conditional(z_out, z1, z2_is_zero);
1208
0
    felem_assign(x3, x_out);
1209
0
    felem_assign(y3, y_out);
1210
0
    felem_assign(z3, z_out);
1211
0
}
1212
1213
/*-
1214
 * Base point pre computation
1215
 * --------------------------
1216
 *
1217
 * Two different sorts of precomputed tables are used in the following code.
1218
 * Each contain various points on the curve, where each point is three field
1219
 * elements (x, y, z).
1220
 *
1221
 * For the base point table, z is usually 1 (0 for the point at infinity).
1222
 * This table has 16 elements:
1223
 * index | bits    | point
1224
 * ------+---------+------------------------------
1225
 *     0 | 0 0 0 0 | 0G
1226
 *     1 | 0 0 0 1 | 1G
1227
 *     2 | 0 0 1 0 | 2^95G
1228
 *     3 | 0 0 1 1 | (2^95 + 1)G
1229
 *     4 | 0 1 0 0 | 2^190G
1230
 *     5 | 0 1 0 1 | (2^190 + 1)G
1231
 *     6 | 0 1 1 0 | (2^190 + 2^95)G
1232
 *     7 | 0 1 1 1 | (2^190 + 2^95 + 1)G
1233
 *     8 | 1 0 0 0 | 2^285G
1234
 *     9 | 1 0 0 1 | (2^285 + 1)G
1235
 *    10 | 1 0 1 0 | (2^285 + 2^95)G
1236
 *    11 | 1 0 1 1 | (2^285 + 2^95 + 1)G
1237
 *    12 | 1 1 0 0 | (2^285 + 2^190)G
1238
 *    13 | 1 1 0 1 | (2^285 + 2^190 + 1)G
1239
 *    14 | 1 1 1 0 | (2^285 + 2^190 + 2^95)G
1240
 *    15 | 1 1 1 1 | (2^285 + 2^190 + 2^95 + 1)G
1241
 *
1242
 * The reason for this is so that we can clock bits into four different
1243
 * locations when doing simple scalar multiplies against the base point.
1244
 *
1245
 * Tables for other points have table[i] = iG for i in 0 .. 16.
1246
 */
1247
1248
/* gmul is the table of precomputed base points */
1249
static const felem gmul[16][3] = {
1250
{{0, 0, 0, 0, 0, 0, 0},
1251
 {0, 0, 0, 0, 0, 0, 0},
1252
 {0, 0, 0, 0, 0, 0, 0}},
1253
{{0x00545e3872760ab7, 0x00f25dbf55296c3a, 0x00e082542a385502, 0x008ba79b9859f741,
1254
  0x0020ad746e1d3b62, 0x0005378eb1c71ef3, 0x0000aa87ca22be8b},
1255
 {0x00431d7c90ea0e5f, 0x00b1ce1d7e819d7a, 0x0013b5f0b8c00a60, 0x00289a147ce9da31,
1256
  0x0092dc29f8f41dbd, 0x002c6f5d9e98bf92, 0x00003617de4a9626},
1257
 {1, 0, 0, 0, 0, 0, 0}},
1258
{{0x00024711cc902a90, 0x00acb2e579ab4fe1, 0x00af818a4b4d57b1, 0x00a17c7bec49c3de,
1259
  0x004280482d726a8b, 0x00128dd0f0a90f3b, 0x00004387c1c3fa3c},
1260
 {0x002ce76543cf5c3a, 0x00de6cee5ef58f0a, 0x00403e42fa561ca6, 0x00bc54d6f9cb9731,
1261
  0x007155f925fb4ff1, 0x004a9ce731b7b9bc, 0x00002609076bd7b2},
1262
 {1, 0, 0, 0, 0, 0, 0}},
1263
{{0x00e74c9182f0251d, 0x0039bf54bb111974, 0x00b9d2f2eec511d2, 0x0036b1594eb3a6a4,
1264
  0x00ac3bb82d9d564b, 0x00f9313f4615a100, 0x00006716a9a91b10},
1265
 {0x0046698116e2f15c, 0x00f34347067d3d33, 0x008de4ccfdebd002, 0x00e838c6b8e8c97b,
1266
  0x006faf0798def346, 0x007349794a57563c, 0x00002629e7e6ad84},
1267
 {1, 0, 0, 0, 0, 0, 0}},
1268
{{0x0075300e34fd163b, 0x0092e9db4e8d0ad3, 0x00254be9f625f760, 0x00512c518c72ae68,
1269
  0x009bfcf162bede5a, 0x00bf9341566ce311, 0x0000cd6175bd41cf},
1270
 {0x007dfe52af4ac70f, 0x0002159d2d5c4880, 0x00b504d16f0af8d0, 0x0014585e11f5e64c,
1271
  0x0089c6388e030967, 0x00ffb270cbfa5f71, 0x00009a15d92c3947},
1272
 {1, 0, 0, 0, 0, 0, 0}},
1273
{{0x0033fc1278dc4fe5, 0x00d53088c2caa043, 0x0085558827e2db66, 0x00c192bef387b736,
1274
  0x00df6405a2225f2c, 0x0075205aa90fd91a, 0x0000137e3f12349d},
1275
 {0x00ce5b115efcb07e, 0x00abc3308410deeb, 0x005dc6fc1de39904, 0x00907c1c496f36b4,
1276
  0x0008e6ad3926cbe1, 0x00110747b787928c, 0x0000021b9162eb7e},
1277
 {1, 0, 0, 0, 0, 0, 0}},
1278
{{0x008180042cfa26e1, 0x007b826a96254967, 0x0082473694d6b194, 0x007bd6880a45b589,
1279
  0x00c0a5097072d1a3, 0x0019186555e18b4e, 0x000020278190e5ca},
1280
 {0x00b4bef17de61ac0, 0x009535e3c38ed348, 0x002d4aa8e468ceab, 0x00ef40b431036ad3,
1281
  0x00defd52f4542857, 0x0086edbf98234266, 0x00002025b3a7814d},
1282
 {1, 0, 0, 0, 0, 0, 0}},
1283
{{0x00b238aa97b886be, 0x00ef3192d6dd3a32, 0x0079f9e01fd62df8, 0x00742e890daba6c5,
1284
  0x008e5289144408ce, 0x0073bbcc8e0171a5, 0x0000c4fd329d3b52},
1285
 {0x00c6f64a15ee23e7, 0x00dcfb7b171cad8b, 0x00039f6cbd805867, 0x00de024e428d4562,
1286
  0x00be6a594d7c64c5, 0x0078467b70dbcd64, 0x0000251f2ed7079b},
1287
 {1, 0, 0, 0, 0, 0, 0}},
1288
{{0x000e5cc25fc4b872, 0x005ebf10d31ef4e1, 0x0061e0ebd11e8256, 0x0076e026096f5a27,
1289
  0x0013e6fc44662e9a, 0x0042b00289d3597e, 0x000024f089170d88},
1290
 {0x001604d7e0effbe6, 0x0048d77cba64ec2c, 0x008166b16da19e36, 0x006b0d1a0f28c088,
1291
  0x000259fcd47754fd, 0x00cc643e4d725f9a, 0x00007b10f3c79c14},
1292
 {1, 0, 0, 0, 0, 0, 0}},
1293
{{0x00430155e3b908af, 0x00b801e4fec25226, 0x00b0d4bcfe806d26, 0x009fc4014eb13d37,
1294
  0x0066c94e44ec07e8, 0x00d16adc03874ba2, 0x000030c917a0d2a7},
1295
 {0x00edac9e21eb891c, 0x00ef0fb768102eff, 0x00c088cef272a5f3, 0x00cbf782134e2964,
1296
  0x0001044a7ba9a0e3, 0x00e363f5b194cf3c, 0x00009ce85249e372},
1297
 {1, 0, 0, 0, 0, 0, 0}},
1298
{{0x001dd492dda5a7eb, 0x008fd577be539fd1, 0x002ff4b25a5fc3f1, 0x0074a8a1b64df72f,
1299
  0x002ba3d8c204a76c, 0x009d5cff95c8235a, 0x0000e014b9406e0f},
1300
 {0x008c2e4dbfc98aba, 0x00f30bb89f1a1436, 0x00b46f7aea3e259c, 0x009224454ac02f54,
1301
  0x00906401f5645fa2, 0x003a1d1940eabc77, 0x00007c9351d680e6},
1302
 {1, 0, 0, 0, 0, 0, 0}},
1303
{{0x005a35d872ef967c, 0x0049f1b7884e1987, 0x0059d46d7e31f552, 0x00ceb4869d2d0fb6,
1304
  0x00e8e89eee56802a, 0x0049d806a774aaf2, 0x0000147e2af0ae24},
1305
 {0x005fd1bd852c6e5e, 0x00b674b7b3de6885, 0x003b9ea5eb9b6c08, 0x005c9f03babf3ef7,
1306
  0x00605337fecab3c7, 0x009a3f85b11bbcc8, 0x0000455470f330ec},
1307
 {1, 0, 0, 0, 0, 0, 0}},
1308
{{0x002197ff4d55498d, 0x00383e8916c2d8af, 0x00eb203f34d1c6d2, 0x0080367cbd11b542,
1309
  0x00769b3be864e4f5, 0x0081a8458521c7bb, 0x0000c531b34d3539},
1310
 {0x00e2a3d775fa2e13, 0x00534fc379573844, 0x00ff237d2a8db54a, 0x00d301b2335a8882,
1311
  0x000f75ea96103a80, 0x0018fecb3cdd96fa, 0x0000304bf61e94eb},
1312
 {1, 0, 0, 0, 0, 0, 0}},
1313
{{0x00b2afc332a73dbd, 0x0029a0d5bb007bc5, 0x002d628eb210f577, 0x009f59a36dd05f50,
1314
  0x006d339de4eca613, 0x00c75a71addc86bc, 0x000060384c5ea93c},
1315
 {0x00aa9641c32a30b4, 0x00cc73ae8cce565d, 0x00ec911a4df07f61, 0x00aa4b762ea4b264,
1316
  0x0096d395bb393629, 0x004efacfb7632fe0, 0x00006f252f46fa3f},
1317
 {1, 0, 0, 0, 0, 0, 0}},
1318
{{0x00567eec597c7af6, 0x0059ba6795204413, 0x00816d4e6f01196f, 0x004ae6b3eb57951d,
1319
  0x00420f5abdda2108, 0x003401d1f57ca9d9, 0x0000cf5837b0b67a},
1320
 {0x00eaa64b8aeeabf9, 0x00246ddf16bcb4de, 0x000e7e3c3aecd751, 0x0008449f04fed72e,
1321
  0x00307b67ccf09183, 0x0017108c3556b7b1, 0x0000229b2483b3bf},
1322
 {1, 0, 0, 0, 0, 0, 0}},
1323
{{0x00e7c491a7bb78a1, 0x00eafddd1d3049ab, 0x00352c05e2bc7c98, 0x003d6880c165fa5c,
1324
  0x00b6ac61cc11c97d, 0x00beeb54fcf90ce5, 0x0000dc1f0b455edc},
1325
 {0x002db2e7aee34d60, 0x0073b5f415a2d8c0, 0x00dd84e4193e9a0c, 0x00d02d873467c572,
1326
  0x0018baaeda60aee5, 0x0013fb11d697c61e, 0x000083aafcc3a973},
1327
 {1, 0, 0, 0, 0, 0, 0}}
1328
};
1329
1330
/*
1331
 * select_point selects the |idx|th point from a precomputation table and
1332
 * copies it to out.
1333
 *
1334
 * pre_comp below is of the size provided in |size|.
1335
 */
1336
static void select_point(const limb idx, unsigned int size,
1337
                         const felem pre_comp[][3], felem out[3])
1338
0
{
1339
0
    unsigned int i, j;
1340
0
    limb *outlimbs = &out[0][0];
1341
1342
0
    memset(out, 0, sizeof(*out) * 3);
1343
1344
0
    for (i = 0; i < size; i++) {
1345
0
        const limb *inlimbs = &pre_comp[i][0][0];
1346
0
        limb mask = i ^ idx;
1347
1348
0
        mask |= mask >> 4;
1349
0
        mask |= mask >> 2;
1350
0
        mask |= mask >> 1;
1351
0
        mask &= 1;
1352
0
        mask--;
1353
0
        for (j = 0; j < NLIMBS * 3; j++)
1354
0
            outlimbs[j] |= inlimbs[j] & mask;
1355
0
    }
1356
0
}
1357
1358
/* get_bit returns the |i|th bit in |in| */
1359
static char get_bit(const felem_bytearray in, int i)
1360
0
{
1361
0
    if (i < 0 || i >= 384)
1362
0
        return 0;
1363
0
    return (in[i >> 3] >> (i & 7)) & 1;
1364
0
}
1365
1366
/*
1367
 * Interleaved point multiplication using precomputed point multiples: The
1368
 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1369
 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1370
 * generator, using certain (large) precomputed multiples in g_pre_comp.
1371
 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1372
 */
1373
static void batch_mul(felem x_out, felem y_out, felem z_out,
1374
                      const felem_bytearray scalars[],
1375
                      const unsigned int num_points, const u8 *g_scalar,
1376
                      const int mixed, const felem pre_comp[][17][3],
1377
                      const felem g_pre_comp[16][3])
1378
0
{
1379
0
    int i, skip;
1380
0
    unsigned int num, gen_mul = (g_scalar != NULL);
1381
0
    felem nq[3], tmp[4];
1382
0
    limb bits;
1383
0
    u8 sign, digit;
1384
1385
    /* set nq to the point at infinity */
1386
0
    memset(nq, 0, sizeof(nq));
1387
1388
    /*
1389
     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1390
     * of the generator (last quarter of rounds) and additions of other
1391
     * points multiples (every 5th round).
1392
     */
1393
0
    skip = 1;                   /* save two point operations in the first
1394
                                 * round */
1395
0
    for (i = (num_points ? 380 : 98); i >= 0; --i) {
1396
        /* double */
1397
0
        if (!skip)
1398
0
            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1399
1400
        /* add multiples of the generator */
1401
0
        if (gen_mul && (i <= 98)) {
1402
0
            bits = get_bit(g_scalar, i + 285) << 3;
1403
0
            if (i < 95) {
1404
0
                bits |= get_bit(g_scalar, i + 190) << 2;
1405
0
                bits |= get_bit(g_scalar, i + 95) << 1;
1406
0
                bits |= get_bit(g_scalar, i);
1407
0
            }
1408
            /* select the point to add, in constant time */
1409
0
            select_point(bits, 16, g_pre_comp, tmp);
1410
0
            if (!skip) {
1411
                /* The 1 argument below is for "mixed" */
1412
0
                point_add(nq[0],  nq[1],  nq[2],
1413
0
                          nq[0],  nq[1],  nq[2], 1,
1414
0
                          tmp[0], tmp[1], tmp[2]);
1415
0
            } else {
1416
0
                memcpy(nq, tmp, 3 * sizeof(felem));
1417
0
                skip = 0;
1418
0
            }
1419
0
        }
1420
1421
        /* do other additions every 5 doublings */
1422
0
        if (num_points && (i % 5 == 0)) {
1423
            /* loop over all scalars */
1424
0
            for (num = 0; num < num_points; ++num) {
1425
0
                bits = get_bit(scalars[num], i + 4) << 5;
1426
0
                bits |= get_bit(scalars[num], i + 3) << 4;
1427
0
                bits |= get_bit(scalars[num], i + 2) << 3;
1428
0
                bits |= get_bit(scalars[num], i + 1) << 2;
1429
0
                bits |= get_bit(scalars[num], i) << 1;
1430
0
                bits |= get_bit(scalars[num], i - 1);
1431
0
                ossl_ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1432
1433
                /*
1434
                 * select the point to add or subtract, in constant time
1435
                 */
1436
0
                select_point(digit, 17, pre_comp[num], tmp);
1437
0
                felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1438
                                            * point */
1439
0
                copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1440
1441
0
                if (!skip) {
1442
0
                    point_add(nq[0],  nq[1],  nq[2],
1443
0
                              nq[0],  nq[1],  nq[2], mixed,
1444
0
                              tmp[0], tmp[1], tmp[2]);
1445
0
                } else {
1446
0
                    memcpy(nq, tmp, 3 * sizeof(felem));
1447
0
                    skip = 0;
1448
0
                }
1449
0
            }
1450
0
        }
1451
0
    }
1452
0
    felem_assign(x_out, nq[0]);
1453
0
    felem_assign(y_out, nq[1]);
1454
0
    felem_assign(z_out, nq[2]);
1455
0
}
1456
1457
/* Precomputation for the group generator. */
1458
struct nistp384_pre_comp_st {
1459
    felem g_pre_comp[16][3];
1460
    CRYPTO_REF_COUNT references;
1461
};
1462
1463
const EC_METHOD *ossl_ec_GFp_nistp384_method(void)
1464
95
{
1465
95
    static const EC_METHOD ret = {
1466
95
        EC_FLAGS_DEFAULT_OCT,
1467
95
        NID_X9_62_prime_field,
1468
95
        ossl_ec_GFp_nistp384_group_init,
1469
95
        ossl_ec_GFp_simple_group_finish,
1470
95
        ossl_ec_GFp_simple_group_clear_finish,
1471
95
        ossl_ec_GFp_nist_group_copy,
1472
95
        ossl_ec_GFp_nistp384_group_set_curve,
1473
95
        ossl_ec_GFp_simple_group_get_curve,
1474
95
        ossl_ec_GFp_simple_group_get_degree,
1475
95
        ossl_ec_group_simple_order_bits,
1476
95
        ossl_ec_GFp_simple_group_check_discriminant,
1477
95
        ossl_ec_GFp_simple_point_init,
1478
95
        ossl_ec_GFp_simple_point_finish,
1479
95
        ossl_ec_GFp_simple_point_clear_finish,
1480
95
        ossl_ec_GFp_simple_point_copy,
1481
95
        ossl_ec_GFp_simple_point_set_to_infinity,
1482
95
        ossl_ec_GFp_simple_point_set_affine_coordinates,
1483
95
        ossl_ec_GFp_nistp384_point_get_affine_coordinates,
1484
95
        0, /* point_set_compressed_coordinates */
1485
95
        0, /* point2oct */
1486
95
        0, /* oct2point */
1487
95
        ossl_ec_GFp_simple_add,
1488
95
        ossl_ec_GFp_simple_dbl,
1489
95
        ossl_ec_GFp_simple_invert,
1490
95
        ossl_ec_GFp_simple_is_at_infinity,
1491
95
        ossl_ec_GFp_simple_is_on_curve,
1492
95
        ossl_ec_GFp_simple_cmp,
1493
95
        ossl_ec_GFp_simple_make_affine,
1494
95
        ossl_ec_GFp_simple_points_make_affine,
1495
95
        ossl_ec_GFp_nistp384_points_mul,
1496
95
        ossl_ec_GFp_nistp384_precompute_mult,
1497
95
        ossl_ec_GFp_nistp384_have_precompute_mult,
1498
95
        ossl_ec_GFp_nist_field_mul,
1499
95
        ossl_ec_GFp_nist_field_sqr,
1500
95
        0, /* field_div */
1501
95
        ossl_ec_GFp_simple_field_inv,
1502
95
        0, /* field_encode */
1503
95
        0, /* field_decode */
1504
95
        0, /* field_set_to_one */
1505
95
        ossl_ec_key_simple_priv2oct,
1506
95
        ossl_ec_key_simple_oct2priv,
1507
95
        0, /* set private */
1508
95
        ossl_ec_key_simple_generate_key,
1509
95
        ossl_ec_key_simple_check_key,
1510
95
        ossl_ec_key_simple_generate_public_key,
1511
95
        0, /* keycopy */
1512
95
        0, /* keyfinish */
1513
95
        ossl_ecdh_simple_compute_key,
1514
95
        ossl_ecdsa_simple_sign_setup,
1515
95
        ossl_ecdsa_simple_sign_sig,
1516
95
        ossl_ecdsa_simple_verify_sig,
1517
95
        0, /* field_inverse_mod_ord */
1518
95
        0, /* blind_coordinates */
1519
95
        0, /* ladder_pre */
1520
95
        0, /* ladder_step */
1521
95
        0  /* ladder_post */
1522
95
    };
1523
1524
95
    return &ret;
1525
95
}
1526
1527
/******************************************************************************/
1528
/*
1529
 * FUNCTIONS TO MANAGE PRECOMPUTATION
1530
 */
1531
1532
static NISTP384_PRE_COMP *nistp384_pre_comp_new(void)
1533
0
{
1534
0
    NISTP384_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1535
1536
0
    if (ret == NULL)
1537
0
        return ret;
1538
1539
0
    if (!CRYPTO_NEW_REF(&ret->references, 1)) {
1540
0
        OPENSSL_free(ret);
1541
0
        return NULL;
1542
0
    }
1543
0
    return ret;
1544
0
}
1545
1546
NISTP384_PRE_COMP *ossl_ec_nistp384_pre_comp_dup(NISTP384_PRE_COMP *p)
1547
0
{
1548
0
    int i;
1549
1550
0
    if (p != NULL)
1551
0
        CRYPTO_UP_REF(&p->references, &i);
1552
0
    return p;
1553
0
}
1554
1555
void ossl_ec_nistp384_pre_comp_free(NISTP384_PRE_COMP *p)
1556
0
{
1557
0
    int i;
1558
1559
0
    if (p == NULL)
1560
0
        return;
1561
1562
0
    CRYPTO_DOWN_REF(&p->references, &i);
1563
0
    REF_PRINT_COUNT("ossl_ec_nistp384", p);
1564
0
    if (i > 0)
1565
0
        return;
1566
0
    REF_ASSERT_ISNT(i < 0);
1567
1568
0
    CRYPTO_FREE_REF(&p->references);
1569
0
    OPENSSL_free(p);
1570
0
}
1571
1572
/******************************************************************************/
1573
/*
1574
 * OPENSSL EC_METHOD FUNCTIONS
1575
 */
1576
1577
int ossl_ec_GFp_nistp384_group_init(EC_GROUP *group)
1578
190
{
1579
190
    int ret;
1580
1581
190
    ret = ossl_ec_GFp_simple_group_init(group);
1582
190
    group->a_is_minus3 = 1;
1583
190
    return ret;
1584
190
}
1585
1586
int ossl_ec_GFp_nistp384_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1587
                                         const BIGNUM *a, const BIGNUM *b,
1588
                                         BN_CTX *ctx)
1589
95
{
1590
95
    int ret = 0;
1591
95
    BIGNUM *curve_p, *curve_a, *curve_b;
1592
95
#ifndef FIPS_MODULE
1593
95
    BN_CTX *new_ctx = NULL;
1594
1595
95
    if (ctx == NULL)
1596
0
        ctx = new_ctx = BN_CTX_new();
1597
95
#endif
1598
95
    if (ctx == NULL)
1599
0
        return 0;
1600
1601
95
    BN_CTX_start(ctx);
1602
95
    curve_p = BN_CTX_get(ctx);
1603
95
    curve_a = BN_CTX_get(ctx);
1604
95
    curve_b = BN_CTX_get(ctx);
1605
95
    if (curve_b == NULL)
1606
0
        goto err;
1607
95
    BN_bin2bn(nistp384_curve_params[0], sizeof(felem_bytearray), curve_p);
1608
95
    BN_bin2bn(nistp384_curve_params[1], sizeof(felem_bytearray), curve_a);
1609
95
    BN_bin2bn(nistp384_curve_params[2], sizeof(felem_bytearray), curve_b);
1610
95
    if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1611
0
        ERR_raise(ERR_LIB_EC, EC_R_WRONG_CURVE_PARAMETERS);
1612
0
        goto err;
1613
0
    }
1614
95
    group->field_mod_func = BN_nist_mod_384;
1615
95
    ret = ossl_ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1616
95
 err:
1617
95
    BN_CTX_end(ctx);
1618
95
#ifndef FIPS_MODULE
1619
95
    BN_CTX_free(new_ctx);
1620
95
#endif
1621
95
    return ret;
1622
95
}
1623
1624
/*
1625
 * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1626
 * (X/Z^2, Y/Z^3)
1627
 */
1628
int ossl_ec_GFp_nistp384_point_get_affine_coordinates(const EC_GROUP *group,
1629
                                                      const EC_POINT *point,
1630
                                                      BIGNUM *x, BIGNUM *y,
1631
                                                      BN_CTX *ctx)
1632
6
{
1633
6
    felem z1, z2, x_in, y_in, x_out, y_out;
1634
6
    widefelem tmp;
1635
1636
6
    if (EC_POINT_is_at_infinity(group, point)) {
1637
0
        ERR_raise(ERR_LIB_EC, EC_R_POINT_AT_INFINITY);
1638
0
        return 0;
1639
0
    }
1640
6
    if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1641
6
        (!BN_to_felem(z1, point->Z)))
1642
0
        return 0;
1643
6
    felem_inv(z2, z1);
1644
6
    felem_square(tmp, z2);
1645
6
    felem_reduce(z1, tmp);
1646
6
    felem_mul(tmp, x_in, z1);
1647
6
    felem_reduce(x_in, tmp);
1648
6
    felem_contract(x_out, x_in);
1649
6
    if (x != NULL) {
1650
6
        if (!felem_to_BN(x, x_out)) {
1651
0
            ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1652
0
            return 0;
1653
0
        }
1654
6
    }
1655
6
    felem_mul(tmp, z1, z2);
1656
6
    felem_reduce(z1, tmp);
1657
6
    felem_mul(tmp, y_in, z1);
1658
6
    felem_reduce(y_in, tmp);
1659
6
    felem_contract(y_out, y_in);
1660
6
    if (y != NULL) {
1661
6
        if (!felem_to_BN(y, y_out)) {
1662
0
            ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1663
0
            return 0;
1664
0
        }
1665
6
    }
1666
6
    return 1;
1667
6
}
1668
1669
/* points below is of size |num|, and tmp_felems is of size |num+1/ */
1670
static void make_points_affine(size_t num, felem points[][3],
1671
                               felem tmp_felems[])
1672
0
{
1673
    /*
1674
     * Runs in constant time, unless an input is the point at infinity (which
1675
     * normally shouldn't happen).
1676
     */
1677
0
    ossl_ec_GFp_nistp_points_make_affine_internal(num,
1678
0
                                                  points,
1679
0
                                                  sizeof(felem),
1680
0
                                                  tmp_felems,
1681
0
                                                  (void (*)(void *))felem_one,
1682
0
                                                  felem_is_zero_int,
1683
0
                                                  (void (*)(void *, const void *))
1684
0
                                                  felem_assign,
1685
0
                                                  (void (*)(void *, const void *))
1686
0
                                                  felem_square_reduce,
1687
0
                                                  (void (*)(void *, const void *, const void*))
1688
0
                                                  felem_mul_reduce,
1689
0
                                                  (void (*)(void *, const void *))
1690
0
                                                  felem_inv,
1691
0
                                                  (void (*)(void *, const void *))
1692
0
                                                  felem_contract);
1693
0
}
1694
1695
/*
1696
 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1697
 * values Result is stored in r (r can equal one of the inputs).
1698
 */
1699
int ossl_ec_GFp_nistp384_points_mul(const EC_GROUP *group, EC_POINT *r,
1700
                                    const BIGNUM *scalar, size_t num,
1701
                                    const EC_POINT *points[],
1702
                                    const BIGNUM *scalars[], BN_CTX *ctx)
1703
0
{
1704
0
    int ret = 0;
1705
0
    int j;
1706
0
    int mixed = 0;
1707
0
    BIGNUM *x, *y, *z, *tmp_scalar;
1708
0
    felem_bytearray g_secret;
1709
0
    felem_bytearray *secrets = NULL;
1710
0
    felem (*pre_comp)[17][3] = NULL;
1711
0
    felem *tmp_felems = NULL;
1712
0
    unsigned int i;
1713
0
    int num_bytes;
1714
0
    int have_pre_comp = 0;
1715
0
    size_t num_points = num;
1716
0
    felem x_in, y_in, z_in, x_out, y_out, z_out;
1717
0
    NISTP384_PRE_COMP *pre = NULL;
1718
0
    felem(*g_pre_comp)[3] = NULL;
1719
0
    EC_POINT *generator = NULL;
1720
0
    const EC_POINT *p = NULL;
1721
0
    const BIGNUM *p_scalar = NULL;
1722
1723
0
    BN_CTX_start(ctx);
1724
0
    x = BN_CTX_get(ctx);
1725
0
    y = BN_CTX_get(ctx);
1726
0
    z = BN_CTX_get(ctx);
1727
0
    tmp_scalar = BN_CTX_get(ctx);
1728
0
    if (tmp_scalar == NULL)
1729
0
        goto err;
1730
1731
0
    if (scalar != NULL) {
1732
0
        pre = group->pre_comp.nistp384;
1733
0
        if (pre)
1734
            /* we have precomputation, try to use it */
1735
0
            g_pre_comp = &pre->g_pre_comp[0];
1736
0
        else
1737
            /* try to use the standard precomputation */
1738
0
            g_pre_comp = (felem(*)[3]) gmul;
1739
0
        generator = EC_POINT_new(group);
1740
0
        if (generator == NULL)
1741
0
            goto err;
1742
        /* get the generator from precomputation */
1743
0
        if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1744
0
            !felem_to_BN(y, g_pre_comp[1][1]) ||
1745
0
            !felem_to_BN(z, g_pre_comp[1][2])) {
1746
0
            ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1747
0
            goto err;
1748
0
        }
1749
0
        if (!ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group,
1750
0
                                                                generator,
1751
0
                                                                x, y, z, ctx))
1752
0
            goto err;
1753
0
        if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1754
            /* precomputation matches generator */
1755
0
            have_pre_comp = 1;
1756
0
        else
1757
            /*
1758
             * we don't have valid precomputation: treat the generator as a
1759
             * random point
1760
             */
1761
0
            num_points++;
1762
0
    }
1763
1764
0
    if (num_points > 0) {
1765
0
        if (num_points >= 2) {
1766
            /*
1767
             * unless we precompute multiples for just one point, converting
1768
             * those into affine form is time well spent
1769
             */
1770
0
            mixed = 1;
1771
0
        }
1772
0
        secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1773
0
        pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1774
0
        if (mixed)
1775
0
            tmp_felems =
1776
0
                OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1));
1777
0
        if ((secrets == NULL) || (pre_comp == NULL)
1778
0
            || (mixed && (tmp_felems == NULL)))
1779
0
            goto err;
1780
1781
        /*
1782
         * we treat NULL scalars as 0, and NULL points as points at infinity,
1783
         * i.e., they contribute nothing to the linear combination
1784
         */
1785
0
        for (i = 0; i < num_points; ++i) {
1786
0
            if (i == num) {
1787
                /*
1788
                 * we didn't have a valid precomputation, so we pick the
1789
                 * generator
1790
                 */
1791
0
                p = EC_GROUP_get0_generator(group);
1792
0
                p_scalar = scalar;
1793
0
            } else {
1794
                /* the i^th point */
1795
0
                p = points[i];
1796
0
                p_scalar = scalars[i];
1797
0
            }
1798
0
            if (p_scalar != NULL && p != NULL) {
1799
                /* reduce scalar to 0 <= scalar < 2^384 */
1800
0
                if ((BN_num_bits(p_scalar) > 384)
1801
0
                    || (BN_is_negative(p_scalar))) {
1802
                    /*
1803
                     * this is an unusual input, and we don't guarantee
1804
                     * constant-timeness
1805
                     */
1806
0
                    if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1807
0
                        ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1808
0
                        goto err;
1809
0
                    }
1810
0
                    num_bytes = BN_bn2lebinpad(tmp_scalar,
1811
0
                                               secrets[i], sizeof(secrets[i]));
1812
0
                } else {
1813
0
                    num_bytes = BN_bn2lebinpad(p_scalar,
1814
0
                                               secrets[i], sizeof(secrets[i]));
1815
0
                }
1816
0
                if (num_bytes < 0) {
1817
0
                    ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1818
0
                    goto err;
1819
0
                }
1820
                /* precompute multiples */
1821
0
                if ((!BN_to_felem(x_out, p->X)) ||
1822
0
                    (!BN_to_felem(y_out, p->Y)) ||
1823
0
                    (!BN_to_felem(z_out, p->Z)))
1824
0
                    goto err;
1825
0
                memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1826
0
                memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1827
0
                memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1828
0
                for (j = 2; j <= 16; ++j) {
1829
0
                    if (j & 1) {
1830
0
                        point_add(pre_comp[i][j][0],     pre_comp[i][j][1],     pre_comp[i][j][2],
1831
0
                                  pre_comp[i][1][0],     pre_comp[i][1][1],     pre_comp[i][1][2], 0,
1832
0
                                  pre_comp[i][j - 1][0], pre_comp[i][j - 1][1], pre_comp[i][j - 1][2]);
1833
0
                    } else {
1834
0
                        point_double(pre_comp[i][j][0],     pre_comp[i][j][1],     pre_comp[i][j][2],
1835
0
                                     pre_comp[i][j / 2][0], pre_comp[i][j / 2][1], pre_comp[i][j / 2][2]);
1836
0
                    }
1837
0
                }
1838
0
            }
1839
0
        }
1840
0
        if (mixed)
1841
0
            make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1842
0
    }
1843
1844
    /* the scalar for the generator */
1845
0
    if (scalar != NULL && have_pre_comp) {
1846
0
        memset(g_secret, 0, sizeof(g_secret));
1847
        /* reduce scalar to 0 <= scalar < 2^384 */
1848
0
        if ((BN_num_bits(scalar) > 384) || (BN_is_negative(scalar))) {
1849
            /*
1850
             * this is an unusual input, and we don't guarantee
1851
             * constant-timeness
1852
             */
1853
0
            if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
1854
0
                ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1855
0
                goto err;
1856
0
            }
1857
0
            num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
1858
0
        } else {
1859
0
            num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
1860
0
        }
1861
        /* do the multiplication with generator precomputation */
1862
0
        batch_mul(x_out, y_out, z_out,
1863
0
                  (const felem_bytearray(*))secrets, num_points,
1864
0
                  g_secret,
1865
0
                  mixed, (const felem(*)[17][3])pre_comp,
1866
0
                  (const felem(*)[3])g_pre_comp);
1867
0
    } else {
1868
        /* do the multiplication without generator precomputation */
1869
0
        batch_mul(x_out, y_out, z_out,
1870
0
                  (const felem_bytearray(*))secrets, num_points,
1871
0
                  NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1872
0
    }
1873
    /* reduce the output to its unique minimal representation */
1874
0
    felem_contract(x_in, x_out);
1875
0
    felem_contract(y_in, y_out);
1876
0
    felem_contract(z_in, z_out);
1877
0
    if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1878
0
        (!felem_to_BN(z, z_in))) {
1879
0
        ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1880
0
        goto err;
1881
0
    }
1882
0
    ret = ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group, r, x, y, z,
1883
0
                                                             ctx);
1884
1885
0
 err:
1886
0
    BN_CTX_end(ctx);
1887
0
    EC_POINT_free(generator);
1888
0
    OPENSSL_free(secrets);
1889
0
    OPENSSL_free(pre_comp);
1890
0
    OPENSSL_free(tmp_felems);
1891
0
    return ret;
1892
0
}
1893
1894
int ossl_ec_GFp_nistp384_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
1895
0
{
1896
0
    int ret = 0;
1897
0
    NISTP384_PRE_COMP *pre = NULL;
1898
0
    int i, j;
1899
0
    BIGNUM *x, *y;
1900
0
    EC_POINT *generator = NULL;
1901
0
    felem tmp_felems[16];
1902
0
#ifndef FIPS_MODULE
1903
0
    BN_CTX *new_ctx = NULL;
1904
0
#endif
1905
1906
    /* throw away old precomputation */
1907
0
    EC_pre_comp_free(group);
1908
1909
0
#ifndef FIPS_MODULE
1910
0
    if (ctx == NULL)
1911
0
        ctx = new_ctx = BN_CTX_new();
1912
0
#endif
1913
0
    if (ctx == NULL)
1914
0
        return 0;
1915
1916
0
    BN_CTX_start(ctx);
1917
0
    x = BN_CTX_get(ctx);
1918
0
    y = BN_CTX_get(ctx);
1919
0
    if (y == NULL)
1920
0
        goto err;
1921
    /* get the generator */
1922
0
    if (group->generator == NULL)
1923
0
        goto err;
1924
0
    generator = EC_POINT_new(group);
1925
0
    if (generator == NULL)
1926
0
        goto err;
1927
0
    BN_bin2bn(nistp384_curve_params[3], sizeof(felem_bytearray), x);
1928
0
    BN_bin2bn(nistp384_curve_params[4], sizeof(felem_bytearray), y);
1929
0
    if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
1930
0
        goto err;
1931
0
    if ((pre = nistp384_pre_comp_new()) == NULL)
1932
0
        goto err;
1933
    /*
1934
     * if the generator is the standard one, use built-in precomputation
1935
     */
1936
0
    if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
1937
0
        memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
1938
0
        goto done;
1939
0
    }
1940
0
    if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) ||
1941
0
        (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) ||
1942
0
        (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
1943
0
        goto err;
1944
    /* compute 2^95*G, 2^190*G, 2^285*G */
1945
0
    for (i = 1; i <= 4; i <<= 1) {
1946
0
        point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2],
1947
0
                     pre->g_pre_comp[i][0],  pre->g_pre_comp[i][1],    pre->g_pre_comp[i][2]);
1948
0
        for (j = 0; j < 94; ++j) {
1949
0
            point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2],
1950
0
                         pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2]);
1951
0
        }
1952
0
    }
1953
    /* g_pre_comp[0] is the point at infinity */
1954
0
    memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
1955
    /* the remaining multiples */
1956
    /* 2^95*G + 2^190*G */
1957
0
    point_add(pre->g_pre_comp[6][0],  pre->g_pre_comp[6][1],  pre->g_pre_comp[6][2],
1958
0
              pre->g_pre_comp[4][0],  pre->g_pre_comp[4][1],  pre->g_pre_comp[4][2], 0,
1959
0
              pre->g_pre_comp[2][0],  pre->g_pre_comp[2][1],  pre->g_pre_comp[2][2]);
1960
    /* 2^95*G + 2^285*G */
1961
0
    point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1], pre->g_pre_comp[10][2],
1962
0
              pre->g_pre_comp[8][0],  pre->g_pre_comp[8][1],  pre->g_pre_comp[8][2], 0,
1963
0
              pre->g_pre_comp[2][0],  pre->g_pre_comp[2][1],  pre->g_pre_comp[2][2]);
1964
    /* 2^190*G + 2^285*G */
1965
0
    point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
1966
0
              pre->g_pre_comp[8][0],  pre->g_pre_comp[8][1],  pre->g_pre_comp[8][2], 0,
1967
0
              pre->g_pre_comp[4][0],  pre->g_pre_comp[4][1],  pre->g_pre_comp[4][2]);
1968
    /* 2^95*G + 2^190*G + 2^285*G */
1969
0
    point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1], pre->g_pre_comp[14][2],
1970
0
              pre->g_pre_comp[12][0], pre->g_pre_comp[12][1], pre->g_pre_comp[12][2], 0,
1971
0
              pre->g_pre_comp[2][0],  pre->g_pre_comp[2][1],  pre->g_pre_comp[2][2]);
1972
0
    for (i = 1; i < 8; ++i) {
1973
        /* odd multiples: add G */
1974
0
        point_add(pre->g_pre_comp[2 * i + 1][0], pre->g_pre_comp[2 * i + 1][1], pre->g_pre_comp[2 * i + 1][2],
1975
0
                  pre->g_pre_comp[2 * i][0],     pre->g_pre_comp[2 * i][1],     pre->g_pre_comp[2 * i][2], 0,
1976
0
                  pre->g_pre_comp[1][0],         pre->g_pre_comp[1][1],         pre->g_pre_comp[1][2]);
1977
0
    }
1978
0
    make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
1979
1980
0
 done:
1981
0
    SETPRECOMP(group, nistp384, pre);
1982
0
    ret = 1;
1983
0
    pre = NULL;
1984
0
 err:
1985
0
    BN_CTX_end(ctx);
1986
0
    EC_POINT_free(generator);
1987
0
#ifndef FIPS_MODULE
1988
0
    BN_CTX_free(new_ctx);
1989
0
#endif
1990
0
    ossl_ec_nistp384_pre_comp_free(pre);
1991
0
    return ret;
1992
0
}
1993
1994
int ossl_ec_GFp_nistp384_have_precompute_mult(const EC_GROUP *group)
1995
0
{
1996
0
    return HAVEPRECOMP(group, nistp384);
1997
0
}