Coverage Report

Created: 2024-07-27 06:39

/src/openssl32/crypto/ec/ecp_nistp384.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
3
 *
4
 * Licensed under the Apache License 2.0 (the "License").  You may not use
5
 * this file except in compliance with the License.  You can obtain a copy
6
 * in the file LICENSE in the source distribution or at
7
 * https://www.openssl.org/source/license.html
8
 */
9
10
/* Copyright 2023 IBM Corp.
11
 *
12
 * Licensed under the Apache License, Version 2.0 (the "License");
13
 *
14
 * you may not use this file except in compliance with the License.
15
 * You may obtain a copy of the License at
16
 *
17
 *     http://www.apache.org/licenses/LICENSE-2.0
18
 *
19
 *  Unless required by applicable law or agreed to in writing, software
20
 *  distributed under the License is distributed on an "AS IS" BASIS,
21
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22
 *  See the License for the specific language governing permissions and
23
 *  limitations under the License.
24
 */
25
26
/*
27
 * Designed for 56-bit limbs by Rohan McLure <rohan.mclure@linux.ibm.com>.
28
 * The layout is based on that of ecp_nistp{224,521}.c, allowing even for asm
29
 * acceleration of felem_{square,mul} as supported in these files.
30
 */
31
32
#include <openssl/e_os2.h>
33
34
#include <string.h>
35
#include <openssl/err.h>
36
#include "ec_local.h"
37
38
#include "internal/numbers.h"
39
40
#ifndef INT128_MAX
41
# error "Your compiler doesn't appear to support 128-bit integer types"
42
#endif
43
44
typedef uint8_t u8;
45
typedef uint64_t u64;
46
47
/*
48
 * The underlying field. P384 operates over GF(2^384-2^128-2^96+2^32-1). We
49
 * can serialize an element of this field into 48 bytes. We call this an
50
 * felem_bytearray.
51
 */
52
53
typedef u8 felem_bytearray[48];
54
55
/*
56
 * These are the parameters of P384, taken from FIPS 186-3, section D.1.2.4.
57
 * These values are big-endian.
58
 */
59
static const felem_bytearray nistp384_curve_params[5] = {
60
  {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* p */
61
   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
62
   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
63
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF},
64
  {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* a = -3 */
65
   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
66
   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
67
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFC},
68
  {0xB3, 0x31, 0x2F, 0xA7, 0xE2, 0x3E, 0xE7, 0xE4, 0x98, 0x8E, 0x05, 0x6B, /* b */
69
   0xE3, 0xF8, 0x2D, 0x19, 0x18, 0x1D, 0x9C, 0x6E, 0xFE, 0x81, 0x41, 0x12,
70
   0x03, 0x14, 0x08, 0x8F, 0x50, 0x13, 0x87, 0x5A, 0xC6, 0x56, 0x39, 0x8D,
71
   0x8A, 0x2E, 0xD1, 0x9D, 0x2A, 0x85, 0xC8, 0xED, 0xD3, 0xEC, 0x2A, 0xEF},
72
  {0xAA, 0x87, 0xCA, 0x22, 0xBE, 0x8B, 0x05, 0x37, 0x8E, 0xB1, 0xC7, 0x1E, /* x */
73
   0xF3, 0x20, 0xAD, 0x74, 0x6E, 0x1D, 0x3B, 0x62, 0x8B, 0xA7, 0x9B, 0x98,
74
   0x59, 0xF7, 0x41, 0xE0, 0x82, 0x54, 0x2A, 0x38, 0x55, 0x02, 0xF2, 0x5D,
75
   0xBF, 0x55, 0x29, 0x6C, 0x3A, 0x54, 0x5E, 0x38, 0x72, 0x76, 0x0A, 0xB7},
76
  {0x36, 0x17, 0xDE, 0x4A, 0x96, 0x26, 0x2C, 0x6F, 0x5D, 0x9E, 0x98, 0xBF, /* y */
77
   0x92, 0x92, 0xDC, 0x29, 0xF8, 0xF4, 0x1D, 0xBD, 0x28, 0x9A, 0x14, 0x7C,
78
   0xE9, 0xDA, 0x31, 0x13, 0xB5, 0xF0, 0xB8, 0xC0, 0x0A, 0x60, 0xB1, 0xCE,
79
   0x1D, 0x7E, 0x81, 0x9D, 0x7A, 0x43, 0x1D, 0x7C, 0x90, 0xEA, 0x0E, 0x5F},
80
};
81
82
/*-
83
 * The representation of field elements.
84
 * ------------------------------------
85
 *
86
 * We represent field elements with seven values. These values are either 64 or
87
 * 128 bits and the field element represented is:
88
 *   v[0]*2^0 + v[1]*2^56 + v[2]*2^112 + ... + v[6]*2^336  (mod p)
89
 * Each of the seven values is called a 'limb'. Since the limbs are spaced only
90
 * 56 bits apart, but are greater than 56 bits in length, the most significant
91
 * bits of each limb overlap with the least significant bits of the next
92
 *
93
 * This representation is considered to be 'redundant' in the sense that
94
 * intermediate values can each contain more than a 56-bit value in each limb.
95
 * Reduction causes all but the final limb to be reduced to contain a value less
96
 * than 2^56, with the final value represented allowed to be larger than 2^384,
97
 * inasmuch as we can be sure that arithmetic overflow remains impossible. The
98
 * reduced value must of course be congruent to the unreduced value.
99
 *
100
 * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
101
 * 'widefelem', featuring enough bits to store the result of a multiplication
102
 * and even some further arithmetic without need for immediate reduction.
103
 */
104
105
199M
#define NLIMBS 7
106
107
typedef uint64_t limb;
108
typedef uint128_t widelimb;
109
typedef limb limb_aX __attribute((__aligned__(1)));
110
typedef limb felem[NLIMBS];
111
typedef widelimb widefelem[2*NLIMBS-1];
112
113
static const limb bottom56bits = 0xffffffffffffff;
114
115
/* Helper functions (de)serialising reduced field elements in little endian */
116
static void bin48_to_felem(felem out, const u8 in[48])
117
8.96k
{
118
8.96k
    memset(out, 0, 56);
119
8.96k
    out[0] = (*((limb *) & in[0])) & bottom56bits;
120
8.96k
    out[1] = (*((limb_aX *) & in[7])) & bottom56bits;
121
8.96k
    out[2] = (*((limb_aX *) & in[14])) & bottom56bits;
122
8.96k
    out[3] = (*((limb_aX *) & in[21])) & bottom56bits;
123
8.96k
    out[4] = (*((limb_aX *) & in[28])) & bottom56bits;
124
8.96k
    out[5] = (*((limb_aX *) & in[35])) & bottom56bits;
125
8.96k
    memmove(&out[6], &in[42], 6);
126
8.96k
}
127
128
static void felem_to_bin48(u8 out[48], const felem in)
129
13.3k
{
130
13.3k
    memset(out, 0, 48);
131
13.3k
    (*((limb *) & out[0]))     |= (in[0] & bottom56bits);
132
13.3k
    (*((limb_aX *) & out[7]))  |= (in[1] & bottom56bits);
133
13.3k
    (*((limb_aX *) & out[14])) |= (in[2] & bottom56bits);
134
13.3k
    (*((limb_aX *) & out[21])) |= (in[3] & bottom56bits);
135
13.3k
    (*((limb_aX *) & out[28])) |= (in[4] & bottom56bits);
136
13.3k
    (*((limb_aX *) & out[35])) |= (in[5] & bottom56bits);
137
13.3k
    memmove(&out[42], &in[6], 6);
138
13.3k
}
139
140
/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
141
static int BN_to_felem(felem out, const BIGNUM *bn)
142
8.96k
{
143
8.96k
    felem_bytearray b_out;
144
8.96k
    int num_bytes;
145
146
8.96k
    if (BN_is_negative(bn)) {
147
0
        ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
148
0
        return 0;
149
0
    }
150
8.96k
    num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
151
8.96k
    if (num_bytes < 0) {
152
0
        ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
153
0
        return 0;
154
0
    }
155
8.96k
    bin48_to_felem(out, b_out);
156
8.96k
    return 1;
157
8.96k
}
158
159
/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
160
static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
161
13.3k
{
162
13.3k
    felem_bytearray b_out;
163
164
13.3k
    felem_to_bin48(b_out, in);
165
13.3k
    return BN_lebin2bn(b_out, sizeof(b_out), out);
166
13.3k
}
167
168
/*-
169
 * Field operations
170
 * ----------------
171
 */
172
173
static void felem_one(felem out)
174
0
{
175
0
    out[0] = 1;
176
0
    memset(&out[1], 0, sizeof(limb) * (NLIMBS-1));
177
0
}
178
179
static void felem_assign(felem out, const felem in)
180
2.80M
{
181
2.80M
    memcpy(out, in, sizeof(felem));
182
2.80M
}
183
184
/* felem_sum64 sets out = out + in. */
185
static void felem_sum64(felem out, const felem in)
186
1.26M
{
187
1.26M
    unsigned int i;
188
189
10.1M
    for (i = 0; i < NLIMBS; i++)
190
8.86M
        out[i] += in[i];
191
1.26M
}
192
193
/* felem_scalar sets out = in * scalar */
194
static void felem_scalar(felem out, const felem in, limb scalar)
195
3.63M
{
196
3.63M
    unsigned int i;
197
198
29.1M
    for (i = 0; i < NLIMBS; i++)
199
25.4M
        out[i] = in[i] * scalar;
200
3.63M
}
201
202
/* felem_scalar64 sets out = out * scalar */
203
static void felem_scalar64(felem out, limb scalar)
204
1.73M
{
205
1.73M
    unsigned int i;
206
207
13.8M
    for (i = 0; i < NLIMBS; i++)
208
12.1M
        out[i] *= scalar;
209
1.73M
}
210
211
/* felem_scalar128 sets out = out * scalar */
212
static void felem_scalar128(widefelem out, limb scalar)
213
578k
{
214
578k
    unsigned int i;
215
216
8.09M
    for (i = 0; i < 2*NLIMBS-1; i++)
217
7.51M
        out[i] *= scalar;
218
578k
}
219
220
/*-
221
 * felem_neg sets |out| to |-in|
222
 * On entry:
223
 *   in[i] < 2^60 - 2^29
224
 * On exit:
225
 *   out[i] < 2^60
226
 */
227
static void felem_neg(felem out, const felem in)
228
57.0k
{
229
    /*
230
     * In order to prevent underflow, we add a multiple of p before subtracting.
231
     * Use telescopic sums to represent 2^12 * p redundantly with each limb
232
     * of the form 2^60 + ...
233
     */
234
57.0k
    static const limb two60m52m4 = (((limb) 1) << 60)
235
57.0k
                                 - (((limb) 1) << 52)
236
57.0k
                                 - (((limb) 1) << 4);
237
57.0k
    static const limb two60p44m12 = (((limb) 1) << 60)
238
57.0k
                                  + (((limb) 1) << 44)
239
57.0k
                                  - (((limb) 1) << 12);
240
57.0k
    static const limb two60m28m4 = (((limb) 1) << 60)
241
57.0k
                                 - (((limb) 1) << 28)
242
57.0k
                                 - (((limb) 1) << 4);
243
57.0k
    static const limb two60m4 = (((limb) 1) << 60)
244
57.0k
                              - (((limb) 1) << 4);
245
246
57.0k
    out[0] = two60p44m12 - in[0];
247
57.0k
    out[1] = two60m52m4 - in[1];
248
57.0k
    out[2] = two60m28m4 - in[2];
249
57.0k
    out[3] = two60m4 - in[3];
250
57.0k
    out[4] = two60m4 - in[4];
251
57.0k
    out[5] = two60m4 - in[5];
252
57.0k
    out[6] = two60m4 - in[6];
253
57.0k
}
254
255
/*-
256
 * felem_diff64 subtracts |in| from |out|
257
 * On entry:
258
 *   in[i] < 2^60 - 2^52 - 2^4
259
 * On exit:
260
 *   out[i] < out_orig[i] + 2^60 + 2^44
261
 */
262
static void felem_diff64(felem out, const felem in)
263
979k
{
264
    /*
265
     * In order to prevent underflow, we add a multiple of p before subtracting.
266
     * Use telescopic sums to represent 2^12 * p redundantly with each limb
267
     * of the form 2^60 + ...
268
     */
269
270
979k
    static const limb two60m52m4 = (((limb) 1) << 60)
271
979k
                                 - (((limb) 1) << 52)
272
979k
                                 - (((limb) 1) << 4);
273
979k
    static const limb two60p44m12 = (((limb) 1) << 60)
274
979k
                                  + (((limb) 1) << 44)
275
979k
                                  - (((limb) 1) << 12);
276
979k
    static const limb two60m28m4 = (((limb) 1) << 60)
277
979k
                                 - (((limb) 1) << 28)
278
979k
                                 - (((limb) 1) << 4);
279
979k
    static const limb two60m4 = (((limb) 1) << 60)
280
979k
                              - (((limb) 1) << 4);
281
282
979k
    out[0] += two60p44m12 - in[0];
283
979k
    out[1] += two60m52m4 - in[1];
284
979k
    out[2] += two60m28m4 - in[2];
285
979k
    out[3] += two60m4 - in[3];
286
979k
    out[4] += two60m4 - in[4];
287
979k
    out[5] += two60m4 - in[5];
288
979k
    out[6] += two60m4 - in[6];
289
979k
}
290
291
/*
292
 * in[i] < 2^63
293
 * out[i] < out_orig[i] + 2^64 + 2^48
294
 */
295
static void felem_diff_128_64(widefelem out, const felem in)
296
1.63M
{
297
    /*
298
     * In order to prevent underflow, we add a multiple of p before subtracting.
299
     * Use telescopic sums to represent 2^16 * p redundantly with each limb
300
     * of the form 2^64 + ...
301
     */
302
303
1.63M
    static const widelimb two64m56m8 = (((widelimb) 1) << 64)
304
1.63M
                                     - (((widelimb) 1) << 56)
305
1.63M
                                     - (((widelimb) 1) << 8);
306
1.63M
    static const widelimb two64m32m8 = (((widelimb) 1) << 64)
307
1.63M
                                     - (((widelimb) 1) << 32)
308
1.63M
                                     - (((widelimb) 1) << 8);
309
1.63M
    static const widelimb two64m8 = (((widelimb) 1) << 64)
310
1.63M
                                  - (((widelimb) 1) << 8);
311
1.63M
    static const widelimb two64p48m16 = (((widelimb) 1) << 64)
312
1.63M
                                      + (((widelimb) 1) << 48)
313
1.63M
                                      - (((widelimb) 1) << 16);
314
1.63M
    unsigned int i;
315
316
1.63M
    out[0] += two64p48m16;
317
1.63M
    out[1] += two64m56m8;
318
1.63M
    out[2] += two64m32m8;
319
1.63M
    out[3] += two64m8;
320
1.63M
    out[4] += two64m8;
321
1.63M
    out[5] += two64m8;
322
1.63M
    out[6] += two64m8;
323
324
13.0M
    for (i = 0; i < NLIMBS; i++)
325
11.4M
        out[i] -= in[i];
326
1.63M
}
327
328
/*
329
 * in[i] < 2^127 - 2^119 - 2^71
330
 * out[i] < out_orig[i] + 2^127 + 2^111
331
 */
332
static void felem_diff128(widefelem out, const widefelem in)
333
578k
{
334
    /*
335
     * In order to prevent underflow, we add a multiple of p before subtracting.
336
     * Use telescopic sums to represent 2^415 * p redundantly with each limb
337
     * of the form 2^127 + ...
338
     */
339
340
578k
    static const widelimb two127 = ((widelimb) 1) << 127;
341
578k
    static const widelimb two127m71 = (((widelimb) 1) << 127)
342
578k
                                    - (((widelimb) 1) << 71);
343
578k
    static const widelimb two127p111m79m71 = (((widelimb) 1) << 127)
344
578k
                                           + (((widelimb) 1) << 111)
345
578k
                                           - (((widelimb) 1) << 79)
346
578k
                                           - (((widelimb) 1) << 71);
347
578k
    static const widelimb two127m119m71 = (((widelimb) 1) << 127)
348
578k
                                        - (((widelimb) 1) << 119)
349
578k
                                        - (((widelimb) 1) << 71);
350
578k
    static const widelimb two127m95m71 = (((widelimb) 1) << 127)
351
578k
                                       - (((widelimb) 1) << 95)
352
578k
                                       - (((widelimb) 1) << 71);
353
578k
    unsigned int i;
354
355
578k
    out[0]  += two127;
356
578k
    out[1]  += two127m71;
357
578k
    out[2]  += two127m71;
358
578k
    out[3]  += two127m71;
359
578k
    out[4]  += two127m71;
360
578k
    out[5]  += two127m71;
361
578k
    out[6]  += two127p111m79m71;
362
578k
    out[7]  += two127m119m71;
363
578k
    out[8]  += two127m95m71;
364
578k
    out[9]  += two127m71;
365
578k
    out[10] += two127m71;
366
578k
    out[11] += two127m71;
367
578k
    out[12] += two127m71;
368
369
8.09M
    for (i = 0; i < 2*NLIMBS-1; i++)
370
7.51M
        out[i] -= in[i];
371
578k
}
372
373
static void felem_square_ref(widefelem out, const felem in)
374
3.52M
{
375
3.52M
    felem inx2;
376
3.52M
    felem_scalar(inx2, in, 2);
377
378
3.52M
    out[0] = ((uint128_t) in[0]) * in[0];
379
380
3.52M
    out[1] = ((uint128_t) in[0]) * inx2[1];
381
382
3.52M
    out[2] = ((uint128_t) in[0]) * inx2[2]
383
3.52M
           + ((uint128_t) in[1]) * in[1];
384
385
3.52M
    out[3] = ((uint128_t) in[0]) * inx2[3]
386
3.52M
           + ((uint128_t) in[1]) * inx2[2];
387
388
3.52M
    out[4] = ((uint128_t) in[0]) * inx2[4]
389
3.52M
           + ((uint128_t) in[1]) * inx2[3]
390
3.52M
           + ((uint128_t) in[2]) * in[2];
391
392
3.52M
    out[5] = ((uint128_t) in[0]) * inx2[5]
393
3.52M
           + ((uint128_t) in[1]) * inx2[4]
394
3.52M
           + ((uint128_t) in[2]) * inx2[3];
395
396
3.52M
    out[6] = ((uint128_t) in[0]) * inx2[6]
397
3.52M
           + ((uint128_t) in[1]) * inx2[5]
398
3.52M
           + ((uint128_t) in[2]) * inx2[4]
399
3.52M
           + ((uint128_t) in[3]) * in[3];
400
401
3.52M
    out[7] = ((uint128_t) in[1]) * inx2[6]
402
3.52M
           + ((uint128_t) in[2]) * inx2[5]
403
3.52M
           + ((uint128_t) in[3]) * inx2[4];
404
405
3.52M
    out[8] = ((uint128_t) in[2]) * inx2[6]
406
3.52M
           + ((uint128_t) in[3]) * inx2[5]
407
3.52M
           + ((uint128_t) in[4]) * in[4];
408
409
3.52M
    out[9] = ((uint128_t) in[3]) * inx2[6]
410
3.52M
           + ((uint128_t) in[4]) * inx2[5];
411
412
3.52M
    out[10] = ((uint128_t) in[4]) * inx2[6]
413
3.52M
            + ((uint128_t) in[5]) * in[5];
414
415
3.52M
    out[11] = ((uint128_t) in[5]) * inx2[6];
416
417
3.52M
    out[12] = ((uint128_t) in[6]) * in[6];
418
3.52M
}
419
420
static void felem_mul_ref(widefelem out, const felem in1, const felem in2)
421
2.84M
{
422
2.84M
    out[0] = ((uint128_t) in1[0]) * in2[0];
423
424
2.84M
    out[1] = ((uint128_t) in1[0]) * in2[1]
425
2.84M
           + ((uint128_t) in1[1]) * in2[0];
426
427
2.84M
    out[2] = ((uint128_t) in1[0]) * in2[2]
428
2.84M
           + ((uint128_t) in1[1]) * in2[1]
429
2.84M
           + ((uint128_t) in1[2]) * in2[0];
430
431
2.84M
    out[3] = ((uint128_t) in1[0]) * in2[3]
432
2.84M
           + ((uint128_t) in1[1]) * in2[2]
433
2.84M
           + ((uint128_t) in1[2]) * in2[1]
434
2.84M
           + ((uint128_t) in1[3]) * in2[0];
435
436
2.84M
    out[4] = ((uint128_t) in1[0]) * in2[4]
437
2.84M
           + ((uint128_t) in1[1]) * in2[3]
438
2.84M
           + ((uint128_t) in1[2]) * in2[2]
439
2.84M
           + ((uint128_t) in1[3]) * in2[1]
440
2.84M
           + ((uint128_t) in1[4]) * in2[0];
441
442
2.84M
    out[5] = ((uint128_t) in1[0]) * in2[5]
443
2.84M
           + ((uint128_t) in1[1]) * in2[4]
444
2.84M
           + ((uint128_t) in1[2]) * in2[3]
445
2.84M
           + ((uint128_t) in1[3]) * in2[2]
446
2.84M
           + ((uint128_t) in1[4]) * in2[1]
447
2.84M
           + ((uint128_t) in1[5]) * in2[0];
448
449
2.84M
    out[6] = ((uint128_t) in1[0]) * in2[6]
450
2.84M
           + ((uint128_t) in1[1]) * in2[5]
451
2.84M
           + ((uint128_t) in1[2]) * in2[4]
452
2.84M
           + ((uint128_t) in1[3]) * in2[3]
453
2.84M
           + ((uint128_t) in1[4]) * in2[2]
454
2.84M
           + ((uint128_t) in1[5]) * in2[1]
455
2.84M
           + ((uint128_t) in1[6]) * in2[0];
456
457
2.84M
    out[7] = ((uint128_t) in1[1]) * in2[6]
458
2.84M
           + ((uint128_t) in1[2]) * in2[5]
459
2.84M
           + ((uint128_t) in1[3]) * in2[4]
460
2.84M
           + ((uint128_t) in1[4]) * in2[3]
461
2.84M
           + ((uint128_t) in1[5]) * in2[2]
462
2.84M
           + ((uint128_t) in1[6]) * in2[1];
463
464
2.84M
    out[8] = ((uint128_t) in1[2]) * in2[6]
465
2.84M
           + ((uint128_t) in1[3]) * in2[5]
466
2.84M
           + ((uint128_t) in1[4]) * in2[4]
467
2.84M
           + ((uint128_t) in1[5]) * in2[3]
468
2.84M
           + ((uint128_t) in1[6]) * in2[2];
469
470
2.84M
    out[9] = ((uint128_t) in1[3]) * in2[6]
471
2.84M
           + ((uint128_t) in1[4]) * in2[5]
472
2.84M
           + ((uint128_t) in1[5]) * in2[4]
473
2.84M
           + ((uint128_t) in1[6]) * in2[3];
474
475
2.84M
    out[10] = ((uint128_t) in1[4]) * in2[6]
476
2.84M
            + ((uint128_t) in1[5]) * in2[5]
477
2.84M
            + ((uint128_t) in1[6]) * in2[4];
478
479
2.84M
    out[11] = ((uint128_t) in1[5]) * in2[6]
480
2.84M
            + ((uint128_t) in1[6]) * in2[5];
481
482
2.84M
    out[12] = ((uint128_t) in1[6]) * in2[6];
483
2.84M
}
484
485
/*-
486
 * Reduce thirteen 128-bit coefficients to seven 64-bit coefficients.
487
 * in[i] < 2^128 - 2^125
488
 * out[i] < 2^56 for i < 6,
489
 * out[6] <= 2^48
490
 *
491
 * The technique in use here stems from the format of the prime modulus:
492
 * P384 = 2^384 - delta
493
 *
494
 * Thus we can reduce numbers of the form (X + 2^384 * Y) by substituting
495
 * them with (X + delta Y), with delta = 2^128 + 2^96 + (-2^32 + 1). These
496
 * coefficients are still quite large, and so we repeatedly apply this
497
 * technique on high-order bits in order to guarantee the desired bounds on
498
 * the size of our output.
499
 *
500
 * The three phases of elimination are as follows:
501
 * [1]: Y = 2^120 (in[12] | in[11] | in[10] | in[9])
502
 * [2]: Y = 2^8 (acc[8] | acc[7])
503
 * [3]: Y = 2^48 (acc[6] >> 48)
504
 * (Where a | b | c | d = (2^56)^3 a + (2^56)^2 b + (2^56) c + d)
505
 */
506
static void felem_reduce(felem out, const widefelem in)
507
5.78M
{
508
    /*
509
     * In order to prevent underflow, we add a multiple of p before subtracting.
510
     * Use telescopic sums to represent 2^76 * p redundantly with each limb
511
     * of the form 2^124 + ...
512
     */
513
5.78M
    static const widelimb two124m68 = (((widelimb) 1) << 124)
514
5.78M
                                    - (((widelimb) 1) << 68);
515
5.78M
    static const widelimb two124m116m68 = (((widelimb) 1) << 124)
516
5.78M
                                        - (((widelimb) 1) << 116)
517
5.78M
                                        - (((widelimb) 1) << 68);
518
5.78M
    static const widelimb two124p108m76 = (((widelimb) 1) << 124)
519
5.78M
                                        + (((widelimb) 1) << 108)
520
5.78M
                                        - (((widelimb) 1) << 76);
521
5.78M
    static const widelimb two124m92m68 = (((widelimb) 1) << 124)
522
5.78M
                                       - (((widelimb) 1) << 92)
523
5.78M
                                       - (((widelimb) 1) << 68);
524
5.78M
    widelimb temp, acc[9];
525
5.78M
    unsigned int i;
526
527
5.78M
    memcpy(acc, in, sizeof(widelimb) * 9);
528
529
5.78M
    acc[0] += two124p108m76;
530
5.78M
    acc[1] += two124m116m68;
531
5.78M
    acc[2] += two124m92m68;
532
5.78M
    acc[3] += two124m68;
533
5.78M
    acc[4] += two124m68;
534
5.78M
    acc[5] += two124m68;
535
5.78M
    acc[6] += two124m68;
536
537
    /* [1]: Eliminate in[9], ..., in[12] */
538
5.78M
    acc[8] += in[12] >> 32;
539
5.78M
    acc[7] += (in[12] & 0xffffffff) << 24;
540
5.78M
    acc[7] += in[12] >> 8;
541
5.78M
    acc[6] += (in[12] & 0xff) << 48;
542
5.78M
    acc[6] -= in[12] >> 16;
543
5.78M
    acc[5] -= (in[12] & 0xffff) << 40;
544
5.78M
    acc[6] += in[12] >> 48;
545
5.78M
    acc[5] += (in[12] & 0xffffffffffff) << 8;
546
547
5.78M
    acc[7] += in[11] >> 32;
548
5.78M
    acc[6] += (in[11] & 0xffffffff) << 24;
549
5.78M
    acc[6] += in[11] >> 8;
550
5.78M
    acc[5] += (in[11] & 0xff) << 48;
551
5.78M
    acc[5] -= in[11] >> 16;
552
5.78M
    acc[4] -= (in[11] & 0xffff) << 40;
553
5.78M
    acc[5] += in[11] >> 48;
554
5.78M
    acc[4] += (in[11] & 0xffffffffffff) << 8;
555
556
5.78M
    acc[6] += in[10] >> 32;
557
5.78M
    acc[5] += (in[10] & 0xffffffff) << 24;
558
5.78M
    acc[5] += in[10] >> 8;
559
5.78M
    acc[4] += (in[10] & 0xff) << 48;
560
5.78M
    acc[4] -= in[10] >> 16;
561
5.78M
    acc[3] -= (in[10] & 0xffff) << 40;
562
5.78M
    acc[4] += in[10] >> 48;
563
5.78M
    acc[3] += (in[10] & 0xffffffffffff) << 8;
564
565
5.78M
    acc[5] += in[9] >> 32;
566
5.78M
    acc[4] += (in[9] & 0xffffffff) << 24;
567
5.78M
    acc[4] += in[9] >> 8;
568
5.78M
    acc[3] += (in[9] & 0xff) << 48;
569
5.78M
    acc[3] -= in[9] >> 16;
570
5.78M
    acc[2] -= (in[9] & 0xffff) << 40;
571
5.78M
    acc[3] += in[9] >> 48;
572
5.78M
    acc[2] += (in[9] & 0xffffffffffff) << 8;
573
574
    /*
575
     * [2]: Eliminate acc[7], acc[8], that is the 7 and eighth limbs, as
576
     * well as the contributions made from eliminating higher limbs.
577
     * acc[7] < in[7] + 2^120 + 2^56 < in[7] + 2^121
578
     * acc[8] < in[8] + 2^96
579
     */
580
5.78M
    acc[4] += acc[8] >> 32;
581
5.78M
    acc[3] += (acc[8] & 0xffffffff) << 24;
582
5.78M
    acc[3] += acc[8] >> 8;
583
5.78M
    acc[2] += (acc[8] & 0xff) << 48;
584
5.78M
    acc[2] -= acc[8] >> 16;
585
5.78M
    acc[1] -= (acc[8] & 0xffff) << 40;
586
5.78M
    acc[2] += acc[8] >> 48;
587
5.78M
    acc[1] += (acc[8] & 0xffffffffffff) << 8;
588
589
5.78M
    acc[3] += acc[7] >> 32;
590
5.78M
    acc[2] += (acc[7] & 0xffffffff) << 24;
591
5.78M
    acc[2] += acc[7] >> 8;
592
5.78M
    acc[1] += (acc[7] & 0xff) << 48;
593
5.78M
    acc[1] -= acc[7] >> 16;
594
5.78M
    acc[0] -= (acc[7] & 0xffff) << 40;
595
5.78M
    acc[1] += acc[7] >> 48;
596
5.78M
    acc[0] += (acc[7] & 0xffffffffffff) << 8;
597
598
    /*-
599
     * acc[k] < in[k] + 2^124 + 2^121 
600
     *        < in[k] + 2^125
601
     *        < 2^128, for k <= 6
602
     */
603
604
    /*
605
     * Carry 4 -> 5 -> 6
606
     * This has the effect of ensuring that these more significant limbs
607
     * will be small in value after eliminating high bits from acc[6].
608
     */
609
5.78M
    acc[5] += acc[4] >> 56;
610
5.78M
    acc[4] &= 0x00ffffffffffffff;
611
612
5.78M
    acc[6] += acc[5] >> 56;
613
5.78M
    acc[5] &= 0x00ffffffffffffff;
614
615
    /*-
616
     * acc[6] < in[6] + 2^124 + 2^121 + 2^72 + 2^16
617
     *        < in[6] + 2^125
618
     *        < 2^128
619
     */
620
621
    /* [3]: Eliminate high bits of acc[6] */
622
5.78M
    temp = acc[6] >> 48;
623
5.78M
    acc[6] &= 0x0000ffffffffffff;
624
    
625
    /* temp < 2^80 */
626
627
5.78M
    acc[3] += temp >> 40;
628
5.78M
    acc[2] += (temp & 0xffffffffff) << 16;
629
5.78M
    acc[2] += temp >> 16;
630
5.78M
    acc[1] += (temp & 0xffff) << 40;
631
5.78M
    acc[1] -= temp >> 24;
632
5.78M
    acc[0] -= (temp & 0xffffff) << 32;
633
5.78M
    acc[0] += temp;
634
635
    /*-
636
     * acc[k] < acc_old[k] + 2^64 + 2^56
637
     *        < in[k] + 2^124 + 2^121 + 2^72 + 2^64 + 2^56 + 2^16 , k < 4
638
     */
639
640
    /* Carry 0 -> 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
641
5.78M
    acc[1] += acc[0] >> 56;   /* acc[1] < acc_old[1] + 2^72 */
642
5.78M
    acc[0] &= 0x00ffffffffffffff;
643
644
5.78M
    acc[2] += acc[1] >> 56;   /* acc[2] < acc_old[2] + 2^72 + 2^16 */
645
5.78M
    acc[1] &= 0x00ffffffffffffff;
646
647
5.78M
    acc[3] += acc[2] >> 56;   /* acc[3] < acc_old[3] + 2^72 + 2^16 */
648
5.78M
    acc[2] &= 0x00ffffffffffffff;
649
650
    /*-
651
     * acc[k] < acc_old[k] + 2^72 + 2^16
652
     *        < in[k] + 2^124 + 2^121 + 2^73 + 2^64 + 2^56 + 2^17
653
     *        < in[k] + 2^125
654
     *        < 2^128 , k < 4
655
     */
656
657
5.78M
    acc[4] += acc[3] >> 56;   /*-
658
                               * acc[4] < acc_old[4] + 2^72 + 2^16
659
                               *        < 2^72 + 2^56 + 2^16
660
                               */
661
5.78M
    acc[3] &= 0x00ffffffffffffff;
662
663
5.78M
    acc[5] += acc[4] >> 56;   /*-
664
                               * acc[5] < acc_old[5] + 2^16 + 1
665
                               *        < 2^56 + 2^16 + 1
666
                               */
667
5.78M
    acc[4] &= 0x00ffffffffffffff;
668
669
5.78M
    acc[6] += acc[5] >> 56;   /* acc[6] < 2^48 + 1 <= 2^48 */
670
5.78M
    acc[5] &= 0x00ffffffffffffff;
671
672
46.3M
    for (i = 0; i < NLIMBS; i++)
673
40.5M
        out[i] = acc[i];
674
5.78M
}
675
676
#if defined(ECP_NISTP384_ASM)
677
static void felem_square_wrapper(widefelem out, const felem in);
678
static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2);
679
680
static void (*felem_square_p)(widefelem out, const felem in) =
681
    felem_square_wrapper;
682
static void (*felem_mul_p)(widefelem out, const felem in1, const felem in2) =
683
    felem_mul_wrapper;
684
685
void p384_felem_square(widefelem out, const felem in);
686
void p384_felem_mul(widefelem out, const felem in1, const felem in2);
687
688
# if defined(_ARCH_PPC64)
689
#  include "crypto/ppc_arch.h"
690
# endif
691
692
static void felem_select(void)
693
{
694
# if defined(_ARCH_PPC64)
695
    if ((OPENSSL_ppccap_P & PPC_MADD300) && (OPENSSL_ppccap_P & PPC_ALTIVEC)) {
696
        felem_square_p = p384_felem_square;
697
        felem_mul_p = p384_felem_mul;
698
699
        return;
700
    }
701
# endif
702
703
    /* Default */
704
    felem_square_p = felem_square_ref;
705
    felem_mul_p = felem_mul_ref;
706
}
707
708
static void felem_square_wrapper(widefelem out, const felem in)
709
{
710
    felem_select();
711
    felem_square_p(out, in);
712
}
713
714
static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2)
715
{
716
    felem_select();
717
    felem_mul_p(out, in1, in2);
718
}
719
720
# define felem_square felem_square_p
721
# define felem_mul felem_mul_p
722
#else
723
3.52M
# define felem_square felem_square_ref
724
2.84M
# define felem_mul felem_mul_ref
725
#endif
726
727
static ossl_inline void felem_square_reduce(felem out, const felem in)
728
2.07M
{
729
2.07M
    widefelem tmp;
730
731
2.07M
    felem_square(tmp, in);
732
2.07M
    felem_reduce(out, tmp);
733
2.07M
}
734
735
static ossl_inline void felem_mul_reduce(felem out, const felem in1, const felem in2)
736
1.72M
{
737
1.72M
    widefelem tmp;
738
739
1.72M
    felem_mul(tmp, in1, in2);
740
1.72M
    felem_reduce(out, tmp);
741
1.72M
}
742
743
/*-
744
 * felem_inv calculates |out| = |in|^{-1}
745
 *
746
 * Based on Fermat's Little Theorem:
747
 *   a^p = a (mod p)
748
 *   a^{p-1} = 1 (mod p)
749
 *   a^{p-2} = a^{-1} (mod p)
750
 */
751
static void felem_inv(felem out, const felem in)
752
2.24k
{
753
2.24k
    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6;
754
2.24k
    unsigned int i = 0;
755
756
2.24k
    felem_square_reduce(ftmp, in);      /* 2^1 */
757
2.24k
    felem_mul_reduce(ftmp, ftmp, in);   /* 2^1 + 2^0 */
758
2.24k
    felem_assign(ftmp2, ftmp);
759
760
2.24k
    felem_square_reduce(ftmp, ftmp);    /* 2^2 + 2^1 */
761
2.24k
    felem_mul_reduce(ftmp, ftmp, in);   /* 2^2 + 2^1 * 2^0 */
762
2.24k
    felem_assign(ftmp3, ftmp);
763
764
8.98k
    for (i = 0; i < 3; i++)
765
6.74k
        felem_square_reduce(ftmp, ftmp); /* 2^5 + 2^4 + 2^3 */
766
2.24k
    felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^5 + 2^4 + 2^3 + 2^2 + 2^1 + 2^0 */
767
2.24k
    felem_assign(ftmp4, ftmp);
768
769
15.7k
    for (i = 0; i < 6; i++)
770
13.4k
        felem_square_reduce(ftmp, ftmp); /* 2^11 + ... + 2^6 */
771
2.24k
    felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^11 + ... + 2^0 */
772
773
8.98k
    for (i = 0; i < 3; i++)
774
6.74k
        felem_square_reduce(ftmp, ftmp); /* 2^14 + ... + 2^3 */
775
2.24k
    felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^14 + ... + 2^0 */
776
2.24k
    felem_assign(ftmp5, ftmp);
777
778
35.9k
    for (i = 0; i < 15; i++)
779
33.7k
        felem_square_reduce(ftmp, ftmp); /* 2^29 + ... + 2^15 */
780
2.24k
    felem_mul_reduce(ftmp, ftmp5, ftmp); /* 2^29 + ... + 2^0 */
781
2.24k
    felem_assign(ftmp6, ftmp);
782
783
69.6k
    for (i = 0; i < 30; i++)
784
67.4k
        felem_square_reduce(ftmp, ftmp); /* 2^59 + ... + 2^30 */
785
2.24k
    felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^59 + ... + 2^0 */
786
2.24k
    felem_assign(ftmp4, ftmp);
787
788
137k
    for (i = 0; i < 60; i++)
789
134k
        felem_square_reduce(ftmp, ftmp); /* 2^119 + ... + 2^60 */
790
2.24k
    felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^119 + ... + 2^0 */
791
2.24k
    felem_assign(ftmp4, ftmp);
792
793
271k
    for (i = 0; i < 120; i++)
794
269k
      felem_square_reduce(ftmp, ftmp);   /* 2^239 + ... + 2^120 */
795
2.24k
    felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^239 + ... + 2^0 */
796
797
35.9k
    for (i = 0; i < 15; i++)
798
33.7k
        felem_square_reduce(ftmp, ftmp); /* 2^254 + ... + 2^15 */
799
2.24k
    felem_mul_reduce(ftmp, ftmp5, ftmp); /* 2^254 + ... + 2^0 */
800
801
71.9k
    for (i = 0; i < 31; i++)
802
69.6k
        felem_square_reduce(ftmp, ftmp); /* 2^285 + ... + 2^31 */
803
2.24k
    felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^285 + ... + 2^31 + 2^29 + ... + 2^0 */
804
805
6.74k
    for (i = 0; i < 2; i++)
806
4.49k
        felem_square_reduce(ftmp, ftmp); /* 2^287 + ... + 2^33 + 2^31 + ... + 2^2 */
807
2.24k
    felem_mul_reduce(ftmp, ftmp2, ftmp); /* 2^287 + ... + 2^33 + 2^31 + ... + 2^0 */
808
809
213k
    for (i = 0; i < 94; i++)
810
211k
        felem_square_reduce(ftmp, ftmp); /* 2^381 + ... + 2^127 + 2^125 + ... + 2^94 */
811
2.24k
    felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^381 + ... + 2^127 + 2^125 + ... + 2^94 + 2^29 + ... + 2^0 */
812
813
6.74k
    for (i = 0; i < 2; i++)
814
4.49k
        felem_square_reduce(ftmp, ftmp); /* 2^383 + ... + 2^129 + 2^127 + ... + 2^96 + 2^31 + ... + 2^2 */
815
2.24k
    felem_mul_reduce(ftmp, in, ftmp);    /* 2^383 + ... + 2^129 + 2^127 + ... + 2^96 + 2^31 + ... + 2^2 + 2^0 */
816
817
2.24k
    memcpy(out, ftmp, sizeof(felem));
818
2.24k
}
819
820
/*
821
 * Zero-check: returns a limb with all bits set if |in| == 0 (mod p)
822
 * and 0 otherwise. We know that field elements are reduced to
823
 * 0 < in < 2p, so we only need to check two cases:
824
 * 0 and 2^384 - 2^128 - 2^96 + 2^32 - 1
825
 *   in[k] < 2^56, k < 6
826
 *   in[6] <= 2^48
827
 */
828
static limb felem_is_zero(const felem in)
829
707k
{
830
707k
    limb zero, p384;
831
832
707k
    zero = in[0] | in[1] | in[2] | in[3] | in[4] | in[5] | in[6];
833
707k
    zero = ((int64_t) (zero) - 1) >> 63;
834
707k
    p384 = (in[0] ^ 0x000000ffffffff) | (in[1] ^ 0xffff0000000000)
835
707k
         | (in[2] ^ 0xfffffffffeffff) | (in[3] ^ 0xffffffffffffff)
836
707k
         | (in[4] ^ 0xffffffffffffff) | (in[5] ^ 0xffffffffffffff)
837
707k
         | (in[6] ^ 0xffffffffffff);
838
707k
    p384 = ((int64_t) (p384) - 1) >> 63;
839
840
707k
    return (zero | p384);
841
707k
}
842
843
static int felem_is_zero_int(const void *in)
844
0
{
845
0
    return (int)(felem_is_zero(in) & ((limb) 1));
846
0
}
847
848
/*-
849
 * felem_contract converts |in| to its unique, minimal representation.
850
 * Assume we've removed all redundant bits.
851
 * On entry:
852
 *   in[k] < 2^56, k < 6
853
 *   in[6] <= 2^48
854
 */
855
static void felem_contract(felem out, const felem in)
856
10.2k
{
857
10.2k
    static const int64_t two56 = ((limb) 1) << 56;
858
859
    /*
860
     * We know for a fact that 0 <= |in| < 2*p, for p = 2^384 - 2^128 - 2^96 + 2^32 - 1
861
     * Perform two successive, idempotent subtractions to reduce if |in| >= p.
862
     */
863
864
10.2k
    int64_t tmp[NLIMBS], cond[5], a;
865
10.2k
    unsigned int i;
866
867
10.2k
    memcpy(tmp, in, sizeof(felem));
868
 
869
    /* Case 1: a = 1 iff |in| >= 2^384 */
870
10.2k
    a = (in[6] >> 48);
871
10.2k
    tmp[0] += a;
872
10.2k
    tmp[0] -= a << 32;
873
10.2k
    tmp[1] += a << 40;
874
10.2k
    tmp[2] += a << 16;
875
10.2k
    tmp[6] &= 0x0000ffffffffffff;
876
877
    /*
878
     * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
879
     * non-zero, so we only need one step
880
     */
881
882
10.2k
    a = tmp[0] >> 63;
883
10.2k
    tmp[0] += a & two56;
884
10.2k
    tmp[1] -= a & 1;
885
886
    /* Carry 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
887
10.2k
    tmp[2] += tmp[1] >> 56;
888
10.2k
    tmp[1] &= 0x00ffffffffffffff;
889
890
10.2k
    tmp[3] += tmp[2] >> 56;
891
10.2k
    tmp[2] &= 0x00ffffffffffffff;
892
893
10.2k
    tmp[4] += tmp[3] >> 56;
894
10.2k
    tmp[3] &= 0x00ffffffffffffff;
895
896
10.2k
    tmp[5] += tmp[4] >> 56;
897
10.2k
    tmp[4] &= 0x00ffffffffffffff;
898
899
10.2k
    tmp[6] += tmp[5] >> 56; /* tmp[6] < 2^48 */
900
10.2k
    tmp[5] &= 0x00ffffffffffffff;
901
902
    /*
903
     * Case 2: a = all ones if p <= |in| < 2^384, 0 otherwise
904
     */
905
906
    /* 0 iff (2^129..2^383) are all one */
907
10.2k
    cond[0] = ((tmp[6] | 0xff000000000000) & tmp[5] & tmp[4] & tmp[3] & (tmp[2] | 0x0000000001ffff)) + 1;
908
    /* 0 iff 2^128 bit is one */
909
10.2k
    cond[1] = (tmp[2] | ~0x00000000010000) + 1;
910
    /* 0 iff (2^96..2^127) bits are all one */
911
10.2k
    cond[2] = ((tmp[2] | 0xffffffffff0000) & (tmp[1] | 0x0000ffffffffff)) + 1;
912
    /* 0 iff (2^32..2^95) bits are all zero */
913
10.2k
    cond[3] = (tmp[1] & ~0xffff0000000000) | (tmp[0] & ~((int64_t) 0x000000ffffffff));
914
    /* 0 iff (2^0..2^31) bits are all one */
915
10.2k
    cond[4] = (tmp[0] | 0xffffff00000000) + 1;
916
917
    /*
918
     * In effect, invert our conditions, so that 0 values become all 1's,
919
     * any non-zero value in the low-order 56 bits becomes all 0's
920
     */
921
61.2k
    for (i = 0; i < 5; i++)
922
51.0k
       cond[i] = ((cond[i] & 0x00ffffffffffffff) - 1) >> 63;
923
924
    /*
925
     * The condition for determining whether in is greater than our
926
     * prime is given by the following condition.
927
     */
928
929
    /* First subtract 2^384 - 2^129 cheaply */
930
10.2k
    a = cond[0] & (cond[1] | (cond[2] & (~cond[3] | cond[4])));
931
10.2k
    tmp[6] &= ~a;
932
10.2k
    tmp[5] &= ~a;
933
10.2k
    tmp[4] &= ~a;
934
10.2k
    tmp[3] &= ~a;
935
10.2k
    tmp[2] &= ~a | 0x0000000001ffff;
936
937
    /*
938
     * Subtract 2^128 - 2^96 by
939
     * means of disjoint cases.
940
     */
941
942
    /* subtract 2^128 if that bit is present, and add 2^96 */
943
10.2k
    a = cond[0] & cond[1];
944
10.2k
    tmp[2] &= ~a | 0xfffffffffeffff;
945
10.2k
    tmp[1] += a & ((int64_t) 1 << 40);
946
947
    /* otherwise, clear bits 2^127 .. 2^96  */
948
10.2k
    a = cond[0] & ~cond[1] & (cond[2] & (~cond[3] | cond[4]));
949
10.2k
    tmp[2] &= ~a | 0xffffffffff0000;
950
10.2k
    tmp[1] &= ~a | 0x0000ffffffffff;
951
952
    /* finally, subtract the last 2^32 - 1 */
953
10.2k
    a = cond[0] & (cond[1] | (cond[2] & (~cond[3] | cond[4])));
954
10.2k
    tmp[0] += a & (-((int64_t) 1 << 32) + 1);
955
956
    /*
957
     * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
958
     * non-zero, so we only need one step
959
     */
960
10.2k
    a = tmp[0] >> 63;
961
10.2k
    tmp[0] += a & two56;
962
10.2k
    tmp[1] -= a & 1;
963
964
    /* Carry 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
965
10.2k
    tmp[2] += tmp[1] >> 56;
966
10.2k
    tmp[1] &= 0x00ffffffffffffff;
967
968
10.2k
    tmp[3] += tmp[2] >> 56;
969
10.2k
    tmp[2] &= 0x00ffffffffffffff;
970
971
10.2k
    tmp[4] += tmp[3] >> 56;
972
10.2k
    tmp[3] &= 0x00ffffffffffffff;
973
974
10.2k
    tmp[5] += tmp[4] >> 56;
975
10.2k
    tmp[4] &= 0x00ffffffffffffff;
976
977
10.2k
    tmp[6] += tmp[5] >> 56;
978
10.2k
    tmp[5] &= 0x00ffffffffffffff;
979
980
10.2k
    memcpy(out, tmp, sizeof(felem));
981
10.2k
}
982
983
/*-
984
 * Group operations
985
 * ----------------
986
 *
987
 * Building on top of the field operations we have the operations on the
988
 * elliptic curve group itself. Points on the curve are represented in Jacobian
989
 * coordinates
990
 */
991
992
/*-
993
 * point_double calculates 2*(x_in, y_in, z_in)
994
 *
995
 * The method is taken from:
996
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
997
 *
998
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
999
 * while x_out == y_in is not (maybe this works, but it's not tested).
1000
 */
1001
static void
1002
point_double(felem x_out, felem y_out, felem z_out,
1003
             const felem x_in, const felem y_in, const felem z_in)
1004
401k
{
1005
401k
    widefelem tmp, tmp2;
1006
401k
    felem delta, gamma, beta, alpha, ftmp, ftmp2;
1007
1008
401k
    felem_assign(ftmp, x_in);
1009
401k
    felem_assign(ftmp2, x_in);
1010
1011
    /* delta = z^2 */
1012
401k
    felem_square_reduce(delta, z_in);     /* delta[i] < 2^56 */
1013
1014
    /* gamma = y^2 */
1015
401k
    felem_square_reduce(gamma, y_in);     /* gamma[i] < 2^56 */
1016
1017
    /* beta = x*gamma */
1018
401k
    felem_mul_reduce(beta, x_in, gamma);  /* beta[i] < 2^56 */
1019
1020
    /* alpha = 3*(x-delta)*(x+delta) */
1021
401k
    felem_diff64(ftmp, delta);            /* ftmp[i] < 2^60 + 2^58 + 2^44 */
1022
401k
    felem_sum64(ftmp2, delta);            /* ftmp2[i] < 2^59 */
1023
401k
    felem_scalar64(ftmp2, 3);             /* ftmp2[i] < 2^61 */
1024
401k
    felem_mul_reduce(alpha, ftmp, ftmp2); /* alpha[i] < 2^56 */
1025
1026
    /* x' = alpha^2 - 8*beta */
1027
401k
    felem_square(tmp, alpha);             /* tmp[i] < 2^115 */
1028
401k
    felem_assign(ftmp, beta);             /* ftmp[i] < 2^56 */
1029
401k
    felem_scalar64(ftmp, 8);              /* ftmp[i] < 2^59 */
1030
401k
    felem_diff_128_64(tmp, ftmp);         /* tmp[i] < 2^115 + 2^64 + 2^48 */
1031
401k
    felem_reduce(x_out, tmp);             /* x_out[i] < 2^56 */
1032
1033
    /* z' = (y + z)^2 - gamma - delta */
1034
401k
    felem_sum64(delta, gamma);     /* delta[i] < 2^57 */
1035
401k
    felem_assign(ftmp, y_in);      /* ftmp[i] < 2^56 */
1036
401k
    felem_sum64(ftmp, z_in);       /* ftmp[i] < 2^56 */
1037
401k
    felem_square(tmp, ftmp);       /* tmp[i] < 2^115 */
1038
401k
    felem_diff_128_64(tmp, delta); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1039
401k
    felem_reduce(z_out, tmp);      /* z_out[i] < 2^56 */
1040
1041
    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1042
401k
    felem_scalar64(beta, 4);       /* beta[i] < 2^58 */
1043
401k
    felem_diff64(beta, x_out);     /* beta[i] < 2^60 + 2^58 + 2^44 */
1044
401k
    felem_mul(tmp, alpha, beta);   /* tmp[i] < 2^119 */
1045
401k
    felem_square(tmp2, gamma);     /* tmp2[i] < 2^115 */
1046
401k
    felem_scalar128(tmp2, 8);      /* tmp2[i] < 2^118 */
1047
401k
    felem_diff128(tmp, tmp2);      /* tmp[i] < 2^127 + 2^119 + 2^111 */
1048
401k
    felem_reduce(y_out, tmp);      /* tmp[i] < 2^56 */
1049
401k
}
1050
1051
/* copy_conditional copies in to out iff mask is all ones. */
1052
static void copy_conditional(felem out, const felem in, limb mask)
1053
1.11M
{
1054
1.11M
    unsigned int i;
1055
1056
8.94M
    for (i = 0; i < NLIMBS; i++)
1057
7.82M
        out[i] ^= mask & (in[i] ^ out[i]);
1058
1.11M
}
1059
1060
/*-
1061
 * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1062
 *
1063
 * The method is taken from
1064
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1065
 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1066
 *
1067
 * This function includes a branch for checking whether the two input points
1068
 * are equal (while not equal to the point at infinity). See comment below
1069
 * on constant-time.
1070
 */
1071
static void point_add(felem x3, felem y3, felem z3,
1072
                      const felem x1, const felem y1, const felem z1,
1073
                      const int mixed, const felem x2, const felem y2,
1074
                      const felem z2)
1075
176k
{
1076
176k
    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1077
176k
    widefelem tmp, tmp2;
1078
176k
    limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1079
176k
    limb points_equal;
1080
1081
176k
    z1_is_zero = felem_is_zero(z1);
1082
176k
    z2_is_zero = felem_is_zero(z2);
1083
1084
    /* ftmp = z1z1 = z1**2 */
1085
176k
    felem_square_reduce(ftmp, z1);      /* ftmp[i] < 2^56 */
1086
1087
176k
    if (!mixed) {
1088
        /* ftmp2 = z2z2 = z2**2 */
1089
61.5k
        felem_square_reduce(ftmp2, z2); /* ftmp2[i] < 2^56 */
1090
1091
        /* u1 = ftmp3 = x1*z2z2 */
1092
61.5k
        felem_mul_reduce(ftmp3, x1, ftmp2); /* ftmp3[i] < 2^56 */
1093
1094
        /* ftmp5 = z1 + z2 */
1095
61.5k
        felem_assign(ftmp5, z1);       /* ftmp5[i] < 2^56 */
1096
61.5k
        felem_sum64(ftmp5, z2);        /* ftmp5[i] < 2^57 */
1097
1098
        /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1099
61.5k
        felem_square(tmp, ftmp5);      /* tmp[i] < 2^117 */
1100
61.5k
        felem_diff_128_64(tmp, ftmp);  /* tmp[i] < 2^117 + 2^64 + 2^48 */
1101
61.5k
        felem_diff_128_64(tmp, ftmp2); /* tmp[i] < 2^117 + 2^65 + 2^49 */
1102
61.5k
        felem_reduce(ftmp5, tmp);      /* ftmp5[i] < 2^56 */
1103
1104
        /* ftmp2 = z2 * z2z2 */
1105
61.5k
        felem_mul_reduce(ftmp2, ftmp2, z2); /* ftmp2[i] < 2^56 */
1106
1107
        /* s1 = ftmp6 = y1 * z2**3 */
1108
61.5k
        felem_mul_reduce(ftmp6, y1, ftmp2); /* ftmp6[i] < 2^56 */
1109
115k
    } else {
1110
        /*
1111
         * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1112
         */
1113
1114
        /* u1 = ftmp3 = x1*z2z2 */
1115
115k
        felem_assign(ftmp3, x1);     /* ftmp3[i] < 2^56 */
1116
1117
        /* ftmp5 = 2*z1z2 */
1118
115k
        felem_scalar(ftmp5, z1, 2);  /* ftmp5[i] < 2^57 */
1119
1120
        /* s1 = ftmp6 = y1 * z2**3 */
1121
115k
        felem_assign(ftmp6, y1);     /* ftmp6[i] < 2^56 */
1122
115k
    }
1123
    /* ftmp3[i] < 2^56, ftmp5[i] < 2^57, ftmp6[i] < 2^56 */
1124
1125
    /* u2 = x2*z1z1 */
1126
176k
    felem_mul(tmp, x2, ftmp);        /* tmp[i] < 2^115 */
1127
1128
    /* h = ftmp4 = u2 - u1 */
1129
176k
    felem_diff_128_64(tmp, ftmp3);   /* tmp[i] < 2^115 + 2^64 + 2^48 */
1130
176k
    felem_reduce(ftmp4, tmp);        /* ftmp[4] < 2^56 */
1131
1132
176k
    x_equal = felem_is_zero(ftmp4);
1133
1134
    /* z_out = ftmp5 * h */
1135
176k
    felem_mul_reduce(z_out, ftmp5, ftmp4);  /* z_out[i] < 2^56 */
1136
1137
    /* ftmp = z1 * z1z1 */
1138
176k
    felem_mul_reduce(ftmp, ftmp, z1);  /* ftmp[i] < 2^56 */
1139
1140
    /* s2 = tmp = y2 * z1**3 */
1141
176k
    felem_mul(tmp, y2, ftmp);      /* tmp[i] < 2^115 */
1142
1143
    /* r = ftmp5 = (s2 - s1)*2 */
1144
176k
    felem_diff_128_64(tmp, ftmp6); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1145
176k
    felem_reduce(ftmp5, tmp);      /* ftmp5[i] < 2^56 */
1146
176k
    y_equal = felem_is_zero(ftmp5);
1147
176k
    felem_scalar64(ftmp5, 2);      /* ftmp5[i] < 2^57 */
1148
1149
    /*
1150
     * The formulae are incorrect if the points are equal, in affine coordinates
1151
     * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
1152
     * happens.
1153
     *
1154
     * We use bitwise operations to avoid potential side-channels introduced by
1155
     * the short-circuiting behaviour of boolean operators.
1156
     *
1157
     * The special case of either point being the point at infinity (z1 and/or
1158
     * z2 are zero), is handled separately later on in this function, so we
1159
     * avoid jumping to point_double here in those special cases.
1160
     *
1161
     * Notice the comment below on the implications of this branching for timing
1162
     * leaks and why it is considered practically irrelevant.
1163
     */
1164
176k
    points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));
1165
1166
176k
    if (points_equal) {
1167
        /*
1168
         * This is obviously not constant-time but it will almost-never happen
1169
         * for ECDH / ECDSA.
1170
         */
1171
0
        point_double(x3, y3, z3, x1, y1, z1);
1172
0
        return;
1173
0
    }
1174
1175
    /* I = ftmp = (2h)**2 */
1176
176k
    felem_assign(ftmp, ftmp4);        /* ftmp[i] < 2^56 */
1177
176k
    felem_scalar64(ftmp, 2);          /* ftmp[i] < 2^57 */
1178
176k
    felem_square_reduce(ftmp, ftmp);  /* ftmp[i] < 2^56 */
1179
1180
    /* J = ftmp2 = h * I */
1181
176k
    felem_mul_reduce(ftmp2, ftmp4, ftmp); /* ftmp2[i] < 2^56 */
1182
1183
    /* V = ftmp4 = U1 * I */
1184
176k
    felem_mul_reduce(ftmp4, ftmp3, ftmp); /* ftmp4[i] < 2^56 */
1185
1186
    /* x_out = r**2 - J - 2V */
1187
176k
    felem_square(tmp, ftmp5);      /* tmp[i] < 2^117 */
1188
176k
    felem_diff_128_64(tmp, ftmp2); /* tmp[i] < 2^117 + 2^64 + 2^48 */
1189
176k
    felem_assign(ftmp3, ftmp4);    /* ftmp3[i] < 2^56 */
1190
176k
    felem_scalar64(ftmp4, 2);      /* ftmp4[i] < 2^57 */
1191
176k
    felem_diff_128_64(tmp, ftmp4); /* tmp[i] < 2^117 + 2^65 + 2^49 */
1192
176k
    felem_reduce(x_out, tmp);      /* x_out[i] < 2^56 */
1193
1194
    /* y_out = r(V-x_out) - 2 * s1 * J */
1195
176k
    felem_diff64(ftmp3, x_out);    /* ftmp3[i] < 2^60 + 2^56 + 2^44 */
1196
176k
    felem_mul(tmp, ftmp5, ftmp3);  /* tmp[i] < 2^116 */
1197
176k
    felem_mul(tmp2, ftmp6, ftmp2); /* tmp2[i] < 2^115 */
1198
176k
    felem_scalar128(tmp2, 2);      /* tmp2[i] < 2^116 */
1199
176k
    felem_diff128(tmp, tmp2);      /* tmp[i] < 2^127 + 2^116 + 2^111 */
1200
176k
    felem_reduce(y_out, tmp);      /* y_out[i] < 2^56 */
1201
1202
176k
    copy_conditional(x_out, x2, z1_is_zero);
1203
176k
    copy_conditional(x_out, x1, z2_is_zero);
1204
176k
    copy_conditional(y_out, y2, z1_is_zero);
1205
176k
    copy_conditional(y_out, y1, z2_is_zero);
1206
176k
    copy_conditional(z_out, z2, z1_is_zero);
1207
176k
    copy_conditional(z_out, z1, z2_is_zero);
1208
176k
    felem_assign(x3, x_out);
1209
176k
    felem_assign(y3, y_out);
1210
176k
    felem_assign(z3, z_out);
1211
176k
}
1212
1213
/*-
1214
 * Base point pre computation
1215
 * --------------------------
1216
 *
1217
 * Two different sorts of precomputed tables are used in the following code.
1218
 * Each contain various points on the curve, where each point is three field
1219
 * elements (x, y, z).
1220
 *
1221
 * For the base point table, z is usually 1 (0 for the point at infinity).
1222
 * This table has 16 elements:
1223
 * index | bits    | point
1224
 * ------+---------+------------------------------
1225
 *     0 | 0 0 0 0 | 0G
1226
 *     1 | 0 0 0 1 | 1G
1227
 *     2 | 0 0 1 0 | 2^95G
1228
 *     3 | 0 0 1 1 | (2^95 + 1)G
1229
 *     4 | 0 1 0 0 | 2^190G
1230
 *     5 | 0 1 0 1 | (2^190 + 1)G
1231
 *     6 | 0 1 1 0 | (2^190 + 2^95)G
1232
 *     7 | 0 1 1 1 | (2^190 + 2^95 + 1)G
1233
 *     8 | 1 0 0 0 | 2^285G
1234
 *     9 | 1 0 0 1 | (2^285 + 1)G
1235
 *    10 | 1 0 1 0 | (2^285 + 2^95)G
1236
 *    11 | 1 0 1 1 | (2^285 + 2^95 + 1)G
1237
 *    12 | 1 1 0 0 | (2^285 + 2^190)G
1238
 *    13 | 1 1 0 1 | (2^285 + 2^190 + 1)G
1239
 *    14 | 1 1 1 0 | (2^285 + 2^190 + 2^95)G
1240
 *    15 | 1 1 1 1 | (2^285 + 2^190 + 2^95 + 1)G
1241
 *
1242
 * The reason for this is so that we can clock bits into four different
1243
 * locations when doing simple scalar multiplies against the base point.
1244
 *
1245
 * Tables for other points have table[i] = iG for i in 0 .. 16.
1246
 */
1247
1248
/* gmul is the table of precomputed base points */
1249
static const felem gmul[16][3] = {
1250
{{0, 0, 0, 0, 0, 0, 0},
1251
 {0, 0, 0, 0, 0, 0, 0},
1252
 {0, 0, 0, 0, 0, 0, 0}},
1253
{{0x00545e3872760ab7, 0x00f25dbf55296c3a, 0x00e082542a385502, 0x008ba79b9859f741,
1254
  0x0020ad746e1d3b62, 0x0005378eb1c71ef3, 0x0000aa87ca22be8b},
1255
 {0x00431d7c90ea0e5f, 0x00b1ce1d7e819d7a, 0x0013b5f0b8c00a60, 0x00289a147ce9da31,
1256
  0x0092dc29f8f41dbd, 0x002c6f5d9e98bf92, 0x00003617de4a9626},
1257
 {1, 0, 0, 0, 0, 0, 0}},
1258
{{0x00024711cc902a90, 0x00acb2e579ab4fe1, 0x00af818a4b4d57b1, 0x00a17c7bec49c3de,
1259
  0x004280482d726a8b, 0x00128dd0f0a90f3b, 0x00004387c1c3fa3c},
1260
 {0x002ce76543cf5c3a, 0x00de6cee5ef58f0a, 0x00403e42fa561ca6, 0x00bc54d6f9cb9731,
1261
  0x007155f925fb4ff1, 0x004a9ce731b7b9bc, 0x00002609076bd7b2},
1262
 {1, 0, 0, 0, 0, 0, 0}},
1263
{{0x00e74c9182f0251d, 0x0039bf54bb111974, 0x00b9d2f2eec511d2, 0x0036b1594eb3a6a4,
1264
  0x00ac3bb82d9d564b, 0x00f9313f4615a100, 0x00006716a9a91b10},
1265
 {0x0046698116e2f15c, 0x00f34347067d3d33, 0x008de4ccfdebd002, 0x00e838c6b8e8c97b,
1266
  0x006faf0798def346, 0x007349794a57563c, 0x00002629e7e6ad84},
1267
 {1, 0, 0, 0, 0, 0, 0}},
1268
{{0x0075300e34fd163b, 0x0092e9db4e8d0ad3, 0x00254be9f625f760, 0x00512c518c72ae68,
1269
  0x009bfcf162bede5a, 0x00bf9341566ce311, 0x0000cd6175bd41cf},
1270
 {0x007dfe52af4ac70f, 0x0002159d2d5c4880, 0x00b504d16f0af8d0, 0x0014585e11f5e64c,
1271
  0x0089c6388e030967, 0x00ffb270cbfa5f71, 0x00009a15d92c3947},
1272
 {1, 0, 0, 0, 0, 0, 0}},
1273
{{0x0033fc1278dc4fe5, 0x00d53088c2caa043, 0x0085558827e2db66, 0x00c192bef387b736,
1274
  0x00df6405a2225f2c, 0x0075205aa90fd91a, 0x0000137e3f12349d},
1275
 {0x00ce5b115efcb07e, 0x00abc3308410deeb, 0x005dc6fc1de39904, 0x00907c1c496f36b4,
1276
  0x0008e6ad3926cbe1, 0x00110747b787928c, 0x0000021b9162eb7e},
1277
 {1, 0, 0, 0, 0, 0, 0}},
1278
{{0x008180042cfa26e1, 0x007b826a96254967, 0x0082473694d6b194, 0x007bd6880a45b589,
1279
  0x00c0a5097072d1a3, 0x0019186555e18b4e, 0x000020278190e5ca},
1280
 {0x00b4bef17de61ac0, 0x009535e3c38ed348, 0x002d4aa8e468ceab, 0x00ef40b431036ad3,
1281
  0x00defd52f4542857, 0x0086edbf98234266, 0x00002025b3a7814d},
1282
 {1, 0, 0, 0, 0, 0, 0}},
1283
{{0x00b238aa97b886be, 0x00ef3192d6dd3a32, 0x0079f9e01fd62df8, 0x00742e890daba6c5,
1284
  0x008e5289144408ce, 0x0073bbcc8e0171a5, 0x0000c4fd329d3b52},
1285
 {0x00c6f64a15ee23e7, 0x00dcfb7b171cad8b, 0x00039f6cbd805867, 0x00de024e428d4562,
1286
  0x00be6a594d7c64c5, 0x0078467b70dbcd64, 0x0000251f2ed7079b},
1287
 {1, 0, 0, 0, 0, 0, 0}},
1288
{{0x000e5cc25fc4b872, 0x005ebf10d31ef4e1, 0x0061e0ebd11e8256, 0x0076e026096f5a27,
1289
  0x0013e6fc44662e9a, 0x0042b00289d3597e, 0x000024f089170d88},
1290
 {0x001604d7e0effbe6, 0x0048d77cba64ec2c, 0x008166b16da19e36, 0x006b0d1a0f28c088,
1291
  0x000259fcd47754fd, 0x00cc643e4d725f9a, 0x00007b10f3c79c14},
1292
 {1, 0, 0, 0, 0, 0, 0}},
1293
{{0x00430155e3b908af, 0x00b801e4fec25226, 0x00b0d4bcfe806d26, 0x009fc4014eb13d37,
1294
  0x0066c94e44ec07e8, 0x00d16adc03874ba2, 0x000030c917a0d2a7},
1295
 {0x00edac9e21eb891c, 0x00ef0fb768102eff, 0x00c088cef272a5f3, 0x00cbf782134e2964,
1296
  0x0001044a7ba9a0e3, 0x00e363f5b194cf3c, 0x00009ce85249e372},
1297
 {1, 0, 0, 0, 0, 0, 0}},
1298
{{0x001dd492dda5a7eb, 0x008fd577be539fd1, 0x002ff4b25a5fc3f1, 0x0074a8a1b64df72f,
1299
  0x002ba3d8c204a76c, 0x009d5cff95c8235a, 0x0000e014b9406e0f},
1300
 {0x008c2e4dbfc98aba, 0x00f30bb89f1a1436, 0x00b46f7aea3e259c, 0x009224454ac02f54,
1301
  0x00906401f5645fa2, 0x003a1d1940eabc77, 0x00007c9351d680e6},
1302
 {1, 0, 0, 0, 0, 0, 0}},
1303
{{0x005a35d872ef967c, 0x0049f1b7884e1987, 0x0059d46d7e31f552, 0x00ceb4869d2d0fb6,
1304
  0x00e8e89eee56802a, 0x0049d806a774aaf2, 0x0000147e2af0ae24},
1305
 {0x005fd1bd852c6e5e, 0x00b674b7b3de6885, 0x003b9ea5eb9b6c08, 0x005c9f03babf3ef7,
1306
  0x00605337fecab3c7, 0x009a3f85b11bbcc8, 0x0000455470f330ec},
1307
 {1, 0, 0, 0, 0, 0, 0}},
1308
{{0x002197ff4d55498d, 0x00383e8916c2d8af, 0x00eb203f34d1c6d2, 0x0080367cbd11b542,
1309
  0x00769b3be864e4f5, 0x0081a8458521c7bb, 0x0000c531b34d3539},
1310
 {0x00e2a3d775fa2e13, 0x00534fc379573844, 0x00ff237d2a8db54a, 0x00d301b2335a8882,
1311
  0x000f75ea96103a80, 0x0018fecb3cdd96fa, 0x0000304bf61e94eb},
1312
 {1, 0, 0, 0, 0, 0, 0}},
1313
{{0x00b2afc332a73dbd, 0x0029a0d5bb007bc5, 0x002d628eb210f577, 0x009f59a36dd05f50,
1314
  0x006d339de4eca613, 0x00c75a71addc86bc, 0x000060384c5ea93c},
1315
 {0x00aa9641c32a30b4, 0x00cc73ae8cce565d, 0x00ec911a4df07f61, 0x00aa4b762ea4b264,
1316
  0x0096d395bb393629, 0x004efacfb7632fe0, 0x00006f252f46fa3f},
1317
 {1, 0, 0, 0, 0, 0, 0}},
1318
{{0x00567eec597c7af6, 0x0059ba6795204413, 0x00816d4e6f01196f, 0x004ae6b3eb57951d,
1319
  0x00420f5abdda2108, 0x003401d1f57ca9d9, 0x0000cf5837b0b67a},
1320
 {0x00eaa64b8aeeabf9, 0x00246ddf16bcb4de, 0x000e7e3c3aecd751, 0x0008449f04fed72e,
1321
  0x00307b67ccf09183, 0x0017108c3556b7b1, 0x0000229b2483b3bf},
1322
 {1, 0, 0, 0, 0, 0, 0}},
1323
{{0x00e7c491a7bb78a1, 0x00eafddd1d3049ab, 0x00352c05e2bc7c98, 0x003d6880c165fa5c,
1324
  0x00b6ac61cc11c97d, 0x00beeb54fcf90ce5, 0x0000dc1f0b455edc},
1325
 {0x002db2e7aee34d60, 0x0073b5f415a2d8c0, 0x00dd84e4193e9a0c, 0x00d02d873467c572,
1326
  0x0018baaeda60aee5, 0x0013fb11d697c61e, 0x000083aafcc3a973},
1327
 {1, 0, 0, 0, 0, 0, 0}}
1328
};
1329
1330
/*
1331
 * select_point selects the |idx|th point from a precomputation table and
1332
 * copies it to out.
1333
 *
1334
 * pre_comp below is of the size provided in |size|.
1335
 */
1336
static void select_point(const limb idx, unsigned int size,
1337
                         const felem pre_comp[][3], felem out[3])
1338
173k
{
1339
173k
    unsigned int i, j;
1340
173k
    limb *outlimbs = &out[0][0];
1341
1342
173k
    memset(out, 0, sizeof(*out) * 3);
1343
1344
3.00M
    for (i = 0; i < size; i++) {
1345
2.83M
        const limb *inlimbs = &pre_comp[i][0][0];
1346
2.83M
        limb mask = i ^ idx;
1347
1348
2.83M
        mask |= mask >> 4;
1349
2.83M
        mask |= mask >> 2;
1350
2.83M
        mask |= mask >> 1;
1351
2.83M
        mask &= 1;
1352
2.83M
        mask--;
1353
62.3M
        for (j = 0; j < NLIMBS * 3; j++)
1354
59.5M
            outlimbs[j] |= inlimbs[j] & mask;
1355
2.83M
    }
1356
173k
}
1357
1358
/* get_bit returns the |i|th bit in |in| */
1359
static char get_bit(const felem_bytearray in, int i)
1360
794k
{
1361
794k
    if (i < 0 || i >= 384)
1362
1.48k
        return 0;
1363
792k
    return (in[i >> 3] >> (i & 7)) & 1;
1364
794k
}
1365
1366
/*
1367
 * Interleaved point multiplication using precomputed point multiples: The
1368
 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1369
 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1370
 * generator, using certain (large) precomputed multiples in g_pre_comp.
1371
 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1372
 */
1373
static void batch_mul(felem x_out, felem y_out, felem z_out,
1374
                      const felem_bytearray scalars[],
1375
                      const unsigned int num_points, const u8 *g_scalar,
1376
                      const int mixed, const felem pre_comp[][17][3],
1377
                      const felem g_pre_comp[16][3])
1378
1.90k
{
1379
1.90k
    int i, skip;
1380
1.90k
    unsigned int num, gen_mul = (g_scalar != NULL);
1381
1.90k
    felem nq[3], tmp[4];
1382
1.90k
    limb bits;
1383
1.90k
    u8 sign, digit;
1384
1385
    /* set nq to the point at infinity */
1386
1.90k
    memset(nq, 0, sizeof(nq));
1387
1388
    /*
1389
     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1390
     * of the generator (last quarter of rounds) and additions of other
1391
     * points multiples (every 5th round).
1392
     */
1393
1.90k
    skip = 1;                   /* save two point operations in the first
1394
                                 * round */
1395
399k
    for (i = (num_points ? 380 : 98); i >= 0; --i) {
1396
        /* double */
1397
397k
        if (!skip)
1398
395k
            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1399
1400
        /* add multiples of the generator */
1401
397k
        if (gen_mul && (i <= 98)) {
1402
116k
            bits = get_bit(g_scalar, i + 285) << 3;
1403
116k
            if (i < 95) {
1404
111k
                bits |= get_bit(g_scalar, i + 190) << 2;
1405
111k
                bits |= get_bit(g_scalar, i + 95) << 1;
1406
111k
                bits |= get_bit(g_scalar, i);
1407
111k
            }
1408
            /* select the point to add, in constant time */
1409
116k
            select_point(bits, 16, g_pre_comp, tmp);
1410
116k
            if (!skip) {
1411
                /* The 1 argument below is for "mixed" */
1412
115k
                point_add(nq[0],  nq[1],  nq[2],
1413
115k
                          nq[0],  nq[1],  nq[2], 1,
1414
115k
                          tmp[0], tmp[1], tmp[2]);
1415
115k
            } else {
1416
1.16k
                memcpy(nq, tmp, 3 * sizeof(felem));
1417
1.16k
                skip = 0;
1418
1.16k
            }
1419
116k
        }
1420
1421
        /* do other additions every 5 doublings */
1422
397k
        if (num_points && (i % 5 == 0)) {
1423
            /* loop over all scalars */
1424
114k
            for (num = 0; num < num_points; ++num) {
1425
57.0k
                bits = get_bit(scalars[num], i + 4) << 5;
1426
57.0k
                bits |= get_bit(scalars[num], i + 3) << 4;
1427
57.0k
                bits |= get_bit(scalars[num], i + 2) << 3;
1428
57.0k
                bits |= get_bit(scalars[num], i + 1) << 2;
1429
57.0k
                bits |= get_bit(scalars[num], i) << 1;
1430
57.0k
                bits |= get_bit(scalars[num], i - 1);
1431
57.0k
                ossl_ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1432
1433
                /*
1434
                 * select the point to add or subtract, in constant time
1435
                 */
1436
57.0k
                select_point(digit, 17, pre_comp[num], tmp);
1437
57.0k
                felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1438
                                            * point */
1439
57.0k
                copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1440
1441
57.0k
                if (!skip) {
1442
56.3k
                    point_add(nq[0],  nq[1],  nq[2],
1443
56.3k
                              nq[0],  nq[1],  nq[2], mixed,
1444
56.3k
                              tmp[0], tmp[1], tmp[2]);
1445
56.3k
                } else {
1446
741
                    memcpy(nq, tmp, 3 * sizeof(felem));
1447
741
                    skip = 0;
1448
741
                }
1449
57.0k
            }
1450
57.0k
        }
1451
397k
    }
1452
1.90k
    felem_assign(x_out, nq[0]);
1453
1.90k
    felem_assign(y_out, nq[1]);
1454
1.90k
    felem_assign(z_out, nq[2]);
1455
1.90k
}
1456
1457
/* Precomputation for the group generator. */
1458
struct nistp384_pre_comp_st {
1459
    felem g_pre_comp[16][3];
1460
    CRYPTO_REF_COUNT references;
1461
};
1462
1463
const EC_METHOD *ossl_ec_GFp_nistp384_method(void)
1464
6.74k
{
1465
6.74k
    static const EC_METHOD ret = {
1466
6.74k
        EC_FLAGS_DEFAULT_OCT,
1467
6.74k
        NID_X9_62_prime_field,
1468
6.74k
        ossl_ec_GFp_nistp384_group_init,
1469
6.74k
        ossl_ec_GFp_simple_group_finish,
1470
6.74k
        ossl_ec_GFp_simple_group_clear_finish,
1471
6.74k
        ossl_ec_GFp_nist_group_copy,
1472
6.74k
        ossl_ec_GFp_nistp384_group_set_curve,
1473
6.74k
        ossl_ec_GFp_simple_group_get_curve,
1474
6.74k
        ossl_ec_GFp_simple_group_get_degree,
1475
6.74k
        ossl_ec_group_simple_order_bits,
1476
6.74k
        ossl_ec_GFp_simple_group_check_discriminant,
1477
6.74k
        ossl_ec_GFp_simple_point_init,
1478
6.74k
        ossl_ec_GFp_simple_point_finish,
1479
6.74k
        ossl_ec_GFp_simple_point_clear_finish,
1480
6.74k
        ossl_ec_GFp_simple_point_copy,
1481
6.74k
        ossl_ec_GFp_simple_point_set_to_infinity,
1482
6.74k
        ossl_ec_GFp_simple_point_set_affine_coordinates,
1483
6.74k
        ossl_ec_GFp_nistp384_point_get_affine_coordinates,
1484
6.74k
        0, /* point_set_compressed_coordinates */
1485
6.74k
        0, /* point2oct */
1486
6.74k
        0, /* oct2point */
1487
6.74k
        ossl_ec_GFp_simple_add,
1488
6.74k
        ossl_ec_GFp_simple_dbl,
1489
6.74k
        ossl_ec_GFp_simple_invert,
1490
6.74k
        ossl_ec_GFp_simple_is_at_infinity,
1491
6.74k
        ossl_ec_GFp_simple_is_on_curve,
1492
6.74k
        ossl_ec_GFp_simple_cmp,
1493
6.74k
        ossl_ec_GFp_simple_make_affine,
1494
6.74k
        ossl_ec_GFp_simple_points_make_affine,
1495
6.74k
        ossl_ec_GFp_nistp384_points_mul,
1496
6.74k
        ossl_ec_GFp_nistp384_precompute_mult,
1497
6.74k
        ossl_ec_GFp_nistp384_have_precompute_mult,
1498
6.74k
        ossl_ec_GFp_nist_field_mul,
1499
6.74k
        ossl_ec_GFp_nist_field_sqr,
1500
6.74k
        0, /* field_div */
1501
6.74k
        ossl_ec_GFp_simple_field_inv,
1502
6.74k
        0, /* field_encode */
1503
6.74k
        0, /* field_decode */
1504
6.74k
        0, /* field_set_to_one */
1505
6.74k
        ossl_ec_key_simple_priv2oct,
1506
6.74k
        ossl_ec_key_simple_oct2priv,
1507
6.74k
        0, /* set private */
1508
6.74k
        ossl_ec_key_simple_generate_key,
1509
6.74k
        ossl_ec_key_simple_check_key,
1510
6.74k
        ossl_ec_key_simple_generate_public_key,
1511
6.74k
        0, /* keycopy */
1512
6.74k
        0, /* keyfinish */
1513
6.74k
        ossl_ecdh_simple_compute_key,
1514
6.74k
        ossl_ecdsa_simple_sign_setup,
1515
6.74k
        ossl_ecdsa_simple_sign_sig,
1516
6.74k
        ossl_ecdsa_simple_verify_sig,
1517
6.74k
        0, /* field_inverse_mod_ord */
1518
6.74k
        0, /* blind_coordinates */
1519
6.74k
        0, /* ladder_pre */
1520
6.74k
        0, /* ladder_step */
1521
6.74k
        0  /* ladder_post */
1522
6.74k
    };
1523
1524
6.74k
    return &ret;
1525
6.74k
}
1526
1527
/******************************************************************************/
1528
/*
1529
 * FUNCTIONS TO MANAGE PRECOMPUTATION
1530
 */
1531
1532
static NISTP384_PRE_COMP *nistp384_pre_comp_new(void)
1533
0
{
1534
0
    NISTP384_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1535
1536
0
    if (ret == NULL)
1537
0
        return ret;
1538
1539
0
    if (!CRYPTO_NEW_REF(&ret->references, 1)) {
1540
0
        OPENSSL_free(ret);
1541
0
        return NULL;
1542
0
    }
1543
0
    return ret;
1544
0
}
1545
1546
NISTP384_PRE_COMP *ossl_ec_nistp384_pre_comp_dup(NISTP384_PRE_COMP *p)
1547
0
{
1548
0
    int i;
1549
1550
0
    if (p != NULL)
1551
0
        CRYPTO_UP_REF(&p->references, &i);
1552
0
    return p;
1553
0
}
1554
1555
void ossl_ec_nistp384_pre_comp_free(NISTP384_PRE_COMP *p)
1556
0
{
1557
0
    int i;
1558
1559
0
    if (p == NULL)
1560
0
        return;
1561
1562
0
    CRYPTO_DOWN_REF(&p->references, &i);
1563
0
    REF_PRINT_COUNT("ossl_ec_nistp384", p);
1564
0
    if (i > 0)
1565
0
        return;
1566
0
    REF_ASSERT_ISNT(i < 0);
1567
1568
0
    CRYPTO_FREE_REF(&p->references);
1569
0
    OPENSSL_free(p);
1570
0
}
1571
1572
/******************************************************************************/
1573
/*
1574
 * OPENSSL EC_METHOD FUNCTIONS
1575
 */
1576
1577
int ossl_ec_GFp_nistp384_group_init(EC_GROUP *group)
1578
13.7k
{
1579
13.7k
    int ret;
1580
1581
13.7k
    ret = ossl_ec_GFp_simple_group_init(group);
1582
13.7k
    group->a_is_minus3 = 1;
1583
13.7k
    return ret;
1584
13.7k
}
1585
1586
int ossl_ec_GFp_nistp384_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1587
                                         const BIGNUM *a, const BIGNUM *b,
1588
                                         BN_CTX *ctx)
1589
6.74k
{
1590
6.74k
    int ret = 0;
1591
6.74k
    BIGNUM *curve_p, *curve_a, *curve_b;
1592
6.74k
#ifndef FIPS_MODULE
1593
6.74k
    BN_CTX *new_ctx = NULL;
1594
1595
6.74k
    if (ctx == NULL)
1596
0
        ctx = new_ctx = BN_CTX_new();
1597
6.74k
#endif
1598
6.74k
    if (ctx == NULL)
1599
0
        return 0;
1600
1601
6.74k
    BN_CTX_start(ctx);
1602
6.74k
    curve_p = BN_CTX_get(ctx);
1603
6.74k
    curve_a = BN_CTX_get(ctx);
1604
6.74k
    curve_b = BN_CTX_get(ctx);
1605
6.74k
    if (curve_b == NULL)
1606
0
        goto err;
1607
6.74k
    BN_bin2bn(nistp384_curve_params[0], sizeof(felem_bytearray), curve_p);
1608
6.74k
    BN_bin2bn(nistp384_curve_params[1], sizeof(felem_bytearray), curve_a);
1609
6.74k
    BN_bin2bn(nistp384_curve_params[2], sizeof(felem_bytearray), curve_b);
1610
6.74k
    if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1611
0
        ERR_raise(ERR_LIB_EC, EC_R_WRONG_CURVE_PARAMETERS);
1612
0
        goto err;
1613
0
    }
1614
6.74k
    group->field_mod_func = BN_nist_mod_384;
1615
6.74k
    ret = ossl_ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1616
6.74k
 err:
1617
6.74k
    BN_CTX_end(ctx);
1618
6.74k
#ifndef FIPS_MODULE
1619
6.74k
    BN_CTX_free(new_ctx);
1620
6.74k
#endif
1621
6.74k
    return ret;
1622
6.74k
}
1623
1624
/*
1625
 * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1626
 * (X/Z^2, Y/Z^3)
1627
 */
1628
int ossl_ec_GFp_nistp384_point_get_affine_coordinates(const EC_GROUP *group,
1629
                                                      const EC_POINT *point,
1630
                                                      BIGNUM *x, BIGNUM *y,
1631
                                                      BN_CTX *ctx)
1632
2.24k
{
1633
2.24k
    felem z1, z2, x_in, y_in, x_out, y_out;
1634
2.24k
    widefelem tmp;
1635
1636
2.24k
    if (EC_POINT_is_at_infinity(group, point)) {
1637
0
        ERR_raise(ERR_LIB_EC, EC_R_POINT_AT_INFINITY);
1638
0
        return 0;
1639
0
    }
1640
2.24k
    if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1641
2.24k
        (!BN_to_felem(z1, point->Z)))
1642
0
        return 0;
1643
2.24k
    felem_inv(z2, z1);
1644
2.24k
    felem_square(tmp, z2);
1645
2.24k
    felem_reduce(z1, tmp);
1646
2.24k
    felem_mul(tmp, x_in, z1);
1647
2.24k
    felem_reduce(x_in, tmp);
1648
2.24k
    felem_contract(x_out, x_in);
1649
2.24k
    if (x != NULL) {
1650
2.24k
        if (!felem_to_BN(x, x_out)) {
1651
0
            ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1652
0
            return 0;
1653
0
        }
1654
2.24k
    }
1655
2.24k
    felem_mul(tmp, z1, z2);
1656
2.24k
    felem_reduce(z1, tmp);
1657
2.24k
    felem_mul(tmp, y_in, z1);
1658
2.24k
    felem_reduce(y_in, tmp);
1659
2.24k
    felem_contract(y_out, y_in);
1660
2.24k
    if (y != NULL) {
1661
1.83k
        if (!felem_to_BN(y, y_out)) {
1662
0
            ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1663
0
            return 0;
1664
0
        }
1665
1.83k
    }
1666
2.24k
    return 1;
1667
2.24k
}
1668
1669
/* points below is of size |num|, and tmp_felems is of size |num+1/ */
1670
static void make_points_affine(size_t num, felem points[][3],
1671
                               felem tmp_felems[])
1672
0
{
1673
    /*
1674
     * Runs in constant time, unless an input is the point at infinity (which
1675
     * normally shouldn't happen).
1676
     */
1677
0
    ossl_ec_GFp_nistp_points_make_affine_internal(num,
1678
0
                                                  points,
1679
0
                                                  sizeof(felem),
1680
0
                                                  tmp_felems,
1681
0
                                                  (void (*)(void *))felem_one,
1682
0
                                                  felem_is_zero_int,
1683
0
                                                  (void (*)(void *, const void *))
1684
0
                                                  felem_assign,
1685
0
                                                  (void (*)(void *, const void *))
1686
0
                                                  felem_square_reduce,
1687
0
                                                  (void (*)(void *, const void *, const void*))
1688
0
                                                  felem_mul_reduce,
1689
0
                                                  (void (*)(void *, const void *))
1690
0
                                                  felem_inv,
1691
0
                                                  (void (*)(void *, const void *))
1692
0
                                                  felem_contract);
1693
0
}
1694
1695
/*
1696
 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1697
 * values Result is stored in r (r can equal one of the inputs).
1698
 */
1699
int ossl_ec_GFp_nistp384_points_mul(const EC_GROUP *group, EC_POINT *r,
1700
                                    const BIGNUM *scalar, size_t num,
1701
                                    const EC_POINT *points[],
1702
                                    const BIGNUM *scalars[], BN_CTX *ctx)
1703
1.90k
{
1704
1.90k
    int ret = 0;
1705
1.90k
    int j;
1706
1.90k
    int mixed = 0;
1707
1.90k
    BIGNUM *x, *y, *z, *tmp_scalar;
1708
1.90k
    felem_bytearray g_secret;
1709
1.90k
    felem_bytearray *secrets = NULL;
1710
1.90k
    felem (*pre_comp)[17][3] = NULL;
1711
1.90k
    felem *tmp_felems = NULL;
1712
1.90k
    unsigned int i;
1713
1.90k
    int num_bytes;
1714
1.90k
    int have_pre_comp = 0;
1715
1.90k
    size_t num_points = num;
1716
1.90k
    felem x_in, y_in, z_in, x_out, y_out, z_out;
1717
1.90k
    NISTP384_PRE_COMP *pre = NULL;
1718
1.90k
    felem(*g_pre_comp)[3] = NULL;
1719
1.90k
    EC_POINT *generator = NULL;
1720
1.90k
    const EC_POINT *p = NULL;
1721
1.90k
    const BIGNUM *p_scalar = NULL;
1722
1723
1.90k
    BN_CTX_start(ctx);
1724
1.90k
    x = BN_CTX_get(ctx);
1725
1.90k
    y = BN_CTX_get(ctx);
1726
1.90k
    z = BN_CTX_get(ctx);
1727
1.90k
    tmp_scalar = BN_CTX_get(ctx);
1728
1.90k
    if (tmp_scalar == NULL)
1729
0
        goto err;
1730
1731
1.90k
    if (scalar != NULL) {
1732
1.17k
        pre = group->pre_comp.nistp384;
1733
1.17k
        if (pre)
1734
            /* we have precomputation, try to use it */
1735
0
            g_pre_comp = &pre->g_pre_comp[0];
1736
1.17k
        else
1737
            /* try to use the standard precomputation */
1738
1.17k
            g_pre_comp = (felem(*)[3]) gmul;
1739
1.17k
        generator = EC_POINT_new(group);
1740
1.17k
        if (generator == NULL)
1741
0
            goto err;
1742
        /* get the generator from precomputation */
1743
1.17k
        if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1744
1.17k
            !felem_to_BN(y, g_pre_comp[1][1]) ||
1745
1.17k
            !felem_to_BN(z, g_pre_comp[1][2])) {
1746
0
            ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1747
0
            goto err;
1748
0
        }
1749
1.17k
        if (!ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group,
1750
1.17k
                                                                generator,
1751
1.17k
                                                                x, y, z, ctx))
1752
0
            goto err;
1753
1.17k
        if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1754
            /* precomputation matches generator */
1755
1.17k
            have_pre_comp = 1;
1756
0
        else
1757
            /*
1758
             * we don't have valid precomputation: treat the generator as a
1759
             * random point
1760
             */
1761
0
            num_points++;
1762
1.17k
    }
1763
1764
1.90k
    if (num_points > 0) {
1765
741
        if (num_points >= 2) {
1766
            /*
1767
             * unless we precompute multiples for just one point, converting
1768
             * those into affine form is time well spent
1769
             */
1770
0
            mixed = 1;
1771
0
        }
1772
741
        secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1773
741
        pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1774
741
        if (mixed)
1775
0
            tmp_felems =
1776
0
                OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1));
1777
741
        if ((secrets == NULL) || (pre_comp == NULL)
1778
741
            || (mixed && (tmp_felems == NULL)))
1779
0
            goto err;
1780
1781
        /*
1782
         * we treat NULL scalars as 0, and NULL points as points at infinity,
1783
         * i.e., they contribute nothing to the linear combination
1784
         */
1785
1.48k
        for (i = 0; i < num_points; ++i) {
1786
741
            if (i == num) {
1787
                /*
1788
                 * we didn't have a valid precomputation, so we pick the
1789
                 * generator
1790
                 */
1791
0
                p = EC_GROUP_get0_generator(group);
1792
0
                p_scalar = scalar;
1793
741
            } else {
1794
                /* the i^th point */
1795
741
                p = points[i];
1796
741
                p_scalar = scalars[i];
1797
741
            }
1798
741
            if (p_scalar != NULL && p != NULL) {
1799
                /* reduce scalar to 0 <= scalar < 2^384 */
1800
741
                if ((BN_num_bits(p_scalar) > 384)
1801
741
                    || (BN_is_negative(p_scalar))) {
1802
                    /*
1803
                     * this is an unusual input, and we don't guarantee
1804
                     * constant-timeness
1805
                     */
1806
0
                    if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1807
0
                        ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1808
0
                        goto err;
1809
0
                    }
1810
0
                    num_bytes = BN_bn2lebinpad(tmp_scalar,
1811
0
                                               secrets[i], sizeof(secrets[i]));
1812
741
                } else {
1813
741
                    num_bytes = BN_bn2lebinpad(p_scalar,
1814
741
                                               secrets[i], sizeof(secrets[i]));
1815
741
                }
1816
741
                if (num_bytes < 0) {
1817
0
                    ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1818
0
                    goto err;
1819
0
                }
1820
                /* precompute multiples */
1821
741
                if ((!BN_to_felem(x_out, p->X)) ||
1822
741
                    (!BN_to_felem(y_out, p->Y)) ||
1823
741
                    (!BN_to_felem(z_out, p->Z)))
1824
0
                    goto err;
1825
741
                memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1826
741
                memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1827
741
                memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1828
11.8k
                for (j = 2; j <= 16; ++j) {
1829
11.1k
                    if (j & 1) {
1830
5.18k
                        point_add(pre_comp[i][j][0],     pre_comp[i][j][1],     pre_comp[i][j][2],
1831
5.18k
                                  pre_comp[i][1][0],     pre_comp[i][1][1],     pre_comp[i][1][2], 0,
1832
5.18k
                                  pre_comp[i][j - 1][0], pre_comp[i][j - 1][1], pre_comp[i][j - 1][2]);
1833
5.92k
                    } else {
1834
5.92k
                        point_double(pre_comp[i][j][0],     pre_comp[i][j][1],     pre_comp[i][j][2],
1835
5.92k
                                     pre_comp[i][j / 2][0], pre_comp[i][j / 2][1], pre_comp[i][j / 2][2]);
1836
5.92k
                    }
1837
11.1k
                }
1838
741
            }
1839
741
        }
1840
741
        if (mixed)
1841
0
            make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1842
741
    }
1843
1844
    /* the scalar for the generator */
1845
1.90k
    if (scalar != NULL && have_pre_comp) {
1846
1.17k
        memset(g_secret, 0, sizeof(g_secret));
1847
        /* reduce scalar to 0 <= scalar < 2^384 */
1848
1.17k
        if ((BN_num_bits(scalar) > 384) || (BN_is_negative(scalar))) {
1849
            /*
1850
             * this is an unusual input, and we don't guarantee
1851
             * constant-timeness
1852
             */
1853
9
            if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
1854
0
                ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1855
0
                goto err;
1856
0
            }
1857
9
            num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
1858
1.16k
        } else {
1859
1.16k
            num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
1860
1.16k
        }
1861
        /* do the multiplication with generator precomputation */
1862
1.17k
        batch_mul(x_out, y_out, z_out,
1863
1.17k
                  (const felem_bytearray(*))secrets, num_points,
1864
1.17k
                  g_secret,
1865
1.17k
                  mixed, (const felem(*)[17][3])pre_comp,
1866
1.17k
                  (const felem(*)[3])g_pre_comp);
1867
1.17k
    } else {
1868
        /* do the multiplication without generator precomputation */
1869
727
        batch_mul(x_out, y_out, z_out,
1870
727
                  (const felem_bytearray(*))secrets, num_points,
1871
727
                  NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1872
727
    }
1873
    /* reduce the output to its unique minimal representation */
1874
1.90k
    felem_contract(x_in, x_out);
1875
1.90k
    felem_contract(y_in, y_out);
1876
1.90k
    felem_contract(z_in, z_out);
1877
1.90k
    if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1878
1.90k
        (!felem_to_BN(z, z_in))) {
1879
0
        ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1880
0
        goto err;
1881
0
    }
1882
1.90k
    ret = ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group, r, x, y, z,
1883
1.90k
                                                             ctx);
1884
1885
1.90k
 err:
1886
1.90k
    BN_CTX_end(ctx);
1887
1.90k
    EC_POINT_free(generator);
1888
1.90k
    OPENSSL_free(secrets);
1889
1.90k
    OPENSSL_free(pre_comp);
1890
1.90k
    OPENSSL_free(tmp_felems);
1891
1.90k
    return ret;
1892
1.90k
}
1893
1894
int ossl_ec_GFp_nistp384_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
1895
0
{
1896
0
    int ret = 0;
1897
0
    NISTP384_PRE_COMP *pre = NULL;
1898
0
    int i, j;
1899
0
    BIGNUM *x, *y;
1900
0
    EC_POINT *generator = NULL;
1901
0
    felem tmp_felems[16];
1902
0
#ifndef FIPS_MODULE
1903
0
    BN_CTX *new_ctx = NULL;
1904
0
#endif
1905
1906
    /* throw away old precomputation */
1907
0
    EC_pre_comp_free(group);
1908
1909
0
#ifndef FIPS_MODULE
1910
0
    if (ctx == NULL)
1911
0
        ctx = new_ctx = BN_CTX_new();
1912
0
#endif
1913
0
    if (ctx == NULL)
1914
0
        return 0;
1915
1916
0
    BN_CTX_start(ctx);
1917
0
    x = BN_CTX_get(ctx);
1918
0
    y = BN_CTX_get(ctx);
1919
0
    if (y == NULL)
1920
0
        goto err;
1921
    /* get the generator */
1922
0
    if (group->generator == NULL)
1923
0
        goto err;
1924
0
    generator = EC_POINT_new(group);
1925
0
    if (generator == NULL)
1926
0
        goto err;
1927
0
    BN_bin2bn(nistp384_curve_params[3], sizeof(felem_bytearray), x);
1928
0
    BN_bin2bn(nistp384_curve_params[4], sizeof(felem_bytearray), y);
1929
0
    if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
1930
0
        goto err;
1931
0
    if ((pre = nistp384_pre_comp_new()) == NULL)
1932
0
        goto err;
1933
    /*
1934
     * if the generator is the standard one, use built-in precomputation
1935
     */
1936
0
    if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
1937
0
        memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
1938
0
        goto done;
1939
0
    }
1940
0
    if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) ||
1941
0
        (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) ||
1942
0
        (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
1943
0
        goto err;
1944
    /* compute 2^95*G, 2^190*G, 2^285*G */
1945
0
    for (i = 1; i <= 4; i <<= 1) {
1946
0
        point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2],
1947
0
                     pre->g_pre_comp[i][0],  pre->g_pre_comp[i][1],    pre->g_pre_comp[i][2]);
1948
0
        for (j = 0; j < 94; ++j) {
1949
0
            point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2],
1950
0
                         pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2]);
1951
0
        }
1952
0
    }
1953
    /* g_pre_comp[0] is the point at infinity */
1954
0
    memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
1955
    /* the remaining multiples */
1956
    /* 2^95*G + 2^190*G */
1957
0
    point_add(pre->g_pre_comp[6][0],  pre->g_pre_comp[6][1],  pre->g_pre_comp[6][2],
1958
0
              pre->g_pre_comp[4][0],  pre->g_pre_comp[4][1],  pre->g_pre_comp[4][2], 0,
1959
0
              pre->g_pre_comp[2][0],  pre->g_pre_comp[2][1],  pre->g_pre_comp[2][2]);
1960
    /* 2^95*G + 2^285*G */
1961
0
    point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1], pre->g_pre_comp[10][2],
1962
0
              pre->g_pre_comp[8][0],  pre->g_pre_comp[8][1],  pre->g_pre_comp[8][2], 0,
1963
0
              pre->g_pre_comp[2][0],  pre->g_pre_comp[2][1],  pre->g_pre_comp[2][2]);
1964
    /* 2^190*G + 2^285*G */
1965
0
    point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
1966
0
              pre->g_pre_comp[8][0],  pre->g_pre_comp[8][1],  pre->g_pre_comp[8][2], 0,
1967
0
              pre->g_pre_comp[4][0],  pre->g_pre_comp[4][1],  pre->g_pre_comp[4][2]);
1968
    /* 2^95*G + 2^190*G + 2^285*G */
1969
0
    point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1], pre->g_pre_comp[14][2],
1970
0
              pre->g_pre_comp[12][0], pre->g_pre_comp[12][1], pre->g_pre_comp[12][2], 0,
1971
0
              pre->g_pre_comp[2][0],  pre->g_pre_comp[2][1],  pre->g_pre_comp[2][2]);
1972
0
    for (i = 1; i < 8; ++i) {
1973
        /* odd multiples: add G */
1974
0
        point_add(pre->g_pre_comp[2 * i + 1][0], pre->g_pre_comp[2 * i + 1][1], pre->g_pre_comp[2 * i + 1][2],
1975
0
                  pre->g_pre_comp[2 * i][0],     pre->g_pre_comp[2 * i][1],     pre->g_pre_comp[2 * i][2], 0,
1976
0
                  pre->g_pre_comp[1][0],         pre->g_pre_comp[1][1],         pre->g_pre_comp[1][2]);
1977
0
    }
1978
0
    make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
1979
1980
0
 done:
1981
0
    SETPRECOMP(group, nistp384, pre);
1982
0
    ret = 1;
1983
0
    pre = NULL;
1984
0
 err:
1985
0
    BN_CTX_end(ctx);
1986
0
    EC_POINT_free(generator);
1987
0
#ifndef FIPS_MODULE
1988
0
    BN_CTX_free(new_ctx);
1989
0
#endif
1990
0
    ossl_ec_nistp384_pre_comp_free(pre);
1991
0
    return ret;
1992
0
}
1993
1994
int ossl_ec_GFp_nistp384_have_precompute_mult(const EC_GROUP *group)
1995
0
{
1996
0
    return HAVEPRECOMP(group, nistp384);
1997
0
}