Coverage Report

Created: 2024-07-24 06:31

/src/openssl/crypto/ec/ecp_nistp384.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
3
 *
4
 * Licensed under the Apache License 2.0 (the "License").  You may not use
5
 * this file except in compliance with the License.  You can obtain a copy
6
 * in the file LICENSE in the source distribution or at
7
 * https://www.openssl.org/source/license.html
8
 */
9
10
/* Copyright 2023 IBM Corp.
11
 *
12
 * Licensed under the Apache License, Version 2.0 (the "License");
13
 *
14
 * you may not use this file except in compliance with the License.
15
 * You may obtain a copy of the License at
16
 *
17
 *     http://www.apache.org/licenses/LICENSE-2.0
18
 *
19
 *  Unless required by applicable law or agreed to in writing, software
20
 *  distributed under the License is distributed on an "AS IS" BASIS,
21
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22
 *  See the License for the specific language governing permissions and
23
 *  limitations under the License.
24
 */
25
26
/*
27
 * Designed for 56-bit limbs by Rohan McLure <rohan.mclure@linux.ibm.com>.
28
 * The layout is based on that of ecp_nistp{224,521}.c, allowing even for asm
29
 * acceleration of felem_{square,mul} as supported in these files.
30
 */
31
32
#include <openssl/e_os2.h>
33
34
#include <string.h>
35
#include <openssl/err.h>
36
#include "ec_local.h"
37
38
#include "internal/numbers.h"
39
40
#ifndef INT128_MAX
41
# error "Your compiler doesn't appear to support 128-bit integer types"
42
#endif
43
44
typedef uint8_t u8;
45
typedef uint64_t u64;
46
47
/*
48
 * The underlying field. P384 operates over GF(2^384-2^128-2^96+2^32-1). We
49
 * can serialize an element of this field into 48 bytes. We call this an
50
 * felem_bytearray.
51
 */
52
53
typedef u8 felem_bytearray[48];
54
55
/*
56
 * These are the parameters of P384, taken from FIPS 186-3, section D.1.2.4.
57
 * These values are big-endian.
58
 */
59
static const felem_bytearray nistp384_curve_params[5] = {
60
  {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* p */
61
   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
62
   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
63
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF},
64
  {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* a = -3 */
65
   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
66
   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
67
   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFC},
68
  {0xB3, 0x31, 0x2F, 0xA7, 0xE2, 0x3E, 0xE7, 0xE4, 0x98, 0x8E, 0x05, 0x6B, /* b */
69
   0xE3, 0xF8, 0x2D, 0x19, 0x18, 0x1D, 0x9C, 0x6E, 0xFE, 0x81, 0x41, 0x12,
70
   0x03, 0x14, 0x08, 0x8F, 0x50, 0x13, 0x87, 0x5A, 0xC6, 0x56, 0x39, 0x8D,
71
   0x8A, 0x2E, 0xD1, 0x9D, 0x2A, 0x85, 0xC8, 0xED, 0xD3, 0xEC, 0x2A, 0xEF},
72
  {0xAA, 0x87, 0xCA, 0x22, 0xBE, 0x8B, 0x05, 0x37, 0x8E, 0xB1, 0xC7, 0x1E, /* x */
73
   0xF3, 0x20, 0xAD, 0x74, 0x6E, 0x1D, 0x3B, 0x62, 0x8B, 0xA7, 0x9B, 0x98,
74
   0x59, 0xF7, 0x41, 0xE0, 0x82, 0x54, 0x2A, 0x38, 0x55, 0x02, 0xF2, 0x5D,
75
   0xBF, 0x55, 0x29, 0x6C, 0x3A, 0x54, 0x5E, 0x38, 0x72, 0x76, 0x0A, 0xB7},
76
  {0x36, 0x17, 0xDE, 0x4A, 0x96, 0x26, 0x2C, 0x6F, 0x5D, 0x9E, 0x98, 0xBF, /* y */
77
   0x92, 0x92, 0xDC, 0x29, 0xF8, 0xF4, 0x1D, 0xBD, 0x28, 0x9A, 0x14, 0x7C,
78
   0xE9, 0xDA, 0x31, 0x13, 0xB5, 0xF0, 0xB8, 0xC0, 0x0A, 0x60, 0xB1, 0xCE,
79
   0x1D, 0x7E, 0x81, 0x9D, 0x7A, 0x43, 0x1D, 0x7C, 0x90, 0xEA, 0x0E, 0x5F},
80
};
81
82
/*-
83
 * The representation of field elements.
84
 * ------------------------------------
85
 *
86
 * We represent field elements with seven values. These values are either 64 or
87
 * 128 bits and the field element represented is:
88
 *   v[0]*2^0 + v[1]*2^56 + v[2]*2^112 + ... + v[6]*2^336  (mod p)
89
 * Each of the seven values is called a 'limb'. Since the limbs are spaced only
90
 * 56 bits apart, but are greater than 56 bits in length, the most significant
91
 * bits of each limb overlap with the least significant bits of the next
92
 *
93
 * This representation is considered to be 'redundant' in the sense that
94
 * intermediate values can each contain more than a 56-bit value in each limb.
95
 * Reduction causes all but the final limb to be reduced to contain a value less
96
 * than 2^56, with the final value represented allowed to be larger than 2^384,
97
 * inasmuch as we can be sure that arithmetic overflow remains impossible. The
98
 * reduced value must of course be congruent to the unreduced value.
99
 *
100
 * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
101
 * 'widefelem', featuring enough bits to store the result of a multiplication
102
 * and even some further arithmetic without need for immediate reduction.
103
 */
104
105
22.0M
#define NLIMBS 7
106
107
typedef uint64_t limb;
108
typedef uint128_t widelimb;
109
typedef limb limb_aX __attribute((__aligned__(1)));
110
typedef limb felem[NLIMBS];
111
typedef widelimb widefelem[2*NLIMBS-1];
112
113
static const limb bottom56bits = 0xffffffffffffff;
114
115
/* Helper functions (de)serialising reduced field elements in little endian */
116
static void bin48_to_felem(felem out, const u8 in[48])
117
792
{
118
792
    memset(out, 0, 56);
119
792
    out[0] = (*((limb *) & in[0])) & bottom56bits;
120
792
    out[1] = (*((limb_aX *) & in[7])) & bottom56bits;
121
792
    out[2] = (*((limb_aX *) & in[14])) & bottom56bits;
122
792
    out[3] = (*((limb_aX *) & in[21])) & bottom56bits;
123
792
    out[4] = (*((limb_aX *) & in[28])) & bottom56bits;
124
792
    out[5] = (*((limb_aX *) & in[35])) & bottom56bits;
125
792
    memmove(&out[6], &in[42], 6);
126
792
}
127
128
static void felem_to_bin48(u8 out[48], const felem in)
129
2.11k
{
130
2.11k
    memset(out, 0, 48);
131
2.11k
    (*((limb *) & out[0]))     |= (in[0] & bottom56bits);
132
2.11k
    (*((limb_aX *) & out[7]))  |= (in[1] & bottom56bits);
133
2.11k
    (*((limb_aX *) & out[14])) |= (in[2] & bottom56bits);
134
2.11k
    (*((limb_aX *) & out[21])) |= (in[3] & bottom56bits);
135
2.11k
    (*((limb_aX *) & out[28])) |= (in[4] & bottom56bits);
136
2.11k
    (*((limb_aX *) & out[35])) |= (in[5] & bottom56bits);
137
2.11k
    memmove(&out[42], &in[6], 6);
138
2.11k
}
139
140
/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
141
static int BN_to_felem(felem out, const BIGNUM *bn)
142
792
{
143
792
    felem_bytearray b_out;
144
792
    int num_bytes;
145
146
792
    if (BN_is_negative(bn)) {
147
0
        ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
148
0
        return 0;
149
0
    }
150
792
    num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
151
792
    if (num_bytes < 0) {
152
0
        ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
153
0
        return 0;
154
0
    }
155
792
    bin48_to_felem(out, b_out);
156
792
    return 1;
157
792
}
158
159
/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
160
static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
161
2.11k
{
162
2.11k
    felem_bytearray b_out;
163
164
2.11k
    felem_to_bin48(b_out, in);
165
2.11k
    return BN_lebin2bn(b_out, sizeof(b_out), out);
166
2.11k
}
167
168
/*-
169
 * Field operations
170
 * ----------------
171
 */
172
173
static void felem_one(felem out)
174
0
{
175
0
    out[0] = 1;
176
0
    memset(&out[1], 0, sizeof(limb) * (NLIMBS-1));
177
0
}
178
179
static void felem_assign(felem out, const felem in)
180
287k
{
181
287k
    memcpy(out, in, sizeof(felem));
182
287k
}
183
184
/* felem_sum64 sets out = out + in. */
185
static void felem_sum64(felem out, const felem in)
186
77.6k
{
187
77.6k
    unsigned int i;
188
189
620k
    for (i = 0; i < NLIMBS; i++)
190
543k
        out[i] += in[i];
191
77.6k
}
192
193
/* felem_scalar sets out = in * scalar */
194
static void felem_scalar(felem out, const felem in, limb scalar)
195
334k
{
196
334k
    unsigned int i;
197
198
2.67M
    for (i = 0; i < NLIMBS; i++)
199
2.33M
        out[i] = in[i] * scalar;
200
334k
}
201
202
/* felem_scalar64 sets out = out * scalar */
203
static void felem_scalar64(felem out, limb scalar)
204
155k
{
205
155k
    unsigned int i;
206
207
1.24M
    for (i = 0; i < NLIMBS; i++)
208
1.08M
        out[i] *= scalar;
209
155k
}
210
211
/* felem_scalar128 sets out = out * scalar */
212
static void felem_scalar128(widefelem out, limb scalar)
213
51.7k
{
214
51.7k
    unsigned int i;
215
216
724k
    for (i = 0; i < 2*NLIMBS-1; i++)
217
672k
        out[i] *= scalar;
218
51.7k
}
219
220
/*-
221
 * felem_neg sets |out| to |-in|
222
 * On entry:
223
 *   in[i] < 2^60 - 2^29
224
 * On exit:
225
 *   out[i] < 2^60
226
 */
227
static void felem_neg(felem out, const felem in)
228
0
{
229
    /*
230
     * In order to prevent underflow, we add a multiple of p before subtracting.
231
     * Use telescopic sums to represent 2^12 * p redundantly with each limb
232
     * of the form 2^60 + ...
233
     */
234
0
    static const limb two60m52m4 = (((limb) 1) << 60)
235
0
                                 - (((limb) 1) << 52)
236
0
                                 - (((limb) 1) << 4);
237
0
    static const limb two60p44m12 = (((limb) 1) << 60)
238
0
                                  + (((limb) 1) << 44)
239
0
                                  - (((limb) 1) << 12);
240
0
    static const limb two60m28m4 = (((limb) 1) << 60)
241
0
                                 - (((limb) 1) << 28)
242
0
                                 - (((limb) 1) << 4);
243
0
    static const limb two60m4 = (((limb) 1) << 60)
244
0
                              - (((limb) 1) << 4);
245
246
0
    out[0] = two60p44m12 - in[0];
247
0
    out[1] = two60m52m4 - in[1];
248
0
    out[2] = two60m28m4 - in[2];
249
0
    out[3] = two60m4 - in[3];
250
0
    out[4] = two60m4 - in[4];
251
0
    out[5] = two60m4 - in[5];
252
0
    out[6] = two60m4 - in[6];
253
0
}
254
255
/*-
256
 * felem_diff64 subtracts |in| from |out|
257
 * On entry:
258
 *   in[i] < 2^60 - 2^52 - 2^4
259
 * On exit:
260
 *   out[i] < out_orig[i] + 2^60 + 2^44
261
 */
262
static void felem_diff64(felem out, const felem in)
263
77.6k
{
264
    /*
265
     * In order to prevent underflow, we add a multiple of p before subtracting.
266
     * Use telescopic sums to represent 2^12 * p redundantly with each limb
267
     * of the form 2^60 + ...
268
     */
269
270
77.6k
    static const limb two60m52m4 = (((limb) 1) << 60)
271
77.6k
                                 - (((limb) 1) << 52)
272
77.6k
                                 - (((limb) 1) << 4);
273
77.6k
    static const limb two60p44m12 = (((limb) 1) << 60)
274
77.6k
                                  + (((limb) 1) << 44)
275
77.6k
                                  - (((limb) 1) << 12);
276
77.6k
    static const limb two60m28m4 = (((limb) 1) << 60)
277
77.6k
                                 - (((limb) 1) << 28)
278
77.6k
                                 - (((limb) 1) << 4);
279
77.6k
    static const limb two60m4 = (((limb) 1) << 60)
280
77.6k
                              - (((limb) 1) << 4);
281
282
77.6k
    out[0] += two60p44m12 - in[0];
283
77.6k
    out[1] += two60m52m4 - in[1];
284
77.6k
    out[2] += two60m28m4 - in[2];
285
77.6k
    out[3] += two60m4 - in[3];
286
77.6k
    out[4] += two60m4 - in[4];
287
77.6k
    out[5] += two60m4 - in[5];
288
77.6k
    out[6] += two60m4 - in[6];
289
77.6k
}
290
291
/*
292
 * in[i] < 2^63
293
 * out[i] < out_orig[i] + 2^64 + 2^48
294
 */
295
static void felem_diff_128_64(widefelem out, const felem in)
296
155k
{
297
    /*
298
     * In order to prevent underflow, we add a multiple of p before subtracting.
299
     * Use telescopic sums to represent 2^16 * p redundantly with each limb
300
     * of the form 2^64 + ...
301
     */
302
303
155k
    static const widelimb two64m56m8 = (((widelimb) 1) << 64)
304
155k
                                     - (((widelimb) 1) << 56)
305
155k
                                     - (((widelimb) 1) << 8);
306
155k
    static const widelimb two64m32m8 = (((widelimb) 1) << 64)
307
155k
                                     - (((widelimb) 1) << 32)
308
155k
                                     - (((widelimb) 1) << 8);
309
155k
    static const widelimb two64m8 = (((widelimb) 1) << 64)
310
155k
                                  - (((widelimb) 1) << 8);
311
155k
    static const widelimb two64p48m16 = (((widelimb) 1) << 64)
312
155k
                                      + (((widelimb) 1) << 48)
313
155k
                                      - (((widelimb) 1) << 16);
314
155k
    unsigned int i;
315
316
155k
    out[0] += two64p48m16;
317
155k
    out[1] += two64m56m8;
318
155k
    out[2] += two64m32m8;
319
155k
    out[3] += two64m8;
320
155k
    out[4] += two64m8;
321
155k
    out[5] += two64m8;
322
155k
    out[6] += two64m8;
323
324
1.24M
    for (i = 0; i < NLIMBS; i++)
325
1.08M
        out[i] -= in[i];
326
155k
}
327
328
/*
329
 * in[i] < 2^127 - 2^119 - 2^71
330
 * out[i] < out_orig[i] + 2^127 + 2^111
331
 */
332
static void felem_diff128(widefelem out, const widefelem in)
333
51.7k
{
334
    /*
335
     * In order to prevent underflow, we add a multiple of p before subtracting.
336
     * Use telescopic sums to represent 2^415 * p redundantly with each limb
337
     * of the form 2^127 + ...
338
     */
339
340
51.7k
    static const widelimb two127 = ((widelimb) 1) << 127;
341
51.7k
    static const widelimb two127m71 = (((widelimb) 1) << 127)
342
51.7k
                                    - (((widelimb) 1) << 71);
343
51.7k
    static const widelimb two127p111m79m71 = (((widelimb) 1) << 127)
344
51.7k
                                           + (((widelimb) 1) << 111)
345
51.7k
                                           - (((widelimb) 1) << 79)
346
51.7k
                                           - (((widelimb) 1) << 71);
347
51.7k
    static const widelimb two127m119m71 = (((widelimb) 1) << 127)
348
51.7k
                                        - (((widelimb) 1) << 119)
349
51.7k
                                        - (((widelimb) 1) << 71);
350
51.7k
    static const widelimb two127m95m71 = (((widelimb) 1) << 127)
351
51.7k
                                       - (((widelimb) 1) << 95)
352
51.7k
                                       - (((widelimb) 1) << 71);
353
51.7k
    unsigned int i;
354
355
51.7k
    out[0]  += two127;
356
51.7k
    out[1]  += two127m71;
357
51.7k
    out[2]  += two127m71;
358
51.7k
    out[3]  += two127m71;
359
51.7k
    out[4]  += two127m71;
360
51.7k
    out[5]  += two127m71;
361
51.7k
    out[6]  += two127p111m79m71;
362
51.7k
    out[7]  += two127m119m71;
363
51.7k
    out[8]  += two127m95m71;
364
51.7k
    out[9]  += two127m71;
365
51.7k
    out[10] += two127m71;
366
51.7k
    out[11] += two127m71;
367
51.7k
    out[12] += two127m71;
368
369
724k
    for (i = 0; i < 2*NLIMBS-1; i++)
370
672k
        out[i] -= in[i];
371
51.7k
}
372
373
static void felem_square_ref(widefelem out, const felem in)
374
308k
{
375
308k
    felem inx2;
376
308k
    felem_scalar(inx2, in, 2);
377
378
308k
    out[0] = ((uint128_t) in[0]) * in[0];
379
380
308k
    out[1] = ((uint128_t) in[0]) * inx2[1];
381
382
308k
    out[2] = ((uint128_t) in[0]) * inx2[2]
383
308k
           + ((uint128_t) in[1]) * in[1];
384
385
308k
    out[3] = ((uint128_t) in[0]) * inx2[3]
386
308k
           + ((uint128_t) in[1]) * inx2[2];
387
388
308k
    out[4] = ((uint128_t) in[0]) * inx2[4]
389
308k
           + ((uint128_t) in[1]) * inx2[3]
390
308k
           + ((uint128_t) in[2]) * in[2];
391
392
308k
    out[5] = ((uint128_t) in[0]) * inx2[5]
393
308k
           + ((uint128_t) in[1]) * inx2[4]
394
308k
           + ((uint128_t) in[2]) * inx2[3];
395
396
308k
    out[6] = ((uint128_t) in[0]) * inx2[6]
397
308k
           + ((uint128_t) in[1]) * inx2[5]
398
308k
           + ((uint128_t) in[2]) * inx2[4]
399
308k
           + ((uint128_t) in[3]) * in[3];
400
401
308k
    out[7] = ((uint128_t) in[1]) * inx2[6]
402
308k
           + ((uint128_t) in[2]) * inx2[5]
403
308k
           + ((uint128_t) in[3]) * inx2[4];
404
405
308k
    out[8] = ((uint128_t) in[2]) * inx2[6]
406
308k
           + ((uint128_t) in[3]) * inx2[5]
407
308k
           + ((uint128_t) in[4]) * in[4];
408
409
308k
    out[9] = ((uint128_t) in[3]) * inx2[6]
410
308k
           + ((uint128_t) in[4]) * inx2[5];
411
412
308k
    out[10] = ((uint128_t) in[4]) * inx2[6]
413
308k
            + ((uint128_t) in[5]) * in[5];
414
415
308k
    out[11] = ((uint128_t) in[5]) * inx2[6];
416
417
308k
    out[12] = ((uint128_t) in[6]) * in[6];
418
308k
}
419
420
static void felem_mul_ref(widefelem out, const felem in1, const felem in2)
421
289k
{
422
289k
    out[0] = ((uint128_t) in1[0]) * in2[0];
423
424
289k
    out[1] = ((uint128_t) in1[0]) * in2[1]
425
289k
           + ((uint128_t) in1[1]) * in2[0];
426
427
289k
    out[2] = ((uint128_t) in1[0]) * in2[2]
428
289k
           + ((uint128_t) in1[1]) * in2[1]
429
289k
           + ((uint128_t) in1[2]) * in2[0];
430
431
289k
    out[3] = ((uint128_t) in1[0]) * in2[3]
432
289k
           + ((uint128_t) in1[1]) * in2[2]
433
289k
           + ((uint128_t) in1[2]) * in2[1]
434
289k
           + ((uint128_t) in1[3]) * in2[0];
435
436
289k
    out[4] = ((uint128_t) in1[0]) * in2[4]
437
289k
           + ((uint128_t) in1[1]) * in2[3]
438
289k
           + ((uint128_t) in1[2]) * in2[2]
439
289k
           + ((uint128_t) in1[3]) * in2[1]
440
289k
           + ((uint128_t) in1[4]) * in2[0];
441
442
289k
    out[5] = ((uint128_t) in1[0]) * in2[5]
443
289k
           + ((uint128_t) in1[1]) * in2[4]
444
289k
           + ((uint128_t) in1[2]) * in2[3]
445
289k
           + ((uint128_t) in1[3]) * in2[2]
446
289k
           + ((uint128_t) in1[4]) * in2[1]
447
289k
           + ((uint128_t) in1[5]) * in2[0];
448
449
289k
    out[6] = ((uint128_t) in1[0]) * in2[6]
450
289k
           + ((uint128_t) in1[1]) * in2[5]
451
289k
           + ((uint128_t) in1[2]) * in2[4]
452
289k
           + ((uint128_t) in1[3]) * in2[3]
453
289k
           + ((uint128_t) in1[4]) * in2[2]
454
289k
           + ((uint128_t) in1[5]) * in2[1]
455
289k
           + ((uint128_t) in1[6]) * in2[0];
456
457
289k
    out[7] = ((uint128_t) in1[1]) * in2[6]
458
289k
           + ((uint128_t) in1[2]) * in2[5]
459
289k
           + ((uint128_t) in1[3]) * in2[4]
460
289k
           + ((uint128_t) in1[4]) * in2[3]
461
289k
           + ((uint128_t) in1[5]) * in2[2]
462
289k
           + ((uint128_t) in1[6]) * in2[1];
463
464
289k
    out[8] = ((uint128_t) in1[2]) * in2[6]
465
289k
           + ((uint128_t) in1[3]) * in2[5]
466
289k
           + ((uint128_t) in1[4]) * in2[4]
467
289k
           + ((uint128_t) in1[5]) * in2[3]
468
289k
           + ((uint128_t) in1[6]) * in2[2];
469
470
289k
    out[9] = ((uint128_t) in1[3]) * in2[6]
471
289k
           + ((uint128_t) in1[4]) * in2[5]
472
289k
           + ((uint128_t) in1[5]) * in2[4]
473
289k
           + ((uint128_t) in1[6]) * in2[3];
474
475
289k
    out[10] = ((uint128_t) in1[4]) * in2[6]
476
289k
            + ((uint128_t) in1[5]) * in2[5]
477
289k
            + ((uint128_t) in1[6]) * in2[4];
478
479
289k
    out[11] = ((uint128_t) in1[5]) * in2[6]
480
289k
            + ((uint128_t) in1[6]) * in2[5];
481
482
289k
    out[12] = ((uint128_t) in1[6]) * in2[6];
483
289k
}
484
485
/*-
486
 * Reduce thirteen 128-bit coefficients to seven 64-bit coefficients.
487
 * in[i] < 2^128 - 2^125
488
 * out[i] < 2^56 for i < 6,
489
 * out[6] <= 2^48
490
 *
491
 * The technique in use here stems from the format of the prime modulus:
492
 * P384 = 2^384 - delta
493
 *
494
 * Thus we can reduce numbers of the form (X + 2^384 * Y) by substituting
495
 * them with (X + delta Y), with delta = 2^128 + 2^96 + (-2^32 + 1). These
496
 * coefficients are still quite large, and so we repeatedly apply this
497
 * technique on high-order bits in order to guarantee the desired bounds on
498
 * the size of our output.
499
 *
500
 * The three phases of elimination are as follows:
501
 * [1]: Y = 2^120 (in[12] | in[11] | in[10] | in[9])
502
 * [2]: Y = 2^8 (acc[8] | acc[7])
503
 * [3]: Y = 2^48 (acc[6] >> 48)
504
 * (Where a | b | c | d = (2^56)^3 a + (2^56)^2 b + (2^56) c + d)
505
 */
506
static void felem_reduce(felem out, const widefelem in)
507
545k
{
508
    /*
509
     * In order to prevent underflow, we add a multiple of p before subtracting.
510
     * Use telescopic sums to represent 2^76 * p redundantly with each limb
511
     * of the form 2^124 + ...
512
     */
513
545k
    static const widelimb two124m68 = (((widelimb) 1) << 124)
514
545k
                                    - (((widelimb) 1) << 68);
515
545k
    static const widelimb two124m116m68 = (((widelimb) 1) << 124)
516
545k
                                        - (((widelimb) 1) << 116)
517
545k
                                        - (((widelimb) 1) << 68);
518
545k
    static const widelimb two124p108m76 = (((widelimb) 1) << 124)
519
545k
                                        + (((widelimb) 1) << 108)
520
545k
                                        - (((widelimb) 1) << 76);
521
545k
    static const widelimb two124m92m68 = (((widelimb) 1) << 124)
522
545k
                                       - (((widelimb) 1) << 92)
523
545k
                                       - (((widelimb) 1) << 68);
524
545k
    widelimb temp, acc[9];
525
545k
    unsigned int i;
526
527
545k
    memcpy(acc, in, sizeof(widelimb) * 9);
528
529
545k
    acc[0] += two124p108m76;
530
545k
    acc[1] += two124m116m68;
531
545k
    acc[2] += two124m92m68;
532
545k
    acc[3] += two124m68;
533
545k
    acc[4] += two124m68;
534
545k
    acc[5] += two124m68;
535
545k
    acc[6] += two124m68;
536
537
    /* [1]: Eliminate in[9], ..., in[12] */
538
545k
    acc[8] += in[12] >> 32;
539
545k
    acc[7] += (in[12] & 0xffffffff) << 24;
540
545k
    acc[7] += in[12] >> 8;
541
545k
    acc[6] += (in[12] & 0xff) << 48;
542
545k
    acc[6] -= in[12] >> 16;
543
545k
    acc[5] -= (in[12] & 0xffff) << 40;
544
545k
    acc[6] += in[12] >> 48;
545
545k
    acc[5] += (in[12] & 0xffffffffffff) << 8;
546
547
545k
    acc[7] += in[11] >> 32;
548
545k
    acc[6] += (in[11] & 0xffffffff) << 24;
549
545k
    acc[6] += in[11] >> 8;
550
545k
    acc[5] += (in[11] & 0xff) << 48;
551
545k
    acc[5] -= in[11] >> 16;
552
545k
    acc[4] -= (in[11] & 0xffff) << 40;
553
545k
    acc[5] += in[11] >> 48;
554
545k
    acc[4] += (in[11] & 0xffffffffffff) << 8;
555
556
545k
    acc[6] += in[10] >> 32;
557
545k
    acc[5] += (in[10] & 0xffffffff) << 24;
558
545k
    acc[5] += in[10] >> 8;
559
545k
    acc[4] += (in[10] & 0xff) << 48;
560
545k
    acc[4] -= in[10] >> 16;
561
545k
    acc[3] -= (in[10] & 0xffff) << 40;
562
545k
    acc[4] += in[10] >> 48;
563
545k
    acc[3] += (in[10] & 0xffffffffffff) << 8;
564
565
545k
    acc[5] += in[9] >> 32;
566
545k
    acc[4] += (in[9] & 0xffffffff) << 24;
567
545k
    acc[4] += in[9] >> 8;
568
545k
    acc[3] += (in[9] & 0xff) << 48;
569
545k
    acc[3] -= in[9] >> 16;
570
545k
    acc[2] -= (in[9] & 0xffff) << 40;
571
545k
    acc[3] += in[9] >> 48;
572
545k
    acc[2] += (in[9] & 0xffffffffffff) << 8;
573
574
    /*
575
     * [2]: Eliminate acc[7], acc[8], that is the 7 and eighth limbs, as
576
     * well as the contributions made from eliminating higher limbs.
577
     * acc[7] < in[7] + 2^120 + 2^56 < in[7] + 2^121
578
     * acc[8] < in[8] + 2^96
579
     */
580
545k
    acc[4] += acc[8] >> 32;
581
545k
    acc[3] += (acc[8] & 0xffffffff) << 24;
582
545k
    acc[3] += acc[8] >> 8;
583
545k
    acc[2] += (acc[8] & 0xff) << 48;
584
545k
    acc[2] -= acc[8] >> 16;
585
545k
    acc[1] -= (acc[8] & 0xffff) << 40;
586
545k
    acc[2] += acc[8] >> 48;
587
545k
    acc[1] += (acc[8] & 0xffffffffffff) << 8;
588
589
545k
    acc[3] += acc[7] >> 32;
590
545k
    acc[2] += (acc[7] & 0xffffffff) << 24;
591
545k
    acc[2] += acc[7] >> 8;
592
545k
    acc[1] += (acc[7] & 0xff) << 48;
593
545k
    acc[1] -= acc[7] >> 16;
594
545k
    acc[0] -= (acc[7] & 0xffff) << 40;
595
545k
    acc[1] += acc[7] >> 48;
596
545k
    acc[0] += (acc[7] & 0xffffffffffff) << 8;
597
598
    /*-
599
     * acc[k] < in[k] + 2^124 + 2^121
600
     *        < in[k] + 2^125
601
     *        < 2^128, for k <= 6
602
     */
603
604
    /*
605
     * Carry 4 -> 5 -> 6
606
     * This has the effect of ensuring that these more significant limbs
607
     * will be small in value after eliminating high bits from acc[6].
608
     */
609
545k
    acc[5] += acc[4] >> 56;
610
545k
    acc[4] &= 0x00ffffffffffffff;
611
612
545k
    acc[6] += acc[5] >> 56;
613
545k
    acc[5] &= 0x00ffffffffffffff;
614
615
    /*-
616
     * acc[6] < in[6] + 2^124 + 2^121 + 2^72 + 2^16
617
     *        < in[6] + 2^125
618
     *        < 2^128
619
     */
620
621
    /* [3]: Eliminate high bits of acc[6] */
622
545k
    temp = acc[6] >> 48;
623
545k
    acc[6] &= 0x0000ffffffffffff;
624
625
    /* temp < 2^80 */
626
627
545k
    acc[3] += temp >> 40;
628
545k
    acc[2] += (temp & 0xffffffffff) << 16;
629
545k
    acc[2] += temp >> 16;
630
545k
    acc[1] += (temp & 0xffff) << 40;
631
545k
    acc[1] -= temp >> 24;
632
545k
    acc[0] -= (temp & 0xffffff) << 32;
633
545k
    acc[0] += temp;
634
635
    /*-
636
     * acc[k] < acc_old[k] + 2^64 + 2^56
637
     *        < in[k] + 2^124 + 2^121 + 2^72 + 2^64 + 2^56 + 2^16 , k < 4
638
     */
639
640
    /* Carry 0 -> 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
641
545k
    acc[1] += acc[0] >> 56;   /* acc[1] < acc_old[1] + 2^72 */
642
545k
    acc[0] &= 0x00ffffffffffffff;
643
644
545k
    acc[2] += acc[1] >> 56;   /* acc[2] < acc_old[2] + 2^72 + 2^16 */
645
545k
    acc[1] &= 0x00ffffffffffffff;
646
647
545k
    acc[3] += acc[2] >> 56;   /* acc[3] < acc_old[3] + 2^72 + 2^16 */
648
545k
    acc[2] &= 0x00ffffffffffffff;
649
650
    /*-
651
     * acc[k] < acc_old[k] + 2^72 + 2^16
652
     *        < in[k] + 2^124 + 2^121 + 2^73 + 2^64 + 2^56 + 2^17
653
     *        < in[k] + 2^125
654
     *        < 2^128 , k < 4
655
     */
656
657
545k
    acc[4] += acc[3] >> 56;   /*-
658
                               * acc[4] < acc_old[4] + 2^72 + 2^16
659
                               *        < 2^72 + 2^56 + 2^16
660
                               */
661
545k
    acc[3] &= 0x00ffffffffffffff;
662
663
545k
    acc[5] += acc[4] >> 56;   /*-
664
                               * acc[5] < acc_old[5] + 2^16 + 1
665
                               *        < 2^56 + 2^16 + 1
666
                               */
667
545k
    acc[4] &= 0x00ffffffffffffff;
668
669
545k
    acc[6] += acc[5] >> 56;   /* acc[6] < 2^48 + 1 <= 2^48 */
670
545k
    acc[5] &= 0x00ffffffffffffff;
671
672
4.36M
    for (i = 0; i < NLIMBS; i++)
673
3.81M
        out[i] = acc[i];
674
545k
}
675
676
#if defined(ECP_NISTP384_ASM)
677
static void felem_square_wrapper(widefelem out, const felem in);
678
static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2);
679
680
static void (*felem_square_p)(widefelem out, const felem in) =
681
    felem_square_wrapper;
682
static void (*felem_mul_p)(widefelem out, const felem in1, const felem in2) =
683
    felem_mul_wrapper;
684
685
void p384_felem_square(widefelem out, const felem in);
686
void p384_felem_mul(widefelem out, const felem in1, const felem in2);
687
688
# if defined(_ARCH_PPC64)
689
#  include "crypto/ppc_arch.h"
690
# endif
691
692
static void felem_select(void)
693
{
694
# if defined(_ARCH_PPC64)
695
    if ((OPENSSL_ppccap_P & PPC_MADD300) && (OPENSSL_ppccap_P & PPC_ALTIVEC)) {
696
        felem_square_p = p384_felem_square;
697
        felem_mul_p = p384_felem_mul;
698
699
        return;
700
    }
701
# endif
702
703
    /* Default */
704
    felem_square_p = felem_square_ref;
705
    felem_mul_p = felem_mul_ref;
706
}
707
708
static void felem_square_wrapper(widefelem out, const felem in)
709
{
710
    felem_select();
711
    felem_square_p(out, in);
712
}
713
714
static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2)
715
{
716
    felem_select();
717
    felem_mul_p(out, in1, in2);
718
}
719
720
# define felem_square felem_square_p
721
# define felem_mul felem_mul_p
722
#else
723
308k
# define felem_square felem_square_ref
724
289k
# define felem_mul felem_mul_ref
725
#endif
726
727
static ossl_inline void felem_square_reduce(felem out, const felem in)
728
204k
{
729
204k
    widefelem tmp;
730
731
204k
    felem_square(tmp, in);
732
204k
    felem_reduce(out, tmp);
733
204k
}
734
735
static ossl_inline void felem_mul_reduce(felem out, const felem in1, const felem in2)
736
158k
{
737
158k
    widefelem tmp;
738
739
158k
    felem_mul(tmp, in1, in2);
740
158k
    felem_reduce(out, tmp);
741
158k
}
742
743
/*-
744
 * felem_inv calculates |out| = |in|^{-1}
745
 *
746
 * Based on Fermat's Little Theorem:
747
 *   a^p = a (mod p)
748
 *   a^{p-1} = 1 (mod p)
749
 *   a^{p-2} = a^{-1} (mod p)
750
 */
751
static void felem_inv(felem out, const felem in)
752
264
{
753
264
    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6;
754
264
    unsigned int i = 0;
755
756
264
    felem_square_reduce(ftmp, in);      /* 2^1 */
757
264
    felem_mul_reduce(ftmp, ftmp, in);   /* 2^1 + 2^0 */
758
264
    felem_assign(ftmp2, ftmp);
759
760
264
    felem_square_reduce(ftmp, ftmp);    /* 2^2 + 2^1 */
761
264
    felem_mul_reduce(ftmp, ftmp, in);   /* 2^2 + 2^1 * 2^0 */
762
264
    felem_assign(ftmp3, ftmp);
763
764
1.05k
    for (i = 0; i < 3; i++)
765
792
        felem_square_reduce(ftmp, ftmp); /* 2^5 + 2^4 + 2^3 */
766
264
    felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^5 + 2^4 + 2^3 + 2^2 + 2^1 + 2^0 */
767
264
    felem_assign(ftmp4, ftmp);
768
769
1.84k
    for (i = 0; i < 6; i++)
770
1.58k
        felem_square_reduce(ftmp, ftmp); /* 2^11 + ... + 2^6 */
771
264
    felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^11 + ... + 2^0 */
772
773
1.05k
    for (i = 0; i < 3; i++)
774
792
        felem_square_reduce(ftmp, ftmp); /* 2^14 + ... + 2^3 */
775
264
    felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^14 + ... + 2^0 */
776
264
    felem_assign(ftmp5, ftmp);
777
778
4.22k
    for (i = 0; i < 15; i++)
779
3.96k
        felem_square_reduce(ftmp, ftmp); /* 2^29 + ... + 2^15 */
780
264
    felem_mul_reduce(ftmp, ftmp5, ftmp); /* 2^29 + ... + 2^0 */
781
264
    felem_assign(ftmp6, ftmp);
782
783
8.18k
    for (i = 0; i < 30; i++)
784
7.92k
        felem_square_reduce(ftmp, ftmp); /* 2^59 + ... + 2^30 */
785
264
    felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^59 + ... + 2^0 */
786
264
    felem_assign(ftmp4, ftmp);
787
788
16.1k
    for (i = 0; i < 60; i++)
789
15.8k
        felem_square_reduce(ftmp, ftmp); /* 2^119 + ... + 2^60 */
790
264
    felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^119 + ... + 2^0 */
791
264
    felem_assign(ftmp4, ftmp);
792
793
31.9k
    for (i = 0; i < 120; i++)
794
31.6k
      felem_square_reduce(ftmp, ftmp);   /* 2^239 + ... + 2^120 */
795
264
    felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^239 + ... + 2^0 */
796
797
4.22k
    for (i = 0; i < 15; i++)
798
3.96k
        felem_square_reduce(ftmp, ftmp); /* 2^254 + ... + 2^15 */
799
264
    felem_mul_reduce(ftmp, ftmp5, ftmp); /* 2^254 + ... + 2^0 */
800
801
8.44k
    for (i = 0; i < 31; i++)
802
8.18k
        felem_square_reduce(ftmp, ftmp); /* 2^285 + ... + 2^31 */
803
264
    felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^285 + ... + 2^31 + 2^29 + ... + 2^0 */
804
805
792
    for (i = 0; i < 2; i++)
806
528
        felem_square_reduce(ftmp, ftmp); /* 2^287 + ... + 2^33 + 2^31 + ... + 2^2 */
807
264
    felem_mul_reduce(ftmp, ftmp2, ftmp); /* 2^287 + ... + 2^33 + 2^31 + ... + 2^0 */
808
809
25.0k
    for (i = 0; i < 94; i++)
810
24.8k
        felem_square_reduce(ftmp, ftmp); /* 2^381 + ... + 2^127 + 2^125 + ... + 2^94 */
811
264
    felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^381 + ... + 2^127 + 2^125 + ... + 2^94 + 2^29 + ... + 2^0 */
812
813
792
    for (i = 0; i < 2; i++)
814
528
        felem_square_reduce(ftmp, ftmp); /* 2^383 + ... + 2^129 + 2^127 + ... + 2^96 + 2^31 + ... + 2^2 */
815
264
    felem_mul_reduce(ftmp, in, ftmp);    /* 2^383 + ... + 2^129 + 2^127 + ... + 2^96 + 2^31 + ... + 2^2 + 2^0 */
816
817
264
    memcpy(out, ftmp, sizeof(felem));
818
264
}
819
820
/*
821
 * Zero-check: returns a limb with all bits set if |in| == 0 (mod p)
822
 * and 0 otherwise. We know that field elements are reduced to
823
 * 0 < in < 2p, so we only need to check two cases:
824
 * 0 and 2^384 - 2^128 - 2^96 + 2^32 - 1
825
 *   in[k] < 2^56, k < 6
826
 *   in[6] <= 2^48
827
 */
828
static limb felem_is_zero(const felem in)
829
103k
{
830
103k
    limb zero, p384;
831
832
103k
    zero = in[0] | in[1] | in[2] | in[3] | in[4] | in[5] | in[6];
833
103k
    zero = ((int64_t) (zero) - 1) >> 63;
834
103k
    p384 = (in[0] ^ 0x000000ffffffff) | (in[1] ^ 0xffff0000000000)
835
103k
         | (in[2] ^ 0xfffffffffeffff) | (in[3] ^ 0xffffffffffffff)
836
103k
         | (in[4] ^ 0xffffffffffffff) | (in[5] ^ 0xffffffffffffff)
837
103k
         | (in[6] ^ 0xffffffffffff);
838
103k
    p384 = ((int64_t) (p384) - 1) >> 63;
839
840
103k
    return (zero | p384);
841
103k
}
842
843
static int felem_is_zero_int(const void *in)
844
0
{
845
0
    return (int)(felem_is_zero(in) & ((limb) 1));
846
0
}
847
848
/*-
849
 * felem_contract converts |in| to its unique, minimal representation.
850
 * Assume we've removed all redundant bits.
851
 * On entry:
852
 *   in[k] < 2^56, k < 6
853
 *   in[6] <= 2^48
854
 */
855
static void felem_contract(felem out, const felem in)
856
1.32k
{
857
1.32k
    static const int64_t two56 = ((limb) 1) << 56;
858
859
    /*
860
     * We know for a fact that 0 <= |in| < 2*p, for p = 2^384 - 2^128 - 2^96 + 2^32 - 1
861
     * Perform two successive, idempotent subtractions to reduce if |in| >= p.
862
     */
863
864
1.32k
    int64_t tmp[NLIMBS], cond[5], a;
865
1.32k
    unsigned int i;
866
867
1.32k
    memcpy(tmp, in, sizeof(felem));
868
869
    /* Case 1: a = 1 iff |in| >= 2^384 */
870
1.32k
    a = (in[6] >> 48);
871
1.32k
    tmp[0] += a;
872
1.32k
    tmp[0] -= a << 32;
873
1.32k
    tmp[1] += a << 40;
874
1.32k
    tmp[2] += a << 16;
875
1.32k
    tmp[6] &= 0x0000ffffffffffff;
876
877
    /*
878
     * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
879
     * non-zero, so we only need one step
880
     */
881
882
1.32k
    a = tmp[0] >> 63;
883
1.32k
    tmp[0] += a & two56;
884
1.32k
    tmp[1] -= a & 1;
885
886
    /* Carry 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
887
1.32k
    tmp[2] += tmp[1] >> 56;
888
1.32k
    tmp[1] &= 0x00ffffffffffffff;
889
890
1.32k
    tmp[3] += tmp[2] >> 56;
891
1.32k
    tmp[2] &= 0x00ffffffffffffff;
892
893
1.32k
    tmp[4] += tmp[3] >> 56;
894
1.32k
    tmp[3] &= 0x00ffffffffffffff;
895
896
1.32k
    tmp[5] += tmp[4] >> 56;
897
1.32k
    tmp[4] &= 0x00ffffffffffffff;
898
899
1.32k
    tmp[6] += tmp[5] >> 56; /* tmp[6] < 2^48 */
900
1.32k
    tmp[5] &= 0x00ffffffffffffff;
901
902
    /*
903
     * Case 2: a = all ones if p <= |in| < 2^384, 0 otherwise
904
     */
905
906
    /* 0 iff (2^129..2^383) are all one */
907
1.32k
    cond[0] = ((tmp[6] | 0xff000000000000) & tmp[5] & tmp[4] & tmp[3] & (tmp[2] | 0x0000000001ffff)) + 1;
908
    /* 0 iff 2^128 bit is one */
909
1.32k
    cond[1] = (tmp[2] | ~0x00000000010000) + 1;
910
    /* 0 iff (2^96..2^127) bits are all one */
911
1.32k
    cond[2] = ((tmp[2] | 0xffffffffff0000) & (tmp[1] | 0x0000ffffffffff)) + 1;
912
    /* 0 iff (2^32..2^95) bits are all zero */
913
1.32k
    cond[3] = (tmp[1] & ~0xffff0000000000) | (tmp[0] & ~((int64_t) 0x000000ffffffff));
914
    /* 0 iff (2^0..2^31) bits are all one */
915
1.32k
    cond[4] = (tmp[0] | 0xffffff00000000) + 1;
916
917
    /*
918
     * In effect, invert our conditions, so that 0 values become all 1's,
919
     * any non-zero value in the low-order 56 bits becomes all 0's
920
     */
921
7.92k
    for (i = 0; i < 5; i++)
922
6.60k
       cond[i] = ((cond[i] & 0x00ffffffffffffff) - 1) >> 63;
923
924
    /*
925
     * The condition for determining whether in is greater than our
926
     * prime is given by the following condition.
927
     */
928
929
    /* First subtract 2^384 - 2^129 cheaply */
930
1.32k
    a = cond[0] & (cond[1] | (cond[2] & (~cond[3] | cond[4])));
931
1.32k
    tmp[6] &= ~a;
932
1.32k
    tmp[5] &= ~a;
933
1.32k
    tmp[4] &= ~a;
934
1.32k
    tmp[3] &= ~a;
935
1.32k
    tmp[2] &= ~a | 0x0000000001ffff;
936
937
    /*
938
     * Subtract 2^128 - 2^96 by
939
     * means of disjoint cases.
940
     */
941
942
    /* subtract 2^128 if that bit is present, and add 2^96 */
943
1.32k
    a = cond[0] & cond[1];
944
1.32k
    tmp[2] &= ~a | 0xfffffffffeffff;
945
1.32k
    tmp[1] += a & ((int64_t) 1 << 40);
946
947
    /* otherwise, clear bits 2^127 .. 2^96  */
948
1.32k
    a = cond[0] & ~cond[1] & (cond[2] & (~cond[3] | cond[4]));
949
1.32k
    tmp[2] &= ~a | 0xffffffffff0000;
950
1.32k
    tmp[1] &= ~a | 0x0000ffffffffff;
951
952
    /* finally, subtract the last 2^32 - 1 */
953
1.32k
    a = cond[0] & (cond[1] | (cond[2] & (~cond[3] | cond[4])));
954
1.32k
    tmp[0] += a & (-((int64_t) 1 << 32) + 1);
955
956
    /*
957
     * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
958
     * non-zero, so we only need one step
959
     */
960
1.32k
    a = tmp[0] >> 63;
961
1.32k
    tmp[0] += a & two56;
962
1.32k
    tmp[1] -= a & 1;
963
964
    /* Carry 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
965
1.32k
    tmp[2] += tmp[1] >> 56;
966
1.32k
    tmp[1] &= 0x00ffffffffffffff;
967
968
1.32k
    tmp[3] += tmp[2] >> 56;
969
1.32k
    tmp[2] &= 0x00ffffffffffffff;
970
971
1.32k
    tmp[4] += tmp[3] >> 56;
972
1.32k
    tmp[3] &= 0x00ffffffffffffff;
973
974
1.32k
    tmp[5] += tmp[4] >> 56;
975
1.32k
    tmp[4] &= 0x00ffffffffffffff;
976
977
1.32k
    tmp[6] += tmp[5] >> 56;
978
1.32k
    tmp[5] &= 0x00ffffffffffffff;
979
980
1.32k
    memcpy(out, tmp, sizeof(felem));
981
1.32k
}
982
983
/*-
984
 * Group operations
985
 * ----------------
986
 *
987
 * Building on top of the field operations we have the operations on the
988
 * elliptic curve group itself. Points on the curve are represented in Jacobian
989
 * coordinates
990
 */
991
992
/*-
993
 * point_double calculates 2*(x_in, y_in, z_in)
994
 *
995
 * The method is taken from:
996
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
997
 *
998
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
999
 * while x_out == y_in is not (maybe this works, but it's not tested).
1000
 */
1001
static void
1002
point_double(felem x_out, felem y_out, felem z_out,
1003
             const felem x_in, const felem y_in, const felem z_in)
1004
25.8k
{
1005
25.8k
    widefelem tmp, tmp2;
1006
25.8k
    felem delta, gamma, beta, alpha, ftmp, ftmp2;
1007
1008
25.8k
    felem_assign(ftmp, x_in);
1009
25.8k
    felem_assign(ftmp2, x_in);
1010
1011
    /* delta = z^2 */
1012
25.8k
    felem_square_reduce(delta, z_in);     /* delta[i] < 2^56 */
1013
1014
    /* gamma = y^2 */
1015
25.8k
    felem_square_reduce(gamma, y_in);     /* gamma[i] < 2^56 */
1016
1017
    /* beta = x*gamma */
1018
25.8k
    felem_mul_reduce(beta, x_in, gamma);  /* beta[i] < 2^56 */
1019
1020
    /* alpha = 3*(x-delta)*(x+delta) */
1021
25.8k
    felem_diff64(ftmp, delta);            /* ftmp[i] < 2^60 + 2^58 + 2^44 */
1022
25.8k
    felem_sum64(ftmp2, delta);            /* ftmp2[i] < 2^59 */
1023
25.8k
    felem_scalar64(ftmp2, 3);             /* ftmp2[i] < 2^61 */
1024
25.8k
    felem_mul_reduce(alpha, ftmp, ftmp2); /* alpha[i] < 2^56 */
1025
1026
    /* x' = alpha^2 - 8*beta */
1027
25.8k
    felem_square(tmp, alpha);             /* tmp[i] < 2^115 */
1028
25.8k
    felem_assign(ftmp, beta);             /* ftmp[i] < 2^56 */
1029
25.8k
    felem_scalar64(ftmp, 8);              /* ftmp[i] < 2^59 */
1030
25.8k
    felem_diff_128_64(tmp, ftmp);         /* tmp[i] < 2^115 + 2^64 + 2^48 */
1031
25.8k
    felem_reduce(x_out, tmp);             /* x_out[i] < 2^56 */
1032
1033
    /* z' = (y + z)^2 - gamma - delta */
1034
25.8k
    felem_sum64(delta, gamma);     /* delta[i] < 2^57 */
1035
25.8k
    felem_assign(ftmp, y_in);      /* ftmp[i] < 2^56 */
1036
25.8k
    felem_sum64(ftmp, z_in);       /* ftmp[i] < 2^56 */
1037
25.8k
    felem_square(tmp, ftmp);       /* tmp[i] < 2^115 */
1038
25.8k
    felem_diff_128_64(tmp, delta); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1039
25.8k
    felem_reduce(z_out, tmp);      /* z_out[i] < 2^56 */
1040
1041
    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1042
25.8k
    felem_scalar64(beta, 4);       /* beta[i] < 2^58 */
1043
25.8k
    felem_diff64(beta, x_out);     /* beta[i] < 2^60 + 2^58 + 2^44 */
1044
25.8k
    felem_mul(tmp, alpha, beta);   /* tmp[i] < 2^119 */
1045
25.8k
    felem_square(tmp2, gamma);     /* tmp2[i] < 2^115 */
1046
25.8k
    felem_scalar128(tmp2, 8);      /* tmp2[i] < 2^118 */
1047
25.8k
    felem_diff128(tmp, tmp2);      /* tmp[i] < 2^127 + 2^119 + 2^111 */
1048
25.8k
    felem_reduce(y_out, tmp);      /* tmp[i] < 2^56 */
1049
25.8k
}
1050
1051
/* copy_conditional copies in to out iff mask is all ones. */
1052
static void copy_conditional(felem out, const felem in, limb mask)
1053
155k
{
1054
155k
    unsigned int i;
1055
1056
1.24M
    for (i = 0; i < NLIMBS; i++)
1057
1.08M
        out[i] ^= mask & (in[i] ^ out[i]);
1058
155k
}
1059
1060
/*-
1061
 * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1062
 *
1063
 * The method is taken from
1064
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1065
 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1066
 *
1067
 * This function includes a branch for checking whether the two input points
1068
 * are equal (while not equal to the point at infinity). See comment below
1069
 * on constant-time.
1070
 */
1071
static void point_add(felem x3, felem y3, felem z3,
1072
                      const felem x1, const felem y1, const felem z1,
1073
                      const int mixed, const felem x2, const felem y2,
1074
                      const felem z2)
1075
25.8k
{
1076
25.8k
    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1077
25.8k
    widefelem tmp, tmp2;
1078
25.8k
    limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1079
25.8k
    limb points_equal;
1080
1081
25.8k
    z1_is_zero = felem_is_zero(z1);
1082
25.8k
    z2_is_zero = felem_is_zero(z2);
1083
1084
    /* ftmp = z1z1 = z1**2 */
1085
25.8k
    felem_square_reduce(ftmp, z1);      /* ftmp[i] < 2^56 */
1086
1087
25.8k
    if (!mixed) {
1088
        /* ftmp2 = z2z2 = z2**2 */
1089
0
        felem_square_reduce(ftmp2, z2); /* ftmp2[i] < 2^56 */
1090
1091
        /* u1 = ftmp3 = x1*z2z2 */
1092
0
        felem_mul_reduce(ftmp3, x1, ftmp2); /* ftmp3[i] < 2^56 */
1093
1094
        /* ftmp5 = z1 + z2 */
1095
0
        felem_assign(ftmp5, z1);       /* ftmp5[i] < 2^56 */
1096
0
        felem_sum64(ftmp5, z2);        /* ftmp5[i] < 2^57 */
1097
1098
        /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1099
0
        felem_square(tmp, ftmp5);      /* tmp[i] < 2^117 */
1100
0
        felem_diff_128_64(tmp, ftmp);  /* tmp[i] < 2^117 + 2^64 + 2^48 */
1101
0
        felem_diff_128_64(tmp, ftmp2); /* tmp[i] < 2^117 + 2^65 + 2^49 */
1102
0
        felem_reduce(ftmp5, tmp);      /* ftmp5[i] < 2^56 */
1103
1104
        /* ftmp2 = z2 * z2z2 */
1105
0
        felem_mul_reduce(ftmp2, ftmp2, z2); /* ftmp2[i] < 2^56 */
1106
1107
        /* s1 = ftmp6 = y1 * z2**3 */
1108
0
        felem_mul_reduce(ftmp6, y1, ftmp2); /* ftmp6[i] < 2^56 */
1109
25.8k
    } else {
1110
        /*
1111
         * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1112
         */
1113
1114
        /* u1 = ftmp3 = x1*z2z2 */
1115
25.8k
        felem_assign(ftmp3, x1);     /* ftmp3[i] < 2^56 */
1116
1117
        /* ftmp5 = 2*z1z2 */
1118
25.8k
        felem_scalar(ftmp5, z1, 2);  /* ftmp5[i] < 2^57 */
1119
1120
        /* s1 = ftmp6 = y1 * z2**3 */
1121
25.8k
        felem_assign(ftmp6, y1);     /* ftmp6[i] < 2^56 */
1122
25.8k
    }
1123
    /* ftmp3[i] < 2^56, ftmp5[i] < 2^57, ftmp6[i] < 2^56 */
1124
1125
    /* u2 = x2*z1z1 */
1126
25.8k
    felem_mul(tmp, x2, ftmp);        /* tmp[i] < 2^115 */
1127
1128
    /* h = ftmp4 = u2 - u1 */
1129
25.8k
    felem_diff_128_64(tmp, ftmp3);   /* tmp[i] < 2^115 + 2^64 + 2^48 */
1130
25.8k
    felem_reduce(ftmp4, tmp);        /* ftmp[4] < 2^56 */
1131
1132
25.8k
    x_equal = felem_is_zero(ftmp4);
1133
1134
    /* z_out = ftmp5 * h */
1135
25.8k
    felem_mul_reduce(z_out, ftmp5, ftmp4);  /* z_out[i] < 2^56 */
1136
1137
    /* ftmp = z1 * z1z1 */
1138
25.8k
    felem_mul_reduce(ftmp, ftmp, z1);  /* ftmp[i] < 2^56 */
1139
1140
    /* s2 = tmp = y2 * z1**3 */
1141
25.8k
    felem_mul(tmp, y2, ftmp);      /* tmp[i] < 2^115 */
1142
1143
    /* r = ftmp5 = (s2 - s1)*2 */
1144
25.8k
    felem_diff_128_64(tmp, ftmp6); /* tmp[i] < 2^115 + 2^64 + 2^48 */
1145
25.8k
    felem_reduce(ftmp5, tmp);      /* ftmp5[i] < 2^56 */
1146
25.8k
    y_equal = felem_is_zero(ftmp5);
1147
25.8k
    felem_scalar64(ftmp5, 2);      /* ftmp5[i] < 2^57 */
1148
1149
    /*
1150
     * The formulae are incorrect if the points are equal, in affine coordinates
1151
     * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
1152
     * happens.
1153
     *
1154
     * We use bitwise operations to avoid potential side-channels introduced by
1155
     * the short-circuiting behaviour of boolean operators.
1156
     *
1157
     * The special case of either point being the point at infinity (z1 and/or
1158
     * z2 are zero), is handled separately later on in this function, so we
1159
     * avoid jumping to point_double here in those special cases.
1160
     *
1161
     * Notice the comment below on the implications of this branching for timing
1162
     * leaks and why it is considered practically irrelevant.
1163
     */
1164
25.8k
    points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));
1165
1166
25.8k
    if (points_equal) {
1167
        /*
1168
         * This is obviously not constant-time but it will almost-never happen
1169
         * for ECDH / ECDSA.
1170
         */
1171
0
        point_double(x3, y3, z3, x1, y1, z1);
1172
0
        return;
1173
0
    }
1174
1175
    /* I = ftmp = (2h)**2 */
1176
25.8k
    felem_assign(ftmp, ftmp4);        /* ftmp[i] < 2^56 */
1177
25.8k
    felem_scalar64(ftmp, 2);          /* ftmp[i] < 2^57 */
1178
25.8k
    felem_square_reduce(ftmp, ftmp);  /* ftmp[i] < 2^56 */
1179
1180
    /* J = ftmp2 = h * I */
1181
25.8k
    felem_mul_reduce(ftmp2, ftmp4, ftmp); /* ftmp2[i] < 2^56 */
1182
1183
    /* V = ftmp4 = U1 * I */
1184
25.8k
    felem_mul_reduce(ftmp4, ftmp3, ftmp); /* ftmp4[i] < 2^56 */
1185
1186
    /* x_out = r**2 - J - 2V */
1187
25.8k
    felem_square(tmp, ftmp5);      /* tmp[i] < 2^117 */
1188
25.8k
    felem_diff_128_64(tmp, ftmp2); /* tmp[i] < 2^117 + 2^64 + 2^48 */
1189
25.8k
    felem_assign(ftmp3, ftmp4);    /* ftmp3[i] < 2^56 */
1190
25.8k
    felem_scalar64(ftmp4, 2);      /* ftmp4[i] < 2^57 */
1191
25.8k
    felem_diff_128_64(tmp, ftmp4); /* tmp[i] < 2^117 + 2^65 + 2^49 */
1192
25.8k
    felem_reduce(x_out, tmp);      /* x_out[i] < 2^56 */
1193
1194
    /* y_out = r(V-x_out) - 2 * s1 * J */
1195
25.8k
    felem_diff64(ftmp3, x_out);    /* ftmp3[i] < 2^60 + 2^56 + 2^44 */
1196
25.8k
    felem_mul(tmp, ftmp5, ftmp3);  /* tmp[i] < 2^116 */
1197
25.8k
    felem_mul(tmp2, ftmp6, ftmp2); /* tmp2[i] < 2^115 */
1198
25.8k
    felem_scalar128(tmp2, 2);      /* tmp2[i] < 2^116 */
1199
25.8k
    felem_diff128(tmp, tmp2);      /* tmp[i] < 2^127 + 2^116 + 2^111 */
1200
25.8k
    felem_reduce(y_out, tmp);      /* y_out[i] < 2^56 */
1201
1202
25.8k
    copy_conditional(x_out, x2, z1_is_zero);
1203
25.8k
    copy_conditional(x_out, x1, z2_is_zero);
1204
25.8k
    copy_conditional(y_out, y2, z1_is_zero);
1205
25.8k
    copy_conditional(y_out, y1, z2_is_zero);
1206
25.8k
    copy_conditional(z_out, z2, z1_is_zero);
1207
25.8k
    copy_conditional(z_out, z1, z2_is_zero);
1208
25.8k
    felem_assign(x3, x_out);
1209
25.8k
    felem_assign(y3, y_out);
1210
25.8k
    felem_assign(z3, z_out);
1211
25.8k
}
1212
1213
/*-
1214
 * Base point pre computation
1215
 * --------------------------
1216
 *
1217
 * Two different sorts of precomputed tables are used in the following code.
1218
 * Each contain various points on the curve, where each point is three field
1219
 * elements (x, y, z).
1220
 *
1221
 * For the base point table, z is usually 1 (0 for the point at infinity).
1222
 * This table has 16 elements:
1223
 * index | bits    | point
1224
 * ------+---------+------------------------------
1225
 *     0 | 0 0 0 0 | 0G
1226
 *     1 | 0 0 0 1 | 1G
1227
 *     2 | 0 0 1 0 | 2^95G
1228
 *     3 | 0 0 1 1 | (2^95 + 1)G
1229
 *     4 | 0 1 0 0 | 2^190G
1230
 *     5 | 0 1 0 1 | (2^190 + 1)G
1231
 *     6 | 0 1 1 0 | (2^190 + 2^95)G
1232
 *     7 | 0 1 1 1 | (2^190 + 2^95 + 1)G
1233
 *     8 | 1 0 0 0 | 2^285G
1234
 *     9 | 1 0 0 1 | (2^285 + 1)G
1235
 *    10 | 1 0 1 0 | (2^285 + 2^95)G
1236
 *    11 | 1 0 1 1 | (2^285 + 2^95 + 1)G
1237
 *    12 | 1 1 0 0 | (2^285 + 2^190)G
1238
 *    13 | 1 1 0 1 | (2^285 + 2^190 + 1)G
1239
 *    14 | 1 1 1 0 | (2^285 + 2^190 + 2^95)G
1240
 *    15 | 1 1 1 1 | (2^285 + 2^190 + 2^95 + 1)G
1241
 *
1242
 * The reason for this is so that we can clock bits into four different
1243
 * locations when doing simple scalar multiplies against the base point.
1244
 *
1245
 * Tables for other points have table[i] = iG for i in 0 .. 16.
1246
 */
1247
1248
/* gmul is the table of precomputed base points */
1249
static const felem gmul[16][3] = {
1250
{{0, 0, 0, 0, 0, 0, 0},
1251
 {0, 0, 0, 0, 0, 0, 0},
1252
 {0, 0, 0, 0, 0, 0, 0}},
1253
{{0x00545e3872760ab7, 0x00f25dbf55296c3a, 0x00e082542a385502, 0x008ba79b9859f741,
1254
  0x0020ad746e1d3b62, 0x0005378eb1c71ef3, 0x0000aa87ca22be8b},
1255
 {0x00431d7c90ea0e5f, 0x00b1ce1d7e819d7a, 0x0013b5f0b8c00a60, 0x00289a147ce9da31,
1256
  0x0092dc29f8f41dbd, 0x002c6f5d9e98bf92, 0x00003617de4a9626},
1257
 {1, 0, 0, 0, 0, 0, 0}},
1258
{{0x00024711cc902a90, 0x00acb2e579ab4fe1, 0x00af818a4b4d57b1, 0x00a17c7bec49c3de,
1259
  0x004280482d726a8b, 0x00128dd0f0a90f3b, 0x00004387c1c3fa3c},
1260
 {0x002ce76543cf5c3a, 0x00de6cee5ef58f0a, 0x00403e42fa561ca6, 0x00bc54d6f9cb9731,
1261
  0x007155f925fb4ff1, 0x004a9ce731b7b9bc, 0x00002609076bd7b2},
1262
 {1, 0, 0, 0, 0, 0, 0}},
1263
{{0x00e74c9182f0251d, 0x0039bf54bb111974, 0x00b9d2f2eec511d2, 0x0036b1594eb3a6a4,
1264
  0x00ac3bb82d9d564b, 0x00f9313f4615a100, 0x00006716a9a91b10},
1265
 {0x0046698116e2f15c, 0x00f34347067d3d33, 0x008de4ccfdebd002, 0x00e838c6b8e8c97b,
1266
  0x006faf0798def346, 0x007349794a57563c, 0x00002629e7e6ad84},
1267
 {1, 0, 0, 0, 0, 0, 0}},
1268
{{0x0075300e34fd163b, 0x0092e9db4e8d0ad3, 0x00254be9f625f760, 0x00512c518c72ae68,
1269
  0x009bfcf162bede5a, 0x00bf9341566ce311, 0x0000cd6175bd41cf},
1270
 {0x007dfe52af4ac70f, 0x0002159d2d5c4880, 0x00b504d16f0af8d0, 0x0014585e11f5e64c,
1271
  0x0089c6388e030967, 0x00ffb270cbfa5f71, 0x00009a15d92c3947},
1272
 {1, 0, 0, 0, 0, 0, 0}},
1273
{{0x0033fc1278dc4fe5, 0x00d53088c2caa043, 0x0085558827e2db66, 0x00c192bef387b736,
1274
  0x00df6405a2225f2c, 0x0075205aa90fd91a, 0x0000137e3f12349d},
1275
 {0x00ce5b115efcb07e, 0x00abc3308410deeb, 0x005dc6fc1de39904, 0x00907c1c496f36b4,
1276
  0x0008e6ad3926cbe1, 0x00110747b787928c, 0x0000021b9162eb7e},
1277
 {1, 0, 0, 0, 0, 0, 0}},
1278
{{0x008180042cfa26e1, 0x007b826a96254967, 0x0082473694d6b194, 0x007bd6880a45b589,
1279
  0x00c0a5097072d1a3, 0x0019186555e18b4e, 0x000020278190e5ca},
1280
 {0x00b4bef17de61ac0, 0x009535e3c38ed348, 0x002d4aa8e468ceab, 0x00ef40b431036ad3,
1281
  0x00defd52f4542857, 0x0086edbf98234266, 0x00002025b3a7814d},
1282
 {1, 0, 0, 0, 0, 0, 0}},
1283
{{0x00b238aa97b886be, 0x00ef3192d6dd3a32, 0x0079f9e01fd62df8, 0x00742e890daba6c5,
1284
  0x008e5289144408ce, 0x0073bbcc8e0171a5, 0x0000c4fd329d3b52},
1285
 {0x00c6f64a15ee23e7, 0x00dcfb7b171cad8b, 0x00039f6cbd805867, 0x00de024e428d4562,
1286
  0x00be6a594d7c64c5, 0x0078467b70dbcd64, 0x0000251f2ed7079b},
1287
 {1, 0, 0, 0, 0, 0, 0}},
1288
{{0x000e5cc25fc4b872, 0x005ebf10d31ef4e1, 0x0061e0ebd11e8256, 0x0076e026096f5a27,
1289
  0x0013e6fc44662e9a, 0x0042b00289d3597e, 0x000024f089170d88},
1290
 {0x001604d7e0effbe6, 0x0048d77cba64ec2c, 0x008166b16da19e36, 0x006b0d1a0f28c088,
1291
  0x000259fcd47754fd, 0x00cc643e4d725f9a, 0x00007b10f3c79c14},
1292
 {1, 0, 0, 0, 0, 0, 0}},
1293
{{0x00430155e3b908af, 0x00b801e4fec25226, 0x00b0d4bcfe806d26, 0x009fc4014eb13d37,
1294
  0x0066c94e44ec07e8, 0x00d16adc03874ba2, 0x000030c917a0d2a7},
1295
 {0x00edac9e21eb891c, 0x00ef0fb768102eff, 0x00c088cef272a5f3, 0x00cbf782134e2964,
1296
  0x0001044a7ba9a0e3, 0x00e363f5b194cf3c, 0x00009ce85249e372},
1297
 {1, 0, 0, 0, 0, 0, 0}},
1298
{{0x001dd492dda5a7eb, 0x008fd577be539fd1, 0x002ff4b25a5fc3f1, 0x0074a8a1b64df72f,
1299
  0x002ba3d8c204a76c, 0x009d5cff95c8235a, 0x0000e014b9406e0f},
1300
 {0x008c2e4dbfc98aba, 0x00f30bb89f1a1436, 0x00b46f7aea3e259c, 0x009224454ac02f54,
1301
  0x00906401f5645fa2, 0x003a1d1940eabc77, 0x00007c9351d680e6},
1302
 {1, 0, 0, 0, 0, 0, 0}},
1303
{{0x005a35d872ef967c, 0x0049f1b7884e1987, 0x0059d46d7e31f552, 0x00ceb4869d2d0fb6,
1304
  0x00e8e89eee56802a, 0x0049d806a774aaf2, 0x0000147e2af0ae24},
1305
 {0x005fd1bd852c6e5e, 0x00b674b7b3de6885, 0x003b9ea5eb9b6c08, 0x005c9f03babf3ef7,
1306
  0x00605337fecab3c7, 0x009a3f85b11bbcc8, 0x0000455470f330ec},
1307
 {1, 0, 0, 0, 0, 0, 0}},
1308
{{0x002197ff4d55498d, 0x00383e8916c2d8af, 0x00eb203f34d1c6d2, 0x0080367cbd11b542,
1309
  0x00769b3be864e4f5, 0x0081a8458521c7bb, 0x0000c531b34d3539},
1310
 {0x00e2a3d775fa2e13, 0x00534fc379573844, 0x00ff237d2a8db54a, 0x00d301b2335a8882,
1311
  0x000f75ea96103a80, 0x0018fecb3cdd96fa, 0x0000304bf61e94eb},
1312
 {1, 0, 0, 0, 0, 0, 0}},
1313
{{0x00b2afc332a73dbd, 0x0029a0d5bb007bc5, 0x002d628eb210f577, 0x009f59a36dd05f50,
1314
  0x006d339de4eca613, 0x00c75a71addc86bc, 0x000060384c5ea93c},
1315
 {0x00aa9641c32a30b4, 0x00cc73ae8cce565d, 0x00ec911a4df07f61, 0x00aa4b762ea4b264,
1316
  0x0096d395bb393629, 0x004efacfb7632fe0, 0x00006f252f46fa3f},
1317
 {1, 0, 0, 0, 0, 0, 0}},
1318
{{0x00567eec597c7af6, 0x0059ba6795204413, 0x00816d4e6f01196f, 0x004ae6b3eb57951d,
1319
  0x00420f5abdda2108, 0x003401d1f57ca9d9, 0x0000cf5837b0b67a},
1320
 {0x00eaa64b8aeeabf9, 0x00246ddf16bcb4de, 0x000e7e3c3aecd751, 0x0008449f04fed72e,
1321
  0x00307b67ccf09183, 0x0017108c3556b7b1, 0x0000229b2483b3bf},
1322
 {1, 0, 0, 0, 0, 0, 0}},
1323
{{0x00e7c491a7bb78a1, 0x00eafddd1d3049ab, 0x00352c05e2bc7c98, 0x003d6880c165fa5c,
1324
  0x00b6ac61cc11c97d, 0x00beeb54fcf90ce5, 0x0000dc1f0b455edc},
1325
 {0x002db2e7aee34d60, 0x0073b5f415a2d8c0, 0x00dd84e4193e9a0c, 0x00d02d873467c572,
1326
  0x0018baaeda60aee5, 0x0013fb11d697c61e, 0x000083aafcc3a973},
1327
 {1, 0, 0, 0, 0, 0, 0}}
1328
};
1329
1330
/*
1331
 * select_point selects the |idx|th point from a precomputation table and
1332
 * copies it to out.
1333
 *
1334
 * pre_comp below is of the size provided in |size|.
1335
 */
1336
static void select_point(const limb idx, unsigned int size,
1337
                         const felem pre_comp[][3], felem out[3])
1338
26.1k
{
1339
26.1k
    unsigned int i, j;
1340
26.1k
    limb *outlimbs = &out[0][0];
1341
1342
26.1k
    memset(out, 0, sizeof(*out) * 3);
1343
1344
444k
    for (i = 0; i < size; i++) {
1345
418k
        const limb *inlimbs = &pre_comp[i][0][0];
1346
418k
        limb mask = i ^ idx;
1347
1348
418k
        mask |= mask >> 4;
1349
418k
        mask |= mask >> 2;
1350
418k
        mask |= mask >> 1;
1351
418k
        mask &= 1;
1352
418k
        mask--;
1353
9.19M
        for (j = 0; j < NLIMBS * 3; j++)
1354
8.78M
            outlimbs[j] |= inlimbs[j] & mask;
1355
418k
    }
1356
26.1k
}
1357
1358
/* get_bit returns the |i|th bit in |in| */
1359
static char get_bit(const felem_bytearray in, int i)
1360
101k
{
1361
101k
    if (i < 0 || i >= 384)
1362
0
        return 0;
1363
101k
    return (in[i >> 3] >> (i & 7)) & 1;
1364
101k
}
1365
1366
/*
1367
 * Interleaved point multiplication using precomputed point multiples: The
1368
 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1369
 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1370
 * generator, using certain (large) precomputed multiples in g_pre_comp.
1371
 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1372
 */
1373
static void batch_mul(felem x_out, felem y_out, felem z_out,
1374
                      const felem_bytearray scalars[],
1375
                      const unsigned int num_points, const u8 *g_scalar,
1376
                      const int mixed, const felem pre_comp[][17][3],
1377
                      const felem g_pre_comp[16][3])
1378
264
{
1379
264
    int i, skip;
1380
264
    unsigned int num, gen_mul = (g_scalar != NULL);
1381
264
    felem nq[3], tmp[4];
1382
264
    limb bits;
1383
264
    u8 sign, digit;
1384
1385
    /* set nq to the point at infinity */
1386
264
    memset(nq, 0, sizeof(nq));
1387
1388
    /*
1389
     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1390
     * of the generator (last quarter of rounds) and additions of other
1391
     * points multiples (every 5th round).
1392
     */
1393
264
    skip = 1;                   /* save two point operations in the first
1394
                                 * round */
1395
26.4k
    for (i = (num_points ? 380 : 98); i >= 0; --i) {
1396
        /* double */
1397
26.1k
        if (!skip)
1398
25.8k
            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1399
1400
        /* add multiples of the generator */
1401
26.1k
        if (gen_mul && (i <= 98)) {
1402
26.1k
            bits = get_bit(g_scalar, i + 285) << 3;
1403
26.1k
            if (i < 95) {
1404
25.0k
                bits |= get_bit(g_scalar, i + 190) << 2;
1405
25.0k
                bits |= get_bit(g_scalar, i + 95) << 1;
1406
25.0k
                bits |= get_bit(g_scalar, i);
1407
25.0k
            }
1408
            /* select the point to add, in constant time */
1409
26.1k
            select_point(bits, 16, g_pre_comp, tmp);
1410
26.1k
            if (!skip) {
1411
                /* The 1 argument below is for "mixed" */
1412
25.8k
                point_add(nq[0],  nq[1],  nq[2],
1413
25.8k
                          nq[0],  nq[1],  nq[2], 1,
1414
25.8k
                          tmp[0], tmp[1], tmp[2]);
1415
25.8k
            } else {
1416
264
                memcpy(nq, tmp, 3 * sizeof(felem));
1417
264
                skip = 0;
1418
264
            }
1419
26.1k
        }
1420
1421
        /* do other additions every 5 doublings */
1422
26.1k
        if (num_points && (i % 5 == 0)) {
1423
            /* loop over all scalars */
1424
0
            for (num = 0; num < num_points; ++num) {
1425
0
                bits = get_bit(scalars[num], i + 4) << 5;
1426
0
                bits |= get_bit(scalars[num], i + 3) << 4;
1427
0
                bits |= get_bit(scalars[num], i + 2) << 3;
1428
0
                bits |= get_bit(scalars[num], i + 1) << 2;
1429
0
                bits |= get_bit(scalars[num], i) << 1;
1430
0
                bits |= get_bit(scalars[num], i - 1);
1431
0
                ossl_ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1432
1433
                /*
1434
                 * select the point to add or subtract, in constant time
1435
                 */
1436
0
                select_point(digit, 17, pre_comp[num], tmp);
1437
0
                felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1438
                                            * point */
1439
0
                copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1440
1441
0
                if (!skip) {
1442
0
                    point_add(nq[0],  nq[1],  nq[2],
1443
0
                              nq[0],  nq[1],  nq[2], mixed,
1444
0
                              tmp[0], tmp[1], tmp[2]);
1445
0
                } else {
1446
0
                    memcpy(nq, tmp, 3 * sizeof(felem));
1447
0
                    skip = 0;
1448
0
                }
1449
0
            }
1450
0
        }
1451
26.1k
    }
1452
264
    felem_assign(x_out, nq[0]);
1453
264
    felem_assign(y_out, nq[1]);
1454
264
    felem_assign(z_out, nq[2]);
1455
264
}
1456
1457
/* Precomputation for the group generator. */
1458
struct nistp384_pre_comp_st {
1459
    felem g_pre_comp[16][3];
1460
    CRYPTO_REF_COUNT references;
1461
};
1462
1463
const EC_METHOD *ossl_ec_GFp_nistp384_method(void)
1464
264
{
1465
264
    static const EC_METHOD ret = {
1466
264
        EC_FLAGS_DEFAULT_OCT,
1467
264
        NID_X9_62_prime_field,
1468
264
        ossl_ec_GFp_nistp384_group_init,
1469
264
        ossl_ec_GFp_simple_group_finish,
1470
264
        ossl_ec_GFp_simple_group_clear_finish,
1471
264
        ossl_ec_GFp_nist_group_copy,
1472
264
        ossl_ec_GFp_nistp384_group_set_curve,
1473
264
        ossl_ec_GFp_simple_group_get_curve,
1474
264
        ossl_ec_GFp_simple_group_get_degree,
1475
264
        ossl_ec_group_simple_order_bits,
1476
264
        ossl_ec_GFp_simple_group_check_discriminant,
1477
264
        ossl_ec_GFp_simple_point_init,
1478
264
        ossl_ec_GFp_simple_point_finish,
1479
264
        ossl_ec_GFp_simple_point_clear_finish,
1480
264
        ossl_ec_GFp_simple_point_copy,
1481
264
        ossl_ec_GFp_simple_point_set_to_infinity,
1482
264
        ossl_ec_GFp_simple_point_set_affine_coordinates,
1483
264
        ossl_ec_GFp_nistp384_point_get_affine_coordinates,
1484
264
        0, /* point_set_compressed_coordinates */
1485
264
        0, /* point2oct */
1486
264
        0, /* oct2point */
1487
264
        ossl_ec_GFp_simple_add,
1488
264
        ossl_ec_GFp_simple_dbl,
1489
264
        ossl_ec_GFp_simple_invert,
1490
264
        ossl_ec_GFp_simple_is_at_infinity,
1491
264
        ossl_ec_GFp_simple_is_on_curve,
1492
264
        ossl_ec_GFp_simple_cmp,
1493
264
        ossl_ec_GFp_simple_make_affine,
1494
264
        ossl_ec_GFp_simple_points_make_affine,
1495
264
        ossl_ec_GFp_nistp384_points_mul,
1496
264
        ossl_ec_GFp_nistp384_precompute_mult,
1497
264
        ossl_ec_GFp_nistp384_have_precompute_mult,
1498
264
        ossl_ec_GFp_nist_field_mul,
1499
264
        ossl_ec_GFp_nist_field_sqr,
1500
264
        0, /* field_div */
1501
264
        ossl_ec_GFp_simple_field_inv,
1502
264
        0, /* field_encode */
1503
264
        0, /* field_decode */
1504
264
        0, /* field_set_to_one */
1505
264
        ossl_ec_key_simple_priv2oct,
1506
264
        ossl_ec_key_simple_oct2priv,
1507
264
        0, /* set private */
1508
264
        ossl_ec_key_simple_generate_key,
1509
264
        ossl_ec_key_simple_check_key,
1510
264
        ossl_ec_key_simple_generate_public_key,
1511
264
        0, /* keycopy */
1512
264
        0, /* keyfinish */
1513
264
        ossl_ecdh_simple_compute_key,
1514
264
        ossl_ecdsa_simple_sign_setup,
1515
264
        ossl_ecdsa_simple_sign_sig,
1516
264
        ossl_ecdsa_simple_verify_sig,
1517
264
        0, /* field_inverse_mod_ord */
1518
264
        0, /* blind_coordinates */
1519
264
        0, /* ladder_pre */
1520
264
        0, /* ladder_step */
1521
264
        0  /* ladder_post */
1522
264
    };
1523
1524
264
    return &ret;
1525
264
}
1526
1527
/******************************************************************************/
1528
/*
1529
 * FUNCTIONS TO MANAGE PRECOMPUTATION
1530
 */
1531
1532
static NISTP384_PRE_COMP *nistp384_pre_comp_new(void)
1533
0
{
1534
0
    NISTP384_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1535
1536
0
    if (ret == NULL)
1537
0
        return ret;
1538
1539
0
    if (!CRYPTO_NEW_REF(&ret->references, 1)) {
1540
0
        OPENSSL_free(ret);
1541
0
        return NULL;
1542
0
    }
1543
0
    return ret;
1544
0
}
1545
1546
NISTP384_PRE_COMP *ossl_ec_nistp384_pre_comp_dup(NISTP384_PRE_COMP *p)
1547
0
{
1548
0
    int i;
1549
1550
0
    if (p != NULL)
1551
0
        CRYPTO_UP_REF(&p->references, &i);
1552
0
    return p;
1553
0
}
1554
1555
void ossl_ec_nistp384_pre_comp_free(NISTP384_PRE_COMP *p)
1556
0
{
1557
0
    int i;
1558
1559
0
    if (p == NULL)
1560
0
        return;
1561
1562
0
    CRYPTO_DOWN_REF(&p->references, &i);
1563
0
    REF_PRINT_COUNT("ossl_ec_nistp384", p);
1564
0
    if (i > 0)
1565
0
        return;
1566
0
    REF_ASSERT_ISNT(i < 0);
1567
1568
0
    CRYPTO_FREE_REF(&p->references);
1569
0
    OPENSSL_free(p);
1570
0
}
1571
1572
/******************************************************************************/
1573
/*
1574
 * OPENSSL EC_METHOD FUNCTIONS
1575
 */
1576
1577
int ossl_ec_GFp_nistp384_group_init(EC_GROUP *group)
1578
637
{
1579
637
    int ret;
1580
1581
637
    ret = ossl_ec_GFp_simple_group_init(group);
1582
637
    group->a_is_minus3 = 1;
1583
637
    return ret;
1584
637
}
1585
1586
int ossl_ec_GFp_nistp384_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1587
                                         const BIGNUM *a, const BIGNUM *b,
1588
                                         BN_CTX *ctx)
1589
264
{
1590
264
    int ret = 0;
1591
264
    BIGNUM *curve_p, *curve_a, *curve_b;
1592
264
#ifndef FIPS_MODULE
1593
264
    BN_CTX *new_ctx = NULL;
1594
1595
264
    if (ctx == NULL)
1596
0
        ctx = new_ctx = BN_CTX_new();
1597
264
#endif
1598
264
    if (ctx == NULL)
1599
0
        return 0;
1600
1601
264
    BN_CTX_start(ctx);
1602
264
    curve_p = BN_CTX_get(ctx);
1603
264
    curve_a = BN_CTX_get(ctx);
1604
264
    curve_b = BN_CTX_get(ctx);
1605
264
    if (curve_b == NULL)
1606
0
        goto err;
1607
264
    BN_bin2bn(nistp384_curve_params[0], sizeof(felem_bytearray), curve_p);
1608
264
    BN_bin2bn(nistp384_curve_params[1], sizeof(felem_bytearray), curve_a);
1609
264
    BN_bin2bn(nistp384_curve_params[2], sizeof(felem_bytearray), curve_b);
1610
264
    if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1611
0
        ERR_raise(ERR_LIB_EC, EC_R_WRONG_CURVE_PARAMETERS);
1612
0
        goto err;
1613
0
    }
1614
264
    group->field_mod_func = BN_nist_mod_384;
1615
264
    ret = ossl_ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1616
264
 err:
1617
264
    BN_CTX_end(ctx);
1618
264
#ifndef FIPS_MODULE
1619
264
    BN_CTX_free(new_ctx);
1620
264
#endif
1621
264
    return ret;
1622
264
}
1623
1624
/*
1625
 * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1626
 * (X/Z^2, Y/Z^3)
1627
 */
1628
int ossl_ec_GFp_nistp384_point_get_affine_coordinates(const EC_GROUP *group,
1629
                                                      const EC_POINT *point,
1630
                                                      BIGNUM *x, BIGNUM *y,
1631
                                                      BN_CTX *ctx)
1632
264
{
1633
264
    felem z1, z2, x_in, y_in, x_out, y_out;
1634
264
    widefelem tmp;
1635
1636
264
    if (EC_POINT_is_at_infinity(group, point)) {
1637
0
        ERR_raise(ERR_LIB_EC, EC_R_POINT_AT_INFINITY);
1638
0
        return 0;
1639
0
    }
1640
264
    if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1641
264
        (!BN_to_felem(z1, point->Z)))
1642
0
        return 0;
1643
264
    felem_inv(z2, z1);
1644
264
    felem_square(tmp, z2);
1645
264
    felem_reduce(z1, tmp);
1646
264
    felem_mul(tmp, x_in, z1);
1647
264
    felem_reduce(x_in, tmp);
1648
264
    felem_contract(x_out, x_in);
1649
264
    if (x != NULL) {
1650
264
        if (!felem_to_BN(x, x_out)) {
1651
0
            ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1652
0
            return 0;
1653
0
        }
1654
264
    }
1655
264
    felem_mul(tmp, z1, z2);
1656
264
    felem_reduce(z1, tmp);
1657
264
    felem_mul(tmp, y_in, z1);
1658
264
    felem_reduce(y_in, tmp);
1659
264
    felem_contract(y_out, y_in);
1660
264
    if (y != NULL) {
1661
264
        if (!felem_to_BN(y, y_out)) {
1662
0
            ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1663
0
            return 0;
1664
0
        }
1665
264
    }
1666
264
    return 1;
1667
264
}
1668
1669
/* points below is of size |num|, and tmp_felems is of size |num+1/ */
1670
static void make_points_affine(size_t num, felem points[][3],
1671
                               felem tmp_felems[])
1672
0
{
1673
    /*
1674
     * Runs in constant time, unless an input is the point at infinity (which
1675
     * normally shouldn't happen).
1676
     */
1677
0
    ossl_ec_GFp_nistp_points_make_affine_internal(num,
1678
0
                                                  points,
1679
0
                                                  sizeof(felem),
1680
0
                                                  tmp_felems,
1681
0
                                                  (void (*)(void *))felem_one,
1682
0
                                                  felem_is_zero_int,
1683
0
                                                  (void (*)(void *, const void *))
1684
0
                                                  felem_assign,
1685
0
                                                  (void (*)(void *, const void *))
1686
0
                                                  felem_square_reduce,
1687
0
                                                  (void (*)(void *, const void *, const void*))
1688
0
                                                  felem_mul_reduce,
1689
0
                                                  (void (*)(void *, const void *))
1690
0
                                                  felem_inv,
1691
0
                                                  (void (*)(void *, const void *))
1692
0
                                                  felem_contract);
1693
0
}
1694
1695
/*
1696
 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1697
 * values Result is stored in r (r can equal one of the inputs).
1698
 */
1699
int ossl_ec_GFp_nistp384_points_mul(const EC_GROUP *group, EC_POINT *r,
1700
                                    const BIGNUM *scalar, size_t num,
1701
                                    const EC_POINT *points[],
1702
                                    const BIGNUM *scalars[], BN_CTX *ctx)
1703
264
{
1704
264
    int ret = 0;
1705
264
    int j;
1706
264
    int mixed = 0;
1707
264
    BIGNUM *x, *y, *z, *tmp_scalar;
1708
264
    felem_bytearray g_secret;
1709
264
    felem_bytearray *secrets = NULL;
1710
264
    felem (*pre_comp)[17][3] = NULL;
1711
264
    felem *tmp_felems = NULL;
1712
264
    unsigned int i;
1713
264
    int num_bytes;
1714
264
    int have_pre_comp = 0;
1715
264
    size_t num_points = num;
1716
264
    felem x_in, y_in, z_in, x_out, y_out, z_out;
1717
264
    NISTP384_PRE_COMP *pre = NULL;
1718
264
    felem(*g_pre_comp)[3] = NULL;
1719
264
    EC_POINT *generator = NULL;
1720
264
    const EC_POINT *p = NULL;
1721
264
    const BIGNUM *p_scalar = NULL;
1722
1723
264
    BN_CTX_start(ctx);
1724
264
    x = BN_CTX_get(ctx);
1725
264
    y = BN_CTX_get(ctx);
1726
264
    z = BN_CTX_get(ctx);
1727
264
    tmp_scalar = BN_CTX_get(ctx);
1728
264
    if (tmp_scalar == NULL)
1729
0
        goto err;
1730
1731
264
    if (scalar != NULL) {
1732
264
        pre = group->pre_comp.nistp384;
1733
264
        if (pre)
1734
            /* we have precomputation, try to use it */
1735
0
            g_pre_comp = &pre->g_pre_comp[0];
1736
264
        else
1737
            /* try to use the standard precomputation */
1738
264
            g_pre_comp = (felem(*)[3]) gmul;
1739
264
        generator = EC_POINT_new(group);
1740
264
        if (generator == NULL)
1741
0
            goto err;
1742
        /* get the generator from precomputation */
1743
264
        if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1744
264
            !felem_to_BN(y, g_pre_comp[1][1]) ||
1745
264
            !felem_to_BN(z, g_pre_comp[1][2])) {
1746
0
            ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1747
0
            goto err;
1748
0
        }
1749
264
        if (!ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group,
1750
264
                                                                generator,
1751
264
                                                                x, y, z, ctx))
1752
0
            goto err;
1753
264
        if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1754
            /* precomputation matches generator */
1755
264
            have_pre_comp = 1;
1756
0
        else
1757
            /*
1758
             * we don't have valid precomputation: treat the generator as a
1759
             * random point
1760
             */
1761
0
            num_points++;
1762
264
    }
1763
1764
264
    if (num_points > 0) {
1765
0
        if (num_points >= 2) {
1766
            /*
1767
             * unless we precompute multiples for just one point, converting
1768
             * those into affine form is time well spent
1769
             */
1770
0
            mixed = 1;
1771
0
        }
1772
0
        secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1773
0
        pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1774
0
        if (mixed)
1775
0
            tmp_felems =
1776
0
                OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1));
1777
0
        if ((secrets == NULL) || (pre_comp == NULL)
1778
0
            || (mixed && (tmp_felems == NULL)))
1779
0
            goto err;
1780
1781
        /*
1782
         * we treat NULL scalars as 0, and NULL points as points at infinity,
1783
         * i.e., they contribute nothing to the linear combination
1784
         */
1785
0
        for (i = 0; i < num_points; ++i) {
1786
0
            if (i == num) {
1787
                /*
1788
                 * we didn't have a valid precomputation, so we pick the
1789
                 * generator
1790
                 */
1791
0
                p = EC_GROUP_get0_generator(group);
1792
0
                p_scalar = scalar;
1793
0
            } else {
1794
                /* the i^th point */
1795
0
                p = points[i];
1796
0
                p_scalar = scalars[i];
1797
0
            }
1798
0
            if (p_scalar != NULL && p != NULL) {
1799
                /* reduce scalar to 0 <= scalar < 2^384 */
1800
0
                if ((BN_num_bits(p_scalar) > 384)
1801
0
                    || (BN_is_negative(p_scalar))) {
1802
                    /*
1803
                     * this is an unusual input, and we don't guarantee
1804
                     * constant-timeness
1805
                     */
1806
0
                    if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1807
0
                        ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1808
0
                        goto err;
1809
0
                    }
1810
0
                    num_bytes = BN_bn2lebinpad(tmp_scalar,
1811
0
                                               secrets[i], sizeof(secrets[i]));
1812
0
                } else {
1813
0
                    num_bytes = BN_bn2lebinpad(p_scalar,
1814
0
                                               secrets[i], sizeof(secrets[i]));
1815
0
                }
1816
0
                if (num_bytes < 0) {
1817
0
                    ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1818
0
                    goto err;
1819
0
                }
1820
                /* precompute multiples */
1821
0
                if ((!BN_to_felem(x_out, p->X)) ||
1822
0
                    (!BN_to_felem(y_out, p->Y)) ||
1823
0
                    (!BN_to_felem(z_out, p->Z)))
1824
0
                    goto err;
1825
0
                memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1826
0
                memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1827
0
                memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1828
0
                for (j = 2; j <= 16; ++j) {
1829
0
                    if (j & 1) {
1830
0
                        point_add(pre_comp[i][j][0],     pre_comp[i][j][1],     pre_comp[i][j][2],
1831
0
                                  pre_comp[i][1][0],     pre_comp[i][1][1],     pre_comp[i][1][2], 0,
1832
0
                                  pre_comp[i][j - 1][0], pre_comp[i][j - 1][1], pre_comp[i][j - 1][2]);
1833
0
                    } else {
1834
0
                        point_double(pre_comp[i][j][0],     pre_comp[i][j][1],     pre_comp[i][j][2],
1835
0
                                     pre_comp[i][j / 2][0], pre_comp[i][j / 2][1], pre_comp[i][j / 2][2]);
1836
0
                    }
1837
0
                }
1838
0
            }
1839
0
        }
1840
0
        if (mixed)
1841
0
            make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1842
0
    }
1843
1844
    /* the scalar for the generator */
1845
264
    if (scalar != NULL && have_pre_comp) {
1846
264
        memset(g_secret, 0, sizeof(g_secret));
1847
        /* reduce scalar to 0 <= scalar < 2^384 */
1848
264
        if ((BN_num_bits(scalar) > 384) || (BN_is_negative(scalar))) {
1849
            /*
1850
             * this is an unusual input, and we don't guarantee
1851
             * constant-timeness
1852
             */
1853
0
            if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
1854
0
                ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1855
0
                goto err;
1856
0
            }
1857
0
            num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
1858
264
        } else {
1859
264
            num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
1860
264
        }
1861
        /* do the multiplication with generator precomputation */
1862
264
        batch_mul(x_out, y_out, z_out,
1863
264
                  (const felem_bytearray(*))secrets, num_points,
1864
264
                  g_secret,
1865
264
                  mixed, (const felem(*)[17][3])pre_comp,
1866
264
                  (const felem(*)[3])g_pre_comp);
1867
264
    } else {
1868
        /* do the multiplication without generator precomputation */
1869
0
        batch_mul(x_out, y_out, z_out,
1870
0
                  (const felem_bytearray(*))secrets, num_points,
1871
0
                  NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1872
0
    }
1873
    /* reduce the output to its unique minimal representation */
1874
264
    felem_contract(x_in, x_out);
1875
264
    felem_contract(y_in, y_out);
1876
264
    felem_contract(z_in, z_out);
1877
264
    if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1878
264
        (!felem_to_BN(z, z_in))) {
1879
0
        ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1880
0
        goto err;
1881
0
    }
1882
264
    ret = ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group, r, x, y, z,
1883
264
                                                             ctx);
1884
1885
264
 err:
1886
264
    BN_CTX_end(ctx);
1887
264
    EC_POINT_free(generator);
1888
264
    OPENSSL_free(secrets);
1889
264
    OPENSSL_free(pre_comp);
1890
264
    OPENSSL_free(tmp_felems);
1891
264
    return ret;
1892
264
}
1893
1894
int ossl_ec_GFp_nistp384_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
1895
0
{
1896
0
    int ret = 0;
1897
0
    NISTP384_PRE_COMP *pre = NULL;
1898
0
    int i, j;
1899
0
    BIGNUM *x, *y;
1900
0
    EC_POINT *generator = NULL;
1901
0
    felem tmp_felems[16];
1902
0
#ifndef FIPS_MODULE
1903
0
    BN_CTX *new_ctx = NULL;
1904
0
#endif
1905
1906
    /* throw away old precomputation */
1907
0
    EC_pre_comp_free(group);
1908
1909
0
#ifndef FIPS_MODULE
1910
0
    if (ctx == NULL)
1911
0
        ctx = new_ctx = BN_CTX_new();
1912
0
#endif
1913
0
    if (ctx == NULL)
1914
0
        return 0;
1915
1916
0
    BN_CTX_start(ctx);
1917
0
    x = BN_CTX_get(ctx);
1918
0
    y = BN_CTX_get(ctx);
1919
0
    if (y == NULL)
1920
0
        goto err;
1921
    /* get the generator */
1922
0
    if (group->generator == NULL)
1923
0
        goto err;
1924
0
    generator = EC_POINT_new(group);
1925
0
    if (generator == NULL)
1926
0
        goto err;
1927
0
    BN_bin2bn(nistp384_curve_params[3], sizeof(felem_bytearray), x);
1928
0
    BN_bin2bn(nistp384_curve_params[4], sizeof(felem_bytearray), y);
1929
0
    if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
1930
0
        goto err;
1931
0
    if ((pre = nistp384_pre_comp_new()) == NULL)
1932
0
        goto err;
1933
    /*
1934
     * if the generator is the standard one, use built-in precomputation
1935
     */
1936
0
    if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
1937
0
        memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
1938
0
        goto done;
1939
0
    }
1940
0
    if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) ||
1941
0
        (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) ||
1942
0
        (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
1943
0
        goto err;
1944
    /* compute 2^95*G, 2^190*G, 2^285*G */
1945
0
    for (i = 1; i <= 4; i <<= 1) {
1946
0
        point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2],
1947
0
                     pre->g_pre_comp[i][0],  pre->g_pre_comp[i][1],    pre->g_pre_comp[i][2]);
1948
0
        for (j = 0; j < 94; ++j) {
1949
0
            point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2],
1950
0
                         pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2]);
1951
0
        }
1952
0
    }
1953
    /* g_pre_comp[0] is the point at infinity */
1954
0
    memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
1955
    /* the remaining multiples */
1956
    /* 2^95*G + 2^190*G */
1957
0
    point_add(pre->g_pre_comp[6][0],  pre->g_pre_comp[6][1],  pre->g_pre_comp[6][2],
1958
0
              pre->g_pre_comp[4][0],  pre->g_pre_comp[4][1],  pre->g_pre_comp[4][2], 0,
1959
0
              pre->g_pre_comp[2][0],  pre->g_pre_comp[2][1],  pre->g_pre_comp[2][2]);
1960
    /* 2^95*G + 2^285*G */
1961
0
    point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1], pre->g_pre_comp[10][2],
1962
0
              pre->g_pre_comp[8][0],  pre->g_pre_comp[8][1],  pre->g_pre_comp[8][2], 0,
1963
0
              pre->g_pre_comp[2][0],  pre->g_pre_comp[2][1],  pre->g_pre_comp[2][2]);
1964
    /* 2^190*G + 2^285*G */
1965
0
    point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
1966
0
              pre->g_pre_comp[8][0],  pre->g_pre_comp[8][1],  pre->g_pre_comp[8][2], 0,
1967
0
              pre->g_pre_comp[4][0],  pre->g_pre_comp[4][1],  pre->g_pre_comp[4][2]);
1968
    /* 2^95*G + 2^190*G + 2^285*G */
1969
0
    point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1], pre->g_pre_comp[14][2],
1970
0
              pre->g_pre_comp[12][0], pre->g_pre_comp[12][1], pre->g_pre_comp[12][2], 0,
1971
0
              pre->g_pre_comp[2][0],  pre->g_pre_comp[2][1],  pre->g_pre_comp[2][2]);
1972
0
    for (i = 1; i < 8; ++i) {
1973
        /* odd multiples: add G */
1974
0
        point_add(pre->g_pre_comp[2 * i + 1][0], pre->g_pre_comp[2 * i + 1][1], pre->g_pre_comp[2 * i + 1][2],
1975
0
                  pre->g_pre_comp[2 * i][0],     pre->g_pre_comp[2 * i][1],     pre->g_pre_comp[2 * i][2], 0,
1976
0
                  pre->g_pre_comp[1][0],         pre->g_pre_comp[1][1],         pre->g_pre_comp[1][2]);
1977
0
    }
1978
0
    make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
1979
1980
0
 done:
1981
0
    SETPRECOMP(group, nistp384, pre);
1982
0
    ret = 1;
1983
0
    pre = NULL;
1984
0
 err:
1985
0
    BN_CTX_end(ctx);
1986
0
    EC_POINT_free(generator);
1987
0
#ifndef FIPS_MODULE
1988
0
    BN_CTX_free(new_ctx);
1989
0
#endif
1990
0
    ossl_ec_nistp384_pre_comp_free(pre);
1991
0
    return ret;
1992
0
}
1993
1994
int ossl_ec_GFp_nistp384_have_precompute_mult(const EC_GROUP *group)
1995
0
{
1996
0
    return HAVEPRECOMP(group, nistp384);
1997
0
}