Coverage Report

Created: 2026-05-30 06:56

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/openssl/crypto/ec/ecp_nistp521.c
Line
Count
Source
1
/*
2
 * Copyright 2011-2025 The OpenSSL Project Authors. All Rights Reserved.
3
 *
4
 * Licensed under the Apache License 2.0 (the "License").  You may not use
5
 * this file except in compliance with the License.  You can obtain a copy
6
 * in the file LICENSE in the source distribution or at
7
 * https://www.openssl.org/source/license.html
8
 */
9
10
/* Copyright 2011 Google Inc.
11
 *
12
 * Licensed under the Apache License, Version 2.0 (the "License");
13
 *
14
 * you may not use this file except in compliance with the License.
15
 * You may obtain a copy of the License at
16
 *
17
 *     http://www.apache.org/licenses/LICENSE-2.0
18
 *
19
 *  Unless required by applicable law or agreed to in writing, software
20
 *  distributed under the License is distributed on an "AS IS" BASIS,
21
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22
 *  See the License for the specific language governing permissions and
23
 *  limitations under the License.
24
 */
25
26
/*
27
 * ECDSA low level APIs are deprecated for public use, but still ok for
28
 * internal use.
29
 */
30
#include "internal/deprecated.h"
31
32
/*
33
 * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
34
 *
35
 * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
36
 * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
37
 * work which got its smarts from Daniel J. Bernstein's work on the same.
38
 */
39
40
#include <openssl/e_os2.h>
41
42
#include <string.h>
43
#include <openssl/err.h>
44
#include "ec_local.h"
45
46
#include "internal/numbers.h"
47
48
#ifndef INT128_MAX
49
#error "Your compiler doesn't appear to support 128-bit integer types"
50
#endif
51
52
/*
53
 * The underlying field. P521 operates over GF(2^521-1). We can serialize an
54
 * element of this field into 66 bytes where the most significant byte
55
 * contains only a single bit. We call this an felem_bytearray.
56
 */
57
58
typedef uint8_t felem_bytearray[66];
59
60
/*
61
 * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
62
 * These values are big-endian.
63
 */
64
static const felem_bytearray nistp521_curve_params[5] = {
65
    { 0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
66
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
67
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
68
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
69
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
70
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
71
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
72
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
73
        0xff, 0xff },
74
    { 0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
75
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
76
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
77
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
78
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
79
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
80
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
81
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
82
        0xff, 0xfc },
83
    { 0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
84
        0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
85
        0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
86
        0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
87
        0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
88
        0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
89
        0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
90
        0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
91
        0x3f, 0x00 },
92
    { 0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
93
        0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
94
        0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
95
        0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
96
        0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
97
        0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
98
        0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
99
        0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
100
        0xbd, 0x66 },
101
    { 0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
102
        0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
103
        0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
104
        0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
105
        0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
106
        0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
107
        0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
108
        0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
109
        0x66, 0x50 }
110
};
111
112
/*-
113
 * The representation of field elements.
114
 * ------------------------------------
115
 *
116
 * We represent field elements with nine values. These values are either 64 or
117
 * 128 bits and the field element represented is:
118
 *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
119
 * Each of the nine values is called a 'limb'. Since the limbs are spaced only
120
 * 58 bits apart, but are greater than 58 bits in length, the most significant
121
 * bits of each limb overlap with the least significant bits of the next.
122
 *
123
 * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
124
 * 'largefelem' */
125
126
0
#define NLIMBS 9
127
128
typedef uint64_t limb;
129
typedef limb limb_aX __attribute((__aligned__(1)));
130
typedef limb felem[NLIMBS];
131
typedef uint128_t largefelem[NLIMBS];
132
133
static const limb bottom57bits = 0x1ffffffffffffff;
134
static const limb bottom58bits = 0x3ffffffffffffff;
135
136
/*
137
 * bin66_to_felem takes a little-endian byte array and converts it into felem
138
 * form. This assumes that the CPU is little-endian.
139
 */
140
static void bin66_to_felem(felem out, const uint8_t in[66])
141
0
{
142
0
    out[0] = (*((limb *)&in[0])) & bottom58bits;
143
0
    out[1] = (*((limb_aX *)&in[7]) >> 2) & bottom58bits;
144
0
    out[2] = (*((limb_aX *)&in[14]) >> 4) & bottom58bits;
145
0
    out[3] = (*((limb_aX *)&in[21]) >> 6) & bottom58bits;
146
0
    out[4] = (*((limb_aX *)&in[29])) & bottom58bits;
147
0
    out[5] = (*((limb_aX *)&in[36]) >> 2) & bottom58bits;
148
0
    out[6] = (*((limb_aX *)&in[43]) >> 4) & bottom58bits;
149
0
    out[7] = (*((limb_aX *)&in[50]) >> 6) & bottom58bits;
150
0
    out[8] = (*((limb_aX *)&in[58])) & bottom57bits;
151
0
}
152
153
/*
154
 * felem_to_bin66 takes an felem and serializes into a little endian, 66 byte
155
 * array. This assumes that the CPU is little-endian.
156
 */
157
static void felem_to_bin66(uint8_t out[66], const felem in)
158
0
{
159
0
    memset(out, 0, 66);
160
0
    (*((limb *)&out[0])) = in[0];
161
0
    (*((limb_aX *)&out[7])) |= in[1] << 2;
162
0
    (*((limb_aX *)&out[14])) |= in[2] << 4;
163
0
    (*((limb_aX *)&out[21])) |= in[3] << 6;
164
0
    (*((limb_aX *)&out[29])) = in[4];
165
0
    (*((limb_aX *)&out[36])) |= in[5] << 2;
166
0
    (*((limb_aX *)&out[43])) |= in[6] << 4;
167
0
    (*((limb_aX *)&out[50])) |= in[7] << 6;
168
0
    (*((limb_aX *)&out[58])) = in[8];
169
0
}
170
171
/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
172
static int BN_to_felem(felem out, const BIGNUM *bn)
173
0
{
174
0
    felem_bytearray b_out;
175
0
    int num_bytes;
176
177
0
    if (BN_is_negative(bn)) {
178
0
        ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
179
0
        return 0;
180
0
    }
181
0
    num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
182
0
    if (num_bytes < 0) {
183
0
        ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
184
0
        return 0;
185
0
    }
186
0
    bin66_to_felem(out, b_out);
187
0
    return 1;
188
0
}
189
190
/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
191
static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
192
0
{
193
0
    felem_bytearray b_out;
194
0
    felem_to_bin66(b_out, in);
195
0
    return BN_lebin2bn(b_out, sizeof(b_out), out);
196
0
}
197
198
/*-
199
 * Field operations
200
 * ----------------
201
 */
202
203
static void felem_one(felem out)
204
0
{
205
0
    out[0] = 1;
206
0
    out[1] = 0;
207
0
    out[2] = 0;
208
0
    out[3] = 0;
209
0
    out[4] = 0;
210
0
    out[5] = 0;
211
0
    out[6] = 0;
212
0
    out[7] = 0;
213
0
    out[8] = 0;
214
0
}
215
216
static void felem_assign(felem out, const felem in)
217
0
{
218
0
    out[0] = in[0];
219
0
    out[1] = in[1];
220
0
    out[2] = in[2];
221
0
    out[3] = in[3];
222
0
    out[4] = in[4];
223
0
    out[5] = in[5];
224
0
    out[6] = in[6];
225
0
    out[7] = in[7];
226
0
    out[8] = in[8];
227
0
}
228
229
/* felem_sum64 sets out = out + in. */
230
static void felem_sum64(felem out, const felem in)
231
0
{
232
0
    out[0] += in[0];
233
0
    out[1] += in[1];
234
0
    out[2] += in[2];
235
0
    out[3] += in[3];
236
0
    out[4] += in[4];
237
0
    out[5] += in[5];
238
0
    out[6] += in[6];
239
0
    out[7] += in[7];
240
0
    out[8] += in[8];
241
0
}
242
243
/* felem_scalar sets out = in * scalar */
244
static void felem_scalar(felem out, const felem in, limb scalar)
245
0
{
246
0
    out[0] = in[0] * scalar;
247
0
    out[1] = in[1] * scalar;
248
0
    out[2] = in[2] * scalar;
249
0
    out[3] = in[3] * scalar;
250
0
    out[4] = in[4] * scalar;
251
0
    out[5] = in[5] * scalar;
252
0
    out[6] = in[6] * scalar;
253
0
    out[7] = in[7] * scalar;
254
0
    out[8] = in[8] * scalar;
255
0
}
256
257
/* felem_scalar64 sets out = out * scalar */
258
static void felem_scalar64(felem out, limb scalar)
259
0
{
260
0
    out[0] *= scalar;
261
0
    out[1] *= scalar;
262
0
    out[2] *= scalar;
263
0
    out[3] *= scalar;
264
0
    out[4] *= scalar;
265
0
    out[5] *= scalar;
266
0
    out[6] *= scalar;
267
0
    out[7] *= scalar;
268
0
    out[8] *= scalar;
269
0
}
270
271
/* felem_scalar128 sets out = out * scalar */
272
static void felem_scalar128(largefelem out, limb scalar)
273
0
{
274
0
    out[0] *= scalar;
275
0
    out[1] *= scalar;
276
0
    out[2] *= scalar;
277
0
    out[3] *= scalar;
278
0
    out[4] *= scalar;
279
0
    out[5] *= scalar;
280
0
    out[6] *= scalar;
281
0
    out[7] *= scalar;
282
0
    out[8] *= scalar;
283
0
}
284
285
/*-
286
 * felem_neg sets |out| to |-in|
287
 * On entry:
288
 *   in[i] < 2^59 + 2^14
289
 * On exit:
290
 *   out[i] < 2^62
291
 */
292
static void felem_neg(felem out, const felem in)
293
0
{
294
    /* In order to prevent underflow, we subtract from 0 mod p. */
295
0
    static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5);
296
0
    static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4);
297
298
0
    out[0] = two62m3 - in[0];
299
0
    out[1] = two62m2 - in[1];
300
0
    out[2] = two62m2 - in[2];
301
0
    out[3] = two62m2 - in[3];
302
0
    out[4] = two62m2 - in[4];
303
0
    out[5] = two62m2 - in[5];
304
0
    out[6] = two62m2 - in[6];
305
0
    out[7] = two62m2 - in[7];
306
0
    out[8] = two62m2 - in[8];
307
0
}
308
309
/*-
310
 * felem_diff64 subtracts |in| from |out|
311
 * On entry:
312
 *   in[i] < 2^59 + 2^14
313
 * On exit:
314
 *   out[i] < out[i] + 2^62
315
 */
316
static void felem_diff64(felem out, const felem in)
317
0
{
318
    /*
319
     * In order to prevent underflow, we add 0 mod p before subtracting.
320
     */
321
0
    static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5);
322
0
    static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4);
323
324
0
    out[0] += two62m3 - in[0];
325
0
    out[1] += two62m2 - in[1];
326
0
    out[2] += two62m2 - in[2];
327
0
    out[3] += two62m2 - in[3];
328
0
    out[4] += two62m2 - in[4];
329
0
    out[5] += two62m2 - in[5];
330
0
    out[6] += two62m2 - in[6];
331
0
    out[7] += two62m2 - in[7];
332
0
    out[8] += two62m2 - in[8];
333
0
}
334
335
/*-
336
 * felem_diff_128_64 subtracts |in| from |out|
337
 * On entry:
338
 *   in[i] < 2^62 + 2^17
339
 * On exit:
340
 *   out[i] < out[i] + 2^63
341
 */
342
static void felem_diff_128_64(largefelem out, const felem in)
343
0
{
344
    /*
345
     * In order to prevent underflow, we add 64p mod p (which is equivalent
346
     * to 0 mod p) before subtracting. p is 2^521 - 1, i.e. in binary a 521
347
     * digit number with all bits set to 1. See "The representation of field
348
     * elements" comment above for a description of how limbs are used to
349
     * represent a number. 64p is represented with 8 limbs containing a number
350
     * with 58 bits set and one limb with a number with 57 bits set.
351
     */
352
0
    static const limb two63m6 = (((limb)1) << 63) - (((limb)1) << 6);
353
0
    static const limb two63m5 = (((limb)1) << 63) - (((limb)1) << 5);
354
355
0
    out[0] += two63m6 - in[0];
356
0
    out[1] += two63m5 - in[1];
357
0
    out[2] += two63m5 - in[2];
358
0
    out[3] += two63m5 - in[3];
359
0
    out[4] += two63m5 - in[4];
360
0
    out[5] += two63m5 - in[5];
361
0
    out[6] += two63m5 - in[6];
362
0
    out[7] += two63m5 - in[7];
363
0
    out[8] += two63m5 - in[8];
364
0
}
365
366
/*-
367
 * felem_diff_128_64 subtracts |in| from |out|
368
 * On entry:
369
 *   in[i] < 2^126
370
 * On exit:
371
 *   out[i] < out[i] + 2^127 - 2^69
372
 */
373
static void felem_diff128(largefelem out, const largefelem in)
374
0
{
375
    /*
376
     * In order to prevent underflow, we add 0 mod p before subtracting.
377
     */
378
0
    static const uint128_t two127m70 = (((uint128_t)1) << 127) - (((uint128_t)1) << 70);
379
0
    static const uint128_t two127m69 = (((uint128_t)1) << 127) - (((uint128_t)1) << 69);
380
381
0
    out[0] += (two127m70 - in[0]);
382
0
    out[1] += (two127m69 - in[1]);
383
0
    out[2] += (two127m69 - in[2]);
384
0
    out[3] += (two127m69 - in[3]);
385
0
    out[4] += (two127m69 - in[4]);
386
0
    out[5] += (two127m69 - in[5]);
387
0
    out[6] += (two127m69 - in[6]);
388
0
    out[7] += (two127m69 - in[7]);
389
0
    out[8] += (two127m69 - in[8]);
390
0
}
391
392
/*-
393
 * felem_square sets |out| = |in|^2
394
 * On entry:
395
 *   in[i] < 2^62
396
 * On exit:
397
 *   out[i] < 17 * max(in[i]) * max(in[i])
398
 */
399
static void felem_square_ref(largefelem out, const felem in)
400
0
{
401
0
    felem inx2, inx4;
402
0
    felem_scalar(inx2, in, 2);
403
0
    felem_scalar(inx4, in, 4);
404
405
    /*-
406
     * We have many cases were we want to do
407
     *   in[x] * in[y] +
408
     *   in[y] * in[x]
409
     * This is obviously just
410
     *   2 * in[x] * in[y]
411
     * However, rather than do the doubling on the 128 bit result, we
412
     * double one of the inputs to the multiplication by reading from
413
     * |inx2|
414
     */
415
416
0
    out[0] = ((uint128_t)in[0]) * in[0];
417
0
    out[1] = ((uint128_t)in[0]) * inx2[1];
418
0
    out[2] = ((uint128_t)in[0]) * inx2[2] + ((uint128_t)in[1]) * in[1];
419
0
    out[3] = ((uint128_t)in[0]) * inx2[3] + ((uint128_t)in[1]) * inx2[2];
420
0
    out[4] = ((uint128_t)in[0]) * inx2[4] + ((uint128_t)in[1]) * inx2[3] + ((uint128_t)in[2]) * in[2];
421
0
    out[5] = ((uint128_t)in[0]) * inx2[5] + ((uint128_t)in[1]) * inx2[4] + ((uint128_t)in[2]) * inx2[3];
422
0
    out[6] = ((uint128_t)in[0]) * inx2[6] + ((uint128_t)in[1]) * inx2[5] + ((uint128_t)in[2]) * inx2[4] + ((uint128_t)in[3]) * in[3];
423
0
    out[7] = ((uint128_t)in[0]) * inx2[7] + ((uint128_t)in[1]) * inx2[6] + ((uint128_t)in[2]) * inx2[5] + ((uint128_t)in[3]) * inx2[4];
424
0
    out[8] = ((uint128_t)in[0]) * inx2[8] + ((uint128_t)in[1]) * inx2[7] + ((uint128_t)in[2]) * inx2[6] + ((uint128_t)in[3]) * inx2[5] + ((uint128_t)in[4]) * in[4];
425
426
    /*
427
     * The remaining limbs fall above 2^521, with the first falling at 2^522.
428
     * They correspond to locations one bit up from the limbs produced above
429
     * so we would have to multiply by two to align them. Again, rather than
430
     * operate on the 128-bit result, we double one of the inputs to the
431
     * multiplication. If we want to double for both this reason, and the
432
     * reason above, then we end up multiplying by four.
433
     */
434
435
    /* 9 */
436
0
    out[0] += ((uint128_t)in[1]) * inx4[8] + ((uint128_t)in[2]) * inx4[7] + ((uint128_t)in[3]) * inx4[6] + ((uint128_t)in[4]) * inx4[5];
437
438
    /* 10 */
439
0
    out[1] += ((uint128_t)in[2]) * inx4[8] + ((uint128_t)in[3]) * inx4[7] + ((uint128_t)in[4]) * inx4[6] + ((uint128_t)in[5]) * inx2[5];
440
441
    /* 11 */
442
0
    out[2] += ((uint128_t)in[3]) * inx4[8] + ((uint128_t)in[4]) * inx4[7] + ((uint128_t)in[5]) * inx4[6];
443
444
    /* 12 */
445
0
    out[3] += ((uint128_t)in[4]) * inx4[8] + ((uint128_t)in[5]) * inx4[7] + ((uint128_t)in[6]) * inx2[6];
446
447
    /* 13 */
448
0
    out[4] += ((uint128_t)in[5]) * inx4[8] + ((uint128_t)in[6]) * inx4[7];
449
450
    /* 14 */
451
0
    out[5] += ((uint128_t)in[6]) * inx4[8] + ((uint128_t)in[7]) * inx2[7];
452
453
    /* 15 */
454
0
    out[6] += ((uint128_t)in[7]) * inx4[8];
455
456
    /* 16 */
457
0
    out[7] += ((uint128_t)in[8]) * inx2[8];
458
0
}
459
460
/*-
461
 * felem_mul sets |out| = |in1| * |in2|
462
 * On entry:
463
 *   in1[i] < 2^64
464
 *   in2[i] < 2^63
465
 * On exit:
466
 *   out[i] < 17 * max(in1[i]) * max(in2[i])
467
 */
468
static void felem_mul_ref(largefelem out, const felem in1, const felem in2)
469
0
{
470
0
    felem in2x2;
471
0
    felem_scalar(in2x2, in2, 2);
472
473
0
    out[0] = ((uint128_t)in1[0]) * in2[0];
474
475
0
    out[1] = ((uint128_t)in1[0]) * in2[1] + ((uint128_t)in1[1]) * in2[0];
476
477
0
    out[2] = ((uint128_t)in1[0]) * in2[2] + ((uint128_t)in1[1]) * in2[1] + ((uint128_t)in1[2]) * in2[0];
478
479
0
    out[3] = ((uint128_t)in1[0]) * in2[3] + ((uint128_t)in1[1]) * in2[2] + ((uint128_t)in1[2]) * in2[1] + ((uint128_t)in1[3]) * in2[0];
480
481
0
    out[4] = ((uint128_t)in1[0]) * in2[4] + ((uint128_t)in1[1]) * in2[3] + ((uint128_t)in1[2]) * in2[2] + ((uint128_t)in1[3]) * in2[1] + ((uint128_t)in1[4]) * in2[0];
482
483
0
    out[5] = ((uint128_t)in1[0]) * in2[5] + ((uint128_t)in1[1]) * in2[4] + ((uint128_t)in1[2]) * in2[3] + ((uint128_t)in1[3]) * in2[2] + ((uint128_t)in1[4]) * in2[1] + ((uint128_t)in1[5]) * in2[0];
484
485
0
    out[6] = ((uint128_t)in1[0]) * in2[6] + ((uint128_t)in1[1]) * in2[5] + ((uint128_t)in1[2]) * in2[4] + ((uint128_t)in1[3]) * in2[3] + ((uint128_t)in1[4]) * in2[2] + ((uint128_t)in1[5]) * in2[1] + ((uint128_t)in1[6]) * in2[0];
486
487
0
    out[7] = ((uint128_t)in1[0]) * in2[7] + ((uint128_t)in1[1]) * in2[6] + ((uint128_t)in1[2]) * in2[5] + ((uint128_t)in1[3]) * in2[4] + ((uint128_t)in1[4]) * in2[3] + ((uint128_t)in1[5]) * in2[2] + ((uint128_t)in1[6]) * in2[1] + ((uint128_t)in1[7]) * in2[0];
488
489
0
    out[8] = ((uint128_t)in1[0]) * in2[8] + ((uint128_t)in1[1]) * in2[7] + ((uint128_t)in1[2]) * in2[6] + ((uint128_t)in1[3]) * in2[5] + ((uint128_t)in1[4]) * in2[4] + ((uint128_t)in1[5]) * in2[3] + ((uint128_t)in1[6]) * in2[2] + ((uint128_t)in1[7]) * in2[1] + ((uint128_t)in1[8]) * in2[0];
490
491
    /* See comment in felem_square about the use of in2x2 here */
492
493
0
    out[0] += ((uint128_t)in1[1]) * in2x2[8] + ((uint128_t)in1[2]) * in2x2[7] + ((uint128_t)in1[3]) * in2x2[6] + ((uint128_t)in1[4]) * in2x2[5] + ((uint128_t)in1[5]) * in2x2[4] + ((uint128_t)in1[6]) * in2x2[3] + ((uint128_t)in1[7]) * in2x2[2] + ((uint128_t)in1[8]) * in2x2[1];
494
495
0
    out[1] += ((uint128_t)in1[2]) * in2x2[8] + ((uint128_t)in1[3]) * in2x2[7] + ((uint128_t)in1[4]) * in2x2[6] + ((uint128_t)in1[5]) * in2x2[5] + ((uint128_t)in1[6]) * in2x2[4] + ((uint128_t)in1[7]) * in2x2[3] + ((uint128_t)in1[8]) * in2x2[2];
496
497
0
    out[2] += ((uint128_t)in1[3]) * in2x2[8] + ((uint128_t)in1[4]) * in2x2[7] + ((uint128_t)in1[5]) * in2x2[6] + ((uint128_t)in1[6]) * in2x2[5] + ((uint128_t)in1[7]) * in2x2[4] + ((uint128_t)in1[8]) * in2x2[3];
498
499
0
    out[3] += ((uint128_t)in1[4]) * in2x2[8] + ((uint128_t)in1[5]) * in2x2[7] + ((uint128_t)in1[6]) * in2x2[6] + ((uint128_t)in1[7]) * in2x2[5] + ((uint128_t)in1[8]) * in2x2[4];
500
501
0
    out[4] += ((uint128_t)in1[5]) * in2x2[8] + ((uint128_t)in1[6]) * in2x2[7] + ((uint128_t)in1[7]) * in2x2[6] + ((uint128_t)in1[8]) * in2x2[5];
502
503
0
    out[5] += ((uint128_t)in1[6]) * in2x2[8] + ((uint128_t)in1[7]) * in2x2[7] + ((uint128_t)in1[8]) * in2x2[6];
504
505
0
    out[6] += ((uint128_t)in1[7]) * in2x2[8] + ((uint128_t)in1[8]) * in2x2[7];
506
507
0
    out[7] += ((uint128_t)in1[8]) * in2x2[8];
508
0
}
509
510
static const limb bottom52bits = 0xfffffffffffff;
511
512
/*-
513
 * felem_reduce converts a largefelem to an felem.
514
 * On entry:
515
 *   in[i] < 2^128
516
 * On exit:
517
 *   out[i] < 2^59 + 2^14
518
 */
519
static void felem_reduce(felem out, const largefelem in)
520
0
{
521
0
    uint64_t overflow1, overflow2;
522
523
0
    out[0] = ((limb)in[0]) & bottom58bits;
524
0
    out[1] = ((limb)in[1]) & bottom58bits;
525
0
    out[2] = ((limb)in[2]) & bottom58bits;
526
0
    out[3] = ((limb)in[3]) & bottom58bits;
527
0
    out[4] = ((limb)in[4]) & bottom58bits;
528
0
    out[5] = ((limb)in[5]) & bottom58bits;
529
0
    out[6] = ((limb)in[6]) & bottom58bits;
530
0
    out[7] = ((limb)in[7]) & bottom58bits;
531
0
    out[8] = ((limb)in[8]) & bottom58bits;
532
533
    /* out[i] < 2^58 */
534
535
0
    out[1] += ((limb)in[0]) >> 58;
536
0
    out[1] += (((limb)(in[0] >> 64)) & bottom52bits) << 6;
537
    /*-
538
     * out[1] < 2^58 + 2^6 + 2^58
539
     *        = 2^59 + 2^6
540
     */
541
0
    out[2] += ((limb)(in[0] >> 64)) >> 52;
542
543
0
    out[2] += ((limb)in[1]) >> 58;
544
0
    out[2] += (((limb)(in[1] >> 64)) & bottom52bits) << 6;
545
0
    out[3] += ((limb)(in[1] >> 64)) >> 52;
546
547
0
    out[3] += ((limb)in[2]) >> 58;
548
0
    out[3] += (((limb)(in[2] >> 64)) & bottom52bits) << 6;
549
0
    out[4] += ((limb)(in[2] >> 64)) >> 52;
550
551
0
    out[4] += ((limb)in[3]) >> 58;
552
0
    out[4] += (((limb)(in[3] >> 64)) & bottom52bits) << 6;
553
0
    out[5] += ((limb)(in[3] >> 64)) >> 52;
554
555
0
    out[5] += ((limb)in[4]) >> 58;
556
0
    out[5] += (((limb)(in[4] >> 64)) & bottom52bits) << 6;
557
0
    out[6] += ((limb)(in[4] >> 64)) >> 52;
558
559
0
    out[6] += ((limb)in[5]) >> 58;
560
0
    out[6] += (((limb)(in[5] >> 64)) & bottom52bits) << 6;
561
0
    out[7] += ((limb)(in[5] >> 64)) >> 52;
562
563
0
    out[7] += ((limb)in[6]) >> 58;
564
0
    out[7] += (((limb)(in[6] >> 64)) & bottom52bits) << 6;
565
0
    out[8] += ((limb)(in[6] >> 64)) >> 52;
566
567
0
    out[8] += ((limb)in[7]) >> 58;
568
0
    out[8] += (((limb)(in[7] >> 64)) & bottom52bits) << 6;
569
    /*-
570
     * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
571
     *            < 2^59 + 2^13
572
     */
573
0
    overflow1 = ((limb)(in[7] >> 64)) >> 52;
574
575
0
    overflow1 += ((limb)in[8]) >> 58;
576
0
    overflow1 += (((limb)(in[8] >> 64)) & bottom52bits) << 6;
577
0
    overflow2 = ((limb)(in[8] >> 64)) >> 52;
578
579
0
    overflow1 <<= 1; /* overflow1 < 2^13 + 2^7 + 2^59 */
580
0
    overflow2 <<= 1; /* overflow2 < 2^13 */
581
582
0
    out[0] += overflow1; /* out[0] < 2^60 */
583
0
    out[1] += overflow2; /* out[1] < 2^59 + 2^6 + 2^13 */
584
585
0
    out[1] += out[0] >> 58;
586
0
    out[0] &= bottom58bits;
587
    /*-
588
     * out[0] < 2^58
589
     * out[1] < 2^59 + 2^6 + 2^13 + 2^2
590
     *        < 2^59 + 2^14
591
     */
592
0
}
593
594
#if defined(ECP_NISTP521_ASM)
595
static void felem_square_wrapper(largefelem out, const felem in);
596
static void felem_mul_wrapper(largefelem out, const felem in1, const felem in2);
597
598
static void (*felem_square_p)(largefelem out, const felem in) = felem_square_wrapper;
599
static void (*felem_mul_p)(largefelem out, const felem in1, const felem in2) = felem_mul_wrapper;
600
601
void p521_felem_square(largefelem out, const felem in);
602
void p521_felem_mul(largefelem out, const felem in1, const felem in2);
603
604
#if defined(_ARCH_PPC64)
605
#include "arch/ppc_arch.h"
606
#endif
607
608
static void felem_select(void)
609
{
610
#if defined(_ARCH_PPC64)
611
    if ((OPENSSL_ppccap_P & PPC_MADD300) && (OPENSSL_ppccap_P & PPC_ALTIVEC)) {
612
        felem_square_p = p521_felem_square;
613
        felem_mul_p = p521_felem_mul;
614
615
        return;
616
    }
617
#endif
618
619
    /* Default */
620
    felem_square_p = felem_square_ref;
621
    felem_mul_p = felem_mul_ref;
622
}
623
624
static void felem_square_wrapper(largefelem out, const felem in)
625
{
626
    felem_select();
627
    felem_square_p(out, in);
628
}
629
630
static void felem_mul_wrapper(largefelem out, const felem in1, const felem in2)
631
{
632
    felem_select();
633
    felem_mul_p(out, in1, in2);
634
}
635
636
#define felem_square felem_square_p
637
#define felem_mul felem_mul_p
638
#else
639
0
#define felem_square felem_square_ref
640
0
#define felem_mul felem_mul_ref
641
#endif
642
643
static void felem_square_reduce(felem out, const felem in)
644
0
{
645
0
    largefelem tmp;
646
0
    felem_square(tmp, in);
647
0
    felem_reduce(out, tmp);
648
0
}
649
650
static void felem_mul_reduce(felem out, const felem in1, const felem in2)
651
0
{
652
0
    largefelem tmp;
653
0
    felem_mul(tmp, in1, in2);
654
0
    felem_reduce(out, tmp);
655
0
}
656
657
/*-
658
 * felem_inv calculates |out| = |in|^{-1}
659
 *
660
 * Based on Fermat's Little Theorem:
661
 *   a^p = a (mod p)
662
 *   a^{p-1} = 1 (mod p)
663
 *   a^{p-2} = a^{-1} (mod p)
664
 */
665
static void felem_inv(felem out, const felem in)
666
0
{
667
0
    felem ftmp, ftmp2, ftmp3, ftmp4;
668
0
    largefelem tmp;
669
0
    unsigned i;
670
671
0
    felem_square(tmp, in);
672
0
    felem_reduce(ftmp, tmp); /* 2^1 */
673
0
    felem_mul(tmp, in, ftmp);
674
0
    felem_reduce(ftmp, tmp); /* 2^2 - 2^0 */
675
0
    felem_assign(ftmp2, ftmp);
676
0
    felem_square(tmp, ftmp);
677
0
    felem_reduce(ftmp, tmp); /* 2^3 - 2^1 */
678
0
    felem_mul(tmp, in, ftmp);
679
0
    felem_reduce(ftmp, tmp); /* 2^3 - 2^0 */
680
0
    felem_square(tmp, ftmp);
681
0
    felem_reduce(ftmp, tmp); /* 2^4 - 2^1 */
682
683
0
    felem_square(tmp, ftmp2);
684
0
    felem_reduce(ftmp3, tmp); /* 2^3 - 2^1 */
685
0
    felem_square(tmp, ftmp3);
686
0
    felem_reduce(ftmp3, tmp); /* 2^4 - 2^2 */
687
0
    felem_mul(tmp, ftmp3, ftmp2);
688
0
    felem_reduce(ftmp3, tmp); /* 2^4 - 2^0 */
689
690
0
    felem_assign(ftmp2, ftmp3);
691
0
    felem_square(tmp, ftmp3);
692
0
    felem_reduce(ftmp3, tmp); /* 2^5 - 2^1 */
693
0
    felem_square(tmp, ftmp3);
694
0
    felem_reduce(ftmp3, tmp); /* 2^6 - 2^2 */
695
0
    felem_square(tmp, ftmp3);
696
0
    felem_reduce(ftmp3, tmp); /* 2^7 - 2^3 */
697
0
    felem_square(tmp, ftmp3);
698
0
    felem_reduce(ftmp3, tmp); /* 2^8 - 2^4 */
699
0
    felem_mul(tmp, ftmp3, ftmp);
700
0
    felem_reduce(ftmp4, tmp); /* 2^8 - 2^1 */
701
0
    felem_square(tmp, ftmp4);
702
0
    felem_reduce(ftmp4, tmp); /* 2^9 - 2^2 */
703
0
    felem_mul(tmp, ftmp3, ftmp2);
704
0
    felem_reduce(ftmp3, tmp); /* 2^8 - 2^0 */
705
0
    felem_assign(ftmp2, ftmp3);
706
707
0
    for (i = 0; i < 8; i++) {
708
0
        felem_square(tmp, ftmp3);
709
0
        felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
710
0
    }
711
0
    felem_mul(tmp, ftmp3, ftmp2);
712
0
    felem_reduce(ftmp3, tmp); /* 2^16 - 2^0 */
713
0
    felem_assign(ftmp2, ftmp3);
714
715
0
    for (i = 0; i < 16; i++) {
716
0
        felem_square(tmp, ftmp3);
717
0
        felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
718
0
    }
719
0
    felem_mul(tmp, ftmp3, ftmp2);
720
0
    felem_reduce(ftmp3, tmp); /* 2^32 - 2^0 */
721
0
    felem_assign(ftmp2, ftmp3);
722
723
0
    for (i = 0; i < 32; i++) {
724
0
        felem_square(tmp, ftmp3);
725
0
        felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
726
0
    }
727
0
    felem_mul(tmp, ftmp3, ftmp2);
728
0
    felem_reduce(ftmp3, tmp); /* 2^64 - 2^0 */
729
0
    felem_assign(ftmp2, ftmp3);
730
731
0
    for (i = 0; i < 64; i++) {
732
0
        felem_square(tmp, ftmp3);
733
0
        felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
734
0
    }
735
0
    felem_mul(tmp, ftmp3, ftmp2);
736
0
    felem_reduce(ftmp3, tmp); /* 2^128 - 2^0 */
737
0
    felem_assign(ftmp2, ftmp3);
738
739
0
    for (i = 0; i < 128; i++) {
740
0
        felem_square(tmp, ftmp3);
741
0
        felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
742
0
    }
743
0
    felem_mul(tmp, ftmp3, ftmp2);
744
0
    felem_reduce(ftmp3, tmp); /* 2^256 - 2^0 */
745
0
    felem_assign(ftmp2, ftmp3);
746
747
0
    for (i = 0; i < 256; i++) {
748
0
        felem_square(tmp, ftmp3);
749
0
        felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
750
0
    }
751
0
    felem_mul(tmp, ftmp3, ftmp2);
752
0
    felem_reduce(ftmp3, tmp); /* 2^512 - 2^0 */
753
754
0
    for (i = 0; i < 9; i++) {
755
0
        felem_square(tmp, ftmp3);
756
0
        felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
757
0
    }
758
0
    felem_mul(tmp, ftmp3, ftmp4);
759
0
    felem_reduce(ftmp3, tmp); /* 2^521 - 2^2 */
760
0
    felem_mul(tmp, ftmp3, in);
761
0
    felem_reduce(out, tmp); /* 2^521 - 3 */
762
0
}
763
764
/* This is 2^521-1, expressed as an felem */
765
static const felem kPrime = {
766
    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
767
    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
768
    0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
769
};
770
771
/*-
772
 * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
773
 * otherwise.
774
 * On entry:
775
 *   in[i] < 2^59 + 2^14
776
 */
777
static limb felem_is_zero(const felem in)
778
0
{
779
0
    felem ftmp;
780
0
    limb is_zero, is_p;
781
0
    felem_assign(ftmp, in);
782
783
0
    ftmp[0] += ftmp[8] >> 57;
784
0
    ftmp[8] &= bottom57bits;
785
    /* ftmp[8] < 2^57 */
786
0
    ftmp[1] += ftmp[0] >> 58;
787
0
    ftmp[0] &= bottom58bits;
788
0
    ftmp[2] += ftmp[1] >> 58;
789
0
    ftmp[1] &= bottom58bits;
790
0
    ftmp[3] += ftmp[2] >> 58;
791
0
    ftmp[2] &= bottom58bits;
792
0
    ftmp[4] += ftmp[3] >> 58;
793
0
    ftmp[3] &= bottom58bits;
794
0
    ftmp[5] += ftmp[4] >> 58;
795
0
    ftmp[4] &= bottom58bits;
796
0
    ftmp[6] += ftmp[5] >> 58;
797
0
    ftmp[5] &= bottom58bits;
798
0
    ftmp[7] += ftmp[6] >> 58;
799
0
    ftmp[6] &= bottom58bits;
800
0
    ftmp[8] += ftmp[7] >> 58;
801
0
    ftmp[7] &= bottom58bits;
802
    /* ftmp[8] < 2^57 + 4 */
803
804
    /*
805
     * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
806
     * than our bound for ftmp[8]. Therefore we only have to check if the
807
     * zero is zero or 2^521-1.
808
     */
809
810
0
    is_zero = 0;
811
0
    is_zero |= ftmp[0];
812
0
    is_zero |= ftmp[1];
813
0
    is_zero |= ftmp[2];
814
0
    is_zero |= ftmp[3];
815
0
    is_zero |= ftmp[4];
816
0
    is_zero |= ftmp[5];
817
0
    is_zero |= ftmp[6];
818
0
    is_zero |= ftmp[7];
819
0
    is_zero |= ftmp[8];
820
821
0
    is_zero--;
822
    /*
823
     * We know that ftmp[i] < 2^63, therefore the only way that the top bit
824
     * can be set is if is_zero was 0 before the decrement.
825
     */
826
0
    is_zero = 0 - (is_zero >> 63);
827
828
0
    is_p = ftmp[0] ^ kPrime[0];
829
0
    is_p |= ftmp[1] ^ kPrime[1];
830
0
    is_p |= ftmp[2] ^ kPrime[2];
831
0
    is_p |= ftmp[3] ^ kPrime[3];
832
0
    is_p |= ftmp[4] ^ kPrime[4];
833
0
    is_p |= ftmp[5] ^ kPrime[5];
834
0
    is_p |= ftmp[6] ^ kPrime[6];
835
0
    is_p |= ftmp[7] ^ kPrime[7];
836
0
    is_p |= ftmp[8] ^ kPrime[8];
837
838
0
    is_p--;
839
0
    is_p = 0 - (is_p >> 63);
840
841
0
    is_zero |= is_p;
842
0
    return is_zero;
843
0
}
844
845
static int felem_is_zero_int(const void *in)
846
0
{
847
0
    return (int)(felem_is_zero(in) & ((limb)1));
848
0
}
849
850
/*-
851
 * felem_contract converts |in| to its unique, minimal representation.
852
 * On entry:
853
 *   in[i] < 2^59 + 2^14
854
 */
855
static void felem_contract(felem out, const felem in)
856
0
{
857
0
    limb is_p, is_greater, sign;
858
0
    static const limb two58 = ((limb)1) << 58;
859
860
0
    felem_assign(out, in);
861
862
0
    out[0] += out[8] >> 57;
863
0
    out[8] &= bottom57bits;
864
    /* out[8] < 2^57 */
865
0
    out[1] += out[0] >> 58;
866
0
    out[0] &= bottom58bits;
867
0
    out[2] += out[1] >> 58;
868
0
    out[1] &= bottom58bits;
869
0
    out[3] += out[2] >> 58;
870
0
    out[2] &= bottom58bits;
871
0
    out[4] += out[3] >> 58;
872
0
    out[3] &= bottom58bits;
873
0
    out[5] += out[4] >> 58;
874
0
    out[4] &= bottom58bits;
875
0
    out[6] += out[5] >> 58;
876
0
    out[5] &= bottom58bits;
877
0
    out[7] += out[6] >> 58;
878
0
    out[6] &= bottom58bits;
879
0
    out[8] += out[7] >> 58;
880
0
    out[7] &= bottom58bits;
881
    /* out[8] < 2^57 + 4 */
882
883
    /*
884
     * If the value is greater than 2^521-1 then we have to subtract 2^521-1
885
     * out. See the comments in felem_is_zero regarding why we don't test for
886
     * other multiples of the prime.
887
     */
888
889
    /*
890
     * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
891
     */
892
893
0
    is_p = out[0] ^ kPrime[0];
894
0
    is_p |= out[1] ^ kPrime[1];
895
0
    is_p |= out[2] ^ kPrime[2];
896
0
    is_p |= out[3] ^ kPrime[3];
897
0
    is_p |= out[4] ^ kPrime[4];
898
0
    is_p |= out[5] ^ kPrime[5];
899
0
    is_p |= out[6] ^ kPrime[6];
900
0
    is_p |= out[7] ^ kPrime[7];
901
0
    is_p |= out[8] ^ kPrime[8];
902
903
0
    is_p--;
904
0
    is_p &= is_p << 32;
905
0
    is_p &= is_p << 16;
906
0
    is_p &= is_p << 8;
907
0
    is_p &= is_p << 4;
908
0
    is_p &= is_p << 2;
909
0
    is_p &= is_p << 1;
910
0
    is_p = 0 - (is_p >> 63);
911
0
    is_p = ~is_p;
912
913
    /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
914
915
0
    out[0] &= is_p;
916
0
    out[1] &= is_p;
917
0
    out[2] &= is_p;
918
0
    out[3] &= is_p;
919
0
    out[4] &= is_p;
920
0
    out[5] &= is_p;
921
0
    out[6] &= is_p;
922
0
    out[7] &= is_p;
923
0
    out[8] &= is_p;
924
925
    /*
926
     * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
927
     * 57 is greater than zero as (2^521-1) + x >= 2^522
928
     */
929
0
    is_greater = out[8] >> 57;
930
0
    is_greater |= is_greater << 32;
931
0
    is_greater |= is_greater << 16;
932
0
    is_greater |= is_greater << 8;
933
0
    is_greater |= is_greater << 4;
934
0
    is_greater |= is_greater << 2;
935
0
    is_greater |= is_greater << 1;
936
0
    is_greater = 0 - (is_greater >> 63);
937
938
0
    out[0] -= kPrime[0] & is_greater;
939
0
    out[1] -= kPrime[1] & is_greater;
940
0
    out[2] -= kPrime[2] & is_greater;
941
0
    out[3] -= kPrime[3] & is_greater;
942
0
    out[4] -= kPrime[4] & is_greater;
943
0
    out[5] -= kPrime[5] & is_greater;
944
0
    out[6] -= kPrime[6] & is_greater;
945
0
    out[7] -= kPrime[7] & is_greater;
946
0
    out[8] -= kPrime[8] & is_greater;
947
948
    /* Eliminate negative coefficients */
949
0
    sign = -(out[0] >> 63);
950
0
    out[0] += (two58 & sign);
951
0
    out[1] -= (1 & sign);
952
0
    sign = -(out[1] >> 63);
953
0
    out[1] += (two58 & sign);
954
0
    out[2] -= (1 & sign);
955
0
    sign = -(out[2] >> 63);
956
0
    out[2] += (two58 & sign);
957
0
    out[3] -= (1 & sign);
958
0
    sign = -(out[3] >> 63);
959
0
    out[3] += (two58 & sign);
960
0
    out[4] -= (1 & sign);
961
0
    sign = -(out[4] >> 63);
962
0
    out[4] += (two58 & sign);
963
0
    out[5] -= (1 & sign);
964
0
    sign = -(out[0] >> 63);
965
0
    out[5] += (two58 & sign);
966
0
    out[6] -= (1 & sign);
967
0
    sign = -(out[6] >> 63);
968
0
    out[6] += (two58 & sign);
969
0
    out[7] -= (1 & sign);
970
0
    sign = -(out[7] >> 63);
971
0
    out[7] += (two58 & sign);
972
0
    out[8] -= (1 & sign);
973
0
    sign = -(out[5] >> 63);
974
0
    out[5] += (two58 & sign);
975
0
    out[6] -= (1 & sign);
976
0
    sign = -(out[6] >> 63);
977
0
    out[6] += (two58 & sign);
978
0
    out[7] -= (1 & sign);
979
0
    sign = -(out[7] >> 63);
980
0
    out[7] += (two58 & sign);
981
0
    out[8] -= (1 & sign);
982
0
}
983
984
/*-
985
 * Group operations
986
 * ----------------
987
 *
988
 * Building on top of the field operations we have the operations on the
989
 * elliptic curve group itself. Points on the curve are represented in Jacobian
990
 * coordinates */
991
992
/*-
993
 * point_double calculates 2*(x_in, y_in, z_in)
994
 *
995
 * The method is taken from:
996
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
997
 *
998
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
999
 * while x_out == y_in is not (maybe this works, but it's not tested). */
1000
static void
1001
point_double(felem x_out, felem y_out, felem z_out,
1002
    const felem x_in, const felem y_in, const felem z_in)
1003
0
{
1004
0
    largefelem tmp, tmp2;
1005
0
    felem delta, gamma, beta, alpha, ftmp, ftmp2;
1006
1007
0
    felem_assign(ftmp, x_in);
1008
0
    felem_assign(ftmp2, x_in);
1009
1010
    /* delta = z^2 */
1011
0
    felem_square(tmp, z_in);
1012
0
    felem_reduce(delta, tmp); /* delta[i] < 2^59 + 2^14 */
1013
1014
    /* gamma = y^2 */
1015
0
    felem_square(tmp, y_in);
1016
0
    felem_reduce(gamma, tmp); /* gamma[i] < 2^59 + 2^14 */
1017
1018
    /* beta = x*gamma */
1019
0
    felem_mul(tmp, x_in, gamma);
1020
0
    felem_reduce(beta, tmp); /* beta[i] < 2^59 + 2^14 */
1021
1022
    /* alpha = 3*(x-delta)*(x+delta) */
1023
0
    felem_diff64(ftmp, delta);
1024
    /* ftmp[i] < 2^61 */
1025
0
    felem_sum64(ftmp2, delta);
1026
    /* ftmp2[i] < 2^60 + 2^15 */
1027
0
    felem_scalar64(ftmp2, 3);
1028
    /* ftmp2[i] < 3*2^60 + 3*2^15 */
1029
0
    felem_mul(tmp, ftmp, ftmp2);
1030
    /*-
1031
     * tmp[i] < 17(3*2^121 + 3*2^76)
1032
     *        = 61*2^121 + 61*2^76
1033
     *        < 64*2^121 + 64*2^76
1034
     *        = 2^127 + 2^82
1035
     *        < 2^128
1036
     */
1037
0
    felem_reduce(alpha, tmp);
1038
1039
    /* x' = alpha^2 - 8*beta */
1040
0
    felem_square(tmp, alpha);
1041
    /*
1042
     * tmp[i] < 17*2^120 < 2^125
1043
     */
1044
0
    felem_assign(ftmp, beta);
1045
0
    felem_scalar64(ftmp, 8);
1046
    /* ftmp[i] < 2^62 + 2^17 */
1047
0
    felem_diff_128_64(tmp, ftmp);
1048
    /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1049
0
    felem_reduce(x_out, tmp);
1050
1051
    /* z' = (y + z)^2 - gamma - delta */
1052
0
    felem_sum64(delta, gamma);
1053
    /* delta[i] < 2^60 + 2^15 */
1054
0
    felem_assign(ftmp, y_in);
1055
0
    felem_sum64(ftmp, z_in);
1056
    /* ftmp[i] < 2^60 + 2^15 */
1057
0
    felem_square(tmp, ftmp);
1058
    /*
1059
     * tmp[i] < 17(2^122) < 2^127
1060
     */
1061
0
    felem_diff_128_64(tmp, delta);
1062
    /* tmp[i] < 2^127 + 2^63 */
1063
0
    felem_reduce(z_out, tmp);
1064
1065
    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1066
0
    felem_scalar64(beta, 4);
1067
    /* beta[i] < 2^61 + 2^16 */
1068
0
    felem_diff64(beta, x_out);
1069
    /* beta[i] < 2^61 + 2^60 + 2^16 */
1070
0
    felem_mul(tmp, alpha, beta);
1071
    /*-
1072
     * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1073
     *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1074
     *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1075
     *        < 2^128
1076
     */
1077
0
    felem_square(tmp2, gamma);
1078
    /*-
1079
     * tmp2[i] < 17*(2^59 + 2^14)^2
1080
     *         = 17*(2^118 + 2^74 + 2^28)
1081
     */
1082
0
    felem_scalar128(tmp2, 8);
1083
    /*-
1084
     * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1085
     *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1086
     *         < 2^126
1087
     */
1088
0
    felem_diff128(tmp, tmp2);
1089
    /*-
1090
     * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1091
     *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1092
     *          2^74 + 2^69 + 2^34 + 2^30
1093
     *        < 2^128
1094
     */
1095
0
    felem_reduce(y_out, tmp);
1096
0
}
1097
1098
/* copy_conditional copies in to out iff mask is all ones. */
1099
static void copy_conditional(felem out, const felem in, limb mask)
1100
0
{
1101
0
    unsigned i;
1102
0
    for (i = 0; i < NLIMBS; ++i) {
1103
0
        const limb tmp = mask & (in[i] ^ out[i]);
1104
0
        out[i] ^= tmp;
1105
0
    }
1106
0
}
1107
1108
/*-
1109
 * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1110
 *
1111
 * The method is taken from
1112
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1113
 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1114
 *
1115
 * This function includes a branch for checking whether the two input points
1116
 * are equal (while not equal to the point at infinity). See comment below
1117
 * on constant-time.
1118
 */
1119
static void point_add(felem x3, felem y3, felem z3,
1120
    const felem x1, const felem y1, const felem z1,
1121
    const int mixed, const felem x2, const felem y2,
1122
    const felem z2)
1123
0
{
1124
0
    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1125
0
    largefelem tmp, tmp2;
1126
0
    limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1127
0
    limb points_equal;
1128
1129
0
    z1_is_zero = felem_is_zero(z1);
1130
0
    z2_is_zero = felem_is_zero(z2);
1131
1132
    /* ftmp = z1z1 = z1**2 */
1133
0
    felem_square(tmp, z1);
1134
0
    felem_reduce(ftmp, tmp);
1135
1136
0
    if (!mixed) {
1137
        /* ftmp2 = z2z2 = z2**2 */
1138
0
        felem_square(tmp, z2);
1139
0
        felem_reduce(ftmp2, tmp);
1140
1141
        /* u1 = ftmp3 = x1*z2z2 */
1142
0
        felem_mul(tmp, x1, ftmp2);
1143
0
        felem_reduce(ftmp3, tmp);
1144
1145
        /* ftmp5 = z1 + z2 */
1146
0
        felem_assign(ftmp5, z1);
1147
0
        felem_sum64(ftmp5, z2);
1148
        /* ftmp5[i] < 2^61 */
1149
1150
        /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1151
0
        felem_square(tmp, ftmp5);
1152
        /* tmp[i] < 17*2^122 */
1153
0
        felem_diff_128_64(tmp, ftmp);
1154
        /* tmp[i] < 17*2^122 + 2^63 */
1155
0
        felem_diff_128_64(tmp, ftmp2);
1156
        /* tmp[i] < 17*2^122 + 2^64 */
1157
0
        felem_reduce(ftmp5, tmp);
1158
1159
        /* ftmp2 = z2 * z2z2 */
1160
0
        felem_mul(tmp, ftmp2, z2);
1161
0
        felem_reduce(ftmp2, tmp);
1162
1163
        /* s1 = ftmp6 = y1 * z2**3 */
1164
0
        felem_mul(tmp, y1, ftmp2);
1165
0
        felem_reduce(ftmp6, tmp);
1166
0
    } else {
1167
        /*
1168
         * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1169
         */
1170
1171
        /* u1 = ftmp3 = x1*z2z2 */
1172
0
        felem_assign(ftmp3, x1);
1173
1174
        /* ftmp5 = 2*z1z2 */
1175
0
        felem_scalar(ftmp5, z1, 2);
1176
1177
        /* s1 = ftmp6 = y1 * z2**3 */
1178
0
        felem_assign(ftmp6, y1);
1179
0
    }
1180
1181
    /* u2 = x2*z1z1 */
1182
0
    felem_mul(tmp, x2, ftmp);
1183
    /* tmp[i] < 17*2^120 */
1184
1185
    /* h = ftmp4 = u2 - u1 */
1186
0
    felem_diff_128_64(tmp, ftmp3);
1187
    /* tmp[i] < 17*2^120 + 2^63 */
1188
0
    felem_reduce(ftmp4, tmp);
1189
1190
0
    x_equal = felem_is_zero(ftmp4);
1191
1192
    /* z_out = ftmp5 * h */
1193
0
    felem_mul(tmp, ftmp5, ftmp4);
1194
0
    felem_reduce(z_out, tmp);
1195
1196
    /* ftmp = z1 * z1z1 */
1197
0
    felem_mul(tmp, ftmp, z1);
1198
0
    felem_reduce(ftmp, tmp);
1199
1200
    /* s2 = tmp = y2 * z1**3 */
1201
0
    felem_mul(tmp, y2, ftmp);
1202
    /* tmp[i] < 17*2^120 */
1203
1204
    /* r = ftmp5 = (s2 - s1)*2 */
1205
0
    felem_diff_128_64(tmp, ftmp6);
1206
    /* tmp[i] < 17*2^120 + 2^63 */
1207
0
    felem_reduce(ftmp5, tmp);
1208
0
    y_equal = felem_is_zero(ftmp5);
1209
0
    felem_scalar64(ftmp5, 2);
1210
    /* ftmp5[i] < 2^61 */
1211
1212
    /*
1213
     * The formulae are incorrect if the points are equal, in affine coordinates
1214
     * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
1215
     * happens.
1216
     *
1217
     * We use bitwise operations to avoid potential side-channels introduced by
1218
     * the short-circuiting behaviour of boolean operators.
1219
     *
1220
     * The special case of either point being the point at infinity (z1 and/or
1221
     * z2 are zero), is handled separately later on in this function, so we
1222
     * avoid jumping to point_double here in those special cases.
1223
     *
1224
     * Notice the comment below on the implications of this branching for timing
1225
     * leaks and why it is considered practically irrelevant.
1226
     */
1227
0
    points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));
1228
1229
0
    if (points_equal) {
1230
        /*
1231
         * This is obviously not constant-time but it will almost-never happen
1232
         * for ECDH / ECDSA. The case where it can happen is during scalar-mult
1233
         * where the intermediate value gets very close to the group order.
1234
         * Since |ossl_ec_GFp_nistp_recode_scalar_bits| produces signed digits
1235
         * for the scalar, it's possible for the intermediate value to be a small
1236
         * negative multiple of the base point, and for the final signed digit
1237
         * to be the same value. We believe that this only occurs for the scalar
1238
         * 1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
1239
         * ffffffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb
1240
         * 71e913863f7, in that case the penultimate intermediate is -9G and
1241
         * the final digit is also -9G. Since this only happens for a single
1242
         * scalar, the timing leak is irrelevant. (Any attacker who wanted to
1243
         * check whether a secret scalar was that exact value, can already do
1244
         * so.)
1245
         */
1246
0
        point_double(x3, y3, z3, x1, y1, z1);
1247
0
        return;
1248
0
    }
1249
1250
    /* I = ftmp = (2h)**2 */
1251
0
    felem_assign(ftmp, ftmp4);
1252
0
    felem_scalar64(ftmp, 2);
1253
    /* ftmp[i] < 2^61 */
1254
0
    felem_square(tmp, ftmp);
1255
    /* tmp[i] < 17*2^122 */
1256
0
    felem_reduce(ftmp, tmp);
1257
1258
    /* J = ftmp2 = h * I */
1259
0
    felem_mul(tmp, ftmp4, ftmp);
1260
0
    felem_reduce(ftmp2, tmp);
1261
1262
    /* V = ftmp4 = U1 * I */
1263
0
    felem_mul(tmp, ftmp3, ftmp);
1264
0
    felem_reduce(ftmp4, tmp);
1265
1266
    /* x_out = r**2 - J - 2V */
1267
0
    felem_square(tmp, ftmp5);
1268
    /* tmp[i] < 17*2^122 */
1269
0
    felem_diff_128_64(tmp, ftmp2);
1270
    /* tmp[i] < 17*2^122 + 2^63 */
1271
0
    felem_assign(ftmp3, ftmp4);
1272
0
    felem_scalar64(ftmp4, 2);
1273
    /* ftmp4[i] < 2^61 */
1274
0
    felem_diff_128_64(tmp, ftmp4);
1275
    /* tmp[i] < 17*2^122 + 2^64 */
1276
0
    felem_reduce(x_out, tmp);
1277
1278
    /* y_out = r(V-x_out) - 2 * s1 * J */
1279
0
    felem_diff64(ftmp3, x_out);
1280
    /*
1281
     * ftmp3[i] < 2^60 + 2^60 = 2^61
1282
     */
1283
0
    felem_mul(tmp, ftmp5, ftmp3);
1284
    /* tmp[i] < 17*2^122 */
1285
0
    felem_mul(tmp2, ftmp6, ftmp2);
1286
    /* tmp2[i] < 17*2^120 */
1287
0
    felem_scalar128(tmp2, 2);
1288
    /* tmp2[i] < 17*2^121 */
1289
0
    felem_diff128(tmp, tmp2);
1290
    /*-
1291
     * tmp[i] < 2^127 - 2^69 + 17*2^122
1292
     *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1293
     *        < 2^127
1294
     */
1295
0
    felem_reduce(y_out, tmp);
1296
1297
0
    copy_conditional(x_out, x2, z1_is_zero);
1298
0
    copy_conditional(x_out, x1, z2_is_zero);
1299
0
    copy_conditional(y_out, y2, z1_is_zero);
1300
0
    copy_conditional(y_out, y1, z2_is_zero);
1301
0
    copy_conditional(z_out, z2, z1_is_zero);
1302
0
    copy_conditional(z_out, z1, z2_is_zero);
1303
0
    felem_assign(x3, x_out);
1304
0
    felem_assign(y3, y_out);
1305
0
    felem_assign(z3, z_out);
1306
0
}
1307
1308
/*-
1309
 * Base point pre computation
1310
 * --------------------------
1311
 *
1312
 * Two different sorts of precomputed tables are used in the following code.
1313
 * Each contain various points on the curve, where each point is three field
1314
 * elements (x, y, z).
1315
 *
1316
 * For the base point table, z is usually 1 (0 for the point at infinity).
1317
 * This table has 16 elements:
1318
 * index | bits    | point
1319
 * ------+---------+------------------------------
1320
 *     0 | 0 0 0 0 | 0G
1321
 *     1 | 0 0 0 1 | 1G
1322
 *     2 | 0 0 1 0 | 2^130G
1323
 *     3 | 0 0 1 1 | (2^130 + 1)G
1324
 *     4 | 0 1 0 0 | 2^260G
1325
 *     5 | 0 1 0 1 | (2^260 + 1)G
1326
 *     6 | 0 1 1 0 | (2^260 + 2^130)G
1327
 *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1328
 *     8 | 1 0 0 0 | 2^390G
1329
 *     9 | 1 0 0 1 | (2^390 + 1)G
1330
 *    10 | 1 0 1 0 | (2^390 + 2^130)G
1331
 *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1332
 *    12 | 1 1 0 0 | (2^390 + 2^260)G
1333
 *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1334
 *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1335
 *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1336
 *
1337
 * The reason for this is so that we can clock bits into four different
1338
 * locations when doing simple scalar multiplies against the base point.
1339
 *
1340
 * Tables for other points have table[i] = iG for i in 0 .. 16. */
1341
1342
/* gmul is the table of precomputed base points */
1343
static const felem gmul[16][3] = {
1344
    { { 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1345
        { 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1346
        { 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
1347
    { { 0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1348
          0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1349
          0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404 },
1350
        { 0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1351
            0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1352
            0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b },
1353
        { 1, 0, 0, 0, 0, 0, 0, 0, 0 } },
1354
    { { 0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1355
          0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1356
          0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5 },
1357
        { 0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1358
            0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1359
            0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7 },
1360
        { 1, 0, 0, 0, 0, 0, 0, 0, 0 } },
1361
    { { 0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1362
          0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1363
          0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9 },
1364
        { 0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1365
            0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1366
            0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe },
1367
        { 1, 0, 0, 0, 0, 0, 0, 0, 0 } },
1368
    { { 0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1369
          0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1370
          0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065 },
1371
        { 0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1372
            0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1373
            0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524 },
1374
        { 1, 0, 0, 0, 0, 0, 0, 0, 0 } },
1375
    { { 0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1376
          0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1377
          0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe },
1378
        { 0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1379
            0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1380
            0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7 },
1381
        { 1, 0, 0, 0, 0, 0, 0, 0, 0 } },
1382
    { { 0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1383
          0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1384
          0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256 },
1385
        { 0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1386
            0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1387
            0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd },
1388
        { 1, 0, 0, 0, 0, 0, 0, 0, 0 } },
1389
    { { 0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1390
          0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1391
          0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23 },
1392
        { 0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1393
            0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1394
            0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e },
1395
        { 1, 0, 0, 0, 0, 0, 0, 0, 0 } },
1396
    { { 0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1397
          0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1398
          0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5 },
1399
        { 0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1400
            0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1401
            0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242 },
1402
        { 1, 0, 0, 0, 0, 0, 0, 0, 0 } },
1403
    { { 0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1404
          0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1405
          0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203 },
1406
        { 0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1407
            0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1408
            0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f },
1409
        { 1, 0, 0, 0, 0, 0, 0, 0, 0 } },
1410
    { { 0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1411
          0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1412
          0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a },
1413
        { 0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1414
            0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1415
            0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a },
1416
        { 1, 0, 0, 0, 0, 0, 0, 0, 0 } },
1417
    { { 0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1418
          0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1419
          0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b },
1420
        { 0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1421
            0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1422
            0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f },
1423
        { 1, 0, 0, 0, 0, 0, 0, 0, 0 } },
1424
    { { 0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1425
          0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1426
          0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf },
1427
        { 0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1428
            0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1429
            0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d },
1430
        { 1, 0, 0, 0, 0, 0, 0, 0, 0 } },
1431
    { { 0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1432
          0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1433
          0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684 },
1434
        { 0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1435
            0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1436
            0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81 },
1437
        { 1, 0, 0, 0, 0, 0, 0, 0, 0 } },
1438
    { { 0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1439
          0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1440
          0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d },
1441
        { 0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1442
            0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1443
            0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42 },
1444
        { 1, 0, 0, 0, 0, 0, 0, 0, 0 } },
1445
    { { 0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1446
          0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1447
          0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f },
1448
        { 0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1449
            0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1450
            0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055 },
1451
        { 1, 0, 0, 0, 0, 0, 0, 0, 0 } }
1452
};
1453
1454
/*
1455
 * select_point selects the |idx|th point from a precomputation table and
1456
 * copies it to out.
1457
 */
1458
/* pre_comp below is of the size provided in |size| */
1459
static void select_point(const limb idx, unsigned int size,
1460
    const felem pre_comp[][3], felem out[3])
1461
0
{
1462
0
    unsigned i, j;
1463
0
    limb *outlimbs = &out[0][0];
1464
1465
0
    memset(out, 0, sizeof(*out) * 3);
1466
1467
0
    for (i = 0; i < size; i++) {
1468
0
        const limb *inlimbs = &pre_comp[i][0][0];
1469
0
        limb mask = i ^ idx;
1470
0
        mask |= mask >> 4;
1471
0
        mask |= mask >> 2;
1472
0
        mask |= mask >> 1;
1473
0
        mask &= 1;
1474
0
        mask--;
1475
0
        for (j = 0; j < NLIMBS * 3; j++)
1476
0
            outlimbs[j] |= inlimbs[j] & mask;
1477
0
    }
1478
0
}
1479
1480
/* get_bit returns the |i|th bit in |in| */
1481
static char get_bit(const felem_bytearray in, int i)
1482
0
{
1483
0
    if (i < 0)
1484
0
        return 0;
1485
0
    return (in[i >> 3] >> (i & 7)) & 1;
1486
0
}
1487
1488
/*
1489
 * Interleaved point multiplication using precomputed point multiples: The
1490
 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1491
 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1492
 * generator, using certain (large) precomputed multiples in g_pre_comp.
1493
 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1494
 */
1495
static void batch_mul(felem x_out, felem y_out, felem z_out,
1496
    const felem_bytearray scalars[],
1497
    const unsigned num_points, const uint8_t *g_scalar,
1498
    const int mixed, const felem pre_comp[][17][3],
1499
    const felem g_pre_comp[16][3])
1500
0
{
1501
0
    int i, skip;
1502
0
    unsigned num, gen_mul = (g_scalar != NULL);
1503
0
    felem nq[3], tmp[4];
1504
0
    limb bits;
1505
0
    uint8_t sign, digit;
1506
1507
    /* set nq to the point at infinity */
1508
0
    memset(nq, 0, sizeof(nq));
1509
1510
    /*
1511
     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1512
     * of the generator (last quarter of rounds) and additions of other
1513
     * points multiples (every 5th round).
1514
     */
1515
0
    skip = 1; /* save two point operations in the first
1516
               * round */
1517
0
    for (i = (num_points ? 520 : 130); i >= 0; --i) {
1518
        /* double */
1519
0
        if (!skip)
1520
0
            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1521
1522
        /* add multiples of the generator */
1523
0
        if (gen_mul && (i <= 130)) {
1524
0
            bits = get_bit(g_scalar, i + 390) << 3;
1525
0
            if (i < 130) {
1526
0
                bits |= get_bit(g_scalar, i + 260) << 2;
1527
0
                bits |= get_bit(g_scalar, i + 130) << 1;
1528
0
                bits |= get_bit(g_scalar, i);
1529
0
            }
1530
            /* select the point to add, in constant time */
1531
0
            select_point(bits, 16, g_pre_comp, tmp);
1532
0
            if (!skip) {
1533
                /* The 1 argument below is for "mixed" */
1534
0
                point_add(nq[0], nq[1], nq[2],
1535
0
                    nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1536
0
            } else {
1537
0
                memcpy(nq, tmp, 3 * sizeof(felem));
1538
0
                skip = 0;
1539
0
            }
1540
0
        }
1541
1542
        /* do other additions every 5 doublings */
1543
0
        if (num_points && (i % 5 == 0)) {
1544
            /* loop over all scalars */
1545
0
            for (num = 0; num < num_points; ++num) {
1546
0
                bits = get_bit(scalars[num], i + 4) << 5;
1547
0
                bits |= get_bit(scalars[num], i + 3) << 4;
1548
0
                bits |= get_bit(scalars[num], i + 2) << 3;
1549
0
                bits |= get_bit(scalars[num], i + 1) << 2;
1550
0
                bits |= get_bit(scalars[num], i) << 1;
1551
0
                bits |= get_bit(scalars[num], i - 1);
1552
0
                ossl_ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1553
1554
                /*
1555
                 * select the point to add or subtract, in constant time
1556
                 */
1557
0
                select_point(digit, 17, pre_comp[num], tmp);
1558
0
                felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1559
                                            * point */
1560
0
                copy_conditional(tmp[1], tmp[3], (-(limb)sign));
1561
1562
0
                if (!skip) {
1563
0
                    point_add(nq[0], nq[1], nq[2],
1564
0
                        nq[0], nq[1], nq[2],
1565
0
                        mixed, tmp[0], tmp[1], tmp[2]);
1566
0
                } else {
1567
0
                    memcpy(nq, tmp, 3 * sizeof(felem));
1568
0
                    skip = 0;
1569
0
                }
1570
0
            }
1571
0
        }
1572
0
    }
1573
0
    felem_assign(x_out, nq[0]);
1574
0
    felem_assign(y_out, nq[1]);
1575
0
    felem_assign(z_out, nq[2]);
1576
0
}
1577
1578
/* Precomputation for the group generator. */
1579
struct nistp521_pre_comp_st {
1580
    felem g_pre_comp[16][3];
1581
    CRYPTO_REF_COUNT references;
1582
};
1583
1584
const EC_METHOD *EC_GFp_nistp521_method(void)
1585
0
{
1586
0
    static const EC_METHOD ret = {
1587
0
        EC_FLAGS_DEFAULT_OCT,
1588
0
        NID_X9_62_prime_field,
1589
0
        ossl_ec_GFp_nistp521_group_init,
1590
0
        ossl_ec_GFp_simple_group_finish,
1591
0
        ossl_ec_GFp_simple_group_clear_finish,
1592
0
        ossl_ec_GFp_nist_group_copy,
1593
0
        ossl_ec_GFp_nistp521_group_set_curve,
1594
0
        ossl_ec_GFp_simple_group_get_curve,
1595
0
        ossl_ec_GFp_simple_group_get_degree,
1596
0
        ossl_ec_group_simple_order_bits,
1597
0
        ossl_ec_GFp_simple_group_check_discriminant,
1598
0
        ossl_ec_GFp_simple_point_init,
1599
0
        ossl_ec_GFp_simple_point_finish,
1600
0
        ossl_ec_GFp_simple_point_clear_finish,
1601
0
        ossl_ec_GFp_simple_point_copy,
1602
0
        ossl_ec_GFp_simple_point_set_to_infinity,
1603
0
        ossl_ec_GFp_simple_point_set_affine_coordinates,
1604
0
        ossl_ec_GFp_nistp521_point_get_affine_coordinates,
1605
0
        0 /* point_set_compressed_coordinates */,
1606
0
        0 /* point2oct */,
1607
0
        0 /* oct2point */,
1608
0
        ossl_ec_GFp_simple_add,
1609
0
        ossl_ec_GFp_simple_dbl,
1610
0
        ossl_ec_GFp_simple_invert,
1611
0
        ossl_ec_GFp_simple_is_at_infinity,
1612
0
        ossl_ec_GFp_simple_is_on_curve,
1613
0
        ossl_ec_GFp_simple_cmp,
1614
0
        ossl_ec_GFp_simple_make_affine,
1615
0
        ossl_ec_GFp_simple_points_make_affine,
1616
0
        ossl_ec_GFp_nistp521_points_mul,
1617
0
        ossl_ec_GFp_nistp521_precompute_mult,
1618
0
        ossl_ec_GFp_nistp521_have_precompute_mult,
1619
0
        ossl_ec_GFp_nist_field_mul,
1620
0
        ossl_ec_GFp_nist_field_sqr,
1621
0
        0 /* field_div */,
1622
0
        ossl_ec_GFp_simple_field_inv,
1623
0
        0 /* field_encode */,
1624
0
        0 /* field_decode */,
1625
0
        0, /* field_set_to_one */
1626
0
        ossl_ec_key_simple_priv2oct,
1627
0
        ossl_ec_key_simple_oct2priv,
1628
0
        0, /* set private */
1629
0
        ossl_ec_key_simple_generate_key,
1630
0
        ossl_ec_key_simple_check_key,
1631
0
        ossl_ec_key_simple_generate_public_key,
1632
0
        0, /* keycopy */
1633
0
        0, /* keyfinish */
1634
0
        ossl_ecdh_simple_compute_key,
1635
0
        ossl_ecdsa_simple_sign_setup,
1636
0
        ossl_ecdsa_simple_sign_sig,
1637
0
        ossl_ecdsa_simple_verify_sig,
1638
0
        0, /* field_inverse_mod_ord */
1639
0
        0, /* blind_coordinates */
1640
0
        0, /* ladder_pre */
1641
0
        0, /* ladder_step */
1642
0
        0 /* ladder_post */
1643
0
    };
1644
1645
0
    return &ret;
1646
0
}
1647
1648
/******************************************************************************/
1649
/*
1650
 * FUNCTIONS TO MANAGE PRECOMPUTATION
1651
 */
1652
1653
static NISTP521_PRE_COMP *nistp521_pre_comp_new(void)
1654
0
{
1655
0
    NISTP521_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1656
1657
0
    if (ret == NULL)
1658
0
        return ret;
1659
1660
0
    if (!CRYPTO_NEW_REF(&ret->references, 1)) {
1661
0
        OPENSSL_free(ret);
1662
0
        return NULL;
1663
0
    }
1664
0
    return ret;
1665
0
}
1666
1667
NISTP521_PRE_COMP *EC_nistp521_pre_comp_dup(NISTP521_PRE_COMP *p)
1668
0
{
1669
0
    int i;
1670
0
    if (p != NULL)
1671
0
        CRYPTO_UP_REF(&p->references, &i);
1672
0
    return p;
1673
0
}
1674
1675
void EC_nistp521_pre_comp_free(NISTP521_PRE_COMP *p)
1676
0
{
1677
0
    int i;
1678
1679
0
    if (p == NULL)
1680
0
        return;
1681
1682
0
    CRYPTO_DOWN_REF(&p->references, &i);
1683
0
    REF_PRINT_COUNT("EC_nistp521", i, p);
1684
0
    if (i > 0)
1685
0
        return;
1686
0
    REF_ASSERT_ISNT(i < 0);
1687
1688
0
    CRYPTO_FREE_REF(&p->references);
1689
0
    OPENSSL_free(p);
1690
0
}
1691
1692
/******************************************************************************/
1693
/*
1694
 * OPENSSL EC_METHOD FUNCTIONS
1695
 */
1696
1697
int ossl_ec_GFp_nistp521_group_init(EC_GROUP *group)
1698
0
{
1699
0
    int ret;
1700
0
    ret = ossl_ec_GFp_simple_group_init(group);
1701
0
    group->a_is_minus3 = 1;
1702
0
    return ret;
1703
0
}
1704
1705
int ossl_ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1706
    const BIGNUM *a, const BIGNUM *b,
1707
    BN_CTX *ctx)
1708
0
{
1709
0
    int ret = 0;
1710
0
    BIGNUM *curve_p, *curve_a, *curve_b;
1711
0
#ifndef FIPS_MODULE
1712
0
    BN_CTX *new_ctx = NULL;
1713
1714
0
    if (ctx == NULL)
1715
0
        ctx = new_ctx = BN_CTX_new();
1716
0
#endif
1717
0
    if (ctx == NULL)
1718
0
        return 0;
1719
1720
0
    BN_CTX_start(ctx);
1721
0
    curve_p = BN_CTX_get(ctx);
1722
0
    curve_a = BN_CTX_get(ctx);
1723
0
    curve_b = BN_CTX_get(ctx);
1724
0
    if (curve_b == NULL)
1725
0
        goto err;
1726
0
    BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1727
0
    BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1728
0
    BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1729
0
    if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1730
0
        ERR_raise(ERR_LIB_EC, EC_R_WRONG_CURVE_PARAMETERS);
1731
0
        goto err;
1732
0
    }
1733
0
    group->field_mod_func = BN_nist_mod_521;
1734
0
    ret = ossl_ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1735
0
err:
1736
0
    BN_CTX_end(ctx);
1737
0
#ifndef FIPS_MODULE
1738
0
    BN_CTX_free(new_ctx);
1739
0
#endif
1740
0
    return ret;
1741
0
}
1742
1743
/*
1744
 * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1745
 * (X/Z^2, Y/Z^3)
1746
 */
1747
int ossl_ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1748
    const EC_POINT *point,
1749
    BIGNUM *x, BIGNUM *y,
1750
    BN_CTX *ctx)
1751
0
{
1752
0
    felem z1, z2, x_in, y_in, x_out, y_out;
1753
0
    largefelem tmp;
1754
1755
0
    if (EC_POINT_is_at_infinity(group, point)) {
1756
0
        ERR_raise(ERR_LIB_EC, EC_R_POINT_AT_INFINITY);
1757
0
        return 0;
1758
0
    }
1759
0
    if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) || (!BN_to_felem(z1, point->Z)))
1760
0
        return 0;
1761
0
    felem_inv(z2, z1);
1762
0
    felem_square(tmp, z2);
1763
0
    felem_reduce(z1, tmp);
1764
0
    felem_mul(tmp, x_in, z1);
1765
0
    felem_reduce(x_in, tmp);
1766
0
    felem_contract(x_out, x_in);
1767
0
    if (x != NULL) {
1768
0
        if (!felem_to_BN(x, x_out)) {
1769
0
            ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1770
0
            return 0;
1771
0
        }
1772
0
    }
1773
0
    felem_mul(tmp, z1, z2);
1774
0
    felem_reduce(z1, tmp);
1775
0
    felem_mul(tmp, y_in, z1);
1776
0
    felem_reduce(y_in, tmp);
1777
0
    felem_contract(y_out, y_in);
1778
0
    if (y != NULL) {
1779
0
        if (!felem_to_BN(y, y_out)) {
1780
0
            ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1781
0
            return 0;
1782
0
        }
1783
0
    }
1784
0
    return 1;
1785
0
}
1786
1787
/* points below is of size |num|, and tmp_felems is of size |num+1/ */
1788
static void make_points_affine(size_t num, felem points[][3],
1789
    felem tmp_felems[])
1790
0
{
1791
    /*
1792
     * Runs in constant time, unless an input is the point at infinity (which
1793
     * normally shouldn't happen).
1794
     */
1795
0
    ossl_ec_GFp_nistp_points_make_affine_internal(num,
1796
0
        points,
1797
0
        sizeof(felem),
1798
0
        tmp_felems,
1799
0
        (void (*)(void *))felem_one,
1800
0
        felem_is_zero_int,
1801
0
        (void (*)(void *, const void *))
1802
0
            felem_assign,
1803
0
        (void (*)(void *, const void *))
1804
0
            felem_square_reduce,
1805
0
        (void (*)(void *,
1806
0
            const void
1807
0
                *,
1808
0
            const void
1809
0
                *))
1810
0
            felem_mul_reduce,
1811
0
        (void (*)(void *, const void *))
1812
0
            felem_inv,
1813
0
        (void (*)(void *, const void *))
1814
0
            felem_contract);
1815
0
}
1816
1817
/*
1818
 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1819
 * values Result is stored in r (r can equal one of the inputs).
1820
 */
1821
int ossl_ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1822
    const BIGNUM *scalar, size_t num,
1823
    const EC_POINT *points[],
1824
    const BIGNUM *scalars[], BN_CTX *ctx)
1825
0
{
1826
0
    int ret = 0;
1827
0
    int j;
1828
0
    int mixed = 0;
1829
0
    BIGNUM *x, *y, *z, *tmp_scalar;
1830
0
    felem_bytearray g_secret;
1831
0
    felem_bytearray *secrets = NULL;
1832
0
    felem(*pre_comp)[17][3] = NULL;
1833
0
    felem *tmp_felems = NULL;
1834
0
    unsigned i;
1835
0
    int num_bytes;
1836
0
    int have_pre_comp = 0;
1837
0
    size_t num_points = num;
1838
0
    felem x_in, y_in, z_in, x_out, y_out, z_out;
1839
0
    NISTP521_PRE_COMP *pre = NULL;
1840
0
    felem(*g_pre_comp)[3] = NULL;
1841
0
    EC_POINT *generator = NULL;
1842
0
    const EC_POINT *p = NULL;
1843
0
    const BIGNUM *p_scalar = NULL;
1844
1845
0
    BN_CTX_start(ctx);
1846
0
    x = BN_CTX_get(ctx);
1847
0
    y = BN_CTX_get(ctx);
1848
0
    z = BN_CTX_get(ctx);
1849
0
    tmp_scalar = BN_CTX_get(ctx);
1850
0
    if (tmp_scalar == NULL)
1851
0
        goto err;
1852
1853
0
    if (scalar != NULL) {
1854
0
        pre = group->pre_comp.nistp521;
1855
0
        if (pre)
1856
            /* we have precomputation, try to use it */
1857
0
            g_pre_comp = &pre->g_pre_comp[0];
1858
0
        else
1859
            /* try to use the standard precomputation */
1860
0
            g_pre_comp = (felem(*)[3])gmul;
1861
0
        generator = EC_POINT_new(group);
1862
0
        if (generator == NULL)
1863
0
            goto err;
1864
        /* get the generator from precomputation */
1865
0
        if (!felem_to_BN(x, g_pre_comp[1][0]) || !felem_to_BN(y, g_pre_comp[1][1]) || !felem_to_BN(z, g_pre_comp[1][2])) {
1866
0
            ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1867
0
            goto err;
1868
0
        }
1869
0
        if (!ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group,
1870
0
                generator,
1871
0
                x, y, z, ctx))
1872
0
            goto err;
1873
0
        if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1874
            /* precomputation matches generator */
1875
0
            have_pre_comp = 1;
1876
0
        else
1877
            /*
1878
             * we don't have valid precomputation: treat the generator as a
1879
             * random point
1880
             */
1881
0
            num_points++;
1882
0
    }
1883
1884
0
    if (num_points > 0) {
1885
0
        if (num_points >= 2) {
1886
            /*
1887
             * unless we precompute multiples for just one point, converting
1888
             * those into affine form is time well spent
1889
             */
1890
0
            mixed = 1;
1891
0
        }
1892
0
        secrets = OPENSSL_calloc(num_points, sizeof(*secrets));
1893
0
        pre_comp = OPENSSL_calloc(num_points, sizeof(*pre_comp));
1894
0
        if (mixed)
1895
0
            tmp_felems = OPENSSL_malloc_array(num_points * 17 + 1, sizeof(*tmp_felems));
1896
0
        if ((secrets == NULL) || (pre_comp == NULL)
1897
0
            || (mixed && (tmp_felems == NULL)))
1898
0
            goto err;
1899
1900
        /*
1901
         * we treat NULL scalars as 0, and NULL points as points at infinity,
1902
         * i.e., they contribute nothing to the linear combination
1903
         */
1904
0
        for (i = 0; i < num_points; ++i) {
1905
0
            if (i == num) {
1906
                /*
1907
                 * we didn't have a valid precomputation, so we pick the
1908
                 * generator
1909
                 */
1910
0
                p = EC_GROUP_get0_generator(group);
1911
0
                p_scalar = scalar;
1912
0
            } else {
1913
                /* the i^th point */
1914
0
                p = points[i];
1915
0
                p_scalar = scalars[i];
1916
0
            }
1917
0
            if ((p_scalar != NULL) && (p != NULL)) {
1918
                /* reduce scalar to 0 <= scalar < 2^521 */
1919
0
                if ((BN_num_bits(p_scalar) > 521)
1920
0
                    || (BN_is_negative(p_scalar))) {
1921
                    /*
1922
                     * this is an unusual input, and we don't guarantee
1923
                     * constant-timeness
1924
                     */
1925
0
                    if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1926
0
                        ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1927
0
                        goto err;
1928
0
                    }
1929
0
                    num_bytes = BN_bn2lebinpad(tmp_scalar,
1930
0
                        secrets[i], sizeof(secrets[i]));
1931
0
                } else {
1932
0
                    num_bytes = BN_bn2lebinpad(p_scalar,
1933
0
                        secrets[i], sizeof(secrets[i]));
1934
0
                }
1935
0
                if (num_bytes < 0) {
1936
0
                    ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1937
0
                    goto err;
1938
0
                }
1939
                /* precompute multiples */
1940
0
                if ((!BN_to_felem(x_out, p->X)) || (!BN_to_felem(y_out, p->Y)) || (!BN_to_felem(z_out, p->Z)))
1941
0
                    goto err;
1942
0
                memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1943
0
                memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1944
0
                memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1945
0
                for (j = 2; j <= 16; ++j) {
1946
0
                    if (j & 1) {
1947
0
                        point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1948
0
                            pre_comp[i][j][2], pre_comp[i][1][0],
1949
0
                            pre_comp[i][1][1], pre_comp[i][1][2], 0,
1950
0
                            pre_comp[i][j - 1][0],
1951
0
                            pre_comp[i][j - 1][1],
1952
0
                            pre_comp[i][j - 1][2]);
1953
0
                    } else {
1954
0
                        point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1955
0
                            pre_comp[i][j][2], pre_comp[i][j / 2][0],
1956
0
                            pre_comp[i][j / 2][1],
1957
0
                            pre_comp[i][j / 2][2]);
1958
0
                    }
1959
0
                }
1960
0
            }
1961
0
        }
1962
0
        if (mixed)
1963
0
            make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1964
0
    }
1965
1966
    /* the scalar for the generator */
1967
0
    if ((scalar != NULL) && (have_pre_comp)) {
1968
0
        memset(g_secret, 0, sizeof(g_secret));
1969
        /* reduce scalar to 0 <= scalar < 2^521 */
1970
0
        if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
1971
            /*
1972
             * this is an unusual input, and we don't guarantee
1973
             * constant-timeness
1974
             */
1975
0
            if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
1976
0
                ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1977
0
                goto err;
1978
0
            }
1979
0
            num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
1980
0
        } else {
1981
0
            num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
1982
0
        }
1983
        /* do the multiplication with generator precomputation */
1984
0
        batch_mul(x_out, y_out, z_out,
1985
0
            (const felem_bytearray(*))secrets, num_points,
1986
0
            g_secret,
1987
0
            mixed, (const felem(*)[17][3])pre_comp,
1988
0
            (const felem(*)[3])g_pre_comp);
1989
0
    } else {
1990
        /* do the multiplication without generator precomputation */
1991
0
        batch_mul(x_out, y_out, z_out,
1992
0
            (const felem_bytearray(*))secrets, num_points,
1993
0
            NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1994
0
    }
1995
    /* reduce the output to its unique minimal representation */
1996
0
    felem_contract(x_in, x_out);
1997
0
    felem_contract(y_in, y_out);
1998
0
    felem_contract(z_in, z_out);
1999
0
    if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) || (!felem_to_BN(z, z_in))) {
2000
0
        ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
2001
0
        goto err;
2002
0
    }
2003
0
    ret = ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group, r, x, y, z,
2004
0
        ctx);
2005
2006
0
err:
2007
0
    BN_CTX_end(ctx);
2008
0
    EC_POINT_free(generator);
2009
0
    OPENSSL_free(secrets);
2010
0
    OPENSSL_free(pre_comp);
2011
0
    OPENSSL_free(tmp_felems);
2012
0
    return ret;
2013
0
}
2014
2015
int ossl_ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2016
0
{
2017
0
    int ret = 0;
2018
0
    NISTP521_PRE_COMP *pre = NULL;
2019
0
    int i, j;
2020
0
    BIGNUM *x, *y;
2021
0
    EC_POINT *generator = NULL;
2022
0
    felem tmp_felems[16];
2023
0
#ifndef FIPS_MODULE
2024
0
    BN_CTX *new_ctx = NULL;
2025
0
#endif
2026
2027
    /* throw away old precomputation */
2028
0
    EC_pre_comp_free(group);
2029
2030
0
#ifndef FIPS_MODULE
2031
0
    if (ctx == NULL)
2032
0
        ctx = new_ctx = BN_CTX_new();
2033
0
#endif
2034
0
    if (ctx == NULL)
2035
0
        return 0;
2036
2037
0
    BN_CTX_start(ctx);
2038
0
    x = BN_CTX_get(ctx);
2039
0
    y = BN_CTX_get(ctx);
2040
0
    if (y == NULL)
2041
0
        goto err;
2042
    /* get the generator */
2043
0
    if (group->generator == NULL)
2044
0
        goto err;
2045
0
    generator = EC_POINT_new(group);
2046
0
    if (generator == NULL)
2047
0
        goto err;
2048
0
    BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2049
0
    BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2050
0
    if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
2051
0
        goto err;
2052
0
    if ((pre = nistp521_pre_comp_new()) == NULL)
2053
0
        goto err;
2054
    /*
2055
     * if the generator is the standard one, use built-in precomputation
2056
     */
2057
0
    if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2058
0
        memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2059
0
        goto done;
2060
0
    }
2061
0
    if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) || (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) || (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
2062
0
        goto err;
2063
    /* compute 2^130*G, 2^260*G, 2^390*G */
2064
0
    for (i = 1; i <= 4; i <<= 1) {
2065
0
        point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2066
0
            pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2067
0
            pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2068
0
        for (j = 0; j < 129; ++j) {
2069
0
            point_double(pre->g_pre_comp[2 * i][0],
2070
0
                pre->g_pre_comp[2 * i][1],
2071
0
                pre->g_pre_comp[2 * i][2],
2072
0
                pre->g_pre_comp[2 * i][0],
2073
0
                pre->g_pre_comp[2 * i][1],
2074
0
                pre->g_pre_comp[2 * i][2]);
2075
0
        }
2076
0
    }
2077
    /* g_pre_comp[0] is the point at infinity */
2078
0
    memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2079
    /* the remaining multiples */
2080
    /* 2^130*G + 2^260*G */
2081
0
    point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2082
0
        pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2083
0
        pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2084
0
        0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2085
0
        pre->g_pre_comp[2][2]);
2086
    /* 2^130*G + 2^390*G */
2087
0
    point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2088
0
        pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2089
0
        pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2090
0
        0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2091
0
        pre->g_pre_comp[2][2]);
2092
    /* 2^260*G + 2^390*G */
2093
0
    point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2094
0
        pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2095
0
        pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2096
0
        0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2097
0
        pre->g_pre_comp[4][2]);
2098
    /* 2^130*G + 2^260*G + 2^390*G */
2099
0
    point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2100
0
        pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2101
0
        pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2102
0
        0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2103
0
        pre->g_pre_comp[2][2]);
2104
0
    for (i = 1; i < 8; ++i) {
2105
        /* odd multiples: add G */
2106
0
        point_add(pre->g_pre_comp[2 * i + 1][0],
2107
0
            pre->g_pre_comp[2 * i + 1][1],
2108
0
            pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2109
0
            pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2110
0
            pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2111
0
            pre->g_pre_comp[1][2]);
2112
0
    }
2113
0
    make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2114
2115
0
done:
2116
0
    SETPRECOMP(group, nistp521, pre);
2117
0
    ret = 1;
2118
0
    pre = NULL;
2119
0
err:
2120
0
    BN_CTX_end(ctx);
2121
0
    EC_POINT_free(generator);
2122
0
#ifndef FIPS_MODULE
2123
0
    BN_CTX_free(new_ctx);
2124
0
#endif
2125
0
    EC_nistp521_pre_comp_free(pre);
2126
0
    return ret;
2127
0
}
2128
2129
int ossl_ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2130
0
{
2131
    return HAVEPRECOMP(group, nistp521);
2132
0
}