Coverage Report

Created: 2026-06-08 07:04

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/BearSSL/src/ec/ec_p256_m31.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3
 *
4
 * Permission is hereby granted, free of charge, to any person obtaining 
5
 * a copy of this software and associated documentation files (the
6
 * "Software"), to deal in the Software without restriction, including
7
 * without limitation the rights to use, copy, modify, merge, publish,
8
 * distribute, sublicense, and/or sell copies of the Software, and to
9
 * permit persons to whom the Software is furnished to do so, subject to
10
 * the following conditions:
11
 *
12
 * The above copyright notice and this permission notice shall be 
13
 * included in all copies or substantial portions of the Software.
14
 *
15
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
16
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
18
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
 * SOFTWARE.
23
 */
24
25
#include "inner.h"
26
27
/*
28
 * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
29
 * that right-shifting a signed negative integer copies the sign bit
30
 * (arithmetic right-shift). This is "implementation-defined behaviour",
31
 * i.e. it is not undefined, but it may differ between compilers. Each
32
 * compiler is supposed to document its behaviour in that respect. GCC
33
 * explicitly defines that an arithmetic right shift is used. We expect
34
 * all other compilers to do the same, because underlying CPU offer an
35
 * arithmetic right shift opcode that could not be used otherwise.
36
 */
37
#if BR_NO_ARITH_SHIFT
38
#define ARSH(x, n)    (((uint32_t)(x) >> (n)) \
39
                      | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
40
#define ARSHW(x, n)   (((uint64_t)(x) >> (n)) \
41
                      | ((-((uint64_t)(x) >> 63)) << (64 - (n))))
42
#else
43
16.4M
#define ARSH(x, n)    ((*(int32_t *)&(x)) >> (n))
44
29.5M
#define ARSHW(x, n)   ((*(int64_t *)&(x)) >> (n))
45
#endif
46
47
/*
48
 * Convert an integer from unsigned big-endian encoding to a sequence of
49
 * 30-bit words in little-endian order. The final "partial" word is
50
 * returned.
51
 */
52
static uint32_t
53
be8_to_le30(uint32_t *dst, const unsigned char *src, size_t len)
54
84
{
55
84
  uint32_t acc;
56
84
  int acc_len;
57
58
84
  acc = 0;
59
84
  acc_len = 0;
60
2.77k
  while (len -- > 0) {
61
2.68k
    uint32_t b;
62
63
2.68k
    b = src[len];
64
2.68k
    if (acc_len < 22) {
65
2.01k
      acc |= b << acc_len;
66
2.01k
      acc_len += 8;
67
2.01k
    } else {
68
672
      *dst ++ = (acc | (b << acc_len)) & 0x3FFFFFFF;
69
672
      acc = b >> (30 - acc_len);
70
672
      acc_len -= 22;
71
672
    }
72
2.68k
  }
73
84
  return acc;
74
84
}
75
76
/*
77
 * Convert an integer (30-bit words, little-endian) to unsigned
78
 * big-endian encoding. The total encoding length is provided; all
79
 * the destination bytes will be filled.
80
 */
81
static void
82
le30_to_be8(unsigned char *dst, size_t len, const uint32_t *src)
83
228
{
84
228
  uint32_t acc;
85
228
  int acc_len;
86
87
228
  acc = 0;
88
228
  acc_len = 0;
89
7.52k
  while (len -- > 0) {
90
7.29k
    if (acc_len < 8) {
91
2.05k
      uint32_t w;
92
93
2.05k
      w = *src ++;
94
2.05k
      dst[len] = (unsigned char)(acc | (w << acc_len));
95
2.05k
      acc = w >> (8 - acc_len);
96
2.05k
      acc_len += 22;
97
5.24k
    } else {
98
5.24k
      dst[len] = (unsigned char)acc;
99
5.24k
      acc >>= 8;
100
5.24k
      acc_len -= 8;
101
5.24k
    }
102
7.29k
  }
103
228
}
104
105
/*
106
 * Multiply two integers. Source integers are represented as arrays of
107
 * nine 30-bit words, for values up to 2^270-1. Result is encoded over
108
 * 18 words of 30 bits each.
109
 */
110
static void
111
mul9(uint32_t *d, const uint32_t *a, const uint32_t *b)
112
354k
{
113
  /*
114
   * Maximum intermediate result is no more than
115
   * 10376293531797946367, which fits in 64 bits. Reason:
116
   *
117
   *   10376293531797946367 = 9 * (2^30-1)^2 + 9663676406
118
   *   10376293531797946367 < 9663676407 * 2^30
119
   *
120
   * Thus, adding together 9 products of 30-bit integers, with
121
   * a carry of at most 9663676406, yields an integer that fits
122
   * on 64 bits and generates a carry of at most 9663676406.
123
   */
124
354k
  uint64_t t[17];
125
354k
  uint64_t cc;
126
354k
  int i;
127
128
354k
  t[ 0] = MUL31(a[0], b[0]);
129
354k
  t[ 1] = MUL31(a[0], b[1])
130
354k
    + MUL31(a[1], b[0]);
131
354k
  t[ 2] = MUL31(a[0], b[2])
132
354k
    + MUL31(a[1], b[1])
133
354k
    + MUL31(a[2], b[0]);
134
354k
  t[ 3] = MUL31(a[0], b[3])
135
354k
    + MUL31(a[1], b[2])
136
354k
    + MUL31(a[2], b[1])
137
354k
    + MUL31(a[3], b[0]);
138
354k
  t[ 4] = MUL31(a[0], b[4])
139
354k
    + MUL31(a[1], b[3])
140
354k
    + MUL31(a[2], b[2])
141
354k
    + MUL31(a[3], b[1])
142
354k
    + MUL31(a[4], b[0]);
143
354k
  t[ 5] = MUL31(a[0], b[5])
144
354k
    + MUL31(a[1], b[4])
145
354k
    + MUL31(a[2], b[3])
146
354k
    + MUL31(a[3], b[2])
147
354k
    + MUL31(a[4], b[1])
148
354k
    + MUL31(a[5], b[0]);
149
354k
  t[ 6] = MUL31(a[0], b[6])
150
354k
    + MUL31(a[1], b[5])
151
354k
    + MUL31(a[2], b[4])
152
354k
    + MUL31(a[3], b[3])
153
354k
    + MUL31(a[4], b[2])
154
354k
    + MUL31(a[5], b[1])
155
354k
    + MUL31(a[6], b[0]);
156
354k
  t[ 7] = MUL31(a[0], b[7])
157
354k
    + MUL31(a[1], b[6])
158
354k
    + MUL31(a[2], b[5])
159
354k
    + MUL31(a[3], b[4])
160
354k
    + MUL31(a[4], b[3])
161
354k
    + MUL31(a[5], b[2])
162
354k
    + MUL31(a[6], b[1])
163
354k
    + MUL31(a[7], b[0]);
164
354k
  t[ 8] = MUL31(a[0], b[8])
165
354k
    + MUL31(a[1], b[7])
166
354k
    + MUL31(a[2], b[6])
167
354k
    + MUL31(a[3], b[5])
168
354k
    + MUL31(a[4], b[4])
169
354k
    + MUL31(a[5], b[3])
170
354k
    + MUL31(a[6], b[2])
171
354k
    + MUL31(a[7], b[1])
172
354k
    + MUL31(a[8], b[0]);
173
354k
  t[ 9] = MUL31(a[1], b[8])
174
354k
    + MUL31(a[2], b[7])
175
354k
    + MUL31(a[3], b[6])
176
354k
    + MUL31(a[4], b[5])
177
354k
    + MUL31(a[5], b[4])
178
354k
    + MUL31(a[6], b[3])
179
354k
    + MUL31(a[7], b[2])
180
354k
    + MUL31(a[8], b[1]);
181
354k
  t[10] = MUL31(a[2], b[8])
182
354k
    + MUL31(a[3], b[7])
183
354k
    + MUL31(a[4], b[6])
184
354k
    + MUL31(a[5], b[5])
185
354k
    + MUL31(a[6], b[4])
186
354k
    + MUL31(a[7], b[3])
187
354k
    + MUL31(a[8], b[2]);
188
354k
  t[11] = MUL31(a[3], b[8])
189
354k
    + MUL31(a[4], b[7])
190
354k
    + MUL31(a[5], b[6])
191
354k
    + MUL31(a[6], b[5])
192
354k
    + MUL31(a[7], b[4])
193
354k
    + MUL31(a[8], b[3]);
194
354k
  t[12] = MUL31(a[4], b[8])
195
354k
    + MUL31(a[5], b[7])
196
354k
    + MUL31(a[6], b[6])
197
354k
    + MUL31(a[7], b[5])
198
354k
    + MUL31(a[8], b[4]);
199
354k
  t[13] = MUL31(a[5], b[8])
200
354k
    + MUL31(a[6], b[7])
201
354k
    + MUL31(a[7], b[6])
202
354k
    + MUL31(a[8], b[5]);
203
354k
  t[14] = MUL31(a[6], b[8])
204
354k
    + MUL31(a[7], b[7])
205
354k
    + MUL31(a[8], b[6]);
206
354k
  t[15] = MUL31(a[7], b[8])
207
354k
    + MUL31(a[8], b[7]);
208
354k
  t[16] = MUL31(a[8], b[8]);
209
210
  /*
211
   * Propagate carries.
212
   */
213
354k
  cc = 0;
214
6.37M
  for (i = 0; i < 17; i ++) {
215
6.01M
    uint64_t w;
216
217
6.01M
    w = t[i] + cc;
218
6.01M
    d[i] = (uint32_t)w & 0x3FFFFFFF;
219
6.01M
    cc = w >> 30;
220
6.01M
  }
221
354k
  d[17] = (uint32_t)cc;
222
354k
}
223
224
/*
225
 * Square a 270-bit integer, represented as an array of nine 30-bit words.
226
 * Result uses 18 words of 30 bits each.
227
 */
228
static void
229
square9(uint32_t *d, const uint32_t *a)
230
288k
{
231
288k
  uint64_t t[17];
232
288k
  uint64_t cc;
233
288k
  int i;
234
235
288k
  t[ 0] = MUL31(a[0], a[0]);
236
288k
  t[ 1] = ((MUL31(a[0], a[1])) << 1);
237
288k
  t[ 2] = MUL31(a[1], a[1])
238
288k
    + ((MUL31(a[0], a[2])) << 1);
239
288k
  t[ 3] = ((MUL31(a[0], a[3])
240
288k
    + MUL31(a[1], a[2])) << 1);
241
288k
  t[ 4] = MUL31(a[2], a[2])
242
288k
    + ((MUL31(a[0], a[4])
243
288k
    + MUL31(a[1], a[3])) << 1);
244
288k
  t[ 5] = ((MUL31(a[0], a[5])
245
288k
    + MUL31(a[1], a[4])
246
288k
    + MUL31(a[2], a[3])) << 1);
247
288k
  t[ 6] = MUL31(a[3], a[3])
248
288k
    + ((MUL31(a[0], a[6])
249
288k
    + MUL31(a[1], a[5])
250
288k
    + MUL31(a[2], a[4])) << 1);
251
288k
  t[ 7] = ((MUL31(a[0], a[7])
252
288k
    + MUL31(a[1], a[6])
253
288k
    + MUL31(a[2], a[5])
254
288k
    + MUL31(a[3], a[4])) << 1);
255
288k
  t[ 8] = MUL31(a[4], a[4])
256
288k
    + ((MUL31(a[0], a[8])
257
288k
    + MUL31(a[1], a[7])
258
288k
    + MUL31(a[2], a[6])
259
288k
    + MUL31(a[3], a[5])) << 1);
260
288k
  t[ 9] = ((MUL31(a[1], a[8])
261
288k
    + MUL31(a[2], a[7])
262
288k
    + MUL31(a[3], a[6])
263
288k
    + MUL31(a[4], a[5])) << 1);
264
288k
  t[10] = MUL31(a[5], a[5])
265
288k
    + ((MUL31(a[2], a[8])
266
288k
    + MUL31(a[3], a[7])
267
288k
    + MUL31(a[4], a[6])) << 1);
268
288k
  t[11] = ((MUL31(a[3], a[8])
269
288k
    + MUL31(a[4], a[7])
270
288k
    + MUL31(a[5], a[6])) << 1);
271
288k
  t[12] = MUL31(a[6], a[6])
272
288k
    + ((MUL31(a[4], a[8])
273
288k
    + MUL31(a[5], a[7])) << 1);
274
288k
  t[13] = ((MUL31(a[5], a[8])
275
288k
    + MUL31(a[6], a[7])) << 1);
276
288k
  t[14] = MUL31(a[7], a[7])
277
288k
    + ((MUL31(a[6], a[8])) << 1);
278
288k
  t[15] = ((MUL31(a[7], a[8])) << 1);
279
288k
  t[16] = MUL31(a[8], a[8]);
280
281
  /*
282
   * Propagate carries.
283
   */
284
288k
  cc = 0;
285
5.18M
  for (i = 0; i < 17; i ++) {
286
4.89M
    uint64_t w;
287
288
4.89M
    w = t[i] + cc;
289
4.89M
    d[i] = (uint32_t)w & 0x3FFFFFFF;
290
4.89M
    cc = w >> 30;
291
4.89M
  }
292
288k
  d[17] = (uint32_t)cc;
293
288k
}
294
295
/*
296
 * Base field modulus for P-256.
297
 */
298
static const uint32_t F256[] = {
299
300
  0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x0000003F, 0x00000000,
301
  0x00000000, 0x00001000, 0x3FFFC000, 0x0000FFFF
302
};
303
304
/*
305
 * The 'b' curve equation coefficient for P-256.
306
 */
307
static const uint32_t P256_B[] = {
308
309
  0x27D2604B, 0x2F38F0F8, 0x053B0F63, 0x0741AC33, 0x1886BC65,
310
  0x2EF555DA, 0x293E7B3E, 0x0D762A8E, 0x00005AC6
311
};
312
313
/*
314
 * Addition in the field. Source operands shall fit on 257 bits; output
315
 * will be lower than twice the modulus.
316
 */
317
static void
318
add_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
319
356k
{
320
356k
  uint32_t w, cc;
321
356k
  int i;
322
323
356k
  cc = 0;
324
3.56M
  for (i = 0; i < 9; i ++) {
325
3.20M
    w = a[i] + b[i] + cc;
326
3.20M
    d[i] = w & 0x3FFFFFFF;
327
3.20M
    cc = w >> 30;
328
3.20M
  }
329
356k
  w >>= 16;
330
356k
  d[8] &= 0xFFFF;
331
356k
  d[3] -= w << 6;
332
356k
  d[6] -= w << 12;
333
356k
  d[7] += w << 14;
334
356k
  cc = w;
335
3.56M
  for (i = 0; i < 9; i ++) {
336
3.20M
    w = d[i] + cc;
337
3.20M
    d[i] = w & 0x3FFFFFFF;
338
3.20M
    cc = ARSH(w, 30);
339
3.20M
  }
340
356k
}
341
342
/*
343
 * Subtraction in the field. Source operands shall be smaller than twice
344
 * the modulus; the result will fulfil the same property.
345
 */
346
static void
347
sub_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
348
363k
{
349
363k
  uint32_t w, cc;
350
363k
  int i;
351
352
  /*
353
   * We really compute a - b + 2*p to make sure that the result is
354
   * positive.
355
   */
356
363k
  w = a[0] - b[0] - 0x00002;
357
363k
  d[0] = w & 0x3FFFFFFF;
358
363k
  w = a[1] - b[1] + ARSH(w, 30);
359
363k
  d[1] = w & 0x3FFFFFFF;
360
363k
  w = a[2] - b[2] + ARSH(w, 30);
361
363k
  d[2] = w & 0x3FFFFFFF;
362
363k
  w = a[3] - b[3] + ARSH(w, 30) + 0x00080;
363
363k
  d[3] = w & 0x3FFFFFFF;
364
363k
  w = a[4] - b[4] + ARSH(w, 30);
365
363k
  d[4] = w & 0x3FFFFFFF;
366
363k
  w = a[5] - b[5] + ARSH(w, 30);
367
363k
  d[5] = w & 0x3FFFFFFF;
368
363k
  w = a[6] - b[6] + ARSH(w, 30) + 0x02000;
369
363k
  d[6] = w & 0x3FFFFFFF;
370
363k
  w = a[7] - b[7] + ARSH(w, 30) - 0x08000;
371
363k
  d[7] = w & 0x3FFFFFFF;
372
363k
  w = a[8] - b[8] + ARSH(w, 30) + 0x20000;
373
363k
  d[8] = w & 0xFFFF;
374
363k
  w >>= 16;
375
363k
  d[8] &= 0xFFFF;
376
363k
  d[3] -= w << 6;
377
363k
  d[6] -= w << 12;
378
363k
  d[7] += w << 14;
379
363k
  cc = w;
380
3.63M
  for (i = 0; i < 9; i ++) {
381
3.26M
    w = d[i] + cc;
382
3.26M
    d[i] = w & 0x3FFFFFFF;
383
3.26M
    cc = ARSH(w, 30);
384
3.26M
  }
385
363k
}
386
387
/*
388
 * Compute a multiplication in F256. Source operands shall be less than
389
 * twice the modulus.
390
 */
391
static void
392
mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
393
354k
{
394
354k
  uint32_t t[18];
395
354k
  uint64_t s[18];
396
354k
  uint64_t cc, x;
397
354k
  uint32_t z, c;
398
354k
  int i;
399
400
354k
  mul9(t, a, b);
401
402
  /*
403
   * Modular reduction: each high word in added/subtracted where
404
   * necessary.
405
   *
406
   * The modulus is:
407
   *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
408
   * Therefore:
409
   *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
410
   *
411
   * For a word x at bit offset n (n >= 256), we have:
412
   *    x*2^n = x*2^(n-32) - x*2^(n-64)
413
   *            - x*2^(n - 160) + x*2^(n-256) mod p
414
   *
415
   * Thus, we can nullify the high word if we reinject it at some
416
   * proper emplacements.
417
   *
418
   * We use 64-bit intermediate words to allow for carries to
419
   * accumulate easily, before performing the final propagation.
420
   */
421
6.72M
  for (i = 0; i < 18; i ++) {
422
6.37M
    s[i] = t[i];
423
6.37M
  }
424
425
3.54M
  for (i = 17; i >= 9; i --) {
426
3.18M
    uint64_t y;
427
428
3.18M
    y = s[i];
429
3.18M
    s[i - 1] += ARSHW(y, 2);
430
3.18M
    s[i - 2] += (y << 28) & 0x3FFFFFFF;
431
3.18M
    s[i - 2] -= ARSHW(y, 4);
432
3.18M
    s[i - 3] -= (y << 26) & 0x3FFFFFFF;
433
3.18M
    s[i - 5] -= ARSHW(y, 10);
434
3.18M
    s[i - 6] -= (y << 20) & 0x3FFFFFFF;
435
3.18M
    s[i - 8] += ARSHW(y, 16);
436
3.18M
    s[i - 9] += (y << 14) & 0x3FFFFFFF;
437
3.18M
  }
438
439
  /*
440
   * Carry propagation must be signed. Moreover, we may have overdone
441
   * it a bit, and obtain a negative result.
442
   *
443
   * The loop above ran 9 times; each time, each word was augmented
444
   * by at most one extra word (in absolute value). Thus, the top
445
   * word must in fine fit in 39 bits, so the carry below will fit
446
   * on 9 bits.
447
   */
448
354k
  cc = 0;
449
3.54M
  for (i = 0; i < 9; i ++) {
450
3.18M
    x = s[i] + cc;
451
3.18M
    d[i] = (uint32_t)x & 0x3FFFFFFF;
452
3.18M
    cc = ARSHW(x, 30);
453
3.18M
  }
454
455
  /*
456
   * All nine words fit on 30 bits, but there may be an extra
457
   * carry for a few bits (at most 9), and that carry may be
458
   * negative. Moreover, we want the result to fit on 257 bits.
459
   * The two lines below ensure that the word in d[] has length
460
   * 256 bits, and the (signed) carry (beyond 2^256) is in cc. The
461
   * significant length of cc is less than 24 bits, so we will be
462
   * able to switch to 32-bit operations.
463
   */
464
354k
  cc = ARSHW(x, 16);
465
354k
  d[8] &= 0xFFFF;
466
467
  /*
468
   * One extra round of reduction, for cc*2^256, which means
469
   * adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative)
470
   * value. If cc is negative, then it may happen (rarely, but
471
   * not neglectibly so) that the result would be negative. In
472
   * order to avoid that, if cc is negative, then we add the
473
   * modulus once. Note that if cc is negative, then propagating
474
   * that carry must yield a value lower than the modulus, so
475
   * adding the modulus once will keep the final result under
476
   * twice the modulus.
477
   */
478
354k
  z = (uint32_t)cc;
479
354k
  d[3] -= z << 6;
480
354k
  d[6] -= (z << 12) & 0x3FFFFFFF;
481
354k
  d[7] -= ARSH(z, 18);
482
354k
  d[7] += (z << 14) & 0x3FFFFFFF;
483
354k
  d[8] += ARSH(z, 16);
484
354k
  c = z >> 31;
485
354k
  d[0] -= c;
486
354k
  d[3] += c << 6;
487
354k
  d[6] += c << 12;
488
354k
  d[7] -= c << 14;
489
354k
  d[8] += c << 16;
490
3.54M
  for (i = 0; i < 9; i ++) {
491
3.18M
    uint32_t w;
492
493
3.18M
    w = d[i] + z;
494
3.18M
    d[i] = w & 0x3FFFFFFF;
495
3.18M
    z = ARSH(w, 30);
496
3.18M
  }
497
354k
}
498
499
/*
500
 * Compute a square in F256. Source operand shall be less than
501
 * twice the modulus.
502
 */
503
static void
504
square_f256(uint32_t *d, const uint32_t *a)
505
288k
{
506
288k
  uint32_t t[18];
507
288k
  uint64_t s[18];
508
288k
  uint64_t cc, x;
509
288k
  uint32_t z, c;
510
288k
  int i;
511
512
288k
  square9(t, a);
513
514
  /*
515
   * Modular reduction: each high word in added/subtracted where
516
   * necessary.
517
   *
518
   * The modulus is:
519
   *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
520
   * Therefore:
521
   *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
522
   *
523
   * For a word x at bit offset n (n >= 256), we have:
524
   *    x*2^n = x*2^(n-32) - x*2^(n-64)
525
   *            - x*2^(n - 160) + x*2^(n-256) mod p
526
   *
527
   * Thus, we can nullify the high word if we reinject it at some
528
   * proper emplacements.
529
   *
530
   * We use 64-bit intermediate words to allow for carries to
531
   * accumulate easily, before performing the final propagation.
532
   */
533
5.47M
  for (i = 0; i < 18; i ++) {
534
5.18M
    s[i] = t[i];
535
5.18M
  }
536
537
2.88M
  for (i = 17; i >= 9; i --) {
538
2.59M
    uint64_t y;
539
540
2.59M
    y = s[i];
541
2.59M
    s[i - 1] += ARSHW(y, 2);
542
2.59M
    s[i - 2] += (y << 28) & 0x3FFFFFFF;
543
2.59M
    s[i - 2] -= ARSHW(y, 4);
544
2.59M
    s[i - 3] -= (y << 26) & 0x3FFFFFFF;
545
2.59M
    s[i - 5] -= ARSHW(y, 10);
546
2.59M
    s[i - 6] -= (y << 20) & 0x3FFFFFFF;
547
2.59M
    s[i - 8] += ARSHW(y, 16);
548
2.59M
    s[i - 9] += (y << 14) & 0x3FFFFFFF;
549
2.59M
  }
550
551
  /*
552
   * Carry propagation must be signed. Moreover, we may have overdone
553
   * it a bit, and obtain a negative result.
554
   *
555
   * The loop above ran 9 times; each time, each word was augmented
556
   * by at most one extra word (in absolute value). Thus, the top
557
   * word must in fine fit in 39 bits, so the carry below will fit
558
   * on 9 bits.
559
   */
560
288k
  cc = 0;
561
2.88M
  for (i = 0; i < 9; i ++) {
562
2.59M
    x = s[i] + cc;
563
2.59M
    d[i] = (uint32_t)x & 0x3FFFFFFF;
564
2.59M
    cc = ARSHW(x, 30);
565
2.59M
  }
566
567
  /*
568
   * All nine words fit on 30 bits, but there may be an extra
569
   * carry for a few bits (at most 9), and that carry may be
570
   * negative. Moreover, we want the result to fit on 257 bits.
571
   * The two lines below ensure that the word in d[] has length
572
   * 256 bits, and the (signed) carry (beyond 2^256) is in cc. The
573
   * significant length of cc is less than 24 bits, so we will be
574
   * able to switch to 32-bit operations.
575
   */
576
288k
  cc = ARSHW(x, 16);
577
288k
  d[8] &= 0xFFFF;
578
579
  /*
580
   * One extra round of reduction, for cc*2^256, which means
581
   * adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative)
582
   * value. If cc is negative, then it may happen (rarely, but
583
   * not neglectibly so) that the result would be negative. In
584
   * order to avoid that, if cc is negative, then we add the
585
   * modulus once. Note that if cc is negative, then propagating
586
   * that carry must yield a value lower than the modulus, so
587
   * adding the modulus once will keep the final result under
588
   * twice the modulus.
589
   */
590
288k
  z = (uint32_t)cc;
591
288k
  d[3] -= z << 6;
592
288k
  d[6] -= (z << 12) & 0x3FFFFFFF;
593
288k
  d[7] -= ARSH(z, 18);
594
288k
  d[7] += (z << 14) & 0x3FFFFFFF;
595
288k
  d[8] += ARSH(z, 16);
596
288k
  c = z >> 31;
597
288k
  d[0] -= c;
598
288k
  d[3] += c << 6;
599
288k
  d[6] += c << 12;
600
288k
  d[7] -= c << 14;
601
288k
  d[8] += c << 16;
602
2.88M
  for (i = 0; i < 9; i ++) {
603
2.59M
    uint32_t w;
604
605
2.59M
    w = d[i] + z;
606
2.59M
    d[i] = w & 0x3FFFFFFF;
607
2.59M
    z = ARSH(w, 30);
608
2.59M
  }
609
288k
}
610
611
/*
612
 * Perform a "final reduction" in field F256 (field for curve P-256).
613
 * The source value must be less than twice the modulus. If the value
614
 * is not lower than the modulus, then the modulus is subtracted and
615
 * this function returns 1; otherwise, it leaves it untouched and it
616
 * returns 0.
617
 */
618
static uint32_t
619
reduce_final_f256(uint32_t *d)
620
15.9k
{
621
15.9k
  uint32_t t[9];
622
15.9k
  uint32_t cc;
623
15.9k
  int i;
624
625
15.9k
  cc = 0;
626
159k
  for (i = 0; i < 9; i ++) {
627
143k
    uint32_t w;
628
629
143k
    w = d[i] - F256[i] - cc;
630
143k
    cc = w >> 31;
631
143k
    t[i] = w & 0x3FFFFFFF;
632
143k
  }
633
15.9k
  cc ^= 1;
634
15.9k
  CCOPY(cc, d, t, sizeof t);
635
15.9k
  return cc;
636
15.9k
}
637
638
/*
639
 * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
640
 * are such that:
641
 *   X = x / z^2
642
 *   Y = y / z^3
643
 * For the point at infinity, z = 0.
644
 * Each point thus admits many possible representations.
645
 *
646
 * Coordinates are represented in arrays of 32-bit integers, each holding
647
 * 30 bits of data. Values may also be slightly greater than the modulus,
648
 * but they will always be lower than twice the modulus.
649
 */
650
typedef struct {
651
  uint32_t x[9];
652
  uint32_t y[9];
653
  uint32_t z[9];
654
} p256_jacobian;
655
656
/*
657
 * Convert a point to affine coordinates:
658
 *  - If the point is the point at infinity, then all three coordinates
659
 *    are set to 0.
660
 *  - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
661
 *    coordinates are the 'X' and 'Y' affine coordinates.
662
 * The coordinates are guaranteed to be lower than the modulus.
663
 */
664
static void
665
p256_to_affine(p256_jacobian *P)
666
114
{
667
114
  uint32_t t1[9], t2[9];
668
114
  int i;
669
670
  /*
671
   * Invert z with a modular exponentiation: the modulus is
672
   * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
673
   * p-2. Exponent bit pattern (from high to low) is:
674
   *  - 32 bits of value 1
675
   *  - 31 bits of value 0
676
   *  - 1 bit of value 1
677
   *  - 96 bits of value 0
678
   *  - 94 bits of value 1
679
   *  - 1 bit of value 0
680
   *  - 1 bit of value 1
681
   * Thus, we precompute z^(2^31-1) to speed things up.
682
   *
683
   * If z = 0 (point at infinity) then the modular exponentiation
684
   * will yield 0, which leads to the expected result (all three
685
   * coordinates set to 0).
686
   */
687
688
  /*
689
   * A simple square-and-multiply for z^(2^31-1). We could save about
690
   * two dozen multiplications here with an addition chain, but
691
   * this would require a bit more code, and extra stack buffers.
692
   */
693
114
  memcpy(t1, P->z, sizeof P->z);
694
3.53k
  for (i = 0; i < 30; i ++) {
695
3.42k
    square_f256(t1, t1);
696
3.42k
    mul_f256(t1, t1, P->z);
697
3.42k
  }
698
699
  /*
700
   * Square-and-multiply. Apart from the squarings, we have a few
701
   * multiplications to set bits to 1; we multiply by the original z
702
   * for setting 1 bit, and by t1 for setting 31 bits.
703
   */
704
114
  memcpy(t2, P->z, sizeof P->z);
705
29.1k
  for (i = 1; i < 256; i ++) {
706
29.0k
    square_f256(t2, t2);
707
29.0k
    switch (i) {
708
114
    case 31:
709
228
    case 190:
710
342
    case 221:
711
456
    case 252:
712
456
      mul_f256(t2, t2, t1);
713
456
      break;
714
114
    case 63:
715
228
    case 253:
716
342
    case 255:
717
342
      mul_f256(t2, t2, P->z);
718
342
      break;
719
29.0k
    }
720
29.0k
  }
721
722
  /*
723
   * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
724
   */
725
114
  mul_f256(t1, t2, t2);
726
114
  mul_f256(P->x, t1, P->x);
727
114
  mul_f256(t1, t1, t2);
728
114
  mul_f256(P->y, t1, P->y);
729
114
  reduce_final_f256(P->x);
730
114
  reduce_final_f256(P->y);
731
732
  /*
733
   * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
734
   * this will set z to 1.
735
   */
736
114
  mul_f256(P->z, P->z, t2);
737
114
  reduce_final_f256(P->z);
738
114
}
739
740
/*
741
 * Double a point in P-256. This function works for all valid points,
742
 * including the point at infinity.
743
 */
744
static void
745
p256_double(p256_jacobian *Q)
746
50.9k
{
747
  /*
748
   * Doubling formulas are:
749
   *
750
   *   s = 4*x*y^2
751
   *   m = 3*(x + z^2)*(x - z^2)
752
   *   x' = m^2 - 2*s
753
   *   y' = m*(s - x') - 8*y^4
754
   *   z' = 2*y*z
755
   *
756
   * These formulas work for all points, including points of order 2
757
   * and points at infinity:
758
   *   - If y = 0 then z' = 0. But there is no such point in P-256
759
   *     anyway.
760
   *   - If z = 0 then z' = 0.
761
   */
762
50.9k
  uint32_t t1[9], t2[9], t3[9], t4[9];
763
764
  /*
765
   * Compute z^2 in t1.
766
   */
767
50.9k
  square_f256(t1, Q->z);
768
769
  /*
770
   * Compute x-z^2 in t2 and x+z^2 in t1.
771
   */
772
50.9k
  add_f256(t2, Q->x, t1);
773
50.9k
  sub_f256(t1, Q->x, t1);
774
775
  /*
776
   * Compute 3*(x+z^2)*(x-z^2) in t1.
777
   */
778
50.9k
  mul_f256(t3, t1, t2);
779
50.9k
  add_f256(t1, t3, t3);
780
50.9k
  add_f256(t1, t3, t1);
781
782
  /*
783
   * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
784
   */
785
50.9k
  square_f256(t3, Q->y);
786
50.9k
  add_f256(t3, t3, t3);
787
50.9k
  mul_f256(t2, Q->x, t3);
788
50.9k
  add_f256(t2, t2, t2);
789
790
  /*
791
   * Compute x' = m^2 - 2*s.
792
   */
793
50.9k
  square_f256(Q->x, t1);
794
50.9k
  sub_f256(Q->x, Q->x, t2);
795
50.9k
  sub_f256(Q->x, Q->x, t2);
796
797
  /*
798
   * Compute z' = 2*y*z.
799
   */
800
50.9k
  mul_f256(t4, Q->y, Q->z);
801
50.9k
  add_f256(Q->z, t4, t4);
802
803
  /*
804
   * Compute y' = m*(s - x') - 8*y^4. Note that we already have
805
   * 2*y^2 in t3.
806
   */
807
50.9k
  sub_f256(t2, t2, Q->x);
808
50.9k
  mul_f256(Q->y, t1, t2);
809
50.9k
  square_f256(t4, t3);
810
50.9k
  add_f256(t4, t4, t4);
811
50.9k
  sub_f256(Q->y, Q->y, t4);
812
50.9k
}
813
814
/*
815
 * Add point P2 to point P1.
816
 *
817
 * This function computes the wrong result in the following cases:
818
 *
819
 *   - If P1 == 0 but P2 != 0
820
 *   - If P1 != 0 but P2 == 0
821
 *   - If P1 == P2
822
 *
823
 * In all three cases, P1 is set to the point at infinity.
824
 *
825
 * Returned value is 0 if one of the following occurs:
826
 *
827
 *   - P1 and P2 have the same Y coordinate
828
 *   - P1 == 0 and P2 == 0
829
 *   - The Y coordinate of one of the points is 0 and the other point is
830
 *     the point at infinity.
831
 *
832
 * The third case cannot actually happen with valid points, since a point
833
 * with Y == 0 is a point of order 2, and there is no point of order 2 on
834
 * curve P-256.
835
 *
836
 * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
837
 * can apply the following:
838
 *
839
 *   - If the result is not the point at infinity, then it is correct.
840
 *   - Otherwise, if the returned value is 1, then this is a case of
841
 *     P1+P2 == 0, so the result is indeed the point at infinity.
842
 *   - Otherwise, P1 == P2, so a "double" operation should have been
843
 *     performed.
844
 */
845
static uint32_t
846
p256_add(p256_jacobian *P1, const p256_jacobian *P2)
847
5.46k
{
848
  /*
849
   * Addtions formulas are:
850
   *
851
   *   u1 = x1 * z2^2
852
   *   u2 = x2 * z1^2
853
   *   s1 = y1 * z2^3
854
   *   s2 = y2 * z1^3
855
   *   h = u2 - u1
856
   *   r = s2 - s1
857
   *   x3 = r^2 - h^3 - 2 * u1 * h^2
858
   *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
859
   *   z3 = h * z1 * z2
860
   */
861
5.46k
  uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9];
862
5.46k
  uint32_t ret;
863
5.46k
  int i;
864
865
  /*
866
   * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
867
   */
868
5.46k
  square_f256(t3, P2->z);
869
5.46k
  mul_f256(t1, P1->x, t3);
870
5.46k
  mul_f256(t4, P2->z, t3);
871
5.46k
  mul_f256(t3, P1->y, t4);
872
873
  /*
874
   * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
875
   */
876
5.46k
  square_f256(t4, P1->z);
877
5.46k
  mul_f256(t2, P2->x, t4);
878
5.46k
  mul_f256(t5, P1->z, t4);
879
5.46k
  mul_f256(t4, P2->y, t5);
880
881
  /*
882
   * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
883
   * We need to test whether r is zero, so we will do some extra
884
   * reduce.
885
   */
886
5.46k
  sub_f256(t2, t2, t1);
887
5.46k
  sub_f256(t4, t4, t3);
888
5.46k
  reduce_final_f256(t4);
889
5.46k
  ret = 0;
890
54.6k
  for (i = 0; i < 9; i ++) {
891
49.1k
    ret |= t4[i];
892
49.1k
  }
893
5.46k
  ret = (ret | -ret) >> 31;
894
895
  /*
896
   * Compute u1*h^2 (in t6) and h^3 (in t5);
897
   */
898
5.46k
  square_f256(t7, t2);
899
5.46k
  mul_f256(t6, t1, t7);
900
5.46k
  mul_f256(t5, t7, t2);
901
902
  /*
903
   * Compute x3 = r^2 - h^3 - 2*u1*h^2.
904
   */
905
5.46k
  square_f256(P1->x, t4);
906
5.46k
  sub_f256(P1->x, P1->x, t5);
907
5.46k
  sub_f256(P1->x, P1->x, t6);
908
5.46k
  sub_f256(P1->x, P1->x, t6);
909
910
  /*
911
   * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
912
   */
913
5.46k
  sub_f256(t6, t6, P1->x);
914
5.46k
  mul_f256(P1->y, t4, t6);
915
5.46k
  mul_f256(t1, t5, t3);
916
5.46k
  sub_f256(P1->y, P1->y, t1);
917
918
  /*
919
   * Compute z3 = h*z1*z2.
920
   */
921
5.46k
  mul_f256(t1, P1->z, P2->z);
922
5.46k
  mul_f256(P1->z, t1, t2);
923
924
5.46k
  return ret;
925
5.46k
}
926
927
/*
928
 * Add point P2 to point P1. This is a specialised function for the
929
 * case when P2 is a non-zero point in affine coordinate.
930
 *
931
 * This function computes the wrong result in the following cases:
932
 *
933
 *   - If P1 == 0
934
 *   - If P1 == P2
935
 *
936
 * In both cases, P1 is set to the point at infinity.
937
 *
938
 * Returned value is 0 if one of the following occurs:
939
 *
940
 *   - P1 and P2 have the same Y coordinate
941
 *   - The Y coordinate of P2 is 0 and P1 is the point at infinity.
942
 *
943
 * The second case cannot actually happen with valid points, since a point
944
 * with Y == 0 is a point of order 2, and there is no point of order 2 on
945
 * curve P-256.
946
 *
947
 * Therefore, assuming that P1 != 0 on input, then the caller
948
 * can apply the following:
949
 *
950
 *   - If the result is not the point at infinity, then it is correct.
951
 *   - Otherwise, if the returned value is 1, then this is a case of
952
 *     P1+P2 == 0, so the result is indeed the point at infinity.
953
 *   - Otherwise, P1 == P2, so a "double" operation should have been
954
 *     performed.
955
 */
956
static uint32_t
957
p256_add_mixed(p256_jacobian *P1, const p256_jacobian *P2)
958
10.0k
{
959
  /*
960
   * Addtions formulas are:
961
   *
962
   *   u1 = x1
963
   *   u2 = x2 * z1^2
964
   *   s1 = y1
965
   *   s2 = y2 * z1^3
966
   *   h = u2 - u1
967
   *   r = s2 - s1
968
   *   x3 = r^2 - h^3 - 2 * u1 * h^2
969
   *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
970
   *   z3 = h * z1
971
   */
972
10.0k
  uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9];
973
10.0k
  uint32_t ret;
974
10.0k
  int i;
975
976
  /*
977
   * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
978
   */
979
10.0k
  memcpy(t1, P1->x, sizeof t1);
980
10.0k
  memcpy(t3, P1->y, sizeof t3);
981
982
  /*
983
   * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
984
   */
985
10.0k
  square_f256(t4, P1->z);
986
10.0k
  mul_f256(t2, P2->x, t4);
987
10.0k
  mul_f256(t5, P1->z, t4);
988
10.0k
  mul_f256(t4, P2->y, t5);
989
990
  /*
991
   * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
992
   * We need to test whether r is zero, so we will do some extra
993
   * reduce.
994
   */
995
10.0k
  sub_f256(t2, t2, t1);
996
10.0k
  sub_f256(t4, t4, t3);
997
10.0k
  reduce_final_f256(t4);
998
10.0k
  ret = 0;
999
100k
  for (i = 0; i < 9; i ++) {
1000
90.1k
    ret |= t4[i];
1001
90.1k
  }
1002
10.0k
  ret = (ret | -ret) >> 31;
1003
1004
  /*
1005
   * Compute u1*h^2 (in t6) and h^3 (in t5);
1006
   */
1007
10.0k
  square_f256(t7, t2);
1008
10.0k
  mul_f256(t6, t1, t7);
1009
10.0k
  mul_f256(t5, t7, t2);
1010
1011
  /*
1012
   * Compute x3 = r^2 - h^3 - 2*u1*h^2.
1013
   */
1014
10.0k
  square_f256(P1->x, t4);
1015
10.0k
  sub_f256(P1->x, P1->x, t5);
1016
10.0k
  sub_f256(P1->x, P1->x, t6);
1017
10.0k
  sub_f256(P1->x, P1->x, t6);
1018
1019
  /*
1020
   * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
1021
   */
1022
10.0k
  sub_f256(t6, t6, P1->x);
1023
10.0k
  mul_f256(P1->y, t4, t6);
1024
10.0k
  mul_f256(t1, t5, t3);
1025
10.0k
  sub_f256(P1->y, P1->y, t1);
1026
1027
  /*
1028
   * Compute z3 = h*z1*z2.
1029
   */
1030
10.0k
  mul_f256(P1->z, P1->z, t2);
1031
1032
10.0k
  return ret;
1033
10.0k
}
1034
1035
/*
1036
 * Decode a P-256 point. This function does not support the point at
1037
 * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
1038
 */
1039
static uint32_t
1040
p256_decode(p256_jacobian *P, const void *src, size_t len)
1041
42
{
1042
42
  const unsigned char *buf;
1043
42
  uint32_t tx[9], ty[9], t1[9], t2[9];
1044
42
  uint32_t bad;
1045
42
  int i;
1046
1047
42
  if (len != 65) {
1048
0
    return 0;
1049
0
  }
1050
42
  buf = src;
1051
1052
  /*
1053
   * First byte must be 0x04 (uncompressed format). We could support
1054
   * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
1055
   * least significant bit of the Y coordinate), but it is explicitly
1056
   * forbidden by RFC 5480 (section 2.2).
1057
   */
1058
42
  bad = NEQ(buf[0], 0x04);
1059
1060
  /*
1061
   * Decode the coordinates, and check that they are both lower
1062
   * than the modulus.
1063
   */
1064
42
  tx[8] = be8_to_le30(tx, buf + 1, 32);
1065
42
  ty[8] = be8_to_le30(ty, buf + 33, 32);
1066
42
  bad |= reduce_final_f256(tx);
1067
42
  bad |= reduce_final_f256(ty);
1068
1069
  /*
1070
   * Check curve equation.
1071
   */
1072
42
  square_f256(t1, tx);
1073
42
  mul_f256(t1, tx, t1);
1074
42
  square_f256(t2, ty);
1075
42
  sub_f256(t1, t1, tx);
1076
42
  sub_f256(t1, t1, tx);
1077
42
  sub_f256(t1, t1, tx);
1078
42
  add_f256(t1, t1, P256_B);
1079
42
  sub_f256(t1, t1, t2);
1080
42
  reduce_final_f256(t1);
1081
420
  for (i = 0; i < 9; i ++) {
1082
378
    bad |= t1[i];
1083
378
  }
1084
1085
  /*
1086
   * Copy coordinates to the point structure.
1087
   */
1088
42
  memcpy(P->x, tx, sizeof tx);
1089
42
  memcpy(P->y, ty, sizeof ty);
1090
42
  memset(P->z, 0, sizeof P->z);
1091
42
  P->z[0] = 1;
1092
42
  return EQ(bad, 0);
1093
42
}
1094
1095
/*
1096
 * Encode a point into a buffer. This function assumes that the point is
1097
 * valid, in affine coordinates, and not the point at infinity.
1098
 */
1099
static void
1100
p256_encode(void *dst, const p256_jacobian *P)
1101
114
{
1102
114
  unsigned char *buf;
1103
1104
114
  buf = dst;
1105
114
  buf[0] = 0x04;
1106
114
  le30_to_be8(buf + 1, 32, P->x);
1107
114
  le30_to_be8(buf + 33, 32, P->y);
1108
114
}
1109
1110
/*
1111
 * Multiply a curve point by an integer. The integer is assumed to be
1112
 * lower than the curve order, and the base point must not be the point
1113
 * at infinity.
1114
 */
1115
static void
1116
p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen)
1117
42
{
1118
  /*
1119
   * qz is a flag that is initially 1, and remains equal to 1
1120
   * as long as the point is the point at infinity.
1121
   *
1122
   * We use a 2-bit window to handle multiplier bits by pairs.
1123
   * The precomputed window really is the points P2 and P3.
1124
   */
1125
42
  uint32_t qz;
1126
42
  p256_jacobian P2, P3, Q, T, U;
1127
1128
  /*
1129
   * Compute window values.
1130
   */
1131
42
  P2 = *P;
1132
42
  p256_double(&P2);
1133
42
  P3 = *P;
1134
42
  p256_add(&P3, &P2);
1135
1136
  /*
1137
   * We start with Q = 0. We process multiplier bits 2 by 2.
1138
   */
1139
42
  memset(&Q, 0, sizeof Q);
1140
42
  qz = 1;
1141
1.38k
  while (xlen -- > 0) {
1142
1.34k
    int k;
1143
1144
6.72k
    for (k = 6; k >= 0; k -= 2) {
1145
5.37k
      uint32_t bits;
1146
5.37k
      uint32_t bnz;
1147
1148
5.37k
      p256_double(&Q);
1149
5.37k
      p256_double(&Q);
1150
5.37k
      T = *P;
1151
5.37k
      U = Q;
1152
5.37k
      bits = (*x >> k) & (uint32_t)3;
1153
5.37k
      bnz = NEQ(bits, 0);
1154
5.37k
      CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
1155
5.37k
      CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
1156
5.37k
      p256_add(&U, &T);
1157
5.37k
      CCOPY(bnz & qz, &Q, &T, sizeof Q);
1158
5.37k
      CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1159
5.37k
      qz &= ~bnz;
1160
5.37k
    }
1161
1.34k
    x ++;
1162
1.34k
  }
1163
42
  *P = Q;
1164
42
}
1165
1166
/*
1167
 * Precomputed window: k*G points, where G is the curve generator, and k
1168
 * is an integer from 1 to 15 (inclusive). The X and Y coordinates of
1169
 * the point are encoded as 9 words of 30 bits each (little-endian
1170
 * order).
1171
 */
1172
static const uint32_t Gwin[15][18] = {
1173
1174
  { 0x1898C296, 0x1284E517, 0x1EB33A0F, 0x00DF604B,
1175
    0x2440F277, 0x339B958E, 0x04247F8B, 0x347CB84B,
1176
    0x00006B17, 0x37BF51F5, 0x2ED901A0, 0x3315ECEC,
1177
    0x338CD5DA, 0x0F9E162B, 0x1FAD29F0, 0x27F9B8EE,
1178
    0x10B8BF86, 0x00004FE3 },
1179
1180
  { 0x07669978, 0x182D23F1, 0x3F21B35A, 0x225A789D,
1181
    0x351AC3C0, 0x08E00C12, 0x34F7E8A5, 0x1EC62340,
1182
    0x00007CF2, 0x227873D1, 0x3812DE74, 0x0E982299,
1183
    0x1F6B798F, 0x3430DBBA, 0x366B1A7D, 0x2D040293,
1184
    0x154436E3, 0x00000777 },
1185
1186
  { 0x06E7FD6C, 0x2D05986F, 0x3ADA985F, 0x31ADC87B,
1187
    0x0BF165E6, 0x1FBE5475, 0x30A44C8F, 0x3934698C,
1188
    0x00005ECB, 0x227D5032, 0x29E6C49E, 0x04FB83D9,
1189
    0x0AAC0D8E, 0x24A2ECD8, 0x2C1B3869, 0x0FF7E374,
1190
    0x19031266, 0x00008734 },
1191
1192
  { 0x2B030852, 0x024C0911, 0x05596EF5, 0x07F8B6DE,
1193
    0x262BD003, 0x3779967B, 0x08FBBA02, 0x128D4CB4,
1194
    0x0000E253, 0x184ED8C6, 0x310B08FC, 0x30EE0055,
1195
    0x3F25B0FC, 0x062D764E, 0x3FB97F6A, 0x33CC719D,
1196
    0x15D69318, 0x0000E0F1 },
1197
1198
  { 0x03D033ED, 0x05552837, 0x35BE5242, 0x2320BF47,
1199
    0x268FDFEF, 0x13215821, 0x140D2D78, 0x02DE9454,
1200
    0x00005159, 0x3DA16DA4, 0x0742ED13, 0x0D80888D,
1201
    0x004BC035, 0x0A79260D, 0x06FCDAFE, 0x2727D8AE,
1202
    0x1F6A2412, 0x0000E0C1 },
1203
1204
  { 0x3C2291A9, 0x1AC2ABA4, 0x3B215B4C, 0x131D037A,
1205
    0x17DDE302, 0x0C90B2E2, 0x0602C92D, 0x05CA9DA9,
1206
    0x0000B01A, 0x0FC77FE2, 0x35F1214E, 0x07E16BDF,
1207
    0x003DDC07, 0x2703791C, 0x3038B7EE, 0x3DAD56FE,
1208
    0x041D0C8D, 0x0000E85C },
1209
1210
  { 0x3187B2A3, 0x0018A1C0, 0x00FEF5B3, 0x3E7E2E2A,
1211
    0x01FB607E, 0x2CC199F0, 0x37B4625B, 0x0EDBE82F,
1212
    0x00008E53, 0x01F400B4, 0x15786A1B, 0x3041B21C,
1213
    0x31CD8CF2, 0x35900053, 0x1A7E0E9B, 0x318366D0,
1214
    0x076F780C, 0x000073EB },
1215
1216
  { 0x1B6FB393, 0x13767707, 0x3CE97DBB, 0x348E2603,
1217
    0x354CADC1, 0x09D0B4EA, 0x1B053404, 0x1DE76FBA,
1218
    0x000062D9, 0x0F09957E, 0x295029A8, 0x3E76A78D,
1219
    0x3B547DAE, 0x27CEE0A2, 0x0575DC45, 0x1D8244FF,
1220
    0x332F647A, 0x0000AD5A },
1221
1222
  { 0x10949EE0, 0x1E7A292E, 0x06DF8B3D, 0x02B2E30B,
1223
    0x31F8729E, 0x24E35475, 0x30B71878, 0x35EDBFB7,
1224
    0x0000EA68, 0x0DD048FA, 0x21688929, 0x0DE823FE,
1225
    0x1C53FAA9, 0x0EA0C84D, 0x052A592A, 0x1FCE7870,
1226
    0x11325CB2, 0x00002A27 },
1227
1228
  { 0x04C5723F, 0x30D81A50, 0x048306E4, 0x329B11C7,
1229
    0x223FB545, 0x085347A8, 0x2993E591, 0x1B5ACA8E,
1230
    0x0000CEF6, 0x04AF0773, 0x28D2EEA9, 0x2751EEEC,
1231
    0x037B4A7F, 0x3B4C1059, 0x08F37674, 0x2AE906E1,
1232
    0x18A88A6A, 0x00008786 },
1233
1234
  { 0x34BC21D1, 0x0CCE474D, 0x15048BF4, 0x1D0BB409,
1235
    0x021CDA16, 0x20DE76C3, 0x34C59063, 0x04EDE20E,
1236
    0x00003ED1, 0x282A3740, 0x0BE3BBF3, 0x29889DAE,
1237
    0x03413697, 0x34C68A09, 0x210EBE93, 0x0C8A224C,
1238
    0x0826B331, 0x00009099 },
1239
1240
  { 0x0624E3C4, 0x140317BA, 0x2F82C99D, 0x260C0A2C,
1241
    0x25D55179, 0x194DCC83, 0x3D95E462, 0x356F6A05,
1242
    0x0000741D, 0x0D4481D3, 0x2657FC8B, 0x1BA5CA71,
1243
    0x3AE44B0D, 0x07B1548E, 0x0E0D5522, 0x05FDC567,
1244
    0x2D1AA70E, 0x00000770 },
1245
1246
  { 0x06072C01, 0x23857675, 0x1EAD58A9, 0x0B8A12D9,
1247
    0x1EE2FC79, 0x0177CB61, 0x0495A618, 0x20DEB82B,
1248
    0x0000177C, 0x2FC7BFD8, 0x310EEF8B, 0x1FB4DF39,
1249
    0x3B8530E8, 0x0F4E7226, 0x0246B6D0, 0x2A558A24,
1250
    0x163353AF, 0x000063BB },
1251
1252
  { 0x24D2920B, 0x1C249DCC, 0x2069C5E5, 0x09AB2F9E,
1253
    0x36DF3CF1, 0x1991FD0C, 0x062B97A7, 0x1E80070E,
1254
    0x000054E7, 0x20D0B375, 0x2E9F20BD, 0x35090081,
1255
    0x1C7A9DDC, 0x22E7C371, 0x087E3016, 0x03175421,
1256
    0x3C6ECA7D, 0x0000F599 },
1257
1258
  { 0x259B9D5F, 0x0D9A318F, 0x23A0EF16, 0x00EBE4B7,
1259
    0x088265AE, 0x2CDE2666, 0x2BAE7ADF, 0x1371A5C6,
1260
    0x0000F045, 0x0D034F36, 0x1F967378, 0x1B5FA3F4,
1261
    0x0EC8739D, 0x1643E62A, 0x1653947E, 0x22D1F4E6,
1262
    0x0FB8D64B, 0x0000B5B9 }
1263
};
1264
1265
/*
1266
 * Lookup one of the Gwin[] values, by index. This is constant-time.
1267
 */
1268
static void
1269
lookup_Gwin(p256_jacobian *T, uint32_t idx)
1270
10.0k
{
1271
10.0k
  uint32_t xy[18];
1272
10.0k
  uint32_t k;
1273
10.0k
  size_t u;
1274
1275
10.0k
  memset(xy, 0, sizeof xy);
1276
160k
  for (k = 0; k < 15; k ++) {
1277
150k
    uint32_t m;
1278
1279
150k
    m = -EQ(idx, k + 1);
1280
2.85M
    for (u = 0; u < 18; u ++) {
1281
2.70M
      xy[u] |= m & Gwin[k][u];
1282
2.70M
    }
1283
150k
  }
1284
10.0k
  memcpy(T->x, &xy[0], sizeof T->x);
1285
10.0k
  memcpy(T->y, &xy[9], sizeof T->y);
1286
10.0k
  memset(T->z, 0, sizeof T->z);
1287
10.0k
  T->z[0] = 1;
1288
10.0k
}
1289
1290
/*
1291
 * Multiply the generator by an integer. The integer is assumed non-zero
1292
 * and lower than the curve order.
1293
 */
1294
static void
1295
p256_mulgen(p256_jacobian *P, const unsigned char *x, size_t xlen)
1296
114
{
1297
  /*
1298
   * qz is a flag that is initially 1, and remains equal to 1
1299
   * as long as the point is the point at infinity.
1300
   *
1301
   * We use a 4-bit window to handle multiplier bits by groups
1302
   * of 4. The precomputed window is constant static data, with
1303
   * points in affine coordinates; we use a constant-time lookup.
1304
   */
1305
114
  p256_jacobian Q;
1306
114
  uint32_t qz;
1307
1308
114
  memset(&Q, 0, sizeof Q);
1309
114
  qz = 1;
1310
5.12k
  while (xlen -- > 0) {
1311
5.00k
    int k;
1312
5.00k
    unsigned bx;
1313
1314
5.00k
    bx = *x ++;
1315
15.0k
    for (k = 0; k < 2; k ++) {
1316
10.0k
      uint32_t bits;
1317
10.0k
      uint32_t bnz;
1318
10.0k
      p256_jacobian T, U;
1319
1320
10.0k
      p256_double(&Q);
1321
10.0k
      p256_double(&Q);
1322
10.0k
      p256_double(&Q);
1323
10.0k
      p256_double(&Q);
1324
10.0k
      bits = (bx >> 4) & 0x0F;
1325
10.0k
      bnz = NEQ(bits, 0);
1326
10.0k
      lookup_Gwin(&T, bits);
1327
10.0k
      U = Q;
1328
10.0k
      p256_add_mixed(&U, &T);
1329
10.0k
      CCOPY(bnz & qz, &Q, &T, sizeof Q);
1330
10.0k
      CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1331
10.0k
      qz &= ~bnz;
1332
10.0k
      bx <<= 4;
1333
10.0k
    }
1334
5.00k
  }
1335
114
  *P = Q;
1336
114
}
1337
1338
static const unsigned char P256_G[] = {
1339
  0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
1340
  0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
1341
  0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
1342
  0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
1343
  0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
1344
  0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
1345
  0x68, 0x37, 0xBF, 0x51, 0xF5
1346
};
1347
1348
static const unsigned char P256_N[] = {
1349
  0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
1350
  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
1351
  0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
1352
  0x25, 0x51
1353
};
1354
1355
static const unsigned char *
1356
api_generator(int curve, size_t *len)
1357
0
{
1358
0
  (void)curve;
1359
0
  *len = sizeof P256_G;
1360
0
  return P256_G;
1361
0
}
1362
1363
static const unsigned char *
1364
api_order(int curve, size_t *len)
1365
24
{
1366
24
  (void)curve;
1367
24
  *len = sizeof P256_N;
1368
24
  return P256_N;
1369
24
}
1370
1371
static size_t
1372
api_xoff(int curve, size_t *len)
1373
0
{
1374
0
  (void)curve;
1375
0
  *len = 32;
1376
0
  return 1;
1377
0
}
1378
1379
static uint32_t
1380
api_mul(unsigned char *G, size_t Glen,
1381
  const unsigned char *x, size_t xlen, int curve)
1382
0
{
1383
0
  uint32_t r;
1384
0
  p256_jacobian P;
1385
1386
0
  (void)curve;
1387
0
  if (Glen != 65) {
1388
0
    return 0;
1389
0
  }
1390
0
  r = p256_decode(&P, G, Glen);
1391
0
  p256_mul(&P, x, xlen);
1392
0
  p256_to_affine(&P);
1393
0
  p256_encode(G, &P);
1394
0
  return r;
1395
0
}
1396
1397
static size_t
1398
api_mulgen(unsigned char *R,
1399
  const unsigned char *x, size_t xlen, int curve)
1400
72
{
1401
72
  p256_jacobian P;
1402
1403
72
  (void)curve;
1404
72
  p256_mulgen(&P, x, xlen);
1405
72
  p256_to_affine(&P);
1406
72
  p256_encode(R, &P);
1407
72
  return 65;
1408
72
}
1409
1410
static uint32_t
1411
api_muladd(unsigned char *A, const unsigned char *B, size_t len,
1412
  const unsigned char *x, size_t xlen,
1413
  const unsigned char *y, size_t ylen, int curve)
1414
42
{
1415
42
  p256_jacobian P, Q;
1416
42
  uint32_t r, t, z;
1417
42
  int i;
1418
1419
42
  (void)curve;
1420
42
  if (len != 65) {
1421
0
    return 0;
1422
0
  }
1423
42
  r = p256_decode(&P, A, len);
1424
42
  p256_mul(&P, x, xlen);
1425
42
  if (B == NULL) {
1426
42
    p256_mulgen(&Q, y, ylen);
1427
42
  } else {
1428
0
    r &= p256_decode(&Q, B, len);
1429
0
    p256_mul(&Q, y, ylen);
1430
0
  }
1431
1432
  /*
1433
   * The final addition may fail in case both points are equal.
1434
   */
1435
42
  t = p256_add(&P, &Q);
1436
42
  reduce_final_f256(P.z);
1437
42
  z = 0;
1438
420
  for (i = 0; i < 9; i ++) {
1439
378
    z |= P.z[i];
1440
378
  }
1441
42
  z = EQ(z, 0);
1442
42
  p256_double(&Q);
1443
1444
  /*
1445
   * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
1446
   * have the following:
1447
   *
1448
   *   z = 0, t = 0   return P (normal addition)
1449
   *   z = 0, t = 1   return P (normal addition)
1450
   *   z = 1, t = 0   return Q (a 'double' case)
1451
   *   z = 1, t = 1   report an error (P+Q = 0)
1452
   */
1453
42
  CCOPY(z & ~t, &P, &Q, sizeof Q);
1454
42
  p256_to_affine(&P);
1455
42
  p256_encode(A, &P);
1456
42
  r &= ~(z & t);
1457
42
  return r;
1458
42
}
1459
1460
/* see bearssl_ec.h */
1461
const br_ec_impl br_ec_p256_m31 = {
1462
  (uint32_t)0x00800000,
1463
  &api_generator,
1464
  &api_order,
1465
  &api_xoff,
1466
  &api_mul,
1467
  &api_mulgen,
1468
  &api_muladd
1469
};