Coverage Report

Created: 2024-06-28 06:08

/src/BearSSL/src/ec/ec_c25519_m31.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3
 *
4
 * Permission is hereby granted, free of charge, to any person obtaining 
5
 * a copy of this software and associated documentation files (the
6
 * "Software"), to deal in the Software without restriction, including
7
 * without limitation the rights to use, copy, modify, merge, publish,
8
 * distribute, sublicense, and/or sell copies of the Software, and to
9
 * permit persons to whom the Software is furnished to do so, subject to
10
 * the following conditions:
11
 *
12
 * The above copyright notice and this permission notice shall be 
13
 * included in all copies or substantial portions of the Software.
14
 *
15
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
16
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
18
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
 * SOFTWARE.
23
 */
24
25
#include "inner.h"
26
27
/* obsolete
28
#include <stdio.h>
29
#include <stdlib.h>
30
static void
31
print_int(const char *name, const uint32_t *x)
32
{
33
  size_t u;
34
  unsigned char tmp[40];
35
36
  printf("%s = ", name);
37
  for (u = 0; u < 9; u ++) {
38
    if (x[u] > 0x3FFFFFFF) {
39
      printf("INVALID:");
40
      for (u = 0; u < 9; u ++) {
41
        printf(" %08X", x[u]);
42
      }
43
      printf("\n");
44
      return;
45
    }
46
  }
47
  memset(tmp, 0, sizeof tmp);
48
  for (u = 0; u < 9; u ++) {
49
    uint64_t w;
50
    int j, k;
51
52
    w = x[u];
53
    j = 30 * (int)u;
54
    k = j & 7;
55
    if (k != 0) {
56
      w <<= k;
57
      j -= k;
58
    }
59
    k = j >> 3;
60
    for (j = 0; j < 8; j ++) {
61
      tmp[39 - k - j] |= (unsigned char)w;
62
      w >>= 8;
63
    }
64
  }
65
  for (u = 8; u < 40; u ++) {
66
    printf("%02X", tmp[u]);
67
  }
68
  printf("\n");
69
}
70
*/
71
72
/*
73
 * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
74
 * that right-shifting a signed negative integer copies the sign bit
75
 * (arithmetic right-shift). This is "implementation-defined behaviour",
76
 * i.e. it is not undefined, but it may differ between compilers. Each
77
 * compiler is supposed to document its behaviour in that respect. GCC
78
 * explicitly defines that an arithmetic right shift is used. We expect
79
 * all other compilers to do the same, because underlying CPU offer an
80
 * arithmetic right shift opcode that could not be used otherwise.
81
 */
82
#if BR_NO_ARITH_SHIFT
83
#define ARSH(x, n)   (((uint32_t)(x) >> (n)) \
84
                    | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
85
#else
86
137k
#define ARSH(x, n)   ((*(int32_t *)&(x)) >> (n))
87
#endif
88
89
/*
90
 * Convert an integer from unsigned little-endian encoding to a sequence of
91
 * 30-bit words in little-endian order. The final "partial" word is
92
 * returned.
93
 */
94
static uint32_t
95
le8_to_le30(uint32_t *dst, const unsigned char *src, size_t len)
96
15
{
97
15
  uint32_t acc;
98
15
  int acc_len;
99
100
15
  acc = 0;
101
15
  acc_len = 0;
102
495
  while (len -- > 0) {
103
480
    uint32_t b;
104
105
480
    b = *src ++;
106
480
    if (acc_len < 22) {
107
360
      acc |= b << acc_len;
108
360
      acc_len += 8;
109
360
    } else {
110
120
      *dst ++ = (acc | (b << acc_len)) & 0x3FFFFFFF;
111
120
      acc = b >> (30 - acc_len);
112
120
      acc_len -= 22;
113
120
    }
114
480
  }
115
15
  return acc;
116
15
}
117
118
/*
119
 * Convert an integer (30-bit words, little-endian) to unsigned
120
 * little-endian encoding. The total encoding length is provided; all
121
 * the destination bytes will be filled.
122
 */
123
static void
124
le30_to_le8(unsigned char *dst, size_t len, const uint32_t *src)
125
15
{
126
15
  uint32_t acc;
127
15
  int acc_len;
128
129
15
  acc = 0;
130
15
  acc_len = 0;
131
495
  while (len -- > 0) {
132
480
    if (acc_len < 8) {
133
135
      uint32_t w;
134
135
135
      w = *src ++;
136
135
      *dst ++ = (unsigned char)(acc | (w << acc_len));
137
135
      acc = w >> (8 - acc_len);
138
135
      acc_len += 22;
139
345
    } else {
140
345
      *dst ++ = (unsigned char)acc;
141
345
      acc >>= 8;
142
345
      acc_len -= 8;
143
345
    }
144
480
  }
145
15
}
146
147
/*
148
 * Multiply two integers. Source integers are represented as arrays of
149
 * nine 30-bit words, for values up to 2^270-1. Result is encoded over
150
 * 18 words of 30 bits each.
151
 */
152
static void
153
mul9(uint32_t *d, const uint32_t *a, const uint32_t *b)
154
19.7k
{
155
  /*
156
   * Maximum intermediate result is no more than
157
   * 10376293531797946367, which fits in 64 bits. Reason:
158
   *
159
   *   10376293531797946367 = 9 * (2^30-1)^2 + 9663676406
160
   *   10376293531797946367 < 9663676407 * 2^30
161
   *
162
   * Thus, adding together 9 products of 30-bit integers, with
163
   * a carry of at most 9663676406, yields an integer that fits
164
   * on 64 bits and generates a carry of at most 9663676406.
165
   */
166
19.7k
  uint64_t t[17];
167
19.7k
  uint64_t cc;
168
19.7k
  int i;
169
170
19.7k
  t[ 0] = MUL31(a[0], b[0]);
171
19.7k
  t[ 1] = MUL31(a[0], b[1])
172
19.7k
    + MUL31(a[1], b[0]);
173
19.7k
  t[ 2] = MUL31(a[0], b[2])
174
19.7k
    + MUL31(a[1], b[1])
175
19.7k
    + MUL31(a[2], b[0]);
176
19.7k
  t[ 3] = MUL31(a[0], b[3])
177
19.7k
    + MUL31(a[1], b[2])
178
19.7k
    + MUL31(a[2], b[1])
179
19.7k
    + MUL31(a[3], b[0]);
180
19.7k
  t[ 4] = MUL31(a[0], b[4])
181
19.7k
    + MUL31(a[1], b[3])
182
19.7k
    + MUL31(a[2], b[2])
183
19.7k
    + MUL31(a[3], b[1])
184
19.7k
    + MUL31(a[4], b[0]);
185
19.7k
  t[ 5] = MUL31(a[0], b[5])
186
19.7k
    + MUL31(a[1], b[4])
187
19.7k
    + MUL31(a[2], b[3])
188
19.7k
    + MUL31(a[3], b[2])
189
19.7k
    + MUL31(a[4], b[1])
190
19.7k
    + MUL31(a[5], b[0]);
191
19.7k
  t[ 6] = MUL31(a[0], b[6])
192
19.7k
    + MUL31(a[1], b[5])
193
19.7k
    + MUL31(a[2], b[4])
194
19.7k
    + MUL31(a[3], b[3])
195
19.7k
    + MUL31(a[4], b[2])
196
19.7k
    + MUL31(a[5], b[1])
197
19.7k
    + MUL31(a[6], b[0]);
198
19.7k
  t[ 7] = MUL31(a[0], b[7])
199
19.7k
    + MUL31(a[1], b[6])
200
19.7k
    + MUL31(a[2], b[5])
201
19.7k
    + MUL31(a[3], b[4])
202
19.7k
    + MUL31(a[4], b[3])
203
19.7k
    + MUL31(a[5], b[2])
204
19.7k
    + MUL31(a[6], b[1])
205
19.7k
    + MUL31(a[7], b[0]);
206
19.7k
  t[ 8] = MUL31(a[0], b[8])
207
19.7k
    + MUL31(a[1], b[7])
208
19.7k
    + MUL31(a[2], b[6])
209
19.7k
    + MUL31(a[3], b[5])
210
19.7k
    + MUL31(a[4], b[4])
211
19.7k
    + MUL31(a[5], b[3])
212
19.7k
    + MUL31(a[6], b[2])
213
19.7k
    + MUL31(a[7], b[1])
214
19.7k
    + MUL31(a[8], b[0]);
215
19.7k
  t[ 9] = MUL31(a[1], b[8])
216
19.7k
    + MUL31(a[2], b[7])
217
19.7k
    + MUL31(a[3], b[6])
218
19.7k
    + MUL31(a[4], b[5])
219
19.7k
    + MUL31(a[5], b[4])
220
19.7k
    + MUL31(a[6], b[3])
221
19.7k
    + MUL31(a[7], b[2])
222
19.7k
    + MUL31(a[8], b[1]);
223
19.7k
  t[10] = MUL31(a[2], b[8])
224
19.7k
    + MUL31(a[3], b[7])
225
19.7k
    + MUL31(a[4], b[6])
226
19.7k
    + MUL31(a[5], b[5])
227
19.7k
    + MUL31(a[6], b[4])
228
19.7k
    + MUL31(a[7], b[3])
229
19.7k
    + MUL31(a[8], b[2]);
230
19.7k
  t[11] = MUL31(a[3], b[8])
231
19.7k
    + MUL31(a[4], b[7])
232
19.7k
    + MUL31(a[5], b[6])
233
19.7k
    + MUL31(a[6], b[5])
234
19.7k
    + MUL31(a[7], b[4])
235
19.7k
    + MUL31(a[8], b[3]);
236
19.7k
  t[12] = MUL31(a[4], b[8])
237
19.7k
    + MUL31(a[5], b[7])
238
19.7k
    + MUL31(a[6], b[6])
239
19.7k
    + MUL31(a[7], b[5])
240
19.7k
    + MUL31(a[8], b[4]);
241
19.7k
  t[13] = MUL31(a[5], b[8])
242
19.7k
    + MUL31(a[6], b[7])
243
19.7k
    + MUL31(a[7], b[6])
244
19.7k
    + MUL31(a[8], b[5]);
245
19.7k
  t[14] = MUL31(a[6], b[8])
246
19.7k
    + MUL31(a[7], b[7])
247
19.7k
    + MUL31(a[8], b[6]);
248
19.7k
  t[15] = MUL31(a[7], b[8])
249
19.7k
    + MUL31(a[8], b[7]);
250
19.7k
  t[16] = MUL31(a[8], b[8]);
251
252
  /*
253
   * Propagate carries.
254
   */
255
19.7k
  cc = 0;
256
355k
  for (i = 0; i < 17; i ++) {
257
336k
    uint64_t w;
258
259
336k
    w = t[i] + cc;
260
336k
    d[i] = (uint32_t)w & 0x3FFFFFFF;
261
336k
    cc = w >> 30;
262
336k
  }
263
19.7k
  d[17] = (uint32_t)cc;
264
19.7k
}
265
266
/*
267
 * Square a 270-bit integer, represented as an array of nine 30-bit words.
268
 * Result uses 18 words of 30 bits each.
269
 */
270
static void
271
square9(uint32_t *d, const uint32_t *a)
272
19.1k
{
273
19.1k
  uint64_t t[17];
274
19.1k
  uint64_t cc;
275
19.1k
  int i;
276
277
19.1k
  t[ 0] = MUL31(a[0], a[0]);
278
19.1k
  t[ 1] = ((MUL31(a[0], a[1])) << 1);
279
19.1k
  t[ 2] = MUL31(a[1], a[1])
280
19.1k
    + ((MUL31(a[0], a[2])) << 1);
281
19.1k
  t[ 3] = ((MUL31(a[0], a[3])
282
19.1k
    + MUL31(a[1], a[2])) << 1);
283
19.1k
  t[ 4] = MUL31(a[2], a[2])
284
19.1k
    + ((MUL31(a[0], a[4])
285
19.1k
    + MUL31(a[1], a[3])) << 1);
286
19.1k
  t[ 5] = ((MUL31(a[0], a[5])
287
19.1k
    + MUL31(a[1], a[4])
288
19.1k
    + MUL31(a[2], a[3])) << 1);
289
19.1k
  t[ 6] = MUL31(a[3], a[3])
290
19.1k
    + ((MUL31(a[0], a[6])
291
19.1k
    + MUL31(a[1], a[5])
292
19.1k
    + MUL31(a[2], a[4])) << 1);
293
19.1k
  t[ 7] = ((MUL31(a[0], a[7])
294
19.1k
    + MUL31(a[1], a[6])
295
19.1k
    + MUL31(a[2], a[5])
296
19.1k
    + MUL31(a[3], a[4])) << 1);
297
19.1k
  t[ 8] = MUL31(a[4], a[4])
298
19.1k
    + ((MUL31(a[0], a[8])
299
19.1k
    + MUL31(a[1], a[7])
300
19.1k
    + MUL31(a[2], a[6])
301
19.1k
    + MUL31(a[3], a[5])) << 1);
302
19.1k
  t[ 9] = ((MUL31(a[1], a[8])
303
19.1k
    + MUL31(a[2], a[7])
304
19.1k
    + MUL31(a[3], a[6])
305
19.1k
    + MUL31(a[4], a[5])) << 1);
306
19.1k
  t[10] = MUL31(a[5], a[5])
307
19.1k
    + ((MUL31(a[2], a[8])
308
19.1k
    + MUL31(a[3], a[7])
309
19.1k
    + MUL31(a[4], a[6])) << 1);
310
19.1k
  t[11] = ((MUL31(a[3], a[8])
311
19.1k
    + MUL31(a[4], a[7])
312
19.1k
    + MUL31(a[5], a[6])) << 1);
313
19.1k
  t[12] = MUL31(a[6], a[6])
314
19.1k
    + ((MUL31(a[4], a[8])
315
19.1k
    + MUL31(a[5], a[7])) << 1);
316
19.1k
  t[13] = ((MUL31(a[5], a[8])
317
19.1k
    + MUL31(a[6], a[7])) << 1);
318
19.1k
  t[14] = MUL31(a[7], a[7])
319
19.1k
    + ((MUL31(a[6], a[8])) << 1);
320
19.1k
  t[15] = ((MUL31(a[7], a[8])) << 1);
321
19.1k
  t[16] = MUL31(a[8], a[8]);
322
323
  /*
324
   * Propagate carries.
325
   */
326
19.1k
  cc = 0;
327
343k
  for (i = 0; i < 17; i ++) {
328
324k
    uint64_t w;
329
330
324k
    w = t[i] + cc;
331
324k
    d[i] = (uint32_t)w & 0x3FFFFFFF;
332
324k
    cc = w >> 30;
333
324k
  }
334
19.1k
  d[17] = (uint32_t)cc;
335
19.1k
}
336
337
/*
338
 * Perform a "final reduction" in field F255 (field for Curve25519)
339
 * The source value must be less than twice the modulus. If the value
340
 * is not lower than the modulus, then the modulus is subtracted and
341
 * this function returns 1; otherwise, it leaves it untouched and it
342
 * returns 0.
343
 */
344
static uint32_t
345
reduce_final_f255(uint32_t *d)
346
15
{
347
15
  uint32_t t[9];
348
15
  uint32_t cc;
349
15
  int i;
350
351
15
  memcpy(t, d, sizeof t);
352
15
  cc = 19;
353
150
  for (i = 0; i < 9; i ++) {
354
135
    uint32_t w;
355
356
135
    w = t[i] + cc;
357
135
    cc = w >> 30;
358
135
    t[i] = w & 0x3FFFFFFF;
359
135
  }
360
15
  cc = t[8] >> 15;
361
15
  t[8] &= 0x7FFF;
362
15
  CCOPY(cc, d, t, sizeof t);
363
15
  return cc;
364
15
}
365
366
/*
367
 * Perform a multiplication of two integers modulo 2^255-19.
368
 * Operands are arrays of 9 words, each containing 30 bits of data, in
369
 * little-endian order. Input value may be up to 2^256-1; on output, value
370
 * fits on 256 bits and is lower than twice the modulus.
371
 */
372
static void
373
f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b)
374
19.7k
{
375
19.7k
  uint32_t t[18], cc;
376
19.7k
  int i;
377
378
  /*
379
   * Compute raw multiplication. All result words fit in 30 bits
380
   * each; upper word (t[17]) must fit on 2 bits, since the product
381
   * of two 256-bit integers must fit on 512 bits.
382
   */
383
19.7k
  mul9(t, a, b);
384
385
  /*
386
   * Modular reduction: each high word is added where necessary.
387
   * Since the modulus is 2^255-19 and word 9 corresponds to
388
   * offset 9*30 = 270, word 9+k must be added to word k with
389
   * a factor of 19*2^15 = 622592. The extra bits in word 8 are also
390
   * added that way.
391
   *
392
   * Keeping the carry on 32 bits helps with 32-bit architectures,
393
   * and does not noticeably impact performance on 64-bit systems.
394
   */
395
19.7k
  cc = MUL15(t[8] >> 15, 19);  /* at most 19*(2^15-1) = 622573 */
396
19.7k
  t[8] &= 0x7FFF;
397
197k
  for (i = 0; i < 9; i ++) {
398
177k
    uint64_t w;
399
400
177k
    w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592);
401
177k
    t[i] = (uint32_t)w & 0x3FFFFFFF;
402
177k
    cc = (uint32_t)(w >> 30);  /* at most 622592 */
403
177k
  }
404
405
  /*
406
   * Original product was up to (2^256-1)^2, i.e. a 512-bit integer.
407
   * This was split into two parts (upper of 257 bits, lower of 255
408
   * bits), and the upper was added to the lower with a factor 19,
409
   * which means that the intermediate value is less than 77*2^255
410
   * (19*2^257 + 2^255). Therefore, the extra bits "t[8] >> 15" are
411
   * less than 77, and the initial carry cc is at most 76*19 = 1444.
412
   */
413
19.7k
  cc = MUL15(t[8] >> 15, 19);
414
19.7k
  t[8] &= 0x7FFF;
415
197k
  for (i = 0; i < 9; i ++) {
416
177k
    uint32_t z;
417
418
177k
    z = t[i] + cc;
419
177k
    d[i] = z & 0x3FFFFFFF;
420
177k
    cc = z >> 30;
421
177k
  }
422
423
  /*
424
   * Final result is at most 2^255 + 1443. In particular, the last
425
   * carry is necessarily 0, since t[8] was truncated to 15 bits.
426
   */
427
19.7k
}
428
429
/*
430
 * Perform a squaring of an integer modulo 2^255-19.
431
 * Operands are arrays of 9 words, each containing 30 bits of data, in
432
 * little-endian order. Input value may be up to 2^256-1; on output, value
433
 * fits on 256 bits and is lower than twice the modulus.
434
 */
435
static void
436
f255_square(uint32_t *d, const uint32_t *a)
437
19.1k
{
438
19.1k
  uint32_t t[18], cc;
439
19.1k
  int i;
440
441
  /*
442
   * Compute raw squaring. All result words fit in 30 bits
443
   * each; upper word (t[17]) must fit on 2 bits, since the square
444
   * of a 256-bit integers must fit on 512 bits.
445
   */
446
19.1k
  square9(t, a);
447
448
  /*
449
   * Modular reduction: each high word is added where necessary.
450
   * See f255_mul() for details on the reduction and carry limits.
451
   */
452
19.1k
  cc = MUL15(t[8] >> 15, 19);
453
19.1k
  t[8] &= 0x7FFF;
454
191k
  for (i = 0; i < 9; i ++) {
455
171k
    uint64_t w;
456
457
171k
    w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592);
458
171k
    t[i] = (uint32_t)w & 0x3FFFFFFF;
459
171k
    cc = (uint32_t)(w >> 30);
460
171k
  }
461
19.1k
  cc = MUL15(t[8] >> 15, 19);
462
19.1k
  t[8] &= 0x7FFF;
463
191k
  for (i = 0; i < 9; i ++) {
464
171k
    uint32_t z;
465
466
171k
    z = t[i] + cc;
467
171k
    d[i] = z & 0x3FFFFFFF;
468
171k
    cc = z >> 30;
469
171k
  }
470
19.1k
}
471
472
/*
473
 * Add two values in F255. Partial reduction is performed (down to less
474
 * than twice the modulus).
475
 */
476
static void
477
f255_add(uint32_t *d, const uint32_t *a, const uint32_t *b)
478
15.3k
{
479
  /*
480
   * Since operand words fit on 30 bits, we can use 32-bit
481
   * variables throughout.
482
   */
483
15.3k
  int i;
484
15.3k
  uint32_t cc, w;
485
486
15.3k
  cc = 0;
487
153k
  for (i = 0; i < 9; i ++) {
488
137k
    w = a[i] + b[i] + cc;
489
137k
    d[i] = w & 0x3FFFFFFF;
490
137k
    cc = w >> 30;
491
137k
  }
492
15.3k
  cc = MUL15(w >> 15, 19);
493
15.3k
  d[8] &= 0x7FFF;
494
153k
  for (i = 0; i < 9; i ++) {
495
137k
    w = d[i] + cc;
496
137k
    d[i] = w & 0x3FFFFFFF;
497
137k
    cc = w >> 30;
498
137k
  }
499
15.3k
}
500
501
/*
502
 * Subtract one value from another in F255. Partial reduction is
503
 * performed (down to less than twice the modulus).
504
 */
505
static void
506
f255_sub(uint32_t *d, const uint32_t *a, const uint32_t *b)
507
15.3k
{
508
  /*
509
   * We actually compute a - b + 2*p, so that the final value is
510
   * necessarily positive.
511
   */
512
15.3k
  int i;
513
15.3k
  uint32_t cc, w;
514
515
15.3k
  cc = (uint32_t)-38;
516
153k
  for (i = 0; i < 9; i ++) {
517
137k
    w = a[i] - b[i] + cc;
518
137k
    d[i] = w & 0x3FFFFFFF;
519
137k
    cc = ARSH(w, 30);
520
137k
  }
521
15.3k
  cc = MUL15((w + 0x10000) >> 15, 19);
522
15.3k
  d[8] &= 0x7FFF;
523
153k
  for (i = 0; i < 9; i ++) {
524
137k
    w = d[i] + cc;
525
137k
    d[i] = w & 0x3FFFFFFF;
526
137k
    cc = w >> 30;
527
137k
  }
528
15.3k
}
529
530
/*
531
 * Multiply an integer by the 'A24' constant (121665). Partial reduction
532
 * is performed (down to less than twice the modulus).
533
 */
534
static void
535
f255_mul_a24(uint32_t *d, const uint32_t *a)
536
3.82k
{
537
3.82k
  int i;
538
3.82k
  uint64_t w;
539
3.82k
  uint32_t cc;
540
541
  /*
542
   * a[] is over 256 bits, thus a[8] has length at most 16 bits.
543
   * We single out the processing of the last word: intermediate
544
   * value w is up to 121665*2^16, yielding a carry for the next
545
   * loop of at most 19*(121665*2^16/2^15) = 4623289.
546
   */
547
3.82k
  cc = 0;
548
34.4k
  for (i = 0; i < 8; i ++) {
549
30.6k
    w = MUL31(a[i], 121665) + (uint64_t)cc;
550
30.6k
    d[i] = (uint32_t)w & 0x3FFFFFFF;
551
30.6k
    cc = (uint32_t)(w >> 30);
552
30.6k
  }
553
3.82k
  w = MUL31(a[8], 121665) + (uint64_t)cc;
554
3.82k
  d[8] = (uint32_t)w & 0x7FFF;
555
3.82k
  cc = MUL15((uint32_t)(w >> 15), 19);
556
557
38.2k
  for (i = 0; i < 9; i ++) {
558
34.4k
    uint32_t z;
559
560
34.4k
    z = d[i] + cc;
561
34.4k
    d[i] = z & 0x3FFFFFFF;
562
34.4k
    cc = z >> 30;
563
34.4k
  }
564
3.82k
}
565
566
static const unsigned char GEN[] = {
567
  0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
568
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
569
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
570
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
571
};
572
573
static const unsigned char ORDER[] = {
574
  0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
575
  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
576
  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
577
  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
578
};
579
580
static const unsigned char *
581
api_generator(int curve, size_t *len)
582
15
{
583
15
  (void)curve;
584
15
  *len = 32;
585
15
  return GEN;
586
15
}
587
588
static const unsigned char *
589
api_order(int curve, size_t *len)
590
15
{
591
15
  (void)curve;
592
15
  *len = 32;
593
15
  return ORDER;
594
15
}
595
596
static size_t
597
api_xoff(int curve, size_t *len)
598
0
{
599
0
  (void)curve;
600
0
  *len = 32;
601
0
  return 0;
602
0
}
603
604
static void
605
cswap(uint32_t *a, uint32_t *b, uint32_t ctl)
606
7.68k
{
607
7.68k
  int i;
608
609
7.68k
  ctl = -ctl;
610
76.8k
  for (i = 0; i < 9; i ++) {
611
69.1k
    uint32_t aw, bw, tw;
612
613
69.1k
    aw = a[i];
614
69.1k
    bw = b[i];
615
69.1k
    tw = ctl & (aw ^ bw);
616
69.1k
    a[i] = aw ^ tw;
617
69.1k
    b[i] = bw ^ tw;
618
69.1k
  }
619
7.68k
}
620
621
static uint32_t
622
api_mul(unsigned char *G, size_t Glen,
623
  const unsigned char *kb, size_t kblen, int curve)
624
15
{
625
15
  uint32_t x1[9], x2[9], x3[9], z2[9], z3[9];
626
15
  uint32_t a[9], aa[9], b[9], bb[9];
627
15
  uint32_t c[9], d[9], e[9], da[9], cb[9];
628
15
  unsigned char k[32];
629
15
  uint32_t swap;
630
15
  int i;
631
632
15
  (void)curve;
633
634
  /*
635
   * Points are encoded over exactly 32 bytes. Multipliers must fit
636
   * in 32 bytes as well.
637
   * RFC 7748 mandates that the high bit of the last point byte must
638
   * be ignored/cleared.
639
   */
640
15
  if (Glen != 32 || kblen > 32) {
641
0
    return 0;
642
0
  }
643
15
  G[31] &= 0x7F;
644
645
  /*
646
   * Initialise variables x1, x2, z2, x3 and z3. We set all of them
647
   * into Montgomery representation.
648
   */
649
15
  x1[8] = le8_to_le30(x1, G, 32);
650
15
  memcpy(x3, x1, sizeof x1);
651
15
  memset(z2, 0, sizeof z2);
652
15
  memset(x2, 0, sizeof x2);
653
15
  x2[0] = 1;
654
15
  memset(z3, 0, sizeof z3);
655
15
  z3[0] = 1;
656
657
15
  memset(k, 0, (sizeof k) - kblen);
658
15
  memcpy(k + (sizeof k) - kblen, kb, kblen);
659
15
  k[31] &= 0xF8;
660
15
  k[0] &= 0x7F;
661
15
  k[0] |= 0x40;
662
663
  /* obsolete
664
  print_int("x1", x1);
665
  */
666
667
15
  swap = 0;
668
3.84k
  for (i = 254; i >= 0; i --) {
669
3.82k
    uint32_t kt;
670
671
3.82k
    kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;
672
3.82k
    swap ^= kt;
673
3.82k
    cswap(x2, x3, swap);
674
3.82k
    cswap(z2, z3, swap);
675
3.82k
    swap = kt;
676
677
    /* obsolete
678
    print_int("x2", x2);
679
    print_int("z2", z2);
680
    print_int("x3", x3);
681
    print_int("z3", z3);
682
    */
683
684
3.82k
    f255_add(a, x2, z2);
685
3.82k
    f255_square(aa, a);
686
3.82k
    f255_sub(b, x2, z2);
687
3.82k
    f255_square(bb, b);
688
3.82k
    f255_sub(e, aa, bb);
689
3.82k
    f255_add(c, x3, z3);
690
3.82k
    f255_sub(d, x3, z3);
691
3.82k
    f255_mul(da, d, a);
692
3.82k
    f255_mul(cb, c, b);
693
694
    /* obsolete
695
    print_int("a ", a);
696
    print_int("aa", aa);
697
    print_int("b ", b);
698
    print_int("bb", bb);
699
    print_int("e ", e);
700
    print_int("c ", c);
701
    print_int("d ", d);
702
    print_int("da", da);
703
    print_int("cb", cb);
704
    */
705
706
3.82k
    f255_add(x3, da, cb);
707
3.82k
    f255_square(x3, x3);
708
3.82k
    f255_sub(z3, da, cb);
709
3.82k
    f255_square(z3, z3);
710
3.82k
    f255_mul(z3, z3, x1);
711
3.82k
    f255_mul(x2, aa, bb);
712
3.82k
    f255_mul_a24(z2, e);
713
3.82k
    f255_add(z2, z2, aa);
714
3.82k
    f255_mul(z2, e, z2);
715
716
    /* obsolete
717
    print_int("x2", x2);
718
    print_int("z2", z2);
719
    print_int("x3", x3);
720
    print_int("z3", z3);
721
    */
722
3.82k
  }
723
15
  cswap(x2, x3, swap);
724
15
  cswap(z2, z3, swap);
725
726
  /*
727
   * Inverse z2 with a modular exponentiation. This is a simple
728
   * square-and-multiply algorithm; we mutualise most non-squarings
729
   * since the exponent contains almost only ones.
730
   */
731
15
  memcpy(a, z2, sizeof z2);
732
240
  for (i = 0; i < 15; i ++) {
733
225
    f255_square(a, a);
734
225
    f255_mul(a, a, z2);
735
225
  }
736
15
  memcpy(b, a, sizeof a);
737
225
  for (i = 0; i < 14; i ++) {
738
210
    int j;
739
740
3.57k
    for (j = 0; j < 16; j ++) {
741
3.36k
      f255_square(b, b);
742
3.36k
    }
743
210
    f255_mul(b, b, a);
744
210
  }
745
240
  for (i = 14; i >= 0; i --) {
746
225
    f255_square(b, b);
747
225
    if ((0xFFEB >> i) & 1) {
748
195
      f255_mul(b, z2, b);
749
195
    }
750
225
  }
751
15
  f255_mul(x2, x2, b);
752
15
  reduce_final_f255(x2);
753
15
  le30_to_le8(G, 32, x2);
754
15
  return 1;
755
15
}
756
757
static size_t
758
api_mulgen(unsigned char *R,
759
  const unsigned char *x, size_t xlen, int curve)
760
15
{
761
15
  const unsigned char *G;
762
15
  size_t Glen;
763
764
15
  G = api_generator(curve, &Glen);
765
15
  memcpy(R, G, Glen);
766
15
  api_mul(R, Glen, x, xlen, curve);
767
15
  return Glen;
768
15
}
769
770
static uint32_t
771
api_muladd(unsigned char *A, const unsigned char *B, size_t len,
772
  const unsigned char *x, size_t xlen,
773
  const unsigned char *y, size_t ylen, int curve)
774
0
{
775
  /*
776
   * We don't implement this method, since it is used for ECDSA
777
   * only, and there is no ECDSA over Curve25519 (which instead
778
   * uses EdDSA).
779
   */
780
0
  (void)A;
781
0
  (void)B;
782
0
  (void)len;
783
0
  (void)x;
784
0
  (void)xlen;
785
0
  (void)y;
786
0
  (void)ylen;
787
0
  (void)curve;
788
0
  return 0;
789
0
}
790
791
/* see bearssl_ec.h */
792
const br_ec_impl br_ec_c25519_m31 = {
793
  (uint32_t)0x20000000,
794
  &api_generator,
795
  &api_order,
796
  &api_xoff,
797
  &api_mul,
798
  &api_mulgen,
799
  &api_muladd
800
};