Coverage Report

Created: 2023-06-07 07:11

/src/boringssl/crypto/curve25519/curve25519.c
Line
Count
Source (jump to first uncovered line)
1
/* Copyright (c) 2020, Google Inc.
2
 *
3
 * Permission to use, copy, modify, and/or distribute this software for any
4
 * purpose with or without fee is hereby granted, provided that the above
5
 * copyright notice and this permission notice appear in all copies.
6
 *
7
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
10
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
12
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
13
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
14
15
// Some of this code is taken from the ref10 version of Ed25519 in SUPERCOP
16
// 20141124 (http://bench.cr.yp.to/supercop.html). That code is released as
17
// public domain. Other parts have been replaced to call into code generated by
18
// Fiat (https://github.com/mit-plv/fiat-crypto) in //third_party/fiat.
19
//
20
// The field functions are shared by Ed25519 and X25519 where possible.
21
22
#include <assert.h>
23
#include <string.h>
24
25
#include <openssl/mem.h>
26
#include <openssl/rand.h>
27
#include <openssl/sha.h>
28
29
#include "internal.h"
30
#include "../internal.h"
31
32
// Various pre-computed constants.
33
#include "./curve25519_tables.h"
34
35
#if defined(BORINGSSL_HAS_UINT128)
36
#include "../../third_party/fiat/curve25519_64.h"
37
#elif defined(OPENSSL_64_BIT)
38
#include "../../third_party/fiat/curve25519_64_msvc.h"
39
#else
40
#include "../../third_party/fiat/curve25519_32.h"
41
#endif
42
43
44
// Low-level intrinsic operations
45
46
0
static uint64_t load_3(const uint8_t *in) {
47
0
  uint64_t result;
48
0
  result = (uint64_t)in[0];
49
0
  result |= ((uint64_t)in[1]) << 8;
50
0
  result |= ((uint64_t)in[2]) << 16;
51
0
  return result;
52
0
}
53
54
0
static uint64_t load_4(const uint8_t *in) {
55
0
  uint64_t result;
56
0
  result = (uint64_t)in[0];
57
0
  result |= ((uint64_t)in[1]) << 8;
58
0
  result |= ((uint64_t)in[2]) << 16;
59
0
  result |= ((uint64_t)in[3]) << 24;
60
0
  return result;
61
0
}
62
63
64
// Field operations.
65
66
#if defined(OPENSSL_64_BIT)
67
68
typedef uint64_t fe_limb_t;
69
0
#define FE_NUM_LIMBS 5
70
71
// assert_fe asserts that |f| satisfies bounds:
72
//
73
//  [[0x0 ~> 0x8cccccccccccc],
74
//   [0x0 ~> 0x8cccccccccccc],
75
//   [0x0 ~> 0x8cccccccccccc],
76
//   [0x0 ~> 0x8cccccccccccc],
77
//   [0x0 ~> 0x8cccccccccccc]]
78
//
79
// See comments in curve25519_64.h for which functions use these bounds for
80
// inputs or outputs.
81
#define assert_fe(f)                                                    \
82
30.7k
  do {                                                                  \
83
184k
    for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \
84
153k
      assert(f[_assert_fe_i] <= UINT64_C(0x8cccccccccccc));             \
85
153k
    }                                                                   \
86
30.7k
  } while (0)
87
88
// assert_fe_loose asserts that |f| satisfies bounds:
89
//
90
//  [[0x0 ~> 0x1a666666666664],
91
//   [0x0 ~> 0x1a666666666664],
92
//   [0x0 ~> 0x1a666666666664],
93
//   [0x0 ~> 0x1a666666666664],
94
//   [0x0 ~> 0x1a666666666664]]
95
//
96
// See comments in curve25519_64.h for which functions use these bounds for
97
// inputs or outputs.
98
#define assert_fe_loose(f)                                              \
99
31.9k
  do {                                                                  \
100
191k
    for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \
101
159k
      assert(f[_assert_fe_i] <= UINT64_C(0x1a666666666664));            \
102
159k
    }                                                                   \
103
31.9k
  } while (0)
104
105
#else
106
107
typedef uint32_t fe_limb_t;
108
#define FE_NUM_LIMBS 10
109
110
// assert_fe asserts that |f| satisfies bounds:
111
//
112
//  [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
113
//   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
114
//   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
115
//   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
116
//   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
117
//
118
// See comments in curve25519_32.h for which functions use these bounds for
119
// inputs or outputs.
120
#define assert_fe(f)                                                     \
121
  do {                                                                   \
122
    for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \
123
      assert(f[_assert_fe_i] <=                                          \
124
             ((_assert_fe_i & 1) ? 0x2333333u : 0x4666666u));            \
125
    }                                                                    \
126
  } while (0)
127
128
// assert_fe_loose asserts that |f| satisfies bounds:
129
//
130
//  [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
131
//   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
132
//   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
133
//   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
134
//   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]]
135
//
136
// See comments in curve25519_32.h for which functions use these bounds for
137
// inputs or outputs.
138
#define assert_fe_loose(f)                                               \
139
  do {                                                                   \
140
    for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \
141
      assert(f[_assert_fe_i] <=                                          \
142
             ((_assert_fe_i & 1) ? 0x6999999u : 0xd333332u));            \
143
    }                                                                    \
144
  } while (0)
145
146
#endif  // OPENSSL_64_BIT
147
148
static_assert(sizeof(fe) == sizeof(fe_limb_t) * FE_NUM_LIMBS,
149
              "fe_limb_t[FE_NUM_LIMBS] is inconsistent with fe");
150
151
0
static void fe_frombytes_strict(fe *h, const uint8_t s[32]) {
152
  // |fiat_25519_from_bytes| requires the top-most bit be clear.
153
0
  assert((s[31] & 0x80) == 0);
154
0
  fiat_25519_from_bytes(h->v, s);
155
0
  assert_fe(h->v);
156
0
}
157
158
0
static void fe_frombytes(fe *h, const uint8_t s[32]) {
159
0
  uint8_t s_copy[32];
160
0
  OPENSSL_memcpy(s_copy, s, 32);
161
0
  s_copy[31] &= 0x7f;
162
0
  fe_frombytes_strict(h, s_copy);
163
0
}
164
165
197
static void fe_tobytes(uint8_t s[32], const fe *f) {
166
197
  assert_fe(f->v);
167
197
  fiat_25519_to_bytes(s, f->v);
168
197
}
169
170
// h = 0
171
0
static void fe_0(fe *h) {
172
0
  OPENSSL_memset(h, 0, sizeof(fe));
173
0
}
174
175
0
static void fe_loose_0(fe_loose *h) {
176
0
  OPENSSL_memset(h, 0, sizeof(fe_loose));
177
0
}
178
179
// h = 1
180
0
static void fe_1(fe *h) {
181
0
  OPENSSL_memset(h, 0, sizeof(fe));
182
0
  h->v[0] = 1;
183
0
}
184
185
0
static void fe_loose_1(fe_loose *h) {
186
0
  OPENSSL_memset(h, 0, sizeof(fe_loose));
187
0
  h->v[0] = 1;
188
0
}
189
190
// h = f + g
191
// Can overlap h with f or g.
192
31
static void fe_add(fe_loose *h, const fe *f, const fe *g) {
193
31
  assert_fe(f->v);
194
31
  assert_fe(g->v);
195
31
  fiat_25519_add(h->v, f->v, g->v);
196
31
  assert_fe_loose(h->v);
197
31
}
198
199
// h = f - g
200
// Can overlap h with f or g.
201
31
static void fe_sub(fe_loose *h, const fe *f, const fe *g) {
202
31
  assert_fe(f->v);
203
31
  assert_fe(g->v);
204
31
  fiat_25519_sub(h->v, f->v, g->v);
205
31
  assert_fe_loose(h->v);
206
31
}
207
208
0
static void fe_carry(fe *h, const fe_loose* f) {
209
0
  assert_fe_loose(f->v);
210
0
  fiat_25519_carry(h->v, f->v);
211
0
  assert_fe(h->v);
212
0
}
213
214
static void fe_mul_impl(fe_limb_t out[FE_NUM_LIMBS],
215
                        const fe_limb_t in1[FE_NUM_LIMBS],
216
1.45k
                        const fe_limb_t in2[FE_NUM_LIMBS]) {
217
1.45k
  assert_fe_loose(in1);
218
1.45k
  assert_fe_loose(in2);
219
1.45k
  fiat_25519_carry_mul(out, in1, in2);
220
1.45k
  assert_fe(out);
221
1.45k
}
222
223
0
static void fe_mul_ltt(fe_loose *h, const fe *f, const fe *g) {
224
0
  fe_mul_impl(h->v, f->v, g->v);
225
0
}
226
227
0
static void fe_mul_llt(fe_loose *h, const fe_loose *f, const fe *g) {
228
0
  fe_mul_impl(h->v, f->v, g->v);
229
0
}
230
231
1.30k
static void fe_mul_ttt(fe *h, const fe *f, const fe *g) {
232
1.30k
  fe_mul_impl(h->v, f->v, g->v);
233
1.30k
}
234
235
145
static void fe_mul_tlt(fe *h, const fe_loose *f, const fe *g) {
236
145
  fe_mul_impl(h->v, f->v, g->v);
237
145
}
238
239
0
static void fe_mul_ttl(fe *h, const fe *f, const fe_loose *g) {
240
0
  fe_mul_impl(h->v, f->v, g->v);
241
0
}
242
243
0
static void fe_mul_tll(fe *h, const fe_loose *f, const fe_loose *g) {
244
0
  fe_mul_impl(h->v, f->v, g->v);
245
0
}
246
247
114
static void fe_sq_tl(fe *h, const fe_loose *f) {
248
114
  assert_fe_loose(f->v);
249
114
  fiat_25519_carry_square(h->v, f->v);
250
114
  assert_fe(h->v);
251
114
}
252
253
28.8k
static void fe_sq_tt(fe *h, const fe *f) {
254
28.8k
  assert_fe_loose(f->v);
255
28.8k
  fiat_25519_carry_square(h->v, f->v);
256
28.8k
  assert_fe(h->v);
257
28.8k
}
258
259
// Replace (f,g) with (g,f) if b == 1;
260
// replace (f,g) with (f,g) if b == 0.
261
//
262
// Preconditions: b in {0,1}.
263
0
static void fe_cswap(fe *f, fe *g, fe_limb_t b) {
264
0
  b = 0-b;
265
0
  for (unsigned i = 0; i < FE_NUM_LIMBS; i++) {
266
0
    fe_limb_t x = f->v[i] ^ g->v[i];
267
0
    x &= b;
268
0
    f->v[i] ^= x;
269
0
    g->v[i] ^= x;
270
0
  }
271
0
}
272
273
0
static void fe_mul121666(fe *h, const fe_loose *f) {
274
0
  assert_fe_loose(f->v);
275
0
  fiat_25519_carry_scmul_121666(h->v, f->v);
276
0
  assert_fe(h->v);
277
0
}
278
279
// h = -f
280
0
static void fe_neg(fe_loose *h, const fe *f) {
281
0
  assert_fe(f->v);
282
0
  fiat_25519_opp(h->v, f->v);
283
0
  assert_fe_loose(h->v);
284
0
}
285
286
// Replace (f,g) with (g,g) if b == 1;
287
// replace (f,g) with (f,g) if b == 0.
288
//
289
// Preconditions: b in {0,1}.
290
0
static void fe_cmov(fe_loose *f, const fe_loose *g, fe_limb_t b) {
291
  // Silence an unused function warning. |fiat_25519_selectznz| isn't quite the
292
  // calling convention the rest of this code wants, so implement it by hand.
293
  //
294
  // TODO(davidben): Switch to fiat's calling convention, or ask fiat to emit a
295
  // different one.
296
0
  (void)fiat_25519_selectznz;
297
298
0
  b = 0-b;
299
0
  for (unsigned i = 0; i < FE_NUM_LIMBS; i++) {
300
0
    fe_limb_t x = f->v[i] ^ g->v[i];
301
0
    x &= b;
302
0
    f->v[i] ^= x;
303
0
  }
304
0
}
305
306
// h = f
307
0
static void fe_copy(fe *h, const fe *f) {
308
0
  OPENSSL_memmove(h, f, sizeof(fe));
309
0
}
310
311
83
static void fe_copy_lt(fe_loose *h, const fe *f) {
312
83
  static_assert(sizeof(fe_loose) == sizeof(fe), "fe and fe_loose mismatch");
313
83
  OPENSSL_memmove(h, f, sizeof(fe));
314
83
}
315
316
114
static void fe_loose_invert(fe *out, const fe_loose *z) {
317
114
  fe t0;
318
114
  fe t1;
319
114
  fe t2;
320
114
  fe t3;
321
114
  int i;
322
323
114
  fe_sq_tl(&t0, z);
324
114
  fe_sq_tt(&t1, &t0);
325
228
  for (i = 1; i < 2; ++i) {
326
114
    fe_sq_tt(&t1, &t1);
327
114
  }
328
114
  fe_mul_tlt(&t1, z, &t1);
329
114
  fe_mul_ttt(&t0, &t0, &t1);
330
114
  fe_sq_tt(&t2, &t0);
331
114
  fe_mul_ttt(&t1, &t1, &t2);
332
114
  fe_sq_tt(&t2, &t1);
333
570
  for (i = 1; i < 5; ++i) {
334
456
    fe_sq_tt(&t2, &t2);
335
456
  }
336
114
  fe_mul_ttt(&t1, &t2, &t1);
337
114
  fe_sq_tt(&t2, &t1);
338
1.14k
  for (i = 1; i < 10; ++i) {
339
1.02k
    fe_sq_tt(&t2, &t2);
340
1.02k
  }
341
114
  fe_mul_ttt(&t2, &t2, &t1);
342
114
  fe_sq_tt(&t3, &t2);
343
2.28k
  for (i = 1; i < 20; ++i) {
344
2.16k
    fe_sq_tt(&t3, &t3);
345
2.16k
  }
346
114
  fe_mul_ttt(&t2, &t3, &t2);
347
114
  fe_sq_tt(&t2, &t2);
348
1.14k
  for (i = 1; i < 10; ++i) {
349
1.02k
    fe_sq_tt(&t2, &t2);
350
1.02k
  }
351
114
  fe_mul_ttt(&t1, &t2, &t1);
352
114
  fe_sq_tt(&t2, &t1);
353
5.70k
  for (i = 1; i < 50; ++i) {
354
5.58k
    fe_sq_tt(&t2, &t2);
355
5.58k
  }
356
114
  fe_mul_ttt(&t2, &t2, &t1);
357
114
  fe_sq_tt(&t3, &t2);
358
11.4k
  for (i = 1; i < 100; ++i) {
359
11.2k
    fe_sq_tt(&t3, &t3);
360
11.2k
  }
361
114
  fe_mul_ttt(&t2, &t3, &t2);
362
114
  fe_sq_tt(&t2, &t2);
363
5.70k
  for (i = 1; i < 50; ++i) {
364
5.58k
    fe_sq_tt(&t2, &t2);
365
5.58k
  }
366
114
  fe_mul_ttt(&t1, &t2, &t1);
367
114
  fe_sq_tt(&t1, &t1);
368
570
  for (i = 1; i < 5; ++i) {
369
456
    fe_sq_tt(&t1, &t1);
370
456
  }
371
114
  fe_mul_ttt(out, &t1, &t0);
372
114
}
373
374
83
static void fe_invert(fe *out, const fe *z) {
375
83
  fe_loose l;
376
83
  fe_copy_lt(&l, z);
377
83
  fe_loose_invert(out, &l);
378
83
}
379
380
// return 0 if f == 0
381
// return 1 if f != 0
382
0
static int fe_isnonzero(const fe_loose *f) {
383
0
  fe tight;
384
0
  fe_carry(&tight, f);
385
0
  uint8_t s[32];
386
0
  fe_tobytes(s, &tight);
387
388
0
  static const uint8_t zero[32] = {0};
389
0
  return CRYPTO_memcmp(s, zero, sizeof(zero)) != 0;
390
0
}
391
392
// return 1 if f is in {1,3,5,...,q-2}
393
// return 0 if f is in {0,2,4,...,q-1}
394
83
static int fe_isnegative(const fe *f) {
395
83
  uint8_t s[32];
396
83
  fe_tobytes(s, f);
397
83
  return s[0] & 1;
398
83
}
399
400
0
static void fe_sq2_tt(fe *h, const fe *f) {
401
  // h = f^2
402
0
  fe_sq_tt(h, f);
403
404
  // h = h + h
405
0
  fe_loose tmp;
406
0
  fe_add(&tmp, h, h);
407
0
  fe_carry(h, &tmp);
408
0
}
409
410
0
static void fe_pow22523(fe *out, const fe *z) {
411
0
  fe t0;
412
0
  fe t1;
413
0
  fe t2;
414
0
  int i;
415
416
0
  fe_sq_tt(&t0, z);
417
0
  fe_sq_tt(&t1, &t0);
418
0
  for (i = 1; i < 2; ++i) {
419
0
    fe_sq_tt(&t1, &t1);
420
0
  }
421
0
  fe_mul_ttt(&t1, z, &t1);
422
0
  fe_mul_ttt(&t0, &t0, &t1);
423
0
  fe_sq_tt(&t0, &t0);
424
0
  fe_mul_ttt(&t0, &t1, &t0);
425
0
  fe_sq_tt(&t1, &t0);
426
0
  for (i = 1; i < 5; ++i) {
427
0
    fe_sq_tt(&t1, &t1);
428
0
  }
429
0
  fe_mul_ttt(&t0, &t1, &t0);
430
0
  fe_sq_tt(&t1, &t0);
431
0
  for (i = 1; i < 10; ++i) {
432
0
    fe_sq_tt(&t1, &t1);
433
0
  }
434
0
  fe_mul_ttt(&t1, &t1, &t0);
435
0
  fe_sq_tt(&t2, &t1);
436
0
  for (i = 1; i < 20; ++i) {
437
0
    fe_sq_tt(&t2, &t2);
438
0
  }
439
0
  fe_mul_ttt(&t1, &t2, &t1);
440
0
  fe_sq_tt(&t1, &t1);
441
0
  for (i = 1; i < 10; ++i) {
442
0
    fe_sq_tt(&t1, &t1);
443
0
  }
444
0
  fe_mul_ttt(&t0, &t1, &t0);
445
0
  fe_sq_tt(&t1, &t0);
446
0
  for (i = 1; i < 50; ++i) {
447
0
    fe_sq_tt(&t1, &t1);
448
0
  }
449
0
  fe_mul_ttt(&t1, &t1, &t0);
450
0
  fe_sq_tt(&t2, &t1);
451
0
  for (i = 1; i < 100; ++i) {
452
0
    fe_sq_tt(&t2, &t2);
453
0
  }
454
0
  fe_mul_ttt(&t1, &t2, &t1);
455
0
  fe_sq_tt(&t1, &t1);
456
0
  for (i = 1; i < 50; ++i) {
457
0
    fe_sq_tt(&t1, &t1);
458
0
  }
459
0
  fe_mul_ttt(&t0, &t1, &t0);
460
0
  fe_sq_tt(&t0, &t0);
461
0
  for (i = 1; i < 2; ++i) {
462
0
    fe_sq_tt(&t0, &t0);
463
0
  }
464
0
  fe_mul_ttt(out, &t0, z);
465
0
}
466
467
468
// Group operations.
469
470
0
void x25519_ge_tobytes(uint8_t s[32], const ge_p2 *h) {
471
0
  fe recip;
472
0
  fe x;
473
0
  fe y;
474
475
0
  fe_invert(&recip, &h->Z);
476
0
  fe_mul_ttt(&x, &h->X, &recip);
477
0
  fe_mul_ttt(&y, &h->Y, &recip);
478
0
  fe_tobytes(s, &y);
479
0
  s[31] ^= fe_isnegative(&x) << 7;
480
0
}
481
482
83
static void ge_p3_tobytes(uint8_t s[32], const ge_p3 *h) {
483
83
  fe recip;
484
83
  fe x;
485
83
  fe y;
486
487
83
  fe_invert(&recip, &h->Z);
488
83
  fe_mul_ttt(&x, &h->X, &recip);
489
83
  fe_mul_ttt(&y, &h->Y, &recip);
490
83
  fe_tobytes(s, &y);
491
83
  s[31] ^= fe_isnegative(&x) << 7;
492
83
}
493
494
0
int x25519_ge_frombytes_vartime(ge_p3 *h, const uint8_t s[32]) {
495
0
  fe u;
496
0
  fe_loose v;
497
0
  fe w;
498
0
  fe vxx;
499
0
  fe_loose check;
500
501
0
  fe_frombytes(&h->Y, s);
502
0
  fe_1(&h->Z);
503
0
  fe_sq_tt(&w, &h->Y);
504
0
  fe_mul_ttt(&vxx, &w, &d);
505
0
  fe_sub(&v, &w, &h->Z);  // u = y^2-1
506
0
  fe_carry(&u, &v);
507
0
  fe_add(&v, &vxx, &h->Z);  // v = dy^2+1
508
509
0
  fe_mul_ttl(&w, &u, &v);  // w = u*v
510
0
  fe_pow22523(&h->X, &w);  // x = w^((q-5)/8)
511
0
  fe_mul_ttt(&h->X, &h->X, &u);  // x = u*w^((q-5)/8)
512
513
0
  fe_sq_tt(&vxx, &h->X);
514
0
  fe_mul_ttl(&vxx, &vxx, &v);
515
0
  fe_sub(&check, &vxx, &u);
516
0
  if (fe_isnonzero(&check)) {
517
0
    fe_add(&check, &vxx, &u);
518
0
    if (fe_isnonzero(&check)) {
519
0
      return 0;
520
0
    }
521
0
    fe_mul_ttt(&h->X, &h->X, &sqrtm1);
522
0
  }
523
524
0
  if (fe_isnegative(&h->X) != (s[31] >> 7)) {
525
0
    fe_loose t;
526
0
    fe_neg(&t, &h->X);
527
0
    fe_carry(&h->X, &t);
528
0
  }
529
530
0
  fe_mul_ttt(&h->T, &h->X, &h->Y);
531
0
  return 1;
532
0
}
533
534
0
static void ge_p2_0(ge_p2 *h) {
535
0
  fe_0(&h->X);
536
0
  fe_1(&h->Y);
537
0
  fe_1(&h->Z);
538
0
}
539
540
0
static void ge_p3_0(ge_p3 *h) {
541
0
  fe_0(&h->X);
542
0
  fe_1(&h->Y);
543
0
  fe_1(&h->Z);
544
0
  fe_0(&h->T);
545
0
}
546
547
0
static void ge_cached_0(ge_cached *h) {
548
0
  fe_loose_1(&h->YplusX);
549
0
  fe_loose_1(&h->YminusX);
550
0
  fe_loose_1(&h->Z);
551
0
  fe_loose_0(&h->T2d);
552
0
}
553
554
0
static void ge_precomp_0(ge_precomp *h) {
555
0
  fe_loose_1(&h->yplusx);
556
0
  fe_loose_1(&h->yminusx);
557
0
  fe_loose_0(&h->xy2d);
558
0
}
559
560
// r = p
561
0
static void ge_p3_to_p2(ge_p2 *r, const ge_p3 *p) {
562
0
  fe_copy(&r->X, &p->X);
563
0
  fe_copy(&r->Y, &p->Y);
564
0
  fe_copy(&r->Z, &p->Z);
565
0
}
566
567
// r = p
568
0
void x25519_ge_p3_to_cached(ge_cached *r, const ge_p3 *p) {
569
0
  fe_add(&r->YplusX, &p->Y, &p->X);
570
0
  fe_sub(&r->YminusX, &p->Y, &p->X);
571
0
  fe_copy_lt(&r->Z, &p->Z);
572
0
  fe_mul_ltt(&r->T2d, &p->T, &d2);
573
0
}
574
575
// r = p
576
0
void x25519_ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p) {
577
0
  fe_mul_tll(&r->X, &p->X, &p->T);
578
0
  fe_mul_tll(&r->Y, &p->Y, &p->Z);
579
0
  fe_mul_tll(&r->Z, &p->Z, &p->T);
580
0
}
581
582
// r = p
583
0
void x25519_ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p) {
584
0
  fe_mul_tll(&r->X, &p->X, &p->T);
585
0
  fe_mul_tll(&r->Y, &p->Y, &p->Z);
586
0
  fe_mul_tll(&r->Z, &p->Z, &p->T);
587
0
  fe_mul_tll(&r->T, &p->X, &p->Y);
588
0
}
589
590
// r = p
591
0
static void ge_p1p1_to_cached(ge_cached *r, const ge_p1p1 *p) {
592
0
  ge_p3 t;
593
0
  x25519_ge_p1p1_to_p3(&t, p);
594
0
  x25519_ge_p3_to_cached(r, &t);
595
0
}
596
597
// r = 2 * p
598
0
static void ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p) {
599
0
  fe trX, trZ, trT;
600
0
  fe t0;
601
602
0
  fe_sq_tt(&trX, &p->X);
603
0
  fe_sq_tt(&trZ, &p->Y);
604
0
  fe_sq2_tt(&trT, &p->Z);
605
0
  fe_add(&r->Y, &p->X, &p->Y);
606
0
  fe_sq_tl(&t0, &r->Y);
607
608
0
  fe_add(&r->Y, &trZ, &trX);
609
0
  fe_sub(&r->Z, &trZ, &trX);
610
0
  fe_carry(&trZ, &r->Y);
611
0
  fe_sub(&r->X, &t0, &trZ);
612
0
  fe_carry(&trZ, &r->Z);
613
0
  fe_sub(&r->T, &trT, &trZ);
614
0
}
615
616
// r = 2 * p
617
0
static void ge_p3_dbl(ge_p1p1 *r, const ge_p3 *p) {
618
0
  ge_p2 q;
619
0
  ge_p3_to_p2(&q, p);
620
0
  ge_p2_dbl(r, &q);
621
0
}
622
623
// r = p + q
624
0
static void ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) {
625
0
  fe trY, trZ, trT;
626
627
0
  fe_add(&r->X, &p->Y, &p->X);
628
0
  fe_sub(&r->Y, &p->Y, &p->X);
629
0
  fe_mul_tll(&trZ, &r->X, &q->yplusx);
630
0
  fe_mul_tll(&trY, &r->Y, &q->yminusx);
631
0
  fe_mul_tlt(&trT, &q->xy2d, &p->T);
632
0
  fe_add(&r->T, &p->Z, &p->Z);
633
0
  fe_sub(&r->X, &trZ, &trY);
634
0
  fe_add(&r->Y, &trZ, &trY);
635
0
  fe_carry(&trZ, &r->T);
636
0
  fe_add(&r->Z, &trZ, &trT);
637
0
  fe_sub(&r->T, &trZ, &trT);
638
0
}
639
640
// r = p - q
641
0
static void ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) {
642
0
  fe trY, trZ, trT;
643
644
0
  fe_add(&r->X, &p->Y, &p->X);
645
0
  fe_sub(&r->Y, &p->Y, &p->X);
646
0
  fe_mul_tll(&trZ, &r->X, &q->yminusx);
647
0
  fe_mul_tll(&trY, &r->Y, &q->yplusx);
648
0
  fe_mul_tlt(&trT, &q->xy2d, &p->T);
649
0
  fe_add(&r->T, &p->Z, &p->Z);
650
0
  fe_sub(&r->X, &trZ, &trY);
651
0
  fe_add(&r->Y, &trZ, &trY);
652
0
  fe_carry(&trZ, &r->T);
653
0
  fe_sub(&r->Z, &trZ, &trT);
654
0
  fe_add(&r->T, &trZ, &trT);
655
0
}
656
657
// r = p + q
658
0
void x25519_ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
659
0
  fe trX, trY, trZ, trT;
660
661
0
  fe_add(&r->X, &p->Y, &p->X);
662
0
  fe_sub(&r->Y, &p->Y, &p->X);
663
0
  fe_mul_tll(&trZ, &r->X, &q->YplusX);
664
0
  fe_mul_tll(&trY, &r->Y, &q->YminusX);
665
0
  fe_mul_tlt(&trT, &q->T2d, &p->T);
666
0
  fe_mul_ttl(&trX, &p->Z, &q->Z);
667
0
  fe_add(&r->T, &trX, &trX);
668
0
  fe_sub(&r->X, &trZ, &trY);
669
0
  fe_add(&r->Y, &trZ, &trY);
670
0
  fe_carry(&trZ, &r->T);
671
0
  fe_add(&r->Z, &trZ, &trT);
672
0
  fe_sub(&r->T, &trZ, &trT);
673
0
}
674
675
// r = p - q
676
0
void x25519_ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
677
0
  fe trX, trY, trZ, trT;
678
679
0
  fe_add(&r->X, &p->Y, &p->X);
680
0
  fe_sub(&r->Y, &p->Y, &p->X);
681
0
  fe_mul_tll(&trZ, &r->X, &q->YminusX);
682
0
  fe_mul_tll(&trY, &r->Y, &q->YplusX);
683
0
  fe_mul_tlt(&trT, &q->T2d, &p->T);
684
0
  fe_mul_ttl(&trX, &p->Z, &q->Z);
685
0
  fe_add(&r->T, &trX, &trX);
686
0
  fe_sub(&r->X, &trZ, &trY);
687
0
  fe_add(&r->Y, &trZ, &trY);
688
0
  fe_carry(&trZ, &r->T);
689
0
  fe_sub(&r->Z, &trZ, &trT);
690
0
  fe_add(&r->T, &trZ, &trT);
691
0
}
692
693
0
static void cmov(ge_precomp *t, const ge_precomp *u, uint8_t b) {
694
0
  fe_cmov(&t->yplusx, &u->yplusx, b);
695
0
  fe_cmov(&t->yminusx, &u->yminusx, b);
696
0
  fe_cmov(&t->xy2d, &u->xy2d, b);
697
0
}
698
699
void x25519_ge_scalarmult_small_precomp(
700
0
    ge_p3 *h, const uint8_t a[32], const uint8_t precomp_table[15 * 2 * 32]) {
701
  // precomp_table is first expanded into matching |ge_precomp|
702
  // elements.
703
0
  ge_precomp multiples[15];
704
705
0
  unsigned i;
706
0
  for (i = 0; i < 15; i++) {
707
    // The precomputed table is assumed to already clear the top bit, so
708
    // |fe_frombytes_strict| may be used directly.
709
0
    const uint8_t *bytes = &precomp_table[i*(2 * 32)];
710
0
    fe x, y;
711
0
    fe_frombytes_strict(&x, bytes);
712
0
    fe_frombytes_strict(&y, bytes + 32);
713
714
0
    ge_precomp *out = &multiples[i];
715
0
    fe_add(&out->yplusx, &y, &x);
716
0
    fe_sub(&out->yminusx, &y, &x);
717
0
    fe_mul_ltt(&out->xy2d, &x, &y);
718
0
    fe_mul_llt(&out->xy2d, &out->xy2d, &d2);
719
0
  }
720
721
  // See the comment above |k25519SmallPrecomp| about the structure of the
722
  // precomputed elements. This loop does 64 additions and 64 doublings to
723
  // calculate the result.
724
0
  ge_p3_0(h);
725
726
0
  for (i = 63; i < 64; i--) {
727
0
    unsigned j;
728
0
    signed char index = 0;
729
730
0
    for (j = 0; j < 4; j++) {
731
0
      const uint8_t bit = 1 & (a[(8 * j) + (i / 8)] >> (i & 7));
732
0
      index |= (bit << j);
733
0
    }
734
735
0
    ge_precomp e;
736
0
    ge_precomp_0(&e);
737
738
0
    for (j = 1; j < 16; j++) {
739
0
      cmov(&e, &multiples[j-1], 1&constant_time_eq_w(index, j));
740
0
    }
741
742
0
    ge_cached cached;
743
0
    ge_p1p1 r;
744
0
    x25519_ge_p3_to_cached(&cached, h);
745
0
    x25519_ge_add(&r, h, &cached);
746
0
    x25519_ge_p1p1_to_p3(h, &r);
747
748
0
    ge_madd(&r, h, &e);
749
0
    x25519_ge_p1p1_to_p3(h, &r);
750
0
  }
751
0
}
752
753
#if defined(OPENSSL_SMALL)
754
755
void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]) {
756
  x25519_ge_scalarmult_small_precomp(h, a, k25519SmallPrecomp);
757
}
758
759
#else
760
761
0
static void table_select(ge_precomp *t, const int pos, const signed char b) {
762
0
  uint8_t bnegative = constant_time_msb_w(b);
763
0
  uint8_t babs = b - ((bnegative & b) << 1);
764
765
0
  uint8_t t_bytes[3][32] = {
766
0
      {constant_time_is_zero_w(b) & 1}, {constant_time_is_zero_w(b) & 1}, {0}};
767
0
#if defined(__clang__) // materialize for vectorization, 6% speedup
768
0
  __asm__("" : "+m" (t_bytes) : /*no inputs*/);
769
0
#endif
770
0
  static_assert(sizeof(t_bytes) == sizeof(k25519Precomp[pos][0]), "");
771
0
  for (int i = 0; i < 8; i++) {
772
0
    constant_time_conditional_memxor(t_bytes, k25519Precomp[pos][i],
773
0
                                     sizeof(t_bytes),
774
0
                                     constant_time_eq_w(babs, 1 + i));
775
0
  }
776
777
0
  fe yplusx, yminusx, xy2d;
778
0
  fe_frombytes_strict(&yplusx, t_bytes[0]);
779
0
  fe_frombytes_strict(&yminusx, t_bytes[1]);
780
0
  fe_frombytes_strict(&xy2d, t_bytes[2]);
781
782
0
  fe_copy_lt(&t->yplusx, &yplusx);
783
0
  fe_copy_lt(&t->yminusx, &yminusx);
784
0
  fe_copy_lt(&t->xy2d, &xy2d);
785
786
0
  ge_precomp minust;
787
0
  fe_copy_lt(&minust.yplusx, &yminusx);
788
0
  fe_copy_lt(&minust.yminusx, &yplusx);
789
0
  fe_neg(&minust.xy2d, &xy2d);
790
0
  cmov(t, &minust, bnegative>>7);
791
0
}
792
793
// h = a * B
794
// where a = a[0]+256*a[1]+...+256^31 a[31]
795
// B is the Ed25519 base point (x,4/5) with x positive.
796
//
797
// Preconditions:
798
//   a[31] <= 127
799
114
void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]) {
800
114
#if defined(BORINGSSL_FE25519_ADX)
801
114
  if (CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable() &&
802
114
      CRYPTO_is_ADX_capable()) {
803
114
    uint8_t t[4][32];
804
114
    x25519_ge_scalarmult_base_adx(t, a);
805
114
    fiat_25519_from_bytes(h->X.v, t[0]);
806
114
    fiat_25519_from_bytes(h->Y.v, t[1]);
807
114
    fiat_25519_from_bytes(h->Z.v, t[2]);
808
114
    fiat_25519_from_bytes(h->T.v, t[3]);
809
114
    return;
810
114
  }
811
0
#endif
812
0
  signed char e[64];
813
0
  signed char carry;
814
0
  ge_p1p1 r;
815
0
  ge_p2 s;
816
0
  ge_precomp t;
817
0
  int i;
818
819
0
  for (i = 0; i < 32; ++i) {
820
0
    e[2 * i + 0] = (a[i] >> 0) & 15;
821
0
    e[2 * i + 1] = (a[i] >> 4) & 15;
822
0
  }
823
  // each e[i] is between 0 and 15
824
  // e[63] is between 0 and 7
825
826
0
  carry = 0;
827
0
  for (i = 0; i < 63; ++i) {
828
0
    e[i] += carry;
829
0
    carry = e[i] + 8;
830
0
    carry >>= 4;
831
0
    e[i] -= carry << 4;
832
0
  }
833
0
  e[63] += carry;
834
  // each e[i] is between -8 and 8
835
836
0
  ge_p3_0(h);
837
0
  for (i = 1; i < 64; i += 2) {
838
0
    table_select(&t, i / 2, e[i]);
839
0
    ge_madd(&r, h, &t);
840
0
    x25519_ge_p1p1_to_p3(h, &r);
841
0
  }
842
843
0
  ge_p3_dbl(&r, h);
844
0
  x25519_ge_p1p1_to_p2(&s, &r);
845
0
  ge_p2_dbl(&r, &s);
846
0
  x25519_ge_p1p1_to_p2(&s, &r);
847
0
  ge_p2_dbl(&r, &s);
848
0
  x25519_ge_p1p1_to_p2(&s, &r);
849
0
  ge_p2_dbl(&r, &s);
850
0
  x25519_ge_p1p1_to_p3(h, &r);
851
852
0
  for (i = 0; i < 64; i += 2) {
853
0
    table_select(&t, i / 2, e[i]);
854
0
    ge_madd(&r, h, &t);
855
0
    x25519_ge_p1p1_to_p3(h, &r);
856
0
  }
857
0
}
858
859
#endif
860
861
0
static void cmov_cached(ge_cached *t, ge_cached *u, uint8_t b) {
862
0
  fe_cmov(&t->YplusX, &u->YplusX, b);
863
0
  fe_cmov(&t->YminusX, &u->YminusX, b);
864
0
  fe_cmov(&t->Z, &u->Z, b);
865
0
  fe_cmov(&t->T2d, &u->T2d, b);
866
0
}
867
868
// r = scalar * A.
869
// where a = a[0]+256*a[1]+...+256^31 a[31].
870
0
void x25519_ge_scalarmult(ge_p2 *r, const uint8_t *scalar, const ge_p3 *A) {
871
0
  ge_p2 Ai_p2[8];
872
0
  ge_cached Ai[16];
873
0
  ge_p1p1 t;
874
875
0
  ge_cached_0(&Ai[0]);
876
0
  x25519_ge_p3_to_cached(&Ai[1], A);
877
0
  ge_p3_to_p2(&Ai_p2[1], A);
878
879
0
  unsigned i;
880
0
  for (i = 2; i < 16; i += 2) {
881
0
    ge_p2_dbl(&t, &Ai_p2[i / 2]);
882
0
    ge_p1p1_to_cached(&Ai[i], &t);
883
0
    if (i < 8) {
884
0
      x25519_ge_p1p1_to_p2(&Ai_p2[i], &t);
885
0
    }
886
0
    x25519_ge_add(&t, A, &Ai[i]);
887
0
    ge_p1p1_to_cached(&Ai[i + 1], &t);
888
0
    if (i < 7) {
889
0
      x25519_ge_p1p1_to_p2(&Ai_p2[i + 1], &t);
890
0
    }
891
0
  }
892
893
0
  ge_p2_0(r);
894
0
  ge_p3 u;
895
896
0
  for (i = 0; i < 256; i += 4) {
897
0
    ge_p2_dbl(&t, r);
898
0
    x25519_ge_p1p1_to_p2(r, &t);
899
0
    ge_p2_dbl(&t, r);
900
0
    x25519_ge_p1p1_to_p2(r, &t);
901
0
    ge_p2_dbl(&t, r);
902
0
    x25519_ge_p1p1_to_p2(r, &t);
903
0
    ge_p2_dbl(&t, r);
904
0
    x25519_ge_p1p1_to_p3(&u, &t);
905
906
0
    uint8_t index = scalar[31 - i/8];
907
0
    index >>= 4 - (i & 4);
908
0
    index &= 0xf;
909
910
0
    unsigned j;
911
0
    ge_cached selected;
912
0
    ge_cached_0(&selected);
913
0
    for (j = 0; j < 16; j++) {
914
0
      cmov_cached(&selected, &Ai[j], 1&constant_time_eq_w(index, j));
915
0
    }
916
917
0
    x25519_ge_add(&t, &u, &selected);
918
0
    x25519_ge_p1p1_to_p2(r, &t);
919
0
  }
920
0
}
921
922
0
static void slide(signed char *r, const uint8_t *a) {
923
0
  int i;
924
0
  int b;
925
0
  int k;
926
927
0
  for (i = 0; i < 256; ++i) {
928
0
    r[i] = 1 & (a[i >> 3] >> (i & 7));
929
0
  }
930
931
0
  for (i = 0; i < 256; ++i) {
932
0
    if (r[i]) {
933
0
      for (b = 1; b <= 6 && i + b < 256; ++b) {
934
0
        if (r[i + b]) {
935
0
          if (r[i] + (r[i + b] << b) <= 15) {
936
0
            r[i] += r[i + b] << b;
937
0
            r[i + b] = 0;
938
0
          } else if (r[i] - (r[i + b] << b) >= -15) {
939
0
            r[i] -= r[i + b] << b;
940
0
            for (k = i + b; k < 256; ++k) {
941
0
              if (!r[k]) {
942
0
                r[k] = 1;
943
0
                break;
944
0
              }
945
0
              r[k] = 0;
946
0
            }
947
0
          } else {
948
0
            break;
949
0
          }
950
0
        }
951
0
      }
952
0
    }
953
0
  }
954
0
}
955
956
// r = a * A + b * B
957
// where a = a[0]+256*a[1]+...+256^31 a[31].
958
// and b = b[0]+256*b[1]+...+256^31 b[31].
959
// B is the Ed25519 base point (x,4/5) with x positive.
960
static void ge_double_scalarmult_vartime(ge_p2 *r, const uint8_t *a,
961
0
                                         const ge_p3 *A, const uint8_t *b) {
962
0
  signed char aslide[256];
963
0
  signed char bslide[256];
964
0
  ge_cached Ai[8];  // A,3A,5A,7A,9A,11A,13A,15A
965
0
  ge_p1p1 t;
966
0
  ge_p3 u;
967
0
  ge_p3 A2;
968
0
  int i;
969
970
0
  slide(aslide, a);
971
0
  slide(bslide, b);
972
973
0
  x25519_ge_p3_to_cached(&Ai[0], A);
974
0
  ge_p3_dbl(&t, A);
975
0
  x25519_ge_p1p1_to_p3(&A2, &t);
976
0
  x25519_ge_add(&t, &A2, &Ai[0]);
977
0
  x25519_ge_p1p1_to_p3(&u, &t);
978
0
  x25519_ge_p3_to_cached(&Ai[1], &u);
979
0
  x25519_ge_add(&t, &A2, &Ai[1]);
980
0
  x25519_ge_p1p1_to_p3(&u, &t);
981
0
  x25519_ge_p3_to_cached(&Ai[2], &u);
982
0
  x25519_ge_add(&t, &A2, &Ai[2]);
983
0
  x25519_ge_p1p1_to_p3(&u, &t);
984
0
  x25519_ge_p3_to_cached(&Ai[3], &u);
985
0
  x25519_ge_add(&t, &A2, &Ai[3]);
986
0
  x25519_ge_p1p1_to_p3(&u, &t);
987
0
  x25519_ge_p3_to_cached(&Ai[4], &u);
988
0
  x25519_ge_add(&t, &A2, &Ai[4]);
989
0
  x25519_ge_p1p1_to_p3(&u, &t);
990
0
  x25519_ge_p3_to_cached(&Ai[5], &u);
991
0
  x25519_ge_add(&t, &A2, &Ai[5]);
992
0
  x25519_ge_p1p1_to_p3(&u, &t);
993
0
  x25519_ge_p3_to_cached(&Ai[6], &u);
994
0
  x25519_ge_add(&t, &A2, &Ai[6]);
995
0
  x25519_ge_p1p1_to_p3(&u, &t);
996
0
  x25519_ge_p3_to_cached(&Ai[7], &u);
997
998
0
  ge_p2_0(r);
999
1000
0
  for (i = 255; i >= 0; --i) {
1001
0
    if (aslide[i] || bslide[i]) {
1002
0
      break;
1003
0
    }
1004
0
  }
1005
1006
0
  for (; i >= 0; --i) {
1007
0
    ge_p2_dbl(&t, r);
1008
1009
0
    if (aslide[i] > 0) {
1010
0
      x25519_ge_p1p1_to_p3(&u, &t);
1011
0
      x25519_ge_add(&t, &u, &Ai[aslide[i] / 2]);
1012
0
    } else if (aslide[i] < 0) {
1013
0
      x25519_ge_p1p1_to_p3(&u, &t);
1014
0
      x25519_ge_sub(&t, &u, &Ai[(-aslide[i]) / 2]);
1015
0
    }
1016
1017
0
    if (bslide[i] > 0) {
1018
0
      x25519_ge_p1p1_to_p3(&u, &t);
1019
0
      ge_madd(&t, &u, &Bi[bslide[i] / 2]);
1020
0
    } else if (bslide[i] < 0) {
1021
0
      x25519_ge_p1p1_to_p3(&u, &t);
1022
0
      ge_msub(&t, &u, &Bi[(-bslide[i]) / 2]);
1023
0
    }
1024
1025
0
    x25519_ge_p1p1_to_p2(r, &t);
1026
0
  }
1027
0
}
1028
1029
// int64_lshift21 returns |a << 21| but is defined when shifting bits into the
1030
// sign bit. This works around a language flaw in C.
1031
0
static inline int64_t int64_lshift21(int64_t a) {
1032
0
  return (int64_t)((uint64_t)a << 21);
1033
0
}
1034
1035
// The set of scalars is \Z/l
1036
// where l = 2^252 + 27742317777372353535851937790883648493.
1037
1038
// Input:
1039
//   s[0]+256*s[1]+...+256^63*s[63] = s
1040
//
1041
// Output:
1042
//   s[0]+256*s[1]+...+256^31*s[31] = s mod l
1043
//   where l = 2^252 + 27742317777372353535851937790883648493.
1044
//   Overwrites s in place.
1045
0
void x25519_sc_reduce(uint8_t s[64]) {
1046
0
  int64_t s0 = 2097151 & load_3(s);
1047
0
  int64_t s1 = 2097151 & (load_4(s + 2) >> 5);
1048
0
  int64_t s2 = 2097151 & (load_3(s + 5) >> 2);
1049
0
  int64_t s3 = 2097151 & (load_4(s + 7) >> 7);
1050
0
  int64_t s4 = 2097151 & (load_4(s + 10) >> 4);
1051
0
  int64_t s5 = 2097151 & (load_3(s + 13) >> 1);
1052
0
  int64_t s6 = 2097151 & (load_4(s + 15) >> 6);
1053
0
  int64_t s7 = 2097151 & (load_3(s + 18) >> 3);
1054
0
  int64_t s8 = 2097151 & load_3(s + 21);
1055
0
  int64_t s9 = 2097151 & (load_4(s + 23) >> 5);
1056
0
  int64_t s10 = 2097151 & (load_3(s + 26) >> 2);
1057
0
  int64_t s11 = 2097151 & (load_4(s + 28) >> 7);
1058
0
  int64_t s12 = 2097151 & (load_4(s + 31) >> 4);
1059
0
  int64_t s13 = 2097151 & (load_3(s + 34) >> 1);
1060
0
  int64_t s14 = 2097151 & (load_4(s + 36) >> 6);
1061
0
  int64_t s15 = 2097151 & (load_3(s + 39) >> 3);
1062
0
  int64_t s16 = 2097151 & load_3(s + 42);
1063
0
  int64_t s17 = 2097151 & (load_4(s + 44) >> 5);
1064
0
  int64_t s18 = 2097151 & (load_3(s + 47) >> 2);
1065
0
  int64_t s19 = 2097151 & (load_4(s + 49) >> 7);
1066
0
  int64_t s20 = 2097151 & (load_4(s + 52) >> 4);
1067
0
  int64_t s21 = 2097151 & (load_3(s + 55) >> 1);
1068
0
  int64_t s22 = 2097151 & (load_4(s + 57) >> 6);
1069
0
  int64_t s23 = (load_4(s + 60) >> 3);
1070
0
  int64_t carry0;
1071
0
  int64_t carry1;
1072
0
  int64_t carry2;
1073
0
  int64_t carry3;
1074
0
  int64_t carry4;
1075
0
  int64_t carry5;
1076
0
  int64_t carry6;
1077
0
  int64_t carry7;
1078
0
  int64_t carry8;
1079
0
  int64_t carry9;
1080
0
  int64_t carry10;
1081
0
  int64_t carry11;
1082
0
  int64_t carry12;
1083
0
  int64_t carry13;
1084
0
  int64_t carry14;
1085
0
  int64_t carry15;
1086
0
  int64_t carry16;
1087
1088
0
  s11 += s23 * 666643;
1089
0
  s12 += s23 * 470296;
1090
0
  s13 += s23 * 654183;
1091
0
  s14 -= s23 * 997805;
1092
0
  s15 += s23 * 136657;
1093
0
  s16 -= s23 * 683901;
1094
0
  s23 = 0;
1095
1096
0
  s10 += s22 * 666643;
1097
0
  s11 += s22 * 470296;
1098
0
  s12 += s22 * 654183;
1099
0
  s13 -= s22 * 997805;
1100
0
  s14 += s22 * 136657;
1101
0
  s15 -= s22 * 683901;
1102
0
  s22 = 0;
1103
1104
0
  s9 += s21 * 666643;
1105
0
  s10 += s21 * 470296;
1106
0
  s11 += s21 * 654183;
1107
0
  s12 -= s21 * 997805;
1108
0
  s13 += s21 * 136657;
1109
0
  s14 -= s21 * 683901;
1110
0
  s21 = 0;
1111
1112
0
  s8 += s20 * 666643;
1113
0
  s9 += s20 * 470296;
1114
0
  s10 += s20 * 654183;
1115
0
  s11 -= s20 * 997805;
1116
0
  s12 += s20 * 136657;
1117
0
  s13 -= s20 * 683901;
1118
0
  s20 = 0;
1119
1120
0
  s7 += s19 * 666643;
1121
0
  s8 += s19 * 470296;
1122
0
  s9 += s19 * 654183;
1123
0
  s10 -= s19 * 997805;
1124
0
  s11 += s19 * 136657;
1125
0
  s12 -= s19 * 683901;
1126
0
  s19 = 0;
1127
1128
0
  s6 += s18 * 666643;
1129
0
  s7 += s18 * 470296;
1130
0
  s8 += s18 * 654183;
1131
0
  s9 -= s18 * 997805;
1132
0
  s10 += s18 * 136657;
1133
0
  s11 -= s18 * 683901;
1134
0
  s18 = 0;
1135
1136
0
  carry6 = (s6 + (1 << 20)) >> 21;
1137
0
  s7 += carry6;
1138
0
  s6 -= int64_lshift21(carry6);
1139
0
  carry8 = (s8 + (1 << 20)) >> 21;
1140
0
  s9 += carry8;
1141
0
  s8 -= int64_lshift21(carry8);
1142
0
  carry10 = (s10 + (1 << 20)) >> 21;
1143
0
  s11 += carry10;
1144
0
  s10 -= int64_lshift21(carry10);
1145
0
  carry12 = (s12 + (1 << 20)) >> 21;
1146
0
  s13 += carry12;
1147
0
  s12 -= int64_lshift21(carry12);
1148
0
  carry14 = (s14 + (1 << 20)) >> 21;
1149
0
  s15 += carry14;
1150
0
  s14 -= int64_lshift21(carry14);
1151
0
  carry16 = (s16 + (1 << 20)) >> 21;
1152
0
  s17 += carry16;
1153
0
  s16 -= int64_lshift21(carry16);
1154
1155
0
  carry7 = (s7 + (1 << 20)) >> 21;
1156
0
  s8 += carry7;
1157
0
  s7 -= int64_lshift21(carry7);
1158
0
  carry9 = (s9 + (1 << 20)) >> 21;
1159
0
  s10 += carry9;
1160
0
  s9 -= int64_lshift21(carry9);
1161
0
  carry11 = (s11 + (1 << 20)) >> 21;
1162
0
  s12 += carry11;
1163
0
  s11 -= int64_lshift21(carry11);
1164
0
  carry13 = (s13 + (1 << 20)) >> 21;
1165
0
  s14 += carry13;
1166
0
  s13 -= int64_lshift21(carry13);
1167
0
  carry15 = (s15 + (1 << 20)) >> 21;
1168
0
  s16 += carry15;
1169
0
  s15 -= int64_lshift21(carry15);
1170
1171
0
  s5 += s17 * 666643;
1172
0
  s6 += s17 * 470296;
1173
0
  s7 += s17 * 654183;
1174
0
  s8 -= s17 * 997805;
1175
0
  s9 += s17 * 136657;
1176
0
  s10 -= s17 * 683901;
1177
0
  s17 = 0;
1178
1179
0
  s4 += s16 * 666643;
1180
0
  s5 += s16 * 470296;
1181
0
  s6 += s16 * 654183;
1182
0
  s7 -= s16 * 997805;
1183
0
  s8 += s16 * 136657;
1184
0
  s9 -= s16 * 683901;
1185
0
  s16 = 0;
1186
1187
0
  s3 += s15 * 666643;
1188
0
  s4 += s15 * 470296;
1189
0
  s5 += s15 * 654183;
1190
0
  s6 -= s15 * 997805;
1191
0
  s7 += s15 * 136657;
1192
0
  s8 -= s15 * 683901;
1193
0
  s15 = 0;
1194
1195
0
  s2 += s14 * 666643;
1196
0
  s3 += s14 * 470296;
1197
0
  s4 += s14 * 654183;
1198
0
  s5 -= s14 * 997805;
1199
0
  s6 += s14 * 136657;
1200
0
  s7 -= s14 * 683901;
1201
0
  s14 = 0;
1202
1203
0
  s1 += s13 * 666643;
1204
0
  s2 += s13 * 470296;
1205
0
  s3 += s13 * 654183;
1206
0
  s4 -= s13 * 997805;
1207
0
  s5 += s13 * 136657;
1208
0
  s6 -= s13 * 683901;
1209
0
  s13 = 0;
1210
1211
0
  s0 += s12 * 666643;
1212
0
  s1 += s12 * 470296;
1213
0
  s2 += s12 * 654183;
1214
0
  s3 -= s12 * 997805;
1215
0
  s4 += s12 * 136657;
1216
0
  s5 -= s12 * 683901;
1217
0
  s12 = 0;
1218
1219
0
  carry0 = (s0 + (1 << 20)) >> 21;
1220
0
  s1 += carry0;
1221
0
  s0 -= int64_lshift21(carry0);
1222
0
  carry2 = (s2 + (1 << 20)) >> 21;
1223
0
  s3 += carry2;
1224
0
  s2 -= int64_lshift21(carry2);
1225
0
  carry4 = (s4 + (1 << 20)) >> 21;
1226
0
  s5 += carry4;
1227
0
  s4 -= int64_lshift21(carry4);
1228
0
  carry6 = (s6 + (1 << 20)) >> 21;
1229
0
  s7 += carry6;
1230
0
  s6 -= int64_lshift21(carry6);
1231
0
  carry8 = (s8 + (1 << 20)) >> 21;
1232
0
  s9 += carry8;
1233
0
  s8 -= int64_lshift21(carry8);
1234
0
  carry10 = (s10 + (1 << 20)) >> 21;
1235
0
  s11 += carry10;
1236
0
  s10 -= int64_lshift21(carry10);
1237
1238
0
  carry1 = (s1 + (1 << 20)) >> 21;
1239
0
  s2 += carry1;
1240
0
  s1 -= int64_lshift21(carry1);
1241
0
  carry3 = (s3 + (1 << 20)) >> 21;
1242
0
  s4 += carry3;
1243
0
  s3 -= int64_lshift21(carry3);
1244
0
  carry5 = (s5 + (1 << 20)) >> 21;
1245
0
  s6 += carry5;
1246
0
  s5 -= int64_lshift21(carry5);
1247
0
  carry7 = (s7 + (1 << 20)) >> 21;
1248
0
  s8 += carry7;
1249
0
  s7 -= int64_lshift21(carry7);
1250
0
  carry9 = (s9 + (1 << 20)) >> 21;
1251
0
  s10 += carry9;
1252
0
  s9 -= int64_lshift21(carry9);
1253
0
  carry11 = (s11 + (1 << 20)) >> 21;
1254
0
  s12 += carry11;
1255
0
  s11 -= int64_lshift21(carry11);
1256
1257
0
  s0 += s12 * 666643;
1258
0
  s1 += s12 * 470296;
1259
0
  s2 += s12 * 654183;
1260
0
  s3 -= s12 * 997805;
1261
0
  s4 += s12 * 136657;
1262
0
  s5 -= s12 * 683901;
1263
0
  s12 = 0;
1264
1265
0
  carry0 = s0 >> 21;
1266
0
  s1 += carry0;
1267
0
  s0 -= int64_lshift21(carry0);
1268
0
  carry1 = s1 >> 21;
1269
0
  s2 += carry1;
1270
0
  s1 -= int64_lshift21(carry1);
1271
0
  carry2 = s2 >> 21;
1272
0
  s3 += carry2;
1273
0
  s2 -= int64_lshift21(carry2);
1274
0
  carry3 = s3 >> 21;
1275
0
  s4 += carry3;
1276
0
  s3 -= int64_lshift21(carry3);
1277
0
  carry4 = s4 >> 21;
1278
0
  s5 += carry4;
1279
0
  s4 -= int64_lshift21(carry4);
1280
0
  carry5 = s5 >> 21;
1281
0
  s6 += carry5;
1282
0
  s5 -= int64_lshift21(carry5);
1283
0
  carry6 = s6 >> 21;
1284
0
  s7 += carry6;
1285
0
  s6 -= int64_lshift21(carry6);
1286
0
  carry7 = s7 >> 21;
1287
0
  s8 += carry7;
1288
0
  s7 -= int64_lshift21(carry7);
1289
0
  carry8 = s8 >> 21;
1290
0
  s9 += carry8;
1291
0
  s8 -= int64_lshift21(carry8);
1292
0
  carry9 = s9 >> 21;
1293
0
  s10 += carry9;
1294
0
  s9 -= int64_lshift21(carry9);
1295
0
  carry10 = s10 >> 21;
1296
0
  s11 += carry10;
1297
0
  s10 -= int64_lshift21(carry10);
1298
0
  carry11 = s11 >> 21;
1299
0
  s12 += carry11;
1300
0
  s11 -= int64_lshift21(carry11);
1301
1302
0
  s0 += s12 * 666643;
1303
0
  s1 += s12 * 470296;
1304
0
  s2 += s12 * 654183;
1305
0
  s3 -= s12 * 997805;
1306
0
  s4 += s12 * 136657;
1307
0
  s5 -= s12 * 683901;
1308
0
  s12 = 0;
1309
1310
0
  carry0 = s0 >> 21;
1311
0
  s1 += carry0;
1312
0
  s0 -= int64_lshift21(carry0);
1313
0
  carry1 = s1 >> 21;
1314
0
  s2 += carry1;
1315
0
  s1 -= int64_lshift21(carry1);
1316
0
  carry2 = s2 >> 21;
1317
0
  s3 += carry2;
1318
0
  s2 -= int64_lshift21(carry2);
1319
0
  carry3 = s3 >> 21;
1320
0
  s4 += carry3;
1321
0
  s3 -= int64_lshift21(carry3);
1322
0
  carry4 = s4 >> 21;
1323
0
  s5 += carry4;
1324
0
  s4 -= int64_lshift21(carry4);
1325
0
  carry5 = s5 >> 21;
1326
0
  s6 += carry5;
1327
0
  s5 -= int64_lshift21(carry5);
1328
0
  carry6 = s6 >> 21;
1329
0
  s7 += carry6;
1330
0
  s6 -= int64_lshift21(carry6);
1331
0
  carry7 = s7 >> 21;
1332
0
  s8 += carry7;
1333
0
  s7 -= int64_lshift21(carry7);
1334
0
  carry8 = s8 >> 21;
1335
0
  s9 += carry8;
1336
0
  s8 -= int64_lshift21(carry8);
1337
0
  carry9 = s9 >> 21;
1338
0
  s10 += carry9;
1339
0
  s9 -= int64_lshift21(carry9);
1340
0
  carry10 = s10 >> 21;
1341
0
  s11 += carry10;
1342
0
  s10 -= int64_lshift21(carry10);
1343
1344
0
  s[0] = s0 >> 0;
1345
0
  s[1] = s0 >> 8;
1346
0
  s[2] = (s0 >> 16) | (s1 << 5);
1347
0
  s[3] = s1 >> 3;
1348
0
  s[4] = s1 >> 11;
1349
0
  s[5] = (s1 >> 19) | (s2 << 2);
1350
0
  s[6] = s2 >> 6;
1351
0
  s[7] = (s2 >> 14) | (s3 << 7);
1352
0
  s[8] = s3 >> 1;
1353
0
  s[9] = s3 >> 9;
1354
0
  s[10] = (s3 >> 17) | (s4 << 4);
1355
0
  s[11] = s4 >> 4;
1356
0
  s[12] = s4 >> 12;
1357
0
  s[13] = (s4 >> 20) | (s5 << 1);
1358
0
  s[14] = s5 >> 7;
1359
0
  s[15] = (s5 >> 15) | (s6 << 6);
1360
0
  s[16] = s6 >> 2;
1361
0
  s[17] = s6 >> 10;
1362
0
  s[18] = (s6 >> 18) | (s7 << 3);
1363
0
  s[19] = s7 >> 5;
1364
0
  s[20] = s7 >> 13;
1365
0
  s[21] = s8 >> 0;
1366
0
  s[22] = s8 >> 8;
1367
0
  s[23] = (s8 >> 16) | (s9 << 5);
1368
0
  s[24] = s9 >> 3;
1369
0
  s[25] = s9 >> 11;
1370
0
  s[26] = (s9 >> 19) | (s10 << 2);
1371
0
  s[27] = s10 >> 6;
1372
0
  s[28] = (s10 >> 14) | (s11 << 7);
1373
0
  s[29] = s11 >> 1;
1374
0
  s[30] = s11 >> 9;
1375
0
  s[31] = s11 >> 17;
1376
0
}
1377
1378
// Input:
1379
//   a[0]+256*a[1]+...+256^31*a[31] = a
1380
//   b[0]+256*b[1]+...+256^31*b[31] = b
1381
//   c[0]+256*c[1]+...+256^31*c[31] = c
1382
//
1383
// Output:
1384
//   s[0]+256*s[1]+...+256^31*s[31] = (ab+c) mod l
1385
//   where l = 2^252 + 27742317777372353535851937790883648493.
1386
static void sc_muladd(uint8_t *s, const uint8_t *a, const uint8_t *b,
1387
0
                      const uint8_t *c) {
1388
0
  int64_t a0 = 2097151 & load_3(a);
1389
0
  int64_t a1 = 2097151 & (load_4(a + 2) >> 5);
1390
0
  int64_t a2 = 2097151 & (load_3(a + 5) >> 2);
1391
0
  int64_t a3 = 2097151 & (load_4(a + 7) >> 7);
1392
0
  int64_t a4 = 2097151 & (load_4(a + 10) >> 4);
1393
0
  int64_t a5 = 2097151 & (load_3(a + 13) >> 1);
1394
0
  int64_t a6 = 2097151 & (load_4(a + 15) >> 6);
1395
0
  int64_t a7 = 2097151 & (load_3(a + 18) >> 3);
1396
0
  int64_t a8 = 2097151 & load_3(a + 21);
1397
0
  int64_t a9 = 2097151 & (load_4(a + 23) >> 5);
1398
0
  int64_t a10 = 2097151 & (load_3(a + 26) >> 2);
1399
0
  int64_t a11 = (load_4(a + 28) >> 7);
1400
0
  int64_t b0 = 2097151 & load_3(b);
1401
0
  int64_t b1 = 2097151 & (load_4(b + 2) >> 5);
1402
0
  int64_t b2 = 2097151 & (load_3(b + 5) >> 2);
1403
0
  int64_t b3 = 2097151 & (load_4(b + 7) >> 7);
1404
0
  int64_t b4 = 2097151 & (load_4(b + 10) >> 4);
1405
0
  int64_t b5 = 2097151 & (load_3(b + 13) >> 1);
1406
0
  int64_t b6 = 2097151 & (load_4(b + 15) >> 6);
1407
0
  int64_t b7 = 2097151 & (load_3(b + 18) >> 3);
1408
0
  int64_t b8 = 2097151 & load_3(b + 21);
1409
0
  int64_t b9 = 2097151 & (load_4(b + 23) >> 5);
1410
0
  int64_t b10 = 2097151 & (load_3(b + 26) >> 2);
1411
0
  int64_t b11 = (load_4(b + 28) >> 7);
1412
0
  int64_t c0 = 2097151 & load_3(c);
1413
0
  int64_t c1 = 2097151 & (load_4(c + 2) >> 5);
1414
0
  int64_t c2 = 2097151 & (load_3(c + 5) >> 2);
1415
0
  int64_t c3 = 2097151 & (load_4(c + 7) >> 7);
1416
0
  int64_t c4 = 2097151 & (load_4(c + 10) >> 4);
1417
0
  int64_t c5 = 2097151 & (load_3(c + 13) >> 1);
1418
0
  int64_t c6 = 2097151 & (load_4(c + 15) >> 6);
1419
0
  int64_t c7 = 2097151 & (load_3(c + 18) >> 3);
1420
0
  int64_t c8 = 2097151 & load_3(c + 21);
1421
0
  int64_t c9 = 2097151 & (load_4(c + 23) >> 5);
1422
0
  int64_t c10 = 2097151 & (load_3(c + 26) >> 2);
1423
0
  int64_t c11 = (load_4(c + 28) >> 7);
1424
0
  int64_t s0;
1425
0
  int64_t s1;
1426
0
  int64_t s2;
1427
0
  int64_t s3;
1428
0
  int64_t s4;
1429
0
  int64_t s5;
1430
0
  int64_t s6;
1431
0
  int64_t s7;
1432
0
  int64_t s8;
1433
0
  int64_t s9;
1434
0
  int64_t s10;
1435
0
  int64_t s11;
1436
0
  int64_t s12;
1437
0
  int64_t s13;
1438
0
  int64_t s14;
1439
0
  int64_t s15;
1440
0
  int64_t s16;
1441
0
  int64_t s17;
1442
0
  int64_t s18;
1443
0
  int64_t s19;
1444
0
  int64_t s20;
1445
0
  int64_t s21;
1446
0
  int64_t s22;
1447
0
  int64_t s23;
1448
0
  int64_t carry0;
1449
0
  int64_t carry1;
1450
0
  int64_t carry2;
1451
0
  int64_t carry3;
1452
0
  int64_t carry4;
1453
0
  int64_t carry5;
1454
0
  int64_t carry6;
1455
0
  int64_t carry7;
1456
0
  int64_t carry8;
1457
0
  int64_t carry9;
1458
0
  int64_t carry10;
1459
0
  int64_t carry11;
1460
0
  int64_t carry12;
1461
0
  int64_t carry13;
1462
0
  int64_t carry14;
1463
0
  int64_t carry15;
1464
0
  int64_t carry16;
1465
0
  int64_t carry17;
1466
0
  int64_t carry18;
1467
0
  int64_t carry19;
1468
0
  int64_t carry20;
1469
0
  int64_t carry21;
1470
0
  int64_t carry22;
1471
1472
0
  s0 = c0 + a0 * b0;
1473
0
  s1 = c1 + a0 * b1 + a1 * b0;
1474
0
  s2 = c2 + a0 * b2 + a1 * b1 + a2 * b0;
1475
0
  s3 = c3 + a0 * b3 + a1 * b2 + a2 * b1 + a3 * b0;
1476
0
  s4 = c4 + a0 * b4 + a1 * b3 + a2 * b2 + a3 * b1 + a4 * b0;
1477
0
  s5 = c5 + a0 * b5 + a1 * b4 + a2 * b3 + a3 * b2 + a4 * b1 + a5 * b0;
1478
0
  s6 = c6 + a0 * b6 + a1 * b5 + a2 * b4 + a3 * b3 + a4 * b2 + a5 * b1 + a6 * b0;
1479
0
  s7 = c7 + a0 * b7 + a1 * b6 + a2 * b5 + a3 * b4 + a4 * b3 + a5 * b2 +
1480
0
       a6 * b1 + a7 * b0;
1481
0
  s8 = c8 + a0 * b8 + a1 * b7 + a2 * b6 + a3 * b5 + a4 * b4 + a5 * b3 +
1482
0
       a6 * b2 + a7 * b1 + a8 * b0;
1483
0
  s9 = c9 + a0 * b9 + a1 * b8 + a2 * b7 + a3 * b6 + a4 * b5 + a5 * b4 +
1484
0
       a6 * b3 + a7 * b2 + a8 * b1 + a9 * b0;
1485
0
  s10 = c10 + a0 * b10 + a1 * b9 + a2 * b8 + a3 * b7 + a4 * b6 + a5 * b5 +
1486
0
        a6 * b4 + a7 * b3 + a8 * b2 + a9 * b1 + a10 * b0;
1487
0
  s11 = c11 + a0 * b11 + a1 * b10 + a2 * b9 + a3 * b8 + a4 * b7 + a5 * b6 +
1488
0
        a6 * b5 + a7 * b4 + a8 * b3 + a9 * b2 + a10 * b1 + a11 * b0;
1489
0
  s12 = a1 * b11 + a2 * b10 + a3 * b9 + a4 * b8 + a5 * b7 + a6 * b6 + a7 * b5 +
1490
0
        a8 * b4 + a9 * b3 + a10 * b2 + a11 * b1;
1491
0
  s13 = a2 * b11 + a3 * b10 + a4 * b9 + a5 * b8 + a6 * b7 + a7 * b6 + a8 * b5 +
1492
0
        a9 * b4 + a10 * b3 + a11 * b2;
1493
0
  s14 = a3 * b11 + a4 * b10 + a5 * b9 + a6 * b8 + a7 * b7 + a8 * b6 + a9 * b5 +
1494
0
        a10 * b4 + a11 * b3;
1495
0
  s15 = a4 * b11 + a5 * b10 + a6 * b9 + a7 * b8 + a8 * b7 + a9 * b6 + a10 * b5 +
1496
0
        a11 * b4;
1497
0
  s16 = a5 * b11 + a6 * b10 + a7 * b9 + a8 * b8 + a9 * b7 + a10 * b6 + a11 * b5;
1498
0
  s17 = a6 * b11 + a7 * b10 + a8 * b9 + a9 * b8 + a10 * b7 + a11 * b6;
1499
0
  s18 = a7 * b11 + a8 * b10 + a9 * b9 + a10 * b8 + a11 * b7;
1500
0
  s19 = a8 * b11 + a9 * b10 + a10 * b9 + a11 * b8;
1501
0
  s20 = a9 * b11 + a10 * b10 + a11 * b9;
1502
0
  s21 = a10 * b11 + a11 * b10;
1503
0
  s22 = a11 * b11;
1504
0
  s23 = 0;
1505
1506
0
  carry0 = (s0 + (1 << 20)) >> 21;
1507
0
  s1 += carry0;
1508
0
  s0 -= int64_lshift21(carry0);
1509
0
  carry2 = (s2 + (1 << 20)) >> 21;
1510
0
  s3 += carry2;
1511
0
  s2 -= int64_lshift21(carry2);
1512
0
  carry4 = (s4 + (1 << 20)) >> 21;
1513
0
  s5 += carry4;
1514
0
  s4 -= int64_lshift21(carry4);
1515
0
  carry6 = (s6 + (1 << 20)) >> 21;
1516
0
  s7 += carry6;
1517
0
  s6 -= int64_lshift21(carry6);
1518
0
  carry8 = (s8 + (1 << 20)) >> 21;
1519
0
  s9 += carry8;
1520
0
  s8 -= int64_lshift21(carry8);
1521
0
  carry10 = (s10 + (1 << 20)) >> 21;
1522
0
  s11 += carry10;
1523
0
  s10 -= int64_lshift21(carry10);
1524
0
  carry12 = (s12 + (1 << 20)) >> 21;
1525
0
  s13 += carry12;
1526
0
  s12 -= int64_lshift21(carry12);
1527
0
  carry14 = (s14 + (1 << 20)) >> 21;
1528
0
  s15 += carry14;
1529
0
  s14 -= int64_lshift21(carry14);
1530
0
  carry16 = (s16 + (1 << 20)) >> 21;
1531
0
  s17 += carry16;
1532
0
  s16 -= int64_lshift21(carry16);
1533
0
  carry18 = (s18 + (1 << 20)) >> 21;
1534
0
  s19 += carry18;
1535
0
  s18 -= int64_lshift21(carry18);
1536
0
  carry20 = (s20 + (1 << 20)) >> 21;
1537
0
  s21 += carry20;
1538
0
  s20 -= int64_lshift21(carry20);
1539
0
  carry22 = (s22 + (1 << 20)) >> 21;
1540
0
  s23 += carry22;
1541
0
  s22 -= int64_lshift21(carry22);
1542
1543
0
  carry1 = (s1 + (1 << 20)) >> 21;
1544
0
  s2 += carry1;
1545
0
  s1 -= int64_lshift21(carry1);
1546
0
  carry3 = (s3 + (1 << 20)) >> 21;
1547
0
  s4 += carry3;
1548
0
  s3 -= int64_lshift21(carry3);
1549
0
  carry5 = (s5 + (1 << 20)) >> 21;
1550
0
  s6 += carry5;
1551
0
  s5 -= int64_lshift21(carry5);
1552
0
  carry7 = (s7 + (1 << 20)) >> 21;
1553
0
  s8 += carry7;
1554
0
  s7 -= int64_lshift21(carry7);
1555
0
  carry9 = (s9 + (1 << 20)) >> 21;
1556
0
  s10 += carry9;
1557
0
  s9 -= int64_lshift21(carry9);
1558
0
  carry11 = (s11 + (1 << 20)) >> 21;
1559
0
  s12 += carry11;
1560
0
  s11 -= int64_lshift21(carry11);
1561
0
  carry13 = (s13 + (1 << 20)) >> 21;
1562
0
  s14 += carry13;
1563
0
  s13 -= int64_lshift21(carry13);
1564
0
  carry15 = (s15 + (1 << 20)) >> 21;
1565
0
  s16 += carry15;
1566
0
  s15 -= int64_lshift21(carry15);
1567
0
  carry17 = (s17 + (1 << 20)) >> 21;
1568
0
  s18 += carry17;
1569
0
  s17 -= int64_lshift21(carry17);
1570
0
  carry19 = (s19 + (1 << 20)) >> 21;
1571
0
  s20 += carry19;
1572
0
  s19 -= int64_lshift21(carry19);
1573
0
  carry21 = (s21 + (1 << 20)) >> 21;
1574
0
  s22 += carry21;
1575
0
  s21 -= int64_lshift21(carry21);
1576
1577
0
  s11 += s23 * 666643;
1578
0
  s12 += s23 * 470296;
1579
0
  s13 += s23 * 654183;
1580
0
  s14 -= s23 * 997805;
1581
0
  s15 += s23 * 136657;
1582
0
  s16 -= s23 * 683901;
1583
0
  s23 = 0;
1584
1585
0
  s10 += s22 * 666643;
1586
0
  s11 += s22 * 470296;
1587
0
  s12 += s22 * 654183;
1588
0
  s13 -= s22 * 997805;
1589
0
  s14 += s22 * 136657;
1590
0
  s15 -= s22 * 683901;
1591
0
  s22 = 0;
1592
1593
0
  s9 += s21 * 666643;
1594
0
  s10 += s21 * 470296;
1595
0
  s11 += s21 * 654183;
1596
0
  s12 -= s21 * 997805;
1597
0
  s13 += s21 * 136657;
1598
0
  s14 -= s21 * 683901;
1599
0
  s21 = 0;
1600
1601
0
  s8 += s20 * 666643;
1602
0
  s9 += s20 * 470296;
1603
0
  s10 += s20 * 654183;
1604
0
  s11 -= s20 * 997805;
1605
0
  s12 += s20 * 136657;
1606
0
  s13 -= s20 * 683901;
1607
0
  s20 = 0;
1608
1609
0
  s7 += s19 * 666643;
1610
0
  s8 += s19 * 470296;
1611
0
  s9 += s19 * 654183;
1612
0
  s10 -= s19 * 997805;
1613
0
  s11 += s19 * 136657;
1614
0
  s12 -= s19 * 683901;
1615
0
  s19 = 0;
1616
1617
0
  s6 += s18 * 666643;
1618
0
  s7 += s18 * 470296;
1619
0
  s8 += s18 * 654183;
1620
0
  s9 -= s18 * 997805;
1621
0
  s10 += s18 * 136657;
1622
0
  s11 -= s18 * 683901;
1623
0
  s18 = 0;
1624
1625
0
  carry6 = (s6 + (1 << 20)) >> 21;
1626
0
  s7 += carry6;
1627
0
  s6 -= int64_lshift21(carry6);
1628
0
  carry8 = (s8 + (1 << 20)) >> 21;
1629
0
  s9 += carry8;
1630
0
  s8 -= int64_lshift21(carry8);
1631
0
  carry10 = (s10 + (1 << 20)) >> 21;
1632
0
  s11 += carry10;
1633
0
  s10 -= int64_lshift21(carry10);
1634
0
  carry12 = (s12 + (1 << 20)) >> 21;
1635
0
  s13 += carry12;
1636
0
  s12 -= int64_lshift21(carry12);
1637
0
  carry14 = (s14 + (1 << 20)) >> 21;
1638
0
  s15 += carry14;
1639
0
  s14 -= int64_lshift21(carry14);
1640
0
  carry16 = (s16 + (1 << 20)) >> 21;
1641
0
  s17 += carry16;
1642
0
  s16 -= int64_lshift21(carry16);
1643
1644
0
  carry7 = (s7 + (1 << 20)) >> 21;
1645
0
  s8 += carry7;
1646
0
  s7 -= int64_lshift21(carry7);
1647
0
  carry9 = (s9 + (1 << 20)) >> 21;
1648
0
  s10 += carry9;
1649
0
  s9 -= int64_lshift21(carry9);
1650
0
  carry11 = (s11 + (1 << 20)) >> 21;
1651
0
  s12 += carry11;
1652
0
  s11 -= int64_lshift21(carry11);
1653
0
  carry13 = (s13 + (1 << 20)) >> 21;
1654
0
  s14 += carry13;
1655
0
  s13 -= int64_lshift21(carry13);
1656
0
  carry15 = (s15 + (1 << 20)) >> 21;
1657
0
  s16 += carry15;
1658
0
  s15 -= int64_lshift21(carry15);
1659
1660
0
  s5 += s17 * 666643;
1661
0
  s6 += s17 * 470296;
1662
0
  s7 += s17 * 654183;
1663
0
  s8 -= s17 * 997805;
1664
0
  s9 += s17 * 136657;
1665
0
  s10 -= s17 * 683901;
1666
0
  s17 = 0;
1667
1668
0
  s4 += s16 * 666643;
1669
0
  s5 += s16 * 470296;
1670
0
  s6 += s16 * 654183;
1671
0
  s7 -= s16 * 997805;
1672
0
  s8 += s16 * 136657;
1673
0
  s9 -= s16 * 683901;
1674
0
  s16 = 0;
1675
1676
0
  s3 += s15 * 666643;
1677
0
  s4 += s15 * 470296;
1678
0
  s5 += s15 * 654183;
1679
0
  s6 -= s15 * 997805;
1680
0
  s7 += s15 * 136657;
1681
0
  s8 -= s15 * 683901;
1682
0
  s15 = 0;
1683
1684
0
  s2 += s14 * 666643;
1685
0
  s3 += s14 * 470296;
1686
0
  s4 += s14 * 654183;
1687
0
  s5 -= s14 * 997805;
1688
0
  s6 += s14 * 136657;
1689
0
  s7 -= s14 * 683901;
1690
0
  s14 = 0;
1691
1692
0
  s1 += s13 * 666643;
1693
0
  s2 += s13 * 470296;
1694
0
  s3 += s13 * 654183;
1695
0
  s4 -= s13 * 997805;
1696
0
  s5 += s13 * 136657;
1697
0
  s6 -= s13 * 683901;
1698
0
  s13 = 0;
1699
1700
0
  s0 += s12 * 666643;
1701
0
  s1 += s12 * 470296;
1702
0
  s2 += s12 * 654183;
1703
0
  s3 -= s12 * 997805;
1704
0
  s4 += s12 * 136657;
1705
0
  s5 -= s12 * 683901;
1706
0
  s12 = 0;
1707
1708
0
  carry0 = (s0 + (1 << 20)) >> 21;
1709
0
  s1 += carry0;
1710
0
  s0 -= int64_lshift21(carry0);
1711
0
  carry2 = (s2 + (1 << 20)) >> 21;
1712
0
  s3 += carry2;
1713
0
  s2 -= int64_lshift21(carry2);
1714
0
  carry4 = (s4 + (1 << 20)) >> 21;
1715
0
  s5 += carry4;
1716
0
  s4 -= int64_lshift21(carry4);
1717
0
  carry6 = (s6 + (1 << 20)) >> 21;
1718
0
  s7 += carry6;
1719
0
  s6 -= int64_lshift21(carry6);
1720
0
  carry8 = (s8 + (1 << 20)) >> 21;
1721
0
  s9 += carry8;
1722
0
  s8 -= int64_lshift21(carry8);
1723
0
  carry10 = (s10 + (1 << 20)) >> 21;
1724
0
  s11 += carry10;
1725
0
  s10 -= int64_lshift21(carry10);
1726
1727
0
  carry1 = (s1 + (1 << 20)) >> 21;
1728
0
  s2 += carry1;
1729
0
  s1 -= int64_lshift21(carry1);
1730
0
  carry3 = (s3 + (1 << 20)) >> 21;
1731
0
  s4 += carry3;
1732
0
  s3 -= int64_lshift21(carry3);
1733
0
  carry5 = (s5 + (1 << 20)) >> 21;
1734
0
  s6 += carry5;
1735
0
  s5 -= int64_lshift21(carry5);
1736
0
  carry7 = (s7 + (1 << 20)) >> 21;
1737
0
  s8 += carry7;
1738
0
  s7 -= int64_lshift21(carry7);
1739
0
  carry9 = (s9 + (1 << 20)) >> 21;
1740
0
  s10 += carry9;
1741
0
  s9 -= int64_lshift21(carry9);
1742
0
  carry11 = (s11 + (1 << 20)) >> 21;
1743
0
  s12 += carry11;
1744
0
  s11 -= int64_lshift21(carry11);
1745
1746
0
  s0 += s12 * 666643;
1747
0
  s1 += s12 * 470296;
1748
0
  s2 += s12 * 654183;
1749
0
  s3 -= s12 * 997805;
1750
0
  s4 += s12 * 136657;
1751
0
  s5 -= s12 * 683901;
1752
0
  s12 = 0;
1753
1754
0
  carry0 = s0 >> 21;
1755
0
  s1 += carry0;
1756
0
  s0 -= int64_lshift21(carry0);
1757
0
  carry1 = s1 >> 21;
1758
0
  s2 += carry1;
1759
0
  s1 -= int64_lshift21(carry1);
1760
0
  carry2 = s2 >> 21;
1761
0
  s3 += carry2;
1762
0
  s2 -= int64_lshift21(carry2);
1763
0
  carry3 = s3 >> 21;
1764
0
  s4 += carry3;
1765
0
  s3 -= int64_lshift21(carry3);
1766
0
  carry4 = s4 >> 21;
1767
0
  s5 += carry4;
1768
0
  s4 -= int64_lshift21(carry4);
1769
0
  carry5 = s5 >> 21;
1770
0
  s6 += carry5;
1771
0
  s5 -= int64_lshift21(carry5);
1772
0
  carry6 = s6 >> 21;
1773
0
  s7 += carry6;
1774
0
  s6 -= int64_lshift21(carry6);
1775
0
  carry7 = s7 >> 21;
1776
0
  s8 += carry7;
1777
0
  s7 -= int64_lshift21(carry7);
1778
0
  carry8 = s8 >> 21;
1779
0
  s9 += carry8;
1780
0
  s8 -= int64_lshift21(carry8);
1781
0
  carry9 = s9 >> 21;
1782
0
  s10 += carry9;
1783
0
  s9 -= int64_lshift21(carry9);
1784
0
  carry10 = s10 >> 21;
1785
0
  s11 += carry10;
1786
0
  s10 -= int64_lshift21(carry10);
1787
0
  carry11 = s11 >> 21;
1788
0
  s12 += carry11;
1789
0
  s11 -= int64_lshift21(carry11);
1790
1791
0
  s0 += s12 * 666643;
1792
0
  s1 += s12 * 470296;
1793
0
  s2 += s12 * 654183;
1794
0
  s3 -= s12 * 997805;
1795
0
  s4 += s12 * 136657;
1796
0
  s5 -= s12 * 683901;
1797
0
  s12 = 0;
1798
1799
0
  carry0 = s0 >> 21;
1800
0
  s1 += carry0;
1801
0
  s0 -= int64_lshift21(carry0);
1802
0
  carry1 = s1 >> 21;
1803
0
  s2 += carry1;
1804
0
  s1 -= int64_lshift21(carry1);
1805
0
  carry2 = s2 >> 21;
1806
0
  s3 += carry2;
1807
0
  s2 -= int64_lshift21(carry2);
1808
0
  carry3 = s3 >> 21;
1809
0
  s4 += carry3;
1810
0
  s3 -= int64_lshift21(carry3);
1811
0
  carry4 = s4 >> 21;
1812
0
  s5 += carry4;
1813
0
  s4 -= int64_lshift21(carry4);
1814
0
  carry5 = s5 >> 21;
1815
0
  s6 += carry5;
1816
0
  s5 -= int64_lshift21(carry5);
1817
0
  carry6 = s6 >> 21;
1818
0
  s7 += carry6;
1819
0
  s6 -= int64_lshift21(carry6);
1820
0
  carry7 = s7 >> 21;
1821
0
  s8 += carry7;
1822
0
  s7 -= int64_lshift21(carry7);
1823
0
  carry8 = s8 >> 21;
1824
0
  s9 += carry8;
1825
0
  s8 -= int64_lshift21(carry8);
1826
0
  carry9 = s9 >> 21;
1827
0
  s10 += carry9;
1828
0
  s9 -= int64_lshift21(carry9);
1829
0
  carry10 = s10 >> 21;
1830
0
  s11 += carry10;
1831
0
  s10 -= int64_lshift21(carry10);
1832
1833
0
  s[0] = s0 >> 0;
1834
0
  s[1] = s0 >> 8;
1835
0
  s[2] = (s0 >> 16) | (s1 << 5);
1836
0
  s[3] = s1 >> 3;
1837
0
  s[4] = s1 >> 11;
1838
0
  s[5] = (s1 >> 19) | (s2 << 2);
1839
0
  s[6] = s2 >> 6;
1840
0
  s[7] = (s2 >> 14) | (s3 << 7);
1841
0
  s[8] = s3 >> 1;
1842
0
  s[9] = s3 >> 9;
1843
0
  s[10] = (s3 >> 17) | (s4 << 4);
1844
0
  s[11] = s4 >> 4;
1845
0
  s[12] = s4 >> 12;
1846
0
  s[13] = (s4 >> 20) | (s5 << 1);
1847
0
  s[14] = s5 >> 7;
1848
0
  s[15] = (s5 >> 15) | (s6 << 6);
1849
0
  s[16] = s6 >> 2;
1850
0
  s[17] = s6 >> 10;
1851
0
  s[18] = (s6 >> 18) | (s7 << 3);
1852
0
  s[19] = s7 >> 5;
1853
0
  s[20] = s7 >> 13;
1854
0
  s[21] = s8 >> 0;
1855
0
  s[22] = s8 >> 8;
1856
0
  s[23] = (s8 >> 16) | (s9 << 5);
1857
0
  s[24] = s9 >> 3;
1858
0
  s[25] = s9 >> 11;
1859
0
  s[26] = (s9 >> 19) | (s10 << 2);
1860
0
  s[27] = s10 >> 6;
1861
0
  s[28] = (s10 >> 14) | (s11 << 7);
1862
0
  s[29] = s11 >> 1;
1863
0
  s[30] = s11 >> 9;
1864
0
  s[31] = s11 >> 17;
1865
0
}
1866
1867
0
void ED25519_keypair(uint8_t out_public_key[32], uint8_t out_private_key[64]) {
1868
0
  uint8_t seed[32];
1869
0
  RAND_bytes(seed, 32);
1870
0
  ED25519_keypair_from_seed(out_public_key, out_private_key, seed);
1871
0
}
1872
1873
int ED25519_sign(uint8_t out_sig[64], const uint8_t *message,
1874
0
                 size_t message_len, const uint8_t private_key[64]) {
1875
  // NOTE: The documentation on this function says that it returns zero on
1876
  // allocation failure. While that can't happen with the current
1877
  // implementation, we want to reserve the ability to allocate in this
1878
  // implementation in the future.
1879
1880
0
  uint8_t az[SHA512_DIGEST_LENGTH];
1881
0
  SHA512(private_key, 32, az);
1882
1883
0
  az[0] &= 248;
1884
0
  az[31] &= 63;
1885
0
  az[31] |= 64;
1886
1887
0
  SHA512_CTX hash_ctx;
1888
0
  SHA512_Init(&hash_ctx);
1889
0
  SHA512_Update(&hash_ctx, az + 32, 32);
1890
0
  SHA512_Update(&hash_ctx, message, message_len);
1891
0
  uint8_t nonce[SHA512_DIGEST_LENGTH];
1892
0
  SHA512_Final(nonce, &hash_ctx);
1893
1894
0
  x25519_sc_reduce(nonce);
1895
0
  ge_p3 R;
1896
0
  x25519_ge_scalarmult_base(&R, nonce);
1897
0
  ge_p3_tobytes(out_sig, &R);
1898
1899
0
  SHA512_Init(&hash_ctx);
1900
0
  SHA512_Update(&hash_ctx, out_sig, 32);
1901
0
  SHA512_Update(&hash_ctx, private_key + 32, 32);
1902
0
  SHA512_Update(&hash_ctx, message, message_len);
1903
0
  uint8_t hram[SHA512_DIGEST_LENGTH];
1904
0
  SHA512_Final(hram, &hash_ctx);
1905
1906
0
  x25519_sc_reduce(hram);
1907
0
  sc_muladd(out_sig + 32, hram, az, nonce);
1908
1909
0
  return 1;
1910
0
}
1911
1912
int ED25519_verify(const uint8_t *message, size_t message_len,
1913
0
                   const uint8_t signature[64], const uint8_t public_key[32]) {
1914
0
  ge_p3 A;
1915
0
  if ((signature[63] & 224) != 0 ||
1916
0
      !x25519_ge_frombytes_vartime(&A, public_key)) {
1917
0
    return 0;
1918
0
  }
1919
1920
0
  fe_loose t;
1921
0
  fe_neg(&t, &A.X);
1922
0
  fe_carry(&A.X, &t);
1923
0
  fe_neg(&t, &A.T);
1924
0
  fe_carry(&A.T, &t);
1925
1926
0
  uint8_t pkcopy[32];
1927
0
  OPENSSL_memcpy(pkcopy, public_key, 32);
1928
0
  uint8_t rcopy[32];
1929
0
  OPENSSL_memcpy(rcopy, signature, 32);
1930
0
  uint8_t scopy[32];
1931
0
  OPENSSL_memcpy(scopy, signature + 32, 32);
1932
1933
  // https://tools.ietf.org/html/rfc8032#section-5.1.7 requires that s be in
1934
  // the range [0, order) in order to prevent signature malleability.
1935
1936
  // kOrder is the order of Curve25519 in little-endian form.
1937
0
  static const uint64_t kOrder[4] = {
1938
0
    UINT64_C(0x5812631a5cf5d3ed),
1939
0
    UINT64_C(0x14def9dea2f79cd6),
1940
0
    0,
1941
0
    UINT64_C(0x1000000000000000),
1942
0
  };
1943
0
  for (size_t i = 3;; i--) {
1944
0
    uint64_t word = CRYPTO_load_u64_le(scopy + i * 8);
1945
0
    if (word > kOrder[i]) {
1946
0
      return 0;
1947
0
    } else if (word < kOrder[i]) {
1948
0
      break;
1949
0
    } else if (i == 0) {
1950
0
      return 0;
1951
0
    }
1952
0
  }
1953
1954
0
  SHA512_CTX hash_ctx;
1955
0
  SHA512_Init(&hash_ctx);
1956
0
  SHA512_Update(&hash_ctx, signature, 32);
1957
0
  SHA512_Update(&hash_ctx, public_key, 32);
1958
0
  SHA512_Update(&hash_ctx, message, message_len);
1959
0
  uint8_t h[SHA512_DIGEST_LENGTH];
1960
0
  SHA512_Final(h, &hash_ctx);
1961
1962
0
  x25519_sc_reduce(h);
1963
1964
0
  ge_p2 R;
1965
0
  ge_double_scalarmult_vartime(&R, h, &A, scopy);
1966
1967
0
  uint8_t rcheck[32];
1968
0
  x25519_ge_tobytes(rcheck, &R);
1969
1970
0
  return CRYPTO_memcmp(rcheck, rcopy, sizeof(rcheck)) == 0;
1971
0
}
1972
1973
void ED25519_keypair_from_seed(uint8_t out_public_key[32],
1974
                               uint8_t out_private_key[64],
1975
83
                               const uint8_t seed[32]) {
1976
83
  uint8_t az[SHA512_DIGEST_LENGTH];
1977
83
  SHA512(seed, 32, az);
1978
1979
83
  az[0] &= 248;
1980
83
  az[31] &= 127;
1981
83
  az[31] |= 64;
1982
1983
83
  ge_p3 A;
1984
83
  x25519_ge_scalarmult_base(&A, az);
1985
83
  ge_p3_tobytes(out_public_key, &A);
1986
1987
83
  OPENSSL_memcpy(out_private_key, seed, 32);
1988
83
  OPENSSL_memcpy(out_private_key + 32, out_public_key, 32);
1989
83
}
1990
1991
1992
static void x25519_scalar_mult_generic(uint8_t out[32],
1993
                                       const uint8_t scalar[32],
1994
0
                                       const uint8_t point[32]) {
1995
0
  fe x1, x2, z2, x3, z3, tmp0, tmp1;
1996
0
  fe_loose x2l, z2l, x3l, tmp0l, tmp1l;
1997
1998
0
  uint8_t e[32];
1999
0
  OPENSSL_memcpy(e, scalar, 32);
2000
0
  e[0] &= 248;
2001
0
  e[31] &= 127;
2002
0
  e[31] |= 64;
2003
2004
  // The following implementation was transcribed to Coq and proven to
2005
  // correspond to unary scalar multiplication in affine coordinates given that
2006
  // x1 != 0 is the x coordinate of some point on the curve. It was also checked
2007
  // in Coq that doing a ladderstep with x1 = x3 = 0 gives z2' = z3' = 0, and z2
2008
  // = z3 = 0 gives z2' = z3' = 0. The statement was quantified over the
2009
  // underlying field, so it applies to Curve25519 itself and the quadratic
2010
  // twist of Curve25519. It was not proven in Coq that prime-field arithmetic
2011
  // correctly simulates extension-field arithmetic on prime-field values.
2012
  // The decoding of the byte array representation of e was not considered.
2013
  // Specification of Montgomery curves in affine coordinates:
2014
  // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Spec/MontgomeryCurve.v#L27>
2015
  // Proof that these form a group that is isomorphic to a Weierstrass curve:
2016
  // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/AffineProofs.v#L35>
2017
  // Coq transcription and correctness proof of the loop (where scalarbits=255):
2018
  // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L118>
2019
  // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L278>
2020
  // preconditions: 0 <= e < 2^255 (not necessarily e < order), fe_invert(0) = 0
2021
0
  fe_frombytes(&x1, point);
2022
0
  fe_1(&x2);
2023
0
  fe_0(&z2);
2024
0
  fe_copy(&x3, &x1);
2025
0
  fe_1(&z3);
2026
2027
0
  unsigned swap = 0;
2028
0
  int pos;
2029
0
  for (pos = 254; pos >= 0; --pos) {
2030
    // loop invariant as of right before the test, for the case where x1 != 0:
2031
    //   pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 is nonzero
2032
    //   let r := e >> (pos+1) in the following equalities of projective points:
2033
    //   to_xz (r*P)     === if swap then (x3, z3) else (x2, z2)
2034
    //   to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3)
2035
    //   x1 is the nonzero x coordinate of the nonzero point (r*P-(r+1)*P)
2036
0
    unsigned b = 1 & (e[pos / 8] >> (pos & 7));
2037
0
    swap ^= b;
2038
0
    fe_cswap(&x2, &x3, swap);
2039
0
    fe_cswap(&z2, &z3, swap);
2040
0
    swap = b;
2041
    // Coq transcription of ladderstep formula (called from transcribed loop):
2042
    // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L89>
2043
    // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L131>
2044
    // x1 != 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L217>
2045
    // x1  = 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L147>
2046
0
    fe_sub(&tmp0l, &x3, &z3);
2047
0
    fe_sub(&tmp1l, &x2, &z2);
2048
0
    fe_add(&x2l, &x2, &z2);
2049
0
    fe_add(&z2l, &x3, &z3);
2050
0
    fe_mul_tll(&z3, &tmp0l, &x2l);
2051
0
    fe_mul_tll(&z2, &z2l, &tmp1l);
2052
0
    fe_sq_tl(&tmp0, &tmp1l);
2053
0
    fe_sq_tl(&tmp1, &x2l);
2054
0
    fe_add(&x3l, &z3, &z2);
2055
0
    fe_sub(&z2l, &z3, &z2);
2056
0
    fe_mul_ttt(&x2, &tmp1, &tmp0);
2057
0
    fe_sub(&tmp1l, &tmp1, &tmp0);
2058
0
    fe_sq_tl(&z2, &z2l);
2059
0
    fe_mul121666(&z3, &tmp1l);
2060
0
    fe_sq_tl(&x3, &x3l);
2061
0
    fe_add(&tmp0l, &tmp0, &z3);
2062
0
    fe_mul_ttt(&z3, &x1, &z2);
2063
0
    fe_mul_tll(&z2, &tmp1l, &tmp0l);
2064
0
  }
2065
  // here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3) else (x2, z2)
2066
0
  fe_cswap(&x2, &x3, swap);
2067
0
  fe_cswap(&z2, &z3, swap);
2068
2069
0
  fe_invert(&z2, &z2);
2070
0
  fe_mul_ttt(&x2, &x2, &z2);
2071
0
  fe_tobytes(out, &x2);
2072
0
}
2073
2074
static void x25519_scalar_mult(uint8_t out[32], const uint8_t scalar[32],
2075
0
                               const uint8_t point[32]) {
2076
#if defined(BORINGSSL_X25519_NEON)
2077
  if (CRYPTO_is_NEON_capable()) {
2078
    x25519_NEON(out, scalar, point);
2079
    return;
2080
  }
2081
#elif defined(BORINGSSL_FE25519_ADX)
2082
0
  if (CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable() &&
2083
0
      CRYPTO_is_ADX_capable()) {
2084
0
    x25519_scalar_mult_adx(out, scalar, point);
2085
0
    return;
2086
0
  }
2087
0
#endif
2088
2089
0
  x25519_scalar_mult_generic(out, scalar, point);
2090
0
}
2091
2092
0
void X25519_keypair(uint8_t out_public_value[32], uint8_t out_private_key[32]) {
2093
0
  RAND_bytes(out_private_key, 32);
2094
2095
  // All X25519 implementations should decode scalars correctly (see
2096
  // https://tools.ietf.org/html/rfc7748#section-5). However, if an
2097
  // implementation doesn't then it might interoperate with random keys a
2098
  // fraction of the time because they'll, randomly, happen to be correctly
2099
  // formed.
2100
  //
2101
  // Thus we do the opposite of the masking here to make sure that our private
2102
  // keys are never correctly masked and so, hopefully, any incorrect
2103
  // implementations are deterministically broken.
2104
  //
2105
  // This does not affect security because, although we're throwing away
2106
  // entropy, a valid implementation of scalarmult should throw away the exact
2107
  // same bits anyway.
2108
0
  out_private_key[0] |= ~248;
2109
0
  out_private_key[31] &= ~64;
2110
0
  out_private_key[31] |= ~127;
2111
2112
0
  X25519_public_from_private(out_public_value, out_private_key);
2113
0
}
2114
2115
int X25519(uint8_t out_shared_key[32], const uint8_t private_key[32],
2116
0
           const uint8_t peer_public_value[32]) {
2117
0
  static const uint8_t kZeros[32] = {0};
2118
0
  x25519_scalar_mult(out_shared_key, private_key, peer_public_value);
2119
  // The all-zero output results when the input is a point of small order.
2120
0
  return constant_time_declassify_int(
2121
0
             CRYPTO_memcmp(kZeros, out_shared_key, 32)) != 0;
2122
0
}
2123
2124
void X25519_public_from_private(uint8_t out_public_value[32],
2125
31
                                const uint8_t private_key[32]) {
2126
#if defined(BORINGSSL_X25519_NEON)
2127
  if (CRYPTO_is_NEON_capable()) {
2128
    static const uint8_t kMongomeryBasePoint[32] = {9};
2129
    x25519_NEON(out_public_value, private_key, kMongomeryBasePoint);
2130
    return;
2131
  }
2132
#endif
2133
2134
31
  uint8_t e[32];
2135
31
  OPENSSL_memcpy(e, private_key, 32);
2136
31
  e[0] &= 248;
2137
31
  e[31] &= 127;
2138
31
  e[31] |= 64;
2139
2140
31
  ge_p3 A;
2141
31
  x25519_ge_scalarmult_base(&A, e);
2142
2143
  // We only need the u-coordinate of the curve25519 point. The map is
2144
  // u=(y+1)/(1-y). Since y=Y/Z, this gives u=(Z+Y)/(Z-Y).
2145
31
  fe_loose zplusy, zminusy;
2146
31
  fe zminusy_inv;
2147
31
  fe_add(&zplusy, &A.Z, &A.Y);
2148
31
  fe_sub(&zminusy, &A.Z, &A.Y);
2149
31
  fe_loose_invert(&zminusy_inv, &zminusy);
2150
31
  fe_mul_tlt(&zminusy_inv, &zplusy, &zminusy_inv);
2151
31
  fe_tobytes(out_public_value, &zminusy_inv);
2152
31
  CONSTTIME_DECLASSIFY(out_public_value, 32);
2153
31
}