Coverage Report

Created: 2025-11-17 06:18

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/boringssl/crypto/curve25519/curve25519.cc
Line
Count
Source
1
// Copyright 2020 The BoringSSL Authors
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
6
//
7
//     https://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14
15
// Some of this code is taken from the ref10 version of Ed25519 in SUPERCOP
16
// 20141124 (http://bench.cr.yp.to/supercop.html). That code is released as
17
// public domain. Other parts have been replaced to call into code generated by
18
// Fiat (https://github.com/mit-plv/fiat-crypto) in //third_party/fiat.
19
//
20
// The field functions are shared by Ed25519 and X25519 where possible.
21
22
#include <assert.h>
23
#include <string.h>
24
25
#include <openssl/mem.h>
26
#include <openssl/rand.h>
27
#include <openssl/sha2.h>
28
29
#include "../internal.h"
30
#include "internal.h"
31
32
// Various pre-computed constants.
33
#include "./curve25519_tables.h"
34
35
#if defined(BORINGSSL_HAS_UINT128)
36
#include "../../third_party/fiat/curve25519_64.h"
37
#elif defined(OPENSSL_64_BIT)
38
#include "../../third_party/fiat/curve25519_64_msvc.h"
39
#else
40
#include "../../third_party/fiat/curve25519_32.h"
41
#endif
42
43
44
// Low-level intrinsic operations
45
46
0
static uint64_t load_3(const uint8_t *in) {
47
0
  uint64_t result;
48
0
  result = (uint64_t)in[0];
49
0
  result |= ((uint64_t)in[1]) << 8;
50
0
  result |= ((uint64_t)in[2]) << 16;
51
0
  return result;
52
0
}
53
54
0
static uint64_t load_4(const uint8_t *in) {
55
0
  uint64_t result;
56
0
  result = (uint64_t)in[0];
57
0
  result |= ((uint64_t)in[1]) << 8;
58
0
  result |= ((uint64_t)in[2]) << 16;
59
0
  result |= ((uint64_t)in[3]) << 24;
60
0
  return result;
61
0
}
62
63
64
// Field operations.
65
66
#if defined(OPENSSL_64_BIT)
67
68
typedef uint64_t fe_limb_t;
69
0
#define FE_NUM_LIMBS 5
70
71
// assert_fe asserts that |f| satisfies bounds:
72
//
73
//  [[0x0 ~> 0x8cccccccccccc],
74
//   [0x0 ~> 0x8cccccccccccc],
75
//   [0x0 ~> 0x8cccccccccccc],
76
//   [0x0 ~> 0x8cccccccccccc],
77
//   [0x0 ~> 0x8cccccccccccc]]
78
//
79
// See comments in curve25519_64.h for which functions use these bounds for
80
// inputs or outputs.
81
#define assert_fe(f)                                                    \
82
46.2M
  do {                                                                  \
83
277M
    for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \
84
231M
      declassify_assert(f[_assert_fe_i] <= UINT64_C(0x8cccccccccccc));  \
85
231M
    }                                                                   \
86
46.2M
  } while (0)
87
88
// assert_fe_loose asserts that |f| satisfies bounds:
89
//
90
//  [[0x0 ~> 0x1a666666666664],
91
//   [0x0 ~> 0x1a666666666664],
92
//   [0x0 ~> 0x1a666666666664],
93
//   [0x0 ~> 0x1a666666666664],
94
//   [0x0 ~> 0x1a666666666664]]
95
//
96
// See comments in curve25519_64.h for which functions use these bounds for
97
// inputs or outputs.
98
#define assert_fe_loose(f)                                              \
99
47.7M
  do {                                                                  \
100
286M
    for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \
101
238M
      declassify_assert(f[_assert_fe_i] <= UINT64_C(0x1a666666666664)); \
102
238M
    }                                                                   \
103
47.7M
  } while (0)
104
105
#else
106
107
typedef uint32_t fe_limb_t;
108
#define FE_NUM_LIMBS 10
109
110
// assert_fe asserts that |f| satisfies bounds:
111
//
112
//  [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
113
//   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
114
//   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
115
//   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
116
//   [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
117
//
118
// See comments in curve25519_32.h for which functions use these bounds for
119
// inputs or outputs.
120
#define assert_fe(f)                                                     \
121
  do {                                                                   \
122
    for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \
123
      declassify_assert(f[_assert_fe_i] <=                               \
124
                        ((_assert_fe_i & 1) ? 0x2333333u : 0x4666666u)); \
125
    }                                                                    \
126
  } while (0)
127
128
// assert_fe_loose asserts that |f| satisfies bounds:
129
//
130
//  [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
131
//   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
132
//   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
133
//   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
134
//   [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]]
135
//
136
// See comments in curve25519_32.h for which functions use these bounds for
137
// inputs or outputs.
138
#define assert_fe_loose(f)                                               \
139
  do {                                                                   \
140
    for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \
141
      declassify_assert(f[_assert_fe_i] <=                               \
142
                        ((_assert_fe_i & 1) ? 0x6999999u : 0xd333332u)); \
143
    }                                                                    \
144
  } while (0)
145
146
#endif  // OPENSSL_64_BIT
147
148
static_assert(sizeof(fe) == sizeof(fe_limb_t) * FE_NUM_LIMBS,
149
              "fe_limb_t[FE_NUM_LIMBS] is inconsistent with fe");
150
151
0
static void fe_frombytes_strict(fe *h, const uint8_t s[32]) {
152
  // |fiat_25519_from_bytes| requires the top-most bit be clear.
153
0
  declassify_assert((s[31] & 0x80) == 0);
154
0
  fiat_25519_from_bytes(h->v, s);
155
0
  assert_fe(h->v);
156
0
}
157
158
0
static void fe_frombytes(fe *h, const uint8_t s[32]) {
159
0
  uint8_t s_copy[32];
160
0
  OPENSSL_memcpy(s_copy, s, 32);
161
0
  s_copy[31] &= 0x7f;
162
0
  fe_frombytes_strict(h, s_copy);
163
0
}
164
165
170k
static void fe_tobytes(uint8_t s[32], const fe *f) {
166
170k
  assert_fe(f->v);
167
170k
  fiat_25519_to_bytes(s, f->v);
168
170k
}
169
170
// h = 0
171
0
static void fe_0(fe *h) { OPENSSL_memset(h, 0, sizeof(fe)); }
172
173
0
static void fe_loose_0(fe_loose *h) { OPENSSL_memset(h, 0, sizeof(fe_loose)); }
174
175
// h = 1
176
0
static void fe_1(fe *h) {
177
0
  OPENSSL_memset(h, 0, sizeof(fe));
178
0
  h->v[0] = 1;
179
0
}
180
181
0
static void fe_loose_1(fe_loose *h) {
182
0
  OPENSSL_memset(h, 0, sizeof(fe_loose));
183
0
  h->v[0] = 1;
184
0
}
185
186
// h = f + g
187
// Can overlap h with f or g.
188
170k
static void fe_add(fe_loose *h, const fe *f, const fe *g) {
189
170k
  assert_fe(f->v);
190
170k
  assert_fe(g->v);
191
170k
  fiat_25519_add(h->v, f->v, g->v);
192
170k
  assert_fe_loose(h->v);
193
170k
}
194
195
// h = f - g
196
// Can overlap h with f or g.
197
170k
static void fe_sub(fe_loose *h, const fe *f, const fe *g) {
198
170k
  assert_fe(f->v);
199
170k
  assert_fe(g->v);
200
170k
  fiat_25519_sub(h->v, f->v, g->v);
201
170k
  assert_fe_loose(h->v);
202
170k
}
203
204
0
static void fe_carry(fe *h, const fe_loose *f) {
205
0
  assert_fe_loose(f->v);
206
0
  fiat_25519_carry(h->v, f->v);
207
0
  assert_fe(h->v);
208
0
}
209
210
static void fe_mul_impl(fe_limb_t out[FE_NUM_LIMBS],
211
                        const fe_limb_t in1[FE_NUM_LIMBS],
212
2.04M
                        const fe_limb_t in2[FE_NUM_LIMBS]) {
213
2.04M
  assert_fe_loose(in1);
214
2.04M
  assert_fe_loose(in2);
215
2.04M
  fiat_25519_carry_mul(out, in1, in2);
216
2.04M
  assert_fe(out);
217
2.04M
}
218
219
0
static void fe_mul_ltt(fe_loose *h, const fe *f, const fe *g) {
220
0
  fe_mul_impl(h->v, f->v, g->v);
221
0
}
222
223
0
static void fe_mul_llt(fe_loose *h, const fe_loose *f, const fe *g) {
224
0
  fe_mul_impl(h->v, f->v, g->v);
225
0
}
226
227
1.70M
static void fe_mul_ttt(fe *h, const fe *f, const fe *g) {
228
1.70M
  fe_mul_impl(h->v, f->v, g->v);
229
1.70M
}
230
231
341k
static void fe_mul_tlt(fe *h, const fe_loose *f, const fe *g) {
232
341k
  fe_mul_impl(h->v, f->v, g->v);
233
341k
}
234
235
0
static void fe_mul_ttl(fe *h, const fe *f, const fe_loose *g) {
236
0
  fe_mul_impl(h->v, f->v, g->v);
237
0
}
238
239
0
static void fe_mul_tll(fe *h, const fe_loose *f, const fe_loose *g) {
240
0
  fe_mul_impl(h->v, f->v, g->v);
241
0
}
242
243
170k
static void fe_sq_tl(fe *h, const fe_loose *f) {
244
170k
  assert_fe_loose(f->v);
245
170k
  fiat_25519_carry_square(h->v, f->v);
246
170k
  assert_fe(h->v);
247
170k
}
248
249
43.1M
static void fe_sq_tt(fe *h, const fe *f) {
250
43.1M
  assert_fe_loose(f->v);
251
43.1M
  fiat_25519_carry_square(h->v, f->v);
252
43.1M
  assert_fe(h->v);
253
43.1M
}
254
255
// Replace (f,g) with (g,f) if b == 1;
256
// replace (f,g) with (f,g) if b == 0.
257
//
258
// Preconditions: b in {0,1}.
259
0
static void fe_cswap(fe *f, fe *g, fe_limb_t b) {
260
0
  b = 0 - b;
261
0
  for (unsigned i = 0; i < FE_NUM_LIMBS; i++) {
262
0
    fe_limb_t x = f->v[i] ^ g->v[i];
263
0
    x &= b;
264
0
    f->v[i] ^= x;
265
0
    g->v[i] ^= x;
266
0
  }
267
0
}
268
269
0
static void fe_mul121666(fe *h, const fe_loose *f) {
270
0
  assert_fe_loose(f->v);
271
0
  fiat_25519_carry_scmul_121666(h->v, f->v);
272
0
  assert_fe(h->v);
273
0
}
274
275
// h = -f
276
0
static void fe_neg(fe_loose *h, const fe *f) {
277
0
  assert_fe(f->v);
278
0
  fiat_25519_opp(h->v, f->v);
279
0
  assert_fe_loose(h->v);
280
0
}
281
282
// Replace (f,g) with (g,g) if b == 1;
283
// replace (f,g) with (f,g) if b == 0.
284
//
285
// Preconditions: b in {0,1}.
286
0
static void fe_cmov(fe_loose *f, const fe_loose *g, fe_limb_t b) {
287
  // Silence an unused function warning. |fiat_25519_selectznz| isn't quite the
288
  // calling convention the rest of this code wants, so implement it by hand.
289
  //
290
  // TODO(davidben): Switch to fiat's calling convention, or ask fiat to emit a
291
  // different one.
292
293
0
  b = 0 - b;
294
0
  for (unsigned i = 0; i < FE_NUM_LIMBS; i++) {
295
0
    fe_limb_t x = f->v[i] ^ g->v[i];
296
0
    x &= b;
297
0
    f->v[i] ^= x;
298
0
  }
299
0
}
300
301
// h = f
302
0
static void fe_copy(fe *h, const fe *f) { OPENSSL_memmove(h, f, sizeof(fe)); }
303
304
32
static void fe_copy_lt(fe_loose *h, const fe *f) {
305
32
  static_assert(sizeof(fe_loose) == sizeof(fe), "fe and fe_loose mismatch");
306
32
  OPENSSL_memmove(h, f, sizeof(fe));
307
32
}
308
309
170k
static void fe_loose_invert(fe *out, const fe_loose *z) {
310
170k
  fe t0;
311
170k
  fe t1;
312
170k
  fe t2;
313
170k
  fe t3;
314
170k
  int i;
315
316
170k
  fe_sq_tl(&t0, z);
317
170k
  fe_sq_tt(&t1, &t0);
318
341k
  for (i = 1; i < 2; ++i) {
319
170k
    fe_sq_tt(&t1, &t1);
320
170k
  }
321
170k
  fe_mul_tlt(&t1, z, &t1);
322
170k
  fe_mul_ttt(&t0, &t0, &t1);
323
170k
  fe_sq_tt(&t2, &t0);
324
170k
  fe_mul_ttt(&t1, &t1, &t2);
325
170k
  fe_sq_tt(&t2, &t1);
326
853k
  for (i = 1; i < 5; ++i) {
327
682k
    fe_sq_tt(&t2, &t2);
328
682k
  }
329
170k
  fe_mul_ttt(&t1, &t2, &t1);
330
170k
  fe_sq_tt(&t2, &t1);
331
1.70M
  for (i = 1; i < 10; ++i) {
332
1.53M
    fe_sq_tt(&t2, &t2);
333
1.53M
  }
334
170k
  fe_mul_ttt(&t2, &t2, &t1);
335
170k
  fe_sq_tt(&t3, &t2);
336
3.41M
  for (i = 1; i < 20; ++i) {
337
3.24M
    fe_sq_tt(&t3, &t3);
338
3.24M
  }
339
170k
  fe_mul_ttt(&t2, &t3, &t2);
340
170k
  fe_sq_tt(&t2, &t2);
341
1.70M
  for (i = 1; i < 10; ++i) {
342
1.53M
    fe_sq_tt(&t2, &t2);
343
1.53M
  }
344
170k
  fe_mul_ttt(&t1, &t2, &t1);
345
170k
  fe_sq_tt(&t2, &t1);
346
8.53M
  for (i = 1; i < 50; ++i) {
347
8.36M
    fe_sq_tt(&t2, &t2);
348
8.36M
  }
349
170k
  fe_mul_ttt(&t2, &t2, &t1);
350
170k
  fe_sq_tt(&t3, &t2);
351
17.0M
  for (i = 1; i < 100; ++i) {
352
16.8M
    fe_sq_tt(&t3, &t3);
353
16.8M
  }
354
170k
  fe_mul_ttt(&t2, &t3, &t2);
355
170k
  fe_sq_tt(&t2, &t2);
356
8.53M
  for (i = 1; i < 50; ++i) {
357
8.36M
    fe_sq_tt(&t2, &t2);
358
8.36M
  }
359
170k
  fe_mul_ttt(&t1, &t2, &t1);
360
170k
  fe_sq_tt(&t1, &t1);
361
853k
  for (i = 1; i < 5; ++i) {
362
682k
    fe_sq_tt(&t1, &t1);
363
682k
  }
364
170k
  fe_mul_ttt(out, &t1, &t0);
365
170k
}
366
367
32
static void fe_invert(fe *out, const fe *z) {
368
32
  fe_loose l;
369
32
  fe_copy_lt(&l, z);
370
32
  fe_loose_invert(out, &l);
371
32
}
372
373
// return 0 if f == 0
374
// return 1 if f != 0
375
0
static int fe_isnonzero(const fe_loose *f) {
376
0
  fe tight;
377
0
  fe_carry(&tight, f);
378
0
  uint8_t s[32];
379
0
  fe_tobytes(s, &tight);
380
381
0
  static const uint8_t zero[32] = {0};
382
0
  return CRYPTO_memcmp(s, zero, sizeof(zero)) != 0;
383
0
}
384
385
// return 1 if f is in {1,3,5,...,q-2}
386
// return 0 if f is in {0,2,4,...,q-1}
387
32
static int fe_isnegative(const fe *f) {
388
32
  uint8_t s[32];
389
32
  fe_tobytes(s, f);
390
32
  return s[0] & 1;
391
32
}
392
393
0
static void fe_sq2_tt(fe *h, const fe *f) {
394
  // h = f^2
395
0
  fe_sq_tt(h, f);
396
397
  // h = h + h
398
0
  fe_loose tmp;
399
0
  fe_add(&tmp, h, h);
400
0
  fe_carry(h, &tmp);
401
0
}
402
403
0
static void fe_pow22523(fe *out, const fe *z) {
404
0
  fe t0;
405
0
  fe t1;
406
0
  fe t2;
407
0
  int i;
408
409
0
  fe_sq_tt(&t0, z);
410
0
  fe_sq_tt(&t1, &t0);
411
0
  for (i = 1; i < 2; ++i) {
412
0
    fe_sq_tt(&t1, &t1);
413
0
  }
414
0
  fe_mul_ttt(&t1, z, &t1);
415
0
  fe_mul_ttt(&t0, &t0, &t1);
416
0
  fe_sq_tt(&t0, &t0);
417
0
  fe_mul_ttt(&t0, &t1, &t0);
418
0
  fe_sq_tt(&t1, &t0);
419
0
  for (i = 1; i < 5; ++i) {
420
0
    fe_sq_tt(&t1, &t1);
421
0
  }
422
0
  fe_mul_ttt(&t0, &t1, &t0);
423
0
  fe_sq_tt(&t1, &t0);
424
0
  for (i = 1; i < 10; ++i) {
425
0
    fe_sq_tt(&t1, &t1);
426
0
  }
427
0
  fe_mul_ttt(&t1, &t1, &t0);
428
0
  fe_sq_tt(&t2, &t1);
429
0
  for (i = 1; i < 20; ++i) {
430
0
    fe_sq_tt(&t2, &t2);
431
0
  }
432
0
  fe_mul_ttt(&t1, &t2, &t1);
433
0
  fe_sq_tt(&t1, &t1);
434
0
  for (i = 1; i < 10; ++i) {
435
0
    fe_sq_tt(&t1, &t1);
436
0
  }
437
0
  fe_mul_ttt(&t0, &t1, &t0);
438
0
  fe_sq_tt(&t1, &t0);
439
0
  for (i = 1; i < 50; ++i) {
440
0
    fe_sq_tt(&t1, &t1);
441
0
  }
442
0
  fe_mul_ttt(&t1, &t1, &t0);
443
0
  fe_sq_tt(&t2, &t1);
444
0
  for (i = 1; i < 100; ++i) {
445
0
    fe_sq_tt(&t2, &t2);
446
0
  }
447
0
  fe_mul_ttt(&t1, &t2, &t1);
448
0
  fe_sq_tt(&t1, &t1);
449
0
  for (i = 1; i < 50; ++i) {
450
0
    fe_sq_tt(&t1, &t1);
451
0
  }
452
0
  fe_mul_ttt(&t0, &t1, &t0);
453
0
  fe_sq_tt(&t0, &t0);
454
0
  for (i = 1; i < 2; ++i) {
455
0
    fe_sq_tt(&t0, &t0);
456
0
  }
457
0
  fe_mul_ttt(out, &t0, z);
458
0
}
459
460
461
// Group operations.
462
463
0
void x25519_ge_tobytes(uint8_t s[32], const ge_p2 *h) {
464
0
  fe recip;
465
0
  fe x;
466
0
  fe y;
467
468
0
  fe_invert(&recip, &h->Z);
469
0
  fe_mul_ttt(&x, &h->X, &recip);
470
0
  fe_mul_ttt(&y, &h->Y, &recip);
471
0
  fe_tobytes(s, &y);
472
0
  s[31] ^= fe_isnegative(&x) << 7;
473
0
}
474
475
32
static void ge_p3_tobytes(uint8_t s[32], const ge_p3 *h) {
476
32
  fe recip;
477
32
  fe x;
478
32
  fe y;
479
480
32
  fe_invert(&recip, &h->Z);
481
32
  fe_mul_ttt(&x, &h->X, &recip);
482
32
  fe_mul_ttt(&y, &h->Y, &recip);
483
32
  fe_tobytes(s, &y);
484
32
  s[31] ^= fe_isnegative(&x) << 7;
485
32
}
486
487
0
int x25519_ge_frombytes_vartime(ge_p3 *h, const uint8_t s[32]) {
488
0
  fe u;
489
0
  fe_loose v;
490
0
  fe w;
491
0
  fe vxx;
492
0
  fe_loose check;
493
494
0
  fe_frombytes(&h->Y, s);
495
0
  fe_1(&h->Z);
496
0
  fe_sq_tt(&w, &h->Y);
497
0
  fe_mul_ttt(&vxx, &w, &d);
498
0
  fe_sub(&v, &w, &h->Z);  // u = y^2-1
499
0
  fe_carry(&u, &v);
500
0
  fe_add(&v, &vxx, &h->Z);  // v = dy^2+1
501
502
0
  fe_mul_ttl(&w, &u, &v);        // w = u*v
503
0
  fe_pow22523(&h->X, &w);        // x = w^((q-5)/8)
504
0
  fe_mul_ttt(&h->X, &h->X, &u);  // x = u*w^((q-5)/8)
505
506
0
  fe_sq_tt(&vxx, &h->X);
507
0
  fe_mul_ttl(&vxx, &vxx, &v);
508
0
  fe_sub(&check, &vxx, &u);
509
0
  if (fe_isnonzero(&check)) {
510
0
    fe_add(&check, &vxx, &u);
511
0
    if (fe_isnonzero(&check)) {
512
0
      return 0;
513
0
    }
514
0
    fe_mul_ttt(&h->X, &h->X, &sqrtm1);
515
0
  }
516
517
0
  if (fe_isnegative(&h->X) != (s[31] >> 7)) {
518
0
    fe_loose t;
519
0
    fe_neg(&t, &h->X);
520
0
    fe_carry(&h->X, &t);
521
0
  }
522
523
0
  fe_mul_ttt(&h->T, &h->X, &h->Y);
524
0
  return 1;
525
0
}
526
527
0
static void ge_p2_0(ge_p2 *h) {
528
0
  fe_0(&h->X);
529
0
  fe_1(&h->Y);
530
0
  fe_1(&h->Z);
531
0
}
532
533
0
static void ge_p3_0(ge_p3 *h) {
534
0
  fe_0(&h->X);
535
0
  fe_1(&h->Y);
536
0
  fe_1(&h->Z);
537
0
  fe_0(&h->T);
538
0
}
539
540
0
static void ge_cached_0(ge_cached *h) {
541
0
  fe_loose_1(&h->YplusX);
542
0
  fe_loose_1(&h->YminusX);
543
0
  fe_loose_1(&h->Z);
544
0
  fe_loose_0(&h->T2d);
545
0
}
546
547
0
static void ge_precomp_0(ge_precomp *h) {
548
0
  fe_loose_1(&h->yplusx);
549
0
  fe_loose_1(&h->yminusx);
550
0
  fe_loose_0(&h->xy2d);
551
0
}
552
553
// r = p
554
0
static void ge_p3_to_p2(ge_p2 *r, const ge_p3 *p) {
555
0
  fe_copy(&r->X, &p->X);
556
0
  fe_copy(&r->Y, &p->Y);
557
0
  fe_copy(&r->Z, &p->Z);
558
0
}
559
560
// r = p
561
0
void x25519_ge_p3_to_cached(ge_cached *r, const ge_p3 *p) {
562
0
  fe_add(&r->YplusX, &p->Y, &p->X);
563
0
  fe_sub(&r->YminusX, &p->Y, &p->X);
564
0
  fe_copy_lt(&r->Z, &p->Z);
565
0
  fe_mul_ltt(&r->T2d, &p->T, &d2);
566
0
}
567
568
// r = p
569
0
void x25519_ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p) {
570
0
  fe_mul_tll(&r->X, &p->X, &p->T);
571
0
  fe_mul_tll(&r->Y, &p->Y, &p->Z);
572
0
  fe_mul_tll(&r->Z, &p->Z, &p->T);
573
0
}
574
575
// r = p
576
0
void x25519_ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p) {
577
0
  fe_mul_tll(&r->X, &p->X, &p->T);
578
0
  fe_mul_tll(&r->Y, &p->Y, &p->Z);
579
0
  fe_mul_tll(&r->Z, &p->Z, &p->T);
580
0
  fe_mul_tll(&r->T, &p->X, &p->Y);
581
0
}
582
583
// r = p
584
0
static void ge_p1p1_to_cached(ge_cached *r, const ge_p1p1 *p) {
585
0
  ge_p3 t;
586
0
  x25519_ge_p1p1_to_p3(&t, p);
587
0
  x25519_ge_p3_to_cached(r, &t);
588
0
}
589
590
// r = 2 * p
591
0
static void ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p) {
592
0
  fe trX, trZ, trT;
593
0
  fe t0;
594
595
0
  fe_sq_tt(&trX, &p->X);
596
0
  fe_sq_tt(&trZ, &p->Y);
597
0
  fe_sq2_tt(&trT, &p->Z);
598
0
  fe_add(&r->Y, &p->X, &p->Y);
599
0
  fe_sq_tl(&t0, &r->Y);
600
601
0
  fe_add(&r->Y, &trZ, &trX);
602
0
  fe_sub(&r->Z, &trZ, &trX);
603
0
  fe_carry(&trZ, &r->Y);
604
0
  fe_sub(&r->X, &t0, &trZ);
605
0
  fe_carry(&trZ, &r->Z);
606
0
  fe_sub(&r->T, &trT, &trZ);
607
0
}
608
609
// r = 2 * p
610
0
static void ge_p3_dbl(ge_p1p1 *r, const ge_p3 *p) {
611
0
  ge_p2 q;
612
0
  ge_p3_to_p2(&q, p);
613
0
  ge_p2_dbl(r, &q);
614
0
}
615
616
// r = p + q
617
0
static void ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) {
618
0
  fe trY, trZ, trT;
619
620
0
  fe_add(&r->X, &p->Y, &p->X);
621
0
  fe_sub(&r->Y, &p->Y, &p->X);
622
0
  fe_mul_tll(&trZ, &r->X, &q->yplusx);
623
0
  fe_mul_tll(&trY, &r->Y, &q->yminusx);
624
0
  fe_mul_tlt(&trT, &q->xy2d, &p->T);
625
0
  fe_add(&r->T, &p->Z, &p->Z);
626
0
  fe_sub(&r->X, &trZ, &trY);
627
0
  fe_add(&r->Y, &trZ, &trY);
628
0
  fe_carry(&trZ, &r->T);
629
0
  fe_add(&r->Z, &trZ, &trT);
630
0
  fe_sub(&r->T, &trZ, &trT);
631
0
}
632
633
// r = p - q
634
0
static void ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) {
635
0
  fe trY, trZ, trT;
636
637
0
  fe_add(&r->X, &p->Y, &p->X);
638
0
  fe_sub(&r->Y, &p->Y, &p->X);
639
0
  fe_mul_tll(&trZ, &r->X, &q->yminusx);
640
0
  fe_mul_tll(&trY, &r->Y, &q->yplusx);
641
0
  fe_mul_tlt(&trT, &q->xy2d, &p->T);
642
0
  fe_add(&r->T, &p->Z, &p->Z);
643
0
  fe_sub(&r->X, &trZ, &trY);
644
0
  fe_add(&r->Y, &trZ, &trY);
645
0
  fe_carry(&trZ, &r->T);
646
0
  fe_sub(&r->Z, &trZ, &trT);
647
0
  fe_add(&r->T, &trZ, &trT);
648
0
}
649
650
// r = p + q
651
0
void x25519_ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
652
0
  fe trX, trY, trZ, trT;
653
654
0
  fe_add(&r->X, &p->Y, &p->X);
655
0
  fe_sub(&r->Y, &p->Y, &p->X);
656
0
  fe_mul_tll(&trZ, &r->X, &q->YplusX);
657
0
  fe_mul_tll(&trY, &r->Y, &q->YminusX);
658
0
  fe_mul_tlt(&trT, &q->T2d, &p->T);
659
0
  fe_mul_ttl(&trX, &p->Z, &q->Z);
660
0
  fe_add(&r->T, &trX, &trX);
661
0
  fe_sub(&r->X, &trZ, &trY);
662
0
  fe_add(&r->Y, &trZ, &trY);
663
0
  fe_carry(&trZ, &r->T);
664
0
  fe_add(&r->Z, &trZ, &trT);
665
0
  fe_sub(&r->T, &trZ, &trT);
666
0
}
667
668
// r = p - q
669
0
void x25519_ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
670
0
  fe trX, trY, trZ, trT;
671
672
0
  fe_add(&r->X, &p->Y, &p->X);
673
0
  fe_sub(&r->Y, &p->Y, &p->X);
674
0
  fe_mul_tll(&trZ, &r->X, &q->YminusX);
675
0
  fe_mul_tll(&trY, &r->Y, &q->YplusX);
676
0
  fe_mul_tlt(&trT, &q->T2d, &p->T);
677
0
  fe_mul_ttl(&trX, &p->Z, &q->Z);
678
0
  fe_add(&r->T, &trX, &trX);
679
0
  fe_sub(&r->X, &trZ, &trY);
680
0
  fe_add(&r->Y, &trZ, &trY);
681
0
  fe_carry(&trZ, &r->T);
682
0
  fe_sub(&r->Z, &trZ, &trT);
683
0
  fe_add(&r->T, &trZ, &trT);
684
0
}
685
686
0
static void cmov(ge_precomp *t, const ge_precomp *u, uint8_t b) {
687
0
  fe_cmov(&t->yplusx, &u->yplusx, b);
688
0
  fe_cmov(&t->yminusx, &u->yminusx, b);
689
0
  fe_cmov(&t->xy2d, &u->xy2d, b);
690
0
}
691
692
void x25519_ge_scalarmult_small_precomp(
693
0
    ge_p3 *h, const uint8_t a[32], const uint8_t precomp_table[15 * 2 * 32]) {
694
  // precomp_table is first expanded into matching |ge_precomp|
695
  // elements.
696
0
  ge_precomp multiples[15];
697
698
0
  unsigned i;
699
0
  for (i = 0; i < 15; i++) {
700
    // The precomputed table is assumed to already clear the top bit, so
701
    // |fe_frombytes_strict| may be used directly.
702
0
    const uint8_t *bytes = &precomp_table[i * (2 * 32)];
703
0
    fe x, y;
704
0
    fe_frombytes_strict(&x, bytes);
705
0
    fe_frombytes_strict(&y, bytes + 32);
706
707
0
    ge_precomp *out = &multiples[i];
708
0
    fe_add(&out->yplusx, &y, &x);
709
0
    fe_sub(&out->yminusx, &y, &x);
710
0
    fe_mul_ltt(&out->xy2d, &x, &y);
711
0
    fe_mul_llt(&out->xy2d, &out->xy2d, &d2);
712
0
  }
713
714
  // See the comment above |k25519SmallPrecomp| about the structure of the
715
  // precomputed elements. This loop does 64 additions and 64 doublings to
716
  // calculate the result.
717
0
  ge_p3_0(h);
718
719
0
  for (i = 63; i < 64; i--) {
720
0
    unsigned j;
721
0
    signed char index = 0;
722
723
0
    for (j = 0; j < 4; j++) {
724
0
      const uint8_t bit = 1 & (a[(8 * j) + (i / 8)] >> (i & 7));
725
0
      index |= (bit << j);
726
0
    }
727
728
0
    ge_precomp e;
729
0
    ge_precomp_0(&e);
730
731
0
    for (j = 1; j < 16; j++) {
732
0
      cmov(&e, &multiples[j - 1], 1 & constant_time_eq_w(index, j));
733
0
    }
734
735
0
    ge_cached cached;
736
0
    ge_p1p1 r;
737
0
    x25519_ge_p3_to_cached(&cached, h);
738
0
    x25519_ge_add(&r, h, &cached);
739
0
    x25519_ge_p1p1_to_p3(h, &r);
740
741
0
    ge_madd(&r, h, &e);
742
0
    x25519_ge_p1p1_to_p3(h, &r);
743
0
  }
744
0
}
745
746
#if defined(OPENSSL_SMALL)
747
748
void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]) {
749
  x25519_ge_scalarmult_small_precomp(h, a, k25519SmallPrecomp);
750
}
751
752
#else
753
754
0
static void table_select(ge_precomp *t, const int pos, const signed char b) {
755
0
  uint8_t bnegative = constant_time_msb_w(b);
756
0
  uint8_t babs = b - ((bnegative & b) << 1);
757
758
0
  uint8_t t_bytes[3][32] = {
759
0
      {static_cast<uint8_t>(constant_time_is_zero_w(b) & 1)},
760
0
      {static_cast<uint8_t>(constant_time_is_zero_w(b) & 1)},
761
0
      {0}};
762
0
#if defined(__clang__)  // materialize for vectorization, 6% speedup
763
0
  __asm__("" : "+m"(t_bytes) : /*no inputs*/);
764
0
#endif
765
0
  static_assert(sizeof(t_bytes) == sizeof(k25519Precomp[pos][0]));
766
0
  for (int i = 0; i < 8; i++) {
767
0
    constant_time_conditional_memxor(t_bytes, k25519Precomp[pos][i],
768
0
                                     sizeof(t_bytes),
769
0
                                     constant_time_eq_w(babs, 1 + i));
770
0
  }
771
772
0
  fe yplusx, yminusx, xy2d;
773
0
  fe_frombytes_strict(&yplusx, t_bytes[0]);
774
0
  fe_frombytes_strict(&yminusx, t_bytes[1]);
775
0
  fe_frombytes_strict(&xy2d, t_bytes[2]);
776
777
0
  fe_copy_lt(&t->yplusx, &yplusx);
778
0
  fe_copy_lt(&t->yminusx, &yminusx);
779
0
  fe_copy_lt(&t->xy2d, &xy2d);
780
781
0
  ge_precomp minust;
782
0
  fe_copy_lt(&minust.yplusx, &yminusx);
783
0
  fe_copy_lt(&minust.yminusx, &yplusx);
784
0
  fe_neg(&minust.xy2d, &xy2d);
785
0
  cmov(t, &minust, bnegative >> 7);
786
0
}
787
788
// h = a * B
789
// where a = a[0]+256*a[1]+...+256^31 a[31]
790
// B is the Ed25519 base point (x,4/5) with x positive.
791
//
792
// Preconditions:
793
//   a[31] <= 127
794
170k
void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]) {
795
170k
#if defined(BORINGSSL_FE25519_ADX)
796
170k
  if (CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable() &&
797
170k
      CRYPTO_is_ADX_capable()) {
798
170k
    uint8_t t[4][32];
799
170k
    x25519_ge_scalarmult_base_adx(t, a);
800
170k
    fiat_25519_from_bytes(h->X.v, t[0]);
801
170k
    fiat_25519_from_bytes(h->Y.v, t[1]);
802
170k
    fiat_25519_from_bytes(h->Z.v, t[2]);
803
170k
    fiat_25519_from_bytes(h->T.v, t[3]);
804
170k
    return;
805
170k
  }
806
0
#endif
807
0
  signed char e[64];
808
0
  signed char carry;
809
0
  ge_p1p1 r;
810
0
  ge_p2 s;
811
0
  ge_precomp t;
812
0
  int i;
813
814
0
  for (i = 0; i < 32; ++i) {
815
0
    e[2 * i + 0] = (a[i] >> 0) & 15;
816
0
    e[2 * i + 1] = (a[i] >> 4) & 15;
817
0
  }
818
  // each e[i] is between 0 and 15
819
  // e[63] is between 0 and 7
820
821
0
  carry = 0;
822
0
  for (i = 0; i < 63; ++i) {
823
0
    e[i] += carry;
824
0
    carry = e[i] + 8;
825
0
    carry >>= 4;
826
0
    e[i] -= carry << 4;
827
0
  }
828
0
  e[63] += carry;
829
  // each e[i] is between -8 and 8
830
831
0
  ge_p3_0(h);
832
0
  for (i = 1; i < 64; i += 2) {
833
0
    table_select(&t, i / 2, e[i]);
834
0
    ge_madd(&r, h, &t);
835
0
    x25519_ge_p1p1_to_p3(h, &r);
836
0
  }
837
838
0
  ge_p3_dbl(&r, h);
839
0
  x25519_ge_p1p1_to_p2(&s, &r);
840
0
  ge_p2_dbl(&r, &s);
841
0
  x25519_ge_p1p1_to_p2(&s, &r);
842
0
  ge_p2_dbl(&r, &s);
843
0
  x25519_ge_p1p1_to_p2(&s, &r);
844
0
  ge_p2_dbl(&r, &s);
845
0
  x25519_ge_p1p1_to_p3(h, &r);
846
847
0
  for (i = 0; i < 64; i += 2) {
848
0
    table_select(&t, i / 2, e[i]);
849
0
    ge_madd(&r, h, &t);
850
0
    x25519_ge_p1p1_to_p3(h, &r);
851
0
  }
852
0
}
853
854
#endif
855
856
0
static void cmov_cached(ge_cached *t, ge_cached *u, uint8_t b) {
857
0
  fe_cmov(&t->YplusX, &u->YplusX, b);
858
0
  fe_cmov(&t->YminusX, &u->YminusX, b);
859
0
  fe_cmov(&t->Z, &u->Z, b);
860
0
  fe_cmov(&t->T2d, &u->T2d, b);
861
0
}
862
863
// r = scalar * A.
864
// where a = a[0]+256*a[1]+...+256^31 a[31].
865
0
void x25519_ge_scalarmult(ge_p2 *r, const uint8_t *scalar, const ge_p3 *A) {
866
0
  ge_p2 Ai_p2[8];
867
0
  ge_cached Ai[16];
868
0
  ge_p1p1 t;
869
870
0
  ge_cached_0(&Ai[0]);
871
0
  x25519_ge_p3_to_cached(&Ai[1], A);
872
0
  ge_p3_to_p2(&Ai_p2[1], A);
873
874
0
  unsigned i;
875
0
  for (i = 2; i < 16; i += 2) {
876
0
    ge_p2_dbl(&t, &Ai_p2[i / 2]);
877
0
    ge_p1p1_to_cached(&Ai[i], &t);
878
0
    if (i < 8) {
879
0
      x25519_ge_p1p1_to_p2(&Ai_p2[i], &t);
880
0
    }
881
0
    x25519_ge_add(&t, A, &Ai[i]);
882
0
    ge_p1p1_to_cached(&Ai[i + 1], &t);
883
0
    if (i < 7) {
884
0
      x25519_ge_p1p1_to_p2(&Ai_p2[i + 1], &t);
885
0
    }
886
0
  }
887
888
0
  ge_p2_0(r);
889
0
  ge_p3 u;
890
891
0
  for (i = 0; i < 256; i += 4) {
892
0
    ge_p2_dbl(&t, r);
893
0
    x25519_ge_p1p1_to_p2(r, &t);
894
0
    ge_p2_dbl(&t, r);
895
0
    x25519_ge_p1p1_to_p2(r, &t);
896
0
    ge_p2_dbl(&t, r);
897
0
    x25519_ge_p1p1_to_p2(r, &t);
898
0
    ge_p2_dbl(&t, r);
899
0
    x25519_ge_p1p1_to_p3(&u, &t);
900
901
0
    uint8_t index = scalar[31 - i / 8];
902
0
    index >>= 4 - (i & 4);
903
0
    index &= 0xf;
904
905
0
    unsigned j;
906
0
    ge_cached selected;
907
0
    ge_cached_0(&selected);
908
0
    for (j = 0; j < 16; j++) {
909
0
      cmov_cached(&selected, &Ai[j], 1 & constant_time_eq_w(index, j));
910
0
    }
911
912
0
    x25519_ge_add(&t, &u, &selected);
913
0
    x25519_ge_p1p1_to_p2(r, &t);
914
0
  }
915
0
}
916
917
0
static void slide(signed char *r, const uint8_t *a) {
918
0
  int i;
919
0
  int b;
920
0
  int k;
921
922
0
  for (i = 0; i < 256; ++i) {
923
0
    r[i] = 1 & (a[i >> 3] >> (i & 7));
924
0
  }
925
926
0
  for (i = 0; i < 256; ++i) {
927
0
    if (r[i]) {
928
0
      for (b = 1; b <= 6 && i + b < 256; ++b) {
929
0
        if (r[i + b]) {
930
0
          if (r[i] + (r[i + b] << b) <= 15) {
931
0
            r[i] += r[i + b] << b;
932
0
            r[i + b] = 0;
933
0
          } else if (r[i] - (r[i + b] << b) >= -15) {
934
0
            r[i] -= r[i + b] << b;
935
0
            for (k = i + b; k < 256; ++k) {
936
0
              if (!r[k]) {
937
0
                r[k] = 1;
938
0
                break;
939
0
              }
940
0
              r[k] = 0;
941
0
            }
942
0
          } else {
943
0
            break;
944
0
          }
945
0
        }
946
0
      }
947
0
    }
948
0
  }
949
0
}
950
951
// r = a * A + b * B
952
// where a = a[0]+256*a[1]+...+256^31 a[31].
953
// and b = b[0]+256*b[1]+...+256^31 b[31].
954
// B is the Ed25519 base point (x,4/5) with x positive.
955
static void ge_double_scalarmult_vartime(ge_p2 *r, const uint8_t *a,
956
0
                                         const ge_p3 *A, const uint8_t *b) {
957
0
  signed char aslide[256];
958
0
  signed char bslide[256];
959
0
  ge_cached Ai[8];  // A,3A,5A,7A,9A,11A,13A,15A
960
0
  ge_p1p1 t;
961
0
  ge_p3 u;
962
0
  ge_p3 A2;
963
0
  int i;
964
965
0
  slide(aslide, a);
966
0
  slide(bslide, b);
967
968
0
  x25519_ge_p3_to_cached(&Ai[0], A);
969
0
  ge_p3_dbl(&t, A);
970
0
  x25519_ge_p1p1_to_p3(&A2, &t);
971
0
  x25519_ge_add(&t, &A2, &Ai[0]);
972
0
  x25519_ge_p1p1_to_p3(&u, &t);
973
0
  x25519_ge_p3_to_cached(&Ai[1], &u);
974
0
  x25519_ge_add(&t, &A2, &Ai[1]);
975
0
  x25519_ge_p1p1_to_p3(&u, &t);
976
0
  x25519_ge_p3_to_cached(&Ai[2], &u);
977
0
  x25519_ge_add(&t, &A2, &Ai[2]);
978
0
  x25519_ge_p1p1_to_p3(&u, &t);
979
0
  x25519_ge_p3_to_cached(&Ai[3], &u);
980
0
  x25519_ge_add(&t, &A2, &Ai[3]);
981
0
  x25519_ge_p1p1_to_p3(&u, &t);
982
0
  x25519_ge_p3_to_cached(&Ai[4], &u);
983
0
  x25519_ge_add(&t, &A2, &Ai[4]);
984
0
  x25519_ge_p1p1_to_p3(&u, &t);
985
0
  x25519_ge_p3_to_cached(&Ai[5], &u);
986
0
  x25519_ge_add(&t, &A2, &Ai[5]);
987
0
  x25519_ge_p1p1_to_p3(&u, &t);
988
0
  x25519_ge_p3_to_cached(&Ai[6], &u);
989
0
  x25519_ge_add(&t, &A2, &Ai[6]);
990
0
  x25519_ge_p1p1_to_p3(&u, &t);
991
0
  x25519_ge_p3_to_cached(&Ai[7], &u);
992
993
0
  ge_p2_0(r);
994
995
0
  for (i = 255; i >= 0; --i) {
996
0
    if (aslide[i] || bslide[i]) {
997
0
      break;
998
0
    }
999
0
  }
1000
1001
0
  for (; i >= 0; --i) {
1002
0
    ge_p2_dbl(&t, r);
1003
1004
0
    if (aslide[i] > 0) {
1005
0
      x25519_ge_p1p1_to_p3(&u, &t);
1006
0
      x25519_ge_add(&t, &u, &Ai[aslide[i] / 2]);
1007
0
    } else if (aslide[i] < 0) {
1008
0
      x25519_ge_p1p1_to_p3(&u, &t);
1009
0
      x25519_ge_sub(&t, &u, &Ai[(-aslide[i]) / 2]);
1010
0
    }
1011
1012
0
    if (bslide[i] > 0) {
1013
0
      x25519_ge_p1p1_to_p3(&u, &t);
1014
0
      ge_madd(&t, &u, &Bi[bslide[i] / 2]);
1015
0
    } else if (bslide[i] < 0) {
1016
0
      x25519_ge_p1p1_to_p3(&u, &t);
1017
0
      ge_msub(&t, &u, &Bi[(-bslide[i]) / 2]);
1018
0
    }
1019
1020
0
    x25519_ge_p1p1_to_p2(r, &t);
1021
0
  }
1022
0
}
1023
1024
// int64_lshift21 returns |a << 21| but is defined when shifting bits into the
1025
// sign bit. This works around a language flaw in C.
1026
0
static inline int64_t int64_lshift21(int64_t a) {
1027
0
  return (int64_t)((uint64_t)a << 21);
1028
0
}
1029
1030
// The set of scalars is \Z/l
1031
// where l = 2^252 + 27742317777372353535851937790883648493.
1032
1033
// Input:
1034
//   s[0]+256*s[1]+...+256^63*s[63] = s
1035
//
1036
// Output:
1037
//   s[0]+256*s[1]+...+256^31*s[31] = s mod l
1038
//   where l = 2^252 + 27742317777372353535851937790883648493.
1039
//   Overwrites s in place.
1040
0
void x25519_sc_reduce(uint8_t s[64]) {
1041
0
  int64_t s0 = 2097151 & load_3(s);
1042
0
  int64_t s1 = 2097151 & (load_4(s + 2) >> 5);
1043
0
  int64_t s2 = 2097151 & (load_3(s + 5) >> 2);
1044
0
  int64_t s3 = 2097151 & (load_4(s + 7) >> 7);
1045
0
  int64_t s4 = 2097151 & (load_4(s + 10) >> 4);
1046
0
  int64_t s5 = 2097151 & (load_3(s + 13) >> 1);
1047
0
  int64_t s6 = 2097151 & (load_4(s + 15) >> 6);
1048
0
  int64_t s7 = 2097151 & (load_3(s + 18) >> 3);
1049
0
  int64_t s8 = 2097151 & load_3(s + 21);
1050
0
  int64_t s9 = 2097151 & (load_4(s + 23) >> 5);
1051
0
  int64_t s10 = 2097151 & (load_3(s + 26) >> 2);
1052
0
  int64_t s11 = 2097151 & (load_4(s + 28) >> 7);
1053
0
  int64_t s12 = 2097151 & (load_4(s + 31) >> 4);
1054
0
  int64_t s13 = 2097151 & (load_3(s + 34) >> 1);
1055
0
  int64_t s14 = 2097151 & (load_4(s + 36) >> 6);
1056
0
  int64_t s15 = 2097151 & (load_3(s + 39) >> 3);
1057
0
  int64_t s16 = 2097151 & load_3(s + 42);
1058
0
  int64_t s17 = 2097151 & (load_4(s + 44) >> 5);
1059
0
  int64_t s18 = 2097151 & (load_3(s + 47) >> 2);
1060
0
  int64_t s19 = 2097151 & (load_4(s + 49) >> 7);
1061
0
  int64_t s20 = 2097151 & (load_4(s + 52) >> 4);
1062
0
  int64_t s21 = 2097151 & (load_3(s + 55) >> 1);
1063
0
  int64_t s22 = 2097151 & (load_4(s + 57) >> 6);
1064
0
  int64_t s23 = (load_4(s + 60) >> 3);
1065
0
  int64_t carry0;
1066
0
  int64_t carry1;
1067
0
  int64_t carry2;
1068
0
  int64_t carry3;
1069
0
  int64_t carry4;
1070
0
  int64_t carry5;
1071
0
  int64_t carry6;
1072
0
  int64_t carry7;
1073
0
  int64_t carry8;
1074
0
  int64_t carry9;
1075
0
  int64_t carry10;
1076
0
  int64_t carry11;
1077
0
  int64_t carry12;
1078
0
  int64_t carry13;
1079
0
  int64_t carry14;
1080
0
  int64_t carry15;
1081
0
  int64_t carry16;
1082
1083
0
  s11 += s23 * 666643;
1084
0
  s12 += s23 * 470296;
1085
0
  s13 += s23 * 654183;
1086
0
  s14 -= s23 * 997805;
1087
0
  s15 += s23 * 136657;
1088
0
  s16 -= s23 * 683901;
1089
0
  s23 = 0;
1090
1091
0
  s10 += s22 * 666643;
1092
0
  s11 += s22 * 470296;
1093
0
  s12 += s22 * 654183;
1094
0
  s13 -= s22 * 997805;
1095
0
  s14 += s22 * 136657;
1096
0
  s15 -= s22 * 683901;
1097
0
  s22 = 0;
1098
1099
0
  s9 += s21 * 666643;
1100
0
  s10 += s21 * 470296;
1101
0
  s11 += s21 * 654183;
1102
0
  s12 -= s21 * 997805;
1103
0
  s13 += s21 * 136657;
1104
0
  s14 -= s21 * 683901;
1105
0
  s21 = 0;
1106
1107
0
  s8 += s20 * 666643;
1108
0
  s9 += s20 * 470296;
1109
0
  s10 += s20 * 654183;
1110
0
  s11 -= s20 * 997805;
1111
0
  s12 += s20 * 136657;
1112
0
  s13 -= s20 * 683901;
1113
0
  s20 = 0;
1114
1115
0
  s7 += s19 * 666643;
1116
0
  s8 += s19 * 470296;
1117
0
  s9 += s19 * 654183;
1118
0
  s10 -= s19 * 997805;
1119
0
  s11 += s19 * 136657;
1120
0
  s12 -= s19 * 683901;
1121
0
  s19 = 0;
1122
1123
0
  s6 += s18 * 666643;
1124
0
  s7 += s18 * 470296;
1125
0
  s8 += s18 * 654183;
1126
0
  s9 -= s18 * 997805;
1127
0
  s10 += s18 * 136657;
1128
0
  s11 -= s18 * 683901;
1129
0
  s18 = 0;
1130
1131
0
  carry6 = (s6 + (1 << 20)) >> 21;
1132
0
  s7 += carry6;
1133
0
  s6 -= int64_lshift21(carry6);
1134
0
  carry8 = (s8 + (1 << 20)) >> 21;
1135
0
  s9 += carry8;
1136
0
  s8 -= int64_lshift21(carry8);
1137
0
  carry10 = (s10 + (1 << 20)) >> 21;
1138
0
  s11 += carry10;
1139
0
  s10 -= int64_lshift21(carry10);
1140
0
  carry12 = (s12 + (1 << 20)) >> 21;
1141
0
  s13 += carry12;
1142
0
  s12 -= int64_lshift21(carry12);
1143
0
  carry14 = (s14 + (1 << 20)) >> 21;
1144
0
  s15 += carry14;
1145
0
  s14 -= int64_lshift21(carry14);
1146
0
  carry16 = (s16 + (1 << 20)) >> 21;
1147
0
  s17 += carry16;
1148
0
  s16 -= int64_lshift21(carry16);
1149
1150
0
  carry7 = (s7 + (1 << 20)) >> 21;
1151
0
  s8 += carry7;
1152
0
  s7 -= int64_lshift21(carry7);
1153
0
  carry9 = (s9 + (1 << 20)) >> 21;
1154
0
  s10 += carry9;
1155
0
  s9 -= int64_lshift21(carry9);
1156
0
  carry11 = (s11 + (1 << 20)) >> 21;
1157
0
  s12 += carry11;
1158
0
  s11 -= int64_lshift21(carry11);
1159
0
  carry13 = (s13 + (1 << 20)) >> 21;
1160
0
  s14 += carry13;
1161
0
  s13 -= int64_lshift21(carry13);
1162
0
  carry15 = (s15 + (1 << 20)) >> 21;
1163
0
  s16 += carry15;
1164
0
  s15 -= int64_lshift21(carry15);
1165
1166
0
  s5 += s17 * 666643;
1167
0
  s6 += s17 * 470296;
1168
0
  s7 += s17 * 654183;
1169
0
  s8 -= s17 * 997805;
1170
0
  s9 += s17 * 136657;
1171
0
  s10 -= s17 * 683901;
1172
0
  s17 = 0;
1173
1174
0
  s4 += s16 * 666643;
1175
0
  s5 += s16 * 470296;
1176
0
  s6 += s16 * 654183;
1177
0
  s7 -= s16 * 997805;
1178
0
  s8 += s16 * 136657;
1179
0
  s9 -= s16 * 683901;
1180
0
  s16 = 0;
1181
1182
0
  s3 += s15 * 666643;
1183
0
  s4 += s15 * 470296;
1184
0
  s5 += s15 * 654183;
1185
0
  s6 -= s15 * 997805;
1186
0
  s7 += s15 * 136657;
1187
0
  s8 -= s15 * 683901;
1188
0
  s15 = 0;
1189
1190
0
  s2 += s14 * 666643;
1191
0
  s3 += s14 * 470296;
1192
0
  s4 += s14 * 654183;
1193
0
  s5 -= s14 * 997805;
1194
0
  s6 += s14 * 136657;
1195
0
  s7 -= s14 * 683901;
1196
0
  s14 = 0;
1197
1198
0
  s1 += s13 * 666643;
1199
0
  s2 += s13 * 470296;
1200
0
  s3 += s13 * 654183;
1201
0
  s4 -= s13 * 997805;
1202
0
  s5 += s13 * 136657;
1203
0
  s6 -= s13 * 683901;
1204
0
  s13 = 0;
1205
1206
0
  s0 += s12 * 666643;
1207
0
  s1 += s12 * 470296;
1208
0
  s2 += s12 * 654183;
1209
0
  s3 -= s12 * 997805;
1210
0
  s4 += s12 * 136657;
1211
0
  s5 -= s12 * 683901;
1212
0
  s12 = 0;
1213
1214
0
  carry0 = (s0 + (1 << 20)) >> 21;
1215
0
  s1 += carry0;
1216
0
  s0 -= int64_lshift21(carry0);
1217
0
  carry2 = (s2 + (1 << 20)) >> 21;
1218
0
  s3 += carry2;
1219
0
  s2 -= int64_lshift21(carry2);
1220
0
  carry4 = (s4 + (1 << 20)) >> 21;
1221
0
  s5 += carry4;
1222
0
  s4 -= int64_lshift21(carry4);
1223
0
  carry6 = (s6 + (1 << 20)) >> 21;
1224
0
  s7 += carry6;
1225
0
  s6 -= int64_lshift21(carry6);
1226
0
  carry8 = (s8 + (1 << 20)) >> 21;
1227
0
  s9 += carry8;
1228
0
  s8 -= int64_lshift21(carry8);
1229
0
  carry10 = (s10 + (1 << 20)) >> 21;
1230
0
  s11 += carry10;
1231
0
  s10 -= int64_lshift21(carry10);
1232
1233
0
  carry1 = (s1 + (1 << 20)) >> 21;
1234
0
  s2 += carry1;
1235
0
  s1 -= int64_lshift21(carry1);
1236
0
  carry3 = (s3 + (1 << 20)) >> 21;
1237
0
  s4 += carry3;
1238
0
  s3 -= int64_lshift21(carry3);
1239
0
  carry5 = (s5 + (1 << 20)) >> 21;
1240
0
  s6 += carry5;
1241
0
  s5 -= int64_lshift21(carry5);
1242
0
  carry7 = (s7 + (1 << 20)) >> 21;
1243
0
  s8 += carry7;
1244
0
  s7 -= int64_lshift21(carry7);
1245
0
  carry9 = (s9 + (1 << 20)) >> 21;
1246
0
  s10 += carry9;
1247
0
  s9 -= int64_lshift21(carry9);
1248
0
  carry11 = (s11 + (1 << 20)) >> 21;
1249
0
  s12 += carry11;
1250
0
  s11 -= int64_lshift21(carry11);
1251
1252
0
  s0 += s12 * 666643;
1253
0
  s1 += s12 * 470296;
1254
0
  s2 += s12 * 654183;
1255
0
  s3 -= s12 * 997805;
1256
0
  s4 += s12 * 136657;
1257
0
  s5 -= s12 * 683901;
1258
0
  s12 = 0;
1259
1260
0
  carry0 = s0 >> 21;
1261
0
  s1 += carry0;
1262
0
  s0 -= int64_lshift21(carry0);
1263
0
  carry1 = s1 >> 21;
1264
0
  s2 += carry1;
1265
0
  s1 -= int64_lshift21(carry1);
1266
0
  carry2 = s2 >> 21;
1267
0
  s3 += carry2;
1268
0
  s2 -= int64_lshift21(carry2);
1269
0
  carry3 = s3 >> 21;
1270
0
  s4 += carry3;
1271
0
  s3 -= int64_lshift21(carry3);
1272
0
  carry4 = s4 >> 21;
1273
0
  s5 += carry4;
1274
0
  s4 -= int64_lshift21(carry4);
1275
0
  carry5 = s5 >> 21;
1276
0
  s6 += carry5;
1277
0
  s5 -= int64_lshift21(carry5);
1278
0
  carry6 = s6 >> 21;
1279
0
  s7 += carry6;
1280
0
  s6 -= int64_lshift21(carry6);
1281
0
  carry7 = s7 >> 21;
1282
0
  s8 += carry7;
1283
0
  s7 -= int64_lshift21(carry7);
1284
0
  carry8 = s8 >> 21;
1285
0
  s9 += carry8;
1286
0
  s8 -= int64_lshift21(carry8);
1287
0
  carry9 = s9 >> 21;
1288
0
  s10 += carry9;
1289
0
  s9 -= int64_lshift21(carry9);
1290
0
  carry10 = s10 >> 21;
1291
0
  s11 += carry10;
1292
0
  s10 -= int64_lshift21(carry10);
1293
0
  carry11 = s11 >> 21;
1294
0
  s12 += carry11;
1295
0
  s11 -= int64_lshift21(carry11);
1296
1297
0
  s0 += s12 * 666643;
1298
0
  s1 += s12 * 470296;
1299
0
  s2 += s12 * 654183;
1300
0
  s3 -= s12 * 997805;
1301
0
  s4 += s12 * 136657;
1302
0
  s5 -= s12 * 683901;
1303
0
  s12 = 0;
1304
1305
0
  carry0 = s0 >> 21;
1306
0
  s1 += carry0;
1307
0
  s0 -= int64_lshift21(carry0);
1308
0
  carry1 = s1 >> 21;
1309
0
  s2 += carry1;
1310
0
  s1 -= int64_lshift21(carry1);
1311
0
  carry2 = s2 >> 21;
1312
0
  s3 += carry2;
1313
0
  s2 -= int64_lshift21(carry2);
1314
0
  carry3 = s3 >> 21;
1315
0
  s4 += carry3;
1316
0
  s3 -= int64_lshift21(carry3);
1317
0
  carry4 = s4 >> 21;
1318
0
  s5 += carry4;
1319
0
  s4 -= int64_lshift21(carry4);
1320
0
  carry5 = s5 >> 21;
1321
0
  s6 += carry5;
1322
0
  s5 -= int64_lshift21(carry5);
1323
0
  carry6 = s6 >> 21;
1324
0
  s7 += carry6;
1325
0
  s6 -= int64_lshift21(carry6);
1326
0
  carry7 = s7 >> 21;
1327
0
  s8 += carry7;
1328
0
  s7 -= int64_lshift21(carry7);
1329
0
  carry8 = s8 >> 21;
1330
0
  s9 += carry8;
1331
0
  s8 -= int64_lshift21(carry8);
1332
0
  carry9 = s9 >> 21;
1333
0
  s10 += carry9;
1334
0
  s9 -= int64_lshift21(carry9);
1335
0
  carry10 = s10 >> 21;
1336
0
  s11 += carry10;
1337
0
  s10 -= int64_lshift21(carry10);
1338
1339
0
  s[0] = s0 >> 0;
1340
0
  s[1] = s0 >> 8;
1341
0
  s[2] = (s0 >> 16) | (s1 << 5);
1342
0
  s[3] = s1 >> 3;
1343
0
  s[4] = s1 >> 11;
1344
0
  s[5] = (s1 >> 19) | (s2 << 2);
1345
0
  s[6] = s2 >> 6;
1346
0
  s[7] = (s2 >> 14) | (s3 << 7);
1347
0
  s[8] = s3 >> 1;
1348
0
  s[9] = s3 >> 9;
1349
0
  s[10] = (s3 >> 17) | (s4 << 4);
1350
0
  s[11] = s4 >> 4;
1351
0
  s[12] = s4 >> 12;
1352
0
  s[13] = (s4 >> 20) | (s5 << 1);
1353
0
  s[14] = s5 >> 7;
1354
0
  s[15] = (s5 >> 15) | (s6 << 6);
1355
0
  s[16] = s6 >> 2;
1356
0
  s[17] = s6 >> 10;
1357
0
  s[18] = (s6 >> 18) | (s7 << 3);
1358
0
  s[19] = s7 >> 5;
1359
0
  s[20] = s7 >> 13;
1360
0
  s[21] = s8 >> 0;
1361
0
  s[22] = s8 >> 8;
1362
0
  s[23] = (s8 >> 16) | (s9 << 5);
1363
0
  s[24] = s9 >> 3;
1364
0
  s[25] = s9 >> 11;
1365
0
  s[26] = (s9 >> 19) | (s10 << 2);
1366
0
  s[27] = s10 >> 6;
1367
0
  s[28] = (s10 >> 14) | (s11 << 7);
1368
0
  s[29] = s11 >> 1;
1369
0
  s[30] = s11 >> 9;
1370
0
  s[31] = s11 >> 17;
1371
0
}
1372
1373
// Input:
1374
//   a[0]+256*a[1]+...+256^31*a[31] = a
1375
//   b[0]+256*b[1]+...+256^31*b[31] = b
1376
//   c[0]+256*c[1]+...+256^31*c[31] = c
1377
//
1378
// Output:
1379
//   s[0]+256*s[1]+...+256^31*s[31] = (ab+c) mod l
1380
//   where l = 2^252 + 27742317777372353535851937790883648493.
1381
static void sc_muladd(uint8_t *s, const uint8_t *a, const uint8_t *b,
1382
0
                      const uint8_t *c) {
1383
0
  int64_t a0 = 2097151 & load_3(a);
1384
0
  int64_t a1 = 2097151 & (load_4(a + 2) >> 5);
1385
0
  int64_t a2 = 2097151 & (load_3(a + 5) >> 2);
1386
0
  int64_t a3 = 2097151 & (load_4(a + 7) >> 7);
1387
0
  int64_t a4 = 2097151 & (load_4(a + 10) >> 4);
1388
0
  int64_t a5 = 2097151 & (load_3(a + 13) >> 1);
1389
0
  int64_t a6 = 2097151 & (load_4(a + 15) >> 6);
1390
0
  int64_t a7 = 2097151 & (load_3(a + 18) >> 3);
1391
0
  int64_t a8 = 2097151 & load_3(a + 21);
1392
0
  int64_t a9 = 2097151 & (load_4(a + 23) >> 5);
1393
0
  int64_t a10 = 2097151 & (load_3(a + 26) >> 2);
1394
0
  int64_t a11 = (load_4(a + 28) >> 7);
1395
0
  int64_t b0 = 2097151 & load_3(b);
1396
0
  int64_t b1 = 2097151 & (load_4(b + 2) >> 5);
1397
0
  int64_t b2 = 2097151 & (load_3(b + 5) >> 2);
1398
0
  int64_t b3 = 2097151 & (load_4(b + 7) >> 7);
1399
0
  int64_t b4 = 2097151 & (load_4(b + 10) >> 4);
1400
0
  int64_t b5 = 2097151 & (load_3(b + 13) >> 1);
1401
0
  int64_t b6 = 2097151 & (load_4(b + 15) >> 6);
1402
0
  int64_t b7 = 2097151 & (load_3(b + 18) >> 3);
1403
0
  int64_t b8 = 2097151 & load_3(b + 21);
1404
0
  int64_t b9 = 2097151 & (load_4(b + 23) >> 5);
1405
0
  int64_t b10 = 2097151 & (load_3(b + 26) >> 2);
1406
0
  int64_t b11 = (load_4(b + 28) >> 7);
1407
0
  int64_t c0 = 2097151 & load_3(c);
1408
0
  int64_t c1 = 2097151 & (load_4(c + 2) >> 5);
1409
0
  int64_t c2 = 2097151 & (load_3(c + 5) >> 2);
1410
0
  int64_t c3 = 2097151 & (load_4(c + 7) >> 7);
1411
0
  int64_t c4 = 2097151 & (load_4(c + 10) >> 4);
1412
0
  int64_t c5 = 2097151 & (load_3(c + 13) >> 1);
1413
0
  int64_t c6 = 2097151 & (load_4(c + 15) >> 6);
1414
0
  int64_t c7 = 2097151 & (load_3(c + 18) >> 3);
1415
0
  int64_t c8 = 2097151 & load_3(c + 21);
1416
0
  int64_t c9 = 2097151 & (load_4(c + 23) >> 5);
1417
0
  int64_t c10 = 2097151 & (load_3(c + 26) >> 2);
1418
0
  int64_t c11 = (load_4(c + 28) >> 7);
1419
0
  int64_t s0;
1420
0
  int64_t s1;
1421
0
  int64_t s2;
1422
0
  int64_t s3;
1423
0
  int64_t s4;
1424
0
  int64_t s5;
1425
0
  int64_t s6;
1426
0
  int64_t s7;
1427
0
  int64_t s8;
1428
0
  int64_t s9;
1429
0
  int64_t s10;
1430
0
  int64_t s11;
1431
0
  int64_t s12;
1432
0
  int64_t s13;
1433
0
  int64_t s14;
1434
0
  int64_t s15;
1435
0
  int64_t s16;
1436
0
  int64_t s17;
1437
0
  int64_t s18;
1438
0
  int64_t s19;
1439
0
  int64_t s20;
1440
0
  int64_t s21;
1441
0
  int64_t s22;
1442
0
  int64_t s23;
1443
0
  int64_t carry0;
1444
0
  int64_t carry1;
1445
0
  int64_t carry2;
1446
0
  int64_t carry3;
1447
0
  int64_t carry4;
1448
0
  int64_t carry5;
1449
0
  int64_t carry6;
1450
0
  int64_t carry7;
1451
0
  int64_t carry8;
1452
0
  int64_t carry9;
1453
0
  int64_t carry10;
1454
0
  int64_t carry11;
1455
0
  int64_t carry12;
1456
0
  int64_t carry13;
1457
0
  int64_t carry14;
1458
0
  int64_t carry15;
1459
0
  int64_t carry16;
1460
0
  int64_t carry17;
1461
0
  int64_t carry18;
1462
0
  int64_t carry19;
1463
0
  int64_t carry20;
1464
0
  int64_t carry21;
1465
0
  int64_t carry22;
1466
1467
0
  s0 = c0 + a0 * b0;
1468
0
  s1 = c1 + a0 * b1 + a1 * b0;
1469
0
  s2 = c2 + a0 * b2 + a1 * b1 + a2 * b0;
1470
0
  s3 = c3 + a0 * b3 + a1 * b2 + a2 * b1 + a3 * b0;
1471
0
  s4 = c4 + a0 * b4 + a1 * b3 + a2 * b2 + a3 * b1 + a4 * b0;
1472
0
  s5 = c5 + a0 * b5 + a1 * b4 + a2 * b3 + a3 * b2 + a4 * b1 + a5 * b0;
1473
0
  s6 = c6 + a0 * b6 + a1 * b5 + a2 * b4 + a3 * b3 + a4 * b2 + a5 * b1 + a6 * b0;
1474
0
  s7 = c7 + a0 * b7 + a1 * b6 + a2 * b5 + a3 * b4 + a4 * b3 + a5 * b2 +
1475
0
       a6 * b1 + a7 * b0;
1476
0
  s8 = c8 + a0 * b8 + a1 * b7 + a2 * b6 + a3 * b5 + a4 * b4 + a5 * b3 +
1477
0
       a6 * b2 + a7 * b1 + a8 * b0;
1478
0
  s9 = c9 + a0 * b9 + a1 * b8 + a2 * b7 + a3 * b6 + a4 * b5 + a5 * b4 +
1479
0
       a6 * b3 + a7 * b2 + a8 * b1 + a9 * b0;
1480
0
  s10 = c10 + a0 * b10 + a1 * b9 + a2 * b8 + a3 * b7 + a4 * b6 + a5 * b5 +
1481
0
        a6 * b4 + a7 * b3 + a8 * b2 + a9 * b1 + a10 * b0;
1482
0
  s11 = c11 + a0 * b11 + a1 * b10 + a2 * b9 + a3 * b8 + a4 * b7 + a5 * b6 +
1483
0
        a6 * b5 + a7 * b4 + a8 * b3 + a9 * b2 + a10 * b1 + a11 * b0;
1484
0
  s12 = a1 * b11 + a2 * b10 + a3 * b9 + a4 * b8 + a5 * b7 + a6 * b6 + a7 * b5 +
1485
0
        a8 * b4 + a9 * b3 + a10 * b2 + a11 * b1;
1486
0
  s13 = a2 * b11 + a3 * b10 + a4 * b9 + a5 * b8 + a6 * b7 + a7 * b6 + a8 * b5 +
1487
0
        a9 * b4 + a10 * b3 + a11 * b2;
1488
0
  s14 = a3 * b11 + a4 * b10 + a5 * b9 + a6 * b8 + a7 * b7 + a8 * b6 + a9 * b5 +
1489
0
        a10 * b4 + a11 * b3;
1490
0
  s15 = a4 * b11 + a5 * b10 + a6 * b9 + a7 * b8 + a8 * b7 + a9 * b6 + a10 * b5 +
1491
0
        a11 * b4;
1492
0
  s16 = a5 * b11 + a6 * b10 + a7 * b9 + a8 * b8 + a9 * b7 + a10 * b6 + a11 * b5;
1493
0
  s17 = a6 * b11 + a7 * b10 + a8 * b9 + a9 * b8 + a10 * b7 + a11 * b6;
1494
0
  s18 = a7 * b11 + a8 * b10 + a9 * b9 + a10 * b8 + a11 * b7;
1495
0
  s19 = a8 * b11 + a9 * b10 + a10 * b9 + a11 * b8;
1496
0
  s20 = a9 * b11 + a10 * b10 + a11 * b9;
1497
0
  s21 = a10 * b11 + a11 * b10;
1498
0
  s22 = a11 * b11;
1499
0
  s23 = 0;
1500
1501
0
  carry0 = (s0 + (1 << 20)) >> 21;
1502
0
  s1 += carry0;
1503
0
  s0 -= int64_lshift21(carry0);
1504
0
  carry2 = (s2 + (1 << 20)) >> 21;
1505
0
  s3 += carry2;
1506
0
  s2 -= int64_lshift21(carry2);
1507
0
  carry4 = (s4 + (1 << 20)) >> 21;
1508
0
  s5 += carry4;
1509
0
  s4 -= int64_lshift21(carry4);
1510
0
  carry6 = (s6 + (1 << 20)) >> 21;
1511
0
  s7 += carry6;
1512
0
  s6 -= int64_lshift21(carry6);
1513
0
  carry8 = (s8 + (1 << 20)) >> 21;
1514
0
  s9 += carry8;
1515
0
  s8 -= int64_lshift21(carry8);
1516
0
  carry10 = (s10 + (1 << 20)) >> 21;
1517
0
  s11 += carry10;
1518
0
  s10 -= int64_lshift21(carry10);
1519
0
  carry12 = (s12 + (1 << 20)) >> 21;
1520
0
  s13 += carry12;
1521
0
  s12 -= int64_lshift21(carry12);
1522
0
  carry14 = (s14 + (1 << 20)) >> 21;
1523
0
  s15 += carry14;
1524
0
  s14 -= int64_lshift21(carry14);
1525
0
  carry16 = (s16 + (1 << 20)) >> 21;
1526
0
  s17 += carry16;
1527
0
  s16 -= int64_lshift21(carry16);
1528
0
  carry18 = (s18 + (1 << 20)) >> 21;
1529
0
  s19 += carry18;
1530
0
  s18 -= int64_lshift21(carry18);
1531
0
  carry20 = (s20 + (1 << 20)) >> 21;
1532
0
  s21 += carry20;
1533
0
  s20 -= int64_lshift21(carry20);
1534
0
  carry22 = (s22 + (1 << 20)) >> 21;
1535
0
  s23 += carry22;
1536
0
  s22 -= int64_lshift21(carry22);
1537
1538
0
  carry1 = (s1 + (1 << 20)) >> 21;
1539
0
  s2 += carry1;
1540
0
  s1 -= int64_lshift21(carry1);
1541
0
  carry3 = (s3 + (1 << 20)) >> 21;
1542
0
  s4 += carry3;
1543
0
  s3 -= int64_lshift21(carry3);
1544
0
  carry5 = (s5 + (1 << 20)) >> 21;
1545
0
  s6 += carry5;
1546
0
  s5 -= int64_lshift21(carry5);
1547
0
  carry7 = (s7 + (1 << 20)) >> 21;
1548
0
  s8 += carry7;
1549
0
  s7 -= int64_lshift21(carry7);
1550
0
  carry9 = (s9 + (1 << 20)) >> 21;
1551
0
  s10 += carry9;
1552
0
  s9 -= int64_lshift21(carry9);
1553
0
  carry11 = (s11 + (1 << 20)) >> 21;
1554
0
  s12 += carry11;
1555
0
  s11 -= int64_lshift21(carry11);
1556
0
  carry13 = (s13 + (1 << 20)) >> 21;
1557
0
  s14 += carry13;
1558
0
  s13 -= int64_lshift21(carry13);
1559
0
  carry15 = (s15 + (1 << 20)) >> 21;
1560
0
  s16 += carry15;
1561
0
  s15 -= int64_lshift21(carry15);
1562
0
  carry17 = (s17 + (1 << 20)) >> 21;
1563
0
  s18 += carry17;
1564
0
  s17 -= int64_lshift21(carry17);
1565
0
  carry19 = (s19 + (1 << 20)) >> 21;
1566
0
  s20 += carry19;
1567
0
  s19 -= int64_lshift21(carry19);
1568
0
  carry21 = (s21 + (1 << 20)) >> 21;
1569
0
  s22 += carry21;
1570
0
  s21 -= int64_lshift21(carry21);
1571
1572
0
  s11 += s23 * 666643;
1573
0
  s12 += s23 * 470296;
1574
0
  s13 += s23 * 654183;
1575
0
  s14 -= s23 * 997805;
1576
0
  s15 += s23 * 136657;
1577
0
  s16 -= s23 * 683901;
1578
0
  s23 = 0;
1579
1580
0
  s10 += s22 * 666643;
1581
0
  s11 += s22 * 470296;
1582
0
  s12 += s22 * 654183;
1583
0
  s13 -= s22 * 997805;
1584
0
  s14 += s22 * 136657;
1585
0
  s15 -= s22 * 683901;
1586
0
  s22 = 0;
1587
1588
0
  s9 += s21 * 666643;
1589
0
  s10 += s21 * 470296;
1590
0
  s11 += s21 * 654183;
1591
0
  s12 -= s21 * 997805;
1592
0
  s13 += s21 * 136657;
1593
0
  s14 -= s21 * 683901;
1594
0
  s21 = 0;
1595
1596
0
  s8 += s20 * 666643;
1597
0
  s9 += s20 * 470296;
1598
0
  s10 += s20 * 654183;
1599
0
  s11 -= s20 * 997805;
1600
0
  s12 += s20 * 136657;
1601
0
  s13 -= s20 * 683901;
1602
0
  s20 = 0;
1603
1604
0
  s7 += s19 * 666643;
1605
0
  s8 += s19 * 470296;
1606
0
  s9 += s19 * 654183;
1607
0
  s10 -= s19 * 997805;
1608
0
  s11 += s19 * 136657;
1609
0
  s12 -= s19 * 683901;
1610
0
  s19 = 0;
1611
1612
0
  s6 += s18 * 666643;
1613
0
  s7 += s18 * 470296;
1614
0
  s8 += s18 * 654183;
1615
0
  s9 -= s18 * 997805;
1616
0
  s10 += s18 * 136657;
1617
0
  s11 -= s18 * 683901;
1618
0
  s18 = 0;
1619
1620
0
  carry6 = (s6 + (1 << 20)) >> 21;
1621
0
  s7 += carry6;
1622
0
  s6 -= int64_lshift21(carry6);
1623
0
  carry8 = (s8 + (1 << 20)) >> 21;
1624
0
  s9 += carry8;
1625
0
  s8 -= int64_lshift21(carry8);
1626
0
  carry10 = (s10 + (1 << 20)) >> 21;
1627
0
  s11 += carry10;
1628
0
  s10 -= int64_lshift21(carry10);
1629
0
  carry12 = (s12 + (1 << 20)) >> 21;
1630
0
  s13 += carry12;
1631
0
  s12 -= int64_lshift21(carry12);
1632
0
  carry14 = (s14 + (1 << 20)) >> 21;
1633
0
  s15 += carry14;
1634
0
  s14 -= int64_lshift21(carry14);
1635
0
  carry16 = (s16 + (1 << 20)) >> 21;
1636
0
  s17 += carry16;
1637
0
  s16 -= int64_lshift21(carry16);
1638
1639
0
  carry7 = (s7 + (1 << 20)) >> 21;
1640
0
  s8 += carry7;
1641
0
  s7 -= int64_lshift21(carry7);
1642
0
  carry9 = (s9 + (1 << 20)) >> 21;
1643
0
  s10 += carry9;
1644
0
  s9 -= int64_lshift21(carry9);
1645
0
  carry11 = (s11 + (1 << 20)) >> 21;
1646
0
  s12 += carry11;
1647
0
  s11 -= int64_lshift21(carry11);
1648
0
  carry13 = (s13 + (1 << 20)) >> 21;
1649
0
  s14 += carry13;
1650
0
  s13 -= int64_lshift21(carry13);
1651
0
  carry15 = (s15 + (1 << 20)) >> 21;
1652
0
  s16 += carry15;
1653
0
  s15 -= int64_lshift21(carry15);
1654
1655
0
  s5 += s17 * 666643;
1656
0
  s6 += s17 * 470296;
1657
0
  s7 += s17 * 654183;
1658
0
  s8 -= s17 * 997805;
1659
0
  s9 += s17 * 136657;
1660
0
  s10 -= s17 * 683901;
1661
0
  s17 = 0;
1662
1663
0
  s4 += s16 * 666643;
1664
0
  s5 += s16 * 470296;
1665
0
  s6 += s16 * 654183;
1666
0
  s7 -= s16 * 997805;
1667
0
  s8 += s16 * 136657;
1668
0
  s9 -= s16 * 683901;
1669
0
  s16 = 0;
1670
1671
0
  s3 += s15 * 666643;
1672
0
  s4 += s15 * 470296;
1673
0
  s5 += s15 * 654183;
1674
0
  s6 -= s15 * 997805;
1675
0
  s7 += s15 * 136657;
1676
0
  s8 -= s15 * 683901;
1677
0
  s15 = 0;
1678
1679
0
  s2 += s14 * 666643;
1680
0
  s3 += s14 * 470296;
1681
0
  s4 += s14 * 654183;
1682
0
  s5 -= s14 * 997805;
1683
0
  s6 += s14 * 136657;
1684
0
  s7 -= s14 * 683901;
1685
0
  s14 = 0;
1686
1687
0
  s1 += s13 * 666643;
1688
0
  s2 += s13 * 470296;
1689
0
  s3 += s13 * 654183;
1690
0
  s4 -= s13 * 997805;
1691
0
  s5 += s13 * 136657;
1692
0
  s6 -= s13 * 683901;
1693
0
  s13 = 0;
1694
1695
0
  s0 += s12 * 666643;
1696
0
  s1 += s12 * 470296;
1697
0
  s2 += s12 * 654183;
1698
0
  s3 -= s12 * 997805;
1699
0
  s4 += s12 * 136657;
1700
0
  s5 -= s12 * 683901;
1701
0
  s12 = 0;
1702
1703
0
  carry0 = (s0 + (1 << 20)) >> 21;
1704
0
  s1 += carry0;
1705
0
  s0 -= int64_lshift21(carry0);
1706
0
  carry2 = (s2 + (1 << 20)) >> 21;
1707
0
  s3 += carry2;
1708
0
  s2 -= int64_lshift21(carry2);
1709
0
  carry4 = (s4 + (1 << 20)) >> 21;
1710
0
  s5 += carry4;
1711
0
  s4 -= int64_lshift21(carry4);
1712
0
  carry6 = (s6 + (1 << 20)) >> 21;
1713
0
  s7 += carry6;
1714
0
  s6 -= int64_lshift21(carry6);
1715
0
  carry8 = (s8 + (1 << 20)) >> 21;
1716
0
  s9 += carry8;
1717
0
  s8 -= int64_lshift21(carry8);
1718
0
  carry10 = (s10 + (1 << 20)) >> 21;
1719
0
  s11 += carry10;
1720
0
  s10 -= int64_lshift21(carry10);
1721
1722
0
  carry1 = (s1 + (1 << 20)) >> 21;
1723
0
  s2 += carry1;
1724
0
  s1 -= int64_lshift21(carry1);
1725
0
  carry3 = (s3 + (1 << 20)) >> 21;
1726
0
  s4 += carry3;
1727
0
  s3 -= int64_lshift21(carry3);
1728
0
  carry5 = (s5 + (1 << 20)) >> 21;
1729
0
  s6 += carry5;
1730
0
  s5 -= int64_lshift21(carry5);
1731
0
  carry7 = (s7 + (1 << 20)) >> 21;
1732
0
  s8 += carry7;
1733
0
  s7 -= int64_lshift21(carry7);
1734
0
  carry9 = (s9 + (1 << 20)) >> 21;
1735
0
  s10 += carry9;
1736
0
  s9 -= int64_lshift21(carry9);
1737
0
  carry11 = (s11 + (1 << 20)) >> 21;
1738
0
  s12 += carry11;
1739
0
  s11 -= int64_lshift21(carry11);
1740
1741
0
  s0 += s12 * 666643;
1742
0
  s1 += s12 * 470296;
1743
0
  s2 += s12 * 654183;
1744
0
  s3 -= s12 * 997805;
1745
0
  s4 += s12 * 136657;
1746
0
  s5 -= s12 * 683901;
1747
0
  s12 = 0;
1748
1749
0
  carry0 = s0 >> 21;
1750
0
  s1 += carry0;
1751
0
  s0 -= int64_lshift21(carry0);
1752
0
  carry1 = s1 >> 21;
1753
0
  s2 += carry1;
1754
0
  s1 -= int64_lshift21(carry1);
1755
0
  carry2 = s2 >> 21;
1756
0
  s3 += carry2;
1757
0
  s2 -= int64_lshift21(carry2);
1758
0
  carry3 = s3 >> 21;
1759
0
  s4 += carry3;
1760
0
  s3 -= int64_lshift21(carry3);
1761
0
  carry4 = s4 >> 21;
1762
0
  s5 += carry4;
1763
0
  s4 -= int64_lshift21(carry4);
1764
0
  carry5 = s5 >> 21;
1765
0
  s6 += carry5;
1766
0
  s5 -= int64_lshift21(carry5);
1767
0
  carry6 = s6 >> 21;
1768
0
  s7 += carry6;
1769
0
  s6 -= int64_lshift21(carry6);
1770
0
  carry7 = s7 >> 21;
1771
0
  s8 += carry7;
1772
0
  s7 -= int64_lshift21(carry7);
1773
0
  carry8 = s8 >> 21;
1774
0
  s9 += carry8;
1775
0
  s8 -= int64_lshift21(carry8);
1776
0
  carry9 = s9 >> 21;
1777
0
  s10 += carry9;
1778
0
  s9 -= int64_lshift21(carry9);
1779
0
  carry10 = s10 >> 21;
1780
0
  s11 += carry10;
1781
0
  s10 -= int64_lshift21(carry10);
1782
0
  carry11 = s11 >> 21;
1783
0
  s12 += carry11;
1784
0
  s11 -= int64_lshift21(carry11);
1785
1786
0
  s0 += s12 * 666643;
1787
0
  s1 += s12 * 470296;
1788
0
  s2 += s12 * 654183;
1789
0
  s3 -= s12 * 997805;
1790
0
  s4 += s12 * 136657;
1791
0
  s5 -= s12 * 683901;
1792
0
  s12 = 0;
1793
1794
0
  carry0 = s0 >> 21;
1795
0
  s1 += carry0;
1796
0
  s0 -= int64_lshift21(carry0);
1797
0
  carry1 = s1 >> 21;
1798
0
  s2 += carry1;
1799
0
  s1 -= int64_lshift21(carry1);
1800
0
  carry2 = s2 >> 21;
1801
0
  s3 += carry2;
1802
0
  s2 -= int64_lshift21(carry2);
1803
0
  carry3 = s3 >> 21;
1804
0
  s4 += carry3;
1805
0
  s3 -= int64_lshift21(carry3);
1806
0
  carry4 = s4 >> 21;
1807
0
  s5 += carry4;
1808
0
  s4 -= int64_lshift21(carry4);
1809
0
  carry5 = s5 >> 21;
1810
0
  s6 += carry5;
1811
0
  s5 -= int64_lshift21(carry5);
1812
0
  carry6 = s6 >> 21;
1813
0
  s7 += carry6;
1814
0
  s6 -= int64_lshift21(carry6);
1815
0
  carry7 = s7 >> 21;
1816
0
  s8 += carry7;
1817
0
  s7 -= int64_lshift21(carry7);
1818
0
  carry8 = s8 >> 21;
1819
0
  s9 += carry8;
1820
0
  s8 -= int64_lshift21(carry8);
1821
0
  carry9 = s9 >> 21;
1822
0
  s10 += carry9;
1823
0
  s9 -= int64_lshift21(carry9);
1824
0
  carry10 = s10 >> 21;
1825
0
  s11 += carry10;
1826
0
  s10 -= int64_lshift21(carry10);
1827
1828
0
  s[0] = s0 >> 0;
1829
0
  s[1] = s0 >> 8;
1830
0
  s[2] = (s0 >> 16) | (s1 << 5);
1831
0
  s[3] = s1 >> 3;
1832
0
  s[4] = s1 >> 11;
1833
0
  s[5] = (s1 >> 19) | (s2 << 2);
1834
0
  s[6] = s2 >> 6;
1835
0
  s[7] = (s2 >> 14) | (s3 << 7);
1836
0
  s[8] = s3 >> 1;
1837
0
  s[9] = s3 >> 9;
1838
0
  s[10] = (s3 >> 17) | (s4 << 4);
1839
0
  s[11] = s4 >> 4;
1840
0
  s[12] = s4 >> 12;
1841
0
  s[13] = (s4 >> 20) | (s5 << 1);
1842
0
  s[14] = s5 >> 7;
1843
0
  s[15] = (s5 >> 15) | (s6 << 6);
1844
0
  s[16] = s6 >> 2;
1845
0
  s[17] = s6 >> 10;
1846
0
  s[18] = (s6 >> 18) | (s7 << 3);
1847
0
  s[19] = s7 >> 5;
1848
0
  s[20] = s7 >> 13;
1849
0
  s[21] = s8 >> 0;
1850
0
  s[22] = s8 >> 8;
1851
0
  s[23] = (s8 >> 16) | (s9 << 5);
1852
0
  s[24] = s9 >> 3;
1853
0
  s[25] = s9 >> 11;
1854
0
  s[26] = (s9 >> 19) | (s10 << 2);
1855
0
  s[27] = s10 >> 6;
1856
0
  s[28] = (s10 >> 14) | (s11 << 7);
1857
0
  s[29] = s11 >> 1;
1858
0
  s[30] = s11 >> 9;
1859
0
  s[31] = s11 >> 17;
1860
0
}
1861
1862
0
void ED25519_keypair(uint8_t out_public_key[32], uint8_t out_private_key[64]) {
1863
0
  uint8_t seed[32];
1864
0
  RAND_bytes(seed, 32);
1865
0
  ED25519_keypair_from_seed(out_public_key, out_private_key, seed);
1866
0
}
1867
1868
int ED25519_sign(uint8_t out_sig[64], const uint8_t *message,
1869
0
                 size_t message_len, const uint8_t private_key[64]) {
1870
  // NOTE: The documentation on this function says that it returns zero on
1871
  // allocation failure. While that can't happen with the current
1872
  // implementation, we want to reserve the ability to allocate in this
1873
  // implementation in the future.
1874
1875
0
  uint8_t az[SHA512_DIGEST_LENGTH];
1876
0
  SHA512(private_key, 32, az);
1877
1878
0
  az[0] &= 248;
1879
0
  az[31] &= 63;
1880
0
  az[31] |= 64;
1881
1882
0
  SHA512_CTX hash_ctx;
1883
0
  SHA512_Init(&hash_ctx);
1884
0
  SHA512_Update(&hash_ctx, az + 32, 32);
1885
0
  SHA512_Update(&hash_ctx, message, message_len);
1886
0
  uint8_t nonce[SHA512_DIGEST_LENGTH];
1887
0
  SHA512_Final(nonce, &hash_ctx);
1888
1889
0
  x25519_sc_reduce(nonce);
1890
0
  ge_p3 R;
1891
0
  x25519_ge_scalarmult_base(&R, nonce);
1892
0
  ge_p3_tobytes(out_sig, &R);
1893
1894
0
  SHA512_Init(&hash_ctx);
1895
0
  SHA512_Update(&hash_ctx, out_sig, 32);
1896
0
  SHA512_Update(&hash_ctx, private_key + 32, 32);
1897
0
  SHA512_Update(&hash_ctx, message, message_len);
1898
0
  uint8_t hram[SHA512_DIGEST_LENGTH];
1899
0
  SHA512_Final(hram, &hash_ctx);
1900
1901
0
  x25519_sc_reduce(hram);
1902
0
  sc_muladd(out_sig + 32, hram, az, nonce);
1903
1904
  // The signature is computed from the private key, but is public.
1905
0
  CONSTTIME_DECLASSIFY(out_sig, 64);
1906
0
  return 1;
1907
0
}
1908
1909
int ED25519_verify(const uint8_t *message, size_t message_len,
1910
0
                   const uint8_t signature[64], const uint8_t public_key[32]) {
1911
0
  ge_p3 A;
1912
0
  if ((signature[63] & 224) != 0 ||
1913
0
      !x25519_ge_frombytes_vartime(&A, public_key)) {
1914
0
    return 0;
1915
0
  }
1916
1917
0
  fe_loose t;
1918
0
  fe_neg(&t, &A.X);
1919
0
  fe_carry(&A.X, &t);
1920
0
  fe_neg(&t, &A.T);
1921
0
  fe_carry(&A.T, &t);
1922
1923
0
  uint8_t pkcopy[32];
1924
0
  OPENSSL_memcpy(pkcopy, public_key, 32);
1925
0
  uint8_t rcopy[32];
1926
0
  OPENSSL_memcpy(rcopy, signature, 32);
1927
0
  uint8_t scopy[32];
1928
0
  OPENSSL_memcpy(scopy, signature + 32, 32);
1929
1930
  // https://tools.ietf.org/html/rfc8032#section-5.1.7 requires that s be in
1931
  // the range [0, order) in order to prevent signature malleability.
1932
1933
  // kOrder is the order of Curve25519 in little-endian form.
1934
0
  static const uint64_t kOrder[4] = {
1935
0
      UINT64_C(0x5812631a5cf5d3ed),
1936
0
      UINT64_C(0x14def9dea2f79cd6),
1937
0
      0,
1938
0
      UINT64_C(0x1000000000000000),
1939
0
  };
1940
0
  for (size_t i = 3;; i--) {
1941
0
    uint64_t word = CRYPTO_load_u64_le(scopy + i * 8);
1942
0
    if (word > kOrder[i]) {
1943
0
      return 0;
1944
0
    } else if (word < kOrder[i]) {
1945
0
      break;
1946
0
    } else if (i == 0) {
1947
0
      return 0;
1948
0
    }
1949
0
  }
1950
1951
0
  SHA512_CTX hash_ctx;
1952
0
  SHA512_Init(&hash_ctx);
1953
0
  SHA512_Update(&hash_ctx, signature, 32);
1954
0
  SHA512_Update(&hash_ctx, public_key, 32);
1955
0
  SHA512_Update(&hash_ctx, message, message_len);
1956
0
  uint8_t h[SHA512_DIGEST_LENGTH];
1957
0
  SHA512_Final(h, &hash_ctx);
1958
1959
0
  x25519_sc_reduce(h);
1960
1961
0
  ge_p2 R;
1962
0
  ge_double_scalarmult_vartime(&R, h, &A, scopy);
1963
1964
0
  uint8_t rcheck[32];
1965
0
  x25519_ge_tobytes(rcheck, &R);
1966
1967
0
  return CRYPTO_memcmp(rcheck, rcopy, sizeof(rcheck)) == 0;
1968
0
}
1969
1970
void ED25519_keypair_from_seed(uint8_t out_public_key[32],
1971
                               uint8_t out_private_key[64],
1972
32
                               const uint8_t seed[32]) {
1973
32
  uint8_t az[SHA512_DIGEST_LENGTH];
1974
32
  SHA512(seed, 32, az);
1975
1976
32
  az[0] &= 248;
1977
32
  az[31] &= 127;
1978
32
  az[31] |= 64;
1979
1980
32
  ge_p3 A;
1981
32
  x25519_ge_scalarmult_base(&A, az);
1982
32
  ge_p3_tobytes(out_public_key, &A);
1983
  // The public key is derived from the private key, but it is public.
1984
32
  CONSTTIME_DECLASSIFY(out_public_key, 32);
1985
1986
32
  OPENSSL_memcpy(out_private_key, seed, 32);
1987
32
  OPENSSL_memcpy(out_private_key + 32, out_public_key, 32);
1988
32
}
1989
1990
1991
static void x25519_scalar_mult_generic(uint8_t out[32],
1992
                                       const uint8_t scalar[32],
1993
0
                                       const uint8_t point[32]) {
1994
0
  fe x1, x2, z2, x3, z3, tmp0, tmp1;
1995
0
  fe_loose x2l, z2l, x3l, tmp0l, tmp1l;
1996
1997
0
  uint8_t e[32];
1998
0
  OPENSSL_memcpy(e, scalar, 32);
1999
0
  e[0] &= 248;
2000
0
  e[31] &= 127;
2001
0
  e[31] |= 64;
2002
2003
  // The following implementation was transcribed to Coq and proven to
2004
  // correspond to unary scalar multiplication in affine coordinates given that
2005
  // x1 != 0 is the x coordinate of some point on the curve. It was also checked
2006
  // in Coq that doing a ladderstep with x1 = x3 = 0 gives z2' = z3' = 0, and z2
2007
  // = z3 = 0 gives z2' = z3' = 0. The statement was quantified over the
2008
  // underlying field, so it applies to Curve25519 itself and the quadratic
2009
  // twist of Curve25519. It was not proven in Coq that prime-field arithmetic
2010
  // correctly simulates extension-field arithmetic on prime-field values.
2011
  // The decoding of the byte array representation of e was not considered.
2012
  // Specification of Montgomery curves in affine coordinates:
2013
  // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Spec/MontgomeryCurve.v#L27>
2014
  // Proof that these form a group that is isomorphic to a Weierstrass curve:
2015
  // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/AffineProofs.v#L35>
2016
  // Coq transcription and correctness proof of the loop (where scalarbits=255):
2017
  // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L118>
2018
  // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L278>
2019
  // preconditions: 0 <= e < 2^255 (not necessarily e < order), fe_invert(0) = 0
2020
0
  fe_frombytes(&x1, point);
2021
0
  fe_1(&x2);
2022
0
  fe_0(&z2);
2023
0
  fe_copy(&x3, &x1);
2024
0
  fe_1(&z3);
2025
2026
0
  unsigned swap = 0;
2027
0
  int pos;
2028
0
  for (pos = 254; pos >= 0; --pos) {
2029
    // loop invariant as of right before the test, for the case where x1 != 0:
2030
    //   pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 is nonzero
2031
    //   let r := e >> (pos+1) in the following equalities of projective points:
2032
    //   to_xz (r*P)     === if swap then (x3, z3) else (x2, z2)
2033
    //   to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3)
2034
    //   x1 is the nonzero x coordinate of the nonzero point (r*P-(r+1)*P)
2035
0
    unsigned b = 1 & (e[pos / 8] >> (pos & 7));
2036
0
    swap ^= b;
2037
0
    fe_cswap(&x2, &x3, swap);
2038
0
    fe_cswap(&z2, &z3, swap);
2039
0
    swap = b;
2040
    // Coq transcription of ladderstep formula (called from transcribed loop):
2041
    // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L89>
2042
    // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L131>
2043
    // x1 != 0
2044
    // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L217>
2045
    // x1  = 0
2046
    // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L147>
2047
0
    fe_sub(&tmp0l, &x3, &z3);
2048
0
    fe_sub(&tmp1l, &x2, &z2);
2049
0
    fe_add(&x2l, &x2, &z2);
2050
0
    fe_add(&z2l, &x3, &z3);
2051
0
    fe_mul_tll(&z3, &tmp0l, &x2l);
2052
0
    fe_mul_tll(&z2, &z2l, &tmp1l);
2053
0
    fe_sq_tl(&tmp0, &tmp1l);
2054
0
    fe_sq_tl(&tmp1, &x2l);
2055
0
    fe_add(&x3l, &z3, &z2);
2056
0
    fe_sub(&z2l, &z3, &z2);
2057
0
    fe_mul_ttt(&x2, &tmp1, &tmp0);
2058
0
    fe_sub(&tmp1l, &tmp1, &tmp0);
2059
0
    fe_sq_tl(&z2, &z2l);
2060
0
    fe_mul121666(&z3, &tmp1l);
2061
0
    fe_sq_tl(&x3, &x3l);
2062
0
    fe_add(&tmp0l, &tmp0, &z3);
2063
0
    fe_mul_ttt(&z3, &x1, &z2);
2064
0
    fe_mul_tll(&z2, &tmp1l, &tmp0l);
2065
0
  }
2066
  // here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3) else (x2, z2)
2067
0
  fe_cswap(&x2, &x3, swap);
2068
0
  fe_cswap(&z2, &z3, swap);
2069
2070
0
  fe_invert(&z2, &z2);
2071
0
  fe_mul_ttt(&x2, &x2, &z2);
2072
0
  fe_tobytes(out, &x2);
2073
0
}
2074
2075
static void x25519_scalar_mult(uint8_t out[32], const uint8_t scalar[32],
2076
26.3k
                               const uint8_t point[32]) {
2077
#if defined(BORINGSSL_X25519_NEON)
2078
  if (CRYPTO_is_NEON_capable()) {
2079
    x25519_NEON(out, scalar, point);
2080
    return;
2081
  }
2082
#elif defined(BORINGSSL_FE25519_ADX)
2083
26.3k
  if (CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable() &&
2084
26.3k
      CRYPTO_is_ADX_capable()) {
2085
26.3k
    x25519_scalar_mult_adx(out, scalar, point);
2086
26.3k
    return;
2087
26.3k
  }
2088
0
#endif
2089
2090
0
  x25519_scalar_mult_generic(out, scalar, point);
2091
0
}
2092
2093
158k
void X25519_keypair(uint8_t out_public_value[32], uint8_t out_private_key[32]) {
2094
158k
  RAND_bytes(out_private_key, 32);
2095
2096
  // All X25519 implementations should decode scalars correctly (see
2097
  // https://tools.ietf.org/html/rfc7748#section-5). However, if an
2098
  // implementation doesn't then it might interoperate with random keys a
2099
  // fraction of the time because they'll, randomly, happen to be correctly
2100
  // formed.
2101
  //
2102
  // Thus we do the opposite of the masking here to make sure that our private
2103
  // keys are never correctly masked and so, hopefully, any incorrect
2104
  // implementations are deterministically broken.
2105
  //
2106
  // This does not affect security because, although we're throwing away
2107
  // entropy, a valid implementation of scalarmult should throw away the exact
2108
  // same bits anyway.
2109
158k
  out_private_key[0] |= ~248;
2110
158k
  out_private_key[31] &= ~64;
2111
158k
  out_private_key[31] |= ~127;
2112
2113
158k
  X25519_public_from_private(out_public_value, out_private_key);
2114
158k
}
2115
2116
int X25519(uint8_t out_shared_key[32], const uint8_t private_key[32],
2117
26.3k
           const uint8_t peer_public_value[32]) {
2118
26.3k
  static const uint8_t kZeros[32] = {0};
2119
26.3k
  x25519_scalar_mult(out_shared_key, private_key, peer_public_value);
2120
  // The all-zero output results when the input is a point of small order.
2121
26.3k
  return constant_time_declassify_int(
2122
26.3k
             CRYPTO_memcmp(kZeros, out_shared_key, 32)) != 0;
2123
26.3k
}
2124
2125
void X25519_public_from_private(uint8_t out_public_value[32],
2126
170k
                                const uint8_t private_key[32]) {
2127
#if defined(BORINGSSL_X25519_NEON)
2128
  if (CRYPTO_is_NEON_capable()) {
2129
    static const uint8_t kMongomeryBasePoint[32] = {9};
2130
    x25519_NEON(out_public_value, private_key, kMongomeryBasePoint);
2131
    return;
2132
  }
2133
#endif
2134
2135
170k
  uint8_t e[32];
2136
170k
  OPENSSL_memcpy(e, private_key, 32);
2137
170k
  e[0] &= 248;
2138
170k
  e[31] &= 127;
2139
170k
  e[31] |= 64;
2140
2141
170k
  ge_p3 A;
2142
170k
  x25519_ge_scalarmult_base(&A, e);
2143
2144
  // We only need the u-coordinate of the curve25519 point. The map is
2145
  // u=(y+1)/(1-y). Since y=Y/Z, this gives u=(Z+Y)/(Z-Y).
2146
170k
  fe_loose zplusy, zminusy;
2147
170k
  fe zminusy_inv;
2148
170k
  fe_add(&zplusy, &A.Z, &A.Y);
2149
170k
  fe_sub(&zminusy, &A.Z, &A.Y);
2150
170k
  fe_loose_invert(&zminusy_inv, &zminusy);
2151
170k
  fe_mul_tlt(&zminusy_inv, &zplusy, &zminusy_inv);
2152
170k
  fe_tobytes(out_public_value, &zminusy_inv);
2153
170k
  CONSTTIME_DECLASSIFY(out_public_value, 32);
2154
170k
}