Coverage Report

Created: 2024-09-11 06:39

/src/trezor-firmware/crypto/bignum.c
Line
Count
Source (jump to first uncovered line)
1
/**
2
 * Copyright (c) 2013-2014 Tomas Dzetkulic
3
 * Copyright (c) 2013-2014 Pavol Rusnak
4
 * Copyright (c)      2015 Jochen Hoenicke
5
 * Copyright (c)      2016 Alex Beregszaszi
6
 *
7
 * Permission is hereby granted, free of charge, to any person obtaining
8
 * a copy of this software and associated documentation files (the "Software"),
9
 * to deal in the Software without restriction, including without limitation
10
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11
 * and/or sell copies of the Software, and to permit persons to whom the
12
 * Software is furnished to do so, subject to the following conditions:
13
 *
14
 * The above copyright notice and this permission notice shall be included
15
 * in all copies or substantial portions of the Software.
16
 *
17
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
21
 * OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23
 * OTHER DEALINGS IN THE SOFTWARE.
24
 */
25
26
#include "bignum.h"
27
28
#include <assert.h>
29
#include <stdint.h>
30
#include <stdio.h>
31
#include <string.h>
32
33
#include "memzero.h"
34
#include "script.h"
35
36
/*
37
 This library implements 256-bit numbers arithmetic.
38
39
 An unsigned 256-bit number is represented by a bignum256 structure, that is an
40
 array of nine 32-bit values called limbs. Limbs are digits of the number in
41
 the base 2**29 representation in the little endian order. This means that
42
   bignum256 x;
43
 represents the value
44
   sum([x[i] * 2**(29*i) for i in range(9)).
45
46
 A limb of a bignum256 is *normalized* iff it's less than 2**29.
47
 A bignum256 is *normalized* iff every its limb is normalized.
48
 A number is *fully reduced modulo p* iff it is less than p.
49
 A number is *partly reduced modulo p* iff is is less than 2*p.
50
 The number p is usually a prime number such that 2^256 - 2^224 <= p <= 2^256.
51
52
 All functions except bn_fast_mod expect that all their bignum256 inputs are
53
 normalized. (The function bn_fast_mod allows the input number to have the
54
 most significant limb unnormalized). All bignum256 outputs of all functions
55
 are guaranteed to be  normalized.
56
57
 A number can be partly reduced with bn_fast_mod, a partly reduced number can
58
 be fully reduced with bn_mod.
59
60
 A function has *constant control flow with regard to its argument* iff the
61
 order in which instructions of the function are executed doesn't depend on the
62
 value of the argument.
63
 A function has *constant memory access flow with regard to its argument* iff
64
 the memory addresses that are acessed and the order in which they are accessed
65
 don't depend on the value of the argument.
66
 A function *has contant control (memory access) flow* iff it has constant
67
 control (memory access) flow with regard to all its arguments.
68
69
 The following function has contant control flow with regard to its arugment
70
 n, however is doesn't have constant memory access flow with regard to it:
71
 void (int n, int *a) }
72
   a[0] = 0;
73
   a[n] = 0; // memory address reveals the value of n
74
 }
75
76
 Unless stated otherwise all functions are supposed to have both constant
77
 control flow and constant memory access flow.
78
 */
79
80
#define BN_MAX_DECIMAL_DIGITS \
81
0
  79  // floor(log(2**(LIMBS * BITS_PER_LIMB), 10)) + 1
82
83
// y = (bignum256) x
84
// Assumes x is normalized and x < 2**261 == 2**(BITS_PER_LIMB * LIMBS)
85
// Guarantees y is normalized
86
26.9M
void bn_copy_lower(const bignum512 *x, bignum256 *y) {
87
269M
  for (int i = 0; i < BN_LIMBS; i++) {
88
242M
    y->val[i] = x->val[i];
89
242M
  }
90
26.9M
}
91
92
// out_number = (bignum256) in_number
93
// Assumes in_number is a raw bigendian 256-bit number
94
// Guarantees out_number is normalized
95
25.2k
void bn_read_be(const uint8_t *in_number, bignum256 *out_number) {
96
25.2k
  uint32_t temp = 0;
97
98
227k
  for (int i = 0; i < BN_LIMBS - 1; i++) {
99
201k
    uint32_t limb = read_be(in_number + (BN_LIMBS - 2 - i) * 4);
100
101
201k
    temp |= limb << (BN_EXTRA_BITS * i);
102
201k
    out_number->val[i] = temp & BN_LIMB_MASK;
103
104
201k
    temp = limb >> (32 - BN_EXTRA_BITS * (i + 1));
105
201k
  }
106
107
25.2k
  out_number->val[BN_LIMBS - 1] = temp;
108
25.2k
}
109
110
// out_number = (bignum512) in_number
111
// Assumes in_number is a raw bigendian 512-bit number
112
// Guarantees out_number is normalized
113
0
void bn_read_be_512(const uint8_t *in_number, bignum512 *out_number) {
114
0
  bignum256 lower = {0}, upper = {0};
115
116
0
  bn_read_be(in_number + 32, &lower);
117
0
  bn_read_be(in_number, &upper);
118
119
0
  const int shift_length = BN_BITS_PER_LIMB * BN_LIMBS - 256;
120
0
  uint32_t shift = upper.val[0] & ((1 << shift_length) - 1);
121
0
  for (int i = 0; i < shift_length; i++) {
122
0
    bn_rshift(&upper);
123
0
  }
124
0
  lower.val[BN_LIMBS - 1] |= shift << (BN_BITS_PER_LIMB - shift_length);
125
126
0
  for (int i = 0; i < BN_LIMBS; i++) {
127
0
    out_number->val[i] = lower.val[i];
128
0
  }
129
0
  for (int i = 0; i < BN_LIMBS; i++) {
130
0
    out_number->val[i + BN_LIMBS] = upper.val[i];
131
0
  }
132
0
}
133
134
// out_number = (256BE) in_number
135
// Assumes in_number < 2**256
136
// Guarantess out_number is a raw bigendian 256-bit number
137
13.7k
void bn_write_be(const bignum256 *in_number, uint8_t *out_number) {
138
13.7k
  uint32_t temp = in_number->val[BN_LIMBS - 1];
139
123k
  for (int i = BN_LIMBS - 2; i >= 0; i--) {
140
110k
    uint32_t limb = in_number->val[i];
141
142
110k
    temp = (temp << (BN_BITS_PER_LIMB - BN_EXTRA_BITS * i)) |
143
110k
           (limb >> (BN_EXTRA_BITS * i));
144
110k
    write_be(out_number + (BN_LIMBS - 2 - i) * 4, temp);
145
146
110k
    temp = limb;
147
110k
  }
148
13.7k
}
149
150
// out_number = (bignum256) in_number
151
// Assumes in_number is a raw little endian 256-bit number
152
// Guarantees out_number is normalized
153
0
void bn_read_le(const uint8_t *in_number, bignum256 *out_number) {
154
0
  uint32_t temp = 0;
155
0
  for (int i = 0; i < BN_LIMBS - 1; i++) {
156
0
    uint32_t limb = read_le(in_number + i * 4);
157
158
0
    temp |= limb << (BN_EXTRA_BITS * i);
159
0
    out_number->val[i] = temp & BN_LIMB_MASK;
160
0
    temp = limb >> (32 - BN_EXTRA_BITS * (i + 1));
161
0
  }
162
163
0
  out_number->val[BN_LIMBS - 1] = temp;
164
0
}
165
166
// out_number = (256LE) in_number
167
// Assumes in_number < 2**256
168
// Guarantess out_number is a raw little endian 256-bit number
169
0
void bn_write_le(const bignum256 *in_number, uint8_t *out_number) {
170
0
  uint32_t temp = in_number->val[BN_LIMBS - 1];
171
172
0
  for (int i = BN_LIMBS - 2; i >= 0; i--) {
173
0
    uint32_t limb = in_number->val[i];
174
0
    temp = (temp << (BN_BITS_PER_LIMB - BN_EXTRA_BITS * i)) |
175
0
           (limb >> (BN_EXTRA_BITS * i));
176
0
    write_le(out_number + i * 4, temp);
177
0
    temp = limb;
178
0
  }
179
0
}
180
181
// out_number = (bignum256) in_number
182
// Guarantees out_number is normalized
183
0
void bn_read_uint32(uint32_t in_number, bignum256 *out_number) {
184
0
  out_number->val[0] = in_number & BN_LIMB_MASK;
185
0
  out_number->val[1] = in_number >> BN_BITS_PER_LIMB;
186
0
  for (uint32_t i = 2; i < BN_LIMBS; i++) out_number->val[i] = 0;
187
0
}
188
189
// out_number = (bignum256) in_number
190
// Guarantees out_number is normalized
191
0
void bn_read_uint64(uint64_t in_number, bignum256 *out_number) {
192
0
  out_number->val[0] = in_number & BN_LIMB_MASK;
193
0
  out_number->val[1] = (in_number >>= BN_BITS_PER_LIMB) & BN_LIMB_MASK;
194
0
  out_number->val[2] = in_number >> BN_BITS_PER_LIMB;
195
0
  for (uint32_t i = 3; i < BN_LIMBS; i++) out_number->val[i] = 0;
196
0
}
197
198
// Returns the bitsize of x
199
// Assumes x is normalized
200
// The function doesn't have neither constant control flow nor constant memory
201
//   access flow
202
1.21k
int bn_bitcount(const bignum256 *x) {
203
1.21k
  for (int i = BN_LIMBS - 1; i >= 0; i--) {
204
1.21k
    uint32_t limb = x->val[i];
205
1.21k
    if (limb != 0) {
206
      // __builtin_clz returns the number of leading zero bits starting at the
207
      // most significant bit position
208
1.21k
      return i * BN_BITS_PER_LIMB + (32 - __builtin_clz(limb));
209
1.21k
    }
210
1.21k
  }
211
0
  return 0;
212
1.21k
}
213
214
// Returns the number of decimal digits of x; if x is 0, returns 1
215
// Assumes x is normalized
216
// The function doesn't have neither constant control flow nor constant memory
217
//   access flow
218
0
unsigned int bn_digitcount(const bignum256 *x) {
219
0
  bignum256 val = {0};
220
0
  bn_copy(x, &val);
221
222
0
  unsigned int digits = 1;
223
0
  for (unsigned int i = 0; i < BN_MAX_DECIMAL_DIGITS; i += 3) {
224
0
    uint32_t limb = 0;
225
226
0
    bn_divmod1000(&val, &limb);
227
228
0
    if (limb >= 100) {
229
0
      digits = i + 3;
230
0
    } else if (limb >= 10) {
231
0
      digits = i + 2;
232
0
    } else if (limb >= 1) {
233
0
      digits = i + 1;
234
0
    }
235
0
  }
236
237
0
  memzero(&val, sizeof(val));
238
239
0
  return digits;
240
0
}
241
242
// x = 0
243
// Guarantees x is normalized
244
168k
void bn_zero(bignum256 *x) {
245
1.68M
  for (int i = 0; i < BN_LIMBS; i++) {
246
1.51M
    x->val[i] = 0;
247
1.51M
  }
248
168k
}
249
250
// x = 1
251
// Guarantees x is normalized
252
84.0k
void bn_one(bignum256 *x) {
253
84.0k
  x->val[0] = 1;
254
756k
  for (int i = 1; i < BN_LIMBS; i++) {
255
672k
    x->val[i] = 0;
256
672k
  }
257
84.0k
}
258
259
// Returns x == 0
260
// Assumes x is normalized
261
257k
int bn_is_zero(const bignum256 *x) {
262
257k
  uint32_t result = 0;
263
2.57M
  for (int i = 0; i < BN_LIMBS; i++) {
264
2.31M
    result |= x->val[i];
265
2.31M
  }
266
257k
  return !result;
267
257k
}
268
269
// Returns x == 1
270
// Assumes x is normalized
271
43.7M
int bn_is_one(const bignum256 *x) {
272
43.7M
  uint32_t result = x->val[0] ^ 1;
273
393M
  for (int i = 1; i < BN_LIMBS; i++) {
274
350M
    result |= x->val[i];
275
350M
  }
276
43.7M
  return !result;
277
43.7M
}
278
279
// Returns x < y
280
// Assumes x, y are normalized
281
44.6M
int bn_is_less(const bignum256 *x, const bignum256 *y) {
282
44.6M
  uint32_t res1 = 0;
283
44.6M
  uint32_t res2 = 0;
284
446M
  for (int i = BN_LIMBS - 1; i >= 0; i--) {
285
401M
    res1 = (res1 << 1) | (x->val[i] < y->val[i]);
286
401M
    res2 = (res2 << 1) | (x->val[i] > y->val[i]);
287
401M
  }
288
44.6M
  return res1 > res2;
289
44.6M
}
290
291
// Returns x == y
292
// Assumes x, y are normalized
293
684k
int bn_is_equal(const bignum256 *x, const bignum256 *y) {
294
684k
  uint32_t result = 0;
295
6.84M
  for (int i = 0; i < BN_LIMBS; i++) {
296
6.15M
    result |= x->val[i] ^ y->val[i];
297
6.15M
  }
298
684k
  return !result;
299
684k
}
300
301
// res = cond if truecase else falsecase
302
// Assumes cond is either 0 or 1
303
// Works properly even if &res == &truecase or &res == &falsecase or
304
//   &truecase == &falsecase or &res == &truecase == &falsecase
305
void bn_cmov(bignum256 *res, volatile uint32_t cond, const bignum256 *truecase,
306
5.96M
             const bignum256 *falsecase) {
307
  // Intentional use of bitwise OR operator to ensure constant-time
308
5.96M
  assert((int)(cond == 1) | (int)(cond == 0));
309
310
5.96M
  uint32_t tmask = -cond;   // tmask = 0xFFFFFFFF if cond else 0x00000000
311
5.96M
  uint32_t fmask = ~tmask;  // fmask = 0x00000000 if cond else 0xFFFFFFFF
312
313
59.6M
  for (int i = 0; i < BN_LIMBS; i++) {
314
53.6M
    res->val[i] = (truecase->val[i] & tmask) | (falsecase->val[i] & fmask);
315
53.6M
  }
316
5.96M
}
317
318
// x = -x % prime if cond else x,
319
// Explicitly x = (3 * prime - x if x > prime else 2 * prime - x) if cond else
320
//   else (x if x > prime else x + prime)
321
// Assumes x is normalized and partly reduced
322
// Assumes cond is either 1 or 0
323
// Guarantees x is normalized
324
// Assumes prime is normalized and
325
//   0 < prime < 2**260 == 2**(BITS_PER_LIMB * LIMBS - 1)
326
562k
void bn_cnegate(volatile uint32_t cond, bignum256 *x, const bignum256 *prime) {
327
  // Intentional use of bitwise OR operator to ensure constant time
328
562k
  assert((int)(cond == 1) | (int)(cond == 0));
329
330
562k
  uint32_t tmask = -cond;   // tmask = 0xFFFFFFFF if cond else 0x00000000
331
562k
  uint32_t fmask = ~tmask;  // fmask = 0x00000000 if cond else 0xFFFFFFFF
332
333
562k
  bn_mod(x, prime);
334
  // x < prime
335
336
562k
  uint32_t acc1 = 1;
337
562k
  uint32_t acc2 = 0;
338
339
5.62M
  for (int i = 0; i < BN_LIMBS; i++) {
340
5.06M
    acc1 += (BN_BASE - 1) + 2 * prime->val[i] - x->val[i];
341
    // acc1 neither overflows 32 bits nor underflows 0
342
    // Proof:
343
    //   acc1 + (BASE - 1) + 2 * prime[i] - x[i]
344
    //     >= (BASE - 1) - x >= (2**BITS_PER_LIMB - 1) - (2**BITS_PER_LIMB - 1)
345
    //     == 0
346
    //   acc1 + (BASE - 1) + 2 * prime[i] - x[i]
347
    //     <= acc1 + (BASE - 1) + 2 * prime[i]
348
    //     <= (2**(32 - BITS_PER_LIMB) - 1) + 2 * (2**BITS_PER_LIMB - 1) +
349
    //       (2**BITS_PER_LIMB - 1)
350
    //     == 7 + 3 * 2**29 < 2**32
351
352
5.06M
    acc2 += prime->val[i] + x->val[i];
353
    // acc2 doesn't overflow 32 bits
354
    // Proof:
355
    //   acc2 + prime[i] + x[i]
356
    //     <= 2**(32 - BITS_PER_LIMB) - 1 + 2 * (2**BITS_PER_LIMB - 1)
357
    //     == 2**(32 - BITS_PER_LIMB) + 2**(BITS_PER_LIMB + 1) - 2
358
    //     == 2**30 + 5 < 2**32
359
360
    // x = acc1 & LIMB_MASK if cond else acc2 & LIMB_MASK
361
5.06M
    x->val[i] = ((acc1 & tmask) | (acc2 & fmask)) & BN_LIMB_MASK;
362
363
5.06M
    acc1 >>= BN_BITS_PER_LIMB;
364
    // acc1 <= 7 == 2**(32 - BITS_PER_LIMB) - 1
365
    // acc1 == 2**(BITS_PER_LIMB * (i + 1)) + 2 * prime[:i + 1] - x[:i + 1]
366
    //   >> BITS_PER_LIMB * (i + 1)
367
368
5.06M
    acc2 >>= BN_BITS_PER_LIMB;
369
    // acc2 <= 7 == 2**(32 - BITS_PER_LIMB) - 1
370
    // acc2 == prime[:i + 1] + x[:i + 1] >> BITS_PER_LIMB * (i + 1)
371
5.06M
  }
372
373
  // assert(acc1 == 1); // assert prime <= 2**260
374
  // assert(acc2 == 0);
375
376
  // clang-format off
377
  // acc1 == 1
378
  // Proof:
379
  //   acc1 == 2**(BITS_PER_LIMB * LIMBS) + 2 * prime[:LIMBS] - x[:LIMBS] >> BITS_PER_LIMB * LIMBS
380
  //     == 2**(BITS_PER_LIMB * LIMBS) + 2 * prime - x >> BITS_PER_LIMB * LIMBS
381
  //     <= 2**(BITS_PER_LIMB * LIMBS) + 2 * prime >> BITS_PER_LIMB * LIMBS
382
  //     <= 2**(BITS_PER_LIMB * LIMBS) + 2 * (2**(BITS_PER_LIMB * LIMBS - 1) - 1) >> BITS_PER_LIMB * LIMBS
383
  //     <= 2**(BITS_PER_LIMB * LIMBS) + 2**(BITS_PER_LIMB * LIMBS) - 2 >> BITS_PER_LIMB * LIMBS
384
  //     == 1
385
386
  //   acc1 == 2**(BITS_PER_LIMB * LIMBS) + 2 * prime[:LIMBS] - x[:LIMBS] >> BITS_PER_LIMB * LIMBS
387
  //     == 2**(BITS_PER_LIMB * LIMBS) + 2 * prime - x >> BITS_PER_LIMB * LIMBS
388
  //     >= 2**(BITS_PER_LIMB * LIMBS) + 0 >> BITS_PER_LIMB * LIMBS
389
  //     == 1
390
391
  // acc2 == 0
392
  // Proof:
393
  //   acc2 == prime[:LIMBS] + x[:LIMBS] >> BITS_PER_LIMB * LIMBS
394
  //     == prime + x >> BITS_PER_LIMB * LIMBS
395
  //     <= 2 * prime - 1 >> BITS_PER_LIMB * LIMBS
396
  //     <= 2 * (2**(BITS_PER_LIMB * LIMBS - 1) - 1) - 1 >> 261
397
  //     == 2**(BITS_PER_LIMB * LIMBS) - 3 >> BITS_PER_LIMB * LIMBS
398
  //     == 0
399
  // clang-format on
400
562k
}
401
402
// x <<= 1
403
// Assumes x is normalized, x < 2**260 == 2**(LIMBS*BITS_PER_LIMB - 1)
404
// Guarantees x is normalized
405
45.9M
void bn_lshift(bignum256 *x) {
406
413M
  for (int i = BN_LIMBS - 1; i > 0; i--) {
407
367M
    x->val[i] = ((x->val[i] << 1) & BN_LIMB_MASK) |
408
367M
                (x->val[i - 1] >> (BN_BITS_PER_LIMB - 1));
409
367M
  }
410
45.9M
  x->val[0] = (x->val[0] << 1) & BN_LIMB_MASK;
411
45.9M
}
412
413
// x >>= 1, i.e. x = floor(x/2)
414
// Assumes x is normalized
415
// Guarantees x is normalized
416
// If x is partly reduced (fully reduced) modulo prime,
417
//   guarantess x will be partly reduced (fully reduced) modulo prime
418
43.7M
void bn_rshift(bignum256 *x) {
419
393M
  for (int i = 0; i < BN_LIMBS - 1; i++) {
420
350M
    x->val[i] =
421
350M
        (x->val[i] >> 1) | ((x->val[i + 1] & 1) << (BN_BITS_PER_LIMB - 1));
422
350M
  }
423
43.7M
  x->val[BN_LIMBS - 1] >>= 1;
424
43.7M
}
425
426
// Sets i-th least significant bit (counting from zero)
427
// Assumes x is normalized and 0 <= i < 261 == LIMBS*BITS_PER_LIMB
428
// Guarantees x is normalized
429
// The function has constant control flow but not constant memory access flow
430
//   with regard to i
431
0
void bn_setbit(bignum256 *x, uint16_t i) {
432
0
  assert(i < BN_LIMBS * BN_BITS_PER_LIMB);
433
0
  x->val[i / BN_BITS_PER_LIMB] |= (1u << (i % BN_BITS_PER_LIMB));
434
0
}
435
436
// clears i-th least significant bit (counting from zero)
437
// Assumes x is normalized and 0 <= i < 261 == LIMBS*BITS_PER_LIMB
438
// Guarantees x is normalized
439
// The function has constant control flow but not constant memory access flow
440
//   with regard to i
441
0
void bn_clearbit(bignum256 *x, uint16_t i) {
442
0
  assert(i < BN_LIMBS * BN_BITS_PER_LIMB);
443
0
  x->val[i / BN_BITS_PER_LIMB] &= ~(1u << (i % BN_BITS_PER_LIMB));
444
0
}
445
446
// returns i-th least significant bit (counting from zero)
447
// Assumes x is normalized and 0 <= i < 261 == LIMBS*BITS_PER_LIMB
448
// The function has constant control flow but not constant memory access flow
449
//   with regard to i
450
0
uint32_t bn_testbit(const bignum256 *x, uint16_t i) {
451
0
  assert(i < BN_LIMBS * BN_BITS_PER_LIMB);
452
0
  return (x->val[i / BN_BITS_PER_LIMB] >> (i % BN_BITS_PER_LIMB)) & 1;
453
0
}
454
455
// res = x ^ y
456
// Assumes x, y are normalized
457
// Guarantees res is normalized
458
// Works properly even if &res == &x or &res == &y or &res == &x == &y
459
0
void bn_xor(bignum256 *res, const bignum256 *x, const bignum256 *y) {
460
0
  for (int i = 0; i < BN_LIMBS; i++) {
461
0
    res->val[i] = x->val[i] ^ y->val[i];
462
0
  }
463
0
}
464
465
// x = x / 2 % prime
466
// Explicitly x = x / 2 if is_even(x) else (x + prime) / 2
467
// Assumes x is normalized, x + prime < 261 == LIMBS * BITS_PER_LIMB
468
// Guarantees x is normalized
469
// If x is partly reduced (fully reduced) modulo prime,
470
//   guarantess x will be partly reduced (fully reduced) modulo prime
471
// Assumes prime is an odd number and normalized
472
5.19M
void bn_mult_half(bignum256 *x, const bignum256 *prime) {
473
  // x = x / 2 if is_even(x) else (x + prime) / 2
474
475
5.19M
  uint32_t x_is_odd_mask =
476
5.19M
      -(x->val[0] & 1);  // x_is_odd_mask = 0xFFFFFFFF if is_odd(x) else 0
477
478
5.19M
  uint32_t acc = (x->val[0] + (prime->val[0] & x_is_odd_mask)) >> 1;
479
  // acc < 2**BITS_PER_LIMB
480
  // Proof:
481
  //   acc == x[0] + prime[0] & x_is_odd_mask >> 1
482
  //     <= (2**(BITS_PER_LIMB) - 1) + (2**(BITS_PER_LIMB) - 1) >> 1
483
  //     == 2**(BITS_PER_LIMB + 1) - 2 >> 1
484
  //     <  2**(BITS_PER_LIMB)
485
486
46.7M
  for (int i = 0; i < BN_LIMBS - 1; i++) {
487
41.5M
    uint32_t temp = (x->val[i + 1] + (prime->val[i + 1] & x_is_odd_mask));
488
    // temp < 2**(BITS_PER_LIMB + 1)
489
    // Proof:
490
    //   temp == x[i + 1] + val[i + 1] & x_is_odd_mask
491
    //     <= (2**(BITS_PER_LIMB) - 1) + (2**(BITS_PER_LIMB) - 1)
492
    //     <  2**(BITS_PER_LIMB + 1)
493
494
41.5M
    acc += (temp & 1) << (BN_BITS_PER_LIMB - 1);
495
    // acc doesn't overflow 32 bits
496
    // Proof:
497
    //   acc + (temp & 1 << BITS_PER_LIMB - 1)
498
    //     <= 2**(BITS_PER_LIMB + 1) + 2**(BITS_PER_LIMB - 1)
499
    //     <= 2**30 + 2**28 < 2**32
500
501
41.5M
    x->val[i] = acc & BN_LIMB_MASK;
502
41.5M
    acc >>= BN_BITS_PER_LIMB;
503
41.5M
    acc += temp >> 1;
504
    // acc < 2**(BITS_PER_LIMB + 1)
505
    // Proof:
506
    //   acc + (temp >> 1)
507
    //     <= (2**(32 - BITS_PER_LIMB) - 1) + (2**(BITS_PER_LIMB + 1) - 1 >> 1)
508
    //     == 7 + 2**(BITS_PER_LIMB) - 1 < 2**(BITS_PER_LIMB + 1)
509
510
    // acc == x[:i+2]+(prime[:i+2] & x_is_odd_mask) >> BITS_PER_LIMB * (i+1)
511
41.5M
  }
512
5.19M
  x->val[BN_LIMBS - 1] = acc;
513
514
  // assert(acc >> BITS_PER_LIMB == 0);
515
  // acc >> BITS_PER_LIMB == 0
516
  // Proof:
517
  //   acc
518
  //     == x[:LIMBS] + (prime[:LIMBS] & x_is_odd_mask) >> BITS_PER_LIMB*LIMBS
519
  //     == x + (prime & x_is_odd_mask) >> BITS_PER_LIMB * LIMBS
520
  //     <= x + prime >> BITS_PER_LIMB * LIMBS
521
  //     <= 2**(BITS_PER_LIMB * LIMBS) - 1 >> BITS_PER_LIMB * LIMBS
522
  //     == 0
523
5.19M
}
524
525
// x = x * k % prime
526
// Assumes x is normalized, 0 <= k <= 8 = 2**(32 - BITS_PER_LIMB)
527
// Assumes prime is normalized and 2^256 - 2^224 <= prime <= 2^256
528
// Guarantees x is normalized and partly reduced modulo prime
529
5.00M
void bn_mult_k(bignum256 *x, uint8_t k, const bignum256 *prime) {
530
5.00M
  assert(k <= 8);
531
532
50.0M
  for (int i = 0; i < BN_LIMBS; i++) {
533
45.0M
    x->val[i] = k * x->val[i];
534
    // x[i] doesn't overflow 32 bits
535
    // k * x[i] <= 2**(32 - BITS_PER_LIMB) * (2**BITS_PER_LIMB - 1)
536
    //   < 2**(32 - BITS_PER_LIMB) * 2**BITS_PER_LIMB == 2**32
537
45.0M
  }
538
539
5.00M
  bn_fast_mod(x, prime);
540
5.00M
}
541
542
// Reduces partly reduced x modulo prime
543
// Explicitly x = x if x < prime else x - prime
544
// Assumes x is partly reduced modulo prime
545
// Guarantees x is fully reduced modulo prime
546
// Assumes prime is nonzero and normalized
547
830k
void bn_mod(bignum256 *x, const bignum256 *prime) {
548
830k
  uint32_t x_less_prime = bn_is_less(x, prime);
549
550
830k
  bignum256 temp = {0};
551
830k
  bn_subtract(x, prime, &temp);
552
830k
  bn_cmov(x, x_less_prime, x, &temp);
553
554
830k
  memzero(&temp, sizeof(temp));
555
830k
}
556
557
// Auxiliary function for bn_multiply
558
// res = k * x
559
// Assumes k and x are normalized
560
// Guarantees res is normalized 18 digit little endian number in base 2**29
561
26.9M
void bn_multiply_long(const bignum256 *k, const bignum256 *x, bignum512 *res) {
562
  // Uses long multiplication in base 2**29, see
563
  // https://en.wikipedia.org/wiki/Multiplication_algorithm#Long_multiplication
564
565
26.9M
  uint64_t acc = 0;
566
567
  // compute lower half
568
269M
  for (int i = 0; i < BN_LIMBS; i++) {
569
1.45G
    for (int j = 0; j <= i; j++) {
570
1.21G
      acc += k->val[j] * (uint64_t)x->val[i - j];
571
      // acc doesn't overflow 64 bits
572
      // Proof:
573
      //   acc <= acc + sum([k[j] * x[i-j] for j in range(i)])
574
      //     <= (2**(64 - BITS_PER_LIMB) - 1) +
575
      //       LIMBS * (2**BITS_PER_LIMB - 1) * (2**BITS_PER_LIMB - 1)
576
      //     == (2**35 - 1) + 9 * (2**29 - 1) * (2**29 - 1)
577
      //     <= 2**35 + 9 * 2**58 < 2**64
578
1.21G
    }
579
580
242M
    res->val[i] = acc & BN_LIMB_MASK;
581
242M
    acc >>= BN_BITS_PER_LIMB;
582
    // acc <= 2**35 - 1 == 2**(64 - BITS_PER_LIMB) - 1
583
242M
  }
584
585
  // compute upper half
586
242M
  for (int i = BN_LIMBS; i < 2 * BN_LIMBS - 1; i++) {
587
1.18G
    for (int j = i - BN_LIMBS + 1; j < BN_LIMBS; j++) {
588
971M
      acc += k->val[j] * (uint64_t)x->val[i - j];
589
      // acc doesn't overflow 64 bits
590
      // Proof:
591
      //   acc <= acc + sum([k[j] * x[i-j] for j in range(i)])
592
      //     <= (2**(64 - BITS_PER_LIMB) - 1)
593
      //       LIMBS * (2**BITS_PER_LIMB - 1) * (2**BITS_PER_LIMB - 1)
594
      //     == (2**35 - 1) + 9 * (2**29 - 1) * (2**29 - 1)
595
      //     <= 2**35 + 9 * 2**58 < 2**64
596
971M
    }
597
598
215M
    res->val[i] = acc & (BN_BASE - 1);
599
215M
    acc >>= BN_BITS_PER_LIMB;
600
    // acc < 2**35 == 2**(64 - BITS_PER_LIMB)
601
215M
  }
602
603
26.9M
  res->val[2 * BN_LIMBS - 1] = acc;
604
26.9M
}
605
606
// Auxiliary function for bn_multiply
607
// Assumes 0 <= d <= 8 == LIMBS - 1
608
// Assumes res is normalized and res < 2**(256 + 29*d + 31)
609
// Guarantess res in normalized and res < 2 * prime * 2**(29*d)
610
// Assumes prime is normalized, 2**256 - 2**224 <= prime <= 2**256
611
void bn_multiply_reduce_step(bignum512 *res, const bignum256 *prime,
612
242M
                             uint32_t d) {
613
  // clang-format off
614
  // Computes res = res - (res // 2**(256 + BITS_PER_LIMB * d)) * prime * 2**(BITS_PER_LIMB * d)
615
616
  // res - (res // 2**(256 + BITS_PER_LIMB * d)) * prime * 2**(BITS_PER_LIMB * d) < 2 * prime * 2**(BITS_PER_LIMB * d)
617
  // Proof:
618
  //   res - res // (2**(256 + BITS_PER_LIMB * d)) * 2**(BITS_PER_LIMB * d) * prime
619
  //     == res - res // (2**(256 + BITS_PER_LIMB * d)) * 2**(BITS_PER_LIMB * d) * (2**256 - (2**256 - prime))
620
  //     == res - res // (2**(256 + BITS_PER_LIMB * d)) * 2**(BITS_PER_LIMB * d) * 2**256 + res // (2**(256 + BITS_PER_LIMB * d)) * 2**(BITS_PER_LIMB * d) * (2**256 - prime)
621
  //     == (res % 2**(256 + BITS_PER_LIMB * d)) + res // (2**256 + BITS_PER_LIMB * d) * 2**(BITS_PER_LIMB * d) * (2**256 - prime)
622
  //     <= (2**(256 + 29*d + 31) % 2**(256 + 29*d)) + (2**(256 + 29*d + 31) - 1) / (2**256 + 29*d) * 2**(29*d) * (2**256 - prime)
623
  //     <= 2**(256 + 29*d) + 2**(256 + 29*d + 31) / (2**256 + 29*d) * 2**(29*d) * (2**256 - prime)
624
  //     == 2**(256 + 29*d) + 2**31 * 2**(29*d) * (2**256 - prime)
625
  //     == 2**(29*d) * (2**256 + 2**31 * (2*256 - prime))
626
  //     <= 2**(29*d) * (2**256 + 2**31 * 2*224)
627
  //     <= 2**(29*d) * (2**256 + 2**255)
628
  //     <= 2**(29*d) * 2 * (2**256 - 2**224)
629
  //     <= 2 * prime * 2**(29*d)
630
  // clang-format on
631
632
242M
  uint32_t coef =
633
242M
      (res->val[d + BN_LIMBS - 1] >>
634
242M
       (256 - (BN_LIMBS - 1) * BN_BITS_PER_LIMB)) +
635
242M
      (res->val[d + BN_LIMBS] << ((BN_LIMBS * BN_BITS_PER_LIMB) - 256));
636
637
  // coef == res // 2**(256 + BITS_PER_LIMB * d)
638
639
  // coef <  2**31
640
  // Proof:
641
  //   coef == res // 2**(256 + BITS_PER_LIMB * d)
642
  //     <  2**(256 + 29 * d + 31) // 2**(256 + 29 * d)
643
  //     == 2**31
644
645
242M
  const int shift = 31;
646
242M
  uint64_t acc = 1ull << shift;
647
648
2.42G
  for (int i = 0; i < BN_LIMBS; i++) {
649
2.18G
    acc += (((uint64_t)(BN_BASE - 1)) << shift) + res->val[d + i] -
650
2.18G
           prime->val[i] * (uint64_t)coef;
651
    // acc neither overflow 64 bits nor underflow zero
652
    // Proof:
653
    //   acc + ((BASE - 1) << shift) + res[d + i] - prime[i] * coef
654
    //     >= ((BASE - 1) << shift) - prime[i] * coef
655
    //     == 2**shift * (2**BITS_PER_LIMB - 1) - (2**BITS_PER_LIMB - 1) *
656
    //       (2**31 - 1)
657
    //     == (2**shift - 2**31 + 1) * (2**BITS_PER_LIMB - 1)
658
    //     == (2**31 - 2**31 + 1) * (2**29 - 1)
659
    //     == 2**29 - 1 > 0
660
    //   acc + ((BASE - 1) << shift) + res[d + i] - prime[i] * coef
661
    //     <= acc + ((BASE - 1) << shift) + res[d+i]
662
    //     <= (2**(64 - BITS_PER_LIMB) - 1) + 2**shift * (2**BITS_PER_LIMB - 1)
663
    //       + (2*BITS_PER_LIMB - 1)
664
    //     == (2**(64 - BITS_PER_LIMB) - 1) + (2**shift + 1) *
665
    //       (2**BITS_PER_LIMB - 1)
666
    //     == (2**35 - 1) + (2**31 + 1) * (2**29 - 1)
667
    //     <= 2**35 + 2**60 + 2**29 < 2**64
668
669
2.18G
    res->val[d + i] = acc & BN_LIMB_MASK;
670
2.18G
    acc >>= BN_BITS_PER_LIMB;
671
    // acc <= 2**(64 - BITS_PER_LIMB) - 1 == 2**35 - 1
672
673
    // acc == (1 << BITS_PER_LIMB * (i + 1) + shift) + res[d : d + i + 1]
674
    //   - coef * prime[:i + 1] >> BITS_PER_LIMB * (i + 1)
675
2.18G
  }
676
677
  // acc += (((uint64_t)(BASE - 1)) << shift) + res[d + LIMBS];
678
  // acc >>= BITS_PER_LIMB;
679
  // assert(acc <= 1ul << shift);
680
681
  // clang-format off
682
  // acc == 1 << shift
683
  // Proof:
684
  //   acc
685
  //     == (1 << BITS_PER_LIMB * (LIMBS + 1) + shift) + res[d : d + LIMBS + 1] - coef * prime[:LIMBS] >> BITS_PER_LIMB * (LIMBS + 1)
686
  //     == (1 << BITS_PER_LIMB * (LIMBS + 1) + shift) + res[d : d + LIMBS + 1] - coef * prime >> BITS_PER_LIMB * (LIMBS + 1)
687
688
  //     == (1 << BITS_PER_LIMB * (LIMBS + 1) + shift) + (res[d : d + LIMBS + 1] - coef * prime) >> BITS_PER_LIMB * (LIMBS + 1)
689
  //     <= (1 << BITS_PER_LIMB * (LIMBS + 1) + shift) + (res[:d] + BASE**d * res[d : d + LIMBS + 1] - BASE**d * coef * prime)//BASE**d >> BITS_PER_LIMB * (LIMBS + 1)
690
  //     <= (1 << BITS_PER_LIMB * (LIMBS + 1) + shift) + (res - BASE**d * coef * prime) // BASE**d >> BITS_PER_LIMB * (LIMBS + 1)
691
  //     == (1 << BITS_PER_LIMB * (LIMBS + 1) + shift) + (2 * prime * BASE**d) // BASE**d >> BITS_PER_LIMB * (LIMBS + 1)
692
  //     <= (1 << 321) + 2 * 2**256 >> 290
693
  //     == 1 << 31 == 1 << shift
694
695
  //     == (1 << BITS_PER_LIMB * (LIMBS + 1) + shift) + res[d : d + LIMBS + 1] - coef * prime[:LIMBS + 1] >> BITS_PER_LIMB * (LIMBS + 1)
696
  //     >= (1 << BITS_PER_LIMB * (LIMBS + 1) + shift) + 0 >> BITS_PER_LIMB * (LIMBS + 1)
697
  //     == 1 << shift
698
  // clang-format on
699
700
242M
  res->val[d + BN_LIMBS] = 0;
701
242M
}
702
703
// Partly reduces x
704
// Assumes x in normalized and res < 2**519
705
// Guarantees x is normalized and partly reduced modulo prime
706
// Assumes prime is normalized, 2**256 - 2**224 <= prime <= 2**256
707
26.9M
void bn_reduce(bignum512 *x, const bignum256 *prime) {
708
269M
  for (int i = BN_LIMBS - 1; i >= 0; i--) {
709
    // res < 2**(256 + 29*i + 31)
710
    // Proof:
711
    //   if i == LIMBS - 1:
712
    //     res < 2**519
713
    //       == 2**(256 + 29 * 8 + 31)
714
    //       == 2**(256 + 29 * (LIMBS - 1) + 31)
715
    //   else:
716
    //     res < 2 * prime * 2**(29 * (i + 1))
717
    //       <= 2**256 * 2**(29*i + 29) < 2**(256 + 29*i + 31)
718
242M
    bn_multiply_reduce_step(x, prime, i);
719
242M
  }
720
26.9M
}
721
722
// x = k * x % prime
723
// Assumes k, x are normalized, k * x < 2**519
724
// Guarantees x is normalized and partly reduced modulo prime
725
// Assumes prime is normalized, 2**256 - 2**224 <= prime <= 2**256
726
26.9M
void bn_multiply(const bignum256 *k, bignum256 *x, const bignum256 *prime) {
727
26.9M
  bignum512 res = {0};
728
729
26.9M
  bn_multiply_long(k, x, &res);
730
26.9M
  bn_reduce(&res, prime);
731
26.9M
  bn_copy_lower(&res, x);
732
733
26.9M
  memzero(&res, sizeof(res));
734
26.9M
}
735
736
// Partly reduces x modulo prime
737
// Assumes limbs of x except the last (the most significant) one are normalized
738
// Assumes prime is normalized and 2^256 - 2^224 <= prime <= 2^256
739
// Guarantees x is normalized and partly reduced modulo prime
740
13.6M
void bn_fast_mod(bignum256 *x, const bignum256 *prime) {
741
  // Computes x = x - (x // 2**256) * prime
742
743
  // x < 2**((LIMBS - 1) * BITS_PER_LIMB + 32) == 2**264
744
745
  // x - (x // 2**256) * prime < 2 * prime
746
  // Proof:
747
  //   x - (x // 2**256) * prime
748
  //     == x - (x // 2**256) * (2**256 - (2**256 - prime))
749
  //     == x - ((x // 2**256) * 2**256) + (x // 2**256) * (2**256 - prime)
750
  //     == (x % prime) + (x // 2**256) * (2**256 - prime)
751
  //     <= prime - 1 + (2**264 // 2**256) * (2**256 - prime)
752
  //     <= 2**256 + 2**8 * 2**224 == 2**256 + 2**232
753
  //     <  2 * (2**256 - 2**224)
754
  //     <= 2 * prime
755
756
  // x - (x // 2**256 - 1) * prime < 2 * prime
757
  // Proof:
758
  //   x - (x // 2**256) * prime + prime
759
  //     == x - (x // 2**256) * (2**256 - (2**256 - prime)) + prime
760
  //     == x - ((x//2**256) * 2**256) + (x//2**256) * (2**256 - prime) + prime
761
  //     == (x % prime) + (x // 2**256) * (2**256 - prime) + prime
762
  //     <= 2 * prime - 1 + (2**264 // 2**256) * (2**256 - prime)
763
  //     <= 2 * prime + 2**8 * 2**224 == 2**256 + 2**232 + 2**256 - 2**224
764
  //     <  2 * (2**256 - 2**224)
765
  //     <= 2 * prime
766
767
13.6M
  uint32_t coef =
768
13.6M
      x->val[BN_LIMBS - 1] >> (256 - ((BN_LIMBS - 1) * BN_BITS_PER_LIMB));
769
770
  // clang-format off
771
  // coef == x // 2**256
772
  // 0 <= coef < 2**((LIMBS - 1) * BITS_PER_LIMB + 32 - 256) == 256
773
  // Proof:
774
  //*  Let x[[a : b] be the number consisting of a-th to (b-1)-th bit of the number x.
775
  //   x[LIMBS - 1] >> (256 - ((LIMBS - 1) * BITS_PER_LIMB))
776
  //     == x[[(LIMBS - 1) * BITS_PER_LIMB : (LIMBS - 1) * BITS_PER_LIMB + 32]] >> (256 - ((LIMBS - 1) * BITS_PER_LIMB))
777
  //     == x[[256 - ((LIMBS - 1) * BITS_PER_LIMB) + (LIMBS - 1) * BITS_PER_LIMB : (LIMBS - 1) * BITS_PER_LIMB + 32]]
778
  //     == x[[256 : (LIMBS - 1) * BITS_PER_LIMB + 32]]
779
  //     == x[[256 : 264]] == x // 2**256
780
  // clang-format on
781
782
13.6M
  const int shift = 8;
783
13.6M
  uint64_t acc = 1ull << shift;
784
785
136M
  for (int i = 0; i < BN_LIMBS; i++) {
786
122M
    acc += (((uint64_t)(BN_BASE - 1)) << shift) + x->val[i] -
787
122M
           prime->val[i] * (uint64_t)coef;
788
    // acc neither overflows 64 bits nor underflows 0
789
    // Proof:
790
    //   acc + (BASE - 1 << shift) + x[i] - prime[i] * coef
791
    //     >= (BASE - 1 << shift) - prime[i] * coef
792
    //     >= 2**shift * (2**BITS_PER_LIMB - 1) - (2**BITS_PER_LIMB - 1) * 255
793
    //     == (2**shift - 255) * (2**BITS_PER_LIMB - 1)
794
    //     == (2**8 - 255) * (2**29 - 1) == 2**29 - 1 >= 0
795
    //   acc + (BASE - 1 << shift) + x[i] - prime[i] * coef
796
    //     <= acc + ((BASE - 1) << shift) + x[i]
797
    //     <= (2**(64 - BITS_PER_LIMB) - 1) + 2**shift * (2**BITS_PER_LIMB - 1)
798
    //       + (2**32 - 1)
799
    //     == (2**35 - 1) + 2**8 * (2**29 - 1) + 2**32
800
    //     <  2**35 + 2**37 + 2**32 < 2**64
801
802
122M
    x->val[i] = acc & BN_LIMB_MASK;
803
122M
    acc >>= BN_BITS_PER_LIMB;
804
    // acc <= 2**(64 - BITS_PER_LIMB) - 1 == 2**35 - 1
805
806
    // acc == (1 << BITS_PER_LIMB * (i + 1) + shift) + x[:i + 1]
807
    //   - coef * prime[:i + 1] >> BITS_PER_LIMB * (i + 1)
808
122M
  }
809
810
  // assert(acc == 1 << shift);
811
812
  // clang-format off
813
  // acc == 1 << shift
814
  // Proof:
815
  //   acc
816
  //     == (1 << BITS_PER_LIMB * LIMBS + shift) + x[:LIMBS] - coef * prime[:LIMBS] >> BITS_PER_LIMB * LIMBS
817
  //     == (1 << BITS_PER_LIMB * LIMBS + shift) + (x - coef * prime) >> BITS_PER_LIMB * LIMBS
818
  //     <= (1 << BITS_PER_LIMB * LIMBS + shift) + (2 * prime) >> BITS_PER_LIMB * LIMBS
819
  //     <= (1 << BITS_PER_LIMB * LIMBS + shift) + 2 * 2**256 >> BITS_PER_LIMB * LIMBS
820
  //     <= 2**269 + 2**257 >> 2**261
821
  //     <= 1 << 8 == 1 << shift
822
823
  //   acc
824
  //     == (1 << BITS_PER_LIMB * LIMBS + shift) + x[:LIMBS] - coef * prime[:LIMBS] >> BITS_PER_LIMB * LIMBS
825
  //     >= (1 << BITS_PER_LIMB * LIMBS + shift) + 0 >> BITS_PER_LIMB * LIMBS
826
  //     == (1 << BITS_PER_LIMB * LIMBS + shift) + 0 >> BITS_PER_LIMB * LIMBS
827
  //     <= 1 << 8 == 1 << shift
828
  // clang-format on
829
13.6M
}
830
831
// res = x**e % prime
832
// Assumes both x and e are normalized, x < 2**259
833
// Guarantees res is normalized and partly reduced modulo prime
834
// Works properly even if &x == &res
835
// Assumes prime is normalized, 2**256 - 2**224 <= prime <= 2**256
836
// The function doesn't have neither constant control flow nor constant memory
837
//  access flow with regard to e
838
void bn_power_mod(const bignum256 *x, const bignum256 *e,
839
186
                  const bignum256 *prime, bignum256 *res) {
840
  // Uses iterative right-to-left exponentiation by squaring, see
841
  // https://en.wikipedia.org/wiki/Modular_exponentiation#Right-to-left_binary_method
842
843
186
  bignum256 acc = {0};
844
186
  bn_copy(x, &acc);
845
846
186
  bn_one(res);
847
1.86k
  for (int i = 0; i < BN_LIMBS; i++) {
848
1.67k
    uint32_t limb = e->val[i];
849
850
48.9k
    for (int j = 0; j < BN_BITS_PER_LIMB; j++) {
851
      // Break if the following bits of the last limb are zero
852
47.4k
      if (i == BN_LIMBS - 1 && limb == 0) break;
853
854
47.2k
      if (limb & 1)
855
        // acc * res < 2**519
856
        // Proof:
857
        //   acc * res <= max(2**259 - 1, 2 * prime) * (2 * prime)
858
        //     == max(2**259 - 1, 2**257) * 2**257 < 2**259 * 2**257
859
        //     == 2**516 < 2**519
860
45.9k
        bn_multiply(&acc, res, prime);
861
862
47.2k
      limb >>= 1;
863
      // acc * acc < 2**519
864
      // Proof:
865
      //   acc * acc <= max(2**259 - 1, 2 * prime)**2
866
      //     <= (2**259)**2 == 2**518 < 2**519
867
47.2k
      bn_multiply(&acc, &acc, prime);
868
47.2k
    }
869
    // acc == x**(e[:i + 1]) % prime
870
1.67k
  }
871
872
186
  memzero(&acc, sizeof(acc));
873
186
}
874
875
// x = sqrt(x) % prime
876
// Explicitly x = x**((prime+1)/4) % prime
877
// The other root is -sqrt(x)
878
// Assumes x is normalized, x < 2**259 and quadratic residuum mod prime
879
// Assumes prime is a prime number, prime % 4 == 3, it is normalized and
880
//   2**256 - 2**224 <= prime <= 2**256
881
// Guarantees x is normalized and fully reduced modulo prime
882
// The function doesn't have neither constant control flow nor constant memory
883
//  access flow with regard to prime
884
186
void bn_sqrt(bignum256 *x, const bignum256 *prime) {
885
  // Uses the Lagrange formula for the primes of the special form, see
886
  // http://en.wikipedia.org/wiki/Quadratic_residue#Prime_or_prime_power_modulus
887
  // If prime % 4 == 3, then sqrt(x) % prime == x**((prime+1)//4) % prime
888
889
186
  assert(prime->val[0] % 4 == 3);
890
891
  // e = (prime + 1) // 4
892
186
  bignum256 e = {0};
893
186
  bn_copy(prime, &e);
894
186
  bn_addi(&e, 1);
895
186
  bn_rshift(&e);
896
186
  bn_rshift(&e);
897
898
186
  bn_power_mod(x, &e, prime, x);
899
186
  bn_mod(x, prime);
900
901
186
  memzero(&e, sizeof(e));
902
186
}
903
904
// a = 1/a % 2**n
905
// Assumes a is odd, 1 <= n <= 32
906
// The function doesn't have neither constant control flow nor constant memory
907
//   access flow with regard to n
908
1.50M
uint32_t inverse_mod_power_two(uint32_t a, uint32_t n) {
909
  // Uses "Explicit Quadratic Modular inverse modulo 2" from section 3.3 of "On
910
  // Newton-Raphson iteration for multiplicative inverses modulo prime powers"
911
  // by Jean-Guillaume Dumas, see
912
  // https://arxiv.org/pdf/1209.6626.pdf
913
914
  // 1/a % 2**n
915
  //   = (2-a) * product([1 + (a-1)**(2**i) for i in range(1, floor(log2(n)))])
916
917
1.50M
  uint32_t acc = 2 - a;
918
1.50M
  uint32_t f = a - 1;
919
920
  // mask = (1 << n) - 1
921
1.50M
  uint32_t mask = n == 32 ? 0xFFFFFFFF : (1u << n) - 1;
922
923
9.05M
  for (uint32_t i = 1; i < n; i <<= 1) {
924
7.54M
    f = (f * f) & mask;
925
7.54M
    acc = (acc * (1 + f)) & mask;
926
7.54M
  }
927
928
1.50M
  return acc;
929
1.50M
}
930
931
// x = (x / 2**BITS_PER_LIMB) % prime
932
// Assumes both x and prime are normalized
933
// Assumes prime is an odd number and normalized
934
// Guarantees x is normalized
935
// If x is partly reduced (fully reduced) modulo prime,
936
//   guarantess x will be partly reduced (fully reduced) modulo prime
937
1.50M
void bn_divide_base(bignum256 *x, const bignum256 *prime) {
938
  // Uses an explicit formula for the modular inverse of power of two
939
  // (x / 2**n) % prime == (x + ((-x / prime) % 2**n) * prime) // 2**n
940
  // Proof:
941
  //   (x + ((-x / prime) % 2**n) * prime) % 2**n
942
  //     == (x - x / prime * prime) % 2**n
943
  //     == 0
944
  //   (x + ((-1 / prime) % 2**n) * prime) % prime
945
  //     == x
946
  //   if x < prime:
947
  //     (x + ((-x / prime) % 2**n) * prime) // 2**n
948
  //       <= ((prime - 1) + (2**n - 1) * prime) / 2**n
949
  //       == (2**n * prime - 1) / 2**n == prime - 1 / 2**n < prime
950
  //   if x < 2 * prime:
951
  //     (x + ((-x / prime) % 2**n) * prime) // 2**n
952
  //       <= ((2 * prime - 1) + (2**n - 1) * prime) / 2**n
953
  //       == (2**n * prime + prime - 1) / 2**n
954
  //       == prime + (prime - 1) / 2**n < 2 * prime
955
956
  // m = (-x / prime) % 2**BITS_PER_LIMB
957
1.50M
  uint32_t m = (x->val[0] * (BN_BASE - inverse_mod_power_two(
958
1.50M
                                           prime->val[0], BN_BITS_PER_LIMB))) &
959
1.50M
               BN_LIMB_MASK;
960
  // m < 2**BITS_PER_LIMB
961
962
1.50M
  uint64_t acc = x->val[0] + (uint64_t)m * prime->val[0];
963
1.50M
  acc >>= BN_BITS_PER_LIMB;
964
965
13.5M
  for (int i = 1; i < BN_LIMBS; i++) {
966
12.0M
    acc = acc + x->val[i] + (uint64_t)m * prime->val[i];
967
    // acc does not overflow 64 bits
968
    // acc == acc + x + m * prime
969
    //    <= 2**(64 - BITS_PER_LIMB) + 2**(BITS_PER_LIMB)
970
    //      2**(BITS_PER_LIMB) * 2**(BITS_PER_LIMB)
971
    //    <= 2**(2 * BITS_PER_LIMB) + 2**(64 - BITS_PER_LIMB) +
972
    //      2**(BITS_PER_LIMB)
973
    //    <= 2**58 + 2**35 + 2**29 < 2**64
974
975
12.0M
    x->val[i - 1] = acc & BN_LIMB_MASK;
976
12.0M
    acc >>= BN_BITS_PER_LIMB;
977
    // acc < 2**35 == 2**(64 - BITS_PER_LIMB)
978
979
    // acc == x[:i + 1] + m * prime[:i + 1] >> BITS_PER_LIMB * (i + 1)
980
12.0M
  }
981
982
1.50M
  x->val[BN_LIMBS - 1] = acc;
983
984
1.50M
  assert(acc >> BN_BITS_PER_LIMB == 0);
985
986
  // clang-format off
987
  // acc >> BITS_PER_LIMB == 0
988
  // Proof:
989
  //   acc >> BITS_PER_LIMB
990
  //     == (x[:LIMB] + m * prime[:LIMB] >> BITS_PER_LIMB * LIMBS) >> BITS_PER_LIMB * (LIMBS + 1)
991
  //     == x + m * prime >> BITS_PER_LIMB * (LIMBS + 1)
992
  //     <= (2**(BITS_PER_LIMB * LIMBS) - 1) + (2**BITS_PER_LIMB - 1) * (2**(BITS_PER_LIMB * LIMBS) - 1) >> BITS_PER_LIMB * (LIMBS + 1)
993
  //     == 2**(BITS_PER_LIMB * LIMBS) - 1 + 2**(BITS_PER_LIMB * (LIMBS + 1)) - 2**(BITS_PER_LIMB * LIMBS) - 2**BITS_PER_LIMB + 1 >> BITS_PER_LIMB * (LIMBS + 1)
994
  //     == 2**(BITS_PER_LIMB * (LIMBS + 1)) - 2**BITS_PER_LIMB >> BITS_PER_LIMB * (LIMBS + 1)
995
  //     == 0
996
  // clang-format on
997
1.50M
}
998
999
#if !USE_INVERSE_FAST
1000
// x = 1/x % prime if x != 0 else 0
1001
// Assumes x is normalized
1002
// Assumes prime is a prime number
1003
// Guarantees x is normalized and fully reduced modulo prime
1004
// Assumes prime is normalized, 2**256 - 2**224 <= prime <= 2**256
1005
// The function doesn't have neither constant control flow nor constant memory
1006
//   access flow with regard to prime
1007
static void bn_inverse_slow(bignum256 *x, const bignum256 *prime) {
1008
  // Uses formula 1/x % prime == x**(prime - 2) % prime
1009
  // See https://en.wikipedia.org/wiki/Fermat%27s_little_theorem
1010
1011
  bn_fast_mod(x, prime);
1012
1013
  // e = prime - 2
1014
  bignum256 e = {0};
1015
  bn_read_uint32(2, &e);
1016
  bn_subtract(prime, &e, &e);
1017
1018
  bn_power_mod(x, &e, prime, x);
1019
  bn_mod(x, prime);
1020
1021
  memzero(&e, sizeof(e));
1022
}
1023
#endif
1024
1025
#if false
1026
// x = 1/x % prime if x != 0 else 0
1027
// Assumes x is is_normalized
1028
// Assumes GCD(x, prime) = 1
1029
// Guarantees x is normalized and fully reduced modulo prime
1030
// Assumes prime is odd, normalized, 2**256 - 2**224 <= prime <= 2**256
1031
// The function doesn't have neither constant control flow nor constant memory
1032
//   access flow with regard to prime and x
1033
static void bn_inverse_fast(bignum256 *x, const bignum256 *prime) {
1034
  // "The Almost Montgomery Inverse" from the section 3 of "Constant Time
1035
  // Modular Inversion" by Joppe W. Bos
1036
  // See http://www.joppebos.com/files/CTInversion.pdf
1037
1038
  /*
1039
    u = prime
1040
    v = x & prime
1041
    s = 1
1042
    r = 0
1043
1044
    k = 0
1045
    while v != 1:
1046
      k += 1
1047
      if is_even(u):
1048
        u = u // 2
1049
        s = 2 * s
1050
      elif is_even(v):
1051
        v = v // 2
1052
        r = 2 * r
1053
      elif v < u:
1054
        u = (u - v) // 2
1055
        r = r + s
1056
        s = 2 * s
1057
      else:
1058
        v = (v - u) // 2
1059
        s = r + s
1060
        r = 2 * r
1061
1062
    s = (s / 2**k) % prime
1063
    return s
1064
  */
1065
1066
  if (bn_is_zero(x)) return;
1067
1068
  bn_fast_mod(x, prime);
1069
  bn_mod(x, prime);
1070
1071
  bignum256 u = {0}, v = {0}, r = {0}, s = {0};
1072
  bn_copy(prime, &u);
1073
  bn_copy(x, &v);
1074
  bn_one(&s);
1075
  bn_zero(&r);
1076
1077
  int k = 0;
1078
  while (!bn_is_one(&v)) {
1079
    if ((u.val[0] & 1) == 0) {
1080
      bn_rshift(&u);
1081
      bn_lshift(&s);
1082
    } else if ((v.val[0] & 1) == 0) {
1083
      bn_rshift(&v);
1084
      bn_lshift(&r);
1085
    } else if (bn_is_less(&v, &u)) {
1086
      bn_subtract(&u, &v, &u);
1087
      bn_rshift(&u);
1088
      bn_add(&r, &s);
1089
      bn_lshift(&s);
1090
    } else {
1091
      bn_subtract(&v, &u, &v);
1092
      bn_rshift(&v);
1093
      bn_add(&s, &r);
1094
      bn_lshift(&r);
1095
    }
1096
    k += 1;
1097
    assert(!bn_is_zero(&v)); // assert GCD(x, prime) == 1
1098
  }
1099
1100
  // s = s / 2**(k // BITS_PER_LIMB * BITS_PER_LIMB)
1101
  for (int i = 0; i < k / BITS_PER_LIMB; i++) {
1102
    bn_divide_base(&s, prime);
1103
  }
1104
1105
  // s = s / 2**(k % BITS_PER_LIMB)
1106
  for (int i = 0; i < k % BN_BITS_PER_LIMB; i++) {
1107
    bn_mult_half(&s, prime);
1108
  }
1109
1110
  bn_copy(&s, x);
1111
1112
  memzero(&u, sizeof(u));
1113
  memzero(&v, sizeof(v));
1114
  memzero(&r, sizeof(r));
1115
  memzero(&s, sizeof(s));
1116
}
1117
#endif
1118
1119
#if USE_INVERSE_FAST
1120
// x = 1/x % prime if x != 0 else 0
1121
// Assumes x is is_normalized
1122
// Assumes GCD(x, prime) = 1
1123
// Guarantees x is normalized and fully reduced modulo prime
1124
// Assumes prime is odd, normalized, 2**256 - 2**224 <= prime <= 2**256
1125
// The function has constant control flow but not constant memory access flow
1126
//   with regard to prime and x
1127
83.8k
static void bn_inverse_fast(bignum256 *x, const bignum256 *prime) {
1128
  // Custom constant time version of "The Almost Montgomery Inverse" from the
1129
  // section 3 of "Constant Time Modular Inversion" by Joppe W. Bos
1130
  // See http://www.joppebos.com/files/CTInversion.pdf
1131
1132
  /*
1133
    u = prime
1134
    v = x % prime
1135
    s = 1
1136
    r = 0
1137
1138
    k = 0
1139
    while v != 1:
1140
      k += 1
1141
      if is_even(u): # b1
1142
        u = u // 2
1143
        s = 2 * s
1144
      elif is_even(v): # b2
1145
        v = v // 2
1146
        r = 2 * r
1147
      elif v < u: # b3
1148
        u = (u - v) // 2
1149
        r = r + s
1150
        s = 2 * s
1151
      else: # b4
1152
        v = (v - u) // 2
1153
        s = r + s
1154
        r = 2 * r
1155
1156
    s = (s / 2**k) % prime
1157
    return s
1158
  */
1159
1160
83.8k
  bn_fast_mod(x, prime);
1161
83.8k
  bn_mod(x, prime);
1162
1163
83.8k
  bignum256 u = {0}, v = {0}, r = {0}, s = {0};
1164
83.8k
  bn_copy(prime, &u);
1165
83.8k
  bn_copy(x, &v);
1166
83.8k
  bn_one(&s);
1167
83.8k
  bn_zero(&r);
1168
1169
83.8k
  bignum256 zero = {0};
1170
83.8k
  bn_zero(&zero);
1171
1172
83.8k
  int k = 0;
1173
1174
83.8k
  int finished = 0, u_even = 0, v_even = 0, v_less_u = 0, b1 = 0, b2 = 0,
1175
83.8k
      b3 = 0, b4 = 0;
1176
83.8k
  finished = 0;
1177
1178
43.8M
  for (int i = 0; i < 2 * BN_LIMBS * BN_BITS_PER_LIMB; i++) {
1179
43.7M
    finished = finished | -bn_is_one(&v);
1180
43.7M
    u_even = -bn_is_even(&u);
1181
43.7M
    v_even = -bn_is_even(&v);
1182
43.7M
    v_less_u = -bn_is_less(&v, &u);
1183
1184
43.7M
    b1 = ~finished & u_even;
1185
43.7M
    b2 = ~finished & ~b1 & v_even;
1186
43.7M
    b3 = ~finished & ~b1 & ~b2 & v_less_u;
1187
43.7M
    b4 = ~finished & ~b1 & ~b2 & ~b3;
1188
1189
// The ternary operator for pointers with constant control flow
1190
// BN_INVERSE_FAST_TERNARY(c, t, f) = t if c else f
1191
// Very nasty hack, sorry for that
1192
43.7M
#define BN_INVERSE_FAST_TERNARY(c, t, f) \
1193
306M
  ((void *)(((c) & (uintptr_t)(t)) | (~(c) & (uintptr_t)(f))))
1194
1195
43.7M
    bn_subtract(BN_INVERSE_FAST_TERNARY(b3, &u, &v),
1196
43.7M
                BN_INVERSE_FAST_TERNARY(
1197
43.7M
                    b3 | b4, BN_INVERSE_FAST_TERNARY(b3, &v, &u), &zero),
1198
43.7M
                BN_INVERSE_FAST_TERNARY(b3, &u, &v));
1199
1200
43.7M
    bn_add(BN_INVERSE_FAST_TERNARY(b3, &r, &s),
1201
43.7M
           BN_INVERSE_FAST_TERNARY(b3 | b4, BN_INVERSE_FAST_TERNARY(b3, &s, &r),
1202
43.7M
                                   &zero));
1203
43.7M
    bn_rshift(BN_INVERSE_FAST_TERNARY(b1 | b3, &u, &v));
1204
43.7M
    bn_lshift(BN_INVERSE_FAST_TERNARY(b1 | b3, &s, &r));
1205
1206
43.7M
    k = k - ~finished;
1207
43.7M
  }
1208
1209
  // s = s / 2**(k // BITS_PER_LIMB * BITS_PER_LIMB)
1210
1.59M
  for (int i = 0; i < 2 * BN_LIMBS; i++) {
1211
    // s = s / 2**BITS_PER_LIMB % prime if i < k // BITS_PER_LIMB else s
1212
1.50M
    bn_copy(&s, &r);
1213
1.50M
    bn_divide_base(&r, prime);
1214
1.50M
    bn_cmov(&s, i < k / BN_BITS_PER_LIMB, &r, &s);
1215
1.50M
  }
1216
1217
  // s = s / 2**(k % BITS_PER_LIMB)
1218
2.51M
  for (int i = 0; i < BN_BITS_PER_LIMB; i++) {
1219
    // s = s / 2 % prime if i < k % BITS_PER_LIMB else s
1220
2.43M
    bn_copy(&s, &r);
1221
2.43M
    bn_mult_half(&r, prime);
1222
2.43M
    bn_cmov(&s, i < k % BN_BITS_PER_LIMB, &r, &s);
1223
2.43M
  }
1224
1225
83.8k
  bn_cmov(x, bn_is_zero(x), x, &s);
1226
1227
83.8k
  memzero(&u, sizeof(u));
1228
83.8k
  memzero(&v, sizeof(v));
1229
83.8k
  memzero(&r, sizeof(s));
1230
83.8k
  memzero(&s, sizeof(s));
1231
83.8k
}
1232
#endif
1233
1234
#if false
1235
// x = 1/x % prime if x != 0 else 0
1236
// Assumes x is is_normalized
1237
// Assumes GCD(x, prime) = 1
1238
// Guarantees x is normalized and fully reduced modulo prime
1239
// Assumes prime is odd, normalized, 2**256 - 2**224 <= prime <= 2**256
1240
static void bn_inverse_fast(bignum256 *x, const bignum256 *prime) {
1241
  // Custom constant time version of "The Almost Montgomery Inverse" from the
1242
  // section 3 of "Constant Time Modular Inversion" by Joppe W. Bos
1243
  // See http://www.joppebos.com/files/CTInversion.pdf
1244
1245
  /*
1246
    u = prime
1247
    v = x % prime
1248
    s = 1
1249
    r = 0
1250
1251
    k = 0
1252
    while v != 1:
1253
      k += 1
1254
      if is_even(u): # b1
1255
        u = u // 2
1256
        s = 2 * s
1257
      elif is_even(v): # b2
1258
        v = v // 2
1259
        r = 2 * r
1260
      elif v < u: # b3
1261
        u = (u - v) // 2
1262
        r = r + s
1263
        s = 2 * s
1264
      else: # b4
1265
        v = (v - u) // 2
1266
        s = r + s
1267
        r = 2 * r
1268
1269
    s = (s / 2**k) % prime
1270
    return s
1271
  */
1272
1273
  bn_fast_mod(x, prime);
1274
  bn_mod(x, prime);
1275
1276
  bignum256 u = {0}, v = {0}, r = {0}, s = {0};
1277
  bn_copy(prime, &u);
1278
  bn_copy(x, &v);
1279
  bn_one(&s);
1280
  bn_zero(&r);
1281
1282
  bignum256 zero = {0};
1283
  bn_zero(&zero);
1284
1285
  int k = 0;
1286
1287
  uint32_t finished = 0, u_even = 0, v_even = 0, v_less_u = 0, b1 = 0, b2 = 0,
1288
      b3 = 0, b4 = 0;
1289
  finished = 0;
1290
1291
  bignum256 u_half = {0}, v_half = {0}, u_minus_v_half = {0}, v_minus_u_half = {0}, r_plus_s = {0}, r_twice = {0}, s_twice = {0};
1292
  for (int i = 0; i < 2 * BN_LIMBS * BN_BITS_PER_LIMB; i++) {
1293
    finished = finished | bn_is_one(&v);
1294
    u_even = bn_is_even(&u);
1295
    v_even = bn_is_even(&v);
1296
    v_less_u = bn_is_less(&v, &u);
1297
1298
    b1 = (finished ^ 1) & u_even;
1299
    b2 = (finished ^ 1) & (b1 ^ 1) & v_even;
1300
    b3 = (finished ^ 1) & (b1 ^ 1) & (b2 ^ 1) & v_less_u;
1301
    b4 = (finished ^ 1) & (b1 ^ 1) & (b2 ^ 1) & (b3 ^ 1);
1302
1303
    // u_half = u // 2
1304
    bn_copy(&u, &u_half);
1305
    bn_rshift(&u_half);
1306
1307
    // v_half = v // 2
1308
    bn_copy(&v, &v_half);
1309
    bn_rshift(&v_half);
1310
1311
    // u_minus_v_half  = (u - v) // 2
1312
    bn_subtract(&u, &v, &u_minus_v_half);
1313
    bn_rshift(&u_minus_v_half);
1314
1315
    // v_minus_u_half  = (v - u) // 2
1316
    bn_subtract(&v, &u, &v_minus_u_half);
1317
    bn_rshift(&v_minus_u_half);
1318
1319
    // r_plus_s = r + s
1320
    bn_copy(&r, &r_plus_s);
1321
    bn_add(&r_plus_s, &s);
1322
1323
    // r_twice = 2 * r
1324
    bn_copy(&r, &r_twice);
1325
    bn_lshift(&r_twice);
1326
1327
    // s_twice = 2 * s
1328
    bn_copy(&s, &s_twice);
1329
    bn_lshift(&s_twice);
1330
1331
    bn_cmov(&u, b1, &u_half, &u);
1332
    bn_cmov(&u, b3, &u_minus_v_half, &u);
1333
1334
    bn_cmov(&v, b2, &v_half, &v);
1335
    bn_cmov(&v, b4, &v_minus_u_half, &v);
1336
1337
    bn_cmov(&r, b2 | b4, &r_twice, &r);
1338
    bn_cmov(&r, b3, &r_plus_s, &r);
1339
1340
    bn_cmov(&s, b1 | b3, &s_twice, &s);
1341
    bn_cmov(&s, b4, &r_plus_s, &s);
1342
1343
    k = k + (finished ^ 1);
1344
  }
1345
1346
  // s = s / 2**(k // BITS_PER_LIMB * BITS_PER_LIMB)
1347
  for (int i = 0; i < 2 * BN_LIMBS; i++) {
1348
    // s = s / 2**BITS_PER_LIMB % prime if i < k // BITS_PER_LIMB else s
1349
    bn_copy(&s, &r);
1350
    bn_divide_base(&r, prime);
1351
    bn_cmov(&s, i < k / BITS_PER_LIMB, &r, &s);
1352
  }
1353
1354
  // s = s / 2**(k % BITS_PER_LIMB)
1355
  for (int i = 0; i < BN_BITS_PER_LIMB; i++) {
1356
    // s = s / 2 % prime if i < k % BITS_PER_LIMB else s
1357
    bn_copy(&s, &r);
1358
    bn_mult_half(&r, prime);
1359
    bn_cmov(&s, i < k % BN_BITS_PER_LIMB, &r, &s);
1360
  }
1361
1362
  bn_cmov(x, bn_is_zero(x), x, &s);
1363
1364
  memzero(&u, sizeof(u));
1365
  memzero(&v, sizeof(v));
1366
  memzero(&r, sizeof(r));
1367
  memzero(&s, sizeof(s));
1368
  memzero(&u_half, sizeof(u_half));
1369
  memzero(&v_half, sizeof(v_half));
1370
  memzero(&u_minus_v_half, sizeof(u_minus_v_half));
1371
  memzero(&v_minus_u_half, sizeof(v_minus_u_half));
1372
  memzero(&r_twice, sizeof(r_twice));
1373
  memzero(&s_twice, sizeof(s_twice));
1374
  memzero(&r_plus_s, sizeof(r_plus_s));
1375
}
1376
#endif
1377
1378
// Normalizes x
1379
// Assumes x < 2**261 == 2**(LIMBS * BITS_PER_LIMB)
1380
// Guarantees x is normalized
1381
0
void bn_normalize(bignum256 *x) {
1382
0
  uint32_t acc = 0;
1383
1384
0
  for (int i = 0; i < BN_LIMBS; i++) {
1385
0
    acc += x->val[i];
1386
    // acc doesn't overflow 32 bits
1387
    // Proof:
1388
    //   acc + x[i]
1389
    //      <= (2**(32 - BITS_PER_LIMB) - 1) + (2**BITS_PER_LIMB - 1)
1390
    //      == 7 + 2**29 - 1 < 2**32
1391
1392
0
    x->val[i] = acc & BN_LIMB_MASK;
1393
0
    acc >>= (BN_BITS_PER_LIMB);
1394
    // acc <= 7 == 2**(32 - BITS_PER_LIMB) - 1
1395
0
  }
1396
0
}
1397
1398
// x = x + y
1399
// Assumes x, y are normalized, x + y < 2**(LIMBS*BITS_PER_LIMB) == 2**261
1400
// Guarantees x is normalized
1401
// Works properly even if &x == &y
1402
44.8M
void bn_add(bignum256 *x, const bignum256 *y) {
1403
44.8M
  uint32_t acc = 0;
1404
448M
  for (int i = 0; i < BN_LIMBS; i++) {
1405
403M
    acc += x->val[i] + y->val[i];
1406
    // acc doesn't overflow 32 bits
1407
    // Proof:
1408
    //   acc + x[i] + y[i]
1409
    //     <= (2**(32 - BITS_PER_LIMB) - 1) + 2 * (2**BITS_PER_LIMB - 1)
1410
    //     == (2**(32 - BITS_PER_LIMB) - 1) + 2**(BITS_PER_LIMB + 1) - 2
1411
    //     == 7 + 2**30 - 2 < 2**32
1412
1413
403M
    x->val[i] = acc & BN_LIMB_MASK;
1414
403M
    acc >>= BN_BITS_PER_LIMB;
1415
    // acc <= 7 == 2**(32 - BITS_PER_LIMB) - 1
1416
1417
    // acc == x[:i + 1] + y[:i + 1] >> BITS_PER_LIMB * (i + 1)
1418
403M
  }
1419
1420
  // assert(acc == 0); // assert x + y < 2**261
1421
  // acc == 0
1422
  // Proof:
1423
  //   acc == x[:LIMBS] + y[:LIMBS] >> LIMBS * BITS_PER_LIMB
1424
  //     == x + y >> LIMBS * BITS_PER_LIMB
1425
  //     <= 2**(LIMBS * BITS_PER_LIMB) - 1 >> LIMBS * BITS_PER_LIMB == 0
1426
44.8M
}
1427
1428
// x = x + y % prime
1429
// Assumes x, y are normalized
1430
// Guarantees x is normalized and partly reduced modulo prime
1431
// Assumes prime is normalized and 2^256 - 2^224 <= prime <= 2^256
1432
65.7k
void bn_addmod(bignum256 *x, const bignum256 *y, const bignum256 *prime) {
1433
657k
  for (int i = 0; i < BN_LIMBS; i++) {
1434
591k
    x->val[i] += y->val[i];
1435
    // x[i] doesn't overflow 32 bits
1436
    // Proof:
1437
    //   x[i] + y[i]
1438
    //     <= 2 * (2**BITS_PER_LIMB - 1)
1439
    //     == 2**30 - 2 < 2**32
1440
591k
  }
1441
1442
65.7k
  bn_fast_mod(x, prime);
1443
65.7k
}
1444
1445
// x = x + y
1446
// Assumes x is normalized
1447
// Assumes y <= 2**32 - 2**29 == 2**32 - 2**BITS_PER_LIMB and
1448
//   x + y < 2**261 == 2**(LIMBS * BITS_PER_LIMB)
1449
// Guarantees x is normalized
1450
186
void bn_addi(bignum256 *x, uint32_t y) {
1451
  // assert(y <= 3758096384); // assert y <= 2**32 - 2**29
1452
186
  uint32_t acc = y;
1453
1454
1.86k
  for (int i = 0; i < BN_LIMBS; i++) {
1455
1.67k
    acc += x->val[i];
1456
    // acc doesn't overflow 32 bits
1457
    // Proof:
1458
    //   if i == 0:
1459
    //     acc + x[i] == y + x[0]
1460
    //       <= (2**32 - 2**BITS_PER_LIMB) + (2**BITS_PER_LIMB - 1)
1461
    //       == 2**32 - 1 < 2**32
1462
    //   else:
1463
    //     acc + x[i]
1464
    //       <= (2**(32 - BITS_PER_LIMB) - 1) + (2**BITS_PER_LIMB - 1)
1465
    //       == 7 + 2**29 - 1 < 2**32
1466
1467
1.67k
    x->val[i] = acc & BN_LIMB_MASK;
1468
1.67k
    acc >>= (BN_BITS_PER_LIMB);
1469
    // acc <= 7 == 2**(32 - BITS_PER_LIMB) - 1
1470
1471
    // acc == x[:i + 1] + y >> BITS_PER_LIMB * (i + 1)
1472
1.67k
  }
1473
1474
  // assert(acc == 0); // assert x + y < 2**261
1475
  // acc == 0
1476
  // Proof:
1477
  //   acc == x[:LIMBS] + y << LIMBS * BITS_PER_LIMB
1478
  //     == x + y << LIMBS * BITS_PER_LIMB
1479
  //     <= 2**(LIMBS + BITS_PER_LIMB) - 1 << LIMBS * BITS_PER_LIMB
1480
  //     == 0
1481
186
}
1482
1483
// x = x - y % prime
1484
// Explicitly x = x + prime - y
1485
// Assumes x, y are normalized
1486
// Assumes y < prime[0], x + prime - y < 2**261 == 2**(LIMBS * BITS_PER_LIMB)
1487
// Guarantees x is normalized
1488
// If x is fully reduced modulo prime,
1489
//   guarantess x will be partly reduced modulo prime
1490
// Assumes prime is nonzero and normalized
1491
12.0k
void bn_subi(bignum256 *x, uint32_t y, const bignum256 *prime) {
1492
12.0k
  assert(y < prime->val[0]);
1493
1494
  // x = x + prime - y
1495
1496
12.0k
  uint32_t acc = -y;
1497
120k
  for (int i = 0; i < BN_LIMBS; i++) {
1498
108k
    acc += x->val[i] + prime->val[i];
1499
    // acc neither overflows 32 bits nor underflows 0
1500
    // Proof:
1501
    //   acc + x[i] + prime[i]
1502
    //     <= (2**(32 - BITS_PER_LIMB) - 1) + 2 * (2**BITS_PER_LIMB - 1)
1503
    //     <= 7 + 2**30 - 2 < 2**32
1504
    //   acc + x[i] + prime[i]
1505
    //     >= -y + prime[0] >= 0
1506
1507
108k
    x->val[i] = acc & BN_LIMB_MASK;
1508
108k
    acc >>= BN_BITS_PER_LIMB;
1509
    // acc <= 7 == 2**(32 - BITS_PER_LIMB) - 1
1510
1511
    // acc == x[:i + 1] + prime[:i + 1] - y >> BITS_PER_LIMB * (i + 1)
1512
108k
  }
1513
1514
  // assert(acc == 0); // assert x + prime - y < 2**261
1515
  // acc == 0
1516
  // Proof:
1517
  //   acc == x[:LIMBS] + prime[:LIMBS] - y >> BITS_PER_LIMB * LIMBS
1518
  //     == x + prime - y >> BITS_PER_LIMB * LIMBS
1519
  //     <= 2**(LIMBS * BITS_PER_LIMB) - 1 >> BITS_PER_LIMB * LIMBS == 0
1520
12.0k
}
1521
1522
// res = x - y % prime
1523
// Explicitly res = x + (2 * prime - y)
1524
// Assumes x, y are normalized, y is partly reduced
1525
// Assumes x + 2 * prime - y < 2**261 == 2**(BITS_PER_LIMB * LIMBS)
1526
// Guarantees res is normalized
1527
// Assumes prime is nonzero and normalized
1528
void bn_subtractmod(const bignum256 *x, const bignum256 *y, bignum256 *res,
1529
12.5M
                    const bignum256 *prime) {
1530
  // res = x + (2 * prime - y)
1531
1532
12.5M
  uint32_t acc = 1;
1533
1534
125M
  for (int i = 0; i < BN_LIMBS; i++) {
1535
112M
    acc += (BN_BASE - 1) + x->val[i] + 2 * prime->val[i] - y->val[i];
1536
    // acc neither overflows 32 bits nor underflows 0
1537
    // Proof:
1538
    //   acc + (BASE - 1) + x[i] + 2 * prime[i] - y[i]
1539
    //     >= (BASE - 1) - y[i]
1540
    //     == (2**BITS_PER_LIMB - 1) - (2**BITS_PER_LIMB - 1) == 0
1541
    //   acc + (BASE - 1) + x[i] + 2 * prime[i] - y[i]
1542
    //     <= acc + (BASE - 1) + x[i] + 2 * prime[i]
1543
    //     <= (2**(32 - BITS_PER_LIMB) - 1) + (2**BITS_PER_LIMB - 1) +
1544
    //     (2**BITS_PER_LIMB - 1) + 2 * (2**BITS_PER_LIMB - 1)
1545
    //     <= (2**(32 - BITS_PER_LIMB) - 1) + 4 * (2**BITS_PER_LIMB - 1)
1546
    //     == 7 + 4 * 2**29 - 4 == 2**31 + 3 < 2**32
1547
1548
112M
    res->val[i] = acc & (BN_BASE - 1);
1549
112M
    acc >>= BN_BITS_PER_LIMB;
1550
    // acc <= 7 == 2**(32 - BITS_PER_LIMB) - 1
1551
1552
    // acc == 2**(BITS_PER_LIMB * (i + 1)) + x[:i+1] - y[:i+1] + 2*prime[:i+1]
1553
    //   >> BITS_PER_LIMB * (i+1)
1554
112M
  }
1555
1556
  // assert(acc == 1); // assert x + 2 * prime - y < 2**261
1557
1558
  // clang-format off
1559
  // acc == 1
1560
  // Proof:
1561
  //   acc == 2**(BITS_PER_LIMB * LIMBS) + x[:LIMBS] - y[:LIMBS] + 2 * prime[:LIMBS] >> BITS_PER_LIMB * LIMBS
1562
  //     == 2**(BITS_PER_LIMB * LIMBS) + x - y + 2 * prime >> BITS_PER_LIMB * LIMBS
1563
  //     == 2**(BITS_PER_LIMB * LIMBS) + x + (2 * prime - y) >> BITS_PER_LIMB * LIMBS
1564
  //     <= 2**(BITS_PER_LIMB * LIMBS) + 2**(BITS_PER_LIMB * LIMBS) - 1 >> BITS_PER_LIMB * LIMBS
1565
  //     <= 2 * 2**(BITS_PER_LIMB * LIMBS) - 1 >> BITS_PER_LIMB * LIMBS
1566
  //     == 1
1567
1568
  //   acc == 2**(BITS_PER_LIMB * LIMBS) + x[:LIMBS] - y[:LIMBS] + 2 * prime[:LIMBS] >> BITS_PER_LIMB * LIMBS
1569
  //     == 2**(BITS_PER_LIMB * LIMBS) + x - y + 2 * prime >> BITS_PER_LIMB * LIMBS
1570
  //     == 2**(BITS_PER_LIMB * LIMBS) + x + (2 * prime - y) >> BITS_PER_LIMB * LIMBS
1571
  //     >= 2**(BITS_PER_LIMB * LIMBS) + 0 + 1 >> BITS_PER_LIMB * LIMBS
1572
  //     == 1
1573
  // clang-format on
1574
12.5M
}
1575
1576
// res = x - y
1577
// Assumes x, y are normalized and x >= y
1578
// Guarantees res is normalized
1579
// Works properly even if &x == &y or &x == &res or &y == &res or
1580
//   &x == &y == &res
1581
44.6M
void bn_subtract(const bignum256 *x, const bignum256 *y, bignum256 *res) {
1582
44.6M
  uint32_t acc = 1;
1583
446M
  for (int i = 0; i < BN_LIMBS; i++) {
1584
401M
    acc += (BN_BASE - 1) + x->val[i] - y->val[i];
1585
    // acc neither overflows 32 bits nor underflows 0
1586
    // Proof:
1587
    //   acc + (BASE - 1) + x[i] - y[i]
1588
    //     >= (BASE - 1) - y == (2**BITS_PER_LIMB - 1) - (2**BITS_PER_LIMB - 1)
1589
    //     == 0
1590
    //   acc + (BASE - 1) + x[i] - y[i]
1591
    //     <= acc + (BASE - 1) + x[i]
1592
    //     <= (2**(32 - BITS_PER_LIMB) - 1) + (2**BITS_PER_LIMB - 1) +
1593
    //       (2**BITS_PER_LIMB - 1)
1594
    //     == 7 + 2 * 2**29 < 2 **32
1595
1596
401M
    res->val[i] = acc & BN_LIMB_MASK;
1597
401M
    acc >>= BN_BITS_PER_LIMB;
1598
    // acc <= 7 == 2**(32 - BITS_PER_LIMB) - 1
1599
1600
    // acc == 2**(BITS_PER_LIMB * (i + 1)) + x[:i + 1] - y[:i + 1]
1601
    //   >> BITS_PER_LIMB * (i + 1)
1602
401M
  }
1603
1604
  // assert(acc == 1); // assert x >= y
1605
1606
  // clang-format off
1607
  // acc == 1
1608
  // Proof:
1609
  //   acc == 2**(BITS_PER_LIMB * LIMBS) + x[:LIMBS] - y[:LIMBS] >> BITS_PER_LIMB * LIMBS
1610
  //     == 2**(BITS_PER_LIMB * LIMBS) + x - y >> BITS_PER_LIMB * LIMBS
1611
  //     == 2**(BITS_PER_LIMB * LIMBS) + x >> BITS_PER_LIMB * LIMBS
1612
  //     <= 2**(BITS_PER_LIMB * LIMBS) + 2**(BITS_PER_LIMB * LIMBS) - 1 >> BITS_PER_LIMB * LIMBS
1613
  //     <= 2 * 2**(BITS_PER_LIMB * LIMBS) - 1 >> BITS_PER_LIMB * LIMBS
1614
  //     == 1
1615
1616
  //   acc == 2**(BITS_PER_LIMB * LIMBS) + x[:LIMBS] - y[:LIMBS] >> BITS_PER_LIMB * LIMBS
1617
  //     == 2**(BITS_PER_LIMB * LIMBS) + x - y >> BITS_PER_LIMB * LIMBS
1618
  //     >= 2**(BITS_PER_LIMB * LIMBS) >> BITS_PER_LIMB * LIMBS
1619
  //     == 1
1620
44.6M
}
1621
1622
// Returns 0 if x is zero
1623
// Returns 1 if x is a square modulo prime
1624
// Returns -1 if x is not a square modulo prime
1625
// Assumes x is normalized, x < 2**259
1626
// Assumes prime is normalized, 2**256 - 2**224 <= prime <= 2**256
1627
// Assumes prime is a prime
1628
// The function doesn't have neither constant control flow nor constant memory
1629
//  access flow with regard to prime
1630
0
int bn_legendre(const bignum256 *x, const bignum256 *prime) {
1631
  // This is a naive implementation
1632
  // A better implementation would be to use the Euclidean algorithm together with the quadratic reciprocity law
1633
1634
  // e = (prime - 1) / 2
1635
0
  bignum256 e = {0};
1636
0
  bn_copy(prime, &e);
1637
0
  bn_rshift(&e);
1638
1639
  // res = x**e % prime
1640
0
  bignum256 res = {0};
1641
0
  bn_power_mod(x, &e, prime, &res);
1642
0
  bn_mod(&res, prime);
1643
1644
0
  if (bn_is_one(&res)) {
1645
0
    return 1;
1646
0
  }
1647
1648
0
  if (bn_is_zero(&res)) {
1649
0
    return 0;
1650
0
  }
1651
1652
0
  return -1;
1653
0
}
1654
1655
// q = x // d, r = x % d
1656
// Assumes x is normalized, 1 <= d <= 61304
1657
// Guarantees q is normalized
1658
0
void bn_long_division(bignum256 *x, uint32_t d, bignum256 *q, uint32_t *r) {
1659
0
  assert(1 <= d && d < 61304);
1660
1661
0
  uint32_t acc = 0;
1662
1663
0
  *r = x->val[BN_LIMBS - 1] % d;
1664
0
  q->val[BN_LIMBS - 1] = x->val[BN_LIMBS - 1] / d;
1665
1666
0
  for (int i = BN_LIMBS - 2; i >= 0; i--) {
1667
0
    acc = *r * (BN_BASE % d) + x->val[i];
1668
    // acc doesn't overflow 32 bits
1669
    // Proof:
1670
    //   r * (BASE % d) + x[i]
1671
    //     <= (d - 1) * (d - 1) + (2**BITS_PER_LIMB - 1)
1672
    //     == d**2 - 2*d + 2**BITS_PER_LIMB
1673
    //     == 61304**2 - 2 * 61304 + 2**29
1674
    //     == 3758057808 + 2**29  < 2**32
1675
1676
0
    q->val[i] = *r * (BN_BASE / d) + (acc / d);
1677
    // q[i] doesn't overflow 32 bits
1678
    // Proof:
1679
    //   r * (BASE // d) + (acc // d)
1680
    //     <= (d - 1) * (2**BITS_PER_LIMB / d) +
1681
    //      ((d**2 - 2*d + 2**BITS_PER_LIMB) / d)
1682
    //     <= (d - 1) * (2**BITS_PER_LIMB / d) + (d - 2 + 2**BITS_PER_LIMB / d)
1683
    //     == (d - 1 + 1) * (2**BITS_PER_LIMB / d) + d - 2
1684
    //     == 2**BITS_PER_LIMB + d - 2 <= 2**29 + 61304 < 2**32
1685
1686
    // q[i] == (r * BASE + x[i]) // d
1687
    // Proof:
1688
    //   q[i] == r * (BASE // d) + (acc // d)
1689
    //     == r * (BASE // d) + (r * (BASE % d) + x[i]) // d
1690
    //     == (r * d * (BASE // d) + r * (BASE % d) + x[i]) // d
1691
    //     == (r * (d * (BASE // d) + (BASE % d)) + x[i]) // d
1692
    //     == (r * BASE + x[i]) // d
1693
1694
    // q[i] < 2**BITS_PER_LIMB
1695
    // Proof:
1696
    //   q[i] == (r * BASE + x[i]) // d
1697
    //     <= ((d - 1) * 2**BITS_PER_LIMB + (2**BITS_PER_LIMB - 1)) / d
1698
    //     == (d * 2**BITS_PER_LIMB - 1) / d == 2**BITS_PER_LIMB - 1 / d
1699
    //     <  2**BITS_PER_LIMB
1700
1701
0
    *r = acc % d;
1702
    // r == (r * BASE + x[i]) % d
1703
    // Proof:
1704
    //   r == acc % d == (r * (BASE % d) + x[i]) % d
1705
    //     == (r * BASE + x[i]) % d
1706
1707
    // x[:i] == q[:i] * d + r
1708
0
  }
1709
0
}
1710
1711
// x = x // 58, r = x % 58
1712
// Assumes x is normalized
1713
// Guarantees x is normalized
1714
0
void bn_divmod58(bignum256 *x, uint32_t *r) { bn_long_division(x, 58, x, r); }
1715
1716
// x = x // 1000, r = x % 1000
1717
// Assumes x is normalized
1718
// Guarantees x is normalized
1719
0
void bn_divmod1000(bignum256 *x, uint32_t *r) {
1720
0
  bn_long_division(x, 1000, x, r);
1721
0
}
1722
1723
// x = x // 10, r = x % 10
1724
// Assumes x is normalized
1725
// Guarantees x is normalized
1726
0
void bn_divmod10(bignum256 *x, uint32_t *r) { bn_long_division(x, 10, x, r); }
1727
1728
// Formats amount
1729
// Assumes amount is normalized
1730
// Assumes prefix and suffix are null-terminated strings
1731
// Assumes output is an array of length output_length
1732
// The function doesn't have neither constant control flow nor constant memory
1733
//   access flow with regard to any its argument
1734
0
size_t bn_format(const bignum256 *amount, const char *prefix, const char *suffix, unsigned int decimals, int exponent, bool trailing, char thousands, char *output, size_t output_length) {
1735
1736
/*
1737
  Python prototype of the function:
1738
1739
  def format(amount, prefix, suffix, decimals, exponent, trailing, thousands):
1740
      if exponent >= 0:
1741
          amount *= 10**exponent
1742
      else:
1743
          amount //= 10 ** (-exponent)
1744
1745
      d = pow(10, decimals)
1746
1747
      integer_part = amount // d
1748
      integer_str = f"{integer_part:,}".replace(",", thousands or "")
1749
1750
      if decimals:
1751
          decimal_part = amount % d
1752
          decimal_str = f".{decimal_part:0{decimals}d}"
1753
          if not trailing:
1754
              decimal_str = decimal_str.rstrip("0").rstrip(".")
1755
      else:
1756
          decimal_str = ""
1757
1758
      return prefix + integer_str + decimal_str + suffix
1759
*/
1760
1761
// Auxiliary macro for bn_format
1762
// If enough space adds one character to output starting from the end
1763
0
#define BN_FORMAT_ADD_OUTPUT_CHAR(c)                               \
1764
0
  {                                                                \
1765
0
    --position;                                                    \
1766
0
    if (output <= position && position < output + output_length) { \
1767
0
      *position = (c);                                             \
1768
0
    } else {                                                       \
1769
0
      memset(output, '\0', output_length);                         \
1770
0
      return 0;                                                    \
1771
0
    }                                                              \
1772
0
  }
1773
1774
0
  bignum256 temp = {0};
1775
0
  bn_copy(amount, &temp);
1776
0
  uint32_t digit = 0;
1777
1778
0
  char *position = output + output_length;
1779
1780
  // Add string ending character
1781
0
  BN_FORMAT_ADD_OUTPUT_CHAR('\0');
1782
1783
  // Add suffix
1784
0
  size_t suffix_length = suffix ? strlen(suffix) : 0;
1785
0
  for (int i = suffix_length - 1; i >= 0; --i)
1786
0
    BN_FORMAT_ADD_OUTPUT_CHAR(suffix[i])
1787
1788
  // amount //= 10**exponent
1789
0
  for (; exponent < 0; ++exponent) {
1790
    // if temp == 0, there is no need to divide it by 10 anymore
1791
0
    if (bn_is_zero(&temp)) {
1792
0
      exponent = 0;
1793
0
      break;
1794
0
    }
1795
0
    bn_divmod10(&temp, &digit);
1796
0
  }
1797
1798
  // exponent >= 0 && decimals >= 0
1799
1800
0
  bool fractional_part = false;  // is fractional-part of amount present
1801
1802
0
  {  // Add fractional-part digits of amount
1803
    // Add trailing zeroes
1804
0
    unsigned int trailing_zeros = decimals < (unsigned int) exponent ? decimals : (unsigned int) exponent;
1805
    // When casting a negative int to unsigned int, UINT_MAX is added to the int before
1806
    // Since exponent >= 0, the value remains unchanged
1807
0
    decimals -= trailing_zeros;
1808
0
    exponent -= trailing_zeros;
1809
1810
0
    if (trailing && trailing_zeros) {
1811
0
      fractional_part = true;
1812
0
      for (; trailing_zeros > 0; --trailing_zeros)
1813
0
          BN_FORMAT_ADD_OUTPUT_CHAR('0')
1814
0
    }
1815
1816
    // exponent == 0 || decimals == 0
1817
1818
    // Add significant digits and leading zeroes
1819
0
    for (; decimals > 0; --decimals) {
1820
0
      bn_divmod10(&temp, &digit);
1821
1822
0
      if (fractional_part || digit || trailing) {
1823
0
        fractional_part = true;
1824
0
        BN_FORMAT_ADD_OUTPUT_CHAR('0' + digit)
1825
0
      }
1826
0
      else if (bn_is_zero(&temp)) {
1827
        // We break since the remaining digits are zeroes and fractional_part == trailing == false
1828
0
        decimals = 0;
1829
0
        break;
1830
0
      }
1831
0
    }
1832
    // decimals == 0
1833
0
  }
1834
1835
0
  if (fractional_part) {
1836
0
    BN_FORMAT_ADD_OUTPUT_CHAR('.')
1837
0
  }
1838
1839
0
  {  // Add integer-part digits of amount
1840
    // Add trailing zeroes
1841
0
    int digits = 0;
1842
0
    if (!bn_is_zero(&temp)) {
1843
0
      for (; exponent > 0; --exponent) {
1844
0
        ++digits;
1845
0
        BN_FORMAT_ADD_OUTPUT_CHAR('0')
1846
0
        if (thousands != 0 && digits % 3 == 0) {
1847
0
          BN_FORMAT_ADD_OUTPUT_CHAR(thousands)
1848
0
        }
1849
0
      }
1850
0
    }
1851
    // decimals == 0 && exponent == 0
1852
1853
    // Add significant digits
1854
0
    bool is_zero = false;
1855
0
    do {
1856
0
      ++digits;
1857
0
      bn_divmod10(&temp, &digit);
1858
0
      is_zero = bn_is_zero(&temp);
1859
0
      BN_FORMAT_ADD_OUTPUT_CHAR('0' + digit)
1860
0
      if (thousands != 0 && !is_zero && digits % 3 == 0) {
1861
0
        BN_FORMAT_ADD_OUTPUT_CHAR(thousands)
1862
0
      }
1863
0
    } while (!is_zero);
1864
0
  }
1865
1866
  // Add prefix
1867
0
  size_t prefix_length = prefix ? strlen(prefix) : 0;
1868
0
  for (int i = prefix_length - 1; i >= 0; --i)
1869
0
    BN_FORMAT_ADD_OUTPUT_CHAR(prefix[i])
1870
1871
  // Move formatted amount to the start of output
1872
0
  int length = output - position + output_length;
1873
0
  memmove(output, position, length);
1874
0
  return length - 1;
1875
0
}
1876
1877
#if USE_BN_PRINT
1878
// Prints x in hexadecimal
1879
// Assumes x is normalized and x < 2**256
1880
void bn_print(const bignum256 *x) {
1881
  printf("%06x", x->val[8]);
1882
  printf("%08x", ((x->val[7] << 3) | (x->val[6] >> 26)));
1883
  printf("%07x", ((x->val[6] << 2) | (x->val[5] >> 27)) & 0x0FFFFFFF);
1884
  printf("%07x", ((x->val[5] << 1) | (x->val[4] >> 28)) & 0x0FFFFFFF);
1885
  printf("%07x", x->val[4] & 0x0FFFFFFF);
1886
  printf("%08x", ((x->val[3] << 3) | (x->val[2] >> 26)));
1887
  printf("%07x", ((x->val[2] << 2) | (x->val[1] >> 27)) & 0x0FFFFFFF);
1888
  printf("%07x", ((x->val[1] << 1) | (x->val[0] >> 28)) & 0x0FFFFFFF);
1889
  printf("%07x", x->val[0] & 0x0FFFFFFF);
1890
}
1891
1892
// Prints comma separated list of limbs of x
1893
void bn_print_raw(const bignum256 *x) {
1894
  for (int i = 0; i < BN_LIMBS - 1; i++) {
1895
    printf("0x%08x, ", x->val[i]);
1896
  }
1897
  printf("0x%08x", x->val[BN_LIMBS - 1]);
1898
}
1899
#endif
1900
1901
#if USE_INVERSE_FAST
1902
83.8k
void bn_inverse(bignum256 *x, const bignum256 *prime) {
1903
83.8k
  bn_inverse_fast(x, prime);
1904
83.8k
}
1905
#else
1906
void bn_inverse(bignum256 *x, const bignum256 *prime) {
1907
  bn_inverse_slow(x, prime);
1908
}
1909
#endif