Coverage Report

Created: 2024-11-21 07:03

/src/trezor-firmware/crypto/bignum.c
Line
Count
Source (jump to first uncovered line)
1
/**
2
 * Copyright (c) 2013-2014 Tomas Dzetkulic
3
 * Copyright (c) 2013-2014 Pavol Rusnak
4
 * Copyright (c)      2015 Jochen Hoenicke
5
 * Copyright (c)      2016 Alex Beregszaszi
6
 *
7
 * Permission is hereby granted, free of charge, to any person obtaining
8
 * a copy of this software and associated documentation files (the "Software"),
9
 * to deal in the Software without restriction, including without limitation
10
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11
 * and/or sell copies of the Software, and to permit persons to whom the
12
 * Software is furnished to do so, subject to the following conditions:
13
 *
14
 * The above copyright notice and this permission notice shall be included
15
 * in all copies or substantial portions of the Software.
16
 *
17
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
21
 * OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23
 * OTHER DEALINGS IN THE SOFTWARE.
24
 */
25
26
#include "bignum.h"
27
28
#include <assert.h>
29
#include <stdint.h>
30
#include <stdio.h>
31
#include <string.h>
32
33
#include "memzero.h"
34
#include "script.h"
35
36
/*
37
 This library implements 256-bit numbers arithmetic.
38
39
 An unsigned 256-bit number is represented by a bignum256 structure, that is an
40
 array of nine 32-bit values called limbs. Limbs are digits of the number in
41
 the base 2**29 representation in the little endian order. This means that
42
   bignum256 x;
43
 represents the value
44
   sum([x[i] * 2**(29*i) for i in range(9)).
45
46
 A limb of a bignum256 is *normalized* iff it's less than 2**29.
47
 A bignum256 is *normalized* iff every its limb is normalized.
48
 A number is *fully reduced modulo p* iff it is less than p.
49
 A number is *partly reduced modulo p* iff is is less than 2*p.
50
 The number p is usually a prime number such that 2^256 - 2^224 <= p <= 2^256.
51
52
 All functions except bn_fast_mod expect that all their bignum256 inputs are
53
 normalized. (The function bn_fast_mod allows the input number to have the
54
 most significant limb unnormalized). All bignum256 outputs of all functions
55
 are guaranteed to be  normalized.
56
57
 A number can be partly reduced with bn_fast_mod, a partly reduced number can
58
 be fully reduced with bn_mod.
59
60
 A function has *constant control flow with regard to its argument* iff the
61
 order in which instructions of the function are executed doesn't depend on the
62
 value of the argument.
63
 A function has *constant memory access flow with regard to its argument* iff
64
 the memory addresses that are acessed and the order in which they are accessed
65
 don't depend on the value of the argument.
66
 A function *has contant control (memory access) flow* iff it has constant
67
 control (memory access) flow with regard to all its arguments.
68
69
 The following function has contant control flow with regard to its arugment
70
 n, however is doesn't have constant memory access flow with regard to it:
71
 void (int n, int *a) }
72
   a[0] = 0;
73
   a[n] = 0; // memory address reveals the value of n
74
 }
75
76
 Unless stated otherwise all functions are supposed to have both constant
77
 control flow and constant memory access flow.
78
 */
79
80
#define BN_MAX_DECIMAL_DIGITS \
81
0
  79  // floor(log(2**(LIMBS * BITS_PER_LIMB), 10)) + 1
82
83
// y = (bignum256) x
84
// Assumes x is normalized and x < 2**261 == 2**(BITS_PER_LIMB * LIMBS)
85
// Guarantees y is normalized
86
3.43M
void bn_copy_lower(const bignum512 *x, bignum256 *y) {
87
34.3M
  for (int i = 0; i < BN_LIMBS; i++) {
88
30.9M
    y->val[i] = x->val[i];
89
30.9M
  }
90
3.43M
}
91
92
// out_number = (bignum256) in_number
93
// Assumes in_number is a raw bigendian 256-bit number
94
// Guarantees out_number is normalized
95
3.65k
void bn_read_be(const uint8_t *in_number, bignum256 *out_number) {
96
3.65k
  uint32_t temp = 0;
97
98
32.8k
  for (int i = 0; i < BN_LIMBS - 1; i++) {
99
29.2k
    uint32_t limb = read_be(in_number + (BN_LIMBS - 2 - i) * 4);
100
101
29.2k
    temp |= limb << (BN_EXTRA_BITS * i);
102
29.2k
    out_number->val[i] = temp & BN_LIMB_MASK;
103
104
29.2k
    temp = limb >> (32 - BN_EXTRA_BITS * (i + 1));
105
29.2k
  }
106
107
3.65k
  out_number->val[BN_LIMBS - 1] = temp;
108
3.65k
}
109
110
// out_number = (bignum512) in_number
111
// Assumes in_number is a raw bigendian 512-bit number
112
// Guarantees out_number is normalized
113
0
void bn_read_be_512(const uint8_t *in_number, bignum512 *out_number) {
114
0
  bignum256 lower = {0}, upper = {0};
115
116
0
  bn_read_be(in_number + 32, &lower);
117
0
  bn_read_be(in_number, &upper);
118
119
0
  const int shift_length = BN_BITS_PER_LIMB * BN_LIMBS - 256;
120
0
  uint32_t shift = upper.val[0] & ((1 << shift_length) - 1);
121
0
  for (int i = 0; i < shift_length; i++) {
122
0
    bn_rshift(&upper);
123
0
  }
124
0
  lower.val[BN_LIMBS - 1] |= shift << (BN_BITS_PER_LIMB - shift_length);
125
126
0
  for (int i = 0; i < BN_LIMBS; i++) {
127
0
    out_number->val[i] = lower.val[i];
128
0
  }
129
0
  for (int i = 0; i < BN_LIMBS; i++) {
130
0
    out_number->val[i + BN_LIMBS] = upper.val[i];
131
0
  }
132
0
}
133
134
// out_number = (256BE) in_number
135
// Assumes in_number < 2**256
136
// Guarantess out_number is a raw bigendian 256-bit number
137
1.95k
void bn_write_be(const bignum256 *in_number, uint8_t *out_number) {
138
1.95k
  uint32_t temp = in_number->val[BN_LIMBS - 1];
139
17.5k
  for (int i = BN_LIMBS - 2; i >= 0; i--) {
140
15.6k
    uint32_t limb = in_number->val[i];
141
142
15.6k
    temp = (temp << (BN_BITS_PER_LIMB - BN_EXTRA_BITS * i)) |
143
15.6k
           (limb >> (BN_EXTRA_BITS * i));
144
15.6k
    write_be(out_number + (BN_LIMBS - 2 - i) * 4, temp);
145
146
15.6k
    temp = limb;
147
15.6k
  }
148
1.95k
}
149
150
// out_number = (bignum256) in_number
151
// Assumes in_number is a raw little endian 256-bit number
152
// Guarantees out_number is normalized
153
0
void bn_read_le(const uint8_t *in_number, bignum256 *out_number) {
154
0
  uint32_t temp = 0;
155
0
  for (int i = 0; i < BN_LIMBS - 1; i++) {
156
0
    uint32_t limb = read_le(in_number + i * 4);
157
158
0
    temp |= limb << (BN_EXTRA_BITS * i);
159
0
    out_number->val[i] = temp & BN_LIMB_MASK;
160
0
    temp = limb >> (32 - BN_EXTRA_BITS * (i + 1));
161
0
  }
162
163
0
  out_number->val[BN_LIMBS - 1] = temp;
164
0
}
165
166
// out_number = (256LE) in_number
167
// Assumes in_number < 2**256
168
// Guarantess out_number is a raw little endian 256-bit number
169
0
void bn_write_le(const bignum256 *in_number, uint8_t *out_number) {
170
0
  uint32_t temp = in_number->val[BN_LIMBS - 1];
171
172
0
  for (int i = BN_LIMBS - 2; i >= 0; i--) {
173
0
    uint32_t limb = in_number->val[i];
174
0
    temp = (temp << (BN_BITS_PER_LIMB - BN_EXTRA_BITS * i)) |
175
0
           (limb >> (BN_EXTRA_BITS * i));
176
0
    write_le(out_number + i * 4, temp);
177
0
    temp = limb;
178
0
  }
179
0
}
180
181
// out_number = (bignum256) in_number
182
// Guarantees out_number is normalized
183
0
void bn_read_uint32(uint32_t in_number, bignum256 *out_number) {
184
0
  out_number->val[0] = in_number & BN_LIMB_MASK;
185
0
  out_number->val[1] = in_number >> BN_BITS_PER_LIMB;
186
0
  for (uint32_t i = 2; i < BN_LIMBS; i++) out_number->val[i] = 0;
187
0
}
188
189
// out_number = (bignum256) in_number
190
// Guarantees out_number is normalized
191
0
void bn_read_uint64(uint64_t in_number, bignum256 *out_number) {
192
0
  out_number->val[0] = in_number & BN_LIMB_MASK;
193
0
  out_number->val[1] = (in_number >>= BN_BITS_PER_LIMB) & BN_LIMB_MASK;
194
0
  out_number->val[2] = in_number >> BN_BITS_PER_LIMB;
195
0
  for (uint32_t i = 3; i < BN_LIMBS; i++) out_number->val[i] = 0;
196
0
}
197
198
// Returns the bitsize of x
199
// Assumes x is normalized
200
// The function doesn't have neither constant control flow nor constant memory
201
//   access flow
202
203
int bn_bitcount(const bignum256 *x) {
203
203
  for (int i = BN_LIMBS - 1; i >= 0; i--) {
204
203
    uint32_t limb = x->val[i];
205
203
    if (limb != 0) {
206
      // __builtin_clz returns the number of leading zero bits starting at the
207
      // most significant bit position
208
203
      return i * BN_BITS_PER_LIMB + (32 - __builtin_clz(limb));
209
203
    }
210
203
  }
211
0
  return 0;
212
203
}
213
214
// Returns the number of decimal digits of x; if x is 0, returns 1
215
// Assumes x is normalized
216
// The function doesn't have neither constant control flow nor constant memory
217
//   access flow
218
0
unsigned int bn_digitcount(const bignum256 *x) {
219
0
  bignum256 val = {0};
220
0
  bn_copy(x, &val);
221
222
0
  unsigned int digits = 1;
223
0
  for (unsigned int i = 0; i < BN_MAX_DECIMAL_DIGITS; i += 3) {
224
0
    uint32_t limb = 0;
225
226
0
    bn_divmod1000(&val, &limb);
227
228
0
    if (limb >= 100) {
229
0
      digits = i + 3;
230
0
    } else if (limb >= 10) {
231
0
      digits = i + 2;
232
0
    } else if (limb >= 1) {
233
0
      digits = i + 1;
234
0
    }
235
0
  }
236
237
0
  memzero(&val, sizeof(val));
238
239
0
  return digits;
240
0
}
241
242
// x = 0
243
// Guarantees x is normalized
244
21.5k
void bn_zero(bignum256 *x) {
245
215k
  for (int i = 0; i < BN_LIMBS; i++) {
246
193k
    x->val[i] = 0;
247
193k
  }
248
21.5k
}
249
250
// x = 1
251
// Guarantees x is normalized
252
10.7k
void bn_one(bignum256 *x) {
253
10.7k
  x->val[0] = 1;
254
96.7k
  for (int i = 1; i < BN_LIMBS; i++) {
255
85.9k
    x->val[i] = 0;
256
85.9k
  }
257
10.7k
}
258
259
// Returns x == 0
260
// Assumes x is normalized
261
33.4k
int bn_is_zero(const bignum256 *x) {
262
33.4k
  uint32_t result = 0;
263
334k
  for (int i = 0; i < BN_LIMBS; i++) {
264
301k
    result |= x->val[i];
265
301k
  }
266
33.4k
  return !result;
267
33.4k
}
268
269
// Returns x == 1
270
// Assumes x is normalized
271
5.60M
int bn_is_one(const bignum256 *x) {
272
5.60M
  uint32_t result = x->val[0] ^ 1;
273
50.4M
  for (int i = 1; i < BN_LIMBS; i++) {
274
44.8M
    result |= x->val[i];
275
44.8M
  }
276
5.60M
  return !result;
277
5.60M
}
278
279
// Returns x < y
280
// Assumes x, y are normalized
281
5.71M
int bn_is_less(const bignum256 *x, const bignum256 *y) {
282
5.71M
  uint32_t res1 = 0;
283
5.71M
  uint32_t res2 = 0;
284
57.1M
  for (int i = BN_LIMBS - 1; i >= 0; i--) {
285
51.4M
    res1 = (res1 << 1) | (x->val[i] < y->val[i]);
286
51.4M
    res2 = (res2 << 1) | (x->val[i] > y->val[i]);
287
51.4M
  }
288
5.71M
  return res1 > res2;
289
5.71M
}
290
291
// Returns x == y
292
// Assumes x, y are normalized
293
87.1k
int bn_is_equal(const bignum256 *x, const bignum256 *y) {
294
87.1k
  uint32_t result = 0;
295
871k
  for (int i = 0; i < BN_LIMBS; i++) {
296
784k
    result |= x->val[i] ^ y->val[i];
297
784k
  }
298
87.1k
  return !result;
299
87.1k
}
300
301
// res = cond if truecase else falsecase
302
// Assumes cond is either 0 or 1
303
// Works properly even if &res == &truecase or &res == &falsecase or
304
//   &truecase == &falsecase or &res == &truecase == &falsecase
305
void bn_cmov(bignum256 *res, volatile uint32_t cond, const bignum256 *truecase,
306
762k
             const bignum256 *falsecase) {
307
  // Intentional use of bitwise OR operator to ensure constant-time
308
762k
  assert((int)(cond == 1) | (int)(cond == 0));
309
310
762k
  uint32_t tmask = -cond;   // tmask = 0xFFFFFFFF if cond else 0x00000000
311
762k
  uint32_t fmask = ~tmask;  // fmask = 0x00000000 if cond else 0xFFFFFFFF
312
313
7.62M
  for (int i = 0; i < BN_LIMBS; i++) {
314
6.86M
    res->val[i] = (truecase->val[i] & tmask) | (falsecase->val[i] & fmask);
315
6.86M
  }
316
762k
}
317
318
// x = -x % prime if cond else x,
319
// Explicitly x = (3 * prime - x if x > prime else 2 * prime - x) if cond else
320
//   else (x if x > prime else x + prime)
321
// Assumes x is normalized and partly reduced
322
// Assumes cond is either 1 or 0
323
// Guarantees x is normalized
324
// Assumes prime is normalized and
325
//   0 < prime < 2**260 == 2**(BITS_PER_LIMB * LIMBS - 1)
326
71.5k
void bn_cnegate(volatile uint32_t cond, bignum256 *x, const bignum256 *prime) {
327
  // Intentional use of bitwise OR operator to ensure constant time
328
71.5k
  assert((int)(cond == 1) | (int)(cond == 0));
329
330
71.5k
  uint32_t tmask = -cond;   // tmask = 0xFFFFFFFF if cond else 0x00000000
331
71.5k
  uint32_t fmask = ~tmask;  // fmask = 0x00000000 if cond else 0xFFFFFFFF
332
333
71.5k
  bn_mod(x, prime);
334
  // x < prime
335
336
71.5k
  uint32_t acc1 = 1;
337
71.5k
  uint32_t acc2 = 0;
338
339
715k
  for (int i = 0; i < BN_LIMBS; i++) {
340
643k
    acc1 += (BN_BASE - 1) + 2 * prime->val[i] - x->val[i];
341
    // acc1 neither overflows 32 bits nor underflows 0
342
    // Proof:
343
    //   acc1 + (BASE - 1) + 2 * prime[i] - x[i]
344
    //     >= (BASE - 1) - x >= (2**BITS_PER_LIMB - 1) - (2**BITS_PER_LIMB - 1)
345
    //     == 0
346
    //   acc1 + (BASE - 1) + 2 * prime[i] - x[i]
347
    //     <= acc1 + (BASE - 1) + 2 * prime[i]
348
    //     <= (2**(32 - BITS_PER_LIMB) - 1) + 2 * (2**BITS_PER_LIMB - 1) +
349
    //       (2**BITS_PER_LIMB - 1)
350
    //     == 7 + 3 * 2**29 < 2**32
351
352
643k
    acc2 += prime->val[i] + x->val[i];
353
    // acc2 doesn't overflow 32 bits
354
    // Proof:
355
    //   acc2 + prime[i] + x[i]
356
    //     <= 2**(32 - BITS_PER_LIMB) - 1 + 2 * (2**BITS_PER_LIMB - 1)
357
    //     == 2**(32 - BITS_PER_LIMB) + 2**(BITS_PER_LIMB + 1) - 2
358
    //     == 2**30 + 5 < 2**32
359
360
    // x = acc1 & LIMB_MASK if cond else acc2 & LIMB_MASK
361
643k
    x->val[i] = ((acc1 & tmask) | (acc2 & fmask)) & BN_LIMB_MASK;
362
363
643k
    acc1 >>= BN_BITS_PER_LIMB;
364
    // acc1 <= 7 == 2**(32 - BITS_PER_LIMB) - 1
365
    // acc1 == 2**(BITS_PER_LIMB * (i + 1)) + 2 * prime[:i + 1] - x[:i + 1]
366
    //   >> BITS_PER_LIMB * (i + 1)
367
368
643k
    acc2 >>= BN_BITS_PER_LIMB;
369
    // acc2 <= 7 == 2**(32 - BITS_PER_LIMB) - 1
370
    // acc2 == prime[:i + 1] + x[:i + 1] >> BITS_PER_LIMB * (i + 1)
371
643k
  }
372
373
  // assert(acc1 == 1); // assert prime <= 2**260
374
  // assert(acc2 == 0);
375
376
  // clang-format off
377
  // acc1 == 1
378
  // Proof:
379
  //   acc1 == 2**(BITS_PER_LIMB * LIMBS) + 2 * prime[:LIMBS] - x[:LIMBS] >> BITS_PER_LIMB * LIMBS
380
  //     == 2**(BITS_PER_LIMB * LIMBS) + 2 * prime - x >> BITS_PER_LIMB * LIMBS
381
  //     <= 2**(BITS_PER_LIMB * LIMBS) + 2 * prime >> BITS_PER_LIMB * LIMBS
382
  //     <= 2**(BITS_PER_LIMB * LIMBS) + 2 * (2**(BITS_PER_LIMB * LIMBS - 1) - 1) >> BITS_PER_LIMB * LIMBS
383
  //     <= 2**(BITS_PER_LIMB * LIMBS) + 2**(BITS_PER_LIMB * LIMBS) - 2 >> BITS_PER_LIMB * LIMBS
384
  //     == 1
385
386
  //   acc1 == 2**(BITS_PER_LIMB * LIMBS) + 2 * prime[:LIMBS] - x[:LIMBS] >> BITS_PER_LIMB * LIMBS
387
  //     == 2**(BITS_PER_LIMB * LIMBS) + 2 * prime - x >> BITS_PER_LIMB * LIMBS
388
  //     >= 2**(BITS_PER_LIMB * LIMBS) + 0 >> BITS_PER_LIMB * LIMBS
389
  //     == 1
390
391
  // acc2 == 0
392
  // Proof:
393
  //   acc2 == prime[:LIMBS] + x[:LIMBS] >> BITS_PER_LIMB * LIMBS
394
  //     == prime + x >> BITS_PER_LIMB * LIMBS
395
  //     <= 2 * prime - 1 >> BITS_PER_LIMB * LIMBS
396
  //     <= 2 * (2**(BITS_PER_LIMB * LIMBS - 1) - 1) - 1 >> 261
397
  //     == 2**(BITS_PER_LIMB * LIMBS) - 3 >> BITS_PER_LIMB * LIMBS
398
  //     == 0
399
  // clang-format on
400
71.5k
}
401
402
// x <<= 1
403
// Assumes x is normalized, x < 2**260 == 2**(LIMBS*BITS_PER_LIMB - 1)
404
// Guarantees x is normalized
405
5.89M
void bn_lshift(bignum256 *x) {
406
53.0M
  for (int i = BN_LIMBS - 1; i > 0; i--) {
407
47.1M
    x->val[i] = ((x->val[i] << 1) & BN_LIMB_MASK) |
408
47.1M
                (x->val[i - 1] >> (BN_BITS_PER_LIMB - 1));
409
47.1M
  }
410
5.89M
  x->val[0] = (x->val[0] << 1) & BN_LIMB_MASK;
411
5.89M
}
412
413
// x >>= 1, i.e. x = floor(x/2)
414
// Assumes x is normalized
415
// Guarantees x is normalized
416
// If x is partly reduced (fully reduced) modulo prime,
417
//   guarantess x will be partly reduced (fully reduced) modulo prime
418
5.60M
void bn_rshift(bignum256 *x) {
419
50.4M
  for (int i = 0; i < BN_LIMBS - 1; i++) {
420
44.8M
    x->val[i] =
421
44.8M
        (x->val[i] >> 1) | ((x->val[i + 1] & 1) << (BN_BITS_PER_LIMB - 1));
422
44.8M
  }
423
5.60M
  x->val[BN_LIMBS - 1] >>= 1;
424
5.60M
}
425
426
// Sets i-th least significant bit (counting from zero)
427
// Assumes x is normalized and 0 <= i < 261 == LIMBS*BITS_PER_LIMB
428
// Guarantees x is normalized
429
// The function has constant control flow but not constant memory access flow
430
//   with regard to i
431
0
void bn_setbit(bignum256 *x, uint16_t i) {
432
0
  assert(i < BN_LIMBS * BN_BITS_PER_LIMB);
433
0
  x->val[i / BN_BITS_PER_LIMB] |= (1u << (i % BN_BITS_PER_LIMB));
434
0
}
435
436
// clears i-th least significant bit (counting from zero)
437
// Assumes x is normalized and 0 <= i < 261 == LIMBS*BITS_PER_LIMB
438
// Guarantees x is normalized
439
// The function has constant control flow but not constant memory access flow
440
//   with regard to i
441
0
void bn_clearbit(bignum256 *x, uint16_t i) {
442
0
  assert(i < BN_LIMBS * BN_BITS_PER_LIMB);
443
0
  x->val[i / BN_BITS_PER_LIMB] &= ~(1u << (i % BN_BITS_PER_LIMB));
444
0
}
445
446
// returns i-th least significant bit (counting from zero)
447
// Assumes x is normalized and 0 <= i < 261 == LIMBS*BITS_PER_LIMB
448
// The function has constant control flow but not constant memory access flow
449
//   with regard to i
450
0
uint32_t bn_testbit(const bignum256 *x, uint16_t i) {
451
0
  assert(i < BN_LIMBS * BN_BITS_PER_LIMB);
452
0
  return (x->val[i / BN_BITS_PER_LIMB] >> (i % BN_BITS_PER_LIMB)) & 1;
453
0
}
454
455
// res = x ^ y
456
// Assumes x, y are normalized
457
// Guarantees res is normalized
458
// Works properly even if &res == &x or &res == &y or &res == &x == &y
459
0
void bn_xor(bignum256 *res, const bignum256 *x, const bignum256 *y) {
460
0
  for (int i = 0; i < BN_LIMBS; i++) {
461
0
    res->val[i] = x->val[i] ^ y->val[i];
462
0
  }
463
0
}
464
465
// x = x / 2 % prime
466
// Explicitly x = x / 2 if is_even(x) else (x + prime) / 2
467
// Assumes x is normalized, x + prime < 261 == LIMBS * BITS_PER_LIMB
468
// Guarantees x is normalized
469
// If x is partly reduced (fully reduced) modulo prime,
470
//   guarantess x will be partly reduced (fully reduced) modulo prime
471
// Assumes prime is an odd number and normalized
472
663k
void bn_mult_half(bignum256 *x, const bignum256 *prime) {
473
  // x = x / 2 if is_even(x) else (x + prime) / 2
474
475
663k
  uint32_t x_is_odd_mask =
476
663k
      -(x->val[0] & 1);  // x_is_odd_mask = 0xFFFFFFFF if is_odd(x) else 0
477
478
663k
  uint32_t acc = (x->val[0] + (prime->val[0] & x_is_odd_mask)) >> 1;
479
  // acc < 2**BITS_PER_LIMB
480
  // Proof:
481
  //   acc == x[0] + prime[0] & x_is_odd_mask >> 1
482
  //     <= (2**(BITS_PER_LIMB) - 1) + (2**(BITS_PER_LIMB) - 1) >> 1
483
  //     == 2**(BITS_PER_LIMB + 1) - 2 >> 1
484
  //     <  2**(BITS_PER_LIMB)
485
486
5.97M
  for (int i = 0; i < BN_LIMBS - 1; i++) {
487
5.31M
    uint32_t temp = (x->val[i + 1] + (prime->val[i + 1] & x_is_odd_mask));
488
    // temp < 2**(BITS_PER_LIMB + 1)
489
    // Proof:
490
    //   temp == x[i + 1] + val[i + 1] & x_is_odd_mask
491
    //     <= (2**(BITS_PER_LIMB) - 1) + (2**(BITS_PER_LIMB) - 1)
492
    //     <  2**(BITS_PER_LIMB + 1)
493
494
5.31M
    acc += (temp & 1) << (BN_BITS_PER_LIMB - 1);
495
    // acc doesn't overflow 32 bits
496
    // Proof:
497
    //   acc + (temp & 1 << BITS_PER_LIMB - 1)
498
    //     <= 2**(BITS_PER_LIMB + 1) + 2**(BITS_PER_LIMB - 1)
499
    //     <= 2**30 + 2**28 < 2**32
500
501
5.31M
    x->val[i] = acc & BN_LIMB_MASK;
502
5.31M
    acc >>= BN_BITS_PER_LIMB;
503
5.31M
    acc += temp >> 1;
504
    // acc < 2**(BITS_PER_LIMB + 1)
505
    // Proof:
506
    //   acc + (temp >> 1)
507
    //     <= (2**(32 - BITS_PER_LIMB) - 1) + (2**(BITS_PER_LIMB + 1) - 1 >> 1)
508
    //     == 7 + 2**(BITS_PER_LIMB) - 1 < 2**(BITS_PER_LIMB + 1)
509
510
    // acc == x[:i+2]+(prime[:i+2] & x_is_odd_mask) >> BITS_PER_LIMB * (i+1)
511
5.31M
  }
512
663k
  x->val[BN_LIMBS - 1] = acc;
513
514
  // assert(acc >> BITS_PER_LIMB == 0);
515
  // acc >> BITS_PER_LIMB == 0
516
  // Proof:
517
  //   acc
518
  //     == x[:LIMBS] + (prime[:LIMBS] & x_is_odd_mask) >> BITS_PER_LIMB*LIMBS
519
  //     == x + (prime & x_is_odd_mask) >> BITS_PER_LIMB * LIMBS
520
  //     <= x + prime >> BITS_PER_LIMB * LIMBS
521
  //     <= 2**(BITS_PER_LIMB * LIMBS) - 1 >> BITS_PER_LIMB * LIMBS
522
  //     == 0
523
663k
}
524
525
// x = x * k % prime
526
// Assumes x is normalized, 0 <= k <= 8 = 2**(32 - BITS_PER_LIMB)
527
// Assumes prime is normalized and 2^256 - 2^224 <= prime <= 2^256
528
// Guarantees x is normalized and partly reduced modulo prime
529
648k
void bn_mult_k(bignum256 *x, uint8_t k, const bignum256 *prime) {
530
648k
  assert(k <= 8);
531
532
6.48M
  for (int i = 0; i < BN_LIMBS; i++) {
533
5.83M
    x->val[i] = k * x->val[i];
534
    // x[i] doesn't overflow 32 bits
535
    // k * x[i] <= 2**(32 - BITS_PER_LIMB) * (2**BITS_PER_LIMB - 1)
536
    //   < 2**(32 - BITS_PER_LIMB) * 2**BITS_PER_LIMB == 2**32
537
5.83M
  }
538
539
648k
  bn_fast_mod(x, prime);
540
648k
}
541
542
// Reduces partly reduced x modulo prime
543
// Explicitly x = x if x < prime else x - prime
544
// Assumes x is partly reduced modulo prime
545
// Guarantees x is fully reduced modulo prime
546
// Assumes prime is nonzero and normalized
547
105k
void bn_mod(bignum256 *x, const bignum256 *prime) {
548
105k
  uint32_t x_less_prime = bn_is_less(x, prime);
549
550
105k
  bignum256 temp = {0};
551
105k
  bn_subtract(x, prime, &temp);
552
105k
  bn_cmov(x, x_less_prime, x, &temp);
553
554
105k
  memzero(&temp, sizeof(temp));
555
105k
}
556
557
// Auxiliary function for bn_multiply
558
// res = k * x
559
// Assumes k and x are normalized
560
// Guarantees res is normalized 18 digit little endian number in base 2**29
561
3.43M
void bn_multiply_long(const bignum256 *k, const bignum256 *x, bignum512 *res) {
562
  // Uses long multiplication in base 2**29, see
563
  // https://en.wikipedia.org/wiki/Multiplication_algorithm#Long_multiplication
564
565
3.43M
  uint64_t acc = 0;
566
567
  // compute lower half
568
34.3M
  for (int i = 0; i < BN_LIMBS; i++) {
569
185M
    for (int j = 0; j <= i; j++) {
570
154M
      acc += k->val[j] * (uint64_t)x->val[i - j];
571
      // acc doesn't overflow 64 bits
572
      // Proof:
573
      //   acc <= acc + sum([k[j] * x[i-j] for j in range(i)])
574
      //     <= (2**(64 - BITS_PER_LIMB) - 1) +
575
      //       LIMBS * (2**BITS_PER_LIMB - 1) * (2**BITS_PER_LIMB - 1)
576
      //     == (2**35 - 1) + 9 * (2**29 - 1) * (2**29 - 1)
577
      //     <= 2**35 + 9 * 2**58 < 2**64
578
154M
    }
579
580
30.9M
    res->val[i] = acc & BN_LIMB_MASK;
581
30.9M
    acc >>= BN_BITS_PER_LIMB;
582
    // acc <= 2**35 - 1 == 2**(64 - BITS_PER_LIMB) - 1
583
30.9M
  }
584
585
  // compute upper half
586
30.9M
  for (int i = BN_LIMBS; i < 2 * BN_LIMBS - 1; i++) {
587
151M
    for (int j = i - BN_LIMBS + 1; j < BN_LIMBS; j++) {
588
123M
      acc += k->val[j] * (uint64_t)x->val[i - j];
589
      // acc doesn't overflow 64 bits
590
      // Proof:
591
      //   acc <= acc + sum([k[j] * x[i-j] for j in range(i)])
592
      //     <= (2**(64 - BITS_PER_LIMB) - 1)
593
      //       LIMBS * (2**BITS_PER_LIMB - 1) * (2**BITS_PER_LIMB - 1)
594
      //     == (2**35 - 1) + 9 * (2**29 - 1) * (2**29 - 1)
595
      //     <= 2**35 + 9 * 2**58 < 2**64
596
123M
    }
597
598
27.4M
    res->val[i] = acc & (BN_BASE - 1);
599
27.4M
    acc >>= BN_BITS_PER_LIMB;
600
    // acc < 2**35 == 2**(64 - BITS_PER_LIMB)
601
27.4M
  }
602
603
3.43M
  res->val[2 * BN_LIMBS - 1] = acc;
604
3.43M
}
605
606
// Auxiliary function for bn_multiply
607
// Assumes 0 <= d <= 8 == LIMBS - 1
608
// Assumes res is normalized and res < 2**(256 + 29*d + 31)
609
// Guarantess res in normalized and res < 2 * prime * 2**(29*d)
610
// Assumes prime is normalized, 2**256 - 2**224 <= prime <= 2**256
611
void bn_multiply_reduce_step(bignum512 *res, const bignum256 *prime,
612
30.9M
                             uint32_t d) {
613
  // clang-format off
614
  // Computes res = res - (res // 2**(256 + BITS_PER_LIMB * d)) * prime * 2**(BITS_PER_LIMB * d)
615
616
  // res - (res // 2**(256 + BITS_PER_LIMB * d)) * prime * 2**(BITS_PER_LIMB * d) < 2 * prime * 2**(BITS_PER_LIMB * d)
617
  // Proof:
618
  //   res - res // (2**(256 + BITS_PER_LIMB * d)) * 2**(BITS_PER_LIMB * d) * prime
619
  //     == res - res // (2**(256 + BITS_PER_LIMB * d)) * 2**(BITS_PER_LIMB * d) * (2**256 - (2**256 - prime))
620
  //     == res - res // (2**(256 + BITS_PER_LIMB * d)) * 2**(BITS_PER_LIMB * d) * 2**256 + res // (2**(256 + BITS_PER_LIMB * d)) * 2**(BITS_PER_LIMB * d) * (2**256 - prime)
621
  //     == (res % 2**(256 + BITS_PER_LIMB * d)) + res // (2**256 + BITS_PER_LIMB * d) * 2**(BITS_PER_LIMB * d) * (2**256 - prime)
622
  //     <= (2**(256 + 29*d + 31) % 2**(256 + 29*d)) + (2**(256 + 29*d + 31) - 1) / (2**256 + 29*d) * 2**(29*d) * (2**256 - prime)
623
  //     <= 2**(256 + 29*d) + 2**(256 + 29*d + 31) / (2**256 + 29*d) * 2**(29*d) * (2**256 - prime)
624
  //     == 2**(256 + 29*d) + 2**31 * 2**(29*d) * (2**256 - prime)
625
  //     == 2**(29*d) * (2**256 + 2**31 * (2*256 - prime))
626
  //     <= 2**(29*d) * (2**256 + 2**31 * 2*224)
627
  //     <= 2**(29*d) * (2**256 + 2**255)
628
  //     <= 2**(29*d) * 2 * (2**256 - 2**224)
629
  //     <= 2 * prime * 2**(29*d)
630
  // clang-format on
631
632
30.9M
  uint32_t coef =
633
30.9M
      (res->val[d + BN_LIMBS - 1] >>
634
30.9M
       (256 - (BN_LIMBS - 1) * BN_BITS_PER_LIMB)) +
635
30.9M
      (res->val[d + BN_LIMBS] << ((BN_LIMBS * BN_BITS_PER_LIMB) - 256));
636
637
  // coef == res // 2**(256 + BITS_PER_LIMB * d)
638
639
  // coef <  2**31
640
  // Proof:
641
  //   coef == res // 2**(256 + BITS_PER_LIMB * d)
642
  //     <  2**(256 + 29 * d + 31) // 2**(256 + 29 * d)
643
  //     == 2**31
644
645
30.9M
  const int shift = 31;
646
30.9M
  uint64_t acc = 1ull << shift;
647
648
309M
  for (int i = 0; i < BN_LIMBS; i++) {
649
278M
    acc += (((uint64_t)(BN_BASE - 1)) << shift) + res->val[d + i] -
650
278M
           prime->val[i] * (uint64_t)coef;
651
    // acc neither overflow 64 bits nor underflow zero
652
    // Proof:
653
    //   acc + ((BASE - 1) << shift) + res[d + i] - prime[i] * coef
654
    //     >= ((BASE - 1) << shift) - prime[i] * coef
655
    //     == 2**shift * (2**BITS_PER_LIMB - 1) - (2**BITS_PER_LIMB - 1) *
656
    //       (2**31 - 1)
657
    //     == (2**shift - 2**31 + 1) * (2**BITS_PER_LIMB - 1)
658
    //     == (2**31 - 2**31 + 1) * (2**29 - 1)
659
    //     == 2**29 - 1 > 0
660
    //   acc + ((BASE - 1) << shift) + res[d + i] - prime[i] * coef
661
    //     <= acc + ((BASE - 1) << shift) + res[d+i]
662
    //     <= (2**(64 - BITS_PER_LIMB) - 1) + 2**shift * (2**BITS_PER_LIMB - 1)
663
    //       + (2*BITS_PER_LIMB - 1)
664
    //     == (2**(64 - BITS_PER_LIMB) - 1) + (2**shift + 1) *
665
    //       (2**BITS_PER_LIMB - 1)
666
    //     == (2**35 - 1) + (2**31 + 1) * (2**29 - 1)
667
    //     <= 2**35 + 2**60 + 2**29 < 2**64
668
669
278M
    res->val[d + i] = acc & BN_LIMB_MASK;
670
278M
    acc >>= BN_BITS_PER_LIMB;
671
    // acc <= 2**(64 - BITS_PER_LIMB) - 1 == 2**35 - 1
672
673
    // acc == (1 << BITS_PER_LIMB * (i + 1) + shift) + res[d : d + i + 1]
674
    //   - coef * prime[:i + 1] >> BITS_PER_LIMB * (i + 1)
675
278M
  }
676
677
  // acc += (((uint64_t)(BASE - 1)) << shift) + res[d + LIMBS];
678
  // acc >>= BITS_PER_LIMB;
679
  // assert(acc <= 1ul << shift);
680
681
  // clang-format off
682
  // acc == 1 << shift
683
  // Proof:
684
  //   acc
685
  //     == (1 << BITS_PER_LIMB * (LIMBS + 1) + shift) + res[d : d + LIMBS + 1] - coef * prime[:LIMBS] >> BITS_PER_LIMB * (LIMBS + 1)
686
  //     == (1 << BITS_PER_LIMB * (LIMBS + 1) + shift) + res[d : d + LIMBS + 1] - coef * prime >> BITS_PER_LIMB * (LIMBS + 1)
687
688
  //     == (1 << BITS_PER_LIMB * (LIMBS + 1) + shift) + (res[d : d + LIMBS + 1] - coef * prime) >> BITS_PER_LIMB * (LIMBS + 1)
689
  //     <= (1 << BITS_PER_LIMB * (LIMBS + 1) + shift) + (res[:d] + BASE**d * res[d : d + LIMBS + 1] - BASE**d * coef * prime)//BASE**d >> BITS_PER_LIMB * (LIMBS + 1)
690
  //     <= (1 << BITS_PER_LIMB * (LIMBS + 1) + shift) + (res - BASE**d * coef * prime) // BASE**d >> BITS_PER_LIMB * (LIMBS + 1)
691
  //     == (1 << BITS_PER_LIMB * (LIMBS + 1) + shift) + (2 * prime * BASE**d) // BASE**d >> BITS_PER_LIMB * (LIMBS + 1)
692
  //     <= (1 << 321) + 2 * 2**256 >> 290
693
  //     == 1 << 31 == 1 << shift
694
695
  //     == (1 << BITS_PER_LIMB * (LIMBS + 1) + shift) + res[d : d + LIMBS + 1] - coef * prime[:LIMBS + 1] >> BITS_PER_LIMB * (LIMBS + 1)
696
  //     >= (1 << BITS_PER_LIMB * (LIMBS + 1) + shift) + 0 >> BITS_PER_LIMB * (LIMBS + 1)
697
  //     == 1 << shift
698
  // clang-format on
699
700
30.9M
  res->val[d + BN_LIMBS] = 0;
701
30.9M
}
702
703
// Partly reduces x
704
// Assumes x in normalized and res < 2**519
705
// Guarantees x is normalized and partly reduced modulo prime
706
// Assumes prime is normalized, 2**256 - 2**224 <= prime <= 2**256
707
3.43M
void bn_reduce(bignum512 *x, const bignum256 *prime) {
708
34.3M
  for (int i = BN_LIMBS - 1; i >= 0; i--) {
709
    // res < 2**(256 + 29*i + 31)
710
    // Proof:
711
    //   if i == LIMBS - 1:
712
    //     res < 2**519
713
    //       == 2**(256 + 29 * 8 + 31)
714
    //       == 2**(256 + 29 * (LIMBS - 1) + 31)
715
    //   else:
716
    //     res < 2 * prime * 2**(29 * (i + 1))
717
    //       <= 2**256 * 2**(29*i + 29) < 2**(256 + 29*i + 31)
718
30.9M
    bn_multiply_reduce_step(x, prime, i);
719
30.9M
  }
720
3.43M
}
721
722
// x = k * x % prime
723
// Assumes k, x are normalized, k * x < 2**519
724
// Guarantees x is normalized and partly reduced modulo prime
725
// Assumes prime is normalized, 2**256 - 2**224 <= prime <= 2**256
726
3.43M
void bn_multiply(const bignum256 *k, bignum256 *x, const bignum256 *prime) {
727
3.43M
  bignum512 res = {0};
728
729
3.43M
  bn_multiply_long(k, x, &res);
730
3.43M
  bn_reduce(&res, prime);
731
3.43M
  bn_copy_lower(&res, x);
732
733
3.43M
  memzero(&res, sizeof(res));
734
3.43M
}
735
736
// Partly reduces x modulo prime
737
// Assumes limbs of x except the last (the most significant) one are normalized
738
// Assumes prime is normalized and 2^256 - 2^224 <= prime <= 2^256
739
// Guarantees x is normalized and partly reduced modulo prime
740
1.74M
void bn_fast_mod(bignum256 *x, const bignum256 *prime) {
741
  // Computes x = x - (x // 2**256) * prime
742
743
  // x < 2**((LIMBS - 1) * BITS_PER_LIMB + 32) == 2**264
744
745
  // x - (x // 2**256) * prime < 2 * prime
746
  // Proof:
747
  //   x - (x // 2**256) * prime
748
  //     == x - (x // 2**256) * (2**256 - (2**256 - prime))
749
  //     == x - ((x // 2**256) * 2**256) + (x // 2**256) * (2**256 - prime)
750
  //     == (x % prime) + (x // 2**256) * (2**256 - prime)
751
  //     <= prime - 1 + (2**264 // 2**256) * (2**256 - prime)
752
  //     <= 2**256 + 2**8 * 2**224 == 2**256 + 2**232
753
  //     <  2 * (2**256 - 2**224)
754
  //     <= 2 * prime
755
756
  // x - (x // 2**256 - 1) * prime < 2 * prime
757
  // Proof:
758
  //   x - (x // 2**256) * prime + prime
759
  //     == x - (x // 2**256) * (2**256 - (2**256 - prime)) + prime
760
  //     == x - ((x//2**256) * 2**256) + (x//2**256) * (2**256 - prime) + prime
761
  //     == (x % prime) + (x // 2**256) * (2**256 - prime) + prime
762
  //     <= 2 * prime - 1 + (2**264 // 2**256) * (2**256 - prime)
763
  //     <= 2 * prime + 2**8 * 2**224 == 2**256 + 2**232 + 2**256 - 2**224
764
  //     <  2 * (2**256 - 2**224)
765
  //     <= 2 * prime
766
767
1.74M
  uint32_t coef =
768
1.74M
      x->val[BN_LIMBS - 1] >> (256 - ((BN_LIMBS - 1) * BN_BITS_PER_LIMB));
769
770
  // clang-format off
771
  // coef == x // 2**256
772
  // 0 <= coef < 2**((LIMBS - 1) * BITS_PER_LIMB + 32 - 256) == 256
773
  // Proof:
774
  //*  Let x[[a : b] be the number consisting of a-th to (b-1)-th bit of the number x.
775
  //   x[LIMBS - 1] >> (256 - ((LIMBS - 1) * BITS_PER_LIMB))
776
  //     == x[[(LIMBS - 1) * BITS_PER_LIMB : (LIMBS - 1) * BITS_PER_LIMB + 32]] >> (256 - ((LIMBS - 1) * BITS_PER_LIMB))
777
  //     == x[[256 - ((LIMBS - 1) * BITS_PER_LIMB) + (LIMBS - 1) * BITS_PER_LIMB : (LIMBS - 1) * BITS_PER_LIMB + 32]]
778
  //     == x[[256 : (LIMBS - 1) * BITS_PER_LIMB + 32]]
779
  //     == x[[256 : 264]] == x // 2**256
780
  // clang-format on
781
782
1.74M
  const int shift = 8;
783
1.74M
  uint64_t acc = 1ull << shift;
784
785
17.4M
  for (int i = 0; i < BN_LIMBS; i++) {
786
15.6M
    acc += (((uint64_t)(BN_BASE - 1)) << shift) + x->val[i] -
787
15.6M
           prime->val[i] * (uint64_t)coef;
788
    // acc neither overflows 64 bits nor underflows 0
789
    // Proof:
790
    //   acc + (BASE - 1 << shift) + x[i] - prime[i] * coef
791
    //     >= (BASE - 1 << shift) - prime[i] * coef
792
    //     >= 2**shift * (2**BITS_PER_LIMB - 1) - (2**BITS_PER_LIMB - 1) * 255
793
    //     == (2**shift - 255) * (2**BITS_PER_LIMB - 1)
794
    //     == (2**8 - 255) * (2**29 - 1) == 2**29 - 1 >= 0
795
    //   acc + (BASE - 1 << shift) + x[i] - prime[i] * coef
796
    //     <= acc + ((BASE - 1) << shift) + x[i]
797
    //     <= (2**(64 - BITS_PER_LIMB) - 1) + 2**shift * (2**BITS_PER_LIMB - 1)
798
    //       + (2**32 - 1)
799
    //     == (2**35 - 1) + 2**8 * (2**29 - 1) + 2**32
800
    //     <  2**35 + 2**37 + 2**32 < 2**64
801
802
15.6M
    x->val[i] = acc & BN_LIMB_MASK;
803
15.6M
    acc >>= BN_BITS_PER_LIMB;
804
    // acc <= 2**(64 - BITS_PER_LIMB) - 1 == 2**35 - 1
805
806
    // acc == (1 << BITS_PER_LIMB * (i + 1) + shift) + x[:i + 1]
807
    //   - coef * prime[:i + 1] >> BITS_PER_LIMB * (i + 1)
808
15.6M
  }
809
810
  // assert(acc == 1 << shift);
811
812
  // clang-format off
813
  // acc == 1 << shift
814
  // Proof:
815
  //   acc
816
  //     == (1 << BITS_PER_LIMB * LIMBS + shift) + x[:LIMBS] - coef * prime[:LIMBS] >> BITS_PER_LIMB * LIMBS
817
  //     == (1 << BITS_PER_LIMB * LIMBS + shift) + (x - coef * prime) >> BITS_PER_LIMB * LIMBS
818
  //     <= (1 << BITS_PER_LIMB * LIMBS + shift) + (2 * prime) >> BITS_PER_LIMB * LIMBS
819
  //     <= (1 << BITS_PER_LIMB * LIMBS + shift) + 2 * 2**256 >> BITS_PER_LIMB * LIMBS
820
  //     <= 2**269 + 2**257 >> 2**261
821
  //     <= 1 << 8 == 1 << shift
822
823
  //   acc
824
  //     == (1 << BITS_PER_LIMB * LIMBS + shift) + x[:LIMBS] - coef * prime[:LIMBS] >> BITS_PER_LIMB * LIMBS
825
  //     >= (1 << BITS_PER_LIMB * LIMBS + shift) + 0 >> BITS_PER_LIMB * LIMBS
826
  //     == (1 << BITS_PER_LIMB * LIMBS + shift) + 0 >> BITS_PER_LIMB * LIMBS
827
  //     <= 1 << 8 == 1 << shift
828
  // clang-format on
829
1.74M
}
830
831
// res = x**e % prime
832
// Assumes both x and e are normalized, x < 2**259
833
// Guarantees res is normalized and partly reduced modulo prime
834
// Works properly even if &x == &res
835
// Assumes prime is normalized, 2**256 - 2**224 <= prime <= 2**256
836
// The function doesn't have neither constant control flow nor constant memory
837
//  access flow with regard to e
838
void bn_power_mod(const bignum256 *x, const bignum256 *e,
839
1
                  const bignum256 *prime, bignum256 *res) {
840
  // Uses iterative right-to-left exponentiation by squaring, see
841
  // https://en.wikipedia.org/wiki/Modular_exponentiation#Right-to-left_binary_method
842
843
1
  bignum256 acc = {0};
844
1
  bn_copy(x, &acc);
845
846
1
  bn_one(res);
847
10
  for (int i = 0; i < BN_LIMBS; i++) {
848
9
    uint32_t limb = e->val[i];
849
850
263
    for (int j = 0; j < BN_BITS_PER_LIMB; j++) {
851
      // Break if the following bits of the last limb are zero
852
255
      if (i == BN_LIMBS - 1 && limb == 0) break;
853
854
254
      if (limb & 1)
855
        // acc * res < 2**519
856
        // Proof:
857
        //   acc * res <= max(2**259 - 1, 2 * prime) * (2 * prime)
858
        //     == max(2**259 - 1, 2**257) * 2**257 < 2**259 * 2**257
859
        //     == 2**516 < 2**519
860
247
        bn_multiply(&acc, res, prime);
861
862
254
      limb >>= 1;
863
      // acc * acc < 2**519
864
      // Proof:
865
      //   acc * acc <= max(2**259 - 1, 2 * prime)**2
866
      //     <= (2**259)**2 == 2**518 < 2**519
867
254
      bn_multiply(&acc, &acc, prime);
868
254
    }
869
    // acc == x**(e[:i + 1]) % prime
870
9
  }
871
872
1
  memzero(&acc, sizeof(acc));
873
1
}
874
875
// x = sqrt(x) % prime
876
// Explicitly x = x**((prime+1)/4) % prime
877
// The other root is -sqrt(x)
878
// Assumes x is normalized, x < 2**259 and quadratic residuum mod prime
879
// Assumes prime is a prime number, prime % 4 == 3, it is normalized and
880
//   2**256 - 2**224 <= prime <= 2**256
881
// Guarantees x is normalized and fully reduced modulo prime
882
// The function doesn't have neither constant control flow nor constant memory
883
//  access flow with regard to prime
884
1
void bn_sqrt(bignum256 *x, const bignum256 *prime) {
885
  // Uses the Lagrange formula for the primes of the special form, see
886
  // http://en.wikipedia.org/wiki/Quadratic_residue#Prime_or_prime_power_modulus
887
  // If prime % 4 == 3, then sqrt(x) % prime == x**((prime+1)//4) % prime
888
889
1
  assert(prime->val[0] % 4 == 3);
890
891
  // e = (prime + 1) // 4
892
1
  bignum256 e = {0};
893
1
  bn_copy(prime, &e);
894
1
  bn_addi(&e, 1);
895
1
  bn_rshift(&e);
896
1
  bn_rshift(&e);
897
898
1
  bn_power_mod(x, &e, prime, x);
899
1
  bn_mod(x, prime);
900
901
1
  memzero(&e, sizeof(e));
902
1
}
903
904
// a = 1/a % 2**n
905
// Assumes a is odd, 1 <= n <= 32
906
// The function doesn't have neither constant control flow nor constant memory
907
//   access flow with regard to n
908
193k
uint32_t inverse_mod_power_two(uint32_t a, uint32_t n) {
909
  // Uses "Explicit Quadratic Modular inverse modulo 2" from section 3.3 of "On
910
  // Newton-Raphson iteration for multiplicative inverses modulo prime powers"
911
  // by Jean-Guillaume Dumas, see
912
  // https://arxiv.org/pdf/1209.6626.pdf
913
914
  // 1/a % 2**n
915
  //   = (2-a) * product([1 + (a-1)**(2**i) for i in range(1, floor(log2(n)))])
916
917
193k
  uint32_t acc = 2 - a;
918
193k
  uint32_t f = a - 1;
919
920
  // mask = (1 << n) - 1
921
193k
  uint32_t mask = n == 32 ? 0xFFFFFFFF : (1u << n) - 1;
922
923
1.16M
  for (uint32_t i = 1; i < n; i <<= 1) {
924
967k
    f = (f * f) & mask;
925
967k
    acc = (acc * (1 + f)) & mask;
926
967k
  }
927
928
193k
  return acc;
929
193k
}
930
931
// x = (x / 2**BITS_PER_LIMB) % prime
932
// Assumes both x and prime are normalized
933
// Assumes prime is an odd number and normalized
934
// Guarantees x is normalized
935
// If x is partly reduced (fully reduced) modulo prime,
936
//   guarantess x will be partly reduced (fully reduced) modulo prime
937
193k
void bn_divide_base(bignum256 *x, const bignum256 *prime) {
938
  // Uses an explicit formula for the modular inverse of power of two
939
  // (x / 2**n) % prime == (x + ((-x / prime) % 2**n) * prime) // 2**n
940
  // Proof:
941
  //   (x + ((-x / prime) % 2**n) * prime) % 2**n
942
  //     == (x - x / prime * prime) % 2**n
943
  //     == 0
944
  //   (x + ((-1 / prime) % 2**n) * prime) % prime
945
  //     == x
946
  //   if x < prime:
947
  //     (x + ((-x / prime) % 2**n) * prime) // 2**n
948
  //       <= ((prime - 1) + (2**n - 1) * prime) / 2**n
949
  //       == (2**n * prime - 1) / 2**n == prime - 1 / 2**n < prime
950
  //   if x < 2 * prime:
951
  //     (x + ((-x / prime) % 2**n) * prime) // 2**n
952
  //       <= ((2 * prime - 1) + (2**n - 1) * prime) / 2**n
953
  //       == (2**n * prime + prime - 1) / 2**n
954
  //       == prime + (prime - 1) / 2**n < 2 * prime
955
956
  // m = (-x / prime) % 2**BITS_PER_LIMB
957
193k
  uint32_t m = (x->val[0] * (BN_BASE - inverse_mod_power_two(
958
193k
                                           prime->val[0], BN_BITS_PER_LIMB))) &
959
193k
               BN_LIMB_MASK;
960
  // m < 2**BITS_PER_LIMB
961
962
193k
  uint64_t acc = x->val[0] + (uint64_t)m * prime->val[0];
963
193k
  acc >>= BN_BITS_PER_LIMB;
964
965
1.74M
  for (int i = 1; i < BN_LIMBS; i++) {
966
1.54M
    acc = acc + x->val[i] + (uint64_t)m * prime->val[i];
967
    // acc does not overflow 64 bits
968
    // acc == acc + x + m * prime
969
    //    <= 2**(64 - BITS_PER_LIMB) + 2**(BITS_PER_LIMB)
970
    //      2**(BITS_PER_LIMB) * 2**(BITS_PER_LIMB)
971
    //    <= 2**(2 * BITS_PER_LIMB) + 2**(64 - BITS_PER_LIMB) +
972
    //      2**(BITS_PER_LIMB)
973
    //    <= 2**58 + 2**35 + 2**29 < 2**64
974
975
1.54M
    x->val[i - 1] = acc & BN_LIMB_MASK;
976
1.54M
    acc >>= BN_BITS_PER_LIMB;
977
    // acc < 2**35 == 2**(64 - BITS_PER_LIMB)
978
979
    // acc == x[:i + 1] + m * prime[:i + 1] >> BITS_PER_LIMB * (i + 1)
980
1.54M
  }
981
982
193k
  x->val[BN_LIMBS - 1] = acc;
983
984
193k
  assert(acc >> BN_BITS_PER_LIMB == 0);
985
986
  // clang-format off
987
  // acc >> BITS_PER_LIMB == 0
988
  // Proof:
989
  //   acc >> BITS_PER_LIMB
990
  //     == (x[:LIMB] + m * prime[:LIMB] >> BITS_PER_LIMB * LIMBS) >> BITS_PER_LIMB * (LIMBS + 1)
991
  //     == x + m * prime >> BITS_PER_LIMB * (LIMBS + 1)
992
  //     <= (2**(BITS_PER_LIMB * LIMBS) - 1) + (2**BITS_PER_LIMB - 1) * (2**(BITS_PER_LIMB * LIMBS) - 1) >> BITS_PER_LIMB * (LIMBS + 1)
993
  //     == 2**(BITS_PER_LIMB * LIMBS) - 1 + 2**(BITS_PER_LIMB * (LIMBS + 1)) - 2**(BITS_PER_LIMB * LIMBS) - 2**BITS_PER_LIMB + 1 >> BITS_PER_LIMB * (LIMBS + 1)
994
  //     == 2**(BITS_PER_LIMB * (LIMBS + 1)) - 2**BITS_PER_LIMB >> BITS_PER_LIMB * (LIMBS + 1)
995
  //     == 0
996
  // clang-format on
997
193k
}
998
999
#if !USE_INVERSE_FAST
1000
// x = 1/x % prime if x != 0 else 0
1001
// Assumes x is normalized
1002
// Assumes prime is a prime number
1003
// Guarantees x is normalized and fully reduced modulo prime
1004
// Assumes prime is normalized, 2**256 - 2**224 <= prime <= 2**256
1005
// The function doesn't have neither constant control flow nor constant memory
1006
//   access flow with regard to prime
1007
static void bn_inverse_slow(bignum256 *x, const bignum256 *prime) {
1008
  // Uses formula 1/x % prime == x**(prime - 2) % prime
1009
  // See https://en.wikipedia.org/wiki/Fermat%27s_little_theorem
1010
1011
  bn_fast_mod(x, prime);
1012
1013
  // e = prime - 2
1014
  bignum256 e = {0};
1015
  bn_read_uint32(2, &e);
1016
  bn_subtract(prime, &e, &e);
1017
1018
  bn_power_mod(x, &e, prime, x);
1019
  bn_mod(x, prime);
1020
1021
  memzero(&e, sizeof(e));
1022
}
1023
#endif
1024
1025
#if false
1026
// x = 1/x % prime if x != 0 else 0
1027
// Assumes x is is_normalized
1028
// Assumes GCD(x, prime) = 1
1029
// Guarantees x is normalized and fully reduced modulo prime
1030
// Assumes prime is odd, normalized, 2**256 - 2**224 <= prime <= 2**256
1031
// The function doesn't have neither constant control flow nor constant memory
1032
//   access flow with regard to prime and x
1033
static void bn_inverse_fast(bignum256 *x, const bignum256 *prime) {
1034
  // "The Almost Montgomery Inverse" from the section 3 of "Constant Time
1035
  // Modular Inversion" by Joppe W. Bos
1036
  // See http://www.joppebos.com/files/CTInversion.pdf
1037
1038
  /*
1039
    u = prime
1040
    v = x & prime
1041
    s = 1
1042
    r = 0
1043
1044
    k = 0
1045
    while v != 1:
1046
      k += 1
1047
      if is_even(u):
1048
        u = u // 2
1049
        s = 2 * s
1050
      elif is_even(v):
1051
        v = v // 2
1052
        r = 2 * r
1053
      elif v < u:
1054
        u = (u - v) // 2
1055
        r = r + s
1056
        s = 2 * s
1057
      else:
1058
        v = (v - u) // 2
1059
        s = r + s
1060
        r = 2 * r
1061
1062
    s = (s / 2**k) % prime
1063
    return s
1064
  */
1065
1066
  if (bn_is_zero(x)) return;
1067
1068
  bn_fast_mod(x, prime);
1069
  bn_mod(x, prime);
1070
1071
  bignum256 u = {0}, v = {0}, r = {0}, s = {0};
1072
  bn_copy(prime, &u);
1073
  bn_copy(x, &v);
1074
  bn_one(&s);
1075
  bn_zero(&r);
1076
1077
  int k = 0;
1078
  while (!bn_is_one(&v)) {
1079
    if ((u.val[0] & 1) == 0) {
1080
      bn_rshift(&u);
1081
      bn_lshift(&s);
1082
    } else if ((v.val[0] & 1) == 0) {
1083
      bn_rshift(&v);
1084
      bn_lshift(&r);
1085
    } else if (bn_is_less(&v, &u)) {
1086
      bn_subtract(&u, &v, &u);
1087
      bn_rshift(&u);
1088
      bn_add(&r, &s);
1089
      bn_lshift(&s);
1090
    } else {
1091
      bn_subtract(&v, &u, &v);
1092
      bn_rshift(&v);
1093
      bn_add(&s, &r);
1094
      bn_lshift(&r);
1095
    }
1096
    k += 1;
1097
    assert(!bn_is_zero(&v)); // assert GCD(x, prime) == 1
1098
  }
1099
1100
  // s = s / 2**(k // BITS_PER_LIMB * BITS_PER_LIMB)
1101
  for (int i = 0; i < k / BITS_PER_LIMB; i++) {
1102
    bn_divide_base(&s, prime);
1103
  }
1104
1105
  // s = s / 2**(k % BITS_PER_LIMB)
1106
  for (int i = 0; i < k % BN_BITS_PER_LIMB; i++) {
1107
    bn_mult_half(&s, prime);
1108
  }
1109
1110
  bn_copy(&s, x);
1111
1112
  memzero(&u, sizeof(u));
1113
  memzero(&v, sizeof(v));
1114
  memzero(&r, sizeof(r));
1115
  memzero(&s, sizeof(s));
1116
}
1117
#endif
1118
1119
#if USE_INVERSE_FAST
1120
// x = 1/x % prime if x != 0 else 0
1121
// Assumes x is is_normalized
1122
// Assumes GCD(x, prime) = 1
1123
// Guarantees x is normalized and fully reduced modulo prime
1124
// Assumes prime is odd, normalized, 2**256 - 2**224 <= prime <= 2**256
1125
// The function has constant control flow but not constant memory access flow
1126
//   with regard to prime and x
1127
10.7k
static void bn_inverse_fast(bignum256 *x, const bignum256 *prime) {
1128
  // Custom constant time version of "The Almost Montgomery Inverse" from the
1129
  // section 3 of "Constant Time Modular Inversion" by Joppe W. Bos
1130
  // See http://www.joppebos.com/files/CTInversion.pdf
1131
1132
  /*
1133
    u = prime
1134
    v = x % prime
1135
    s = 1
1136
    r = 0
1137
1138
    k = 0
1139
    while v != 1:
1140
      k += 1
1141
      if is_even(u): # b1
1142
        u = u // 2
1143
        s = 2 * s
1144
      elif is_even(v): # b2
1145
        v = v // 2
1146
        r = 2 * r
1147
      elif v < u: # b3
1148
        u = (u - v) // 2
1149
        r = r + s
1150
        s = 2 * s
1151
      else: # b4
1152
        v = (v - u) // 2
1153
        s = r + s
1154
        r = 2 * r
1155
1156
    s = (s / 2**k) % prime
1157
    return s
1158
  */
1159
1160
10.7k
  bn_fast_mod(x, prime);
1161
10.7k
  bn_mod(x, prime);
1162
1163
10.7k
  bignum256 u = {0}, v = {0}, r = {0}, s = {0};
1164
10.7k
  bn_copy(prime, &u);
1165
10.7k
  bn_copy(x, &v);
1166
10.7k
  bn_one(&s);
1167
10.7k
  bn_zero(&r);
1168
1169
10.7k
  bignum256 zero = {0};
1170
10.7k
  bn_zero(&zero);
1171
1172
10.7k
  int k = 0;
1173
1174
10.7k
  int finished = 0, u_even = 0, v_even = 0, v_less_u = 0, b1 = 0, b2 = 0,
1175
10.7k
      b3 = 0, b4 = 0;
1176
10.7k
  finished = 0;
1177
1178
5.62M
  for (int i = 0; i < 2 * BN_LIMBS * BN_BITS_PER_LIMB; i++) {
1179
5.60M
    finished = finished | -bn_is_one(&v);
1180
5.60M
    u_even = -bn_is_even(&u);
1181
5.60M
    v_even = -bn_is_even(&v);
1182
5.60M
    v_less_u = -bn_is_less(&v, &u);
1183
1184
5.60M
    b1 = ~finished & u_even;
1185
5.60M
    b2 = ~finished & ~b1 & v_even;
1186
5.60M
    b3 = ~finished & ~b1 & ~b2 & v_less_u;
1187
5.60M
    b4 = ~finished & ~b1 & ~b2 & ~b3;
1188
1189
// The ternary operator for pointers with constant control flow
1190
// BN_INVERSE_FAST_TERNARY(c, t, f) = t if c else f
1191
// Very nasty hack, sorry for that
1192
5.60M
#define BN_INVERSE_FAST_TERNARY(c, t, f) \
1193
39.2M
  ((void *)(((c) & (uintptr_t)(t)) | (~(c) & (uintptr_t)(f))))
1194
1195
5.60M
    bn_subtract(BN_INVERSE_FAST_TERNARY(b3, &u, &v),
1196
5.60M
                BN_INVERSE_FAST_TERNARY(
1197
5.60M
                    b3 | b4, BN_INVERSE_FAST_TERNARY(b3, &v, &u), &zero),
1198
5.60M
                BN_INVERSE_FAST_TERNARY(b3, &u, &v));
1199
1200
5.60M
    bn_add(BN_INVERSE_FAST_TERNARY(b3, &r, &s),
1201
5.60M
           BN_INVERSE_FAST_TERNARY(b3 | b4, BN_INVERSE_FAST_TERNARY(b3, &s, &r),
1202
5.60M
                                   &zero));
1203
5.60M
    bn_rshift(BN_INVERSE_FAST_TERNARY(b1 | b3, &u, &v));
1204
5.60M
    bn_lshift(BN_INVERSE_FAST_TERNARY(b1 | b3, &s, &r));
1205
1206
5.60M
    k = k - ~finished;
1207
5.60M
  }
1208
1209
  // s = s / 2**(k // BITS_PER_LIMB * BITS_PER_LIMB)
1210
204k
  for (int i = 0; i < 2 * BN_LIMBS; i++) {
1211
    // s = s / 2**BITS_PER_LIMB % prime if i < k // BITS_PER_LIMB else s
1212
193k
    bn_copy(&s, &r);
1213
193k
    bn_divide_base(&r, prime);
1214
193k
    bn_cmov(&s, i < k / BN_BITS_PER_LIMB, &r, &s);
1215
193k
  }
1216
1217
  // s = s / 2**(k % BITS_PER_LIMB)
1218
322k
  for (int i = 0; i < BN_BITS_PER_LIMB; i++) {
1219
    // s = s / 2 % prime if i < k % BITS_PER_LIMB else s
1220
311k
    bn_copy(&s, &r);
1221
311k
    bn_mult_half(&r, prime);
1222
311k
    bn_cmov(&s, i < k % BN_BITS_PER_LIMB, &r, &s);
1223
311k
  }
1224
1225
10.7k
  bn_cmov(x, bn_is_zero(x), x, &s);
1226
1227
10.7k
  memzero(&u, sizeof(u));
1228
10.7k
  memzero(&v, sizeof(v));
1229
10.7k
  memzero(&r, sizeof(s));
1230
10.7k
  memzero(&s, sizeof(s));
1231
10.7k
}
1232
#endif
1233
1234
#if false
1235
// x = 1/x % prime if x != 0 else 0
1236
// Assumes x is is_normalized
1237
// Assumes GCD(x, prime) = 1
1238
// Guarantees x is normalized and fully reduced modulo prime
1239
// Assumes prime is odd, normalized, 2**256 - 2**224 <= prime <= 2**256
1240
static void bn_inverse_fast(bignum256 *x, const bignum256 *prime) {
1241
  // Custom constant time version of "The Almost Montgomery Inverse" from the
1242
  // section 3 of "Constant Time Modular Inversion" by Joppe W. Bos
1243
  // See http://www.joppebos.com/files/CTInversion.pdf
1244
1245
  /*
1246
    u = prime
1247
    v = x % prime
1248
    s = 1
1249
    r = 0
1250
1251
    k = 0
1252
    while v != 1:
1253
      k += 1
1254
      if is_even(u): # b1
1255
        u = u // 2
1256
        s = 2 * s
1257
      elif is_even(v): # b2
1258
        v = v // 2
1259
        r = 2 * r
1260
      elif v < u: # b3
1261
        u = (u - v) // 2
1262
        r = r + s
1263
        s = 2 * s
1264
      else: # b4
1265
        v = (v - u) // 2
1266
        s = r + s
1267
        r = 2 * r
1268
1269
    s = (s / 2**k) % prime
1270
    return s
1271
  */
1272
1273
  bn_fast_mod(x, prime);
1274
  bn_mod(x, prime);
1275
1276
  bignum256 u = {0}, v = {0}, r = {0}, s = {0};
1277
  bn_copy(prime, &u);
1278
  bn_copy(x, &v);
1279
  bn_one(&s);
1280
  bn_zero(&r);
1281
1282
  bignum256 zero = {0};
1283
  bn_zero(&zero);
1284
1285
  int k = 0;
1286
1287
  uint32_t finished = 0, u_even = 0, v_even = 0, v_less_u = 0, b1 = 0, b2 = 0,
1288
      b3 = 0, b4 = 0;
1289
  finished = 0;
1290
1291
  bignum256 u_half = {0}, v_half = {0}, u_minus_v_half = {0}, v_minus_u_half = {0}, r_plus_s = {0}, r_twice = {0}, s_twice = {0};
1292
  for (int i = 0; i < 2 * BN_LIMBS * BN_BITS_PER_LIMB; i++) {
1293
    finished = finished | bn_is_one(&v);
1294
    u_even = bn_is_even(&u);
1295
    v_even = bn_is_even(&v);
1296
    v_less_u = bn_is_less(&v, &u);
1297
1298
    b1 = (finished ^ 1) & u_even;
1299
    b2 = (finished ^ 1) & (b1 ^ 1) & v_even;
1300
    b3 = (finished ^ 1) & (b1 ^ 1) & (b2 ^ 1) & v_less_u;
1301
    b4 = (finished ^ 1) & (b1 ^ 1) & (b2 ^ 1) & (b3 ^ 1);
1302
1303
    // u_half = u // 2
1304
    bn_copy(&u, &u_half);
1305
    bn_rshift(&u_half);
1306
1307
    // v_half = v // 2
1308
    bn_copy(&v, &v_half);
1309
    bn_rshift(&v_half);
1310
1311
    // u_minus_v_half  = (u - v) // 2
1312
    bn_subtract(&u, &v, &u_minus_v_half);
1313
    bn_rshift(&u_minus_v_half);
1314
1315
    // v_minus_u_half  = (v - u) // 2
1316
    bn_subtract(&v, &u, &v_minus_u_half);
1317
    bn_rshift(&v_minus_u_half);
1318
1319
    // r_plus_s = r + s
1320
    bn_copy(&r, &r_plus_s);
1321
    bn_add(&r_plus_s, &s);
1322
1323
    // r_twice = 2 * r
1324
    bn_copy(&r, &r_twice);
1325
    bn_lshift(&r_twice);
1326
1327
    // s_twice = 2 * s
1328
    bn_copy(&s, &s_twice);
1329
    bn_lshift(&s_twice);
1330
1331
    bn_cmov(&u, b1, &u_half, &u);
1332
    bn_cmov(&u, b3, &u_minus_v_half, &u);
1333
1334
    bn_cmov(&v, b2, &v_half, &v);
1335
    bn_cmov(&v, b4, &v_minus_u_half, &v);
1336
1337
    bn_cmov(&r, b2 | b4, &r_twice, &r);
1338
    bn_cmov(&r, b3, &r_plus_s, &r);
1339
1340
    bn_cmov(&s, b1 | b3, &s_twice, &s);
1341
    bn_cmov(&s, b4, &r_plus_s, &s);
1342
1343
    k = k + (finished ^ 1);
1344
  }
1345
1346
  // s = s / 2**(k // BITS_PER_LIMB * BITS_PER_LIMB)
1347
  for (int i = 0; i < 2 * BN_LIMBS; i++) {
1348
    // s = s / 2**BITS_PER_LIMB % prime if i < k // BITS_PER_LIMB else s
1349
    bn_copy(&s, &r);
1350
    bn_divide_base(&r, prime);
1351
    bn_cmov(&s, i < k / BITS_PER_LIMB, &r, &s);
1352
  }
1353
1354
  // s = s / 2**(k % BITS_PER_LIMB)
1355
  for (int i = 0; i < BN_BITS_PER_LIMB; i++) {
1356
    // s = s / 2 % prime if i < k % BITS_PER_LIMB else s
1357
    bn_copy(&s, &r);
1358
    bn_mult_half(&r, prime);
1359
    bn_cmov(&s, i < k % BN_BITS_PER_LIMB, &r, &s);
1360
  }
1361
1362
  bn_cmov(x, bn_is_zero(x), x, &s);
1363
1364
  memzero(&u, sizeof(u));
1365
  memzero(&v, sizeof(v));
1366
  memzero(&r, sizeof(r));
1367
  memzero(&s, sizeof(s));
1368
  memzero(&u_half, sizeof(u_half));
1369
  memzero(&v_half, sizeof(v_half));
1370
  memzero(&u_minus_v_half, sizeof(u_minus_v_half));
1371
  memzero(&v_minus_u_half, sizeof(v_minus_u_half));
1372
  memzero(&r_twice, sizeof(r_twice));
1373
  memzero(&s_twice, sizeof(s_twice));
1374
  memzero(&r_plus_s, sizeof(r_plus_s));
1375
}
1376
#endif
1377
1378
// Normalizes x
1379
// Assumes x < 2**261 == 2**(LIMBS * BITS_PER_LIMB)
1380
// Guarantees x is normalized
1381
0
void bn_normalize(bignum256 *x) {
1382
0
  uint32_t acc = 0;
1383
1384
0
  for (int i = 0; i < BN_LIMBS; i++) {
1385
0
    acc += x->val[i];
1386
    // acc doesn't overflow 32 bits
1387
    // Proof:
1388
    //   acc + x[i]
1389
    //      <= (2**(32 - BITS_PER_LIMB) - 1) + (2**BITS_PER_LIMB - 1)
1390
    //      == 7 + 2**29 - 1 < 2**32
1391
1392
0
    x->val[i] = acc & BN_LIMB_MASK;
1393
0
    acc >>= (BN_BITS_PER_LIMB);
1394
    // acc <= 7 == 2**(32 - BITS_PER_LIMB) - 1
1395
0
  }
1396
0
}
1397
1398
// x = x + y
1399
// Assumes x, y are normalized, x + y < 2**(LIMBS*BITS_PER_LIMB) == 2**261
1400
// Guarantees x is normalized
1401
// Works properly even if &x == &y
1402
5.75M
void bn_add(bignum256 *x, const bignum256 *y) {
1403
5.75M
  uint32_t acc = 0;
1404
57.5M
  for (int i = 0; i < BN_LIMBS; i++) {
1405
51.7M
    acc += x->val[i] + y->val[i];
1406
    // acc doesn't overflow 32 bits
1407
    // Proof:
1408
    //   acc + x[i] + y[i]
1409
    //     <= (2**(32 - BITS_PER_LIMB) - 1) + 2 * (2**BITS_PER_LIMB - 1)
1410
    //     == (2**(32 - BITS_PER_LIMB) - 1) + 2**(BITS_PER_LIMB + 1) - 2
1411
    //     == 7 + 2**30 - 2 < 2**32
1412
1413
51.7M
    x->val[i] = acc & BN_LIMB_MASK;
1414
51.7M
    acc >>= BN_BITS_PER_LIMB;
1415
    // acc <= 7 == 2**(32 - BITS_PER_LIMB) - 1
1416
1417
    // acc == x[:i + 1] + y[:i + 1] >> BITS_PER_LIMB * (i + 1)
1418
51.7M
  }
1419
1420
  // assert(acc == 0); // assert x + y < 2**261
1421
  // acc == 0
1422
  // Proof:
1423
  //   acc == x[:LIMBS] + y[:LIMBS] >> LIMBS * BITS_PER_LIMB
1424
  //     == x + y >> LIMBS * BITS_PER_LIMB
1425
  //     <= 2**(LIMBS * BITS_PER_LIMB) - 1 >> LIMBS * BITS_PER_LIMB == 0
1426
5.75M
}
1427
1428
// x = x + y % prime
1429
// Assumes x, y are normalized
1430
// Guarantees x is normalized and partly reduced modulo prime
1431
// Assumes prime is normalized and 2^256 - 2^224 <= prime <= 2^256
1432
8.35k
void bn_addmod(bignum256 *x, const bignum256 *y, const bignum256 *prime) {
1433
83.5k
  for (int i = 0; i < BN_LIMBS; i++) {
1434
75.1k
    x->val[i] += y->val[i];
1435
    // x[i] doesn't overflow 32 bits
1436
    // Proof:
1437
    //   x[i] + y[i]
1438
    //     <= 2 * (2**BITS_PER_LIMB - 1)
1439
    //     == 2**30 - 2 < 2**32
1440
75.1k
  }
1441
1442
8.35k
  bn_fast_mod(x, prime);
1443
8.35k
}
1444
1445
// x = x + y
1446
// Assumes x is normalized
1447
// Assumes y <= 2**32 - 2**29 == 2**32 - 2**BITS_PER_LIMB and
1448
//   x + y < 2**261 == 2**(LIMBS * BITS_PER_LIMB)
1449
// Guarantees x is normalized
1450
1
void bn_addi(bignum256 *x, uint32_t y) {
1451
  // assert(y <= 3758096384); // assert y <= 2**32 - 2**29
1452
1
  uint32_t acc = y;
1453
1454
10
  for (int i = 0; i < BN_LIMBS; i++) {
1455
9
    acc += x->val[i];
1456
    // acc doesn't overflow 32 bits
1457
    // Proof:
1458
    //   if i == 0:
1459
    //     acc + x[i] == y + x[0]
1460
    //       <= (2**32 - 2**BITS_PER_LIMB) + (2**BITS_PER_LIMB - 1)
1461
    //       == 2**32 - 1 < 2**32
1462
    //   else:
1463
    //     acc + x[i]
1464
    //       <= (2**(32 - BITS_PER_LIMB) - 1) + (2**BITS_PER_LIMB - 1)
1465
    //       == 7 + 2**29 - 1 < 2**32
1466
1467
9
    x->val[i] = acc & BN_LIMB_MASK;
1468
9
    acc >>= (BN_BITS_PER_LIMB);
1469
    // acc <= 7 == 2**(32 - BITS_PER_LIMB) - 1
1470
1471
    // acc == x[:i + 1] + y >> BITS_PER_LIMB * (i + 1)
1472
9
  }
1473
1474
  // assert(acc == 0); // assert x + y < 2**261
1475
  // acc == 0
1476
  // Proof:
1477
  //   acc == x[:LIMBS] + y << LIMBS * BITS_PER_LIMB
1478
  //     == x + y << LIMBS * BITS_PER_LIMB
1479
  //     <= 2**(LIMBS + BITS_PER_LIMB) - 1 << LIMBS * BITS_PER_LIMB
1480
  //     == 0
1481
1
}
1482
1483
// x = x - y % prime
1484
// Explicitly x = x + prime - y
1485
// Assumes x, y are normalized
1486
// Assumes y < prime[0], x + prime - y < 2**261 == 2**(LIMBS * BITS_PER_LIMB)
1487
// Guarantees x is normalized
1488
// If x is fully reduced modulo prime,
1489
//   guarantess x will be partly reduced modulo prime
1490
// Assumes prime is nonzero and normalized
1491
1.44k
void bn_subi(bignum256 *x, uint32_t y, const bignum256 *prime) {
1492
1.44k
  assert(y < prime->val[0]);
1493
1494
  // x = x + prime - y
1495
1496
1.44k
  uint32_t acc = -y;
1497
14.4k
  for (int i = 0; i < BN_LIMBS; i++) {
1498
12.9k
    acc += x->val[i] + prime->val[i];
1499
    // acc neither overflows 32 bits nor underflows 0
1500
    // Proof:
1501
    //   acc + x[i] + prime[i]
1502
    //     <= (2**(32 - BITS_PER_LIMB) - 1) + 2 * (2**BITS_PER_LIMB - 1)
1503
    //     <= 7 + 2**30 - 2 < 2**32
1504
    //   acc + x[i] + prime[i]
1505
    //     >= -y + prime[0] >= 0
1506
1507
12.9k
    x->val[i] = acc & BN_LIMB_MASK;
1508
12.9k
    acc >>= BN_BITS_PER_LIMB;
1509
    // acc <= 7 == 2**(32 - BITS_PER_LIMB) - 1
1510
1511
    // acc == x[:i + 1] + prime[:i + 1] - y >> BITS_PER_LIMB * (i + 1)
1512
12.9k
  }
1513
1514
  // assert(acc == 0); // assert x + prime - y < 2**261
1515
  // acc == 0
1516
  // Proof:
1517
  //   acc == x[:LIMBS] + prime[:LIMBS] - y >> BITS_PER_LIMB * LIMBS
1518
  //     == x + prime - y >> BITS_PER_LIMB * LIMBS
1519
  //     <= 2**(LIMBS * BITS_PER_LIMB) - 1 >> BITS_PER_LIMB * LIMBS == 0
1520
1.44k
}
1521
1522
// res = x - y % prime
1523
// Explicitly res = x + (2 * prime - y)
1524
// Assumes x, y are normalized, y is partly reduced
1525
// Assumes x + 2 * prime - y < 2**261 == 2**(BITS_PER_LIMB * LIMBS)
1526
// Guarantees res is normalized
1527
// Assumes prime is nonzero and normalized
1528
void bn_subtractmod(const bignum256 *x, const bignum256 *y, bignum256 *res,
1529
1.60M
                    const bignum256 *prime) {
1530
  // res = x + (2 * prime - y)
1531
1532
1.60M
  uint32_t acc = 1;
1533
1534
16.0M
  for (int i = 0; i < BN_LIMBS; i++) {
1535
14.4M
    acc += (BN_BASE - 1) + x->val[i] + 2 * prime->val[i] - y->val[i];
1536
    // acc neither overflows 32 bits nor underflows 0
1537
    // Proof:
1538
    //   acc + (BASE - 1) + x[i] + 2 * prime[i] - y[i]
1539
    //     >= (BASE - 1) - y[i]
1540
    //     == (2**BITS_PER_LIMB - 1) - (2**BITS_PER_LIMB - 1) == 0
1541
    //   acc + (BASE - 1) + x[i] + 2 * prime[i] - y[i]
1542
    //     <= acc + (BASE - 1) + x[i] + 2 * prime[i]
1543
    //     <= (2**(32 - BITS_PER_LIMB) - 1) + (2**BITS_PER_LIMB - 1) +
1544
    //     (2**BITS_PER_LIMB - 1) + 2 * (2**BITS_PER_LIMB - 1)
1545
    //     <= (2**(32 - BITS_PER_LIMB) - 1) + 4 * (2**BITS_PER_LIMB - 1)
1546
    //     == 7 + 4 * 2**29 - 4 == 2**31 + 3 < 2**32
1547
1548
14.4M
    res->val[i] = acc & (BN_BASE - 1);
1549
14.4M
    acc >>= BN_BITS_PER_LIMB;
1550
    // acc <= 7 == 2**(32 - BITS_PER_LIMB) - 1
1551
1552
    // acc == 2**(BITS_PER_LIMB * (i + 1)) + x[:i+1] - y[:i+1] + 2*prime[:i+1]
1553
    //   >> BITS_PER_LIMB * (i+1)
1554
14.4M
  }
1555
1556
  // assert(acc == 1); // assert x + 2 * prime - y < 2**261
1557
1558
  // clang-format off
1559
  // acc == 1
1560
  // Proof:
1561
  //   acc == 2**(BITS_PER_LIMB * LIMBS) + x[:LIMBS] - y[:LIMBS] + 2 * prime[:LIMBS] >> BITS_PER_LIMB * LIMBS
1562
  //     == 2**(BITS_PER_LIMB * LIMBS) + x - y + 2 * prime >> BITS_PER_LIMB * LIMBS
1563
  //     == 2**(BITS_PER_LIMB * LIMBS) + x + (2 * prime - y) >> BITS_PER_LIMB * LIMBS
1564
  //     <= 2**(BITS_PER_LIMB * LIMBS) + 2**(BITS_PER_LIMB * LIMBS) - 1 >> BITS_PER_LIMB * LIMBS
1565
  //     <= 2 * 2**(BITS_PER_LIMB * LIMBS) - 1 >> BITS_PER_LIMB * LIMBS
1566
  //     == 1
1567
1568
  //   acc == 2**(BITS_PER_LIMB * LIMBS) + x[:LIMBS] - y[:LIMBS] + 2 * prime[:LIMBS] >> BITS_PER_LIMB * LIMBS
1569
  //     == 2**(BITS_PER_LIMB * LIMBS) + x - y + 2 * prime >> BITS_PER_LIMB * LIMBS
1570
  //     == 2**(BITS_PER_LIMB * LIMBS) + x + (2 * prime - y) >> BITS_PER_LIMB * LIMBS
1571
  //     >= 2**(BITS_PER_LIMB * LIMBS) + 0 + 1 >> BITS_PER_LIMB * LIMBS
1572
  //     == 1
1573
  // clang-format on
1574
1.60M
}
1575
1576
// res = x - y
1577
// Assumes x, y are normalized and x >= y
1578
// Guarantees res is normalized
1579
// Works properly even if &x == &y or &x == &res or &y == &res or
1580
//   &x == &y == &res
1581
5.71M
void bn_subtract(const bignum256 *x, const bignum256 *y, bignum256 *res) {
1582
5.71M
  uint32_t acc = 1;
1583
57.1M
  for (int i = 0; i < BN_LIMBS; i++) {
1584
51.4M
    acc += (BN_BASE - 1) + x->val[i] - y->val[i];
1585
    // acc neither overflows 32 bits nor underflows 0
1586
    // Proof:
1587
    //   acc + (BASE - 1) + x[i] - y[i]
1588
    //     >= (BASE - 1) - y == (2**BITS_PER_LIMB - 1) - (2**BITS_PER_LIMB - 1)
1589
    //     == 0
1590
    //   acc + (BASE - 1) + x[i] - y[i]
1591
    //     <= acc + (BASE - 1) + x[i]
1592
    //     <= (2**(32 - BITS_PER_LIMB) - 1) + (2**BITS_PER_LIMB - 1) +
1593
    //       (2**BITS_PER_LIMB - 1)
1594
    //     == 7 + 2 * 2**29 < 2 **32
1595
1596
51.4M
    res->val[i] = acc & BN_LIMB_MASK;
1597
51.4M
    acc >>= BN_BITS_PER_LIMB;
1598
    // acc <= 7 == 2**(32 - BITS_PER_LIMB) - 1
1599
1600
    // acc == 2**(BITS_PER_LIMB * (i + 1)) + x[:i + 1] - y[:i + 1]
1601
    //   >> BITS_PER_LIMB * (i + 1)
1602
51.4M
  }
1603
1604
  // assert(acc == 1); // assert x >= y
1605
1606
  // clang-format off
1607
  // acc == 1
1608
  // Proof:
1609
  //   acc == 2**(BITS_PER_LIMB * LIMBS) + x[:LIMBS] - y[:LIMBS] >> BITS_PER_LIMB * LIMBS
1610
  //     == 2**(BITS_PER_LIMB * LIMBS) + x - y >> BITS_PER_LIMB * LIMBS
1611
  //     == 2**(BITS_PER_LIMB * LIMBS) + x >> BITS_PER_LIMB * LIMBS
1612
  //     <= 2**(BITS_PER_LIMB * LIMBS) + 2**(BITS_PER_LIMB * LIMBS) - 1 >> BITS_PER_LIMB * LIMBS
1613
  //     <= 2 * 2**(BITS_PER_LIMB * LIMBS) - 1 >> BITS_PER_LIMB * LIMBS
1614
  //     == 1
1615
1616
  //   acc == 2**(BITS_PER_LIMB * LIMBS) + x[:LIMBS] - y[:LIMBS] >> BITS_PER_LIMB * LIMBS
1617
  //     == 2**(BITS_PER_LIMB * LIMBS) + x - y >> BITS_PER_LIMB * LIMBS
1618
  //     >= 2**(BITS_PER_LIMB * LIMBS) >> BITS_PER_LIMB * LIMBS
1619
  //     == 1
1620
5.71M
}
1621
1622
// Returns 0 if x is zero
1623
// Returns 1 if x is a square modulo prime
1624
// Returns -1 if x is not a square modulo prime
1625
// Assumes x is normalized, x < 2**259
1626
// Assumes prime is normalized, 2**256 - 2**224 <= prime <= 2**256
1627
// Assumes prime is a prime
1628
// The function doesn't have neither constant control flow nor constant memory
1629
//  access flow with regard to prime
1630
0
int bn_legendre(const bignum256 *x, const bignum256 *prime) {
1631
  // This is a naive implementation
1632
  // A better implementation would be to use the Euclidean algorithm together with the quadratic reciprocity law
1633
1634
  // e = (prime - 1) / 2
1635
0
  bignum256 e = {0};
1636
0
  bn_copy(prime, &e);
1637
0
  bn_rshift(&e);
1638
1639
  // res = x**e % prime
1640
0
  bignum256 res = {0};
1641
0
  bn_power_mod(x, &e, prime, &res);
1642
0
  bn_mod(&res, prime);
1643
1644
0
  if (bn_is_one(&res)) {
1645
0
    return 1;
1646
0
  }
1647
1648
0
  if (bn_is_zero(&res)) {
1649
0
    return 0;
1650
0
  }
1651
1652
0
  return -1;
1653
0
}
1654
1655
// q = x // d, r = x % d
1656
// Assumes x is normalized, 1 <= d <= 61304
1657
// Guarantees q is normalized
1658
0
void bn_long_division(bignum256 *x, uint32_t d, bignum256 *q, uint32_t *r) {
1659
0
  assert(1 <= d && d < 61304);
1660
1661
0
  uint32_t acc = 0;
1662
1663
0
  *r = x->val[BN_LIMBS - 1] % d;
1664
0
  q->val[BN_LIMBS - 1] = x->val[BN_LIMBS - 1] / d;
1665
1666
0
  for (int i = BN_LIMBS - 2; i >= 0; i--) {
1667
0
    acc = *r * (BN_BASE % d) + x->val[i];
1668
    // acc doesn't overflow 32 bits
1669
    // Proof:
1670
    //   r * (BASE % d) + x[i]
1671
    //     <= (d - 1) * (d - 1) + (2**BITS_PER_LIMB - 1)
1672
    //     == d**2 - 2*d + 2**BITS_PER_LIMB
1673
    //     == 61304**2 - 2 * 61304 + 2**29
1674
    //     == 3758057808 + 2**29  < 2**32
1675
1676
0
    q->val[i] = *r * (BN_BASE / d) + (acc / d);
1677
    // q[i] doesn't overflow 32 bits
1678
    // Proof:
1679
    //   r * (BASE // d) + (acc // d)
1680
    //     <= (d - 1) * (2**BITS_PER_LIMB / d) +
1681
    //      ((d**2 - 2*d + 2**BITS_PER_LIMB) / d)
1682
    //     <= (d - 1) * (2**BITS_PER_LIMB / d) + (d - 2 + 2**BITS_PER_LIMB / d)
1683
    //     == (d - 1 + 1) * (2**BITS_PER_LIMB / d) + d - 2
1684
    //     == 2**BITS_PER_LIMB + d - 2 <= 2**29 + 61304 < 2**32
1685
1686
    // q[i] == (r * BASE + x[i]) // d
1687
    // Proof:
1688
    //   q[i] == r * (BASE // d) + (acc // d)
1689
    //     == r * (BASE // d) + (r * (BASE % d) + x[i]) // d
1690
    //     == (r * d * (BASE // d) + r * (BASE % d) + x[i]) // d
1691
    //     == (r * (d * (BASE // d) + (BASE % d)) + x[i]) // d
1692
    //     == (r * BASE + x[i]) // d
1693
1694
    // q[i] < 2**BITS_PER_LIMB
1695
    // Proof:
1696
    //   q[i] == (r * BASE + x[i]) // d
1697
    //     <= ((d - 1) * 2**BITS_PER_LIMB + (2**BITS_PER_LIMB - 1)) / d
1698
    //     == (d * 2**BITS_PER_LIMB - 1) / d == 2**BITS_PER_LIMB - 1 / d
1699
    //     <  2**BITS_PER_LIMB
1700
1701
0
    *r = acc % d;
1702
    // r == (r * BASE + x[i]) % d
1703
    // Proof:
1704
    //   r == acc % d == (r * (BASE % d) + x[i]) % d
1705
    //     == (r * BASE + x[i]) % d
1706
1707
    // x[:i] == q[:i] * d + r
1708
0
  }
1709
0
}
1710
1711
// x = x // 58, r = x % 58
1712
// Assumes x is normalized
1713
// Guarantees x is normalized
1714
0
void bn_divmod58(bignum256 *x, uint32_t *r) { bn_long_division(x, 58, x, r); }
1715
1716
// x = x // 1000, r = x % 1000
1717
// Assumes x is normalized
1718
// Guarantees x is normalized
1719
0
void bn_divmod1000(bignum256 *x, uint32_t *r) {
1720
0
  bn_long_division(x, 1000, x, r);
1721
0
}
1722
1723
// x = x // 10, r = x % 10
1724
// Assumes x is normalized
1725
// Guarantees x is normalized
1726
0
void bn_divmod10(bignum256 *x, uint32_t *r) { bn_long_division(x, 10, x, r); }
1727
1728
// Formats amount
1729
// Assumes amount is normalized
1730
// Assumes prefix and suffix are null-terminated strings
1731
// Assumes output is an array of length output_length
1732
// The function doesn't have neither constant control flow nor constant memory
1733
//   access flow with regard to any its argument
1734
0
size_t bn_format(const bignum256 *amount, const char *prefix, const char *suffix, unsigned int decimals, int exponent, bool trailing, char thousands, char *output, size_t output_length) {
1735
1736
/*
1737
  Python prototype of the function:
1738
1739
  def format(amount, prefix, suffix, decimals, exponent, trailing, thousands):
1740
      if exponent >= 0:
1741
          amount *= 10**exponent
1742
      else:
1743
          amount //= 10 ** (-exponent)
1744
1745
      d = pow(10, decimals)
1746
1747
      integer_part = amount // d
1748
      integer_str = f"{integer_part:,}".replace(",", thousands or "")
1749
1750
      if decimals:
1751
          decimal_part = amount % d
1752
          decimal_str = f".{decimal_part:0{decimals}d}"
1753
          if not trailing:
1754
              decimal_str = decimal_str.rstrip("0").rstrip(".")
1755
      else:
1756
          decimal_str = ""
1757
1758
      return prefix + integer_str + decimal_str + suffix
1759
*/
1760
1761
// Auxiliary macro for bn_format
1762
// If enough space adds one character to output starting from the end
1763
0
#define BN_FORMAT_ADD_OUTPUT_CHAR(c)                               \
1764
0
  {                                                                \
1765
0
    --position;                                                    \
1766
0
    if (output <= position && position < output + output_length) { \
1767
0
      *position = (c);                                             \
1768
0
    } else {                                                       \
1769
0
      memset(output, '\0', output_length);                         \
1770
0
      return 0;                                                    \
1771
0
    }                                                              \
1772
0
  }
1773
1774
0
  bignum256 temp = {0};
1775
0
  bn_copy(amount, &temp);
1776
0
  uint32_t digit = 0;
1777
1778
0
  char *position = output + output_length;
1779
1780
  // Add string ending character
1781
0
  BN_FORMAT_ADD_OUTPUT_CHAR('\0');
1782
1783
  // Add suffix
1784
0
  size_t suffix_length = suffix ? strlen(suffix) : 0;
1785
0
  for (int i = suffix_length - 1; i >= 0; --i)
1786
0
    BN_FORMAT_ADD_OUTPUT_CHAR(suffix[i])
1787
1788
  // amount //= 10**exponent
1789
0
  for (; exponent < 0; ++exponent) {
1790
    // if temp == 0, there is no need to divide it by 10 anymore
1791
0
    if (bn_is_zero(&temp)) {
1792
0
      exponent = 0;
1793
0
      break;
1794
0
    }
1795
0
    bn_divmod10(&temp, &digit);
1796
0
  }
1797
1798
  // exponent >= 0 && decimals >= 0
1799
1800
0
  bool fractional_part = false;  // is fractional-part of amount present
1801
1802
0
  {  // Add fractional-part digits of amount
1803
    // Add trailing zeroes
1804
0
    unsigned int trailing_zeros = decimals < (unsigned int) exponent ? decimals : (unsigned int) exponent;
1805
    // When casting a negative int to unsigned int, UINT_MAX is added to the int before
1806
    // Since exponent >= 0, the value remains unchanged
1807
0
    decimals -= trailing_zeros;
1808
0
    exponent -= trailing_zeros;
1809
1810
0
    if (trailing && trailing_zeros) {
1811
0
      fractional_part = true;
1812
0
      for (; trailing_zeros > 0; --trailing_zeros)
1813
0
          BN_FORMAT_ADD_OUTPUT_CHAR('0')
1814
0
    }
1815
1816
    // exponent == 0 || decimals == 0
1817
1818
    // Add significant digits and leading zeroes
1819
0
    for (; decimals > 0; --decimals) {
1820
0
      bn_divmod10(&temp, &digit);
1821
1822
0
      if (fractional_part || digit || trailing) {
1823
0
        fractional_part = true;
1824
0
        BN_FORMAT_ADD_OUTPUT_CHAR('0' + digit)
1825
0
      }
1826
0
      else if (bn_is_zero(&temp)) {
1827
        // We break since the remaining digits are zeroes and fractional_part == trailing == false
1828
0
        decimals = 0;
1829
0
        break;
1830
0
      }
1831
0
    }
1832
    // decimals == 0
1833
0
  }
1834
1835
0
  if (fractional_part) {
1836
0
    BN_FORMAT_ADD_OUTPUT_CHAR('.')
1837
0
  }
1838
1839
0
  {  // Add integer-part digits of amount
1840
    // Add trailing zeroes
1841
0
    int digits = 0;
1842
0
    if (!bn_is_zero(&temp)) {
1843
0
      for (; exponent > 0; --exponent) {
1844
0
        ++digits;
1845
0
        BN_FORMAT_ADD_OUTPUT_CHAR('0')
1846
0
        if (thousands != 0 && digits % 3 == 0) {
1847
0
          BN_FORMAT_ADD_OUTPUT_CHAR(thousands)
1848
0
        }
1849
0
      }
1850
0
    }
1851
    // decimals == 0 && exponent == 0
1852
1853
    // Add significant digits
1854
0
    bool is_zero = false;
1855
0
    do {
1856
0
      ++digits;
1857
0
      bn_divmod10(&temp, &digit);
1858
0
      is_zero = bn_is_zero(&temp);
1859
0
      BN_FORMAT_ADD_OUTPUT_CHAR('0' + digit)
1860
0
      if (thousands != 0 && !is_zero && digits % 3 == 0) {
1861
0
        BN_FORMAT_ADD_OUTPUT_CHAR(thousands)
1862
0
      }
1863
0
    } while (!is_zero);
1864
0
  }
1865
1866
  // Add prefix
1867
0
  size_t prefix_length = prefix ? strlen(prefix) : 0;
1868
0
  for (int i = prefix_length - 1; i >= 0; --i)
1869
0
    BN_FORMAT_ADD_OUTPUT_CHAR(prefix[i])
1870
1871
  // Move formatted amount to the start of output
1872
0
  int length = output - position + output_length;
1873
0
  memmove(output, position, length);
1874
0
  return length - 1;
1875
0
}
1876
1877
#if USE_BN_PRINT
1878
// Prints x in hexadecimal
1879
// Assumes x is normalized and x < 2**256
1880
void bn_print(const bignum256 *x) {
1881
  printf("%06x", x->val[8]);
1882
  printf("%08x", ((x->val[7] << 3) | (x->val[6] >> 26)));
1883
  printf("%07x", ((x->val[6] << 2) | (x->val[5] >> 27)) & 0x0FFFFFFF);
1884
  printf("%07x", ((x->val[5] << 1) | (x->val[4] >> 28)) & 0x0FFFFFFF);
1885
  printf("%07x", x->val[4] & 0x0FFFFFFF);
1886
  printf("%08x", ((x->val[3] << 3) | (x->val[2] >> 26)));
1887
  printf("%07x", ((x->val[2] << 2) | (x->val[1] >> 27)) & 0x0FFFFFFF);
1888
  printf("%07x", ((x->val[1] << 1) | (x->val[0] >> 28)) & 0x0FFFFFFF);
1889
  printf("%07x", x->val[0] & 0x0FFFFFFF);
1890
}
1891
1892
// Prints comma separated list of limbs of x
1893
void bn_print_raw(const bignum256 *x) {
1894
  for (int i = 0; i < BN_LIMBS - 1; i++) {
1895
    printf("0x%08x, ", x->val[i]);
1896
  }
1897
  printf("0x%08x", x->val[BN_LIMBS - 1]);
1898
}
1899
#endif
1900
1901
#if USE_INVERSE_FAST
1902
10.7k
void bn_inverse(bignum256 *x, const bignum256 *prime) {
1903
10.7k
  bn_inverse_fast(x, prime);
1904
10.7k
}
1905
#else
1906
void bn_inverse(bignum256 *x, const bignum256 *prime) {
1907
  bn_inverse_slow(x, prime);
1908
}
1909
#endif