Coverage Report

Created: 2026-04-15 06:25

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/boringssl/crypto/fipsmodule/ec/p256-nistz.cc.inc
Line
Count
Source
1
// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
2
// Copyright (c) 2014, Intel Corporation. All Rights Reserved.
3
//
4
// Licensed under the Apache License, Version 2.0 (the "License");
5
// you may not use this file except in compliance with the License.
6
// You may obtain a copy of the License at
7
//
8
//     https://www.apache.org/licenses/LICENSE-2.0
9
//
10
// Unless required by applicable law or agreed to in writing, software
11
// distributed under the License is distributed on an "AS IS" BASIS,
12
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
// See the License for the specific language governing permissions and
14
// limitations under the License.
15
//
16
// Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
17
// (1) Intel Corporation, Israel Development Center, Haifa, Israel
18
// (2) University of Haifa, Israel
19
//
20
// Reference:
21
// S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
22
//                          256 Bit Primes"
23
24
#include <openssl/ec.h>
25
26
#include <assert.h>
27
#include <stdint.h>
28
#include <string.h>
29
30
#include <openssl/bn.h>
31
#include <openssl/crypto.h>
32
#include <openssl/err.h>
33
34
#include "../../internal.h"
35
#include "../bn/internal.h"
36
#include "../delocate.h"
37
#include "internal.h"
38
#include "p256-nistz.h"
39
40
41
using namespace bssl;
42
43
#if !defined(OPENSSL_NO_ASM) &&                              \
44
    (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \
45
    !defined(OPENSSL_SMALL)
46
47
// PRECOMP256_ROW is an array of 64 affine points.
// NOTE(review): presumably the row type of the precomputed generator table
// pulled in from p256-nistz-table.h -- confirm against that header.
typedef P256_POINT_AFFINE PRECOMP256_ROW[64];
48
49
// One converted into the Montgomery domain
50
static const BN_ULONG ONE_MONT[P256_LIMBS] = {
51
    TOBN(0x00000000, 0x00000001),
52
    TOBN(0xffffffff, 0x00000000),
53
    TOBN(0xffffffff, 0xffffffff),
54
    TOBN(0x00000000, 0xfffffffe),
55
};
56
57
// Precomputed tables for the default generator
58
#include "p256-nistz-table.h"
59
60
// Recode window to a signed digit, see |ec_GFp_nistp_recode_scalar_bits| in
61
// util.c for details
62
556k
static crypto_word_t booth_recode_w5(crypto_word_t in) {
63
556k
  crypto_word_t s, d;
64
65
556k
  s = ~((in >> 5) - 1);
66
556k
  d = (1 << 6) - in - 1;
67
556k
  d = (d & s) | (in & ~s);
68
556k
  d = (d >> 1) + (d & 1);
69
70
556k
  return (d << 1) + (s & 1);
71
556k
}
72
73
433k
// booth_recode_w7 is the 8-bit-window counterpart of the 6-bit recoding: it
// takes a window |in| and returns the recoded digit with the magnitude in the
// bits above bit zero and the sign flag in bit zero.
static crypto_word_t booth_recode_w7(crypto_word_t in) {
  // For |in| < 256, |sign| is all ones when bit 7 of |in| is set and zero
  // otherwise.
  const crypto_word_t sign = ~((in >> 7) - 1);
  // Candidate magnitude used when the sign bit is set: 255 - |in|.
  const crypto_word_t flipped = (1 << 8) - in - 1;
  // Pick between the two candidates with masks rather than a branch.
  crypto_word_t digit = (flipped & sign) | (in & ~sign);
  // Halve, rounding up.
  digit = (digit >> 1) + (digit & 1);

  return (digit << 1) + (sign & 1);
}
83
84
// copy_conditional copies |src| to |dst| if |move| is one and leaves it as-is
85
// if |move| is zero.
86
//
87
// WARNING: this breaks the usual convention of constant-time functions
88
// returning masks.
89
static void copy_conditional(BN_ULONG dst[P256_LIMBS],
90
701k
                             const BN_ULONG src[P256_LIMBS], BN_ULONG move) {
91
701k
  BN_ULONG mask1 = ((BN_ULONG)0) - move;
92
701k
  BN_ULONG mask2 = ~mask1;
93
94
701k
  dst[0] = (src[0] & mask1) ^ (dst[0] & mask2);
95
701k
  dst[1] = (src[1] & mask1) ^ (dst[1] & mask2);
96
701k
  dst[2] = (src[2] & mask1) ^ (dst[2] & mask2);
97
701k
  dst[3] = (src[3] & mask1) ^ (dst[3] & mask2);
98
701k
  if (P256_LIMBS == 8) {
99
0
    dst[4] = (src[4] & mask1) ^ (dst[4] & mask2);
100
0
    dst[5] = (src[5] & mask1) ^ (dst[5] & mask2);
101
0
    dst[6] = (src[6] & mask1) ^ (dst[6] & mask2);
102
0
    dst[7] = (src[7] & mask1) ^ (dst[7] & mask2);
103
0
  }
104
701k
}
105
106
// is_not_zero returns one iff in != 0 and zero otherwise.
107
//
108
// WARNING: this breaks the usual convention of constant-time functions
109
// returning masks.
110
//
111
// (define-fun is_not_zero ((in (_ BitVec 64))) (_ BitVec 64)
112
//   (bvlshr (bvor in (bvsub #x0000000000000000 in)) #x000000000000003f)
113
// )
114
//
115
// (declare-fun x () (_ BitVec 64))
116
//
117
// (assert (and (= x #x0000000000000000) (= (is_not_zero x)
118
// #x0000000000000001))) (check-sat)
119
//
120
// (assert (and (not (= x #x0000000000000000)) (= (is_not_zero x)
121
// #x0000000000000000))) (check-sat)
122
//
123
4.09k
static BN_ULONG is_not_zero(BN_ULONG in) {
124
4.09k
  in |= (0 - in);
125
4.09k
  in >>= BN_BITS2 - 1;
126
4.09k
  return in;
127
4.09k
}
128
129
#if defined(OPENSSL_X86_64)
130
// Dispatch between CPU variations. The "_adx" suffixed functions use MULX in
131
// addition to ADCX/ADOX. MULX is part of BMI2, not ADX, so we must check both
132
// capabilities.
133
static void ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS],
134
                                  const BN_ULONG a[P256_LIMBS],
135
122k
                                  const BN_ULONG b[P256_LIMBS]) {
136
122k
  if (CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable()) {
137
122k
    ecp_nistz256_mul_mont_adx(res, a, b);
138
122k
  } else {
139
0
    ecp_nistz256_mul_mont_nohw(res, a, b);
140
0
  }
141
122k
}
142
143
static void ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS],
144
1.95M
                                  const BN_ULONG a[P256_LIMBS]) {
145
1.95M
  if (CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable()) {
146
1.95M
    ecp_nistz256_sqr_mont_adx(res, a);
147
1.95M
  } else {
148
0
    ecp_nistz256_sqr_mont_nohw(res, a);
149
0
  }
150
1.95M
}
151
152
static void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS],
153
                                      const BN_ULONG a[P256_LIMBS],
154
21.0k
                                      const BN_ULONG b[P256_LIMBS]) {
155
21.0k
  if (CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable()) {
156
21.0k
    ecp_nistz256_ord_mul_mont_adx(res, a, b);
157
21.0k
  } else {
158
0
    ecp_nistz256_ord_mul_mont_nohw(res, a, b);
159
0
  }
160
21.0k
}
161
162
static void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS],
163
                                      const BN_ULONG a[P256_LIMBS],
164
19.3k
                                      BN_ULONG rep) {
165
19.3k
  if (CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable()) {
166
19.3k
    ecp_nistz256_ord_sqr_mont_adx(res, a, rep);
167
19.3k
  } else {
168
0
    ecp_nistz256_ord_sqr_mont_nohw(res, a, rep);
169
0
  }
170
19.3k
}
171
172
static void ecp_nistz256_select_w5(P256_POINT *val, const P256_POINT in_t[16],
173
556k
                                   int index) {
174
556k
  if (CRYPTO_is_AVX2_capable()) {
175
556k
    ecp_nistz256_select_w5_avx2(val, in_t, index);
176
556k
  } else {
177
0
    ecp_nistz256_select_w5_nohw(val, in_t, index);
178
0
  }
179
556k
}
180
181
static void ecp_nistz256_select_w7(P256_POINT_AFFINE *val,
182
                                   const P256_POINT_AFFINE in_t[64],
183
151k
                                   int index) {
184
151k
  if (CRYPTO_is_AVX2_capable()) {
185
151k
    ecp_nistz256_select_w7_avx2(val, in_t, index);
186
151k
  } else {
187
0
    ecp_nistz256_select_w7_nohw(val, in_t, index);
188
0
  }
189
151k
}
190
191
2.81M
// ecp_nistz256_point_double forwards (|r|, |a|) to the "_adx" point doubling
// when the CPU reports both BMI2 and ADX, and to the "_nohw" routine
// otherwise.
static void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a) {
  if (!(CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable())) {
    ecp_nistz256_point_double_nohw(r, a);
    return;
  }
  ecp_nistz256_point_double_adx(r, a);
}
198
199
// ecp_nistz256_point_add forwards (|r|, |a|, |b|) to the "_adx" point addition
// when the CPU reports both BMI2 and ADX, and to the "_nohw" routine
// otherwise.
static void ecp_nistz256_point_add(P256_POINT *r, const P256_POINT *a,
                                   const P256_POINT *b) {
  if (!(CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable())) {
    ecp_nistz256_point_add_nohw(r, a, b);
    return;
  }
  ecp_nistz256_point_add_adx(r, a, b);
}
207
208
// ecp_nistz256_point_add_affine forwards (|r|, |a|, |b|) to the "_adx" mixed
// addition when the CPU reports both BMI2 and ADX, and to the "_nohw" routine
// otherwise.
static void ecp_nistz256_point_add_affine(P256_POINT *r, const P256_POINT *a,
                                          const P256_POINT_AFFINE *b) {
  if (!(CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable())) {
    ecp_nistz256_point_add_affine_nohw(r, a, b);
    return;
  }
  ecp_nistz256_point_add_affine_adx(r, a, b);
}
216
#endif  // OPENSSL_X86_64
217
218
// ecp_nistz256_from_mont sets |res| to |in|, converted from Montgomery domain
219
// by multiplying with 1.
220
static void ecp_nistz256_from_mont(BN_ULONG res[P256_LIMBS],
221
7.62k
                                   const BN_ULONG in[P256_LIMBS]) {
222
7.62k
  static const BN_ULONG ONE[P256_LIMBS] = {1};
223
7.62k
  ecp_nistz256_mul_mont(res, in, ONE);
224
7.62k
}
225
226
// ecp_nistz256_mod_inverse_sqr_mont sets |r| to (|in| * 2^-256)^-2 * 2^256 mod
227
// p. That is, |r| is the modular inverse square of |in| for input and output in
228
// the Montgomery domain.
229
static void ecp_nistz256_mod_inverse_sqr_mont(BN_ULONG r[P256_LIMBS],
230
7.64k
                                              const BN_ULONG in[P256_LIMBS]) {
231
  // This implements the addition chain described in
232
  // https://briansmith.org/ecc-inversion-addition-chains-01#p256_field_inversion
233
7.64k
  BN_ULONG x2[P256_LIMBS], x3[P256_LIMBS], x6[P256_LIMBS], x12[P256_LIMBS],
234
7.64k
      x15[P256_LIMBS], x30[P256_LIMBS], x32[P256_LIMBS];
235
7.64k
  ecp_nistz256_sqr_mont(x2, in);      // 2^2 - 2^1
236
7.64k
  ecp_nistz256_mul_mont(x2, x2, in);  // 2^2 - 2^0
237
238
7.64k
  ecp_nistz256_sqr_mont(x3, x2);      // 2^3 - 2^1
239
7.64k
  ecp_nistz256_mul_mont(x3, x3, in);  // 2^3 - 2^0
240
241
7.64k
  ecp_nistz256_sqr_mont(x6, x3);
242
22.9k
  for (int i = 1; i < 3; i++) {
243
15.2k
    ecp_nistz256_sqr_mont(x6, x6);
244
15.2k
  }                                   // 2^6 - 2^3
245
7.64k
  ecp_nistz256_mul_mont(x6, x6, x3);  // 2^6 - 2^0
246
247
7.64k
  ecp_nistz256_sqr_mont(x12, x6);
248
45.8k
  for (int i = 1; i < 6; i++) {
249
38.2k
    ecp_nistz256_sqr_mont(x12, x12);
250
38.2k
  }                                     // 2^12 - 2^6
251
7.64k
  ecp_nistz256_mul_mont(x12, x12, x6);  // 2^12 - 2^0
252
253
7.64k
  ecp_nistz256_sqr_mont(x15, x12);
254
22.9k
  for (int i = 1; i < 3; i++) {
255
15.2k
    ecp_nistz256_sqr_mont(x15, x15);
256
15.2k
  }                                     // 2^15 - 2^3
257
7.64k
  ecp_nistz256_mul_mont(x15, x15, x3);  // 2^15 - 2^0
258
259
7.64k
  ecp_nistz256_sqr_mont(x30, x15);
260
114k
  for (int i = 1; i < 15; i++) {
261
107k
    ecp_nistz256_sqr_mont(x30, x30);
262
107k
  }                                      // 2^30 - 2^15
263
7.64k
  ecp_nistz256_mul_mont(x30, x30, x15);  // 2^30 - 2^0
264
265
7.64k
  ecp_nistz256_sqr_mont(x32, x30);
266
7.64k
  ecp_nistz256_sqr_mont(x32, x32);      // 2^32 - 2^2
267
7.64k
  ecp_nistz256_mul_mont(x32, x32, x2);  // 2^32 - 2^0
268
269
7.64k
  BN_ULONG ret[P256_LIMBS];
270
7.64k
  ecp_nistz256_sqr_mont(ret, x32);
271
244k
  for (int i = 1; i < 31 + 1; i++) {
272
236k
    ecp_nistz256_sqr_mont(ret, ret);
273
236k
  }                                     // 2^64 - 2^32
274
7.64k
  ecp_nistz256_mul_mont(ret, ret, in);  // 2^64 - 2^32 + 2^0
275
276
985k
  for (int i = 0; i < 96 + 32; i++) {
277
978k
    ecp_nistz256_sqr_mont(ret, ret);
278
978k
  }                                      // 2^192 - 2^160 + 2^128
279
7.64k
  ecp_nistz256_mul_mont(ret, ret, x32);  // 2^192 - 2^160 + 2^128 + 2^32 - 2^0
280
281
252k
  for (int i = 0; i < 32; i++) {
282
244k
    ecp_nistz256_sqr_mont(ret, ret);
283
244k
  }                                      // 2^224 - 2^192 + 2^160 + 2^64 - 2^32
284
7.64k
  ecp_nistz256_mul_mont(ret, ret, x32);  // 2^224 - 2^192 + 2^160 + 2^64 - 2^0
285
286
236k
  for (int i = 0; i < 30; i++) {
287
229k
    ecp_nistz256_sqr_mont(ret, ret);
288
229k
  }                                      // 2^254 - 2^222 + 2^190 + 2^94 - 2^30
289
7.64k
  ecp_nistz256_mul_mont(ret, ret, x30);  // 2^254 - 2^222 + 2^190 + 2^94 - 2^0
290
291
7.64k
  ecp_nistz256_sqr_mont(ret, ret);
292
7.64k
  ecp_nistz256_sqr_mont(r, ret);  // 2^256 - 2^224 + 2^192 + 2^96 - 2^2
293
7.64k
}
294
295
// r = p * p_scalar
296
static void ecp_nistz256_windowed_mul(const EC_GROUP *group, P256_POINT *r,
297
                                      const EC_JACOBIAN *p,
298
10.7k
                                      const EC_SCALAR *p_scalar) {
299
10.7k
  assert(p != nullptr);
300
10.7k
  assert(p_scalar != nullptr);
301
10.7k
  assert(group->field.N.width == P256_LIMBS);
302
303
10.7k
  static const size_t kWindowSize = 5;
304
10.7k
  static const crypto_word_t kMask = (1 << (5 /* kWindowSize */ + 1)) - 1;
305
306
  // A |P256_POINT| is (3 * 32) = 96 bytes, and the 64-byte alignment should
307
  // add no more than 63 bytes of overhead. Thus, |table| should require
308
  // ~1599 ((96 * 16) + 63) bytes of stack space.
309
10.7k
  alignas(64) P256_POINT table[16];
310
10.7k
  uint8_t p_str[33];
311
10.7k
  OPENSSL_memcpy(p_str, p_scalar->words, 32);
312
10.7k
  p_str[32] = 0;
313
314
  // table[0] is implicitly (0,0,0) (the point at infinity), therefore it is
315
  // not stored. All other values are actually stored with an offset of -1 in
316
  // table.
317
10.7k
  P256_POINT *row = table;
318
10.7k
  assert(group->field.N.width == P256_LIMBS);
319
10.7k
  OPENSSL_memcpy(row[1 - 1].X, p->X.words, P256_LIMBS * sizeof(BN_ULONG));
320
10.7k
  OPENSSL_memcpy(row[1 - 1].Y, p->Y.words, P256_LIMBS * sizeof(BN_ULONG));
321
10.7k
  OPENSSL_memcpy(row[1 - 1].Z, p->Z.words, P256_LIMBS * sizeof(BN_ULONG));
322
323
10.7k
  ecp_nistz256_point_double(&row[2 - 1], &row[1 - 1]);
324
10.7k
  ecp_nistz256_point_add(&row[3 - 1], &row[2 - 1], &row[1 - 1]);
325
10.7k
  ecp_nistz256_point_double(&row[4 - 1], &row[2 - 1]);
326
10.7k
  ecp_nistz256_point_double(&row[6 - 1], &row[3 - 1]);
327
10.7k
  ecp_nistz256_point_double(&row[8 - 1], &row[4 - 1]);
328
10.7k
  ecp_nistz256_point_double(&row[12 - 1], &row[6 - 1]);
329
10.7k
  ecp_nistz256_point_add(&row[5 - 1], &row[4 - 1], &row[1 - 1]);
330
10.7k
  ecp_nistz256_point_add(&row[7 - 1], &row[6 - 1], &row[1 - 1]);
331
10.7k
  ecp_nistz256_point_add(&row[9 - 1], &row[8 - 1], &row[1 - 1]);
332
10.7k
  ecp_nistz256_point_add(&row[13 - 1], &row[12 - 1], &row[1 - 1]);
333
10.7k
  ecp_nistz256_point_double(&row[14 - 1], &row[7 - 1]);
334
10.7k
  ecp_nistz256_point_double(&row[10 - 1], &row[5 - 1]);
335
10.7k
  ecp_nistz256_point_add(&row[15 - 1], &row[14 - 1], &row[1 - 1]);
336
10.7k
  ecp_nistz256_point_add(&row[11 - 1], &row[10 - 1], &row[1 - 1]);
337
10.7k
  ecp_nistz256_point_double(&row[16 - 1], &row[8 - 1]);
338
339
10.7k
  BN_ULONG tmp[P256_LIMBS];
340
10.7k
  alignas(32) P256_POINT h;
341
10.7k
  size_t index = 255;
342
10.7k
  crypto_word_t wvalue = p_str[(index - 1) / 8];
343
10.7k
  wvalue = (wvalue >> ((index - 1) % 8)) & kMask;
344
345
10.7k
  ecp_nistz256_select_w5(r, table, booth_recode_w5(wvalue) >> 1);
346
347
556k
  while (index >= 5) {
348
545k
    if (index != 255) {
349
535k
      size_t off = (index - 1) / 8;
350
351
535k
      wvalue = (crypto_word_t)p_str[off] | (crypto_word_t)p_str[off + 1] << 8;
352
535k
      wvalue = (wvalue >> ((index - 1) % 8)) & kMask;
353
354
535k
      wvalue = booth_recode_w5(wvalue);
355
356
535k
      ecp_nistz256_select_w5(&h, table, wvalue >> 1);
357
358
535k
      ecp_nistz256_neg(tmp, h.Y);
359
535k
      copy_conditional(h.Y, tmp, (wvalue & 1));
360
361
535k
      ecp_nistz256_point_add(r, r, &h);
362
535k
    }
363
364
545k
    index -= kWindowSize;
365
366
545k
    ecp_nistz256_point_double(r, r);
367
545k
    ecp_nistz256_point_double(r, r);
368
545k
    ecp_nistz256_point_double(r, r);
369
545k
    ecp_nistz256_point_double(r, r);
370
545k
    ecp_nistz256_point_double(r, r);
371
545k
  }
372
373
  // Final window
374
10.7k
  wvalue = p_str[0];
375
10.7k
  wvalue = (wvalue << 1) & kMask;
376
377
10.7k
  wvalue = booth_recode_w5(wvalue);
378
379
10.7k
  ecp_nistz256_select_w5(&h, table, wvalue >> 1);
380
381
10.7k
  ecp_nistz256_neg(tmp, h.Y);
382
10.7k
  copy_conditional(h.Y, tmp, wvalue & 1);
383
384
10.7k
  ecp_nistz256_point_add(r, r, &h);
385
10.7k
}
386
387
11.7k
// calc_first_wvalue returns the Booth-recoded digit for the lowest 7-bit
// window of |p_str| and sets |*index| to the bit position of the next window.
static crypto_word_t calc_first_wvalue(size_t *index, const uint8_t p_str[33]) {
  static const size_t kWindowSize = 7;
  static const crypto_word_t kMask = (1 << (7 /* kWindowSize */ + 1)) - 1;

  *index = kWindowSize;

  // Shift the first byte left by one so the window's lowest bit is zero.
  const crypto_word_t window = (p_str[0] << 1) & kMask;
  return booth_recode_w7(window);
}
395
396
421k
// calc_wvalue returns the Booth-recoded digit for the window of |p_str| that
// starts one bit below |*index|, then advances |*index| by the window size.
// It reads the byte holding the starting bit and the byte after it.
static crypto_word_t calc_wvalue(size_t *index, const uint8_t p_str[33]) {
  static const size_t kWindowSize = 7;
  static const crypto_word_t kMask = (1 << (7 /* kWindowSize */ + 1)) - 1;

  const size_t start_bit = *index - 1;
  const size_t off = start_bit / 8;

  // Assemble 16 bits around the window, then shift the window down to bit 0.
  crypto_word_t window =
      (crypto_word_t)p_str[off] | (crypto_word_t)p_str[off + 1] << 8;
  window = (window >> (start_bit % 8)) & kMask;

  *index += kWindowSize;
  return booth_recode_w7(window);
}
408
409
static void ecp_nistz256_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
410
                                   const EC_JACOBIAN *p,
411
3.08k
                                   const EC_SCALAR *scalar) {
412
3.08k
  alignas(32) P256_POINT out;
413
3.08k
  ecp_nistz256_windowed_mul(group, &out, p, scalar);
414
415
3.08k
  assert(group->field.N.width == P256_LIMBS);
416
3.08k
  OPENSSL_memcpy(r->X.words, out.X, P256_LIMBS * sizeof(BN_ULONG));
417
3.08k
  OPENSSL_memcpy(r->Y.words, out.Y, P256_LIMBS * sizeof(BN_ULONG));
418
3.08k
  OPENSSL_memcpy(r->Z.words, out.Z, P256_LIMBS * sizeof(BN_ULONG));
419
3.08k
}
420
421
static void ecp_nistz256_point_mul_base(const EC_GROUP *group, EC_JACOBIAN *r,
422
4.09k
                                        const EC_SCALAR *scalar) {
423
4.09k
  uint8_t p_str[33];
424
4.09k
  OPENSSL_memcpy(p_str, scalar->words, 32);
425
4.09k
  p_str[32] = 0;
426
427
  // First window
428
4.09k
  size_t index = 0;
429
4.09k
  crypto_word_t wvalue = calc_first_wvalue(&index, p_str);
430
431
4.09k
  alignas(32) P256_POINT_AFFINE t;
432
4.09k
  alignas(32) P256_POINT p;
433
4.09k
  ecp_nistz256_select_w7(&t, ecp_nistz256_precomputed[0], wvalue >> 1);
434
4.09k
  ecp_nistz256_neg(p.Z, t.Y);
435
4.09k
  copy_conditional(t.Y, p.Z, wvalue & 1);
436
437
  // Convert |t| from affine to Jacobian coordinates. We set Z to zero if |t|
438
  // is infinity and |ONE_MONT| otherwise. |t| was computed from the table, so
439
  // it is infinity iff |wvalue >> 1| is zero.
440
4.09k
  OPENSSL_memcpy(p.X, t.X, sizeof(p.X));
441
4.09k
  OPENSSL_memcpy(p.Y, t.Y, sizeof(p.Y));
442
4.09k
  OPENSSL_memset(p.Z, 0, sizeof(p.Z));
443
4.09k
  copy_conditional(p.Z, ONE_MONT, is_not_zero(wvalue >> 1));
444
445
151k
  for (int i = 1; i < 37; i++) {
446
147k
    wvalue = calc_wvalue(&index, p_str);
447
448
147k
    ecp_nistz256_select_w7(&t, ecp_nistz256_precomputed[i], wvalue >> 1);
449
450
147k
    alignas(32) BN_ULONG neg_Y[P256_LIMBS];
451
147k
    ecp_nistz256_neg(neg_Y, t.Y);
452
147k
    copy_conditional(t.Y, neg_Y, wvalue & 1);
453
454
    // Note |ecp_nistz256_point_add_affine| does not work if |p| and |t| are the
455
    // same non-infinity point.
456
147k
    ecp_nistz256_point_add_affine(&p, &p, &t);
457
147k
  }
458
459
4.09k
  assert(group->field.N.width == P256_LIMBS);
460
4.09k
  OPENSSL_memcpy(r->X.words, p.X, P256_LIMBS * sizeof(BN_ULONG));
461
4.09k
  OPENSSL_memcpy(r->Y.words, p.Y, P256_LIMBS * sizeof(BN_ULONG));
462
4.09k
  OPENSSL_memcpy(r->Z.words, p.Z, P256_LIMBS * sizeof(BN_ULONG));
463
4.09k
}
464
465
static void ecp_nistz256_points_mul_public(const EC_GROUP *group,
466
                                           EC_JACOBIAN *r,
467
                                           const EC_SCALAR *g_scalar,
468
                                           const EC_JACOBIAN *p_,
469
7.62k
                                           const EC_SCALAR *p_scalar) {
470
7.62k
  assert(p_ != nullptr && p_scalar != nullptr && g_scalar != nullptr);
471
472
7.62k
  alignas(32) P256_POINT p;
473
7.62k
  uint8_t p_str[33];
474
7.62k
  OPENSSL_memcpy(p_str, g_scalar->words, 32);
475
7.62k
  p_str[32] = 0;
476
477
  // First window
478
7.62k
  size_t index = 0;
479
7.62k
  size_t wvalue = calc_first_wvalue(&index, p_str);
480
481
  // Convert |p| from affine to Jacobian coordinates. We set Z to zero if |p|
482
  // is infinity and |ONE_MONT| otherwise. |p| was computed from the table, so
483
  // it is infinity iff |wvalue >> 1| is zero.
484
7.62k
  if ((wvalue >> 1) != 0) {
485
7.55k
    OPENSSL_memcpy(p.X, &ecp_nistz256_precomputed[0][(wvalue >> 1) - 1].X,
486
7.55k
                   sizeof(p.X));
487
7.55k
    OPENSSL_memcpy(p.Y, &ecp_nistz256_precomputed[0][(wvalue >> 1) - 1].Y,
488
7.55k
                   sizeof(p.Y));
489
7.55k
    OPENSSL_memcpy(p.Z, ONE_MONT, sizeof(p.Z));
490
7.55k
  } else {
491
72
    OPENSSL_memset(p.X, 0, sizeof(p.X));
492
72
    OPENSSL_memset(p.Y, 0, sizeof(p.Y));
493
72
    OPENSSL_memset(p.Z, 0, sizeof(p.Z));
494
72
  }
495
496
7.62k
  if ((wvalue & 1) == 1) {
497
3.81k
    ecp_nistz256_neg(p.Y, p.Y);
498
3.81k
  }
499
500
282k
  for (int i = 1; i < 37; i++) {
501
274k
    wvalue = calc_wvalue(&index, p_str);
502
274k
    if ((wvalue >> 1) == 0) {
503
2.50k
      continue;
504
2.50k
    }
505
506
271k
    alignas(32) P256_POINT_AFFINE t;
507
271k
    OPENSSL_memcpy(&t, &ecp_nistz256_precomputed[i][(wvalue >> 1) - 1],
508
271k
                   sizeof(t));
509
271k
    if ((wvalue & 1) == 1) {
510
132k
      ecp_nistz256_neg(t.Y, t.Y);
511
132k
    }
512
513
    // Note |ecp_nistz256_point_add_affine| does not work if |p| and |t| are
514
    // the same non-infinity point, so it is important that we compute the
515
    // |g_scalar| term before the |p_scalar| term.
516
271k
    ecp_nistz256_point_add_affine(&p, &p, &t);
517
271k
  }
518
519
7.62k
  alignas(32) P256_POINT tmp;
520
7.62k
  ecp_nistz256_windowed_mul(group, &tmp, p_, p_scalar);
521
7.62k
  ecp_nistz256_point_add(&p, &p, &tmp);
522
523
7.62k
  assert(group->field.N.width == P256_LIMBS);
524
7.62k
  OPENSSL_memcpy(r->X.words, p.X, P256_LIMBS * sizeof(BN_ULONG));
525
7.62k
  OPENSSL_memcpy(r->Y.words, p.Y, P256_LIMBS * sizeof(BN_ULONG));
526
7.62k
  OPENSSL_memcpy(r->Z.words, p.Z, P256_LIMBS * sizeof(BN_ULONG));
527
7.62k
}
528
529
static int ecp_nistz256_get_affine(const EC_GROUP *group,
530
                                   const EC_JACOBIAN *point, EC_FELEM *x,
531
7.64k
                                   EC_FELEM *y) {
532
7.64k
  if (constant_time_declassify_int(
533
7.64k
          ec_GFp_simple_is_at_infinity(group, point))) {
534
2
    OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY);
535
2
    return 0;
536
2
  }
537
538
7.64k
  BN_ULONG z_inv2[P256_LIMBS];
539
7.64k
  assert(group->field.N.width == P256_LIMBS);
540
7.64k
  ecp_nistz256_mod_inverse_sqr_mont(z_inv2, point->Z.words);
541
542
7.64k
  if (x != nullptr) {
543
7.64k
    ecp_nistz256_mul_mont(x->words, z_inv2, point->X.words);
544
7.64k
  }
545
546
7.64k
  if (y != nullptr) {
547
4.01k
    ecp_nistz256_sqr_mont(z_inv2, z_inv2);                            // z^-4
548
4.01k
    ecp_nistz256_mul_mont(y->words, point->Y.words, point->Z.words);  // y * z
549
4.01k
    ecp_nistz256_mul_mont(y->words, y->words, z_inv2);  // y * z^-3
550
4.01k
  }
551
552
7.64k
  return 1;
553
7.64k
}
554
555
static void ecp_nistz256_add(const EC_GROUP *group, EC_JACOBIAN *r,
556
0
                             const EC_JACOBIAN *a_, const EC_JACOBIAN *b_) {
557
0
  P256_POINT a, b;
558
0
  OPENSSL_memcpy(a.X, a_->X.words, P256_LIMBS * sizeof(BN_ULONG));
559
0
  OPENSSL_memcpy(a.Y, a_->Y.words, P256_LIMBS * sizeof(BN_ULONG));
560
0
  OPENSSL_memcpy(a.Z, a_->Z.words, P256_LIMBS * sizeof(BN_ULONG));
561
0
  OPENSSL_memcpy(b.X, b_->X.words, P256_LIMBS * sizeof(BN_ULONG));
562
0
  OPENSSL_memcpy(b.Y, b_->Y.words, P256_LIMBS * sizeof(BN_ULONG));
563
0
  OPENSSL_memcpy(b.Z, b_->Z.words, P256_LIMBS * sizeof(BN_ULONG));
564
0
  ecp_nistz256_point_add(&a, &a, &b);
565
0
  OPENSSL_memcpy(r->X.words, a.X, P256_LIMBS * sizeof(BN_ULONG));
566
0
  OPENSSL_memcpy(r->Y.words, a.Y, P256_LIMBS * sizeof(BN_ULONG));
567
0
  OPENSSL_memcpy(r->Z.words, a.Z, P256_LIMBS * sizeof(BN_ULONG));
568
0
}
569
570
static void ecp_nistz256_dbl(const EC_GROUP *group, EC_JACOBIAN *r,
571
0
                             const EC_JACOBIAN *a_) {
572
0
  P256_POINT a;
573
0
  OPENSSL_memcpy(a.X, a_->X.words, P256_LIMBS * sizeof(BN_ULONG));
574
0
  OPENSSL_memcpy(a.Y, a_->Y.words, P256_LIMBS * sizeof(BN_ULONG));
575
0
  OPENSSL_memcpy(a.Z, a_->Z.words, P256_LIMBS * sizeof(BN_ULONG));
576
0
  ecp_nistz256_point_double(&a, &a);
577
0
  OPENSSL_memcpy(r->X.words, a.X, P256_LIMBS * sizeof(BN_ULONG));
578
0
  OPENSSL_memcpy(r->Y.words, a.Y, P256_LIMBS * sizeof(BN_ULONG));
579
0
  OPENSSL_memcpy(r->Z.words, a.Z, P256_LIMBS * sizeof(BN_ULONG));
580
0
}
581
582
static void ecp_nistz256_inv0_mod_ord(const EC_GROUP *group, EC_SCALAR *out,
583
553
                                      const EC_SCALAR *in) {
584
  // table[i] stores a power of |in| corresponding to the matching enum value.
585
553
  enum {
586
    // The following indices specify the power in binary.
587
553
    i_1 = 0,
588
553
    i_10,
589
553
    i_11,
590
553
    i_101,
591
553
    i_111,
592
553
    i_1010,
593
553
    i_1111,
594
553
    i_10101,
595
553
    i_101010,
596
553
    i_101111,
597
    // The following indices specify 2^N-1, or N ones in a row.
598
553
    i_x6,
599
553
    i_x8,
600
553
    i_x16,
601
553
    i_x32
602
553
  };
603
553
  BN_ULONG table[15][P256_LIMBS];
604
605
  // https://briansmith.org/ecc-inversion-addition-chains-01#p256_scalar_inversion
606
  //
607
  // Even though this code path spares 12 squarings, 4.5%, and 13
608
  // multiplications, 25%, the overall sign operation is not that much faster,
609
  // not more that 2%. Most of the performance of this function comes from the
610
  // scalar operations.
611
612
  // Pre-calculate powers.
613
553
  OPENSSL_memcpy(table[i_1], in->words, P256_LIMBS * sizeof(BN_ULONG));
614
615
553
  ecp_nistz256_ord_sqr_mont(table[i_10], table[i_1], 1);
616
617
553
  ecp_nistz256_ord_mul_mont(table[i_11], table[i_1], table[i_10]);
618
619
553
  ecp_nistz256_ord_mul_mont(table[i_101], table[i_11], table[i_10]);
620
621
553
  ecp_nistz256_ord_mul_mont(table[i_111], table[i_101], table[i_10]);
622
623
553
  ecp_nistz256_ord_sqr_mont(table[i_1010], table[i_101], 1);
624
625
553
  ecp_nistz256_ord_mul_mont(table[i_1111], table[i_1010], table[i_101]);
626
627
553
  ecp_nistz256_ord_sqr_mont(table[i_10101], table[i_1010], 1);
628
553
  ecp_nistz256_ord_mul_mont(table[i_10101], table[i_10101], table[i_1]);
629
630
553
  ecp_nistz256_ord_sqr_mont(table[i_101010], table[i_10101], 1);
631
632
553
  ecp_nistz256_ord_mul_mont(table[i_101111], table[i_101010], table[i_101]);
633
634
553
  ecp_nistz256_ord_mul_mont(table[i_x6], table[i_101010], table[i_10101]);
635
636
553
  ecp_nistz256_ord_sqr_mont(table[i_x8], table[i_x6], 2);
637
553
  ecp_nistz256_ord_mul_mont(table[i_x8], table[i_x8], table[i_11]);
638
639
553
  ecp_nistz256_ord_sqr_mont(table[i_x16], table[i_x8], 8);
640
553
  ecp_nistz256_ord_mul_mont(table[i_x16], table[i_x16], table[i_x8]);
641
642
553
  ecp_nistz256_ord_sqr_mont(table[i_x32], table[i_x16], 16);
643
553
  ecp_nistz256_ord_mul_mont(table[i_x32], table[i_x32], table[i_x16]);
644
645
  // Compute |in| raised to the order-2.
646
553
  ecp_nistz256_ord_sqr_mont(out->words, table[i_x32], 64);
647
553
  ecp_nistz256_ord_mul_mont(out->words, out->words, table[i_x32]);
648
553
  static const struct {
649
553
    uint8_t p, i;
650
553
  } kChain[27] = {{32, i_x32},    {6, i_101111}, {5, i_111},    {4, i_11},
651
553
                  {5, i_1111},    {5, i_10101},  {4, i_101},    {3, i_101},
652
553
                  {3, i_101},     {5, i_111},    {9, i_101111}, {6, i_1111},
653
553
                  {2, i_1},       {5, i_1},      {6, i_1111},   {5, i_111},
654
553
                  {4, i_111},     {5, i_111},    {5, i_101},    {3, i_11},
655
553
                  {10, i_101111}, {2, i_11},     {5, i_11},     {5, i_11},
656
553
                  {3, i_1},       {7, i_10101},  {6, i_1111}};
657
14.9k
  for (const auto &step : kChain) {
658
14.9k
    ecp_nistz256_ord_sqr_mont(out->words, out->words, step.p);
659
14.9k
    ecp_nistz256_ord_mul_mont(out->words, out->words, table[step.i]);
660
14.9k
  }
661
553
}
662
663
static int ecp_nistz256_scalar_to_montgomery_inv_vartime(const EC_GROUP *group,
664
                                                         EC_SCALAR *out,
665
7.62k
                                                         const EC_SCALAR *in) {
666
7.62k
#if defined(OPENSSL_X86_64)
667
7.62k
  if (!CRYPTO_is_AVX_capable()) {
668
    // No AVX support; fallback to generic code.
669
0
    return ec_simple_scalar_to_montgomery_inv_vartime(group, out, in);
670
0
  }
671
7.62k
#endif
672
673
7.62k
  assert(group->order.N.width == P256_LIMBS);
674
7.62k
  if (!beeu_mod_inverse_vartime(out->words, in->words, group->order.N.d)) {
675
0
    return 0;
676
0
  }
677
678
  // The result should be returned in the Montgomery domain.
679
7.62k
  ec_scalar_to_montgomery(group, out, out);
680
7.62k
  return 1;
681
7.62k
}
682
683
static int ecp_nistz256_cmp_x_coordinate(const EC_GROUP *group,
684
                                         const EC_JACOBIAN *p,
685
7.62k
                                         const EC_SCALAR *r) {
686
7.62k
  if (ec_GFp_simple_is_at_infinity(group, p)) {
687
0
    return 0;
688
0
  }
689
690
7.62k
  assert(group->order.N.width == P256_LIMBS);
691
7.62k
  assert(group->field.N.width == P256_LIMBS);
692
693
  // We wish to compare X/Z^2 with r. This is equivalent to comparing X with
694
  // r*Z^2. Note that X and Z are represented in Montgomery form, while r is
695
  // not.
696
7.62k
  BN_ULONG r_Z2[P256_LIMBS], Z2_mont[P256_LIMBS], X[P256_LIMBS];
697
7.62k
  ecp_nistz256_mul_mont(Z2_mont, p->Z.words, p->Z.words);
698
7.62k
  ecp_nistz256_mul_mont(r_Z2, r->words, Z2_mont);
699
7.62k
  ecp_nistz256_from_mont(X, p->X.words);
700
701
7.62k
  if (OPENSSL_memcmp(r_Z2, X, sizeof(r_Z2)) == 0) {
702
15
    return 1;
703
15
  }
704
705
  // During signing the x coefficient is reduced modulo the group order.
706
  // Therefore there is a small possibility, less than 1/2^128, that group_order
707
  // < p.x < P. in that case we need not only to compare against |r| but also to
708
  // compare against r+group_order.
709
7.60k
  BN_ULONG carry = bn_add_words(r_Z2, r->words, group->order.N.d, P256_LIMBS);
710
7.60k
  if (carry == 0 && bn_less_than_words(r_Z2, group->field.N.d, P256_LIMBS)) {
711
    // r + group_order < p, so compare (r + group_order) * Z^2 against X.
712
201
    ecp_nistz256_mul_mont(r_Z2, r_Z2, Z2_mont);
713
201
    if (OPENSSL_memcmp(r_Z2, X, sizeof(r_Z2)) == 0) {
714
0
      return 1;
715
0
    }
716
201
  }
717
718
7.60k
  return 0;
719
7.60k
}
720
721
BSSL_NAMESPACE_BEGIN
722
723
19
// Fills in the |EC_METHOD| table for this P-256 implementation. Point
// operations, scalar inversion and the x-coordinate comparison use the
// functions defined in this file; field-element helpers use the generic
// |ec_GFp_mont_*| functions.
DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistz256_method) {
  // Point operations.
  out->point_get_affine_coordinates = ecp_nistz256_get_affine;
  out->add = ecp_nistz256_add;
  out->dbl = ecp_nistz256_dbl;
  out->mul = ecp_nistz256_point_mul;
  out->mul_base = ecp_nistz256_point_mul_base;
  out->mul_public = ecp_nistz256_points_mul_public;

  // Field element operations.
  out->felem_mul = ec_GFp_mont_felem_mul;
  out->felem_sqr = ec_GFp_mont_felem_sqr;
  out->felem_to_bytes = ec_GFp_mont_felem_to_bytes;
  out->felem_from_bytes = ec_GFp_mont_felem_from_bytes;
  out->felem_reduce = ec_GFp_mont_felem_reduce;
  // TODO(davidben): This should use the specialized field arithmetic
  // implementation, rather than the generic one.
  out->felem_exp = ec_GFp_mont_felem_exp;

  // Scalar operations and signature-verification helper.
  out->scalar_inv0_montgomery = ecp_nistz256_inv0_mod_ord;
  out->scalar_to_montgomery_inv_vartime =
      ecp_nistz256_scalar_to_montgomery_inv_vartime;
  out->cmp_x_coordinate = ecp_nistz256_cmp_x_coordinate;
}
743
744
BSSL_NAMESPACE_END
745
746
#endif /* !defined(OPENSSL_NO_ASM) &&                              \
747
          (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \
748
          !defined(OPENSSL_SMALL) */