Coverage Report

Created: 2022-12-08 06:10

/src/libgcrypt/mpi/ec-nist.c
Line
Count
Source (jump to first uncovered line)
1
/* ec-nist.c -  NIST optimized elliptic curve functions
2
 * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
3
 *
4
 * This file is part of Libgcrypt.
5
 *
6
 * Libgcrypt is free software; you can redistribute it and/or modify
7
 * it under the terms of the GNU Lesser General Public License as
8
 * published by the Free Software Foundation; either version 2.1 of
9
 * the License, or (at your option) any later version.
10
 *
11
 * Libgcrypt is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
 * GNU Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
18
 */
19
20
#include <config.h>
21
#include <stdio.h>
22
#include <stdlib.h>
23
#include <errno.h>
24
25
26
#ifndef ASM_DISABLED
27
28
29
#include "mpi-internal.h"
30
#include "longlong.h"
31
#include "g10lib.h"
32
#include "context.h"
33
#include "ec-context.h"
34
#include "ec-inline.h"
35
36
37
/* These variables are used to generate masks from conditional operation
 * flag parameters.  Use of volatile prevents compiler optimizations from
 * converting AND-masking to conditional branches.  The reduction routines
 * below derive their mask1/mask2 selection masks from these, so the final
 * conditional stores are performed with data-independent AND/OR logic
 * rather than branches.  */
static volatile mpi_limb_t vzero = 0;
static volatile mpi_limb_t vone = 1;
42
43
44
static inline
void prefetch(const void *tab, size_t len)
{
  /* Warm the data cache with the lookup table before it is indexed.
   * One byte per 64-byte cache line is touched (lines 0..10, guarded
   * by LEN), followed by the table's final byte.  The volatile
   * qualifier keeps the compiler from eliminating the otherwise dead
   * reads.  */
  const volatile unsigned char *bytes = tab;
  size_t line;

  for (line = 0; line <= 10; line++)
    {
      if (len > line * 64)
        (void)bytes[line * 64];
    }

  (void)bytes[len - 1];
}
73
74
75
/* Fast reduction routines for NIST curves.  */
76
77
/* Fast in-place reduction of W modulo the NIST P-192 prime
 * (FIPS 186-4, D.2.1).  W must be below p^2; CTX supplies the prime
 * in ctx->p.  The final correction is branch-free: selection masks
 * built from the volatile vzero/vone globals pick between the two
 * candidate results in STORE64_COND.  */
void
_gcry_mpi_ec_nist192_mod (gcry_mpi_t w, mpi_ec_t ctx)
{
  /* Precomputed 1*p, 2*p, 3*p; least significant limb first.
   * Indexed as p_mult[carry], which yields (carry + 1) * p.  */
  static const mpi_limb64_t p_mult[3][4] =
  {
    { /* P * 1 */
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xfffffffeU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0x00000000U)
    },
    { /* P * 2 */
      LIMB64_C(0xffffffffU, 0xfffffffeU), LIMB64_C(0xffffffffU, 0xfffffffdU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0x00000001U)
    },
    { /* P * 3 */
      LIMB64_C(0xffffffffU, 0xfffffffdU), LIMB64_C(0xffffffffU, 0xfffffffcU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0x00000002U)
    }
  };
  const mpi_limb64_t zero = LIMB_TO64(0);
  mpi_ptr_t wp;
  /* Three 64-bit data limbs plus one extra limb for the folding carry. */
  mpi_limb64_t s[192 / BITS_PER_MPI_LIMB64 + 1];
  mpi_limb64_t o[DIM(s)];
  const mpi_size_t wsize = DIM(s) - 1;
  mpi_limb_t mask1;
  mpi_limb_t mask2;
  mpi_limb_t s_is_negative;
  int carry;

  MPN_NORMALIZE (w->d, w->nlimbs);
  if (mpi_nbits_more_than (w, 2 * 192))
    log_bug ("W must be less than m^2\n");

  RESIZE_AND_CLEAR_IF_NEEDED (w, wsize * 2 * LIMBS_PER_LIMB64);
  /* p occupies exactly wsize limbs for P-192, so - unlike the P-224
   * routine - there appears to be no need to save/restore ctx->p->nlimbs
   * around the resize; TODO confirm against RESIZE_AND_CLEAR_IF_NEEDED.  */
  RESIZE_AND_CLEAR_IF_NEEDED (ctx->p, wsize * LIMBS_PER_LIMB64);

  wp = w->d;

  prefetch (p_mult, sizeof(p_mult));

  /* See "FIPS 186-4, D.2.1 Curve P-192". */

  /* With W split into 64-bit words A5..A0, the folds below compute
   * T + S1 + S2 + S3 where T = (A2,A1,A0), S1 = (0,A3,A3),
   * S2 = (A4,A4,0) and S3 = (A5,A5,A5); s[3] collects the carries.  */
  s[0] = LOAD64(wp, 3);
  ADD3_LIMB64 (s[3],  s[2],          s[1],
         zero,  zero,          LOAD64(wp, 3),
         zero,  LOAD64(wp, 4), LOAD64(wp, 4));

  ADD4_LIMB64 (s[3],  s[2],          s[1],          s[0],
         s[3],  s[2],          s[1],          s[0],
         zero,  LOAD64(wp, 5), LOAD64(wp, 5), LOAD64(wp, 5));

  ADD4_LIMB64 (s[3],  s[2],          s[1],          s[0],
         s[3],  s[2],          s[1],          s[0],
         zero,  LOAD64(wp, 2), LOAD64(wp, 1), LOAD64(wp, 0));

  /* mod p:
   *  's[3]' holds carry value (0..2). Subtract (carry + 1) * p. Result will be
   *  with in range -p...p. Handle result being negative with addition and
   *  conditional store. */

  carry = LO32_LIMB64(s[3]);

  SUB4_LIMB64 (s[3], s[2], s[1], s[0],
         s[3], s[2], s[1], s[0],
         p_mult[carry][3], p_mult[carry][2],
         p_mult[carry][1], p_mult[carry][0]);

  /* o = s + p (candidate for the negative case).  The top limb of 1*p,
   * p_mult[0][3], is zero, hence 'zero' is passed for it here.  */
  ADD4_LIMB64 (o[3], o[2], o[1], o[0],
         s[3], s[2], s[1], s[0],
         zero,
         p_mult[0][2], p_mult[0][1], p_mult[0][0]);

  /* Sign of s, taken from bit 31 of the carry limb's low half. */
  s_is_negative = LO32_LIMB64(s[3]) >> 31;

  mask2 = vzero - s_is_negative;
  mask1 = s_is_negative - vone;

  /* Branch-free select: store o when s was negative, else s. */
  STORE64_COND(wp, 0, mask2, o[0], mask1, s[0]);
  STORE64_COND(wp, 1, mask2, o[1], mask1, s[1]);
  STORE64_COND(wp, 2, mask2, o[2], mask1, s[2]);

  w->nlimbs = 192 / BITS_PER_MPI_LIMB;
  MPN_NORMALIZE (wp, w->nlimbs);
}
160
161
/* Fast in-place reduction of W modulo the NIST P-224 prime
 * (FIPS 186-4, D.2.2).  W must be below p^2; CTX supplies the prime in
 * ctx->p.  The folded value can be negative here, so the multiple table
 * includes -1*p, and the final correction uses branch-free masked
 * stores (see vzero/vone above).  */
void
_gcry_mpi_ec_nist224_mod (gcry_mpi_t w, mpi_ec_t ctx)
{
  /* Precomputed k*p for k = -1..3, least significant limb first.
   * Indexed as p_mult[carry + 2], which yields (carry + 1) * p.  */
  static const mpi_limb64_t p_mult[5][4] =
  {
    { /* P * -1 */
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xffffffffU, 0x00000000U)
    },
    { /* P * 0 */
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U)
    },
    { /* P * 1 */
      LIMB64_C(0x00000000U, 0x00000001U), LIMB64_C(0xffffffffU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0xffffffffU)
    },
    { /* P * 2 */
      LIMB64_C(0x00000000U, 0x00000002U), LIMB64_C(0xfffffffeU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000001U, 0xffffffffU)
    },
    { /* P * 3 */
      LIMB64_C(0x00000000U, 0x00000003U), LIMB64_C(0xfffffffdU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000002U, 0xffffffffU)
    }
  };
  const mpi_limb64_t zero = LIMB_TO64(0);
  mpi_ptr_t wp;
  mpi_limb64_t s[(224 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64];
  mpi_limb64_t d[DIM(s)];
  const mpi_size_t wsize = DIM(s);
  /* wsize limbs cover 256 bits > 224, so the resize below may grow
   * ctx->p past its logical size; remember and restore nlimbs.  */
  mpi_size_t psize = ctx->p->nlimbs;
  mpi_limb_t mask1;
  mpi_limb_t mask2;
  mpi_limb_t s_is_negative;
  int carry;

  MPN_NORMALIZE (w->d, w->nlimbs);
  if (mpi_nbits_more_than (w, 2 * 224))
    log_bug ("W must be less than m^2\n");

  RESIZE_AND_CLEAR_IF_NEEDED (w, wsize * 2 * LIMBS_PER_LIMB64);
  RESIZE_AND_CLEAR_IF_NEEDED (ctx->p, wsize * LIMBS_PER_LIMB64);
  ctx->p->nlimbs = psize;

  wp = w->d;

  prefetch (p_mult, sizeof(p_mult));

  /* See "FIPS 186-4, D.2.2 Curve P-224". */

  /* "S1 + S2" with 64-bit limbs:
   *     [0:A10]:[ A9: A8]:[ A7:0]:[0:0]
   *  +    [0:0]:[A13:A12]:[A11:0]:[0:0]
   *  => s[3]:s[2]:s[1]:s[0]
   */
  s[0] = zero;
  ADD3_LIMB64 (s[3], s[2], s[1],
         LIMB64_HILO(0, LOAD32(wp, 10)),
         LOAD64(wp, 8 / 2),
         LIMB64_HILO(LOAD32(wp, 7), 0),
         zero,
         LOAD64(wp, 12 / 2),
         LIMB64_HILO(LOAD32(wp, 11), 0));

  /* "T + S1 + S2" */
  ADD4_LIMB64 (s[3], s[2], s[1], s[0],
         s[3], s[2], s[1], s[0],
         LIMB64_HILO(0, LOAD32(wp, 6)),
         LOAD64(wp, 4 / 2),
         LOAD64(wp, 2 / 2),
         LOAD64(wp, 0 / 2));

  /* "D1 + D2" with 64-bit limbs:
   *     [0:A13]:[A12:A11]:[A10: A9]:[ A8: A7]
   *  +    [0:0]:[  0:  0]:[  0:A13]:[A12:A11]
   *  => d[3]:d[2]:d[1]:d[0]
   */
  ADD4_LIMB64 (d[3], d[2], d[1], d[0],
         LIMB64_HILO(0, LOAD32(wp, 13)),
         LOAD64_UNALIGNED(wp, 11 / 2),
         LOAD64_UNALIGNED(wp, 9 / 2),
         LOAD64_UNALIGNED(wp, 7 / 2),
         zero,
         zero,
         LIMB64_HILO(0, LOAD32(wp, 13)),
         LOAD64_UNALIGNED(wp, 11 / 2));

  /* "T + S1 + S2 - D1 - D2" */
  SUB4_LIMB64 (s[3], s[2], s[1], s[0],
         s[3], s[2], s[1], s[0],
         d[3], d[2], d[1], d[0]);

  /* mod p:
   *  Upper 32-bits of 's[3]' holds carry value (-2..2).
   *  Subtract (carry + 1) * p. Result will be with in range -p...p.
   *  Handle result being negative with addition and conditional store. */

  /* The 224-bit boundary falls mid-limb, so the carry lives in the
   * high half of s[3].  */
  carry = HI32_LIMB64(s[3]);

  SUB4_LIMB64 (s[3], s[2], s[1], s[0],
         s[3], s[2], s[1], s[0],
         p_mult[carry + 2][3], p_mult[carry + 2][2],
         p_mult[carry + 2][1], p_mult[carry + 2][0]);

  /* d = s + p (candidate for the negative case). */
  ADD4_LIMB64 (d[3], d[2], d[1], d[0],
         s[3], s[2], s[1], s[0],
         p_mult[0 + 2][3], p_mult[0 + 2][2],
         p_mult[0 + 2][1], p_mult[0 + 2][0]);

  s_is_negative = (HI32_LIMB64(s[3]) >> 31);

  mask2 = vzero - s_is_negative;
  mask1 = s_is_negative - vone;

  /* Branch-free select: store d when s was negative, else s. */
  STORE64_COND(wp, 0, mask2, d[0], mask1, s[0]);
  STORE64_COND(wp, 1, mask2, d[1], mask1, s[1]);
  STORE64_COND(wp, 2, mask2, d[2], mask1, s[2]);
  STORE64_COND(wp, 3, mask2, d[3], mask1, s[3]);

  w->nlimbs = wsize * LIMBS_PER_LIMB64;
  MPN_NORMALIZE (wp, w->nlimbs);
}
284
285
/* Fast in-place reduction of W modulo the NIST P-256 prime
 * (FIPS 186-4, D.2.3).  W must be below p^2; CTX supplies the prime in
 * ctx->p.  After subtracting (carry + 1) * p the result may lie in
 * -2p...p, so unlike the other routines this one keeps THREE
 * candidates (s, s + p, s + 2p) and selects among them with masks
 * derived from vzero/vone - no secret-dependent branches.  */
void
_gcry_mpi_ec_nist256_mod (gcry_mpi_t w, mpi_ec_t ctx)
{
  /* Precomputed k*p for k = -3..7, least significant limb first.
   * Indexed as p_mult[carry + 4], which yields (carry + 1) * p; the
   * twelfth (zero-initialized) row is unused padding.  */
  static const mpi_limb64_t p_mult[12][5] =
  {
    { /* P * -3 */
      LIMB64_C(0x00000000U, 0x00000003U), LIMB64_C(0xfffffffdU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000002U, 0xfffffffcU),
      LIMB64_C(0xffffffffU, 0xfffffffdU)
    },
    { /* P * -2 */
      LIMB64_C(0x00000000U, 0x00000002U), LIMB64_C(0xfffffffeU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000001U, 0xfffffffdU),
      LIMB64_C(0xffffffffU, 0xfffffffeU)
    },
    { /* P * -1 */
      LIMB64_C(0x00000000U, 0x00000001U), LIMB64_C(0xffffffffU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0xfffffffeU),
      LIMB64_C(0xffffffffU, 0xffffffffU)
    },
    { /* P * 0 */
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0x00000000U, 0x00000000U)
    },
    { /* P * 1 */
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xffffffffU, 0x00000001U),
      LIMB64_C(0x00000000U, 0x00000000U)
    },
    { /* P * 2 */
      LIMB64_C(0xffffffffU, 0xfffffffeU), LIMB64_C(0x00000001U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffffeU, 0x00000002U),
      LIMB64_C(0x00000000U, 0x00000001U)
    },
    { /* P * 3 */
      LIMB64_C(0xffffffffU, 0xfffffffdU), LIMB64_C(0x00000002U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffffdU, 0x00000003U),
      LIMB64_C(0x00000000U, 0x00000002U)
    },
    { /* P * 4 */
      LIMB64_C(0xffffffffU, 0xfffffffcU), LIMB64_C(0x00000003U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffffcU, 0x00000004U),
      LIMB64_C(0x00000000U, 0x00000003U)
    },
    { /* P * 5 */
      LIMB64_C(0xffffffffU, 0xfffffffbU), LIMB64_C(0x00000004U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffffbU, 0x00000005U),
      LIMB64_C(0x00000000U, 0x00000004U)
    },
    { /* P * 6 */
      LIMB64_C(0xffffffffU, 0xfffffffaU), LIMB64_C(0x00000005U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffffaU, 0x00000006U),
      LIMB64_C(0x00000000U, 0x00000005U)
    },
    { /* P * 7 */
      LIMB64_C(0xffffffffU, 0xfffffff9U), LIMB64_C(0x00000006U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffff9U, 0x00000007U),
      LIMB64_C(0x00000000U, 0x00000006U)
    }
  };
  const mpi_limb64_t zero = LIMB_TO64(0);
  mpi_ptr_t wp;
  /* Four 64-bit data limbs plus one extra limb for the folding carry. */
  mpi_limb64_t s[(256 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64 + 1];
  mpi_limb64_t t[DIM(s)];
  mpi_limb64_t d[DIM(s)];
  mpi_limb64_t e[DIM(s)];
  const mpi_size_t wsize = DIM(s) - 1;
  /* Resize below may grow ctx->p; remember and restore nlimbs. */
  mpi_size_t psize = ctx->p->nlimbs;
  mpi_limb_t mask1;
  mpi_limb_t mask2;
  mpi_limb_t mask3;
  mpi_limb_t s_is_negative;
  mpi_limb_t d_is_negative;
  int carry;

  MPN_NORMALIZE (w->d, w->nlimbs);
  if (mpi_nbits_more_than (w, 2 * 256))
    log_bug ("W must be less than m^2\n");

  RESIZE_AND_CLEAR_IF_NEEDED (w, wsize * 2 * LIMBS_PER_LIMB64);
  RESIZE_AND_CLEAR_IF_NEEDED (ctx->p, wsize * LIMBS_PER_LIMB64);
  ctx->p->nlimbs = psize;

  wp = w->d;

  prefetch (p_mult, sizeof(p_mult));

  /* See "FIPS 186-4, D.2.3 Curve P-256". */

  /* "S1 + S2" with 64-bit limbs:
   *     [A15:A14]:[A13:A12]:[A11:0]:[0:0]
   *  +    [0:A15]:[A14:A13]:[A12:0]:[0:0]
   *  => s[4]:s[3]:s[2]:s[1]:s[0]
   */
  s[0] = zero;
  ADD4_LIMB64 (s[4], s[3], s[2], s[1],
         zero,
         LOAD64(wp, 14 / 2),
         LOAD64(wp, 12 / 2),
         LIMB64_HILO(LOAD32(wp, 11), 0),
         zero,
         LIMB64_HILO(0, LOAD32(wp, 15)),
         LOAD64_UNALIGNED(wp, 13 / 2),
         LIMB64_HILO(LOAD32(wp, 12), 0));

  /* "S3 + S4" with 64-bit limbs:
   *     [A15:A14]:[  0:  0]:[  0:A10]:[ A9:A8]
   *  +   [A8:A13]:[A15:A14]:[A13:A11]:[A10:A9]
   *  => t[4]:t[3]:t[2]:t[1]:t[0]
   */
  ADD5_LIMB64 (t[4], t[3], t[2], t[1], t[0],
         zero,
         LOAD64(wp, 14 / 2),
         zero,
         LIMB64_HILO(0, LOAD32(wp, 10)),
         LOAD64(wp, 8 / 2),
         zero,
         LIMB64_HILO(LOAD32(wp, 8), LOAD32(wp, 13)),
         LOAD64(wp, 14 / 2),
         LIMB64_HILO(LOAD32(wp, 13), LOAD32(wp, 11)),
         LOAD64_UNALIGNED(wp, 9 / 2));

  /* "2*S1 + 2*S2" */
  ADD5_LIMB64 (s[4], s[3], s[2], s[1], s[0],
               s[4], s[3], s[2], s[1], s[0],
               s[4], s[3], s[2], s[1], s[0]);

  /* "T + S3 + S4" */
  ADD5_LIMB64 (t[4], t[3], t[2], t[1], t[0],
         t[4], t[3], t[2], t[1], t[0],
         zero,
         LOAD64(wp, 6 / 2),
         LOAD64(wp, 4 / 2),
         LOAD64(wp, 2 / 2),
         LOAD64(wp, 0 / 2));

  /* "2*S1 + 2*S2 - D3" with 64-bit limbs:
   *    s[4]:    s[3]:    s[2]:    s[1]:     s[0]
   *  -       [A12:0]:[A10:A9]:[A8:A15]:[A14:A13]
   *  => s[4]:s[3]:s[2]:s[1]:s[0]
   */
  SUB5_LIMB64 (s[4], s[3], s[2], s[1], s[0],
               s[4], s[3], s[2], s[1], s[0],
         zero,
         LIMB64_HILO(LOAD32(wp, 12), 0),
         LOAD64_UNALIGNED(wp, 9 / 2),
         LIMB64_HILO(LOAD32(wp, 8), LOAD32(wp, 15)),
         LOAD64_UNALIGNED(wp, 13 / 2));

  /* "T + 2*S1 + 2*S2 + S3 + S4 - D3" */
  ADD5_LIMB64 (s[4], s[3], s[2], s[1], s[0],
               s[4], s[3], s[2], s[1], s[0],
               t[4], t[3], t[2], t[1], t[0]);

  /* "D1 + D2" with 64-bit limbs:
   *     [0:A13]:[A12:A11] + [A15:A14]:[A13:A12] => d[2]:d[1]:d[0]
   *     [A10:A8] + [A11:A9] => d[4]:d[3]
   */
  ADD3_LIMB64 (d[2], d[1], d[0],
         zero,
         LIMB64_HILO(0, LOAD32(wp, 13)),
         LOAD64_UNALIGNED(wp, 11 / 2),
         zero,
         LOAD64(wp, 14 / 2),
         LOAD64(wp, 12 / 2));
  ADD2_LIMB64 (d[4], d[3],
         zero, LIMB64_HILO(LOAD32(wp, 10), LOAD32(wp, 8)),
         zero, LIMB64_HILO(LOAD32(wp, 11), LOAD32(wp, 9)));

  /* "D1 + D2 + D4" with 64-bit limbs:
   *    d[4]:    d[3]:     d[2]:  d[1]:     d[0]
   *  +       [A13:0]:[A11:A10]:[A9:0]:[A15:A14]
   *  => d[4]:d[3]:d[2]:d[1]:d[0]
   */
  ADD5_LIMB64 (d[4], d[3], d[2], d[1], d[0],
               d[4], d[3], d[2], d[1], d[0],
         zero,
         LIMB64_HILO(LOAD32(wp, 13), 0),
         LOAD64(wp, 10 / 2),
         LIMB64_HILO(LOAD32(wp, 9), 0),
         LOAD64(wp, 14 / 2));

  /* "T + 2*S1 + 2*S2 + S3 + S4 - D1 - D2 - D3 - D4" */
  SUB5_LIMB64 (s[4], s[3], s[2], s[1], s[0],
               s[4], s[3], s[2], s[1], s[0],
               d[4], d[3], d[2], d[1], d[0]);

  /* mod p:
   *  's[4]' holds carry value (-4..6). Subtract (carry + 1) * p. Result
   *  will be with in range -2*p...p. Handle result being negative with
   *  addition and conditional store. */

  carry = LO32_LIMB64(s[4]);

  SUB5_LIMB64 (s[4], s[3], s[2], s[1], s[0],
         s[4], s[3], s[2], s[1], s[0],
         p_mult[carry + 4][4], p_mult[carry + 4][3],
         p_mult[carry + 4][2], p_mult[carry + 4][1],
         p_mult[carry + 4][0]);

  /* Add 1*P.  The top limb of 1*p (p_mult[4][4]) is zero, so 'zero'
   * is passed for it.  d[4] is later inspected for the sign of d.  */
  ADD5_LIMB64 (d[4], d[3], d[2], d[1], d[0],
         s[4], s[3], s[2], s[1], s[0],
         zero,
         p_mult[0 + 4][3], p_mult[0 + 4][2],
         p_mult[0 + 4][1], p_mult[0 + 4][0]);

  /* Add 2*P.  Only e[0..3] are consumed below, so the top limb of 2*p
   * can be dropped and 'zero' passed instead.  */
  ADD5_LIMB64 (e[4], e[3], e[2], e[1], e[0],
         s[4], s[3], s[2], s[1], s[0],
         zero,
         p_mult[1 + 4][3], p_mult[1 + 4][2],
         p_mult[1 + 4][1], p_mult[1 + 4][0]);

  s_is_negative = LO32_LIMB64(s[4]) >> 31;
  d_is_negative = LO32_LIMB64(d[4]) >> 31;
  /* mask3: s < -p  -> pick e = s + 2p.
   * mask2: -p <= s < 0 -> pick d = s + p.
   * mask1: s >= 0 -> keep s.  Masks are mutually exclusive.  */
  mask3 = vzero - d_is_negative;
  mask2 = (vzero - s_is_negative) & ~mask3;
  mask1 = (s_is_negative - vone) & ~mask3;

  /* Branch-free three-way select into s[0..3]. */
  s[0] = LIMB_OR64(MASK_AND64(mask2, d[0]), MASK_AND64(mask1, s[0]));
  s[1] = LIMB_OR64(MASK_AND64(mask2, d[1]), MASK_AND64(mask1, s[1]));
  s[2] = LIMB_OR64(MASK_AND64(mask2, d[2]), MASK_AND64(mask1, s[2]));
  s[3] = LIMB_OR64(MASK_AND64(mask2, d[3]), MASK_AND64(mask1, s[3]));
  s[0] = LIMB_OR64(MASK_AND64(mask3, e[0]), s[0]);
  s[1] = LIMB_OR64(MASK_AND64(mask3, e[1]), s[1]);
  s[2] = LIMB_OR64(MASK_AND64(mask3, e[2]), s[2]);
  s[3] = LIMB_OR64(MASK_AND64(mask3, e[3]), s[3]);

  STORE64(wp, 0, s[0]);
  STORE64(wp, 1, s[1]);
  STORE64(wp, 2, s[2]);
  STORE64(wp, 3, s[3]);

  w->nlimbs = wsize * LIMBS_PER_LIMB64;
  MPN_NORMALIZE (wp, w->nlimbs);
}
523
524
/* Fast in-place reduction of W modulo the NIST P-384 prime
 * (FIPS 186-4, D.2.4).  W must be below p^2; CTX supplies the prime in
 * ctx->p.  Several source terms start at a 32-bit offset inside a
 * 64-bit limb; the LOAD64_SHR32 macro hides the three ways of loading
 * those (native unaligned load, precomputed shifted copy on big-endian,
 * or 32-bit limb pairs).  Final correction is a branch-free masked
 * store as in the other routines.  */
void
_gcry_mpi_ec_nist384_mod (gcry_mpi_t w, mpi_ec_t ctx)
{
  /* Precomputed k*p for k = -2..8, least significant limb first.
   * Indexed as p_mult[carry + 3], which yields (carry + 1) * p.  */
  static const mpi_limb64_t p_mult[11][7] =
  {
    { /* P * -2 */
      LIMB64_C(0xfffffffeU, 0x00000002U), LIMB64_C(0x00000001U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000002U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xfffffffeU)
    },
    { /* P * -1 */
      LIMB64_C(0xffffffffU, 0x00000001U), LIMB64_C(0x00000000U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000001U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xffffffffU)
    },
    { /* P * 0 */
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0x00000000U, 0x00000000U)
    },
    { /* P * 1 */
      LIMB64_C(0x00000000U, 0xffffffffU), LIMB64_C(0xffffffffU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xfffffffeU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000000U)
    },
    { /* P * 2 */
      LIMB64_C(0x00000001U, 0xfffffffeU), LIMB64_C(0xfffffffeU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xfffffffdU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000001U)
    },
    { /* P * 3 */
      LIMB64_C(0x00000002U, 0xfffffffdU), LIMB64_C(0xfffffffdU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xfffffffcU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000002U)
    },
    { /* P * 4 */
      LIMB64_C(0x00000003U, 0xfffffffcU), LIMB64_C(0xfffffffcU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xfffffffbU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000003U)
    },
    { /* P * 5 */
      LIMB64_C(0x00000004U, 0xfffffffbU), LIMB64_C(0xfffffffbU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xfffffffaU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000004U)
    },
    { /* P * 6 */
      LIMB64_C(0x00000005U, 0xfffffffaU), LIMB64_C(0xfffffffaU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xfffffff9U), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000005U)
    },
    { /* P * 7 */
      LIMB64_C(0x00000006U, 0xfffffff9U), LIMB64_C(0xfffffff9U, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xfffffff8U), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000006U)
    },
    { /* P * 8 */
      LIMB64_C(0x00000007U, 0xfffffff8U), LIMB64_C(0xfffffff8U, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xfffffff7U), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000007U)
    },
  };
  const mpi_limb64_t zero = LIMB_TO64(0);
  mpi_ptr_t wp;
  /* Six 64-bit data limbs plus one extra limb for the folding carry. */
  mpi_limb64_t s[(384 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64 + 1];
  mpi_limb64_t t[DIM(s)];
  mpi_limb64_t d[DIM(s)];
  mpi_limb64_t x[DIM(s)];
#if (BITS_PER_MPI_LIMB64 == BITS_PER_MPI_LIMB) && defined(WORDS_BIGENDIAN)
  /* Big-endian scratch: W >> 32, so LOAD64_SHR32 can use aligned loads. */
  mpi_limb_t wp_shr32[(DIM(s) - 1) * LIMBS_PER_LIMB64];
#endif
  const mpi_size_t wsize = DIM(s) - 1;
  /* Resize below may grow ctx->p; remember and restore nlimbs. */
  mpi_size_t psize = ctx->p->nlimbs;
  mpi_limb_t mask1;
  mpi_limb_t mask2;
  mpi_limb_t s_is_negative;
  int carry;

  MPN_NORMALIZE (w->d, w->nlimbs);
  if (mpi_nbits_more_than (w, 2 * 384))
    log_bug ("W must be less than m^2\n");

  RESIZE_AND_CLEAR_IF_NEEDED (w, wsize * 2 * LIMBS_PER_LIMB64);
  RESIZE_AND_CLEAR_IF_NEEDED (ctx->p, wsize * LIMBS_PER_LIMB64);
  ctx->p->nlimbs = psize;

  wp = w->d;

  prefetch (p_mult, sizeof(p_mult));

  /* See "FIPS 186-4, D.2.4 Curve P-384". */

  /* LOAD64_SHR32(idx): load the 64-bit word starting at 32-bit word
   * 'idx' of W (i.e. a load offset by 32 bits into a limb).
   * NOTE(review): 'idx' is used unparenthesized in the unaligned
   * variant; all call sites pass plain integer literals, so this is
   * harmless as written.  */
#if BITS_PER_MPI_LIMB64 == BITS_PER_MPI_LIMB
# ifdef WORDS_BIGENDIAN
#  define LOAD64_SHR32(idx) LOAD64(wp_shr32, ((idx) / 2 - wsize))
  _gcry_mpih_rshift (wp_shr32, wp + 384 / BITS_PER_MPI_LIMB,
         wsize * LIMBS_PER_LIMB64, 32);
# else
# define LOAD64_SHR32(idx) LOAD64_UNALIGNED(wp, idx / 2)
#endif
#else
# define LOAD64_SHR32(idx) LIMB64_HILO(LOAD32(wp, (idx) + 1), LOAD32(wp, idx))
#endif

  /* "S1 + S1" with 64-bit limbs:
   *     [0:A23]:[A22:A21]
   *  +  [0:A23]:[A22:A21]
   *  => s[3]:s[2]
   */
  ADD2_LIMB64 (s[3], s[2],
         LIMB64_HILO(0, LOAD32(wp, 23)),
         LOAD64_SHR32(21),
         LIMB64_HILO(0, LOAD32(wp, 23)),
         LOAD64_SHR32(21));

  /* "S5 + S6" with 64-bit limbs:
   *     [A23:A22]:[A21:A20]:[  0:0]:[0:  0]
   *  +  [  0:  0]:[A23:A22]:[A21:0]:[0:A20]
   *  => x[4]:x[3]:x[2]:x[1]:x[0]
   */
  x[0] = LIMB64_HILO(0, LOAD32(wp, 20));
  x[1] = LIMB64_HILO(LOAD32(wp, 21), 0);
  ADD3_LIMB64 (x[4], x[3], x[2],
         zero, LOAD64(wp, 22 / 2), LOAD64(wp, 20 / 2),
         zero, zero, LOAD64(wp, 22 / 2));

  /* "D2 + D3" with 64-bit limbs:
   *     [0:A23]:[A22:A21]:[A20:0]
   *  +  [0:A23]:[A23:0]:[0:0]
   *  => d[2]:d[1]:d[0]
   */
  d[0] = LIMB64_HILO(LOAD32(wp, 20), 0);
  ADD2_LIMB64 (d[2], d[1],
         LIMB64_HILO(0, LOAD32(wp, 23)),
         LOAD64_SHR32(21),
         LIMB64_HILO(0, LOAD32(wp, 23)),
         LIMB64_HILO(LOAD32(wp, 23), 0));

  /* "2*S1 + S5 + S6" with 64-bit limbs:
   *     s[4]:s[3]:s[2]:s[1]:s[0]
   *  +  x[4]:x[3]:x[2]:x[1]:x[0]
   *  => s[4]:s[3]:s[2]:s[1]:s[0]
   */
  s[0] = x[0];
  s[1] = x[1];
  ADD3_LIMB64(s[4], s[3], s[2],
        zero, s[3], s[2],
        x[4], x[3], x[2]);

  /* "T + S2" with 64-bit limbs:
   *     [A11:A10]:[ A9: A8]:[ A7: A6]:[ A5: A4]:[ A3: A2]:[ A1: A0]
   *  +  [A23:A22]:[A21:A20]:[A19:A18]:[A17:A16]:[A15:A14]:[A13:A12]
   *  => t[6]:t[5]:t[4]:t[3]:t[2]:t[1]:t[0]
   */
  ADD7_LIMB64 (t[6], t[5], t[4], t[3], t[2], t[1], t[0],
         zero,
         LOAD64(wp, 10 / 2), LOAD64(wp, 8 / 2), LOAD64(wp, 6 / 2),
         LOAD64(wp, 4 / 2), LOAD64(wp, 2 / 2), LOAD64(wp, 0 / 2),
         zero,
         LOAD64(wp, 22 / 2), LOAD64(wp, 20 / 2), LOAD64(wp, 18 / 2),
         LOAD64(wp, 16 / 2), LOAD64(wp, 14 / 2), LOAD64(wp, 12 / 2));

  /* "2*S1 + S4 + S5 + S6" with 64-bit limbs:
   *     s[6]:     s[5]:     s[4]:     s[3]:     s[2]:   s[1]:   s[0]
   *  +       [A19:A18]:[A17:A16]:[A15:A14]:[A13:A12]:[A20:0]:[A23:0]
   *  => s[6]:s[5]:s[4]:s[3]:s[2]:s[1]:s[0]
   */
  ADD7_LIMB64 (s[6], s[5], s[4], s[3], s[2], s[1], s[0],
         zero, zero, s[4], s[3], s[2], s[1], s[0],
         zero,
         LOAD64(wp, 18 / 2), LOAD64(wp, 16 / 2),
         LOAD64(wp, 14 / 2), LOAD64(wp, 12 / 2),
         LIMB64_HILO(LOAD32(wp, 20), 0),
         LIMB64_HILO(LOAD32(wp, 23), 0));

  /* "D1 + D2 + D3" with 64-bit limbs:
   *     d[6]:     d[5]:     d[4]:     d[3]:     d[2]:     d[1]:     d[0]
   *  +       [A22:A21]:[A20:A19]:[A18:A17]:[A16:A15]:[A14:A13]:[A12:A23]
   *  => d[6]:d[5]:d[4]:d[3]:d[2]:d[1]:d[0]
   */
  ADD7_LIMB64 (d[6], d[5], d[4], d[3], d[2], d[1], d[0],
         zero, zero, zero, zero, d[2], d[1], d[0],
         zero,
         LOAD64_SHR32(21),
         LOAD64_SHR32(19),
         LOAD64_SHR32(17),
         LOAD64_SHR32(15),
         LOAD64_SHR32(13),
         LIMB64_HILO(LOAD32(wp, 12), LOAD32(wp, 23)));

  /* "2*S1 + S3 + S4 + S5 + S6" with 64-bit limbs:
   *     s[6]:     s[5]:     s[4]:     s[3]:     s[2]:     s[1]:     s[0]
   *  +       [A20:A19]:[A18:A17]:[A16:A15]:[A14:A13]:[A12:A23]:[A22:A21]
   *  => s[6]:s[5]:s[4]:s[3]:s[2]:s[1]:s[0]
   */
  ADD7_LIMB64 (s[6], s[5], s[4], s[3], s[2], s[1], s[0],
         s[6], s[5], s[4], s[3], s[2], s[1], s[0],
         zero,
         LOAD64_SHR32(19),
         LOAD64_SHR32(17),
         LOAD64_SHR32(15),
         LOAD64_SHR32(13),
         LIMB64_HILO(LOAD32(wp, 12), LOAD32(wp, 23)),
         LOAD64_SHR32(21));

  /* "T + 2*S1 + S2 + S3 + S4 + S5 + S6" */
  ADD7_LIMB64 (s[6], s[5], s[4], s[3], s[2], s[1], s[0],
               s[6], s[5], s[4], s[3], s[2], s[1], s[0],
               t[6], t[5], t[4], t[3], t[2], t[1], t[0]);

  /* "T + 2*S1 + S2 + S3 + S4 + S5 + S6 - D1 - D2 - D3" */
  SUB7_LIMB64 (s[6], s[5], s[4], s[3], s[2], s[1], s[0],
               s[6], s[5], s[4], s[3], s[2], s[1], s[0],
               d[6], d[5], d[4], d[3], d[2], d[1], d[0]);

#undef LOAD64_SHR32

  /* mod p:
   *  's[6]' holds carry value (-3..7). Subtract (carry + 1) * p. Result
   *  will be with in range -p...p. Handle result being negative with
   *  addition and conditional store. */

  carry = LO32_LIMB64(s[6]);

  SUB7_LIMB64 (s[6], s[5], s[4], s[3], s[2], s[1], s[0],
         s[6], s[5], s[4], s[3], s[2], s[1], s[0],
         p_mult[carry + 3][6], p_mult[carry + 3][5],
         p_mult[carry + 3][4], p_mult[carry + 3][3],
         p_mult[carry + 3][2], p_mult[carry + 3][1],
         p_mult[carry + 3][0]);

  /* d = s + p (candidate for the negative case).  The top limb of 1*p
   * (p_mult[3][6]) is zero, so 'zero' is passed for it.  */
  ADD7_LIMB64 (d[6], d[5], d[4], d[3], d[2], d[1], d[0],
         s[6], s[5], s[4], s[3], s[2], s[1], s[0],
         zero,
         p_mult[0 + 3][5], p_mult[0 + 3][4],
         p_mult[0 + 3][3], p_mult[0 + 3][2],
         p_mult[0 + 3][1], p_mult[0 + 3][0]);

  s_is_negative = LO32_LIMB64(s[6]) >> 31;
  mask2 = vzero - s_is_negative;
  mask1 = s_is_negative - vone;

  /* Branch-free select: store d when s was negative, else s. */
  STORE64_COND(wp, 0, mask2, d[0], mask1, s[0]);
  STORE64_COND(wp, 1, mask2, d[1], mask1, s[1]);
  STORE64_COND(wp, 2, mask2, d[2], mask1, s[2]);
  STORE64_COND(wp, 3, mask2, d[3], mask1, s[3]);
  STORE64_COND(wp, 4, mask2, d[4], mask1, s[4]);
  STORE64_COND(wp, 5, mask2, d[5], mask1, s[5]);

  w->nlimbs = wsize * LIMBS_PER_LIMB64;
  MPN_NORMALIZE (wp, w->nlimbs);

#if (BITS_PER_MPI_LIMB64 == BITS_PER_MPI_LIMB) && defined(WORDS_BIGENDIAN)
  /* The scratch buffer holds a copy of (secret) W; wipe it. */
  wipememory(wp_shr32, sizeof(wp_shr32));
#endif
}
790
791
/* Fast in-place reduction of W modulo the NIST P-521 prime
 * (FIPS 186-4, D.2.5).  Since p = 2^521 - 1, writing
 * W = A1 * 2^521 + A0 gives W mod p = A0 + A1 (mod p).  The sum is
 * folded once, then brought into [0, p) with one subtraction of p and
 * a conditional add-back done via mpih_set_cond (mask-based copy, per
 * the vzero/vone rationale above).  W must be below p^2.  */
void
_gcry_mpi_ec_nist521_mod (gcry_mpi_t w, mpi_ec_t ctx)
{
  mpi_limb_t s[(521 + BITS_PER_MPI_LIMB - 1) / BITS_PER_MPI_LIMB];
  const mpi_size_t wsize = DIM(s);
  mpi_limb_t cy;
  mpi_ptr_t wp;

  MPN_NORMALIZE (w->d, w->nlimbs);
  if (mpi_nbits_more_than (w, 2 * 521))
    log_bug ("W must be less than m^2\n");

  RESIZE_AND_CLEAR_IF_NEEDED (w, wsize * 2);

  wp = w->d;

  /* See "FIPS 186-4, D.2.5 Curve P-521". */

  /* s = A1 = W >> 521; then clip both s and wp to 521 bits.
   * 521 % BITS_PER_MPI_LIMB is 9 for both 32- and 64-bit limbs, so
   * the plain-int '(1 << ...) - 1' mask cannot overflow.  */
  _gcry_mpih_rshift (s, wp + wsize - 1, wsize, 521 % BITS_PER_MPI_LIMB);
  s[wsize - 1] &= (1 << (521 % BITS_PER_MPI_LIMB)) - 1;
  wp[wsize - 1] &= (1 << (521 % BITS_PER_MPI_LIMB)) - 1;
  /* wp = A0 + A1 */
  _gcry_mpih_add_n (wp, wp, s, wsize);

  /* "mod p": subtract p once; if that borrowed, take the add-back
   * result instead via a masked conditional copy (no branch on the
   * borrow).  */
  cy = _gcry_mpih_sub_n (wp, wp, ctx->p->d, wsize);
  _gcry_mpih_add_n (s, wp, ctx->p->d, wsize);
  mpih_set_cond (wp, s, wsize, (cy != 0UL));

  w->nlimbs = wsize;
  MPN_NORMALIZE (wp, w->nlimbs);
}
822
823
#endif /* !ASM_DISABLED */