Coverage Report

Created: 2024-11-21 07:03

/src/libgcrypt/mpi/ec-nist.c
Line
Count
Source (jump to first uncovered line)
1
/* ec-nist.c -  NIST optimized elliptic curve functions
2
 * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
3
 *
4
 * This file is part of Libgcrypt.
5
 *
6
 * Libgcrypt is free software; you can redistribute it and/or modify
7
 * it under the terms of the GNU Lesser General Public License as
8
 * published by the Free Software Foundation; either version 2.1 of
9
 * the License, or (at your option) any later version.
10
 *
11
 * Libgcrypt is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
 * GNU Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
18
 */
19
20
#include <config.h>
21
#include <stdio.h>
22
#include <stdlib.h>
23
#include <errno.h>
24
25
26
#ifndef ASM_DISABLED
27
28
29
#include "mpi-internal.h"
30
#include "longlong.h"
31
#include "g10lib.h"
32
#include "context.h"
33
#include "ec-context.h"
34
#include "ec-inline.h"
35
#include "const-time.h"
36
37
38
static inline
39
void prefetch(const void *tab, size_t len)
40
9.68M
{
41
9.68M
  const volatile byte *vtab = tab;
42
43
9.68M
  if (len > 0 * 64)
44
9.68M
    (void)vtab[0 * 64];
45
9.68M
  if (len > 1 * 64)
46
9.68M
    (void)vtab[1 * 64];
47
9.68M
  if (len > 2 * 64)
48
9.23M
    (void)vtab[2 * 64];
49
9.68M
  if (len > 3 * 64)
50
8.17M
    (void)vtab[3 * 64];
51
9.68M
  if (len > 4 * 64)
52
8.17M
    (void)vtab[4 * 64];
53
9.68M
  if (len > 5 * 64)
54
8.17M
    (void)vtab[5 * 64];
55
9.68M
  if (len > 6 * 64)
56
8.17M
    (void)vtab[6 * 64];
57
9.68M
  if (len > 7 * 64)
58
8.17M
    (void)vtab[7 * 64];
59
9.68M
  if (len > 8 * 64)
60
4.48M
    (void)vtab[8 * 64];
61
9.68M
  if (len > 9 * 64)
62
4.48M
    (void)vtab[9 * 64];
63
9.68M
  if (len > 10 * 64)
64
0
    (void)vtab[10 * 64];
65
9.68M
  (void)vtab[len - 1];
66
9.68M
}
67
68
69
/* Fast reduction routines for NIST curves.  */
70
71
/* Fast reduction of W modulo the NIST P-192 prime, following the
 * per-32-bit-word recombination of "FIPS 186-4, D.2.1 Curve P-192",
 * implemented here on 64-bit limbs.
 *
 * W   - value to reduce in place; must satisfy W < p^2 (checked below,
 *       violation is a fatal log_bug).  On return W holds W mod p and is
 *       normalized.
 * CTX - curve context; ctx->p may be resized by
 *       RESIZE_AND_CLEAR_IF_NEEDED.
 *
 * The final reduction is done without secret-dependent branches: the
 * choice between 's' and the corrected 'o' is made with constant-time
 * masks and STORE64_COND.  */
void
_gcry_mpi_ec_nist192_mod (gcry_mpi_t w, mpi_ec_t ctx)
{
  /* Precomputed multiples 1*P, 2*P, 3*P of the P-192 prime, indexed
   * directly by the carry value (1..3 maps to rows 0..2 via
   * p_mult[carry]).  */
  static const mpi_limb64_t p_mult[3][4] =
  {
    { /* P * 1 */
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xfffffffeU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0x00000000U)
    },
    { /* P * 2 */
      LIMB64_C(0xffffffffU, 0xfffffffeU), LIMB64_C(0xffffffffU, 0xfffffffdU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0x00000001U)
    },
    { /* P * 3 */
      LIMB64_C(0xffffffffU, 0xfffffffdU), LIMB64_C(0xffffffffU, 0xfffffffcU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0x00000002U)
    }
  };
  const mpi_limb64_t zero = LIMB_TO64(0);
  mpi_ptr_t wp;
  /* 192-bit accumulator plus one extra limb for the carry.  */
  mpi_limb64_t s[192 / BITS_PER_MPI_LIMB64 + 1];
  mpi_limb64_t o[DIM(s)];              /* s + p, used when s went negative */
  const mpi_size_t wsize = DIM(s) - 1; /* limb64 count of the 192-bit value */
  mpi_limb_t mask1;
  mpi_limb_t mask2;
  mpi_limb_t s_is_negative;
  int carry;

  MPN_NORMALIZE (w->d, w->nlimbs);
  if (mpi_nbits_more_than (w, 2 * 192))
    log_bug ("W must be less than m^2\n");

  /* Make sure both operands have enough (zero-cleared) limbs for the
   * fixed-width 64-bit loads/stores below.
   * NOTE(review): unlike the P-224/256/384 routines, this one does not
   * save/restore ctx->p->nlimbs around the resize — confirm whether that
   * asymmetry is intentional.  */
  RESIZE_AND_CLEAR_IF_NEEDED (w, wsize * 2 * LIMBS_PER_LIMB64);
  RESIZE_AND_CLEAR_IF_NEEDED (ctx->p, wsize * LIMBS_PER_LIMB64);

  wp = w->d;

  /* Pull the whole multiple-of-P table into cache before indexing it.  */
  prefetch (p_mult, sizeof(p_mult));

  /* See "FIPS 186-4, D.2.1 Curve P-192". */

  /* Accumulate the three 192-bit terms S1..S3 built from the 64-bit
   * words of W (LOAD64(wp, i) is the i-th 64-bit word).  */
  s[0] = LOAD64(wp, 3);
  ADD3_LIMB64 (s[3],  s[2],          s[1],
               zero,  zero,          LOAD64(wp, 3),
               zero,  LOAD64(wp, 4), LOAD64(wp, 4));

  ADD4_LIMB64 (s[3],  s[2],          s[1],          s[0],
               s[3],  s[2],          s[1],          s[0],
               zero,  LOAD64(wp, 5), LOAD64(wp, 5), LOAD64(wp, 5));

  /* Add the low half of W (the "T" term).  */
  ADD4_LIMB64 (s[3],  s[2],          s[1],          s[0],
               s[3],  s[2],          s[1],          s[0],
               zero,  LOAD64(wp, 2), LOAD64(wp, 1), LOAD64(wp, 0));

  /* mod p:
   *  's[3]' holds carry value (0..2). Subtract (carry + 1) * p. Result will be
   *  with in range -p...p. Handle result being negative with addition and
   *  conditional store. */

  carry = LO32_LIMB64(s[3]);

  /* p_mult[carry] == (carry + 1) * P since the table starts at 1*P.  */
  SUB4_LIMB64 (s[3], s[2], s[1], s[0],
               s[3], s[2], s[1], s[0],
               p_mult[carry][3], p_mult[carry][2],
               p_mult[carry][1], p_mult[carry][0]);

  /* o = s + p, selected below iff s went negative.  */
  ADD4_LIMB64 (o[3], o[2], o[1], o[0],
               s[3], s[2], s[1], s[0],
               zero,
               p_mult[0][2], p_mult[0][1], p_mult[0][0]);

  /* Sign bit of the 32-bit carry limb decides which result to keep.  */
  s_is_negative = LO32_LIMB64(s[3]) >> 31;

  mask2 = ct_limb_gen_mask(s_is_negative);
  mask1 = ct_limb_gen_inv_mask(s_is_negative);

  /* Constant-time select: w = s_is_negative ? o : s.  */
  STORE64_COND(wp, 0, mask2, o[0], mask1, s[0]);
  STORE64_COND(wp, 1, mask2, o[1], mask1, s[1]);
  STORE64_COND(wp, 2, mask2, o[2], mask1, s[2]);

  w->nlimbs = 192 / BITS_PER_MPI_LIMB;
  MPN_NORMALIZE (wp, w->nlimbs);
}
154
155
/* Fast reduction of W modulo the NIST P-224 prime, following
 * "FIPS 186-4, D.2.2 Curve P-224" on 64-bit limbs.
 *
 * W   - value to reduce in place; must satisfy W < p^2 (fatal log_bug
 *       otherwise).  On return W holds W mod p, normalized.
 * CTX - curve context; ctx->p may be resized, its limb count is
 *       preserved via 'psize'.
 *
 * Final selection between 's' and the corrected 'd' is branch-free
 * (constant-time masks + STORE64_COND).  */
void
_gcry_mpi_ec_nist224_mod (gcry_mpi_t w, mpi_ec_t ctx)
{
  /* Multiples -1*P .. 3*P of the P-224 prime; the signed carry c is
   * mapped to a row with p_mult[c + 2].  */
  static const mpi_limb64_t p_mult[5][4] =
  {
    { /* P * -1 */
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xffffffffU, 0x00000000U)
    },
    { /* P * 0 */
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U)
    },
    { /* P * 1 */
      LIMB64_C(0x00000000U, 0x00000001U), LIMB64_C(0xffffffffU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0xffffffffU)
    },
    { /* P * 2 */
      LIMB64_C(0x00000000U, 0x00000002U), LIMB64_C(0xfffffffeU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000001U, 0xffffffffU)
    },
    { /* P * 3 */
      LIMB64_C(0x00000000U, 0x00000003U), LIMB64_C(0xfffffffdU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000002U, 0xffffffffU)
    }
  };
  const mpi_limb64_t zero = LIMB_TO64(0);
  mpi_ptr_t wp;
  /* 224 bits rounded up to whole 64-bit limbs (4 limbs; the top 32 bits
   * of s[3] double as the signed carry).  */
  mpi_limb64_t s[(224 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64];
  mpi_limb64_t d[DIM(s)];           /* subtrahend, later the s+p fallback */
  const mpi_size_t wsize = DIM(s);
  mpi_size_t psize = ctx->p->nlimbs; /* preserved across the resize below */
  mpi_limb_t mask1;
  mpi_limb_t mask2;
  mpi_limb_t s_is_negative;
  int carry;

  MPN_NORMALIZE (w->d, w->nlimbs);
  if (mpi_nbits_more_than (w, 2 * 224))
    log_bug ("W must be less than m^2\n");

  /* Guarantee enough zero-cleared limbs for the raw 32/64-bit loads and
   * stores below; restore p's limb count afterwards.  */
  RESIZE_AND_CLEAR_IF_NEEDED (w, wsize * 2 * LIMBS_PER_LIMB64);
  RESIZE_AND_CLEAR_IF_NEEDED (ctx->p, wsize * LIMBS_PER_LIMB64);
  ctx->p->nlimbs = psize;

  wp = w->d;

  /* Cache the multiple-of-P table before indexing it.  */
  prefetch (p_mult, sizeof(p_mult));

  /* See "FIPS 186-4, D.2.2 Curve P-224".
   * Ai below denotes the i-th 32-bit word of W (LOAD32(wp, i));
   * LOAD64(wp, i/2) loads the aligned 64-bit word holding A(i+1):Ai.  */

  /* "S1 + S2" with 64-bit limbs:
   *     [0:A10]:[ A9: A8]:[ A7:0]:[0:0]
   *  +    [0:0]:[A13:A12]:[A11:0]:[0:0]
   *  => s[3]:s[2]:s[1]:s[0]
   */
  s[0] = zero;
  ADD3_LIMB64 (s[3], s[2], s[1],
               LIMB64_HILO(0, LOAD32(wp, 10)),
               LOAD64(wp, 8 / 2),
               LIMB64_HILO(LOAD32(wp, 7), 0),
               zero,
               LOAD64(wp, 12 / 2),
               LIMB64_HILO(LOAD32(wp, 11), 0));

  /* "T + S1 + S2" */
  ADD4_LIMB64 (s[3], s[2], s[1], s[0],
               s[3], s[2], s[1], s[0],
               LIMB64_HILO(0, LOAD32(wp, 6)),
               LOAD64(wp, 4 / 2),
               LOAD64(wp, 2 / 2),
               LOAD64(wp, 0 / 2));

  /* "D1 + D2" with 64-bit limbs:
   *     [0:A13]:[A12:A11]:[A10: A9]:[ A8: A7]
   *  +    [0:0]:[  0:  0]:[  0:A13]:[A12:A11]
   *  => d[3]:d[2]:d[1]:d[0]
   */
  ADD4_LIMB64 (d[3], d[2], d[1], d[0],
               LIMB64_HILO(0, LOAD32(wp, 13)),
               LOAD64_UNALIGNED(wp, 11 / 2),
               LOAD64_UNALIGNED(wp, 9 / 2),
               LOAD64_UNALIGNED(wp, 7 / 2),
               zero,
               zero,
               LIMB64_HILO(0, LOAD32(wp, 13)),
               LOAD64_UNALIGNED(wp, 11 / 2));

  /* "T + S1 + S2 - D1 - D2" */
  SUB4_LIMB64 (s[3], s[2], s[1], s[0],
               s[3], s[2], s[1], s[0],
               d[3], d[2], d[1], d[0]);

  /* mod p:
   *  Upper 32-bits of 's[3]' holds carry value (-2..2).
   *  Subtract (carry + 1) * p. Result will be with in range -p...p.
   *  Handle result being negative with addition and conditional store. */

  carry = HI32_LIMB64(s[3]);

  /* p_mult[carry + 2] == (carry + 1) * P (table rows start at -1*P).  */
  SUB4_LIMB64 (s[3], s[2], s[1], s[0],
               s[3], s[2], s[1], s[0],
               p_mult[carry + 2][3], p_mult[carry + 2][2],
               p_mult[carry + 2][1], p_mult[carry + 2][0]);

  /* d = s + 1*P, selected below iff s went negative.  */
  ADD4_LIMB64 (d[3], d[2], d[1], d[0],
               s[3], s[2], s[1], s[0],
               p_mult[0 + 2][3], p_mult[0 + 2][2],
               p_mult[0 + 2][1], p_mult[0 + 2][0]);

  /* Sign bit of the high 32 bits of s[3].  */
  s_is_negative = (HI32_LIMB64(s[3]) >> 31);

  mask2 = ct_limb_gen_mask(s_is_negative);
  mask1 = ct_limb_gen_inv_mask(s_is_negative);

  /* Constant-time select: w = s_is_negative ? d : s.  */
  STORE64_COND(wp, 0, mask2, d[0], mask1, s[0]);
  STORE64_COND(wp, 1, mask2, d[1], mask1, s[1]);
  STORE64_COND(wp, 2, mask2, d[2], mask1, s[2]);
  STORE64_COND(wp, 3, mask2, d[3], mask1, s[3]);

  w->nlimbs = wsize * LIMBS_PER_LIMB64;
  MPN_NORMALIZE (wp, w->nlimbs);
}
278
279
/* Fast reduction of W modulo the NIST P-256 prime, following
 * "FIPS 186-4, D.2.3 Curve P-256" on 64-bit limbs.
 *
 * W   - value to reduce in place; must satisfy W < p^2 (fatal log_bug
 *       otherwise).  On return W holds W mod p, normalized.
 * CTX - curve context; ctx->p may be resized, its limb count is
 *       preserved via 'psize'.
 *
 * After the carry correction the result lies in -2p..p, so TWO fallback
 * candidates are prepared (s+p and s+2p) and the final value is picked
 * with constant-time masks — no secret-dependent branches.  */
void
_gcry_mpi_ec_nist256_mod (gcry_mpi_t w, mpi_ec_t ctx)
{
  /* Multiples -3*P .. 7*P of the P-256 prime; the signed carry c is
   * mapped to a row with p_mult[c + 4] (rows 0..10 used; the declared
   * size 12 leaves one unused zero row).  */
  static const mpi_limb64_t p_mult[12][5] =
  {
    { /* P * -3 */
      LIMB64_C(0x00000000U, 0x00000003U), LIMB64_C(0xfffffffdU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000002U, 0xfffffffcU),
      LIMB64_C(0xffffffffU, 0xfffffffdU)
    },
    { /* P * -2 */
      LIMB64_C(0x00000000U, 0x00000002U), LIMB64_C(0xfffffffeU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000001U, 0xfffffffdU),
      LIMB64_C(0xffffffffU, 0xfffffffeU)
    },
    { /* P * -1 */
      LIMB64_C(0x00000000U, 0x00000001U), LIMB64_C(0xffffffffU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0xfffffffeU),
      LIMB64_C(0xffffffffU, 0xffffffffU)
    },
    { /* P * 0 */
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0x00000000U, 0x00000000U)
    },
    { /* P * 1 */
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xffffffffU, 0x00000001U),
      LIMB64_C(0x00000000U, 0x00000000U)
    },
    { /* P * 2 */
      LIMB64_C(0xffffffffU, 0xfffffffeU), LIMB64_C(0x00000001U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffffeU, 0x00000002U),
      LIMB64_C(0x00000000U, 0x00000001U)
    },
    { /* P * 3 */
      LIMB64_C(0xffffffffU, 0xfffffffdU), LIMB64_C(0x00000002U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffffdU, 0x00000003U),
      LIMB64_C(0x00000000U, 0x00000002U)
    },
    { /* P * 4 */
      LIMB64_C(0xffffffffU, 0xfffffffcU), LIMB64_C(0x00000003U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffffcU, 0x00000004U),
      LIMB64_C(0x00000000U, 0x00000003U)
    },
    { /* P * 5 */
      LIMB64_C(0xffffffffU, 0xfffffffbU), LIMB64_C(0x00000004U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffffbU, 0x00000005U),
      LIMB64_C(0x00000000U, 0x00000004U)
    },
    { /* P * 6 */
      LIMB64_C(0xffffffffU, 0xfffffffaU), LIMB64_C(0x00000005U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffffaU, 0x00000006U),
      LIMB64_C(0x00000000U, 0x00000005U)
    },
    { /* P * 7 */
      LIMB64_C(0xffffffffU, 0xfffffff9U), LIMB64_C(0x00000006U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffff9U, 0x00000007U),
      LIMB64_C(0x00000000U, 0x00000006U)
    }
  };
  const mpi_limb64_t zero = LIMB_TO64(0);
  mpi_ptr_t wp;
  /* 256-bit accumulator plus one extra limb for the signed carry.  */
  mpi_limb64_t s[(256 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64 + 1];
  mpi_limb64_t t[DIM(s)];              /* T + S3 + S4 partial sum */
  mpi_limb64_t d[DIM(s)];              /* D terms, later the s+p candidate */
  mpi_limb64_t e[DIM(s)];              /* p-multiple scratch / s+2p candidate */
  const mpi_size_t wsize = DIM(s) - 1;
  mpi_size_t psize = ctx->p->nlimbs;   /* preserved across the resize below */
  mpi_limb_t mask1;
  mpi_limb_t mask2;
  mpi_limb_t mask3;
  mpi_limb_t s_is_negative;
  mpi_limb_t d_is_negative;
  int carry;

  MPN_NORMALIZE (w->d, w->nlimbs);
  if (mpi_nbits_more_than (w, 2 * 256))
    log_bug ("W must be less than m^2\n");

  /* Guarantee enough zero-cleared limbs for the raw 32/64-bit loads and
   * stores below; restore p's limb count afterwards.  */
  RESIZE_AND_CLEAR_IF_NEEDED (w, wsize * 2 * LIMBS_PER_LIMB64);
  RESIZE_AND_CLEAR_IF_NEEDED (ctx->p, wsize * LIMBS_PER_LIMB64);
  ctx->p->nlimbs = psize;

  wp = w->d;

  /* Cache the multiple-of-P table before indexing it.  */
  prefetch (p_mult, sizeof(p_mult));

  /* See "FIPS 186-4, D.2.3 Curve P-256".
   * Ai below denotes the i-th 32-bit word of W (LOAD32(wp, i)).  */

  /* "S1 + S2" with 64-bit limbs:
   *     [A15:A14]:[A13:A12]:[A11:0]:[0:0]
   *  +    [0:A15]:[A14:A13]:[A12:0]:[0:0]
   *  => s[4]:s[3]:s[2]:s[1]:s[0]
   */
  s[0] = zero;
  ADD4_LIMB64 (s[4], s[3], s[2], s[1],
               zero,
               LOAD64(wp, 14 / 2),
               LOAD64(wp, 12 / 2),
               LIMB64_HILO(LOAD32(wp, 11), 0),
               zero,
               LIMB64_HILO(0, LOAD32(wp, 15)),
               LOAD64_UNALIGNED(wp, 13 / 2),
               LIMB64_HILO(LOAD32(wp, 12), 0));

  /* "S3 + S4" with 64-bit limbs:
   *     [A15:A14]:[  0:  0]:[  0:A10]:[ A9:A8]
   *  +   [A8:A13]:[A15:A14]:[A13:A11]:[A10:A9]
   *  => t[4]:t[3]:t[2]:t[1]:t[0]
   */
  ADD5_LIMB64 (t[4], t[3], t[2], t[1], t[0],
               zero,
               LOAD64(wp, 14 / 2),
               zero,
               LIMB64_HILO(0, LOAD32(wp, 10)),
               LOAD64(wp, 8 / 2),
               zero,
               LIMB64_HILO(LOAD32(wp, 8), LOAD32(wp, 13)),
               LOAD64(wp, 14 / 2),
               LIMB64_HILO(LOAD32(wp, 13), LOAD32(wp, 11)),
               LOAD64_UNALIGNED(wp, 9 / 2));

  /* "2*S1 + 2*S2" (doubling via self-addition) */
  ADD5_LIMB64 (s[4], s[3], s[2], s[1], s[0],
               s[4], s[3], s[2], s[1], s[0],
               s[4], s[3], s[2], s[1], s[0]);

  /* "T + S3 + S4" */
  ADD5_LIMB64 (t[4], t[3], t[2], t[1], t[0],
               t[4], t[3], t[2], t[1], t[0],
               zero,
               LOAD64(wp, 6 / 2),
               LOAD64(wp, 4 / 2),
               LOAD64(wp, 2 / 2),
               LOAD64(wp, 0 / 2));

  /* "2*S1 + 2*S2 - D3" with 64-bit limbs:
   *    s[4]:    s[3]:    s[2]:    s[1]:     s[0]
   *  -       [A12:0]:[A10:A9]:[A8:A15]:[A14:A13]
   *  => s[4]:s[3]:s[2]:s[1]:s[0]
   */
  SUB5_LIMB64 (s[4], s[3], s[2], s[1], s[0],
               s[4], s[3], s[2], s[1], s[0],
               zero,
               LIMB64_HILO(LOAD32(wp, 12), 0),
               LOAD64_UNALIGNED(wp, 9 / 2),
               LIMB64_HILO(LOAD32(wp, 8), LOAD32(wp, 15)),
               LOAD64_UNALIGNED(wp, 13 / 2));

  /* "T + 2*S1 + 2*S2 + S3 + S4 - D3" */
  ADD5_LIMB64 (s[4], s[3], s[2], s[1], s[0],
               s[4], s[3], s[2], s[1], s[0],
               t[4], t[3], t[2], t[1], t[0]);

  /* "D1 + D2" with 64-bit limbs:
   *     [0:A13]:[A12:A11] + [A15:A14]:[A13:A12] => d[2]:d[1]:d[0]
   *     [A10:A8] + [A11:A9] => d[4]:d[3]
   */
  ADD3_LIMB64 (d[2], d[1], d[0],
               zero,
               LIMB64_HILO(0, LOAD32(wp, 13)),
               LOAD64_UNALIGNED(wp, 11 / 2),
               zero,
               LOAD64(wp, 14 / 2),
               LOAD64(wp, 12 / 2));
  ADD2_LIMB64 (d[4], d[3],
               zero, LIMB64_HILO(LOAD32(wp, 10), LOAD32(wp, 8)),
               zero, LIMB64_HILO(LOAD32(wp, 11), LOAD32(wp, 9)));

  /* "D1 + D2 + D4" with 64-bit limbs:
   *    d[4]:    d[3]:     d[2]:  d[1]:     d[0]
   *  +       [A13:0]:[A11:A10]:[A9:0]:[A15:A14]
   *  => d[4]:d[3]:d[2]:d[1]:d[0]
   */
  ADD5_LIMB64 (d[4], d[3], d[2], d[1], d[0],
               d[4], d[3], d[2], d[1], d[0],
               zero,
               LIMB64_HILO(LOAD32(wp, 13), 0),
               LOAD64(wp, 10 / 2),
               LIMB64_HILO(LOAD32(wp, 9), 0),
               LOAD64(wp, 14 / 2));

  /* "T + 2*S1 + 2*S2 + S3 + S4 - D1 - D2 - D3 - D4" */
  SUB5_LIMB64 (s[4], s[3], s[2], s[1], s[0],
               s[4], s[3], s[2], s[1], s[0],
               d[4], d[3], d[2], d[1], d[0]);

  /* mod p:
   *  's[4]' holds carry value (-4..6). Subtract (carry + 1) * p. Result
   *  will be with in range -2*p...p. Handle result being negative with
   *  addition and conditional store. */

  carry = LO32_LIMB64(s[4]);

  /* Load values to stack to ease register pressure on i386. */
  e[0] = p_mult[carry + 4][0];
  e[1] = p_mult[carry + 4][1];
  e[2] = p_mult[carry + 4][2];
  e[3] = p_mult[carry + 4][3];
  e[4] = p_mult[carry + 4][4];
  SUB5_LIMB64 (s[4], s[3], s[2], s[1], s[0],
               s[4], s[3], s[2], s[1], s[0],
               e[4], e[3], e[2], e[1], e[0]);

  /* Add 1*P (candidate for s in -p..0).  */
  ADD5_LIMB64 (d[4], d[3], d[2], d[1], d[0],
               s[4], s[3], s[2], s[1], s[0],
               zero,
               p_mult[0 + 4][3], p_mult[0 + 4][2],
               p_mult[0 + 4][1], p_mult[0 + 4][0]);

  /* Add 2*P (candidate for s in -2p..-p).  */
  ADD5_LIMB64 (e[4], e[3], e[2], e[1], e[0],
               s[4], s[3], s[2], s[1], s[0],
               zero,
               p_mult[1 + 4][3], p_mult[1 + 4][2],
               p_mult[1 + 4][1], p_mult[1 + 4][0]);

  s_is_negative = LO32_LIMB64(s[4]) >> 31;
  d_is_negative = LO32_LIMB64(d[4]) >> 31;
  /* Three mutually exclusive masks:
   *   mask3: s + p still negative  -> take e (= s + 2p)
   *   mask2: s negative, d >= 0    -> take d (= s + p)
   *   mask1: s >= 0                -> take s
   */
  mask3 = ct_limb_gen_mask(d_is_negative);
  mask2 = ct_limb_gen_mask(s_is_negative) & ~mask3;
  mask1 = ct_limb_gen_inv_mask(s_is_negative) & ~mask3;

  /* Constant-time three-way select into s[0..3].  */
  s[0] = LIMB_OR64(MASK_AND64(mask2, d[0]), MASK_AND64(mask1, s[0]));
  s[1] = LIMB_OR64(MASK_AND64(mask2, d[1]), MASK_AND64(mask1, s[1]));
  s[2] = LIMB_OR64(MASK_AND64(mask2, d[2]), MASK_AND64(mask1, s[2]));
  s[3] = LIMB_OR64(MASK_AND64(mask2, d[3]), MASK_AND64(mask1, s[3]));
  s[0] = LIMB_OR64(MASK_AND64(mask3, e[0]), s[0]);
  s[1] = LIMB_OR64(MASK_AND64(mask3, e[1]), s[1]);
  s[2] = LIMB_OR64(MASK_AND64(mask3, e[2]), s[2]);
  s[3] = LIMB_OR64(MASK_AND64(mask3, e[3]), s[3]);

  STORE64(wp, 0, s[0]);
  STORE64(wp, 1, s[1]);
  STORE64(wp, 2, s[2]);
  STORE64(wp, 3, s[3]);

  w->nlimbs = wsize * LIMBS_PER_LIMB64;
  MPN_NORMALIZE (wp, w->nlimbs);
}
521
522
/* Fast reduction of W modulo the NIST P-384 prime, following
 * "FIPS 186-4, D.2.4 Curve P-384" on 64-bit limbs.
 *
 * W   - value to reduce in place; must satisfy W < p^2 (fatal log_bug
 *       otherwise).  On return W holds W mod p, normalized.
 * CTX - curve context; ctx->p may be resized, its limb count is
 *       preserved via 'psize'.
 *
 * Odd-32-bit-word-aligned 64-bit loads are abstracted behind
 * LOAD64_SHR32, whose definition depends on limb width and endianness;
 * on big-endian 64-bit builds an explicitly shifted copy (wp_shr32) is
 * built first and wiped before returning.  The final reduction uses
 * constant-time masks and STORE64_COND — no secret-dependent branches. */
void
_gcry_mpi_ec_nist384_mod (gcry_mpi_t w, mpi_ec_t ctx)
{
  /* Multiples -2*P .. 8*P of the P-384 prime; the signed carry c is
   * mapped to a row with p_mult[c + 3].  */
  static const mpi_limb64_t p_mult[11][7] =
  {
    { /* P * -2 */
      LIMB64_C(0xfffffffeU, 0x00000002U), LIMB64_C(0x00000001U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000002U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xfffffffeU)
    },
    { /* P * -1 */
      LIMB64_C(0xffffffffU, 0x00000001U), LIMB64_C(0x00000000U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000001U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xffffffffU)
    },
    { /* P * 0 */
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0x00000000U, 0x00000000U)
    },
    { /* P * 1 */
      LIMB64_C(0x00000000U, 0xffffffffU), LIMB64_C(0xffffffffU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xfffffffeU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000000U)
    },
    { /* P * 2 */
      LIMB64_C(0x00000001U, 0xfffffffeU), LIMB64_C(0xfffffffeU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xfffffffdU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000001U)
    },
    { /* P * 3 */
      LIMB64_C(0x00000002U, 0xfffffffdU), LIMB64_C(0xfffffffdU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xfffffffcU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000002U)
    },
    { /* P * 4 */
      LIMB64_C(0x00000003U, 0xfffffffcU), LIMB64_C(0xfffffffcU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xfffffffbU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000003U)
    },
    { /* P * 5 */
      LIMB64_C(0x00000004U, 0xfffffffbU), LIMB64_C(0xfffffffbU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xfffffffaU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000004U)
    },
    { /* P * 6 */
      LIMB64_C(0x00000005U, 0xfffffffaU), LIMB64_C(0xfffffffaU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xfffffff9U), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000005U)
    },
    { /* P * 7 */
      LIMB64_C(0x00000006U, 0xfffffff9U), LIMB64_C(0xfffffff9U, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xfffffff8U), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000006U)
    },
    { /* P * 8 */
      LIMB64_C(0x00000007U, 0xfffffff8U), LIMB64_C(0xfffffff8U, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xfffffff7U), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000007U)
    },
  };
  const mpi_limb64_t zero = LIMB_TO64(0);
  mpi_ptr_t wp;
  /* 384-bit accumulator plus one extra limb for the signed carry.  */
  mpi_limb64_t s[(384 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64 + 1];
  mpi_limb64_t t[DIM(s)];              /* T + S2 partial sum */
  mpi_limb64_t d[DIM(s)];              /* D terms, later the s+p candidate */
  mpi_limb64_t x[DIM(s)];              /* S5+S6 scratch, later p-multiple */
#if (BITS_PER_MPI_LIMB64 == BITS_PER_MPI_LIMB) && defined(WORDS_BIGENDIAN)
  /* Big-endian needs an explicit 32-bit-shifted copy of W's upper half
   * for the odd-word 64-bit loads; see LOAD64_SHR32 below.  */
  mpi_limb_t wp_shr32[(DIM(s) - 1) * LIMBS_PER_LIMB64];
#endif
  const mpi_size_t wsize = DIM(s) - 1;
  mpi_size_t psize = ctx->p->nlimbs;   /* preserved across the resize below */
  mpi_limb_t mask1;
  mpi_limb_t mask2;
  mpi_limb_t s_is_negative;
  int carry;

  MPN_NORMALIZE (w->d, w->nlimbs);
  if (mpi_nbits_more_than (w, 2 * 384))
    log_bug ("W must be less than m^2\n");

  /* Guarantee enough zero-cleared limbs for the raw 32/64-bit loads and
   * stores below; restore p's limb count afterwards.  */
  RESIZE_AND_CLEAR_IF_NEEDED (w, wsize * 2 * LIMBS_PER_LIMB64);
  RESIZE_AND_CLEAR_IF_NEEDED (ctx->p, wsize * LIMBS_PER_LIMB64);
  ctx->p->nlimbs = psize;

  wp = w->d;

  /* Cache the multiple-of-P table before indexing it.  */
  prefetch (p_mult, sizeof(p_mult));

  /* See "FIPS 186-4, D.2.4 Curve P-384".
   * Ai below denotes the i-th 32-bit word of W (LOAD32(wp, i)).
   * LOAD64_SHR32(i) yields the 64-bit value A(i+1):Ai for odd i.  */

#if BITS_PER_MPI_LIMB64 == BITS_PER_MPI_LIMB
# ifdef WORDS_BIGENDIAN
#  define LOAD64_SHR32(idx) LOAD64(wp_shr32, ((idx) / 2 - wsize))
  /* Pre-shift the upper half of W by 32 bits so odd-indexed 64-bit
   * words can be read with aligned loads.  */
  _gcry_mpih_rshift (wp_shr32, wp + 384 / BITS_PER_MPI_LIMB,
                     wsize * LIMBS_PER_LIMB64, 32);
# else
  /* Little-endian: an unaligned load reads A(i+1):Ai directly.  */
# define LOAD64_SHR32(idx) LOAD64_UNALIGNED(wp, idx / 2)
#endif
#else
  /* 32-bit limbs: compose the 64-bit value from two 32-bit loads.  */
# define LOAD64_SHR32(idx) LIMB64_HILO(LOAD32(wp, (idx) + 1), LOAD32(wp, idx))
#endif

  /* "S1 + S1" with 64-bit limbs:
   *     [0:A23]:[A22:A21]
   *  +  [0:A23]:[A22:A21]
   *  => s[3]:s[2]
   */
  ADD2_LIMB64 (s[3], s[2],
               LIMB64_HILO(0, LOAD32(wp, 23)),
               LOAD64_SHR32(21),
               LIMB64_HILO(0, LOAD32(wp, 23)),
               LOAD64_SHR32(21));

  /* "S5 + S6" with 64-bit limbs:
   *     [A23:A22]:[A21:A20]:[  0:0]:[0:  0]
   *  +  [  0:  0]:[A23:A22]:[A21:0]:[0:A20]
   *  => x[4]:x[3]:x[2]:x[1]:x[0]
   */
  x[0] = LIMB64_HILO(0, LOAD32(wp, 20));
  x[1] = LIMB64_HILO(LOAD32(wp, 21), 0);
  ADD3_LIMB64 (x[4], x[3], x[2],
               zero, LOAD64(wp, 22 / 2), LOAD64(wp, 20 / 2),
               zero, zero, LOAD64(wp, 22 / 2));

  /* "D2 + D3" with 64-bit limbs:
   *     [0:A23]:[A22:A21]:[A20:0]
   *  +  [0:A23]:[A23:0]:[0:0]
   *  => d[2]:d[1]:d[0]
   */
  d[0] = LIMB64_HILO(LOAD32(wp, 20), 0);
  ADD2_LIMB64 (d[2], d[1],
               LIMB64_HILO(0, LOAD32(wp, 23)),
               LOAD64_SHR32(21),
               LIMB64_HILO(0, LOAD32(wp, 23)),
               LIMB64_HILO(LOAD32(wp, 23), 0));

  /* "2*S1 + S5 + S6" with 64-bit limbs:
   *     s[4]:s[3]:s[2]:s[1]:s[0]
   *  +  x[4]:x[3]:x[2]:x[1]:x[0]
   *  => s[4]:s[3]:s[2]:s[1]:s[0]
   */
  s[0] = x[0];
  s[1] = x[1];
  ADD3_LIMB64(s[4], s[3], s[2],
              zero, s[3], s[2],
              x[4], x[3], x[2]);

  /* "T + S2" with 64-bit limbs:
   *     [A11:A10]:[ A9: A8]:[ A7: A6]:[ A5: A4]:[ A3: A2]:[ A1: A0]
   *  +  [A23:A22]:[A21:A20]:[A19:A18]:[A17:A16]:[A15:A14]:[A13:A12]
   *  => t[6]:t[5]:t[4]:t[3]:t[2]:t[1]:t[0]
   */
  ADD7_LIMB64 (t[6], t[5], t[4], t[3], t[2], t[1], t[0],
               zero,
               LOAD64(wp, 10 / 2), LOAD64(wp, 8 / 2), LOAD64(wp, 6 / 2),
               LOAD64(wp, 4 / 2), LOAD64(wp, 2 / 2), LOAD64(wp, 0 / 2),
               zero,
               LOAD64(wp, 22 / 2), LOAD64(wp, 20 / 2), LOAD64(wp, 18 / 2),
               LOAD64(wp, 16 / 2), LOAD64(wp, 14 / 2), LOAD64(wp, 12 / 2));

  /* "2*S1 + S4 + S5 + S6" with 64-bit limbs:
   *     s[6]:     s[5]:     s[4]:     s[3]:     s[2]:   s[1]:   s[0]
   *  +       [A19:A18]:[A17:A16]:[A15:A14]:[A13:A12]:[A20:0]:[A23:0]
   *  => s[6]:s[5]:s[4]:s[3]:s[2]:s[1]:s[0]
   */
  ADD7_LIMB64 (s[6], s[5], s[4], s[3], s[2], s[1], s[0],
               zero, zero, s[4], s[3], s[2], s[1], s[0],
               zero,
               LOAD64(wp, 18 / 2), LOAD64(wp, 16 / 2),
               LOAD64(wp, 14 / 2), LOAD64(wp, 12 / 2),
               LIMB64_HILO(LOAD32(wp, 20), 0),
               LIMB64_HILO(LOAD32(wp, 23), 0));

  /* "D1 + D2 + D3" with 64-bit limbs:
   *     d[6]:     d[5]:     d[4]:     d[3]:     d[2]:     d[1]:     d[0]
   *  +       [A22:A21]:[A20:A19]:[A18:A17]:[A16:A15]:[A14:A13]:[A12:A23]
   *  => d[6]:d[5]:d[4]:d[3]:d[2]:d[1]:d[0]
   */
  ADD7_LIMB64 (d[6], d[5], d[4], d[3], d[2], d[1], d[0],
               zero, zero, zero, zero, d[2], d[1], d[0],
               zero,
               LOAD64_SHR32(21),
               LOAD64_SHR32(19),
               LOAD64_SHR32(17),
               LOAD64_SHR32(15),
               LOAD64_SHR32(13),
               LIMB64_HILO(LOAD32(wp, 12), LOAD32(wp, 23)));

  /* "2*S1 + S3 + S4 + S5 + S6" with 64-bit limbs:
   *     s[6]:     s[5]:     s[4]:     s[3]:     s[2]:     s[1]:     s[0]
   *  +       [A20:A19]:[A18:A17]:[A16:A15]:[A14:A13]:[A12:A23]:[A22:A21]
   *  => s[6]:s[5]:s[4]:s[3]:s[2]:s[1]:s[0]
   */
  ADD7_LIMB64 (s[6], s[5], s[4], s[3], s[2], s[1], s[0],
               s[6], s[5], s[4], s[3], s[2], s[1], s[0],
               zero,
               LOAD64_SHR32(19),
               LOAD64_SHR32(17),
               LOAD64_SHR32(15),
               LOAD64_SHR32(13),
               LIMB64_HILO(LOAD32(wp, 12), LOAD32(wp, 23)),
               LOAD64_SHR32(21));

  /* "T + 2*S1 + S2 + S3 + S4 + S5 + S6" */
  ADD7_LIMB64 (s[6], s[5], s[4], s[3], s[2], s[1], s[0],
               s[6], s[5], s[4], s[3], s[2], s[1], s[0],
               t[6], t[5], t[4], t[3], t[2], t[1], t[0]);

  /* "T + 2*S1 + S2 + S3 + S4 + S5 + S6 - D1 - D2 - D3" */
  SUB7_LIMB64 (s[6], s[5], s[4], s[3], s[2], s[1], s[0],
               s[6], s[5], s[4], s[3], s[2], s[1], s[0],
               d[6], d[5], d[4], d[3], d[2], d[1], d[0]);

#undef LOAD64_SHR32

  /* mod p:
   *  's[6]' holds carry value (-3..7). Subtract (carry + 1) * p. Result
   *  will be with in range -p...p. Handle result being negative with
   *  addition and conditional store. */

  carry = LO32_LIMB64(s[6]);

  /* Load values to stack to ease register pressure on i386.
   * p_mult[carry + 3] == (carry + 1) * P (table rows start at -2*P).  */
  x[0] = p_mult[carry + 3][0];
  x[1] = p_mult[carry + 3][1];
  x[2] = p_mult[carry + 3][2];
  x[3] = p_mult[carry + 3][3];
  x[4] = p_mult[carry + 3][4];
  x[5] = p_mult[carry + 3][5];
  x[6] = p_mult[carry + 3][6];
  SUB7_LIMB64 (s[6], s[5], s[4], s[3], s[2], s[1], s[0],
               s[6], s[5], s[4], s[3], s[2], s[1], s[0],
               x[6], x[5], x[4], x[3], x[2], x[1], x[0]);

  /* d = s + 1*P, selected below iff s went negative.  */
  ADD7_LIMB64 (d[6], d[5], d[4], d[3], d[2], d[1], d[0],
               s[6], s[5], s[4], s[3], s[2], s[1], s[0],
               zero,
               p_mult[0 + 3][5], p_mult[0 + 3][4],
               p_mult[0 + 3][3], p_mult[0 + 3][2],
               p_mult[0 + 3][1], p_mult[0 + 3][0]);

  s_is_negative = LO32_LIMB64(s[6]) >> 31;
  mask2 = ct_limb_gen_mask(s_is_negative);
  mask1 = ct_limb_gen_inv_mask(s_is_negative);

  /* Constant-time select: w = s_is_negative ? d : s.  */
  STORE64_COND(wp, 0, mask2, d[0], mask1, s[0]);
  STORE64_COND(wp, 1, mask2, d[1], mask1, s[1]);
  STORE64_COND(wp, 2, mask2, d[2], mask1, s[2]);
  STORE64_COND(wp, 3, mask2, d[3], mask1, s[3]);
  STORE64_COND(wp, 4, mask2, d[4], mask1, s[4]);
  STORE64_COND(wp, 5, mask2, d[5], mask1, s[5]);

  w->nlimbs = wsize * LIMBS_PER_LIMB64;
  MPN_NORMALIZE (wp, w->nlimbs);

#if (BITS_PER_MPI_LIMB64 == BITS_PER_MPI_LIMB) && defined(WORDS_BIGENDIAN)
  /* Scrub the shifted copy: it holds data derived from W.  */
  wipememory(wp_shr32, sizeof(wp_shr32));
#endif
}
793
794
/* Fast reduction of W modulo the NIST P-521 Mersenne-like prime
 * (p = 2^521 - 1), following "FIPS 186-4, D.2.5 Curve P-521".
 *
 * W   - value to reduce in place; must satisfy W < p^2 (fatal log_bug
 *       otherwise).  On return W holds W mod p, normalized.
 * CTX - curve context; ctx->p->d supplies the prime's limbs.
 *
 * The reduction is W = (W mod 2^521) + floor(W / 2^521), followed by a
 * single conditional subtraction of p done branch-free with
 * mpih_set_cond.  */
void
_gcry_mpi_ec_nist521_mod (gcry_mpi_t w, mpi_ec_t ctx)
{
  /* Enough native limbs to hold a 521-bit value.  */
  mpi_limb_t s[(521 + BITS_PER_MPI_LIMB - 1) / BITS_PER_MPI_LIMB];
  const mpi_size_t wsize = DIM(s);
  mpi_limb_t cy;
  mpi_ptr_t wp;

  MPN_NORMALIZE (w->d, w->nlimbs);
  if (mpi_nbits_more_than (w, 2 * 521))
    log_bug ("W must be less than m^2\n");

  /* Ensure zero-cleared limbs for the full double-width read below.  */
  RESIZE_AND_CLEAR_IF_NEEDED (w, wsize * 2);

  wp = w->d;

  /* See "FIPS 186-4, D.2.5 Curve P-521". */

  /* s = floor(W / 2^521): shift the upper limbs right by 521 mod
   * BITS_PER_MPI_LIMB, then mask both s and W down to 521 bits.  */
  _gcry_mpih_rshift (s, wp + wsize - 1, wsize, 521 % BITS_PER_MPI_LIMB);
  s[wsize - 1] &= (1 << (521 % BITS_PER_MPI_LIMB)) - 1;
  wp[wsize - 1] &= (1 << (521 % BITS_PER_MPI_LIMB)) - 1;
  /* W = (W mod 2^521) + floor(W / 2^521); result < 2p.  */
  _gcry_mpih_add_n (wp, wp, s, wsize);

  /* "mod p": subtract p once; if that borrowed (cy != 0), take the
   * re-added value instead — constant-time via mpih_set_cond.  */
  cy = _gcry_mpih_sub_n (wp, wp, ctx->p->d, wsize);
  _gcry_mpih_add_n (s, wp, ctx->p->d, wsize);
  mpih_set_cond (wp, s, wsize, mpih_limb_is_not_zero (cy));

  w->nlimbs = wsize;
  MPN_NORMALIZE (wp, w->nlimbs);
}
825
826
#endif /* !ASM_DISABLED */