/src/libgcrypt/mpi/ec-nist.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* ec-nist.c - NIST optimized elliptic curve functions |
2 | | * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi> |
3 | | * |
4 | | * This file is part of Libgcrypt. |
5 | | * |
6 | | * Libgcrypt is free software; you can redistribute it and/or modify |
7 | | * it under the terms of the GNU Lesser General Public License as |
8 | | * published by the Free Software Foundation; either version 2.1 of |
9 | | * the License, or (at your option) any later version. |
10 | | * |
11 | | * Libgcrypt is distributed in the hope that it will be useful, |
12 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | | * GNU Lesser General Public License for more details. |
15 | | * |
16 | | * You should have received a copy of the GNU Lesser General Public |
17 | | * License along with this program; if not, see <http://www.gnu.org/licenses/>. |
18 | | */ |
19 | | |
20 | | #include <config.h> |
21 | | #include <stdio.h> |
22 | | #include <stdlib.h> |
23 | | #include <errno.h> |
24 | | |
25 | | |
26 | | #ifndef ASM_DISABLED |
27 | | |
28 | | |
29 | | #include "mpi-internal.h" |
30 | | #include "longlong.h" |
31 | | #include "g10lib.h" |
32 | | #include "context.h" |
33 | | #include "ec-context.h" |
34 | | #include "ec-inline.h" |
35 | | #include "const-time.h" |
36 | | |
37 | | |
/* Touch the LEN-byte table TAB by reading one byte at every 64-byte step
 * and finally its last byte.  The reads go through a volatile pointer so
 * the compiler cannot discard them.  The callers in this file invoke it
 * on their 'p_mult' tables right before indexing them with a computed
 * carry value (presumably to have the whole table cached by then).
 *
 * A LEN of zero is a no-op; TAB is not dereferenced in that case.  */
static inline
void prefetch(const void *tab, size_t len)
{
  /* LEN is a byte count (callers pass sizeof), so step in chars. */
  const volatile unsigned char *vtab = tab;
  const size_t stride = 64;
  size_t last_step;
  size_t i;

  /* Without this guard 'len - 1' below would wrap around to SIZE_MAX. */
  if (len == 0)
    return;

  /* Offsets 0, 64, 128, ... that are still below LEN.  Deriving the step
   * count from 'len - 1' keeps 'i * stride' from ever exceeding LEN. */
  last_step = (len - 1) / stride;
  for (i = 0; i <= last_step; i++)
    (void)vtab[i * stride];

  /* The tail byte, which may sit past the last stride offset. */
  (void)vtab[len - 1];
}
67 | | |
68 | | |
69 | | /* Fast reduction routines for NIST curves. */ |
70 | | |
/* Reduce W in place using the P-192 fast reduction from FIPS 186-4.
 *
 * W is normalized first; if it has more than 2 * 192 bits log_bug() is
 * called.  W is then grown (and cleared) to 2 * wsize 64-bit limbs so the
 * fixed-index limb loads below are backed by storage.  The result is
 * written to the low 192 bits of W and W->nlimbs is re-normalized.
 *
 * The final correction selects between two candidate values with masks
 * and STORE64_COND instead of branching on the sign.
 *
 * NOTE(review): unlike the P-224/P-256/P-384 routines in this file,
 * CTX->p->nlimbs is not saved and restored around the resize of CTX->p
 * here -- presumably not needed for this prime; confirm.  */
void
_gcry_mpi_ec_nist192_mod (gcry_mpi_t w, mpi_ec_t ctx)
{
  /* p_mult[k] is (k + 1) * P, least significant 64-bit limb first. */
  static const mpi_limb64_t p_mult[3][4] =
  {
    { /* P * 1 */
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xfffffffeU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0x00000000U)
    },
    { /* P * 2 */
      LIMB64_C(0xffffffffU, 0xfffffffeU), LIMB64_C(0xffffffffU, 0xfffffffdU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0x00000001U)
    },
    { /* P * 3 */
      LIMB64_C(0xffffffffU, 0xfffffffdU), LIMB64_C(0xffffffffU, 0xfffffffcU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0x00000002U)
    }
  };
  const mpi_limb64_t zero = LIMB_TO64(0);
  mpi_ptr_t wp;
  mpi_limb64_t s[192 / BITS_PER_MPI_LIMB64 + 1]; /* Sum plus one carry limb. */
  mpi_limb64_t o[DIM(s)];                        /* 's' with P added back. */
  const mpi_size_t wsize = DIM(s) - 1;
  mpi_limb_t mask1;
  mpi_limb_t mask2;
  mpi_limb_t s_is_negative;
  int carry;

  MPN_NORMALIZE (w->d, w->nlimbs);
  if (mpi_nbits_more_than (w, 2 * 192))
    log_bug ("W must be less than m^2\n");

  RESIZE_AND_CLEAR_IF_NEEDED (w, wsize * 2 * LIMBS_PER_LIMB64);
  RESIZE_AND_CLEAR_IF_NEEDED (ctx->p, wsize * LIMBS_PER_LIMB64);

  wp = w->d;

  prefetch (p_mult, sizeof(p_mult));

  /* See "FIPS 186-4, D.2.1 Curve P-192". */

  /* Accumulate the terms built from the upper 64-bit limbs 3..5 of W,
   * then add the low limbs 2..0.  s[3] collects the carries. */
  s[0] = LOAD64(wp, 3);
  ADD3_LIMB64 (s[3], s[2], s[1],
               zero, zero, LOAD64(wp, 3),
               zero, LOAD64(wp, 4), LOAD64(wp, 4));

  ADD4_LIMB64 (s[3], s[2], s[1], s[0],
               s[3], s[2], s[1], s[0],
               zero, LOAD64(wp, 5), LOAD64(wp, 5), LOAD64(wp, 5));

  ADD4_LIMB64 (s[3], s[2], s[1], s[0],
               s[3], s[2], s[1], s[0],
               zero, LOAD64(wp, 2), LOAD64(wp, 1), LOAD64(wp, 0));

  /* mod p:
   *  's[3]' holds carry value (0..2). Subtract (carry + 1) * p. Result will
   *  be within range -p...p. Handle result being negative with addition and
   *  conditional store. */

  carry = LO32_LIMB64(s[3]);

  SUB4_LIMB64 (s[3], s[2], s[1], s[0],
               s[3], s[2], s[1], s[0],
               p_mult[carry][3], p_mult[carry][2],
               p_mult[carry][1], p_mult[carry][0]);

  /* Candidate for the negative case: s + P. */
  ADD4_LIMB64 (o[3], o[2], o[1], o[0],
               s[3], s[2], s[1], s[0],
               zero,
               p_mult[0][2], p_mult[0][1], p_mult[0][0]);

  /* Bit 31 of the low half of the carry limb is used as the sign. */
  s_is_negative = LO32_LIMB64(s[3]) >> 31;

  mask2 = ct_limb_gen_mask(s_is_negative);
  mask1 = ct_limb_gen_inv_mask(s_is_negative);

  /* mask2 is paired with 'o' (s + P), mask1 with 's'. */
  STORE64_COND(wp, 0, mask2, o[0], mask1, s[0]);
  STORE64_COND(wp, 1, mask2, o[1], mask1, s[1]);
  STORE64_COND(wp, 2, mask2, o[2], mask1, s[2]);

  w->nlimbs = 192 / BITS_PER_MPI_LIMB;
  MPN_NORMALIZE (wp, w->nlimbs);
}
154 | | |
/* Reduce W in place using the P-224 fast reduction from FIPS 186-4.
 *
 * W is normalized first; if it has more than 2 * 224 bits log_bug() is
 * called.  W and CTX->p are grown (and cleared) as needed; the limb count
 * of CTX->p is saved beforehand and restored afterwards.  The result is
 * stored into the low wsize 64-bit limbs of W and W->nlimbs is
 * re-normalized.
 *
 * The final correction selects between two candidate values with masks
 * and STORE64_COND instead of branching on the sign.  */
void
_gcry_mpi_ec_nist224_mod (gcry_mpi_t w, mpi_ec_t ctx)
{
  /* p_mult[k] is (k - 1) * P, least significant 64-bit limb first; it is
   * indexed with 'carry + 2' below, i.e. it yields (carry + 1) * P. */
  static const mpi_limb64_t p_mult[5][4] =
  {
    { /* P * -1 */
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xffffffffU, 0x00000000U)
    },
    { /* P * 0 */
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U)
    },
    { /* P * 1 */
      LIMB64_C(0x00000000U, 0x00000001U), LIMB64_C(0xffffffffU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0xffffffffU)
    },
    { /* P * 2 */
      LIMB64_C(0x00000000U, 0x00000002U), LIMB64_C(0xfffffffeU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000001U, 0xffffffffU)
    },
    { /* P * 3 */
      LIMB64_C(0x00000000U, 0x00000003U), LIMB64_C(0xfffffffdU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000002U, 0xffffffffU)
    }
  };
  const mpi_limb64_t zero = LIMB_TO64(0);
  mpi_ptr_t wp;
  mpi_limb64_t s[(224 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64];
  mpi_limb64_t d[DIM(s)];
  const mpi_size_t wsize = DIM(s);
  mpi_size_t psize = ctx->p->nlimbs; /* Restored after the resize below. */
  mpi_limb_t mask1;
  mpi_limb_t mask2;
  mpi_limb_t s_is_negative;
  int carry;

  MPN_NORMALIZE (w->d, w->nlimbs);
  if (mpi_nbits_more_than (w, 2 * 224))
    log_bug ("W must be less than m^2\n");

  RESIZE_AND_CLEAR_IF_NEEDED (w, wsize * 2 * LIMBS_PER_LIMB64);
  RESIZE_AND_CLEAR_IF_NEEDED (ctx->p, wsize * LIMBS_PER_LIMB64);
  ctx->p->nlimbs = psize;

  wp = w->d;

  prefetch (p_mult, sizeof(p_mult));

  /* See "FIPS 186-4, D.2.2 Curve P-224". */

  /* "S1 + S2" with 64-bit limbs:
   *     [0:A10]:[ A9: A8]:[ A7:0]:[0:0]
   *  +    [0:0]:[A13:A12]:[A11:0]:[0:0]
   *  => s[3]:s[2]:s[1]:s[0]
   */
  s[0] = zero;
  ADD3_LIMB64 (s[3], s[2], s[1],
               LIMB64_HILO(0, LOAD32(wp, 10)),
               LOAD64(wp, 8 / 2),
               LIMB64_HILO(LOAD32(wp, 7), 0),
               zero,
               LOAD64(wp, 12 / 2),
               LIMB64_HILO(LOAD32(wp, 11), 0));

  /* "T + S1 + S2" */
  ADD4_LIMB64 (s[3], s[2], s[1], s[0],
               s[3], s[2], s[1], s[0],
               LIMB64_HILO(0, LOAD32(wp, 6)),
               LOAD64(wp, 4 / 2),
               LOAD64(wp, 2 / 2),
               LOAD64(wp, 0 / 2));

  /* "D1 + D2" with 64-bit limbs:
   *     [0:A13]:[A12:A11]:[A10: A9]:[ A8: A7]
   *  +    [0:0]:[  0:  0]:[  0:A13]:[A12:A11]
   *  => d[3]:d[2]:d[1]:d[0]
   */
  ADD4_LIMB64 (d[3], d[2], d[1], d[0],
               LIMB64_HILO(0, LOAD32(wp, 13)),
               LOAD64_UNALIGNED(wp, 11 / 2),
               LOAD64_UNALIGNED(wp, 9 / 2),
               LOAD64_UNALIGNED(wp, 7 / 2),
               zero,
               zero,
               LIMB64_HILO(0, LOAD32(wp, 13)),
               LOAD64_UNALIGNED(wp, 11 / 2));

  /* "T + S1 + S2 - D1 - D2" */
  SUB4_LIMB64 (s[3], s[2], s[1], s[0],
               s[3], s[2], s[1], s[0],
               d[3], d[2], d[1], d[0]);

  /* mod p:
   *  Upper 32-bits of 's[3]' holds carry value (-2..2).
   *  Subtract (carry + 1) * p. Result will be within range -p...p.
   *  Handle result being negative with addition and conditional store. */

  carry = HI32_LIMB64(s[3]);

  SUB4_LIMB64 (s[3], s[2], s[1], s[0],
               s[3], s[2], s[1], s[0],
               p_mult[carry + 2][3], p_mult[carry + 2][2],
               p_mult[carry + 2][1], p_mult[carry + 2][0]);

  /* Candidate for the negative case: s + P ('d' is reused as scratch). */
  ADD4_LIMB64 (d[3], d[2], d[1], d[0],
               s[3], s[2], s[1], s[0],
               p_mult[0 + 2][3], p_mult[0 + 2][2],
               p_mult[0 + 2][1], p_mult[0 + 2][0]);

  /* Top bit of the upper half of s[3] is used as the sign. */
  s_is_negative = (HI32_LIMB64(s[3]) >> 31);

  mask2 = ct_limb_gen_mask(s_is_negative);
  mask1 = ct_limb_gen_inv_mask(s_is_negative);

  /* mask2 is paired with 'd' (s + P), mask1 with 's'. */
  STORE64_COND(wp, 0, mask2, d[0], mask1, s[0]);
  STORE64_COND(wp, 1, mask2, d[1], mask1, s[1]);
  STORE64_COND(wp, 2, mask2, d[2], mask1, s[2]);
  STORE64_COND(wp, 3, mask2, d[3], mask1, s[3]);

  w->nlimbs = wsize * LIMBS_PER_LIMB64;
  MPN_NORMALIZE (wp, w->nlimbs);
}
278 | | |
/* Reduce W in place using the P-256 fast reduction from FIPS 186-4.
 *
 * W is normalized first; if it has more than 2 * 256 bits log_bug() is
 * called.  W and CTX->p are grown (and cleared) as needed; the limb count
 * of CTX->p is saved beforehand and restored afterwards.  The result is
 * stored into the low wsize 64-bit limbs of W and W->nlimbs is
 * re-normalized.
 *
 * After subtracting (carry + 1) * P the value may need P or 2 * P added
 * back; all three candidates are computed and one is picked with masks
 * rather than with branches.  */
void
_gcry_mpi_ec_nist256_mod (gcry_mpi_t w, mpi_ec_t ctx)
{
  /* p_mult[k] is (k - 3) * P, least significant 64-bit limb first; it is
   * indexed with 'carry + 4' below, i.e. it yields (carry + 1) * P.
   * The array is declared with 12 rows but only 11 have initializers, so
   * the last row is implicitly all zero. */
  static const mpi_limb64_t p_mult[12][5] =
  {
    { /* P * -3 */
      LIMB64_C(0x00000000U, 0x00000003U), LIMB64_C(0xfffffffdU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000002U, 0xfffffffcU),
      LIMB64_C(0xffffffffU, 0xfffffffdU)
    },
    { /* P * -2 */
      LIMB64_C(0x00000000U, 0x00000002U), LIMB64_C(0xfffffffeU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000001U, 0xfffffffdU),
      LIMB64_C(0xffffffffU, 0xfffffffeU)
    },
    { /* P * -1 */
      LIMB64_C(0x00000000U, 0x00000001U), LIMB64_C(0xffffffffU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0xfffffffeU),
      LIMB64_C(0xffffffffU, 0xffffffffU)
    },
    { /* P * 0 */
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0x00000000U, 0x00000000U)
    },
    { /* P * 1 */
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0x00000000U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xffffffffU, 0x00000001U),
      LIMB64_C(0x00000000U, 0x00000000U)
    },
    { /* P * 2 */
      LIMB64_C(0xffffffffU, 0xfffffffeU), LIMB64_C(0x00000001U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffffeU, 0x00000002U),
      LIMB64_C(0x00000000U, 0x00000001U)
    },
    { /* P * 3 */
      LIMB64_C(0xffffffffU, 0xfffffffdU), LIMB64_C(0x00000002U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffffdU, 0x00000003U),
      LIMB64_C(0x00000000U, 0x00000002U)
    },
    { /* P * 4 */
      LIMB64_C(0xffffffffU, 0xfffffffcU), LIMB64_C(0x00000003U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffffcU, 0x00000004U),
      LIMB64_C(0x00000000U, 0x00000003U)
    },
    { /* P * 5 */
      LIMB64_C(0xffffffffU, 0xfffffffbU), LIMB64_C(0x00000004U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffffbU, 0x00000005U),
      LIMB64_C(0x00000000U, 0x00000004U)
    },
    { /* P * 6 */
      LIMB64_C(0xffffffffU, 0xfffffffaU), LIMB64_C(0x00000005U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffffaU, 0x00000006U),
      LIMB64_C(0x00000000U, 0x00000005U)
    },
    { /* P * 7 */
      LIMB64_C(0xffffffffU, 0xfffffff9U), LIMB64_C(0x00000006U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0xfffffff9U, 0x00000007U),
      LIMB64_C(0x00000000U, 0x00000006U)
    }
  };
  const mpi_limb64_t zero = LIMB_TO64(0);
  mpi_ptr_t wp;
  mpi_limb64_t s[(256 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64 + 1];
  mpi_limb64_t t[DIM(s)];
  mpi_limb64_t d[DIM(s)];
  mpi_limb64_t e[DIM(s)];
  const mpi_size_t wsize = DIM(s) - 1;
  mpi_size_t psize = ctx->p->nlimbs; /* Restored after the resize below. */
  mpi_limb_t mask1;
  mpi_limb_t mask2;
  mpi_limb_t mask3;
  mpi_limb_t s_is_negative;
  mpi_limb_t d_is_negative;
  int carry;

  MPN_NORMALIZE (w->d, w->nlimbs);
  if (mpi_nbits_more_than (w, 2 * 256))
    log_bug ("W must be less than m^2\n");

  RESIZE_AND_CLEAR_IF_NEEDED (w, wsize * 2 * LIMBS_PER_LIMB64);
  RESIZE_AND_CLEAR_IF_NEEDED (ctx->p, wsize * LIMBS_PER_LIMB64);
  ctx->p->nlimbs = psize;

  wp = w->d;

  prefetch (p_mult, sizeof(p_mult));

  /* See "FIPS 186-4, D.2.3 Curve P-256". */

  /* "S1 + S2" with 64-bit limbs:
   *     [A15:A14]:[A13:A12]:[A11:0]:[0:0]
   *  +    [0:A15]:[A14:A13]:[A12:0]:[0:0]
   *  => s[4]:s[3]:s[2]:s[1]:s[0]
   */
  s[0] = zero;
  ADD4_LIMB64 (s[4], s[3], s[2], s[1],
               zero,
               LOAD64(wp, 14 / 2),
               LOAD64(wp, 12 / 2),
               LIMB64_HILO(LOAD32(wp, 11), 0),
               zero,
               LIMB64_HILO(0, LOAD32(wp, 15)),
               LOAD64_UNALIGNED(wp, 13 / 2),
               LIMB64_HILO(LOAD32(wp, 12), 0));

  /* "S3 + S4" with 64-bit limbs:
   *     [A15:A14]:[  0:  0]:[  0:A10]:[ A9:A8]
   *  +   [A8:A13]:[A15:A14]:[A13:A11]:[A10:A9]
   *  => t[4]:t[3]:t[2]:t[1]:t[0]
   */
  ADD5_LIMB64 (t[4], t[3], t[2], t[1], t[0],
               zero,
               LOAD64(wp, 14 / 2),
               zero,
               LIMB64_HILO(0, LOAD32(wp, 10)),
               LOAD64(wp, 8 / 2),
               zero,
               LIMB64_HILO(LOAD32(wp, 8), LOAD32(wp, 13)),
               LOAD64(wp, 14 / 2),
               LIMB64_HILO(LOAD32(wp, 13), LOAD32(wp, 11)),
               LOAD64_UNALIGNED(wp, 9 / 2));

  /* "2*S1 + 2*S2" */
  ADD5_LIMB64 (s[4], s[3], s[2], s[1], s[0],
               s[4], s[3], s[2], s[1], s[0],
               s[4], s[3], s[2], s[1], s[0]);

  /* "T + S3 + S4" */
  ADD5_LIMB64 (t[4], t[3], t[2], t[1], t[0],
               t[4], t[3], t[2], t[1], t[0],
               zero,
               LOAD64(wp, 6 / 2),
               LOAD64(wp, 4 / 2),
               LOAD64(wp, 2 / 2),
               LOAD64(wp, 0 / 2));

  /* "2*S1 + 2*S2 - D3" with 64-bit limbs:
   *    s[4]:    s[3]:    s[2]:    s[1]:     s[0]
   *  -       [A12:0]:[A10:A9]:[A8:A15]:[A14:A13]
   *  => s[4]:s[3]:s[2]:s[1]:s[0]
   */
  SUB5_LIMB64 (s[4], s[3], s[2], s[1], s[0],
               s[4], s[3], s[2], s[1], s[0],
               zero,
               LIMB64_HILO(LOAD32(wp, 12), 0),
               LOAD64_UNALIGNED(wp, 9 / 2),
               LIMB64_HILO(LOAD32(wp, 8), LOAD32(wp, 15)),
               LOAD64_UNALIGNED(wp, 13 / 2));

  /* "T + 2*S1 + 2*S2 + S3 + S4 - D3" */
  ADD5_LIMB64 (s[4], s[3], s[2], s[1], s[0],
               s[4], s[3], s[2], s[1], s[0],
               t[4], t[3], t[2], t[1], t[0]);

  /* "D1 + D2" with 64-bit limbs:
   *     [0:A13]:[A12:A11] + [A15:A14]:[A13:A12] => d[2]:d[1]:d[0]
   *     [A10:A8] + [A11:A9] => d[4]:d[3]
   */
  ADD3_LIMB64 (d[2], d[1], d[0],
               zero,
               LIMB64_HILO(0, LOAD32(wp, 13)),
               LOAD64_UNALIGNED(wp, 11 / 2),
               zero,
               LOAD64(wp, 14 / 2),
               LOAD64(wp, 12 / 2));
  ADD2_LIMB64 (d[4], d[3],
               zero, LIMB64_HILO(LOAD32(wp, 10), LOAD32(wp, 8)),
               zero, LIMB64_HILO(LOAD32(wp, 11), LOAD32(wp, 9)));

  /* "D1 + D2 + D4" with 64-bit limbs:
   *    d[4]:    d[3]:     d[2]:  d[1]:     d[0]
   *  +       [A13:0]:[A11:A10]:[A9:0]:[A15:A14]
   *  => d[4]:d[3]:d[2]:d[1]:d[0]
   */
  ADD5_LIMB64 (d[4], d[3], d[2], d[1], d[0],
               d[4], d[3], d[2], d[1], d[0],
               zero,
               LIMB64_HILO(LOAD32(wp, 13), 0),
               LOAD64(wp, 10 / 2),
               LIMB64_HILO(LOAD32(wp, 9), 0),
               LOAD64(wp, 14 / 2));

  /* "T + 2*S1 + 2*S2 + S3 + S4 - D1 - D2 - D3 - D4" */
  SUB5_LIMB64 (s[4], s[3], s[2], s[1], s[0],
               s[4], s[3], s[2], s[1], s[0],
               d[4], d[3], d[2], d[1], d[0]);

  /* mod p:
   *  's[4]' holds carry value (-4..6). Subtract (carry + 1) * p. Result
   *  will be within range -2*p...p. Handle result being negative with
   *  addition and conditional store. */

  carry = LO32_LIMB64(s[4]);

  /* Load values to stack to ease register pressure on i386. */
  e[0] = p_mult[carry + 4][0];
  e[1] = p_mult[carry + 4][1];
  e[2] = p_mult[carry + 4][2];
  e[3] = p_mult[carry + 4][3];
  e[4] = p_mult[carry + 4][4];
  SUB5_LIMB64 (s[4], s[3], s[2], s[1], s[0],
               s[4], s[3], s[2], s[1], s[0],
               e[4], e[3], e[2], e[1], e[0]);

  /* Add 1*P */
  ADD5_LIMB64 (d[4], d[3], d[2], d[1], d[0],
               s[4], s[3], s[2], s[1], s[0],
               zero,
               p_mult[0 + 4][3], p_mult[0 + 4][2],
               p_mult[0 + 4][1], p_mult[0 + 4][0]);

  /* Add 2*P */
  ADD5_LIMB64 (e[4], e[3], e[2], e[1], e[0],
               s[4], s[3], s[2], s[1], s[0],
               zero,
               p_mult[1 + 4][3], p_mult[1 + 4][2],
               p_mult[1 + 4][1], p_mult[1 + 4][0]);

  /* Three-way select keyed on the signs of 's' and of 'd' (= s + P):
   *   mask3 picks 'e' (s + 2*P) when 'd' is negative,
   *   mask2 picks 'd' (s + P) when 's' is negative but 'd' is not,
   *   mask1 picks 's' when neither is negative. */
  s_is_negative = LO32_LIMB64(s[4]) >> 31;
  d_is_negative = LO32_LIMB64(d[4]) >> 31;
  mask3 = ct_limb_gen_mask(d_is_negative);
  mask2 = ct_limb_gen_mask(s_is_negative) & ~mask3;
  mask1 = ct_limb_gen_inv_mask(s_is_negative) & ~mask3;

  s[0] = LIMB_OR64(MASK_AND64(mask2, d[0]), MASK_AND64(mask1, s[0]));
  s[1] = LIMB_OR64(MASK_AND64(mask2, d[1]), MASK_AND64(mask1, s[1]));
  s[2] = LIMB_OR64(MASK_AND64(mask2, d[2]), MASK_AND64(mask1, s[2]));
  s[3] = LIMB_OR64(MASK_AND64(mask2, d[3]), MASK_AND64(mask1, s[3]));
  s[0] = LIMB_OR64(MASK_AND64(mask3, e[0]), s[0]);
  s[1] = LIMB_OR64(MASK_AND64(mask3, e[1]), s[1]);
  s[2] = LIMB_OR64(MASK_AND64(mask3, e[2]), s[2]);
  s[3] = LIMB_OR64(MASK_AND64(mask3, e[3]), s[3]);

  STORE64(wp, 0, s[0]);
  STORE64(wp, 1, s[1]);
  STORE64(wp, 2, s[2]);
  STORE64(wp, 3, s[3]);

  w->nlimbs = wsize * LIMBS_PER_LIMB64;
  MPN_NORMALIZE (wp, w->nlimbs);
}
521 | | |
/* Reduce W in place using the P-384 fast reduction from FIPS 186-4.
 *
 * W is normalized first; if it has more than 2 * 384 bits log_bug() is
 * called.  W and CTX->p are grown (and cleared) as needed; the limb count
 * of CTX->p is saved beforehand and restored afterwards.  The result is
 * stored into the low wsize 64-bit limbs of W and W->nlimbs is
 * re-normalized.
 *
 * The final correction selects between two candidate values with masks
 * and STORE64_COND instead of branching on the sign.  On big-endian
 * builds with native 64-bit limbs a shifted copy of the upper half of W
 * is kept in 'wp_shr32'; it is wiped before returning.  */
void
_gcry_mpi_ec_nist384_mod (gcry_mpi_t w, mpi_ec_t ctx)
{
  /* p_mult[k] is (k - 2) * P, least significant 64-bit limb first; it is
   * indexed with 'carry + 3' below, i.e. it yields (carry + 1) * P. */
  static const mpi_limb64_t p_mult[11][7] =
  {
    { /* P * -2 */
      LIMB64_C(0xfffffffeU, 0x00000002U), LIMB64_C(0x00000001U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000002U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xfffffffeU)
    },
    { /* P * -1 */
      LIMB64_C(0xffffffffU, 0x00000001U), LIMB64_C(0x00000000U, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000001U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xffffffffU)
    },
    { /* P * 0 */
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0x00000000U, 0x00000000U), LIMB64_C(0x00000000U, 0x00000000U),
      LIMB64_C(0x00000000U, 0x00000000U)
    },
    { /* P * 1 */
      LIMB64_C(0x00000000U, 0xffffffffU), LIMB64_C(0xffffffffU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xfffffffeU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000000U)
    },
    { /* P * 2 */
      LIMB64_C(0x00000001U, 0xfffffffeU), LIMB64_C(0xfffffffeU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xfffffffdU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000001U)
    },
    { /* P * 3 */
      LIMB64_C(0x00000002U, 0xfffffffdU), LIMB64_C(0xfffffffdU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xfffffffcU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000002U)
    },
    { /* P * 4 */
      LIMB64_C(0x00000003U, 0xfffffffcU), LIMB64_C(0xfffffffcU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xfffffffbU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000003U)
    },
    { /* P * 5 */
      LIMB64_C(0x00000004U, 0xfffffffbU), LIMB64_C(0xfffffffbU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xfffffffaU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000004U)
    },
    { /* P * 6 */
      LIMB64_C(0x00000005U, 0xfffffffaU), LIMB64_C(0xfffffffaU, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xfffffff9U), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000005U)
    },
    { /* P * 7 */
      LIMB64_C(0x00000006U, 0xfffffff9U), LIMB64_C(0xfffffff9U, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xfffffff8U), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000006U)
    },
    { /* P * 8 */
      LIMB64_C(0x00000007U, 0xfffffff8U), LIMB64_C(0xfffffff8U, 0x00000000U),
      LIMB64_C(0xffffffffU, 0xfffffff7U), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0xffffffffU, 0xffffffffU), LIMB64_C(0xffffffffU, 0xffffffffU),
      LIMB64_C(0x00000000U, 0x00000007U)
    },
  };
  const mpi_limb64_t zero = LIMB_TO64(0);
  mpi_ptr_t wp;
  mpi_limb64_t s[(384 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64 + 1];
  mpi_limb64_t t[DIM(s)];
  mpi_limb64_t d[DIM(s)];
  mpi_limb64_t x[DIM(s)];
#if (BITS_PER_MPI_LIMB64 == BITS_PER_MPI_LIMB) && defined(WORDS_BIGENDIAN)
  /* Upper half of W shifted right by 32 bits; see LOAD64_SHR32 below. */
  mpi_limb_t wp_shr32[(DIM(s) - 1) * LIMBS_PER_LIMB64];
#endif
  const mpi_size_t wsize = DIM(s) - 1;
  mpi_size_t psize = ctx->p->nlimbs; /* Restored after the resize below. */
  mpi_limb_t mask1;
  mpi_limb_t mask2;
  mpi_limb_t s_is_negative;
  int carry;

  MPN_NORMALIZE (w->d, w->nlimbs);
  if (mpi_nbits_more_than (w, 2 * 384))
    log_bug ("W must be less than m^2\n");

  RESIZE_AND_CLEAR_IF_NEEDED (w, wsize * 2 * LIMBS_PER_LIMB64);
  RESIZE_AND_CLEAR_IF_NEEDED (ctx->p, wsize * LIMBS_PER_LIMB64);
  ctx->p->nlimbs = psize;

  wp = w->d;

  prefetch (p_mult, sizeof(p_mult));

  /* See "FIPS 186-4, D.2.4 Curve P-384". */

  /* LOAD64_SHR32(idx) yields the 64-bit value [A(idx+1):A(idx)] for an odd
   * 32-bit word index, i.e. a 64-bit load that straddles two 64-bit limbs.
   * With native 64-bit limbs this is either an unaligned load from W or,
   * on big-endian hosts, an aligned load from the pre-shifted copy
   * 'wp_shr32'; otherwise it is assembled from two 32-bit loads. */
#if BITS_PER_MPI_LIMB64 == BITS_PER_MPI_LIMB
# ifdef WORDS_BIGENDIAN
# define LOAD64_SHR32(idx) LOAD64(wp_shr32, ((idx) / 2 - wsize))
  _gcry_mpih_rshift (wp_shr32, wp + 384 / BITS_PER_MPI_LIMB,
                     wsize * LIMBS_PER_LIMB64, 32);
# else
# define LOAD64_SHR32(idx) LOAD64_UNALIGNED(wp, idx / 2)
#endif
#else
# define LOAD64_SHR32(idx) LIMB64_HILO(LOAD32(wp, (idx) + 1), LOAD32(wp, idx))
#endif

  /* "S1 + S1" with 64-bit limbs:
   *     [0:A23]:[A22:A21]
   *  +  [0:A23]:[A22:A21]
   *  => s[3]:s[2]
   */
  ADD2_LIMB64 (s[3], s[2],
               LIMB64_HILO(0, LOAD32(wp, 23)),
               LOAD64_SHR32(21),
               LIMB64_HILO(0, LOAD32(wp, 23)),
               LOAD64_SHR32(21));

  /* "S5 + S6" with 64-bit limbs:
   *     [A23:A22]:[A21:A20]:[  0:0]:[0:  0]
   *  +  [  0:  0]:[A23:A22]:[A21:0]:[0:A20]
   *  => x[4]:x[3]:x[2]:x[1]:x[0]
   */
  x[0] = LIMB64_HILO(0, LOAD32(wp, 20));
  x[1] = LIMB64_HILO(LOAD32(wp, 21), 0);
  ADD3_LIMB64 (x[4], x[3], x[2],
               zero, LOAD64(wp, 22 / 2), LOAD64(wp, 20 / 2),
               zero, zero, LOAD64(wp, 22 / 2));

  /* "D2 + D3" with 64-bit limbs:
   *     [0:A23]:[A22:A21]:[A20:0]
   *  +  [0:A23]:[A23:0]:[0:0]
   *  => d[2]:d[1]:d[0]
   */
  d[0] = LIMB64_HILO(LOAD32(wp, 20), 0);
  ADD2_LIMB64 (d[2], d[1],
               LIMB64_HILO(0, LOAD32(wp, 23)),
               LOAD64_SHR32(21),
               LIMB64_HILO(0, LOAD32(wp, 23)),
               LIMB64_HILO(LOAD32(wp, 23), 0));

  /* "2*S1 + S5 + S6" with 64-bit limbs:
   *     s[4]:s[3]:s[2]:s[1]:s[0]
   *  +  x[4]:x[3]:x[2]:x[1]:x[0]
   *  => s[4]:s[3]:s[2]:s[1]:s[0]
   */
  s[0] = x[0];
  s[1] = x[1];
  ADD3_LIMB64(s[4], s[3], s[2],
              zero, s[3], s[2],
              x[4], x[3], x[2]);

  /* "T + S2" with 64-bit limbs:
   *     [A11:A10]:[ A9: A8]:[ A7: A6]:[ A5: A4]:[ A3: A2]:[ A1: A0]
   *  +  [A23:A22]:[A21:A20]:[A19:A18]:[A17:A16]:[A15:A14]:[A13:A12]
   *  => t[6]:t[5]:t[4]:t[3]:t[2]:t[1]:t[0]
   */
  ADD7_LIMB64 (t[6], t[5], t[4], t[3], t[2], t[1], t[0],
               zero,
               LOAD64(wp, 10 / 2), LOAD64(wp, 8 / 2), LOAD64(wp, 6 / 2),
               LOAD64(wp, 4 / 2), LOAD64(wp, 2 / 2), LOAD64(wp, 0 / 2),
               zero,
               LOAD64(wp, 22 / 2), LOAD64(wp, 20 / 2), LOAD64(wp, 18 / 2),
               LOAD64(wp, 16 / 2), LOAD64(wp, 14 / 2), LOAD64(wp, 12 / 2));

  /* "2*S1 + S4 + S5 + S6" with 64-bit limbs:
   *     s[6]:     s[5]:     s[4]:     s[3]:     s[2]:   s[1]:   s[0]
   *  +       [A19:A18]:[A17:A16]:[A15:A14]:[A13:A12]:[A20:0]:[A23:0]
   *  => s[6]:s[5]:s[4]:s[3]:s[2]:s[1]:s[0]
   */
  ADD7_LIMB64 (s[6], s[5], s[4], s[3], s[2], s[1], s[0],
               zero, zero, s[4], s[3], s[2], s[1], s[0],
               zero,
               LOAD64(wp, 18 / 2), LOAD64(wp, 16 / 2),
               LOAD64(wp, 14 / 2), LOAD64(wp, 12 / 2),
               LIMB64_HILO(LOAD32(wp, 20), 0),
               LIMB64_HILO(LOAD32(wp, 23), 0));

  /* "D1 + D2 + D3" with 64-bit limbs:
   *     d[6]:     d[5]:     d[4]:     d[3]:     d[2]:     d[1]:     d[0]
   *  +       [A22:A21]:[A20:A19]:[A18:A17]:[A16:A15]:[A14:A13]:[A12:A23]
   *  => d[6]:d[5]:d[4]:d[3]:d[2]:d[1]:d[0]
   */
  ADD7_LIMB64 (d[6], d[5], d[4], d[3], d[2], d[1], d[0],
               zero, zero, zero, zero, d[2], d[1], d[0],
               zero,
               LOAD64_SHR32(21),
               LOAD64_SHR32(19),
               LOAD64_SHR32(17),
               LOAD64_SHR32(15),
               LOAD64_SHR32(13),
               LIMB64_HILO(LOAD32(wp, 12), LOAD32(wp, 23)));

  /* "2*S1 + S3 + S4 + S5 + S6" with 64-bit limbs:
   *     s[6]:     s[5]:     s[4]:     s[3]:     s[2]:     s[1]:     s[0]
   *  +       [A20:A19]:[A18:A17]:[A16:A15]:[A14:A13]:[A12:A23]:[A22:A21]
   *  => s[6]:s[5]:s[4]:s[3]:s[2]:s[1]:s[0]
   */
  ADD7_LIMB64 (s[6], s[5], s[4], s[3], s[2], s[1], s[0],
               s[6], s[5], s[4], s[3], s[2], s[1], s[0],
               zero,
               LOAD64_SHR32(19),
               LOAD64_SHR32(17),
               LOAD64_SHR32(15),
               LOAD64_SHR32(13),
               LIMB64_HILO(LOAD32(wp, 12), LOAD32(wp, 23)),
               LOAD64_SHR32(21));

  /* "T + 2*S1 + S2 + S3 + S4 + S5 + S6" */
  ADD7_LIMB64 (s[6], s[5], s[4], s[3], s[2], s[1], s[0],
               s[6], s[5], s[4], s[3], s[2], s[1], s[0],
               t[6], t[5], t[4], t[3], t[2], t[1], t[0]);

  /* "T + 2*S1 + S2 + S3 + S4 + S5 + S6 - D1 - D2 - D3" */
  SUB7_LIMB64 (s[6], s[5], s[4], s[3], s[2], s[1], s[0],
               s[6], s[5], s[4], s[3], s[2], s[1], s[0],
               d[6], d[5], d[4], d[3], d[2], d[1], d[0]);

#undef LOAD64_SHR32

  /* mod p:
   *  's[6]' holds carry value (-3..7). Subtract (carry + 1) * p. Result
   *  will be within range -p...p. Handle result being negative with
   *  addition and conditional store. */

  carry = LO32_LIMB64(s[6]);

  /* Load values to stack to ease register pressure on i386. */
  x[0] = p_mult[carry + 3][0];
  x[1] = p_mult[carry + 3][1];
  x[2] = p_mult[carry + 3][2];
  x[3] = p_mult[carry + 3][3];
  x[4] = p_mult[carry + 3][4];
  x[5] = p_mult[carry + 3][5];
  x[6] = p_mult[carry + 3][6];
  SUB7_LIMB64 (s[6], s[5], s[4], s[3], s[2], s[1], s[0],
               s[6], s[5], s[4], s[3], s[2], s[1], s[0],
               x[6], x[5], x[4], x[3], x[2], x[1], x[0]);

  /* Candidate for the negative case: s + P ('d' is reused as scratch). */
  ADD7_LIMB64 (d[6], d[5], d[4], d[3], d[2], d[1], d[0],
               s[6], s[5], s[4], s[3], s[2], s[1], s[0],
               zero,
               p_mult[0 + 3][5], p_mult[0 + 3][4],
               p_mult[0 + 3][3], p_mult[0 + 3][2],
               p_mult[0 + 3][1], p_mult[0 + 3][0]);

  /* Bit 31 of the low half of the carry limb is used as the sign. */
  s_is_negative = LO32_LIMB64(s[6]) >> 31;
  mask2 = ct_limb_gen_mask(s_is_negative);
  mask1 = ct_limb_gen_inv_mask(s_is_negative);

  /* mask2 is paired with 'd' (s + P), mask1 with 's'. */
  STORE64_COND(wp, 0, mask2, d[0], mask1, s[0]);
  STORE64_COND(wp, 1, mask2, d[1], mask1, s[1]);
  STORE64_COND(wp, 2, mask2, d[2], mask1, s[2]);
  STORE64_COND(wp, 3, mask2, d[3], mask1, s[3]);
  STORE64_COND(wp, 4, mask2, d[4], mask1, s[4]);
  STORE64_COND(wp, 5, mask2, d[5], mask1, s[5]);

  w->nlimbs = wsize * LIMBS_PER_LIMB64;
  MPN_NORMALIZE (wp, w->nlimbs);

#if (BITS_PER_MPI_LIMB64 == BITS_PER_MPI_LIMB) && defined(WORDS_BIGENDIAN)
  wipememory(wp_shr32, sizeof(wp_shr32));
#endif
}
793 | | |
/* Reduce W in place using the P-521 fast reduction from FIPS 186-4.
 *
 * W is normalized first; if it has more than 2 * 521 bits log_bug() is
 * called.  W is grown (and cleared) to 2 * wsize native limbs, where
 * wsize is the number of limbs needed for 521 bits.  The bits of W above
 * bit 520 are shifted down and added to the low 521 bits, followed by a
 * single conditional correction with CTX->p.  The result is left in the
 * low wsize limbs of W and W->nlimbs is re-normalized.  */
void
_gcry_mpi_ec_nist521_mod (gcry_mpi_t w, mpi_ec_t ctx)
{
  mpi_limb_t s[(521 + BITS_PER_MPI_LIMB - 1) / BITS_PER_MPI_LIMB];
  const mpi_size_t wsize = DIM(s);
  mpi_limb_t cy;
  mpi_ptr_t wp;

  MPN_NORMALIZE (w->d, w->nlimbs);
  if (mpi_nbits_more_than (w, 2 * 521))
    log_bug ("W must be less than m^2\n");

  RESIZE_AND_CLEAR_IF_NEEDED (w, wsize * 2);

  wp = w->d;

  /* See "FIPS 186-4, D.2.5 Curve P-521". */

  /* s = W >> 521: shift the limbs starting at the one that contains bit
   * 521 by the remaining in-limb bit offset, then clip both the shifted
   * value and the low part of W to 521 bits before adding them. */
  _gcry_mpih_rshift (s, wp + wsize - 1, wsize, 521 % BITS_PER_MPI_LIMB);
  s[wsize - 1] &= (1 << (521 % BITS_PER_MPI_LIMB)) - 1;
  wp[wsize - 1] &= (1 << (521 % BITS_PER_MPI_LIMB)) - 1;
  _gcry_mpih_add_n (wp, wp, s, wsize);

  /* "mod p" */
  /* Subtract p unconditionally, compute the add-back into 's', and take
   * the add-back via mpih_set_cond only when the subtraction borrowed. */
  cy = _gcry_mpih_sub_n (wp, wp, ctx->p->d, wsize);
  _gcry_mpih_add_n (s, wp, ctx->p->d, wsize);
  mpih_set_cond (wp, s, wsize, mpih_limb_is_not_zero (cy));

  w->nlimbs = wsize;
  MPN_NORMALIZE (wp, w->nlimbs);
}
825 | | |
826 | | #endif /* !ASM_DISABLED */ |