/src/openssl/crypto/ec/ecp_nistp384.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. |
3 | | * |
4 | | * Licensed under the Apache License 2.0 (the "License"). You may not use |
5 | | * this file except in compliance with the License. You can obtain a copy |
6 | | * in the file LICENSE in the source distribution or at |
7 | | * https://www.openssl.org/source/license.html |
8 | | */ |
9 | | |
10 | | /* Copyright 2023 IBM Corp. |
11 | | * |
12 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
13 | | * |
14 | | * you may not use this file except in compliance with the License. |
15 | | * You may obtain a copy of the License at |
16 | | * |
17 | | * http://www.apache.org/licenses/LICENSE-2.0 |
18 | | * |
19 | | * Unless required by applicable law or agreed to in writing, software |
20 | | * distributed under the License is distributed on an "AS IS" BASIS, |
21 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
22 | | * See the License for the specific language governing permissions and |
23 | | * limitations under the License. |
24 | | */ |
25 | | |
26 | | /* |
27 | | * Designed for 56-bit limbs by Rohan McLure <rohan.mclure@linux.ibm.com>. |
28 | | * The layout is based on that of ecp_nistp{224,521}.c, allowing even for asm |
29 | | * acceleration of felem_{square,mul} as supported in these files. |
30 | | */ |
31 | | |
32 | | #include <openssl/e_os2.h> |
33 | | |
34 | | #include <string.h> |
35 | | #include <openssl/err.h> |
36 | | #include "ec_local.h" |
37 | | |
38 | | #include "internal/numbers.h" |
39 | | |
40 | | #ifndef INT128_MAX |
41 | | # error "Your compiler doesn't appear to support 128-bit integer types" |
42 | | #endif |
43 | | |
44 | | typedef uint8_t u8; |
45 | | typedef uint64_t u64; |
46 | | |
47 | | /* |
48 | | * The underlying field. P384 operates over GF(2^384-2^128-2^96+2^32-1). We |
49 | | * can serialize an element of this field into 48 bytes. We call this an |
50 | | * felem_bytearray. |
51 | | */ |
52 | | |
53 | | typedef u8 felem_bytearray[48]; |
54 | | |
55 | | /* |
56 | | * These are the parameters of P384, taken from FIPS 186-3, section D.1.2.4. |
57 | | * These values are big-endian. |
58 | | */ |
59 | | static const felem_bytearray nistp384_curve_params[5] = { |
60 | | {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* p */ |
61 | | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
62 | | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, |
63 | | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF}, |
64 | | {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* a = -3 */ |
65 | | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
66 | | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, |
67 | | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFC}, |
68 | | {0xB3, 0x31, 0x2F, 0xA7, 0xE2, 0x3E, 0xE7, 0xE4, 0x98, 0x8E, 0x05, 0x6B, /* b */ |
69 | | 0xE3, 0xF8, 0x2D, 0x19, 0x18, 0x1D, 0x9C, 0x6E, 0xFE, 0x81, 0x41, 0x12, |
70 | | 0x03, 0x14, 0x08, 0x8F, 0x50, 0x13, 0x87, 0x5A, 0xC6, 0x56, 0x39, 0x8D, |
71 | | 0x8A, 0x2E, 0xD1, 0x9D, 0x2A, 0x85, 0xC8, 0xED, 0xD3, 0xEC, 0x2A, 0xEF}, |
72 | | {0xAA, 0x87, 0xCA, 0x22, 0xBE, 0x8B, 0x05, 0x37, 0x8E, 0xB1, 0xC7, 0x1E, /* x */ |
73 | | 0xF3, 0x20, 0xAD, 0x74, 0x6E, 0x1D, 0x3B, 0x62, 0x8B, 0xA7, 0x9B, 0x98, |
74 | | 0x59, 0xF7, 0x41, 0xE0, 0x82, 0x54, 0x2A, 0x38, 0x55, 0x02, 0xF2, 0x5D, |
75 | | 0xBF, 0x55, 0x29, 0x6C, 0x3A, 0x54, 0x5E, 0x38, 0x72, 0x76, 0x0A, 0xB7}, |
76 | | {0x36, 0x17, 0xDE, 0x4A, 0x96, 0x26, 0x2C, 0x6F, 0x5D, 0x9E, 0x98, 0xBF, /* y */ |
77 | | 0x92, 0x92, 0xDC, 0x29, 0xF8, 0xF4, 0x1D, 0xBD, 0x28, 0x9A, 0x14, 0x7C, |
78 | | 0xE9, 0xDA, 0x31, 0x13, 0xB5, 0xF0, 0xB8, 0xC0, 0x0A, 0x60, 0xB1, 0xCE, |
79 | | 0x1D, 0x7E, 0x81, 0x9D, 0x7A, 0x43, 0x1D, 0x7C, 0x90, 0xEA, 0x0E, 0x5F}, |
80 | | }; |
81 | | |
82 | | /*- |
83 | | * The representation of field elements. |
84 | | * ------------------------------------ |
85 | | * |
86 | | * We represent field elements with seven values. These values are either 64 or |
87 | | * 128 bits and the field element represented is: |
88 | | * v[0]*2^0 + v[1]*2^56 + v[2]*2^112 + ... + v[6]*2^336 (mod p) |
89 | | * Each of the seven values is called a 'limb'. Since the limbs are spaced only |
90 | | * 56 bits apart, but are greater than 56 bits in length, the most significant |
91 | | * bits of each limb overlap with the least significant bits of the next |
92 | | * |
93 | | * This representation is considered to be 'redundant' in the sense that |
94 | | * intermediate values can each contain more than a 56-bit value in each limb. |
95 | | * Reduction causes all but the final limb to be reduced to contain a value less |
96 | | * than 2^56, with the final value represented allowed to be larger than 2^384, |
97 | | * inasmuch as we can be sure that arithmetic overflow remains impossible. The |
98 | | * reduced value must of course be congruent to the unreduced value. |
99 | | * |
100 | | * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a |
101 | | * 'widefelem', featuring enough bits to store the result of a multiplication |
102 | | * and even some further arithmetic without need for immediate reduction. |
103 | | */ |
104 | | |
105 | 22.0M | #define NLIMBS 7 |
106 | | |
107 | | typedef uint64_t limb; |
108 | | typedef uint128_t widelimb; |
109 | | typedef limb limb_aX __attribute((__aligned__(1))); |
110 | | typedef limb felem[NLIMBS]; |
111 | | typedef widelimb widefelem[2*NLIMBS-1]; |
112 | | |
113 | | static const limb bottom56bits = 0xffffffffffffff; |
114 | | |
115 | | /* Helper functions (de)serialising reduced field elements in little endian */ |
116 | | static void bin48_to_felem(felem out, const u8 in[48]) |
117 | 792 | { |
118 | 792 | memset(out, 0, 56); |
119 | 792 | out[0] = (*((limb *) & in[0])) & bottom56bits; |
120 | 792 | out[1] = (*((limb_aX *) & in[7])) & bottom56bits; |
121 | 792 | out[2] = (*((limb_aX *) & in[14])) & bottom56bits; |
122 | 792 | out[3] = (*((limb_aX *) & in[21])) & bottom56bits; |
123 | 792 | out[4] = (*((limb_aX *) & in[28])) & bottom56bits; |
124 | 792 | out[5] = (*((limb_aX *) & in[35])) & bottom56bits; |
125 | 792 | memmove(&out[6], &in[42], 6); |
126 | 792 | } |
127 | | |
128 | | static void felem_to_bin48(u8 out[48], const felem in) |
129 | 2.11k | { |
130 | 2.11k | memset(out, 0, 48); |
131 | 2.11k | (*((limb *) & out[0])) |= (in[0] & bottom56bits); |
132 | 2.11k | (*((limb_aX *) & out[7])) |= (in[1] & bottom56bits); |
133 | 2.11k | (*((limb_aX *) & out[14])) |= (in[2] & bottom56bits); |
134 | 2.11k | (*((limb_aX *) & out[21])) |= (in[3] & bottom56bits); |
135 | 2.11k | (*((limb_aX *) & out[28])) |= (in[4] & bottom56bits); |
136 | 2.11k | (*((limb_aX *) & out[35])) |= (in[5] & bottom56bits); |
137 | 2.11k | memmove(&out[42], &in[6], 6); |
138 | 2.11k | } |
139 | | |
140 | | /* BN_to_felem converts an OpenSSL BIGNUM into an felem */ |
141 | | static int BN_to_felem(felem out, const BIGNUM *bn) |
142 | 792 | { |
143 | 792 | felem_bytearray b_out; |
144 | 792 | int num_bytes; |
145 | | |
146 | 792 | if (BN_is_negative(bn)) { |
147 | 0 | ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE); |
148 | 0 | return 0; |
149 | 0 | } |
150 | 792 | num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out)); |
151 | 792 | if (num_bytes < 0) { |
152 | 0 | ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE); |
153 | 0 | return 0; |
154 | 0 | } |
155 | 792 | bin48_to_felem(out, b_out); |
156 | 792 | return 1; |
157 | 792 | } |
158 | | |
159 | | /* felem_to_BN converts an felem into an OpenSSL BIGNUM */ |
160 | | static BIGNUM *felem_to_BN(BIGNUM *out, const felem in) |
161 | 2.11k | { |
162 | 2.11k | felem_bytearray b_out; |
163 | | |
164 | 2.11k | felem_to_bin48(b_out, in); |
165 | 2.11k | return BN_lebin2bn(b_out, sizeof(b_out), out); |
166 | 2.11k | } |
167 | | |
168 | | /*- |
169 | | * Field operations |
170 | | * ---------------- |
171 | | */ |
172 | | |
173 | | static void felem_one(felem out) |
174 | 0 | { |
175 | 0 | out[0] = 1; |
176 | 0 | memset(&out[1], 0, sizeof(limb) * (NLIMBS-1)); |
177 | 0 | } |
178 | | |
179 | | static void felem_assign(felem out, const felem in) |
180 | 287k | { |
181 | 287k | memcpy(out, in, sizeof(felem)); |
182 | 287k | } |
183 | | |
184 | | /* felem_sum64 sets out = out + in. */ |
185 | | static void felem_sum64(felem out, const felem in) |
186 | 77.6k | { |
187 | 77.6k | unsigned int i; |
188 | | |
189 | 620k | for (i = 0; i < NLIMBS; i++) |
190 | 543k | out[i] += in[i]; |
191 | 77.6k | } |
192 | | |
193 | | /* felem_scalar sets out = in * scalar */ |
194 | | static void felem_scalar(felem out, const felem in, limb scalar) |
195 | 334k | { |
196 | 334k | unsigned int i; |
197 | | |
198 | 2.67M | for (i = 0; i < NLIMBS; i++) |
199 | 2.33M | out[i] = in[i] * scalar; |
200 | 334k | } |
201 | | |
202 | | /* felem_scalar64 sets out = out * scalar */ |
203 | | static void felem_scalar64(felem out, limb scalar) |
204 | 155k | { |
205 | 155k | unsigned int i; |
206 | | |
207 | 1.24M | for (i = 0; i < NLIMBS; i++) |
208 | 1.08M | out[i] *= scalar; |
209 | 155k | } |
210 | | |
211 | | /* felem_scalar128 sets out = out * scalar */ |
212 | | static void felem_scalar128(widefelem out, limb scalar) |
213 | 51.7k | { |
214 | 51.7k | unsigned int i; |
215 | | |
216 | 724k | for (i = 0; i < 2*NLIMBS-1; i++) |
217 | 672k | out[i] *= scalar; |
218 | 51.7k | } |
219 | | |
220 | | /*- |
221 | | * felem_neg sets |out| to |-in| |
222 | | * On entry: |
223 | | * in[i] < 2^60 - 2^29 |
224 | | * On exit: |
225 | | * out[i] < 2^60 |
226 | | */ |
227 | | static void felem_neg(felem out, const felem in) |
228 | 0 | { |
229 | | /* |
230 | | * In order to prevent underflow, we add a multiple of p before subtracting. |
231 | | * Use telescopic sums to represent 2^12 * p redundantly with each limb |
232 | | * of the form 2^60 + ... |
233 | | */ |
234 | 0 | static const limb two60m52m4 = (((limb) 1) << 60) |
235 | 0 | - (((limb) 1) << 52) |
236 | 0 | - (((limb) 1) << 4); |
237 | 0 | static const limb two60p44m12 = (((limb) 1) << 60) |
238 | 0 | + (((limb) 1) << 44) |
239 | 0 | - (((limb) 1) << 12); |
240 | 0 | static const limb two60m28m4 = (((limb) 1) << 60) |
241 | 0 | - (((limb) 1) << 28) |
242 | 0 | - (((limb) 1) << 4); |
243 | 0 | static const limb two60m4 = (((limb) 1) << 60) |
244 | 0 | - (((limb) 1) << 4); |
245 | |
|
246 | 0 | out[0] = two60p44m12 - in[0]; |
247 | 0 | out[1] = two60m52m4 - in[1]; |
248 | 0 | out[2] = two60m28m4 - in[2]; |
249 | 0 | out[3] = two60m4 - in[3]; |
250 | 0 | out[4] = two60m4 - in[4]; |
251 | 0 | out[5] = two60m4 - in[5]; |
252 | 0 | out[6] = two60m4 - in[6]; |
253 | 0 | } |
254 | | |
255 | | /*- |
256 | | * felem_diff64 subtracts |in| from |out| |
257 | | * On entry: |
258 | | * in[i] < 2^60 - 2^52 - 2^4 |
259 | | * On exit: |
260 | | * out[i] < out_orig[i] + 2^60 + 2^44 |
261 | | */ |
262 | | static void felem_diff64(felem out, const felem in) |
263 | 77.6k | { |
264 | | /* |
265 | | * In order to prevent underflow, we add a multiple of p before subtracting. |
266 | | * Use telescopic sums to represent 2^12 * p redundantly with each limb |
267 | | * of the form 2^60 + ... |
268 | | */ |
269 | | |
270 | 77.6k | static const limb two60m52m4 = (((limb) 1) << 60) |
271 | 77.6k | - (((limb) 1) << 52) |
272 | 77.6k | - (((limb) 1) << 4); |
273 | 77.6k | static const limb two60p44m12 = (((limb) 1) << 60) |
274 | 77.6k | + (((limb) 1) << 44) |
275 | 77.6k | - (((limb) 1) << 12); |
276 | 77.6k | static const limb two60m28m4 = (((limb) 1) << 60) |
277 | 77.6k | - (((limb) 1) << 28) |
278 | 77.6k | - (((limb) 1) << 4); |
279 | 77.6k | static const limb two60m4 = (((limb) 1) << 60) |
280 | 77.6k | - (((limb) 1) << 4); |
281 | | |
282 | 77.6k | out[0] += two60p44m12 - in[0]; |
283 | 77.6k | out[1] += two60m52m4 - in[1]; |
284 | 77.6k | out[2] += two60m28m4 - in[2]; |
285 | 77.6k | out[3] += two60m4 - in[3]; |
286 | 77.6k | out[4] += two60m4 - in[4]; |
287 | 77.6k | out[5] += two60m4 - in[5]; |
288 | 77.6k | out[6] += two60m4 - in[6]; |
289 | 77.6k | } |
290 | | |
291 | | /* |
292 | | * in[i] < 2^63 |
293 | | * out[i] < out_orig[i] + 2^64 + 2^48 |
294 | | */ |
295 | | static void felem_diff_128_64(widefelem out, const felem in) |
296 | 155k | { |
297 | | /* |
298 | | * In order to prevent underflow, we add a multiple of p before subtracting. |
299 | | * Use telescopic sums to represent 2^16 * p redundantly with each limb |
300 | | * of the form 2^64 + ... |
301 | | */ |
302 | | |
303 | 155k | static const widelimb two64m56m8 = (((widelimb) 1) << 64) |
304 | 155k | - (((widelimb) 1) << 56) |
305 | 155k | - (((widelimb) 1) << 8); |
306 | 155k | static const widelimb two64m32m8 = (((widelimb) 1) << 64) |
307 | 155k | - (((widelimb) 1) << 32) |
308 | 155k | - (((widelimb) 1) << 8); |
309 | 155k | static const widelimb two64m8 = (((widelimb) 1) << 64) |
310 | 155k | - (((widelimb) 1) << 8); |
311 | 155k | static const widelimb two64p48m16 = (((widelimb) 1) << 64) |
312 | 155k | + (((widelimb) 1) << 48) |
313 | 155k | - (((widelimb) 1) << 16); |
314 | 155k | unsigned int i; |
315 | | |
316 | 155k | out[0] += two64p48m16; |
317 | 155k | out[1] += two64m56m8; |
318 | 155k | out[2] += two64m32m8; |
319 | 155k | out[3] += two64m8; |
320 | 155k | out[4] += two64m8; |
321 | 155k | out[5] += two64m8; |
322 | 155k | out[6] += two64m8; |
323 | | |
324 | 1.24M | for (i = 0; i < NLIMBS; i++) |
325 | 1.08M | out[i] -= in[i]; |
326 | 155k | } |
327 | | |
328 | | /* |
329 | | * in[i] < 2^127 - 2^119 - 2^71 |
330 | | * out[i] < out_orig[i] + 2^127 + 2^111 |
331 | | */ |
332 | | static void felem_diff128(widefelem out, const widefelem in) |
333 | 51.7k | { |
334 | | /* |
335 | | * In order to prevent underflow, we add a multiple of p before subtracting. |
336 | | * Use telescopic sums to represent 2^415 * p redundantly with each limb |
337 | | * of the form 2^127 + ... |
338 | | */ |
339 | | |
340 | 51.7k | static const widelimb two127 = ((widelimb) 1) << 127; |
341 | 51.7k | static const widelimb two127m71 = (((widelimb) 1) << 127) |
342 | 51.7k | - (((widelimb) 1) << 71); |
343 | 51.7k | static const widelimb two127p111m79m71 = (((widelimb) 1) << 127) |
344 | 51.7k | + (((widelimb) 1) << 111) |
345 | 51.7k | - (((widelimb) 1) << 79) |
346 | 51.7k | - (((widelimb) 1) << 71); |
347 | 51.7k | static const widelimb two127m119m71 = (((widelimb) 1) << 127) |
348 | 51.7k | - (((widelimb) 1) << 119) |
349 | 51.7k | - (((widelimb) 1) << 71); |
350 | 51.7k | static const widelimb two127m95m71 = (((widelimb) 1) << 127) |
351 | 51.7k | - (((widelimb) 1) << 95) |
352 | 51.7k | - (((widelimb) 1) << 71); |
353 | 51.7k | unsigned int i; |
354 | | |
355 | 51.7k | out[0] += two127; |
356 | 51.7k | out[1] += two127m71; |
357 | 51.7k | out[2] += two127m71; |
358 | 51.7k | out[3] += two127m71; |
359 | 51.7k | out[4] += two127m71; |
360 | 51.7k | out[5] += two127m71; |
361 | 51.7k | out[6] += two127p111m79m71; |
362 | 51.7k | out[7] += two127m119m71; |
363 | 51.7k | out[8] += two127m95m71; |
364 | 51.7k | out[9] += two127m71; |
365 | 51.7k | out[10] += two127m71; |
366 | 51.7k | out[11] += two127m71; |
367 | 51.7k | out[12] += two127m71; |
368 | | |
369 | 724k | for (i = 0; i < 2*NLIMBS-1; i++) |
370 | 672k | out[i] -= in[i]; |
371 | 51.7k | } |
372 | | |
373 | | static void felem_square_ref(widefelem out, const felem in) |
374 | 308k | { |
375 | 308k | felem inx2; |
376 | 308k | felem_scalar(inx2, in, 2); |
377 | | |
378 | 308k | out[0] = ((uint128_t) in[0]) * in[0]; |
379 | | |
380 | 308k | out[1] = ((uint128_t) in[0]) * inx2[1]; |
381 | | |
382 | 308k | out[2] = ((uint128_t) in[0]) * inx2[2] |
383 | 308k | + ((uint128_t) in[1]) * in[1]; |
384 | | |
385 | 308k | out[3] = ((uint128_t) in[0]) * inx2[3] |
386 | 308k | + ((uint128_t) in[1]) * inx2[2]; |
387 | | |
388 | 308k | out[4] = ((uint128_t) in[0]) * inx2[4] |
389 | 308k | + ((uint128_t) in[1]) * inx2[3] |
390 | 308k | + ((uint128_t) in[2]) * in[2]; |
391 | | |
392 | 308k | out[5] = ((uint128_t) in[0]) * inx2[5] |
393 | 308k | + ((uint128_t) in[1]) * inx2[4] |
394 | 308k | + ((uint128_t) in[2]) * inx2[3]; |
395 | | |
396 | 308k | out[6] = ((uint128_t) in[0]) * inx2[6] |
397 | 308k | + ((uint128_t) in[1]) * inx2[5] |
398 | 308k | + ((uint128_t) in[2]) * inx2[4] |
399 | 308k | + ((uint128_t) in[3]) * in[3]; |
400 | | |
401 | 308k | out[7] = ((uint128_t) in[1]) * inx2[6] |
402 | 308k | + ((uint128_t) in[2]) * inx2[5] |
403 | 308k | + ((uint128_t) in[3]) * inx2[4]; |
404 | | |
405 | 308k | out[8] = ((uint128_t) in[2]) * inx2[6] |
406 | 308k | + ((uint128_t) in[3]) * inx2[5] |
407 | 308k | + ((uint128_t) in[4]) * in[4]; |
408 | | |
409 | 308k | out[9] = ((uint128_t) in[3]) * inx2[6] |
410 | 308k | + ((uint128_t) in[4]) * inx2[5]; |
411 | | |
412 | 308k | out[10] = ((uint128_t) in[4]) * inx2[6] |
413 | 308k | + ((uint128_t) in[5]) * in[5]; |
414 | | |
415 | 308k | out[11] = ((uint128_t) in[5]) * inx2[6]; |
416 | | |
417 | 308k | out[12] = ((uint128_t) in[6]) * in[6]; |
418 | 308k | } |
419 | | |
420 | | static void felem_mul_ref(widefelem out, const felem in1, const felem in2) |
421 | 289k | { |
422 | 289k | out[0] = ((uint128_t) in1[0]) * in2[0]; |
423 | | |
424 | 289k | out[1] = ((uint128_t) in1[0]) * in2[1] |
425 | 289k | + ((uint128_t) in1[1]) * in2[0]; |
426 | | |
427 | 289k | out[2] = ((uint128_t) in1[0]) * in2[2] |
428 | 289k | + ((uint128_t) in1[1]) * in2[1] |
429 | 289k | + ((uint128_t) in1[2]) * in2[0]; |
430 | | |
431 | 289k | out[3] = ((uint128_t) in1[0]) * in2[3] |
432 | 289k | + ((uint128_t) in1[1]) * in2[2] |
433 | 289k | + ((uint128_t) in1[2]) * in2[1] |
434 | 289k | + ((uint128_t) in1[3]) * in2[0]; |
435 | | |
436 | 289k | out[4] = ((uint128_t) in1[0]) * in2[4] |
437 | 289k | + ((uint128_t) in1[1]) * in2[3] |
438 | 289k | + ((uint128_t) in1[2]) * in2[2] |
439 | 289k | + ((uint128_t) in1[3]) * in2[1] |
440 | 289k | + ((uint128_t) in1[4]) * in2[0]; |
441 | | |
442 | 289k | out[5] = ((uint128_t) in1[0]) * in2[5] |
443 | 289k | + ((uint128_t) in1[1]) * in2[4] |
444 | 289k | + ((uint128_t) in1[2]) * in2[3] |
445 | 289k | + ((uint128_t) in1[3]) * in2[2] |
446 | 289k | + ((uint128_t) in1[4]) * in2[1] |
447 | 289k | + ((uint128_t) in1[5]) * in2[0]; |
448 | | |
449 | 289k | out[6] = ((uint128_t) in1[0]) * in2[6] |
450 | 289k | + ((uint128_t) in1[1]) * in2[5] |
451 | 289k | + ((uint128_t) in1[2]) * in2[4] |
452 | 289k | + ((uint128_t) in1[3]) * in2[3] |
453 | 289k | + ((uint128_t) in1[4]) * in2[2] |
454 | 289k | + ((uint128_t) in1[5]) * in2[1] |
455 | 289k | + ((uint128_t) in1[6]) * in2[0]; |
456 | | |
457 | 289k | out[7] = ((uint128_t) in1[1]) * in2[6] |
458 | 289k | + ((uint128_t) in1[2]) * in2[5] |
459 | 289k | + ((uint128_t) in1[3]) * in2[4] |
460 | 289k | + ((uint128_t) in1[4]) * in2[3] |
461 | 289k | + ((uint128_t) in1[5]) * in2[2] |
462 | 289k | + ((uint128_t) in1[6]) * in2[1]; |
463 | | |
464 | 289k | out[8] = ((uint128_t) in1[2]) * in2[6] |
465 | 289k | + ((uint128_t) in1[3]) * in2[5] |
466 | 289k | + ((uint128_t) in1[4]) * in2[4] |
467 | 289k | + ((uint128_t) in1[5]) * in2[3] |
468 | 289k | + ((uint128_t) in1[6]) * in2[2]; |
469 | | |
470 | 289k | out[9] = ((uint128_t) in1[3]) * in2[6] |
471 | 289k | + ((uint128_t) in1[4]) * in2[5] |
472 | 289k | + ((uint128_t) in1[5]) * in2[4] |
473 | 289k | + ((uint128_t) in1[6]) * in2[3]; |
474 | | |
475 | 289k | out[10] = ((uint128_t) in1[4]) * in2[6] |
476 | 289k | + ((uint128_t) in1[5]) * in2[5] |
477 | 289k | + ((uint128_t) in1[6]) * in2[4]; |
478 | | |
479 | 289k | out[11] = ((uint128_t) in1[5]) * in2[6] |
480 | 289k | + ((uint128_t) in1[6]) * in2[5]; |
481 | | |
482 | 289k | out[12] = ((uint128_t) in1[6]) * in2[6]; |
483 | 289k | } |
484 | | |
485 | | /*- |
486 | | * Reduce thirteen 128-bit coefficients to seven 64-bit coefficients. |
487 | | * in[i] < 2^128 - 2^125 |
488 | | * out[i] < 2^56 for i < 6, |
489 | | * out[6] <= 2^48 |
490 | | * |
491 | | * The technique in use here stems from the format of the prime modulus: |
492 | | * P384 = 2^384 - delta |
493 | | * |
494 | | * Thus we can reduce numbers of the form (X + 2^384 * Y) by substituting |
495 | | * them with (X + delta Y), with delta = 2^128 + 2^96 + (-2^32 + 1). These |
496 | | * coefficients are still quite large, and so we repeatedly apply this |
497 | | * technique on high-order bits in order to guarantee the desired bounds on |
498 | | * the size of our output. |
499 | | * |
500 | | * The three phases of elimination are as follows: |
501 | | * [1]: Y = 2^120 (in[12] | in[11] | in[10] | in[9]) |
502 | | * [2]: Y = 2^8 (acc[8] | acc[7]) |
503 | | * [3]: Y = 2^48 (acc[6] >> 48) |
504 | | * (Where a | b | c | d = (2^56)^3 a + (2^56)^2 b + (2^56) c + d) |
505 | | */ |
506 | | static void felem_reduce(felem out, const widefelem in) |
507 | 545k | { |
508 | | /* |
509 | | * In order to prevent underflow, we add a multiple of p before subtracting. |
510 | | * Use telescopic sums to represent 2^76 * p redundantly with each limb |
511 | | * of the form 2^124 + ... |
512 | | */ |
513 | 545k | static const widelimb two124m68 = (((widelimb) 1) << 124) |
514 | 545k | - (((widelimb) 1) << 68); |
515 | 545k | static const widelimb two124m116m68 = (((widelimb) 1) << 124) |
516 | 545k | - (((widelimb) 1) << 116) |
517 | 545k | - (((widelimb) 1) << 68); |
518 | 545k | static const widelimb two124p108m76 = (((widelimb) 1) << 124) |
519 | 545k | + (((widelimb) 1) << 108) |
520 | 545k | - (((widelimb) 1) << 76); |
521 | 545k | static const widelimb two124m92m68 = (((widelimb) 1) << 124) |
522 | 545k | - (((widelimb) 1) << 92) |
523 | 545k | - (((widelimb) 1) << 68); |
524 | 545k | widelimb temp, acc[9]; |
525 | 545k | unsigned int i; |
526 | | |
527 | 545k | memcpy(acc, in, sizeof(widelimb) * 9); |
528 | | |
529 | 545k | acc[0] += two124p108m76; |
530 | 545k | acc[1] += two124m116m68; |
531 | 545k | acc[2] += two124m92m68; |
532 | 545k | acc[3] += two124m68; |
533 | 545k | acc[4] += two124m68; |
534 | 545k | acc[5] += two124m68; |
535 | 545k | acc[6] += two124m68; |
536 | | |
537 | | /* [1]: Eliminate in[9], ..., in[12] */ |
538 | 545k | acc[8] += in[12] >> 32; |
539 | 545k | acc[7] += (in[12] & 0xffffffff) << 24; |
540 | 545k | acc[7] += in[12] >> 8; |
541 | 545k | acc[6] += (in[12] & 0xff) << 48; |
542 | 545k | acc[6] -= in[12] >> 16; |
543 | 545k | acc[5] -= (in[12] & 0xffff) << 40; |
544 | 545k | acc[6] += in[12] >> 48; |
545 | 545k | acc[5] += (in[12] & 0xffffffffffff) << 8; |
546 | | |
547 | 545k | acc[7] += in[11] >> 32; |
548 | 545k | acc[6] += (in[11] & 0xffffffff) << 24; |
549 | 545k | acc[6] += in[11] >> 8; |
550 | 545k | acc[5] += (in[11] & 0xff) << 48; |
551 | 545k | acc[5] -= in[11] >> 16; |
552 | 545k | acc[4] -= (in[11] & 0xffff) << 40; |
553 | 545k | acc[5] += in[11] >> 48; |
554 | 545k | acc[4] += (in[11] & 0xffffffffffff) << 8; |
555 | | |
556 | 545k | acc[6] += in[10] >> 32; |
557 | 545k | acc[5] += (in[10] & 0xffffffff) << 24; |
558 | 545k | acc[5] += in[10] >> 8; |
559 | 545k | acc[4] += (in[10] & 0xff) << 48; |
560 | 545k | acc[4] -= in[10] >> 16; |
561 | 545k | acc[3] -= (in[10] & 0xffff) << 40; |
562 | 545k | acc[4] += in[10] >> 48; |
563 | 545k | acc[3] += (in[10] & 0xffffffffffff) << 8; |
564 | | |
565 | 545k | acc[5] += in[9] >> 32; |
566 | 545k | acc[4] += (in[9] & 0xffffffff) << 24; |
567 | 545k | acc[4] += in[9] >> 8; |
568 | 545k | acc[3] += (in[9] & 0xff) << 48; |
569 | 545k | acc[3] -= in[9] >> 16; |
570 | 545k | acc[2] -= (in[9] & 0xffff) << 40; |
571 | 545k | acc[3] += in[9] >> 48; |
572 | 545k | acc[2] += (in[9] & 0xffffffffffff) << 8; |
573 | | |
574 | | /* |
575 | | * [2]: Eliminate acc[7], acc[8], that is the 7 and eighth limbs, as |
576 | | * well as the contributions made from eliminating higher limbs. |
577 | | * acc[7] < in[7] + 2^120 + 2^56 < in[7] + 2^121 |
578 | | * acc[8] < in[8] + 2^96 |
579 | | */ |
580 | 545k | acc[4] += acc[8] >> 32; |
581 | 545k | acc[3] += (acc[8] & 0xffffffff) << 24; |
582 | 545k | acc[3] += acc[8] >> 8; |
583 | 545k | acc[2] += (acc[8] & 0xff) << 48; |
584 | 545k | acc[2] -= acc[8] >> 16; |
585 | 545k | acc[1] -= (acc[8] & 0xffff) << 40; |
586 | 545k | acc[2] += acc[8] >> 48; |
587 | 545k | acc[1] += (acc[8] & 0xffffffffffff) << 8; |
588 | | |
589 | 545k | acc[3] += acc[7] >> 32; |
590 | 545k | acc[2] += (acc[7] & 0xffffffff) << 24; |
591 | 545k | acc[2] += acc[7] >> 8; |
592 | 545k | acc[1] += (acc[7] & 0xff) << 48; |
593 | 545k | acc[1] -= acc[7] >> 16; |
594 | 545k | acc[0] -= (acc[7] & 0xffff) << 40; |
595 | 545k | acc[1] += acc[7] >> 48; |
596 | 545k | acc[0] += (acc[7] & 0xffffffffffff) << 8; |
597 | | |
598 | | /*- |
599 | | * acc[k] < in[k] + 2^124 + 2^121 |
600 | | * < in[k] + 2^125 |
601 | | * < 2^128, for k <= 6 |
602 | | */ |
603 | | |
604 | | /* |
605 | | * Carry 4 -> 5 -> 6 |
606 | | * This has the effect of ensuring that these more significant limbs |
607 | | * will be small in value after eliminating high bits from acc[6]. |
608 | | */ |
609 | 545k | acc[5] += acc[4] >> 56; |
610 | 545k | acc[4] &= 0x00ffffffffffffff; |
611 | | |
612 | 545k | acc[6] += acc[5] >> 56; |
613 | 545k | acc[5] &= 0x00ffffffffffffff; |
614 | | |
615 | | /*- |
616 | | * acc[6] < in[6] + 2^124 + 2^121 + 2^72 + 2^16 |
617 | | * < in[6] + 2^125 |
618 | | * < 2^128 |
619 | | */ |
620 | | |
621 | | /* [3]: Eliminate high bits of acc[6] */ |
622 | 545k | temp = acc[6] >> 48; |
623 | 545k | acc[6] &= 0x0000ffffffffffff; |
624 | | |
625 | | /* temp < 2^80 */ |
626 | | |
627 | 545k | acc[3] += temp >> 40; |
628 | 545k | acc[2] += (temp & 0xffffffffff) << 16; |
629 | 545k | acc[2] += temp >> 16; |
630 | 545k | acc[1] += (temp & 0xffff) << 40; |
631 | 545k | acc[1] -= temp >> 24; |
632 | 545k | acc[0] -= (temp & 0xffffff) << 32; |
633 | 545k | acc[0] += temp; |
634 | | |
635 | | /*- |
636 | | * acc[k] < acc_old[k] + 2^64 + 2^56 |
637 | | * < in[k] + 2^124 + 2^121 + 2^72 + 2^64 + 2^56 + 2^16 , k < 4 |
638 | | */ |
639 | | |
640 | | /* Carry 0 -> 1 -> 2 -> 3 -> 4 -> 5 -> 6 */ |
641 | 545k | acc[1] += acc[0] >> 56; /* acc[1] < acc_old[1] + 2^72 */ |
642 | 545k | acc[0] &= 0x00ffffffffffffff; |
643 | | |
644 | 545k | acc[2] += acc[1] >> 56; /* acc[2] < acc_old[2] + 2^72 + 2^16 */ |
645 | 545k | acc[1] &= 0x00ffffffffffffff; |
646 | | |
647 | 545k | acc[3] += acc[2] >> 56; /* acc[3] < acc_old[3] + 2^72 + 2^16 */ |
648 | 545k | acc[2] &= 0x00ffffffffffffff; |
649 | | |
650 | | /*- |
651 | | * acc[k] < acc_old[k] + 2^72 + 2^16 |
652 | | * < in[k] + 2^124 + 2^121 + 2^73 + 2^64 + 2^56 + 2^17 |
653 | | * < in[k] + 2^125 |
654 | | * < 2^128 , k < 4 |
655 | | */ |
656 | | |
657 | 545k | acc[4] += acc[3] >> 56; /*- |
658 | | * acc[4] < acc_old[4] + 2^72 + 2^16 |
659 | | * < 2^72 + 2^56 + 2^16 |
660 | | */ |
661 | 545k | acc[3] &= 0x00ffffffffffffff; |
662 | | |
663 | 545k | acc[5] += acc[4] >> 56; /*- |
664 | | * acc[5] < acc_old[5] + 2^16 + 1 |
665 | | * < 2^56 + 2^16 + 1 |
666 | | */ |
667 | 545k | acc[4] &= 0x00ffffffffffffff; |
668 | | |
669 | 545k | acc[6] += acc[5] >> 56; /* acc[6] < 2^48 + 1 <= 2^48 */ |
670 | 545k | acc[5] &= 0x00ffffffffffffff; |
671 | | |
672 | 4.36M | for (i = 0; i < NLIMBS; i++) |
673 | 3.81M | out[i] = acc[i]; |
674 | 545k | } |
675 | | |
676 | | #if defined(ECP_NISTP384_ASM) |
677 | | static void felem_square_wrapper(widefelem out, const felem in); |
678 | | static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2); |
679 | | |
680 | | static void (*felem_square_p)(widefelem out, const felem in) = |
681 | | felem_square_wrapper; |
682 | | static void (*felem_mul_p)(widefelem out, const felem in1, const felem in2) = |
683 | | felem_mul_wrapper; |
684 | | |
685 | | void p384_felem_square(widefelem out, const felem in); |
686 | | void p384_felem_mul(widefelem out, const felem in1, const felem in2); |
687 | | |
688 | | # if defined(_ARCH_PPC64) |
689 | | # include "crypto/ppc_arch.h" |
690 | | # endif |
691 | | |
692 | | static void felem_select(void) |
693 | | { |
694 | | # if defined(_ARCH_PPC64) |
695 | | if ((OPENSSL_ppccap_P & PPC_MADD300) && (OPENSSL_ppccap_P & PPC_ALTIVEC)) { |
696 | | felem_square_p = p384_felem_square; |
697 | | felem_mul_p = p384_felem_mul; |
698 | | |
699 | | return; |
700 | | } |
701 | | # endif |
702 | | |
703 | | /* Default */ |
704 | | felem_square_p = felem_square_ref; |
705 | | felem_mul_p = felem_mul_ref; |
706 | | } |
707 | | |
708 | | static void felem_square_wrapper(widefelem out, const felem in) |
709 | | { |
710 | | felem_select(); |
711 | | felem_square_p(out, in); |
712 | | } |
713 | | |
714 | | static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2) |
715 | | { |
716 | | felem_select(); |
717 | | felem_mul_p(out, in1, in2); |
718 | | } |
719 | | |
720 | | # define felem_square felem_square_p |
721 | | # define felem_mul felem_mul_p |
722 | | #else |
723 | 308k | # define felem_square felem_square_ref |
724 | 289k | # define felem_mul felem_mul_ref |
725 | | #endif |
726 | | |
727 | | static ossl_inline void felem_square_reduce(felem out, const felem in) |
728 | 204k | { |
729 | 204k | widefelem tmp; |
730 | | |
731 | 204k | felem_square(tmp, in); |
732 | 204k | felem_reduce(out, tmp); |
733 | 204k | } |
734 | | |
735 | | static ossl_inline void felem_mul_reduce(felem out, const felem in1, const felem in2) |
736 | 158k | { |
737 | 158k | widefelem tmp; |
738 | | |
739 | 158k | felem_mul(tmp, in1, in2); |
740 | 158k | felem_reduce(out, tmp); |
741 | 158k | } |
742 | | |
743 | | /*- |
744 | | * felem_inv calculates |out| = |in|^{-1} |
745 | | * |
746 | | * Based on Fermat's Little Theorem: |
747 | | * a^p = a (mod p) |
748 | | * a^{p-1} = 1 (mod p) |
749 | | * a^{p-2} = a^{-1} (mod p) |
750 | | */ |
751 | | static void felem_inv(felem out, const felem in) |
752 | 264 | { |
753 | 264 | felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6; |
754 | 264 | unsigned int i = 0; |
755 | | |
756 | 264 | felem_square_reduce(ftmp, in); /* 2^1 */ |
757 | 264 | felem_mul_reduce(ftmp, ftmp, in); /* 2^1 + 2^0 */ |
758 | 264 | felem_assign(ftmp2, ftmp); |
759 | | |
760 | 264 | felem_square_reduce(ftmp, ftmp); /* 2^2 + 2^1 */ |
761 | 264 | felem_mul_reduce(ftmp, ftmp, in); /* 2^2 + 2^1 * 2^0 */ |
762 | 264 | felem_assign(ftmp3, ftmp); |
763 | | |
764 | 1.05k | for (i = 0; i < 3; i++) |
765 | 792 | felem_square_reduce(ftmp, ftmp); /* 2^5 + 2^4 + 2^3 */ |
766 | 264 | felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^5 + 2^4 + 2^3 + 2^2 + 2^1 + 2^0 */ |
767 | 264 | felem_assign(ftmp4, ftmp); |
768 | | |
769 | 1.84k | for (i = 0; i < 6; i++) |
770 | 1.58k | felem_square_reduce(ftmp, ftmp); /* 2^11 + ... + 2^6 */ |
771 | 264 | felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^11 + ... + 2^0 */ |
772 | | |
773 | 1.05k | for (i = 0; i < 3; i++) |
774 | 792 | felem_square_reduce(ftmp, ftmp); /* 2^14 + ... + 2^3 */ |
775 | 264 | felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^14 + ... + 2^0 */ |
776 | 264 | felem_assign(ftmp5, ftmp); |
777 | | |
778 | 4.22k | for (i = 0; i < 15; i++) |
779 | 3.96k | felem_square_reduce(ftmp, ftmp); /* 2^29 + ... + 2^15 */ |
780 | 264 | felem_mul_reduce(ftmp, ftmp5, ftmp); /* 2^29 + ... + 2^0 */ |
781 | 264 | felem_assign(ftmp6, ftmp); |
782 | | |
783 | 8.18k | for (i = 0; i < 30; i++) |
784 | 7.92k | felem_square_reduce(ftmp, ftmp); /* 2^59 + ... + 2^30 */ |
785 | 264 | felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^59 + ... + 2^0 */ |
786 | 264 | felem_assign(ftmp4, ftmp); |
787 | | |
788 | 16.1k | for (i = 0; i < 60; i++) |
789 | 15.8k | felem_square_reduce(ftmp, ftmp); /* 2^119 + ... + 2^60 */ |
790 | 264 | felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^119 + ... + 2^0 */ |
791 | 264 | felem_assign(ftmp4, ftmp); |
792 | | |
793 | 31.9k | for (i = 0; i < 120; i++) |
794 | 31.6k | felem_square_reduce(ftmp, ftmp); /* 2^239 + ... + 2^120 */ |
795 | 264 | felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^239 + ... + 2^0 */ |
796 | | |
797 | 4.22k | for (i = 0; i < 15; i++) |
798 | 3.96k | felem_square_reduce(ftmp, ftmp); /* 2^254 + ... + 2^15 */ |
799 | 264 | felem_mul_reduce(ftmp, ftmp5, ftmp); /* 2^254 + ... + 2^0 */ |
800 | | |
801 | 8.44k | for (i = 0; i < 31; i++) |
802 | 8.18k | felem_square_reduce(ftmp, ftmp); /* 2^285 + ... + 2^31 */ |
803 | 264 | felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^285 + ... + 2^31 + 2^29 + ... + 2^0 */ |
804 | | |
805 | 792 | for (i = 0; i < 2; i++) |
806 | 528 | felem_square_reduce(ftmp, ftmp); /* 2^287 + ... + 2^33 + 2^31 + ... + 2^2 */ |
807 | 264 | felem_mul_reduce(ftmp, ftmp2, ftmp); /* 2^287 + ... + 2^33 + 2^31 + ... + 2^0 */ |
808 | | |
809 | 25.0k | for (i = 0; i < 94; i++) |
810 | 24.8k | felem_square_reduce(ftmp, ftmp); /* 2^381 + ... + 2^127 + 2^125 + ... + 2^94 */ |
811 | 264 | felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^381 + ... + 2^127 + 2^125 + ... + 2^94 + 2^29 + ... + 2^0 */ |
812 | | |
813 | 792 | for (i = 0; i < 2; i++) |
814 | 528 | felem_square_reduce(ftmp, ftmp); /* 2^383 + ... + 2^129 + 2^127 + ... + 2^96 + 2^31 + ... + 2^2 */ |
815 | 264 | felem_mul_reduce(ftmp, in, ftmp); /* 2^383 + ... + 2^129 + 2^127 + ... + 2^96 + 2^31 + ... + 2^2 + 2^0 */ |
816 | | |
817 | 264 | memcpy(out, ftmp, sizeof(felem)); |
818 | 264 | } |
819 | | |
820 | | /* |
821 | | * Zero-check: returns a limb with all bits set if |in| == 0 (mod p) |
822 | | * and 0 otherwise. We know that field elements are reduced to |
823 | | * 0 < in < 2p, so we only need to check two cases: |
824 | | * 0 and 2^384 - 2^128 - 2^96 + 2^32 - 1 |
825 | | * in[k] < 2^56, k < 6 |
826 | | * in[6] <= 2^48 |
827 | | */ |
828 | | static limb felem_is_zero(const felem in) |
829 | 103k | { |
830 | 103k | limb zero, p384; |
831 | | |
832 | 103k | zero = in[0] | in[1] | in[2] | in[3] | in[4] | in[5] | in[6]; |
833 | 103k | zero = ((int64_t) (zero) - 1) >> 63; |
834 | 103k | p384 = (in[0] ^ 0x000000ffffffff) | (in[1] ^ 0xffff0000000000) |
835 | 103k | | (in[2] ^ 0xfffffffffeffff) | (in[3] ^ 0xffffffffffffff) |
836 | 103k | | (in[4] ^ 0xffffffffffffff) | (in[5] ^ 0xffffffffffffff) |
837 | 103k | | (in[6] ^ 0xffffffffffff); |
838 | 103k | p384 = ((int64_t) (p384) - 1) >> 63; |
839 | | |
840 | 103k | return (zero | p384); |
841 | 103k | } |
842 | | |
843 | | static int felem_is_zero_int(const void *in) |
844 | 0 | { |
845 | 0 | return (int)(felem_is_zero(in) & ((limb) 1)); |
846 | 0 | } |
847 | | |
848 | | /*- |
849 | | * felem_contract converts |in| to its unique, minimal representation. |
850 | | * Assume we've removed all redundant bits. |
851 | | * On entry: |
852 | | * in[k] < 2^56, k < 6 |
853 | | * in[6] <= 2^48 |
854 | | */ |
855 | | static void felem_contract(felem out, const felem in) |
856 | 1.32k | { |
857 | 1.32k | static const int64_t two56 = ((limb) 1) << 56; |
858 | | |
859 | | /* |
860 | | * We know for a fact that 0 <= |in| < 2*p, for p = 2^384 - 2^128 - 2^96 + 2^32 - 1 |
861 | | * Perform two successive, idempotent subtractions to reduce if |in| >= p. |
862 | | */ |
863 | | |
864 | 1.32k | int64_t tmp[NLIMBS], cond[5], a; |
865 | 1.32k | unsigned int i; |
866 | | |
867 | 1.32k | memcpy(tmp, in, sizeof(felem)); |
868 | | |
869 | | /* Case 1: a = 1 iff |in| >= 2^384 */ |
870 | 1.32k | a = (in[6] >> 48); |
871 | 1.32k | tmp[0] += a; |
872 | 1.32k | tmp[0] -= a << 32; |
873 | 1.32k | tmp[1] += a << 40; |
874 | 1.32k | tmp[2] += a << 16; |
875 | 1.32k | tmp[6] &= 0x0000ffffffffffff; |
876 | | |
877 | | /* |
878 | | * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be |
879 | | * non-zero, so we only need one step |
880 | | */ |
881 | | |
882 | 1.32k | a = tmp[0] >> 63; |
883 | 1.32k | tmp[0] += a & two56; |
884 | 1.32k | tmp[1] -= a & 1; |
885 | | |
886 | | /* Carry 1 -> 2 -> 3 -> 4 -> 5 -> 6 */ |
887 | 1.32k | tmp[2] += tmp[1] >> 56; |
888 | 1.32k | tmp[1] &= 0x00ffffffffffffff; |
889 | | |
890 | 1.32k | tmp[3] += tmp[2] >> 56; |
891 | 1.32k | tmp[2] &= 0x00ffffffffffffff; |
892 | | |
893 | 1.32k | tmp[4] += tmp[3] >> 56; |
894 | 1.32k | tmp[3] &= 0x00ffffffffffffff; |
895 | | |
896 | 1.32k | tmp[5] += tmp[4] >> 56; |
897 | 1.32k | tmp[4] &= 0x00ffffffffffffff; |
898 | | |
899 | 1.32k | tmp[6] += tmp[5] >> 56; /* tmp[6] < 2^48 */ |
900 | 1.32k | tmp[5] &= 0x00ffffffffffffff; |
901 | | |
902 | | /* |
903 | | * Case 2: a = all ones if p <= |in| < 2^384, 0 otherwise |
904 | | */ |
905 | | |
906 | | /* 0 iff (2^129..2^383) are all one */ |
907 | 1.32k | cond[0] = ((tmp[6] | 0xff000000000000) & tmp[5] & tmp[4] & tmp[3] & (tmp[2] | 0x0000000001ffff)) + 1; |
908 | | /* 0 iff 2^128 bit is one */ |
909 | 1.32k | cond[1] = (tmp[2] | ~0x00000000010000) + 1; |
910 | | /* 0 iff (2^96..2^127) bits are all one */ |
911 | 1.32k | cond[2] = ((tmp[2] | 0xffffffffff0000) & (tmp[1] | 0x0000ffffffffff)) + 1; |
912 | | /* 0 iff (2^32..2^95) bits are all zero */ |
913 | 1.32k | cond[3] = (tmp[1] & ~0xffff0000000000) | (tmp[0] & ~((int64_t) 0x000000ffffffff)); |
914 | | /* 0 iff (2^0..2^31) bits are all one */ |
915 | 1.32k | cond[4] = (tmp[0] | 0xffffff00000000) + 1; |
916 | | |
917 | | /* |
918 | | * In effect, invert our conditions, so that 0 values become all 1's, |
919 | | * any non-zero value in the low-order 56 bits becomes all 0's |
920 | | */ |
921 | 7.92k | for (i = 0; i < 5; i++) |
922 | 6.60k | cond[i] = ((cond[i] & 0x00ffffffffffffff) - 1) >> 63; |
923 | | |
924 | | /* |
925 | | * The condition for determining whether in is greater than our |
926 | | * prime is given by the following condition. |
927 | | */ |
928 | | |
929 | | /* First subtract 2^384 - 2^129 cheaply */ |
930 | 1.32k | a = cond[0] & (cond[1] | (cond[2] & (~cond[3] | cond[4]))); |
931 | 1.32k | tmp[6] &= ~a; |
932 | 1.32k | tmp[5] &= ~a; |
933 | 1.32k | tmp[4] &= ~a; |
934 | 1.32k | tmp[3] &= ~a; |
935 | 1.32k | tmp[2] &= ~a | 0x0000000001ffff; |
936 | | |
937 | | /* |
938 | | * Subtract 2^128 - 2^96 by |
939 | | * means of disjoint cases. |
940 | | */ |
941 | | |
942 | | /* subtract 2^128 if that bit is present, and add 2^96 */ |
943 | 1.32k | a = cond[0] & cond[1]; |
944 | 1.32k | tmp[2] &= ~a | 0xfffffffffeffff; |
945 | 1.32k | tmp[1] += a & ((int64_t) 1 << 40); |
946 | | |
947 | | /* otherwise, clear bits 2^127 .. 2^96 */ |
948 | 1.32k | a = cond[0] & ~cond[1] & (cond[2] & (~cond[3] | cond[4])); |
949 | 1.32k | tmp[2] &= ~a | 0xffffffffff0000; |
950 | 1.32k | tmp[1] &= ~a | 0x0000ffffffffff; |
951 | | |
952 | | /* finally, subtract the last 2^32 - 1 */ |
953 | 1.32k | a = cond[0] & (cond[1] | (cond[2] & (~cond[3] | cond[4]))); |
954 | 1.32k | tmp[0] += a & (-((int64_t) 1 << 32) + 1); |
955 | | |
956 | | /* |
957 | | * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be |
958 | | * non-zero, so we only need one step |
959 | | */ |
960 | 1.32k | a = tmp[0] >> 63; |
961 | 1.32k | tmp[0] += a & two56; |
962 | 1.32k | tmp[1] -= a & 1; |
963 | | |
964 | | /* Carry 1 -> 2 -> 3 -> 4 -> 5 -> 6 */ |
965 | 1.32k | tmp[2] += tmp[1] >> 56; |
966 | 1.32k | tmp[1] &= 0x00ffffffffffffff; |
967 | | |
968 | 1.32k | tmp[3] += tmp[2] >> 56; |
969 | 1.32k | tmp[2] &= 0x00ffffffffffffff; |
970 | | |
971 | 1.32k | tmp[4] += tmp[3] >> 56; |
972 | 1.32k | tmp[3] &= 0x00ffffffffffffff; |
973 | | |
974 | 1.32k | tmp[5] += tmp[4] >> 56; |
975 | 1.32k | tmp[4] &= 0x00ffffffffffffff; |
976 | | |
977 | 1.32k | tmp[6] += tmp[5] >> 56; |
978 | 1.32k | tmp[5] &= 0x00ffffffffffffff; |
979 | | |
980 | 1.32k | memcpy(out, tmp, sizeof(felem)); |
981 | 1.32k | } |
982 | | |
983 | | /*- |
984 | | * Group operations |
985 | | * ---------------- |
986 | | * |
987 | | * Building on top of the field operations we have the operations on the |
988 | | * elliptic curve group itself. Points on the curve are represented in Jacobian |
989 | | * coordinates |
990 | | */ |
991 | | |
992 | | /*- |
993 | | * point_double calculates 2*(x_in, y_in, z_in) |
994 | | * |
995 | | * The method is taken from: |
996 | | * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b |
997 | | * |
998 | | * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed. |
999 | | * while x_out == y_in is not (maybe this works, but it's not tested). |
1000 | | */ |
1001 | | static void |
1002 | | point_double(felem x_out, felem y_out, felem z_out, |
1003 | | const felem x_in, const felem y_in, const felem z_in) |
1004 | 25.8k | { |
1005 | 25.8k | widefelem tmp, tmp2; |
1006 | 25.8k | felem delta, gamma, beta, alpha, ftmp, ftmp2; |
1007 | | |
1008 | 25.8k | felem_assign(ftmp, x_in); |
1009 | 25.8k | felem_assign(ftmp2, x_in); |
1010 | | |
1011 | | /* delta = z^2 */ |
1012 | 25.8k | felem_square_reduce(delta, z_in); /* delta[i] < 2^56 */ |
1013 | | |
1014 | | /* gamma = y^2 */ |
1015 | 25.8k | felem_square_reduce(gamma, y_in); /* gamma[i] < 2^56 */ |
1016 | | |
1017 | | /* beta = x*gamma */ |
1018 | 25.8k | felem_mul_reduce(beta, x_in, gamma); /* beta[i] < 2^56 */ |
1019 | | |
1020 | | /* alpha = 3*(x-delta)*(x+delta) */ |
1021 | 25.8k | felem_diff64(ftmp, delta); /* ftmp[i] < 2^60 + 2^58 + 2^44 */ |
1022 | 25.8k | felem_sum64(ftmp2, delta); /* ftmp2[i] < 2^59 */ |
1023 | 25.8k | felem_scalar64(ftmp2, 3); /* ftmp2[i] < 2^61 */ |
1024 | 25.8k | felem_mul_reduce(alpha, ftmp, ftmp2); /* alpha[i] < 2^56 */ |
1025 | | |
1026 | | /* x' = alpha^2 - 8*beta */ |
1027 | 25.8k | felem_square(tmp, alpha); /* tmp[i] < 2^115 */ |
1028 | 25.8k | felem_assign(ftmp, beta); /* ftmp[i] < 2^56 */ |
1029 | 25.8k | felem_scalar64(ftmp, 8); /* ftmp[i] < 2^59 */ |
1030 | 25.8k | felem_diff_128_64(tmp, ftmp); /* tmp[i] < 2^115 + 2^64 + 2^48 */ |
1031 | 25.8k | felem_reduce(x_out, tmp); /* x_out[i] < 2^56 */ |
1032 | | |
1033 | | /* z' = (y + z)^2 - gamma - delta */ |
1034 | 25.8k | felem_sum64(delta, gamma); /* delta[i] < 2^57 */ |
1035 | 25.8k | felem_assign(ftmp, y_in); /* ftmp[i] < 2^56 */ |
1036 | 25.8k | felem_sum64(ftmp, z_in); /* ftmp[i] < 2^56 */ |
1037 | 25.8k | felem_square(tmp, ftmp); /* tmp[i] < 2^115 */ |
1038 | 25.8k | felem_diff_128_64(tmp, delta); /* tmp[i] < 2^115 + 2^64 + 2^48 */ |
1039 | 25.8k | felem_reduce(z_out, tmp); /* z_out[i] < 2^56 */ |
1040 | | |
1041 | | /* y' = alpha*(4*beta - x') - 8*gamma^2 */ |
1042 | 25.8k | felem_scalar64(beta, 4); /* beta[i] < 2^58 */ |
1043 | 25.8k | felem_diff64(beta, x_out); /* beta[i] < 2^60 + 2^58 + 2^44 */ |
1044 | 25.8k | felem_mul(tmp, alpha, beta); /* tmp[i] < 2^119 */ |
1045 | 25.8k | felem_square(tmp2, gamma); /* tmp2[i] < 2^115 */ |
1046 | 25.8k | felem_scalar128(tmp2, 8); /* tmp2[i] < 2^118 */ |
1047 | 25.8k | felem_diff128(tmp, tmp2); /* tmp[i] < 2^127 + 2^119 + 2^111 */ |
1048 | 25.8k | felem_reduce(y_out, tmp); /* tmp[i] < 2^56 */ |
1049 | 25.8k | } |
1050 | | |
1051 | | /* copy_conditional copies in to out iff mask is all ones. */ |
1052 | | static void copy_conditional(felem out, const felem in, limb mask) |
1053 | 155k | { |
1054 | 155k | unsigned int i; |
1055 | | |
1056 | 1.24M | for (i = 0; i < NLIMBS; i++) |
1057 | 1.08M | out[i] ^= mask & (in[i] ^ out[i]); |
1058 | 155k | } |
1059 | | |
1060 | | /*- |
1061 | | * point_add calculates (x1, y1, z1) + (x2, y2, z2) |
1062 | | * |
1063 | | * The method is taken from |
1064 | | * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl, |
1065 | | * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity). |
1066 | | * |
1067 | | * This function includes a branch for checking whether the two input points |
1068 | | * are equal (while not equal to the point at infinity). See comment below |
1069 | | * on constant-time. |
1070 | | */ |
1071 | | static void point_add(felem x3, felem y3, felem z3, |
1072 | | const felem x1, const felem y1, const felem z1, |
1073 | | const int mixed, const felem x2, const felem y2, |
1074 | | const felem z2) |
1075 | 25.8k | { |
1076 | 25.8k | felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out; |
1077 | 25.8k | widefelem tmp, tmp2; |
1078 | 25.8k | limb x_equal, y_equal, z1_is_zero, z2_is_zero; |
1079 | 25.8k | limb points_equal; |
1080 | | |
1081 | 25.8k | z1_is_zero = felem_is_zero(z1); |
1082 | 25.8k | z2_is_zero = felem_is_zero(z2); |
1083 | | |
1084 | | /* ftmp = z1z1 = z1**2 */ |
1085 | 25.8k | felem_square_reduce(ftmp, z1); /* ftmp[i] < 2^56 */ |
1086 | | |
1087 | 25.8k | if (!mixed) { |
1088 | | /* ftmp2 = z2z2 = z2**2 */ |
1089 | 0 | felem_square_reduce(ftmp2, z2); /* ftmp2[i] < 2^56 */ |
1090 | | |
1091 | | /* u1 = ftmp3 = x1*z2z2 */ |
1092 | 0 | felem_mul_reduce(ftmp3, x1, ftmp2); /* ftmp3[i] < 2^56 */ |
1093 | | |
1094 | | /* ftmp5 = z1 + z2 */ |
1095 | 0 | felem_assign(ftmp5, z1); /* ftmp5[i] < 2^56 */ |
1096 | 0 | felem_sum64(ftmp5, z2); /* ftmp5[i] < 2^57 */ |
1097 | | |
1098 | | /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */ |
1099 | 0 | felem_square(tmp, ftmp5); /* tmp[i] < 2^117 */ |
1100 | 0 | felem_diff_128_64(tmp, ftmp); /* tmp[i] < 2^117 + 2^64 + 2^48 */ |
1101 | 0 | felem_diff_128_64(tmp, ftmp2); /* tmp[i] < 2^117 + 2^65 + 2^49 */ |
1102 | 0 | felem_reduce(ftmp5, tmp); /* ftmp5[i] < 2^56 */ |
1103 | | |
1104 | | /* ftmp2 = z2 * z2z2 */ |
1105 | 0 | felem_mul_reduce(ftmp2, ftmp2, z2); /* ftmp2[i] < 2^56 */ |
1106 | | |
1107 | | /* s1 = ftmp6 = y1 * z2**3 */ |
1108 | 0 | felem_mul_reduce(ftmp6, y1, ftmp2); /* ftmp6[i] < 2^56 */ |
1109 | 25.8k | } else { |
1110 | | /* |
1111 | | * We'll assume z2 = 1 (special case z2 = 0 is handled later) |
1112 | | */ |
1113 | | |
1114 | | /* u1 = ftmp3 = x1*z2z2 */ |
1115 | 25.8k | felem_assign(ftmp3, x1); /* ftmp3[i] < 2^56 */ |
1116 | | |
1117 | | /* ftmp5 = 2*z1z2 */ |
1118 | 25.8k | felem_scalar(ftmp5, z1, 2); /* ftmp5[i] < 2^57 */ |
1119 | | |
1120 | | /* s1 = ftmp6 = y1 * z2**3 */ |
1121 | 25.8k | felem_assign(ftmp6, y1); /* ftmp6[i] < 2^56 */ |
1122 | 25.8k | } |
1123 | | /* ftmp3[i] < 2^56, ftmp5[i] < 2^57, ftmp6[i] < 2^56 */ |
1124 | | |
1125 | | /* u2 = x2*z1z1 */ |
1126 | 25.8k | felem_mul(tmp, x2, ftmp); /* tmp[i] < 2^115 */ |
1127 | | |
1128 | | /* h = ftmp4 = u2 - u1 */ |
1129 | 25.8k | felem_diff_128_64(tmp, ftmp3); /* tmp[i] < 2^115 + 2^64 + 2^48 */ |
1130 | 25.8k | felem_reduce(ftmp4, tmp); /* ftmp[4] < 2^56 */ |
1131 | | |
1132 | 25.8k | x_equal = felem_is_zero(ftmp4); |
1133 | | |
1134 | | /* z_out = ftmp5 * h */ |
1135 | 25.8k | felem_mul_reduce(z_out, ftmp5, ftmp4); /* z_out[i] < 2^56 */ |
1136 | | |
1137 | | /* ftmp = z1 * z1z1 */ |
1138 | 25.8k | felem_mul_reduce(ftmp, ftmp, z1); /* ftmp[i] < 2^56 */ |
1139 | | |
1140 | | /* s2 = tmp = y2 * z1**3 */ |
1141 | 25.8k | felem_mul(tmp, y2, ftmp); /* tmp[i] < 2^115 */ |
1142 | | |
1143 | | /* r = ftmp5 = (s2 - s1)*2 */ |
1144 | 25.8k | felem_diff_128_64(tmp, ftmp6); /* tmp[i] < 2^115 + 2^64 + 2^48 */ |
1145 | 25.8k | felem_reduce(ftmp5, tmp); /* ftmp5[i] < 2^56 */ |
1146 | 25.8k | y_equal = felem_is_zero(ftmp5); |
1147 | 25.8k | felem_scalar64(ftmp5, 2); /* ftmp5[i] < 2^57 */ |
1148 | | |
1149 | | /* |
1150 | | * The formulae are incorrect if the points are equal, in affine coordinates |
1151 | | * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this |
1152 | | * happens. |
1153 | | * |
1154 | | * We use bitwise operations to avoid potential side-channels introduced by |
1155 | | * the short-circuiting behaviour of boolean operators. |
1156 | | * |
1157 | | * The special case of either point being the point at infinity (z1 and/or |
1158 | | * z2 are zero), is handled separately later on in this function, so we |
1159 | | * avoid jumping to point_double here in those special cases. |
1160 | | * |
1161 | | * Notice the comment below on the implications of this branching for timing |
1162 | | * leaks and why it is considered practically irrelevant. |
1163 | | */ |
1164 | 25.8k | points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero)); |
1165 | | |
1166 | 25.8k | if (points_equal) { |
1167 | | /* |
1168 | | * This is obviously not constant-time but it will almost-never happen |
1169 | | * for ECDH / ECDSA. |
1170 | | */ |
1171 | 0 | point_double(x3, y3, z3, x1, y1, z1); |
1172 | 0 | return; |
1173 | 0 | } |
1174 | | |
1175 | | /* I = ftmp = (2h)**2 */ |
1176 | 25.8k | felem_assign(ftmp, ftmp4); /* ftmp[i] < 2^56 */ |
1177 | 25.8k | felem_scalar64(ftmp, 2); /* ftmp[i] < 2^57 */ |
1178 | 25.8k | felem_square_reduce(ftmp, ftmp); /* ftmp[i] < 2^56 */ |
1179 | | |
1180 | | /* J = ftmp2 = h * I */ |
1181 | 25.8k | felem_mul_reduce(ftmp2, ftmp4, ftmp); /* ftmp2[i] < 2^56 */ |
1182 | | |
1183 | | /* V = ftmp4 = U1 * I */ |
1184 | 25.8k | felem_mul_reduce(ftmp4, ftmp3, ftmp); /* ftmp4[i] < 2^56 */ |
1185 | | |
1186 | | /* x_out = r**2 - J - 2V */ |
1187 | 25.8k | felem_square(tmp, ftmp5); /* tmp[i] < 2^117 */ |
1188 | 25.8k | felem_diff_128_64(tmp, ftmp2); /* tmp[i] < 2^117 + 2^64 + 2^48 */ |
1189 | 25.8k | felem_assign(ftmp3, ftmp4); /* ftmp3[i] < 2^56 */ |
1190 | 25.8k | felem_scalar64(ftmp4, 2); /* ftmp4[i] < 2^57 */ |
1191 | 25.8k | felem_diff_128_64(tmp, ftmp4); /* tmp[i] < 2^117 + 2^65 + 2^49 */ |
1192 | 25.8k | felem_reduce(x_out, tmp); /* x_out[i] < 2^56 */ |
1193 | | |
1194 | | /* y_out = r(V-x_out) - 2 * s1 * J */ |
1195 | 25.8k | felem_diff64(ftmp3, x_out); /* ftmp3[i] < 2^60 + 2^56 + 2^44 */ |
1196 | 25.8k | felem_mul(tmp, ftmp5, ftmp3); /* tmp[i] < 2^116 */ |
1197 | 25.8k | felem_mul(tmp2, ftmp6, ftmp2); /* tmp2[i] < 2^115 */ |
1198 | 25.8k | felem_scalar128(tmp2, 2); /* tmp2[i] < 2^116 */ |
1199 | 25.8k | felem_diff128(tmp, tmp2); /* tmp[i] < 2^127 + 2^116 + 2^111 */ |
1200 | 25.8k | felem_reduce(y_out, tmp); /* y_out[i] < 2^56 */ |
1201 | | |
1202 | 25.8k | copy_conditional(x_out, x2, z1_is_zero); |
1203 | 25.8k | copy_conditional(x_out, x1, z2_is_zero); |
1204 | 25.8k | copy_conditional(y_out, y2, z1_is_zero); |
1205 | 25.8k | copy_conditional(y_out, y1, z2_is_zero); |
1206 | 25.8k | copy_conditional(z_out, z2, z1_is_zero); |
1207 | 25.8k | copy_conditional(z_out, z1, z2_is_zero); |
1208 | 25.8k | felem_assign(x3, x_out); |
1209 | 25.8k | felem_assign(y3, y_out); |
1210 | 25.8k | felem_assign(z3, z_out); |
1211 | 25.8k | } |
1212 | | |
1213 | | /*- |
1214 | | * Base point pre computation |
1215 | | * -------------------------- |
1216 | | * |
1217 | | * Two different sorts of precomputed tables are used in the following code. |
1218 | | * Each contain various points on the curve, where each point is three field |
1219 | | * elements (x, y, z). |
1220 | | * |
1221 | | * For the base point table, z is usually 1 (0 for the point at infinity). |
1222 | | * This table has 16 elements: |
1223 | | * index | bits | point |
1224 | | * ------+---------+------------------------------ |
1225 | | * 0 | 0 0 0 0 | 0G |
1226 | | * 1 | 0 0 0 1 | 1G |
1227 | | * 2 | 0 0 1 0 | 2^95G |
1228 | | * 3 | 0 0 1 1 | (2^95 + 1)G |
1229 | | * 4 | 0 1 0 0 | 2^190G |
1230 | | * 5 | 0 1 0 1 | (2^190 + 1)G |
1231 | | * 6 | 0 1 1 0 | (2^190 + 2^95)G |
1232 | | * 7 | 0 1 1 1 | (2^190 + 2^95 + 1)G |
1233 | | * 8 | 1 0 0 0 | 2^285G |
1234 | | * 9 | 1 0 0 1 | (2^285 + 1)G |
1235 | | * 10 | 1 0 1 0 | (2^285 + 2^95)G |
1236 | | * 11 | 1 0 1 1 | (2^285 + 2^95 + 1)G |
1237 | | * 12 | 1 1 0 0 | (2^285 + 2^190)G |
1238 | | * 13 | 1 1 0 1 | (2^285 + 2^190 + 1)G |
1239 | | * 14 | 1 1 1 0 | (2^285 + 2^190 + 2^95)G |
1240 | | * 15 | 1 1 1 1 | (2^285 + 2^190 + 2^95 + 1)G |
1241 | | * |
1242 | | * The reason for this is so that we can clock bits into four different |
1243 | | * locations when doing simple scalar multiplies against the base point. |
1244 | | * |
1245 | | * Tables for other points have table[i] = iG for i in 0 .. 16. |
1246 | | */ |
1247 | | |
1248 | | /* gmul is the table of precomputed base points */ |
1249 | | static const felem gmul[16][3] = { |
1250 | | {{0, 0, 0, 0, 0, 0, 0}, |
1251 | | {0, 0, 0, 0, 0, 0, 0}, |
1252 | | {0, 0, 0, 0, 0, 0, 0}}, |
1253 | | {{0x00545e3872760ab7, 0x00f25dbf55296c3a, 0x00e082542a385502, 0x008ba79b9859f741, |
1254 | | 0x0020ad746e1d3b62, 0x0005378eb1c71ef3, 0x0000aa87ca22be8b}, |
1255 | | {0x00431d7c90ea0e5f, 0x00b1ce1d7e819d7a, 0x0013b5f0b8c00a60, 0x00289a147ce9da31, |
1256 | | 0x0092dc29f8f41dbd, 0x002c6f5d9e98bf92, 0x00003617de4a9626}, |
1257 | | {1, 0, 0, 0, 0, 0, 0}}, |
1258 | | {{0x00024711cc902a90, 0x00acb2e579ab4fe1, 0x00af818a4b4d57b1, 0x00a17c7bec49c3de, |
1259 | | 0x004280482d726a8b, 0x00128dd0f0a90f3b, 0x00004387c1c3fa3c}, |
1260 | | {0x002ce76543cf5c3a, 0x00de6cee5ef58f0a, 0x00403e42fa561ca6, 0x00bc54d6f9cb9731, |
1261 | | 0x007155f925fb4ff1, 0x004a9ce731b7b9bc, 0x00002609076bd7b2}, |
1262 | | {1, 0, 0, 0, 0, 0, 0}}, |
1263 | | {{0x00e74c9182f0251d, 0x0039bf54bb111974, 0x00b9d2f2eec511d2, 0x0036b1594eb3a6a4, |
1264 | | 0x00ac3bb82d9d564b, 0x00f9313f4615a100, 0x00006716a9a91b10}, |
1265 | | {0x0046698116e2f15c, 0x00f34347067d3d33, 0x008de4ccfdebd002, 0x00e838c6b8e8c97b, |
1266 | | 0x006faf0798def346, 0x007349794a57563c, 0x00002629e7e6ad84}, |
1267 | | {1, 0, 0, 0, 0, 0, 0}}, |
1268 | | {{0x0075300e34fd163b, 0x0092e9db4e8d0ad3, 0x00254be9f625f760, 0x00512c518c72ae68, |
1269 | | 0x009bfcf162bede5a, 0x00bf9341566ce311, 0x0000cd6175bd41cf}, |
1270 | | {0x007dfe52af4ac70f, 0x0002159d2d5c4880, 0x00b504d16f0af8d0, 0x0014585e11f5e64c, |
1271 | | 0x0089c6388e030967, 0x00ffb270cbfa5f71, 0x00009a15d92c3947}, |
1272 | | {1, 0, 0, 0, 0, 0, 0}}, |
1273 | | {{0x0033fc1278dc4fe5, 0x00d53088c2caa043, 0x0085558827e2db66, 0x00c192bef387b736, |
1274 | | 0x00df6405a2225f2c, 0x0075205aa90fd91a, 0x0000137e3f12349d}, |
1275 | | {0x00ce5b115efcb07e, 0x00abc3308410deeb, 0x005dc6fc1de39904, 0x00907c1c496f36b4, |
1276 | | 0x0008e6ad3926cbe1, 0x00110747b787928c, 0x0000021b9162eb7e}, |
1277 | | {1, 0, 0, 0, 0, 0, 0}}, |
1278 | | {{0x008180042cfa26e1, 0x007b826a96254967, 0x0082473694d6b194, 0x007bd6880a45b589, |
1279 | | 0x00c0a5097072d1a3, 0x0019186555e18b4e, 0x000020278190e5ca}, |
1280 | | {0x00b4bef17de61ac0, 0x009535e3c38ed348, 0x002d4aa8e468ceab, 0x00ef40b431036ad3, |
1281 | | 0x00defd52f4542857, 0x0086edbf98234266, 0x00002025b3a7814d}, |
1282 | | {1, 0, 0, 0, 0, 0, 0}}, |
1283 | | {{0x00b238aa97b886be, 0x00ef3192d6dd3a32, 0x0079f9e01fd62df8, 0x00742e890daba6c5, |
1284 | | 0x008e5289144408ce, 0x0073bbcc8e0171a5, 0x0000c4fd329d3b52}, |
1285 | | {0x00c6f64a15ee23e7, 0x00dcfb7b171cad8b, 0x00039f6cbd805867, 0x00de024e428d4562, |
1286 | | 0x00be6a594d7c64c5, 0x0078467b70dbcd64, 0x0000251f2ed7079b}, |
1287 | | {1, 0, 0, 0, 0, 0, 0}}, |
1288 | | {{0x000e5cc25fc4b872, 0x005ebf10d31ef4e1, 0x0061e0ebd11e8256, 0x0076e026096f5a27, |
1289 | | 0x0013e6fc44662e9a, 0x0042b00289d3597e, 0x000024f089170d88}, |
1290 | | {0x001604d7e0effbe6, 0x0048d77cba64ec2c, 0x008166b16da19e36, 0x006b0d1a0f28c088, |
1291 | | 0x000259fcd47754fd, 0x00cc643e4d725f9a, 0x00007b10f3c79c14}, |
1292 | | {1, 0, 0, 0, 0, 0, 0}}, |
1293 | | {{0x00430155e3b908af, 0x00b801e4fec25226, 0x00b0d4bcfe806d26, 0x009fc4014eb13d37, |
1294 | | 0x0066c94e44ec07e8, 0x00d16adc03874ba2, 0x000030c917a0d2a7}, |
1295 | | {0x00edac9e21eb891c, 0x00ef0fb768102eff, 0x00c088cef272a5f3, 0x00cbf782134e2964, |
1296 | | 0x0001044a7ba9a0e3, 0x00e363f5b194cf3c, 0x00009ce85249e372}, |
1297 | | {1, 0, 0, 0, 0, 0, 0}}, |
1298 | | {{0x001dd492dda5a7eb, 0x008fd577be539fd1, 0x002ff4b25a5fc3f1, 0x0074a8a1b64df72f, |
1299 | | 0x002ba3d8c204a76c, 0x009d5cff95c8235a, 0x0000e014b9406e0f}, |
1300 | | {0x008c2e4dbfc98aba, 0x00f30bb89f1a1436, 0x00b46f7aea3e259c, 0x009224454ac02f54, |
1301 | | 0x00906401f5645fa2, 0x003a1d1940eabc77, 0x00007c9351d680e6}, |
1302 | | {1, 0, 0, 0, 0, 0, 0}}, |
1303 | | {{0x005a35d872ef967c, 0x0049f1b7884e1987, 0x0059d46d7e31f552, 0x00ceb4869d2d0fb6, |
1304 | | 0x00e8e89eee56802a, 0x0049d806a774aaf2, 0x0000147e2af0ae24}, |
1305 | | {0x005fd1bd852c6e5e, 0x00b674b7b3de6885, 0x003b9ea5eb9b6c08, 0x005c9f03babf3ef7, |
1306 | | 0x00605337fecab3c7, 0x009a3f85b11bbcc8, 0x0000455470f330ec}, |
1307 | | {1, 0, 0, 0, 0, 0, 0}}, |
1308 | | {{0x002197ff4d55498d, 0x00383e8916c2d8af, 0x00eb203f34d1c6d2, 0x0080367cbd11b542, |
1309 | | 0x00769b3be864e4f5, 0x0081a8458521c7bb, 0x0000c531b34d3539}, |
1310 | | {0x00e2a3d775fa2e13, 0x00534fc379573844, 0x00ff237d2a8db54a, 0x00d301b2335a8882, |
1311 | | 0x000f75ea96103a80, 0x0018fecb3cdd96fa, 0x0000304bf61e94eb}, |
1312 | | {1, 0, 0, 0, 0, 0, 0}}, |
1313 | | {{0x00b2afc332a73dbd, 0x0029a0d5bb007bc5, 0x002d628eb210f577, 0x009f59a36dd05f50, |
1314 | | 0x006d339de4eca613, 0x00c75a71addc86bc, 0x000060384c5ea93c}, |
1315 | | {0x00aa9641c32a30b4, 0x00cc73ae8cce565d, 0x00ec911a4df07f61, 0x00aa4b762ea4b264, |
1316 | | 0x0096d395bb393629, 0x004efacfb7632fe0, 0x00006f252f46fa3f}, |
1317 | | {1, 0, 0, 0, 0, 0, 0}}, |
1318 | | {{0x00567eec597c7af6, 0x0059ba6795204413, 0x00816d4e6f01196f, 0x004ae6b3eb57951d, |
1319 | | 0x00420f5abdda2108, 0x003401d1f57ca9d9, 0x0000cf5837b0b67a}, |
1320 | | {0x00eaa64b8aeeabf9, 0x00246ddf16bcb4de, 0x000e7e3c3aecd751, 0x0008449f04fed72e, |
1321 | | 0x00307b67ccf09183, 0x0017108c3556b7b1, 0x0000229b2483b3bf}, |
1322 | | {1, 0, 0, 0, 0, 0, 0}}, |
1323 | | {{0x00e7c491a7bb78a1, 0x00eafddd1d3049ab, 0x00352c05e2bc7c98, 0x003d6880c165fa5c, |
1324 | | 0x00b6ac61cc11c97d, 0x00beeb54fcf90ce5, 0x0000dc1f0b455edc}, |
1325 | | {0x002db2e7aee34d60, 0x0073b5f415a2d8c0, 0x00dd84e4193e9a0c, 0x00d02d873467c572, |
1326 | | 0x0018baaeda60aee5, 0x0013fb11d697c61e, 0x000083aafcc3a973}, |
1327 | | {1, 0, 0, 0, 0, 0, 0}} |
1328 | | }; |
1329 | | |
1330 | | /* |
1331 | | * select_point selects the |idx|th point from a precomputation table and |
1332 | | * copies it to out. |
1333 | | * |
1334 | | * pre_comp below is of the size provided in |size|. |
1335 | | */ |
1336 | | static void select_point(const limb idx, unsigned int size, |
1337 | | const felem pre_comp[][3], felem out[3]) |
1338 | 26.1k | { |
1339 | 26.1k | unsigned int i, j; |
1340 | 26.1k | limb *outlimbs = &out[0][0]; |
1341 | | |
1342 | 26.1k | memset(out, 0, sizeof(*out) * 3); |
1343 | | |
1344 | 444k | for (i = 0; i < size; i++) { |
1345 | 418k | const limb *inlimbs = &pre_comp[i][0][0]; |
1346 | 418k | limb mask = i ^ idx; |
1347 | | |
1348 | 418k | mask |= mask >> 4; |
1349 | 418k | mask |= mask >> 2; |
1350 | 418k | mask |= mask >> 1; |
1351 | 418k | mask &= 1; |
1352 | 418k | mask--; |
1353 | 9.19M | for (j = 0; j < NLIMBS * 3; j++) |
1354 | 8.78M | outlimbs[j] |= inlimbs[j] & mask; |
1355 | 418k | } |
1356 | 26.1k | } |
1357 | | |
1358 | | /* get_bit returns the |i|th bit in |in| */ |
1359 | | static char get_bit(const felem_bytearray in, int i) |
1360 | 101k | { |
1361 | 101k | if (i < 0 || i >= 384) |
1362 | 0 | return 0; |
1363 | 101k | return (in[i >> 3] >> (i & 7)) & 1; |
1364 | 101k | } |
1365 | | |
1366 | | /* |
1367 | | * Interleaved point multiplication using precomputed point multiples: The |
1368 | | * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars |
1369 | | * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the |
1370 | | * generator, using certain (large) precomputed multiples in g_pre_comp. |
1371 | | * Output point (X, Y, Z) is stored in x_out, y_out, z_out |
1372 | | */ |
1373 | | static void batch_mul(felem x_out, felem y_out, felem z_out, |
1374 | | const felem_bytearray scalars[], |
1375 | | const unsigned int num_points, const u8 *g_scalar, |
1376 | | const int mixed, const felem pre_comp[][17][3], |
1377 | | const felem g_pre_comp[16][3]) |
1378 | 264 | { |
1379 | 264 | int i, skip; |
1380 | 264 | unsigned int num, gen_mul = (g_scalar != NULL); |
1381 | 264 | felem nq[3], tmp[4]; |
1382 | 264 | limb bits; |
1383 | 264 | u8 sign, digit; |
1384 | | |
1385 | | /* set nq to the point at infinity */ |
1386 | 264 | memset(nq, 0, sizeof(nq)); |
1387 | | |
1388 | | /* |
1389 | | * Loop over all scalars msb-to-lsb, interleaving additions of multiples |
1390 | | * of the generator (last quarter of rounds) and additions of other |
1391 | | * points multiples (every 5th round). |
1392 | | */ |
1393 | 264 | skip = 1; /* save two point operations in the first |
1394 | | * round */ |
1395 | 26.4k | for (i = (num_points ? 380 : 98); i >= 0; --i) { |
1396 | | /* double */ |
1397 | 26.1k | if (!skip) |
1398 | 25.8k | point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]); |
1399 | | |
1400 | | /* add multiples of the generator */ |
1401 | 26.1k | if (gen_mul && (i <= 98)) { |
1402 | 26.1k | bits = get_bit(g_scalar, i + 285) << 3; |
1403 | 26.1k | if (i < 95) { |
1404 | 25.0k | bits |= get_bit(g_scalar, i + 190) << 2; |
1405 | 25.0k | bits |= get_bit(g_scalar, i + 95) << 1; |
1406 | 25.0k | bits |= get_bit(g_scalar, i); |
1407 | 25.0k | } |
1408 | | /* select the point to add, in constant time */ |
1409 | 26.1k | select_point(bits, 16, g_pre_comp, tmp); |
1410 | 26.1k | if (!skip) { |
1411 | | /* The 1 argument below is for "mixed" */ |
1412 | 25.8k | point_add(nq[0], nq[1], nq[2], |
1413 | 25.8k | nq[0], nq[1], nq[2], 1, |
1414 | 25.8k | tmp[0], tmp[1], tmp[2]); |
1415 | 25.8k | } else { |
1416 | 264 | memcpy(nq, tmp, 3 * sizeof(felem)); |
1417 | 264 | skip = 0; |
1418 | 264 | } |
1419 | 26.1k | } |
1420 | | |
1421 | | /* do other additions every 5 doublings */ |
1422 | 26.1k | if (num_points && (i % 5 == 0)) { |
1423 | | /* loop over all scalars */ |
1424 | 0 | for (num = 0; num < num_points; ++num) { |
1425 | 0 | bits = get_bit(scalars[num], i + 4) << 5; |
1426 | 0 | bits |= get_bit(scalars[num], i + 3) << 4; |
1427 | 0 | bits |= get_bit(scalars[num], i + 2) << 3; |
1428 | 0 | bits |= get_bit(scalars[num], i + 1) << 2; |
1429 | 0 | bits |= get_bit(scalars[num], i) << 1; |
1430 | 0 | bits |= get_bit(scalars[num], i - 1); |
1431 | 0 | ossl_ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits); |
1432 | | |
1433 | | /* |
1434 | | * select the point to add or subtract, in constant time |
1435 | | */ |
1436 | 0 | select_point(digit, 17, pre_comp[num], tmp); |
1437 | 0 | felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative |
1438 | | * point */ |
1439 | 0 | copy_conditional(tmp[1], tmp[3], (-(limb) sign)); |
1440 | |
|
1441 | 0 | if (!skip) { |
1442 | 0 | point_add(nq[0], nq[1], nq[2], |
1443 | 0 | nq[0], nq[1], nq[2], mixed, |
1444 | 0 | tmp[0], tmp[1], tmp[2]); |
1445 | 0 | } else { |
1446 | 0 | memcpy(nq, tmp, 3 * sizeof(felem)); |
1447 | 0 | skip = 0; |
1448 | 0 | } |
1449 | 0 | } |
1450 | 0 | } |
1451 | 26.1k | } |
1452 | 264 | felem_assign(x_out, nq[0]); |
1453 | 264 | felem_assign(y_out, nq[1]); |
1454 | 264 | felem_assign(z_out, nq[2]); |
1455 | 264 | } |
1456 | | |
1457 | | /* Precomputation for the group generator. */ |
1458 | | struct nistp384_pre_comp_st { |
1459 | | felem g_pre_comp[16][3]; |
1460 | | CRYPTO_REF_COUNT references; |
1461 | | }; |
1462 | | |
1463 | | const EC_METHOD *ossl_ec_GFp_nistp384_method(void) |
1464 | 264 | { |
1465 | 264 | static const EC_METHOD ret = { |
1466 | 264 | EC_FLAGS_DEFAULT_OCT, |
1467 | 264 | NID_X9_62_prime_field, |
1468 | 264 | ossl_ec_GFp_nistp384_group_init, |
1469 | 264 | ossl_ec_GFp_simple_group_finish, |
1470 | 264 | ossl_ec_GFp_simple_group_clear_finish, |
1471 | 264 | ossl_ec_GFp_nist_group_copy, |
1472 | 264 | ossl_ec_GFp_nistp384_group_set_curve, |
1473 | 264 | ossl_ec_GFp_simple_group_get_curve, |
1474 | 264 | ossl_ec_GFp_simple_group_get_degree, |
1475 | 264 | ossl_ec_group_simple_order_bits, |
1476 | 264 | ossl_ec_GFp_simple_group_check_discriminant, |
1477 | 264 | ossl_ec_GFp_simple_point_init, |
1478 | 264 | ossl_ec_GFp_simple_point_finish, |
1479 | 264 | ossl_ec_GFp_simple_point_clear_finish, |
1480 | 264 | ossl_ec_GFp_simple_point_copy, |
1481 | 264 | ossl_ec_GFp_simple_point_set_to_infinity, |
1482 | 264 | ossl_ec_GFp_simple_point_set_affine_coordinates, |
1483 | 264 | ossl_ec_GFp_nistp384_point_get_affine_coordinates, |
1484 | 264 | 0, /* point_set_compressed_coordinates */ |
1485 | 264 | 0, /* point2oct */ |
1486 | 264 | 0, /* oct2point */ |
1487 | 264 | ossl_ec_GFp_simple_add, |
1488 | 264 | ossl_ec_GFp_simple_dbl, |
1489 | 264 | ossl_ec_GFp_simple_invert, |
1490 | 264 | ossl_ec_GFp_simple_is_at_infinity, |
1491 | 264 | ossl_ec_GFp_simple_is_on_curve, |
1492 | 264 | ossl_ec_GFp_simple_cmp, |
1493 | 264 | ossl_ec_GFp_simple_make_affine, |
1494 | 264 | ossl_ec_GFp_simple_points_make_affine, |
1495 | 264 | ossl_ec_GFp_nistp384_points_mul, |
1496 | 264 | ossl_ec_GFp_nistp384_precompute_mult, |
1497 | 264 | ossl_ec_GFp_nistp384_have_precompute_mult, |
1498 | 264 | ossl_ec_GFp_nist_field_mul, |
1499 | 264 | ossl_ec_GFp_nist_field_sqr, |
1500 | 264 | 0, /* field_div */ |
1501 | 264 | ossl_ec_GFp_simple_field_inv, |
1502 | 264 | 0, /* field_encode */ |
1503 | 264 | 0, /* field_decode */ |
1504 | 264 | 0, /* field_set_to_one */ |
1505 | 264 | ossl_ec_key_simple_priv2oct, |
1506 | 264 | ossl_ec_key_simple_oct2priv, |
1507 | 264 | 0, /* set private */ |
1508 | 264 | ossl_ec_key_simple_generate_key, |
1509 | 264 | ossl_ec_key_simple_check_key, |
1510 | 264 | ossl_ec_key_simple_generate_public_key, |
1511 | 264 | 0, /* keycopy */ |
1512 | 264 | 0, /* keyfinish */ |
1513 | 264 | ossl_ecdh_simple_compute_key, |
1514 | 264 | ossl_ecdsa_simple_sign_setup, |
1515 | 264 | ossl_ecdsa_simple_sign_sig, |
1516 | 264 | ossl_ecdsa_simple_verify_sig, |
1517 | 264 | 0, /* field_inverse_mod_ord */ |
1518 | 264 | 0, /* blind_coordinates */ |
1519 | 264 | 0, /* ladder_pre */ |
1520 | 264 | 0, /* ladder_step */ |
1521 | 264 | 0 /* ladder_post */ |
1522 | 264 | }; |
1523 | | |
1524 | 264 | return &ret; |
1525 | 264 | } |
1526 | | |
1527 | | /******************************************************************************/ |
1528 | | /* |
1529 | | * FUNCTIONS TO MANAGE PRECOMPUTATION |
1530 | | */ |
1531 | | |
1532 | | static NISTP384_PRE_COMP *nistp384_pre_comp_new(void) |
1533 | 0 | { |
1534 | 0 | NISTP384_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret)); |
1535 | |
|
1536 | 0 | if (ret == NULL) |
1537 | 0 | return ret; |
1538 | | |
1539 | 0 | if (!CRYPTO_NEW_REF(&ret->references, 1)) { |
1540 | 0 | OPENSSL_free(ret); |
1541 | 0 | return NULL; |
1542 | 0 | } |
1543 | 0 | return ret; |
1544 | 0 | } |
1545 | | |
1546 | | NISTP384_PRE_COMP *ossl_ec_nistp384_pre_comp_dup(NISTP384_PRE_COMP *p) |
1547 | 0 | { |
1548 | 0 | int i; |
1549 | |
|
1550 | 0 | if (p != NULL) |
1551 | 0 | CRYPTO_UP_REF(&p->references, &i); |
1552 | 0 | return p; |
1553 | 0 | } |
1554 | | |
1555 | | void ossl_ec_nistp384_pre_comp_free(NISTP384_PRE_COMP *p) |
1556 | 0 | { |
1557 | 0 | int i; |
1558 | |
|
1559 | 0 | if (p == NULL) |
1560 | 0 | return; |
1561 | | |
1562 | 0 | CRYPTO_DOWN_REF(&p->references, &i); |
1563 | 0 | REF_PRINT_COUNT("ossl_ec_nistp384", p); |
1564 | 0 | if (i > 0) |
1565 | 0 | return; |
1566 | 0 | REF_ASSERT_ISNT(i < 0); |
1567 | |
|
1568 | 0 | CRYPTO_FREE_REF(&p->references); |
1569 | 0 | OPENSSL_free(p); |
1570 | 0 | } |
1571 | | |
1572 | | /******************************************************************************/ |
1573 | | /* |
1574 | | * OPENSSL EC_METHOD FUNCTIONS |
1575 | | */ |
1576 | | |
1577 | | int ossl_ec_GFp_nistp384_group_init(EC_GROUP *group) |
1578 | 637 | { |
1579 | 637 | int ret; |
1580 | | |
1581 | 637 | ret = ossl_ec_GFp_simple_group_init(group); |
1582 | 637 | group->a_is_minus3 = 1; |
1583 | 637 | return ret; |
1584 | 637 | } |
1585 | | |
1586 | | int ossl_ec_GFp_nistp384_group_set_curve(EC_GROUP *group, const BIGNUM *p, |
1587 | | const BIGNUM *a, const BIGNUM *b, |
1588 | | BN_CTX *ctx) |
1589 | 264 | { |
1590 | 264 | int ret = 0; |
1591 | 264 | BIGNUM *curve_p, *curve_a, *curve_b; |
1592 | 264 | #ifndef FIPS_MODULE |
1593 | 264 | BN_CTX *new_ctx = NULL; |
1594 | | |
1595 | 264 | if (ctx == NULL) |
1596 | 0 | ctx = new_ctx = BN_CTX_new(); |
1597 | 264 | #endif |
1598 | 264 | if (ctx == NULL) |
1599 | 0 | return 0; |
1600 | | |
1601 | 264 | BN_CTX_start(ctx); |
1602 | 264 | curve_p = BN_CTX_get(ctx); |
1603 | 264 | curve_a = BN_CTX_get(ctx); |
1604 | 264 | curve_b = BN_CTX_get(ctx); |
1605 | 264 | if (curve_b == NULL) |
1606 | 0 | goto err; |
1607 | 264 | BN_bin2bn(nistp384_curve_params[0], sizeof(felem_bytearray), curve_p); |
1608 | 264 | BN_bin2bn(nistp384_curve_params[1], sizeof(felem_bytearray), curve_a); |
1609 | 264 | BN_bin2bn(nistp384_curve_params[2], sizeof(felem_bytearray), curve_b); |
1610 | 264 | if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) { |
1611 | 0 | ERR_raise(ERR_LIB_EC, EC_R_WRONG_CURVE_PARAMETERS); |
1612 | 0 | goto err; |
1613 | 0 | } |
1614 | 264 | group->field_mod_func = BN_nist_mod_384; |
1615 | 264 | ret = ossl_ec_GFp_simple_group_set_curve(group, p, a, b, ctx); |
1616 | 264 | err: |
1617 | 264 | BN_CTX_end(ctx); |
1618 | 264 | #ifndef FIPS_MODULE |
1619 | 264 | BN_CTX_free(new_ctx); |
1620 | 264 | #endif |
1621 | 264 | return ret; |
1622 | 264 | } |
1623 | | |
1624 | | /* |
1625 | | * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') = |
1626 | | * (X/Z^2, Y/Z^3) |
1627 | | */ |
1628 | | int ossl_ec_GFp_nistp384_point_get_affine_coordinates(const EC_GROUP *group, |
1629 | | const EC_POINT *point, |
1630 | | BIGNUM *x, BIGNUM *y, |
1631 | | BN_CTX *ctx) |
1632 | 264 | { |
1633 | 264 | felem z1, z2, x_in, y_in, x_out, y_out; |
1634 | 264 | widefelem tmp; |
1635 | | |
1636 | 264 | if (EC_POINT_is_at_infinity(group, point)) { |
1637 | 0 | ERR_raise(ERR_LIB_EC, EC_R_POINT_AT_INFINITY); |
1638 | 0 | return 0; |
1639 | 0 | } |
1640 | 264 | if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) || |
1641 | 264 | (!BN_to_felem(z1, point->Z))) |
1642 | 0 | return 0; |
1643 | 264 | felem_inv(z2, z1); |
1644 | 264 | felem_square(tmp, z2); |
1645 | 264 | felem_reduce(z1, tmp); |
1646 | 264 | felem_mul(tmp, x_in, z1); |
1647 | 264 | felem_reduce(x_in, tmp); |
1648 | 264 | felem_contract(x_out, x_in); |
1649 | 264 | if (x != NULL) { |
1650 | 264 | if (!felem_to_BN(x, x_out)) { |
1651 | 0 | ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB); |
1652 | 0 | return 0; |
1653 | 0 | } |
1654 | 264 | } |
1655 | 264 | felem_mul(tmp, z1, z2); |
1656 | 264 | felem_reduce(z1, tmp); |
1657 | 264 | felem_mul(tmp, y_in, z1); |
1658 | 264 | felem_reduce(y_in, tmp); |
1659 | 264 | felem_contract(y_out, y_in); |
1660 | 264 | if (y != NULL) { |
1661 | 264 | if (!felem_to_BN(y, y_out)) { |
1662 | 0 | ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB); |
1663 | 0 | return 0; |
1664 | 0 | } |
1665 | 264 | } |
1666 | 264 | return 1; |
1667 | 264 | } |
1668 | | |
1669 | | /* points below is of size |num|, and tmp_felems is of size |num+1/ */ |
1670 | | static void make_points_affine(size_t num, felem points[][3], |
1671 | | felem tmp_felems[]) |
1672 | 0 | { |
1673 | | /* |
1674 | | * Runs in constant time, unless an input is the point at infinity (which |
1675 | | * normally shouldn't happen). |
1676 | | */ |
1677 | 0 | ossl_ec_GFp_nistp_points_make_affine_internal(num, |
1678 | 0 | points, |
1679 | 0 | sizeof(felem), |
1680 | 0 | tmp_felems, |
1681 | 0 | (void (*)(void *))felem_one, |
1682 | 0 | felem_is_zero_int, |
1683 | 0 | (void (*)(void *, const void *)) |
1684 | 0 | felem_assign, |
1685 | 0 | (void (*)(void *, const void *)) |
1686 | 0 | felem_square_reduce, |
1687 | 0 | (void (*)(void *, const void *, const void*)) |
1688 | 0 | felem_mul_reduce, |
1689 | 0 | (void (*)(void *, const void *)) |
1690 | 0 | felem_inv, |
1691 | 0 | (void (*)(void *, const void *)) |
1692 | 0 | felem_contract); |
1693 | 0 | } |
1694 | | |
1695 | | /* |
1696 | | * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL |
1697 | | * values Result is stored in r (r can equal one of the inputs). |
1698 | | */ |
1699 | | int ossl_ec_GFp_nistp384_points_mul(const EC_GROUP *group, EC_POINT *r, |
1700 | | const BIGNUM *scalar, size_t num, |
1701 | | const EC_POINT *points[], |
1702 | | const BIGNUM *scalars[], BN_CTX *ctx) |
1703 | 264 | { |
1704 | 264 | int ret = 0; |
1705 | 264 | int j; |
1706 | 264 | int mixed = 0; |
1707 | 264 | BIGNUM *x, *y, *z, *tmp_scalar; |
1708 | 264 | felem_bytearray g_secret; |
1709 | 264 | felem_bytearray *secrets = NULL; |
1710 | 264 | felem (*pre_comp)[17][3] = NULL; |
1711 | 264 | felem *tmp_felems = NULL; |
1712 | 264 | unsigned int i; |
1713 | 264 | int num_bytes; |
1714 | 264 | int have_pre_comp = 0; |
1715 | 264 | size_t num_points = num; |
1716 | 264 | felem x_in, y_in, z_in, x_out, y_out, z_out; |
1717 | 264 | NISTP384_PRE_COMP *pre = NULL; |
1718 | 264 | felem(*g_pre_comp)[3] = NULL; |
1719 | 264 | EC_POINT *generator = NULL; |
1720 | 264 | const EC_POINT *p = NULL; |
1721 | 264 | const BIGNUM *p_scalar = NULL; |
1722 | | |
1723 | 264 | BN_CTX_start(ctx); |
1724 | 264 | x = BN_CTX_get(ctx); |
1725 | 264 | y = BN_CTX_get(ctx); |
1726 | 264 | z = BN_CTX_get(ctx); |
1727 | 264 | tmp_scalar = BN_CTX_get(ctx); |
1728 | 264 | if (tmp_scalar == NULL) |
1729 | 0 | goto err; |
1730 | | |
1731 | 264 | if (scalar != NULL) { |
1732 | 264 | pre = group->pre_comp.nistp384; |
1733 | 264 | if (pre) |
1734 | | /* we have precomputation, try to use it */ |
1735 | 0 | g_pre_comp = &pre->g_pre_comp[0]; |
1736 | 264 | else |
1737 | | /* try to use the standard precomputation */ |
1738 | 264 | g_pre_comp = (felem(*)[3]) gmul; |
1739 | 264 | generator = EC_POINT_new(group); |
1740 | 264 | if (generator == NULL) |
1741 | 0 | goto err; |
1742 | | /* get the generator from precomputation */ |
1743 | 264 | if (!felem_to_BN(x, g_pre_comp[1][0]) || |
1744 | 264 | !felem_to_BN(y, g_pre_comp[1][1]) || |
1745 | 264 | !felem_to_BN(z, g_pre_comp[1][2])) { |
1746 | 0 | ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB); |
1747 | 0 | goto err; |
1748 | 0 | } |
1749 | 264 | if (!ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group, |
1750 | 264 | generator, |
1751 | 264 | x, y, z, ctx)) |
1752 | 0 | goto err; |
1753 | 264 | if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) |
1754 | | /* precomputation matches generator */ |
1755 | 264 | have_pre_comp = 1; |
1756 | 0 | else |
1757 | | /* |
1758 | | * we don't have valid precomputation: treat the generator as a |
1759 | | * random point |
1760 | | */ |
1761 | 0 | num_points++; |
1762 | 264 | } |
1763 | | |
1764 | 264 | if (num_points > 0) { |
1765 | 0 | if (num_points >= 2) { |
1766 | | /* |
1767 | | * unless we precompute multiples for just one point, converting |
1768 | | * those into affine form is time well spent |
1769 | | */ |
1770 | 0 | mixed = 1; |
1771 | 0 | } |
1772 | 0 | secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points); |
1773 | 0 | pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points); |
1774 | 0 | if (mixed) |
1775 | 0 | tmp_felems = |
1776 | 0 | OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1)); |
1777 | 0 | if ((secrets == NULL) || (pre_comp == NULL) |
1778 | 0 | || (mixed && (tmp_felems == NULL))) |
1779 | 0 | goto err; |
1780 | | |
1781 | | /* |
1782 | | * we treat NULL scalars as 0, and NULL points as points at infinity, |
1783 | | * i.e., they contribute nothing to the linear combination |
1784 | | */ |
1785 | 0 | for (i = 0; i < num_points; ++i) { |
1786 | 0 | if (i == num) { |
1787 | | /* |
1788 | | * we didn't have a valid precomputation, so we pick the |
1789 | | * generator |
1790 | | */ |
1791 | 0 | p = EC_GROUP_get0_generator(group); |
1792 | 0 | p_scalar = scalar; |
1793 | 0 | } else { |
1794 | | /* the i^th point */ |
1795 | 0 | p = points[i]; |
1796 | 0 | p_scalar = scalars[i]; |
1797 | 0 | } |
1798 | 0 | if (p_scalar != NULL && p != NULL) { |
1799 | | /* reduce scalar to 0 <= scalar < 2^384 */ |
1800 | 0 | if ((BN_num_bits(p_scalar) > 384) |
1801 | 0 | || (BN_is_negative(p_scalar))) { |
1802 | | /* |
1803 | | * this is an unusual input, and we don't guarantee |
1804 | | * constant-timeness |
1805 | | */ |
1806 | 0 | if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) { |
1807 | 0 | ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB); |
1808 | 0 | goto err; |
1809 | 0 | } |
1810 | 0 | num_bytes = BN_bn2lebinpad(tmp_scalar, |
1811 | 0 | secrets[i], sizeof(secrets[i])); |
1812 | 0 | } else { |
1813 | 0 | num_bytes = BN_bn2lebinpad(p_scalar, |
1814 | 0 | secrets[i], sizeof(secrets[i])); |
1815 | 0 | } |
1816 | 0 | if (num_bytes < 0) { |
1817 | 0 | ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB); |
1818 | 0 | goto err; |
1819 | 0 | } |
1820 | | /* precompute multiples */ |
1821 | 0 | if ((!BN_to_felem(x_out, p->X)) || |
1822 | 0 | (!BN_to_felem(y_out, p->Y)) || |
1823 | 0 | (!BN_to_felem(z_out, p->Z))) |
1824 | 0 | goto err; |
1825 | 0 | memcpy(pre_comp[i][1][0], x_out, sizeof(felem)); |
1826 | 0 | memcpy(pre_comp[i][1][1], y_out, sizeof(felem)); |
1827 | 0 | memcpy(pre_comp[i][1][2], z_out, sizeof(felem)); |
1828 | 0 | for (j = 2; j <= 16; ++j) { |
1829 | 0 | if (j & 1) { |
1830 | 0 | point_add(pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2], |
1831 | 0 | pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2], 0, |
1832 | 0 | pre_comp[i][j - 1][0], pre_comp[i][j - 1][1], pre_comp[i][j - 1][2]); |
1833 | 0 | } else { |
1834 | 0 | point_double(pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2], |
1835 | 0 | pre_comp[i][j / 2][0], pre_comp[i][j / 2][1], pre_comp[i][j / 2][2]); |
1836 | 0 | } |
1837 | 0 | } |
1838 | 0 | } |
1839 | 0 | } |
1840 | 0 | if (mixed) |
1841 | 0 | make_points_affine(num_points * 17, pre_comp[0], tmp_felems); |
1842 | 0 | } |
1843 | | |
1844 | | /* the scalar for the generator */ |
1845 | 264 | if (scalar != NULL && have_pre_comp) { |
1846 | 264 | memset(g_secret, 0, sizeof(g_secret)); |
1847 | | /* reduce scalar to 0 <= scalar < 2^384 */ |
1848 | 264 | if ((BN_num_bits(scalar) > 384) || (BN_is_negative(scalar))) { |
1849 | | /* |
1850 | | * this is an unusual input, and we don't guarantee |
1851 | | * constant-timeness |
1852 | | */ |
1853 | 0 | if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) { |
1854 | 0 | ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB); |
1855 | 0 | goto err; |
1856 | 0 | } |
1857 | 0 | num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret)); |
1858 | 264 | } else { |
1859 | 264 | num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret)); |
1860 | 264 | } |
1861 | | /* do the multiplication with generator precomputation */ |
1862 | 264 | batch_mul(x_out, y_out, z_out, |
1863 | 264 | (const felem_bytearray(*))secrets, num_points, |
1864 | 264 | g_secret, |
1865 | 264 | mixed, (const felem(*)[17][3])pre_comp, |
1866 | 264 | (const felem(*)[3])g_pre_comp); |
1867 | 264 | } else { |
1868 | | /* do the multiplication without generator precomputation */ |
1869 | 0 | batch_mul(x_out, y_out, z_out, |
1870 | 0 | (const felem_bytearray(*))secrets, num_points, |
1871 | 0 | NULL, mixed, (const felem(*)[17][3])pre_comp, NULL); |
1872 | 0 | } |
1873 | | /* reduce the output to its unique minimal representation */ |
1874 | 264 | felem_contract(x_in, x_out); |
1875 | 264 | felem_contract(y_in, y_out); |
1876 | 264 | felem_contract(z_in, z_out); |
1877 | 264 | if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) || |
1878 | 264 | (!felem_to_BN(z, z_in))) { |
1879 | 0 | ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB); |
1880 | 0 | goto err; |
1881 | 0 | } |
1882 | 264 | ret = ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group, r, x, y, z, |
1883 | 264 | ctx); |
1884 | | |
1885 | 264 | err: |
1886 | 264 | BN_CTX_end(ctx); |
1887 | 264 | EC_POINT_free(generator); |
1888 | 264 | OPENSSL_free(secrets); |
1889 | 264 | OPENSSL_free(pre_comp); |
1890 | 264 | OPENSSL_free(tmp_felems); |
1891 | 264 | return ret; |
1892 | 264 | } |
1893 | | |
1894 | | int ossl_ec_GFp_nistp384_precompute_mult(EC_GROUP *group, BN_CTX *ctx) |
1895 | 0 | { |
1896 | 0 | int ret = 0; |
1897 | 0 | NISTP384_PRE_COMP *pre = NULL; |
1898 | 0 | int i, j; |
1899 | 0 | BIGNUM *x, *y; |
1900 | 0 | EC_POINT *generator = NULL; |
1901 | 0 | felem tmp_felems[16]; |
1902 | 0 | #ifndef FIPS_MODULE |
1903 | 0 | BN_CTX *new_ctx = NULL; |
1904 | 0 | #endif |
1905 | | |
1906 | | /* throw away old precomputation */ |
1907 | 0 | EC_pre_comp_free(group); |
1908 | |
|
1909 | 0 | #ifndef FIPS_MODULE |
1910 | 0 | if (ctx == NULL) |
1911 | 0 | ctx = new_ctx = BN_CTX_new(); |
1912 | 0 | #endif |
1913 | 0 | if (ctx == NULL) |
1914 | 0 | return 0; |
1915 | | |
1916 | 0 | BN_CTX_start(ctx); |
1917 | 0 | x = BN_CTX_get(ctx); |
1918 | 0 | y = BN_CTX_get(ctx); |
1919 | 0 | if (y == NULL) |
1920 | 0 | goto err; |
1921 | | /* get the generator */ |
1922 | 0 | if (group->generator == NULL) |
1923 | 0 | goto err; |
1924 | 0 | generator = EC_POINT_new(group); |
1925 | 0 | if (generator == NULL) |
1926 | 0 | goto err; |
1927 | 0 | BN_bin2bn(nistp384_curve_params[3], sizeof(felem_bytearray), x); |
1928 | 0 | BN_bin2bn(nistp384_curve_params[4], sizeof(felem_bytearray), y); |
1929 | 0 | if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx)) |
1930 | 0 | goto err; |
1931 | 0 | if ((pre = nistp384_pre_comp_new()) == NULL) |
1932 | 0 | goto err; |
1933 | | /* |
1934 | | * if the generator is the standard one, use built-in precomputation |
1935 | | */ |
1936 | 0 | if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) { |
1937 | 0 | memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp)); |
1938 | 0 | goto done; |
1939 | 0 | } |
1940 | 0 | if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) || |
1941 | 0 | (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) || |
1942 | 0 | (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z))) |
1943 | 0 | goto err; |
1944 | | /* compute 2^95*G, 2^190*G, 2^285*G */ |
1945 | 0 | for (i = 1; i <= 4; i <<= 1) { |
1946 | 0 | point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], |
1947 | 0 | pre->g_pre_comp[i][0], pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]); |
1948 | 0 | for (j = 0; j < 94; ++j) { |
1949 | 0 | point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], |
1950 | 0 | pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2]); |
1951 | 0 | } |
1952 | 0 | } |
1953 | | /* g_pre_comp[0] is the point at infinity */ |
1954 | 0 | memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0])); |
1955 | | /* the remaining multiples */ |
1956 | | /* 2^95*G + 2^190*G */ |
1957 | 0 | point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1], pre->g_pre_comp[6][2], |
1958 | 0 | pre->g_pre_comp[4][0], pre->g_pre_comp[4][1], pre->g_pre_comp[4][2], 0, |
1959 | 0 | pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], pre->g_pre_comp[2][2]); |
1960 | | /* 2^95*G + 2^285*G */ |
1961 | 0 | point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1], pre->g_pre_comp[10][2], |
1962 | 0 | pre->g_pre_comp[8][0], pre->g_pre_comp[8][1], pre->g_pre_comp[8][2], 0, |
1963 | 0 | pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], pre->g_pre_comp[2][2]); |
1964 | | /* 2^190*G + 2^285*G */ |
1965 | 0 | point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1], pre->g_pre_comp[12][2], |
1966 | 0 | pre->g_pre_comp[8][0], pre->g_pre_comp[8][1], pre->g_pre_comp[8][2], 0, |
1967 | 0 | pre->g_pre_comp[4][0], pre->g_pre_comp[4][1], pre->g_pre_comp[4][2]); |
1968 | | /* 2^95*G + 2^190*G + 2^285*G */ |
1969 | 0 | point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1], pre->g_pre_comp[14][2], |
1970 | 0 | pre->g_pre_comp[12][0], pre->g_pre_comp[12][1], pre->g_pre_comp[12][2], 0, |
1971 | 0 | pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], pre->g_pre_comp[2][2]); |
1972 | 0 | for (i = 1; i < 8; ++i) { |
1973 | | /* odd multiples: add G */ |
1974 | 0 | point_add(pre->g_pre_comp[2 * i + 1][0], pre->g_pre_comp[2 * i + 1][1], pre->g_pre_comp[2 * i + 1][2], |
1975 | 0 | pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0, |
1976 | 0 | pre->g_pre_comp[1][0], pre->g_pre_comp[1][1], pre->g_pre_comp[1][2]); |
1977 | 0 | } |
1978 | 0 | make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems); |
1979 | |
|
1980 | 0 | done: |
1981 | 0 | SETPRECOMP(group, nistp384, pre); |
1982 | 0 | ret = 1; |
1983 | 0 | pre = NULL; |
1984 | 0 | err: |
1985 | 0 | BN_CTX_end(ctx); |
1986 | 0 | EC_POINT_free(generator); |
1987 | 0 | #ifndef FIPS_MODULE |
1988 | 0 | BN_CTX_free(new_ctx); |
1989 | 0 | #endif |
1990 | 0 | ossl_ec_nistp384_pre_comp_free(pre); |
1991 | 0 | return ret; |
1992 | 0 | } |
1993 | | |
1994 | | int ossl_ec_GFp_nistp384_have_precompute_mult(const EC_GROUP *group) |
1995 | 0 | { |
1996 | 0 | return HAVEPRECOMP(group, nistp384); |
1997 | 0 | } |