/src/BearSSL/src/ec/ec_c25519_m31.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org> |
3 | | * |
4 | | * Permission is hereby granted, free of charge, to any person obtaining |
5 | | * a copy of this software and associated documentation files (the |
6 | | * "Software"), to deal in the Software without restriction, including |
7 | | * without limitation the rights to use, copy, modify, merge, publish, |
8 | | * distribute, sublicense, and/or sell copies of the Software, and to |
9 | | * permit persons to whom the Software is furnished to do so, subject to |
10 | | * the following conditions: |
11 | | * |
12 | | * The above copyright notice and this permission notice shall be |
13 | | * included in all copies or substantial portions of the Software. |
14 | | * |
15 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
16 | | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
17 | | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
18 | | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS |
19 | | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN |
20 | | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
21 | | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
22 | | * SOFTWARE. |
23 | | */ |
24 | | |
25 | | #include "inner.h" |
26 | | |
27 | | /* obsolete |
28 | | #include <stdio.h> |
29 | | #include <stdlib.h> |
30 | | static void |
31 | | print_int(const char *name, const uint32_t *x) |
32 | | { |
33 | | size_t u; |
34 | | unsigned char tmp[40]; |
35 | | |
36 | | printf("%s = ", name); |
37 | | for (u = 0; u < 9; u ++) { |
38 | | if (x[u] > 0x3FFFFFFF) { |
39 | | printf("INVALID:"); |
40 | | for (u = 0; u < 9; u ++) { |
41 | | printf(" %08X", x[u]); |
42 | | } |
43 | | printf("\n"); |
44 | | return; |
45 | | } |
46 | | } |
47 | | memset(tmp, 0, sizeof tmp); |
48 | | for (u = 0; u < 9; u ++) { |
49 | | uint64_t w; |
50 | | int j, k; |
51 | | |
52 | | w = x[u]; |
53 | | j = 30 * (int)u; |
54 | | k = j & 7; |
55 | | if (k != 0) { |
56 | | w <<= k; |
57 | | j -= k; |
58 | | } |
59 | | k = j >> 3; |
60 | | for (j = 0; j < 8; j ++) { |
61 | | tmp[39 - k - j] |= (unsigned char)w; |
62 | | w >>= 8; |
63 | | } |
64 | | } |
65 | | for (u = 8; u < 40; u ++) { |
66 | | printf("%02X", tmp[u]); |
67 | | } |
68 | | printf("\n"); |
69 | | } |
70 | | */ |
71 | | |
72 | | /* |
73 | | * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_ |
74 | | * that right-shifting a signed negative integer copies the sign bit |
75 | | * (arithmetic right-shift). This is "implementation-defined behaviour", |
76 | | * i.e. it is not undefined, but it may differ between compilers. Each |
77 | | * compiler is supposed to document its behaviour in that respect. GCC |
78 | | * explicitly defines that an arithmetic right shift is used. We expect |
79 | | * all other compilers to do the same, because underlying CPU offer an |
80 | | * arithmetic right shift opcode that could not be used otherwise. |
81 | | */ |
82 | | #if BR_NO_ARITH_SHIFT |
83 | | #define ARSH(x, n) (((uint32_t)(x) >> (n)) \ |
84 | | | ((-((uint32_t)(x) >> 31)) << (32 - (n)))) |
85 | | #else |
86 | 137k | #define ARSH(x, n) ((*(int32_t *)&(x)) >> (n)) |
87 | | #endif |
88 | | |
89 | | /* |
90 | | * Convert an integer from unsigned little-endian encoding to a sequence of |
91 | | * 30-bit words in little-endian order. The final "partial" word is |
92 | | * returned. |
93 | | */ |
94 | | static uint32_t |
95 | | le8_to_le30(uint32_t *dst, const unsigned char *src, size_t len) |
96 | 15 | { |
97 | 15 | uint32_t acc; |
98 | 15 | int acc_len; |
99 | | |
100 | 15 | acc = 0; |
101 | 15 | acc_len = 0; |
102 | 495 | while (len -- > 0) { |
103 | 480 | uint32_t b; |
104 | | |
105 | 480 | b = *src ++; |
106 | 480 | if (acc_len < 22) { |
107 | 360 | acc |= b << acc_len; |
108 | 360 | acc_len += 8; |
109 | 360 | } else { |
110 | 120 | *dst ++ = (acc | (b << acc_len)) & 0x3FFFFFFF; |
111 | 120 | acc = b >> (30 - acc_len); |
112 | 120 | acc_len -= 22; |
113 | 120 | } |
114 | 480 | } |
115 | 15 | return acc; |
116 | 15 | } |
117 | | |
118 | | /* |
119 | | * Convert an integer (30-bit words, little-endian) to unsigned |
120 | | * little-endian encoding. The total encoding length is provided; all |
121 | | * the destination bytes will be filled. |
122 | | */ |
123 | | static void |
124 | | le30_to_le8(unsigned char *dst, size_t len, const uint32_t *src) |
125 | 15 | { |
126 | 15 | uint32_t acc; |
127 | 15 | int acc_len; |
128 | | |
129 | 15 | acc = 0; |
130 | 15 | acc_len = 0; |
131 | 495 | while (len -- > 0) { |
132 | 480 | if (acc_len < 8) { |
133 | 135 | uint32_t w; |
134 | | |
135 | 135 | w = *src ++; |
136 | 135 | *dst ++ = (unsigned char)(acc | (w << acc_len)); |
137 | 135 | acc = w >> (8 - acc_len); |
138 | 135 | acc_len += 22; |
139 | 345 | } else { |
140 | 345 | *dst ++ = (unsigned char)acc; |
141 | 345 | acc >>= 8; |
142 | 345 | acc_len -= 8; |
143 | 345 | } |
144 | 480 | } |
145 | 15 | } |
146 | | |
147 | | /* |
148 | | * Multiply two integers. Source integers are represented as arrays of |
149 | | * nine 30-bit words, for values up to 2^270-1. Result is encoded over |
150 | | * 18 words of 30 bits each. |
151 | | */ |
152 | | static void |
153 | | mul9(uint32_t *d, const uint32_t *a, const uint32_t *b) |
154 | 19.7k | { |
155 | | /* |
156 | | * Maximum intermediate result is no more than |
157 | | * 10376293531797946367, which fits in 64 bits. Reason: |
158 | | * |
159 | | * 10376293531797946367 = 9 * (2^30-1)^2 + 9663676406 |
160 | | * 10376293531797946367 < 9663676407 * 2^30 |
161 | | * |
162 | | * Thus, adding together 9 products of 30-bit integers, with |
163 | | * a carry of at most 9663676406, yields an integer that fits |
164 | | * on 64 bits and generates a carry of at most 9663676406. |
165 | | */ |
166 | 19.7k | uint64_t t[17]; |
167 | 19.7k | uint64_t cc; |
168 | 19.7k | int i; |
169 | | |
170 | 19.7k | t[ 0] = MUL31(a[0], b[0]); |
171 | 19.7k | t[ 1] = MUL31(a[0], b[1]) |
172 | 19.7k | + MUL31(a[1], b[0]); |
173 | 19.7k | t[ 2] = MUL31(a[0], b[2]) |
174 | 19.7k | + MUL31(a[1], b[1]) |
175 | 19.7k | + MUL31(a[2], b[0]); |
176 | 19.7k | t[ 3] = MUL31(a[0], b[3]) |
177 | 19.7k | + MUL31(a[1], b[2]) |
178 | 19.7k | + MUL31(a[2], b[1]) |
179 | 19.7k | + MUL31(a[3], b[0]); |
180 | 19.7k | t[ 4] = MUL31(a[0], b[4]) |
181 | 19.7k | + MUL31(a[1], b[3]) |
182 | 19.7k | + MUL31(a[2], b[2]) |
183 | 19.7k | + MUL31(a[3], b[1]) |
184 | 19.7k | + MUL31(a[4], b[0]); |
185 | 19.7k | t[ 5] = MUL31(a[0], b[5]) |
186 | 19.7k | + MUL31(a[1], b[4]) |
187 | 19.7k | + MUL31(a[2], b[3]) |
188 | 19.7k | + MUL31(a[3], b[2]) |
189 | 19.7k | + MUL31(a[4], b[1]) |
190 | 19.7k | + MUL31(a[5], b[0]); |
191 | 19.7k | t[ 6] = MUL31(a[0], b[6]) |
192 | 19.7k | + MUL31(a[1], b[5]) |
193 | 19.7k | + MUL31(a[2], b[4]) |
194 | 19.7k | + MUL31(a[3], b[3]) |
195 | 19.7k | + MUL31(a[4], b[2]) |
196 | 19.7k | + MUL31(a[5], b[1]) |
197 | 19.7k | + MUL31(a[6], b[0]); |
198 | 19.7k | t[ 7] = MUL31(a[0], b[7]) |
199 | 19.7k | + MUL31(a[1], b[6]) |
200 | 19.7k | + MUL31(a[2], b[5]) |
201 | 19.7k | + MUL31(a[3], b[4]) |
202 | 19.7k | + MUL31(a[4], b[3]) |
203 | 19.7k | + MUL31(a[5], b[2]) |
204 | 19.7k | + MUL31(a[6], b[1]) |
205 | 19.7k | + MUL31(a[7], b[0]); |
206 | 19.7k | t[ 8] = MUL31(a[0], b[8]) |
207 | 19.7k | + MUL31(a[1], b[7]) |
208 | 19.7k | + MUL31(a[2], b[6]) |
209 | 19.7k | + MUL31(a[3], b[5]) |
210 | 19.7k | + MUL31(a[4], b[4]) |
211 | 19.7k | + MUL31(a[5], b[3]) |
212 | 19.7k | + MUL31(a[6], b[2]) |
213 | 19.7k | + MUL31(a[7], b[1]) |
214 | 19.7k | + MUL31(a[8], b[0]); |
215 | 19.7k | t[ 9] = MUL31(a[1], b[8]) |
216 | 19.7k | + MUL31(a[2], b[7]) |
217 | 19.7k | + MUL31(a[3], b[6]) |
218 | 19.7k | + MUL31(a[4], b[5]) |
219 | 19.7k | + MUL31(a[5], b[4]) |
220 | 19.7k | + MUL31(a[6], b[3]) |
221 | 19.7k | + MUL31(a[7], b[2]) |
222 | 19.7k | + MUL31(a[8], b[1]); |
223 | 19.7k | t[10] = MUL31(a[2], b[8]) |
224 | 19.7k | + MUL31(a[3], b[7]) |
225 | 19.7k | + MUL31(a[4], b[6]) |
226 | 19.7k | + MUL31(a[5], b[5]) |
227 | 19.7k | + MUL31(a[6], b[4]) |
228 | 19.7k | + MUL31(a[7], b[3]) |
229 | 19.7k | + MUL31(a[8], b[2]); |
230 | 19.7k | t[11] = MUL31(a[3], b[8]) |
231 | 19.7k | + MUL31(a[4], b[7]) |
232 | 19.7k | + MUL31(a[5], b[6]) |
233 | 19.7k | + MUL31(a[6], b[5]) |
234 | 19.7k | + MUL31(a[7], b[4]) |
235 | 19.7k | + MUL31(a[8], b[3]); |
236 | 19.7k | t[12] = MUL31(a[4], b[8]) |
237 | 19.7k | + MUL31(a[5], b[7]) |
238 | 19.7k | + MUL31(a[6], b[6]) |
239 | 19.7k | + MUL31(a[7], b[5]) |
240 | 19.7k | + MUL31(a[8], b[4]); |
241 | 19.7k | t[13] = MUL31(a[5], b[8]) |
242 | 19.7k | + MUL31(a[6], b[7]) |
243 | 19.7k | + MUL31(a[7], b[6]) |
244 | 19.7k | + MUL31(a[8], b[5]); |
245 | 19.7k | t[14] = MUL31(a[6], b[8]) |
246 | 19.7k | + MUL31(a[7], b[7]) |
247 | 19.7k | + MUL31(a[8], b[6]); |
248 | 19.7k | t[15] = MUL31(a[7], b[8]) |
249 | 19.7k | + MUL31(a[8], b[7]); |
250 | 19.7k | t[16] = MUL31(a[8], b[8]); |
251 | | |
252 | | /* |
253 | | * Propagate carries. |
254 | | */ |
255 | 19.7k | cc = 0; |
256 | 355k | for (i = 0; i < 17; i ++) { |
257 | 336k | uint64_t w; |
258 | | |
259 | 336k | w = t[i] + cc; |
260 | 336k | d[i] = (uint32_t)w & 0x3FFFFFFF; |
261 | 336k | cc = w >> 30; |
262 | 336k | } |
263 | 19.7k | d[17] = (uint32_t)cc; |
264 | 19.7k | } |
265 | | |
266 | | /* |
267 | | * Square a 270-bit integer, represented as an array of nine 30-bit words. |
268 | | * Result uses 18 words of 30 bits each. |
269 | | */ |
270 | | static void |
271 | | square9(uint32_t *d, const uint32_t *a) |
272 | 19.1k | { |
273 | 19.1k | uint64_t t[17]; |
274 | 19.1k | uint64_t cc; |
275 | 19.1k | int i; |
276 | | |
277 | 19.1k | t[ 0] = MUL31(a[0], a[0]); |
278 | 19.1k | t[ 1] = ((MUL31(a[0], a[1])) << 1); |
279 | 19.1k | t[ 2] = MUL31(a[1], a[1]) |
280 | 19.1k | + ((MUL31(a[0], a[2])) << 1); |
281 | 19.1k | t[ 3] = ((MUL31(a[0], a[3]) |
282 | 19.1k | + MUL31(a[1], a[2])) << 1); |
283 | 19.1k | t[ 4] = MUL31(a[2], a[2]) |
284 | 19.1k | + ((MUL31(a[0], a[4]) |
285 | 19.1k | + MUL31(a[1], a[3])) << 1); |
286 | 19.1k | t[ 5] = ((MUL31(a[0], a[5]) |
287 | 19.1k | + MUL31(a[1], a[4]) |
288 | 19.1k | + MUL31(a[2], a[3])) << 1); |
289 | 19.1k | t[ 6] = MUL31(a[3], a[3]) |
290 | 19.1k | + ((MUL31(a[0], a[6]) |
291 | 19.1k | + MUL31(a[1], a[5]) |
292 | 19.1k | + MUL31(a[2], a[4])) << 1); |
293 | 19.1k | t[ 7] = ((MUL31(a[0], a[7]) |
294 | 19.1k | + MUL31(a[1], a[6]) |
295 | 19.1k | + MUL31(a[2], a[5]) |
296 | 19.1k | + MUL31(a[3], a[4])) << 1); |
297 | 19.1k | t[ 8] = MUL31(a[4], a[4]) |
298 | 19.1k | + ((MUL31(a[0], a[8]) |
299 | 19.1k | + MUL31(a[1], a[7]) |
300 | 19.1k | + MUL31(a[2], a[6]) |
301 | 19.1k | + MUL31(a[3], a[5])) << 1); |
302 | 19.1k | t[ 9] = ((MUL31(a[1], a[8]) |
303 | 19.1k | + MUL31(a[2], a[7]) |
304 | 19.1k | + MUL31(a[3], a[6]) |
305 | 19.1k | + MUL31(a[4], a[5])) << 1); |
306 | 19.1k | t[10] = MUL31(a[5], a[5]) |
307 | 19.1k | + ((MUL31(a[2], a[8]) |
308 | 19.1k | + MUL31(a[3], a[7]) |
309 | 19.1k | + MUL31(a[4], a[6])) << 1); |
310 | 19.1k | t[11] = ((MUL31(a[3], a[8]) |
311 | 19.1k | + MUL31(a[4], a[7]) |
312 | 19.1k | + MUL31(a[5], a[6])) << 1); |
313 | 19.1k | t[12] = MUL31(a[6], a[6]) |
314 | 19.1k | + ((MUL31(a[4], a[8]) |
315 | 19.1k | + MUL31(a[5], a[7])) << 1); |
316 | 19.1k | t[13] = ((MUL31(a[5], a[8]) |
317 | 19.1k | + MUL31(a[6], a[7])) << 1); |
318 | 19.1k | t[14] = MUL31(a[7], a[7]) |
319 | 19.1k | + ((MUL31(a[6], a[8])) << 1); |
320 | 19.1k | t[15] = ((MUL31(a[7], a[8])) << 1); |
321 | 19.1k | t[16] = MUL31(a[8], a[8]); |
322 | | |
323 | | /* |
324 | | * Propagate carries. |
325 | | */ |
326 | 19.1k | cc = 0; |
327 | 343k | for (i = 0; i < 17; i ++) { |
328 | 324k | uint64_t w; |
329 | | |
330 | 324k | w = t[i] + cc; |
331 | 324k | d[i] = (uint32_t)w & 0x3FFFFFFF; |
332 | 324k | cc = w >> 30; |
333 | 324k | } |
334 | 19.1k | d[17] = (uint32_t)cc; |
335 | 19.1k | } |
336 | | |
337 | | /* |
338 | | * Perform a "final reduction" in field F255 (field for Curve25519) |
339 | | * The source value must be less than twice the modulus. If the value |
340 | | * is not lower than the modulus, then the modulus is subtracted and |
341 | | * this function returns 1; otherwise, it leaves it untouched and it |
342 | | * returns 0. |
343 | | */ |
344 | | static uint32_t |
345 | | reduce_final_f255(uint32_t *d) |
346 | 15 | { |
347 | 15 | uint32_t t[9]; |
348 | 15 | uint32_t cc; |
349 | 15 | int i; |
350 | | |
351 | 15 | memcpy(t, d, sizeof t); |
352 | 15 | cc = 19; |
353 | 150 | for (i = 0; i < 9; i ++) { |
354 | 135 | uint32_t w; |
355 | | |
356 | 135 | w = t[i] + cc; |
357 | 135 | cc = w >> 30; |
358 | 135 | t[i] = w & 0x3FFFFFFF; |
359 | 135 | } |
360 | 15 | cc = t[8] >> 15; |
361 | 15 | t[8] &= 0x7FFF; |
362 | 15 | CCOPY(cc, d, t, sizeof t); |
363 | 15 | return cc; |
364 | 15 | } |
365 | | |
366 | | /* |
367 | | * Perform a multiplication of two integers modulo 2^255-19. |
368 | | * Operands are arrays of 9 words, each containing 30 bits of data, in |
369 | | * little-endian order. Input value may be up to 2^256-1; on output, value |
370 | | * fits on 256 bits and is lower than twice the modulus. |
371 | | */ |
372 | | static void |
373 | | f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b) |
374 | 19.7k | { |
375 | 19.7k | uint32_t t[18], cc; |
376 | 19.7k | int i; |
377 | | |
378 | | /* |
379 | | * Compute raw multiplication. All result words fit in 30 bits |
380 | | * each; upper word (t[17]) must fit on 2 bits, since the product |
381 | | * of two 256-bit integers must fit on 512 bits. |
382 | | */ |
383 | 19.7k | mul9(t, a, b); |
384 | | |
385 | | /* |
386 | | * Modular reduction: each high word is added where necessary. |
387 | | * Since the modulus is 2^255-19 and word 9 corresponds to |
388 | | * offset 9*30 = 270, word 9+k must be added to word k with |
389 | | * a factor of 19*2^15 = 622592. The extra bits in word 8 are also |
390 | | * added that way. |
391 | | * |
392 | | * Keeping the carry on 32 bits helps with 32-bit architectures, |
393 | | * and does not noticeably impact performance on 64-bit systems. |
394 | | */ |
395 | 19.7k | cc = MUL15(t[8] >> 15, 19); /* at most 19*(2^15-1) = 622573 */ |
396 | 19.7k | t[8] &= 0x7FFF; |
397 | 197k | for (i = 0; i < 9; i ++) { |
398 | 177k | uint64_t w; |
399 | | |
400 | 177k | w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592); |
401 | 177k | t[i] = (uint32_t)w & 0x3FFFFFFF; |
402 | 177k | cc = (uint32_t)(w >> 30); /* at most 622592 */ |
403 | 177k | } |
404 | | |
405 | | /* |
406 | | * Original product was up to (2^256-1)^2, i.e. a 512-bit integer. |
407 | | * This was split into two parts (upper of 257 bits, lower of 255 |
408 | | * bits), and the upper was added to the lower with a factor 19, |
409 | | * which means that the intermediate value is less than 77*2^255 |
410 | | * (19*2^257 + 2^255). Therefore, the extra bits "t[8] >> 15" are |
411 | | * less than 77, and the initial carry cc is at most 76*19 = 1444. |
412 | | */ |
413 | 19.7k | cc = MUL15(t[8] >> 15, 19); |
414 | 19.7k | t[8] &= 0x7FFF; |
415 | 197k | for (i = 0; i < 9; i ++) { |
416 | 177k | uint32_t z; |
417 | | |
418 | 177k | z = t[i] + cc; |
419 | 177k | d[i] = z & 0x3FFFFFFF; |
420 | 177k | cc = z >> 30; |
421 | 177k | } |
422 | | |
423 | | /* |
424 | | * Final result is at most 2^255 + 1443. In particular, the last |
425 | | * carry is necessarily 0, since t[8] was truncated to 15 bits. |
426 | | */ |
427 | 19.7k | } |
428 | | |
429 | | /* |
430 | | * Perform a squaring of an integer modulo 2^255-19. |
431 | | * Operands are arrays of 9 words, each containing 30 bits of data, in |
432 | | * little-endian order. Input value may be up to 2^256-1; on output, value |
433 | | * fits on 256 bits and is lower than twice the modulus. |
434 | | */ |
435 | | static void |
436 | | f255_square(uint32_t *d, const uint32_t *a) |
437 | 19.1k | { |
438 | 19.1k | uint32_t t[18], cc; |
439 | 19.1k | int i; |
440 | | |
441 | | /* |
442 | | * Compute raw squaring. All result words fit in 30 bits |
443 | | * each; upper word (t[17]) must fit on 2 bits, since the square |
444 | | * of a 256-bit integers must fit on 512 bits. |
445 | | */ |
446 | 19.1k | square9(t, a); |
447 | | |
448 | | /* |
449 | | * Modular reduction: each high word is added where necessary. |
450 | | * See f255_mul() for details on the reduction and carry limits. |
451 | | */ |
452 | 19.1k | cc = MUL15(t[8] >> 15, 19); |
453 | 19.1k | t[8] &= 0x7FFF; |
454 | 191k | for (i = 0; i < 9; i ++) { |
455 | 171k | uint64_t w; |
456 | | |
457 | 171k | w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592); |
458 | 171k | t[i] = (uint32_t)w & 0x3FFFFFFF; |
459 | 171k | cc = (uint32_t)(w >> 30); |
460 | 171k | } |
461 | 19.1k | cc = MUL15(t[8] >> 15, 19); |
462 | 19.1k | t[8] &= 0x7FFF; |
463 | 191k | for (i = 0; i < 9; i ++) { |
464 | 171k | uint32_t z; |
465 | | |
466 | 171k | z = t[i] + cc; |
467 | 171k | d[i] = z & 0x3FFFFFFF; |
468 | 171k | cc = z >> 30; |
469 | 171k | } |
470 | 19.1k | } |
471 | | |
472 | | /* |
473 | | * Add two values in F255. Partial reduction is performed (down to less |
474 | | * than twice the modulus). |
475 | | */ |
476 | | static void |
477 | | f255_add(uint32_t *d, const uint32_t *a, const uint32_t *b) |
478 | 15.3k | { |
479 | | /* |
480 | | * Since operand words fit on 30 bits, we can use 32-bit |
481 | | * variables throughout. |
482 | | */ |
483 | 15.3k | int i; |
484 | 15.3k | uint32_t cc, w; |
485 | | |
486 | 15.3k | cc = 0; |
487 | 153k | for (i = 0; i < 9; i ++) { |
488 | 137k | w = a[i] + b[i] + cc; |
489 | 137k | d[i] = w & 0x3FFFFFFF; |
490 | 137k | cc = w >> 30; |
491 | 137k | } |
492 | 15.3k | cc = MUL15(w >> 15, 19); |
493 | 15.3k | d[8] &= 0x7FFF; |
494 | 153k | for (i = 0; i < 9; i ++) { |
495 | 137k | w = d[i] + cc; |
496 | 137k | d[i] = w & 0x3FFFFFFF; |
497 | 137k | cc = w >> 30; |
498 | 137k | } |
499 | 15.3k | } |
500 | | |
501 | | /* |
502 | | * Subtract one value from another in F255. Partial reduction is |
503 | | * performed (down to less than twice the modulus). |
504 | | */ |
505 | | static void |
506 | | f255_sub(uint32_t *d, const uint32_t *a, const uint32_t *b) |
507 | 15.3k | { |
508 | | /* |
509 | | * We actually compute a - b + 2*p, so that the final value is |
510 | | * necessarily positive. |
511 | | */ |
512 | 15.3k | int i; |
513 | 15.3k | uint32_t cc, w; |
514 | | |
515 | 15.3k | cc = (uint32_t)-38; |
516 | 153k | for (i = 0; i < 9; i ++) { |
517 | 137k | w = a[i] - b[i] + cc; |
518 | 137k | d[i] = w & 0x3FFFFFFF; |
519 | 137k | cc = ARSH(w, 30); |
520 | 137k | } |
521 | 15.3k | cc = MUL15((w + 0x10000) >> 15, 19); |
522 | 15.3k | d[8] &= 0x7FFF; |
523 | 153k | for (i = 0; i < 9; i ++) { |
524 | 137k | w = d[i] + cc; |
525 | 137k | d[i] = w & 0x3FFFFFFF; |
526 | 137k | cc = w >> 30; |
527 | 137k | } |
528 | 15.3k | } |
529 | | |
530 | | /* |
531 | | * Multiply an integer by the 'A24' constant (121665). Partial reduction |
532 | | * is performed (down to less than twice the modulus). |
533 | | */ |
534 | | static void |
535 | | f255_mul_a24(uint32_t *d, const uint32_t *a) |
536 | 3.82k | { |
537 | 3.82k | int i; |
538 | 3.82k | uint64_t w; |
539 | 3.82k | uint32_t cc; |
540 | | |
541 | | /* |
542 | | * a[] is over 256 bits, thus a[8] has length at most 16 bits. |
543 | | * We single out the processing of the last word: intermediate |
544 | | * value w is up to 121665*2^16, yielding a carry for the next |
545 | | * loop of at most 19*(121665*2^16/2^15) = 4623289. |
546 | | */ |
547 | 3.82k | cc = 0; |
548 | 34.4k | for (i = 0; i < 8; i ++) { |
549 | 30.6k | w = MUL31(a[i], 121665) + (uint64_t)cc; |
550 | 30.6k | d[i] = (uint32_t)w & 0x3FFFFFFF; |
551 | 30.6k | cc = (uint32_t)(w >> 30); |
552 | 30.6k | } |
553 | 3.82k | w = MUL31(a[8], 121665) + (uint64_t)cc; |
554 | 3.82k | d[8] = (uint32_t)w & 0x7FFF; |
555 | 3.82k | cc = MUL15((uint32_t)(w >> 15), 19); |
556 | | |
557 | 38.2k | for (i = 0; i < 9; i ++) { |
558 | 34.4k | uint32_t z; |
559 | | |
560 | 34.4k | z = d[i] + cc; |
561 | 34.4k | d[i] = z & 0x3FFFFFFF; |
562 | 34.4k | cc = z >> 30; |
563 | 34.4k | } |
564 | 3.82k | } |
565 | | |
566 | | static const unsigned char GEN[] = { |
567 | | 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
568 | | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
569 | | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
570 | | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 |
571 | | }; |
572 | | |
573 | | static const unsigned char ORDER[] = { |
574 | | 0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
575 | | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
576 | | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
577 | | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF |
578 | | }; |
579 | | |
580 | | static const unsigned char * |
581 | | api_generator(int curve, size_t *len) |
582 | 15 | { |
583 | 15 | (void)curve; |
584 | 15 | *len = 32; |
585 | 15 | return GEN; |
586 | 15 | } |
587 | | |
588 | | static const unsigned char * |
589 | | api_order(int curve, size_t *len) |
590 | 15 | { |
591 | 15 | (void)curve; |
592 | 15 | *len = 32; |
593 | 15 | return ORDER; |
594 | 15 | } |
595 | | |
596 | | static size_t |
597 | | api_xoff(int curve, size_t *len) |
598 | 0 | { |
599 | 0 | (void)curve; |
600 | 0 | *len = 32; |
601 | 0 | return 0; |
602 | 0 | } |
603 | | |
604 | | static void |
605 | | cswap(uint32_t *a, uint32_t *b, uint32_t ctl) |
606 | 7.68k | { |
607 | 7.68k | int i; |
608 | | |
609 | 7.68k | ctl = -ctl; |
610 | 76.8k | for (i = 0; i < 9; i ++) { |
611 | 69.1k | uint32_t aw, bw, tw; |
612 | | |
613 | 69.1k | aw = a[i]; |
614 | 69.1k | bw = b[i]; |
615 | 69.1k | tw = ctl & (aw ^ bw); |
616 | 69.1k | a[i] = aw ^ tw; |
617 | 69.1k | b[i] = bw ^ tw; |
618 | 69.1k | } |
619 | 7.68k | } |
620 | | |
621 | | static uint32_t |
622 | | api_mul(unsigned char *G, size_t Glen, |
623 | | const unsigned char *kb, size_t kblen, int curve) |
624 | 15 | { |
625 | 15 | uint32_t x1[9], x2[9], x3[9], z2[9], z3[9]; |
626 | 15 | uint32_t a[9], aa[9], b[9], bb[9]; |
627 | 15 | uint32_t c[9], d[9], e[9], da[9], cb[9]; |
628 | 15 | unsigned char k[32]; |
629 | 15 | uint32_t swap; |
630 | 15 | int i; |
631 | | |
632 | 15 | (void)curve; |
633 | | |
634 | | /* |
635 | | * Points are encoded over exactly 32 bytes. Multipliers must fit |
636 | | * in 32 bytes as well. |
637 | | * RFC 7748 mandates that the high bit of the last point byte must |
638 | | * be ignored/cleared. |
639 | | */ |
640 | 15 | if (Glen != 32 || kblen > 32) { |
641 | 0 | return 0; |
642 | 0 | } |
643 | 15 | G[31] &= 0x7F; |
644 | | |
645 | | /* |
646 | | * Initialise variables x1, x2, z2, x3 and z3. We set all of them |
647 | | * into Montgomery representation. |
648 | | */ |
649 | 15 | x1[8] = le8_to_le30(x1, G, 32); |
650 | 15 | memcpy(x3, x1, sizeof x1); |
651 | 15 | memset(z2, 0, sizeof z2); |
652 | 15 | memset(x2, 0, sizeof x2); |
653 | 15 | x2[0] = 1; |
654 | 15 | memset(z3, 0, sizeof z3); |
655 | 15 | z3[0] = 1; |
656 | | |
657 | 15 | memset(k, 0, (sizeof k) - kblen); |
658 | 15 | memcpy(k + (sizeof k) - kblen, kb, kblen); |
659 | 15 | k[31] &= 0xF8; |
660 | 15 | k[0] &= 0x7F; |
661 | 15 | k[0] |= 0x40; |
662 | | |
663 | | /* obsolete |
664 | | print_int("x1", x1); |
665 | | */ |
666 | | |
667 | 15 | swap = 0; |
668 | 3.84k | for (i = 254; i >= 0; i --) { |
669 | 3.82k | uint32_t kt; |
670 | | |
671 | 3.82k | kt = (k[31 - (i >> 3)] >> (i & 7)) & 1; |
672 | 3.82k | swap ^= kt; |
673 | 3.82k | cswap(x2, x3, swap); |
674 | 3.82k | cswap(z2, z3, swap); |
675 | 3.82k | swap = kt; |
676 | | |
677 | | /* obsolete |
678 | | print_int("x2", x2); |
679 | | print_int("z2", z2); |
680 | | print_int("x3", x3); |
681 | | print_int("z3", z3); |
682 | | */ |
683 | | |
684 | 3.82k | f255_add(a, x2, z2); |
685 | 3.82k | f255_square(aa, a); |
686 | 3.82k | f255_sub(b, x2, z2); |
687 | 3.82k | f255_square(bb, b); |
688 | 3.82k | f255_sub(e, aa, bb); |
689 | 3.82k | f255_add(c, x3, z3); |
690 | 3.82k | f255_sub(d, x3, z3); |
691 | 3.82k | f255_mul(da, d, a); |
692 | 3.82k | f255_mul(cb, c, b); |
693 | | |
694 | | /* obsolete |
695 | | print_int("a ", a); |
696 | | print_int("aa", aa); |
697 | | print_int("b ", b); |
698 | | print_int("bb", bb); |
699 | | print_int("e ", e); |
700 | | print_int("c ", c); |
701 | | print_int("d ", d); |
702 | | print_int("da", da); |
703 | | print_int("cb", cb); |
704 | | */ |
705 | | |
706 | 3.82k | f255_add(x3, da, cb); |
707 | 3.82k | f255_square(x3, x3); |
708 | 3.82k | f255_sub(z3, da, cb); |
709 | 3.82k | f255_square(z3, z3); |
710 | 3.82k | f255_mul(z3, z3, x1); |
711 | 3.82k | f255_mul(x2, aa, bb); |
712 | 3.82k | f255_mul_a24(z2, e); |
713 | 3.82k | f255_add(z2, z2, aa); |
714 | 3.82k | f255_mul(z2, e, z2); |
715 | | |
716 | | /* obsolete |
717 | | print_int("x2", x2); |
718 | | print_int("z2", z2); |
719 | | print_int("x3", x3); |
720 | | print_int("z3", z3); |
721 | | */ |
722 | 3.82k | } |
723 | 15 | cswap(x2, x3, swap); |
724 | 15 | cswap(z2, z3, swap); |
725 | | |
726 | | /* |
727 | | * Inverse z2 with a modular exponentiation. This is a simple |
728 | | * square-and-multiply algorithm; we mutualise most non-squarings |
729 | | * since the exponent contains almost only ones. |
730 | | */ |
731 | 15 | memcpy(a, z2, sizeof z2); |
732 | 240 | for (i = 0; i < 15; i ++) { |
733 | 225 | f255_square(a, a); |
734 | 225 | f255_mul(a, a, z2); |
735 | 225 | } |
736 | 15 | memcpy(b, a, sizeof a); |
737 | 225 | for (i = 0; i < 14; i ++) { |
738 | 210 | int j; |
739 | | |
740 | 3.57k | for (j = 0; j < 16; j ++) { |
741 | 3.36k | f255_square(b, b); |
742 | 3.36k | } |
743 | 210 | f255_mul(b, b, a); |
744 | 210 | } |
745 | 240 | for (i = 14; i >= 0; i --) { |
746 | 225 | f255_square(b, b); |
747 | 225 | if ((0xFFEB >> i) & 1) { |
748 | 195 | f255_mul(b, z2, b); |
749 | 195 | } |
750 | 225 | } |
751 | 15 | f255_mul(x2, x2, b); |
752 | 15 | reduce_final_f255(x2); |
753 | 15 | le30_to_le8(G, 32, x2); |
754 | 15 | return 1; |
755 | 15 | } |
756 | | |
757 | | static size_t |
758 | | api_mulgen(unsigned char *R, |
759 | | const unsigned char *x, size_t xlen, int curve) |
760 | 15 | { |
761 | 15 | const unsigned char *G; |
762 | 15 | size_t Glen; |
763 | | |
764 | 15 | G = api_generator(curve, &Glen); |
765 | 15 | memcpy(R, G, Glen); |
766 | 15 | api_mul(R, Glen, x, xlen, curve); |
767 | 15 | return Glen; |
768 | 15 | } |
769 | | |
770 | | static uint32_t |
771 | | api_muladd(unsigned char *A, const unsigned char *B, size_t len, |
772 | | const unsigned char *x, size_t xlen, |
773 | | const unsigned char *y, size_t ylen, int curve) |
774 | 0 | { |
775 | | /* |
776 | | * We don't implement this method, since it is used for ECDSA |
777 | | * only, and there is no ECDSA over Curve25519 (which instead |
778 | | * uses EdDSA). |
779 | | */ |
780 | 0 | (void)A; |
781 | 0 | (void)B; |
782 | 0 | (void)len; |
783 | 0 | (void)x; |
784 | 0 | (void)xlen; |
785 | 0 | (void)y; |
786 | 0 | (void)ylen; |
787 | 0 | (void)curve; |
788 | 0 | return 0; |
789 | 0 | } |
790 | | |
791 | | /* see bearssl_ec.h */ |
792 | | const br_ec_impl br_ec_c25519_m31 = { |
793 | | (uint32_t)0x20000000, |
794 | | &api_generator, |
795 | | &api_order, |
796 | | &api_xoff, |
797 | | &api_mul, |
798 | | &api_mulgen, |
799 | | &api_muladd |
800 | | }; |