/src/BearSSL/src/ec/ec_c25519_m62.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org> |
3 | | * |
4 | | * Permission is hereby granted, free of charge, to any person obtaining |
5 | | * a copy of this software and associated documentation files (the |
6 | | * "Software"), to deal in the Software without restriction, including |
7 | | * without limitation the rights to use, copy, modify, merge, publish, |
8 | | * distribute, sublicense, and/or sell copies of the Software, and to |
9 | | * permit persons to whom the Software is furnished to do so, subject to |
10 | | * the following conditions: |
11 | | * |
12 | | * The above copyright notice and this permission notice shall be |
13 | | * included in all copies or substantial portions of the Software. |
14 | | * |
15 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
16 | | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
17 | | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
18 | | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS |
19 | | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN |
20 | | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
21 | | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
22 | | * SOFTWARE. |
23 | | */ |
24 | | |
25 | | #include "inner.h" |
26 | | |
27 | | #if BR_INT128 || BR_UMUL128 |
28 | | |
29 | | #if BR_UMUL128 |
30 | | #include <intrin.h> |
31 | | #endif |
32 | | |
33 | | static const unsigned char GEN[] = { |
34 | | 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
35 | | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
36 | | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
37 | | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 |
38 | | }; |
39 | | |
40 | | static const unsigned char ORDER[] = { |
41 | | 0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
42 | | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
43 | | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, |
44 | | 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF |
45 | | }; |
46 | | |
47 | | static const unsigned char * |
48 | | api_generator(int curve, size_t *len) |
49 | 14 | { |
50 | 14 | (void)curve; |
51 | 14 | *len = 32; |
52 | 14 | return GEN; |
53 | 14 | } |
54 | | |
55 | | static const unsigned char * |
56 | | api_order(int curve, size_t *len) |
57 | 13 | { |
58 | 13 | (void)curve; |
59 | 13 | *len = 32; |
60 | 13 | return ORDER; |
61 | 13 | } |
62 | | |
63 | | static size_t |
64 | | api_xoff(int curve, size_t *len) |
65 | 0 | { |
66 | 0 | (void)curve; |
67 | 0 | *len = 32; |
68 | 0 | return 0; |
69 | 0 | } |
70 | | |
71 | | /* |
72 | | * A field element is encoded as five 64-bit integers, in basis 2^51. |
73 | | * Limbs may be occasionally larger than 2^51, to save on carry |
74 | | * propagation costs. |
75 | | */ |
76 | | |
77 | 1.19M | #define MASK51 (((uint64_t)1 << 51) - (uint64_t)1) |
78 | | |
79 | | /* |
80 | | * Swap two field elements, conditionally on a flag. |
81 | | */ |
82 | | static inline void |
83 | | f255_cswap(uint64_t *a, uint64_t *b, uint32_t ctl) |
84 | 7.16k | { |
85 | 7.16k | uint64_t m, w; |
86 | | |
87 | 7.16k | m = -(uint64_t)ctl; |
88 | 7.16k | w = m & (a[0] ^ b[0]); a[0] ^= w; b[0] ^= w; |
89 | 7.16k | w = m & (a[1] ^ b[1]); a[1] ^= w; b[1] ^= w; |
90 | 7.16k | w = m & (a[2] ^ b[2]); a[2] ^= w; b[2] ^= w; |
91 | 7.16k | w = m & (a[3] ^ b[3]); a[3] ^= w; b[3] ^= w; |
92 | 7.16k | w = m & (a[4] ^ b[4]); a[4] ^= w; b[4] ^= w; |
93 | 7.16k | } |
94 | | |
95 | | /* |
96 | | * Addition with no carry propagation. Limbs double in size. |
97 | | */ |
98 | | static inline void |
99 | | f255_add(uint64_t *d, const uint64_t *a, const uint64_t *b) |
100 | 14.2k | { |
101 | 14.2k | d[0] = a[0] + b[0]; |
102 | 14.2k | d[1] = a[1] + b[1]; |
103 | 14.2k | d[2] = a[2] + b[2]; |
104 | 14.2k | d[3] = a[3] + b[3]; |
105 | 14.2k | d[4] = a[4] + b[4]; |
106 | 14.2k | } |
107 | | |
108 | | /* |
109 | | * Subtraction. |
110 | | * On input, limbs must fit on 60 bits each. On output, result is |
111 | | * partially reduced, with max value 2^255+19456; moreover, all |
112 | | * limbs will fit on 51 bits, except the low limb, which may have |
113 | | * value up to 2^51+19455. |
114 | | */ |
115 | | static inline void |
116 | | f255_sub(uint64_t *d, const uint64_t *a, const uint64_t *b) |
117 | 14.2k | { |
118 | 14.2k | uint64_t cc, w; |
119 | | |
120 | | /* |
121 | | * We compute d = (2^255-19)*1024 + a - b. Since the limbs |
122 | | * fit on 60 bits, the maximum value of operands are slightly |
123 | | * more than 2^264, but much less than 2^265-19456. This |
124 | | * ensures that the result is positive. |
125 | | */ |
126 | | |
127 | | /* |
128 | | * Initial carry is 19456, since we add 2^265-19456. Each |
129 | | * individual subtraction may yield a carry up to 513. |
130 | | */ |
131 | 14.2k | w = a[0] - b[0] - 19456; |
132 | 14.2k | d[0] = w & MASK51; |
133 | 14.2k | cc = -(w >> 51) & 0x3FF; |
134 | 14.2k | w = a[1] - b[1] - cc; |
135 | 14.2k | d[1] = w & MASK51; |
136 | 14.2k | cc = -(w >> 51) & 0x3FF; |
137 | 14.2k | w = a[2] - b[2] - cc; |
138 | 14.2k | d[2] = w & MASK51; |
139 | 14.2k | cc = -(w >> 51) & 0x3FF; |
140 | 14.2k | w = a[3] - b[3] - cc; |
141 | 14.2k | d[3] = w & MASK51; |
142 | 14.2k | cc = -(w >> 51) & 0x3FF; |
143 | 14.2k | d[4] = ((uint64_t)1 << 61) + a[4] - b[4] - cc; |
144 | | |
145 | | /* |
146 | | * Partial reduction. The intermediate result may be up to |
147 | | * slightly above 2^265, but less than 2^265+2^255. When we |
148 | | * truncate to 255 bits, the upper bits will be at most 1024. |
149 | | */ |
150 | 14.2k | d[0] += 19 * (d[4] >> 51); |
151 | 14.2k | d[4] &= MASK51; |
152 | 14.2k | } |
153 | | |
154 | | /* |
155 | | * UMUL51(hi, lo, x, y) computes: |
156 | | * |
157 | | * hi = floor((x * y) / (2^51)) |
158 | | * lo = x * y mod 2^51 |
159 | | * |
160 | | * Note that lo < 2^51, but "hi" may be larger, if the input operands are |
161 | | * larger. |
162 | | */ |
163 | | #if BR_INT128 |
164 | | |
165 | 907k | #define UMUL51(hi, lo, x, y) do { \ |
166 | 907k | unsigned __int128 umul_tmp; \ |
167 | 907k | umul_tmp = (unsigned __int128)(x) * (unsigned __int128)(y); \ |
168 | 907k | (hi) = (uint64_t)(umul_tmp >> 51); \ |
169 | 907k | (lo) = (uint64_t)umul_tmp & MASK51; \ |
170 | 907k | } while (0) |
171 | | |
172 | | #elif BR_UMUL128 |
173 | | |
174 | | #define UMUL51(hi, lo, x, y) do { \ |
175 | | uint64_t umul_hi, umul_lo; \ |
176 | | umul_lo = _umul128((x), (y), &umul_hi); \ |
177 | | (hi) = (umul_hi << 13) | (umul_lo >> 51); \ |
178 | | (lo) = umul_lo & MASK51; \ |
179 | | } while (0) |
180 | | |
181 | | #endif |
182 | | |
183 | | /* |
184 | | * Multiplication. |
185 | | * On input, limbs must fit on 54 bits each. |
186 | | * On output, limb 0 is at most 2^51 + 155647, and other limbs fit |
187 | | * on 51 bits each. |
188 | | */ |
189 | | static inline void |
190 | | f255_mul(uint64_t *d, uint64_t *a, uint64_t *b) |
191 | 36.2k | { |
192 | 36.2k | uint64_t t[10], hi, lo, w, cc; |
193 | | |
194 | | /* |
195 | | * Perform cross products, accumulating values without carry |
196 | | * propagation. |
197 | | * |
198 | | * Since input limbs fit on 54 bits each, each individual |
199 | | * UMUL51 will produce a "hi" of less than 2^57. The maximum |
200 | | * sum will be at most 5*(2^57-1) + 4*(2^51-1) (for t[5]), |
201 | | * i.e. less than 324*2^51. |
202 | | */ |
203 | | |
204 | 36.2k | UMUL51(t[1], t[0], a[0], b[0]); |
205 | | |
206 | 36.2k | UMUL51(t[2], lo, a[1], b[0]); t[1] += lo; |
207 | 36.2k | UMUL51(hi, lo, a[0], b[1]); t[1] += lo; t[2] += hi; |
208 | | |
209 | 36.2k | UMUL51(t[3], lo, a[2], b[0]); t[2] += lo; |
210 | 36.2k | UMUL51(hi, lo, a[1], b[1]); t[2] += lo; t[3] += hi; |
211 | 36.2k | UMUL51(hi, lo, a[0], b[2]); t[2] += lo; t[3] += hi; |
212 | | |
213 | 36.2k | UMUL51(t[4], lo, a[3], b[0]); t[3] += lo; |
214 | 36.2k | UMUL51(hi, lo, a[2], b[1]); t[3] += lo; t[4] += hi; |
215 | 36.2k | UMUL51(hi, lo, a[1], b[2]); t[3] += lo; t[4] += hi; |
216 | 36.2k | UMUL51(hi, lo, a[0], b[3]); t[3] += lo; t[4] += hi; |
217 | | |
218 | 36.2k | UMUL51(t[5], lo, a[4], b[0]); t[4] += lo; |
219 | 36.2k | UMUL51(hi, lo, a[3], b[1]); t[4] += lo; t[5] += hi; |
220 | 36.2k | UMUL51(hi, lo, a[2], b[2]); t[4] += lo; t[5] += hi; |
221 | 36.2k | UMUL51(hi, lo, a[1], b[3]); t[4] += lo; t[5] += hi; |
222 | 36.2k | UMUL51(hi, lo, a[0], b[4]); t[4] += lo; t[5] += hi; |
223 | | |
224 | 36.2k | UMUL51(t[6], lo, a[4], b[1]); t[5] += lo; |
225 | 36.2k | UMUL51(hi, lo, a[3], b[2]); t[5] += lo; t[6] += hi; |
226 | 36.2k | UMUL51(hi, lo, a[2], b[3]); t[5] += lo; t[6] += hi; |
227 | 36.2k | UMUL51(hi, lo, a[1], b[4]); t[5] += lo; t[6] += hi; |
228 | | |
229 | 36.2k | UMUL51(t[7], lo, a[4], b[2]); t[6] += lo; |
230 | 36.2k | UMUL51(hi, lo, a[3], b[3]); t[6] += lo; t[7] += hi; |
231 | 36.2k | UMUL51(hi, lo, a[2], b[4]); t[6] += lo; t[7] += hi; |
232 | | |
233 | 36.2k | UMUL51(t[8], lo, a[4], b[3]); t[7] += lo; |
234 | 36.2k | UMUL51(hi, lo, a[3], b[4]); t[7] += lo; t[8] += hi; |
235 | | |
236 | 36.2k | UMUL51(t[9], lo, a[4], b[4]); t[8] += lo; |
237 | | |
238 | | /* |
239 | | * The upper words t[5]..t[9] are folded back into the lower |
240 | | * words, using the rule that 2^255 = 19 in the field. |
241 | | * |
242 | | * Since each t[i] is less than 324*2^51, the additions below |
243 | | * will yield less than 6480*2^51 in each limb; this fits in |
244 | | * 64 bits (6480*2^51 < 8192*2^51 = 2^64), hence there is |
245 | | * no overflow. |
246 | | */ |
247 | 36.2k | t[0] += 19 * t[5]; |
248 | 36.2k | t[1] += 19 * t[6]; |
249 | 36.2k | t[2] += 19 * t[7]; |
250 | 36.2k | t[3] += 19 * t[8]; |
251 | 36.2k | t[4] += 19 * t[9]; |
252 | | |
253 | | /* |
254 | | * Propagate carries. |
255 | | */ |
256 | 36.2k | w = t[0]; |
257 | 36.2k | d[0] = w & MASK51; |
258 | 36.2k | cc = w >> 51; |
259 | 36.2k | w = t[1] + cc; |
260 | 36.2k | d[1] = w & MASK51; |
261 | 36.2k | cc = w >> 51; |
262 | 36.2k | w = t[2] + cc; |
263 | 36.2k | d[2] = w & MASK51; |
264 | 36.2k | cc = w >> 51; |
265 | 36.2k | w = t[3] + cc; |
266 | 36.2k | d[3] = w & MASK51; |
267 | 36.2k | cc = w >> 51; |
268 | 36.2k | w = t[4] + cc; |
269 | 36.2k | d[4] = w & MASK51; |
270 | 36.2k | cc = w >> 51; |
271 | | |
272 | | /* |
273 | | * Since the limbs were 64-bit values, the top carry is at |
274 | | * most 8192 (in practice, that cannot be reached). We simply |
275 | | * performed a partial reduction. |
276 | | */ |
277 | 36.2k | d[0] += 19 * cc; |
278 | 36.2k | } |
279 | | |
280 | | /* |
281 | | * Multiplication by A24 = 121665. |
282 | | * Input must have limbs of 60 bits at most. |
283 | | */ |
284 | | static inline void |
285 | | f255_mul_a24(uint64_t *d, const uint64_t *a) |
286 | 3.57k | { |
287 | 3.57k | uint64_t t[5], cc, w; |
288 | | |
289 | | /* |
290 | | * 121665 = 15 * 8111. We first multiply by 15, with carry |
291 | | * propagation and partial reduction. |
292 | | */ |
293 | 3.57k | w = a[0] * 15; |
294 | 3.57k | t[0] = w & MASK51; |
295 | 3.57k | cc = w >> 51; |
296 | 3.57k | w = a[1] * 15 + cc; |
297 | 3.57k | t[1] = w & MASK51; |
298 | 3.57k | cc = w >> 51; |
299 | 3.57k | w = a[2] * 15 + cc; |
300 | 3.57k | t[2] = w & MASK51; |
301 | 3.57k | cc = w >> 51; |
302 | 3.57k | w = a[3] * 15 + cc; |
303 | 3.57k | t[3] = w & MASK51; |
304 | 3.57k | cc = w >> 51; |
305 | 3.57k | w = a[4] * 15 + cc; |
306 | 3.57k | t[4] = w & MASK51; |
307 | 3.57k | t[0] += 19 * (w >> 51); |
308 | | |
309 | | /* |
310 | | * Then multiplication by 8111. At that point, we known that |
311 | | * t[0] is less than 2^51 + 19*8192, and other limbs are less |
312 | | * than 2^51; thus, there will be no overflow. |
313 | | */ |
314 | 3.57k | w = t[0] * 8111; |
315 | 3.57k | d[0] = w & MASK51; |
316 | 3.57k | cc = w >> 51; |
317 | 3.57k | w = t[1] * 8111 + cc; |
318 | 3.57k | d[1] = w & MASK51; |
319 | 3.57k | cc = w >> 51; |
320 | 3.57k | w = t[2] * 8111 + cc; |
321 | 3.57k | d[2] = w & MASK51; |
322 | 3.57k | cc = w >> 51; |
323 | 3.57k | w = t[3] * 8111 + cc; |
324 | 3.57k | d[3] = w & MASK51; |
325 | 3.57k | cc = w >> 51; |
326 | 3.57k | w = t[4] * 8111 + cc; |
327 | 3.57k | d[4] = w & MASK51; |
328 | 3.57k | d[0] += 19 * (w >> 51); |
329 | 3.57k | } |
330 | | |
331 | | /* |
332 | | * Finalize reduction. |
333 | | * On input, limbs must fit on 51 bits, except possibly the low limb, |
334 | | * which may be slightly above 2^51. |
335 | | */ |
336 | | static inline void |
337 | | f255_final_reduce(uint64_t *a) |
338 | 14 | { |
339 | 14 | uint64_t t[5], cc, w; |
340 | | |
341 | | /* |
342 | | * We add 19. If the result (in t[]) is below 2^255, then a[] |
343 | | * is already less than 2^255-19, thus already reduced. |
344 | | * Otherwise, we subtract 2^255 from t[], in which case we |
345 | | * have t = a - (2^255-19), and that's our result. |
346 | | */ |
347 | 14 | w = a[0] + 19; |
348 | 14 | t[0] = w & MASK51; |
349 | 14 | cc = w >> 51; |
350 | 14 | w = a[1] + cc; |
351 | 14 | t[1] = w & MASK51; |
352 | 14 | cc = w >> 51; |
353 | 14 | w = a[2] + cc; |
354 | 14 | t[2] = w & MASK51; |
355 | 14 | cc = w >> 51; |
356 | 14 | w = a[3] + cc; |
357 | 14 | t[3] = w & MASK51; |
358 | 14 | cc = w >> 51; |
359 | 14 | w = a[4] + cc; |
360 | 14 | t[4] = w & MASK51; |
361 | 14 | cc = w >> 51; |
362 | | |
363 | | /* |
364 | | * The bit 255 of t is in cc. If that bit is 0, when a[] must |
365 | | * be unchanged; otherwise, it must be replaced with t[]. |
366 | | */ |
367 | 14 | cc = -cc; |
368 | 14 | a[0] ^= cc & (a[0] ^ t[0]); |
369 | 14 | a[1] ^= cc & (a[1] ^ t[1]); |
370 | 14 | a[2] ^= cc & (a[2] ^ t[2]); |
371 | 14 | a[3] ^= cc & (a[3] ^ t[3]); |
372 | 14 | a[4] ^= cc & (a[4] ^ t[4]); |
373 | 14 | } |
374 | | |
375 | | static uint32_t |
376 | | api_mul(unsigned char *G, size_t Glen, |
377 | | const unsigned char *kb, size_t kblen, int curve) |
378 | 14 | { |
379 | 14 | unsigned char k[32]; |
380 | 14 | uint64_t x1[5], x2[5], z2[5], x3[5], z3[5]; |
381 | 14 | uint32_t swap; |
382 | 14 | int i; |
383 | | |
384 | 14 | (void)curve; |
385 | | |
386 | | /* |
387 | | * Points are encoded over exactly 32 bytes. Multipliers must fit |
388 | | * in 32 bytes as well. |
389 | | */ |
390 | 14 | if (Glen != 32 || kblen > 32) { |
391 | 0 | return 0; |
392 | 0 | } |
393 | | |
394 | | /* |
395 | | * RFC 7748 mandates that the high bit of the last point byte must |
396 | | * be ignored/cleared; the "& MASK51" in the initialization for |
397 | | * x1[4] clears that bit. |
398 | | */ |
399 | 14 | x1[0] = br_dec64le(&G[0]) & MASK51; |
400 | 14 | x1[1] = (br_dec64le(&G[6]) >> 3) & MASK51; |
401 | 14 | x1[2] = (br_dec64le(&G[12]) >> 6) & MASK51; |
402 | 14 | x1[3] = (br_dec64le(&G[19]) >> 1) & MASK51; |
403 | 14 | x1[4] = (br_dec64le(&G[24]) >> 12) & MASK51; |
404 | | |
405 | | /* |
406 | | * We can use memset() to clear values, because exact-width types |
407 | | * like uint64_t are guaranteed to have no padding bits or |
408 | | * trap representations. |
409 | | */ |
410 | 14 | memset(x2, 0, sizeof x2); |
411 | 14 | x2[0] = 1; |
412 | 14 | memset(z2, 0, sizeof z2); |
413 | 14 | memcpy(x3, x1, sizeof x1); |
414 | 14 | memcpy(z3, x2, sizeof x2); |
415 | | |
416 | | /* |
417 | | * The multiplier is provided in big-endian notation, and |
418 | | * possibly shorter than 32 bytes. |
419 | | */ |
420 | 14 | memset(k, 0, (sizeof k) - kblen); |
421 | 14 | memcpy(k + (sizeof k) - kblen, kb, kblen); |
422 | 14 | k[31] &= 0xF8; |
423 | 14 | k[0] &= 0x7F; |
424 | 14 | k[0] |= 0x40; |
425 | | |
426 | 14 | swap = 0; |
427 | | |
428 | 3.58k | for (i = 254; i >= 0; i --) { |
429 | 3.57k | uint64_t a[5], aa[5], b[5], bb[5], e[5]; |
430 | 3.57k | uint64_t c[5], d[5], da[5], cb[5]; |
431 | 3.57k | uint32_t kt; |
432 | | |
433 | 3.57k | kt = (k[31 - (i >> 3)] >> (i & 7)) & 1; |
434 | 3.57k | swap ^= kt; |
435 | 3.57k | f255_cswap(x2, x3, swap); |
436 | 3.57k | f255_cswap(z2, z3, swap); |
437 | 3.57k | swap = kt; |
438 | | |
439 | | /* |
440 | | * At that point, limbs of x_2 and z_2 are assumed to fit |
441 | | * on at most 52 bits each. |
442 | | * |
443 | | * Each f255_add() adds one bit to the maximum range of |
444 | | * the values, but f255_sub() and f255_mul() bring back |
445 | | * the limbs into 52 bits. All f255_add() outputs are |
446 | | * used only as inputs for f255_mul(), which ensures |
447 | | * that limbs remain in the proper range. |
448 | | */ |
449 | | |
450 | | /* A = x_2 + z_2 -- limbs fit on 53 bits each */ |
451 | 3.57k | f255_add(a, x2, z2); |
452 | | |
453 | | /* AA = A^2 */ |
454 | 3.57k | f255_mul(aa, a, a); |
455 | | |
456 | | /* B = x_2 - z_2 */ |
457 | 3.57k | f255_sub(b, x2, z2); |
458 | | |
459 | | /* BB = B^2 */ |
460 | 3.57k | f255_mul(bb, b, b); |
461 | | |
462 | | /* E = AA - BB */ |
463 | 3.57k | f255_sub(e, aa, bb); |
464 | | |
465 | | /* C = x_3 + z_3 -- limbs fit on 53 bits each */ |
466 | 3.57k | f255_add(c, x3, z3); |
467 | | |
468 | | /* D = x_3 - z_3 */ |
469 | 3.57k | f255_sub(d, x3, z3); |
470 | | |
471 | | /* DA = D * A */ |
472 | 3.57k | f255_mul(da, d, a); |
473 | | |
474 | | /* CB = C * B */ |
475 | 3.57k | f255_mul(cb, c, b); |
476 | | |
477 | | /* x_3 = (DA + CB)^2 */ |
478 | 3.57k | f255_add(x3, da, cb); |
479 | 3.57k | f255_mul(x3, x3, x3); |
480 | | |
481 | | /* z_3 = x_1 * (DA - CB)^2 */ |
482 | 3.57k | f255_sub(z3, da, cb); |
483 | 3.57k | f255_mul(z3, z3, z3); |
484 | 3.57k | f255_mul(z3, x1, z3); |
485 | | |
486 | | /* x_2 = AA * BB */ |
487 | 3.57k | f255_mul(x2, aa, bb); |
488 | | |
489 | | /* z_2 = E * (AA + a24 * E) */ |
490 | 3.57k | f255_mul_a24(z2, e); |
491 | 3.57k | f255_add(z2, aa, z2); |
492 | 3.57k | f255_mul(z2, e, z2); |
493 | 3.57k | } |
494 | | |
495 | 14 | f255_cswap(x2, x3, swap); |
496 | 14 | f255_cswap(z2, z3, swap); |
497 | | |
498 | | /* |
499 | | * Compute 1/z2 = z2^(p-2). Since p = 2^255-19, we can mutualize |
500 | | * most non-squarings. We use x1 and x3, now useless, as temporaries. |
501 | | */ |
502 | 14 | memcpy(x1, z2, sizeof z2); |
503 | 224 | for (i = 0; i < 15; i ++) { |
504 | 210 | f255_mul(x1, x1, x1); |
505 | 210 | f255_mul(x1, x1, z2); |
506 | 210 | } |
507 | 14 | memcpy(x3, x1, sizeof x1); |
508 | 210 | for (i = 0; i < 14; i ++) { |
509 | 196 | int j; |
510 | | |
511 | 3.33k | for (j = 0; j < 16; j ++) { |
512 | 3.13k | f255_mul(x3, x3, x3); |
513 | 3.13k | } |
514 | 196 | f255_mul(x3, x3, x1); |
515 | 196 | } |
516 | 224 | for (i = 14; i >= 0; i --) { |
517 | 210 | f255_mul(x3, x3, x3); |
518 | 210 | if ((0xFFEB >> i) & 1) { |
519 | 182 | f255_mul(x3, z2, x3); |
520 | 182 | } |
521 | 210 | } |
522 | | |
523 | | /* |
524 | | * Compute x2/z2. We have 1/z2 in x3. |
525 | | */ |
526 | 14 | f255_mul(x2, x2, x3); |
527 | 14 | f255_final_reduce(x2); |
528 | | |
529 | | /* |
530 | | * Encode the final x2 value in little-endian. We first assemble |
531 | | * the limbs into 64-bit values. |
532 | | */ |
533 | 14 | x2[0] |= x2[1] << 51; |
534 | 14 | x2[1] = (x2[1] >> 13) | (x2[2] << 38); |
535 | 14 | x2[2] = (x2[2] >> 26) | (x2[3] << 25); |
536 | 14 | x2[3] = (x2[3] >> 39) | (x2[4] << 12); |
537 | 14 | br_enc64le(G, x2[0]); |
538 | 14 | br_enc64le(G + 8, x2[1]); |
539 | 14 | br_enc64le(G + 16, x2[2]); |
540 | 14 | br_enc64le(G + 24, x2[3]); |
541 | 14 | return 1; |
542 | 14 | } |
543 | | |
544 | | static size_t |
545 | | api_mulgen(unsigned char *R, |
546 | | const unsigned char *x, size_t xlen, int curve) |
547 | 14 | { |
548 | 14 | const unsigned char *G; |
549 | 14 | size_t Glen; |
550 | | |
551 | 14 | G = api_generator(curve, &Glen); |
552 | 14 | memcpy(R, G, Glen); |
553 | 14 | api_mul(R, Glen, x, xlen, curve); |
554 | 14 | return Glen; |
555 | 14 | } |
556 | | |
557 | | static uint32_t |
558 | | api_muladd(unsigned char *A, const unsigned char *B, size_t len, |
559 | | const unsigned char *x, size_t xlen, |
560 | | const unsigned char *y, size_t ylen, int curve) |
561 | 0 | { |
562 | | /* |
563 | | * We don't implement this method, since it is used for ECDSA |
564 | | * only, and there is no ECDSA over Curve25519 (which instead |
565 | | * uses EdDSA). |
566 | | */ |
567 | 0 | (void)A; |
568 | 0 | (void)B; |
569 | 0 | (void)len; |
570 | 0 | (void)x; |
571 | 0 | (void)xlen; |
572 | 0 | (void)y; |
573 | 0 | (void)ylen; |
574 | 0 | (void)curve; |
575 | 0 | return 0; |
576 | 0 | } |
577 | | |
578 | | /* see bearssl_ec.h */ |
579 | | const br_ec_impl br_ec_c25519_m62 = { |
580 | | (uint32_t)0x20000000, |
581 | | &api_generator, |
582 | | &api_order, |
583 | | &api_xoff, |
584 | | &api_mul, |
585 | | &api_mulgen, |
586 | | &api_muladd |
587 | | }; |
588 | | |
589 | | /* see bearssl_ec.h */ |
590 | | const br_ec_impl * |
591 | | br_ec_c25519_m62_get(void) |
592 | 14 | { |
593 | 14 | return &br_ec_c25519_m62; |
594 | 14 | } |
595 | | |
596 | | #else |
597 | | |
598 | | /* see bearssl_ec.h */ |
599 | | const br_ec_impl * |
600 | | br_ec_c25519_m62_get(void) |
601 | | { |
602 | | return 0; |
603 | | } |
604 | | |
605 | | #endif |