Line data Source code
1 : #include "fd_x25519.h"
2 : #include "fd_f25519.h"
3 :
4 : /* FD_X25519_VECTORIZE calls mul4 instead of sqr2+mul2, and similar.
5 : Only useful if the underlying ops are actually vectorized and therefore
6 : the cost of 4 muls is <= the cost of 2 sqr + 2 mul. */
7 : #define FD_X25519_VECTORIZE FD_HAS_AVX512
8 :
9 : /* FD_X25519_ALIGN aligns variables. */
10 : #if FD_HAS_AVX
11 : #define FD_X25519_ALIGN __attribute__((aligned(32)))
12 : #else
13 : #define FD_X25519_ALIGN
14 : #endif
15 :
16 : /*
17 : * Constant time primitives
18 : */
19 :
20 : static inline int FD_FN_SENSITIVE
21 0 : fd_x25519_is_zero_const_time( uchar const point[ 32 ] ) {
22 : //TODO: this is generally done by (x)or-ing the limbs, see also RFC 7748, page 13.
23 0 : int is_zero = 1;
24 0 : for( ulong i=0UL; i<32UL; i++ ) {
25 0 : is_zero &= ( !point[ i ] );
26 0 : }
27 0 : return is_zero;
28 0 : }
29 :
30 : #if !FD_HAS_AVX512
31 :
32 : static inline void FD_FN_SENSITIVE
33 : fd_x25519_montgomery_ladder( fd_f25519_t * x2,
34 : fd_f25519_t * z2,
35 : fd_f25519_t const * x1,
36 : uchar const * secret_scalar ) {
37 : /* memory areas that will contain (partial) secrets and will be cleared at the end */
38 : fd_f25519_t secret_tmp_f[4];
39 : int swap = 0;
40 : int b = 0;
41 :
42 : /* human-readable variables */
43 : fd_f25519_t * x3 = &secret_tmp_f[0];
44 : fd_f25519_t * z3 = &secret_tmp_f[1];
45 : fd_f25519_t * tmp0 = &secret_tmp_f[2];
46 : fd_f25519_t * tmp1 = &secret_tmp_f[3];
47 :
48 : fd_f25519_set( x2, fd_f25519_one );
49 : fd_f25519_set( z2, fd_f25519_zero );
50 :
51 : /* use fd_f25519_add to reduce x1 mod p. this is required (we have a test). */
52 : fd_f25519_add( x3, fd_f25519_zero, x1 );
53 : fd_f25519_set( z3, fd_f25519_one );
54 :
55 : for( long pos=254UL; pos>=0; pos-- ) {
56 : b = (secret_scalar[ pos / 8L ] >> ( pos & 7L )) & 1;
57 : swap ^= b;
58 : fd_f25519_swap_if( x2, x3, swap );
59 : fd_f25519_swap_if( z2, z3, swap );
60 : swap = b;
61 :
62 : fd_f25519_sub_nr( tmp0, x3, z3 );
63 : fd_f25519_sub_nr( tmp1, x2, z2 );
64 : fd_f25519_add_nr( x2, x2, z2 );
65 : fd_f25519_add_nr( z2, x3, z3 );
66 :
67 : # if FD_X25519_VECTORIZE /* Note that okay to use less efficient squaring because we get it for free in unused vector lanes */
68 : fd_f25519_mul4( z3, tmp0, x2,
69 : z2, z2, tmp1,
70 : tmp0, tmp1, tmp1,
71 : tmp1, x2, x2 );
72 : # else /* Use more efficient squaring if scalar implementation */
73 : fd_f25519_mul2( z3, tmp0, x2,
74 : z2, z2, tmp1 );
75 : fd_f25519_sqr2( tmp0, tmp1,
76 : tmp1, x2 );
77 : # endif
78 : fd_f25519_add_nr( x3, z3, z2 );
79 : fd_f25519_sub_nr( z2, z3, z2 );
80 : # if FD_X25519_VECTORIZE /* See note above */
81 : fd_f25519_mul2( x2, tmp1, tmp0,
82 : z2, z2, z2 );
83 : # else
84 : fd_f25519_mul( x2, tmp1, tmp0 );
85 : fd_f25519_sqr( z2, z2 );
86 : # endif
87 : fd_f25519_sub_nr( tmp1, tmp1, tmp0 );
88 :
89 : fd_f25519_mul_121666( z3, tmp1 );
90 :
91 : fd_f25519_add_nr( tmp0, tmp0, z3 );
92 : # if FD_X25519_VECTORIZE /* See note above */
93 : fd_f25519_mul3( x3, x3, x3,
94 : z3, x1, z2,
95 : z2, tmp0, tmp1 );
96 : # else
97 : fd_f25519_sqr ( x3, x3 );
98 : fd_f25519_mul2( z3, x1, z2,
99 : z2, tmp1, tmp0 );
100 : # endif
101 : }
102 :
103 : fd_f25519_swap_if( x2, x3, swap );
104 : fd_f25519_swap_if( z2, z3, swap );
105 :
106 : /* Sanitize */
107 :
108 : fd_memzero_explicit( secret_tmp_f, sizeof(secret_tmp_f) );
109 : fd_memzero_explicit( &b, sizeof(int) );
110 : fd_memzero_explicit( &swap, sizeof(int) );
111 : }
112 : #else
113 :
114 : /* This is the "transposed" version of the Montgomery ladder above.
115 : Experimentally, this is 15-20% faster on AVX-512. */
116 : static inline void FD_FN_SENSITIVE
117 : fd_x25519_montgomery_ladder( fd_f25519_t * x2,
118 : fd_f25519_t * z2,
119 : fd_f25519_t const * x1,
120 0 : uchar const * secret_scalar ) {
121 0 : FD_R43X6_QUAD_DECL( U );
122 0 : FD_R43X6_QUAD_DECL( Q );
123 0 : FD_R43X6_QUAD_DECL( P );
124 0 : FD_R43X6_QUAD_PACK( U, fd_r43x6_zero(),
125 0 : fd_r43x6_zero(),
126 0 : fd_r43x6_zero(),
127 0 : x1->el ); // x_1 = u, in u44
128 0 : FD_R43X6_QUAD_PACK( Q, fd_r43x6_one(), // x_2 = 1, in u44
129 0 : fd_r43x6_zero(), // z_2 = 0, in u44
130 0 : x1->el, // x_3 = u, in u44
131 0 : fd_r43x6_one() ); // z_3 = 1, in u44
132 : /* reduce x1 */
133 0 : FD_R43X6_QUAD_FOLD_SIGNED( U, U );
134 0 : FD_R43X6_QUAD_FOLD_SIGNED( Q, Q );
135 0 : int swap = 0;
136 0 : int k_t = 0;
137 0 : wwl_t perm;
138 0 : fd_r43x6_t AA, E, F, G, H, GG;
139 :
140 0 : for( int t=254UL; t>=0; t-- ) { // For t = bits-1 down to 0:
141 :
142 : /* At this point, Q and U in u44|u44|u44|u44 */
143 :
144 0 : k_t = (secret_scalar[ t / 8L ] >> ( t & 7L )) & 1; // k_t = (k >> t) & 1;
145 0 : swap ^= k_t; // swap ^= k_t
146 0 : perm = wwl_if( (-swap) & 0xff, wwl( 2L,3L,0L,1L, 6L,7L,4L,5L ), wwl( 0L,1L,2L,3L, 4L,5L,6L,7L ) );
147 0 : Q03 = wwl_permute( perm, Q03 ); // (x_2, x_3) = cswap(swap, x_2, x_3)
148 0 : Q14 = wwl_permute( perm, Q14 ); // (z_2, z_3) = cswap(swap, z_2, z_3)
149 0 : Q25 = wwl_permute( perm, Q25 );
150 0 : swap = k_t; // swap = k_t
151 :
152 : /* These operations are exactly from the RFC but have been reordered
153 : slightly to make it easier to extract ILP. */
154 :
155 0 : FD_R43X6_QUAD_PERMUTE ( P, 0,0,2,2, Q ); // A = x_2 + z_2, P = x_2|x_2|x_3 |x_3, in u44|u44|u44|u44
156 0 : FD_R43X6_QUAD_PERMUTE ( Q, 1,1,3,3, Q ); // B = x_2 - z_2, Q = z_2|z_2|z_3 |z_3, in u44|u44|u44|u44
157 0 : FD_R43X6_QUAD_LANE_ADD_FAST( P, P, 1,0,1,0, P, Q ); // C = x_3 + z_3, P = A |x_2|C |x_3, in u45|u44|u45|u44
158 0 : FD_R43X6_QUAD_LANE_SUB_FAST( P, P, 0,1,0,1, P, Q ); // D = x_3 - z_3, P = A |B |C |D, in u45|s44|u45|s44
159 0 : FD_R43X6_QUAD_PERMUTE ( Q, 0,1,1,0, P ); // BB = B^2, P = A |B |B |A, in u44|u44|u44|u44
160 0 : FD_R43X6_QUAD_MUL_FAST ( P, P, Q ); // DA = D * A, P = AA |BB |CB |DA, in u62|u62|u62|u62
161 0 : FD_R43X6_QUAD_FOLD_UNSIGNED( P, P ); // DA = D * A, P = AA |BB |CB |DA, in u44|u44|u44|u44
162 0 : FD_R43X6_QUAD_PERMUTE ( Q, 1,0,3,2, P ); // CB = C * B, Q = BB |AA |DA |CB, in u62|u62|u62|u62
163 0 : FD_R43X6_QUAD_LANE_SUB_FAST( P, P, 0,1,0,1, Q, P ); // E = AA-BB, P = AA |E |CB |CB-DA, in u62|s62|u62|s62
164 0 : FD_R43X6_QUAD_LANE_ADD_FAST( P, P, 0,0,1,0, P, Q ); // P = AA |E |DA+CB|CB-DA, in u62|s62|u63|s62
165 0 : FD_R43X6_QUAD_LANE_IF ( Q, 0,1,1,0, P, Q ); // Q = BB |E |DA+CB|CB, in u62|u44|u44|u62
166 0 : FD_R43X6_QUAD_LANE_IF ( Q, 0,0,0,1, U, Q ); // x_3 = (DA + CB)^2, Q = BB |E |DA+CB|x_1, in u62|u44|u44|u44
167 0 : FD_R43X6_QUAD_UNPACK ( AA, E, F, G, P );
168 0 : H = fd_r43x6_add_fast( AA, fd_r43x6_scale_fast( 121665L, E ) ); // H = AA + a24 * E, in u60
169 0 : GG = fd_r43x6_sqr_fast( G ); // GG = (DA - CB)^2, in u61
170 0 : FD_R43X6_QUAD_PACK ( P, AA, H, F, GG ); // z_2 = E * (AA + a24 * E), P = AA |H |DA+CB|GG, in u44|u60|u44|u61
171 0 : FD_R43X6_QUAD_FOLD_UNSIGNED( P, P ); // P = AA |H |DA+CB|GG, in u44|u44|u44|u44
172 0 : FD_R43X6_QUAD_MUL_FAST ( P, P, Q ); // z_3 = x_1 * (DA - CB)^2, Q = x_2|z_2|x_3 |z_3, in u62|u62|u62|u62
173 0 : FD_R43X6_QUAD_FOLD_UNSIGNED( Q, P ); // Q = x_2|z_2|x_3 |z_3, in u44|u44|u44|u44
174 0 : }
175 :
176 : /* At this point, Q in u44|u44|u44|u44 */
177 0 : perm = wwl_if( (-swap) & 0xff, wwl( 2L,3L,0L,1L, 6L,7L,4L,5L ), wwl( 0L,1L,2L,3L, 4L,5L,6L,7L ) );
178 0 : Q03 = wwl_permute( perm, Q03 ); // (x_2, x_3) = cswap(swap, x_2, x_3)
179 0 : Q14 = wwl_permute( perm, Q14 ); // (z_2, z_3) = cswap(swap, z_2, z_3)
180 0 : Q25 = wwl_permute( perm, Q25 );
181 :
182 0 : FD_R43X6_QUAD_UNPACK( x2->el, z2->el, E, F, Q );
183 :
184 : /* Sanitize */
185 :
186 0 : fd_memzero_explicit( &P03, sizeof(wwl_t) );
187 0 : fd_memzero_explicit( &P14, sizeof(wwl_t) );
188 0 : fd_memzero_explicit( &P25, sizeof(wwl_t) );
189 0 : fd_memzero_explicit( &U03, sizeof(wwl_t) );
190 0 : fd_memzero_explicit( &U14, sizeof(wwl_t) );
191 0 : fd_memzero_explicit( &U25, sizeof(wwl_t) );
192 0 : fd_memzero_explicit( &Q03, sizeof(wwl_t) );
193 0 : fd_memzero_explicit( &Q14, sizeof(wwl_t) );
194 0 : fd_memzero_explicit( &Q25, sizeof(wwl_t) );
195 0 : fd_memzero_explicit( &AA, sizeof(wwl_t) );
196 0 : fd_memzero_explicit( &E, sizeof(wwl_t) );
197 0 : fd_memzero_explicit( &F, sizeof(wwl_t) );
198 0 : fd_memzero_explicit( &G, sizeof(wwl_t) );
199 0 : fd_memzero_explicit( &H, sizeof(wwl_t) );
200 0 : fd_memzero_explicit( &GG, sizeof(wwl_t) );
201 0 : fd_memzero_explicit( &perm, sizeof(wwl_t) );
202 0 : fd_memzero_explicit( &swap, sizeof(int) );
203 0 : fd_memzero_explicit( &k_t, sizeof(int) );
204 :
205 0 : }
206 : #endif
207 :
208 : /*
209 : * X25519 Protocol
210 : */
211 :
212 : static inline void FD_FN_SENSITIVE
213 : fd_x25519_scalar_mul_const_time( uchar out[ 32 ],
214 : uchar const * secret_scalar,
215 0 : fd_f25519_t const * point_x ) {
216 0 : fd_f25519_t x2[1], z2[1];
217 :
218 0 : fd_x25519_montgomery_ladder( x2, z2, point_x, secret_scalar );
219 :
220 0 : fd_f25519_inv( z2, z2 );
221 0 : fd_f25519_mul( x2, x2, z2 );
222 :
223 0 : fd_f25519_tobytes( out, x2 );
224 0 : }
225 :
226 : static const uchar fd_x25519_basepoint[ 32 ] FD_X25519_ALIGN = { 9 };
227 :
228 : uchar * FD_FN_SENSITIVE
229 : fd_x25519_public( uchar self_public_key [ 32 ],
230 0 : uchar const self_private_key[ 32 ] ) {
231 : /* IETF RFC 7748 Section 4.1 (page 3) */
232 0 : return fd_x25519_exchange( self_public_key, self_private_key, fd_x25519_basepoint );
233 0 : }
234 :
235 : uchar * FD_FN_SENSITIVE
236 : fd_x25519_exchange( uchar shared_secret [ 32 ],
237 : uchar const self_private_key[ 32 ],
238 0 : uchar const peer_public_key [ 32 ] ) {
239 :
240 : /* Memory areas that will contain secrets */
241 0 : uchar secret_scalar[ 32UL ] FD_X25519_ALIGN;
242 :
243 : /* Public local variables */
244 0 : fd_f25519_t peer_public_key_point_u[1];
245 :
246 : // RFC 7748 - Elliptic Curves for Security
247 : //
248 : // 5. The X25519 and X448 Functions
249 : //
250 : // The "X25519" and "X448" functions perform scalar multiplication on
251 : // the Montgomery form of the above curves. (This is used when
252 : // implementing Diffie-Hellman.) The functions take a scalar and a
253 : // u-coordinate as inputs and produce a u-coordinate as output.
254 : // Although the functions work internally with integers, the inputs and
255 : // outputs are 32-byte strings (for X25519) or 56-byte strings (for
256 : // X448) and this specification defines their encoding.
257 :
258 : // The u-coordinates are elements of the underlying field GF(2^255 - 19)
259 : // or GF(2^448 - 2^224 - 1) and are encoded as an array of bytes, u, in
260 : // little-endian order such that u[0] + 256*u[1] + 256^2*u[2] + ... +
261 : // 256^(n-1)*u[n-1] is congruent to the value modulo p and u[n-1] is
262 : // minimal. When receiving such an array, implementations of X25519
263 : // (but not X448) MUST mask the most significant bit in the final byte.
264 : // This is done to preserve compatibility with point formats that
265 : // reserve the sign bit for use in other protocols and to increase
266 : // resistance to implementation fingerprinting.
267 :
268 : // Implementations MUST accept non-canonical values and process them as
269 : // if they had been reduced modulo the field prime. The non-canonical
270 : // values are 2^255 - 19 through 2^255 - 1 for X25519 and 2^448 - 2^224
271 : // - 1 through 2^448 - 1 for X448.
272 :
273 : /* From the text above:
274 : 1. When receiving such an array, implementations of X25519 [...]
275 : MUST mask the most significant bit in the final byte
276 : >> this is done by fd_f25519_frombytes
277 : 2. Implementations MUST accept non-canonical values
278 : >> no extra check needed */
279 0 : fd_f25519_frombytes( peer_public_key_point_u, peer_public_key );
280 :
281 : // Scalars are assumed to be randomly generated bytes. For X25519, in
282 : // order to decode 32 random bytes as an integer scalar, set the three
283 : // least significant bits of the first byte and the most significant bit
284 : // of the last to zero, set the second most significant bit of the last
285 : // byte to 1 and, finally, decode as little-endian. This means that the
286 : // resulting integer is of the form 2^254 plus eight times a value
287 : // between 0 and 2^251 - 1 (inclusive). Likewise, for X448, set the two
288 : // least significant bits of the first byte to 0, and the most
289 : // significant bit of the last byte to 1. This means that the resulting
290 : // integer is of the form 2^447 plus four times a value between 0 and
291 : // 2^445 - 1 (inclusive).
292 :
293 : /* decodeScalar25519
294 : note: e need to copy the private key, because we need to sanitize it. */
295 0 : memcpy( secret_scalar, self_private_key, 32UL );
296 0 : secret_scalar[ 0] &= (uchar)0xF8;
297 0 : secret_scalar[31] &= (uchar)0x7F;
298 0 : secret_scalar[31] |= (uchar)0x40;
299 :
300 0 : fd_x25519_scalar_mul_const_time( shared_secret, secret_scalar, peer_public_key_point_u );
301 :
302 : /* Sanitize */
303 0 : fd_memzero_explicit( secret_scalar, 32UL );
304 :
305 : /* Reject low order points */
306 0 : if( FD_UNLIKELY( fd_x25519_is_zero_const_time( shared_secret ) ) ) {
307 0 : return NULL;
308 0 : }
309 :
310 0 : return shared_secret;
311 0 : }
|