/src/openssl/crypto/ml_dsa/ml_dsa_sample.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright 2024-2025 The OpenSSL Project Authors. All Rights Reserved. |
3 | | * |
4 | | * Licensed under the Apache License 2.0 (the "License"). You may not use |
5 | | * this file except in compliance with the License. You can obtain a copy |
6 | | * in the file LICENSE in the source distribution or at |
7 | | * https://www.openssl.org/source/license.html |
8 | | */ |
9 | | |
10 | | #include <openssl/byteorder.h> |
11 | | #include "ml_dsa_local.h" |
12 | | #include "ml_dsa_vector.h" |
13 | | #include "ml_dsa_matrix.h" |
14 | | #include "ml_dsa_hash.h" |
15 | | #include "internal/constant_time.h" |
16 | | #include "internal/sha3.h" |
17 | | #include "internal/packet.h" |
18 | | |
19 | | #define SHAKE128_BLOCKSIZE SHA3_BLOCKSIZE(128) |
20 | | #define SHAKE256_BLOCKSIZE SHA3_BLOCKSIZE(256) |
21 | | |
22 | | /* |
23 | | * This is a constant time version of n % 5 |
24 | | * Note that 0xFFFF / 5 = 0x3333, 2 is added to make an over-estimate of 1/5 |
25 | | * and then we divide by (0xFFFF + 1) |
26 | | */ |
27 | 0 | #define MOD5(n) ((n) - 5 * (0x3335 * (n) >> 16)) |
28 | | |
29 | | #if SHAKE128_BLOCKSIZE % 3 != 0 |
30 | | #error "rej_ntt_poly() requires SHAKE128_BLOCKSIZE to be a multiple of 3" |
31 | | #endif |
32 | | |
33 | | typedef int(COEFF_FROM_NIBBLE_FUNC)(uint32_t nibble, uint32_t *out); |
34 | | |
35 | | static COEFF_FROM_NIBBLE_FUNC coeff_from_nibble_4; |
36 | | static COEFF_FROM_NIBBLE_FUNC coeff_from_nibble_2; |
37 | | |
38 | | /** |
39 | | * @brief Combine 3 bytes to form a coefficient. |
40 | | * See FIPS 204, Algorithm 14, CoeffFromThreeBytes() |
41 | | * |
42 | | * This is not constant time as it is used to generate the matrix A which is public. |
43 | | * |
44 | | * @param s A byte array of 3 uniformly distributed bytes. |
45 | | * @param out The returned coefficient in the range 0..q-1. |
46 | | * @returns 1 if the value is less than q or 0 otherwise. |
47 | | * This is used for rejection sampling. |
48 | | */ |
49 | | static ossl_inline int coeff_from_three_bytes(const uint8_t *s, uint32_t *out) |
50 | 0 | { |
51 | | /* Zero out the top bit of the 3rd byte to get a value in the range 0..2^23-1 */ |
52 | 0 | *out = (uint32_t)s[0] | ((uint32_t)s[1] << 8) | (((uint32_t)s[2] & 0x7f) << 16); |
53 | 0 | return *out < ML_DSA_Q; |
54 | 0 | } |
55 | | |
56 | | /** |
57 | | * @brief Generate a value in the range (q-4..0..4) |
58 | | * See FIPS 204, Algorithm 15, CoeffFromHalfByte() where eta = 4 |
59 | | * Note the FIPS 204 code uses the range -4..4 (whereas this code adds q to the |
60 | | * negative numbers). |
61 | | * |
62 | | * @param nibble A value in the range 0..15 |
63 | | * @param out The returned value in the range (q-4)..0..4 if nibble is < 9 |
64 | | * @returns 1 if the nibble was in range, or 0 if the nibble was rejected. |
65 | | */ |
66 | | static ossl_inline int coeff_from_nibble_4(uint32_t nibble, uint32_t *out) |
67 | 0 | { |
68 | | /* |
69 | | * This is not constant time but will not leak any important info since |
70 | | * the value is either chosen or thrown away. |
71 | | */ |
72 | 0 | if (value_barrier_32(nibble < 9)) { |
73 | 0 | *out = mod_sub(4, nibble); |
74 | 0 | return 1; |
75 | 0 | } |
76 | 0 | return 0; |
77 | 0 | } |
78 | | |
79 | | /** |
80 | | * @brief Generate a value in the range (q-2..0..2) |
81 | | * See FIPS 204, Algorithm 15, CoeffFromHalfByte() where eta = 2 |
82 | | * Note the FIPS 204 code uses the range -2..2 (whereas this code adds q to the |
83 | | * negative numbers). |
84 | | * |
85 | | * @param nibble A value in the range 0..15 |
86 | | * @param out The returned value in the range (q-2)..0..2 if nibble is < 15 |
87 | | * @returns 1 if the nibble was in range, or 0 if the nibble was rejected. |
88 | | */ |
89 | | static ossl_inline int coeff_from_nibble_2(uint32_t nibble, uint32_t *out) |
90 | 0 | { |
91 | 0 | if (value_barrier_32(nibble < 15)) { |
92 | 0 | *out = mod_sub(2, MOD5(nibble)); |
93 | 0 | return 1; |
94 | 0 | } |
95 | 0 | return 0; |
96 | 0 | } |
97 | | |
98 | | /** |
99 | | * @brief Use a seed value to generate a polynomial with coefficients in the |
100 | | * range of 0..q-1 using rejection sampling. |
101 | | * SHAKE128 is used to absorb the seed, and then sequences of 3 sample bytes are |
102 | | * squeezed to try to produce coefficients. |
103 | | * The SHAKE128 stream is used to get uniformly distributed elements. |
104 | | * This algorithm is used for matrix expansion and only operates on public inputs. |
105 | | * |
106 | | * See FIPS 204, Algorithm 30, RejNTTPoly() |
107 | | * |
108 | | * @param g_ctx An EVP_MD_CTX object used for sampling the seed. |
109 | | * @param md A pre-fetched SHAKE128 object. |
110 | | * @param seed The seed to use for sampling. |
111 | | * @param seed_len The size of |seed| |
112 | | * @param out The returned polynomial with coefficients in the range of |
113 | | * 0..q-1. This range is required for NTT. |
114 | | * @returns 1 if the polynomial was successfully generated, or 0 if any of the |
115 | | * digest operations failed. |
116 | | */ |
117 | | static int rej_ntt_poly(EVP_MD_CTX *g_ctx, const EVP_MD *md, |
118 | | const uint8_t *seed, size_t seed_len, POLY *out) |
119 | 0 | { |
120 | 0 | int j = 0; |
121 | 0 | uint8_t blocks[SHAKE128_BLOCKSIZE], *b, *end = blocks + sizeof(blocks); |
122 | | |
123 | | /* |
124 | | * Instead of just squeezing 3 bytes at a time, we grab a whole block |
125 | | * Note that the shake128 blocksize of 168 is divisible by 3. |
126 | | */ |
127 | 0 | if (!shake_xof(g_ctx, md, seed, seed_len, blocks, sizeof(blocks))) |
128 | 0 | return 0; |
129 | | |
130 | 0 | while (1) { |
131 | 0 | for (b = blocks; b < end; b += 3) { |
132 | 0 | if (coeff_from_three_bytes(b, &(out->coeff[j]))) { |
133 | 0 | if (++j >= ML_DSA_NUM_POLY_COEFFICIENTS) |
134 | 0 | return 1; /* finished */ |
135 | 0 | } |
136 | 0 | } |
137 | 0 | if (!EVP_DigestSqueeze(g_ctx, blocks, sizeof(blocks))) |
138 | 0 | return 0; |
139 | 0 | } |
140 | 0 | } |
141 | | |
142 | | /** |
143 | | * @brief Use a seed value to generate a polynomial with coefficients in the |
144 | | * range of ((q-eta)..0..eta) using rejection sampling. eta is either 2 or 4. |
145 | | * SHAKE256 is used to absorb the seed, and then samples are squeezed. |
146 | | * See FIPS 204, Algorithm 31, RejBoundedPoly() |
147 | | * |
148 | | * @param h_ctx An EVP_MD_CTX context used to sample the seed. |
149 | | * @param md A pre-fetched SHAKE256 object. |
150 | | * @param coef_from_nibble A function that is dependent on eta, which takes a |
151 | | * nibble and tries to see if it is in the correct range. |
152 | | * @param seed The seed to use for sampling. |
153 | | * @param seed_len The size of |seed| |
154 | | * @param out The returned polynomial with coefficients in the range of |
155 | | * ((q-eta)..0..eta) |
156 | | * @returns 1 if the polynomial was successfully generated, or 0 if any of the |
157 | | * digest operations failed. |
158 | | */ |
159 | | static int rej_bounded_poly(EVP_MD_CTX *h_ctx, const EVP_MD *md, |
160 | | COEFF_FROM_NIBBLE_FUNC *coef_from_nibble, |
161 | | const uint8_t *seed, size_t seed_len, POLY *out) |
162 | 0 | { |
163 | 0 | int j = 0; |
164 | 0 | uint32_t z0, z1; |
165 | 0 | uint8_t blocks[SHAKE256_BLOCKSIZE], *b, *end = blocks + sizeof(blocks); |
166 | | |
167 | | /* Instead of just squeezing 1 byte at a time, we grab a whole block */ |
168 | 0 | if (!shake_xof(h_ctx, md, seed, seed_len, blocks, sizeof(blocks))) |
169 | 0 | return 0; |
170 | | |
171 | 0 | while (1) { |
172 | 0 | for (b = blocks; b < end; b++) { |
173 | 0 | z0 = *b & 0x0F; /* lower nibble of byte */ |
174 | 0 | z1 = *b >> 4; /* high nibble of byte */ |
175 | |
|
176 | 0 | if (coef_from_nibble(z0, &out->coeff[j]) |
177 | 0 | && ++j >= ML_DSA_NUM_POLY_COEFFICIENTS) |
178 | 0 | return 1; |
179 | 0 | if (coef_from_nibble(z1, &out->coeff[j]) |
180 | 0 | && ++j >= ML_DSA_NUM_POLY_COEFFICIENTS) |
181 | 0 | return 1; |
182 | 0 | } |
183 | 0 | if (!EVP_DigestSqueeze(h_ctx, blocks, sizeof(blocks))) |
184 | 0 | return 0; |
185 | 0 | } |
186 | 0 | } |
187 | | |
188 | | /** |
189 | | * @brief Generate a k * l matrix that has uniformly distributed polynomial |
190 | | * elements using rejection sampling. |
191 | | * See FIPS 204, Algorithm 32, ExpandA() |
192 | | * |
193 | | * @param g_ctx An EVP_MD_CTX context used for rejection sampling |
194 | | * seed values generated from the seed rho. |
195 | | * @param md A pre-fetched SHAKE128 object |
196 | | * @param rho A 32 byte seed to generate the matrix from. |
197 | | * @param out The generated k * l matrix of polynomials with coefficients |
198 | | * in the range of 0..q-1. |
199 | | * @returns 1 if the matrix was generated, or 0 on error. |
200 | | */ |
201 | | int ossl_ml_dsa_matrix_expand_A(EVP_MD_CTX *g_ctx, const EVP_MD *md, |
202 | | const uint8_t *rho, MATRIX *out) |
203 | 0 | { |
204 | 0 | int ret = 0; |
205 | 0 | size_t i, j; |
206 | 0 | uint8_t derived_seed[ML_DSA_RHO_BYTES + 2]; |
207 | 0 | POLY *poly = out->m_poly; |
208 | | |
209 | | /* The seed used for each matrix element is the concatenation of rho, column_index and row_index */ |
210 | 0 | memcpy(derived_seed, rho, ML_DSA_RHO_BYTES); |
211 | |
|
212 | 0 | for (i = 0; i < out->k; i++) { |
213 | 0 | for (j = 0; j < out->l; j++) { |
214 | 0 | derived_seed[ML_DSA_RHO_BYTES + 1] = (uint8_t)i; |
215 | 0 | derived_seed[ML_DSA_RHO_BYTES] = (uint8_t)j; |
216 | | /* Generate the polynomial for each matrix element using a unique seed */ |
217 | 0 | if (!rej_ntt_poly(g_ctx, md, derived_seed, sizeof(derived_seed), poly++)) |
218 | 0 | goto err; |
219 | 0 | } |
220 | 0 | } |
221 | 0 | ret = 1; |
222 | 0 | err: |
223 | 0 | return ret; |
224 | 0 | } |
225 | | |
226 | | /** |
227 | | * @brief Generates 2 vectors using rejection sampling whose polynomial |
228 | | * coefficients are in the interval [q-eta..0..eta] |
229 | | * |
230 | | * See FIPS 204, Algorithm 33, ExpandS(). |
231 | | * Note that in FIPS 204 the range -eta..eta is used. |
232 | | * |
233 | | * @param h_ctx An EVP_MD_CTX context to use to sample the seed. |
234 | | * @param md A pre-fetched SHAKE256 object. |
235 | | * @param eta Is either 2 or 4, and determines the range of the coefficients for |
236 | | * s1 and s2. |
237 | | * @param seed A 64 byte seed to use for sampling. |
238 | | * @param s1 A 1 * l column vector containing polynomials with coefficients in |
239 | | * the range (q-eta)..0..eta |
240 | | * @param s2 A 1 * k column vector containing polynomials with coefficients in |
241 | | * the range (q-eta)..0..eta |
242 | | * @returns 1 if s1 and s2 were successfully generated, or 0 otherwise. |
243 | | */ |
244 | | int ossl_ml_dsa_vector_expand_S(EVP_MD_CTX *h_ctx, const EVP_MD *md, int eta, |
245 | | const uint8_t *seed, VECTOR *s1, VECTOR *s2) |
246 | 0 | { |
247 | 0 | int ret = 0; |
248 | 0 | size_t i; |
249 | 0 | size_t l = s1->num_poly; |
250 | 0 | size_t k = s2->num_poly; |
251 | 0 | uint8_t derived_seed[ML_DSA_PRIV_SEED_BYTES + 2]; |
252 | 0 | COEFF_FROM_NIBBLE_FUNC *coef_from_nibble_fn; |
253 | |
|
254 | 0 | coef_from_nibble_fn = (eta == ML_DSA_ETA_4) ? coeff_from_nibble_4 : coeff_from_nibble_2; |
255 | | |
256 | | /* |
257 | | * Each polynomial generated uses a unique seed that consists of |
258 | | * seed followed by a counter (2 bytes starting at 0; only the first byte is incremented) |
259 | | */ |
260 | 0 | memcpy(derived_seed, seed, ML_DSA_PRIV_SEED_BYTES); |
261 | 0 | derived_seed[ML_DSA_PRIV_SEED_BYTES] = 0; |
262 | 0 | derived_seed[ML_DSA_PRIV_SEED_BYTES + 1] = 0; |
263 | |
|
264 | 0 | for (i = 0; i < l; i++) { |
265 | 0 | if (!rej_bounded_poly(h_ctx, md, coef_from_nibble_fn, |
266 | 0 | derived_seed, sizeof(derived_seed), &s1->poly[i])) |
267 | 0 | goto err; |
268 | 0 | ++derived_seed[ML_DSA_PRIV_SEED_BYTES]; |
269 | 0 | } |
270 | 0 | for (i = 0; i < k; i++) { |
271 | 0 | if (!rej_bounded_poly(h_ctx, md, coef_from_nibble_fn, |
272 | 0 | derived_seed, sizeof(derived_seed), &s2->poly[i])) |
273 | 0 | goto err; |
274 | 0 | ++derived_seed[ML_DSA_PRIV_SEED_BYTES]; |
275 | 0 | } |
276 | 0 | ret = 1; |
277 | 0 | err: |
278 | 0 | return ret; |
279 | 0 | } |
280 | | |
281 | | /* See FIPS 204, Algorithm 34, ExpandMask(), Step 4 & 5 */ |
282 | | int ossl_ml_dsa_poly_expand_mask(POLY *out, const uint8_t *seed, size_t seed_len, |
283 | | uint32_t gamma1, |
284 | | EVP_MD_CTX *h_ctx, const EVP_MD *md) |
285 | 0 | { |
286 | 0 | uint8_t buf[32 * 20]; |
287 | 0 | size_t buf_len = 32 * (gamma1 == ML_DSA_GAMMA1_TWO_POWER_19 ? 20 : 18); |
288 | |
|
289 | 0 | return shake_xof(h_ctx, md, seed, seed_len, buf, buf_len) |
290 | 0 | && ossl_ml_dsa_poly_decode_expand_mask(out, buf, buf_len, gamma1); |
291 | 0 | } |
292 | | |
293 | | /** |
294 | | * @brief Sample a polynomial with coefficients in the range {-1..1}. |
295 | | * The number of non zero values (hamming weight) is given by tau |
296 | | * |
297 | | * See FIPS 204, Algorithm 29, SampleInBall() |
298 | | * This function is assumed to not be constant time. |
299 | | * The algorithm is based on Durstenfeld's version of the Fisher-Yates shuffle. |
300 | | * |
301 | | * Note that the coefficients returned by this implementation are positive |
302 | | * i.e. one of q-1, 0, or 1. |
303 | | * |
304 | | * @param tau The number of +1 or -1 coefficients in the polynomial 'out_c' (39, 49 or 60); |
305 | | * it must be less than or equal to 64 |
306 | | */ |
307 | | int ossl_ml_dsa_poly_sample_in_ball(POLY *out_c, const uint8_t *seed, int seed_len, |
308 | | EVP_MD_CTX *h_ctx, const EVP_MD *md, |
309 | | uint32_t tau) |
310 | 0 | { |
311 | 0 | uint8_t block[SHAKE256_BLOCKSIZE]; |
312 | 0 | uint64_t signs; |
313 | 0 | int offset = 8; |
314 | 0 | size_t end; |
315 | | |
316 | | /* |
317 | | * Rather than squeeze 8 bytes followed by lots of 1 byte squeezes |
318 | | * the SHAKE blocksize is squeezed each time and buffered into 'block'. |
319 | | */ |
320 | 0 | if (!shake_xof(h_ctx, md, seed, seed_len, block, sizeof(block))) |
321 | 0 | return 0; |
322 | | |
323 | | /* |
324 | | * Grab the first 64 bits - since tau <= 64 |
325 | | * Each bit gives a +1 or -1 value. |
326 | | */ |
327 | 0 | OPENSSL_load_u64_le(&signs, block); |
328 | | |
329 | | /* |
330 | | * SampleInBall implements a Fisher-Yates shuffle whose rejection-sampling |
331 | | * inner loop and data-dependent array index unavoidably leak the structure |
332 | | * of the challenge polynomial via memory-access pattern and branch timing. |
333 | | * This is safe: c_tilde = H(mu ‖ w1) is the Fiat-Shamir commitment and is |
334 | | * published in the accepted signature, so the SHAKE bytes that build c are |
335 | | * effectively public. See the BoringSSL design discussion at |
336 | | * https://boringssl-review.googlesource.com/c/boringssl/+/67747/comment/8d8f01ac_70af3f21/ |
337 | | * |
338 | | * The first 8 bytes (the sign bits loaded into |signs| above) are left |
339 | | * tainted: they determine only the ±1 values written into c, which flow |
340 | | * into the CT arithmetic of cs1/cs2/ct0 alongside the already-tainted |
341 | | * secret polynomials and cause no spurious violations there. |
342 | | * Only the rejection-sampling bytes need to be declassified. |
343 | | */ |
344 | 0 | CONSTTIME_DECLASSIFY(block + offset, sizeof(block) - offset); |
345 | |
|
346 | 0 | poly_zero(out_c); |
347 | | |
348 | | /* Loop tau times */ |
349 | 0 | for (end = 256 - tau; end < 256; end++) { |
350 | 0 | size_t index; /* index is a random offset to write +1 or -1 */ |
351 | | |
352 | | /* rejection sample in {0..end} to choose an index to place -1 or 1 into */ |
353 | 0 | for (;;) { |
354 | 0 | if (offset == sizeof(block)) { |
355 | | /* squeeze another block if the bytes from block have been used */ |
356 | 0 | if (!EVP_DigestSqueeze(h_ctx, block, sizeof(block))) |
357 | 0 | return 0; |
358 | | /* See comment above for why the block is declassified. */ |
359 | 0 | CONSTTIME_DECLASSIFY(block, sizeof(block)); |
360 | 0 | offset = 0; |
361 | 0 | } |
362 | | |
363 | 0 | index = block[offset++]; |
364 | 0 | if (index <= end) |
365 | 0 | break; |
366 | 0 | } |
367 | | |
368 | | /* |
369 | | * In-place swap the coefficient we are about to replace to the end so |
370 | | * we don't lose any values that have been already written. |
371 | | */ |
372 | 0 | out_c->coeff[end] = out_c->coeff[index]; |
373 | | /* set the random coefficient value to either 1 or q-1 */ |
374 | 0 | out_c->coeff[index] = mod_sub(1, 2 * (signs & 1)); |
375 | 0 | signs >>= 1; /* grab the next random bit */ |
376 | 0 | } |
377 | 0 | return 1; |
378 | 0 | } |