/src/openssl35/crypto/ml_dsa/ml_dsa_sample.c

Source
/*
 * Copyright 2024-2025 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the Apache License 2.0 (the "License").  You may not use
 * this file except in compliance with the License.  You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 */

#include <openssl/byteorder.h>
#include "ml_dsa_local.h"
#include "ml_dsa_vector.h"
#include "ml_dsa_matrix.h"
#include "ml_dsa_hash.h"
#include "internal/sha3.h"
#include "internal/packet.h"

#define SHAKE128_BLOCKSIZE SHA3_BLOCKSIZE(128)
#define SHAKE256_BLOCKSIZE SHA3_BLOCKSIZE(256)

/*
 * Division-free replacement for n % 5, intended to be constant time.
 *
 * The quotient n / 5 is approximated by a multiply and a shift:
 * 0xFFFF / 5 = 0x3333, and 2 is added (giving 0x3335) so the scaled
 * reciprocal slightly over-estimates 1/5; shifting right by 16 then divides
 * by (0xFFFF + 1). The remainder is n minus 5 times that quotient.
 *
 * Because the reciprocal is an over-estimate this is only valid for small n;
 * it is exact for the nibble values 0..15 it is applied to in this file.
 * Note that the argument is expanded twice, so it must be free of side
 * effects.
 */
#define MOD5(n) ((n) - 5 * ((0x3335 * (n)) >> 16))

/*
 * rej_ntt_poly() walks a squeezed SHAKE128 block in strides of 3 bytes and
 * stops only when its cursor reaches the end of the buffer. If the block size
 * were not an exact multiple of 3 the final stride would read past the end of
 * the buffer, so fail the build should that assumption ever stop holding.
 */
#if SHAKE128_BLOCKSIZE % 3 != 0
#error "rej_ntt_poly() requires SHAKE128_BLOCKSIZE to be a multiple of 3"
#endif

/*
 * Function type (not a pointer-to-function type) shared by the eta specific
 * nibble samplers. An implementation is given a candidate nibble and an
 * output slot; it returns 1 after writing the coefficient to |out| when the
 * nibble is accepted, or 0 when the nibble is rejected.
 */
typedef int(COEFF_FROM_NIBBLE_FUNC)(uint32_t nibble, uint32_t *out);

/*
 * Forward declarations written via the typedef, so both samplers are
 * guaranteed to have the signature that rej_bounded_poly() calls through.
 */
static COEFF_FROM_NIBBLE_FUNC coeff_from_nibble_4;
static COEFF_FROM_NIBBLE_FUNC coeff_from_nibble_2;

/**
 * @brief Assemble a candidate coefficient from 3 sampled bytes.
 * See FIPS 204, Algorithm 14, CoeffFromThreeBytes()
 *
 * The bytes are combined little-endian and the top bit of the third byte is
 * discarded, which yields a 23 bit candidate in the range 0..2^23-1.
 *
 * This is not constant time as it is used to generate the matrix A, which is
 * public.
 *
 * @param s A byte array of 3 uniformly distributed bytes.
 * @param out Receives the candidate value. It is written even when the
 *            candidate is rejected, so it is only meaningful (and in the
 *            range 0..q-1) when 1 is returned.
 * @returns 1 if the candidate is less than q, or 0 otherwise.
 *          The caller uses this for rejection sampling.
 */
static ossl_inline int coeff_from_three_bytes(const uint8_t *s, uint32_t *out)
{
    const uint32_t low = s[0];
    const uint32_t middle = s[1];
    const uint32_t high = s[2] & 0x7f; /* drop bit 23 of the candidate */
    const uint32_t candidate = low | (middle << 8) | (high << 16);

    *out = candidate;
    return candidate < ML_DSA_Q;
}

/**
 * @brief Try to map a nibble to a coefficient in the range (q-4)..0..4
 * See FIPS 204, Algorithm 15, CoeffFromHalfByte() where eta = 4
 * FIPS 204 expresses the result in the range -4..4; this code represents the
 * negative values by adding q to them.
 *
 * @param nibble A value in the range 0..15
 * @param out Receives 4 - nibble (mod q) when nibble is < 9. It is left
 *            untouched when the nibble is rejected.
 * @returns 1 if the nibble was in range, or 0 if the nibble was rejected.
 */
static ossl_inline int coeff_from_nibble_4(uint32_t nibble, uint32_t *out)
{
    /*
     * Branching on the comparison is not constant time, but it does not leak
     * any important info since the nibble is either used or thrown away.
     */
    if (!value_barrier_32(nibble < 9))
        return 0;

    *out = mod_sub(4, nibble);
    return 1;
}

/**
 * @brief Try to map a nibble to a coefficient in the range (q-2)..0..2
 * See FIPS 204, Algorithm 15, CoeffFromHalfByte() where eta = 2
 * FIPS 204 expresses the result in the range -2..2; this code represents the
 * negative values by adding q to them.
 *
 * @param nibble A value in the range 0..15
 * @param out Receives 2 - (nibble mod 5) (mod q) when nibble is < 15. It is
 *            left untouched when the nibble is rejected.
 * @returns 1 if the nibble was in range, or 0 if the nibble was rejected.
 */
static ossl_inline int coeff_from_nibble_2(uint32_t nibble, uint32_t *out)
{
    /* Only the single nibble value 15 is rejected */
    if (!value_barrier_32(nibble < 15))
        return 0;

    *out = mod_sub(2, MOD5(nibble));
    return 1;
}

/**
 * @brief Expand a seed into a polynomial whose coefficients lie in the range
 * 0..q-1, using rejection sampling.
 * The seed is absorbed with SHAKE128 and the output stream is consumed 3 bytes
 * at a time; each triple either yields the next coefficient or is discarded.
 * This algorithm is used for matrix expansion and only operates on public
 * inputs.
 *
 * See FIPS 204, Algorithm 30, RejNTTPoly()
 *
 * @param g_ctx A EVP_MD_CTX object used for sampling the seed.
 * @param md A pre-fetched SHAKE128 object.
 * @param seed The seed to use for sampling.
 * @param seed_len The size of |seed|
 * @param out The returned polynomial with coefficients in the range of
 *            0..q-1. This range is required for NTT.
 * @returns 1 if the polynomial was successfully generated, or 0 if any of the
 *            digest operations failed.
 */
static int rej_ntt_poly(EVP_MD_CTX *g_ctx, const EVP_MD *md,
    const uint8_t *seed, size_t seed_len, POLY *out)
{
    uint8_t blocks[SHAKE128_BLOCKSIZE];
    size_t pos;
    int filled = 0; /* number of coefficients accepted so far */

    /*
     * A whole SHAKE128 block is squeezed per call rather than 3 bytes.
     * The block size (168) is a multiple of 3, so no triple straddles blocks.
     */
    if (!shake_xof(g_ctx, md, seed, seed_len, blocks, sizeof(blocks)))
        return 0;

    for (;;) {
        for (pos = 0; pos < sizeof(blocks); pos += 3) {
            /* A rejected triple leaves |filled| alone, so its slot is reused */
            if (!coeff_from_three_bytes(blocks + pos, &out->coeff[filled]))
                continue;
            if (++filled >= ML_DSA_NUM_POLY_COEFFICIENTS)
                return 1; /* every coefficient has been produced */
        }
        /* The block is used up, fetch the next one from the XOF */
        if (!EVP_DigestSqueeze(g_ctx, blocks, sizeof(blocks)))
            return 0;
    }
}

/**
 * @brief Expand a seed into a polynomial whose coefficients lie in the range
 * ((q-eta)..0..eta), using rejection sampling. eta is either 2 or 4.
 * The seed is absorbed with SHAKE256 and every output byte supplies two
 * candidate nibbles, low nibble first.
 * See FIPS 204, Algorithm 31, RejBoundedPoly()
 *
 * @param h_ctx A EVP_MD_CTX object context used to sample the seed.
 * @param md A pre-fetched SHAKE256 object.
 * @param coef_from_nibble A function that is dependent on eta, which takes a
 *                         nibble and tries to see if it is in the correct range.
 * @param seed The seed to use for sampling.
 * @param seed_len The size of |seed|
 * @param out The returned polynomial with coefficients in the range of
 *            ((q-eta)..0..eta)
 * @returns 1 if the polynomial was successfully generated, or 0 if any of the
 *            digest operations failed.
 */
static int rej_bounded_poly(EVP_MD_CTX *h_ctx, const EVP_MD *md,
    COEFF_FROM_NIBBLE_FUNC *coef_from_nibble,
    const uint8_t *seed, size_t seed_len, POLY *out)
{
    uint8_t blocks[SHAKE256_BLOCKSIZE];
    size_t pos;
    int filled = 0; /* number of coefficients accepted so far */

    /* A whole SHAKE256 block is squeezed per call rather than a single byte */
    if (!shake_xof(h_ctx, md, seed, seed_len, blocks, sizeof(blocks)))
        return 0;

    for (;;) {
        for (pos = 0; pos < sizeof(blocks); pos++) {
            uint32_t low = blocks[pos] & 0x0F;
            uint32_t high = blocks[pos] >> 4;

            /*
             * The low nibble is tried before the high nibble. The counter is
             * only advanced when a nibble is accepted, and the function
             * returns as soon as the final coefficient has been written.
             */
            if (coef_from_nibble(low, &out->coeff[filled])
                && ++filled >= ML_DSA_NUM_POLY_COEFFICIENTS)
                return 1;
            if (coef_from_nibble(high, &out->coeff[filled])
                && ++filled >= ML_DSA_NUM_POLY_COEFFICIENTS)
                return 1;
        }
        /* The block is used up, fetch the next one from the XOF */
        if (!EVP_DigestSqueeze(h_ctx, blocks, sizeof(blocks)))
            return 0;
    }
}

/**
 * @brief Fill a k * l matrix with uniformly distributed polynomials using
 *        rejection sampling.
 * See FIPS 204, Algorithm 32, ExpandA()
 *
 * The elements of out->m_poly are written consecutively, row by row.
 *
 * @param g_ctx A EVP_MD_CTX context used for rejection sampling
 *              seed values generated from the seed rho.
 * @param md A pre-fetched SHAKE128 object
 * @param rho A 32 byte seed to generate the matrix from.
 * @param out The generated k * l matrix of polynomials with coefficients
 *            in the range of 0..q-1.
 * @returns 1 if the matrix was generated, or 0 on error.
 */
int ossl_ml_dsa_matrix_expand_A(EVP_MD_CTX *g_ctx, const EVP_MD *md,
    const uint8_t *rho, MATRIX *out)
{
    uint8_t derived_seed[ML_DSA_RHO_BYTES + 2];
    uint8_t *column_byte = derived_seed + ML_DSA_RHO_BYTES;
    uint8_t *row_byte = column_byte + 1;
    POLY *element = out->m_poly;
    size_t row, column;

    /* Each element is sampled from the seed: rho + column_index + row_index */
    memcpy(derived_seed, rho, ML_DSA_RHO_BYTES);

    for (row = 0; row < out->k; row++) {
        *row_byte = (uint8_t)row;
        for (column = 0; column < out->l; column++, element++) {
            *column_byte = (uint8_t)column;
            /* The (column, row) suffix makes the seed unique per element */
            if (!rej_ntt_poly(g_ctx, md, derived_seed, sizeof(derived_seed),
                    element))
                return 0;
        }
    }
    return 1;
}

/**
 * @brief Generate the 2 vectors s1 and s2 using rejection sampling, with
 * polynomial coefficients in the interval [q-eta..0..eta]
 *
 * See FIPS 204, Algorithm 33, ExpandS().
 * Note that in FIPS 204 the range -eta..eta is used.
 *
 * The polynomials of s1 are sampled first and those of s2 follow on using the
 * same running counter, so every polynomial gets a distinct derived seed.
 *
 * @param h_ctx A EVP_MD_CTX context to use to sample the seed.
 * @param md A pre-fetched SHAKE256 object.
 * @param eta Is either 2 or 4, and determines the range of the coefficients for
 *            s1 and s2. Any value other than ML_DSA_ETA_4 selects the eta = 2
 *            sampler.
 * @param seed A 64 byte seed to use for sampling.
 * @param s1 A 1 * l column vector containing polynomials with coefficients in
 *           the range (q-eta)..0..eta
 * @param s2 A 1 * k column vector containing polynomials with coefficients in
 *           the range (q-eta)..0..eta
 * @returns 1 if s1 and s2 were successfully generated, or 0 otherwise.
 */
int ossl_ml_dsa_vector_expand_S(EVP_MD_CTX *h_ctx, const EVP_MD *md, int eta,
    const uint8_t *seed, VECTOR *s1, VECTOR *s2)
{
    const size_t s1_count = s1->num_poly;
    const size_t s2_count = s2->num_poly;
    uint8_t derived_seed[ML_DSA_PRIV_SEED_BYTES + 2];
    uint8_t *counter = derived_seed + ML_DSA_PRIV_SEED_BYTES;
    COEFF_FROM_NIBBLE_FUNC *sampler;
    size_t n;

    if (eta == ML_DSA_ETA_4)
        sampler = coeff_from_nibble_4;
    else
        sampler = coeff_from_nibble_2;

    /*
     * The seed for each polynomial is seed + counter, where the counter is
     * 2 bytes starting at 0. Only the first counter byte is ever incremented.
     */
    memcpy(derived_seed, seed, ML_DSA_PRIV_SEED_BYTES);
    counter[0] = 0;
    counter[1] = 0;

    for (n = 0; n < s1_count; n++) {
        if (!rej_bounded_poly(h_ctx, md, sampler,
                derived_seed, sizeof(derived_seed), &s1->poly[n]))
            return 0;
        ++counter[0];
    }
    /* s2 carries on from wherever the counter got to for s1 */
    for (n = 0; n < s2_count; n++) {
        if (!rej_bounded_poly(h_ctx, md, sampler,
                derived_seed, sizeof(derived_seed), &s2->poly[n]))
            return 0;
        ++counter[0];
    }
    return 1;
}

/*
 * Generate one polynomial of the mask from a seed.
 * See FIPS 204, Algorithm 34, ExpandMask(), Step 4 & 5
 *
 * The seed is expanded with SHAKE256 into 32 * 20 bytes when gamma1 is
 * ML_DSA_GAMMA1_TWO_POWER_19, or 32 * 18 bytes for any other gamma1, and the
 * result is decoded into |out| by ossl_ml_dsa_poly_decode_expand_mask().
 * (Presumably this corresponds to 20 or 18 bits per coefficient - confirm
 * against the decoder.)
 *
 * Returns 1 on success, or 0 if either the XOF or the decode step failed.
 */
int ossl_ml_dsa_poly_expand_mask(POLY *out, const uint8_t *seed, size_t seed_len,
    uint32_t gamma1,
    EVP_MD_CTX *h_ctx, const EVP_MD *md)
{
    uint8_t buf[32 * 20]; /* sized for the larger of the two encodings */
    size_t buf_len = sizeof(buf);

    if (gamma1 != ML_DSA_GAMMA1_TWO_POWER_19)
        buf_len = 32 * 18;

    if (!shake_xof(h_ctx, md, seed, seed_len, buf, buf_len))
        return 0;
    return ossl_ml_dsa_poly_decode_expand_mask(out, buf, buf_len, gamma1) != 0;
}

/**
 * @brief Sample a polynomial with coefficients in the range {-1..1}.
 * The number of non zero values (hamming weight) is given by tau
 *
 * See FIPS 204, Algorithm 29, SampleInBall()
 * This function is assumed to not be constant time.
 * The algorithm is based on Durstenfeld's version of the Fisher-Yates shuffle.
 *
 * Note that the coefficients returned by this implementation are positive
 * i.e one of q-1, 0, or 1.
 *
 * @param out_c The returned polynomial. It is zeroed before sampling.
 * @param seed The seed to absorb into SHAKE256.
 * @param seed_len The size of |seed|
 * @param h_ctx A EVP_MD_CTX context used to sample the seed.
 * @param md A pre-fetched SHAKE256 object.
 * @param tau is the number of +1 or -1's in the polynomial 'out_c' (39, 49 or 60)
 *            that is less than or equal to 64
 * @returns 1 on success, or 0 if any of the digest operations failed.
 */
int ossl_ml_dsa_poly_sample_in_ball(POLY *out_c, const uint8_t *seed, int seed_len,
    EVP_MD_CTX *h_ctx, const EVP_MD *md,
    uint32_t tau)
{
    uint8_t block[SHAKE256_BLOCKSIZE];
    uint64_t sign_bits;
    size_t next = 8; /* the first 8 bytes of the block hold the sign bits */
    size_t last;
    size_t pick;

    /*
     * A full SHAKE block is squeezed and buffered in 'block', instead of
     * squeezing 8 bytes followed by many single byte squeezes.
     */
    if (!shake_xof(h_ctx, md, seed, seed_len, block, sizeof(block)))
        return 0;

    /*
     * One sign bit is consumed per non zero coefficient, so 64 bits are
     * enough given that tau is at most 64.
     */
    OPENSSL_load_u64_le(&sign_bits, block);

    poly_zero(out_c);

    /* One iteration per non zero coefficient, i.e. tau iterations */
    for (last = 256 - tau; last < 256; last++) {
        /* Rejection sample a byte until it is a position in {0..last} */
        do {
            if (next == sizeof(block)) {
                /* Every buffered byte has been used, squeeze another block */
                if (!EVP_DigestSqueeze(h_ctx, block, sizeof(block)))
                    return 0;
                next = 0;
            }
            pick = block[next++];
        } while (pick > last);

        /*
         * Move whatever is at the chosen position to position 'last' so that
         * a value written by an earlier iteration is not lost.
         */
        out_c->coeff[last] = out_c->coeff[pick];
        /* The low sign bit selects 1 (bit clear) or q-1 (bit set) */
        out_c->coeff[pick] = mod_sub(1, 2 * (sign_bits & 1));
        sign_bits >>= 1; /* move on to the next sign bit */
    }
    return 1;
}

Coverage Report

Created: 2025-12-31 06:58

Line	Count	Source
1		/*
2		* Copyright 2024-2025 The OpenSSL Project Authors. All Rights Reserved.
3		*
4		* Licensed under the Apache License 2.0 (the "License"). You may not use
5		* this file except in compliance with the License. You can obtain a copy
6		* in the file LICENSE in the source distribution or at
7		* https://www.openssl.org/source/license.html
8		*/
9
10		#include <openssl/byteorder.h>
11		#include "ml_dsa_local.h"
12		#include "ml_dsa_vector.h"
13		#include "ml_dsa_matrix.h"
14		#include "ml_dsa_hash.h"
15		#include "internal/sha3.h"
16		#include "internal/packet.h"
17
18		#define SHAKE128_BLOCKSIZE SHA3_BLOCKSIZE(128)
19		#define SHAKE256_BLOCKSIZE SHA3_BLOCKSIZE(256)
20
21		/*
22		* This is a constant time version of n % 5
23		* Note that 0xFFFF / 5 = 0x3333, 2 is added to make an over-estimate of 1/5
24		* and then we divide by (0xFFFF + 1)
25		*/
26	2.81M	#define MOD5(n) ((n) - 5 * (0x3335 * (n) >> 16))
27
28		#if SHAKE128_BLOCKSIZE % 3 != 0
29		#error "rej_ntt_poly() requires SHAKE128_BLOCKSIZE to be a multiple of 3"
30		#endif
31
32		typedef int(COEFF_FROM_NIBBLE_FUNC)(uint32_t nibble, uint32_t *out);
33
34		static COEFF_FROM_NIBBLE_FUNC coeff_from_nibble_4;
35		static COEFF_FROM_NIBBLE_FUNC coeff_from_nibble_2;
36
37		/**
38		* @brief Combine 3 bytes to form an coefficient.
39		* See FIPS 204, Algorithm 14, CoeffFromThreeBytes()
40		*
41		* This is not constant time as it is used to generate the matrix A which is public.
42		*
43		* @param s A byte array of 3 uniformly distributed bytes.
44		* @param out The returned coefficient in the range 0..q-1.
45		* @returns 1 if the value is less than q or 0 otherwise.
46		* This is used for rejection sampling.
47		*/
48		static ossl_inline int coeff_from_three_bytes(const uint8_t *s, uint32_t *out)
49	21.6M	{
50		/* Zero out the top bit of the 3rd byte to get a value in the range 0..2^23-1) */
51	21.6M	*out = (uint32_t)s[0] | ((uint32_t)s[1] << 8) | (((uint32_t)s[2] & 0x7f) << 16);
52	21.6M	return *out < ML_DSA_Q;
53	21.6M	}
54
55		/**
56		* @brief Generate a value in the range (q-4..0..4)
57		* See FIPS 204, Algorithm 15, CoeffFromHalfByte() where eta = 4
58		* Note the FIPS 204 code uses the range -4..4 (whereas this code adds q to the
59		* negative numbers).
60		*
61		* @param nibble A value in the range 0..15
62		* @param out The returned value if the range (q-4)..0..4 if nibble is < 9
63		* @returns 1 nibble was in range, or 0 if the nibble was rejected.
64		*/
65		static ossl_inline int coeff_from_nibble_4(uint32_t nibble, uint32_t *out)
66	2.47M	{
67		/*
68		* This is not constant time but will not leak any important info since
69		* the value is either chosen or thrown away.
70		*/
71	2.47M	if (value_barrier_32(nibble < 9)) {
72	1.39M	*out = mod_sub(4, nibble);
73	1.39M	return 1;
74	1.39M	}
75	1.08M	return 0;
76	2.47M	}
77
78		/**
79		* @brief Generate a value in the range (q-2..0..2)
80		* See FIPS 204, Algorithm 15, CoeffFromHalfByte() where eta = 2
81		* Note the FIPS 204 code uses the range -2..2 (whereas this code adds q to the
82		* negative numbers).
83		*
84		* @param nibble A value in the range 0..15
85		* @param out The returned value if the range (q-2)..0..2 if nibble is < 15
86		* @returns 1 nibble was in range, or 0 if the nibble was rejected.
87		*/
88		static ossl_inline int coeff_from_nibble_2(uint32_t nibble, uint32_t *out)
89	3.00M	{
90	3.00M	if (value_barrier_32(nibble < 15)) {
91	2.81M	*out = mod_sub(2, MOD5(nibble));
92	2.81M	return 1;
93	2.81M	}
94	188k	return 0;
95	3.00M	}
96
97		/**
98		* @brief Use a seed value to generate a polynomial with coefficients in the
99		* range of 0..q-1 using rejection sampling.
100		* SHAKE128 is used to absorb the seed, and then sequences of 3 sample bytes are
101		* squeezed to try to produce coefficients.
102		* The SHAKE128 stream is used to get uniformly distributed elements.
103		* This algorithm is used for matrix expansion and only operates on public inputs.
104		*
105		* See FIPS 204, Algorithm 30, RejNTTPoly()
106		*
107		* @param g_ctx A EVP_MD_CTX object used for sampling the seed.
108		* @param md A pre-fetched SHAKE128 object.
109		* @param seed The seed to use for sampling.
110		* @param seed_len The size of |seed|
111		* @param out The returned polynomial with coefficients in the range of
112		* 0..q-1. This range is required for NTT.
113		* @returns 1 if the polynomial was successfully generated, or 0 if any of the
114		* digest operations failed.
115		*/
116		static int rej_ntt_poly(EVP_MD_CTX *g_ctx, const EVP_MD *md,
117		const uint8_t *seed, size_t seed_len, POLY *out)
118	84.3k	{
119	84.3k	int j = 0;
120	84.3k	uint8_t blocks[SHAKE128_BLOCKSIZE], *b, *end = blocks + sizeof(blocks);
121
122		/*
123		* Instead of just squeezing 3 bytes at a time, we grab a whole block
124		* Note that the shake128 blocksize of 168 is divisible by 3.
125		*/
126	84.3k	if (!shake_xof(g_ctx, md, seed, seed_len, blocks, sizeof(blocks)))
127	0	return 0;
128
129	421k	while (1) {
130	21.9M	for (b = blocks; b < end; b += 3) {
131	21.6M	if (coeff_from_three_bytes(b, &(out->coeff[j]))) {
132	21.5M	if (++j >= ML_DSA_NUM_POLY_COEFFICIENTS)
133	84.3k	return 1; /* finished */
134	21.5M	}
135	21.6M	}
136	337k	if (!EVP_DigestSqueeze(g_ctx, blocks, sizeof(blocks)))
137	0	return 0;
138	337k	}
139	84.3k	}
140
141		/**
142		* @brief Use a seed value to generate a polynomial with coefficients in the
143		* range of ((q-eta)..0..eta) using rejection sampling. eta is either 2 or 4.
144		* SHAKE256 is used to absorb the seed, and then samples are squeezed.
145		* See FIPS 204, Algorithm 31, RejBoundedPoly()
146		*
147		* @param h_ctx A EVP_MD_CTX object context used to sample the seed.
148		* @param md A pre-fetched SHAKE256 object.
149		* @param coef_from_nibble A function that is dependent on eta, which takes a
150		* nibble and tries to see if it is in the correct range.
151		* @param seed The seed to use for sampling.
152		* @param seed_len The size of |seed|
153		* @param out The returned polynomial with coefficients in the range of
154		* ((q-eta)..0..eta)
155		* @returns 1 if the polynomial was successfully generated, or 0 if any of the
156		* digest operations failed.
157		*/
158		static int rej_bounded_poly(EVP_MD_CTX *h_ctx, const EVP_MD *md,
159		COEFF_FROM_NIBBLE_FUNC *coef_from_nibble,
160		const uint8_t *seed, size_t seed_len, POLY *out)
161	16.4k	{
162	16.4k	int j = 0;
163	16.4k	uint32_t z0, z1;
164	16.4k	uint8_t blocks[SHAKE256_BLOCKSIZE], *b, *end = blocks + sizeof(blocks);
165
166		/* Instead of just squeezing 1 byte at a time, we grab a whole block */
167	16.4k	if (!shake_xof(h_ctx, md, seed, seed_len, blocks, sizeof(blocks)))
168	0	return 0;
169
170	27.7k	while (1) {
171	2.75M	for (b = blocks; b < end; b++) {
172	2.74M	z0 = *b & 0x0F; /* lower nibble of byte */
173	2.74M	z1 = *b >> 4; /* high nibble of byte */
174
175	2.74M	if (coef_from_nibble(z0, &out->coeff[j])
176	2.10M	&& ++j >= ML_DSA_NUM_POLY_COEFFICIENTS)
177	8.26k	return 1;
178	2.73M	if (coef_from_nibble(z1, &out->coeff[j])
179	2.09M	&& ++j >= ML_DSA_NUM_POLY_COEFFICIENTS)
180	8.17k	return 1;
181	2.73M	}
182	11.3k	if (!EVP_DigestSqueeze(h_ctx, blocks, sizeof(blocks)))
183	0	return 0;
184	11.3k	}
185	16.4k	}
186
187		/**
188		* @brief Generate a k * l matrix that has uniformly distributed polynomial
189		* elements using rejection sampling.
190		* See FIPS 204, Algorithm 32, ExpandA()
191		*
192		* @param g_ctx A EVP_MD_CTX context used for rejection sampling
193		* seed values generated from the seed rho.
194		* @param md A pre-fetched SHAKE128 object
195		* @param rho A 32 byte seed to generated the matrix from.
196		* @param out The generated k * l matrix of polynomials with coefficients
197		* in the range of 0..q-1.
198		* @returns 1 if the matrix was generated, or 0 on error.
199		*/
200		int ossl_ml_dsa_matrix_expand_A(EVP_MD_CTX *g_ctx, const EVP_MD *md,
201		const uint8_t *rho, MATRIX *out)
202	2.68k	{
203	2.68k	int ret = 0;
204	2.68k	size_t i, j;
205	2.68k	uint8_t derived_seed[ML_DSA_RHO_BYTES + 2];
206	2.68k	POLY *poly = out->m_poly;
207
208		/* The seed used for each matrix element is rho + column_index + row_index */
209	2.68k	memcpy(derived_seed, rho, ML_DSA_RHO_BYTES);
210
211	18.1k	for (i = 0; i < out->k; i++) {
212	99.7k	for (j = 0; j < out->l; j++) {
213	84.3k	derived_seed[ML_DSA_RHO_BYTES + 1] = (uint8_t)i;
214	84.3k	derived_seed[ML_DSA_RHO_BYTES] = (uint8_t)j;
215		/* Generate the polynomial for each matrix element using a unique seed */
216	84.3k	if (!rej_ntt_poly(g_ctx, md, derived_seed, sizeof(derived_seed), poly++))
217	0	goto err;
218	84.3k	}
219	15.4k	}
220	2.68k	ret = 1;
221	2.68k	err:
222	2.68k	return ret;
223	2.68k	}
224
225		/**
226		* @brief Generates 2 vectors using rejection sampling whose polynomial
227		* coefficients are in the interval [q-eta..0..eta]
228		*
229		* See FIPS 204, Algorithm 33, ExpandS().
230		* Note that in FIPS 204 the range -eta..eta is used.
231		*
232		* @param h_ctx A EVP_MD_CTX context to use to sample the seed.
233		* @param md A pre-fetched SHAKE256 object.
234		* @param eta Is either 2 or 4, and determines the range of the coefficients for
235		* s1 and s2.
236		* @param seed A 64 byte seed to use for sampling.
237		* @param s1 A 1 * l column vector containing polynomials with coefficients in
238		* the range (q-eta)..0..eta
239		* @param s2 A 1 * k column vector containing polynomials with coefficients in
240		* the range (q-eta)..0..eta
241		* @returns 1 if s1 and s2 were successfully generated, or 0 otherwise.
242		*/
243		int ossl_ml_dsa_vector_expand_S(EVP_MD_CTX *h_ctx, const EVP_MD *md, int eta,
244		const uint8_t *seed, VECTOR *s1, VECTOR *s2)
245	1.49k	{
246	1.49k	int ret = 0;
247	1.49k	size_t i;
248	1.49k	size_t l = s1->num_poly;
249	1.49k	size_t k = s2->num_poly;
250	1.49k	uint8_t derived_seed[ML_DSA_PRIV_SEED_BYTES + 2];
251	1.49k	COEFF_FROM_NIBBLE_FUNC *coef_from_nibble_fn;
252
253	1.49k	coef_from_nibble_fn = (eta == ML_DSA_ETA_4) ? coeff_from_nibble_4 : coeff_from_nibble_2;
254
255		/*
256		* Each polynomial generated uses a unique seed that consists of
257		* seed + counter (where the counter is 2 bytes starting at 0)
258		*/
259	1.49k	memcpy(derived_seed, seed, ML_DSA_PRIV_SEED_BYTES);
260	1.49k	derived_seed[ML_DSA_PRIV_SEED_BYTES] = 0;
261	1.49k	derived_seed[ML_DSA_PRIV_SEED_BYTES + 1] = 0;
262
263	9.25k	for (i = 0; i < l; i++) {
264	7.75k	if (!rej_bounded_poly(h_ctx, md, coef_from_nibble_fn,
265	7.75k	derived_seed, sizeof(derived_seed), &s1->poly[i]))
266	0	goto err;
267	7.75k	++derived_seed[ML_DSA_PRIV_SEED_BYTES];
268	7.75k	}
269	10.1k	for (i = 0; i < k; i++) {
270	8.67k	if (!rej_bounded_poly(h_ctx, md, coef_from_nibble_fn,
271	8.67k	derived_seed, sizeof(derived_seed), &s2->poly[i]))
272	0	goto err;
273	8.67k	++derived_seed[ML_DSA_PRIV_SEED_BYTES];
274	8.67k	}
275	1.49k	ret = 1;
276	1.49k	err:
277	1.49k	return ret;
278	1.49k	}
279
280		/* See FIPS 204, Algorithm 34, ExpandMask(), Step 4 & 5 */
281		int ossl_ml_dsa_poly_expand_mask(POLY *out, const uint8_t *seed, size_t seed_len,
282		uint32_t gamma1,
283		EVP_MD_CTX *h_ctx, const EVP_MD *md)
284	13.8k	{
285	13.8k	uint8_t buf[32 * 20];
286	13.8k	size_t buf_len = 32 * (gamma1 == ML_DSA_GAMMA1_TWO_POWER_19 ? 20 : 18);
287
288	13.8k	return shake_xof(h_ctx, md, seed, seed_len, buf, buf_len)
289	13.8k	&& ossl_ml_dsa_poly_decode_expand_mask(out, buf, buf_len, gamma1);
290	13.8k	}
291
292		/*
293		* @brief Sample a polynomial with coefficients in the range {-1..1}.
294		* The number of non zero values (hamming weight) is given by tau
295		*
296		* See FIPS 204, Algorithm 29, SampleInBall()
297		* This function is assumed to not be constant time.
298		* The algorithm is based on Durstenfeld's version of the Fisher-Yates shuffle.
299		*
300		* Note that the coefficients returned by this implementation are positive
301		* i.e one of q-1, 0, or 1.
302		*
303		* @param tau is the number of +1 or -1's in the polynomial 'out_c' (39, 49 or 60)
304		* that is less than or equal to 64
305		*/
306		int ossl_ml_dsa_poly_sample_in_ball(POLY *out_c, const uint8_t *seed, int seed_len,
307		EVP_MD_CTX *h_ctx, const EVP_MD *md,
308		uint32_t tau)
309	3.30k	{
310	3.30k	uint8_t block[SHAKE256_BLOCKSIZE];
311	3.30k	uint64_t signs;
312	3.30k	int offset = 8;
313	3.30k	size_t end;
314
315		/*
316		* Rather than squeeze 8 bytes followed by lots of 1 byte squeezes
317		* the SHAKE blocksize is squeezed each time and buffered into 'block'.
318		*/
319	3.30k	if (!shake_xof(h_ctx, md, seed, seed_len, block, sizeof(block)))
320	0	return 0;
321
322		/*
323		* grab the first 64 bits - since tau < 64
324		* Each bit gives a +1 or -1 value.
325		*/
326	3.30k	OPENSSL_load_u64_le(&signs, block);
327
328	3.30k	poly_zero(out_c);
329
330		/* Loop tau times */
331	161k	for (end = 256 - tau; end < 256; end++) {
332	157k	size_t index; /* index is a random offset to write +1 or -1 */
333
334		/* rejection sample in {0..end} to choose an index to place -1 or 1 into */
335	174k	for (;;) {
336	174k	if (offset == sizeof(block)) {
337		/* squeeze another block if the bytes from block have been used */
338	0	if (!EVP_DigestSqueeze(h_ctx, block, sizeof(block)))
339	0	return 0;
340	0	offset = 0;
341	0	}
342
343	174k	index = block[offset++];
344	174k	if (index <= end)
345	157k	break;
346	174k	}
347
348		/*
349		* In-place swap the coefficient we are about to replace to the end so
350		* we don't lose any values that have been already written.
351		*/
352	157k	out_c->coeff[end] = out_c->coeff[index];
353		/* set the random coefficient value to either 1 or q-1 */
354	157k	out_c->coeff[index] = mod_sub(1, 2 * (signs & 1));
355	157k	signs >>= 1; /* grab the next random bit */
356	157k	}
357	3.30k	return 1;
358	3.30k	}