/rust/registry/src/index.crates.io-1949cf8c6b5b557f/zstd-0.13.3/src/dict.rs

Source
//! Train a dictionary from various sources.
//!
//! A dictionary can help improve the compression of small files.
//! The dictionary must be present during decompression,
//! but can be shared across multiple "similar" files.
//!
//! Creating a dictionary using the `zstd` C library,
//! using the `zstd` command-line interface, using this library,
//! or using the `train` binary provided, should give the same result,
//! and are therefore completely compatible.
//!
//! To use, see [`Encoder::with_dictionary`] or [`Decoder::with_dictionary`].
//!
//! [`Encoder::with_dictionary`]: ../struct.Encoder.html#method.with_dictionary
//! [`Decoder::with_dictionary`]: ../struct.Decoder.html#method.with_dictionary

#[cfg(feature = "zdict_builder")]
use std::io::{self, Read};

pub use zstd_safe::{CDict, DDict};

/// Prepared dictionary for compression.
///
/// This is a thin wrapper around a [`CDict`]; the wrapped value can be
/// borrowed back through `as_cdict`.
///
/// A dictionary can include its own copy of the data (if it is `'static`), or it can merely point
/// to a separate buffer (if it has another lifetime).
pub struct EncoderDictionary<'a> {
    // The wrapped `zstd_safe` compression dictionary, valid for `'a`.
    cdict: CDict<'a>,
}

impl EncoderDictionary<'static> {
    /// Builds an owned, prepared dictionary for compression.
    ///
    /// The content of `dictionary` is copied internally, which is why the
    /// result is `'static` and does not borrow from the given slice.
    ///
    /// * `dictionary` is the raw dictionary content.
    /// * `level` is the compression level handed to `zstd_safe` when the
    ///   dictionary is prepared.
    pub fn copy(dictionary: &[u8], level: i32) -> Self {
        let cdict = zstd_safe::create_cdict(dictionary, level);
        Self { cdict }
    }
}

impl<'a> EncoderDictionary<'a> {
    /// Builds a prepared dictionary for compression that refers to `dictionary`
    /// instead of copying it.
    ///
    /// The returned value borrows `dictionary` for `'a`.
    ///
    /// A level of `0` uses zstd's default (currently `3`).
    ///
    /// Only available with the `experimental` feature. Use `EncoderDictionary::copy` otherwise.
    #[cfg(feature = "experimental")]
    #[cfg_attr(feature = "doc-cfg", doc(cfg(feature = "experimental")))]
    pub fn new(dictionary: &'a [u8], level: i32) -> Self {
        let cdict = zstd_safe::CDict::create_by_reference(dictionary, level);
        Self { cdict }
    }

    /// Borrows the wrapped `CDict`.
    pub fn as_cdict(&self) -> &CDict<'a> {
        let Self { cdict } = self;
        cdict
    }
}

/// Prepared dictionary for decompression.
///
/// This is a thin wrapper around a [`DDict`]; the wrapped value can be
/// borrowed back through `as_ddict`.
///
/// `copy` produces a `'static` dictionary that holds its own copy of the data,
/// while `new` (with the `experimental` feature) borrows the given slice for `'a`.
pub struct DecoderDictionary<'a> {
    // The wrapped `zstd_safe` decompression dictionary, valid for `'a`.
    ddict: DDict<'a>,
}

impl DecoderDictionary<'static> {
    /// Builds an owned, prepared dictionary for decompression.
    ///
    /// The content of `dictionary` is copied internally, which is why the
    /// result is `'static` and does not borrow from the given slice.
    ///
    /// * `dictionary` is the raw dictionary content.
    pub fn copy(dictionary: &[u8]) -> Self {
        let ddict = zstd_safe::DDict::create(dictionary);
        Self { ddict }
    }
}

impl<'a> DecoderDictionary<'a> {
    /// Builds a prepared dictionary for decompression that refers to `dict`
    /// instead of copying it.
    ///
    /// The returned value borrows `dict` for `'a`.
    ///
    /// Only available with the `experimental` feature. Use `DecoderDictionary::copy` otherwise.
    #[cfg(feature = "experimental")]
    #[cfg_attr(feature = "doc-cfg", doc(cfg(feature = "experimental")))]
    pub fn new(dict: &'a [u8]) -> Self {
        let ddict = zstd_safe::DDict::create_by_reference(dict);
        Self { ddict }
    }

    /// Borrows the wrapped `DDict`.
    pub fn as_ddict(&self) -> &DDict<'a> {
        let Self { ddict } = self;
        ddict
    }
}

/// Train a dictionary from a big continuous chunk of data, with all samples
/// contiguous in memory.
///
/// This is the most efficient way to train a dictionary,
/// since this is directly fed into `zstd`.
///
/// * `sample_data` is the concatenation of all sample data.
/// * `sample_sizes` is the size of each sample in `sample_data`.
///     The sum of all `sample_sizes` should equal the length of `sample_data`.
/// * `max_size` is the maximum size of the dictionary to generate.
///     It is used as the capacity of the output buffer.
///
/// The result is the dictionary data. You can, for example, feed it to [`CDict::create`].
///
/// # Errors
///
/// Returns an error of kind [`io::ErrorKind::Other`] if the sum of
/// `sample_sizes` does not equal `sample_data.len()`; a sum that does not fit
/// in a `usize` is treated the same way. Errors reported by the training
/// routine are converted with `map_error_code` and returned.
#[cfg(feature = "zdict_builder")]
#[cfg_attr(feature = "doc-cfg", doc(cfg(feature = "zdict_builder")))]
pub fn from_continuous(
    sample_data: &[u8],
    sample_sizes: &[usize],
    max_size: usize,
) -> io::Result<Vec<u8>> {
    use crate::map_error_code;

    // Add the sizes up with overflow detection. A plain `sum()` panics on
    // overflow in debug builds and wraps in release builds, and a wrapped
    // total could match `sample_data.len()` even though the sizes describe
    // more data than the buffer holds.
    let total_size = sample_sizes
        .iter()
        .try_fold(0usize, |acc, &size| acc.checked_add(size));

    // Complain if the lengths don't add up to the entire data.
    if total_size != Some(sample_data.len()) {
        return Err(io::Error::new(
            io::ErrorKind::Other,
            "sample sizes don't add up".to_string(),
        ));
    }

    // The trainer writes into `result`, which is sized to hold `max_size` bytes.
    let mut result = Vec::with_capacity(max_size);
    zstd_safe::train_from_buffer(&mut result, sample_data, sample_sizes)
        .map_err(map_error_code)?;
    Ok(result)
}

/// Train a dictionary from multiple samples.
///
/// The samples will internally be copied to a single continuous buffer,
/// so make sure you have enough memory available.
///
/// If you need to stretch your system's limits,
/// [`from_continuous`] directly uses the given slice.
///
/// [`from_continuous`]: ./fn.from_continuous.html
///
/// * `samples` is a list of individual samples to train on.
/// * `max_size` is the maximum size of the dictionary to generate.
///
/// The result is the dictionary data. You can, for example, feed it to [`CDict::create`].
///
/// # Errors
///
/// Returns whatever error [`from_continuous`] reports for the concatenated samples.
#[cfg(feature = "zdict_builder")]
#[cfg_attr(feature = "doc-cfg", doc(cfg(feature = "zdict_builder")))]
pub fn from_samples<S: AsRef<[u8]>>(
    samples: &[S],
    max_size: usize,
) -> io::Result<Vec<u8>> {
    // Record the length of every sample once; the total doubles as the
    // capacity needed for the concatenated buffer.
    let sizes: Vec<usize> =
        samples.iter().map(|sample| sample.as_ref().len()).collect();

    // Pre-allocate the entire required size.
    let mut data = Vec::with_capacity(sizes.iter().sum());

    // Copy every sample to a big chunk of memory, one whole slice at a time
    // rather than byte by byte.
    for sample in samples {
        data.extend_from_slice(sample.as_ref());
    }

    from_continuous(&data, &sizes, max_size)
}

/// Train a dictionary from a stream of readable samples.
///
/// In contrast to [`from_samples`], the samples do not need to be collected in
/// a list beforehand, and producing a sample is allowed to fail: the first
/// error yielded by the iterator, or hit while reading a sample, is returned.
///
/// Every sample is read to its end into one continuous buffer, which is then
/// handed to [`from_continuous`].
///
/// * `samples` is an iterator of individual samples to train on.
/// * `max_size` is the maximum size of the dictionary to generate.
///
/// The result is the dictionary data. You can, for example, feed it to [`CDict::create`].
///
/// # Examples
///
/// ```rust,no_run
/// // Train from a couple of json files.
/// let dict_buffer = zstd::dict::from_sample_iterator(
///     ["file_a.json", "file_b.json"]
///         .into_iter()
///         .map(|filename| std::fs::File::open(filename)),
///     10_000,  // 10kB dictionary
/// ).unwrap();
/// ```
///
/// ```rust,no_run
/// use std::io::BufRead as _;
/// // Treat each line from stdin as a separate sample.
/// let dict_buffer = zstd::dict::from_sample_iterator(
///     std::io::stdin().lock().lines().map(|line: std::io::Result<String>| {
///         // Transform each line into a `Cursor<Vec<u8>>` so they implement Read.
///         line.map(String::into_bytes)
///             .map(std::io::Cursor::new)
///     }),
///     10_000,  // 10kB dictionary
/// ).unwrap();
/// ```
#[cfg(feature = "zdict_builder")]
#[cfg_attr(feature = "doc-cfg", doc(cfg(feature = "zdict_builder")))]
pub fn from_sample_iterator<I, R>(
    samples: I,
    max_size: usize,
) -> io::Result<Vec<u8>>
where
    I: IntoIterator<Item = io::Result<R>>,
    R: Read,
{
    // All samples end up back to back in this buffer.
    let mut concatenated = Vec::new();

    // Read the samples in order, keeping the number of bytes each one
    // contributed. Collecting into a `Result` stops at the first failure.
    let lengths = samples
        .into_iter()
        .map(|sample| -> io::Result<usize> {
            let mut reader = sample?;
            reader.read_to_end(&mut concatenated)
        })
        .collect::<io::Result<Vec<usize>>>()?;

    from_continuous(&concatenated, &lengths, max_size)
}

/// Train a dictionary from the content of a set of files.
///
/// Each path is opened with [`std::fs::File::open`] and the resulting files
/// are passed on to [`from_sample_iterator`]; a file that cannot be opened
/// surfaces as the returned error.
///
/// * `filenames` is an iterator of files to load. Each file will be treated as an individual
///     sample.
/// * `max_size` is the maximum size of the dictionary to generate.
///
/// The result is the dictionary data. You can, for example, feed it to [`CDict::create`].
#[cfg(feature = "zdict_builder")]
#[cfg_attr(feature = "doc-cfg", doc(cfg(feature = "zdict_builder")))]
pub fn from_files<I, P>(filenames: I, max_size: usize) -> io::Result<Vec<u8>>
where
    P: AsRef<std::path::Path>,
    I: IntoIterator<Item = P>,
{
    // Files are opened lazily, as the sample iterator is consumed.
    let opened_files = filenames.into_iter().map(std::fs::File::open);
    from_sample_iterator(opened_files, max_size)
}

#[cfg(test)]
#[cfg(feature = "zdict_builder")]
mod tests {
    use std::fs;
    use std::io::{self, Read};

    use walkdir::WalkDir;

    /// Trains a dictionary on the `.rs` files found below `src`, then checks
    /// that each of those files survives a compress/decompress round trip
    /// that uses the trained dictionary.
    #[test]
    fn test_dict_training() {
        // Collect the path of every `.rs` file below `src`.
        let mut paths = Vec::new();
        for entry in WalkDir::new("src") {
            let path = entry.unwrap().into_path();
            if path.to_str().unwrap().ends_with(".rs") {
                paths.push(path);
            }
        }

        // Train a dictionary
        let dict = super::from_files(&paths, 4000).unwrap();

        for path in paths {
            let mut content = Vec::new();
            fs::File::open(path)
                .unwrap()
                .read_to_end(&mut content)
                .unwrap();

            // Compress the file content with the dictionary. The encoder lives
            // in its own scope so that it is dropped, and its borrow of
            // `compressed` released, before the data is read back.
            let mut compressed = Vec::new();
            {
                let mut encoder = crate::stream::Encoder::with_dictionary(
                    &mut compressed,
                    1,
                    &dict,
                )
                .unwrap()
                .auto_finish();
                let mut source = &content[..];
                io::copy(&mut source, &mut encoder).unwrap();
            }

            // Decompress again with the same dictionary.
            let mut decoder = crate::stream::Decoder::with_dictionary(
                &compressed[..],
                &dict[..],
            )
            .unwrap();
            let mut result = Vec::new();
            io::copy(&mut decoder, &mut result).unwrap();

            assert_eq!(&content, &result);
        }
    }
}

Coverage Report

Created: 2026-03-31 07:58

Line	Count	Source
1		//! Train a dictionary from various sources.
2		//!
3		//! A dictionary can help improve the compression of small files.
4		//! The dictionary must be present during decompression,
5		//! but can be shared across multiple "similar" files.
6		//!
7		//! Creating a dictionary using the `zstd` C library,
8		//! using the `zstd` command-line interface, using this library,
9		//! or using the `train` binary provided, should give the same result,
10		//! and are therefore completely compatible.
11		//!
12		//! To use, see [`Encoder::with_dictionary`] or [`Decoder::with_dictionary`].
13		//!
14		//! [`Encoder::with_dictionary`]: ../struct.Encoder.html#method.with_dictionary
15		//! [`Decoder::with_dictionary`]: ../struct.Decoder.html#method.with_dictionary
16
17		#[cfg(feature = "zdict_builder")]
18		use std::io::{self, Read};
19
20		pub use zstd_safe::{CDict, DDict};
21
22		/// Prepared dictionary for compression
23		///
24		/// A dictionary can include its own copy of the data (if it is `'static`), or it can merely point
25		/// to a separate buffer (if it has another lifetime).
26		pub struct EncoderDictionary<'a> {
27		cdict: CDict<'a>,
28		}
29
30		impl EncoderDictionary<'static> {
31		/// Creates a prepared dictionary for compression.
32		///
33		/// This will copy the dictionary internally.
34	0	pub fn copy(dictionary: &[u8], level: i32) -> Self {
35	0	Self {
36	0	cdict: zstd_safe::create_cdict(dictionary, level),
37	0	}
38	0	}
39		}
40
41		impl<'a> EncoderDictionary<'a> {
42		#[cfg(feature = "experimental")]
43		#[cfg_attr(feature = "doc-cfg", doc(cfg(feature = "experimental")))]
44		/// Create prepared dictionary for compression
45		///
46		/// A level of `0` uses zstd's default (currently `3`).
47		///
48		/// Only available with the `experimental` feature. Use `EncoderDictionary::copy` otherwise.
49		pub fn new(dictionary: &'a [u8], level: i32) -> Self {
50		Self {
51		cdict: zstd_safe::CDict::create_by_reference(dictionary, level),
52		}
53		}
54
55		/// Returns reference to `CDict` inner object
56	0	pub fn as_cdict(&self) -> &CDict<'a> {
57	0	&self.cdict
58	0	}
59		}
60
61		/// Prepared dictionary for decompression
62		pub struct DecoderDictionary<'a> {
63		ddict: DDict<'a>,
64		}
65
66		impl DecoderDictionary<'static> {
67		/// Create a prepared dictionary for decompression.
68		///
69		/// This will copy the dictionary internally.
70	0	pub fn copy(dictionary: &[u8]) -> Self {
71	0	Self {
72	0	ddict: zstd_safe::DDict::create(dictionary),
73	0	}
74	0	}
75		}
76
77		impl<'a> DecoderDictionary<'a> {
78		#[cfg(feature = "experimental")]
79		#[cfg_attr(feature = "doc-cfg", doc(cfg(feature = "experimental")))]
80		/// Create prepared dictionary for decompression
81		///
82		/// Only available with the `experimental` feature. Use `DecoderDictionary::copy` otherwise.
83		pub fn new(dict: &'a [u8]) -> Self {
84		Self {
85		ddict: zstd_safe::DDict::create_by_reference(dict),
86		}
87		}
88
89		/// Returns reference to `DDict` inner object
90	0	pub fn as_ddict(&self) -> &DDict<'a> {
91	0	&self.ddict
92	0	}
93		}
94
95		/// Train a dictionary from a big continuous chunk of data, with all samples
96		/// contiguous in memory.
97		///
98		/// This is the most efficient way to train a dictionary,
99		/// since this is directly fed into `zstd`.
100		///
101		/// * `sample_data` is the concatenation of all sample data.
102		/// * `sample_sizes` is the size of each sample in `sample_data`.
103		/// The sum of all `sample_sizes` should equal the length of `sample_data`.
104		/// * `max_size` is the maximum size of the dictionary to generate.
105		///
106		/// The result is the dictionary data. You can, for example, feed it to [`CDict::create`].
107		#[cfg(feature = "zdict_builder")]
108		#[cfg_attr(feature = "doc-cfg", doc(cfg(feature = "zdict_builder")))]
109	0	pub fn from_continuous(
110	0	sample_data: &[u8],
111	0	sample_sizes: &[usize],
112	0	max_size: usize,
113	0	) -> io::Result<Vec<u8>> {
114		use crate::map_error_code;
115
116		// Complain if the lengths don't add up to the entire data.
117	0	if sample_sizes.iter().sum::<usize>() != sample_data.len() {
118	0	return Err(io::Error::new(
119	0	io::ErrorKind::Other,
120	0	"sample sizes don't add up".to_string(),
121	0	));
122	0	}
123
124	0	let mut result = Vec::with_capacity(max_size);
125	0	zstd_safe::train_from_buffer(&mut result, sample_data, sample_sizes)
126	0	.map_err(map_error_code)?;
127	0	Ok(result)
128	0	}
129
130		/// Train a dictionary from multiple samples.
131		///
132		/// The samples will internally be copied to a single continuous buffer,
133		/// so make sure you have enough memory available.
134		///
135		/// If you need to stretch your system's limits,
136		/// [`from_continuous`] directly uses the given slice.
137		///
138		/// [`from_continuous`]: ./fn.from_continuous.html
139		///
140		/// * `samples` is a list of individual samples to train on.
141		/// * `max_size` is the maximum size of the dictionary to generate.
142		///
143		/// The result is the dictionary data. You can, for example, feed it to [`CDict::create`].
144		#[cfg(feature = "zdict_builder")]
145		#[cfg_attr(feature = "doc-cfg", doc(cfg(feature = "zdict_builder")))]
146	0	pub fn from_samples<S: AsRef<[u8]>>(
147	0	samples: &[S],
148	0	max_size: usize,
149	0	) -> io::Result<Vec<u8>> {
150		// Pre-allocate the entire required size.
151	0	let total_length: usize =
152	0	samples.iter().map(|sample| sample.as_ref().len()).sum();
153
154	0	let mut data = Vec::with_capacity(total_length);
155
156		// Copy every sample to a big chunk of memory
157	0	data.extend(samples.iter().flat_map(|s| s.as_ref()).cloned());
158
159	0	let sizes: Vec<_> = samples.iter().map(|s| s.as_ref().len()).collect();
160
161	0	from_continuous(&data, &sizes, max_size)
162	0	}
163
164		/// Train a dictionary from multiple samples.
165		///
166		/// Unlike [`from_samples`], this does not require having a list of all samples.
167		/// It also allows running into an error when iterating through the samples.
168		///
169		/// They will still be copied to a continuous array and fed to [`from_continuous`].
170		///
171		/// * `samples` is an iterator of individual samples to train on.
172		/// * `max_size` is the maximum size of the dictionary to generate.
173		///
174		/// The result is the dictionary data. You can, for example, feed it to [`CDict::create`].
175		///
176		/// # Examples
177		///
178		/// ```rust,no_run
179		/// // Train from a couple of json files.
180		/// let dict_buffer = zstd::dict::from_sample_iterator(
181		/// ["file_a.json", "file_b.json"]
182		/// .into_iter()
183		/// .map(|filename| std::fs::File::open(filename)),
184		/// 10_000, // 10kB dictionary
185		/// ).unwrap();
186		/// ```
187		///
188		/// ```rust,no_run
189		/// use std::io::BufRead as _;
190		/// // Treat each line from stdin as a separate sample.
191		/// let dict_buffer = zstd::dict::from_sample_iterator(
192		/// std::io::stdin().lock().lines().map(|line: std::io::Result<String>| {
193		/// // Transform each line into a `Cursor<Vec<u8>>` so they implement Read.
194		/// line.map(String::into_bytes)
195		/// .map(std::io::Cursor::new)
196		/// }),
197		/// 10_000, // 10kB dictionary
198		/// ).unwrap();
199		/// ```
200		#[cfg(feature = "zdict_builder")]
201		#[cfg_attr(feature = "doc-cfg", doc(cfg(feature = "zdict_builder")))]
202	0	pub fn from_sample_iterator<I, R>(
203	0	samples: I,
204	0	max_size: usize,
205	0	) -> io::Result<Vec<u8>>
206	0	where
207	0	I: IntoIterator<Item = io::Result<R>>,
208	0	R: Read,
209		{
210	0	let mut data = Vec::new();
211	0	let mut sizes = Vec::new();
212
213	0	for sample in samples {
214	0	let mut sample = sample?;
215	0	let len = sample.read_to_end(&mut data)?;
216	0	sizes.push(len);
217		}
218
219	0	from_continuous(&data, &sizes, max_size)
220	0	}
221
222		/// Train a dict from a list of files.
223		///
224		/// * `filenames` is an iterator of files to load. Each file will be treated as an individual
225		/// sample.
226		/// * `max_size` is the maximum size of the dictionary to generate.
227		///
228		/// The result is the dictionary data. You can, for example, feed it to [`CDict::create`].
229		#[cfg(feature = "zdict_builder")]
230		#[cfg_attr(feature = "doc-cfg", doc(cfg(feature = "zdict_builder")))]
231	0	pub fn from_files<I, P>(filenames: I, max_size: usize) -> io::Result<Vec<u8>>
232	0	where
233	0	P: AsRef<std::path::Path>,
234	0	I: IntoIterator<Item = P>,
235		{
236	0	from_sample_iterator(
237	0	filenames
238	0	.into_iter()
239	0	.map(|filename| std::fs::File::open(filename)),
240	0	max_size,
241		)
242	0	}
243
244		#[cfg(test)]
245		#[cfg(feature = "zdict_builder")]
246		mod tests {
247		use std::fs;
248		use std::io;
249		use std::io::Read;
250
251		use walkdir;
252
253		#[test]
254		fn test_dict_training() {
255		// Train a dictionary
256		let paths: Vec<_> = walkdir::WalkDir::new("src")
257		.into_iter()
258		.map(|entry| entry.unwrap())
259		.map(|entry| entry.into_path())
260		.filter(|path| path.to_str().unwrap().ends_with(".rs"))
261		.collect();
262
263		let dict = super::from_files(&paths, 4000).unwrap();
264
265		for path in paths {
266		let mut buffer = Vec::new();
267		let mut file = fs::File::open(path).unwrap();
268		let mut content = Vec::new();
269		file.read_to_end(&mut content).unwrap();
270		io::copy(
271		&mut &content[..],
272		&mut crate::stream::Encoder::with_dictionary(
273		&mut buffer,
274		1,
275		&dict,
276		)
277		.unwrap()
278		.auto_finish(),
279		)
280		.unwrap();
281
282		let mut result = Vec::new();
283		io::copy(
284		&mut crate::stream::Decoder::with_dictionary(
285		&buffer[..],
286		&dict[..],
287		)
288		.unwrap(),
289		&mut result,
290		)
291		.unwrap();
292
293		assert_eq!(&content, &result);
294		}
295		}
296		}