/rust/registry/src/index.crates.io-1949cf8c6b5b557f/zstd-0.13.3/src/dict.rs
Line | Count | Source |
1 | | //! Train a dictionary from various sources. |
2 | | //! |
3 | | //! A dictionary can help improve the compression of small files. |
4 | | //! The dictionary must be present during decompression, |
5 | | //! but can be shared across multiple "similar" files. |
6 | | //! |
7 | | //! Creating a dictionary using the `zstd` C library, |
8 | | //! using the `zstd` command-line interface, using this library, |
9 | | //! or using the `train` binary provided, should give the same result, |
10 | | //! and are therefore completely compatible. |
11 | | //! |
12 | | //! To use, see [`Encoder::with_dictionary`] or [`Decoder::with_dictionary`]. |
13 | | //! |
14 | | //! [`Encoder::with_dictionary`]: ../struct.Encoder.html#method.with_dictionary |
15 | | //! [`Decoder::with_dictionary`]: ../struct.Decoder.html#method.with_dictionary |
16 | | |
17 | | #[cfg(feature = "zdict_builder")] |
18 | | use std::io::{self, Read}; |
19 | | |
20 | | pub use zstd_safe::{CDict, DDict}; |
21 | | |
22 | | /// Prepared dictionary for compression |
23 | | /// |
24 | | /// A dictionary can include its own copy of the data (if it is `'static`), or it can merely point |
25 | | /// to a separate buffer (if it has another lifetime). |
26 | | pub struct EncoderDictionary<'a> { |
27 | | cdict: CDict<'a>, |
28 | | } |
29 | | |
30 | | impl EncoderDictionary<'static> { |
31 | | /// Creates a prepared dictionary for compression. |
32 | | /// |
33 | | /// This will copy the dictionary internally. |
34 | 0 | pub fn copy(dictionary: &[u8], level: i32) -> Self { |
35 | 0 | Self { |
36 | 0 | cdict: zstd_safe::create_cdict(dictionary, level), |
37 | 0 | } |
38 | 0 | } |
39 | | } |
40 | | |
41 | | impl<'a> EncoderDictionary<'a> { |
42 | | #[cfg(feature = "experimental")] |
43 | | #[cfg_attr(feature = "doc-cfg", doc(cfg(feature = "experimental")))] |
44 | | /// Create prepared dictionary for compression |
45 | | /// |
46 | | /// A level of `0` uses zstd's default (currently `3`). |
47 | | /// |
48 | | /// Only available with the `experimental` feature. Use `EncoderDictionary::copy` otherwise. |
49 | | pub fn new(dictionary: &'a [u8], level: i32) -> Self { |
50 | | Self { |
51 | | cdict: zstd_safe::CDict::create_by_reference(dictionary, level), |
52 | | } |
53 | | } |
54 | | |
55 | | /// Returns reference to `CDict` inner object |
56 | 0 | pub fn as_cdict(&self) -> &CDict<'a> { |
57 | 0 | &self.cdict |
58 | 0 | } |
59 | | } |
60 | | |
61 | | /// Prepared dictionary for decompression |
62 | | pub struct DecoderDictionary<'a> { |
63 | | ddict: DDict<'a>, |
64 | | } |
65 | | |
66 | | impl DecoderDictionary<'static> { |
67 | | /// Create a prepared dictionary for decompression. |
68 | | /// |
69 | | /// This will copy the dictionary internally. |
70 | 0 | pub fn copy(dictionary: &[u8]) -> Self { |
71 | 0 | Self { |
72 | 0 | ddict: zstd_safe::DDict::create(dictionary), |
73 | 0 | } |
74 | 0 | } |
75 | | } |
76 | | |
77 | | impl<'a> DecoderDictionary<'a> { |
78 | | #[cfg(feature = "experimental")] |
79 | | #[cfg_attr(feature = "doc-cfg", doc(cfg(feature = "experimental")))] |
80 | | /// Create prepared dictionary for decompression |
81 | | /// |
82 | | /// Only available with the `experimental` feature. Use `DecoderDictionary::copy` otherwise. |
83 | | pub fn new(dict: &'a [u8]) -> Self { |
84 | | Self { |
85 | | ddict: zstd_safe::DDict::create_by_reference(dict), |
86 | | } |
87 | | } |
88 | | |
89 | | /// Returns reference to `DDict` inner object |
90 | 0 | pub fn as_ddict(&self) -> &DDict<'a> { |
91 | 0 | &self.ddict |
92 | 0 | } |
93 | | } |
94 | | |
/// Train a dictionary from a big continuous chunk of data, with all samples
/// contiguous in memory.
///
/// This is the most efficient way to train a dictionary,
/// since this is directly fed into `zstd`.
///
/// * `sample_data` is the concatenation of all sample data.
/// * `sample_sizes` is the size of each sample in `sample_data`.
///     The sum of all `sample_sizes` must equal the length of `sample_data`.
/// * `max_size` is the maximum size of the dictionary to generate.
///
/// The result is the dictionary data. You can, for example, feed it to [`CDict::create`].
///
/// # Errors
///
/// Returns an error of kind [`io::ErrorKind::Other`] when the entries of
/// `sample_sizes` do not add up to `sample_data.len()`; a sum that would
/// overflow `usize` is treated the same way. Any error reported by the
/// training step itself is converted with `map_error_code` and returned.
#[cfg(feature = "zdict_builder")]
#[cfg_attr(feature = "doc-cfg", doc(cfg(feature = "zdict_builder")))]
pub fn from_continuous(
    sample_data: &[u8],
    sample_sizes: &[usize],
    max_size: usize,
) -> io::Result<Vec<u8>> {
    use crate::map_error_code;

    // Add the sizes up with overflow checking. A plain `sum()` panics in debug
    // builds and wraps in release builds, and a wrapped total could happen to
    // match `sample_data.len()` even though the individual sizes are bogus.
    let declared_total = sample_sizes
        .iter()
        .try_fold(0usize, |total, &size| total.checked_add(size));

    // Complain if the lengths don't add up to the entire data.
    if declared_total != Some(sample_data.len()) {
        return Err(io::Error::new(
            io::ErrorKind::Other,
            "sample sizes don't add up",
        ));
    }

    // The dictionary is written into a buffer allocated for `max_size` bytes.
    let mut result = Vec::with_capacity(max_size);
    zstd_safe::train_from_buffer(&mut result, sample_data, sample_sizes)
        .map_err(map_error_code)?;
    Ok(result)
}
129 | | |
/// Train a dictionary from a list of separate samples.
///
/// All samples are first copied into one continuous buffer,
/// so make sure you have enough memory available for that copy.
///
/// If you need to stretch your system's limits,
/// [`from_continuous`] directly uses the given slice.
///
/// [`from_continuous`]: ./fn.from_continuous.html
///
/// * `samples` is a list of individual samples to train on.
/// * `max_size` is the maximum size of the dictionary to generate.
///
/// The result is the dictionary data. You can, for example, feed it to [`CDict::create`].
#[cfg(feature = "zdict_builder")]
#[cfg_attr(feature = "doc-cfg", doc(cfg(feature = "zdict_builder")))]
pub fn from_samples<S: AsRef<[u8]>>(
    samples: &[S],
    max_size: usize,
) -> io::Result<Vec<u8>> {
    // Length of every sample, in input order.
    let sizes: Vec<usize> = samples
        .iter()
        .map(|sample| sample.as_ref().len())
        .collect();

    // Pre-allocate the entire required size.
    let mut data = Vec::with_capacity(sizes.iter().sum::<usize>());

    // Append every sample to the big chunk of memory.
    for sample in samples {
        data.extend_from_slice(sample.as_ref());
    }

    from_continuous(&data, &sizes, max_size)
}
163 | | |
/// Train a dictionary from an iterator of readable samples.
///
/// Unlike [`from_samples`], this does not require having a list of all samples.
/// Each item is an `io::Result`, so producing a sample is allowed to fail;
/// the first error (from the iterator or from reading a sample) is returned.
///
/// The samples are still copied to a continuous array and fed to [`from_continuous`].
///
/// * `samples` is an iterator of individual samples to train on.
/// * `max_size` is the maximum size of the dictionary to generate.
///
/// The result is the dictionary data. You can, for example, feed it to [`CDict::create`].
///
/// # Examples
///
/// ```rust,no_run
/// // Train from a couple of json files.
/// let dict_buffer = zstd::dict::from_sample_iterator(
///     ["file_a.json", "file_b.json"]
///         .into_iter()
///         .map(|filename| std::fs::File::open(filename)),
///     10_000,  // 10kB dictionary
/// ).unwrap();
/// ```
///
/// ```rust,no_run
/// use std::io::BufRead as _;
/// // Treat each line from stdin as a separate sample.
/// let dict_buffer = zstd::dict::from_sample_iterator(
///     std::io::stdin().lock().lines().map(|line: std::io::Result<String>| {
///         // Transform each line into a `Cursor<Vec<u8>>` so they implement Read.
///         line.map(String::into_bytes)
///             .map(std::io::Cursor::new)
///     }),
///     10_000,  // 10kB dictionary
/// ).unwrap();
/// ```
#[cfg(feature = "zdict_builder")]
#[cfg_attr(feature = "doc-cfg", doc(cfg(feature = "zdict_builder")))]
pub fn from_sample_iterator<I, R>(
    samples: I,
    max_size: usize,
) -> io::Result<Vec<u8>>
where
    I: IntoIterator<Item = io::Result<R>>,
    R: Read,
{
    // All samples end up back to back in this buffer.
    let mut buffer: Vec<u8> = Vec::new();

    // Drain each reader into `buffer`, recording how many bytes it added.
    // Collecting into `io::Result` stops at the first failing item.
    let lengths = samples
        .into_iter()
        .map(|sample| sample?.read_to_end(&mut buffer))
        .collect::<io::Result<Vec<usize>>>()?;

    from_continuous(&buffer, &lengths, max_size)
}
221 | | |
/// Train a dictionary from the content of a set of files.
///
/// * `filenames` is an iterator of files to load. Each file will be treated as an individual
///     sample.
/// * `max_size` is the maximum size of the dictionary to generate.
///
/// Every path is opened with `std::fs::File::open`; a file that cannot be
/// opened surfaces as the returned error.
///
/// The result is the dictionary data. You can, for example, feed it to [`CDict::create`].
#[cfg(feature = "zdict_builder")]
#[cfg_attr(feature = "doc-cfg", doc(cfg(feature = "zdict_builder")))]
pub fn from_files<I, P>(filenames: I, max_size: usize) -> io::Result<Vec<u8>>
where
    P: AsRef<std::path::Path>,
    I: IntoIterator<Item = P>,
{
    // Lazily turn each path into an `io::Result<File>` sample.
    let opened_files = filenames.into_iter().map(std::fs::File::open);

    from_sample_iterator(opened_files, max_size)
}
243 | | |
#[cfg(test)]
#[cfg(feature = "zdict_builder")]
mod tests {
    use std::fs;
    use std::io;

    use walkdir;

    /// Trains a dictionary on this crate's own `.rs` files, then checks that
    /// each file survives a compress/decompress round trip with it.
    #[test]
    fn test_dict_training() {
        // Collect every Rust source file below `src`.
        let sources: Vec<_> = walkdir::WalkDir::new("src")
            .into_iter()
            .map(|entry| entry.unwrap().into_path())
            .filter(|path| path.to_str().unwrap().ends_with(".rs"))
            .collect();

        // Train a dictionary
        let dict = super::from_files(&sources, 4000).unwrap();

        for source in sources {
            let original = fs::read(source).unwrap();

            // Compress with the trained dictionary; the encoder finishes the
            // frame when it is dropped at the end of this scope.
            let mut compressed = Vec::new();
            {
                let mut encoder = crate::stream::Encoder::with_dictionary(
                    &mut compressed,
                    1,
                    &dict,
                )
                .unwrap()
                .auto_finish();
                io::copy(&mut original.as_slice(), &mut encoder).unwrap();
            }

            // Decompress with the same dictionary.
            let mut decoder = crate::stream::Decoder::with_dictionary(
                compressed.as_slice(),
                dict.as_slice(),
            )
            .unwrap();
            let mut restored = Vec::new();
            io::copy(&mut decoder, &mut restored).unwrap();

            assert_eq!(&original, &restored);
        }
    }
}