/src/xz/src/liblzma/lz/lz_encoder.h
Line | Count | Source |
1 | | // SPDX-License-Identifier: 0BSD |
2 | | |
3 | | /////////////////////////////////////////////////////////////////////////////// |
4 | | // |
5 | | /// \file lz_encoder.h |
6 | | /// \brief LZ in window and match finder API |
7 | | /// |
8 | | // Authors: Igor Pavlov |
9 | | // Lasse Collin |
10 | | // |
11 | | /////////////////////////////////////////////////////////////////////////////// |
12 | | |
13 | | #ifndef LZMA_LZ_ENCODER_H |
14 | | #define LZMA_LZ_ENCODER_H |
15 | | |
16 | | #include "common.h" |
17 | | |
18 | | |
19 | | // For now, the dictionary size is limited to 1.5 GiB. This may grow |
20 | | // in the future if needed, but it needs a little more work than just |
21 | | // changing this check. |
22 | | #define IS_ENC_DICT_SIZE_VALID(size) \ |
23 | 0 | ((size) >= LZMA_DICT_SIZE_MIN \ |
24 | 0 | && (size) <= (UINT32_C(1) << 30) + (UINT32_C(1) << 29)) |
25 | | |
26 | | |
27 | | /// A table of these is used by the LZ-based encoder to hold |
28 | | /// the length-distance pairs found by the match finder. |
29 | | typedef struct { |
30 | | uint32_t len; |
31 | | uint32_t dist; |
32 | | } lzma_match; |
33 | | |
34 | | |
35 | | typedef struct lzma_mf_s lzma_mf; |
36 | | struct lzma_mf_s { |
37 | | /////////////// |
38 | | // In Window // |
39 | | /////////////// |
40 | | |
41 | | /// Pointer to buffer with data to be compressed |
42 | | uint8_t *buffer; |
43 | | |
44 | | /// Total size of the allocated buffer (that is, including all |
45 | | /// the extra space) |
46 | | uint32_t size; |
47 | | |
48 | | /// Number of bytes that must be kept available in our input history. |
49 | | /// That is, once keep_size_before bytes have been processed, |
50 | | /// buffer[read_pos - keep_size_before] is the oldest byte that |
51 | | /// must be available for reading. |
52 | | uint32_t keep_size_before; |
53 | | |
54 | | /// Number of bytes that must be kept in buffer after read_pos. |
55 | | /// That is, read_pos <= write_pos - keep_size_after as long as |
56 | | /// action is LZMA_RUN; when action != LZMA_RUN, read_pos is allowed |
57 | | /// to reach write_pos so that the last bytes get encoded too. |
58 | | uint32_t keep_size_after; |
59 | | |
60 | | /// Match finders store locations of matches using 32-bit integers. |
61 | | /// To avoid adjusting several megabytes of integers every time the |
62 | | /// input window is moved with move_window, we only adjust the |
63 | | /// offset of the buffer. Thus, buffer[value_in_hash_table - offset] |
64 | | /// is the byte pointed by value_in_hash_table. |
65 | | uint32_t offset; |
66 | | |
67 | | /// buffer[read_pos] is the next byte to run through the match |
68 | | /// finder. This is incremented in the match finder once the byte |
69 | | /// has been processed. |
70 | | uint32_t read_pos; |
71 | | |
72 | | /// Number of bytes that have been ran through the match finder, but |
73 | | /// which haven't been encoded by the LZ-based encoder yet. |
74 | | uint32_t read_ahead; |
75 | | |
76 | | /// As long as read_pos is less than read_limit, there is enough |
77 | | /// input available in buffer for at least one encoding loop. |
78 | | /// |
79 | | /// Because of the stateful API, read_limit may and will get greater |
80 | | /// than read_pos quite often. This is taken into account when |
81 | | /// calculating the value for keep_size_after. |
82 | | uint32_t read_limit; |
83 | | |
84 | | /// buffer[write_pos] is the first byte that doesn't contain valid |
85 | | /// uncompressed data; that is, the next input byte will be copied |
86 | | /// to buffer[write_pos]. |
87 | | uint32_t write_pos; |
88 | | |
89 | | /// Number of bytes not hashed before read_pos. This is needed to |
90 | | /// restart the match finder after LZMA_SYNC_FLUSH. |
91 | | uint32_t pending; |
92 | | |
93 | | ////////////////// |
94 | | // Match Finder // |
95 | | ////////////////// |
96 | | |
97 | | /// Find matches. Returns the number of distance-length pairs written |
98 | | /// to the matches array. This is called only via lzma_mf_find(). |
99 | | uint32_t (*find)(lzma_mf *mf, lzma_match *matches); |
100 | | |
101 | | /// Skips num bytes. This is like find() but doesn't make the |
102 | | /// distance-length pairs available, thus being a little faster. |
103 | | /// This is called only via mf_skip(). |
104 | | void (*skip)(lzma_mf *mf, uint32_t num); |
105 | | |
106 | | uint32_t *hash; |
107 | | uint32_t *son; |
108 | | uint32_t cyclic_pos; |
109 | | uint32_t cyclic_size; // Must be dictionary size + 1. |
110 | | uint32_t hash_mask; |
111 | | |
112 | | /// Maximum number of loops in the match finder |
113 | | uint32_t depth; |
114 | | |
115 | | /// Maximum length of a match that the match finder will try to find. |
116 | | uint32_t nice_len; |
117 | | |
118 | | /// Maximum length of a match supported by the LZ-based encoder. |
119 | | /// If the longest match found by the match finder is nice_len, |
120 | | /// mf_find() tries to expand it up to match_len_max bytes. |
121 | | uint32_t match_len_max; |
122 | | |
123 | | /// When running out of input, binary tree match finders need to know |
124 | | /// if it is due to flushing or finishing. The action is used also |
125 | | /// by the LZ-based encoders themselves. |
126 | | lzma_action action; |
127 | | |
128 | | /// Number of elements in hash[] |
129 | | uint32_t hash_count; |
130 | | |
131 | | /// Number of elements in son[] |
132 | | uint32_t sons_count; |
133 | | }; |
134 | | |
135 | | |
136 | | typedef struct { |
137 | | /// Extra amount of data to keep available before the "actual" |
138 | | /// dictionary. |
139 | | size_t before_size; |
140 | | |
141 | | /// Size of the history buffer |
142 | | size_t dict_size; |
143 | | |
144 | | /// Extra amount of data to keep available after the "actual" |
145 | | /// dictionary. |
146 | | size_t after_size; |
147 | | |
148 | | /// Maximum length of a match that the LZ-based encoder can accept. |
149 | | /// This is used to extend matches of length nice_len to the |
150 | | /// maximum possible length. |
151 | | size_t match_len_max; |
152 | | |
153 | | /// Match finder will search matches up to this length. |
154 | | /// This must be less than or equal to match_len_max. |
155 | | size_t nice_len; |
156 | | |
157 | | /// Type of the match finder to use |
158 | | lzma_match_finder match_finder; |
159 | | |
160 | | /// Maximum search depth |
161 | | uint32_t depth; |
162 | | |
163 | | /// Initial dictionary for the match finder to search. |
164 | | const uint8_t *preset_dict; |
165 | | |
166 | | /// If the preset dictionary is NULL, this value is ignored. |
167 | | /// Otherwise this member must indicate the preset dictionary's |
168 | | /// buffer size. If this size is larger than dict_size, then only |
169 | | /// the dict_size sized tail of the preset_dict will be used. |
170 | | uint32_t preset_dict_size; |
171 | | |
172 | | } lzma_lz_options; |
173 | | |
174 | | |
175 | | // The total usable buffer space at any moment outside the match finder: |
176 | | // before_size + dict_size + after_size + match_len_max |
177 | | // |
178 | | // In reality, there's some extra space allocated to prevent the number of |
179 | | // memmove() calls reasonable. The bigger the dict_size is, the bigger |
180 | | // this extra buffer will be since with bigger dictionaries memmove() would |
181 | | // also take longer. |
182 | | // |
183 | | // A single encoder loop in the LZ-based encoder may call the match finder |
184 | | // (mf_find() or mf_skip()) at most after_size times. In other words, |
185 | | // a single encoder loop may increment lzma_mf.read_pos at most after_size |
186 | | // times. Since matches are looked up to |
187 | | // lzma_mf.buffer[lzma_mf.read_pos + match_len_max - 1], the total |
188 | | // amount of extra buffer needed after dict_size becomes |
189 | | // after_size + match_len_max. |
190 | | // |
191 | | // before_size has two uses. The first one is to keep literals available |
192 | | // in cases when the LZ-based encoder has made some read ahead. |
193 | | // TODO: Maybe this could be changed by making the LZ-based encoders to |
194 | | // store the actual literals as they do with length-distance pairs. |
195 | | // |
196 | | // Algorithms such as LZMA2 first try to compress a chunk, and then check |
197 | | // if the encoded result is smaller than the uncompressed one. If the chunk |
198 | | // was incompressible, it is better to store it in uncompressed form in |
199 | | // the output stream. To do this, the whole uncompressed chunk has to be |
200 | | // still available in the history buffer. before_size achieves that. |
201 | | |
202 | | |
203 | | typedef struct { |
204 | | /// Data specific to the LZ-based encoder |
205 | | void *coder; |
206 | | |
207 | | /// Function to encode from *dict to out[] |
208 | | lzma_ret (*code)(void *coder, |
209 | | lzma_mf *restrict mf, uint8_t *restrict out, |
210 | | size_t *restrict out_pos, size_t out_size); |
211 | | |
212 | | /// Free allocated resources |
213 | | void (*end)(void *coder, const lzma_allocator *allocator); |
214 | | |
215 | | /// Update the options in the middle of the encoding. |
216 | | lzma_ret (*options_update)(void *coder, const lzma_filter *filter); |
217 | | |
218 | | /// Set maximum allowed output size |
219 | | lzma_ret (*set_out_limit)(void *coder, uint64_t *uncomp_size, |
220 | | uint64_t out_limit); |
221 | | |
222 | | } lzma_lz_encoder; |
223 | | |
224 | | |
225 | | // Basic steps: |
226 | | // 1. Input gets copied into the dictionary. |
227 | | // 2. Data in dictionary gets run through the match finder byte by byte. |
228 | | // 3. The literals and matches are encoded using e.g. LZMA. |
229 | | // |
230 | | // The bytes that have been ran through the match finder, but not encoded yet, |
231 | | // are called 'read ahead'. |
232 | | |
233 | | |
234 | | /// Get how many bytes the match finder hashes in its initial step. |
235 | | /// This is also the minimum nice_len value with the match finder. |
236 | | static inline uint32_t |
237 | | mf_get_hash_bytes(lzma_match_finder match_finder) |
238 | 0 | { |
239 | 0 | return (uint32_t)match_finder & 0x0F; |
240 | 0 | } Unexecuted instantiation: lzma_encoder.c:mf_get_hash_bytes Unexecuted instantiation: lz_encoder.c:mf_get_hash_bytes Unexecuted instantiation: lz_encoder_mf.c:mf_get_hash_bytes Unexecuted instantiation: lzma_encoder_optimum_fast.c:mf_get_hash_bytes Unexecuted instantiation: lzma_encoder_optimum_normal.c:mf_get_hash_bytes Unexecuted instantiation: lzma2_encoder.c:mf_get_hash_bytes |
241 | | |
242 | | |
243 | | /// Get pointer to the first byte not ran through the match finder |
244 | | static inline const uint8_t * |
245 | | mf_ptr(const lzma_mf *mf) |
246 | 0 | { |
247 | 0 | return mf->buffer + mf->read_pos; |
248 | 0 | } Unexecuted instantiation: lzma_encoder.c:mf_ptr Unexecuted instantiation: lz_encoder.c:mf_ptr Unexecuted instantiation: lz_encoder_mf.c:mf_ptr Unexecuted instantiation: lzma_encoder_optimum_fast.c:mf_ptr Unexecuted instantiation: lzma_encoder_optimum_normal.c:mf_ptr Unexecuted instantiation: lzma2_encoder.c:mf_ptr |
249 | | |
250 | | |
251 | | /// Get the number of bytes that haven't been ran through the match finder yet. |
252 | | static inline uint32_t |
253 | | mf_avail(const lzma_mf *mf) |
254 | 0 | { |
255 | 0 | return mf->write_pos - mf->read_pos; |
256 | 0 | } Unexecuted instantiation: lzma_encoder.c:mf_avail Unexecuted instantiation: lz_encoder.c:mf_avail Unexecuted instantiation: lz_encoder_mf.c:mf_avail Unexecuted instantiation: lzma_encoder_optimum_fast.c:mf_avail Unexecuted instantiation: lzma_encoder_optimum_normal.c:mf_avail Unexecuted instantiation: lzma2_encoder.c:mf_avail |
257 | | |
258 | | |
259 | | /// Get the number of bytes that haven't been encoded yet (some of these |
260 | | /// bytes may have been ran through the match finder though). |
261 | | static inline uint32_t |
262 | | mf_unencoded(const lzma_mf *mf) |
263 | 0 | { |
264 | 0 | return mf->write_pos - mf->read_pos + mf->read_ahead; |
265 | 0 | } Unexecuted instantiation: lzma_encoder.c:mf_unencoded Unexecuted instantiation: lz_encoder.c:mf_unencoded Unexecuted instantiation: lz_encoder_mf.c:mf_unencoded Unexecuted instantiation: lzma_encoder_optimum_fast.c:mf_unencoded Unexecuted instantiation: lzma_encoder_optimum_normal.c:mf_unencoded Unexecuted instantiation: lzma2_encoder.c:mf_unencoded |
266 | | |
267 | | |
268 | | /// Calculate the absolute offset from the beginning of the most recent |
269 | | /// dictionary reset. Only the lowest four bits are important, so there's no |
270 | | /// problem that we don't know the 64-bit size of the data encoded so far. |
271 | | /// |
272 | | /// NOTE: When moving the input window, we need to do it so that the lowest |
273 | | /// bits of dict->read_pos are not modified to keep this macro working |
274 | | /// as intended. |
275 | | static inline uint32_t |
276 | | mf_position(const lzma_mf *mf) |
277 | 0 | { |
278 | 0 | return mf->read_pos - mf->read_ahead; |
279 | 0 | } Unexecuted instantiation: lzma_encoder.c:mf_position Unexecuted instantiation: lz_encoder.c:mf_position Unexecuted instantiation: lz_encoder_mf.c:mf_position Unexecuted instantiation: lzma_encoder_optimum_fast.c:mf_position Unexecuted instantiation: lzma_encoder_optimum_normal.c:mf_position Unexecuted instantiation: lzma2_encoder.c:mf_position |
280 | | |
281 | | |
282 | | /// Since everything else begins with mf_, use it also for lzma_mf_find(). |
283 | 0 | #define mf_find lzma_mf_find |
284 | | |
285 | | |
286 | | /// Skip the given number of bytes. This is used when a good match was found. |
287 | | /// For example, if mf_find() finds a match of 200 bytes long, the first byte |
288 | | /// of that match was already consumed by mf_find(), and the rest 199 bytes |
289 | | /// have to be skipped with mf_skip(mf, 199). |
290 | | static inline void |
291 | | mf_skip(lzma_mf *mf, uint32_t amount) |
292 | 0 | { |
293 | 0 | if (amount != 0) { |
294 | 0 | mf->skip(mf, amount); |
295 | 0 | mf->read_ahead += amount; |
296 | 0 | } |
297 | 0 | } Unexecuted instantiation: lzma_encoder.c:mf_skip Unexecuted instantiation: lz_encoder.c:mf_skip Unexecuted instantiation: lz_encoder_mf.c:mf_skip Unexecuted instantiation: lzma_encoder_optimum_fast.c:mf_skip Unexecuted instantiation: lzma_encoder_optimum_normal.c:mf_skip Unexecuted instantiation: lzma2_encoder.c:mf_skip |
298 | | |
299 | | |
300 | | /// Copies at most *left number of bytes from the history buffer |
301 | | /// to out[]. This is needed by LZMA2 to encode uncompressed chunks. |
302 | | static inline void |
303 | | mf_read(lzma_mf *mf, uint8_t *out, size_t *out_pos, size_t out_size, |
304 | | size_t *left) |
305 | 0 | { |
306 | 0 | const size_t out_avail = out_size - *out_pos; |
307 | 0 | const size_t copy_size = my_min(out_avail, *left); |
308 | |
|
309 | 0 | assert(mf->read_ahead == 0); |
310 | 0 | assert(mf->read_pos >= *left); |
311 | |
|
312 | 0 | memcpy(out + *out_pos, mf->buffer + mf->read_pos - *left, |
313 | 0 | copy_size); |
314 | |
|
315 | 0 | *out_pos += copy_size; |
316 | 0 | *left -= copy_size; |
317 | 0 | return; |
318 | 0 | } Unexecuted instantiation: lzma_encoder.c:mf_read Unexecuted instantiation: lz_encoder.c:mf_read Unexecuted instantiation: lz_encoder_mf.c:mf_read Unexecuted instantiation: lzma_encoder_optimum_fast.c:mf_read Unexecuted instantiation: lzma_encoder_optimum_normal.c:mf_read Unexecuted instantiation: lzma2_encoder.c:mf_read |
319 | | |
320 | | |
321 | | extern lzma_ret lzma_lz_encoder_init( |
322 | | lzma_next_coder *next, const lzma_allocator *allocator, |
323 | | const lzma_filter_info *filters, |
324 | | lzma_ret (*lz_init)(lzma_lz_encoder *lz, |
325 | | const lzma_allocator *allocator, |
326 | | lzma_vli id, const void *options, |
327 | | lzma_lz_options *lz_options)); |
328 | | |
329 | | |
330 | | extern uint64_t lzma_lz_encoder_memusage(const lzma_lz_options *lz_options); |
331 | | |
332 | | |
333 | | // These are only for LZ encoder's internal use. |
334 | | extern uint32_t lzma_mf_find( |
335 | | lzma_mf *mf, uint32_t *count, lzma_match *matches); |
336 | | |
337 | | extern uint32_t lzma_mf_hc3_find(lzma_mf *dict, lzma_match *matches); |
338 | | extern void lzma_mf_hc3_skip(lzma_mf *dict, uint32_t amount); |
339 | | |
340 | | extern uint32_t lzma_mf_hc4_find(lzma_mf *dict, lzma_match *matches); |
341 | | extern void lzma_mf_hc4_skip(lzma_mf *dict, uint32_t amount); |
342 | | |
343 | | extern uint32_t lzma_mf_bt2_find(lzma_mf *dict, lzma_match *matches); |
344 | | extern void lzma_mf_bt2_skip(lzma_mf *dict, uint32_t amount); |
345 | | |
346 | | extern uint32_t lzma_mf_bt3_find(lzma_mf *dict, lzma_match *matches); |
347 | | extern void lzma_mf_bt3_skip(lzma_mf *dict, uint32_t amount); |
348 | | |
349 | | extern uint32_t lzma_mf_bt4_find(lzma_mf *dict, lzma_match *matches); |
350 | | extern void lzma_mf_bt4_skip(lzma_mf *dict, uint32_t amount); |
351 | | |
352 | | #endif |