/src/libdeflate/lib/deflate_compress.c
Line | Count | Source |
1 | | /* |
2 | | * deflate_compress.c - a compressor for DEFLATE |
3 | | * |
4 | | * Copyright 2016 Eric Biggers |
5 | | * |
6 | | * Permission is hereby granted, free of charge, to any person |
7 | | * obtaining a copy of this software and associated documentation |
8 | | * files (the "Software"), to deal in the Software without |
9 | | * restriction, including without limitation the rights to use, |
10 | | * copy, modify, merge, publish, distribute, sublicense, and/or sell |
11 | | * copies of the Software, and to permit persons to whom the |
12 | | * Software is furnished to do so, subject to the following |
13 | | * conditions: |
14 | | * |
15 | | * The above copyright notice and this permission notice shall be |
16 | | * included in all copies or substantial portions of the Software. |
17 | | * |
18 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
19 | | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES |
20 | | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
21 | | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT |
22 | | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |
23 | | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
24 | | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
25 | | * OTHER DEALINGS IN THE SOFTWARE. |
26 | | */ |
27 | | |
28 | | #include "deflate_compress.h" |
29 | | #include "deflate_constants.h" |
30 | | |
31 | | /******************************************************************************/ |
32 | | |
33 | | /* |
34 | | * The following parameters can be changed at build time to customize the |
35 | | * compression algorithms slightly: |
36 | | * |
37 | | * (Note, not all customizable parameters are here. Some others can be found in |
38 | | * libdeflate_alloc_compressor() and in *_matchfinder.h.) |
39 | | */ |
40 | | |
41 | | /* |
42 | | * If this parameter is defined to 1, then the near-optimal parsing algorithm |
43 | | * will be included, and compression levels 10-12 will use it. This algorithm |
44 | | * usually produces a compression ratio significantly better than the other |
45 | | * algorithms. However, it is slow. If this parameter is defined to 0, then |
46 | | * levels 10-12 will be the same as level 9 and will use the lazy2 algorithm. |
47 | | */ |
48 | | #define SUPPORT_NEAR_OPTIMAL_PARSING 1 |
49 | | |
50 | | /* |
51 | | * This is the minimum block length that the compressor will use, in |
52 | | * uncompressed bytes. This should be a value below which using shorter blocks |
53 | | * is unlikely to be worthwhile, due to the per-block overhead. This value does |
54 | | * not apply to the final block, which may be shorter than this (if the input is |
55 | | * shorter, it will have to be), or to the final uncompressed block in a series |
56 | | * of uncompressed blocks that cover more than UINT16_MAX bytes. |
57 | | * |
58 | | * This value is also approximately the amount by which what would otherwise be |
59 | | * the second-to-last block is allowed to grow past the soft maximum length in |
60 | | * order to avoid having to use a very short final block. |
61 | | * |
62 | | * Defining a fixed minimum block length is needed in order to guarantee a |
63 | | * reasonable upper bound on the compressed size. It's also needed because our |
64 | | * block splitting algorithm doesn't work well on very short blocks. |
65 | | */ |
66 | 0 | #define MIN_BLOCK_LENGTH 5000 |
67 | | |
68 | | /* |
69 | | * For the greedy, lazy, lazy2, and near-optimal compressors: This is the soft |
70 | | * maximum block length, in uncompressed bytes. The compressor will try to end |
71 | | * blocks at this length, but it may go slightly past it if there is a match |
72 | | * that straddles this limit or if the input data ends soon after this limit. |
73 | | * This parameter doesn't apply to uncompressed blocks, which the DEFLATE format |
74 | | * limits to 65535 bytes. |
75 | | * |
76 | | * This should be a value above which it is very likely that splitting the block |
77 | | * would produce a better compression ratio. For the near-optimal compressor, |
78 | | * increasing/decreasing this parameter will increase/decrease per-compressor |
79 | | * memory usage linearly. |
80 | | */ |
81 | 0 | #define SOFT_MAX_BLOCK_LENGTH 300000 |
82 | | |
83 | | /* |
84 | | * For the greedy, lazy, and lazy2 compressors: this is the length of the |
85 | | * sequence store, which is an array where the compressor temporarily stores |
86 | | * matches that it's going to use in the current block. This value is the |
87 | | * maximum number of matches that can be used in a block. If the sequence store |
88 | | * fills up, then the compressor will be forced to end the block early. This |
89 | | * value should be large enough so that this rarely happens, due to the block |
90 | | * being ended normally before then. Increasing/decreasing this value will |
91 | | * increase/decrease per-compressor memory usage linearly. |
92 | | */ |
93 | 0 | #define SEQ_STORE_LENGTH 50000 |
94 | | |
95 | | /* |
96 | | * For deflate_compress_fastest(): This is the soft maximum block length. |
97 | | * deflate_compress_fastest() doesn't use the regular block splitting algorithm; |
98 | | * it only ends blocks when they reach FAST_SOFT_MAX_BLOCK_LENGTH bytes or |
99 | | * FAST_SEQ_STORE_LENGTH matches. Therefore, this value should be lower than |
100 | | * the regular SOFT_MAX_BLOCK_LENGTH. |
101 | | */ |
102 | 0 | #define FAST_SOFT_MAX_BLOCK_LENGTH 65535 |
103 | | |
104 | | /* |
105 | | * For deflate_compress_fastest(): this is the length of the sequence store. |
106 | | * This is like SEQ_STORE_LENGTH, but this should be a lower value. |
107 | | */ |
108 | 0 | #define FAST_SEQ_STORE_LENGTH 8192 |
109 | | |
110 | | /* |
111 | | * These are the maximum codeword lengths, in bits, the compressor will use for |
112 | | * each Huffman code. The DEFLATE format defines limits for these. However, |
113 | | * further limiting litlen codewords to 14 bits is beneficial, since it has |
114 | | * negligible effect on compression ratio but allows some optimizations when |
115 | | * outputting bits. (It allows 4 literals to be written at once rather than 3.) |
116 | | */ |
117 | 0 | #define MAX_LITLEN_CODEWORD_LEN 14 |
118 | 0 | #define MAX_OFFSET_CODEWORD_LEN DEFLATE_MAX_OFFSET_CODEWORD_LEN |
119 | 0 | #define MAX_PRE_CODEWORD_LEN DEFLATE_MAX_PRE_CODEWORD_LEN |
120 | | |
121 | | #if SUPPORT_NEAR_OPTIMAL_PARSING |
122 | | |
123 | | /* Parameters specific to the near-optimal parsing algorithm */ |
124 | | |
125 | | /* |
126 | | * BIT_COST is a scaling factor that allows the near-optimal compressor to |
127 | | * consider fractional bit costs when deciding which literal/match sequence to |
128 | | * use. This is useful when the true symbol costs are unknown. For example, if |
129 | | * the compressor thinks that a symbol has 6.5 bits of entropy, it can set its |
130 | | * cost to 6.5 bits rather than have to use 6 or 7 bits. Although in the end |
131 | | * each symbol will use a whole number of bits due to the Huffman coding, |
132 | | * considering fractional bits can be helpful due to the limited information. |
133 | | * |
134 | | * BIT_COST should be a power of 2. A value of 8 or 16 works well. A higher |
135 | | * value isn't very useful since the calculations are approximate anyway. |
136 | | * |
137 | | * BIT_COST doesn't apply to deflate_flush_block() and |
138 | | * deflate_compute_true_cost(), which consider whole bits. |
139 | | */ |
140 | 0 | #define BIT_COST 16 |
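/*
 * For example, with BIT_COST = 16, a symbol whose entropy is estimated at 6.5
 * bits can be assigned a cost of 6.5 * 16 = 104 cost units, while a whole-bit
 * estimate of 7 bits becomes 7 * 16 = 112.  Path costs are sums of these
 * scaled values, so candidate literal/match sequences stay comparable at
 * 1/16-bit granularity until the real Huffman codes are built.
 */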
141 | | |
142 | | /* |
143 | | * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to |
144 | | * be needed to output a symbol that was unused in the previous optimization |
145 | | * pass. Assigning a default cost allows the symbol to be used in the next |
146 | | * optimization pass. However, the cost should be relatively high because the |
147 | | * symbol probably won't be used very many times (if at all). |
148 | | */ |
149 | 0 | #define LITERAL_NOSTAT_BITS 13 |
150 | 0 | #define LENGTH_NOSTAT_BITS 13 |
151 | 0 | #define OFFSET_NOSTAT_BITS 10 |
152 | | |
153 | | /* |
154 | | * This is (slightly less than) the maximum number of matches that the |
155 | | * near-optimal compressor will cache per block. This behaves similarly to |
156 | | * SEQ_STORE_LENGTH for the other compressors. |
157 | | */ |
158 | 0 | #define MATCH_CACHE_LENGTH (SOFT_MAX_BLOCK_LENGTH * 5) |
159 | | |
160 | | #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ |
161 | | |
162 | | /******************************************************************************/ |
163 | | |
164 | | /* Include the needed matchfinders. */ |
165 | 0 | #define MATCHFINDER_WINDOW_ORDER DEFLATE_WINDOW_ORDER |
166 | | #include "hc_matchfinder.h" |
167 | | #include "ht_matchfinder.h" |
168 | | #if SUPPORT_NEAR_OPTIMAL_PARSING |
169 | | # include "bt_matchfinder.h" |
170 | | /* |
171 | | * This is the maximum number of matches the binary trees matchfinder can find |
172 | | * at a single position. Since the matchfinder never finds more than one match |
173 | | * for the same length, assuming one match of each possible length gives an |
174 | | * upper bound. (This says nothing about whether it is worthwhile to |
175 | | * consider so many matches; this is just defining the worst case.) |
176 | | */ |
177 | | #define MAX_MATCHES_PER_POS \ |
178 | | (DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1) |
179 | | #endif |
180 | | |
181 | | /* |
182 | | * The largest block length we will ever use is when the final block is of |
183 | | * length SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1, or when any block is of |
184 | | * length SOFT_MAX_BLOCK_LENGTH + 1 + DEFLATE_MAX_MATCH_LEN. The latter case |
185 | | * occurs when the lazy2 compressor chooses two literals and a maximum-length |
186 | | * match, starting at SOFT_MAX_BLOCK_LENGTH - 1. |
187 | | */ |
188 | | #define MAX_BLOCK_LENGTH \ |
189 | | MAX(SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1, \ |
190 | | SOFT_MAX_BLOCK_LENGTH + 1 + DEFLATE_MAX_MATCH_LEN) |
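/*
 * With the default values above, and DEFLATE_MAX_MATCH_LEN = 258, this works
 * out to MAX(300000 + 5000 - 1, 300000 + 1 + 258) = MAX(304999, 300259)
 * = 304999 bytes.
 */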
191 | | |
192 | | static forceinline void |
193 | | check_buildtime_parameters(void) |
194 | 0 | { |
195 | | /* |
196 | | * Verify that MIN_BLOCK_LENGTH is being honored, as |
197 | | * libdeflate_deflate_compress_bound() depends on it. |
198 | | */ |
199 | 0 | STATIC_ASSERT(SOFT_MAX_BLOCK_LENGTH >= MIN_BLOCK_LENGTH); |
200 | 0 | STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH >= MIN_BLOCK_LENGTH); |
201 | 0 | STATIC_ASSERT(SEQ_STORE_LENGTH * DEFLATE_MIN_MATCH_LEN >= |
202 | 0 | MIN_BLOCK_LENGTH); |
203 | 0 | STATIC_ASSERT(FAST_SEQ_STORE_LENGTH * HT_MATCHFINDER_MIN_MATCH_LEN >= |
204 | 0 | MIN_BLOCK_LENGTH); |
205 | 0 | #if SUPPORT_NEAR_OPTIMAL_PARSING |
206 | 0 | STATIC_ASSERT(MIN_BLOCK_LENGTH * MAX_MATCHES_PER_POS <= |
207 | 0 | MATCH_CACHE_LENGTH); |
208 | 0 | #endif |
209 | | |
210 | | /* The definition of MAX_BLOCK_LENGTH assumes this. */ |
211 | 0 | STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH <= SOFT_MAX_BLOCK_LENGTH); |
212 | | |
213 | | /* Verify that the sequence stores aren't uselessly large. */ |
214 | 0 | STATIC_ASSERT(SEQ_STORE_LENGTH * DEFLATE_MIN_MATCH_LEN <= |
215 | 0 | SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH); |
216 | 0 | STATIC_ASSERT(FAST_SEQ_STORE_LENGTH * HT_MATCHFINDER_MIN_MATCH_LEN <= |
217 | 0 | FAST_SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH); |
218 | | |
219 | | /* Verify that the maximum codeword lengths are valid. */ |
220 | 0 | STATIC_ASSERT( |
221 | 0 | MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN); |
222 | 0 | STATIC_ASSERT( |
223 | 0 | MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN); |
224 | 0 | STATIC_ASSERT( |
225 | 0 | MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN); |
226 | 0 | STATIC_ASSERT( |
227 | 0 | (1U << MAX_LITLEN_CODEWORD_LEN) >= DEFLATE_NUM_LITLEN_SYMS); |
228 | 0 | STATIC_ASSERT( |
229 | 0 | (1U << MAX_OFFSET_CODEWORD_LEN) >= DEFLATE_NUM_OFFSET_SYMS); |
230 | 0 | STATIC_ASSERT( |
231 | 0 | (1U << MAX_PRE_CODEWORD_LEN) >= DEFLATE_NUM_PRECODE_SYMS); |
232 | 0 | } |
233 | | |
234 | | /******************************************************************************/ |
235 | | |
236 | | /* Table: length slot => length slot base value */ |
237 | | static const u32 deflate_length_slot_base[] = { |
238 | | 3, 4, 5, 6, 7, 8, 9, 10, |
239 | | 11, 13, 15, 17, 19, 23, 27, 31, |
240 | | 35, 43, 51, 59, 67, 83, 99, 115, |
241 | | 131, 163, 195, 227, 258, |
242 | | }; |
243 | | |
244 | | /* Table: length slot => number of extra length bits */ |
245 | | static const u8 deflate_extra_length_bits[] = { |
246 | | 0, 0, 0, 0, 0, 0, 0, 0, |
247 | | 1, 1, 1, 1, 2, 2, 2, 2, |
248 | | 3, 3, 3, 3, 4, 4, 4, 4, |
249 | | 5, 5, 5, 5, 0, |
250 | | }; |
251 | | |
252 | | /* Table: offset slot => offset slot base value */ |
253 | | static const u32 deflate_offset_slot_base[] = { |
254 | | 1, 2, 3, 4, 5, 7, 9, 13, |
255 | | 17, 25, 33, 49, 65, 97, 129, 193, |
256 | | 257, 385, 513, 769, 1025, 1537, 2049, 3073, |
257 | | 4097, 6145, 8193, 12289, 16385, 24577, |
258 | | }; |
259 | | |
260 | | /* Table: offset slot => number of extra offset bits */ |
261 | | static const u8 deflate_extra_offset_bits[] = { |
262 | | 0, 0, 0, 0, 1, 1, 2, 2, |
263 | | 3, 3, 4, 4, 5, 5, 6, 6, |
264 | | 7, 7, 8, 8, 9, 9, 10, 10, |
265 | | 11, 11, 12, 12, 13, 13, |
266 | | }; |
267 | | |
268 | | /* Table: length => length slot */ |
269 | | static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = { |
270 | | 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12, |
271 | | 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, |
272 | | 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, |
273 | | 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, |
274 | | 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, |
275 | | 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, |
276 | | 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, |
277 | | 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, |
278 | | 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, |
279 | | 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, |
280 | | 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, |
281 | | 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, |
282 | | 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, |
283 | | 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, |
284 | | 27, 27, 28, |
285 | | }; |
286 | | |
287 | | /* |
288 | | * Table: 'offset - 1 => offset_slot' for offset <= 256. |
289 | | * This was generated by scripts/gen_offset_slot_map.py. |
290 | | */ |
291 | | static const u8 deflate_offset_slot[256] = { |
292 | | 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, |
293 | | 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, |
294 | | 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, |
295 | | 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, |
296 | | 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, |
297 | | 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, |
298 | | 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, |
299 | | 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, |
300 | | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, |
301 | | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, |
302 | | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, |
303 | | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, |
304 | | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, |
305 | | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, |
306 | | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, |
307 | | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, |
308 | | }; |
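/*
 * Example of how these tables fit together: a match of length 100 falls in
 * length slot 22 (deflate_length_slot[100] == 22), whose base value is 99 and
 * which has 4 extra bits, so it is coded as litlen symbol 257 + 22 = 279
 * followed by 4 extra bits holding 100 - 99 = 1.  Likewise, a match offset of
 * 100 maps to offset slot 13 (deflate_offset_slot[100 - 1] == 13), whose base
 * is 97 and which has 6 extra bits holding 100 - 97 = 3.
 */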
309 | | |
310 | | /* The order in which precode codeword lengths are stored */ |
311 | | static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { |
312 | | 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 |
313 | | }; |
314 | | |
315 | | /* Table: precode symbol => number of extra bits */ |
316 | | static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = { |
317 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7 |
318 | | }; |
319 | | |
320 | | /* Codewords for the DEFLATE Huffman codes */ |
321 | | struct deflate_codewords { |
322 | | u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; |
323 | | u32 offset[DEFLATE_NUM_OFFSET_SYMS]; |
324 | | }; |
325 | | |
326 | | /* |
327 | | * Codeword lengths (in bits) for the DEFLATE Huffman codes. |
328 | | * A zero length means the corresponding symbol had zero frequency. |
329 | | */ |
330 | | struct deflate_lens { |
331 | | u8 litlen[DEFLATE_NUM_LITLEN_SYMS]; |
332 | | u8 offset[DEFLATE_NUM_OFFSET_SYMS]; |
333 | | }; |
334 | | |
335 | | /* Codewords and lengths for the DEFLATE Huffman codes */ |
336 | | struct deflate_codes { |
337 | | struct deflate_codewords codewords; |
338 | | struct deflate_lens lens; |
339 | | }; |
340 | | |
341 | | /* Symbol frequency counters for the DEFLATE Huffman codes */ |
342 | | struct deflate_freqs { |
343 | | u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; |
344 | | u32 offset[DEFLATE_NUM_OFFSET_SYMS]; |
345 | | }; |
346 | | |
347 | | /* |
348 | | * Represents a run of literals followed by a match or end-of-block. This |
349 | | * struct is needed to temporarily store items chosen by the parser, since items |
350 | | * cannot be written until all items for the block have been chosen and the |
351 | | * block's Huffman codes have been computed. |
352 | | */ |
353 | | struct deflate_sequence { |
354 | | |
355 | | /* |
356 | | * Bits 0..22: the number of literals in this run. This may be 0 and |
357 | | * can be at most MAX_BLOCK_LENGTH. The literals are not stored |
358 | | * explicitly in this structure; instead, they are read directly from |
359 | | * the uncompressed data. |
360 | | * |
361 | | * Bits 23..31: the length of the match which follows the literals, or 0 |
362 | | * if this literal run was the last in the block, so there is no match |
363 | | * which follows it. |
364 | | */ |
365 | 0 | #define SEQ_LENGTH_SHIFT 23 |
366 | 0 | #define SEQ_LITRUNLEN_MASK (((u32)1 << SEQ_LENGTH_SHIFT) - 1) |
367 | | u32 litrunlen_and_length; |
368 | | |
369 | | /* |
370 | | * If 'length' doesn't indicate end-of-block, then this is the offset of |
371 | | * the match which follows the literals. |
372 | | */ |
373 | | u16 offset; |
374 | | |
375 | | /* |
376 | | * If 'length' doesn't indicate end-of-block, then this is the offset |
377 | | * slot of the match which follows the literals. |
378 | | */ |
379 | | u16 offset_slot; |
380 | | }; |
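/*
 * For illustration (hypothetical values), a run of 10 literals followed by a
 * length-4 match at offset 60 would be stored roughly as:
 *
 *	seq.litrunlen_and_length = 10 | (4 << SEQ_LENGTH_SHIFT);
 *	seq.offset = 60;
 *	seq.offset_slot = deflate_offset_slot[60 - 1];	(slot 11)
 *
 * and decoded with 'litrunlen = v & SEQ_LITRUNLEN_MASK' and
 * 'length = v >> SEQ_LENGTH_SHIFT', where a length of 0 marks the final
 * literal run of the block.
 */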
381 | | |
382 | | #if SUPPORT_NEAR_OPTIMAL_PARSING |
383 | | |
384 | | /* Costs for the near-optimal parsing algorithm */ |
385 | | struct deflate_costs { |
386 | | |
387 | | /* The cost to output each possible literal */ |
388 | | u32 literal[DEFLATE_NUM_LITERALS]; |
389 | | |
390 | | /* The cost to output each possible match length */ |
391 | | u32 length[DEFLATE_MAX_MATCH_LEN + 1]; |
392 | | |
393 | | /* The cost to output a match offset of each possible offset slot */ |
394 | | u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS]; |
395 | | }; |
396 | | |
397 | | /* |
398 | | * This structure represents a byte position in the input data and a node in the |
399 | | * graph of possible match/literal choices for the current block. |
400 | | * |
401 | | * Logically, each incoming edge to this node is labeled with a literal or a |
402 | | * match that can be taken to reach this position from an earlier position; and |
403 | | * each outgoing edge from this node is labeled with a literal or a match that |
404 | | * can be taken to advance from this position to a later position. |
405 | | * |
406 | | * But these "edges" are actually stored elsewhere (in 'match_cache'). Here we |
407 | | * associate with each node just two pieces of information: |
408 | | * |
409 | | * 'cost_to_end' is the minimum cost to reach the end of the block from |
410 | | * this position. |
411 | | * |
412 | | * 'item' represents the literal or match that must be chosen from here to |
413 | | * reach the end of the block with the minimum cost. Equivalently, this |
414 | | * can be interpreted as the label of the outgoing edge on the minimum-cost |
415 | | * path to the "end of block" node from this node. |
416 | | */ |
417 | | struct deflate_optimum_node { |
418 | | |
419 | | u32 cost_to_end; |
420 | | |
421 | | /* |
422 | | * Notes on the match/literal representation used here: |
423 | | * |
424 | | * The low bits of 'item' are the length: 1 if this is a literal, |
425 | | * or the match length if this is a match. |
426 | | * |
427 | | * The high bits of 'item' are the actual literal byte if this is a |
428 | | * literal, or the match offset if this is a match. |
429 | | */ |
430 | 0 | #define OPTIMUM_OFFSET_SHIFT 9 |
431 | 0 | #define OPTIMUM_LEN_MASK (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1) |
432 | | u32 item; |
433 | | |
434 | | }; |
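/*
 * For example, a literal with byte value 0x41 would be represented as
 * 'item = ((u32)0x41 << OPTIMUM_OFFSET_SHIFT) | 1', and a match of length 5
 * at offset 100 as 'item = ((u32)100 << OPTIMUM_OFFSET_SHIFT) | 5'.  The two
 * fields are recovered with 'item & OPTIMUM_LEN_MASK' and
 * 'item >> OPTIMUM_OFFSET_SHIFT'.  With OPTIMUM_OFFSET_SHIFT = 9, lengths up
 * to 511 and offsets up to 32768 fit comfortably.
 */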
435 | | |
436 | | #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ |
437 | | |
438 | | /* Block split statistics. See "Block splitting algorithm" below. */ |
439 | 0 | #define NUM_LITERAL_OBSERVATION_TYPES 8 |
440 | 0 | #define NUM_MATCH_OBSERVATION_TYPES 2 |
441 | 0 | #define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + \ |
442 | 0 | NUM_MATCH_OBSERVATION_TYPES) |
443 | 0 | #define NUM_OBSERVATIONS_PER_BLOCK_CHECK 512 |
444 | | struct block_split_stats { |
445 | | u32 new_observations[NUM_OBSERVATION_TYPES]; |
446 | | u32 observations[NUM_OBSERVATION_TYPES]; |
447 | | u32 num_new_observations; |
448 | | u32 num_observations; |
449 | | }; |
450 | | |
451 | | struct deflate_output_bitstream; |
452 | | |
453 | | /* The main DEFLATE compressor structure */ |
454 | | struct libdeflate_compressor { |
455 | | |
456 | | /* Pointer to the compress() implementation chosen at allocation time */ |
457 | | void (*impl)(struct libdeflate_compressor *restrict c, const u8 *in, |
458 | | size_t in_nbytes, struct deflate_output_bitstream *os); |
459 | | |
460 | | /* The free() function for this struct, chosen at allocation time */ |
461 | | free_func_t free_func; |
462 | | |
463 | | /* The compression level with which this compressor was created */ |
464 | | unsigned compression_level; |
465 | | |
466 | | /* Anything of this size or less we won't bother trying to compress. */ |
467 | | size_t max_passthrough_size; |
468 | | |
469 | | /* |
470 | | * The maximum search depth: consider at most this many potential |
471 | | * matches at each position |
472 | | */ |
473 | | u32 max_search_depth; |
474 | | |
475 | | /* |
476 | | * The "nice" match length: if a match of this length is found, choose |
477 | | * it immediately without further consideration |
478 | | */ |
479 | | u32 nice_match_length; |
480 | | |
481 | | /* Frequency counters for the current block */ |
482 | | struct deflate_freqs freqs; |
483 | | |
484 | | /* Block split statistics for the current block */ |
485 | | struct block_split_stats split_stats; |
486 | | |
487 | | /* Dynamic Huffman codes for the current block */ |
488 | | struct deflate_codes codes; |
489 | | |
490 | | /* The static Huffman codes defined by the DEFLATE format */ |
491 | | struct deflate_codes static_codes; |
492 | | |
493 | | /* Temporary space for block flushing */ |
494 | | union { |
495 | | /* Information about the precode */ |
496 | | struct { |
497 | | u32 freqs[DEFLATE_NUM_PRECODE_SYMS]; |
498 | | u32 codewords[DEFLATE_NUM_PRECODE_SYMS]; |
499 | | u8 lens[DEFLATE_NUM_PRECODE_SYMS]; |
500 | | unsigned items[DEFLATE_NUM_LITLEN_SYMS + |
501 | | DEFLATE_NUM_OFFSET_SYMS]; |
502 | | unsigned num_litlen_syms; |
503 | | unsigned num_offset_syms; |
504 | | unsigned num_explicit_lens; |
505 | | unsigned num_items; |
506 | | } precode; |
507 | | /* |
508 | | * The "full" length codewords. Used only after the information |
509 | | * in 'precode' is no longer needed. |
510 | | */ |
511 | | struct { |
512 | | u32 codewords[DEFLATE_MAX_MATCH_LEN + 1]; |
513 | | u8 lens[DEFLATE_MAX_MATCH_LEN + 1]; |
514 | | } length; |
515 | | } o; |
516 | | |
517 | | union { |
518 | | /* Data for greedy or lazy parsing */ |
519 | | struct { |
520 | | /* Hash chains matchfinder */ |
521 | | struct hc_matchfinder hc_mf; |
522 | | |
523 | | /* Matches and literals chosen for the current block */ |
524 | | struct deflate_sequence sequences[SEQ_STORE_LENGTH + 1]; |
525 | | |
526 | | } g; /* (g)reedy */ |
527 | | |
528 | | /* Data for fastest parsing */ |
529 | | struct { |
530 | | /* Hash table matchfinder */ |
531 | | struct ht_matchfinder ht_mf; |
532 | | |
533 | | /* Matches and literals chosen for the current block */ |
534 | | struct deflate_sequence sequences[ |
535 | | FAST_SEQ_STORE_LENGTH + 1]; |
536 | | |
537 | | } f; /* (f)astest */ |
538 | | |
539 | | #if SUPPORT_NEAR_OPTIMAL_PARSING |
540 | | /* Data for near-optimal parsing */ |
541 | | struct { |
542 | | |
543 | | /* Binary tree matchfinder */ |
544 | | struct bt_matchfinder bt_mf; |
545 | | |
546 | | /* |
547 | | * Cached matches for the current block. This array |
548 | | * contains the matches that were found at each position |
549 | | * in the block. Specifically, for each position, there |
550 | | * is a list of matches found at that position, if any, |
551 | | * sorted by strictly increasing length. In addition, |
552 | | * following the matches for each position, there is a |
553 | | * special 'struct lz_match' whose 'length' member |
554 | | * contains the number of matches found at that |
555 | | * position, and whose 'offset' member contains the |
556 | | * literal at that position. |
557 | | * |
558 | | * Note: in rare cases, there will be a very high number |
559 | | * of matches in the block and this array will overflow. |
560 | | * If this happens, we force the end of the current |
561 | | * block. MATCH_CACHE_LENGTH is the length at which we |
562 | | * actually check for overflow. The extra slots beyond |
563 | | * this are enough to absorb the worst case overflow, |
564 | | * which occurs if starting at |
565 | | * &match_cache[MATCH_CACHE_LENGTH - 1], we write |
566 | | * MAX_MATCHES_PER_POS matches and a match count header, |
567 | | * then skip searching for matches at |
568 | | * 'DEFLATE_MAX_MATCH_LEN - 1' positions and write the |
569 | | * match count header for each. |
570 | | */ |
571 | | struct lz_match match_cache[MATCH_CACHE_LENGTH + |
572 | | MAX_MATCHES_PER_POS + |
573 | | DEFLATE_MAX_MATCH_LEN - 1]; |
574 | | |
575 | | /* |
576 | | * Array of nodes, one per position, for running the |
577 | | * minimum-cost path algorithm. |
578 | | * |
579 | | * This array must be large enough to accommodate the |
580 | | * worst-case number of nodes, which is MAX_BLOCK_LENGTH |
581 | | * plus 1 for the end-of-block node. |
582 | | */ |
583 | | struct deflate_optimum_node optimum_nodes[ |
584 | | MAX_BLOCK_LENGTH + 1]; |
585 | | |
586 | | /* The current cost model being used */ |
587 | | struct deflate_costs costs; |
588 | | |
589 | | /* Saved cost model */ |
590 | | struct deflate_costs costs_saved; |
591 | | |
592 | | /* |
593 | | * A table that maps match offset to offset slot. This |
594 | | * differs from deflate_offset_slot[] in that this is a |
595 | | * full map, not a condensed one. The full map is more |
596 | | * appropriate for the near-optimal parser, since the |
597 | | * near-optimal parser does more offset => offset_slot |
598 | | * translations, it doesn't intersperse them with |
599 | | * matchfinding (so cache evictions are less of a |
600 | | * concern), and it uses more memory anyway. |
601 | | */ |
602 | | u8 offset_slot_full[DEFLATE_MAX_MATCH_OFFSET + 1]; |
603 | | |
604 | | /* Literal/match statistics saved from previous block */ |
605 | | u32 prev_observations[NUM_OBSERVATION_TYPES]; |
606 | | u32 prev_num_observations; |
607 | | |
608 | | /* |
609 | | * Approximate match length frequencies based on a |
610 | | * greedy parse, gathered during matchfinding. This is |
611 | | * used for setting the initial symbol costs. |
612 | | */ |
613 | | u32 new_match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1]; |
614 | | u32 match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1]; |
615 | | |
616 | | /* |
617 | | * The maximum number of optimization passes |
618 | | * (min-cost path searches) per block. |
619 | | * Larger values = more compression. |
620 | | */ |
621 | | unsigned max_optim_passes; |
622 | | |
623 | | /* |
624 | | * If an optimization pass improves the cost by fewer |
625 | | * than this number of bits, then optimization will stop |
626 | | * early, before max_optim_passes has been reached. |
627 | | * Smaller values = more compression. |
628 | | */ |
629 | | u32 min_improvement_to_continue; |
630 | | |
631 | | /* |
632 | | * The minimum number of bits that would need to be |
633 | | * saved for it to be considered worth the time to |
634 | | * regenerate and use the min-cost path from a previous |
635 | | * optimization pass, in the case where the final |
636 | | * optimization pass actually increased the cost. |
637 | | * Smaller values = more compression. |
638 | | */ |
639 | | u32 min_bits_to_use_nonfinal_path; |
640 | | |
641 | | /* |
642 | | * The maximum block length, in uncompressed bytes, at |
643 | | * which to find and consider the optimal match/literal |
644 | | * list for the static Huffman codes. This strategy |
645 | | * improves the compression ratio produced by static |
646 | | * Huffman blocks and can discover more cases in which |
647 | | * static blocks are worthwhile. This helps mostly with |
648 | | * small blocks, hence why this parameter is a max_len. |
649 | | * |
650 | | * Above this block length, static Huffman blocks are |
651 | | * only used opportunistically. I.e. a static Huffman |
652 | | * block is only used if a static block using the same |
653 | | * match/literal list as the optimized dynamic block |
654 | | * happens to be cheaper than the dynamic block itself. |
655 | | */ |
656 | | u32 max_len_to_optimize_static_block; |
657 | | |
658 | | } n; /* (n)ear-optimal */ |
659 | | #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ |
660 | | |
661 | | } p; /* (p)arser */ |
662 | | }; |
663 | | |
664 | | /* |
665 | | * The type for the bitbuffer variable, which temporarily holds bits that are |
666 | | * being packed into bytes and written to the output buffer. For best |
667 | | * performance, this should have size equal to a machine word. |
668 | | */ |
669 | | typedef machine_word_t bitbuf_t; |
670 | | |
671 | | /* |
672 | | * The capacity of the bitbuffer, in bits. This is 1 less than the real size, |
673 | | * in order to avoid undefined behavior when doing bitbuf >>= bitcount & ~7. |
674 | | */ |
675 | 0 | #define BITBUF_NBITS (8 * sizeof(bitbuf_t) - 1) |
676 | | |
677 | | /* |
678 | | * Can the specified number of bits always be added to 'bitbuf' after any |
679 | | * pending bytes have been flushed? There can be up to 7 bits remaining after a |
680 | | * flush, so the count must not exceed BITBUF_NBITS after adding 'n' more bits. |
681 | | */ |
682 | 0 | #define CAN_BUFFER(n) (7 + (n) <= BITBUF_NBITS) |
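/*
 * For example, on a typical 64-bit build, bitbuf_t is 8 bytes, so BITBUF_NBITS
 * is 63 and CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN) holds because
 * 7 + 4*14 = 63 <= 63.  With the DEFLATE-maximum litlen codeword length of 15,
 * 7 + 4*15 = 67 would not fit; this is the optimization referred to in the
 * MAX_LITLEN_CODEWORD_LEN comment above.
 */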
683 | | |
684 | | /* |
685 | | * Structure to keep track of the current state of sending bits to the |
686 | | * compressed output buffer |
687 | | */ |
688 | | struct deflate_output_bitstream { |
689 | | |
690 | | /* Bits that haven't yet been written to the output buffer */ |
691 | | bitbuf_t bitbuf; |
692 | | |
693 | | /* |
694 | | * Number of bits currently held in @bitbuf. This can be between 0 and |
695 | | * BITBUF_NBITS in general, or between 0 and 7 after a flush. |
696 | | */ |
697 | | unsigned bitcount; |
698 | | |
699 | | /* |
700 | | * Pointer to the position in the output buffer at which the next byte |
701 | | * should be written |
702 | | */ |
703 | | u8 *next; |
704 | | |
705 | | /* Pointer to the end of the output buffer */ |
706 | | u8 *end; |
707 | | |
708 | | /* true if the output buffer ran out of space */ |
709 | | bool overflow; |
710 | | }; |
711 | | |
712 | | /* |
713 | | * Add some bits to the bitbuffer variable of the output bitstream. The caller |
714 | | * must ensure that 'bitcount + n <= BITBUF_NBITS', by calling FLUSH_BITS() |
715 | | * frequently enough. |
716 | | */ |
717 | 0 | #define ADD_BITS(bits, n) \ |
718 | 0 | do { \ |
719 | 0 | bitbuf |= (bitbuf_t)(bits) << bitcount; \ |
720 | 0 | bitcount += (n); \ |
721 | 0 | ASSERT(bitcount <= BITBUF_NBITS); \ |
722 | 0 | } while (0) |
723 | | |
724 | | /* |
725 | | * Flush bits from the bitbuffer variable to the output buffer. After this, the |
726 | | * bitbuffer will contain at most 7 bits (a partial byte). |
727 | | * |
728 | | * Since deflate_flush_block() verified ahead of time that there is enough space |
729 | | * remaining before actually writing the block, it's guaranteed that out_next |
730 | | * won't exceed os->end. However, there might not be enough space remaining to |
731 | | * flush a whole word, even though that's fastest. Therefore, flush a whole |
732 | | * word if there is space for it, otherwise flush a byte at a time. |
733 | | */ |
734 | 0 | #define FLUSH_BITS() \ |
735 | 0 | do { \ |
736 | 0 | if (UNALIGNED_ACCESS_IS_FAST && likely(out_next < out_fast_end)) { \ |
737 | 0 | /* Flush a whole word (branchlessly). */ \ |
738 | 0 | put_unaligned_leword(bitbuf, out_next); \ |
739 | 0 | bitbuf >>= bitcount & ~7; \ |
740 | 0 | out_next += bitcount >> 3; \ |
741 | 0 | bitcount &= 7; \ |
742 | 0 | } else { \ |
743 | 0 | /* Flush a byte at a time. */ \ |
744 | 0 | while (bitcount >= 8) { \ |
745 | 0 | ASSERT(out_next < os->end); \ |
746 | 0 | *out_next++ = bitbuf; \ |
747 | 0 | bitcount -= 8; \ |
748 | 0 | bitbuf >>= 8; \ |
749 | 0 | } \ |
750 | 0 | } \ |
751 | 0 | } while (0) |
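/*
 * These macros assume that locals named 'bitbuf', 'bitcount', 'out_next', and
 * 'out_fast_end', along with the stream pointer 'os', are in scope in the
 * function that uses them.  A caller might set them up roughly as follows
 * (a sketch only; the choice of 'out_fast_end' below is one possible margin
 * that leaves at least a whole word before os->end so the word-at-a-time
 * path cannot overrun):
 *
 *	bitbuf_t bitbuf = os->bitbuf;
 *	unsigned bitcount = os->bitcount;
 *	u8 *out_next = os->next;
 *	u8 *out_fast_end = os->end - sizeof(bitbuf_t);
 *
 *	ADD_BITS(codeword, len);
 *	FLUSH_BITS();
 */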
752 | | |
753 | | /* |
754 | | * Given the binary tree node A[subtree_idx] whose children already satisfy the |
755 | | * maxheap property, swap the node with its greater child until it is greater |
756 | | * than or equal to both of its children, so that the maxheap property is |
757 | | * satisfied in the subtree rooted at A[subtree_idx]. 'A' uses 1-based indices. |
758 | | */ |
759 | | static void |
760 | | heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx) |
761 | 0 | { |
762 | 0 | unsigned parent_idx; |
763 | 0 | unsigned child_idx; |
764 | 0 | u32 v; |
765 | |
766 | 0 | v = A[subtree_idx]; |
767 | 0 | parent_idx = subtree_idx; |
768 | 0 | while ((child_idx = parent_idx * 2) <= length) { |
769 | 0 | if (child_idx < length && A[child_idx + 1] > A[child_idx]) |
770 | 0 | child_idx++; |
771 | 0 | if (v >= A[child_idx]) |
772 | 0 | break; |
773 | 0 | A[parent_idx] = A[child_idx]; |
774 | 0 | parent_idx = child_idx; |
775 | 0 | } |
776 | 0 | A[parent_idx] = v; |
777 | 0 | } |
778 | | |
779 | | /* |
780 | | * Rearrange the array 'A' so that it satisfies the maxheap property. |
781 | | * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1]. |
782 | | */ |
783 | | static void |
784 | | heapify_array(u32 A[], unsigned length) |
785 | 0 | { |
786 | 0 | unsigned subtree_idx; |
787 | |
788 | 0 | for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--) |
789 | 0 | heapify_subtree(A, length, subtree_idx); |
790 | 0 | } |
791 | | |
792 | | /* |
793 | | * Sort the array 'A', which contains 'length' unsigned 32-bit integers. |
794 | | * |
795 | | * Note: name this function heap_sort() instead of heapsort() to avoid colliding |
796 | | * with heapsort() from stdlib.h on BSD-derived systems. |
797 | | */ |
798 | | static void |
799 | | heap_sort(u32 A[], unsigned length) |
800 | 0 | { |
801 | 0 | A--; /* Use 1-based indices */ |
802 | |
803 | 0 | heapify_array(A, length); |
804 | |
805 | 0 | while (length >= 2) { |
806 | 0 | u32 tmp = A[length]; |
807 | |
808 | 0 | A[length] = A[1]; |
809 | 0 | A[1] = tmp; |
810 | 0 | length--; |
811 | 0 | heapify_subtree(A, length, 1); |
812 | 0 | } |
813 | 0 | } |
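/*
 * Example: sorting a small array in place.
 *
 *	u32 a[5] = { 7, 3, 9, 3, 1 };
 *
 *	heap_sort(a, 5);	results in a == { 1, 3, 3, 7, 9 }
 */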
814 | | |
815 | 0 | #define NUM_SYMBOL_BITS 10 |
816 | | #define NUM_FREQ_BITS (32 - NUM_SYMBOL_BITS) |
817 | 0 | #define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1) |
818 | 0 | #define FREQ_MASK (~SYMBOL_MASK) |
819 | | |
820 | 0 | #define GET_NUM_COUNTERS(num_syms) (num_syms) |
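/*
 * Each entry sorted by the code below packs a symbol and its frequency into
 * one u32: 'entry = sym | (freq << NUM_SYMBOL_BITS)'.  For example, symbol
 * 300 with frequency 17 is stored as (17 << 10) | 300 = 17708.  Comparing the
 * packed entries directly orders them primarily by frequency and secondarily
 * by symbol value, which is exactly the order sort_symbols() needs; masking
 * with FREQ_MASK or SYMBOL_MASK recovers the individual fields.
 */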
821 | | |
822 | | /* |
823 | | * Sort the symbols primarily by frequency and secondarily by symbol value. |
824 | | * Discard symbols with zero frequency and fill in an array with the remaining |
825 | | * symbols, along with their frequencies. The low NUM_SYMBOL_BITS bits of each |
826 | | * array entry will contain the symbol value, and the remaining bits will |
827 | | * contain the frequency. |
828 | | * |
829 | | * @num_syms |
830 | | * Number of symbols in the alphabet, at most 1 << NUM_SYMBOL_BITS. |
831 | | * |
832 | | * @freqs[num_syms] |
833 | | * Frequency of each symbol, summing to at most (1 << NUM_FREQ_BITS) - 1. |
834 | | * |
835 | | * @lens[num_syms] |
836 | | * An array that eventually will hold the length of each codeword. This |
837 | | * function only fills in the codeword lengths of symbols that have zero |
838 | | * frequency; such lengths are not well defined per se, so they are set to 0. |
839 | | * |
840 | | * @symout[num_syms] |
841 | | * The output array, described above. |
842 | | * |
843 | | * Returns the number of entries in 'symout' that were filled. This is the |
844 | | * number of symbols that have nonzero frequency. |
845 | | */ |
846 | | static unsigned |
847 | | sort_symbols(unsigned num_syms, const u32 freqs[], u8 lens[], u32 symout[]) |
848 | 0 | { |
849 | 0 | unsigned sym; |
850 | 0 | unsigned i; |
851 | 0 | unsigned num_used_syms; |
852 | 0 | unsigned num_counters; |
853 | 0 | unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)]; |
854 | | |
855 | | /* |
856 | | * We use heapsort, but with an added optimization. Since often most |
857 | | * symbol frequencies are low, we first do a count sort using a limited |
858 | | * number of counters. High frequencies are counted in the last |
859 | | * counter, and only they will be sorted with heapsort. |
860 | | * |
861 | | * Note: with more symbols, it is generally beneficial to have more |
862 | | * counters. About 1 counter per symbol seems fastest. |
863 | | */ |
864 | |
865 | 0 | num_counters = GET_NUM_COUNTERS(num_syms); |
866 | |
867 | 0 | memset(counters, 0, num_counters * sizeof(counters[0])); |
868 | | |
869 | | /* Count the frequencies. */ |
870 | 0 | for (sym = 0; sym < num_syms; sym++) |
871 | 0 | counters[MIN(freqs[sym], num_counters - 1)]++; |
872 | | |
873 | | /* |
874 | | * Make the counters cumulative, ignoring the zero-th, which counted |
875 | | * symbols with zero frequency. As a side effect, this calculates the |
876 | | * number of symbols with nonzero frequency. |
877 | | */ |
878 | 0 | num_used_syms = 0; |
879 | 0 | for (i = 1; i < num_counters; i++) { |
880 | 0 | unsigned count = counters[i]; |
881 | |
882 | 0 | counters[i] = num_used_syms; |
883 | 0 | num_used_syms += count; |
884 | 0 | } |
885 | | |
886 | | /* |
887 | | * Sort nonzero-frequency symbols using the counters. At the same time, |
888 | | * set the codeword lengths of zero-frequency symbols to 0. |
889 | | */ |
890 | 0 | for (sym = 0; sym < num_syms; sym++) { |
891 | 0 | u32 freq = freqs[sym]; |
892 | |
893 | 0 | if (freq != 0) { |
894 | 0 | symout[counters[MIN(freq, num_counters - 1)]++] = |
895 | 0 | sym | (freq << NUM_SYMBOL_BITS); |
896 | 0 | } else { |
897 | 0 | lens[sym] = 0; |
898 | 0 | } |
899 | 0 | } |
900 | | |
901 | | /* Sort the symbols counted in the last counter. */ |
902 | 0 | heap_sort(symout + counters[num_counters - 2], |
903 | 0 | counters[num_counters - 1] - counters[num_counters - 2]); |
904 | |
905 | 0 | return num_used_syms; |
906 | 0 | } |
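/*
 * For example, with num_syms = 4 and freqs = { 3, 0, 1, 1 }, this produces
 * symout = { 2 | (1 << NUM_SYMBOL_BITS), 3 | (1 << NUM_SYMBOL_BITS),
 * 0 | (3 << NUM_SYMBOL_BITS) }, sets lens[1] = 0, and returns 3.
 */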
907 | | |
908 | | /* |
909 | | * Build a Huffman tree. |
910 | | * |
911 | | * This is an optimized implementation that |
912 | | * (a) takes advantage of the frequencies being already sorted; |
913 | | * (b) only generates non-leaf nodes, since the non-leaf nodes of a Huffman |
914 | | * tree are sufficient to generate a canonical code; |
915 | | * (c) only stores parent pointers, not child pointers; |
916 | | * (d) produces the nodes in the same memory used for input frequency |
917 | | * information. |
918 | | * |
919 | | * Array 'A', which contains 'sym_count' entries, is used for both input and |
920 | | * output. For this function, 'sym_count' must be at least 2. |
921 | | * |
922 | | * For input, the array must contain the frequencies of the symbols, sorted in |
923 | | * increasing order. Specifically, each entry must contain a frequency left |
924 | | * shifted by NUM_SYMBOL_BITS bits. Any data in the low NUM_SYMBOL_BITS bits of |
925 | | * the entries will be ignored by this function. Although these bits will, in |
926 | | * fact, contain the symbols that correspond to the frequencies, this function |
927 | | * is concerned with frequencies only and keeps the symbols as-is. |
928 | | * |
929 | | * For output, this function will produce the non-leaf nodes of the Huffman |
930 | | * tree. These nodes will be stored in the first (sym_count - 1) entries of the |
931 | | * array. Entry A[sym_count - 2] will represent the root node. Each other node |
932 | | * will contain the zero-based index of its parent node in 'A', left shifted by |
933 | | * NUM_SYMBOL_BITS bits. The low NUM_SYMBOL_BITS bits of each entry in A will |
934 | | * be kept as-is. Again, note that although these low bits will, in fact, |
935 | | * contain a symbol value, this symbol will have *no relationship* with the |
936 | | * Huffman tree node that happens to occupy the same slot. This is because this |
937 | | * implementation only generates the non-leaf nodes of the tree. |
938 | | */ |
939 | | static void |
940 | | build_tree(u32 A[], unsigned sym_count) |
941 | 0 | { |
942 | 0 | const unsigned last_idx = sym_count - 1; |
943 | | |
944 | | /* Index of the next lowest frequency leaf that still needs a parent */ |
945 | 0 | unsigned i = 0; |
946 | | |
947 | | /* |
948 | | * Index of the next lowest frequency non-leaf that still needs a |
949 | | * parent, or equal to 'e' if there is currently no such node |
950 | | */ |
951 | 0 | unsigned b = 0; |
952 | | |
953 | | /* Index of the next spot for a non-leaf (will overwrite a leaf) */ |
954 | 0 | unsigned e = 0; |
955 | |
956 | 0 | do { |
957 | 0 | u32 new_freq; |
958 | | |
959 | | /* |
960 | | * Select the next two lowest frequency nodes among the leaves |
961 | | * A[i] and non-leaves A[b], and create a new node A[e] to be |
962 | | * their parent. Set the new node's frequency to the sum of the |
963 | | * frequencies of its two children. |
964 | | * |
965 | | * Usually the next two lowest frequency nodes are of the same |
966 | | * type (leaf or non-leaf), so check those cases first. |
967 | | */ |
968 | 0 | if (i + 1 <= last_idx && |
969 | 0 | (b == e || (A[i + 1] & FREQ_MASK) <= (A[b] & FREQ_MASK))) { |
970 | | /* Two leaves */ |
971 | 0 | new_freq = (A[i] & FREQ_MASK) + (A[i + 1] & FREQ_MASK); |
972 | 0 | i += 2; |
973 | 0 | } else if (b + 2 <= e && |
974 | 0 | (i > last_idx || |
975 | 0 | (A[b + 1] & FREQ_MASK) < (A[i] & FREQ_MASK))) { |
976 | | /* Two non-leaves */ |
977 | 0 | new_freq = (A[b] & FREQ_MASK) + (A[b + 1] & FREQ_MASK); |
978 | 0 | A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK); |
979 | 0 | A[b + 1] = (e << NUM_SYMBOL_BITS) | |
980 | 0 | (A[b + 1] & SYMBOL_MASK); |
981 | 0 | b += 2; |
982 | 0 | } else { |
983 | | /* One leaf and one non-leaf */ |
984 | 0 | new_freq = (A[i] & FREQ_MASK) + (A[b] & FREQ_MASK); |
985 | 0 | A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK); |
986 | 0 | i++; |
987 | 0 | b++; |
988 | 0 | } |
989 | 0 | A[e] = new_freq | (A[e] & SYMBOL_MASK); |
990 | | /* |
991 | | * A binary tree with 'n' leaves has 'n - 1' non-leaves, so the |
992 | | * tree is complete once we've created 'n - 1' non-leaves. |
993 | | */ |
994 | 0 | } while (++e < last_idx); |
995 | 0 | } |
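/*
 * For example, if A holds the sorted frequencies { 1, 1, 2, 3 } (each shifted
 * left by NUM_SYMBOL_BITS), the loop first merges the two frequency-1 leaves
 * into non-leaf A[0] with frequency 2, then merges that node with the
 * frequency-2 leaf into A[1] with frequency 4, and finally merges A[1] with
 * the frequency-3 leaf into the root A[2] with frequency 7.  A[0]'s parent
 * index becomes 1 and A[1]'s becomes 2, while the low bits of each entry are
 * left unchanged.
 */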
996 | | |
997 | | /* |
998 | | * Given the stripped-down Huffman tree constructed by build_tree(), determine |
999 | | * the number of codewords that should be assigned each possible length, taking |
1000 | | * into account the length-limited constraint. |
1001 | | * |
1002 | | * @A |
1003 | | * The array produced by build_tree(), containing parent index information |
1004 | | * for the non-leaf nodes of the Huffman tree. Each entry in this array is |
1005 | | * a node; a node's parent always has a greater index than that node |
1006 | | * itself. This function will overwrite the parent index information in |
1007 | | * this array, so essentially it will destroy the tree. However, the data |
1008 | | * in the low NUM_SYMBOL_BITS of each entry will be preserved. |
1009 | | * |
1010 | | * @root_idx |
1011 | | * The 0-based index of the root node in 'A', and consequently one less |
1012 | | * than the number of tree node entries in 'A'. (Or, really 2 less than |
1013 | | * the actual length of 'A'.) |
1014 | | * |
1015 | | * @len_counts |
1016 | | * An array of length ('max_codeword_len' + 1) in which the number of |
1017 | | * codewords having each length <= max_codeword_len will be returned. |
1018 | | * |
1019 | | * @max_codeword_len |
1020 | | * The maximum permissible codeword length. |
1021 | | */ |
1022 | | static void |
1023 | | compute_length_counts(u32 A[], unsigned root_idx, unsigned len_counts[], |
1024 | | unsigned max_codeword_len) |
1025 | 0 | { |
1026 | 0 | unsigned len; |
1027 | 0 | int node; |
1028 | | |
1029 | | /* |
1030 | | * The key observations are: |
1031 | | * |
1032 | | * (1) We can traverse the non-leaf nodes of the tree, always visiting a |
1033 | | * parent before its children, by simply iterating through the array |
1034 | | * in reverse order. Consequently, we can compute the depth of each |
1035 | | * node in one pass, overwriting the parent indices with depths. |
1036 | | * |
1037 | | * (2) We can initially assume that in the real Huffman tree, both |
1038 | | * children of the root are leaves. This corresponds to two |
1039 | | * codewords of length 1. Then, whenever we visit a (non-leaf) node |
1040 | | * during the traversal, we modify this assumption to account for |
1041 | | * the current node *not* being a leaf, but rather its two children |
1042 | | * being leaves. This causes the loss of one codeword for the |
1043 | | * current depth and the addition of two codewords for the current |
1044 | | * depth plus one. |
1045 | | * |
1046 | | * (3) We can handle the length-limited constraint fairly easily by |
1047 | | * simply using the largest length available when a depth exceeds |
1048 | | * max_codeword_len. |
1049 | | */ |
1050 | |
1051 | 0 | for (len = 0; len <= max_codeword_len; len++) |
1052 | 0 | len_counts[len] = 0; |
1053 | 0 | len_counts[1] = 2; |
1054 | | |
1055 | | /* Set the root node's depth to 0. */ |
1056 | 0 | A[root_idx] &= SYMBOL_MASK; |
1057 | |
1058 | 0 | for (node = root_idx - 1; node >= 0; node--) { |
1059 | | |
1060 | | /* Calculate the depth of this node. */ |
1061 | |
1062 | 0 | unsigned parent = A[node] >> NUM_SYMBOL_BITS; |
1063 | 0 | unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS; |
1064 | 0 | unsigned depth = parent_depth + 1; |
1065 | | |
1066 | | /* |
1067 | | * Set the depth of this node so that it is available when its |
1068 | | * children (if any) are processed. |
1069 | | */ |
1070 | 0 | A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS); |
1071 | | |
1072 | | /* |
1073 | | * If needed, decrease the length to meet the length-limited |
1074 | | * constraint. This is not the optimal method for generating |
1075 | | * length-limited Huffman codes! But it should be good enough. |
1076 | | */ |
1077 | 0 | if (depth >= max_codeword_len) { |
1078 | 0 | depth = max_codeword_len; |
1079 | 0 | do { |
1080 | 0 | depth--; |
1081 | 0 | } while (len_counts[depth] == 0); |
1082 | 0 | } |
1083 | | |
1084 | | /* |
1085 | | * Account for the fact that we have a non-leaf node at the |
1086 | | * current depth. |
1087 | | */ |
1088 | 0 | len_counts[depth]--; |
1089 | 0 | len_counts[depth + 1] += 2; |
1090 | 0 | } |
1091 | 0 | } |
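/*
 * Continuing the build_tree() example with non-leaf frequencies { 2, 4, 7 }
 * and root index 2: the root gets depth 0, A[1] gets depth 1 (turning one
 * length-1 slot into two length-2 slots), and A[0] gets depth 2 (turning one
 * length-2 slot into two length-3 slots).  Assuming max_codeword_len >= 3,
 * the result is len_counts[1] = 1, len_counts[2] = 1, len_counts[3] = 2,
 * i.e. codeword lengths 1, 2, 3, 3 for the symbols with frequencies 3, 2, 1,
 * and 1.
 */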
1092 | | |
1093 | | /* |
1094 | | * DEFLATE uses bit-reversed codewords, so we must bit-reverse the codewords |
1095 | | * after generating them. All codewords have length <= 16 bits. If the CPU has |
1096 | | * a bit-reversal instruction, then that is the fastest method. Otherwise the |
1097 | | * fastest method is to reverse the bits in each of the two bytes using a table. |
1098 | | * The table method is slightly faster than using bitwise operations to flip |
1099 | | * adjacent 1, 2, 4, and then 8-bit fields, even if 2 to 4 codewords are packed |
1100 | | * into a machine word and processed together using that method. |
1101 | | */ |
1102 | | |
1103 | | #ifdef rbit32 |
1104 | | static forceinline u32 reverse_codeword(u32 codeword, u8 len) |
1105 | | { |
1106 | | return rbit32(codeword) >> ((32 - len) & 31); |
1107 | | } |
1108 | | #else |
1109 | | /* Generated by scripts/gen_bitreverse_tab.py */ |
1110 | | static const u8 bitreverse_tab[256] = { |
1111 | | 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, |
1112 | | 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0, |
1113 | | 0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8, |
1114 | | 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8, |
1115 | | 0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4, |
1116 | | 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4, |
1117 | | 0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec, |
1118 | | 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc, |
1119 | | 0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2, |
1120 | | 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2, |
1121 | | 0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea, |
1122 | | 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa, |
1123 | | 0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6, |
1124 | | 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6, |
1125 | | 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee, |
1126 | | 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe, |
1127 | | 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1, |
1128 | | 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1, |
1129 | | 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9, |
1130 | | 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9, |
1131 | | 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5, |
1132 | | 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5, |
1133 | | 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed, |
1134 | | 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd, |
1135 | | 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3, |
1136 | | 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3, |
1137 | | 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb, |
1138 | | 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb, |
1139 | | 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7, |
1140 | | 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7, |
1141 | | 0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef, |
1142 | | 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff, |
1143 | | }; |
1144 | | |
1145 | | static forceinline u32 reverse_codeword(u32 codeword, u8 len) |
1146 | 0 | { |
1147 | 0 | STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16); |
1148 | 0 | codeword = ((u32)bitreverse_tab[codeword & 0xff] << 8) | |
1149 | 0 | bitreverse_tab[codeword >> 8]; |
1150 | 0 | return codeword >> (16 - len); |
1151 | 0 | } |
1152 | | #endif /* !rbit32 */ |
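/*
 * For example, reverse_codeword(0x3, 4) reverses the 4-bit codeword 0011 to
 * 1100: the table form computes ((bitreverse_tab[0x03] << 8) |
 * bitreverse_tab[0x00]) >> (16 - 4) = 0xc000 >> 12 = 0xc.
 */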
1153 | | |
1154 | | /* |
1155 | | * Generate the codewords for a canonical Huffman code. |
1156 | | * |
1157 | | * @A |
1158 | | * The output array for codewords. In addition, initially this |
1159 | | * array must contain the symbols, sorted primarily by frequency and |
1160 | | * secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of |
1161 | | * each entry. |
1162 | | * |
1163 | | * @lens |
1164 | | * Output array for codeword lengths. |
1165 | | * |
1166 | | * @len_counts |
1167 | | * An array that provides the number of codewords that will have |
1168 | | * each possible length <= max_codeword_len. |
1169 | | * |
1170 | | * @max_codeword_len |
1171 | | * Maximum length, in bits, of each codeword. |
1172 | | * |
1173 | | * @num_syms |
1174 | | * Number of symbols in the alphabet, including symbols with zero |
1175 | | * frequency. This is the length of the 'A' and 'lens' arrays. |
1176 | | */ |
1177 | | static void |
1178 | | gen_codewords(u32 A[], u8 lens[], const unsigned len_counts[], |
1179 | | unsigned max_codeword_len, unsigned num_syms) |
1180 | 0 | { |
1181 | 0 | u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1]; |
1182 | 0 | unsigned i; |
1183 | 0 | unsigned len; |
1184 | 0 | unsigned sym; |
1185 | | |
1186 | | /* |
1187 | | * Given the number of codewords that will have each length, assign |
1188 | | * codeword lengths to symbols. We do this by assigning the lengths in |
1189 | | * decreasing order to the symbols sorted primarily by increasing |
1190 | | * frequency and secondarily by increasing symbol value. |
1191 | | */ |
1192 | 0 | for (i = 0, len = max_codeword_len; len >= 1; len--) { |
1193 | 0 | unsigned count = len_counts[len]; |
1194 | |
1195 | 0 | while (count--) |
1196 | 0 | lens[A[i++] & SYMBOL_MASK] = len; |
1197 | 0 | } |
1198 | | |
1199 | | /* |
1200 | | * Generate the codewords themselves. We initialize the |
1201 | | * 'next_codewords' array to provide the lexicographically first |
1202 | | * codeword of each length, then assign codewords in symbol order. This |
1203 | | * produces a canonical code. |
1204 | | */ |
1205 | 0 | next_codewords[0] = 0; |
1206 | 0 | next_codewords[1] = 0; |
1207 | 0 | for (len = 2; len <= max_codeword_len; len++) |
1208 | 0 | next_codewords[len] = |
1209 | 0 | (next_codewords[len - 1] + len_counts[len - 1]) << 1; |
1210 | |
1211 | 0 | for (sym = 0; sym < num_syms; sym++) { |
1212 | | /* DEFLATE requires bit-reversed codewords. */ |
1213 | 0 | A[sym] = reverse_codeword(next_codewords[lens[sym]]++, |
1214 | 0 | lens[sym]); |
1215 | 0 | } |
1216 | 0 | } |
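/*
 * Continuing the running example (4 symbols with frequencies 3, 2, 1, 1 and
 * len_counts { [1] = 1, [2] = 1, [3] = 2 }): the lengths are assigned in
 * decreasing order to the symbols sorted by increasing frequency, so lens
 * becomes { 1, 2, 3, 3 }.  next_codewords is initialized to 0, 0b10, and
 * 0b110 for lengths 1, 2, and 3, giving the canonical codewords 0, 10, 110,
 * 111 before each is bit-reversed as DEFLATE requires.
 */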
1217 | | |
1218 | | /* |
1219 | | * --------------------------------------------------------------------- |
1220 | | * deflate_make_huffman_code() |
1221 | | * --------------------------------------------------------------------- |
1222 | | * |
1223 | | * Given an alphabet and the frequency of each symbol in it, construct a |
1224 | | * length-limited canonical Huffman code. |
1225 | | * |
1226 | | * @num_syms |
1227 | | * The number of symbols in the alphabet. The symbols are the integers in |
1228 | | * the range [0, num_syms - 1]. This parameter must be at least 2 and |
1229 | | * must not exceed (1 << NUM_SYMBOL_BITS). |
1230 | | * |
1231 | | * @max_codeword_len |
1232 | | * The maximum permissible codeword length. |
1233 | | * |
1234 | | * @freqs |
1235 | | * An array of length @num_syms that gives the frequency of each symbol. |
1236 | | * It is valid for some, none, or all of the frequencies to be 0. The sum |
1237 | | * of frequencies must not exceed (1 << NUM_FREQ_BITS) - 1. |
1238 | | * |
1239 | | * @lens |
1240 | | * An array of @num_syms entries in which this function will return the |
1241 | | * length, in bits, of the codeword assigned to each symbol. Symbols with |
1242 | | * 0 frequency will not have codewords per se, but their entries in this |
1243 | | * array will be set to 0. No lengths greater than @max_codeword_len will |
1244 | | * be assigned. |
1245 | | * |
1246 | | * @codewords |
1247 | | * An array of @num_syms entries in which this function will return the |
1248 | | * codeword for each symbol, right-justified and padded on the left with |
1249 | | * zeroes. Codewords for symbols with 0 frequency will be undefined. |
1250 | | * |
1251 | | * --------------------------------------------------------------------- |
1252 | | * |
1253 | | * This function builds a length-limited canonical Huffman code. |
1254 | | * |
1255 | | * A length-limited Huffman code contains no codewords longer than some |
1256 | | * specified length, and has exactly (with some algorithms) or approximately |
1257 | | * (with the algorithm used here) the minimum weighted path length from the |
1258 | | * root, given this constraint. |
1259 | | * |
1260 | | * A canonical Huffman code satisfies the properties that a longer codeword |
1261 | | * never lexicographically precedes a shorter codeword, and the lexicographic |
1262 | | * ordering of codewords of the same length is the same as the lexicographic |
1263 | | * ordering of the corresponding symbols. A canonical Huffman code, or more |
1264 | | * generally a canonical prefix code, can be reconstructed from only a list |
1265 | | * containing the codeword length of each symbol. |
1266 | | * |
1267 | | * The classic algorithm to generate a Huffman code creates a node for each |
1268 | | * symbol, then inserts these nodes into a min-heap keyed by symbol frequency. |
1269 | | * Then, repeatedly, the two lowest-frequency nodes are removed from the |
1270 | | * min-heap and added as the children of a new node having frequency equal to |
1271 | | * the sum of its two children, which is then inserted into the min-heap. When |
1272 | | * only a single node remains in the min-heap, it is the root of the Huffman |
1273 | | * tree. The codeword for each symbol is determined by the path needed to reach |
1274 | | * the corresponding node from the root. Descending to the left child appends a |
1275 | | * 0 bit, whereas descending to the right child appends a 1 bit. |
1276 | | * |
1277 | | * The classic algorithm is relatively easy to understand, but it is subject to |
1278 | | * a number of inefficiencies. In practice, it is fastest to first sort the |
1279 | | * symbols by frequency. (This itself can be subject to an optimization based |
1280 | | * on the fact that most frequencies tend to be low.) At the same time, we sort |
1281 | | * secondarily by symbol value, which aids the process of generating a canonical |
1282 | | * code. Then, during tree construction, no heap is necessary because both the |
1283 | | * leaf nodes and the unparented non-leaf nodes can be easily maintained in |
1284 | | * sorted order. Consequently, there can never be more than two possibilities |
1285 | | * for the next-lowest-frequency node. |
1286 | | * |
1287 | | * In addition, because we're generating a canonical code, we actually don't |
1288 | | * need the leaf nodes of the tree at all, only the non-leaf nodes. This is |
1289 | | * because for canonical code generation we don't need to know where the symbols |
1290 | | * are in the tree. Rather, we only need to know how many leaf nodes have each |
1291 | | * depth (codeword length). And this information can, in fact, be quickly |
1292 | | * generated from the tree of non-leaves only. |
1293 | | * |
1294 | | * Furthermore, we can build this stripped-down Huffman tree directly in the |
1295 | | * array in which the codewords are to be generated, provided that these array |
1296 | | * slots are large enough to hold a symbol and frequency value. |
1297 | | * |
1298 | | * Still furthermore, we don't even need to maintain explicit child pointers. |
1299 | | * We only need the parent pointers, and even those can be overwritten in-place |
1300 | | * with depth information as part of the process of extracting codeword lengths |
1301 | | * from the tree. So in summary, we do NOT need a big structure like: |
1302 | | * |
1303 | | * struct huffman_tree_node { |
1304 | | * unsigned int symbol; |
1305 | | * unsigned int frequency; |
1306 | | * unsigned int depth; |
1307 | | * struct huffman_tree_node *left_child; |
1308 | | * struct huffman_tree_node *right_child; |
1309 | | * }; |
1310 | | * |
1311 | | * |
1312 | | * ... which often gets used in "naive" implementations of Huffman code |
1313 | | * generation. |
1314 | | * |
1315 | | * Many of these optimizations are based on the implementation in 7-Zip (source |
1316 | | * file: C/HuffEnc.c), which was placed in the public domain by Igor Pavlov. |
1317 | | */ |
1318 | | static void |
1319 | | deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len, |
1320 | | const u32 freqs[], u8 lens[], u32 codewords[]) |
1321 | 0 | { |
1322 | 0 | u32 *A = codewords; |
1323 | 0 | unsigned num_used_syms; |
1324 | |
1325 | 0 | STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS); |
1326 | 0 | STATIC_ASSERT(MAX_BLOCK_LENGTH <= ((u32)1 << NUM_FREQ_BITS) - 1); |
1327 | | |
1328 | | /* |
1329 | | * We begin by sorting the symbols primarily by frequency and |
1330 | | * secondarily by symbol value. As an optimization, the array used for |
1331 | | * this purpose ('A') shares storage with the space in which we will |
1332 | | * eventually return the codewords. |
1333 | | */ |
1334 | 0 | num_used_syms = sort_symbols(num_syms, freqs, lens, A); |
1335 | | /* |
1336 | | * 'num_used_syms' is the number of symbols with nonzero frequency. |
1337 | | * This may be less than @num_syms. 'num_used_syms' is also the number |
1338 | | * of entries in 'A' that are valid. Each entry consists of a distinct |
1339 | | * symbol and a nonzero frequency packed into a 32-bit integer. |
1340 | | */ |
1341 | | |
1342 | | /* |
1343 | | * A complete Huffman code must contain at least 2 codewords. Yet, it's |
1344 | | * possible that fewer than 2 symbols were used. When this happens, |
1345 | | * it's usually for the offset code (0-1 symbols used). But it's also |
1346 | | * theoretically possible for the litlen and pre codes (1 symbol used). |
1347 | | * |
1348 | | * The DEFLATE RFC explicitly allows the offset code to contain just 1 |
1349 | | * codeword, or even be completely empty. But it's silent about the |
1350 | | * other codes. It also doesn't say whether, in the 1-codeword case, |
1351 | | * the codeword (which it says must be 1 bit) is '0' or '1'. |
1352 | | * |
1353 | | * In any case, some DEFLATE decompressors reject these cases. zlib |
1354 | | * generally allows them, but it does reject precodes that have just 1 |
1355 | | * codeword. More problematically, zlib v1.2.1 and earlier rejected |
1356 | | * empty offset codes, and this behavior can also be seen in Windows |
1357 | | * Explorer's ZIP unpacker (supposedly even still in Windows 11). |
1358 | | * |
1359 | | * Other DEFLATE compressors, including zlib, always send at least 2 |
1360 | | * codewords in order to make a complete Huffman code. Therefore, this |
1361 | | * is a case where practice does not entirely match the specification. |
1362 | | * We follow practice by generating 2 codewords of length 1: codeword |
1363 | | * '0' for symbol 0, and codeword '1' for another symbol -- the used |
1364 | | * symbol if it exists and is not symbol 0, otherwise symbol 1. This |
1365 | | * does worsen the compression ratio by having to send an unnecessary |
1366 | | * offset codeword length. But this only affects rare cases such as |
1367 | | * blocks containing all literals, and it only makes a tiny difference. |
1368 | | */ |
1369 | 0 | if (unlikely(num_used_syms < 2)) { |
1370 | 0 | unsigned sym = num_used_syms ? (A[0] & SYMBOL_MASK) : 0; |
1371 | 0 | unsigned nonzero_idx = sym ? sym : 1; |
1372 | |
1373 | 0 | codewords[0] = 0; |
1374 | 0 | lens[0] = 1; |
1375 | 0 | codewords[nonzero_idx] = 1; |
1376 | 0 | lens[nonzero_idx] = 1; |
1377 | 0 | return; |
1378 | 0 | } |
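/*
 * Added clarification: e.g., if the only used symbol is symbol 5, then
 * sym == 5 and nonzero_idx == 5, so symbol 0 receives the 1-bit codeword
 * '0' and symbol 5 the 1-bit codeword '1'.  If no symbol was used at all
 * (possible for the offset code), symbols 0 and 1 both receive 1-bit
 * codewords, which still forms a complete code.
 */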
1379 | | |
1380 | | /* |
1381 | | * Build a stripped-down version of the Huffman tree, sharing the array |
1382 | | * 'A' with the symbol values. Then extract length counts from the tree |
1383 | | * and use them to generate the final codewords. |
1384 | | */ |
1385 | | |
1386 | 0 | build_tree(A, num_used_syms); |
1387 | |
1388 | 0 | { |
1389 | 0 | unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1]; |
1390 | |
1391 | 0 | compute_length_counts(A, num_used_syms - 2, |
1392 | 0 | len_counts, max_codeword_len); |
1393 | |
1394 | 0 | gen_codewords(A, lens, len_counts, max_codeword_len, num_syms); |
1395 | 0 | } |
1396 | 0 | } |
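/*
 * Hypothetical usage sketch (added commentary; not part of libdeflate's
 * public API): building a code for a tiny 4-symbol alphabet in which
 * symbol 2 never occurs.
 *
 *	u32 freqs[4] = { 10, 1, 0, 5 };
 *	u8 lens[4];
 *	u32 codewords[4];
 *
 *	deflate_make_huffman_code(4, DEFLATE_MAX_CODEWORD_LEN, freqs,
 *				  lens, codewords);
 *
 * The expected result is lens == {1, 2, 0, 2}: the unused symbol 2 gets
 * length 0 and an undefined codeword, while the other three symbols form a
 * complete canonical prefix code.
 */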
1397 | | |
1398 | | /* |
1399 | | * Clear the Huffman symbol frequency counters. This must be called when |
1400 | | * starting a new DEFLATE block. |
1401 | | */ |
1402 | | static void |
1403 | | deflate_reset_symbol_frequencies(struct libdeflate_compressor *c) |
1404 | 0 | { |
1405 | 0 | memset(&c->freqs, 0, sizeof(c->freqs)); |
1406 | 0 | } |
1407 | | |
1408 | | /* |
1409 | | * Build the literal/length and offset Huffman codes for a DEFLATE block. |
1410 | | * |
1411 | | * This takes as input the frequency tables for each alphabet and produces as |
1412 | | * output a set of tables that map symbols to codewords and codeword lengths. |
1413 | | */ |
1414 | | static void |
1415 | | deflate_make_huffman_codes(const struct deflate_freqs *freqs, |
1416 | | struct deflate_codes *codes) |
1417 | 0 | { |
1418 | 0 | deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS, |
1419 | 0 | MAX_LITLEN_CODEWORD_LEN, |
1420 | 0 | freqs->litlen, |
1421 | 0 | codes->lens.litlen, |
1422 | 0 | codes->codewords.litlen); |
1423 | |
1424 | 0 | deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS, |
1425 | 0 | MAX_OFFSET_CODEWORD_LEN, |
1426 | 0 | freqs->offset, |
1427 | 0 | codes->lens.offset, |
1428 | 0 | codes->codewords.offset); |
1429 | 0 | } |
1430 | | |
1431 | | /* Initialize c->static_codes. */ |
1432 | | static void |
1433 | | deflate_init_static_codes(struct libdeflate_compressor *c) |
1434 | 0 | { |
1435 | 0 | unsigned i; |
1436 | |
1437 | 0 | for (i = 0; i < 144; i++) |
1438 | 0 | c->freqs.litlen[i] = 1 << (9 - 8); |
1439 | 0 | for (; i < 256; i++) |
1440 | 0 | c->freqs.litlen[i] = 1 << (9 - 9); |
1441 | 0 | for (; i < 280; i++) |
1442 | 0 | c->freqs.litlen[i] = 1 << (9 - 7); |
1443 | 0 | for (; i < 288; i++) |
1444 | 0 | c->freqs.litlen[i] = 1 << (9 - 8); |
1445 | |
1446 | 0 | for (i = 0; i < 32; i++) |
1447 | 0 | c->freqs.offset[i] = 1 << (5 - 5); |
1448 | |
1449 | 0 | deflate_make_huffman_codes(&c->freqs, &c->static_codes); |
1450 | 0 | } |
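/*
 * Added clarification: the frequencies above are 1 << (9 - len), i.e.
 * proportional to 2^(-len), where 'len' is the codeword length that the
 * static codes of RFC 1951 assign to each symbol (8, 9, 7, and 8 bits for
 * the four litlen ranges, and 5 bits for every offset symbol).  Because
 * these lengths satisfy the Kraft equality
 * (144/2^8 + 112/2^9 + 24/2^7 + 8/2^8 = 1, and 32/2^5 = 1), the Huffman
 * construction reproduces exactly those lengths, so the resulting
 * canonical codes are precisely the DEFLATE static codes.
 */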
1451 | | |
1452 | | /* Return the offset slot for the given match offset, using the small map. */ |
1453 | | static forceinline unsigned |
1454 | | deflate_get_offset_slot(u32 offset) |
1455 | 0 | { |
1456 | | /* |
1457 | | * 1 <= offset <= 32768 here. For 1 <= offset <= 256, |
1458 | | * deflate_offset_slot[offset - 1] gives the slot. |
1459 | | * |
1460 | | * For 257 <= offset <= 32768, we take advantage of the fact that 257 is |
1461 | | * the beginning of slot 16, and each slot [16..30) is exactly 1 << 7 == |
1462 | | * 128 times larger than each slot [2..16) (since the number of extra |
1463 | | * bits increases by 1 every 2 slots). Thus, the slot is: |
1464 | | * |
1465 | | * deflate_offset_slot[2 + ((offset - 257) >> 7)] + (16 - 2) |
1466 | | * == deflate_offset_slot[((offset - 1) >> 7)] + 14 |
1467 | | * |
1468 | | * Define 'n = (offset <= 256) ? 0 : 7'. Then any offset is handled by: |
1469 | | * |
1470 | | * deflate_offset_slot[(offset - 1) >> n] + (n << 1) |
1471 | | * |
1472 | | * For better performance, replace 'n = (offset <= 256) ? 0 : 7' with |
1473 | | * the equivalent (for offset <= 536871168) 'n = (256 - offset) >> 29'. |
1474 | | */ |
1475 | 0 | unsigned n = (256 - offset) >> 29; |
1476 | |
1477 | 0 | ASSERT(offset >= 1 && offset <= 32768); |
1478 | |
1479 | 0 | return deflate_offset_slot[(offset - 1) >> n] + (n << 1); |
1480 | 0 | } |
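/*
 * Illustrative worked example (added commentary): for offset == 1,
 * n = (256 - 1) >> 29 == 0, so the slot is deflate_offset_slot[0] + 0,
 * i.e. slot 0.  For offset == 300 the subtraction wraps, so n == 7 and the
 * slot is deflate_offset_slot[(300 - 1) >> 7] + 14 =
 * deflate_offset_slot[2] + 14 == 16 (assuming the standard DEFLATE offset
 * table, where deflate_offset_slot[2] == 2), the slot whose base offset
 * is 257.
 */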
1481 | | |
1482 | | static unsigned |
1483 | | deflate_compute_precode_items(const u8 lens[], const unsigned num_lens, |
1484 | | u32 precode_freqs[], unsigned precode_items[]) |
1485 | 0 | { |
1486 | 0 | unsigned *itemptr; |
1487 | 0 | unsigned run_start; |
1488 | 0 | unsigned run_end; |
1489 | 0 | unsigned extra_bits; |
1490 | 0 | u8 len; |
1491 | |
1492 | 0 | memset(precode_freqs, 0, |
1493 | 0 | DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0])); |
1494 | |
1495 | 0 | itemptr = precode_items; |
1496 | 0 | run_start = 0; |
1497 | 0 | do { |
1498 | | /* Find the next run of codeword lengths. */ |
1499 | | |
1500 | | /* len = the length being repeated */ |
1501 | 0 | len = lens[run_start]; |
1502 | | |
1503 | | /* Extend the run. */ |
1504 | 0 | run_end = run_start; |
1505 | 0 | do { |
1506 | 0 | run_end++; |
1507 | 0 | } while (run_end != num_lens && len == lens[run_end]); |
1508 | |
1509 | 0 | if (len == 0) { |
1510 | | /* Run of zeroes. */ |
1511 | | |
1512 | | /* Symbol 18: RLE 11 to 138 zeroes at a time. */ |
1513 | 0 | while ((run_end - run_start) >= 11) { |
1514 | 0 | extra_bits = MIN((run_end - run_start) - 11, |
1515 | 0 | 0x7F); |
1516 | 0 | precode_freqs[18]++; |
1517 | 0 | *itemptr++ = 18 | (extra_bits << 5); |
1518 | 0 | run_start += 11 + extra_bits; |
1519 | 0 | } |
1520 | | |
1521 | | /* Symbol 17: RLE 3 to 10 zeroes at a time. */ |
1522 | 0 | if ((run_end - run_start) >= 3) { |
1523 | 0 | extra_bits = MIN((run_end - run_start) - 3, |
1524 | 0 | 0x7); |
1525 | 0 | precode_freqs[17]++; |
1526 | 0 | *itemptr++ = 17 | (extra_bits << 5); |
1527 | 0 | run_start += 3 + extra_bits; |
1528 | 0 | } |
1529 | 0 | } else { |
1530 | | |
1531 | | /* A run of nonzero lengths. */ |
1532 | | |
1533 | | /* Symbol 16: RLE 3 to 6 of the previous length. */ |
1534 | 0 | if ((run_end - run_start) >= 4) { |
1535 | 0 | precode_freqs[len]++; |
1536 | 0 | *itemptr++ = len; |
1537 | 0 | run_start++; |
1538 | 0 | do { |
1539 | 0 | extra_bits = MIN((run_end - run_start) - |
1540 | 0 | 3, 0x3); |
1541 | 0 | precode_freqs[16]++; |
1542 | 0 | *itemptr++ = 16 | (extra_bits << 5); |
1543 | 0 | run_start += 3 + extra_bits; |
1544 | 0 | } while ((run_end - run_start) >= 3); |
1545 | 0 | } |
1546 | 0 | } |
1547 | | |
1548 | | /* Output any remaining lengths without RLE. */ |
1549 | 0 | while (run_start != run_end) { |
1550 | 0 | precode_freqs[len]++; |
1551 | 0 | *itemptr++ = len; |
1552 | 0 | run_start++; |
1553 | 0 | } |
1554 | 0 | } while (run_start != num_lens); |
1555 | |
1556 | 0 | return itemptr - precode_items; |
1557 | 0 | } |
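/*
 * Illustrative worked example (added commentary): each item packs a
 * precode symbol into its low 5 bits and that symbol's extra bits above
 * them.  A run of 20 zero lengths becomes the single item 18 | (9 << 5),
 * meaning "11 + 9 = 20 zeroes".  A run of five lengths equal to 8 becomes
 * the two items 8 (one literal length) and 16 | (1 << 5), meaning "repeat
 * the previous length 3 + 1 = 4 times".
 */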
1558 | | |
1559 | | /* |
1560 | | * Huffman codeword lengths for dynamic Huffman blocks are compressed using a |
1561 | | * separate Huffman code, the "precode", which contains a symbol for each |
1562 | | * possible codeword length in the larger code as well as several special |
1563 | | * symbols to represent repeated codeword lengths (a form of run-length |
1564 | | * encoding). The precode is itself constructed in canonical form, and its |
1565 | | * codeword lengths are represented literally in 19 3-bit fields that |
1566 | | * immediately precede the compressed codeword lengths of the larger code. |
1567 | | */ |
1568 | | |
1569 | | /* Precompute the information needed to output dynamic Huffman codes. */ |
1570 | | static void |
1571 | | deflate_precompute_huffman_header(struct libdeflate_compressor *c) |
1572 | 0 | { |
1573 | | /* Compute how many litlen and offset symbols are needed. */ |
1574 | |
1575 | 0 | for (c->o.precode.num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS; |
1576 | 0 | c->o.precode.num_litlen_syms > 257; |
1577 | 0 | c->o.precode.num_litlen_syms--) |
1578 | 0 | if (c->codes.lens.litlen[c->o.precode.num_litlen_syms - 1] != 0) |
1579 | 0 | break; |
1580 | |
1581 | 0 | for (c->o.precode.num_offset_syms = DEFLATE_NUM_OFFSET_SYMS; |
1582 | 0 | c->o.precode.num_offset_syms > 1; |
1583 | 0 | c->o.precode.num_offset_syms--) |
1584 | 0 | if (c->codes.lens.offset[c->o.precode.num_offset_syms - 1] != 0) |
1585 | 0 | break; |
1586 | | |
1587 | | /* |
1588 | | * If we're not using the full set of literal/length codeword lengths, |
1589 | | * then temporarily move the offset codeword lengths over so that the |
1590 | | * literal/length and offset codeword lengths are contiguous. |
1591 | | */ |
1592 | 0 | STATIC_ASSERT(offsetof(struct deflate_lens, offset) == |
1593 | 0 | DEFLATE_NUM_LITLEN_SYMS); |
1594 | 0 | if (c->o.precode.num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { |
1595 | 0 | memmove((u8 *)&c->codes.lens + c->o.precode.num_litlen_syms, |
1596 | 0 | (u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS, |
1597 | 0 | c->o.precode.num_offset_syms); |
1598 | 0 | } |
1599 | | |
1600 | | /* |
1601 | | * Compute the "items" (RLE / literal tokens and extra bits) with which |
1602 | | * the codeword lengths in the larger code will be output. |
1603 | | */ |
1604 | 0 | c->o.precode.num_items = |
1605 | 0 | deflate_compute_precode_items((u8 *)&c->codes.lens, |
1606 | 0 | c->o.precode.num_litlen_syms + |
1607 | 0 | c->o.precode.num_offset_syms, |
1608 | 0 | c->o.precode.freqs, |
1609 | 0 | c->o.precode.items); |
1610 | | |
1611 | | /* Build the precode. */ |
1612 | 0 | deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS, |
1613 | 0 | MAX_PRE_CODEWORD_LEN, |
1614 | 0 | c->o.precode.freqs, c->o.precode.lens, |
1615 | 0 | c->o.precode.codewords); |
1616 | | |
1617 | | /* Count how many precode lengths we actually need to output. */ |
1618 | 0 | for (c->o.precode.num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS; |
1619 | 0 | c->o.precode.num_explicit_lens > 4; |
1620 | 0 | c->o.precode.num_explicit_lens--) |
1621 | 0 | if (c->o.precode.lens[deflate_precode_lens_permutation[ |
1622 | 0 | c->o.precode.num_explicit_lens - 1]] != 0) |
1623 | 0 | break; |
1624 | | |
1625 | | /* Restore the offset codeword lengths if needed. */ |
1626 | 0 | if (c->o.precode.num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { |
1627 | 0 | memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS, |
1628 | 0 | (u8 *)&c->codes.lens + c->o.precode.num_litlen_syms, |
1629 | 0 | c->o.precode.num_offset_syms); |
1630 | 0 | } |
1631 | 0 | } |
1632 | | |
1633 | | /* |
1634 | | * To make it faster to output matches, compute the "full" match length |
1635 | | * codewords, i.e. the concatenation of the litlen codeword and the extra bits |
1636 | | * for each possible match length. |
1637 | | */ |
1638 | | static void |
1639 | | deflate_compute_full_len_codewords(struct libdeflate_compressor *c, |
1640 | | const struct deflate_codes *codes) |
1641 | 0 | { |
1642 | 0 | u32 len; |
1643 | |
1644 | 0 | STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN + |
1645 | 0 | DEFLATE_MAX_EXTRA_LENGTH_BITS <= 32); |
1646 | |
1647 | 0 | for (len = DEFLATE_MIN_MATCH_LEN; len <= DEFLATE_MAX_MATCH_LEN; len++) { |
1648 | 0 | unsigned slot = deflate_length_slot[len]; |
1649 | 0 | unsigned litlen_sym = DEFLATE_FIRST_LEN_SYM + slot; |
1650 | 0 | u32 extra_bits = len - deflate_length_slot_base[slot]; |
1651 | |
1652 | 0 | c->o.length.codewords[len] = |
1653 | 0 | codes->codewords.litlen[litlen_sym] | |
1654 | 0 | (extra_bits << codes->lens.litlen[litlen_sym]); |
1655 | 0 | c->o.length.lens[len] = codes->lens.litlen[litlen_sym] + |
1656 | 0 | deflate_extra_length_bits[slot]; |
1657 | 0 | } |
1658 | 0 | } |
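/*
 * Illustrative example (added commentary, assuming the standard RFC 1951
 * length table): match length 14 falls in the slot with base 13 and 1
 * extra bit, i.e. litlen symbol 266.  Its "full" codeword is therefore
 *
 *	codes->codewords.litlen[266] | (1 << codes->lens.litlen[266])
 *
 * with total length codes->lens.litlen[266] + 1 bits: the extra length bit
 * sits just above the litlen codeword, matching the LSB-first order in
 * which the bits will be written to the output stream.
 */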
1659 | | |
1660 | | /* Write a match to the output buffer. */ |
1661 | 0 | #define WRITE_MATCH(c_, codes_, length_, offset_, offset_slot_) \ |
1662 | 0 | do { \ |
1663 | 0 | const struct libdeflate_compressor *c__ = (c_); \ |
1664 | 0 | const struct deflate_codes *codes__ = (codes_); \ |
1665 | 0 | u32 length__ = (length_); \ |
1666 | 0 | u32 offset__ = (offset_); \ |
1667 | 0 | unsigned offset_slot__ = (offset_slot_); \ |
1668 | 0 | \ |
1669 | 0 | /* Litlen symbol and extra length bits */ \ |
1670 | 0 | STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + \ |
1671 | 0 | DEFLATE_MAX_EXTRA_LENGTH_BITS)); \ |
1672 | 0 | ADD_BITS(c__->o.length.codewords[length__], \ |
1673 | 0 | c__->o.length.lens[length__]); \ |
1674 | 0 | \ |
1675 | 0 | if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + \ |
1676 | 0 | DEFLATE_MAX_EXTRA_LENGTH_BITS + \ |
1677 | 0 | MAX_OFFSET_CODEWORD_LEN + \ |
1678 | 0 | DEFLATE_MAX_EXTRA_OFFSET_BITS)) \ |
1679 | 0 | FLUSH_BITS(); \ |
1680 | 0 | \ |
1681 | 0 | /* Offset symbol */ \ |
1682 | 0 | ADD_BITS(codes__->codewords.offset[offset_slot__], \ |
1683 | 0 | codes__->lens.offset[offset_slot__]); \ |
1684 | 0 | \ |
1685 | 0 | if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN + \ |
1686 | 0 | DEFLATE_MAX_EXTRA_OFFSET_BITS)) \ |
1687 | 0 | FLUSH_BITS(); \ |
1688 | 0 | \ |
1689 | 0 | /* Extra offset bits */ \ |
1690 | 0 | ADD_BITS(offset__ - deflate_offset_slot_base[offset_slot__], \ |
1691 | 0 | deflate_extra_offset_bits[offset_slot__]); \ |
1692 | 0 | \ |
1693 | 0 | FLUSH_BITS(); \ |
1694 | 0 | } while (0) |
1695 | | |
1696 | | /* |
1697 | | * Choose the best type of block to use (dynamic Huffman, static Huffman, or |
1698 | | * uncompressed), then output it. |
1699 | | * |
1700 | | * The uncompressed data of the block is @block_begin[0..@block_length-1]. The |
1701 | | * sequence of literals and matches that will be used to compress the block (if |
1702 | | * a compressed block is chosen) is given by @sequences if it's non-NULL, or |
1703 | | * else @c->p.n.optimum_nodes. @c->freqs and @c->codes must be already set |
1704 | | * according to the literals, matches, and end-of-block symbol. |
1705 | | */ |
1706 | | static void |
1707 | | deflate_flush_block(struct libdeflate_compressor *c, |
1708 | | struct deflate_output_bitstream *os, |
1709 | | const u8 *block_begin, u32 block_length, |
1710 | | const struct deflate_sequence *sequences, |
1711 | | bool is_final_block) |
1712 | 0 | { |
1713 | | /* |
1714 | | * It is hard to get compilers to understand that writes to 'os->next' |
1715 | | * don't alias 'os'. That hurts performance significantly, as |
1716 | | * everything in 'os' would keep getting re-loaded. ('restrict' |
1717 | | * *should* do the trick, but it's unreliable.) Therefore, we keep all |
1718 | | * the output bitstream state in local variables, and output bits using |
1719 | | * macros. This is similar to what the decompressor does. |
1720 | | */ |
1721 | 0 | const u8 *in_next = block_begin; |
1722 | 0 | const u8 * const in_end = block_begin + block_length; |
1723 | 0 | bitbuf_t bitbuf = os->bitbuf; |
1724 | 0 | unsigned bitcount = os->bitcount; |
1725 | 0 | u8 *out_next = os->next; |
1726 | 0 | u8 * const out_fast_end = |
1727 | 0 | os->end - MIN(WORDBYTES - 1, os->end - out_next); |
1728 | | /* |
1729 | | * The cost for each block type, in bits. Start with the cost of the |
1730 | | * block header which is 3 bits. |
1731 | | */ |
1732 | 0 | u32 dynamic_cost = 3; |
1733 | 0 | u32 static_cost = 3; |
1734 | 0 | u32 uncompressed_cost = 3; |
1735 | 0 | u32 best_cost; |
1736 | 0 | struct deflate_codes *codes; |
1737 | 0 | unsigned sym; |
1738 | |
1739 | 0 | ASSERT(block_length >= MIN_BLOCK_LENGTH || |
1740 | 0 | (is_final_block && block_length > 0)); |
1741 | 0 | ASSERT(block_length <= MAX_BLOCK_LENGTH); |
1742 | 0 | ASSERT(bitcount <= 7); |
1743 | 0 | ASSERT((bitbuf & ~(((bitbuf_t)1 << bitcount) - 1)) == 0); |
1744 | 0 | ASSERT(out_next <= os->end); |
1745 | 0 | ASSERT(!os->overflow); |
1746 | | |
1747 | | /* Precompute the precode items and build the precode. */ |
1748 | 0 | deflate_precompute_huffman_header(c); |
1749 | | |
1750 | | /* Account for the cost of encoding dynamic Huffman codes. */ |
1751 | 0 | dynamic_cost += 5 + 5 + 4 + (3 * c->o.precode.num_explicit_lens); |
1752 | 0 | for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) { |
1753 | 0 | u32 extra = deflate_extra_precode_bits[sym]; |
1754 | |
1755 | 0 | dynamic_cost += c->o.precode.freqs[sym] * |
1756 | 0 | (extra + c->o.precode.lens[sym]); |
1757 | 0 | } |
1758 | | |
1759 | | /* Account for the cost of encoding literals. */ |
1760 | 0 | for (sym = 0; sym < 144; sym++) { |
1761 | 0 | dynamic_cost += c->freqs.litlen[sym] * |
1762 | 0 | c->codes.lens.litlen[sym]; |
1763 | 0 | static_cost += c->freqs.litlen[sym] * 8; |
1764 | 0 | } |
1765 | 0 | for (; sym < 256; sym++) { |
1766 | 0 | dynamic_cost += c->freqs.litlen[sym] * |
1767 | 0 | c->codes.lens.litlen[sym]; |
1768 | 0 | static_cost += c->freqs.litlen[sym] * 9; |
1769 | 0 | } |
1770 | | |
1771 | | /* Account for the cost of encoding the end-of-block symbol. */ |
1772 | 0 | dynamic_cost += c->codes.lens.litlen[DEFLATE_END_OF_BLOCK]; |
1773 | 0 | static_cost += 7; |
1774 | | |
1775 | | /* Account for the cost of encoding lengths. */ |
1776 | 0 | for (sym = DEFLATE_FIRST_LEN_SYM; |
1777 | 0 | sym < DEFLATE_FIRST_LEN_SYM + ARRAY_LEN(deflate_extra_length_bits); |
1778 | 0 | sym++) { |
1779 | 0 | u32 extra = deflate_extra_length_bits[ |
1780 | 0 | sym - DEFLATE_FIRST_LEN_SYM]; |
1781 | |
1782 | 0 | dynamic_cost += c->freqs.litlen[sym] * |
1783 | 0 | (extra + c->codes.lens.litlen[sym]); |
1784 | 0 | static_cost += c->freqs.litlen[sym] * |
1785 | 0 | (extra + c->static_codes.lens.litlen[sym]); |
1786 | 0 | } |
1787 | | |
1788 | | /* Account for the cost of encoding offsets. */ |
1789 | 0 | for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) { |
1790 | 0 | u32 extra = deflate_extra_offset_bits[sym]; |
1791 | |
1792 | 0 | dynamic_cost += c->freqs.offset[sym] * |
1793 | 0 | (extra + c->codes.lens.offset[sym]); |
1794 | 0 | static_cost += c->freqs.offset[sym] * (extra + 5); |
1795 | 0 | } |
1796 | | |
1797 | | /* Compute the cost of using uncompressed blocks. */ |
1798 | 0 | uncompressed_cost += (-(bitcount + 3) & 7) + 32 + |
1799 | 0 | (40 * (DIV_ROUND_UP(block_length, |
1800 | 0 | UINT16_MAX) - 1)) + |
1801 | 0 | (8 * block_length); |
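/*
 * Added clarification of the formula above: the initial 3 header bits
 * were already counted, so the remaining terms are the padding needed to
 * reach a byte boundary, the 32-bit LEN/NLEN fields, another 40 bits
 * (8 bits of header plus padding and 32 bits of LEN/NLEN) for each
 * additional 65535-byte chunk, and 8 bits per byte of raw data.
 */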
1802 | | |
1803 | | /* |
1804 | | * Choose and output the cheapest type of block. If there is a tie, |
1805 | | * prefer uncompressed, then static, then dynamic. |
1806 | | */ |
1807 | |
1808 | 0 | best_cost = MIN(dynamic_cost, MIN(static_cost, uncompressed_cost)); |
1809 | | |
1810 | | /* If the block isn't going to fit, then stop early. */ |
1811 | 0 | if (DIV_ROUND_UP(bitcount + best_cost, 8) > os->end - out_next) { |
1812 | 0 | os->overflow = true; |
1813 | 0 | return; |
1814 | 0 | } |
1815 | | /* |
1816 | | * Else, now we know that the block fits, so no further bounds checks on |
1817 | | * the output buffer are required until the next block. |
1818 | | */ |
1819 | | |
1820 | 0 | if (best_cost == uncompressed_cost) { |
1821 | | /* |
1822 | | * Uncompressed block(s). DEFLATE limits the length of |
1823 | | * uncompressed blocks to UINT16_MAX bytes, so if the length of |
1824 | | * the "block" we're flushing is over UINT16_MAX, we actually |
1825 | | * output multiple blocks. |
1826 | | */ |
1827 | 0 | do { |
1828 | 0 | u8 bfinal = 0; |
1829 | 0 | size_t len = UINT16_MAX; |
1830 | |
1831 | 0 | if (in_end - in_next <= UINT16_MAX) { |
1832 | 0 | bfinal = is_final_block; |
1833 | 0 | len = in_end - in_next; |
1834 | 0 | } |
1835 | | /* It was already checked that there is enough space. */ |
1836 | 0 | ASSERT(os->end - out_next >= |
1837 | 0 | DIV_ROUND_UP(bitcount + 3, 8) + 4 + len); |
1838 | | /* |
1839 | | * Output BFINAL (1 bit) and BTYPE (2 bits), then align |
1840 | | * to a byte boundary. |
1841 | | */ |
1842 | 0 | STATIC_ASSERT(DEFLATE_BLOCKTYPE_UNCOMPRESSED == 0); |
1843 | 0 | *out_next++ = (bfinal << bitcount) | bitbuf; |
1844 | 0 | if (bitcount > 5) |
1845 | 0 | *out_next++ = 0; |
1846 | 0 | bitbuf = 0; |
1847 | 0 | bitcount = 0; |
1848 | | /* Output LEN and NLEN, then the data itself. */ |
1849 | 0 | put_unaligned_le16(len, out_next); |
1850 | 0 | out_next += 2; |
1851 | 0 | put_unaligned_le16(~len, out_next); |
1852 | 0 | out_next += 2; |
1853 | 0 | memcpy(out_next, in_next, len); |
1854 | 0 | out_next += len; |
1855 | 0 | in_next += len; |
1856 | 0 | } while (in_next != in_end); |
1857 | | /* Done outputting uncompressed block(s) */ |
1858 | 0 | goto out; |
1859 | 0 | } |
1860 | | |
1861 | 0 | if (best_cost == static_cost) { |
1862 | | /* Static Huffman block */ |
1863 | 0 | codes = &c->static_codes; |
1864 | 0 | ADD_BITS(is_final_block, 1); |
1865 | 0 | ADD_BITS(DEFLATE_BLOCKTYPE_STATIC_HUFFMAN, 2); |
1866 | 0 | FLUSH_BITS(); |
1867 | 0 | } else { |
1868 | 0 | const unsigned num_explicit_lens = c->o.precode.num_explicit_lens; |
1869 | 0 | const unsigned num_precode_items = c->o.precode.num_items; |
1870 | 0 | unsigned precode_sym, precode_item; |
1871 | 0 | unsigned i; |
1872 | | |
1873 | | /* Dynamic Huffman block */ |
1874 | |
1875 | 0 | codes = &c->codes; |
1876 | 0 | STATIC_ASSERT(CAN_BUFFER(1 + 2 + 5 + 5 + 4 + 3)); |
1877 | 0 | ADD_BITS(is_final_block, 1); |
1878 | 0 | ADD_BITS(DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN, 2); |
1879 | 0 | ADD_BITS(c->o.precode.num_litlen_syms - 257, 5); |
1880 | 0 | ADD_BITS(c->o.precode.num_offset_syms - 1, 5); |
1881 | 0 | ADD_BITS(num_explicit_lens - 4, 4); |
1882 | | |
1883 | | /* Output the lengths of the codewords in the precode. */ |
1884 | 0 | if (CAN_BUFFER(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) { |
1885 | | /* |
1886 | | * A 64-bit bitbuffer is just one bit too small to hold |
1887 | | * the maximum number of precode lens, so to minimize |
1888 | | * flushes we merge one len with the previous fields. |
1889 | | */ |
1890 | 0 | precode_sym = deflate_precode_lens_permutation[0]; |
1891 | 0 | ADD_BITS(c->o.precode.lens[precode_sym], 3); |
1892 | 0 | FLUSH_BITS(); |
1893 | 0 | i = 1; /* num_explicit_lens >= 4 */ |
1894 | 0 | do { |
1895 | 0 | precode_sym = |
1896 | 0 | deflate_precode_lens_permutation[i]; |
1897 | 0 | ADD_BITS(c->o.precode.lens[precode_sym], 3); |
1898 | 0 | } while (++i < num_explicit_lens); |
1899 | 0 | FLUSH_BITS(); |
1900 | 0 | } else { |
1901 | 0 | FLUSH_BITS(); |
1902 | 0 | i = 0; |
1903 | 0 | do { |
1904 | 0 | precode_sym = |
1905 | 0 | deflate_precode_lens_permutation[i]; |
1906 | 0 | ADD_BITS(c->o.precode.lens[precode_sym], 3); |
1907 | 0 | FLUSH_BITS(); |
1908 | 0 | } while (++i < num_explicit_lens); |
1909 | 0 | } |
1910 | | |
1911 | | /* |
1912 | | * Output the lengths of the codewords in the litlen and offset |
1913 | | * codes, encoded by the precode. |
1914 | | */ |
1915 | 0 | i = 0; |
1916 | 0 | do { |
1917 | 0 | precode_item = c->o.precode.items[i]; |
1918 | 0 | precode_sym = precode_item & 0x1F; |
1919 | 0 | STATIC_ASSERT(CAN_BUFFER(MAX_PRE_CODEWORD_LEN + 7)); |
1920 | 0 | ADD_BITS(c->o.precode.codewords[precode_sym], |
1921 | 0 | c->o.precode.lens[precode_sym]); |
1922 | 0 | ADD_BITS(precode_item >> 5, |
1923 | 0 | deflate_extra_precode_bits[precode_sym]); |
1924 | 0 | FLUSH_BITS(); |
1925 | 0 | } while (++i < num_precode_items); |
1926 | 0 | } |
1927 | | |
1928 | | /* Output the literals and matches for a dynamic or static block. */ |
1929 | 0 | ASSERT(bitcount <= 7); |
1930 | 0 | deflate_compute_full_len_codewords(c, codes); |
1931 | 0 | #if SUPPORT_NEAR_OPTIMAL_PARSING |
1932 | 0 | if (sequences == NULL) { |
1933 | | /* Output the literals and matches from the minimum-cost path */ |
1934 | 0 | struct deflate_optimum_node *cur_node = |
1935 | 0 | &c->p.n.optimum_nodes[0]; |
1936 | 0 | struct deflate_optimum_node * const end_node = |
1937 | 0 | &c->p.n.optimum_nodes[block_length]; |
1938 | 0 | do { |
1939 | 0 | u32 length = cur_node->item & OPTIMUM_LEN_MASK; |
1940 | 0 | u32 offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT; |
1941 | |
1942 | 0 | if (length == 1) { |
1943 | | /* Literal */ |
1944 | 0 | ADD_BITS(codes->codewords.litlen[offset], |
1945 | 0 | codes->lens.litlen[offset]); |
1946 | 0 | FLUSH_BITS(); |
1947 | 0 | } else { |
1948 | | /* Match */ |
1949 | 0 | WRITE_MATCH(c, codes, length, offset, |
1950 | 0 | c->p.n.offset_slot_full[offset]); |
1951 | 0 | } |
1952 | 0 | cur_node += length; |
1953 | 0 | } while (cur_node != end_node); |
1954 | 0 | } else |
1955 | 0 | #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ |
1956 | 0 | { |
1957 | | /* Output the literals and matches from the sequences list. */ |
1958 | 0 | const struct deflate_sequence *seq; |
1959 | |
1960 | 0 | for (seq = sequences; ; seq++) { |
1961 | 0 | u32 litrunlen = seq->litrunlen_and_length & |
1962 | 0 | SEQ_LITRUNLEN_MASK; |
1963 | 0 | u32 length = seq->litrunlen_and_length >> |
1964 | 0 | SEQ_LENGTH_SHIFT; |
1965 | 0 | unsigned lit; |
1966 | | |
1967 | | /* Output a run of literals. */ |
1968 | 0 | if (CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN)) { |
1969 | 0 | for (; litrunlen >= 4; litrunlen -= 4) { |
1970 | 0 | lit = *in_next++; |
1971 | 0 | ADD_BITS(codes->codewords.litlen[lit], |
1972 | 0 | codes->lens.litlen[lit]); |
1973 | 0 | lit = *in_next++; |
1974 | 0 | ADD_BITS(codes->codewords.litlen[lit], |
1975 | 0 | codes->lens.litlen[lit]); |
1976 | 0 | lit = *in_next++; |
1977 | 0 | ADD_BITS(codes->codewords.litlen[lit], |
1978 | 0 | codes->lens.litlen[lit]); |
1979 | 0 | lit = *in_next++; |
1980 | 0 | ADD_BITS(codes->codewords.litlen[lit], |
1981 | 0 | codes->lens.litlen[lit]); |
1982 | 0 | FLUSH_BITS(); |
1983 | 0 | } |
1984 | 0 | if (litrunlen-- != 0) { |
1985 | 0 | lit = *in_next++; |
1986 | 0 | ADD_BITS(codes->codewords.litlen[lit], |
1987 | 0 | codes->lens.litlen[lit]); |
1988 | 0 | if (litrunlen-- != 0) { |
1989 | 0 | lit = *in_next++; |
1990 | 0 | ADD_BITS(codes->codewords.litlen[lit], |
1991 | 0 | codes->lens.litlen[lit]); |
1992 | 0 | if (litrunlen-- != 0) { |
1993 | 0 | lit = *in_next++; |
1994 | 0 | ADD_BITS(codes->codewords.litlen[lit], |
1995 | 0 | codes->lens.litlen[lit]); |
1996 | 0 | } |
1997 | 0 | } |
1998 | 0 | FLUSH_BITS(); |
1999 | 0 | } |
2000 | 0 | } else { |
2001 | 0 | while (litrunlen--) { |
2002 | 0 | lit = *in_next++; |
2003 | 0 | ADD_BITS(codes->codewords.litlen[lit], |
2004 | 0 | codes->lens.litlen[lit]); |
2005 | 0 | FLUSH_BITS(); |
2006 | 0 | } |
2007 | 0 | } |
2008 | |
2009 | 0 | if (length == 0) { /* Last sequence? */ |
2010 | 0 | ASSERT(in_next == in_end); |
2011 | 0 | break; |
2012 | 0 | } |
2013 | | |
2014 | | /* Output a match. */ |
2015 | 0 | WRITE_MATCH(c, codes, length, seq->offset, |
2016 | 0 | seq->offset_slot); |
2017 | 0 | in_next += length; |
2018 | 0 | } |
2019 | 0 | } |
2020 | | |
2021 | | /* Output the end-of-block symbol. */ |
2022 | 0 | ASSERT(bitcount <= 7); |
2023 | 0 | ADD_BITS(codes->codewords.litlen[DEFLATE_END_OF_BLOCK], |
2024 | 0 | codes->lens.litlen[DEFLATE_END_OF_BLOCK]); |
2025 | 0 | FLUSH_BITS(); |
2026 | 0 | out: |
2027 | 0 | ASSERT(bitcount <= 7); |
2028 | | /* |
2029 | | * Assert that the block cost was computed correctly. This is relied on |
2030 | | * above for the bounds check on the output buffer. Also, |
2031 | | * libdeflate_deflate_compress_bound() relies on this via the assumption |
2032 | | * that uncompressed blocks will always be used when cheapest. |
2033 | | */ |
2034 | 0 | ASSERT(8 * (out_next - os->next) + bitcount - os->bitcount == best_cost); |
2035 | 0 | os->bitbuf = bitbuf; |
2036 | 0 | os->bitcount = bitcount; |
2037 | 0 | os->next = out_next; |
2038 | 0 | } |
2039 | | |
2040 | | static void |
2041 | | deflate_finish_block(struct libdeflate_compressor *c, |
2042 | | struct deflate_output_bitstream *os, |
2043 | | const u8 *block_begin, u32 block_length, |
2044 | | const struct deflate_sequence *sequences, |
2045 | | bool is_final_block) |
2046 | 0 | { |
2047 | 0 | c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; |
2048 | 0 | deflate_make_huffman_codes(&c->freqs, &c->codes); |
2049 | 0 | deflate_flush_block(c, os, block_begin, block_length, sequences, |
2050 | 0 | is_final_block); |
2051 | 0 | } |
2052 | | |
2053 | | /******************************************************************************/ |
2054 | | |
2055 | | /* |
2056 | | * Block splitting algorithm. The problem is to decide when it is worthwhile to |
2057 | | * start a new block with new Huffman codes. There is a theoretically optimal |
2058 | | * solution: recursively consider every possible block split, considering the |
2059 | | * exact cost of each block, and choose the minimum cost approach. But this is |
2060 | | * far too slow. Instead, as an approximation, we can count symbols and after |
2061 | | * every N symbols, compare the expected distribution of symbols based on the |
2062 | | * previous data with the actual distribution. If they differ "by enough", then |
2063 | | * start a new block. |
2064 | | * |
2065 | | * As an optimization and heuristic, we don't distinguish between every symbol |
2066 | | * but rather we combine many symbols into a single "observation type". For |
2067 | | * literals we only look at the high bits and low bits, and for matches we only |
2068 | | * look at whether the match is long or not. The assumption is that for typical |
2069 | | * "real" data, places that are good block boundaries will tend to be noticeable |
2070 | | * based only on changes in these aggregate probabilities, without looking for |
2071 | | * subtle differences in individual symbols. For example, a change from ASCII |
2072 | | * bytes to non-ASCII bytes, or from few matches (generally less compressible) |
2073 | | * to many matches (generally more compressible), would be easily noticed based |
2074 | | * on the aggregates. |
2075 | | * |
2076 | | * For determining whether the probability distributions are "different enough" |
2077 | | * to start a new block, the simple heuristic of splitting when the sum of |
2078 | | * absolute differences exceeds a constant seems to be good enough. We also add |
2079 | | * a number proportional to the block length so that the algorithm is more |
2080 | | * likely to end long blocks than short blocks. This reflects the general |
2081 | | * expectation that it will become increasingly beneficial to start a new block |
2082 | | * as the current block grows longer. |
2083 | | * |
2084 | | * Finally, for an approximation, it is not strictly necessary that the exact |
2085 | | * symbols being used are considered. With "near-optimal parsing", for example, |
2086 | | * the actual symbols that will be used are unknown until after the block |
2087 | | * boundary is chosen and the block has been optimized. Since the final choices |
2088 | | * cannot be used, we can use preliminary "greedy" choices instead. |
2089 | | */ |
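/*
 * Added overview sketch (commentary only) of how the helpers below are
 * used by the greedy and lazy parsers further down in this file:
 *
 *	init_block_split_stats(&c->split_stats);
 *	do {
 *		// each chosen literal or match feeds observe_literal() or
 *		// observe_match() when gather_split_stats is true
 *	} while (seq store not full &&
 *		 !should_end_block(&c->split_stats, in_block_begin,
 *				   in_next, in_end));
 *	deflate_finish_block(...);
 */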
2090 | | |
2091 | | /* Initialize the block split statistics when starting a new block. */ |
2092 | | static void |
2093 | | init_block_split_stats(struct block_split_stats *stats) |
2094 | 0 | { |
2095 | 0 | int i; |
2096 | |
2097 | 0 | for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { |
2098 | 0 | stats->new_observations[i] = 0; |
2099 | 0 | stats->observations[i] = 0; |
2100 | 0 | } |
2101 | 0 | stats->num_new_observations = 0; |
2102 | 0 | stats->num_observations = 0; |
2103 | 0 | } |
2104 | | |
2105 | | /* |
2106 | | * Literal observation. Heuristic: use the top 2 bits and the low bit of the
2107 | | * literal, for 8 possible literal observation types. |
2108 | | */ |
2109 | | static forceinline void |
2110 | | observe_literal(struct block_split_stats *stats, u8 lit) |
2111 | 0 | { |
2112 | 0 | stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++; |
2113 | 0 | stats->num_new_observations++; |
2114 | 0 | } |
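/*
 * Illustrative worked example (added commentary): the ASCII digit '0'
 * (0x30) maps to observation type ((0x30 >> 5) & 0x6) | (0x30 & 1) == 0,
 * the letter 'A' (0x41) maps to type 3, and the byte 0xFF maps to type 7,
 * so runs of digits, text, and high bytes fall into different buckets.
 */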
2115 | | |
2116 | | /* |
2117 | | * Match observation. Heuristic: use one observation type for "short match" and |
2118 | | * one observation type for "long match". |
2119 | | */ |
2120 | | static forceinline void |
2121 | | observe_match(struct block_split_stats *stats, u32 length) |
2122 | 0 | { |
2123 | 0 | stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + |
2124 | 0 | (length >= 9)]++; |
2125 | 0 | stats->num_new_observations++; |
2126 | 0 | } |
2127 | | |
2128 | | static void |
2129 | | merge_new_observations(struct block_split_stats *stats) |
2130 | 0 | { |
2131 | 0 | int i; |
2132 | |
2133 | 0 | for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { |
2134 | 0 | stats->observations[i] += stats->new_observations[i]; |
2135 | 0 | stats->new_observations[i] = 0; |
2136 | 0 | } |
2137 | 0 | stats->num_observations += stats->num_new_observations; |
2138 | 0 | stats->num_new_observations = 0; |
2139 | 0 | } |
2140 | | |
2141 | | static bool |
2142 | | do_end_block_check(struct block_split_stats *stats, u32 block_length) |
2143 | 0 | { |
2144 | 0 | if (stats->num_observations > 0) { |
2145 | | /* |
2146 | | * Compute the sum of absolute differences of probabilities. To |
2147 | | * avoid needing to use floating point arithmetic or do slow |
2148 | | * divisions, we do all arithmetic with the probabilities |
2149 | | * multiplied by num_observations * num_new_observations. E.g., |
2150 | | * for the "old" observations the probabilities would be |
2151 | | * (double)observations[i] / num_observations, but since we |
2152 | | * multiply by both num_observations and num_new_observations we |
2153 | | * really do observations[i] * num_new_observations. |
2154 | | */ |
2155 | 0 | u32 total_delta = 0; |
2156 | 0 | u32 num_items; |
2157 | 0 | u32 cutoff; |
2158 | 0 | int i; |
2159 | |
2160 | 0 | for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { |
2161 | 0 | u32 expected = stats->observations[i] * |
2162 | 0 | stats->num_new_observations; |
2163 | 0 | u32 actual = stats->new_observations[i] * |
2164 | 0 | stats->num_observations; |
2165 | 0 | u32 delta = (actual > expected) ? actual - expected : |
2166 | 0 | expected - actual; |
2167 | |
2168 | 0 | total_delta += delta; |
2169 | 0 | } |
2170 | |
2171 | 0 | num_items = stats->num_observations + |
2172 | 0 | stats->num_new_observations; |
2173 | | /* |
2174 | | * Heuristic: the cutoff is when the sum of absolute differences |
2175 | | * of probabilities becomes at least 200/512. As above, the |
2176 | | * probability is multiplied by both num_new_observations and |
2177 | | * num_observations. Be careful to avoid integer overflow. |
2178 | | */ |
2179 | 0 | cutoff = stats->num_new_observations * 200 / 512 * |
2180 | 0 | stats->num_observations; |
2181 | | /* |
2182 | | * Very short blocks have a lot of overhead for the Huffman |
2183 | | * codes, so only use them if it clearly seems worthwhile. |
2184 | | * (This is an additional penalty, which adds to the smaller |
2185 | | * penalty below which scales more slowly.) |
2186 | | */ |
2187 | 0 | if (block_length < 10000 && num_items < 8192) |
2188 | 0 | cutoff += (u64)cutoff * (8192 - num_items) / 8192; |
2189 | | |
2190 | | /* Ready to end the block? */ |
2191 | 0 | if (total_delta + |
2192 | 0 | (block_length / 4096) * stats->num_observations >= cutoff) |
2193 | 0 | return true; |
2194 | 0 | } |
2195 | 0 | merge_new_observations(stats); |
2196 | 0 | return false; |
2197 | 0 | } |
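/*
 * Illustrative worked example (added commentary): suppose the merged
 * statistics hold 512 observations (256 of type 0 and 256 of type 1), the
 * 512 new observations are 384 of type 0 and 128 of type 1, and
 * block_length == 20000.  Types 0 and 1 each contribute
 * |384*512 - 256*512| = 65536, so total_delta == 131072.  The cutoff is
 * 512 * 200 / 512 * 512 == 102400 (the extra short-block penalty does not
 * apply since block_length >= 10000), and total_delta plus the length term
 * (20000 / 4096) * 512 == 2048 exceeds it, so the block is ended.
 */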
2198 | | |
2199 | | static forceinline bool |
2200 | | ready_to_check_block(const struct block_split_stats *stats, |
2201 | | const u8 *in_block_begin, const u8 *in_next, |
2202 | | const u8 *in_end) |
2203 | 0 | { |
2204 | 0 | return stats->num_new_observations >= NUM_OBSERVATIONS_PER_BLOCK_CHECK |
2205 | 0 | && in_next - in_block_begin >= MIN_BLOCK_LENGTH |
2206 | 0 | && in_end - in_next >= MIN_BLOCK_LENGTH; |
2207 | 0 | } |
2208 | | |
2209 | | static forceinline bool |
2210 | | should_end_block(struct block_split_stats *stats, |
2211 | | const u8 *in_block_begin, const u8 *in_next, const u8 *in_end) |
2212 | 0 | { |
2213 | | /* Ready to try to end the block (again)? */ |
2214 | 0 | if (!ready_to_check_block(stats, in_block_begin, in_next, in_end)) |
2215 | 0 | return false; |
2216 | | |
2217 | 0 | return do_end_block_check(stats, in_next - in_block_begin); |
2218 | 0 | } |
2219 | | |
2220 | | /******************************************************************************/ |
2221 | | |
2222 | | static void |
2223 | | deflate_begin_sequences(struct libdeflate_compressor *c, |
2224 | | struct deflate_sequence *first_seq) |
2225 | 0 | { |
2226 | 0 | deflate_reset_symbol_frequencies(c); |
2227 | 0 | first_seq->litrunlen_and_length = 0; |
2228 | 0 | } |
2229 | | |
2230 | | static forceinline void |
2231 | | deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal, |
2232 | | bool gather_split_stats, struct deflate_sequence *seq) |
2233 | 0 | { |
2234 | 0 | c->freqs.litlen[literal]++; |
2235 | |
2236 | 0 | if (gather_split_stats) |
2237 | 0 | observe_literal(&c->split_stats, literal); |
2238 | |
2239 | 0 | STATIC_ASSERT(MAX_BLOCK_LENGTH <= SEQ_LITRUNLEN_MASK); |
2240 | 0 | seq->litrunlen_and_length++; |
2241 | 0 | } |
2242 | | |
2243 | | static forceinline void |
2244 | | deflate_choose_match(struct libdeflate_compressor *c, |
2245 | | u32 length, u32 offset, bool gather_split_stats, |
2246 | | struct deflate_sequence **seq_p) |
2247 | 0 | { |
2248 | 0 | struct deflate_sequence *seq = *seq_p; |
2249 | 0 | unsigned length_slot = deflate_length_slot[length]; |
2250 | 0 | unsigned offset_slot = deflate_get_offset_slot(offset); |
2251 | |
2252 | 0 | c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + length_slot]++; |
2253 | 0 | c->freqs.offset[offset_slot]++; |
2254 | 0 | if (gather_split_stats) |
2255 | 0 | observe_match(&c->split_stats, length); |
2256 | |
2257 | 0 | seq->litrunlen_and_length |= length << SEQ_LENGTH_SHIFT; |
2258 | 0 | seq->offset = offset; |
2259 | 0 | seq->offset_slot = offset_slot; |
2260 | |
2261 | 0 | seq++; |
2262 | 0 | seq->litrunlen_and_length = 0; |
2263 | 0 | *seq_p = seq; |
2264 | 0 | } |
2265 | | |
2266 | | /* |
2267 | | * Decrease the maximum and nice match lengths if we're approaching the end of |
2268 | | * the input buffer. |
2269 | | */ |
2270 | | static forceinline void |
2271 | | adjust_max_and_nice_len(u32 *max_len, u32 *nice_len, size_t remaining) |
2272 | 0 | { |
2273 | 0 | if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) { |
2274 | 0 | *max_len = remaining; |
2275 | 0 | *nice_len = MIN(*nice_len, *max_len); |
2276 | 0 | } |
2277 | 0 | } |
2278 | | |
2279 | | /* |
2280 | | * Choose the minimum match length for the greedy and lazy parsers. |
2281 | | * |
2282 | | * By default the minimum match length is 3, which is the smallest length the |
2283 | | * DEFLATE format allows. However, with greedy and lazy parsing, some data |
2284 | | * (e.g. DNA sequencing data) benefits greatly from a longer minimum length. |
2285 | | * Typically, this is because literals are very cheap. In general, the |
2286 | | * near-optimal parser handles this case naturally, but the greedy and lazy |
2287 | | * parsers need a heuristic to decide when to use short matches. |
2288 | | * |
2289 | | * The heuristic we use is to make the minimum match length depend on the number |
2290 | | * of different literals that exist in the data. If there are many different |
2291 | | * literals, then literals will probably be expensive, so short matches will |
2292 | | * probably be worthwhile. Conversely, if not many literals are used, then |
2293 | | * probably literals will be cheap and short matches won't be worthwhile. |
2294 | | */ |
2295 | | static u32 |
2296 | | choose_min_match_len(u32 num_used_literals, u32 max_search_depth) |
2297 | 0 | { |
2298 | | /* map from num_used_literals to min_len */ |
2299 | 0 | static const u8 min_lens[] = { |
2300 | 0 | 9, 9, 9, 9, 9, 9, 8, 8, 7, 7, 6, 6, 6, 6, 6, 6, |
2301 | 0 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, |
2302 | 0 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, |
2303 | 0 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, |
2304 | 0 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, |
2305 | | /* The rest is implicitly 3. */ |
2306 | 0 | }; |
2307 | 0 | u32 min_len; |
2308 | |
2309 | 0 | STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN <= 3); |
2310 | 0 | STATIC_ASSERT(ARRAY_LEN(min_lens) <= DEFLATE_NUM_LITERALS + 1); |
2311 | |
2312 | 0 | if (num_used_literals >= ARRAY_LEN(min_lens)) |
2313 | 0 | return 3; |
2314 | 0 | min_len = min_lens[num_used_literals]; |
2315 | | /* |
2316 | | * With a low max_search_depth, it may be too hard to find long matches. |
2317 | | */ |
2318 | 0 | if (max_search_depth < 16) { |
2319 | 0 | if (max_search_depth < 5) |
2320 | 0 | min_len = MIN(min_len, 4); |
2321 | 0 | else if (max_search_depth < 10) |
2322 | 0 | min_len = MIN(min_len, 5); |
2323 | 0 | else |
2324 | 0 | min_len = MIN(min_len, 7); |
2325 | 0 | } |
2326 | 0 | return min_len; |
2327 | 0 | } |
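/*
 * Illustrative worked example (added commentary): DNA-like data that uses
 * only 4 distinct literals gives min_lens[4] == 9, so with a deep search
 * (max_search_depth >= 16) the greedy and lazy parsers ignore matches
 * shorter than 9 bytes.  With max_search_depth == 12 the same data gets
 * min_len == MIN(9, 7) == 7, while ordinary text that uses 80 or more
 * distinct literals always gets the DEFLATE minimum of 3.
 */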
2328 | | |
2329 | | static u32 |
2330 | | calculate_min_match_len(const u8 *data, size_t data_len, u32 max_search_depth) |
2331 | 0 | { |
2332 | 0 | u8 used[256] = { 0 }; |
2333 | 0 | u32 num_used_literals = 0; |
2334 | 0 | size_t i; |
2335 | | |
2336 | | /* |
2337 | | * For very short inputs, the static Huffman code has a good chance of |
2338 | | * being best, in which case there is no reason to avoid short matches. |
2339 | | */ |
2340 | 0 | if (data_len < 512) |
2341 | 0 | return DEFLATE_MIN_MATCH_LEN; |
2342 | | |
2343 | | /* |
2344 | | * For an initial approximation, scan the first 4 KiB of data. The |
2345 | | * caller may use recalculate_min_match_len() to update min_len later. |
2346 | | */ |
2347 | 0 | data_len = MIN(data_len, 4096); |
2348 | 0 | for (i = 0; i < data_len; i++) |
2349 | 0 | used[data[i]] = 1; |
2350 | 0 | for (i = 0; i < 256; i++) |
2351 | 0 | num_used_literals += used[i]; |
2352 | 0 | return choose_min_match_len(num_used_literals, max_search_depth); |
2353 | 0 | } |
2354 | | |
2355 | | /* |
2356 | | * Recalculate the minimum match length for a block, now that we know the |
2357 | | * distribution of literals that are actually being used (freqs->litlen). |
2358 | | */ |
2359 | | static u32 |
2360 | | recalculate_min_match_len(const struct deflate_freqs *freqs, |
2361 | | u32 max_search_depth) |
2362 | 0 | { |
2363 | 0 | u32 literal_freq = 0; |
2364 | 0 | u32 cutoff; |
2365 | 0 | u32 num_used_literals = 0; |
2366 | 0 | int i; |
2367 | |
2368 | 0 | for (i = 0; i < DEFLATE_NUM_LITERALS; i++) |
2369 | 0 | literal_freq += freqs->litlen[i]; |
2370 | |
2371 | 0 | cutoff = literal_freq >> 10; /* Ignore literals used very rarely. */ |
2372 | |
2373 | 0 | for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { |
2374 | 0 | if (freqs->litlen[i] > cutoff) |
2375 | 0 | num_used_literals++; |
2376 | 0 | } |
2377 | 0 | return choose_min_match_len(num_used_literals, max_search_depth); |
2378 | 0 | } |
2379 | | |
2380 | | static forceinline const u8 * |
2381 | | choose_max_block_end(const u8 *in_block_begin, const u8 *in_end, |
2382 | | size_t soft_max_len) |
2383 | 0 | { |
2384 | 0 | if (in_end - in_block_begin < soft_max_len + MIN_BLOCK_LENGTH) |
2385 | 0 | return in_end; |
2386 | 0 | return in_block_begin + soft_max_len; |
2387 | 0 | } |
2388 | | |
2389 | | /* |
2390 | | * This is the level 0 "compressor". It always outputs uncompressed blocks. |
2391 | | */ |
2392 | | static size_t |
2393 | | deflate_compress_none(const u8 *in, size_t in_nbytes, |
2394 | | u8 *out, size_t out_nbytes_avail) |
2395 | 0 | { |
2396 | 0 | const u8 *in_next = in; |
2397 | 0 | const u8 * const in_end = in + in_nbytes; |
2398 | 0 | u8 *out_next = out; |
2399 | 0 | u8 * const out_end = out + out_nbytes_avail; |
2400 | | |
2401 | | /* |
2402 | | * If the input is zero-length, we still must output a block in order |
2403 | | * for the output to be a valid DEFLATE stream. Handle this case |
2404 | | * specially to avoid potentially passing NULL to memcpy() below. |
2405 | | */ |
2406 | 0 | if (unlikely(in_nbytes == 0)) { |
2407 | 0 | if (out_nbytes_avail < 5) |
2408 | 0 | return 0; |
2409 | | /* BFINAL and BTYPE */ |
2410 | 0 | *out_next++ = 1 | (DEFLATE_BLOCKTYPE_UNCOMPRESSED << 1); |
2411 | | /* LEN and NLEN */ |
2412 | 0 | put_unaligned_le32(0xFFFF0000, out_next); |
2413 | 0 | return 5; |
2414 | 0 | } |
2415 | | |
2416 | 0 | do { |
2417 | 0 | u8 bfinal = 0; |
2418 | 0 | size_t len = UINT16_MAX; |
2419 | |
|
2420 | 0 | if (in_end - in_next <= UINT16_MAX) { |
2421 | 0 | bfinal = 1; |
2422 | 0 | len = in_end - in_next; |
2423 | 0 | } |
2424 | 0 | if (out_end - out_next < 5 + len) |
2425 | 0 | return 0; |
2426 | | /* |
2427 | | * Output BFINAL and BTYPE. The stream is already byte-aligned |
2428 | | * here, so this step always requires outputting exactly 1 byte. |
2429 | | */ |
2430 | 0 | *out_next++ = bfinal | (DEFLATE_BLOCKTYPE_UNCOMPRESSED << 1); |
2431 | | |
2432 | | /* Output LEN and NLEN, then the data itself. */ |
2433 | 0 | put_unaligned_le16(len, out_next); |
2434 | 0 | out_next += 2; |
2435 | 0 | put_unaligned_le16(~len, out_next); |
2436 | 0 | out_next += 2; |
2437 | 0 | memcpy(out_next, in_next, len); |
2438 | 0 | out_next += len; |
2439 | 0 | in_next += len; |
2440 | 0 | } while (in_next != in_end); |
2441 | | |
2442 | 0 | return out_next - out; |
2443 | 0 | } |
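/*
 * Added clarification: for empty input this emits the 5-byte stream
 * 01 00 00 ff ff (BFINAL=1, BTYPE=00, LEN=0x0000, NLEN=0xFFFF).  More
 * generally, level 0 output costs 5 bytes of overhead per 65535-byte
 * chunk of input, plus the input bytes themselves.
 */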
2444 | | |
2445 | | /* |
2446 | | * This is a faster variant of deflate_compress_greedy(). It uses the |
2447 | | * ht_matchfinder rather than the hc_matchfinder. It also skips the block |
2448 | | * splitting algorithm and just uses fixed length blocks. c->max_search_depth |
2449 | | * has no effect with this algorithm, as it is hardcoded in ht_matchfinder.h. |
2450 | | */ |
2451 | | static void |
2452 | | deflate_compress_fastest(struct libdeflate_compressor * restrict c, |
2453 | | const u8 *in, size_t in_nbytes, |
2454 | | struct deflate_output_bitstream *os) |
2455 | 0 | { |
2456 | 0 | const u8 *in_next = in; |
2457 | 0 | const u8 *in_end = in_next + in_nbytes; |
2458 | 0 | const u8 *in_cur_base = in_next; |
2459 | 0 | u32 max_len = DEFLATE_MAX_MATCH_LEN; |
2460 | 0 | u32 nice_len = MIN(c->nice_match_length, max_len); |
2461 | 0 | u32 next_hash = 0; |
2462 | |
2463 | 0 | ht_matchfinder_init(&c->p.f.ht_mf); |
2464 | |
2465 | 0 | do { |
2466 | | /* Starting a new DEFLATE block */ |
2467 | |
2468 | 0 | const u8 * const in_block_begin = in_next; |
2469 | 0 | const u8 * const in_max_block_end = choose_max_block_end( |
2470 | 0 | in_next, in_end, FAST_SOFT_MAX_BLOCK_LENGTH); |
2471 | 0 | struct deflate_sequence *seq = c->p.f.sequences; |
2472 | |
2473 | 0 | deflate_begin_sequences(c, seq); |
2474 | |
2475 | 0 | do { |
2476 | 0 | u32 length; |
2477 | 0 | u32 offset; |
2478 | 0 | size_t remaining = in_end - in_next; |
2479 | |
2480 | 0 | if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) { |
2481 | 0 | max_len = remaining; |
2482 | 0 | if (max_len < HT_MATCHFINDER_REQUIRED_NBYTES) { |
2483 | 0 | do { |
2484 | 0 | deflate_choose_literal(c, |
2485 | 0 | *in_next++, false, seq); |
2486 | 0 | } while (--max_len); |
2487 | 0 | break; |
2488 | 0 | } |
2489 | 0 | nice_len = MIN(nice_len, max_len); |
2490 | 0 | } |
2491 | 0 | length = ht_matchfinder_longest_match(&c->p.f.ht_mf, |
2492 | 0 | &in_cur_base, |
2493 | 0 | in_next, |
2494 | 0 | max_len, |
2495 | 0 | nice_len, |
2496 | 0 | &next_hash, |
2497 | 0 | &offset); |
2498 | 0 | if (length) { |
2499 | | /* Match found */ |
2500 | 0 | deflate_choose_match(c, length, offset, false, |
2501 | 0 | &seq); |
2502 | 0 | ht_matchfinder_skip_bytes(&c->p.f.ht_mf, |
2503 | 0 | &in_cur_base, |
2504 | 0 | in_next + 1, |
2505 | 0 | in_end, |
2506 | 0 | length - 1, |
2507 | 0 | &next_hash); |
2508 | 0 | in_next += length; |
2509 | 0 | } else { |
2510 | | /* No match found */ |
2511 | 0 | deflate_choose_literal(c, *in_next++, false, |
2512 | 0 | seq); |
2513 | 0 | } |
2514 | | |
2515 | | /* Check if it's time to output another block. */ |
2516 | 0 | } while (in_next < in_max_block_end && |
2517 | 0 | seq < &c->p.f.sequences[FAST_SEQ_STORE_LENGTH]); |
2518 | | |
2519 | 0 | deflate_finish_block(c, os, in_block_begin, |
2520 | 0 | in_next - in_block_begin, |
2521 | 0 | c->p.f.sequences, in_next == in_end); |
2522 | 0 | } while (in_next != in_end && !os->overflow); |
2523 | 0 | } |
2524 | | |
2525 | | /* |
2526 | | * This is the "greedy" DEFLATE compressor. It always chooses the longest match. |
2527 | | */ |
2528 | | static void |
2529 | | deflate_compress_greedy(struct libdeflate_compressor * restrict c, |
2530 | | const u8 *in, size_t in_nbytes, |
2531 | | struct deflate_output_bitstream *os) |
2532 | 0 | { |
2533 | 0 | const u8 *in_next = in; |
2534 | 0 | const u8 *in_end = in_next + in_nbytes; |
2535 | 0 | const u8 *in_cur_base = in_next; |
2536 | 0 | u32 max_len = DEFLATE_MAX_MATCH_LEN; |
2537 | 0 | u32 nice_len = MIN(c->nice_match_length, max_len); |
2538 | 0 | u32 next_hashes[2] = {0, 0}; |
2539 | |
2540 | 0 | hc_matchfinder_init(&c->p.g.hc_mf); |
2541 | |
2542 | 0 | do { |
2543 | | /* Starting a new DEFLATE block */ |
2544 | |
2545 | 0 | const u8 * const in_block_begin = in_next; |
2546 | 0 | const u8 * const in_max_block_end = choose_max_block_end( |
2547 | 0 | in_next, in_end, SOFT_MAX_BLOCK_LENGTH); |
2548 | 0 | struct deflate_sequence *seq = c->p.g.sequences; |
2549 | 0 | u32 min_len; |
2550 | |
2551 | 0 | init_block_split_stats(&c->split_stats); |
2552 | 0 | deflate_begin_sequences(c, seq); |
2553 | 0 | min_len = calculate_min_match_len(in_next, |
2554 | 0 | in_max_block_end - in_next, |
2555 | 0 | c->max_search_depth); |
2556 | 0 | do { |
2557 | 0 | u32 length; |
2558 | 0 | u32 offset; |
2559 | |
|
2560 | 0 | adjust_max_and_nice_len(&max_len, &nice_len, |
2561 | 0 | in_end - in_next); |
2562 | 0 | length = hc_matchfinder_longest_match( |
2563 | 0 | &c->p.g.hc_mf, |
2564 | 0 | &in_cur_base, |
2565 | 0 | in_next, |
2566 | 0 | min_len - 1, |
2567 | 0 | max_len, |
2568 | 0 | nice_len, |
2569 | 0 | c->max_search_depth, |
2570 | 0 | next_hashes, |
2571 | 0 | &offset); |
2572 | |
2573 | 0 | if (length >= min_len && |
2574 | 0 | (length > DEFLATE_MIN_MATCH_LEN || |
2575 | 0 | offset <= 4096)) { |
2576 | | /* Match found */ |
2577 | 0 | deflate_choose_match(c, length, offset, true, |
2578 | 0 | &seq); |
2579 | 0 | hc_matchfinder_skip_bytes(&c->p.g.hc_mf, |
2580 | 0 | &in_cur_base, |
2581 | 0 | in_next + 1, |
2582 | 0 | in_end, |
2583 | 0 | length - 1, |
2584 | 0 | next_hashes); |
2585 | 0 | in_next += length; |
2586 | 0 | } else { |
2587 | | /* No match found */ |
2588 | 0 | deflate_choose_literal(c, *in_next++, true, |
2589 | 0 | seq); |
2590 | 0 | } |
2591 | | |
2592 | | /* Check if it's time to output another block. */ |
2593 | 0 | } while (in_next < in_max_block_end && |
2594 | 0 | seq < &c->p.g.sequences[SEQ_STORE_LENGTH] && |
2595 | 0 | !should_end_block(&c->split_stats, |
2596 | 0 | in_block_begin, in_next, in_end)); |
2597 | |
2598 | 0 | deflate_finish_block(c, os, in_block_begin, |
2599 | 0 | in_next - in_block_begin, |
2600 | 0 | c->p.g.sequences, in_next == in_end); |
2601 | 0 | } while (in_next != in_end && !os->overflow); |
2602 | 0 | } |
2603 | | |
2604 | | static forceinline void |
2605 | | deflate_compress_lazy_generic(struct libdeflate_compressor * restrict c, |
2606 | | const u8 *in, size_t in_nbytes, |
2607 | | struct deflate_output_bitstream *os, bool lazy2) |
2608 | 0 | { |
2609 | 0 | const u8 *in_next = in; |
2610 | 0 | const u8 *in_end = in_next + in_nbytes; |
2611 | 0 | const u8 *in_cur_base = in_next; |
2612 | 0 | u32 max_len = DEFLATE_MAX_MATCH_LEN; |
2613 | 0 | u32 nice_len = MIN(c->nice_match_length, max_len); |
2614 | 0 | u32 next_hashes[2] = {0, 0}; |
2615 | |
|
2616 | 0 | hc_matchfinder_init(&c->p.g.hc_mf); |
2617 | |
|
2618 | 0 | do { |
2619 | | /* Starting a new DEFLATE block */ |
2620 | |
|
2621 | 0 | const u8 * const in_block_begin = in_next; |
2622 | 0 | const u8 * const in_max_block_end = choose_max_block_end( |
2623 | 0 | in_next, in_end, SOFT_MAX_BLOCK_LENGTH); |
2624 | 0 | const u8 *next_recalc_min_len = |
2625 | 0 | in_next + MIN(in_end - in_next, 10000); |
2626 | 0 | struct deflate_sequence *seq = c->p.g.sequences; |
2627 | 0 | u32 min_len; |
2628 | |
|
2629 | 0 | init_block_split_stats(&c->split_stats); |
2630 | 0 | deflate_begin_sequences(c, seq); |
2631 | 0 | min_len = calculate_min_match_len(in_next, |
2632 | 0 | in_max_block_end - in_next, |
2633 | 0 | c->max_search_depth); |
2634 | 0 | do { |
2635 | 0 | u32 cur_len; |
2636 | 0 | u32 cur_offset; |
2637 | 0 | u32 next_len; |
2638 | 0 | u32 next_offset; |
2639 | | |
2640 | | /* |
2641 | | * Recalculate the minimum match length if it hasn't |
2642 | | * been done recently. |
2643 | | */ |
2644 | 0 | if (in_next >= next_recalc_min_len) { |
2645 | 0 | min_len = recalculate_min_match_len( |
2646 | 0 | &c->freqs, |
2647 | 0 | c->max_search_depth); |
2648 | 0 | next_recalc_min_len += |
2649 | 0 | MIN(in_end - next_recalc_min_len, |
2650 | 0 | in_next - in_block_begin); |
2651 | 0 | } |
2652 | | |
2653 | | /* Find the longest match at the current position. */ |
2654 | 0 | adjust_max_and_nice_len(&max_len, &nice_len, |
2655 | 0 | in_end - in_next); |
2656 | 0 | cur_len = hc_matchfinder_longest_match( |
2657 | 0 | &c->p.g.hc_mf, |
2658 | 0 | &in_cur_base, |
2659 | 0 | in_next, |
2660 | 0 | min_len - 1, |
2661 | 0 | max_len, |
2662 | 0 | nice_len, |
2663 | 0 | c->max_search_depth, |
2664 | 0 | next_hashes, |
2665 | 0 | &cur_offset); |
2666 | 0 | if (cur_len < min_len || |
2667 | 0 | (cur_len == DEFLATE_MIN_MATCH_LEN && |
2668 | 0 | cur_offset > 8192)) { |
2669 | | /* No match found. Choose a literal. */ |
2670 | 0 | deflate_choose_literal(c, *in_next++, true, |
2671 | 0 | seq); |
2672 | 0 | continue; |
2673 | 0 | } |
2674 | 0 | in_next++; |
2675 | |
|
2676 | 0 | have_cur_match: |
2677 | | /* |
2678 | | * We have a match at the current position. |
2679 | | * If it's very long, choose it immediately. |
2680 | | */ |
2681 | 0 | if (cur_len >= nice_len) { |
2682 | 0 | deflate_choose_match(c, cur_len, cur_offset, |
2683 | 0 | true, &seq); |
2684 | 0 | hc_matchfinder_skip_bytes(&c->p.g.hc_mf, |
2685 | 0 | &in_cur_base, |
2686 | 0 | in_next, |
2687 | 0 | in_end, |
2688 | 0 | cur_len - 1, |
2689 | 0 | next_hashes); |
2690 | 0 | in_next += cur_len - 1; |
2691 | 0 | continue; |
2692 | 0 | } |
2693 | | |
2694 | | /* |
2695 | | * Try to find a better match at the next position. |
2696 | | * |
2697 | | * Note: since we already have a match at the *current* |
2698 | | * position, we use only half the 'max_search_depth' |
2699 | | * when checking the *next* position. This is a useful |
2700 | | * trade-off because it's more worthwhile to use a |
2701 | | * greater search depth on the initial match. |
2702 | | * |
2703 | | * Note: it's possible to structure the code such that |
2704 | | * there's only one call to longest_match(), which |
2705 | | * handles both the "find the initial match" and "try to |
2706 | | * find a better match" cases. However, it is faster to |
2707 | | * have two call sites, with longest_match() inlined at |
2708 | | * each. |
2709 | | */ |
2710 | 0 | adjust_max_and_nice_len(&max_len, &nice_len, |
2711 | 0 | in_end - in_next); |
2712 | 0 | next_len = hc_matchfinder_longest_match( |
2713 | 0 | &c->p.g.hc_mf, |
2714 | 0 | &in_cur_base, |
2715 | 0 | in_next++, |
2716 | 0 | cur_len - 1, |
2717 | 0 | max_len, |
2718 | 0 | nice_len, |
2719 | 0 | c->max_search_depth >> 1, |
2720 | 0 | next_hashes, |
2721 | 0 | &next_offset); |
2722 | 0 | if (next_len >= cur_len && |
2723 | 0 | 4 * (int)(next_len - cur_len) + |
2724 | 0 | ((int)bsr32(cur_offset) - |
2725 | 0 | (int)bsr32(next_offset)) > 2) { |
2726 | | /* |
2727 | | * Found a better match at the next position. |
2728 | | * Output a literal. Then the next match |
2729 | | * becomes the current match. |
2730 | | */ |
2731 | 0 | deflate_choose_literal(c, *(in_next - 2), true, |
2732 | 0 | seq); |
2733 | 0 | cur_len = next_len; |
2734 | 0 | cur_offset = next_offset; |
2735 | 0 | goto have_cur_match; |
2736 | 0 | } |
2737 | | |
2738 | 0 | if (lazy2) { |
2739 | | /* In lazy2 mode, look ahead another position */ |
2740 | 0 | adjust_max_and_nice_len(&max_len, &nice_len, |
2741 | 0 | in_end - in_next); |
2742 | 0 | next_len = hc_matchfinder_longest_match( |
2743 | 0 | &c->p.g.hc_mf, |
2744 | 0 | &in_cur_base, |
2745 | 0 | in_next++, |
2746 | 0 | cur_len - 1, |
2747 | 0 | max_len, |
2748 | 0 | nice_len, |
2749 | 0 | c->max_search_depth >> 2, |
2750 | 0 | next_hashes, |
2751 | 0 | &next_offset); |
2752 | 0 | if (next_len >= cur_len && |
2753 | 0 | 4 * (int)(next_len - cur_len) + |
2754 | 0 | ((int)bsr32(cur_offset) - |
2755 | 0 | (int)bsr32(next_offset)) > 6) { |
2756 | | /* |
2757 | | * There's a much better match two |
2758 | | * positions ahead, so use two literals. |
2759 | | */ |
2760 | 0 | deflate_choose_literal( |
2761 | 0 | c, *(in_next - 3), true, seq); |
2762 | 0 | deflate_choose_literal( |
2763 | 0 | c, *(in_next - 2), true, seq); |
2764 | 0 | cur_len = next_len; |
2765 | 0 | cur_offset = next_offset; |
2766 | 0 | goto have_cur_match; |
2767 | 0 | } |
2768 | | /* |
2769 | | * No better match at either of the next 2 |
2770 | | * positions. Output the current match. |
2771 | | */ |
2772 | 0 | deflate_choose_match(c, cur_len, cur_offset, |
2773 | 0 | true, &seq); |
2774 | 0 | if (cur_len > 3) { |
2775 | 0 | hc_matchfinder_skip_bytes(&c->p.g.hc_mf, |
2776 | 0 | &in_cur_base, |
2777 | 0 | in_next, |
2778 | 0 | in_end, |
2779 | 0 | cur_len - 3, |
2780 | 0 | next_hashes); |
2781 | 0 | in_next += cur_len - 3; |
2782 | 0 | } |
2783 | 0 | } else { /* !lazy2 */ |
2784 | | /* |
2785 | | * No better match at the next position. Output |
2786 | | * the current match. |
2787 | | */ |
2788 | 0 | deflate_choose_match(c, cur_len, cur_offset, |
2789 | 0 | true, &seq); |
2790 | 0 | hc_matchfinder_skip_bytes(&c->p.g.hc_mf, |
2791 | 0 | &in_cur_base, |
2792 | 0 | in_next, |
2793 | 0 | in_end, |
2794 | 0 | cur_len - 2, |
2795 | 0 | next_hashes); |
2796 | 0 | in_next += cur_len - 2; |
2797 | 0 | } |
2798 | | /* Check if it's time to output another block. */ |
2799 | 0 | } while (in_next < in_max_block_end && |
2800 | 0 | seq < &c->p.g.sequences[SEQ_STORE_LENGTH] && |
2801 | 0 | !should_end_block(&c->split_stats, |
2802 | 0 | in_block_begin, in_next, in_end)); |
2803 | | |
2804 | 0 | deflate_finish_block(c, os, in_block_begin, |
2805 | 0 | in_next - in_block_begin, |
2806 | 0 | c->p.g.sequences, in_next == in_end); |
2807 | 0 | } while (in_next != in_end && !os->overflow); |
2808 | 0 | } |
2809 | | |
2810 | | /* |
2811 | | * This is the "lazy" DEFLATE compressor. Before choosing a match, it checks to |
2812 | | * see if there's a better match at the next position. If yes, it outputs a |
2813 | | * literal and continues to the next position. If no, it outputs the match. |
2814 | | */ |
2815 | | static void |
2816 | | deflate_compress_lazy(struct libdeflate_compressor * restrict c, |
2817 | | const u8 *in, size_t in_nbytes, |
2818 | | struct deflate_output_bitstream *os) |
2819 | 0 | { |
2820 | 0 | deflate_compress_lazy_generic(c, in, in_nbytes, os, false); |
2821 | 0 | } |
2822 | | |
2823 | | /* |
2824 | | * The lazy2 compressor. This is similar to the regular lazy compressor, but it
2825 | | * looks for a better match at the next 2 positions rather than just the next 1.
2826 | | * This makes it slightly slower, but it compresses some inputs slightly better.
2827 | | */ |
2828 | | static void |
2829 | | deflate_compress_lazy2(struct libdeflate_compressor * restrict c, |
2830 | | const u8 *in, size_t in_nbytes, |
2831 | | struct deflate_output_bitstream *os) |
2832 | 0 | { |
2833 | 0 | deflate_compress_lazy_generic(c, in, in_nbytes, os, true); |
2834 | 0 | } |
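 | | /*
 | | * Worked example of the match-choice test used by the lazy compressors
 | | * above (hypothetical numbers, for illustration only): suppose
 | | * cur_len = 5 with cur_offset = 2048 (bsr32 = 11), and the next position
 | | * offers next_len = 5 with next_offset = 64 (bsr32 = 6). The score
 | | * 4*(5 - 5) + (11 - 6) = 5 exceeds the threshold of 2, so one literal is
 | | * output and the later, closer match becomes the current match. In
 | | * lazy2's second lookahead the threshold is 6, so the same score would
 | | * not justify outputting two literals.
 | | */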
2835 | | |
2836 | | #if SUPPORT_NEAR_OPTIMAL_PARSING |
2837 | | |
2838 | | /* |
2839 | | * Follow the minimum-cost path in the graph of possible match/literal choices |
2840 | | * for the current block and compute the frequencies of the Huffman symbols that |
2841 | | * would be needed to output those matches and literals. |
2842 | | */ |
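 | | /*
 | | * Note on the item encoding consumed here: each optimum node's 'item'
 | | * packs the chosen length in its low bits (1 meaning a literal) and,
 | | * above OPTIMUM_OFFSET_SHIFT, either the match offset or, for a literal,
 | | * the literal byte value itself. That is why a length of 1 below indexes
 | | * c->freqs.litlen[] directly with 'offset'.
 | | */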
2843 | | static void |
2844 | | deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length) |
2845 | 0 | { |
2846 | 0 | struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0]; |
2847 | 0 | struct deflate_optimum_node *end_node = |
2848 | 0 | &c->p.n.optimum_nodes[block_length]; |
2849 | |
|
2850 | 0 | do { |
2851 | 0 | u32 length = cur_node->item & OPTIMUM_LEN_MASK; |
2852 | 0 | u32 offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT; |
2853 | |
|
2854 | 0 | if (length == 1) { |
2855 | | /* Literal */ |
2856 | 0 | c->freqs.litlen[offset]++; |
2857 | 0 | } else { |
2858 | | /* Match */ |
2859 | 0 | c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + |
2860 | 0 | deflate_length_slot[length]]++; |
2861 | 0 | c->freqs.offset[c->p.n.offset_slot_full[offset]]++; |
2862 | 0 | } |
2863 | 0 | cur_node += length; |
2864 | 0 | } while (cur_node != end_node); |
2865 | | |
2866 | | /* Tally the end-of-block symbol. */ |
2867 | 0 | c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; |
2868 | 0 | } |
2869 | | |
2870 | | static void |
2871 | | deflate_choose_all_literals(struct libdeflate_compressor *c, |
2872 | | const u8 *block, u32 block_length) |
2873 | 0 | { |
2874 | 0 | u32 i; |
2875 | |
|
2876 | 0 | deflate_reset_symbol_frequencies(c); |
2877 | 0 | for (i = 0; i < block_length; i++) |
2878 | 0 | c->freqs.litlen[block[i]]++; |
2879 | 0 | c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; |
2880 | |
|
2881 | 0 | deflate_make_huffman_codes(&c->freqs, &c->codes); |
2882 | 0 | } |
2883 | | |
2884 | | /* |
2885 | | * Compute the exact cost, in bits, that would be required to output the matches |
2886 | | * and literals described by @c->freqs as a dynamic Huffman block. The litlen |
2887 | | * and offset codes are assumed to have already been built in @c->codes. |
2888 | | */ |
2889 | | static u32 |
2890 | | deflate_compute_true_cost(struct libdeflate_compressor *c) |
2891 | 0 | { |
2892 | 0 | u32 cost = 0; |
2893 | 0 | unsigned sym; |
2894 | |
|
2895 | 0 | deflate_precompute_huffman_header(c); |
2896 | |
|
2897 | 0 | memset(&c->codes.lens.litlen[c->o.precode.num_litlen_syms], 0, |
2898 | 0 | DEFLATE_NUM_LITLEN_SYMS - c->o.precode.num_litlen_syms); |
2899 | |
|
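 | | /*
 | | * The constants below correspond to the DEFLATE dynamic-block header:
 | | * 5 bits for HLIT, 5 bits for HDIST, 4 bits for HCLEN, plus 3 bits for
 | | * each explicitly transmitted precode codeword length.
 | | */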
2900 | 0 | cost += 5 + 5 + 4 + (3 * c->o.precode.num_explicit_lens); |
2901 | 0 | for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) { |
2902 | 0 | cost += c->o.precode.freqs[sym] * |
2903 | 0 | (c->o.precode.lens[sym] + |
2904 | 0 | deflate_extra_precode_bits[sym]); |
2905 | 0 | } |
2906 | |
|
2907 | 0 | for (sym = 0; sym < DEFLATE_FIRST_LEN_SYM; sym++) |
2908 | 0 | cost += c->freqs.litlen[sym] * c->codes.lens.litlen[sym]; |
2909 | |
|
2910 | 0 | for (; sym < DEFLATE_FIRST_LEN_SYM + |
2911 | 0 | ARRAY_LEN(deflate_extra_length_bits); sym++) |
2912 | 0 | cost += c->freqs.litlen[sym] * |
2913 | 0 | (c->codes.lens.litlen[sym] + |
2914 | 0 | deflate_extra_length_bits[sym - DEFLATE_FIRST_LEN_SYM]); |
2915 | |
|
2916 | 0 | for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) |
2917 | 0 | cost += c->freqs.offset[sym] * |
2918 | 0 | (c->codes.lens.offset[sym] + |
2919 | 0 | deflate_extra_offset_bits[sym]); |
2920 | 0 | return cost; |
2921 | 0 | } |
2922 | | |
2923 | | /* Set the current cost model from the codeword lengths specified in @lens. */ |
2924 | | static void |
2925 | | deflate_set_costs_from_codes(struct libdeflate_compressor *c, |
2926 | | const struct deflate_lens *lens) |
2927 | 0 | { |
2928 | 0 | unsigned i; |
2929 | | |
2930 | | /* Literals */ |
2931 | 0 | for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { |
2932 | 0 | u32 bits = (lens->litlen[i] ? |
2933 | 0 | lens->litlen[i] : LITERAL_NOSTAT_BITS); |
2934 | |
|
2935 | 0 | c->p.n.costs.literal[i] = bits * BIT_COST; |
2936 | 0 | } |
2937 | | |
2938 | | /* Lengths */ |
2939 | 0 | for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) { |
2940 | 0 | unsigned length_slot = deflate_length_slot[i]; |
2941 | 0 | unsigned litlen_sym = DEFLATE_FIRST_LEN_SYM + length_slot; |
2942 | 0 | u32 bits = (lens->litlen[litlen_sym] ? |
2943 | 0 | lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS); |
2944 | |
|
2945 | 0 | bits += deflate_extra_length_bits[length_slot]; |
2946 | 0 | c->p.n.costs.length[i] = bits * BIT_COST; |
2947 | 0 | } |
2948 | | |
2949 | | /* Offset slots */ |
2950 | 0 | for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) { |
2951 | 0 | u32 bits = (lens->offset[i] ? |
2952 | 0 | lens->offset[i] : OFFSET_NOSTAT_BITS); |
2953 | |
|
2954 | 0 | bits += deflate_extra_offset_bits[i]; |
2955 | 0 | c->p.n.costs.offset_slot[i] = bits * BIT_COST; |
2956 | 0 | } |
2957 | 0 | } |
2958 | | |
2959 | | /* |
2960 | | * This lookup table gives the default cost of a literal symbol and of a length |
2961 | | * symbol, depending on the characteristics of the input data. It was generated |
2962 | | * by scripts/gen_default_litlen_costs.py. |
2963 | | * |
2964 | | * This table is indexed first by the estimated match probability: |
2965 | | * |
2966 | | * i=0: data doesn't contain many matches [match_prob=0.25] |
2967 | | * i=1: neutral [match_prob=0.50] |
2968 | | * i=2: data contains lots of matches [match_prob=0.75] |
2969 | | * |
2970 | | * This lookup produces a subtable which maps the number of distinct used |
2971 | | * literals to the default cost of a literal symbol, i.e.: |
2972 | | * |
2973 | | * int(-log2((1 - match_prob) / num_used_literals) * BIT_COST) |
2974 | | * |
2975 | | * ... for num_used_literals in [1, 256] (and 0, which is copied from 1). This |
2976 | | * accounts for literals usually getting cheaper as the number of distinct |
2977 | | * literals decreases, and as the proportion of literals to matches increases. |
2978 | | * |
2979 | | * The lookup also produces the cost of a length symbol, which is: |
2980 | | * |
2981 | | * int(-log2(match_prob/NUM_LEN_SLOTS) * BIT_COST) |
2982 | | * |
2983 | | * Note: we don't currently assign different costs to different literal symbols, |
2984 | | * or to different length symbols, as this is hard to do in a useful way. |
2985 | | */ |
2986 | | static const struct { |
2987 | | u8 used_lits_to_lit_cost[257]; |
2988 | | u8 len_sym_cost; |
2989 | | } default_litlen_costs[] = { |
2990 | | { /* match_prob = 0.25 */ |
2991 | | .used_lits_to_lit_cost = { |
2992 | | 6, 6, 22, 32, 38, 43, 48, 51, |
2993 | | 54, 57, 59, 61, 64, 65, 67, 69, |
2994 | | 70, 72, 73, 74, 75, 76, 77, 79, |
2995 | | 80, 80, 81, 82, 83, 84, 85, 85, |
2996 | | 86, 87, 88, 88, 89, 89, 90, 91, |
2997 | | 91, 92, 92, 93, 93, 94, 95, 95, |
2998 | | 96, 96, 96, 97, 97, 98, 98, 99, |
2999 | | 99, 99, 100, 100, 101, 101, 101, 102, |
3000 | | 102, 102, 103, 103, 104, 104, 104, 105, |
3001 | | 105, 105, 105, 106, 106, 106, 107, 107, |
3002 | | 107, 108, 108, 108, 108, 109, 109, 109, |
3003 | | 109, 110, 110, 110, 111, 111, 111, 111, |
3004 | | 112, 112, 112, 112, 112, 113, 113, 113, |
3005 | | 113, 114, 114, 114, 114, 114, 115, 115, |
3006 | | 115, 115, 115, 116, 116, 116, 116, 116, |
3007 | | 117, 117, 117, 117, 117, 118, 118, 118, |
3008 | | 118, 118, 118, 119, 119, 119, 119, 119, |
3009 | | 120, 120, 120, 120, 120, 120, 121, 121, |
3010 | | 121, 121, 121, 121, 121, 122, 122, 122, |
3011 | | 122, 122, 122, 123, 123, 123, 123, 123, |
3012 | | 123, 123, 124, 124, 124, 124, 124, 124, |
3013 | | 124, 125, 125, 125, 125, 125, 125, 125, |
3014 | | 125, 126, 126, 126, 126, 126, 126, 126, |
3015 | | 127, 127, 127, 127, 127, 127, 127, 127, |
3016 | | 128, 128, 128, 128, 128, 128, 128, 128, |
3017 | | 128, 129, 129, 129, 129, 129, 129, 129, |
3018 | | 129, 129, 130, 130, 130, 130, 130, 130, |
3019 | | 130, 130, 130, 131, 131, 131, 131, 131, |
3020 | | 131, 131, 131, 131, 131, 132, 132, 132, |
3021 | | 132, 132, 132, 132, 132, 132, 132, 133, |
3022 | | 133, 133, 133, 133, 133, 133, 133, 133, |
3023 | | 133, 134, 134, 134, 134, 134, 134, 134, |
3024 | | 134, |
3025 | | }, |
3026 | | .len_sym_cost = 109, |
3027 | | }, { /* match_prob = 0.5 */ |
3028 | | .used_lits_to_lit_cost = { |
3029 | | 16, 16, 32, 41, 48, 53, 57, 60, |
3030 | | 64, 66, 69, 71, 73, 75, 76, 78, |
3031 | | 80, 81, 82, 83, 85, 86, 87, 88, |
3032 | | 89, 90, 91, 92, 92, 93, 94, 95, |
3033 | | 96, 96, 97, 98, 98, 99, 99, 100, |
3034 | | 101, 101, 102, 102, 103, 103, 104, 104, |
3035 | | 105, 105, 106, 106, 107, 107, 108, 108, |
3036 | | 108, 109, 109, 110, 110, 110, 111, 111, |
3037 | | 112, 112, 112, 113, 113, 113, 114, 114, |
3038 | | 114, 115, 115, 115, 115, 116, 116, 116, |
3039 | | 117, 117, 117, 118, 118, 118, 118, 119, |
3040 | | 119, 119, 119, 120, 120, 120, 120, 121, |
3041 | | 121, 121, 121, 122, 122, 122, 122, 122, |
3042 | | 123, 123, 123, 123, 124, 124, 124, 124, |
3043 | | 124, 125, 125, 125, 125, 125, 126, 126, |
3044 | | 126, 126, 126, 127, 127, 127, 127, 127, |
3045 | | 128, 128, 128, 128, 128, 128, 129, 129, |
3046 | | 129, 129, 129, 129, 130, 130, 130, 130, |
3047 | | 130, 130, 131, 131, 131, 131, 131, 131, |
3048 | | 131, 132, 132, 132, 132, 132, 132, 133, |
3049 | | 133, 133, 133, 133, 133, 133, 134, 134, |
3050 | | 134, 134, 134, 134, 134, 134, 135, 135, |
3051 | | 135, 135, 135, 135, 135, 135, 136, 136, |
3052 | | 136, 136, 136, 136, 136, 136, 137, 137, |
3053 | | 137, 137, 137, 137, 137, 137, 138, 138, |
3054 | | 138, 138, 138, 138, 138, 138, 138, 139, |
3055 | | 139, 139, 139, 139, 139, 139, 139, 139, |
3056 | | 140, 140, 140, 140, 140, 140, 140, 140, |
3057 | | 140, 141, 141, 141, 141, 141, 141, 141, |
3058 | | 141, 141, 141, 142, 142, 142, 142, 142, |
3059 | | 142, 142, 142, 142, 142, 142, 143, 143, |
3060 | | 143, 143, 143, 143, 143, 143, 143, 143, |
3061 | | 144, |
3062 | | }, |
3063 | | .len_sym_cost = 93, |
3064 | | }, { /* match_prob = 0.75 */ |
3065 | | .used_lits_to_lit_cost = { |
3066 | | 32, 32, 48, 57, 64, 69, 73, 76, |
3067 | | 80, 82, 85, 87, 89, 91, 92, 94, |
3068 | | 96, 97, 98, 99, 101, 102, 103, 104, |
3069 | | 105, 106, 107, 108, 108, 109, 110, 111, |
3070 | | 112, 112, 113, 114, 114, 115, 115, 116, |
3071 | | 117, 117, 118, 118, 119, 119, 120, 120, |
3072 | | 121, 121, 122, 122, 123, 123, 124, 124, |
3073 | | 124, 125, 125, 126, 126, 126, 127, 127, |
3074 | | 128, 128, 128, 129, 129, 129, 130, 130, |
3075 | | 130, 131, 131, 131, 131, 132, 132, 132, |
3076 | | 133, 133, 133, 134, 134, 134, 134, 135, |
3077 | | 135, 135, 135, 136, 136, 136, 136, 137, |
3078 | | 137, 137, 137, 138, 138, 138, 138, 138, |
3079 | | 139, 139, 139, 139, 140, 140, 140, 140, |
3080 | | 140, 141, 141, 141, 141, 141, 142, 142, |
3081 | | 142, 142, 142, 143, 143, 143, 143, 143, |
3082 | | 144, 144, 144, 144, 144, 144, 145, 145, |
3083 | | 145, 145, 145, 145, 146, 146, 146, 146, |
3084 | | 146, 146, 147, 147, 147, 147, 147, 147, |
3085 | | 147, 148, 148, 148, 148, 148, 148, 149, |
3086 | | 149, 149, 149, 149, 149, 149, 150, 150, |
3087 | | 150, 150, 150, 150, 150, 150, 151, 151, |
3088 | | 151, 151, 151, 151, 151, 151, 152, 152, |
3089 | | 152, 152, 152, 152, 152, 152, 153, 153, |
3090 | | 153, 153, 153, 153, 153, 153, 154, 154, |
3091 | | 154, 154, 154, 154, 154, 154, 154, 155, |
3092 | | 155, 155, 155, 155, 155, 155, 155, 155, |
3093 | | 156, 156, 156, 156, 156, 156, 156, 156, |
3094 | | 156, 157, 157, 157, 157, 157, 157, 157, |
3095 | | 157, 157, 157, 158, 158, 158, 158, 158, |
3096 | | 158, 158, 158, 158, 158, 158, 159, 159, |
3097 | | 159, 159, 159, 159, 159, 159, 159, 159, |
3098 | | 160, |
3099 | | }, |
3100 | | .len_sym_cost = 84, |
3101 | | }, |
3102 | | }; |
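 | | /*
 | | * Worked example of the formulas above (hypothetical inputs; BIT_COST is
 | | * 16, as asserted where this table is used, and NUM_LEN_SLOTS is taken
 | | * to be the 29 DEFLATE length codes): with match_prob = 0.5 and
 | | * num_used_literals = 256, the literal cost is
 | | * int(-log2(0.5/256) * 16) = int(9 * 16) = 144, the final entry of the
 | | * middle subtable, and the length symbol cost is
 | | * int(-log2(0.5/29) * 16) = int(5.858 * 16) = 93, matching that
 | | * subtable's .len_sym_cost.
 | | */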
3103 | | |
3104 | | /* |
3105 | | * Choose the default costs for literal and length symbols. These symbols are |
3106 | | * both part of the litlen alphabet. |
3107 | | */ |
3108 | | static void |
3109 | | deflate_choose_default_litlen_costs(struct libdeflate_compressor *c, |
3110 | | const u8 *block_begin, u32 block_length, |
3111 | | u32 *lit_cost, u32 *len_sym_cost) |
3112 | 0 | { |
3113 | 0 | u32 num_used_literals = 0; |
3114 | 0 | u32 literal_freq = block_length; |
3115 | 0 | u32 match_freq = 0; |
3116 | 0 | u32 cutoff; |
3117 | 0 | u32 i; |
3118 | | |
3119 | | /* Calculate the number of distinct literals that exist in the data. */ |
3120 | 0 | memset(c->freqs.litlen, 0, |
3121 | 0 | DEFLATE_NUM_LITERALS * sizeof(c->freqs.litlen[0])); |
3122 | 0 | cutoff = literal_freq >> 11; /* Ignore literals used very rarely. */ |
3123 | 0 | for (i = 0; i < block_length; i++) |
3124 | 0 | c->freqs.litlen[block_begin[i]]++; |
3125 | 0 | for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { |
3126 | 0 | if (c->freqs.litlen[i] > cutoff) |
3127 | 0 | num_used_literals++; |
3128 | 0 | } |
3129 | 0 | if (num_used_literals == 0) |
3130 | 0 | num_used_literals = 1; |
3131 | | |
3132 | | /* |
3133 | | * Estimate the relative frequency of literals and matches in the |
3134 | | * optimal parsing solution. We don't know the optimal solution, so |
3135 | | * this can only be a very rough estimate. Therefore, we basically use |
3136 | | * the match frequency from a greedy parse. We also apply the min_len |
3137 | | * heuristic used by the greedy and lazy parsers, to avoid counting too |
3138 | | * many matches when literals are cheaper than short matches. |
3139 | | */ |
3140 | 0 | match_freq = 0; |
3141 | 0 | i = choose_min_match_len(num_used_literals, c->max_search_depth); |
3142 | 0 | for (; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) { |
3143 | 0 | match_freq += c->p.n.match_len_freqs[i]; |
3144 | 0 | literal_freq -= i * c->p.n.match_len_freqs[i]; |
3145 | 0 | } |
3146 | 0 | if ((s32)literal_freq < 0) /* shouldn't happen */ |
3147 | 0 | literal_freq = 0; |
3148 | |
|
3149 | 0 | if (match_freq > literal_freq) |
3150 | 0 | i = 2; /* many matches */ |
3151 | 0 | else if (match_freq * 4 > literal_freq) |
3152 | 0 | i = 1; /* neutral */ |
3153 | 0 | else |
3154 | 0 | i = 0; /* few matches */ |
3155 | |
|
3156 | 0 | STATIC_ASSERT(BIT_COST == 16); |
3157 | 0 | *lit_cost = default_litlen_costs[i].used_lits_to_lit_cost[ |
3158 | 0 | num_used_literals]; |
3159 | 0 | *len_sym_cost = default_litlen_costs[i].len_sym_cost; |
3160 | 0 | } |
3161 | | |
3162 | | static forceinline u32 |
3163 | | deflate_default_length_cost(u32 len, u32 len_sym_cost) |
3164 | 0 | { |
3165 | 0 | unsigned slot = deflate_length_slot[len]; |
3166 | 0 | u32 num_extra_bits = deflate_extra_length_bits[slot]; |
3167 | |
|
3168 | 0 | return len_sym_cost + (num_extra_bits * BIT_COST); |
3169 | 0 | } |
3170 | | |
3171 | | static forceinline u32 |
3172 | | deflate_default_offset_slot_cost(unsigned slot) |
3173 | 0 | { |
3174 | 0 | u32 num_extra_bits = deflate_extra_offset_bits[slot]; |
3175 | | /* |
3176 | | * Assume that all offset symbols are equally probable. |
3177 | | * The resulting cost is 'int(-log2(1/30) * BIT_COST)', |
3178 | | * where 30 is the number of potentially-used offset symbols. |
3179 | | */ |
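 | | /*
 | | * Arithmetic check: with BIT_COST = 16, int(-log2(1/30) * 16) =
 | | * int(4.907 * 16) = 78, and the integer expression below evaluates to
 | | * 4*16 + (907*16)/1000 = 64 + 14 = 78 as well.
 | | */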
3180 | 0 | u32 offset_sym_cost = 4*BIT_COST + (907*BIT_COST)/1000; |
3181 | |
|
3182 | 0 | return offset_sym_cost + (num_extra_bits * BIT_COST); |
3183 | 0 | } |
3184 | | |
3185 | | /* Set default symbol costs for the first block's first optimization pass. */ |
3186 | | static void |
3187 | | deflate_set_default_costs(struct libdeflate_compressor *c, |
3188 | | u32 lit_cost, u32 len_sym_cost) |
3189 | 0 | { |
3190 | 0 | u32 i; |
3191 | | |
3192 | | /* Literals */ |
3193 | 0 | for (i = 0; i < DEFLATE_NUM_LITERALS; i++) |
3194 | 0 | c->p.n.costs.literal[i] = lit_cost; |
3195 | | |
3196 | | /* Lengths */ |
3197 | 0 | for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) |
3198 | 0 | c->p.n.costs.length[i] = |
3199 | 0 | deflate_default_length_cost(i, len_sym_cost); |
3200 | | |
3201 | | /* Offset slots */ |
3202 | 0 | for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) |
3203 | 0 | c->p.n.costs.offset_slot[i] = |
3204 | 0 | deflate_default_offset_slot_cost(i); |
3205 | 0 | } |
3206 | | |
3207 | | static forceinline void |
3208 | | deflate_adjust_cost(u32 *cost_p, u32 default_cost, int change_amount) |
3209 | 0 | { |
3210 | 0 | if (change_amount == 0) |
3211 | | /* Block is very similar to previous; prefer previous costs. */ |
3212 | 0 | *cost_p = (default_cost + 3 * *cost_p) / 4; |
3213 | 0 | else if (change_amount == 1) |
3214 | 0 | *cost_p = (default_cost + *cost_p) / 2; |
3215 | 0 | else if (change_amount == 2) |
3216 | 0 | *cost_p = (5 * default_cost + 3 * *cost_p) / 8; |
3217 | 0 | else |
3218 | | /* Block differs greatly from previous; prefer default costs. */ |
3219 | 0 | *cost_p = (3 * default_cost + *cost_p) / 4; |
3220 | 0 | } |
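 | | /*
 | | * Example of the blending above (hypothetical costs): with
 | | * default_cost = 160 and a previous *cost_p of 96, change_amount values
 | | * 0, 1, 2, 3 give 112, 128, 136, and 144 respectively, i.e. the weight
 | | * shifts from the previous cost toward the default as the new block
 | | * looks more different from the previous one.
 | | */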
3221 | | |
3222 | | static forceinline void |
3223 | | deflate_adjust_costs_impl(struct libdeflate_compressor *c, |
3224 | | u32 lit_cost, u32 len_sym_cost, int change_amount) |
3225 | 0 | { |
3226 | 0 | u32 i; |
3227 | | |
3228 | | /* Literals */ |
3229 | 0 | for (i = 0; i < DEFLATE_NUM_LITERALS; i++) |
3230 | 0 | deflate_adjust_cost(&c->p.n.costs.literal[i], lit_cost, |
3231 | 0 | change_amount); |
3232 | | |
3233 | | /* Lengths */ |
3234 | 0 | for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) |
3235 | 0 | deflate_adjust_cost(&c->p.n.costs.length[i], |
3236 | 0 | deflate_default_length_cost(i, |
3237 | 0 | len_sym_cost), |
3238 | 0 | change_amount); |
3239 | | |
3240 | | /* Offset slots */ |
3241 | 0 | for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) |
3242 | 0 | deflate_adjust_cost(&c->p.n.costs.offset_slot[i], |
3243 | 0 | deflate_default_offset_slot_cost(i), |
3244 | 0 | change_amount); |
3245 | 0 | } |
3246 | | |
3247 | | /* |
3248 | | * Adjust the costs when beginning a new block. |
3249 | | * |
3250 | | * Since the current costs are optimized for the data already, it can be helpful |
3251 | | * to reuse them instead of starting over with the default costs. However, this |
3252 | | * depends on how similar the new block is to the previous block. Therefore, |
3253 | | * use a heuristic to decide how similar the blocks are, and mix together the |
3254 | | * current costs and the default costs accordingly. |
3255 | | */ |
3256 | | static void |
3257 | | deflate_adjust_costs(struct libdeflate_compressor *c, |
3258 | | u32 lit_cost, u32 len_sym_cost) |
3259 | 0 | { |
3260 | 0 | u64 total_delta = 0; |
3261 | 0 | u64 cutoff; |
3262 | 0 | int i; |
3263 | | |
3264 | | /* |
3265 | | * Decide how different the current block is from the previous block, |
3266 | | * using the block splitting statistics from the current and previous |
3267 | | * blocks. The more different the current block is, the more we prefer |
3268 | | * the default costs rather than the previous block's costs. |
3269 | | * |
3270 | | * The algorithm here is similar to the end-of-block check one, but here |
3271 | | * we compare two entire blocks rather than a partial block with a small |
3272 | | * extra part, and therefore we need 64-bit numbers in some places. |
3273 | | */ |
3274 | 0 | for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { |
3275 | 0 | u64 prev = (u64)c->p.n.prev_observations[i] * |
3276 | 0 | c->split_stats.num_observations; |
3277 | 0 | u64 cur = (u64)c->split_stats.observations[i] * |
3278 | 0 | c->p.n.prev_num_observations; |
3279 | |
|
3280 | 0 | total_delta += prev > cur ? prev - cur : cur - prev; |
3281 | 0 | } |
3282 | 0 | cutoff = ((u64)c->p.n.prev_num_observations * |
3283 | 0 | c->split_stats.num_observations * 200) / 512; |
3284 | |
|
3285 | 0 | if (total_delta > 3 * cutoff) |
3286 | | /* Big change in the data; just use the default costs. */ |
3287 | 0 | deflate_set_default_costs(c, lit_cost, len_sym_cost); |
3288 | 0 | else if (4 * total_delta > 9 * cutoff) |
3289 | 0 | deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 3); |
3290 | 0 | else if (2 * total_delta > 3 * cutoff) |
3291 | 0 | deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 2); |
3292 | 0 | else if (2 * total_delta > cutoff) |
3293 | 0 | deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 1); |
3294 | 0 | else |
3295 | 0 | deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 0); |
3296 | 0 | } |
3297 | | |
3298 | | static void |
3299 | | deflate_set_initial_costs(struct libdeflate_compressor *c, |
3300 | | const u8 *block_begin, u32 block_length, |
3301 | | bool is_first_block) |
3302 | 0 | { |
3303 | 0 | u32 lit_cost, len_sym_cost; |
3304 | |
|
3305 | 0 | deflate_choose_default_litlen_costs(c, block_begin, block_length, |
3306 | 0 | &lit_cost, &len_sym_cost); |
3307 | 0 | if (is_first_block) |
3308 | 0 | deflate_set_default_costs(c, lit_cost, len_sym_cost); |
3309 | 0 | else |
3310 | 0 | deflate_adjust_costs(c, lit_cost, len_sym_cost); |
3311 | 0 | } |
3312 | | |
3313 | | /* |
3314 | | * Find the minimum-cost path through the graph of possible match/literal |
3315 | | * choices for this block. |
3316 | | * |
3317 | | * We find the minimum cost path from 'c->p.n.optimum_nodes[0]', which |
3318 | | * represents the node at the beginning of the block, to |
3319 | | * 'c->p.n.optimum_nodes[block_length]', which represents the node at the end of |
3320 | | * the block. Edge costs are evaluated using the cost model 'c->p.n.costs'. |
3321 | | * |
3322 | | * The algorithm works backwards, starting at the end node and proceeding |
3323 | | * backwards one node at a time. At each node, the minimum cost to reach the |
3324 | | * end node is computed and the match/literal choice that begins that path is |
3325 | | * saved. |
3326 | | */ |
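 | | /*
 | | * The recurrence this implements, written out as a sketch: for each
 | | * position i from block_length - 1 down to 0,
 | | *
 | | * cost_to_end[i] = min(costs.literal[byte at i] + cost_to_end[i + 1],
 | | * min over each length 'len' offered by the matches
 | | * cached at i (each taken with the smallest offset
 | | * providing it) of costs.offset_slot[slot(offset)] +
 | | * costs.length[len] + cost_to_end[i + len])
 | | *
 | | * with item[i] recording the choice that achieved the minimum.
 | | */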
3327 | | static void |
3328 | | deflate_find_min_cost_path(struct libdeflate_compressor *c, |
3329 | | const u32 block_length, |
3330 | | const struct lz_match *cache_ptr) |
3331 | 0 | { |
3332 | 0 | struct deflate_optimum_node *end_node = |
3333 | 0 | &c->p.n.optimum_nodes[block_length]; |
3334 | 0 | struct deflate_optimum_node *cur_node = end_node; |
3335 | |
|
3336 | 0 | cur_node->cost_to_end = 0; |
3337 | 0 | do { |
3338 | 0 | unsigned num_matches; |
3339 | 0 | u32 literal; |
3340 | 0 | u32 best_cost_to_end; |
3341 | |
|
3342 | 0 | cur_node--; |
3343 | 0 | cache_ptr--; |
3344 | |
|
3345 | 0 | num_matches = cache_ptr->length; |
3346 | 0 | literal = cache_ptr->offset; |
3347 | | |
3348 | | /* It's always possible to choose a literal. */ |
3349 | 0 | best_cost_to_end = c->p.n.costs.literal[literal] + |
3350 | 0 | (cur_node + 1)->cost_to_end; |
3351 | 0 | cur_node->item = (literal << OPTIMUM_OFFSET_SHIFT) | 1; |
3352 | | |
3353 | | /* Also consider matches if there are any. */ |
3354 | 0 | if (num_matches) { |
3355 | 0 | const struct lz_match *match; |
3356 | 0 | u32 len; |
3357 | 0 | u32 offset; |
3358 | 0 | u32 offset_slot; |
3359 | 0 | u32 offset_cost; |
3360 | 0 | u32 cost_to_end; |
3361 | | |
3362 | | /* |
3363 | | * Consider each length from the minimum |
3364 | | * (DEFLATE_MIN_MATCH_LEN) to the length of the longest |
3365 | | * match found at this position. For each length, we |
3366 | | * consider only the smallest offset for which that |
3367 | | * length is available. Although this is not guaranteed |
3368 | | * to be optimal due to the possibility of a larger |
3369 | | * offset costing less than a smaller offset to code, |
3370 | | * this is a very useful heuristic. |
3371 | | */ |
3372 | 0 | match = cache_ptr - num_matches; |
3373 | 0 | len = DEFLATE_MIN_MATCH_LEN; |
3374 | 0 | do { |
3375 | 0 | offset = match->offset; |
3376 | 0 | offset_slot = c->p.n.offset_slot_full[offset]; |
3377 | 0 | offset_cost = |
3378 | 0 | c->p.n.costs.offset_slot[offset_slot]; |
3379 | 0 | do { |
3380 | 0 | cost_to_end = offset_cost + |
3381 | 0 | c->p.n.costs.length[len] + |
3382 | 0 | (cur_node + len)->cost_to_end; |
3383 | 0 | if (cost_to_end < best_cost_to_end) { |
3384 | 0 | best_cost_to_end = cost_to_end; |
3385 | 0 | cur_node->item = len | |
3386 | 0 | (offset << |
3387 | 0 | OPTIMUM_OFFSET_SHIFT); |
3388 | 0 | } |
3389 | 0 | } while (++len <= match->length); |
3390 | 0 | } while (++match != cache_ptr); |
3391 | 0 | cache_ptr -= num_matches; |
3392 | 0 | } |
3393 | 0 | cur_node->cost_to_end = best_cost_to_end; |
3394 | 0 | } while (cur_node != &c->p.n.optimum_nodes[0]); |
3395 | |
|
3396 | 0 | deflate_reset_symbol_frequencies(c); |
3397 | 0 | deflate_tally_item_list(c, block_length); |
3398 | 0 | deflate_make_huffman_codes(&c->freqs, &c->codes); |
3399 | 0 | } |
3400 | | |
3401 | | /* |
3402 | | * Choose the literals and matches for the current block, then output the block. |
3403 | | * |
3404 | | * To choose the literal/match sequence, we find the minimum-cost path through |
3405 | | * the block's graph of literal/match choices, given a cost model. However, the |
3406 | | * true cost of each symbol is unknown until the Huffman codes have been built, |
3407 | | * but at the same time the Huffman codes depend on the frequencies of chosen |
3408 | | * symbols. Consequently, multiple passes must be used to try to approximate an |
3409 | | * optimal solution. The first pass uses default costs, mixed with the costs |
3410 | | * from the previous block when it seems appropriate. Later passes use the |
3411 | | * Huffman codeword lengths from the previous pass as the costs. |
3412 | | * |
3413 | | * As an alternate strategy, also consider using only literals. The boolean |
3414 | | * returned in *used_only_literals indicates whether that strategy was best. |
3415 | | */ |
3416 | | static void |
3417 | | deflate_optimize_and_flush_block(struct libdeflate_compressor *c, |
3418 | | struct deflate_output_bitstream *os, |
3419 | | const u8 *block_begin, u32 block_length, |
3420 | | const struct lz_match *cache_ptr, |
3421 | | bool is_first_block, bool is_final_block, |
3422 | | bool *used_only_literals) |
3423 | 0 | { |
3424 | 0 | unsigned num_passes_remaining = c->p.n.max_optim_passes; |
3425 | 0 | u32 best_true_cost = UINT32_MAX; |
3426 | 0 | u32 true_cost; |
3427 | 0 | u32 only_lits_cost; |
3428 | 0 | u32 static_cost = UINT32_MAX; |
3429 | 0 | struct deflate_sequence seq_; |
3430 | 0 | struct deflate_sequence *seq = NULL; |
3431 | 0 | u32 i; |
3432 | | |
3433 | | /* |
3434 | | * On some data, using only literals (no matches) ends up being better |
3435 | | * than what the iterative optimization algorithm produces. Therefore, |
3436 | | * consider using only literals. |
3437 | | */ |
3438 | 0 | deflate_choose_all_literals(c, block_begin, block_length); |
3439 | 0 | only_lits_cost = deflate_compute_true_cost(c); |
3440 | | |
3441 | | /* |
3442 | | * Force the block to really end at the desired length, even if some |
3443 | | * matches extend beyond it. |
3444 | | */ |
3445 | 0 | for (i = block_length; |
3446 | 0 | i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN, |
3447 | 0 | ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++) |
3448 | 0 | c->p.n.optimum_nodes[i].cost_to_end = 0x80000000; |
3449 | | |
3450 | | /* |
3451 | | * Sometimes a static Huffman block ends up being cheapest, particularly |
3452 | | * if the block is small. So, if the block is sufficiently small, find |
3453 | | * the optimal static block solution and remember its cost. |
3454 | | */ |
3455 | 0 | if (block_length <= c->p.n.max_len_to_optimize_static_block) { |
3456 | | /* Save c->p.n.costs temporarily. */ |
3457 | 0 | c->p.n.costs_saved = c->p.n.costs; |
3458 | |
|
3459 | 0 | deflate_set_costs_from_codes(c, &c->static_codes.lens); |
3460 | 0 | deflate_find_min_cost_path(c, block_length, cache_ptr); |
3461 | 0 | static_cost = c->p.n.optimum_nodes[0].cost_to_end / BIT_COST; |
3462 | 0 | static_cost += 7; /* for the end-of-block symbol */ |
3463 | | |
3464 | | /* Restore c->p.n.costs. */ |
3465 | 0 | c->p.n.costs = c->p.n.costs_saved; |
3466 | 0 | } |
3467 | | |
3468 | | /* Initialize c->p.n.costs with default costs. */ |
3469 | 0 | deflate_set_initial_costs(c, block_begin, block_length, is_first_block); |
3470 | |
|
3471 | 0 | do { |
3472 | | /* |
3473 | | * Find the minimum-cost path for this pass. |
3474 | | * Also set c->freqs and c->codes to match the path. |
3475 | | */ |
3476 | 0 | deflate_find_min_cost_path(c, block_length, cache_ptr); |
3477 | | |
3478 | | /* |
3479 | | * Compute the exact cost of the block if the path were to be |
3480 | | * used. Note that this differs from |
3481 | | * c->p.n.optimum_nodes[0].cost_to_end in that true_cost uses |
3482 | | * the actual Huffman codes instead of c->p.n.costs. |
3483 | | */ |
3484 | 0 | true_cost = deflate_compute_true_cost(c); |
3485 | | |
3486 | | /* |
3487 | | * If the cost didn't improve much from the previous pass, then |
3488 | | * doing more passes probably won't be helpful, so stop early. |
3489 | | */ |
3490 | 0 | if (true_cost + c->p.n.min_improvement_to_continue > |
3491 | 0 | best_true_cost) |
3492 | 0 | break; |
3493 | | |
3494 | 0 | best_true_cost = true_cost; |
3495 | | |
3496 | | /* Save the cost model that gave 'best_true_cost'. */ |
3497 | 0 | c->p.n.costs_saved = c->p.n.costs; |
3498 | | |
3499 | | /* Update the cost model from the Huffman codes. */ |
3500 | 0 | deflate_set_costs_from_codes(c, &c->codes.lens); |
3501 | |
|
3502 | 0 | } while (--num_passes_remaining); |
3503 | | |
3504 | 0 | *used_only_literals = false; |
3505 | 0 | if (MIN(only_lits_cost, static_cost) < best_true_cost) { |
3506 | 0 | if (only_lits_cost < static_cost) { |
3507 | | /* Using only literals ended up being best! */ |
3508 | 0 | deflate_choose_all_literals(c, block_begin, block_length); |
3509 | 0 | deflate_set_costs_from_codes(c, &c->codes.lens); |
3510 | 0 | seq_.litrunlen_and_length = block_length; |
3511 | 0 | seq = &seq_; |
3512 | 0 | *used_only_literals = true; |
3513 | 0 | } else { |
3514 | | /* Static block ended up being best! */ |
3515 | 0 | deflate_set_costs_from_codes(c, &c->static_codes.lens); |
3516 | 0 | deflate_find_min_cost_path(c, block_length, cache_ptr); |
3517 | 0 | } |
3518 | 0 | } else if (true_cost >= |
3519 | 0 | best_true_cost + c->p.n.min_bits_to_use_nonfinal_path) { |
3520 | | /* |
3521 | | * The best solution was actually from a non-final optimization |
3522 | | * pass, so recover and use the min-cost path from that pass. |
3523 | | */ |
3524 | 0 | c->p.n.costs = c->p.n.costs_saved; |
3525 | 0 | deflate_find_min_cost_path(c, block_length, cache_ptr); |
3526 | 0 | deflate_set_costs_from_codes(c, &c->codes.lens); |
3527 | 0 | } |
3528 | 0 | deflate_flush_block(c, os, block_begin, block_length, seq, |
3529 | 0 | is_final_block); |
3530 | 0 | } |
3531 | | |
3532 | | static void |
3533 | | deflate_near_optimal_init_stats(struct libdeflate_compressor *c) |
3534 | 0 | { |
3535 | 0 | init_block_split_stats(&c->split_stats); |
3536 | 0 | memset(c->p.n.new_match_len_freqs, 0, |
3537 | 0 | sizeof(c->p.n.new_match_len_freqs)); |
3538 | 0 | memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs)); |
3539 | 0 | } |
3540 | | |
3541 | | static void |
3542 | | deflate_near_optimal_merge_stats(struct libdeflate_compressor *c) |
3543 | 0 | { |
3544 | 0 | unsigned i; |
3545 | |
|
3546 | 0 | merge_new_observations(&c->split_stats); |
3547 | 0 | for (i = 0; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) { |
3548 | 0 | c->p.n.match_len_freqs[i] += c->p.n.new_match_len_freqs[i]; |
3549 | 0 | c->p.n.new_match_len_freqs[i] = 0; |
3550 | 0 | } |
3551 | 0 | } |
3552 | | |
3553 | | /* |
3554 | | * Save some literal/match statistics from the previous block so that |
3555 | | * deflate_adjust_costs() will be able to decide how much the current block |
3556 | | * differs from the previous one. |
3557 | | */ |
3558 | | static void |
3559 | | deflate_near_optimal_save_stats(struct libdeflate_compressor *c) |
3560 | 0 | { |
3561 | 0 | int i; |
3562 | |
|
3563 | 0 | for (i = 0; i < NUM_OBSERVATION_TYPES; i++) |
3564 | 0 | c->p.n.prev_observations[i] = c->split_stats.observations[i]; |
3565 | 0 | c->p.n.prev_num_observations = c->split_stats.num_observations; |
3566 | 0 | } |
3567 | | |
3568 | | static void |
3569 | | deflate_near_optimal_clear_old_stats(struct libdeflate_compressor *c) |
3570 | 0 | { |
3571 | 0 | int i; |
3572 | |
|
3573 | 0 | for (i = 0; i < NUM_OBSERVATION_TYPES; i++) |
3574 | 0 | c->split_stats.observations[i] = 0; |
3575 | 0 | c->split_stats.num_observations = 0; |
3576 | 0 | memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs)); |
3577 | 0 | } |
3578 | | |
3579 | | /* |
3580 | | * This is the "near-optimal" DEFLATE compressor. It computes the optimal |
3581 | | * representation of each DEFLATE block using a minimum-cost path search over |
3582 | | * the graph of possible match/literal choices for that block, assuming a |
3583 | | * certain cost for each Huffman symbol. |
3584 | | * |
3585 | | * For several reasons, the end result is not guaranteed to be optimal: |
3586 | | * |
3587 | | * - Nonoptimal choice of blocks |
3588 | | * - Heuristic limitations on which matches are actually considered |
3589 | | * - Symbol costs are unknown until the symbols have already been chosen |
3590 | | * (so iterative optimization must be used) |
3591 | | */ |
3592 | | static void |
3593 | | deflate_compress_near_optimal(struct libdeflate_compressor * restrict c, |
3594 | | const u8 *in, size_t in_nbytes, |
3595 | | struct deflate_output_bitstream *os) |
3596 | 0 | { |
3597 | 0 | const u8 *in_next = in; |
3598 | 0 | const u8 *in_block_begin = in_next; |
3599 | 0 | const u8 *in_end = in_next + in_nbytes; |
3600 | 0 | const u8 *in_cur_base = in_next; |
3601 | 0 | const u8 *in_next_slide = |
3602 | 0 | in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE); |
3603 | 0 | u32 max_len = DEFLATE_MAX_MATCH_LEN; |
3604 | 0 | u32 nice_len = MIN(c->nice_match_length, max_len); |
3605 | 0 | struct lz_match *cache_ptr = c->p.n.match_cache; |
3606 | 0 | u32 next_hashes[2] = {0, 0}; |
3607 | 0 | bool prev_block_used_only_literals = false; |
3608 | |
|
3609 | 0 | bt_matchfinder_init(&c->p.n.bt_mf); |
3610 | 0 | deflate_near_optimal_init_stats(c); |
3611 | |
|
3612 | 0 | do { |
3613 | | /* Starting a new DEFLATE block */ |
3614 | 0 | const u8 * const in_max_block_end = choose_max_block_end( |
3615 | 0 | in_block_begin, in_end, SOFT_MAX_BLOCK_LENGTH); |
3616 | 0 | const u8 *prev_end_block_check = NULL; |
3617 | 0 | bool change_detected = false; |
3618 | 0 | const u8 *next_observation = in_next; |
3619 | 0 | u32 min_len; |
3620 | | |
3621 | | /* |
3622 | | * Use the minimum match length heuristic to improve the |
3623 | | * literal/match statistics gathered during matchfinding. |
3624 | | * However, the actual near-optimal parse won't respect min_len, |
3625 | | * as it can accurately assess the costs of different matches. |
3626 | | * |
3627 | | * If the "use only literals" strategy happened to be the best |
3628 | | * strategy on the previous block, then probably the |
3629 | | * min_match_len heuristic is still not aggressive enough for |
3630 | | * the data, so force gathering literal stats only. |
3631 | | */ |
3632 | 0 | if (prev_block_used_only_literals) |
3633 | 0 | min_len = DEFLATE_MAX_MATCH_LEN + 1; |
3634 | 0 | else |
3635 | 0 | min_len = calculate_min_match_len( |
3636 | 0 | in_block_begin, |
3637 | 0 | in_max_block_end - in_block_begin, |
3638 | 0 | c->max_search_depth); |
3639 | | |
3640 | | /* |
3641 | | * Find matches until we decide to end the block. We end the |
3642 | | * block if any of the following is true: |
3643 | | * |
3644 | | * (1) The maximum block length has been reached.
3645 | | * (2) The match cache may overflow.
3646 | | * (3) The block split heuristic says to split now.
3647 | | */ |
3648 | 0 | for (;;) { |
3649 | 0 | struct lz_match *matches; |
3650 | 0 | u32 best_len; |
3651 | 0 | size_t remaining = in_end - in_next; |
3652 | | |
3653 | | /* Slide the window forward if needed. */ |
3654 | 0 | if (in_next == in_next_slide) { |
3655 | 0 | bt_matchfinder_slide_window(&c->p.n.bt_mf); |
3656 | 0 | in_cur_base = in_next; |
3657 | 0 | in_next_slide = in_next + |
3658 | 0 | MIN(remaining, MATCHFINDER_WINDOW_SIZE); |
3659 | 0 | } |
3660 | | |
3661 | | /* |
3662 | | * Find matches with the current position using the |
3663 | | * binary tree matchfinder and save them in match_cache. |
3664 | | * |
3665 | | * Note: the binary tree matchfinder is more suited for |
3666 | | * optimal parsing than the hash chain matchfinder. The |
3667 | | * reasons for this include: |
3668 | | * |
3669 | | * - The binary tree matchfinder can find more matches |
3670 | | * in the same number of steps. |
3671 | | * - One of the major advantages of hash chains is that |
3672 | | * skipping positions (not searching for matches at |
3673 | | * them) is faster; however, with optimal parsing we |
3674 | | * search for matches at almost all positions, so this |
3675 | | * advantage of hash chains is negated. |
3676 | | */ |
3677 | 0 | matches = cache_ptr; |
3678 | 0 | best_len = 0; |
3679 | 0 | adjust_max_and_nice_len(&max_len, &nice_len, remaining); |
3680 | 0 | if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) { |
3681 | 0 | cache_ptr = bt_matchfinder_get_matches( |
3682 | 0 | &c->p.n.bt_mf, |
3683 | 0 | in_cur_base, |
3684 | 0 | in_next - in_cur_base, |
3685 | 0 | max_len, |
3686 | 0 | nice_len, |
3687 | 0 | c->max_search_depth, |
3688 | 0 | next_hashes, |
3689 | 0 | matches); |
3690 | 0 | if (cache_ptr > matches) |
3691 | 0 | best_len = cache_ptr[-1].length; |
3692 | 0 | } |
3693 | 0 | if (in_next >= next_observation) { |
3694 | 0 | if (best_len >= min_len) { |
3695 | 0 | observe_match(&c->split_stats, |
3696 | 0 | best_len); |
3697 | 0 | next_observation = in_next + best_len; |
3698 | 0 | c->p.n.new_match_len_freqs[best_len]++; |
3699 | 0 | } else { |
3700 | 0 | observe_literal(&c->split_stats, |
3701 | 0 | *in_next); |
3702 | 0 | next_observation = in_next + 1; |
3703 | 0 | } |
3704 | 0 | } |
3705 | |
|
3706 | 0 | cache_ptr->length = cache_ptr - matches; |
3707 | 0 | cache_ptr->offset = *in_next; |
3708 | 0 | in_next++; |
3709 | 0 | cache_ptr++; |
3710 | | |
3711 | | /* |
3712 | | * If there was a very long match found, don't cache any |
3713 | | * matches for the bytes covered by that match. This |
3714 | | * avoids degenerate behavior when compressing highly |
3715 | | * redundant data, where the number of matches can be |
3716 | | * very large. |
3717 | | * |
3718 | | * This heuristic doesn't actually hurt the compression |
3719 | | * ratio very much. If there's a long match, then the |
3720 | | * data must be highly compressible, so it doesn't |
3721 | | * matter much what we do. |
3722 | | */ |
3723 | 0 | if (best_len >= DEFLATE_MIN_MATCH_LEN && |
3724 | 0 | best_len >= nice_len) { |
3725 | 0 | --best_len; |
3726 | 0 | do { |
3727 | 0 | remaining = in_end - in_next; |
3728 | 0 | if (in_next == in_next_slide) { |
3729 | 0 | bt_matchfinder_slide_window( |
3730 | 0 | &c->p.n.bt_mf); |
3731 | 0 | in_cur_base = in_next; |
3732 | 0 | in_next_slide = in_next + |
3733 | 0 | MIN(remaining, |
3734 | 0 | MATCHFINDER_WINDOW_SIZE); |
3735 | 0 | } |
3736 | 0 | adjust_max_and_nice_len(&max_len, |
3737 | 0 | &nice_len, |
3738 | 0 | remaining); |
3739 | 0 | if (max_len >= |
3740 | 0 | BT_MATCHFINDER_REQUIRED_NBYTES) { |
3741 | 0 | bt_matchfinder_skip_byte( |
3742 | 0 | &c->p.n.bt_mf, |
3743 | 0 | in_cur_base, |
3744 | 0 | in_next - in_cur_base, |
3745 | 0 | nice_len, |
3746 | 0 | c->max_search_depth, |
3747 | 0 | next_hashes); |
3748 | 0 | } |
3749 | 0 | cache_ptr->length = 0; |
3750 | 0 | cache_ptr->offset = *in_next; |
3751 | 0 | in_next++; |
3752 | 0 | cache_ptr++; |
3753 | 0 | } while (--best_len); |
3754 | 0 | } |
3755 | | /* Maximum block length or end of input reached? */ |
3756 | 0 | if (in_next >= in_max_block_end) |
3757 | 0 | break; |
3758 | | /* Match cache overflowed? */ |
3759 | 0 | if (cache_ptr >= |
3760 | 0 | &c->p.n.match_cache[MATCH_CACHE_LENGTH]) |
3761 | 0 | break; |
3762 | | /* Not ready to try to end the block (again)? */ |
3763 | 0 | if (!ready_to_check_block(&c->split_stats, |
3764 | 0 | in_block_begin, in_next, |
3765 | 0 | in_end)) |
3766 | 0 | continue; |
3767 | | /* Check if it would be worthwhile to end the block. */ |
3768 | 0 | if (do_end_block_check(&c->split_stats, |
3769 | 0 | in_next - in_block_begin)) { |
3770 | 0 | change_detected = true; |
3771 | 0 | break; |
3772 | 0 | } |
3773 | | /* Ending the block doesn't seem worthwhile here. */ |
3774 | 0 | deflate_near_optimal_merge_stats(c); |
3775 | 0 | prev_end_block_check = in_next; |
3776 | 0 | } |
3777 | | /* |
3778 | | * All the matches for this block have been cached. Now choose |
3779 | | * the precise end of the block and the sequence of items to |
3780 | | * output to represent it, then flush the block. |
3781 | | */ |
3782 | 0 | if (change_detected && prev_end_block_check != NULL) { |
3783 | | /* |
3784 | | * The block is being ended because a recent chunk of |
3785 | | * data differs from the rest of the block. We could |
3786 | | * end the block at 'in_next' like the greedy and lazy |
3787 | | * compressors do, but that's not ideal since it would |
3788 | | * include the differing chunk in the block. The |
3789 | | * near-optimal compressor has time to do a better job. |
3790 | | * Therefore, we rewind to just before the chunk, and |
3791 | | * output a block that only goes up to there. |
3792 | | * |
3793 | | * We then set things up to correctly start the next |
3794 | | * block, considering that some work has already been |
3795 | | * done on it (some matches found and stats gathered). |
3796 | | */ |
3797 | 0 | struct lz_match *orig_cache_ptr = cache_ptr; |
3798 | 0 | const u8 *in_block_end = prev_end_block_check; |
3799 | 0 | u32 block_length = in_block_end - in_block_begin; |
3800 | 0 | bool is_first = (in_block_begin == in); |
3801 | 0 | bool is_final = false; |
3802 | 0 | u32 num_bytes_to_rewind = in_next - in_block_end; |
3803 | 0 | size_t cache_len_rewound; |
3804 | | |
3805 | | /* Rewind the match cache. */ |
3806 | 0 | do { |
3807 | 0 | cache_ptr--; |
3808 | 0 | cache_ptr -= cache_ptr->length; |
3809 | 0 | } while (--num_bytes_to_rewind); |
3810 | 0 | cache_len_rewound = orig_cache_ptr - cache_ptr; |
3811 | |
|
3812 | 0 | deflate_optimize_and_flush_block( |
3813 | 0 | c, os, in_block_begin, |
3814 | 0 | block_length, cache_ptr, |
3815 | 0 | is_first, is_final, |
3816 | 0 | &prev_block_used_only_literals); |
3817 | 0 | memmove(c->p.n.match_cache, cache_ptr, |
3818 | 0 | cache_len_rewound * sizeof(*cache_ptr)); |
3819 | 0 | cache_ptr = &c->p.n.match_cache[cache_len_rewound]; |
3820 | 0 | deflate_near_optimal_save_stats(c); |
3821 | | /* |
3822 | | * Clear the stats for the just-flushed block, leaving |
3823 | | * just the stats for the beginning of the next block. |
3824 | | */ |
3825 | 0 | deflate_near_optimal_clear_old_stats(c); |
3826 | 0 | in_block_begin = in_block_end; |
3827 | 0 | } else { |
3828 | | /* |
3829 | | * The block is being ended for a reason other than a |
3830 | | * differing data chunk being detected. Don't rewind at |
3831 | | * all; just end the block at the current position. |
3832 | | */ |
3833 | 0 | u32 block_length = in_next - in_block_begin; |
3834 | 0 | bool is_first = (in_block_begin == in); |
3835 | 0 | bool is_final = (in_next == in_end); |
3836 | |
|
3837 | 0 | deflate_near_optimal_merge_stats(c); |
3838 | 0 | deflate_optimize_and_flush_block( |
3839 | 0 | c, os, in_block_begin, |
3840 | 0 | block_length, cache_ptr, |
3841 | 0 | is_first, is_final, |
3842 | 0 | &prev_block_used_only_literals); |
3843 | 0 | cache_ptr = &c->p.n.match_cache[0]; |
3844 | 0 | deflate_near_optimal_save_stats(c); |
3845 | 0 | deflate_near_optimal_init_stats(c); |
3846 | 0 | in_block_begin = in_next; |
3847 | 0 | } |
3848 | 0 | } while (in_next != in_end && !os->overflow); |
3849 | 0 | } |
3850 | | |
3851 | | /* Initialize c->p.n.offset_slot_full. */ |
3852 | | static void |
3853 | | deflate_init_offset_slot_full(struct libdeflate_compressor *c) |
3854 | 0 | { |
3855 | 0 | u32 offset_slot; |
3856 | 0 | u32 offset; |
3857 | 0 | u32 offset_end; |
3858 | |
|
3859 | 0 | for (offset_slot = 0; offset_slot < ARRAY_LEN(deflate_offset_slot_base); |
3860 | 0 | offset_slot++) { |
3861 | 0 | offset = deflate_offset_slot_base[offset_slot]; |
3862 | 0 | offset_end = offset + |
3863 | 0 | (1 << deflate_extra_offset_bits[offset_slot]); |
3864 | 0 | do { |
3865 | 0 | c->p.n.offset_slot_full[offset] = offset_slot; |
3866 | 0 | } while (++offset != offset_end); |
3867 | 0 | } |
3868 | 0 | } |
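 | | /*
 | | * Example of the resulting mapping (per the standard DEFLATE offset
 | | * code): the slot whose base is 9 has 2 extra bits, so the loop above
 | | * stores that slot's index in offset_slot_full[9] through
 | | * offset_slot_full[12].
 | | */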
3869 | | |
3870 | | #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ |
3871 | | |
3872 | | LIBDEFLATEAPI struct libdeflate_compressor * |
3873 | | libdeflate_alloc_compressor_ex(int compression_level, |
3874 | | const struct libdeflate_options *options) |
3875 | 0 | { |
3876 | 0 | struct libdeflate_compressor *c; |
3877 | 0 | size_t size = offsetof(struct libdeflate_compressor, p); |
3878 | |
|
3879 | 0 | check_buildtime_parameters(); |
3880 | | |
3881 | | /* |
3882 | | * Note: if more fields are added to libdeflate_options, this code will |
3883 | | * need to be updated to support both the old and new structs. |
3884 | | */ |
3885 | 0 | if (options->sizeof_options != sizeof(*options)) |
3886 | 0 | return NULL; |
3887 | | |
3888 | 0 | if (compression_level < 0 || compression_level > 12) |
3889 | 0 | return NULL; |
3890 | | |
3891 | 0 | #if SUPPORT_NEAR_OPTIMAL_PARSING |
3892 | 0 | if (compression_level >= 10) |
3893 | 0 | size += sizeof(c->p.n); |
3894 | 0 | else |
3895 | 0 | #endif |
3896 | 0 | { |
3897 | 0 | if (compression_level >= 2) |
3898 | 0 | size += sizeof(c->p.g); |
3899 | 0 | else if (compression_level == 1) |
3900 | 0 | size += sizeof(c->p.f); |
3901 | 0 | } |
3902 | |
|
3903 | 0 | c = libdeflate_aligned_malloc(options->malloc_func ? |
3904 | 0 | options->malloc_func : |
3905 | 0 | libdeflate_default_malloc_func, |
3906 | 0 | MATCHFINDER_MEM_ALIGNMENT, size); |
3907 | 0 | if (!c) |
3908 | 0 | return NULL; |
3909 | 0 | c->free_func = options->free_func ? |
3910 | 0 | options->free_func : libdeflate_default_free_func; |
3911 | |
|
3912 | 0 | c->compression_level = compression_level; |
3913 | | |
3914 | | /* |
3915 | | * The higher the compression level, the more we should bother trying to |
3916 | | * compress very small inputs. |
3917 | | */ |
3918 | 0 | c->max_passthrough_size = 55 - (compression_level * 4); |
3919 | |
|
3920 | 0 | switch (compression_level) { |
3921 | 0 | case 0: |
3922 | 0 | c->max_passthrough_size = SIZE_MAX; |
3923 | 0 | c->impl = NULL; /* not used */ |
3924 | 0 | break; |
3925 | 0 | case 1: |
3926 | 0 | c->impl = deflate_compress_fastest; |
3927 | | /* max_search_depth is unused. */ |
3928 | 0 | c->nice_match_length = 32; |
3929 | 0 | break; |
3930 | 0 | case 2: |
3931 | 0 | c->impl = deflate_compress_greedy; |
3932 | 0 | c->max_search_depth = 6; |
3933 | 0 | c->nice_match_length = 10; |
3934 | 0 | break; |
3935 | 0 | case 3: |
3936 | 0 | c->impl = deflate_compress_greedy; |
3937 | 0 | c->max_search_depth = 12; |
3938 | 0 | c->nice_match_length = 14; |
3939 | 0 | break; |
3940 | 0 | case 4: |
3941 | 0 | c->impl = deflate_compress_greedy; |
3942 | 0 | c->max_search_depth = 16; |
3943 | 0 | c->nice_match_length = 30; |
3944 | 0 | break; |
3945 | 0 | case 5: |
3946 | 0 | c->impl = deflate_compress_lazy; |
3947 | 0 | c->max_search_depth = 16; |
3948 | 0 | c->nice_match_length = 30; |
3949 | 0 | break; |
3950 | 0 | case 6: |
3951 | 0 | c->impl = deflate_compress_lazy; |
3952 | 0 | c->max_search_depth = 35; |
3953 | 0 | c->nice_match_length = 65; |
3954 | 0 | break; |
3955 | 0 | case 7: |
3956 | 0 | c->impl = deflate_compress_lazy; |
3957 | 0 | c->max_search_depth = 100; |
3958 | 0 | c->nice_match_length = 130; |
3959 | 0 | break; |
3960 | 0 | case 8: |
3961 | 0 | c->impl = deflate_compress_lazy2; |
3962 | 0 | c->max_search_depth = 300; |
3963 | 0 | c->nice_match_length = DEFLATE_MAX_MATCH_LEN; |
3964 | 0 | break; |
3965 | 0 | case 9: |
3966 | | #if !SUPPORT_NEAR_OPTIMAL_PARSING |
3967 | | default: |
3968 | | #endif |
3969 | 0 | c->impl = deflate_compress_lazy2; |
3970 | 0 | c->max_search_depth = 600; |
3971 | 0 | c->nice_match_length = DEFLATE_MAX_MATCH_LEN; |
3972 | 0 | break; |
3973 | 0 | #if SUPPORT_NEAR_OPTIMAL_PARSING |
3974 | 0 | case 10: |
3975 | 0 | c->impl = deflate_compress_near_optimal; |
3976 | 0 | c->max_search_depth = 35; |
3977 | 0 | c->nice_match_length = 75; |
3978 | 0 | c->p.n.max_optim_passes = 2; |
3979 | 0 | c->p.n.min_improvement_to_continue = 32; |
3980 | 0 | c->p.n.min_bits_to_use_nonfinal_path = 32; |
3981 | 0 | c->p.n.max_len_to_optimize_static_block = 0; |
3982 | 0 | deflate_init_offset_slot_full(c); |
3983 | 0 | break; |
3984 | 0 | case 11: |
3985 | 0 | c->impl = deflate_compress_near_optimal; |
3986 | 0 | c->max_search_depth = 100; |
3987 | 0 | c->nice_match_length = 150; |
3988 | 0 | c->p.n.max_optim_passes = 4; |
3989 | 0 | c->p.n.min_improvement_to_continue = 16; |
3990 | 0 | c->p.n.min_bits_to_use_nonfinal_path = 16; |
3991 | 0 | c->p.n.max_len_to_optimize_static_block = 1000; |
3992 | 0 | deflate_init_offset_slot_full(c); |
3993 | 0 | break; |
3994 | 0 | case 12: |
3995 | 0 | default: |
3996 | 0 | c->impl = deflate_compress_near_optimal; |
3997 | 0 | c->max_search_depth = 300; |
3998 | 0 | c->nice_match_length = DEFLATE_MAX_MATCH_LEN; |
3999 | 0 | c->p.n.max_optim_passes = 10; |
4000 | 0 | c->p.n.min_improvement_to_continue = 1; |
4001 | 0 | c->p.n.min_bits_to_use_nonfinal_path = 1; |
4002 | 0 | c->p.n.max_len_to_optimize_static_block = 10000; |
4003 | 0 | deflate_init_offset_slot_full(c); |
4004 | 0 | break; |
4005 | 0 | #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ |
4006 | 0 | } |
4007 | | |
4008 | 0 | deflate_init_static_codes(c); |
4009 | |
4010 | 0 | return c; |
4011 | 0 | } |
4012 | | |
4013 | | |
4014 | | LIBDEFLATEAPI struct libdeflate_compressor * |
4015 | | libdeflate_alloc_compressor(int compression_level) |
4016 | 0 | { |
4017 | 0 | static const struct libdeflate_options defaults = { |
4018 | 0 | .sizeof_options = sizeof(defaults), |
4019 | 0 | }; |
4020 | 0 | return libdeflate_alloc_compressor_ex(compression_level, &defaults); |
4021 | 0 | } |
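
Caller-side sketch (editorial, not part of this file): the allocation path above can be exercised either through the plain wrapper or through libdeflate_alloc_compressor_ex() with custom allocators passed in struct libdeflate_options. The tracing allocators and make_compressor() below are hypothetical; only the libdeflate_* names come from the library.

	#include <stdio.h>
	#include <stdlib.h>
	#include <libdeflate.h>

	/* Hypothetical allocators that log each request before deferring to
	 * malloc()/free(); they match the function-pointer types expected by
	 * struct libdeflate_options. */
	static void *trace_malloc(size_t size)
	{
		fprintf(stderr, "libdeflate alloc: %zu bytes\n", size);
		return malloc(size);
	}

	static void trace_free(void *ptr)
	{
		free(ptr);
	}

	static struct libdeflate_compressor *make_compressor(int level)
	{
		struct libdeflate_options opts = {
			.sizeof_options = sizeof(opts),
			.malloc_func = trace_malloc,
			.free_func = trace_free,
		};

		/* Levels 10-12 allocate the larger near-optimal parser state. */
		return libdeflate_alloc_compressor_ex(level, &opts);
	}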
4022 | | |
4023 | | LIBDEFLATEAPI size_t |
4024 | | libdeflate_deflate_compress(struct libdeflate_compressor *c, |
4025 | | const void *in, size_t in_nbytes, |
4026 | | void *out, size_t out_nbytes_avail) |
4027 | 0 | { |
4028 | 0 | struct deflate_output_bitstream os; |
4029 | | |
4030 | | /* |
4031 | | * For extremely short inputs, or for compression level 0, just output |
4032 | | * uncompressed blocks. |
4033 | | */ |
4034 | 0 | if (unlikely(in_nbytes <= c->max_passthrough_size)) |
4035 | 0 | return deflate_compress_none(in, in_nbytes, |
4036 | 0 | out, out_nbytes_avail); |
4037 | | |
4038 | | /* Initialize the output bitstream structure. */ |
4039 | 0 | os.bitbuf = 0; |
4040 | 0 | os.bitcount = 0; |
4041 | 0 | os.next = out; |
4042 | 0 | os.end = os.next + out_nbytes_avail; |
4043 | 0 | os.overflow = false; |
4044 | | |
4045 | | /* Call the actual compression function. */ |
4046 | 0 | (*c->impl)(c, in, in_nbytes, &os); |
4047 | | |
4048 | | /* Return 0 if the output buffer is too small. */ |
4049 | 0 | if (os.overflow) |
4050 | 0 | return 0; |
4051 | | |
4052 | | /* |
4053 | | * Write the final byte if needed. This can't overflow the output |
4054 | | * buffer because deflate_flush_block() would have set the overflow flag |
4055 | | * if there wasn't enough space remaining for the full final block. |
4056 | | */ |
4057 | 0 | ASSERT(os.bitcount <= 7); |
4058 | 0 | if (os.bitcount) { |
4059 | 0 | ASSERT(os.next < os.end); |
4060 | 0 | *os.next++ = os.bitbuf; |
4061 | 0 | } |
4062 | | |
4063 | | /* Return the compressed size in bytes. */ |
4064 | 0 | return os.next - (u8 *)out; |
4065 | 0 | } |
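
A minimal usage sketch for this entry point (editorial; compress_buffer() and its error handling are the editor's own, while the libdeflate_* calls are the ones defined in this file). Sizing the output buffer with libdeflate_deflate_compress_bound() ensures that a 0 return from the compressor cannot be caused by a too-small buffer.

	#include <stdlib.h>
	#include <libdeflate.h>

	/* Compress 'in' into a newly allocated buffer returned via *out_ret.
	 * Returns the compressed size in bytes, or 0 on failure. */
	static size_t compress_buffer(const void *in, size_t in_nbytes,
				      void **out_ret)
	{
		struct libdeflate_compressor *c;
		size_t out_avail, out_nbytes = 0;
		void *out = NULL;

		*out_ret = NULL;
		c = libdeflate_alloc_compressor(6);	/* any level 0-12 */
		if (!c)
			return 0;

		/* With the bound, a 0 return below cannot mean "buffer too
		 * small"; it can only result from the malloc() failing. */
		out_avail = libdeflate_deflate_compress_bound(c, in_nbytes);
		out = malloc(out_avail);
		if (out)
			out_nbytes = libdeflate_deflate_compress(c, in, in_nbytes,
								 out, out_avail);
		libdeflate_free_compressor(c);

		if (out_nbytes == 0) {
			free(out);
			return 0;
		}
		*out_ret = out;
		return out_nbytes;
	}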
4066 | | |
4067 | | LIBDEFLATEAPI void |
4068 | | libdeflate_free_compressor(struct libdeflate_compressor *c) |
4069 | 0 | { |
4070 | 0 | if (c) |
4071 | 0 | libdeflate_aligned_free(c->free_func, c); |
4072 | 0 | } |
4073 | | |
4074 | | unsigned int |
4075 | | libdeflate_get_compression_level(struct libdeflate_compressor *c) |
4076 | 0 | { |
4077 | 0 | return c->compression_level; |
4078 | 0 | } |
4079 | | |
4080 | | LIBDEFLATEAPI size_t |
4081 | | libdeflate_deflate_compress_bound(struct libdeflate_compressor *c, |
4082 | | size_t in_nbytes) |
4083 | 0 | { |
4084 | 0 | size_t max_blocks; |
4085 | | |
4086 | | /* |
4087 | | * Since the compressor never uses a compressed block when an |
4088 | | * uncompressed block is cheaper, the worst case can be no worse than |
4089 | | * the case where only uncompressed blocks are used. |
4090 | | * |
4091 | | * This is true even though up to 7 bits are "wasted" to byte-align the |
4092 | | * bitstream when a compressed block is followed by an uncompressed |
4093 | | * block. This is because a compressed block wouldn't have been used if |
4094 | | * it wasn't cheaper than an uncompressed block, and uncompressed blocks |
4095 | | * always end on a byte boundary. So the alignment bits will, at worst, |
4096 | | * go up to the place where the uncompressed block would have ended. |
4097 | | */ |
4098 | | |
4099 | | /* |
4100 | | * Calculate the maximum number of uncompressed blocks that the |
4101 | | * compressor can use for 'in_nbytes' of data. |
4102 | | * |
4103 | | * The minimum length that is passed to deflate_flush_block() is |
4104 | | * MIN_BLOCK_LENGTH bytes, except for the final block if needed. If |
4105 | | * deflate_flush_block() decides to use an uncompressed block, it |
4106 | | * actually will (in general) output a series of uncompressed blocks in |
4107 | | * order to stay within the UINT16_MAX limit of DEFLATE. But this can |
4108 | | * be disregarded here as long as '2 * MIN_BLOCK_LENGTH <= UINT16_MAX', |
4109 | | * as in that case this behavior can't result in more blocks than the |
4110 | | * case where deflate_flush_block() is called with min-length inputs. |
4111 | | * |
4112 | | * So the number of uncompressed blocks needed would be bounded by |
4113 | | * DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH). However, empty inputs |
4114 | | * need 1 (empty) block, which gives the final expression below. |
4115 | | */ |
4116 | 0 | STATIC_ASSERT(2 * MIN_BLOCK_LENGTH <= UINT16_MAX); |
4117 | 0 | max_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1); |
4118 | | |
4119 | | /* |
4120 | | * Each uncompressed block has 5 bytes of overhead, for the BFINAL, |
4121 | | * BTYPE, LEN, and NLEN fields. (For the reason explained earlier, the |
4122 | | * alignment bits at the very start of the block can be disregarded; |
4123 | | * they would otherwise increase the overhead to 6 bytes per block.) |
4124 | | * Therefore, the maximum number of overhead bytes is '5 * max_blocks'. |
4125 | | * To get the final bound, add the number of uncompressed bytes. |
4126 | | */ |
4127 | 0 | return (5 * max_blocks) + in_nbytes; |
4128 | 0 | } |
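
To make the bound concrete, a worked example (editorial, using the MIN_BLOCK_LENGTH of 5000 defined near the top of this file): for in_nbytes = 1,000,000, max_blocks = DIV_ROUND_UP(1000000, 5000) = 200, so the bound is 5 * 200 + 1,000,000 = 1,001,000 bytes, i.e. roughly 0.1% worst-case overhead. For an empty input, max_blocks is forced to 1 and the bound is 5 bytes, matching a single empty stored block.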