/work/_deps/deflate-src/lib/deflate_compress.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * deflate_compress.c - a compressor for DEFLATE |
3 | | * |
4 | | * Copyright 2016 Eric Biggers |
5 | | * |
6 | | * Permission is hereby granted, free of charge, to any person |
7 | | * obtaining a copy of this software and associated documentation |
8 | | * files (the "Software"), to deal in the Software without |
9 | | * restriction, including without limitation the rights to use, |
10 | | * copy, modify, merge, publish, distribute, sublicense, and/or sell |
11 | | * copies of the Software, and to permit persons to whom the |
12 | | * Software is furnished to do so, subject to the following |
13 | | * conditions: |
14 | | * |
15 | | * The above copyright notice and this permission notice shall be |
16 | | * included in all copies or substantial portions of the Software. |
17 | | * |
18 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
19 | | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES |
20 | | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
21 | | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT |
22 | | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |
23 | | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
24 | | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
25 | | * OTHER DEALINGS IN THE SOFTWARE. |
26 | | */ |
27 | | |
28 | | #include "deflate_compress.h" |
29 | | #include "deflate_constants.h" |
30 | | |
31 | | /******************************************************************************/ |
32 | | |
33 | | /* |
34 | | * The following parameters can be changed at build time to customize the |
35 | | * compression algorithms slightly: |
36 | | * |
37 | | * (Note, not all customizable parameters are here. Some others can be found in |
38 | | * libdeflate_alloc_compressor() and in *_matchfinder.h.) |
39 | | */ |
40 | | |
41 | | /* |
42 | | * If this parameter is defined to 1, then the near-optimal parsing algorithm |
43 | | * will be included, and compression levels 10-12 will use it. This algorithm |
44 | | * usually produces a compression ratio significantly better than the other |
45 | | * algorithms. However, it is slow. If this parameter is defined to 0, then |
46 | | * levels 10-12 will be the same as level 9 and will use the lazy2 algorithm. |
47 | | */ |
48 | | #define SUPPORT_NEAR_OPTIMAL_PARSING 1 |
49 | | |
50 | | /* |
51 | | * This is the minimum block length that the compressor will use, in |
52 | | * uncompressed bytes. This should be a value below which using shorter blocks |
53 | | * is unlikely to be worthwhile, due to the per-block overhead. This value does |
54 | | * not apply to the final block, which may be shorter than this (if the input is |
55 | | * shorter, it will have to be), or to the final uncompressed block in a series |
56 | | * of uncompressed blocks that cover more than UINT16_MAX bytes. |
57 | | * |
58 | | * This value is also approximately the amount by which what would otherwise be |
59 | | * the second-to-last block is allowed to grow past the soft maximum length in |
60 | | * order to avoid having to use a very short final block. |
61 | | * |
62 | | * Defining a fixed minimum block length is needed in order to guarantee a |
63 | | * reasonable upper bound on the compressed size. It's also needed because our |
64 | | * block splitting algorithm doesn't work well on very short blocks. |
65 | | */ |
66 | 0 | #define MIN_BLOCK_LENGTH 5000 |
67 | | |
68 | | /* |
69 | | * For the greedy, lazy, lazy2, and near-optimal compressors: This is the soft |
70 | | * maximum block length, in uncompressed bytes. The compressor will try to end |
71 | | * blocks at this length, but it may go slightly past it if there is a match |
72 | | * that straddles this limit or if the input data ends soon after this limit. |
73 | | * This parameter doesn't apply to uncompressed blocks, which the DEFLATE format |
74 | | * limits to 65535 bytes. |
75 | | * |
76 | | * This should be a value above which it is very likely that splitting the block |
77 | | * would produce a better compression ratio. For the near-optimal compressor, |
78 | | * increasing/decreasing this parameter will increase/decrease per-compressor |
79 | | * memory usage linearly. |
80 | | */ |
81 | 0 | #define SOFT_MAX_BLOCK_LENGTH 300000 |
82 | | |
83 | | /* |
84 | | * For the greedy, lazy, and lazy2 compressors: this is the length of the |
85 | | * sequence store, which is an array where the compressor temporarily stores |
86 | | * matches that it's going to use in the current block. This value is the |
87 | | * maximum number of matches that can be used in a block. If the sequence store |
88 | | * fills up, then the compressor will be forced to end the block early. This |
89 | | * value should be large enough so that this rarely happens, due to the block |
90 | | * being ended normally before then. Increasing/decreasing this value will |
91 | | * increase/decrease per-compressor memory usage linearly. |
92 | | */ |
93 | 0 | #define SEQ_STORE_LENGTH 50000 |
94 | | |
95 | | /* |
96 | | * For deflate_compress_fastest(): This is the soft maximum block length. |
97 | | * deflate_compress_fastest() doesn't use the regular block splitting algorithm; |
98 | | * it only ends blocks when they reach FAST_SOFT_MAX_BLOCK_LENGTH bytes or |
99 | | * FAST_SEQ_STORE_LENGTH matches. Therefore, this value should be lower than |
100 | | * the regular SOFT_MAX_BLOCK_LENGTH. |
101 | | */ |
102 | 0 | #define FAST_SOFT_MAX_BLOCK_LENGTH 65535 |
103 | | |
104 | | /* |
105 | | * For deflate_compress_fastest(): this is the length of the sequence store. |
106 | | * This is like SEQ_STORE_LENGTH, but this should be a lower value. |
107 | | */ |
108 | 0 | #define FAST_SEQ_STORE_LENGTH 8192 |
109 | | |
110 | | /* |
111 | | * These are the maximum codeword lengths, in bits, the compressor will use for |
112 | | * each Huffman code. The DEFLATE format defines limits for these. However, |
113 | | * further limiting litlen codewords to 14 bits is beneficial, since it has |
114 | | * negligible effect on compression ratio but allows some optimizations when |
115 | | * outputting bits. (It allows 4 literals to be written at once rather than 3.) |
116 | | */ |
117 | 0 | #define MAX_LITLEN_CODEWORD_LEN 14 |
118 | 0 | #define MAX_OFFSET_CODEWORD_LEN DEFLATE_MAX_OFFSET_CODEWORD_LEN |
119 | 0 | #define MAX_PRE_CODEWORD_LEN DEFLATE_MAX_PRE_CODEWORD_LEN |
120 | | |
121 | | #if SUPPORT_NEAR_OPTIMAL_PARSING |
122 | | |
123 | | /* Parameters specific to the near-optimal parsing algorithm */ |
124 | | |
125 | | /* |
126 | | * BIT_COST is a scaling factor that allows the near-optimal compressor to |
127 | | * consider fractional bit costs when deciding which literal/match sequence to |
128 | | * use. This is useful when the true symbol costs are unknown. For example, if |
129 | | * the compressor thinks that a symbol has 6.5 bits of entropy, it can set its |
130 | | * cost to 6.5 bits rather than have to use 6 or 7 bits. Although in the end |
131 | | * each symbol will use a whole number of bits due to the Huffman coding, |
132 | | * considering fractional bits can be helpful due to the limited information. |
133 | | * |
134 | | * BIT_COST should be a power of 2. A value of 8 or 16 works well. A higher |
135 | | * value isn't very useful since the calculations are approximate anyway. |
136 | | * |
137 | | * BIT_COST doesn't apply to deflate_flush_block() and |
138 | | * deflate_compute_true_cost(), which consider whole bits. |
139 | | */ |
140 | 0 | #define BIT_COST 16 |
141 | | |
142 | | /* |
143 | | * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to |
144 | | * be needed to output a symbol that was unused in the previous optimization |
145 | | * pass. Assigning a default cost allows the symbol to be used in the next |
146 | | * optimization pass. However, the cost should be relatively high because the |
147 | | * symbol probably won't be used very many times (if at all). |
148 | | */ |
149 | 0 | #define LITERAL_NOSTAT_BITS 13 |
150 | 0 | #define LENGTH_NOSTAT_BITS 13 |
151 | 0 | #define OFFSET_NOSTAT_BITS 10 |
152 | | |
153 | | /* |
154 | | * This is (slightly less than) the maximum number of matches that the |
155 | | * near-optimal compressor will cache per block. This behaves similarly to |
156 | | * SEQ_STORE_LENGTH for the other compressors. |
157 | | */ |
158 | 0 | #define MATCH_CACHE_LENGTH (SOFT_MAX_BLOCK_LENGTH * 5) |
159 | | |
160 | | #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ |
161 | | |
162 | | /******************************************************************************/ |
163 | | |
164 | | /* Include the needed matchfinders. */ |
165 | 0 | #define MATCHFINDER_WINDOW_ORDER DEFLATE_WINDOW_ORDER |
166 | | #include "hc_matchfinder.h" |
167 | | #include "ht_matchfinder.h" |
168 | | #if SUPPORT_NEAR_OPTIMAL_PARSING |
169 | | # include "bt_matchfinder.h" |
170 | | /* |
171 | | * This is the maximum number of matches the binary trees matchfinder can find |
172 | | * at a single position. Since the matchfinder never finds more than one match |
173 | | * for the same length, presuming one of each possible length is sufficient for |
174 | | * an upper bound. (This says nothing about whether it is worthwhile to |
175 | | * consider so many matches; this is just defining the worst case.) |
176 | | */ |
177 | | #define MAX_MATCHES_PER_POS \ |
178 | | (DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1) |
179 | | #endif |
180 | | |
181 | | /* |
182 | | * The largest block length we will ever use is when the final block is of |
183 | | * length SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1, or when any block is of |
184 | | * length SOFT_MAX_BLOCK_LENGTH + 1 + DEFLATE_MAX_MATCH_LEN. The latter case |
185 | | * occurs when the lazy2 compressor chooses two literals and a maximum-length |
186 | | * match, starting at SOFT_MAX_BLOCK_LENGTH - 1. |
187 | | */ |
188 | | #define MAX_BLOCK_LENGTH \ |
189 | | MAX(SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1, \ |
190 | | SOFT_MAX_BLOCK_LENGTH + 1 + DEFLATE_MAX_MATCH_LEN) |
191 | | |
192 | | static forceinline void |
193 | | check_buildtime_parameters(void) |
194 | 0 | { |
195 | | /* |
196 | | * Verify that MIN_BLOCK_LENGTH is being honored, as |
197 | | * libdeflate_deflate_compress_bound() depends on it. |
198 | | */ |
199 | 0 | STATIC_ASSERT(SOFT_MAX_BLOCK_LENGTH >= MIN_BLOCK_LENGTH); |
200 | 0 | STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH >= MIN_BLOCK_LENGTH); |
201 | 0 | STATIC_ASSERT(SEQ_STORE_LENGTH * DEFLATE_MIN_MATCH_LEN >= |
202 | 0 | MIN_BLOCK_LENGTH); |
203 | 0 | STATIC_ASSERT(FAST_SEQ_STORE_LENGTH * HT_MATCHFINDER_MIN_MATCH_LEN >= |
204 | 0 | MIN_BLOCK_LENGTH); |
205 | 0 | #if SUPPORT_NEAR_OPTIMAL_PARSING |
206 | 0 | STATIC_ASSERT(MIN_BLOCK_LENGTH * MAX_MATCHES_PER_POS <= |
207 | 0 | MATCH_CACHE_LENGTH); |
208 | 0 | #endif |
209 | | |
210 | | /* The definition of MAX_BLOCK_LENGTH assumes this. */ |
211 | 0 | STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH <= SOFT_MAX_BLOCK_LENGTH); |
212 | | |
213 | | /* Verify that the sequence stores aren't uselessly large. */ |
214 | 0 | STATIC_ASSERT(SEQ_STORE_LENGTH * DEFLATE_MIN_MATCH_LEN <= |
215 | 0 | SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH); |
216 | 0 | STATIC_ASSERT(FAST_SEQ_STORE_LENGTH * HT_MATCHFINDER_MIN_MATCH_LEN <= |
217 | 0 | FAST_SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH); |
218 | | |
219 | | /* Verify that the maximum codeword lengths are valid. */ |
220 | 0 | STATIC_ASSERT( |
221 | 0 | MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN); |
222 | 0 | STATIC_ASSERT( |
223 | 0 | MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN); |
224 | 0 | STATIC_ASSERT( |
225 | 0 | MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN); |
226 | 0 | STATIC_ASSERT( |
227 | 0 | (1U << MAX_LITLEN_CODEWORD_LEN) >= DEFLATE_NUM_LITLEN_SYMS); |
228 | 0 | STATIC_ASSERT( |
229 | 0 | (1U << MAX_OFFSET_CODEWORD_LEN) >= DEFLATE_NUM_OFFSET_SYMS); |
230 | 0 | STATIC_ASSERT( |
231 | 0 | (1U << MAX_PRE_CODEWORD_LEN) >= DEFLATE_NUM_PRECODE_SYMS); |
232 | 0 | } |
233 | | |
234 | | /******************************************************************************/ |
235 | | |
236 | | /* Table: length slot => length slot base value */ |
237 | | static const unsigned deflate_length_slot_base[] = { |
238 | | 3, 4, 5, 6, 7, 8, 9, 10, |
239 | | 11, 13, 15, 17, 19, 23, 27, 31, |
240 | | 35, 43, 51, 59, 67, 83, 99, 115, |
241 | | 131, 163, 195, 227, 258, |
242 | | }; |
243 | | |
244 | | /* Table: length slot => number of extra length bits */ |
245 | | static const u8 deflate_extra_length_bits[] = { |
246 | | 0, 0, 0, 0, 0, 0, 0, 0, |
247 | | 1, 1, 1, 1, 2, 2, 2, 2, |
248 | | 3, 3, 3, 3, 4, 4, 4, 4, |
249 | | 5, 5, 5, 5, 0, |
250 | | }; |
251 | | |
252 | | /* Table: offset slot => offset slot base value */ |
253 | | static const unsigned deflate_offset_slot_base[] = { |
254 | | 1, 2, 3, 4, 5, 7, 9, 13, |
255 | | 17, 25, 33, 49, 65, 97, 129, 193, |
256 | | 257, 385, 513, 769, 1025, 1537, 2049, 3073, |
257 | | 4097, 6145, 8193, 12289, 16385, 24577, |
258 | | }; |
259 | | |
260 | | /* Table: offset slot => number of extra offset bits */ |
261 | | static const u8 deflate_extra_offset_bits[] = { |
262 | | 0, 0, 0, 0, 1, 1, 2, 2, |
263 | | 3, 3, 4, 4, 5, 5, 6, 6, |
264 | | 7, 7, 8, 8, 9, 9, 10, 10, |
265 | | 11, 11, 12, 12, 13, 13, |
266 | | }; |
267 | | |
268 | | /* Table: length => length slot */ |
269 | | static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = { |
270 | | 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12, |
271 | | 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, |
272 | | 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, |
273 | | 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, |
274 | | 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, |
275 | | 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, |
276 | | 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, |
277 | | 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, |
278 | | 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, |
279 | | 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, |
280 | | 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, |
281 | | 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, |
282 | | 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, |
283 | | 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, |
284 | | 27, 27, 28, |
285 | | }; |
286 | | |
287 | | /* |
288 | | * A condensed table which maps offset => offset slot as follows: |
289 | | * |
290 | | * offset <= 256: deflate_offset_slot[offset] |
291 | | * offset > 256: deflate_offset_slot[256 + ((offset - 1) >> 7)] |
292 | | * |
293 | | * This table was generated by scripts/gen_offset_slot_map.py. |
294 | | */ |
295 | | static const u8 deflate_offset_slot[512] = { |
296 | | 0, 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, |
297 | | 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, |
298 | | 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, |
299 | | 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, |
300 | | 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, |
301 | | 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, |
302 | | 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, |
303 | | 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, |
304 | | 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, |
305 | | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, |
306 | | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, |
307 | | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, |
308 | | 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, |
309 | | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, |
310 | | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, |
311 | | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, |
312 | | 15, 0, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, |
313 | | 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, |
314 | | 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, |
315 | | 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, |
316 | | 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, |
317 | | 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, |
318 | | 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, |
319 | | 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, |
320 | | 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, |
321 | | 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, |
322 | | 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, |
323 | | 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, |
324 | | 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, |
325 | | 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, |
326 | | 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, |
327 | | 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, |
328 | | }; |
329 | | |
330 | | /* The order in which precode codeword lengths are stored */ |
331 | | static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { |
332 | | 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 |
333 | | }; |
334 | | |
335 | | /* Table: precode symbol => number of extra bits */ |
336 | | static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = { |
337 | | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7 |
338 | | }; |
339 | | |
340 | | /* Codewords for the DEFLATE Huffman codes */ |
341 | | struct deflate_codewords { |
342 | | u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; |
343 | | u32 offset[DEFLATE_NUM_OFFSET_SYMS]; |
344 | | }; |
345 | | |
346 | | /* |
347 | | * Codeword lengths (in bits) for the DEFLATE Huffman codes. |
348 | | * A zero length means the corresponding symbol had zero frequency. |
349 | | */ |
350 | | struct deflate_lens { |
351 | | u8 litlen[DEFLATE_NUM_LITLEN_SYMS]; |
352 | | u8 offset[DEFLATE_NUM_OFFSET_SYMS]; |
353 | | }; |
354 | | |
355 | | /* Codewords and lengths for the DEFLATE Huffman codes */ |
356 | | struct deflate_codes { |
357 | | struct deflate_codewords codewords; |
358 | | struct deflate_lens lens; |
359 | | }; |
360 | | |
361 | | /* Symbol frequency counters for the DEFLATE Huffman codes */ |
362 | | struct deflate_freqs { |
363 | | u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; |
364 | | u32 offset[DEFLATE_NUM_OFFSET_SYMS]; |
365 | | }; |
366 | | |
367 | | /* |
368 | | * Represents a run of literals followed by a match or end-of-block. This |
369 | | * struct is needed to temporarily store items chosen by the parser, since items |
370 | | * cannot be written until all items for the block have been chosen and the |
371 | | * block's Huffman codes have been computed. |
372 | | */ |
373 | | struct deflate_sequence { |
374 | | |
375 | | /* |
376 | | * Bits 0..22: the number of literals in this run. This may be 0 and |
377 | | * can be at most MAX_BLOCK_LENGTH. The literals are not stored |
378 | | * explicitly in this structure; instead, they are read directly from |
379 | | * the uncompressed data. |
380 | | * |
381 | | * Bits 23..31: the length of the match which follows the literals, or 0 |
382 | | * if this literal run was the last in the block, so there is no match |
383 | | * which follows it. |
384 | | */ |
385 | 0 | #define SEQ_LENGTH_SHIFT 23 |
386 | 0 | #define SEQ_LITRUNLEN_MASK (((u32)1 << SEQ_LENGTH_SHIFT) - 1) |
387 | | u32 litrunlen_and_length; |
388 | | |
389 | | /* |
390 | | * If 'length' doesn't indicate end-of-block, then this is the offset of |
391 | | * the match which follows the literals. |
392 | | */ |
393 | | u16 offset; |
394 | | |
395 | | /* |
396 | | * If 'length' doesn't indicate end-of-block, then this is the offset |
397 | | * slot of the match which follows the literals. |
398 | | */ |
399 | | u16 offset_slot; |
400 | | }; |
401 | | |
402 | | #if SUPPORT_NEAR_OPTIMAL_PARSING |
403 | | |
404 | | /* Costs for the near-optimal parsing algorithm */ |
405 | | struct deflate_costs { |
406 | | |
407 | | /* The cost to output each possible literal */ |
408 | | u32 literal[DEFLATE_NUM_LITERALS]; |
409 | | |
410 | | /* The cost to output each possible match length */ |
411 | | u32 length[DEFLATE_MAX_MATCH_LEN + 1]; |
412 | | |
413 | | /* The cost to output a match offset of each possible offset slot */ |
414 | | u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS]; |
415 | | }; |
416 | | |
417 | | /* |
418 | | * This structure represents a byte position in the input data and a node in the |
419 | | * graph of possible match/literal choices for the current block. |
420 | | * |
421 | | * Logically, each incoming edge to this node is labeled with a literal or a |
422 | | * match that can be taken to reach this position from an earlier position; and |
423 | | * each outgoing edge from this node is labeled with a literal or a match that |
424 | | * can be taken to advance from this position to a later position. |
425 | | * |
426 | | * But these "edges" are actually stored elsewhere (in 'match_cache'). Here we |
427 | | * associate with each node just two pieces of information: |
428 | | * |
429 | | * 'cost_to_end' is the minimum cost to reach the end of the block from |
430 | | * this position. |
431 | | * |
432 | | * 'item' represents the literal or match that must be chosen from here to |
433 | | * reach the end of the block with the minimum cost. Equivalently, this |
434 | | * can be interpreted as the label of the outgoing edge on the minimum-cost |
435 | | * path to the "end of block" node from this node. |
436 | | */ |
437 | | struct deflate_optimum_node { |
438 | | |
439 | | u32 cost_to_end; |
440 | | |
441 | | /* |
442 | | * Notes on the match/literal representation used here: |
443 | | * |
444 | | * The low bits of 'item' are the length: 1 if this is a literal, |
445 | | * or the match length if this is a match. |
446 | | * |
447 | | * The high bits of 'item' are the actual literal byte if this is a |
448 | | * literal, or the match offset if this is a match. |
449 | | */ |
450 | 0 | #define OPTIMUM_OFFSET_SHIFT 9 |
451 | 0 | #define OPTIMUM_LEN_MASK (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1) |
452 | | u32 item; |
453 | | |
454 | | }; |
455 | | |
456 | | #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ |
457 | | |
458 | | /* Block split statistics. See "Block splitting algorithm" below. */ |
459 | 0 | #define NUM_LITERAL_OBSERVATION_TYPES 8 |
460 | 0 | #define NUM_MATCH_OBSERVATION_TYPES 2 |
461 | 0 | #define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + \ |
462 | 0 | NUM_MATCH_OBSERVATION_TYPES) |
463 | 0 | #define NUM_OBSERVATIONS_PER_BLOCK_CHECK 512 |
464 | | struct block_split_stats { |
465 | | u32 new_observations[NUM_OBSERVATION_TYPES]; |
466 | | u32 observations[NUM_OBSERVATION_TYPES]; |
467 | | u32 num_new_observations; |
468 | | u32 num_observations; |
469 | | }; |
470 | | |
471 | | struct deflate_output_bitstream; |
472 | | |
473 | | /* The main DEFLATE compressor structure */ |
474 | | struct libdeflate_compressor { |
475 | | |
476 | | /* Pointer to the compress() implementation chosen at allocation time */ |
477 | | void (*impl)(struct libdeflate_compressor *restrict c, const u8 *in, |
478 | | size_t in_nbytes, struct deflate_output_bitstream *os); |
479 | | |
480 | | /* The compression level with which this compressor was created */ |
481 | | unsigned compression_level; |
482 | | |
483 | | /* Anything of this size or less we won't bother trying to compress. */ |
484 | | size_t max_passthrough_size; |
485 | | |
486 | | /* |
487 | | * The maximum search depth: consider at most this many potential |
488 | | * matches at each position |
489 | | */ |
490 | | unsigned max_search_depth; |
491 | | |
492 | | /* |
493 | | * The "nice" match length: if a match of this length is found, choose |
494 | | * it immediately without further consideration |
495 | | */ |
496 | | unsigned nice_match_length; |
497 | | |
498 | | /* Frequency counters for the current block */ |
499 | | struct deflate_freqs freqs; |
500 | | |
501 | | /* Block split statistics for the current block */ |
502 | | struct block_split_stats split_stats; |
503 | | |
504 | | /* Dynamic Huffman codes for the current block */ |
505 | | struct deflate_codes codes; |
506 | | |
507 | | /* The static Huffman codes defined by the DEFLATE format */ |
508 | | struct deflate_codes static_codes; |
509 | | |
510 | | /* Temporary space for block flushing */ |
511 | | union { |
512 | | /* Information about the precode */ |
513 | | struct { |
514 | | u32 freqs[DEFLATE_NUM_PRECODE_SYMS]; |
515 | | u32 codewords[DEFLATE_NUM_PRECODE_SYMS]; |
516 | | u8 lens[DEFLATE_NUM_PRECODE_SYMS]; |
517 | | unsigned items[DEFLATE_NUM_LITLEN_SYMS + |
518 | | DEFLATE_NUM_OFFSET_SYMS]; |
519 | | unsigned num_litlen_syms; |
520 | | unsigned num_offset_syms; |
521 | | unsigned num_explicit_lens; |
522 | | unsigned num_items; |
523 | | } precode; |
524 | | /* |
525 | | * The "full" length codewords. Used only after the information |
526 | | * in 'precode' is no longer needed. |
527 | | */ |
528 | | struct { |
529 | | u32 codewords[DEFLATE_MAX_MATCH_LEN + 1]; |
530 | | u8 lens[DEFLATE_MAX_MATCH_LEN + 1]; |
531 | | } length; |
532 | | } o; |
533 | | |
534 | | union { |
535 | | /* Data for greedy or lazy parsing */ |
536 | | struct { |
537 | | /* Hash chains matchfinder */ |
538 | | struct hc_matchfinder hc_mf; |
539 | | |
540 | | /* Matches and literals chosen for the current block */ |
541 | | struct deflate_sequence sequences[SEQ_STORE_LENGTH + 1]; |
542 | | |
543 | | } g; /* (g)reedy */ |
544 | | |
545 | | /* Data for fastest parsing */ |
546 | | struct { |
547 | | /* Hash table matchfinder */ |
548 | | struct ht_matchfinder ht_mf; |
549 | | |
550 | | /* Matches and literals chosen for the current block */ |
551 | | struct deflate_sequence sequences[ |
552 | | FAST_SEQ_STORE_LENGTH + 1]; |
553 | | |
554 | | } f; /* (f)astest */ |
555 | | |
556 | | #if SUPPORT_NEAR_OPTIMAL_PARSING |
557 | | /* Data for near-optimal parsing */ |
558 | | struct { |
559 | | |
560 | | /* Binary tree matchfinder */ |
561 | | struct bt_matchfinder bt_mf; |
562 | | |
563 | | /* |
564 | | * Cached matches for the current block. This array |
565 | | * contains the matches that were found at each position |
566 | | * in the block. Specifically, for each position, there |
567 | | * is a list of matches found at that position, if any, |
568 | | * sorted by strictly increasing length. In addition, |
569 | | * following the matches for each position, there is a |
570 | | * special 'struct lz_match' whose 'length' member |
571 | | * contains the number of matches found at that |
572 | | * position, and whose 'offset' member contains the |
573 | | * literal at that position. |
574 | | * |
575 | | * Note: in rare cases, there will be a very high number |
576 | | * of matches in the block and this array will overflow. |
577 | | * If this happens, we force the end of the current |
578 | | * block. MATCH_CACHE_LENGTH is the length at which we |
579 | | * actually check for overflow. The extra slots beyond |
580 | | * this are enough to absorb the worst case overflow, |
581 | | * which occurs if starting at |
582 | | * &match_cache[MATCH_CACHE_LENGTH - 1], we write |
583 | | * MAX_MATCHES_PER_POS matches and a match count header, |
584 | | * then skip searching for matches at |
585 | | * 'DEFLATE_MAX_MATCH_LEN - 1' positions and write the |
586 | | * match count header for each. |
587 | | */ |
588 | | struct lz_match match_cache[MATCH_CACHE_LENGTH + |
589 | | MAX_MATCHES_PER_POS + |
590 | | DEFLATE_MAX_MATCH_LEN - 1]; |
591 | | |
592 | | /* |
593 | | * Array of nodes, one per position, for running the |
594 | | * minimum-cost path algorithm. |
595 | | * |
596 | | * This array must be large enough to accommodate the |
597 | | * worst-case number of nodes, which is MAX_BLOCK_LENGTH |
598 | | * plus 1 for the end-of-block node. |
599 | | */ |
600 | | struct deflate_optimum_node optimum_nodes[ |
601 | | MAX_BLOCK_LENGTH + 1]; |
602 | | |
603 | | /* The current cost model being used */ |
604 | | struct deflate_costs costs; |
605 | | |
606 | | struct deflate_costs costs_producing_best_true_cost; |
607 | | |
608 | | /* |
609 | | * A table that maps match offset to offset slot. This |
610 | | * differs from deflate_offset_slot[] in that this is a |
611 | | * full map, not a condensed one. The full map is more |
612 | | * appropriate for the near-optimal parser, since the |
613 | | * near-optimal parser does more offset => offset_slot |
614 | | * translations, it doesn't intersperse them with |
615 | | * matchfinding (so cache evictions are less of a |
616 | | * concern), and it uses more memory anyway. |
617 | | */ |
618 | | u8 offset_slot_full[DEFLATE_MAX_MATCH_OFFSET + 1]; |
619 | | |
620 | | /* Literal/match statistics saved from previous block */ |
621 | | u32 prev_observations[NUM_OBSERVATION_TYPES]; |
622 | | u32 prev_num_observations; |
623 | | |
624 | | /* |
625 | | * Approximate match length frequencies based on a |
626 | | * greedy parse, gathered during matchfinding. This is |
627 | | * used for setting the initial symbol costs. |
628 | | */ |
629 | | u32 new_match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1]; |
630 | | u32 match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1]; |
631 | | |
632 | | /* |
633 | | * The maximum number of optimization passes |
634 | | * (min-cost path searches) per block. |
635 | | * Larger values = more compression. |
636 | | */ |
637 | | unsigned max_optim_passes; |
638 | | |
639 | | /* |
640 | | * If an optimization pass improves the cost by fewer |
641 | | * than this number of bits, then optimization will stop |
642 | | * early, before max_optim_passes has been reached. |
643 | | * Smaller values = more compression. |
644 | | */ |
645 | | unsigned min_improvement_to_continue; |
646 | | |
647 | | /* |
648 | | * The minimum number of bits that would need to be |
649 | | * saved for it to be considered worth the time to |
650 | | * regenerate and use the min-cost path from a previous |
651 | | * optimization pass, in the case where the final |
652 | | * optimization pass actually increased the cost. |
653 | | * Smaller values = more compression. |
654 | | */ |
655 | | unsigned min_bits_to_use_nonfinal_path; |
656 | | |
657 | | } n; /* (n)ear-optimal */ |
658 | | #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ |
659 | | |
660 | | } p; /* (p)arser */ |
661 | | }; |
662 | | |
663 | | /* |
664 | | * The type for the bitbuffer variable, which temporarily holds bits that are |
665 | | * being packed into bytes and written to the output buffer. For best |
666 | | * performance, this should have size equal to a machine word. |
667 | | */ |
668 | | typedef machine_word_t bitbuf_t; |
669 | | |
670 | | /* |
671 | | * The capacity of the bitbuffer, in bits. This is 1 less than the real size, |
672 | | * in order to avoid undefined behavior when doing bitbuf >>= bitcount & ~7. |
673 | | */ |
674 | 0 | #define BITBUF_NBITS (8 * sizeof(bitbuf_t) - 1) |
675 | | |
676 | | /* |
677 | | * Can the specified number of bits always be added to 'bitbuf' after any |
678 | | * pending bytes have been flushed? There can be up to 7 bits remaining after a |
679 | | * flush, so the count must not exceed BITBUF_NBITS after adding 'n' more bits. |
680 | | */ |
681 | 0 | #define CAN_BUFFER(n) (7 + (n) <= BITBUF_NBITS) |
682 | | |
683 | | /* |
684 | | * Structure to keep track of the current state of sending bits to the |
685 | | * compressed output buffer |
686 | | */ |
687 | | struct deflate_output_bitstream { |
688 | | |
689 | | /* Bits that haven't yet been written to the output buffer */ |
690 | | bitbuf_t bitbuf; |
691 | | |
692 | | /* |
693 | | * Number of bits currently held in @bitbuf. This can be between 0 and |
694 | | * BITBUF_NBITS in general, or between 0 and 7 after a flush. |
695 | | */ |
696 | | unsigned bitcount; |
697 | | |
698 | | /* |
699 | | * Pointer to the position in the output buffer at which the next byte |
700 | | * should be written |
701 | | */ |
702 | | u8 *next; |
703 | | |
704 | | /* |
705 | | * Pointer to near the end of the output buffer. 'next' will never |
706 | | * exceed this. There are OUTPUT_END_PADDING bytes reserved after this |
707 | | * to allow branchlessly writing a whole word at this location. |
708 | | */ |
709 | | u8 *end; |
710 | | }; |
711 | | |
712 | | /* |
713 | | * OUTPUT_END_PADDING is the size, in bytes, of the extra space that must be |
714 | | * present following os->end, in order to not overrun the buffer when generating |
715 | | * output. When UNALIGNED_ACCESS_IS_FAST, we need at least sizeof(bitbuf_t) |
716 | | * bytes for put_unaligned_leword(). Otherwise we need only 1 byte. However, |
717 | | * to make the compression algorithm produce the same result on all CPU |
718 | | * architectures (which is sometimes desirable), we have to unconditionally use |
719 | | * the maximum for any CPU, which is sizeof(bitbuf_t) == 8. |
720 | | */ |
721 | 28.9k | #define OUTPUT_END_PADDING 8 |
722 | | |
723 | | /* |
724 | | * Add some bits to the bitbuffer variable of the output bitstream. The caller |
725 | | * must ensure that 'bitcount + n <= BITBUF_NBITS', by calling FLUSH_BITS() |
726 | | * frequently enough. |
727 | | */ |
728 | 0 | #define ADD_BITS(bits, n) \ |
729 | 0 | do { \ |
730 | 0 | bitbuf |= (bitbuf_t)(bits) << bitcount; \ |
731 | 0 | bitcount += (n); \ |
732 | 0 | ASSERT(bitcount <= BITBUF_NBITS); \ |
733 | 0 | } while (0) |
734 | | |
735 | | /* Flush bits from the bitbuffer variable to the output buffer. */ |
736 | 0 | #define FLUSH_BITS() \ |
737 | 0 | do { \ |
738 | 0 | if (UNALIGNED_ACCESS_IS_FAST) { \ |
739 | 0 | /* Flush a whole word (branchlessly). */ \ |
740 | 0 | put_unaligned_leword(bitbuf, out_next); \ |
741 | 0 | bitbuf >>= bitcount & ~7; \ |
742 | 0 | out_next += MIN(out_end - out_next, bitcount >> 3); \ |
743 | 0 | bitcount &= 7; \ |
744 | 0 | } else { \ |
745 | 0 | /* Flush a byte at a time. */ \ |
746 | 0 | while (bitcount >= 8) { \ |
747 | 0 | *out_next = bitbuf; \ |
748 | 0 | if (out_next != out_end) \ |
749 | 0 | out_next++; \ |
750 | 0 | bitcount -= 8; \ |
751 | 0 | bitbuf >>= 8; \ |
752 | 0 | } \ |
753 | 0 | } \ |
754 | 0 | } while (0) |
755 | | |
756 | | /* |
757 | | * Given the binary tree node A[subtree_idx] whose children already satisfy the |
758 | | * maxheap property, swap the node with its greater child until it is greater |
759 | | * than or equal to both of its children, so that the maxheap property is |
760 | | * satisfied in the subtree rooted at A[subtree_idx]. 'A' uses 1-based indices. |
761 | | */ |
762 | | static void |
763 | | heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx) |
764 | 0 | { |
765 | 0 | unsigned parent_idx; |
766 | 0 | unsigned child_idx; |
767 | 0 | u32 v; |
768 | |
|
769 | 0 | v = A[subtree_idx]; |
770 | 0 | parent_idx = subtree_idx; |
771 | 0 | while ((child_idx = parent_idx * 2) <= length) { |
772 | 0 | if (child_idx < length && A[child_idx + 1] > A[child_idx]) |
773 | 0 | child_idx++; |
774 | 0 | if (v >= A[child_idx]) |
775 | 0 | break; |
776 | 0 | A[parent_idx] = A[child_idx]; |
777 | 0 | parent_idx = child_idx; |
778 | 0 | } |
779 | 0 | A[parent_idx] = v; |
780 | 0 | } |
781 | | |
782 | | /* |
783 | | * Rearrange the array 'A' so that it satisfies the maxheap property. |
784 | | * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1]. |
785 | | */ |
786 | | static void |
787 | | heapify_array(u32 A[], unsigned length) |
788 | 0 | { |
789 | 0 | unsigned subtree_idx; |
790 | |
|
791 | 0 | for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--) |
792 | 0 | heapify_subtree(A, length, subtree_idx); |
793 | 0 | } |
794 | | |
795 | | /* |
796 | | * Sort the array 'A', which contains 'length' unsigned 32-bit integers. |
797 | | * |
798 | | * Note: name this function heap_sort() instead of heapsort() to avoid colliding |
799 | | * with heapsort() from stdlib.h on BSD-derived systems. |
800 | | */ |
801 | | static void |
802 | | heap_sort(u32 A[], unsigned length) |
803 | 0 | { |
804 | 0 | A--; /* Use 1-based indices */ |
805 | |
|
806 | 0 | heapify_array(A, length); |
807 | |
|
808 | 0 | while (length >= 2) { |
809 | 0 | u32 tmp = A[length]; |
810 | |
|
811 | 0 | A[length] = A[1]; |
812 | 0 | A[1] = tmp; |
813 | 0 | length--; |
814 | 0 | heapify_subtree(A, length, 1); |
815 | 0 | } |
816 | 0 | } |
817 | | |
818 | 0 | #define NUM_SYMBOL_BITS 10 |
819 | | #define NUM_FREQ_BITS (32 - NUM_SYMBOL_BITS) |
820 | 0 | #define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1) |
821 | 0 | #define FREQ_MASK (~SYMBOL_MASK) |
822 | | |
823 | 0 | #define GET_NUM_COUNTERS(num_syms) (num_syms) |
824 | | |
825 | | /* |
826 | | * Sort the symbols primarily by frequency and secondarily by symbol value. |
827 | | * Discard symbols with zero frequency and fill in an array with the remaining |
828 | | * symbols, along with their frequencies. The low NUM_SYMBOL_BITS bits of each |
829 | | * array entry will contain the symbol value, and the remaining bits will |
830 | | * contain the frequency. |
831 | | * |
832 | | * @num_syms |
833 | | * Number of symbols in the alphabet, at most 1 << NUM_SYMBOL_BITS. |
834 | | * |
835 | | * @freqs[num_syms] |
836 | | * Frequency of each symbol, summing to at most (1 << NUM_FREQ_BITS) - 1. |
837 | | * |
838 | | * @lens[num_syms] |
839 | | * An array that eventually will hold the length of each codeword. This |
840 | | * function only fills in the codeword lengths for symbols that have zero |
841 | | * frequency, which are not well defined per se but will be set to 0. |
842 | | * |
843 | | * @symout[num_syms] |
844 | | * The output array, described above. |
845 | | * |
846 | | * Returns the number of entries in 'symout' that were filled. This is the |
847 | | * number of symbols that have nonzero frequency. |
848 | | */ |
849 | | static unsigned |
850 | | sort_symbols(unsigned num_syms, const u32 freqs[], u8 lens[], u32 symout[]) |
851 | 0 | { |
852 | 0 | unsigned sym; |
853 | 0 | unsigned i; |
854 | 0 | unsigned num_used_syms; |
855 | 0 | unsigned num_counters; |
856 | 0 | unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)]; |
857 | | |
858 | | /* |
859 | | * We use heapsort, but with an added optimization. Since often most |
860 | | * symbol frequencies are low, we first do a count sort using a limited |
861 | | * number of counters. High frequencies are counted in the last |
862 | | * counter, and only they will be sorted with heapsort. |
863 | | * |
864 | | * Note: with more symbols, it is generally beneficial to have more |
865 | | * counters. About 1 counter per symbol seems fastest. |
866 | | */ |
867 | |
|
868 | 0 | num_counters = GET_NUM_COUNTERS(num_syms); |
869 | |
|
870 | 0 | memset(counters, 0, num_counters * sizeof(counters[0])); |
871 | | |
872 | | /* Count the frequencies. */ |
873 | 0 | for (sym = 0; sym < num_syms; sym++) |
874 | 0 | counters[MIN(freqs[sym], num_counters - 1)]++; |
875 | | |
876 | | /* |
877 | | * Make the counters cumulative, ignoring the zero-th, which counted |
878 | | * symbols with zero frequency. As a side effect, this calculates the |
879 | | * number of symbols with nonzero frequency. |
880 | | */ |
881 | 0 | num_used_syms = 0; |
882 | 0 | for (i = 1; i < num_counters; i++) { |
883 | 0 | unsigned count = counters[i]; |
884 | |
|
885 | 0 | counters[i] = num_used_syms; |
886 | 0 | num_used_syms += count; |
887 | 0 | } |
888 | | |
889 | | /* |
890 | | * Sort nonzero-frequency symbols using the counters. At the same time, |
891 | | * set the codeword lengths of zero-frequency symbols to 0. |
892 | | */ |
893 | 0 | for (sym = 0; sym < num_syms; sym++) { |
894 | 0 | u32 freq = freqs[sym]; |
895 | |
|
896 | 0 | if (freq != 0) { |
897 | 0 | symout[counters[MIN(freq, num_counters - 1)]++] = |
898 | 0 | sym | (freq << NUM_SYMBOL_BITS); |
899 | 0 | } else { |
900 | 0 | lens[sym] = 0; |
901 | 0 | } |
902 | 0 | } |
903 | | |
904 | | /* Sort the symbols counted in the last counter. */ |
905 | 0 | heap_sort(symout + counters[num_counters - 2], |
906 | 0 | counters[num_counters - 1] - counters[num_counters - 2]); |
907 | |
|
908 | 0 | return num_used_syms; |
909 | 0 | } |
910 | | |
911 | | /* |
912 | | * Build a Huffman tree. |
913 | | * |
914 | | * This is an optimized implementation that |
915 | | * (a) takes advantage of the frequencies being already sorted; |
916 | | * (b) only generates non-leaf nodes, since the non-leaf nodes of a Huffman |
917 | | * tree are sufficient to generate a canonical code; |
918 | | * (c) Only stores parent pointers, not child pointers; |
919 | | * (d) Produces the nodes in the same memory used for input frequency |
920 | | * information. |
921 | | * |
922 | | * Array 'A', which contains 'sym_count' entries, is used for both input and |
923 | | * output. For this function, 'sym_count' must be at least 2. |
924 | | * |
925 | | * For input, the array must contain the frequencies of the symbols, sorted in |
926 | | * increasing order. Specifically, each entry must contain a frequency left |
927 | | * shifted by NUM_SYMBOL_BITS bits. Any data in the low NUM_SYMBOL_BITS bits of |
928 | | * the entries will be ignored by this function. Although these bits will, in |
929 | | * fact, contain the symbols that correspond to the frequencies, this function |
930 | | * is concerned with frequencies only and keeps the symbols as-is. |
931 | | * |
932 | | * For output, this function will produce the non-leaf nodes of the Huffman |
933 | | * tree. These nodes will be stored in the first (sym_count - 1) entries of the |
934 | | * array. Entry A[sym_count - 2] will represent the root node. Each other node |
935 | | * will contain the zero-based index of its parent node in 'A', left shifted by |
936 | | * NUM_SYMBOL_BITS bits. The low NUM_SYMBOL_BITS bits of each entry in A will |
937 | | * be kept as-is. Again, note that although these low bits will, in fact, |
938 | | * contain a symbol value, this symbol will have *no relationship* with the |
939 | | * Huffman tree node that happens to occupy the same slot. This is because this |
940 | | * implementation only generates the non-leaf nodes of the tree. |
941 | | */ |
942 | | static void |
943 | | build_tree(u32 A[], unsigned sym_count) |
944 | 0 | { |
945 | 0 | const unsigned last_idx = sym_count - 1; |
946 | | |
947 | | /* Index of the next lowest frequency leaf that still needs a parent */ |
948 | 0 | unsigned i = 0; |
949 | | |
950 | | /* |
951 | | * Index of the next lowest frequency non-leaf that still needs a |
952 | | * parent, or 'e' if there is currently no such node |
953 | | */ |
954 | 0 | unsigned b = 0; |
955 | | |
956 | | /* Index of the next spot for a non-leaf (will overwrite a leaf) */ |
957 | 0 | unsigned e = 0; |
958 | |
|
959 | 0 | do { |
960 | 0 | u32 new_freq; |
961 | | |
962 | | /* |
963 | | * Select the next two lowest frequency nodes among the leaves |
964 | | * A[i] and non-leaves A[b], and create a new node A[e] to be |
965 | | * their parent. Set the new node's frequency to the sum of the |
966 | | * frequencies of its two children. |
967 | | * |
968 | | * Usually the next two lowest frequency nodes are of the same |
969 | | * type (leaf or non-leaf), so check those cases first. |
970 | | */ |
971 | 0 | if (i + 1 <= last_idx && |
972 | 0 | (b == e || (A[i + 1] & FREQ_MASK) <= (A[b] & FREQ_MASK))) { |
973 | | /* Two leaves */ |
974 | 0 | new_freq = (A[i] & FREQ_MASK) + (A[i + 1] & FREQ_MASK); |
975 | 0 | i += 2; |
976 | 0 | } else if (b + 2 <= e && |
977 | 0 | (i > last_idx || |
978 | 0 | (A[b + 1] & FREQ_MASK) < (A[i] & FREQ_MASK))) { |
979 | | /* Two non-leaves */ |
980 | 0 | new_freq = (A[b] & FREQ_MASK) + (A[b + 1] & FREQ_MASK); |
981 | 0 | A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK); |
982 | 0 | A[b + 1] = (e << NUM_SYMBOL_BITS) | |
983 | 0 | (A[b + 1] & SYMBOL_MASK); |
984 | 0 | b += 2; |
985 | 0 | } else { |
986 | | /* One leaf and one non-leaf */ |
987 | 0 | new_freq = (A[i] & FREQ_MASK) + (A[b] & FREQ_MASK); |
988 | 0 | A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK); |
989 | 0 | i++; |
990 | 0 | b++; |
991 | 0 | } |
992 | 0 | A[e] = new_freq | (A[e] & SYMBOL_MASK); |
993 | | /* |
994 | | * A binary tree with 'n' leaves has 'n - 1' non-leaves, so the |
995 | | * tree is complete once we've created 'n - 1' non-leaves. |
996 | | */ |
997 | 0 | } while (++e < last_idx); |
998 | 0 | } |
999 | | |
1000 | | /* |
1001 | | * Given the stripped-down Huffman tree constructed by build_tree(), determine |
1002 | | * the number of codewords that should be assigned each possible length, taking |
1003 | | * into account the length-limited constraint. |
1004 | | * |
1005 | | * @A |
1006 | | * The array produced by build_tree(), containing parent index information |
1007 | | * for the non-leaf nodes of the Huffman tree. Each entry in this array is |
1008 | | * a node; a node's parent always has a greater index than that node |
1009 | | * itself. This function will overwrite the parent index information in |
1010 | | * this array, so essentially it will destroy the tree. However, the data |
1011 | | * in the low NUM_SYMBOL_BITS of each entry will be preserved. |
1012 | | * |
1013 | | * @root_idx |
1014 | | * The 0-based index of the root node in 'A', and consequently one less |
1015 | | * than the number of tree node entries in 'A'. (Or, really 2 less than |
1016 | | * the actual length of 'A'.) |
1017 | | * |
1018 | | * @len_counts |
1019 | | * An array of length ('max_codeword_len' + 1) in which the number of |
1020 | | * codewords having each length <= max_codeword_len will be returned. |
1021 | | * |
1022 | | * @max_codeword_len |
1023 | | * The maximum permissible codeword length. |
1024 | | */ |
1025 | | static void |
1026 | | compute_length_counts(u32 A[], unsigned root_idx, unsigned len_counts[], |
1027 | | unsigned max_codeword_len) |
1028 | 0 | { |
1029 | 0 | unsigned len; |
1030 | 0 | int node; |
1031 | | |
1032 | | /* |
1033 | | * The key observations are: |
1034 | | * |
1035 | | * (1) We can traverse the non-leaf nodes of the tree, always visiting a |
1036 | | * parent before its children, by simply iterating through the array |
1037 | | * in reverse order. Consequently, we can compute the depth of each |
1038 | | * node in one pass, overwriting the parent indices with depths. |
1039 | | * |
1040 | | * (2) We can initially assume that in the real Huffman tree, both |
1041 | | * children of the root are leaves. This corresponds to two |
1042 | | * codewords of length 1. Then, whenever we visit a (non-leaf) node |
1043 | | * during the traversal, we modify this assumption to account for |
1044 | | * the current node *not* being a leaf, but rather its two children |
1045 | | * being leaves. This causes the loss of one codeword for the |
1046 | | * current depth and the addition of two codewords for the current |
1047 | | * depth plus one. |
1048 | | * |
1049 | | * (3) We can handle the length-limited constraint fairly easily by |
1050 | | * simply using the largest length available when a depth exceeds |
1051 | | * max_codeword_len. |
1052 | | */ |
1053 | |
|
1054 | 0 | for (len = 0; len <= max_codeword_len; len++) |
1055 | 0 | len_counts[len] = 0; |
1056 | 0 | len_counts[1] = 2; |
1057 | | |
1058 | | /* Set the root node's depth to 0. */ |
1059 | 0 | A[root_idx] &= SYMBOL_MASK; |
1060 | |
|
1061 | 0 | for (node = root_idx - 1; node >= 0; node--) { |
1062 | | |
1063 | | /* Calculate the depth of this node. */ |
1064 | |
|
1065 | 0 | unsigned parent = A[node] >> NUM_SYMBOL_BITS; |
1066 | 0 | unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS; |
1067 | 0 | unsigned depth = parent_depth + 1; |
1068 | | |
1069 | | /* |
1070 | | * Set the depth of this node so that it is available when its |
1071 | | * children (if any) are processed. |
1072 | | */ |
1073 | 0 | A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS); |
1074 | | |
1075 | | /* |
1076 | | * If needed, decrease the length to meet the length-limited |
1077 | | * constraint. This is not the optimal method for generating |
1078 | | * length-limited Huffman codes! But it should be good enough. |
1079 | | */ |
1080 | 0 | if (depth >= max_codeword_len) { |
1081 | 0 | depth = max_codeword_len; |
1082 | 0 | do { |
1083 | 0 | depth--; |
1084 | 0 | } while (len_counts[depth] == 0); |
1085 | 0 | } |
1086 | | |
1087 | | /* |
1088 | | * Account for the fact that we have a non-leaf node at the |
1089 | | * current depth. |
1090 | | */ |
1091 | 0 | len_counts[depth]--; |
1092 | 0 | len_counts[depth + 1] += 2; |
1093 | 0 | } |
1094 | 0 | } |
1095 | | |
1096 | | /* |
1097 | | * DEFLATE uses bit-reversed codewords, so we must bit-reverse the codewords |
1098 | | * after generating them. All codewords have length <= 16 bits. If the CPU has |
1099 | | * a bit-reversal instruction, then that is the fastest method. Otherwise the |
1100 | | * fastest method is to reverse the bits in each of the two bytes using a table. |
1101 | | * The table method is slightly faster than using bitwise operations to flip |
1102 | | * adjacent 1, 2, 4, and then 8-bit fields, even if 2 to 4 codewords are packed |
1103 | | * into a machine word and processed together using that method. |
1104 | | */ |
1105 | | |
1106 | | #ifdef rbit32 |
1107 | | static forceinline u32 reverse_codeword(u32 codeword, u8 len) |
1108 | | { |
1109 | | return rbit32(codeword) >> ((32 - len) & 31); |
1110 | | } |
1111 | | #else |
1112 | | /* Generated by scripts/gen_bitreverse_tab.py */ |
1113 | | static const u8 bitreverse_tab[256] = { |
1114 | | 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, |
1115 | | 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0, |
1116 | | 0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8, |
1117 | | 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8, |
1118 | | 0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4, |
1119 | | 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4, |
1120 | | 0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec, |
1121 | | 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc, |
1122 | | 0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2, |
1123 | | 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2, |
1124 | | 0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea, |
1125 | | 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa, |
1126 | | 0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6, |
1127 | | 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6, |
1128 | | 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee, |
1129 | | 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe, |
1130 | | 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1, |
1131 | | 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1, |
1132 | | 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9, |
1133 | | 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9, |
1134 | | 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5, |
1135 | | 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5, |
1136 | | 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed, |
1137 | | 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd, |
1138 | | 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3, |
1139 | | 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3, |
1140 | | 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb, |
1141 | | 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb, |
1142 | | 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7, |
1143 | | 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7, |
1144 | | 0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef, |
1145 | | 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff, |
1146 | | }; |
1147 | | |
1148 | | static forceinline u32 reverse_codeword(u32 codeword, u8 len) |
1149 | 0 | { |
1150 | 0 | STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16); |
1151 | 0 | codeword = ((u32)bitreverse_tab[codeword & 0xff] << 8) | |
1152 | 0 | bitreverse_tab[codeword >> 8]; |
1153 | 0 | return codeword >> (16 - len); |
1154 | 0 | } |
1155 | | #endif /* !rbit32 */ |
1156 | | |
1157 | | /* |
1158 | | * Generate the codewords for a canonical Huffman code. |
1159 | | * |
1160 | | * @A |
1161 | | * The output array for codewords. In addition, initially this |
1162 | | * array must contain the symbols, sorted primarily by frequency and |
1163 | | * secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of |
1164 | | * each entry. |
1165 | | * |
1166 | | * @len |
1167 | | * Output array for codeword lengths. |
1168 | | * |
1169 | | * @len_counts |
1170 | | * An array that provides the number of codewords that will have |
1171 | | * each possible length <= max_codeword_len. |
1172 | | * |
1173 | | * @max_codeword_len |
1174 | | * Maximum length, in bits, of each codeword. |
1175 | | * |
1176 | | * @num_syms |
1177 | | * Number of symbols in the alphabet, including symbols with zero |
1178 | | * frequency. This is the length of the 'A' and 'len' arrays. |
1179 | | */ |
1180 | | static void |
1181 | | gen_codewords(u32 A[], u8 lens[], const unsigned len_counts[], |
1182 | | unsigned max_codeword_len, unsigned num_syms) |
1183 | 0 | { |
1184 | 0 | u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1]; |
1185 | 0 | unsigned i; |
1186 | 0 | unsigned len; |
1187 | 0 | unsigned sym; |
1188 | | |
1189 | | /* |
1190 | | * Given the number of codewords that will have each length, assign |
1191 | | * codeword lengths to symbols. We do this by assigning the lengths in |
1192 | | * decreasing order to the symbols sorted primarily by increasing |
1193 | | * frequency and secondarily by increasing symbol value. |
1194 | | */ |
1195 | 0 | for (i = 0, len = max_codeword_len; len >= 1; len--) { |
1196 | 0 | unsigned count = len_counts[len]; |
1197 | |
|
1198 | 0 | while (count--) |
1199 | 0 | lens[A[i++] & SYMBOL_MASK] = len; |
1200 | 0 | } |
1201 | | |
1202 | | /* |
1203 | | * Generate the codewords themselves. We initialize the |
1204 | | * 'next_codewords' array to provide the lexicographically first |
1205 | | * codeword of each length, then assign codewords in symbol order. This |
1206 | | * produces a canonical code. |
1207 | | */ |
1208 | 0 | next_codewords[0] = 0; |
1209 | 0 | next_codewords[1] = 0; |
1210 | 0 | for (len = 2; len <= max_codeword_len; len++) |
1211 | 0 | next_codewords[len] = |
1212 | 0 | (next_codewords[len - 1] + len_counts[len - 1]) << 1; |
1213 | |
|
1214 | 0 | for (sym = 0; sym < num_syms; sym++) { |
1215 | | /* DEFLATE requires bit-reversed codewords. */ |
1216 | 0 | A[sym] = reverse_codeword(next_codewords[lens[sym]]++, |
1217 | 0 | lens[sym]); |
1218 | 0 | } |
1219 | 0 | } |
1220 | | |
1221 | | /* |
1222 | | * --------------------------------------------------------------------- |
1223 | | * deflate_make_huffman_code() |
1224 | | * --------------------------------------------------------------------- |
1225 | | * |
1226 | | * Given an alphabet and the frequency of each symbol in it, construct a |
1227 | | * length-limited canonical Huffman code. |
1228 | | * |
1229 | | * @num_syms |
1230 | | * The number of symbols in the alphabet. The symbols are the integers in |
1231 | | * the range [0, num_syms - 1]. This parameter must be at least 2 and |
1232 | | * must not exceed (1 << NUM_SYMBOL_BITS). |
1233 | | * |
1234 | | * @max_codeword_len |
1235 | | * The maximum permissible codeword length. |
1236 | | * |
1237 | | * @freqs |
1238 | | * An array of length @num_syms that gives the frequency of each symbol. |
1239 | | * It is valid for some, none, or all of the frequencies to be 0. The sum |
1240 | | * of frequencies must not exceed (1 << NUM_FREQ_BITS) - 1. |
1241 | | * |
1242 | | * @lens |
1243 | | * An array of @num_syms entries in which this function will return the |
1244 | | * length, in bits, of the codeword assigned to each symbol. Symbols with |
1245 | | * 0 frequency will not have codewords per se, but their entries in this |
1246 | | * array will be set to 0. No lengths greater than @max_codeword_len will |
1247 | | * be assigned. |
1248 | | * |
1249 | | * @codewords |
1250 | | * An array of @num_syms entries in which this function will return the |
1251 | | * codeword for each symbol, right-justified and padded on the left with |
1252 | | * zeroes. Codewords for symbols with 0 frequency will be undefined. |
1253 | | * |
1254 | | * --------------------------------------------------------------------- |
1255 | | * |
1256 | | * This function builds a length-limited canonical Huffman code. |
1257 | | * |
1258 | | * A length-limited Huffman code contains no codewords longer than some |
1259 | | * specified length, and has exactly (with some algorithms) or approximately |
1260 | | * (with the algorithm used here) the minimum weighted path length from the |
1261 | | * root, given this constraint. |
1262 | | * |
1263 | | * A canonical Huffman code satisfies the properties that a longer codeword |
1264 | | * never lexicographically precedes a shorter codeword, and the lexicographic |
1265 | | * ordering of codewords of the same length is the same as the lexicographic |
1266 | | * ordering of the corresponding symbols. A canonical Huffman code, or more |
1267 | | * generally a canonical prefix code, can be reconstructed from only a list |
1268 | | * containing the codeword length of each symbol. |
1269 | | * |
1270 | | * The classic algorithm to generate a Huffman code creates a node for each |
1271 | | * symbol, then inserts these nodes into a min-heap keyed by symbol frequency. |
1272 | | * Then, repeatedly, the two lowest-frequency nodes are removed from the |
1273 | | * min-heap and added as the children of a new node having frequency equal to |
1274 | | * the sum of its two children, which is then inserted into the min-heap. When |
1275 | | * only a single node remains in the min-heap, it is the root of the Huffman |
1276 | | * tree. The codeword for each symbol is determined by the path needed to reach |
1277 | | * the corresponding node from the root. Descending to the left child appends a |
1278 | | * 0 bit, whereas descending to the right child appends a 1 bit. |
1279 | | * |
1280 | | * The classic algorithm is relatively easy to understand, but it is subject to |
1281 | | * a number of inefficiencies. In practice, it is fastest to first sort the |
1282 | | * symbols by frequency. (This itself can be subject to an optimization based |
1283 | | * on the fact that most frequencies tend to be low.) At the same time, we sort |
1284 | | * secondarily by symbol value, which aids the process of generating a canonical |
1285 | | * code. Then, during tree construction, no heap is necessary because both the |
1286 | | * leaf nodes and the unparented non-leaf nodes can be easily maintained in |
1287 | | * sorted order. Consequently, there can never be more than two possibilities |
1288 | | * for the next-lowest-frequency node. |
1289 | | * |
1290 | | * In addition, because we're generating a canonical code, we actually don't |
1291 | | * need the leaf nodes of the tree at all, only the non-leaf nodes. This is |
1292 | | * because for canonical code generation we don't need to know where the symbols |
1293 | | * are in the tree. Rather, we only need to know how many leaf nodes have each |
1294 | | * depth (codeword length). And this information can, in fact, be quickly |
1295 | | * generated from the tree of non-leaves only. |
1296 | | * |
1297 | | * Furthermore, we can build this stripped-down Huffman tree directly in the |
1298 | | * array in which the codewords are to be generated, provided that these array |
1299 | | * slots are large enough to hold a symbol and frequency value. |
1300 | | * |
1301 | | * Still furthermore, we don't even need to maintain explicit child pointers. |
1302 | | * We only need the parent pointers, and even those can be overwritten in-place |
1303 | | * with depth information as part of the process of extracting codeword lengths |
1304 | | * from the tree. So in summary, we do NOT need a big structure like: |
1305 | | * |
1306 | | * struct huffman_tree_node { |
1307 | | * unsigned int symbol; |
1308 | | * unsigned int frequency; |
1309 | | * unsigned int depth; |
1310 | | * struct huffman_tree_node *left_child; |
1311 | | * struct huffman_tree_node *right_child; |
1312 | | * }; |
1313 | | * |
1314 | | * |
1315 | | * ... which often gets used in "naive" implementations of Huffman code |
1316 | | * generation. |
1317 | | * |
1318 | | * Many of these optimizations are based on the implementation in 7-Zip (source |
1319 | | * file: C/HuffEnc.c), which was placed in the public domain by Igor Pavlov. |
1320 | | */ |
1321 | | static void |
1322 | | deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len, |
1323 | | const u32 freqs[], u8 lens[], u32 codewords[]) |
1324 | 0 | { |
1325 | 0 | u32 *A = codewords; |
1326 | 0 | unsigned num_used_syms; |
1327 | |
|
1328 | 0 | STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS); |
1329 | 0 | STATIC_ASSERT(MAX_BLOCK_LENGTH <= ((u32)1 << NUM_FREQ_BITS) - 1); |
1330 | | |
1331 | | /* |
1332 | | * We begin by sorting the symbols primarily by frequency and |
1333 | | * secondarily by symbol value. As an optimization, the array used for |
1334 | | * this purpose ('A') shares storage with the space in which we will |
1335 | | * eventually return the codewords. |
1336 | | */ |
1337 | 0 | num_used_syms = sort_symbols(num_syms, freqs, lens, A); |
1338 | | |
1339 | | /* |
1340 | | * 'num_used_syms' is the number of symbols with nonzero frequency. |
1341 | | * This may be less than @num_syms. 'num_used_syms' is also the number |
1342 | | * of entries in 'A' that are valid. Each entry consists of a distinct |
1343 | | * symbol and a nonzero frequency packed into a 32-bit integer. |
1344 | | */ |
1345 | | |
1346 | | /* |
1347 | | * Handle special cases where only 0 or 1 symbols were used (had nonzero |
1348 | | * frequency). |
1349 | | */ |
1350 | |
|
1351 | 0 | if (unlikely(num_used_syms == 0)) { |
1352 | | /* |
1353 | | * Code is empty. sort_symbols() already set all lengths to 0, |
1354 | | * so there is nothing more to do. |
1355 | | */ |
1356 | 0 | return; |
1357 | 0 | } |
1358 | | |
1359 | 0 | if (unlikely(num_used_syms == 1)) { |
1360 | | /* |
1361 | | * Only one symbol was used, so we only need one codeword. But |
1362 | | * two codewords are needed to form the smallest complete |
1363 | | * Huffman code, which uses codewords 0 and 1. Therefore, we |
1364 | | * choose another symbol to which to assign a codeword. We use |
1365 | | * 0 (if the used symbol is not 0) or 1 (if the used symbol is |
1366 | | * 0). In either case, the lesser-valued symbol must be |
1367 | | * assigned codeword 0 so that the resulting code is canonical. |
1368 | | */ |
1369 | |
|
1370 | 0 | unsigned sym = A[0] & SYMBOL_MASK; |
1371 | 0 | unsigned nonzero_idx = sym ? sym : 1; |
1372 | |
|
1373 | 0 | codewords[0] = 0; |
1374 | 0 | lens[0] = 1; |
1375 | 0 | codewords[nonzero_idx] = 1; |
1376 | 0 | lens[nonzero_idx] = 1; |
1377 | 0 | return; |
1378 | 0 | } |
1379 | | |
1380 | | /* |
1381 | | * Build a stripped-down version of the Huffman tree, sharing the array |
1382 | | * 'A' with the symbol values. Then extract length counts from the tree |
1383 | | * and use them to generate the final codewords. |
1384 | | */ |
1385 | | |
1386 | 0 | build_tree(A, num_used_syms); |
1387 | |
|
1388 | 0 | { |
1389 | 0 | unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1]; |
1390 | |
|
1391 | 0 | compute_length_counts(A, num_used_syms - 2, |
1392 | 0 | len_counts, max_codeword_len); |
1393 | |
|
1394 | 0 | gen_codewords(A, lens, len_counts, max_codeword_len, num_syms); |
1395 | 0 | } |
1396 | 0 | } |
1397 | | |
1398 | | /* |
1399 | | * Clear the Huffman symbol frequency counters. This must be called when |
1400 | | * starting a new DEFLATE block. |
1401 | | */ |
1402 | | static void |
1403 | | deflate_reset_symbol_frequencies(struct libdeflate_compressor *c) |
1404 | 0 | { |
1405 | 0 | memset(&c->freqs, 0, sizeof(c->freqs)); |
1406 | 0 | } |
1407 | | |
1408 | | /* |
1409 | | * Build the literal/length and offset Huffman codes for a DEFLATE block. |
1410 | | * |
1411 | | * This takes as input the frequency tables for each alphabet and produces as |
1412 | | * output a set of tables that map symbols to codewords and codeword lengths. |
1413 | | */ |
1414 | | static void |
1415 | | deflate_make_huffman_codes(const struct deflate_freqs *freqs, |
1416 | | struct deflate_codes *codes) |
1417 | 0 | { |
1418 | 0 | deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS, |
1419 | 0 | MAX_LITLEN_CODEWORD_LEN, |
1420 | 0 | freqs->litlen, |
1421 | 0 | codes->lens.litlen, |
1422 | 0 | codes->codewords.litlen); |
1423 | |
|
1424 | 0 | deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS, |
1425 | 0 | MAX_OFFSET_CODEWORD_LEN, |
1426 | 0 | freqs->offset, |
1427 | 0 | codes->lens.offset, |
1428 | 0 | codes->codewords.offset); |
1429 | 0 | } |
1430 | | |
1431 | | /* Initialize c->static_codes. */ |
1432 | | static void |
1433 | | deflate_init_static_codes(struct libdeflate_compressor *c) |
1434 | 0 | { |
1435 | 0 | unsigned i; |
1436 | |
|
1437 | 0 | for (i = 0; i < 144; i++) |
1438 | 0 | c->freqs.litlen[i] = 1 << (9 - 8); |
1439 | 0 | for (; i < 256; i++) |
1440 | 0 | c->freqs.litlen[i] = 1 << (9 - 9); |
1441 | 0 | for (; i < 280; i++) |
1442 | 0 | c->freqs.litlen[i] = 1 << (9 - 7); |
1443 | 0 | for (; i < 288; i++) |
1444 | 0 | c->freqs.litlen[i] = 1 << (9 - 8); |
1445 | |
|
1446 | 0 | for (i = 0; i < 32; i++) |
1447 | 0 | c->freqs.offset[i] = 1 << (5 - 5); |
1448 | |
|
1449 | 0 | deflate_make_huffman_codes(&c->freqs, &c->static_codes); |
1450 | 0 | } |
1451 | | |
1452 | | /* Return the offset slot for the given match offset, using the small map. */ |
1453 | | static forceinline unsigned |
1454 | | deflate_get_offset_slot(unsigned offset) |
1455 | 0 | { |
1456 | 0 | #if 1 |
1457 | 0 | if (offset <= 256) |
1458 | 0 | return deflate_offset_slot[offset]; |
1459 | 0 | else |
1460 | 0 | return deflate_offset_slot[256 + ((offset - 1) >> 7)]; |
1461 | | #else /* Branchless version */ |
1462 | | u32 i1 = offset; |
1463 | | u32 i2 = 256 + ((offset - 1) >> 7); |
1464 | | u32 is_small = (s32)(offset - 257) >> 31; |
1465 | | |
1466 | | return deflate_offset_slot[(i1 & is_small) ^ (i2 & ~is_small)]; |
1467 | | #endif |
1468 | 0 | } |
1469 | | |
1470 | | static unsigned |
1471 | | deflate_compute_precode_items(const u8 lens[], const unsigned num_lens, |
1472 | | u32 precode_freqs[], unsigned precode_items[]) |
1473 | 0 | { |
1474 | 0 | unsigned *itemptr; |
1475 | 0 | unsigned run_start; |
1476 | 0 | unsigned run_end; |
1477 | 0 | unsigned extra_bits; |
1478 | 0 | u8 len; |
1479 | |
|
1480 | 0 | memset(precode_freqs, 0, |
1481 | 0 | DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0])); |
1482 | |
|
1483 | 0 | itemptr = precode_items; |
1484 | 0 | run_start = 0; |
1485 | 0 | do { |
1486 | | /* Find the next run of codeword lengths. */ |
1487 | | |
1488 | | /* len = the length being repeated */ |
1489 | 0 | len = lens[run_start]; |
1490 | | |
1491 | | /* Extend the run. */ |
1492 | 0 | run_end = run_start; |
1493 | 0 | do { |
1494 | 0 | run_end++; |
1495 | 0 | } while (run_end != num_lens && len == lens[run_end]); |
1496 | |
|
1497 | 0 | if (len == 0) { |
1498 | | /* Run of zeroes. */ |
1499 | | |
1500 | | /* Symbol 18: RLE 11 to 138 zeroes at a time. */ |
1501 | 0 | while ((run_end - run_start) >= 11) { |
1502 | 0 | extra_bits = MIN((run_end - run_start) - 11, |
1503 | 0 | 0x7F); |
1504 | 0 | precode_freqs[18]++; |
1505 | 0 | *itemptr++ = 18 | (extra_bits << 5); |
1506 | 0 | run_start += 11 + extra_bits; |
1507 | 0 | } |
1508 | | |
1509 | | /* Symbol 17: RLE 3 to 10 zeroes at a time. */ |
1510 | 0 | if ((run_end - run_start) >= 3) { |
1511 | 0 | extra_bits = MIN((run_end - run_start) - 3, |
1512 | 0 | 0x7); |
1513 | 0 | precode_freqs[17]++; |
1514 | 0 | *itemptr++ = 17 | (extra_bits << 5); |
1515 | 0 | run_start += 3 + extra_bits; |
1516 | 0 | } |
1517 | 0 | } else { |
1518 | | |
1519 | | /* A run of nonzero lengths. */ |
1520 | | |
1521 | | /* Symbol 16: RLE 3 to 6 of the previous length. */ |
1522 | 0 | if ((run_end - run_start) >= 4) { |
1523 | 0 | precode_freqs[len]++; |
1524 | 0 | *itemptr++ = len; |
1525 | 0 | run_start++; |
1526 | 0 | do { |
1527 | 0 | extra_bits = MIN((run_end - run_start) - |
1528 | 0 | 3, 0x3); |
1529 | 0 | precode_freqs[16]++; |
1530 | 0 | *itemptr++ = 16 | (extra_bits << 5); |
1531 | 0 | run_start += 3 + extra_bits; |
1532 | 0 | } while ((run_end - run_start) >= 3); |
1533 | 0 | } |
1534 | 0 | } |
1535 | | |
1536 | | /* Output any remaining lengths without RLE. */ |
1537 | 0 | while (run_start != run_end) { |
1538 | 0 | precode_freqs[len]++; |
1539 | 0 | *itemptr++ = len; |
1540 | 0 | run_start++; |
1541 | 0 | } |
1542 | 0 | } while (run_start != num_lens); |
1543 | |
|
1544 | 0 | return itemptr - precode_items; |
1545 | 0 | } |
1546 | | |
1547 | | /* |
1548 | | * Huffman codeword lengths for dynamic Huffman blocks are compressed using a |
1549 | | * separate Huffman code, the "precode", which contains a symbol for each |
1550 | | * possible codeword length in the larger code as well as several special |
1551 | | * symbols to represent repeated codeword lengths (a form of run-length |
1552 | | * encoding). The precode is itself constructed in canonical form, and its |
1553 | | * codeword lengths are represented literally in 19 3-bit fields that |
1554 | | * immediately precede the compressed codeword lengths of the larger code. |
1555 | | */ |
1556 | | |
1557 | | /* Precompute the information needed to output dynamic Huffman codes. */ |
1558 | | static void |
1559 | | deflate_precompute_huffman_header(struct libdeflate_compressor *c) |
1560 | 0 | { |
1561 | | /* Compute how many litlen and offset symbols are needed. */ |
1562 | |
|
1563 | 0 | for (c->o.precode.num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS; |
1564 | 0 | c->o.precode.num_litlen_syms > 257; |
1565 | 0 | c->o.precode.num_litlen_syms--) |
1566 | 0 | if (c->codes.lens.litlen[c->o.precode.num_litlen_syms - 1] != 0) |
1567 | 0 | break; |
1568 | |
|
1569 | 0 | for (c->o.precode.num_offset_syms = DEFLATE_NUM_OFFSET_SYMS; |
1570 | 0 | c->o.precode.num_offset_syms > 1; |
1571 | 0 | c->o.precode.num_offset_syms--) |
1572 | 0 | if (c->codes.lens.offset[c->o.precode.num_offset_syms - 1] != 0) |
1573 | 0 | break; |
1574 | | |
1575 | | /* |
1576 | | * If we're not using the full set of literal/length codeword lengths, |
1577 | | * then temporarily move the offset codeword lengths over so that the |
1578 | | * literal/length and offset codeword lengths are contiguous. |
1579 | | */ |
1580 | 0 | STATIC_ASSERT(offsetof(struct deflate_lens, offset) == |
1581 | 0 | DEFLATE_NUM_LITLEN_SYMS); |
1582 | 0 | if (c->o.precode.num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { |
1583 | 0 | memmove((u8 *)&c->codes.lens + c->o.precode.num_litlen_syms, |
1584 | 0 | (u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS, |
1585 | 0 | c->o.precode.num_offset_syms); |
1586 | 0 | } |
1587 | | |
1588 | | /* |
1589 | | * Compute the "items" (RLE / literal tokens and extra bits) with which |
1590 | | * the codeword lengths in the larger code will be output. |
1591 | | */ |
1592 | 0 | c->o.precode.num_items = |
1593 | 0 | deflate_compute_precode_items((u8 *)&c->codes.lens, |
1594 | 0 | c->o.precode.num_litlen_syms + |
1595 | 0 | c->o.precode.num_offset_syms, |
1596 | 0 | c->o.precode.freqs, |
1597 | 0 | c->o.precode.items); |
1598 | | |
1599 | | /* Build the precode. */ |
1600 | 0 | deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS, |
1601 | 0 | MAX_PRE_CODEWORD_LEN, |
1602 | 0 | c->o.precode.freqs, c->o.precode.lens, |
1603 | 0 | c->o.precode.codewords); |
1604 | | |
1605 | | /* Count how many precode lengths we actually need to output. */ |
1606 | 0 | for (c->o.precode.num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS; |
1607 | 0 | c->o.precode.num_explicit_lens > 4; |
1608 | 0 | c->o.precode.num_explicit_lens--) |
1609 | 0 | if (c->o.precode.lens[deflate_precode_lens_permutation[ |
1610 | 0 | c->o.precode.num_explicit_lens - 1]] != 0) |
1611 | 0 | break; |
1612 | | |
1613 | | /* Restore the offset codeword lengths if needed. */ |
1614 | 0 | if (c->o.precode.num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { |
1615 | 0 | memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS, |
1616 | 0 | (u8 *)&c->codes.lens + c->o.precode.num_litlen_syms, |
1617 | 0 | c->o.precode.num_offset_syms); |
1618 | 0 | } |
1619 | 0 | } |
1620 | | |
1621 | | /* |
1622 | | * To make it faster to output matches, compute the "full" match length |
1623 | | * codewords, i.e. the concatenation of the litlen codeword and the extra bits |
1624 | | * for each possible match length. |
1625 | | */ |
1626 | | static void |
1627 | | deflate_compute_full_len_codewords(struct libdeflate_compressor *c, |
1628 | | const struct deflate_codes *codes) |
1629 | 0 | { |
1630 | 0 | unsigned len; |
1631 | |
|
1632 | 0 | STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN + |
1633 | 0 | DEFLATE_MAX_EXTRA_LENGTH_BITS <= 32); |
1634 | |
|
1635 | 0 | for (len = DEFLATE_MIN_MATCH_LEN; len <= DEFLATE_MAX_MATCH_LEN; len++) { |
1636 | 0 | unsigned slot = deflate_length_slot[len]; |
1637 | 0 | unsigned litlen_sym = DEFLATE_FIRST_LEN_SYM + slot; |
1638 | 0 | u32 extra_bits = len - deflate_length_slot_base[slot]; |
1639 | |
|
1640 | 0 | c->o.length.codewords[len] = |
1641 | 0 | codes->codewords.litlen[litlen_sym] | |
1642 | 0 | (extra_bits << codes->lens.litlen[litlen_sym]); |
1643 | 0 | c->o.length.lens[len] = codes->lens.litlen[litlen_sym] + |
1644 | 0 | deflate_extra_length_bits[slot]; |
1645 | 0 | } |
1646 | 0 | } |
1647 | | |
1648 | | /* Write a match to the output buffer. */ |
1649 | 0 | #define WRITE_MATCH(c_, codes_, length_, offset_, offset_slot_) \ |
1650 | 0 | do { \ |
1651 | 0 | const struct libdeflate_compressor *c__ = (c_); \ |
1652 | 0 | const struct deflate_codes *codes__ = (codes_); \ |
1653 | 0 | unsigned length__ = (length_); \ |
1654 | 0 | unsigned offset__ = (offset_); \ |
1655 | 0 | unsigned offset_slot__ = (offset_slot_); \ |
1656 | 0 | \ |
1657 | 0 | /* Litlen symbol and extra length bits */ \ |
1658 | 0 | STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + \ |
1659 | 0 | DEFLATE_MAX_EXTRA_LENGTH_BITS)); \ |
1660 | 0 | ADD_BITS(c__->o.length.codewords[length__], \ |
1661 | 0 | c__->o.length.lens[length__]); \ |
1662 | 0 | \ |
1663 | 0 | if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + \ |
1664 | 0 | DEFLATE_MAX_EXTRA_LENGTH_BITS + \ |
1665 | 0 | MAX_OFFSET_CODEWORD_LEN + \ |
1666 | 0 | DEFLATE_MAX_EXTRA_OFFSET_BITS)) \ |
1667 | 0 | FLUSH_BITS(); \ |
1668 | 0 | \ |
1669 | 0 | /* Offset symbol */ \ |
1670 | 0 | ADD_BITS(codes__->codewords.offset[offset_slot__], \ |
1671 | 0 | codes__->lens.offset[offset_slot__]); \ |
1672 | 0 | \ |
1673 | 0 | if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN + \ |
1674 | 0 | DEFLATE_MAX_EXTRA_OFFSET_BITS)) \ |
1675 | 0 | FLUSH_BITS(); \ |
1676 | 0 | \ |
1677 | 0 | /* Extra offset bits */ \ |
1678 | 0 | ADD_BITS(offset__ - deflate_offset_slot_base[offset_slot__], \ |
1679 | 0 | deflate_extra_offset_bits[offset_slot__]); \ |
1680 | 0 | \ |
1681 | 0 | FLUSH_BITS(); \ |
1682 | 0 | } while (0) |
1683 | | |
1684 | | /* |
1685 | | * Choose the best type of block to use (dynamic Huffman, static Huffman, or |
1686 | | * uncompressed), then output it. |
1687 | | * |
1688 | | * The uncompressed data of the block is @block_begin[0..@block_length-1]. The |
1689 | | * sequence of literals and matches that will be used to compress the block (if |
1690 | | * a compressed block is chosen) is given by @sequences if it's non-NULL, or |
1691 | | * else @c->p.n.optimum_nodes. @c->freqs and @c->codes must be already set |
1692 | | * according to the literals, matches, and end-of-block symbol. |
1693 | | */ |
1694 | | static void |
1695 | | deflate_flush_block(struct libdeflate_compressor *c, |
1696 | | struct deflate_output_bitstream *os, |
1697 | | const u8 *block_begin, u32 block_length, |
1698 | | const struct deflate_sequence *sequences, |
1699 | | bool is_final_block) |
1700 | 0 | { |
1701 | | /* |
1702 | | * It is hard to get compilers to understand that writes to 'os->next' |
1703 | | * don't alias 'os'. That hurts performance significantly, as |
1704 | | * everything in 'os' would keep getting re-loaded. ('restrict' |
1705 | | * *should* do the trick, but it's unreliable.) Therefore, we keep all |
1706 | | * the output bitstream state in local variables, and output bits using |
1707 | | * macros. This is similar to what the decompressor does. |
1708 | | */ |
1709 | 0 | const u8 *in_next = block_begin; |
1710 | 0 | const u8 * const in_end = block_begin + block_length; |
1711 | 0 | bitbuf_t bitbuf = os->bitbuf; |
1712 | 0 | unsigned bitcount = os->bitcount; |
1713 | 0 | u8 *out_next = os->next; |
1714 | 0 | u8 * const out_end = os->end; |
1715 | | /* The cost for each block type, in bits */ |
1716 | 0 | u32 dynamic_cost = 0; |
1717 | 0 | u32 static_cost = 0; |
1718 | 0 | u32 uncompressed_cost = 0; |
1719 | 0 | u32 best_cost; |
1720 | 0 | struct deflate_codes *codes; |
1721 | 0 | unsigned sym; |
1722 | |
|
1723 | 0 | ASSERT(block_length >= MIN_BLOCK_LENGTH || is_final_block); |
1724 | 0 | ASSERT(block_length <= MAX_BLOCK_LENGTH); |
1725 | 0 | ASSERT(bitcount <= 7); |
1726 | 0 | ASSERT((bitbuf & ~(((bitbuf_t)1 << bitcount) - 1)) == 0); |
1727 | 0 | ASSERT(out_next <= out_end); |
1728 | | |
1729 | | /* Precompute the precode items and build the precode. */ |
1730 | 0 | deflate_precompute_huffman_header(c); |
1731 | | |
1732 | | /* Account for the cost of encoding dynamic Huffman codes. */ |
1733 | 0 | dynamic_cost += 5 + 5 + 4 + (3 * c->o.precode.num_explicit_lens); |
1734 | 0 | for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) { |
1735 | 0 | u32 extra = deflate_extra_precode_bits[sym]; |
1736 | |
|
1737 | 0 | dynamic_cost += c->o.precode.freqs[sym] * |
1738 | 0 | (extra + c->o.precode.lens[sym]); |
1739 | 0 | } |
1740 | | |
1741 | | /* Account for the cost of encoding literals. */ |
1742 | 0 | for (sym = 0; sym < 144; sym++) { |
1743 | 0 | dynamic_cost += c->freqs.litlen[sym] * |
1744 | 0 | c->codes.lens.litlen[sym]; |
1745 | 0 | static_cost += c->freqs.litlen[sym] * 8; |
1746 | 0 | } |
1747 | 0 | for (; sym < 256; sym++) { |
1748 | 0 | dynamic_cost += c->freqs.litlen[sym] * |
1749 | 0 | c->codes.lens.litlen[sym]; |
1750 | 0 | static_cost += c->freqs.litlen[sym] * 9; |
1751 | 0 | } |
1752 | | |
1753 | | /* Account for the cost of encoding the end-of-block symbol. */ |
1754 | 0 | dynamic_cost += c->codes.lens.litlen[DEFLATE_END_OF_BLOCK]; |
1755 | 0 | static_cost += 7; |
1756 | | |
1757 | | /* Account for the cost of encoding lengths. */ |
1758 | 0 | for (sym = DEFLATE_FIRST_LEN_SYM; |
1759 | 0 | sym < DEFLATE_FIRST_LEN_SYM + ARRAY_LEN(deflate_extra_length_bits); |
1760 | 0 | sym++) { |
1761 | 0 | u32 extra = deflate_extra_length_bits[ |
1762 | 0 | sym - DEFLATE_FIRST_LEN_SYM]; |
1763 | |
|
1764 | 0 | dynamic_cost += c->freqs.litlen[sym] * |
1765 | 0 | (extra + c->codes.lens.litlen[sym]); |
1766 | 0 | static_cost += c->freqs.litlen[sym] * |
1767 | 0 | (extra + c->static_codes.lens.litlen[sym]); |
1768 | 0 | } |
1769 | | |
1770 | | /* Account for the cost of encoding offsets. */ |
1771 | 0 | for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) { |
1772 | 0 | u32 extra = deflate_extra_offset_bits[sym]; |
1773 | |
|
1774 | 0 | dynamic_cost += c->freqs.offset[sym] * |
1775 | 0 | (extra + c->codes.lens.offset[sym]); |
1776 | 0 | static_cost += c->freqs.offset[sym] * (extra + 5); |
1777 | 0 | } |
1778 | | |
1779 | | /* Compute the cost of using uncompressed blocks. */ |
1780 | 0 | uncompressed_cost += (-(bitcount + 3) & 7) + 32 + |
1781 | 0 | (40 * (DIV_ROUND_UP(block_length, |
1782 | 0 | UINT16_MAX) - 1)) + |
1783 | 0 | (8 * block_length); |
1784 | | |
1785 | | /* Choose and output the cheapest type of block. */ |
1786 | 0 | best_cost = MIN(static_cost, uncompressed_cost); |
1787 | 0 | if (dynamic_cost < best_cost) { |
1788 | 0 | const unsigned num_explicit_lens = c->o.precode.num_explicit_lens; |
1789 | 0 | const unsigned num_precode_items = c->o.precode.num_items; |
1790 | 0 | unsigned precode_sym, precode_item; |
1791 | 0 | unsigned i; |
1792 | | |
1793 | | /* Dynamic Huffman block */ |
1794 | |
|
1795 | 0 | best_cost = dynamic_cost; |
1796 | 0 | codes = &c->codes; |
1797 | 0 | STATIC_ASSERT(CAN_BUFFER(1 + 2 + 5 + 5 + 4 + 3)); |
1798 | 0 | ADD_BITS(is_final_block, 1); |
1799 | 0 | ADD_BITS(DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN, 2); |
1800 | 0 | ADD_BITS(c->o.precode.num_litlen_syms - 257, 5); |
1801 | 0 | ADD_BITS(c->o.precode.num_offset_syms - 1, 5); |
1802 | 0 | ADD_BITS(num_explicit_lens - 4, 4); |
1803 | | |
1804 | | /* Output the lengths of the codewords in the precode. */ |
1805 | 0 | if (CAN_BUFFER(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) { |
1806 | | /* |
1807 | | * A 64-bit bitbuffer is just one bit too small to hold |
1808 | | * the maximum number of precode lens, so to minimize |
1809 | | * flushes we merge one len with the previous fields. |
1810 | | */ |
1811 | 0 | precode_sym = deflate_precode_lens_permutation[0]; |
1812 | 0 | ADD_BITS(c->o.precode.lens[precode_sym], 3); |
1813 | 0 | FLUSH_BITS(); |
1814 | 0 | i = 1; /* num_explicit_lens >= 4 */ |
1815 | 0 | do { |
1816 | 0 | precode_sym = |
1817 | 0 | deflate_precode_lens_permutation[i]; |
1818 | 0 | ADD_BITS(c->o.precode.lens[precode_sym], 3); |
1819 | 0 | } while (++i < num_explicit_lens); |
1820 | 0 | FLUSH_BITS(); |
1821 | 0 | } else { |
1822 | 0 | FLUSH_BITS(); |
1823 | 0 | i = 0; |
1824 | 0 | do { |
1825 | 0 | precode_sym = |
1826 | 0 | deflate_precode_lens_permutation[i]; |
1827 | 0 | ADD_BITS(c->o.precode.lens[precode_sym], 3); |
1828 | 0 | FLUSH_BITS(); |
1829 | 0 | } while (++i < num_explicit_lens); |
1830 | 0 | } |
1831 | | |
1832 | | /* |
1833 | | * Output the lengths of the codewords in the litlen and offset |
1834 | | * codes, encoded by the precode. |
1835 | | */ |
1836 | 0 | i = 0; |
1837 | 0 | do { |
1838 | 0 | precode_item = c->o.precode.items[i]; |
1839 | 0 | precode_sym = precode_item & 0x1F; |
1840 | 0 | STATIC_ASSERT(CAN_BUFFER(MAX_PRE_CODEWORD_LEN + 7)); |
1841 | 0 | ADD_BITS(c->o.precode.codewords[precode_sym], |
1842 | 0 | c->o.precode.lens[precode_sym]); |
1843 | 0 | ADD_BITS(precode_item >> 5, |
1844 | 0 | deflate_extra_precode_bits[precode_sym]); |
1845 | 0 | FLUSH_BITS(); |
1846 | 0 | } while (++i < num_precode_items); |
1847 | 0 | } else if (static_cost < uncompressed_cost) { |
1848 | | /* Static Huffman block */ |
1849 | 0 | codes = &c->static_codes; |
1850 | 0 | ADD_BITS(is_final_block, 1); |
1851 | 0 | ADD_BITS(DEFLATE_BLOCKTYPE_STATIC_HUFFMAN, 2); |
1852 | 0 | FLUSH_BITS(); |
1853 | 0 | } else { |
1854 | | /* |
1855 | | * Uncompressed block(s). DEFLATE limits the length of |
1856 | | * uncompressed blocks to UINT16_MAX bytes, so if the length of |
1857 | | * the "block" we're flushing is over UINT16_MAX, we actually |
1858 | | * output multiple blocks. |
1859 | | */ |
1860 | 0 | do { |
1861 | 0 | u8 bfinal = 0; |
1862 | 0 | size_t len = UINT16_MAX; |
1863 | |
|
1864 | 0 | if (in_end - in_next <= UINT16_MAX) { |
1865 | 0 | bfinal = is_final_block; |
1866 | 0 | len = in_end - in_next; |
1867 | 0 | } |
1868 | 0 | if (out_end - out_next < |
1869 | 0 | (bitcount + 3 + 7) / 8 + 4 + len) { |
1870 | | /* Not enough output space remaining. */ |
1871 | 0 | out_next = out_end; |
1872 | 0 | goto out; |
1873 | 0 | } |
1874 | | /* |
1875 | | * Output BFINAL (1 bit) and BTYPE (2 bits), then align |
1876 | | * to a byte boundary. |
1877 | | */ |
1878 | 0 | STATIC_ASSERT(DEFLATE_BLOCKTYPE_UNCOMPRESSED == 0); |
1879 | 0 | *out_next++ = (bfinal << bitcount) | bitbuf; |
1880 | 0 | if (bitcount > 5) |
1881 | 0 | *out_next++ = 0; |
1882 | 0 | bitbuf = 0; |
1883 | 0 | bitcount = 0; |
1884 | | /* Output LEN and NLEN, then the data itself. */ |
1885 | 0 | put_unaligned_le16(len, out_next); |
1886 | 0 | out_next += 2; |
1887 | 0 | put_unaligned_le16(~len, out_next); |
1888 | 0 | out_next += 2; |
1889 | 0 | memcpy(out_next, in_next, len); |
1890 | 0 | out_next += len; |
1891 | 0 | in_next += len; |
1892 | 0 | } while (in_next != in_end); |
1893 | | /* Done outputting uncompressed block(s) */ |
1894 | 0 | goto out; |
1895 | 0 | } |
1896 | | |
1897 | | /* Output the literals and matches for a dynamic or static block. */ |
1898 | 0 | ASSERT(bitcount <= 7); |
1899 | 0 | deflate_compute_full_len_codewords(c, codes); |
1900 | 0 | #if SUPPORT_NEAR_OPTIMAL_PARSING |
1901 | 0 | if (sequences == NULL) { |
1902 | | /* Output the literals and matches from the minimum-cost path */ |
1903 | 0 | struct deflate_optimum_node *cur_node = |
1904 | 0 | &c->p.n.optimum_nodes[0]; |
1905 | 0 | struct deflate_optimum_node * const end_node = |
1906 | 0 | &c->p.n.optimum_nodes[block_length]; |
1907 | 0 | do { |
1908 | 0 | unsigned length = cur_node->item & OPTIMUM_LEN_MASK; |
1909 | 0 | unsigned offset = cur_node->item >> |
1910 | 0 | OPTIMUM_OFFSET_SHIFT; |
1911 | 0 | if (length == 1) { |
1912 | | /* Literal */ |
1913 | 0 | ADD_BITS(codes->codewords.litlen[offset], |
1914 | 0 | codes->lens.litlen[offset]); |
1915 | 0 | FLUSH_BITS(); |
1916 | 0 | } else { |
1917 | | /* Match */ |
1918 | 0 | WRITE_MATCH(c, codes, length, offset, |
1919 | 0 | c->p.n.offset_slot_full[offset]); |
1920 | 0 | } |
1921 | 0 | cur_node += length; |
1922 | 0 | } while (cur_node != end_node); |
1923 | 0 | } else |
1924 | 0 | #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ |
1925 | 0 | { |
1926 | | /* Output the literals and matches from the sequences list. */ |
1927 | 0 | const struct deflate_sequence *seq; |
1928 | |
|
1929 | 0 | for (seq = sequences; ; seq++) { |
1930 | 0 | u32 litrunlen = seq->litrunlen_and_length & |
1931 | 0 | SEQ_LITRUNLEN_MASK; |
1932 | 0 | unsigned length = seq->litrunlen_and_length >> |
1933 | 0 | SEQ_LENGTH_SHIFT; |
1934 | 0 | unsigned lit; |
1935 | | |
1936 | | /* Output a run of literals. */ |
1937 | 0 | if (CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN)) { |
1938 | 0 | for (; litrunlen >= 4; litrunlen -= 4) { |
1939 | 0 | lit = *in_next++; |
1940 | 0 | ADD_BITS(codes->codewords.litlen[lit], |
1941 | 0 | codes->lens.litlen[lit]); |
1942 | 0 | lit = *in_next++; |
1943 | 0 | ADD_BITS(codes->codewords.litlen[lit], |
1944 | 0 | codes->lens.litlen[lit]); |
1945 | 0 | lit = *in_next++; |
1946 | 0 | ADD_BITS(codes->codewords.litlen[lit], |
1947 | 0 | codes->lens.litlen[lit]); |
1948 | 0 | lit = *in_next++; |
1949 | 0 | ADD_BITS(codes->codewords.litlen[lit], |
1950 | 0 | codes->lens.litlen[lit]); |
1951 | 0 | FLUSH_BITS(); |
1952 | 0 | } |
1953 | 0 | if (litrunlen-- != 0) { |
1954 | 0 | lit = *in_next++; |
1955 | 0 | ADD_BITS(codes->codewords.litlen[lit], |
1956 | 0 | codes->lens.litlen[lit]); |
1957 | 0 | if (litrunlen-- != 0) { |
1958 | 0 | lit = *in_next++; |
1959 | 0 | ADD_BITS(codes->codewords.litlen[lit], |
1960 | 0 | codes->lens.litlen[lit]); |
1961 | 0 | if (litrunlen-- != 0) { |
1962 | 0 | lit = *in_next++; |
1963 | 0 | ADD_BITS(codes->codewords.litlen[lit], |
1964 | 0 | codes->lens.litlen[lit]); |
1965 | 0 | } |
1966 | 0 | } |
1967 | 0 | FLUSH_BITS(); |
1968 | 0 | } |
1969 | 0 | } else { |
1970 | 0 | while (litrunlen--) { |
1971 | 0 | lit = *in_next++; |
1972 | 0 | ADD_BITS(codes->codewords.litlen[lit], |
1973 | 0 | codes->lens.litlen[lit]); |
1974 | 0 | FLUSH_BITS(); |
1975 | 0 | } |
1976 | 0 | } |
1977 | |
|
1978 | 0 | if (length == 0) { /* Last sequence? */ |
1979 | 0 | ASSERT(in_next == in_end); |
1980 | 0 | break; |
1981 | 0 | } |
1982 | | |
1983 | | /* Output a match. */ |
1984 | 0 | WRITE_MATCH(c, codes, length, seq->offset, |
1985 | 0 | seq->offset_slot); |
1986 | 0 | in_next += length; |
1987 | 0 | } |
1988 | 0 | } |
1989 | | |
1990 | | /* Output the end-of-block symbol. */ |
1991 | 0 | ASSERT(bitcount <= 7); |
1992 | 0 | ADD_BITS(codes->codewords.litlen[DEFLATE_END_OF_BLOCK], |
1993 | 0 | codes->lens.litlen[DEFLATE_END_OF_BLOCK]); |
1994 | 0 | FLUSH_BITS(); |
1995 | 0 | out: |
1996 | 0 | ASSERT(bitcount <= 7); |
1997 | | /* |
1998 | | * Assert that the block cost was computed correctly, as |
1999 | | * libdeflate_deflate_compress_bound() relies on this via the assumption |
2000 | | * that uncompressed blocks will always be used when cheaper. |
2001 | | */ |
2002 | 0 | ASSERT(8 * (out_next - os->next) + bitcount - os->bitcount == |
2003 | 0 | 3 + best_cost || out_next == out_end); |
2004 | |
|
2005 | 0 | os->bitbuf = bitbuf; |
2006 | 0 | os->bitcount = bitcount; |
2007 | 0 | os->next = out_next; |
2008 | 0 | } |
2009 | | |
2010 | | static void |
2011 | | deflate_finish_block(struct libdeflate_compressor *c, |
2012 | | struct deflate_output_bitstream *os, |
2013 | | const u8 *block_begin, u32 block_length, |
2014 | | const struct deflate_sequence *sequences, |
2015 | | bool is_final_block) |
2016 | 0 | { |
2017 | 0 | c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; |
2018 | 0 | deflate_make_huffman_codes(&c->freqs, &c->codes); |
2019 | 0 | deflate_flush_block(c, os, block_begin, block_length, sequences, |
2020 | 0 | is_final_block); |
2021 | 0 | } |
2022 | | |
2023 | | /******************************************************************************/ |
2024 | | |
2025 | | /* |
2026 | | * Block splitting algorithm. The problem is to decide when it is worthwhile to |
2027 | | * start a new block with new Huffman codes. There is a theoretically optimal |
2028 | | * solution: recursively consider every possible block split, considering the |
2029 | | * exact cost of each block, and choose the minimum cost approach. But this is |
2030 | | * far too slow. Instead, as an approximation, we can count symbols and after |
2031 | | * every N symbols, compare the expected distribution of symbols based on the |
2032 | | * previous data with the actual distribution. If they differ "by enough", then |
2033 | | * start a new block. |
2034 | | * |
2035 | | * As an optimization and heuristic, we don't distinguish between every symbol |
2036 | | * but rather we combine many symbols into a single "observation type". For |
2037 | | * literals we only look at the high bits and low bits, and for matches we only |
2038 | | * look at whether the match is long or not. The assumption is that for typical |
2039 | | * "real" data, places that are good block boundaries will tend to be noticeable |
2040 | | * based only on changes in these aggregate probabilities, without looking for |
2041 | | * subtle differences in individual symbols. For example, a change from ASCII |
2042 | | * bytes to non-ASCII bytes, or from few matches (generally less compressible) |
2043 | | * to many matches (generally more compressible), would be easily noticed based |
2044 | | * on the aggregates. |
2045 | | * |
2046 | | * For determining whether the probability distributions are "different enough" |
2047 | | * to start a new block, the simple heuristic of splitting when the sum of |
2048 | | * absolute differences exceeds a constant seems to be good enough. We also add |
2049 | | * a number proportional to the block length so that the algorithm is more |
2050 | | * likely to end long blocks than short blocks. This reflects the general |
2051 | | * expectation that it will become increasingly beneficial to start a new block |
2052 | | * as the current block grows longer. |
2053 | | * |
2054 | | * Finally, for an approximation, it is not strictly necessary that the exact |
2055 | | * symbols being used are considered. With "near-optimal parsing", for example, |
2056 | | * the actual symbols that will be used are unknown until after the block |
2057 | | * boundary is chosen and the block has been optimized. Since the final choices |
2058 | | * cannot be used, we can use preliminary "greedy" choices instead. |
2059 | | */ |
2060 | | |
2061 | | /* Initialize the block split statistics when starting a new block. */ |
2062 | | static void |
2063 | | init_block_split_stats(struct block_split_stats *stats) |
2064 | 0 | { |
2065 | 0 | int i; |
2066 | |
|
2067 | 0 | for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { |
2068 | 0 | stats->new_observations[i] = 0; |
2069 | 0 | stats->observations[i] = 0; |
2070 | 0 | } |
2071 | 0 | stats->num_new_observations = 0; |
2072 | 0 | stats->num_observations = 0; |
2073 | 0 | } |
2074 | | |
2075 | | /* |
2076 | | * Literal observation. Heuristic: use the top 2 bits and low 1 bits of the |
2077 | | * literal, for 8 possible literal observation types. |
2078 | | */ |
2079 | | static forceinline void |
2080 | | observe_literal(struct block_split_stats *stats, u8 lit) |
2081 | 0 | { |
2082 | 0 | stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++; |
2083 | 0 | stats->num_new_observations++; |
2084 | 0 | } |
2085 | | |
2086 | | /* |
2087 | | * Match observation. Heuristic: use one observation type for "short match" and |
2088 | | * one observation type for "long match". |
2089 | | */ |
2090 | | static forceinline void |
2091 | | observe_match(struct block_split_stats *stats, unsigned length) |
2092 | 0 | { |
2093 | 0 | stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + |
2094 | 0 | (length >= 9)]++; |
2095 | 0 | stats->num_new_observations++; |
2096 | 0 | } |
2097 | | |
2098 | | static void |
2099 | | merge_new_observations(struct block_split_stats *stats) |
2100 | 0 | { |
2101 | 0 | int i; |
2102 | |
|
2103 | 0 | for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { |
2104 | 0 | stats->observations[i] += stats->new_observations[i]; |
2105 | 0 | stats->new_observations[i] = 0; |
2106 | 0 | } |
2107 | 0 | stats->num_observations += stats->num_new_observations; |
2108 | 0 | stats->num_new_observations = 0; |
2109 | 0 | } |
2110 | | |
2111 | | static bool |
2112 | | do_end_block_check(struct block_split_stats *stats, u32 block_length) |
2113 | 0 | { |
2114 | 0 | if (stats->num_observations > 0) { |
2115 | | /* |
2116 | | * Compute the sum of absolute differences of probabilities. To |
2117 | | * avoid needing to use floating point arithmetic or do slow |
2118 | | * divisions, we do all arithmetic with the probabilities |
2119 | | * multiplied by num_observations * num_new_observations. E.g., |
2120 | | * for the "old" observations the probabilities would be |
2121 | | * (double)observations[i] / num_observations, but since we |
2122 | | * multiply by both num_observations and num_new_observations we |
2123 | | * really do observations[i] * num_new_observations. |
2124 | | */ |
2125 | 0 | u32 total_delta = 0; |
2126 | 0 | u32 num_items; |
2127 | 0 | u32 cutoff; |
2128 | 0 | int i; |
2129 | |
|
2130 | 0 | for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { |
2131 | 0 | u32 expected = stats->observations[i] * |
2132 | 0 | stats->num_new_observations; |
2133 | 0 | u32 actual = stats->new_observations[i] * |
2134 | 0 | stats->num_observations; |
2135 | 0 | u32 delta = (actual > expected) ? actual - expected : |
2136 | 0 | expected - actual; |
2137 | |
|
2138 | 0 | total_delta += delta; |
2139 | 0 | } |
2140 | |
|
2141 | 0 | num_items = stats->num_observations + |
2142 | 0 | stats->num_new_observations; |
2143 | | /* |
2144 | | * Heuristic: the cutoff is when the sum of absolute differences |
2145 | | * of probabilities becomes at least 200/512. As above, the |
2146 | | * probability is multiplied by both num_new_observations and |
2147 | | * num_observations. Be careful to avoid integer overflow. |
2148 | | */ |
2149 | 0 | cutoff = stats->num_new_observations * 200 / 512 * |
2150 | 0 | stats->num_observations; |
2151 | | /* |
2152 | | * Very short blocks have a lot of overhead for the Huffman |
2153 | | * codes, so only use them if it clearly seems worthwhile. |
2154 | | * (This is an additional penalty, which adds to the smaller |
2155 | | * penalty below which scales more slowly.) |
2156 | | */ |
2157 | 0 | if (block_length < 10000 && num_items < 8192) |
2158 | 0 | cutoff += (u64)cutoff * (8192 - num_items) / 8192; |
2159 | | |
2160 | | /* Ready to end the block? */ |
2161 | 0 | if (total_delta + |
2162 | 0 | (block_length / 4096) * stats->num_observations >= cutoff) |
2163 | 0 | return true; |
2164 | 0 | } |
2165 | 0 | merge_new_observations(stats); |
2166 | 0 | return false; |
2167 | 0 | } |
2168 | | |
2169 | | static forceinline bool |
2170 | | ready_to_check_block(const struct block_split_stats *stats, |
2171 | | const u8 *in_block_begin, const u8 *in_next, |
2172 | | const u8 *in_end) |
2173 | 0 | { |
2174 | 0 | return stats->num_new_observations >= NUM_OBSERVATIONS_PER_BLOCK_CHECK |
2175 | 0 | && in_next - in_block_begin >= MIN_BLOCK_LENGTH |
2176 | 0 | && in_end - in_next >= MIN_BLOCK_LENGTH; |
2177 | 0 | } |
2178 | | |
2179 | | static forceinline bool |
2180 | | should_end_block(struct block_split_stats *stats, |
2181 | | const u8 *in_block_begin, const u8 *in_next, const u8 *in_end) |
2182 | 0 | { |
2183 | | /* Ready to try to end the block (again)? */ |
2184 | 0 | if (!ready_to_check_block(stats, in_block_begin, in_next, in_end)) |
2185 | 0 | return false; |
2186 | | |
2187 | 0 | return do_end_block_check(stats, in_next - in_block_begin); |
2188 | 0 | } |
2189 | | |
2190 | | /******************************************************************************/ |
2191 | | |
2192 | | static void |
2193 | | deflate_begin_sequences(struct libdeflate_compressor *c, |
2194 | | struct deflate_sequence *first_seq) |
2195 | 0 | { |
2196 | 0 | deflate_reset_symbol_frequencies(c); |
2197 | 0 | first_seq->litrunlen_and_length = 0; |
2198 | 0 | } |
2199 | | |
2200 | | static forceinline void |
2201 | | deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal, |
2202 | | bool gather_split_stats, struct deflate_sequence *seq) |
2203 | 0 | { |
2204 | 0 | c->freqs.litlen[literal]++; |
2205 | |
|
2206 | 0 | if (gather_split_stats) |
2207 | 0 | observe_literal(&c->split_stats, literal); |
2208 | |
|
2209 | 0 | STATIC_ASSERT(MAX_BLOCK_LENGTH <= SEQ_LITRUNLEN_MASK); |
2210 | 0 | seq->litrunlen_and_length++; |
2211 | 0 | } |
2212 | | |
2213 | | static forceinline void |
2214 | | deflate_choose_match(struct libdeflate_compressor *c, |
2215 | | unsigned length, unsigned offset, bool gather_split_stats, |
2216 | | struct deflate_sequence **seq_p) |
2217 | 0 | { |
2218 | 0 | struct deflate_sequence *seq = *seq_p; |
2219 | 0 | unsigned length_slot = deflate_length_slot[length]; |
2220 | 0 | unsigned offset_slot = deflate_get_offset_slot(offset); |
2221 | |
|
2222 | 0 | c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + length_slot]++; |
2223 | 0 | c->freqs.offset[offset_slot]++; |
2224 | 0 | if (gather_split_stats) |
2225 | 0 | observe_match(&c->split_stats, length); |
2226 | |
|
2227 | 0 | seq->litrunlen_and_length |= (u32)length << SEQ_LENGTH_SHIFT; |
2228 | 0 | seq->offset = offset; |
2229 | 0 | seq->offset_slot = offset_slot; |
2230 | |
|
2231 | 0 | seq++; |
2232 | 0 | seq->litrunlen_and_length = 0; |
2233 | 0 | *seq_p = seq; |
2234 | 0 | } |
2235 | | |
2236 | | /* |
2237 | | * Decrease the maximum and nice match lengths if we're approaching the end of |
2238 | | * the input buffer. |
2239 | | */ |
2240 | | static forceinline void |
2241 | | adjust_max_and_nice_len(unsigned *max_len, unsigned *nice_len, size_t remaining) |
2242 | 0 | { |
2243 | 0 | if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) { |
2244 | 0 | *max_len = remaining; |
2245 | 0 | *nice_len = MIN(*nice_len, *max_len); |
2246 | 0 | } |
2247 | 0 | } |
2248 | | |
2249 | | /* |
2250 | | * Choose the minimum match length for the greedy and lazy parsers. |
2251 | | * |
2252 | | * By default the minimum match length is 3, which is the smallest length the |
2253 | | * DEFLATE format allows. However, with greedy and lazy parsing, some data |
2254 | | * (e.g. DNA sequencing data) benefits greatly from a longer minimum length. |
2255 | | * Typically, this is because literals are very cheap. In general, the |
2256 | | * near-optimal parser handles this case naturally, but the greedy and lazy |
2257 | | * parsers need a heuristic to decide when to use short matches. |
2258 | | * |
2259 | | * The heuristic we use is to make the minimum match length depend on the number |
2260 | | * of different literals that exist in the data. If there are many different |
2261 | | * literals, then literals will probably be expensive, so short matches will |
2262 | | * probably be worthwhile. Conversely, if not many literals are used, then |
2263 | | * probably literals will be cheap and short matches won't be worthwhile. |
2264 | | */ |
2265 | | static unsigned |
2266 | | choose_min_match_len(unsigned num_used_literals, unsigned max_search_depth) |
2267 | 0 | { |
2268 | | /* map from num_used_literals to min_len */ |
2269 | 0 | static const u8 min_lens[] = { |
2270 | 0 | 9, 9, 9, 9, 9, 9, 8, 8, 7, 7, 6, 6, 6, 6, 6, 6, |
2271 | 0 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, |
2272 | 0 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, |
2273 | 0 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, |
2274 | 0 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, |
2275 | | /* The rest is implicitly 3. */ |
2276 | 0 | }; |
2277 | 0 | unsigned min_len; |
2278 | |
|
2279 | 0 | STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN <= 3); |
2280 | 0 | STATIC_ASSERT(ARRAY_LEN(min_lens) <= DEFLATE_NUM_LITERALS + 1); |
2281 | |
|
2282 | 0 | if (num_used_literals >= ARRAY_LEN(min_lens)) |
2283 | 0 | return 3; |
2284 | 0 | min_len = min_lens[num_used_literals]; |
2285 | | /* |
2286 | | * With a low max_search_depth, it may be too hard to find long matches. |
2287 | | */ |
2288 | 0 | if (max_search_depth < 16) { |
2289 | 0 | if (max_search_depth < 5) |
2290 | 0 | min_len = MIN(min_len, 4); |
2291 | 0 | else if (max_search_depth < 10) |
2292 | 0 | min_len = MIN(min_len, 5); |
2293 | 0 | else |
2294 | 0 | min_len = MIN(min_len, 7); |
2295 | 0 | } |
2296 | 0 | return min_len; |
2297 | 0 | } |
2298 | | |
2299 | | static unsigned |
2300 | | calculate_min_match_len(const u8 *data, size_t data_len, |
2301 | | unsigned max_search_depth) |
2302 | 0 | { |
2303 | 0 | u8 used[256] = { 0 }; |
2304 | 0 | unsigned num_used_literals = 0; |
2305 | 0 | size_t i; |
2306 | | |
2307 | | /* |
2308 | | * For an initial approximation, scan the first 4 KiB of data. The |
2309 | | * caller may use recalculate_min_match_len() to update min_len later. |
2310 | | */ |
2311 | 0 | data_len = MIN(data_len, 4096); |
2312 | 0 | for (i = 0; i < data_len; i++) |
2313 | 0 | used[data[i]] = 1; |
2314 | 0 | for (i = 0; i < 256; i++) |
2315 | 0 | num_used_literals += used[i]; |
2316 | 0 | return choose_min_match_len(num_used_literals, max_search_depth); |
2317 | 0 | } |
2318 | | |
2319 | | /* |
2320 | | * Recalculate the minimum match length for a block, now that we know the |
2321 | | * distribution of literals that are actually being used (freqs->litlen). |
2322 | | */ |
2323 | | static unsigned |
2324 | | recalculate_min_match_len(const struct deflate_freqs *freqs, |
2325 | | unsigned max_search_depth) |
2326 | 0 | { |
2327 | 0 | u32 literal_freq = 0; |
2328 | 0 | u32 cutoff; |
2329 | 0 | unsigned num_used_literals = 0; |
2330 | 0 | int i; |
2331 | |
|
2332 | 0 | for (i = 0; i < DEFLATE_NUM_LITERALS; i++) |
2333 | 0 | literal_freq += freqs->litlen[i]; |
2334 | |
|
2335 | 0 | cutoff = literal_freq >> 10; /* Ignore literals used very rarely. */ |
2336 | |
|
2337 | 0 | for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { |
2338 | 0 | if (freqs->litlen[i] > cutoff) |
2339 | 0 | num_used_literals++; |
2340 | 0 | } |
2341 | 0 | return choose_min_match_len(num_used_literals, max_search_depth); |
2342 | 0 | } |
2343 | | |
2344 | | static forceinline const u8 * |
2345 | | choose_max_block_end(const u8 *in_block_begin, const u8 *in_end, |
2346 | | size_t soft_max_len) |
2347 | 0 | { |
2348 | 0 | if (in_end - in_block_begin < soft_max_len + MIN_BLOCK_LENGTH) |
2349 | 0 | return in_end; |
2350 | 0 | return in_block_begin + soft_max_len; |
2351 | 0 | } |
2352 | | |
2353 | | /* |
2354 | | * This is the level 0 "compressor". It always outputs uncompressed blocks. |
2355 | | */ |
2356 | | static size_t |
2357 | | deflate_compress_none(const u8 *in, size_t in_nbytes, |
2358 | | u8 *out, size_t out_nbytes_avail) |
2359 | 0 | { |
2360 | 0 | const u8 *in_next = in; |
2361 | 0 | const u8 * const in_end = in + in_nbytes; |
2362 | 0 | u8 *out_next = out; |
2363 | 0 | u8 * const out_end = out + out_nbytes_avail; |
2364 | | |
2365 | | /* |
2366 | | * If the input is zero-length, we still must output a block in order |
2367 | | * for the output to be a valid DEFLATE stream. Handle this case |
2368 | | * specially to avoid potentially passing NULL to memcpy() below. |
2369 | | */ |
2370 | 0 | if (unlikely(in_nbytes == 0)) { |
2371 | 0 | if (out_nbytes_avail < 5) |
2372 | 0 | return 0; |
2373 | | /* BFINAL and BTYPE */ |
2374 | 0 | *out_next++ = 1 | (DEFLATE_BLOCKTYPE_UNCOMPRESSED << 1); |
2375 | | /* LEN and NLEN */ |
2376 | 0 | put_unaligned_le32(0xFFFF0000, out_next); |
2377 | 0 | return 5; |
2378 | 0 | } |
2379 | | |
2380 | 0 | do { |
2381 | 0 | u8 bfinal = 0; |
2382 | 0 | size_t len = UINT16_MAX; |
2383 | |
|
2384 | 0 | if (in_end - in_next <= UINT16_MAX) { |
2385 | 0 | bfinal = 1; |
2386 | 0 | len = in_end - in_next; |
2387 | 0 | } |
2388 | 0 | if (out_end - out_next < 5 + len) |
2389 | 0 | return 0; |
2390 | | /* |
2391 | | * Output BFINAL and BTYPE. The stream is already byte-aligned |
2392 | | * here, so this step always requires outputting exactly 1 byte. |
2393 | | */ |
2394 | 0 | *out_next++ = bfinal | (DEFLATE_BLOCKTYPE_UNCOMPRESSED << 1); |
2395 | | |
2396 | | /* Output LEN and NLEN, then the data itself. */ |
2397 | 0 | put_unaligned_le16(len, out_next); |
2398 | 0 | out_next += 2; |
2399 | 0 | put_unaligned_le16(~len, out_next); |
2400 | 0 | out_next += 2; |
2401 | 0 | memcpy(out_next, in_next, len); |
2402 | 0 | out_next += len; |
2403 | 0 | in_next += len; |
2404 | 0 | } while (in_next != in_end); |
2405 | | |
2406 | 0 | return out_next - out; |
2407 | 0 | } |
2408 | | |
2409 | | /* |
2410 | | * This is a faster variant of deflate_compress_greedy(). It uses the |
2411 | | * ht_matchfinder rather than the hc_matchfinder. It also skips the block |
2412 | | * splitting algorithm and just uses fixed length blocks. c->max_search_depth |
2413 | | * has no effect with this algorithm, as it is hardcoded in ht_matchfinder.h. |
2414 | | */ |
2415 | | static void |
2416 | | deflate_compress_fastest(struct libdeflate_compressor * restrict c, |
2417 | | const u8 *in, size_t in_nbytes, |
2418 | | struct deflate_output_bitstream *os) |
2419 | 0 | { |
2420 | 0 | const u8 *in_next = in; |
2421 | 0 | const u8 *in_end = in_next + in_nbytes; |
2422 | 0 | const u8 *in_cur_base = in_next; |
2423 | 0 | unsigned max_len = DEFLATE_MAX_MATCH_LEN; |
2424 | 0 | unsigned nice_len = MIN(c->nice_match_length, max_len); |
2425 | 0 | u32 next_hash = 0; |
2426 | |
|
2427 | 0 | ht_matchfinder_init(&c->p.f.ht_mf); |
2428 | |
|
2429 | 0 | do { |
2430 | | /* Starting a new DEFLATE block */ |
2431 | |
|
2432 | 0 | const u8 * const in_block_begin = in_next; |
2433 | 0 | const u8 * const in_max_block_end = choose_max_block_end( |
2434 | 0 | in_next, in_end, FAST_SOFT_MAX_BLOCK_LENGTH); |
2435 | 0 | struct deflate_sequence *seq = c->p.f.sequences; |
2436 | |
|
2437 | 0 | deflate_begin_sequences(c, seq); |
2438 | |
|
2439 | 0 | do { |
2440 | 0 | u32 length; |
2441 | 0 | u32 offset; |
2442 | 0 | size_t remaining = in_end - in_next; |
2443 | |
|
2444 | 0 | if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) { |
2445 | 0 | max_len = remaining; |
2446 | 0 | if (max_len < HT_MATCHFINDER_REQUIRED_NBYTES) { |
2447 | 0 | do { |
2448 | 0 | deflate_choose_literal(c, |
2449 | 0 | *in_next++, false, seq); |
2450 | 0 | } while (--max_len); |
2451 | 0 | break; |
2452 | 0 | } |
2453 | 0 | nice_len = MIN(nice_len, max_len); |
2454 | 0 | } |
2455 | 0 | length = ht_matchfinder_longest_match(&c->p.f.ht_mf, |
2456 | 0 | &in_cur_base, |
2457 | 0 | in_next, |
2458 | 0 | max_len, |
2459 | 0 | nice_len, |
2460 | 0 | &next_hash, |
2461 | 0 | &offset); |
2462 | 0 | if (length) { |
2463 | | /* Match found */ |
2464 | 0 | deflate_choose_match(c, length, offset, false, |
2465 | 0 | &seq); |
2466 | 0 | ht_matchfinder_skip_bytes(&c->p.f.ht_mf, |
2467 | 0 | &in_cur_base, |
2468 | 0 | in_next + 1, |
2469 | 0 | in_end, |
2470 | 0 | length - 1, |
2471 | 0 | &next_hash); |
2472 | 0 | in_next += length; |
2473 | 0 | } else { |
2474 | | /* No match found */ |
2475 | 0 | deflate_choose_literal(c, *in_next++, false, |
2476 | 0 | seq); |
2477 | 0 | } |
2478 | | |
2479 | | /* Check if it's time to output another block. */ |
2480 | 0 | } while (in_next < in_max_block_end && |
2481 | 0 | seq < &c->p.f.sequences[FAST_SEQ_STORE_LENGTH]); |
2482 | | |
2483 | 0 | deflate_finish_block(c, os, in_block_begin, |
2484 | 0 | in_next - in_block_begin, |
2485 | 0 | c->p.f.sequences, in_next == in_end); |
2486 | 0 | } while (in_next != in_end); |
2487 | 0 | } |
2488 | | |
2489 | | /* |
2490 | | * This is the "greedy" DEFLATE compressor. It always chooses the longest match. |
2491 | | */ |
2492 | | static void |
2493 | | deflate_compress_greedy(struct libdeflate_compressor * restrict c, |
2494 | | const u8 *in, size_t in_nbytes, |
2495 | | struct deflate_output_bitstream *os) |
2496 | 0 | { |
2497 | 0 | const u8 *in_next = in; |
2498 | 0 | const u8 *in_end = in_next + in_nbytes; |
2499 | 0 | const u8 *in_cur_base = in_next; |
2500 | 0 | unsigned max_len = DEFLATE_MAX_MATCH_LEN; |
2501 | 0 | unsigned nice_len = MIN(c->nice_match_length, max_len); |
2502 | 0 | u32 next_hashes[2] = {0, 0}; |
2503 | |
|
2504 | 0 | hc_matchfinder_init(&c->p.g.hc_mf); |
2505 | |
|
2506 | 0 | do { |
2507 | | /* Starting a new DEFLATE block */ |
2508 | |
|
2509 | 0 | const u8 * const in_block_begin = in_next; |
2510 | 0 | const u8 * const in_max_block_end = choose_max_block_end( |
2511 | 0 | in_next, in_end, SOFT_MAX_BLOCK_LENGTH); |
2512 | 0 | struct deflate_sequence *seq = c->p.g.sequences; |
2513 | 0 | unsigned min_len; |
2514 | |
|
2515 | 0 | init_block_split_stats(&c->split_stats); |
2516 | 0 | deflate_begin_sequences(c, seq); |
2517 | 0 | min_len = calculate_min_match_len(in_next, |
2518 | 0 | in_max_block_end - in_next, |
2519 | 0 | c->max_search_depth); |
2520 | 0 | do { |
2521 | 0 | u32 length; |
2522 | 0 | u32 offset; |
2523 | |
|
2524 | 0 | adjust_max_and_nice_len(&max_len, &nice_len, |
2525 | 0 | in_end - in_next); |
2526 | 0 | length = hc_matchfinder_longest_match( |
2527 | 0 | &c->p.g.hc_mf, |
2528 | 0 | &in_cur_base, |
2529 | 0 | in_next, |
2530 | 0 | min_len - 1, |
2531 | 0 | max_len, |
2532 | 0 | nice_len, |
2533 | 0 | c->max_search_depth, |
2534 | 0 | next_hashes, |
2535 | 0 | &offset); |
2536 | |
|
2537 | 0 | if (length >= min_len && |
2538 | 0 | (length > DEFLATE_MIN_MATCH_LEN || |
2539 | 0 | offset <= 4096)) { |
2540 | | /* Match found */ |
2541 | 0 | deflate_choose_match(c, length, offset, true, |
2542 | 0 | &seq); |
2543 | 0 | hc_matchfinder_skip_bytes(&c->p.g.hc_mf, |
2544 | 0 | &in_cur_base, |
2545 | 0 | in_next + 1, |
2546 | 0 | in_end, |
2547 | 0 | length - 1, |
2548 | 0 | next_hashes); |
2549 | 0 | in_next += length; |
2550 | 0 | } else { |
2551 | | /* No match found */ |
2552 | 0 | deflate_choose_literal(c, *in_next++, true, |
2553 | 0 | seq); |
2554 | 0 | } |
2555 | | |
2556 | | /* Check if it's time to output another block. */ |
2557 | 0 | } while (in_next < in_max_block_end && |
2558 | 0 | seq < &c->p.g.sequences[SEQ_STORE_LENGTH] && |
2559 | 0 | !should_end_block(&c->split_stats, |
2560 | 0 | in_block_begin, in_next, in_end)); |
2561 | |
|
2562 | 0 | deflate_finish_block(c, os, in_block_begin, |
2563 | 0 | in_next - in_block_begin, |
2564 | 0 | c->p.g.sequences, in_next == in_end); |
2565 | 0 | } while (in_next != in_end); |
2566 | 0 | } |
2567 | | |
2568 | | static forceinline void |
2569 | | deflate_compress_lazy_generic(struct libdeflate_compressor * restrict c, |
2570 | | const u8 *in, size_t in_nbytes, |
2571 | | struct deflate_output_bitstream *os, bool lazy2) |
2572 | 0 | { |
2573 | 0 | const u8 *in_next = in; |
2574 | 0 | const u8 *in_end = in_next + in_nbytes; |
2575 | 0 | const u8 *in_cur_base = in_next; |
2576 | 0 | unsigned max_len = DEFLATE_MAX_MATCH_LEN; |
2577 | 0 | unsigned nice_len = MIN(c->nice_match_length, max_len); |
2578 | 0 | u32 next_hashes[2] = {0, 0}; |
2579 | |
|
2580 | 0 | hc_matchfinder_init(&c->p.g.hc_mf); |
2581 | |
|
2582 | 0 | do { |
2583 | | /* Starting a new DEFLATE block */ |
2584 | |
|
2585 | 0 | const u8 * const in_block_begin = in_next; |
2586 | 0 | const u8 * const in_max_block_end = choose_max_block_end( |
2587 | 0 | in_next, in_end, SOFT_MAX_BLOCK_LENGTH); |
2588 | 0 | const u8 *next_recalc_min_len = |
2589 | 0 | in_next + MIN(in_end - in_next, 10000); |
2590 | 0 | struct deflate_sequence *seq = c->p.g.sequences; |
2591 | 0 | unsigned min_len; |
2592 | |
|
2593 | 0 | init_block_split_stats(&c->split_stats); |
2594 | 0 | deflate_begin_sequences(c, seq); |
2595 | 0 | min_len = calculate_min_match_len(in_next, |
2596 | 0 | in_max_block_end - in_next, |
2597 | 0 | c->max_search_depth); |
2598 | 0 | do { |
2599 | 0 | unsigned cur_len; |
2600 | 0 | unsigned cur_offset; |
2601 | 0 | unsigned next_len; |
2602 | 0 | unsigned next_offset; |
2603 | | |
2604 | | /* |
2605 | | * Recalculate the minimum match length if it hasn't |
2606 | | * been done recently. |
2607 | | */ |
2608 | 0 | if (in_next >= next_recalc_min_len) { |
2609 | 0 | min_len = recalculate_min_match_len( |
2610 | 0 | &c->freqs, |
2611 | 0 | c->max_search_depth); |
2612 | 0 | next_recalc_min_len += |
2613 | 0 | MIN(in_end - next_recalc_min_len, |
2614 | 0 | in_next - in_block_begin); |
2615 | 0 | } |
2616 | | |
2617 | | /* Find the longest match at the current position. */ |
2618 | 0 | adjust_max_and_nice_len(&max_len, &nice_len, |
2619 | 0 | in_end - in_next); |
2620 | 0 | cur_len = hc_matchfinder_longest_match( |
2621 | 0 | &c->p.g.hc_mf, |
2622 | 0 | &in_cur_base, |
2623 | 0 | in_next, |
2624 | 0 | min_len - 1, |
2625 | 0 | max_len, |
2626 | 0 | nice_len, |
2627 | 0 | c->max_search_depth, |
2628 | 0 | next_hashes, |
2629 | 0 | &cur_offset); |
2630 | 0 | if (cur_len < min_len || |
2631 | 0 | (cur_len == DEFLATE_MIN_MATCH_LEN && |
2632 | 0 | cur_offset > 8192)) { |
2633 | | /* No match found. Choose a literal. */ |
2634 | 0 | deflate_choose_literal(c, *in_next++, true, |
2635 | 0 | seq); |
2636 | 0 | continue; |
2637 | 0 | } |
2638 | 0 | in_next++; |
2639 | |
|
2640 | 0 | have_cur_match: |
2641 | | /* |
2642 | | * We have a match at the current position. |
2643 | | * If it's very long, choose it immediately. |
2644 | | */ |
2645 | 0 | if (cur_len >= nice_len) { |
2646 | 0 | deflate_choose_match(c, cur_len, cur_offset, |
2647 | 0 | true, &seq); |
2648 | 0 | hc_matchfinder_skip_bytes(&c->p.g.hc_mf, |
2649 | 0 | &in_cur_base, |
2650 | 0 | in_next, |
2651 | 0 | in_end, |
2652 | 0 | cur_len - 1, |
2653 | 0 | next_hashes); |
2654 | 0 | in_next += cur_len - 1; |
2655 | 0 | continue; |
2656 | 0 | } |
2657 | | |
2658 | | /* |
2659 | | * Try to find a better match at the next position. |
2660 | | * |
2661 | | * Note: since we already have a match at the *current* |
2662 | | * position, we use only half the 'max_search_depth' |
2663 | | * when checking the *next* position. This is a useful |
2664 | | * trade-off because it's more worthwhile to use a |
2665 | | * greater search depth on the initial match. |
2666 | | * |
2667 | | * Note: it's possible to structure the code such that |
2668 | | * there's only one call to longest_match(), which |
2669 | | * handles both the "find the initial match" and "try to |
2670 | | * find a better match" cases. However, it is faster to |
2671 | | * have two call sites, with longest_match() inlined at |
2672 | | * each. |
2673 | | */ |
2674 | 0 | adjust_max_and_nice_len(&max_len, &nice_len, |
2675 | 0 | in_end - in_next); |
2676 | 0 | next_len = hc_matchfinder_longest_match( |
2677 | 0 | &c->p.g.hc_mf, |
2678 | 0 | &in_cur_base, |
2679 | 0 | in_next++, |
2680 | 0 | cur_len - 1, |
2681 | 0 | max_len, |
2682 | 0 | nice_len, |
2683 | 0 | c->max_search_depth >> 1, |
2684 | 0 | next_hashes, |
2685 | 0 | &next_offset); |
2686 | 0 | if (next_len >= cur_len && |
2687 | 0 | 4 * (int)(next_len - cur_len) + |
2688 | 0 | ((int)bsr32(cur_offset) - |
2689 | 0 | (int)bsr32(next_offset)) > 2) { |
2690 | | /* |
2691 | | * Found a better match at the next position. |
2692 | | * Output a literal. Then the next match |
2693 | | * becomes the current match. |
2694 | | */ |
2695 | 0 | deflate_choose_literal(c, *(in_next - 2), true, |
2696 | 0 | seq); |
2697 | 0 | cur_len = next_len; |
2698 | 0 | cur_offset = next_offset; |
2699 | 0 | goto have_cur_match; |
2700 | 0 | } |
2701 | | |
2702 | 0 | if (lazy2) { |
2703 | | /* In lazy2 mode, look ahead another position */ |
2704 | 0 | adjust_max_and_nice_len(&max_len, &nice_len, |
2705 | 0 | in_end - in_next); |
2706 | 0 | next_len = hc_matchfinder_longest_match( |
2707 | 0 | &c->p.g.hc_mf, |
2708 | 0 | &in_cur_base, |
2709 | 0 | in_next++, |
2710 | 0 | cur_len - 1, |
2711 | 0 | max_len, |
2712 | 0 | nice_len, |
2713 | 0 | c->max_search_depth >> 2, |
2714 | 0 | next_hashes, |
2715 | 0 | &next_offset); |
2716 | 0 | if (next_len >= cur_len && |
2717 | 0 | 4 * (int)(next_len - cur_len) + |
2718 | 0 | ((int)bsr32(cur_offset) - |
2719 | 0 | (int)bsr32(next_offset)) > 6) { |
2720 | | /* |
2721 | | * There's a much better match two |
2722 | | * positions ahead, so use two literals. |
2723 | | */ |
2724 | 0 | deflate_choose_literal( |
2725 | 0 | c, *(in_next - 3), true, seq); |
2726 | 0 | deflate_choose_literal( |
2727 | 0 | c, *(in_next - 2), true, seq); |
2728 | 0 | cur_len = next_len; |
2729 | 0 | cur_offset = next_offset; |
2730 | 0 | goto have_cur_match; |
2731 | 0 | } |
2732 | | /* |
2733 | | * No better match at either of the next 2 |
2734 | | * positions. Output the current match. |
2735 | | */ |
2736 | 0 | deflate_choose_match(c, cur_len, cur_offset, |
2737 | 0 | true, &seq); |
2738 | 0 | if (cur_len > 3) { |
2739 | 0 | hc_matchfinder_skip_bytes(&c->p.g.hc_mf, |
2740 | 0 | &in_cur_base, |
2741 | 0 | in_next, |
2742 | 0 | in_end, |
2743 | 0 | cur_len - 3, |
2744 | 0 | next_hashes); |
2745 | 0 | in_next += cur_len - 3; |
2746 | 0 | } |
2747 | 0 | } else { /* !lazy2 */ |
2748 | | /* |
2749 | | * No better match at the next position. Output |
2750 | | * the current match. |
2751 | | */ |
2752 | 0 | deflate_choose_match(c, cur_len, cur_offset, |
2753 | 0 | true, &seq); |
2754 | 0 | hc_matchfinder_skip_bytes(&c->p.g.hc_mf, |
2755 | 0 | &in_cur_base, |
2756 | 0 | in_next, |
2757 | 0 | in_end, |
2758 | 0 | cur_len - 2, |
2759 | 0 | next_hashes); |
2760 | 0 | in_next += cur_len - 2; |
2761 | 0 | } |
2762 | | /* Check if it's time to output another block. */ |
2763 | 0 | } while (in_next < in_max_block_end && |
2764 | 0 | seq < &c->p.g.sequences[SEQ_STORE_LENGTH] && |
2765 | 0 | !should_end_block(&c->split_stats, |
2766 | 0 | in_block_begin, in_next, in_end)); |
2767 | | |
2768 | 0 | deflate_finish_block(c, os, in_block_begin, |
2769 | 0 | in_next - in_block_begin, |
2770 | 0 | c->p.g.sequences, in_next == in_end); |
2771 | 0 | } while (in_next != in_end); |
2772 | 0 | } |
2773 | | |
2774 | | /* |
2775 | | * This is the "lazy" DEFLATE compressor. Before choosing a match, it checks to |
2776 | | * see if there's a better match at the next position. If yes, it outputs a |
2777 | | * literal and continues to the next position. If no, it outputs the match. |
2778 | | */ |
2779 | | static void |
2780 | | deflate_compress_lazy(struct libdeflate_compressor * restrict c, |
2781 | | const u8 *in, size_t in_nbytes, |
2782 | | struct deflate_output_bitstream *os) |
2783 | 0 | { |
2784 | 0 | deflate_compress_lazy_generic(c, in, in_nbytes, os, false); |
2785 | 0 | } |
2786 | | |
2787 | | /* |
2788 | | * The lazy2 compressor. This is similar to the regular lazy one, but it looks |
2789 | | * for a better match at the next 2 positions rather than the next 1. This |
2790 | | * makes it take slightly more time, but compress some inputs slightly more. |
2791 | | */ |
2792 | | static void |
2793 | | deflate_compress_lazy2(struct libdeflate_compressor * restrict c, |
2794 | | const u8 *in, size_t in_nbytes, |
2795 | | struct deflate_output_bitstream *os) |
2796 | 0 | { |
2797 | 0 | deflate_compress_lazy_generic(c, in, in_nbytes, os, true); |
2798 | 0 | } |
2799 | | |
2800 | | #if SUPPORT_NEAR_OPTIMAL_PARSING |
2801 | | |
2802 | | /* |
2803 | | * Follow the minimum-cost path in the graph of possible match/literal choices |
2804 | | * for the current block and compute the frequencies of the Huffman symbols that |
2805 | | * would be needed to output those matches and literals. |
2806 | | */ |
2807 | | static void |
2808 | | deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length) |
2809 | 0 | { |
2810 | 0 | struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0]; |
2811 | 0 | struct deflate_optimum_node *end_node = |
2812 | 0 | &c->p.n.optimum_nodes[block_length]; |
2813 | |
|
2814 | 0 | do { |
2815 | 0 | unsigned length = cur_node->item & OPTIMUM_LEN_MASK; |
2816 | 0 | unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT; |
2817 | |
|
2818 | 0 | if (length == 1) { |
2819 | | /* Literal */ |
2820 | 0 | c->freqs.litlen[offset]++; |
2821 | 0 | } else { |
2822 | | /* Match */ |
2823 | 0 | c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + |
2824 | 0 | deflate_length_slot[length]]++; |
2825 | 0 | c->freqs.offset[c->p.n.offset_slot_full[offset]]++; |
2826 | 0 | } |
2827 | 0 | cur_node += length; |
2828 | 0 | } while (cur_node != end_node); |
2829 | | |
2830 | | /* Tally the end-of-block symbol. */ |
2831 | 0 | c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; |
2832 | 0 | } |
2833 | | |
2834 | | static void |
2835 | | deflate_choose_all_literals(struct libdeflate_compressor *c, |
2836 | | const u8 *block, u32 block_length) |
2837 | 0 | { |
2838 | 0 | u32 i; |
2839 | |
|
2840 | 0 | deflate_reset_symbol_frequencies(c); |
2841 | 0 | for (i = 0; i < block_length; i++) |
2842 | 0 | c->freqs.litlen[block[i]]++; |
2843 | 0 | c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; |
2844 | |
|
2845 | 0 | deflate_make_huffman_codes(&c->freqs, &c->codes); |
2846 | 0 | } |
2847 | | |
2848 | | /* |
2849 | | * Compute the exact cost, in bits, that would be required to output the matches |
2850 | | * and literals described by @c->freqs as a dynamic Huffman block. The litlen |
2851 | | * and offset codes are assumed to have already been built in @c->codes. |
2852 | | */ |
2853 | | static u32 |
2854 | | deflate_compute_true_cost(struct libdeflate_compressor *c) |
2855 | 0 | { |
2856 | 0 | u32 cost = 0; |
2857 | 0 | unsigned sym; |
2858 | |
|
2859 | 0 | deflate_precompute_huffman_header(c); |
2860 | |
|
2861 | 0 | memset(&c->codes.lens.litlen[c->o.precode.num_litlen_syms], 0, |
2862 | 0 | DEFLATE_NUM_LITLEN_SYMS - c->o.precode.num_litlen_syms); |
2863 | |
|
2864 | 0 | cost += 5 + 5 + 4 + (3 * c->o.precode.num_explicit_lens); |
2865 | 0 | for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) { |
2866 | 0 | cost += c->o.precode.freqs[sym] * |
2867 | 0 | (c->o.precode.lens[sym] + |
2868 | 0 | deflate_extra_precode_bits[sym]); |
2869 | 0 | } |
2870 | |
|
2871 | 0 | for (sym = 0; sym < DEFLATE_FIRST_LEN_SYM; sym++) |
2872 | 0 | cost += c->freqs.litlen[sym] * c->codes.lens.litlen[sym]; |
2873 | |
|
2874 | 0 | for (; sym < DEFLATE_FIRST_LEN_SYM + |
2875 | 0 | ARRAY_LEN(deflate_extra_length_bits); sym++) |
2876 | 0 | cost += c->freqs.litlen[sym] * |
2877 | 0 | (c->codes.lens.litlen[sym] + |
2878 | 0 | deflate_extra_length_bits[sym - DEFLATE_FIRST_LEN_SYM]); |
2879 | |
|
2880 | 0 | for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) |
2881 | 0 | cost += c->freqs.offset[sym] * |
2882 | 0 | (c->codes.lens.offset[sym] + |
2883 | 0 | deflate_extra_offset_bits[sym]); |
2884 | 0 | return cost; |
2885 | 0 | } |
2886 | | |
2887 | | /* Set the current cost model from the codeword lengths specified in @lens. */ |
2888 | | static void |
2889 | | deflate_set_costs_from_codes(struct libdeflate_compressor *c, |
2890 | | const struct deflate_lens *lens) |
2891 | 0 | { |
2892 | 0 | unsigned i; |
2893 | | |
2894 | | /* Literals */ |
2895 | 0 | for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { |
2896 | 0 | u32 bits = (lens->litlen[i] ? |
2897 | 0 | lens->litlen[i] : LITERAL_NOSTAT_BITS); |
2898 | |
|
2899 | 0 | c->p.n.costs.literal[i] = bits * BIT_COST; |
2900 | 0 | } |
2901 | | |
2902 | | /* Lengths */ |
2903 | 0 | for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) { |
2904 | 0 | unsigned length_slot = deflate_length_slot[i]; |
2905 | 0 | unsigned litlen_sym = DEFLATE_FIRST_LEN_SYM + length_slot; |
2906 | 0 | u32 bits = (lens->litlen[litlen_sym] ? |
2907 | 0 | lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS); |
2908 | |
|
2909 | 0 | bits += deflate_extra_length_bits[length_slot]; |
2910 | 0 | c->p.n.costs.length[i] = bits * BIT_COST; |
2911 | 0 | } |
2912 | | |
2913 | | /* Offset slots */ |
2914 | 0 | for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) { |
2915 | 0 | u32 bits = (lens->offset[i] ? |
2916 | 0 | lens->offset[i] : OFFSET_NOSTAT_BITS); |
2917 | |
|
2918 | 0 | bits += deflate_extra_offset_bits[i]; |
2919 | 0 | c->p.n.costs.offset_slot[i] = bits * BIT_COST; |
2920 | 0 | } |
2921 | 0 | } |
2922 | | |
2923 | | /* |
2924 | | * This lookup table gives the default cost of a literal symbol and of a length |
2925 | | * symbol, depending on the characteristics of the input data. It was generated |
2926 | | * by scripts/gen_default_litlen_costs.py. |
2927 | | * |
2928 | | * This table is indexed first by the estimated match probability: |
2929 | | * |
2930 | | * i=0: data doesn't contain many matches [match_prob=0.25] |
2931 | | * i=1: neutral [match_prob=0.50] |
2932 | | * i=2: data contains lots of matches [match_prob=0.75] |
2933 | | * |
2934 | | * This lookup produces a subtable which maps the number of distinct used |
2935 | | * literals to the default cost of a literal symbol, i.e.: |
2936 | | * |
2937 | | * int(-log2((1 - match_prob) / num_used_literals) * BIT_COST) |
2938 | | * |
2939 | | * ... for num_used_literals in [1, 256] (and 0, which is copied from 1). This |
2940 | | * accounts for literals usually getting cheaper as the number of distinct |
2941 | | * literals decreases, and as the proportion of literals to matches increases. |
2942 | | * |
2943 | | * The lookup also produces the cost of a length symbol, which is: |
2944 | | * |
2945 | | * int(-log2(match_prob/NUM_LEN_SLOTS) * BIT_COST) |
2946 | | * |
2947 | | * Note: we don't currently assign different costs to different literal symbols, |
2948 | | * or to different length symbols, as this is hard to do in a useful way. |
2949 | | */ |
2950 | | static const struct { |
2951 | | u8 used_lits_to_lit_cost[257]; |
2952 | | u8 len_sym_cost; |
2953 | | } default_litlen_costs[] = { |
2954 | | { /* match_prob = 0.25 */ |
2955 | | .used_lits_to_lit_cost = { |
2956 | | 6, 6, 22, 32, 38, 43, 48, 51, |
2957 | | 54, 57, 59, 61, 64, 65, 67, 69, |
2958 | | 70, 72, 73, 74, 75, 76, 77, 79, |
2959 | | 80, 80, 81, 82, 83, 84, 85, 85, |
2960 | | 86, 87, 88, 88, 89, 89, 90, 91, |
2961 | | 91, 92, 92, 93, 93, 94, 95, 95, |
2962 | | 96, 96, 96, 97, 97, 98, 98, 99, |
2963 | | 99, 99, 100, 100, 101, 101, 101, 102, |
2964 | | 102, 102, 103, 103, 104, 104, 104, 105, |
2965 | | 105, 105, 105, 106, 106, 106, 107, 107, |
2966 | | 107, 108, 108, 108, 108, 109, 109, 109, |
2967 | | 109, 110, 110, 110, 111, 111, 111, 111, |
2968 | | 112, 112, 112, 112, 112, 113, 113, 113, |
2969 | | 113, 114, 114, 114, 114, 114, 115, 115, |
2970 | | 115, 115, 115, 116, 116, 116, 116, 116, |
2971 | | 117, 117, 117, 117, 117, 118, 118, 118, |
2972 | | 118, 118, 118, 119, 119, 119, 119, 119, |
2973 | | 120, 120, 120, 120, 120, 120, 121, 121, |
2974 | | 121, 121, 121, 121, 121, 122, 122, 122, |
2975 | | 122, 122, 122, 123, 123, 123, 123, 123, |
2976 | | 123, 123, 124, 124, 124, 124, 124, 124, |
2977 | | 124, 125, 125, 125, 125, 125, 125, 125, |
2978 | | 125, 126, 126, 126, 126, 126, 126, 126, |
2979 | | 127, 127, 127, 127, 127, 127, 127, 127, |
2980 | | 128, 128, 128, 128, 128, 128, 128, 128, |
2981 | | 128, 129, 129, 129, 129, 129, 129, 129, |
2982 | | 129, 129, 130, 130, 130, 130, 130, 130, |
2983 | | 130, 130, 130, 131, 131, 131, 131, 131, |
2984 | | 131, 131, 131, 131, 131, 132, 132, 132, |
2985 | | 132, 132, 132, 132, 132, 132, 132, 133, |
2986 | | 133, 133, 133, 133, 133, 133, 133, 133, |
2987 | | 133, 134, 134, 134, 134, 134, 134, 134, |
2988 | | 134, |
2989 | | }, |
2990 | | .len_sym_cost = 109, |
2991 | | }, { /* match_prob = 0.5 */ |
2992 | | .used_lits_to_lit_cost = { |
2993 | | 16, 16, 32, 41, 48, 53, 57, 60, |
2994 | | 64, 66, 69, 71, 73, 75, 76, 78, |
2995 | | 80, 81, 82, 83, 85, 86, 87, 88, |
2996 | | 89, 90, 91, 92, 92, 93, 94, 95, |
2997 | | 96, 96, 97, 98, 98, 99, 99, 100, |
2998 | | 101, 101, 102, 102, 103, 103, 104, 104, |
2999 | | 105, 105, 106, 106, 107, 107, 108, 108, |
3000 | | 108, 109, 109, 110, 110, 110, 111, 111, |
3001 | | 112, 112, 112, 113, 113, 113, 114, 114, |
3002 | | 114, 115, 115, 115, 115, 116, 116, 116, |
3003 | | 117, 117, 117, 118, 118, 118, 118, 119, |
3004 | | 119, 119, 119, 120, 120, 120, 120, 121, |
3005 | | 121, 121, 121, 122, 122, 122, 122, 122, |
3006 | | 123, 123, 123, 123, 124, 124, 124, 124, |
3007 | | 124, 125, 125, 125, 125, 125, 126, 126, |
3008 | | 126, 126, 126, 127, 127, 127, 127, 127, |
3009 | | 128, 128, 128, 128, 128, 128, 129, 129, |
3010 | | 129, 129, 129, 129, 130, 130, 130, 130, |
3011 | | 130, 130, 131, 131, 131, 131, 131, 131, |
3012 | | 131, 132, 132, 132, 132, 132, 132, 133, |
3013 | | 133, 133, 133, 133, 133, 133, 134, 134, |
3014 | | 134, 134, 134, 134, 134, 134, 135, 135, |
3015 | | 135, 135, 135, 135, 135, 135, 136, 136, |
3016 | | 136, 136, 136, 136, 136, 136, 137, 137, |
3017 | | 137, 137, 137, 137, 137, 137, 138, 138, |
3018 | | 138, 138, 138, 138, 138, 138, 138, 139, |
3019 | | 139, 139, 139, 139, 139, 139, 139, 139, |
3020 | | 140, 140, 140, 140, 140, 140, 140, 140, |
3021 | | 140, 141, 141, 141, 141, 141, 141, 141, |
3022 | | 141, 141, 141, 142, 142, 142, 142, 142, |
3023 | | 142, 142, 142, 142, 142, 142, 143, 143, |
3024 | | 143, 143, 143, 143, 143, 143, 143, 143, |
3025 | | 144, |
3026 | | }, |
3027 | | .len_sym_cost = 93, |
3028 | | }, { /* match_prob = 0.75 */ |
3029 | | .used_lits_to_lit_cost = { |
3030 | | 32, 32, 48, 57, 64, 69, 73, 76, |
3031 | | 80, 82, 85, 87, 89, 91, 92, 94, |
3032 | | 96, 97, 98, 99, 101, 102, 103, 104, |
3033 | | 105, 106, 107, 108, 108, 109, 110, 111, |
3034 | | 112, 112, 113, 114, 114, 115, 115, 116, |
3035 | | 117, 117, 118, 118, 119, 119, 120, 120, |
3036 | | 121, 121, 122, 122, 123, 123, 124, 124, |
3037 | | 124, 125, 125, 126, 126, 126, 127, 127, |
3038 | | 128, 128, 128, 129, 129, 129, 130, 130, |
3039 | | 130, 131, 131, 131, 131, 132, 132, 132, |
3040 | | 133, 133, 133, 134, 134, 134, 134, 135, |
3041 | | 135, 135, 135, 136, 136, 136, 136, 137, |
3042 | | 137, 137, 137, 138, 138, 138, 138, 138, |
3043 | | 139, 139, 139, 139, 140, 140, 140, 140, |
3044 | | 140, 141, 141, 141, 141, 141, 142, 142, |
3045 | | 142, 142, 142, 143, 143, 143, 143, 143, |
3046 | | 144, 144, 144, 144, 144, 144, 145, 145, |
3047 | | 145, 145, 145, 145, 146, 146, 146, 146, |
3048 | | 146, 146, 147, 147, 147, 147, 147, 147, |
3049 | | 147, 148, 148, 148, 148, 148, 148, 149, |
3050 | | 149, 149, 149, 149, 149, 149, 150, 150, |
3051 | | 150, 150, 150, 150, 150, 150, 151, 151, |
3052 | | 151, 151, 151, 151, 151, 151, 152, 152, |
3053 | | 152, 152, 152, 152, 152, 152, 153, 153, |
3054 | | 153, 153, 153, 153, 153, 153, 154, 154, |
3055 | | 154, 154, 154, 154, 154, 154, 154, 155, |
3056 | | 155, 155, 155, 155, 155, 155, 155, 155, |
3057 | | 156, 156, 156, 156, 156, 156, 156, 156, |
3058 | | 156, 157, 157, 157, 157, 157, 157, 157, |
3059 | | 157, 157, 157, 158, 158, 158, 158, 158, |
3060 | | 158, 158, 158, 158, 158, 158, 159, 159, |
3061 | | 159, 159, 159, 159, 159, 159, 159, 159, |
3062 | | 160, |
3063 | | }, |
3064 | | .len_sym_cost = 84, |
3065 | | }, |
3066 | | }; |
3067 | | |
3068 | | /* |
3069 | | * Choose the default costs for literal and length symbols. These symbols are |
3070 | | * both part of the litlen alphabet. |
3071 | | */ |
3072 | | static void |
3073 | | deflate_choose_default_litlen_costs(struct libdeflate_compressor *c, |
3074 | | const u8 *block_begin, u32 block_length, |
3075 | | u32 *lit_cost, u32 *len_sym_cost) |
3076 | 0 | { |
3077 | 0 | unsigned num_used_literals = 0; |
3078 | 0 | u32 literal_freq = block_length; |
3079 | 0 | u32 match_freq = 0; |
3080 | 0 | u32 cutoff; |
3081 | 0 | u32 i; |
3082 | | |
3083 | | /* Calculate the number of distinct literals that exist in the data. */ |
3084 | 0 | memset(c->freqs.litlen, 0, |
3085 | 0 | DEFLATE_NUM_LITERALS * sizeof(c->freqs.litlen[0])); |
3086 | 0 | cutoff = literal_freq >> 11; /* Ignore literals used very rarely. */ |
3087 | 0 | for (i = 0; i < block_length; i++) |
3088 | 0 | c->freqs.litlen[block_begin[i]]++; |
3089 | 0 | for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { |
3090 | 0 | if (c->freqs.litlen[i] > cutoff) |
3091 | 0 | num_used_literals++; |
3092 | 0 | } |
3093 | 0 | if (num_used_literals == 0) |
3094 | 0 | num_used_literals = 1; |
3095 | | |
3096 | | /* |
3097 | | * Estimate the relative frequency of literals and matches in the |
3098 | | * optimal parsing solution. We don't know the optimal solution, so |
3099 | | * this can only be a very rough estimate. Therefore, we basically use |
3100 | | * the match frequency from a greedy parse. We also apply the min_len |
3101 | | * heuristic used by the greedy and lazy parsers, to avoid counting too |
3102 | | * many matches when literals are cheaper than short matches. |
3103 | | */ |
3104 | 0 | match_freq = 0; |
3105 | 0 | i = choose_min_match_len(num_used_literals, c->max_search_depth); |
3106 | 0 | for (; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) { |
3107 | 0 | match_freq += c->p.n.match_len_freqs[i]; |
3108 | 0 | literal_freq -= i * c->p.n.match_len_freqs[i]; |
3109 | 0 | } |
3110 | 0 | if ((s32)literal_freq < 0) /* shouldn't happen */ |
3111 | 0 | literal_freq = 0; |
3112 | |
|
3113 | 0 | if (match_freq > literal_freq) |
3114 | 0 | i = 2; /* many matches */ |
3115 | 0 | else if (match_freq * 4 > literal_freq) |
3116 | 0 | i = 1; /* neutral */ |
3117 | 0 | else |
3118 | 0 | i = 0; /* few matches */ |
3119 | |
|
3120 | 0 | STATIC_ASSERT(BIT_COST == 16); |
3121 | 0 | *lit_cost = default_litlen_costs[i].used_lits_to_lit_cost[ |
3122 | 0 | num_used_literals]; |
3123 | 0 | *len_sym_cost = default_litlen_costs[i].len_sym_cost; |
3124 | 0 | } |
3125 | | |
3126 | | static forceinline u32 |
3127 | | deflate_default_length_cost(unsigned len, u32 len_sym_cost) |
3128 | 0 | { |
3129 | 0 | unsigned slot = deflate_length_slot[len]; |
3130 | 0 | u32 num_extra_bits = deflate_extra_length_bits[slot]; |
3131 | |
|
3132 | 0 | return len_sym_cost + (num_extra_bits * BIT_COST); |
3133 | 0 | } |
3134 | | |
3135 | | static forceinline u32 |
3136 | | deflate_default_offset_slot_cost(unsigned slot) |
3137 | 0 | { |
3138 | 0 | u32 num_extra_bits = deflate_extra_offset_bits[slot]; |
3139 | | /* |
3140 | | * Assume that all offset symbols are equally probable. |
3141 | | * The resulting cost is 'int(-log2(1/30) * BIT_COST)', |
3142 | | * where 30 is the number of potentially-used offset symbols. |
3143 | | */ |
3144 | 0 | u32 offset_sym_cost = 4*BIT_COST + (907*BIT_COST)/1000; |
3145 | |
|
3146 | 0 | return offset_sym_cost + (num_extra_bits * BIT_COST); |
3147 | 0 | } |
3148 | | |
3149 | | /* Set default symbol costs for the first block's first optimization pass. */ |
3150 | | static void |
3151 | | deflate_set_default_costs(struct libdeflate_compressor *c, |
3152 | | u32 lit_cost, u32 len_sym_cost) |
3153 | 0 | { |
3154 | 0 | unsigned i; |
3155 | | |
3156 | | /* Literals */ |
3157 | 0 | for (i = 0; i < DEFLATE_NUM_LITERALS; i++) |
3158 | 0 | c->p.n.costs.literal[i] = lit_cost; |
3159 | | |
3160 | | /* Lengths */ |
3161 | 0 | for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) |
3162 | 0 | c->p.n.costs.length[i] = |
3163 | 0 | deflate_default_length_cost(i, len_sym_cost); |
3164 | | |
3165 | | /* Offset slots */ |
3166 | 0 | for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) |
3167 | 0 | c->p.n.costs.offset_slot[i] = |
3168 | 0 | deflate_default_offset_slot_cost(i); |
3169 | 0 | } |
3170 | | |
3171 | | static forceinline void |
3172 | | deflate_adjust_cost(u32 *cost_p, u32 default_cost, int change_amount) |
3173 | 0 | { |
3174 | 0 | if (change_amount == 0) |
3175 | | /* Block is very similar to previous; prefer previous costs. */ |
3176 | 0 | *cost_p = (default_cost + 3 * *cost_p) / 4; |
3177 | 0 | else if (change_amount == 1) |
3178 | 0 | *cost_p = (default_cost + *cost_p) / 2; |
3179 | 0 | else if (change_amount == 2) |
3180 | 0 | *cost_p = (5 * default_cost + 3 * *cost_p) / 8; |
3181 | 0 | else |
3182 | | /* Block differs greatly from previous; prefer default costs. */ |
3183 | 0 | *cost_p = (3 * default_cost + *cost_p) / 4; |
3184 | 0 | } |
3185 | | |
3186 | | static forceinline void |
3187 | | deflate_adjust_costs_impl(struct libdeflate_compressor *c, |
3188 | | u32 lit_cost, u32 len_sym_cost, int change_amount) |
3189 | 0 | { |
3190 | 0 | unsigned i; |
3191 | | |
3192 | | /* Literals */ |
3193 | 0 | for (i = 0; i < DEFLATE_NUM_LITERALS; i++) |
3194 | 0 | deflate_adjust_cost(&c->p.n.costs.literal[i], lit_cost, |
3195 | 0 | change_amount); |
3196 | | |
3197 | | /* Lengths */ |
3198 | 0 | for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) |
3199 | 0 | deflate_adjust_cost(&c->p.n.costs.length[i], |
3200 | 0 | deflate_default_length_cost(i, |
3201 | 0 | len_sym_cost), |
3202 | 0 | change_amount); |
3203 | | |
3204 | | /* Offset slots */ |
3205 | 0 | for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) |
3206 | 0 | deflate_adjust_cost(&c->p.n.costs.offset_slot[i], |
3207 | 0 | deflate_default_offset_slot_cost(i), |
3208 | 0 | change_amount); |
3209 | 0 | } |
3210 | | |
3211 | | /* |
3212 | | * Adjust the costs when beginning a new block. |
3213 | | * |
3214 | | * Since the current costs are optimized for the data already, it can be helpful |
3215 | | * to reuse them instead of starting over with the default costs. However, this |
3216 | | * depends on how similar the new block is to the previous block. Therefore, |
3217 | | * use a heuristic to decide how similar the blocks are, and mix together the |
3218 | | * current costs and the default costs accordingly. |
3219 | | */ |
3220 | | static void |
3221 | | deflate_adjust_costs(struct libdeflate_compressor *c, |
3222 | | u32 lit_cost, u32 len_sym_cost) |
3223 | 0 | { |
3224 | 0 | u64 total_delta = 0; |
3225 | 0 | u64 cutoff; |
3226 | 0 | int i; |
3227 | | |
3228 | | /* |
3229 | | * Decide how different the current block is from the previous block, |
3230 | | * using the block splitting statistics from the current and previous |
3231 | | * blocks. The more different the current block is, the more we prefer |
3232 | | * the default costs rather than the previous block's costs. |
3233 | | * |
3234 | | * The algorithm here is similar to the end-of-block check one, but here |
3235 | | * we compare two entire blocks rather than a partial block with a small |
3236 | | * extra part, and therefore we need 64-bit numbers in some places. |
3237 | | */ |
3238 | 0 | for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { |
3239 | 0 | u64 prev = (u64)c->p.n.prev_observations[i] * |
3240 | 0 | c->split_stats.num_observations; |
3241 | 0 | u64 cur = (u64)c->split_stats.observations[i] * |
3242 | 0 | c->p.n.prev_num_observations; |
3243 | |
|
3244 | 0 | total_delta += prev > cur ? prev - cur : cur - prev; |
3245 | 0 | } |
3246 | 0 | cutoff = ((u64)c->p.n.prev_num_observations * |
3247 | 0 | c->split_stats.num_observations * 200) / 512; |
3248 | |
|
3249 | 0 | if (total_delta > 3 * cutoff) |
3250 | | /* Big change in the data; just use the default costs. */ |
3251 | 0 | deflate_set_default_costs(c, lit_cost, len_sym_cost); |
3252 | 0 | else if (4 * total_delta > 9 * cutoff) |
3253 | 0 | deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 3); |
3254 | 0 | else if (2 * total_delta > 3 * cutoff) |
3255 | 0 | deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 2); |
3256 | 0 | else if (2 * total_delta > cutoff) |
3257 | 0 | deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 1); |
3258 | 0 | else |
3259 | 0 | deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 0); |
3260 | 0 | } |
3261 | | |
3262 | | static void |
3263 | | deflate_set_initial_costs(struct libdeflate_compressor *c, |
3264 | | const u8 *block_begin, u32 block_length, |
3265 | | bool is_first_block) |
3266 | 0 | { |
3267 | 0 | u32 lit_cost, len_sym_cost; |
3268 | |
|
3269 | 0 | deflate_choose_default_litlen_costs(c, block_begin, block_length, |
3270 | 0 | &lit_cost, &len_sym_cost); |
3271 | 0 | if (is_first_block) |
3272 | 0 | deflate_set_default_costs(c, lit_cost, len_sym_cost); |
3273 | 0 | else |
3274 | 0 | deflate_adjust_costs(c, lit_cost, len_sym_cost); |
3275 | 0 | } |
3276 | | |
3277 | | /* |
3278 | | * Find the minimum-cost path through the graph of possible match/literal |
3279 | | * choices for this block. |
3280 | | * |
3281 | | * We find the minimum cost path from 'c->p.n.optimum_nodes[0]', which |
3282 | | * represents the node at the beginning of the block, to |
3283 | | * 'c->p.n.optimum_nodes[block_length]', which represents the node at the end of |
3284 | | * the block. Edge costs are evaluated using the cost model 'c->p.n.costs'. |
3285 | | * |
3286 | | * The algorithm works backwards, starting at the end node and proceeding |
3287 | | * backwards one node at a time. At each node, the minimum cost to reach the |
3288 | | * end node is computed and the match/literal choice that begins that path is |
3289 | | * saved. |
3290 | | */ |
3291 | | static void |
3292 | | deflate_find_min_cost_path(struct libdeflate_compressor *c, |
3293 | | const u32 block_length, |
3294 | | const struct lz_match *cache_ptr) |
3295 | 0 | { |
3296 | 0 | struct deflate_optimum_node *end_node = |
3297 | 0 | &c->p.n.optimum_nodes[block_length]; |
3298 | 0 | struct deflate_optimum_node *cur_node = end_node; |
3299 | |
|
3300 | 0 | cur_node->cost_to_end = 0; |
3301 | 0 | do { |
3302 | 0 | unsigned num_matches; |
3303 | 0 | unsigned literal; |
3304 | 0 | u32 best_cost_to_end; |
3305 | |
|
3306 | 0 | cur_node--; |
3307 | 0 | cache_ptr--; |
3308 | |
|
3309 | 0 | num_matches = cache_ptr->length; |
3310 | 0 | literal = cache_ptr->offset; |
3311 | | |
3312 | | /* It's always possible to choose a literal. */ |
3313 | 0 | best_cost_to_end = c->p.n.costs.literal[literal] + |
3314 | 0 | (cur_node + 1)->cost_to_end; |
3315 | 0 | cur_node->item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1; |
3316 | | |
3317 | | /* Also consider matches if there are any. */ |
3318 | 0 | if (num_matches) { |
3319 | 0 | const struct lz_match *match; |
3320 | 0 | unsigned len; |
3321 | 0 | unsigned offset; |
3322 | 0 | unsigned offset_slot; |
3323 | 0 | u32 offset_cost; |
3324 | 0 | u32 cost_to_end; |
3325 | | |
3326 | | /* |
3327 | | * Consider each length from the minimum |
3328 | | * (DEFLATE_MIN_MATCH_LEN) to the length of the longest |
3329 | | * match found at this position. For each length, we |
3330 | | * consider only the smallest offset for which that |
3331 | | * length is available. Although this is not guaranteed |
3332 | | * to be optimal due to the possibility of a larger |
3333 | | * offset costing less than a smaller offset to code, |
3334 | | * this is a very useful heuristic. |
3335 | | */ |
3336 | 0 | match = cache_ptr - num_matches; |
3337 | 0 | len = DEFLATE_MIN_MATCH_LEN; |
3338 | 0 | do { |
3339 | 0 | offset = match->offset; |
3340 | 0 | offset_slot = c->p.n.offset_slot_full[offset]; |
3341 | 0 | offset_cost = |
3342 | 0 | c->p.n.costs.offset_slot[offset_slot]; |
3343 | 0 | do { |
3344 | 0 | cost_to_end = offset_cost + |
3345 | 0 | c->p.n.costs.length[len] + |
3346 | 0 | (cur_node + len)->cost_to_end; |
3347 | 0 | if (cost_to_end < best_cost_to_end) { |
3348 | 0 | best_cost_to_end = cost_to_end; |
3349 | 0 | cur_node->item = len | |
3350 | 0 | ((u32)offset << |
3351 | 0 | OPTIMUM_OFFSET_SHIFT); |
3352 | 0 | } |
3353 | 0 | } while (++len <= match->length); |
3354 | 0 | } while (++match != cache_ptr); |
3355 | 0 | cache_ptr -= num_matches; |
3356 | 0 | } |
3357 | 0 | cur_node->cost_to_end = best_cost_to_end; |
3358 | 0 | } while (cur_node != &c->p.n.optimum_nodes[0]); |
3359 | |
|
3360 | 0 | deflate_reset_symbol_frequencies(c); |
3361 | 0 | deflate_tally_item_list(c, block_length); |
3362 | 0 | deflate_make_huffman_codes(&c->freqs, &c->codes); |
3363 | 0 | } |
3364 | | |
3365 | | /* |
3366 | | * Choose the literals and matches for the current block, then output the block. |
3367 | | * |
3368 | | * To choose the literal/match sequence, we find the minimum-cost path through |
3369 | | * the block's graph of literal/match choices, given a cost model. However, the |
3370 | | * true cost of each symbol is unknown until the Huffman codes have been built, |
3371 | | * but at the same time the Huffman codes depend on the frequencies of chosen |
3372 | | * symbols. Consequently, multiple passes must be used to try to approximate an |
3373 | | * optimal solution. The first pass uses default costs, mixed with the costs |
3374 | | * from the previous block when it seems appropriate. Later passes use the |
3375 | | * Huffman codeword lengths from the previous pass as the costs. |
3376 | | * |
3377 | | * As an alternate strategy, also consider using only literals. The boolean |
3378 | | * returned in *used_only_literals indicates whether that strategy was best. |
3379 | | */ |
3380 | | static void |
3381 | | deflate_optimize_and_flush_block(struct libdeflate_compressor *c, |
3382 | | struct deflate_output_bitstream *os, |
3383 | | const u8 *block_begin, u32 block_length, |
3384 | | const struct lz_match *cache_ptr, |
3385 | | bool is_first_block, bool is_final_block, |
3386 | | bool *used_only_literals) |
3387 | 0 | { |
3388 | 0 | unsigned num_passes_remaining = c->p.n.max_optim_passes; |
3389 | 0 | u32 best_true_cost = UINT32_MAX; |
3390 | 0 | u32 true_cost; |
3391 | 0 | u32 only_lits_cost; |
3392 | 0 | struct deflate_sequence seq_; |
3393 | 0 | struct deflate_sequence *seq = NULL; |
3394 | 0 | u32 i; |
3395 | | |
3396 | | /* |
3397 | | * On some data, using only literals (no matches) ends up being better |
3398 | | * than what the iterative optimization algorithm produces. Therefore, |
3399 | | * consider using only literals. |
3400 | | */ |
3401 | 0 | deflate_choose_all_literals(c, block_begin, block_length); |
3402 | 0 | only_lits_cost = deflate_compute_true_cost(c); |
3403 | | |
3404 | | /* |
3405 | | * Force the block to really end at the desired length, even if some |
3406 | | * matches extend beyond it. |
3407 | | */ |
3408 | 0 | for (i = block_length; |
3409 | 0 | i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN, |
3410 | 0 | ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++) |
3411 | 0 | c->p.n.optimum_nodes[i].cost_to_end = 0x80000000; |
3412 | | |
3413 | | /* Initialize c->p.n.costs with default costs. */ |
3414 | 0 | deflate_set_initial_costs(c, block_begin, block_length, is_first_block); |
3415 | |
|
3416 | 0 | do { |
3417 | | /* |
3418 | | * Find the minimum-cost path for this pass. |
3419 | | * Also set c->freqs and c->codes to match the path. |
3420 | | */ |
3421 | 0 | deflate_find_min_cost_path(c, block_length, cache_ptr); |
3422 | | |
3423 | | /* |
3424 | | * Compute the exact cost of the block if the path were to be |
3425 | | * used. Note that this differs from |
3426 | | * c->p.n.optimum_nodes[0].cost_to_end in that true_cost uses |
3427 | | * the actual Huffman codes instead of c->p.n.costs. |
3428 | | */ |
3429 | 0 | true_cost = deflate_compute_true_cost(c); |
3430 | | |
3431 | | /* |
3432 | | * If the cost didn't improve much from the previous pass, then |
3433 | | * doing more passes probably won't be helpful, so stop early. |
3434 | | */ |
3435 | 0 | if (true_cost + c->p.n.min_improvement_to_continue > |
3436 | 0 | best_true_cost) |
3437 | 0 | break; |
3438 | | |
3439 | 0 | best_true_cost = true_cost; |
3440 | 0 | c->p.n.costs_producing_best_true_cost = c->p.n.costs; |
3441 | | |
3442 | | /* Update the cost model from the Huffman codes. */ |
3443 | 0 | deflate_set_costs_from_codes(c, &c->codes.lens); |
3444 | |
|
3445 | 0 | } while (--num_passes_remaining); |
3446 | | |
3447 | 0 | *used_only_literals = false; |
3448 | 0 | if (only_lits_cost < best_true_cost) { |
3449 | | /* Using only literals ended up being best! */ |
3450 | 0 | deflate_choose_all_literals(c, block_begin, block_length); |
3451 | 0 | deflate_set_costs_from_codes(c, &c->codes.lens); |
3452 | 0 | seq_.litrunlen_and_length = block_length; |
3453 | 0 | seq = &seq_; |
3454 | 0 | *used_only_literals = true; |
3455 | 0 | } else if (true_cost >= |
3456 | 0 | best_true_cost + c->p.n.min_bits_to_use_nonfinal_path) { |
3457 | | /* |
3458 | | * The best solution was actually from a non-final optimization |
3459 | | * pass, so recover and use the min-cost path from that pass. |
3460 | | */ |
3461 | 0 | c->p.n.costs = c->p.n.costs_producing_best_true_cost; |
3462 | 0 | deflate_find_min_cost_path(c, block_length, cache_ptr); |
3463 | 0 | deflate_set_costs_from_codes(c, &c->codes.lens); |
3464 | 0 | } |
3465 | 0 | deflate_flush_block(c, os, block_begin, block_length, seq, |
3466 | 0 | is_final_block); |
3467 | 0 | } |
3468 | | |
3469 | | static void |
3470 | | deflate_near_optimal_init_stats(struct libdeflate_compressor *c) |
3471 | 0 | { |
3472 | 0 | init_block_split_stats(&c->split_stats); |
3473 | 0 | memset(c->p.n.new_match_len_freqs, 0, |
3474 | 0 | sizeof(c->p.n.new_match_len_freqs)); |
3475 | 0 | memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs)); |
3476 | 0 | } |
3477 | | |
3478 | | static void |
3479 | | deflate_near_optimal_merge_stats(struct libdeflate_compressor *c) |
3480 | 0 | { |
3481 | 0 | unsigned i; |
3482 | |
|
3483 | 0 | merge_new_observations(&c->split_stats); |
3484 | 0 | for (i = 0; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) { |
3485 | 0 | c->p.n.match_len_freqs[i] += c->p.n.new_match_len_freqs[i]; |
3486 | 0 | c->p.n.new_match_len_freqs[i] = 0; |
3487 | 0 | } |
3488 | 0 | } |
3489 | | |
3490 | | /* |
3491 | | * Save some literal/match statistics from the previous block so that |
3492 | | * deflate_adjust_costs() will be able to decide how much the current block |
3493 | | * differs from the previous one. |
3494 | | */ |
3495 | | static void |
3496 | | deflate_near_optimal_save_stats(struct libdeflate_compressor *c) |
3497 | 0 | { |
3498 | 0 | int i; |
3499 | |
|
3500 | 0 | for (i = 0; i < NUM_OBSERVATION_TYPES; i++) |
3501 | 0 | c->p.n.prev_observations[i] = c->split_stats.observations[i]; |
3502 | 0 | c->p.n.prev_num_observations = c->split_stats.num_observations; |
3503 | 0 | } |
3504 | | |
3505 | | static void |
3506 | | deflate_near_optimal_clear_old_stats(struct libdeflate_compressor *c) |
3507 | 0 | { |
3508 | 0 | int i; |
3509 | |
|
3510 | 0 | for (i = 0; i < NUM_OBSERVATION_TYPES; i++) |
3511 | 0 | c->split_stats.observations[i] = 0; |
3512 | 0 | c->split_stats.num_observations = 0; |
3513 | 0 | memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs)); |
3514 | 0 | } |
3515 | | |
3516 | | /* |
3517 | | * This is the "near-optimal" DEFLATE compressor. It computes the optimal |
3518 | | * representation of each DEFLATE block using a minimum-cost path search over |
3519 | | * the graph of possible match/literal choices for that block, assuming a |
3520 | | * certain cost for each Huffman symbol. |
3521 | | * |
3522 | | * For several reasons, the end result is not guaranteed to be optimal: |
3523 | | * |
3524 | | * - Nonoptimal choice of blocks |
3525 | | * - Heuristic limitations on which matches are actually considered |
3526 | | * - Symbol costs are unknown until the symbols have already been chosen |
3527 | | * (so iterative optimization must be used) |
3528 | | */ |
3529 | | static void |
3530 | | deflate_compress_near_optimal(struct libdeflate_compressor * restrict c, |
3531 | | const u8 *in, size_t in_nbytes, |
3532 | | struct deflate_output_bitstream *os) |
3533 | 0 | { |
3534 | 0 | const u8 *in_next = in; |
3535 | 0 | const u8 *in_block_begin = in_next; |
3536 | 0 | const u8 *in_end = in_next + in_nbytes; |
3537 | 0 | const u8 *in_cur_base = in_next; |
3538 | 0 | const u8 *in_next_slide = |
3539 | 0 | in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE); |
3540 | 0 | unsigned max_len = DEFLATE_MAX_MATCH_LEN; |
3541 | 0 | unsigned nice_len = MIN(c->nice_match_length, max_len); |
3542 | 0 | struct lz_match *cache_ptr = c->p.n.match_cache; |
3543 | 0 | u32 next_hashes[2] = {0, 0}; |
3544 | 0 | bool prev_block_used_only_literals = false; |
3545 | |
|
3546 | 0 | bt_matchfinder_init(&c->p.n.bt_mf); |
3547 | 0 | deflate_near_optimal_init_stats(c); |
3548 | |
|
3549 | 0 | do { |
3550 | | /* Starting a new DEFLATE block */ |
3551 | 0 | const u8 * const in_max_block_end = choose_max_block_end( |
3552 | 0 | in_block_begin, in_end, SOFT_MAX_BLOCK_LENGTH); |
3553 | 0 | const u8 *prev_end_block_check = NULL; |
3554 | 0 | bool change_detected = false; |
3555 | 0 | const u8 *next_observation = in_next; |
3556 | 0 | unsigned min_len; |
3557 | | |
3558 | | /* |
3559 | | * Use the minimum match length heuristic to improve the |
3560 | | * literal/match statistics gathered during matchfinding. |
3561 | | * However, the actual near-optimal parse won't respect min_len, |
3562 | | * as it can accurately assess the costs of different matches. |
3563 | | * |
3564 | | * If the "use only literals" strategy happened to be the best |
3565 | | * strategy on the previous block, then probably the |
3566 | | * min_match_len heuristic is still not aggressive enough for |
3567 | | * the data, so force gathering literal stats only. |
3568 | | */ |
3569 | 0 | if (prev_block_used_only_literals) |
3570 | 0 | min_len = DEFLATE_MAX_MATCH_LEN + 1; |
3571 | 0 | else |
3572 | 0 | min_len = calculate_min_match_len( |
3573 | 0 | in_block_begin, |
3574 | 0 | in_max_block_end - in_block_begin, |
3575 | 0 | c->max_search_depth); |
3576 | | |
3577 | | /* |
3578 | | * Find matches until we decide to end the block. We end the |
3579 | | * block if any of the following is true: |
3580 | | * |
3581 | | * (1) Maximum block length has been reached |
3582 | | * (2) Match catch may overflow. |
3583 | | * (3) Block split heuristic says to split now. |
3584 | | */ |
3585 | 0 | for (;;) { |
3586 | 0 | struct lz_match *matches; |
3587 | 0 | unsigned best_len; |
3588 | 0 | size_t remaining = in_end - in_next; |
3589 | | |
3590 | | /* Slide the window forward if needed. */ |
3591 | 0 | if (in_next == in_next_slide) { |
3592 | 0 | bt_matchfinder_slide_window(&c->p.n.bt_mf); |
3593 | 0 | in_cur_base = in_next; |
3594 | 0 | in_next_slide = in_next + |
3595 | 0 | MIN(remaining, MATCHFINDER_WINDOW_SIZE); |
3596 | 0 | } |
3597 | | |
3598 | | /* |
3599 | | * Find matches with the current position using the |
3600 | | * binary tree matchfinder and save them in match_cache. |
3601 | | * |
3602 | | * Note: the binary tree matchfinder is more suited for |
3603 | | * optimal parsing than the hash chain matchfinder. The |
3604 | | * reasons for this include: |
3605 | | * |
3606 | | * - The binary tree matchfinder can find more matches |
3607 | | * in the same number of steps. |
3608 | | * - One of the major advantages of hash chains is that |
3609 | | * skipping positions (not searching for matches at |
3610 | | * them) is faster; however, with optimal parsing we |
3611 | | * search for matches at almost all positions, so this |
3612 | | * advantage of hash chains is negated. |
3613 | | */ |
3614 | 0 | matches = cache_ptr; |
3615 | 0 | best_len = 0; |
3616 | 0 | adjust_max_and_nice_len(&max_len, &nice_len, remaining); |
3617 | 0 | if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) { |
3618 | 0 | cache_ptr = bt_matchfinder_get_matches( |
3619 | 0 | &c->p.n.bt_mf, |
3620 | 0 | in_cur_base, |
3621 | 0 | in_next - in_cur_base, |
3622 | 0 | max_len, |
3623 | 0 | nice_len, |
3624 | 0 | c->max_search_depth, |
3625 | 0 | next_hashes, |
3626 | 0 | matches); |
3627 | 0 | if (cache_ptr > matches) |
3628 | 0 | best_len = cache_ptr[-1].length; |
3629 | 0 | } |
3630 | 0 | if (in_next >= next_observation) { |
3631 | 0 | if (best_len >= min_len) { |
3632 | 0 | observe_match(&c->split_stats, |
3633 | 0 | best_len); |
3634 | 0 | next_observation = in_next + best_len; |
3635 | 0 | c->p.n.new_match_len_freqs[best_len]++; |
3636 | 0 | } else { |
3637 | 0 | observe_literal(&c->split_stats, |
3638 | 0 | *in_next); |
3639 | 0 | next_observation = in_next + 1; |
3640 | 0 | } |
3641 | 0 | } |
3642 | |
|
3643 | 0 | cache_ptr->length = cache_ptr - matches; |
3644 | 0 | cache_ptr->offset = *in_next; |
3645 | 0 | in_next++; |
3646 | 0 | cache_ptr++; |
3647 | | |
3648 | | /* |
3649 | | * If there was a very long match found, don't cache any |
3650 | | * matches for the bytes covered by that match. This |
3651 | | * avoids degenerate behavior when compressing highly |
3652 | | * redundant data, where the number of matches can be |
3653 | | * very large. |
3654 | | * |
3655 | | * This heuristic doesn't actually hurt the compression |
3656 | | * ratio very much. If there's a long match, then the |
3657 | | * data must be highly compressible, so it doesn't |
3658 | | * matter much what we do. |
3659 | | */ |
3660 | 0 | if (best_len >= DEFLATE_MIN_MATCH_LEN && |
3661 | 0 | best_len >= nice_len) { |
3662 | 0 | --best_len; |
3663 | 0 | do { |
3664 | 0 | remaining = in_end - in_next; |
3665 | 0 | if (in_next == in_next_slide) { |
3666 | 0 | bt_matchfinder_slide_window( |
3667 | 0 | &c->p.n.bt_mf); |
3668 | 0 | in_cur_base = in_next; |
3669 | 0 | in_next_slide = in_next + |
3670 | 0 | MIN(remaining, |
3671 | 0 | MATCHFINDER_WINDOW_SIZE); |
3672 | 0 | } |
3673 | 0 | adjust_max_and_nice_len(&max_len, |
3674 | 0 | &nice_len, |
3675 | 0 | remaining); |
3676 | 0 | if (max_len >= |
3677 | 0 | BT_MATCHFINDER_REQUIRED_NBYTES) { |
3678 | 0 | bt_matchfinder_skip_byte( |
3679 | 0 | &c->p.n.bt_mf, |
3680 | 0 | in_cur_base, |
3681 | 0 | in_next - in_cur_base, |
3682 | 0 | nice_len, |
3683 | 0 | c->max_search_depth, |
3684 | 0 | next_hashes); |
3685 | 0 | } |
3686 | 0 | cache_ptr->length = 0; |
3687 | 0 | cache_ptr->offset = *in_next; |
3688 | 0 | in_next++; |
3689 | 0 | cache_ptr++; |
3690 | 0 | } while (--best_len); |
3691 | 0 | } |
3692 | | /* Maximum block length or end of input reached? */ |
3693 | 0 | if (in_next >= in_max_block_end) |
3694 | 0 | break; |
3695 | | /* Match cache overflowed? */ |
3696 | 0 | if (cache_ptr >= |
3697 | 0 | &c->p.n.match_cache[MATCH_CACHE_LENGTH]) |
3698 | 0 | break; |
3699 | | /* Not ready to try to end the block (again)? */ |
3700 | 0 | if (!ready_to_check_block(&c->split_stats, |
3701 | 0 | in_block_begin, in_next, |
3702 | 0 | in_end)) |
3703 | 0 | continue; |
3704 | | /* Check if it would be worthwhile to end the block. */ |
3705 | 0 | if (do_end_block_check(&c->split_stats, |
3706 | 0 | in_next - in_block_begin)) { |
3707 | 0 | change_detected = true; |
3708 | 0 | break; |
3709 | 0 | } |
3710 | | /* Ending the block doesn't seem worthwhile here. */ |
3711 | 0 | deflate_near_optimal_merge_stats(c); |
3712 | 0 | prev_end_block_check = in_next; |
3713 | 0 | } |
3714 | | /* |
3715 | | * All the matches for this block have been cached. Now choose |
3716 | | * the precise end of the block and the sequence of items to |
3717 | | * output to represent it, then flush the block. |
3718 | | */ |
3719 | 0 | if (change_detected && prev_end_block_check != NULL) { |
3720 | | /* |
3721 | | * The block is being ended because a recent chunk of |
3722 | | * data differs from the rest of the block. We could |
3723 | | * end the block at 'in_next' like the greedy and lazy |
3724 | | * compressors do, but that's not ideal since it would |
3725 | | * include the differing chunk in the block. The |
3726 | | * near-optimal compressor has time to do a better job. |
3727 | | * Therefore, we rewind to just before the chunk, and |
3728 | | * output a block that only goes up to there. |
3729 | | * |
3730 | | * We then set things up to correctly start the next |
3731 | | * block, considering that some work has already been |
3732 | | * done on it (some matches found and stats gathered). |
3733 | | */ |
3734 | 0 | struct lz_match *orig_cache_ptr = cache_ptr; |
3735 | 0 | const u8 *in_block_end = prev_end_block_check; |
3736 | 0 | u32 block_length = in_block_end - in_block_begin; |
3737 | 0 | bool is_first = (in_block_begin == in); |
3738 | 0 | bool is_final = false; |
3739 | 0 | u32 num_bytes_to_rewind = in_next - in_block_end; |
3740 | 0 | size_t cache_len_rewound; |
3741 | | |
3742 | | /* Rewind the match cache. */ |
3743 | 0 | do { |
3744 | 0 | cache_ptr--; |
3745 | 0 | cache_ptr -= cache_ptr->length; |
3746 | 0 | } while (--num_bytes_to_rewind); |
3747 | 0 | cache_len_rewound = orig_cache_ptr - cache_ptr; |
3748 | |
|
3749 | 0 | deflate_optimize_and_flush_block( |
3750 | 0 | c, os, in_block_begin, |
3751 | 0 | block_length, cache_ptr, |
3752 | 0 | is_first, is_final, |
3753 | 0 | &prev_block_used_only_literals); |
3754 | 0 | memmove(c->p.n.match_cache, cache_ptr, |
3755 | 0 | cache_len_rewound * sizeof(*cache_ptr)); |
3756 | 0 | cache_ptr = &c->p.n.match_cache[cache_len_rewound]; |
3757 | 0 | deflate_near_optimal_save_stats(c); |
3758 | | /* |
3759 | | * Clear the stats for the just-flushed block, leaving |
3760 | | * just the stats for the beginning of the next block. |
3761 | | */ |
3762 | 0 | deflate_near_optimal_clear_old_stats(c); |
3763 | 0 | in_block_begin = in_block_end; |
3764 | 0 | } else { |
3765 | | /* |
3766 | | * The block is being ended for a reason other than a |
3767 | | * differing data chunk being detected. Don't rewind at |
3768 | | * all; just end the block at the current position. |
3769 | | */ |
3770 | 0 | u32 block_length = in_next - in_block_begin; |
3771 | 0 | bool is_first = (in_block_begin == in); |
3772 | 0 | bool is_final = (in_next == in_end); |
3773 | |
|
3774 | 0 | deflate_near_optimal_merge_stats(c); |
3775 | 0 | deflate_optimize_and_flush_block( |
3776 | 0 | c, os, in_block_begin, |
3777 | 0 | block_length, cache_ptr, |
3778 | 0 | is_first, is_final, |
3779 | 0 | &prev_block_used_only_literals); |
3780 | 0 | cache_ptr = &c->p.n.match_cache[0]; |
3781 | 0 | deflate_near_optimal_save_stats(c); |
3782 | 0 | deflate_near_optimal_init_stats(c); |
3783 | 0 | in_block_begin = in_next; |
3784 | 0 | } |
3785 | 0 | } while (in_next != in_end); |
3786 | 0 | } |
3787 | | |
3788 | | /* Initialize c->p.n.offset_slot_full. */ |
3789 | | static void |
3790 | | deflate_init_offset_slot_full(struct libdeflate_compressor *c) |
3791 | 0 | { |
3792 | 0 | unsigned offset_slot; |
3793 | 0 | unsigned offset; |
3794 | 0 | unsigned offset_end; |
3795 | |
|
3796 | 0 | for (offset_slot = 0; offset_slot < ARRAY_LEN(deflate_offset_slot_base); |
3797 | 0 | offset_slot++) { |
3798 | 0 | offset = deflate_offset_slot_base[offset_slot]; |
3799 | 0 | offset_end = offset + |
3800 | 0 | (1 << deflate_extra_offset_bits[offset_slot]); |
3801 | 0 | do { |
3802 | 0 | c->p.n.offset_slot_full[offset] = offset_slot; |
3803 | 0 | } while (++offset != offset_end); |
3804 | 0 | } |
3805 | 0 | } |
3806 | | |
3807 | | #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ |
3808 | | |
3809 | | LIBDEFLATEAPI struct libdeflate_compressor * |
3810 | | libdeflate_alloc_compressor(int compression_level) |
3811 | 0 | { |
3812 | 0 | struct libdeflate_compressor *c; |
3813 | 0 | size_t size = offsetof(struct libdeflate_compressor, p); |
3814 | |
|
3815 | 0 | check_buildtime_parameters(); |
3816 | |
|
3817 | 0 | if (compression_level < 0 || compression_level > 12) |
3818 | 0 | return NULL; |
3819 | | |
3820 | 0 | #if SUPPORT_NEAR_OPTIMAL_PARSING |
3821 | 0 | if (compression_level >= 10) |
3822 | 0 | size += sizeof(c->p.n); |
3823 | 0 | else |
3824 | 0 | #endif |
3825 | 0 | { |
3826 | 0 | if (compression_level >= 2) |
3827 | 0 | size += sizeof(c->p.g); |
3828 | 0 | else if (compression_level == 1) |
3829 | 0 | size += sizeof(c->p.f); |
3830 | 0 | } |
3831 | |
|
3832 | 0 | c = libdeflate_aligned_malloc(MATCHFINDER_MEM_ALIGNMENT, size); |
3833 | 0 | if (!c) |
3834 | 0 | return NULL; |
3835 | | |
3836 | 0 | c->compression_level = compression_level; |
3837 | | |
3838 | | /* |
3839 | | * The higher the compression level, the more we should bother trying to |
3840 | | * compress very small inputs. |
3841 | | */ |
3842 | 0 | c->max_passthrough_size = 55 - (compression_level * 4); |
3843 | |
|
3844 | 0 | switch (compression_level) { |
3845 | 0 | case 0: |
3846 | 0 | c->max_passthrough_size = SIZE_MAX; |
3847 | 0 | c->impl = NULL; /* not used */ |
3848 | 0 | break; |
3849 | 0 | case 1: |
3850 | 0 | c->impl = deflate_compress_fastest; |
3851 | | /* max_search_depth is unused. */ |
3852 | 0 | c->nice_match_length = 32; |
3853 | 0 | break; |
3854 | 0 | case 2: |
3855 | 0 | c->impl = deflate_compress_greedy; |
3856 | 0 | c->max_search_depth = 6; |
3857 | 0 | c->nice_match_length = 10; |
3858 | 0 | break; |
3859 | 0 | case 3: |
3860 | 0 | c->impl = deflate_compress_greedy; |
3861 | 0 | c->max_search_depth = 12; |
3862 | 0 | c->nice_match_length = 14; |
3863 | 0 | break; |
3864 | 0 | case 4: |
3865 | 0 | c->impl = deflate_compress_greedy; |
3866 | 0 | c->max_search_depth = 16; |
3867 | 0 | c->nice_match_length = 30; |
3868 | 0 | break; |
3869 | 0 | case 5: |
3870 | 0 | c->impl = deflate_compress_lazy; |
3871 | 0 | c->max_search_depth = 16; |
3872 | 0 | c->nice_match_length = 30; |
3873 | 0 | break; |
3874 | 0 | case 6: |
3875 | 0 | c->impl = deflate_compress_lazy; |
3876 | 0 | c->max_search_depth = 35; |
3877 | 0 | c->nice_match_length = 65; |
3878 | 0 | break; |
3879 | 0 | case 7: |
3880 | 0 | c->impl = deflate_compress_lazy; |
3881 | 0 | c->max_search_depth = 100; |
3882 | 0 | c->nice_match_length = 130; |
3883 | 0 | break; |
3884 | 0 | case 8: |
3885 | 0 | c->impl = deflate_compress_lazy2; |
3886 | 0 | c->max_search_depth = 300; |
3887 | 0 | c->nice_match_length = DEFLATE_MAX_MATCH_LEN; |
3888 | 0 | break; |
3889 | 0 | case 9: |
3890 | | #if !SUPPORT_NEAR_OPTIMAL_PARSING |
3891 | | default: |
3892 | | #endif |
3893 | 0 | c->impl = deflate_compress_lazy2; |
3894 | 0 | c->max_search_depth = 600; |
3895 | 0 | c->nice_match_length = DEFLATE_MAX_MATCH_LEN; |
3896 | 0 | break; |
3897 | 0 | #if SUPPORT_NEAR_OPTIMAL_PARSING |
3898 | 0 | case 10: |
3899 | 0 | c->impl = deflate_compress_near_optimal; |
3900 | 0 | c->max_search_depth = 35; |
3901 | 0 | c->nice_match_length = 75; |
3902 | 0 | c->p.n.max_optim_passes = 2; |
3903 | 0 | c->p.n.min_improvement_to_continue = 32; |
3904 | 0 | c->p.n.min_bits_to_use_nonfinal_path = 32; |
3905 | 0 | deflate_init_offset_slot_full(c); |
3906 | 0 | break; |
3907 | 0 | case 11: |
3908 | 0 | c->impl = deflate_compress_near_optimal; |
3909 | 0 | c->max_search_depth = 100; |
3910 | 0 | c->nice_match_length = 150; |
3911 | 0 | c->p.n.max_optim_passes = 4; |
3912 | 0 | c->p.n.min_improvement_to_continue = 16; |
3913 | 0 | c->p.n.min_bits_to_use_nonfinal_path = 16; |
3914 | 0 | deflate_init_offset_slot_full(c); |
3915 | 0 | break; |
3916 | 0 | case 12: |
3917 | 0 | default: |
3918 | 0 | c->impl = deflate_compress_near_optimal; |
3919 | 0 | c->max_search_depth = 300; |
3920 | 0 | c->nice_match_length = DEFLATE_MAX_MATCH_LEN; |
3921 | 0 | c->p.n.max_optim_passes = 10; |
3922 | 0 | c->p.n.min_improvement_to_continue = 1; |
3923 | 0 | c->p.n.min_bits_to_use_nonfinal_path = 1; |
3924 | 0 | deflate_init_offset_slot_full(c); |
3925 | 0 | break; |
3926 | 0 | #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ |
3927 | 0 | } |
3928 | | |
3929 | 0 | deflate_init_static_codes(c); |
3930 | |
|
3931 | 0 | return c; |
3932 | 0 | } |
3933 | | |
3934 | | LIBDEFLATEAPI size_t |
3935 | | libdeflate_deflate_compress(struct libdeflate_compressor *c, |
3936 | | const void *in, size_t in_nbytes, |
3937 | | void *out, size_t out_nbytes_avail) |
3938 | 0 | { |
3939 | 0 | struct deflate_output_bitstream os; |
3940 | | |
3941 | | /* |
3942 | | * For extremely short inputs, or for compression level 0, just output |
3943 | | * uncompressed blocks. |
3944 | | */ |
3945 | 0 | if (unlikely(in_nbytes <= c->max_passthrough_size)) |
3946 | 0 | return deflate_compress_none(in, in_nbytes, |
3947 | 0 | out, out_nbytes_avail); |
3948 | | |
3949 | | /* |
3950 | | * Initialize the output bitstream structure. |
3951 | | * |
3952 | | * The end is set to OUTPUT_END_PADDING below the true end, so that |
3953 | | * FLUSH_BITS() can be more efficient. |
3954 | | */ |
3955 | 0 | if (unlikely(out_nbytes_avail <= OUTPUT_END_PADDING)) |
3956 | 0 | return 0; |
3957 | 0 | os.bitbuf = 0; |
3958 | 0 | os.bitcount = 0; |
3959 | 0 | os.next = out; |
3960 | 0 | os.end = os.next + out_nbytes_avail - OUTPUT_END_PADDING; |
3961 | 0 | (*c->impl)(c, in, in_nbytes, &os); |
3962 | | /* |
3963 | | * If 'os.next' reached 'os.end', then either there was not enough space |
3964 | | * in the output buffer, or the compressed size would have been within |
3965 | | * OUTPUT_END_PADDING of the true end. For performance reasons we don't |
3966 | | * distinguish between these cases; we just make sure to return some |
3967 | | * extra space from libdeflate_deflate_compress_bound(). |
3968 | | */ |
3969 | 0 | if (os.next >= os.end) |
3970 | 0 | return 0; |
3971 | 0 | ASSERT(os.bitcount <= 7); |
3972 | 0 | if (os.bitcount) |
3973 | 0 | *os.next++ = os.bitbuf; |
3974 | 0 | return os.next - (u8 *)out; |
3975 | 0 | } |
3976 | | |
3977 | | LIBDEFLATEAPI void |
3978 | | libdeflate_free_compressor(struct libdeflate_compressor *c) |
3979 | 0 | { |
3980 | 0 | libdeflate_aligned_free(c); |
3981 | 0 | } |
3982 | | |
3983 | | unsigned int |
3984 | | libdeflate_get_compression_level(struct libdeflate_compressor *c) |
3985 | 0 | { |
3986 | 0 | return c->compression_level; |
3987 | 0 | } |
3988 | | |
3989 | | LIBDEFLATEAPI size_t |
3990 | | libdeflate_deflate_compress_bound(struct libdeflate_compressor *c, |
3991 | | size_t in_nbytes) |
3992 | 28.9k | { |
3993 | 28.9k | size_t bound = 0; |
3994 | 28.9k | size_t max_blocks; |
3995 | | |
3996 | | /* |
3997 | | * Since the compressor never uses a compressed block when an |
3998 | | * uncompressed block is cheaper, the worst case can be no worse than |
3999 | | * the case where only uncompressed blocks are used. |
4000 | | * |
4001 | | * This is true even though up to 7 bits are "wasted" to byte-align the |
4002 | | * bitstream when a compressed block is followed by an uncompressed |
4003 | | * block. This is because a compressed block wouldn't have been used if |
4004 | | * it wasn't cheaper than an uncompressed block, and uncompressed blocks |
4005 | | * always end on a byte boundary. So the alignment bits will, at worst, |
4006 | | * go up to the place where the uncompressed block would have ended. |
4007 | | */ |
4008 | | |
4009 | | /* |
4010 | | * The minimum length that is passed to deflate_flush_block() is |
4011 | | * MIN_BLOCK_LENGTH bytes, except for the final block if needed. |
4012 | | * |
4013 | | * If deflate_flush_block() decides to use an uncompressed block, it |
4014 | | * actually will (in general) output a series of uncompressed blocks in |
4015 | | * order to stay within the UINT16_MAX limit of DEFLATE. But this can |
4016 | | * be disregarded here as long as '2 * MIN_BLOCK_LENGTH <= UINT16_MAX', |
4017 | | * as in that case this behavior can't result in more blocks than the |
4018 | | * case where deflate_flush_block() is called with min-length inputs. |
4019 | | * |
4020 | | * So the number of uncompressed blocks needed would be bounded by |
4021 | | * DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH). However, empty inputs |
4022 | | * need 1 (empty) block, which gives the final expression below. |
4023 | | */ |
4024 | 28.9k | STATIC_ASSERT(2 * MIN_BLOCK_LENGTH <= UINT16_MAX); |
4025 | 28.9k | max_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1); |
4026 | | |
4027 | | /* |
4028 | | * Each uncompressed block has 5 bytes of overhead, for the BFINAL, |
4029 | | * BTYPE, LEN, and NLEN fields. (For the reason explained earlier, the |
4030 | | * alignment bits at the very start of the block can be disregarded; |
4031 | | * they would otherwise increase the overhead to 6 bytes per block.) |
4032 | | */ |
4033 | 28.9k | bound += 5 * max_blocks; |
4034 | | |
4035 | | /* Account for the data itself, stored uncompressed. */ |
4036 | 28.9k | bound += in_nbytes; |
4037 | | |
4038 | | /* |
4039 | | * Add 1 + OUTPUT_END_PADDING because for performance reasons, the |
4040 | | * compressor doesn't distinguish between cases where there wasn't |
4041 | | * enough space and cases where the compressed size would have been |
4042 | | * 'out_nbytes_avail - OUTPUT_END_PADDING' or greater. Adding |
4043 | | * 1 + OUTPUT_END_PADDING to the bound ensures the needed wiggle room. |
4044 | | */ |
4045 | 28.9k | bound += 1 + OUTPUT_END_PADDING; |
4046 | | |
4047 | 28.9k | return bound; |
4048 | 28.9k | } |