/work/_deps/deflate-src/lib/deflate_decompress.c
Line | Count | Source |
1 | | /* |
2 | | * deflate_decompress.c - a decompressor for DEFLATE |
3 | | * |
4 | | * Copyright 2016 Eric Biggers |
5 | | * |
6 | | * Permission is hereby granted, free of charge, to any person |
7 | | * obtaining a copy of this software and associated documentation |
8 | | * files (the "Software"), to deal in the Software without |
9 | | * restriction, including without limitation the rights to use, |
10 | | * copy, modify, merge, publish, distribute, sublicense, and/or sell |
11 | | * copies of the Software, and to permit persons to whom the |
12 | | * Software is furnished to do so, subject to the following |
13 | | * conditions: |
14 | | * |
15 | | * The above copyright notice and this permission notice shall be |
16 | | * included in all copies or substantial portions of the Software. |
17 | | * |
18 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
19 | | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES |
20 | | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
21 | | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT |
22 | | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |
23 | | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
24 | | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
25 | | * OTHER DEALINGS IN THE SOFTWARE. |
26 | | * |
27 | | * --------------------------------------------------------------------------- |
28 | | * |
29 | | * This is a highly optimized DEFLATE decompressor. It is much faster than |
30 | | * vanilla zlib, typically well over twice as fast, though results vary by CPU. |
31 | | * |
32 | | * Why this is faster than vanilla zlib: |
33 | | * |
34 | | * - Word accesses rather than byte accesses when reading input |
35 | | * - Word accesses rather than byte accesses when copying matches |
36 | | * - Faster Huffman decoding combined with various DEFLATE-specific tricks |
37 | | * - Larger bitbuffer variable that doesn't need to be refilled as often |
38 | | * - Other optimizations to remove unnecessary branches |
39 | | * - Only full-buffer decompression is supported, so the code doesn't need to |
40 | | * support stopping and resuming decompression. |
41 | | * - On x86_64, a version of the decompression routine is compiled with BMI2 |
42 | | * instructions enabled and is used automatically at runtime when supported. |
43 | | */ |
44 | | |
45 | | #include "lib_common.h" |
46 | | #include "deflate_constants.h" |
47 | | |
48 | | /* |
49 | | * If the expression passed to SAFETY_CHECK() evaluates to false, then the |
50 | | * decompression routine immediately returns LIBDEFLATE_BAD_DATA, indicating the |
51 | | * compressed data is invalid. |
52 | | * |
53 | | * Theoretically, these checks could be disabled for specialized applications |
54 | | * where all input to the decompressor will be trusted. |
55 | | */ |
56 | | #if 0 |
57 | | # pragma message("UNSAFE DECOMPRESSION IS ENABLED. THIS MUST ONLY BE USED IF THE DECOMPRESSOR INPUT WILL ALWAYS BE TRUSTED!") |
58 | | # define SAFETY_CHECK(expr) (void)(expr) |
59 | | #else |
60 | 132k | # define SAFETY_CHECK(expr) if (unlikely(!(expr))) return LIBDEFLATE_BAD_DATA |
61 | | #endif |
62 | | |
63 | | /***************************************************************************** |
64 | | * Input bitstream * |
65 | | *****************************************************************************/ |
66 | | |
67 | | /* |
68 | | * The state of the "input bitstream" consists of the following variables: |
69 | | * |
70 | | * - in_next: a pointer to the next unread byte in the input buffer |
71 | | * |
72 | | * - in_end: a pointer to just past the end of the input buffer |
73 | | * |
74 | | * - bitbuf: a word-sized variable containing bits that have been read from |
75 | | * the input buffer or from the implicit appended zero bytes |
76 | | * |
77 | | * - bitsleft: the number of bits in 'bitbuf' available to be consumed. |
78 | | * After REFILL_BITS_BRANCHLESS(), 'bitbuf' can actually |
79 | | * contain more bits than this. However, only the bits counted |
80 | | * by 'bitsleft' can actually be consumed; the rest can only be |
81 | | * used for preloading. |
82 | | * |
83 | | * As a micro-optimization, we allow bits 8 and higher of |
84 | | * 'bitsleft' to contain garbage. When consuming the bits |
85 | | * associated with a decode table entry, this allows us to do |
86 | | * 'bitsleft -= entry' instead of 'bitsleft -= (u8)entry'. |
87 | | * On some CPUs, this helps reduce instruction dependencies. |
88 | | * This does have the disadvantage that 'bitsleft' sometimes |
89 | | * needs to be cast to 'u8', such as when it's used as a shift |
90 | | * amount in REFILL_BITS_BRANCHLESS(). But that one happens |
91 | | * for free since most CPUs ignore high bits in shift amounts. |
92 | | * |
93 | | * - overread_count: the total number of implicit appended zero bytes that |
94 | | * have been loaded into the bitbuffer, including any |
95 | | * counted by 'bitsleft' and any already consumed |
96 | | */ |
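/*
 * Editor's worked example of the micro-optimization described above: if
 * bitsleft == 61 and a decode table entry's low byte says 9 bits were
 * consumed, then 'bitsleft -= entry' and 'bitsleft -= (u8)entry' both leave
 * (u8)bitsleft == 52.  Only bits 8 and up can differ, and those are never
 * trusted: every later use of 'bitsleft' either casts to u8 or relies on the
 * CPU masking the shift amount.
 */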
97 | | |
98 | | /* |
99 | | * The type for the bitbuffer variable ('bitbuf' described above). For best |
100 | | * performance, this should have size equal to a machine word. |
101 | | * |
102 | | * 64-bit platforms have a significant advantage: they get a bigger bitbuffer |
103 | | * which they don't have to refill as often. |
104 | | */ |
105 | | typedef machine_word_t bitbuf_t; |
106 | 339k | #define BITBUF_NBITS (8 * (int)sizeof(bitbuf_t)) |
107 | | |
108 | | /* BITMASK(n) returns a bitmask of length 'n'. */ |
109 | 452k | #define BITMASK(n) (((bitbuf_t)1 << (n)) - 1) |
110 | | |
111 | | /* |
112 | | * MAX_BITSLEFT is the maximum number of consumable bits, i.e. the maximum value |
113 | | * of '(u8)bitsleft'. This is the size of the bitbuffer variable, minus 1 if |
114 | | * the branchless refill method is being used (see REFILL_BITS_BRANCHLESS()). |
115 | | */ |
116 | | #define MAX_BITSLEFT \ |
117 | 339k | (UNALIGNED_ACCESS_IS_FAST ? BITBUF_NBITS - 1 : BITBUF_NBITS) |
118 | | |
119 | | /* |
120 | | * CONSUMABLE_NBITS is the minimum number of bits that are guaranteed to be |
121 | | * consumable (counted in 'bitsleft') immediately after refilling the bitbuffer. |
122 | | * Since only whole bytes can be added to 'bitsleft', the worst case is |
123 | | * 'MAX_BITSLEFT - 7': the smallest amount where another byte doesn't fit. |
124 | | */ |
125 | 198k | #define CONSUMABLE_NBITS (MAX_BITSLEFT - 7) |
126 | | |
127 | | /* |
128 | | * FASTLOOP_PRELOADABLE_NBITS is the minimum number of bits that are guaranteed |
129 | | * to be preloadable immediately after REFILL_BITS_IN_FASTLOOP(). (It is *not* |
130 | | * guaranteed after REFILL_BITS(), since REFILL_BITS() falls back to a |
131 | | * byte-at-a-time refill method near the end of input.) This may exceed the |
132 | | * number of consumable bits (counted by 'bitsleft'). Any bits not counted in |
133 | | * 'bitsleft' can only be used for precomputation and cannot be consumed. |
134 | | */ |
135 | | #define FASTLOOP_PRELOADABLE_NBITS \ |
136 | 0 | (UNALIGNED_ACCESS_IS_FAST ? BITBUF_NBITS : CONSUMABLE_NBITS) |
137 | | |
138 | | /* |
139 | | * PRELOAD_SLACK is the minimum number of bits that are guaranteed to be |
140 | | * preloadable but not consumable, following REFILL_BITS_IN_FASTLOOP() and any |
141 | | * subsequent consumptions. This is 1 bit if the branchless refill method is |
142 | | * being used, and 0 bits otherwise. |
143 | | */ |
144 | | #define PRELOAD_SLACK MAX(0, FASTLOOP_PRELOADABLE_NBITS - MAX_BITSLEFT) |
145 | | |
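/*
 * Editor's worked example (a typical 64-bit target with
 * UNALIGNED_ACCESS_IS_FAST): BITBUF_NBITS = 64, MAX_BITSLEFT = 63,
 * CONSUMABLE_NBITS = 56, FASTLOOP_PRELOADABLE_NBITS = 64, and
 * PRELOAD_SLACK = 1.  So after a fastloop refill at least 56 bits are
 * consumable, up to 64 bits are actually present, and anything beyond
 * (u8)bitsleft may only be used for preloading.
 */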
146 | | /* |
147 | | * CAN_CONSUME(n) is true if it's guaranteed that if the bitbuffer has just been |
148 | | * refilled, then it's always possible to consume 'n' bits from it. 'n' should |
149 | | * be a compile-time constant, to enable compile-time evaluation. |
150 | | */ |
151 | 7.09k | #define CAN_CONSUME(n) (CONSUMABLE_NBITS >= (n)) |
152 | | |
153 | | /* |
154 | | * CAN_CONSUME_AND_THEN_PRELOAD(consume_nbits, preload_nbits) is true if it's |
155 | | * guaranteed that after REFILL_BITS_IN_FASTLOOP(), it's always possible to |
156 | | * consume 'consume_nbits' bits, then preload 'preload_nbits' bits. The |
157 | | * arguments should be compile-time constants to enable compile-time evaluation. |
158 | | */ |
159 | | #define CAN_CONSUME_AND_THEN_PRELOAD(consume_nbits, preload_nbits) \ |
160 | 289k | (CONSUMABLE_NBITS >= (consume_nbits) && \ |
161 | 289k | FASTLOOP_PRELOADABLE_NBITS >= (consume_nbits) + (preload_nbits)) |
162 | | |
163 | | /* |
164 | | * REFILL_BITS_BRANCHLESS() branchlessly refills the bitbuffer variable by |
165 | | * reading the next word from the input buffer and updating 'in_next' and |
166 | | * 'bitsleft' based on how many bits were refilled -- counting whole bytes only. |
167 | | * This is much faster than reading a byte at a time, at least if the CPU is |
168 | | * little endian and supports fast unaligned memory accesses. |
169 | | * |
170 | | * The simplest way of branchlessly updating 'bitsleft' would be: |
171 | | * |
172 | | * bitsleft += (MAX_BITSLEFT - bitsleft) & ~7; |
173 | | * |
174 | | * To make it faster, we define MAX_BITSLEFT to be 'WORDBITS - 1' rather than |
175 | | * WORDBITS, so that in binary it looks like 111111 or 11111. Then, we update |
176 | | * 'bitsleft' by just setting the bits above the low 3 bits: |
177 | | * |
178 | | * bitsleft |= MAX_BITSLEFT & ~7; |
179 | | * |
180 | | * That compiles down to a single instruction like 'or $0x38, %rbp'. Using |
181 | | * 'MAX_BITSLEFT == WORDBITS - 1' also has the advantage that refills can be |
182 | | * done when 'bitsleft == MAX_BITSLEFT' without invoking undefined behavior. |
183 | | * |
184 | | * The simplest way of branchlessly updating 'in_next' would be: |
185 | | * |
186 | | * in_next += (MAX_BITSLEFT - bitsleft) >> 3; |
187 | | * |
188 | | * With 'MAX_BITSLEFT == WORDBITS - 1' we could use an XOR instead, though this |
189 | | * isn't really better: |
190 | | * |
191 | | * in_next += (MAX_BITSLEFT ^ bitsleft) >> 3; |
192 | | * |
193 | | * An alternative which can be marginally better is the following: |
194 | | * |
195 | | * in_next += sizeof(bitbuf_t) - 1; |
196 | | * in_next -= (bitsleft >> 3) & 0x7; |
197 | | * |
198 | | * It seems this would increase the number of CPU instructions from 3 (sub, shr, |
199 | | * add) to 4 (add, shr, and, sub). However, if the CPU has a bitfield |
200 | | * extraction instruction (e.g. arm's ubfx), it stays at 3, and is potentially |
201 | | * more efficient because the length of the longest dependency chain decreases |
202 | | * from 3 to 2. This alternative also has the advantage that it ignores the |
203 | | * high bits in 'bitsleft', so it is compatible with the micro-optimization we |
204 | | * use where we let the high bits of 'bitsleft' contain garbage. |
205 | | */ |
206 | 141k | #define REFILL_BITS_BRANCHLESS() \ |
207 | 141k | do { \ |
208 | 141k | bitbuf |= get_unaligned_leword(in_next) << (u8)bitsleft; \ |
209 | 141k | in_next += sizeof(bitbuf_t) - 1; \ |
210 | 141k | in_next -= (bitsleft >> 3) & 0x7; \ |
211 | 141k | bitsleft |= MAX_BITSLEFT & ~7; \ |
212 | 141k | } while (0) |
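/*
 * Editor's worked example (64-bit, UNALIGNED_ACCESS_IS_FAST): suppose
 * (u8)bitsleft == 5 before the refill.  The next 8 input bytes are loaded and
 * shifted left by 5, landing just above the 5 bits already in 'bitbuf' (the
 * top of the loaded word falls off and is simply reloaded by the next
 * refill).  'in_next' advances by 7 - ((5 >> 3) & 0x7) = 7 bytes, the number
 * of whole bytes that actually fit, and 'bitsleft |= 0x38' turns 5 into
 * 61 = 5 + 7*8, counting those same 7 bytes as consumable.
 */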
213 | | |
214 | | /* |
215 | | * REFILL_BITS() loads bits from the input buffer until the bitbuffer variable |
216 | | * contains at least CONSUMABLE_NBITS consumable bits. |
217 | | * |
218 | | * This checks for the end of input, and it doesn't guarantee |
219 | | * FASTLOOP_PRELOADABLE_NBITS, so it can't be used in the fastloop. |
220 | | * |
221 | | * If we would overread the input buffer, we just don't read anything, leaving |
222 | | * the bits zeroed but marking them filled. This simplifies the decompressor |
223 | | * because it removes the need to always be able to distinguish between real |
224 | | * overreads and overreads caused only by the decompressor's own lookahead. |
225 | | * |
226 | | * We do still keep track of the number of bytes that have been overread, for |
227 | | * two reasons. First, it allows us to determine the exact number of bytes that |
228 | | * were consumed once the stream ends or an uncompressed block is reached. |
229 | | * Second, it allows us to stop early if the overread amount gets so large (more |
230 | | * than sizeof bitbuf) that it can only be caused by a real overread. (The |
231 | | * second part is arguably unneeded, since libdeflate is buffer-based; given |
232 | | * infinite zeroes, it will eventually either completely fill the output buffer |
233 | | * or return an error. However, we do it to be slightly more friendly to the |
234 | | * not-recommended use case of decompressing with an unknown output size.) |
235 | | */ |
236 | 41.8k | #define REFILL_BITS() \ |
237 | 41.8k | do { \ |
238 | 41.8k | if (UNALIGNED_ACCESS_IS_FAST && \ |
239 | 41.8k | likely(in_end - in_next >= sizeof(bitbuf_t))) { \ |
240 | 29.8k | REFILL_BITS_BRANCHLESS(); \ |
241 | 29.8k | } else { \ |
242 | 22.4k | while ((u8)bitsleft < CONSUMABLE_NBITS) { \ |
243 | 10.6k | if (likely(in_next != in_end)) { \ |
244 | 4.43k | bitbuf |= (bitbuf_t)*in_next++ << \ |
245 | 4.43k | (u8)bitsleft; \ |
246 | 6.25k | } else { \ |
247 | 6.25k | overread_count++; \ |
248 | 6.25k | SAFETY_CHECK(overread_count <= \ |
249 | 6.25k | sizeof(bitbuf_t)); \ |
250 | 6.25k | } \ |
251 | 10.6k | bitsleft += 8; \ |
252 | 10.3k | } \ |
253 | 12.0k | } \ |
254 | 41.8k | } while (0) |
255 | | |
256 | | /* |
257 | | * REFILL_BITS_IN_FASTLOOP() is like REFILL_BITS(), but it doesn't check for the |
258 | | * end of the input. It can only be used in the fastloop. |
259 | | */ |
260 | 111k | #define REFILL_BITS_IN_FASTLOOP() \ |
261 | 111k | do { \ |
262 | 111k | STATIC_ASSERT(UNALIGNED_ACCESS_IS_FAST || \ |
263 | 111k | FASTLOOP_PRELOADABLE_NBITS == CONSUMABLE_NBITS); \ |
264 | 111k | if (UNALIGNED_ACCESS_IS_FAST) { \ |
265 | 111k | REFILL_BITS_BRANCHLESS(); \ |
266 | 111k | } else { \ |
267 | 0 | while ((u8)bitsleft < CONSUMABLE_NBITS) { \ |
268 | 0 | bitbuf |= (bitbuf_t)*in_next++ << (u8)bitsleft; \ |
269 | 0 | bitsleft += 8; \ |
270 | 0 | } \ |
271 | 0 | } \ |
272 | 111k | } while (0) |
273 | | |
274 | | /* |
275 | | * This is the worst-case maximum number of output bytes that are written to |
276 | | * during each iteration of the fastloop. The worst case is 2 literals, then a |
277 | | * match of length DEFLATE_MAX_MATCH_LEN. Additionally, some slack space must |
278 | | * be included for the intentional overrun in the match copy implementation. |
279 | | */ |
280 | | #define FASTLOOP_MAX_BYTES_WRITTEN \ |
281 | | (2 + DEFLATE_MAX_MATCH_LEN + (5 * WORDBYTES) - 1) |
282 | | |
283 | | /* |
284 | | * This is the worst-case maximum number of input bytes that are read during |
285 | | * each iteration of the fastloop. To get this value, we first compute the |
286 | | * greatest number of bits that can be refilled during a loop iteration. The |
287 | | * refill at the beginning can add at most MAX_BITSLEFT, and the amount that can |
288 | | * be refilled later is no more than the maximum amount that can be consumed by |
289 | | * 2 literals that don't need a subtable, then a match. We convert this value |
290 | | * to bytes, rounding up; this gives the maximum number of bytes that 'in_next' |
291 | | * can be advanced. Finally, we add sizeof(bitbuf_t) to account for |
292 | | * REFILL_BITS_BRANCHLESS() reading a word past 'in_next'. |
293 | | */ |
294 | | #define FASTLOOP_MAX_BYTES_READ \ |
295 | | (DIV_ROUND_UP(MAX_BITSLEFT + (2 * LITLEN_TABLEBITS) + \ |
296 | | LENGTH_MAXBITS + OFFSET_MAXBITS, 8) + \ |
297 | | sizeof(bitbuf_t)) |
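/*
 * Editor's worked arithmetic for a 64-bit build: MAX_BITSLEFT = 63,
 * 2 * LITLEN_TABLEBITS = 22, LENGTH_MAXBITS = 15 + 5 = 20, and
 * OFFSET_MAXBITS = 15 + 13 = 28, so FASTLOOP_MAX_BYTES_READ =
 * DIV_ROUND_UP(133, 8) + 8 = 25 bytes, and FASTLOOP_MAX_BYTES_WRITTEN =
 * 2 + 258 + 40 - 1 = 299 bytes.  The fastloop is only entered while at least
 * this much input and output remains, which is what makes its unchecked
 * refills and overlong match copies safe.
 */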
298 | | |
299 | | /***************************************************************************** |
300 | | * Huffman decoding * |
301 | | *****************************************************************************/ |
302 | | |
303 | | /* |
304 | | * The fastest way to decode Huffman-encoded data is basically to use a decode |
305 | | * table that maps the next TABLEBITS bits of data to their symbol. Each entry |
306 | | * decode_table[i] maps to the symbol whose codeword is a prefix of 'i'. A |
307 | | * symbol with codeword length 'n' has '2**(TABLEBITS-n)' entries in the table. |
308 | | * |
309 | | * Ideally, TABLEBITS and the maximum codeword length would be the same; some |
310 | | * compression formats are designed with this goal in mind. Unfortunately, in |
311 | | * DEFLATE, the maximum litlen and offset codeword lengths are 15 bits, which is |
312 | | * too large for a practical TABLEBITS. It's not *that* much larger, though, so |
313 | | * the workaround is to use a single level of subtables. In the main table, |
314 | | * entries for prefixes of codewords longer than TABLEBITS contain a "pointer" |
315 | | * to the appropriate subtable along with the number of bits it is indexed with. |
316 | | * |
317 | | * The most efficient way to allocate subtables is to allocate them dynamically |
318 | | * after the main table. The worst-case number of table entries needed, |
319 | | * including subtables, is precomputable; see the ENOUGH constants below. |
320 | | * |
321 | | * A useful optimization is to store the codeword lengths in the decode table so |
322 | | * that they don't have to be looked up by indexing a separate table that maps |
323 | | * symbols to their codeword lengths. We basically do this; however, for the |
324 | | * litlen and offset codes we also implement some DEFLATE-specific optimizations |
325 | | * that build in the consideration of the "extra bits" and the |
326 | | * literal/length/end-of-block division. For the exact decode table entry |
327 | | * format we use, see the definitions of the *_decode_results[] arrays below. |
328 | | */ |
329 | | |
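/*
 * Editor's illustrative sketch (not part of libdeflate; the name
 * decode_one_symbol is hypothetical): the core one-level lookup described
 * above, with the subtable case omitted.  It assumes a table in the format
 * produced by build_decode_table() below, where entry i corresponds to the
 * symbol whose bit-reversed codeword is a prefix of i and the low byte of
 * each entry gives the number of bits to remove from the bitstream.
 */
#if 0	/* illustration only, not compiled */
static u32
decode_one_symbol(const u32 table[], unsigned table_bits,
		  bitbuf_t *bitbuf, u32 *bitsleft)
{
	u32 entry = table[*bitbuf & BITMASK(table_bits)];

	/* A real decoder would handle HUFFDEC_SUBTABLE_POINTER here. */
	*bitbuf >>= (u8)entry;
	*bitsleft -= (u8)entry;
	return entry;
}
#endif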
330 | | |
331 | | /* |
332 | | * These are the TABLEBITS values we use for each of the DEFLATE Huffman codes, |
333 | | * along with their corresponding ENOUGH values. |
334 | | * |
335 | | * For the precode, we use PRECODE_TABLEBITS == 7 since this is the maximum |
336 | | * precode codeword length. This avoids ever needing subtables. |
337 | | * |
338 | | * For the litlen and offset codes, we cannot realistically avoid ever needing |
339 | | * subtables, since litlen and offset codewords can be up to 15 bits. A higher |
340 | | * TABLEBITS reduces the number of lookups that need a subtable, which increases |
341 | | * performance; however, it increases memory usage and makes building the table |
342 | | * take longer, which decreases performance. We choose values that work well in |
343 | | * practice, making subtables rarely needed without making the tables too large. |
344 | | * |
345 | | * Our choice of OFFSET_TABLEBITS == 8 is a bit low; without any special |
346 | | * considerations, 9 would fit the trade-off curve better. However, there is a |
347 | | * performance benefit to using exactly 8 bits when it is a compile-time |
348 | | * constant, as many CPUs can extract the low byte more easily than the low 9 bits. |
349 | | * |
350 | | * zlib treats its equivalents of TABLEBITS as maximum values; whenever it |
351 | | * builds a table, it caps the actual table_bits to the longest codeword. This |
352 | | * makes sense in theory, as there's no need for the table to be any larger than |
353 | | * needed to support the longest codeword. However, having the table bits be a |
354 | | * compile-time constant is beneficial to the performance of the decode loop, so |
355 | | * there is a trade-off. libdeflate currently uses the dynamic table_bits |
356 | | * strategy for the litlen table only, due to its larger maximum size. |
357 | | * PRECODE_TABLEBITS and OFFSET_TABLEBITS are smaller, so going dynamic there |
358 | | * isn't as useful, and OFFSET_TABLEBITS=8 is useful as mentioned above. |
359 | | * |
360 | | * Each TABLEBITS value has a corresponding ENOUGH value that gives the |
361 | | * worst-case maximum number of decode table entries, including the main table |
362 | | * and all subtables. The ENOUGH value depends on three parameters: |
363 | | * |
364 | | * (1) the maximum number of symbols in the code (DEFLATE_NUM_*_SYMS) |
365 | | * (2) the maximum number of main table bits (*_TABLEBITS) |
366 | | * (3) the maximum allowed codeword length (DEFLATE_MAX_*_CODEWORD_LEN) |
367 | | * |
368 | | * The ENOUGH values were computed using the utility program 'enough' from zlib. |
369 | | */ |
370 | 2.27k | #define PRECODE_TABLEBITS 7 |
371 | | #define PRECODE_ENOUGH 128 /* enough 19 7 7 */ |
372 | 4.73k | #define LITLEN_TABLEBITS 11 |
373 | | #define LITLEN_ENOUGH 2342 /* enough 288 11 15 */ |
374 | 4.76k | #define OFFSET_TABLEBITS 8 |
375 | | #define OFFSET_ENOUGH 402 /* enough 32 8 15 */ |
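/*
 * Editor's note on what these bounds contain: the main table always has
 * 2^TABLEBITS entries, so of LITLEN_ENOUGH = 2342, 2^11 = 2048 entries are
 * the main table and at most 294 are subtable entries; of OFFSET_ENOUGH =
 * 402, 2^8 = 256 are the main table and at most 146 are subtable entries;
 * and PRECODE_ENOUGH = 128 = 2^7 exactly, since the precode never needs
 * subtables.
 */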
376 | | |
377 | | /* |
378 | | * make_decode_table_entry() creates a decode table entry for the given symbol |
379 | | * by combining the static part 'decode_results[sym]' with the dynamic part |
380 | | * 'len', which is the remaining codeword length (the codeword length for main |
381 | | * table entries, or the codeword length minus TABLEBITS for subtable entries). |
382 | | * |
383 | | * In all cases, we add 'len' to each of the two low-order bytes to create the |
384 | | * appropriately-formatted decode table entry. See the definitions of the |
385 | | * *_decode_results[] arrays below, where the entry format is described. |
386 | | */ |
387 | | static forceinline u32 |
388 | | make_decode_table_entry(const u32 decode_results[], u32 sym, u32 len) |
389 | 1.09M | { |
390 | 1.09M | return decode_results[sym] + (len << 8) + len; |
391 | 1.09M | } |
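/*
 * Editor's worked example: if literal 'A' (symbol 65) is assigned a 7-bit
 * codeword in the litlen code, each of its main table entries is
 * litlen_decode_results[65] + (7 << 8) + 7, i.e. HUFFDEC_LITERAL set, the
 * literal value 65 in bits 23-16, the codeword length 7 in bits 11-8, and 7
 * in the low byte (the total number of bits to consume).
 */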
392 | | |
393 | | /* |
394 | | * Here is the format of our precode decode table entries. Bits not explicitly |
395 | | * described contain zeroes: |
396 | | * |
397 | | * Bit 20-16: presym |
398 | | * Bit 10-8: codeword length [not used] |
399 | | * Bit 2-0: codeword length |
400 | | * |
401 | | * The precode decode table never has subtables, since we use |
402 | | * PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN. |
403 | | * |
404 | | * precode_decode_results[] contains the static part of the entry for each |
405 | | * symbol. make_decode_table_entry() produces the final entries. |
406 | | */ |
407 | | static const u32 precode_decode_results[] = { |
408 | | #define ENTRY(presym) ((u32)presym << 16) |
409 | | ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) , |
410 | | ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) , |
411 | | ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) , |
412 | | ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) , |
413 | | ENTRY(16) , ENTRY(17) , ENTRY(18) , |
414 | | #undef ENTRY |
415 | | }; |
416 | | |
417 | | /* Litlen and offset decode table entry flags */ |
418 | | |
419 | | /* Indicates a literal entry in the litlen decode table */ |
420 | 266k | #define HUFFDEC_LITERAL 0x80000000 |
421 | | |
422 | | /* Indicates that HUFFDEC_SUBTABLE_POINTER or HUFFDEC_END_OF_BLOCK is set */ |
423 | 1.28k | #define HUFFDEC_EXCEPTIONAL 0x00008000 |
424 | | |
425 | | /* Indicates a subtable pointer entry in the litlen or offset decode table */ |
426 | 1.28k | #define HUFFDEC_SUBTABLE_POINTER 0x00004000 |
427 | | |
428 | | /* Indicates an end-of-block entry in the litlen decode table */ |
429 | | #define HUFFDEC_END_OF_BLOCK 0x00002000 |
430 | | |
431 | | /* Maximum number of bits that can be consumed by decoding a match length */ |
432 | | #define LENGTH_MAXBITS (DEFLATE_MAX_LITLEN_CODEWORD_LEN + \ |
433 | | DEFLATE_MAX_EXTRA_LENGTH_BITS) |
434 | | #define LENGTH_MAXFASTBITS (LITLEN_TABLEBITS /* no subtable needed */ + \ |
435 | | DEFLATE_MAX_EXTRA_LENGTH_BITS) |
436 | | |
437 | | /* |
438 | | * Here is the format of our litlen decode table entries. Bits not explicitly |
439 | | * described contain zeroes: |
440 | | * |
441 | | * Literals: |
442 | | * Bit 31: 1 (HUFFDEC_LITERAL) |
443 | | * Bit 23-16: literal value |
444 | | * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL) |
445 | | * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) |
446 | | * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK) |
447 | | * Bit 11-8: remaining codeword length [not used] |
448 | | * Bit 3-0: remaining codeword length |
449 | | * Lengths: |
450 | | * Bit 31: 0 (!HUFFDEC_LITERAL) |
451 | | * Bit 24-16: length base value |
452 | | * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL) |
453 | | * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) |
454 | | * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK) |
455 | | * Bit 11-8: remaining codeword length |
456 | | * Bit 4-0: remaining codeword length + number of extra bits |
457 | | * End of block: |
458 | | * Bit 31: 0 (!HUFFDEC_LITERAL) |
459 | | * Bit 15: 1 (HUFFDEC_EXCEPTIONAL) |
460 | | * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) |
461 | | * Bit 13: 1 (HUFFDEC_END_OF_BLOCK) |
462 | | * Bit 11-8: remaining codeword length [not used] |
463 | | * Bit 3-0: remaining codeword length |
464 | | * Subtable pointer: |
465 | | * Bit 31: 0 (!HUFFDEC_LITERAL) |
466 | | * Bit 30-16: index of start of subtable |
467 | | * Bit 15: 1 (HUFFDEC_EXCEPTIONAL) |
468 | | * Bit 14: 1 (HUFFDEC_SUBTABLE_POINTER) |
469 | | * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK) |
470 | | * Bit 11-8: number of subtable bits |
471 | | * Bit 3-0: number of main table bits |
472 | | * |
473 | | * This format has several desirable properties: |
474 | | * |
475 | | * - The codeword length, length slot base, and number of extra length bits |
476 | | * are all built in. This eliminates the need to separately look up this |
477 | | * information by indexing separate arrays by symbol or length slot. |
478 | | * |
479 | | * - The HUFFDEC_* flags enable easily distinguishing between the different |
480 | | * types of entries. The HUFFDEC_LITERAL flag enables a fast path for |
481 | | * literals; the high bit is used for this, as some CPUs can test the |
482 | | * high bit more easily than other bits. The HUFFDEC_EXCEPTIONAL flag |
483 | | * makes it possible to detect the two unlikely cases (subtable pointer |
484 | | * and end of block) in a single bit flag test. |
485 | | * |
486 | | * - The low byte is the number of bits that need to be removed from the |
487 | | * bitstream; this makes this value easily accessible, and it enables the |
488 | | * micro-optimization of doing 'bitsleft -= entry' instead of |
489 | | * 'bitsleft -= (u8)entry'. It also includes the number of extra bits, |
490 | | * so they don't need to be removed separately. |
491 | | * |
492 | | * - The flags in bits 15-13 are arranged to be 0 when the |
493 | | * "remaining codeword length" in bits 11-8 is needed, making this value |
494 | | * fairly easily accessible as well via a shift and downcast. |
495 | | * |
496 | | * - Similarly, bits 13-12 are 0 when the "subtable bits" in bits 11-8 are |
497 | | * needed, making it possible to extract this value with '& 0x3F' rather |
498 | | * than '& 0xF'. This value is only used as a shift amount, so this can |
499 | | * save an 'and' instruction as the masking by 0x3F happens implicitly. |
500 | | * |
501 | | * litlen_decode_results[] contains the static part of the entry for each |
502 | | * symbol. make_decode_table_entry() produces the final entries. |
503 | | */ |
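/*
 * Editor's illustration (not libdeflate's actual decode loop, which lives in
 * decompress_template.h).  Worked example of a length entry: litlen symbol
 * 269 has length base 19 and 2 extra bits, so with a 7-bit codeword its main
 * table entries are (19 << 16) | (7 << 8) | 9; a hit consumes 9 bits total
 * and yields a length of 19 plus the value of the 2 extra bits (19..22).
 * The fragment below sketches how the HUFFDEC_LITERAL fast path might look;
 * 'entry', 'out_next', and the enclosing loop are assumed.
 */
#if 0	/* illustration only, not compiled */
	entry = d->u.litlen_decode_table[bitbuf & BITMASK(LITLEN_TABLEBITS)];
	if ((s32)entry < 0) {			/* HUFFDEC_LITERAL is the sign bit */
		bitbuf >>= (u8)entry;		/* remove the codeword */
		bitsleft -= entry;		/* only the low byte matters */
		*out_next++ = (u8)(entry >> 16);	/* literal value */
		continue;
	}
#endif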
504 | | static const u32 litlen_decode_results[] = { |
505 | | |
506 | | /* Literals */ |
507 | | #define ENTRY(literal) (HUFFDEC_LITERAL | ((u32)literal << 16)) |
508 | | ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) , |
509 | | ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) , |
510 | | ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) , |
511 | | ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) , |
512 | | ENTRY(16) , ENTRY(17) , ENTRY(18) , ENTRY(19) , |
513 | | ENTRY(20) , ENTRY(21) , ENTRY(22) , ENTRY(23) , |
514 | | ENTRY(24) , ENTRY(25) , ENTRY(26) , ENTRY(27) , |
515 | | ENTRY(28) , ENTRY(29) , ENTRY(30) , ENTRY(31) , |
516 | | ENTRY(32) , ENTRY(33) , ENTRY(34) , ENTRY(35) , |
517 | | ENTRY(36) , ENTRY(37) , ENTRY(38) , ENTRY(39) , |
518 | | ENTRY(40) , ENTRY(41) , ENTRY(42) , ENTRY(43) , |
519 | | ENTRY(44) , ENTRY(45) , ENTRY(46) , ENTRY(47) , |
520 | | ENTRY(48) , ENTRY(49) , ENTRY(50) , ENTRY(51) , |
521 | | ENTRY(52) , ENTRY(53) , ENTRY(54) , ENTRY(55) , |
522 | | ENTRY(56) , ENTRY(57) , ENTRY(58) , ENTRY(59) , |
523 | | ENTRY(60) , ENTRY(61) , ENTRY(62) , ENTRY(63) , |
524 | | ENTRY(64) , ENTRY(65) , ENTRY(66) , ENTRY(67) , |
525 | | ENTRY(68) , ENTRY(69) , ENTRY(70) , ENTRY(71) , |
526 | | ENTRY(72) , ENTRY(73) , ENTRY(74) , ENTRY(75) , |
527 | | ENTRY(76) , ENTRY(77) , ENTRY(78) , ENTRY(79) , |
528 | | ENTRY(80) , ENTRY(81) , ENTRY(82) , ENTRY(83) , |
529 | | ENTRY(84) , ENTRY(85) , ENTRY(86) , ENTRY(87) , |
530 | | ENTRY(88) , ENTRY(89) , ENTRY(90) , ENTRY(91) , |
531 | | ENTRY(92) , ENTRY(93) , ENTRY(94) , ENTRY(95) , |
532 | | ENTRY(96) , ENTRY(97) , ENTRY(98) , ENTRY(99) , |
533 | | ENTRY(100) , ENTRY(101) , ENTRY(102) , ENTRY(103) , |
534 | | ENTRY(104) , ENTRY(105) , ENTRY(106) , ENTRY(107) , |
535 | | ENTRY(108) , ENTRY(109) , ENTRY(110) , ENTRY(111) , |
536 | | ENTRY(112) , ENTRY(113) , ENTRY(114) , ENTRY(115) , |
537 | | ENTRY(116) , ENTRY(117) , ENTRY(118) , ENTRY(119) , |
538 | | ENTRY(120) , ENTRY(121) , ENTRY(122) , ENTRY(123) , |
539 | | ENTRY(124) , ENTRY(125) , ENTRY(126) , ENTRY(127) , |
540 | | ENTRY(128) , ENTRY(129) , ENTRY(130) , ENTRY(131) , |
541 | | ENTRY(132) , ENTRY(133) , ENTRY(134) , ENTRY(135) , |
542 | | ENTRY(136) , ENTRY(137) , ENTRY(138) , ENTRY(139) , |
543 | | ENTRY(140) , ENTRY(141) , ENTRY(142) , ENTRY(143) , |
544 | | ENTRY(144) , ENTRY(145) , ENTRY(146) , ENTRY(147) , |
545 | | ENTRY(148) , ENTRY(149) , ENTRY(150) , ENTRY(151) , |
546 | | ENTRY(152) , ENTRY(153) , ENTRY(154) , ENTRY(155) , |
547 | | ENTRY(156) , ENTRY(157) , ENTRY(158) , ENTRY(159) , |
548 | | ENTRY(160) , ENTRY(161) , ENTRY(162) , ENTRY(163) , |
549 | | ENTRY(164) , ENTRY(165) , ENTRY(166) , ENTRY(167) , |
550 | | ENTRY(168) , ENTRY(169) , ENTRY(170) , ENTRY(171) , |
551 | | ENTRY(172) , ENTRY(173) , ENTRY(174) , ENTRY(175) , |
552 | | ENTRY(176) , ENTRY(177) , ENTRY(178) , ENTRY(179) , |
553 | | ENTRY(180) , ENTRY(181) , ENTRY(182) , ENTRY(183) , |
554 | | ENTRY(184) , ENTRY(185) , ENTRY(186) , ENTRY(187) , |
555 | | ENTRY(188) , ENTRY(189) , ENTRY(190) , ENTRY(191) , |
556 | | ENTRY(192) , ENTRY(193) , ENTRY(194) , ENTRY(195) , |
557 | | ENTRY(196) , ENTRY(197) , ENTRY(198) , ENTRY(199) , |
558 | | ENTRY(200) , ENTRY(201) , ENTRY(202) , ENTRY(203) , |
559 | | ENTRY(204) , ENTRY(205) , ENTRY(206) , ENTRY(207) , |
560 | | ENTRY(208) , ENTRY(209) , ENTRY(210) , ENTRY(211) , |
561 | | ENTRY(212) , ENTRY(213) , ENTRY(214) , ENTRY(215) , |
562 | | ENTRY(216) , ENTRY(217) , ENTRY(218) , ENTRY(219) , |
563 | | ENTRY(220) , ENTRY(221) , ENTRY(222) , ENTRY(223) , |
564 | | ENTRY(224) , ENTRY(225) , ENTRY(226) , ENTRY(227) , |
565 | | ENTRY(228) , ENTRY(229) , ENTRY(230) , ENTRY(231) , |
566 | | ENTRY(232) , ENTRY(233) , ENTRY(234) , ENTRY(235) , |
567 | | ENTRY(236) , ENTRY(237) , ENTRY(238) , ENTRY(239) , |
568 | | ENTRY(240) , ENTRY(241) , ENTRY(242) , ENTRY(243) , |
569 | | ENTRY(244) , ENTRY(245) , ENTRY(246) , ENTRY(247) , |
570 | | ENTRY(248) , ENTRY(249) , ENTRY(250) , ENTRY(251) , |
571 | | ENTRY(252) , ENTRY(253) , ENTRY(254) , ENTRY(255) , |
572 | | #undef ENTRY |
573 | | |
574 | | /* End of block */ |
575 | | HUFFDEC_EXCEPTIONAL | HUFFDEC_END_OF_BLOCK, |
576 | | |
577 | | /* Lengths */ |
578 | | #define ENTRY(length_base, num_extra_bits) \ |
579 | | (((u32)(length_base) << 16) | (num_extra_bits)) |
580 | | ENTRY(3 , 0) , ENTRY(4 , 0) , ENTRY(5 , 0) , ENTRY(6 , 0), |
581 | | ENTRY(7 , 0) , ENTRY(8 , 0) , ENTRY(9 , 0) , ENTRY(10 , 0), |
582 | | ENTRY(11 , 1) , ENTRY(13 , 1) , ENTRY(15 , 1) , ENTRY(17 , 1), |
583 | | ENTRY(19 , 2) , ENTRY(23 , 2) , ENTRY(27 , 2) , ENTRY(31 , 2), |
584 | | ENTRY(35 , 3) , ENTRY(43 , 3) , ENTRY(51 , 3) , ENTRY(59 , 3), |
585 | | ENTRY(67 , 4) , ENTRY(83 , 4) , ENTRY(99 , 4) , ENTRY(115, 4), |
586 | | ENTRY(131, 5) , ENTRY(163, 5) , ENTRY(195, 5) , ENTRY(227, 5), |
587 | | ENTRY(258, 0) , ENTRY(258, 0) , ENTRY(258, 0) , |
588 | | #undef ENTRY |
589 | | }; |
590 | | |
591 | | /* Maximum number of bits that can be consumed by decoding a match offset */ |
592 | | #define OFFSET_MAXBITS (DEFLATE_MAX_OFFSET_CODEWORD_LEN + \ |
593 | | DEFLATE_MAX_EXTRA_OFFSET_BITS) |
594 | | #define OFFSET_MAXFASTBITS (OFFSET_TABLEBITS /* no subtable needed */ + \ |
595 | | DEFLATE_MAX_EXTRA_OFFSET_BITS) |
596 | | |
597 | | /* |
598 | | * Here is the format of our offset decode table entries. Bits not explicitly |
599 | | * described contain zeroes: |
600 | | * |
601 | | * Offsets: |
602 | | * Bit 31-16: offset base value |
603 | | * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL) |
604 | | * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) |
605 | | * Bit 11-8: remaining codeword length |
606 | | * Bit 4-0: remaining codeword length + number of extra bits |
607 | | * Subtable pointer: |
608 | | * Bit 31-16: index of start of subtable |
609 | | * Bit 15: 1 (HUFFDEC_EXCEPTIONAL) |
610 | | * Bit 14: 1 (HUFFDEC_SUBTABLE_POINTER) |
611 | | * Bit 11-8: number of subtable bits |
612 | | * Bit 3-0: number of main table bits |
613 | | * |
614 | | * These work the same way as the length entries and subtable pointer entries in |
615 | | * the litlen decode table; see litlen_decode_results[] above. |
616 | | */ |
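/*
 * Editor's worked example: offset symbol 13 above has offset base 97 and 5
 * extra bits, so with (say) a 6-bit codeword its entries are
 * (97 << 16) | (6 << 8) | 11; a hit consumes 11 bits and yields an offset of
 * 97 plus the value of the 5 extra bits (97..128).
 */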
617 | | static const u32 offset_decode_results[] = { |
618 | | #define ENTRY(offset_base, num_extra_bits) \ |
619 | | (((u32)(offset_base) << 16) | (num_extra_bits)) |
620 | | ENTRY(1 , 0) , ENTRY(2 , 0) , ENTRY(3 , 0) , ENTRY(4 , 0) , |
621 | | ENTRY(5 , 1) , ENTRY(7 , 1) , ENTRY(9 , 2) , ENTRY(13 , 2) , |
622 | | ENTRY(17 , 3) , ENTRY(25 , 3) , ENTRY(33 , 4) , ENTRY(49 , 4) , |
623 | | ENTRY(65 , 5) , ENTRY(97 , 5) , ENTRY(129 , 6) , ENTRY(193 , 6) , |
624 | | ENTRY(257 , 7) , ENTRY(385 , 7) , ENTRY(513 , 8) , ENTRY(769 , 8) , |
625 | | ENTRY(1025 , 9) , ENTRY(1537 , 9) , ENTRY(2049 , 10) , ENTRY(3073 , 10) , |
626 | | ENTRY(4097 , 11) , ENTRY(6145 , 11) , ENTRY(8193 , 12) , ENTRY(12289 , 12) , |
627 | | ENTRY(16385 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) , |
628 | | #undef ENTRY |
629 | | }; |
630 | | |
631 | | /* |
632 | | * The main DEFLATE decompressor structure. Since libdeflate only supports |
633 | | * full-buffer decompression, this structure doesn't store the entire |
634 | | * decompression state, most of which is in stack variables. Instead, this |
635 | | * struct just contains the decode tables and some temporary arrays used for |
636 | | * building them, as these are too large to comfortably allocate on the stack. |
637 | | * |
638 | | * Storing the decode tables in the decompressor struct also allows the decode |
639 | | * tables for the static codes to be reused whenever two static Huffman blocks |
640 | | * are decoded without an intervening dynamic block, even across streams. |
641 | | */ |
642 | | struct libdeflate_decompressor { |
643 | | |
644 | | /* |
645 | | * The arrays aren't all needed at the same time. 'precode_lens' and |
646 | | * 'precode_decode_table' are unneeded after 'lens' has been filled. |
647 | | * Furthermore, 'lens' need not be retained after building the litlen |
648 | | * and offset decode tables. In fact, 'lens' can be in union with |
649 | | * 'litlen_decode_table' provided that 'offset_decode_table' is separate |
650 | | * and is built first. |
651 | | */ |
652 | | |
653 | | union { |
654 | | u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS]; |
655 | | |
656 | | struct { |
657 | | u8 lens[DEFLATE_NUM_LITLEN_SYMS + |
658 | | DEFLATE_NUM_OFFSET_SYMS + |
659 | | DEFLATE_MAX_LENS_OVERRUN]; |
660 | | |
661 | | u32 precode_decode_table[PRECODE_ENOUGH]; |
662 | | } l; |
663 | | |
664 | | u32 litlen_decode_table[LITLEN_ENOUGH]; |
665 | | } u; |
666 | | |
667 | | u32 offset_decode_table[OFFSET_ENOUGH]; |
668 | | |
669 | | /* used only during build_decode_table() */ |
670 | | u16 sorted_syms[DEFLATE_MAX_NUM_SYMS]; |
671 | | |
672 | | bool static_codes_loaded; |
673 | | unsigned litlen_tablebits; |
674 | | }; |
675 | | |
676 | | /* |
677 | | * Build a table for fast decoding of symbols from a Huffman code. As input, |
678 | | * this function takes the codeword length of each symbol which may be used in |
679 | | * the code. As output, it produces a decode table for the canonical Huffman |
680 | | * code described by the codeword lengths. The decode table is built with the |
681 | | * assumption that it will be indexed with "bit-reversed" codewords, where the |
682 | | * low-order bit is the first bit of the codeword. This format is used for all |
683 | | * Huffman codes in DEFLATE. |
684 | | * |
685 | | * @decode_table |
686 | | * The array in which the decode table will be generated. This array must |
687 | | * have sufficient length; see the definition of the ENOUGH numbers. |
688 | | * @lens |
689 | | * An array which provides, for each symbol, the length of the |
690 | | * corresponding codeword in bits, or 0 if the symbol is unused. This may |
691 | | * alias @decode_table, since nothing is written to @decode_table until all |
692 | | * @lens have been consumed. All codeword lengths are assumed to be <= |
693 | | * @max_codeword_len but are otherwise considered untrusted. If they do |
694 | | * not form a valid Huffman code, then the decode table is not built and |
695 | | * %false is returned. |
696 | | * @num_syms |
697 | | * The number of symbols in the code, including all unused symbols. |
698 | | * @decode_results |
699 | | * An array which gives the incomplete decode result for each symbol. The |
700 | | * needed values in this array will be combined with codeword lengths to |
701 | | * make the final decode table entries using make_decode_table_entry(). |
702 | | * @table_bits |
703 | | * The log base-2 of the number of main table entries to use. |
704 | | * If @table_bits_ret != NULL, then @table_bits is treated as a maximum |
705 | | * value and it will be decreased if a smaller table would be sufficient. |
706 | | * @max_codeword_len |
707 | | * The maximum allowed codeword length for this Huffman code. |
708 | | * Must be <= DEFLATE_MAX_CODEWORD_LEN. |
709 | | * @sorted_syms |
710 | | * A temporary array of length @num_syms. |
711 | | * @table_bits_ret |
712 | | * If non-NULL, then the dynamic table_bits is enabled, and the actual |
713 | | * table_bits value will be returned here. |
714 | | * |
715 | | * Returns %true if successful; %false if the codeword lengths do not form a |
716 | | * valid Huffman code. |
717 | | */ |
718 | | static bool |
719 | | build_decode_table(u32 decode_table[], |
720 | | const u8 lens[], |
721 | | const unsigned num_syms, |
722 | | const u32 decode_results[], |
723 | | unsigned table_bits, |
724 | | unsigned max_codeword_len, |
725 | | u16 *sorted_syms, |
726 | | unsigned *table_bits_ret) |
727 | 11.7k | { |
728 | 11.7k | unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1]; |
729 | 11.7k | unsigned offsets[DEFLATE_MAX_CODEWORD_LEN + 1]; |
730 | 11.7k | unsigned sym; /* current symbol */ |
731 | 11.7k | unsigned codeword; /* current codeword, bit-reversed */ |
732 | 11.7k | unsigned len; /* current codeword length in bits */ |
733 | 11.7k | unsigned count; /* num codewords remaining with this length */ |
734 | 11.7k | u32 codespace_used; /* codespace used out of '2^max_codeword_len' */ |
735 | 11.7k | unsigned cur_table_end; /* end index of current table */ |
736 | 11.7k | unsigned subtable_prefix; /* codeword prefix of current subtable */ |
737 | 11.7k | unsigned subtable_start; /* start index of current subtable */ |
738 | 11.7k | unsigned subtable_bits; /* log2 of current subtable length */ |
739 | | |
740 | | /* Count how many codewords have each length, including 0. */ |
741 | 181k | for (len = 0; len <= max_codeword_len; len++) |
742 | 170k | len_counts[len] = 0; |
743 | 1.51M | for (sym = 0; sym < num_syms; sym++) |
744 | 1.50M | len_counts[lens[sym]]++; |
745 | | |
746 | | /* |
747 | | * Determine the actual maximum codeword length that was used, and |
748 | | * decrease table_bits to it if allowed. |
749 | | */ |
750 | 103k | while (max_codeword_len > 1 && len_counts[max_codeword_len] == 0) |
751 | 91.6k | max_codeword_len--; |
752 | 11.7k | if (table_bits_ret != NULL) { |
753 | 4.73k | table_bits = MIN(table_bits, max_codeword_len); |
754 | 4.73k | *table_bits_ret = table_bits; |
755 | 4.73k | } |
756 | | |
757 | | /* |
758 | | * Sort the symbols primarily by increasing codeword length and |
759 | | * secondarily by increasing symbol value; or equivalently by their |
760 | | * codewords in lexicographic order, since a canonical code is assumed. |
761 | | * |
762 | | * For efficiency, also compute 'codespace_used' in the same pass over |
763 | | * 'len_counts[]' used to build 'offsets[]' for sorting. |
764 | | */ |
765 | | |
766 | | /* Ensure that 'codespace_used' cannot overflow. */ |
767 | 11.7k | STATIC_ASSERT(sizeof(codespace_used) == 4); |
768 | 11.7k | STATIC_ASSERT(UINT32_MAX / (1U << (DEFLATE_MAX_CODEWORD_LEN - 1)) >= |
769 | 11.7k | DEFLATE_MAX_NUM_SYMS); |
770 | | |
771 | 11.7k | offsets[0] = 0; |
772 | 11.7k | offsets[1] = len_counts[0]; |
773 | 11.7k | codespace_used = 0; |
774 | 66.7k | for (len = 1; len < max_codeword_len; len++) { |
775 | 54.9k | offsets[len + 1] = offsets[len] + len_counts[len]; |
776 | 54.9k | codespace_used = (codespace_used << 1) + len_counts[len]; |
777 | 54.9k | } |
778 | 11.7k | codespace_used = (codespace_used << 1) + len_counts[len]; |
779 | | |
780 | 1.51M | for (sym = 0; sym < num_syms; sym++) |
781 | 1.50M | sorted_syms[offsets[lens[sym]]++] = sym; |
782 | | |
783 | 11.7k | sorted_syms += offsets[0]; /* Skip unused symbols */ |
784 | | |
785 | | /* lens[] is done being used, so we can write to decode_table[] now. */ |
786 | | |
787 | | /* |
788 | | * Check whether the lengths form a complete code (exactly fills the |
789 | | * codespace), an incomplete code (doesn't fill the codespace), or an |
790 | | * overfull code (overflows the codespace). A codeword of length 'n' |
791 | | * uses proportion '1/(2^n)' of the codespace. An overfull code is |
792 | | * nonsensical, so is considered invalid. An incomplete code is |
793 | | * considered valid only in two specific cases; see below. |
794 | | */ |
795 | | |
796 | | /* overfull code? */ |
797 | 11.7k | if (unlikely(codespace_used > (1U << max_codeword_len))) |
798 | 51 | return false; |
799 | | |
800 | | /* incomplete code? */ |
801 | 11.7k | if (unlikely(codespace_used < (1U << max_codeword_len))) { |
802 | 2.27k | u32 entry; |
803 | 2.27k | unsigned i; |
804 | | |
805 | 2.27k | if (codespace_used == 0) { |
806 | | /* |
807 | | * An empty code is allowed. This can happen for the |
808 | | * offset code in DEFLATE, since a dynamic Huffman block |
809 | | * need not contain any matches. |
810 | | */ |
811 | | |
812 | | /* sym=0, len=1 (arbitrary) */ |
813 | 2.03k | entry = make_decode_table_entry(decode_results, 0, 1); |
814 | 2.03k | } else { |
815 | | /* |
816 | | * Allow codes with a single used symbol, with codeword |
817 | | * length 1. The DEFLATE RFC is unclear regarding this |
818 | | * case. What zlib's decompressor does is permit this |
819 | | * for the litlen and offset codes and assume the |
820 | | * codeword is '0' rather than '1'. We do the same |
821 | | * except we allow this for precodes too, since there's |
822 | | * no convincing reason to treat the codes differently. |
823 | | * We also assign both codewords '0' and '1' to the |
824 | | * symbol to avoid having to handle '1' specially. |
825 | | */ |
826 | 234 | if (codespace_used != (1U << (max_codeword_len - 1)) || |
827 | 234 | len_counts[1] != 1) |
828 | 92 | return false; |
829 | 142 | entry = make_decode_table_entry(decode_results, |
830 | 142 | *sorted_syms, 1); |
831 | 142 | } |
832 | | /* |
833 | | * Note: the decode table still must be fully initialized, in |
834 | | * case the stream is malformed and contains bits from the part |
835 | | * of the codespace the incomplete code doesn't use. |
836 | | */ |
837 | 526k | for (i = 0; i < (1U << table_bits); i++) |
838 | 524k | decode_table[i] = entry; |
839 | 2.18k | return true; |
840 | 2.27k | } |
841 | | |
842 | | /* |
843 | | * The lengths form a complete code. Now, enumerate the codewords in |
844 | | * lexicographic order and fill the decode table entries for each one. |
845 | | * |
846 | | * First, process all codewords with len <= table_bits. Each one gets |
847 | | * '2^(table_bits-len)' direct entries in the table. |
848 | | * |
849 | | * Since DEFLATE uses bit-reversed codewords, these entries aren't |
850 | | * consecutive but rather are spaced '2^len' entries apart. This makes |
851 | | * filling them naively somewhat awkward and inefficient, since strided |
852 | | * stores are less cache-friendly and preclude the use of word or |
853 | | * vector-at-a-time stores to fill multiple entries per instruction. |
854 | | * |
855 | | * To optimize this, we incrementally double the table size. When |
856 | | * processing codewords with length 'len', the table is treated as |
857 | | * having only '2^len' entries, so each codeword uses just one entry. |
858 | | * Then, each time 'len' is incremented, the table size is doubled and |
859 | | * the first half is copied to the second half. This significantly |
860 | | * improves performance over naively doing strided stores. |
861 | | * |
862 | | * Note that some entries copied for each table doubling may not have |
863 | | * been initialized yet, but it doesn't matter since they're guaranteed |
864 | | * to be initialized later (because the Huffman code is complete). |
865 | | */ |
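	/*
	 * Editor's worked example of the doubling scheme: with table_bits = 7
	 * and shortest codeword length 2, the table is first treated as
	 * having just 4 entries, and each 2-bit (bit-reversed) codeword fills
	 * exactly one of entries 0-3.  When 'len' advances to 3, entries 0-3
	 * are copied to entries 4-7; each 2-bit codeword's entry now appears
	 * twice, 4 apart, while each 3-bit codeword overwrites the one entry
	 * it indexes (which, by prefix-freeness, no shorter codeword needs).
	 * By the time 'len' reaches 7, each 2-bit codeword has its full
	 * 2^(7-2) = 32 entries, spaced 4 apart, without any strided stores
	 * having been issued.
	 */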
866 | 9.44k | codeword = 0; |
867 | 9.44k | len = 1; |
868 | 46.6k | while ((count = len_counts[len]) == 0) |
869 | 37.1k | len++; |
870 | 9.44k | cur_table_end = 1U << len; |
871 | 24.8k | while (len <= table_bits) { |
872 | | /* Process all 'count' codewords with length 'len' bits. */ |
873 | 1.09M | do { |
874 | 1.09M | unsigned bit; |
875 | | |
876 | | /* Fill the first entry for the current codeword. */ |
877 | 1.09M | decode_table[codeword] = |
878 | 1.09M | make_decode_table_entry(decode_results, |
879 | 1.09M | *sorted_syms++, len); |
880 | | |
881 | 1.09M | if (codeword == cur_table_end - 1) { |
882 | | /* Last codeword (all 1's) */ |
883 | 21.1k | for (; len < table_bits; len++) { |
884 | 11.9k | memcpy(&decode_table[cur_table_end], |
885 | 11.9k | decode_table, |
886 | 11.9k | cur_table_end * |
887 | 11.9k | sizeof(decode_table[0])); |
888 | 11.9k | cur_table_end <<= 1; |
889 | 11.9k | } |
890 | 9.21k | return true; |
891 | 9.21k | } |
892 | | /* |
893 | | * To advance to the lexicographically next codeword in |
894 | | * the canonical code, the codeword must be incremented, |
895 | | * then 0's must be appended to the codeword as needed |
896 | | * to match the next codeword's length. |
897 | | * |
898 | | * Since the codeword is bit-reversed, appending 0's is |
899 | | * a no-op. However, incrementing it is nontrivial. To |
900 | | * do so efficiently, use the 'bsr' instruction to find |
901 | | * the last (highest order) 0 bit in the codeword, set |
902 | | * it, and clear any later (higher order) 1 bits. But |
903 | | * 'bsr' actually finds the highest order 1 bit, so to |
904 | | * use it first flip all bits in the codeword by XOR'ing |
905 | | * it with (1U << len) - 1 == cur_table_end - 1. |
906 | | */ |
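		/*
		 * Editor's worked example: with len = 3 and bit-reversed
		 * codeword 0b110 (the canonical codeword 011), we have
		 * cur_table_end - 1 = 7, so bit = 1 << bsr32(0b110 ^ 0b111)
		 * = 1; clearing the bits below it and setting it gives
		 * 0b001, which is the canonical codeword 100 bit-reversed --
		 * exactly the successor of 011.
		 */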
907 | 1.08M | bit = 1U << bsr32(codeword ^ (cur_table_end - 1)); |
908 | 1.08M | codeword &= bit - 1; |
909 | 1.08M | codeword |= bit; |
910 | 1.08M | } while (--count); |
911 | | |
912 | | /* Advance to the next codeword length. */ |
913 | 16.4k | do { |
914 | 16.4k | if (++len <= table_bits) { |
915 | 16.2k | memcpy(&decode_table[cur_table_end], |
916 | 16.2k | decode_table, |
917 | 16.2k | cur_table_end * sizeof(decode_table[0])); |
918 | 16.2k | cur_table_end <<= 1; |
919 | 16.2k | } |
920 | 16.4k | } while ((count = len_counts[len]) == 0); |
921 | 15.3k | } |
922 | | |
923 | | /* Process codewords with len > table_bits. These require subtables. */ |
924 | 236 | cur_table_end = 1U << table_bits; |
925 | 236 | subtable_prefix = -1; |
926 | 236 | subtable_start = 0; |
927 | 3.45k | for (;;) { |
928 | 3.45k | u32 entry; |
929 | 3.45k | unsigned i; |
930 | 3.45k | unsigned stride; |
931 | 3.45k | unsigned bit; |
932 | | |
933 | | /* |
934 | | * Start a new subtable if the first 'table_bits' bits of the |
935 | | * codeword don't match the prefix of the current subtable. |
936 | | */ |
937 | 3.45k | if ((codeword & ((1U << table_bits) - 1)) != subtable_prefix) { |
938 | 1.28k | subtable_prefix = (codeword & ((1U << table_bits) - 1)); |
939 | 1.28k | subtable_start = cur_table_end; |
940 | | /* |
941 | | * Calculate the subtable length. If the codeword has |
942 | | * length 'table_bits + n', then the subtable needs |
943 | | * '2^n' entries. But it may need more; if fewer than |
944 | | * '2^n' codewords of length 'table_bits + n' remain, |
945 | | * then the length will need to be incremented to bring |
946 | | * in longer codewords until the subtable can be |
947 | | * completely filled. Note that because the Huffman |
948 | | * code is complete, it will always be possible to fill |
949 | | * the subtable eventually. |
950 | | */ |
951 | 1.28k | subtable_bits = len - table_bits; |
952 | 1.28k | codespace_used = count; |
953 | 1.43k | while (codespace_used < (1U << subtable_bits)) { |
954 | 155 | subtable_bits++; |
955 | 155 | codespace_used = (codespace_used << 1) + |
956 | 155 | len_counts[table_bits + subtable_bits]; |
957 | 155 | } |
958 | 1.28k | cur_table_end = subtable_start + (1U << subtable_bits); |
959 | | |
960 | | /* |
961 | | * Create the entry that points from the main table to |
962 | | * the subtable. |
963 | | */ |
964 | 1.28k | decode_table[subtable_prefix] = |
965 | 1.28k | ((u32)subtable_start << 16) | |
966 | 1.28k | HUFFDEC_EXCEPTIONAL | |
967 | 1.28k | HUFFDEC_SUBTABLE_POINTER | |
968 | 1.28k | (subtable_bits << 8) | table_bits; |
969 | 1.28k | } |
970 | | |
971 | | /* Fill the subtable entries for the current codeword. */ |
972 | 3.45k | entry = make_decode_table_entry(decode_results, *sorted_syms++, |
973 | 3.45k | len - table_bits); |
974 | 3.45k | i = subtable_start + (codeword >> table_bits); |
975 | 3.45k | stride = 1U << (len - table_bits); |
976 | 3.82k | do { |
977 | 3.82k | decode_table[i] = entry; |
978 | 3.82k | i += stride; |
979 | 3.82k | } while (i < cur_table_end); |
980 | | |
981 | | /* Advance to the next codeword. */ |
982 | 3.45k | if (codeword == (1U << len) - 1) /* last codeword (all 1's)? */ |
983 | 236 | return true; |
984 | 3.21k | bit = 1U << bsr32(codeword ^ ((1U << len) - 1)); |
985 | 3.21k | codeword &= bit - 1; |
986 | 3.21k | codeword |= bit; |
987 | 3.21k | count--; |
988 | 3.46k | while (count == 0) |
989 | 245 | count = len_counts[++len]; |
990 | 3.21k | } |
991 | 236 | } |
992 | | |
993 | | /* Build the decode table for the precode. */ |
994 | | static bool |
995 | | build_precode_decode_table(struct libdeflate_decompressor *d) |
996 | 2.27k | { |
997 | | /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ |
998 | 2.27k | STATIC_ASSERT(PRECODE_TABLEBITS == 7 && PRECODE_ENOUGH == 128); |
999 | | |
1000 | 2.27k | STATIC_ASSERT(ARRAY_LEN(precode_decode_results) == |
1001 | 2.27k | DEFLATE_NUM_PRECODE_SYMS); |
1002 | | |
1003 | 2.27k | return build_decode_table(d->u.l.precode_decode_table, |
1004 | 2.27k | d->u.precode_lens, |
1005 | 2.27k | DEFLATE_NUM_PRECODE_SYMS, |
1006 | 2.27k | precode_decode_results, |
1007 | 2.27k | PRECODE_TABLEBITS, |
1008 | 2.27k | DEFLATE_MAX_PRE_CODEWORD_LEN, |
1009 | 2.27k | d->sorted_syms, |
1010 | 2.27k | NULL); |
1011 | 2.27k | } |
1012 | | |
1013 | | /* Build the decode table for the literal/length code. */ |
1014 | | static bool |
1015 | | build_litlen_decode_table(struct libdeflate_decompressor *d, |
1016 | | unsigned num_litlen_syms, unsigned num_offset_syms) |
1017 | 4.73k | { |
1018 | | /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ |
1019 | 4.73k | STATIC_ASSERT(LITLEN_TABLEBITS == 11 && LITLEN_ENOUGH == 2342); |
1020 | | |
1021 | 4.73k | STATIC_ASSERT(ARRAY_LEN(litlen_decode_results) == |
1022 | 4.73k | DEFLATE_NUM_LITLEN_SYMS); |
1023 | | |
1024 | 4.73k | return build_decode_table(d->u.litlen_decode_table, |
1025 | 4.73k | d->u.l.lens, |
1026 | 4.73k | num_litlen_syms, |
1027 | 4.73k | litlen_decode_results, |
1028 | 4.73k | LITLEN_TABLEBITS, |
1029 | 4.73k | DEFLATE_MAX_LITLEN_CODEWORD_LEN, |
1030 | 4.73k | d->sorted_syms, |
1031 | 4.73k | &d->litlen_tablebits); |
1032 | 4.73k | } |
1033 | | |
1034 | | /* Build the decode table for the offset code. */ |
1035 | | static bool |
1036 | | build_offset_decode_table(struct libdeflate_decompressor *d, |
1037 | | unsigned num_litlen_syms, unsigned num_offset_syms) |
1038 | 4.76k | { |
1039 | | /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ |
1040 | 4.76k | STATIC_ASSERT(OFFSET_TABLEBITS == 8 && OFFSET_ENOUGH == 402); |
1041 | | |
1042 | 4.76k | STATIC_ASSERT(ARRAY_LEN(offset_decode_results) == |
1043 | 4.76k | DEFLATE_NUM_OFFSET_SYMS); |
1044 | | |
1045 | 4.76k | return build_decode_table(d->offset_decode_table, |
1046 | 4.76k | d->u.l.lens + num_litlen_syms, |
1047 | 4.76k | num_offset_syms, |
1048 | 4.76k | offset_decode_results, |
1049 | 4.76k | OFFSET_TABLEBITS, |
1050 | 4.76k | DEFLATE_MAX_OFFSET_CODEWORD_LEN, |
1051 | 4.76k | d->sorted_syms, |
1052 | 4.76k | NULL); |
1053 | 4.76k | } |
1054 | | |
1055 | | /***************************************************************************** |
1056 | | * Main decompression routine |
1057 | | *****************************************************************************/ |
1058 | | |
1059 | | typedef enum libdeflate_result (*decompress_func_t) |
1060 | | (struct libdeflate_decompressor * restrict d, |
1061 | | const void * restrict in, size_t in_nbytes, |
1062 | | void * restrict out, size_t out_nbytes_avail, |
1063 | | size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret); |
1064 | | |
1065 | | #define FUNCNAME deflate_decompress_default |
1066 | | #undef ATTRIBUTES |
1067 | | #undef EXTRACT_VARBITS |
1068 | | #undef EXTRACT_VARBITS8 |
1069 | | #include "decompress_template.h" |
1070 | | |
1071 | | /* Include architecture-specific implementation(s) if available. */ |
1072 | | #undef DEFAULT_IMPL |
1073 | | #undef arch_select_decompress_func |
1074 | | #if defined(ARCH_X86_32) || defined(ARCH_X86_64) |
1075 | | # include "x86/decompress_impl.h" |
1076 | | #endif |
1077 | | |
1078 | | #ifndef DEFAULT_IMPL |
1079 | 0 | # define DEFAULT_IMPL deflate_decompress_default |
1080 | | #endif |
1081 | | |
1082 | | #ifdef arch_select_decompress_func |
1083 | | static enum libdeflate_result |
1084 | | dispatch_decomp(struct libdeflate_decompressor *d, |
1085 | | const void *in, size_t in_nbytes, |
1086 | | void *out, size_t out_nbytes_avail, |
1087 | | size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret); |
1088 | | |
1089 | | static volatile decompress_func_t decompress_impl = dispatch_decomp; |
1090 | | |
1091 | | /* Choose the best implementation at runtime. */ |
1092 | | static enum libdeflate_result |
1093 | | dispatch_decomp(struct libdeflate_decompressor *d, |
1094 | | const void *in, size_t in_nbytes, |
1095 | | void *out, size_t out_nbytes_avail, |
1096 | | size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) |
1097 | 2 | { |
1098 | 2 | decompress_func_t f = arch_select_decompress_func(); |
1099 | | |
1100 | 2 | if (f == NULL) |
1101 | 0 | f = DEFAULT_IMPL; |
1102 | | |
1103 | 2 | decompress_impl = f; |
1104 | 2 | return f(d, in, in_nbytes, out, out_nbytes_avail, |
1105 | 2 | actual_in_nbytes_ret, actual_out_nbytes_ret); |
1106 | 2 | } |
1107 | | #else |
1108 | | /* The best implementation is statically known, so call it directly. */ |
1109 | | # define decompress_impl DEFAULT_IMPL |
1110 | | #endif |
1111 | | |
1112 | | /* |
1113 | | * This is the main DEFLATE decompression routine. See libdeflate.h for the |
1114 | | * documentation. |
1115 | | * |
1116 | | * Note that the real code is in decompress_template.h. The part here just |
1117 | | * handles calling the appropriate implementation depending on the CPU features |
1118 | | * at runtime. |
1119 | | */ |
1120 | | LIBDEFLATEAPI enum libdeflate_result |
1121 | | libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *d, |
1122 | | const void *in, size_t in_nbytes, |
1123 | | void *out, size_t out_nbytes_avail, |
1124 | | size_t *actual_in_nbytes_ret, |
1125 | | size_t *actual_out_nbytes_ret) |
1126 | 1.24k | { |
1127 | 1.24k | return decompress_impl(d, in, in_nbytes, out, out_nbytes_avail, |
1128 | 1.24k | actual_in_nbytes_ret, actual_out_nbytes_ret); |
1129 | 1.24k | } |
1130 | | |
1131 | | LIBDEFLATEAPI enum libdeflate_result |
1132 | | libdeflate_deflate_decompress(struct libdeflate_decompressor *d, |
1133 | | const void *in, size_t in_nbytes, |
1134 | | void *out, size_t out_nbytes_avail, |
1135 | | size_t *actual_out_nbytes_ret) |
1136 | 0 | { |
1137 | 0 | return libdeflate_deflate_decompress_ex(d, in, in_nbytes, |
1138 | 0 | out, out_nbytes_avail, |
1139 | 0 | NULL, actual_out_nbytes_ret); |
1140 | 0 | } |
1141 | | |
1142 | | LIBDEFLATEAPI struct libdeflate_decompressor * |
1143 | | libdeflate_alloc_decompressor(void) |
1144 | 1.51k | { |
1145 | | /* |
1146 | | * Note that only certain parts of the decompressor actually must be |
1147 | | * initialized here: |
1148 | | * |
1149 | | * - 'static_codes_loaded' must be initialized to false. |
1150 | | * |
1151 | | * - The first half of the main portion of each decode table must be |
1152 | | * initialized to any value, to avoid reading from uninitialized |
1153 | | * memory during table expansion in build_decode_table(). (Although, |
1154 | | * this is really just to avoid warnings with dynamic tools like |
1155 | | * valgrind, since build_decode_table() is guaranteed to initialize |
1156 | | * all entries eventually anyway.) |
1157 | | * |
1158 | | * But for simplicity, we currently just zero the whole decompressor. |
1159 | | */ |
1160 | 1.51k | struct libdeflate_decompressor *d = libdeflate_malloc(sizeof(*d)); |
1161 | | |
1162 | 1.51k | if (d == NULL) |
1163 | 0 | return NULL; |
1164 | 1.51k | memset(d, 0, sizeof(*d)); |
1165 | 1.51k | return d; |
1166 | 1.51k | } |
1167 | | |
1168 | | LIBDEFLATEAPI void |
1169 | | libdeflate_free_decompressor(struct libdeflate_decompressor *d) |
1170 | 1.51k | { |
1171 | 1.51k | libdeflate_free(d); |
1172 | 1.51k | } |
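/*
 * Editor's usage sketch (not part of libdeflate; the wrapper name
 * decompress_buffer is hypothetical): typical full-buffer decompression with
 * the API defined above.  The caller must supply an output buffer large
 * enough for the whole decompressed result, since only full-buffer
 * decompression is supported.
 */
#if 0	/* illustration only, not compiled */
#include <libdeflate.h>

static int
decompress_buffer(const void *in, size_t in_nbytes,
		  void *out, size_t out_nbytes_avail, size_t *out_nbytes)
{
	struct libdeflate_decompressor *d = libdeflate_alloc_decompressor();
	enum libdeflate_result res;

	if (d == NULL)
		return -1;
	res = libdeflate_deflate_decompress(d, in, in_nbytes,
					    out, out_nbytes_avail, out_nbytes);
	libdeflate_free_decompressor(d);
	return res == LIBDEFLATE_SUCCESS ? 0 : -1;
}
#endif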