/src/libdeflate/lib/decompress_template.h
Line | Count | Source |
1 | | /* |
2 | | * decompress_template.h |
3 | | * |
4 | | * Copyright 2016 Eric Biggers |
5 | | * |
6 | | * Permission is hereby granted, free of charge, to any person |
7 | | * obtaining a copy of this software and associated documentation |
8 | | * files (the "Software"), to deal in the Software without |
9 | | * restriction, including without limitation the rights to use, |
10 | | * copy, modify, merge, publish, distribute, sublicense, and/or sell |
11 | | * copies of the Software, and to permit persons to whom the |
12 | | * Software is furnished to do so, subject to the following |
13 | | * conditions: |
14 | | * |
15 | | * The above copyright notice and this permission notice shall be |
16 | | * included in all copies or substantial portions of the Software. |
17 | | * |
18 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
19 | | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES |
20 | | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
21 | | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT |
22 | | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |
23 | | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
24 | | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
25 | | * OTHER DEALINGS IN THE SOFTWARE. |
26 | | */ |
27 | | |
28 | | /* |
29 | | * This is the actual DEFLATE decompression routine, lifted out of |
30 | | * deflate_decompress.c so that it can be compiled multiple times with different |
31 | | * target instruction sets. |
32 | | */ |
33 | | |
34 | | #ifndef ATTRIBUTES |
35 | | # define ATTRIBUTES |
36 | | #endif |
37 | | #ifndef EXTRACT_VARBITS |
38 | 8.68k | # define EXTRACT_VARBITS(word, count) ((word) & BITMASK(count)) |
39 | | #endif |
40 | | #ifndef EXTRACT_VARBITS8 |
41 | 306k | # define EXTRACT_VARBITS8(word, count) ((word) & BITMASK((u8)(count))) |
42 | | #endif |
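A minimal standalone sketch of what these helpers compute, assuming BITMASK(n) expands to the usual low-n-bits mask (((bitbuf_t)1 << (n)) - 1) as defined in deflate_decompress.c; the value of 'word' is made up for illustration.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t bitbuf_t;
#define BITMASK(n)            (((bitbuf_t)1 << (n)) - 1)
#define EXTRACT_VARBITS(w, c) ((w) & BITMASK(c))

int main(void)
{
	bitbuf_t word = 0x1b5;	/* ...0001 1011 0101 */

	/* Keep only the low 5 bits: 0b10101 == 0x15 */
	printf("0x%llx\n", (unsigned long long)EXTRACT_VARBITS(word, 5));
	return 0;
}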
43 | | |
44 | | static ATTRIBUTES MAYBE_UNUSED enum libdeflate_result |
45 | | FUNCNAME(struct libdeflate_decompressor * restrict d, |
46 | | const void * restrict in, size_t in_nbytes, |
47 | | void * restrict out, size_t out_nbytes_avail, |
48 | | size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) |
49 | 6.30k | { |
50 | 6.30k | u8 *out_next = out; |
51 | 6.30k | u8 * const out_end = out_next + out_nbytes_avail; |
52 | 6.30k | u8 * const out_fastloop_end = |
53 | 6.30k | out_end - MIN(out_nbytes_avail, FASTLOOP_MAX_BYTES_WRITTEN); |
54 | | |
55 | | /* Input bitstream state; see deflate_decompress.c for documentation */ |
56 | 6.30k | const u8 *in_next = in; |
57 | 6.30k | const u8 * const in_end = in_next + in_nbytes; |
58 | 6.30k | const u8 * const in_fastloop_end = |
59 | 6.30k | in_end - MIN(in_nbytes, FASTLOOP_MAX_BYTES_READ); |
60 | 6.30k | bitbuf_t bitbuf = 0; |
61 | 6.30k | bitbuf_t saved_bitbuf; |
62 | 6.30k | u32 bitsleft = 0; |
63 | 6.30k | size_t overread_count = 0; |
64 | | |
65 | 6.30k | bool is_final_block; |
66 | 6.30k | unsigned block_type; |
67 | 6.30k | unsigned num_litlen_syms; |
68 | 6.30k | unsigned num_offset_syms; |
69 | 6.30k | bitbuf_t litlen_tablemask; |
70 | 6.30k | u32 entry; |
71 | | |
72 | 10.4k | next_block: |
73 | | /* Starting to read the next block */ |
74 | 10.4k | ; |
75 | | |
76 | 10.4k | STATIC_ASSERT(CAN_CONSUME(1 + 2 + 5 + 5 + 4 + 3)); |
77 | 10.4k | REFILL_BITS(); |
78 | | |
79 | | /* BFINAL: 1 bit */ |
80 | 10.3k | is_final_block = bitbuf & BITMASK(1); |
81 | | |
82 | | /* BTYPE: 2 bits */ |
83 | 10.3k | block_type = (bitbuf >> 1) & BITMASK(2); |
84 | | |
85 | 10.3k | if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) { |
86 | | |
87 | | /* Dynamic Huffman block */ |
88 | | |
89 | | /* The order in which precode lengths are stored */ |
90 | 4.78k | static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { |
91 | 4.78k | 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 |
92 | 4.78k | }; |
93 | | |
94 | 4.78k | unsigned num_explicit_precode_lens; |
95 | 4.78k | unsigned i; |
96 | | |
97 | | /* Read the codeword length counts. */ |
98 | | |
99 | 4.78k | STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 257 + BITMASK(5)); |
100 | 4.78k | num_litlen_syms = 257 + ((bitbuf >> 3) & BITMASK(5)); |
101 | | |
102 | 4.78k | STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 1 + BITMASK(5)); |
103 | 4.78k | num_offset_syms = 1 + ((bitbuf >> 8) & BITMASK(5)); |
104 | | |
105 | 4.78k | STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == 4 + BITMASK(4)); |
106 | 4.78k | num_explicit_precode_lens = 4 + ((bitbuf >> 13) & BITMASK(4)); |
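A standalone sketch, for illustration only, of how these three header fields (HLIT, HDIST and HCLEN in RFC 1951 terms) sit in the low bits of the refilled bit buffer, right after the BFINAL and BTYPE bits read above; the bitbuf value is hypothetical.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t bitbuf = 0x2F1D;	/* hypothetical refilled bit buffer */

	/* bit 0 = BFINAL, bits 1-2 = BTYPE, then HLIT, HDIST, HCLEN */
	unsigned num_litlen_syms = 257 + ((bitbuf >> 3) & 0x1F);	/* 257..288 */
	unsigned num_offset_syms = 1 + ((bitbuf >> 8) & 0x1F);		/* 1..32 */
	unsigned num_explicit_precode_lens = 4 + ((bitbuf >> 13) & 0x0F); /* 4..19 */

	printf("%u %u %u\n", num_litlen_syms, num_offset_syms,
	       num_explicit_precode_lens);
	return 0;
}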
107 | | |
108 | 4.78k | d->static_codes_loaded = false; |
109 | | |
110 | | /* |
111 | | * Read the precode codeword lengths. |
112 | | * |
113 | | * A 64-bit bitbuffer is just one bit too small to hold the |
114 | | * maximum number of precode lens, so to minimize branches we |
115 | | * merge one len with the previous fields. |
116 | | */ |
117 | 4.78k | STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1); |
118 | 4.78k | if (CAN_CONSUME(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) { |
119 | 4.78k | d->u.precode_lens[deflate_precode_lens_permutation[0]] = |
120 | 4.78k | (bitbuf >> 17) & BITMASK(3); |
121 | 4.78k | bitbuf >>= 20; |
122 | 4.78k | bitsleft -= 20; |
123 | 4.78k | REFILL_BITS(); |
124 | 4.76k | i = 1; |
125 | 60.7k | do { |
126 | 60.7k | d->u.precode_lens[deflate_precode_lens_permutation[i]] = |
127 | 60.7k | bitbuf & BITMASK(3); |
128 | 60.7k | bitbuf >>= 3; |
129 | 60.7k | bitsleft -= 3; |
130 | 60.7k | } while (++i < num_explicit_precode_lens); |
131 | 4.76k | } else { |
132 | 0 | bitbuf >>= 17; |
133 | 0 | bitsleft -= 17; |
134 | 0 | i = 0; |
135 | 0 | do { |
136 | 0 | if ((u8)bitsleft < 3) |
137 | 0 | REFILL_BITS(); |
138 | 0 | d->u.precode_lens[deflate_precode_lens_permutation[i]] = |
139 | 0 | bitbuf & BITMASK(3); |
140 | 0 | bitbuf >>= 3; |
141 | 0 | bitsleft -= 3; |
142 | 0 | } while (++i < num_explicit_precode_lens); |
143 | 0 | } |
144 | 29.8k | for (; i < DEFLATE_NUM_PRECODE_SYMS; i++) |
145 | 25.0k | d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0; |
146 | | |
147 | | /* Build the decode table for the precode. */ |
148 | 4.76k | SAFETY_CHECK(build_precode_decode_table(d)); |
149 | | |
150 | | /* Decode the litlen and offset codeword lengths. */ |
151 | 4.61k | i = 0; |
152 | 401k | do { |
153 | 401k | unsigned presym; |
154 | 401k | u8 rep_val; |
155 | 401k | unsigned rep_count; |
156 | | |
157 | 401k | if ((u8)bitsleft < DEFLATE_MAX_PRE_CODEWORD_LEN + 7) |
158 | 15.9k | REFILL_BITS(); |
159 | | |
160 | | /* |
161 | | * The code below assumes that the precode decode table |
162 | | * doesn't have any subtables. |
163 | | */ |
164 | 401k | STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN); |
165 | | |
166 | | /* Decode the next precode symbol. */ |
167 | 401k | entry = d->u.l.precode_decode_table[ |
168 | 401k | bitbuf & BITMASK(DEFLATE_MAX_PRE_CODEWORD_LEN)]; |
169 | 401k | bitbuf >>= (u8)entry; |
170 | 401k | bitsleft -= entry; /* optimization: subtract full entry */ |
171 | 401k | presym = entry >> 16; |
172 | | |
173 | 401k | if (presym < 16) { |
174 | | /* Explicit codeword length */ |
175 | 376k | d->u.l.lens[i++] = presym; |
176 | 376k | continue; |
177 | 376k | } |
178 | | |
179 | | /* Run-length encoded codeword lengths */ |
180 | | |
181 | | /* |
182 | | * Note: we don't need to immediately verify that the |
183 | | * repeat count doesn't overflow the number of elements, |
184 | | * since we've sized the lens array to have enough extra |
185 | | * space to allow for the worst-case overrun (138 zeroes |
186 | | * when only 1 length was remaining). |
187 | | * |
188 | | * In the case of the small repeat counts (presyms 16 |
189 | | * and 17), it is fastest to always write the maximum |
190 | | * number of entries. That gets rid of branches that |
191 | | * would otherwise be required. |
192 | | * |
193 | | * It is not just because of the numerical order that |
194 | | * our checks go in the order 'presym < 16', 'presym == |
195 | | * 16', and 'presym == 17'. For typical data this is |
196 | | * ordered from most frequent to least frequent case. |
197 | | */ |
198 | 24.9k | STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1); |
199 | | |
200 | 24.9k | if (presym == 16) { |
201 | | /* Repeat the previous length 3 - 6 times. */ |
202 | 4.71k | SAFETY_CHECK(i != 0); |
203 | 4.69k | rep_val = d->u.l.lens[i - 1]; |
204 | 4.69k | STATIC_ASSERT(3 + BITMASK(2) == 6); |
205 | 4.69k | rep_count = 3 + (bitbuf & BITMASK(2)); |
206 | 4.69k | bitbuf >>= 2; |
207 | 4.69k | bitsleft -= 2; |
208 | 4.69k | d->u.l.lens[i + 0] = rep_val; |
209 | 4.69k | d->u.l.lens[i + 1] = rep_val; |
210 | 4.69k | d->u.l.lens[i + 2] = rep_val; |
211 | 4.69k | d->u.l.lens[i + 3] = rep_val; |
212 | 4.69k | d->u.l.lens[i + 4] = rep_val; |
213 | 4.69k | d->u.l.lens[i + 5] = rep_val; |
214 | 4.69k | i += rep_count; |
215 | 20.1k | } else if (presym == 17) { |
216 | | /* Repeat zero 3 - 10 times. */ |
217 | 8.09k | STATIC_ASSERT(3 + BITMASK(3) == 10); |
218 | 8.09k | rep_count = 3 + (bitbuf & BITMASK(3)); |
219 | 8.09k | bitbuf >>= 3; |
220 | 8.09k | bitsleft -= 3; |
221 | 8.09k | d->u.l.lens[i + 0] = 0; |
222 | 8.09k | d->u.l.lens[i + 1] = 0; |
223 | 8.09k | d->u.l.lens[i + 2] = 0; |
224 | 8.09k | d->u.l.lens[i + 3] = 0; |
225 | 8.09k | d->u.l.lens[i + 4] = 0; |
226 | 8.09k | d->u.l.lens[i + 5] = 0; |
227 | 8.09k | d->u.l.lens[i + 6] = 0; |
228 | 8.09k | d->u.l.lens[i + 7] = 0; |
229 | 8.09k | d->u.l.lens[i + 8] = 0; |
230 | 8.09k | d->u.l.lens[i + 9] = 0; |
231 | 8.09k | i += rep_count; |
232 | 12.1k | } else { |
233 | | /* Repeat zero 11 - 138 times. */ |
234 | 12.1k | STATIC_ASSERT(11 + BITMASK(7) == 138); |
235 | 12.1k | rep_count = 11 + (bitbuf & BITMASK(7)); |
236 | 12.1k | bitbuf >>= 7; |
237 | 12.1k | bitsleft -= 7; |
238 | 12.1k | memset(&d->u.l.lens[i], 0, |
239 | 12.1k | rep_count * sizeof(d->u.l.lens[i])); |
240 | 12.1k | i += rep_count; |
241 | 12.1k | } |
242 | 401k | } while (i < num_litlen_syms + num_offset_syms); |
243 | | |
244 | | /* Unnecessary, but check this for consistency with zlib. */ |
245 | 4.40k | SAFETY_CHECK(i == num_litlen_syms + num_offset_syms); |
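A hedged, self-contained illustration of the RFC 1951 run-length coding handled by the loop above: precode symbols 0-15 are explicit lengths, 16 repeats the previous length 3-6 times (2 extra bits), 17 writes 3-10 zeroes (3 extra bits), and 18 writes 11-138 zeroes (7 extra bits). The helper name and input values are hypothetical, and this is not libdeflate's table-entry format.

#include <stdio.h>
#include <string.h>

static unsigned expand_presym(unsigned char *lens, unsigned i,
			      unsigned presym, unsigned extra)
{
	if (presym < 16) {
		lens[i++] = (unsigned char)presym;	/* explicit length */
	} else if (presym == 16) {
		unsigned n = 3 + extra;			/* extra in 0..3 */
		memset(&lens[i], lens[i - 1], n);
		i += n;
	} else if (presym == 17) {
		unsigned n = 3 + extra;			/* extra in 0..7 */
		memset(&lens[i], 0, n);
		i += n;
	} else {					/* presym == 18 */
		unsigned n = 11 + extra;		/* extra in 0..127 */
		memset(&lens[i], 0, n);
		i += n;
	}
	return i;
}

int main(void)
{
	unsigned char lens[64] = {0};
	unsigned i = 0;

	i = expand_presym(lens, i, 8, 0);	/* one explicit length of 8 */
	i = expand_presym(lens, i, 16, 2);	/* repeat it 3+2 = 5 more times */
	i = expand_presym(lens, i, 18, 1);	/* then 11+1 = 12 zeroes */
	printf("%u lengths written\n", i);	/* 1 + 5 + 12 = 18 */
	return 0;
}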
246 | | |
247 | 5.57k | } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { |
248 | 1.16k | u16 len, nlen; |
249 | | |
250 | | /* |
251 | | * Uncompressed block: copy 'len' bytes literally from the input |
252 | | * buffer to the output buffer. |
253 | | */ |
254 | | |
255 | 1.16k | bitsleft -= 3; /* for BTYPE and BFINAL */ |
256 | | |
257 | | /* |
258 | | * Align the bitstream to the next byte boundary. This means |
259 | | * the next byte boundary as if we were reading a byte at a |
260 | | * time. Therefore, we have to rewind 'in_next' by any bytes |
261 | | * that have been refilled but not actually consumed yet (not |
262 | | * counting overread bytes, which don't increment 'in_next'). |
263 | | */ |
264 | 1.16k | bitsleft = (u8)bitsleft; |
265 | 1.16k | SAFETY_CHECK(overread_count <= (bitsleft >> 3)); |
266 | 1.09k | in_next -= (bitsleft >> 3) - overread_count; |
267 | 1.09k | overread_count = 0; |
268 | 1.09k | bitbuf = 0; |
269 | 1.09k | bitsleft = 0; |
270 | | |
271 | 1.09k | SAFETY_CHECK(in_end - in_next >= 4); |
272 | 1.05k | len = get_unaligned_le16(in_next); |
273 | 1.05k | nlen = get_unaligned_le16(in_next + 2); |
274 | 1.05k | in_next += 4; |
275 | | |
276 | 1.05k | SAFETY_CHECK(len == (u16)~nlen); |
277 | 821 | if (unlikely(len > out_end - out_next)) |
278 | 364 | return LIBDEFLATE_INSUFFICIENT_SPACE; |
279 | 457 | SAFETY_CHECK(len <= in_end - in_next); |
280 | | |
281 | 442 | memcpy(out_next, in_next, len); |
282 | 442 | in_next += len; |
283 | 442 | out_next += len; |
284 | | |
285 | 442 | goto block_done; |
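A minimal sketch of the stored-block framing validated above: LEN and NLEN are little-endian 16-bit fields and NLEN must be the one's complement of LEN. A local helper stands in for get_unaligned_le16(), and the header bytes are made up.

#include <stdint.h>
#include <stdio.h>

static uint16_t le16(const uint8_t *p)
{
	return (uint16_t)(p[0] | (p[1] << 8));
}

int main(void)
{
	const uint8_t hdr[4] = { 0x05, 0x00, 0xfa, 0xff };	/* LEN=5, NLEN=~5 */
	uint16_t len = le16(hdr), nlen = le16(hdr + 2);

	printf("%s\n", (len == (uint16_t)~nlen) ? "valid" : "corrupt");
	return 0;
}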
286 | | |
287 | 4.40k | } else { |
288 | 4.40k | unsigned i; |
289 | | |
290 | 4.40k | SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN); |
291 | | |
292 | | /* |
293 | | * Static Huffman block: build the decode tables for the static |
294 | | * codes. Skip doing so if the tables are already set up from |
295 | | * an earlier static block; this speeds up decompression of |
296 | | * degenerate input of many empty or very short static blocks. |
297 | | * |
298 | | * Afterwards, the remainder is the same as decompressing a |
299 | | * dynamic Huffman block. |
300 | | */ |
301 | | |
302 | 4.36k | bitbuf >>= 3; /* for BTYPE and BFINAL */ |
303 | 4.36k | bitsleft -= 3; |
304 | | |
305 | 4.36k | if (d->static_codes_loaded) |
306 | 2.43k | goto have_decode_tables; |
307 | | |
308 | 1.93k | d->static_codes_loaded = true; |
309 | | |
310 | 1.93k | STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288); |
311 | 1.93k | STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32); |
312 | | |
313 | 281k | for (i = 0; i < 144; i++) |
314 | 279k | d->u.l.lens[i] = 8; |
315 | 218k | for (; i < 256; i++) |
316 | 217k | d->u.l.lens[i] = 9; |
317 | 48.4k | for (; i < 280; i++) |
318 | 46.5k | d->u.l.lens[i] = 7; |
319 | 17.4k | for (; i < 288; i++) |
320 | 15.5k | d->u.l.lens[i] = 8; |
321 | | |
322 | 63.9k | for (; i < 288 + 32; i++) |
323 | 62.0k | d->u.l.lens[i] = 5; |
324 | | |
325 | 1.93k | num_litlen_syms = 288; |
326 | 1.93k | num_offset_syms = 32; |
327 | 1.93k | } |
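A small side check, under the RFC 1951 definitions, that the fixed litlen code lengths assigned above (144 codewords of length 8, 112 of length 9, 24 of length 7, 8 more of length 8) form a complete prefix code: the Kraft sum of 2^-len over all 288 symbols is exactly 1. Illustration only.

#include <stdio.h>

int main(void)
{
	unsigned counts[10] = {0};	/* codewords per length, fixed litlen code */
	unsigned len;
	double kraft = 0.0;

	counts[8] = 144 + 8;	/* symbols 0..143 and 280..287 */
	counts[9] = 112;	/* symbols 144..255 */
	counts[7] = 24;		/* symbols 256..279 */

	for (len = 1; len <= 9; len++)
		kraft += counts[len] / (double)(1u << len);
	printf("Kraft sum = %f\n", kraft);	/* prints 1.000000 */
	return 0;
}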
328 | | |
329 | | /* Decompressing a Huffman block (either dynamic or static) */ |
330 | | |
331 | 6.26k | SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms)); |
332 | 6.18k | SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms)); |
333 | 8.55k | have_decode_tables: |
334 | 8.55k | litlen_tablemask = BITMASK(d->litlen_tablebits); |
335 | | |
336 | | /* |
337 | | * This is the "fastloop" for decoding literals and matches. It does |
338 | | * bounds checks on in_next and out_next in the loop conditions so that |
339 | | * additional bounds checks aren't needed inside the loop body. |
340 | | * |
341 | | * To reduce latency, the bitbuffer is refilled and the next litlen |
342 | | * decode table entry is preloaded before each loop iteration. |
343 | | */ |
344 | 8.55k | if (in_next >= in_fastloop_end || out_next >= out_fastloop_end) |
345 | 5.32k | goto generic_loop; |
346 | 3.23k | REFILL_BITS_IN_FASTLOOP(); |
347 | 3.23k | entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; |
348 | 162k | do { |
349 | 162k | u32 length, offset, lit; |
350 | 162k | const u8 *src; |
351 | 162k | u8 *dst; |
352 | | |
353 | | /* |
354 | | * Consume the bits for the litlen decode table entry. Save the |
355 | | * original bitbuf for later, in case the extra match length |
356 | | * bits need to be extracted from it. |
357 | | */ |
358 | 162k | saved_bitbuf = bitbuf; |
359 | 162k | bitbuf >>= (u8)entry; |
360 | 162k | bitsleft -= entry; /* optimization: subtract full entry */ |
361 | | |
362 | | /* |
363 | | * Begin by checking for a "fast" literal, i.e. a literal that |
364 | | * doesn't need a subtable. |
365 | | */ |
366 | 162k | if (entry & HUFFDEC_LITERAL) { |
367 | | /* |
368 | | * On 64-bit platforms, we decode up to 2 extra fast |
369 | | * literals in addition to the primary item, as this |
370 | | * increases performance and still leaves enough bits |
371 | | * remaining for what follows. We could actually do 3, |
372 | | * assuming LITLEN_TABLEBITS=11, but that actually |
373 | | * decreases performance slightly (perhaps by messing |
374 | | * with the branch prediction of the conditional refill |
375 | | * that happens later while decoding the match offset). |
376 | | * |
377 | | * Note: the definitions of FASTLOOP_MAX_BYTES_WRITTEN |
378 | | * and FASTLOOP_MAX_BYTES_READ need to be updated if the |
379 | | * number of extra literals decoded here is changed. |
380 | | */ |
381 | 89.2k | if (/* enough bits for 2 fast literals + length + offset preload? */ |
382 | 89.2k | CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS + |
383 | 89.2k | LENGTH_MAXBITS, |
384 | 89.2k | OFFSET_TABLEBITS) && |
385 | | /* enough bits for 2 fast literals + slow literal + litlen preload? */ |
386 | 89.2k | CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS + |
387 | 89.2k | DEFLATE_MAX_LITLEN_CODEWORD_LEN, |
388 | 89.2k | LITLEN_TABLEBITS)) { |
389 | | /* 1st extra fast literal */ |
390 | 89.2k | lit = entry >> 16; |
391 | 89.2k | entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; |
392 | 89.2k | saved_bitbuf = bitbuf; |
393 | 89.2k | bitbuf >>= (u8)entry; |
394 | 89.2k | bitsleft -= entry; |
395 | 89.2k | *out_next++ = lit; |
396 | 89.2k | if (entry & HUFFDEC_LITERAL) { |
397 | | /* 2nd extra fast literal */ |
398 | 81.4k | lit = entry >> 16; |
399 | 81.4k | entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; |
400 | 81.4k | saved_bitbuf = bitbuf; |
401 | 81.4k | bitbuf >>= (u8)entry; |
402 | 81.4k | bitsleft -= entry; |
403 | 81.4k | *out_next++ = lit; |
404 | 81.4k | if (entry & HUFFDEC_LITERAL) { |
405 | | /* |
406 | | * Another fast literal, but |
407 | | * this one is in lieu of the |
408 | | * primary item, so it doesn't |
409 | | * count as one of the extras. |
410 | | */ |
411 | 76.0k | lit = entry >> 16; |
412 | 76.0k | entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; |
413 | 76.0k | REFILL_BITS_IN_FASTLOOP(); |
414 | 76.0k | *out_next++ = lit; |
415 | 76.0k | continue; |
416 | 76.0k | } |
417 | 81.4k | } |
418 | 89.2k | } else { |
419 | | /* |
420 | | * Decode a literal. While doing so, preload |
421 | | * the next litlen decode table entry and refill |
422 | | * the bitbuffer. To reduce latency, we've |
423 | | * arranged for there to be enough "preloadable" |
424 | | * bits remaining to do the table preload |
425 | | * independently of the refill. |
426 | | */ |
427 | 0 | STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD( |
428 | 0 | LITLEN_TABLEBITS, LITLEN_TABLEBITS)); |
429 | 0 | lit = entry >> 16; |
430 | 0 | entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; |
431 | 0 | REFILL_BITS_IN_FASTLOOP(); |
432 | 0 | *out_next++ = lit; |
433 | 0 | continue; |
434 | 0 | } |
435 | 89.2k | } |
436 | | |
437 | | /* |
438 | | * It's not a literal entry, so it can be a length entry, a |
439 | | * subtable pointer entry, or an end-of-block entry. Detect the |
440 | | * two unlikely cases by testing the HUFFDEC_EXCEPTIONAL flag. |
441 | | */ |
442 | 86.6k | if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { |
443 | | /* Subtable pointer or end-of-block entry */ |
444 | | |
445 | 6.52k | if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) |
446 | 1.74k | goto block_done; |
447 | | |
448 | | /* |
449 | | * A subtable is required. Load and consume the |
450 | | * subtable entry. The subtable entry can be of any |
451 | | * type: literal, length, or end-of-block. |
452 | | */ |
453 | 4.78k | entry = d->u.litlen_decode_table[(entry >> 16) + |
454 | 4.78k | EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; |
455 | 4.78k | saved_bitbuf = bitbuf; |
456 | 4.78k | bitbuf >>= (u8)entry; |
457 | 4.78k | bitsleft -= entry; |
458 | | |
459 | | /* |
460 | | * 32-bit platforms that use the byte-at-a-time refill |
461 | | * method have to do a refill here for there to always |
462 | | * be enough bits to decode a literal that requires a |
463 | | * subtable, then preload the next litlen decode table |
464 | | * entry; or to decode a match length that requires a |
465 | | * subtable, then preload the offset decode table entry. |
466 | | */ |
467 | 4.78k | if (!CAN_CONSUME_AND_THEN_PRELOAD(DEFLATE_MAX_LITLEN_CODEWORD_LEN, |
468 | 4.78k | LITLEN_TABLEBITS) || |
469 | 4.78k | !CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXBITS, |
470 | 4.78k | OFFSET_TABLEBITS)) |
471 | 0 | REFILL_BITS_IN_FASTLOOP(); |
472 | 4.78k | if (entry & HUFFDEC_LITERAL) { |
473 | | /* Decode a literal that required a subtable. */ |
474 | 1.24k | lit = entry >> 16; |
475 | 1.24k | entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; |
476 | 1.24k | REFILL_BITS_IN_FASTLOOP(); |
477 | 1.24k | *out_next++ = lit; |
478 | 1.24k | continue; |
479 | 1.24k | } |
480 | 3.54k | if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) |
481 | 61 | goto block_done; |
482 | | /* Else, it's a length that required a subtable. */ |
483 | 3.54k | } |
484 | | |
485 | | /* |
486 | | * Decode the match length: the length base value associated |
487 | | * with the litlen symbol (which we extract from the decode |
488 | | * table entry), plus the extra length bits. We don't need to |
489 | | * consume the extra length bits here, as they were included in |
490 | | * the bits consumed by the entry earlier. We also don't need |
491 | | * to check for too-long matches here, as this is inside the |
492 | | * fastloop where it's already been verified that the output |
493 | | * buffer has enough space remaining to copy a max-length match. |
494 | | */ |
495 | 83.6k | length = entry >> 16; |
496 | 83.6k | length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); |
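For reference, a hedged illustration of the scheme the decode-table entry packs together: per RFC 1951, litlen symbols 257-285 map to a base match length plus 0-5 extra bits taken from the stream. The arrays hold the standard RFC values, not libdeflate's entry format, and the symbol and extra-bit values are hypothetical.

#include <stdio.h>

static const unsigned short length_base[29] = {
	3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31,
	35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258
};
static const unsigned char length_extra[29] = {
	0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
	3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0
};

int main(void)
{
	unsigned sym = 269;	/* hypothetical litlen symbol */
	unsigned extra = 2;	/* hypothetical value of its 2 extra bits */

	printf("length = %u (%u extra bits)\n",
	       length_base[sym - 257] + extra, length_extra[sym - 257]);
	return 0;	/* prints: length = 21 (2 extra bits) */
}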
497 | | |
498 | | /* |
499 | | * Decode the match offset. There are enough "preloadable" bits |
500 | | * remaining to preload the offset decode table entry, but a |
501 | | * refill might be needed before consuming it. |
502 | | */ |
503 | 83.6k | STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXFASTBITS, |
504 | 83.6k | OFFSET_TABLEBITS)); |
505 | 83.6k | entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)]; |
506 | 83.6k | if (CAN_CONSUME_AND_THEN_PRELOAD(OFFSET_MAXBITS, |
507 | 83.6k | LITLEN_TABLEBITS)) { |
508 | | /* |
509 | | * Decoding a match offset on a 64-bit platform. We may |
510 | | * need to refill once, but then we can decode the whole |
511 | | * offset and preload the next litlen table entry. |
512 | | */ |
513 | 83.6k | if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { |
514 | | /* Offset codeword requires a subtable */ |
515 | 3.00k | if (unlikely((u8)bitsleft < OFFSET_MAXBITS + |
516 | 3.00k | LITLEN_TABLEBITS - PRELOAD_SLACK)) |
517 | 214 | REFILL_BITS_IN_FASTLOOP(); |
518 | 3.00k | bitbuf >>= OFFSET_TABLEBITS; |
519 | 3.00k | bitsleft -= OFFSET_TABLEBITS; |
520 | 3.00k | entry = d->offset_decode_table[(entry >> 16) + |
521 | 3.00k | EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; |
522 | 80.6k | } else if (unlikely((u8)bitsleft < OFFSET_MAXFASTBITS + |
523 | 80.6k | LITLEN_TABLEBITS - PRELOAD_SLACK)) |
524 | 243 | REFILL_BITS_IN_FASTLOOP(); |
525 | 83.6k | } else { |
526 | | /* Decoding a match offset on a 32-bit platform */ |
527 | 0 | REFILL_BITS_IN_FASTLOOP(); |
528 | 0 | if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { |
529 | | /* Offset codeword requires a subtable */ |
530 | 0 | bitbuf >>= OFFSET_TABLEBITS; |
531 | 0 | bitsleft -= OFFSET_TABLEBITS; |
532 | 0 | entry = d->offset_decode_table[(entry >> 16) + |
533 | 0 | EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; |
534 | 0 | REFILL_BITS_IN_FASTLOOP(); |
535 | | /* No further refill needed before extra bits */ |
536 | 0 | STATIC_ASSERT(CAN_CONSUME( |
537 | 0 | OFFSET_MAXBITS - OFFSET_TABLEBITS)); |
538 | 0 | } else { |
539 | | /* No refill needed before extra bits */ |
540 | 0 | STATIC_ASSERT(CAN_CONSUME(OFFSET_MAXFASTBITS)); |
541 | 0 | } |
542 | 0 | } |
543 | 83.6k | saved_bitbuf = bitbuf; |
544 | 83.6k | bitbuf >>= (u8)entry; |
545 | 83.6k | bitsleft -= entry; /* optimization: subtract full entry */ |
546 | 83.6k | offset = entry >> 16; |
547 | 83.6k | offset += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); |
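A companion to the length sketch above, again using the standard RFC 1951 values rather than libdeflate's packed entries: offset symbols 0-29 map to a base distance plus 0-13 extra bits. The symbol and extra-bit values are hypothetical.

#include <stdio.h>

static const unsigned short offset_base[30] = {
	1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193,
	257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145,
	8193, 12289, 16385, 24577
};
static const unsigned char offset_extra[30] = {
	0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,
	7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13
};

int main(void)
{
	unsigned sym = 10;	/* hypothetical offset symbol */
	unsigned extra = 7;	/* hypothetical value of its 4 extra bits */

	printf("offset = %u (%u extra bits)\n",
	       offset_base[sym] + extra, offset_extra[sym]);
	return 0;	/* prints: offset = 40 (4 extra bits) */
}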
548 | | |
549 | | /* Validate the match offset; needed even in the fastloop. */ |
550 | 83.6k | SAFETY_CHECK(offset <= out_next - (const u8 *)out); |
551 | 83.5k | src = out_next - offset; |
552 | 83.5k | dst = out_next; |
553 | 83.5k | out_next += length; |
554 | | |
555 | | /* |
556 | | * Before starting to issue the instructions to copy the match, |
557 | | * refill the bitbuffer and preload the litlen decode table |
558 | | * entry for the next loop iteration. This can increase |
559 | | * performance by allowing the latency of the match copy to |
560 | | * overlap with these other operations. To further reduce |
561 | | * latency, we've arranged for there to be enough bits remaining |
562 | | * to do the table preload independently of the refill, except |
563 | | * on 32-bit platforms using the byte-at-a-time refill method. |
564 | | */ |
565 | 83.5k | if (!CAN_CONSUME_AND_THEN_PRELOAD( |
566 | 83.5k | MAX(OFFSET_MAXBITS - OFFSET_TABLEBITS, |
567 | 83.5k | OFFSET_MAXFASTBITS), |
568 | 83.5k | LITLEN_TABLEBITS) && |
569 | 83.5k | unlikely((u8)bitsleft < LITLEN_TABLEBITS - PRELOAD_SLACK)) |
570 | 0 | REFILL_BITS_IN_FASTLOOP(); |
571 | 83.5k | entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; |
572 | 83.5k | REFILL_BITS_IN_FASTLOOP(); |
573 | | |
574 | | /* |
575 | | * Copy the match. On most CPUs the fastest method is a |
576 | | * word-at-a-time copy, unconditionally copying about 5 words |
577 | | * since this is enough for most matches without being too much. |
578 | | * |
579 | | * The normal word-at-a-time copy works for offset >= WORDBYTES, |
580 | | * which is most cases. The case of offset == 1 is also common |
581 | | * and is worth optimizing for, since it is just RLE encoding of |
582 | | * the previous byte, which is the result of compressing long |
583 | | * runs of the same byte. |
584 | | * |
585 | | * Writing past the match 'length' is allowed here, since it's |
586 | | * been ensured there is enough output space left for a slight |
587 | | * overrun. FASTLOOP_MAX_BYTES_WRITTEN needs to be updated if |
588 | | * the maximum possible overrun here is changed. |
589 | | */ |
590 | 83.5k | if (UNALIGNED_ACCESS_IS_FAST && offset >= WORDBYTES) { |
591 | 17.8k | store_word_unaligned(load_word_unaligned(src), dst); |
592 | 17.8k | src += WORDBYTES; |
593 | 17.8k | dst += WORDBYTES; |
594 | 17.8k | store_word_unaligned(load_word_unaligned(src), dst); |
595 | 17.8k | src += WORDBYTES; |
596 | 17.8k | dst += WORDBYTES; |
597 | 17.8k | store_word_unaligned(load_word_unaligned(src), dst); |
598 | 17.8k | src += WORDBYTES; |
599 | 17.8k | dst += WORDBYTES; |
600 | 17.8k | store_word_unaligned(load_word_unaligned(src), dst); |
601 | 17.8k | src += WORDBYTES; |
602 | 17.8k | dst += WORDBYTES; |
603 | 17.8k | store_word_unaligned(load_word_unaligned(src), dst); |
604 | 17.8k | src += WORDBYTES; |
605 | 17.8k | dst += WORDBYTES; |
606 | 28.2k | while (dst < out_next) { |
607 | 10.3k | store_word_unaligned(load_word_unaligned(src), dst); |
608 | 10.3k | src += WORDBYTES; |
609 | 10.3k | dst += WORDBYTES; |
610 | 10.3k | store_word_unaligned(load_word_unaligned(src), dst); |
611 | 10.3k | src += WORDBYTES; |
612 | 10.3k | dst += WORDBYTES; |
613 | 10.3k | store_word_unaligned(load_word_unaligned(src), dst); |
614 | 10.3k | src += WORDBYTES; |
615 | 10.3k | dst += WORDBYTES; |
616 | 10.3k | store_word_unaligned(load_word_unaligned(src), dst); |
617 | 10.3k | src += WORDBYTES; |
618 | 10.3k | dst += WORDBYTES; |
619 | 10.3k | store_word_unaligned(load_word_unaligned(src), dst); |
620 | 10.3k | src += WORDBYTES; |
621 | 10.3k | dst += WORDBYTES; |
622 | 10.3k | } |
623 | 65.6k | } else if (UNALIGNED_ACCESS_IS_FAST && offset == 1) { |
624 | 22.1k | machine_word_t v; |
625 | | |
626 | | /* |
627 | | * This part tends to get auto-vectorized, so keep it |
628 | | * copying a multiple of 16 bytes at a time. |
629 | | */ |
630 | 22.1k | v = (machine_word_t)0x0101010101010101 * src[0]; |
631 | 22.1k | store_word_unaligned(v, dst); |
632 | 22.1k | dst += WORDBYTES; |
633 | 22.1k | store_word_unaligned(v, dst); |
634 | 22.1k | dst += WORDBYTES; |
635 | 22.1k | store_word_unaligned(v, dst); |
636 | 22.1k | dst += WORDBYTES; |
637 | 22.1k | store_word_unaligned(v, dst); |
638 | 22.1k | dst += WORDBYTES; |
639 | 150k | while (dst < out_next) { |
640 | 127k | store_word_unaligned(v, dst); |
641 | 127k | dst += WORDBYTES; |
642 | 127k | store_word_unaligned(v, dst); |
643 | 127k | dst += WORDBYTES; |
644 | 127k | store_word_unaligned(v, dst); |
645 | 127k | dst += WORDBYTES; |
646 | 127k | store_word_unaligned(v, dst); |
647 | 127k | dst += WORDBYTES; |
648 | 127k | } |
649 | 43.4k | } else if (UNALIGNED_ACCESS_IS_FAST) { |
650 | 43.4k | store_word_unaligned(load_word_unaligned(src), dst); |
651 | 43.4k | src += offset; |
652 | 43.4k | dst += offset; |
653 | 43.4k | store_word_unaligned(load_word_unaligned(src), dst); |
654 | 43.4k | src += offset; |
655 | 43.4k | dst += offset; |
656 | 1.80M | do { |
657 | 1.80M | store_word_unaligned(load_word_unaligned(src), dst); |
658 | 1.80M | src += offset; |
659 | 1.80M | dst += offset; |
660 | 1.80M | store_word_unaligned(load_word_unaligned(src), dst); |
661 | 1.80M | src += offset; |
662 | 1.80M | dst += offset; |
663 | 1.80M | } while (dst < out_next); |
664 | 43.4k | } else { |
665 | 0 | *dst++ = *src++; |
666 | 0 | *dst++ = *src++; |
667 | 0 | do { |
668 | 0 | *dst++ = *src++; |
669 | 0 | } while (dst < out_next); |
670 | 0 | } |
671 | 160k | } while (in_next < in_fastloop_end && out_next < out_fastloop_end); |
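A hedged sketch of the two copy strategies used in the fastloop above: whole-word copies when the offset is at least a word, and a byte broadcast when offset == 1 (RLE of the previous byte). memcpy() stands in for load_word_unaligned()/store_word_unaligned(), and unlike the fastloop this version never writes past 'length'.

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>

static void copy_match(uint8_t *out, size_t out_pos,
		       size_t offset, size_t length)
{
	uint8_t *dst = out + out_pos;
	const uint8_t *src = dst - offset;

	if (offset == 1) {
		memset(dst, src[0], length);	/* run of the previous byte */
	} else if (offset >= sizeof(uint64_t)) {
		while (length >= sizeof(uint64_t)) {	/* word-at-a-time */
			uint64_t w;

			memcpy(&w, src, sizeof(w));
			memcpy(dst, &w, sizeof(w));
			src += sizeof(w);
			dst += sizeof(w);
			length -= sizeof(w);
		}
		while (length--)
			*dst++ = *src++;
	} else {
		while (length--)	/* overlapping: byte at a time */
			*dst++ = *src++;
	}
}

int main(void)
{
	uint8_t buf[32] = "abcd";

	copy_match(buf, 4, 4, 8);	/* repeat "abcd" twice more */
	printf("%.12s\n", buf);		/* "abcdabcdabcd" */
	return 0;
}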
672 | | |
673 | | /* |
674 | | * This is the generic loop for decoding literals and matches. This |
675 | | * handles cases where in_next and out_next are close to the end of |
676 | | * their respective buffers. Usually this loop isn't performance- |
677 | | * critical, as most time is spent in the fastloop above instead. We |
678 | | * therefore omit some optimizations here in favor of smaller code. |
679 | | */ |
680 | 6.67k | generic_loop: |
681 | 224k | for (;;) { |
682 | 224k | u32 length, offset; |
683 | 224k | const u8 *src; |
684 | 224k | u8 *dst; |
685 | | |
686 | 224k | REFILL_BITS(); |
687 | 223k | entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; |
688 | 223k | saved_bitbuf = bitbuf; |
689 | 223k | bitbuf >>= (u8)entry; |
690 | 223k | bitsleft -= entry; |
691 | 223k | if (unlikely(entry & HUFFDEC_SUBTABLE_POINTER)) { |
692 | 531 | entry = d->u.litlen_decode_table[(entry >> 16) + |
693 | 531 | EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; |
694 | 531 | saved_bitbuf = bitbuf; |
695 | 531 | bitbuf >>= (u8)entry; |
696 | 531 | bitsleft -= entry; |
697 | 531 | } |
698 | 223k | length = entry >> 16; |
699 | 223k | if (entry & HUFFDEC_LITERAL) { |
700 | 150k | if (unlikely(out_next == out_end)) |
701 | 1.23k | return LIBDEFLATE_INSUFFICIENT_SPACE; |
702 | 148k | *out_next++ = length; |
703 | 148k | continue; |
704 | 150k | } |
705 | 73.5k | if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) |
706 | 3.08k | goto block_done; |
707 | 70.5k | length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); |
708 | 70.5k | if (unlikely(length > out_end - out_next)) |
709 | 1.87k | return LIBDEFLATE_INSUFFICIENT_SPACE; |
710 | | |
711 | 68.6k | if (!CAN_CONSUME(LENGTH_MAXBITS + OFFSET_MAXBITS)) |
712 | 0 | REFILL_BITS(); |
713 | 68.6k | entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)]; |
714 | 68.6k | if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { |
715 | 366 | bitbuf >>= OFFSET_TABLEBITS; |
716 | 366 | bitsleft -= OFFSET_TABLEBITS; |
717 | 366 | entry = d->offset_decode_table[(entry >> 16) + |
718 | 366 | EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; |
719 | 366 | if (!CAN_CONSUME(OFFSET_MAXBITS)) |
720 | 0 | REFILL_BITS(); |
721 | 366 | } |
722 | 68.6k | offset = entry >> 16; |
723 | 68.6k | offset += EXTRACT_VARBITS8(bitbuf, entry) >> (u8)(entry >> 8); |
724 | 68.6k | bitbuf >>= (u8)entry; |
725 | 68.6k | bitsleft -= entry; |
726 | | |
727 | 68.6k | SAFETY_CHECK(offset <= out_next - (const u8 *)out); |
728 | 68.4k | src = out_next - offset; |
729 | 68.4k | dst = out_next; |
730 | 68.4k | out_next += length; |
731 | | |
732 | 68.4k | STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3); |
733 | 68.4k | *dst++ = *src++; |
734 | 68.4k | *dst++ = *src++; |
735 | 16.0M | do { |
736 | 16.0M | *dst++ = *src++; |
737 | 16.0M | } while (dst < out_next); |
738 | 68.4k | } |
739 | | |
740 | 5.33k | block_done: |
741 | | /* Finished decoding a block */ |
742 | | |
743 | 5.33k | if (!is_final_block) |
744 | 4.10k | goto next_block; |
745 | | |
746 | | /* That was the last block. */ |
747 | | |
748 | 1.22k | bitsleft = (u8)bitsleft; |
749 | | |
750 | | /* |
751 | | * If any of the implicit appended zero bytes were consumed (not just |
752 | | * refilled) before hitting end of stream, then the data is bad. |
753 | | */ |
754 | 1.22k | SAFETY_CHECK(overread_count <= (bitsleft >> 3)); |
755 | | |
756 | | /* Optionally return the actual number of bytes consumed. */ |
757 | 1.17k | if (actual_in_nbytes_ret) { |
758 | | /* Don't count bytes that were refilled but not consumed. */ |
759 | 1.17k | in_next -= (bitsleft >> 3) - overread_count; |
760 | | |
761 | 1.17k | *actual_in_nbytes_ret = in_next - (u8 *)in; |
762 | 1.17k | } |
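A simplified model, with made-up numbers, of the accounting done here: bits still sitting in the bit buffer correspond to input bytes that were refilled but never consumed, so the reported input size is the raw position minus those whole bytes, excluding any implicit zero bytes (overread_count) that never advanced in_next in the first place.

#include <stddef.h>
#include <stdio.h>

int main(void)
{
	size_t in_pos_raw = 1000;	/* hypothetical bytes advanced past */
	unsigned bitsleft = 26;		/* unconsumed bits left in bitbuf */
	size_t overread_count = 1;	/* implicit zero bytes refilled */

	size_t unconsumed_bytes = bitsleft >> 3;	/* 3 whole bytes */
	size_t actual_in = in_pos_raw - (unconsumed_bytes - overread_count);

	printf("actual_in_nbytes = %zu\n", actual_in);	/* 998 */
	return 0;
}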
763 | | |
764 | | /* Optionally return the actual number of bytes written. */ |
765 | 1.17k | if (actual_out_nbytes_ret) { |
766 | 0 | *actual_out_nbytes_ret = out_next - (u8 *)out; |
767 | 1.17k | } else { |
768 | 1.17k | if (out_next != out_end) |
769 | 131 | return LIBDEFLATE_SHORT_OUTPUT; |
770 | 1.17k | } |
771 | 1.04k | return LIBDEFLATE_SUCCESS; |
772 | 1.17k | } |
deflate_decompress.c:deflate_decompress_bmi2: per-line counts identical to the file listing above |
Unexecuted instantiation: deflate_decompress.c:deflate_decompress_default |
773 | | |
774 | | #undef FUNCNAME |
775 | | #undef ATTRIBUTES |
776 | | #undef EXTRACT_VARBITS |
777 | | #undef EXTRACT_VARBITS8 |