Coverage Report

Created: 2024-05-21 06:06

/work/_deps/deflate-src/lib/deflate_decompress.c
Line
Count
Source
1
/*
2
 * deflate_decompress.c - a decompressor for DEFLATE
3
 *
4
 * Copyright 2016 Eric Biggers
5
 *
6
 * Permission is hereby granted, free of charge, to any person
7
 * obtaining a copy of this software and associated documentation
8
 * files (the "Software"), to deal in the Software without
9
 * restriction, including without limitation the rights to use,
10
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
11
 * copies of the Software, and to permit persons to whom the
12
 * Software is furnished to do so, subject to the following
13
 * conditions:
14
 *
15
 * The above copyright notice and this permission notice shall be
16
 * included in all copies or substantial portions of the Software.
17
 *
18
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
20
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25
 * OTHER DEALINGS IN THE SOFTWARE.
26
 *
27
 * ---------------------------------------------------------------------------
28
 *
29
 * This is a highly optimized DEFLATE decompressor.  It is much faster than
30
 * vanilla zlib, typically well over twice as fast, though results vary by CPU.
31
 *
32
 * Why this is faster than vanilla zlib:
33
 *
34
 * - Word accesses rather than byte accesses when reading input
35
 * - Word accesses rather than byte accesses when copying matches
36
 * - Faster Huffman decoding combined with various DEFLATE-specific tricks
37
 * - Larger bitbuffer variable that doesn't need to be refilled as often
38
 * - Other optimizations to remove unnecessary branches
39
 * - Only full-buffer decompression is supported, so the code doesn't need to
40
 *   support stopping and resuming decompression.
41
 * - On x86_64, a version of the decompression routine is compiled with BMI2
42
 *   instructions enabled and is used automatically at runtime when supported.
43
 */
44
45
#include "lib_common.h"
46
#include "deflate_constants.h"
47
48
/*
49
 * If the expression passed to SAFETY_CHECK() evaluates to false, then the
50
 * decompression routine immediately returns LIBDEFLATE_BAD_DATA, indicating the
51
 * compressed data is invalid.
52
 *
53
 * Theoretically, these checks could be disabled for specialized applications
54
 * where all input to the decompressor will be trusted.
55
 */
56
#if 0
57
#  pragma message("UNSAFE DECOMPRESSION IS ENABLED. THIS MUST ONLY BE USED IF THE DECOMPRESSOR INPUT WILL ALWAYS BE TRUSTED!")
58
#  define SAFETY_CHECK(expr)  (void)(expr)
59
#else
60
132k
#  define SAFETY_CHECK(expr)  if (unlikely(!(expr))) return LIBDEFLATE_BAD_DATA
61
#endif
62
63
/*****************************************************************************
64
 *                              Input bitstream                              *
65
 *****************************************************************************/
66
67
/*
68
 * The state of the "input bitstream" consists of the following variables:
69
 *
70
 *  - in_next: a pointer to the next unread byte in the input buffer
71
 *
72
 *  - in_end: a pointer to just past the end of the input buffer
73
 *
74
 *  - bitbuf: a word-sized variable containing bits that have been read from
75
 *      the input buffer or from the implicit appended zero bytes
76
 *
77
 *  - bitsleft: the number of bits in 'bitbuf' available to be consumed.
78
 *        After REFILL_BITS_BRANCHLESS(), 'bitbuf' can actually
79
 *        contain more bits than this.  However, only the bits counted
80
 *        by 'bitsleft' can actually be consumed; the rest can only be
81
 *        used for preloading.
82
 *
83
 *        As a micro-optimization, we allow bits 8 and higher of
84
 *        'bitsleft' to contain garbage.  When consuming the bits
85
 *        associated with a decode table entry, this allows us to do
86
 *        'bitsleft -= entry' instead of 'bitsleft -= (u8)entry'.
87
 *        On some CPUs, this helps reduce instruction dependencies.
88
 *        This does have the disadvantage that 'bitsleft' sometimes
89
 *        needs to be cast to 'u8', such as when it's used as a shift
90
 *        amount in REFILL_BITS_BRANCHLESS().  But that one happens
91
 *        for free since most CPUs ignore high bits in shift amounts.
92
 *
93
 *  - overread_count: the total number of implicit appended zero bytes that
94
 *        have been loaded into the bitbuffer, including any
95
 *        counted by 'bitsleft' and any already consumed
96
 */
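For orientation, here is a minimal sketch of how these variables cooperate when 'n' bits are consumed. This is a hypothetical helper for illustration only; the real consumption code lives in decompress_template.h, and 'machine_word_t' is the same word type that 'bitbuf_t' below is a typedef of.

static forceinline u32
sketch_consume_bits(machine_word_t *bitbuf, u32 *bitsleft, unsigned n)
{
  /* After a refill, the next 'n' stream bits are the low bits of *bitbuf. */
  u32 bits = (u32)(*bitbuf & (((machine_word_t)1 << n) - 1));

  *bitbuf >>= n;   /* remove the consumed bits */
  *bitsleft -= n;  /* per the note above, only (u8)*bitsleft is meaningful */
  return bits;
}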
97
98
/*
99
 * The type for the bitbuffer variable ('bitbuf' described above).  For best
100
 * performance, this should have size equal to a machine word.
101
 *
102
 * 64-bit platforms have a significant advantage: they get a bigger bitbuffer
103
 * which they don't have to refill as often.
104
 */
105
typedef machine_word_t bitbuf_t;
106
339k
#define BITBUF_NBITS  (8 * (int)sizeof(bitbuf_t))
107
108
/* BITMASK(n) returns a bitmask of length 'n'. */
109
452k
#define BITMASK(n)  (((bitbuf_t)1 << (n)) - 1)
110
111
/*
112
 * MAX_BITSLEFT is the maximum number of consumable bits, i.e. the maximum value
113
 * of '(u8)bitsleft'.  This is the size of the bitbuffer variable, minus 1 if
114
 * the branchless refill method is being used (see REFILL_BITS_BRANCHLESS()).
115
 */
116
#define MAX_BITSLEFT  \
117
339k
  (UNALIGNED_ACCESS_IS_FAST ? BITBUF_NBITS - 1 : BITBUF_NBITS)
118
119
/*
120
 * CONSUMABLE_NBITS is the minimum number of bits that are guaranteed to be
121
 * consumable (counted in 'bitsleft') immediately after refilling the bitbuffer.
122
 * Since only whole bytes can be added to 'bitsleft', the worst case is
123
 * 'MAX_BITSLEFT - 7': the smallest amount where another byte doesn't fit.
124
 */
125
198k
#define CONSUMABLE_NBITS  (MAX_BITSLEFT - 7)
126
127
/*
128
 * FASTLOOP_PRELOADABLE_NBITS is the minimum number of bits that are guaranteed
129
 * to be preloadable immediately after REFILL_BITS_IN_FASTLOOP().  (It is *not*
130
 * guaranteed after REFILL_BITS(), since REFILL_BITS() falls back to a
131
 * byte-at-a-time refill method near the end of input.)  This may exceed the
132
 * number of consumable bits (counted by 'bitsleft').  Any bits not counted in
133
 * 'bitsleft' can only be used for precomputation and cannot be consumed.
134
 */
135
#define FASTLOOP_PRELOADABLE_NBITS  \
136
0
  (UNALIGNED_ACCESS_IS_FAST ? BITBUF_NBITS : CONSUMABLE_NBITS)
137
138
/*
139
 * PRELOAD_SLACK is the minimum number of bits that are guaranteed to be
140
 * preloadable but not consumable, following REFILL_BITS_IN_FASTLOOP() and any
141
 * subsequent consumptions.  This is 1 bit if the branchless refill method is
142
 * being used, and 0 bits otherwise.
143
 */
144
#define PRELOAD_SLACK MAX(0, FASTLOOP_PRELOADABLE_NBITS - MAX_BITSLEFT)
145
146
/*
147
 * CAN_CONSUME(n) is true if it's guaranteed that if the bitbuffer has just been
148
 * refilled, then it's always possible to consume 'n' bits from it.  'n' should
149
 * be a compile-time constant, to enable compile-time evaluation.
150
 */
151
7.09k
#define CAN_CONSUME(n)  (CONSUMABLE_NBITS >= (n))
152
153
/*
154
 * CAN_CONSUME_AND_THEN_PRELOAD(consume_nbits, preload_nbits) is true if it's
155
 * guaranteed that after REFILL_BITS_IN_FASTLOOP(), it's always possible to
156
 * consume 'consume_nbits' bits, then preload 'preload_nbits' bits.  The
157
 * arguments should be compile-time constants to enable compile-time evaluation.
158
 */
159
#define CAN_CONSUME_AND_THEN_PRELOAD(consume_nbits, preload_nbits)  \
160
289k
  (CONSUMABLE_NBITS >= (consume_nbits) &&       \
161
289k
   FASTLOOP_PRELOADABLE_NBITS >= (consume_nbits) + (preload_nbits))
162
163
/*
164
 * REFILL_BITS_BRANCHLESS() branchlessly refills the bitbuffer variable by
165
 * reading the next word from the input buffer and updating 'in_next' and
166
 * 'bitsleft' based on how many bits were refilled -- counting whole bytes only.
167
 * This is much faster than reading a byte at a time, at least if the CPU is
168
 * little endian and supports fast unaligned memory accesses.
169
 *
170
 * The simplest way of branchlessly updating 'bitsleft' would be:
171
 *
172
 *  bitsleft += (MAX_BITSLEFT - bitsleft) & ~7;
173
 *
174
 * To make it faster, we define MAX_BITSLEFT to be 'WORDBITS - 1' rather than
175
 * WORDBITS, so that in binary it looks like 111111 or 11111.  Then, we update
176
 * 'bitsleft' by just setting the bits above the low 3 bits:
177
 *
178
 *  bitsleft |= MAX_BITSLEFT & ~7;
179
 *
180
 * That compiles down to a single instruction like 'or $0x38, %rbp'.  Using
181
 * 'MAX_BITSLEFT == WORDBITS - 1' also has the advantage that refills can be
182
 * done when 'bitsleft == MAX_BITSLEFT' without invoking undefined behavior.
183
 *
184
 * The simplest way of branchlessly updating 'in_next' would be:
185
 *
186
 *  in_next += (MAX_BITSLEFT - bitsleft) >> 3;
187
 *
188
 * With 'MAX_BITSLEFT == WORDBITS - 1' we could use an XOR instead, though this
189
 * isn't really better:
190
 *
191
 *  in_next += (MAX_BITSLEFT ^ bitsleft) >> 3;
192
 *
193
 * An alternative which can be marginally better is the following:
194
 *
195
 *  in_next += sizeof(bitbuf_t) - 1;
196
 *  in_next -= (bitsleft >> 3) & 0x7;
197
 *
198
 * It seems this would increase the number of CPU instructions from 3 (sub, shr,
199
 * add) to 4 (add, shr, and, sub).  However, if the CPU has a bitfield
200
 * extraction instruction (e.g. arm's ubfx), it stays at 3, and is potentially
201
 * more efficient because the length of the longest dependency chain decreases
202
 * from 3 to 2.  This alternative also has the advantage that it ignores the
203
 * high bits in 'bitsleft', so it is compatible with the micro-optimization we
204
 * use where we let the high bits of 'bitsleft' contain garbage.
205
 */
206
141k
#define REFILL_BITS_BRANCHLESS()          \
207
141k
do {                 \
208
141k
  bitbuf |= get_unaligned_leword(in_next) << (u8)bitsleft;  \
209
141k
  in_next += sizeof(bitbuf_t) - 1;        \
210
141k
  in_next -= (bitsleft >> 3) & 0x7;       \
211
141k
  bitsleft |= MAX_BITSLEFT & ~7;          \
212
141k
} while (0)
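The equivalence between the simple update 'in_next += (MAX_BITSLEFT - bitsleft) >> 3' and the add/shift/mask form used in the macro can be checked exhaustively over the consumable range. The following stand-alone check is illustrative only (a hypothetical test program, not part of libdeflate) and assumes a 64-bit word with fast unaligned access, so MAX_BITSLEFT == 63 and sizeof(bitbuf_t) == 8:

#include <assert.h>
#include <stddef.h>

int main(void)
{
  const unsigned max_bitsleft = 63;  /* BITBUF_NBITS - 1 on a 64-bit build */

  for (unsigned bitsleft = 0; bitsleft <= max_bitsleft; bitsleft++) {
    size_t simple     = (max_bitsleft - bitsleft) >> 3;
    size_t branchless = (8 - 1) - ((bitsleft >> 3) & 0x7);

    /* Both forms advance 'in_next' by the same number of bytes. */
    assert(simple == branchless);
  }
  return 0;
}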
213
214
/*
215
 * REFILL_BITS() loads bits from the input buffer until the bitbuffer variable
216
 * contains at least CONSUMABLE_NBITS consumable bits.
217
 *
218
 * This checks for the end of input, and it doesn't guarantee
219
 * FASTLOOP_PRELOADABLE_NBITS, so it can't be used in the fastloop.
220
 *
221
 * If we would overread the input buffer, we just don't read anything, leaving
222
 * the bits zeroed but marking them filled.  This simplifies the decompressor
223
 * because it removes the need to always be able to distinguish between real
224
 * overreads and overreads caused only by the decompressor's own lookahead.
225
 *
226
 * We do still keep track of the number of bytes that have been overread, for
227
 * two reasons.  First, it allows us to determine the exact number of bytes that
228
 * were consumed once the stream ends or an uncompressed block is reached.
229
 * Second, it allows us to stop early if the overread amount gets so large (more
230
 * than sizeof bitbuf) that it can only be caused by a real overread.  (The
231
 * second part is arguably unneeded, since libdeflate is buffer-based; given
232
 * infinite zeroes, it will eventually either completely fill the output buffer
233
 * or return an error.  However, we do it to be slightly more friendly to the
234
 * not-recommended use case of decompressing with an unknown output size.)
235
 */
236
41.8k
#define REFILL_BITS()             \
237
41.8k
do {                 \
238
41.8k
  if (UNALIGNED_ACCESS_IS_FAST &&         \
239
41.8k
      likely(in_end - in_next >= sizeof(bitbuf_t))) {   \
240
29.8k
    REFILL_BITS_BRANCHLESS();       \
241
29.8k
  } else {             \
242
22.4k
    while ((u8)bitsleft < CONSUMABLE_NBITS) {   \
243
10.6k
      if (likely(in_next != in_end)) {   \
244
4.43k
        bitbuf |= (bitbuf_t)*in_next++ << \
245
4.43k
            (u8)bitsleft;     \
246
6.25k
      } else {         \
247
6.25k
        overread_count++;     \
248
6.25k
        SAFETY_CHECK(overread_count <=    \
249
6.25k
               sizeof(bitbuf_t));   \
250
6.25k
      }            \
251
10.6k
      bitsleft += 8;          \
252
10.3k
    }              \
253
12.0k
  }                \
254
41.8k
} while (0)
255
256
/*
257
 * REFILL_BITS_IN_FASTLOOP() is like REFILL_BITS(), but it doesn't check for the
258
 * end of the input.  It can only be used in the fastloop.
259
 */
260
111k
#define REFILL_BITS_IN_FASTLOOP()         \
261
111k
do {                 \
262
111k
  STATIC_ASSERT(UNALIGNED_ACCESS_IS_FAST ||      \
263
111k
          FASTLOOP_PRELOADABLE_NBITS == CONSUMABLE_NBITS);  \
264
111k
  if (UNALIGNED_ACCESS_IS_FAST) {         \
265
111k
    REFILL_BITS_BRANCHLESS();       \
266
111k
  } else {             \
267
0
    while ((u8)bitsleft < CONSUMABLE_NBITS) {   \
268
0
      bitbuf |= (bitbuf_t)*in_next++ << (u8)bitsleft; \
269
0
      bitsleft += 8;          \
270
0
    }              \
271
0
  }                \
272
111k
} while (0)
273
274
/*
275
 * This is the worst-case maximum number of output bytes that are written to
276
 * during each iteration of the fastloop.  The worst case is 2 literals, then a
277
 * match of length DEFLATE_MAX_MATCH_LEN.  Additionally, some slack space must
278
 * be included for the intentional overrun in the match copy implementation.
279
 */
280
#define FASTLOOP_MAX_BYTES_WRITTEN  \
281
  (2 + DEFLATE_MAX_MATCH_LEN + (5 * WORDBYTES) - 1)
282
283
/*
284
 * This is the worst-case maximum number of input bytes that are read during
285
 * each iteration of the fastloop.  To get this value, we first compute the
286
 * greatest number of bits that can be refilled during a loop iteration.  The
287
 * refill at the beginning can add at most MAX_BITSLEFT, and the amount that can
288
 * be refilled later is no more than the maximum amount that can be consumed by
289
 * 2 literals that don't need a subtable, then a match.  We convert this value
290
 * to bytes, rounding up; this gives the maximum number of bytes that 'in_next'
291
 * can be advanced.  Finally, we add sizeof(bitbuf_t) to account for
292
 * REFILL_BITS_BRANCHLESS() reading a word past 'in_next'.
293
 */
294
#define FASTLOOP_MAX_BYTES_READ         \
295
  (DIV_ROUND_UP(MAX_BITSLEFT + (2 * LITLEN_TABLEBITS) + \
296
          LENGTH_MAXBITS + OFFSET_MAXBITS, 8) + \
297
   sizeof(bitbuf_t))
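To make these two bounds concrete, here is the arithmetic for a typical 64-bit build with fast unaligned access (MAX_BITSLEFT == 63, WORDBYTES == 8), using the standard DEFLATE limits of a 258-byte maximum match, 15-bit maximum codewords, and at most 5 length / 13 offset extra bits:

/*
 * FASTLOOP_MAX_BYTES_WRITTEN = 2 + 258 + (5 * 8) - 1                   = 299
 *
 * max refillable bits per iteration = 63 + 2*11 + (15 + 5) + (15 + 13) = 133
 * FASTLOOP_MAX_BYTES_READ           = DIV_ROUND_UP(133, 8) + 8
 *                                   = 17 + 8                           = 25
 */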
298
299
/*****************************************************************************
300
 *                              Huffman decoding                             *
301
 *****************************************************************************/
302
303
/*
304
 * The fastest way to decode Huffman-encoded data is basically to use a decode
305
 * table that maps the next TABLEBITS bits of data to their symbol.  Each entry
306
 * decode_table[i] maps to the symbol whose codeword is a prefix of 'i'.  A
307
 * symbol with codeword length 'n' has '2**(TABLEBITS-n)' entries in the table.
308
 *
309
 * Ideally, TABLEBITS and the maximum codeword length would be the same; some
310
 * compression formats are designed with this goal in mind.  Unfortunately, in
311
 * DEFLATE, the maximum litlen and offset codeword lengths are 15 bits, which is
312
 * too large for a practical TABLEBITS.  It's not *that* much larger, though, so
313
 * the workaround is to use a single level of subtables.  In the main table,
314
 * entries for prefixes of codewords longer than TABLEBITS contain a "pointer"
315
 * to the appropriate subtable along with the number of bits it is indexed with.
316
 *
317
 * The most efficient way to allocate subtables is to allocate them dynamically
318
 * after the main table.  The worst-case number of table entries needed,
319
 * including subtables, is precomputable; see the ENOUGH constants below.
320
 *
321
 * A useful optimization is to store the codeword lengths in the decode table so
322
 * that they don't have to be looked up by indexing a separate table that maps
323
 * symbols to their codeword lengths.  We basically do this; however, for the
324
 * litlen and offset codes we also implement some DEFLATE-specific optimizations
325
 * that build in the consideration of the "extra bits" and the
326
 * literal/length/end-of-block division.  For the exact decode table entry
327
 * format we use, see the definitions of the *_decode_results[] arrays below.
328
 */
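As a toy illustration of the replication rule described above (hypothetical example, not libdeflate code): take TABLEBITS = 3 and the canonical code A = 0 (1 bit), B = 10, C = 11 (2 bits each). With bit-reversed indexing, the codeword occupies the low-order bits of the table index, so A fills 2^(3-1) = 4 entries and B and C fill 2^(3-2) = 2 entries each:

#include <stdio.h>

int main(void)
{
  char table[1 << 3];  /* 2^TABLEBITS entries for this toy code */

  for (unsigned i = 0; i < 8; i++) {
    if ((i & 0x1) == 0x0)       /* low bit matches reversed '0'  */
      table[i] = 'A';
    else if ((i & 0x3) == 0x1)  /* low bits match reversed '10' */
      table[i] = 'B';
    else                        /* low bits match reversed '11' */
      table[i] = 'C';
  }
  for (unsigned i = 0; i < 8; i++)
    printf("table[%u] = %c\n", i, table[i]);  /* A B A C A B A C */
  return 0;
}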
329
330
331
/*
332
 * These are the TABLEBITS values we use for each of the DEFLATE Huffman codes,
333
 * along with their corresponding ENOUGH values.
334
 *
335
 * For the precode, we use PRECODE_TABLEBITS == 7 since this is the maximum
336
 * precode codeword length.  This avoids ever needing subtables.
337
 *
338
 * For the litlen and offset codes, we cannot realistically avoid ever needing
339
 * subtables, since litlen and offset codewords can be up to 15 bits.  A higher
340
 * TABLEBITS reduces the number of lookups that need a subtable, which increases
341
 * performance; however, it increases memory usage and makes building the table
342
 * take longer, which decreases performance.  We choose values that work well in
343
 * practice, making subtables rarely needed without making the tables too large.
344
 *
345
 * Our choice of OFFSET_TABLEBITS == 8 is a bit low; without any special
346
 * considerations, 9 would fit the trade-off curve better.  However, there is a
347
 * performance benefit to using exactly 8 bits when it is a compile-time
348
 * constant, as many CPUs can take the low byte more easily than the low 9 bits.
349
 *
350
 * zlib treats its equivalents of TABLEBITS as maximum values; whenever it
351
 * builds a table, it caps the actual table_bits to the longest codeword.  This
352
 * makes sense in theory, as there's no need for the table to be any larger than
353
 * needed to support the longest codeword.  However, having the table bits be a
354
 * compile-time constant is beneficial to the performance of the decode loop, so
355
 * there is a trade-off.  libdeflate currently uses the dynamic table_bits
356
 * strategy for the litlen table only, due to its larger maximum size.
357
 * PRECODE_TABLEBITS and OFFSET_TABLEBITS are smaller, so going dynamic there
358
 * isn't as useful, and OFFSET_TABLEBITS=8 is useful as mentioned above.
359
 *
360
 * Each TABLEBITS value has a corresponding ENOUGH value that gives the
361
 * worst-case maximum number of decode table entries, including the main table
362
 * and all subtables.  The ENOUGH value depends on three parameters:
363
 *
364
 *  (1) the maximum number of symbols in the code (DEFLATE_NUM_*_SYMS)
365
 *  (2) the maximum number of main table bits (*_TABLEBITS)
366
 *  (3) the maximum allowed codeword length (DEFLATE_MAX_*_CODEWORD_LEN)
367
 *
368
 * The ENOUGH values were computed using the utility program 'enough' from zlib.
369
 */
370
2.27k
#define PRECODE_TABLEBITS 7
371
#define PRECODE_ENOUGH    128 /* enough 19 7 7  */
372
4.73k
#define LITLEN_TABLEBITS  11
373
#define LITLEN_ENOUGH   2342  /* enough 288 11 15 */
374
4.76k
#define OFFSET_TABLEBITS  8
375
#define OFFSET_ENOUGH   402 /* enough 32 8 15 */
376
377
/*
378
 * make_decode_table_entry() creates a decode table entry for the given symbol
379
 * by combining the static part 'decode_results[sym]' with the dynamic part
380
 * 'len', which is the remaining codeword length (the codeword length for main
381
 * table entries, or the codeword length minus TABLEBITS for subtable entries).
382
 *
383
 * In all cases, we add 'len' to each of the two low-order bytes to create the
384
 * appropriately-formatted decode table entry.  See the definitions of the
385
 * *_decode_results[] arrays below, where the entry format is described.
386
 */
387
static forceinline u32
388
make_decode_table_entry(const u32 decode_results[], u32 sym, u32 len)
389
1.09M
{
390
1.09M
  return decode_results[sym] + (len << 8) + len;
391
1.09M
}
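As a quick worked example of the combination performed here (values chosen purely for illustration): a precode symbol 5 with codeword length 3 has the static part ENTRY(5) = 5 << 16, so the final entry is

/*
 * make_decode_table_entry(precode_decode_results, 5, 3)
 *   == (5 << 16) + (3 << 8) + 3
 *   == 0x00050303
 * i.e. 'presym' in bits 20-16 and the codeword length in each of the two low
 * bytes, matching the precode entry format described below.
 */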
392
393
/*
394
 * Here is the format of our precode decode table entries.  Bits not explicitly
395
 * described contain zeroes:
396
 *
397
 *  Bit 20-16:  presym
398
 *  Bit 10-8:   codeword length [not used]
399
 *  Bit 2-0:    codeword length
400
 *
401
 * The precode decode table never has subtables, since we use
402
 * PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN.
403
 *
404
 * precode_decode_results[] contains the static part of the entry for each
405
 * symbol.  make_decode_table_entry() produces the final entries.
406
 */
407
static const u32 precode_decode_results[] = {
408
#define ENTRY(presym) ((u32)presym << 16)
409
  ENTRY(0)   , ENTRY(1)   , ENTRY(2)   , ENTRY(3)   ,
410
  ENTRY(4)   , ENTRY(5)   , ENTRY(6)   , ENTRY(7)   ,
411
  ENTRY(8)   , ENTRY(9)   , ENTRY(10)  , ENTRY(11)  ,
412
  ENTRY(12)  , ENTRY(13)  , ENTRY(14)  , ENTRY(15)  ,
413
  ENTRY(16)  , ENTRY(17)  , ENTRY(18)  ,
414
#undef ENTRY
415
};
416
417
/* Litlen and offset decode table entry flags */
418
419
/* Indicates a literal entry in the litlen decode table */
420
266k
#define HUFFDEC_LITERAL     0x80000000
421
422
/* Indicates that HUFFDEC_SUBTABLE_POINTER or HUFFDEC_END_OF_BLOCK is set */
423
1.28k
#define HUFFDEC_EXCEPTIONAL   0x00008000
424
425
/* Indicates a subtable pointer entry in the litlen or offset decode table */
426
1.28k
#define HUFFDEC_SUBTABLE_POINTER  0x00004000
427
428
/* Indicates an end-of-block entry in the litlen decode table */
429
#define HUFFDEC_END_OF_BLOCK    0x00002000
430
431
/* Maximum number of bits that can be consumed by decoding a match length */
432
#define LENGTH_MAXBITS    (DEFLATE_MAX_LITLEN_CODEWORD_LEN + \
433
         DEFLATE_MAX_EXTRA_LENGTH_BITS)
434
#define LENGTH_MAXFASTBITS  (LITLEN_TABLEBITS /* no subtable needed */ + \
435
         DEFLATE_MAX_EXTRA_LENGTH_BITS)
436
437
/*
438
 * Here is the format of our litlen decode table entries.  Bits not explicitly
439
 * described contain zeroes:
440
 *
441
 *  Literals:
442
 *    Bit 31:     1 (HUFFDEC_LITERAL)
443
 *    Bit 23-16:  literal value
444
 *    Bit 15:     0 (!HUFFDEC_EXCEPTIONAL)
445
 *    Bit 14:     0 (!HUFFDEC_SUBTABLE_POINTER)
446
 *    Bit 13:     0 (!HUFFDEC_END_OF_BLOCK)
447
 *    Bit 11-8:   remaining codeword length [not used]
448
 *    Bit 3-0:    remaining codeword length
449
 *  Lengths:
450
 *    Bit 31:     0 (!HUFFDEC_LITERAL)
451
 *    Bit 24-16:  length base value
452
 *    Bit 15:     0 (!HUFFDEC_EXCEPTIONAL)
453
 *    Bit 14:     0 (!HUFFDEC_SUBTABLE_POINTER)
454
 *    Bit 13:     0 (!HUFFDEC_END_OF_BLOCK)
455
 *    Bit 11-8:   remaining codeword length
456
 *    Bit 4-0:    remaining codeword length + number of extra bits
457
 *  End of block:
458
 *    Bit 31:     0 (!HUFFDEC_LITERAL)
459
 *    Bit 15:     1 (HUFFDEC_EXCEPTIONAL)
460
 *    Bit 14:     0 (!HUFFDEC_SUBTABLE_POINTER)
461
 *    Bit 13:     1 (HUFFDEC_END_OF_BLOCK)
462
 *    Bit 11-8:   remaining codeword length [not used]
463
 *    Bit 3-0:    remaining codeword length
464
 *  Subtable pointer:
465
 *    Bit 31:     0 (!HUFFDEC_LITERAL)
466
 *    Bit 30-16:  index of start of subtable
467
 *    Bit 15:     1 (HUFFDEC_EXCEPTIONAL)
468
 *    Bit 14:     1 (HUFFDEC_SUBTABLE_POINTER)
469
 *    Bit 13:     0 (!HUFFDEC_END_OF_BLOCK)
470
 *    Bit 11-8:   number of subtable bits
471
 *    Bit 3-0:    number of main table bits
472
 *
473
 * This format has several desirable properties:
474
 *
475
 *  - The codeword length, length slot base, and number of extra length bits
476
 *    are all built in.  This eliminates the need to separately look up this
477
 *    information by indexing separate arrays by symbol or length slot.
478
 *
479
 *  - The HUFFDEC_* flags enable easily distinguishing between the different
480
 *    types of entries.  The HUFFDEC_LITERAL flag enables a fast path for
481
 *    literals; the high bit is used for this, as some CPUs can test the
482
 *    high bit more easily than other bits.  The HUFFDEC_EXCEPTIONAL flag
483
 *    makes it possible to detect the two unlikely cases (subtable pointer
484
 *    and end of block) in a single bit flag test.
485
 *
486
 *  - The low byte is the number of bits that need to be removed from the
487
 *    bitstream; this makes this value easily accessible, and it enables the
488
 *    micro-optimization of doing 'bitsleft -= entry' instead of
489
 *    'bitsleft -= (u8)entry'.  It also includes the number of extra bits,
490
 *    so they don't need to be removed separately.
491
 *
492
 *  - The flags in bits 15-13 are arranged to be 0 when the
493
 *    "remaining codeword length" in bits 11-8 is needed, making this value
494
 *    fairly easily accessible as well via a shift and downcast.
495
 *
496
 *  - Similarly, bits 13-12 are 0 when the "subtable bits" in bits 11-8 are
497
 *    needed, making it possible to extract this value with '& 0x3F' rather
498
 *    than '& 0xF'.  This value is only used as a shift amount, so this can
499
 *    save an 'and' instruction as the masking by 0x3F happens implicitly.
500
 *
501
 * litlen_decode_results[] contains the static part of the entry for each
502
 * symbol.  make_decode_table_entry() produces the final entries.
503
 */
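To show how this layout is meant to be picked apart, here is an illustrative sketch (a hypothetical helper; the real decode loop is in decompress_template.h and also handles the bitstream updates) of classifying a litlen entry after it has been fetched from the table:

static forceinline void
sketch_interpret_litlen_entry(u32 entry)
{
  if (entry & HUFFDEC_LITERAL) {
    u8 literal = (u8)(entry >> 16);  /* literal value, bits 23-16 */

    (void)literal;  /* fast path: copy the literal to the output */
  } else if (entry & HUFFDEC_EXCEPTIONAL) {
    /*
     * Unlikely cases: HUFFDEC_SUBTABLE_POINTER (redo the lookup in the
     * subtable) or HUFFDEC_END_OF_BLOCK (finish the block).
     */
  } else {
    u32 length_base = entry >> 16;  /* length base value, bits 24-16 */

    (void)length_base;  /* next, add the extra length bits */
  }
  /*
   * In every case the low byte gives the number of bits to remove, so
   * 'bitsleft -= entry' (see above) consumes the right amount.
   */
}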
504
static const u32 litlen_decode_results[] = {
505
506
  /* Literals */
507
#define ENTRY(literal)  (HUFFDEC_LITERAL | ((u32)literal << 16))
508
  ENTRY(0)   , ENTRY(1)   , ENTRY(2)   , ENTRY(3)   ,
509
  ENTRY(4)   , ENTRY(5)   , ENTRY(6)   , ENTRY(7)   ,
510
  ENTRY(8)   , ENTRY(9)   , ENTRY(10)  , ENTRY(11)  ,
511
  ENTRY(12)  , ENTRY(13)  , ENTRY(14)  , ENTRY(15)  ,
512
  ENTRY(16)  , ENTRY(17)  , ENTRY(18)  , ENTRY(19)  ,
513
  ENTRY(20)  , ENTRY(21)  , ENTRY(22)  , ENTRY(23)  ,
514
  ENTRY(24)  , ENTRY(25)  , ENTRY(26)  , ENTRY(27)  ,
515
  ENTRY(28)  , ENTRY(29)  , ENTRY(30)  , ENTRY(31)  ,
516
  ENTRY(32)  , ENTRY(33)  , ENTRY(34)  , ENTRY(35)  ,
517
  ENTRY(36)  , ENTRY(37)  , ENTRY(38)  , ENTRY(39)  ,
518
  ENTRY(40)  , ENTRY(41)  , ENTRY(42)  , ENTRY(43)  ,
519
  ENTRY(44)  , ENTRY(45)  , ENTRY(46)  , ENTRY(47)  ,
520
  ENTRY(48)  , ENTRY(49)  , ENTRY(50)  , ENTRY(51)  ,
521
  ENTRY(52)  , ENTRY(53)  , ENTRY(54)  , ENTRY(55)  ,
522
  ENTRY(56)  , ENTRY(57)  , ENTRY(58)  , ENTRY(59)  ,
523
  ENTRY(60)  , ENTRY(61)  , ENTRY(62)  , ENTRY(63)  ,
524
  ENTRY(64)  , ENTRY(65)  , ENTRY(66)  , ENTRY(67)  ,
525
  ENTRY(68)  , ENTRY(69)  , ENTRY(70)  , ENTRY(71)  ,
526
  ENTRY(72)  , ENTRY(73)  , ENTRY(74)  , ENTRY(75)  ,
527
  ENTRY(76)  , ENTRY(77)  , ENTRY(78)  , ENTRY(79)  ,
528
  ENTRY(80)  , ENTRY(81)  , ENTRY(82)  , ENTRY(83)  ,
529
  ENTRY(84)  , ENTRY(85)  , ENTRY(86)  , ENTRY(87)  ,
530
  ENTRY(88)  , ENTRY(89)  , ENTRY(90)  , ENTRY(91)  ,
531
  ENTRY(92)  , ENTRY(93)  , ENTRY(94)  , ENTRY(95)  ,
532
  ENTRY(96)  , ENTRY(97)  , ENTRY(98)  , ENTRY(99)  ,
533
  ENTRY(100) , ENTRY(101) , ENTRY(102) , ENTRY(103) ,
534
  ENTRY(104) , ENTRY(105) , ENTRY(106) , ENTRY(107) ,
535
  ENTRY(108) , ENTRY(109) , ENTRY(110) , ENTRY(111) ,
536
  ENTRY(112) , ENTRY(113) , ENTRY(114) , ENTRY(115) ,
537
  ENTRY(116) , ENTRY(117) , ENTRY(118) , ENTRY(119) ,
538
  ENTRY(120) , ENTRY(121) , ENTRY(122) , ENTRY(123) ,
539
  ENTRY(124) , ENTRY(125) , ENTRY(126) , ENTRY(127) ,
540
  ENTRY(128) , ENTRY(129) , ENTRY(130) , ENTRY(131) ,
541
  ENTRY(132) , ENTRY(133) , ENTRY(134) , ENTRY(135) ,
542
  ENTRY(136) , ENTRY(137) , ENTRY(138) , ENTRY(139) ,
543
  ENTRY(140) , ENTRY(141) , ENTRY(142) , ENTRY(143) ,
544
  ENTRY(144) , ENTRY(145) , ENTRY(146) , ENTRY(147) ,
545
  ENTRY(148) , ENTRY(149) , ENTRY(150) , ENTRY(151) ,
546
  ENTRY(152) , ENTRY(153) , ENTRY(154) , ENTRY(155) ,
547
  ENTRY(156) , ENTRY(157) , ENTRY(158) , ENTRY(159) ,
548
  ENTRY(160) , ENTRY(161) , ENTRY(162) , ENTRY(163) ,
549
  ENTRY(164) , ENTRY(165) , ENTRY(166) , ENTRY(167) ,
550
  ENTRY(168) , ENTRY(169) , ENTRY(170) , ENTRY(171) ,
551
  ENTRY(172) , ENTRY(173) , ENTRY(174) , ENTRY(175) ,
552
  ENTRY(176) , ENTRY(177) , ENTRY(178) , ENTRY(179) ,
553
  ENTRY(180) , ENTRY(181) , ENTRY(182) , ENTRY(183) ,
554
  ENTRY(184) , ENTRY(185) , ENTRY(186) , ENTRY(187) ,
555
  ENTRY(188) , ENTRY(189) , ENTRY(190) , ENTRY(191) ,
556
  ENTRY(192) , ENTRY(193) , ENTRY(194) , ENTRY(195) ,
557
  ENTRY(196) , ENTRY(197) , ENTRY(198) , ENTRY(199) ,
558
  ENTRY(200) , ENTRY(201) , ENTRY(202) , ENTRY(203) ,
559
  ENTRY(204) , ENTRY(205) , ENTRY(206) , ENTRY(207) ,
560
  ENTRY(208) , ENTRY(209) , ENTRY(210) , ENTRY(211) ,
561
  ENTRY(212) , ENTRY(213) , ENTRY(214) , ENTRY(215) ,
562
  ENTRY(216) , ENTRY(217) , ENTRY(218) , ENTRY(219) ,
563
  ENTRY(220) , ENTRY(221) , ENTRY(222) , ENTRY(223) ,
564
  ENTRY(224) , ENTRY(225) , ENTRY(226) , ENTRY(227) ,
565
  ENTRY(228) , ENTRY(229) , ENTRY(230) , ENTRY(231) ,
566
  ENTRY(232) , ENTRY(233) , ENTRY(234) , ENTRY(235) ,
567
  ENTRY(236) , ENTRY(237) , ENTRY(238) , ENTRY(239) ,
568
  ENTRY(240) , ENTRY(241) , ENTRY(242) , ENTRY(243) ,
569
  ENTRY(244) , ENTRY(245) , ENTRY(246) , ENTRY(247) ,
570
  ENTRY(248) , ENTRY(249) , ENTRY(250) , ENTRY(251) ,
571
  ENTRY(252) , ENTRY(253) , ENTRY(254) , ENTRY(255) ,
572
#undef ENTRY
573
574
  /* End of block */
575
  HUFFDEC_EXCEPTIONAL | HUFFDEC_END_OF_BLOCK,
576
577
  /* Lengths */
578
#define ENTRY(length_base, num_extra_bits)  \
579
  (((u32)(length_base) << 16) | (num_extra_bits))
580
  ENTRY(3  , 0) , ENTRY(4  , 0) , ENTRY(5  , 0) , ENTRY(6  , 0),
581
  ENTRY(7  , 0) , ENTRY(8  , 0) , ENTRY(9  , 0) , ENTRY(10 , 0),
582
  ENTRY(11 , 1) , ENTRY(13 , 1) , ENTRY(15 , 1) , ENTRY(17 , 1),
583
  ENTRY(19 , 2) , ENTRY(23 , 2) , ENTRY(27 , 2) , ENTRY(31 , 2),
584
  ENTRY(35 , 3) , ENTRY(43 , 3) , ENTRY(51 , 3) , ENTRY(59 , 3),
585
  ENTRY(67 , 4) , ENTRY(83 , 4) , ENTRY(99 , 4) , ENTRY(115, 4),
586
  ENTRY(131, 5) , ENTRY(163, 5) , ENTRY(195, 5) , ENTRY(227, 5),
587
  ENTRY(258, 0) , ENTRY(258, 0) , ENTRY(258, 0) ,
588
#undef ENTRY
589
};
590
591
/* Maximum number of bits that can be consumed by decoding a match offset */
592
#define OFFSET_MAXBITS    (DEFLATE_MAX_OFFSET_CODEWORD_LEN + \
593
         DEFLATE_MAX_EXTRA_OFFSET_BITS)
594
#define OFFSET_MAXFASTBITS  (OFFSET_TABLEBITS /* no subtable needed */ + \
595
         DEFLATE_MAX_EXTRA_OFFSET_BITS)
596
597
/*
598
 * Here is the format of our offset decode table entries.  Bits not explicitly
599
 * described contain zeroes:
600
 *
601
 *  Offsets:
602
 *    Bit 31-16:  offset base value
603
 *    Bit 15:     0 (!HUFFDEC_EXCEPTIONAL)
604
 *    Bit 14:     0 (!HUFFDEC_SUBTABLE_POINTER)
605
 *    Bit 11-8:   remaining codeword length
606
 *    Bit 4-0:    remaining codeword length + number of extra bits
607
 *  Subtable pointer:
608
 *    Bit 31-16:  index of start of subtable
609
 *    Bit 15:     1 (HUFFDEC_EXCEPTIONAL)
610
 *    Bit 14:     1 (HUFFDEC_SUBTABLE_POINTER)
611
 *    Bit 11-8:   number of subtable bits
612
 *    Bit 3-0:    number of main table bits
613
 *
614
 * These work the same way as the length entries and subtable pointer entries in
615
 * the litlen decode table; see litlen_decode_results[] above.
616
 */
617
static const u32 offset_decode_results[] = {
618
#define ENTRY(offset_base, num_extra_bits)  \
619
  (((u32)(offset_base) << 16) | (num_extra_bits))
620
  ENTRY(1     , 0)  , ENTRY(2     , 0)  , ENTRY(3     , 0)  , ENTRY(4     , 0)  ,
621
  ENTRY(5     , 1)  , ENTRY(7     , 1)  , ENTRY(9     , 2)  , ENTRY(13    , 2) ,
622
  ENTRY(17    , 3)  , ENTRY(25    , 3)  , ENTRY(33    , 4)  , ENTRY(49    , 4)  ,
623
  ENTRY(65    , 5)  , ENTRY(97    , 5)  , ENTRY(129   , 6)  , ENTRY(193   , 6)  ,
624
  ENTRY(257   , 7)  , ENTRY(385   , 7)  , ENTRY(513   , 8)  , ENTRY(769   , 8)  ,
625
  ENTRY(1025  , 9)  , ENTRY(1537  , 9)  , ENTRY(2049  , 10) , ENTRY(3073  , 10) ,
626
  ENTRY(4097  , 11) , ENTRY(6145  , 11) , ENTRY(8193  , 12) , ENTRY(12289 , 12) ,
627
  ENTRY(16385 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) ,
628
#undef ENTRY
629
};
630
631
/*
632
 * The main DEFLATE decompressor structure.  Since libdeflate only supports
633
 * full-buffer decompression, this structure doesn't store the entire
634
 * decompression state, most of which is in stack variables.  Instead, this
635
 * struct just contains the decode tables and some temporary arrays used for
636
 * building them, as these are too large to comfortably allocate on the stack.
637
 *
638
 * Storing the decode tables in the decompressor struct also allows the decode
639
 * tables for the static codes to be reused whenever two static Huffman blocks
640
 * are decoded without an intervening dynamic block, even across streams.
641
 */
642
struct libdeflate_decompressor {
643
644
  /*
645
   * The arrays aren't all needed at the same time.  'precode_lens' and
646
   * 'precode_decode_table' are unneeded after 'lens' has been filled.
647
   * Furthermore, 'lens' need not be retained after building the litlen
648
   * and offset decode tables.  In fact, 'lens' can be in union with
649
   * 'litlen_decode_table' provided that 'offset_decode_table' is separate
650
   * and is built first.
651
   */
652
653
  union {
654
    u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS];
655
656
    struct {
657
      u8 lens[DEFLATE_NUM_LITLEN_SYMS +
658
        DEFLATE_NUM_OFFSET_SYMS +
659
        DEFLATE_MAX_LENS_OVERRUN];
660
661
      u32 precode_decode_table[PRECODE_ENOUGH];
662
    } l;
663
664
    u32 litlen_decode_table[LITLEN_ENOUGH];
665
  } u;
666
667
  u32 offset_decode_table[OFFSET_ENOUGH];
668
669
  /* used only during build_decode_table() */
670
  u16 sorted_syms[DEFLATE_MAX_NUM_SYMS];
671
672
  bool static_codes_loaded;
673
  unsigned litlen_tablebits;
674
};
675
676
/*
677
 * Build a table for fast decoding of symbols from a Huffman code.  As input,
678
 * this function takes the codeword length of each symbol which may be used in
679
 * the code.  As output, it produces a decode table for the canonical Huffman
680
 * code described by the codeword lengths.  The decode table is built with the
681
 * assumption that it will be indexed with "bit-reversed" codewords, where the
682
 * low-order bit is the first bit of the codeword.  This format is used for all
683
 * Huffman codes in DEFLATE.
684
 *
685
 * @decode_table
686
 *  The array in which the decode table will be generated.  This array must
687
 *  have sufficient length; see the definition of the ENOUGH numbers.
688
 * @lens
689
 *  An array which provides, for each symbol, the length of the
690
 *  corresponding codeword in bits, or 0 if the symbol is unused.  This may
691
 *  alias @decode_table, since nothing is written to @decode_table until all
692
 *  @lens have been consumed.  All codeword lengths are assumed to be <=
693
 *  @max_codeword_len but are otherwise considered untrusted.  If they do
694
 *  not form a valid Huffman code, then the decode table is not built and
695
 *  %false is returned.
696
 * @num_syms
697
 *  The number of symbols in the code, including all unused symbols.
698
 * @decode_results
699
 *  An array which gives the incomplete decode result for each symbol.  The
700
 *  needed values in this array will be combined with codeword lengths to
701
 *  make the final decode table entries using make_decode_table_entry().
702
 * @table_bits
703
 *  The log base-2 of the number of main table entries to use.
704
 *  If @table_bits_ret != NULL, then @table_bits is treated as a maximum
705
 *  value and it will be decreased if a smaller table would be sufficient.
706
 * @max_codeword_len
707
 *  The maximum allowed codeword length for this Huffman code.
708
 *  Must be <= DEFLATE_MAX_CODEWORD_LEN.
709
 * @sorted_syms
710
 *  A temporary array of length @num_syms.
711
 * @table_bits_ret
712
 *  If non-NULL, then the dynamic table_bits is enabled, and the actual
713
 *  table_bits value will be returned here.
714
 *
715
 * Returns %true if successful; %false if the codeword lengths do not form a
716
 * valid Huffman code.
717
 */
718
static bool
719
build_decode_table(u32 decode_table[],
720
       const u8 lens[],
721
       const unsigned num_syms,
722
       const u32 decode_results[],
723
       unsigned table_bits,
724
       unsigned max_codeword_len,
725
       u16 *sorted_syms,
726
       unsigned *table_bits_ret)
727
11.7k
{
728
11.7k
  unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1];
729
11.7k
  unsigned offsets[DEFLATE_MAX_CODEWORD_LEN + 1];
730
11.7k
  unsigned sym;   /* current symbol */
731
11.7k
  unsigned codeword;  /* current codeword, bit-reversed */
732
11.7k
  unsigned len;   /* current codeword length in bits */
733
11.7k
  unsigned count;   /* num codewords remaining with this length */
734
11.7k
  u32 codespace_used; /* codespace used out of '2^max_codeword_len' */
735
11.7k
  unsigned cur_table_end; /* end index of current table */
736
11.7k
  unsigned subtable_prefix; /* codeword prefix of current subtable */
737
11.7k
  unsigned subtable_start;  /* start index of current subtable */
738
11.7k
  unsigned subtable_bits;   /* log2 of current subtable length */
739
740
  /* Count how many codewords have each length, including 0. */
741
181k
  for (len = 0; len <= max_codeword_len; len++)
742
170k
    len_counts[len] = 0;
743
1.51M
  for (sym = 0; sym < num_syms; sym++)
744
1.50M
    len_counts[lens[sym]]++;
745
746
  /*
747
   * Determine the actual maximum codeword length that was used, and
748
   * decrease table_bits to it if allowed.
749
   */
750
103k
  while (max_codeword_len > 1 && len_counts[max_codeword_len] == 0)
751
91.6k
    max_codeword_len--;
752
11.7k
  if (table_bits_ret != NULL) {
753
4.73k
    table_bits = MIN(table_bits, max_codeword_len);
754
4.73k
    *table_bits_ret = table_bits;
755
4.73k
  }
756
757
  /*
758
   * Sort the symbols primarily by increasing codeword length and
759
   * secondarily by increasing symbol value; or equivalently by their
760
   * codewords in lexicographic order, since a canonical code is assumed.
761
   *
762
   * For efficiency, also compute 'codespace_used' in the same pass over
763
   * 'len_counts[]' used to build 'offsets[]' for sorting.
764
   */
765
766
  /* Ensure that 'codespace_used' cannot overflow. */
767
11.7k
  STATIC_ASSERT(sizeof(codespace_used) == 4);
768
11.7k
  STATIC_ASSERT(UINT32_MAX / (1U << (DEFLATE_MAX_CODEWORD_LEN - 1)) >=
769
11.7k
          DEFLATE_MAX_NUM_SYMS);
770
771
11.7k
  offsets[0] = 0;
772
11.7k
  offsets[1] = len_counts[0];
773
11.7k
  codespace_used = 0;
774
66.7k
  for (len = 1; len < max_codeword_len; len++) {
775
54.9k
    offsets[len + 1] = offsets[len] + len_counts[len];
776
54.9k
    codespace_used = (codespace_used << 1) + len_counts[len];
777
54.9k
  }
778
11.7k
  codespace_used = (codespace_used << 1) + len_counts[len];
779
780
1.51M
  for (sym = 0; sym < num_syms; sym++)
781
1.50M
    sorted_syms[offsets[lens[sym]]++] = sym;
782
783
11.7k
  sorted_syms += offsets[0]; /* Skip unused symbols */
784
785
  /* lens[] is done being used, so we can write to decode_table[] now. */
786
787
  /*
788
   * Check whether the lengths form a complete code (exactly fills the
789
   * codespace), an incomplete code (doesn't fill the codespace), or an
790
   * overfull code (overflows the codespace).  A codeword of length 'n'
791
   * uses proportion '1/(2^n)' of the codespace.  An overfull code is
792
   * nonsensical, so is considered invalid.  An incomplete code is
793
   * considered valid only in two specific cases; see below.
794
   */
795
796
  /* overfull code? */
797
11.7k
  if (unlikely(codespace_used > (1U << max_codeword_len)))
798
51
    return false;
799
800
  /* incomplete code? */
801
11.7k
  if (unlikely(codespace_used < (1U << max_codeword_len))) {
802
2.27k
    u32 entry;
803
2.27k
    unsigned i;
804
805
2.27k
    if (codespace_used == 0) {
806
      /*
807
       * An empty code is allowed.  This can happen for the
808
       * offset code in DEFLATE, since a dynamic Huffman block
809
       * need not contain any matches.
810
       */
811
812
      /* sym=0, len=1 (arbitrary) */
813
2.03k
      entry = make_decode_table_entry(decode_results, 0, 1);
814
2.03k
    } else {
815
      /*
816
       * Allow codes with a single used symbol, with codeword
817
       * length 1.  The DEFLATE RFC is unclear regarding this
818
       * case.  What zlib's decompressor does is permit this
819
       * for the litlen and offset codes and assume the
820
       * codeword is '0' rather than '1'.  We do the same
821
       * except we allow this for precodes too, since there's
822
       * no convincing reason to treat the codes differently.
823
       * We also assign both codewords '0' and '1' to the
824
       * symbol to avoid having to handle '1' specially.
825
       */
826
234
      if (codespace_used != (1U << (max_codeword_len - 1)) ||
827
234
          len_counts[1] != 1)
828
92
        return false;
829
142
      entry = make_decode_table_entry(decode_results,
830
142
              *sorted_syms, 1);
831
142
    }
832
    /*
833
     * Note: the decode table still must be fully initialized, in
834
     * case the stream is malformed and contains bits from the part
835
     * of the codespace the incomplete code doesn't use.
836
     */
837
526k
    for (i = 0; i < (1U << table_bits); i++)
838
524k
      decode_table[i] = entry;
839
2.18k
    return true;
840
2.27k
  }
841
842
  /*
843
   * The lengths form a complete code.  Now, enumerate the codewords in
844
   * lexicographic order and fill the decode table entries for each one.
845
   *
846
   * First, process all codewords with len <= table_bits.  Each one gets
847
   * '2^(table_bits-len)' direct entries in the table.
848
   *
849
   * Since DEFLATE uses bit-reversed codewords, these entries aren't
850
   * consecutive but rather are spaced '2^len' entries apart.  This makes
851
   * filling them naively somewhat awkward and inefficient, since strided
852
   * stores are less cache-friendly and preclude the use of word or
853
   * vector-at-a-time stores to fill multiple entries per instruction.
854
   *
855
   * To optimize this, we incrementally double the table size.  When
856
   * processing codewords with length 'len', the table is treated as
857
   * having only '2^len' entries, so each codeword uses just one entry.
858
   * Then, each time 'len' is incremented, the table size is doubled and
859
   * the first half is copied to the second half.  This significantly
860
   * improves performance over naively doing strided stores.
861
   *
862
   * Note that some entries copied for each table doubling may not have
863
   * been initialized yet, but it doesn't matter since they're guaranteed
864
   * to be initialized later (because the Huffman code is complete).
865
   */
866
9.44k
  codeword = 0;
867
9.44k
  len = 1;
868
46.6k
  while ((count = len_counts[len]) == 0)
869
37.1k
    len++;
870
9.44k
  cur_table_end = 1U << len;
871
24.8k
  while (len <= table_bits) {
872
    /* Process all 'count' codewords with length 'len' bits. */
873
1.09M
    do {
874
1.09M
      unsigned bit;
875
876
      /* Fill the first entry for the current codeword. */
877
1.09M
      decode_table[codeword] =
878
1.09M
        make_decode_table_entry(decode_results,
879
1.09M
              *sorted_syms++, len);
880
881
1.09M
      if (codeword == cur_table_end - 1) {
882
        /* Last codeword (all 1's) */
883
21.1k
        for (; len < table_bits; len++) {
884
11.9k
          memcpy(&decode_table[cur_table_end],
885
11.9k
                 decode_table,
886
11.9k
                 cur_table_end *
887
11.9k
            sizeof(decode_table[0]));
888
11.9k
          cur_table_end <<= 1;
889
11.9k
        }
890
9.21k
        return true;
891
9.21k
      }
892
      /*
893
       * To advance to the lexicographically next codeword in
894
       * the canonical code, the codeword must be incremented,
895
       * then 0's must be appended to the codeword as needed
896
       * to match the next codeword's length.
897
       *
898
       * Since the codeword is bit-reversed, appending 0's is
899
       * a no-op.  However, incrementing it is nontrivial.  To
900
       * do so efficiently, use the 'bsr' instruction to find
901
       * the last (highest order) 0 bit in the codeword, set
902
       * it, and clear any later (higher order) 1 bits.  But
903
       * 'bsr' actually finds the highest order 1 bit, so to
904
       * use it first flip all bits in the codeword by XOR'ing
905
       * it with (1U << len) - 1 == cur_table_end - 1.
906
       */
907
1.08M
      bit = 1U << bsr32(codeword ^ (cur_table_end - 1));
908
1.08M
      codeword &= bit - 1;
909
1.08M
      codeword |= bit;
910
1.08M
    } while (--count);
911
912
    /* Advance to the next codeword length. */
913
16.4k
    do {
914
16.4k
      if (++len <= table_bits) {
915
16.2k
        memcpy(&decode_table[cur_table_end],
916
16.2k
               decode_table,
917
16.2k
               cur_table_end * sizeof(decode_table[0]));
918
16.2k
        cur_table_end <<= 1;
919
16.2k
      }
920
16.4k
    } while ((count = len_counts[len]) == 0);
921
15.3k
  }
922
923
  /* Process codewords with len > table_bits.  These require subtables. */
924
236
  cur_table_end = 1U << table_bits;
925
236
  subtable_prefix = -1;
926
236
  subtable_start = 0;
927
3.45k
  for (;;) {
928
3.45k
    u32 entry;
929
3.45k
    unsigned i;
930
3.45k
    unsigned stride;
931
3.45k
    unsigned bit;
932
933
    /*
934
     * Start a new subtable if the first 'table_bits' bits of the
935
     * codeword don't match the prefix of the current subtable.
936
     */
937
3.45k
    if ((codeword & ((1U << table_bits) - 1)) != subtable_prefix) {
938
1.28k
      subtable_prefix = (codeword & ((1U << table_bits) - 1));
939
1.28k
      subtable_start = cur_table_end;
940
      /*
941
       * Calculate the subtable length.  If the codeword has
942
       * length 'table_bits + n', then the subtable needs
943
       * '2^n' entries.  But it may need more; if fewer than
944
       * '2^n' codewords of length 'table_bits + n' remain,
945
       * then the length will need to be incremented to bring
946
       * in longer codewords until the subtable can be
947
       * completely filled.  Note that because the Huffman
948
       * code is complete, it will always be possible to fill
949
       * the subtable eventually.
950
       */
951
1.28k
      subtable_bits = len - table_bits;
952
1.28k
      codespace_used = count;
953
1.43k
      while (codespace_used < (1U << subtable_bits)) {
954
155
        subtable_bits++;
955
155
        codespace_used = (codespace_used << 1) +
956
155
          len_counts[table_bits + subtable_bits];
957
155
      }
958
1.28k
      cur_table_end = subtable_start + (1U << subtable_bits);
959
960
      /*
961
       * Create the entry that points from the main table to
962
       * the subtable.
963
       */
964
1.28k
      decode_table[subtable_prefix] =
965
1.28k
        ((u32)subtable_start << 16) |
966
1.28k
        HUFFDEC_EXCEPTIONAL |
967
1.28k
        HUFFDEC_SUBTABLE_POINTER |
968
1.28k
        (subtable_bits << 8) | table_bits;
969
1.28k
    }
970
971
    /* Fill the subtable entries for the current codeword. */
972
3.45k
    entry = make_decode_table_entry(decode_results, *sorted_syms++,
973
3.45k
            len - table_bits);
974
3.45k
    i = subtable_start + (codeword >> table_bits);
975
3.45k
    stride = 1U << (len - table_bits);
976
3.82k
    do {
977
3.82k
      decode_table[i] = entry;
978
3.82k
      i += stride;
979
3.82k
    } while (i < cur_table_end);
980
981
    /* Advance to the next codeword. */
982
3.45k
    if (codeword == (1U << len) - 1) /* last codeword (all 1's)? */
983
236
      return true;
984
3.21k
    bit = 1U << bsr32(codeword ^ ((1U << len) - 1));
985
3.21k
    codeword &= bit - 1;
986
3.21k
    codeword |= bit;
987
3.21k
    count--;
988
3.46k
    while (count == 0)
989
245
      count = len_counts[++len];
990
3.21k
  }
991
236
}
992
993
/* Build the decode table for the precode.  */
994
static bool
995
build_precode_decode_table(struct libdeflate_decompressor *d)
996
2.27k
{
997
  /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
998
2.27k
  STATIC_ASSERT(PRECODE_TABLEBITS == 7 && PRECODE_ENOUGH == 128);
999
1000
2.27k
  STATIC_ASSERT(ARRAY_LEN(precode_decode_results) ==
1001
2.27k
          DEFLATE_NUM_PRECODE_SYMS);
1002
1003
2.27k
  return build_decode_table(d->u.l.precode_decode_table,
1004
2.27k
          d->u.precode_lens,
1005
2.27k
          DEFLATE_NUM_PRECODE_SYMS,
1006
2.27k
          precode_decode_results,
1007
2.27k
          PRECODE_TABLEBITS,
1008
2.27k
          DEFLATE_MAX_PRE_CODEWORD_LEN,
1009
2.27k
          d->sorted_syms,
1010
2.27k
          NULL);
1011
2.27k
}
1012
1013
/* Build the decode table for the literal/length code.  */
1014
static bool
1015
build_litlen_decode_table(struct libdeflate_decompressor *d,
1016
        unsigned num_litlen_syms, unsigned num_offset_syms)
1017
4.73k
{
1018
  /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
1019
4.73k
  STATIC_ASSERT(LITLEN_TABLEBITS == 11 && LITLEN_ENOUGH == 2342);
1020
1021
4.73k
  STATIC_ASSERT(ARRAY_LEN(litlen_decode_results) ==
1022
4.73k
          DEFLATE_NUM_LITLEN_SYMS);
1023
1024
4.73k
  return build_decode_table(d->u.litlen_decode_table,
1025
4.73k
          d->u.l.lens,
1026
4.73k
          num_litlen_syms,
1027
4.73k
          litlen_decode_results,
1028
4.73k
          LITLEN_TABLEBITS,
1029
4.73k
          DEFLATE_MAX_LITLEN_CODEWORD_LEN,
1030
4.73k
          d->sorted_syms,
1031
4.73k
          &d->litlen_tablebits);
1032
4.73k
}
1033
1034
/* Build the decode table for the offset code.  */
1035
static bool
1036
build_offset_decode_table(struct libdeflate_decompressor *d,
1037
        unsigned num_litlen_syms, unsigned num_offset_syms)
1038
4.76k
{
1039
  /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
1040
4.76k
  STATIC_ASSERT(OFFSET_TABLEBITS == 8 && OFFSET_ENOUGH == 402);
1041
1042
4.76k
  STATIC_ASSERT(ARRAY_LEN(offset_decode_results) ==
1043
4.76k
          DEFLATE_NUM_OFFSET_SYMS);
1044
1045
4.76k
  return build_decode_table(d->offset_decode_table,
1046
4.76k
          d->u.l.lens + num_litlen_syms,
1047
4.76k
          num_offset_syms,
1048
4.76k
          offset_decode_results,
1049
4.76k
          OFFSET_TABLEBITS,
1050
4.76k
          DEFLATE_MAX_OFFSET_CODEWORD_LEN,
1051
4.76k
          d->sorted_syms,
1052
4.76k
          NULL);
1053
4.76k
}
1054
1055
/*****************************************************************************
1056
 *                         Main decompression routine
1057
 *****************************************************************************/
1058
1059
typedef enum libdeflate_result (*decompress_func_t)
1060
  (struct libdeflate_decompressor * restrict d,
1061
   const void * restrict in, size_t in_nbytes,
1062
   void * restrict out, size_t out_nbytes_avail,
1063
   size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret);
1064
1065
#define FUNCNAME deflate_decompress_default
1066
#undef ATTRIBUTES
1067
#undef EXTRACT_VARBITS
1068
#undef EXTRACT_VARBITS8
1069
#include "decompress_template.h"
1070
1071
/* Include architecture-specific implementation(s) if available. */
1072
#undef DEFAULT_IMPL
1073
#undef arch_select_decompress_func
1074
#if defined(ARCH_X86_32) || defined(ARCH_X86_64)
1075
#  include "x86/decompress_impl.h"
1076
#endif
1077
1078
#ifndef DEFAULT_IMPL
1079
0
#  define DEFAULT_IMPL deflate_decompress_default
1080
#endif
1081
1082
#ifdef arch_select_decompress_func
1083
static enum libdeflate_result
1084
dispatch_decomp(struct libdeflate_decompressor *d,
1085
    const void *in, size_t in_nbytes,
1086
    void *out, size_t out_nbytes_avail,
1087
    size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret);
1088
1089
static volatile decompress_func_t decompress_impl = dispatch_decomp;
1090
1091
/* Choose the best implementation at runtime. */
1092
static enum libdeflate_result
1093
dispatch_decomp(struct libdeflate_decompressor *d,
1094
    const void *in, size_t in_nbytes,
1095
    void *out, size_t out_nbytes_avail,
1096
    size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret)
1097
2
{
1098
2
  decompress_func_t f = arch_select_decompress_func();
1099
1100
2
  if (f == NULL)
1101
0
    f = DEFAULT_IMPL;
1102
1103
2
  decompress_impl = f;
1104
2
  return f(d, in, in_nbytes, out, out_nbytes_avail,
1105
2
     actual_in_nbytes_ret, actual_out_nbytes_ret);
1106
2
}
1107
#else
1108
/* The best implementation is statically known, so call it directly. */
1109
#  define decompress_impl DEFAULT_IMPL
1110
#endif
1111
1112
/*
1113
 * This is the main DEFLATE decompression routine.  See libdeflate.h for the
1114
 * documentation.
1115
 *
1116
 * Note that the real code is in decompress_template.h.  The part here just
1117
 * handles calling the appropriate implementation depending on the CPU features
1118
 * at runtime.
1119
 */
1120
LIBDEFLATEAPI enum libdeflate_result
1121
libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *d,
1122
         const void *in, size_t in_nbytes,
1123
         void *out, size_t out_nbytes_avail,
1124
         size_t *actual_in_nbytes_ret,
1125
         size_t *actual_out_nbytes_ret)
1126
1.24k
{
1127
1.24k
  return decompress_impl(d, in, in_nbytes, out, out_nbytes_avail,
1128
1.24k
             actual_in_nbytes_ret, actual_out_nbytes_ret);
1129
1.24k
}
1130
1131
LIBDEFLATEAPI enum libdeflate_result
1132
libdeflate_deflate_decompress(struct libdeflate_decompressor *d,
1133
            const void *in, size_t in_nbytes,
1134
            void *out, size_t out_nbytes_avail,
1135
            size_t *actual_out_nbytes_ret)
1136
0
{
1137
0
  return libdeflate_deflate_decompress_ex(d, in, in_nbytes,
1138
0
            out, out_nbytes_avail,
1139
0
            NULL, actual_out_nbytes_ret);
1140
0
}
1141
1142
LIBDEFLATEAPI struct libdeflate_decompressor *
1143
libdeflate_alloc_decompressor(void)
1144
1.51k
{
1145
  /*
1146
   * Note that only certain parts of the decompressor actually must be
1147
   * initialized here:
1148
   *
1149
   * - 'static_codes_loaded' must be initialized to false.
1150
   *
1151
   * - The first half of the main portion of each decode table must be
1152
   *   initialized to any value, to avoid reading from uninitialized
1153
   *   memory during table expansion in build_decode_table().  (Although,
1154
   *   this is really just to avoid warnings with dynamic tools like
1155
   *   valgrind, since build_decode_table() is guaranteed to initialize
1156
   *   all entries eventually anyway.)
1157
   *
1158
   * But for simplicity, we currently just zero the whole decompressor.
1159
   */
1160
1.51k
  struct libdeflate_decompressor *d = libdeflate_malloc(sizeof(*d));
1161
1162
1.51k
  if (d == NULL)
1163
0
    return NULL;
1164
1.51k
  memset(d, 0, sizeof(*d));
1165
1.51k
  return d;
1166
1.51k
}
1167
1168
LIBDEFLATEAPI void
1169
libdeflate_free_decompressor(struct libdeflate_decompressor *d)
1170
1.51k
{
1171
1.51k
  libdeflate_free(d);
1172
1.51k
}
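For reference, a minimal caller of the public entry points defined above might look like the following. This is an illustrative sketch; the buffers and sizes are hypothetical, and a real caller must know or bound the decompressed size in advance, since only full-buffer decompression is supported:

#include <stddef.h>
#include <stdio.h>
#include "libdeflate.h"

int decompress_example(const void *in, size_t in_nbytes,
                       void *out, size_t out_nbytes_avail)
{
  struct libdeflate_decompressor *d = libdeflate_alloc_decompressor();
  size_t actual_out_nbytes;
  enum libdeflate_result res;

  if (d == NULL)
    return -1;
  res = libdeflate_deflate_decompress(d, in, in_nbytes,
                                      out, out_nbytes_avail,
                                      &actual_out_nbytes);
  libdeflate_free_decompressor(d);
  if (res != LIBDEFLATE_SUCCESS)
    return -1;
  printf("decompressed %zu bytes\n", actual_out_nbytes);
  return 0;
}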