Coverage Report

Created: 2025-06-16 07:00

/src/libdeflate/lib/decompress_template.h
Line
Count
Source
1
/*
2
 * decompress_template.h
3
 *
4
 * Copyright 2016 Eric Biggers
5
 *
6
 * Permission is hereby granted, free of charge, to any person
7
 * obtaining a copy of this software and associated documentation
8
 * files (the "Software"), to deal in the Software without
9
 * restriction, including without limitation the rights to use,
10
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
11
 * copies of the Software, and to permit persons to whom the
12
 * Software is furnished to do so, subject to the following
13
 * conditions:
14
 *
15
 * The above copyright notice and this permission notice shall be
16
 * included in all copies or substantial portions of the Software.
17
 *
18
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
20
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25
 * OTHER DEALINGS IN THE SOFTWARE.
26
 */
27
28
/*
29
 * This is the actual DEFLATE decompression routine, lifted out of
30
 * deflate_decompress.c so that it can be compiled multiple times with different
31
 * target instruction sets.
32
 */
33
34
#ifndef ATTRIBUTES
35
#  define ATTRIBUTES
36
#endif
37
#ifndef EXTRACT_VARBITS
38
8.68k
#  define EXTRACT_VARBITS(word, count)  ((word) & BITMASK(count))
39
#endif
40
#ifndef EXTRACT_VARBITS8
41
306k
#  define EXTRACT_VARBITS8(word, count) ((word) & BITMASK((u8)(count)))
42
#endif
43
44
static ATTRIBUTES MAYBE_UNUSED enum libdeflate_result
45
FUNCNAME(struct libdeflate_decompressor * restrict d,
46
   const void * restrict in, size_t in_nbytes,
47
   void * restrict out, size_t out_nbytes_avail,
48
   size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret)
49
6.30k
{
50
6.30k
  u8 *out_next = out;
51
6.30k
  u8 * const out_end = out_next + out_nbytes_avail;
52
6.30k
  u8 * const out_fastloop_end =
53
6.30k
    out_end - MIN(out_nbytes_avail, FASTLOOP_MAX_BYTES_WRITTEN);
54
55
  /* Input bitstream state; see deflate_decompress.c for documentation */
56
6.30k
  const u8 *in_next = in;
57
6.30k
  const u8 * const in_end = in_next + in_nbytes;
58
6.30k
  const u8 * const in_fastloop_end =
59
6.30k
    in_end - MIN(in_nbytes, FASTLOOP_MAX_BYTES_READ);
60
6.30k
  bitbuf_t bitbuf = 0;
61
6.30k
  bitbuf_t saved_bitbuf;
62
6.30k
  u32 bitsleft = 0;
63
6.30k
  size_t overread_count = 0;
64
65
6.30k
  bool is_final_block;
66
6.30k
  unsigned block_type;
67
6.30k
  unsigned num_litlen_syms;
68
6.30k
  unsigned num_offset_syms;
69
6.30k
  bitbuf_t litlen_tablemask;
70
6.30k
  u32 entry;
71
72
10.4k
next_block:
73
  /* Starting to read the next block */
74
10.4k
  ;
75
76
10.4k
  STATIC_ASSERT(CAN_CONSUME(1 + 2 + 5 + 5 + 4 + 3));
77
10.4k
  REFILL_BITS();
78
79
  /* BFINAL: 1 bit */
80
10.3k
  is_final_block = bitbuf & BITMASK(1);
81
82
  /* BTYPE: 2 bits */
83
10.3k
  block_type = (bitbuf >> 1) & BITMASK(2);
84
85
10.3k
  if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) {
86
87
    /* Dynamic Huffman block */
88
89
    /* The order in which precode lengths are stored */
90
4.78k
    static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
91
4.78k
      16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
92
4.78k
    };
93
94
4.78k
    unsigned num_explicit_precode_lens;
95
4.78k
    unsigned i;
96
97
    /* Read the codeword length counts. */
98
99
4.78k
    STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 257 + BITMASK(5));
100
4.78k
    num_litlen_syms = 257 + ((bitbuf >> 3) & BITMASK(5));
101
102
4.78k
    STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 1 + BITMASK(5));
103
4.78k
    num_offset_syms = 1 + ((bitbuf >> 8) & BITMASK(5));
104
105
4.78k
    STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == 4 + BITMASK(4));
106
4.78k
    num_explicit_precode_lens = 4 + ((bitbuf >> 13) & BITMASK(4));
107
108
4.78k
    d->static_codes_loaded = false;
109
110
    /*
111
     * Read the precode codeword lengths.
112
     *
113
     * A 64-bit bitbuffer is just one bit too small to hold the
114
     * maximum number of precode lens, so to minimize branches we
115
     * merge one len with the previous fields.
116
     */
117
4.78k
    STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1);
118
4.78k
    if (CAN_CONSUME(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) {
119
4.78k
      d->u.precode_lens[deflate_precode_lens_permutation[0]] =
120
4.78k
        (bitbuf >> 17) & BITMASK(3);
121
4.78k
      bitbuf >>= 20;
122
4.78k
      bitsleft -= 20;
123
4.78k
      REFILL_BITS();
124
4.76k
      i = 1;
125
60.7k
      do {
126
60.7k
        d->u.precode_lens[deflate_precode_lens_permutation[i]] =
127
60.7k
          bitbuf & BITMASK(3);
128
60.7k
        bitbuf >>= 3;
129
60.7k
        bitsleft -= 3;
130
60.7k
      } while (++i < num_explicit_precode_lens);
131
4.76k
    } else {
132
0
      bitbuf >>= 17;
133
0
      bitsleft -= 17;
134
0
      i = 0;
135
0
      do {
136
0
        if ((u8)bitsleft < 3)
137
0
          REFILL_BITS();
138
0
        d->u.precode_lens[deflate_precode_lens_permutation[i]] =
139
0
          bitbuf & BITMASK(3);
140
0
        bitbuf >>= 3;
141
0
        bitsleft -= 3;
142
0
      } while (++i < num_explicit_precode_lens);
143
0
    }
144
29.8k
    for (; i < DEFLATE_NUM_PRECODE_SYMS; i++)
145
25.0k
      d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0;
146
147
    /* Build the decode table for the precode. */
148
4.76k
    SAFETY_CHECK(build_precode_decode_table(d));
149
150
    /* Decode the litlen and offset codeword lengths. */
151
4.61k
    i = 0;
152
401k
    do {
153
401k
      unsigned presym;
154
401k
      u8 rep_val;
155
401k
      unsigned rep_count;
156
157
401k
      if ((u8)bitsleft < DEFLATE_MAX_PRE_CODEWORD_LEN + 7)
158
15.9k
        REFILL_BITS();
159
160
      /*
161
       * The code below assumes that the precode decode table
162
       * doesn't have any subtables.
163
       */
164
401k
      STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN);
165
166
      /* Decode the next precode symbol. */
167
401k
      entry = d->u.l.precode_decode_table[
168
401k
        bitbuf & BITMASK(DEFLATE_MAX_PRE_CODEWORD_LEN)];
169
401k
      bitbuf >>= (u8)entry;
170
401k
      bitsleft -= entry; /* optimization: subtract full entry */
171
401k
      presym = entry >> 16;
172
173
401k
      if (presym < 16) {
174
        /* Explicit codeword length */
175
376k
        d->u.l.lens[i++] = presym;
176
376k
        continue;
177
376k
      }
178
179
      /* Run-length encoded codeword lengths */
180
181
      /*
182
       * Note: we don't need to immediately verify that the
183
       * repeat count doesn't overflow the number of elements,
184
       * since we've sized the lens array to have enough extra
185
       * space to allow for the worst-case overrun (138 zeroes
186
       * when only 1 length was remaining).
187
       *
188
       * In the case of the small repeat counts (presyms 16
189
       * and 17), it is fastest to always write the maximum
190
       * number of entries.  That gets rid of branches that
191
       * would otherwise be required.
192
       *
193
       * It is not just because of the numerical order that
194
       * our checks go in the order 'presym < 16', 'presym ==
195
       * 16', and 'presym == 17'.  For typical data this is
196
       * ordered from most frequent to least frequent case.
197
       */
198
24.9k
      STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1);
199
200
24.9k
      if (presym == 16) {
201
        /* Repeat the previous length 3 - 6 times. */
202
4.71k
        SAFETY_CHECK(i != 0);
203
4.69k
        rep_val = d->u.l.lens[i - 1];
204
4.69k
        STATIC_ASSERT(3 + BITMASK(2) == 6);
205
4.69k
        rep_count = 3 + (bitbuf & BITMASK(2));
206
4.69k
        bitbuf >>= 2;
207
4.69k
        bitsleft -= 2;
208
4.69k
        d->u.l.lens[i + 0] = rep_val;
209
4.69k
        d->u.l.lens[i + 1] = rep_val;
210
4.69k
        d->u.l.lens[i + 2] = rep_val;
211
4.69k
        d->u.l.lens[i + 3] = rep_val;
212
4.69k
        d->u.l.lens[i + 4] = rep_val;
213
4.69k
        d->u.l.lens[i + 5] = rep_val;
214
4.69k
        i += rep_count;
215
20.1k
      } else if (presym == 17) {
216
        /* Repeat zero 3 - 10 times. */
217
8.09k
        STATIC_ASSERT(3 + BITMASK(3) == 10);
218
8.09k
        rep_count = 3 + (bitbuf & BITMASK(3));
219
8.09k
        bitbuf >>= 3;
220
8.09k
        bitsleft -= 3;
221
8.09k
        d->u.l.lens[i + 0] = 0;
222
8.09k
        d->u.l.lens[i + 1] = 0;
223
8.09k
        d->u.l.lens[i + 2] = 0;
224
8.09k
        d->u.l.lens[i + 3] = 0;
225
8.09k
        d->u.l.lens[i + 4] = 0;
226
8.09k
        d->u.l.lens[i + 5] = 0;
227
8.09k
        d->u.l.lens[i + 6] = 0;
228
8.09k
        d->u.l.lens[i + 7] = 0;
229
8.09k
        d->u.l.lens[i + 8] = 0;
230
8.09k
        d->u.l.lens[i + 9] = 0;
231
8.09k
        i += rep_count;
232
12.1k
      } else {
233
        /* Repeat zero 11 - 138 times. */
234
12.1k
        STATIC_ASSERT(11 + BITMASK(7) == 138);
235
12.1k
        rep_count = 11 + (bitbuf & BITMASK(7));
236
12.1k
        bitbuf >>= 7;
237
12.1k
        bitsleft -= 7;
238
12.1k
        memset(&d->u.l.lens[i], 0,
239
12.1k
               rep_count * sizeof(d->u.l.lens[i]));
240
12.1k
        i += rep_count;
241
12.1k
      }
242
401k
    } while (i < num_litlen_syms + num_offset_syms);
243
244
    /* Unnecessary, but check this for consistency with zlib. */
245
4.40k
    SAFETY_CHECK(i == num_litlen_syms + num_offset_syms);
246
247
5.57k
  } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
248
1.16k
    u16 len, nlen;
249
250
    /*
251
     * Uncompressed block: copy 'len' bytes literally from the input
252
     * buffer to the output buffer.
253
     */
254
255
1.16k
    bitsleft -= 3; /* for BTYPE and BFINAL */
256
257
    /*
258
     * Align the bitstream to the next byte boundary.  This means
259
     * the next byte boundary as if we were reading a byte at a
260
     * time.  Therefore, we have to rewind 'in_next' by any bytes
261
     * that have been refilled but not actually consumed yet (not
262
     * counting overread bytes, which don't increment 'in_next').
263
     */
264
1.16k
    bitsleft = (u8)bitsleft;
265
1.16k
    SAFETY_CHECK(overread_count <= (bitsleft >> 3));
266
1.09k
    in_next -= (bitsleft >> 3) - overread_count;
267
1.09k
    overread_count = 0;
268
1.09k
    bitbuf = 0;
269
1.09k
    bitsleft = 0;
270
271
1.09k
    SAFETY_CHECK(in_end - in_next >= 4);
272
1.05k
    len = get_unaligned_le16(in_next);
273
1.05k
    nlen = get_unaligned_le16(in_next + 2);
274
1.05k
    in_next += 4;
275
276
1.05k
    SAFETY_CHECK(len == (u16)~nlen);
277
821
    if (unlikely(len > out_end - out_next))
278
364
      return LIBDEFLATE_INSUFFICIENT_SPACE;
279
457
    SAFETY_CHECK(len <= in_end - in_next);
280
281
442
    memcpy(out_next, in_next, len);
282
442
    in_next += len;
283
442
    out_next += len;
284
285
442
    goto block_done;
286
287
4.40k
  } else {
288
4.40k
    unsigned i;
289
290
4.40k
    SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN);
291
292
    /*
293
     * Static Huffman block: build the decode tables for the static
294
     * codes.  Skip doing so if the tables are already set up from
295
     * an earlier static block; this speeds up decompression of
296
     * degenerate input of many empty or very short static blocks.
297
     *
298
     * Afterwards, the remainder is the same as decompressing a
299
     * dynamic Huffman block.
300
     */
301
302
4.36k
    bitbuf >>= 3; /* for BTYPE and BFINAL */
303
4.36k
    bitsleft -= 3;
304
305
4.36k
    if (d->static_codes_loaded)
306
2.43k
      goto have_decode_tables;
307
308
1.93k
    d->static_codes_loaded = true;
309
310
1.93k
    STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288);
311
1.93k
    STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32);
312
313
281k
    for (i = 0; i < 144; i++)
314
279k
      d->u.l.lens[i] = 8;
315
218k
    for (; i < 256; i++)
316
217k
      d->u.l.lens[i] = 9;
317
48.4k
    for (; i < 280; i++)
318
46.5k
      d->u.l.lens[i] = 7;
319
17.4k
    for (; i < 288; i++)
320
15.5k
      d->u.l.lens[i] = 8;
321
322
63.9k
    for (; i < 288 + 32; i++)
323
62.0k
      d->u.l.lens[i] = 5;
324
325
1.93k
    num_litlen_syms = 288;
326
1.93k
    num_offset_syms = 32;
327
1.93k
  }
328
329
  /* Decompressing a Huffman block (either dynamic or static) */
330
331
6.26k
  SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms));
332
6.18k
  SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms));
333
8.55k
have_decode_tables:
334
8.55k
  litlen_tablemask = BITMASK(d->litlen_tablebits);
335
336
  /*
337
   * This is the "fastloop" for decoding literals and matches.  It does
338
   * bounds checks on in_next and out_next in the loop conditions so that
339
   * additional bounds checks aren't needed inside the loop body.
340
   *
341
   * To reduce latency, the bitbuffer is refilled and the next litlen
342
   * decode table entry is preloaded before each loop iteration.
343
   */
344
8.55k
  if (in_next >= in_fastloop_end || out_next >= out_fastloop_end)
345
5.32k
    goto generic_loop;
346
3.23k
  REFILL_BITS_IN_FASTLOOP();
347
3.23k
  entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
348
162k
  do {
349
162k
    u32 length, offset, lit;
350
162k
    const u8 *src;
351
162k
    u8 *dst;
352
353
    /*
354
     * Consume the bits for the litlen decode table entry.  Save the
355
     * original bitbuf for later, in case the extra match length
356
     * bits need to be extracted from it.
357
     */
358
162k
    saved_bitbuf = bitbuf;
359
162k
    bitbuf >>= (u8)entry;
360
162k
    bitsleft -= entry; /* optimization: subtract full entry */
361
362
    /*
363
     * Begin by checking for a "fast" literal, i.e. a literal that
364
     * doesn't need a subtable.
365
     */
366
162k
    if (entry & HUFFDEC_LITERAL) {
367
      /*
368
       * On 64-bit platforms, we decode up to 2 extra fast
369
       * literals in addition to the primary item, as this
370
       * increases performance and still leaves enough bits
371
       * remaining for what follows.  We could actually do 3,
372
       * assuming LITLEN_TABLEBITS=11, but that actually
373
       * decreases performance slightly (perhaps by messing
374
       * with the branch prediction of the conditional refill
375
       * that happens later while decoding the match offset).
376
       *
377
       * Note: the definitions of FASTLOOP_MAX_BYTES_WRITTEN
378
       * and FASTLOOP_MAX_BYTES_READ need to be updated if the
379
       * number of extra literals decoded here is changed.
380
       */
381
89.2k
      if (/* enough bits for 2 fast literals + length + offset preload? */
382
89.2k
          CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS +
383
89.2k
               LENGTH_MAXBITS,
384
89.2k
               OFFSET_TABLEBITS) &&
385
          /* enough bits for 2 fast literals + slow literal + litlen preload? */
386
89.2k
          CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS +
387
89.2k
               DEFLATE_MAX_LITLEN_CODEWORD_LEN,
388
89.2k
               LITLEN_TABLEBITS)) {
389
        /* 1st extra fast literal */
390
89.2k
        lit = entry >> 16;
391
89.2k
        entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
392
89.2k
        saved_bitbuf = bitbuf;
393
89.2k
        bitbuf >>= (u8)entry;
394
89.2k
        bitsleft -= entry;
395
89.2k
        *out_next++ = lit;
396
89.2k
        if (entry & HUFFDEC_LITERAL) {
397
          /* 2nd extra fast literal */
398
81.4k
          lit = entry >> 16;
399
81.4k
          entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
400
81.4k
          saved_bitbuf = bitbuf;
401
81.4k
          bitbuf >>= (u8)entry;
402
81.4k
          bitsleft -= entry;
403
81.4k
          *out_next++ = lit;
404
81.4k
          if (entry & HUFFDEC_LITERAL) {
405
            /*
406
             * Another fast literal, but
407
             * this one is in lieu of the
408
             * primary item, so it doesn't
409
             * count as one of the extras.
410
             */
411
76.0k
            lit = entry >> 16;
412
76.0k
            entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
413
76.0k
            REFILL_BITS_IN_FASTLOOP();
414
76.0k
            *out_next++ = lit;
415
76.0k
            continue;
416
76.0k
          }
417
81.4k
        }
418
89.2k
      } else {
419
        /*
420
         * Decode a literal.  While doing so, preload
421
         * the next litlen decode table entry and refill
422
         * the bitbuffer.  To reduce latency, we've
423
         * arranged for there to be enough "preloadable"
424
         * bits remaining to do the table preload
425
         * independently of the refill.
426
         */
427
0
        STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(
428
0
            LITLEN_TABLEBITS, LITLEN_TABLEBITS));
429
0
        lit = entry >> 16;
430
0
        entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
431
0
        REFILL_BITS_IN_FASTLOOP();
432
0
        *out_next++ = lit;
433
0
        continue;
434
0
      }
435
89.2k
    }
436
437
    /*
438
     * It's not a literal entry, so it can be a length entry, a
439
     * subtable pointer entry, or an end-of-block entry.  Detect the
440
     * two unlikely cases by testing the HUFFDEC_EXCEPTIONAL flag.
441
     */
442
86.6k
    if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
443
      /* Subtable pointer or end-of-block entry */
444
445
6.52k
      if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
446
1.74k
        goto block_done;
447
448
      /*
449
       * A subtable is required.  Load and consume the
450
       * subtable entry.  The subtable entry can be of any
451
       * type: literal, length, or end-of-block.
452
       */
453
4.78k
      entry = d->u.litlen_decode_table[(entry >> 16) +
454
4.78k
        EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
455
4.78k
      saved_bitbuf = bitbuf;
456
4.78k
      bitbuf >>= (u8)entry;
457
4.78k
      bitsleft -= entry;
458
459
      /*
460
       * 32-bit platforms that use the byte-at-a-time refill
461
       * method have to do a refill here for there to always
462
       * be enough bits to decode a literal that requires a
463
       * subtable, then preload the next litlen decode table
464
       * entry; or to decode a match length that requires a
465
       * subtable, then preload the offset decode table entry.
466
       */
467
4.78k
      if (!CAN_CONSUME_AND_THEN_PRELOAD(DEFLATE_MAX_LITLEN_CODEWORD_LEN,
468
4.78k
                LITLEN_TABLEBITS) ||
469
4.78k
          !CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXBITS,
470
4.78k
                OFFSET_TABLEBITS))
471
0
        REFILL_BITS_IN_FASTLOOP();
472
4.78k
      if (entry & HUFFDEC_LITERAL) {
473
        /* Decode a literal that required a subtable. */
474
1.24k
        lit = entry >> 16;
475
1.24k
        entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
476
1.24k
        REFILL_BITS_IN_FASTLOOP();
477
1.24k
        *out_next++ = lit;
478
1.24k
        continue;
479
1.24k
      }
480
3.54k
      if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
481
61
        goto block_done;
482
      /* Else, it's a length that required a subtable. */
483
3.54k
    }
484
485
    /*
486
     * Decode the match length: the length base value associated
487
     * with the litlen symbol (which we extract from the decode
488
     * table entry), plus the extra length bits.  We don't need to
489
     * consume the extra length bits here, as they were included in
490
     * the bits consumed by the entry earlier.  We also don't need
491
     * to check for too-long matches here, as this is inside the
492
     * fastloop where it's already been verified that the output
493
     * buffer has enough space remaining to copy a max-length match.
494
     */
495
83.6k
    length = entry >> 16;
496
83.6k
    length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);
497
498
    /*
499
     * Decode the match offset.  There are enough "preloadable" bits
500
     * remaining to preload the offset decode table entry, but a
501
     * refill might be needed before consuming it.
502
     */
503
83.6k
    STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXFASTBITS,
504
83.6k
                 OFFSET_TABLEBITS));
505
83.6k
    entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)];
506
83.6k
    if (CAN_CONSUME_AND_THEN_PRELOAD(OFFSET_MAXBITS,
507
83.6k
             LITLEN_TABLEBITS)) {
508
      /*
509
       * Decoding a match offset on a 64-bit platform.  We may
510
       * need to refill once, but then we can decode the whole
511
       * offset and preload the next litlen table entry.
512
       */
513
83.6k
      if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
514
        /* Offset codeword requires a subtable */
515
3.00k
        if (unlikely((u8)bitsleft < OFFSET_MAXBITS +
516
3.00k
               LITLEN_TABLEBITS - PRELOAD_SLACK))
517
214
          REFILL_BITS_IN_FASTLOOP();
518
3.00k
        bitbuf >>= OFFSET_TABLEBITS;
519
3.00k
        bitsleft -= OFFSET_TABLEBITS;
520
3.00k
        entry = d->offset_decode_table[(entry >> 16) +
521
3.00k
          EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
522
80.6k
      } else if (unlikely((u8)bitsleft < OFFSET_MAXFASTBITS +
523
80.6k
              LITLEN_TABLEBITS - PRELOAD_SLACK))
524
243
        REFILL_BITS_IN_FASTLOOP();
525
83.6k
    } else {
526
      /* Decoding a match offset on a 32-bit platform */
527
0
      REFILL_BITS_IN_FASTLOOP();
528
0
      if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
529
        /* Offset codeword requires a subtable */
530
0
        bitbuf >>= OFFSET_TABLEBITS;
531
0
        bitsleft -= OFFSET_TABLEBITS;
532
0
        entry = d->offset_decode_table[(entry >> 16) +
533
0
          EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
534
0
        REFILL_BITS_IN_FASTLOOP();
535
        /* No further refill needed before extra bits */
536
0
        STATIC_ASSERT(CAN_CONSUME(
537
0
          OFFSET_MAXBITS - OFFSET_TABLEBITS));
538
0
      } else {
539
        /* No refill needed before extra bits */
540
0
        STATIC_ASSERT(CAN_CONSUME(OFFSET_MAXFASTBITS));
541
0
      }
542
0
    }
543
83.6k
    saved_bitbuf = bitbuf;
544
83.6k
    bitbuf >>= (u8)entry;
545
83.6k
    bitsleft -= entry; /* optimization: subtract full entry */
546
83.6k
    offset = entry >> 16;
547
83.6k
    offset += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);
548
549
    /* Validate the match offset; needed even in the fastloop. */
550
83.6k
    SAFETY_CHECK(offset <= out_next - (const u8 *)out);
551
83.5k
    src = out_next - offset;
552
83.5k
    dst = out_next;
553
83.5k
    out_next += length;
554
555
    /*
556
     * Before starting to issue the instructions to copy the match,
557
     * refill the bitbuffer and preload the litlen decode table
558
     * entry for the next loop iteration.  This can increase
559
     * performance by allowing the latency of the match copy to
560
     * overlap with these other operations.  To further reduce
561
     * latency, we've arranged for there to be enough bits remaining
562
     * to do the table preload independently of the refill, except
563
     * on 32-bit platforms using the byte-at-a-time refill method.
564
     */
565
83.5k
    if (!CAN_CONSUME_AND_THEN_PRELOAD(
566
83.5k
      MAX(OFFSET_MAXBITS - OFFSET_TABLEBITS,
567
83.5k
          OFFSET_MAXFASTBITS),
568
83.5k
      LITLEN_TABLEBITS) &&
569
83.5k
        unlikely((u8)bitsleft < LITLEN_TABLEBITS - PRELOAD_SLACK))
570
0
      REFILL_BITS_IN_FASTLOOP();
571
83.5k
    entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
572
83.5k
    REFILL_BITS_IN_FASTLOOP();
573
574
    /*
575
     * Copy the match.  On most CPUs the fastest method is a
576
     * word-at-a-time copy, unconditionally copying about 5 words
577
     * since this is enough for most matches without being too much.
578
     *
579
     * The normal word-at-a-time copy works for offset >= WORDBYTES,
580
     * which is most cases.  The case of offset == 1 is also common
581
     * and is worth optimizing for, since it is just RLE encoding of
582
     * the previous byte, which is the result of compressing long
583
     * runs of the same byte.
584
     *
585
     * Writing past the match 'length' is allowed here, since it's
586
     * been ensured there is enough output space left for a slight
587
     * overrun.  FASTLOOP_MAX_BYTES_WRITTEN needs to be updated if
588
     * the maximum possible overrun here is changed.
589
     */
590
83.5k
    if (UNALIGNED_ACCESS_IS_FAST && offset >= WORDBYTES) {
591
17.8k
      store_word_unaligned(load_word_unaligned(src), dst);
592
17.8k
      src += WORDBYTES;
593
17.8k
      dst += WORDBYTES;
594
17.8k
      store_word_unaligned(load_word_unaligned(src), dst);
595
17.8k
      src += WORDBYTES;
596
17.8k
      dst += WORDBYTES;
597
17.8k
      store_word_unaligned(load_word_unaligned(src), dst);
598
17.8k
      src += WORDBYTES;
599
17.8k
      dst += WORDBYTES;
600
17.8k
      store_word_unaligned(load_word_unaligned(src), dst);
601
17.8k
      src += WORDBYTES;
602
17.8k
      dst += WORDBYTES;
603
17.8k
      store_word_unaligned(load_word_unaligned(src), dst);
604
17.8k
      src += WORDBYTES;
605
17.8k
      dst += WORDBYTES;
606
28.2k
      while (dst < out_next) {
607
10.3k
        store_word_unaligned(load_word_unaligned(src), dst);
608
10.3k
        src += WORDBYTES;
609
10.3k
        dst += WORDBYTES;
610
10.3k
        store_word_unaligned(load_word_unaligned(src), dst);
611
10.3k
        src += WORDBYTES;
612
10.3k
        dst += WORDBYTES;
613
10.3k
        store_word_unaligned(load_word_unaligned(src), dst);
614
10.3k
        src += WORDBYTES;
615
10.3k
        dst += WORDBYTES;
616
10.3k
        store_word_unaligned(load_word_unaligned(src), dst);
617
10.3k
        src += WORDBYTES;
618
10.3k
        dst += WORDBYTES;
619
10.3k
        store_word_unaligned(load_word_unaligned(src), dst);
620
10.3k
        src += WORDBYTES;
621
10.3k
        dst += WORDBYTES;
622
10.3k
      }
623
65.6k
    } else if (UNALIGNED_ACCESS_IS_FAST && offset == 1) {
624
22.1k
      machine_word_t v;
625
626
      /*
627
       * This part tends to get auto-vectorized, so keep it
628
       * copying a multiple of 16 bytes at a time.
629
       */
630
22.1k
      v = (machine_word_t)0x0101010101010101 * src[0];
631
22.1k
      store_word_unaligned(v, dst);
632
22.1k
      dst += WORDBYTES;
633
22.1k
      store_word_unaligned(v, dst);
634
22.1k
      dst += WORDBYTES;
635
22.1k
      store_word_unaligned(v, dst);
636
22.1k
      dst += WORDBYTES;
637
22.1k
      store_word_unaligned(v, dst);
638
22.1k
      dst += WORDBYTES;
639
150k
      while (dst < out_next) {
640
127k
        store_word_unaligned(v, dst);
641
127k
        dst += WORDBYTES;
642
127k
        store_word_unaligned(v, dst);
643
127k
        dst += WORDBYTES;
644
127k
        store_word_unaligned(v, dst);
645
127k
        dst += WORDBYTES;
646
127k
        store_word_unaligned(v, dst);
647
127k
        dst += WORDBYTES;
648
127k
      }
649
43.4k
    } else if (UNALIGNED_ACCESS_IS_FAST) {
650
43.4k
      store_word_unaligned(load_word_unaligned(src), dst);
651
43.4k
      src += offset;
652
43.4k
      dst += offset;
653
43.4k
      store_word_unaligned(load_word_unaligned(src), dst);
654
43.4k
      src += offset;
655
43.4k
      dst += offset;
656
1.80M
      do {
657
1.80M
        store_word_unaligned(load_word_unaligned(src), dst);
658
1.80M
        src += offset;
659
1.80M
        dst += offset;
660
1.80M
        store_word_unaligned(load_word_unaligned(src), dst);
661
1.80M
        src += offset;
662
1.80M
        dst += offset;
663
1.80M
      } while (dst < out_next);
664
43.4k
    } else {
665
0
      *dst++ = *src++;
666
0
      *dst++ = *src++;
667
0
      do {
668
0
        *dst++ = *src++;
669
0
      } while (dst < out_next);
670
0
    }
671
160k
  } while (in_next < in_fastloop_end && out_next < out_fastloop_end);
672
673
  /*
674
   * This is the generic loop for decoding literals and matches.  This
675
   * handles cases where in_next and out_next are close to the end of
676
   * their respective buffers.  Usually this loop isn't performance-
677
   * critical, as most time is spent in the fastloop above instead.  We
678
   * therefore omit some optimizations here in favor of smaller code.
679
   */
680
6.67k
generic_loop:
681
224k
  for (;;) {
682
224k
    u32 length, offset;
683
224k
    const u8 *src;
684
224k
    u8 *dst;
685
686
224k
    REFILL_BITS();
687
223k
    entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
688
223k
    saved_bitbuf = bitbuf;
689
223k
    bitbuf >>= (u8)entry;
690
223k
    bitsleft -= entry;
691
223k
    if (unlikely(entry & HUFFDEC_SUBTABLE_POINTER)) {
692
531
      entry = d->u.litlen_decode_table[(entry >> 16) +
693
531
          EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
694
531
      saved_bitbuf = bitbuf;
695
531
      bitbuf >>= (u8)entry;
696
531
      bitsleft -= entry;
697
531
    }
698
223k
    length = entry >> 16;
699
223k
    if (entry & HUFFDEC_LITERAL) {
700
150k
      if (unlikely(out_next == out_end))
701
1.23k
        return LIBDEFLATE_INSUFFICIENT_SPACE;
702
148k
      *out_next++ = length;
703
148k
      continue;
704
150k
    }
705
73.5k
    if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
706
3.08k
      goto block_done;
707
70.5k
    length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);
708
70.5k
    if (unlikely(length > out_end - out_next))
709
1.87k
      return LIBDEFLATE_INSUFFICIENT_SPACE;
710
711
68.6k
    if (!CAN_CONSUME(LENGTH_MAXBITS + OFFSET_MAXBITS))
712
0
      REFILL_BITS();
713
68.6k
    entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)];
714
68.6k
    if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
715
366
      bitbuf >>= OFFSET_TABLEBITS;
716
366
      bitsleft -= OFFSET_TABLEBITS;
717
366
      entry = d->offset_decode_table[(entry >> 16) +
718
366
          EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
719
366
      if (!CAN_CONSUME(OFFSET_MAXBITS))
720
0
        REFILL_BITS();
721
366
    }
722
68.6k
    offset = entry >> 16;
723
68.6k
    offset += EXTRACT_VARBITS8(bitbuf, entry) >> (u8)(entry >> 8);
724
68.6k
    bitbuf >>= (u8)entry;
725
68.6k
    bitsleft -= entry;
726
727
68.6k
    SAFETY_CHECK(offset <= out_next - (const u8 *)out);
728
68.4k
    src = out_next - offset;
729
68.4k
    dst = out_next;
730
68.4k
    out_next += length;
731
732
68.4k
    STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3);
733
68.4k
    *dst++ = *src++;
734
68.4k
    *dst++ = *src++;
735
16.0M
    do {
736
16.0M
      *dst++ = *src++;
737
16.0M
    } while (dst < out_next);
738
68.4k
  }
739
740
5.33k
block_done:
741
  /* Finished decoding a block */
742
743
5.33k
  if (!is_final_block)
744
4.10k
    goto next_block;
745
746
  /* That was the last block. */
747
748
1.22k
  bitsleft = (u8)bitsleft;
749
750
  /*
751
   * If any of the implicit appended zero bytes were consumed (not just
752
   * refilled) before hitting end of stream, then the data is bad.
753
   */
754
1.22k
  SAFETY_CHECK(overread_count <= (bitsleft >> 3));
755
756
  /* Optionally return the actual number of bytes consumed. */
757
1.17k
  if (actual_in_nbytes_ret) {
758
    /* Don't count bytes that were refilled but not consumed. */
759
1.17k
    in_next -= (bitsleft >> 3) - overread_count;
760
761
1.17k
    *actual_in_nbytes_ret = in_next - (u8 *)in;
762
1.17k
  }
763
764
  /* Optionally return the actual number of bytes written. */
765
1.17k
  if (actual_out_nbytes_ret) {
766
0
    *actual_out_nbytes_ret = out_next - (u8 *)out;
767
1.17k
  } else {
768
1.17k
    if (out_next != out_end)
769
131
      return LIBDEFLATE_SHORT_OUTPUT;
770
1.17k
  }
771
1.04k
  return LIBDEFLATE_SUCCESS;
772
1.17k
}
deflate_decompress.c:deflate_decompress_bmi2 (per-line counts identical to the template listing above)
Unexecuted instantiation: deflate_decompress.c:deflate_decompress_default
773
774
#undef FUNCNAME
775
#undef ATTRIBUTES
776
#undef EXTRACT_VARBITS
777
#undef EXTRACT_VARBITS8