Coverage Report

Created: 2024-07-27 06:04

/work/_deps/deflate-src/lib/decompress_template.h
Line
Count
Source
1
/*
2
 * decompress_template.h
3
 *
4
 * Copyright 2016 Eric Biggers
5
 *
6
 * Permission is hereby granted, free of charge, to any person
7
 * obtaining a copy of this software and associated documentation
8
 * files (the "Software"), to deal in the Software without
9
 * restriction, including without limitation the rights to use,
10
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
11
 * copies of the Software, and to permit persons to whom the
12
 * Software is furnished to do so, subject to the following
13
 * conditions:
14
 *
15
 * The above copyright notice and this permission notice shall be
16
 * included in all copies or substantial portions of the Software.
17
 *
18
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
20
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25
 * OTHER DEALINGS IN THE SOFTWARE.
26
 */
27
28
/*
29
 * This is the actual DEFLATE decompression routine, lifted out of
30
 * deflate_decompress.c so that it can be compiled multiple times with different
31
 * target instruction sets.
32
 */
33
34
#ifndef ATTRIBUTES
35
#  define ATTRIBUTES
36
#endif
37
#ifndef EXTRACT_VARBITS
38
2.02k
#  define EXTRACT_VARBITS(word, count)  ((word) & BITMASK(count))
39
#endif
40
#ifndef EXTRACT_VARBITS8
41
93.8k
#  define EXTRACT_VARBITS8(word, count) ((word) & BITMASK((u8)(count)))
42
#endif
43
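For context, a minimal sketch of how a template header like this is typically instantiated by its including file; the guard condition and attribute spelling below are illustrative rather than copied from deflate_decompress.c. Each inclusion defines FUNCNAME (and optionally ATTRIBUTES), and the #undef block at the end of this header (lines 774-777) resets the macros so the header can be included again:

/* Hypothetical sketch of an including .c file; only FUNCNAME, ATTRIBUTES and
 * the instantiation names shown in this report are taken from the source. */
#if defined(__GNUC__) && defined(__x86_64__)
#  define FUNCNAME   deflate_decompress_bmi2
#  define ATTRIBUTES __attribute__((target("bmi2")))
#  include "decompress_template.h"   /* emits deflate_decompress_bmi2() */
#endif

#define FUNCNAME deflate_decompress_default
#include "decompress_template.h"     /* emits deflate_decompress_default() */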
44
static enum libdeflate_result ATTRIBUTES MAYBE_UNUSED
45
FUNCNAME(struct libdeflate_decompressor * restrict d,
46
   const void * restrict in, size_t in_nbytes,
47
   void * restrict out, size_t out_nbytes_avail,
48
   size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret)
49
1.41k
{
50
1.41k
  u8 *out_next = out;
51
1.41k
  u8 * const out_end = out_next + out_nbytes_avail;
52
1.41k
  u8 * const out_fastloop_end =
53
1.41k
    out_end - MIN(out_nbytes_avail, FASTLOOP_MAX_BYTES_WRITTEN);
54
55
  /* Input bitstream state; see deflate_decompress.c for documentation */
56
1.41k
  const u8 *in_next = in;
57
1.41k
  const u8 * const in_end = in_next + in_nbytes;
58
1.41k
  const u8 * const in_fastloop_end =
59
1.41k
    in_end - MIN(in_nbytes, FASTLOOP_MAX_BYTES_READ);
60
1.41k
  bitbuf_t bitbuf = 0;
61
1.41k
  bitbuf_t saved_bitbuf;
62
1.41k
  u32 bitsleft = 0;
63
1.41k
  size_t overread_count = 0;
64
65
1.41k
  bool is_final_block;
66
1.41k
  unsigned block_type;
67
1.41k
  unsigned num_litlen_syms;
68
1.41k
  unsigned num_offset_syms;
69
1.41k
  bitbuf_t litlen_tablemask;
70
1.41k
  u32 entry;
71
72
7.27k
next_block:
73
  /* Starting to read the next block */
74
7.27k
  ;
75
76
7.27k
  STATIC_ASSERT(CAN_CONSUME(1 + 2 + 5 + 5 + 4 + 3));
77
7.27k
  REFILL_BITS();
78
79
  /* BFINAL: 1 bit */
80
7.24k
  is_final_block = bitbuf & BITMASK(1);
81
82
  /* BTYPE: 2 bits */
83
7.24k
  block_type = (bitbuf >> 1) & BITMASK(2);
84
85
7.24k
  if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) {
86
87
    /* Dynamic Huffman block */
88
89
    /* The order in which precode lengths are stored */
90
2.11k
    static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
91
2.11k
      16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
92
2.11k
    };
93
94
2.11k
    unsigned num_explicit_precode_lens;
95
2.11k
    unsigned i;
96
97
    /* Read the codeword length counts. */
98
99
2.11k
    STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 257 + BITMASK(5));
100
2.11k
    num_litlen_syms = 257 + ((bitbuf >> 3) & BITMASK(5));
101
102
2.11k
    STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 1 + BITMASK(5));
103
2.11k
    num_offset_syms = 1 + ((bitbuf >> 8) & BITMASK(5));
104
105
2.11k
    STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == 4 + BITMASK(4));
106
2.11k
    num_explicit_precode_lens = 4 + ((bitbuf >> 13) & BITMASK(4));
107
108
2.11k
    d->static_codes_loaded = false;
109
110
    /*
111
     * Read the precode codeword lengths.
112
     *
113
     * A 64-bit bitbuffer is just one bit too small to hold the
114
     * maximum number of precode lens, so to minimize branches we
115
     * merge one len with the previous fields.
116
     */
117
2.11k
    STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1);
118
2.11k
    if (CAN_CONSUME(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) {
119
2.11k
      d->u.precode_lens[deflate_precode_lens_permutation[0]] =
120
2.11k
        (bitbuf >> 17) & BITMASK(3);
121
2.11k
      bitbuf >>= 20;
122
2.11k
      bitsleft -= 20;
123
2.11k
      REFILL_BITS();
124
2.10k
      i = 1;
125
32.4k
      do {
126
32.4k
        d->u.precode_lens[deflate_precode_lens_permutation[i]] =
127
32.4k
          bitbuf & BITMASK(3);
128
32.4k
        bitbuf >>= 3;
129
32.4k
        bitsleft -= 3;
130
32.4k
      } while (++i < num_explicit_precode_lens);
131
2.10k
    } else {
132
0
      bitbuf >>= 17;
133
0
      bitsleft -= 17;
134
0
      i = 0;
135
0
      do {
136
0
        if ((u8)bitsleft < 3)
137
0
          REFILL_BITS();
138
0
        d->u.precode_lens[deflate_precode_lens_permutation[i]] =
139
0
          bitbuf & BITMASK(3);
140
0
        bitbuf >>= 3;
141
0
        bitsleft -= 3;
142
0
      } while (++i < num_explicit_precode_lens);
143
0
    }
144
7.57k
    for (; i < DEFLATE_NUM_PRECODE_SYMS; i++)
145
5.47k
      d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0;
146
147
    /* Build the decode table for the precode. */
148
2.10k
    SAFETY_CHECK(build_precode_decode_table(d));
149
150
    /* Decode the litlen and offset codeword lengths. */
151
2.05k
    i = 0;
152
147k
    do {
153
147k
      unsigned presym;
154
147k
      u8 rep_val;
155
147k
      unsigned rep_count;
156
157
147k
      if ((u8)bitsleft < DEFLATE_MAX_PRE_CODEWORD_LEN + 7)
158
9.56k
        REFILL_BITS();
159
160
      /*
161
       * The code below assumes that the precode decode table
162
       * doesn't have any subtables.
163
       */
164
147k
      STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN);
165
166
      /* Decode the next precode symbol. */
167
147k
      entry = d->u.l.precode_decode_table[
168
147k
        bitbuf & BITMASK(DEFLATE_MAX_PRE_CODEWORD_LEN)];
169
147k
      bitbuf >>= (u8)entry;
170
147k
      bitsleft -= entry; /* optimization: subtract full entry */
171
147k
      presym = entry >> 16;
172
173
147k
      if (presym < 16) {
174
        /* Explicit codeword length */
175
95.3k
        d->u.l.lens[i++] = presym;
176
95.3k
        continue;
177
95.3k
      }
178
179
      /* Run-length encoded codeword lengths */
180
181
      /*
182
       * Note: we don't need to immediately verify that the
183
       * repeat count doesn't overflow the number of elements,
184
       * since we've sized the lens array to have enough extra
185
       * space to allow for the worst-case overrun (138 zeroes
186
       * when only 1 length was remaining).
187
       *
188
       * In the case of the small repeat counts (presyms 16
189
       * and 17), it is fastest to always write the maximum
190
       * number of entries.  That gets rid of branches that
191
       * would otherwise be required.
192
       *
193
       * It is not just because of the numerical order that
194
       * our checks go in the order 'presym < 16', 'presym ==
195
       * 16', and 'presym == 17'.  For typical data this is
196
       * ordered from most frequent to least frequent case.
197
       */
198
51.7k
      STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1);
199
200
51.7k
      if (presym == 16) {
201
        /* Repeat the previous length 3 - 6 times. */
202
43.3k
        SAFETY_CHECK(i != 0);
203
43.3k
        rep_val = d->u.l.lens[i - 1];
204
43.3k
        STATIC_ASSERT(3 + BITMASK(2) == 6);
205
43.3k
        rep_count = 3 + (bitbuf & BITMASK(2));
206
43.3k
        bitbuf >>= 2;
207
43.3k
        bitsleft -= 2;
208
43.3k
        d->u.l.lens[i + 0] = rep_val;
209
43.3k
        d->u.l.lens[i + 1] = rep_val;
210
43.3k
        d->u.l.lens[i + 2] = rep_val;
211
43.3k
        d->u.l.lens[i + 3] = rep_val;
212
43.3k
        d->u.l.lens[i + 4] = rep_val;
213
43.3k
        d->u.l.lens[i + 5] = rep_val;
214
43.3k
        i += rep_count;
215
43.3k
      } else if (presym == 17) {
216
        /* Repeat zero 3 - 10 times. */
217
4.46k
        STATIC_ASSERT(3 + BITMASK(3) == 10);
218
4.46k
        rep_count = 3 + (bitbuf & BITMASK(3));
219
4.46k
        bitbuf >>= 3;
220
4.46k
        bitsleft -= 3;
221
4.46k
        d->u.l.lens[i + 0] = 0;
222
4.46k
        d->u.l.lens[i + 1] = 0;
223
4.46k
        d->u.l.lens[i + 2] = 0;
224
4.46k
        d->u.l.lens[i + 3] = 0;
225
4.46k
        d->u.l.lens[i + 4] = 0;
226
4.46k
        d->u.l.lens[i + 5] = 0;
227
4.46k
        d->u.l.lens[i + 6] = 0;
228
4.46k
        d->u.l.lens[i + 7] = 0;
229
4.46k
        d->u.l.lens[i + 8] = 0;
230
4.46k
        d->u.l.lens[i + 9] = 0;
231
4.46k
        i += rep_count;
232
4.46k
      } else {
233
        /* Repeat zero 11 - 138 times. */
234
3.96k
        STATIC_ASSERT(11 + BITMASK(7) == 138);
235
3.96k
        rep_count = 11 + (bitbuf & BITMASK(7));
236
3.96k
        bitbuf >>= 7;
237
3.96k
        bitsleft -= 7;
238
3.96k
        memset(&d->u.l.lens[i], 0,
239
3.96k
               rep_count * sizeof(d->u.l.lens[i]));
240
3.96k
        i += rep_count;
241
3.96k
      }
242
147k
    } while (i < num_litlen_syms + num_offset_syms);
243
244
    /* Unnecessary, but check this for consistency with zlib. */
245
1.96k
    SAFETY_CHECK(i == num_litlen_syms + num_offset_syms);
246
247
5.12k
  } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
248
1.54k
    u16 len, nlen;
249
250
    /*
251
     * Uncompressed block: copy 'len' bytes literally from the input
252
     * buffer to the output buffer.
253
     */
254
255
1.54k
    bitsleft -= 3; /* for BTYPE and BFINAL */
256
257
    /*
258
     * Align the bitstream to the next byte boundary.  This means
259
     * the next byte boundary as if we were reading a byte at a
260
     * time.  Therefore, we have to rewind 'in_next' by any bytes
261
     * that have been refilled but not actually consumed yet (not
262
     * counting overread bytes, which don't increment 'in_next').
263
     */
264
1.54k
    bitsleft = (u8)bitsleft;
265
1.54k
    SAFETY_CHECK(overread_count <= (bitsleft >> 3));
266
1.50k
    in_next -= (bitsleft >> 3) - overread_count;
267
1.50k
    overread_count = 0;
268
1.50k
    bitbuf = 0;
269
1.50k
    bitsleft = 0;
270
271
1.50k
    SAFETY_CHECK(in_end - in_next >= 4);
272
1.49k
    len = get_unaligned_le16(in_next);
273
1.49k
    nlen = get_unaligned_le16(in_next + 2);
274
1.49k
    in_next += 4;
275
276
1.49k
    SAFETY_CHECK(len == (u16)~nlen);
277
1.41k
    if (unlikely(len > out_end - out_next))
278
3
      return LIBDEFLATE_INSUFFICIENT_SPACE;
279
1.41k
    SAFETY_CHECK(len <= in_end - in_next);
280
281
1.40k
    memcpy(out_next, in_next, len);
282
1.40k
    in_next += len;
283
1.40k
    out_next += len;
284
285
1.40k
    goto block_done;
286
287
3.58k
  } else {
288
3.58k
    unsigned i;
289
290
3.58k
    SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN);
291
292
    /*
293
     * Static Huffman block: build the decode tables for the static
294
     * codes.  Skip doing so if the tables are already set up from
295
     * an earlier static block; this speeds up decompression of
296
     * degenerate input of many empty or very short static blocks.
297
     *
298
     * Afterwards, the remainder is the same as decompressing a
299
     * dynamic Huffman block.
300
     */
301
302
3.56k
    bitbuf >>= 3; /* for BTYPE and BFINAL */
303
3.56k
    bitsleft -= 3;
304
305
3.56k
    if (d->static_codes_loaded)
306
912
      goto have_decode_tables;
307
308
2.65k
    d->static_codes_loaded = true;
309
310
2.65k
    STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288);
311
2.65k
    STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32);
312
313
384k
    for (i = 0; i < 144; i++)
314
382k
      d->u.l.lens[i] = 8;
315
299k
    for (; i < 256; i++)
316
297k
      d->u.l.lens[i] = 9;
317
66.3k
    for (; i < 280; i++)
318
63.6k
      d->u.l.lens[i] = 7;
319
23.8k
    for (; i < 288; i++)
320
21.2k
      d->u.l.lens[i] = 8;
321
322
87.5k
    for (; i < 288 + 32; i++)
323
84.9k
      d->u.l.lens[i] = 5;
324
325
2.65k
    num_litlen_syms = 288;
326
2.65k
    num_offset_syms = 32;
327
2.65k
  }
328
329
  /* Decompressing a Huffman block (either dynamic or static) */
330
331
4.58k
  SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms));
332
4.54k
  SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms));
333
5.41k
have_decode_tables:
334
5.41k
  litlen_tablemask = BITMASK(d->litlen_tablebits);
335
336
  /*
337
   * This is the "fastloop" for decoding literals and matches.  It does
338
   * bounds checks on in_next and out_next in the loop conditions so that
339
   * additional bounds checks aren't needed inside the loop body.
340
   *
341
   * To reduce latency, the bitbuffer is refilled and the next litlen
342
   * decode table entry is preloaded before each loop iteration.
343
   */
344
5.41k
  if (in_next >= in_fastloop_end || out_next >= out_fastloop_end)
345
1.51k
    goto generic_loop;
346
3.90k
  REFILL_BITS_IN_FASTLOOP();
347
3.90k
  entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
348
89.3k
  do {
349
89.3k
    u32 length, offset, lit;
350
89.3k
    const u8 *src;
351
89.3k
    u8 *dst;
352
353
    /*
354
     * Consume the bits for the litlen decode table entry.  Save the
355
     * original bitbuf for later, in case the extra match length
356
     * bits need to be extracted from it.
357
     */
358
89.3k
    saved_bitbuf = bitbuf;
359
89.3k
    bitbuf >>= (u8)entry;
360
89.3k
    bitsleft -= entry; /* optimization: subtract full entry */
361
362
    /*
363
     * Begin by checking for a "fast" literal, i.e. a literal that
364
     * doesn't need a subtable.
365
     */
366
89.3k
    if (entry & HUFFDEC_LITERAL) {
367
      /*
368
       * On 64-bit platforms, we decode up to 2 extra fast
369
       * literals in addition to the primary item, as this
370
       * increases performance and still leaves enough bits
371
       * remaining for what follows.  We could actually do 3,
372
       * assuming LITLEN_TABLEBITS=11, but that actually
373
       * decreases performance slightly (perhaps by messing
374
       * with the branch prediction of the conditional refill
375
       * that happens later while decoding the match offset).
376
       *
377
       * Note: the definitions of FASTLOOP_MAX_BYTES_WRITTEN
378
       * and FASTLOOP_MAX_BYTES_READ need to be updated if the
379
       * number of extra literals decoded here is changed.
380
       */
381
55.1k
      if (/* enough bits for 2 fast literals + length + offset preload? */
382
55.1k
          CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS +
383
55.1k
               LENGTH_MAXBITS,
384
55.1k
               OFFSET_TABLEBITS) &&
385
          /* enough bits for 2 fast literals + slow literal + litlen preload? */
386
55.1k
          CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS +
387
55.1k
               DEFLATE_MAX_LITLEN_CODEWORD_LEN,
388
55.1k
               LITLEN_TABLEBITS)) {
389
        /* 1st extra fast literal */
390
55.1k
        lit = entry >> 16;
391
55.1k
        entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
392
55.1k
        saved_bitbuf = bitbuf;
393
55.1k
        bitbuf >>= (u8)entry;
394
55.1k
        bitsleft -= entry;
395
55.1k
        *out_next++ = lit;
396
55.1k
        if (entry & HUFFDEC_LITERAL) {
397
          /* 2nd extra fast literal */
398
48.9k
          lit = entry >> 16;
399
48.9k
          entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
400
48.9k
          saved_bitbuf = bitbuf;
401
48.9k
          bitbuf >>= (u8)entry;
402
48.9k
          bitsleft -= entry;
403
48.9k
          *out_next++ = lit;
404
48.9k
          if (entry & HUFFDEC_LITERAL) {
405
            /*
406
             * Another fast literal, but
407
             * this one is in lieu of the
408
             * primary item, so it doesn't
409
             * count as one of the extras.
410
             */
411
42.9k
            lit = entry >> 16;
412
42.9k
            entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
413
42.9k
            REFILL_BITS_IN_FASTLOOP();
414
42.9k
            *out_next++ = lit;
415
42.9k
            continue;
416
42.9k
          }
417
48.9k
        }
418
55.1k
      } else {
419
        /*
420
         * Decode a literal.  While doing so, preload
421
         * the next litlen decode table entry and refill
422
         * the bitbuffer.  To reduce latency, we've
423
         * arranged for there to be enough "preloadable"
424
         * bits remaining to do the table preload
425
         * independently of the refill.
426
         */
427
0
        STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(
428
0
            LITLEN_TABLEBITS, LITLEN_TABLEBITS));
429
0
        lit = entry >> 16;
430
0
        entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
431
0
        REFILL_BITS_IN_FASTLOOP();
432
0
        *out_next++ = lit;
433
0
        continue;
434
0
      }
435
55.1k
    }
436
437
    /*
438
     * It's not a literal entry, so it can be a length entry, a
439
     * subtable pointer entry, or an end-of-block entry.  Detect the
440
     * two unlikely cases by testing the HUFFDEC_EXCEPTIONAL flag.
441
     */
442
46.3k
    if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
443
      /* Subtable pointer or end-of-block entry */
444
445
5.12k
      if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
446
3.45k
        goto block_done;
447
448
      /*
449
       * A subtable is required.  Load and consume the
450
       * subtable entry.  The subtable entry can be of any
451
       * type: literal, length, or end-of-block.
452
       */
453
1.66k
      entry = d->u.litlen_decode_table[(entry >> 16) +
454
1.66k
        EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
455
1.66k
      saved_bitbuf = bitbuf;
456
1.66k
      bitbuf >>= (u8)entry;
457
1.66k
      bitsleft -= entry;
458
459
      /*
460
       * 32-bit platforms that use the byte-at-a-time refill
461
       * method have to do a refill here for there to always
462
       * be enough bits to decode a literal that requires a
463
       * subtable, then preload the next litlen decode table
464
       * entry; or to decode a match length that requires a
465
       * subtable, then preload the offset decode table entry.
466
       */
467
1.66k
      if (!CAN_CONSUME_AND_THEN_PRELOAD(DEFLATE_MAX_LITLEN_CODEWORD_LEN,
468
1.66k
                LITLEN_TABLEBITS) ||
469
1.66k
          !CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXBITS,
470
1.66k
                OFFSET_TABLEBITS))
471
0
        REFILL_BITS_IN_FASTLOOP();
472
1.66k
      if (entry & HUFFDEC_LITERAL) {
473
        /* Decode a literal that required a subtable. */
474
1.32k
        lit = entry >> 16;
475
1.32k
        entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
476
1.32k
        REFILL_BITS_IN_FASTLOOP();
477
1.32k
        *out_next++ = lit;
478
1.32k
        continue;
479
1.32k
      }
480
346
      if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
481
135
        goto block_done;
482
      /* Else, it's a length that required a subtable. */
483
346
    }
484
485
    /*
486
     * Decode the match length: the length base value associated
487
     * with the litlen symbol (which we extract from the decode
488
     * table entry), plus the extra length bits.  We don't need to
489
     * consume the extra length bits here, as they were included in
490
     * the bits consumed by the entry earlier.  We also don't need
491
     * to check for too-long matches here, as this is inside the
492
     * fastloop where it's already been verified that the output
493
     * buffer has enough space remaining to copy a max-length match.
494
     */
495
41.4k
    length = entry >> 16;
496
41.4k
    length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);
497
498
    /*
499
     * Decode the match offset.  There are enough "preloadable" bits
500
     * remaining to preload the offset decode table entry, but a
501
     * refill might be needed before consuming it.
502
     */
503
41.4k
    STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXFASTBITS,
504
41.4k
                 OFFSET_TABLEBITS));
505
41.4k
    entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)];
506
41.4k
    if (CAN_CONSUME_AND_THEN_PRELOAD(OFFSET_MAXBITS,
507
41.4k
             LITLEN_TABLEBITS)) {
508
      /*
509
       * Decoding a match offset on a 64-bit platform.  We may
510
       * need to refill once, but then we can decode the whole
511
       * offset and preload the next litlen table entry.
512
       */
513
41.4k
      if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
514
        /* Offset codeword requires a subtable */
515
0
        if (unlikely((u8)bitsleft < OFFSET_MAXBITS +
516
0
               LITLEN_TABLEBITS - PRELOAD_SLACK))
517
0
          REFILL_BITS_IN_FASTLOOP();
518
0
        bitbuf >>= OFFSET_TABLEBITS;
519
0
        bitsleft -= OFFSET_TABLEBITS;
520
0
        entry = d->offset_decode_table[(entry >> 16) +
521
0
          EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
522
41.4k
      } else if (unlikely((u8)bitsleft < OFFSET_MAXFASTBITS +
523
41.4k
              LITLEN_TABLEBITS - PRELOAD_SLACK))
524
502
        REFILL_BITS_IN_FASTLOOP();
525
41.4k
    } else {
526
      /* Decoding a match offset on a 32-bit platform */
527
0
      REFILL_BITS_IN_FASTLOOP();
528
0
      if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
529
        /* Offset codeword requires a subtable */
530
0
        bitbuf >>= OFFSET_TABLEBITS;
531
0
        bitsleft -= OFFSET_TABLEBITS;
532
0
        entry = d->offset_decode_table[(entry >> 16) +
533
0
          EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
534
0
        REFILL_BITS_IN_FASTLOOP();
535
        /* No further refill needed before extra bits */
536
0
        STATIC_ASSERT(CAN_CONSUME(
537
0
          OFFSET_MAXBITS - OFFSET_TABLEBITS));
538
0
      } else {
539
        /* No refill needed before extra bits */
540
0
        STATIC_ASSERT(CAN_CONSUME(OFFSET_MAXFASTBITS));
541
0
      }
542
0
    }
543
41.4k
    saved_bitbuf = bitbuf;
544
41.4k
    bitbuf >>= (u8)entry;
545
41.4k
    bitsleft -= entry; /* optimization: subtract full entry */
546
41.4k
    offset = entry >> 16;
547
41.4k
    offset += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);
548
549
    /* Validate the match offset; needed even in the fastloop. */
550
41.4k
    SAFETY_CHECK(offset <= out_next - (const u8 *)out);
551
41.4k
    src = out_next - offset;
552
41.4k
    dst = out_next;
553
41.4k
    out_next += length;
554
555
    /*
556
     * Before starting to issue the instructions to copy the match,
557
     * refill the bitbuffer and preload the litlen decode table
558
     * entry for the next loop iteration.  This can increase
559
     * performance by allowing the latency of the match copy to
560
     * overlap with these other operations.  To further reduce
561
     * latency, we've arranged for there to be enough bits remaining
562
     * to do the table preload independently of the refill, except
563
     * on 32-bit platforms using the byte-at-a-time refill method.
564
     */
565
41.4k
    if (!CAN_CONSUME_AND_THEN_PRELOAD(
566
41.4k
      MAX(OFFSET_MAXBITS - OFFSET_TABLEBITS,
567
41.4k
          OFFSET_MAXFASTBITS),
568
41.4k
      LITLEN_TABLEBITS) &&
569
41.4k
        unlikely((u8)bitsleft < LITLEN_TABLEBITS - PRELOAD_SLACK))
570
0
      REFILL_BITS_IN_FASTLOOP();
571
41.4k
    entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
572
41.4k
    REFILL_BITS_IN_FASTLOOP();
573
574
    /*
575
     * Copy the match.  On most CPUs the fastest method is a
576
     * word-at-a-time copy, unconditionally copying about 5 words
577
     * since this is enough for most matches without being too much.
578
     *
579
     * The normal word-at-a-time copy works for offset >= WORDBYTES,
580
     * which is most cases.  The case of offset == 1 is also common
581
     * and is worth optimizing for, since it is just RLE encoding of
582
     * the previous byte, which is the result of compressing long
583
     * runs of the same byte.
584
     *
585
     * Writing past the match 'length' is allowed here, since it's
586
     * been ensured there is enough output space left for a slight
587
     * overrun.  FASTLOOP_MAX_BYTES_WRITTEN needs to be updated if
588
     * the maximum possible overrun here is changed.
589
     */
590
41.4k
    if (UNALIGNED_ACCESS_IS_FAST && offset >= WORDBYTES) {
591
6.68k
      store_word_unaligned(load_word_unaligned(src), dst);
592
6.68k
      src += WORDBYTES;
593
6.68k
      dst += WORDBYTES;
594
6.68k
      store_word_unaligned(load_word_unaligned(src), dst);
595
6.68k
      src += WORDBYTES;
596
6.68k
      dst += WORDBYTES;
597
6.68k
      store_word_unaligned(load_word_unaligned(src), dst);
598
6.68k
      src += WORDBYTES;
599
6.68k
      dst += WORDBYTES;
600
6.68k
      store_word_unaligned(load_word_unaligned(src), dst);
601
6.68k
      src += WORDBYTES;
602
6.68k
      dst += WORDBYTES;
603
6.68k
      store_word_unaligned(load_word_unaligned(src), dst);
604
6.68k
      src += WORDBYTES;
605
6.68k
      dst += WORDBYTES;
606
25.5k
      while (dst < out_next) {
607
18.8k
        store_word_unaligned(load_word_unaligned(src), dst);
608
18.8k
        src += WORDBYTES;
609
18.8k
        dst += WORDBYTES;
610
18.8k
        store_word_unaligned(load_word_unaligned(src), dst);
611
18.8k
        src += WORDBYTES;
612
18.8k
        dst += WORDBYTES;
613
18.8k
        store_word_unaligned(load_word_unaligned(src), dst);
614
18.8k
        src += WORDBYTES;
615
18.8k
        dst += WORDBYTES;
616
18.8k
        store_word_unaligned(load_word_unaligned(src), dst);
617
18.8k
        src += WORDBYTES;
618
18.8k
        dst += WORDBYTES;
619
18.8k
        store_word_unaligned(load_word_unaligned(src), dst);
620
18.8k
        src += WORDBYTES;
621
18.8k
        dst += WORDBYTES;
622
18.8k
      }
623
34.7k
    } else if (UNALIGNED_ACCESS_IS_FAST && offset == 1) {
624
33.0k
      machine_word_t v;
625
626
      /*
627
       * This part tends to get auto-vectorized, so keep it
628
       * copying a multiple of 16 bytes at a time.
629
       */
630
33.0k
      v = (machine_word_t)0x0101010101010101 * src[0];
631
33.0k
      store_word_unaligned(v, dst);
632
33.0k
      dst += WORDBYTES;
633
33.0k
      store_word_unaligned(v, dst);
634
33.0k
      dst += WORDBYTES;
635
33.0k
      store_word_unaligned(v, dst);
636
33.0k
      dst += WORDBYTES;
637
33.0k
      store_word_unaligned(v, dst);
638
33.0k
      dst += WORDBYTES;
639
151k
      while (dst < out_next) {
640
118k
        store_word_unaligned(v, dst);
641
118k
        dst += WORDBYTES;
642
118k
        store_word_unaligned(v, dst);
643
118k
        dst += WORDBYTES;
644
118k
        store_word_unaligned(v, dst);
645
118k
        dst += WORDBYTES;
646
118k
        store_word_unaligned(v, dst);
647
118k
        dst += WORDBYTES;
648
118k
      }
649
33.0k
    } else if (UNALIGNED_ACCESS_IS_FAST) {
650
1.62k
      store_word_unaligned(load_word_unaligned(src), dst);
651
1.62k
      src += offset;
652
1.62k
      dst += offset;
653
1.62k
      store_word_unaligned(load_word_unaligned(src), dst);
654
1.62k
      src += offset;
655
1.62k
      dst += offset;
656
42.6k
      do {
657
42.6k
        store_word_unaligned(load_word_unaligned(src), dst);
658
42.6k
        src += offset;
659
42.6k
        dst += offset;
660
42.6k
        store_word_unaligned(load_word_unaligned(src), dst);
661
42.6k
        src += offset;
662
42.6k
        dst += offset;
663
42.6k
      } while (dst < out_next);
664
1.62k
    } else {
665
0
      *dst++ = *src++;
666
0
      *dst++ = *src++;
667
0
      do {
668
0
        *dst++ = *src++;
669
0
      } while (dst < out_next);
670
0
    }
671
85.7k
  } while (in_next < in_fastloop_end && out_next < out_fastloop_end);
672
673
  /*
674
   * This is the generic loop for decoding literals and matches.  This
675
   * handles cases where in_next and out_next are close to the end of
676
   * their respective buffers.  Usually this loop isn't performance-
677
   * critical, as most time is spent in the fastloop above instead.  We
678
   * therefore omit some optimizations here in favor of smaller code.
679
   */
680
1.81k
generic_loop:
681
19.8k
  for (;;) {
682
19.8k
    u32 length, offset;
683
19.8k
    const u8 *src;
684
19.8k
    u8 *dst;
685
686
19.8k
    REFILL_BITS();
687
19.6k
    entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
688
19.6k
    saved_bitbuf = bitbuf;
689
19.6k
    bitbuf >>= (u8)entry;
690
19.6k
    bitsleft -= entry;
691
19.6k
    if (unlikely(entry & HUFFDEC_SUBTABLE_POINTER)) {
692
357
      entry = d->u.litlen_decode_table[(entry >> 16) +
693
357
          EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
694
357
      saved_bitbuf = bitbuf;
695
357
      bitbuf >>= (u8)entry;
696
357
      bitsleft -= entry;
697
357
    }
698
19.6k
    length = entry >> 16;
699
19.6k
    if (entry & HUFFDEC_LITERAL) {
700
12.6k
      if (unlikely(out_next == out_end))
701
9
        return LIBDEFLATE_INSUFFICIENT_SPACE;
702
12.6k
      *out_next++ = length;
703
12.6k
      continue;
704
12.6k
    }
705
7.02k
    if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
706
1.53k
      goto block_done;
707
5.49k
    length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);
708
5.49k
    if (unlikely(length > out_end - out_next))
709
20
      return LIBDEFLATE_INSUFFICIENT_SPACE;
710
711
5.47k
    if (!CAN_CONSUME(LENGTH_MAXBITS + OFFSET_MAXBITS))
712
0
      REFILL_BITS();
713
5.47k
    entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)];
714
5.47k
    if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
715
0
      bitbuf >>= OFFSET_TABLEBITS;
716
0
      bitsleft -= OFFSET_TABLEBITS;
717
0
      entry = d->offset_decode_table[(entry >> 16) +
718
0
          EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
719
0
      if (!CAN_CONSUME(OFFSET_MAXBITS))
720
0
        REFILL_BITS();
721
0
    }
722
5.47k
    offset = entry >> 16;
723
5.47k
    offset += EXTRACT_VARBITS8(bitbuf, entry) >> (u8)(entry >> 8);
724
5.47k
    bitbuf >>= (u8)entry;
725
5.47k
    bitsleft -= entry;
726
727
5.47k
    SAFETY_CHECK(offset <= out_next - (const u8 *)out);
728
5.38k
    src = out_next - offset;
729
5.38k
    dst = out_next;
730
5.38k
    out_next += length;
731
732
5.38k
    STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3);
733
5.38k
    *dst++ = *src++;
734
5.38k
    *dst++ = *src++;
735
442k
    do {
736
442k
      *dst++ = *src++;
737
442k
    } while (dst < out_next);
738
5.38k
  }
739
740
6.52k
block_done:
741
  /* Finished decoding a block */
742
743
6.52k
  if (!is_final_block)
744
5.86k
    goto next_block;
745
746
  /* That was the last block. */
747
748
661
  bitsleft = (u8)bitsleft;
749
750
  /*
751
   * If any of the implicit appended zero bytes were consumed (not just
752
   * refilled) before hitting end of stream, then the data is bad.
753
   */
754
661
  SAFETY_CHECK(overread_count <= (bitsleft >> 3));
755
756
  /* Optionally return the actual number of bytes consumed. */
757
611
  if (actual_in_nbytes_ret) {
758
    /* Don't count bytes that were refilled but not consumed. */
759
611
    in_next -= (bitsleft >> 3) - overread_count;
760
761
611
    *actual_in_nbytes_ret = in_next - (u8 *)in;
762
611
  }
763
764
  /* Optionally return the actual number of bytes written. */
765
611
  if (actual_out_nbytes_ret) {
766
607
    *actual_out_nbytes_ret = out_next - (u8 *)out;
767
607
  } else {
768
4
    if (out_next != out_end)
769
2
      return LIBDEFLATE_SHORT_OUTPUT;
770
4
  }
771
609
  return LIBDEFLATE_SUCCESS;
772
611
}
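The "optimization: subtract full entry" comments above rely on two facts visible in the listing: the low byte of a decode-table entry holds the codeword's total bit count (hence bitbuf >>= (u8)entry), and bitsleft is only ever inspected through (u8)bitsleft. A standalone sketch (with a made-up entry value, not a real table entry) of why subtracting the whole 32-bit entry is safe:

/* Standalone demo (not libdeflate code): borrows in a subtraction only
 * propagate upward, so (u8)bitsleft stays exact even though the upper
 * bytes of 'bitsleft' become meaningless after 'bitsleft -= entry'. */
#include <stdint.h>
#include <stdio.h>

typedef uint8_t u8;
typedef uint32_t u32;

int main(void)
{
    u32 bitsleft = 56;
    u32 entry = 0x00480005;                 /* hypothetical entry consuming 5 bits */

    bitsleft -= entry;                      /* subtract the full entry */
    printf("%u\n", (unsigned)(u8)bitsleft); /* prints 51, i.e. 56 - 5 */
    return 0;
}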
deflate_decompress.c:deflate_decompress_bmi2
Unexecuted instantiation: deflate_decompress.c:deflate_decompress_default
773
774
#undef FUNCNAME
775
#undef ATTRIBUTES
776
#undef EXTRACT_VARBITS
777
#undef EXTRACT_VARBITS8
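The offset == 1 fast path in the fastloop above broadcasts the previous byte across a machine word with a 0x0101...01 multiply and then stores whole words, deliberately overshooting the match length. A standalone sketch of the same idea, with memcpy standing in for store_word_unaligned and an invented buffer and match length:

/* Standalone demo (not libdeflate code) of the offset == 1 match copy:
 * broadcast the previous byte to a 64-bit word, then store words until at
 * least 'length' bytes are written; a small overrun past the match end is
 * tolerated, as FASTLOOP_MAX_BYTES_WRITTEN tolerates it in the real decoder. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    uint8_t buf[64] = { 'A' };                /* buf[0] is the "previous byte" */
    uint8_t *dst = buf + 1;                   /* match: offset 1, length 20 */
    uint8_t *end = dst + 20;
    uint64_t v = UINT64_C(0x0101010101010101) * buf[0];

    do {
        memcpy(dst, &v, sizeof(v));           /* portable unaligned word store */
        dst += sizeof(v);
    } while (dst < end);

    buf[21] = '\0';
    printf("%s\n", buf);                      /* 21 consecutive 'A' bytes */
    return 0;
}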