Coverage Report

Created: 2025-10-10 06:38

/src/openexr/external/deflate/lib/decompress_template.h
Line
Count
Source
1
/*
2
 * decompress_template.h
3
 *
4
 * Copyright 2016 Eric Biggers
5
 *
6
 * Permission is hereby granted, free of charge, to any person
7
 * obtaining a copy of this software and associated documentation
8
 * files (the "Software"), to deal in the Software without
9
 * restriction, including without limitation the rights to use,
10
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
11
 * copies of the Software, and to permit persons to whom the
12
 * Software is furnished to do so, subject to the following
13
 * conditions:
14
 *
15
 * The above copyright notice and this permission notice shall be
16
 * included in all copies or substantial portions of the Software.
17
 *
18
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
20
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25
 * OTHER DEALINGS IN THE SOFTWARE.
26
 */
27
28
/*
29
 * This is the actual DEFLATE decompression routine, lifted out of
30
 * deflate_decompress.c so that it can be compiled multiple times with different
31
 * target instruction sets.
32
 */
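For context, a minimal sketch of how an including source file can stamp out the two instantiations named in this report (the target attribute shown is an assumption for illustration; the real includer may also override EXTRACT_VARBITS/EXTRACT_VARBITS8, e.g. with a BMI2 intrinsic such as _bzhi_u64(), before including):

    /* default instantiation */
    #define FUNCNAME deflate_decompress_default
    #include "decompress_template.h"

    /* BMI2 instantiation (hypothetical ATTRIBUTES value) */
    #define FUNCNAME   deflate_decompress_bmi2
    #define ATTRIBUTES __attribute__((target("bmi2")))
    #include "decompress_template.h"

The template #undefs FUNCNAME, ATTRIBUTES, and the EXTRACT_VARBITS* macros at its end (lines 774-777), so it can be re-included with fresh definitions.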
33
34
#ifndef ATTRIBUTES
35
#  define ATTRIBUTES
36
#endif
37
#ifndef EXTRACT_VARBITS
38
9.53k
#  define EXTRACT_VARBITS(word, count)  ((word) & BITMASK(count))
39
#endif
40
#ifndef EXTRACT_VARBITS8
41
142k
#  define EXTRACT_VARBITS8(word, count) ((word) & BITMASK((u8)(count)))
42
#endif
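These fallback definitions keep only the low 'count' bits of 'word'. The static asserts later in this file (e.g. '3 + BITMASK(2) == 6' and '11 + BITMASK(7) == 138') pin down what BITMASK must evaluate to, so a minimal mental model is:

    /* Behaviour implied by the static asserts; the real BITMASK is defined
     * by the including code and may be spelled differently. */
    #define BITMASK(n)  (((bitbuf_t)1 << (n)) - 1)

    /* so EXTRACT_VARBITS(word, 3) == (word & 0x7), the low three bits */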
43
44
static ATTRIBUTES MAYBE_UNUSED enum libdeflate_result
45
FUNCNAME(struct libdeflate_decompressor * restrict d,
46
   const void * restrict in, size_t in_nbytes,
47
   void * restrict out, size_t out_nbytes_avail,
48
   size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret)
49
1.87k
{
50
1.87k
  u8 *out_next = out;
51
1.87k
  u8 * const out_end = out_next + out_nbytes_avail;
52
1.87k
  u8 * const out_fastloop_end =
53
1.87k
    out_end - MIN(out_nbytes_avail, FASTLOOP_MAX_BYTES_WRITTEN);
54
55
  /* Input bitstream state; see deflate_decompress.c for documentation */
56
1.87k
  const u8 *in_next = in;
57
1.87k
  const u8 * const in_end = in_next + in_nbytes;
58
1.87k
  const u8 * const in_fastloop_end =
59
1.87k
    in_end - MIN(in_nbytes, FASTLOOP_MAX_BYTES_READ);
60
1.87k
  bitbuf_t bitbuf = 0;
61
1.87k
  bitbuf_t saved_bitbuf;
62
1.87k
  u32 bitsleft = 0;
63
1.87k
  size_t overread_count = 0;
64
65
1.87k
  bool is_final_block;
66
1.87k
  unsigned block_type;
67
1.87k
  unsigned num_litlen_syms;
68
1.87k
  unsigned num_offset_syms;
69
1.87k
  bitbuf_t litlen_tablemask;
70
1.87k
  u32 entry;
71
72
7.64k
next_block:
73
  /* Starting to read the next block */
74
7.64k
  ;
75
76
7.64k
  STATIC_ASSERT(CAN_CONSUME(1 + 2 + 5 + 5 + 4 + 3));
77
7.64k
  REFILL_BITS();
78
79
  /* BFINAL: 1 bit */
80
7.60k
  is_final_block = bitbuf & BITMASK(1);
81
82
  /* BTYPE: 2 bits */
83
7.60k
  block_type = (bitbuf >> 1) & BITMASK(2);
84
85
7.60k
  if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) {
86
87
    /* Dynamic Huffman block */
88
89
    /* The order in which precode lengths are stored */
90
2.51k
    static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
91
2.51k
      16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
92
2.51k
    };
93
94
2.51k
    unsigned num_explicit_precode_lens;
95
2.51k
    unsigned i;
96
97
    /* Read the codeword length counts. */
98
99
2.51k
    STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 257 + BITMASK(5));
100
2.51k
    num_litlen_syms = 257 + ((bitbuf >> 3) & BITMASK(5));
101
102
2.51k
    STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 1 + BITMASK(5));
103
2.51k
    num_offset_syms = 1 + ((bitbuf >> 8) & BITMASK(5));
104
105
2.51k
    STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == 4 + BITMASK(4));
106
2.51k
    num_explicit_precode_lens = 4 + ((bitbuf >> 13) & BITMASK(4));
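The three shift amounts used above (3, 8 and 13) follow the fixed layout of a dynamic-block header in RFC 1951; read in isolation, the same fields look like this (variable names are illustrative only):

    unsigned bfinal = bitbuf & 1;            /* bit 0:      last-block flag         */
    unsigned btype  = (bitbuf >> 1) & 0x3;   /* bits 1-2:   block type              */
    unsigned hlit   = (bitbuf >> 3) & 0x1f;  /* bits 3-7:   litlen codes  minus 257 */
    unsigned hdist  = (bitbuf >> 8) & 0x1f;  /* bits 8-12:  offset codes  minus 1   */
    unsigned hclen  = (bitbuf >> 13) & 0xf;  /* bits 13-16: precode lens  minus 4   */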
107
108
2.51k
    d->static_codes_loaded = false;
109
110
    /*
111
     * Read the precode codeword lengths.
112
     *
113
     * A 64-bit bitbuffer is just one bit too small to hold the
114
     * maximum number of precode lens, so to minimize branches we
115
     * merge one len with the previous fields.
116
     */
117
2.51k
    STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1);
118
2.51k
    if (CAN_CONSUME(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) {
119
2.51k
      d->u.precode_lens[deflate_precode_lens_permutation[0]] =
120
2.51k
        (bitbuf >> 17) & BITMASK(3);
121
2.51k
      bitbuf >>= 20;
122
2.51k
      bitsleft -= 20;
123
2.51k
      REFILL_BITS();
124
2.50k
      i = 1;
125
37.0k
      do {
126
37.0k
        d->u.precode_lens[deflate_precode_lens_permutation[i]] =
127
37.0k
          bitbuf & BITMASK(3);
128
37.0k
        bitbuf >>= 3;
129
37.0k
        bitsleft -= 3;
130
37.0k
      } while (++i < num_explicit_precode_lens);
131
2.50k
    } else {
132
0
      bitbuf >>= 17;
133
0
      bitsleft -= 17;
134
0
      i = 0;
135
0
      do {
136
0
        if ((u8)bitsleft < 3)
137
0
          REFILL_BITS();
138
0
        d->u.precode_lens[deflate_precode_lens_permutation[i]] =
139
0
          bitbuf & BITMASK(3);
140
0
        bitbuf >>= 3;
141
0
        bitsleft -= 3;
142
0
      } while (++i < num_explicit_precode_lens);
143
0
    }
144
10.6k
    for (; i < DEFLATE_NUM_PRECODE_SYMS; i++)
145
8.12k
      d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0;
146
147
    /* Build the decode table for the precode. */
148
2.50k
    SAFETY_CHECK(build_precode_decode_table(d));
149
150
    /* Decode the litlen and offset codeword lengths. */
151
2.44k
    i = 0;
152
172k
    do {
153
172k
      unsigned presym;
154
172k
      u8 rep_val;
155
172k
      unsigned rep_count;
156
157
172k
      if ((u8)bitsleft < DEFLATE_MAX_PRE_CODEWORD_LEN + 7)
158
12.8k
        REFILL_BITS();
159
160
      /*
161
       * The code below assumes that the precode decode table
162
       * doesn't have any subtables.
163
       */
164
172k
      STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN);
165
166
      /* Decode the next precode symbol. */
167
172k
      entry = d->u.l.precode_decode_table[
168
172k
        bitbuf & BITMASK(DEFLATE_MAX_PRE_CODEWORD_LEN)];
169
172k
      bitbuf >>= (u8)entry;
170
172k
      bitsleft -= entry; /* optimization: subtract full entry */
171
172k
      presym = entry >> 16;
172
173
172k
      if (presym < 16) {
174
        /* Explicit codeword length */
175
127k
        d->u.l.lens[i++] = presym;
176
127k
        continue;
177
127k
      }
178
179
      /* Run-length encoded codeword lengths */
180
181
      /*
182
       * Note: we don't need to immediately verify that the
183
       * repeat count doesn't overflow the number of elements,
184
       * since we've sized the lens array to have enough extra
185
       * space to allow for the worst-case overrun (138 zeroes
186
       * when only 1 length was remaining).
187
       *
188
       * In the case of the small repeat counts (presyms 16
189
       * and 17), it is fastest to always write the maximum
190
       * number of entries.  That gets rid of branches that
191
       * would otherwise be required.
192
       *
193
       * It is not just because of the numerical order that
194
       * our checks go in the order 'presym < 16', 'presym ==
195
       * 16', and 'presym == 17'.  For typical data this is
196
       * ordered from most frequent to least frequent case.
197
       */
198
44.5k
      STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1);
199
200
44.5k
      if (presym == 16) {
201
        /* Repeat the previous length 3 - 6 times. */
202
30.4k
        SAFETY_CHECK(i != 0);
203
30.4k
        rep_val = d->u.l.lens[i - 1];
204
30.4k
        STATIC_ASSERT(3 + BITMASK(2) == 6);
205
30.4k
        rep_count = 3 + (bitbuf & BITMASK(2));
206
30.4k
        bitbuf >>= 2;
207
30.4k
        bitsleft -= 2;
208
30.4k
        d->u.l.lens[i + 0] = rep_val;
209
30.4k
        d->u.l.lens[i + 1] = rep_val;
210
30.4k
        d->u.l.lens[i + 2] = rep_val;
211
30.4k
        d->u.l.lens[i + 3] = rep_val;
212
30.4k
        d->u.l.lens[i + 4] = rep_val;
213
30.4k
        d->u.l.lens[i + 5] = rep_val;
214
30.4k
        i += rep_count;
215
30.4k
      } else if (presym == 17) {
216
        /* Repeat zero 3 - 10 times. */
217
8.73k
        STATIC_ASSERT(3 + BITMASK(3) == 10);
218
8.73k
        rep_count = 3 + (bitbuf & BITMASK(3));
219
8.73k
        bitbuf >>= 3;
220
8.73k
        bitsleft -= 3;
221
8.73k
        d->u.l.lens[i + 0] = 0;
222
8.73k
        d->u.l.lens[i + 1] = 0;
223
8.73k
        d->u.l.lens[i + 2] = 0;
224
8.73k
        d->u.l.lens[i + 3] = 0;
225
8.73k
        d->u.l.lens[i + 4] = 0;
226
8.73k
        d->u.l.lens[i + 5] = 0;
227
8.73k
        d->u.l.lens[i + 6] = 0;
228
8.73k
        d->u.l.lens[i + 7] = 0;
229
8.73k
        d->u.l.lens[i + 8] = 0;
230
8.73k
        d->u.l.lens[i + 9] = 0;
231
8.73k
        i += rep_count;
232
8.73k
      } else {
233
        /* Repeat zero 11 - 138 times. */
234
5.37k
        STATIC_ASSERT(11 + BITMASK(7) == 138);
235
5.37k
        rep_count = 11 + (bitbuf & BITMASK(7));
236
5.37k
        bitbuf >>= 7;
237
5.37k
        bitsleft -= 7;
238
5.37k
        memset(&d->u.l.lens[i], 0,
239
5.37k
               rep_count * sizeof(d->u.l.lens[i]));
240
5.37k
        i += rep_count;
241
5.37k
      }
242
172k
    } while (i < num_litlen_syms + num_offset_syms);
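As the three branches above implement it, any presym >= 16 expands into a run of lengths rather than a single length. A made-up fragment, with the extra bits read from the low end of bitbuf:

    /* lens so far end in ... 5, 7, 7   (i points just past the last 7)      */
    /* presym 16, extra bits 10b      (= 2): append 3 + 2 = 5 more copies of 7 */
    /* presym 17, extra bits 001b     (= 1): append 3 + 1 = 4 zeros            */
    /* presym 18, extra bits 0000101b (= 5): append 11 + 5 = 16 zeros          */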
243
244
    /* Unnecessary, but check this for consistency with zlib. */
245
2.35k
    SAFETY_CHECK(i == num_litlen_syms + num_offset_syms);
246
247
5.08k
  } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
248
936
    u16 len, nlen;
249
250
    /*
251
     * Uncompressed block: copy 'len' bytes literally from the input
252
     * buffer to the output buffer.
253
     */
254
255
936
    bitsleft -= 3; /* for BTYPE and BFINAL */
256
257
    /*
258
     * Align the bitstream to the next byte boundary.  This means
259
     * the next byte boundary as if we were reading a byte at a
260
     * time.  Therefore, we have to rewind 'in_next' by any bytes
261
     * that have been refilled but not actually consumed yet (not
262
     * counting overread bytes, which don't increment 'in_next').
263
     */
264
936
    bitsleft = (u8)bitsleft;
265
936
    SAFETY_CHECK(overread_count <= (bitsleft >> 3));
266
896
    in_next -= (bitsleft >> 3) - overread_count;
267
896
    overread_count = 0;
268
896
    bitbuf = 0;
269
896
    bitsleft = 0;
270
271
896
    SAFETY_CHECK(in_end - in_next >= 4);
272
851
    len = get_unaligned_le16(in_next);
273
851
    nlen = get_unaligned_le16(in_next + 2);
274
851
    in_next += 4;
275
276
851
    SAFETY_CHECK(len == (u16)~nlen);
277
754
    if (unlikely(len > out_end - out_next))
278
7
      return LIBDEFLATE_INSUFFICIENT_SPACE;
279
747
    SAFETY_CHECK(len <= in_end - in_next);
280
281
729
    memcpy(out_next, in_next, len);
282
729
    in_next += len;
283
729
    out_next += len;
284
285
729
    goto block_done;
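The stored-block header check a few lines above relies on NLEN being the ones' complement of LEN; a concrete case:

    /* A stored block holding 300 bytes:                                */
    /*   len  == 300    == 0x012C                                       */
    /*   nlen must equal (u16)~0x012C == 0xFED3, or SAFETY_CHECK fails  */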
286
287
4.15k
  } else {
288
4.15k
    unsigned i;
289
290
4.15k
    SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN);
291
292
    /*
293
     * Static Huffman block: build the decode tables for the static
294
     * codes.  Skip doing so if the tables are already set up from
295
     * an earlier static block; this speeds up decompression of
296
     * degenerate input of many empty or very short static blocks.
297
     *
298
     * Afterwards, the remainder is the same as decompressing a
299
     * dynamic Huffman block.
300
     */
301
302
4.12k
    bitbuf >>= 3; /* for BTYPE and BFINAL */
303
4.12k
    bitsleft -= 3;
304
305
4.12k
    if (d->static_codes_loaded)
306
582
      goto have_decode_tables;
307
308
3.54k
    d->static_codes_loaded = true;
309
310
3.54k
    STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288);
311
3.54k
    STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32);
312
313
513k
    for (i = 0; i < 144; i++)
314
510k
      d->u.l.lens[i] = 8;
315
400k
    for (; i < 256; i++)
316
396k
      d->u.l.lens[i] = 9;
317
88.5k
    for (; i < 280; i++)
318
85.0k
      d->u.l.lens[i] = 7;
319
31.8k
    for (; i < 288; i++)
320
28.3k
      d->u.l.lens[i] = 8;
321
322
116k
    for (; i < 288 + 32; i++)
323
113k
      d->u.l.lens[i] = 5;
324
325
3.54k
    num_litlen_syms = 288;
326
3.54k
    num_offset_syms = 32;
327
3.54k
  }
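The four litlen loops and the final offset loop above reproduce the fixed Huffman code of RFC 1951, section 3.2.6:

    litlen symbols   0-143   ->  8-bit codewords
    litlen symbols 144-255   ->  9-bit codewords
    litlen symbols 256-279   ->  7-bit codewords
    litlen symbols 280-287   ->  8-bit codewords
    offset symbols   0-31    ->  5-bit codewords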
328
329
  /* Decompressing a Huffman block (either dynamic or static) */
330
331
5.86k
  SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms));
332
5.84k
  SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms));
333
6.38k
have_decode_tables:
334
6.38k
  litlen_tablemask = BITMASK(d->litlen_tablebits);
335
336
  /*
337
   * This is the "fastloop" for decoding literals and matches.  It does
338
   * bounds checks on in_next and out_next in the loop conditions so that
339
   * additional bounds checks aren't needed inside the loop body.
340
   *
341
   * To reduce latency, the bitbuffer is refilled and the next litlen
342
   * decode table entry is preloaded before each loop iteration.
343
   */
344
6.38k
  if (in_next >= in_fastloop_end || out_next >= out_fastloop_end)
345
1.42k
    goto generic_loop;
346
4.96k
  REFILL_BITS_IN_FASTLOOP();
347
4.96k
  entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
348
171k
  do {
349
171k
    u32 length, offset, lit;
350
171k
    const u8 *src;
351
171k
    u8 *dst;
352
353
    /*
354
     * Consume the bits for the litlen decode table entry.  Save the
355
     * original bitbuf for later, in case the extra match length
356
     * bits need to be extracted from it.
357
     */
358
171k
    saved_bitbuf = bitbuf;
359
171k
    bitbuf >>= (u8)entry;
360
171k
    bitsleft -= entry; /* optimization: subtract full entry */
361
362
    /*
363
     * Begin by checking for a "fast" literal, i.e. a literal that
364
     * doesn't need a subtable.
365
     */
366
171k
    if (entry & HUFFDEC_LITERAL) {
367
      /*
368
       * On 64-bit platforms, we decode up to 2 extra fast
369
       * literals in addition to the primary item, as this
370
       * increases performance and still leaves enough bits
371
       * remaining for what follows.  We could actually do 3,
372
       * assuming LITLEN_TABLEBITS=11, but that actually
373
       * decreases performance slightly (perhaps by messing
374
       * with the branch prediction of the conditional refill
375
       * that happens later while decoding the match offset).
376
       *
377
       * Note: the definitions of FASTLOOP_MAX_BYTES_WRITTEN
378
       * and FASTLOOP_MAX_BYTES_READ need to be updated if the
379
       * number of extra literals decoded here is changed.
380
       */
381
108k
      if (/* enough bits for 2 fast literals + length + offset preload? */
382
108k
          CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS +
383
108k
               LENGTH_MAXBITS,
384
108k
               OFFSET_TABLEBITS) &&
385
          /* enough bits for 2 fast literals + slow literal + litlen preload? */
386
0
          CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS +
387
108k
               DEFLATE_MAX_LITLEN_CODEWORD_LEN,
388
108k
               LITLEN_TABLEBITS)) {
389
        /* 1st extra fast literal */
390
108k
        lit = entry >> 16;
391
108k
        entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
392
108k
        saved_bitbuf = bitbuf;
393
108k
        bitbuf >>= (u8)entry;
394
108k
        bitsleft -= entry;
395
108k
        *out_next++ = lit;
396
108k
        if (entry & HUFFDEC_LITERAL) {
397
          /* 2nd extra fast literal */
398
99.9k
          lit = entry >> 16;
399
99.9k
          entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
400
99.9k
          saved_bitbuf = bitbuf;
401
99.9k
          bitbuf >>= (u8)entry;
402
99.9k
          bitsleft -= entry;
403
99.9k
          *out_next++ = lit;
404
99.9k
          if (entry & HUFFDEC_LITERAL) {
405
            /*
406
             * Another fast literal, but
407
             * this one is in lieu of the
408
             * primary item, so it doesn't
409
             * count as one of the extras.
410
             */
411
95.2k
            lit = entry >> 16;
412
95.2k
            entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
413
95.2k
            REFILL_BITS_IN_FASTLOOP();
414
95.2k
            *out_next++ = lit;
415
95.2k
            continue;
416
95.2k
          }
417
99.9k
        }
418
108k
      } else {
419
        /*
420
         * Decode a literal.  While doing so, preload
421
         * the next litlen decode table entry and refill
422
         * the bitbuffer.  To reduce latency, we've
423
         * arranged for there to be enough "preloadable"
424
         * bits remaining to do the table preload
425
         * independently of the refill.
426
         */
427
0
        STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(
428
0
            LITLEN_TABLEBITS, LITLEN_TABLEBITS));
429
0
        lit = entry >> 16;
430
0
        entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
431
0
        REFILL_BITS_IN_FASTLOOP();
432
0
        *out_next++ = lit;
433
0
        continue;
434
0
      }
435
108k
    }
436
437
    /*
438
     * It's not a literal entry, so it can be a length entry, a
439
     * subtable pointer entry, or an end-of-block entry.  Detect the
440
     * two unlikely cases by testing the HUFFDEC_EXCEPTIONAL flag.
441
     */
442
75.9k
    if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
443
      /* Subtable pointer or end-of-block entry */
444
445
12.5k
      if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
446
3.46k
        goto block_done;
447
448
      /*
449
       * A subtable is required.  Load and consume the
450
       * subtable entry.  The subtable entry can be of any
451
       * type: literal, length, or end-of-block.
452
       */
453
9.09k
      entry = d->u.litlen_decode_table[(entry >> 16) +
454
9.09k
        EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
455
9.09k
      saved_bitbuf = bitbuf;
456
9.09k
      bitbuf >>= (u8)entry;
457
9.09k
      bitsleft -= entry;
458
459
      /*
460
       * 32-bit platforms that use the byte-at-a-time refill
461
       * method have to do a refill here for there to always
462
       * be enough bits to decode a literal that requires a
463
       * subtable, then preload the next litlen decode table
464
       * entry; or to decode a match length that requires a
465
       * subtable, then preload the offset decode table entry.
466
       */
467
9.09k
      if (!CAN_CONSUME_AND_THEN_PRELOAD(DEFLATE_MAX_LITLEN_CODEWORD_LEN,
468
9.09k
                LITLEN_TABLEBITS) ||
469
0
          !CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXBITS,
470
9.09k
                OFFSET_TABLEBITS))
471
0
        REFILL_BITS_IN_FASTLOOP();
472
9.09k
      if (entry & HUFFDEC_LITERAL) {
473
        /* Decode a literal that required a subtable. */
474
7.83k
        lit = entry >> 16;
475
7.83k
        entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
476
7.83k
        REFILL_BITS_IN_FASTLOOP();
477
7.83k
        *out_next++ = lit;
478
7.83k
        continue;
479
7.83k
      }
480
1.26k
      if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
481
1.02k
        goto block_done;
482
      /* Else, it's a length that required a subtable. */
483
1.26k
    }
484
485
    /*
486
     * Decode the match length: the length base value associated
487
     * with the litlen symbol (which we extract from the decode
488
     * table entry), plus the extra length bits.  We don't need to
489
     * consume the extra length bits here, as they were included in
490
     * the bits consumed by the entry earlier.  We also don't need
491
     * to check for too-long matches here, as this is inside the
492
     * fastloop where it's already been verified that the output
493
     * buffer has enough space remaining to copy a max-length match.
494
     */
495
63.6k
    length = entry >> 16;
496
63.6k
    length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);
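The two lines above show how a litlen decode-table entry is unpacked; reading the shifts and masks used throughout this file suggests the following packing (inferred here, not taken from the table-building code, so treat it as non-authoritative):

    /* Inferred decode-table entry layout:
     *   entry >> 16      : literal value, length/offset base, or subtable start
     *   (u8)entry        : total bits to consume (codeword length + extra bits)
     *   (u8)(entry >> 8) : codeword length alone, i.e. the shift that isolates
     *                      the extra bits within saved_bitbuf
     *   flag bits        : HUFFDEC_LITERAL / HUFFDEC_EXCEPTIONAL /
     *                      HUFFDEC_END_OF_BLOCK / HUFFDEC_SUBTABLE_POINTER
     */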
497
498
    /*
499
     * Decode the match offset.  There are enough "preloadable" bits
500
     * remaining to preload the offset decode table entry, but a
501
     * refill might be needed before consuming it.
502
     */
503
63.6k
    STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXFASTBITS,
504
63.6k
                 OFFSET_TABLEBITS));
505
63.6k
    entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)];
506
63.6k
    if (CAN_CONSUME_AND_THEN_PRELOAD(OFFSET_MAXBITS,
507
63.6k
             LITLEN_TABLEBITS)) {
508
      /*
509
       * Decoding a match offset on a 64-bit platform.  We may
510
       * need to refill once, but then we can decode the whole
511
       * offset and preload the next litlen table entry.
512
       */
513
63.6k
      if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
514
        /* Offset codeword requires a subtable */
515
0
        if (unlikely((u8)bitsleft < OFFSET_MAXBITS +
516
0
               LITLEN_TABLEBITS - PRELOAD_SLACK))
517
0
          REFILL_BITS_IN_FASTLOOP();
518
0
        bitbuf >>= OFFSET_TABLEBITS;
519
0
        bitsleft -= OFFSET_TABLEBITS;
520
0
        entry = d->offset_decode_table[(entry >> 16) +
521
0
          EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
522
63.6k
      } else if (unlikely((u8)bitsleft < OFFSET_MAXFASTBITS +
523
63.6k
              LITLEN_TABLEBITS - PRELOAD_SLACK))
524
321
        REFILL_BITS_IN_FASTLOOP();
525
63.6k
    } else {
526
      /* Decoding a match offset on a 32-bit platform */
527
0
      REFILL_BITS_IN_FASTLOOP();
528
0
      if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
529
        /* Offset codeword requires a subtable */
530
0
        bitbuf >>= OFFSET_TABLEBITS;
531
0
        bitsleft -= OFFSET_TABLEBITS;
532
0
        entry = d->offset_decode_table[(entry >> 16) +
533
0
          EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
534
0
        REFILL_BITS_IN_FASTLOOP();
535
        /* No further refill needed before extra bits */
536
0
        STATIC_ASSERT(CAN_CONSUME(
537
0
          OFFSET_MAXBITS - OFFSET_TABLEBITS));
538
0
      } else {
539
        /* No refill needed before extra bits */
540
0
        STATIC_ASSERT(CAN_CONSUME(OFFSET_MAXFASTBITS));
541
0
      }
542
0
    }
543
63.6k
    saved_bitbuf = bitbuf;
544
63.6k
    bitbuf >>= (u8)entry;
545
63.6k
    bitsleft -= entry; /* optimization: subtract full entry */
546
63.6k
    offset = entry >> 16;
547
63.6k
    offset += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);
548
549
    /* Validate the match offset; needed even in the fastloop. */
550
63.6k
    SAFETY_CHECK(offset <= out_next - (const u8 *)out);
551
63.6k
    src = out_next - offset;
552
63.6k
    dst = out_next;
553
63.6k
    out_next += length;
554
555
    /*
556
     * Before starting to issue the instructions to copy the match,
557
     * refill the bitbuffer and preload the litlen decode table
558
     * entry for the next loop iteration.  This can increase
559
     * performance by allowing the latency of the match copy to
560
     * overlap with these other operations.  To further reduce
561
     * latency, we've arranged for there to be enough bits remaining
562
     * to do the table preload independently of the refill, except
563
     * on 32-bit platforms using the byte-at-a-time refill method.
564
     */
565
63.6k
    if (!CAN_CONSUME_AND_THEN_PRELOAD(
566
63.6k
      MAX(OFFSET_MAXBITS - OFFSET_TABLEBITS,
567
63.6k
          OFFSET_MAXFASTBITS),
568
63.6k
      LITLEN_TABLEBITS) &&
569
0
        unlikely((u8)bitsleft < LITLEN_TABLEBITS - PRELOAD_SLACK))
570
0
      REFILL_BITS_IN_FASTLOOP();
571
63.6k
    entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
572
63.6k
    REFILL_BITS_IN_FASTLOOP();
573
574
    /*
575
     * Copy the match.  On most CPUs the fastest method is a
576
     * word-at-a-time copy, unconditionally copying about 5 words
577
     * since this is enough for most matches without being too much.
578
     *
579
     * The normal word-at-a-time copy works for offset >= WORDBYTES,
580
     * which is most cases.  The case of offset == 1 is also common
581
     * and is worth optimizing for, since it is just RLE encoding of
582
     * the previous byte, which is the result of compressing long
583
     * runs of the same byte.
584
     *
585
     * Writing past the match 'length' is allowed here, since it's
586
     * been ensured there is enough output space left for a slight
587
     * overrun.  FASTLOOP_MAX_BYTES_WRITTEN needs to be updated if
588
     * the maximum possible overrun here is changed.
589
     */
590
63.6k
    if (UNALIGNED_ACCESS_IS_FAST && offset >= WORDBYTES) {
591
5.38k
      store_word_unaligned(load_word_unaligned(src), dst);
592
5.38k
      src += WORDBYTES;
593
5.38k
      dst += WORDBYTES;
594
5.38k
      store_word_unaligned(load_word_unaligned(src), dst);
595
5.38k
      src += WORDBYTES;
596
5.38k
      dst += WORDBYTES;
597
5.38k
      store_word_unaligned(load_word_unaligned(src), dst);
598
5.38k
      src += WORDBYTES;
599
5.38k
      dst += WORDBYTES;
600
5.38k
      store_word_unaligned(load_word_unaligned(src), dst);
601
5.38k
      src += WORDBYTES;
602
5.38k
      dst += WORDBYTES;
603
5.38k
      store_word_unaligned(load_word_unaligned(src), dst);
604
5.38k
      src += WORDBYTES;
605
5.38k
      dst += WORDBYTES;
606
21.5k
      while (dst < out_next) {
607
16.1k
        store_word_unaligned(load_word_unaligned(src), dst);
608
16.1k
        src += WORDBYTES;
609
16.1k
        dst += WORDBYTES;
610
16.1k
        store_word_unaligned(load_word_unaligned(src), dst);
611
16.1k
        src += WORDBYTES;
612
16.1k
        dst += WORDBYTES;
613
16.1k
        store_word_unaligned(load_word_unaligned(src), dst);
614
16.1k
        src += WORDBYTES;
615
16.1k
        dst += WORDBYTES;
616
16.1k
        store_word_unaligned(load_word_unaligned(src), dst);
617
16.1k
        src += WORDBYTES;
618
16.1k
        dst += WORDBYTES;
619
16.1k
        store_word_unaligned(load_word_unaligned(src), dst);
620
16.1k
        src += WORDBYTES;
621
16.1k
        dst += WORDBYTES;
622
16.1k
      }
623
58.2k
    } else if (UNALIGNED_ACCESS_IS_FAST && offset == 1) {
624
53.5k
      machine_word_t v;
625
626
      /*
627
       * This part tends to get auto-vectorized, so keep it
628
       * copying a multiple of 16 bytes at a time.
629
       */
630
53.5k
      v = (machine_word_t)0x0101010101010101 * src[0];
631
53.5k
      store_word_unaligned(v, dst);
632
53.5k
      dst += WORDBYTES;
633
53.5k
      store_word_unaligned(v, dst);
634
53.5k
      dst += WORDBYTES;
635
53.5k
      store_word_unaligned(v, dst);
636
53.5k
      dst += WORDBYTES;
637
53.5k
      store_word_unaligned(v, dst);
638
53.5k
      dst += WORDBYTES;
639
280k
      while (dst < out_next) {
640
227k
        store_word_unaligned(v, dst);
641
227k
        dst += WORDBYTES;
642
227k
        store_word_unaligned(v, dst);
643
227k
        dst += WORDBYTES;
644
227k
        store_word_unaligned(v, dst);
645
227k
        dst += WORDBYTES;
646
227k
        store_word_unaligned(v, dst);
647
227k
        dst += WORDBYTES;
648
227k
      }
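The multiply by 0x0101010101010101 in the offset == 1 branch above splats one byte across a machine word, so each store writes a whole word of that byte; for example:

    /* On a 64-bit machine_word_t, if the byte being repeated is 0x2A:  */
    /*   v = 0x0101010101010101 * 0x2A = 0x2A2A2A2A2A2A2A2A            */
    /* so every word store emits eight copies of that byte.            */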
649
53.5k
    } else if (UNALIGNED_ACCESS_IS_FAST) {
650
4.70k
      store_word_unaligned(load_word_unaligned(src), dst);
651
4.70k
      src += offset;
652
4.70k
      dst += offset;
653
4.70k
      store_word_unaligned(load_word_unaligned(src), dst);
654
4.70k
      src += offset;
655
4.70k
      dst += offset;
656
76.6k
      do {
657
76.6k
        store_word_unaligned(load_word_unaligned(src), dst);
658
76.6k
        src += offset;
659
76.6k
        dst += offset;
660
76.6k
        store_word_unaligned(load_word_unaligned(src), dst);
661
76.6k
        src += offset;
662
76.6k
        dst += offset;
663
76.6k
      } while (dst < out_next);
664
4.70k
    } else {
665
0
      *dst++ = *src++;
666
0
      *dst++ = *src++;
667
0
      do {
668
0
        *dst++ = *src++;
669
0
      } while (dst < out_next);
670
0
    }
671
166k
  } while (in_next < in_fastloop_end && out_next < out_fastloop_end);
672
673
  /*
674
   * This is the generic loop for decoding literals and matches.  This
675
   * handles cases where in_next and out_next are close to the end of
676
   * their respective buffers.  Usually this loop isn't performance-
677
   * critical, as most time is spent in the fastloop above instead.  We
678
   * therefore omit some optimizations here in favor of smaller code.
679
   */
680
1.87k
generic_loop:
681
26.5k
  for (;;) {
682
26.5k
    u32 length, offset;
683
26.5k
    const u8 *src;
684
26.5k
    u8 *dst;
685
686
26.5k
    REFILL_BITS();
687
26.3k
    entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
688
26.3k
    saved_bitbuf = bitbuf;
689
26.3k
    bitbuf >>= (u8)entry;
690
26.3k
    bitsleft -= entry;
691
26.3k
    if (unlikely(entry & HUFFDEC_SUBTABLE_POINTER)) {
692
438
      entry = d->u.litlen_decode_table[(entry >> 16) +
693
438
          EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
694
438
      saved_bitbuf = bitbuf;
695
438
      bitbuf >>= (u8)entry;
696
438
      bitsleft -= entry;
697
438
    }
698
26.3k
    length = entry >> 16;
699
26.3k
    if (entry & HUFFDEC_LITERAL) {
700
17.1k
      if (unlikely(out_next == out_end))
701
12
        return LIBDEFLATE_INSUFFICIENT_SPACE;
702
17.1k
      *out_next++ = length;
703
17.1k
      continue;
704
17.1k
    }
705
9.25k
    if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
706
1.57k
      goto block_done;
707
7.68k
    length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);
708
7.68k
    if (unlikely(length > out_end - out_next))
709
24
      return LIBDEFLATE_INSUFFICIENT_SPACE;
710
711
7.66k
    if (!CAN_CONSUME(LENGTH_MAXBITS + OFFSET_MAXBITS))
712
0
      REFILL_BITS();
713
7.66k
    entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)];
714
7.66k
    if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
715
0
      bitbuf >>= OFFSET_TABLEBITS;
716
0
      bitsleft -= OFFSET_TABLEBITS;
717
0
      entry = d->offset_decode_table[(entry >> 16) +
718
0
          EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
719
0
      if (!CAN_CONSUME(OFFSET_MAXBITS))
720
0
        REFILL_BITS();
721
0
    }
722
7.66k
    offset = entry >> 16;
723
7.66k
    offset += EXTRACT_VARBITS8(bitbuf, entry) >> (u8)(entry >> 8);
724
7.66k
    bitbuf >>= (u8)entry;
725
7.66k
    bitsleft -= entry;
726
727
7.66k
    SAFETY_CHECK(offset <= out_next - (const u8 *)out);
728
7.58k
    src = out_next - offset;
729
7.58k
    dst = out_next;
730
7.58k
    out_next += length;
731
732
7.58k
    STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3);
733
7.58k
    *dst++ = *src++;
734
7.58k
    *dst++ = *src++;
735
898k
    do {
736
898k
      *dst++ = *src++;
737
898k
    } while (dst < out_next);
738
7.58k
  }
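The byte-at-a-time copy that closes the loop above is deliberately simple: DEFLATE_MIN_MATCH_LEN == 3 (asserted a few lines up) guarantees at least three bytes, covered by the two unconditional copies plus one do-while pass, and copying single bytes makes overlapping matches come out right. For example:

    /* offset = 1, length = 5, byte just before the match is 'A':
     * each *dst++ = *src++ reads the byte written immediately before it,
     * so the output gains  A A A A A  (run-length behaviour).          */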
739
740
6.79k
block_done:
741
  /* Finished decoding a block */
742
743
6.79k
  if (!is_final_block)
744
5.77k
    goto next_block;
745
746
  /* That was the last block. */
747
748
1.02k
  bitsleft = (u8)bitsleft;
749
750
  /*
751
   * If any of the implicit appended zero bytes were consumed (not just
752
   * refilled) before hitting end of stream, then the data is bad.
753
   */
754
1.02k
  SAFETY_CHECK(overread_count <= (bitsleft >> 3));
755
756
  /* Optionally return the actual number of bytes consumed. */
757
998
  if (actual_in_nbytes_ret) {
758
    /* Don't count bytes that were refilled but not consumed. */
759
998
    in_next -= (bitsleft >> 3) - overread_count;
760
761
998
    *actual_in_nbytes_ret = in_next - (u8 *)in;
762
998
  }
763
764
  /* Optionally return the actual number of bytes written. */
765
998
  if (actual_out_nbytes_ret) {
766
996
    *actual_out_nbytes_ret = out_next - (u8 *)out;
767
996
  } else {
768
2
    if (out_next != out_end)
769
1
      return LIBDEFLATE_SHORT_OUTPUT;
770
2
  }
771
997
  return LIBDEFLATE_SUCCESS;
772
998
}
compression.c:deflate_decompress_bmi2
(Line-by-line counts for this instantiation are identical to the combined listing above.)
Unexecuted instantiation: compression.c:deflate_decompress_default
773
774
#undef FUNCNAME
775
#undef ATTRIBUTES
776
#undef EXTRACT_VARBITS
777
#undef EXTRACT_VARBITS8