Coverage Report

Created: 2025-10-10 06:38

/src/openexr/external/deflate/lib/decompress_template.h
Line
Count
Source
1
/*
2
 * decompress_template.h
3
 *
4
 * Copyright 2016 Eric Biggers
5
 *
6
 * Permission is hereby granted, free of charge, to any person
7
 * obtaining a copy of this software and associated documentation
8
 * files (the "Software"), to deal in the Software without
9
 * restriction, including without limitation the rights to use,
10
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
11
 * copies of the Software, and to permit persons to whom the
12
 * Software is furnished to do so, subject to the following
13
 * conditions:
14
 *
15
 * The above copyright notice and this permission notice shall be
16
 * included in all copies or substantial portions of the Software.
17
 *
18
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
20
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25
 * OTHER DEALINGS IN THE SOFTWARE.
26
 */
27
28
/*
29
 * This is the actual DEFLATE decompression routine, lifted out of
30
 * deflate_decompress.c so that it can be compiled multiple times with different
31
 * target instruction sets.
32
 */
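For context, a minimal sketch of how an including source file can stamp out the two instantiations named in this report (the target attribute shown is an assumption for illustration; the real includer may also override EXTRACT_VARBITS/EXTRACT_VARBITS8, e.g. with a BMI2 intrinsic such as _bzhi_u64(), before including):

    /* default instantiation */
    #define FUNCNAME deflate_decompress_default
    #include "decompress_template.h"

    /* BMI2 instantiation (hypothetical ATTRIBUTES value) */
    #define FUNCNAME   deflate_decompress_bmi2
    #define ATTRIBUTES __attribute__((target("bmi2")))
    #include "decompress_template.h"

The template #undefs FUNCNAME, ATTRIBUTES, and the EXTRACT_VARBITS* macros at its end (lines 774-777), so it can be re-included with fresh definitions.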
33
34
#ifndef ATTRIBUTES
35
#  define ATTRIBUTES
36
#endif
37
#ifndef EXTRACT_VARBITS
38
9.53k
#  define EXTRACT_VARBITS(word, count)  ((word) & BITMASK(count))
39
#endif
40
#ifndef EXTRACT_VARBITS8
41
142k
#  define EXTRACT_VARBITS8(word, count) ((word) & BITMASK((u8)(count)))
42
#endif
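These fallback definitions keep only the low 'count' bits of 'word'. The static asserts later in this file (e.g. '3 + BITMASK(2) == 6' and '11 + BITMASK(7) == 138') pin down what BITMASK must evaluate to, so a minimal mental model is:

    /* Behaviour implied by the static asserts; the real BITMASK is defined
     * by the including code and may be spelled differently. */
    #define BITMASK(n)  (((bitbuf_t)1 << (n)) - 1)

    /* so EXTRACT_VARBITS(word, 3) == (word & 0x7), the low three bits */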
43
44
static ATTRIBUTES MAYBE_UNUSED enum libdeflate_result
45
FUNCNAME(struct libdeflate_decompressor * restrict d,
46
   const void * restrict in, size_t in_nbytes,
47
   void * restrict out, size_t out_nbytes_avail,
48
   size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret)
49
1.87k
{
50
1.87k
  u8 *out_next = out;
51
1.87k
  u8 * const out_end = out_next + out_nbytes_avail;
52
1.87k
  u8 * const out_fastloop_end =
53
1.87k
    out_end - MIN(out_nbytes_avail, FASTLOOP_MAX_BYTES_WRITTEN);
54
55
  /* Input bitstream state; see deflate_decompress.c for documentation */
56
1.87k
  const u8 *in_next = in;
57
1.87k
  const u8 * const in_end = in_next + in_nbytes;
58
1.87k
  const u8 * const in_fastloop_end =
59
1.87k
    in_end - MIN(in_nbytes, FASTLOOP_MAX_BYTES_READ);
60
1.87k
  bitbuf_t bitbuf = 0;
61
1.87k
  bitbuf_t saved_bitbuf;
62
1.87k
  u32 bitsleft = 0;
63
1.87k
  size_t overread_count = 0;
64
65
1.87k
  bool is_final_block;
66
1.87k
  unsigned block_type;
67
1.87k
  unsigned num_litlen_syms;
68
1.87k
  unsigned num_offset_syms;
69
1.87k
  bitbuf_t litlen_tablemask;
70
1.87k
  u32 entry;
71
72
7.64k
next_block:
73
  /* Starting to read the next block */
74
7.64k
  ;
75
76
7.64k
  STATIC_ASSERT(CAN_CONSUME(1 + 2 + 5 + 5 + 4 + 3));
77
7.64k
  REFILL_BITS();
78
79
  /* BFINAL: 1 bit */
80
7.60k
  is_final_block = bitbuf & BITMASK(1);
81
82
  /* BTYPE: 2 bits */
83
7.60k
  block_type = (bitbuf >> 1) & BITMASK(2);
84
85
7.60k
  if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) {
86
87
    /* Dynamic Huffman block */
88
89
    /* The order in which precode lengths are stored */
90
2.51k
    static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
91
2.51k
      16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
92
2.51k
    };
93
94
2.51k
    unsigned num_explicit_precode_lens;
95
2.51k
    unsigned i;
96
97
    /* Read the codeword length counts. */
98
99
2.51k
    STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 257 + BITMASK(5));
100
2.51k
    num_litlen_syms = 257 + ((bitbuf >> 3) & BITMASK(5));
101
102
2.51k
    STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 1 + BITMASK(5));
103
2.51k
    num_offset_syms = 1 + ((bitbuf >> 8) & BITMASK(5));
104
105
2.51k
    STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == 4 + BITMASK(4));
106
2.51k
    num_explicit_precode_lens = 4 + ((bitbuf >> 13) & BITMASK(4));
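The three shift amounts used above (3, 8 and 13) follow the fixed layout of a dynamic-block header in RFC 1951; read in isolation, the same fields look like this (variable names are illustrative only):

    unsigned bfinal = bitbuf & 1;            /* bit 0:      last-block flag         */
    unsigned btype  = (bitbuf >> 1) & 0x3;   /* bits 1-2:   block type              */
    unsigned hlit   = (bitbuf >> 3) & 0x1f;  /* bits 3-7:   litlen codes  minus 257 */
    unsigned hdist  = (bitbuf >> 8) & 0x1f;  /* bits 8-12:  offset codes  minus 1   */
    unsigned hclen  = (bitbuf >> 13) & 0xf;  /* bits 13-16: precode lens  minus 4   */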
107
108
2.51k
    d->static_codes_loaded = false;
109
110
    /*
111
     * Read the precode codeword lengths.
112
     *
113
     * A 64-bit bitbuffer is just one bit too small to hold the
114
     * maximum number of precode lens, so to minimize branches we
115
     * merge one len with the previous fields.
116
     */
117
2.51k
    STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1);
118
2.51k
    if (CAN_CONSUME(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) {
119
2.51k
      d->u.precode_lens[deflate_precode_lens_permutation[0]] =
120
2.51k
        (bitbuf >> 17) & BITMASK(3);
121
2.51k
      bitbuf >>= 20;
122
2.51k
      bitsleft -= 20;
123
2.51k
      REFILL_BITS();
124
2.50k
      i = 1;
125
37.0k
      do {
126
37.0k
        d->u.precode_lens[deflate_precode_lens_permutation[i]] =
127
37.0k
          bitbuf & BITMASK(3);
128
37.0k
        bitbuf >>= 3;
129
37.0k
        bitsleft -= 3;
130
37.0k
      } while (++i < num_explicit_precode_lens);
131
2.50k
    } else {
132
0
      bitbuf >>= 17;
133
0
      bitsleft -= 17;
134
0
      i = 0;
135
0
      do {
136
0
        if ((u8)bitsleft < 3)
137
0
          REFILL_BITS();
138
0
        d->u.precode_lens[deflate_precode_lens_permutation[i]] =
139
0
          bitbuf & BITMASK(3);
140
0
        bitbuf >>= 3;
141
0
        bitsleft -= 3;
142
0
      } while (++i < num_explicit_precode_lens);
143
0
    }
144
10.6k
    for (; i < DEFLATE_NUM_PRECODE_SYMS; i++)
145
8.12k
      d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0;
146
147
    /* Build the decode table for the precode. */
148
2.50k
    SAFETY_CHECK(build_precode_decode_table(d));
149
150
    /* Decode the litlen and offset codeword lengths. */
151
2.44k
    i = 0;
152
172k
    do {
153
172k
      unsigned presym;
154
172k
      u8 rep_val;
155
172k
      unsigned rep_count;
156
157
172k
      if ((u8)bitsleft < DEFLATE_MAX_PRE_CODEWORD_LEN + 7)
158
12.8k
        REFILL_BITS();
159
160
      /*
161
       * The code below assumes that the precode decode table
162
       * doesn't have any subtables.
163
       */
164
172k
      STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN);
165
166
      /* Decode the next precode symbol. */
167
172k
      entry = d->u.l.precode_decode_table[
168
172k
        bitbuf & BITMASK(DEFLATE_MAX_PRE_CODEWORD_LEN)];
169
172k
      bitbuf >>= (u8)entry;
170
172k
      bitsleft -= entry; /* optimization: subtract full entry */
171
172k
      presym = entry >> 16;
172
173
172k
      if (presym < 16) {
174
        /* Explicit codeword length */
175
127k
        d->u.l.lens[i++] = presym;
176
127k
        continue;
177
127k
      }
178
179
      /* Run-length encoded codeword lengths */
180
181
      /*
182
       * Note: we don't need to immediately verify that the
183
       * repeat count doesn't overflow the number of elements,
184
       * since we've sized the lens array to have enough extra
185
       * space to allow for the worst-case overrun (138 zeroes
186
       * when only 1 length was remaining).
187
       *
188
       * In the case of the small repeat counts (presyms 16
189
       * and 17), it is fastest to always write the maximum
190
       * number of entries.  That gets rid of branches that
191
       * would otherwise be required.
192
       *
193
       * It is not just because of the numerical order that
194
       * our checks go in the order 'presym < 16', 'presym ==
195
       * 16', and 'presym == 17'.  For typical data this is
196
       * ordered from most frequent to least frequent case.
197
       */
198
44.5k
      STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1);
199
200
44.5k
      if (presym == 16) {
201
        /* Repeat the previous length 3 - 6 times. */
202
30.4k
        SAFETY_CHECK(i != 0);
203
30.4k
        rep_val = d->u.l.lens[i - 1];
204
30.4k
        STATIC_ASSERT(3 + BITMASK(2) == 6);
205
30.4k
        rep_count = 3 + (bitbuf & BITMASK(2));
206
30.4k
        bitbuf >>= 2;
207
30.4k
        bitsleft -= 2;
208
30.4k
        d->u.l.lens[i + 0] = rep_val;
209
30.4k
        d->u.l.lens[i + 1] = rep_val;
210
30.4k
        d->u.l.lens[i + 2] = rep_val;
211
30.4k
        d->u.l.lens[i + 3] = rep_val;
212
30.4k
        d->u.l.lens[i + 4] = rep_val;
213
30.4k
        d->u.l.lens[i + 5] = rep_val;
214
30.4k
        i += rep_count;
215
30.4k
      } else if (presym == 17) {
216
        /* Repeat zero 3 - 10 times. */
217
8.73k
        STATIC_ASSERT(3 + BITMASK(3) == 10);
218
8.73k
        rep_count = 3 + (bitbuf & BITMASK(3));
219
8.73k
        bitbuf >>= 3;
220
8.73k
        bitsleft -= 3;
221
8.73k
        d->u.l.lens[i + 0] = 0;
222
8.73k
        d->u.l.lens[i + 1] = 0;
223
8.73k
        d->u.l.lens[i + 2] = 0;
224
8.73k
        d->u.l.lens[i + 3] = 0;
225
8.73k
        d->u.l.lens[i + 4] = 0;
226
8.73k
        d->u.l.lens[i + 5] = 0;
227
8.73k
        d->u.l.lens[i + 6] = 0;
228
8.73k
        d->u.l.lens[i + 7] = 0;
229
8.73k
        d->u.l.lens[i + 8] = 0;
230
8.73k
        d->u.l.lens[i + 9] = 0;
231
8.73k
        i += rep_count;
232
8.73k
      } else {
233
        /* Repeat zero 11 - 138 times. */
234
5.37k
        STATIC_ASSERT(11 + BITMASK(7) == 138);
235
5.37k
        rep_count = 11 + (bitbuf & BITMASK(7));
236
5.37k
        bitbuf >>= 7;
237
5.37k
        bitsleft -= 7;
238
5.37k
        memset(&d->u.l.lens[i], 0,
239
5.37k
               rep_count * sizeof(d->u.l.lens[i]));
240
5.37k
        i += rep_count;
241
5.37k
      }
242
172k
    } while (i < num_litlen_syms + num_offset_syms);
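As the three branches above implement it, any presym >= 16 expands into a run of lengths rather than a single length. A made-up fragment, with the extra bits read from the low end of bitbuf:

    /* lens so far end in ... 5, 7, 7   (i points just past the last 7)      */
    /* presym 16, extra bits 10b      (= 2): append 3 + 2 = 5 more copies of 7 */
    /* presym 17, extra bits 001b     (= 1): append 3 + 1 = 4 zeros            */
    /* presym 18, extra bits 0000101b (= 5): append 11 + 5 = 16 zeros          */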
243
244
    /* Unnecessary, but check this for consistency with zlib. */
245
2.35k
    SAFETY_CHECK(i == num_litlen_syms + num_offset_syms);
246
247
5.08k
  } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
248
936
    u16 len, nlen;
249
250
    /*
251
     * Uncompressed block: copy 'len' bytes literally from the input
252
     * buffer to the output buffer.
253
     */
254
255
936
    bitsleft -= 3; /* for BTYPE and BFINAL */
256
257
    /*
258
     * Align the bitstream to the next byte boundary.  This means
259
     * the next byte boundary as if we were reading a byte at a
260
     * time.  Therefore, we have to rewind 'in_next' by any bytes
261
     * that have been refilled but not actually consumed yet (not
262
     * counting overread bytes, which don't increment 'in_next').
263
     */
264
936
    bitsleft = (u8)bitsleft;
265
936
    SAFETY_CHECK(overread_count <= (bitsleft >> 3));
266
896
    in_next -= (bitsleft >> 3) - overread_count;
267
896
    overread_count = 0;
268
896
    bitbuf = 0;
269
896
    bitsleft = 0;
270
271
896
    SAFETY_CHECK(in_end - in_next >= 4);
272
851
    len = get_unaligned_le16(in_next);
273
851
    nlen = get_unaligned_le16(in_next + 2);
274
851
    in_next += 4;
275
276
851
    SAFETY_CHECK(len == (u16)~nlen);
277
754
    if (unlikely(len > out_end - out_next))
278
7
      return LIBDEFLATE_INSUFFICIENT_SPACE;
279
747
    SAFETY_CHECK(len <= in_end - in_next);
280
281
729
    memcpy(out_next, in_next, len);
282
729
    in_next += len;
283
729
    out_next += len;
284
285
729
    goto block_done;
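The stored-block header check a few lines above relies on NLEN being the ones' complement of LEN; a concrete case:

    /* A stored block holding 300 bytes:                                */
    /*   len  == 300    == 0x012C                                       */
    /*   nlen must equal (u16)~0x012C == 0xFED3, or SAFETY_CHECK fails  */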
286
287
4.15k
  } else {
288
4.15k
    unsigned i;
289
290
4.15k
    SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN);
291
292
    /*
293
     * Static Huffman block: build the decode tables for the static
294
     * codes.  Skip doing so if the tables are already set up from
295
     * an earlier static block; this speeds up decompression of
296
     * degenerate input of many empty or very short static blocks.
297
     *
298
     * Afterwards, the remainder is the same as decompressing a
299
     * dynamic Huffman block.
300
     */
301
302
4.12k
    bitbuf >>= 3; /* for BTYPE and BFINAL */
303
4.12k
    bitsleft -= 3;
304
305
4.12k
    if (d->static_codes_loaded)
306
582
      goto have_decode_tables;
307
308
3.54k
    d->static_codes_loaded = true;
309
310
3.54k
    STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288);
311
3.54k
    STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32);
312
313
513k
    for (i = 0; i < 144; i++)
314
510k
      d->u.l.lens[i] = 8;
315
400k
    for (; i < 256; i++)
316
396k
      d->u.l.lens[i] = 9;
317
88.5k
    for (; i < 280; i++)
318
85.0k
      d->u.l.lens[i] = 7;
319
31.8k
    for (; i < 288; i++)
320
28.3k
      d->u.l.lens[i] = 8;
321
322
116k
    for (; i < 288 + 32; i++)
323
113k
      d->u.l.lens[i] = 5;
324
325
3.54k
    num_litlen_syms = 288;
326
3.54k
    num_offset_syms = 32;
327
3.54k
  }
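The four litlen loops and the final offset loop above reproduce the fixed Huffman code of RFC 1951, section 3.2.6:

    litlen symbols   0-143   ->  8-bit codewords
    litlen symbols 144-255   ->  9-bit codewords
    litlen symbols 256-279   ->  7-bit codewords
    litlen symbols 280-287   ->  8-bit codewords
    offset symbols   0-31    ->  5-bit codewords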
328
329
  /* Decompressing a Huffman block (either dynamic or static) */
330
331
5.86k
  SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms));
332
5.84k
  SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms));
333
6.38k
have_decode_tables:
334
6.38k
  litlen_tablemask = BITMASK(d->litlen_tablebits);
335
336
  /*
337
   * This is the "fastloop" for decoding literals and matches.  It does
338
   * bounds checks on in_next and out_next in the loop conditions so that
339
   * additional bounds checks aren't needed inside the loop body.
340
   *
341
   * To reduce latency, the bitbuffer is refilled and the next litlen
342
   * decode table entry is preloaded before each loop iteration.
343
   */
344
6.38k
  if (in_next >= in_fastloop_end || out_next >= out_fastloop_end)
345
1.42k
    goto generic_loop;
346
4.96k
  REFILL_BITS_IN_FASTLOOP();
347
4.96k
  entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
348
171k
  do {
349
171k
    u32 length, offset, lit;
350
171k
    const u8 *src;
351
171k
    u8 *dst;
352
353
    /*
354
     * Consume the bits for the litlen decode table entry.  Save the
355
     * original bitbuf for later, in case the extra match length
356
     * bits need to be extracted from it.
357
     */
358
171k
    saved_bitbuf = bitbuf;
359
171k
    bitbuf >>= (u8)entry;
360
171k
    bitsleft -= entry; /* optimization: subtract full entry */
361
362
    /*
363
     * Begin by checking for a "fast" literal, i.e. a literal that
364
     * doesn't need a subtable.
365
     */
366
171k
    if (entry & HUFFDEC_LITERAL) {
367
      /*
368
       * On 64-bit platforms, we decode up to 2 extra fast
369
       * literals in addition to the primary item, as this
370
       * increases performance and still leaves enough bits
371
       * remaining for what follows.  We could actually do 3,
372
       * assuming LITLEN_TABLEBITS=11, but that actually
373
       * decreases performance slightly (perhaps by messing
374
       * with the branch prediction of the conditional refill
375
       * that happens later while decoding the match offset).
376
       *
377
       * Note: the definitions of FASTLOOP_MAX_BYTES_WRITTEN
378
       * and FASTLOOP_MAX_BYTES_READ need to be updated if the
379
       * number of extra literals decoded here is changed.
380
       */
381
108k
      if (/* enough bits for 2 fast literals + length + offset preload? */
382
108k
          CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS +
383
108k
               LENGTH_MAXBITS,
384
108k
               OFFSET_TABLEBITS) &&
385
          /* enough bits for 2 fast literals + slow literal + litlen preload? */
386
0
          CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS +
387
108k
               DEFLATE_MAX_LITLEN_CODEWORD_LEN,
388
108k
               LITLEN_TABLEBITS)) {
389
        /* 1st extra fast literal */
390
108k
        lit = entry >> 16;
391
108k
        entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
392
108k
        saved_bitbuf = bitbuf;
393
108k
        bitbuf >>= (u8)entry;
394
108k
        bitsleft -= entry;
395
108k
        *out_next++ = lit;
396
108k
        if (entry & HUFFDEC_LITERAL) {
397
          /* 2nd extra fast literal */
398
99.9k
          lit = entry >> 16;
399
99.9k
          entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
400
99.9k
          saved_bitbuf = bitbuf;
401
99.9k
          bitbuf >>= (u8)entry;
402
99.9k
          bitsleft -= entry;
403
99.9k
          *out_next++ = lit;
404
99.9k
          if (entry & HUFFDEC_LITERAL) {
405
            /*
406
             * Another fast literal, but
407
             * this one is in lieu of the
408
             * primary item, so it doesn't
409
             * count as one of the extras.
410
             */
411
95.2k
            lit = entry >> 16;
412
95.2k
            entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
413
95.2k
            REFILL_BITS_IN_FASTLOOP();
414
95.2k
            *out_next++ = lit;
415
95.2k
            continue;
416
95.2k
          }
417
99.9k
        }
418
108k
      } else {
419
        /*
420
         * Decode a literal.  While doing so, preload
421
         * the next litlen decode table entry and refill
422
         * the bitbuffer.  To reduce latency, we've
423
         * arranged for there to be enough "preloadable"
424
         * bits remaining to do the table preload
425
         * independently of the refill.
426
         */
427
0
        STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(
428
0
            LITLEN_TABLEBITS, LITLEN_TABLEBITS));
429
0
        lit = entry >> 16;
430
0
        entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
431
0
        REFILL_BITS_IN_FASTLOOP();
432
0
        *out_next++ = lit;
433
0
        continue;
434
0
      }
435
108k
    }
436
437
    /*
438
     * It's not a literal entry, so it can be a length entry, a
439
     * subtable pointer entry, or an end-of-block entry.  Detect the
440
     * two unlikely cases by testing the HUFFDEC_EXCEPTIONAL flag.
441
     */
442
75.9k
    if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
443
      /* Subtable pointer or end-of-block entry */
444
445
12.5k
      if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
446
3.46k
        goto block_done;
447
448
      /*
449
       * A subtable is required.  Load and consume the
450
       * subtable entry.  The subtable entry can be of any
451
       * type: literal, length, or end-of-block.
452
       */
453
9.09k
      entry = d->u.litlen_decode_table[(entry >> 16) +
454
9.09k
        EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
455
9.09k
      saved_bitbuf = bitbuf;
456
9.09k
      bitbuf >>= (u8)entry;
457
9.09k
      bitsleft -= entry;
458
459
      /*
460
       * 32-bit platforms that use the byte-at-a-time refill
461
       * method have to do a refill here for there to always
462
       * be enough bits to decode a literal that requires a
463
       * subtable, then preload the next litlen decode table
464
       * entry; or to decode a match length that requires a
465
       * subtable, then preload the offset decode table entry.
466
       */
467
9.09k
      if (!CAN_CONSUME_AND_THEN_PRELOAD(DEFLATE_MAX_LITLEN_CODEWORD_LEN,
468
9.09k
                LITLEN_TABLEBITS) ||
469
0
          !CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXBITS,
470
9.09k
                OFFSET_TABLEBITS))
471
0
        REFILL_BITS_IN_FASTLOOP();
472
9.09k
      if (entry & HUFFDEC_LITERAL) {
473
        /* Decode a literal that required a subtable. */
474
7.83k
        lit = entry >> 16;
475
7.83k
        entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
476
7.83k
        REFILL_BITS_IN_FASTLOOP();
477
7.83k
        *out_next++ = lit;
478
7.83k
        continue;
479
7.83k
      }
480
1.26k
      if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
481
1.02k
        goto block_done;
482
      /* Else, it's a length that required a subtable. */
483
1.26k
    }
484
485
    /*
486
     * Decode the match length: the length base value associated
487
     * with the litlen symbol (which we extract from the decode
488
     * table entry), plus the extra length bits.  We don't need to
489
     * consume the extra length bits here, as they were included in
490
     * the bits consumed by the entry earlier.  We also don't need
491
     * to check for too-long matches here, as this is inside the
492
     * fastloop where it's already been verified that the output
493
     * buffer has enough space remaining to copy a max-length match.
494
     */
495
63.6k
    length = entry >> 16;
496
63.6k
    length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);
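The two lines above show how a litlen decode-table entry is unpacked; reading the shifts and masks used throughout this file suggests the following packing (inferred here, not taken from the table-building code, so treat it as non-authoritative):

    /* Inferred decode-table entry layout:
     *   entry >> 16      : literal value, length/offset base, or subtable start
     *   (u8)entry        : total bits to consume (codeword length + extra bits)
     *   (u8)(entry >> 8) : codeword length alone, i.e. the shift that isolates
     *                      the extra bits within saved_bitbuf
     *   flag bits        : HUFFDEC_LITERAL / HUFFDEC_EXCEPTIONAL /
     *                      HUFFDEC_END_OF_BLOCK / HUFFDEC_SUBTABLE_POINTER
     */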
497
498
    /*
499
     * Decode the match offset.  There are enough "preloadable" bits
500
     * remaining to preload the offset decode table entry, but a
501
     * refill might be needed before consuming it.
502
     */
503
63.6k
    STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXFASTBITS,
504
63.6k
                 OFFSET_TABLEBITS));
505
63.6k
    entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)];
506
63.6k
    if (CAN_CONSUME_AND_THEN_PRELOAD(OFFSET_MAXBITS,
507
63.6k
             LITLEN_TABLEBITS)) {
508
      /*
509
       * Decoding a match offset on a 64-bit platform.  We may
510
       * need to refill once, but then we can decode the whole
511
       * offset and preload the next litlen table entry.
512
       */
513
63.6k
      if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
514
        /* Offset codeword requires a subtable */
515
0
        if (unlikely((u8)bitsleft < OFFSET_MAXBITS +
516
0
               LITLEN_TABLEBITS - PRELOAD_SLACK))
517
0
          REFILL_BITS_IN_FASTLOOP();
518
0
        bitbuf >>= OFFSET_TABLEBITS;
519
0
        bitsleft -= OFFSET_TABLEBITS;
520
0
        entry = d->offset_decode_table[(entry >> 16) +
521
0
          EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
522
63.6k
      } else if (unlikely((u8)bitsleft < OFFSET_MAXFASTBITS +
523
63.6k
              LITLEN_TABLEBITS - PRELOAD_SLACK))
524
321
        REFILL_BITS_IN_FASTLOOP();
525
63.6k
    } else {
526
      /* Decoding a match offset on a 32-bit platform */
527
0
      REFILL_BITS_IN_FASTLOOP();
528
0
      if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
529
        /* Offset codeword requires a subtable */
530
0
        bitbuf >>= OFFSET_TABLEBITS;
531
0
        bitsleft -= OFFSET_TABLEBITS;
532
0
        entry = d->offset_decode_table[(entry >> 16) +
533
0
          EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
534
0
        REFILL_BITS_IN_FASTLOOP();
535
        /* No further refill needed before extra bits */
536
0
        STATIC_ASSERT(CAN_CONSUME(
537
0
          OFFSET_MAXBITS - OFFSET_TABLEBITS));
538
0
      } else {
539
        /* No refill needed before extra bits */
540
0
        STATIC_ASSERT(CAN_CONSUME(OFFSET_MAXFASTBITS));
541
0
      }
542
0
    }
543
63.6k
    saved_bitbuf = bitbuf;
544
63.6k
    bitbuf >>= (u8)entry;
545
63.6k
    bitsleft -= entry; /* optimization: subtract full entry */
546
63.6k
    offset = entry >> 16;
547
63.6k
    offset += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);
548
549
    /* Validate the match offset; needed even in the fastloop. */
550
63.6k
    SAFETY_CHECK(offset <= out_next - (const u8 *)out);
551
63.6k
    src = out_next - offset;
552
63.6k
    dst = out_next;
553
63.6k
    out_next += length;
554
555
    /*
556
     * Before starting to issue the instructions to copy the match,
557
     * refill the bitbuffer and preload the litlen decode table
558
     * entry for the next loop iteration.  This can increase
559
     * performance by allowing the latency of the match copy to
560
     * overlap with these other operations.  To further reduce
561
     * latency, we've arranged for there to be enough bits remaining
562
     * to do the table preload independently of the refill, except
563
     * on 32-bit platforms using the byte-at-a-time refill method.
564
     */
565
63.6k
    if (!CAN_CONSUME_AND_THEN_PRELOAD(
566
63.6k
      MAX(OFFSET_MAXBITS - OFFSET_TABLEBITS,
567
63.6k
          OFFSET_MAXFASTBITS),
568
63.6k
      LITLEN_TABLEBITS) &&
569
0
        unlikely((u8)bitsleft < LITLEN_TABLEBITS - PRELOAD_SLACK))
570
0
      REFILL_BITS_IN_FASTLOOP();
571
63.6k
    entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
572
63.6k
    REFILL_BITS_IN_FASTLOOP();
573
574
    /*
575
     * Copy the match.  On most CPUs the fastest method is a
576
     * word-at-a-time copy, unconditionally copying about 5 words
577
     * since this is enough for most matches without being too much.
578
     *
579
     * The normal word-at-a-time copy works for offset >= WORDBYTES,
580
     * which is most cases.  The case of offset == 1 is also common
581
     * and is worth optimizing for, since it is just RLE encoding of
582
     * the previous byte, which is the result of compressing long
583
     * runs of the same byte.
584
     *
585
     * Writing past the match 'length' is allowed here, since it's
586
     * been ensured there is enough output space left for a slight
587
     * overrun.  FASTLOOP_MAX_BYTES_WRITTEN needs to be updated if
588
     * the maximum possible overrun here is changed.
589
     */
590
63.6k
    if (UNALIGNED_ACCESS_IS_FAST && offset >= WORDBYTES) {
591
5.38k
      store_word_unaligned(load_word_unaligned(src), dst);
592
5.38k
      src += WORDBYTES;
593
5.38k
      dst += WORDBYTES;
594
5.38k
      store_word_unaligned(load_word_unaligned(src), dst);
595
5.38k
      src += WORDBYTES;
596
5.38k
      dst += WORDBYTES;
597
5.38k
      store_word_unaligned(load_word_unaligned(src), dst);
598
5.38k
      src += WORDBYTES;
599
5.38k
      dst += WORDBYTES;
600
5.38k
      store_word_unaligned(load_word_unaligned(src), dst);
601
5.38k
      src += WORDBYTES;
602
5.38k
      dst += WORDBYTES;
603
5.38k
      store_word_unaligned(load_word_unaligned(src), dst);
604
5.38k
      src += WORDBYTES;
605
5.38k
      dst += WORDBYTES;
606
21.5k
      while (dst < out_next) {
607
16.1k
        store_word_unaligned(load_word_unaligned(src), dst);
608
16.1k
        src += WORDBYTES;
609
16.1k
        dst += WORDBYTES;
610
16.1k
        store_word_unaligned(load_word_unaligned(src), dst);
611
16.1k
        src += WORDBYTES;
612
16.1k
        dst += WORDBYTES;
613
16.1k
        store_word_unaligned(load_word_unaligned(src), dst);
614
16.1k
        src += WORDBYTES;
615
16.1k
        dst += WORDBYTES;
616
16.1k
        store_word_unaligned(load_word_unaligned(src), dst);
617
16.1k
        src += WORDBYTES;
618
16.1k
        dst += WORDBYTES;
619
16.1k
        store_word_unaligned(load_word_unaligned(src), dst);
620
16.1k
        src += WORDBYTES;
621
16.1k
        dst += WORDBYTES;
622
16.1k
      }
623
58.2k
    } else if (UNALIGNED_ACCESS_IS_FAST && offset == 1) {
624
53.5k
      machine_word_t v;
625
626
      /*
627
       * This part tends to get auto-vectorized, so keep it
628
       * copying a multiple of 16 bytes at a time.
629
       */
630
53.5k
      v = (machine_word_t)0x0101010101010101 * src[0];
631
53.5k
      store_word_unaligned(v, dst);
632
53.5k
      dst += WORDBYTES;
633
53.5k
      store_word_unaligned(v, dst);
634
53.5k
      dst += WORDBYTES;
635
53.5k
      store_word_unaligned(v, dst);
636
53.5k
      dst += WORDBYTES;
637
53.5k
      store_word_unaligned(v, dst);
638
53.5k
      dst += WORDBYTES;
639
280k
      while (dst < out_next) {
640
227k
        store_word_unaligned(v, dst);
641
227k
        dst += WORDBYTES;
642
227k
        store_word_unaligned(v, dst);
643
227k
        dst += WORDBYTES;
644
227k
        store_word_unaligned(v, dst);
645
227k
        dst += WORDBYTES;
646
227k
        store_word_unaligned(v, dst);
647
227k
        dst += WORDBYTES;
648
227k
      }
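The multiply by 0x0101010101010101 in the offset == 1 branch above splats one byte across a machine word, so each store writes a whole word of that byte; for example:

    /* On a 64-bit machine_word_t, if the byte being repeated is 0x2A:  */
    /*   v = 0x0101010101010101 * 0x2A = 0x2A2A2A2A2A2A2A2A            */
    /* so every word store emits eight copies of that byte.            */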
649
53.5k
    } else if (UNALIGNED_ACCESS_IS_FAST) {
650
4.70k
      store_word_unaligned(load_word_unaligned(src), dst);
651
4.70k
      src += offset;
652
4.70k
      dst += offset;
653
4.70k
      store_word_unaligned(load_word_unaligned(src), dst);
654
4.70k
      src += offset;
655
4.70k
      dst += offset;
656
76.6k
      do {
657
76.6k
        store_word_unaligned(load_word_unaligned(src), dst);
658
76.6k
        src += offset;
659
76.6k
        dst += offset;
660
76.6k
        store_word_unaligned(load_word_unaligned(src), dst);
661
76.6k
        src += offset;
662
76.6k
        dst += offset;
663
76.6k
      } while (dst < out_next);
664
4.70k
    } else {
665
0
      *dst++ = *src++;
666
0
      *dst++ = *src++;
667
0
      do {
668
0
        *dst++ = *src++;
669
0
      } while (dst < out_next);
670
0
    }
671
166k
  } while (in_next < in_fastloop_end && out_next < out_fastloop_end);
672
673
  /*
674
   * This is the generic loop for decoding literals and matches.  This
675
   * handles cases where in_next and out_next are close to the end of
676
   * their respective buffers.  Usually this loop isn't performance-
677
   * critical, as most time is spent in the fastloop above instead.  We
678
   * therefore omit some optimizations here in favor of smaller code.
679
   */
680
1.87k
generic_loop:
681
26.5k
  for (;;) {
682
26.5k
    u32 length, offset;
683
26.5k
    const u8 *src;
684
26.5k
    u8 *dst;
685
686
26.5k
    REFILL_BITS();
687
26.3k
    entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
688
26.3k
    saved_bitbuf = bitbuf;
689
26.3k
    bitbuf >>= (u8)entry;
690
26.3k
    bitsleft -= entry;
691
26.3k
    if (unlikely(entry & HUFFDEC_SUBTABLE_POINTER)) {
692
438
      entry = d->u.litlen_decode_table[(entry >> 16) +
693
438
          EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
694
438
      saved_bitbuf = bitbuf;
695
438
      bitbuf >>= (u8)entry;
696
438
      bitsleft -= entry;
697
438
    }
698
26.3k
    length = entry >> 16;
699
26.3k
    if (entry & HUFFDEC_LITERAL) {
700
17.1k
      if (unlikely(out_next == out_end))
701
12
        return LIBDEFLATE_INSUFFICIENT_SPACE;
702
17.1k
      *out_next++ = length;
703
17.1k
      continue;
704
17.1k
    }
705
9.25k
    if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
706
1.57k
      goto block_done;
707
7.68k
    length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);
708
7.68k
    if (unlikely(length > out_end - out_next))
709
24
      return LIBDEFLATE_INSUFFICIENT_SPACE;
710
711
7.66k
    if (!CAN_CONSUME(LENGTH_MAXBITS + OFFSET_MAXBITS))
712
0
      REFILL_BITS();
713
7.66k
    entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)];
714
7.66k
    if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
715
0
      bitbuf >>= OFFSET_TABLEBITS;
716
0
      bitsleft -= OFFSET_TABLEBITS;
717
0
      entry = d->offset_decode_table[(entry >> 16) +
718
0
          EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
719
0
      if (!CAN_CONSUME(OFFSET_MAXBITS))
720
0
        REFILL_BITS();
721
0
    }
722
7.66k
    offset = entry >> 16;
723
7.66k
    offset += EXTRACT_VARBITS8(bitbuf, entry) >> (u8)(entry >> 8);
724
7.66k
    bitbuf >>= (u8)entry;
725
7.66k
    bitsleft -= entry;
726
727
7.66k
    SAFETY_CHECK(offset <= out_next - (const u8 *)out);
728
7.58k
    src = out_next - offset;
729
7.58k
    dst = out_next;
730
7.58k
    out_next += length;
731
732
7.58k
    STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3);
733
7.58k
    *dst++ = *src++;
734
7.58k
    *dst++ = *src++;
735
898k
    do {
736
898k
      *dst++ = *src++;
737
898k
    } while (dst < out_next);
738
7.58k
  }
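The byte-at-a-time copy that closes the loop above is deliberately simple: DEFLATE_MIN_MATCH_LEN == 3 (asserted a few lines up) guarantees at least three bytes, covered by the two unconditional copies plus one do-while pass, and copying single bytes makes overlapping matches come out right. For example:

    /* offset = 1, length = 5, byte just before the match is 'A':
     * each *dst++ = *src++ reads the byte written immediately before it,
     * so the output gains  A A A A A  (run-length behaviour).          */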
739
740
6.79k
block_done:
741
  /* Finished decoding a block */
742
743
6.79k
  if (!is_final_block)
744
5.77k
    goto next_block;
745
746
  /* That was the last block. */
747
748
1.02k
  bitsleft = (u8)bitsleft;
749
750
  /*
751
   * If any of the implicit appended zero bytes were consumed (not just
752
   * refilled) before hitting end of stream, then the data is bad.
753
   */
754
1.02k
  SAFETY_CHECK(overread_count <= (bitsleft >> 3));
755
756
  /* Optionally return the actual number of bytes consumed. */
757
998
  if (actual_in_nbytes_ret) {
758
    /* Don't count bytes that were refilled but not consumed. */
759
998
    in_next -= (bitsleft >> 3) - overread_count;
760
761
998
    *actual_in_nbytes_ret = in_next - (u8 *)in;
762
998
  }
763
764
  /* Optionally return the actual number of bytes written. */
765
998
  if (actual_out_nbytes_ret) {
766
996
    *actual_out_nbytes_ret = out_next - (u8 *)out;
767
996
  } else {
768
2
    if (out_next != out_end)
769
1
      return LIBDEFLATE_SHORT_OUTPUT;
770
2
  }
771
997
  return LIBDEFLATE_SUCCESS;
772
998
}
compression.c:deflate_decompress_bmi2
(Line-by-line counts for this instantiation are identical to the combined listing above.)
Unexecuted instantiation: compression.c:deflate_decompress_default
773
774
#undef FUNCNAME
775
#undef ATTRIBUTES
776
#undef EXTRACT_VARBITS
777
#undef EXTRACT_VARBITS8