/src/duckdb/third_party/brotli/enc/block_splitter.cpp
Line | Count | Source |
1 | | /* Copyright 2013 Google Inc. All Rights Reserved. |
2 | | |
3 | | Distributed under MIT license. |
4 | | See file LICENSE for detail or copy at https://opensource.org/licenses/MIT |
5 | | */ |
6 | | |
7 | | /* Block split point selection utilities. */ |
8 | | |
9 | | #include "block_splitter.h" |
10 | | |
11 | | #include <string.h> /* memcpy, memset */ |
12 | | |
13 | | #include "../common/brotli_platform.h" |
14 | | #include "bit_cost.h" |
15 | | #include "cluster.h" |
16 | | #include "command.h" |
17 | | #include "fast_log.h" |
18 | | #include "histogram.h" |
19 | | #include "memory.h" |
20 | | #include "quality.h" |
21 | | |
22 | | using namespace duckdb_brotli; |
23 | | |
24 | | static const size_t kMaxLiteralHistograms = 100; |
25 | | static const size_t kMaxCommandHistograms = 50; |
26 | | static const double kLiteralBlockSwitchCost = 28.1; |
27 | | static const double kCommandBlockSwitchCost = 13.5; |
28 | | static const double kDistanceBlockSwitchCost = 14.6; |
29 | | static const size_t kLiteralStrideLength = 70; |
30 | | static const size_t kCommandStrideLength = 40; |
31 | | static const size_t kDistanceStrideLength = 40; |
32 | | static const size_t kSymbolsPerLiteralHistogram = 544; |
33 | | static const size_t kSymbolsPerCommandHistogram = 530; |
34 | | static const size_t kSymbolsPerDistanceHistogram = 544; |
35 | | static const size_t kMinLengthForBlockSplitting = 128; |
36 | | static const size_t kIterMulForRefining = 2; |
37 | | static const size_t kMinItersForRefining = 100; |
38 | | |
39 | 0 | static size_t CountLiterals(const Command* cmds, const size_t num_commands) { |
40 | | /* Count the total number of literals across all commands. */ |
41 | 0 | size_t total_length = 0; |
42 | 0 | size_t i; |
43 | 0 | for (i = 0; i < num_commands; ++i) { |
44 | 0 | total_length += cmds[i].insert_len_; |
45 | 0 | } |
46 | 0 | return total_length; |
47 | 0 | } |
48 | | |
49 | | static void CopyLiteralsToByteArray(const Command* cmds, |
50 | | const size_t num_commands, |
51 | | const uint8_t* data, |
52 | | const size_t offset, |
53 | | const size_t mask, |
54 | 0 | uint8_t* literals) { |
55 | 0 | size_t pos = 0; |
56 | 0 | size_t from_pos = offset & mask; |
57 | 0 | size_t i; |
58 | 0 | for (i = 0; i < num_commands; ++i) { |
59 | 0 | size_t insert_len = cmds[i].insert_len_; |
60 | 0 | if (from_pos + insert_len > mask) { |
61 | 0 | size_t head_size = mask + 1 - from_pos; |
62 | 0 | memcpy(literals + pos, data + from_pos, head_size); |
63 | 0 | from_pos = 0; |
64 | 0 | pos += head_size; |
65 | 0 | insert_len -= head_size; |
66 | 0 | } |
67 | 0 | if (insert_len > 0) { |
68 | 0 | memcpy(literals + pos, data + from_pos, insert_len); |
69 | 0 | pos += insert_len; |
70 | 0 | } |
71 | 0 | from_pos = (from_pos + insert_len + CommandCopyLen(&cmds[i])) & mask; |
72 | 0 | } |
73 | 0 | } |
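The copy above reads from a power-of-two ring buffer addressed through `mask`; an insert run that crosses the physical end of the buffer is emitted in two pieces. A minimal standalone sketch of that wraparound pattern (illustrative, not part of the listing):

    #include <stdint.h>
    #include <string.h>

    /* Copy `len` bytes that logically start at `from` out of a ring buffer
       whose size is mask + 1 (a power of two). */
    static void ring_copy(const uint8_t* ring, size_t mask,
                          size_t from, size_t len, uint8_t* out) {
      from &= mask;
      if (from + len > mask + 1) {            /* Run wraps past the end. */
        size_t head = mask + 1 - from;
        memcpy(out, ring + from, head);       /* Head: up to the physical end. */
        memcpy(out + head, ring, len - head); /* Tail: restart at offset 0. */
      } else {
        memcpy(out, ring + from, len);
      }
    }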
74 | | |
75 | 0 | static BROTLI_INLINE uint32_t MyRand(uint32_t* seed) { |
76 | | /* Initial seed should be 7. In this case, loop length is (1 << 29). */ |
77 | 0 | *seed *= 16807U; |
78 | 0 | return *seed; |
79 | 0 | } |
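MyRand is a multiplicative congruential generator with the classic 16807 multiplier, reduced implicitly mod 2^32 by uint32_t wraparound; per the comment above, seeding with 7 yields a cycle of length (1 << 29). A tiny sketch of how the splitter uses it to jitter sample positions (toy values assumed):

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t toy_rand(uint32_t* seed) {
      *seed *= 16807U;  /* Same update as MyRand; wraps mod 2^32. */
      return *seed;
    }

    int main(void) {
      uint32_t seed = 7;          /* The splitter's fixed initial seed. */
      size_t block_length = 1000; /* Toy value. */
      for (int i = 0; i < 3; ++i) {
        printf("jitter %u\n", (unsigned)(toy_rand(&seed) % block_length));
      }
      return 0;
    }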
80 | | |
81 | 0 | static BROTLI_INLINE double BitCost(size_t count) { |
82 | 0 | return count == 0 ? -2.0 : FastLog2(count); |
83 | 0 | } |
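BitCost returns log2(count) for present symbols and the sentinel -2.0 for absent ones. Combined with the initialization in FindBlocks below (each insert_cost row starts at log2(total_count) and BitCost is subtracted), this yields the Shannon cost -log2(count/total) for a present symbol and a log2(total) + 2 penalty for an absent one. A small check of that reading (my arithmetic, not covered code):

    #include <math.h>
    #include <stdio.h>

    static double symbol_cost(size_t count, size_t total) {
      double bit_cost = (count == 0) ? -2.0 : log2((double)count);
      return log2((double)total) - bit_cost; /* Mirrors the insert_cost fill. */
    }

    int main(void) {
      printf("%.2f\n", symbol_cost(25, 100)); /* -log2(0.25) = 2.00 bits */
      printf("%.2f\n", symbol_cost(0, 100));  /* log2(100) + 2 = 8.64 bits */
      return 0;
    }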
84 | | |
85 | 0 | #define HISTOGRAMS_PER_BATCH 64 |
86 | 0 | #define CLUSTERS_PER_BATCH 16 |
87 | | |
88 | 0 | #define FN(X) X ## Literal |
89 | | #define DataType uint8_t |
90 | | /* NOLINTNEXTLINE(build/include) */ |
91 | | /* NOLINT(build/header_guard) */ |
92 | | /* Copyright 2013 Google Inc. All Rights Reserved. |
93 | | |
94 | | Distributed under MIT license. |
95 | | See file LICENSE for detail or copy at https://opensource.org/licenses/MIT |
96 | | */ |
97 | | |
98 | | /* template parameters: FN, DataType */ |
99 | | |
100 | 0 | #define HistogramType FN(Histogram) |
101 | | |
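A note on the templating mechanism: with `#define FN(X) X ## Literal` in effect, `FN(Histogram)` token-pastes to `HistogramLiteral` and `FN(SplitByteVector)` to `SplitByteVectorLiteral`. The same function bodies are textually repeated twice more below, under `FN(X) X ## Command` (with DataType redefined to uint16_t) and `FN(X) X ## Distance` (which reuses uint16_t), stamping out three variants from one template.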
102 | | static void FN(InitialEntropyCodes)(const DataType* data, size_t length, |
103 | | size_t stride, |
104 | | size_t num_histograms, |
105 | 0 | HistogramType* histograms) { |
106 | 0 | uint32_t seed = 7; |
107 | 0 | size_t block_length = length / num_histograms; |
108 | 0 | size_t i; |
109 | 0 | FN(ClearHistograms)(histograms, num_histograms); |
110 | 0 | for (i = 0; i < num_histograms; ++i) { |
111 | 0 | size_t pos = length * i / num_histograms; |
112 | 0 | if (i != 0) { |
113 | 0 | pos += MyRand(&seed) % block_length; |
114 | 0 | } |
115 | 0 | if (pos + stride >= length) { |
116 | 0 | pos = length - stride - 1; |
117 | 0 | } |
118 | 0 | FN(HistogramAddVector)(&histograms[i], data + pos, stride); |
119 | 0 | } |
120 | 0 | } |
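A worked example of the sampling above, with assumed toy values: for length = 10000 and num_histograms = 50, block_length = 200, so histogram i is seeded from `stride` consecutive symbols starting near the even anchor i * 10000 / 50 = i * 200, jittered forward by MyRand(&seed) % 200 (except histogram 0) and clamped so a full stride fits inside the data.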
121 | | |
122 | | static void FN(RandomSample)(uint32_t* seed, |
123 | | const DataType* data, |
124 | | size_t length, |
125 | | size_t stride, |
126 | 0 | HistogramType* sample) { |
127 | 0 | size_t pos = 0; |
128 | 0 | if (stride >= length) { |
129 | 0 | stride = length; |
130 | 0 | } else { |
131 | 0 | pos = MyRand(seed) % (length - stride + 1); |
132 | 0 | } |
133 | 0 | FN(HistogramAddVector)(sample, data + pos, stride); |
134 | 0 | } |
135 | | |
136 | | static void FN(RefineEntropyCodes)(const DataType* data, size_t length, |
137 | | size_t stride, |
138 | | size_t num_histograms, |
139 | | HistogramType* histograms, |
140 | 0 | HistogramType* tmp) { |
141 | 0 | size_t iters = |
142 | 0 | kIterMulForRefining * length / stride + kMinItersForRefining; |
143 | 0 | uint32_t seed = 7; |
144 | 0 | size_t iter; |
145 | 0 | iters = ((iters + num_histograms - 1) / num_histograms) * num_histograms; |
146 | 0 | for (iter = 0; iter < iters; ++iter) { |
147 | 0 | FN(HistogramClear)(tmp); |
148 | 0 | FN(RandomSample)(&seed, data, length, stride, tmp); |
149 | 0 | FN(HistogramAddHistogram)(&histograms[iter % num_histograms], tmp); |
150 | 0 | } |
151 | 0 | } |
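A quick sizing example for the refinement loop above (toy values): with length = 7000 and stride = 70 (the literal stride), iters starts at 2 * 7000 / 70 + 100 = 300; rounding up to a multiple of num_histograms = 64 gives 320 iterations, i.e. exactly 5 random stride samples merged into each histogram by the `iter % num_histograms` round-robin.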
152 | | |
153 | | /* Assigns a block id from the range [0, num_histograms) to each data element |
154 | | in data[0..length) and fills in block_id[0..length) with the assigned values. |
155 | | Returns the number of blocks, i.e. one plus the number of block switches. */ |
156 | | static size_t FN(FindBlocks)(const DataType* data, const size_t length, |
157 | | const double block_switch_bitcost, |
158 | | const size_t num_histograms, |
159 | | const HistogramType* histograms, |
160 | | double* insert_cost, |
161 | | double* cost, |
162 | | uint8_t* switch_signal, |
163 | 0 | uint8_t* block_id) { |
164 | 0 | const size_t alphabet_size = FN(HistogramDataSize)(); |
165 | 0 | const size_t bitmap_len = (num_histograms + 7) >> 3; |
166 | 0 | size_t num_blocks = 1; |
167 | 0 | size_t byte_ix; |
168 | 0 | size_t i; |
169 | 0 | size_t j; |
170 | 0 | BROTLI_DCHECK(num_histograms <= 256); |
171 | | |
172 | | /* Trivial case: single histogram -> single block type. */ |
173 | 0 | if (num_histograms <= 1) { |
174 | 0 | for (i = 0; i < length; ++i) { |
175 | 0 | block_id[i] = 0; |
176 | 0 | } |
177 | 0 | return 1; |
178 | 0 | } |
179 | | |
180 | | /* Fill bitcost for each symbol of all histograms. |
181 | | * Non-existing symbol cost: 2 + log2(total_count). |
182 | | * Regular symbol cost: -log2(symbol_count / total_count). */ |
183 | 0 | memset(insert_cost, 0, |
184 | 0 | sizeof(insert_cost[0]) * alphabet_size * num_histograms); |
185 | 0 | for (i = 0; i < num_histograms; ++i) { |
186 | 0 | insert_cost[i] = FastLog2((uint32_t)histograms[i].total_count_); |
187 | 0 | } |
188 | 0 | for (i = alphabet_size; i != 0;) { |
189 | | /* Reverse order to use the 0-th row as a temporary storage. */ |
190 | 0 | --i; |
191 | 0 | for (j = 0; j < num_histograms; ++j) { |
192 | 0 | insert_cost[i * num_histograms + j] = |
193 | 0 | insert_cost[j] - BitCost(histograms[j].data_[i]); |
194 | 0 | } |
195 | 0 | } |
196 | | |
197 | | /* After each iteration of this loop, cost[k] will contain the difference |
198 | | between the minimum cost of arriving at the current byte position using |
199 | | entropy code k, and the minimum cost of arriving at the current byte |
200 | | position. This difference is capped at the block switch cost, and if it |
201 | | reaches block switch cost, it means that when we trace back from the last |
202 | | position, we need to switch here. */ |
203 | 0 | memset(cost, 0, sizeof(cost[0]) * num_histograms); |
204 | 0 | memset(switch_signal, 0, sizeof(switch_signal[0]) * length * bitmap_len); |
205 | 0 | for (byte_ix = 0; byte_ix < length; ++byte_ix) { |
206 | 0 | size_t ix = byte_ix * bitmap_len; |
207 | 0 | size_t symbol = data[byte_ix]; |
208 | 0 | size_t insert_cost_ix = symbol * num_histograms; |
209 | 0 | double min_cost = 1e99; |
210 | 0 | double block_switch_cost = block_switch_bitcost; |
211 | 0 | size_t k; |
212 | 0 | for (k = 0; k < num_histograms; ++k) { |
213 | | /* We are coding the symbol with entropy code k. */ |
214 | 0 | cost[k] += insert_cost[insert_cost_ix + k]; |
215 | 0 | if (cost[k] < min_cost) { |
216 | 0 | min_cost = cost[k]; |
217 | 0 | block_id[byte_ix] = (uint8_t)k; |
218 | 0 | } |
219 | 0 | } |
220 | | /* More blocks for the beginning. */ |
221 | 0 | if (byte_ix < 2000) { |
222 | 0 | block_switch_cost *= 0.77 + 0.07 * (double)byte_ix / 2000; |
223 | 0 | } |
224 | 0 | for (k = 0; k < num_histograms; ++k) { |
225 | 0 | cost[k] -= min_cost; |
226 | 0 | if (cost[k] >= block_switch_cost) { |
227 | 0 | const uint8_t mask = (uint8_t)(1u << (k & 7)); |
228 | 0 | cost[k] = block_switch_cost; |
229 | 0 | BROTLI_DCHECK((k >> 3) < bitmap_len); |
230 | 0 | switch_signal[ix + (k >> 3)] |= mask; |
231 | 0 | } |
232 | 0 | } |
233 | 0 | } |
234 | |
235 | 0 | byte_ix = length - 1; |
236 | 0 | { /* Trace back from the last position and switch at the marked places. */ |
237 | 0 | size_t ix = byte_ix * bitmap_len; |
238 | 0 | uint8_t cur_id = block_id[byte_ix]; |
239 | 0 | while (byte_ix > 0) { |
240 | 0 | const uint8_t mask = (uint8_t)(1u << (cur_id & 7)); |
241 | 0 | BROTLI_DCHECK(((size_t)cur_id >> 3) < bitmap_len); |
242 | 0 | --byte_ix; |
243 | 0 | ix -= bitmap_len; |
244 | 0 | if (switch_signal[ix + (cur_id >> 3)] & mask) { |
245 | 0 | if (cur_id != block_id[byte_ix]) { |
246 | 0 | cur_id = block_id[byte_ix]; |
247 | 0 | ++num_blocks; |
248 | 0 | } |
249 | 0 | } |
250 | 0 | block_id[byte_ix] = cur_id; |
251 | 0 | } |
252 | 0 | } |
253 | 0 | return num_blocks; |
254 | 0 | } |
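The `switch_signal` array above is a bitmap with one row of bitmap_len = ceil(num_histograms / 8) bytes per input position; bit k of row i records that entropy code k hit the switch-cost cap at position i, which the traceback then reads in reverse. A minimal sketch of that layout (illustrative helper names):

    #include <stddef.h>
    #include <stdint.h>

    static void set_switch(uint8_t* bitmap, size_t bitmap_len,
                           size_t pos, size_t k) {
      bitmap[pos * bitmap_len + (k >> 3)] |= (uint8_t)(1u << (k & 7));
    }

    static int test_switch(const uint8_t* bitmap, size_t bitmap_len,
                           size_t pos, size_t k) {
      return (bitmap[pos * bitmap_len + (k >> 3)] >> (k & 7)) & 1;
    }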
255 | | |
256 | | static size_t FN(RemapBlockIds)(uint8_t* block_ids, const size_t length, |
257 | 0 | uint16_t* new_id, const size_t num_histograms) { |
258 | 0 | static const uint16_t kInvalidId = 256; |
259 | 0 | uint16_t next_id = 0; |
260 | 0 | size_t i; |
261 | 0 | for (i = 0; i < num_histograms; ++i) { |
262 | 0 | new_id[i] = kInvalidId; |
263 | 0 | } |
264 | 0 | for (i = 0; i < length; ++i) { |
265 | 0 | BROTLI_DCHECK(block_ids[i] < num_histograms); |
266 | 0 | if (new_id[block_ids[i]] == kInvalidId) { |
267 | 0 | new_id[block_ids[i]] = next_id++; |
268 | 0 | } |
269 | 0 | } |
270 | 0 | for (i = 0; i < length; ++i) { |
271 | 0 | block_ids[i] = (uint8_t)new_id[block_ids[i]]; |
272 | 0 | BROTLI_DCHECK(block_ids[i] < num_histograms); |
273 | 0 | } |
274 | 0 | BROTLI_DCHECK(next_id <= num_histograms); |
275 | 0 | return next_id; |
276 | 0 | } |
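A minimal self-check of the remapping contract above, under my reading of the code: ids are renumbered densely in order of first appearance, and the return value is the number of ids actually used.

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    int main(void) {
      uint8_t ids[5] = {5, 5, 2, 7, 2};
      uint16_t new_id[8];
      uint16_t next = 0;
      /* Same remap logic as above, inlined for a toy input. */
      for (size_t i = 0; i < 8; ++i) new_id[i] = 256;
      for (size_t i = 0; i < 5; ++i) {
        if (new_id[ids[i]] == 256) new_id[ids[i]] = next++;
      }
      for (size_t i = 0; i < 5; ++i) ids[i] = (uint8_t)new_id[ids[i]];
      assert(next == 3);                        /* Three distinct ids used. */
      assert(ids[0] == 0 && ids[2] == 1 && ids[3] == 2);
      return 0;
    }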
277 | | |
278 | | static void FN(BuildBlockHistograms)(const DataType* data, const size_t length, |
279 | | const uint8_t* block_ids, |
280 | | const size_t num_histograms, |
281 | 0 | HistogramType* histograms) { |
282 | 0 | size_t i; |
283 | 0 | FN(ClearHistograms)(histograms, num_histograms); |
284 | 0 | for (i = 0; i < length; ++i) { |
285 | 0 | FN(HistogramAdd)(&histograms[block_ids[i]], data[i]); |
286 | 0 | } |
287 | 0 | } |
288 | | |
289 | | /* Given the initial partitioning, build a partitioning with a limited |
290 | | * number of histograms (and block types). */ |
291 | | static void FN(ClusterBlocks)(MemoryManager* m, |
292 | | const DataType* data, const size_t length, |
293 | | const size_t num_blocks, |
294 | | uint8_t* block_ids, |
295 | 0 | BlockSplit* split) { |
296 | 0 | uint32_t* histogram_symbols = BROTLI_ALLOC(m, uint32_t, num_blocks); |
297 | 0 | uint32_t* u32 = |
298 | 0 | BROTLI_ALLOC(m, uint32_t, num_blocks + 4 * HISTOGRAMS_PER_BATCH); |
299 | 0 | const size_t expected_num_clusters = CLUSTERS_PER_BATCH * |
300 | 0 | (num_blocks + HISTOGRAMS_PER_BATCH - 1) / HISTOGRAMS_PER_BATCH; |
301 | 0 | size_t all_histograms_size = 0; |
302 | 0 | size_t all_histograms_capacity = expected_num_clusters; |
303 | 0 | HistogramType* all_histograms = |
304 | 0 | BROTLI_ALLOC(m, HistogramType, all_histograms_capacity); |
305 | 0 | size_t cluster_size_size = 0; |
306 | 0 | size_t cluster_size_capacity = expected_num_clusters; |
307 | 0 | uint32_t* cluster_size = BROTLI_ALLOC(m, uint32_t, cluster_size_capacity); |
308 | 0 | size_t num_clusters = 0; |
309 | 0 | HistogramType* histograms = BROTLI_ALLOC(m, HistogramType, |
310 | 0 | BROTLI_MIN(size_t, num_blocks, HISTOGRAMS_PER_BATCH)); |
311 | 0 | size_t max_num_pairs = |
312 | 0 | HISTOGRAMS_PER_BATCH * HISTOGRAMS_PER_BATCH / 2; |
313 | 0 | size_t pairs_capacity = max_num_pairs + 1; |
314 | 0 | HistogramPair* pairs = BROTLI_ALLOC(m, HistogramPair, pairs_capacity); |
315 | 0 | size_t pos = 0; |
316 | 0 | uint32_t* clusters; |
317 | 0 | size_t num_final_clusters; |
318 | 0 | static const uint32_t kInvalidIndex = BROTLI_UINT32_MAX; |
319 | 0 | uint32_t* new_index; |
320 | 0 | size_t i; |
321 | 0 | uint32_t* BROTLI_RESTRICT const sizes = u32 + 0 * HISTOGRAMS_PER_BATCH; |
322 | 0 | uint32_t* BROTLI_RESTRICT const new_clusters = u32 + 1 * HISTOGRAMS_PER_BATCH; |
323 | 0 | uint32_t* BROTLI_RESTRICT const symbols = u32 + 2 * HISTOGRAMS_PER_BATCH; |
324 | 0 | uint32_t* BROTLI_RESTRICT const remap = u32 + 3 * HISTOGRAMS_PER_BATCH; |
325 | 0 | uint32_t* BROTLI_RESTRICT const block_lengths = |
326 | 0 | u32 + 4 * HISTOGRAMS_PER_BATCH; |
327 | | /* TODO(eustas): move to arena? */ |
328 | 0 | HistogramType* tmp = BROTLI_ALLOC(m, HistogramType, 2); |
329 | |
330 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(histogram_symbols) || |
331 | 0 | BROTLI_IS_NULL(u32) || BROTLI_IS_NULL(all_histograms) || |
332 | 0 | BROTLI_IS_NULL(cluster_size) || BROTLI_IS_NULL(histograms) || |
333 | 0 | BROTLI_IS_NULL(pairs) || BROTLI_IS_NULL(tmp)) { |
334 | 0 | return; |
335 | 0 | } |
336 | | |
337 | 0 | memset(u32, 0, (num_blocks + 4 * HISTOGRAMS_PER_BATCH) * sizeof(uint32_t)); |
338 | | |
339 | | /* Calculate block lengths (convert repeating values -> series length). */ |
340 | 0 | { |
341 | 0 | size_t block_idx = 0; |
342 | 0 | for (i = 0; i < length; ++i) { |
343 | 0 | BROTLI_DCHECK(block_idx < num_blocks); |
344 | 0 | ++block_lengths[block_idx]; |
345 | 0 | if (i + 1 == length || block_ids[i] != block_ids[i + 1]) { |
346 | 0 | ++block_idx; |
347 | 0 | } |
348 | 0 | } |
349 | 0 | BROTLI_DCHECK(block_idx == num_blocks); |
350 | 0 | } |
351 | | |
352 | | /* Pre-cluster blocks (cluster batches). */ |
353 | 0 | for (i = 0; i < num_blocks; i += HISTOGRAMS_PER_BATCH) { |
354 | 0 | const size_t num_to_combine = |
355 | 0 | BROTLI_MIN(size_t, num_blocks - i, HISTOGRAMS_PER_BATCH); |
356 | 0 | size_t num_new_clusters; |
357 | 0 | size_t j; |
358 | 0 | for (j = 0; j < num_to_combine; ++j) { |
359 | 0 | size_t k; |
360 | 0 | size_t block_length = block_lengths[i + j]; |
361 | 0 | FN(HistogramClear)(&histograms[j]); |
362 | 0 | for (k = 0; k < block_length; ++k) { |
363 | 0 | FN(HistogramAdd)(&histograms[j], data[pos++]); |
364 | 0 | } |
365 | 0 | histograms[j].bit_cost_ = FN(BrotliPopulationCost)(&histograms[j]); |
366 | 0 | new_clusters[j] = (uint32_t)j; |
367 | 0 | symbols[j] = (uint32_t)j; |
368 | 0 | sizes[j] = 1; |
369 | 0 | } |
370 | 0 | num_new_clusters = FN(BrotliHistogramCombine)( |
371 | 0 | histograms, tmp, sizes, symbols, new_clusters, pairs, num_to_combine, |
372 | 0 | num_to_combine, HISTOGRAMS_PER_BATCH, max_num_pairs); |
373 | 0 | BROTLI_ENSURE_CAPACITY(m, HistogramType, all_histograms, |
374 | 0 | all_histograms_capacity, all_histograms_size + num_new_clusters); |
375 | 0 | BROTLI_ENSURE_CAPACITY(m, uint32_t, cluster_size, |
376 | 0 | cluster_size_capacity, cluster_size_size + num_new_clusters); |
377 | 0 | if (BROTLI_IS_OOM(m)) return; |
378 | 0 | for (j = 0; j < num_new_clusters; ++j) { |
379 | 0 | all_histograms[all_histograms_size++] = histograms[new_clusters[j]]; |
380 | 0 | cluster_size[cluster_size_size++] = sizes[new_clusters[j]]; |
381 | 0 | remap[new_clusters[j]] = (uint32_t)j; |
382 | 0 | } |
383 | 0 | for (j = 0; j < num_to_combine; ++j) { |
384 | 0 | histogram_symbols[i + j] = (uint32_t)num_clusters + remap[symbols[j]]; |
385 | 0 | } |
386 | 0 | num_clusters += num_new_clusters; |
387 | 0 | BROTLI_DCHECK(num_clusters == cluster_size_size); |
388 | 0 | BROTLI_DCHECK(num_clusters == all_histograms_size); |
389 | 0 | } |
390 | 0 | BROTLI_FREE(m, histograms); |
391 | | |
392 | | /* Final clustering. */ |
393 | 0 | max_num_pairs = |
394 | 0 | BROTLI_MIN(size_t, 64 * num_clusters, (num_clusters / 2) * num_clusters); |
395 | 0 | if (pairs_capacity < max_num_pairs + 1) { |
396 | 0 | BROTLI_FREE(m, pairs); |
397 | 0 | pairs = BROTLI_ALLOC(m, HistogramPair, max_num_pairs + 1); |
398 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(pairs)) return; |
399 | 0 | } |
400 | 0 | clusters = BROTLI_ALLOC(m, uint32_t, num_clusters); |
401 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(clusters)) return; |
402 | 0 | for (i = 0; i < num_clusters; ++i) { |
403 | 0 | clusters[i] = (uint32_t)i; |
404 | 0 | } |
405 | 0 | num_final_clusters = FN(BrotliHistogramCombine)( |
406 | 0 | all_histograms, tmp, cluster_size, histogram_symbols, clusters, pairs, |
407 | 0 | num_clusters, num_blocks, BROTLI_MAX_NUMBER_OF_BLOCK_TYPES, |
408 | 0 | max_num_pairs); |
409 | 0 | BROTLI_FREE(m, pairs); |
410 | 0 | BROTLI_FREE(m, cluster_size); |
411 | | |
412 | | /* Assign blocks to final histograms. */ |
413 | 0 | new_index = BROTLI_ALLOC(m, uint32_t, num_clusters); |
414 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(new_index)) return; |
415 | 0 | for (i = 0; i < num_clusters; ++i) new_index[i] = kInvalidIndex; |
416 | 0 | pos = 0; |
417 | 0 | { |
418 | 0 | uint32_t next_index = 0; |
419 | 0 | for (i = 0; i < num_blocks; ++i) { |
420 | 0 | size_t j; |
421 | 0 | uint32_t best_out; |
422 | 0 | double best_bits; |
423 | 0 | FN(HistogramClear)(tmp); |
424 | 0 | for (j = 0; j < block_lengths[i]; ++j) { |
425 | 0 | FN(HistogramAdd)(tmp, data[pos++]); |
426 | 0 | } |
427 | | /* Among equally good histograms prefer last used. */ |
428 | | /* TODO(eustas): should we give a block-switch discount here? */ |
429 | 0 | best_out = (i == 0) ? histogram_symbols[0] : histogram_symbols[i - 1]; |
430 | 0 | best_bits = FN(BrotliHistogramBitCostDistance)( |
431 | 0 | tmp, &all_histograms[best_out], tmp + 1); |
432 | 0 | for (j = 0; j < num_final_clusters; ++j) { |
433 | 0 | const double cur_bits = FN(BrotliHistogramBitCostDistance)( |
434 | 0 | tmp, &all_histograms[clusters[j]], tmp + 1); |
435 | 0 | if (cur_bits < best_bits) { |
436 | 0 | best_bits = cur_bits; |
437 | 0 | best_out = clusters[j]; |
438 | 0 | } |
439 | 0 | } |
440 | 0 | histogram_symbols[i] = best_out; |
441 | 0 | if (new_index[best_out] == kInvalidIndex) { |
442 | 0 | new_index[best_out] = next_index++; |
443 | 0 | } |
444 | 0 | } |
445 | 0 | } |
446 | 0 | BROTLI_FREE(m, tmp); |
447 | 0 | BROTLI_FREE(m, clusters); |
448 | 0 | BROTLI_FREE(m, all_histograms); |
449 | 0 | BROTLI_ENSURE_CAPACITY( |
450 | 0 | m, uint8_t, split->types, split->types_alloc_size, num_blocks); |
451 | 0 | BROTLI_ENSURE_CAPACITY( |
452 | 0 | m, uint32_t, split->lengths, split->lengths_alloc_size, num_blocks); |
453 | 0 | if (BROTLI_IS_OOM(m)) return; |
454 | | |
455 | | /* Rewrite final assignment to block-split. There might be fewer blocks |
456 | | * than |num_blocks| due to clustering. */ |
457 | 0 | { |
458 | 0 | uint32_t cur_length = 0; |
459 | 0 | size_t block_idx = 0; |
460 | 0 | uint8_t max_type = 0; |
461 | 0 | for (i = 0; i < num_blocks; ++i) { |
462 | 0 | cur_length += block_lengths[i]; |
463 | 0 | if (i + 1 == num_blocks || |
464 | 0 | histogram_symbols[i] != histogram_symbols[i + 1]) { |
465 | 0 | const uint8_t id = (uint8_t)new_index[histogram_symbols[i]]; |
466 | 0 | split->types[block_idx] = id; |
467 | 0 | split->lengths[block_idx] = cur_length; |
468 | 0 | max_type = BROTLI_MAX(uint8_t, max_type, id); |
469 | 0 | cur_length = 0; |
470 | 0 | ++block_idx; |
471 | 0 | } |
472 | 0 | } |
473 | 0 | split->num_blocks = block_idx; |
474 | 0 | split->num_types = (size_t)max_type + 1; |
475 | 0 | } |
476 | 0 | BROTLI_FREE(m, new_index); |
477 | 0 | BROTLI_FREE(m, u32); |
478 | 0 | BROTLI_FREE(m, histogram_symbols); |
479 | 0 | } |
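A back-of-envelope check of the capacity estimate at the top of ClusterBlocks (toy values): blocks are pre-clustered in batches of HISTOGRAMS_PER_BATCH = 64, and capacity is reserved assuming at most CLUSTERS_PER_BATCH = 16 survivors per batch before the final cross-batch combine.

    #include <stdio.h>

    int main(void) {
      size_t num_blocks = 1000;                    /* Toy value. */
      size_t batches = (num_blocks + 64 - 1) / 64; /* 16 batches. */
      size_t expected_num_clusters = 16 * batches; /* 256, as in the code. */
      printf("batches=%zu expected=%zu\n", batches, expected_num_clusters);
      return 0;
    }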
480 | | |
481 | | /* Create BlockSplit (partitioning) given the limits, estimates and "effort" |
482 | | * parameters. |
483 | | * |
484 | | * NB: max_histograms is often less than the number of histograms allowed by the format; |
485 | | * this is done intentionally, to save some "space" for context-aware |
486 | | * clustering (here entropy is estimated for context-free symbols). */ |
487 | | static void FN(SplitByteVector)(MemoryManager* m, |
488 | | const DataType* data, const size_t length, |
489 | | const size_t symbols_per_histogram, |
490 | | const size_t max_histograms, |
491 | | const size_t sampling_stride_length, |
492 | | const double block_switch_cost, |
493 | | const BrotliEncoderParams* params, |
494 | 0 | BlockSplit* split) { |
495 | 0 | const size_t data_size = FN(HistogramDataSize)(); |
496 | 0 | HistogramType* histograms; |
497 | 0 | HistogramType* tmp; |
498 | | /* Calculate the number of histograms; the initial estimate is one histogram |
499 | | * per specified number of symbols; however, this value is capped. */ |
500 | 0 | size_t num_histograms = length / symbols_per_histogram + 1; |
501 | 0 | if (num_histograms > max_histograms) { |
502 | 0 | num_histograms = max_histograms; |
503 | 0 | } |
504 | | |
505 | | /* Corner case: no input. */ |
506 | 0 | if (length == 0) { |
507 | 0 | split->num_types = 1; |
508 | 0 | return; |
509 | 0 | } |
510 | | |
511 | 0 | if (length < kMinLengthForBlockSplitting) { |
512 | 0 | BROTLI_ENSURE_CAPACITY(m, uint8_t, |
513 | 0 | split->types, split->types_alloc_size, split->num_blocks + 1); |
514 | 0 | BROTLI_ENSURE_CAPACITY(m, uint32_t, |
515 | 0 | split->lengths, split->lengths_alloc_size, split->num_blocks + 1); |
516 | 0 | if (BROTLI_IS_OOM(m)) return; |
517 | 0 | split->num_types = 1; |
518 | 0 | split->types[split->num_blocks] = 0; |
519 | 0 | split->lengths[split->num_blocks] = (uint32_t)length; |
520 | 0 | split->num_blocks++; |
521 | 0 | return; |
522 | 0 | } |
523 | 0 | histograms = BROTLI_ALLOC(m, HistogramType, num_histograms + 1); |
524 | 0 | tmp = histograms + num_histograms; |
525 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(histograms)) return; |
526 | | /* Find good entropy codes. */ |
527 | 0 | FN(InitialEntropyCodes)(data, length, |
528 | 0 | sampling_stride_length, |
529 | 0 | num_histograms, histograms); |
530 | 0 | FN(RefineEntropyCodes)(data, length, |
531 | 0 | sampling_stride_length, |
532 | 0 | num_histograms, histograms, tmp); |
533 | 0 | { |
534 | | /* Find a good path through the data with the refined entropy codes. */ |
535 | 0 | uint8_t* block_ids = BROTLI_ALLOC(m, uint8_t, length); |
536 | 0 | size_t num_blocks = 0; |
537 | 0 | const size_t bitmaplen = (num_histograms + 7) >> 3; |
538 | 0 | double* insert_cost = BROTLI_ALLOC(m, double, data_size * num_histograms); |
539 | 0 | double* cost = BROTLI_ALLOC(m, double, num_histograms); |
540 | 0 | uint8_t* switch_signal = BROTLI_ALLOC(m, uint8_t, length * bitmaplen); |
541 | 0 | uint16_t* new_id = BROTLI_ALLOC(m, uint16_t, num_histograms); |
542 | 0 | const size_t iters = params->quality < HQ_ZOPFLIFICATION_QUALITY ? 3 : 10; |
543 | 0 | size_t i; |
544 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(block_ids) || |
545 | 0 | BROTLI_IS_NULL(insert_cost) || BROTLI_IS_NULL(cost) || |
546 | 0 | BROTLI_IS_NULL(switch_signal) || BROTLI_IS_NULL(new_id)) { |
547 | 0 | return; |
548 | 0 | } |
549 | 0 | for (i = 0; i < iters; ++i) { |
550 | 0 | num_blocks = FN(FindBlocks)(data, length, |
551 | 0 | block_switch_cost, |
552 | 0 | num_histograms, histograms, |
553 | 0 | insert_cost, cost, switch_signal, |
554 | 0 | block_ids); |
555 | 0 | num_histograms = FN(RemapBlockIds)(block_ids, length, |
556 | 0 | new_id, num_histograms); |
557 | 0 | FN(BuildBlockHistograms)(data, length, block_ids, |
558 | 0 | num_histograms, histograms); |
559 | 0 | } |
560 | 0 | BROTLI_FREE(m, insert_cost); |
561 | 0 | BROTLI_FREE(m, cost); |
562 | 0 | BROTLI_FREE(m, switch_signal); |
563 | 0 | BROTLI_FREE(m, new_id); |
564 | 0 | BROTLI_FREE(m, histograms); |
565 | 0 | FN(ClusterBlocks)(m, data, length, num_blocks, block_ids, split); |
566 | 0 | if (BROTLI_IS_OOM(m)) return; |
567 | 0 | BROTLI_FREE(m, block_ids); |
568 | 0 | } |
569 | 0 | } |
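For orientation, a hedged sketch of how this Literal instantiation is typically driven; the actual driver (BrotliSplitBlock in upstream Brotli) lies outside this excerpt, so the sequence below is a reconstruction from this file's helpers and constants, not covered code:

    /* Reconstruction; not part of the listing. */
    size_t literals_count = CountLiterals(cmds, num_commands);
    uint8_t* literals = BROTLI_ALLOC(m, uint8_t, literals_count);
    CopyLiteralsToByteArray(cmds, num_commands, data, pos, mask, literals);
    SplitByteVectorLiteral(m, literals, literals_count,
        kSymbolsPerLiteralHistogram, kMaxLiteralHistograms,
        kLiteralStrideLength, kLiteralBlockSwitchCost,
        params, literal_split);
    BROTLI_FREE(m, literals);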
570 | | |
571 | | #undef HistogramType |
572 | | #undef DataType |
573 | | #undef FN |
574 | | |
575 | 0 | #define FN(X) X ## Command |
576 | | #define DataType uint16_t |
577 | | /* NOLINTNEXTLINE(build/include) */ |
578 | | /* NOLINT(build/header_guard) */ |
579 | | /* Copyright 2013 Google Inc. All Rights Reserved. |
580 | | |
581 | | Distributed under MIT license. |
582 | | See file LICENSE for detail or copy at https://opensource.org/licenses/MIT |
583 | | */ |
584 | | |
585 | | /* template parameters: FN, DataType */ |
586 | | |
587 | 0 | #define HistogramType FN(Histogram) |
588 | | |
589 | | static void FN(InitialEntropyCodes)(const DataType* data, size_t length, |
590 | | size_t stride, |
591 | | size_t num_histograms, |
592 | 0 | HistogramType* histograms) { |
593 | 0 | uint32_t seed = 7; |
594 | 0 | size_t block_length = length / num_histograms; |
595 | 0 | size_t i; |
596 | 0 | FN(ClearHistograms)(histograms, num_histograms); |
597 | 0 | for (i = 0; i < num_histograms; ++i) { |
598 | 0 | size_t pos = length * i / num_histograms; |
599 | 0 | if (i != 0) { |
600 | 0 | pos += MyRand(&seed) % block_length; |
601 | 0 | } |
602 | 0 | if (pos + stride >= length) { |
603 | 0 | pos = length - stride - 1; |
604 | 0 | } |
605 | 0 | FN(HistogramAddVector)(&histograms[i], data + pos, stride); |
606 | 0 | } |
607 | 0 | } |
608 | | |
609 | | static void FN(RandomSample)(uint32_t* seed, |
610 | | const DataType* data, |
611 | | size_t length, |
612 | | size_t stride, |
613 | 0 | HistogramType* sample) { |
614 | 0 | size_t pos = 0; |
615 | 0 | if (stride >= length) { |
616 | 0 | stride = length; |
617 | 0 | } else { |
618 | 0 | pos = MyRand(seed) % (length - stride + 1); |
619 | 0 | } |
620 | 0 | FN(HistogramAddVector)(sample, data + pos, stride); |
621 | 0 | } |
622 | | |
623 | | static void FN(RefineEntropyCodes)(const DataType* data, size_t length, |
624 | | size_t stride, |
625 | | size_t num_histograms, |
626 | | HistogramType* histograms, |
627 | 0 | HistogramType* tmp) { |
628 | 0 | size_t iters = |
629 | 0 | kIterMulForRefining * length / stride + kMinItersForRefining; |
630 | 0 | uint32_t seed = 7; |
631 | 0 | size_t iter; |
632 | 0 | iters = ((iters + num_histograms - 1) / num_histograms) * num_histograms; |
633 | 0 | for (iter = 0; iter < iters; ++iter) { |
634 | 0 | FN(HistogramClear)(tmp); |
635 | 0 | FN(RandomSample)(&seed, data, length, stride, tmp); |
636 | 0 | FN(HistogramAddHistogram)(&histograms[iter % num_histograms], tmp); |
637 | 0 | } |
638 | 0 | } |
639 | | |
640 | | /* Assigns a block id from the range [0, num_histograms) to each data element |
641 | | in data[0..length) and fills in block_id[0..length) with the assigned values. |
642 | | Returns the number of blocks, i.e. one plus the number of block switches. */ |
643 | | static size_t FN(FindBlocks)(const DataType* data, const size_t length, |
644 | | const double block_switch_bitcost, |
645 | | const size_t num_histograms, |
646 | | const HistogramType* histograms, |
647 | | double* insert_cost, |
648 | | double* cost, |
649 | | uint8_t* switch_signal, |
650 | 0 | uint8_t* block_id) { |
651 | 0 | const size_t alphabet_size = FN(HistogramDataSize)(); |
652 | 0 | const size_t bitmap_len = (num_histograms + 7) >> 3; |
653 | 0 | size_t num_blocks = 1; |
654 | 0 | size_t byte_ix; |
655 | 0 | size_t i; |
656 | 0 | size_t j; |
657 | 0 | BROTLI_DCHECK(num_histograms <= 256); |
658 | | |
659 | | /* Trivial case: single histogram -> single block type. */ |
660 | 0 | if (num_histograms <= 1) { |
661 | 0 | for (i = 0; i < length; ++i) { |
662 | 0 | block_id[i] = 0; |
663 | 0 | } |
664 | 0 | return 1; |
665 | 0 | } |
666 | | |
667 | | /* Fill bitcost for each symbol of all histograms. |
668 | | * Non-existing symbol cost: 2 + log2(total_count). |
669 | | * Regular symbol cost: -log2(symbol_count / total_count). */ |
670 | 0 | memset(insert_cost, 0, |
671 | 0 | sizeof(insert_cost[0]) * alphabet_size * num_histograms); |
672 | 0 | for (i = 0; i < num_histograms; ++i) { |
673 | 0 | insert_cost[i] = FastLog2((uint32_t)histograms[i].total_count_); |
674 | 0 | } |
675 | 0 | for (i = alphabet_size; i != 0;) { |
676 | | /* Reverse order to use the 0-th row as a temporary storage. */ |
677 | 0 | --i; |
678 | 0 | for (j = 0; j < num_histograms; ++j) { |
679 | 0 | insert_cost[i * num_histograms + j] = |
680 | 0 | insert_cost[j] - BitCost(histograms[j].data_[i]); |
681 | 0 | } |
682 | 0 | } |
683 | | |
684 | | /* After each iteration of this loop, cost[k] will contain the difference |
685 | | between the minimum cost of arriving at the current byte position using |
686 | | entropy code k, and the minimum cost of arriving at the current byte |
687 | | position. This difference is capped at the block switch cost, and if it |
688 | | reaches block switch cost, it means that when we trace back from the last |
689 | | position, we need to switch here. */ |
690 | 0 | memset(cost, 0, sizeof(cost[0]) * num_histograms); |
691 | 0 | memset(switch_signal, 0, sizeof(switch_signal[0]) * length * bitmap_len); |
692 | 0 | for (byte_ix = 0; byte_ix < length; ++byte_ix) { |
693 | 0 | size_t ix = byte_ix * bitmap_len; |
694 | 0 | size_t symbol = data[byte_ix]; |
695 | 0 | size_t insert_cost_ix = symbol * num_histograms; |
696 | 0 | double min_cost = 1e99; |
697 | 0 | double block_switch_cost = block_switch_bitcost; |
698 | 0 | size_t k; |
699 | 0 | for (k = 0; k < num_histograms; ++k) { |
700 | | /* We are coding the symbol with entropy code k. */ |
701 | 0 | cost[k] += insert_cost[insert_cost_ix + k]; |
702 | 0 | if (cost[k] < min_cost) { |
703 | 0 | min_cost = cost[k]; |
704 | 0 | block_id[byte_ix] = (uint8_t)k; |
705 | 0 | } |
706 | 0 | } |
707 | | /* More blocks for the beginning. */ |
708 | 0 | if (byte_ix < 2000) { |
709 | 0 | block_switch_cost *= 0.77 + 0.07 * (double)byte_ix / 2000; |
710 | 0 | } |
711 | 0 | for (k = 0; k < num_histograms; ++k) { |
712 | 0 | cost[k] -= min_cost; |
713 | 0 | if (cost[k] >= block_switch_cost) { |
714 | 0 | const uint8_t mask = (uint8_t)(1u << (k & 7)); |
715 | 0 | cost[k] = block_switch_cost; |
716 | 0 | BROTLI_DCHECK((k >> 3) < bitmap_len); |
717 | 0 | switch_signal[ix + (k >> 3)] |= mask; |
718 | 0 | } |
719 | 0 | } |
720 | 0 | } |
721 | |
722 | 0 | byte_ix = length - 1; |
723 | 0 | { /* Trace back from the last position and switch at the marked places. */ |
724 | 0 | size_t ix = byte_ix * bitmap_len; |
725 | 0 | uint8_t cur_id = block_id[byte_ix]; |
726 | 0 | while (byte_ix > 0) { |
727 | 0 | const uint8_t mask = (uint8_t)(1u << (cur_id & 7)); |
728 | 0 | BROTLI_DCHECK(((size_t)cur_id >> 3) < bitmap_len); |
729 | 0 | --byte_ix; |
730 | 0 | ix -= bitmap_len; |
731 | 0 | if (switch_signal[ix + (cur_id >> 3)] & mask) { |
732 | 0 | if (cur_id != block_id[byte_ix]) { |
733 | 0 | cur_id = block_id[byte_ix]; |
734 | 0 | ++num_blocks; |
735 | 0 | } |
736 | 0 | } |
737 | 0 | block_id[byte_ix] = cur_id; |
738 | 0 | } |
739 | 0 | } |
740 | 0 | return num_blocks; |
741 | 0 | } |
742 | | |
743 | | static size_t FN(RemapBlockIds)(uint8_t* block_ids, const size_t length, |
744 | 0 | uint16_t* new_id, const size_t num_histograms) { |
745 | 0 | static const uint16_t kInvalidId = 256; |
746 | 0 | uint16_t next_id = 0; |
747 | 0 | size_t i; |
748 | 0 | for (i = 0; i < num_histograms; ++i) { |
749 | 0 | new_id[i] = kInvalidId; |
750 | 0 | } |
751 | 0 | for (i = 0; i < length; ++i) { |
752 | 0 | BROTLI_DCHECK(block_ids[i] < num_histograms); |
753 | 0 | if (new_id[block_ids[i]] == kInvalidId) { |
754 | 0 | new_id[block_ids[i]] = next_id++; |
755 | 0 | } |
756 | 0 | } |
757 | 0 | for (i = 0; i < length; ++i) { |
758 | 0 | block_ids[i] = (uint8_t)new_id[block_ids[i]]; |
759 | 0 | BROTLI_DCHECK(block_ids[i] < num_histograms); |
760 | 0 | } |
761 | 0 | BROTLI_DCHECK(next_id <= num_histograms); |
762 | 0 | return next_id; |
763 | 0 | } |
764 | | |
765 | | static void FN(BuildBlockHistograms)(const DataType* data, const size_t length, |
766 | | const uint8_t* block_ids, |
767 | | const size_t num_histograms, |
768 | 0 | HistogramType* histograms) { |
769 | 0 | size_t i; |
770 | 0 | FN(ClearHistograms)(histograms, num_histograms); |
771 | 0 | for (i = 0; i < length; ++i) { |
772 | 0 | FN(HistogramAdd)(&histograms[block_ids[i]], data[i]); |
773 | 0 | } |
774 | 0 | } |
775 | | |
776 | | /* Given the initial partitioning, build a partitioning with a limited |
777 | | * number of histograms (and block types). */ |
778 | | static void FN(ClusterBlocks)(MemoryManager* m, |
779 | | const DataType* data, const size_t length, |
780 | | const size_t num_blocks, |
781 | | uint8_t* block_ids, |
782 | 0 | BlockSplit* split) { |
783 | 0 | uint32_t* histogram_symbols = BROTLI_ALLOC(m, uint32_t, num_blocks); |
784 | 0 | uint32_t* u32 = |
785 | 0 | BROTLI_ALLOC(m, uint32_t, num_blocks + 4 * HISTOGRAMS_PER_BATCH); |
786 | 0 | const size_t expected_num_clusters = CLUSTERS_PER_BATCH * |
787 | 0 | (num_blocks + HISTOGRAMS_PER_BATCH - 1) / HISTOGRAMS_PER_BATCH; |
788 | 0 | size_t all_histograms_size = 0; |
789 | 0 | size_t all_histograms_capacity = expected_num_clusters; |
790 | 0 | HistogramType* all_histograms = |
791 | 0 | BROTLI_ALLOC(m, HistogramType, all_histograms_capacity); |
792 | 0 | size_t cluster_size_size = 0; |
793 | 0 | size_t cluster_size_capacity = expected_num_clusters; |
794 | 0 | uint32_t* cluster_size = BROTLI_ALLOC(m, uint32_t, cluster_size_capacity); |
795 | 0 | size_t num_clusters = 0; |
796 | 0 | HistogramType* histograms = BROTLI_ALLOC(m, HistogramType, |
797 | 0 | BROTLI_MIN(size_t, num_blocks, HISTOGRAMS_PER_BATCH)); |
798 | 0 | size_t max_num_pairs = |
799 | 0 | HISTOGRAMS_PER_BATCH * HISTOGRAMS_PER_BATCH / 2; |
800 | 0 | size_t pairs_capacity = max_num_pairs + 1; |
801 | 0 | HistogramPair* pairs = BROTLI_ALLOC(m, HistogramPair, pairs_capacity); |
802 | 0 | size_t pos = 0; |
803 | 0 | uint32_t* clusters; |
804 | 0 | size_t num_final_clusters; |
805 | 0 | static const uint32_t kInvalidIndex = BROTLI_UINT32_MAX; |
806 | 0 | uint32_t* new_index; |
807 | 0 | size_t i; |
808 | 0 | uint32_t* BROTLI_RESTRICT const sizes = u32 + 0 * HISTOGRAMS_PER_BATCH; |
809 | 0 | uint32_t* BROTLI_RESTRICT const new_clusters = u32 + 1 * HISTOGRAMS_PER_BATCH; |
810 | 0 | uint32_t* BROTLI_RESTRICT const symbols = u32 + 2 * HISTOGRAMS_PER_BATCH; |
811 | 0 | uint32_t* BROTLI_RESTRICT const remap = u32 + 3 * HISTOGRAMS_PER_BATCH; |
812 | 0 | uint32_t* BROTLI_RESTRICT const block_lengths = |
813 | 0 | u32 + 4 * HISTOGRAMS_PER_BATCH; |
814 | | /* TODO(eustas): move to arena? */ |
815 | 0 | HistogramType* tmp = BROTLI_ALLOC(m, HistogramType, 2); |
816 | |
817 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(histogram_symbols) || |
818 | 0 | BROTLI_IS_NULL(u32) || BROTLI_IS_NULL(all_histograms) || |
819 | 0 | BROTLI_IS_NULL(cluster_size) || BROTLI_IS_NULL(histograms) || |
820 | 0 | BROTLI_IS_NULL(pairs) || BROTLI_IS_NULL(tmp)) { |
821 | 0 | return; |
822 | 0 | } |
823 | | |
824 | 0 | memset(u32, 0, (num_blocks + 4 * HISTOGRAMS_PER_BATCH) * sizeof(uint32_t)); |
825 | | |
826 | | /* Calculate block lengths (convert repeating values -> series length). */ |
827 | 0 | { |
828 | 0 | size_t block_idx = 0; |
829 | 0 | for (i = 0; i < length; ++i) { |
830 | 0 | BROTLI_DCHECK(block_idx < num_blocks); |
831 | 0 | ++block_lengths[block_idx]; |
832 | 0 | if (i + 1 == length || block_ids[i] != block_ids[i + 1]) { |
833 | 0 | ++block_idx; |
834 | 0 | } |
835 | 0 | } |
836 | 0 | BROTLI_DCHECK(block_idx == num_blocks); |
837 | 0 | } |
838 | | |
839 | | /* Pre-cluster blocks (cluster batches). */ |
840 | 0 | for (i = 0; i < num_blocks; i += HISTOGRAMS_PER_BATCH) { |
841 | 0 | const size_t num_to_combine = |
842 | 0 | BROTLI_MIN(size_t, num_blocks - i, HISTOGRAMS_PER_BATCH); |
843 | 0 | size_t num_new_clusters; |
844 | 0 | size_t j; |
845 | 0 | for (j = 0; j < num_to_combine; ++j) { |
846 | 0 | size_t k; |
847 | 0 | size_t block_length = block_lengths[i + j]; |
848 | 0 | FN(HistogramClear)(&histograms[j]); |
849 | 0 | for (k = 0; k < block_length; ++k) { |
850 | 0 | FN(HistogramAdd)(&histograms[j], data[pos++]); |
851 | 0 | } |
852 | 0 | histograms[j].bit_cost_ = FN(BrotliPopulationCost)(&histograms[j]); |
853 | 0 | new_clusters[j] = (uint32_t)j; |
854 | 0 | symbols[j] = (uint32_t)j; |
855 | 0 | sizes[j] = 1; |
856 | 0 | } |
857 | 0 | num_new_clusters = FN(BrotliHistogramCombine)( |
858 | 0 | histograms, tmp, sizes, symbols, new_clusters, pairs, num_to_combine, |
859 | 0 | num_to_combine, HISTOGRAMS_PER_BATCH, max_num_pairs); |
860 | 0 | BROTLI_ENSURE_CAPACITY(m, HistogramType, all_histograms, |
861 | 0 | all_histograms_capacity, all_histograms_size + num_new_clusters); |
862 | 0 | BROTLI_ENSURE_CAPACITY(m, uint32_t, cluster_size, |
863 | 0 | cluster_size_capacity, cluster_size_size + num_new_clusters); |
864 | 0 | if (BROTLI_IS_OOM(m)) return; |
865 | 0 | for (j = 0; j < num_new_clusters; ++j) { |
866 | 0 | all_histograms[all_histograms_size++] = histograms[new_clusters[j]]; |
867 | 0 | cluster_size[cluster_size_size++] = sizes[new_clusters[j]]; |
868 | 0 | remap[new_clusters[j]] = (uint32_t)j; |
869 | 0 | } |
870 | 0 | for (j = 0; j < num_to_combine; ++j) { |
871 | 0 | histogram_symbols[i + j] = (uint32_t)num_clusters + remap[symbols[j]]; |
872 | 0 | } |
873 | 0 | num_clusters += num_new_clusters; |
874 | 0 | BROTLI_DCHECK(num_clusters == cluster_size_size); |
875 | 0 | BROTLI_DCHECK(num_clusters == all_histograms_size); |
876 | 0 | } |
877 | 0 | BROTLI_FREE(m, histograms); |
878 | | |
879 | | /* Final clustering. */ |
880 | 0 | max_num_pairs = |
881 | 0 | BROTLI_MIN(size_t, 64 * num_clusters, (num_clusters / 2) * num_clusters); |
882 | 0 | if (pairs_capacity < max_num_pairs + 1) { |
883 | 0 | BROTLI_FREE(m, pairs); |
884 | 0 | pairs = BROTLI_ALLOC(m, HistogramPair, max_num_pairs + 1); |
885 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(pairs)) return; |
886 | 0 | } |
887 | 0 | clusters = BROTLI_ALLOC(m, uint32_t, num_clusters); |
888 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(clusters)) return; |
889 | 0 | for (i = 0; i < num_clusters; ++i) { |
890 | 0 | clusters[i] = (uint32_t)i; |
891 | 0 | } |
892 | 0 | num_final_clusters = FN(BrotliHistogramCombine)( |
893 | 0 | all_histograms, tmp, cluster_size, histogram_symbols, clusters, pairs, |
894 | 0 | num_clusters, num_blocks, BROTLI_MAX_NUMBER_OF_BLOCK_TYPES, |
895 | 0 | max_num_pairs); |
896 | 0 | BROTLI_FREE(m, pairs); |
897 | 0 | BROTLI_FREE(m, cluster_size); |
898 | | |
899 | | /* Assign blocks to final histograms. */ |
900 | 0 | new_index = BROTLI_ALLOC(m, uint32_t, num_clusters); |
901 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(new_index)) return; |
902 | 0 | for (i = 0; i < num_clusters; ++i) new_index[i] = kInvalidIndex; |
903 | 0 | pos = 0; |
904 | 0 | { |
905 | 0 | uint32_t next_index = 0; |
906 | 0 | for (i = 0; i < num_blocks; ++i) { |
907 | 0 | size_t j; |
908 | 0 | uint32_t best_out; |
909 | 0 | double best_bits; |
910 | 0 | FN(HistogramClear)(tmp); |
911 | 0 | for (j = 0; j < block_lengths[i]; ++j) { |
912 | 0 | FN(HistogramAdd)(tmp, data[pos++]); |
913 | 0 | } |
914 | | /* Among equally good histograms prefer last used. */ |
915 | | /* TODO(eustas): should we give a block-switch discount here? */ |
916 | 0 | best_out = (i == 0) ? histogram_symbols[0] : histogram_symbols[i - 1]; |
917 | 0 | best_bits = FN(BrotliHistogramBitCostDistance)( |
918 | 0 | tmp, &all_histograms[best_out], tmp + 1); |
919 | 0 | for (j = 0; j < num_final_clusters; ++j) { |
920 | 0 | const double cur_bits = FN(BrotliHistogramBitCostDistance)( |
921 | 0 | tmp, &all_histograms[clusters[j]], tmp + 1); |
922 | 0 | if (cur_bits < best_bits) { |
923 | 0 | best_bits = cur_bits; |
924 | 0 | best_out = clusters[j]; |
925 | 0 | } |
926 | 0 | } |
927 | 0 | histogram_symbols[i] = best_out; |
928 | 0 | if (new_index[best_out] == kInvalidIndex) { |
929 | 0 | new_index[best_out] = next_index++; |
930 | 0 | } |
931 | 0 | } |
932 | 0 | } |
933 | 0 | BROTLI_FREE(m, tmp); |
934 | 0 | BROTLI_FREE(m, clusters); |
935 | 0 | BROTLI_FREE(m, all_histograms); |
936 | 0 | BROTLI_ENSURE_CAPACITY( |
937 | 0 | m, uint8_t, split->types, split->types_alloc_size, num_blocks); |
938 | 0 | BROTLI_ENSURE_CAPACITY( |
939 | 0 | m, uint32_t, split->lengths, split->lengths_alloc_size, num_blocks); |
940 | 0 | if (BROTLI_IS_OOM(m)) return; |
941 | | |
942 | | /* Rewrite final assignment to block-split. There might be fewer blocks |
943 | | * than |num_blocks| due to clustering. */ |
944 | 0 | { |
945 | 0 | uint32_t cur_length = 0; |
946 | 0 | size_t block_idx = 0; |
947 | 0 | uint8_t max_type = 0; |
948 | 0 | for (i = 0; i < num_blocks; ++i) { |
949 | 0 | cur_length += block_lengths[i]; |
950 | 0 | if (i + 1 == num_blocks || |
951 | 0 | histogram_symbols[i] != histogram_symbols[i + 1]) { |
952 | 0 | const uint8_t id = (uint8_t)new_index[histogram_symbols[i]]; |
953 | 0 | split->types[block_idx] = id; |
954 | 0 | split->lengths[block_idx] = cur_length; |
955 | 0 | max_type = BROTLI_MAX(uint8_t, max_type, id); |
956 | 0 | cur_length = 0; |
957 | 0 | ++block_idx; |
958 | 0 | } |
959 | 0 | } |
960 | 0 | split->num_blocks = block_idx; |
961 | 0 | split->num_types = (size_t)max_type + 1; |
962 | 0 | } |
963 | 0 | BROTLI_FREE(m, new_index); |
964 | 0 | BROTLI_FREE(m, u32); |
965 | 0 | BROTLI_FREE(m, histogram_symbols); |
966 | 0 | } |
967 | | |
968 | | /* Create BlockSplit (partitioning) given the limits, estimates and "effort" |
969 | | * parameters. |
970 | | * |
971 | | * NB: max_histograms is often less than the number of histograms allowed by the format; |
972 | | * this is done intentionally, to save some "space" for context-aware |
973 | | * clustering (here entropy is estimated for context-free symbols). */ |
974 | | static void FN(SplitByteVector)(MemoryManager* m, |
975 | | const DataType* data, const size_t length, |
976 | | const size_t symbols_per_histogram, |
977 | | const size_t max_histograms, |
978 | | const size_t sampling_stride_length, |
979 | | const double block_switch_cost, |
980 | | const BrotliEncoderParams* params, |
981 | 0 | BlockSplit* split) { |
982 | 0 | const size_t data_size = FN(HistogramDataSize)(); |
983 | 0 | HistogramType* histograms; |
984 | 0 | HistogramType* tmp; |
985 | | /* Calculate the number of histograms; the initial estimate is one histogram |
986 | | * per specified number of symbols; however, this value is capped. */ |
987 | 0 | size_t num_histograms = length / symbols_per_histogram + 1; |
988 | 0 | if (num_histograms > max_histograms) { |
989 | 0 | num_histograms = max_histograms; |
990 | 0 | } |
991 | | |
992 | | /* Corner case: no input. */ |
993 | 0 | if (length == 0) { |
994 | 0 | split->num_types = 1; |
995 | 0 | return; |
996 | 0 | } |
997 | | |
998 | 0 | if (length < kMinLengthForBlockSplitting) { |
999 | 0 | BROTLI_ENSURE_CAPACITY(m, uint8_t, |
1000 | 0 | split->types, split->types_alloc_size, split->num_blocks + 1); |
1001 | 0 | BROTLI_ENSURE_CAPACITY(m, uint32_t, |
1002 | 0 | split->lengths, split->lengths_alloc_size, split->num_blocks + 1); |
1003 | 0 | if (BROTLI_IS_OOM(m)) return; |
1004 | 0 | split->num_types = 1; |
1005 | 0 | split->types[split->num_blocks] = 0; |
1006 | 0 | split->lengths[split->num_blocks] = (uint32_t)length; |
1007 | 0 | split->num_blocks++; |
1008 | 0 | return; |
1009 | 0 | } |
1010 | 0 | histograms = BROTLI_ALLOC(m, HistogramType, num_histograms + 1); |
1011 | 0 | tmp = histograms + num_histograms; |
1012 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(histograms)) return; |
1013 | | /* Find good entropy codes. */ |
1014 | 0 | FN(InitialEntropyCodes)(data, length, |
1015 | 0 | sampling_stride_length, |
1016 | 0 | num_histograms, histograms); |
1017 | 0 | FN(RefineEntropyCodes)(data, length, |
1018 | 0 | sampling_stride_length, |
1019 | 0 | num_histograms, histograms, tmp); |
1020 | 0 | { |
1021 | | /* Find a good path through the data with the refined entropy codes. */ |
1022 | 0 | uint8_t* block_ids = BROTLI_ALLOC(m, uint8_t, length); |
1023 | 0 | size_t num_blocks = 0; |
1024 | 0 | const size_t bitmaplen = (num_histograms + 7) >> 3; |
1025 | 0 | double* insert_cost = BROTLI_ALLOC(m, double, data_size * num_histograms); |
1026 | 0 | double* cost = BROTLI_ALLOC(m, double, num_histograms); |
1027 | 0 | uint8_t* switch_signal = BROTLI_ALLOC(m, uint8_t, length * bitmaplen); |
1028 | 0 | uint16_t* new_id = BROTLI_ALLOC(m, uint16_t, num_histograms); |
1029 | 0 | const size_t iters = params->quality < HQ_ZOPFLIFICATION_QUALITY ? 3 : 10; |
1030 | 0 | size_t i; |
1031 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(block_ids) || |
1032 | 0 | BROTLI_IS_NULL(insert_cost) || BROTLI_IS_NULL(cost) || |
1033 | 0 | BROTLI_IS_NULL(switch_signal) || BROTLI_IS_NULL(new_id)) { |
1034 | 0 | return; |
1035 | 0 | } |
1036 | 0 | for (i = 0; i < iters; ++i) { |
1037 | 0 | num_blocks = FN(FindBlocks)(data, length, |
1038 | 0 | block_switch_cost, |
1039 | 0 | num_histograms, histograms, |
1040 | 0 | insert_cost, cost, switch_signal, |
1041 | 0 | block_ids); |
1042 | 0 | num_histograms = FN(RemapBlockIds)(block_ids, length, |
1043 | 0 | new_id, num_histograms); |
1044 | 0 | FN(BuildBlockHistograms)(data, length, block_ids, |
1045 | 0 | num_histograms, histograms); |
1046 | 0 | } |
1047 | 0 | BROTLI_FREE(m, insert_cost); |
1048 | 0 | BROTLI_FREE(m, cost); |
1049 | 0 | BROTLI_FREE(m, switch_signal); |
1050 | 0 | BROTLI_FREE(m, new_id); |
1051 | 0 | BROTLI_FREE(m, histograms); |
1052 | 0 | FN(ClusterBlocks)(m, data, length, num_blocks, block_ids, split); |
1053 | 0 | if (BROTLI_IS_OOM(m)) return; |
1054 | 0 | BROTLI_FREE(m, block_ids); |
1055 | 0 | } |
1056 | 0 | } |
1057 | | |
1058 | | #undef HistogramType |
1059 | | #undef FN |
1060 | | |
1061 | 0 | #define FN(X) X ## Distance |
1062 | | /* NOLINTNEXTLINE(build/include) */ |
1063 | | /* NOLINT(build/header_guard) */ |
1064 | | /* Copyright 2013 Google Inc. All Rights Reserved. |
1065 | | |
1066 | | Distributed under MIT license. |
1067 | | See file LICENSE for detail or copy at https://opensource.org/licenses/MIT |
1068 | | */ |
1069 | | |
1070 | | /* template parameters: FN, DataType */ |
1071 | | |
1072 | 0 | #define HistogramType FN(Histogram) |
1073 | | |
1074 | | static void FN(InitialEntropyCodes)(const DataType* data, size_t length, |
1075 | | size_t stride, |
1076 | | size_t num_histograms, |
1077 | 0 | HistogramType* histograms) { |
1078 | 0 | uint32_t seed = 7; |
1079 | 0 | size_t block_length = length / num_histograms; |
1080 | 0 | size_t i; |
1081 | 0 | FN(ClearHistograms)(histograms, num_histograms); |
1082 | 0 | for (i = 0; i < num_histograms; ++i) { |
1083 | 0 | size_t pos = length * i / num_histograms; |
1084 | 0 | if (i != 0) { |
1085 | 0 | pos += MyRand(&seed) % block_length; |
1086 | 0 | } |
1087 | 0 | if (pos + stride >= length) { |
1088 | 0 | pos = length - stride - 1; |
1089 | 0 | } |
1090 | 0 | FN(HistogramAddVector)(&histograms[i], data + pos, stride); |
1091 | 0 | } |
1092 | 0 | } |
1093 | | |
1094 | | static void FN(RandomSample)(uint32_t* seed, |
1095 | | const DataType* data, |
1096 | | size_t length, |
1097 | | size_t stride, |
1098 | 0 | HistogramType* sample) { |
1099 | 0 | size_t pos = 0; |
1100 | 0 | if (stride >= length) { |
1101 | 0 | stride = length; |
1102 | 0 | } else { |
1103 | 0 | pos = MyRand(seed) % (length - stride + 1); |
1104 | 0 | } |
1105 | 0 | FN(HistogramAddVector)(sample, data + pos, stride); |
1106 | 0 | } |
1107 | | |
1108 | | static void FN(RefineEntropyCodes)(const DataType* data, size_t length, |
1109 | | size_t stride, |
1110 | | size_t num_histograms, |
1111 | | HistogramType* histograms, |
1112 | 0 | HistogramType* tmp) { |
1113 | 0 | size_t iters = |
1114 | 0 | kIterMulForRefining * length / stride + kMinItersForRefining; |
1115 | 0 | uint32_t seed = 7; |
1116 | 0 | size_t iter; |
1117 | 0 | iters = ((iters + num_histograms - 1) / num_histograms) * num_histograms; |
1118 | 0 | for (iter = 0; iter < iters; ++iter) { |
1119 | 0 | FN(HistogramClear)(tmp); |
1120 | 0 | FN(RandomSample)(&seed, data, length, stride, tmp); |
1121 | 0 | FN(HistogramAddHistogram)(&histograms[iter % num_histograms], tmp); |
1122 | 0 | } |
1123 | 0 | } |
1124 | | |
1125 | | /* Assigns a block id from the range [0, num_histograms) to each data element |
1126 | | in data[0..length) and fills in block_id[0..length) with the assigned values. |
1127 | | Returns the number of blocks, i.e. one plus the number of block switches. */ |
1128 | | static size_t FN(FindBlocks)(const DataType* data, const size_t length, |
1129 | | const double block_switch_bitcost, |
1130 | | const size_t num_histograms, |
1131 | | const HistogramType* histograms, |
1132 | | double* insert_cost, |
1133 | | double* cost, |
1134 | | uint8_t* switch_signal, |
1135 | 0 | uint8_t* block_id) { |
1136 | 0 | const size_t alphabet_size = FN(HistogramDataSize)(); |
1137 | 0 | const size_t bitmap_len = (num_histograms + 7) >> 3; |
1138 | 0 | size_t num_blocks = 1; |
1139 | 0 | size_t byte_ix; |
1140 | 0 | size_t i; |
1141 | 0 | size_t j; |
1142 | 0 | BROTLI_DCHECK(num_histograms <= 256); |
1143 | | |
1144 | | /* Trivial case: single histogram -> single block type. */ |
1145 | 0 | if (num_histograms <= 1) { |
1146 | 0 | for (i = 0; i < length; ++i) { |
1147 | 0 | block_id[i] = 0; |
1148 | 0 | } |
1149 | 0 | return 1; |
1150 | 0 | } |
1151 | | |
1152 | | /* Fill bitcost for each symbol of all histograms. |
1153 | | * Non-existing symbol cost: 2 + log2(total_count). |
1154 | | * Regular symbol cost: -log2(symbol_count / total_count). */ |
1155 | 0 | memset(insert_cost, 0, |
1156 | 0 | sizeof(insert_cost[0]) * alphabet_size * num_histograms); |
1157 | 0 | for (i = 0; i < num_histograms; ++i) { |
1158 | 0 | insert_cost[i] = FastLog2((uint32_t)histograms[i].total_count_); |
1159 | 0 | } |
1160 | 0 | for (i = alphabet_size; i != 0;) { |
1161 | | /* Reverse order to use the 0-th row as a temporary storage. */ |
1162 | 0 | --i; |
1163 | 0 | for (j = 0; j < num_histograms; ++j) { |
1164 | 0 | insert_cost[i * num_histograms + j] = |
1165 | 0 | insert_cost[j] - BitCost(histograms[j].data_[i]); |
1166 | 0 | } |
1167 | 0 | } |
1168 | | |
1169 | | /* After each iteration of this loop, cost[k] will contain the difference |
1170 | | between the minimum cost of arriving at the current byte position using |
1171 | | entropy code k, and the minimum cost of arriving at the current byte |
1172 | | position. This difference is capped at the block switch cost, and if it |
1173 | | reaches block switch cost, it means that when we trace back from the last |
1174 | | position, we need to switch here. */ |
1175 | 0 | memset(cost, 0, sizeof(cost[0]) * num_histograms); |
1176 | 0 | memset(switch_signal, 0, sizeof(switch_signal[0]) * length * bitmap_len); |
1177 | 0 | for (byte_ix = 0; byte_ix < length; ++byte_ix) { |
1178 | 0 | size_t ix = byte_ix * bitmap_len; |
1179 | 0 | size_t symbol = data[byte_ix]; |
1180 | 0 | size_t insert_cost_ix = symbol * num_histograms; |
1181 | 0 | double min_cost = 1e99; |
1182 | 0 | double block_switch_cost = block_switch_bitcost; |
1183 | 0 | size_t k; |
1184 | 0 | for (k = 0; k < num_histograms; ++k) { |
1185 | | /* We are coding the symbol with entropy code k. */ |
1186 | 0 | cost[k] += insert_cost[insert_cost_ix + k]; |
1187 | 0 | if (cost[k] < min_cost) { |
1188 | 0 | min_cost = cost[k]; |
1189 | 0 | block_id[byte_ix] = (uint8_t)k; |
1190 | 0 | } |
1191 | 0 | } |
1192 | | /* More blocks for the beginning. */ |
1193 | 0 | if (byte_ix < 2000) { |
1194 | 0 | block_switch_cost *= 0.77 + 0.07 * (double)byte_ix / 2000; |
1195 | 0 | } |
1196 | 0 | for (k = 0; k < num_histograms; ++k) { |
1197 | 0 | cost[k] -= min_cost; |
1198 | 0 | if (cost[k] >= block_switch_cost) { |
1199 | 0 | const uint8_t mask = (uint8_t)(1u << (k & 7)); |
1200 | 0 | cost[k] = block_switch_cost; |
1201 | 0 | BROTLI_DCHECK((k >> 3) < bitmap_len); |
1202 | 0 | switch_signal[ix + (k >> 3)] |= mask; |
1203 | 0 | } |
1204 | 0 | } |
1205 | 0 | } |
1206 | |
1207 | 0 | byte_ix = length - 1; |
1208 | 0 | { /* Trace back from the last position and switch at the marked places. */ |
1209 | 0 | size_t ix = byte_ix * bitmap_len; |
1210 | 0 | uint8_t cur_id = block_id[byte_ix]; |
1211 | 0 | while (byte_ix > 0) { |
1212 | 0 | const uint8_t mask = (uint8_t)(1u << (cur_id & 7)); |
1213 | 0 | BROTLI_DCHECK(((size_t)cur_id >> 3) < bitmap_len); |
1214 | 0 | --byte_ix; |
1215 | 0 | ix -= bitmap_len; |
1216 | 0 | if (switch_signal[ix + (cur_id >> 3)] & mask) { |
1217 | 0 | if (cur_id != block_id[byte_ix]) { |
1218 | 0 | cur_id = block_id[byte_ix]; |
1219 | 0 | ++num_blocks; |
1220 | 0 | } |
1221 | 0 | } |
1222 | 0 | block_id[byte_ix] = cur_id; |
1223 | 0 | } |
1224 | 0 | } |
1225 | 0 | return num_blocks; |
1226 | 0 | } |
1227 | | |
1228 | | static size_t FN(RemapBlockIds)(uint8_t* block_ids, const size_t length, |
1229 | 0 | uint16_t* new_id, const size_t num_histograms) { |
1230 | 0 | static const uint16_t kInvalidId = 256; |
1231 | 0 | uint16_t next_id = 0; |
1232 | 0 | size_t i; |
1233 | 0 | for (i = 0; i < num_histograms; ++i) { |
1234 | 0 | new_id[i] = kInvalidId; |
1235 | 0 | } |
1236 | 0 | for (i = 0; i < length; ++i) { |
1237 | 0 | BROTLI_DCHECK(block_ids[i] < num_histograms); |
1238 | 0 | if (new_id[block_ids[i]] == kInvalidId) { |
1239 | 0 | new_id[block_ids[i]] = next_id++; |
1240 | 0 | } |
1241 | 0 | } |
1242 | 0 | for (i = 0; i < length; ++i) { |
1243 | 0 | block_ids[i] = (uint8_t)new_id[block_ids[i]]; |
1244 | 0 | BROTLI_DCHECK(block_ids[i] < num_histograms); |
1245 | 0 | } |
1246 | 0 | BROTLI_DCHECK(next_id <= num_histograms); |
1247 | 0 | return next_id; |
1248 | 0 | } |
1249 | | |
1250 | | static void FN(BuildBlockHistograms)(const DataType* data, const size_t length, |
1251 | | const uint8_t* block_ids, |
1252 | | const size_t num_histograms, |
1253 | 0 | HistogramType* histograms) { |
1254 | 0 | size_t i; |
1255 | 0 | FN(ClearHistograms)(histograms, num_histograms); |
1256 | 0 | for (i = 0; i < length; ++i) { |
1257 | 0 | FN(HistogramAdd)(&histograms[block_ids[i]], data[i]); |
1258 | 0 | } |
1259 | 0 | } |
1260 | | |
1261 | | /* Given the initial partitioning, build a partitioning with a limited |
1262 | | * number of histograms (and block types). */ |
1263 | | static void FN(ClusterBlocks)(MemoryManager* m, |
1264 | | const DataType* data, const size_t length, |
1265 | | const size_t num_blocks, |
1266 | | uint8_t* block_ids, |
1267 | 0 | BlockSplit* split) { |
1268 | 0 | uint32_t* histogram_symbols = BROTLI_ALLOC(m, uint32_t, num_blocks); |
1269 | 0 | uint32_t* u32 = |
1270 | 0 | BROTLI_ALLOC(m, uint32_t, num_blocks + 4 * HISTOGRAMS_PER_BATCH); |
1271 | 0 | const size_t expected_num_clusters = CLUSTERS_PER_BATCH * |
1272 | 0 | (num_blocks + HISTOGRAMS_PER_BATCH - 1) / HISTOGRAMS_PER_BATCH; |
1273 | 0 | size_t all_histograms_size = 0; |
1274 | 0 | size_t all_histograms_capacity = expected_num_clusters; |
1275 | 0 | HistogramType* all_histograms = |
1276 | 0 | BROTLI_ALLOC(m, HistogramType, all_histograms_capacity); |
1277 | 0 | size_t cluster_size_size = 0; |
1278 | 0 | size_t cluster_size_capacity = expected_num_clusters; |
1279 | 0 | uint32_t* cluster_size = BROTLI_ALLOC(m, uint32_t, cluster_size_capacity); |
1280 | 0 | size_t num_clusters = 0; |
1281 | 0 | HistogramType* histograms = BROTLI_ALLOC(m, HistogramType, |
1282 | 0 | BROTLI_MIN(size_t, num_blocks, HISTOGRAMS_PER_BATCH)); |
1283 | 0 | size_t max_num_pairs = |
1284 | 0 | HISTOGRAMS_PER_BATCH * HISTOGRAMS_PER_BATCH / 2; |
1285 | 0 | size_t pairs_capacity = max_num_pairs + 1; |
1286 | 0 | HistogramPair* pairs = BROTLI_ALLOC(m, HistogramPair, pairs_capacity); |
1287 | 0 | size_t pos = 0; |
1288 | 0 | uint32_t* clusters; |
1289 | 0 | size_t num_final_clusters; |
1290 | 0 | static const uint32_t kInvalidIndex = BROTLI_UINT32_MAX; |
1291 | 0 | uint32_t* new_index; |
1292 | 0 | size_t i; |
1293 | 0 | uint32_t* BROTLI_RESTRICT const sizes = u32 + 0 * HISTOGRAMS_PER_BATCH; |
1294 | 0 | uint32_t* BROTLI_RESTRICT const new_clusters = u32 + 1 * HISTOGRAMS_PER_BATCH; |
1295 | 0 | uint32_t* BROTLI_RESTRICT const symbols = u32 + 2 * HISTOGRAMS_PER_BATCH; |
1296 | 0 | uint32_t* BROTLI_RESTRICT const remap = u32 + 3 * HISTOGRAMS_PER_BATCH; |
1297 | 0 | uint32_t* BROTLI_RESTRICT const block_lengths = |
1298 | 0 | u32 + 4 * HISTOGRAMS_PER_BATCH; |
1299 | | /* TODO(eustas): move to arena? */ |
1300 | 0 | HistogramType* tmp = BROTLI_ALLOC(m, HistogramType, 2); |
1301 | |
1302 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(histogram_symbols) || |
1303 | 0 | BROTLI_IS_NULL(u32) || BROTLI_IS_NULL(all_histograms) || |
1304 | 0 | BROTLI_IS_NULL(cluster_size) || BROTLI_IS_NULL(histograms) || |
1305 | 0 | BROTLI_IS_NULL(pairs) || BROTLI_IS_NULL(tmp)) { |
1306 | 0 | return; |
1307 | 0 | } |
1308 | | |
1309 | 0 | memset(u32, 0, (num_blocks + 4 * HISTOGRAMS_PER_BATCH) * sizeof(uint32_t)); |
1310 | | |
1311 | | /* Calculate block lengths (convert repeating values -> series length). */ |
1312 | 0 | { |
1313 | 0 | size_t block_idx = 0; |
1314 | 0 | for (i = 0; i < length; ++i) { |
1315 | 0 | BROTLI_DCHECK(block_idx < num_blocks); |
1316 | 0 | ++block_lengths[block_idx]; |
1317 | 0 | if (i + 1 == length || block_ids[i] != block_ids[i + 1]) { |
1318 | 0 | ++block_idx; |
1319 | 0 | } |
1320 | 0 | } |
1321 | 0 | BROTLI_DCHECK(block_idx == num_blocks); |
1322 | 0 | } |
1323 | | |
1324 | | /* Pre-cluster blocks (cluster batches). */ |
1325 | 0 | for (i = 0; i < num_blocks; i += HISTOGRAMS_PER_BATCH) { |
1326 | 0 | const size_t num_to_combine = |
1327 | 0 | BROTLI_MIN(size_t, num_blocks - i, HISTOGRAMS_PER_BATCH); |
1328 | 0 | size_t num_new_clusters; |
1329 | 0 | size_t j; |
1330 | 0 | for (j = 0; j < num_to_combine; ++j) { |
1331 | 0 | size_t k; |
1332 | 0 | size_t block_length = block_lengths[i + j]; |
1333 | 0 | FN(HistogramClear)(&histograms[j]); |
1334 | 0 | for (k = 0; k < block_length; ++k) { |
1335 | 0 | FN(HistogramAdd)(&histograms[j], data[pos++]); |
1336 | 0 | } |
1337 | 0 | histograms[j].bit_cost_ = FN(BrotliPopulationCost)(&histograms[j]); |
1338 | 0 | new_clusters[j] = (uint32_t)j; |
1339 | 0 | symbols[j] = (uint32_t)j; |
1340 | 0 | sizes[j] = 1; |
1341 | 0 | } |
1342 | 0 | num_new_clusters = FN(BrotliHistogramCombine)( |
1343 | 0 | histograms, tmp, sizes, symbols, new_clusters, pairs, num_to_combine, |
1344 | 0 | num_to_combine, HISTOGRAMS_PER_BATCH, max_num_pairs); |
1345 | 0 | BROTLI_ENSURE_CAPACITY(m, HistogramType, all_histograms, |
1346 | 0 | all_histograms_capacity, all_histograms_size + num_new_clusters); |
1347 | 0 | BROTLI_ENSURE_CAPACITY(m, uint32_t, cluster_size, |
1348 | 0 | cluster_size_capacity, cluster_size_size + num_new_clusters); |
1349 | 0 | if (BROTLI_IS_OOM(m)) return; |
1350 | 0 | for (j = 0; j < num_new_clusters; ++j) { |
1351 | 0 | all_histograms[all_histograms_size++] = histograms[new_clusters[j]]; |
1352 | 0 | cluster_size[cluster_size_size++] = sizes[new_clusters[j]]; |
1353 | 0 | remap[new_clusters[j]] = (uint32_t)j; |
1354 | 0 | } |
1355 | 0 | for (j = 0; j < num_to_combine; ++j) { |
1356 | 0 | histogram_symbols[i + j] = (uint32_t)num_clusters + remap[symbols[j]]; |
1357 | 0 | } |
1358 | 0 | num_clusters += num_new_clusters; |
1359 | 0 | BROTLI_DCHECK(num_clusters == cluster_size_size); |
1360 | 0 | BROTLI_DCHECK(num_clusters == all_histograms_size); |
1361 | 0 | } |
1362 | 0 | BROTLI_FREE(m, histograms); |
1363 | | |
1364 | | /* Final clustering. */ |
1365 | 0 | max_num_pairs = |
1366 | 0 | BROTLI_MIN(size_t, 64 * num_clusters, (num_clusters / 2) * num_clusters); |
1367 | 0 | if (pairs_capacity < max_num_pairs + 1) { |
1368 | 0 | BROTLI_FREE(m, pairs); |
1369 | 0 | pairs = BROTLI_ALLOC(m, HistogramPair, max_num_pairs + 1); |
1370 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(pairs)) return; |
1371 | 0 | } |
1372 | 0 | clusters = BROTLI_ALLOC(m, uint32_t, num_clusters); |
1373 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(clusters)) return; |
1374 | 0 | for (i = 0; i < num_clusters; ++i) { |
1375 | 0 | clusters[i] = (uint32_t)i; |
1376 | 0 | } |
1377 | 0 | num_final_clusters = FN(BrotliHistogramCombine)( |
1378 | 0 | all_histograms, tmp, cluster_size, histogram_symbols, clusters, pairs, |
1379 | 0 | num_clusters, num_blocks, BROTLI_MAX_NUMBER_OF_BLOCK_TYPES, |
1380 | 0 | max_num_pairs); |
1381 | 0 | BROTLI_FREE(m, pairs); |
1382 | 0 | BROTLI_FREE(m, cluster_size); |
1383 | | |
1384 | | /* Assign blocks to final histograms. */ |
1385 | 0 | new_index = BROTLI_ALLOC(m, uint32_t, num_clusters); |
1386 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(new_index)) return; |
1387 | 0 | for (i = 0; i < num_clusters; ++i) new_index[i] = kInvalidIndex; |
1388 | 0 | pos = 0; |
1389 | 0 | { |
1390 | 0 | uint32_t next_index = 0; |
1391 | 0 | for (i = 0; i < num_blocks; ++i) { |
1392 | 0 | size_t j; |
1393 | 0 | uint32_t best_out; |
1394 | 0 | double best_bits; |
1395 | 0 | FN(HistogramClear)(tmp); |
1396 | 0 | for (j = 0; j < block_lengths[i]; ++j) { |
1397 | 0 | FN(HistogramAdd)(tmp, data[pos++]); |
1398 | 0 | } |
1399 | | /* Among equally good histograms prefer last used. */ |
1400 | | /* TODO(eustas): should we give a block-switch discount here? */ |
1401 | 0 | best_out = (i == 0) ? histogram_symbols[0] : histogram_symbols[i - 1]; |
1402 | 0 | best_bits = FN(BrotliHistogramBitCostDistance)( |
1403 | 0 | tmp, &all_histograms[best_out], tmp + 1); |
1404 | 0 | for (j = 0; j < num_final_clusters; ++j) { |
1405 | 0 | const double cur_bits = FN(BrotliHistogramBitCostDistance)( |
1406 | 0 | tmp, &all_histograms[clusters[j]], tmp + 1); |
1407 | 0 | if (cur_bits < best_bits) { |
1408 | 0 | best_bits = cur_bits; |
1409 | 0 | best_out = clusters[j]; |
1410 | 0 | } |
1411 | 0 | } |
1412 | 0 | histogram_symbols[i] = best_out; |
1413 | 0 | if (new_index[best_out] == kInvalidIndex) { |
1414 | 0 | new_index[best_out] = next_index++; |
1415 | 0 | } |
1416 | 0 | } |
1417 | 0 | } |
1418 | 0 | BROTLI_FREE(m, tmp); |
1419 | 0 | BROTLI_FREE(m, clusters); |
1420 | 0 | BROTLI_FREE(m, all_histograms); |
1421 | 0 | BROTLI_ENSURE_CAPACITY( |
1422 | 0 | m, uint8_t, split->types, split->types_alloc_size, num_blocks); |
1423 | 0 | BROTLI_ENSURE_CAPACITY( |
1424 | 0 | m, uint32_t, split->lengths, split->lengths_alloc_size, num_blocks); |
1425 | 0 | if (BROTLI_IS_OOM(m)) return; |
1426 | | |
1427 | | /* Rewrite final assignment to block-split. There might be fewer blocks
1428 | | * than |num_blocks| due to clustering. */ |
1429 | 0 | { |
1430 | 0 | uint32_t cur_length = 0; |
1431 | 0 | size_t block_idx = 0; |
1432 | 0 | uint8_t max_type = 0; |
1433 | 0 | for (i = 0; i < num_blocks; ++i) { |
1434 | 0 | cur_length += block_lengths[i]; |
1435 | 0 | if (i + 1 == num_blocks || |
1436 | 0 | histogram_symbols[i] != histogram_symbols[i + 1]) { |
1437 | 0 | const uint8_t id = (uint8_t)new_index[histogram_symbols[i]]; |
1438 | 0 | split->types[block_idx] = id; |
1439 | 0 | split->lengths[block_idx] = cur_length; |
1440 | 0 | max_type = BROTLI_MAX(uint8_t, max_type, id); |
1441 | 0 | cur_length = 0; |
1442 | 0 | ++block_idx; |
1443 | 0 | } |
1444 | 0 | } |
1445 | 0 | split->num_blocks = block_idx; |
1446 | 0 | split->num_types = (size_t)max_type + 1; |
1447 | 0 | } |
1448 | 0 | BROTLI_FREE(m, new_index); |
1449 | 0 | BROTLI_FREE(m, u32); |
1450 | 0 | BROTLI_FREE(m, histogram_symbols); |
1451 | 0 | } |
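
ClusterBlocks above is a two-stage greedy agglomeration: histograms of up to 64 blocks are merged within each batch, the surviving clusters are then merged globally down to at most 256 block types, and finally every block is reassigned to whichever remaining histogram codes it most cheaply. The sketch below illustrates only the merge criterion, with plain Shannon entropy standing in for BrotliPopulationCost (hypothetical names; the real model also charges for transmitting the code tables, which is what makes merging near-identical histograms strictly profitable rather than merely free):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

#define ALPHA 256

typedef struct { uint32_t count[ALPHA]; uint32_t total; } Hist;

/* Entropy-based cost, in bits, of coding a histogram's symbols. */
static double HistCost(const Hist* h) {
  double bits = 0.0;
  for (int s = 0; s < ALPHA; ++s) {
    if (h->count[s]) {
      double p = (double)h->count[s] / (double)h->total;
      bits -= (double)h->count[s] * log2(p);
    }
  }
  return bits;
}

/* Cost delta of merging a and b: merged cost minus separate costs.
   A non-positive delta means the merge does not hurt; the batched
   pair queue in BrotliHistogramCombine ranks merges by this idea. */
static double MergeDelta(const Hist* a, const Hist* b) {
  Hist m = *a;
  for (int s = 0; s < ALPHA; ++s) m.count[s] += b->count[s];
  m.total += b->total;
  return HistCost(&m) - HistCost(a) - HistCost(b);
}

int main(void) {
  Hist a = {{0}, 0}, b = {{0}, 0}, c = {{0}, 0};
  for (int i = 0; i < 100; ++i) { ++a.count['x']; ++a.total; }
  for (int i = 0; i < 100; ++i) { ++b.count['y']; ++b.total; }
  for (int i = 0; i < 100; ++i) { ++c.count['x']; ++c.total; }
  printf("x+y: %+.1f bits\n", MergeDelta(&a, &b));  /* +200.0: keep apart */
  printf("x+x: %+.1f bits\n", MergeDelta(&a, &c));  /* +0.0: safe to merge */
  return 0;
}
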
1452 | | |
1453 | | /* Create BlockSplit (partitioning) given the limits, estimates and "effort"
1454 | | * parameters; a toy version of the core FindBlocks pass follows the
1455 | | * function below.
1456 | | * NB: max_histograms is often less than the number of histograms allowed by
1457 | | * the format; this is done intentionally, to save some "space" for
1458 | | * context-aware clustering (here entropy is estimated for context-free symbols). */
1459 | | static void FN(SplitByteVector)(MemoryManager* m, |
1460 | | const DataType* data, const size_t length, |
1461 | | const size_t symbols_per_histogram, |
1462 | | const size_t max_histograms, |
1463 | | const size_t sampling_stride_length, |
1464 | | const double block_switch_cost, |
1465 | | const BrotliEncoderParams* params, |
1466 | 0 | BlockSplit* split) { |
1467 | 0 | const size_t data_size = FN(HistogramDataSize)(); |
1468 | 0 | HistogramType* histograms; |
1469 | 0 | HistogramType* tmp; |
1470 | | /* Calculate number of histograms; initial estimate is one histogram per |
1471 | | * specified number of symbols; however, this value is capped. */
1472 | 0 | size_t num_histograms = length / symbols_per_histogram + 1; |
1473 | 0 | if (num_histograms > max_histograms) { |
1474 | 0 | num_histograms = max_histograms; |
1475 | 0 | } |
1476 | | |
1477 | | /* Corner case: no input. */ |
1478 | 0 | if (length == 0) { |
1479 | 0 | split->num_types = 1; |
1480 | 0 | return; |
1481 | 0 | } |
1482 | | |
1483 | 0 | if (length < kMinLengthForBlockSplitting) { |
1484 | 0 | BROTLI_ENSURE_CAPACITY(m, uint8_t, |
1485 | 0 | split->types, split->types_alloc_size, split->num_blocks + 1); |
1486 | 0 | BROTLI_ENSURE_CAPACITY(m, uint32_t, |
1487 | 0 | split->lengths, split->lengths_alloc_size, split->num_blocks + 1); |
1488 | 0 | if (BROTLI_IS_OOM(m)) return; |
1489 | 0 | split->num_types = 1; |
1490 | 0 | split->types[split->num_blocks] = 0; |
1491 | 0 | split->lengths[split->num_blocks] = (uint32_t)length; |
1492 | 0 | split->num_blocks++; |
1493 | 0 | return; |
1494 | 0 | } |
1495 | 0 | histograms = BROTLI_ALLOC(m, HistogramType, num_histograms + 1); |
1496 | 0 | tmp = histograms + num_histograms; |
1497 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(histograms)) return; |
1498 | | /* Find good entropy codes. */ |
1499 | 0 | FN(InitialEntropyCodes)(data, length, |
1500 | 0 | sampling_stride_length, |
1501 | 0 | num_histograms, histograms); |
1502 | 0 | FN(RefineEntropyCodes)(data, length, |
1503 | 0 | sampling_stride_length, |
1504 | 0 | num_histograms, histograms, tmp); |
1505 | 0 | { |
1506 | | /* Find a good path through the data with the good entropy codes. */
1507 | 0 | uint8_t* block_ids = BROTLI_ALLOC(m, uint8_t, length); |
1508 | 0 | size_t num_blocks = 0; |
1509 | 0 | const size_t bitmaplen = (num_histograms + 7) >> 3; |
1510 | 0 | double* insert_cost = BROTLI_ALLOC(m, double, data_size * num_histograms); |
1511 | 0 | double* cost = BROTLI_ALLOC(m, double, num_histograms); |
1512 | 0 | uint8_t* switch_signal = BROTLI_ALLOC(m, uint8_t, length * bitmaplen); |
1513 | 0 | uint16_t* new_id = BROTLI_ALLOC(m, uint16_t, num_histograms); |
1514 | 0 | const size_t iters = params->quality < HQ_ZOPFLIFICATION_QUALITY ? 3 : 10; |
1515 | 0 | size_t i; |
1516 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(block_ids) || |
1517 | 0 | BROTLI_IS_NULL(insert_cost) || BROTLI_IS_NULL(cost) || |
1518 | 0 | BROTLI_IS_NULL(switch_signal) || BROTLI_IS_NULL(new_id)) { |
1519 | 0 | return; |
1520 | 0 | } |
1521 | 0 | for (i = 0; i < iters; ++i) { |
1522 | 0 | num_blocks = FN(FindBlocks)(data, length, |
1523 | 0 | block_switch_cost, |
1524 | 0 | num_histograms, histograms, |
1525 | 0 | insert_cost, cost, switch_signal, |
1526 | 0 | block_ids); |
1527 | 0 | num_histograms = FN(RemapBlockIds)(block_ids, length, |
1528 | 0 | new_id, num_histograms); |
1529 | 0 | FN(BuildBlockHistograms)(data, length, block_ids, |
1530 | 0 | num_histograms, histograms); |
1531 | 0 | } |
1532 | 0 | BROTLI_FREE(m, insert_cost); |
1533 | 0 | BROTLI_FREE(m, cost); |
1534 | 0 | BROTLI_FREE(m, switch_signal); |
1535 | 0 | BROTLI_FREE(m, new_id); |
1536 | 0 | BROTLI_FREE(m, histograms); |
1537 | 0 | FN(ClusterBlocks)(m, data, length, num_blocks, block_ids, split); |
1538 | 0 | if (BROTLI_IS_OOM(m)) return; |
1539 | 0 | BROTLI_FREE(m, block_ids); |
1540 | 0 | } |
1541 | 0 | } |
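
As promised above, here is a toy, runnable version of the FindBlocks idea that drives this function: a Viterbi-style pass that tracks, per histogram, the cheapest cost of coding the prefix so far, charging a switch cost whenever the chosen histogram changes. Two fixed cost tables stand in for the real per-symbol bit costs (all numbers are illustrative):

#include <stdint.h>
#include <stdio.h>

/* Toy FindBlocks: 2 histograms over a 2-symbol alphabet.
   cost[h][s] = bits to code symbol s with histogram h (made-up numbers). */
int main(void) {
  const double cost[2][2] = { {1.0, 4.0}, {4.0, 1.0} };
  const double switch_cost = 3.0;
  const uint8_t data[] = {0, 0, 0, 1, 1, 1, 0, 0};
  const size_t n = sizeof(data) / sizeof(data[0]);
  double total[2] = {0.0, 0.0};  /* best prefix cost ending in histogram h */
  uint8_t choice[8][2];          /* backpointers for the two states */
  for (size_t i = 0; i < n; ++i) {
    for (int h = 0; h < 2; ++h) {
      double stay = total[h];
      double sw = total[1 - h] + switch_cost;
      /* Reads total[] from step i-1; both entries are updated below. */
      choice[i][h] = (uint8_t)(sw < stay ? 1 - h : h);
    }
    double t0 = (choice[i][0] == 0 ? total[0] : total[1] + switch_cost)
                + cost[0][data[i]];
    double t1 = (choice[i][1] == 1 ? total[1] : total[0] + switch_cost)
                + cost[1][data[i]];
    total[0] = t0; total[1] = t1;
  }
  /* Trace back from the cheaper final state to recover block ids. */
  uint8_t ids[8];
  int h = total[1] < total[0] ? 1 : 0;
  for (size_t i = n; i-- > 0;) { ids[i] = (uint8_t)h; h = choice[i][h]; }
  for (size_t i = 0; i < n; ++i) printf("%u", ids[i]);  /* 00011100 */
  printf("\n");
  return 0;
}

The switch cost is what keeps the segmentation from flipping histograms on every symbol; the constants kLiteralBlockSwitchCost etc. near the top of this file play that role for the real alphabets.
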
1542 | | |
1543 | | #undef HistogramType |
1544 | | #undef DataType |
1545 | | #undef FN |
1546 | | |
1547 | 0 | void duckdb_brotli::BrotliInitBlockSplit(BlockSplit* self) { |
1548 | 0 | self->num_types = 0; |
1549 | 0 | self->num_blocks = 0; |
1550 | 0 | self->types = 0; |
1551 | 0 | self->lengths = 0; |
1552 | 0 | self->types_alloc_size = 0; |
1553 | 0 | self->lengths_alloc_size = 0; |
1554 | 0 | } |
1555 | | |
1556 | 0 | void duckdb_brotli::BrotliDestroyBlockSplit(MemoryManager* m, BlockSplit* self) { |
1557 | 0 | BROTLI_FREE(m, self->types); |
1558 | 0 | BROTLI_FREE(m, self->lengths); |
1559 | 0 | } |
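
The two functions above frame the BlockSplit lifecycle: the struct starts zeroed, the splitter (re)grows its arrays through the MemoryManager, and the same manager frees them. A hedged usage sketch, assuming |m|, |cmds|, |num_commands|, |data|, |pos|, |mask| and |params| are set up elsewhere in the encoder (illustrative, not compiled here):

/* init -> fill -> destroy, all through the same MemoryManager. */
BlockSplit literal_split, cmd_split, dist_split;
BrotliInitBlockSplit(&literal_split);
BrotliInitBlockSplit(&cmd_split);
BrotliInitBlockSplit(&dist_split);
BrotliSplitBlock(m, cmds, num_commands, data, pos, mask, params,
                 &literal_split, &cmd_split, &dist_split);
if (!BROTLI_IS_OOM(m)) {
  /* ... consume split->types / split->lengths ... */
}
BrotliDestroyBlockSplit(m, &literal_split);
BrotliDestroyBlockSplit(m, &cmd_split);
BrotliDestroyBlockSplit(m, &dist_split);
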
1560 | | |
1561 | | /* Extracts literals, command prefix codes and distance prefix codes, then
1562 | | * applies SplitByteVector to create each partitioning. */
1563 | | void duckdb_brotli::BrotliSplitBlock(MemoryManager* m, |
1564 | | const Command* cmds, |
1565 | | const size_t num_commands, |
1566 | | const uint8_t* data, |
1567 | | const size_t pos, |
1568 | | const size_t mask, |
1569 | | const BrotliEncoderParams* params, |
1570 | | BlockSplit* literal_split, |
1571 | | BlockSplit* insert_and_copy_split, |
1572 | 0 | BlockSplit* dist_split) { |
1573 | 0 | { |
1574 | 0 | size_t literals_count = CountLiterals(cmds, num_commands); |
1575 | 0 | uint8_t* literals = BROTLI_ALLOC(m, uint8_t, literals_count); |
1576 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(literals)) return; |
1577 | | /* Create a continuous array of literals. */ |
1578 | 0 | CopyLiteralsToByteArray(cmds, num_commands, data, pos, mask, literals); |
1579 | | /* Create the block split on the array of literals. |
1580 | | * Literal histograms can have alphabet size up to 256. |
1581 | | * However, to accommodate context modeling, less than half of the maximum
1582 | | * size is allowed. */
1583 | 0 | SplitByteVectorLiteral( |
1584 | 0 | m, literals, literals_count, |
1585 | 0 | kSymbolsPerLiteralHistogram, kMaxLiteralHistograms, |
1586 | 0 | kLiteralStrideLength, kLiteralBlockSwitchCost, params, |
1587 | 0 | literal_split); |
1588 | 0 | if (BROTLI_IS_OOM(m)) return; |
1589 | 0 | BROTLI_FREE(m, literals); |
1590 | | /* NB: this might be a good place for injecting extra splitting without |
1591 | | * increasing encoder complexity; however, the output partition would be
1592 | | * less optimal than one produced with forced splitting inside
1593 | | * SplitByteVector (FindBlocks / ClusterBlocks). */ |
1594 | 0 | } |
1595 | | |
1596 | 0 | { |
1597 | | /* Compute prefix codes for commands. */ |
1598 | 0 | uint16_t* insert_and_copy_codes = BROTLI_ALLOC(m, uint16_t, num_commands); |
1599 | 0 | size_t i; |
1600 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(insert_and_copy_codes)) return; |
1601 | 0 | for (i = 0; i < num_commands; ++i) { |
1602 | 0 | insert_and_copy_codes[i] = cmds[i].cmd_prefix_; |
1603 | 0 | } |
1604 | | /* Create the block split on the array of command prefixes. */ |
1605 | 0 | SplitByteVectorCommand( |
1606 | 0 | m, insert_and_copy_codes, num_commands, |
1607 | 0 | kSymbolsPerCommandHistogram, kMaxCommandHistograms, |
1608 | 0 | kCommandStrideLength, kCommandBlockSwitchCost, params, |
1609 | 0 | insert_and_copy_split); |
1610 | 0 | if (BROTLI_IS_OOM(m)) return; |
1611 | | /* TODO(eustas): reuse for distances? */ |
1612 | 0 | BROTLI_FREE(m, insert_and_copy_codes); |
1613 | 0 | } |
1614 | | |
1615 | 0 | { |
1616 | | /* Create a continuous array of distance prefixes. */ |
1617 | 0 | uint16_t* distance_prefixes = BROTLI_ALLOC(m, uint16_t, num_commands); |
1618 | 0 | size_t j = 0; |
1619 | 0 | size_t i; |
1620 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(distance_prefixes)) return; |
1621 | 0 | for (i = 0; i < num_commands; ++i) { |
1622 | 0 | const Command* cmd = &cmds[i]; |
1623 | 0 | if (CommandCopyLen(cmd) && cmd->cmd_prefix_ >= 128) { |
1624 | 0 | distance_prefixes[j++] = cmd->dist_prefix_ & 0x3FF; |
1625 | 0 | } |
1626 | 0 | } |
1627 | | /* Create the block split on the array of distance prefixes. */ |
1628 | 0 | SplitByteVectorDistance( |
1629 | 0 | m, distance_prefixes, j, |
1630 | 0 | kSymbolsPerDistanceHistogram, kMaxCommandHistograms, |
1631 | 0 | kDistanceStrideLength, kDistanceBlockSwitchCost, params, |
1632 | 0 | dist_split); |
1633 | 0 | if (BROTLI_IS_OOM(m)) return; |
1634 | 0 | BROTLI_FREE(m, distance_prefixes); |
1635 | 0 | } |
1636 | 0 | } |
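
Note the filter in the distance branch above: insert-only commands (zero copy length) emit no distance at all, and commands with cmd_prefix_ below 128 reuse the last distance, so they code no distance symbol either; the low 10 bits of dist_prefix_ hold the symbol, while the high bits cache its extra-bit count. A minimal standalone sketch of that filter (hypothetical Cmd type, not the encoder's real Command):

#include <stddef.h>
#include <stdint.h>

/* Hypothetical, flattened stand-in for the encoder's Command. */
typedef struct {
  uint32_t copy_len;     /* 0 -> insert-only command, no distance */
  uint16_t cmd_prefix_;  /* < 128 -> implicit "last distance", no symbol */
  uint16_t dist_prefix_; /* low 10 bits: distance symbol; high: extra bits */
} Cmd;

/* Collect the distance symbols that are actually coded; returns how many. */
static size_t ExtractDistanceSymbols(const Cmd* cmds, size_t n,
                                     uint16_t* out) {
  size_t j = 0;
  for (size_t i = 0; i < n; ++i) {
    if (cmds[i].copy_len && cmds[i].cmd_prefix_ >= 128) {
      out[j++] = (uint16_t)(cmds[i].dist_prefix_ & 0x3FF);
    }
  }
  return j;
}

This is why SplitByteVectorDistance is called with |j|, not |num_commands|: the distance partitioning only ever sees commands that emit a distance symbol.
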
1637 | | |
1638 | | #if defined(BROTLI_TEST) |
1639 | | size_t CountLiteralsForTest(const Command*, const size_t); |
1640 | | size_t CountLiteralsForTest(const Command* cmds, const size_t num_commands) { |
1641 | | return CountLiterals(cmds, num_commands); |
1642 | | } |
1643 | | |
1644 | | void CopyLiteralsToByteArrayForTest(const Command*, |
1645 | | const size_t, const uint8_t*, const size_t, const size_t, uint8_t*); |
1646 | | void CopyLiteralsToByteArrayForTest(const Command* cmds, |
1647 | | const size_t num_commands, const uint8_t* data, const size_t offset, |
1648 | | const size_t mask, uint8_t* literals) { |
1649 | | CopyLiteralsToByteArray(cmds, num_commands, data, offset, mask, literals); |
1650 | | } |
1651 | | #endif |
1652 | | |
1653 | | |