Coverage Report

Created: 2025-06-12 07:25

/src/duckdb/third_party/brotli/enc/metablock.cpp
Line
Count
Source (jump to first uncovered line)
1
/* Copyright 2015 Google Inc. All Rights Reserved.
2
3
   Distributed under MIT license.
4
   See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5
*/
6
7
/* Algorithms for distributing the literals and commands of a metablock between
8
   block types and contexts. */
9
10
#include "metablock.h"
11
12
#include <brotli/types.h>
13
14
#include "../common/brotli_constants.h"
15
#include "../common/context.h"
16
#include "../common/brotli_platform.h"
17
#include "bit_cost.h"
18
#include "block_splitter.h"
19
#include "cluster.h"
20
#include "entropy_encode.h"
21
#include "histogram.h"
22
#include "memory.h"
23
#include "quality.h"
24
25
using namespace duckdb_brotli;
26
27
void duckdb_brotli::BrotliInitDistanceParams(BrotliDistanceParams* dist_params,
28
0
    uint32_t npostfix, uint32_t ndirect, BROTLI_BOOL large_window) {
29
0
  uint32_t alphabet_size_max;
30
0
  uint32_t alphabet_size_limit;
31
0
  uint32_t max_distance;
32
33
0
  dist_params->distance_postfix_bits = npostfix;
34
0
  dist_params->num_direct_distance_codes = ndirect;
35
36
0
  alphabet_size_max = BROTLI_DISTANCE_ALPHABET_SIZE(
37
0
      npostfix, ndirect, BROTLI_MAX_DISTANCE_BITS);
38
0
  alphabet_size_limit = alphabet_size_max;
39
0
  max_distance = ndirect + (1U << (BROTLI_MAX_DISTANCE_BITS + npostfix + 2)) -
40
0
      (1U << (npostfix + 2));
41
42
0
  if (large_window) {
43
0
    BrotliDistanceCodeLimit limit = BrotliCalculateDistanceCodeLimit(
44
0
        BROTLI_MAX_ALLOWED_DISTANCE, npostfix, ndirect);
45
0
    alphabet_size_max = BROTLI_DISTANCE_ALPHABET_SIZE(
46
0
        npostfix, ndirect, BROTLI_LARGE_MAX_DISTANCE_BITS);
47
0
    alphabet_size_limit = limit.max_alphabet_size;
48
0
    max_distance = limit.max_distance;
49
0
  }
50
51
0
  dist_params->alphabet_size_max = alphabet_size_max;
52
0
  dist_params->alphabet_size_limit = alphabet_size_limit;
53
0
  dist_params->max_distance = max_distance;
54
0
}
55
56
static void RecomputeDistancePrefixes(Command* cmds,
57
                                      size_t num_commands,
58
                                      const BrotliDistanceParams* orig_params,
59
0
                                      const BrotliDistanceParams* new_params) {
60
0
  size_t i;
61
62
0
  if (orig_params->distance_postfix_bits == new_params->distance_postfix_bits &&
63
0
      orig_params->num_direct_distance_codes ==
64
0
      new_params->num_direct_distance_codes) {
65
0
    return;
66
0
  }
67
68
0
  for (i = 0; i < num_commands; ++i) {
69
0
    Command* cmd = &cmds[i];
70
0
    if (CommandCopyLen(cmd) && cmd->cmd_prefix_ >= 128) {
71
0
      PrefixEncodeCopyDistance(CommandRestoreDistanceCode(cmd, orig_params),
72
0
                               new_params->num_direct_distance_codes,
73
0
                               new_params->distance_postfix_bits,
74
0
                               &cmd->dist_prefix_,
75
0
                               &cmd->dist_extra_);
76
0
    }
77
0
  }
78
0
}
79
80
static BROTLI_BOOL ComputeDistanceCost(const Command* cmds,
81
                                       size_t num_commands,
82
                                       const BrotliDistanceParams* orig_params,
83
                                       const BrotliDistanceParams* new_params,
84
                                       double* cost,
85
0
                                       HistogramDistance* tmp) {
86
0
  size_t i;
87
0
  BROTLI_BOOL equal_params = BROTLI_FALSE;
88
0
  uint16_t dist_prefix;
89
0
  uint32_t dist_extra;
90
0
  double extra_bits = 0.0;
91
0
  HistogramClearDistance(tmp);
92
93
0
  if (orig_params->distance_postfix_bits == new_params->distance_postfix_bits &&
94
0
      orig_params->num_direct_distance_codes ==
95
0
      new_params->num_direct_distance_codes) {
96
0
    equal_params = BROTLI_TRUE;
97
0
  }
98
99
0
  for (i = 0; i < num_commands; i++) {
100
0
    const Command* cmd = &cmds[i];
101
0
    if (CommandCopyLen(cmd) && cmd->cmd_prefix_ >= 128) {
102
0
      if (equal_params) {
103
0
        dist_prefix = cmd->dist_prefix_;
104
0
      } else {
105
0
        uint32_t distance = CommandRestoreDistanceCode(cmd, orig_params);
106
0
        if (distance > new_params->max_distance) {
107
0
          return BROTLI_FALSE;
108
0
        }
109
0
        PrefixEncodeCopyDistance(distance,
110
0
                                 new_params->num_direct_distance_codes,
111
0
                                 new_params->distance_postfix_bits,
112
0
                                 &dist_prefix,
113
0
                                 &dist_extra);
114
0
      }
115
0
      HistogramAddDistance(tmp, dist_prefix & 0x3FF);
116
0
      extra_bits += dist_prefix >> 10;
117
0
    }
118
0
  }
119
120
0
  *cost = BrotliPopulationCostDistance(tmp) + extra_bits;
121
0
  return BROTLI_TRUE;
122
0
}
123
124
void duckdb_brotli::BrotliBuildMetaBlock(MemoryManager* m,
125
                          const uint8_t* ringbuffer,
126
                          const size_t pos,
127
                          const size_t mask,
128
                          BrotliEncoderParams* params,
129
                          uint8_t prev_byte,
130
                          uint8_t prev_byte2,
131
                          Command* cmds,
132
                          size_t num_commands,
133
                          ContextType literal_context_mode,
134
0
                          MetaBlockSplit* mb) {
135
  /* Histogram ids need to fit in one byte. */
136
0
  static const size_t kMaxNumberOfHistograms = 256;
137
0
  HistogramDistance* distance_histograms;
138
0
  HistogramLiteral* literal_histograms;
139
0
  ContextType* literal_context_modes = NULL;
140
0
  size_t literal_histograms_size;
141
0
  size_t distance_histograms_size;
142
0
  size_t i;
143
0
  size_t literal_context_multiplier = 1;
144
0
  uint32_t npostfix;
145
0
  uint32_t ndirect_msb = 0;
146
0
  BROTLI_BOOL check_orig = BROTLI_TRUE;
147
0
  double best_dist_cost = 1e99;
148
0
  BrotliDistanceParams orig_params = params->dist;
149
0
  BrotliDistanceParams new_params = params->dist;
150
0
  HistogramDistance* tmp = BROTLI_ALLOC(m, HistogramDistance, 1);
151
152
0
  if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(tmp)) return;
153
154
0
  for (npostfix = 0; npostfix <= BROTLI_MAX_NPOSTFIX; npostfix++) {
155
0
    for (; ndirect_msb < 16; ndirect_msb++) {
156
0
      uint32_t ndirect = ndirect_msb << npostfix;
157
0
      BROTLI_BOOL skip;
158
0
      double dist_cost;
159
0
      BrotliInitDistanceParams(&new_params, npostfix, ndirect,
160
0
                               params->large_window);
161
0
      if (npostfix == orig_params.distance_postfix_bits &&
162
0
          ndirect == orig_params.num_direct_distance_codes) {
163
0
        check_orig = BROTLI_FALSE;
164
0
      }
165
0
      skip = !ComputeDistanceCost(
166
0
          cmds, num_commands, &orig_params, &new_params, &dist_cost, tmp);
167
0
      if (skip || (dist_cost > best_dist_cost)) {
168
0
        break;
169
0
      }
170
0
      best_dist_cost = dist_cost;
171
0
      params->dist = new_params;
172
0
    }
173
0
    if (ndirect_msb > 0) ndirect_msb--;
174
0
    ndirect_msb /= 2;
175
0
  }
176
0
  if (check_orig) {
177
0
    double dist_cost;
178
0
    ComputeDistanceCost(cmds, num_commands, &orig_params, &orig_params,
179
0
                        &dist_cost, tmp);
180
0
    if (dist_cost < best_dist_cost) {
181
      /* NB: currently unused; uncomment when more param tuning is added. */
182
      /* best_dist_cost = dist_cost; */
183
0
      params->dist = orig_params;
184
0
    }
185
0
  }
186
0
  BROTLI_FREE(m, tmp);
187
0
  RecomputeDistancePrefixes(cmds, num_commands, &orig_params, &params->dist);
188
189
0
  BrotliSplitBlock(m, cmds, num_commands,
190
0
                   ringbuffer, pos, mask, params,
191
0
                   &mb->literal_split,
192
0
                   &mb->command_split,
193
0
                   &mb->distance_split);
194
0
  if (BROTLI_IS_OOM(m)) return;
195
196
0
  if (!params->disable_literal_context_modeling) {
197
0
    literal_context_multiplier = 1 << BROTLI_LITERAL_CONTEXT_BITS;
198
0
    literal_context_modes =
199
0
        BROTLI_ALLOC(m, ContextType, mb->literal_split.num_types);
200
0
    if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(literal_context_modes)) return;
201
0
    for (i = 0; i < mb->literal_split.num_types; ++i) {
202
0
      literal_context_modes[i] = literal_context_mode;
203
0
    }
204
0
  }
205
206
0
  literal_histograms_size =
207
0
      mb->literal_split.num_types * literal_context_multiplier;
208
0
  literal_histograms =
209
0
      BROTLI_ALLOC(m, HistogramLiteral, literal_histograms_size);
210
0
  if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(literal_histograms)) return;
211
0
  ClearHistogramsLiteral(literal_histograms, literal_histograms_size);
212
213
0
  distance_histograms_size =
214
0
      mb->distance_split.num_types << BROTLI_DISTANCE_CONTEXT_BITS;
215
0
  distance_histograms =
216
0
      BROTLI_ALLOC(m, HistogramDistance, distance_histograms_size);
217
0
  if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(distance_histograms)) return;
218
0
  ClearHistogramsDistance(distance_histograms, distance_histograms_size);
219
220
0
  BROTLI_DCHECK(mb->command_histograms == 0);
221
0
  mb->command_histograms_size = mb->command_split.num_types;
222
0
  mb->command_histograms =
223
0
      BROTLI_ALLOC(m, HistogramCommand, mb->command_histograms_size);
224
0
  if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(mb->command_histograms)) return;
225
0
  ClearHistogramsCommand(mb->command_histograms, mb->command_histograms_size);
226
227
0
  BrotliBuildHistogramsWithContext(cmds, num_commands,
228
0
      &mb->literal_split, &mb->command_split, &mb->distance_split,
229
0
      ringbuffer, pos, mask, prev_byte, prev_byte2, literal_context_modes,
230
0
      literal_histograms, mb->command_histograms, distance_histograms);
231
0
  BROTLI_FREE(m, literal_context_modes);
232
233
0
  BROTLI_DCHECK(mb->literal_context_map == 0);
234
0
  mb->literal_context_map_size =
235
0
      mb->literal_split.num_types << BROTLI_LITERAL_CONTEXT_BITS;
236
0
  mb->literal_context_map =
237
0
      BROTLI_ALLOC(m, uint32_t, mb->literal_context_map_size);
238
0
  if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(mb->literal_context_map)) return;
239
240
0
  BROTLI_DCHECK(mb->literal_histograms == 0);
241
0
  mb->literal_histograms_size = mb->literal_context_map_size;
242
0
  mb->literal_histograms =
243
0
      BROTLI_ALLOC(m, HistogramLiteral, mb->literal_histograms_size);
244
0
  if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(mb->literal_histograms)) return;
245
246
0
  BrotliClusterHistogramsLiteral(m, literal_histograms, literal_histograms_size,
247
0
      kMaxNumberOfHistograms, mb->literal_histograms,
248
0
      &mb->literal_histograms_size, mb->literal_context_map);
249
0
  if (BROTLI_IS_OOM(m)) return;
250
0
  BROTLI_FREE(m, literal_histograms);
251
252
0
  if (params->disable_literal_context_modeling) {
253
    /* Distribute assignment to all contexts. */
254
0
    for (i = mb->literal_split.num_types; i != 0;) {
255
0
      size_t j = 0;
256
0
      i--;
257
0
      for (; j < (1 << BROTLI_LITERAL_CONTEXT_BITS); j++) {
258
0
        mb->literal_context_map[(i << BROTLI_LITERAL_CONTEXT_BITS) + j] =
259
0
            mb->literal_context_map[i];
260
0
      }
261
0
    }
262
0
  }
263
264
0
  BROTLI_DCHECK(mb->distance_context_map == 0);
265
0
  mb->distance_context_map_size =
266
0
      mb->distance_split.num_types << BROTLI_DISTANCE_CONTEXT_BITS;
267
0
  mb->distance_context_map =
268
0
      BROTLI_ALLOC(m, uint32_t, mb->distance_context_map_size);
269
0
  if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(mb->distance_context_map)) return;
270
271
0
  BROTLI_DCHECK(mb->distance_histograms == 0);
272
0
  mb->distance_histograms_size = mb->distance_context_map_size;
273
0
  mb->distance_histograms =
274
0
      BROTLI_ALLOC(m, HistogramDistance, mb->distance_histograms_size);
275
0
  if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(mb->distance_histograms)) return;
276
277
0
  BrotliClusterHistogramsDistance(m, distance_histograms,
278
0
                                  mb->distance_context_map_size,
279
0
                                  kMaxNumberOfHistograms,
280
0
                                  mb->distance_histograms,
281
0
                                  &mb->distance_histograms_size,
282
0
                                  mb->distance_context_map);
283
0
  if (BROTLI_IS_OOM(m)) return;
284
0
  BROTLI_FREE(m, distance_histograms);
285
0
}
286
287
0
#define FN(X) X ## Literal
288
/* NOLINT(build/header_guard) */
289
/* Copyright 2015 Google Inc. All Rights Reserved.
290
291
   Distributed under MIT license.
292
   See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
293
*/
294
295
/* template parameters: FN */
296
297
0
#define HistogramType FN(Histogram)
298
299
/* Greedy block splitter for one block category (literal, command or distance).
300
*/
301
typedef struct FN(BlockSplitter) {
302
  /* Alphabet size of particular block category. */
303
  size_t alphabet_size_;
304
  /* We collect at least this many symbols for each block. */
305
  size_t min_block_size_;
306
  /* We merge histograms A and B if
307
       entropy(A+B) < entropy(A) + entropy(B) + split_threshold_,
308
     where A is the current histogram and B is the histogram of the last or the
309
     second last block type. */
310
  double split_threshold_;
311
312
  size_t num_blocks_;
313
  BlockSplit* split_;  /* not owned */
314
  HistogramType* histograms_;  /* not owned */
315
  size_t* histograms_size_;  /* not owned */
316
317
  /* Temporary storage for BlockSplitterFinishBlock. */
318
  HistogramType combined_histo[2];
319
320
  /* The number of symbols that we want to collect before deciding on whether
321
     or not to merge the block with a previous one or emit a new block. */
322
  size_t target_block_size_;
323
  /* The number of symbols in the current histogram. */
324
  size_t block_size_;
325
  /* Offset of the current histogram. */
326
  size_t curr_histogram_ix_;
327
  /* Offset of the histograms of the previous two block types. */
328
  size_t last_histogram_ix_[2];
329
  /* Entropy of the previous two block types. */
330
  double last_entropy_[2];
331
  /* The number of times we merged the current block with the last one. */
332
  size_t merge_last_count_;
333
} FN(BlockSplitter);
334
335
static void FN(InitBlockSplitter)(
336
    MemoryManager* m, FN(BlockSplitter)* self, size_t alphabet_size,
337
    size_t min_block_size, double split_threshold, size_t num_symbols,
338
0
    BlockSplit* split, HistogramType** histograms, size_t* histograms_size) {
339
0
  size_t max_num_blocks = num_symbols / min_block_size + 1;
340
  /* We have to allocate one more histogram than the maximum number of block
341
     types for the current histogram when the meta-block is too big. */
342
0
  size_t max_num_types =
343
0
      BROTLI_MIN(size_t, max_num_blocks, BROTLI_MAX_NUMBER_OF_BLOCK_TYPES + 1);
344
0
  self->alphabet_size_ = alphabet_size;
345
0
  self->min_block_size_ = min_block_size;
346
0
  self->split_threshold_ = split_threshold;
347
0
  self->num_blocks_ = 0;
348
0
  self->split_ = split;
349
0
  self->histograms_size_ = histograms_size;
350
0
  self->target_block_size_ = min_block_size;
351
0
  self->block_size_ = 0;
352
0
  self->curr_histogram_ix_ = 0;
353
0
  self->merge_last_count_ = 0;
354
0
  BROTLI_ENSURE_CAPACITY(m, uint8_t,
355
0
      split->types, split->types_alloc_size, max_num_blocks);
356
0
  BROTLI_ENSURE_CAPACITY(m, uint32_t,
357
0
      split->lengths, split->lengths_alloc_size, max_num_blocks);
358
0
  if (BROTLI_IS_OOM(m)) return;
359
0
  self->split_->num_blocks = max_num_blocks;
360
0
  BROTLI_DCHECK(*histograms == 0);
361
0
  *histograms_size = max_num_types;
362
0
  *histograms = BROTLI_ALLOC(m, HistogramType, *histograms_size);
363
0
  self->histograms_ = *histograms;
364
0
  if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(*histograms)) return;
365
  /* Clear only current histogram. */
366
0
  FN(HistogramClear)(&self->histograms_[0]);
367
0
  self->last_histogram_ix_[0] = self->last_histogram_ix_[1] = 0;
368
0
}
369
370
/* Does either of three things:
371
     (1) emits the current block with a new block type;
372
     (2) emits the current block with the type of the second last block;
373
     (3) merges the current block with the last block. */
374
static void FN(BlockSplitterFinishBlock)(
375
0
    FN(BlockSplitter)* self, BROTLI_BOOL is_final) {
376
0
  BlockSplit* split = self->split_;
377
0
  double* last_entropy = self->last_entropy_;
378
0
  HistogramType* histograms = self->histograms_;
379
0
  self->block_size_ =
380
0
      BROTLI_MAX(size_t, self->block_size_, self->min_block_size_);
381
0
  if (self->num_blocks_ == 0) {
382
    /* Create first block. */
383
0
    split->lengths[0] = (uint32_t)self->block_size_;
384
0
    split->types[0] = 0;
385
0
    last_entropy[0] =
386
0
        BitsEntropy(histograms[0].data_, self->alphabet_size_);
387
0
    last_entropy[1] = last_entropy[0];
388
0
    ++self->num_blocks_;
389
0
    ++split->num_types;
390
0
    ++self->curr_histogram_ix_;
391
0
    if (self->curr_histogram_ix_ < *self->histograms_size_)
392
0
      FN(HistogramClear)(&histograms[self->curr_histogram_ix_]);
393
0
    self->block_size_ = 0;
394
0
  } else if (self->block_size_ > 0) {
395
0
    double entropy = BitsEntropy(histograms[self->curr_histogram_ix_].data_,
396
0
                                 self->alphabet_size_);
397
0
    double combined_entropy[2];
398
0
    double diff[2];
399
0
    size_t j;
400
0
    for (j = 0; j < 2; ++j) {
401
0
      size_t last_histogram_ix = self->last_histogram_ix_[j];
402
0
      self->combined_histo[j] = histograms[self->curr_histogram_ix_];
403
0
      FN(HistogramAddHistogram)(&self->combined_histo[j],
404
0
          &histograms[last_histogram_ix]);
405
0
      combined_entropy[j] = BitsEntropy(
406
0
          &self->combined_histo[j].data_[0], self->alphabet_size_);
407
0
      diff[j] = combined_entropy[j] - entropy - last_entropy[j];
408
0
    }
409
410
0
    if (split->num_types < BROTLI_MAX_NUMBER_OF_BLOCK_TYPES &&
411
0
        diff[0] > self->split_threshold_ &&
412
0
        diff[1] > self->split_threshold_) {
413
      /* Create new block. */
414
0
      split->lengths[self->num_blocks_] = (uint32_t)self->block_size_;
415
0
      split->types[self->num_blocks_] = (uint8_t)split->num_types;
416
0
      self->last_histogram_ix_[1] = self->last_histogram_ix_[0];
417
0
      self->last_histogram_ix_[0] = (uint8_t)split->num_types;
418
0
      last_entropy[1] = last_entropy[0];
419
0
      last_entropy[0] = entropy;
420
0
      ++self->num_blocks_;
421
0
      ++split->num_types;
422
0
      ++self->curr_histogram_ix_;
423
0
      if (self->curr_histogram_ix_ < *self->histograms_size_)
424
0
        FN(HistogramClear)(&histograms[self->curr_histogram_ix_]);
425
0
      self->block_size_ = 0;
426
0
      self->merge_last_count_ = 0;
427
0
      self->target_block_size_ = self->min_block_size_;
428
0
    } else if (diff[1] < diff[0] - 20.0) {
429
      /* Combine this block with second last block. */
430
0
      split->lengths[self->num_blocks_] = (uint32_t)self->block_size_;
431
0
      split->types[self->num_blocks_] = split->types[self->num_blocks_ - 2];
432
0
      BROTLI_SWAP(size_t, self->last_histogram_ix_, 0, 1);
433
0
      histograms[self->last_histogram_ix_[0]] = self->combined_histo[1];
434
0
      last_entropy[1] = last_entropy[0];
435
0
      last_entropy[0] = combined_entropy[1];
436
0
      ++self->num_blocks_;
437
0
      self->block_size_ = 0;
438
0
      FN(HistogramClear)(&histograms[self->curr_histogram_ix_]);
439
0
      self->merge_last_count_ = 0;
440
0
      self->target_block_size_ = self->min_block_size_;
441
0
    } else {
442
      /* Combine this block with last block. */
443
0
      split->lengths[self->num_blocks_ - 1] += (uint32_t)self->block_size_;
444
0
      histograms[self->last_histogram_ix_[0]] = self->combined_histo[0];
445
0
      last_entropy[0] = combined_entropy[0];
446
0
      if (split->num_types == 1) {
447
0
        last_entropy[1] = last_entropy[0];
448
0
      }
449
0
      self->block_size_ = 0;
450
0
      FN(HistogramClear)(&histograms[self->curr_histogram_ix_]);
451
0
      if (++self->merge_last_count_ > 1) {
452
0
        self->target_block_size_ += self->min_block_size_;
453
0
      }
454
0
    }
455
0
  }
456
0
  if (is_final) {
457
0
    *self->histograms_size_ = split->num_types;
458
0
    split->num_blocks = self->num_blocks_;
459
0
  }
460
0
}
461
462
/* Adds the next symbol to the current histogram. When the current histogram
463
   reaches the target size, decides on merging the block. */
464
0
static void FN(BlockSplitterAddSymbol)(FN(BlockSplitter)* self, size_t symbol) {
465
0
  FN(HistogramAdd)(&self->histograms_[self->curr_histogram_ix_], symbol);
466
0
  ++self->block_size_;
467
0
  if (self->block_size_ == self->target_block_size_) {
468
0
    FN(BlockSplitterFinishBlock)(self, /* is_final = */ BROTLI_FALSE);
469
0
  }
470
0
}
471
472
#undef HistogramType
473
#undef FN
474
475
0
#define FN(X) X ## Command
476
/* NOLINT(build/header_guard) */
477
/* Copyright 2015 Google Inc. All Rights Reserved.
478
479
   Distributed under MIT license.
480
   See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
481
*/
482
483
/* template parameters: FN */
484
485
0
#define HistogramType FN(Histogram)
486
487
/* Greedy block splitter for one block category (literal, command or distance).
488
*/
489
typedef struct FN(BlockSplitter) {
490
  /* Alphabet size of particular block category. */
491
  size_t alphabet_size_;
492
  /* We collect at least this many symbols for each block. */
493
  size_t min_block_size_;
494
  /* We merge histograms A and B if
495
       entropy(A+B) < entropy(A) + entropy(B) + split_threshold_,
496
     where A is the current histogram and B is the histogram of the last or the
497
     second last block type. */
498
  double split_threshold_;
499
500
  size_t num_blocks_;
501
  BlockSplit* split_;  /* not owned */
502
  HistogramType* histograms_;  /* not owned */
503
  size_t* histograms_size_;  /* not owned */
504
505
  /* Temporary storage for BlockSplitterFinishBlock. */
506
  HistogramType combined_histo[2];
507
508
  /* The number of symbols that we want to collect before deciding on whether
509
     or not to merge the block with a previous one or emit a new block. */
510
  size_t target_block_size_;
511
  /* The number of symbols in the current histogram. */
512
  size_t block_size_;
513
  /* Offset of the current histogram. */
514
  size_t curr_histogram_ix_;
515
  /* Offset of the histograms of the previous two block types. */
516
  size_t last_histogram_ix_[2];
517
  /* Entropy of the previous two block types. */
518
  double last_entropy_[2];
519
  /* The number of times we merged the current block with the last one. */
520
  size_t merge_last_count_;
521
} FN(BlockSplitter);
522
523
static void FN(InitBlockSplitter)(
524
    MemoryManager* m, FN(BlockSplitter)* self, size_t alphabet_size,
525
    size_t min_block_size, double split_threshold, size_t num_symbols,
526
0
    BlockSplit* split, HistogramType** histograms, size_t* histograms_size) {
527
0
  size_t max_num_blocks = num_symbols / min_block_size + 1;
528
  /* We have to allocate one more histogram than the maximum number of block
529
     types for the current histogram when the meta-block is too big. */
530
0
  size_t max_num_types =
531
0
      BROTLI_MIN(size_t, max_num_blocks, BROTLI_MAX_NUMBER_OF_BLOCK_TYPES + 1);
532
0
  self->alphabet_size_ = alphabet_size;
533
0
  self->min_block_size_ = min_block_size;
534
0
  self->split_threshold_ = split_threshold;
535
0
  self->num_blocks_ = 0;
536
0
  self->split_ = split;
537
0
  self->histograms_size_ = histograms_size;
538
0
  self->target_block_size_ = min_block_size;
539
0
  self->block_size_ = 0;
540
0
  self->curr_histogram_ix_ = 0;
541
0
  self->merge_last_count_ = 0;
542
0
  BROTLI_ENSURE_CAPACITY(m, uint8_t,
543
0
      split->types, split->types_alloc_size, max_num_blocks);
544
0
  BROTLI_ENSURE_CAPACITY(m, uint32_t,
545
0
      split->lengths, split->lengths_alloc_size, max_num_blocks);
546
0
  if (BROTLI_IS_OOM(m)) return;
547
0
  self->split_->num_blocks = max_num_blocks;
548
0
  BROTLI_DCHECK(*histograms == 0);
549
0
  *histograms_size = max_num_types;
550
0
  *histograms = BROTLI_ALLOC(m, HistogramType, *histograms_size);
551
0
  self->histograms_ = *histograms;
552
0
  if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(*histograms)) return;
553
  /* Clear only current histogram. */
554
0
  FN(HistogramClear)(&self->histograms_[0]);
555
0
  self->last_histogram_ix_[0] = self->last_histogram_ix_[1] = 0;
556
0
}
557
558
/* Does either of three things:
559
     (1) emits the current block with a new block type;
560
     (2) emits the current block with the type of the second last block;
561
     (3) merges the current block with the last block. */
562
static void FN(BlockSplitterFinishBlock)(
563
0
    FN(BlockSplitter)* self, BROTLI_BOOL is_final) {
564
0
  BlockSplit* split = self->split_;
565
0
  double* last_entropy = self->last_entropy_;
566
0
  HistogramType* histograms = self->histograms_;
567
0
  self->block_size_ =
568
0
      BROTLI_MAX(size_t, self->block_size_, self->min_block_size_);
569
0
  if (self->num_blocks_ == 0) {
570
    /* Create first block. */
571
0
    split->lengths[0] = (uint32_t)self->block_size_;
572
0
    split->types[0] = 0;
573
0
    last_entropy[0] =
574
0
        BitsEntropy(histograms[0].data_, self->alphabet_size_);
575
0
    last_entropy[1] = last_entropy[0];
576
0
    ++self->num_blocks_;
577
0
    ++split->num_types;
578
0
    ++self->curr_histogram_ix_;
579
0
    if (self->curr_histogram_ix_ < *self->histograms_size_)
580
0
      FN(HistogramClear)(&histograms[self->curr_histogram_ix_]);
581
0
    self->block_size_ = 0;
582
0
  } else if (self->block_size_ > 0) {
583
0
    double entropy = BitsEntropy(histograms[self->curr_histogram_ix_].data_,
584
0
                                 self->alphabet_size_);
585
0
    double combined_entropy[2];
586
0
    double diff[2];
587
0
    size_t j;
588
0
    for (j = 0; j < 2; ++j) {
589
0
      size_t last_histogram_ix = self->last_histogram_ix_[j];
590
0
      self->combined_histo[j] = histograms[self->curr_histogram_ix_];
591
0
      FN(HistogramAddHistogram)(&self->combined_histo[j],
592
0
          &histograms[last_histogram_ix]);
593
0
      combined_entropy[j] = BitsEntropy(
594
0
          &self->combined_histo[j].data_[0], self->alphabet_size_);
595
0
      diff[j] = combined_entropy[j] - entropy - last_entropy[j];
596
0
    }
597
598
0
    if (split->num_types < BROTLI_MAX_NUMBER_OF_BLOCK_TYPES &&
599
0
        diff[0] > self->split_threshold_ &&
600
0
        diff[1] > self->split_threshold_) {
601
      /* Create new block. */
602
0
      split->lengths[self->num_blocks_] = (uint32_t)self->block_size_;
603
0
      split->types[self->num_blocks_] = (uint8_t)split->num_types;
604
0
      self->last_histogram_ix_[1] = self->last_histogram_ix_[0];
605
0
      self->last_histogram_ix_[0] = (uint8_t)split->num_types;
606
0
      last_entropy[1] = last_entropy[0];
607
0
      last_entropy[0] = entropy;
608
0
      ++self->num_blocks_;
609
0
      ++split->num_types;
610
0
      ++self->curr_histogram_ix_;
611
0
      if (self->curr_histogram_ix_ < *self->histograms_size_)
612
0
        FN(HistogramClear)(&histograms[self->curr_histogram_ix_]);
613
0
      self->block_size_ = 0;
614
0
      self->merge_last_count_ = 0;
615
0
      self->target_block_size_ = self->min_block_size_;
616
0
    } else if (diff[1] < diff[0] - 20.0) {
617
      /* Combine this block with second last block. */
618
0
      split->lengths[self->num_blocks_] = (uint32_t)self->block_size_;
619
0
      split->types[self->num_blocks_] = split->types[self->num_blocks_ - 2];
620
0
      BROTLI_SWAP(size_t, self->last_histogram_ix_, 0, 1);
621
0
      histograms[self->last_histogram_ix_[0]] = self->combined_histo[1];
622
0
      last_entropy[1] = last_entropy[0];
623
0
      last_entropy[0] = combined_entropy[1];
624
0
      ++self->num_blocks_;
625
0
      self->block_size_ = 0;
626
0
      FN(HistogramClear)(&histograms[self->curr_histogram_ix_]);
627
0
      self->merge_last_count_ = 0;
628
0
      self->target_block_size_ = self->min_block_size_;
629
0
    } else {
630
      /* Combine this block with last block. */
631
0
      split->lengths[self->num_blocks_ - 1] += (uint32_t)self->block_size_;
632
0
      histograms[self->last_histogram_ix_[0]] = self->combined_histo[0];
633
0
      last_entropy[0] = combined_entropy[0];
634
0
      if (split->num_types == 1) {
635
0
        last_entropy[1] = last_entropy[0];
636
0
      }
637
0
      self->block_size_ = 0;
638
0
      FN(HistogramClear)(&histograms[self->curr_histogram_ix_]);
639
0
      if (++self->merge_last_count_ > 1) {
640
0
        self->target_block_size_ += self->min_block_size_;
641
0
      }
642
0
    }
643
0
  }
644
0
  if (is_final) {
645
0
    *self->histograms_size_ = split->num_types;
646
0
    split->num_blocks = self->num_blocks_;
647
0
  }
648
0
}
649
650
/* Adds the next symbol to the current histogram. When the current histogram
651
   reaches the target size, decides on merging the block. */
652
0
static void FN(BlockSplitterAddSymbol)(FN(BlockSplitter)* self, size_t symbol) {
653
0
  FN(HistogramAdd)(&self->histograms_[self->curr_histogram_ix_], symbol);
654
0
  ++self->block_size_;
655
0
  if (self->block_size_ == self->target_block_size_) {
656
0
    FN(BlockSplitterFinishBlock)(self, /* is_final = */ BROTLI_FALSE);
657
0
  }
658
0
}
659
660
#undef HistogramType
661
#undef FN
662
663
0
#define FN(X) X ## Distance
664
/* NOLINT(build/header_guard) */
665
/* Copyright 2015 Google Inc. All Rights Reserved.
666
667
   Distributed under MIT license.
668
   See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
669
*/
670
671
/* template parameters: FN */
672
673
0
#define HistogramType FN(Histogram)
674
675
/* Greedy block splitter for one block category (literal, command or distance).
676
*/
677
typedef struct FN(BlockSplitter) {
678
  /* Alphabet size of particular block category. */
679
  size_t alphabet_size_;
680
  /* We collect at least this many symbols for each block. */
681
  size_t min_block_size_;
682
  /* We merge histograms A and B if
683
       entropy(A+B) < entropy(A) + entropy(B) + split_threshold_,
684
     where A is the current histogram and B is the histogram of the last or the
685
     second last block type. */
686
  double split_threshold_;
687
688
  size_t num_blocks_;
689
  BlockSplit* split_;  /* not owned */
690
  HistogramType* histograms_;  /* not owned */
691
  size_t* histograms_size_;  /* not owned */
692
693
  /* Temporary storage for BlockSplitterFinishBlock. */
694
  HistogramType combined_histo[2];
695
696
  /* The number of symbols that we want to collect before deciding on whether
697
     or not to merge the block with a previous one or emit a new block. */
698
  size_t target_block_size_;
699
  /* The number of symbols in the current histogram. */
700
  size_t block_size_;
701
  /* Offset of the current histogram. */
702
  size_t curr_histogram_ix_;
703
  /* Offset of the histograms of the previous two block types. */
704
  size_t last_histogram_ix_[2];
705
  /* Entropy of the previous two block types. */
706
  double last_entropy_[2];
707
  /* The number of times we merged the current block with the last one. */
708
  size_t merge_last_count_;
709
} FN(BlockSplitter);
710
711
static void FN(InitBlockSplitter)(
712
    MemoryManager* m, FN(BlockSplitter)* self, size_t alphabet_size,
713
    size_t min_block_size, double split_threshold, size_t num_symbols,
714
0
    BlockSplit* split, HistogramType** histograms, size_t* histograms_size) {
715
0
  size_t max_num_blocks = num_symbols / min_block_size + 1;
716
  /* We have to allocate one more histogram than the maximum number of block
717
     types for the current histogram when the meta-block is too big. */
718
0
  size_t max_num_types =
719
0
      BROTLI_MIN(size_t, max_num_blocks, BROTLI_MAX_NUMBER_OF_BLOCK_TYPES + 1);
720
0
  self->alphabet_size_ = alphabet_size;
721
0
  self->min_block_size_ = min_block_size;
722
0
  self->split_threshold_ = split_threshold;
723
0
  self->num_blocks_ = 0;
724
0
  self->split_ = split;
725
0
  self->histograms_size_ = histograms_size;
726
0
  self->target_block_size_ = min_block_size;
727
0
  self->block_size_ = 0;
728
0
  self->curr_histogram_ix_ = 0;
729
0
  self->merge_last_count_ = 0;
730
0
  BROTLI_ENSURE_CAPACITY(m, uint8_t,
731
0
      split->types, split->types_alloc_size, max_num_blocks);
732
0
  BROTLI_ENSURE_CAPACITY(m, uint32_t,
733
0
      split->lengths, split->lengths_alloc_size, max_num_blocks);
734
0
  if (BROTLI_IS_OOM(m)) return;
735
0
  self->split_->num_blocks = max_num_blocks;
736
0
  BROTLI_DCHECK(*histograms == 0);
737
0
  *histograms_size = max_num_types;
738
0
  *histograms = BROTLI_ALLOC(m, HistogramType, *histograms_size);
739
0
  self->histograms_ = *histograms;
740
0
  if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(*histograms)) return;
741
  /* Clear only current histogram. */
742
0
  FN(HistogramClear)(&self->histograms_[0]);
743
0
  self->last_histogram_ix_[0] = self->last_histogram_ix_[1] = 0;
744
0
}
745
746
/* Does either of three things:
747
     (1) emits the current block with a new block type;
748
     (2) emits the current block with the type of the second last block;
749
     (3) merges the current block with the last block. */
750
static void FN(BlockSplitterFinishBlock)(
751
0
    FN(BlockSplitter)* self, BROTLI_BOOL is_final) {
752
0
  BlockSplit* split = self->split_;
753
0
  double* last_entropy = self->last_entropy_;
754
0
  HistogramType* histograms = self->histograms_;
755
0
  self->block_size_ =
756
0
      BROTLI_MAX(size_t, self->block_size_, self->min_block_size_);
757
0
  if (self->num_blocks_ == 0) {
758
    /* Create first block. */
759
0
    split->lengths[0] = (uint32_t)self->block_size_;
760
0
    split->types[0] = 0;
761
0
    last_entropy[0] =
762
0
        BitsEntropy(histograms[0].data_, self->alphabet_size_);
763
0
    last_entropy[1] = last_entropy[0];
764
0
    ++self->num_blocks_;
765
0
    ++split->num_types;
766
0
    ++self->curr_histogram_ix_;
767
0
    if (self->curr_histogram_ix_ < *self->histograms_size_)
768
0
      FN(HistogramClear)(&histograms[self->curr_histogram_ix_]);
769
0
    self->block_size_ = 0;
770
0
  } else if (self->block_size_ > 0) {
771
0
    double entropy = BitsEntropy(histograms[self->curr_histogram_ix_].data_,
772
0
                                 self->alphabet_size_);
773
0
    double combined_entropy[2];
774
0
    double diff[2];
775
0
    size_t j;
776
0
    for (j = 0; j < 2; ++j) {
777
0
      size_t last_histogram_ix = self->last_histogram_ix_[j];
778
0
      self->combined_histo[j] = histograms[self->curr_histogram_ix_];
779
0
      FN(HistogramAddHistogram)(&self->combined_histo[j],
780
0
          &histograms[last_histogram_ix]);
781
0
      combined_entropy[j] = BitsEntropy(
782
0
          &self->combined_histo[j].data_[0], self->alphabet_size_);
783
0
      diff[j] = combined_entropy[j] - entropy - last_entropy[j];
784
0
    }
785
786
0
    if (split->num_types < BROTLI_MAX_NUMBER_OF_BLOCK_TYPES &&
787
0
        diff[0] > self->split_threshold_ &&
788
0
        diff[1] > self->split_threshold_) {
789
      /* Create new block. */
790
0
      split->lengths[self->num_blocks_] = (uint32_t)self->block_size_;
791
0
      split->types[self->num_blocks_] = (uint8_t)split->num_types;
792
0
      self->last_histogram_ix_[1] = self->last_histogram_ix_[0];
793
0
      self->last_histogram_ix_[0] = (uint8_t)split->num_types;
794
0
      last_entropy[1] = last_entropy[0];
795
0
      last_entropy[0] = entropy;
796
0
      ++self->num_blocks_;
797
0
      ++split->num_types;
798
0
      ++self->curr_histogram_ix_;
799
0
      if (self->curr_histogram_ix_ < *self->histograms_size_)
800
0
        FN(HistogramClear)(&histograms[self->curr_histogram_ix_]);
801
0
      self->block_size_ = 0;
802
0
      self->merge_last_count_ = 0;
803
0
      self->target_block_size_ = self->min_block_size_;
804
0
    } else if (diff[1] < diff[0] - 20.0) {
805
      /* Combine this block with second last block. */
806
0
      split->lengths[self->num_blocks_] = (uint32_t)self->block_size_;
807
0
      split->types[self->num_blocks_] = split->types[self->num_blocks_ - 2];
808
0
      BROTLI_SWAP(size_t, self->last_histogram_ix_, 0, 1);
809
0
      histograms[self->last_histogram_ix_[0]] = self->combined_histo[1];
810
0
      last_entropy[1] = last_entropy[0];
811
0
      last_entropy[0] = combined_entropy[1];
812
0
      ++self->num_blocks_;
813
0
      self->block_size_ = 0;
814
0
      FN(HistogramClear)(&histograms[self->curr_histogram_ix_]);
815
0
      self->merge_last_count_ = 0;
816
0
      self->target_block_size_ = self->min_block_size_;
817
0
    } else {
818
      /* Combine this block with last block. */
819
0
      split->lengths[self->num_blocks_ - 1] += (uint32_t)self->block_size_;
820
0
      histograms[self->last_histogram_ix_[0]] = self->combined_histo[0];
821
0
      last_entropy[0] = combined_entropy[0];
822
0
      if (split->num_types == 1) {
823
0
        last_entropy[1] = last_entropy[0];
824
0
      }
825
0
      self->block_size_ = 0;
826
0
      FN(HistogramClear)(&histograms[self->curr_histogram_ix_]);
827
0
      if (++self->merge_last_count_ > 1) {
828
0
        self->target_block_size_ += self->min_block_size_;
829
0
      }
830
0
    }
831
0
  }
832
0
  if (is_final) {
833
0
    *self->histograms_size_ = split->num_types;
834
0
    split->num_blocks = self->num_blocks_;
835
0
  }
836
0
}
837
838
/* Adds the next symbol to the current histogram. When the current histogram
839
   reaches the target size, decides on merging the block. */
840
0
static void FN(BlockSplitterAddSymbol)(FN(BlockSplitter)* self, size_t symbol) {
841
0
  FN(HistogramAdd)(&self->histograms_[self->curr_histogram_ix_], symbol);
842
0
  ++self->block_size_;
843
0
  if (self->block_size_ == self->target_block_size_) {
844
0
    FN(BlockSplitterFinishBlock)(self, /* is_final = */ BROTLI_FALSE);
845
0
  }
846
0
}
847
848
#undef HistogramType
849
#undef FN
850
851
#define BROTLI_MAX_STATIC_CONTEXTS 13
852
853
/* Greedy block splitter for one block category (literal, command or distance).
854
   Gathers histograms for all context buckets. */
855
typedef struct ContextBlockSplitter {
856
  /* Alphabet size of particular block category. */
857
  size_t alphabet_size_;
858
  size_t num_contexts_;
859
  size_t max_block_types_;
860
  /* We collect at least this many symbols for each block. */
861
  size_t min_block_size_;
862
  /* We merge histograms A and B if
863
       entropy(A+B) < entropy(A) + entropy(B) + split_threshold_,
864
     where A is the current histogram and B is the histogram of the last or the
865
     second last block type. */
866
  double split_threshold_;
867
868
  size_t num_blocks_;
869
  BlockSplit* split_;  /* not owned */
870
  HistogramLiteral* histograms_;  /* not owned */
871
  size_t* histograms_size_;  /* not owned */
872
873
  /* The number of symbols that we want to collect before deciding on whether
874
     or not to merge the block with a previous one or emit a new block. */
875
  size_t target_block_size_;
876
  /* The number of symbols in the current histogram. */
877
  size_t block_size_;
878
  /* Offset of the current histogram. */
879
  size_t curr_histogram_ix_;
880
  /* Offset of the histograms of the previous two block types. */
881
  size_t last_histogram_ix_[2];
882
  /* Entropy of the previous two block types. */
883
  double last_entropy_[2 * BROTLI_MAX_STATIC_CONTEXTS];
884
  /* The number of times we merged the current block with the last one. */
885
  size_t merge_last_count_;
886
} ContextBlockSplitter;
887
888
static void InitContextBlockSplitter(
889
    MemoryManager* m, ContextBlockSplitter* self, size_t alphabet_size,
890
    size_t num_contexts, size_t min_block_size, double split_threshold,
891
    size_t num_symbols, BlockSplit* split, HistogramLiteral** histograms,
892
0
    size_t* histograms_size) {
893
0
  size_t max_num_blocks = num_symbols / min_block_size + 1;
894
0
  size_t max_num_types;
895
0
  BROTLI_DCHECK(num_contexts <= BROTLI_MAX_STATIC_CONTEXTS);
896
897
0
  self->alphabet_size_ = alphabet_size;
898
0
  self->num_contexts_ = num_contexts;
899
0
  self->max_block_types_ = BROTLI_MAX_NUMBER_OF_BLOCK_TYPES / num_contexts;
900
0
  self->min_block_size_ = min_block_size;
901
0
  self->split_threshold_ = split_threshold;
902
0
  self->num_blocks_ = 0;
903
0
  self->split_ = split;
904
0
  self->histograms_size_ = histograms_size;
905
0
  self->target_block_size_ = min_block_size;
906
0
  self->block_size_ = 0;
907
0
  self->curr_histogram_ix_ = 0;
908
0
  self->merge_last_count_ = 0;
909
910
  /* We have to allocate one more histogram than the maximum number of block
911
     types for the current histogram when the meta-block is too big. */
912
0
  max_num_types =
913
0
      BROTLI_MIN(size_t, max_num_blocks, self->max_block_types_ + 1);
914
0
  BROTLI_ENSURE_CAPACITY(m, uint8_t,
915
0
      split->types, split->types_alloc_size, max_num_blocks);
916
0
  BROTLI_ENSURE_CAPACITY(m, uint32_t,
917
0
      split->lengths, split->lengths_alloc_size, max_num_blocks);
918
0
  if (BROTLI_IS_OOM(m)) return;
919
0
  split->num_blocks = max_num_blocks;
920
0
  if (BROTLI_IS_OOM(m)) return;
921
0
  BROTLI_DCHECK(*histograms == 0);
922
0
  *histograms_size = max_num_types * num_contexts;
923
0
  *histograms = BROTLI_ALLOC(m, HistogramLiteral, *histograms_size);
924
0
  self->histograms_ = *histograms;
925
0
  if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(*histograms)) return;
926
  /* Clear only current histogram. */
927
0
  ClearHistogramsLiteral(&self->histograms_[0], num_contexts);
928
0
  self->last_histogram_ix_[0] = self->last_histogram_ix_[1] = 0;
929
0
}
930
931
/* Does either of three things:
932
     (1) emits the current block with a new block type;
933
     (2) emits the current block with the type of the second last block;
934
     (3) merges the current block with the last block. */
935
static void ContextBlockSplitterFinishBlock(
936
0
    ContextBlockSplitter* self, MemoryManager* m, BROTLI_BOOL is_final) {
937
0
  BlockSplit* split = self->split_;
938
0
  const size_t num_contexts = self->num_contexts_;
939
0
  double* last_entropy = self->last_entropy_;
940
0
  HistogramLiteral* histograms = self->histograms_;
941
942
0
  if (self->block_size_ < self->min_block_size_) {
943
0
    self->block_size_ = self->min_block_size_;
944
0
  }
945
0
  if (self->num_blocks_ == 0) {
946
0
    size_t i;
947
    /* Create first block. */
948
0
    split->lengths[0] = (uint32_t)self->block_size_;
949
0
    split->types[0] = 0;
950
951
0
    for (i = 0; i < num_contexts; ++i) {
952
0
      last_entropy[i] =
953
0
          BitsEntropy(histograms[i].data_, self->alphabet_size_);
954
0
      last_entropy[num_contexts + i] = last_entropy[i];
955
0
    }
956
0
    ++self->num_blocks_;
957
0
    ++split->num_types;
958
0
    self->curr_histogram_ix_ += num_contexts;
959
0
    if (self->curr_histogram_ix_ < *self->histograms_size_) {
960
0
      ClearHistogramsLiteral(
961
0
          &self->histograms_[self->curr_histogram_ix_], self->num_contexts_);
962
0
    }
963
0
    self->block_size_ = 0;
964
0
  } else if (self->block_size_ > 0) {
965
    /* Try merging the set of histograms for the current block type with the
966
       respective set of histograms for the last and second last block types.
967
       Decide over the split based on the total reduction of entropy across
968
       all contexts. */
969
0
    double entropy[BROTLI_MAX_STATIC_CONTEXTS];
970
0
    HistogramLiteral* combined_histo =
971
0
        BROTLI_ALLOC(m, HistogramLiteral, 2 * num_contexts);
972
0
    double combined_entropy[2 * BROTLI_MAX_STATIC_CONTEXTS];
973
0
    double diff[2] = { 0.0 };
974
0
    size_t i;
975
0
    if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(combined_histo)) return;
976
0
    for (i = 0; i < num_contexts; ++i) {
977
0
      size_t curr_histo_ix = self->curr_histogram_ix_ + i;
978
0
      size_t j;
979
0
      entropy[i] = BitsEntropy(histograms[curr_histo_ix].data_,
980
0
                               self->alphabet_size_);
981
0
      for (j = 0; j < 2; ++j) {
982
0
        size_t jx = j * num_contexts + i;
983
0
        size_t last_histogram_ix = self->last_histogram_ix_[j] + i;
984
0
        combined_histo[jx] = histograms[curr_histo_ix];
985
0
        HistogramAddHistogramLiteral(&combined_histo[jx],
986
0
            &histograms[last_histogram_ix]);
987
0
        combined_entropy[jx] = BitsEntropy(
988
0
            &combined_histo[jx].data_[0], self->alphabet_size_);
989
0
        diff[j] += combined_entropy[jx] - entropy[i] - last_entropy[jx];
990
0
      }
991
0
    }
992
993
0
    if (split->num_types < self->max_block_types_ &&
994
0
        diff[0] > self->split_threshold_ &&
995
0
        diff[1] > self->split_threshold_) {
996
      /* Create new block. */
997
0
      split->lengths[self->num_blocks_] = (uint32_t)self->block_size_;
998
0
      split->types[self->num_blocks_] = (uint8_t)split->num_types;
999
0
      self->last_histogram_ix_[1] = self->last_histogram_ix_[0];
1000
0
      self->last_histogram_ix_[0] = split->num_types * num_contexts;
1001
0
      for (i = 0; i < num_contexts; ++i) {
1002
0
        last_entropy[num_contexts + i] = last_entropy[i];
1003
0
        last_entropy[i] = entropy[i];
1004
0
      }
1005
0
      ++self->num_blocks_;
1006
0
      ++split->num_types;
1007
0
      self->curr_histogram_ix_ += num_contexts;
1008
0
      if (self->curr_histogram_ix_ < *self->histograms_size_) {
1009
0
        ClearHistogramsLiteral(
1010
0
            &self->histograms_[self->curr_histogram_ix_], self->num_contexts_);
1011
0
      }
1012
0
      self->block_size_ = 0;
1013
0
      self->merge_last_count_ = 0;
1014
0
      self->target_block_size_ = self->min_block_size_;
1015
0
    } else if (diff[1] < diff[0] - 20.0) {
1016
      /* Combine this block with second last block. */
1017
0
      split->lengths[self->num_blocks_] = (uint32_t)self->block_size_;
1018
0
      split->types[self->num_blocks_] = split->types[self->num_blocks_ - 2];
1019
0
      BROTLI_SWAP(size_t, self->last_histogram_ix_, 0, 1);
1020
0
      for (i = 0; i < num_contexts; ++i) {
1021
0
        histograms[self->last_histogram_ix_[0] + i] =
1022
0
            combined_histo[num_contexts + i];
1023
0
        last_entropy[num_contexts + i] = last_entropy[i];
1024
0
        last_entropy[i] = combined_entropy[num_contexts + i];
1025
0
        HistogramClearLiteral(&histograms[self->curr_histogram_ix_ + i]);
1026
0
      }
1027
0
      ++self->num_blocks_;
1028
0
      self->block_size_ = 0;
1029
0
      self->merge_last_count_ = 0;
1030
0
      self->target_block_size_ = self->min_block_size_;
1031
0
    } else {
1032
      /* Combine this block with last block. */
1033
0
      split->lengths[self->num_blocks_ - 1] += (uint32_t)self->block_size_;
1034
0
      for (i = 0; i < num_contexts; ++i) {
1035
0
        histograms[self->last_histogram_ix_[0] + i] = combined_histo[i];
1036
0
        last_entropy[i] = combined_entropy[i];
1037
0
        if (split->num_types == 1) {
1038
0
          last_entropy[num_contexts + i] = last_entropy[i];
1039
0
        }
1040
0
        HistogramClearLiteral(&histograms[self->curr_histogram_ix_ + i]);
1041
0
      }
1042
0
      self->block_size_ = 0;
1043
0
      if (++self->merge_last_count_ > 1) {
1044
0
        self->target_block_size_ += self->min_block_size_;
1045
0
      }
1046
0
    }
1047
0
    BROTLI_FREE(m, combined_histo);
1048
0
  }
1049
0
  if (is_final) {
1050
0
    *self->histograms_size_ = split->num_types * num_contexts;
1051
0
    split->num_blocks = self->num_blocks_;
1052
0
  }
1053
0
}
1054
1055
/* Adds the next symbol to the current block type and context. When the
1056
   current block reaches the target size, decides on merging the block. */
1057
static void ContextBlockSplitterAddSymbol(
1058
    ContextBlockSplitter* self, MemoryManager* m,
1059
0
    size_t symbol, size_t context) {
1060
0
  HistogramAddLiteral(&self->histograms_[self->curr_histogram_ix_ + context],
1061
0
      symbol);
1062
0
  ++self->block_size_;
1063
0
  if (self->block_size_ == self->target_block_size_) {
1064
0
    ContextBlockSplitterFinishBlock(self, m, /* is_final = */ BROTLI_FALSE);
1065
0
    if (BROTLI_IS_OOM(m)) return;
1066
0
  }
1067
0
}
1068
1069
static void MapStaticContexts(MemoryManager* m,
1070
                              size_t num_contexts,
1071
                              const uint32_t* static_context_map,
1072
0
                              MetaBlockSplit* mb) {
1073
0
  size_t i;
1074
0
  BROTLI_DCHECK(mb->literal_context_map == 0);
1075
0
  mb->literal_context_map_size =
1076
0
      mb->literal_split.num_types << BROTLI_LITERAL_CONTEXT_BITS;
1077
0
  mb->literal_context_map =
1078
0
      BROTLI_ALLOC(m, uint32_t, mb->literal_context_map_size);
1079
0
  if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(mb->literal_context_map)) return;
1080
1081
0
  for (i = 0; i < mb->literal_split.num_types; ++i) {
1082
0
    uint32_t offset = (uint32_t)(i * num_contexts);
1083
0
    size_t j;
1084
0
    for (j = 0; j < (1u << BROTLI_LITERAL_CONTEXT_BITS); ++j) {
1085
0
      mb->literal_context_map[(i << BROTLI_LITERAL_CONTEXT_BITS) + j] =
1086
0
          offset + static_context_map[j];
1087
0
    }
1088
0
  }
1089
0
}
1090
1091
typedef struct GreedyMetablockArena {
1092
  union {
1093
    BlockSplitterLiteral plain;
1094
    ContextBlockSplitter ctx;
1095
  } lit_blocks;
1096
  BlockSplitterCommand cmd_blocks;
1097
  BlockSplitterDistance dist_blocks;
1098
} GreedyMetablockArena;
1099
1100
static BROTLI_INLINE void BrotliBuildMetaBlockGreedyInternal(
1101
    MemoryManager* m, GreedyMetablockArena* arena, const uint8_t* ringbuffer,
1102
    size_t pos, size_t mask, uint8_t prev_byte, uint8_t prev_byte2,
1103
    ContextLut literal_context_lut, const size_t num_contexts,
1104
    const uint32_t* static_context_map, const Command* commands,
1105
0
    size_t n_commands, MetaBlockSplit* mb) {
1106
0
  size_t num_literals = 0;
1107
0
  size_t i;
1108
0
  for (i = 0; i < n_commands; ++i) {
1109
0
    num_literals += commands[i].insert_len_;
1110
0
  }
1111
1112
0
  if (num_contexts == 1) {
1113
0
    InitBlockSplitterLiteral(m, &arena->lit_blocks.plain, 256, 512, 400.0,
1114
0
        num_literals, &mb->literal_split, &mb->literal_histograms,
1115
0
        &mb->literal_histograms_size);
1116
0
  } else {
1117
0
    InitContextBlockSplitter(m, &arena->lit_blocks.ctx, 256, num_contexts, 512,
1118
0
        400.0, num_literals, &mb->literal_split, &mb->literal_histograms,
1119
0
        &mb->literal_histograms_size);
1120
0
  }
1121
0
  if (BROTLI_IS_OOM(m)) return;
1122
0
  InitBlockSplitterCommand(m, &arena->cmd_blocks, BROTLI_NUM_COMMAND_SYMBOLS,
1123
0
      1024, 500.0, n_commands, &mb->command_split, &mb->command_histograms,
1124
0
      &mb->command_histograms_size);
1125
0
  if (BROTLI_IS_OOM(m)) return;
1126
0
  InitBlockSplitterDistance(m, &arena->dist_blocks, 64, 512, 100.0, n_commands,
1127
0
      &mb->distance_split, &mb->distance_histograms,
1128
0
      &mb->distance_histograms_size);
1129
0
  if (BROTLI_IS_OOM(m)) return;
1130
1131
0
  for (i = 0; i < n_commands; ++i) {
1132
0
    const Command cmd = commands[i];
1133
0
    size_t j;
1134
0
    BlockSplitterAddSymbolCommand(&arena->cmd_blocks, cmd.cmd_prefix_);
1135
0
    for (j = cmd.insert_len_; j != 0; --j) {
1136
0
      uint8_t literal = ringbuffer[pos & mask];
1137
0
      if (num_contexts == 1) {
1138
0
        BlockSplitterAddSymbolLiteral(&arena->lit_blocks.plain, literal);
1139
0
      } else {
1140
0
        size_t context =
1141
0
            BROTLI_CONTEXT(prev_byte, prev_byte2, literal_context_lut);
1142
0
        ContextBlockSplitterAddSymbol(&arena->lit_blocks.ctx, m, literal,
1143
0
                                      static_context_map[context]);
1144
0
        if (BROTLI_IS_OOM(m)) return;
1145
0
      }
1146
0
      prev_byte2 = prev_byte;
1147
0
      prev_byte = literal;
1148
0
      ++pos;
1149
0
    }
1150
0
    pos += CommandCopyLen(&cmd);
1151
0
    if (CommandCopyLen(&cmd)) {
1152
0
      prev_byte2 = ringbuffer[(pos - 2) & mask];
1153
0
      prev_byte = ringbuffer[(pos - 1) & mask];
1154
0
      if (cmd.cmd_prefix_ >= 128) {
1155
0
        BlockSplitterAddSymbolDistance(
1156
0
            &arena->dist_blocks, cmd.dist_prefix_ & 0x3FF);
1157
0
      }
1158
0
    }
1159
0
  }
1160
1161
0
  if (num_contexts == 1) {
1162
0
    BlockSplitterFinishBlockLiteral(
1163
0
        &arena->lit_blocks.plain, /* is_final = */ BROTLI_TRUE);
1164
0
  } else {
1165
0
    ContextBlockSplitterFinishBlock(
1166
0
        &arena->lit_blocks.ctx, m, /* is_final = */ BROTLI_TRUE);
1167
0
    if (BROTLI_IS_OOM(m)) return;
1168
0
  }
1169
0
  BlockSplitterFinishBlockCommand(
1170
0
      &arena->cmd_blocks, /* is_final = */ BROTLI_TRUE);
1171
0
  BlockSplitterFinishBlockDistance(
1172
0
      &arena->dist_blocks, /* is_final = */ BROTLI_TRUE);
1173
1174
0
  if (num_contexts > 1) {
1175
0
    MapStaticContexts(m, num_contexts, static_context_map, mb);
1176
0
  }
1177
0
}
1178
1179
void duckdb_brotli::BrotliBuildMetaBlockGreedy(MemoryManager* m,
1180
                                const uint8_t* ringbuffer,
1181
                                size_t pos,
1182
                                size_t mask,
1183
                                uint8_t prev_byte,
1184
                                uint8_t prev_byte2,
1185
                                ContextLut literal_context_lut,
1186
                                size_t num_contexts,
1187
                                const uint32_t* static_context_map,
1188
                                const Command* commands,
1189
                                size_t n_commands,
1190
0
                                MetaBlockSplit* mb) {
1191
0
  GreedyMetablockArena* arena = BROTLI_ALLOC(m, GreedyMetablockArena, 1);
1192
0
  if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(arena)) return;
1193
0
  if (num_contexts == 1) {
1194
0
    BrotliBuildMetaBlockGreedyInternal(m, arena, ringbuffer, pos, mask,
1195
0
        prev_byte, prev_byte2, literal_context_lut, 1, NULL, commands,
1196
0
        n_commands, mb);
1197
0
  } else {
1198
0
    BrotliBuildMetaBlockGreedyInternal(m, arena, ringbuffer, pos, mask,
1199
0
        prev_byte, prev_byte2, literal_context_lut, num_contexts,
1200
0
        static_context_map, commands, n_commands, mb);
1201
0
  }
1202
0
  BROTLI_FREE(m, arena);
1203
0
}
1204
1205
void duckdb_brotli::BrotliOptimizeHistograms(uint32_t num_distance_codes,
1206
0
                              MetaBlockSplit* mb) {
1207
0
  uint8_t good_for_rle[BROTLI_NUM_COMMAND_SYMBOLS];
1208
0
  size_t i;
1209
0
  for (i = 0; i < mb->literal_histograms_size; ++i) {
1210
0
    BrotliOptimizeHuffmanCountsForRle(256, mb->literal_histograms[i].data_,
1211
0
                                      good_for_rle);
1212
0
  }
1213
0
  for (i = 0; i < mb->command_histograms_size; ++i) {
1214
0
    BrotliOptimizeHuffmanCountsForRle(BROTLI_NUM_COMMAND_SYMBOLS,
1215
0
                                      mb->command_histograms[i].data_,
1216
0
                                      good_for_rle);
1217
0
  }
1218
0
  for (i = 0; i < mb->distance_histograms_size; ++i) {
1219
0
    BrotliOptimizeHuffmanCountsForRle(num_distance_codes,
1220
0
                                      mb->distance_histograms[i].data_,
1221
0
                                      good_for_rle);
1222
0
  }
1223
0
}
1224
1225