/src/duckdb/third_party/brotli/enc/metablock.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* Copyright 2015 Google Inc. All Rights Reserved. |
2 | | |
3 | | Distributed under MIT license. |
4 | | See file LICENSE for detail or copy at https://opensource.org/licenses/MIT |
5 | | */ |
6 | | |
7 | | /* Algorithms for distributing the literals and commands of a metablock between |
8 | | block types and contexts. */ |
9 | | |
10 | | #include "metablock.h" |
11 | | |
12 | | #include <brotli/types.h> |
13 | | |
14 | | #include "../common/brotli_constants.h" |
15 | | #include "../common/context.h" |
16 | | #include "../common/brotli_platform.h" |
17 | | #include "bit_cost.h" |
18 | | #include "block_splitter.h" |
19 | | #include "cluster.h" |
20 | | #include "entropy_encode.h" |
21 | | #include "histogram.h" |
22 | | #include "memory.h" |
23 | | #include "quality.h" |
24 | | |
25 | | using namespace duckdb_brotli; |
26 | | |
27 | | void duckdb_brotli::BrotliInitDistanceParams(BrotliDistanceParams* dist_params, |
28 | 0 | uint32_t npostfix, uint32_t ndirect, BROTLI_BOOL large_window) { |
29 | 0 | uint32_t alphabet_size_max; |
30 | 0 | uint32_t alphabet_size_limit; |
31 | 0 | uint32_t max_distance; |
32 | |
|
33 | 0 | dist_params->distance_postfix_bits = npostfix; |
34 | 0 | dist_params->num_direct_distance_codes = ndirect; |
35 | |
|
36 | 0 | alphabet_size_max = BROTLI_DISTANCE_ALPHABET_SIZE( |
37 | 0 | npostfix, ndirect, BROTLI_MAX_DISTANCE_BITS); |
38 | 0 | alphabet_size_limit = alphabet_size_max; |
39 | 0 | max_distance = ndirect + (1U << (BROTLI_MAX_DISTANCE_BITS + npostfix + 2)) - |
40 | 0 | (1U << (npostfix + 2)); |
41 | |
|
42 | 0 | if (large_window) { |
43 | 0 | BrotliDistanceCodeLimit limit = BrotliCalculateDistanceCodeLimit( |
44 | 0 | BROTLI_MAX_ALLOWED_DISTANCE, npostfix, ndirect); |
45 | 0 | alphabet_size_max = BROTLI_DISTANCE_ALPHABET_SIZE( |
46 | 0 | npostfix, ndirect, BROTLI_LARGE_MAX_DISTANCE_BITS); |
47 | 0 | alphabet_size_limit = limit.max_alphabet_size; |
48 | 0 | max_distance = limit.max_distance; |
49 | 0 | } |
50 | |
|
51 | 0 | dist_params->alphabet_size_max = alphabet_size_max; |
52 | 0 | dist_params->alphabet_size_limit = alphabet_size_limit; |
53 | 0 | dist_params->max_distance = max_distance; |
54 | 0 | } |
55 | | |
56 | | static void RecomputeDistancePrefixes(Command* cmds, |
57 | | size_t num_commands, |
58 | | const BrotliDistanceParams* orig_params, |
59 | 0 | const BrotliDistanceParams* new_params) { |
60 | 0 | size_t i; |
61 | |
|
62 | 0 | if (orig_params->distance_postfix_bits == new_params->distance_postfix_bits && |
63 | 0 | orig_params->num_direct_distance_codes == |
64 | 0 | new_params->num_direct_distance_codes) { |
65 | 0 | return; |
66 | 0 | } |
67 | | |
68 | 0 | for (i = 0; i < num_commands; ++i) { |
69 | 0 | Command* cmd = &cmds[i]; |
70 | 0 | if (CommandCopyLen(cmd) && cmd->cmd_prefix_ >= 128) { |
71 | 0 | PrefixEncodeCopyDistance(CommandRestoreDistanceCode(cmd, orig_params), |
72 | 0 | new_params->num_direct_distance_codes, |
73 | 0 | new_params->distance_postfix_bits, |
74 | 0 | &cmd->dist_prefix_, |
75 | 0 | &cmd->dist_extra_); |
76 | 0 | } |
77 | 0 | } |
78 | 0 | } |
79 | | |
80 | | static BROTLI_BOOL ComputeDistanceCost(const Command* cmds, |
81 | | size_t num_commands, |
82 | | const BrotliDistanceParams* orig_params, |
83 | | const BrotliDistanceParams* new_params, |
84 | | double* cost, |
85 | 0 | HistogramDistance* tmp) { |
86 | 0 | size_t i; |
87 | 0 | BROTLI_BOOL equal_params = BROTLI_FALSE; |
88 | 0 | uint16_t dist_prefix; |
89 | 0 | uint32_t dist_extra; |
90 | 0 | double extra_bits = 0.0; |
91 | 0 | HistogramClearDistance(tmp); |
92 | |
|
93 | 0 | if (orig_params->distance_postfix_bits == new_params->distance_postfix_bits && |
94 | 0 | orig_params->num_direct_distance_codes == |
95 | 0 | new_params->num_direct_distance_codes) { |
96 | 0 | equal_params = BROTLI_TRUE; |
97 | 0 | } |
98 | |
|
99 | 0 | for (i = 0; i < num_commands; i++) { |
100 | 0 | const Command* cmd = &cmds[i]; |
101 | 0 | if (CommandCopyLen(cmd) && cmd->cmd_prefix_ >= 128) { |
102 | 0 | if (equal_params) { |
103 | 0 | dist_prefix = cmd->dist_prefix_; |
104 | 0 | } else { |
105 | 0 | uint32_t distance = CommandRestoreDistanceCode(cmd, orig_params); |
106 | 0 | if (distance > new_params->max_distance) { |
107 | 0 | return BROTLI_FALSE; |
108 | 0 | } |
109 | 0 | PrefixEncodeCopyDistance(distance, |
110 | 0 | new_params->num_direct_distance_codes, |
111 | 0 | new_params->distance_postfix_bits, |
112 | 0 | &dist_prefix, |
113 | 0 | &dist_extra); |
114 | 0 | } |
115 | 0 | HistogramAddDistance(tmp, dist_prefix & 0x3FF); |
116 | 0 | extra_bits += dist_prefix >> 10; |
117 | 0 | } |
118 | 0 | } |
119 | | |
120 | 0 | *cost = BrotliPopulationCostDistance(tmp) + extra_bits; |
121 | 0 | return BROTLI_TRUE; |
122 | 0 | } |
123 | | |
124 | | void duckdb_brotli::BrotliBuildMetaBlock(MemoryManager* m, |
125 | | const uint8_t* ringbuffer, |
126 | | const size_t pos, |
127 | | const size_t mask, |
128 | | BrotliEncoderParams* params, |
129 | | uint8_t prev_byte, |
130 | | uint8_t prev_byte2, |
131 | | Command* cmds, |
132 | | size_t num_commands, |
133 | | ContextType literal_context_mode, |
134 | 0 | MetaBlockSplit* mb) { |
135 | | /* Histogram ids need to fit in one byte. */ |
136 | 0 | static const size_t kMaxNumberOfHistograms = 256; |
137 | 0 | HistogramDistance* distance_histograms; |
138 | 0 | HistogramLiteral* literal_histograms; |
139 | 0 | ContextType* literal_context_modes = NULL; |
140 | 0 | size_t literal_histograms_size; |
141 | 0 | size_t distance_histograms_size; |
142 | 0 | size_t i; |
143 | 0 | size_t literal_context_multiplier = 1; |
144 | 0 | uint32_t npostfix; |
145 | 0 | uint32_t ndirect_msb = 0; |
146 | 0 | BROTLI_BOOL check_orig = BROTLI_TRUE; |
147 | 0 | double best_dist_cost = 1e99; |
148 | 0 | BrotliDistanceParams orig_params = params->dist; |
149 | 0 | BrotliDistanceParams new_params = params->dist; |
150 | 0 | HistogramDistance* tmp = BROTLI_ALLOC(m, HistogramDistance, 1); |
151 | |
|
152 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(tmp)) return; |
153 | | |
154 | 0 | for (npostfix = 0; npostfix <= BROTLI_MAX_NPOSTFIX; npostfix++) { |
155 | 0 | for (; ndirect_msb < 16; ndirect_msb++) { |
156 | 0 | uint32_t ndirect = ndirect_msb << npostfix; |
157 | 0 | BROTLI_BOOL skip; |
158 | 0 | double dist_cost; |
159 | 0 | BrotliInitDistanceParams(&new_params, npostfix, ndirect, |
160 | 0 | params->large_window); |
161 | 0 | if (npostfix == orig_params.distance_postfix_bits && |
162 | 0 | ndirect == orig_params.num_direct_distance_codes) { |
163 | 0 | check_orig = BROTLI_FALSE; |
164 | 0 | } |
165 | 0 | skip = !ComputeDistanceCost( |
166 | 0 | cmds, num_commands, &orig_params, &new_params, &dist_cost, tmp); |
167 | 0 | if (skip || (dist_cost > best_dist_cost)) { |
168 | 0 | break; |
169 | 0 | } |
170 | 0 | best_dist_cost = dist_cost; |
171 | 0 | params->dist = new_params; |
172 | 0 | } |
173 | 0 | if (ndirect_msb > 0) ndirect_msb--; |
174 | 0 | ndirect_msb /= 2; |
175 | 0 | } |
176 | 0 | if (check_orig) { |
177 | 0 | double dist_cost; |
178 | 0 | ComputeDistanceCost(cmds, num_commands, &orig_params, &orig_params, |
179 | 0 | &dist_cost, tmp); |
180 | 0 | if (dist_cost < best_dist_cost) { |
181 | | /* NB: currently unused; uncomment when more param tuning is added. */ |
182 | | /* best_dist_cost = dist_cost; */ |
183 | 0 | params->dist = orig_params; |
184 | 0 | } |
185 | 0 | } |
186 | 0 | BROTLI_FREE(m, tmp); |
187 | 0 | RecomputeDistancePrefixes(cmds, num_commands, &orig_params, ¶ms->dist); |
188 | |
|
189 | 0 | BrotliSplitBlock(m, cmds, num_commands, |
190 | 0 | ringbuffer, pos, mask, params, |
191 | 0 | &mb->literal_split, |
192 | 0 | &mb->command_split, |
193 | 0 | &mb->distance_split); |
194 | 0 | if (BROTLI_IS_OOM(m)) return; |
195 | | |
196 | 0 | if (!params->disable_literal_context_modeling) { |
197 | 0 | literal_context_multiplier = 1 << BROTLI_LITERAL_CONTEXT_BITS; |
198 | 0 | literal_context_modes = |
199 | 0 | BROTLI_ALLOC(m, ContextType, mb->literal_split.num_types); |
200 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(literal_context_modes)) return; |
201 | 0 | for (i = 0; i < mb->literal_split.num_types; ++i) { |
202 | 0 | literal_context_modes[i] = literal_context_mode; |
203 | 0 | } |
204 | 0 | } |
205 | | |
206 | 0 | literal_histograms_size = |
207 | 0 | mb->literal_split.num_types * literal_context_multiplier; |
208 | 0 | literal_histograms = |
209 | 0 | BROTLI_ALLOC(m, HistogramLiteral, literal_histograms_size); |
210 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(literal_histograms)) return; |
211 | 0 | ClearHistogramsLiteral(literal_histograms, literal_histograms_size); |
212 | |
|
213 | 0 | distance_histograms_size = |
214 | 0 | mb->distance_split.num_types << BROTLI_DISTANCE_CONTEXT_BITS; |
215 | 0 | distance_histograms = |
216 | 0 | BROTLI_ALLOC(m, HistogramDistance, distance_histograms_size); |
217 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(distance_histograms)) return; |
218 | 0 | ClearHistogramsDistance(distance_histograms, distance_histograms_size); |
219 | |
|
220 | 0 | BROTLI_DCHECK(mb->command_histograms == 0); |
221 | 0 | mb->command_histograms_size = mb->command_split.num_types; |
222 | 0 | mb->command_histograms = |
223 | 0 | BROTLI_ALLOC(m, HistogramCommand, mb->command_histograms_size); |
224 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(mb->command_histograms)) return; |
225 | 0 | ClearHistogramsCommand(mb->command_histograms, mb->command_histograms_size); |
226 | |
|
227 | 0 | BrotliBuildHistogramsWithContext(cmds, num_commands, |
228 | 0 | &mb->literal_split, &mb->command_split, &mb->distance_split, |
229 | 0 | ringbuffer, pos, mask, prev_byte, prev_byte2, literal_context_modes, |
230 | 0 | literal_histograms, mb->command_histograms, distance_histograms); |
231 | 0 | BROTLI_FREE(m, literal_context_modes); |
232 | |
|
233 | 0 | BROTLI_DCHECK(mb->literal_context_map == 0); |
234 | 0 | mb->literal_context_map_size = |
235 | 0 | mb->literal_split.num_types << BROTLI_LITERAL_CONTEXT_BITS; |
236 | 0 | mb->literal_context_map = |
237 | 0 | BROTLI_ALLOC(m, uint32_t, mb->literal_context_map_size); |
238 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(mb->literal_context_map)) return; |
239 | | |
240 | 0 | BROTLI_DCHECK(mb->literal_histograms == 0); |
241 | 0 | mb->literal_histograms_size = mb->literal_context_map_size; |
242 | 0 | mb->literal_histograms = |
243 | 0 | BROTLI_ALLOC(m, HistogramLiteral, mb->literal_histograms_size); |
244 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(mb->literal_histograms)) return; |
245 | | |
246 | 0 | BrotliClusterHistogramsLiteral(m, literal_histograms, literal_histograms_size, |
247 | 0 | kMaxNumberOfHistograms, mb->literal_histograms, |
248 | 0 | &mb->literal_histograms_size, mb->literal_context_map); |
249 | 0 | if (BROTLI_IS_OOM(m)) return; |
250 | 0 | BROTLI_FREE(m, literal_histograms); |
251 | |
|
252 | 0 | if (params->disable_literal_context_modeling) { |
253 | | /* Distribute assignment to all contexts. */ |
254 | 0 | for (i = mb->literal_split.num_types; i != 0;) { |
255 | 0 | size_t j = 0; |
256 | 0 | i--; |
257 | 0 | for (; j < (1 << BROTLI_LITERAL_CONTEXT_BITS); j++) { |
258 | 0 | mb->literal_context_map[(i << BROTLI_LITERAL_CONTEXT_BITS) + j] = |
259 | 0 | mb->literal_context_map[i]; |
260 | 0 | } |
261 | 0 | } |
262 | 0 | } |
263 | |
|
264 | 0 | BROTLI_DCHECK(mb->distance_context_map == 0); |
265 | 0 | mb->distance_context_map_size = |
266 | 0 | mb->distance_split.num_types << BROTLI_DISTANCE_CONTEXT_BITS; |
267 | 0 | mb->distance_context_map = |
268 | 0 | BROTLI_ALLOC(m, uint32_t, mb->distance_context_map_size); |
269 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(mb->distance_context_map)) return; |
270 | | |
271 | 0 | BROTLI_DCHECK(mb->distance_histograms == 0); |
272 | 0 | mb->distance_histograms_size = mb->distance_context_map_size; |
273 | 0 | mb->distance_histograms = |
274 | 0 | BROTLI_ALLOC(m, HistogramDistance, mb->distance_histograms_size); |
275 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(mb->distance_histograms)) return; |
276 | | |
277 | 0 | BrotliClusterHistogramsDistance(m, distance_histograms, |
278 | 0 | mb->distance_context_map_size, |
279 | 0 | kMaxNumberOfHistograms, |
280 | 0 | mb->distance_histograms, |
281 | 0 | &mb->distance_histograms_size, |
282 | 0 | mb->distance_context_map); |
283 | 0 | if (BROTLI_IS_OOM(m)) return; |
284 | 0 | BROTLI_FREE(m, distance_histograms); |
285 | 0 | } |
286 | | |
287 | 0 | #define FN(X) X ## Literal |
288 | | /* NOLINT(build/header_guard) */ |
289 | | /* Copyright 2015 Google Inc. All Rights Reserved. |
290 | | |
291 | | Distributed under MIT license. |
292 | | See file LICENSE for detail or copy at https://opensource.org/licenses/MIT |
293 | | */ |
294 | | |
295 | | /* template parameters: FN */ |
296 | | |
297 | 0 | #define HistogramType FN(Histogram) |
298 | | |
299 | | /* Greedy block splitter for one block category (literal, command or distance). |
300 | | */ |
301 | | typedef struct FN(BlockSplitter) { |
302 | | /* Alphabet size of particular block category. */ |
303 | | size_t alphabet_size_; |
304 | | /* We collect at least this many symbols for each block. */ |
305 | | size_t min_block_size_; |
306 | | /* We merge histograms A and B if |
307 | | entropy(A+B) < entropy(A) + entropy(B) + split_threshold_, |
308 | | where A is the current histogram and B is the histogram of the last or the |
309 | | second last block type. */ |
310 | | double split_threshold_; |
311 | | |
312 | | size_t num_blocks_; |
313 | | BlockSplit* split_; /* not owned */ |
314 | | HistogramType* histograms_; /* not owned */ |
315 | | size_t* histograms_size_; /* not owned */ |
316 | | |
317 | | /* Temporary storage for BlockSplitterFinishBlock. */ |
318 | | HistogramType combined_histo[2]; |
319 | | |
320 | | /* The number of symbols that we want to collect before deciding on whether |
321 | | or not to merge the block with a previous one or emit a new block. */ |
322 | | size_t target_block_size_; |
323 | | /* The number of symbols in the current histogram. */ |
324 | | size_t block_size_; |
325 | | /* Offset of the current histogram. */ |
326 | | size_t curr_histogram_ix_; |
327 | | /* Offset of the histograms of the previous two block types. */ |
328 | | size_t last_histogram_ix_[2]; |
329 | | /* Entropy of the previous two block types. */ |
330 | | double last_entropy_[2]; |
331 | | /* The number of times we merged the current block with the last one. */ |
332 | | size_t merge_last_count_; |
333 | | } FN(BlockSplitter); |
334 | | |
335 | | static void FN(InitBlockSplitter)( |
336 | | MemoryManager* m, FN(BlockSplitter)* self, size_t alphabet_size, |
337 | | size_t min_block_size, double split_threshold, size_t num_symbols, |
338 | 0 | BlockSplit* split, HistogramType** histograms, size_t* histograms_size) { |
339 | 0 | size_t max_num_blocks = num_symbols / min_block_size + 1; |
340 | | /* We have to allocate one more histogram than the maximum number of block |
341 | | types for the current histogram when the meta-block is too big. */ |
342 | 0 | size_t max_num_types = |
343 | 0 | BROTLI_MIN(size_t, max_num_blocks, BROTLI_MAX_NUMBER_OF_BLOCK_TYPES + 1); |
344 | 0 | self->alphabet_size_ = alphabet_size; |
345 | 0 | self->min_block_size_ = min_block_size; |
346 | 0 | self->split_threshold_ = split_threshold; |
347 | 0 | self->num_blocks_ = 0; |
348 | 0 | self->split_ = split; |
349 | 0 | self->histograms_size_ = histograms_size; |
350 | 0 | self->target_block_size_ = min_block_size; |
351 | 0 | self->block_size_ = 0; |
352 | 0 | self->curr_histogram_ix_ = 0; |
353 | 0 | self->merge_last_count_ = 0; |
354 | 0 | BROTLI_ENSURE_CAPACITY(m, uint8_t, |
355 | 0 | split->types, split->types_alloc_size, max_num_blocks); |
356 | 0 | BROTLI_ENSURE_CAPACITY(m, uint32_t, |
357 | 0 | split->lengths, split->lengths_alloc_size, max_num_blocks); |
358 | 0 | if (BROTLI_IS_OOM(m)) return; |
359 | 0 | self->split_->num_blocks = max_num_blocks; |
360 | 0 | BROTLI_DCHECK(*histograms == 0); |
361 | 0 | *histograms_size = max_num_types; |
362 | 0 | *histograms = BROTLI_ALLOC(m, HistogramType, *histograms_size); |
363 | 0 | self->histograms_ = *histograms; |
364 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(*histograms)) return; |
365 | | /* Clear only current histogram. */ |
366 | 0 | FN(HistogramClear)(&self->histograms_[0]); |
367 | 0 | self->last_histogram_ix_[0] = self->last_histogram_ix_[1] = 0; |
368 | 0 | } |
369 | | |
370 | | /* Does either of three things: |
371 | | (1) emits the current block with a new block type; |
372 | | (2) emits the current block with the type of the second last block; |
373 | | (3) merges the current block with the last block. */ |
374 | | static void FN(BlockSplitterFinishBlock)( |
375 | 0 | FN(BlockSplitter)* self, BROTLI_BOOL is_final) { |
376 | 0 | BlockSplit* split = self->split_; |
377 | 0 | double* last_entropy = self->last_entropy_; |
378 | 0 | HistogramType* histograms = self->histograms_; |
379 | 0 | self->block_size_ = |
380 | 0 | BROTLI_MAX(size_t, self->block_size_, self->min_block_size_); |
381 | 0 | if (self->num_blocks_ == 0) { |
382 | | /* Create first block. */ |
383 | 0 | split->lengths[0] = (uint32_t)self->block_size_; |
384 | 0 | split->types[0] = 0; |
385 | 0 | last_entropy[0] = |
386 | 0 | BitsEntropy(histograms[0].data_, self->alphabet_size_); |
387 | 0 | last_entropy[1] = last_entropy[0]; |
388 | 0 | ++self->num_blocks_; |
389 | 0 | ++split->num_types; |
390 | 0 | ++self->curr_histogram_ix_; |
391 | 0 | if (self->curr_histogram_ix_ < *self->histograms_size_) |
392 | 0 | FN(HistogramClear)(&histograms[self->curr_histogram_ix_]); |
393 | 0 | self->block_size_ = 0; |
394 | 0 | } else if (self->block_size_ > 0) { |
395 | 0 | double entropy = BitsEntropy(histograms[self->curr_histogram_ix_].data_, |
396 | 0 | self->alphabet_size_); |
397 | 0 | double combined_entropy[2]; |
398 | 0 | double diff[2]; |
399 | 0 | size_t j; |
400 | 0 | for (j = 0; j < 2; ++j) { |
401 | 0 | size_t last_histogram_ix = self->last_histogram_ix_[j]; |
402 | 0 | self->combined_histo[j] = histograms[self->curr_histogram_ix_]; |
403 | 0 | FN(HistogramAddHistogram)(&self->combined_histo[j], |
404 | 0 | &histograms[last_histogram_ix]); |
405 | 0 | combined_entropy[j] = BitsEntropy( |
406 | 0 | &self->combined_histo[j].data_[0], self->alphabet_size_); |
407 | 0 | diff[j] = combined_entropy[j] - entropy - last_entropy[j]; |
408 | 0 | } |
409 | |
|
410 | 0 | if (split->num_types < BROTLI_MAX_NUMBER_OF_BLOCK_TYPES && |
411 | 0 | diff[0] > self->split_threshold_ && |
412 | 0 | diff[1] > self->split_threshold_) { |
413 | | /* Create new block. */ |
414 | 0 | split->lengths[self->num_blocks_] = (uint32_t)self->block_size_; |
415 | 0 | split->types[self->num_blocks_] = (uint8_t)split->num_types; |
416 | 0 | self->last_histogram_ix_[1] = self->last_histogram_ix_[0]; |
417 | 0 | self->last_histogram_ix_[0] = (uint8_t)split->num_types; |
418 | 0 | last_entropy[1] = last_entropy[0]; |
419 | 0 | last_entropy[0] = entropy; |
420 | 0 | ++self->num_blocks_; |
421 | 0 | ++split->num_types; |
422 | 0 | ++self->curr_histogram_ix_; |
423 | 0 | if (self->curr_histogram_ix_ < *self->histograms_size_) |
424 | 0 | FN(HistogramClear)(&histograms[self->curr_histogram_ix_]); |
425 | 0 | self->block_size_ = 0; |
426 | 0 | self->merge_last_count_ = 0; |
427 | 0 | self->target_block_size_ = self->min_block_size_; |
428 | 0 | } else if (diff[1] < diff[0] - 20.0) { |
429 | | /* Combine this block with second last block. */ |
430 | 0 | split->lengths[self->num_blocks_] = (uint32_t)self->block_size_; |
431 | 0 | split->types[self->num_blocks_] = split->types[self->num_blocks_ - 2]; |
432 | 0 | BROTLI_SWAP(size_t, self->last_histogram_ix_, 0, 1); |
433 | 0 | histograms[self->last_histogram_ix_[0]] = self->combined_histo[1]; |
434 | 0 | last_entropy[1] = last_entropy[0]; |
435 | 0 | last_entropy[0] = combined_entropy[1]; |
436 | 0 | ++self->num_blocks_; |
437 | 0 | self->block_size_ = 0; |
438 | 0 | FN(HistogramClear)(&histograms[self->curr_histogram_ix_]); |
439 | 0 | self->merge_last_count_ = 0; |
440 | 0 | self->target_block_size_ = self->min_block_size_; |
441 | 0 | } else { |
442 | | /* Combine this block with last block. */ |
443 | 0 | split->lengths[self->num_blocks_ - 1] += (uint32_t)self->block_size_; |
444 | 0 | histograms[self->last_histogram_ix_[0]] = self->combined_histo[0]; |
445 | 0 | last_entropy[0] = combined_entropy[0]; |
446 | 0 | if (split->num_types == 1) { |
447 | 0 | last_entropy[1] = last_entropy[0]; |
448 | 0 | } |
449 | 0 | self->block_size_ = 0; |
450 | 0 | FN(HistogramClear)(&histograms[self->curr_histogram_ix_]); |
451 | 0 | if (++self->merge_last_count_ > 1) { |
452 | 0 | self->target_block_size_ += self->min_block_size_; |
453 | 0 | } |
454 | 0 | } |
455 | 0 | } |
456 | 0 | if (is_final) { |
457 | 0 | *self->histograms_size_ = split->num_types; |
458 | 0 | split->num_blocks = self->num_blocks_; |
459 | 0 | } |
460 | 0 | } |
461 | | |
462 | | /* Adds the next symbol to the current histogram. When the current histogram |
463 | | reaches the target size, decides on merging the block. */ |
464 | 0 | static void FN(BlockSplitterAddSymbol)(FN(BlockSplitter)* self, size_t symbol) { |
465 | 0 | FN(HistogramAdd)(&self->histograms_[self->curr_histogram_ix_], symbol); |
466 | 0 | ++self->block_size_; |
467 | 0 | if (self->block_size_ == self->target_block_size_) { |
468 | 0 | FN(BlockSplitterFinishBlock)(self, /* is_final = */ BROTLI_FALSE); |
469 | 0 | } |
470 | 0 | } |
471 | | |
472 | | #undef HistogramType |
473 | | #undef FN |
474 | | |
475 | 0 | #define FN(X) X ## Command |
476 | | /* NOLINT(build/header_guard) */ |
477 | | /* Copyright 2015 Google Inc. All Rights Reserved. |
478 | | |
479 | | Distributed under MIT license. |
480 | | See file LICENSE for detail or copy at https://opensource.org/licenses/MIT |
481 | | */ |
482 | | |
483 | | /* template parameters: FN */ |
484 | | |
485 | 0 | #define HistogramType FN(Histogram) |
486 | | |
487 | | /* Greedy block splitter for one block category (literal, command or distance). |
488 | | */ |
489 | | typedef struct FN(BlockSplitter) { |
490 | | /* Alphabet size of particular block category. */ |
491 | | size_t alphabet_size_; |
492 | | /* We collect at least this many symbols for each block. */ |
493 | | size_t min_block_size_; |
494 | | /* We merge histograms A and B if |
495 | | entropy(A+B) < entropy(A) + entropy(B) + split_threshold_, |
496 | | where A is the current histogram and B is the histogram of the last or the |
497 | | second last block type. */ |
498 | | double split_threshold_; |
499 | | |
500 | | size_t num_blocks_; |
501 | | BlockSplit* split_; /* not owned */ |
502 | | HistogramType* histograms_; /* not owned */ |
503 | | size_t* histograms_size_; /* not owned */ |
504 | | |
505 | | /* Temporary storage for BlockSplitterFinishBlock. */ |
506 | | HistogramType combined_histo[2]; |
507 | | |
508 | | /* The number of symbols that we want to collect before deciding on whether |
509 | | or not to merge the block with a previous one or emit a new block. */ |
510 | | size_t target_block_size_; |
511 | | /* The number of symbols in the current histogram. */ |
512 | | size_t block_size_; |
513 | | /* Offset of the current histogram. */ |
514 | | size_t curr_histogram_ix_; |
515 | | /* Offset of the histograms of the previous two block types. */ |
516 | | size_t last_histogram_ix_[2]; |
517 | | /* Entropy of the previous two block types. */ |
518 | | double last_entropy_[2]; |
519 | | /* The number of times we merged the current block with the last one. */ |
520 | | size_t merge_last_count_; |
521 | | } FN(BlockSplitter); |
522 | | |
523 | | static void FN(InitBlockSplitter)( |
524 | | MemoryManager* m, FN(BlockSplitter)* self, size_t alphabet_size, |
525 | | size_t min_block_size, double split_threshold, size_t num_symbols, |
526 | 0 | BlockSplit* split, HistogramType** histograms, size_t* histograms_size) { |
527 | 0 | size_t max_num_blocks = num_symbols / min_block_size + 1; |
528 | | /* We have to allocate one more histogram than the maximum number of block |
529 | | types for the current histogram when the meta-block is too big. */ |
530 | 0 | size_t max_num_types = |
531 | 0 | BROTLI_MIN(size_t, max_num_blocks, BROTLI_MAX_NUMBER_OF_BLOCK_TYPES + 1); |
532 | 0 | self->alphabet_size_ = alphabet_size; |
533 | 0 | self->min_block_size_ = min_block_size; |
534 | 0 | self->split_threshold_ = split_threshold; |
535 | 0 | self->num_blocks_ = 0; |
536 | 0 | self->split_ = split; |
537 | 0 | self->histograms_size_ = histograms_size; |
538 | 0 | self->target_block_size_ = min_block_size; |
539 | 0 | self->block_size_ = 0; |
540 | 0 | self->curr_histogram_ix_ = 0; |
541 | 0 | self->merge_last_count_ = 0; |
542 | 0 | BROTLI_ENSURE_CAPACITY(m, uint8_t, |
543 | 0 | split->types, split->types_alloc_size, max_num_blocks); |
544 | 0 | BROTLI_ENSURE_CAPACITY(m, uint32_t, |
545 | 0 | split->lengths, split->lengths_alloc_size, max_num_blocks); |
546 | 0 | if (BROTLI_IS_OOM(m)) return; |
547 | 0 | self->split_->num_blocks = max_num_blocks; |
548 | 0 | BROTLI_DCHECK(*histograms == 0); |
549 | 0 | *histograms_size = max_num_types; |
550 | 0 | *histograms = BROTLI_ALLOC(m, HistogramType, *histograms_size); |
551 | 0 | self->histograms_ = *histograms; |
552 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(*histograms)) return; |
553 | | /* Clear only current histogram. */ |
554 | 0 | FN(HistogramClear)(&self->histograms_[0]); |
555 | 0 | self->last_histogram_ix_[0] = self->last_histogram_ix_[1] = 0; |
556 | 0 | } |
557 | | |
558 | | /* Does either of three things: |
559 | | (1) emits the current block with a new block type; |
560 | | (2) emits the current block with the type of the second last block; |
561 | | (3) merges the current block with the last block. */ |
562 | | static void FN(BlockSplitterFinishBlock)( |
563 | 0 | FN(BlockSplitter)* self, BROTLI_BOOL is_final) { |
564 | 0 | BlockSplit* split = self->split_; |
565 | 0 | double* last_entropy = self->last_entropy_; |
566 | 0 | HistogramType* histograms = self->histograms_; |
567 | 0 | self->block_size_ = |
568 | 0 | BROTLI_MAX(size_t, self->block_size_, self->min_block_size_); |
569 | 0 | if (self->num_blocks_ == 0) { |
570 | | /* Create first block. */ |
571 | 0 | split->lengths[0] = (uint32_t)self->block_size_; |
572 | 0 | split->types[0] = 0; |
573 | 0 | last_entropy[0] = |
574 | 0 | BitsEntropy(histograms[0].data_, self->alphabet_size_); |
575 | 0 | last_entropy[1] = last_entropy[0]; |
576 | 0 | ++self->num_blocks_; |
577 | 0 | ++split->num_types; |
578 | 0 | ++self->curr_histogram_ix_; |
579 | 0 | if (self->curr_histogram_ix_ < *self->histograms_size_) |
580 | 0 | FN(HistogramClear)(&histograms[self->curr_histogram_ix_]); |
581 | 0 | self->block_size_ = 0; |
582 | 0 | } else if (self->block_size_ > 0) { |
583 | 0 | double entropy = BitsEntropy(histograms[self->curr_histogram_ix_].data_, |
584 | 0 | self->alphabet_size_); |
585 | 0 | double combined_entropy[2]; |
586 | 0 | double diff[2]; |
587 | 0 | size_t j; |
588 | 0 | for (j = 0; j < 2; ++j) { |
589 | 0 | size_t last_histogram_ix = self->last_histogram_ix_[j]; |
590 | 0 | self->combined_histo[j] = histograms[self->curr_histogram_ix_]; |
591 | 0 | FN(HistogramAddHistogram)(&self->combined_histo[j], |
592 | 0 | &histograms[last_histogram_ix]); |
593 | 0 | combined_entropy[j] = BitsEntropy( |
594 | 0 | &self->combined_histo[j].data_[0], self->alphabet_size_); |
595 | 0 | diff[j] = combined_entropy[j] - entropy - last_entropy[j]; |
596 | 0 | } |
597 | |
|
598 | 0 | if (split->num_types < BROTLI_MAX_NUMBER_OF_BLOCK_TYPES && |
599 | 0 | diff[0] > self->split_threshold_ && |
600 | 0 | diff[1] > self->split_threshold_) { |
601 | | /* Create new block. */ |
602 | 0 | split->lengths[self->num_blocks_] = (uint32_t)self->block_size_; |
603 | 0 | split->types[self->num_blocks_] = (uint8_t)split->num_types; |
604 | 0 | self->last_histogram_ix_[1] = self->last_histogram_ix_[0]; |
605 | 0 | self->last_histogram_ix_[0] = (uint8_t)split->num_types; |
606 | 0 | last_entropy[1] = last_entropy[0]; |
607 | 0 | last_entropy[0] = entropy; |
608 | 0 | ++self->num_blocks_; |
609 | 0 | ++split->num_types; |
610 | 0 | ++self->curr_histogram_ix_; |
611 | 0 | if (self->curr_histogram_ix_ < *self->histograms_size_) |
612 | 0 | FN(HistogramClear)(&histograms[self->curr_histogram_ix_]); |
613 | 0 | self->block_size_ = 0; |
614 | 0 | self->merge_last_count_ = 0; |
615 | 0 | self->target_block_size_ = self->min_block_size_; |
616 | 0 | } else if (diff[1] < diff[0] - 20.0) { |
617 | | /* Combine this block with second last block. */ |
618 | 0 | split->lengths[self->num_blocks_] = (uint32_t)self->block_size_; |
619 | 0 | split->types[self->num_blocks_] = split->types[self->num_blocks_ - 2]; |
620 | 0 | BROTLI_SWAP(size_t, self->last_histogram_ix_, 0, 1); |
621 | 0 | histograms[self->last_histogram_ix_[0]] = self->combined_histo[1]; |
622 | 0 | last_entropy[1] = last_entropy[0]; |
623 | 0 | last_entropy[0] = combined_entropy[1]; |
624 | 0 | ++self->num_blocks_; |
625 | 0 | self->block_size_ = 0; |
626 | 0 | FN(HistogramClear)(&histograms[self->curr_histogram_ix_]); |
627 | 0 | self->merge_last_count_ = 0; |
628 | 0 | self->target_block_size_ = self->min_block_size_; |
629 | 0 | } else { |
630 | | /* Combine this block with last block. */ |
631 | 0 | split->lengths[self->num_blocks_ - 1] += (uint32_t)self->block_size_; |
632 | 0 | histograms[self->last_histogram_ix_[0]] = self->combined_histo[0]; |
633 | 0 | last_entropy[0] = combined_entropy[0]; |
634 | 0 | if (split->num_types == 1) { |
635 | 0 | last_entropy[1] = last_entropy[0]; |
636 | 0 | } |
637 | 0 | self->block_size_ = 0; |
638 | 0 | FN(HistogramClear)(&histograms[self->curr_histogram_ix_]); |
639 | 0 | if (++self->merge_last_count_ > 1) { |
640 | 0 | self->target_block_size_ += self->min_block_size_; |
641 | 0 | } |
642 | 0 | } |
643 | 0 | } |
644 | 0 | if (is_final) { |
645 | 0 | *self->histograms_size_ = split->num_types; |
646 | 0 | split->num_blocks = self->num_blocks_; |
647 | 0 | } |
648 | 0 | } |
649 | | |
650 | | /* Adds the next symbol to the current histogram. When the current histogram |
651 | | reaches the target size, decides on merging the block. */ |
652 | 0 | static void FN(BlockSplitterAddSymbol)(FN(BlockSplitter)* self, size_t symbol) { |
653 | 0 | FN(HistogramAdd)(&self->histograms_[self->curr_histogram_ix_], symbol); |
654 | 0 | ++self->block_size_; |
655 | 0 | if (self->block_size_ == self->target_block_size_) { |
656 | 0 | FN(BlockSplitterFinishBlock)(self, /* is_final = */ BROTLI_FALSE); |
657 | 0 | } |
658 | 0 | } |
659 | | |
660 | | #undef HistogramType |
661 | | #undef FN |
662 | | |
663 | 0 | #define FN(X) X ## Distance |
664 | | /* NOLINT(build/header_guard) */ |
665 | | /* Copyright 2015 Google Inc. All Rights Reserved. |
666 | | |
667 | | Distributed under MIT license. |
668 | | See file LICENSE for detail or copy at https://opensource.org/licenses/MIT |
669 | | */ |
670 | | |
671 | | /* template parameters: FN */ |
672 | | |
673 | 0 | #define HistogramType FN(Histogram) |
674 | | |
675 | | /* Greedy block splitter for one block category (literal, command or distance). |
676 | | */ |
677 | | typedef struct FN(BlockSplitter) { |
678 | | /* Alphabet size of particular block category. */ |
679 | | size_t alphabet_size_; |
680 | | /* We collect at least this many symbols for each block. */ |
681 | | size_t min_block_size_; |
682 | | /* We merge histograms A and B if |
683 | | entropy(A+B) < entropy(A) + entropy(B) + split_threshold_, |
684 | | where A is the current histogram and B is the histogram of the last or the |
685 | | second last block type. */ |
686 | | double split_threshold_; |
687 | | |
688 | | size_t num_blocks_; |
689 | | BlockSplit* split_; /* not owned */ |
690 | | HistogramType* histograms_; /* not owned */ |
691 | | size_t* histograms_size_; /* not owned */ |
692 | | |
693 | | /* Temporary storage for BlockSplitterFinishBlock. */ |
694 | | HistogramType combined_histo[2]; |
695 | | |
696 | | /* The number of symbols that we want to collect before deciding on whether |
697 | | or not to merge the block with a previous one or emit a new block. */ |
698 | | size_t target_block_size_; |
699 | | /* The number of symbols in the current histogram. */ |
700 | | size_t block_size_; |
701 | | /* Offset of the current histogram. */ |
702 | | size_t curr_histogram_ix_; |
703 | | /* Offset of the histograms of the previous two block types. */ |
704 | | size_t last_histogram_ix_[2]; |
705 | | /* Entropy of the previous two block types. */ |
706 | | double last_entropy_[2]; |
707 | | /* The number of times we merged the current block with the last one. */ |
708 | | size_t merge_last_count_; |
709 | | } FN(BlockSplitter); |
710 | | |
711 | | static void FN(InitBlockSplitter)( |
712 | | MemoryManager* m, FN(BlockSplitter)* self, size_t alphabet_size, |
713 | | size_t min_block_size, double split_threshold, size_t num_symbols, |
714 | 0 | BlockSplit* split, HistogramType** histograms, size_t* histograms_size) { |
715 | 0 | size_t max_num_blocks = num_symbols / min_block_size + 1; |
716 | | /* We have to allocate one more histogram than the maximum number of block |
717 | | types for the current histogram when the meta-block is too big. */ |
718 | 0 | size_t max_num_types = |
719 | 0 | BROTLI_MIN(size_t, max_num_blocks, BROTLI_MAX_NUMBER_OF_BLOCK_TYPES + 1); |
720 | 0 | self->alphabet_size_ = alphabet_size; |
721 | 0 | self->min_block_size_ = min_block_size; |
722 | 0 | self->split_threshold_ = split_threshold; |
723 | 0 | self->num_blocks_ = 0; |
724 | 0 | self->split_ = split; |
725 | 0 | self->histograms_size_ = histograms_size; |
726 | 0 | self->target_block_size_ = min_block_size; |
727 | 0 | self->block_size_ = 0; |
728 | 0 | self->curr_histogram_ix_ = 0; |
729 | 0 | self->merge_last_count_ = 0; |
730 | 0 | BROTLI_ENSURE_CAPACITY(m, uint8_t, |
731 | 0 | split->types, split->types_alloc_size, max_num_blocks); |
732 | 0 | BROTLI_ENSURE_CAPACITY(m, uint32_t, |
733 | 0 | split->lengths, split->lengths_alloc_size, max_num_blocks); |
734 | 0 | if (BROTLI_IS_OOM(m)) return; |
735 | 0 | self->split_->num_blocks = max_num_blocks; |
736 | 0 | BROTLI_DCHECK(*histograms == 0); |
737 | 0 | *histograms_size = max_num_types; |
738 | 0 | *histograms = BROTLI_ALLOC(m, HistogramType, *histograms_size); |
739 | 0 | self->histograms_ = *histograms; |
740 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(*histograms)) return; |
741 | | /* Clear only current histogram. */ |
742 | 0 | FN(HistogramClear)(&self->histograms_[0]); |
743 | 0 | self->last_histogram_ix_[0] = self->last_histogram_ix_[1] = 0; |
744 | 0 | } |
745 | | |
746 | | /* Does either of three things: |
747 | | (1) emits the current block with a new block type; |
748 | | (2) emits the current block with the type of the second last block; |
749 | | (3) merges the current block with the last block. */ |
750 | | static void FN(BlockSplitterFinishBlock)( |
751 | 0 | FN(BlockSplitter)* self, BROTLI_BOOL is_final) { |
752 | 0 | BlockSplit* split = self->split_; |
753 | 0 | double* last_entropy = self->last_entropy_; |
754 | 0 | HistogramType* histograms = self->histograms_; |
755 | 0 | self->block_size_ = |
756 | 0 | BROTLI_MAX(size_t, self->block_size_, self->min_block_size_); |
757 | 0 | if (self->num_blocks_ == 0) { |
758 | | /* Create first block. */ |
759 | 0 | split->lengths[0] = (uint32_t)self->block_size_; |
760 | 0 | split->types[0] = 0; |
761 | 0 | last_entropy[0] = |
762 | 0 | BitsEntropy(histograms[0].data_, self->alphabet_size_); |
763 | 0 | last_entropy[1] = last_entropy[0]; |
764 | 0 | ++self->num_blocks_; |
765 | 0 | ++split->num_types; |
766 | 0 | ++self->curr_histogram_ix_; |
767 | 0 | if (self->curr_histogram_ix_ < *self->histograms_size_) |
768 | 0 | FN(HistogramClear)(&histograms[self->curr_histogram_ix_]); |
769 | 0 | self->block_size_ = 0; |
770 | 0 | } else if (self->block_size_ > 0) { |
771 | 0 | double entropy = BitsEntropy(histograms[self->curr_histogram_ix_].data_, |
772 | 0 | self->alphabet_size_); |
773 | 0 | double combined_entropy[2]; |
774 | 0 | double diff[2]; |
775 | 0 | size_t j; |
776 | 0 | for (j = 0; j < 2; ++j) { |
777 | 0 | size_t last_histogram_ix = self->last_histogram_ix_[j]; |
778 | 0 | self->combined_histo[j] = histograms[self->curr_histogram_ix_]; |
779 | 0 | FN(HistogramAddHistogram)(&self->combined_histo[j], |
780 | 0 | &histograms[last_histogram_ix]); |
781 | 0 | combined_entropy[j] = BitsEntropy( |
782 | 0 | &self->combined_histo[j].data_[0], self->alphabet_size_); |
783 | 0 | diff[j] = combined_entropy[j] - entropy - last_entropy[j]; |
784 | 0 | } |
785 | |
|
786 | 0 | if (split->num_types < BROTLI_MAX_NUMBER_OF_BLOCK_TYPES && |
787 | 0 | diff[0] > self->split_threshold_ && |
788 | 0 | diff[1] > self->split_threshold_) { |
789 | | /* Create new block. */ |
790 | 0 | split->lengths[self->num_blocks_] = (uint32_t)self->block_size_; |
791 | 0 | split->types[self->num_blocks_] = (uint8_t)split->num_types; |
792 | 0 | self->last_histogram_ix_[1] = self->last_histogram_ix_[0]; |
793 | 0 | self->last_histogram_ix_[0] = (uint8_t)split->num_types; |
794 | 0 | last_entropy[1] = last_entropy[0]; |
795 | 0 | last_entropy[0] = entropy; |
796 | 0 | ++self->num_blocks_; |
797 | 0 | ++split->num_types; |
798 | 0 | ++self->curr_histogram_ix_; |
799 | 0 | if (self->curr_histogram_ix_ < *self->histograms_size_) |
800 | 0 | FN(HistogramClear)(&histograms[self->curr_histogram_ix_]); |
801 | 0 | self->block_size_ = 0; |
802 | 0 | self->merge_last_count_ = 0; |
803 | 0 | self->target_block_size_ = self->min_block_size_; |
804 | 0 | } else if (diff[1] < diff[0] - 20.0) { |
805 | | /* Combine this block with second last block. */ |
806 | 0 | split->lengths[self->num_blocks_] = (uint32_t)self->block_size_; |
807 | 0 | split->types[self->num_blocks_] = split->types[self->num_blocks_ - 2]; |
808 | 0 | BROTLI_SWAP(size_t, self->last_histogram_ix_, 0, 1); |
809 | 0 | histograms[self->last_histogram_ix_[0]] = self->combined_histo[1]; |
810 | 0 | last_entropy[1] = last_entropy[0]; |
811 | 0 | last_entropy[0] = combined_entropy[1]; |
812 | 0 | ++self->num_blocks_; |
813 | 0 | self->block_size_ = 0; |
814 | 0 | FN(HistogramClear)(&histograms[self->curr_histogram_ix_]); |
815 | 0 | self->merge_last_count_ = 0; |
816 | 0 | self->target_block_size_ = self->min_block_size_; |
817 | 0 | } else { |
818 | | /* Combine this block with last block. */ |
819 | 0 | split->lengths[self->num_blocks_ - 1] += (uint32_t)self->block_size_; |
820 | 0 | histograms[self->last_histogram_ix_[0]] = self->combined_histo[0]; |
821 | 0 | last_entropy[0] = combined_entropy[0]; |
822 | 0 | if (split->num_types == 1) { |
823 | 0 | last_entropy[1] = last_entropy[0]; |
824 | 0 | } |
825 | 0 | self->block_size_ = 0; |
826 | 0 | FN(HistogramClear)(&histograms[self->curr_histogram_ix_]); |
827 | 0 | if (++self->merge_last_count_ > 1) { |
828 | 0 | self->target_block_size_ += self->min_block_size_; |
829 | 0 | } |
830 | 0 | } |
831 | 0 | } |
832 | 0 | if (is_final) { |
833 | 0 | *self->histograms_size_ = split->num_types; |
834 | 0 | split->num_blocks = self->num_blocks_; |
835 | 0 | } |
836 | 0 | } |
837 | | |
838 | | /* Adds the next symbol to the current histogram. When the current histogram |
839 | | reaches the target size, decides on merging the block. */ |
840 | 0 | static void FN(BlockSplitterAddSymbol)(FN(BlockSplitter)* self, size_t symbol) { |
841 | 0 | FN(HistogramAdd)(&self->histograms_[self->curr_histogram_ix_], symbol); |
842 | 0 | ++self->block_size_; |
843 | 0 | if (self->block_size_ == self->target_block_size_) { |
844 | 0 | FN(BlockSplitterFinishBlock)(self, /* is_final = */ BROTLI_FALSE); |
845 | 0 | } |
846 | 0 | } |
847 | | |
848 | | #undef HistogramType |
849 | | #undef FN |
850 | | |
851 | | #define BROTLI_MAX_STATIC_CONTEXTS 13 |
852 | | |
853 | | /* Greedy block splitter for one block category (literal, command or distance). |
854 | | Gathers histograms for all context buckets. */ |
855 | | typedef struct ContextBlockSplitter { |
856 | | /* Alphabet size of particular block category. */ |
857 | | size_t alphabet_size_; |
858 | | size_t num_contexts_; |
859 | | size_t max_block_types_; |
860 | | /* We collect at least this many symbols for each block. */ |
861 | | size_t min_block_size_; |
862 | | /* We merge histograms A and B if |
863 | | entropy(A+B) < entropy(A) + entropy(B) + split_threshold_, |
864 | | where A is the current histogram and B is the histogram of the last or the |
865 | | second last block type. */ |
866 | | double split_threshold_; |
867 | | |
868 | | size_t num_blocks_; |
869 | | BlockSplit* split_; /* not owned */ |
870 | | HistogramLiteral* histograms_; /* not owned */ |
871 | | size_t* histograms_size_; /* not owned */ |
872 | | |
873 | | /* The number of symbols that we want to collect before deciding on whether |
874 | | or not to merge the block with a previous one or emit a new block. */ |
875 | | size_t target_block_size_; |
876 | | /* The number of symbols in the current histogram. */ |
877 | | size_t block_size_; |
878 | | /* Offset of the current histogram. */ |
879 | | size_t curr_histogram_ix_; |
880 | | /* Offset of the histograms of the previous two block types. */ |
881 | | size_t last_histogram_ix_[2]; |
882 | | /* Entropy of the previous two block types. */ |
883 | | double last_entropy_[2 * BROTLI_MAX_STATIC_CONTEXTS]; |
884 | | /* The number of times we merged the current block with the last one. */ |
885 | | size_t merge_last_count_; |
886 | | } ContextBlockSplitter; |
887 | | |
888 | | static void InitContextBlockSplitter( |
889 | | MemoryManager* m, ContextBlockSplitter* self, size_t alphabet_size, |
890 | | size_t num_contexts, size_t min_block_size, double split_threshold, |
891 | | size_t num_symbols, BlockSplit* split, HistogramLiteral** histograms, |
892 | 0 | size_t* histograms_size) { |
893 | 0 | size_t max_num_blocks = num_symbols / min_block_size + 1; |
894 | 0 | size_t max_num_types; |
895 | 0 | BROTLI_DCHECK(num_contexts <= BROTLI_MAX_STATIC_CONTEXTS); |
896 | |
|
897 | 0 | self->alphabet_size_ = alphabet_size; |
898 | 0 | self->num_contexts_ = num_contexts; |
899 | 0 | self->max_block_types_ = BROTLI_MAX_NUMBER_OF_BLOCK_TYPES / num_contexts; |
900 | 0 | self->min_block_size_ = min_block_size; |
901 | 0 | self->split_threshold_ = split_threshold; |
902 | 0 | self->num_blocks_ = 0; |
903 | 0 | self->split_ = split; |
904 | 0 | self->histograms_size_ = histograms_size; |
905 | 0 | self->target_block_size_ = min_block_size; |
906 | 0 | self->block_size_ = 0; |
907 | 0 | self->curr_histogram_ix_ = 0; |
908 | 0 | self->merge_last_count_ = 0; |
909 | | |
910 | | /* We have to allocate one more histogram than the maximum number of block |
911 | | types for the current histogram when the meta-block is too big. */ |
912 | 0 | max_num_types = |
913 | 0 | BROTLI_MIN(size_t, max_num_blocks, self->max_block_types_ + 1); |
914 | 0 | BROTLI_ENSURE_CAPACITY(m, uint8_t, |
915 | 0 | split->types, split->types_alloc_size, max_num_blocks); |
916 | 0 | BROTLI_ENSURE_CAPACITY(m, uint32_t, |
917 | 0 | split->lengths, split->lengths_alloc_size, max_num_blocks); |
918 | 0 | if (BROTLI_IS_OOM(m)) return; |
919 | 0 | split->num_blocks = max_num_blocks; |
920 | 0 | if (BROTLI_IS_OOM(m)) return; |
921 | 0 | BROTLI_DCHECK(*histograms == 0); |
922 | 0 | *histograms_size = max_num_types * num_contexts; |
923 | 0 | *histograms = BROTLI_ALLOC(m, HistogramLiteral, *histograms_size); |
924 | 0 | self->histograms_ = *histograms; |
925 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(*histograms)) return; |
926 | | /* Clear only current histogram. */ |
927 | 0 | ClearHistogramsLiteral(&self->histograms_[0], num_contexts); |
928 | 0 | self->last_histogram_ix_[0] = self->last_histogram_ix_[1] = 0; |
929 | 0 | } |
930 | | |
931 | | /* Does either of three things: |
932 | | (1) emits the current block with a new block type; |
933 | | (2) emits the current block with the type of the second last block; |
934 | | (3) merges the current block with the last block. */ |
935 | | static void ContextBlockSplitterFinishBlock( |
936 | 0 | ContextBlockSplitter* self, MemoryManager* m, BROTLI_BOOL is_final) { |
937 | 0 | BlockSplit* split = self->split_; |
938 | 0 | const size_t num_contexts = self->num_contexts_; |
939 | 0 | double* last_entropy = self->last_entropy_; |
940 | 0 | HistogramLiteral* histograms = self->histograms_; |
941 | |
|
942 | 0 | if (self->block_size_ < self->min_block_size_) { |
943 | 0 | self->block_size_ = self->min_block_size_; |
944 | 0 | } |
945 | 0 | if (self->num_blocks_ == 0) { |
946 | 0 | size_t i; |
947 | | /* Create first block. */ |
948 | 0 | split->lengths[0] = (uint32_t)self->block_size_; |
949 | 0 | split->types[0] = 0; |
950 | |
|
951 | 0 | for (i = 0; i < num_contexts; ++i) { |
952 | 0 | last_entropy[i] = |
953 | 0 | BitsEntropy(histograms[i].data_, self->alphabet_size_); |
954 | 0 | last_entropy[num_contexts + i] = last_entropy[i]; |
955 | 0 | } |
956 | 0 | ++self->num_blocks_; |
957 | 0 | ++split->num_types; |
958 | 0 | self->curr_histogram_ix_ += num_contexts; |
959 | 0 | if (self->curr_histogram_ix_ < *self->histograms_size_) { |
960 | 0 | ClearHistogramsLiteral( |
961 | 0 | &self->histograms_[self->curr_histogram_ix_], self->num_contexts_); |
962 | 0 | } |
963 | 0 | self->block_size_ = 0; |
964 | 0 | } else if (self->block_size_ > 0) { |
965 | | /* Try merging the set of histograms for the current block type with the |
966 | | respective set of histograms for the last and second last block types. |
967 | | Decide over the split based on the total reduction of entropy across |
968 | | all contexts. */ |
969 | 0 | double entropy[BROTLI_MAX_STATIC_CONTEXTS]; |
970 | 0 | HistogramLiteral* combined_histo = |
971 | 0 | BROTLI_ALLOC(m, HistogramLiteral, 2 * num_contexts); |
972 | 0 | double combined_entropy[2 * BROTLI_MAX_STATIC_CONTEXTS]; |
973 | 0 | double diff[2] = { 0.0 }; |
974 | 0 | size_t i; |
975 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(combined_histo)) return; |
976 | 0 | for (i = 0; i < num_contexts; ++i) { |
977 | 0 | size_t curr_histo_ix = self->curr_histogram_ix_ + i; |
978 | 0 | size_t j; |
979 | 0 | entropy[i] = BitsEntropy(histograms[curr_histo_ix].data_, |
980 | 0 | self->alphabet_size_); |
981 | 0 | for (j = 0; j < 2; ++j) { |
982 | 0 | size_t jx = j * num_contexts + i; |
983 | 0 | size_t last_histogram_ix = self->last_histogram_ix_[j] + i; |
984 | 0 | combined_histo[jx] = histograms[curr_histo_ix]; |
985 | 0 | HistogramAddHistogramLiteral(&combined_histo[jx], |
986 | 0 | &histograms[last_histogram_ix]); |
987 | 0 | combined_entropy[jx] = BitsEntropy( |
988 | 0 | &combined_histo[jx].data_[0], self->alphabet_size_); |
989 | 0 | diff[j] += combined_entropy[jx] - entropy[i] - last_entropy[jx]; |
990 | 0 | } |
991 | 0 | } |
992 | |
|
993 | 0 | if (split->num_types < self->max_block_types_ && |
994 | 0 | diff[0] > self->split_threshold_ && |
995 | 0 | diff[1] > self->split_threshold_) { |
996 | | /* Create new block. */ |
997 | 0 | split->lengths[self->num_blocks_] = (uint32_t)self->block_size_; |
998 | 0 | split->types[self->num_blocks_] = (uint8_t)split->num_types; |
999 | 0 | self->last_histogram_ix_[1] = self->last_histogram_ix_[0]; |
1000 | 0 | self->last_histogram_ix_[0] = split->num_types * num_contexts; |
1001 | 0 | for (i = 0; i < num_contexts; ++i) { |
1002 | 0 | last_entropy[num_contexts + i] = last_entropy[i]; |
1003 | 0 | last_entropy[i] = entropy[i]; |
1004 | 0 | } |
1005 | 0 | ++self->num_blocks_; |
1006 | 0 | ++split->num_types; |
1007 | 0 | self->curr_histogram_ix_ += num_contexts; |
1008 | 0 | if (self->curr_histogram_ix_ < *self->histograms_size_) { |
1009 | 0 | ClearHistogramsLiteral( |
1010 | 0 | &self->histograms_[self->curr_histogram_ix_], self->num_contexts_); |
1011 | 0 | } |
1012 | 0 | self->block_size_ = 0; |
1013 | 0 | self->merge_last_count_ = 0; |
1014 | 0 | self->target_block_size_ = self->min_block_size_; |
1015 | 0 | } else if (diff[1] < diff[0] - 20.0) { |
1016 | | /* Combine this block with second last block. */ |
1017 | 0 | split->lengths[self->num_blocks_] = (uint32_t)self->block_size_; |
1018 | 0 | split->types[self->num_blocks_] = split->types[self->num_blocks_ - 2]; |
1019 | 0 | BROTLI_SWAP(size_t, self->last_histogram_ix_, 0, 1); |
1020 | 0 | for (i = 0; i < num_contexts; ++i) { |
1021 | 0 | histograms[self->last_histogram_ix_[0] + i] = |
1022 | 0 | combined_histo[num_contexts + i]; |
1023 | 0 | last_entropy[num_contexts + i] = last_entropy[i]; |
1024 | 0 | last_entropy[i] = combined_entropy[num_contexts + i]; |
1025 | 0 | HistogramClearLiteral(&histograms[self->curr_histogram_ix_ + i]); |
1026 | 0 | } |
1027 | 0 | ++self->num_blocks_; |
1028 | 0 | self->block_size_ = 0; |
1029 | 0 | self->merge_last_count_ = 0; |
1030 | 0 | self->target_block_size_ = self->min_block_size_; |
1031 | 0 | } else { |
1032 | | /* Combine this block with last block. */ |
1033 | 0 | split->lengths[self->num_blocks_ - 1] += (uint32_t)self->block_size_; |
1034 | 0 | for (i = 0; i < num_contexts; ++i) { |
1035 | 0 | histograms[self->last_histogram_ix_[0] + i] = combined_histo[i]; |
1036 | 0 | last_entropy[i] = combined_entropy[i]; |
1037 | 0 | if (split->num_types == 1) { |
1038 | 0 | last_entropy[num_contexts + i] = last_entropy[i]; |
1039 | 0 | } |
1040 | 0 | HistogramClearLiteral(&histograms[self->curr_histogram_ix_ + i]); |
1041 | 0 | } |
1042 | 0 | self->block_size_ = 0; |
1043 | 0 | if (++self->merge_last_count_ > 1) { |
1044 | 0 | self->target_block_size_ += self->min_block_size_; |
1045 | 0 | } |
1046 | 0 | } |
1047 | 0 | BROTLI_FREE(m, combined_histo); |
1048 | 0 | } |
1049 | 0 | if (is_final) { |
1050 | 0 | *self->histograms_size_ = split->num_types * num_contexts; |
1051 | 0 | split->num_blocks = self->num_blocks_; |
1052 | 0 | } |
1053 | 0 | } |
1054 | | |
1055 | | /* Adds the next symbol to the current block type and context. When the |
1056 | | current block reaches the target size, decides on merging the block. */ |
1057 | | static void ContextBlockSplitterAddSymbol( |
1058 | | ContextBlockSplitter* self, MemoryManager* m, |
1059 | 0 | size_t symbol, size_t context) { |
1060 | 0 | HistogramAddLiteral(&self->histograms_[self->curr_histogram_ix_ + context], |
1061 | 0 | symbol); |
1062 | 0 | ++self->block_size_; |
1063 | 0 | if (self->block_size_ == self->target_block_size_) { |
1064 | 0 | ContextBlockSplitterFinishBlock(self, m, /* is_final = */ BROTLI_FALSE); |
1065 | 0 | if (BROTLI_IS_OOM(m)) return; |
1066 | 0 | } |
1067 | 0 | } |
1068 | | |
1069 | | static void MapStaticContexts(MemoryManager* m, |
1070 | | size_t num_contexts, |
1071 | | const uint32_t* static_context_map, |
1072 | 0 | MetaBlockSplit* mb) { |
1073 | 0 | size_t i; |
1074 | 0 | BROTLI_DCHECK(mb->literal_context_map == 0); |
1075 | 0 | mb->literal_context_map_size = |
1076 | 0 | mb->literal_split.num_types << BROTLI_LITERAL_CONTEXT_BITS; |
1077 | 0 | mb->literal_context_map = |
1078 | 0 | BROTLI_ALLOC(m, uint32_t, mb->literal_context_map_size); |
1079 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(mb->literal_context_map)) return; |
1080 | | |
1081 | 0 | for (i = 0; i < mb->literal_split.num_types; ++i) { |
1082 | 0 | uint32_t offset = (uint32_t)(i * num_contexts); |
1083 | 0 | size_t j; |
1084 | 0 | for (j = 0; j < (1u << BROTLI_LITERAL_CONTEXT_BITS); ++j) { |
1085 | 0 | mb->literal_context_map[(i << BROTLI_LITERAL_CONTEXT_BITS) + j] = |
1086 | 0 | offset + static_context_map[j]; |
1087 | 0 | } |
1088 | 0 | } |
1089 | 0 | } |
1090 | | |
1091 | | typedef struct GreedyMetablockArena { |
1092 | | union { |
1093 | | BlockSplitterLiteral plain; |
1094 | | ContextBlockSplitter ctx; |
1095 | | } lit_blocks; |
1096 | | BlockSplitterCommand cmd_blocks; |
1097 | | BlockSplitterDistance dist_blocks; |
1098 | | } GreedyMetablockArena; |
1099 | | |
1100 | | static BROTLI_INLINE void BrotliBuildMetaBlockGreedyInternal( |
1101 | | MemoryManager* m, GreedyMetablockArena* arena, const uint8_t* ringbuffer, |
1102 | | size_t pos, size_t mask, uint8_t prev_byte, uint8_t prev_byte2, |
1103 | | ContextLut literal_context_lut, const size_t num_contexts, |
1104 | | const uint32_t* static_context_map, const Command* commands, |
1105 | 0 | size_t n_commands, MetaBlockSplit* mb) { |
1106 | 0 | size_t num_literals = 0; |
1107 | 0 | size_t i; |
1108 | 0 | for (i = 0; i < n_commands; ++i) { |
1109 | 0 | num_literals += commands[i].insert_len_; |
1110 | 0 | } |
1111 | |
|
1112 | 0 | if (num_contexts == 1) { |
1113 | 0 | InitBlockSplitterLiteral(m, &arena->lit_blocks.plain, 256, 512, 400.0, |
1114 | 0 | num_literals, &mb->literal_split, &mb->literal_histograms, |
1115 | 0 | &mb->literal_histograms_size); |
1116 | 0 | } else { |
1117 | 0 | InitContextBlockSplitter(m, &arena->lit_blocks.ctx, 256, num_contexts, 512, |
1118 | 0 | 400.0, num_literals, &mb->literal_split, &mb->literal_histograms, |
1119 | 0 | &mb->literal_histograms_size); |
1120 | 0 | } |
1121 | 0 | if (BROTLI_IS_OOM(m)) return; |
1122 | 0 | InitBlockSplitterCommand(m, &arena->cmd_blocks, BROTLI_NUM_COMMAND_SYMBOLS, |
1123 | 0 | 1024, 500.0, n_commands, &mb->command_split, &mb->command_histograms, |
1124 | 0 | &mb->command_histograms_size); |
1125 | 0 | if (BROTLI_IS_OOM(m)) return; |
1126 | 0 | InitBlockSplitterDistance(m, &arena->dist_blocks, 64, 512, 100.0, n_commands, |
1127 | 0 | &mb->distance_split, &mb->distance_histograms, |
1128 | 0 | &mb->distance_histograms_size); |
1129 | 0 | if (BROTLI_IS_OOM(m)) return; |
1130 | | |
1131 | 0 | for (i = 0; i < n_commands; ++i) { |
1132 | 0 | const Command cmd = commands[i]; |
1133 | 0 | size_t j; |
1134 | 0 | BlockSplitterAddSymbolCommand(&arena->cmd_blocks, cmd.cmd_prefix_); |
1135 | 0 | for (j = cmd.insert_len_; j != 0; --j) { |
1136 | 0 | uint8_t literal = ringbuffer[pos & mask]; |
1137 | 0 | if (num_contexts == 1) { |
1138 | 0 | BlockSplitterAddSymbolLiteral(&arena->lit_blocks.plain, literal); |
1139 | 0 | } else { |
1140 | 0 | size_t context = |
1141 | 0 | BROTLI_CONTEXT(prev_byte, prev_byte2, literal_context_lut); |
1142 | 0 | ContextBlockSplitterAddSymbol(&arena->lit_blocks.ctx, m, literal, |
1143 | 0 | static_context_map[context]); |
1144 | 0 | if (BROTLI_IS_OOM(m)) return; |
1145 | 0 | } |
1146 | 0 | prev_byte2 = prev_byte; |
1147 | 0 | prev_byte = literal; |
1148 | 0 | ++pos; |
1149 | 0 | } |
1150 | 0 | pos += CommandCopyLen(&cmd); |
1151 | 0 | if (CommandCopyLen(&cmd)) { |
1152 | 0 | prev_byte2 = ringbuffer[(pos - 2) & mask]; |
1153 | 0 | prev_byte = ringbuffer[(pos - 1) & mask]; |
1154 | 0 | if (cmd.cmd_prefix_ >= 128) { |
1155 | 0 | BlockSplitterAddSymbolDistance( |
1156 | 0 | &arena->dist_blocks, cmd.dist_prefix_ & 0x3FF); |
1157 | 0 | } |
1158 | 0 | } |
1159 | 0 | } |
1160 | | |
1161 | 0 | if (num_contexts == 1) { |
1162 | 0 | BlockSplitterFinishBlockLiteral( |
1163 | 0 | &arena->lit_blocks.plain, /* is_final = */ BROTLI_TRUE); |
1164 | 0 | } else { |
1165 | 0 | ContextBlockSplitterFinishBlock( |
1166 | 0 | &arena->lit_blocks.ctx, m, /* is_final = */ BROTLI_TRUE); |
1167 | 0 | if (BROTLI_IS_OOM(m)) return; |
1168 | 0 | } |
1169 | 0 | BlockSplitterFinishBlockCommand( |
1170 | 0 | &arena->cmd_blocks, /* is_final = */ BROTLI_TRUE); |
1171 | 0 | BlockSplitterFinishBlockDistance( |
1172 | 0 | &arena->dist_blocks, /* is_final = */ BROTLI_TRUE); |
1173 | |
|
1174 | 0 | if (num_contexts > 1) { |
1175 | 0 | MapStaticContexts(m, num_contexts, static_context_map, mb); |
1176 | 0 | } |
1177 | 0 | } |
1178 | | |
1179 | | void duckdb_brotli::BrotliBuildMetaBlockGreedy(MemoryManager* m, |
1180 | | const uint8_t* ringbuffer, |
1181 | | size_t pos, |
1182 | | size_t mask, |
1183 | | uint8_t prev_byte, |
1184 | | uint8_t prev_byte2, |
1185 | | ContextLut literal_context_lut, |
1186 | | size_t num_contexts, |
1187 | | const uint32_t* static_context_map, |
1188 | | const Command* commands, |
1189 | | size_t n_commands, |
1190 | 0 | MetaBlockSplit* mb) { |
1191 | 0 | GreedyMetablockArena* arena = BROTLI_ALLOC(m, GreedyMetablockArena, 1); |
1192 | 0 | if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(arena)) return; |
1193 | 0 | if (num_contexts == 1) { |
1194 | 0 | BrotliBuildMetaBlockGreedyInternal(m, arena, ringbuffer, pos, mask, |
1195 | 0 | prev_byte, prev_byte2, literal_context_lut, 1, NULL, commands, |
1196 | 0 | n_commands, mb); |
1197 | 0 | } else { |
1198 | 0 | BrotliBuildMetaBlockGreedyInternal(m, arena, ringbuffer, pos, mask, |
1199 | 0 | prev_byte, prev_byte2, literal_context_lut, num_contexts, |
1200 | 0 | static_context_map, commands, n_commands, mb); |
1201 | 0 | } |
1202 | 0 | BROTLI_FREE(m, arena); |
1203 | 0 | } |
1204 | | |
1205 | | void duckdb_brotli::BrotliOptimizeHistograms(uint32_t num_distance_codes, |
1206 | 0 | MetaBlockSplit* mb) { |
1207 | 0 | uint8_t good_for_rle[BROTLI_NUM_COMMAND_SYMBOLS]; |
1208 | 0 | size_t i; |
1209 | 0 | for (i = 0; i < mb->literal_histograms_size; ++i) { |
1210 | 0 | BrotliOptimizeHuffmanCountsForRle(256, mb->literal_histograms[i].data_, |
1211 | 0 | good_for_rle); |
1212 | 0 | } |
1213 | 0 | for (i = 0; i < mb->command_histograms_size; ++i) { |
1214 | 0 | BrotliOptimizeHuffmanCountsForRle(BROTLI_NUM_COMMAND_SYMBOLS, |
1215 | 0 | mb->command_histograms[i].data_, |
1216 | 0 | good_for_rle); |
1217 | 0 | } |
1218 | 0 | for (i = 0; i < mb->distance_histograms_size; ++i) { |
1219 | 0 | BrotliOptimizeHuffmanCountsForRle(num_distance_codes, |
1220 | 0 | mb->distance_histograms[i].data_, |
1221 | 0 | good_for_rle); |
1222 | 0 | } |
1223 | 0 | } |
1224 | | |
1225 | | |