Coverage Report

Created: 2026-06-16 07:20

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libdeflate/lib/deflate_compress.c
Line
Count
Source
1
/*
2
 * deflate_compress.c - a compressor for DEFLATE
3
 *
4
 * Copyright 2016 Eric Biggers
5
 *
6
 * Permission is hereby granted, free of charge, to any person
7
 * obtaining a copy of this software and associated documentation
8
 * files (the "Software"), to deal in the Software without
9
 * restriction, including without limitation the rights to use,
10
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
11
 * copies of the Software, and to permit persons to whom the
12
 * Software is furnished to do so, subject to the following
13
 * conditions:
14
 *
15
 * The above copyright notice and this permission notice shall be
16
 * included in all copies or substantial portions of the Software.
17
 *
18
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
20
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25
 * OTHER DEALINGS IN THE SOFTWARE.
26
 */
27
28
#include "deflate_compress.h"
29
#include "deflate_constants.h"
30
31
/******************************************************************************/
32
33
/*
34
 * The following parameters can be changed at build time to customize the
35
 * compression algorithms slightly:
36
 *
37
 * (Note, not all customizable parameters are here.  Some others can be found in
38
 * libdeflate_alloc_compressor() and in *_matchfinder.h.)
39
 */
40
41
/*
42
 * If this parameter is defined to 1, then the near-optimal parsing algorithm
43
 * will be included, and compression levels 10-12 will use it.  This algorithm
44
 * usually produces a compression ratio significantly better than the other
45
 * algorithms.  However, it is slow.  If this parameter is defined to 0, then
46
 * levels 10-12 will be the same as level 9 and will use the lazy2 algorithm.
47
 */
48
#define SUPPORT_NEAR_OPTIMAL_PARSING  1
49
50
/*
51
 * This is the minimum block length that the compressor will use, in
52
 * uncompressed bytes.  This should be a value below which using shorter blocks
53
 * is unlikely to be worthwhile, due to the per-block overhead.  This value does
54
 * not apply to the final block, which may be shorter than this (if the input is
55
 * shorter, it will have to be), or to the final uncompressed block in a series
56
 * of uncompressed blocks that cover more than UINT16_MAX bytes.
57
 *
58
 * This value is also approximately the amount by which what would otherwise be
59
 * the second-to-last block is allowed to grow past the soft maximum length in
60
 * order to avoid having to use a very short final block.
61
 *
62
 * Defining a fixed minimum block length is needed in order to guarantee a
63
 * reasonable upper bound on the compressed size.  It's also needed because our
64
 * block splitting algorithm doesn't work well on very short blocks.
65
 */
66
1.12M
#define MIN_BLOCK_LENGTH  5000
67
68
/*
69
 * For the greedy, lazy, lazy2, and near-optimal compressors: This is the soft
70
 * maximum block length, in uncompressed bytes.  The compressor will try to end
71
 * blocks at this length, but it may go slightly past it if there is a match
72
 * that straddles this limit or if the input data ends soon after this limit.
73
 * This parameter doesn't apply to uncompressed blocks, which the DEFLATE format
74
 * limits to 65535 bytes.
75
 *
76
 * This should be a value above which it is very likely that splitting the block
77
 * would produce a better compression ratio.  For the near-optimal compressor,
78
 * increasing/decreasing this parameter will increase/decrease per-compressor
79
 * memory usage linearly.
80
 */
81
1.14k
#define SOFT_MAX_BLOCK_LENGTH 300000
82
83
/*
84
 * For the greedy, lazy, and lazy2 compressors: this is the length of the
85
 * sequence store, which is an array where the compressor temporarily stores
86
 * matches that it's going to use in the current block.  This value is the
87
 * maximum number of matches that can be used in a block.  If the sequence store
88
 * fills up, then the compressor will be forced to end the block early.  This
89
 * value should be large enough so that this rarely happens, due to the block
90
 * being ended normally before then.  Increasing/decreasing this value will
91
 * increase/decrease per-compressor memory usage linearly.
92
 */
93
453k
#define SEQ_STORE_LENGTH  50000
94
95
/*
96
 * For deflate_compress_fastest(): This is the soft maximum block length.
97
 * deflate_compress_fastest() doesn't use the regular block splitting algorithm;
98
 * it only ends blocks when they reach FAST_SOFT_MAX_BLOCK_LENGTH bytes or
99
 * FAST_SEQ_STORE_LENGTH matches.  Therefore, this value should be lower than
100
 * the regular SOFT_MAX_BLOCK_LENGTH.
101
 */
102
0
#define FAST_SOFT_MAX_BLOCK_LENGTH  65535
103
104
/*
105
 * For deflate_compress_fastest(): this is the length of the sequence store.
106
 * This is like SEQ_STORE_LENGTH, but this should be a lower value.
107
 */
108
0
#define FAST_SEQ_STORE_LENGTH 8192
109
110
/*
111
 * These are the maximum codeword lengths, in bits, the compressor will use for
112
 * each Huffman code.  The DEFLATE format defines limits for these.  However,
113
 * further limiting litlen codewords to 14 bits is beneficial, since it has
114
 * negligible effect on compression ratio but allows some optimizations when
115
 * outputting bits.  (It allows 4 literals to be written at once rather than 3.)
116
 */
117
2.31k
#define MAX_LITLEN_CODEWORD_LEN   14
118
2.31k
#define MAX_OFFSET_CODEWORD_LEN   DEFLATE_MAX_OFFSET_CODEWORD_LEN
119
1.14k
#define MAX_PRE_CODEWORD_LEN    DEFLATE_MAX_PRE_CODEWORD_LEN
120
121
#if SUPPORT_NEAR_OPTIMAL_PARSING
122
123
/* Parameters specific to the near-optimal parsing algorithm */
124
125
/*
126
 * BIT_COST is a scaling factor that allows the near-optimal compressor to
127
 * consider fractional bit costs when deciding which literal/match sequence to
128
 * use.  This is useful when the true symbol costs are unknown.  For example, if
129
 * the compressor thinks that a symbol has 6.5 bits of entropy, it can set its
130
 * cost to 6.5 bits rather than have to use 6 or 7 bits.  Although in the end
131
 * each symbol will use a whole number of bits due to the Huffman coding,
132
 * considering fractional bits can be helpful due to the limited information.
133
 *
134
 * BIT_COST should be a power of 2.  A value of 8 or 16 works well.  A higher
135
 * value isn't very useful since the calculations are approximate anyway.
136
 *
137
 * BIT_COST doesn't apply to deflate_flush_block() and
138
 * deflate_compute_true_cost(), which consider whole bits.
139
 */
140
0
#define BIT_COST  16
141
142
/*
143
 * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to
144
 * be needed to output a symbol that was unused in the previous optimization
145
 * pass.  Assigning a default cost allows the symbol to be used in the next
146
 * optimization pass.  However, the cost should be relatively high because the
147
 * symbol probably won't be used very many times (if at all).
148
 */
149
0
#define LITERAL_NOSTAT_BITS 13
150
0
#define LENGTH_NOSTAT_BITS  13
151
0
#define OFFSET_NOSTAT_BITS  10
152
153
/*
154
 * This is (slightly less than) the maximum number of matches that the
155
 * near-optimal compressor will cache per block.  This behaves similarly to
156
 * SEQ_STORE_LENGTH for the other compressors.
157
 */
158
0
#define MATCH_CACHE_LENGTH  (SOFT_MAX_BLOCK_LENGTH * 5)
159
160
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
161
162
/******************************************************************************/
163
164
/* Include the needed matchfinders. */
165
16.9M
#define MATCHFINDER_WINDOW_ORDER  DEFLATE_WINDOW_ORDER
166
#include "hc_matchfinder.h"
167
#include "ht_matchfinder.h"
168
#if SUPPORT_NEAR_OPTIMAL_PARSING
169
#  include "bt_matchfinder.h"
170
/*
171
 * This is the maximum number of matches the binary trees matchfinder can find
172
 * at a single position.  Since the matchfinder never finds more than one match
173
 * for the same length, presuming one of each possible length is sufficient for
174
 * an upper bound.  (This says nothing about whether it is worthwhile to
175
 * consider so many matches; this is just defining the worst case.)
176
 */
177
#define MAX_MATCHES_PER_POS \
178
  (DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1)
179
#endif
180
181
/*
182
 * The largest block length we will ever use is when the final block is of
183
 * length SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1, or when any block is of
184
 * length SOFT_MAX_BLOCK_LENGTH + 1 + DEFLATE_MAX_MATCH_LEN.  The latter case
185
 * occurs when the lazy2 compressor chooses two literals and a maximum-length
186
 * match, starting at SOFT_MAX_BLOCK_LENGTH - 1.
187
 */
188
#define MAX_BLOCK_LENGTH  \
189
  MAX(SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1, \
190
      SOFT_MAX_BLOCK_LENGTH + 1 + DEFLATE_MAX_MATCH_LEN)
191
192
static forceinline void
193
check_buildtime_parameters(void)
194
1.17k
{
195
  /*
196
   * Verify that MIN_BLOCK_LENGTH is being honored, as
197
   * libdeflate_deflate_compress_bound() depends on it.
198
   */
199
1.17k
  STATIC_ASSERT(SOFT_MAX_BLOCK_LENGTH >= MIN_BLOCK_LENGTH);
200
1.17k
  STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH >= MIN_BLOCK_LENGTH);
201
1.17k
  STATIC_ASSERT(SEQ_STORE_LENGTH * DEFLATE_MIN_MATCH_LEN >=
202
1.17k
          MIN_BLOCK_LENGTH);
203
1.17k
  STATIC_ASSERT(FAST_SEQ_STORE_LENGTH * HT_MATCHFINDER_MIN_MATCH_LEN >=
204
1.17k
          MIN_BLOCK_LENGTH);
205
1.17k
#if SUPPORT_NEAR_OPTIMAL_PARSING
206
1.17k
  STATIC_ASSERT(MIN_BLOCK_LENGTH * MAX_MATCHES_PER_POS <=
207
1.17k
          MATCH_CACHE_LENGTH);
208
1.17k
#endif
209
210
  /* The definition of MAX_BLOCK_LENGTH assumes this. */
211
1.17k
  STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH <= SOFT_MAX_BLOCK_LENGTH);
212
213
  /* Verify that the sequence stores aren't uselessly large. */
214
1.17k
  STATIC_ASSERT(SEQ_STORE_LENGTH * DEFLATE_MIN_MATCH_LEN <=
215
1.17k
          SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH);
216
1.17k
  STATIC_ASSERT(FAST_SEQ_STORE_LENGTH * HT_MATCHFINDER_MIN_MATCH_LEN <=
217
1.17k
          FAST_SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH);
218
219
  /* Verify that the maximum codeword lengths are valid. */
220
1.17k
  STATIC_ASSERT(
221
1.17k
    MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN);
222
1.17k
  STATIC_ASSERT(
223
1.17k
    MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN);
224
1.17k
  STATIC_ASSERT(
225
1.17k
    MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN);
226
1.17k
  STATIC_ASSERT(
227
1.17k
    (1U << MAX_LITLEN_CODEWORD_LEN) >= DEFLATE_NUM_LITLEN_SYMS);
228
1.17k
  STATIC_ASSERT(
229
1.17k
    (1U << MAX_OFFSET_CODEWORD_LEN) >= DEFLATE_NUM_OFFSET_SYMS);
230
1.17k
  STATIC_ASSERT(
231
1.17k
    (1U << MAX_PRE_CODEWORD_LEN) >= DEFLATE_NUM_PRECODE_SYMS);
232
1.17k
}
233
234
/******************************************************************************/
235
236
/* Table: length slot => length slot base value */
237
static const u32 deflate_length_slot_base[] = {
238
  3,    4,    5,    6,    7,    8,    9,    10,
239
  11,   13,   15,   17,   19,   23,   27,   31,
240
  35,   43,   51,   59,   67,   83,   99,   115,
241
  131,  163,  195,  227,  258,
242
};
243
244
/* Table: length slot => number of extra length bits */
245
static const u8 deflate_extra_length_bits[] = {
246
  0,    0,    0,    0,    0,    0,    0,    0,
247
  1,    1,    1,    1,    2,    2,    2,    2,
248
  3,    3,    3,    3,    4,    4,    4,    4,
249
  5,    5,    5,    5,    0,
250
};
251
252
/* Table: offset slot => offset slot base value */
253
static const u32 deflate_offset_slot_base[] = {
254
  1,     2,     3,     4,     5,     7,     9,     13,
255
  17,    25,    33,    49,    65,    97,    129,   193,
256
  257,   385,   513,   769,   1025,  1537,  2049,  3073,
257
  4097,  6145,  8193,  12289, 16385, 24577,
258
};
259
260
/* Table: offset slot => number of extra offset bits */
261
static const u8 deflate_extra_offset_bits[] = {
262
  0,     0,     0,     0,     1,     1,     2,     2,
263
  3,     3,     4,     4,     5,     5,     6,     6,
264
  7,     7,     8,     8,     9,     9,     10,    10,
265
  11,    11,    12,    12,    13,    13,
266
};
267
268
/* Table: length => length slot */
269
static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = {
270
  0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12,
271
  12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16,
272
  16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18,
273
  18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20,
274
  20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
275
  21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
276
  22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
277
  23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
278
  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25,
279
  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
280
  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26,
281
  26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
282
  26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
283
  27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
284
  27, 27, 28,
285
};
286
287
/*
288
 * Table: 'offset - 1 => offset_slot' for offset <= 256.
289
 * This was generated by scripts/gen_offset_slot_map.py.
290
 */
291
static const u8 deflate_offset_slot[256] = {
292
  0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7,
293
  8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
294
  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
295
  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
296
  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
297
  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
298
  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
299
  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
300
  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
301
  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
302
  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
303
  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
304
  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
305
  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
306
  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
307
  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
308
};
309
310
/* The order in which precode codeword lengths are stored */
311
static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
312
  16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
313
};
314
315
/* Table: precode symbol => number of extra bits */
316
static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = {
317
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7
318
};
319
320
/* Codewords for the DEFLATE Huffman codes */
321
struct deflate_codewords {
322
  u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
323
  u32 offset[DEFLATE_NUM_OFFSET_SYMS];
324
};
325
326
/*
327
 * Codeword lengths (in bits) for the DEFLATE Huffman codes.
328
 * A zero length means the corresponding symbol had zero frequency.
329
 */
330
struct deflate_lens {
331
  u8 litlen[DEFLATE_NUM_LITLEN_SYMS];
332
  u8 offset[DEFLATE_NUM_OFFSET_SYMS];
333
};
334
335
/* Codewords and lengths for the DEFLATE Huffman codes */
336
struct deflate_codes {
337
  struct deflate_codewords codewords;
338
  struct deflate_lens lens;
339
};
340
341
/* Symbol frequency counters for the DEFLATE Huffman codes */
342
struct deflate_freqs {
343
  u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
344
  u32 offset[DEFLATE_NUM_OFFSET_SYMS];
345
};
346
347
/*
348
 * Represents a run of literals followed by a match or end-of-block.  This
349
 * struct is needed to temporarily store items chosen by the parser, since items
350
 * cannot be written until all items for the block have been chosen and the
351
 * block's Huffman codes have been computed.
352
 */
353
struct deflate_sequence {
354
355
  /*
356
   * Bits 0..22: the number of literals in this run.  This may be 0 and
357
   * can be at most MAX_BLOCK_LENGTH.  The literals are not stored
358
   * explicitly in this structure; instead, they are read directly from
359
   * the uncompressed data.
360
   *
361
   * Bits 23..31: the length of the match which follows the literals, or 0
362
   * if this literal run was the last in the block, so there is no match
363
   * which follows it.
364
   */
365
379k
#define SEQ_LENGTH_SHIFT 23
366
126k
#define SEQ_LITRUNLEN_MASK (((u32)1 << SEQ_LENGTH_SHIFT) - 1)
367
  u32 litrunlen_and_length;
368
369
  /*
370
   * If 'length' doesn't indicate end-of-block, then this is the offset of
371
   * the match which follows the literals.
372
   */
373
  u16 offset;
374
375
  /*
376
   * If 'length' doesn't indicate end-of-block, then this is the offset
377
   * slot of the match which follows the literals.
378
   */
379
  u16 offset_slot;
380
};
381
382
#if SUPPORT_NEAR_OPTIMAL_PARSING
383
384
/* Costs for the near-optimal parsing algorithm */
385
struct deflate_costs {
386
387
  /* The cost to output each possible literal */
388
  u32 literal[DEFLATE_NUM_LITERALS];
389
390
  /* The cost to output each possible match length */
391
  u32 length[DEFLATE_MAX_MATCH_LEN + 1];
392
393
  /* The cost to output a match offset of each possible offset slot */
394
  u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS];
395
};
396
397
/*
398
 * This structure represents a byte position in the input data and a node in the
399
 * graph of possible match/literal choices for the current block.
400
 *
401
 * Logically, each incoming edge to this node is labeled with a literal or a
402
 * match that can be taken to reach this position from an earlier position; and
403
 * each outgoing edge from this node is labeled with a literal or a match that
404
 * can be taken to advance from this position to a later position.
405
 *
406
 * But these "edges" are actually stored elsewhere (in 'match_cache').  Here we
407
 * associate with each node just two pieces of information:
408
 *
409
 *  'cost_to_end' is the minimum cost to reach the end of the block from
410
 *  this position.
411
 *
412
 *  'item' represents the literal or match that must be chosen from here to
413
 *  reach the end of the block with the minimum cost.  Equivalently, this
414
 *  can be interpreted as the label of the outgoing edge on the minimum-cost
415
 *  path to the "end of block" node from this node.
416
 */
417
struct deflate_optimum_node {
418
419
  u32 cost_to_end;
420
421
  /*
422
   * Notes on the match/literal representation used here:
423
   *
424
   *  The low bits of 'item' are the length: 1 if this is a literal,
425
   *  or the match length if this is a match.
426
   *
427
   *  The high bits of 'item' are the actual literal byte if this is a
428
   *  literal, or the match offset if this is a match.
429
   */
430
0
#define OPTIMUM_OFFSET_SHIFT 9
431
0
#define OPTIMUM_LEN_MASK (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1)
432
  u32 item;
433
434
};
435
436
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
437
438
/* Block split statistics.  See "Block splitting algorithm" below. */
439
138k
#define NUM_LITERAL_OBSERVATION_TYPES 8
440
12.5k
#define NUM_MATCH_OBSERVATION_TYPES 2
441
12.5k
#define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + \
442
12.5k
             NUM_MATCH_OBSERVATION_TYPES)
443
906k
#define NUM_OBSERVATIONS_PER_BLOCK_CHECK 512
444
struct block_split_stats {
445
  u32 new_observations[NUM_OBSERVATION_TYPES];
446
  u32 observations[NUM_OBSERVATION_TYPES];
447
  u32 num_new_observations;
448
  u32 num_observations;
449
};
450
451
struct deflate_output_bitstream;
452
453
/* The main DEFLATE compressor structure */
454
struct libdeflate_compressor {
455
456
  /* Pointer to the compress() implementation chosen at allocation time */
457
  void (*impl)(struct libdeflate_compressor *restrict c, const u8 *in,
458
         size_t in_nbytes, struct deflate_output_bitstream *os);
459
460
  /* The free() function for this struct, chosen at allocation time */
461
  free_func_t free_func;
462
463
  /* The compression level with which this compressor was created */
464
  unsigned compression_level;
465
466
  /* Anything of this size or less we won't bother trying to compress. */
467
  size_t max_passthrough_size;
468
469
  /*
470
   * The maximum search depth: consider at most this many potential
471
   * matches at each position
472
   */
473
  u32 max_search_depth;
474
475
  /*
476
   * The "nice" match length: if a match of this length is found, choose
477
   * it immediately without further consideration
478
   */
479
  u32 nice_match_length;
480
481
  /* Frequency counters for the current block */
482
  struct deflate_freqs freqs;
483
484
  /* Block split statistics for the current block */
485
  struct block_split_stats split_stats;
486
487
  /* Dynamic Huffman codes for the current block */
488
  struct deflate_codes codes;
489
490
  /* The static Huffman codes defined by the DEFLATE format */
491
  struct deflate_codes static_codes;
492
493
  /* Temporary space for block flushing */
494
  union {
495
    /* Information about the precode */
496
    struct {
497
      u32 freqs[DEFLATE_NUM_PRECODE_SYMS];
498
      u32 codewords[DEFLATE_NUM_PRECODE_SYMS];
499
      u8 lens[DEFLATE_NUM_PRECODE_SYMS];
500
      unsigned items[DEFLATE_NUM_LITLEN_SYMS +
501
               DEFLATE_NUM_OFFSET_SYMS];
502
      unsigned num_litlen_syms;
503
      unsigned num_offset_syms;
504
      unsigned num_explicit_lens;
505
      unsigned num_items;
506
    } precode;
507
    /*
508
     * The "full" length codewords.  Used only after the information
509
     * in 'precode' is no longer needed.
510
     */
511
    struct {
512
      u32 codewords[DEFLATE_MAX_MATCH_LEN + 1];
513
      u8 lens[DEFLATE_MAX_MATCH_LEN + 1];
514
    } length;
515
  } o;
516
517
  union {
518
    /* Data for greedy or lazy parsing */
519
    struct {
520
      /* Hash chains matchfinder */
521
      struct hc_matchfinder hc_mf;
522
523
      /* Matches and literals chosen for the current block */
524
      struct deflate_sequence sequences[SEQ_STORE_LENGTH + 1];
525
526
    } g; /* (g)reedy */
527
528
    /* Data for fastest parsing */
529
    struct {
530
      /* Hash table matchfinder */
531
      struct ht_matchfinder ht_mf;
532
533
      /* Matches and literals chosen for the current block */
534
      struct deflate_sequence sequences[
535
            FAST_SEQ_STORE_LENGTH + 1];
536
537
    } f; /* (f)astest */
538
539
  #if SUPPORT_NEAR_OPTIMAL_PARSING
540
    /* Data for near-optimal parsing */
541
    struct {
542
543
      /* Binary tree matchfinder */
544
      struct bt_matchfinder bt_mf;
545
546
      /*
547
       * Cached matches for the current block.  This array
548
       * contains the matches that were found at each position
549
       * in the block.  Specifically, for each position, there
550
       * is a list of matches found at that position, if any,
551
       * sorted by strictly increasing length.  In addition,
552
       * following the matches for each position, there is a
553
       * special 'struct lz_match' whose 'length' member
554
       * contains the number of matches found at that
555
       * position, and whose 'offset' member contains the
556
       * literal at that position.
557
       *
558
       * Note: in rare cases, there will be a very high number
559
       * of matches in the block and this array will overflow.
560
       * If this happens, we force the end of the current
561
       * block.  MATCH_CACHE_LENGTH is the length at which we
562
       * actually check for overflow.  The extra slots beyond
563
       * this are enough to absorb the worst case overflow,
564
       * which occurs if starting at
565
       * &match_cache[MATCH_CACHE_LENGTH - 1], we write
566
       * MAX_MATCHES_PER_POS matches and a match count header,
567
       * then skip searching for matches at
568
       * 'DEFLATE_MAX_MATCH_LEN - 1' positions and write the
569
       * match count header for each.
570
       */
571
      struct lz_match match_cache[MATCH_CACHE_LENGTH +
572
                MAX_MATCHES_PER_POS +
573
                DEFLATE_MAX_MATCH_LEN - 1];
574
575
      /*
576
       * Array of nodes, one per position, for running the
577
       * minimum-cost path algorithm.
578
       *
579
       * This array must be large enough to accommodate the
580
       * worst-case number of nodes, which is MAX_BLOCK_LENGTH
581
       * plus 1 for the end-of-block node.
582
       */
583
      struct deflate_optimum_node optimum_nodes[
584
        MAX_BLOCK_LENGTH + 1];
585
586
      /* The current cost model being used */
587
      struct deflate_costs costs;
588
589
      /* Saved cost model */
590
      struct deflate_costs costs_saved;
591
592
      /*
593
       * A table that maps match offset to offset slot.  This
594
       * differs from deflate_offset_slot[] in that this is a
595
       * full map, not a condensed one.  The full map is more
596
       * appropriate for the near-optimal parser, since the
597
       * near-optimal parser does more offset => offset_slot
598
       * translations, it doesn't intersperse them with
599
       * matchfinding (so cache evictions are less of a
600
       * concern), and it uses more memory anyway.
601
       */
602
      u8 offset_slot_full[DEFLATE_MAX_MATCH_OFFSET + 1];
603
604
      /* Literal/match statistics saved from previous block */
605
      u32 prev_observations[NUM_OBSERVATION_TYPES];
606
      u32 prev_num_observations;
607
608
      /*
609
       * Approximate match length frequencies based on a
610
       * greedy parse, gathered during matchfinding.  This is
611
       * used for setting the initial symbol costs.
612
       */
613
      u32 new_match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1];
614
      u32 match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1];
615
616
      /*
617
       * The maximum number of optimization passes
618
       * (min-cost path searches) per block.
619
       * Larger values = more compression.
620
       */
621
      unsigned max_optim_passes;
622
623
      /*
624
       * If an optimization pass improves the cost by fewer
625
       * than this number of bits, then optimization will stop
626
       * early, before max_optim_passes has been reached.
627
       * Smaller values = more compression.
628
       */
629
      u32 min_improvement_to_continue;
630
631
      /*
632
       * The minimum number of bits that would need to be
633
       * saved for it to be considered worth the time to
634
       * regenerate and use the min-cost path from a previous
635
       * optimization pass, in the case where the final
636
       * optimization pass actually increased the cost.
637
       * Smaller values = more compression.
638
       */
639
      u32 min_bits_to_use_nonfinal_path;
640
641
      /*
642
       * The maximum block length, in uncompressed bytes, at
643
       * which to find and consider the optimal match/literal
644
       * list for the static Huffman codes.  This strategy
645
       * improves the compression ratio produced by static
646
       * Huffman blocks and can discover more cases in which
647
       * static blocks are worthwhile.  This helps mostly with
648
       * small blocks, hence why this parameter is a max_len.
649
       *
650
       * Above this block length, static Huffman blocks are
651
       * only used opportunistically.  I.e. a static Huffman
652
       * block is only used if a static block using the same
653
       * match/literal list as the optimized dynamic block
654
       * happens to be cheaper than the dynamic block itself.
655
       */
656
      u32 max_len_to_optimize_static_block;
657
658
    } n; /* (n)ear-optimal */
659
  #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
660
661
  } p; /* (p)arser */
662
};
663
664
/*
665
 * The type for the bitbuffer variable, which temporarily holds bits that are
666
 * being packed into bytes and written to the output buffer.  For best
667
 * performance, this should have size equal to a machine word.
668
 */
669
typedef machine_word_t bitbuf_t;
670
671
/*
672
 * The capacity of the bitbuffer, in bits.  This is 1 less than the real size,
673
 * in order to avoid undefined behavior when doing bitbuf >>= bitcount & ~7.
674
 */
675
378k
#define BITBUF_NBITS  (8 * sizeof(bitbuf_t) - 1)
676
677
/*
678
 * Can the specified number of bits always be added to 'bitbuf' after any
679
 * pending bytes have been flushed?  There can be up to 7 bits remaining after a
680
 * flush, so the count must not exceed BITBUF_NBITS after adding 'n' more bits.
681
 */
682
378k
#define CAN_BUFFER(n) (7 + (n) <= BITBUF_NBITS)
683
684
/*
685
 * Structure to keep track of the current state of sending bits to the
686
 * compressed output buffer
687
 */
688
struct deflate_output_bitstream {
689
690
  /* Bits that haven't yet been written to the output buffer */
691
  bitbuf_t bitbuf;
692
693
  /*
694
   * Number of bits currently held in @bitbuf.  This can be between 0 and
695
   * BITBUF_NBITS in general, or between 0 and 7 after a flush.
696
   */
697
  unsigned bitcount;
698
699
  /*
700
   * Pointer to the position in the output buffer at which the next byte
701
   * should be written
702
   */
703
  u8 *next;
704
705
  /* Pointer to the end of the output buffer */
706
  u8 *end;
707
708
  /* true if the output buffer ran out of space */
709
  bool overflow;
710
};
711
712
/*
713
 * Add some bits to the bitbuffer variable of the output bitstream.  The caller
714
 * must ensure that 'bitcount + n <= BITBUF_NBITS', by calling FLUSH_BITS()
715
 * frequently enough.
716
 */
717
967k
#define ADD_BITS(bits, n)     \
718
967k
do {           \
719
967k
  bitbuf |= (bitbuf_t)(bits) << bitcount; \
720
967k
  bitcount += (n);      \
721
967k
  ASSERT(bitcount <= BITBUF_NBITS); \
722
967k
} while (0)
723
724
/*
725
 * Flush bits from the bitbuffer variable to the output buffer.  After this, the
726
 * bitbuffer will contain at most 7 bits (a partial byte).
727
 *
728
 * Since deflate_flush_block() verified ahead of time that there is enough space
729
 * remaining before actually writing the block, it's guaranteed that out_next
730
 * won't exceed os->end.  However, there might not be enough space remaining to
731
 * flush a whole word, even though that's fastest.  Therefore, flush a whole
732
 * word if there is space for it, otherwise flush a byte at a time.
733
 */
734
381k
#define FLUSH_BITS()              \
735
381k
do {                 \
736
381k
  if (UNALIGNED_ACCESS_IS_FAST && likely(out_next < out_fast_end)) { \
737
381k
    /* Flush a whole word (branchlessly). */    \
738
381k
    put_unaligned_leword(bitbuf, out_next);     \
739
381k
    bitbuf >>= bitcount & ~7;       \
740
381k
    out_next += bitcount >> 3;        \
741
381k
    bitcount &= 7;            \
742
381k
  } else {             \
743
0
    /* Flush a byte at a time. */       \
744
0
    while (bitcount >= 8) {         \
745
0
      ASSERT(out_next < os->end);     \
746
0
      *out_next++ = bitbuf;       \
747
0
      bitcount -= 8;          \
748
0
      bitbuf >>= 8;         \
749
0
    }             \
750
0
  }                \
751
381k
} while (0)
752
753
/*
754
 * Given the binary tree node A[subtree_idx] whose children already satisfy the
755
 * maxheap property, swap the node with its greater child until it is greater
756
 * than or equal to both of its children, so that the maxheap property is
757
 * satisfied in the subtree rooted at A[subtree_idx].  'A' uses 1-based indices.
758
 */
759
static void
760
heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx)
761
4.02k
{
762
4.02k
  unsigned parent_idx;
763
4.02k
  unsigned child_idx;
764
4.02k
  u32 v;
765
766
4.02k
  v = A[subtree_idx];
767
4.02k
  parent_idx = subtree_idx;
768
6.61k
  while ((child_idx = parent_idx * 2) <= length) {
769
3.49k
    if (child_idx < length && A[child_idx + 1] > A[child_idx])
770
874
      child_idx++;
771
3.49k
    if (v >= A[child_idx])
772
914
      break;
773
2.58k
    A[parent_idx] = A[child_idx];
774
2.58k
    parent_idx = child_idx;
775
2.58k
  }
776
4.02k
  A[parent_idx] = v;
777
4.02k
}
778
779
/*
780
 * Rearrange the array 'A' so that it satisfies the maxheap property.
781
 * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1].
782
 */
783
static void
784
heapify_array(u32 A[], unsigned length)
785
5.77k
{
786
5.77k
  unsigned subtree_idx;
787
788
7.29k
  for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--)
789
1.51k
    heapify_subtree(A, length, subtree_idx);
790
5.77k
}
791
792
/*
793
 * Sort the array 'A', which contains 'length' unsigned 32-bit integers.
794
 *
795
 * Note: name this function heap_sort() instead of heapsort() to avoid colliding
796
 * with heapsort() from stdlib.h on BSD-derived systems.
797
 */
798
static void
799
heap_sort(u32 A[], unsigned length)
800
5.77k
{
801
5.77k
  A--; /* Use 1-based indices  */
802
803
5.77k
  heapify_array(A, length);
804
805
8.29k
  while (length >= 2) {
806
2.51k
    u32 tmp = A[length];
807
808
2.51k
    A[length] = A[1];
809
2.51k
    A[1] = tmp;
810
2.51k
    length--;
811
2.51k
    heapify_subtree(A, length, 1);
812
2.51k
  }
813
5.77k
}
814
815
6.40M
#define NUM_SYMBOL_BITS 10
816
#define NUM_FREQ_BITS (32 - NUM_SYMBOL_BITS)
817
3.89M
#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1)
818
1.87M
#define FREQ_MASK (~SYMBOL_MASK)
819
820
5.77k
#define GET_NUM_COUNTERS(num_syms)  (num_syms)
821
822
/*
823
 * Sort the symbols primarily by frequency and secondarily by symbol value.
824
 * Discard symbols with zero frequency and fill in an array with the remaining
825
 * symbols, along with their frequencies.  The low NUM_SYMBOL_BITS bits of each
826
 * array entry will contain the symbol value, and the remaining bits will
827
 * contain the frequency.
828
 *
829
 * @num_syms
830
 *  Number of symbols in the alphabet, at most 1 << NUM_SYMBOL_BITS.
831
 *
832
 * @freqs[num_syms]
833
 *  Frequency of each symbol, summing to at most (1 << NUM_FREQ_BITS) - 1.
834
 *
835
 * @lens[num_syms]
836
 *  An array that eventually will hold the length of each codeword.  This
837
 *  function only fills in the codeword lengths for symbols that have zero
838
 *  frequency, which are not well defined per se but will be set to 0.
839
 *
840
 * @symout[num_syms]
841
 *  The output array, described above.
842
 *
843
 * Returns the number of entries in 'symout' that were filled.  This is the
844
 * number of symbols that have nonzero frequency.
845
 */
846
static unsigned
847
sort_symbols(unsigned num_syms, const u32 freqs[], u8 lens[], u32 symout[])
848
5.77k
{
849
5.77k
  unsigned sym;
850
5.77k
  unsigned i;
851
5.77k
  unsigned num_used_syms;
852
5.77k
  unsigned num_counters;
853
5.77k
  unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)];
854
855
  /*
856
   * We use heapsort, but with an added optimization.  Since often most
857
   * symbol frequencies are low, we first do a count sort using a limited
858
   * number of counters.  High frequencies are counted in the last
859
   * counter, and only they will be sorted with heapsort.
860
   *
861
   * Note: with more symbols, it is generally beneficial to have more
862
   * counters.  About 1 counter per symbol seems fastest.
863
   */
864
865
5.77k
  num_counters = GET_NUM_COUNTERS(num_syms);
866
867
5.77k
  memset(counters, 0, num_counters * sizeof(counters[0]));
868
869
  /* Count the frequencies. */
870
768k
  for (sym = 0; sym < num_syms; sym++)
871
763k
    counters[MIN(freqs[sym], num_counters - 1)]++;
872
873
  /*
874
   * Make the counters cumulative, ignoring the zero-th, which counted
875
   * symbols with zero frequency.  As a side effect, this calculates the
876
   * number of symbols with nonzero frequency.
877
   */
878
5.77k
  num_used_syms = 0;
879
763k
  for (i = 1; i < num_counters; i++) {
880
757k
    unsigned count = counters[i];
881
882
757k
    counters[i] = num_used_syms;
883
757k
    num_used_syms += count;
884
757k
  }
885
886
  /*
887
   * Sort nonzero-frequency symbols using the counters.  At the same time,
888
   * set the codeword lengths of zero-frequency symbols to 0.
889
   */
890
768k
  for (sym = 0; sym < num_syms; sym++) {
891
763k
    u32 freq = freqs[sym];
892
893
763k
    if (freq != 0) {
894
511k
      symout[counters[MIN(freq, num_counters - 1)]++] =
895
511k
        sym | (freq << NUM_SYMBOL_BITS);
896
511k
    } else {
897
251k
      lens[sym] = 0;
898
251k
    }
899
763k
  }
900
901
  /* Sort the symbols counted in the last counter. */
902
5.77k
  heap_sort(symout + counters[num_counters - 2],
903
5.77k
      counters[num_counters - 1] - counters[num_counters - 2]);
904
905
5.77k
  return num_used_syms;
906
5.77k
}
907
908
/*
909
 * Build a Huffman tree.
910
 *
911
 * This is an optimized implementation that
912
 *  (a) takes advantage of the frequencies being already sorted;
913
 *  (b) only generates non-leaf nodes, since the non-leaf nodes of a Huffman
914
 *      tree are sufficient to generate a canonical code;
915
 *  (c) Only stores parent pointers, not child pointers;
916
 *  (d) Produces the nodes in the same memory used for input frequency
917
 *      information.
918
 *
919
 * Array 'A', which contains 'sym_count' entries, is used for both input and
920
 * output.  For this function, 'sym_count' must be at least 2.
921
 *
922
 * For input, the array must contain the frequencies of the symbols, sorted in
923
 * increasing order.  Specifically, each entry must contain a frequency left
924
 * shifted by NUM_SYMBOL_BITS bits.  Any data in the low NUM_SYMBOL_BITS bits of
925
 * the entries will be ignored by this function.  Although these bits will, in
926
 * fact, contain the symbols that correspond to the frequencies, this function
927
 * is concerned with frequencies only and keeps the symbols as-is.
928
 *
929
 * For output, this function will produce the non-leaf nodes of the Huffman
930
 * tree.  These nodes will be stored in the first (sym_count - 1) entries of the
931
 * array.  Entry A[sym_count - 2] will represent the root node.  Each other node
932
 * will contain the zero-based index of its parent node in 'A', left shifted by
933
 * NUM_SYMBOL_BITS bits.  The low NUM_SYMBOL_BITS bits of each entry in A will
934
 * be kept as-is.  Again, note that although these low bits will, in fact,
935
 * contain a symbol value, this symbol will have *no relationship* with the
936
 * Huffman tree node that happens to occupy the same slot.  This is because this
937
 * implementation only generates the non-leaf nodes of the tree.
938
 */
939
static void
940
build_tree(u32 A[], unsigned sym_count)
941
5.69k
{
942
5.69k
  const unsigned last_idx = sym_count - 1;
943
944
  /* Index of the next lowest frequency leaf that still needs a parent */
945
5.69k
  unsigned i = 0;
946
947
  /*
948
   * Index of the next lowest frequency non-leaf that still needs a
949
   * parent, or 'e' if there is currently no such node
950
   */
951
5.69k
  unsigned b = 0;
952
953
  /* Index of the next spot for a non-leaf (will overwrite a leaf) */
954
5.69k
  unsigned e = 0;
955
956
505k
  do {
957
505k
    u32 new_freq;
958
959
    /*
960
     * Select the next two lowest frequency nodes among the leaves
961
     * A[i] and non-leaves A[b], and create a new node A[e] to be
962
     * their parent.  Set the new node's frequency to the sum of the
963
     * frequencies of its two children.
964
     *
965
     * Usually the next two lowest frequency nodes are of the same
966
     * type (leaf or non-leaf), so check those cases first.
967
     */
968
505k
    if (i + 1 <= last_idx &&
969
339k
        (b == e || (A[i + 1] & FREQ_MASK) <= (A[b] & FREQ_MASK))) {
970
      /* Two leaves */
971
247k
      new_freq = (A[i] & FREQ_MASK) + (A[i + 1] & FREQ_MASK);
972
247k
      i += 2;
973
258k
    } else if (b + 2 <= e &&
974
256k
         (i > last_idx ||
975
241k
          (A[b + 1] & FREQ_MASK) < (A[i] & FREQ_MASK))) {
976
      /* Two non-leaves */
977
241k
      new_freq = (A[b] & FREQ_MASK) + (A[b + 1] & FREQ_MASK);
978
241k
      A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK);
979
241k
      A[b + 1] = (e << NUM_SYMBOL_BITS) |
980
241k
           (A[b + 1] & SYMBOL_MASK);
981
241k
      b += 2;
982
241k
    } else {
983
      /* One leaf and one non-leaf */
984
17.3k
      new_freq = (A[i] & FREQ_MASK) + (A[b] & FREQ_MASK);
985
17.3k
      A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK);
986
17.3k
      i++;
987
17.3k
      b++;
988
17.3k
    }
989
505k
    A[e] = new_freq | (A[e] & SYMBOL_MASK);
990
    /*
991
     * A binary tree with 'n' leaves has 'n - 1' non-leaves, so the
992
     * tree is complete once we've created 'n - 1' non-leaves.
993
     */
994
505k
  } while (++e < last_idx);
995
5.69k
}
996
997
/*
998
 * Given the stripped-down Huffman tree constructed by build_tree(), determine
999
 * the number of codewords that should be assigned each possible length, taking
1000
 * into account the length-limited constraint.
1001
 *
1002
 * @A
1003
 *  The array produced by build_tree(), containing parent index information
1004
 *  for the non-leaf nodes of the Huffman tree.  Each entry in this array is
1005
 *  a node; a node's parent always has a greater index than that node
1006
 *  itself.  This function will overwrite the parent index information in
1007
 *  this array, so essentially it will destroy the tree.  However, the data
1008
 *  in the low NUM_SYMBOL_BITS of each entry will be preserved.
1009
 *
1010
 * @root_idx
1011
 *  The 0-based index of the root node in 'A', and consequently one less
1012
 *  than the number of tree node entries in 'A'.  (Or, really 2 less than
1013
 *  the actual length of 'A'.)
1014
 *
1015
 * @len_counts
1016
 *  An array of length ('max_codeword_len' + 1) in which the number of
1017
 *  codewords having each length <= max_codeword_len will be returned.
1018
 *
1019
 * @max_codeword_len
1020
 *  The maximum permissible codeword length.
1021
 */
1022
static void
1023
compute_length_counts(u32 A[], unsigned root_idx, unsigned len_counts[],
1024
          unsigned max_codeword_len)
1025
5.69k
{
1026
5.69k
  unsigned len;
1027
5.69k
  int node;
1028
1029
  /*
1030
   * The key observations are:
1031
   *
1032
   * (1) We can traverse the non-leaf nodes of the tree, always visiting a
1033
   *     parent before its children, by simply iterating through the array
1034
   *     in reverse order.  Consequently, we can compute the depth of each
1035
   *     node in one pass, overwriting the parent indices with depths.
1036
   *
1037
   * (2) We can initially assume that in the real Huffman tree, both
1038
   *     children of the root are leaves.  This corresponds to two
1039
   *     codewords of length 1.  Then, whenever we visit a (non-leaf) node
1040
   *     during the traversal, we modify this assumption to account for
1041
   *     the current node *not* being a leaf, but rather its two children
1042
   *     being leaves.  This causes the loss of one codeword for the
1043
   *     current depth and the addition of two codewords for the current
1044
   *     depth plus one.
1045
   *
1046
   * (3) We can handle the length-limited constraint fairly easily by
1047
   *     simply using the largest length available when a depth exceeds
1048
   *     max_codeword_len.
1049
   */
1050
1051
85.3k
  for (len = 0; len <= max_codeword_len; len++)
1052
79.6k
    len_counts[len] = 0;
1053
5.69k
  len_counts[1] = 2;
1054
1055
  /* Set the root node's depth to 0. */
1056
5.69k
  A[root_idx] &= SYMBOL_MASK;
1057
1058
505k
  for (node = root_idx - 1; node >= 0; node--) {
1059
1060
    /* Calculate the depth of this node. */
1061
1062
500k
    unsigned parent = A[node] >> NUM_SYMBOL_BITS;
1063
500k
    unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS;
1064
500k
    unsigned depth = parent_depth + 1;
1065
1066
    /*
1067
     * Set the depth of this node so that it is available when its
1068
     * children (if any) are processed.
1069
     */
1070
500k
    A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS);
1071
1072
    /*
1073
     * If needed, decrease the length to meet the length-limited
1074
     * constraint.  This is not the optimal method for generating
1075
     * length-limited Huffman codes!  But it should be good enough.
1076
     */
1077
500k
    if (depth >= max_codeword_len) {
1078
220
      depth = max_codeword_len;
1079
272
      do {
1080
272
        depth--;
1081
272
      } while (len_counts[depth] == 0);
1082
220
    }
1083
1084
    /*
1085
     * Account for the fact that we have a non-leaf node at the
1086
     * current depth.
1087
     */
1088
500k
    len_counts[depth]--;
1089
500k
    len_counts[depth + 1] += 2;
1090
500k
  }
1091
5.69k
}
1092
1093
/*
1094
 * DEFLATE uses bit-reversed codewords, so we must bit-reverse the codewords
1095
 * after generating them.  All codewords have length <= 16 bits.  If the CPU has
1096
 * a bit-reversal instruction, then that is the fastest method.  Otherwise the
1097
 * fastest method is to reverse the bits in each of the two bytes using a table.
1098
 * The table method is slightly faster than using bitwise operations to flip
1099
 * adjacent 1, 2, 4, and then 8-bit fields, even if 2 to 4 codewords are packed
1100
 * into a machine word and processed together using that method.
1101
 */
1102
1103
#ifdef rbit32
1104
static forceinline u32 reverse_codeword(u32 codeword, u8 len)
1105
{
1106
  return rbit32(codeword) >> ((32 - len) & 31);
1107
}
1108
#else
1109
/* Generated by scripts/gen_bitreverse_tab.py */
1110
static const u8 bitreverse_tab[256] = {
1111
  0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0,
1112
  0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0,
1113
  0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8,
1114
  0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8,
1115
  0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4,
1116
  0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4,
1117
  0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec,
1118
  0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc,
1119
  0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2,
1120
  0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2,
1121
  0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea,
1122
  0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa,
1123
  0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6,
1124
  0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6,
1125
  0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee,
1126
  0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe,
1127
  0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1,
1128
  0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1,
1129
  0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9,
1130
  0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9,
1131
  0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5,
1132
  0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5,
1133
  0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed,
1134
  0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd,
1135
  0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3,
1136
  0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3,
1137
  0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb,
1138
  0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb,
1139
  0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7,
1140
  0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7,
1141
  0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef,
1142
  0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff,
1143
};
1144
1145
static forceinline u32 reverse_codeword(u32 codeword, u8 len)
1146
760k
{
1147
760k
  STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16);
1148
760k
  codeword = ((u32)bitreverse_tab[codeword & 0xff] << 8) |
1149
760k
       bitreverse_tab[codeword >> 8];
1150
760k
  return codeword >> (16 - len);
1151
760k
}
1152
#endif /* !rbit32 */
1153
1154
/*
1155
 * Generate the codewords for a canonical Huffman code.
1156
 *
1157
 * @A
1158
 *  The output array for codewords.  In addition, initially this
1159
 *  array must contain the symbols, sorted primarily by frequency and
1160
 *  secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of
1161
 *  each entry.
1162
 *
1163
 * @len
1164
 *  Output array for codeword lengths.
1165
 *
1166
 * @len_counts
1167
 *  An array that provides the number of codewords that will have
1168
 *  each possible length <= max_codeword_len.
1169
 *
1170
 * @max_codeword_len
1171
 *  Maximum length, in bits, of each codeword.
1172
 *
1173
 * @num_syms
1174
 *  Number of symbols in the alphabet, including symbols with zero
1175
 *  frequency.  This is the length of the 'A' and 'len' arrays.
1176
 */
1177
static void
1178
gen_codewords(u32 A[], u8 lens[], const unsigned len_counts[],
1179
        unsigned max_codeword_len, unsigned num_syms)
1180
5.69k
{
1181
5.69k
  u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1];
1182
5.69k
  unsigned i;
1183
5.69k
  unsigned len;
1184
5.69k
  unsigned sym;
1185
1186
  /*
1187
   * Given the number of codewords that will have each length, assign
1188
   * codeword lengths to symbols.  We do this by assigning the lengths in
1189
   * decreasing order to the symbols sorted primarily by increasing
1190
   * frequency and secondarily by increasing symbol value.
1191
   */
1192
79.6k
  for (i = 0, len = max_codeword_len; len >= 1; len--) {
1193
73.9k
    unsigned count = len_counts[len];
1194
1195
585k
    while (count--)
1196
511k
      lens[A[i++] & SYMBOL_MASK] = len;
1197
73.9k
  }
1198
1199
  /*
1200
   * Generate the codewords themselves.  We initialize the
1201
   * 'next_codewords' array to provide the lexicographically first
1202
   * codeword of each length, then assign codewords in symbol order.  This
1203
   * produces a canonical code.
1204
   */
1205
5.69k
  next_codewords[0] = 0;
1206
5.69k
  next_codewords[1] = 0;
1207
73.9k
  for (len = 2; len <= max_codeword_len; len++)
1208
68.2k
    next_codewords[len] =
1209
68.2k
      (next_codewords[len - 1] + len_counts[len - 1]) << 1;
1210
1211
766k
  for (sym = 0; sym < num_syms; sym++) {
1212
    /* DEFLATE requires bit-reversed codewords. */
1213
760k
    A[sym] = reverse_codeword(next_codewords[lens[sym]]++,
1214
760k
            lens[sym]);
1215
760k
  }
1216
5.69k
}
1217
1218
/*
1219
 * ---------------------------------------------------------------------
1220
 *      deflate_make_huffman_code()
1221
 * ---------------------------------------------------------------------
1222
 *
1223
 * Given an alphabet and the frequency of each symbol in it, construct a
1224
 * length-limited canonical Huffman code.
1225
 *
1226
 * @num_syms
1227
 *  The number of symbols in the alphabet.  The symbols are the integers in
1228
 *  the range [0, num_syms - 1].  This parameter must be at least 2 and
1229
 *  must not exceed (1 << NUM_SYMBOL_BITS).
1230
 *
1231
 * @max_codeword_len
1232
 *  The maximum permissible codeword length.
1233
 *
1234
 * @freqs
1235
 *  An array of length @num_syms that gives the frequency of each symbol.
1236
 *  It is valid for some, none, or all of the frequencies to be 0.  The sum
1237
 *  of frequencies must not exceed (1 << NUM_FREQ_BITS) - 1.
1238
 *
1239
 * @lens
1240
 *  An array of @num_syms entries in which this function will return the
1241
 *  length, in bits, of the codeword assigned to each symbol.  Symbols with
1242
 *  0 frequency will not have codewords per se, but their entries in this
1243
 *  array will be set to 0.  No lengths greater than @max_codeword_len will
1244
 *  be assigned.
1245
 *
1246
 * @codewords
1247
 *  An array of @num_syms entries in which this function will return the
1248
 *  codeword for each symbol, right-justified and padded on the left with
1249
 *  zeroes.  Codewords for symbols with 0 frequency will be undefined.
1250
 *
1251
 * ---------------------------------------------------------------------
1252
 *
1253
 * This function builds a length-limited canonical Huffman code.
1254
 *
1255
 * A length-limited Huffman code contains no codewords longer than some
1256
 * specified length, and has exactly (with some algorithms) or approximately
1257
 * (with the algorithm used here) the minimum weighted path length from the
1258
 * root, given this constraint.
1259
 *
1260
 * A canonical Huffman code satisfies the properties that a longer codeword
1261
 * never lexicographically precedes a shorter codeword, and the lexicographic
1262
 * ordering of codewords of the same length is the same as the lexicographic
1263
 * ordering of the corresponding symbols.  A canonical Huffman code, or more
1264
 * generally a canonical prefix code, can be reconstructed from only a list
1265
 * containing the codeword length of each symbol.
1266
 *
1267
 * The classic algorithm to generate a Huffman code creates a node for each
1268
 * symbol, then inserts these nodes into a min-heap keyed by symbol frequency.
1269
 * Then, repeatedly, the two lowest-frequency nodes are removed from the
1270
 * min-heap and added as the children of a new node having frequency equal to
1271
 * the sum of its two children, which is then inserted into the min-heap.  When
1272
 * only a single node remains in the min-heap, it is the root of the Huffman
1273
 * tree.  The codeword for each symbol is determined by the path needed to reach
1274
 * the corresponding node from the root.  Descending to the left child appends a
1275
 * 0 bit, whereas descending to the right child appends a 1 bit.
1276
 *
1277
 * The classic algorithm is relatively easy to understand, but it is subject to
1278
 * a number of inefficiencies.  In practice, it is fastest to first sort the
1279
 * symbols by frequency.  (This itself can be subject to an optimization based
1280
 * on the fact that most frequencies tend to be low.)  At the same time, we sort
1281
 * secondarily by symbol value, which aids the process of generating a canonical
1282
 * code.  Then, during tree construction, no heap is necessary because both the
1283
 * leaf nodes and the unparented non-leaf nodes can be easily maintained in
1284
 * sorted order.  Consequently, there can never be more than two possibilities
1285
 * for the next-lowest-frequency node.
1286
 *
1287
 * In addition, because we're generating a canonical code, we actually don't
1288
 * need the leaf nodes of the tree at all, only the non-leaf nodes.  This is
1289
 * because for canonical code generation we don't need to know where the symbols
1290
 * are in the tree.  Rather, we only need to know how many leaf nodes have each
1291
 * depth (codeword length).  And this information can, in fact, be quickly
1292
 * generated from the tree of non-leaves only.
1293
 *
1294
 * Furthermore, we can build this stripped-down Huffman tree directly in the
1295
 * array in which the codewords are to be generated, provided that these array
1296
 * slots are large enough to hold a symbol and frequency value.
1297
 *
1298
 * Still furthermore, we don't even need to maintain explicit child pointers.
1299
 * We only need the parent pointers, and even those can be overwritten in-place
1300
 * with depth information as part of the process of extracting codeword lengths
1301
 * from the tree.  So in summary, we do NOT need a big structure like:
1302
 *
1303
 *  struct huffman_tree_node {
1304
 *    unsigned int symbol;
1305
 *    unsigned int frequency;
1306
 *    unsigned int depth;
1307
 *    struct huffman_tree_node *left_child;
1308
 *    struct huffman_tree_node *right_child;
1309
 *  };
1310
 *
1311
 *
1312
 * ... which often gets used in "naive" implementations of Huffman code
1313
 * generation.
1314
 *
1315
 * Many of these optimizations are based on the implementation in 7-Zip (source
1316
 * file: C/HuffEnc.c), which was placed in the public domain by Igor Pavlov.
1317
 */
1318
static void
1319
deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len,
1320
        const u32 freqs[], u8 lens[], u32 codewords[])
1321
5.77k
{
1322
5.77k
  u32 *A = codewords;
1323
5.77k
  unsigned num_used_syms;
1324
1325
5.77k
  STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS);
1326
5.77k
  STATIC_ASSERT(MAX_BLOCK_LENGTH <= ((u32)1 << NUM_FREQ_BITS) - 1);
1327
1328
  /*
1329
   * We begin by sorting the symbols primarily by frequency and
1330
   * secondarily by symbol value.  As an optimization, the array used for
1331
   * this purpose ('A') shares storage with the space in which we will
1332
   * eventually return the codewords.
1333
   */
1334
5.77k
  num_used_syms = sort_symbols(num_syms, freqs, lens, A);
1335
  /*
1336
   * 'num_used_syms' is the number of symbols with nonzero frequency.
1337
   * This may be less than @num_syms.  'num_used_syms' is also the number
1338
   * of entries in 'A' that are valid.  Each entry consists of a distinct
1339
   * symbol and a nonzero frequency packed into a 32-bit integer.
1340
   */
1341
1342
  /*
1343
   * A complete Huffman code must contain at least 2 codewords.  Yet, it's
1344
   * possible that fewer than 2 symbols were used.  When this happens,
1345
   * it's usually for the offset code (0-1 symbols used).  But it's also
1346
   * theoretically possible for the litlen and pre codes (1 symbol used).
1347
   *
1348
   * The DEFLATE RFC explicitly allows the offset code to contain just 1
1349
   * codeword, or even be completely empty.  But it's silent about the
1350
   * other codes.  It also doesn't say whether, in the 1-codeword case,
1351
   * the codeword (which it says must be 1 bit) is '0' or '1'.
1352
   *
1353
   * In any case, some DEFLATE decompressors reject these cases.  zlib
1354
   * generally allows them, but it does reject precodes that have just 1
1355
   * codeword.  More problematically, zlib v1.2.1 and earlier rejected
1356
   * empty offset codes, and this behavior can also be seen in Windows
1357
   * Explorer's ZIP unpacker (supposedly even still in Windows 11).
1358
   *
1359
   * Other DEFLATE compressors, including zlib, always send at least 2
1360
   * codewords in order to make a complete Huffman code.  Therefore, this
1361
   * is a case where practice does not entirely match the specification.
1362
   * We follow practice by generating 2 codewords of length 1: codeword
1363
   * '0' for symbol 0, and codeword '1' for another symbol -- the used
1364
   * symbol if it exists and is not symbol 0, otherwise symbol 1.  This
1365
   * does worsen the compression ratio by having to send an unnecessary
1366
   * offset codeword length.  But this only affects rare cases such as
1367
   * blocks containing all literals, and it only makes a tiny difference.
1368
   */
1369
5.77k
  if (unlikely(num_used_syms < 2)) {
1370
80
    unsigned sym = num_used_syms ? (A[0] & SYMBOL_MASK) : 0;
1371
80
    unsigned nonzero_idx = sym ? sym : 1;
1372
1373
80
    codewords[0] = 0;
1374
80
    lens[0] = 1;
1375
80
    codewords[nonzero_idx] = 1;
1376
80
    lens[nonzero_idx] = 1;
1377
80
    return;
1378
80
  }
1379
1380
  /*
1381
   * Build a stripped-down version of the Huffman tree, sharing the array
1382
   * 'A' with the symbol values.  Then extract length counts from the tree
1383
   * and use them to generate the final codewords.
1384
   */
1385
1386
5.69k
  build_tree(A, num_used_syms);
1387
1388
5.69k
  {
1389
5.69k
    unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1];
1390
1391
5.69k
    compute_length_counts(A, num_used_syms - 2,
1392
5.69k
              len_counts, max_codeword_len);
1393
1394
5.69k
    gen_codewords(A, lens, len_counts, max_codeword_len, num_syms);
1395
5.69k
  }
1396
5.69k
}
1397
1398
/*
1399
 * Clear the Huffman symbol frequency counters.  This must be called when
1400
 * starting a new DEFLATE block.
1401
 */
1402
static void
1403
deflate_reset_symbol_frequencies(struct libdeflate_compressor *c)
1404
1.14k
{
1405
1.14k
  memset(&c->freqs, 0, sizeof(c->freqs));
1406
1.14k
}
1407
1408
/*
1409
 * Build the literal/length and offset Huffman codes for a DEFLATE block.
1410
 *
1411
 * This takes as input the frequency tables for each alphabet and produces as
1412
 * output a set of tables that map symbols to codewords and codeword lengths.
1413
 */
1414
static void
1415
deflate_make_huffman_codes(const struct deflate_freqs *freqs,
1416
         struct deflate_codes *codes)
1417
2.31k
{
1418
2.31k
  deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS,
1419
2.31k
          MAX_LITLEN_CODEWORD_LEN,
1420
2.31k
          freqs->litlen,
1421
2.31k
          codes->lens.litlen,
1422
2.31k
          codes->codewords.litlen);
1423
1424
2.31k
  deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS,
1425
2.31k
          MAX_OFFSET_CODEWORD_LEN,
1426
2.31k
          freqs->offset,
1427
2.31k
          codes->lens.offset,
1428
2.31k
          codes->codewords.offset);
1429
2.31k
}
1430
1431
/* Initialize c->static_codes. */
1432
static void
1433
deflate_init_static_codes(struct libdeflate_compressor *c)
1434
1.17k
{
1435
1.17k
  unsigned i;
1436
1437
170k
  for (i = 0; i < 144; i++)
1438
169k
    c->freqs.litlen[i] = 1 << (9 - 8);
1439
132k
  for (; i < 256; i++)
1440
131k
    c->freqs.litlen[i] = 1 << (9 - 9);
1441
29.3k
  for (; i < 280; i++)
1442
28.1k
    c->freqs.litlen[i] = 1 << (9 - 7);
1443
10.5k
  for (; i < 288; i++)
1444
9.39k
    c->freqs.litlen[i] = 1 << (9 - 8);
1445
1446
38.7k
  for (i = 0; i < 32; i++)
1447
37.5k
    c->freqs.offset[i] = 1 << (5 - 5);
1448
1449
1.17k
  deflate_make_huffman_codes(&c->freqs, &c->static_codes);
1450
1.17k
}
1451
1452
/* Return the offset slot for the given match offset, using the small map. */
1453
static forceinline unsigned
1454
deflate_get_offset_slot(u32 offset)
1455
125k
{
1456
  /*
1457
   * 1 <= offset <= 32768 here.  For 1 <= offset <= 256,
1458
   * deflate_offset_slot[offset - 1] gives the slot.
1459
   *
1460
   * For 257 <= offset <= 32768, we take advantage of the fact that 257 is
1461
   * the beginning of slot 16, and each slot [16..30) is exactly 1 << 7 ==
1462
   * 128 times larger than each slot [2..16) (since the number of extra
1463
   * bits increases by 1 every 2 slots).  Thus, the slot is:
1464
   *
1465
   *  deflate_offset_slot[2 + ((offset - 257) >> 7)] + (16 - 2)
1466
   *   == deflate_offset_slot[((offset - 1) >> 7)] + 14
1467
   *
1468
   * Define 'n = (offset <= 256) ? 0 : 7'.  Then any offset is handled by:
1469
   *
1470
   *      deflate_offset_slot[(offset - 1) >> n] + (n << 1)
1471
   *
1472
   * For better performance, replace 'n = (offset <= 256) ? 0 : 7' with
1473
   * the equivalent (for offset <= 536871168) 'n = (256 - offset) >> 29'.
1474
   */
1475
125k
  unsigned n = (256 - offset) >> 29;
1476
1477
125k
  ASSERT(offset >= 1 && offset <= 32768);
1478
1479
125k
  return deflate_offset_slot[(offset - 1) >> n] + (n << 1);
1480
125k
}
1481
1482
static unsigned
1483
deflate_compute_precode_items(const u8 lens[], const unsigned num_lens,
1484
            u32 precode_freqs[], unsigned precode_items[])
1485
1.14k
{
1486
1.14k
  unsigned *itemptr;
1487
1.14k
  unsigned run_start;
1488
1.14k
  unsigned run_end;
1489
1.14k
  unsigned extra_bits;
1490
1.14k
  u8 len;
1491
1492
1.14k
  memset(precode_freqs, 0,
1493
1.14k
         DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0]));
1494
1495
1.14k
  itemptr = precode_items;
1496
1.14k
  run_start = 0;
1497
129k
  do {
1498
    /* Find the next run of codeword lengths. */
1499
1500
    /* len = the length being repeated */
1501
129k
    len = lens[run_start];
1502
1503
    /* Extend the run. */
1504
129k
    run_end = run_start;
1505
341k
    do {
1506
341k
      run_end++;
1507
341k
    } while (run_end != num_lens && len == lens[run_end]);
1508
1509
129k
    if (len == 0) {
1510
      /* Run of zeroes. */
1511
1512
      /* Symbol 18: RLE 11 to 138 zeroes at a time. */
1513
41.9k
      while ((run_end - run_start) >= 11) {
1514
3.37k
        extra_bits = MIN((run_end - run_start) - 11,
1515
3.37k
             0x7F);
1516
3.37k
        precode_freqs[18]++;
1517
3.37k
        *itemptr++ = 18 | (extra_bits << 5);
1518
3.37k
        run_start += 11 + extra_bits;
1519
3.37k
      }
1520
1521
      /* Symbol 17: RLE 3 to 10 zeroes at a time. */
1522
38.6k
      if ((run_end - run_start) >= 3) {
1523
12.8k
        extra_bits = MIN((run_end - run_start) - 3,
1524
12.8k
             0x7);
1525
12.8k
        precode_freqs[17]++;
1526
12.8k
        *itemptr++ = 17 | (extra_bits << 5);
1527
12.8k
        run_start += 3 + extra_bits;
1528
12.8k
      }
1529
90.6k
    } else {
1530
1531
      /* A run of nonzero lengths. */
1532
1533
      /* Symbol 16: RLE 3 to 6 of the previous length. */
1534
90.6k
      if ((run_end - run_start) >= 4) {
1535
2.80k
        precode_freqs[len]++;
1536
2.80k
        *itemptr++ = len;
1537
2.80k
        run_start++;
1538
3.01k
        do {
1539
3.01k
          extra_bits = MIN((run_end - run_start) -
1540
3.01k
               3, 0x3);
1541
3.01k
          precode_freqs[16]++;
1542
3.01k
          *itemptr++ = 16 | (extra_bits << 5);
1543
3.01k
          run_start += 3 + extra_bits;
1544
3.01k
        } while ((run_end - run_start) >= 3);
1545
2.80k
      }
1546
90.6k
    }
1547
1548
    /* Output any remaining lengths without RLE. */
1549
269k
    while (run_start != run_end) {
1550
139k
      precode_freqs[len]++;
1551
139k
      *itemptr++ = len;
1552
139k
      run_start++;
1553
139k
    }
1554
129k
  } while (run_start != num_lens);
1555
1556
1.14k
  return itemptr - precode_items;
1557
1.14k
}
1558
1559
/*
1560
 * Huffman codeword lengths for dynamic Huffman blocks are compressed using a
1561
 * separate Huffman code, the "precode", which contains a symbol for each
1562
 * possible codeword length in the larger code as well as several special
1563
 * symbols to represent repeated codeword lengths (a form of run-length
1564
 * encoding).  The precode is itself constructed in canonical form, and its
1565
 * codeword lengths are represented literally in 19 3-bit fields that
1566
 * immediately precede the compressed codeword lengths of the larger code.
1567
 */
1568
1569
/* Precompute the information needed to output dynamic Huffman codes. */
1570
static void
1571
deflate_precompute_huffman_header(struct libdeflate_compressor *c)
1572
1.14k
{
1573
  /* Compute how many litlen and offset symbols are needed. */
1574
1575
1.14k
  for (c->o.precode.num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS;
1576
8.36k
       c->o.precode.num_litlen_syms > 257;
1577
7.22k
       c->o.precode.num_litlen_syms--)
1578
8.36k
    if (c->codes.lens.litlen[c->o.precode.num_litlen_syms - 1] != 0)
1579
1.14k
      break;
1580
1581
1.14k
  for (c->o.precode.num_offset_syms = DEFLATE_NUM_OFFSET_SYMS;
1582
18.0k
       c->o.precode.num_offset_syms > 1;
1583
16.8k
       c->o.precode.num_offset_syms--)
1584
18.0k
    if (c->codes.lens.offset[c->o.precode.num_offset_syms - 1] != 0)
1585
1.14k
      break;
1586
1587
  /*
1588
   * If we're not using the full set of literal/length codeword lengths,
1589
   * then temporarily move the offset codeword lengths over so that the
1590
   * literal/length and offset codeword lengths are contiguous.
1591
   */
1592
1.14k
  STATIC_ASSERT(offsetof(struct deflate_lens, offset) ==
1593
1.14k
          DEFLATE_NUM_LITLEN_SYMS);
1594
1.14k
  if (c->o.precode.num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
1595
1.14k
    memmove((u8 *)&c->codes.lens + c->o.precode.num_litlen_syms,
1596
1.14k
      (u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
1597
1.14k
      c->o.precode.num_offset_syms);
1598
1.14k
  }
1599
1600
  /*
1601
   * Compute the "items" (RLE / literal tokens and extra bits) with which
1602
   * the codeword lengths in the larger code will be output.
1603
   */
1604
1.14k
  c->o.precode.num_items =
1605
1.14k
    deflate_compute_precode_items((u8 *)&c->codes.lens,
1606
1.14k
                c->o.precode.num_litlen_syms +
1607
1.14k
                c->o.precode.num_offset_syms,
1608
1.14k
                c->o.precode.freqs,
1609
1.14k
                c->o.precode.items);
1610
1611
  /* Build the precode. */
1612
1.14k
  deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS,
1613
1.14k
          MAX_PRE_CODEWORD_LEN,
1614
1.14k
          c->o.precode.freqs, c->o.precode.lens,
1615
1.14k
          c->o.precode.codewords);
1616
1617
  /* Count how many precode lengths we actually need to output. */
1618
1.14k
  for (c->o.precode.num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS;
1619
3.77k
       c->o.precode.num_explicit_lens > 4;
1620
2.63k
       c->o.precode.num_explicit_lens--)
1621
3.77k
    if (c->o.precode.lens[deflate_precode_lens_permutation[
1622
3.77k
        c->o.precode.num_explicit_lens - 1]] != 0)
1623
1.14k
      break;
1624
1625
  /* Restore the offset codeword lengths if needed. */
1626
1.14k
  if (c->o.precode.num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
1627
1.14k
    memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
1628
1.14k
      (u8 *)&c->codes.lens + c->o.precode.num_litlen_syms,
1629
1.14k
      c->o.precode.num_offset_syms);
1630
1.14k
  }
1631
1.14k
}
1632
1633
/*
1634
 * To make it faster to output matches, compute the "full" match length
1635
 * codewords, i.e. the concatenation of the litlen codeword and the extra bits
1636
 * for each possible match length.
1637
 */
1638
static void
1639
deflate_compute_full_len_codewords(struct libdeflate_compressor *c,
1640
           const struct deflate_codes *codes)
1641
1.14k
{
1642
1.14k
  u32 len;
1643
1644
1.14k
  STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN +
1645
1.14k
          DEFLATE_MAX_EXTRA_LENGTH_BITS <= 32);
1646
1647
293k
  for (len = DEFLATE_MIN_MATCH_LEN; len <= DEFLATE_MAX_MATCH_LEN; len++) {
1648
292k
    unsigned slot = deflate_length_slot[len];
1649
292k
    unsigned litlen_sym = DEFLATE_FIRST_LEN_SYM + slot;
1650
292k
    u32 extra_bits = len - deflate_length_slot_base[slot];
1651
1652
292k
    c->o.length.codewords[len] =
1653
292k
      codes->codewords.litlen[litlen_sym] |
1654
292k
      (extra_bits << codes->lens.litlen[litlen_sym]);
1655
292k
    c->o.length.lens[len] = codes->lens.litlen[litlen_sym] +
1656
292k
          deflate_extra_length_bits[slot];
1657
292k
  }
1658
1.14k
}
1659
1660
/* Write a match to the output buffer. */
1661
125k
#define WRITE_MATCH(c_, codes_, length_, offset_, offset_slot_)   \
1662
125k
do {                 \
1663
125k
  const struct libdeflate_compressor *c__ = (c_);     \
1664
125k
  const struct deflate_codes *codes__ = (codes_);     \
1665
125k
  u32 length__ = (length_);         \
1666
125k
  u32 offset__ = (offset_);         \
1667
125k
  unsigned offset_slot__ = (offset_slot_);      \
1668
125k
                  \
1669
125k
  /* Litlen symbol and extra length bits */     \
1670
125k
  STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +   \
1671
125k
         DEFLATE_MAX_EXTRA_LENGTH_BITS)); \
1672
125k
  ADD_BITS(c__->o.length.codewords[length__],     \
1673
125k
     c__->o.length.lens[length__]);       \
1674
125k
                  \
1675
125k
  if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +      \
1676
125k
      DEFLATE_MAX_EXTRA_LENGTH_BITS +     \
1677
125k
      MAX_OFFSET_CODEWORD_LEN +     \
1678
125k
      DEFLATE_MAX_EXTRA_OFFSET_BITS))     \
1679
125k
    FLUSH_BITS();           \
1680
125k
                  \
1681
125k
  /* Offset symbol */           \
1682
125k
  ADD_BITS(codes__->codewords.offset[offset_slot__],    \
1683
125k
     codes__->lens.offset[offset_slot__]);      \
1684
125k
                  \
1685
125k
  if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN +      \
1686
125k
      DEFLATE_MAX_EXTRA_OFFSET_BITS))     \
1687
125k
    FLUSH_BITS();           \
1688
125k
                  \
1689
125k
  /* Extra offset bits */           \
1690
125k
  ADD_BITS(offset__ - deflate_offset_slot_base[offset_slot__],  \
1691
125k
     deflate_extra_offset_bits[offset_slot__]);   \
1692
125k
                  \
1693
125k
  FLUSH_BITS();             \
1694
125k
} while (0)
1695
1696
/*
1697
 * Choose the best type of block to use (dynamic Huffman, static Huffman, or
1698
 * uncompressed), then output it.
1699
 *
1700
 * The uncompressed data of the block is @block_begin[0..@block_length-1].  The
1701
 * sequence of literals and matches that will be used to compress the block (if
1702
 * a compressed block is chosen) is given by @sequences if it's non-NULL, or
1703
 * else @c->p.n.optimum_nodes.  @c->freqs and @c->codes must be already set
1704
 * according to the literals, matches, and end-of-block symbol.
1705
 */
1706
static void
1707
deflate_flush_block(struct libdeflate_compressor *c,
1708
        struct deflate_output_bitstream *os,
1709
        const u8 *block_begin, u32 block_length,
1710
        const struct deflate_sequence *sequences,
1711
        bool is_final_block)
1712
1.14k
{
1713
  /*
1714
   * It is hard to get compilers to understand that writes to 'os->next'
1715
   * don't alias 'os'.  That hurts performance significantly, as
1716
   * everything in 'os' would keep getting re-loaded.  ('restrict'
1717
   * *should* do the trick, but it's unreliable.)  Therefore, we keep all
1718
   * the output bitstream state in local variables, and output bits using
1719
   * macros.  This is similar to what the decompressor does.
1720
   */
1721
1.14k
  const u8 *in_next = block_begin;
1722
1.14k
  const u8 * const in_end = block_begin + block_length;
1723
1.14k
  bitbuf_t bitbuf = os->bitbuf;
1724
1.14k
  unsigned bitcount = os->bitcount;
1725
1.14k
  u8 *out_next = os->next;
1726
1.14k
  u8 * const out_fast_end =
1727
1.14k
    os->end - MIN(WORDBYTES - 1, os->end - out_next);
1728
  /*
1729
   * The cost for each block type, in bits.  Start with the cost of the
1730
   * block header which is 3 bits.
1731
   */
1732
1.14k
  u32 dynamic_cost = 3;
1733
1.14k
  u32 static_cost = 3;
1734
1.14k
  u32 uncompressed_cost = 3;
1735
1.14k
  u32 best_cost;
1736
1.14k
  struct deflate_codes *codes;
1737
1.14k
  unsigned sym;
1738
1739
1.14k
  ASSERT(block_length >= MIN_BLOCK_LENGTH ||
1740
1.14k
         (is_final_block && block_length > 0));
1741
1.14k
  ASSERT(block_length <= MAX_BLOCK_LENGTH);
1742
1.14k
  ASSERT(bitcount <= 7);
1743
1.14k
  ASSERT((bitbuf & ~(((bitbuf_t)1 << bitcount) - 1)) == 0);
1744
1.14k
  ASSERT(out_next <= os->end);
1745
1.14k
  ASSERT(!os->overflow);
1746
1747
  /* Precompute the precode items and build the precode. */
1748
1.14k
  deflate_precompute_huffman_header(c);
1749
1750
  /* Account for the cost of encoding dynamic Huffman codes. */
1751
1.14k
  dynamic_cost += 5 + 5 + 4 + (3 * c->o.precode.num_explicit_lens);
1752
22.8k
  for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) {
1753
21.7k
    u32 extra = deflate_extra_precode_bits[sym];
1754
1755
21.7k
    dynamic_cost += c->o.precode.freqs[sym] *
1756
21.7k
        (extra + c->o.precode.lens[sym]);
1757
21.7k
  }
1758
1759
  /* Account for the cost of encoding literals. */
1760
165k
  for (sym = 0; sym < 144; sym++) {
1761
164k
    dynamic_cost += c->freqs.litlen[sym] *
1762
164k
        c->codes.lens.litlen[sym];
1763
164k
    static_cost += c->freqs.litlen[sym] * 8;
1764
164k
  }
1765
129k
  for (; sym < 256; sym++) {
1766
128k
    dynamic_cost += c->freqs.litlen[sym] *
1767
128k
        c->codes.lens.litlen[sym];
1768
128k
    static_cost += c->freqs.litlen[sym] * 9;
1769
128k
  }
1770
1771
  /* Account for the cost of encoding the end-of-block symbol. */
1772
1.14k
  dynamic_cost += c->codes.lens.litlen[DEFLATE_END_OF_BLOCK];
1773
1.14k
  static_cost += 7;
1774
1775
  /* Account for the cost of encoding lengths. */
1776
1.14k
  for (sym = DEFLATE_FIRST_LEN_SYM;
1777
34.2k
       sym < DEFLATE_FIRST_LEN_SYM + ARRAY_LEN(deflate_extra_length_bits);
1778
33.1k
       sym++) {
1779
33.1k
    u32 extra = deflate_extra_length_bits[
1780
33.1k
          sym - DEFLATE_FIRST_LEN_SYM];
1781
1782
33.1k
    dynamic_cost += c->freqs.litlen[sym] *
1783
33.1k
        (extra + c->codes.lens.litlen[sym]);
1784
33.1k
    static_cost += c->freqs.litlen[sym] *
1785
33.1k
        (extra + c->static_codes.lens.litlen[sym]);
1786
33.1k
  }
1787
1788
  /* Account for the cost of encoding offsets. */
1789
35.4k
  for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) {
1790
34.2k
    u32 extra = deflate_extra_offset_bits[sym];
1791
1792
34.2k
    dynamic_cost += c->freqs.offset[sym] *
1793
34.2k
        (extra + c->codes.lens.offset[sym]);
1794
34.2k
    static_cost += c->freqs.offset[sym] * (extra + 5);
1795
34.2k
  }
1796
1797
  /* Compute the cost of using uncompressed blocks. */
1798
1.14k
  uncompressed_cost += (-(bitcount + 3) & 7) + 32 +
1799
1.14k
           (40 * (DIV_ROUND_UP(block_length,
1800
1.14k
             UINT16_MAX) - 1)) +
1801
1.14k
           (8 * block_length);
1802
1803
  /*
1804
   * Choose and output the cheapest type of block.  If there is a tie,
1805
   * prefer uncompressed, then static, then dynamic.
1806
   */
1807
1808
1.14k
  best_cost = MIN(dynamic_cost, MIN(static_cost, uncompressed_cost));
1809
1810
  /* If the block isn't going to fit, then stop early. */
1811
1.14k
  if (DIV_ROUND_UP(bitcount + best_cost, 8) > os->end - out_next) {
1812
0
    os->overflow = true;
1813
0
    return;
1814
0
  }
1815
  /*
1816
   * Else, now we know that the block fits, so no further bounds checks on
1817
   * the output buffer are required until the next block.
1818
   */
1819
1820
1.14k
  if (best_cost == uncompressed_cost) {
1821
    /*
1822
     * Uncompressed block(s).  DEFLATE limits the length of
1823
     * uncompressed blocks to UINT16_MAX bytes, so if the length of
1824
     * the "block" we're flushing is over UINT16_MAX, we actually
1825
     * output multiple blocks.
1826
     */
1827
1
    do {
1828
1
      u8 bfinal = 0;
1829
1
      size_t len = UINT16_MAX;
1830
1831
1
      if (in_end - in_next <= UINT16_MAX) {
1832
1
        bfinal = is_final_block;
1833
1
        len = in_end - in_next;
1834
1
      }
1835
      /* It was already checked that there is enough space. */
1836
1
      ASSERT(os->end - out_next >=
1837
1
             DIV_ROUND_UP(bitcount + 3, 8) + 4 + len);
1838
      /*
1839
       * Output BFINAL (1 bit) and BTYPE (2 bits), then align
1840
       * to a byte boundary.
1841
       */
1842
1
      STATIC_ASSERT(DEFLATE_BLOCKTYPE_UNCOMPRESSED == 0);
1843
1
      *out_next++ = (bfinal << bitcount) | bitbuf;
1844
1
      if (bitcount > 5)
1845
0
        *out_next++ = 0;
1846
1
      bitbuf = 0;
1847
1
      bitcount = 0;
1848
      /* Output LEN and NLEN, then the data itself. */
1849
1
      put_unaligned_le16(len, out_next);
1850
1
      out_next += 2;
1851
1
      put_unaligned_le16(~len, out_next);
1852
1
      out_next += 2;
1853
1
      memcpy(out_next, in_next, len);
1854
1
      out_next += len;
1855
1
      in_next += len;
1856
1
    } while (in_next != in_end);
1857
    /* Done outputting uncompressed block(s) */
1858
1
    goto out;
1859
1
  }
1860
1861
1.14k
  if (best_cost == static_cost) {
1862
    /* Static Huffman block */
1863
510
    codes = &c->static_codes;
1864
510
    ADD_BITS(is_final_block, 1);
1865
510
    ADD_BITS(DEFLATE_BLOCKTYPE_STATIC_HUFFMAN, 2);
1866
510
    FLUSH_BITS();
1867
632
  } else {
1868
632
    const unsigned num_explicit_lens = c->o.precode.num_explicit_lens;
1869
632
    const unsigned num_precode_items = c->o.precode.num_items;
1870
632
    unsigned precode_sym, precode_item;
1871
632
    unsigned i;
1872
1873
    /* Dynamic Huffman block */
1874
1875
632
    codes = &c->codes;
1876
632
    STATIC_ASSERT(CAN_BUFFER(1 + 2 + 5 + 5 + 4 + 3));
1877
632
    ADD_BITS(is_final_block, 1);
1878
632
    ADD_BITS(DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN, 2);
1879
632
    ADD_BITS(c->o.precode.num_litlen_syms - 257, 5);
1880
632
    ADD_BITS(c->o.precode.num_offset_syms - 1, 5);
1881
632
    ADD_BITS(num_explicit_lens - 4, 4);
1882
1883
    /* Output the lengths of the codewords in the precode. */
1884
632
    if (CAN_BUFFER(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) {
1885
      /*
1886
       * A 64-bit bitbuffer is just one bit too small to hold
1887
       * the maximum number of precode lens, so to minimize
1888
       * flushes we merge one len with the previous fields.
1889
       */
1890
632
      precode_sym = deflate_precode_lens_permutation[0];
1891
632
      ADD_BITS(c->o.precode.lens[precode_sym], 3);
1892
632
      FLUSH_BITS();
1893
632
      i = 1; /* num_explicit_lens >= 4 */
1894
10.0k
      do {
1895
10.0k
        precode_sym =
1896
10.0k
          deflate_precode_lens_permutation[i];
1897
10.0k
        ADD_BITS(c->o.precode.lens[precode_sym], 3);
1898
10.0k
      } while (++i < num_explicit_lens);
1899
632
      FLUSH_BITS();
1900
632
    } else {
1901
0
      FLUSH_BITS();
1902
0
      i = 0;
1903
0
      do {
1904
0
        precode_sym =
1905
0
          deflate_precode_lens_permutation[i];
1906
0
        ADD_BITS(c->o.precode.lens[precode_sym], 3);
1907
0
        FLUSH_BITS();
1908
0
      } while (++i < num_explicit_lens);
1909
0
    }
1910
1911
    /*
1912
     * Output the lengths of the codewords in the litlen and offset
1913
     * codes, encoded by the precode.
1914
     */
1915
632
    i = 0;
1916
110k
    do {
1917
110k
      precode_item = c->o.precode.items[i];
1918
110k
      precode_sym = precode_item & 0x1F;
1919
110k
      STATIC_ASSERT(CAN_BUFFER(MAX_PRE_CODEWORD_LEN + 7));
1920
110k
      ADD_BITS(c->o.precode.codewords[precode_sym],
1921
110k
         c->o.precode.lens[precode_sym]);
1922
110k
      ADD_BITS(precode_item >> 5,
1923
110k
         deflate_extra_precode_bits[precode_sym]);
1924
110k
      FLUSH_BITS();
1925
110k
    } while (++i < num_precode_items);
1926
632
  }
1927
1928
  /* Output the literals and matches for a dynamic or static block. */
1929
1.14k
  ASSERT(bitcount <= 7);
1930
1.14k
  deflate_compute_full_len_codewords(c, codes);
1931
1.14k
#if SUPPORT_NEAR_OPTIMAL_PARSING
1932
1.14k
  if (sequences == NULL) {
1933
    /* Output the literals and matches from the minimum-cost path */
1934
0
    struct deflate_optimum_node *cur_node =
1935
0
      &c->p.n.optimum_nodes[0];
1936
0
    struct deflate_optimum_node * const end_node =
1937
0
      &c->p.n.optimum_nodes[block_length];
1938
0
    do {
1939
0
      u32 length = cur_node->item & OPTIMUM_LEN_MASK;
1940
0
      u32 offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
1941
1942
0
      if (length == 1) {
1943
        /* Literal */
1944
0
        ADD_BITS(codes->codewords.litlen[offset],
1945
0
           codes->lens.litlen[offset]);
1946
0
        FLUSH_BITS();
1947
0
      } else {
1948
        /* Match */
1949
0
        WRITE_MATCH(c, codes, length, offset,
1950
0
              c->p.n.offset_slot_full[offset]);
1951
0
      }
1952
0
      cur_node += length;
1953
0
    } while (cur_node != end_node);
1954
0
  } else
1955
1.14k
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
1956
1.14k
  {
1957
    /* Output the literals and matches from the sequences list. */
1958
1.14k
    const struct deflate_sequence *seq;
1959
1960
126k
    for (seq = sequences; ; seq++) {
1961
126k
      u32 litrunlen = seq->litrunlen_and_length &
1962
126k
          SEQ_LITRUNLEN_MASK;
1963
126k
      u32 length = seq->litrunlen_and_length >>
1964
126k
             SEQ_LENGTH_SHIFT;
1965
126k
      unsigned lit;
1966
1967
      /* Output a run of literals. */
1968
126k
      if (CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN)) {
1969
182k
        for (; litrunlen >= 4; litrunlen -= 4) {
1970
55.6k
          lit = *in_next++;
1971
55.6k
          ADD_BITS(codes->codewords.litlen[lit],
1972
55.6k
             codes->lens.litlen[lit]);
1973
55.6k
          lit = *in_next++;
1974
55.6k
          ADD_BITS(codes->codewords.litlen[lit],
1975
55.6k
             codes->lens.litlen[lit]);
1976
55.6k
          lit = *in_next++;
1977
55.6k
          ADD_BITS(codes->codewords.litlen[lit],
1978
55.6k
             codes->lens.litlen[lit]);
1979
55.6k
          lit = *in_next++;
1980
55.6k
          ADD_BITS(codes->codewords.litlen[lit],
1981
55.6k
             codes->lens.litlen[lit]);
1982
55.6k
          FLUSH_BITS();
1983
55.6k
        }
1984
126k
        if (litrunlen-- != 0) {
1985
87.0k
          lit = *in_next++;
1986
87.0k
          ADD_BITS(codes->codewords.litlen[lit],
1987
87.0k
             codes->lens.litlen[lit]);
1988
87.0k
          if (litrunlen-- != 0) {
1989
32.8k
            lit = *in_next++;
1990
32.8k
            ADD_BITS(codes->codewords.litlen[lit],
1991
32.8k
               codes->lens.litlen[lit]);
1992
32.8k
            if (litrunlen-- != 0) {
1993
11.2k
              lit = *in_next++;
1994
11.2k
              ADD_BITS(codes->codewords.litlen[lit],
1995
11.2k
                 codes->lens.litlen[lit]);
1996
11.2k
            }
1997
32.8k
          }
1998
87.0k
          FLUSH_BITS();
1999
87.0k
        }
2000
126k
      } else {
2001
0
        while (litrunlen--) {
2002
0
          lit = *in_next++;
2003
0
          ADD_BITS(codes->codewords.litlen[lit],
2004
0
             codes->lens.litlen[lit]);
2005
0
          FLUSH_BITS();
2006
0
        }
2007
0
      }
2008
2009
126k
      if (length == 0) { /* Last sequence? */
2010
1.14k
        ASSERT(in_next == in_end);
2011
1.14k
        break;
2012
1.14k
      }
2013
2014
      /* Output a match. */
2015
125k
      WRITE_MATCH(c, codes, length, seq->offset,
2016
125k
            seq->offset_slot);
2017
125k
      in_next += length;
2018
125k
    }
2019
1.14k
  }
2020
2021
  /* Output the end-of-block symbol. */
2022
1.14k
  ASSERT(bitcount <= 7);
2023
1.14k
  ADD_BITS(codes->codewords.litlen[DEFLATE_END_OF_BLOCK],
2024
1.14k
     codes->lens.litlen[DEFLATE_END_OF_BLOCK]);
2025
1.14k
  FLUSH_BITS();
2026
1.14k
out:
2027
1.14k
  ASSERT(bitcount <= 7);
2028
  /*
2029
   * Assert that the block cost was computed correctly.  This is relied on
2030
   * above for the bounds check on the output buffer.  Also,
2031
   * libdeflate_deflate_compress_bound() relies on this via the assumption
2032
   * that uncompressed blocks will always be used when cheapest.
2033
   */
2034
1.14k
  ASSERT(8 * (out_next - os->next) + bitcount - os->bitcount == best_cost);
2035
1.14k
  os->bitbuf = bitbuf;
2036
1.14k
  os->bitcount = bitcount;
2037
1.14k
  os->next = out_next;
2038
1.14k
}
2039
2040
static void
2041
deflate_finish_block(struct libdeflate_compressor *c,
2042
         struct deflate_output_bitstream *os,
2043
         const u8 *block_begin, u32 block_length,
2044
         const struct deflate_sequence *sequences,
2045
         bool is_final_block)
2046
1.14k
{
2047
1.14k
  c->freqs.litlen[DEFLATE_END_OF_BLOCK]++;
2048
1.14k
  deflate_make_huffman_codes(&c->freqs, &c->codes);
2049
1.14k
  deflate_flush_block(c, os, block_begin, block_length, sequences,
2050
1.14k
          is_final_block);
2051
1.14k
}
2052
2053
/******************************************************************************/
2054
2055
/*
2056
 * Block splitting algorithm.  The problem is to decide when it is worthwhile to
2057
 * start a new block with new Huffman codes.  There is a theoretically optimal
2058
 * solution: recursively consider every possible block split, considering the
2059
 * exact cost of each block, and choose the minimum cost approach.  But this is
2060
 * far too slow.  Instead, as an approximation, we can count symbols and after
2061
 * every N symbols, compare the expected distribution of symbols based on the
2062
 * previous data with the actual distribution.  If they differ "by enough", then
2063
 * start a new block.
2064
 *
2065
 * As an optimization and heuristic, we don't distinguish between every symbol
2066
 * but rather we combine many symbols into a single "observation type".  For
2067
 * literals we only look at the high bits and low bits, and for matches we only
2068
 * look at whether the match is long or not.  The assumption is that for typical
2069
 * "real" data, places that are good block boundaries will tend to be noticeable
2070
 * based only on changes in these aggregate probabilities, without looking for
2071
 * subtle differences in individual symbols.  For example, a change from ASCII
2072
 * bytes to non-ASCII bytes, or from few matches (generally less compressible)
2073
 * to many matches (generally more compressible), would be easily noticed based
2074
 * on the aggregates.
2075
 *
2076
 * For determining whether the probability distributions are "different enough"
2077
 * to start a new block, the simple heuristic of splitting when the sum of
2078
 * absolute differences exceeds a constant seems to be good enough.  We also add
2079
 * a number proportional to the block length so that the algorithm is more
2080
 * likely to end long blocks than short blocks.  This reflects the general
2081
 * expectation that it will become increasingly beneficial to start a new block
2082
 * as the current block grows longer.
2083
 *
2084
 * Finally, for an approximation, it is not strictly necessary that the exact
2085
 * symbols being used are considered.  With "near-optimal parsing", for example,
2086
 * the actual symbols that will be used are unknown until after the block
2087
 * boundary is chosen and the block has been optimized.  Since the final choices
2088
 * cannot be used, we can use preliminary "greedy" choices instead.
2089
 */
2090
2091
/* Initialize the block split statistics when starting a new block. */
2092
static void
2093
init_block_split_stats(struct block_split_stats *stats)
2094
1.14k
{
2095
1.14k
  int i;
2096
2097
12.5k
  for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
2098
11.4k
    stats->new_observations[i] = 0;
2099
11.4k
    stats->observations[i] = 0;
2100
11.4k
  }
2101
1.14k
  stats->num_new_observations = 0;
2102
1.14k
  stats->num_observations = 0;
2103
1.14k
}
2104
2105
/*
2106
 * Literal observation.  Heuristic: use the top 2 bits and low 1 bits of the
2107
 * literal, for 8 possible literal observation types.
2108
 */
2109
static forceinline void
2110
observe_literal(struct block_split_stats *stats, u8 lit)
2111
355k
{
2112
355k
  stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
2113
355k
  stats->num_new_observations++;
2114
355k
}
2115
2116
/*
2117
 * Match observation.  Heuristic: use one observation type for "short match" and
2118
 * one observation type for "long match".
2119
 */
2120
static forceinline void
2121
observe_match(struct block_split_stats *stats, u32 length)
2122
125k
{
2123
125k
  stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES +
2124
125k
        (length >= 9)]++;
2125
125k
  stats->num_new_observations++;
2126
125k
}
2127
2128
static void
2129
merge_new_observations(struct block_split_stats *stats)
2130
0
{
2131
0
  int i;
2132
2133
0
  for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
2134
0
    stats->observations[i] += stats->new_observations[i];
2135
0
    stats->new_observations[i] = 0;
2136
0
  }
2137
0
  stats->num_observations += stats->num_new_observations;
2138
0
  stats->num_new_observations = 0;
2139
0
}
2140
2141
static bool
2142
do_end_block_check(struct block_split_stats *stats, u32 block_length)
2143
0
{
2144
0
  if (stats->num_observations > 0) {
2145
    /*
2146
     * Compute the sum of absolute differences of probabilities.  To
2147
     * avoid needing to use floating point arithmetic or do slow
2148
     * divisions, we do all arithmetic with the probabilities
2149
     * multiplied by num_observations * num_new_observations.  E.g.,
2150
     * for the "old" observations the probabilities would be
2151
     * (double)observations[i] / num_observations, but since we
2152
     * multiply by both num_observations and num_new_observations we
2153
     * really do observations[i] * num_new_observations.
2154
     */
2155
0
    u32 total_delta = 0;
2156
0
    u32 num_items;
2157
0
    u32 cutoff;
2158
0
    int i;
2159
2160
0
    for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
2161
0
      u32 expected = stats->observations[i] *
2162
0
               stats->num_new_observations;
2163
0
      u32 actual = stats->new_observations[i] *
2164
0
             stats->num_observations;
2165
0
      u32 delta = (actual > expected) ? actual - expected :
2166
0
                expected - actual;
2167
2168
0
      total_delta += delta;
2169
0
    }
2170
2171
0
    num_items = stats->num_observations +
2172
0
          stats->num_new_observations;
2173
    /*
2174
     * Heuristic: the cutoff is when the sum of absolute differences
2175
     * of probabilities becomes at least 200/512.  As above, the
2176
     * probability is multiplied by both num_new_observations and
2177
     * num_observations.  Be careful to avoid integer overflow.
2178
     */
2179
0
    cutoff = stats->num_new_observations * 200 / 512 *
2180
0
       stats->num_observations;
2181
    /*
2182
     * Very short blocks have a lot of overhead for the Huffman
2183
     * codes, so only use them if it clearly seems worthwhile.
2184
     * (This is an additional penalty, which adds to the smaller
2185
     * penalty below which scales more slowly.)
2186
     */
2187
0
    if (block_length < 10000 && num_items < 8192)
2188
0
      cutoff += (u64)cutoff * (8192 - num_items) / 8192;
2189
2190
    /* Ready to end the block? */
2191
0
    if (total_delta +
2192
0
        (block_length / 4096) * stats->num_observations >= cutoff)
2193
0
      return true;
2194
0
  }
2195
0
  merge_new_observations(stats);
2196
0
  return false;
2197
0
}
2198
2199
static forceinline bool
2200
ready_to_check_block(const struct block_split_stats *stats,
2201
         const u8 *in_block_begin, const u8 *in_next,
2202
         const u8 *in_end)
2203
453k
{
2204
453k
  return stats->num_new_observations >= NUM_OBSERVATIONS_PER_BLOCK_CHECK
2205
180k
    && in_next - in_block_begin >= MIN_BLOCK_LENGTH
2206
37.2k
    && in_end - in_next >= MIN_BLOCK_LENGTH;
2207
453k
}
2208
2209
static forceinline bool
2210
should_end_block(struct block_split_stats *stats,
2211
     const u8 *in_block_begin, const u8 *in_next, const u8 *in_end)
2212
453k
{
2213
  /* Ready to try to end the block (again)? */
2214
453k
  if (!ready_to_check_block(stats, in_block_begin, in_next, in_end))
2215
453k
    return false;
2216
2217
0
  return do_end_block_check(stats, in_next - in_block_begin);
2218
453k
}
2219
2220
/******************************************************************************/
2221
2222
static void
2223
deflate_begin_sequences(struct libdeflate_compressor *c,
2224
      struct deflate_sequence *first_seq)
2225
1.14k
{
2226
1.14k
  deflate_reset_symbol_frequencies(c);
2227
1.14k
  first_seq->litrunlen_and_length = 0;
2228
1.14k
}
2229
2230
static forceinline void
2231
deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal,
2232
           bool gather_split_stats, struct deflate_sequence *seq)
2233
355k
{
2234
355k
  c->freqs.litlen[literal]++;
2235
2236
355k
  if (gather_split_stats)
2237
355k
    observe_literal(&c->split_stats, literal);
2238
2239
355k
  STATIC_ASSERT(MAX_BLOCK_LENGTH <= SEQ_LITRUNLEN_MASK);
2240
355k
  seq->litrunlen_and_length++;
2241
355k
}
2242
2243
static forceinline void
2244
deflate_choose_match(struct libdeflate_compressor *c,
2245
         u32 length, u32 offset, bool gather_split_stats,
2246
         struct deflate_sequence **seq_p)
2247
125k
{
2248
125k
  struct deflate_sequence *seq = *seq_p;
2249
125k
  unsigned length_slot = deflate_length_slot[length];
2250
125k
  unsigned offset_slot = deflate_get_offset_slot(offset);
2251
2252
125k
  c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + length_slot]++;
2253
125k
  c->freqs.offset[offset_slot]++;
2254
125k
  if (gather_split_stats)
2255
125k
    observe_match(&c->split_stats, length);
2256
2257
125k
  seq->litrunlen_and_length |= length << SEQ_LENGTH_SHIFT;
2258
125k
  seq->offset = offset;
2259
125k
  seq->offset_slot = offset_slot;
2260
2261
125k
  seq++;
2262
125k
  seq->litrunlen_and_length = 0;
2263
125k
  *seq_p = seq;
2264
125k
}
2265
2266
/*
2267
 * Decrease the maximum and nice match lengths if we're approaching the end of
2268
 * the input buffer.
2269
 */
2270
static forceinline void
2271
adjust_max_and_nice_len(u32 *max_len, u32 *nice_len, size_t remaining)
2272
724k
{
2273
724k
  if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) {
2274
58.6k
    *max_len = remaining;
2275
58.6k
    *nice_len = MIN(*nice_len, *max_len);
2276
58.6k
  }
2277
724k
}
2278
2279
/*
2280
 * Choose the minimum match length for the greedy and lazy parsers.
2281
 *
2282
 * By default the minimum match length is 3, which is the smallest length the
2283
 * DEFLATE format allows.  However, with greedy and lazy parsing, some data
2284
 * (e.g. DNA sequencing data) benefits greatly from a longer minimum length.
2285
 * Typically, this is because literals are very cheap.  In general, the
2286
 * near-optimal parser handles this case naturally, but the greedy and lazy
2287
 * parsers need a heuristic to decide when to use short matches.
2288
 *
2289
 * The heuristic we use is to make the minimum match length depend on the number
2290
 * of different literals that exist in the data.  If there are many different
2291
 * literals, then literals will probably be expensive, so short matches will
2292
 * probably be worthwhile.  Conversely, if not many literals are used, then
2293
 * probably literals will be cheap and short matches won't be worthwhile.
2294
 */
2295
static u32
2296
choose_min_match_len(u32 num_used_literals, u32 max_search_depth)
2297
965
{
2298
  /* map from num_used_literals to min_len */
2299
965
  static const u8 min_lens[] = {
2300
965
    9, 9, 9, 9, 9, 9, 8, 8, 7, 7, 6, 6, 6, 6, 6, 6,
2301
965
    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
2302
965
    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4,
2303
965
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
2304
965
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
2305
    /* The rest is implicitly 3. */
2306
965
  };
2307
965
  u32 min_len;
2308
2309
965
  STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN <= 3);
2310
965
  STATIC_ASSERT(ARRAY_LEN(min_lens) <= DEFLATE_NUM_LITERALS + 1);
2311
2312
965
  if (num_used_literals >= ARRAY_LEN(min_lens))
2313
541
    return 3;
2314
424
  min_len = min_lens[num_used_literals];
2315
  /*
2316
   * With a low max_search_depth, it may be too hard to find long matches.
2317
   */
2318
424
  if (max_search_depth < 16) {
2319
0
    if (max_search_depth < 5)
2320
0
      min_len = MIN(min_len, 4);
2321
0
    else if (max_search_depth < 10)
2322
0
      min_len = MIN(min_len, 5);
2323
0
    else
2324
0
      min_len = MIN(min_len, 7);
2325
0
  }
2326
424
  return min_len;
2327
965
}
2328
2329
static u32
2330
calculate_min_match_len(const u8 *data, size_t data_len, u32 max_search_depth)
2331
1.14k
{
2332
1.14k
  u8 used[256] = { 0 };
2333
1.14k
  u32 num_used_literals = 0;
2334
1.14k
  size_t i;
2335
2336
  /*
2337
   * For very short inputs, the static Huffman code has a good chance of
2338
   * being best, in which case there is no reason to avoid short matches.
2339
   */
2340
1.14k
  if (data_len < 512)
2341
181
    return DEFLATE_MIN_MATCH_LEN;
2342
2343
  /*
2344
   * For an initial approximation, scan the first 4 KiB of data.  The
2345
   * caller may use recalculate_min_match_len() to update min_len later.
2346
   */
2347
962
  data_len = MIN(data_len, 4096);
2348
2.36M
  for (i = 0; i < data_len; i++)
2349
2.36M
    used[data[i]] = 1;
2350
247k
  for (i = 0; i < 256; i++)
2351
246k
    num_used_literals += used[i];
2352
962
  return choose_min_match_len(num_used_literals, max_search_depth);
2353
1.14k
}
2354
2355
/*
2356
 * Recalculate the minimum match length for a block, now that we know the
2357
 * distribution of literals that are actually being used (freqs->litlen).
2358
 */
2359
static u32
2360
recalculate_min_match_len(const struct deflate_freqs *freqs,
2361
        u32 max_search_depth)
2362
3
{
2363
3
  u32 literal_freq = 0;
2364
3
  u32 cutoff;
2365
3
  u32 num_used_literals = 0;
2366
3
  int i;
2367
2368
771
  for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
2369
768
    literal_freq += freqs->litlen[i];
2370
2371
3
  cutoff = literal_freq >> 10; /* Ignore literals used very rarely. */
2372
2373
771
  for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
2374
768
    if (freqs->litlen[i] > cutoff)
2375
6
      num_used_literals++;
2376
768
  }
2377
3
  return choose_min_match_len(num_used_literals, max_search_depth);
2378
3
}
2379
2380
static forceinline const u8 *
2381
choose_max_block_end(const u8 *in_block_begin, const u8 *in_end,
2382
         size_t soft_max_len)
2383
1.14k
{
2384
1.14k
  if (in_end - in_block_begin < soft_max_len + MIN_BLOCK_LENGTH)
2385
1.14k
    return in_end;
2386
0
  return in_block_begin + soft_max_len;
2387
1.14k
}
2388
2389
/*
2390
 * This is the level 0 "compressor".  It always outputs uncompressed blocks.
2391
 */
2392
static size_t
2393
deflate_compress_none(const u8 *in, size_t in_nbytes,
2394
          u8 *out, size_t out_nbytes_avail)
2395
151
{
2396
151
  const u8 *in_next = in;
2397
151
  const u8 * const in_end = in + in_nbytes;
2398
151
  u8 *out_next = out;
2399
151
  u8 * const out_end = out + out_nbytes_avail;
2400
2401
  /*
2402
   * If the input is zero-length, we still must output a block in order
2403
   * for the output to be a valid DEFLATE stream.  Handle this case
2404
   * specially to avoid potentially passing NULL to memcpy() below.
2405
   */
2406
151
  if (unlikely(in_nbytes == 0)) {
2407
0
    if (out_nbytes_avail < 5)
2408
0
      return 0;
2409
    /* BFINAL and BTYPE */
2410
0
    *out_next++ = 1 | (DEFLATE_BLOCKTYPE_UNCOMPRESSED << 1);
2411
    /* LEN and NLEN */
2412
0
    put_unaligned_le32(0xFFFF0000, out_next);
2413
0
    return 5;
2414
0
  }
2415
2416
151
  do {
2417
151
    u8 bfinal = 0;
2418
151
    size_t len = UINT16_MAX;
2419
2420
151
    if (in_end - in_next <= UINT16_MAX) {
2421
151
      bfinal = 1;
2422
151
      len = in_end - in_next;
2423
151
    }
2424
151
    if (out_end - out_next < 5 + len)
2425
0
      return 0;
2426
    /*
2427
     * Output BFINAL and BTYPE.  The stream is already byte-aligned
2428
     * here, so this step always requires outputting exactly 1 byte.
2429
     */
2430
151
    *out_next++ = bfinal | (DEFLATE_BLOCKTYPE_UNCOMPRESSED << 1);
2431
2432
    /* Output LEN and NLEN, then the data itself. */
2433
151
    put_unaligned_le16(len, out_next);
2434
151
    out_next += 2;
2435
151
    put_unaligned_le16(~len, out_next);
2436
151
    out_next += 2;
2437
151
    memcpy(out_next, in_next, len);
2438
151
    out_next += len;
2439
151
    in_next += len;
2440
151
  } while (in_next != in_end);
2441
2442
151
  return out_next - out;
2443
151
}
2444
2445
/*
2446
 * This is a faster variant of deflate_compress_greedy().  It uses the
2447
 * ht_matchfinder rather than the hc_matchfinder.  It also skips the block
2448
 * splitting algorithm and just uses fixed length blocks.  c->max_search_depth
2449
 * has no effect with this algorithm, as it is hardcoded in ht_matchfinder.h.
2450
 */
2451
static void
2452
deflate_compress_fastest(struct libdeflate_compressor * restrict c,
2453
       const u8 *in, size_t in_nbytes,
2454
       struct deflate_output_bitstream *os)
2455
0
{
2456
0
  const u8 *in_next = in;
2457
0
  const u8 *in_end = in_next + in_nbytes;
2458
0
  const u8 *in_cur_base = in_next;
2459
0
  u32 max_len = DEFLATE_MAX_MATCH_LEN;
2460
0
  u32 nice_len = MIN(c->nice_match_length, max_len);
2461
0
  u32 next_hash = 0;
2462
2463
0
  ht_matchfinder_init(&c->p.f.ht_mf);
2464
2465
0
  do {
2466
    /* Starting a new DEFLATE block */
2467
2468
0
    const u8 * const in_block_begin = in_next;
2469
0
    const u8 * const in_max_block_end = choose_max_block_end(
2470
0
        in_next, in_end, FAST_SOFT_MAX_BLOCK_LENGTH);
2471
0
    struct deflate_sequence *seq = c->p.f.sequences;
2472
2473
0
    deflate_begin_sequences(c, seq);
2474
2475
0
    do {
2476
0
      u32 length;
2477
0
      u32 offset;
2478
0
      size_t remaining = in_end - in_next;
2479
2480
0
      if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) {
2481
0
        max_len = remaining;
2482
0
        if (max_len < HT_MATCHFINDER_REQUIRED_NBYTES) {
2483
0
          do {
2484
0
            deflate_choose_literal(c,
2485
0
              *in_next++, false, seq);
2486
0
          } while (--max_len);
2487
0
          break;
2488
0
        }
2489
0
        nice_len = MIN(nice_len, max_len);
2490
0
      }
2491
0
      length = ht_matchfinder_longest_match(&c->p.f.ht_mf,
2492
0
                    &in_cur_base,
2493
0
                    in_next,
2494
0
                    max_len,
2495
0
                    nice_len,
2496
0
                    &next_hash,
2497
0
                    &offset);
2498
0
      if (length) {
2499
        /* Match found */
2500
0
        deflate_choose_match(c, length, offset, false,
2501
0
                 &seq);
2502
0
        ht_matchfinder_skip_bytes(&c->p.f.ht_mf,
2503
0
                &in_cur_base,
2504
0
                in_next + 1,
2505
0
                in_end,
2506
0
                length - 1,
2507
0
                &next_hash);
2508
0
        in_next += length;
2509
0
      } else {
2510
        /* No match found */
2511
0
        deflate_choose_literal(c, *in_next++, false,
2512
0
                   seq);
2513
0
      }
2514
2515
      /* Check if it's time to output another block. */
2516
0
    } while (in_next < in_max_block_end &&
2517
0
       seq < &c->p.f.sequences[FAST_SEQ_STORE_LENGTH]);
2518
2519
0
    deflate_finish_block(c, os, in_block_begin,
2520
0
             in_next - in_block_begin,
2521
0
             c->p.f.sequences, in_next == in_end);
2522
0
  } while (in_next != in_end && !os->overflow);
2523
0
}
2524
2525
/*
2526
 * This is the "greedy" DEFLATE compressor. It always chooses the longest match.
2527
 */
2528
static void
2529
deflate_compress_greedy(struct libdeflate_compressor * restrict c,
2530
      const u8 *in, size_t in_nbytes,
2531
      struct deflate_output_bitstream *os)
2532
0
{
2533
0
  const u8 *in_next = in;
2534
0
  const u8 *in_end = in_next + in_nbytes;
2535
0
  const u8 *in_cur_base = in_next;
2536
0
  u32 max_len = DEFLATE_MAX_MATCH_LEN;
2537
0
  u32 nice_len = MIN(c->nice_match_length, max_len);
2538
0
  u32 next_hashes[2] = {0, 0};
2539
2540
0
  hc_matchfinder_init(&c->p.g.hc_mf);
2541
2542
0
  do {
2543
    /* Starting a new DEFLATE block */
2544
2545
0
    const u8 * const in_block_begin = in_next;
2546
0
    const u8 * const in_max_block_end = choose_max_block_end(
2547
0
        in_next, in_end, SOFT_MAX_BLOCK_LENGTH);
2548
0
    struct deflate_sequence *seq = c->p.g.sequences;
2549
0
    u32 min_len;
2550
2551
0
    init_block_split_stats(&c->split_stats);
2552
0
    deflate_begin_sequences(c, seq);
2553
0
    min_len = calculate_min_match_len(in_next,
2554
0
              in_max_block_end - in_next,
2555
0
              c->max_search_depth);
2556
0
    do {
2557
0
      u32 length;
2558
0
      u32 offset;
2559
2560
0
      adjust_max_and_nice_len(&max_len, &nice_len,
2561
0
            in_end - in_next);
2562
0
      length = hc_matchfinder_longest_match(
2563
0
            &c->p.g.hc_mf,
2564
0
            &in_cur_base,
2565
0
            in_next,
2566
0
            min_len - 1,
2567
0
            max_len,
2568
0
            nice_len,
2569
0
            c->max_search_depth,
2570
0
            next_hashes,
2571
0
            &offset);
2572
2573
0
      if (length >= min_len &&
2574
0
          (length > DEFLATE_MIN_MATCH_LEN ||
2575
0
           offset <= 4096)) {
2576
        /* Match found */
2577
0
        deflate_choose_match(c, length, offset, true,
2578
0
                 &seq);
2579
0
        hc_matchfinder_skip_bytes(&c->p.g.hc_mf,
2580
0
                &in_cur_base,
2581
0
                in_next + 1,
2582
0
                in_end,
2583
0
                length - 1,
2584
0
                next_hashes);
2585
0
        in_next += length;
2586
0
      } else {
2587
        /* No match found */
2588
0
        deflate_choose_literal(c, *in_next++, true,
2589
0
                   seq);
2590
0
      }
2591
2592
      /* Check if it's time to output another block. */
2593
0
    } while (in_next < in_max_block_end &&
2594
0
       seq < &c->p.g.sequences[SEQ_STORE_LENGTH] &&
2595
0
       !should_end_block(&c->split_stats,
2596
0
             in_block_begin, in_next, in_end));
2597
2598
0
    deflate_finish_block(c, os, in_block_begin,
2599
0
             in_next - in_block_begin,
2600
0
             c->p.g.sequences, in_next == in_end);
2601
0
  } while (in_next != in_end && !os->overflow);
2602
0
}
2603
2604
static forceinline void
2605
deflate_compress_lazy_generic(struct libdeflate_compressor * restrict c,
2606
            const u8 *in, size_t in_nbytes,
2607
            struct deflate_output_bitstream *os, bool lazy2)
2608
1.14k
{
2609
1.14k
  const u8 *in_next = in;
2610
1.14k
  const u8 *in_end = in_next + in_nbytes;
2611
1.14k
  const u8 *in_cur_base = in_next;
2612
1.14k
  u32 max_len = DEFLATE_MAX_MATCH_LEN;
2613
1.14k
  u32 nice_len = MIN(c->nice_match_length, max_len);
2614
1.14k
  u32 next_hashes[2] = {0, 0};
2615
2616
1.14k
  hc_matchfinder_init(&c->p.g.hc_mf);
2617
2618
1.14k
  do {
2619
    /* Starting a new DEFLATE block */
2620
2621
1.14k
    const u8 * const in_block_begin = in_next;
2622
1.14k
    const u8 * const in_max_block_end = choose_max_block_end(
2623
1.14k
        in_next, in_end, SOFT_MAX_BLOCK_LENGTH);
2624
1.14k
    const u8 *next_recalc_min_len =
2625
1.14k
      in_next + MIN(in_end - in_next, 10000);
2626
1.14k
    struct deflate_sequence *seq = c->p.g.sequences;
2627
1.14k
    u32 min_len;
2628
2629
1.14k
    init_block_split_stats(&c->split_stats);
2630
1.14k
    deflate_begin_sequences(c, seq);
2631
1.14k
    min_len = calculate_min_match_len(in_next,
2632
1.14k
              in_max_block_end - in_next,
2633
1.14k
              c->max_search_depth);
2634
454k
    do {
2635
454k
      u32 cur_len;
2636
454k
      u32 cur_offset;
2637
454k
      u32 next_len;
2638
454k
      u32 next_offset;
2639
2640
      /*
2641
       * Recalculate the minimum match length if it hasn't
2642
       * been done recently.
2643
       */
2644
454k
      if (in_next >= next_recalc_min_len) {
2645
3
        min_len = recalculate_min_match_len(
2646
3
            &c->freqs,
2647
3
            c->max_search_depth);
2648
3
        next_recalc_min_len +=
2649
3
          MIN(in_end - next_recalc_min_len,
2650
3
              in_next - in_block_begin);
2651
3
      }
2652
2653
      /* Find the longest match at the current position. */
2654
454k
      adjust_max_and_nice_len(&max_len, &nice_len,
2655
454k
            in_end - in_next);
2656
454k
      cur_len = hc_matchfinder_longest_match(
2657
454k
            &c->p.g.hc_mf,
2658
454k
            &in_cur_base,
2659
454k
            in_next,
2660
454k
            min_len - 1,
2661
454k
            max_len,
2662
454k
            nice_len,
2663
454k
            c->max_search_depth,
2664
454k
            next_hashes,
2665
454k
            &cur_offset);
2666
454k
      if (cur_len < min_len ||
2667
125k
          (cur_len == DEFLATE_MIN_MATCH_LEN &&
2668
328k
           cur_offset > 8192)) {
2669
        /* No match found.  Choose a literal. */
2670
328k
        deflate_choose_literal(c, *in_next++, true,
2671
328k
                   seq);
2672
328k
        continue;
2673
328k
      }
2674
125k
      in_next++;
2675
2676
146k
have_cur_match:
2677
      /*
2678
       * We have a match at the current position.
2679
       * If it's very long, choose it immediately.
2680
       */
2681
146k
      if (cur_len >= nice_len) {
2682
3.92k
        deflate_choose_match(c, cur_len, cur_offset,
2683
3.92k
                 true, &seq);
2684
3.92k
        hc_matchfinder_skip_bytes(&c->p.g.hc_mf,
2685
3.92k
                &in_cur_base,
2686
3.92k
                in_next,
2687
3.92k
                in_end,
2688
3.92k
                cur_len - 1,
2689
3.92k
                next_hashes);
2690
3.92k
        in_next += cur_len - 1;
2691
3.92k
        continue;
2692
3.92k
      }
2693
2694
      /*
2695
       * Try to find a better match at the next position.
2696
       *
2697
       * Note: since we already have a match at the *current*
2698
       * position, we use only half the 'max_search_depth'
2699
       * when checking the *next* position.  This is a useful
2700
       * trade-off because it's more worthwhile to use a
2701
       * greater search depth on the initial match.
2702
       *
2703
       * Note: it's possible to structure the code such that
2704
       * there's only one call to longest_match(), which
2705
       * handles both the "find the initial match" and "try to
2706
       * find a better match" cases.  However, it is faster to
2707
       * have two call sites, with longest_match() inlined at
2708
       * each.
2709
       */
2710
143k
      adjust_max_and_nice_len(&max_len, &nice_len,
2711
143k
            in_end - in_next);
2712
143k
      next_len = hc_matchfinder_longest_match(
2713
143k
            &c->p.g.hc_mf,
2714
143k
            &in_cur_base,
2715
143k
            in_next++,
2716
143k
            cur_len - 1,
2717
143k
            max_len,
2718
143k
            nice_len,
2719
143k
            c->max_search_depth >> 1,
2720
143k
            next_hashes,
2721
143k
            &next_offset);
2722
143k
      if (next_len >= cur_len &&
2723
26.0k
          4 * (int)(next_len - cur_len) +
2724
26.0k
          ((int)bsr32(cur_offset) -
2725
26.0k
           (int)bsr32(next_offset)) > 2) {
2726
        /*
2727
         * Found a better match at the next position.
2728
         * Output a literal.  Then the next match
2729
         * becomes the current match.
2730
         */
2731
15.6k
        deflate_choose_literal(c, *(in_next - 2), true,
2732
15.6k
                   seq);
2733
15.6k
        cur_len = next_len;
2734
15.6k
        cur_offset = next_offset;
2735
15.6k
        goto have_cur_match;
2736
15.6k
      }
2737
2738
127k
      if (lazy2) {
2739
        /* In lazy2 mode, look ahead another position */
2740
127k
        adjust_max_and_nice_len(&max_len, &nice_len,
2741
127k
              in_end - in_next);
2742
127k
        next_len = hc_matchfinder_longest_match(
2743
127k
            &c->p.g.hc_mf,
2744
127k
            &in_cur_base,
2745
127k
            in_next++,
2746
127k
            cur_len - 1,
2747
127k
            max_len,
2748
127k
            nice_len,
2749
127k
            c->max_search_depth >> 2,
2750
127k
            next_hashes,
2751
127k
            &next_offset);
2752
127k
        if (next_len >= cur_len &&
2753
9.39k
            4 * (int)(next_len - cur_len) +
2754
9.39k
            ((int)bsr32(cur_offset) -
2755
9.39k
             (int)bsr32(next_offset)) > 6) {
2756
          /*
2757
           * There's a much better match two
2758
           * positions ahead, so use two literals.
2759
           */
2760
5.61k
          deflate_choose_literal(
2761
5.61k
            c, *(in_next - 3), true, seq);
2762
5.61k
          deflate_choose_literal(
2763
5.61k
            c, *(in_next - 2), true, seq);
2764
5.61k
          cur_len = next_len;
2765
5.61k
          cur_offset = next_offset;
2766
5.61k
          goto have_cur_match;
2767
5.61k
        }
2768
        /*
2769
         * No better match at either of the next 2
2770
         * positions.  Output the current match.
2771
         */
2772
121k
        deflate_choose_match(c, cur_len, cur_offset,
2773
121k
                 true, &seq);
2774
121k
        if (cur_len > 3) {
2775
86.5k
          hc_matchfinder_skip_bytes(&c->p.g.hc_mf,
2776
86.5k
                  &in_cur_base,
2777
86.5k
                  in_next,
2778
86.5k
                  in_end,
2779
86.5k
                  cur_len - 3,
2780
86.5k
                  next_hashes);
2781
86.5k
          in_next += cur_len - 3;
2782
86.5k
        }
2783
121k
      } else { /* !lazy2 */
2784
        /*
2785
         * No better match at the next position.  Output
2786
         * the current match.
2787
         */
2788
0
        deflate_choose_match(c, cur_len, cur_offset,
2789
0
                 true, &seq);
2790
0
        hc_matchfinder_skip_bytes(&c->p.g.hc_mf,
2791
0
                &in_cur_base,
2792
0
                in_next,
2793
0
                in_end,
2794
0
                cur_len - 2,
2795
0
                next_hashes);
2796
0
        in_next += cur_len - 2;
2797
0
      }
2798
      /* Check if it's time to output another block. */
2799
454k
    } while (in_next < in_max_block_end &&
2800
453k
       seq < &c->p.g.sequences[SEQ_STORE_LENGTH] &&
2801
453k
       !should_end_block(&c->split_stats,
2802
453k
             in_block_begin, in_next, in_end));
2803
2804
1.14k
    deflate_finish_block(c, os, in_block_begin,
2805
1.14k
             in_next - in_block_begin,
2806
1.14k
             c->p.g.sequences, in_next == in_end);
2807
1.14k
  } while (in_next != in_end && !os->overflow);
2808
1.14k
}
2809
2810
/*
2811
 * This is the "lazy" DEFLATE compressor.  Before choosing a match, it checks to
2812
 * see if there's a better match at the next position.  If yes, it outputs a
2813
 * literal and continues to the next position.  If no, it outputs the match.
2814
 */
2815
static void
2816
deflate_compress_lazy(struct libdeflate_compressor * restrict c,
2817
          const u8 *in, size_t in_nbytes,
2818
          struct deflate_output_bitstream *os)
2819
0
{
2820
0
  deflate_compress_lazy_generic(c, in, in_nbytes, os, false);
2821
0
}
2822
2823
/*
2824
 * The lazy2 compressor.  This is similar to the regular lazy one, but it looks
2825
 * for a better match at the next 2 positions rather than the next 1.  This
2826
 * makes it take slightly more time, but compress some inputs slightly more.
2827
 */
2828
static void
2829
deflate_compress_lazy2(struct libdeflate_compressor * restrict c,
2830
           const u8 *in, size_t in_nbytes,
2831
           struct deflate_output_bitstream *os)
2832
1.14k
{
2833
1.14k
  deflate_compress_lazy_generic(c, in, in_nbytes, os, true);
2834
1.14k
}
2835
2836
#if SUPPORT_NEAR_OPTIMAL_PARSING
2837
2838
/*
2839
 * Follow the minimum-cost path in the graph of possible match/literal choices
2840
 * for the current block and compute the frequencies of the Huffman symbols that
2841
 * would be needed to output those matches and literals.
2842
 */
2843
static void
2844
deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length)
2845
0
{
2846
0
  struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
2847
0
  struct deflate_optimum_node *end_node =
2848
0
    &c->p.n.optimum_nodes[block_length];
2849
2850
0
  do {
2851
0
    u32 length = cur_node->item & OPTIMUM_LEN_MASK;
2852
0
    u32 offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
2853
2854
0
    if (length == 1) {
2855
      /* Literal */
2856
0
      c->freqs.litlen[offset]++;
2857
0
    } else {
2858
      /* Match */
2859
0
      c->freqs.litlen[DEFLATE_FIRST_LEN_SYM +
2860
0
          deflate_length_slot[length]]++;
2861
0
      c->freqs.offset[c->p.n.offset_slot_full[offset]]++;
2862
0
    }
2863
0
    cur_node += length;
2864
0
  } while (cur_node != end_node);
2865
2866
  /* Tally the end-of-block symbol. */
2867
0
  c->freqs.litlen[DEFLATE_END_OF_BLOCK]++;
2868
0
}
2869
2870
static void
2871
deflate_choose_all_literals(struct libdeflate_compressor *c,
2872
          const u8 *block, u32 block_length)
2873
0
{
2874
0
  u32 i;
2875
2876
0
  deflate_reset_symbol_frequencies(c);
2877
0
  for (i = 0; i < block_length; i++)
2878
0
    c->freqs.litlen[block[i]]++;
2879
0
  c->freqs.litlen[DEFLATE_END_OF_BLOCK]++;
2880
2881
0
  deflate_make_huffman_codes(&c->freqs, &c->codes);
2882
0
}
2883
2884
/*
2885
 * Compute the exact cost, in bits, that would be required to output the matches
2886
 * and literals described by @c->freqs as a dynamic Huffman block.  The litlen
2887
 * and offset codes are assumed to have already been built in @c->codes.
2888
 */
2889
static u32
2890
deflate_compute_true_cost(struct libdeflate_compressor *c)
2891
0
{
2892
0
  u32 cost = 0;
2893
0
  unsigned sym;
2894
2895
0
  deflate_precompute_huffman_header(c);
2896
2897
0
  memset(&c->codes.lens.litlen[c->o.precode.num_litlen_syms], 0,
2898
0
         DEFLATE_NUM_LITLEN_SYMS - c->o.precode.num_litlen_syms);
2899
2900
0
  cost += 5 + 5 + 4 + (3 * c->o.precode.num_explicit_lens);
2901
0
  for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) {
2902
0
    cost += c->o.precode.freqs[sym] *
2903
0
      (c->o.precode.lens[sym] +
2904
0
       deflate_extra_precode_bits[sym]);
2905
0
  }
2906
2907
0
  for (sym = 0; sym < DEFLATE_FIRST_LEN_SYM; sym++)
2908
0
    cost += c->freqs.litlen[sym] * c->codes.lens.litlen[sym];
2909
2910
0
  for (; sym < DEFLATE_FIRST_LEN_SYM +
2911
0
         ARRAY_LEN(deflate_extra_length_bits); sym++)
2912
0
    cost += c->freqs.litlen[sym] *
2913
0
      (c->codes.lens.litlen[sym] +
2914
0
       deflate_extra_length_bits[sym - DEFLATE_FIRST_LEN_SYM]);
2915
2916
0
  for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++)
2917
0
    cost += c->freqs.offset[sym] *
2918
0
      (c->codes.lens.offset[sym] +
2919
0
       deflate_extra_offset_bits[sym]);
2920
0
  return cost;
2921
0
}
2922
2923
/* Set the current cost model from the codeword lengths specified in @lens. */
2924
static void
2925
deflate_set_costs_from_codes(struct libdeflate_compressor *c,
2926
           const struct deflate_lens *lens)
2927
0
{
2928
0
  unsigned i;
2929
2930
  /* Literals */
2931
0
  for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
2932
0
    u32 bits = (lens->litlen[i] ?
2933
0
          lens->litlen[i] : LITERAL_NOSTAT_BITS);
2934
2935
0
    c->p.n.costs.literal[i] = bits * BIT_COST;
2936
0
  }
2937
2938
  /* Lengths */
2939
0
  for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) {
2940
0
    unsigned length_slot = deflate_length_slot[i];
2941
0
    unsigned litlen_sym = DEFLATE_FIRST_LEN_SYM + length_slot;
2942
0
    u32 bits = (lens->litlen[litlen_sym] ?
2943
0
          lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS);
2944
2945
0
    bits += deflate_extra_length_bits[length_slot];
2946
0
    c->p.n.costs.length[i] = bits * BIT_COST;
2947
0
  }
2948
2949
  /* Offset slots */
2950
0
  for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) {
2951
0
    u32 bits = (lens->offset[i] ?
2952
0
          lens->offset[i] : OFFSET_NOSTAT_BITS);
2953
2954
0
    bits += deflate_extra_offset_bits[i];
2955
0
    c->p.n.costs.offset_slot[i] = bits * BIT_COST;
2956
0
  }
2957
0
}
2958
2959
/*
2960
 * This lookup table gives the default cost of a literal symbol and of a length
2961
 * symbol, depending on the characteristics of the input data.  It was generated
2962
 * by scripts/gen_default_litlen_costs.py.
2963
 *
2964
 * This table is indexed first by the estimated match probability:
2965
 *
2966
 *  i=0: data doesn't contain many matches  [match_prob=0.25]
2967
 *  i=1: neutral        [match_prob=0.50]
2968
 *  i=2: data contains lots of matches  [match_prob=0.75]
2969
 *
2970
 * This lookup produces a subtable which maps the number of distinct used
2971
 * literals to the default cost of a literal symbol, i.e.:
2972
 *
2973
 *  int(-log2((1 - match_prob) / num_used_literals) * BIT_COST)
2974
 *
2975
 * ... for num_used_literals in [1, 256] (and 0, which is copied from 1).  This
2976
 * accounts for literals usually getting cheaper as the number of distinct
2977
 * literals decreases, and as the proportion of literals to matches increases.
2978
 *
2979
 * The lookup also produces the cost of a length symbol, which is:
2980
 *
2981
 *  int(-log2(match_prob/NUM_LEN_SLOTS) * BIT_COST)
2982
 *
2983
 * Note: we don't currently assign different costs to different literal symbols,
2984
 * or to different length symbols, as this is hard to do in a useful way.
2985
 */
2986
static const struct {
2987
  u8 used_lits_to_lit_cost[257];
2988
  u8 len_sym_cost;
2989
} default_litlen_costs[] = {
2990
  { /* match_prob = 0.25 */
2991
    .used_lits_to_lit_cost = {
2992
      6, 6, 22, 32, 38, 43, 48, 51,
2993
      54, 57, 59, 61, 64, 65, 67, 69,
2994
      70, 72, 73, 74, 75, 76, 77, 79,
2995
      80, 80, 81, 82, 83, 84, 85, 85,
2996
      86, 87, 88, 88, 89, 89, 90, 91,
2997
      91, 92, 92, 93, 93, 94, 95, 95,
2998
      96, 96, 96, 97, 97, 98, 98, 99,
2999
      99, 99, 100, 100, 101, 101, 101, 102,
3000
      102, 102, 103, 103, 104, 104, 104, 105,
3001
      105, 105, 105, 106, 106, 106, 107, 107,
3002
      107, 108, 108, 108, 108, 109, 109, 109,
3003
      109, 110, 110, 110, 111, 111, 111, 111,
3004
      112, 112, 112, 112, 112, 113, 113, 113,
3005
      113, 114, 114, 114, 114, 114, 115, 115,
3006
      115, 115, 115, 116, 116, 116, 116, 116,
3007
      117, 117, 117, 117, 117, 118, 118, 118,
3008
      118, 118, 118, 119, 119, 119, 119, 119,
3009
      120, 120, 120, 120, 120, 120, 121, 121,
3010
      121, 121, 121, 121, 121, 122, 122, 122,
3011
      122, 122, 122, 123, 123, 123, 123, 123,
3012
      123, 123, 124, 124, 124, 124, 124, 124,
3013
      124, 125, 125, 125, 125, 125, 125, 125,
3014
      125, 126, 126, 126, 126, 126, 126, 126,
3015
      127, 127, 127, 127, 127, 127, 127, 127,
3016
      128, 128, 128, 128, 128, 128, 128, 128,
3017
      128, 129, 129, 129, 129, 129, 129, 129,
3018
      129, 129, 130, 130, 130, 130, 130, 130,
3019
      130, 130, 130, 131, 131, 131, 131, 131,
3020
      131, 131, 131, 131, 131, 132, 132, 132,
3021
      132, 132, 132, 132, 132, 132, 132, 133,
3022
      133, 133, 133, 133, 133, 133, 133, 133,
3023
      133, 134, 134, 134, 134, 134, 134, 134,
3024
      134,
3025
    },
3026
    .len_sym_cost = 109,
3027
  }, { /* match_prob = 0.5 */
3028
    .used_lits_to_lit_cost = {
3029
      16, 16, 32, 41, 48, 53, 57, 60,
3030
      64, 66, 69, 71, 73, 75, 76, 78,
3031
      80, 81, 82, 83, 85, 86, 87, 88,
3032
      89, 90, 91, 92, 92, 93, 94, 95,
3033
      96, 96, 97, 98, 98, 99, 99, 100,
3034
      101, 101, 102, 102, 103, 103, 104, 104,
3035
      105, 105, 106, 106, 107, 107, 108, 108,
3036
      108, 109, 109, 110, 110, 110, 111, 111,
3037
      112, 112, 112, 113, 113, 113, 114, 114,
3038
      114, 115, 115, 115, 115, 116, 116, 116,
3039
      117, 117, 117, 118, 118, 118, 118, 119,
3040
      119, 119, 119, 120, 120, 120, 120, 121,
3041
      121, 121, 121, 122, 122, 122, 122, 122,
3042
      123, 123, 123, 123, 124, 124, 124, 124,
3043
      124, 125, 125, 125, 125, 125, 126, 126,
3044
      126, 126, 126, 127, 127, 127, 127, 127,
3045
      128, 128, 128, 128, 128, 128, 129, 129,
3046
      129, 129, 129, 129, 130, 130, 130, 130,
3047
      130, 130, 131, 131, 131, 131, 131, 131,
3048
      131, 132, 132, 132, 132, 132, 132, 133,
3049
      133, 133, 133, 133, 133, 133, 134, 134,
3050
      134, 134, 134, 134, 134, 134, 135, 135,
3051
      135, 135, 135, 135, 135, 135, 136, 136,
3052
      136, 136, 136, 136, 136, 136, 137, 137,
3053
      137, 137, 137, 137, 137, 137, 138, 138,
3054
      138, 138, 138, 138, 138, 138, 138, 139,
3055
      139, 139, 139, 139, 139, 139, 139, 139,
3056
      140, 140, 140, 140, 140, 140, 140, 140,
3057
      140, 141, 141, 141, 141, 141, 141, 141,
3058
      141, 141, 141, 142, 142, 142, 142, 142,
3059
      142, 142, 142, 142, 142, 142, 143, 143,
3060
      143, 143, 143, 143, 143, 143, 143, 143,
3061
      144,
3062
    },
3063
    .len_sym_cost = 93,
3064
  }, { /* match_prob = 0.75 */
3065
    .used_lits_to_lit_cost = {
3066
      32, 32, 48, 57, 64, 69, 73, 76,
3067
      80, 82, 85, 87, 89, 91, 92, 94,
3068
      96, 97, 98, 99, 101, 102, 103, 104,
3069
      105, 106, 107, 108, 108, 109, 110, 111,
3070
      112, 112, 113, 114, 114, 115, 115, 116,
3071
      117, 117, 118, 118, 119, 119, 120, 120,
3072
      121, 121, 122, 122, 123, 123, 124, 124,
3073
      124, 125, 125, 126, 126, 126, 127, 127,
3074
      128, 128, 128, 129, 129, 129, 130, 130,
3075
      130, 131, 131, 131, 131, 132, 132, 132,
3076
      133, 133, 133, 134, 134, 134, 134, 135,
3077
      135, 135, 135, 136, 136, 136, 136, 137,
3078
      137, 137, 137, 138, 138, 138, 138, 138,
3079
      139, 139, 139, 139, 140, 140, 140, 140,
3080
      140, 141, 141, 141, 141, 141, 142, 142,
3081
      142, 142, 142, 143, 143, 143, 143, 143,
3082
      144, 144, 144, 144, 144, 144, 145, 145,
3083
      145, 145, 145, 145, 146, 146, 146, 146,
3084
      146, 146, 147, 147, 147, 147, 147, 147,
3085
      147, 148, 148, 148, 148, 148, 148, 149,
3086
      149, 149, 149, 149, 149, 149, 150, 150,
3087
      150, 150, 150, 150, 150, 150, 151, 151,
3088
      151, 151, 151, 151, 151, 151, 152, 152,
3089
      152, 152, 152, 152, 152, 152, 153, 153,
3090
      153, 153, 153, 153, 153, 153, 154, 154,
3091
      154, 154, 154, 154, 154, 154, 154, 155,
3092
      155, 155, 155, 155, 155, 155, 155, 155,
3093
      156, 156, 156, 156, 156, 156, 156, 156,
3094
      156, 157, 157, 157, 157, 157, 157, 157,
3095
      157, 157, 157, 158, 158, 158, 158, 158,
3096
      158, 158, 158, 158, 158, 158, 159, 159,
3097
      159, 159, 159, 159, 159, 159, 159, 159,
3098
      160,
3099
    },
3100
    .len_sym_cost = 84,
3101
  },
3102
};
3103
3104
/*
3105
 * Choose the default costs for literal and length symbols.  These symbols are
3106
 * both part of the litlen alphabet.
3107
 */
3108
static void
3109
deflate_choose_default_litlen_costs(struct libdeflate_compressor *c,
3110
            const u8 *block_begin, u32 block_length,
3111
            u32 *lit_cost, u32 *len_sym_cost)
3112
0
{
3113
0
  u32 num_used_literals = 0;
3114
0
  u32 literal_freq = block_length;
3115
0
  u32 match_freq = 0;
3116
0
  u32 cutoff;
3117
0
  u32 i;
3118
3119
  /* Calculate the number of distinct literals that exist in the data. */
3120
0
  memset(c->freqs.litlen, 0,
3121
0
         DEFLATE_NUM_LITERALS * sizeof(c->freqs.litlen[0]));
3122
0
  cutoff = literal_freq >> 11; /* Ignore literals used very rarely. */
3123
0
  for (i = 0; i < block_length; i++)
3124
0
    c->freqs.litlen[block_begin[i]]++;
3125
0
  for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
3126
0
    if (c->freqs.litlen[i] > cutoff)
3127
0
      num_used_literals++;
3128
0
  }
3129
0
  if (num_used_literals == 0)
3130
0
    num_used_literals = 1;
3131
3132
  /*
3133
   * Estimate the relative frequency of literals and matches in the
3134
   * optimal parsing solution.  We don't know the optimal solution, so
3135
   * this can only be a very rough estimate.  Therefore, we basically use
3136
   * the match frequency from a greedy parse.  We also apply the min_len
3137
   * heuristic used by the greedy and lazy parsers, to avoid counting too
3138
   * many matches when literals are cheaper than short matches.
3139
   */
3140
0
  match_freq = 0;
3141
0
  i = choose_min_match_len(num_used_literals, c->max_search_depth);
3142
0
  for (; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) {
3143
0
    match_freq += c->p.n.match_len_freqs[i];
3144
0
    literal_freq -= i * c->p.n.match_len_freqs[i];
3145
0
  }
3146
0
  if ((s32)literal_freq < 0) /* shouldn't happen */
3147
0
    literal_freq = 0;
3148
3149
0
  if (match_freq > literal_freq)
3150
0
    i = 2; /* many matches */
3151
0
  else if (match_freq * 4 > literal_freq)
3152
0
    i = 1; /* neutral */
3153
0
  else
3154
0
    i = 0; /* few matches */
3155
3156
0
  STATIC_ASSERT(BIT_COST == 16);
3157
0
  *lit_cost = default_litlen_costs[i].used_lits_to_lit_cost[
3158
0
              num_used_literals];
3159
0
  *len_sym_cost = default_litlen_costs[i].len_sym_cost;
3160
0
}
3161
3162
static forceinline u32
3163
deflate_default_length_cost(u32 len, u32 len_sym_cost)
3164
0
{
3165
0
  unsigned slot = deflate_length_slot[len];
3166
0
  u32 num_extra_bits = deflate_extra_length_bits[slot];
3167
3168
0
  return len_sym_cost + (num_extra_bits * BIT_COST);
3169
0
}
3170
3171
static forceinline u32
3172
deflate_default_offset_slot_cost(unsigned slot)
3173
0
{
3174
0
  u32 num_extra_bits = deflate_extra_offset_bits[slot];
3175
  /*
3176
   * Assume that all offset symbols are equally probable.
3177
   * The resulting cost is 'int(-log2(1/30) * BIT_COST)',
3178
   * where 30 is the number of potentially-used offset symbols.
3179
   */
3180
0
  u32 offset_sym_cost = 4*BIT_COST + (907*BIT_COST)/1000;
3181
3182
0
  return offset_sym_cost + (num_extra_bits * BIT_COST);
3183
0
}
3184
3185
/* Set default symbol costs for the first block's first optimization pass. */
3186
static void
3187
deflate_set_default_costs(struct libdeflate_compressor *c,
3188
        u32 lit_cost, u32 len_sym_cost)
3189
0
{
3190
0
  u32 i;
3191
3192
  /* Literals */
3193
0
  for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
3194
0
    c->p.n.costs.literal[i] = lit_cost;
3195
3196
  /* Lengths */
3197
0
  for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
3198
0
    c->p.n.costs.length[i] =
3199
0
      deflate_default_length_cost(i, len_sym_cost);
3200
3201
  /* Offset slots */
3202
0
  for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
3203
0
    c->p.n.costs.offset_slot[i] =
3204
0
      deflate_default_offset_slot_cost(i);
3205
0
}
3206
3207
static forceinline void
3208
deflate_adjust_cost(u32 *cost_p, u32 default_cost, int change_amount)
3209
0
{
3210
0
  if (change_amount == 0)
3211
    /* Block is very similar to previous; prefer previous costs. */
3212
0
    *cost_p = (default_cost + 3 * *cost_p) / 4;
3213
0
  else if (change_amount == 1)
3214
0
    *cost_p = (default_cost + *cost_p) / 2;
3215
0
  else if (change_amount == 2)
3216
0
    *cost_p = (5 * default_cost + 3 * *cost_p) / 8;
3217
0
  else
3218
    /* Block differs greatly from previous; prefer default costs. */
3219
0
    *cost_p = (3 * default_cost + *cost_p) / 4;
3220
0
}
3221
3222
static forceinline void
3223
deflate_adjust_costs_impl(struct libdeflate_compressor *c,
3224
        u32 lit_cost, u32 len_sym_cost, int change_amount)
3225
0
{
3226
0
  u32 i;
3227
3228
  /* Literals */
3229
0
  for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
3230
0
    deflate_adjust_cost(&c->p.n.costs.literal[i], lit_cost,
3231
0
            change_amount);
3232
3233
  /* Lengths */
3234
0
  for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
3235
0
    deflate_adjust_cost(&c->p.n.costs.length[i],
3236
0
            deflate_default_length_cost(i,
3237
0
                len_sym_cost),
3238
0
            change_amount);
3239
3240
  /* Offset slots */
3241
0
  for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
3242
0
    deflate_adjust_cost(&c->p.n.costs.offset_slot[i],
3243
0
            deflate_default_offset_slot_cost(i),
3244
0
            change_amount);
3245
0
}
3246
3247
/*
3248
 * Adjust the costs when beginning a new block.
3249
 *
3250
 * Since the current costs are optimized for the data already, it can be helpful
3251
 * to reuse them instead of starting over with the default costs.  However, this
3252
 * depends on how similar the new block is to the previous block.  Therefore,
3253
 * use a heuristic to decide how similar the blocks are, and mix together the
3254
 * current costs and the default costs accordingly.
3255
 */
3256
static void
3257
deflate_adjust_costs(struct libdeflate_compressor *c,
3258
         u32 lit_cost, u32 len_sym_cost)
3259
0
{
3260
0
  u64 total_delta = 0;
3261
0
  u64 cutoff;
3262
0
  int i;
3263
3264
  /*
3265
   * Decide how different the current block is from the previous block,
3266
   * using the block splitting statistics from the current and previous
3267
   * blocks.  The more different the current block is, the more we prefer
3268
   * the default costs rather than the previous block's costs.
3269
   *
3270
   * The algorithm here is similar to the end-of-block check one, but here
3271
   * we compare two entire blocks rather than a partial block with a small
3272
   * extra part, and therefore we need 64-bit numbers in some places.
3273
   */
3274
0
  for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
3275
0
    u64 prev = (u64)c->p.n.prev_observations[i] *
3276
0
          c->split_stats.num_observations;
3277
0
    u64 cur = (u64)c->split_stats.observations[i] *
3278
0
        c->p.n.prev_num_observations;
3279
3280
0
    total_delta += prev > cur ? prev - cur : cur - prev;
3281
0
  }
3282
0
  cutoff = ((u64)c->p.n.prev_num_observations *
3283
0
      c->split_stats.num_observations * 200) / 512;
3284
3285
0
  if (total_delta > 3 * cutoff)
3286
    /* Big change in the data; just use the default costs. */
3287
0
    deflate_set_default_costs(c, lit_cost, len_sym_cost);
3288
0
  else if (4 * total_delta > 9 * cutoff)
3289
0
    deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 3);
3290
0
  else if (2 * total_delta > 3 * cutoff)
3291
0
    deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 2);
3292
0
  else if (2 * total_delta > cutoff)
3293
0
    deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 1);
3294
0
  else
3295
0
    deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 0);
3296
0
}
3297
3298
static void
3299
deflate_set_initial_costs(struct libdeflate_compressor *c,
3300
        const u8 *block_begin, u32 block_length,
3301
        bool is_first_block)
3302
0
{
3303
0
  u32 lit_cost, len_sym_cost;
3304
3305
0
  deflate_choose_default_litlen_costs(c, block_begin, block_length,
3306
0
              &lit_cost, &len_sym_cost);
3307
0
  if (is_first_block)
3308
0
    deflate_set_default_costs(c, lit_cost, len_sym_cost);
3309
0
  else
3310
0
    deflate_adjust_costs(c, lit_cost, len_sym_cost);
3311
0
}
3312
3313
/*
3314
 * Find the minimum-cost path through the graph of possible match/literal
3315
 * choices for this block.
3316
 *
3317
 * We find the minimum cost path from 'c->p.n.optimum_nodes[0]', which
3318
 * represents the node at the beginning of the block, to
3319
 * 'c->p.n.optimum_nodes[block_length]', which represents the node at the end of
3320
 * the block.  Edge costs are evaluated using the cost model 'c->p.n.costs'.
3321
 *
3322
 * The algorithm works backwards, starting at the end node and proceeding
3323
 * backwards one node at a time.  At each node, the minimum cost to reach the
3324
 * end node is computed and the match/literal choice that begins that path is
3325
 * saved.
3326
 */
3327
static void
3328
deflate_find_min_cost_path(struct libdeflate_compressor *c,
3329
         const u32 block_length,
3330
         const struct lz_match *cache_ptr)
3331
0
{
3332
0
  struct deflate_optimum_node *end_node =
3333
0
    &c->p.n.optimum_nodes[block_length];
3334
0
  struct deflate_optimum_node *cur_node = end_node;
3335
3336
0
  cur_node->cost_to_end = 0;
3337
0
  do {
3338
0
    unsigned num_matches;
3339
0
    u32 literal;
3340
0
    u32 best_cost_to_end;
3341
3342
0
    cur_node--;
3343
0
    cache_ptr--;
3344
3345
0
    num_matches = cache_ptr->length;
3346
0
    literal = cache_ptr->offset;
3347
3348
    /* It's always possible to choose a literal. */
3349
0
    best_cost_to_end = c->p.n.costs.literal[literal] +
3350
0
           (cur_node + 1)->cost_to_end;
3351
0
    cur_node->item = (literal << OPTIMUM_OFFSET_SHIFT) | 1;
3352
3353
    /* Also consider matches if there are any. */
3354
0
    if (num_matches) {
3355
0
      const struct lz_match *match;
3356
0
      u32 len;
3357
0
      u32 offset;
3358
0
      u32 offset_slot;
3359
0
      u32 offset_cost;
3360
0
      u32 cost_to_end;
3361
3362
      /*
3363
       * Consider each length from the minimum
3364
       * (DEFLATE_MIN_MATCH_LEN) to the length of the longest
3365
       * match found at this position.  For each length, we
3366
       * consider only the smallest offset for which that
3367
       * length is available.  Although this is not guaranteed
3368
       * to be optimal due to the possibility of a larger
3369
       * offset costing less than a smaller offset to code,
3370
       * this is a very useful heuristic.
3371
       */
3372
0
      match = cache_ptr - num_matches;
3373
0
      len = DEFLATE_MIN_MATCH_LEN;
3374
0
      do {
3375
0
        offset = match->offset;
3376
0
        offset_slot = c->p.n.offset_slot_full[offset];
3377
0
        offset_cost =
3378
0
          c->p.n.costs.offset_slot[offset_slot];
3379
0
        do {
3380
0
          cost_to_end = offset_cost +
3381
0
            c->p.n.costs.length[len] +
3382
0
            (cur_node + len)->cost_to_end;
3383
0
          if (cost_to_end < best_cost_to_end) {
3384
0
            best_cost_to_end = cost_to_end;
3385
0
            cur_node->item = len |
3386
0
              (offset <<
3387
0
               OPTIMUM_OFFSET_SHIFT);
3388
0
          }
3389
0
        } while (++len <= match->length);
3390
0
      } while (++match != cache_ptr);
3391
0
      cache_ptr -= num_matches;
3392
0
    }
3393
0
    cur_node->cost_to_end = best_cost_to_end;
3394
0
  } while (cur_node != &c->p.n.optimum_nodes[0]);
3395
3396
0
  deflate_reset_symbol_frequencies(c);
3397
0
  deflate_tally_item_list(c, block_length);
3398
0
  deflate_make_huffman_codes(&c->freqs, &c->codes);
3399
0
}
3400
3401
/*
3402
 * Choose the literals and matches for the current block, then output the block.
3403
 *
3404
 * To choose the literal/match sequence, we find the minimum-cost path through
3405
 * the block's graph of literal/match choices, given a cost model.  However, the
3406
 * true cost of each symbol is unknown until the Huffman codes have been built,
3407
 * but at the same time the Huffman codes depend on the frequencies of chosen
3408
 * symbols.  Consequently, multiple passes must be used to try to approximate an
3409
 * optimal solution.  The first pass uses default costs, mixed with the costs
3410
 * from the previous block when it seems appropriate.  Later passes use the
3411
 * Huffman codeword lengths from the previous pass as the costs.
3412
 *
3413
 * As an alternate strategy, also consider using only literals.  The boolean
3414
 * returned in *used_only_literals indicates whether that strategy was best.
3415
 */
3416
static void
3417
deflate_optimize_and_flush_block(struct libdeflate_compressor *c,
3418
         struct deflate_output_bitstream *os,
3419
         const u8 *block_begin, u32 block_length,
3420
         const struct lz_match *cache_ptr,
3421
         bool is_first_block, bool is_final_block,
3422
         bool *used_only_literals)
3423
0
{
3424
0
  unsigned num_passes_remaining = c->p.n.max_optim_passes;
3425
0
  u32 best_true_cost = UINT32_MAX;
3426
0
  u32 true_cost;
3427
0
  u32 only_lits_cost;
3428
0
  u32 static_cost = UINT32_MAX;
3429
0
  struct deflate_sequence seq_;
3430
0
  struct deflate_sequence *seq = NULL;
3431
0
  u32 i;
3432
3433
  /*
3434
   * On some data, using only literals (no matches) ends up being better
3435
   * than what the iterative optimization algorithm produces.  Therefore,
3436
   * consider using only literals.
3437
   */
3438
0
  deflate_choose_all_literals(c, block_begin, block_length);
3439
0
  only_lits_cost = deflate_compute_true_cost(c);
3440
3441
  /*
3442
   * Force the block to really end at the desired length, even if some
3443
   * matches extend beyond it.
3444
   */
3445
0
  for (i = block_length;
3446
0
       i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN,
3447
0
          ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++)
3448
0
    c->p.n.optimum_nodes[i].cost_to_end = 0x80000000;
3449
3450
  /*
3451
   * Sometimes a static Huffman block ends up being cheapest, particularly
3452
   * if the block is small.  So, if the block is sufficiently small, find
3453
   * the optimal static block solution and remember its cost.
3454
   */
3455
0
  if (block_length <= c->p.n.max_len_to_optimize_static_block) {
3456
    /* Save c->p.n.costs temporarily. */
3457
0
    c->p.n.costs_saved = c->p.n.costs;
3458
3459
0
    deflate_set_costs_from_codes(c, &c->static_codes.lens);
3460
0
    deflate_find_min_cost_path(c, block_length, cache_ptr);
3461
0
    static_cost = c->p.n.optimum_nodes[0].cost_to_end / BIT_COST;
3462
0
    static_cost += 7; /* for the end-of-block symbol */
3463
3464
    /* Restore c->p.n.costs. */
3465
0
    c->p.n.costs = c->p.n.costs_saved;
3466
0
  }
3467
3468
  /* Initialize c->p.n.costs with default costs. */
3469
0
  deflate_set_initial_costs(c, block_begin, block_length, is_first_block);
3470
3471
0
  do {
3472
    /*
3473
     * Find the minimum-cost path for this pass.
3474
     * Also set c->freqs and c->codes to match the path.
3475
     */
3476
0
    deflate_find_min_cost_path(c, block_length, cache_ptr);
3477
3478
    /*
3479
     * Compute the exact cost of the block if the path were to be
3480
     * used.  Note that this differs from
3481
     * c->p.n.optimum_nodes[0].cost_to_end in that true_cost uses
3482
     * the actual Huffman codes instead of c->p.n.costs.
3483
     */
3484
0
    true_cost = deflate_compute_true_cost(c);
3485
3486
    /*
3487
     * If the cost didn't improve much from the previous pass, then
3488
     * doing more passes probably won't be helpful, so stop early.
3489
     */
3490
0
    if (true_cost + c->p.n.min_improvement_to_continue >
3491
0
        best_true_cost)
3492
0
      break;
3493
3494
0
    best_true_cost = true_cost;
3495
3496
    /* Save the cost model that gave 'best_true_cost'. */
3497
0
    c->p.n.costs_saved = c->p.n.costs;
3498
3499
    /* Update the cost model from the Huffman codes. */
3500
0
    deflate_set_costs_from_codes(c, &c->codes.lens);
3501
3502
0
  } while (--num_passes_remaining);
3503
3504
0
  *used_only_literals = false;
3505
0
  if (MIN(only_lits_cost, static_cost) < best_true_cost) {
3506
0
    if (only_lits_cost < static_cost) {
3507
      /* Using only literals ended up being best! */
3508
0
      deflate_choose_all_literals(c, block_begin, block_length);
3509
0
      deflate_set_costs_from_codes(c, &c->codes.lens);
3510
0
      seq_.litrunlen_and_length = block_length;
3511
0
      seq = &seq_;
3512
0
      *used_only_literals = true;
3513
0
    } else {
3514
      /* Static block ended up being best! */
3515
0
      deflate_set_costs_from_codes(c, &c->static_codes.lens);
3516
0
      deflate_find_min_cost_path(c, block_length, cache_ptr);
3517
0
    }
3518
0
  } else if (true_cost >=
3519
0
       best_true_cost + c->p.n.min_bits_to_use_nonfinal_path) {
3520
    /*
3521
     * The best solution was actually from a non-final optimization
3522
     * pass, so recover and use the min-cost path from that pass.
3523
     */
3524
0
    c->p.n.costs = c->p.n.costs_saved;
3525
0
    deflate_find_min_cost_path(c, block_length, cache_ptr);
3526
0
    deflate_set_costs_from_codes(c, &c->codes.lens);
3527
0
  }
3528
0
  deflate_flush_block(c, os, block_begin, block_length, seq,
3529
0
          is_final_block);
3530
0
}
3531
3532
static void
3533
deflate_near_optimal_init_stats(struct libdeflate_compressor *c)
3534
0
{
3535
0
  init_block_split_stats(&c->split_stats);
3536
0
  memset(c->p.n.new_match_len_freqs, 0,
3537
0
         sizeof(c->p.n.new_match_len_freqs));
3538
0
  memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs));
3539
0
}
3540
3541
static void
3542
deflate_near_optimal_merge_stats(struct libdeflate_compressor *c)
3543
0
{
3544
0
  unsigned i;
3545
3546
0
  merge_new_observations(&c->split_stats);
3547
0
  for (i = 0; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) {
3548
0
    c->p.n.match_len_freqs[i] += c->p.n.new_match_len_freqs[i];
3549
0
    c->p.n.new_match_len_freqs[i] = 0;
3550
0
  }
3551
0
}
3552
3553
/*
3554
 * Save some literal/match statistics from the previous block so that
3555
 * deflate_adjust_costs() will be able to decide how much the current block
3556
 * differs from the previous one.
3557
 */
3558
static void
3559
deflate_near_optimal_save_stats(struct libdeflate_compressor *c)
3560
0
{
3561
0
  int i;
3562
3563
0
  for (i = 0; i < NUM_OBSERVATION_TYPES; i++)
3564
0
    c->p.n.prev_observations[i] = c->split_stats.observations[i];
3565
0
  c->p.n.prev_num_observations = c->split_stats.num_observations;
3566
0
}
3567
3568
static void
3569
deflate_near_optimal_clear_old_stats(struct libdeflate_compressor *c)
3570
0
{
3571
0
  int i;
3572
3573
0
  for (i = 0; i < NUM_OBSERVATION_TYPES; i++)
3574
0
    c->split_stats.observations[i] = 0;
3575
0
  c->split_stats.num_observations = 0;
3576
0
  memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs));
3577
0
}
3578
3579
/*
3580
 * This is the "near-optimal" DEFLATE compressor.  It computes the optimal
3581
 * representation of each DEFLATE block using a minimum-cost path search over
3582
 * the graph of possible match/literal choices for that block, assuming a
3583
 * certain cost for each Huffman symbol.
3584
 *
3585
 * For several reasons, the end result is not guaranteed to be optimal:
3586
 *
3587
 * - Nonoptimal choice of blocks
3588
 * - Heuristic limitations on which matches are actually considered
3589
 * - Symbol costs are unknown until the symbols have already been chosen
3590
 *   (so iterative optimization must be used)
3591
 */
3592
static void
3593
deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
3594
            const u8 *in, size_t in_nbytes,
3595
            struct deflate_output_bitstream *os)
3596
0
{
3597
0
  const u8 *in_next = in;
3598
0
  const u8 *in_block_begin = in_next;
3599
0
  const u8 *in_end = in_next + in_nbytes;
3600
0
  const u8 *in_cur_base = in_next;
3601
0
  const u8 *in_next_slide =
3602
0
    in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE);
3603
0
  u32 max_len = DEFLATE_MAX_MATCH_LEN;
3604
0
  u32 nice_len = MIN(c->nice_match_length, max_len);
3605
0
  struct lz_match *cache_ptr = c->p.n.match_cache;
3606
0
  u32 next_hashes[2] = {0, 0};
3607
0
  bool prev_block_used_only_literals = false;
3608
3609
0
  bt_matchfinder_init(&c->p.n.bt_mf);
3610
0
  deflate_near_optimal_init_stats(c);
3611
3612
0
  do {
3613
    /* Starting a new DEFLATE block */
3614
0
    const u8 * const in_max_block_end = choose_max_block_end(
3615
0
        in_block_begin, in_end, SOFT_MAX_BLOCK_LENGTH);
3616
0
    const u8 *prev_end_block_check = NULL;
3617
0
    bool change_detected = false;
3618
0
    const u8 *next_observation = in_next;
3619
0
    u32 min_len;
3620
3621
    /*
3622
     * Use the minimum match length heuristic to improve the
3623
     * literal/match statistics gathered during matchfinding.
3624
     * However, the actual near-optimal parse won't respect min_len,
3625
     * as it can accurately assess the costs of different matches.
3626
     *
3627
     * If the "use only literals" strategy happened to be the best
3628
     * strategy on the previous block, then probably the
3629
     * min_match_len heuristic is still not aggressive enough for
3630
     * the data, so force gathering literal stats only.
3631
     */
3632
0
    if (prev_block_used_only_literals)
3633
0
      min_len = DEFLATE_MAX_MATCH_LEN + 1;
3634
0
    else
3635
0
      min_len = calculate_min_match_len(
3636
0
          in_block_begin,
3637
0
          in_max_block_end - in_block_begin,
3638
0
          c->max_search_depth);
3639
3640
    /*
3641
     * Find matches until we decide to end the block.  We end the
3642
     * block if any of the following is true:
3643
     *
3644
     * (1) Maximum block length has been reached
3645
     * (2) Match catch may overflow.
3646
     * (3) Block split heuristic says to split now.
3647
     */
3648
0
    for (;;) {
3649
0
      struct lz_match *matches;
3650
0
      u32 best_len;
3651
0
      size_t remaining = in_end - in_next;
3652
3653
      /* Slide the window forward if needed. */
3654
0
      if (in_next == in_next_slide) {
3655
0
        bt_matchfinder_slide_window(&c->p.n.bt_mf);
3656
0
        in_cur_base = in_next;
3657
0
        in_next_slide = in_next +
3658
0
          MIN(remaining, MATCHFINDER_WINDOW_SIZE);
3659
0
      }
3660
3661
      /*
3662
       * Find matches with the current position using the
3663
       * binary tree matchfinder and save them in match_cache.
3664
       *
3665
       * Note: the binary tree matchfinder is more suited for
3666
       * optimal parsing than the hash chain matchfinder.  The
3667
       * reasons for this include:
3668
       *
3669
       * - The binary tree matchfinder can find more matches
3670
       *   in the same number of steps.
3671
       * - One of the major advantages of hash chains is that
3672
       *   skipping positions (not searching for matches at
3673
       *   them) is faster; however, with optimal parsing we
3674
       *   search for matches at almost all positions, so this
3675
       *   advantage of hash chains is negated.
3676
       */
3677
0
      matches = cache_ptr;
3678
0
      best_len = 0;
3679
0
      adjust_max_and_nice_len(&max_len, &nice_len, remaining);
3680
0
      if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) {
3681
0
        cache_ptr = bt_matchfinder_get_matches(
3682
0
            &c->p.n.bt_mf,
3683
0
            in_cur_base,
3684
0
            in_next - in_cur_base,
3685
0
            max_len,
3686
0
            nice_len,
3687
0
            c->max_search_depth,
3688
0
            next_hashes,
3689
0
            matches);
3690
0
        if (cache_ptr > matches)
3691
0
          best_len = cache_ptr[-1].length;
3692
0
      }
3693
0
      if (in_next >= next_observation) {
3694
0
        if (best_len >= min_len) {
3695
0
          observe_match(&c->split_stats,
3696
0
                  best_len);
3697
0
          next_observation = in_next + best_len;
3698
0
          c->p.n.new_match_len_freqs[best_len]++;
3699
0
        } else {
3700
0
          observe_literal(&c->split_stats,
3701
0
              *in_next);
3702
0
          next_observation = in_next + 1;
3703
0
        }
3704
0
      }
3705
3706
0
      cache_ptr->length = cache_ptr - matches;
3707
0
      cache_ptr->offset = *in_next;
3708
0
      in_next++;
3709
0
      cache_ptr++;
3710
3711
      /*
3712
       * If there was a very long match found, don't cache any
3713
       * matches for the bytes covered by that match.  This
3714
       * avoids degenerate behavior when compressing highly
3715
       * redundant data, where the number of matches can be
3716
       * very large.
3717
       *
3718
       * This heuristic doesn't actually hurt the compression
3719
       * ratio very much.  If there's a long match, then the
3720
       * data must be highly compressible, so it doesn't
3721
       * matter much what we do.
3722
       */
3723
0
      if (best_len >= DEFLATE_MIN_MATCH_LEN &&
3724
0
          best_len >= nice_len) {
3725
0
        --best_len;
3726
0
        do {
3727
0
          remaining = in_end - in_next;
3728
0
          if (in_next == in_next_slide) {
3729
0
            bt_matchfinder_slide_window(
3730
0
              &c->p.n.bt_mf);
3731
0
            in_cur_base = in_next;
3732
0
            in_next_slide = in_next +
3733
0
              MIN(remaining,
3734
0
                  MATCHFINDER_WINDOW_SIZE);
3735
0
          }
3736
0
          adjust_max_and_nice_len(&max_len,
3737
0
                &nice_len,
3738
0
                remaining);
3739
0
          if (max_len >=
3740
0
              BT_MATCHFINDER_REQUIRED_NBYTES) {
3741
0
            bt_matchfinder_skip_byte(
3742
0
              &c->p.n.bt_mf,
3743
0
              in_cur_base,
3744
0
              in_next - in_cur_base,
3745
0
              nice_len,
3746
0
              c->max_search_depth,
3747
0
              next_hashes);
3748
0
          }
3749
0
          cache_ptr->length = 0;
3750
0
          cache_ptr->offset = *in_next;
3751
0
          in_next++;
3752
0
          cache_ptr++;
3753
0
        } while (--best_len);
3754
0
      }
3755
      /* Maximum block length or end of input reached? */
3756
0
      if (in_next >= in_max_block_end)
3757
0
        break;
3758
      /* Match cache overflowed? */
3759
0
      if (cache_ptr >=
3760
0
          &c->p.n.match_cache[MATCH_CACHE_LENGTH])
3761
0
        break;
3762
      /* Not ready to try to end the block (again)? */
3763
0
      if (!ready_to_check_block(&c->split_stats,
3764
0
              in_block_begin, in_next,
3765
0
              in_end))
3766
0
        continue;
3767
      /* Check if it would be worthwhile to end the block. */
3768
0
      if (do_end_block_check(&c->split_stats,
3769
0
                 in_next - in_block_begin)) {
3770
0
        change_detected = true;
3771
0
        break;
3772
0
      }
3773
      /* Ending the block doesn't seem worthwhile here. */
3774
0
      deflate_near_optimal_merge_stats(c);
3775
0
      prev_end_block_check = in_next;
3776
0
    }
3777
    /*
3778
     * All the matches for this block have been cached.  Now choose
3779
     * the precise end of the block and the sequence of items to
3780
     * output to represent it, then flush the block.
3781
     */
3782
0
    if (change_detected && prev_end_block_check != NULL) {
3783
      /*
3784
       * The block is being ended because a recent chunk of
3785
       * data differs from the rest of the block.  We could
3786
       * end the block at 'in_next' like the greedy and lazy
3787
       * compressors do, but that's not ideal since it would
3788
       * include the differing chunk in the block.  The
3789
       * near-optimal compressor has time to do a better job.
3790
       * Therefore, we rewind to just before the chunk, and
3791
       * output a block that only goes up to there.
3792
       *
3793
       * We then set things up to correctly start the next
3794
       * block, considering that some work has already been
3795
       * done on it (some matches found and stats gathered).
3796
       */
3797
0
      struct lz_match *orig_cache_ptr = cache_ptr;
3798
0
      const u8 *in_block_end = prev_end_block_check;
3799
0
      u32 block_length = in_block_end - in_block_begin;
3800
0
      bool is_first = (in_block_begin == in);
3801
0
      bool is_final = false;
3802
0
      u32 num_bytes_to_rewind = in_next - in_block_end;
3803
0
      size_t cache_len_rewound;
3804
3805
      /* Rewind the match cache. */
3806
0
      do {
3807
0
        cache_ptr--;
3808
0
        cache_ptr -= cache_ptr->length;
3809
0
      } while (--num_bytes_to_rewind);
3810
0
      cache_len_rewound = orig_cache_ptr - cache_ptr;
3811
3812
0
      deflate_optimize_and_flush_block(
3813
0
            c, os, in_block_begin,
3814
0
            block_length, cache_ptr,
3815
0
            is_first, is_final,
3816
0
            &prev_block_used_only_literals);
3817
0
      memmove(c->p.n.match_cache, cache_ptr,
3818
0
        cache_len_rewound * sizeof(*cache_ptr));
3819
0
      cache_ptr = &c->p.n.match_cache[cache_len_rewound];
3820
0
      deflate_near_optimal_save_stats(c);
3821
      /*
3822
       * Clear the stats for the just-flushed block, leaving
3823
       * just the stats for the beginning of the next block.
3824
       */
3825
0
      deflate_near_optimal_clear_old_stats(c);
3826
0
      in_block_begin = in_block_end;
3827
0
    } else {
3828
      /*
3829
       * The block is being ended for a reason other than a
3830
       * differing data chunk being detected.  Don't rewind at
3831
       * all; just end the block at the current position.
3832
       */
3833
0
      u32 block_length = in_next - in_block_begin;
3834
0
      bool is_first = (in_block_begin == in);
3835
0
      bool is_final = (in_next == in_end);
3836
3837
0
      deflate_near_optimal_merge_stats(c);
3838
0
      deflate_optimize_and_flush_block(
3839
0
            c, os, in_block_begin,
3840
0
            block_length, cache_ptr,
3841
0
            is_first, is_final,
3842
0
            &prev_block_used_only_literals);
3843
0
      cache_ptr = &c->p.n.match_cache[0];
3844
0
      deflate_near_optimal_save_stats(c);
3845
0
      deflate_near_optimal_init_stats(c);
3846
0
      in_block_begin = in_next;
3847
0
    }
3848
0
  } while (in_next != in_end && !os->overflow);
3849
0
}
3850
3851
/* Initialize c->p.n.offset_slot_full. */
3852
static void
3853
deflate_init_offset_slot_full(struct libdeflate_compressor *c)
3854
0
{
3855
0
  u32 offset_slot;
3856
0
  u32 offset;
3857
0
  u32 offset_end;
3858
3859
0
  for (offset_slot = 0; offset_slot < ARRAY_LEN(deflate_offset_slot_base);
3860
0
       offset_slot++) {
3861
0
    offset = deflate_offset_slot_base[offset_slot];
3862
0
    offset_end = offset +
3863
0
           (1 << deflate_extra_offset_bits[offset_slot]);
3864
0
    do {
3865
0
      c->p.n.offset_slot_full[offset] = offset_slot;
3866
0
    } while (++offset != offset_end);
3867
0
  }
3868
0
}
3869
3870
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
3871
3872
LIBDEFLATEAPI struct libdeflate_compressor *
3873
libdeflate_alloc_compressor_ex(int compression_level,
3874
             const struct libdeflate_options *options)
3875
1.17k
{
3876
1.17k
  struct libdeflate_compressor *c;
3877
1.17k
  size_t size = offsetof(struct libdeflate_compressor, p);
3878
3879
1.17k
  check_buildtime_parameters();
3880
3881
  /*
3882
   * Note: if more fields are added to libdeflate_options, this code will
3883
   * need to be updated to support both the old and new structs.
3884
   */
3885
1.17k
  if (options->sizeof_options != sizeof(*options))
3886
0
    return NULL;
3887
3888
  /*
3889
   * Note: For similarity with zlib's API, -1 is accepted as an alias for
3890
   * the default compression level.
3891
   */
3892
1.17k
  if (compression_level == -1)
3893
0
    compression_level = 6;
3894
3895
1.17k
  if (compression_level < 0 || compression_level > 12)
3896
0
    return NULL;
3897
3898
1.17k
#if SUPPORT_NEAR_OPTIMAL_PARSING
3899
1.17k
  if (compression_level >= 10)
3900
0
    size += sizeof(c->p.n);
3901
1.17k
  else
3902
1.17k
#endif
3903
1.17k
  {
3904
1.17k
    if (compression_level >= 2)
3905
1.17k
      size += sizeof(c->p.g);
3906
0
    else if (compression_level == 1)
3907
0
      size += sizeof(c->p.f);
3908
1.17k
  }
3909
3910
1.17k
  c = libdeflate_aligned_malloc(options->malloc_func ?
3911
0
              options->malloc_func :
3912
1.17k
              libdeflate_default_malloc_func,
3913
1.17k
              MATCHFINDER_MEM_ALIGNMENT, size);
3914
1.17k
  if (!c)
3915
0
    return NULL;
3916
1.17k
  c->free_func = options->free_func ?
3917
1.17k
           options->free_func : libdeflate_default_free_func;
3918
3919
1.17k
  c->compression_level = compression_level;
3920
3921
  /*
3922
   * The higher the compression level, the more we should bother trying to
3923
   * compress very small inputs.
3924
   */
3925
1.17k
  c->max_passthrough_size = 55 - (compression_level * 4);
3926
3927
1.17k
  switch (compression_level) {
3928
0
  case 0:
3929
0
    c->max_passthrough_size = SIZE_MAX;
3930
0
    c->impl = NULL; /* not used */
3931
0
    break;
3932
0
  case 1:
3933
0
    c->impl = deflate_compress_fastest;
3934
    /* max_search_depth is unused. */
3935
0
    c->nice_match_length = 32;
3936
0
    break;
3937
0
  case 2:
3938
0
    c->impl = deflate_compress_greedy;
3939
0
    c->max_search_depth = 6;
3940
0
    c->nice_match_length = 10;
3941
0
    break;
3942
0
  case 3:
3943
0
    c->impl = deflate_compress_greedy;
3944
0
    c->max_search_depth = 12;
3945
0
    c->nice_match_length = 14;
3946
0
    break;
3947
0
  case 4:
3948
0
    c->impl = deflate_compress_greedy;
3949
0
    c->max_search_depth = 16;
3950
0
    c->nice_match_length = 30;
3951
0
    break;
3952
0
  case 5:
3953
0
    c->impl = deflate_compress_lazy;
3954
0
    c->max_search_depth = 16;
3955
0
    c->nice_match_length = 30;
3956
0
    break;
3957
0
  case 6:
3958
0
    c->impl = deflate_compress_lazy;
3959
0
    c->max_search_depth = 35;
3960
0
    c->nice_match_length = 65;
3961
0
    break;
3962
0
  case 7:
3963
0
    c->impl = deflate_compress_lazy;
3964
0
    c->max_search_depth = 100;
3965
0
    c->nice_match_length = 130;
3966
0
    break;
3967
1.17k
  case 8:
3968
1.17k
    c->impl = deflate_compress_lazy2;
3969
1.17k
    c->max_search_depth = 300;
3970
1.17k
    c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
3971
1.17k
    break;
3972
0
  case 9:
3973
#if !SUPPORT_NEAR_OPTIMAL_PARSING
3974
  default:
3975
#endif
3976
0
    c->impl = deflate_compress_lazy2;
3977
0
    c->max_search_depth = 600;
3978
0
    c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
3979
0
    break;
3980
0
#if SUPPORT_NEAR_OPTIMAL_PARSING
3981
0
  case 10:
3982
0
    c->impl = deflate_compress_near_optimal;
3983
0
    c->max_search_depth = 35;
3984
0
    c->nice_match_length = 75;
3985
0
    c->p.n.max_optim_passes = 2;
3986
0
    c->p.n.min_improvement_to_continue = 32;
3987
0
    c->p.n.min_bits_to_use_nonfinal_path = 32;
3988
0
    c->p.n.max_len_to_optimize_static_block = 0;
3989
0
    deflate_init_offset_slot_full(c);
3990
0
    break;
3991
0
  case 11:
3992
0
    c->impl = deflate_compress_near_optimal;
3993
0
    c->max_search_depth = 100;
3994
0
    c->nice_match_length = 150;
3995
0
    c->p.n.max_optim_passes = 4;
3996
0
    c->p.n.min_improvement_to_continue = 16;
3997
0
    c->p.n.min_bits_to_use_nonfinal_path = 16;
3998
0
    c->p.n.max_len_to_optimize_static_block = 1000;
3999
0
    deflate_init_offset_slot_full(c);
4000
0
    break;
4001
0
  case 12:
4002
0
  default:
4003
0
    c->impl = deflate_compress_near_optimal;
4004
0
    c->max_search_depth = 300;
4005
0
    c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
4006
0
    c->p.n.max_optim_passes = 10;
4007
0
    c->p.n.min_improvement_to_continue = 1;
4008
0
    c->p.n.min_bits_to_use_nonfinal_path = 1;
4009
0
    c->p.n.max_len_to_optimize_static_block = 10000;
4010
0
    deflate_init_offset_slot_full(c);
4011
0
    break;
4012
1.17k
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
4013
1.17k
  }
4014
4015
1.17k
  deflate_init_static_codes(c);
4016
4017
1.17k
  return c;
4018
1.17k
}
4019
4020
4021
LIBDEFLATEAPI struct libdeflate_compressor *
4022
libdeflate_alloc_compressor(int compression_level)
4023
1.17k
{
4024
1.17k
  static const struct libdeflate_options defaults = {
4025
1.17k
    .sizeof_options = sizeof(defaults),
4026
1.17k
  };
4027
1.17k
  return libdeflate_alloc_compressor_ex(compression_level, &defaults);
4028
1.17k
}
4029
4030
LIBDEFLATEAPI size_t
4031
libdeflate_deflate_compress(struct libdeflate_compressor *c,
4032
          const void *in, size_t in_nbytes,
4033
          void *out, size_t out_nbytes_avail)
4034
1.29k
{
4035
1.29k
  struct deflate_output_bitstream os;
4036
4037
  /*
4038
   * For extremely short inputs, or for compression level 0, just output
4039
   * uncompressed blocks.
4040
   */
4041
1.29k
  if (unlikely(in_nbytes <= c->max_passthrough_size))
4042
151
    return deflate_compress_none(in, in_nbytes,
4043
151
               out, out_nbytes_avail);
4044
4045
  /* Initialize the output bitstream structure. */
4046
1.14k
  os.bitbuf = 0;
4047
1.14k
  os.bitcount = 0;
4048
1.14k
  os.next = out;
4049
1.14k
  os.end = os.next + out_nbytes_avail;
4050
1.14k
  os.overflow = false;
4051
4052
  /* Call the actual compression function. */
4053
1.14k
  (*c->impl)(c, in, in_nbytes, &os);
4054
4055
  /* Return 0 if the output buffer is too small. */
4056
1.14k
  if (os.overflow)
4057
0
    return 0;
4058
4059
  /*
4060
   * Write the final byte if needed.  This can't overflow the output
4061
   * buffer because deflate_flush_block() would have set the overflow flag
4062
   * if there wasn't enough space remaining for the full final block.
4063
   */
4064
1.14k
  ASSERT(os.bitcount <= 7);
4065
1.14k
  if (os.bitcount) {
4066
994
    ASSERT(os.next < os.end);
4067
994
    *os.next++ = os.bitbuf;
4068
994
  }
4069
4070
  /* Return the compressed size in bytes. */
4071
1.14k
  return os.next - (u8 *)out;
4072
1.14k
}
4073
4074
LIBDEFLATEAPI void
4075
libdeflate_free_compressor(struct libdeflate_compressor *c)
4076
1.17k
{
4077
1.17k
  if (c)
4078
1.17k
    libdeflate_aligned_free(c->free_func, c);
4079
1.17k
}
4080
4081
unsigned int
4082
libdeflate_get_compression_level(struct libdeflate_compressor *c)
4083
1.29k
{
4084
1.29k
  return c->compression_level;
4085
1.29k
}
4086
4087
LIBDEFLATEAPI size_t
4088
libdeflate_deflate_compress_bound(struct libdeflate_compressor *c,
4089
          size_t in_nbytes)
4090
1.29k
{
4091
1.29k
  size_t max_blocks;
4092
4093
  /*
4094
   * Since the compressor never uses a compressed block when an
4095
   * uncompressed block is cheaper, the worst case can be no worse than
4096
   * the case where only uncompressed blocks are used.
4097
   *
4098
   * This is true even though up to 7 bits are "wasted" to byte-align the
4099
   * bitstream when a compressed block is followed by an uncompressed
4100
   * block.  This is because a compressed block wouldn't have been used if
4101
   * it wasn't cheaper than an uncompressed block, and uncompressed blocks
4102
   * always end on a byte boundary.  So the alignment bits will, at worst,
4103
   * go up to the place where the uncompressed block would have ended.
4104
   */
4105
4106
  /*
4107
   * Calculate the maximum number of uncompressed blocks that the
4108
   * compressor can use for 'in_nbytes' of data.
4109
   *
4110
   * The minimum length that is passed to deflate_flush_block() is
4111
   * MIN_BLOCK_LENGTH bytes, except for the final block if needed.  If
4112
   * deflate_flush_block() decides to use an uncompressed block, it
4113
   * actually will (in general) output a series of uncompressed blocks in
4114
   * order to stay within the UINT16_MAX limit of DEFLATE.  But this can
4115
   * be disregarded here as long as '2 * MIN_BLOCK_LENGTH <= UINT16_MAX',
4116
   * as in that case this behavior can't result in more blocks than the
4117
   * case where deflate_flush_block() is called with min-length inputs.
4118
   *
4119
   * So the number of uncompressed blocks needed would be bounded by
4120
   * DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH).  However, empty inputs
4121
   * need 1 (empty) block, which gives the final expression below.
4122
   */
4123
1.29k
  STATIC_ASSERT(2 * MIN_BLOCK_LENGTH <= UINT16_MAX);
4124
1.29k
  max_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1);
4125
4126
  /*
4127
   * Each uncompressed block has 5 bytes of overhead, for the BFINAL,
4128
   * BTYPE, LEN, and NLEN fields.  (For the reason explained earlier, the
4129
   * alignment bits at the very start of the block can be disregarded;
4130
   * they would otherwise increase the overhead to 6 bytes per block.)
4131
   * Therefore, the maximum number of overhead bytes is '5 * max_blocks'.
4132
   * To get the final bound, add the number of uncompressed bytes.
4133
   */
4134
1.29k
  return (5 * max_blocks) + in_nbytes;
4135
1.29k
}