Coverage Report

Created: 2024-07-27 06:04

/work/_deps/deflate-src/lib/deflate_compress.c
Line
Count
Source
1
/*
2
 * deflate_compress.c - a compressor for DEFLATE
3
 *
4
 * Copyright 2016 Eric Biggers
5
 *
6
 * Permission is hereby granted, free of charge, to any person
7
 * obtaining a copy of this software and associated documentation
8
 * files (the "Software"), to deal in the Software without
9
 * restriction, including without limitation the rights to use,
10
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
11
 * copies of the Software, and to permit persons to whom the
12
 * Software is furnished to do so, subject to the following
13
 * conditions:
14
 *
15
 * The above copyright notice and this permission notice shall be
16
 * included in all copies or substantial portions of the Software.
17
 *
18
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
20
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25
 * OTHER DEALINGS IN THE SOFTWARE.
26
 */
27
28
#include "deflate_compress.h"
29
#include "deflate_constants.h"
30
31
/******************************************************************************/
32
33
/*
34
 * The following parameters can be changed at build time to customize the
35
 * compression algorithms slightly:
36
 *
37
 * (Note, not all customizable parameters are here.  Some others can be found in
38
 * libdeflate_alloc_compressor() and in *_matchfinder.h.)
39
 */
40
41
/*
42
 * If this parameter is defined to 1, then the near-optimal parsing algorithm
43
 * will be included, and compression levels 10-12 will use it.  This algorithm
44
 * usually produces a compression ratio significantly better than the other
45
 * algorithms.  However, it is slow.  If this parameter is defined to 0, then
46
 * levels 10-12 will be the same as level 9 and will use the lazy2 algorithm.
47
 */
48
#define SUPPORT_NEAR_OPTIMAL_PARSING  1
49
50
/*
51
 * This is the minimum block length that the compressor will use, in
52
 * uncompressed bytes.  This should be a value below which using shorter blocks
53
 * is unlikely to be worthwhile, due to the per-block overhead.  This value does
54
 * not apply to the final block, which may be shorter than this (if the input is
55
 * shorter, it will have to be), or to the final uncompressed block in a series
56
 * of uncompressed blocks that cover more than UINT16_MAX bytes.
57
 *
58
 * This value is also approximately the amount by which what would otherwise be
59
 * the second-to-last block is allowed to grow past the soft maximum length in
60
 * order to avoid having to use a very short final block.
61
 *
62
 * Defining a fixed minimum block length is needed in order to guarantee a
63
 * reasonable upper bound on the compressed size.  It's also needed because our
64
 * block splitting algorithm doesn't work well on very short blocks.
65
 */
66
0
#define MIN_BLOCK_LENGTH  5000
67
68
/*
69
 * For the greedy, lazy, lazy2, and near-optimal compressors: This is the soft
70
 * maximum block length, in uncompressed bytes.  The compressor will try to end
71
 * blocks at this length, but it may go slightly past it if there is a match
72
 * that straddles this limit or if the input data ends soon after this limit.
73
 * This parameter doesn't apply to uncompressed blocks, which the DEFLATE format
74
 * limits to 65535 bytes.
75
 *
76
 * This should be a value above which it is very likely that splitting the block
77
 * would produce a better compression ratio.  For the near-optimal compressor,
78
 * increasing/decreasing this parameter will increase/decrease per-compressor
79
 * memory usage linearly.
80
 */
81
0
#define SOFT_MAX_BLOCK_LENGTH 300000
82
83
/*
84
 * For the greedy, lazy, and lazy2 compressors: this is the length of the
85
 * sequence store, which is an array where the compressor temporarily stores
86
 * matches that it's going to use in the current block.  This value is the
87
 * maximum number of matches that can be used in a block.  If the sequence store
88
 * fills up, then the compressor will be forced to end the block early.  This
89
 * value should be large enough so that this rarely happens, due to the block
90
 * being ended normally before then.  Increasing/decreasing this value will
91
 * increase/decrease per-compressor memory usage linearly.
92
 */
93
0
#define SEQ_STORE_LENGTH  50000
94
95
/*
96
 * For deflate_compress_fastest(): This is the soft maximum block length.
97
 * deflate_compress_fastest() doesn't use the regular block splitting algorithm;
98
 * it only ends blocks when they reach FAST_SOFT_MAX_BLOCK_LENGTH bytes or
99
 * FAST_SEQ_STORE_LENGTH matches.  Therefore, this value should be lower than
100
 * the regular SOFT_MAX_BLOCK_LENGTH.
101
 */
102
0
#define FAST_SOFT_MAX_BLOCK_LENGTH  65535
103
104
/*
105
 * For deflate_compress_fastest(): this is the length of the sequence store.
106
 * This is like SEQ_STORE_LENGTH, but this should be a lower value.
107
 */
108
0
#define FAST_SEQ_STORE_LENGTH 8192
109
110
/*
111
 * These are the maximum codeword lengths, in bits, the compressor will use for
112
 * each Huffman code.  The DEFLATE format defines limits for these.  However,
113
 * further limiting litlen codewords to 14 bits is beneficial, since it has
114
 * negligible effect on compression ratio but allows some optimizations when
115
 * outputting bits.  (It allows 4 literals to be written at once rather than 3.)
116
 */
117
0
#define MAX_LITLEN_CODEWORD_LEN   14
118
0
#define MAX_OFFSET_CODEWORD_LEN   DEFLATE_MAX_OFFSET_CODEWORD_LEN
119
0
#define MAX_PRE_CODEWORD_LEN    DEFLATE_MAX_PRE_CODEWORD_LEN
120
121
#if SUPPORT_NEAR_OPTIMAL_PARSING
122
123
/* Parameters specific to the near-optimal parsing algorithm */
124
125
/*
126
 * BIT_COST is a scaling factor that allows the near-optimal compressor to
127
 * consider fractional bit costs when deciding which literal/match sequence to
128
 * use.  This is useful when the true symbol costs are unknown.  For example, if
129
 * the compressor thinks that a symbol has 6.5 bits of entropy, it can set its
130
 * cost to 6.5 bits rather than have to use 6 or 7 bits.  Although in the end
131
 * each symbol will use a whole number of bits due to the Huffman coding,
132
 * considering fractional bits can be helpful due to the limited information.
133
 *
134
 * BIT_COST should be a power of 2.  A value of 8 or 16 works well.  A higher
135
 * value isn't very useful since the calculations are approximate anyway.
136
 *
137
 * BIT_COST doesn't apply to deflate_flush_block() and
138
 * deflate_compute_true_cost(), which consider whole bits.
139
 */
140
0
#define BIT_COST  16
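/*
 * Illustrative sketch, not part of the original file: one way the scaled
 * costs described above can be formed.  With BIT_COST == 16, an entropy
 * estimate of 6.5 bits becomes the integer cost 104 and 7.0 bits becomes
 * 112, so fractional differences survive when costs are summed and
 * compared.  The helper name and its millibits parameter are hypothetical.
 */
static forceinline u32
example_scaled_cost(u32 cost_in_millibits)
{
	/* Round to the nearest multiple of (1 / BIT_COST) of a bit. */
	return (cost_in_millibits * BIT_COST + 500) / 1000;
}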
141
142
/*
143
 * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to
144
 * be needed to output a symbol that was unused in the previous optimization
145
 * pass.  Assigning a default cost allows the symbol to be used in the next
146
 * optimization pass.  However, the cost should be relatively high because the
147
 * symbol probably won't be used very many times (if at all).
148
 */
149
0
#define LITERAL_NOSTAT_BITS 13
150
0
#define LENGTH_NOSTAT_BITS  13
151
0
#define OFFSET_NOSTAT_BITS  10
152
153
/*
154
 * This is (slightly less than) the maximum number of matches that the
155
 * near-optimal compressor will cache per block.  This behaves similarly to
156
 * SEQ_STORE_LENGTH for the other compressors.
157
 */
158
0
#define MATCH_CACHE_LENGTH  (SOFT_MAX_BLOCK_LENGTH * 5)
159
160
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
161
162
/******************************************************************************/
163
164
/* Include the needed matchfinders. */
165
0
#define MATCHFINDER_WINDOW_ORDER  DEFLATE_WINDOW_ORDER
166
#include "hc_matchfinder.h"
167
#include "ht_matchfinder.h"
168
#if SUPPORT_NEAR_OPTIMAL_PARSING
169
#  include "bt_matchfinder.h"
170
/*
171
 * This is the maximum number of matches the binary trees matchfinder can find
172
 * at a single position.  Since the matchfinder never finds more than one match
173
 * for the same length, presuming one of each possible length is sufficient for
174
 * an upper bound.  (This says nothing about whether it is worthwhile to
175
 * consider so many matches; this is just defining the worst case.)
176
 */
177
#define MAX_MATCHES_PER_POS \
178
  (DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1)
179
#endif
180
181
/*
182
 * The largest block length we will ever use is when the final block is of
183
 * length SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1, or when any block is of
184
 * length SOFT_MAX_BLOCK_LENGTH + 1 + DEFLATE_MAX_MATCH_LEN.  The latter case
185
 * occurs when the lazy2 compressor chooses two literals and a maximum-length
186
 * match, starting at SOFT_MAX_BLOCK_LENGTH - 1.
187
 */
188
#define MAX_BLOCK_LENGTH  \
189
  MAX(SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1, \
190
      SOFT_MAX_BLOCK_LENGTH + 1 + DEFLATE_MAX_MATCH_LEN)
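/*
 * Illustrative note, not part of the original file: with the default
 * parameters above, this evaluates to MAX(300000 + 5000 - 1,
 * 300000 + 1 + 258) = 304999 bytes, since DEFLATE_MAX_MATCH_LEN is 258.
 */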
191
192
static forceinline void
193
check_buildtime_parameters(void)
194
0
{
195
  /*
196
   * Verify that MIN_BLOCK_LENGTH is being honored, as
197
   * libdeflate_deflate_compress_bound() depends on it.
198
   */
199
0
  STATIC_ASSERT(SOFT_MAX_BLOCK_LENGTH >= MIN_BLOCK_LENGTH);
200
0
  STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH >= MIN_BLOCK_LENGTH);
201
0
  STATIC_ASSERT(SEQ_STORE_LENGTH * DEFLATE_MIN_MATCH_LEN >=
202
0
          MIN_BLOCK_LENGTH);
203
0
  STATIC_ASSERT(FAST_SEQ_STORE_LENGTH * HT_MATCHFINDER_MIN_MATCH_LEN >=
204
0
          MIN_BLOCK_LENGTH);
205
0
#if SUPPORT_NEAR_OPTIMAL_PARSING
206
0
  STATIC_ASSERT(MIN_BLOCK_LENGTH * MAX_MATCHES_PER_POS <=
207
0
          MATCH_CACHE_LENGTH);
208
0
#endif
209
210
  /* The definition of MAX_BLOCK_LENGTH assumes this. */
211
0
  STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH <= SOFT_MAX_BLOCK_LENGTH);
212
213
  /* Verify that the sequence stores aren't uselessly large. */
214
0
  STATIC_ASSERT(SEQ_STORE_LENGTH * DEFLATE_MIN_MATCH_LEN <=
215
0
          SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH);
216
0
  STATIC_ASSERT(FAST_SEQ_STORE_LENGTH * HT_MATCHFINDER_MIN_MATCH_LEN <=
217
0
          FAST_SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH);
218
219
  /* Verify that the maximum codeword lengths are valid. */
220
0
  STATIC_ASSERT(
221
0
    MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN);
222
0
  STATIC_ASSERT(
223
0
    MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN);
224
0
  STATIC_ASSERT(
225
0
    MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN);
226
0
  STATIC_ASSERT(
227
0
    (1U << MAX_LITLEN_CODEWORD_LEN) >= DEFLATE_NUM_LITLEN_SYMS);
228
0
  STATIC_ASSERT(
229
0
    (1U << MAX_OFFSET_CODEWORD_LEN) >= DEFLATE_NUM_OFFSET_SYMS);
230
0
  STATIC_ASSERT(
231
0
    (1U << MAX_PRE_CODEWORD_LEN) >= DEFLATE_NUM_PRECODE_SYMS);
232
0
}
233
234
/******************************************************************************/
235
236
/* Table: length slot => length slot base value */
237
static const unsigned deflate_length_slot_base[] = {
238
  3,    4,    5,    6,    7,    8,    9,    10,
239
  11,   13,   15,   17,   19,   23,   27,   31,
240
  35,   43,   51,   59,   67,   83,   99,   115,
241
  131,  163,  195,  227,  258,
242
};
243
244
/* Table: length slot => number of extra length bits */
245
static const u8 deflate_extra_length_bits[] = {
246
  0,    0,    0,    0,    0,    0,    0,    0,
247
  1,    1,    1,    1,    2,    2,    2,    2,
248
  3,    3,    3,    3,    4,    4,    4,    4,
249
  5,    5,    5,    5,    0,
250
};
251
252
/* Table: offset slot => offset slot base value */
253
static const unsigned deflate_offset_slot_base[] = {
254
  1,     2,     3,     4,     5,     7,     9,     13,
255
  17,    25,    33,    49,    65,    97,    129,   193,
256
  257,   385,   513,   769,   1025,  1537,  2049,  3073,
257
  4097,  6145,  8193,  12289, 16385, 24577,
258
};
259
260
/* Table: offset slot => number of extra offset bits */
261
static const u8 deflate_extra_offset_bits[] = {
262
  0,     0,     0,     0,     1,     1,     2,     2,
263
  3,     3,     4,     4,     5,     5,     6,     6,
264
  7,     7,     8,     8,     9,     9,     10,    10,
265
  11,    11,    12,    12,    13,    13,
266
};
267
268
/* Table: length => length slot */
269
static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = {
270
  0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12,
271
  12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16,
272
  16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18,
273
  18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20,
274
  20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
275
  21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
276
  22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
277
  23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
278
  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25,
279
  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
280
  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26,
281
  26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
282
  26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
283
  27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
284
  27, 27, 28,
285
};
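/*
 * Illustrative sketch, not part of the original file: how a match length can
 * be split into a length slot and extra bits using the three length tables
 * above.  For example, length 30 maps to slot 14, whose base is 27 and which
 * has 2 extra bits, so the extra-bits value is 30 - 27 = 3.  The helper name
 * is hypothetical.
 */
static forceinline void
example_split_length(unsigned length, unsigned *slot_ret, unsigned *extra_ret)
{
	unsigned slot = deflate_length_slot[length];

	*slot_ret = slot;
	*extra_ret = length - deflate_length_slot_base[slot];
}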
286
287
/*
288
 * A condensed table which maps offset => offset slot as follows:
289
 *
290
 *  offset <= 256: deflate_offset_slot[offset]
291
 *  offset > 256: deflate_offset_slot[256 + ((offset - 1) >> 7)]
292
 *
293
 * This table was generated by scripts/gen_offset_slot_map.py.
294
 */
295
static const u8 deflate_offset_slot[512] = {
296
  0, 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7,
297
  7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9,
298
  9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
299
  10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
300
  11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
301
  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
302
  12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
303
  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
304
  13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
305
  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
306
  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
307
  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
308
  14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
309
  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
310
  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
311
  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
312
  15, 0, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21,
313
  22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
314
  24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
315
  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
316
  26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
317
  26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
318
  27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
319
  27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
320
  28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
321
  28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
322
  28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
323
  28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
324
  29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
325
  29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
326
  29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
327
  29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
328
};
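/*
 * Illustrative sketch, not part of the original file: the lookup implied by
 * the mapping described in the comment above the condensed table.  For
 * example, offset 257 indexes entry 256 + ((257 - 1) >> 7) = 258, which
 * holds offset slot 16.  The helper name is hypothetical.
 */
static forceinline unsigned
example_offset_to_slot(u32 offset)
{
	if (offset <= 256)
		return deflate_offset_slot[offset];
	return deflate_offset_slot[256 + ((offset - 1) >> 7)];
}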
329
330
/* The order in which precode codeword lengths are stored */
331
static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
332
  16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
333
};
334
335
/* Table: precode symbol => number of extra bits */
336
static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = {
337
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7
338
};
339
340
/* Codewords for the DEFLATE Huffman codes */
341
struct deflate_codewords {
342
  u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
343
  u32 offset[DEFLATE_NUM_OFFSET_SYMS];
344
};
345
346
/*
347
 * Codeword lengths (in bits) for the DEFLATE Huffman codes.
348
 * A zero length means the corresponding symbol had zero frequency.
349
 */
350
struct deflate_lens {
351
  u8 litlen[DEFLATE_NUM_LITLEN_SYMS];
352
  u8 offset[DEFLATE_NUM_OFFSET_SYMS];
353
};
354
355
/* Codewords and lengths for the DEFLATE Huffman codes */
356
struct deflate_codes {
357
  struct deflate_codewords codewords;
358
  struct deflate_lens lens;
359
};
360
361
/* Symbol frequency counters for the DEFLATE Huffman codes */
362
struct deflate_freqs {
363
  u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
364
  u32 offset[DEFLATE_NUM_OFFSET_SYMS];
365
};
366
367
/*
368
 * Represents a run of literals followed by a match or end-of-block.  This
369
 * struct is needed to temporarily store items chosen by the parser, since items
370
 * cannot be written until all items for the block have been chosen and the
371
 * block's Huffman codes have been computed.
372
 */
373
struct deflate_sequence {
374
375
  /*
376
   * Bits 0..22: the number of literals in this run.  This may be 0 and
377
   * can be at most MAX_BLOCK_LENGTH.  The literals are not stored
378
   * explicitly in this structure; instead, they are read directly from
379
   * the uncompressed data.
380
   *
381
   * Bits 23..31: the length of the match which follows the literals, or 0
382
   * if this literal run was the last in the block, so there is no match
383
   * which follows it.
384
   */
385
0
#define SEQ_LENGTH_SHIFT 23
386
0
#define SEQ_LITRUNLEN_MASK (((u32)1 << SEQ_LENGTH_SHIFT) - 1)
387
  u32 litrunlen_and_length;
388
389
  /*
390
   * If 'length' doesn't indicate end-of-block, then this is the offset of
391
   * the match which follows the literals.
392
   */
393
  u16 offset;
394
395
  /*
396
   * If 'length' doesn't indicate end-of-block, then this is the offset
397
   * slot of the match which follows the literals.
398
   */
399
  u16 offset_slot;
400
};
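/*
 * Illustrative sketch, not part of the original file: encoding and decoding
 * the packed 'litrunlen_and_length' field of struct deflate_sequence using
 * SEQ_LENGTH_SHIFT and SEQ_LITRUNLEN_MASK.  The helper names are
 * hypothetical.
 */
static forceinline u32
example_pack_litrunlen_and_length(u32 litrunlen, u32 length)
{
	return (length << SEQ_LENGTH_SHIFT) | litrunlen;
}

static forceinline u32
example_seq_litrunlen(const struct deflate_sequence *seq)
{
	return seq->litrunlen_and_length & SEQ_LITRUNLEN_MASK;
}

static forceinline u32
example_seq_length(const struct deflate_sequence *seq)
{
	return seq->litrunlen_and_length >> SEQ_LENGTH_SHIFT;
}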
401
402
#if SUPPORT_NEAR_OPTIMAL_PARSING
403
404
/* Costs for the near-optimal parsing algorithm */
405
struct deflate_costs {
406
407
  /* The cost to output each possible literal */
408
  u32 literal[DEFLATE_NUM_LITERALS];
409
410
  /* The cost to output each possible match length */
411
  u32 length[DEFLATE_MAX_MATCH_LEN + 1];
412
413
  /* The cost to output a match offset of each possible offset slot */
414
  u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS];
415
};
416
417
/*
418
 * This structure represents a byte position in the input data and a node in the
419
 * graph of possible match/literal choices for the current block.
420
 *
421
 * Logically, each incoming edge to this node is labeled with a literal or a
422
 * match that can be taken to reach this position from an earlier position; and
423
 * each outgoing edge from this node is labeled with a literal or a match that
424
 * can be taken to advance from this position to a later position.
425
 *
426
 * But these "edges" are actually stored elsewhere (in 'match_cache').  Here we
427
 * associate with each node just two pieces of information:
428
 *
429
 *  'cost_to_end' is the minimum cost to reach the end of the block from
430
 *  this position.
431
 *
432
 *  'item' represents the literal or match that must be chosen from here to
433
 *  reach the end of the block with the minimum cost.  Equivalently, this
434
 *  can be interpreted as the label of the outgoing edge on the minimum-cost
435
 *  path to the "end of block" node from this node.
436
 */
437
struct deflate_optimum_node {
438
439
  u32 cost_to_end;
440
441
  /*
442
   * Notes on the match/literal representation used here:
443
   *
444
   *  The low bits of 'item' are the length: 1 if this is a literal,
445
   *  or the match length if this is a match.
446
   *
447
   *  The high bits of 'item' are the actual literal byte if this is a
448
   *  literal, or the match offset if this is a match.
449
   */
450
0
#define OPTIMUM_OFFSET_SHIFT 9
451
0
#define OPTIMUM_LEN_MASK (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1)
452
  u32 item;
453
454
};
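/*
 * Illustrative sketch, not part of the original file: decoding the packed
 * 'item' field described above.  A length of 1 indicates a literal, in which
 * case the high bits hold the literal byte; otherwise the high bits hold the
 * match offset.  The helper names are hypothetical.
 */
static forceinline u32
example_item_length(u32 item)
{
	return item & OPTIMUM_LEN_MASK;
}

static forceinline u32
example_item_offset_or_literal(u32 item)
{
	return item >> OPTIMUM_OFFSET_SHIFT;
}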
455
456
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
457
458
/* Block split statistics.  See "Block splitting algorithm" below. */
459
0
#define NUM_LITERAL_OBSERVATION_TYPES 8
460
0
#define NUM_MATCH_OBSERVATION_TYPES 2
461
0
#define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + \
462
0
             NUM_MATCH_OBSERVATION_TYPES)
463
0
#define NUM_OBSERVATIONS_PER_BLOCK_CHECK 512
464
struct block_split_stats {
465
  u32 new_observations[NUM_OBSERVATION_TYPES];
466
  u32 observations[NUM_OBSERVATION_TYPES];
467
  u32 num_new_observations;
468
  u32 num_observations;
469
};
470
471
struct deflate_output_bitstream;
472
473
/* The main DEFLATE compressor structure */
474
struct libdeflate_compressor {
475
476
  /* Pointer to the compress() implementation chosen at allocation time */
477
  void (*impl)(struct libdeflate_compressor *restrict c, const u8 *in,
478
         size_t in_nbytes, struct deflate_output_bitstream *os);
479
480
  /* The compression level with which this compressor was created */
481
  unsigned compression_level;
482
483
  /* Anything of this size or less we won't bother trying to compress. */
484
  size_t max_passthrough_size;
485
486
  /*
487
   * The maximum search depth: consider at most this many potential
488
   * matches at each position
489
   */
490
  unsigned max_search_depth;
491
492
  /*
493
   * The "nice" match length: if a match of this length is found, choose
494
   * it immediately without further consideration
495
   */
496
  unsigned nice_match_length;
497
498
  /* Frequency counters for the current block */
499
  struct deflate_freqs freqs;
500
501
  /* Block split statistics for the current block */
502
  struct block_split_stats split_stats;
503
504
  /* Dynamic Huffman codes for the current block */
505
  struct deflate_codes codes;
506
507
  /* The static Huffman codes defined by the DEFLATE format */
508
  struct deflate_codes static_codes;
509
510
  /* Temporary space for block flushing */
511
  union {
512
    /* Information about the precode */
513
    struct {
514
      u32 freqs[DEFLATE_NUM_PRECODE_SYMS];
515
      u32 codewords[DEFLATE_NUM_PRECODE_SYMS];
516
      u8 lens[DEFLATE_NUM_PRECODE_SYMS];
517
      unsigned items[DEFLATE_NUM_LITLEN_SYMS +
518
               DEFLATE_NUM_OFFSET_SYMS];
519
      unsigned num_litlen_syms;
520
      unsigned num_offset_syms;
521
      unsigned num_explicit_lens;
522
      unsigned num_items;
523
    } precode;
524
    /*
525
     * The "full" length codewords.  Used only after the information
526
     * in 'precode' is no longer needed.
527
     */
528
    struct {
529
      u32 codewords[DEFLATE_MAX_MATCH_LEN + 1];
530
      u8 lens[DEFLATE_MAX_MATCH_LEN + 1];
531
    } length;
532
  } o;
533
534
  union {
535
    /* Data for greedy or lazy parsing */
536
    struct {
537
      /* Hash chains matchfinder */
538
      struct hc_matchfinder hc_mf;
539
540
      /* Matches and literals chosen for the current block */
541
      struct deflate_sequence sequences[SEQ_STORE_LENGTH + 1];
542
543
    } g; /* (g)reedy */
544
545
    /* Data for fastest parsing */
546
    struct {
547
      /* Hash table matchfinder */
548
      struct ht_matchfinder ht_mf;
549
550
      /* Matches and literals chosen for the current block */
551
      struct deflate_sequence sequences[
552
            FAST_SEQ_STORE_LENGTH + 1];
553
554
    } f; /* (f)astest */
555
556
  #if SUPPORT_NEAR_OPTIMAL_PARSING
557
    /* Data for near-optimal parsing */
558
    struct {
559
560
      /* Binary tree matchfinder */
561
      struct bt_matchfinder bt_mf;
562
563
      /*
564
       * Cached matches for the current block.  This array
565
       * contains the matches that were found at each position
566
       * in the block.  Specifically, for each position, there
567
       * is a list of matches found at that position, if any,
568
       * sorted by strictly increasing length.  In addition,
569
       * following the matches for each position, there is a
570
       * special 'struct lz_match' whose 'length' member
571
       * contains the number of matches found at that
572
       * position, and whose 'offset' member contains the
573
       * literal at that position.
574
       *
575
       * Note: in rare cases, there will be a very high number
576
       * of matches in the block and this array will overflow.
577
       * If this happens, we force the end of the current
578
       * block.  MATCH_CACHE_LENGTH is the length at which we
579
       * actually check for overflow.  The extra slots beyond
580
       * this are enough to absorb the worst case overflow,
581
       * which occurs if starting at
582
       * &match_cache[MATCH_CACHE_LENGTH - 1], we write
583
       * MAX_MATCHES_PER_POS matches and a match count header,
584
       * then skip searching for matches at
585
       * 'DEFLATE_MAX_MATCH_LEN - 1' positions and write the
586
       * match count header for each.
587
       */
588
      struct lz_match match_cache[MATCH_CACHE_LENGTH +
589
                MAX_MATCHES_PER_POS +
590
                DEFLATE_MAX_MATCH_LEN - 1];
591
592
      /*
593
       * Array of nodes, one per position, for running the
594
       * minimum-cost path algorithm.
595
       *
596
       * This array must be large enough to accommodate the
597
       * worst-case number of nodes, which is MAX_BLOCK_LENGTH
598
       * plus 1 for the end-of-block node.
599
       */
600
      struct deflate_optimum_node optimum_nodes[
601
        MAX_BLOCK_LENGTH + 1];
602
603
      /* The current cost model being used */
604
      struct deflate_costs costs;
605
606
      struct deflate_costs costs_producing_best_true_cost;
607
608
      /*
609
       * A table that maps match offset to offset slot.  This
610
       * differs from deflate_offset_slot[] in that this is a
611
       * full map, not a condensed one.  The full map is more
612
       * appropriate for the near-optimal parser, since the
613
       * near-optimal parser does more offset => offset_slot
614
       * translations, it doesn't intersperse them with
615
       * matchfinding (so cache evictions are less of a
616
       * concern), and it uses more memory anyway.
617
       */
618
      u8 offset_slot_full[DEFLATE_MAX_MATCH_OFFSET + 1];
619
620
      /* Literal/match statistics saved from previous block */
621
      u32 prev_observations[NUM_OBSERVATION_TYPES];
622
      u32 prev_num_observations;
623
624
      /*
625
       * Approximate match length frequencies based on a
626
       * greedy parse, gathered during matchfinding.  This is
627
       * used for setting the initial symbol costs.
628
       */
629
      u32 new_match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1];
630
      u32 match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1];
631
632
      /*
633
       * The maximum number of optimization passes
634
       * (min-cost path searches) per block.
635
       * Larger values = more compression.
636
       */
637
      unsigned max_optim_passes;
638
639
      /*
640
       * If an optimization pass improves the cost by fewer
641
       * than this number of bits, then optimization will stop
642
       * early, before max_optim_passes has been reached.
643
       * Smaller values = more compression.
644
       */
645
      unsigned min_improvement_to_continue;
646
647
      /*
648
       * The minimum number of bits that would need to be
649
       * saved for it to be considered worth the time to
650
       * regenerate and use the min-cost path from a previous
651
       * optimization pass, in the case where the final
652
       * optimization pass actually increased the cost.
653
       * Smaller values = more compression.
654
       */
655
      unsigned min_bits_to_use_nonfinal_path;
656
657
    } n; /* (n)ear-optimal */
658
  #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
659
660
  } p; /* (p)arser */
661
};
662
663
/*
664
 * The type for the bitbuffer variable, which temporarily holds bits that are
665
 * being packed into bytes and written to the output buffer.  For best
666
 * performance, this should have size equal to a machine word.
667
 */
668
typedef machine_word_t bitbuf_t;
669
670
/*
671
 * The capacity of the bitbuffer, in bits.  This is 1 less than the real size,
672
 * in order to avoid undefined behavior when doing bitbuf >>= bitcount & ~7.
673
 */
674
0
#define BITBUF_NBITS  (8 * sizeof(bitbuf_t) - 1)
675
676
/*
677
 * Can the specified number of bits always be added to 'bitbuf' after any
678
 * pending bytes have been flushed?  There can be up to 7 bits remaining after a
679
 * flush, so the count must not exceed BITBUF_NBITS after adding 'n' more bits.
680
 */
681
0
#define CAN_BUFFER(n) (7 + (n) <= BITBUF_NBITS)
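/*
 * Illustrative note, not part of the original file: on a typical 64-bit
 * build, sizeof(bitbuf_t) == 8, so BITBUF_NBITS == 63 and CAN_BUFFER(n)
 * holds for any n <= 56, since at most 7 stale bits can remain after a
 * flush and 7 + 56 <= 63.
 */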
682
683
/*
684
 * Structure to keep track of the current state of sending bits to the
685
 * compressed output buffer
686
 */
687
struct deflate_output_bitstream {
688
689
  /* Bits that haven't yet been written to the output buffer */
690
  bitbuf_t bitbuf;
691
692
  /*
693
   * Number of bits currently held in @bitbuf.  This can be between 0 and
694
   * BITBUF_NBITS in general, or between 0 and 7 after a flush.
695
   */
696
  unsigned bitcount;
697
698
  /*
699
   * Pointer to the position in the output buffer at which the next byte
700
   * should be written
701
   */
702
  u8 *next;
703
704
  /*
705
   * Pointer to near the end of the output buffer.  'next' will never
706
   * exceed this.  There are OUTPUT_END_PADDING bytes reserved after this
707
   * to allow branchlessly writing a whole word at this location.
708
   */
709
  u8 *end;
710
};
711
712
/*
713
 * OUTPUT_END_PADDING is the size, in bytes, of the extra space that must be
714
 * present following os->end, in order to not overrun the buffer when generating
715
 * output.  When UNALIGNED_ACCESS_IS_FAST, we need at least sizeof(bitbuf_t)
716
 * bytes for put_unaligned_leword().  Otherwise we need only 1 byte.  However,
717
 * to make the compression algorithm produce the same result on all CPU
718
 * architectures (which is sometimes desirable), we have to unconditionally use
719
 * the maximum for any CPU, which is sizeof(bitbuf_t) == 8.
720
 */
721
28.9k
#define OUTPUT_END_PADDING  8
722
723
/*
724
 * Add some bits to the bitbuffer variable of the output bitstream.  The caller
725
 * must ensure that 'bitcount + n <= BITBUF_NBITS', by calling FLUSH_BITS()
726
 * frequently enough.
727
 */
728
0
#define ADD_BITS(bits, n)     \
729
0
do {           \
730
0
  bitbuf |= (bitbuf_t)(bits) << bitcount; \
731
0
  bitcount += (n);      \
732
0
  ASSERT(bitcount <= BITBUF_NBITS); \
733
0
} while (0)
734
735
/* Flush bits from the bitbuffer variable to the output buffer. */
736
0
#define FLUSH_BITS()              \
737
0
do {                 \
738
0
  if (UNALIGNED_ACCESS_IS_FAST) {         \
739
0
    /* Flush a whole word (branchlessly). */    \
740
0
    put_unaligned_leword(bitbuf, out_next);     \
741
0
    bitbuf >>= bitcount & ~7;       \
742
0
    out_next += MIN(out_end - out_next, bitcount >> 3);  \
743
0
    bitcount &= 7;            \
744
0
  } else {             \
745
0
    /* Flush a byte at a time. */       \
746
0
    while (bitcount >= 8) {         \
747
0
      *out_next = bitbuf;       \
748
0
      if (out_next != out_end)     \
749
0
        out_next++;       \
750
0
      bitcount -= 8;          \
751
0
      bitbuf >>= 8;         \
752
0
    }             \
753
0
  }               \
754
0
} while (0)
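/*
 * Illustrative sketch, not part of the original file: how ADD_BITS() and
 * FLUSH_BITS() are typically used together.  The macros expect local
 * variables named 'bitbuf', 'bitcount', 'out_next', and 'out_end' to be in
 * scope, and the caller must ensure the new bits fit (e.g. CAN_BUFFER(len)).
 * The helper name is hypothetical; real callers keep these locals across
 * many codewords rather than reloading them per call.
 */
static void
example_write_codeword(struct deflate_output_bitstream *os,
		       u32 codeword, u8 len)
{
	bitbuf_t bitbuf = os->bitbuf;
	unsigned bitcount = os->bitcount;
	u8 *out_next = os->next;
	u8 *out_end = os->end;

	ADD_BITS(codeword, len);	/* buffer up to 'len' new bits */
	FLUSH_BITS();			/* write completed bytes to the output */

	os->bitbuf = bitbuf;
	os->bitcount = bitcount;
	os->next = out_next;
}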
755
756
/*
757
 * Given the binary tree node A[subtree_idx] whose children already satisfy the
758
 * maxheap property, swap the node with its greater child until it is greater
759
 * than or equal to both of its children, so that the maxheap property is
760
 * satisfied in the subtree rooted at A[subtree_idx].  'A' uses 1-based indices.
761
 */
762
static void
763
heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx)
764
0
{
765
0
  unsigned parent_idx;
766
0
  unsigned child_idx;
767
0
  u32 v;
768
769
0
  v = A[subtree_idx];
770
0
  parent_idx = subtree_idx;
771
0
  while ((child_idx = parent_idx * 2) <= length) {
772
0
    if (child_idx < length && A[child_idx + 1] > A[child_idx])
773
0
      child_idx++;
774
0
    if (v >= A[child_idx])
775
0
      break;
776
0
    A[parent_idx] = A[child_idx];
777
0
    parent_idx = child_idx;
778
0
  }
779
0
  A[parent_idx] = v;
780
0
}
781
782
/*
783
 * Rearrange the array 'A' so that it satisfies the maxheap property.
784
 * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1].
785
 */
786
static void
787
heapify_array(u32 A[], unsigned length)
788
0
{
789
0
  unsigned subtree_idx;
790
791
0
  for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--)
792
0
    heapify_subtree(A, length, subtree_idx);
793
0
}
794
795
/*
796
 * Sort the array 'A', which contains 'length' unsigned 32-bit integers.
797
 *
798
 * Note: name this function heap_sort() instead of heapsort() to avoid colliding
799
 * with heapsort() from stdlib.h on BSD-derived systems.
800
 */
801
static void
802
heap_sort(u32 A[], unsigned length)
803
0
{
804
0
  A--; /* Use 1-based indices  */
805
806
0
  heapify_array(A, length);
807
808
0
  while (length >= 2) {
809
0
    u32 tmp = A[length];
810
811
0
    A[length] = A[1];
812
0
    A[1] = tmp;
813
0
    length--;
814
0
    heapify_subtree(A, length, 1);
815
0
  }
816
0
}
817
818
0
#define NUM_SYMBOL_BITS 10
819
#define NUM_FREQ_BITS (32 - NUM_SYMBOL_BITS)
820
0
#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1)
821
0
#define FREQ_MASK (~SYMBOL_MASK)
822
823
0
#define GET_NUM_COUNTERS(num_syms)  (num_syms)
824
825
/*
826
 * Sort the symbols primarily by frequency and secondarily by symbol value.
827
 * Discard symbols with zero frequency and fill in an array with the remaining
828
 * symbols, along with their frequencies.  The low NUM_SYMBOL_BITS bits of each
829
 * array entry will contain the symbol value, and the remaining bits will
830
 * contain the frequency.
831
 *
832
 * @num_syms
833
 *  Number of symbols in the alphabet, at most 1 << NUM_SYMBOL_BITS.
834
 *
835
 * @freqs[num_syms]
836
 *  Frequency of each symbol, summing to at most (1 << NUM_FREQ_BITS) - 1.
837
 *
838
 * @lens[num_syms]
839
 *  An array that eventually will hold the length of each codeword.  This
840
 *  function only fills in the codeword lengths for symbols that have zero
841
 *  frequency, which are not well defined per se but will be set to 0.
842
 *
843
 * @symout[num_syms]
844
 *  The output array, described above.
845
 *
846
 * Returns the number of entries in 'symout' that were filled.  This is the
847
 * number of symbols that have nonzero frequency.
848
 */
849
static unsigned
850
sort_symbols(unsigned num_syms, const u32 freqs[], u8 lens[], u32 symout[])
851
0
{
852
0
  unsigned sym;
853
0
  unsigned i;
854
0
  unsigned num_used_syms;
855
0
  unsigned num_counters;
856
0
  unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)];
857
858
  /*
859
   * We use heapsort, but with an added optimization.  Since often most
860
   * symbol frequencies are low, we first do a count sort using a limited
861
   * number of counters.  High frequencies are counted in the last
862
   * counter, and only they will be sorted with heapsort.
863
   *
864
   * Note: with more symbols, it is generally beneficial to have more
865
   * counters.  About 1 counter per symbol seems fastest.
866
   */
867
868
0
  num_counters = GET_NUM_COUNTERS(num_syms);
869
870
0
  memset(counters, 0, num_counters * sizeof(counters[0]));
871
872
  /* Count the frequencies. */
873
0
  for (sym = 0; sym < num_syms; sym++)
874
0
    counters[MIN(freqs[sym], num_counters - 1)]++;
875
876
  /*
877
   * Make the counters cumulative, ignoring the zero-th, which counted
878
   * symbols with zero frequency.  As a side effect, this calculates the
879
   * number of symbols with nonzero frequency.
880
   */
881
0
  num_used_syms = 0;
882
0
  for (i = 1; i < num_counters; i++) {
883
0
    unsigned count = counters[i];
884
885
0
    counters[i] = num_used_syms;
886
0
    num_used_syms += count;
887
0
  }
888
889
  /*
890
   * Sort nonzero-frequency symbols using the counters.  At the same time,
891
   * set the codeword lengths of zero-frequency symbols to 0.
892
   */
893
0
  for (sym = 0; sym < num_syms; sym++) {
894
0
    u32 freq = freqs[sym];
895
896
0
    if (freq != 0) {
897
0
      symout[counters[MIN(freq, num_counters - 1)]++] =
898
0
        sym | (freq << NUM_SYMBOL_BITS);
899
0
    } else {
900
0
      lens[sym] = 0;
901
0
    }
902
0
  }
903
904
  /* Sort the symbols counted in the last counter. */
905
0
  heap_sort(symout + counters[num_counters - 2],
906
0
      counters[num_counters - 1] - counters[num_counters - 2]);
907
908
0
  return num_used_syms;
909
0
}
910
911
/*
912
 * Build a Huffman tree.
913
 *
914
 * This is an optimized implementation that
915
 *  (a) takes advantage of the frequencies being already sorted;
916
 *  (b) only generates non-leaf nodes, since the non-leaf nodes of a Huffman
917
 *      tree are sufficient to generate a canonical code;
918
 *  (c) Only stores parent pointers, not child pointers;
919
 *  (d) Produces the nodes in the same memory used for input frequency
920
 *      information.
921
 *
922
 * Array 'A', which contains 'sym_count' entries, is used for both input and
923
 * output.  For this function, 'sym_count' must be at least 2.
924
 *
925
 * For input, the array must contain the frequencies of the symbols, sorted in
926
 * increasing order.  Specifically, each entry must contain a frequency left
927
 * shifted by NUM_SYMBOL_BITS bits.  Any data in the low NUM_SYMBOL_BITS bits of
928
 * the entries will be ignored by this function.  Although these bits will, in
929
 * fact, contain the symbols that correspond to the frequencies, this function
930
 * is concerned with frequencies only and keeps the symbols as-is.
931
 *
932
 * For output, this function will produce the non-leaf nodes of the Huffman
933
 * tree.  These nodes will be stored in the first (sym_count - 1) entries of the
934
 * array.  Entry A[sym_count - 2] will represent the root node.  Each other node
935
 * will contain the zero-based index of its parent node in 'A', left shifted by
936
 * NUM_SYMBOL_BITS bits.  The low NUM_SYMBOL_BITS bits of each entry in A will
937
 * be kept as-is.  Again, note that although these low bits will, in fact,
938
 * contain a symbol value, this symbol will have *no relationship* with the
939
 * Huffman tree node that happens to occupy the same slot.  This is because this
940
 * implementation only generates the non-leaf nodes of the tree.
941
 */
942
static void
943
build_tree(u32 A[], unsigned sym_count)
944
0
{
945
0
  const unsigned last_idx = sym_count - 1;
946
947
  /* Index of the next lowest frequency leaf that still needs a parent */
948
0
  unsigned i = 0;
949
950
  /*
951
   * Index of the next lowest frequency non-leaf that still needs a
952
   * parent, or 'e' if there is currently no such node
953
   */
954
0
  unsigned b = 0;
955
956
  /* Index of the next spot for a non-leaf (will overwrite a leaf) */
957
0
  unsigned e = 0;
958
959
0
  do {
960
0
    u32 new_freq;
961
962
    /*
963
     * Select the next two lowest frequency nodes among the leaves
964
     * A[i] and non-leaves A[b], and create a new node A[e] to be
965
     * their parent.  Set the new node's frequency to the sum of the
966
     * frequencies of its two children.
967
     *
968
     * Usually the next two lowest frequency nodes are of the same
969
     * type (leaf or non-leaf), so check those cases first.
970
     */
971
0
    if (i + 1 <= last_idx &&
972
0
        (b == e || (A[i + 1] & FREQ_MASK) <= (A[b] & FREQ_MASK))) {
973
      /* Two leaves */
974
0
      new_freq = (A[i] & FREQ_MASK) + (A[i + 1] & FREQ_MASK);
975
0
      i += 2;
976
0
    } else if (b + 2 <= e &&
977
0
         (i > last_idx ||
978
0
          (A[b + 1] & FREQ_MASK) < (A[i] & FREQ_MASK))) {
979
      /* Two non-leaves */
980
0
      new_freq = (A[b] & FREQ_MASK) + (A[b + 1] & FREQ_MASK);
981
0
      A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK);
982
0
      A[b + 1] = (e << NUM_SYMBOL_BITS) |
983
0
           (A[b + 1] & SYMBOL_MASK);
984
0
      b += 2;
985
0
    } else {
986
      /* One leaf and one non-leaf */
987
0
      new_freq = (A[i] & FREQ_MASK) + (A[b] & FREQ_MASK);
988
0
      A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK);
989
0
      i++;
990
0
      b++;
991
0
    }
992
0
    A[e] = new_freq | (A[e] & SYMBOL_MASK);
993
    /*
994
     * A binary tree with 'n' leaves has 'n - 1' non-leaves, so the
995
     * tree is complete once we've created 'n - 1' non-leaves.
996
     */
997
0
  } while (++e < last_idx);
998
0
}
999
1000
/*
1001
 * Given the stripped-down Huffman tree constructed by build_tree(), determine
1002
 * the number of codewords that should be assigned each possible length, taking
1003
 * into account the length-limited constraint.
1004
 *
1005
 * @A
1006
 *  The array produced by build_tree(), containing parent index information
1007
 *  for the non-leaf nodes of the Huffman tree.  Each entry in this array is
1008
 *  a node; a node's parent always has a greater index than that node
1009
 *  itself.  This function will overwrite the parent index information in
1010
 *  this array, so essentially it will destroy the tree.  However, the data
1011
 *  in the low NUM_SYMBOL_BITS of each entry will be preserved.
1012
 *
1013
 * @root_idx
1014
 *  The 0-based index of the root node in 'A', and consequently one less
1015
 *  than the number of tree node entries in 'A'.  (Or, really 2 less than
1016
 *  the actual length of 'A'.)
1017
 *
1018
 * @len_counts
1019
 *  An array of length ('max_codeword_len' + 1) in which the number of
1020
 *  codewords having each length <= max_codeword_len will be returned.
1021
 *
1022
 * @max_codeword_len
1023
 *  The maximum permissible codeword length.
1024
 */
1025
static void
1026
compute_length_counts(u32 A[], unsigned root_idx, unsigned len_counts[],
1027
          unsigned max_codeword_len)
1028
0
{
1029
0
  unsigned len;
1030
0
  int node;
1031
1032
  /*
1033
   * The key observations are:
1034
   *
1035
   * (1) We can traverse the non-leaf nodes of the tree, always visiting a
1036
   *     parent before its children, by simply iterating through the array
1037
   *     in reverse order.  Consequently, we can compute the depth of each
1038
   *     node in one pass, overwriting the parent indices with depths.
1039
   *
1040
   * (2) We can initially assume that in the real Huffman tree, both
1041
   *     children of the root are leaves.  This corresponds to two
1042
   *     codewords of length 1.  Then, whenever we visit a (non-leaf) node
1043
   *     during the traversal, we modify this assumption to account for
1044
   *     the current node *not* being a leaf, but rather its two children
1045
   *     being leaves.  This causes the loss of one codeword for the
1046
   *     current depth and the addition of two codewords for the current
1047
   *     depth plus one.
1048
   *
1049
   * (3) We can handle the length-limited constraint fairly easily by
1050
   *     simply using the largest length available when a depth exceeds
1051
   *     max_codeword_len.
1052
   */
1053
1054
0
  for (len = 0; len <= max_codeword_len; len++)
1055
0
    len_counts[len] = 0;
1056
0
  len_counts[1] = 2;
1057
1058
  /* Set the root node's depth to 0. */
1059
0
  A[root_idx] &= SYMBOL_MASK;
1060
1061
0
  for (node = root_idx - 1; node >= 0; node--) {
1062
1063
    /* Calculate the depth of this node. */
1064
1065
0
    unsigned parent = A[node] >> NUM_SYMBOL_BITS;
1066
0
    unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS;
1067
0
    unsigned depth = parent_depth + 1;
1068
1069
    /*
1070
     * Set the depth of this node so that it is available when its
1071
     * children (if any) are processed.
1072
     */
1073
0
    A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS);
1074
1075
    /*
1076
     * If needed, decrease the length to meet the length-limited
1077
     * constraint.  This is not the optimal method for generating
1078
     * length-limited Huffman codes!  But it should be good enough.
1079
     */
1080
0
    if (depth >= max_codeword_len) {
1081
0
      depth = max_codeword_len;
1082
0
      do {
1083
0
        depth--;
1084
0
      } while (len_counts[depth] == 0);
1085
0
    }
1086
1087
    /*
1088
     * Account for the fact that we have a non-leaf node at the
1089
     * current depth.
1090
     */
1091
0
    len_counts[depth]--;
1092
0
    len_counts[depth + 1] += 2;
1093
0
  }
1094
0
}
1095
1096
/*
1097
 * DEFLATE uses bit-reversed codewords, so we must bit-reverse the codewords
1098
 * after generating them.  All codewords have length <= 16 bits.  If the CPU has
1099
 * a bit-reversal instruction, then that is the fastest method.  Otherwise the
1100
 * fastest method is to reverse the bits in each of the two bytes using a table.
1101
 * The table method is slightly faster than using bitwise operations to flip
1102
 * adjacent 1, 2, 4, and then 8-bit fields, even if 2 to 4 codewords are packed
1103
 * into a machine word and processed together using that method.
1104
 */
1105
1106
#ifdef rbit32
1107
static forceinline u32 reverse_codeword(u32 codeword, u8 len)
1108
{
1109
  return rbit32(codeword) >> ((32 - len) & 31);
1110
}
1111
#else
1112
/* Generated by scripts/gen_bitreverse_tab.py */
1113
static const u8 bitreverse_tab[256] = {
1114
  0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0,
1115
  0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0,
1116
  0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8,
1117
  0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8,
1118
  0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4,
1119
  0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4,
1120
  0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec,
1121
  0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc,
1122
  0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2,
1123
  0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2,
1124
  0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea,
1125
  0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa,
1126
  0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6,
1127
  0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6,
1128
  0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee,
1129
  0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe,
1130
  0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1,
1131
  0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1,
1132
  0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9,
1133
  0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9,
1134
  0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5,
1135
  0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5,
1136
  0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed,
1137
  0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd,
1138
  0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3,
1139
  0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3,
1140
  0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb,
1141
  0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb,
1142
  0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7,
1143
  0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7,
1144
  0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef,
1145
  0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff,
1146
};
1147
1148
static forceinline u32 reverse_codeword(u32 codeword, u8 len)
1149
0
{
1150
0
  STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16);
1151
0
  codeword = ((u32)bitreverse_tab[codeword & 0xff] << 8) |
1152
0
       bitreverse_tab[codeword >> 8];
1153
0
  return codeword >> (16 - len);
1154
0
}
1155
#endif /* !rbit32 */
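/*
 * Illustrative note, not part of the original file: reverse_codeword(0x3, 3)
 * maps the codeword 011 (binary) to 110 (binary).  With the table method,
 * bitreverse_tab[0x03] == 0xc0 and bitreverse_tab[0x00] == 0x00, so
 * ((0xc0 << 8) | 0x00) >> (16 - 3) == 0x6.
 */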
1156
1157
/*
1158
 * Generate the codewords for a canonical Huffman code.
1159
 *
1160
 * @A
1161
 *  The output array for codewords.  In addition, initially this
1162
 *  array must contain the symbols, sorted primarily by frequency and
1163
 *  secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of
1164
 *  each entry.
1165
 *
1166
 * @lens
1167
 *  Output array for codeword lengths.
1168
 *
1169
 * @len_counts
1170
 *  An array that provides the number of codewords that will have
1171
 *  each possible length <= max_codeword_len.
1172
 *
1173
 * @max_codeword_len
1174
 *  Maximum length, in bits, of each codeword.
1175
 *
1176
 * @num_syms
1177
 *  Number of symbols in the alphabet, including symbols with zero
1178
 *  frequency.  This is the length of the 'A' and 'lens' arrays.
1179
 */
1180
static void
1181
gen_codewords(u32 A[], u8 lens[], const unsigned len_counts[],
1182
        unsigned max_codeword_len, unsigned num_syms)
1183
0
{
1184
0
  u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1];
1185
0
  unsigned i;
1186
0
  unsigned len;
1187
0
  unsigned sym;
1188
1189
  /*
1190
   * Given the number of codewords that will have each length, assign
1191
   * codeword lengths to symbols.  We do this by assigning the lengths in
1192
   * decreasing order to the symbols sorted primarily by increasing
1193
   * frequency and secondarily by increasing symbol value.
1194
   */
1195
0
  for (i = 0, len = max_codeword_len; len >= 1; len--) {
1196
0
    unsigned count = len_counts[len];
1197
1198
0
    while (count--)
1199
0
      lens[A[i++] & SYMBOL_MASK] = len;
1200
0
  }
1201
1202
  /*
1203
   * Generate the codewords themselves.  We initialize the
1204
   * 'next_codewords' array to provide the lexicographically first
1205
   * codeword of each length, then assign codewords in symbol order.  This
1206
   * produces a canonical code.
1207
   */
1208
0
  next_codewords[0] = 0;
1209
0
  next_codewords[1] = 0;
1210
0
  for (len = 2; len <= max_codeword_len; len++)
1211
0
    next_codewords[len] =
1212
0
      (next_codewords[len - 1] + len_counts[len - 1]) << 1;
1213
1214
0
  for (sym = 0; sym < num_syms; sym++) {
1215
    /* DEFLATE requires bit-reversed codewords. */
1216
0
    A[sym] = reverse_codeword(next_codewords[lens[sym]]++,
1217
0
            lens[sym]);
1218
0
  }
1219
0
}
1220
1221
/*
1222
 * ---------------------------------------------------------------------
1223
 *      deflate_make_huffman_code()
1224
 * ---------------------------------------------------------------------
1225
 *
1226
 * Given an alphabet and the frequency of each symbol in it, construct a
1227
 * length-limited canonical Huffman code.
1228
 *
1229
 * @num_syms
1230
 *  The number of symbols in the alphabet.  The symbols are the integers in
1231
 *  the range [0, num_syms - 1].  This parameter must be at least 2 and
1232
 *  must not exceed (1 << NUM_SYMBOL_BITS).
1233
 *
1234
 * @max_codeword_len
1235
 *  The maximum permissible codeword length.
1236
 *
1237
 * @freqs
1238
 *  An array of length @num_syms that gives the frequency of each symbol.
1239
 *  It is valid for some, none, or all of the frequencies to be 0.  The sum
1240
 *  of frequencies must not exceed (1 << NUM_FREQ_BITS) - 1.
1241
 *
1242
 * @lens
1243
 *  An array of @num_syms entries in which this function will return the
1244
 *  length, in bits, of the codeword assigned to each symbol.  Symbols with
1245
 *  0 frequency will not have codewords per se, but their entries in this
1246
 *  array will be set to 0.  No lengths greater than @max_codeword_len will
1247
 *  be assigned.
1248
 *
1249
 * @codewords
1250
 *  An array of @num_syms entries in which this function will return the
1251
 *  codeword for each symbol, right-justified and padded on the left with
1252
 *  zeroes.  Codewords for symbols with 0 frequency will be undefined.
1253
 *
1254
 * ---------------------------------------------------------------------
1255
 *
1256
 * This function builds a length-limited canonical Huffman code.
1257
 *
1258
 * A length-limited Huffman code contains no codewords longer than some
1259
 * specified length, and has exactly (with some algorithms) or approximately
1260
 * (with the algorithm used here) the minimum weighted path length from the
1261
 * root, given this constraint.
1262
 *
1263
 * A canonical Huffman code satisfies the properties that a longer codeword
1264
 * never lexicographically precedes a shorter codeword, and the lexicographic
1265
 * ordering of codewords of the same length is the same as the lexicographic
1266
 * ordering of the corresponding symbols.  A canonical Huffman code, or more
1267
 * generally a canonical prefix code, can be reconstructed from only a list
1268
 * containing the codeword length of each symbol.
1269
 *
1270
 * The classic algorithm to generate a Huffman code creates a node for each
1271
 * symbol, then inserts these nodes into a min-heap keyed by symbol frequency.
1272
 * Then, repeatedly, the two lowest-frequency nodes are removed from the
1273
 * min-heap and added as the children of a new node having frequency equal to
1274
 * the sum of its two children, which is then inserted into the min-heap.  When
1275
 * only a single node remains in the min-heap, it is the root of the Huffman
1276
 * tree.  The codeword for each symbol is determined by the path needed to reach
1277
 * the corresponding node from the root.  Descending to the left child appends a
1278
 * 0 bit, whereas descending to the right child appends a 1 bit.
1279
 *
1280
 * The classic algorithm is relatively easy to understand, but it is subject to
1281
 * a number of inefficiencies.  In practice, it is fastest to first sort the
1282
 * symbols by frequency.  (This itself can be subject to an optimization based
1283
 * on the fact that most frequencies tend to be low.)  At the same time, we sort
1284
 * secondarily by symbol value, which aids the process of generating a canonical
1285
 * code.  Then, during tree construction, no heap is necessary because both the
1286
 * leaf nodes and the unparented non-leaf nodes can be easily maintained in
1287
 * sorted order.  Consequently, there can never be more than two possibilities
1288
 * for the next-lowest-frequency node.
1289
 *
1290
 * In addition, because we're generating a canonical code, we actually don't
1291
 * need the leaf nodes of the tree at all, only the non-leaf nodes.  This is
1292
 * because for canonical code generation we don't need to know where the symbols
1293
 * are in the tree.  Rather, we only need to know how many leaf nodes have each
1294
 * depth (codeword length).  And this information can, in fact, be quickly
1295
 * generated from the tree of non-leaves only.
1296
 *
1297
 * Furthermore, we can build this stripped-down Huffman tree directly in the
1298
 * array in which the codewords are to be generated, provided that these array
1299
 * slots are large enough to hold a symbol and frequency value.
1300
 *
1301
 * Still furthermore, we don't even need to maintain explicit child pointers.
1302
 * We only need the parent pointers, and even those can be overwritten in-place
1303
 * with depth information as part of the process of extracting codeword lengths
1304
 * from the tree.  So in summary, we do NOT need a big structure like:
1305
 *
1306
 *  struct huffman_tree_node {
1307
 *    unsigned int symbol;
1308
 *    unsigned int frequency;
1309
 *    unsigned int depth;
1310
 *    struct huffman_tree_node *left_child;
1311
 *    struct huffman_tree_node *right_child;
1312
 *  };
1313
 *
1314
 *
1315
 * ... which often gets used in "naive" implementations of Huffman code
1316
 * generation.
1317
 *
1318
 * Many of these optimizations are based on the implementation in 7-Zip (source
1319
 * file: C/HuffEnc.c), which was placed in the public domain by Igor Pavlov.
1320
 */
1321
static void
1322
deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len,
1323
        const u32 freqs[], u8 lens[], u32 codewords[])
1324
0
{
1325
0
  u32 *A = codewords;
1326
0
  unsigned num_used_syms;
1327
1328
0
  STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS);
1329
0
  STATIC_ASSERT(MAX_BLOCK_LENGTH <= ((u32)1 << NUM_FREQ_BITS) - 1);
1330
1331
  /*
1332
   * We begin by sorting the symbols primarily by frequency and
1333
   * secondarily by symbol value.  As an optimization, the array used for
1334
   * this purpose ('A') shares storage with the space in which we will
1335
   * eventually return the codewords.
1336
   */
1337
0
  num_used_syms = sort_symbols(num_syms, freqs, lens, A);
1338
1339
  /*
1340
   * 'num_used_syms' is the number of symbols with nonzero frequency.
1341
   * This may be less than @num_syms.  'num_used_syms' is also the number
1342
   * of entries in 'A' that are valid.  Each entry consists of a distinct
1343
   * symbol and a nonzero frequency packed into a 32-bit integer.
1344
   */
1345
1346
  /*
1347
   * Handle special cases where only 0 or 1 symbols were used (had nonzero
1348
   * frequency).
1349
   */
1350
1351
0
  if (unlikely(num_used_syms == 0)) {
1352
    /*
1353
     * Code is empty.  sort_symbols() already set all lengths to 0,
1354
     * so there is nothing more to do.
1355
     */
1356
0
    return;
1357
0
  }
1358
1359
0
  if (unlikely(num_used_syms == 1)) {
1360
    /*
1361
     * Only one symbol was used, so we only need one codeword.  But
1362
     * two codewords are needed to form the smallest complete
1363
     * Huffman code, which uses codewords 0 and 1.  Therefore, we
1364
     * choose another symbol to which to assign a codeword.  We use
1365
     * 0 (if the used symbol is not 0) or 1 (if the used symbol is
1366
     * 0).  In either case, the lesser-valued symbol must be
1367
     * assigned codeword 0 so that the resulting code is canonical.
1368
     */
1369
1370
0
    unsigned sym = A[0] & SYMBOL_MASK;
1371
0
    unsigned nonzero_idx = sym ? sym : 1;
1372
1373
0
    codewords[0] = 0;
1374
0
    lens[0] = 1;
1375
0
    codewords[nonzero_idx] = 1;
1376
0
    lens[nonzero_idx] = 1;
1377
0
    return;
1378
0
  }
1379
1380
  /*
1381
   * Build a stripped-down version of the Huffman tree, sharing the array
1382
   * 'A' with the symbol values.  Then extract length counts from the tree
1383
   * and use them to generate the final codewords.
1384
   */
1385
1386
0
  build_tree(A, num_used_syms);
1387
1388
0
  {
1389
0
    unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1];
1390
1391
0
    compute_length_counts(A, num_used_syms - 2,
1392
0
              len_counts, max_codeword_len);
1393
1394
0
    gen_codewords(A, lens, len_counts, max_codeword_len, num_syms);
1395
0
  }
1396
0
}
1397
1398
/*
1399
 * Clear the Huffman symbol frequency counters.  This must be called when
1400
 * starting a new DEFLATE block.
1401
 */
1402
static void
1403
deflate_reset_symbol_frequencies(struct libdeflate_compressor *c)
1404
0
{
1405
0
  memset(&c->freqs, 0, sizeof(c->freqs));
1406
0
}
1407
1408
/*
1409
 * Build the literal/length and offset Huffman codes for a DEFLATE block.
1410
 *
1411
 * This takes as input the frequency tables for each alphabet and produces as
1412
 * output a set of tables that map symbols to codewords and codeword lengths.
1413
 */
1414
static void
1415
deflate_make_huffman_codes(const struct deflate_freqs *freqs,
1416
         struct deflate_codes *codes)
1417
0
{
1418
0
  deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS,
1419
0
          MAX_LITLEN_CODEWORD_LEN,
1420
0
          freqs->litlen,
1421
0
          codes->lens.litlen,
1422
0
          codes->codewords.litlen);
1423
1424
0
  deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS,
1425
0
          MAX_OFFSET_CODEWORD_LEN,
1426
0
          freqs->offset,
1427
0
          codes->lens.offset,
1428
0
          codes->codewords.offset);
1429
0
}
1430
1431
/* Initialize c->static_codes. */
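/*
 * The frequencies assigned below are 1 << (9 - len), i.e. proportional to
 * 2^-len, so that the normal Huffman construction reproduces exactly the
 * codeword lengths fixed by the DEFLATE static codes: 8 bits for literals
 * 0-143, 9 bits for 144-255, 7 bits for symbols 256-279, 8 bits for 280-287,
 * and 5 bits for every offset symbol.
 */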
1432
static void
1433
deflate_init_static_codes(struct libdeflate_compressor *c)
1434
0
{
1435
0
  unsigned i;
1436
1437
0
  for (i = 0; i < 144; i++)
1438
0
    c->freqs.litlen[i] = 1 << (9 - 8);
1439
0
  for (; i < 256; i++)
1440
0
    c->freqs.litlen[i] = 1 << (9 - 9);
1441
0
  for (; i < 280; i++)
1442
0
    c->freqs.litlen[i] = 1 << (9 - 7);
1443
0
  for (; i < 288; i++)
1444
0
    c->freqs.litlen[i] = 1 << (9 - 8);
1445
1446
0
  for (i = 0; i < 32; i++)
1447
0
    c->freqs.offset[i] = 1 << (5 - 5);
1448
1449
0
  deflate_make_huffman_codes(&c->freqs, &c->static_codes);
1450
0
}
1451
1452
/* Return the offset slot for the given match offset, using the small map. */
1453
static forceinline unsigned
1454
deflate_get_offset_slot(unsigned offset)
1455
0
{
1456
0
#if 1
1457
0
  if (offset <= 256)
1458
0
    return deflate_offset_slot[offset];
1459
0
  else
1460
0
    return deflate_offset_slot[256 + ((offset - 1) >> 7)];
1461
#else /* Branchless version */
1462
  u32 i1 = offset;
1463
  u32 i2 = 256 + ((offset - 1) >> 7);
1464
  u32 is_small = (s32)(offset - 257) >> 31;
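  /*
   * The arithmetic right shift turns offsets <= 256 into an all-ones mask
   * and larger offsets into zero, so exactly one of i1/i2 is selected below
   * without a branch.
   */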
1465
1466
  return deflate_offset_slot[(i1 & is_small) ^ (i2 & ~is_small)];
1467
#endif
1468
0
}
1469
1470
static unsigned
1471
deflate_compute_precode_items(const u8 lens[], const unsigned num_lens,
1472
            u32 precode_freqs[], unsigned precode_items[])
1473
0
{
1474
0
  unsigned *itemptr;
1475
0
  unsigned run_start;
1476
0
  unsigned run_end;
1477
0
  unsigned extra_bits;
1478
0
  u8 len;
1479
1480
0
  memset(precode_freqs, 0,
1481
0
         DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0]));
1482
1483
0
  itemptr = precode_items;
1484
0
  run_start = 0;
1485
0
  do {
1486
    /* Find the next run of codeword lengths. */
1487
1488
    /* len = the length being repeated */
1489
0
    len = lens[run_start];
1490
1491
    /* Extend the run. */
1492
0
    run_end = run_start;
1493
0
    do {
1494
0
      run_end++;
1495
0
    } while (run_end != num_lens && len == lens[run_end]);
1496
1497
0
    if (len == 0) {
1498
      /* Run of zeroes. */
1499
1500
      /* Symbol 18: RLE 11 to 138 zeroes at a time. */
1501
0
      while ((run_end - run_start) >= 11) {
1502
0
        extra_bits = MIN((run_end - run_start) - 11,
1503
0
             0x7F);
1504
0
        precode_freqs[18]++;
1505
0
        *itemptr++ = 18 | (extra_bits << 5);
1506
0
        run_start += 11 + extra_bits;
1507
0
      }
1508
1509
      /* Symbol 17: RLE 3 to 10 zeroes at a time. */
1510
0
      if ((run_end - run_start) >= 3) {
1511
0
        extra_bits = MIN((run_end - run_start) - 3,
1512
0
             0x7);
1513
0
        precode_freqs[17]++;
1514
0
        *itemptr++ = 17 | (extra_bits << 5);
1515
0
        run_start += 3 + extra_bits;
1516
0
      }
1517
0
    } else {
1518
1519
      /* A run of nonzero lengths. */
1520
1521
      /* Symbol 16: RLE 3 to 6 of the previous length. */
1522
0
      if ((run_end - run_start) >= 4) {
1523
0
        precode_freqs[len]++;
1524
0
        *itemptr++ = len;
1525
0
        run_start++;
1526
0
        do {
1527
0
          extra_bits = MIN((run_end - run_start) -
1528
0
               3, 0x3);
1529
0
          precode_freqs[16]++;
1530
0
          *itemptr++ = 16 | (extra_bits << 5);
1531
0
          run_start += 3 + extra_bits;
1532
0
        } while ((run_end - run_start) >= 3);
1533
0
      }
1534
0
    }
1535
1536
    /* Output any remaining lengths without RLE. */
1537
0
    while (run_start != run_end) {
1538
0
      precode_freqs[len]++;
1539
0
      *itemptr++ = len;
1540
0
      run_start++;
1541
0
    }
1542
0
  } while (run_start != num_lens);
1543
1544
0
  return itemptr - precode_items;
1545
0
}
1546
1547
/*
1548
 * Huffman codeword lengths for dynamic Huffman blocks are compressed using a
1549
 * separate Huffman code, the "precode", which contains a symbol for each
1550
 * possible codeword length in the larger code as well as several special
1551
 * symbols to represent repeated codeword lengths (a form of run-length
1552
 * encoding).  The precode is itself constructed in canonical form, and its
1553
 * codeword lengths are represented literally in 19 3-bit fields that
1554
 * immediately precede the compressed codeword lengths of the larger code.
1555
 */
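/*
 * A small worked example (hypothetical lengths, not taken from real data):
 * suppose the codeword lengths to transmit begin 4,4,4,4,4,4, then fifteen
 * 0s, then 2.  deflate_compute_precode_items() above turns the run of six 4s
 * into the literal precode symbol 4 followed by symbol 16 with extra bits 2
 * ("repeat the previous length 3+2 = 5 times"), the fifteen 0s into symbol 18
 * with extra bits 4 ("11+4 = 15 zeroes"), and the final 2 into a literal.
 * Each item is packed as (symbol | extra_bits << 5), and precode_freqs counts
 * one occurrence of each of the symbols 4, 16, 18, and 2.
 */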
1556
1557
/* Precompute the information needed to output dynamic Huffman codes. */
1558
static void
1559
deflate_precompute_huffman_header(struct libdeflate_compressor *c)
1560
0
{
1561
  /* Compute how many litlen and offset symbols are needed. */
1562
1563
0
  for (c->o.precode.num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS;
1564
0
       c->o.precode.num_litlen_syms > 257;
1565
0
       c->o.precode.num_litlen_syms--)
1566
0
    if (c->codes.lens.litlen[c->o.precode.num_litlen_syms - 1] != 0)
1567
0
      break;
1568
1569
0
  for (c->o.precode.num_offset_syms = DEFLATE_NUM_OFFSET_SYMS;
1570
0
       c->o.precode.num_offset_syms > 1;
1571
0
       c->o.precode.num_offset_syms--)
1572
0
    if (c->codes.lens.offset[c->o.precode.num_offset_syms - 1] != 0)
1573
0
      break;
1574
1575
  /*
1576
   * If we're not using the full set of literal/length codeword lengths,
1577
   * then temporarily move the offset codeword lengths over so that the
1578
   * literal/length and offset codeword lengths are contiguous.
1579
   */
1580
0
  STATIC_ASSERT(offsetof(struct deflate_lens, offset) ==
1581
0
          DEFLATE_NUM_LITLEN_SYMS);
1582
0
  if (c->o.precode.num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
1583
0
    memmove((u8 *)&c->codes.lens + c->o.precode.num_litlen_syms,
1584
0
      (u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
1585
0
      c->o.precode.num_offset_syms);
1586
0
  }
1587
1588
  /*
1589
   * Compute the "items" (RLE / literal tokens and extra bits) with which
1590
   * the codeword lengths in the larger code will be output.
1591
   */
1592
0
  c->o.precode.num_items =
1593
0
    deflate_compute_precode_items((u8 *)&c->codes.lens,
1594
0
                c->o.precode.num_litlen_syms +
1595
0
                c->o.precode.num_offset_syms,
1596
0
                c->o.precode.freqs,
1597
0
                c->o.precode.items);
1598
1599
  /* Build the precode. */
1600
0
  deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS,
1601
0
          MAX_PRE_CODEWORD_LEN,
1602
0
          c->o.precode.freqs, c->o.precode.lens,
1603
0
          c->o.precode.codewords);
1604
1605
  /* Count how many precode lengths we actually need to output. */
1606
0
  for (c->o.precode.num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS;
1607
0
       c->o.precode.num_explicit_lens > 4;
1608
0
       c->o.precode.num_explicit_lens--)
1609
0
    if (c->o.precode.lens[deflate_precode_lens_permutation[
1610
0
        c->o.precode.num_explicit_lens - 1]] != 0)
1611
0
      break;
1612
1613
  /* Restore the offset codeword lengths if needed. */
1614
0
  if (c->o.precode.num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
1615
0
    memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
1616
0
      (u8 *)&c->codes.lens + c->o.precode.num_litlen_syms,
1617
0
      c->o.precode.num_offset_syms);
1618
0
  }
1619
0
}
1620
1621
/*
1622
 * To make it faster to output matches, compute the "full" match length
1623
 * codewords, i.e. the concatenation of the litlen codeword and the extra bits
1624
 * for each possible match length.
1625
 */
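/*
 * For example, match length 18 falls in length slot 11, i.e. litlen symbol
 * 268 (base length 17, 1 extra bit), so its precomputed entry is
 * codewords.litlen[268] | (1 << lens.litlen[268]) with a total bit count of
 * lens.litlen[268] + 1, letting the whole length be written with a single
 * ADD_BITS().
 */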
1626
static void
1627
deflate_compute_full_len_codewords(struct libdeflate_compressor *c,
1628
           const struct deflate_codes *codes)
1629
0
{
1630
0
  unsigned len;
1631
1632
0
  STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN +
1633
0
          DEFLATE_MAX_EXTRA_LENGTH_BITS <= 32);
1634
1635
0
  for (len = DEFLATE_MIN_MATCH_LEN; len <= DEFLATE_MAX_MATCH_LEN; len++) {
1636
0
    unsigned slot = deflate_length_slot[len];
1637
0
    unsigned litlen_sym = DEFLATE_FIRST_LEN_SYM + slot;
1638
0
    u32 extra_bits = len - deflate_length_slot_base[slot];
1639
1640
0
    c->o.length.codewords[len] =
1641
0
      codes->codewords.litlen[litlen_sym] |
1642
0
      (extra_bits << codes->lens.litlen[litlen_sym]);
1643
0
    c->o.length.lens[len] = codes->lens.litlen[litlen_sym] +
1644
0
          deflate_extra_length_bits[slot];
1645
0
  }
1646
0
}
1647
1648
/* Write a match to the output buffer. */
1649
0
#define WRITE_MATCH(c_, codes_, length_, offset_, offset_slot_)   \
1650
0
do {                 \
1651
0
  const struct libdeflate_compressor *c__ = (c_);     \
1652
0
  const struct deflate_codes *codes__ = (codes_);     \
1653
0
  unsigned length__ = (length_);          \
1654
0
  unsigned offset__ = (offset_);          \
1655
0
  unsigned offset_slot__ = (offset_slot_);      \
1656
0
                  \
1657
0
  /* Litlen symbol and extra length bits */     \
1658
0
  STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +   \
1659
0
         DEFLATE_MAX_EXTRA_LENGTH_BITS)); \
1660
0
  ADD_BITS(c__->o.length.codewords[length__],     \
1661
0
     c__->o.length.lens[length__]);       \
1662
0
                  \
1663
0
  if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +      \
1664
0
      DEFLATE_MAX_EXTRA_LENGTH_BITS +     \
1665
0
      MAX_OFFSET_CODEWORD_LEN +     \
1666
0
      DEFLATE_MAX_EXTRA_OFFSET_BITS))     \
1667
0
    FLUSH_BITS();           \
1668
0
                  \
1669
0
  /* Offset symbol */           \
1670
0
  ADD_BITS(codes__->codewords.offset[offset_slot__],    \
1671
0
     codes__->lens.offset[offset_slot__]);      \
1672
0
                  \
1673
0
  if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN +      \
1674
0
      DEFLATE_MAX_EXTRA_OFFSET_BITS))     \
1675
0
    FLUSH_BITS();           \
1676
0
                  \
1677
0
  /* Extra offset bits */           \
1678
0
  ADD_BITS(offset__ - deflate_offset_slot_base[offset_slot__],  \
1679
0
     deflate_extra_offset_bits[offset_slot__]);   \
1680
0
                  \
1681
0
  FLUSH_BITS();             \
1682
0
} while (0)
1683
1684
/*
1685
 * Choose the best type of block to use (dynamic Huffman, static Huffman, or
1686
 * uncompressed), then output it.
1687
 *
1688
 * The uncompressed data of the block is @block_begin[0..@block_length-1].  The
1689
 * sequence of literals and matches that will be used to compress the block (if
1690
 * a compressed block is chosen) is given by @sequences if it's non-NULL, or
1691
 * else @c->p.n.optimum_nodes.  @c->freqs and @c->codes must be already set
1692
 * according to the literals, matches, and end-of-block symbol.
1693
 */
1694
static void
1695
deflate_flush_block(struct libdeflate_compressor *c,
1696
        struct deflate_output_bitstream *os,
1697
        const u8 *block_begin, u32 block_length,
1698
        const struct deflate_sequence *sequences,
1699
        bool is_final_block)
1700
0
{
1701
  /*
1702
   * It is hard to get compilers to understand that writes to 'os->next'
1703
   * don't alias 'os'.  That hurts performance significantly, as
1704
   * everything in 'os' would keep getting re-loaded.  ('restrict'
1705
   * *should* do the trick, but it's unreliable.)  Therefore, we keep all
1706
   * the output bitstream state in local variables, and output bits using
1707
   * macros.  This is similar to what the decompressor does.
1708
   */
1709
0
  const u8 *in_next = block_begin;
1710
0
  const u8 * const in_end = block_begin + block_length;
1711
0
  bitbuf_t bitbuf = os->bitbuf;
1712
0
  unsigned bitcount = os->bitcount;
1713
0
  u8 *out_next = os->next;
1714
0
  u8 * const out_end = os->end;
1715
  /* The cost for each block type, in bits */
1716
0
  u32 dynamic_cost = 0;
1717
0
  u32 static_cost = 0;
1718
0
  u32 uncompressed_cost = 0;
1719
0
  u32 best_cost;
1720
0
  struct deflate_codes *codes;
1721
0
  unsigned sym;
1722
1723
0
  ASSERT(block_length >= MIN_BLOCK_LENGTH || is_final_block);
1724
0
  ASSERT(block_length <= MAX_BLOCK_LENGTH);
1725
0
  ASSERT(bitcount <= 7);
1726
0
  ASSERT((bitbuf & ~(((bitbuf_t)1 << bitcount) - 1)) == 0);
1727
0
  ASSERT(out_next <= out_end);
1728
1729
  /* Precompute the precode items and build the precode. */
1730
0
  deflate_precompute_huffman_header(c);
1731
1732
  /* Account for the cost of encoding dynamic Huffman codes. */
1733
0
  dynamic_cost += 5 + 5 + 4 + (3 * c->o.precode.num_explicit_lens);
1734
0
  for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) {
1735
0
    u32 extra = deflate_extra_precode_bits[sym];
1736
1737
0
    dynamic_cost += c->o.precode.freqs[sym] *
1738
0
        (extra + c->o.precode.lens[sym]);
1739
0
  }
1740
1741
  /* Account for the cost of encoding literals. */
1742
0
  for (sym = 0; sym < 144; sym++) {
1743
0
    dynamic_cost += c->freqs.litlen[sym] *
1744
0
        c->codes.lens.litlen[sym];
1745
0
    static_cost += c->freqs.litlen[sym] * 8;
1746
0
  }
1747
0
  for (; sym < 256; sym++) {
1748
0
    dynamic_cost += c->freqs.litlen[sym] *
1749
0
        c->codes.lens.litlen[sym];
1750
0
    static_cost += c->freqs.litlen[sym] * 9;
1751
0
  }
1752
1753
  /* Account for the cost of encoding the end-of-block symbol. */
1754
0
  dynamic_cost += c->codes.lens.litlen[DEFLATE_END_OF_BLOCK];
1755
0
  static_cost += 7;
1756
1757
  /* Account for the cost of encoding lengths. */
1758
0
  for (sym = DEFLATE_FIRST_LEN_SYM;
1759
0
       sym < DEFLATE_FIRST_LEN_SYM + ARRAY_LEN(deflate_extra_length_bits);
1760
0
       sym++) {
1761
0
    u32 extra = deflate_extra_length_bits[
1762
0
          sym - DEFLATE_FIRST_LEN_SYM];
1763
1764
0
    dynamic_cost += c->freqs.litlen[sym] *
1765
0
        (extra + c->codes.lens.litlen[sym]);
1766
0
    static_cost += c->freqs.litlen[sym] *
1767
0
        (extra + c->static_codes.lens.litlen[sym]);
1768
0
  }
1769
1770
  /* Account for the cost of encoding offsets. */
1771
0
  for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) {
1772
0
    u32 extra = deflate_extra_offset_bits[sym];
1773
1774
0
    dynamic_cost += c->freqs.offset[sym] *
1775
0
        (extra + c->codes.lens.offset[sym]);
1776
0
    static_cost += c->freqs.offset[sym] * (extra + 5);
1777
0
  }
1778
1779
  /* Compute the cost of using uncompressed blocks. */
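  /*
   * Breakdown of the terms below: padding the 3-bit block header to a byte
   * boundary costs (-(bitcount + 3) & 7) bits, LEN and NLEN cost 32 bits,
   * each additional UINT16_MAX-byte sub-block adds 40 bits (one more header
   * byte plus LEN/NLEN), and the raw data costs 8 bits per byte.
   */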
1780
0
  uncompressed_cost += (-(bitcount + 3) & 7) + 32 +
1781
0
           (40 * (DIV_ROUND_UP(block_length,
1782
0
             UINT16_MAX) - 1)) +
1783
0
           (8 * block_length);
1784
1785
  /* Choose and output the cheapest type of block. */
1786
0
  best_cost = MIN(static_cost, uncompressed_cost);
1787
0
  if (dynamic_cost < best_cost) {
1788
0
    const unsigned num_explicit_lens = c->o.precode.num_explicit_lens;
1789
0
    const unsigned num_precode_items = c->o.precode.num_items;
1790
0
    unsigned precode_sym, precode_item;
1791
0
    unsigned i;
1792
1793
    /* Dynamic Huffman block */
1794
1795
0
    best_cost = dynamic_cost;
1796
0
    codes = &c->codes;
1797
0
    STATIC_ASSERT(CAN_BUFFER(1 + 2 + 5 + 5 + 4 + 3));
1798
0
    ADD_BITS(is_final_block, 1);
1799
0
    ADD_BITS(DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN, 2);
1800
0
    ADD_BITS(c->o.precode.num_litlen_syms - 257, 5);
1801
0
    ADD_BITS(c->o.precode.num_offset_syms - 1, 5);
1802
0
    ADD_BITS(num_explicit_lens - 4, 4);
1803
1804
    /* Output the lengths of the codewords in the precode. */
1805
0
    if (CAN_BUFFER(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) {
1806
      /*
1807
       * A 64-bit bitbuffer is just one bit too small to hold
1808
       * the maximum number of precode lens, so to minimize
1809
       * flushes we merge one len with the previous fields.
1810
       */
1811
0
      precode_sym = deflate_precode_lens_permutation[0];
1812
0
      ADD_BITS(c->o.precode.lens[precode_sym], 3);
1813
0
      FLUSH_BITS();
1814
0
      i = 1; /* num_explicit_lens >= 4 */
1815
0
      do {
1816
0
        precode_sym =
1817
0
          deflate_precode_lens_permutation[i];
1818
0
        ADD_BITS(c->o.precode.lens[precode_sym], 3);
1819
0
      } while (++i < num_explicit_lens);
1820
0
      FLUSH_BITS();
1821
0
    } else {
1822
0
      FLUSH_BITS();
1823
0
      i = 0;
1824
0
      do {
1825
0
        precode_sym =
1826
0
          deflate_precode_lens_permutation[i];
1827
0
        ADD_BITS(c->o.precode.lens[precode_sym], 3);
1828
0
        FLUSH_BITS();
1829
0
      } while (++i < num_explicit_lens);
1830
0
    }
1831
1832
    /*
1833
     * Output the lengths of the codewords in the litlen and offset
1834
     * codes, encoded by the precode.
1835
     */
1836
0
    i = 0;
1837
0
    do {
1838
0
      precode_item = c->o.precode.items[i];
1839
0
      precode_sym = precode_item & 0x1F;
1840
0
      STATIC_ASSERT(CAN_BUFFER(MAX_PRE_CODEWORD_LEN + 7));
1841
0
      ADD_BITS(c->o.precode.codewords[precode_sym],
1842
0
         c->o.precode.lens[precode_sym]);
1843
0
      ADD_BITS(precode_item >> 5,
1844
0
         deflate_extra_precode_bits[precode_sym]);
1845
0
      FLUSH_BITS();
1846
0
    } while (++i < num_precode_items);
1847
0
  } else if (static_cost < uncompressed_cost) {
1848
    /* Static Huffman block */
1849
0
    codes = &c->static_codes;
1850
0
    ADD_BITS(is_final_block, 1);
1851
0
    ADD_BITS(DEFLATE_BLOCKTYPE_STATIC_HUFFMAN, 2);
1852
0
    FLUSH_BITS();
1853
0
  } else {
1854
    /*
1855
     * Uncompressed block(s).  DEFLATE limits the length of
1856
     * uncompressed blocks to UINT16_MAX bytes, so if the length of
1857
     * the "block" we're flushing is over UINT16_MAX, we actually
1858
     * output multiple blocks.
1859
     */
1860
0
    do {
1861
0
      u8 bfinal = 0;
1862
0
      size_t len = UINT16_MAX;
1863
1864
0
      if (in_end - in_next <= UINT16_MAX) {
1865
0
        bfinal = is_final_block;
1866
0
        len = in_end - in_next;
1867
0
      }
1868
0
      if (out_end - out_next <
1869
0
          (bitcount + 3 + 7) / 8 + 4 + len) {
1870
        /* Not enough output space remaining. */
1871
0
        out_next = out_end;
1872
0
        goto out;
1873
0
      }
1874
      /*
1875
       * Output BFINAL (1 bit) and BTYPE (2 bits), then align
1876
       * to a byte boundary.
1877
       */
1878
0
      STATIC_ASSERT(DEFLATE_BLOCKTYPE_UNCOMPRESSED == 0);
1879
0
      *out_next++ = (bfinal << bitcount) | bitbuf;
1880
0
      if (bitcount > 5)
1881
0
        *out_next++ = 0;
1882
0
      bitbuf = 0;
1883
0
      bitcount = 0;
1884
      /* Output LEN and NLEN, then the data itself. */
1885
0
      put_unaligned_le16(len, out_next);
1886
0
      out_next += 2;
1887
0
      put_unaligned_le16(~len, out_next);
1888
0
      out_next += 2;
1889
0
      memcpy(out_next, in_next, len);
1890
0
      out_next += len;
1891
0
      in_next += len;
1892
0
    } while (in_next != in_end);
1893
    /* Done outputting uncompressed block(s) */
1894
0
    goto out;
1895
0
  }
1896
1897
  /* Output the literals and matches for a dynamic or static block. */
1898
0
  ASSERT(bitcount <= 7);
1899
0
  deflate_compute_full_len_codewords(c, codes);
1900
0
#if SUPPORT_NEAR_OPTIMAL_PARSING
1901
0
  if (sequences == NULL) {
1902
    /* Output the literals and matches from the minimum-cost path */
1903
0
    struct deflate_optimum_node *cur_node =
1904
0
      &c->p.n.optimum_nodes[0];
1905
0
    struct deflate_optimum_node * const end_node =
1906
0
      &c->p.n.optimum_nodes[block_length];
1907
0
    do {
1908
0
      unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
1909
0
      unsigned offset = cur_node->item >>
1910
0
            OPTIMUM_OFFSET_SHIFT;
1911
0
      if (length == 1) {
1912
        /* Literal */
1913
0
        ADD_BITS(codes->codewords.litlen[offset],
1914
0
           codes->lens.litlen[offset]);
1915
0
        FLUSH_BITS();
1916
0
      } else {
1917
        /* Match */
1918
0
        WRITE_MATCH(c, codes, length, offset,
1919
0
              c->p.n.offset_slot_full[offset]);
1920
0
      }
1921
0
      cur_node += length;
1922
0
    } while (cur_node != end_node);
1923
0
  } else
1924
0
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
1925
0
  {
1926
    /* Output the literals and matches from the sequences list. */
1927
0
    const struct deflate_sequence *seq;
1928
1929
0
    for (seq = sequences; ; seq++) {
1930
0
      u32 litrunlen = seq->litrunlen_and_length &
1931
0
          SEQ_LITRUNLEN_MASK;
1932
0
      unsigned length = seq->litrunlen_and_length >>
1933
0
            SEQ_LENGTH_SHIFT;
1934
0
      unsigned lit;
1935
1936
      /* Output a run of literals. */
1937
0
      if (CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN)) {
1938
0
        for (; litrunlen >= 4; litrunlen -= 4) {
1939
0
          lit = *in_next++;
1940
0
          ADD_BITS(codes->codewords.litlen[lit],
1941
0
             codes->lens.litlen[lit]);
1942
0
          lit = *in_next++;
1943
0
          ADD_BITS(codes->codewords.litlen[lit],
1944
0
             codes->lens.litlen[lit]);
1945
0
          lit = *in_next++;
1946
0
          ADD_BITS(codes->codewords.litlen[lit],
1947
0
             codes->lens.litlen[lit]);
1948
0
          lit = *in_next++;
1949
0
          ADD_BITS(codes->codewords.litlen[lit],
1950
0
             codes->lens.litlen[lit]);
1951
0
          FLUSH_BITS();
1952
0
        }
1953
0
        if (litrunlen-- != 0) {
1954
0
          lit = *in_next++;
1955
0
          ADD_BITS(codes->codewords.litlen[lit],
1956
0
             codes->lens.litlen[lit]);
1957
0
          if (litrunlen-- != 0) {
1958
0
            lit = *in_next++;
1959
0
            ADD_BITS(codes->codewords.litlen[lit],
1960
0
               codes->lens.litlen[lit]);
1961
0
            if (litrunlen-- != 0) {
1962
0
              lit = *in_next++;
1963
0
              ADD_BITS(codes->codewords.litlen[lit],
1964
0
                 codes->lens.litlen[lit]);
1965
0
            }
1966
0
          }
1967
0
          FLUSH_BITS();
1968
0
        }
1969
0
      } else {
1970
0
        while (litrunlen--) {
1971
0
          lit = *in_next++;
1972
0
          ADD_BITS(codes->codewords.litlen[lit],
1973
0
             codes->lens.litlen[lit]);
1974
0
          FLUSH_BITS();
1975
0
        }
1976
0
      }
1977
1978
0
      if (length == 0) { /* Last sequence? */
1979
0
        ASSERT(in_next == in_end);
1980
0
        break;
1981
0
      }
1982
1983
      /* Output a match. */
1984
0
      WRITE_MATCH(c, codes, length, seq->offset,
1985
0
            seq->offset_slot);
1986
0
      in_next += length;
1987
0
    }
1988
0
  }
1989
1990
  /* Output the end-of-block symbol. */
1991
0
  ASSERT(bitcount <= 7);
1992
0
  ADD_BITS(codes->codewords.litlen[DEFLATE_END_OF_BLOCK],
1993
0
     codes->lens.litlen[DEFLATE_END_OF_BLOCK]);
1994
0
  FLUSH_BITS();
1995
0
out:
1996
0
  ASSERT(bitcount <= 7);
1997
  /*
1998
   * Assert that the block cost was computed correctly, as
1999
   * libdeflate_deflate_compress_bound() relies on this via the assumption
2000
   * that uncompressed blocks will always be used when cheaper.
2001
   */
2002
0
  ASSERT(8 * (out_next - os->next) + bitcount - os->bitcount ==
2003
0
         3 + best_cost || out_next == out_end);
2004
2005
0
  os->bitbuf = bitbuf;
2006
0
  os->bitcount = bitcount;
2007
0
  os->next = out_next;
2008
0
}
2009
2010
static void
2011
deflate_finish_block(struct libdeflate_compressor *c,
2012
         struct deflate_output_bitstream *os,
2013
         const u8 *block_begin, u32 block_length,
2014
         const struct deflate_sequence *sequences,
2015
         bool is_final_block)
2016
0
{
2017
0
  c->freqs.litlen[DEFLATE_END_OF_BLOCK]++;
2018
0
  deflate_make_huffman_codes(&c->freqs, &c->codes);
2019
0
  deflate_flush_block(c, os, block_begin, block_length, sequences,
2020
0
          is_final_block);
2021
0
}
2022
2023
/******************************************************************************/
2024
2025
/*
2026
 * Block splitting algorithm.  The problem is to decide when it is worthwhile to
2027
 * start a new block with new Huffman codes.  There is a theoretically optimal
2028
 * solution: recursively consider every possible block split, considering the
2029
 * exact cost of each block, and choose the minimum cost approach.  But this is
2030
 * far too slow.  Instead, as an approximation, we can count symbols and after
2031
 * every N symbols, compare the expected distribution of symbols based on the
2032
 * previous data with the actual distribution.  If they differ "by enough", then
2033
 * start a new block.
2034
 *
2035
 * As an optimization and heuristic, we don't distinguish between every symbol
2036
 * but rather we combine many symbols into a single "observation type".  For
2037
 * literals we only look at the high bits and low bits, and for matches we only
2038
 * look at whether the match is long or not.  The assumption is that for typical
2039
 * "real" data, places that are good block boundaries will tend to be noticeable
2040
 * based only on changes in these aggregate probabilities, without looking for
2041
 * subtle differences in individual symbols.  For example, a change from ASCII
2042
 * bytes to non-ASCII bytes, or from few matches (generally less compressible)
2043
 * to many matches (generally more compressible), would be easily noticed based
2044
 * on the aggregates.
2045
 *
2046
 * For determining whether the probability distributions are "different enough"
2047
 * to start a new block, the simple heuristic of splitting when the sum of
2048
 * absolute differences exceeds a constant seems to be good enough.  We also add
2049
 * a number proportional to the block length so that the algorithm is more
2050
 * likely to end long blocks than short blocks.  This reflects the general
2051
 * expectation that it will become increasingly beneficial to start a new block
2052
 * as the current block grows longer.
2053
 *
2054
 * Finally, for an approximation, it is not strictly necessary that the exact
2055
 * symbols being used are considered.  With "near-optimal parsing", for example,
2056
 * the actual symbols that will be used are unknown until after the block
2057
 * boundary is chosen and the block has been optimized.  Since the final choices
2058
 * cannot be used, we can use preliminary "greedy" choices instead.
2059
 */
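/*
 * Stated as one inequality (ignoring the extra penalty for very short blocks
 * applied in do_end_block_check() below, and the rounding from integer
 * division), a new block is started once
 *
 *   sum_i |new_obs[i]/num_new - obs[i]/num_old| + block_length/(4096*num_new)
 *       >= 200/512
 *
 * where the implementation clears the denominators by multiplying everything
 * by num_old * num_new so that only integer arithmetic is needed.
 */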
2060
2061
/* Initialize the block split statistics when starting a new block. */
2062
static void
2063
init_block_split_stats(struct block_split_stats *stats)
2064
0
{
2065
0
  int i;
2066
2067
0
  for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
2068
0
    stats->new_observations[i] = 0;
2069
0
    stats->observations[i] = 0;
2070
0
  }
2071
0
  stats->num_new_observations = 0;
2072
0
  stats->num_observations = 0;
2073
0
}
2074
2075
/*
2076
 * Literal observation.  Heuristic: use the top 2 bits and low 1 bits of the
2077
 * literal, for 8 possible literal observation types.
2078
 */
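/*
 * (For example, the expression below keeps bits 7, 6, and 0 of the literal:
 * for lit = 'e' = 0x65, ((lit >> 5) & 0x6) = 2 and (lit & 1) = 1, so the
 * literal is counted under observation type 3.)
 */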
2079
static forceinline void
2080
observe_literal(struct block_split_stats *stats, u8 lit)
2081
0
{
2082
0
  stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
2083
0
  stats->num_new_observations++;
2084
0
}
2085
2086
/*
2087
 * Match observation.  Heuristic: use one observation type for "short match" and
2088
 * one observation type for "long match".
2089
 */
2090
static forceinline void
2091
observe_match(struct block_split_stats *stats, unsigned length)
2092
0
{
2093
0
  stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES +
2094
0
        (length >= 9)]++;
2095
0
  stats->num_new_observations++;
2096
0
}
2097
2098
static void
2099
merge_new_observations(struct block_split_stats *stats)
2100
0
{
2101
0
  int i;
2102
2103
0
  for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
2104
0
    stats->observations[i] += stats->new_observations[i];
2105
0
    stats->new_observations[i] = 0;
2106
0
  }
2107
0
  stats->num_observations += stats->num_new_observations;
2108
0
  stats->num_new_observations = 0;
2109
0
}
2110
2111
static bool
2112
do_end_block_check(struct block_split_stats *stats, u32 block_length)
2113
0
{
2114
0
  if (stats->num_observations > 0) {
2115
    /*
2116
     * Compute the sum of absolute differences of probabilities.  To
2117
     * avoid needing to use floating point arithmetic or do slow
2118
     * divisions, we do all arithmetic with the probabilities
2119
     * multiplied by num_observations * num_new_observations.  E.g.,
2120
     * for the "old" observations the probabilities would be
2121
     * (double)observations[i] / num_observations, but since we
2122
     * multiply by both num_observations and num_new_observations we
2123
     * really do observations[i] * num_new_observations.
2124
     */
2125
0
    u32 total_delta = 0;
2126
0
    u32 num_items;
2127
0
    u32 cutoff;
2128
0
    int i;
2129
2130
0
    for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
2131
0
      u32 expected = stats->observations[i] *
2132
0
               stats->num_new_observations;
2133
0
      u32 actual = stats->new_observations[i] *
2134
0
             stats->num_observations;
2135
0
      u32 delta = (actual > expected) ? actual - expected :
2136
0
                expected - actual;
2137
2138
0
      total_delta += delta;
2139
0
    }
2140
2141
0
    num_items = stats->num_observations +
2142
0
          stats->num_new_observations;
2143
    /*
2144
     * Heuristic: the cutoff is when the sum of absolute differences
2145
     * of probabilities becomes at least 200/512.  As above, the
2146
     * probability is multiplied by both num_new_observations and
2147
     * num_observations.  Be careful to avoid integer overflow.
2148
     */
2149
0
    cutoff = stats->num_new_observations * 200 / 512 *
2150
0
       stats->num_observations;
2151
    /*
2152
     * Very short blocks have a lot of overhead for the Huffman
2153
     * codes, so only use them if it clearly seems worthwhile.
2154
     * (This is an additional penalty, which adds to the smaller
2155
     * penalty below which scales more slowly.)
2156
     */
2157
0
    if (block_length < 10000 && num_items < 8192)
2158
0
      cutoff += (u64)cutoff * (8192 - num_items) / 8192;
2159
2160
    /* Ready to end the block? */
2161
0
    if (total_delta +
2162
0
        (block_length / 4096) * stats->num_observations >= cutoff)
2163
0
      return true;
2164
0
  }
2165
0
  merge_new_observations(stats);
2166
0
  return false;
2167
0
}
2168
2169
static forceinline bool
2170
ready_to_check_block(const struct block_split_stats *stats,
2171
         const u8 *in_block_begin, const u8 *in_next,
2172
         const u8 *in_end)
2173
0
{
2174
0
  return stats->num_new_observations >= NUM_OBSERVATIONS_PER_BLOCK_CHECK
2175
0
    && in_next - in_block_begin >= MIN_BLOCK_LENGTH
2176
0
    && in_end - in_next >= MIN_BLOCK_LENGTH;
2177
0
}
2178
2179
static forceinline bool
2180
should_end_block(struct block_split_stats *stats,
2181
     const u8 *in_block_begin, const u8 *in_next, const u8 *in_end)
2182
0
{
2183
  /* Ready to try to end the block (again)? */
2184
0
  if (!ready_to_check_block(stats, in_block_begin, in_next, in_end))
2185
0
    return false;
2186
2187
0
  return do_end_block_check(stats, in_next - in_block_begin);
2188
0
}
2189
2190
/******************************************************************************/
2191
2192
static void
2193
deflate_begin_sequences(struct libdeflate_compressor *c,
2194
      struct deflate_sequence *first_seq)
2195
0
{
2196
0
  deflate_reset_symbol_frequencies(c);
2197
0
  first_seq->litrunlen_and_length = 0;
2198
0
}
2199
2200
static forceinline void
2201
deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal,
2202
           bool gather_split_stats, struct deflate_sequence *seq)
2203
0
{
2204
0
  c->freqs.litlen[literal]++;
2205
2206
0
  if (gather_split_stats)
2207
0
    observe_literal(&c->split_stats, literal);
2208
2209
0
  STATIC_ASSERT(MAX_BLOCK_LENGTH <= SEQ_LITRUNLEN_MASK);
2210
0
  seq->litrunlen_and_length++;
2211
0
}
2212
2213
static forceinline void
2214
deflate_choose_match(struct libdeflate_compressor *c,
2215
         unsigned length, unsigned offset, bool gather_split_stats,
2216
         struct deflate_sequence **seq_p)
2217
0
{
2218
0
  struct deflate_sequence *seq = *seq_p;
2219
0
  unsigned length_slot = deflate_length_slot[length];
2220
0
  unsigned offset_slot = deflate_get_offset_slot(offset);
2221
2222
0
  c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + length_slot]++;
2223
0
  c->freqs.offset[offset_slot]++;
2224
0
  if (gather_split_stats)
2225
0
    observe_match(&c->split_stats, length);
2226
2227
0
  seq->litrunlen_and_length |= (u32)length << SEQ_LENGTH_SHIFT;
2228
0
  seq->offset = offset;
2229
0
  seq->offset_slot = offset_slot;
2230
2231
0
  seq++;
2232
0
  seq->litrunlen_and_length = 0;
2233
0
  *seq_p = seq;
2234
0
}
2235
2236
/*
2237
 * Decrease the maximum and nice match lengths if we're approaching the end of
2238
 * the input buffer.
2239
 */
2240
static forceinline void
2241
adjust_max_and_nice_len(unsigned *max_len, unsigned *nice_len, size_t remaining)
2242
0
{
2243
0
  if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) {
2244
0
    *max_len = remaining;
2245
0
    *nice_len = MIN(*nice_len, *max_len);
2246
0
  }
2247
0
}
2248
2249
/*
2250
 * Choose the minimum match length for the greedy and lazy parsers.
2251
 *
2252
 * By default the minimum match length is 3, which is the smallest length the
2253
 * DEFLATE format allows.  However, with greedy and lazy parsing, some data
2254
 * (e.g. DNA sequencing data) benefits greatly from a longer minimum length.
2255
 * Typically, this is because literals are very cheap.  In general, the
2256
 * near-optimal parser handles this case naturally, but the greedy and lazy
2257
 * parsers need a heuristic to decide when to use short matches.
2258
 *
2259
 * The heuristic we use is to make the minimum match length depend on the number
2260
 * of different literals that exist in the data.  If there are many different
2261
 * literals, then literals will probably be expensive, so short matches will
2262
 * probably be worthwhile.  Conversely, if not many literals are used, then
2263
 * probably literals will be cheap and short matches won't be worthwhile.
2264
 */
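/*
 * For example, data that uses only four distinct byte values maps to a
 * minimum match length of 9 via min_lens[] below, so the greedy and lazy
 * parsers skip short matches entirely, while data using roughly 80 or more
 * distinct literals falls back to the DEFLATE minimum of 3.
 */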
2265
static unsigned
2266
choose_min_match_len(unsigned num_used_literals, unsigned max_search_depth)
2267
0
{
2268
  /* map from num_used_literals to min_len */
2269
0
  static const u8 min_lens[] = {
2270
0
    9, 9, 9, 9, 9, 9, 8, 8, 7, 7, 6, 6, 6, 6, 6, 6,
2271
0
    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
2272
0
    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4,
2273
0
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
2274
0
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
2275
    /* The rest is implicitly 3. */
2276
0
  };
2277
0
  unsigned min_len;
2278
2279
0
  STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN <= 3);
2280
0
  STATIC_ASSERT(ARRAY_LEN(min_lens) <= DEFLATE_NUM_LITERALS + 1);
2281
2282
0
  if (num_used_literals >= ARRAY_LEN(min_lens))
2283
0
    return 3;
2284
0
  min_len = min_lens[num_used_literals];
2285
  /*
2286
   * With a low max_search_depth, it may be too hard to find long matches.
2287
   */
2288
0
  if (max_search_depth < 16) {
2289
0
    if (max_search_depth < 5)
2290
0
      min_len = MIN(min_len, 4);
2291
0
    else if (max_search_depth < 10)
2292
0
      min_len = MIN(min_len, 5);
2293
0
    else
2294
0
      min_len = MIN(min_len, 7);
2295
0
  }
2296
0
  return min_len;
2297
0
}
2298
2299
static unsigned
2300
calculate_min_match_len(const u8 *data, size_t data_len,
2301
      unsigned max_search_depth)
2302
0
{
2303
0
  u8 used[256] = { 0 };
2304
0
  unsigned num_used_literals = 0;
2305
0
  size_t i;
2306
2307
  /*
2308
   * For an initial approximation, scan the first 4 KiB of data.  The
2309
   * caller may use recalculate_min_match_len() to update min_len later.
2310
   */
2311
0
  data_len = MIN(data_len, 4096);
2312
0
  for (i = 0; i < data_len; i++)
2313
0
    used[data[i]] = 1;
2314
0
  for (i = 0; i < 256; i++)
2315
0
    num_used_literals += used[i];
2316
0
  return choose_min_match_len(num_used_literals, max_search_depth);
2317
0
}
2318
2319
/*
2320
 * Recalculate the minimum match length for a block, now that we know the
2321
 * distribution of literals that are actually being used (freqs->litlen).
2322
 */
2323
static unsigned
2324
recalculate_min_match_len(const struct deflate_freqs *freqs,
2325
        unsigned max_search_depth)
2326
0
{
2327
0
  u32 literal_freq = 0;
2328
0
  u32 cutoff;
2329
0
  unsigned num_used_literals = 0;
2330
0
  int i;
2331
2332
0
  for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
2333
0
    literal_freq += freqs->litlen[i];
2334
2335
0
  cutoff = literal_freq >> 10; /* Ignore literals used very rarely. */
2336
2337
0
  for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
2338
0
    if (freqs->litlen[i] > cutoff)
2339
0
      num_used_literals++;
2340
0
  }
2341
0
  return choose_min_match_len(num_used_literals, max_search_depth);
2342
0
}
2343
2344
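/*
 * If ending the block at in_block_begin + soft_max_len would leave fewer than
 * MIN_BLOCK_LENGTH bytes of input, just let the block run to in_end instead.
 */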
static forceinline const u8 *
2345
choose_max_block_end(const u8 *in_block_begin, const u8 *in_end,
2346
         size_t soft_max_len)
2347
0
{
2348
0
  if (in_end - in_block_begin < soft_max_len + MIN_BLOCK_LENGTH)
2349
0
    return in_end;
2350
0
  return in_block_begin + soft_max_len;
2351
0
}
2352
2353
/*
2354
 * This is the level 0 "compressor".  It always outputs uncompressed blocks.
2355
 */
2356
static size_t
2357
deflate_compress_none(const u8 *in, size_t in_nbytes,
2358
          u8 *out, size_t out_nbytes_avail)
2359
0
{
2360
0
  const u8 *in_next = in;
2361
0
  const u8 * const in_end = in + in_nbytes;
2362
0
  u8 *out_next = out;
2363
0
  u8 * const out_end = out + out_nbytes_avail;
2364
2365
  /*
2366
   * If the input is zero-length, we still must output a block in order
2367
   * for the output to be a valid DEFLATE stream.  Handle this case
2368
   * specially to avoid potentially passing NULL to memcpy() below.
2369
   */
2370
0
  if (unlikely(in_nbytes == 0)) {
2371
0
    if (out_nbytes_avail < 5)
2372
0
      return 0;
2373
    /* BFINAL and BTYPE */
2374
0
    *out_next++ = 1 | (DEFLATE_BLOCKTYPE_UNCOMPRESSED << 1);
2375
    /* LEN = 0x0000 and NLEN = 0xFFFF, written as one little-endian 32-bit word */
2376
0
    put_unaligned_le32(0xFFFF0000, out_next);
2377
0
    return 5;
2378
0
  }
2379
2380
0
  do {
2381
0
    u8 bfinal = 0;
2382
0
    size_t len = UINT16_MAX;
2383
2384
0
    if (in_end - in_next <= UINT16_MAX) {
2385
0
      bfinal = 1;
2386
0
      len = in_end - in_next;
2387
0
    }
2388
0
    if (out_end - out_next < 5 + len)
2389
0
      return 0;
2390
    /*
2391
     * Output BFINAL and BTYPE.  The stream is already byte-aligned
2392
     * here, so this step always requires outputting exactly 1 byte.
2393
     */
2394
0
    *out_next++ = bfinal | (DEFLATE_BLOCKTYPE_UNCOMPRESSED << 1);
2395
2396
    /* Output LEN and NLEN, then the data itself. */
2397
0
    put_unaligned_le16(len, out_next);
2398
0
    out_next += 2;
2399
0
    put_unaligned_le16(~len, out_next);
2400
0
    out_next += 2;
2401
0
    memcpy(out_next, in_next, len);
2402
0
    out_next += len;
2403
0
    in_next += len;
2404
0
  } while (in_next != in_end);
2405
2406
0
  return out_next - out;
2407
0
}
2408
2409
/*
2410
 * This is a faster variant of deflate_compress_greedy().  It uses the
2411
 * ht_matchfinder rather than the hc_matchfinder.  It also skips the block
2412
 * splitting algorithm and just uses fixed length blocks.  c->max_search_depth
2413
 * has no effect with this algorithm, as it is hardcoded in ht_matchfinder.h.
2414
 */
2415
static void
2416
deflate_compress_fastest(struct libdeflate_compressor * restrict c,
2417
       const u8 *in, size_t in_nbytes,
2418
       struct deflate_output_bitstream *os)
2419
0
{
2420
0
  const u8 *in_next = in;
2421
0
  const u8 *in_end = in_next + in_nbytes;
2422
0
  const u8 *in_cur_base = in_next;
2423
0
  unsigned max_len = DEFLATE_MAX_MATCH_LEN;
2424
0
  unsigned nice_len = MIN(c->nice_match_length, max_len);
2425
0
  u32 next_hash = 0;
2426
2427
0
  ht_matchfinder_init(&c->p.f.ht_mf);
2428
2429
0
  do {
2430
    /* Starting a new DEFLATE block */
2431
2432
0
    const u8 * const in_block_begin = in_next;
2433
0
    const u8 * const in_max_block_end = choose_max_block_end(
2434
0
        in_next, in_end, FAST_SOFT_MAX_BLOCK_LENGTH);
2435
0
    struct deflate_sequence *seq = c->p.f.sequences;
2436
2437
0
    deflate_begin_sequences(c, seq);
2438
2439
0
    do {
2440
0
      u32 length;
2441
0
      u32 offset;
2442
0
      size_t remaining = in_end - in_next;
2443
2444
0
      if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) {
2445
0
        max_len = remaining;
2446
0
        if (max_len < HT_MATCHFINDER_REQUIRED_NBYTES) {
2447
0
          do {
2448
0
            deflate_choose_literal(c,
2449
0
              *in_next++, false, seq);
2450
0
          } while (--max_len);
2451
0
          break;
2452
0
        }
2453
0
        nice_len = MIN(nice_len, max_len);
2454
0
      }
2455
0
      length = ht_matchfinder_longest_match(&c->p.f.ht_mf,
2456
0
                    &in_cur_base,
2457
0
                    in_next,
2458
0
                    max_len,
2459
0
                    nice_len,
2460
0
                    &next_hash,
2461
0
                    &offset);
2462
0
      if (length) {
2463
        /* Match found */
2464
0
        deflate_choose_match(c, length, offset, false,
2465
0
                 &seq);
2466
0
        ht_matchfinder_skip_bytes(&c->p.f.ht_mf,
2467
0
                &in_cur_base,
2468
0
                in_next + 1,
2469
0
                in_end,
2470
0
                length - 1,
2471
0
                &next_hash);
2472
0
        in_next += length;
2473
0
      } else {
2474
        /* No match found */
2475
0
        deflate_choose_literal(c, *in_next++, false,
2476
0
                   seq);
2477
0
      }
2478
2479
      /* Check if it's time to output another block. */
2480
0
    } while (in_next < in_max_block_end &&
2481
0
       seq < &c->p.f.sequences[FAST_SEQ_STORE_LENGTH]);
2482
2483
0
    deflate_finish_block(c, os, in_block_begin,
2484
0
             in_next - in_block_begin,
2485
0
             c->p.f.sequences, in_next == in_end);
2486
0
  } while (in_next != in_end);
2487
0
}
2488
2489
/*
2490
 * This is the "greedy" DEFLATE compressor. It always chooses the longest match.
2491
 */
2492
static void
2493
deflate_compress_greedy(struct libdeflate_compressor * restrict c,
2494
      const u8 *in, size_t in_nbytes,
2495
      struct deflate_output_bitstream *os)
2496
0
{
2497
0
  const u8 *in_next = in;
2498
0
  const u8 *in_end = in_next + in_nbytes;
2499
0
  const u8 *in_cur_base = in_next;
2500
0
  unsigned max_len = DEFLATE_MAX_MATCH_LEN;
2501
0
  unsigned nice_len = MIN(c->nice_match_length, max_len);
2502
0
  u32 next_hashes[2] = {0, 0};
2503
2504
0
  hc_matchfinder_init(&c->p.g.hc_mf);
2505
2506
0
  do {
2507
    /* Starting a new DEFLATE block */
2508
2509
0
    const u8 * const in_block_begin = in_next;
2510
0
    const u8 * const in_max_block_end = choose_max_block_end(
2511
0
        in_next, in_end, SOFT_MAX_BLOCK_LENGTH);
2512
0
    struct deflate_sequence *seq = c->p.g.sequences;
2513
0
    unsigned min_len;
2514
2515
0
    init_block_split_stats(&c->split_stats);
2516
0
    deflate_begin_sequences(c, seq);
2517
0
    min_len = calculate_min_match_len(in_next,
2518
0
              in_max_block_end - in_next,
2519
0
              c->max_search_depth);
2520
0
    do {
2521
0
      u32 length;
2522
0
      u32 offset;
2523
2524
0
      adjust_max_and_nice_len(&max_len, &nice_len,
2525
0
            in_end - in_next);
2526
0
      length = hc_matchfinder_longest_match(
2527
0
            &c->p.g.hc_mf,
2528
0
            &in_cur_base,
2529
0
            in_next,
2530
0
            min_len - 1,
2531
0
            max_len,
2532
0
            nice_len,
2533
0
            c->max_search_depth,
2534
0
            next_hashes,
2535
0
            &offset);
2536
2537
0
      if (length >= min_len &&
2538
0
          (length > DEFLATE_MIN_MATCH_LEN ||
2539
0
           offset <= 4096)) {
2540
        /* Match found */
2541
0
        deflate_choose_match(c, length, offset, true,
2542
0
                 &seq);
2543
0
        hc_matchfinder_skip_bytes(&c->p.g.hc_mf,
2544
0
                &in_cur_base,
2545
0
                in_next + 1,
2546
0
                in_end,
2547
0
                length - 1,
2548
0
                next_hashes);
2549
0
        in_next += length;
2550
0
      } else {
2551
        /* No match found */
2552
0
        deflate_choose_literal(c, *in_next++, true,
2553
0
                   seq);
2554
0
      }
2555
2556
      /* Check if it's time to output another block. */
2557
0
    } while (in_next < in_max_block_end &&
2558
0
       seq < &c->p.g.sequences[SEQ_STORE_LENGTH] &&
2559
0
       !should_end_block(&c->split_stats,
2560
0
             in_block_begin, in_next, in_end));
2561
2562
0
    deflate_finish_block(c, os, in_block_begin,
2563
0
             in_next - in_block_begin,
2564
0
             c->p.g.sequences, in_next == in_end);
2565
0
  } while (in_next != in_end);
2566
0
}
2567
2568
static forceinline void
2569
deflate_compress_lazy_generic(struct libdeflate_compressor * restrict c,
2570
            const u8 *in, size_t in_nbytes,
2571
            struct deflate_output_bitstream *os, bool lazy2)
2572
0
{
2573
0
  const u8 *in_next = in;
2574
0
  const u8 *in_end = in_next + in_nbytes;
2575
0
  const u8 *in_cur_base = in_next;
2576
0
  unsigned max_len = DEFLATE_MAX_MATCH_LEN;
2577
0
  unsigned nice_len = MIN(c->nice_match_length, max_len);
2578
0
  u32 next_hashes[2] = {0, 0};
2579
2580
0
  hc_matchfinder_init(&c->p.g.hc_mf);
2581
2582
0
  do {
2583
    /* Starting a new DEFLATE block */
2584
2585
0
    const u8 * const in_block_begin = in_next;
2586
0
    const u8 * const in_max_block_end = choose_max_block_end(
2587
0
        in_next, in_end, SOFT_MAX_BLOCK_LENGTH);
2588
0
    const u8 *next_recalc_min_len =
2589
0
      in_next + MIN(in_end - in_next, 10000);
2590
0
    struct deflate_sequence *seq = c->p.g.sequences;
2591
0
    unsigned min_len;
2592
2593
0
    init_block_split_stats(&c->split_stats);
2594
0
    deflate_begin_sequences(c, seq);
2595
0
    min_len = calculate_min_match_len(in_next,
2596
0
              in_max_block_end - in_next,
2597
0
              c->max_search_depth);
2598
0
    do {
2599
0
      unsigned cur_len;
2600
0
      unsigned cur_offset;
2601
0
      unsigned next_len;
2602
0
      unsigned next_offset;
2603
2604
      /*
2605
       * Recalculate the minimum match length if it hasn't
2606
       * been done recently.
2607
       */
2608
0
      if (in_next >= next_recalc_min_len) {
2609
0
        min_len = recalculate_min_match_len(
2610
0
            &c->freqs,
2611
0
            c->max_search_depth);
2612
0
        next_recalc_min_len +=
2613
0
          MIN(in_end - next_recalc_min_len,
2614
0
              in_next - in_block_begin);
2615
0
      }
2616
2617
      /* Find the longest match at the current position. */
2618
0
      adjust_max_and_nice_len(&max_len, &nice_len,
2619
0
            in_end - in_next);
2620
0
      cur_len = hc_matchfinder_longest_match(
2621
0
            &c->p.g.hc_mf,
2622
0
            &in_cur_base,
2623
0
            in_next,
2624
0
            min_len - 1,
2625
0
            max_len,
2626
0
            nice_len,
2627
0
            c->max_search_depth,
2628
0
            next_hashes,
2629
0
            &cur_offset);
2630
0
      if (cur_len < min_len ||
2631
0
          (cur_len == DEFLATE_MIN_MATCH_LEN &&
2632
0
           cur_offset > 8192)) {
2633
        /* No match found.  Choose a literal. */
2634
0
        deflate_choose_literal(c, *in_next++, true,
2635
0
                   seq);
2636
0
        continue;
2637
0
      }
2638
0
      in_next++;
2639
2640
0
have_cur_match:
2641
      /*
2642
       * We have a match at the current position.
2643
       * If it's very long, choose it immediately.
2644
       */
2645
0
      if (cur_len >= nice_len) {
2646
0
        deflate_choose_match(c, cur_len, cur_offset,
2647
0
                 true, &seq);
2648
0
        hc_matchfinder_skip_bytes(&c->p.g.hc_mf,
2649
0
                &in_cur_base,
2650
0
                in_next,
2651
0
                in_end,
2652
0
                cur_len - 1,
2653
0
                next_hashes);
2654
0
        in_next += cur_len - 1;
2655
0
        continue;
2656
0
      }
2657
2658
      /*
2659
       * Try to find a better match at the next position.
2660
       *
2661
       * Note: since we already have a match at the *current*
2662
       * position, we use only half the 'max_search_depth'
2663
       * when checking the *next* position.  This is a useful
2664
       * trade-off because it's more worthwhile to use a
2665
       * greater search depth on the initial match.
2666
       *
2667
       * Note: it's possible to structure the code such that
2668
       * there's only one call to longest_match(), which
2669
       * handles both the "find the initial match" and "try to
2670
       * find a better match" cases.  However, it is faster to
2671
       * have two call sites, with longest_match() inlined at
2672
       * each.
2673
       */
2674
0
      adjust_max_and_nice_len(&max_len, &nice_len,
2675
0
            in_end - in_next);
2676
0
      next_len = hc_matchfinder_longest_match(
2677
0
            &c->p.g.hc_mf,
2678
0
            &in_cur_base,
2679
0
            in_next++,
2680
0
            cur_len - 1,
2681
0
            max_len,
2682
0
            nice_len,
2683
0
            c->max_search_depth >> 1,
2684
0
            next_hashes,
2685
0
            &next_offset);
2686
0
      if (next_len >= cur_len &&
2687
0
          4 * (int)(next_len - cur_len) +
2688
0
          ((int)bsr32(cur_offset) -
2689
0
           (int)bsr32(next_offset)) > 2) {
2690
        /*
2691
         * Found a better match at the next position.
2692
         * Output a literal.  Then the next match
2693
         * becomes the current match.
2694
         */
2695
0
        deflate_choose_literal(c, *(in_next - 2), true,
2696
0
                   seq);
2697
0
        cur_len = next_len;
2698
0
        cur_offset = next_offset;
2699
0
        goto have_cur_match;
2700
0
      }
2701
2702
0
      if (lazy2) {
2703
        /* In lazy2 mode, look ahead another position */
2704
0
        adjust_max_and_nice_len(&max_len, &nice_len,
2705
0
              in_end - in_next);
2706
0
        next_len = hc_matchfinder_longest_match(
2707
0
            &c->p.g.hc_mf,
2708
0
            &in_cur_base,
2709
0
            in_next++,
2710
0
            cur_len - 1,
2711
0
            max_len,
2712
0
            nice_len,
2713
0
            c->max_search_depth >> 2,
2714
0
            next_hashes,
2715
0
            &next_offset);
2716
0
        if (next_len >= cur_len &&
2717
0
            4 * (int)(next_len - cur_len) +
2718
0
            ((int)bsr32(cur_offset) -
2719
0
             (int)bsr32(next_offset)) > 6) {
2720
          /*
2721
           * There's a much better match two
2722
           * positions ahead, so use two literals.
2723
           */
2724
0
          deflate_choose_literal(
2725
0
            c, *(in_next - 3), true, seq);
2726
0
          deflate_choose_literal(
2727
0
            c, *(in_next - 2), true, seq);
2728
0
          cur_len = next_len;
2729
0
          cur_offset = next_offset;
2730
0
          goto have_cur_match;
2731
0
        }
2732
        /*
2733
         * No better match at either of the next 2
2734
         * positions.  Output the current match.
2735
         */
2736
0
        deflate_choose_match(c, cur_len, cur_offset,
2737
0
                 true, &seq);
2738
0
        if (cur_len > 3) {
2739
0
          hc_matchfinder_skip_bytes(&c->p.g.hc_mf,
2740
0
                  &in_cur_base,
2741
0
                  in_next,
2742
0
                  in_end,
2743
0
                  cur_len - 3,
2744
0
                  next_hashes);
2745
0
          in_next += cur_len - 3;
2746
0
        }
2747
0
      } else { /* !lazy2 */
2748
        /*
2749
         * No better match at the next position.  Output
2750
         * the current match.
2751
         */
2752
0
        deflate_choose_match(c, cur_len, cur_offset,
2753
0
                 true, &seq);
2754
0
        hc_matchfinder_skip_bytes(&c->p.g.hc_mf,
2755
0
                &in_cur_base,
2756
0
                in_next,
2757
0
                in_end,
2758
0
                cur_len - 2,
2759
0
                next_hashes);
2760
0
        in_next += cur_len - 2;
2761
0
      }
2762
      /* Check if it's time to output another block. */
2763
0
    } while (in_next < in_max_block_end &&
2764
0
       seq < &c->p.g.sequences[SEQ_STORE_LENGTH] &&
2765
0
       !should_end_block(&c->split_stats,
2766
0
             in_block_begin, in_next, in_end));
2767
2768
0
    deflate_finish_block(c, os, in_block_begin,
2769
0
             in_next - in_block_begin,
2770
0
             c->p.g.sequences, in_next == in_end);
2771
0
  } while (in_next != in_end);
2772
0
}
2773
2774
/*
2775
 * This is the "lazy" DEFLATE compressor.  Before choosing a match, it checks to
2776
 * see if there's a better match at the next position.  If yes, it outputs a
2777
 * literal and continues to the next position.  If no, it outputs the match.
2778
 */
2779
static void
2780
deflate_compress_lazy(struct libdeflate_compressor * restrict c,
2781
          const u8 *in, size_t in_nbytes,
2782
          struct deflate_output_bitstream *os)
2783
0
{
2784
0
  deflate_compress_lazy_generic(c, in, in_nbytes, os, false);
2785
0
}
2786
2787
/*
2788
 * The lazy2 compressor.  This is similar to the regular lazy one, but it looks
2789
 * for a better match at the next 2 positions rather than the next 1.  This
2790
 * makes it take slightly more time, but compress some inputs slightly more.
2791
 */
2792
static void
2793
deflate_compress_lazy2(struct libdeflate_compressor * restrict c,
2794
           const u8 *in, size_t in_nbytes,
2795
           struct deflate_output_bitstream *os)
2796
0
{
2797
0
  deflate_compress_lazy_generic(c, in, in_nbytes, os, true);
2798
0
}
2799
2800
#if SUPPORT_NEAR_OPTIMAL_PARSING
2801
2802
/*
2803
 * Follow the minimum-cost path in the graph of possible match/literal choices
2804
 * for the current block and compute the frequencies of the Huffman symbols that
2805
 * would be needed to output those matches and literals.
2806
 */
2807
static void
2808
deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length)
2809
0
{
2810
0
  struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
2811
0
  struct deflate_optimum_node *end_node =
2812
0
    &c->p.n.optimum_nodes[block_length];
2813
2814
0
  do {
2815
0
    unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
2816
0
    unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
2817
2818
0
    if (length == 1) {
2819
      /* Literal */
2820
0
      c->freqs.litlen[offset]++;
2821
0
    } else {
2822
      /* Match */
2823
0
      c->freqs.litlen[DEFLATE_FIRST_LEN_SYM +
2824
0
          deflate_length_slot[length]]++;
2825
0
      c->freqs.offset[c->p.n.offset_slot_full[offset]]++;
2826
0
    }
2827
0
    cur_node += length;
2828
0
  } while (cur_node != end_node);
2829
2830
  /* Tally the end-of-block symbol. */
2831
0
  c->freqs.litlen[DEFLATE_END_OF_BLOCK]++;
2832
0
}
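For reference, each node's 'item' packs the choice made at that position: the low bits (masked by OPTIMUM_LEN_MASK) hold the length, which is 1 for a literal, and the bits at and above OPTIMUM_OFFSET_SHIFT hold either the match offset or, for a literal, the literal byte itself. For example, a length-5 match at offset 300 is stored as (300 << OPTIMUM_OFFSET_SHIFT) | 5, so 'cur_node += length' above steps directly from one chosen item to the next.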
2833
2834
static void
2835
deflate_choose_all_literals(struct libdeflate_compressor *c,
2836
          const u8 *block, u32 block_length)
2837
0
{
2838
0
  u32 i;
2839
2840
0
  deflate_reset_symbol_frequencies(c);
2841
0
  for (i = 0; i < block_length; i++)
2842
0
    c->freqs.litlen[block[i]]++;
2843
0
  c->freqs.litlen[DEFLATE_END_OF_BLOCK]++;
2844
2845
0
  deflate_make_huffman_codes(&c->freqs, &c->codes);
2846
0
}
2847
2848
/*
2849
 * Compute the exact cost, in bits, that would be required to output the matches
2850
 * and literals described by @c->freqs as a dynamic Huffman block.  The litlen
2851
 * and offset codes are assumed to have already been built in @c->codes.
2852
 */
2853
static u32
2854
deflate_compute_true_cost(struct libdeflate_compressor *c)
2855
0
{
2856
0
  u32 cost = 0;
2857
0
  unsigned sym;
2858
2859
0
  deflate_precompute_huffman_header(c);
2860
2861
0
  memset(&c->codes.lens.litlen[c->o.precode.num_litlen_syms], 0,
2862
0
         DEFLATE_NUM_LITLEN_SYMS - c->o.precode.num_litlen_syms);
2863
2864
0
  cost += 5 + 5 + 4 + (3 * c->o.precode.num_explicit_lens);
2865
0
  for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) {
2866
0
    cost += c->o.precode.freqs[sym] *
2867
0
      (c->o.precode.lens[sym] +
2868
0
       deflate_extra_precode_bits[sym]);
2869
0
  }
2870
2871
0
  for (sym = 0; sym < DEFLATE_FIRST_LEN_SYM; sym++)
2872
0
    cost += c->freqs.litlen[sym] * c->codes.lens.litlen[sym];
2873
2874
0
  for (; sym < DEFLATE_FIRST_LEN_SYM +
2875
0
         ARRAY_LEN(deflate_extra_length_bits); sym++)
2876
0
    cost += c->freqs.litlen[sym] *
2877
0
      (c->codes.lens.litlen[sym] +
2878
0
       deflate_extra_length_bits[sym - DEFLATE_FIRST_LEN_SYM]);
2879
2880
0
  for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++)
2881
0
    cost += c->freqs.offset[sym] *
2882
0
      (c->codes.lens.offset[sym] +
2883
0
       deflate_extra_offset_bits[sym]);
2884
0
  return cost;
2885
0
}
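As a cross-check of the constant terms above: a dynamic Huffman block header begins with the 5-bit HLIT, 5-bit HDIST, and 4-bit HCLEN fields (5 + 5 + 4 = 14 bits), followed by 3 bits for each of the num_explicit_lens precode codeword lengths actually transmitted, per RFC 1951. The remaining loops then charge every precode, litlen, and offset symbol its codeword length plus its extra bits.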
2886
2887
/* Set the current cost model from the codeword lengths specified in @lens. */
2888
static void
2889
deflate_set_costs_from_codes(struct libdeflate_compressor *c,
2890
           const struct deflate_lens *lens)
2891
0
{
2892
0
  unsigned i;
2893
2894
  /* Literals */
2895
0
  for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
2896
0
    u32 bits = (lens->litlen[i] ?
2897
0
          lens->litlen[i] : LITERAL_NOSTAT_BITS);
2898
2899
0
    c->p.n.costs.literal[i] = bits * BIT_COST;
2900
0
  }
2901
2902
  /* Lengths */
2903
0
  for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) {
2904
0
    unsigned length_slot = deflate_length_slot[i];
2905
0
    unsigned litlen_sym = DEFLATE_FIRST_LEN_SYM + length_slot;
2906
0
    u32 bits = (lens->litlen[litlen_sym] ?
2907
0
          lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS);
2908
2909
0
    bits += deflate_extra_length_bits[length_slot];
2910
0
    c->p.n.costs.length[i] = bits * BIT_COST;
2911
0
  }
2912
2913
  /* Offset slots */
2914
0
  for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) {
2915
0
    u32 bits = (lens->offset[i] ?
2916
0
          lens->offset[i] : OFFSET_NOSTAT_BITS);
2917
2918
0
    bits += deflate_extra_offset_bits[i];
2919
0
    c->p.n.costs.offset_slot[i] = bits * BIT_COST;
2920
0
  }
2921
0
}
2922
2923
/*
2924
 * This lookup table gives the default cost of a literal symbol and of a length
2925
 * symbol, depending on the characteristics of the input data.  It was generated
2926
 * by scripts/gen_default_litlen_costs.py.
2927
 *
2928
 * This table is indexed first by the estimated match probability:
2929
 *
2930
 *  i=0: data doesn't contain many matches  [match_prob=0.25]
2931
 *  i=1: neutral        [match_prob=0.50]
2932
 *  i=2: data contains lots of matches  [match_prob=0.75]
2933
 *
2934
 * This lookup produces a subtable which maps the number of distinct used
2935
 * literals to the default cost of a literal symbol, i.e.:
2936
 *
2937
 *  int(-log2((1 - match_prob) / num_used_literals) * BIT_COST)
2938
 *
2939
 * ... for num_used_literals in [1, 256] (and 0, which is copied from 1).  This
2940
 * accounts for literals usually getting cheaper as the number of distinct
2941
 * literals decreases, and as the proportion of literals to matches increases.
2942
 *
2943
 * The lookup also produces the cost of a length symbol, which is:
2944
 *
2945
 *  int(-log2(match_prob/NUM_LEN_SLOTS) * BIT_COST)
2946
 *
2947
 * Note: we don't currently assign different costs to different literal symbols,
2948
 * or to different length symbols, as this is hard to do in a useful way.
2949
 */
2950
static const struct {
2951
  u8 used_lits_to_lit_cost[257];
2952
  u8 len_sym_cost;
2953
} default_litlen_costs[] = {
2954
  { /* match_prob = 0.25 */
2955
    .used_lits_to_lit_cost = {
2956
      6, 6, 22, 32, 38, 43, 48, 51,
2957
      54, 57, 59, 61, 64, 65, 67, 69,
2958
      70, 72, 73, 74, 75, 76, 77, 79,
2959
      80, 80, 81, 82, 83, 84, 85, 85,
2960
      86, 87, 88, 88, 89, 89, 90, 91,
2961
      91, 92, 92, 93, 93, 94, 95, 95,
2962
      96, 96, 96, 97, 97, 98, 98, 99,
2963
      99, 99, 100, 100, 101, 101, 101, 102,
2964
      102, 102, 103, 103, 104, 104, 104, 105,
2965
      105, 105, 105, 106, 106, 106, 107, 107,
2966
      107, 108, 108, 108, 108, 109, 109, 109,
2967
      109, 110, 110, 110, 111, 111, 111, 111,
2968
      112, 112, 112, 112, 112, 113, 113, 113,
2969
      113, 114, 114, 114, 114, 114, 115, 115,
2970
      115, 115, 115, 116, 116, 116, 116, 116,
2971
      117, 117, 117, 117, 117, 118, 118, 118,
2972
      118, 118, 118, 119, 119, 119, 119, 119,
2973
      120, 120, 120, 120, 120, 120, 121, 121,
2974
      121, 121, 121, 121, 121, 122, 122, 122,
2975
      122, 122, 122, 123, 123, 123, 123, 123,
2976
      123, 123, 124, 124, 124, 124, 124, 124,
2977
      124, 125, 125, 125, 125, 125, 125, 125,
2978
      125, 126, 126, 126, 126, 126, 126, 126,
2979
      127, 127, 127, 127, 127, 127, 127, 127,
2980
      128, 128, 128, 128, 128, 128, 128, 128,
2981
      128, 129, 129, 129, 129, 129, 129, 129,
2982
      129, 129, 130, 130, 130, 130, 130, 130,
2983
      130, 130, 130, 131, 131, 131, 131, 131,
2984
      131, 131, 131, 131, 131, 132, 132, 132,
2985
      132, 132, 132, 132, 132, 132, 132, 133,
2986
      133, 133, 133, 133, 133, 133, 133, 133,
2987
      133, 134, 134, 134, 134, 134, 134, 134,
2988
      134,
2989
    },
2990
    .len_sym_cost = 109,
2991
  }, { /* match_prob = 0.5 */
2992
    .used_lits_to_lit_cost = {
2993
      16, 16, 32, 41, 48, 53, 57, 60,
2994
      64, 66, 69, 71, 73, 75, 76, 78,
2995
      80, 81, 82, 83, 85, 86, 87, 88,
2996
      89, 90, 91, 92, 92, 93, 94, 95,
2997
      96, 96, 97, 98, 98, 99, 99, 100,
2998
      101, 101, 102, 102, 103, 103, 104, 104,
2999
      105, 105, 106, 106, 107, 107, 108, 108,
3000
      108, 109, 109, 110, 110, 110, 111, 111,
3001
      112, 112, 112, 113, 113, 113, 114, 114,
3002
      114, 115, 115, 115, 115, 116, 116, 116,
3003
      117, 117, 117, 118, 118, 118, 118, 119,
3004
      119, 119, 119, 120, 120, 120, 120, 121,
3005
      121, 121, 121, 122, 122, 122, 122, 122,
3006
      123, 123, 123, 123, 124, 124, 124, 124,
3007
      124, 125, 125, 125, 125, 125, 126, 126,
3008
      126, 126, 126, 127, 127, 127, 127, 127,
3009
      128, 128, 128, 128, 128, 128, 129, 129,
3010
      129, 129, 129, 129, 130, 130, 130, 130,
3011
      130, 130, 131, 131, 131, 131, 131, 131,
3012
      131, 132, 132, 132, 132, 132, 132, 133,
3013
      133, 133, 133, 133, 133, 133, 134, 134,
3014
      134, 134, 134, 134, 134, 134, 135, 135,
3015
      135, 135, 135, 135, 135, 135, 136, 136,
3016
      136, 136, 136, 136, 136, 136, 137, 137,
3017
      137, 137, 137, 137, 137, 137, 138, 138,
3018
      138, 138, 138, 138, 138, 138, 138, 139,
3019
      139, 139, 139, 139, 139, 139, 139, 139,
3020
      140, 140, 140, 140, 140, 140, 140, 140,
3021
      140, 141, 141, 141, 141, 141, 141, 141,
3022
      141, 141, 141, 142, 142, 142, 142, 142,
3023
      142, 142, 142, 142, 142, 142, 143, 143,
3024
      143, 143, 143, 143, 143, 143, 143, 143,
3025
      144,
3026
    },
3027
    .len_sym_cost = 93,
3028
  }, { /* match_prob = 0.75 */
3029
    .used_lits_to_lit_cost = {
3030
      32, 32, 48, 57, 64, 69, 73, 76,
3031
      80, 82, 85, 87, 89, 91, 92, 94,
3032
      96, 97, 98, 99, 101, 102, 103, 104,
3033
      105, 106, 107, 108, 108, 109, 110, 111,
3034
      112, 112, 113, 114, 114, 115, 115, 116,
3035
      117, 117, 118, 118, 119, 119, 120, 120,
3036
      121, 121, 122, 122, 123, 123, 124, 124,
3037
      124, 125, 125, 126, 126, 126, 127, 127,
3038
      128, 128, 128, 129, 129, 129, 130, 130,
3039
      130, 131, 131, 131, 131, 132, 132, 132,
3040
      133, 133, 133, 134, 134, 134, 134, 135,
3041
      135, 135, 135, 136, 136, 136, 136, 137,
3042
      137, 137, 137, 138, 138, 138, 138, 138,
3043
      139, 139, 139, 139, 140, 140, 140, 140,
3044
      140, 141, 141, 141, 141, 141, 142, 142,
3045
      142, 142, 142, 143, 143, 143, 143, 143,
3046
      144, 144, 144, 144, 144, 144, 145, 145,
3047
      145, 145, 145, 145, 146, 146, 146, 146,
3048
      146, 146, 147, 147, 147, 147, 147, 147,
3049
      147, 148, 148, 148, 148, 148, 148, 149,
3050
      149, 149, 149, 149, 149, 149, 150, 150,
3051
      150, 150, 150, 150, 150, 150, 151, 151,
3052
      151, 151, 151, 151, 151, 151, 152, 152,
3053
      152, 152, 152, 152, 152, 152, 153, 153,
3054
      153, 153, 153, 153, 153, 153, 154, 154,
3055
      154, 154, 154, 154, 154, 154, 154, 155,
3056
      155, 155, 155, 155, 155, 155, 155, 155,
3057
      156, 156, 156, 156, 156, 156, 156, 156,
3058
      156, 157, 157, 157, 157, 157, 157, 157,
3059
      157, 157, 157, 158, 158, 158, 158, 158,
3060
      158, 158, 158, 158, 158, 158, 159, 159,
3061
      159, 159, 159, 159, 159, 159, 159, 159,
3062
      160,
3063
    },
3064
    .len_sym_cost = 84,
3065
  },
3066
};
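As a sanity check on the table above, individual entries can be re-derived from the formulas given in the comment. A stand-alone sketch (not part of the library; it assumes BIT_COST == 16, as asserted later in this file, and 29 length slots for NUM_LEN_SLOTS) that reproduces two values from the match_prob = 0.25 subtable:

	#include <math.h>
	#include <stdio.h>

	int main(void)
	{
		double match_prob = 0.25;
		int num_used_literals = 2;

		/* int(-log2((1 - match_prob) / num_used_literals) * BIT_COST) */
		int lit_cost = (int)(-log2((1.0 - match_prob) / num_used_literals) * 16);

		/* int(-log2(match_prob / NUM_LEN_SLOTS) * BIT_COST), assuming 29 length slots */
		int len_sym_cost = (int)(-log2(match_prob / 29) * 16);

		/* Prints "22 109", matching used_lits_to_lit_cost[2] and len_sym_cost
		 * in the first (match_prob = 0.25) subtable above. */
		printf("%d %d\n", lit_cost, len_sym_cost);
		return 0;
	}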
3067
3068
/*
3069
 * Choose the default costs for literal and length symbols.  These symbols are
3070
 * both part of the litlen alphabet.
3071
 */
3072
static void
3073
deflate_choose_default_litlen_costs(struct libdeflate_compressor *c,
3074
            const u8 *block_begin, u32 block_length,
3075
            u32 *lit_cost, u32 *len_sym_cost)
3076
0
{
3077
0
  unsigned num_used_literals = 0;
3078
0
  u32 literal_freq = block_length;
3079
0
  u32 match_freq = 0;
3080
0
  u32 cutoff;
3081
0
  u32 i;
3082
3083
  /* Calculate the number of distinct literals that exist in the data. */
3084
0
  memset(c->freqs.litlen, 0,
3085
0
         DEFLATE_NUM_LITERALS * sizeof(c->freqs.litlen[0]));
3086
0
  cutoff = literal_freq >> 11; /* Ignore literals used very rarely. */
3087
0
  for (i = 0; i < block_length; i++)
3088
0
    c->freqs.litlen[block_begin[i]]++;
3089
0
  for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
3090
0
    if (c->freqs.litlen[i] > cutoff)
3091
0
      num_used_literals++;
3092
0
  }
3093
0
  if (num_used_literals == 0)
3094
0
    num_used_literals = 1;
3095
3096
  /*
3097
   * Estimate the relative frequency of literals and matches in the
3098
   * optimal parsing solution.  We don't know the optimal solution, so
3099
   * this can only be a very rough estimate.  Therefore, we basically use
3100
   * the match frequency from a greedy parse.  We also apply the min_len
3101
   * heuristic used by the greedy and lazy parsers, to avoid counting too
3102
   * many matches when literals are cheaper than short matches.
3103
   */
3104
0
  match_freq = 0;
3105
0
  i = choose_min_match_len(num_used_literals, c->max_search_depth);
3106
0
  for (; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) {
3107
0
    match_freq += c->p.n.match_len_freqs[i];
3108
0
    literal_freq -= i * c->p.n.match_len_freqs[i];
3109
0
  }
3110
0
  if ((s32)literal_freq < 0) /* shouldn't happen */
3111
0
    literal_freq = 0;
3112
3113
0
  if (match_freq > literal_freq)
3114
0
    i = 2; /* many matches */
3115
0
  else if (match_freq * 4 > literal_freq)
3116
0
    i = 1; /* neutral */
3117
0
  else
3118
0
    i = 0; /* few matches */
3119
3120
0
  STATIC_ASSERT(BIT_COST == 16);
3121
0
  *lit_cost = default_litlen_costs[i].used_lits_to_lit_cost[
3122
0
              num_used_literals];
3123
0
  *len_sym_cost = default_litlen_costs[i].len_sym_cost;
3124
0
}
3125
3126
static forceinline u32
3127
deflate_default_length_cost(unsigned len, u32 len_sym_cost)
3128
0
{
3129
0
  unsigned slot = deflate_length_slot[len];
3130
0
  u32 num_extra_bits = deflate_extra_length_bits[slot];
3131
3132
0
  return len_sym_cost + (num_extra_bits * BIT_COST);
3133
0
}
3134
3135
static forceinline u32
3136
deflate_default_offset_slot_cost(unsigned slot)
3137
0
{
3138
0
  u32 num_extra_bits = deflate_extra_offset_bits[slot];
3139
  /*
3140
   * Assume that all offset symbols are equally probable.
3141
   * The resulting cost is 'int(-log2(1/30) * BIT_COST)',
3142
   * where 30 is the number of potentially-used offset symbols.
3143
   */
3144
0
  u32 offset_sym_cost = 4*BIT_COST + (907*BIT_COST)/1000;
3145
3146
0
  return offset_sym_cost + (num_extra_bits * BIT_COST);
3147
0
}
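Worked out: log2(30) is approximately 4.907, so int(-log2(1/30) * BIT_COST) with BIT_COST = 16 is int(78.5) = 78, and the integer expression above agrees: 4*16 + (907*16)/1000 = 64 + 14 = 78 cost units.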
3148
3149
/* Set default symbol costs for the first block's first optimization pass. */
3150
static void
3151
deflate_set_default_costs(struct libdeflate_compressor *c,
3152
        u32 lit_cost, u32 len_sym_cost)
3153
0
{
3154
0
  unsigned i;
3155
3156
  /* Literals */
3157
0
  for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
3158
0
    c->p.n.costs.literal[i] = lit_cost;
3159
3160
  /* Lengths */
3161
0
  for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
3162
0
    c->p.n.costs.length[i] =
3163
0
      deflate_default_length_cost(i, len_sym_cost);
3164
3165
  /* Offset slots */
3166
0
  for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
3167
0
    c->p.n.costs.offset_slot[i] =
3168
0
      deflate_default_offset_slot_cost(i);
3169
0
}
3170
3171
static forceinline void
3172
deflate_adjust_cost(u32 *cost_p, u32 default_cost, int change_amount)
3173
0
{
3174
0
  if (change_amount == 0)
3175
    /* Block is very similar to previous; prefer previous costs. */
3176
0
    *cost_p = (default_cost + 3 * *cost_p) / 4;
3177
0
  else if (change_amount == 1)
3178
0
    *cost_p = (default_cost + *cost_p) / 2;
3179
0
  else if (change_amount == 2)
3180
0
    *cost_p = (5 * default_cost + 3 * *cost_p) / 8;
3181
0
  else
3182
    /* Block differs greatly from previous; prefer default costs. */
3183
0
    *cost_p = (3 * default_cost + *cost_p) / 4;
3184
0
}
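For example, change_amount == 2 blends 5/8 of the default cost with 3/8 of the cost carried over from the previous block: with a default of 120 and a previous cost of 96, the adjusted cost is (5*120 + 3*96)/8 = 111.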
3185
3186
static forceinline void
3187
deflate_adjust_costs_impl(struct libdeflate_compressor *c,
3188
        u32 lit_cost, u32 len_sym_cost, int change_amount)
3189
0
{
3190
0
  unsigned i;
3191
3192
  /* Literals */
3193
0
  for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
3194
0
    deflate_adjust_cost(&c->p.n.costs.literal[i], lit_cost,
3195
0
            change_amount);
3196
3197
  /* Lengths */
3198
0
  for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
3199
0
    deflate_adjust_cost(&c->p.n.costs.length[i],
3200
0
            deflate_default_length_cost(i,
3201
0
                len_sym_cost),
3202
0
            change_amount);
3203
3204
  /* Offset slots */
3205
0
  for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
3206
0
    deflate_adjust_cost(&c->p.n.costs.offset_slot[i],
3207
0
            deflate_default_offset_slot_cost(i),
3208
0
            change_amount);
3209
0
}
3210
3211
/*
3212
 * Adjust the costs when beginning a new block.
3213
 *
3214
 * Since the current costs are optimized for the data already, it can be helpful
3215
 * to reuse them instead of starting over with the default costs.  However, this
3216
 * depends on how similar the new block is to the previous block.  Therefore,
3217
 * use a heuristic to decide how similar the blocks are, and mix together the
3218
 * current costs and the default costs accordingly.
3219
 */
3220
static void
3221
deflate_adjust_costs(struct libdeflate_compressor *c,
3222
         u32 lit_cost, u32 len_sym_cost)
3223
0
{
3224
0
  u64 total_delta = 0;
3225
0
  u64 cutoff;
3226
0
  int i;
3227
3228
  /*
3229
   * Decide how different the current block is from the previous block,
3230
   * using the block splitting statistics from the current and previous
3231
   * blocks.  The more different the current block is, the more we prefer
3232
   * the default costs rather than the previous block's costs.
3233
   *
3234
   * The algorithm here is similar to the end-of-block check one, but here
3235
   * we compare two entire blocks rather than a partial block with a small
3236
   * extra part, and therefore we need 64-bit numbers in some places.
3237
   */
3238
0
  for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
3239
0
    u64 prev = (u64)c->p.n.prev_observations[i] *
3240
0
          c->split_stats.num_observations;
3241
0
    u64 cur = (u64)c->split_stats.observations[i] *
3242
0
        c->p.n.prev_num_observations;
3243
3244
0
    total_delta += prev > cur ? prev - cur : cur - prev;
3245
0
  }
3246
0
  cutoff = ((u64)c->p.n.prev_num_observations *
3247
0
      c->split_stats.num_observations * 200) / 512;
3248
3249
0
  if (total_delta > 3 * cutoff)
3250
    /* Big change in the data; just use the default costs. */
3251
0
    deflate_set_default_costs(c, lit_cost, len_sym_cost);
3252
0
  else if (4 * total_delta > 9 * cutoff)
3253
0
    deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 3);
3254
0
  else if (2 * total_delta > 3 * cutoff)
3255
0
    deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 2);
3256
0
  else if (2 * total_delta > cutoff)
3257
0
    deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 1);
3258
0
  else
3259
0
    deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 0);
3260
0
}
3261
3262
static void
3263
deflate_set_initial_costs(struct libdeflate_compressor *c,
3264
        const u8 *block_begin, u32 block_length,
3265
        bool is_first_block)
3266
0
{
3267
0
  u32 lit_cost, len_sym_cost;
3268
3269
0
  deflate_choose_default_litlen_costs(c, block_begin, block_length,
3270
0
              &lit_cost, &len_sym_cost);
3271
0
  if (is_first_block)
3272
0
    deflate_set_default_costs(c, lit_cost, len_sym_cost);
3273
0
  else
3274
0
    deflate_adjust_costs(c, lit_cost, len_sym_cost);
3275
0
}
3276
3277
/*
3278
 * Find the minimum-cost path through the graph of possible match/literal
3279
 * choices for this block.
3280
 *
3281
 * We find the minimum cost path from 'c->p.n.optimum_nodes[0]', which
3282
 * represents the node at the beginning of the block, to
3283
 * 'c->p.n.optimum_nodes[block_length]', which represents the node at the end of
3284
 * the block.  Edge costs are evaluated using the cost model 'c->p.n.costs'.
3285
 *
3286
 * The algorithm works backwards, starting at the end node and proceeding
3287
 * backwards one node at a time.  At each node, the minimum cost to reach the
3288
 * end node is computed and the match/literal choice that begins that path is
3289
 * saved.
3290
 */
3291
static void
3292
deflate_find_min_cost_path(struct libdeflate_compressor *c,
3293
         const u32 block_length,
3294
         const struct lz_match *cache_ptr)
3295
0
{
3296
0
  struct deflate_optimum_node *end_node =
3297
0
    &c->p.n.optimum_nodes[block_length];
3298
0
  struct deflate_optimum_node *cur_node = end_node;
3299
3300
0
  cur_node->cost_to_end = 0;
3301
0
  do {
3302
0
    unsigned num_matches;
3303
0
    unsigned literal;
3304
0
    u32 best_cost_to_end;
3305
3306
0
    cur_node--;
3307
0
    cache_ptr--;
3308
3309
0
    num_matches = cache_ptr->length;
3310
0
    literal = cache_ptr->offset;
3311
3312
    /* It's always possible to choose a literal. */
3313
0
    best_cost_to_end = c->p.n.costs.literal[literal] +
3314
0
           (cur_node + 1)->cost_to_end;
3315
0
    cur_node->item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1;
3316
3317
    /* Also consider matches if there are any. */
3318
0
    if (num_matches) {
3319
0
      const struct lz_match *match;
3320
0
      unsigned len;
3321
0
      unsigned offset;
3322
0
      unsigned offset_slot;
3323
0
      u32 offset_cost;
3324
0
      u32 cost_to_end;
3325
3326
      /*
3327
       * Consider each length from the minimum
3328
       * (DEFLATE_MIN_MATCH_LEN) to the length of the longest
3329
       * match found at this position.  For each length, we
3330
       * consider only the smallest offset for which that
3331
       * length is available.  Although this is not guaranteed
3332
       * to be optimal due to the possibility of a larger
3333
       * offset costing less than a smaller offset to code,
3334
       * this is a very useful heuristic.
3335
       */
3336
0
      match = cache_ptr - num_matches;
3337
0
      len = DEFLATE_MIN_MATCH_LEN;
3338
0
      do {
3339
0
        offset = match->offset;
3340
0
        offset_slot = c->p.n.offset_slot_full[offset];
3341
0
        offset_cost =
3342
0
          c->p.n.costs.offset_slot[offset_slot];
3343
0
        do {
3344
0
          cost_to_end = offset_cost +
3345
0
            c->p.n.costs.length[len] +
3346
0
            (cur_node + len)->cost_to_end;
3347
0
          if (cost_to_end < best_cost_to_end) {
3348
0
            best_cost_to_end = cost_to_end;
3349
0
            cur_node->item = len |
3350
0
              ((u32)offset <<
3351
0
               OPTIMUM_OFFSET_SHIFT);
3352
0
          }
3353
0
        } while (++len <= match->length);
3354
0
      } while (++match != cache_ptr);
3355
0
      cache_ptr -= num_matches;
3356
0
    }
3357
0
    cur_node->cost_to_end = best_cost_to_end;
3358
0
  } while (cur_node != &c->p.n.optimum_nodes[0]);
3359
3360
0
  deflate_reset_symbol_frequencies(c);
3361
0
  deflate_tally_item_list(c, block_length);
3362
0
  deflate_make_huffman_codes(&c->freqs, &c->codes);
3363
0
}
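In recurrence form, the backwards pass above computes, for every position i in the block (with cost_to_end[block_length] = 0):

	cost_to_end[i] = min( literal_cost(data[i]) + cost_to_end[i + 1],
	                      min over the matches cached at i, taking each length
	                      len with the smallest offset that reaches it:
	                          length_cost[len] + offset_slot_cost[offset]
	                              + cost_to_end[i + len] )

and item[i] records whichever choice attains the minimum, so deflate_tally_item_list() can later replay the chosen path from the front of the block.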
3364
3365
/*
3366
 * Choose the literals and matches for the current block, then output the block.
3367
 *
3368
 * To choose the literal/match sequence, we find the minimum-cost path through
3369
 * the block's graph of literal/match choices, given a cost model.  However, the
3370
 * true cost of each symbol is unknown until the Huffman codes have been built,
3371
 * but at the same time the Huffman codes depend on the frequencies of chosen
3372
 * symbols.  Consequently, multiple passes must be used to try to approximate an
3373
 * optimal solution.  The first pass uses default costs, mixed with the costs
3374
 * from the previous block when it seems appropriate.  Later passes use the
3375
 * Huffman codeword lengths from the previous pass as the costs.
3376
 *
3377
 * As an alternate strategy, also consider using only literals.  The boolean
3378
 * returned in *used_only_literals indicates whether that strategy was best.
3379
 */
3380
static void
3381
deflate_optimize_and_flush_block(struct libdeflate_compressor *c,
3382
         struct deflate_output_bitstream *os,
3383
         const u8 *block_begin, u32 block_length,
3384
         const struct lz_match *cache_ptr,
3385
         bool is_first_block, bool is_final_block,
3386
         bool *used_only_literals)
3387
0
{
3388
0
  unsigned num_passes_remaining = c->p.n.max_optim_passes;
3389
0
  u32 best_true_cost = UINT32_MAX;
3390
0
  u32 true_cost;
3391
0
  u32 only_lits_cost;
3392
0
  struct deflate_sequence seq_;
3393
0
  struct deflate_sequence *seq = NULL;
3394
0
  u32 i;
3395
3396
  /*
3397
   * On some data, using only literals (no matches) ends up being better
3398
   * than what the iterative optimization algorithm produces.  Therefore,
3399
   * consider using only literals.
3400
   */
3401
0
  deflate_choose_all_literals(c, block_begin, block_length);
3402
0
  only_lits_cost = deflate_compute_true_cost(c);
3403
3404
  /*
3405
   * Force the block to really end at the desired length, even if some
3406
   * matches extend beyond it.
3407
   */
3408
0
  for (i = block_length;
3409
0
       i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN,
3410
0
          ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++)
3411
0
    c->p.n.optimum_nodes[i].cost_to_end = 0x80000000;
3412
3413
  /* Initialize c->p.n.costs with default costs. */
3414
0
  deflate_set_initial_costs(c, block_begin, block_length, is_first_block);
3415
3416
0
  do {
3417
    /*
3418
     * Find the minimum-cost path for this pass.
3419
     * Also set c->freqs and c->codes to match the path.
3420
     */
3421
0
    deflate_find_min_cost_path(c, block_length, cache_ptr);
3422
3423
    /*
3424
     * Compute the exact cost of the block if the path were to be
3425
     * used.  Note that this differs from
3426
     * c->p.n.optimum_nodes[0].cost_to_end in that true_cost uses
3427
     * the actual Huffman codes instead of c->p.n.costs.
3428
     */
3429
0
    true_cost = deflate_compute_true_cost(c);
3430
3431
    /*
3432
     * If the cost didn't improve much from the previous pass, then
3433
     * doing more passes probably won't be helpful, so stop early.
3434
     */
3435
0
    if (true_cost + c->p.n.min_improvement_to_continue >
3436
0
        best_true_cost)
3437
0
      break;
3438
3439
0
    best_true_cost = true_cost;
3440
0
    c->p.n.costs_producing_best_true_cost = c->p.n.costs;
3441
3442
    /* Update the cost model from the Huffman codes. */
3443
0
    deflate_set_costs_from_codes(c, &c->codes.lens);
3444
3445
0
  } while (--num_passes_remaining);
3446
3447
0
  *used_only_literals = false;
3448
0
  if (only_lits_cost < best_true_cost) {
3449
    /* Using only literals ended up being best! */
3450
0
    deflate_choose_all_literals(c, block_begin, block_length);
3451
0
    deflate_set_costs_from_codes(c, &c->codes.lens);
3452
0
    seq_.litrunlen_and_length = block_length;
3453
0
    seq = &seq_;
3454
0
    *used_only_literals = true;
3455
0
  } else if (true_cost >=
3456
0
       best_true_cost + c->p.n.min_bits_to_use_nonfinal_path) {
3457
    /*
3458
     * The best solution was actually from a non-final optimization
3459
     * pass, so recover and use the min-cost path from that pass.
3460
     */
3461
0
    c->p.n.costs = c->p.n.costs_producing_best_true_cost;
3462
0
    deflate_find_min_cost_path(c, block_length, cache_ptr);
3463
0
    deflate_set_costs_from_codes(c, &c->codes.lens);
3464
0
  }
3465
0
  deflate_flush_block(c, os, block_begin, block_length, seq,
3466
0
          is_final_block);
3467
0
}
3468
3469
static void
3470
deflate_near_optimal_init_stats(struct libdeflate_compressor *c)
3471
0
{
3472
0
  init_block_split_stats(&c->split_stats);
3473
0
  memset(c->p.n.new_match_len_freqs, 0,
3474
0
         sizeof(c->p.n.new_match_len_freqs));
3475
0
  memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs));
3476
0
}
3477
3478
static void
3479
deflate_near_optimal_merge_stats(struct libdeflate_compressor *c)
3480
0
{
3481
0
  unsigned i;
3482
3483
0
  merge_new_observations(&c->split_stats);
3484
0
  for (i = 0; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) {
3485
0
    c->p.n.match_len_freqs[i] += c->p.n.new_match_len_freqs[i];
3486
0
    c->p.n.new_match_len_freqs[i] = 0;
3487
0
  }
3488
0
}
3489
3490
/*
3491
 * Save some literal/match statistics from the previous block so that
3492
 * deflate_adjust_costs() will be able to decide how much the current block
3493
 * differs from the previous one.
3494
 */
3495
static void
3496
deflate_near_optimal_save_stats(struct libdeflate_compressor *c)
3497
0
{
3498
0
  int i;
3499
3500
0
  for (i = 0; i < NUM_OBSERVATION_TYPES; i++)
3501
0
    c->p.n.prev_observations[i] = c->split_stats.observations[i];
3502
0
  c->p.n.prev_num_observations = c->split_stats.num_observations;
3503
0
}
3504
3505
static void
3506
deflate_near_optimal_clear_old_stats(struct libdeflate_compressor *c)
3507
0
{
3508
0
  int i;
3509
3510
0
  for (i = 0; i < NUM_OBSERVATION_TYPES; i++)
3511
0
    c->split_stats.observations[i] = 0;
3512
0
  c->split_stats.num_observations = 0;
3513
0
  memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs));
3514
0
}
3515
3516
/*
3517
 * This is the "near-optimal" DEFLATE compressor.  It computes the optimal
3518
 * representation of each DEFLATE block using a minimum-cost path search over
3519
 * the graph of possible match/literal choices for that block, assuming a
3520
 * certain cost for each Huffman symbol.
3521
 *
3522
 * For several reasons, the end result is not guaranteed to be optimal:
3523
 *
3524
 * - Nonoptimal choice of blocks
3525
 * - Heuristic limitations on which matches are actually considered
3526
 * - Symbol costs are unknown until the symbols have already been chosen
3527
 *   (so iterative optimization must be used)
3528
 */
3529
static void
3530
deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
3531
            const u8 *in, size_t in_nbytes,
3532
            struct deflate_output_bitstream *os)
3533
0
{
3534
0
  const u8 *in_next = in;
3535
0
  const u8 *in_block_begin = in_next;
3536
0
  const u8 *in_end = in_next + in_nbytes;
3537
0
  const u8 *in_cur_base = in_next;
3538
0
  const u8 *in_next_slide =
3539
0
    in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE);
3540
0
  unsigned max_len = DEFLATE_MAX_MATCH_LEN;
3541
0
  unsigned nice_len = MIN(c->nice_match_length, max_len);
3542
0
  struct lz_match *cache_ptr = c->p.n.match_cache;
3543
0
  u32 next_hashes[2] = {0, 0};
3544
0
  bool prev_block_used_only_literals = false;
3545
3546
0
  bt_matchfinder_init(&c->p.n.bt_mf);
3547
0
  deflate_near_optimal_init_stats(c);
3548
3549
0
  do {
3550
    /* Starting a new DEFLATE block */
3551
0
    const u8 * const in_max_block_end = choose_max_block_end(
3552
0
        in_block_begin, in_end, SOFT_MAX_BLOCK_LENGTH);
3553
0
    const u8 *prev_end_block_check = NULL;
3554
0
    bool change_detected = false;
3555
0
    const u8 *next_observation = in_next;
3556
0
    unsigned min_len;
3557
3558
    /*
3559
     * Use the minimum match length heuristic to improve the
3560
     * literal/match statistics gathered during matchfinding.
3561
     * However, the actual near-optimal parse won't respect min_len,
3562
     * as it can accurately assess the costs of different matches.
3563
     *
3564
     * If the "use only literals" strategy happened to be the best
3565
     * strategy on the previous block, then probably the
3566
     * min_match_len heuristic is still not aggressive enough for
3567
     * the data, so force gathering literal stats only.
3568
     */
3569
0
    if (prev_block_used_only_literals)
3570
0
      min_len = DEFLATE_MAX_MATCH_LEN + 1;
3571
0
    else
3572
0
      min_len = calculate_min_match_len(
3573
0
          in_block_begin,
3574
0
          in_max_block_end - in_block_begin,
3575
0
          c->max_search_depth);
3576
3577
    /*
3578
     * Find matches until we decide to end the block.  We end the
3579
     * block if any of the following is true:
3580
     *
3581
     * (1) Maximum block length has been reached.
3582
     * (2) Match cache may overflow.
3583
     * (3) Block split heuristic says to split now.
3584
     */
3585
0
    for (;;) {
3586
0
      struct lz_match *matches;
3587
0
      unsigned best_len;
3588
0
      size_t remaining = in_end - in_next;
3589
3590
      /* Slide the window forward if needed. */
3591
0
      if (in_next == in_next_slide) {
3592
0
        bt_matchfinder_slide_window(&c->p.n.bt_mf);
3593
0
        in_cur_base = in_next;
3594
0
        in_next_slide = in_next +
3595
0
          MIN(remaining, MATCHFINDER_WINDOW_SIZE);
3596
0
      }
3597
3598
      /*
3599
       * Find matches with the current position using the
3600
       * binary tree matchfinder and save them in match_cache.
3601
       *
3602
       * Note: the binary tree matchfinder is more suited for
3603
       * optimal parsing than the hash chain matchfinder.  The
3604
       * reasons for this include:
3605
       *
3606
       * - The binary tree matchfinder can find more matches
3607
       *   in the same number of steps.
3608
       * - One of the major advantages of hash chains is that
3609
       *   skipping positions (not searching for matches at
3610
       *   them) is faster; however, with optimal parsing we
3611
       *   search for matches at almost all positions, so this
3612
       *   advantage of hash chains is negated.
3613
       */
3614
0
      matches = cache_ptr;
3615
0
      best_len = 0;
3616
0
      adjust_max_and_nice_len(&max_len, &nice_len, remaining);
3617
0
      if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) {
3618
0
        cache_ptr = bt_matchfinder_get_matches(
3619
0
            &c->p.n.bt_mf,
3620
0
            in_cur_base,
3621
0
            in_next - in_cur_base,
3622
0
            max_len,
3623
0
            nice_len,
3624
0
            c->max_search_depth,
3625
0
            next_hashes,
3626
0
            matches);
3627
0
        if (cache_ptr > matches)
3628
0
          best_len = cache_ptr[-1].length;
3629
0
      }
3630
0
      if (in_next >= next_observation) {
3631
0
        if (best_len >= min_len) {
3632
0
          observe_match(&c->split_stats,
3633
0
                  best_len);
3634
0
          next_observation = in_next + best_len;
3635
0
          c->p.n.new_match_len_freqs[best_len]++;
3636
0
        } else {
3637
0
          observe_literal(&c->split_stats,
3638
0
              *in_next);
3639
0
          next_observation = in_next + 1;
3640
0
        }
3641
0
      }
3642
3643
0
      cache_ptr->length = cache_ptr - matches;
3644
0
      cache_ptr->offset = *in_next;
3645
0
      in_next++;
3646
0
      cache_ptr++;
3647
3648
      /*
3649
       * If there was a very long match found, don't cache any
3650
       * matches for the bytes covered by that match.  This
3651
       * avoids degenerate behavior when compressing highly
3652
       * redundant data, where the number of matches can be
3653
       * very large.
3654
       *
3655
       * This heuristic doesn't actually hurt the compression
3656
       * ratio very much.  If there's a long match, then the
3657
       * data must be highly compressible, so it doesn't
3658
       * matter much what we do.
3659
       */
3660
0
      if (best_len >= DEFLATE_MIN_MATCH_LEN &&
3661
0
          best_len >= nice_len) {
3662
0
        --best_len;
3663
0
        do {
3664
0
          remaining = in_end - in_next;
3665
0
          if (in_next == in_next_slide) {
3666
0
            bt_matchfinder_slide_window(
3667
0
              &c->p.n.bt_mf);
3668
0
            in_cur_base = in_next;
3669
0
            in_next_slide = in_next +
3670
0
              MIN(remaining,
3671
0
                  MATCHFINDER_WINDOW_SIZE);
3672
0
          }
3673
0
          adjust_max_and_nice_len(&max_len,
3674
0
                &nice_len,
3675
0
                remaining);
3676
0
          if (max_len >=
3677
0
              BT_MATCHFINDER_REQUIRED_NBYTES) {
3678
0
            bt_matchfinder_skip_byte(
3679
0
              &c->p.n.bt_mf,
3680
0
              in_cur_base,
3681
0
              in_next - in_cur_base,
3682
0
              nice_len,
3683
0
              c->max_search_depth,
3684
0
              next_hashes);
3685
0
          }
3686
0
          cache_ptr->length = 0;
3687
0
          cache_ptr->offset = *in_next;
3688
0
          in_next++;
3689
0
          cache_ptr++;
3690
0
        } while (--best_len);
3691
0
      }
3692
      /* Maximum block length or end of input reached? */
3693
0
      if (in_next >= in_max_block_end)
3694
0
        break;
3695
      /* Match cache overflowed? */
3696
0
      if (cache_ptr >=
3697
0
          &c->p.n.match_cache[MATCH_CACHE_LENGTH])
3698
0
        break;
3699
      /* Not ready to try to end the block (again)? */
3700
0
      if (!ready_to_check_block(&c->split_stats,
3701
0
              in_block_begin, in_next,
3702
0
              in_end))
3703
0
        continue;
3704
      /* Check if it would be worthwhile to end the block. */
3705
0
      if (do_end_block_check(&c->split_stats,
3706
0
                 in_next - in_block_begin)) {
3707
0
        change_detected = true;
3708
0
        break;
3709
0
      }
3710
      /* Ending the block doesn't seem worthwhile here. */
3711
0
      deflate_near_optimal_merge_stats(c);
3712
0
      prev_end_block_check = in_next;
3713
0
    }
3714
    /*
3715
     * All the matches for this block have been cached.  Now choose
3716
     * the precise end of the block and the sequence of items to
3717
     * output to represent it, then flush the block.
3718
     */
3719
0
    if (change_detected && prev_end_block_check != NULL) {
3720
      /*
3721
       * The block is being ended because a recent chunk of
3722
       * data differs from the rest of the block.  We could
3723
       * end the block at 'in_next' like the greedy and lazy
3724
       * compressors do, but that's not ideal since it would
3725
       * include the differing chunk in the block.  The
3726
       * near-optimal compressor has time to do a better job.
3727
       * Therefore, we rewind to just before the chunk, and
3728
       * output a block that only goes up to there.
3729
       *
3730
       * We then set things up to correctly start the next
3731
       * block, considering that some work has already been
3732
       * done on it (some matches found and stats gathered).
3733
       */
3734
0
      struct lz_match *orig_cache_ptr = cache_ptr;
3735
0
      const u8 *in_block_end = prev_end_block_check;
3736
0
      u32 block_length = in_block_end - in_block_begin;
3737
0
      bool is_first = (in_block_begin == in);
3738
0
      bool is_final = false;
3739
0
      u32 num_bytes_to_rewind = in_next - in_block_end;
3740
0
      size_t cache_len_rewound;
3741
3742
      /* Rewind the match cache. */
3743
0
      do {
3744
0
        cache_ptr--;
3745
0
        cache_ptr -= cache_ptr->length;
3746
0
      } while (--num_bytes_to_rewind);
3747
0
      cache_len_rewound = orig_cache_ptr - cache_ptr;
3748
3749
0
      deflate_optimize_and_flush_block(
3750
0
            c, os, in_block_begin,
3751
0
            block_length, cache_ptr,
3752
0
            is_first, is_final,
3753
0
            &prev_block_used_only_literals);
3754
0
      memmove(c->p.n.match_cache, cache_ptr,
3755
0
        cache_len_rewound * sizeof(*cache_ptr));
3756
0
      cache_ptr = &c->p.n.match_cache[cache_len_rewound];
3757
0
      deflate_near_optimal_save_stats(c);
3758
      /*
3759
       * Clear the stats for the just-flushed block, leaving
3760
       * just the stats for the beginning of the next block.
3761
       */
3762
0
      deflate_near_optimal_clear_old_stats(c);
3763
0
      in_block_begin = in_block_end;
3764
0
    } else {
3765
      /*
3766
       * The block is being ended for a reason other than a
3767
       * differing data chunk being detected.  Don't rewind at
3768
       * all; just end the block at the current position.
3769
       */
3770
0
      u32 block_length = in_next - in_block_begin;
3771
0
      bool is_first = (in_block_begin == in);
3772
0
      bool is_final = (in_next == in_end);
3773
3774
0
      deflate_near_optimal_merge_stats(c);
3775
0
      deflate_optimize_and_flush_block(
3776
0
            c, os, in_block_begin,
3777
0
            block_length, cache_ptr,
3778
0
            is_first, is_final,
3779
0
            &prev_block_used_only_literals);
3780
0
      cache_ptr = &c->p.n.match_cache[0];
3781
0
      deflate_near_optimal_save_stats(c);
3782
0
      deflate_near_optimal_init_stats(c);
3783
0
      in_block_begin = in_next;
3784
0
    }
3785
0
  } while (in_next != in_end);
3786
0
}
3787
3788
/* Initialize c->p.n.offset_slot_full. */
3789
static void
3790
deflate_init_offset_slot_full(struct libdeflate_compressor *c)
3791
0
{
3792
0
  unsigned offset_slot;
3793
0
  unsigned offset;
3794
0
  unsigned offset_end;
3795
3796
0
  for (offset_slot = 0; offset_slot < ARRAY_LEN(deflate_offset_slot_base);
3797
0
       offset_slot++) {
3798
0
    offset = deflate_offset_slot_base[offset_slot];
3799
0
    offset_end = offset +
3800
0
           (1 << deflate_extra_offset_bits[offset_slot]);
3801
0
    do {
3802
0
      c->p.n.offset_slot_full[offset] = offset_slot;
3803
0
    } while (++offset != offset_end);
3804
0
  }
3805
0
}
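The table built here is a direct-indexed map from every possible match offset to its DEFLATE offset slot. For example, slot 6 has base 9 and 2 extra bits, so offsets 9 through 12 all store slot 6, and a lookup like c->p.n.offset_slot_full[11] answers in one array access, which is why the near-optimal parser can afford to query it for every candidate match.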
3806
3807
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
3808
3809
LIBDEFLATEAPI struct libdeflate_compressor *
3810
libdeflate_alloc_compressor(int compression_level)
3811
0
{
3812
0
  struct libdeflate_compressor *c;
3813
0
  size_t size = offsetof(struct libdeflate_compressor, p);
3814
3815
0
  check_buildtime_parameters();
3816
3817
0
  if (compression_level < 0 || compression_level > 12)
3818
0
    return NULL;
3819
3820
0
#if SUPPORT_NEAR_OPTIMAL_PARSING
3821
0
  if (compression_level >= 10)
3822
0
    size += sizeof(c->p.n);
3823
0
  else
3824
0
#endif
3825
0
  {
3826
0
    if (compression_level >= 2)
3827
0
      size += sizeof(c->p.g);
3828
0
    else if (compression_level == 1)
3829
0
      size += sizeof(c->p.f);
3830
0
  }
3831
3832
0
  c = libdeflate_aligned_malloc(MATCHFINDER_MEM_ALIGNMENT, size);
3833
0
  if (!c)
3834
0
    return NULL;
3835
3836
0
  c->compression_level = compression_level;
3837
3838
  /*
3839
   * The higher the compression level, the more we should bother trying to
3840
   * compress very small inputs.
3841
   */
3842
0
  c->max_passthrough_size = 55 - (compression_level * 4);
3843
3844
0
  switch (compression_level) {
3845
0
  case 0:
3846
0
    c->max_passthrough_size = SIZE_MAX;
3847
0
    c->impl = NULL; /* not used */
3848
0
    break;
3849
0
  case 1:
3850
0
    c->impl = deflate_compress_fastest;
3851
    /* max_search_depth is unused. */
3852
0
    c->nice_match_length = 32;
3853
0
    break;
3854
0
  case 2:
3855
0
    c->impl = deflate_compress_greedy;
3856
0
    c->max_search_depth = 6;
3857
0
    c->nice_match_length = 10;
3858
0
    break;
3859
0
  case 3:
3860
0
    c->impl = deflate_compress_greedy;
3861
0
    c->max_search_depth = 12;
3862
0
    c->nice_match_length = 14;
3863
0
    break;
3864
0
  case 4:
3865
0
    c->impl = deflate_compress_greedy;
3866
0
    c->max_search_depth = 16;
3867
0
    c->nice_match_length = 30;
3868
0
    break;
3869
0
  case 5:
3870
0
    c->impl = deflate_compress_lazy;
3871
0
    c->max_search_depth = 16;
3872
0
    c->nice_match_length = 30;
3873
0
    break;
3874
0
  case 6:
3875
0
    c->impl = deflate_compress_lazy;
3876
0
    c->max_search_depth = 35;
3877
0
    c->nice_match_length = 65;
3878
0
    break;
3879
0
  case 7:
3880
0
    c->impl = deflate_compress_lazy;
3881
0
    c->max_search_depth = 100;
3882
0
    c->nice_match_length = 130;
3883
0
    break;
3884
0
  case 8:
3885
0
    c->impl = deflate_compress_lazy2;
3886
0
    c->max_search_depth = 300;
3887
0
    c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
3888
0
    break;
3889
0
  case 9:
3890
#if !SUPPORT_NEAR_OPTIMAL_PARSING
3891
  default:
3892
#endif
3893
0
    c->impl = deflate_compress_lazy2;
3894
0
    c->max_search_depth = 600;
3895
0
    c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
3896
0
    break;
3897
0
#if SUPPORT_NEAR_OPTIMAL_PARSING
3898
0
  case 10:
3899
0
    c->impl = deflate_compress_near_optimal;
3900
0
    c->max_search_depth = 35;
3901
0
    c->nice_match_length = 75;
3902
0
    c->p.n.max_optim_passes = 2;
3903
0
    c->p.n.min_improvement_to_continue = 32;
3904
0
    c->p.n.min_bits_to_use_nonfinal_path = 32;
3905
0
    deflate_init_offset_slot_full(c);
3906
0
    break;
3907
0
  case 11:
3908
0
    c->impl = deflate_compress_near_optimal;
3909
0
    c->max_search_depth = 100;
3910
0
    c->nice_match_length = 150;
3911
0
    c->p.n.max_optim_passes = 4;
3912
0
    c->p.n.min_improvement_to_continue = 16;
3913
0
    c->p.n.min_bits_to_use_nonfinal_path = 16;
3914
0
    deflate_init_offset_slot_full(c);
3915
0
    break;
3916
0
  case 12:
3917
0
  default:
3918
0
    c->impl = deflate_compress_near_optimal;
3919
0
    c->max_search_depth = 300;
3920
0
    c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
3921
0
    c->p.n.max_optim_passes = 10;
3922
0
    c->p.n.min_improvement_to_continue = 1;
3923
0
    c->p.n.min_bits_to_use_nonfinal_path = 1;
3924
0
    deflate_init_offset_slot_full(c);
3925
0
    break;
3926
0
#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
3927
0
  }
3928
3929
0
  deflate_init_static_codes(c);
3930
3931
0
  return c;
3932
0
}
3933
3934
LIBDEFLATEAPI size_t
3935
libdeflate_deflate_compress(struct libdeflate_compressor *c,
3936
          const void *in, size_t in_nbytes,
3937
          void *out, size_t out_nbytes_avail)
3938
0
{
3939
0
  struct deflate_output_bitstream os;
3940
3941
  /*
3942
   * For extremely short inputs, or for compression level 0, just output
3943
   * uncompressed blocks.
3944
   */
3945
0
  if (unlikely(in_nbytes <= c->max_passthrough_size))
3946
0
    return deflate_compress_none(in, in_nbytes,
3947
0
               out, out_nbytes_avail);
3948
3949
  /*
3950
   * Initialize the output bitstream structure.
3951
   *
3952
   * The end is set to OUTPUT_END_PADDING below the true end, so that
3953
   * FLUSH_BITS() can be more efficient.
3954
   */
3955
0
  if (unlikely(out_nbytes_avail <= OUTPUT_END_PADDING))
3956
0
    return 0;
3957
0
  os.bitbuf = 0;
3958
0
  os.bitcount = 0;
3959
0
  os.next = out;
3960
0
  os.end = os.next + out_nbytes_avail - OUTPUT_END_PADDING;
3961
0
  (*c->impl)(c, in, in_nbytes, &os);
3962
  /*
3963
   * If 'os.next' reached 'os.end', then either there was not enough space
3964
   * in the output buffer, or the compressed size would have been within
3965
   * OUTPUT_END_PADDING of the true end.  For performance reasons we don't
3966
   * distinguish between these cases; we just make sure to return some
3967
   * extra space from libdeflate_deflate_compress_bound().
3968
   */
3969
0
  if (os.next >= os.end)
3970
0
    return 0;
3971
0
  ASSERT(os.bitcount <= 7);
3972
0
  if (os.bitcount)
3973
0
    *os.next++ = os.bitbuf;
3974
0
  return os.next - (u8 *)out;
3975
0
}
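For context, a minimal sketch of how a caller typically drives these entry points through the public libdeflate.h API (error handling kept to the essentials):

	#include <libdeflate.h>
	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		const char in[] = "example input, example input, example input";
		size_t in_nbytes = sizeof(in) - 1;

		/* Levels 0-12 are valid; 6 is a middle-of-the-road choice. */
		struct libdeflate_compressor *c = libdeflate_alloc_compressor(6);
		if (c == NULL)
			return 1;

		/* A buffer of at least this size cannot fail for lack of space. */
		size_t out_nbytes_avail = libdeflate_deflate_compress_bound(c, in_nbytes);
		void *out = malloc(out_nbytes_avail);
		if (out == NULL) {
			libdeflate_free_compressor(c);
			return 1;
		}

		size_t out_nbytes = libdeflate_deflate_compress(c, in, in_nbytes,
								out, out_nbytes_avail);
		printf("compressed %zu bytes to %zu bytes\n", in_nbytes, out_nbytes);

		free(out);
		libdeflate_free_compressor(c);
		return 0;
	}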
3976
3977
LIBDEFLATEAPI void
3978
libdeflate_free_compressor(struct libdeflate_compressor *c)
3979
0
{
3980
0
  libdeflate_aligned_free(c);
3981
0
}
3982
3983
unsigned int
3984
libdeflate_get_compression_level(struct libdeflate_compressor *c)
3985
0
{
3986
0
  return c->compression_level;
3987
0
}
3988
3989
LIBDEFLATEAPI size_t
3990
libdeflate_deflate_compress_bound(struct libdeflate_compressor *c,
3991
          size_t in_nbytes)
3992
28.9k
{
3993
28.9k
  size_t bound = 0;
3994
28.9k
  size_t max_blocks;
3995
3996
  /*
3997
   * Since the compressor never uses a compressed block when an
3998
   * uncompressed block is cheaper, the worst case can be no worse than
3999
   * the case where only uncompressed blocks are used.
4000
   *
4001
   * This is true even though up to 7 bits are "wasted" to byte-align the
4002
   * bitstream when a compressed block is followed by an uncompressed
4003
   * block.  This is because a compressed block wouldn't have been used if
4004
   * it wasn't cheaper than an uncompressed block, and uncompressed blocks
4005
   * always end on a byte boundary.  So the alignment bits will, at worst,
4006
   * go up to the place where the uncompressed block would have ended.
4007
   */
4008
4009
  /*
4010
   * The minimum length that is passed to deflate_flush_block() is
4011
   * MIN_BLOCK_LENGTH bytes, except for the final block if needed.
4012
   *
4013
   * If deflate_flush_block() decides to use an uncompressed block, it
4014
   * actually will (in general) output a series of uncompressed blocks in
4015
   * order to stay within the UINT16_MAX limit of DEFLATE.  But this can
4016
   * be disregarded here as long as '2 * MIN_BLOCK_LENGTH <= UINT16_MAX',
4017
   * as in that case this behavior can't result in more blocks than the
4018
   * case where deflate_flush_block() is called with min-length inputs.
4019
   *
4020
   * So the number of uncompressed blocks needed would be bounded by
4021
   * DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH).  However, empty inputs
4022
   * need 1 (empty) block, which gives the final expression below.
4023
   */
4024
28.9k
  STATIC_ASSERT(2 * MIN_BLOCK_LENGTH <= UINT16_MAX);
4025
28.9k
  max_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1);
4026
4027
  /*
4028
   * Each uncompressed block has 5 bytes of overhead, for the BFINAL,
4029
   * BTYPE, LEN, and NLEN fields.  (For the reason explained earlier, the
4030
   * alignment bits at the very start of the block can be disregarded;
4031
   * they would otherwise increase the overhead to 6 bytes per block.)
4032
   */
4033
28.9k
  bound += 5 * max_blocks;
4034
4035
  /* Account for the data itself, stored uncompressed. */
4036
28.9k
  bound += in_nbytes;
4037
4038
  /*
4039
   * Add 1 + OUTPUT_END_PADDING because for performance reasons, the
4040
   * compressor doesn't distinguish between cases where there wasn't
4041
   * enough space and cases where the compressed size would have been
4042
   * 'out_nbytes_avail - OUTPUT_END_PADDING' or greater.  Adding
4043
   * 1 + OUTPUT_END_PADDING to the bound ensures the needed wiggle room.
4044
   */
4045
28.9k
  bound += 1 + OUTPUT_END_PADDING;
4046
4047
28.9k
  return bound;
4048
28.9k
}
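Putting the pieces together, the value returned above is simply

	bound = 5 * MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1)
	        + in_nbytes + 1 + OUTPUT_END_PADDING

i.e. the input size, plus 5 bytes of uncompressed-block header for each worst-case block, plus a small constant of wiggle room, so for large inputs the bound exceeds in_nbytes by only a tiny fraction of the input size.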