/src/samba/lib/compression/lzxpress_huffman.c
Line | Count | Source |
1 | | /* |
2 | | * Samba compression library - LGPLv3 |
3 | | * |
4 | | * Copyright © Catalyst IT 2022 |
5 | | * |
6 | | * Written by Douglas Bagnall <douglas.bagnall@catalyst.net.nz> |
7 | | * and Jennifer Sutton <jennifersutton@catalyst.net.nz> |
8 | | * |
9 | | * ** NOTE! The following LGPL license applies to this file. |
10 | | * ** It does NOT imply that all of Samba is released under the LGPL |
11 | | * |
12 | | * This library is free software; you can redistribute it and/or |
13 | | * modify it under the terms of the GNU Lesser General Public |
14 | | * License as published by the Free Software Foundation; either |
15 | | * version 3 of the License, or (at your option) any later version. |
16 | | * |
17 | | * This library is distributed in the hope that it will be useful, |
18 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
19 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
20 | | * Lesser General Public License for more details. |
21 | | * |
22 | | * You should have received a copy of the GNU Lesser General Public |
23 | | * License along with this library; if not, see <http://www.gnu.org/licenses/>. |
24 | | */ |
25 | | |
26 | | #include <talloc.h> |
27 | | |
28 | | #include "replace.h" |
29 | | #include "lzxpress_huffman.h" |
30 | | #include "lib/util/stable_sort.h" |
31 | | #include "lib/util/debug.h" |
32 | | #include "lib/util/byteorder.h" |
33 | | #include "lib/util/bytearray.h" |
34 | | |
35 | | /* |
36 | | * DEBUG_NO_LZ77_MATCHES toggles the encoding of matches as matches. If it is |
37 | | * true, the potential match is written as a series of literals, which is a |
38 | | * valid but usually inefficient encoding. This is useful for isolating a |
39 | | * problem to either the LZ77 or the Huffman stage. |
40 | | */ |
41 | | #ifndef DEBUG_NO_LZ77_MATCHES |
42 | 5.24k | #define DEBUG_NO_LZ77_MATCHES false |
43 | | #endif |
44 | | |
45 | | /* |
46 | | * DEBUG_HUFFMAN_TREE forces the drawing of ascii art huffman trees during |
47 | | * compression and decompression. |
48 | | * |
49 | | * These trees will also be drawn at DEBUG level 10, but that doesn't work |
50 | | * with cmocka tests. |
51 | | */ |
52 | | #ifndef DEBUG_HUFFMAN_TREE |
53 | 15.8k | #define DEBUG_HUFFMAN_TREE false |
54 | | #endif |
55 | | |
56 | | #if DEBUG_HUFFMAN_TREE |
57 | | #define DBG(...) fprintf(stderr, __VA_ARGS__) |
58 | | #else |
59 | 0 | #define DBG(...) DBG_INFO(__VA_ARGS__) |
60 | | #endif |
61 | | |
62 | | |
63 | 2.17k | #define LZXPRESS_ERROR -1LL |
64 | | |
65 | | /* |
66 | | * We won't encode a match length longer than MAX_MATCH_LENGTH. |
67 | | * |
68 | | * Reports are that Windows has a limit at 64M. |
69 | | */ |
70 | | #define MAX_MATCH_LENGTH (64 * 1024 * 1024) |
71 | | |
72 | | |
73 | | struct bitstream { |
74 | | const uint8_t *bytes; |
75 | | size_t byte_pos; |
76 | | size_t byte_size; |
77 | | uint32_t bits; |
78 | | int remaining_bits; |
79 | | uint16_t *table; |
80 | | }; |
81 | | |
82 | | |
83 | | #if ! defined __has_builtin |
84 | | #define __has_builtin(x) 0 |
85 | | #endif |
86 | | |
87 | | /* |
88 | | * bitlen_nonzero_16() returns the bit number of the most significant bit, or |
89 | | * put another way, the integer log base 2. Log(0) is undefined; the argument |
90 | | * has to be non-zero! |
91 | | * 1 -> 0 |
92 | | * 2,3 -> 1 |
93 | | * 4-7 -> 2 |
94 | | * 1024 -> 10, etc |
95 | | * |
96 | | * Probably this is handled by a compiler intrinsic function that maps to a |
97 | | * dedicated machine instruction. |
98 | | */ |
99 | | |
100 | | static inline int bitlen_nonzero_16(uint16_t x) |
101 | 81.1M | { |
102 | 81.1M | #if __has_builtin(__builtin_clz) |
103 | | |
104 | | /* __builtin_clz returns the number of leading zeros */ |
105 | 81.1M | return (sizeof(unsigned int) * CHAR_BIT) - 1 |
106 | 81.1M | - __builtin_clz((unsigned int) x); |
107 | | |
108 | | #else |
109 | | |
110 | | int count = -1; |
111 | | while(x) { |
112 | | x >>= 1; |
113 | | count++; |
114 | | } |
115 | | return count; |
116 | | |
117 | | #endif |
118 | 81.1M | } |
119 | | |
120 | | |
121 | | struct lzxhuff_compressor_context { |
122 | | const uint8_t *input_bytes; |
123 | | size_t input_size; |
124 | | size_t input_pos; |
125 | | size_t prev_block_pos; |
126 | | uint8_t *output; |
127 | | size_t available_size; |
128 | | size_t output_pos; |
129 | | }; |
130 | | |
131 | | static int compare_huffman_node_count(struct huffman_node *a, |
132 | | struct huffman_node *b) |
133 | 5.61M | { |
134 | 5.61M | return a->count - b->count; |
135 | 5.61M | } |
136 | | |
137 | | static int compare_huffman_node_depth(struct huffman_node *a, |
138 | | struct huffman_node *b) |
139 | 4.73M | { |
140 | 4.73M | int c = a->depth - b->depth; |
141 | 4.73M | if (c != 0) { |
142 | 651k | return c; |
143 | 651k | } |
144 | 4.08M | return (int)a->symbol - (int)b->symbol; |
145 | 4.73M | } |
146 | | |
147 | | |
148 | 1.00G | #define HASH_MASK ((1 << LZX_HUFF_COMP_HASH_BITS) - 1) |
149 | | |
150 | | static inline uint16_t three_byte_hash(const uint8_t *bytes) |
151 | 74.8M | { |
152 | | /* |
153 | | * MS-XCA says "three byte hash", but does not specify it. |
154 | | * |
155 | | * This one is just cobbled together, but has quite good distribution |
156 | | * in the 12-14 bit forms, which is what we care about most. |
157 | | * e.g: 13 bit: median 2048, min 2022, max 2074, stddev 6.0 |
158 | | */ |
159 | 74.8M | uint16_t a = bytes[0]; |
160 | 74.8M | uint16_t b = bytes[1] ^ 0x2e; |
161 | 74.8M | uint16_t c = bytes[2] ^ 0x55; |
162 | 74.8M | uint16_t ca = c - a; |
163 | 74.8M | uint16_t d = ((a + b) << 8) ^ (ca << 5) ^ (c + b) ^ (0xcab + a); |
164 | 74.8M | return d & HASH_MASK; |
165 | 74.8M | } |
166 | | |
167 | | |
168 | | static inline uint16_t encode_match(size_t len, size_t offset) |
169 | 2.76M | { |
170 | 2.76M | uint16_t code = 256; |
171 | 2.76M | code |= MIN(len - 3, 15); |
172 | 2.76M | code |= bitlen_nonzero_16(offset) << 4; |
173 | 2.76M | return code; |
174 | 2.76M | } |
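/*
 * Worked example for encode_match() (illustrative, not from the original
 * file): a match of length 7 at offset 100 gives
 *
 *     length bits:   MIN(7 - 3, 15)         = 4    (low nibble)
 *     distance bits: bitlen_nonzero_16(100) = 6    (bits 4-7)
 *     code = 256 | (6 << 4) | 4             = 0x164
 *
 * so the match symbols 256-511 pack a truncated length and the offset's
 * bit-length; any extra length and the offset's low bits are written out
 * separately in write_compressed_bytes().
 */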
175 | | |
176 | | /* |
177 | | * debug_huffman_tree() uses debug_huffman_tree_print() to draw the Huffman |
178 | | * tree in ascii art. |
179 | | * |
180 | | * Note that the Huffman tree is probably not the same as that implied by the |
181 | | * canonical Huffman encoding that is finally used. That tree would be the |
182 | | * same shape, but with the left and right toggled to sort the branches by |
183 | | * length, after which the symbols for each length sorted by value. |
184 | | */ |
185 | | |
186 | | static void debug_huffman_tree_print(struct huffman_node *node, |
187 | | int *trail, int depth) |
188 | 0 | { |
189 | 0 | if (node->left == NULL) { |
190 | | /* time to print a row */ |
191 | 0 | int j; |
192 | 0 | bool branched = false; |
193 | 0 | int row[17]; |
194 | 0 | char c[100]; |
195 | 0 | int s = node->symbol; |
196 | 0 | char code[17]; |
197 | 0 | if (depth > 15) { |
198 | 0 | fprintf(stderr, |
199 | 0 | " \033[1;31m Max depth exceeded! (%d)\033[0m " |
200 | 0 | " symbol %#3x claimed depth %d count %d\n", |
201 | 0 | depth, node->symbol, node->depth, node->count); |
202 | 0 | return; |
203 | 0 | } |
204 | 0 | for (j = depth - 1; j >= 0; j--) { |
205 | 0 | if (branched) { |
206 | 0 | if (trail[j] == -1) { |
207 | 0 | row[j] = -3; |
208 | 0 | } else { |
209 | 0 | row[j] = -2; |
210 | 0 | } |
211 | 0 | } else if (trail[j] == -1) { |
212 | 0 | row[j] = -1; |
213 | 0 | branched = true; |
214 | 0 | } else { |
215 | 0 | row[j] = trail[j]; |
216 | 0 | } |
217 | 0 | } |
218 | 0 | for (j = 0; j < depth; j++) { |
219 | 0 | switch (row[j]) { |
220 | 0 | case -3: |
221 | 0 | code[j] = '1'; |
222 | 0 | fprintf(stderr, " "); |
223 | 0 | break; |
224 | 0 | case -2: |
225 | 0 | code[j] = '0'; |
226 | 0 | fprintf(stderr, " │ "); |
227 | 0 | break; |
228 | 0 | case -1: |
229 | 0 | code[j] = '1'; |
230 | 0 | fprintf(stderr, " ╰─"); |
231 | 0 | break; |
232 | 0 | default: |
233 | 0 | code[j] = '0'; |
234 | 0 | fprintf(stderr, "%5d─┬─", row[j]); |
235 | 0 | break; |
236 | 0 | } |
237 | 0 | } |
238 | 0 | code[depth] = 0; |
239 | 0 | if (s < 32) { |
240 | 0 | snprintf(c, sizeof(c), |
241 | 0 | "\033[1;32m%02x\033[0m \033[1;33m%c%c%c\033[0m", |
242 | 0 | s, |
243 | 0 | 0xE2, 0x90, 0x80 + s); /* utf-8 for symbol */ |
244 | 0 | } else if (s < 127) { |
245 | 0 | snprintf(c, sizeof(c), |
246 | 0 | "\033[1;32m%2x\033[0m '\033[10;32m%c\033[0m'", |
247 | 0 | s, s); |
248 | 0 | } else if (s < 256) { |
249 | 0 | snprintf(c, sizeof(c), "\033[1;32m%2x\033[0m", s); |
250 | 0 | } else { |
251 | 0 | uint16_t len = (s & 15) + 3; |
252 | 0 | uint16_t dbits = ((s >> 4) & 15) + 1; |
253 | 0 | snprintf(c, sizeof(c), |
254 | 0 | " \033[0;33mlen:%2d%s, " |
255 | 0 | "dist:%d-%d \033[0m \033[1;32m%3x\033[0m%s", |
256 | 0 | len, |
257 | 0 | len == 18 ? "+" : "", |
258 | 0 | 1 << (dbits - 1), |
259 | 0 | (1 << dbits) - 1, |
260 | 0 | s, |
261 | 0 | s == 256 ? " \033[1;31mEOF\033[0m" : ""); |
262 | | |
263 | 0 | } |
264 | | |
265 | 0 | fprintf(stderr, "──%5d %s \033[2;37m%s\033[0m\n", |
266 | 0 | node->count, c, code); |
267 | 0 | return; |
268 | 0 | } |
269 | 0 | trail[depth] = node->count; |
270 | 0 | debug_huffman_tree_print(node->left, trail, depth + 1); |
271 | 0 | trail[depth] = -1; |
272 | 0 | debug_huffman_tree_print(node->right, trail, depth + 1); |
273 | 0 | } |
274 | | |
275 | | |
276 | | /* |
277 | | * If DEBUG_HUFFMAN_TREE is defined true, debug_huffman_tree() |
278 | | * will print a tree looking something like this: |
279 | | * |
280 | | * 7─┬─── 3 len:18+, dist:1-1 10f 0 |
281 | | * ╰─ 4─┬─ 2─┬─── 1 61 'a' 100 |
282 | | * │ ╰─── 1 62 'b' 101 |
283 | | * ╰─ 2─┬─── 1 63 'c' 110 |
284 | | * ╰─── 1 len: 3, dist:1-1 100 EOF 111 |
285 | | * |
286 | | * This is based off a Huffman root node, and the tree may not be the same as |
287 | | * the canonical tree. |
288 | | */ |
289 | | static void debug_huffman_tree(struct huffman_node *root) |
290 | 0 | { |
291 | 0 | int trail[17]; |
292 | 0 | debug_huffman_tree_print(root, trail, 0); |
293 | 0 | } |
294 | | |
295 | | |
296 | | /* |
297 | | * If DEBUG_HUFFMAN_TREE is defined true, debug_huffman_tree_from_table() |
298 | | * will print something like this based on a decoding symbol table. |
299 | | * |
300 | | * Tree from decoding table 9 nodes → 5 codes |
301 | | * 10000─┬─── 5000 len:18+, dist:1-1 10f 0 |
302 | | * ╰─ 5000─┬─ 2500─┬─── 1250 61 'a' 100 |
303 | | * │ ╰─── 1250 62 'b' 101 |
304 | | * ╰─ 2500─┬─── 1250 63 'c' 110 |
305 | | * ╰─── 1250 len: 3, dist:1-1 100 EOF 111 |
306 | | * |
307 | | * This is the canonical form of the Huffman tree where the actual counts |
308 | | * aren't known (we use "10000" to help indicate relative frequencies). |
309 | | */ |
310 | | static void debug_huffman_tree_from_table(uint16_t *table) |
311 | 0 | { |
312 | 0 | int trail[17]; |
313 | 0 | struct huffman_node nodes[1024] = {{0}}; |
314 | 0 | uint16_t codes[1024]; |
315 | 0 | size_t n = 1; |
316 | 0 | size_t i = 0; |
317 | 0 | codes[0] = 0; |
318 | 0 | nodes[0].count = 10000; |
319 | | |
320 | 0 | while (i < n) { |
321 | 0 | uint16_t index = codes[i]; |
322 | 0 | struct huffman_node *node = &nodes[i]; |
323 | 0 | if (table[index] == 0xffff) { |
324 | | /* internal node */ |
325 | 0 | index <<= 1; |
326 | | /* left */ |
327 | 0 | index++; |
328 | 0 | codes[n] = index; |
329 | 0 | node->left = nodes + n; |
330 | 0 | nodes[n].count = node->count >> 1; |
331 | 0 | n++; |
332 | | /*right*/ |
333 | 0 | index++; |
334 | 0 | codes[n] = index; |
335 | 0 | node->right = nodes + n; |
336 | 0 | nodes[n].count = node->count >> 1; |
337 | 0 | n++; |
338 | 0 | } else { |
339 | | /* leaf node */ |
340 | 0 | node->symbol = table[index] & 511; |
341 | 0 | } |
342 | 0 | i++; |
343 | 0 | } |
344 | | |
345 | 0 | fprintf(stderr, |
346 | 0 | "\033[1;34m Tree from decoding table\033[0m " |
347 | 0 | "%zu nodes → %zu codes\n", |
348 | 0 | n, (n + 1) / 2); |
349 | 0 | debug_huffman_tree_print(nodes, trail, 0); |
350 | 0 | } |
351 | | |
352 | | |
353 | | static bool depth_walk(struct huffman_node *n, uint32_t depth) |
354 | 2.12M | { |
355 | 2.12M | bool ok; |
356 | 2.12M | if (n->left == NULL) { |
357 | | /* this is a leaf, record the depth */ |
358 | 1.06M | n->depth = depth; |
359 | 1.06M | return true; |
360 | 1.06M | } |
361 | 1.06M | if (depth > 14) { |
362 | 963 | return false; |
363 | 963 | } |
364 | 1.06M | ok = (depth_walk(n->left, depth + 1) && |
365 | 1.05M | depth_walk(n->right, depth + 1)); |
366 | | |
367 | 1.06M | return ok; |
368 | 1.06M | } |
369 | | |
370 | | |
371 | | static bool check_and_record_depths(struct huffman_node *root) |
372 | 6.20k | { |
373 | 6.20k | return depth_walk(root, 0); |
374 | 6.20k | } |
375 | | |
376 | | |
377 | | static bool encode_values(struct huffman_node *leaves, |
378 | | size_t n_leaves, |
379 | | uint16_t symbol_values[512]) |
380 | 5.24k | { |
381 | 5.24k | size_t i; |
382 | | /* |
383 | | * See, we have a leading 1 in our internal code representation, which |
384 | | * indicates the code length. |
385 | | */ |
386 | 5.24k | uint32_t code = 1; |
387 | 5.24k | uint32_t code_len = 0; |
388 | 5.24k | memset(symbol_values, 0, sizeof(uint16_t) * 512); |
389 | 820k | for (i = 0; i < n_leaves; i++) { |
390 | 815k | code <<= leaves[i].depth - code_len; |
391 | 815k | code_len = leaves[i].depth; |
392 | | |
393 | 815k | symbol_values[leaves[i].symbol] = code; |
394 | 815k | code++; |
395 | 815k | } |
396 | | /* |
397 | | * The last code should be 11111... with code_len + 1 ones. The final |
398 | | * code++ will wrap this round to 1000... with code_len + 1 zeroes. |
399 | | */ |
400 | | |
401 | 5.24k | if (code != 2 << code_len) { |
402 | 0 | return false; |
403 | 0 | } |
404 | 5.24k | return true; |
405 | 5.24k | } |
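/*
 * Worked example for encode_values() (illustrative, not from the original
 * file): four leaves sorted by (depth, symbol) with depths {1, 2, 3, 3}
 * are assigned, in order,
 *
 *     0b10 ('0'), 0b110 ('10'), 0b1110 ('110'), 0b1111 ('111')
 *
 * where the leading 1 only records the code length. The final code++
 * turns 0b1111 into 0b10000 == 2 << 3, satisfying the completeness check
 * above.
 */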
406 | | |
407 | | |
408 | | static int generate_huffman_codes(struct huffman_node *leaf_nodes, |
409 | | struct huffman_node *internal_nodes, |
410 | | uint16_t symbol_values[512]) |
411 | 5.24k | { |
412 | 5.24k | size_t head_leaf = 0; |
413 | 5.24k | size_t head_branch = 0; |
414 | 5.24k | size_t tail_branch = 0; |
415 | 5.24k | struct huffman_node *huffman_root = NULL; |
416 | 5.24k | size_t i, j; |
417 | 5.24k | size_t n_leaves = 0; |
418 | | |
419 | | /* |
420 | | * Before we sort the nodes, we can eliminate the unused ones. |
421 | | */ |
422 | 2.69M | for (i = 0; i < 512; i++) { |
423 | 2.68M | if (leaf_nodes[i].count) { |
424 | 815k | leaf_nodes[n_leaves] = leaf_nodes[i]; |
425 | 815k | n_leaves++; |
426 | 815k | } |
427 | 2.68M | } |
428 | 5.24k | if (n_leaves == 0) { |
429 | 0 | return LZXPRESS_ERROR; |
430 | 0 | } |
431 | 5.24k | if (n_leaves == 1) { |
432 | | /* |
433 | | * There is *almost* no way this should happen, and it would |
434 | | * ruin the tree (because the shortest possible codes are 1 |
435 | | * bit long, and there are two of them). |
436 | | * |
437 | | * The only way to get here is in an internal block in a |
438 | | * 3-or-more block message (i.e. > 128k), which consists |
439 | | * entirely of a match starting in the previous block (if it |
440 | | * was the end block, it would have the EOF symbol). |
441 | | * |
442 | | * What we do is add a dummy symbol which is this one XOR 256. |
443 | | * It won't be used in the stream but will balance the tree. |
444 | | */ |
445 | 72 | leaf_nodes[1] = leaf_nodes[0]; |
446 | 72 | leaf_nodes[1].symbol ^= 0x100; |
447 | 72 | n_leaves = 2; |
448 | 72 | } |
449 | | |
450 | | /* note, in sort we're using internal_nodes as auxiliary space */ |
451 | 5.24k | stable_sort(leaf_nodes, |
452 | 5.24k | internal_nodes, |
453 | 5.24k | n_leaves, |
454 | 5.24k | sizeof(struct huffman_node), |
455 | 5.24k | (samba_compare_fn_t)compare_huffman_node_count); |
456 | | |
457 | | /* |
458 | | * This outer loop is for re-quantizing the counts if the tree is too |
459 | | * tall (>15), which we need to do because the final encoding can't |
460 | | * express a tree that deep. |
461 | | * |
462 | | * In theory, this should be a 'while (true)' loop, but we chicken |
463 | | * out with 10 iterations, just in case. |
464 | | * |
465 | | * In practice it will almost always resolve in the first round; if |
466 | | * not then, in the second or third. Remember we'll looking at 64k or |
467 | | * less, so the rarest we can have is 1 in 64k; each round of |
468 | | * quantization effectively doubles its frequency to 1 in 32k, 1 in |
469 | | * 16k, etc, until we're treating the rare symbol as actually quite |
470 | | * common. |
471 | | */ |
472 | 6.20k | for (j = 0; j < 10; j++) { |
473 | 6.20k | bool less_than_15_bits; |
474 | 1.10M | while (true) { |
475 | 1.10M | struct huffman_node *a = NULL; |
476 | 1.10M | struct huffman_node *b = NULL; |
477 | 1.10M | size_t leaf_len = n_leaves - head_leaf; |
478 | 1.10M | size_t internal_len = tail_branch - head_branch; |
479 | | |
480 | 1.10M | if (leaf_len + internal_len == 1) { |
481 | | /* |
482 | | * We have the complete tree. The root will be |
483 | | * an internal node unless there is just one |
484 | | * symbol, which is already impossible. |
485 | | */ |
486 | 6.20k | if (unlikely(leaf_len == 1)) { |
487 | 0 | return LZXPRESS_ERROR; |
488 | 6.20k | } else { |
489 | 6.20k | huffman_root = \ |
490 | 6.20k | &internal_nodes[head_branch]; |
491 | 6.20k | } |
492 | 6.20k | break; |
493 | 6.20k | } |
494 | | /* |
495 | | * We know here we have at least two nodes, and we |
496 | | * want to select the two lowest scoring ones. Those |
497 | | * have to be either a) the head of each queue, or b) |
498 | | * the first two nodes of either queue. |
499 | | * |
500 | | * The complicating factors are: a) we need to check |
501 | | * the length of each queue, and b) in the case of |
502 | | * ties, we prefer to pair leaves with leaves. |
503 | | * |
504 | | * Note a complication we don't have: the leaf node |
505 | | * queue never grows, and the subtree queue starts |
506 | | * empty and cannot grow beyond n - 1. It feeds on |
507 | | * itself. We don't need to think about overflow. |
508 | | */ |
509 | 1.09M | if (leaf_len == 0) { |
510 | | /* two from subtrees */ |
511 | 295k | a = &internal_nodes[head_branch]; |
512 | 295k | b = &internal_nodes[head_branch + 1]; |
513 | 295k | head_branch += 2; |
514 | 800k | } else if (internal_len == 0) { |
515 | | /* two from nodes */ |
516 | 6.20k | a = &leaf_nodes[head_leaf]; |
517 | 6.20k | b = &leaf_nodes[head_leaf + 1]; |
518 | 6.20k | head_leaf += 2; |
519 | 794k | } else if (leaf_len == 1 && internal_len == 1) { |
520 | | /* one of each */ |
521 | 686 | a = &leaf_nodes[head_leaf]; |
522 | 686 | b = &internal_nodes[head_branch]; |
523 | 686 | head_branch++; |
524 | 686 | head_leaf++; |
525 | 794k | } else { |
526 | | /* |
527 | | * Take the lowest head, twice, checking for |
528 | | * length after taking the first one. |
529 | | */ |
530 | 794k | if (leaf_nodes[head_leaf].count > |
531 | 794k | internal_nodes[head_branch].count) { |
532 | 247k | a = &internal_nodes[head_branch]; |
533 | 247k | head_branch++; |
534 | 247k | if (internal_len == 1) { |
535 | 1.52k | b = &leaf_nodes[head_leaf]; |
536 | 1.52k | head_leaf++; |
537 | 1.52k | goto done; |
538 | 1.52k | } |
539 | 546k | } else { |
540 | 546k | a = &leaf_nodes[head_leaf]; |
541 | 546k | head_leaf++; |
542 | 546k | if (leaf_len == 1) { |
543 | 3.02k | b = &internal_nodes[head_branch]; |
544 | 3.02k | head_branch++; |
545 | 3.02k | goto done; |
546 | 3.02k | } |
547 | 546k | } |
548 | | /* the other node */ |
549 | 789k | if (leaf_nodes[head_leaf].count > |
550 | 789k | internal_nodes[head_branch].count) { |
551 | 247k | b = &internal_nodes[head_branch]; |
552 | 247k | head_branch++; |
553 | 541k | } else { |
554 | 541k | b = &leaf_nodes[head_leaf]; |
555 | 541k | head_leaf++; |
556 | 541k | } |
557 | 789k | } |
558 | 1.09M | done: |
559 | | /* |
560 | | * Now we add a new node to the subtrees list that |
561 | | * combines the score of node_a and node_b, and points |
562 | | * to them as children. |
563 | | */ |
564 | 1.09M | internal_nodes[tail_branch].count = a->count + b->count; |
565 | 1.09M | internal_nodes[tail_branch].left = a; |
566 | 1.09M | internal_nodes[tail_branch].right = b; |
567 | 1.09M | tail_branch++; |
568 | 1.09M | if (tail_branch == n_leaves) { |
569 | | /* |
570 | | * We're not getting here, no way, never ever. |
571 | | * Unless we made a terrible mistake. |
572 | | * |
573 | | * That is, in a binary tree with n leaves, |
574 | | * there are ALWAYS n-1 internal nodes. |
575 | | */ |
576 | 0 | return LZXPRESS_ERROR; |
577 | 0 | } |
578 | 1.09M | } |
579 | 6.20k | if (CHECK_DEBUGLVL(10) || DEBUG_HUFFMAN_TREE) { |
580 | 0 | debug_huffman_tree(huffman_root); |
581 | 0 | } |
582 | | /* |
583 | | * We have a tree, and need to turn it into a lookup table, |
584 | | * and see if it is shallow enough (<= 15). |
585 | | */ |
586 | 6.20k | less_than_15_bits = check_and_record_depths(huffman_root); |
587 | 6.20k | if (less_than_15_bits) { |
588 | | /* |
589 | | * Now the leaf nodes know how deep they are, and we |
590 | | * no longer need the internal nodes. |
591 | | * |
592 | | * We need to sort the nodes of equal depth, so that |
593 | | * they are sorted by depth first, and symbol value |
594 | | * second. The internal_nodes can again be auxiliary |
595 | | * memory. |
596 | | */ |
597 | 5.24k | stable_sort( |
598 | 5.24k | leaf_nodes, |
599 | 5.24k | internal_nodes, |
600 | 5.24k | n_leaves, |
601 | 5.24k | sizeof(struct huffman_node), |
602 | 5.24k | (samba_compare_fn_t)compare_huffman_node_depth); |
603 | | |
604 | 5.24k | encode_values(leaf_nodes, n_leaves, symbol_values); |
605 | | |
606 | 5.24k | return n_leaves; |
607 | 5.24k | } |
608 | | |
609 | | /* |
610 | | * requantize by halving and rounding up, so that small counts |
611 | | * become relatively bigger. This will lead to a flatter tree. |
612 | | */ |
613 | 287k | for (i = 0; i < n_leaves; i++) { |
614 | 286k | leaf_nodes[i].count >>= 1; |
615 | 286k | leaf_nodes[i].count += 1; |
616 | 286k | } |
617 | 963 | head_leaf = 0; |
618 | 963 | head_branch = 0; |
619 | 963 | tail_branch = 0; |
620 | 963 | } |
621 | 0 | return LZXPRESS_ERROR; |
622 | 5.24k | } |
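/*
 * Requantization example for generate_huffman_codes() (illustrative, not
 * from the original file): counts {1, 40000} become {1, 20001} after one
 * halve-and-round-up pass, so the rare symbol's relative weight roughly
 * doubles each round, flattening the implied tree until every code fits
 * in 15 bits.
 */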
623 | | |
624 | | /* |
625 | | * LZX_HUFF_COMP_HASH_SEARCH_ATTEMPTS is how far ahead to search in the |
626 | | * circular hash table for a match, before we give up. A bigger number will |
627 | | * generally lead to better but slower compression, but a stupidly big number |
628 | | * will just be worse. |
629 | | * |
630 | | * If you're fiddling with this, consider also fiddling with |
631 | | * LZX_HUFF_COMP_HASH_BITS. |
632 | | */ |
633 | 1.12G | #define LZX_HUFF_COMP_HASH_SEARCH_ATTEMPTS 5 |
634 | | |
635 | | static inline void store_match(uint16_t *hash_table, |
636 | | uint16_t h, |
637 | | uint16_t offset) |
638 | 74.8M | { |
639 | 74.8M | int i; |
640 | 74.8M | uint16_t o = hash_table[h]; |
641 | 74.8M | uint16_t h2; |
642 | 74.8M | uint16_t worst_h; |
643 | 74.8M | int worst_score; |
644 | | |
645 | 74.8M | if (o == 0xffff) { |
646 | | /* there is nothing there yet */ |
647 | 16.4M | hash_table[h] = offset; |
648 | 16.4M | return; |
649 | 16.4M | } |
650 | 258M | for (i = 1; i < LZX_HUFF_COMP_HASH_SEARCH_ATTEMPTS; i++) { |
651 | 211M | h2 = (h + i) & HASH_MASK; |
652 | 211M | if (hash_table[h2] == 0xffff) { |
653 | 11.0M | hash_table[h2] = offset; |
654 | 11.0M | return; |
655 | 11.0M | } |
656 | 211M | } |
657 | | /* |
658 | | * There are no slots, but we really want to store this, so we'll kick |
659 | | * out the one with the longest distance. |
660 | | */ |
661 | 47.2M | worst_h = h; |
662 | 47.2M | worst_score = offset - o; |
663 | 236M | for (i = 1; i < LZX_HUFF_COMP_HASH_SEARCH_ATTEMPTS; i++) { |
664 | 189M | int score; |
665 | 189M | h2 = (h + i) & HASH_MASK; |
666 | 189M | o = hash_table[h2]; |
667 | 189M | score = offset - o; |
668 | 189M | if (score > worst_score) { |
669 | 55.3M | worst_score = score; |
670 | 55.3M | worst_h = h2; |
671 | 55.3M | } |
672 | 189M | } |
673 | 47.2M | hash_table[worst_h] = offset; |
674 | 47.2M | } |
675 | | |
676 | | |
677 | | /* |
678 | | * Yes, struct match looks a lot like a DATA_BLOB. |
679 | | */ |
680 | | struct match { |
681 | | const uint8_t *there; |
682 | | size_t length; |
683 | | }; |
684 | | |
685 | | |
686 | | static inline struct match lookup_match(uint16_t *hash_table, |
687 | | uint16_t h, |
688 | | const uint8_t *data, |
689 | | const uint8_t *here, |
690 | | size_t max_len) |
691 | 126M | { |
692 | 126M | int i; |
693 | 126M | uint16_t o = hash_table[h]; |
694 | 126M | uint16_t h2; |
695 | 126M | size_t len; |
696 | 126M | const uint8_t *there = NULL; |
697 | 126M | struct match best = {0}; |
698 | | |
699 | 630M | for (i = 0; i < LZX_HUFF_COMP_HASH_SEARCH_ATTEMPTS; i++) { |
700 | 534M | h2 = (h + i) & HASH_MASK; |
701 | 534M | o = hash_table[h2]; |
702 | 534M | if (o == 0xffff) { |
703 | | /* |
704 | | * in setting this, we would never have stepped over |
705 | | * an 0xffff, so we won't now. |
706 | | */ |
707 | 30.7M | break; |
708 | 30.7M | } |
709 | 503M | there = data + o; |
710 | 503M | if (here - there > 65534 || there > here) { |
711 | 45.8M | continue; |
712 | 45.8M | } |
713 | | |
714 | | /* |
715 | | * When we already have a long match, we can try to avoid |
716 | | * measuring out another long, but shorter match. |
717 | | */ |
718 | 458M | if (best.length > 1000 && |
719 | 30.5k | there[best.length - 1] != best.there[best.length - 1]) { |
720 | 11.0k | continue; |
721 | 11.0k | } |
722 | | |
723 | 458M | for (len = 0; |
724 | 740M | len < max_len && here[len] == there[len]; |
725 | 458M | len++) { |
726 | | /* counting */ |
727 | 282M | } |
728 | 458M | if (len > 2) { |
729 | | /* |
730 | | * As a tiebreaker, we prefer the closer match which |
731 | | * is likely to encode smaller (and certainly no worse). |
732 | | */ |
733 | 8.18M | if (len > best.length || |
734 | 5.36M | (len == best.length && there > best.there)) { |
735 | 5.36M | best.length = len; |
736 | 5.36M | best.there = there; |
737 | 5.36M | } |
738 | 8.18M | } |
739 | 458M | } |
740 | 126M | return best; |
741 | 126M | } |
742 | | |
743 | | |
744 | | |
745 | | static ssize_t lz77_encode_block(struct lzxhuff_compressor_context *cmp_ctx, |
746 | | struct lzxhuff_compressor_mem *cmp_mem, |
747 | | uint16_t *hash_table, |
748 | | uint16_t *prev_hash_table) |
749 | 5.24k | { |
750 | 5.24k | uint16_t *intermediate = cmp_mem->intermediate; |
751 | 5.24k | struct huffman_node *leaf_nodes = cmp_mem->leaf_nodes; |
752 | 5.24k | uint16_t *symbol_values = cmp_mem->symbol_values; |
753 | 5.24k | size_t i, j, intermediate_len; |
754 | 5.24k | const uint8_t *data = cmp_ctx->input_bytes + cmp_ctx->input_pos; |
755 | 5.24k | const uint8_t *prev_block = NULL; |
756 | 5.24k | size_t remaining_size = cmp_ctx->input_size - cmp_ctx->input_pos; |
757 | 5.24k | size_t block_end = MIN(65536, remaining_size); |
758 | 5.24k | struct match match; |
759 | 5.24k | int n_symbols; |
760 | | |
761 | 5.24k | if (cmp_ctx->input_size < cmp_ctx->input_pos) { |
762 | 0 | return LZXPRESS_ERROR; |
763 | 0 | } |
764 | | |
765 | 5.24k | if (cmp_ctx->prev_block_pos != cmp_ctx->input_pos) { |
766 | 2.34k | prev_block = cmp_ctx->input_bytes + cmp_ctx->prev_block_pos; |
767 | 2.89k | } else if (prev_hash_table != NULL) { |
768 | | /* we've got confused! hash and block should go together */ |
769 | 0 | return LZXPRESS_ERROR; |
770 | 0 | } |
771 | | |
772 | | /* |
773 | | * leaf_nodes is used to count the symbols seen, for later Huffman |
774 | | * encoding. |
775 | | */ |
776 | 2.69M | for (i = 0; i < 512; i++) { |
777 | 2.68M | leaf_nodes[i] = (struct huffman_node) { |
778 | 2.68M | .symbol = i |
779 | 2.68M | }; |
780 | 2.68M | } |
781 | | |
782 | 5.24k | j = 0; |
783 | | |
784 | 5.24k | if (remaining_size < 41 || DEBUG_NO_LZ77_MATCHES) { |
785 | | /* |
786 | | * There is no point doing a hash table and looking for |
787 | | * matches in this tiny block (remembering we are committed to |
788 | | * using 32 bits, so there's a good chance we wouldn't even |
789 | | * save a byte). The threshold of 41 matches Windows. |
790 | | * If remaining_size < 3, we *can't* do the hash. |
791 | | */ |
792 | 603 | i = 0; |
793 | 4.64k | } else { |
794 | | /* |
795 | | * We use 0xffff as the unset value for table, because it is |
796 | | * not a valid match offset (and 0x0 is). |
797 | | */ |
798 | 4.64k | memset(hash_table, 0xff, sizeof(cmp_mem->hash_table1)); |
799 | | |
800 | 74.8M | for (i = 0; i <= block_end - 3; i++) { |
801 | 74.8M | uint16_t code; |
802 | 74.8M | const uint8_t *here = data + i; |
803 | 74.8M | uint16_t h = three_byte_hash(here); |
804 | 74.8M | size_t max_len = MIN(remaining_size - i, MAX_MATCH_LENGTH); |
805 | 74.8M | match = lookup_match(hash_table, |
806 | 74.8M | h, |
807 | 74.8M | data, |
808 | 74.8M | here, |
809 | 74.8M | max_len); |
810 | | |
811 | 74.8M | if (match.there == NULL && prev_hash_table != NULL) { |
812 | | /* |
813 | | * If this is not the first block, |
814 | | * backreferences can look into the previous |
815 | | * block (but only as far as 65535 bytes, so |
816 | | * the end of this block cannot see the start |
817 | | * of the last one). |
818 | | */ |
819 | 51.5M | match = lookup_match(prev_hash_table, |
820 | 51.5M | h, |
821 | 51.5M | prev_block, |
822 | 51.5M | here, |
823 | 51.5M | remaining_size - i); |
824 | 51.5M | } |
825 | | |
826 | 74.8M | store_match(hash_table, h, i); |
827 | | |
828 | 74.8M | if (match.there == NULL) { |
829 | | /* add a literal and move on. */ |
830 | 72.1M | uint8_t c = data[i]; |
831 | 72.1M | leaf_nodes[c].count++; |
832 | 72.1M | intermediate[j] = c; |
833 | 72.1M | j++; |
834 | 72.1M | continue; |
835 | 72.1M | } |
836 | | |
837 | | /* a real match */ |
838 | 2.76M | if (match.length <= 65538) { |
839 | 2.76M | intermediate[j] = 0xffff; |
840 | 2.76M | intermediate[j + 1] = match.length - 3; |
841 | 2.76M | intermediate[j + 2] = here - match.there; |
842 | 2.76M | j += 3; |
843 | 2.76M | } else { |
844 | 317 | size_t m = match.length - 3; |
845 | 317 | intermediate[j] = 0xfffe; |
846 | 317 | intermediate[j + 1] = m & 0xffff; |
847 | 317 | intermediate[j + 2] = m >> 16; |
848 | 317 | intermediate[j + 3] = here - match.there; |
849 | 317 | j += 4; |
850 | 317 | } |
851 | 2.76M | code = encode_match(match.length, here - match.there); |
852 | 2.76M | leaf_nodes[code].count++; |
853 | 2.76M | i += match.length - 1; /* `- 1` for the loop i++ */ |
854 | | /* |
855 | | * A match can take us past the intended block length, |
856 | | * extending the block. We don't need to do anything |
857 | | * special for this case -- the loops will naturally |
858 | | * do the right thing. |
859 | | */ |
860 | 2.76M | } |
861 | 4.64k | } |
862 | | |
863 | | /* |
864 | | * There might be some bytes at the end. |
865 | | */ |
866 | 18.3k | for (; i < block_end; i++) { |
867 | 13.0k | leaf_nodes[data[i]].count++; |
868 | 13.0k | intermediate[j] = data[i]; |
869 | 13.0k | j++; |
870 | 13.0k | } |
871 | | |
872 | 5.24k | if (i == remaining_size) { |
873 | | /* add a trailing EOF marker (256) */ |
874 | 2.88k | intermediate[j] = 0xffff; |
875 | 2.88k | intermediate[j + 1] = 0; |
876 | 2.88k | intermediate[j + 2] = 1; |
877 | 2.88k | j += 3; |
878 | 2.88k | leaf_nodes[256].count++; |
879 | 2.88k | } |
880 | | |
881 | 5.24k | intermediate_len = j; |
882 | | |
883 | 5.24k | cmp_ctx->prev_block_pos = cmp_ctx->input_pos; |
884 | 5.24k | cmp_ctx->input_pos += i; |
885 | | |
886 | | /* fill in the symbols table */ |
887 | 5.24k | n_symbols = generate_huffman_codes(leaf_nodes, |
888 | 5.24k | cmp_mem->internal_nodes, |
889 | 5.24k | symbol_values); |
890 | 5.24k | if (n_symbols < 0) { |
891 | 0 | return n_symbols; |
892 | 0 | } |
893 | | |
894 | 5.24k | return intermediate_len; |
895 | 5.24k | } |
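/*
 * Summary of the intermediate representation (an illustrative restatement,
 * not from the original file): lz77_encode_block() fills
 * cmp_mem->intermediate with a uint16_t stream in which
 *
 *     0x0000 - 0x00ff          a literal byte
 *     0xffff, len - 3, dist    a match of length <= 65538
 *     0xfffe, lo, hi, dist     a longer match, len - 3 == (hi << 16) | lo
 *
 * write_compressed_bytes() later replays this stream as Huffman codes.
 */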
896 | | |
897 | | |
898 | | |
899 | | static ssize_t write_huffman_table(uint16_t symbol_values[512], |
900 | | uint8_t *output, |
901 | | size_t available_size) |
902 | 5.24k | { |
903 | 5.24k | size_t i; |
904 | | |
905 | 5.24k | if (available_size < 256) { |
906 | 0 | return LZXPRESS_ERROR; |
907 | 0 | } |
908 | | |
909 | 1.34M | for (i = 0; i < 256; i++) { |
910 | 1.34M | uint8_t b = 0; |
911 | 1.34M | uint16_t even = symbol_values[i * 2]; |
912 | 1.34M | uint16_t odd = symbol_values[i * 2 + 1]; |
913 | 1.34M | if (even != 0) { |
914 | 402k | b = bitlen_nonzero_16(even); |
915 | 402k | } |
916 | 1.34M | if (odd != 0) { |
917 | 413k | b |= bitlen_nonzero_16(odd) << 4; |
918 | 413k | } |
919 | 1.34M | output[i] = b; |
920 | 1.34M | } |
921 | 5.24k | return i; |
922 | 5.24k | } |
923 | | |
924 | | |
925 | | struct write_context { |
926 | | uint8_t *dest; |
927 | | size_t dest_len; |
928 | | size_t head; /* where lengths go */ |
929 | | size_t next_code; /* where symbol stream goes */ |
930 | | size_t pending_next_code; /* will be next_code */ |
931 | | unsigned bit_len; |
932 | | uint32_t bits; |
933 | | }; |
934 | | |
935 | | /* |
936 | | * Write out 16 bits, little-endian, for write_huffman_codes() |
937 | | * |
938 | | * As you'll notice, there's a bit to do. |
939 | | * |
940 | | * We are collecting up bits in a uint32_t, then when there are 16 of them we |
941 | | * write out a word into the stream, using a trio of offsets (wc->next_code, |
942 | | * wc->pending_next_code, and wc->head) which dance around ensuring that the |
943 | | * bitstream and the interspersed lengths are in the right places relative to |
944 | | * each other. |
945 | | */ |
946 | | |
947 | | static inline bool write_bits(struct write_context *wc, |
948 | | uint16_t code, uint16_t length) |
949 | 77.5M | { |
950 | 77.5M | wc->bits <<= length; |
951 | 77.5M | wc->bits |= code; |
952 | 77.5M | wc->bit_len += length; |
953 | 77.5M | if (wc->bit_len > 16) { |
954 | 38.1M | uint32_t w = wc->bits >> (wc->bit_len - 16); |
955 | 38.1M | wc->bit_len -= 16; |
956 | 38.1M | if (wc->next_code + 2 > wc->dest_len || |
957 | 38.1M | unlikely(wc->bit_len > 16)) { |
958 | 21 | return false; |
959 | 21 | } |
960 | 38.1M | wc->dest[wc->next_code] = w & 0xff; |
961 | 38.1M | wc->dest[wc->next_code + 1] = (w >> 8) & 0xff; |
962 | 38.1M | wc->next_code = wc->pending_next_code; |
963 | 38.1M | wc->pending_next_code = wc->head; |
964 | 38.1M | wc->head += 2; |
965 | 38.1M | } |
966 | 77.5M | return true; |
967 | 77.5M | } |
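/*
 * Illustrative trace (not from the original file): starting from
 * { next_code = 0, pending_next_code = 2, head = 4 }, successive 16-bit
 * flushes land at bytes 0, 2, 4, ... unless write_byte() or
 * write_long_len() has meanwhile dropped extra length bytes at wc->head.
 * For example: a word at 0, a length byte at 6, then words at 2, 4, 7,
 * 9, ... -- each word lands in a slot reserved two flushes earlier, so
 * code words and interspersed lengths never collide.
 */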
968 | | |
969 | | |
970 | | static inline bool write_code(struct write_context *wc, uint16_t code) |
971 | 74.7M | { |
972 | 74.7M | int code_bit_len = bitlen_nonzero_16(code); |
973 | 74.7M | if (unlikely(code == 0)) { |
974 | 0 | return false; |
975 | 0 | } |
976 | 74.7M | code &= (1 << code_bit_len) - 1; |
977 | 74.7M | return write_bits(wc, code, code_bit_len); |
978 | 74.7M | } |
979 | | |
980 | | static inline bool write_byte(struct write_context *wc, uint8_t byte) |
981 | 451k | { |
982 | 451k | if (wc->head + 1 > wc->dest_len) { |
983 | 4 | return false; |
984 | 4 | } |
985 | 451k | wc->dest[wc->head] = byte; |
986 | 451k | wc->head++; |
987 | 451k | return true; |
988 | 451k | } |
989 | | |
990 | | |
991 | | static inline bool write_long_len(struct write_context *wc, size_t len) |
992 | 30.1k | { |
993 | 30.1k | if (len < 65535) { |
994 | 29.8k | if (wc->head + 3 > wc->dest_len) { |
995 | 23 | return false; |
996 | 23 | } |
997 | 29.8k | wc->dest[wc->head] = 255; |
998 | 29.8k | wc->dest[wc->head + 1] = len & 255; |
999 | 29.8k | wc->dest[wc->head + 2] = len >> 8; |
1000 | 29.8k | wc->head += 3; |
1001 | 29.8k | } else { |
1002 | 343 | if (wc->head + 7 > wc->dest_len) { |
1003 | 10 | return false; |
1004 | 10 | } |
1005 | 333 | wc->dest[wc->head] = 255; |
1006 | 333 | wc->dest[wc->head + 1] = 0; |
1007 | 333 | wc->dest[wc->head + 2] = 0; |
1008 | 333 | wc->dest[wc->head + 3] = len & 255; |
1009 | 333 | wc->dest[wc->head + 4] = (len >> 8) & 255; |
1010 | 333 | wc->dest[wc->head + 5] = (len >> 16) & 255; |
1011 | 333 | wc->dest[wc->head + 6] = (len >> 24) & 255; |
1012 | 333 | wc->head += 7; |
1013 | 333 | } |
1014 | 30.1k | return true; |
1015 | 30.1k | } |
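/*
 * Extra-length encoding summary (an illustrative restatement; len has
 * already had 3 subtracted, as in write_compressed_bytes()):
 *
 *     len < 15             fits entirely in the match symbol's low nibble
 *     15 <= len < 270      one extra byte holding len - 15
 *     270 <= len < 65535   0xff, then len as little-endian uint16
 *     len >= 65535         0xff 0x00 0x00, then len as little-endian uint32
 */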
1016 | | |
1017 | | static ssize_t write_compressed_bytes(uint16_t symbol_values[512], |
1018 | | uint16_t *intermediate, |
1019 | | size_t intermediate_len, |
1020 | | uint8_t *dest, |
1021 | | size_t dest_len) |
1022 | 5.24k | { |
1023 | 5.24k | bool ok; |
1024 | 5.24k | size_t i; |
1025 | 5.24k | size_t end; |
1026 | 5.24k | struct write_context wc = { |
1027 | 5.24k | .head = 4, |
1028 | 5.24k | .pending_next_code = 2, |
1029 | 5.24k | .dest = dest, |
1030 | 5.24k | .dest_len = dest_len |
1031 | 5.24k | }; |
1032 | 74.7M | for (i = 0; i < intermediate_len; i++) { |
1033 | 74.7M | uint16_t c = intermediate[i]; |
1034 | 74.7M | size_t len; |
1035 | 74.7M | uint16_t distance; |
1036 | 74.7M | uint16_t code_len = 0; |
1037 | 74.7M | uint16_t code_dist = 0; |
1038 | 74.7M | if (c < 256) { |
1039 | 72.0M | ok = write_code(&wc, symbol_values[c]); |
1040 | 72.0M | if (!ok) { |
1041 | 5 | return LZXPRESS_ERROR; |
1042 | 5 | } |
1043 | 72.0M | continue; |
1044 | 72.0M | } |
1045 | | |
1046 | 2.76M | if (c == 0xfffe) { |
1047 | 316 | if (i > intermediate_len - 4) { |
1048 | 0 | return LZXPRESS_ERROR; |
1049 | 0 | } |
1050 | | |
1051 | 316 | len = intermediate[i + 1]; |
1052 | 316 | len |= (uint32_t)intermediate[i + 2] << 16; |
1053 | 316 | distance = intermediate[i + 3]; |
1054 | 316 | i += 3; |
1055 | 2.76M | } else if (c == 0xffff) { |
1056 | 2.76M | if (i > intermediate_len - 3) { |
1057 | 0 | return LZXPRESS_ERROR; |
1058 | 0 | } |
1059 | 2.76M | len = intermediate[i + 1]; |
1060 | 2.76M | distance = intermediate[i + 2]; |
1061 | 2.76M | i += 2; |
1062 | 2.76M | } else { |
1063 | 0 | return LZXPRESS_ERROR; |
1064 | 0 | } |
1065 | 2.76M | if (unlikely(distance == 0)) { |
1066 | 0 | return LZXPRESS_ERROR; |
1067 | 0 | } |
1068 | | /* len has already had 3 subtracted */ |
1069 | 2.76M | if (len >= 15) { |
1070 | | /* |
1071 | | * We are going to need to write extra length |
1072 | | * bytes into the stream, but we don't do it |
1073 | | * now, we do it after the code has been |
1074 | | * written (and before the distance bits). |
1075 | | */ |
1076 | 481k | code_len = 15; |
1077 | 2.28M | } else { |
1078 | 2.28M | code_len = len; |
1079 | 2.28M | } |
1080 | 2.76M | code_dist = bitlen_nonzero_16(distance); |
1081 | 2.76M | c = 256 | (code_dist << 4) | code_len; |
1082 | 2.76M | if (c > 511) { |
1083 | 0 | return LZXPRESS_ERROR; |
1084 | 0 | } |
1085 | | |
1086 | 2.76M | ok = write_code(&wc, symbol_values[c]); |
1087 | 2.76M | if (!ok) { |
1088 | 3 | return LZXPRESS_ERROR; |
1089 | 3 | } |
1090 | | |
1091 | 2.76M | if (code_len == 15) { |
1092 | 481k | if (len >= 270) { |
1093 | 30.1k | ok = write_long_len(&wc, len); |
1094 | 451k | } else { |
1095 | 451k | ok = write_byte(&wc, len - 15); |
1096 | 451k | } |
1097 | 481k | if (! ok) { |
1098 | 37 | return LZXPRESS_ERROR; |
1099 | 37 | } |
1100 | 481k | } |
1101 | 2.76M | if (code_dist != 0) { |
1102 | 2.73M | uint16_t dist_bits = distance - (1 << code_dist); |
1103 | 2.73M | ok = write_bits(&wc, dist_bits, code_dist); |
1104 | 2.73M | if (!ok) { |
1105 | 3 | return LZXPRESS_ERROR; |
1106 | 3 | } |
1107 | 2.73M | } |
1108 | 2.76M | } |
1109 | | /* |
1110 | | * There are some intricacies around flushing the bits and returning |
1111 | | * the length. |
1112 | | * |
1113 | | * If the returned length is not exactly right and there is another |
1114 | | * block, that block will read its Huffman table from the wrong place, |
1115 | | * and have all the symbol codes out by a multiple of 4. |
1116 | | */ |
1117 | 5.19k | end = wc.head; |
1118 | 5.19k | if (wc.bit_len == 0) { |
1119 | 0 | end -= 2; |
1120 | 0 | } |
1121 | 5.19k | ok = write_bits(&wc, 0, 16 - wc.bit_len); |
1122 | 5.19k | if (!ok) { |
1123 | 0 | return LZXPRESS_ERROR; |
1124 | 0 | } |
1125 | 15.5k | for (i = 0; i < 2; i++) { |
1126 | | /* |
1127 | | * Flush out the bits with zeroes. It doesn't matter if we do |
1128 | | * a round too many, as we have buffer space, and have already |
1129 | | * determined the returned length (end). |
1130 | | */ |
1131 | 10.3k | ok = write_bits(&wc, 0, 16); |
1132 | 10.3k | if (!ok) { |
1133 | 10 | return LZXPRESS_ERROR; |
1134 | 10 | } |
1135 | 10.3k | } |
1136 | 5.18k | return end; |
1137 | 5.19k | } |
1138 | | |
1139 | | |
1140 | | static ssize_t lzx_huffman_compress_block(struct lzxhuff_compressor_context *cmp_ctx, |
1141 | | struct lzxhuff_compressor_mem *cmp_mem, |
1142 | | size_t block_no) |
1143 | 5.25k | { |
1144 | 5.25k | ssize_t intermediate_size; |
1145 | 5.25k | uint16_t *hash_table = NULL; |
1146 | 5.25k | uint16_t *back_window_hash_table = NULL; |
1147 | 5.25k | ssize_t bytes_written; |
1148 | | |
1149 | 5.25k | if (cmp_ctx->available_size - cmp_ctx->output_pos < 260) { |
1150 | | /* 256-byte Huffman table + 4 bytes */ |
1151 | 13 | return LZXPRESS_ERROR; |
1152 | 13 | } |
1153 | | |
1154 | | /* |
1155 | | * For LZ77 compression, we keep the previous block's hash table around, |
1156 | | * alternating between two tables after the first block. |
1157 | | * |
1158 | | * LZ77 writes into the intermediate buffer in the cmp_mem context. |
1159 | | */ |
1160 | 5.24k | if (block_no == 0) { |
1161 | 2.89k | hash_table = cmp_mem->hash_table1; |
1162 | 2.89k | back_window_hash_table = NULL; |
1163 | 2.89k | } else if (block_no & 1) { |
1164 | 1.45k | hash_table = cmp_mem->hash_table2; |
1165 | 1.45k | back_window_hash_table = cmp_mem->hash_table1; |
1166 | 1.45k | } else { |
1167 | 894 | hash_table = cmp_mem->hash_table1; |
1168 | 894 | back_window_hash_table = cmp_mem->hash_table2; |
1169 | 894 | } |
1170 | | |
1171 | 5.24k | intermediate_size = lz77_encode_block(cmp_ctx, |
1172 | 5.24k | cmp_mem, |
1173 | 5.24k | hash_table, |
1174 | 5.24k | back_window_hash_table); |
1175 | | |
1176 | 5.24k | if (intermediate_size < 0) { |
1177 | 0 | return intermediate_size; |
1178 | 0 | } |
1179 | | |
1180 | | /* |
1181 | | * Write the 256 byte Huffman table, based on the counts gained in |
1182 | | * LZ77 phase. |
1183 | | */ |
1184 | 5.24k | bytes_written = write_huffman_table( |
1185 | 5.24k | cmp_mem->symbol_values, |
1186 | 5.24k | cmp_ctx->output + cmp_ctx->output_pos, |
1187 | 5.24k | cmp_ctx->available_size - cmp_ctx->output_pos); |
1188 | | |
1189 | 5.24k | if (bytes_written != 256) { |
1190 | 0 | return LZXPRESS_ERROR; |
1191 | 0 | } |
1192 | 5.24k | cmp_ctx->output_pos += 256; |
1193 | | |
1194 | | /* |
1195 | | * Write the compressed bytes using the LZ77 matches and Huffman codes |
1196 | | * worked out in the previous steps. |
1197 | | */ |
1198 | 5.24k | bytes_written = write_compressed_bytes( |
1199 | 5.24k | cmp_mem->symbol_values, |
1200 | 5.24k | cmp_mem->intermediate, |
1201 | 5.24k | intermediate_size, |
1202 | 5.24k | cmp_ctx->output + cmp_ctx->output_pos, |
1203 | 5.24k | cmp_ctx->available_size - cmp_ctx->output_pos); |
1204 | | |
1205 | 5.24k | if (bytes_written < 0) { |
1206 | 58 | return bytes_written; |
1207 | 58 | } |
1208 | | |
1209 | 5.18k | cmp_ctx->output_pos += bytes_written; |
1210 | 5.18k | return bytes_written; |
1211 | 5.24k | } |
1212 | | |
1213 | | /* |
1214 | | * lzxpress_huffman_max_compressed_size() |
1215 | | * |
1216 | | * Return the most bytes the compression can take, to allow |
1217 | | * pre-allocation. |
1218 | | */ |
1219 | | size_t lzxpress_huffman_max_compressed_size(size_t input_size) |
1220 | 0 | { |
1221 | | /* |
1222 | | * In the worst case, the output size should be about the same as the |
1223 | | * input size, plus the 256 byte header per 64k block. We aim for |
1224 | | * ample, but within the order of magnitude. |
1225 | | */ |
1226 | 0 | return input_size + (input_size / 8) + 270; |
1227 | 0 | } |
1228 | | |
1229 | | /* |
1230 | | * lzxpress_huffman_compress_talloc() |
1231 | | * |
1232 | | * This is the convenience function that allocates the compressor context and |
1233 | | * output memory for you. The return value is the number of bytes written to |
1234 | | * the location indicated by the output pointer. |
1235 | | * |
1236 | | * The maximum input_size is effectively around 227MB due to the need to guess |
1237 | | * an upper bound on the output size that hits an internal limitation in |
1238 | | * talloc. |
1239 | | * |
1240 | | * @param mem_ctx TALLOC_CTX parent for the compressed buffer. |
1241 | | * @param input_bytes memory to be compressed. |
1242 | | * @param input_size length of the input buffer. |
1243 | | * @param output destination pointer for the compressed data. |
1244 | | * |
1245 | | * @return the number of bytes written or -1 on error. |
1246 | | */ |
1247 | | |
1248 | | ssize_t lzxpress_huffman_compress_talloc(TALLOC_CTX *mem_ctx, |
1249 | | const uint8_t *input_bytes, |
1250 | | size_t input_size, |
1251 | | uint8_t **output) |
1252 | 0 | { |
1253 | 0 | struct lzxhuff_compressor_mem *cmp = NULL; |
1254 | 0 | size_t alloc_size = lzxpress_huffman_max_compressed_size(input_size); |
1255 | | |
1256 | 0 | ssize_t output_size; |
1257 | | |
1258 | 0 | *output = talloc_array(mem_ctx, uint8_t, alloc_size); |
1259 | 0 | if (*output == NULL) { |
1260 | 0 | return LZXPRESS_ERROR; |
1261 | 0 | } |
1262 | | |
1263 | 0 | cmp = talloc(mem_ctx, struct lzxhuff_compressor_mem); |
1264 | 0 | if (cmp == NULL) { |
1265 | 0 | TALLOC_FREE(*output); |
1266 | 0 | return LZXPRESS_ERROR; |
1267 | 0 | } |
1268 | | |
1269 | 0 | output_size = lzxpress_huffman_compress(cmp, |
1270 | 0 | input_bytes, |
1271 | 0 | input_size, |
1272 | 0 | *output, |
1273 | 0 | alloc_size); |
1274 | | |
1275 | 0 | talloc_free(cmp); |
1276 | | |
1277 | 0 | if (output_size < 0) { |
1278 | 0 | TALLOC_FREE(*output); |
1279 | 0 | return LZXPRESS_ERROR; |
1280 | 0 | } |
1281 | | |
1282 | 0 | *output = talloc_realloc(mem_ctx, *output, uint8_t, output_size); |
1283 | 0 | if (*output == NULL) { |
1284 | 0 | return LZXPRESS_ERROR; |
1285 | 0 | } |
1286 | | |
1287 | 0 | return output_size; |
1288 | 0 | } |
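/*
 * Usage sketch (illustrative, not part of the original file; the function
 * name is hypothetical). On failure the wrapper has already freed *output.
 */
#if 0
static ssize_t example_compress(TALLOC_CTX *mem_ctx,
				const uint8_t *data, size_t len)
{
	uint8_t *compressed = NULL;
	ssize_t clen = lzxpress_huffman_compress_talloc(mem_ctx, data, len,
							&compressed);
	if (clen < 0) {
		return clen;
	}
	/* ... use compressed[0 .. clen) ... */
	talloc_free(compressed);
	return clen;
}
#endif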
1289 | | |
1290 | | /* |
1291 | | * lzxpress_huffman_compress() |
1292 | | * |
1293 | | * This is the inconvenience function, slightly faster and fiddlier than |
1294 | | * lzxpress_huffman_compress_talloc(). |
1295 | | * |
1296 | | * To use this, you need to have allocated (but not initialised) a `struct |
1297 | | * lzxhuff_compressor_mem`, and an output buffer. If the buffer is not big |
1298 | | * enough (per `output_size`), you'll get a negative return value, otherwise |
1299 | | * enough (per `available_size`), you'll get a negative return value, otherwise |
1300 | | * the number of bytes actually written, which will always be at least 260. |
1301 | | * The `struct lzxhuff_compressor_mem` is reusable -- it is basically a |
1302 | | * collection of uninitialised memory buffers. The total size is less than |
1303 | | * 150k, so stack allocation is plausible. |
1304 | | * |
1305 | | * input_size and available_size are limited to the minimum of UINT32_MAX and |
1306 | | * SSIZE_MAX. On 64 bit machines that will be UINT32_MAX, or 4GB. |
1307 | | * |
1308 | | * @param cmp_mem a struct lzxhuff_compressor_mem. |
1309 | | * @param input_bytes memory to be compressed. |
1310 | | * @param input_size length of the input buffer. |
1311 | | * @param output destination for the compressed data. |
1312 | | * @param available_size allocated output bytes. |
1313 | | * |
1314 | | * @return the number of bytes written or -1 on error. |
1315 | | */ |
1316 | | ssize_t lzxpress_huffman_compress(struct lzxhuff_compressor_mem *cmp_mem, |
1317 | | const uint8_t *input_bytes, |
1318 | | size_t input_size, |
1319 | | uint8_t *output, |
1320 | | size_t available_size) |
1321 | 2.93k | { |
1322 | 2.93k | size_t i = 0; |
1323 | 2.93k | struct lzxhuff_compressor_context cmp_ctx = { |
1324 | 2.93k | .input_bytes = input_bytes, |
1325 | 2.93k | .input_size = input_size, |
1326 | 2.93k | .input_pos = 0, |
1327 | 2.93k | .prev_block_pos = 0, |
1328 | 2.93k | .output = output, |
1329 | 2.93k | .available_size = available_size, |
1330 | 2.93k | .output_pos = 0 |
1331 | 2.93k | }; |
1332 | | |
1333 | 2.93k | if (input_size == 0) { |
1334 | | /* |
1335 | | * We can't deal with this for a number of reasons (e.g. it |
1336 | | * breaks the Huffman tree), and the output will be infinitely |
1337 | | * bigger than the input. The caller needs to go and think |
1338 | | * about what they're trying to do here. |
1339 | | */ |
1340 | 32 | return LZXPRESS_ERROR; |
1341 | 32 | } |
1342 | | |
1343 | 2.90k | if (input_size > SSIZE_MAX || |
1344 | 2.90k | input_size > UINT32_MAX || |
1345 | 2.90k | available_size > SSIZE_MAX || |
1346 | 2.90k | available_size > UINT32_MAX || |
1347 | 2.90k | available_size == 0) { |
1348 | | /* |
1349 | | * We use negative ssize_t to return errors, which is limiting |
1350 | | * on 32 bit machines; otherwise we adhere to Microsoft's 4GB |
1351 | | * limit. |
1352 | | * |
1353 | | * lzxpress_huffman_compress_talloc() will not get this far, |
1354 | | * having already failed on talloc's 256 MB limit. |
1355 | | */ |
1356 | 1 | return LZXPRESS_ERROR; |
1357 | 1 | } |
1358 | | |
1359 | 2.90k | if (cmp_mem == NULL || |
1360 | 2.90k | output == NULL || |
1361 | 2.90k | input_bytes == NULL) { |
1362 | 0 | return LZXPRESS_ERROR; |
1363 | 0 | } |
1364 | | |
1365 | 8.09k | while (cmp_ctx.input_pos < cmp_ctx.input_size) { |
1366 | 5.25k | ssize_t ret; |
1367 | 5.25k | ret = lzx_huffman_compress_block(&cmp_ctx, |
1368 | 5.25k | cmp_mem, |
1369 | 5.25k | i); |
1370 | 5.25k | if (ret < 0) { |
1371 | 71 | return ret; |
1372 | 71 | } |
1373 | 5.18k | i++; |
1374 | 5.18k | } |
1375 | | |
1376 | 2.83k | return cmp_ctx.output_pos; |
1377 | 2.90k | } |
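/*
 * Usage sketch for the non-talloc path (illustrative; names are
 * hypothetical). The compressor memory is plain uninitialised buffers, so
 * it can be reused across calls or, at under 150k, placed on the stack.
 */
#if 0
static ssize_t example_compress_raw(const uint8_t *in, size_t in_len,
				    uint8_t *out, size_t out_len)
{
	struct lzxhuff_compressor_mem cmp_mem; /* no initialisation needed */
	return lzxpress_huffman_compress(&cmp_mem, in, in_len, out, out_len);
}
#endif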
1378 | | |
1379 | | static void debug_tree_codes(struct bitstream *input) |
1380 | 0 | { |
1381 | | /* |
1382 | | */ |
1383 | 0 | size_t head = 0; |
1384 | 0 | size_t tail = 2; |
1385 | 0 | size_t ffff_count = 0; |
1386 | 0 | struct q { |
1387 | 0 | uint16_t tree_code; |
1388 | 0 | uint16_t code_code; |
1389 | 0 | }; |
1390 | 0 | struct q queue[65536]; |
1391 | 0 | char bits[17]; |
1392 | 0 | uint16_t *t = input->table; |
1393 | 0 | queue[0].tree_code = 1; |
1394 | 0 | queue[0].code_code = 2; |
1395 | 0 | queue[1].tree_code = 2; |
1396 | 0 | queue[1].code_code = 3; |
1397 | 0 | while (head < tail) { |
1398 | 0 | struct q q = queue[head]; |
1399 | 0 | uint16_t x = t[q.tree_code]; |
1400 | 0 | if (x != 0xffff) { |
1401 | 0 | int k; |
1402 | 0 | uint16_t j = q.code_code; |
1403 | 0 | size_t offset = bitlen_nonzero_16(j) - 1; |
1404 | 0 | if (unlikely(j == 0)) { |
1405 | 0 | DBG("BROKEN code is 0!\n"); |
1406 | 0 | return; |
1407 | 0 | } |
1408 | | |
1409 | 0 | for (k = 0; k <= offset; k++) { |
1410 | 0 | bool b = (j >> (offset - k)) & 1; |
1411 | 0 | bits[k] = b ? '1' : '0'; |
1412 | 0 | } |
1413 | 0 | bits[k] = 0; |
1414 | 0 | DBG("%03x %s\n", x & 511, bits); |
1415 | 0 | head++; |
1416 | 0 | continue; |
1417 | 0 | } |
1418 | 0 | ffff_count++; |
1419 | 0 | queue[tail].tree_code = q.tree_code * 2 + 1; |
1420 | 0 | queue[tail].code_code = q.code_code * 2; |
1421 | 0 | tail++; |
1422 | 0 | queue[tail].tree_code = q.tree_code * 2 + 1 + 1; |
1423 | 0 | queue[tail].code_code = q.code_code * 2 + 1; |
1424 | 0 | tail++; |
1425 | 0 | head++; |
1426 | 0 | } |
1427 | 0 | DBG("0xffff count: %zu\n", ffff_count); |
1428 | 0 | } |
1429 | | |
1430 | | /** |
1431 | | * Determines the sort order of one prefix_code_symbol relative to another |
1432 | | */ |
1433 | | static int compare_uint16(const uint16_t *a, const uint16_t *b) |
1434 | 8.83M | { |
1435 | 8.83M | if (*a < *b) { |
1436 | 6.05M | return -1; |
1437 | 6.05M | } |
1438 | 2.77M | if (*a > *b) { |
1439 | 2.77M | return 1; |
1440 | 2.77M | } |
1441 | 0 | return 0; |
1442 | 2.77M | } |
1443 | | |
1444 | | |
1445 | | static bool fill_decomp_table(struct bitstream *input) |
1446 | 9.98k | { |
1447 | | /* |
1448 | | * There are 512 symbols, each encoded in 4 bits, which indicates |
1449 | | * their depth in the Huffman tree. The even numbers get the lower |
1450 | | * nibble of each byte, so that the byte hex values look backwards |
1451 | | * (i.e. 0xab encodes b then a). These are allocated Huffman codes in |
1452 | | * order of appearance, per depth. |
1453 | | * |
1454 | | * For example, if the first two bytes were: |
1455 | | * |
1456 | | * 0x23 0x53 |
1457 | | * |
1458 | | * the first four codes have the lengths 3, 2, 3, 5. |
1459 | | * Let's call them A, B, C, D. |
1460 | | * |
1461 | | * Suppose there is no other codeword with length 1 (which is |
1462 | | * necessarily true in this example) or 2, but there might be others |
1463 | | * of length 3 or 4. Then we can say this about the codes: |
1464 | | * |
1465 | | * _ --*--_ |
1466 | | * / \ |
1467 | | * 0 1 |
1468 | | * / \ / \ |
1469 | | * 0 1 0 1 |
1470 | | * B |\ / \ |\ |
1471 | | * 0 1 0 1 0 1 |
1472 | | * A C |\ /| | |\ |
1473 | | * |
1474 | | * pos bits code |
1475 | | * A 3 010 |
1476 | | * B 2 00 |
1477 | | * C 3 011 |
1478 | | * D 5 1???? |
1479 | | * |
1480 | | * B has the shortest code, so takes the leftmost branch, 00. That |
1481 | | * ends the branch -- nothing else can start with 00. There are no |
1482 | | * more 2s, so we look at the 3s, starting as far left as possible. So |
1483 | | * A takes 010 and C takes 011. That means everything else has to |
1484 | | * start with 1xx. We don't know how many codewords of length 3 or 4 |
1485 | | * there are; if there are none, D would end up with 10000, the |
1486 | | * leftmost available code of length 5. If the compressor is any good, |
1487 | | * there should be no unused leaf nodes left dangling at the end. |
1488 | | * |
1489 | | * (this is "Canonical Huffman Coding"). |
1490 | | * |
1491 | | * |
1492 | | * But what symbols do these codes actually stand for? |
1493 | | * -------------------------------------------------- |
1494 | | * |
1495 | | * Good question. The first 256 codes stand for the corresponding |
1496 | | * literal bytes. The codes from 256 to 511 stand for LZ77 matches, |
1497 | | * which have a distance and a length, encoded in a strange way that |
1498 | | * isn't entirely the purview of this function. |
1499 | | * |
1500 | | * What does the value 0 mean? |
1501 | | * --------------------------- |
1502 | | * |
1503 | | * The code does not occur. For example, if the next byte in the |
1504 | | * example above was 0x07, that would give the byte 0x04 a 7-long |
1505 | | * code, and no code for the 0x05 byte, which means there is no way |
1506 | | * we are going to see a 0x05 in the decoded stream. |
1507 | | * |
1508 | | * Isn't LZ77 + Huffman what zip/gzip/zlib do? |
1509 | | * ------------------------------------------- |
1510 | | * |
1511 | | * Yes, DEFLATE is LZ77 + Huffman, but the details are quite different. |
1512 | | */ |
1513 | 9.98k | uint16_t symbols[512]; |
1514 | 9.98k | uint16_t sort_mem[512]; |
1515 | 9.98k | size_t i, n_symbols; |
1516 | 9.98k | ssize_t code; |
1517 | 9.98k | uint16_t len = 0, prev_len; |
1518 | 9.98k | const uint8_t *table_bytes = input->bytes + input->byte_pos; |
1519 | | |
1520 | 9.98k | if (input->byte_pos + 260 > input->byte_size) { |
1521 | 35 | return false; |
1522 | 35 | } |
1523 | | |
1524 | 9.95k | n_symbols = 0; |
1525 | 2.55M | for (i = 0; i < 256; i++) { |
1526 | 2.54M | uint16_t even = table_bytes[i] & 15; |
1527 | 2.54M | uint16_t odd = table_bytes[i] >> 4; |
1528 | 2.54M | if (even != 0) { |
1529 | 586k | symbols[n_symbols] = (even << 9) + i * 2; |
1530 | 586k | n_symbols++; |
1531 | 586k | } |
1532 | 2.54M | if (odd != 0) { |
1533 | 1.54M | symbols[n_symbols] = (odd << 9) + i * 2 + 1; |
1534 | 1.54M | n_symbols++; |
1535 | 1.54M | } |
1536 | 2.54M | } |
1537 | 9.95k | input->byte_pos += 256; |
1538 | 9.95k | if (n_symbols == 0) { |
1539 | 3 | return false; |
1540 | 3 | } |
1541 | | |
1542 | 9.94k | stable_sort(symbols, sort_mem, n_symbols, sizeof(uint16_t), |
1543 | 9.94k | (samba_compare_fn_t)compare_uint16); |
1544 | | |
1545 | | /* |
1546 | | * we're using an implicit binary tree, as you'd see in a heap. |
1547 | | * table[0] = unused |
1548 | | * table[1] = '0' |
1549 | | * table[2] = '1' |
1550 | | * table[3] = '00' <-- '00' and '01' are children of '0' |
1551 | | * table[4] = '01' <-- '0' is [0], children are [0 * 2 + {1,2}] |
1552 | | * table[5] = '10' |
1553 | | * table[6] = '11' |
1554 | | * table[7] = '000' |
1555 | | * table[8] = '001' |
1556 | | * table[9] = '010' |
1557 | | * table[10]= '011' |
1558 | | * table[11]= '100' |
1559 | | * |
1560 | | * table[(1 << n) - 1] = '0' * n |
1561 | | * table[(1 << n) - 1 + x] = n-bit wide x (left padded with '0') |
1562 | | * table[(1 << n) - 2] = '1' * (n - 1) |
1563 | | * |
1564 | | * table[i]->left = table[i*2 + 1] |
1565 | | * table[i]->right = table[i*2 + 2] |
1566 | | * table[0xffff] = unused (16 '0's, max len is 15) |
1567 | | * |
1568 | | * therefore e.g. table[70] = table[64 - 1 + 7] |
1569 | | * = table[(1 << 6) - 1 + 7] |
1570 | | * = '000111' (binary 7, widened to 6 bits) |
1571 | | * |
1572 | | * and if '000111' is a code, |
1573 | | * '00011', '0001', '000', '00', '0' are unavailable prefixes. |
1574 | | * 34 16 7 3 1 are their indices |
1575 | | * and (i - 1) >> 1 is the path back from 70 through these. |
1576 | | * |
1577 | | * the lookup is |
1578 | | * |
1579 | | * 1 start with i = 0 |
1580 | | * 2 extract a symbol bit (i = (i << 1) + bit + 1) |
1581 | | * 3 is table[i] == 0xffff? |
1582 | | * 4 yes -- goto 2 |
1583 | | * 5 no -- table[i] & 511 is the symbol, stop
1584 | | * |
1585 | | * and the construction (here) is sort of the reverse. |
1586 | | * |
1587 | | * Most of this table is free space that can never be reached, and |
1588 | | * most of the activity is at the beginning (since all codes start |
1589 | | * there, and by design the shortest codes are the most common). |
1590 | | */ |
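	/*
	 * A short trace of that lookup (our example): reading the bits
	 * '1' then '0' gives
	 *
	 *    i = 0
	 *    i = (0 << 1) + 1 + 1 = 2   (table[2] == 0xffff: keep going)
	 *    i = (2 << 1) + 0 + 1 = 5   (table[5] holds the symbol for '10')
	 *
	 * so if table[5] != 0xffff, the code '10' resolves to the symbol
	 * table[5] & 511.
	 */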
1591 | 328k | for (i = 0; i < 32; i++) { |
1592 | | /* prefill the table head */ |
1593 | 318k | input->table[i] = 0xffff; |
1594 | 318k | } |
1595 | 9.94k | code = -1; |
1596 | 9.94k | prev_len = 0; |
1597 | 2.13M | for (i = 0; i < n_symbols; i++) { |
1598 | 2.12M | uint16_t s = symbols[i]; |
1599 | 2.12M | uint16_t prefix; |
1600 | 2.12M | len = (s >> 9) & 15; |
1601 | 2.12M | s &= 511; |
1602 | 2.12M | code++; |
1603 | 2.20M | while (len != prev_len) { |
1604 | 80.8k | code <<= 1; |
1605 | 80.8k | code++; |
1606 | 80.8k | prev_len++; |
1607 | 80.8k | } |
1608 | | |
1609 | 2.12M | if (code >= 65535) { |
1610 | 174 | return false; |
1611 | 174 | } |
1612 | 2.12M | input->table[code] = s; |
1613 | 2.12M | for (prefix = (code - 1) >> 1;
1614 | 9.51M | prefix > 31; |
1615 | 7.39M | prefix = (prefix - 1) >> 1) { |
1616 | 7.39M | input->table[prefix] = 0xffff; |
1617 | 7.39M | } |
1618 | 2.12M | } |
1619 | 9.77k | if (CHECK_DEBUGLVL(10)) { |
1620 | 0 | debug_tree_codes(input); |
1621 | 0 | } |
1622 | | |
1623 | | /* |
1624 | | * check that the last code encodes 11111..., with the right number of
1625 | | * ones, pointing to the right symbol -- otherwise we have a dangling |
1626 | | * uninitialised symbol. |
1627 | | */ |
1628 | 9.77k | if (code != (1 << (len + 1)) - 2) { |
1629 | 118 | return false; |
1630 | 118 | } |
1631 | 9.65k | return true; |
1632 | 9.77k | } |
1633 | | |
1634 | | |
1635 | | #define CHECK_READ_32(dest) \ |
1636 | 2.64k | do { \ |
1637 | 2.64k | if (input->byte_pos + 4 > input->byte_size) { \ |
1638 | 19 | return LZXPRESS_ERROR; \ |
1639 | 19 | } \ |
1640 | 2.64k | dest = PULL_LE_U32(input->bytes, input->byte_pos); \ |
1641 | 2.62k | input->byte_pos += 4; \ |
1642 | 2.62k | } while (0) |
1643 | | |
1644 | | #define CHECK_READ_16(dest) \ |
1645 | 34.9M | do { \ |
1646 | 34.9M | if (input->byte_pos + 2 > input->byte_size) { \ |
1647 | 22 | return LZXPRESS_ERROR; \ |
1648 | 22 | } \ |
1649 | 34.9M | dest = PULL_LE_U16(input->bytes, input->byte_pos); \ |
1650 | 34.9M | input->byte_pos += 2; \ |
1651 | 34.9M | } while (0) |
1652 | | |
1653 | | #define CHECK_READ_8(dest) \ |
1654 | 167k | do { \ |
1655 | 167k | if (input->byte_pos >= input->byte_size) { \ |
1656 | 55 | return LZXPRESS_ERROR; \ |
1657 | 55 | } \ |
1658 | 167k | dest = PULL_LE_U8(input->bytes, input->byte_pos); \ |
1659 | 167k | input->byte_pos++; \ |
1660 | 167k | } while(0) |
1661 | | |
1662 | | |
1663 | | static inline ssize_t pull_bits(struct bitstream *input) |
1664 | 34.8M | { |
1665 | 34.8M | if (input->byte_pos + 1 < input->byte_size) { |
1666 | 34.8M | uint16_t tmp; |
1667 | 34.8M | CHECK_READ_16(tmp); |
1668 | 34.8M | input->remaining_bits += 16; |
1669 | 34.8M | input->bits <<= 16; |
1670 | 34.8M | input->bits |= tmp; |
1671 | 34.8M | } else if (input->byte_pos < input->byte_size) { |
1672 | 179 | uint8_t tmp; |
1673 | 179 | CHECK_READ_8(tmp); |
1674 | 179 | input->remaining_bits += 8; |
1675 | 179 | input->bits <<= 8; |
1676 | 179 | input->bits |= tmp; |
1677 | 179 | } else { |
1678 | 167 | return LZXPRESS_ERROR; |
1679 | 167 | } |
1680 | 34.8M | return 0; |
1681 | 34.8M | } |
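/*
 * A minimal sketch (ours, guarded out) of how the loops below consume
 * single bits, using the same struct bitstream fields: bits come from
 * little-endian 16-bit words, most significant bit first, and we refill
 * via pull_bits() whenever the buffer drops to 16 bits, so a refill can
 * never overflow the 32-bit accumulator.
 */
#if 0
static ssize_t read_one_bit(struct bitstream *input)
{
	if (input->remaining_bits == 16) {
		ssize_t ret = pull_bits(input);
		if (ret) {
			return ret;
		}
	}
	input->remaining_bits--;
	return (input->bits >> input->remaining_bits) & 1;
}
#endif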
1682 | | |
1683 | | |
1684 | | /* |
1685 | | * Decompress a block. The actual decompressed size is returned (or -1 on |
1686 | | * error). The putative block length is 64k (or shorter, if the message ends |
1687 | | * first), but a match can run over the end, extending the block. That's why |
1688 | | * we need the overall output size as well as the block size. A match encoded |
1689 | | * in this block can point back to previous blocks, but not before the |
1690 | | * beginning of the message, so we also need the previously decoded size. |
1691 | | * |
1692 | | * The compressed block will have 256 bytes for the Huffman table, and at |
1693 | | * least 4 bytes of (possibly padded) encoded values. |
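 *
 * For example (our numbers): with a 200,000 byte message the blocks
 * nominally cover [0, 64k), [64k, 128k), and [128k, 200,000), but a
 * match starting near the end of one block may run past the 64k
 * boundary, in which case the next block starts correspondingly later
 * in the output.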
1694 | | */ |
1695 | | static ssize_t lzx_huffman_decompress_block(struct bitstream *input, |
1696 | | uint8_t *output, |
1697 | | size_t block_size, |
1698 | | size_t output_size, |
1699 | | size_t previous_size) |
1700 | 9.98k | { |
1701 | 9.98k | size_t output_pos = 0; |
1702 | 9.98k | uint16_t symbol; |
1703 | 9.98k | size_t index; |
1704 | 9.98k | uint16_t distance_bits_wanted = 0; |
1705 | 9.98k | size_t distance = 0; |
1706 | 9.98k | size_t length = 0; |
1707 | 9.98k | bool ok; |
1708 | 9.98k | uint32_t tmp; |
1709 | 9.98k | bool seen_eof_marker = false; |
1710 | | |
1711 | 9.98k | ok = fill_decomp_table(input); |
1712 | 9.98k | if (! ok) { |
1713 | 330 | return LZXPRESS_ERROR; |
1714 | 330 | } |
1715 | 9.65k | if (CHECK_DEBUGLVL(10) || DEBUG_HUFFMAN_TREE) { |
1716 | 0 | debug_huffman_tree_from_table(input->table); |
1717 | 0 | } |
1718 | | /* |
1719 | | * Always read 32 bits at the start, even if we don't need them. |
1720 | | */ |
1721 | 9.65k | CHECK_READ_16(tmp); |
1722 | 9.65k | CHECK_READ_16(input->bits); |
1723 | 9.65k | input->bits |= tmp << 16; |
1724 | 9.65k | input->remaining_bits = 32; |
1725 | | |
1726 | | /* |
1727 | | * This loop iterates over individual *bits*. These are read from |
1728 | | * little-endian 16 bit words, most significant bit first. |
1729 | | * |
1730 | | * At points in the bitstream, the following are possible: |
1731 | | * |
1732 | | * # the source word is empty and needs to be refilled from the input |
1733 | | * stream. |
1734 | | * # an incomplete codeword is being extended. |
1735 | | * # a codeword is resolved, either as a literal or a match. |
1736 | | * # a literal is written. |
1737 | | * # a match is collecting distance bits. |
1738 | | * # the output stream is copied, as specified by a match. |
1739 | | * # input bytes are read for match lengths. |
1740 | | * |
1741 | | * Note that we *don't* specifically check for the EOF marker (symbol |
1742 | | * 256) in this loop, because the precondition for stopping for the |
1743 | | * EOF marker is that the output buffer is full (otherwise you would
1744 | | * not know whether a 256 was the EOF marker or an actual symbol), and
1745 | | * we *always* want to stop when the buffer is full. So we work out if |
1746 | | * there is an EOF in another loop after we stop writing. |
1747 | | */ |
1748 | | |
1749 | 9.65k | index = 0; |
1750 | 557M | while (output_pos < block_size) { |
1751 | 557M | uint16_t b; |
1752 | 557M | if (input->remaining_bits == 16) { |
1753 | 34.8M | ssize_t ret = pull_bits(input); |
1754 | 34.8M | if (ret) { |
1755 | 167 | return ret; |
1756 | 167 | } |
1757 | 34.8M | } |
1758 | 557M | input->remaining_bits--; |
1759 | | |
1760 | 557M | b = (input->bits >> input->remaining_bits) & 1; |
1761 | 557M | if (length == 0) { |
1762 | | /* not in a match; pulling a codeword */ |
1763 | 540M | index <<= 1; |
1764 | 540M | index += b + 1; |
1765 | 540M | if (input->table[index] == 0xffff) { |
1766 | | /* incomplete codeword, the common case */ |
1767 | 468M | continue; |
1768 | 468M | } |
1769 | | /* found the symbol, reset the code string */ |
1770 | 71.7M | symbol = input->table[index] & 511; |
1771 | 71.7M | index = 0; |
1772 | 71.7M | if (symbol < 256) { |
1773 | | /* a literal, the easy case */ |
1774 | 66.0M | output[output_pos] = symbol; |
1775 | 66.0M | output_pos++; |
1776 | 66.0M | continue; |
1777 | 66.0M | } |
1778 | | |
1779 | | /* the beginning of a match */ |
1780 | 5.64M | distance_bits_wanted = (symbol >> 4) & 15; |
1781 | 5.64M | distance = 1 << distance_bits_wanted; |
1782 | 5.64M | length = symbol & 15; |
1783 | 5.64M | if (length == 15) { |
1784 | 167k | CHECK_READ_8(tmp); |
1785 | 167k | length += tmp; |
1786 | 167k | if (length == 255 + 15) { |
1787 | | /* |
1788 | | * note, we discard (don't add) the |
1789 | | * length so far. |
1790 | | */ |
1791 | 18.9k | CHECK_READ_16(length); |
1792 | 18.9k | if (length == 0) { |
1793 | 2.64k | CHECK_READ_32(length); |
1794 | 2.64k | } |
1795 | 18.9k | } |
1796 | 167k | } |
1797 | 5.64M | length += 3; |
1798 | 17.7M | } else { |
1799 | | /* we are pulling extra distance bits */ |
1800 | 17.7M | distance_bits_wanted--; |
1801 | 17.7M | distance |= b << distance_bits_wanted; |
1802 | 17.7M | } |
1803 | | |
1804 | 23.4M | if (distance_bits_wanted == 0) { |
1805 | | /* |
1806 | | * We have a complete match, and it is time to do the |
1807 | | * copy (byte by byte, because the ranges can overlap, |
1808 | | * and we might need to copy bytes we just copied in). |
1809 | | * |
1810 | | * It is possible that this match will extend beyond |
1811 | | * the end of the expected block. That's fine, so long |
1812 | | * as it doesn't extend past the total output size. |
1813 | | */ |
1814 | 5.64M | size_t i; |
1815 | 5.64M | size_t end = output_pos + length; |
1816 | 5.64M | uint8_t *here = output + output_pos; |
1817 | 5.64M | uint8_t *there = here - distance; |
1818 | 5.64M | if (end > output_size || |
1819 | 5.64M | previous_size + output_pos < distance || |
1820 | 5.64M | unlikely(end < output_pos || there > here)) { |
1821 | 161 | return LZXPRESS_ERROR; |
1822 | 161 | } |
1823 | 2.06G | for (i = 0; i < length; i++) { |
1824 | 2.06G | here[i] = there[i]; |
1825 | 2.06G | } |
1826 | 5.64M | output_pos += length; |
1827 | 5.64M | distance = 0; |
1828 | 5.64M | length = 0; |
1829 | 5.64M | } |
1830 | 23.4M | } |
1831 | | |
1832 | 9.23k | if (length != 0 || index != 0) { |
1833 | | /* it seems like we've hit an early end, mid-code */ |
1834 | 0 | return LZXPRESS_ERROR; |
1835 | 0 | } |
1836 | | |
1837 | 9.23k | if (input->byte_pos + 256 < input->byte_size) { |
1838 | | /* |
1839 | | * This block is over, but it clearly isn't the last block, so |
1840 | | * we don't want to look for the EOF. |
1841 | | */ |
1842 | 7.35k | return output_pos; |
1843 | 7.35k | } |
1844 | | /* |
1845 | | * We won't write any more, but we try to read some more to make sure |
1846 | | * we're finishing in a good place. That means we want to see a 256 |
1847 | | * symbol and then some number of zero bits: possibly none, but at
1848 | | * most 32.
1849 | | * |
1850 | | * In this we are perhaps a bit stricter than Windows, which |
1851 | | * apparently does not insist on the EOF marker, nor on a lack of |
1852 | | * trailing bytes. |
1853 | | */ |
1854 | 77.2k | while (true) { |
1855 | 77.2k | uint16_t b; |
1856 | 77.2k | if (input->remaining_bits == 16) { |
1857 | 5.61k | ssize_t ret; |
1858 | 5.61k | if (input->byte_pos == input->byte_size) { |
1859 | | /* FIN */ |
1860 | 1.69k | break; |
1861 | 1.69k | } |
1862 | 3.92k | ret = pull_bits(input); |
1863 | 3.92k | if (ret) { |
1864 | 0 | return ret; |
1865 | 0 | } |
1866 | 3.92k | } |
1867 | 75.5k | input->remaining_bits--; |
1868 | 75.5k | b = (input->bits >> input->remaining_bits) & 1; |
1869 | 75.5k | if (seen_eof_marker) { |
1870 | | /* |
1871 | | * we have read an EOF symbol. Now we just want to
1872 | | * see zeroes. |
1873 | | */ |
1874 | 64.3k | if (b != 0) { |
1875 | 24 | return LZXPRESS_ERROR; |
1876 | 24 | } |
1877 | 64.2k | continue; |
1878 | 64.3k | } |
1879 | | |
1880 | | /* we're pulling in a symbol, which had better be 256 */ |
1881 | 11.2k | index <<= 1; |
1882 | 11.2k | index += b + 1; |
1883 | 11.2k | if (input->table[index] == 0xffff) { |
1884 | 9.40k | continue; |
1885 | 9.40k | } |
1886 | | |
1887 | 1.86k | symbol = input->table[index] & 511; |
1888 | 1.86k | if (symbol != 256) { |
1889 | 156 | return LZXPRESS_ERROR; |
1890 | 156 | } |
1891 | 1.70k | seen_eof_marker = true; |
1892 | 1.70k | continue; |
1893 | 1.86k | } |
1894 | | |
1895 | 1.69k | if (! seen_eof_marker) { |
1896 | 10 | return LZXPRESS_ERROR; |
1897 | 10 | } |
1898 | | |
1899 | 1.68k | return output_pos; |
1900 | 1.69k | } |
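/*
 * A worked example of the match encoding handled above (our numbers):
 * the symbol 305 (0x131) has bit 8 set, so it is a match. Its distance
 * nibble is (305 >> 4) & 15 == 3, so three extra distance bits follow
 * in the stream, giving a distance of (1 << 3) plus those bits, i.e.
 * somewhere in [8, 15]. Its length nibble is 305 & 15 == 1, giving a
 * length of 1 + 3 == 4. A length nibble of 15 instead escapes to an
 * extra byte (added to the 15), then to a little-endian 16-bit word
 * (replacing the count so far) if that sum is 255 + 15, then to a
 * 32-bit word if the 16-bit word is zero; 3 is added at the end in
 * every case.
 */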
1901 | | |
1902 | | static ssize_t lzxpress_huffman_decompress_internal(struct bitstream *input, |
1903 | | uint8_t *output, |
1904 | | size_t output_size) |
1905 | 2.73k | { |
1906 | 2.73k | size_t output_pos = 0; |
1907 | | |
1908 | 2.73k | if (input->byte_size < 260) { |
1909 | 103 | return LZXPRESS_ERROR; |
1910 | 103 | } |
1911 | | |
1912 | 11.6k | while (input->byte_pos < input->byte_size) { |
1913 | 9.98k | ssize_t block_output_pos; |
1914 | 9.98k | ssize_t block_output_size; |
1915 | 9.98k | size_t remaining_output_size = output_size - output_pos; |
1916 | | |
1917 | 9.98k | block_output_size = MIN(65536, remaining_output_size); |
1918 | | |
1919 | 9.98k | block_output_pos = lzx_huffman_decompress_block( |
1920 | 9.98k | input, |
1921 | 9.98k | output + output_pos, |
1922 | 9.98k | block_output_size, |
1923 | 9.98k | remaining_output_size, |
1924 | 9.98k | output_pos); |
1925 | | |
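		/*
		 * Note that LZXPRESS_ERROR is negative, so an error return
		 * from the block decoder also fails this test: every block
		 * must produce at least its full quota of output.
		 */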
1926 | 9.98k | if (block_output_pos < block_output_size) { |
1927 | 944 | return LZXPRESS_ERROR; |
1928 | 944 | } |
1929 | 9.04k | output_pos += block_output_pos; |
1930 | 9.04k | if (output_pos > output_size) { |
1931 | | /* not expecting to get here. */ |
1932 | 0 | return LZXPRESS_ERROR; |
1933 | 0 | } |
1934 | 9.04k | } |
1935 | | |
1936 | 1.68k | if (input->byte_pos != input->byte_size) { |
1937 | 0 | return LZXPRESS_ERROR; |
1938 | 0 | } |
1939 | | |
1940 | 1.68k | return output_pos; |
1941 | 1.68k | } |
1942 | | |
1943 | | |
1944 | | /* |
1945 | | * lzxpress_huffman_decompress() |
1946 | | * |
1947 | | * output_size must be the expected length of the decompressed data. |
1948 | | * input_size and output_size are limited to the minimum of UINT32_MAX and |
1949 | | * SSIZE_MAX. On 64 bit machines that will be UINT32_MAX, or 4GB. |
1950 | | * |
1951 | | * @param input_bytes memory to be decompressed. |
1952 | | * @param input_size length of the compressed buffer. |
1953 | | * @param output destination for the decompressed data. |
1954 | | * @param output_size exact expected length of the decompressed data. |
1955 | | * |
1956 | | * @return the number of bytes written or -1 on error. |
1957 | | */ |
1958 | | |
1959 | | ssize_t lzxpress_huffman_decompress(const uint8_t *input_bytes, |
1960 | | size_t input_size, |
1961 | | uint8_t *output, |
1962 | | size_t output_size) |
1963 | 2.81k | { |
1964 | 2.81k | uint16_t table[65536]; |
1965 | 2.81k | struct bitstream input = { |
1966 | 2.81k | .bytes = input_bytes, |
1967 | 2.81k | .byte_size = input_size, |
1968 | 2.81k | .byte_pos = 0, |
1969 | 2.81k | .bits = 0, |
1970 | 2.81k | .remaining_bits = 0, |
1971 | 2.81k | .table = table |
1972 | 2.81k | }; |
1973 | | |
1974 | 2.81k | if (input_size > SSIZE_MAX || |
1975 | 2.81k | input_size > UINT32_MAX || |
1976 | 2.81k | output_size > SSIZE_MAX || |
1977 | 2.81k | output_size > UINT32_MAX || |
1978 | 2.81k | input_size == 0 || |
1979 | 2.73k | output_size == 0 || |
1980 | 2.73k | input_bytes == NULL || |
1981 | 2.73k | output == NULL) { |
1982 | | /* |
1983 | | * We use negative ssize_t to return errors, which is limiting |
1984 | | * on 32 bit machines, and the 4GB limit exists on Windows. |
1985 | | */ |
1986 | 82 | return LZXPRESS_ERROR; |
1987 | 82 | } |
1988 | | |
1989 | 2.73k | return lzxpress_huffman_decompress_internal(&input, |
1990 | 2.73k | output, |
1991 | 2.73k | output_size); |
1992 | 2.81k | } |
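/*
 * A hedged usage sketch (hypothetical caller, guarded out): the exact
 * decompressed size must be known in advance, typically from protocol
 * framing, and on success the whole buffer is filled.
 */
#if 0
static bool example_decompress_exact(const uint8_t *comp, size_t comp_len,
				     uint8_t *out, size_t expected_len)
{
	ssize_t n = lzxpress_huffman_decompress(comp, comp_len,
						out, expected_len);
	/* -1 signals an error; compare against the expected length */
	return n == (ssize_t)expected_len;
}
#endif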
1993 | | |
1994 | | |
1995 | | /** |
1996 | | * lzxpress_huffman_decompress_talloc() |
1997 | | * |
1998 | | * The caller must provide the exact size of the expected output. |
1999 | | * |
2000 | | * The input_size is limited to the minimum of UINT32_MAX and SSIZE_MAX, but |
2001 | | * output_size is limited to 256MB due to a limit in talloc. This effectively |
2002 | | * limits input_size too, as non-crafted compressed data will not exceed the |
2003 | | * decompressed size by very much. |
2004 | | * |
2005 | | * @param mem_ctx TALLOC_CTX parent for the decompressed buffer. |
2006 | | * @param input_bytes memory to be decompressed. |
2007 | | * @param input_size length of the compressed buffer. |
2008 | | * @param output_size expected decompressed size. |
2009 | | * |
2010 | | * @return a talloc'ed buffer exactly output_size in length, or NULL. |
2011 | | */ |
2012 | | |
2013 | | uint8_t *lzxpress_huffman_decompress_talloc(TALLOC_CTX *mem_ctx, |
2014 | | const uint8_t *input_bytes, |
2015 | | size_t input_size, |
2016 | | size_t output_size) |
2017 | 0 | { |
2018 | 0 | ssize_t result; |
2019 | 0 | uint8_t *output = NULL; |
2020 | 0 | struct bitstream input = { |
2021 | 0 | .bytes = input_bytes, |
2022 | 0 | .byte_size = input_size |
2023 | 0 | }; |
2024 | |
2025 | 0 | output = talloc_array(mem_ctx, uint8_t, output_size); |
2026 | 0 | if (output == NULL) { |
2027 | 0 | return NULL; |
2028 | 0 | } |
2029 | | |
2030 | 0 | input.table = talloc_array(mem_ctx, uint16_t, 65536); |
2031 | 0 | if (input.table == NULL) { |
2032 | 0 | talloc_free(output); |
2033 | 0 | return NULL; |
2034 | 0 | } |
2035 | 0 | result = lzxpress_huffman_decompress_internal(&input, |
2036 | 0 | output, |
2037 | 0 | output_size); |
2038 | 0 | talloc_free(input.table); |
2039 | |
2040 | 0 | if (result != output_size) { |
2041 | 0 | talloc_free(output); |
2042 | 0 | return NULL; |
2043 | 0 | } |
2044 | 0 | return output; |
2045 | 0 | } |
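/*
 * Unlike lzxpress_huffman_decompress() above, which keeps its 65536
 * entry code table on the stack (128 KiB), this variant allocates the
 * table with talloc, trading stack use for a possible allocation
 * failure. A hedged usage sketch with hypothetical names, guarded out:
 */
#if 0
static uint8_t *example_decompress_talloc(TALLOC_CTX *mem_ctx,
					  const uint8_t *comp,
					  size_t comp_len,
					  size_t expected_len)
{
	/* NULL covers bad data, a wrong expected_len, and ENOMEM alike */
	return lzxpress_huffman_decompress_talloc(mem_ctx, comp,
						  comp_len, expected_len);
}
#endif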