Coverage Report

Created: 2026-02-14 07:07

/src/samba/lib/compression/lzxpress_huffman.c
Line | Count | Source
1
/*
2
 * Samba compression library - LGPLv3
3
 *
4
 * Copyright © Catalyst IT 2022
5
 *
6
 * Written by Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
7
 *        and Jennifer Sutton <jennifersutton@catalyst.net.nz>
8
 *
9
 *  ** NOTE! The following LGPL license applies to this file.
10
 *  ** It does NOT imply that all of Samba is released under the LGPL
11
 *
12
 *  This library is free software; you can redistribute it and/or
13
 *  modify it under the terms of the GNU Lesser General Public
14
 *  License as published by the Free Software Foundation; either
15
 *  version 3 of the License, or (at your option) any later version.
16
 *
17
 *  This library is distributed in the hope that it will be useful,
18
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
19
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
 *  Lesser General Public License for more details.
21
 *
22
 *  You should have received a copy of the GNU Lesser General Public
23
 *  License along with this library; if not, see <http://www.gnu.org/licenses/>.
24
 */
25
26
#include <talloc.h>
27
28
#include "replace.h"
29
#include "lzxpress_huffman.h"
30
#include "lib/util/stable_sort.h"
31
#include "lib/util/debug.h"
32
#include "lib/util/byteorder.h"
33
#include "lib/util/bytearray.h"
34
35
/*
36
 * DEBUG_NO_LZ77_MATCHES toggles the encoding of matches as matches. If it is
37
 * true, the potential match is written as a series of literals, which is a
38
 * valid but usually inefficient encoding. This is useful for isolating a
39
 * problem to either the LZ77 or the Huffman stage.
40
 */
41
#ifndef DEBUG_NO_LZ77_MATCHES
42
5.24k
#define DEBUG_NO_LZ77_MATCHES false
43
#endif
44
45
/*
46
 * DEBUG_HUFFMAN_TREE forces the drawing of ascii art huffman trees during
47
 * compression and decompression.
48
 *
49
 * These trees will also be drawn at DEBUG level 10, but that doesn't work
50
 * with cmocka tests.
51
 */
52
#ifndef DEBUG_HUFFMAN_TREE
53
15.8k
#define DEBUG_HUFFMAN_TREE false
54
#endif
55
56
#if DEBUG_HUFFMAN_TREE
57
#define DBG(...) fprintf(stderr, __VA_ARGS__)
58
#else
59
0
#define DBG(...) DBG_INFO(__VA_ARGS__)
60
#endif
61
62
63
2.17k
#define LZXPRESS_ERROR -1LL
64
65
/*
66
 * We won't encode a match length longer than MAX_MATCH_LENGTH.
67
 *
68
 * Reports are that Windows has a limit at 64M.
69
 */
70
#define MAX_MATCH_LENGTH (64 * 1024 * 1024)
71
72
73
struct bitstream {
74
  const uint8_t *bytes;
75
  size_t byte_pos;
76
  size_t byte_size;
77
  uint32_t bits;
78
  int remaining_bits;
79
  uint16_t *table;
80
};
81
82
83
#if ! defined __has_builtin
84
#define __has_builtin(x) 0
85
#endif
86
87
/*
88
 * bitlen_nonzero_16() returns the bit number of the most significant bit, or
89
 * put another way, the integer log base 2. Log(0) is undefined; the argument
90
 * has to be non-zero!
91
 * 1     -> 0
92
 * 2,3   -> 1
93
 * 4-7   -> 2
94
 * 1024  -> 10, etc
95
 *
96
 * Probably this is handled by a compiler intrinsic function that maps to a
97
 * dedicated machine instruction.
98
 */
99
100
static inline int bitlen_nonzero_16(uint16_t x)
101
81.1M
{
102
81.1M
#if  __has_builtin(__builtin_clz)
103
104
  /* __builtin_clz returns the number of leading zeros */
105
81.1M
  return (sizeof(unsigned int) * CHAR_BIT) - 1
106
81.1M
    - __builtin_clz((unsigned int) x);
107
108
#else
109
110
  int count = -1;
111
  while(x) {
112
    x >>= 1;
113
    count++;
114
  }
115
  return count;
116
117
#endif
118
81.1M
}
119
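
For illustration, the mapping described above can be sanity-checked with a tiny hypothetical harness (assuming bitlen_nonzero_16() is in scope, e.g. by including this file):

#include <assert.h>

int main(void)
{
  /* the integer log base 2, per the comment above */
  assert(bitlen_nonzero_16(1) == 0);
  assert(bitlen_nonzero_16(2) == 1);
  assert(bitlen_nonzero_16(3) == 1);
  assert(bitlen_nonzero_16(4) == 2);
  assert(bitlen_nonzero_16(7) == 2);
  assert(bitlen_nonzero_16(1024) == 10);
  return 0;
}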
120
121
struct lzxhuff_compressor_context {
122
  const uint8_t *input_bytes;
123
  size_t input_size;
124
  size_t input_pos;
125
  size_t prev_block_pos;
126
  uint8_t *output;
127
  size_t available_size;
128
  size_t output_pos;
129
};
130
131
static int compare_huffman_node_count(struct huffman_node *a,
132
              struct huffman_node *b)
133
5.61M
{
134
5.61M
  return a->count - b->count;
135
5.61M
}
136
137
static int compare_huffman_node_depth(struct huffman_node *a,
138
              struct huffman_node *b)
139
4.73M
{
140
4.73M
  int c = a->depth - b->depth;
141
4.73M
  if (c != 0) {
142
651k
    return c;
143
651k
  }
144
4.08M
  return (int)a->symbol - (int)b->symbol;
145
4.73M
}
146
147
148
1.00G
#define HASH_MASK ((1 << LZX_HUFF_COMP_HASH_BITS) - 1)
149
150
static inline uint16_t three_byte_hash(const uint8_t *bytes)
151
74.8M
{
152
  /*
153
   * MS-XCA says "three byte hash", but does not specify it.
154
   *
155
   * This one is just cobbled together, but has quite good distribution
156
   * in the 12-14 bit forms, which is what we care about most.
157
   * e.g: 13 bit: median 2048, min 2022, max 2074, stddev 6.0
158
   */
159
74.8M
  uint16_t a = bytes[0];
160
74.8M
  uint16_t b = bytes[1] ^ 0x2e;
161
74.8M
  uint16_t c = bytes[2] ^ 0x55;
162
74.8M
  uint16_t ca = c - a;
163
74.8M
  uint16_t d = ((a + b) << 8) ^ (ca << 5) ^ (c + b) ^ (0xcab + a);
164
74.8M
  return d & HASH_MASK;
165
74.8M
}
166
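
Worked through by hand for the input bytes "abc" (a sketch; the final mask width depends on LZX_HUFF_COMP_HASH_BITS, which lives in the header and is not shown here):

/*
 * a  = 0x61
 * b  = 0x62 ^ 0x2e = 0x4c
 * c  = 0x63 ^ 0x55 = 0x36
 * ca = 0x36 - 0x61 = 0xffd5            (uint16_t wrap-around)
 * d  = 0xad00 ^ (0xffd5 << 5) ^ 0x82 ^ 0xd0c
 *    = 0x5a2e                          (in the low 16 bits)
 * and 0x5a2e & HASH_MASK is the bucket index.
 */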
167
168
static inline uint16_t encode_match(size_t len, size_t offset)
169
2.76M
{
170
2.76M
  uint16_t code = 256;
171
2.76M
  code |= MIN(len - 3, 15);
172
2.76M
  code |= bitlen_nonzero_16(offset) << 4;
173
2.76M
  return code;
174
2.76M
}
175
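
The returned code packs three fields; a worked example, following the code above:

/*
 * len = 7, offset = 100:
 *   bits 0-3: MIN(len - 3, 15)        = 4
 *   bits 4-7: bitlen_nonzero_16(100)  = 6
 *   bit 8:    the match marker (256)
 *   code = 0x100 | (6 << 4) | 4 = 0x164
 */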
176
/*
177
 * debug_huffman_tree() uses debug_huffman_tree_print() to draw the Huffman
178
 * tree in ascii art.
179
 *
180
 * Note that the Huffman tree is probably not the same as that implied by the
181
 * canonical Huffman encoding that is finally used. That tree would be the
182
 * same shape, but with the left and right toggled to sort the branches by
183
 * length, after which the symbols for each length are sorted by value.
184
 */
185
186
static void debug_huffman_tree_print(struct huffman_node *node,
187
             int *trail, int depth)
188
0
{
189
0
  if (node->left == NULL) {
190
    /* time to print a row */
191
0
    int j;
192
0
    bool branched = false;
193
0
    int row[17];
194
0
    char c[100];
195
0
    int s = node->symbol;
196
0
    char code[17];
197
0
    if (depth > 15) {
198
0
      fprintf(stderr,
199
0
        " \033[1;31m Max depth exceeded! (%d)\033[0m "
200
0
        " symbol %#3x claimed depth %d count %d\n",
201
0
        depth, node->symbol, node->depth, node->count);
202
0
      return;
203
0
    }
204
0
    for (j = depth - 1; j >= 0; j--) {
205
0
      if (branched) {
206
0
        if (trail[j] == -1) {
207
0
          row[j] = -3;
208
0
        } else {
209
0
          row[j] = -2;
210
0
        }
211
0
      } else if (trail[j] == -1) {
212
0
        row[j] = -1;
213
0
        branched = true;
214
0
      } else {
215
0
        row[j] = trail[j];
216
0
      }
217
0
    }
218
0
    for (j = 0; j < depth; j++) {
219
0
      switch (row[j]) {
220
0
      case -3:
221
0
        code[j] = '1';
222
0
        fprintf(stderr, "        ");
223
0
        break;
224
0
      case -2:
225
0
        code[j] = '0';
226
0
        fprintf(stderr, "      │ ");
227
0
        break;
228
0
      case -1:
229
0
        code[j] = '1';
230
0
        fprintf(stderr, "      ╰─");
231
0
        break;
232
0
      default:
233
0
        code[j] = '0';
234
0
        fprintf(stderr, "%5d─┬─", row[j]);
235
0
        break;
236
0
      }
237
0
    }
238
0
    code[depth] = 0;
239
0
    if (s < 32) {
240
0
      snprintf(c, sizeof(c),
241
0
        "\033[1;32m%02x\033[0m \033[1;33m%c%c%c\033[0m",
242
0
         s,
243
0
         0xE2, 0x90, 0x80 + s); /* utf-8 for symbol */
244
0
    }  else if (s < 127) {
245
0
      snprintf(c, sizeof(c),
246
0
         "\033[1;32m%2x\033[0m '\033[10;32m%c\033[0m'",
247
0
         s, s);
248
0
    } else if (s < 256) {
249
0
      snprintf(c, sizeof(c), "\033[1;32m%2x\033[0m", s);
250
0
    } else {
251
0
      uint16_t len = (s & 15) + 3;
252
0
      uint16_t dbits = ((s >> 4) & 15) + 1;
253
0
      snprintf(c, sizeof(c),
254
0
         " \033[0;33mlen:%2d%s, "
255
0
         "dist:%d-%d \033[0m \033[1;32m%3x\033[0m%s",
256
0
         len,
257
0
         len == 18 ? "+" : "",
258
0
         1 << (dbits - 1),
259
0
         (1 << dbits) - 1,
260
0
         s,
261
0
         s == 256 ? " \033[1;31mEOF\033[0m" : "");
262
263
0
    }
264
265
0
    fprintf(stderr, "──%5d %s \033[2;37m%s\033[0m\n",
266
0
      node->count, c, code);
267
0
    return;
268
0
  }
269
0
  trail[depth] = node->count;
270
0
  debug_huffman_tree_print(node->left, trail, depth + 1);
271
0
  trail[depth] = -1;
272
0
  debug_huffman_tree_print(node->right, trail, depth + 1);
273
0
}
274
275
276
/*
277
 * If DEBUG_HUFFMAN_TREE is defined true, debug_huffman_tree()
278
 * will print a tree looking something like this:
279
 *
280
 *     7─┬───    3  len:18+, dist:1-1  10f 0
281
 *       ╰─    4─┬─    2─┬───    1 61 'a' 100
282
 *               │       ╰───    1 62 'b' 101
283
 *               ╰─    2─┬───    1 63 'c' 110
284
 *                       ╰───    1  len: 3, dist:1-1  100 EOF 111
285
 *
286
 * This is based off a Huffman root node, and the tree may not be the same as
287
 * the canonical tree.
288
 */
289
static void debug_huffman_tree(struct huffman_node *root)
290
0
{
291
0
  int trail[17];
292
0
  debug_huffman_tree_print(root, trail, 0);
293
0
}
294
295
296
/*
297
 * If DEBUG_HUFFMAN_TREE is defined true, debug_huffman_tree_from_table()
298
 * will print something like this based on a decoding symbol table.
299
 *
300
 *  Tree from decoding table 9 nodes → 5 codes
301
 * 10000─┬─── 5000  len:18+, dist:1-1  10f 0
302
 *       ╰─ 5000─┬─ 2500─┬─── 1250 61 'a' 100
303
 *               │       ╰─── 1250 62 'b' 101
304
 *               ╰─ 2500─┬─── 1250 63 'c' 110
305
 *                       ╰─── 1250  len: 3, dist:1-1  100 EOF 111
306
 *
307
 * This is the canonical form of the Huffman tree where the actual counts
308
 * aren't known (we use "10000" to help indicate relative frequencies).
309
 */
310
static void debug_huffman_tree_from_table(uint16_t *table)
311
0
{
312
0
  int trail[17];
313
0
  struct huffman_node nodes[1024] = {{0}};
314
0
  uint16_t codes[1024];
315
0
  size_t n = 1;
316
0
  size_t i = 0;
317
0
  codes[0] = 0;
318
0
  nodes[0].count = 10000;
319
320
0
  while (i < n) {
321
0
    uint16_t index = codes[i];
322
0
    struct huffman_node *node = &nodes[i];
323
0
    if (table[index] == 0xffff) {
324
      /* internal node */
325
0
      index <<= 1;
326
      /* left */
327
0
      index++;
328
0
      codes[n] = index;
329
0
      node->left = nodes + n;
330
0
      nodes[n].count = node->count >> 1;
331
0
      n++;
332
      /*right*/
333
0
      index++;
334
0
      codes[n] = index;
335
0
      node->right = nodes + n;
336
0
      nodes[n].count = node->count >> 1;
337
0
      n++;
338
0
    } else {
339
      /* leaf node */
340
0
      node->symbol = table[index] & 511;
341
0
    }
342
0
    i++;
343
0
  }
344
345
0
  fprintf(stderr,
346
0
    "\033[1;34m Tree from decoding table\033[0m "
347
0
    "%zu nodes → %zu codes\n",
348
0
    n, (n + 1) / 2);
349
0
  debug_huffman_tree_print(nodes, trail, 0);
350
0
}
351
352
353
static bool depth_walk(struct huffman_node *n, uint32_t depth)
354
2.12M
{
355
2.12M
  bool ok;
356
2.12M
  if (n->left == NULL) {
357
    /* this is a leaf, record the depth */
358
1.06M
    n->depth = depth;
359
1.06M
    return true;
360
1.06M
  }
361
1.06M
  if (depth > 14) {
362
963
    return false;
363
963
  }
364
1.06M
  ok = (depth_walk(n->left, depth + 1) &&
365
1.05M
        depth_walk(n->right, depth + 1));
366
367
1.06M
  return ok;
368
1.06M
}
369
370
371
static bool check_and_record_depths(struct huffman_node *root)
372
6.20k
{
373
6.20k
  return depth_walk(root, 0);
374
6.20k
}
375
376
377
static bool encode_values(struct huffman_node *leaves,
378
        size_t n_leaves,
379
        uint16_t symbol_values[512])
380
5.24k
{
381
5.24k
  size_t i;
382
  /*
383
   * See, we have a leading 1 in our internal code representation, which
384
   * indicates the code length.
385
   */
386
5.24k
  uint32_t code = 1;
387
5.24k
  uint32_t code_len = 0;
388
5.24k
  memset(symbol_values, 0, sizeof(uint16_t) * 512);
389
820k
  for (i = 0; i < n_leaves; i++) {
390
815k
    code <<= leaves[i].depth - code_len;
391
815k
    code_len = leaves[i].depth;
392
393
815k
    symbol_values[leaves[i].symbol] = code;
394
815k
    code++;
395
815k
  }
396
  /*
397
   * The last code should be 11111... with code_len + 1 ones. The final
398
   * code++ will wrap this round to 1000... with code_len + 1 zeroes.
399
   */
400
401
5.24k
  if (code != 2 << code_len) {
402
0
    return false;
403
0
  }
404
5.24k
  return true;
405
5.24k
}
406
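
A worked example of the assignment loop (a sketch, assuming five leaves already sorted by depth then symbol, with the complete depth set {2, 2, 2, 3, 3}):

/*
 * code starts as 1; the leading 1 marks the length:
 *   depth 2: code = 1 << 2     = 0b100    -> emitted later as '00'
 *   depth 2: code = 0b101                 -> '01'
 *   depth 2: code = 0b110                 -> '10'
 *   depth 3: code = 0b111 << 1 = 0b1110   -> '110'
 *   depth 3: code = 0b1111                -> '111'
 * the final code++ yields 0b10000 == 2 << 3, so the check passes.
 */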
407
408
static int generate_huffman_codes(struct huffman_node *leaf_nodes,
409
          struct huffman_node *internal_nodes,
410
          uint16_t symbol_values[512])
411
5.24k
{
412
5.24k
  size_t head_leaf = 0;
413
5.24k
  size_t head_branch = 0;
414
5.24k
  size_t tail_branch = 0;
415
5.24k
  struct huffman_node *huffman_root = NULL;
416
5.24k
  size_t i, j;
417
5.24k
  size_t n_leaves = 0;
418
419
  /*
420
   * Before we sort the nodes, we can eliminate the unused ones.
421
   */
422
2.69M
  for (i = 0; i < 512; i++) {
423
2.68M
    if (leaf_nodes[i].count) {
424
815k
      leaf_nodes[n_leaves] = leaf_nodes[i];
425
815k
      n_leaves++;
426
815k
    }
427
2.68M
  }
428
5.24k
  if (n_leaves == 0) {
429
0
    return LZXPRESS_ERROR;
430
0
  }
431
5.24k
  if (n_leaves == 1) {
432
    /*
433
     * There is *almost* no way this should happen, and it would
434
     * ruin the tree (because the shortest possible codes are 1
435
     * bit long, and there are two of them).
436
     *
437
     * The only way to get here is in an internal block in a
438
     * 3-or-more block message (i.e. > 128k), which consists
439
     * entirely of a match starting in the previous block (if it
440
     * was the end block, it would have the EOF symbol).
441
     *
442
     * What we do is add a dummy symbol which is this one XOR 256.
443
     * It won't be used in the stream but will balance the tree.
444
     */
445
72
    leaf_nodes[1] = leaf_nodes[0];
446
72
    leaf_nodes[1].symbol ^= 0x100;
447
72
    n_leaves = 2;
448
72
  }
449
450
  /* note, in sort we're using internal_nodes as auxiliary space */
451
5.24k
  stable_sort(leaf_nodes,
452
5.24k
        internal_nodes,
453
5.24k
        n_leaves,
454
5.24k
        sizeof(struct huffman_node),
455
5.24k
        (samba_compare_fn_t)compare_huffman_node_count);
456
457
  /*
458
   * This outer loop is for re-quantizing the counts if the tree is too
459
   * tall (>15), which we need to do because the final encoding can't
460
   * express a tree that deep.
461
   *
462
   * In theory, this should be a 'while (true)' loop, but we chicken
463
   * out with 10 iterations, just in case.
464
   *
465
   * In practice it will almost always resolve in the first round; if
466
   * not then, in the second or third. Remember we'll looking at 64k or
467
   * less, so the rarest we can have is 1 in 64k; each round of
468
   * quantization effectively doubles its frequency to 1 in 32k, 1 in
469
   * 16k, etc, until we're treating the rare symbol as actually quite
470
   * common.
471
   */
472
6.20k
  for (j = 0; j < 10; j++) {
473
6.20k
    bool less_than_15_bits;
474
1.10M
    while (true) {
475
1.10M
      struct huffman_node *a = NULL;
476
1.10M
      struct huffman_node *b = NULL;
477
1.10M
      size_t leaf_len = n_leaves - head_leaf;
478
1.10M
      size_t internal_len = tail_branch - head_branch;
479
480
1.10M
      if (leaf_len + internal_len == 1) {
481
        /*
482
         * We have the complete tree. The root will be
483
         * an internal node unless there is just one
484
         * symbol, which we have already made impossible.
485
         */
486
6.20k
        if (unlikely(leaf_len == 1)) {
487
0
          return LZXPRESS_ERROR;
488
6.20k
        } else {
489
6.20k
          huffman_root = \
490
6.20k
            &internal_nodes[head_branch];
491
6.20k
        }
492
6.20k
        break;
493
6.20k
      }
494
      /*
495
       * We know here we have at least two nodes, and we
496
       * want to select the two lowest scoring ones. Those
497
       * have to be either a) the head of each queue, or b)
498
       * the first two nodes of either queue.
499
       *
500
       * The complicating factors are: a) we need to check
501
       * the length of each queue, and b) in the case of
502
       * ties, we prefer to pair leaves with leaves.
503
       *
504
       * Note a complication we don't have: the leaf node
505
       * queue never grows, and the subtree queue starts
506
       * empty and cannot grow beyond n - 1. It feeds on
507
       * itself. We don't need to think about overflow.
508
       */
509
1.09M
      if (leaf_len == 0) {
510
        /* two from subtrees */
511
295k
        a = &internal_nodes[head_branch];
512
295k
        b = &internal_nodes[head_branch + 1];
513
295k
        head_branch += 2;
514
800k
      } else if (internal_len == 0) {
515
        /* two from nodes */
516
6.20k
        a = &leaf_nodes[head_leaf];
517
6.20k
        b = &leaf_nodes[head_leaf + 1];
518
6.20k
        head_leaf += 2;
519
794k
      } else if (leaf_len == 1 && internal_len == 1) {
520
        /* one of each */
521
686
        a = &leaf_nodes[head_leaf];
522
686
        b = &internal_nodes[head_branch];
523
686
        head_branch++;
524
686
        head_leaf++;
525
794k
      } else {
526
        /*
527
         * Take the lowest head, twice, checking for
528
         * length after taking the first one.
529
         */
530
794k
        if (leaf_nodes[head_leaf].count >
531
794k
            internal_nodes[head_branch].count) {
532
247k
          a = &internal_nodes[head_branch];
533
247k
          head_branch++;
534
247k
          if (internal_len == 1) {
535
1.52k
            b = &leaf_nodes[head_leaf];
536
1.52k
            head_leaf++;
537
1.52k
            goto done;
538
1.52k
          }
539
546k
        } else {
540
546k
          a = &leaf_nodes[head_leaf];
541
546k
          head_leaf++;
542
546k
          if (leaf_len == 1) {
543
3.02k
            b = &internal_nodes[head_branch];
544
3.02k
            head_branch++;
545
3.02k
            goto done;
546
3.02k
          }
547
546k
        }
548
        /* the other node */
549
789k
        if (leaf_nodes[head_leaf].count >
550
789k
            internal_nodes[head_branch].count) {
551
247k
          b = &internal_nodes[head_branch];
552
247k
          head_branch++;
553
541k
        } else {
554
541k
          b = &leaf_nodes[head_leaf];
555
541k
          head_leaf++;
556
541k
        }
557
789k
      }
558
1.09M
    done:
559
      /*
560
       * Now we add a new node to the subtrees list that
561
       * combines the score of node_a and node_b, and points
562
       * to them as children.
563
       */
564
1.09M
      internal_nodes[tail_branch].count = a->count + b->count;
565
1.09M
      internal_nodes[tail_branch].left = a;
566
1.09M
      internal_nodes[tail_branch].right = b;
567
1.09M
      tail_branch++;
568
1.09M
      if (tail_branch == n_leaves) {
569
        /*
570
         * We're not getting here, no way, never ever.
571
         * Unless we made a terrible mistake.
572
         *
573
         * That is, in a binary tree with n leaves,
574
         * there are ALWAYS n-1 internal nodes.
575
         */
576
0
        return LZXPRESS_ERROR;
577
0
      }
578
1.09M
    }
579
6.20k
    if (CHECK_DEBUGLVL(10) || DEBUG_HUFFMAN_TREE) {
580
0
      debug_huffman_tree(huffman_root);
581
0
    }
582
    /*
583
     * We have a tree, and need to turn it into a lookup table,
584
     * and see if it is shallow enough (<= 15).
585
     */
586
6.20k
    less_than_15_bits = check_and_record_depths(huffman_root);
587
6.20k
    if (less_than_15_bits) {
588
      /*
589
       * Now the leaf nodes know how deep they are, and we
590
       * no longer need the internal nodes.
591
       *
592
       * We need to sort the nodes of equal depth, so that
593
       * they are sorted by depth first, and symbol value
594
       * second. The internal_nodes can again be auxiliary
595
       * memory.
596
       */
597
5.24k
      stable_sort(
598
5.24k
        leaf_nodes,
599
5.24k
        internal_nodes,
600
5.24k
        n_leaves,
601
5.24k
        sizeof(struct huffman_node),
602
5.24k
        (samba_compare_fn_t)compare_huffman_node_depth);
603
604
5.24k
      encode_values(leaf_nodes, n_leaves, symbol_values);
605
606
5.24k
      return n_leaves;
607
5.24k
    }
608
609
    /*
610
     * requantize by halving and rounding up, so that small counts
611
     * become relatively bigger. This will lead to a flatter tree.
612
     */
613
287k
    for (i = 0; i < n_leaves; i++) {
614
286k
      leaf_nodes[i].count >>= 1;
615
286k
      leaf_nodes[i].count += 1;
616
286k
    }
617
963
    head_leaf = 0;
618
963
    head_branch = 0;
619
963
    tail_branch = 0;
620
963
  }
621
0
  return LZXPRESS_ERROR;
622
5.24k
}
623
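
To make the requantization concrete, one round on a skewed histogram (a sketch):

/*
 * counts {65536, 3, 2, 1}, after count = (count >> 1) + 1 each:
 *     -> {32769, 2, 2, 1}
 * the rare symbols roughly double in relative weight every round,
 * so the rebuilt tree flattens until every code fits in 15 bits.
 */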
624
/*
625
 * LZX_HUFF_COMP_HASH_SEARCH_ATTEMPTS is how far ahead to search in the
626
 * circular hash table for a match, before we give up. A bigger number will
627
 * generally lead to better but slower compression, but a stupidly big number
628
 * will just be worse.
629
 *
630
 * If you're fiddling with this, consider also fiddling with
631
 * LZX_HUFF_COMP_HASH_BITS.
632
 */
633
1.12G
#define LZX_HUFF_COMP_HASH_SEARCH_ATTEMPTS 5
634
635
static inline void store_match(uint16_t *hash_table,
636
             uint16_t h,
637
             uint16_t offset)
638
74.8M
{
639
74.8M
  int i;
640
74.8M
  uint16_t o = hash_table[h];
641
74.8M
  uint16_t h2;
642
74.8M
  uint16_t worst_h;
643
74.8M
  int worst_score;
644
645
74.8M
  if (o == 0xffff) {
646
    /* there is nothing there yet */
647
16.4M
    hash_table[h] = offset;
648
16.4M
    return;
649
16.4M
  }
650
258M
  for (i = 1; i < LZX_HUFF_COMP_HASH_SEARCH_ATTEMPTS; i++) {
651
211M
    h2 = (h + i) & HASH_MASK;
652
211M
    if (hash_table[h2] == 0xffff) {
653
11.0M
      hash_table[h2] = offset;
654
11.0M
      return;
655
11.0M
    }
656
211M
  }
657
  /*
658
   * There are no slots, but we really want to store this, so we'll kick
659
   * out the one with the longest distance.
660
   */
661
47.2M
  worst_h = h;
662
47.2M
  worst_score = offset - o;
663
236M
  for (i = 1; i < LZX_HUFF_COMP_HASH_SEARCH_ATTEMPTS; i++) {
664
189M
    int score;
665
189M
    h2 = (h + i) & HASH_MASK;
666
189M
    o = hash_table[h2];
667
189M
    score = offset - o;
668
189M
    if (score > worst_score) {
669
55.3M
      worst_score = score;
670
55.3M
      worst_h = h2;
671
55.3M
    }
672
189M
  }
673
47.2M
  hash_table[worst_h] = offset;
674
47.2M
}
675
676
677
/*
678
 * Yes, struct match looks a lot like a DATA_BLOB.
679
 */
680
struct match {
681
  const uint8_t *there;
682
  size_t length;
683
};
684
685
686
static inline struct match lookup_match(uint16_t *hash_table,
687
          uint16_t h,
688
          const uint8_t *data,
689
          const uint8_t *here,
690
          size_t max_len)
691
126M
{
692
126M
  int i;
693
126M
  uint16_t o = hash_table[h];
694
126M
  uint16_t h2;
695
126M
  size_t len;
696
126M
  const uint8_t *there = NULL;
697
126M
  struct match best = {0};
698
699
630M
  for (i = 0; i < LZX_HUFF_COMP_HASH_SEARCH_ATTEMPTS; i++) {
700
534M
    h2 = (h + i) & HASH_MASK;
701
534M
    o = hash_table[h2];
702
534M
    if (o == 0xffff) {
703
      /*
704
       * in setting this, we would never have stepped over
705
       * an 0xffff, so we won't now.
706
       */
707
30.7M
      break;
708
30.7M
    }
709
503M
    there = data + o;
710
503M
    if (here - there > 65534 || there > here) {
711
45.8M
      continue;
712
45.8M
    }
713
714
    /*
715
     * When we already have a long match, we can try to avoid
716
     * measuring out another long, but shorter match.
717
     */
718
458M
    if (best.length > 1000 &&
719
30.5k
        there[best.length - 1] != best.there[best.length - 1]) {
720
11.0k
      continue;
721
11.0k
    }
722
723
458M
    for (len = 0;
724
740M
         len < max_len && here[len] == there[len];
725
458M
         len++) {
726
      /* counting */
727
282M
    }
728
458M
    if (len > 2) {
729
      /*
730
       * As a tiebreaker, we prefer the closer match which
731
       * is likely to encode smaller (and certainly no worse).
732
       */
733
8.18M
      if (len > best.length ||
734
5.36M
          (len == best.length && there > best.there)) {
735
5.36M
        best.length = len;
736
5.36M
        best.there = there;
737
5.36M
      }
738
8.18M
    }
739
458M
  }
740
126M
  return best;
741
126M
}
742
743
744
745
static ssize_t lz77_encode_block(struct lzxhuff_compressor_context *cmp_ctx,
746
         struct lzxhuff_compressor_mem *cmp_mem,
747
         uint16_t *hash_table,
748
         uint16_t *prev_hash_table)
749
5.24k
{
750
5.24k
  uint16_t *intermediate = cmp_mem->intermediate;
751
5.24k
  struct huffman_node *leaf_nodes = cmp_mem->leaf_nodes;
752
5.24k
  uint16_t *symbol_values = cmp_mem->symbol_values;
753
5.24k
  size_t i, j, intermediate_len;
754
5.24k
  const uint8_t *data = cmp_ctx->input_bytes + cmp_ctx->input_pos;
755
5.24k
  const uint8_t *prev_block = NULL;
756
5.24k
  size_t remaining_size = cmp_ctx->input_size - cmp_ctx->input_pos;
757
5.24k
  size_t block_end = MIN(65536, remaining_size);
758
5.24k
  struct match match;
759
5.24k
  int n_symbols;
760
761
5.24k
  if (cmp_ctx->input_size < cmp_ctx->input_pos) {
762
0
    return LZXPRESS_ERROR;
763
0
  }
764
765
5.24k
  if (cmp_ctx->prev_block_pos != cmp_ctx->input_pos) {
766
2.34k
    prev_block = cmp_ctx->input_bytes + cmp_ctx->prev_block_pos;
767
2.89k
  } else if (prev_hash_table != NULL) {
768
    /* we've got confused! hash and block should go together */
769
0
    return LZXPRESS_ERROR;
770
0
  }
771
772
  /*
773
   * leaf_nodes is used to count the symbols seen, for later Huffman
774
   * encoding.
775
   */
776
2.69M
  for (i = 0; i < 512; i++) {
777
2.68M
    leaf_nodes[i] = (struct huffman_node) {
778
2.68M
      .symbol = i
779
2.68M
    };
780
2.68M
  }
781
782
5.24k
  j = 0;
783
784
5.24k
  if (remaining_size < 41 || DEBUG_NO_LZ77_MATCHES) {
785
    /*
786
     * There is no point doing a hash table and looking for
787
     * matches in this tiny block (remembering we are committed to
788
     * using 32 bits, so there's a good chance we wouldn't even
789
     * save a byte). The threshold of 41 bytes matches Windows' behaviour.
790
     * If remaining_size < 3, we *can't* do the hash.
791
     */
792
603
    i = 0;
793
4.64k
  } else {
794
    /*
795
     * We use 0xffff as the unset value for table, because it is
796
     * not a valid match offset (and 0x0 is).
797
     */
798
4.64k
    memset(hash_table, 0xff, sizeof(cmp_mem->hash_table1));
799
800
74.8M
    for (i = 0; i <= block_end - 3; i++) {
801
74.8M
      uint16_t code;
802
74.8M
      const uint8_t *here = data + i;
803
74.8M
      uint16_t h = three_byte_hash(here);
804
74.8M
      size_t max_len = MIN(remaining_size - i, MAX_MATCH_LENGTH);
805
74.8M
      match = lookup_match(hash_table,
806
74.8M
               h,
807
74.8M
               data,
808
74.8M
               here,
809
74.8M
               max_len);
810
811
74.8M
      if (match.there == NULL && prev_hash_table != NULL) {
812
        /*
813
         * If this is not the first block,
814
         * backreferences can look into the previous
815
         * block (but only as far as 65535 bytes, so
816
         * the end of this block cannot see the start
817
         * of the last one).
818
         */
819
51.5M
        match = lookup_match(prev_hash_table,
820
51.5M
                 h,
821
51.5M
                 prev_block,
822
51.5M
                 here,
823
51.5M
                 remaining_size - i);
824
51.5M
      }
825
826
74.8M
      store_match(hash_table, h, i);
827
828
74.8M
      if (match.there == NULL) {
829
        /* add a literal and move on. */
830
72.1M
        uint8_t c = data[i];
831
72.1M
        leaf_nodes[c].count++;
832
72.1M
        intermediate[j] = c;
833
72.1M
        j++;
834
72.1M
        continue;
835
72.1M
      }
836
837
      /* a real match */
838
2.76M
      if (match.length <= 65538) {
839
2.76M
        intermediate[j] = 0xffff;
840
2.76M
        intermediate[j + 1] = match.length - 3;
841
2.76M
        intermediate[j + 2] = here - match.there;
842
2.76M
        j += 3;
843
2.76M
      } else {
844
317
        size_t m = match.length - 3;
845
317
        intermediate[j] = 0xfffe;
846
317
        intermediate[j + 1] = m & 0xffff;
847
317
        intermediate[j + 2] = m >> 16;
848
317
        intermediate[j + 3] = here - match.there;
849
317
        j += 4;
850
317
      }
851
2.76M
      code = encode_match(match.length, here - match.there);
852
2.76M
      leaf_nodes[code].count++;
853
2.76M
      i += match.length - 1; /* `- 1` for the loop i++ */
854
      /*
855
       * A match can take us past the intended block length,
856
       * extending the block. We don't need to do anything
857
       * special for this case -- the loops will naturally
858
       * do the right thing.
859
       */
860
2.76M
    }
861
4.64k
  }
862
863
  /*
864
   * There might be some bytes at the end.
865
   */
866
18.3k
  for (; i < block_end; i++) {
867
13.0k
    leaf_nodes[data[i]].count++;
868
13.0k
    intermediate[j] = data[i];
869
13.0k
    j++;
870
13.0k
  }
871
872
5.24k
  if (i == remaining_size) {
873
    /* add a trailing EOF marker (256) */
874
2.88k
    intermediate[j] = 0xffff;
875
2.88k
    intermediate[j + 1] = 0;
876
2.88k
    intermediate[j + 2] = 1;
877
2.88k
    j += 3;
878
2.88k
    leaf_nodes[256].count++;
879
2.88k
  }
880
881
5.24k
  intermediate_len = j;
882
883
5.24k
  cmp_ctx->prev_block_pos = cmp_ctx->input_pos;
884
5.24k
  cmp_ctx->input_pos += i;
885
886
  /* fill in the symbols table */
887
5.24k
  n_symbols = generate_huffman_codes(leaf_nodes,
888
5.24k
             cmp_mem->internal_nodes,
889
5.24k
             symbol_values);
890
5.24k
  if (n_symbols < 0) {
891
0
    return n_symbols;
892
0
  }
893
894
5.24k
  return intermediate_len;
895
5.24k
}
896
897
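
For reference, the 16-bit token stream this function leaves in the intermediate buffer (and which write_compressed_bytes() consumes below) looks like this:

/*
 *   c in 0..255                             a literal byte c
 *   0xffff, len - 3, distance               a match, length <= 65538
 *   0xfffe, (len - 3) & 0xffff,
 *           (len - 3) >> 16, distance       a longer match
 *   0xffff, 0, 1                            the trailing EOF marker
 *                                           (i.e. symbol 256)
 */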
898
899
static ssize_t write_huffman_table(uint16_t symbol_values[512],
900
           uint8_t *output,
901
           size_t available_size)
902
5.24k
{
903
5.24k
  size_t i;
904
905
5.24k
  if (available_size < 256) {
906
0
    return LZXPRESS_ERROR;
907
0
  }
908
909
1.34M
  for (i = 0; i < 256; i++) {
910
1.34M
    uint8_t b = 0;
911
1.34M
    uint16_t even = symbol_values[i * 2];
912
1.34M
    uint16_t odd = symbol_values[i * 2 + 1];
913
1.34M
    if (even != 0) {
914
402k
      b = bitlen_nonzero_16(even);
915
402k
    }
916
1.34M
    if (odd != 0) {
917
413k
      b |= bitlen_nonzero_16(odd) << 4;
918
413k
    }
919
1.34M
    output[i] = b;
920
1.34M
  }
921
5.24k
  return i;
922
5.24k
}
923
924
925
struct write_context {
926
  uint8_t *dest;
927
  size_t dest_len;
928
  size_t head;                 /* where lengths go */
929
  size_t next_code;            /* where symbol stream goes */
930
  size_t pending_next_code;    /* will be next_code */
931
  unsigned bit_len;
932
  uint32_t bits;
933
};
934
935
/*
936
 * Write out 16 bits, little-endian, for write_huffman_codes()
937
 *
938
 * As you'll notice, there's a bit to do.
939
 *
940
 * We are collecting up bits in a uint32_t, then when there are 16 of them we
941
 * write out a word into the stream, using a trio of offsets (wc->next_code,
942
 * wc->pending_next_code, and wc->head) which dance around ensuring that the
943
 * bitstream and the interspersed lengths are in the right places relative to
944
 * each other.
945
 */
946
947
static inline bool write_bits(struct write_context *wc,
948
            uint16_t code, uint16_t length)
949
77.5M
{
950
77.5M
  wc->bits <<= length;
951
77.5M
  wc->bits |= code;
952
77.5M
  wc->bit_len += length;
953
77.5M
  if (wc->bit_len > 16) {
954
38.1M
    uint32_t w = wc->bits >> (wc->bit_len - 16);
955
38.1M
    wc->bit_len -= 16;
956
38.1M
    if (wc->next_code + 2 > wc->dest_len ||
957
38.1M
        unlikely(wc->bit_len > 16)) {
958
21
      return false;
959
21
    }
960
38.1M
    wc->dest[wc->next_code] = w & 0xff;
961
38.1M
    wc->dest[wc->next_code + 1] = (w >> 8) & 0xff;
962
38.1M
    wc->next_code = wc->pending_next_code;
963
38.1M
    wc->pending_next_code = wc->head;
964
38.1M
    wc->head += 2;
965
38.1M
  }
966
77.5M
  return true;
967
77.5M
}
968
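
A short trace of that flushing dance, starting from an empty accumulator (a sketch):

/*
 * write_bits(wc, 0x3ff, 10):  bits = 0x3ff,   bit_len = 10
 * write_bits(wc, 0x1ff, 9):   bits = 0x7ffff, bit_len = 19 > 16
 *   w = 0x7ffff >> 3 = 0xffff      (the oldest 16 bits)
 *   w is stored little-endian at dest[next_code], bit_len becomes 3,
 *   and the offsets rotate: next_code <- pending_next_code,
 *   pending_next_code <- head, head += 2.
 */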
969
970
static inline bool write_code(struct write_context *wc, uint16_t code)
971
74.7M
{
972
74.7M
  int code_bit_len = bitlen_nonzero_16(code);
973
74.7M
  if (unlikely(code == 0)) {
974
0
    return false;
975
0
  }
976
74.7M
  code &= (1 << code_bit_len) - 1;
977
74.7M
  return  write_bits(wc, code, code_bit_len);
978
74.7M
}
979
980
static inline bool write_byte(struct write_context *wc, uint8_t byte)
981
451k
{
982
451k
  if (wc->head + 1 > wc->dest_len) {
983
4
    return false;
984
4
  }
985
451k
  wc->dest[wc->head] = byte;
986
451k
  wc->head++;
987
451k
  return true;
988
451k
}
989
990
991
static inline bool write_long_len(struct write_context *wc, size_t len)
992
30.1k
{
993
30.1k
  if (len < 65535) {
994
29.8k
    if (wc->head + 3 > wc->dest_len) {
995
23
      return false;
996
23
    }
997
29.8k
    wc->dest[wc->head] = 255;
998
29.8k
    wc->dest[wc->head + 1] = len & 255;
999
29.8k
    wc->dest[wc->head + 2] = len >> 8;
1000
29.8k
    wc->head += 3;
1001
29.8k
  } else {
1002
343
    if (wc->head + 7 > wc->dest_len) {
1003
10
      return false;
1004
10
    }
1005
333
    wc->dest[wc->head] = 255;
1006
333
    wc->dest[wc->head + 1] = 0;
1007
333
    wc->dest[wc->head + 2] = 0;
1008
333
    wc->dest[wc->head + 3] = len & 255;
1009
333
    wc->dest[wc->head + 4] = (len >> 8) & 255;
1010
333
    wc->dest[wc->head + 5] = (len >> 16) & 255;
1011
333
    wc->dest[wc->head + 6] = (len >> 24) & 255;
1012
333
    wc->head += 7;
1013
333
  }
1014
30.1k
  return true;
1015
30.1k
}
1016
1017
static ssize_t write_compressed_bytes(uint16_t symbol_values[512],
1018
              uint16_t *intermediate,
1019
              size_t intermediate_len,
1020
              uint8_t *dest,
1021
              size_t dest_len)
1022
5.24k
{
1023
5.24k
  bool ok;
1024
5.24k
  size_t i;
1025
5.24k
  size_t end;
1026
5.24k
  struct write_context wc = {
1027
5.24k
    .head = 4,
1028
5.24k
    .pending_next_code = 2,
1029
5.24k
    .dest = dest,
1030
5.24k
    .dest_len = dest_len
1031
5.24k
  };
1032
74.7M
  for (i = 0; i < intermediate_len; i++) {
1033
74.7M
    uint16_t c = intermediate[i];
1034
74.7M
    size_t len;
1035
74.7M
    uint16_t distance;
1036
74.7M
    uint16_t code_len = 0;
1037
74.7M
    uint16_t code_dist = 0;
1038
74.7M
    if (c < 256) {
1039
72.0M
      ok = write_code(&wc, symbol_values[c]);
1040
72.0M
      if (!ok) {
1041
5
        return LZXPRESS_ERROR;
1042
5
      }
1043
72.0M
      continue;
1044
72.0M
    }
1045
1046
2.76M
    if (c == 0xfffe) {
1047
316
      if (i > intermediate_len - 4) {
1048
0
        return LZXPRESS_ERROR;
1049
0
      }
1050
1051
316
      len = intermediate[i + 1];
1052
316
      len |= (uint32_t)intermediate[i + 2] << 16;
1053
316
      distance = intermediate[i + 3];
1054
316
      i += 3;
1055
2.76M
    } else if (c == 0xffff) {
1056
2.76M
      if (i > intermediate_len - 3) {
1057
0
        return LZXPRESS_ERROR;
1058
0
      }
1059
2.76M
      len = intermediate[i + 1];
1060
2.76M
      distance = intermediate[i + 2];
1061
2.76M
      i += 2;
1062
2.76M
    } else {
1063
0
      return LZXPRESS_ERROR;
1064
0
    }
1065
2.76M
    if (unlikely(distance == 0)) {
1066
0
      return LZXPRESS_ERROR;
1067
0
    }
1068
    /* len has already had 3 subtracted */
1069
2.76M
    if (len >= 15) {
1070
      /*
1071
       * We are going to need to write extra length
1072
       * bytes into the stream, but we don't do it
1073
       * now, we do it after the code has been
1074
       * written (and before the distance bits).
1075
       */
1076
481k
      code_len = 15;
1077
2.28M
    } else {
1078
2.28M
      code_len = len;
1079
2.28M
    }
1080
2.76M
    code_dist = bitlen_nonzero_16(distance);
1081
2.76M
    c = 256 | (code_dist << 4) | code_len;
1082
2.76M
    if (c > 511) {
1083
0
      return LZXPRESS_ERROR;
1084
0
    }
1085
1086
2.76M
    ok = write_code(&wc, symbol_values[c]);
1087
2.76M
    if (!ok) {
1088
3
      return LZXPRESS_ERROR;
1089
3
    }
1090
1091
2.76M
    if (code_len == 15) {
1092
481k
      if (len >= 270) {
1093
30.1k
        ok = write_long_len(&wc, len);
1094
451k
      } else {
1095
451k
        ok = write_byte(&wc, len - 15);
1096
451k
      }
1097
481k
      if (! ok) {
1098
37
        return LZXPRESS_ERROR;
1099
37
      }
1100
481k
    }
1101
2.76M
    if (code_dist != 0) {
1102
2.73M
      uint16_t dist_bits = distance - (1 << code_dist);
1103
2.73M
      ok = write_bits(&wc, dist_bits, code_dist);
1104
2.73M
      if (!ok) {
1105
3
        return LZXPRESS_ERROR;
1106
3
      }
1107
2.73M
    }
1108
2.76M
  }
1109
  /*
1110
   * There are some intricacies around flushing the bits and returning
1111
   * the length.
1112
   *
1113
   * If the returned length is not exactly right and there is another
1114
   * block, that block will read its huffman table from the wrong place,
1115
   * and have all the symbol codes out by a multiple of 4.
1116
   */
1117
5.19k
  end = wc.head;
1118
5.19k
  if (wc.bit_len == 0) {
1119
0
    end -= 2;
1120
0
  }
1121
5.19k
  ok = write_bits(&wc, 0, 16 - wc.bit_len);
1122
5.19k
  if (!ok) {
1123
0
    return LZXPRESS_ERROR;
1124
0
  }
1125
15.5k
  for (i = 0; i < 2; i++) {
1126
    /*
1127
     * Flush out the bits with zeroes. It doesn't matter if we do
1128
     * a round too many, as we have buffer space, and have already
1129
     * determined the returned length (end).
1130
     */
1131
10.3k
    ok = write_bits(&wc, 0, 16);
1132
10.3k
    if (!ok) {
1133
10
      return LZXPRESS_ERROR;
1134
10
    }
1135
10.3k
  }
1136
5.18k
  return end;
1137
5.19k
}
1138
1139
1140
static ssize_t lzx_huffman_compress_block(struct lzxhuff_compressor_context *cmp_ctx,
1141
            struct lzxhuff_compressor_mem *cmp_mem,
1142
            size_t block_no)
1143
5.25k
{
1144
5.25k
  ssize_t intermediate_size;
1145
5.25k
  uint16_t *hash_table = NULL;
1146
5.25k
  uint16_t *back_window_hash_table = NULL;
1147
5.25k
  ssize_t bytes_written;
1148
1149
5.25k
  if (cmp_ctx->available_size - cmp_ctx->output_pos < 260) {
1150
    /* huffman block + 4 bytes */
1151
13
    return LZXPRESS_ERROR;
1152
13
  }
1153
1154
  /*
1155
   * For LZ77 compression, we keep a hash table for the previous block,
1156
   * via alternation after the first block.
1157
   *
1158
   * LZ77 writes into the intermediate buffer in the cmp_mem context.
1159
   */
1160
5.24k
  if (block_no == 0) {
1161
2.89k
    hash_table = cmp_mem->hash_table1;
1162
2.89k
    back_window_hash_table = NULL;
1163
2.89k
  } else if (block_no & 1) {
1164
1.45k
    hash_table = cmp_mem->hash_table2;
1165
1.45k
    back_window_hash_table = cmp_mem->hash_table1;
1166
1.45k
  } else {
1167
894
    hash_table = cmp_mem->hash_table1;
1168
894
    back_window_hash_table = cmp_mem->hash_table2;
1169
894
  }
1170
1171
5.24k
  intermediate_size = lz77_encode_block(cmp_ctx,
1172
5.24k
                cmp_mem,
1173
5.24k
                hash_table,
1174
5.24k
                back_window_hash_table);
1175
1176
5.24k
  if (intermediate_size < 0) {
1177
0
    return intermediate_size;
1178
0
  }
1179
1180
  /*
1181
   * Write the 256 byte Huffman table, based on the counts gained in
1182
   * LZ77 phase.
1183
   */
1184
5.24k
  bytes_written = write_huffman_table(
1185
5.24k
    cmp_mem->symbol_values,
1186
5.24k
    cmp_ctx->output + cmp_ctx->output_pos,
1187
5.24k
    cmp_ctx->available_size - cmp_ctx->output_pos);
1188
1189
5.24k
  if (bytes_written != 256) {
1190
0
    return LZXPRESS_ERROR;
1191
0
  }
1192
5.24k
  cmp_ctx->output_pos += 256;
1193
1194
  /*
1195
   * Write the compressed bytes using the LZ77 matches and Huffman codes
1196
   * worked out in the previous steps.
1197
   */
1198
5.24k
  bytes_written = write_compressed_bytes(
1199
5.24k
    cmp_mem->symbol_values,
1200
5.24k
    cmp_mem->intermediate,
1201
5.24k
    intermediate_size,
1202
5.24k
    cmp_ctx->output + cmp_ctx->output_pos,
1203
5.24k
    cmp_ctx->available_size - cmp_ctx->output_pos);
1204
1205
5.24k
  if (bytes_written < 0) {
1206
58
    return bytes_written;
1207
58
  }
1208
1209
5.18k
  cmp_ctx->output_pos += bytes_written;
1210
5.18k
  return bytes_written;
1211
5.24k
}
1212
1213
/*
1214
 * lzxpress_huffman_max_compressed_size()
1215
 *
1216
 * Return the most bytes the compression can take, to allow
1217
 * pre-allocation.
1218
 */
1219
size_t lzxpress_huffman_max_compressed_size(size_t input_size)
1220
0
{
1221
  /*
1222
   * In the worst case, the output size should be about the same as the
1223
   * input size, plus the 256 byte header per 64k block. We aim for
1224
   * ample, but within the order of magnitude.
1225
   */
1226
0
  return input_size + (input_size / 8) + 270;
1227
0
}
1228
1229
/*
1230
 * lzxpress_huffman_compress_talloc()
1231
 *
1232
 * This is the convenience function that allocates the compressor context and
1233
 * output memory for you. The return value is the number of bytes written to
1234
 * the location indicated by the output pointer.
1235
 *
1236
 * The maximum input_size is effectively around 227MB due to the need to guess
1237
 * an upper bound on the output size that hits an internal limitation in
1238
 * talloc.
1239
 *
1240
 * @param mem_ctx      TALLOC_CTX parent for the compressed buffer.
1241
 * @param input_bytes  memory to be compressed.
1242
 * @param input_size   length of the input buffer.
1243
 * @param output       destination pointer for the compressed data.
1244
 *
1245
 * @return the number of bytes written or -1 on error.
1246
 */
1247
1248
ssize_t lzxpress_huffman_compress_talloc(TALLOC_CTX *mem_ctx,
1249
           const uint8_t *input_bytes,
1250
           size_t input_size,
1251
           uint8_t **output)
1252
0
{
1253
0
  struct lzxhuff_compressor_mem *cmp = NULL;
1254
0
  size_t alloc_size = lzxpress_huffman_max_compressed_size(input_size);
1255
1256
0
  ssize_t output_size;
1257
1258
0
  *output = talloc_array(mem_ctx, uint8_t, alloc_size);
1259
0
  if (*output == NULL) {
1260
0
    return LZXPRESS_ERROR;
1261
0
  }
1262
1263
0
  cmp = talloc(mem_ctx, struct lzxhuff_compressor_mem);
1264
0
  if (cmp == NULL) {
1265
0
    TALLOC_FREE(*output);
1266
0
    return LZXPRESS_ERROR;
1267
0
  }
1268
1269
0
  output_size = lzxpress_huffman_compress(cmp,
1270
0
            input_bytes,
1271
0
            input_size,
1272
0
            *output,
1273
0
            alloc_size);
1274
1275
0
  talloc_free(cmp);
1276
1277
0
  if (output_size < 0) {
1278
0
    TALLOC_FREE(*output);
1279
0
    return LZXPRESS_ERROR;
1280
0
  }
1281
1282
0
  *output = talloc_realloc(mem_ctx, *output, uint8_t, output_size);
1283
0
  if (*output == NULL) {
1284
0
    return LZXPRESS_ERROR;
1285
0
  }
1286
1287
0
  return output_size;
1288
0
}
1289
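
Typical usage of the convenience function (a minimal sketch; input/input_len stand for the caller's buffer, and error handling is abbreviated):

TALLOC_CTX *ctx = talloc_new(NULL);
uint8_t *compressed = NULL;
ssize_t len = lzxpress_huffman_compress_talloc(ctx, input, input_len,
                                               &compressed);
if (len < 0) {
  /* compression failed; compressed has already been freed */
}
talloc_free(ctx);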
1290
/*
1291
 * lzxpress_huffman_compress()
1292
 *
1293
 * This is the inconvenience function, slightly faster and fiddlier than
1294
 * lzxpress_huffman_compress_talloc().
1295
 *
1296
 * To use this, you need to have allocated (but not initialised) a `struct
1297
 * lzxhuff_compressor_mem`, and an output buffer. If the buffer is not big
1298
 * enough (per `output_size`), you'll get a negative return value, otherwise
1299
 * the number of bytes actually consumed, which will always be at least 260.
1300
 *
1301
 * The `struct lzxhuff_compressor_mem` is reusable -- it is basically a
1302
 * collection of uninitialised memory buffers. The total size is less than
1303
 * 150k, so stack allocation is plausible.
1304
 *
1305
 * input_size and available_size are limited to the minimum of UINT32_MAX and
1306
 * SSIZE_MAX. On 64 bit machines that will be UINT32_MAX, or 4GB.
1307
 *
1308
 * @param cmp_mem         a struct lzxhuff_compressor_mem.
1309
 * @param input_bytes     memory to be compressed.
1310
 * @param input_size      length of the input buffer.
1311
 * @param output          destination for the compressed data.
1312
 * @param available_size  allocated output bytes.
1313
 *
1314
 * @return the number of bytes written or -1 on error.
1315
 */
1316
ssize_t lzxpress_huffman_compress(struct lzxhuff_compressor_mem *cmp_mem,
1317
          const uint8_t *input_bytes,
1318
          size_t input_size,
1319
          uint8_t *output,
1320
          size_t available_size)
1321
2.93k
{
1322
2.93k
  size_t i = 0;
1323
2.93k
  struct lzxhuff_compressor_context cmp_ctx = {
1324
2.93k
    .input_bytes = input_bytes,
1325
2.93k
    .input_size = input_size,
1326
2.93k
    .input_pos = 0,
1327
2.93k
    .prev_block_pos = 0,
1328
2.93k
    .output = output,
1329
2.93k
    .available_size = available_size,
1330
2.93k
    .output_pos = 0
1331
2.93k
  };
1332
1333
2.93k
  if (input_size == 0) {
1334
    /*
1335
     * We can't deal with this for a number of reasons (e.g. it
1336
     * breaks the Huffman tree), and the output will be infinitely
1337
     * bigger than the input. The caller needs to go and think
1338
     * about what they're trying to do here.
1339
     */
1340
32
    return LZXPRESS_ERROR;
1341
32
  }
1342
1343
2.90k
  if (input_size > SSIZE_MAX ||
1344
2.90k
      input_size > UINT32_MAX ||
1345
2.90k
      available_size > SSIZE_MAX ||
1346
2.90k
      available_size > UINT32_MAX ||
1347
2.90k
      available_size == 0) {
1348
    /*
1349
     * We use negative ssize_t to return errors, which is limiting
1350
     * on 32 bit machines; otherwise we adhere to Microsoft's 4GB
1351
     * limit.
1352
     *
1353
     * lzxpress_huffman_compress_talloc() will not get this far,
1354
     * having already failed on talloc's 256 MB limit.
1355
     */
1356
1
    return LZXPRESS_ERROR;
1357
1
  }
1358
1359
2.90k
  if (cmp_mem == NULL ||
1360
2.90k
      output == NULL ||
1361
2.90k
      input_bytes == NULL) {
1362
0
    return LZXPRESS_ERROR;
1363
0
  }
1364
1365
8.09k
  while (cmp_ctx.input_pos < cmp_ctx.input_size) {
1366
5.25k
    ssize_t ret;
1367
5.25k
    ret = lzx_huffman_compress_block(&cmp_ctx,
1368
5.25k
             cmp_mem,
1369
5.25k
             i);
1370
5.25k
    if (ret < 0) {
1371
71
      return ret;
1372
71
    }
1373
5.18k
    i++;
1374
5.18k
  }
1375
1376
2.83k
  return cmp_ctx.output_pos;
1377
2.90k
}
1378
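
And the direct API, exploiting the note above that the ~150k context can live on the stack (a sketch; in/in_len stand for the caller's buffer):

struct lzxhuff_compressor_mem cmp_mem;  /* no initialisation needed */
uint8_t out[65536];
ssize_t n = lzxpress_huffman_compress(&cmp_mem, in, in_len,
                                      out, sizeof(out));
if (n < 0) {
  /* out was too small, or the input was empty or over-sized */
}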
1379
static void debug_tree_codes(struct bitstream *input)
1380
0
{
1381
  /*
1382
   */
1383
0
  size_t head = 0;
1384
0
  size_t tail = 2;
1385
0
  size_t ffff_count = 0;
1386
0
  struct q {
1387
0
    uint16_t tree_code;
1388
0
    uint16_t code_code;
1389
0
  };
1390
0
  struct q queue[65536];
1391
0
  char bits[17];
1392
0
  uint16_t *t = input->table;
1393
0
  queue[0].tree_code = 1;
1394
0
  queue[0].code_code = 2;
1395
0
  queue[1].tree_code = 2;
1396
0
  queue[1].code_code = 3;
1397
0
  while (head < tail) {
1398
0
    struct q q = queue[head];
1399
0
    uint16_t x = t[q.tree_code];
1400
0
    if (x != 0xffff) {
1401
0
      int k;
1402
0
      uint16_t j = q.code_code;
1403
0
      size_t offset = bitlen_nonzero_16(j) - 1;
1404
0
      if (unlikely(j == 0)) {
1405
0
        DBG("BROKEN code is 0!\n");
1406
0
        return;
1407
0
      }
1408
1409
0
      for (k = 0; k <= offset; k++) {
1410
0
        bool b = (j >> (offset - k)) & 1;
1411
0
        bits[k] = b ? '1' : '0';
1412
0
      }
1413
0
      bits[k] = 0;
1414
0
      DBG("%03x   %s\n", x & 511, bits);
1415
0
      head++;
1416
0
      continue;
1417
0
    }
1418
0
    ffff_count++;
1419
0
    queue[tail].tree_code = q.tree_code * 2 + 1;
1420
0
    queue[tail].code_code = q.code_code * 2;
1421
0
    tail++;
1422
0
    queue[tail].tree_code = q.tree_code * 2 + 1 + 1;
1423
0
    queue[tail].code_code = q.code_code * 2 + 1;
1424
0
    tail++;
1425
0
    head++;
1426
0
  }
1427
0
  DBG("0xffff count: %zu\n", ffff_count);
1428
0
}
1429
1430
/**
1431
 * Determines the sort order of one prefix_code_symbol relative to another
1432
 */
1433
static int compare_uint16(const uint16_t *a, const uint16_t *b)
1434
8.83M
{
1435
8.83M
  if (*a < *b) {
1436
6.05M
    return -1;
1437
6.05M
  }
1438
2.77M
  if (*a > *b) {
1439
2.77M
    return 1;
1440
2.77M
  }
1441
0
  return 0;
1442
2.77M
}
1443
1444
1445
static bool fill_decomp_table(struct bitstream *input)
1446
9.98k
{
1447
  /*
1448
   * There are 512 symbols, each encoded in 4 bits, which indicates
1449
   * their depth in the Huffman tree. The even numbers get the lower
1450
   * nibble of each byte, so that the byte hex values look backwards
1451
   * (i.e. 0xab encodes b then a). These are allocated Huffman codes in
1452
   * order of appearance, per depth.
1453
   *
1454
   * For example, if the first two bytes were:
1455
   *
1456
   * 0x23 0x53
1457
   *
1458
   * the first four codes have the lengths 3, 2, 3, 5.
1459
   * Let's call them A, B, C, D.
1460
   *
1461
   * Suppose there is no other codeword with length 1 (which is
1462
   * necessarily true in this example) or 2, but there might be others
1463
   * of length 3 or 4. Then we can say this about the codes:
1464
   *
1465
   *        _ --*--_
1466
   *      /          \
1467
   *     0           1
1468
   *    / \         / \
1469
   *   0   1       0   1
1470
   *  B    |\     / \  |\
1471
   *       0 1   0   1 0 1
1472
   *       A C   |\ /| | |\
1473
   *
1474
   * pos bits  code
1475
   * A    3    010
1476
   * B    2    00
1477
   * C    3    011
1478
   * D    5    1????
1479
   *
1480
   * B has the shortest code, so takes the leftmost branch, 00. That
1481
   * ends the branch -- nothing else can start with 00. There are no
1482
   * more 2s, so we look at the 3s, starting as far left as possible. So
1483
   * A takes 010 and C takes 011. That means everything else has to
1484
   * start with 1xx. We don't know how many codewords of length 3 or 4
1485
   * there are; if there are none, D would end up with 10000, the
1486
   * leftmost available code of length 5. If the compressor is any good,
1487
   * there should be no unused leaf nodes left dangling at the end.
1488
   *
1489
   * (this is "Canonical Huffman Coding").
1490
   *
1491
   *
1492
   * But what symbols do these codes actually stand for?
1493
   * --------------------------------------------------
1494
   *
1495
   * Good question. The first 256 codes stand for the corresponding
1496
   * literal bytes. The codes from 256 to 511 stand for LZ77 matches,
1497
   * which have a distance and a length, encoded in a strange way that
1498
   * isn't entirely the purview of this function.
1499
   *
1500
   * What does the value 0 mean?
1501
   * ---------------------------
1502
   *
1503
   * The code does not occur. For example, if the next byte in the
1504
   * example above was 0x07, that would give the byte 0x04 a 7-long
1505
   * code, and no code to the 0x05 byte, which means we there is no way
1506
   * we going to see a 5 in the decoded stream.
1507
   *
1508
   * Isn't LZ77 + Huffman what zip/gzip/zlib do?
1509
   * -------------------------------------------
1510
   *
1511
   * Yes, DEFLATE is LZ77 + Huffman, but the details are quite different.
1512
   */
1513
9.98k
  uint16_t symbols[512];
1514
9.98k
  uint16_t sort_mem[512];
1515
9.98k
  size_t i, n_symbols;
1516
9.98k
  ssize_t code;
1517
9.98k
  uint16_t len = 0, prev_len;
1518
9.98k
  const uint8_t *table_bytes = input->bytes + input->byte_pos;
1519
1520
9.98k
  if (input->byte_pos + 260 > input->byte_size) {
1521
35
    return false;
1522
35
  }
1523
1524
9.95k
  n_symbols = 0;
1525
2.55M
  for (i = 0; i < 256; i++) {
1526
2.54M
    uint16_t even = table_bytes[i] & 15;
1527
2.54M
    uint16_t odd = table_bytes[i] >> 4;
1528
2.54M
    if (even != 0) {
1529
586k
      symbols[n_symbols] = (even << 9) + i * 2;
1530
586k
      n_symbols++;
1531
586k
    }
1532
2.54M
    if (odd != 0) {
1533
1.54M
      symbols[n_symbols] = (odd << 9) + i * 2 + 1;
1534
1.54M
      n_symbols++;
1535
1.54M
    }
1536
2.54M
  }
1537
9.95k
  input->byte_pos += 256;
1538
9.95k
  if (n_symbols == 0) {
1539
3
    return false;
1540
3
  }
1541
1542
9.94k
  stable_sort(symbols, sort_mem, n_symbols, sizeof(uint16_t),
1543
9.94k
        (samba_compare_fn_t)compare_uint16);
1544
1545
  /*
1546
   * we're using an implicit binary tree, as you'd see in a heap.
1547
   * table[0] = unused
1548
   * table[1] = '0'
1549
   * table[2] = '1'
1550
   * table[3] = '00'     <-- '00' and '01' are children of '0'
1551
   * table[4] = '01'     <-- '0' is [0], children are [0 * 2 + {1,2}]
1552
   * table[5] = '10'
1553
   * table[6] = '11'
1554
   * table[7] = '000'
1555
   * table[8] = '001'
1556
   * table[9] = '010'
1557
   * table[10]= '011'
1558
   * table[11]= '100
1559
   *'
1560
   * table[1 << n - 1] = '0' * n
1561
   * table[1 << n - 1 + x] = n-bit wide x (left padded with '0')
1562
   * table[1 << n - 2] = '1' * (n - 1)
1563
   *
1564
   * table[i]->left =  table[i*2 + 1]
1565
   * table[i]->right = table[i*2 + 2]
1566
   * table[0xffff] = unused (16 '0's, max len is 15)
1567
   *
1568
   * therefore e.g. table[70] = table[64     - 1 + 7]
1569
   *                          = table[1 << 6 - 1 + 7]
1570
   *                          = '000111' (binary 7, widened to 6 bits)
1571
   *
1572
   *   and if '000111' is a code,
1573
   *   '00011', '0001', '000', '00', '0' are unavailable prefixes.
1574
   *       34      16      7     3    1  are their indices
1575
   *   and (i - 1) >> 1 is the path back from 70 through these.
1576
   *
1577
   * the lookup is
1578
   *
1579
   * 1 start with i = 0
1580
   * 2 extract a symbol bit (i = (i << 1) + bit + 1)
1581
   * 3 is table[i] == 0xffff?
1582
   * 4  yes -- goto 2
1583
   * 4  table[i] & 511 is the symbol, stop
1584
   *
1585
   * and the construction (here) is sort of the reverse.
1586
   *
1587
   * Most of this table is free space that can never be reached, and
1588
   * most of the activity is at the beginning (since all codes start
1589
   * there, and by design the shortest codes are the most common).
1590
   */
1591
328k
  for (i = 0; i < 32; i++) {
1592
    /* prefill the table head */
1593
318k
    input->table[i] = 0xffff;
1594
318k
  }
1595
9.94k
  code = -1;
1596
9.94k
  prev_len = 0;
1597
2.13M
  for (i = 0; i < n_symbols; i++) {
1598
2.12M
    uint16_t s = symbols[i];
1599
2.12M
    uint16_t prefix;
1600
2.12M
    len = (s >> 9) & 15;
1601
2.12M
    s &= 511;
1602
2.12M
    code++;
1603
2.20M
    while (len != prev_len) {
1604
80.8k
      code <<= 1;
1605
80.8k
      code++;
1606
80.8k
      prev_len++;
1607
80.8k
    }
1608
1609
2.12M
    if (code >= 65535) {
1610
174
      return false;
1611
174
    }
1612
2.12M
    input->table[code] = s;
1613
2.12M
    for(prefix = (code - 1) >> 1;
1614
9.51M
        prefix > 31;
1615
7.39M
        prefix = (prefix - 1) >> 1) {
1616
7.39M
      input->table[prefix] = 0xffff;
1617
7.39M
    }
1618
2.12M
  }
1619
9.77k
  if (CHECK_DEBUGLVL(10)) {
1620
0
    debug_tree_codes(input);
1621
0
  }
1622
1623
  /*
1624
   * check that the last code encodes 11111..., with right number of
1625
   * ones, pointing to the right symbol -- otherwise we have a dangling
1626
   * uninitialised symbol.
1627
   */
1628
9.77k
  if (code != (1 << (len + 1)) - 2) {
1629
118
    return false;
1630
118
  }
1631
9.65k
  return true;
1632
9.77k
}
1633
1634
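
The table built above is walked bit-by-bit exactly as the numbered lookup steps describe; a self-contained sketch (ignoring the little-endian 16-bit word framing the real bitstream uses):

static uint16_t decode_one_symbol(const uint16_t *table,
                                  uint32_t bits, int *remaining_bits)
{
  size_t i = 0;
  do {
    /* take the next most-significant unread bit */
    int bit = (bits >> --(*remaining_bits)) & 1;
    i = (i << 1) + bit + 1;
  } while (table[i] == 0xffff);   /* 0xffff means "keep descending" */
  return table[i] & 511;          /* the symbol */
}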
1635
#define CHECK_READ_32(dest)           \
1636
2.64k
  do {               \
1637
2.64k
    if (input->byte_pos + 4 > input->byte_size) {     \
1638
19
      return LZXPRESS_ERROR;        \
1639
19
    }               \
1640
2.64k
    dest = PULL_LE_U32(input->bytes, input->byte_pos); \
1641
2.62k
    input->byte_pos += 4;          \
1642
2.62k
  } while (0)
1643
1644
#define CHECK_READ_16(dest)           \
1645
34.9M
  do {               \
1646
34.9M
    if (input->byte_pos + 2 > input->byte_size) {     \
1647
22
      return LZXPRESS_ERROR;        \
1648
22
    }               \
1649
34.9M
    dest = PULL_LE_U16(input->bytes, input->byte_pos); \
1650
34.9M
    input->byte_pos += 2;          \
1651
34.9M
  } while (0)
1652
1653
#define CHECK_READ_8(dest) \
1654
167k
  do {               \
1655
167k
    if (input->byte_pos >= input->byte_size) {   \
1656
55
      return LZXPRESS_ERROR;       \
1657
55
    }              \
1658
167k
    dest = PULL_LE_U8(input->bytes, input->byte_pos);  \
1659
167k
    input->byte_pos++;          \
1660
167k
  } while(0)
1661
1662
1663
static inline ssize_t pull_bits(struct bitstream *input)
1664
34.8M
{
1665
34.8M
  if (input->byte_pos + 1 < input->byte_size) {
1666
34.8M
    uint16_t tmp;
1667
34.8M
    CHECK_READ_16(tmp);
1668
34.8M
    input->remaining_bits += 16;
1669
34.8M
    input->bits <<= 16;
1670
34.8M
    input->bits |= tmp;
1671
34.8M
  } else if (input->byte_pos < input->byte_size) {
1672
179
    uint8_t tmp;
1673
179
    CHECK_READ_8(tmp);
1674
179
    input->remaining_bits += 8;
1675
179
    input->bits <<= 8;
1676
179
    input->bits |= tmp;
1677
179
  } else {
1678
167
    return LZXPRESS_ERROR;
1679
167
  }
1680
34.8M
  return 0;
1681
34.8M
}
1682
1683
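
pull_bits() keeps a 32-bit reservoir that the decoder drains one bit at a time, most significant bit first, refilling in 16-bit little-endian words whenever remaining_bits drops to 16. A hypothetical toy version of that cycle (toy_pull_bits and the data bytes are invented; the real function also handles a final odd byte):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct toy_stream {
        const uint8_t *bytes;
        size_t byte_pos;
        size_t byte_size;
        uint32_t bits;
        int remaining_bits;
};

/* refill 16 bits from a little-endian word, as pull_bits() does in
 * the common case */
static int toy_pull_bits(struct toy_stream *s)
{
        uint16_t w;
        if (s->byte_pos + 2 > s->byte_size) {
                return -1;
        }
        w = s->bytes[s->byte_pos] | (s->bytes[s->byte_pos + 1] << 8);
        s->byte_pos += 2;
        s->bits = (s->bits << 16) | w;
        s->remaining_bits += 16;
        return 0;
}

int main(void)
{
        /* two little-endian words: 0x8001 then 0xffff (invented data) */
        const uint8_t data[] = {0x01, 0x80, 0xff, 0xff};
        struct toy_stream s = {data, 0, sizeof(data), 0, 0};
        int i;

        /* prime with 32 bits, as the block decoder does at the start */
        toy_pull_bits(&s);
        toy_pull_bits(&s);

        for (i = 0; i < 32; i++) {
                if (s.remaining_bits == 16 && toy_pull_bits(&s) != 0) {
                        break;           /* stream exhausted */
                }
                s.remaining_bits--;
                putchar('0' + ((s.bits >> s.remaining_bits) & 1));
        }
        /* prints "1000000000000001": the first word, MSB first.  The
         * last word stays in the reservoir; the real decoder drains
         * such a tail in its EOF-checking loop, where running out of
         * input bytes is expected. */
        putchar('\n');
        return 0;
}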
1684
/*
1685
 * Decompress a block. The actual decompressed size is returned (or -1 on
1686
 * error). The putative block length is 64k (or shorter, if the message ends
1687
 * first), but a match can run over the end, extending the block. That's why
1688
 * we need the overall output size as well as the block size. A match encoded
1689
 * in this block can point back to previous blocks, but not before the
1690
 * beginning of the message, so we also need the previously decoded size.
1691
 *
1692
 * The compressed block will have 256 bytes for the Huffman table, and at
1693
 * least 4 bytes of (possibly padded) encoded values.
1694
 */
1695
static ssize_t lzx_huffman_decompress_block(struct bitstream *input,
1696
              uint8_t *output,
1697
              size_t block_size,
1698
              size_t output_size,
1699
              size_t previous_size)
1700
9.98k
{
1701
9.98k
  size_t output_pos = 0;
1702
9.98k
  uint16_t symbol;
1703
9.98k
  size_t index;
1704
9.98k
  uint16_t distance_bits_wanted = 0;
1705
9.98k
  size_t distance = 0;
1706
9.98k
  size_t length = 0;
1707
9.98k
  bool ok;
1708
9.98k
  uint32_t tmp;
1709
9.98k
  bool seen_eof_marker = false;
1710
1711
9.98k
  ok = fill_decomp_table(input);
1712
9.98k
  if (! ok) {
1713
330
    return LZXPRESS_ERROR;
1714
330
  }
1715
9.65k
  if (CHECK_DEBUGLVL(10) || DEBUG_HUFFMAN_TREE) {
1716
0
    debug_huffman_tree_from_table(input->table);
1717
0
  }
1718
  /*
1719
   * Always read 32 bits at the start, even if we don't need them.
1720
   */
1721
9.65k
  CHECK_READ_16(tmp);
1722
9.65k
  CHECK_READ_16(input->bits);
1723
9.65k
  input->bits |= tmp << 16;
1724
9.65k
  input->remaining_bits = 32;
1725
1726
  /*
1727
   * This loop iterates over individual *bits*. These are read from
1728
   * little-endian 16 bit words, most significant bit first.
1729
   *
1730
   * At points in the bitstream, the following are possible:
1731
   *
1732
   * # the source word is empty and needs to be refilled from the input
1733
   *    stream.
1734
   * # an incomplete codeword is being extended.
1735
   * # a codeword is resolved, either as a literal or a match.
1736
   * # a literal is written.
1737
   * # a match is collecting distance bits.
1738
   * # the output stream is copied, as specified by a match.
1739
   * # input bytes are read for match lengths.
1740
   *
1741
   * Note that we *don't* specifically check for the EOF marker (symbol
1742
   * 256) in this loop, because the precondition for stopping for the
1743
   * EOF marker is that the output buffer is full (otherwise, you
1744
   * wouldn't know which 256 is EOF, rather than an actual symbol), and
1745
   * we *always* want to stop when the buffer is full. So we work out if
1746
   * there is an EOF in another loop after we stop writing.
1747
   */
1748
1749
9.65k
  index = 0;
1750
557M
  while (output_pos < block_size) {
1751
557M
    uint16_t b;
1752
557M
    if (input->remaining_bits == 16) {
1753
34.8M
      ssize_t ret = pull_bits(input);
1754
34.8M
      if (ret) {
1755
167
        return ret;
1756
167
      }
1757
34.8M
    }
1758
557M
    input->remaining_bits--;
1759
1760
557M
    b = (input->bits >> input->remaining_bits) & 1;
1761
557M
    if (length == 0) {
1762
      /* not in a match; pulling a codeword */
1763
540M
      index <<= 1;
1764
540M
      index += b + 1;
1765
540M
      if (input->table[index] == 0xffff) {
1766
        /* incomplete codeword, the common case */
1767
468M
        continue;
1768
468M
      }
1769
      /* found the symbol, reset the code string */
1770
71.7M
      symbol = input->table[index] & 511;
1771
71.7M
      index = 0;
1772
71.7M
      if (symbol < 256) {
1773
        /* a literal, the easy case */
1774
66.0M
        output[output_pos] = symbol;
1775
66.0M
        output_pos++;
1776
66.0M
        continue;
1777
66.0M
      }
1778
1779
      /* the beginning of a match */
1780
5.64M
      distance_bits_wanted = (symbol >> 4) & 15;
1781
5.64M
      distance = 1 << distance_bits_wanted;
1782
5.64M
      length = symbol & 15;
1783
5.64M
      if (length == 15) {
1784
167k
        CHECK_READ_8(tmp);
1785
167k
        length += tmp;
1786
167k
        if (length == 255 + 15) {
1787
          /*
1788
           * note, we discard (don't add) the
1789
           * length so far.
1790
           */
1791
18.9k
          CHECK_READ_16(length);
1792
18.9k
          if (length == 0) {
1793
2.64k
            CHECK_READ_32(length);
1794
2.64k
          }
1795
18.9k
        }
1796
167k
      }
1797
5.64M
      length += 3;
1798
17.7M
    } else {
1799
      /* we are pulling extra distance bits */
1800
17.7M
      distance_bits_wanted--;
1801
17.7M
      distance |= b << distance_bits_wanted;
1802
17.7M
    }
1803
1804
23.4M
    if (distance_bits_wanted == 0) {
1805
      /*
1806
       * We have a complete match, and it is time to do the
1807
       * copy (byte by byte, because the ranges can overlap,
1808
       * and we might need to copy bytes we just copied in).
1809
       *
1810
       * It is possible that this match will extend beyond
1811
       * the end of the expected block. That's fine, so long
1812
       * as it doesn't extend past the total output size.
1813
       */
1814
5.64M
      size_t i;
1815
5.64M
      size_t end = output_pos + length;
1816
5.64M
      uint8_t *here = output + output_pos;
1817
5.64M
      uint8_t *there = here - distance;
1818
5.64M
      if (end > output_size ||
1819
5.64M
          previous_size + output_pos < distance ||
1820
5.64M
          unlikely(end < output_pos || there > here)) {
1821
161
        return LZXPRESS_ERROR;
1822
161
      }
1823
2.06G
      for (i = 0; i < length; i++) {
1824
2.06G
        here[i] = there[i];
1825
2.06G
      }
1826
5.64M
      output_pos += length;
1827
5.64M
      distance = 0;
1828
5.64M
      length = 0;
1829
5.64M
    }
1830
23.4M
  }
1831
1832
9.23k
  if (length != 0 || index != 0) {
1833
    /* it seems like we've hit an early end, mid-code */
1834
0
    return LZXPRESS_ERROR;
1835
0
  }
1836
1837
9.23k
  if (input->byte_pos + 256 < input->byte_size) {
1838
    /*
1839
     * This block is over, but it clearly isn't the last block, so
1840
     * we don't want to look for the EOF.
1841
     */
1842
7.35k
    return output_pos;
1843
7.35k
  }
1844
  /*
1845
   * We won't write any more, but we try to read some more to make sure
1846
   * we're finishing in a good place. That means we want to see a 256
1847
   * symbol and then some number of zero bits, possibly none, but as many
1848
   * as 32.
1849
   *
1850
   * In this we are perhaps a bit stricter than Windows, which
1851
   * apparently does not insist on the EOF marker, nor on a lack of
1852
   * trailing bytes.
1853
   */
1854
77.2k
  while (true) {
1855
77.2k
    uint16_t b;
1856
77.2k
    if (input->remaining_bits == 16) {
1857
5.61k
      ssize_t ret;
1858
5.61k
      if (input->byte_pos == input->byte_size) {
1859
        /* FIN */
1860
1.69k
        break;
1861
1.69k
      }
1862
3.92k
      ret = pull_bits(input);
1863
3.92k
      if (ret) {
1864
0
        return ret;
1865
0
      }
1866
3.92k
    }
1867
75.5k
    input->remaining_bits--;
1868
75.5k
    b = (input->bits >> input->remaining_bits) & 1;
1869
75.5k
    if (seen_eof_marker) {
1870
      /*
1871
       * we have read an EOF symbols. Now we just want to
1872
       * see zeroes.
1873
       */
1874
64.3k
      if (b != 0) {
1875
24
        return LZXPRESS_ERROR;
1876
24
      }
1877
64.2k
      continue;
1878
64.3k
    }
1879
1880
    /* we're pulling in a symbol, which had better be 256 */
1881
11.2k
    index <<= 1;
1882
11.2k
    index += b + 1;
1883
11.2k
    if (input->table[index] == 0xffff) {
1884
9.40k
      continue;
1885
9.40k
    }
1886
1887
1.86k
    symbol = input->table[index] & 511;
1888
1.86k
    if (symbol != 256) {
1889
156
      return LZXPRESS_ERROR;
1890
156
    }
1891
1.70k
    seen_eof_marker = true;
1892
1.70k
    continue;
1893
1.86k
  }
1894
1895
1.69k
  if (! seen_eof_marker) {
1896
10
    return LZXPRESS_ERROR;
1897
10
  }
1898
1899
1.68k
  return output_pos;
1900
1.69k
}
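
The copy loop near the end of the function above must run byte by byte because the source and destination ranges can overlap: with distance < length, later iterations read bytes the same copy has just written, which is how LZ77 expresses run-length expansion. memcpy() or memmove() would not reproduce this. A hypothetical standalone demonstration with invented buffer contents:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

int main(void)
{
        uint8_t buf[16] = "ab";          /* invented prior output */
        size_t output_pos = 2;
        size_t distance = 1;             /* match points one byte back */
        size_t length = 5;
        size_t i;

        uint8_t *here = buf + output_pos;
        uint8_t *there = here - distance;
        for (i = 0; i < length; i++) {
                here[i] = there[i];      /* may read a byte written above */
        }
        output_pos += length;
        buf[output_pos] = '\0';
        printf("%s\n", (char *)buf);     /* prints "abbbbbb" */
        return 0;
}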
1901
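
The match length handling in the block decoder escalates through escape levels: 4 bits carried in the symbol, then one extra byte, then (discarding the count so far) a 16-bit value, then a 32-bit value, with the minimum match length of 3 added at the end. A hedged standalone rendering of just that chain; toy_decode_length is an invented name, and the real code reads through the CHECK_READ_* macros and the bitstream struct:

#include <stdint.h>
#include <stddef.h>
#include <sys/types.h>

static ssize_t toy_decode_length(const uint8_t *p, size_t avail,
                                 uint16_t symbol, size_t *consumed)
{
        size_t length = symbol & 15;     /* 4 bits live in the symbol */
        size_t pos = 0;

        if (length == 15) {              /* escape: one extra byte */
                if (pos >= avail) {
                        return -1;
                }
                length += p[pos++];
                if (length == 255 + 15) {
                        /* escape again: a 16-bit length replaces
                         * (does not add to) the count so far */
                        if (pos + 2 > avail) {
                                return -1;
                        }
                        length = p[pos] | (p[pos + 1] << 8);
                        pos += 2;
                        if (length == 0) {   /* final escape: 32 bits */
                                if (pos + 4 > avail) {
                                        return -1;
                                }
                                length = (size_t)p[pos]
                                        | ((size_t)p[pos + 1] << 8)
                                        | ((size_t)p[pos + 2] << 16)
                                        | ((size_t)p[pos + 3] << 24);
                                pos += 4;
                        }
                }
        }
        *consumed = pos;
        return (ssize_t)(length + 3);    /* minimum match length is 3 */
}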
1902
static ssize_t lzxpress_huffman_decompress_internal(struct bitstream *input,
1903
                uint8_t *output,
1904
                size_t output_size)
1905
2.73k
{
1906
2.73k
  size_t output_pos = 0;
1907
1908
2.73k
  if (input->byte_size < 260) {
1909
103
    return LZXPRESS_ERROR;
1910
103
  }
1911
1912
11.6k
  while (input->byte_pos < input->byte_size) {
1913
9.98k
    ssize_t block_output_pos;
1914
9.98k
    ssize_t block_output_size;
1915
9.98k
    size_t remaining_output_size = output_size - output_pos;
1916
1917
9.98k
    block_output_size = MIN(65536, remaining_output_size);
1918
1919
9.98k
    block_output_pos = lzx_huffman_decompress_block(
1920
9.98k
      input,
1921
9.98k
      output + output_pos,
1922
9.98k
      block_output_size,
1923
9.98k
      remaining_output_size,
1924
9.98k
      output_pos);
1925
1926
9.98k
    if (block_output_pos < block_output_size) {
1927
944
      return LZXPRESS_ERROR;
1928
944
    }
1929
9.04k
    output_pos += block_output_pos;
1930
9.04k
    if (output_pos > output_size) {
1931
      /* not expecting to get here. */
1932
0
      return LZXPRESS_ERROR;
1933
0
    }
1934
9.04k
  }
1935
1936
1.68k
  if (input->byte_pos != input->byte_size) {
1937
0
    return LZXPRESS_ERROR;
1938
0
  }
1939
1940
1.68k
  return output_pos;
1941
1.68k
}
1942
1943
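
Hedged back-of-envelope arithmetic for the framing this function walks: each block contributes a 256-byte Huffman table plus at least 4 bytes of padded bitstream, which is where the `byte_size < 260` check and the `byte_pos + 256 < byte_size` not-the-last-block test come from. The block count below ignores matches that run over a block boundary (which would reduce it), so it is illustrative only.

#include <stdio.h>
#include <stddef.h>

int main(void)
{
        size_t output_size = 200000;     /* invented example */
        /* one block per started 64KiB of output, ignoring matches
         * that run over a block boundary */
        size_t blocks = (output_size + 65535) / 65536;
        size_t min_compressed = blocks * (256 + 4);

        /* prints "4 blocks, at least 1040 compressed bytes" */
        printf("%zu blocks, at least %zu compressed bytes\n",
               blocks, min_compressed);
        return 0;
}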
1944
/*
1945
 * lzxpress_huffman_decompress()
1946
 *
1947
 * output_size must be the expected length of the decompressed data.
1948
 * input_size and output_size are limited to the minimum of UINT32_MAX and
1949
 * SSIZE_MAX. On 64 bit machines that will be UINT32_MAX, or 4GB.
1950
 *
1951
 * @param input_bytes  memory to be decompressed.
1952
 * @param input_size   length of the compressed buffer.
1953
 * @param output       destination for the decompressed data.
1954
 * @param output_size  exact expected length of the decompressed data.
1955
 *
1956
 * @return the number of bytes written or -1 on error.
1957
 */
1958
1959
ssize_t lzxpress_huffman_decompress(const uint8_t *input_bytes,
1960
            size_t input_size,
1961
            uint8_t *output,
1962
            size_t output_size)
1963
2.81k
{
1964
2.81k
  uint16_t table[65536];
1965
2.81k
  struct bitstream input = {
1966
2.81k
    .bytes = input_bytes,
1967
2.81k
    .byte_size = input_size,
1968
2.81k
    .byte_pos = 0,
1969
2.81k
    .bits = 0,
1970
2.81k
    .remaining_bits = 0,
1971
2.81k
    .table = table
1972
2.81k
  };
1973
1974
2.81k
  if (input_size > SSIZE_MAX ||
1975
2.81k
      input_size > UINT32_MAX ||
1976
2.81k
      output_size > SSIZE_MAX ||
1977
2.81k
      output_size > UINT32_MAX ||
1978
2.81k
      input_size == 0 ||
1979
2.73k
      output_size == 0 ||
1980
2.73k
      input_bytes == NULL ||
1981
2.73k
      output == NULL) {
1982
    /*
1983
     * We use negative ssize_t to return errors, which is limiting
1984
     * on 32 bit machines, and the 4GB limit exists on Windows.
1985
     */
1986
82
    return LZXPRESS_ERROR;
1987
82
  }
1988
1989
2.73k
  return lzxpress_huffman_decompress_internal(&input,
1990
2.73k
                output,
1991
2.73k
                output_size);
1992
2.81k
}
1993
1994
1995
/**
1996
 * lzxpress_huffman_decompress_talloc()
1997
 *
1998
 * The caller must provide the exact size of the expected output.
1999
 *
2000
 * The input_size is limited to the minimum of UINT32_MAX and SSIZE_MAX, but
2001
 * output_size is limited to 256MB due to a limit in talloc. This effectively
2002
 * limits input_size too, as non-crafted compressed data will not exceed the
2003
 * decompressed size by very much.
2004
 *
2005
 * @param mem_ctx      TALLOC_CTX parent for the decompressed buffer.
2006
 * @param input_bytes  memory to be decompressed.
2007
 * @param input_size   length of the compressed buffer.
2008
 * @param output_size  expected decompressed size.
2009
 *
2010
 * @return a talloc'ed buffer exactly output_size in length, or NULL.
2011
 */
2012
2013
uint8_t *lzxpress_huffman_decompress_talloc(TALLOC_CTX *mem_ctx,
2014
              const uint8_t *input_bytes,
2015
              size_t input_size,
2016
              size_t output_size)
2017
0
{
2018
0
  ssize_t result;
2019
0
  uint8_t *output = NULL;
2020
0
  struct bitstream input = {
2021
0
    .bytes = input_bytes,
2022
0
    .byte_size = input_size
2023
0
  };
2024
2025
0
  output = talloc_array(mem_ctx, uint8_t, output_size);
2026
0
  if (output == NULL) {
2027
0
    return NULL;
2028
0
  }
2029
2030
0
  input.table = talloc_array(mem_ctx, uint16_t, 65536);
2031
0
  if (input.table == NULL) {
2032
0
    talloc_free(output);
2033
0
    return NULL;
2034
0
  }
2035
0
  result = lzxpress_huffman_decompress_internal(&input,
2036
0
                  output,
2037
0
                  output_size);
2038
0
  talloc_free(input.table);
2039
2040
0
  if (result != output_size) {
2041
0
    talloc_free(output);
2042
0
    return NULL;
2043
0
  }
2044
0
  return output;
2045
0
}
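
A hedged usage sketch of the two public entry points above; comp, comp_len, and orig_len are placeholders for data obtained elsewhere, since the format does not record the decompressed size and the caller must know it out of band.

#include <talloc.h>
#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>
#include <sys/types.h>
#include "lzxpress_huffman.h"

/* Hypothetical caller: decompress into a caller-provided buffer. */
static bool example_decompress(const uint8_t *comp, size_t comp_len,
                               uint8_t *out, size_t orig_len)
{
        ssize_t n = lzxpress_huffman_decompress(comp, comp_len,
                                                out, orig_len);
        /* mirror the check the talloc wrapper makes: success means
         * exactly orig_len bytes were produced */
        return n == (ssize_t)orig_len;
}

/* Hypothetical caller: let talloc own the output buffer. */
static uint8_t *example_decompress_talloc(TALLOC_CTX *mem_ctx,
                                          const uint8_t *comp,
                                          size_t comp_len,
                                          size_t orig_len)
{
        /* returns NULL on any error, including a size mismatch;
         * note the 256MB talloc limit on orig_len described above */
        return lzxpress_huffman_decompress_talloc(mem_ctx, comp,
                                                  comp_len, orig_len);
}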