Coverage Report

Created: 2025-03-04 07:22

/src/serenity/Userland/Libraries/LibCompress/Lzma.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2023, Tim Schumacher <timschumi@gmx.de>
3
 *
4
 * SPDX-License-Identifier: BSD-2-Clause
5
 */
6
7
#include <AK/Debug.h>
8
#include <AK/IntegralMath.h>
9
#include <LibCompress/Lzma.h>
10
11
namespace Compress {
12
13
u32 LzmaHeader::dictionary_size() const
14
4.82k
{
15
    // "If the value of dictionary size in properties is smaller than (1 << 12),
16
    //  the LZMA decoder must set the dictionary size variable to (1 << 12)."
17
4.82k
    constexpr u32 minimum_dictionary_size = (1 << 12);
18
4.82k
    if (unchecked_dictionary_size < minimum_dictionary_size)
19
479
        return minimum_dictionary_size;
20
21
4.34k
    return unchecked_dictionary_size;
22
4.82k
}
23
24
Optional<u64> LzmaHeader::uncompressed_size() const
25
3.34k
{
26
    // We are making a copy of the packed field here because we would otherwise
27
    // pass an unaligned reference to the constructor of Optional, which is
28
    // undefined behavior.
29
3.34k
    auto uncompressed_size = encoded_uncompressed_size;
30
31
    // "If "Uncompressed size" field contains ones in all 64 bits, it means that
32
    //  uncompressed size is unknown and there is the "end marker" in stream,
33
    //  that indicates the end of decoding point."
34
3.34k
    if (uncompressed_size == placeholder_for_unknown_uncompressed_size)
35
1.98k
        return {};
36
37
    // "In opposite case, if the value from "Uncompressed size" field is not
38
    //  equal to ((2^64) - 1), the LZMA stream decoding must be finished after
39
    //  specified number of bytes (Uncompressed size) is decoded. And if there
40
    //  is the "end marker", the LZMA decoder must read that marker also."
41
1.35k
    return uncompressed_size;
42
3.34k
}
43
44
ErrorOr<LzmaModelProperties> LzmaHeader::decode_model_properties(u8 input_bits)
45
3.34k
{
46
    // "Decodes the following values from the encoded model properties field:
47
    //
48
    //     name  Range          Description
49
    //       lc  [0, 8]         the number of "literal context" bits
50
    //       lp  [0, 4]         the number of "literal pos" bits
51
    //       pb  [0, 4]         the number of "pos" bits
52
    //
53
    //  Encoded using `((pb * 5 + lp) * 9 + lc)`."
54
55
3.34k
    if (input_bits >= (9 * 5 * 5))
56
1
        return Error::from_string_literal("Encoded model properties value is larger than the highest possible value");
57
58
3.34k
    u8 literal_context_bits = input_bits % 9;
59
3.34k
    input_bits /= 9;
60
3.34k
    VERIFY(literal_context_bits >= 0 && literal_context_bits <= 8);
61
62
3.34k
    u8 literal_position_bits = input_bits % 5;
63
3.34k
    input_bits /= 5;
64
3.34k
    VERIFY(literal_position_bits >= 0 && literal_position_bits <= 4);
65
66
3.34k
    u8 position_bits = input_bits;
67
3.34k
    VERIFY(position_bits >= 0 && position_bits <= 4);
68
69
3.34k
    return LzmaModelProperties {
70
3.34k
        .literal_context_bits = literal_context_bits,
71
3.34k
        .literal_position_bits = literal_position_bits,
72
3.34k
        .position_bits = position_bits,
73
3.34k
    };
74
3.34k
}
75
76
ErrorOr<u8> LzmaHeader::encode_model_properties(LzmaModelProperties const& model_properties)
77
1.88k
{
78
1.88k
    if (model_properties.literal_context_bits > 8)
79
0
        return Error::from_string_literal("LZMA literal context bits are too large to encode");
80
81
1.88k
    if (model_properties.literal_position_bits > 4)
82
0
        return Error::from_string_literal("LZMA literal position bits are too large to encode");
83
84
1.88k
    if (model_properties.position_bits > 4)
85
0
        return Error::from_string_literal("LZMA position bits are too large to encode");
86
87
1.88k
    return (model_properties.position_bits * 5 + model_properties.literal_position_bits) * 9 + model_properties.literal_context_bits;
88
1.88k
}
89
90
ErrorOr<LzmaDecompressorOptions> LzmaHeader::as_decompressor_options() const
91
3.34k
{
92
3.34k
    auto model_properties = TRY(decode_model_properties(encoded_model_properties));
93
94
0
    return Compress::LzmaDecompressorOptions {
95
3.34k
        .literal_context_bits = model_properties.literal_context_bits,
96
3.34k
        .literal_position_bits = model_properties.literal_position_bits,
97
3.34k
        .position_bits = model_properties.position_bits,
98
3.34k
        .dictionary_size = dictionary_size(),
99
3.34k
        .uncompressed_size = uncompressed_size(),
100
3.34k
        .reject_end_of_stream_marker = false,
101
3.34k
    };
102
3.34k
}
103
104
ErrorOr<LzmaHeader> LzmaHeader::from_compressor_options(LzmaCompressorOptions const& options)
105
1.88k
{
106
1.88k
    auto encoded_model_properties = TRY(encode_model_properties({
107
1.88k
        .literal_context_bits = options.literal_context_bits,
108
1.88k
        .literal_position_bits = options.literal_position_bits,
109
1.88k
        .position_bits = options.position_bits,
110
1.88k
    }));
111
112
0
    return LzmaHeader {
113
1.88k
        .encoded_model_properties = encoded_model_properties,
114
1.88k
        .unchecked_dictionary_size = options.dictionary_size,
115
1.88k
        .encoded_uncompressed_size = options.uncompressed_size.value_or(placeholder_for_unknown_uncompressed_size),
116
1.88k
    };
117
1.88k
}
118
119
void LzmaState::initialize_to_default_probability(Span<Probability> span)
120
460k
{
121
460k
    for (auto& entry : span)
122
332M
        entry = default_probability;
123
460k
}
124
125
ErrorOr<NonnullOwnPtr<LzmaDecompressor>> LzmaDecompressor::create_from_container(MaybeOwned<Stream> stream, Optional<MaybeOwned<CircularBuffer>> dictionary)
126
3.35k
{
127
3.35k
    auto header = TRY(stream->read_value<LzmaHeader>());
128
129
3.34k
    return TRY(LzmaDecompressor::create_from_raw_stream(move(stream), TRY(header.as_decompressor_options()), move(dictionary)));
130
3.34k
}
131
132
ErrorOr<NonnullOwnPtr<LzmaDecompressor>> LzmaDecompressor::create_from_raw_stream(MaybeOwned<Stream> stream, LzmaDecompressorOptions const& options, Optional<MaybeOwned<CircularBuffer>> dictionary)
133
3.34k
{
134
3.34k
    if (!dictionary.has_value()) {
135
3.34k
        auto new_dictionary = TRY(CircularBuffer::create_empty(options.dictionary_size));
136
3.34k
        dictionary = TRY(try_make<CircularBuffer>(move(new_dictionary)));
137
3.34k
    }
138
139
3.34k
    VERIFY((*dictionary)->capacity() >= options.dictionary_size);
140
141
    // "The LZMA Decoder uses (1 << (lc + lp)) tables with CProb values, where each table contains 0x300 CProb values."
142
3.34k
    auto literal_probabilities = TRY(FixedArray<Probability>::create(literal_probability_table_size * (1 << (options.literal_context_bits + options.literal_position_bits))));
143
144
3.34k
    auto decompressor = TRY(adopt_nonnull_own_or_enomem(new (nothrow) LzmaDecompressor(move(stream), options, dictionary.release_value(), move(literal_probabilities))));
145
146
3.34k
    TRY(decompressor->initialize_range_decoder());
147
148
0
    return decompressor;
149
3.34k
}
150
151
LzmaState::LzmaState(FixedArray<Probability> literal_probabilities)
152
5.23k
    : m_literal_probabilities(move(literal_probabilities))
153
5.23k
{
154
5.23k
    initialize_to_default_probability(m_literal_probabilities.span());
155
156
5.23k
    for (auto& array : m_length_to_position_states)
157
20.9k
        initialize_to_default_probability(array);
158
159
5.23k
    for (auto& array : m_binary_tree_distance_probabilities)
160
52.3k
        initialize_to_default_probability(array);
161
162
5.23k
    initialize_to_default_probability(m_alignment_bit_probabilities);
163
164
5.23k
    initialize_to_default_probability(m_is_match_probabilities);
165
5.23k
    initialize_to_default_probability(m_is_rep_probabilities);
166
5.23k
    initialize_to_default_probability(m_is_rep_g0_probabilities);
167
5.23k
    initialize_to_default_probability(m_is_rep_g1_probabilities);
168
5.23k
    initialize_to_default_probability(m_is_rep_g2_probabilities);
169
5.23k
    initialize_to_default_probability(m_is_rep0_long_probabilities);
170
5.23k
}
171
172
LzmaDecompressor::LzmaDecompressor(MaybeOwned<Stream> stream, LzmaDecompressorOptions options, MaybeOwned<CircularBuffer> dictionary, FixedArray<Probability> literal_probabilities)
173
3.34k
    : LzmaState(move(literal_probabilities))
174
3.34k
    , m_stream(move(stream))
175
3.34k
    , m_options(move(options))
176
3.34k
    , m_dictionary(move(dictionary))
177
3.34k
{
178
3.34k
}
179
180
bool LzmaDecompressor::is_range_decoder_in_clean_state() const
181
2.04k
{
182
2.04k
    return m_range_decoder_code == 0;
183
2.04k
}
184
185
bool LzmaDecompressor::has_reached_expected_data_size() const
186
49.4M
{
187
49.4M
    if (!m_options.uncompressed_size.has_value())
188
13.5M
        return false;
189
190
35.8M
    return m_total_processed_bytes >= m_options.uncompressed_size.value();
191
49.4M
}
192
193
ErrorOr<void> LzmaDecompressor::initialize_range_decoder()
194
3.34k
{
195
    // "The LZMA Encoder always writes ZERO in initial byte of compressed stream.
196
    //  That scheme allows to simplify the code of the Range Encoder in the
197
    //  LZMA Encoder. If initial byte is not equal to ZERO, the LZMA Decoder must
198
    //  stop decoding and report error."
199
3.34k
    {
200
3.34k
        auto byte = TRY(m_stream->read_value<u8>());
201
3.23k
        if (byte != 0)
202
13
            return Error::from_string_literal("Initial byte of data stream is not zero");
203
3.23k
    }
204
205
    // Read the initial bytes into the range decoder.
206
3.22k
    m_range_decoder_code = 0;
207
16.1k
    for (size_t i = 0; i < 4; i++) {
208
12.8k
        auto byte = TRY(m_stream->read_value<u8>());
209
0
        m_range_decoder_code = m_range_decoder_code << 8 | byte;
210
12.8k
    }
211
212
3.22k
    m_range_decoder_range = 0xFFFFFFFF;
213
214
3.22k
    return {};
215
3.22k
}
216
217
ErrorOr<void> LzmaDecompressor::append_input_stream(MaybeOwned<Stream> stream, Optional<u64> uncompressed_size)
218
0
{
219
0
    m_stream = move(stream);
220
221
0
    TRY(initialize_range_decoder());
222
223
0
    if (m_options.uncompressed_size.has_value() != uncompressed_size.has_value())
224
0
        return Error::from_string_literal("Appending LZMA streams with mismatching uncompressed size status");
225
226
0
    if (uncompressed_size.has_value())
227
0
        *m_options.uncompressed_size += *uncompressed_size;
228
229
0
    return {};
230
0
}
231
232
ErrorOr<void> LzmaDecompressor::normalize_range_decoder()
233
282M
{
234
    // "The Normalize() function keeps the "Range" value in described range."
235
236
282M
    if (m_range_decoder_range >= minimum_range_value)
237
277M
        return {};
238
239
5.11M
    m_range_decoder_range <<= 8;
240
5.11M
    m_range_decoder_code <<= 8;
241
242
5.11M
    m_range_decoder_code |= TRY(m_stream->read_value<u8>());
243
244
5.11M
    VERIFY(m_range_decoder_range >= minimum_range_value);
245
246
5.11M
    return {};
247
5.11M
}
248
249
ErrorOr<void> LzmaCompressor::shift_range_encoder()
250
4.47M
{
251
4.47M
    if ((m_range_encoder_code >> 32) == 0x01) {
252
        // If there is an overflow, we can finalize the chain we were previously building.
253
        // This includes incrementing both the cached byte and all the 0xFF bytes that we generate.
254
1.55M
        VERIFY(m_range_encoder_cached_byte != 0xFF);
255
3.10M
        TRY(m_stream->write_value<u8>(m_range_encoder_cached_byte + 1));
256
1.56M
        for (size_t i = 0; i < m_range_encoder_ff_chain_length; i++)
257
6.19k
            TRY(m_stream->write_value<u8>(0x00));
258
1.55M
        m_range_encoder_ff_chain_length = 0;
259
1.55M
        m_range_encoder_cached_byte = (m_range_encoder_code >> 24);
260
2.92M
    } else if ((m_range_encoder_code >> 24) == 0xFF) {
261
        // If the byte to flush is 0xFF, it can potentially propagate an overflow and needs to be added to the chain.
262
20.2k
        m_range_encoder_ff_chain_length++;
263
2.90M
    } else {
264
        // If the byte to flush isn't 0xFF, any future overflows will not be propagated beyond this point,
265
        // so we can be sure that the built chain doesn't change anymore.
266
2.90M
        TRY(m_stream->write_value<u8>(m_range_encoder_cached_byte));
267
2.91M
        for (size_t i = 0; i < m_range_encoder_ff_chain_length; i++)
268
14.0k
            TRY(m_stream->write_value<u8>(0xFF));
269
2.90M
        m_range_encoder_ff_chain_length = 0;
270
2.90M
        m_range_encoder_cached_byte = (m_range_encoder_code >> 24);
271
2.90M
    }
272
273
    // In all three cases we now recorded the highest byte in some way, so we can shift it away and shift in a null byte as the lowest byte.
274
4.47M
    m_range_encoder_range <<= 8;
275
4.47M
    m_range_encoder_code <<= 8;
276
277
    // Since we are working with a 64-bit code, we need to limit it to 32 bits artificially.
278
4.47M
    m_range_encoder_code &= 0xFFFFFFFF;
279
280
4.47M
    return {};
281
4.47M
}
282
283
ErrorOr<void> LzmaCompressor::normalize_range_encoder()
284
71.8M
{
285
71.8M
    u64 const maximum_range_value = m_range_encoder_code + m_range_encoder_range;
286
287
    // Logically, we should only ever build up an overflow that is smaller than or equal to 0x01.
288
71.8M
    VERIFY((maximum_range_value >> 32) <= 0x01);
289
290
71.8M
    if (m_range_encoder_range >= minimum_range_value)
291
67.3M
        return {};
292
293
8.94M
    TRY(shift_range_encoder());
294
295
4.47M
    VERIFY(m_range_encoder_range >= minimum_range_value);
296
297
4.47M
    return {};
298
8.94M
}
299
300
ErrorOr<u8> LzmaDecompressor::decode_direct_bit()
301
9.38M
{
302
9.38M
    dbgln_if(LZMA_DEBUG, "Decoding direct bit {} with code = {:#x}, range = {:#x}", 1 - ((m_range_decoder_code - (m_range_decoder_range >> 1)) >> 31), m_range_decoder_code, m_range_decoder_range);
303
304
9.38M
    m_range_decoder_range >>= 1;
305
9.38M
    m_range_decoder_code -= m_range_decoder_range;
306
307
9.38M
    u32 temp = 0 - (m_range_decoder_code >> 31);
308
309
9.38M
    m_range_decoder_code += m_range_decoder_range & temp;
310
311
9.38M
    if (m_range_decoder_code == m_range_decoder_range)
312
1
        return Error::from_string_literal("Reached an invalid state while decoding LZMA stream");
313
314
9.38M
    TRY(normalize_range_decoder());
315
316
0
    return temp + 1;
317
9.38M
}
318
319
ErrorOr<void> LzmaCompressor::encode_direct_bit(u8 value)
320
9.35M
{
321
9.35M
    dbgln_if(LZMA_DEBUG, "Encoding direct bit {} with code = {:#x}, range = {:#x}", value, m_range_encoder_code, m_range_encoder_range);
322
323
9.35M
    m_range_encoder_range >>= 1;
324
325
9.35M
    if (value != 0)
326
4.59M
        m_range_encoder_code += m_range_encoder_range;
327
328
9.35M
    TRY(normalize_range_encoder());
329
330
0
    return {};
331
9.35M
}
332
333
ErrorOr<u8> LzmaDecompressor::decode_bit_with_probability(Probability& probability)
334
272M
{
335
    // "The LZMA decoder provides the pointer to CProb variable that contains
336
    //  information about estimated probability for symbol 0 and the Range Decoder
337
    //  updates that CProb variable after decoding."
338
339
272M
    u32 bound = (m_range_decoder_range >> probability_bit_count) * probability;
340
341
272M
    dbgln_if(LZMA_DEBUG, "Decoding bit {} with probability = {:#x}, bound = {:#x}, code = {:#x}, range = {:#x}", m_range_decoder_code < bound ? 0 : 1, probability, bound, m_range_decoder_code, m_range_decoder_range);
342
343
272M
    if (m_range_decoder_code < bound) {
344
111M
        probability += ((1 << probability_bit_count) - probability) >> probability_shift_width;
345
111M
        m_range_decoder_range = bound;
346
111M
        TRY(normalize_range_decoder());
347
0
        return 0;
348
161M
    } else {
349
161M
        probability -= probability >> probability_shift_width;
350
161M
        m_range_decoder_code -= bound;
351
161M
        m_range_decoder_range -= bound;
352
161M
        TRY(normalize_range_decoder());
353
0
        return 1;
354
161M
    }
355
272M
}
356
357
ErrorOr<void> LzmaCompressor::encode_bit_with_probability(Probability& probability, u8 value)
358
62.4M
{
359
62.4M
    u32 bound = (m_range_encoder_range >> probability_bit_count) * probability;
360
361
62.4M
    dbgln_if(LZMA_DEBUG, "Encoding bit {} with probability = {:#x}, bound = {:#x}, code = {:#x}, range = {:#x}", value, probability, bound, m_range_encoder_code, m_range_encoder_range);
362
363
62.4M
    if (value == 0) {
364
31.3M
        probability += ((1 << probability_bit_count) - probability) >> probability_shift_width;
365
31.3M
        m_range_encoder_range = bound;
366
31.3M
    } else {
367
31.1M
        probability -= probability >> probability_shift_width;
368
31.1M
        m_range_encoder_code += bound;
369
31.1M
        m_range_encoder_range -= bound;
370
31.1M
    }
371
372
62.4M
    TRY(normalize_range_encoder());
373
0
    return {};
374
62.4M
}
375
376
ErrorOr<u16> LzmaDecompressor::decode_symbol_using_bit_tree(size_t bit_count, Span<Probability> probability_tree)
377
14.6M
{
378
14.6M
    VERIFY(bit_count <= sizeof(u16) * 8);
379
14.6M
    VERIFY(probability_tree.size() >= 1ul << bit_count);
380
381
    // This has been modified from the reference implementation to unlink the result and the tree index,
382
    // which should allow for better readability.
383
384
14.6M
    u16 result = 0;
385
14.6M
    size_t tree_index = 1;
386
387
107M
    for (size_t i = 0; i < bit_count; i++) {
388
92.9M
        u16 next_bit = TRY(decode_bit_with_probability(probability_tree[tree_index]));
389
0
        result = (result << 1) | next_bit;
390
92.9M
        tree_index = (tree_index << 1) | next_bit;
391
92.9M
    }
392
393
14.6M
    dbgln_if(LZMA_DEBUG, "Decoded value {:#x} with {} bits using bit tree", result, bit_count);
394
395
14.6M
    return result;
396
14.6M
}
397
398
ErrorOr<void> LzmaCompressor::encode_symbol_using_bit_tree(size_t bit_count, Span<Probability> probability_tree, u16 value)
399
5.95M
{
400
5.95M
    VERIFY(bit_count <= sizeof(u16) * 8);
401
5.95M
    VERIFY(probability_tree.size() >= 1ul << bit_count);
402
5.95M
    VERIFY(value <= (1 << bit_count) - 1);
403
404
5.95M
    auto original_value = value;
405
406
    // Shift value to make the first sent byte the most significant bit. This makes the shifting logic a lot easier to read.
407
5.95M
    value <<= sizeof(u16) * 8 - bit_count;
408
409
5.95M
    size_t tree_index = 1;
410
411
29.4M
    for (size_t i = 0; i < bit_count; i++) {
412
23.4M
        u8 const next_bit = (value & 0x8000) >> (sizeof(u16) * 8 - 1);
413
23.4M
        value <<= 1;
414
23.4M
        TRY(encode_bit_with_probability(probability_tree[tree_index], next_bit));
415
0
        tree_index = (tree_index << 1) | next_bit;
416
23.4M
    }
417
418
5.95M
    dbgln_if(LZMA_DEBUG, "Encoded value {:#x} with {} bits using bit tree", original_value, bit_count);
419
420
5.95M
    return {};
421
5.95M
}
422
423
ErrorOr<u16> LzmaDecompressor::decode_symbol_using_reverse_bit_tree(size_t bit_count, Span<Probability> probability_tree)
424
1.31M
{
425
1.31M
    VERIFY(bit_count <= sizeof(u16) * 8);
426
1.31M
    VERIFY(probability_tree.size() >= 1ul << bit_count);
427
428
1.31M
    u16 result = 0;
429
1.31M
    size_t tree_index = 1;
430
431
6.54M
    for (size_t i = 0; i < bit_count; i++) {
432
5.23M
        u16 next_bit = TRY(decode_bit_with_probability(probability_tree[tree_index]));
433
0
        result |= next_bit << i;
434
5.23M
        tree_index = (tree_index << 1) | next_bit;
435
5.23M
    }
436
437
1.31M
    dbgln_if(LZMA_DEBUG, "Decoded value {:#x} with {} bits using reverse bit tree", result, bit_count);
438
439
1.31M
    return result;
440
1.31M
}
441
442
ErrorOr<void> LzmaCompressor::encode_symbol_using_reverse_bit_tree(size_t bit_count, Span<Probability> probability_tree, u16 value)
443
1.30M
{
444
1.30M
    VERIFY(bit_count <= sizeof(u16) * 8);
445
1.30M
    VERIFY(probability_tree.size() >= 1ul << bit_count);
446
1.30M
    VERIFY(value <= (1 << bit_count) - 1);
447
448
1.30M
    auto original_value = value;
449
450
1.30M
    size_t tree_index = 1;
451
452
6.51M
    for (size_t i = 0; i < bit_count; i++) {
453
5.21M
        u8 const next_bit = value & 1;
454
5.21M
        value >>= 1;
455
5.21M
        TRY(encode_bit_with_probability(probability_tree[tree_index], next_bit));
456
0
        tree_index = (tree_index << 1) | next_bit;
457
5.21M
    }
458
459
1.30M
    dbgln_if(LZMA_DEBUG, "Encoded value {:#x} with {} bits using reverse bit tree", original_value, bit_count);
460
461
1.30M
    return {};
462
1.30M
}
463
464
ErrorOr<void> LzmaDecompressor::decode_literal_to_output_buffer()
465
10.3M
{
466
10.3M
    u8 previous_byte = 0;
467
10.3M
    if (m_dictionary->seekback_limit() > 0) {
468
10.3M
        auto read_bytes = MUST(m_dictionary->read_with_seekback({ &previous_byte, sizeof(previous_byte) }, 1));
469
10.3M
        VERIFY(read_bytes.size() == sizeof(previous_byte));
470
10.3M
    }
471
472
    // "To select the table for decoding it uses the context that consists of
473
    //  (lc) high bits from previous literal and (lp) low bits from value that
474
    //  represents current position in outputStream."
475
10.3M
    u16 literal_state_bits_from_position = m_total_processed_bytes & ((1 << m_options.literal_position_bits) - 1);
476
10.3M
    u16 literal_state_bits_from_output = previous_byte >> (8 - m_options.literal_context_bits);
477
10.3M
    u16 literal_state = literal_state_bits_from_position << m_options.literal_context_bits | literal_state_bits_from_output;
478
479
10.3M
    Span<Probability> selected_probability_table = m_literal_probabilities.span().slice(literal_probability_table_size * literal_state, literal_probability_table_size);
480
481
    // The result is defined as u16 here and initialized to 1, but we will cut off the top bits before queueing them into the output buffer.
482
    // The top bit is only used to track how much we have decoded already, and to select the correct probability table.
483
10.3M
    u16 result = 1;
484
485
    // "If (State > 7), the Literal Decoder also uses "matchByte" that represents
486
    //  the byte in OutputStream at position the is the DISTANCE bytes before
487
    //  current position, where the DISTANCE is the distance in DISTANCE-LENGTH pair
488
    //  of latest decoded match."
489
    // Note: The specification says `(State > 7)`, but the reference implementation does `(State >= 7)`, which is a mismatch.
490
    //       Testing `(State > 7)` with actual test files yields errors, so the reference implementation appears to be the correct one.
491
10.3M
    if (m_state >= 7) {
492
142k
        u8 matched_byte = 0;
493
142k
        auto read_bytes = TRY(m_dictionary->read_with_seekback({ &matched_byte, sizeof(matched_byte) }, current_repetition_offset()));
494
142k
        VERIFY(read_bytes.size() == sizeof(matched_byte));
495
496
142k
        dbgln_if(LZMA_DEBUG, "Decoding literal using match byte {:#x}", matched_byte);
497
498
625k
        do {
499
625k
            u8 match_bit = (matched_byte >> 7) & 1;
500
625k
            matched_byte <<= 1;
501
502
625k
            u8 decoded_bit = TRY(decode_bit_with_probability(selected_probability_table[((1 + match_bit) << 8) + result]));
503
0
            result = result << 1 | decoded_bit;
504
505
625k
            if (match_bit != decoded_bit)
506
140k
                break;
507
625k
        } while (result < 0x100);
508
142k
    }
509
510
92.1M
    while (result < 0x100)
511
81.8M
        result = (result << 1) | TRY(decode_bit_with_probability(selected_probability_table[result]));
512
513
10.3M
    u8 actual_result = result - 0x100;
514
515
10.3M
    size_t written_bytes = m_dictionary->write({ &actual_result, sizeof(actual_result) });
516
10.3M
    VERIFY(written_bytes == sizeof(actual_result));
517
10.3M
    m_total_processed_bytes += sizeof(actual_result);
518
519
10.3M
    dbgln_if(LZMA_DEBUG, "Decoded literal {:#x} in state {} using literal state {:#x} (previous byte is {:#x})", actual_result, m_state, literal_state, previous_byte);
520
521
10.3M
    return {};
522
10.3M
}
523
524
ErrorOr<void> LzmaCompressor::encode_literal(u8 literal)
525
1.40M
{
526
    // This function largely mirrors `decode_literal_to_output_buffer`, so specification comments have been omitted.
527
528
1.40M
    TRY(encode_match_type(MatchType::Literal));
529
530
    // Note: We have already read the next byte from the input buffer, so it's now in the seekback buffer, shifting all seekback offsets by one.
531
0
    u8 previous_byte = 0;
532
1.40M
    if (m_dictionary->seekback_limit() - m_dictionary->used_space() > 1) {
533
1.40M
        auto read_bytes = MUST(m_dictionary->read_with_seekback({ &previous_byte, sizeof(previous_byte) }, 2 + m_dictionary->used_space()));
534
1.40M
        VERIFY(read_bytes.size() == sizeof(previous_byte));
535
1.40M
    }
536
1.40M
    u16 const literal_state_bits_from_position = m_total_processed_bytes & ((1 << m_options.literal_position_bits) - 1);
537
1.40M
    u16 const literal_state_bits_from_output = previous_byte >> (8 - m_options.literal_context_bits);
538
1.40M
    u16 const literal_state = literal_state_bits_from_position << m_options.literal_context_bits | literal_state_bits_from_output;
539
540
1.40M
    Span<Probability> selected_probability_table = m_literal_probabilities.span().slice(literal_probability_table_size * literal_state, literal_probability_table_size);
541
542
1.40M
    auto original_literal = literal;
543
1.40M
    u16 result = 1;
544
545
1.40M
    if (m_state >= 7) {
546
132k
        u8 matched_byte = 0;
547
132k
        auto read_bytes = TRY(m_dictionary->read_with_seekback({ &matched_byte, sizeof(matched_byte) }, current_repetition_offset() + m_dictionary->used_space() + 1));
548
132k
        VERIFY(read_bytes.size() == sizeof(matched_byte));
549
550
132k
        dbgln_if(LZMA_DEBUG, "Encoding literal using match byte {:#x}", matched_byte);
551
552
598k
        do {
553
598k
            u8 const match_bit = (matched_byte >> 7) & 1;
554
598k
            matched_byte <<= 1;
555
556
598k
            u8 const encoded_bit = (literal & 0x80) >> 7;
557
598k
            literal <<= 1;
558
559
598k
            TRY(encode_bit_with_probability(selected_probability_table[((1 + match_bit) << 8) + result], encoded_bit));
560
0
            result = result << 1 | encoded_bit;
561
562
598k
            if (match_bit != encoded_bit)
563
131k
                break;
564
598k
        } while (result < 0x100);
565
132k
    }
566
567
12.0M
    while (result < 0x100) {
568
10.6M
        u8 const encoded_bit = (literal & 0x80) >> 7;
569
10.6M
        literal <<= 1;
570
571
10.6M
        TRY(encode_bit_with_probability(selected_probability_table[result], encoded_bit));
572
573
0
        result = (result << 1) | encoded_bit;
574
10.6M
    }
575
576
1.40M
    m_total_processed_bytes += sizeof(literal);
577
578
1.40M
    dbgln_if(LZMA_DEBUG, "Encoded literal {:#x} in state {} using literal state {:#x} (previous byte is {:#x})", original_literal, m_state, literal_state, previous_byte);
579
580
1.40M
    update_state_after_literal();
581
582
1.40M
    return {};
583
1.40M
}
584
585
ErrorOr<void> LzmaCompressor::encode_existing_match(size_t real_distance, size_t real_length)
586
3.33M
{
587
3.33M
    VERIFY(real_distance >= normalized_to_real_match_distance_offset);
588
3.33M
    u32 const normalized_distance = real_distance - normalized_to_real_match_distance_offset;
589
590
3.33M
    VERIFY(real_length >= normalized_to_real_match_length_offset);
591
3.33M
    u16 const normalized_length = real_length - normalized_to_real_match_length_offset;
592
593
3.33M
    if (normalized_distance == m_rep0) {
594
3.13M
        TRY(encode_match_type(MatchType::RepMatch0));
595
3.13M
    } else if (normalized_distance == m_rep1) {
596
79.8k
        TRY(encode_match_type(MatchType::RepMatch1));
597
598
0
        u32 const distance = m_rep1;
599
79.8k
        m_rep1 = m_rep0;
600
79.8k
        m_rep0 = distance;
601
122k
    } else if (normalized_distance == m_rep2) {
602
60.3k
        TRY(encode_match_type(MatchType::RepMatch2));
603
604
0
        u32 const distance = m_rep2;
605
60.3k
        m_rep2 = m_rep1;
606
60.3k
        m_rep1 = m_rep0;
607
60.3k
        m_rep0 = distance;
608
62.0k
    } else if (normalized_distance == m_rep3) {
609
62.0k
        TRY(encode_match_type(MatchType::RepMatch3));
610
611
0
        u32 const distance = m_rep3;
612
62.0k
        m_rep3 = m_rep2;
613
62.0k
        m_rep2 = m_rep1;
614
62.0k
        m_rep1 = m_rep0;
615
62.0k
        m_rep0 = distance;
616
62.0k
    } else {
617
0
        VERIFY_NOT_REACHED();
618
0
    }
619
620
6.66M
    TRY(encode_normalized_match_length(m_rep_length_coder, normalized_length));
621
0
    update_state_after_rep();
622
6.66M
    MUST(m_dictionary->discard(real_length));
623
0
    m_total_processed_bytes += real_length;
624
625
3.33M
    return {};
626
6.66M
}
627
628
ErrorOr<void> LzmaCompressor::encode_new_match(size_t real_distance, size_t real_length)
629
1.30M
{
630
1.30M
    VERIFY(real_distance >= normalized_to_real_match_distance_offset);
631
1.30M
    u32 const normalized_distance = real_distance - normalized_to_real_match_distance_offset;
632
633
1.30M
    VERIFY(real_length >= normalized_to_real_match_length_offset);
634
1.30M
    u16 const normalized_length = real_length - normalized_to_real_match_length_offset;
635
636
1.30M
    TRY(encode_normalized_simple_match(normalized_distance, normalized_length));
637
638
1.30M
    MUST(m_dictionary->discard(real_length));
639
0
    m_total_processed_bytes += real_length;
640
641
1.30M
    return {};
642
1.30M
}
643
644
ErrorOr<void> LzmaCompressor::encode_normalized_simple_match(u32 normalized_distance, u16 normalized_length)
645
1.31M
{
646
1.31M
    TRY(encode_match_type(MatchType::SimpleMatch));
647
648
0
    m_rep3 = m_rep2;
649
1.31M
    m_rep2 = m_rep1;
650
1.31M
    m_rep1 = m_rep0;
651
652
1.31M
    TRY(encode_normalized_match_length(m_length_coder, normalized_length));
653
654
0
    update_state_after_match();
655
656
1.31M
    TRY(encode_normalized_match_distance(normalized_length, normalized_distance));
657
0
    m_rep0 = normalized_distance;
658
659
1.31M
    return {};
660
1.31M
}
661
662
LzmaState::LzmaLengthCoderState::LzmaLengthCoderState()
663
10.4k
{
664
10.4k
    for (auto& array : m_low_length_probabilities)
665
167k
        initialize_to_default_probability(array);
666
667
10.4k
    for (auto& array : m_medium_length_probabilities)
668
167k
        initialize_to_default_probability(array);
669
670
10.4k
    initialize_to_default_probability(m_high_length_probabilities);
671
10.4k
}
672
673
ErrorOr<u16> LzmaDecompressor::decode_normalized_match_length(LzmaLengthCoderState& length_decoder_state)
674
13.3M
{
675
    // "LZMA uses "posState" value as context to select the binary tree
676
    //  from LowCoder and MidCoder binary tree arrays:"
677
13.3M
    u16 position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1);
678
679
    // "The following scheme is used for the match length encoding:
680
    //
681
    //   Binary encoding    Binary Tree structure    Zero-based match length
682
    //   sequence                                    (binary + decimal):
683
    //
684
    //   0 xxx              LowCoder[posState]       xxx
685
26.6M
    if (TRY(decode_bit_with_probability(length_decoder_state.m_first_choice_probability)) == 0)
686
13.3M
        return TRY(decode_symbol_using_bit_tree(3, length_decoder_state.m_low_length_probabilities[position_state].span()));
687
688
    //   1 0 yyy            MidCoder[posState]       yyy + 8
689
18.1M
    if (TRY(decode_bit_with_probability(length_decoder_state.m_second_choice_probability)) == 0)
690
83.3k
        return TRY(decode_symbol_using_bit_tree(3, length_decoder_state.m_medium_length_probabilities[position_state].span())) + 8;
691
692
    //   1 1 zzzzzzzz       HighCoder                zzzzzzzz + 16"
693
9.00M
    return TRY(decode_symbol_using_bit_tree(8, length_decoder_state.m_high_length_probabilities.span())) + 16;
694
9.00M
}
695
696
ErrorOr<void> LzmaCompressor::encode_normalized_match_length(LzmaLengthCoderState& length_coder_state, u16 normalized_length)
697
4.64M
{
698
4.64M
    u16 const position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1);
699
700
4.64M
    if (normalized_length < 8) {
701
4.23M
        TRY(encode_bit_with_probability(length_coder_state.m_first_choice_probability, 0));
702
4.23M
        TRY(encode_symbol_using_bit_tree(3, length_coder_state.m_low_length_probabilities[position_state].span(), normalized_length));
703
0
        return {};
704
4.23M
    }
705
706
818k
    TRY(encode_bit_with_probability(length_coder_state.m_first_choice_probability, 1));
707
708
409k
    if (normalized_length < 16) {
709
79.1k
        TRY(encode_bit_with_probability(length_coder_state.m_second_choice_probability, 0));
710
79.1k
        TRY(encode_symbol_using_bit_tree(3, length_coder_state.m_medium_length_probabilities[position_state].span(), normalized_length - 8));
711
0
        return {};
712
79.1k
    }
713
714
659k
    TRY(encode_bit_with_probability(length_coder_state.m_second_choice_probability, 1));
715
329k
    TRY(encode_symbol_using_bit_tree(8, length_coder_state.m_high_length_probabilities.span(), normalized_length - 16));
716
0
    return {};
717
659k
}
718
719
ErrorOr<u32> LzmaDecompressor::decode_normalized_match_distance(u16 normalized_match_length)
720
1.31M
{
721
    // "LZMA uses normalized match length (zero-based length)
722
    //  to calculate the context state "lenState" do decode the distance value."
723
1.31M
    u16 length_state = min(normalized_match_length, number_of_length_to_position_states - 1);
724
725
    // "At first stage the distance decoder decodes 6-bit "posSlot" value with bit
726
    //  tree decoder from PosSlotDecoder array."
727
1.31M
    u16 position_slot = TRY(decode_symbol_using_bit_tree(6, m_length_to_position_states[length_state].span()));
728
729
    // "The encoding scheme for distance value is shown in the following table:
730
    //
731
    //  posSlot (decimal) /
732
    //       zero-based distance (binary)
733
    //  0    0
734
    //  1    1
735
    //  2    10
736
    //  3    11
737
    //
738
    //  4    10 x
739
    //  5    11 x
740
    //  6    10 xx
741
    //  7    11 xx
742
    //  8    10 xxx
743
    //  9    11 xxx
744
    //  10    10 xxxx
745
    //  11    11 xxxx
746
    //  12    10 xxxxx
747
    //  13    11 xxxxx
748
    //
749
    //  14    10 yy zzzz
750
    //  15    11 yy zzzz
751
    //  16    10 yyy zzzz
752
    //  17    11 yyy zzzz
753
    //  ...
754
    //  62    10 yyyyyyyyyyyyyyyyyyyyyyyyyy zzzz
755
    //  63    11 yyyyyyyyyyyyyyyyyyyyyyyyyy zzzz
756
    //
757
    //  where
758
    //   "x ... x" means the sequence of binary symbols encoded with binary tree and
759
    //       "Reverse" scheme. It uses separated binary tree for each posSlot from 4 to 13.
760
    //   "y" means direct bit encoded with range coder.
761
    //   "zzzz" means the sequence of four binary symbols encoded with binary
762
    //       tree with "Reverse" scheme, where one common binary tree "AlignDecoder"
763
    //       is used for all posSlot values."
764
765
    // "If (posSlot < 4), the "dist" value is equal to posSlot value."
766
1.31M
    if (position_slot < first_position_slot_with_binary_tree_bits)
767
6.14k
        return position_slot;
768
769
    // From here on, the first bit of the distance is always set and the second bit is set if the last bit of the position slot is set.
770
1.31M
    u32 distance_prefix = ((1 << 1) | ((position_slot & 1) << 0));
771
772
    // "If (posSlot >= 4), the decoder uses "posSlot" value to calculate the value of
773
    //   the high bits of "dist" value and the number of the low bits.
774
    //   If (4 <= posSlot < kEndPosModelIndex), the decoder uses bit tree decoders.
775
    //     (one separated bit tree decoder per one posSlot value) and "Reverse" scheme."
776
1.31M
    if (position_slot < first_position_slot_with_direct_encoded_bits) {
777
52.6k
        size_t number_of_bits_to_decode = (position_slot / 2) - 1;
778
52.6k
        auto& selected_probability_tree = m_binary_tree_distance_probabilities[position_slot - first_position_slot_with_binary_tree_bits];
779
52.6k
        return (distance_prefix << number_of_bits_to_decode) | TRY(decode_symbol_using_reverse_bit_tree(number_of_bits_to_decode, selected_probability_tree));
780
52.6k
    }
781
782
    // "  if (posSlot >= kEndPosModelIndex), the middle bits are decoded as direct
783
    //     bits from RangeDecoder and the low 4 bits are decoded with a bit tree
784
    //     decoder "AlignDecoder" with "Reverse" scheme."
785
1.26M
    size_t number_of_direct_bits_to_decode = ((position_slot - first_position_slot_with_direct_encoded_bits) / 2) + 2;
786
10.6M
    for (size_t i = 0; i < number_of_direct_bits_to_decode; i++) {
787
9.38M
        distance_prefix = (distance_prefix << 1) | TRY(decode_direct_bit());
788
9.38M
    }
789
1.26M
    return (distance_prefix << number_of_alignment_bits) | TRY(decode_symbol_using_reverse_bit_tree(number_of_alignment_bits, m_alignment_bit_probabilities));
790
1.26M
}
791
792
ErrorOr<void> LzmaCompressor::encode_normalized_match_distance(u16 normalized_match_length, u32 normalized_match_distance)
793
1.31M
{
794
1.31M
    u16 const length_state = min(normalized_match_length, number_of_length_to_position_states - 1);
795
796
1.31M
    if (normalized_match_distance < first_position_slot_with_binary_tree_bits) {
797
        // The normalized distance gets encoded as the position slot.
798
4.61k
        TRY(encode_symbol_using_bit_tree(6, m_length_to_position_states[length_state].span(), normalized_match_distance));
799
0
        return {};
800
4.61k
    }
801
802
    // Note: This has been deduced, there is no immediate relation to the decoding function.
803
1.30M
    u16 const distance_log2 = AK::log2(normalized_match_distance);
804
1.30M
    u16 number_of_distance_bits = count_required_bits(normalized_match_distance);
805
1.30M
    u16 const position_slot = (distance_log2 << 1) + ((normalized_match_distance >> (distance_log2 - 1)) & 1);
806
807
1.30M
    TRY(encode_symbol_using_bit_tree(6, m_length_to_position_states[length_state].span(), position_slot));
808
809
    // Mask off the top two bits of the value, those are already encoded by the position slot.
810
0
    normalized_match_distance &= (1 << (number_of_distance_bits - 2)) - 1;
811
1.30M
    number_of_distance_bits -= 2;
812
813
1.30M
    if (position_slot < first_position_slot_with_direct_encoded_bits) {
814
        // The value gets encoded using only a reverse bit tree coder.
815
50.1k
        auto& selected_probability_tree = m_binary_tree_distance_probabilities[position_slot - first_position_slot_with_binary_tree_bits];
816
50.1k
        TRY(encode_symbol_using_reverse_bit_tree(number_of_distance_bits, selected_probability_tree, normalized_match_distance));
817
0
        return {};
818
50.1k
    }
819
820
    // The value is split into direct bits (everything except the last four bits) and alignment bits (last four bits).
821
1.25M
    auto direct_bits = normalized_match_distance & ~((1 << number_of_alignment_bits) - 1);
822
1.25M
    auto const alignment_bits = normalized_match_distance & ((1 << number_of_alignment_bits) - 1);
823
824
    // Shift to-be-written direct bits to the most significant position for easier access.
825
1.25M
    direct_bits <<= sizeof(direct_bits) * 8 - number_of_distance_bits;
826
827
10.6M
    for (auto i = 0u; i < number_of_distance_bits - number_of_alignment_bits; i++) {
828
9.35M
        TRY(encode_direct_bit((direct_bits & 0x80000000) ? 1 : 0));
829
0
        direct_bits <<= 1;
830
9.35M
    }
831
832
1.25M
    TRY(encode_symbol_using_reverse_bit_tree(number_of_alignment_bits, m_alignment_bit_probabilities, alignment_bits));
833
834
0
    return {};
835
1.25M
}
836
837
u32 LzmaState::current_repetition_offset() const
838
15.1M
{
839
    // LZMA never needs to read at offset 0 (i.e. the actual read head of the buffer).
840
    // Instead, the values are remapped so that the rep-value n starts reading n + 1 bytes back.
841
    // The special rep-value 0xFFFFFFFF is reserved for marking the end of the stream,
842
    // so this should never overflow.
843
15.1M
    VERIFY(m_rep0 <= NumericLimits<u32>::max() - normalized_to_real_match_distance_offset);
844
15.1M
    return m_rep0 + normalized_to_real_match_distance_offset;
845
15.1M
}
846
847
void LzmaState::update_state_after_literal()
848
11.7M
{
849
11.7M
    if (m_state < 4)
850
11.3M
        m_state = 0;
851
369k
    else if (m_state < 10)
852
264k
        m_state -= 3;
853
104k
    else
854
104k
        m_state -= 6;
855
11.7M
}
856
857
void LzmaState::update_state_after_match()
858
2.62M
{
859
2.62M
    if (m_state < 7)
860
136k
        m_state = 7;
861
2.49M
    else
862
2.49M
        m_state = 10;
863
2.62M
}
864
865
void LzmaState::update_state_after_rep()
866
15.3M
{
867
15.3M
    if (m_state < 7)
868
141k
        m_state = 8;
869
15.2M
    else
870
15.2M
        m_state = 11;
871
15.3M
}
872
873
void LzmaState::update_state_after_short_rep()
874
3.49k
{
875
3.49k
    if (m_state < 7)
876
1.91k
        m_state = 9;
877
1.58k
    else
878
1.58k
        m_state = 11;
879
3.49k
}
880
881
ErrorOr<LzmaDecompressor::MatchType> LzmaDecompressor::decode_match_type()
882
23.6M
{
883
    // "The decoder calculates "state2" variable value to select exact variable from
884
    //  "IsMatch" and "IsRep0Long" arrays."
885
23.6M
    u16 position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1);
886
23.6M
    u16 state2 = (m_state << maximum_number_of_position_bits) + position_state;
887
888
    // "The decoder uses the following code flow scheme to select exact
889
    //  type of LITERAL or MATCH:
890
    //
891
    //  IsMatch[state2] decode
892
    //   0 - the Literal"
893
47.2M
    if (TRY(decode_bit_with_probability(m_is_match_probabilities[state2])) == 0) {
894
10.3M
        dbgln_if(LZMA_DEBUG, "Decoded match type 'Literal'");
895
10.3M
        return MatchType::Literal;
896
10.3M
    }
897
898
    // " 1 - the Match
899
    //     IsRep[state] decode
900
    //       0 - Simple Match"
901
26.6M
    if (TRY(decode_bit_with_probability(m_is_rep_probabilities[m_state])) == 0) {
902
1.31M
        dbgln_if(LZMA_DEBUG, "Decoded match type 'SimpleMatch'");
903
1.31M
        return MatchType::SimpleMatch;
904
1.31M
    }
905
906
    // "     1 - Rep Match
907
    //         IsRepG0[state] decode
908
    //           0 - the distance is rep0"
909
24.0M
    if (TRY(decode_bit_with_probability(m_is_rep_g0_probabilities[m_state])) == 0) {
910
        // "       IsRep0Long[state2] decode
911
        //           0 - Short Rep Match"
912
6.27M
        if (TRY(decode_bit_with_probability(m_is_rep0_long_probabilities[state2])) == 0) {
913
3.49k
            dbgln_if(LZMA_DEBUG, "Decoded match type 'ShortRepMatch'");
914
3.49k
            return MatchType::ShortRepMatch;
915
3.49k
        }
916
917
        // "         1 - Rep Match 0"
918
3.13M
        dbgln_if(LZMA_DEBUG, "Decoded match type 'RepMatch0'");
919
3.13M
        return MatchType::RepMatch0;
920
3.13M
    }
921
922
    // "         1 -
923
    //             IsRepG1[state] decode
924
    //               0 - Rep Match 1"
925
17.7M
    if (TRY(decode_bit_with_probability(m_is_rep_g1_probabilities[m_state])) == 0) {
926
82.5k
        dbgln_if(LZMA_DEBUG, "Decoded match type 'RepMatch1'");
927
82.5k
        return MatchType::RepMatch1;
928
82.5k
    }
929
930
    // "             1 -
931
    //                 IsRepG2[state] decode
932
    //                   0 - Rep Match 2"
933
17.5M
    if (TRY(decode_bit_with_probability(m_is_rep_g2_probabilities[m_state])) == 0) {
934
62.1k
        dbgln_if(LZMA_DEBUG, "Decoded match type 'RepMatch2'");
935
62.1k
        return MatchType::RepMatch2;
936
62.1k
    }
937
938
    // "                 1 - Rep Match 3"
939
8.73M
    dbgln_if(LZMA_DEBUG, "Decoded match type 'RepMatch3'");
940
8.73M
    return MatchType::RepMatch3;
941
8.79M
}
942
943
ErrorOr<void> LzmaCompressor::encode_match_type(MatchType match_type)
944
6.05M
{
945
6.05M
    u16 position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1);
946
6.05M
    u16 state2 = (m_state << maximum_number_of_position_bits) + position_state;
947
948
6.05M
    if (match_type == MatchType::Literal) {
949
1.40M
        TRY(encode_bit_with_probability(m_is_match_probabilities[state2], 0));
950
1.40M
        dbgln_if(LZMA_DEBUG, "Encoded match type 'Literal'");
951
1.40M
        return {};
952
1.40M
    }
953
9.29M
    TRY(encode_bit_with_probability(m_is_match_probabilities[state2], 1));
954
955
4.64M
    if (match_type == MatchType::SimpleMatch) {
956
1.31M
        TRY(encode_bit_with_probability(m_is_rep_probabilities[m_state], 0));
957
1.31M
        dbgln_if(LZMA_DEBUG, "Encoded match type 'SimpleMatch'");
958
1.31M
        return {};
959
1.31M
    }
960
6.66M
    TRY(encode_bit_with_probability(m_is_rep_probabilities[m_state], 1));
961
962
3.33M
    if (match_type == MatchType::ShortRepMatch || match_type == MatchType::RepMatch0) {
963
3.13M
        TRY(encode_bit_with_probability(m_is_rep_g0_probabilities[m_state], 0));
964
3.13M
        TRY(encode_bit_with_probability(m_is_rep0_long_probabilities[state2], match_type == MatchType::RepMatch0));
965
        if constexpr (LZMA_DEBUG) {
966
            if (match_type == RepMatch0)
967
                dbgln("Encoded match type 'RepMatch0'");
968
            else
969
                dbgln("Encoded match type 'ShortRepMatch'");
970
        }
971
3.13M
        return {};
972
3.13M
    }
973
404k
    TRY(encode_bit_with_probability(m_is_rep_g0_probabilities[m_state], 1));
974
975
202k
    if (match_type == MatchType::RepMatch1) {
976
79.8k
        TRY(encode_bit_with_probability(m_is_rep_g1_probabilities[m_state], 0));
977
79.8k
        dbgln_if(LZMA_DEBUG, "Encoded match type 'RepMatch1'");
978
79.8k
        return {};
979
79.8k
    }
980
244k
    TRY(encode_bit_with_probability(m_is_rep_g1_probabilities[m_state], 1));
981
982
122k
    if (match_type == MatchType::RepMatch2) {
983
60.3k
        TRY(encode_bit_with_probability(m_is_rep_g2_probabilities[m_state], 0));
984
60.3k
        dbgln_if(LZMA_DEBUG, "Encoded match type 'RepMatch2'");
985
60.3k
        return {};
986
60.3k
    }
987
62.0k
    TRY(encode_bit_with_probability(m_is_rep_g2_probabilities[m_state], 1));
988
62.0k
    dbgln_if(LZMA_DEBUG, "Encoded match type 'RepMatch3'");
989
62.0k
    return {};
990
62.0k
}
991
992
ErrorOr<void> LzmaCompressor::encode_once()
993
6.05M
{
994
    // Check if any of our existing match distances are currently usable.
995
6.05M
    Vector<size_t> const existing_distances {
996
6.05M
        m_rep0 + normalized_to_real_match_distance_offset,
997
6.05M
        m_rep1 + normalized_to_real_match_distance_offset,
998
6.05M
        m_rep2 + normalized_to_real_match_distance_offset,
999
6.05M
        m_rep3 + normalized_to_real_match_distance_offset,
1000
6.05M
    };
1001
6.05M
    auto existing_distance_result = m_dictionary->find_copy_in_seekback(existing_distances, m_dictionary->used_space(), normalized_to_real_match_length_offset);
1002
1003
6.05M
    if (existing_distance_result.has_value()) {
1004
3.33M
        auto selected_match = existing_distance_result.release_value();
1005
3.33M
        TRY(encode_existing_match(selected_match.distance, selected_match.length));
1006
0
        return {};
1007
3.33M
    }
1008
1009
    // If we weren't able to find any viable existing offsets, we now have to search the rest of the dictionary for possible new offsets.
1010
2.71M
    auto new_distance_result = m_dictionary->find_copy_in_seekback(m_dictionary->used_space(), normalized_to_real_match_length_offset);
1011
1012
2.71M
    if (new_distance_result.has_value()) {
1013
1.30M
        auto selected_match = new_distance_result.release_value();
1014
1.30M
        TRY(encode_new_match(selected_match.distance, selected_match.length));
1015
0
        return {};
1016
1.30M
    }
1017
1018
    // If we weren't able to find any matches, we don't have any other choice than to encode the next byte as a literal.
1019
1.40M
    u8 next_byte { 0 };
1020
1.40M
    TRY(m_dictionary->read({ &next_byte, sizeof(next_byte) }));
1021
1.40M
    TRY(encode_literal(next_byte));
1022
0
    return {};
1023
1.40M
}
1024
1025
ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)
1026
606k
{
1027
24.4M
    while (m_dictionary->used_space() < bytes.size() && m_dictionary->empty_space() != 0) {
1028
23.8M
        if (m_found_end_of_stream_marker)
1029
2.02k
            break;
1030
1031
23.8M
        if (has_reached_expected_data_size()) {
1032
            // If the decoder is in a clean state, we assume that this is fine.
1033
80
            if (is_range_decoder_in_clean_state())
1034
33
                break;
1035
1036
            // Otherwise, we give it one last try to find the end marker in the remaining data.
1037
80
        }
1038
1039
23.8M
        auto copy_match_to_buffer = [&](u16 real_length) -> ErrorOr<void> {
1040
13.5M
            VERIFY(!m_leftover_match_length.has_value());
1041
1042
13.5M
            if (m_options.uncompressed_size.has_value() && m_options.uncompressed_size.value() < m_total_processed_bytes + real_length)
1043
4
                return Error::from_string_literal("Tried to copy match beyond expected uncompressed file size");
1044
1045
13.5M
            auto copied_length = TRY(m_dictionary->copy_from_seekback(current_repetition_offset(), real_length));
1046
1047
0
            m_total_processed_bytes += copied_length;
1048
13.5M
            real_length -= copied_length;
1049
1050
13.5M
            if (real_length > 0)
1051
230k
                m_leftover_match_length = real_length;
1052
1053
13.5M
            return {};
1054
13.5M
        };
1055
1056
        // If we have a leftover part of a repeating match, we should finish that first.
1057
23.8M
        if (m_leftover_match_length.has_value()) {
1058
230k
            TRY(copy_match_to_buffer(m_leftover_match_length.release_value()));
1059
0
            continue;
1060
230k
        }
1061
1062
23.6M
        auto const match_type = TRY(decode_match_type());
1063
1064
        // If we are looking for EOS, but find another match type, the stream is also corrupted.
1065
23.6M
        if (has_reached_expected_data_size() && match_type != MatchType::SimpleMatch)
1066
37
            return Error::from_string_literal("First match type after the expected uncompressed size is not a simple match");
1067
1068
23.6M
        if (match_type == MatchType::Literal) {
1069
            // "At first the LZMA decoder must check that it doesn't exceed
1070
            //  specified uncompressed size."
1071
            // This is already checked for at the beginning of the loop.
1072
1073
            // "Then it decodes literal value and puts it to sliding window."
1074
10.3M
            TRY(decode_literal_to_output_buffer());
1075
1076
            // "Then the decoder must update the "state" value."
1077
0
            update_state_after_literal();
1078
10.3M
            continue;
1079
10.3M
        }
1080
1081
13.3M
        if (match_type == MatchType::SimpleMatch) {
1082
            // "The distance history table is updated with the following scheme:"
1083
1.31M
            m_rep3 = m_rep2;
1084
1.31M
            m_rep2 = m_rep1;
1085
1.31M
            m_rep1 = m_rep0;
1086
1087
            // "The zero-based length is decoded with "LenDecoder"."
1088
1.31M
            u16 normalized_length = TRY(decode_normalized_match_length(m_length_coder));
1089
1090
            // "The state is update with UpdateState_Match function."
1091
0
            update_state_after_match();
1092
1093
            // "and the new "rep0" value is decoded with DecodeDistance."
1094
1.31M
            m_rep0 = TRY(decode_normalized_match_distance(normalized_length));
1095
1096
            // "If the value of "rep0" is equal to 0xFFFFFFFF, it means that we have
1097
            //  "End of stream" marker, so we can stop decoding and check finishing
1098
            //  condition in Range Decoder"
1099
1.31M
            if (m_rep0 == end_of_stream_marker) {
1100
                // If we should reject end-of-stream markers, do so now.
1101
                // Note that this is not part of LZMA, as LZMA allows end-of-stream markers in all contexts, so pure LZMA should never set this option.
1102
2.02k
                if (m_options.reject_end_of_stream_marker)
1103
0
                    return Error::from_string_literal("An end-of-stream marker was found, but the LZMA stream is configured to reject them");
1104
1105
                // The range decoder condition is checked after breaking out of the loop.
1106
2.02k
                m_found_end_of_stream_marker = true;
1107
2.02k
                continue;
1108
2.02k
            }
1109
1110
            // If we are looking for EOS, but haven't found it here, the stream is corrupted.
1111
1.31M
            if (has_reached_expected_data_size())
1112
1
                return Error::from_string_literal("First simple match after the expected uncompressed size is not the EOS marker");
1113
1114
            // "If uncompressed size is defined, LZMA decoder must check that it doesn't
1115
            //  exceed that specified uncompressed size."
1116
            // This is being checked for in the common "copy to buffer" implementation.
1117
1118
            // "Also the decoder must check that "rep0" value is not larger than dictionary size
1119
            //  and is not larger than the number of already decoded bytes."
1120
1.31M
            if (current_repetition_offset() > m_dictionary->seekback_limit())
1121
180
                return Error::from_string_literal("rep0 value is larger than the possible lookback size");
1122
1123
            // "Then the decoder must copy match bytes as described in
1124
            //  "The match symbols copying" section."
1125
1.31M
            TRY(copy_match_to_buffer(normalized_length + normalized_to_real_match_length_offset));
1126
1127
0
            continue;
1128
1.31M
        }
1129
1130
12.0M
        if (match_type == MatchType::ShortRepMatch) {
1131
            // "LZMA doesn't update the distance history."
1132
1133
            // "If the subtype is "Short Rep Match", the decoder updates the state, puts
1134
            //  the one byte from window to current position in window and goes to next
1135
            //  MATCH/LITERAL symbol."
1136
3.49k
            update_state_after_short_rep();
1137
1138
3.49k
            TRY(copy_match_to_buffer(1));
1139
1140
0
            continue;
1141
3.49k
        }
1142
1143
        // Note: We don't need to do anything specific for "Rep Match 0", we just need to make sure to not
1144
        //       run the detection for other match types and to not switch around the distance history.
1145
1146
12.0M
        if (match_type == MatchType::RepMatch1) {
1147
82.5k
            u32 distance = m_rep1;
1148
82.5k
            m_rep1 = m_rep0;
1149
82.5k
            m_rep0 = distance;
1150
82.5k
        }
1151
1152
12.0M
        if (match_type == MatchType::RepMatch2) {
1153
62.1k
            u32 distance = m_rep2;
1154
62.1k
            m_rep2 = m_rep1;
1155
62.1k
            m_rep1 = m_rep0;
1156
62.1k
            m_rep0 = distance;
1157
62.1k
        }
1158
1159
12.0M
        if (match_type == MatchType::RepMatch3) {
1160
8.73M
            u32 distance = m_rep3;
1161
8.73M
            m_rep3 = m_rep2;
1162
8.73M
            m_rep2 = m_rep1;
1163
8.73M
            m_rep1 = m_rep0;
1164
8.73M
            m_rep0 = distance;
1165
8.73M
        }
1166
1167
        // "In other cases (Rep Match 0/1/2/3), it decodes the zero-based
1168
        //  length of match with "RepLenDecoder" decoder."
1169
12.0M
        u16 normalized_length = TRY(decode_normalized_match_length(m_rep_length_coder));
1170
1171
        // "Then it updates the state."
1172
0
        update_state_after_rep();
1173
1174
        // "Then the decoder must copy match bytes as described in
1175
        //  "The Match symbols copying" section."
1176
12.0M
        TRY(copy_match_to_buffer(normalized_length + normalized_to_real_match_length_offset));
1177
12.0M
    }
1178
1179
605k
    if (m_found_end_of_stream_marker || has_reached_expected_data_size()) {
1180
2.06k
        if (m_options.uncompressed_size.has_value() && m_total_processed_bytes < m_options.uncompressed_size.value())
1181
94
            return Error::from_string_literal("Found end-of-stream marker earlier than expected");
1182
1183
1.96k
        if (!is_range_decoder_in_clean_state())
1184
44
            return Error::from_string_literal("LZMA stream ends in an unclean state");
1185
1.96k
    }
1186
1187
605k
    return m_dictionary->read(bytes);
1188
605k
}
1189
1190
ErrorOr<size_t> LzmaDecompressor::write_some(ReadonlyBytes)
1191
0
{
1192
0
    return Error::from_errno(EBADF);
1193
0
}
1194
1195
bool LzmaDecompressor::is_eof() const
1196
1.19M
{
1197
1.19M
    if (m_dictionary->used_space() > 0)
1198
1.17M
        return false;
1199
1200
18.6k
    if (has_reached_expected_data_size())
1201
39
        return true;
1202
1203
18.5k
    return m_found_end_of_stream_marker;
1204
18.6k
}
1205
1206
bool LzmaDecompressor::is_open() const
1207
0
{
1208
0
    return true;
1209
0
}
1210
1211
void LzmaDecompressor::close()
1212
0
{
1213
0
}
1214
1215
ErrorOr<NonnullOwnPtr<LzmaCompressor>> LzmaCompressor::create_container(MaybeOwned<Stream> stream, LzmaCompressorOptions const& options)
1216
1.88k
{
1217
3.77k
    auto dictionary = TRY(try_make<SearchableCircularBuffer>(TRY(SearchableCircularBuffer::create_empty(options.dictionary_size + largest_real_match_length))));
1218
1219
    // "The LZMA Decoder uses (1 << (lc + lp)) tables with CProb values, where each table contains 0x300 CProb values."
1220
1.88k
    auto literal_probabilities = TRY(FixedArray<Probability>::create(literal_probability_table_size * (1 << (options.literal_context_bits + options.literal_position_bits))));
1221
1222
1.88k
    auto header = TRY(LzmaHeader::from_compressor_options(options));
1223
1.88k
    TRY(stream->write_value(header));
1224
1225
1.88k
    auto compressor = TRY(adopt_nonnull_own_or_enomem(new (nothrow) LzmaCompressor(move(stream), options, move(dictionary), move(literal_probabilities))));
1226
1227
0
    return compressor;
1228
1.88k
}
1229
1230
LzmaCompressor::LzmaCompressor(MaybeOwned<AK::Stream> stream, Compress::LzmaCompressorOptions options, MaybeOwned<SearchableCircularBuffer> dictionary, FixedArray<Compress::LzmaState::Probability> literal_probabilities)
1231
1.88k
    : LzmaState(move(literal_probabilities))
1232
1.88k
    , m_stream(move(stream))
1233
1.88k
    , m_options(move(options))
1234
1.88k
    , m_dictionary(move(dictionary))
1235
1.88k
{
1236
1.88k
}
1237
1238
ErrorOr<Bytes> LzmaCompressor::read_some(Bytes)
1239
0
{
1240
0
    return Error::from_errno(EBADF);
1241
0
}
1242
1243
ErrorOr<size_t> LzmaCompressor::write_some(ReadonlyBytes bytes)
1244
5.93M
{
1245
    // Fill the input buffer until it's full or until we can't read any more data.
1246
5.93M
    size_t processed_bytes = min(bytes.size(), largest_real_match_length - m_dictionary->used_space());
1247
5.93M
    bytes = bytes.trim(processed_bytes);
1248
1249
11.8M
    while (bytes.size() > 0) {
1250
5.93M
        auto const written_bytes = m_dictionary->write(bytes);
1251
5.93M
        bytes = bytes.slice(written_bytes);
1252
5.93M
    }
1253
1254
5.93M
    VERIFY(m_dictionary->used_space() <= largest_real_match_length);
1255
1256
5.93M
    if (m_options.uncompressed_size.has_value() && m_total_processed_bytes + m_dictionary->used_space() > m_options.uncompressed_size.value())
1257
0
        return Error::from_string_literal("Tried to compress more LZMA data than announced");
1258
1259
11.8M
    TRY(encode_once());
1260
1261
    // If we read enough data to reach the final uncompressed size, flush automatically.
1262
    // Flushing will handle encoding the remaining data for us and finalize the stream.
1263
5.93M
    if (m_options.uncompressed_size.has_value() && m_total_processed_bytes + m_dictionary->used_space() >= m_options.uncompressed_size.value())
1264
0
        TRY(flush());
1265
1266
5.93M
    return processed_bytes;
1267
11.8M
}
1268
1269
ErrorOr<void> LzmaCompressor::flush()
1270
1.88k
{
1271
1.88k
    if (m_has_flushed_data)
1272
0
        return Error::from_string_literal("Flushed an LZMA stream twice");
1273
1274
122k
    while (m_dictionary->used_space() > 0)
1275
120k
        TRY(encode_once());
1276
1277
1.88k
    if (m_options.uncompressed_size.has_value() && m_total_processed_bytes < m_options.uncompressed_size.value())
1278
0
        return Error::from_string_literal("Flushing LZMA data with known but unreached uncompressed size");
1279
1280
    // The LZMA specification technically also allows both a known size and an end-of-stream marker simultaneously,
1281
    // but LZMA2 rejects them, so skip emitting the end-of-stream marker if we know the uncompressed size.
1282
1.88k
    if (!m_options.uncompressed_size.has_value())
1283
1.88k
        TRY(encode_normalized_simple_match(end_of_stream_marker, 0));
1284
1285
    // Shifting the range encoder using the normal operation handles any pending overflows.
1286
3.77k
    TRY(shift_range_encoder());
1287
1288
    // Now, the remaining bytes are the cached byte, the chain of 0xFF, and the upper 3 bytes of the current `code`.
1289
    // Incrementing the values does not have to be considered as no overflows are pending. The fourth byte is the
1290
    // null byte that we just shifted in, which should not be flushed as it would be extraneous junk data.
1291
1.88k
    TRY(m_stream->write_value<u8>(m_range_encoder_cached_byte));
1292
1.89k
    for (size_t i = 0; i < m_range_encoder_ff_chain_length; i++)
1293
6
        TRY(m_stream->write_value<u8>(0xFF));
1294
3.77k
    TRY(m_stream->write_value<u8>(m_range_encoder_code >> 24));
1295
1.88k
    TRY(m_stream->write_value<u8>(m_range_encoder_code >> 16));
1296
1.88k
    TRY(m_stream->write_value<u8>(m_range_encoder_code >> 8));
1297
1298
0
    m_has_flushed_data = true;
1299
1.88k
    return {};
1300
1.88k
}
1301
1302
bool LzmaCompressor::is_eof() const
1303
0
{
1304
0
    return true;
1305
0
}
1306
1307
bool LzmaCompressor::is_open() const
1308
0
{
1309
0
    return !m_has_flushed_data;
1310
0
}
1311
1312
void LzmaCompressor::close()
1313
0
{
1314
0
    if (!m_has_flushed_data) {
1315
        // Note: We need a better API for specifying things like this.
1316
0
        flush().release_value_but_fixme_should_propagate_errors();
1317
0
    }
1318
0
}
1319
1320
LzmaCompressor::~LzmaCompressor()
1321
1.88k
{
1322
1.88k
    if (!m_has_flushed_data) {
1323
        // Note: We need a better API for specifying things like this.
1324
0
        flush().release_value_but_fixme_should_propagate_errors();
1325
0
    }
1326
1.88k
}
1327
1328
}