Coverage Report

Created: 2026-02-14 08:01

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/serenity/Userland/Libraries/LibCompress/Lzma.cpp
Line
Count
Source
1
/*
2
 * Copyright (c) 2023, Tim Schumacher <timschumi@gmx.de>
3
 *
4
 * SPDX-License-Identifier: BSD-2-Clause
5
 */
6
7
#include <AK/Debug.h>
8
#include <AK/IntegralMath.h>
9
#include <LibCompress/Lzma.h>
10
11
namespace Compress {
12
13
u32 LzmaHeader::dictionary_size() const
14
6.10k
{
15
    // "If the value of dictionary size in properties is smaller than (1 << 12),
16
    //  the LZMA decoder must set the dictionary size variable to (1 << 12)."
17
6.10k
    constexpr u32 minimum_dictionary_size = (1 << 12);
18
6.10k
    if (unchecked_dictionary_size < minimum_dictionary_size)
19
666
        return minimum_dictionary_size;
20
21
5.44k
    return unchecked_dictionary_size;
22
6.10k
}
23
24
Optional<u64> LzmaHeader::uncompressed_size() const
25
4.40k
{
26
    // We are making a copy of the packed field here because we would otherwise
27
    // pass an unaligned reference to the constructor of Optional, which is
28
    // undefined behavior.
29
4.40k
    auto uncompressed_size = encoded_uncompressed_size;
30
31
    // "If "Uncompressed size" field contains ones in all 64 bits, it means that
32
    //  uncompressed size is unknown and there is the "end marker" in stream,
33
    //  that indicates the end of decoding point."
34
4.40k
    if (uncompressed_size == placeholder_for_unknown_uncompressed_size)
35
2.83k
        return {};
36
37
    // "In opposite case, if the value from "Uncompressed size" field is not
38
    //  equal to ((2^64) - 1), the LZMA stream decoding must be finished after
39
    //  specified number of bytes (Uncompressed size) is decoded. And if there
40
    //  is the "end marker", the LZMA decoder must read that marker also."
41
1.57k
    return uncompressed_size;
42
4.40k
}
43
44
ErrorOr<LzmaModelProperties> LzmaHeader::decode_model_properties(u8 input_bits)
45
4.40k
{
46
    // "Decodes the following values from the encoded model properties field:
47
    //
48
    //     name  Range          Description
49
    //       lc  [0, 8]         the number of "literal context" bits
50
    //       lp  [0, 4]         the number of "literal pos" bits
51
    //       pb  [0, 4]         the number of "pos" bits
52
    //
53
    //  Encoded using `((pb * 5 + lp) * 9 + lc)`."
54
55
4.40k
    if (input_bits >= (9 * 5 * 5))
56
3
        return Error::from_string_literal("Encoded model properties value is larger than the highest possible value");
57
58
4.40k
    u8 literal_context_bits = input_bits % 9;
59
4.40k
    input_bits /= 9;
60
4.40k
    VERIFY(literal_context_bits >= 0 && literal_context_bits <= 8);
61
62
4.40k
    u8 literal_position_bits = input_bits % 5;
63
4.40k
    input_bits /= 5;
64
4.40k
    VERIFY(literal_position_bits >= 0 && literal_position_bits <= 4);
65
66
4.40k
    u8 position_bits = input_bits;
67
4.40k
    VERIFY(position_bits >= 0 && position_bits <= 4);
68
69
4.40k
    return LzmaModelProperties {
70
4.40k
        .literal_context_bits = literal_context_bits,
71
4.40k
        .literal_position_bits = literal_position_bits,
72
4.40k
        .position_bits = position_bits,
73
4.40k
    };
74
4.40k
}
75
76
ErrorOr<u8> LzmaHeader::encode_model_properties(LzmaModelProperties const& model_properties)
77
2.72k
{
78
2.72k
    if (model_properties.literal_context_bits > 8)
79
0
        return Error::from_string_literal("LZMA literal context bits are too large to encode");
80
81
2.72k
    if (model_properties.literal_position_bits > 4)
82
0
        return Error::from_string_literal("LZMA literal position bits are too large to encode");
83
84
2.72k
    if (model_properties.position_bits > 4)
85
0
        return Error::from_string_literal("LZMA position bits are too large to encode");
86
87
2.72k
    return (model_properties.position_bits * 5 + model_properties.literal_position_bits) * 9 + model_properties.literal_context_bits;
88
2.72k
}
89
90
ErrorOr<LzmaDecompressorOptions> LzmaHeader::as_decompressor_options() const
91
4.40k
{
92
4.40k
    auto model_properties = TRY(decode_model_properties(encoded_model_properties));
93
94
4.40k
    return Compress::LzmaDecompressorOptions {
95
4.40k
        .literal_context_bits = model_properties.literal_context_bits,
96
4.40k
        .literal_position_bits = model_properties.literal_position_bits,
97
4.40k
        .position_bits = model_properties.position_bits,
98
4.40k
        .dictionary_size = dictionary_size(),
99
4.40k
        .uncompressed_size = uncompressed_size(),
100
4.40k
        .reject_end_of_stream_marker = false,
101
4.40k
    };
102
4.40k
}
103
104
ErrorOr<LzmaHeader> LzmaHeader::from_compressor_options(LzmaCompressorOptions const& options)
105
2.72k
{
106
2.72k
    auto encoded_model_properties = TRY(encode_model_properties({
107
2.72k
        .literal_context_bits = options.literal_context_bits,
108
2.72k
        .literal_position_bits = options.literal_position_bits,
109
2.72k
        .position_bits = options.position_bits,
110
2.72k
    }));
111
112
2.72k
    return LzmaHeader {
113
2.72k
        .encoded_model_properties = encoded_model_properties,
114
2.72k
        .unchecked_dictionary_size = options.dictionary_size,
115
2.72k
        .encoded_uncompressed_size = options.uncompressed_size.value_or(placeholder_for_unknown_uncompressed_size),
116
2.72k
    };
117
2.72k
}
118
119
void LzmaState::initialize_to_default_probability(Span<Probability> span)
120
627k
{
121
627k
    for (auto& entry : span)
122
385M
        entry = default_probability;
123
627k
}
124
125
ErrorOr<NonnullOwnPtr<LzmaDecompressor>> LzmaDecompressor::create_from_container(MaybeOwned<Stream> stream, Optional<MaybeOwned<CircularBuffer>> dictionary)
126
4.41k
{
127
4.41k
    auto header = TRY(stream->read_value<LzmaHeader>());
128
129
4.40k
    return TRY(LzmaDecompressor::create_from_raw_stream(move(stream), TRY(header.as_decompressor_options()), move(dictionary)));
130
4.40k
}
131
132
ErrorOr<NonnullOwnPtr<LzmaDecompressor>> LzmaDecompressor::create_from_raw_stream(MaybeOwned<Stream> stream, LzmaDecompressorOptions const& options, Optional<MaybeOwned<CircularBuffer>> dictionary)
133
4.40k
{
134
4.40k
    if (!dictionary.has_value()) {
135
4.40k
        auto new_dictionary = TRY(CircularBuffer::create_empty(options.dictionary_size));
136
4.40k
        dictionary = TRY(try_make<CircularBuffer>(move(new_dictionary)));
137
4.40k
    }
138
139
4.40k
    VERIFY((*dictionary)->capacity() >= options.dictionary_size);
140
141
    // "The LZMA Decoder uses (1 << (lc + lp)) tables with CProb values, where each table contains 0x300 CProb values."
142
4.40k
    auto literal_probabilities = TRY(FixedArray<Probability>::create(literal_probability_table_size * (1 << (options.literal_context_bits + options.literal_position_bits))));
143
144
4.40k
    auto decompressor = TRY(adopt_nonnull_own_or_enomem(new (nothrow) LzmaDecompressor(move(stream), options, dictionary.release_value(), move(literal_probabilities))));
145
146
4.40k
    TRY(decompressor->initialize_range_decoder());
147
148
4.27k
    return decompressor;
149
4.40k
}
150
151
LzmaState::LzmaState(FixedArray<Probability> literal_probabilities)
152
7.13k
    : m_literal_probabilities(move(literal_probabilities))
153
7.13k
{
154
7.13k
    initialize_to_default_probability(m_literal_probabilities.span());
155
156
7.13k
    for (auto& array : m_length_to_position_states)
157
28.5k
        initialize_to_default_probability(array);
158
159
7.13k
    for (auto& array : m_binary_tree_distance_probabilities)
160
71.3k
        initialize_to_default_probability(array);
161
162
7.13k
    initialize_to_default_probability(m_alignment_bit_probabilities);
163
164
7.13k
    initialize_to_default_probability(m_is_match_probabilities);
165
7.13k
    initialize_to_default_probability(m_is_rep_probabilities);
166
7.13k
    initialize_to_default_probability(m_is_rep_g0_probabilities);
167
7.13k
    initialize_to_default_probability(m_is_rep_g1_probabilities);
168
7.13k
    initialize_to_default_probability(m_is_rep_g2_probabilities);
169
7.13k
    initialize_to_default_probability(m_is_rep0_long_probabilities);
170
7.13k
}
171
172
LzmaDecompressor::LzmaDecompressor(MaybeOwned<Stream> stream, LzmaDecompressorOptions options, MaybeOwned<CircularBuffer> dictionary, FixedArray<Probability> literal_probabilities)
173
4.40k
    : LzmaState(move(literal_probabilities))
174
4.40k
    , m_stream(move(stream))
175
4.40k
    , m_options(move(options))
176
4.40k
    , m_dictionary(move(dictionary))
177
4.40k
{
178
4.40k
}
179
180
bool LzmaDecompressor::is_range_decoder_in_clean_state() const
181
2.89k
{
182
2.89k
    return m_range_decoder_code == 0;
183
2.89k
}
184
185
bool LzmaDecompressor::has_reached_expected_data_size() const
186
24.3M
{
187
24.3M
    if (!m_options.uncompressed_size.has_value())
188
6.22M
        return false;
189
190
18.1M
    return m_total_processed_bytes >= m_options.uncompressed_size.value();
191
24.3M
}
192
193
ErrorOr<void> LzmaDecompressor::initialize_range_decoder()
194
4.40k
{
195
    // "The LZMA Encoder always writes ZERO in initial byte of compressed stream.
196
    //  That scheme allows to simplify the code of the Range Encoder in the
197
    //  LZMA Encoder. If initial byte is not equal to ZERO, the LZMA Decoder must
198
    //  stop decoding and report error."
199
4.40k
    {
200
4.40k
        auto byte = TRY(m_stream->read_value<u8>());
201
4.29k
        if (byte != 0)
202
13
            return Error::from_string_literal("Initial byte of data stream is not zero");
203
4.29k
    }
204
205
    // Read the initial bytes into the range decoder.
206
4.27k
    m_range_decoder_code = 0;
207
21.3k
    for (size_t i = 0; i < 4; i++) {
208
17.1k
        auto byte = TRY(m_stream->read_value<u8>());
209
17.1k
        m_range_decoder_code = m_range_decoder_code << 8 | byte;
210
17.1k
    }
211
212
4.27k
    m_range_decoder_range = 0xFFFFFFFF;
213
214
4.27k
    return {};
215
4.27k
}
216
217
ErrorOr<void> LzmaDecompressor::append_input_stream(MaybeOwned<Stream> stream, Optional<u64> uncompressed_size)
218
0
{
219
0
    m_stream = move(stream);
220
221
0
    TRY(initialize_range_decoder());
222
223
0
    if (m_options.uncompressed_size.has_value() != uncompressed_size.has_value())
224
0
        return Error::from_string_literal("Appending LZMA streams with mismatching uncompressed size status");
225
226
0
    if (uncompressed_size.has_value())
227
0
        *m_options.uncompressed_size += *uncompressed_size;
228
229
0
    return {};
230
0
}
231
232
ErrorOr<void> LzmaDecompressor::normalize_range_decoder()
233
163M
{
234
    // "The Normalize() function keeps the "Range" value in described range."
235
236
163M
    if (m_range_decoder_range >= minimum_range_value)
237
160M
        return {};
238
239
2.98M
    m_range_decoder_range <<= 8;
240
2.98M
    m_range_decoder_code <<= 8;
241
242
2.98M
    m_range_decoder_code |= TRY(m_stream->read_value<u8>());
243
244
2.98M
    VERIFY(m_range_decoder_range >= minimum_range_value);
245
246
2.98M
    return {};
247
2.98M
}
248
249
ErrorOr<void> LzmaCompressor::shift_range_encoder()
250
2.56M
{
251
2.56M
    if ((m_range_encoder_code >> 32) == 0x01) {
252
        // If there is an overflow, we can finalize the chain we were previously building.
253
        // This includes incrementing both the cached byte and all the 0xFF bytes that we generate.
254
882k
        VERIFY(m_range_encoder_cached_byte != 0xFF);
255
1.76M
        TRY(m_stream->write_value<u8>(m_range_encoder_cached_byte + 1));
256
1.76M
        for (size_t i = 0; i < m_range_encoder_ff_chain_length; i++)
257
3.49k
            TRY(m_stream->write_value<u8>(0x00));
258
1.76M
        m_range_encoder_ff_chain_length = 0;
259
882k
        m_range_encoder_cached_byte = (m_range_encoder_code >> 24);
260
1.68M
    } else if ((m_range_encoder_code >> 24) == 0xFF) {
261
        // If the byte to flush is 0xFF, it can potentially propagate an overflow and needs to be added to the chain.
262
13.6k
        m_range_encoder_ff_chain_length++;
263
1.67M
    } else {
264
        // If the byte to flush isn't 0xFF, any future overflows will not be propagated beyond this point,
265
        // so we can be sure that the built chain doesn't change anymore.
266
1.67M
        TRY(m_stream->write_value<u8>(m_range_encoder_cached_byte));
267
1.68M
        for (size_t i = 0; i < m_range_encoder_ff_chain_length; i++)
268
10.1k
            TRY(m_stream->write_value<u8>(0xFF));
269
1.67M
        m_range_encoder_ff_chain_length = 0;
270
1.67M
        m_range_encoder_cached_byte = (m_range_encoder_code >> 24);
271
1.67M
    }
272
273
    // In all three cases we now recorded the highest byte in some way, so we can shift it away and shift in a null byte as the lowest byte.
274
2.56M
    m_range_encoder_range <<= 8;
275
2.56M
    m_range_encoder_code <<= 8;
276
277
    // Since we are working with a 64-bit code, we need to limit it to 32 bits artificially.
278
2.56M
    m_range_encoder_code &= 0xFFFFFFFF;
279
280
2.56M
    return {};
281
2.56M
}
282
283
ErrorOr<void> LzmaCompressor::normalize_range_encoder()
284
35.0M
{
285
35.0M
    u64 const maximum_range_value = m_range_encoder_code + m_range_encoder_range;
286
287
    // Logically, we should only ever build up an overflow that is smaller than or equal to 0x01.
288
35.0M
    VERIFY((maximum_range_value >> 32) <= 0x01);
289
290
35.0M
    if (m_range_encoder_range >= minimum_range_value)
291
32.4M
        return {};
292
293
35.0M
    TRY(shift_range_encoder());
294
295
5.12M
    VERIFY(m_range_encoder_range >= minimum_range_value);
296
297
2.56M
    return {};
298
5.12M
}
299
300
ErrorOr<u8> LzmaDecompressor::decode_direct_bit()
301
4.90M
{
302
4.90M
    dbgln_if(LZMA_DEBUG, "Decoding direct bit {} with code = {:#x}, range = {:#x}", 1 - ((m_range_decoder_code - (m_range_decoder_range >> 1)) >> 31), m_range_decoder_code, m_range_decoder_range);
303
304
4.90M
    m_range_decoder_range >>= 1;
305
4.90M
    m_range_decoder_code -= m_range_decoder_range;
306
307
4.90M
    u32 temp = 0 - (m_range_decoder_code >> 31);
308
309
4.90M
    m_range_decoder_code += m_range_decoder_range & temp;
310
311
4.90M
    if (m_range_decoder_code == m_range_decoder_range)
312
1
        return Error::from_string_literal("Reached an invalid state while decoding LZMA stream");
313
314
4.90M
    TRY(normalize_range_decoder());
315
316
4.90M
    return temp + 1;
317
4.90M
}
318
319
ErrorOr<void> LzmaCompressor::encode_direct_bit(u8 value)
320
4.87M
{
321
4.87M
    dbgln_if(LZMA_DEBUG, "Encoding direct bit {} with code = {:#x}, range = {:#x}", value, m_range_encoder_code, m_range_encoder_range);
322
323
4.87M
    m_range_encoder_range >>= 1;
324
325
4.87M
    if (value != 0)
326
2.40M
        m_range_encoder_code += m_range_encoder_range;
327
328
4.87M
    TRY(normalize_range_encoder());
329
330
4.87M
    return {};
331
4.87M
}
332
333
ErrorOr<u8> LzmaDecompressor::decode_bit_with_probability(Probability& probability)
334
158M
{
335
    // "The LZMA decoder provides the pointer to CProb variable that contains
336
    //  information about estimated probability for symbol 0 and the Range Decoder
337
    //  updates that CProb variable after decoding."
338
339
158M
    u32 bound = (m_range_decoder_range >> probability_bit_count) * probability;
340
341
158M
    dbgln_if(LZMA_DEBUG, "Decoding bit {} with probability = {:#x}, bound = {:#x}, code = {:#x}, range = {:#x}", m_range_decoder_code < bound ? 0 : 1, probability, bound, m_range_decoder_code, m_range_decoder_range);
342
343
158M
    if (m_range_decoder_code < bound) {
344
18.5M
        probability += ((1 << probability_bit_count) - probability) >> probability_shift_width;
345
18.5M
        m_range_decoder_range = bound;
346
18.5M
        TRY(normalize_range_decoder());
347
18.5M
        return 0;
348
140M
    } else {
349
140M
        probability -= probability >> probability_shift_width;
350
140M
        m_range_decoder_code -= bound;
351
140M
        m_range_decoder_range -= bound;
352
140M
        TRY(normalize_range_decoder());
353
140M
        return 1;
354
140M
    }
355
158M
}
356
357
ErrorOr<void> LzmaCompressor::encode_bit_with_probability(Probability& probability, u8 value)
358
30.1M
{
359
30.1M
    u32 bound = (m_range_encoder_range >> probability_bit_count) * probability;
360
361
30.1M
    dbgln_if(LZMA_DEBUG, "Encoding bit {} with probability = {:#x}, bound = {:#x}, code = {:#x}, range = {:#x}", value, probability, bound, m_range_encoder_code, m_range_encoder_range);
362
363
30.1M
    if (value == 0) {
364
15.3M
        probability += ((1 << probability_bit_count) - probability) >> probability_shift_width;
365
15.3M
        m_range_encoder_range = bound;
366
15.3M
    } else {
367
14.8M
        probability -= probability >> probability_shift_width;
368
14.8M
        m_range_encoder_code += bound;
369
14.8M
        m_range_encoder_range -= bound;
370
14.8M
    }
371
372
30.1M
    TRY(normalize_range_encoder());
373
30.1M
    return {};
374
30.1M
}
375
376
ErrorOr<u16> LzmaDecompressor::decode_symbol_using_bit_tree(size_t bit_count, Span<Probability> probability_tree)
377
11.1M
{
378
11.1M
    VERIFY(bit_count <= sizeof(u16) * 8);
379
11.1M
    VERIFY(probability_tree.size() >= 1ul << bit_count);
380
381
    // This has been modified from the reference implementation to unlink the result and the tree index,
382
    // which should allow for better readability.
383
384
11.1M
    u16 result = 0;
385
11.1M
    size_t tree_index = 1;
386
387
89.9M
    for (size_t i = 0; i < bit_count; i++) {
388
78.7M
        u16 next_bit = TRY(decode_bit_with_probability(probability_tree[tree_index]));
389
78.7M
        result = (result << 1) | next_bit;
390
78.7M
        tree_index = (tree_index << 1) | next_bit;
391
78.7M
    }
392
393
11.1M
    dbgln_if(LZMA_DEBUG, "Decoded value {:#x} with {} bits using bit tree", result, bit_count);
394
395
11.1M
    return result;
396
11.1M
}
397
398
ErrorOr<void> LzmaCompressor::encode_symbol_using_bit_tree(size_t bit_count, Span<Probability> probability_tree, u16 value)
399
2.76M
{
400
2.76M
    VERIFY(bit_count <= sizeof(u16) * 8);
401
2.76M
    VERIFY(probability_tree.size() >= 1ul << bit_count);
402
2.76M
    VERIFY(value <= (1 << bit_count) - 1);
403
404
2.76M
    auto original_value = value;
405
406
    // Shift value to make the first sent byte the most significant bit. This makes the shifting logic a lot easier to read.
407
2.76M
    value <<= sizeof(u16) * 8 - bit_count;
408
409
2.76M
    size_t tree_index = 1;
410
411
14.7M
    for (size_t i = 0; i < bit_count; i++) {
412
11.9M
        u8 const next_bit = (value & 0x8000) >> (sizeof(u16) * 8 - 1);
413
11.9M
        value <<= 1;
414
11.9M
        TRY(encode_bit_with_probability(probability_tree[tree_index], next_bit));
415
11.9M
        tree_index = (tree_index << 1) | next_bit;
416
11.9M
    }
417
418
2.76M
    dbgln_if(LZMA_DEBUG, "Encoded value {:#x} with {} bits using bit tree", original_value, bit_count);
419
420
2.76M
    return {};
421
2.76M
}
422
423
ErrorOr<u16> LzmaDecompressor::decode_symbol_using_reverse_bit_tree(size_t bit_count, Span<Probability> probability_tree)
424
811k
{
425
811k
    VERIFY(bit_count <= sizeof(u16) * 8);
426
811k
    VERIFY(probability_tree.size() >= 1ul << bit_count);
427
428
811k
    u16 result = 0;
429
811k
    size_t tree_index = 1;
430
431
4.02M
    for (size_t i = 0; i < bit_count; i++) {
432
3.21M
        u16 next_bit = TRY(decode_bit_with_probability(probability_tree[tree_index]));
433
3.21M
        result |= next_bit << i;
434
3.21M
        tree_index = (tree_index << 1) | next_bit;
435
3.21M
    }
436
437
811k
    dbgln_if(LZMA_DEBUG, "Decoded value {:#x} with {} bits using reverse bit tree", result, bit_count);
438
439
811k
    return result;
440
811k
}
441
442
ErrorOr<void> LzmaCompressor::encode_symbol_using_reverse_bit_tree(size_t bit_count, Span<Probability> probability_tree, u16 value)
443
804k
{
444
804k
    VERIFY(bit_count <= sizeof(u16) * 8);
445
804k
    VERIFY(probability_tree.size() >= 1ul << bit_count);
446
804k
    VERIFY(value <= (1 << bit_count) - 1);
447
448
804k
    auto original_value = value;
449
450
804k
    size_t tree_index = 1;
451
452
3.99M
    for (size_t i = 0; i < bit_count; i++) {
453
3.19M
        u8 const next_bit = value & 1;
454
3.19M
        value >>= 1;
455
3.19M
        TRY(encode_bit_with_probability(probability_tree[tree_index], next_bit));
456
3.19M
        tree_index = (tree_index << 1) | next_bit;
457
3.19M
    }
458
459
804k
    dbgln_if(LZMA_DEBUG, "Encoded value {:#x} with {} bits using reverse bit tree", original_value, bit_count);
460
461
804k
    return {};
462
804k
}
463
464
ErrorOr<void> LzmaDecompressor::decode_literal_to_output_buffer()
465
1.07M
{
466
1.07M
    u8 previous_byte = 0;
467
1.07M
    if (m_dictionary->seekback_limit() > 0) {
468
1.07M
        auto read_bytes = MUST(m_dictionary->read_with_seekback({ &previous_byte, sizeof(previous_byte) }, 1));
469
1.07M
        VERIFY(read_bytes.size() == sizeof(previous_byte));
470
1.07M
    }
471
472
    // "To select the table for decoding it uses the context that consists of
473
    //  (lc) high bits from previous literal and (lp) low bits from value that
474
    //  represents current position in outputStream."
475
1.07M
    u16 literal_state_bits_from_position = m_total_processed_bytes & ((1 << m_options.literal_position_bits) - 1);
476
1.07M
    u16 literal_state_bits_from_output = previous_byte >> (8 - m_options.literal_context_bits);
477
1.07M
    u16 literal_state = literal_state_bits_from_position << m_options.literal_context_bits | literal_state_bits_from_output;
478
479
1.07M
    Span<Probability> selected_probability_table = m_literal_probabilities.span().slice(literal_probability_table_size * literal_state, literal_probability_table_size);
480
481
    // The result is defined as u16 here and initialized to 1, but we will cut off the top bits before queueing them into the output buffer.
482
    // The top bit is only used to track how much we have decoded already, and to select the correct probability table.
483
1.07M
    u16 result = 1;
484
485
    // "If (State > 7), the Literal Decoder also uses "matchByte" that represents
486
    //  the byte in OutputStream at position the is the DISTANCE bytes before
487
    //  current position, where the DISTANCE is the distance in DISTANCE-LENGTH pair
488
    //  of latest decoded match."
489
    // Note: The specification says `(State > 7)`, but the reference implementation does `(State >= 7)`, which is a mismatch.
490
    //       Testing `(State > 7)` with actual test files yields errors, so the reference implementation appears to be the correct one.
491
1.07M
    if (m_state >= 7) {
492
113k
        u8 matched_byte = 0;
493
113k
        auto read_bytes = TRY(m_dictionary->read_with_seekback({ &matched_byte, sizeof(matched_byte) }, current_repetition_offset()));
494
113k
        VERIFY(read_bytes.size() == sizeof(matched_byte));
495
496
113k
        dbgln_if(LZMA_DEBUG, "Decoding literal using match byte {:#x}", matched_byte);
497
498
422k
        do {
499
422k
            u8 match_bit = (matched_byte >> 7) & 1;
500
422k
            matched_byte <<= 1;
501
502
422k
            u8 decoded_bit = TRY(decode_bit_with_probability(selected_probability_table[((1 + match_bit) << 8) + result]));
503
422k
            result = result << 1 | decoded_bit;
504
505
422k
            if (match_bit != decoded_bit)
506
110k
                break;
507
422k
        } while (result < 0x100);
508
113k
    }
509
510
9.28M
    while (result < 0x100)
511
8.20M
        result = (result << 1) | TRY(decode_bit_with_probability(selected_probability_table[result]));
512
513
1.07M
    u8 actual_result = result - 0x100;
514
515
1.07M
    size_t written_bytes = m_dictionary->write({ &actual_result, sizeof(actual_result) });
516
1.07M
    VERIFY(written_bytes == sizeof(actual_result));
517
1.07M
    m_total_processed_bytes += sizeof(actual_result);
518
519
1.07M
    dbgln_if(LZMA_DEBUG, "Decoded literal {:#x} in state {} using literal state {:#x} (previous byte is {:#x})", actual_result, m_state, literal_state, previous_byte);
520
521
1.07M
    return {};
522
1.07M
}
523
524
ErrorOr<void> LzmaCompressor::encode_literal(u8 literal)
525
728k
{
526
    // This function largely mirrors `decode_literal_to_output_buffer`, so specification comments have been omitted.
527
528
728k
    TRY(encode_match_type(MatchType::Literal));
529
530
    // Note: We have already read the next byte from the input buffer, so it's now in the seekback buffer, shifting all seekback offsets by one.
531
728k
    u8 previous_byte = 0;
532
728k
    if (m_dictionary->seekback_limit() - m_dictionary->used_space() > 1) {
533
725k
        auto read_bytes = MUST(m_dictionary->read_with_seekback({ &previous_byte, sizeof(previous_byte) }, 2 + m_dictionary->used_space()));
534
725k
        VERIFY(read_bytes.size() == sizeof(previous_byte));
535
725k
    }
536
728k
    u16 const literal_state_bits_from_position = m_total_processed_bytes & ((1 << m_options.literal_position_bits) - 1);
537
728k
    u16 const literal_state_bits_from_output = previous_byte >> (8 - m_options.literal_context_bits);
538
728k
    u16 const literal_state = literal_state_bits_from_position << m_options.literal_context_bits | literal_state_bits_from_output;
539
540
728k
    Span<Probability> selected_probability_table = m_literal_probabilities.span().slice(literal_probability_table_size * literal_state, literal_probability_table_size);
541
542
728k
    auto original_literal = literal;
543
728k
    u16 result = 1;
544
545
728k
    if (m_state >= 7) {
546
104k
        u8 matched_byte = 0;
547
104k
        auto read_bytes = TRY(m_dictionary->read_with_seekback({ &matched_byte, sizeof(matched_byte) }, current_repetition_offset() + m_dictionary->used_space() + 1));
548
104k
        VERIFY(read_bytes.size() == sizeof(matched_byte));
549
550
104k
        dbgln_if(LZMA_DEBUG, "Encoding literal using match byte {:#x}", matched_byte);
551
552
396k
        do {
553
396k
            u8 const match_bit = (matched_byte >> 7) & 1;
554
396k
            matched_byte <<= 1;
555
556
396k
            u8 const encoded_bit = (literal & 0x80) >> 7;
557
396k
            literal <<= 1;
558
559
396k
            TRY(encode_bit_with_probability(selected_probability_table[((1 + match_bit) << 8) + result], encoded_bit));
560
396k
            result = result << 1 | encoded_bit;
561
562
396k
            if (match_bit != encoded_bit)
563
101k
                break;
564
396k
        } while (result < 0x100);
565
104k
    }
566
567
6.16M
    while (result < 0x100) {
568
5.43M
        u8 const encoded_bit = (literal & 0x80) >> 7;
569
5.43M
        literal <<= 1;
570
571
5.43M
        TRY(encode_bit_with_probability(selected_probability_table[result], encoded_bit));
572
573
5.43M
        result = (result << 1) | encoded_bit;
574
5.43M
    }
575
576
728k
    m_total_processed_bytes += sizeof(literal);
577
578
728k
    dbgln_if(LZMA_DEBUG, "Encoded literal {:#x} in state {} using literal state {:#x} (previous byte is {:#x})", original_literal, m_state, literal_state, previous_byte);
579
580
728k
    update_state_after_literal();
581
582
728k
    return {};
583
728k
}
584
585
ErrorOr<void> LzmaCompressor::encode_existing_match(size_t real_distance, size_t real_length)
586
1.13M
{
587
1.13M
    VERIFY(real_distance >= normalized_to_real_match_distance_offset);
588
1.13M
    u32 const normalized_distance = real_distance - normalized_to_real_match_distance_offset;
589
590
1.13M
    VERIFY(real_length >= normalized_to_real_match_length_offset);
591
1.13M
    u16 const normalized_length = real_length - normalized_to_real_match_length_offset;
592
593
1.13M
    if (normalized_distance == m_rep0) {
594
1.04M
        TRY(encode_match_type(MatchType::RepMatch0));
595
1.04M
    } else if (normalized_distance == m_rep1) {
596
52.6k
        TRY(encode_match_type(MatchType::RepMatch1));
597
598
52.6k
        u32 const distance = m_rep1;
599
52.6k
        m_rep1 = m_rep0;
600
52.6k
        m_rep0 = distance;
601
52.6k
    } else if (normalized_distance == m_rep2) {
602
24.2k
        TRY(encode_match_type(MatchType::RepMatch2));
603
604
24.2k
        u32 const distance = m_rep2;
605
24.2k
        m_rep2 = m_rep1;
606
24.2k
        m_rep1 = m_rep0;
607
24.2k
        m_rep0 = distance;
608
24.2k
    } else if (normalized_distance == m_rep3) {
609
17.8k
        TRY(encode_match_type(MatchType::RepMatch3));
610
611
17.8k
        u32 const distance = m_rep3;
612
17.8k
        m_rep3 = m_rep2;
613
17.8k
        m_rep2 = m_rep1;
614
17.8k
        m_rep1 = m_rep0;
615
17.8k
        m_rep0 = distance;
616
17.8k
    } else {
617
0
        VERIFY_NOT_REACHED();
618
0
    }
619
620
2.27M
    TRY(encode_normalized_match_length(m_rep_length_coder, normalized_length));
621
2.27M
    update_state_after_rep();
622
2.27M
    MUST(m_dictionary->discard(real_length));
623
1.13M
    m_total_processed_bytes += real_length;
624
625
1.13M
    return {};
626
2.27M
}
627
628
ErrorOr<void> LzmaCompressor::encode_new_match(size_t real_distance, size_t real_length)
629
812k
{
630
812k
    VERIFY(real_distance >= normalized_to_real_match_distance_offset);
631
812k
    u32 const normalized_distance = real_distance - normalized_to_real_match_distance_offset;
632
633
812k
    VERIFY(real_length >= normalized_to_real_match_length_offset);
634
812k
    u16 const normalized_length = real_length - normalized_to_real_match_length_offset;
635
636
812k
    TRY(encode_normalized_simple_match(normalized_distance, normalized_length));
637
638
812k
    MUST(m_dictionary->discard(real_length));
639
812k
    m_total_processed_bytes += real_length;
640
641
812k
    return {};
642
812k
}
643
644
ErrorOr<void> LzmaCompressor::encode_normalized_simple_match(u32 normalized_distance, u16 normalized_length)
645
815k
{
646
815k
    TRY(encode_match_type(MatchType::SimpleMatch));
647
648
815k
    m_rep3 = m_rep2;
649
815k
    m_rep2 = m_rep1;
650
815k
    m_rep1 = m_rep0;
651
652
815k
    TRY(encode_normalized_match_length(m_length_coder, normalized_length));
653
654
815k
    update_state_after_match();
655
656
815k
    TRY(encode_normalized_match_distance(normalized_length, normalized_distance));
657
815k
    m_rep0 = normalized_distance;
658
659
815k
    return {};
660
815k
}
661
662
LzmaState::LzmaLengthCoderState::LzmaLengthCoderState()
663
14.2k
{
664
14.2k
    for (auto& array : m_low_length_probabilities)
665
228k
        initialize_to_default_probability(array);
666
667
14.2k
    for (auto& array : m_medium_length_probabilities)
668
228k
        initialize_to_default_probability(array);
669
670
14.2k
    initialize_to_default_probability(m_high_length_probabilities);
671
14.2k
}
672
673
ErrorOr<u16> LzmaDecompressor::decode_normalized_match_length(LzmaLengthCoderState& length_decoder_state)
674
10.3M
{
675
    // "LZMA uses "posState" value as context to select the binary tree
676
    //  from LowCoder and MidCoder binary tree arrays:"
677
10.3M
    u16 position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1);
678
679
    // "The following scheme is used for the match length encoding:
680
    //
681
    //   Binary encoding    Binary Tree structure    Zero-based match length
682
    //   sequence                                    (binary + decimal):
683
    //
684
    //   0 xxx              LowCoder[posState]       xxx
685
20.6M
    if (TRY(decode_bit_with_probability(length_decoder_state.m_first_choice_probability)) == 0)
686
10.3M
        return TRY(decode_symbol_using_bit_tree(3, length_decoder_state.m_low_length_probabilities[position_state].span()));
687
688
    //   1 0 yyy            MidCoder[posState]       yyy + 8
689
17.2M
    if (TRY(decode_bit_with_probability(length_decoder_state.m_second_choice_probability)) == 0)
690
28.2k
        return TRY(decode_symbol_using_bit_tree(3, length_decoder_state.m_medium_length_probabilities[position_state].span())) + 8;
691
692
    //   1 1 zzzzzzzz       HighCoder                zzzzzzzz + 16"
693
8.58M
    return TRY(decode_symbol_using_bit_tree(8, length_decoder_state.m_high_length_probabilities.span())) + 16;
694
8.58M
}
695
696
ErrorOr<void> LzmaCompressor::encode_normalized_match_length(LzmaLengthCoderState& length_coder_state, u16 normalized_length)
697
1.95M
{
698
1.95M
    u16 const position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1);
699
700
1.95M
    if (normalized_length < 8) {
701
1.68M
        TRY(encode_bit_with_probability(length_coder_state.m_first_choice_probability, 0));
702
1.68M
        TRY(encode_symbol_using_bit_tree(3, length_coder_state.m_low_length_probabilities[position_state].span(), normalized_length));
703
1.68M
        return {};
704
1.68M
    }
705
706
1.95M
    TRY(encode_bit_with_probability(length_coder_state.m_first_choice_probability, 1));
707
708
538k
    if (normalized_length < 16) {
709
23.8k
        TRY(encode_bit_with_probability(length_coder_state.m_second_choice_probability, 0));
710
23.8k
        TRY(encode_symbol_using_bit_tree(3, length_coder_state.m_medium_length_probabilities[position_state].span(), normalized_length - 8));
711
23.8k
        return {};
712
23.8k
    }
713
714
491k
    TRY(encode_bit_with_probability(length_coder_state.m_second_choice_probability, 1));
715
491k
    TRY(encode_symbol_using_bit_tree(8, length_coder_state.m_high_length_probabilities.span(), normalized_length - 16));
716
245k
    return {};
717
491k
}
718
719
ErrorOr<u32> LzmaDecompressor::decode_normalized_match_distance(u16 normalized_match_length)
720
823k
{
721
    // "LZMA uses normalized match length (zero-based length)
722
    //  to calculate the context state "lenState" do decode the distance value."
723
823k
    u16 length_state = min(normalized_match_length, number_of_length_to_position_states - 1);
724
725
    // "At first stage the distance decoder decodes 6-bit "posSlot" value with bit
726
    //  tree decoder from PosSlotDecoder array."
727
823k
    u16 position_slot = TRY(decode_symbol_using_bit_tree(6, m_length_to_position_states[length_state].span()));
728
729
    // "The encoding scheme for distance value is shown in the following table:
730
    //
731
    //  posSlot (decimal) /
732
    //       zero-based distance (binary)
733
    //  0    0
734
    //  1    1
735
    //  2    10
736
    //  3    11
737
    //
738
    //  4    10 x
739
    //  5    11 x
740
    //  6    10 xx
741
    //  7    11 xx
742
    //  8    10 xxx
743
    //  9    11 xxx
744
    //  10    10 xxxx
745
    //  11    11 xxxx
746
    //  12    10 xxxxx
747
    //  13    11 xxxxx
748
    //
749
    //  14    10 yy zzzz
750
    //  15    11 yy zzzz
751
    //  16    10 yyy zzzz
752
    //  17    11 yyy zzzz
753
    //  ...
754
    //  62    10 yyyyyyyyyyyyyyyyyyyyyyyyyy zzzz
755
    //  63    11 yyyyyyyyyyyyyyyyyyyyyyyyyy zzzz
756
    //
757
    //  where
758
    //   "x ... x" means the sequence of binary symbols encoded with binary tree and
759
    //       "Reverse" scheme. It uses separated binary tree for each posSlot from 4 to 13.
760
    //   "y" means direct bit encoded with range coder.
761
    //   "zzzz" means the sequence of four binary symbols encoded with binary
762
    //       tree with "Reverse" scheme, where one common binary tree "AlignDecoder"
763
    //       is used for all posSlot values."
764
765
    // "If (posSlot < 4), the "dist" value is equal to posSlot value."
766
822k
    if (position_slot < first_position_slot_with_binary_tree_bits)
767
11.6k
        return position_slot;
768
769
    // From here on, the first bit of the distance is always set and the second bit is set if the last bit of the position slot is set.
770
811k
    u32 distance_prefix = ((1 << 1) | ((position_slot & 1) << 0));
771
772
    // "If (posSlot >= 4), the decoder uses "posSlot" value to calculate the value of
773
    //   the high bits of "dist" value and the number of the low bits.
774
    //   If (4 <= posSlot < kEndPosModelIndex), the decoder uses bit tree decoders.
775
    //     (one separated bit tree decoder per one posSlot value) and "Reverse" scheme."
776
811k
    if (position_slot < first_position_slot_with_direct_encoded_bits) {
777
82.8k
        size_t number_of_bits_to_decode = (position_slot / 2) - 1;
778
82.8k
        auto& selected_probability_tree = m_binary_tree_distance_probabilities[position_slot - first_position_slot_with_binary_tree_bits];
779
82.8k
        return (distance_prefix << number_of_bits_to_decode) | TRY(decode_symbol_using_reverse_bit_tree(number_of_bits_to_decode, selected_probability_tree));
780
82.8k
    }
781
782
    // "  if (posSlot >= kEndPosModelIndex), the middle bits are decoded as direct
783
    //     bits from RangeDecoder and the low 4 bits are decoded with a bit tree
784
    //     decoder "AlignDecoder" with "Reverse" scheme."
785
728k
    size_t number_of_direct_bits_to_decode = ((position_slot - first_position_slot_with_direct_encoded_bits) / 2) + 2;
786
5.62M
    for (size_t i = 0; i < number_of_direct_bits_to_decode; i++) {
787
4.90M
        distance_prefix = (distance_prefix << 1) | TRY(decode_direct_bit());
788
4.90M
    }
789
728k
    return (distance_prefix << number_of_alignment_bits) | TRY(decode_symbol_using_reverse_bit_tree(number_of_alignment_bits, m_alignment_bit_probabilities));
790
728k
}
791
792
ErrorOr<void> LzmaCompressor::encode_normalized_match_distance(u16 normalized_match_length, u32 normalized_match_distance)
793
815k
{
794
815k
    u16 const length_state = min(normalized_match_length, number_of_length_to_position_states - 1);
795
796
815k
    if (normalized_match_distance < first_position_slot_with_binary_tree_bits) {
797
        // The normalized distance gets encoded as the position slot.
798
10.1k
        TRY(encode_symbol_using_bit_tree(6, m_length_to_position_states[length_state].span(), normalized_match_distance));
799
10.1k
        return {};
800
10.1k
    }
801
802
    // Note: This has been deduced, there is no immediate relation to the decoding function.
803
804k
    u16 const distance_log2 = AK::log2(normalized_match_distance);
804
804k
    u16 number_of_distance_bits = count_required_bits(normalized_match_distance);
805
804k
    u16 const position_slot = (distance_log2 << 1) + ((normalized_match_distance >> (distance_log2 - 1)) & 1);
806
807
804k
    TRY(encode_symbol_using_bit_tree(6, m_length_to_position_states[length_state].span(), position_slot));
808
809
    // Mask off the top two bits of the value, those are already encoded by the position slot.
810
804k
    normalized_match_distance &= (1 << (number_of_distance_bits - 2)) - 1;
811
804k
    number_of_distance_bits -= 2;
812
813
804k
    if (position_slot < first_position_slot_with_direct_encoded_bits) {
814
        // The value gets encoded using only a reverse bit tree coder.
815
80.1k
        auto& selected_probability_tree = m_binary_tree_distance_probabilities[position_slot - first_position_slot_with_binary_tree_bits];
816
80.1k
        TRY(encode_symbol_using_reverse_bit_tree(number_of_distance_bits, selected_probability_tree, normalized_match_distance));
817
80.1k
        return {};
818
80.1k
    }
819
820
    // The value is split into direct bits (everything except the last four bits) and alignment bits (last four bits).
821
724k
    auto direct_bits = normalized_match_distance & ~((1 << number_of_alignment_bits) - 1);
822
724k
    auto const alignment_bits = normalized_match_distance & ((1 << number_of_alignment_bits) - 1);
823
824
    // Shift to-be-written direct bits to the most significant position for easier access.
825
724k
    direct_bits <<= sizeof(direct_bits) * 8 - number_of_distance_bits;
826
827
5.60M
    for (auto i = 0u; i < number_of_distance_bits - number_of_alignment_bits; i++) {
828
4.87M
        TRY(encode_direct_bit((direct_bits & 0x80000000) ? 1 : 0));
829
4.87M
        direct_bits <<= 1;
830
4.87M
    }
831
832
724k
    TRY(encode_symbol_using_reverse_bit_tree(number_of_alignment_bits, m_alignment_bit_probabilities, alignment_bits));
833
834
724k
    return {};
835
724k
}
836
837
u32 LzmaState::current_repetition_offset() const
838
11.5M
{
839
    // LZMA never needs to read at offset 0 (i.e. the actual read head of the buffer).
840
    // Instead, the values are remapped so that the rep-value n starts reading n + 1 bytes back.
841
    // The special rep-value 0xFFFFFFFF is reserved for marking the end of the stream,
842
    // so this should never overflow.
843
11.5M
    VERIFY(m_rep0 <= NumericLimits<u32>::max() - normalized_to_real_match_distance_offset);
844
11.5M
    return m_rep0 + normalized_to_real_match_distance_offset;
845
11.5M
}
846
847
void LzmaState::update_state_after_literal()
848
1.80M
{
849
1.80M
    if (m_state < 4)
850
1.48M
        m_state = 0;
851
319k
    else if (m_state < 10)
852
251k
        m_state -= 3;
853
67.9k
    else
854
67.9k
        m_state -= 6;
855
1.80M
}
856
857
void LzmaState::update_state_after_match()
858
1.63M
{
859
1.63M
    if (m_state < 7)
860
133k
        m_state = 7;
861
1.50M
    else
862
1.50M
        m_state = 10;
863
1.63M
}
864
865
void LzmaState::update_state_after_rep()
866
10.6M
{
867
10.6M
    if (m_state < 7)
868
88.5k
        m_state = 8;
869
10.5M
    else
870
10.5M
        m_state = 11;
871
10.6M
}
872
873
void LzmaState::update_state_after_short_rep()
874
3.45k
{
875
3.45k
    if (m_state < 7)
876
1.83k
        m_state = 9;
877
1.62k
    else
878
1.62k
        m_state = 11;
879
3.45k
}
880
881
ErrorOr<LzmaDecompressor::MatchType> LzmaDecompressor::decode_match_type()
882
11.3M
{
883
    // "The decoder calculates "state2" variable value to select exact variable from
884
    //  "IsMatch" and "IsRep0Long" arrays."
885
11.3M
    u16 position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1);
886
11.3M
    u16 state2 = (m_state << maximum_number_of_position_bits) + position_state;
887
888
    // "The decoder uses the following code flow scheme to select exact
889
    //  type of LITERAL or MATCH:
890
    //
891
    //  IsMatch[state2] decode
892
    //   0 - the Literal"
893
22.7M
    if (TRY(decode_bit_with_probability(m_is_match_probabilities[state2])) == 0) {
894
1.07M
        dbgln_if(LZMA_DEBUG, "Decoded match type 'Literal'");
895
1.07M
        return MatchType::Literal;
896
1.07M
    }
897
898
    // " 1 - the Match
899
    //     IsRep[state] decode
900
    //       0 - Simple Match"
901
20.6M
    if (TRY(decode_bit_with_probability(m_is_rep_probabilities[m_state])) == 0) {
902
823k
        dbgln_if(LZMA_DEBUG, "Decoded match type 'SimpleMatch'");
903
823k
        return MatchType::SimpleMatch;
904
823k
    }
905
906
    // "     1 - Rep Match
907
    //         IsRepG0[state] decode
908
    //           0 - the distance is rep0"
909
18.9M
    if (TRY(decode_bit_with_probability(m_is_rep_g0_probabilities[m_state])) == 0) {
910
        // "       IsRep0Long[state2] decode
911
        //           0 - Short Rep Match"
912
2.09M
        if (TRY(decode_bit_with_probability(m_is_rep0_long_probabilities[state2])) == 0) {
913
3.45k
            dbgln_if(LZMA_DEBUG, "Decoded match type 'ShortRepMatch'");
914
3.45k
            return MatchType::ShortRepMatch;
915
3.45k
        }
916
917
        // "         1 - Rep Match 0"
918
1.04M
        dbgln_if(LZMA_DEBUG, "Decoded match type 'RepMatch0'");
919
1.04M
        return MatchType::RepMatch0;
920
1.04M
    }
921
922
    // "         1 -
923
    //             IsRepG1[state] decode
924
    //               0 - Rep Match 1"
925
16.8M
    if (TRY(decode_bit_with_probability(m_is_rep_g1_probabilities[m_state])) == 0) {
926
55.3k
        dbgln_if(LZMA_DEBUG, "Decoded match type 'RepMatch1'");
927
55.3k
        return MatchType::RepMatch1;
928
55.3k
    }
929
930
    // "             1 -
931
    //                 IsRepG2[state] decode
932
    //                   0 - Rep Match 2"
933
16.7M
    if (TRY(decode_bit_with_probability(m_is_rep_g2_probabilities[m_state])) == 0) {
934
25.9k
        dbgln_if(LZMA_DEBUG, "Decoded match type 'RepMatch2'");
935
25.9k
        return MatchType::RepMatch2;
936
25.9k
    }
937
938
    // "                 1 - Rep Match 3"
939
8.35M
    dbgln_if(LZMA_DEBUG, "Decoded match type 'RepMatch3'");
940
8.35M
    return MatchType::RepMatch3;
941
8.38M
}
942
943
ErrorOr<void> LzmaCompressor::encode_match_type(MatchType match_type)
944
2.68M
{
945
2.68M
    u16 position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1);
946
2.68M
    u16 state2 = (m_state << maximum_number_of_position_bits) + position_state;
947
948
2.68M
    if (match_type == MatchType::Literal) {
949
728k
        TRY(encode_bit_with_probability(m_is_match_probabilities[state2], 0));
950
728k
        dbgln_if(LZMA_DEBUG, "Encoded match type 'Literal'");
951
728k
        return {};
952
728k
    }
953
3.90M
    TRY(encode_bit_with_probability(m_is_match_probabilities[state2], 1));
954
955
3.90M
    if (match_type == MatchType::SimpleMatch) {
956
815k
        TRY(encode_bit_with_probability(m_is_rep_probabilities[m_state], 0));
957
815k
        dbgln_if(LZMA_DEBUG, "Encoded match type 'SimpleMatch'");
958
815k
        return {};
959
815k
    }
960
2.27M
    TRY(encode_bit_with_probability(m_is_rep_probabilities[m_state], 1));
961
962
2.27M
    if (match_type == MatchType::ShortRepMatch || match_type == MatchType::RepMatch0) {
963
1.04M
        TRY(encode_bit_with_probability(m_is_rep_g0_probabilities[m_state], 0));
964
1.04M
        TRY(encode_bit_with_probability(m_is_rep0_long_probabilities[state2], match_type == MatchType::RepMatch0));
965
        if constexpr (LZMA_DEBUG) {
966
            if (match_type == RepMatch0)
967
                dbgln("Encoded match type 'RepMatch0'");
968
            else
969
                dbgln("Encoded match type 'ShortRepMatch'");
970
        }
971
1.04M
        return {};
972
1.04M
    }
973
1.13M
    TRY(encode_bit_with_probability(m_is_rep_g0_probabilities[m_state], 1));
974
975
189k
    if (match_type == MatchType::RepMatch1) {
976
52.6k
        TRY(encode_bit_with_probability(m_is_rep_g1_probabilities[m_state], 0));
977
52.6k
        dbgln_if(LZMA_DEBUG, "Encoded match type 'RepMatch1'");
978
52.6k
        return {};
979
52.6k
    }
980
94.7k
    TRY(encode_bit_with_probability(m_is_rep_g1_probabilities[m_state], 1));
981
982
84.2k
    if (match_type == MatchType::RepMatch2) {
983
24.2k
        TRY(encode_bit_with_probability(m_is_rep_g2_probabilities[m_state], 0));
984
24.2k
        dbgln_if(LZMA_DEBUG, "Encoded match type 'RepMatch2'");
985
24.2k
        return {};
986
24.2k
    }
987
42.1k
    TRY(encode_bit_with_probability(m_is_rep_g2_probabilities[m_state], 1));
988
17.8k
    dbgln_if(LZMA_DEBUG, "Encoded match type 'RepMatch3'");
989
17.8k
    return {};
990
17.8k
}
991
992
ErrorOr<void> LzmaCompressor::encode_once()
993
2.67M
{
994
    // Check if any of our existing match distances are currently usable.
995
2.67M
    Vector<size_t> const existing_distances {
996
2.67M
        m_rep0 + normalized_to_real_match_distance_offset,
997
2.67M
        m_rep1 + normalized_to_real_match_distance_offset,
998
2.67M
        m_rep2 + normalized_to_real_match_distance_offset,
999
2.67M
        m_rep3 + normalized_to_real_match_distance_offset,
1000
2.67M
    };
1001
2.67M
    auto existing_distance_result = m_dictionary->find_copy_in_seekback(existing_distances, m_dictionary->used_space(), normalized_to_real_match_length_offset);
1002
1003
2.67M
    if (existing_distance_result.has_value()) {
1004
1.13M
        auto selected_match = existing_distance_result.release_value();
1005
1.13M
        TRY(encode_existing_match(selected_match.distance, selected_match.length));
1006
1.13M
        return {};
1007
1.13M
    }
1008
1009
    // If we weren't able to find any viable existing offsets, we now have to search the rest of the dictionary for possible new offsets.
1010
1.54M
    auto new_distance_result = m_dictionary->find_copy_in_seekback(m_dictionary->used_space(), normalized_to_real_match_length_offset);
1011
1012
1.54M
    if (new_distance_result.has_value()) {
1013
812k
        auto selected_match = new_distance_result.release_value();
1014
812k
        TRY(encode_new_match(selected_match.distance, selected_match.length));
1015
812k
        return {};
1016
812k
    }
1017
1018
    // If we weren't able to find any matches, we don't have any other choice than to encode the next byte as a literal.
1019
728k
    u8 next_byte { 0 };
1020
728k
    TRY(m_dictionary->read({ &next_byte, sizeof(next_byte) }));
1021
728k
    TRY(encode_literal(next_byte));
1022
728k
    return {};
1023
728k
}
1024
1025
ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)
1026
575k
{
1027
12.1M
    while (m_dictionary->used_space() < bytes.size() && m_dictionary->empty_space() != 0) {
1028
11.6M
        if (m_found_end_of_stream_marker)
1029
2.86k
            break;
1030
1031
11.6M
        if (has_reached_expected_data_size()) {
1032
            // If the decoder is in a clean state, we assume that this is fine.
1033
85
            if (is_range_decoder_in_clean_state())
1034
37
                break;
1035
1036
            // Otherwise, we give it one last try to find the end marker in the remaining data.
1037
85
        }
1038
1039
11.6M
        auto copy_match_to_buffer = [&](u16 real_length) -> ErrorOr<void> {
1040
10.5M
            VERIFY(!m_leftover_match_length.has_value());
1041
1042
10.5M
            if (m_options.uncompressed_size.has_value() && m_options.uncompressed_size.value() < m_total_processed_bytes + real_length)
1043
4
                return Error::from_string_literal("Tried to copy match beyond expected uncompressed file size");
1044
1045
10.5M
            auto copied_length = TRY(m_dictionary->copy_from_seekback(current_repetition_offset(), real_length));
1046
1047
10.5M
            m_total_processed_bytes += copied_length;
1048
10.5M
            real_length -= copied_length;
1049
1050
10.5M
            if (real_length > 0)
1051
212k
                m_leftover_match_length = real_length;
1052
1053
10.5M
            return {};
1054
10.5M
        };
1055
1056
        // If we have a leftover part of a repeating match, we should finish that first.
1057
11.6M
        if (m_leftover_match_length.has_value()) {
1058
212k
            TRY(copy_match_to_buffer(m_leftover_match_length.release_value()));
1059
212k
            continue;
1060
212k
        }
1061
1062
11.3M
        auto const match_type = TRY(decode_match_type());
1063
1064
        // If we are looking for EOS, but find another match type, the stream is also corrupted.
1065
11.3M
        if (has_reached_expected_data_size() && match_type != MatchType::SimpleMatch)
1066
41
            return Error::from_string_literal("First match type after the expected uncompressed size is not a simple match");
1067
1068
11.3M
        if (match_type == MatchType::Literal) {
1069
            // "At first the LZMA decoder must check that it doesn't exceed
1070
            //  specified uncompressed size."
1071
            // This is already checked for at the beginning of the loop.
1072
1073
            // "Then it decodes literal value and puts it to sliding window."
1074
1.07M
            TRY(decode_literal_to_output_buffer());
1075
1076
            // "Then the decoder must update the "state" value."
1077
1.07M
            update_state_after_literal();
1078
1.07M
            continue;
1079
1.07M
        }
1080
1081
10.3M
        if (match_type == MatchType::SimpleMatch) {
1082
            // "The distance history table is updated with the following scheme:"
1083
823k
            m_rep3 = m_rep2;
1084
823k
            m_rep2 = m_rep1;
1085
823k
            m_rep1 = m_rep0;
1086
1087
            // "The zero-based length is decoded with "LenDecoder"."
1088
823k
            u16 normalized_length = TRY(decode_normalized_match_length(m_length_coder));
1089
1090
            // "The state is update with UpdateState_Match function."
1091
823k
            update_state_after_match();
1092
1093
            // "and the new "rep0" value is decoded with DecodeDistance."
1094
823k
            m_rep0 = TRY(decode_normalized_match_distance(normalized_length));
1095
1096
            // "If the value of "rep0" is equal to 0xFFFFFFFF, it means that we have
1097
            //  "End of stream" marker, so we can stop decoding and check finishing
1098
            //  condition in Range Decoder"
1099
822k
            if (m_rep0 == end_of_stream_marker) {
1100
                // If we should reject end-of-stream markers, do so now.
1101
                // Note that this is not part of LZMA, as LZMA allows end-of-stream markers in all contexts, so pure LZMA should never set this option.
1102
2.86k
                if (m_options.reject_end_of_stream_marker)
1103
0
                    return Error::from_string_literal("An end-of-stream marker was found, but the LZMA stream is configured to reject them");
1104
1105
                // The range decoder condition is checked after breaking out of the loop.
1106
2.86k
                m_found_end_of_stream_marker = true;
1107
2.86k
                continue;
1108
2.86k
            }
1109
1110
            // If we are looking for EOS, but haven't found it here, the stream is corrupted.
1111
820k
            if (has_reached_expected_data_size())
1112
1
                return Error::from_string_literal("First simple match after the expected uncompressed size is not the EOS marker");
1113
1114
            // "If uncompressed size is defined, LZMA decoder must check that it doesn't
1115
            //  exceed that specified uncompressed size."
1116
            // This is being checked for in the common "copy to buffer" implementation.
1117
1118
            // "Also the decoder must check that "rep0" value is not larger than dictionary size
1119
            //  and is not larger than the number of already decoded bytes."
1120
820k
            if (current_repetition_offset() > m_dictionary->seekback_limit())
1121
202
                return Error::from_string_literal("rep0 value is larger than the possible lookback size");
1122
1123
            // "Then the decoder must copy match bytes as described in
1124
            //  "The match symbols copying" section."
1125
820k
            TRY(copy_match_to_buffer(normalized_length + normalized_to_real_match_length_offset));
1126
1127
819k
            continue;
1128
819k
        }
1129
1130
9.48M
        if (match_type == MatchType::ShortRepMatch) {
1131
            // "LZMA doesn't update the distance history."
1132
1133
            // "If the subtype is "Short Rep Match", the decoder updates the state, puts
1134
            //  the one byte from window to current position in window and goes to next
1135
            //  MATCH/LITERAL symbol."
1136
3.45k
            update_state_after_short_rep();
1137
1138
3.45k
            TRY(copy_match_to_buffer(1));
1139
1140
3.41k
            continue;
1141
3.45k
        }
1142
1143
        // Note: We don't need to do anything specific for "Rep Match 0", we just need to make sure to not
1144
        //       run the detection for other match types and to not switch around the distance history.
1145
1146
9.48M
        if (match_type == MatchType::RepMatch1) {
1147
55.3k
            u32 distance = m_rep1;
1148
55.3k
            m_rep1 = m_rep0;
1149
55.3k
            m_rep0 = distance;
1150
55.3k
        }
1151
1152
9.48M
        if (match_type == MatchType::RepMatch2) {
1153
25.9k
            u32 distance = m_rep2;
1154
25.9k
            m_rep2 = m_rep1;
1155
25.9k
            m_rep1 = m_rep0;
1156
25.9k
            m_rep0 = distance;
1157
25.9k
        }
1158
1159
9.48M
        if (match_type == MatchType::RepMatch3) {
1160
8.35M
            u32 distance = m_rep3;
1161
8.35M
            m_rep3 = m_rep2;
1162
8.35M
            m_rep2 = m_rep1;
1163
8.35M
            m_rep1 = m_rep0;
1164
8.35M
            m_rep0 = distance;
1165
8.35M
        }
1166
1167
        // "In other cases (Rep Match 0/1/2/3), it decodes the zero-based
1168
        //  length of match with "RepLenDecoder" decoder."
1169
9.48M
        u16 normalized_length = TRY(decode_normalized_match_length(m_rep_length_coder));
1170
1171
        // "Then it updates the state."
1172
9.48M
        update_state_after_rep();
1173
1174
        // "Then the decoder must copy match bytes as described in
1175
        //  "The Match symbols copying" section."
1176
9.48M
        TRY(copy_match_to_buffer(normalized_length + normalized_to_real_match_length_offset));
1177
9.48M
    }
1178
1179
573k
    if (m_found_end_of_stream_marker || has_reached_expected_data_size()) {
1180
2.90k
        if (m_options.uncompressed_size.has_value() && m_total_processed_bytes < m_options.uncompressed_size.value())
1181
92
            return Error::from_string_literal("Found end-of-stream marker earlier than expected");
1182
1183
2.81k
        if (!is_range_decoder_in_clean_state())
1184
46
            return Error::from_string_literal("LZMA stream ends in an unclean state");
1185
2.81k
    }
1186
1187
573k
    return m_dictionary->read(bytes);
1188
573k
}
1189
1190
ErrorOr<size_t> LzmaDecompressor::write_some(ReadonlyBytes)
1191
0
{
1192
0
    return Error::from_errno(EBADF);
1193
0
}
1194
1195
bool LzmaDecompressor::is_eof() const
1196
1.13M
{
1197
1.13M
    if (m_dictionary->used_space() > 0)
1198
1.11M
        return false;
1199
1200
15.7k
    if (has_reached_expected_data_size())
1201
47
        return true;
1202
1203
15.7k
    return m_found_end_of_stream_marker;
1204
15.7k
}
1205
1206
bool LzmaDecompressor::is_open() const
1207
0
{
1208
0
    return true;
1209
0
}
1210
1211
void LzmaDecompressor::close()
1212
0
{
1213
0
}
1214
1215
ErrorOr<NonnullOwnPtr<LzmaCompressor>> LzmaCompressor::create_container(MaybeOwned<Stream> stream, LzmaCompressorOptions const& options)
1216
2.72k
{
1217
5.45k
    auto dictionary = TRY(try_make<SearchableCircularBuffer>(TRY(SearchableCircularBuffer::create_empty(options.dictionary_size + largest_real_match_length))));
1218
1219
    // "The LZMA Decoder uses (1 << (lc + lp)) tables with CProb values, where each table contains 0x300 CProb values."
1220
5.45k
    auto literal_probabilities = TRY(FixedArray<Probability>::create(literal_probability_table_size * (1 << (options.literal_context_bits + options.literal_position_bits))));
1221
1222
2.72k
    auto header = TRY(LzmaHeader::from_compressor_options(options));
1223
2.72k
    TRY(stream->write_value(header));
1224
1225
2.72k
    auto compressor = TRY(adopt_nonnull_own_or_enomem(new (nothrow) LzmaCompressor(move(stream), options, move(dictionary), move(literal_probabilities))));
1226
1227
2.72k
    return compressor;
1228
2.72k
}
1229
1230
LzmaCompressor::LzmaCompressor(MaybeOwned<AK::Stream> stream, Compress::LzmaCompressorOptions options, MaybeOwned<SearchableCircularBuffer> dictionary, FixedArray<Compress::LzmaState::Probability> literal_probabilities)
1231
2.72k
    : LzmaState(move(literal_probabilities))
1232
2.72k
    , m_stream(move(stream))
1233
2.72k
    , m_options(move(options))
1234
2.72k
    , m_dictionary(move(dictionary))
1235
2.72k
{
1236
2.72k
}
1237
1238
ErrorOr<Bytes> LzmaCompressor::read_some(Bytes)
1239
0
{
1240
0
    return Error::from_errno(EBADF);
1241
0
}
1242
1243
ErrorOr<size_t> LzmaCompressor::write_some(ReadonlyBytes bytes)
1244
2.54M
{
1245
    // Fill the input buffer until it's full or until we can't read any more data.
1246
2.54M
    size_t processed_bytes = min(bytes.size(), largest_real_match_length - m_dictionary->used_space());
1247
2.54M
    bytes = bytes.trim(processed_bytes);
1248
1249
5.08M
    while (bytes.size() > 0) {
1250
2.54M
        auto const written_bytes = m_dictionary->write(bytes);
1251
2.54M
        bytes = bytes.slice(written_bytes);
1252
2.54M
    }
1253
1254
2.54M
    VERIFY(m_dictionary->used_space() <= largest_real_match_length);
1255
1256
2.54M
    if (m_options.uncompressed_size.has_value() && m_total_processed_bytes + m_dictionary->used_space() > m_options.uncompressed_size.value())
1257
0
        return Error::from_string_literal("Tried to compress more LZMA data than announced");
1258
1259
5.08M
    TRY(encode_once());
1260
1261
    // If we read enough data to reach the final uncompressed size, flush automatically.
1262
    // Flushing will handle encoding the remaining data for us and finalize the stream.
1263
5.08M
    if (m_options.uncompressed_size.has_value() && m_total_processed_bytes + m_dictionary->used_space() >= m_options.uncompressed_size.value())
1264
0
        TRY(flush());
1265
1266
5.08M
    return processed_bytes;
1267
5.08M
}
1268
1269
ErrorOr<void> LzmaCompressor::flush()
1270
2.72k
{
1271
2.72k
    if (m_has_flushed_data)
1272
0
        return Error::from_string_literal("Flushed an LZMA stream twice");
1273
1274
135k
    while (m_dictionary->used_space() > 0)
1275
132k
        TRY(encode_once());
1276
1277
2.72k
    if (m_options.uncompressed_size.has_value() && m_total_processed_bytes < m_options.uncompressed_size.value())
1278
0
        return Error::from_string_literal("Flushing LZMA data with known but unreached uncompressed size");
1279
1280
    // The LZMA specification technically also allows both a known size and an end-of-stream marker simultaneously,
1281
    // but LZMA2 rejects them, so skip emitting the end-of-stream marker if we know the uncompressed size.
1282
2.72k
    if (!m_options.uncompressed_size.has_value())
1283
2.72k
        TRY(encode_normalized_simple_match(end_of_stream_marker, 0));
1284
1285
    // Shifting the range encoder using the normal operation handles any pending overflows.
1286
5.45k
    TRY(shift_range_encoder());
1287
1288
    // Now, the remaining bytes are the cached byte, the chain of 0xFF, and the upper 3 bytes of the current `code`.
1289
    // Incrementing the values does not have to be considered as no overflows are pending. The fourth byte is the
1290
    // null byte that we just shifted in, which should not be flushed as it would be extraneous junk data.
1291
5.45k
    TRY(m_stream->write_value<u8>(m_range_encoder_cached_byte));
1292
2.73k
    for (size_t i = 0; i < m_range_encoder_ff_chain_length; i++)
1293
10
        TRY(m_stream->write_value<u8>(0xFF));
1294
5.45k
    TRY(m_stream->write_value<u8>(m_range_encoder_code >> 24));
1295
5.45k
    TRY(m_stream->write_value<u8>(m_range_encoder_code >> 16));
1296
2.72k
    TRY(m_stream->write_value<u8>(m_range_encoder_code >> 8));
1297
1298
2.72k
    m_has_flushed_data = true;
1299
2.72k
    return {};
1300
2.72k
}
1301
1302
bool LzmaCompressor::is_eof() const
1303
0
{
1304
0
    return true;
1305
0
}
1306
1307
bool LzmaCompressor::is_open() const
1308
0
{
1309
0
    return !m_has_flushed_data;
1310
0
}
1311
1312
void LzmaCompressor::close()
1313
0
{
1314
0
    if (!m_has_flushed_data) {
1315
        // Note: We need a better API for specifying things like this.
1316
0
        flush().release_value_but_fixme_should_propagate_errors();
1317
0
    }
1318
0
}
1319
1320
LzmaCompressor::~LzmaCompressor()
1321
2.72k
{
1322
2.72k
    if (!m_has_flushed_data) {
1323
        // Note: We need a better API for specifying things like this.
1324
0
        flush().release_value_but_fixme_should_propagate_errors();
1325
0
    }
1326
2.72k
}
1327
1328
}