/src/serenity/Userland/Libraries/LibCompress/Lzma.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2023, Tim Schumacher <timschumi@gmx.de> |
3 | | * |
4 | | * SPDX-License-Identifier: BSD-2-Clause |
5 | | */ |
6 | | |
7 | | #include <AK/Debug.h> |
8 | | #include <AK/IntegralMath.h> |
9 | | #include <LibCompress/Lzma.h> |
10 | | |
11 | | namespace Compress { |
12 | | |
13 | | u32 LzmaHeader::dictionary_size() const |
14 | 4.82k | { |
15 | | // "If the value of dictionary size in properties is smaller than (1 << 12), |
16 | | // the LZMA decoder must set the dictionary size variable to (1 << 12)." |
17 | 4.82k | constexpr u32 minimum_dictionary_size = (1 << 12); |
18 | 4.82k | if (unchecked_dictionary_size < minimum_dictionary_size) |
19 | 479 | return minimum_dictionary_size; |
20 | | |
21 | 4.34k | return unchecked_dictionary_size; |
22 | 4.82k | } |
23 | | |
24 | | Optional<u64> LzmaHeader::uncompressed_size() const |
25 | 3.34k | { |
26 | | // We are making a copy of the packed field here because we would otherwise |
27 | | // pass an unaligned reference to the constructor of Optional, which is |
28 | | // undefined behavior. |
29 | 3.34k | auto uncompressed_size = encoded_uncompressed_size; |
30 | | |
31 | | // "If "Uncompressed size" field contains ones in all 64 bits, it means that |
32 | | // uncompressed size is unknown and there is the "end marker" in stream, |
33 | | // that indicates the end of decoding point." |
34 | 3.34k | if (uncompressed_size == placeholder_for_unknown_uncompressed_size) |
35 | 1.98k | return {}; |
36 | | |
37 | | // "In opposite case, if the value from "Uncompressed size" field is not |
38 | | // equal to ((2^64) - 1), the LZMA stream decoding must be finished after |
39 | | // specified number of bytes (Uncompressed size) is decoded. And if there |
40 | | // is the "end marker", the LZMA decoder must read that marker also." |
41 | 1.35k | return uncompressed_size; |
42 | 3.34k | } |
43 | | |
44 | | ErrorOr<LzmaModelProperties> LzmaHeader::decode_model_properties(u8 input_bits) |
45 | 3.34k | { |
46 | | // "Decodes the following values from the encoded model properties field: |
47 | | // |
48 | | // name Range Description |
49 | | // lc [0, 8] the number of "literal context" bits |
50 | | // lp [0, 4] the number of "literal pos" bits |
51 | | // pb [0, 4] the number of "pos" bits |
52 | | // |
53 | | // Encoded using `((pb * 5 + lp) * 9 + lc)`." |
54 | | |
55 | 3.34k | if (input_bits >= (9 * 5 * 5)) |
56 | 1 | return Error::from_string_literal("Encoded model properties value is larger than the highest possible value"); |
57 | | |
58 | 3.34k | u8 literal_context_bits = input_bits % 9; |
59 | 3.34k | input_bits /= 9; |
60 | 3.34k | VERIFY(literal_context_bits >= 0 && literal_context_bits <= 8); |
61 | | |
62 | 3.34k | u8 literal_position_bits = input_bits % 5; |
63 | 3.34k | input_bits /= 5; |
64 | 3.34k | VERIFY(literal_position_bits >= 0 && literal_position_bits <= 4); |
65 | | |
66 | 3.34k | u8 position_bits = input_bits; |
67 | 3.34k | VERIFY(position_bits >= 0 && position_bits <= 4); |
68 | | |
69 | 3.34k | return LzmaModelProperties { |
70 | 3.34k | .literal_context_bits = literal_context_bits, |
71 | 3.34k | .literal_position_bits = literal_position_bits, |
72 | 3.34k | .position_bits = position_bits, |
73 | 3.34k | }; |
74 | 3.34k | } |
75 | | |
76 | | ErrorOr<u8> LzmaHeader::encode_model_properties(LzmaModelProperties const& model_properties) |
77 | 1.88k | { |
78 | 1.88k | if (model_properties.literal_context_bits > 8) |
79 | 0 | return Error::from_string_literal("LZMA literal context bits are too large to encode"); |
80 | | |
81 | 1.88k | if (model_properties.literal_position_bits > 4) |
82 | 0 | return Error::from_string_literal("LZMA literal position bits are too large to encode"); |
83 | | |
84 | 1.88k | if (model_properties.position_bits > 4) |
85 | 0 | return Error::from_string_literal("LZMA position bits are too large to encode"); |
86 | | |
87 | 1.88k | return (model_properties.position_bits * 5 + model_properties.literal_position_bits) * 9 + model_properties.literal_context_bits; |
88 | 1.88k | } |
89 | | |
90 | | ErrorOr<LzmaDecompressorOptions> LzmaHeader::as_decompressor_options() const |
91 | 3.34k | { |
92 | 3.34k | auto model_properties = TRY(decode_model_properties(encoded_model_properties)); |
93 | | |
94 | 0 | return Compress::LzmaDecompressorOptions { |
95 | 3.34k | .literal_context_bits = model_properties.literal_context_bits, |
96 | 3.34k | .literal_position_bits = model_properties.literal_position_bits, |
97 | 3.34k | .position_bits = model_properties.position_bits, |
98 | 3.34k | .dictionary_size = dictionary_size(), |
99 | 3.34k | .uncompressed_size = uncompressed_size(), |
100 | 3.34k | .reject_end_of_stream_marker = false, |
101 | 3.34k | }; |
102 | 3.34k | } |
103 | | |
104 | | ErrorOr<LzmaHeader> LzmaHeader::from_compressor_options(LzmaCompressorOptions const& options) |
105 | 1.88k | { |
106 | 1.88k | auto encoded_model_properties = TRY(encode_model_properties({ |
107 | 1.88k | .literal_context_bits = options.literal_context_bits, |
108 | 1.88k | .literal_position_bits = options.literal_position_bits, |
109 | 1.88k | .position_bits = options.position_bits, |
110 | 1.88k | })); |
111 | | |
112 | 0 | return LzmaHeader { |
113 | 1.88k | .encoded_model_properties = encoded_model_properties, |
114 | 1.88k | .unchecked_dictionary_size = options.dictionary_size, |
115 | 1.88k | .encoded_uncompressed_size = options.uncompressed_size.value_or(placeholder_for_unknown_uncompressed_size), |
116 | 1.88k | }; |
117 | 1.88k | } |
118 | | |
119 | | void LzmaState::initialize_to_default_probability(Span<Probability> span) |
120 | 460k | { |
121 | 460k | for (auto& entry : span) |
122 | 332M | entry = default_probability; |
123 | 460k | } |
124 | | |
125 | | ErrorOr<NonnullOwnPtr<LzmaDecompressor>> LzmaDecompressor::create_from_container(MaybeOwned<Stream> stream, Optional<MaybeOwned<CircularBuffer>> dictionary) |
126 | 3.35k | { |
127 | 3.35k | auto header = TRY(stream->read_value<LzmaHeader>()); |
128 | | |
129 | 3.34k | return TRY(LzmaDecompressor::create_from_raw_stream(move(stream), TRY(header.as_decompressor_options()), move(dictionary))); |
130 | 3.34k | } |
131 | | |
132 | | ErrorOr<NonnullOwnPtr<LzmaDecompressor>> LzmaDecompressor::create_from_raw_stream(MaybeOwned<Stream> stream, LzmaDecompressorOptions const& options, Optional<MaybeOwned<CircularBuffer>> dictionary) |
133 | 3.34k | { |
134 | 3.34k | if (!dictionary.has_value()) { |
135 | 3.34k | auto new_dictionary = TRY(CircularBuffer::create_empty(options.dictionary_size)); |
136 | 3.34k | dictionary = TRY(try_make<CircularBuffer>(move(new_dictionary))); |
137 | 3.34k | } |
138 | | |
139 | 3.34k | VERIFY((*dictionary)->capacity() >= options.dictionary_size); |
140 | | |
141 | | // "The LZMA Decoder uses (1 << (lc + lp)) tables with CProb values, where each table contains 0x300 CProb values." |
142 | 3.34k | auto literal_probabilities = TRY(FixedArray<Probability>::create(literal_probability_table_size * (1 << (options.literal_context_bits + options.literal_position_bits)))); |
143 | | |
144 | 3.34k | auto decompressor = TRY(adopt_nonnull_own_or_enomem(new (nothrow) LzmaDecompressor(move(stream), options, dictionary.release_value(), move(literal_probabilities)))); |
145 | | |
146 | 3.34k | TRY(decompressor->initialize_range_decoder()); |
147 | | |
148 | 0 | return decompressor; |
149 | 3.34k | } |
150 | | |
151 | | LzmaState::LzmaState(FixedArray<Probability> literal_probabilities) |
152 | 5.23k | : m_literal_probabilities(move(literal_probabilities)) |
153 | 5.23k | { |
154 | 5.23k | initialize_to_default_probability(m_literal_probabilities.span()); |
155 | | |
156 | 5.23k | for (auto& array : m_length_to_position_states) |
157 | 20.9k | initialize_to_default_probability(array); |
158 | | |
159 | 5.23k | for (auto& array : m_binary_tree_distance_probabilities) |
160 | 52.3k | initialize_to_default_probability(array); |
161 | | |
162 | 5.23k | initialize_to_default_probability(m_alignment_bit_probabilities); |
163 | | |
164 | 5.23k | initialize_to_default_probability(m_is_match_probabilities); |
165 | 5.23k | initialize_to_default_probability(m_is_rep_probabilities); |
166 | 5.23k | initialize_to_default_probability(m_is_rep_g0_probabilities); |
167 | 5.23k | initialize_to_default_probability(m_is_rep_g1_probabilities); |
168 | 5.23k | initialize_to_default_probability(m_is_rep_g2_probabilities); |
169 | 5.23k | initialize_to_default_probability(m_is_rep0_long_probabilities); |
170 | 5.23k | } |
171 | | |
172 | | LzmaDecompressor::LzmaDecompressor(MaybeOwned<Stream> stream, LzmaDecompressorOptions options, MaybeOwned<CircularBuffer> dictionary, FixedArray<Probability> literal_probabilities) |
173 | 3.34k | : LzmaState(move(literal_probabilities)) |
174 | 3.34k | , m_stream(move(stream)) |
175 | 3.34k | , m_options(move(options)) |
176 | 3.34k | , m_dictionary(move(dictionary)) |
177 | 3.34k | { |
178 | 3.34k | } |
179 | | |
180 | | bool LzmaDecompressor::is_range_decoder_in_clean_state() const |
181 | 2.04k | { |
182 | 2.04k | return m_range_decoder_code == 0; |
183 | 2.04k | } |
184 | | |
185 | | bool LzmaDecompressor::has_reached_expected_data_size() const |
186 | 49.4M | { |
187 | 49.4M | if (!m_options.uncompressed_size.has_value()) |
188 | 13.5M | return false; |
189 | | |
190 | 35.8M | return m_total_processed_bytes >= m_options.uncompressed_size.value(); |
191 | 49.4M | } |
192 | | |
193 | | ErrorOr<void> LzmaDecompressor::initialize_range_decoder() |
194 | 3.34k | { |
195 | | // "The LZMA Encoder always writes ZERO in initial byte of compressed stream. |
196 | | // That scheme allows to simplify the code of the Range Encoder in the |
197 | | // LZMA Encoder. If initial byte is not equal to ZERO, the LZMA Decoder must |
198 | | // stop decoding and report error." |
199 | 3.34k | { |
200 | 3.34k | auto byte = TRY(m_stream->read_value<u8>()); |
201 | 3.23k | if (byte != 0) |
202 | 13 | return Error::from_string_literal("Initial byte of data stream is not zero"); |
203 | 3.23k | } |
204 | | |
205 | | // Read the initial bytes into the range decoder. |
206 | 3.22k | m_range_decoder_code = 0; |
207 | 16.1k | for (size_t i = 0; i < 4; i++) { |
208 | 12.8k | auto byte = TRY(m_stream->read_value<u8>()); |
209 | 0 | m_range_decoder_code = m_range_decoder_code << 8 | byte; |
210 | 12.8k | } |
211 | | |
212 | 3.22k | m_range_decoder_range = 0xFFFFFFFF; |
213 | | |
214 | 3.22k | return {}; |
215 | 3.22k | } |
216 | | |
217 | | ErrorOr<void> LzmaDecompressor::append_input_stream(MaybeOwned<Stream> stream, Optional<u64> uncompressed_size) |
218 | 0 | { |
219 | 0 | m_stream = move(stream); |
220 | |
|
221 | 0 | TRY(initialize_range_decoder()); |
222 | | |
223 | 0 | if (m_options.uncompressed_size.has_value() != uncompressed_size.has_value()) |
224 | 0 | return Error::from_string_literal("Appending LZMA streams with mismatching uncompressed size status"); |
225 | | |
226 | 0 | if (uncompressed_size.has_value()) |
227 | 0 | *m_options.uncompressed_size += *uncompressed_size; |
228 | |
|
229 | 0 | return {}; |
230 | 0 | } |
231 | | |
232 | | ErrorOr<void> LzmaDecompressor::normalize_range_decoder() |
233 | 282M | { |
234 | | // "The Normalize() function keeps the "Range" value in described range." |
235 | | |
236 | 282M | if (m_range_decoder_range >= minimum_range_value) |
237 | 277M | return {}; |
238 | | |
239 | 5.11M | m_range_decoder_range <<= 8; |
240 | 5.11M | m_range_decoder_code <<= 8; |
241 | | |
242 | 5.11M | m_range_decoder_code |= TRY(m_stream->read_value<u8>()); |
243 | | |
244 | 5.11M | VERIFY(m_range_decoder_range >= minimum_range_value); |
245 | | |
246 | 5.11M | return {}; |
247 | 5.11M | } |
248 | | |
249 | | ErrorOr<void> LzmaCompressor::shift_range_encoder() |
250 | 4.47M | { |
251 | 4.47M | if ((m_range_encoder_code >> 32) == 0x01) { |
252 | | // If there is an overflow, we can finalize the chain we were previously building. |
253 | | // This includes incrementing both the cached byte and all the 0xFF bytes that we generate. |
254 | 1.55M | VERIFY(m_range_encoder_cached_byte != 0xFF); |
255 | 3.10M | TRY(m_stream->write_value<u8>(m_range_encoder_cached_byte + 1)); |
256 | 1.56M | for (size_t i = 0; i < m_range_encoder_ff_chain_length; i++) |
257 | 6.19k | TRY(m_stream->write_value<u8>(0x00)); |
258 | 1.55M | m_range_encoder_ff_chain_length = 0; |
259 | 1.55M | m_range_encoder_cached_byte = (m_range_encoder_code >> 24); |
260 | 2.92M | } else if ((m_range_encoder_code >> 24) == 0xFF) { |
261 | | // If the byte to flush is 0xFF, it can potentially propagate an overflow and needs to be added to the chain. |
262 | 20.2k | m_range_encoder_ff_chain_length++; |
263 | 2.90M | } else { |
264 | | // If the byte to flush isn't 0xFF, any future overflows will not be propagated beyond this point, |
265 | | // so we can be sure that the built chain doesn't change anymore. |
266 | 2.90M | TRY(m_stream->write_value<u8>(m_range_encoder_cached_byte)); |
267 | 2.91M | for (size_t i = 0; i < m_range_encoder_ff_chain_length; i++) |
268 | 14.0k | TRY(m_stream->write_value<u8>(0xFF)); |
269 | 2.90M | m_range_encoder_ff_chain_length = 0; |
270 | 2.90M | m_range_encoder_cached_byte = (m_range_encoder_code >> 24); |
271 | 2.90M | } |
272 | | |
273 | | // In all three cases we now recorded the highest byte in some way, so we can shift it away and shift in a null byte as the lowest byte. |
274 | 4.47M | m_range_encoder_range <<= 8; |
275 | 4.47M | m_range_encoder_code <<= 8; |
276 | | |
277 | | // Since we are working with a 64-bit code, we need to limit it to 32 bits artificially. |
278 | 4.47M | m_range_encoder_code &= 0xFFFFFFFF; |
279 | | |
280 | 4.47M | return {}; |
281 | 4.47M | } |
282 | | |
283 | | ErrorOr<void> LzmaCompressor::normalize_range_encoder() |
284 | 71.8M | { |
285 | 71.8M | u64 const maximum_range_value = m_range_encoder_code + m_range_encoder_range; |
286 | | |
287 | | // Logically, we should only ever build up an overflow that is smaller than or equal to 0x01. |
288 | 71.8M | VERIFY((maximum_range_value >> 32) <= 0x01); |
289 | | |
290 | 71.8M | if (m_range_encoder_range >= minimum_range_value) |
291 | 67.3M | return {}; |
292 | | |
293 | 8.94M | TRY(shift_range_encoder()); |
294 | | |
295 | 4.47M | VERIFY(m_range_encoder_range >= minimum_range_value); |
296 | | |
297 | 4.47M | return {}; |
298 | 8.94M | } |
299 | | |
300 | | ErrorOr<u8> LzmaDecompressor::decode_direct_bit() |
301 | 9.38M | { |
302 | 9.38M | dbgln_if(LZMA_DEBUG, "Decoding direct bit {} with code = {:#x}, range = {:#x}", 1 - ((m_range_decoder_code - (m_range_decoder_range >> 1)) >> 31), m_range_decoder_code, m_range_decoder_range); |
303 | | |
304 | 9.38M | m_range_decoder_range >>= 1; |
305 | 9.38M | m_range_decoder_code -= m_range_decoder_range; |
306 | | |
307 | 9.38M | u32 temp = 0 - (m_range_decoder_code >> 31); |
308 | | |
309 | 9.38M | m_range_decoder_code += m_range_decoder_range & temp; |
310 | | |
311 | 9.38M | if (m_range_decoder_code == m_range_decoder_range) |
312 | 1 | return Error::from_string_literal("Reached an invalid state while decoding LZMA stream"); |
313 | | |
314 | 9.38M | TRY(normalize_range_decoder()); |
315 | | |
316 | 0 | return temp + 1; |
317 | 9.38M | } |
318 | | |
319 | | ErrorOr<void> LzmaCompressor::encode_direct_bit(u8 value) |
320 | 9.35M | { |
321 | 9.35M | dbgln_if(LZMA_DEBUG, "Encoding direct bit {} with code = {:#x}, range = {:#x}", value, m_range_encoder_code, m_range_encoder_range); |
322 | | |
323 | 9.35M | m_range_encoder_range >>= 1; |
324 | | |
325 | 9.35M | if (value != 0) |
326 | 4.59M | m_range_encoder_code += m_range_encoder_range; |
327 | | |
328 | 9.35M | TRY(normalize_range_encoder()); |
329 | | |
330 | 0 | return {}; |
331 | 9.35M | } |
332 | | |
333 | | ErrorOr<u8> LzmaDecompressor::decode_bit_with_probability(Probability& probability) |
334 | 272M | { |
335 | | // "The LZMA decoder provides the pointer to CProb variable that contains |
336 | | // information about estimated probability for symbol 0 and the Range Decoder |
337 | | // updates that CProb variable after decoding." |
338 | | |
339 | 272M | u32 bound = (m_range_decoder_range >> probability_bit_count) * probability; |
340 | | |
341 | 272M | dbgln_if(LZMA_DEBUG, "Decoding bit {} with probability = {:#x}, bound = {:#x}, code = {:#x}, range = {:#x}", m_range_decoder_code < bound ? 0 : 1, probability, bound, m_range_decoder_code, m_range_decoder_range); |
342 | | |
343 | 272M | if (m_range_decoder_code < bound) { |
344 | 111M | probability += ((1 << probability_bit_count) - probability) >> probability_shift_width; |
345 | 111M | m_range_decoder_range = bound; |
346 | 111M | TRY(normalize_range_decoder()); |
347 | 0 | return 0; |
348 | 161M | } else { |
349 | 161M | probability -= probability >> probability_shift_width; |
350 | 161M | m_range_decoder_code -= bound; |
351 | 161M | m_range_decoder_range -= bound; |
352 | 161M | TRY(normalize_range_decoder()); |
353 | 0 | return 1; |
354 | 161M | } |
355 | 272M | } |
356 | | |
357 | | ErrorOr<void> LzmaCompressor::encode_bit_with_probability(Probability& probability, u8 value) |
358 | 62.4M | { |
359 | 62.4M | u32 bound = (m_range_encoder_range >> probability_bit_count) * probability; |
360 | | |
361 | 62.4M | dbgln_if(LZMA_DEBUG, "Encoding bit {} with probability = {:#x}, bound = {:#x}, code = {:#x}, range = {:#x}", value, probability, bound, m_range_encoder_code, m_range_encoder_range); |
362 | | |
363 | 62.4M | if (value == 0) { |
364 | 31.3M | probability += ((1 << probability_bit_count) - probability) >> probability_shift_width; |
365 | 31.3M | m_range_encoder_range = bound; |
366 | 31.3M | } else { |
367 | 31.1M | probability -= probability >> probability_shift_width; |
368 | 31.1M | m_range_encoder_code += bound; |
369 | 31.1M | m_range_encoder_range -= bound; |
370 | 31.1M | } |
371 | | |
372 | 62.4M | TRY(normalize_range_encoder()); |
373 | 0 | return {}; |
374 | 62.4M | } |
375 | | |
376 | | ErrorOr<u16> LzmaDecompressor::decode_symbol_using_bit_tree(size_t bit_count, Span<Probability> probability_tree) |
377 | 14.6M | { |
378 | 14.6M | VERIFY(bit_count <= sizeof(u16) * 8); |
379 | 14.6M | VERIFY(probability_tree.size() >= 1ul << bit_count); |
380 | | |
381 | | // This has been modified from the reference implementation to unlink the result and the tree index, |
382 | | // which should allow for better readability. |
383 | | |
384 | 14.6M | u16 result = 0; |
385 | 14.6M | size_t tree_index = 1; |
386 | | |
387 | 107M | for (size_t i = 0; i < bit_count; i++) { |
388 | 92.9M | u16 next_bit = TRY(decode_bit_with_probability(probability_tree[tree_index])); |
389 | 0 | result = (result << 1) | next_bit; |
390 | 92.9M | tree_index = (tree_index << 1) | next_bit; |
391 | 92.9M | } |
392 | | |
393 | 14.6M | dbgln_if(LZMA_DEBUG, "Decoded value {:#x} with {} bits using bit tree", result, bit_count); |
394 | | |
395 | 14.6M | return result; |
396 | 14.6M | } |
397 | | |
398 | | ErrorOr<void> LzmaCompressor::encode_symbol_using_bit_tree(size_t bit_count, Span<Probability> probability_tree, u16 value) |
399 | 5.95M | { |
400 | 5.95M | VERIFY(bit_count <= sizeof(u16) * 8); |
401 | 5.95M | VERIFY(probability_tree.size() >= 1ul << bit_count); |
402 | 5.95M | VERIFY(value <= (1 << bit_count) - 1); |
403 | | |
404 | 5.95M | auto original_value = value; |
405 | | |
406 | | // Shift value to make the first sent byte the most significant bit. This makes the shifting logic a lot easier to read. |
407 | 5.95M | value <<= sizeof(u16) * 8 - bit_count; |
408 | | |
409 | 5.95M | size_t tree_index = 1; |
410 | | |
411 | 29.4M | for (size_t i = 0; i < bit_count; i++) { |
412 | 23.4M | u8 const next_bit = (value & 0x8000) >> (sizeof(u16) * 8 - 1); |
413 | 23.4M | value <<= 1; |
414 | 23.4M | TRY(encode_bit_with_probability(probability_tree[tree_index], next_bit)); |
415 | 0 | tree_index = (tree_index << 1) | next_bit; |
416 | 23.4M | } |
417 | | |
418 | 5.95M | dbgln_if(LZMA_DEBUG, "Encoded value {:#x} with {} bits using bit tree", original_value, bit_count); |
419 | | |
420 | 5.95M | return {}; |
421 | 5.95M | } |
422 | | |
423 | | ErrorOr<u16> LzmaDecompressor::decode_symbol_using_reverse_bit_tree(size_t bit_count, Span<Probability> probability_tree) |
424 | 1.31M | { |
425 | 1.31M | VERIFY(bit_count <= sizeof(u16) * 8); |
426 | 1.31M | VERIFY(probability_tree.size() >= 1ul << bit_count); |
427 | | |
428 | 1.31M | u16 result = 0; |
429 | 1.31M | size_t tree_index = 1; |
430 | | |
431 | 6.54M | for (size_t i = 0; i < bit_count; i++) { |
432 | 5.23M | u16 next_bit = TRY(decode_bit_with_probability(probability_tree[tree_index])); |
433 | 0 | result |= next_bit << i; |
434 | 5.23M | tree_index = (tree_index << 1) | next_bit; |
435 | 5.23M | } |
436 | | |
437 | 1.31M | dbgln_if(LZMA_DEBUG, "Decoded value {:#x} with {} bits using reverse bit tree", result, bit_count); |
438 | | |
439 | 1.31M | return result; |
440 | 1.31M | } |
441 | | |
442 | | ErrorOr<void> LzmaCompressor::encode_symbol_using_reverse_bit_tree(size_t bit_count, Span<Probability> probability_tree, u16 value) |
443 | 1.30M | { |
444 | 1.30M | VERIFY(bit_count <= sizeof(u16) * 8); |
445 | 1.30M | VERIFY(probability_tree.size() >= 1ul << bit_count); |
446 | 1.30M | VERIFY(value <= (1 << bit_count) - 1); |
447 | | |
448 | 1.30M | auto original_value = value; |
449 | | |
450 | 1.30M | size_t tree_index = 1; |
451 | | |
452 | 6.51M | for (size_t i = 0; i < bit_count; i++) { |
453 | 5.21M | u8 const next_bit = value & 1; |
454 | 5.21M | value >>= 1; |
455 | 5.21M | TRY(encode_bit_with_probability(probability_tree[tree_index], next_bit)); |
456 | 0 | tree_index = (tree_index << 1) | next_bit; |
457 | 5.21M | } |
458 | | |
459 | 1.30M | dbgln_if(LZMA_DEBUG, "Encoded value {:#x} with {} bits using reverse bit tree", original_value, bit_count); |
460 | | |
461 | 1.30M | return {}; |
462 | 1.30M | } |
463 | | |
464 | | ErrorOr<void> LzmaDecompressor::decode_literal_to_output_buffer() |
465 | 10.3M | { |
466 | 10.3M | u8 previous_byte = 0; |
467 | 10.3M | if (m_dictionary->seekback_limit() > 0) { |
468 | 10.3M | auto read_bytes = MUST(m_dictionary->read_with_seekback({ &previous_byte, sizeof(previous_byte) }, 1)); |
469 | 10.3M | VERIFY(read_bytes.size() == sizeof(previous_byte)); |
470 | 10.3M | } |
471 | | |
472 | | // "To select the table for decoding it uses the context that consists of |
473 | | // (lc) high bits from previous literal and (lp) low bits from value that |
474 | | // represents current position in outputStream." |
475 | 10.3M | u16 literal_state_bits_from_position = m_total_processed_bytes & ((1 << m_options.literal_position_bits) - 1); |
476 | 10.3M | u16 literal_state_bits_from_output = previous_byte >> (8 - m_options.literal_context_bits); |
477 | 10.3M | u16 literal_state = literal_state_bits_from_position << m_options.literal_context_bits | literal_state_bits_from_output; |
478 | | |
479 | 10.3M | Span<Probability> selected_probability_table = m_literal_probabilities.span().slice(literal_probability_table_size * literal_state, literal_probability_table_size); |
480 | | |
481 | | // The result is defined as u16 here and initialized to 1, but we will cut off the top bits before queueing them into the output buffer. |
482 | | // The top bit is only used to track how much we have decoded already, and to select the correct probability table. |
483 | 10.3M | u16 result = 1; |
484 | | |
485 | | // "If (State > 7), the Literal Decoder also uses "matchByte" that represents |
486 | | // the byte in OutputStream at position the is the DISTANCE bytes before |
487 | | // current position, where the DISTANCE is the distance in DISTANCE-LENGTH pair |
488 | | // of latest decoded match." |
489 | | // Note: The specification says `(State > 7)`, but the reference implementation does `(State >= 7)`, which is a mismatch. |
490 | | // Testing `(State > 7)` with actual test files yields errors, so the reference implementation appears to be the correct one. |
491 | 10.3M | if (m_state >= 7) { |
492 | 142k | u8 matched_byte = 0; |
493 | 142k | auto read_bytes = TRY(m_dictionary->read_with_seekback({ &matched_byte, sizeof(matched_byte) }, current_repetition_offset())); |
494 | 142k | VERIFY(read_bytes.size() == sizeof(matched_byte)); |
495 | | |
496 | 142k | dbgln_if(LZMA_DEBUG, "Decoding literal using match byte {:#x}", matched_byte); |
497 | | |
498 | 625k | do { |
499 | 625k | u8 match_bit = (matched_byte >> 7) & 1; |
500 | 625k | matched_byte <<= 1; |
501 | | |
502 | 625k | u8 decoded_bit = TRY(decode_bit_with_probability(selected_probability_table[((1 + match_bit) << 8) + result])); |
503 | 0 | result = result << 1 | decoded_bit; |
504 | | |
505 | 625k | if (match_bit != decoded_bit) |
506 | 140k | break; |
507 | 625k | } while (result < 0x100); |
508 | 142k | } |
509 | | |
510 | 92.1M | while (result < 0x100) |
511 | 81.8M | result = (result << 1) | TRY(decode_bit_with_probability(selected_probability_table[result])); |
512 | | |
513 | 10.3M | u8 actual_result = result - 0x100; |
514 | | |
515 | 10.3M | size_t written_bytes = m_dictionary->write({ &actual_result, sizeof(actual_result) }); |
516 | 10.3M | VERIFY(written_bytes == sizeof(actual_result)); |
517 | 10.3M | m_total_processed_bytes += sizeof(actual_result); |
518 | | |
519 | 10.3M | dbgln_if(LZMA_DEBUG, "Decoded literal {:#x} in state {} using literal state {:#x} (previous byte is {:#x})", actual_result, m_state, literal_state, previous_byte); |
520 | | |
521 | 10.3M | return {}; |
522 | 10.3M | } |
523 | | |
524 | | ErrorOr<void> LzmaCompressor::encode_literal(u8 literal) |
525 | 1.40M | { |
526 | | // This function largely mirrors `decode_literal_to_output_buffer`, so specification comments have been omitted. |
527 | | |
528 | 1.40M | TRY(encode_match_type(MatchType::Literal)); |
529 | | |
530 | | // Note: We have already read the next byte from the input buffer, so it's now in the seekback buffer, shifting all seekback offsets by one. |
531 | 0 | u8 previous_byte = 0; |
532 | 1.40M | if (m_dictionary->seekback_limit() - m_dictionary->used_space() > 1) { |
533 | 1.40M | auto read_bytes = MUST(m_dictionary->read_with_seekback({ &previous_byte, sizeof(previous_byte) }, 2 + m_dictionary->used_space())); |
534 | 1.40M | VERIFY(read_bytes.size() == sizeof(previous_byte)); |
535 | 1.40M | } |
536 | 1.40M | u16 const literal_state_bits_from_position = m_total_processed_bytes & ((1 << m_options.literal_position_bits) - 1); |
537 | 1.40M | u16 const literal_state_bits_from_output = previous_byte >> (8 - m_options.literal_context_bits); |
538 | 1.40M | u16 const literal_state = literal_state_bits_from_position << m_options.literal_context_bits | literal_state_bits_from_output; |
539 | | |
540 | 1.40M | Span<Probability> selected_probability_table = m_literal_probabilities.span().slice(literal_probability_table_size * literal_state, literal_probability_table_size); |
541 | | |
542 | 1.40M | auto original_literal = literal; |
543 | 1.40M | u16 result = 1; |
544 | | |
545 | 1.40M | if (m_state >= 7) { |
546 | 132k | u8 matched_byte = 0; |
547 | 132k | auto read_bytes = TRY(m_dictionary->read_with_seekback({ &matched_byte, sizeof(matched_byte) }, current_repetition_offset() + m_dictionary->used_space() + 1)); |
548 | 132k | VERIFY(read_bytes.size() == sizeof(matched_byte)); |
549 | | |
550 | 132k | dbgln_if(LZMA_DEBUG, "Encoding literal using match byte {:#x}", matched_byte); |
551 | | |
552 | 598k | do { |
553 | 598k | u8 const match_bit = (matched_byte >> 7) & 1; |
554 | 598k | matched_byte <<= 1; |
555 | | |
556 | 598k | u8 const encoded_bit = (literal & 0x80) >> 7; |
557 | 598k | literal <<= 1; |
558 | | |
559 | 598k | TRY(encode_bit_with_probability(selected_probability_table[((1 + match_bit) << 8) + result], encoded_bit)); |
560 | 0 | result = result << 1 | encoded_bit; |
561 | | |
562 | 598k | if (match_bit != encoded_bit) |
563 | 131k | break; |
564 | 598k | } while (result < 0x100); |
565 | 132k | } |
566 | | |
567 | 12.0M | while (result < 0x100) { |
568 | 10.6M | u8 const encoded_bit = (literal & 0x80) >> 7; |
569 | 10.6M | literal <<= 1; |
570 | | |
571 | 10.6M | TRY(encode_bit_with_probability(selected_probability_table[result], encoded_bit)); |
572 | | |
573 | 0 | result = (result << 1) | encoded_bit; |
574 | 10.6M | } |
575 | | |
576 | 1.40M | m_total_processed_bytes += sizeof(literal); |
577 | | |
578 | 1.40M | dbgln_if(LZMA_DEBUG, "Encoded literal {:#x} in state {} using literal state {:#x} (previous byte is {:#x})", original_literal, m_state, literal_state, previous_byte); |
579 | | |
580 | 1.40M | update_state_after_literal(); |
581 | | |
582 | 1.40M | return {}; |
583 | 1.40M | } |
584 | | |
585 | | ErrorOr<void> LzmaCompressor::encode_existing_match(size_t real_distance, size_t real_length) |
586 | 3.33M | { |
587 | 3.33M | VERIFY(real_distance >= normalized_to_real_match_distance_offset); |
588 | 3.33M | u32 const normalized_distance = real_distance - normalized_to_real_match_distance_offset; |
589 | | |
590 | 3.33M | VERIFY(real_length >= normalized_to_real_match_length_offset); |
591 | 3.33M | u16 const normalized_length = real_length - normalized_to_real_match_length_offset; |
592 | | |
593 | 3.33M | if (normalized_distance == m_rep0) { |
594 | 3.13M | TRY(encode_match_type(MatchType::RepMatch0)); |
595 | 3.13M | } else if (normalized_distance == m_rep1) { |
596 | 79.8k | TRY(encode_match_type(MatchType::RepMatch1)); |
597 | | |
598 | 0 | u32 const distance = m_rep1; |
599 | 79.8k | m_rep1 = m_rep0; |
600 | 79.8k | m_rep0 = distance; |
601 | 122k | } else if (normalized_distance == m_rep2) { |
602 | 60.3k | TRY(encode_match_type(MatchType::RepMatch2)); |
603 | | |
604 | 0 | u32 const distance = m_rep2; |
605 | 60.3k | m_rep2 = m_rep1; |
606 | 60.3k | m_rep1 = m_rep0; |
607 | 60.3k | m_rep0 = distance; |
608 | 62.0k | } else if (normalized_distance == m_rep3) { |
609 | 62.0k | TRY(encode_match_type(MatchType::RepMatch3)); |
610 | | |
611 | 0 | u32 const distance = m_rep3; |
612 | 62.0k | m_rep3 = m_rep2; |
613 | 62.0k | m_rep2 = m_rep1; |
614 | 62.0k | m_rep1 = m_rep0; |
615 | 62.0k | m_rep0 = distance; |
616 | 62.0k | } else { |
617 | 0 | VERIFY_NOT_REACHED(); |
618 | 0 | } |
619 | | |
620 | 6.66M | TRY(encode_normalized_match_length(m_rep_length_coder, normalized_length)); |
621 | 0 | update_state_after_rep(); |
622 | 6.66M | MUST(m_dictionary->discard(real_length)); |
623 | 0 | m_total_processed_bytes += real_length; |
624 | | |
625 | 3.33M | return {}; |
626 | 6.66M | } |
627 | | |
628 | | ErrorOr<void> LzmaCompressor::encode_new_match(size_t real_distance, size_t real_length) |
629 | 1.30M | { |
630 | 1.30M | VERIFY(real_distance >= normalized_to_real_match_distance_offset); |
631 | 1.30M | u32 const normalized_distance = real_distance - normalized_to_real_match_distance_offset; |
632 | | |
633 | 1.30M | VERIFY(real_length >= normalized_to_real_match_length_offset); |
634 | 1.30M | u16 const normalized_length = real_length - normalized_to_real_match_length_offset; |
635 | | |
636 | 1.30M | TRY(encode_normalized_simple_match(normalized_distance, normalized_length)); |
637 | | |
638 | 1.30M | MUST(m_dictionary->discard(real_length)); |
639 | 0 | m_total_processed_bytes += real_length; |
640 | | |
641 | 1.30M | return {}; |
642 | 1.30M | } |
643 | | |
644 | | ErrorOr<void> LzmaCompressor::encode_normalized_simple_match(u32 normalized_distance, u16 normalized_length) |
645 | 1.31M | { |
646 | 1.31M | TRY(encode_match_type(MatchType::SimpleMatch)); |
647 | | |
648 | 0 | m_rep3 = m_rep2; |
649 | 1.31M | m_rep2 = m_rep1; |
650 | 1.31M | m_rep1 = m_rep0; |
651 | | |
652 | 1.31M | TRY(encode_normalized_match_length(m_length_coder, normalized_length)); |
653 | | |
654 | 0 | update_state_after_match(); |
655 | | |
656 | 1.31M | TRY(encode_normalized_match_distance(normalized_length, normalized_distance)); |
657 | 0 | m_rep0 = normalized_distance; |
658 | | |
659 | 1.31M | return {}; |
660 | 1.31M | } |
661 | | |
662 | | LzmaState::LzmaLengthCoderState::LzmaLengthCoderState() |
663 | 10.4k | { |
664 | 10.4k | for (auto& array : m_low_length_probabilities) |
665 | 167k | initialize_to_default_probability(array); |
666 | | |
667 | 10.4k | for (auto& array : m_medium_length_probabilities) |
668 | 167k | initialize_to_default_probability(array); |
669 | | |
670 | 10.4k | initialize_to_default_probability(m_high_length_probabilities); |
671 | 10.4k | } |
672 | | |
673 | | ErrorOr<u16> LzmaDecompressor::decode_normalized_match_length(LzmaLengthCoderState& length_decoder_state) |
674 | 13.3M | { |
675 | | // "LZMA uses "posState" value as context to select the binary tree |
676 | | // from LowCoder and MidCoder binary tree arrays:" |
677 | 13.3M | u16 position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1); |
678 | | |
679 | | // "The following scheme is used for the match length encoding: |
680 | | // |
681 | | // Binary encoding Binary Tree structure Zero-based match length |
682 | | // sequence (binary + decimal): |
683 | | // |
684 | | // 0 xxx LowCoder[posState] xxx |
685 | 26.6M | if (TRY(decode_bit_with_probability(length_decoder_state.m_first_choice_probability)) == 0) |
686 | 13.3M | return TRY(decode_symbol_using_bit_tree(3, length_decoder_state.m_low_length_probabilities[position_state].span())); |
687 | | |
688 | | // 1 0 yyy MidCoder[posState] yyy + 8 |
689 | 18.1M | if (TRY(decode_bit_with_probability(length_decoder_state.m_second_choice_probability)) == 0) |
690 | 83.3k | return TRY(decode_symbol_using_bit_tree(3, length_decoder_state.m_medium_length_probabilities[position_state].span())) + 8; |
691 | | |
692 | | // 1 1 zzzzzzzz HighCoder zzzzzzzz + 16" |
693 | 9.00M | return TRY(decode_symbol_using_bit_tree(8, length_decoder_state.m_high_length_probabilities.span())) + 16; |
694 | 9.00M | } |
695 | | |
696 | | ErrorOr<void> LzmaCompressor::encode_normalized_match_length(LzmaLengthCoderState& length_coder_state, u16 normalized_length) |
697 | 4.64M | { |
698 | 4.64M | u16 const position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1); |
699 | | |
700 | 4.64M | if (normalized_length < 8) { |
701 | 4.23M | TRY(encode_bit_with_probability(length_coder_state.m_first_choice_probability, 0)); |
702 | 4.23M | TRY(encode_symbol_using_bit_tree(3, length_coder_state.m_low_length_probabilities[position_state].span(), normalized_length)); |
703 | 0 | return {}; |
704 | 4.23M | } |
705 | | |
706 | 818k | TRY(encode_bit_with_probability(length_coder_state.m_first_choice_probability, 1)); |
707 | | |
708 | 409k | if (normalized_length < 16) { |
709 | 79.1k | TRY(encode_bit_with_probability(length_coder_state.m_second_choice_probability, 0)); |
710 | 79.1k | TRY(encode_symbol_using_bit_tree(3, length_coder_state.m_medium_length_probabilities[position_state].span(), normalized_length - 8)); |
711 | 0 | return {}; |
712 | 79.1k | } |
713 | | |
714 | 659k | TRY(encode_bit_with_probability(length_coder_state.m_second_choice_probability, 1)); |
715 | 329k | TRY(encode_symbol_using_bit_tree(8, length_coder_state.m_high_length_probabilities.span(), normalized_length - 16)); |
716 | 0 | return {}; |
717 | 659k | } |
718 | | |
719 | | ErrorOr<u32> LzmaDecompressor::decode_normalized_match_distance(u16 normalized_match_length) |
720 | 1.31M | { |
721 | | // "LZMA uses normalized match length (zero-based length) |
722 | | // to calculate the context state "lenState" do decode the distance value." |
723 | 1.31M | u16 length_state = min(normalized_match_length, number_of_length_to_position_states - 1); |
724 | | |
725 | | // "At first stage the distance decoder decodes 6-bit "posSlot" value with bit |
726 | | // tree decoder from PosSlotDecoder array." |
727 | 1.31M | u16 position_slot = TRY(decode_symbol_using_bit_tree(6, m_length_to_position_states[length_state].span())); |
728 | | |
729 | | // "The encoding scheme for distance value is shown in the following table: |
730 | | // |
731 | | // posSlot (decimal) / |
732 | | // zero-based distance (binary) |
733 | | // 0 0 |
734 | | // 1 1 |
735 | | // 2 10 |
736 | | // 3 11 |
737 | | // |
738 | | // 4 10 x |
739 | | // 5 11 x |
740 | | // 6 10 xx |
741 | | // 7 11 xx |
742 | | // 8 10 xxx |
743 | | // 9 11 xxx |
744 | | // 10 10 xxxx |
745 | | // 11 11 xxxx |
746 | | // 12 10 xxxxx |
747 | | // 13 11 xxxxx |
748 | | // |
749 | | // 14 10 yy zzzz |
750 | | // 15 11 yy zzzz |
751 | | // 16 10 yyy zzzz |
752 | | // 17 11 yyy zzzz |
753 | | // ... |
754 | | // 62 10 yyyyyyyyyyyyyyyyyyyyyyyyyy zzzz |
755 | | // 63 11 yyyyyyyyyyyyyyyyyyyyyyyyyy zzzz |
756 | | // |
757 | | // where |
758 | | // "x ... x" means the sequence of binary symbols encoded with binary tree and |
759 | | // "Reverse" scheme. It uses separated binary tree for each posSlot from 4 to 13. |
760 | | // "y" means direct bit encoded with range coder. |
761 | | // "zzzz" means the sequence of four binary symbols encoded with binary |
762 | | // tree with "Reverse" scheme, where one common binary tree "AlignDecoder" |
763 | | // is used for all posSlot values." |
764 | | |
765 | | // "If (posSlot < 4), the "dist" value is equal to posSlot value." |
766 | 1.31M | if (position_slot < first_position_slot_with_binary_tree_bits) |
767 | 6.14k | return position_slot; |
768 | | |
769 | | // From here on, the first bit of the distance is always set and the second bit is set if the last bit of the position slot is set. |
770 | 1.31M | u32 distance_prefix = ((1 << 1) | ((position_slot & 1) << 0)); |
771 | | |
772 | | // "If (posSlot >= 4), the decoder uses "posSlot" value to calculate the value of |
773 | | // the high bits of "dist" value and the number of the low bits. |
774 | | // If (4 <= posSlot < kEndPosModelIndex), the decoder uses bit tree decoders. |
775 | | // (one separated bit tree decoder per one posSlot value) and "Reverse" scheme." |
776 | 1.31M | if (position_slot < first_position_slot_with_direct_encoded_bits) { |
777 | 52.6k | size_t number_of_bits_to_decode = (position_slot / 2) - 1; |
778 | 52.6k | auto& selected_probability_tree = m_binary_tree_distance_probabilities[position_slot - first_position_slot_with_binary_tree_bits]; |
779 | 52.6k | return (distance_prefix << number_of_bits_to_decode) | TRY(decode_symbol_using_reverse_bit_tree(number_of_bits_to_decode, selected_probability_tree)); |
780 | 52.6k | } |
781 | | |
782 | | // " if (posSlot >= kEndPosModelIndex), the middle bits are decoded as direct |
783 | | // bits from RangeDecoder and the low 4 bits are decoded with a bit tree |
784 | | // decoder "AlignDecoder" with "Reverse" scheme." |
785 | 1.26M | size_t number_of_direct_bits_to_decode = ((position_slot - first_position_slot_with_direct_encoded_bits) / 2) + 2; |
786 | 10.6M | for (size_t i = 0; i < number_of_direct_bits_to_decode; i++) { |
787 | 9.38M | distance_prefix = (distance_prefix << 1) | TRY(decode_direct_bit()); |
788 | 9.38M | } |
789 | 1.26M | return (distance_prefix << number_of_alignment_bits) | TRY(decode_symbol_using_reverse_bit_tree(number_of_alignment_bits, m_alignment_bit_probabilities)); |
790 | 1.26M | } |
791 | | |
792 | | ErrorOr<void> LzmaCompressor::encode_normalized_match_distance(u16 normalized_match_length, u32 normalized_match_distance) |
793 | 1.31M | { |
794 | 1.31M | u16 const length_state = min(normalized_match_length, number_of_length_to_position_states - 1); |
795 | | |
796 | 1.31M | if (normalized_match_distance < first_position_slot_with_binary_tree_bits) { |
797 | | // The normalized distance gets encoded as the position slot. |
798 | 4.61k | TRY(encode_symbol_using_bit_tree(6, m_length_to_position_states[length_state].span(), normalized_match_distance)); |
799 | 0 | return {}; |
800 | 4.61k | } |
801 | | |
802 | | // Note: This has been deduced, there is no immediate relation to the decoding function. |
803 | 1.30M | u16 const distance_log2 = AK::log2(normalized_match_distance); |
804 | 1.30M | u16 number_of_distance_bits = count_required_bits(normalized_match_distance); |
805 | 1.30M | u16 const position_slot = (distance_log2 << 1) + ((normalized_match_distance >> (distance_log2 - 1)) & 1); |
806 | | |
807 | 1.30M | TRY(encode_symbol_using_bit_tree(6, m_length_to_position_states[length_state].span(), position_slot)); |
808 | | |
809 | | // Mask off the top two bits of the value, those are already encoded by the position slot. |
810 | 0 | normalized_match_distance &= (1 << (number_of_distance_bits - 2)) - 1; |
811 | 1.30M | number_of_distance_bits -= 2; |
812 | | |
813 | 1.30M | if (position_slot < first_position_slot_with_direct_encoded_bits) { |
814 | | // The value gets encoded using only a reverse bit tree coder. |
815 | 50.1k | auto& selected_probability_tree = m_binary_tree_distance_probabilities[position_slot - first_position_slot_with_binary_tree_bits]; |
816 | 50.1k | TRY(encode_symbol_using_reverse_bit_tree(number_of_distance_bits, selected_probability_tree, normalized_match_distance)); |
817 | 0 | return {}; |
818 | 50.1k | } |
819 | | |
820 | | // The value is split into direct bits (everything except the last four bits) and alignment bits (last four bits). |
821 | 1.25M | auto direct_bits = normalized_match_distance & ~((1 << number_of_alignment_bits) - 1); |
822 | 1.25M | auto const alignment_bits = normalized_match_distance & ((1 << number_of_alignment_bits) - 1); |
823 | | |
824 | | // Shift to-be-written direct bits to the most significant position for easier access. |
825 | 1.25M | direct_bits <<= sizeof(direct_bits) * 8 - number_of_distance_bits; |
826 | | |
827 | 10.6M | for (auto i = 0u; i < number_of_distance_bits - number_of_alignment_bits; i++) { |
828 | 9.35M | TRY(encode_direct_bit((direct_bits & 0x80000000) ? 1 : 0)); |
829 | 0 | direct_bits <<= 1; |
830 | 9.35M | } |
831 | | |
832 | 1.25M | TRY(encode_symbol_using_reverse_bit_tree(number_of_alignment_bits, m_alignment_bit_probabilities, alignment_bits)); |
833 | | |
834 | 0 | return {}; |
835 | 1.25M | } |
836 | | |
837 | | u32 LzmaState::current_repetition_offset() const |
838 | 15.1M | { |
839 | | // LZMA never needs to read at offset 0 (i.e. the actual read head of the buffer). |
840 | | // Instead, the values are remapped so that the rep-value n starts reading n + 1 bytes back. |
841 | | // The special rep-value 0xFFFFFFFF is reserved for marking the end of the stream, |
842 | | // so this should never overflow. |
843 | 15.1M | VERIFY(m_rep0 <= NumericLimits<u32>::max() - normalized_to_real_match_distance_offset); |
844 | 15.1M | return m_rep0 + normalized_to_real_match_distance_offset; |
845 | 15.1M | } |
846 | | |
847 | | void LzmaState::update_state_after_literal() |
848 | 11.7M | { |
849 | 11.7M | if (m_state < 4) |
850 | 11.3M | m_state = 0; |
851 | 369k | else if (m_state < 10) |
852 | 264k | m_state -= 3; |
853 | 104k | else |
854 | 104k | m_state -= 6; |
855 | 11.7M | } |
856 | | |
857 | | void LzmaState::update_state_after_match() |
858 | 2.62M | { |
859 | 2.62M | if (m_state < 7) |
860 | 136k | m_state = 7; |
861 | 2.49M | else |
862 | 2.49M | m_state = 10; |
863 | 2.62M | } |
864 | | |
865 | | void LzmaState::update_state_after_rep() |
866 | 15.3M | { |
867 | 15.3M | if (m_state < 7) |
868 | 141k | m_state = 8; |
869 | 15.2M | else |
870 | 15.2M | m_state = 11; |
871 | 15.3M | } |
872 | | |
873 | | void LzmaState::update_state_after_short_rep() |
874 | 3.49k | { |
875 | 3.49k | if (m_state < 7) |
876 | 1.91k | m_state = 9; |
877 | 1.58k | else |
878 | 1.58k | m_state = 11; |
879 | 3.49k | } |
880 | | |
881 | | ErrorOr<LzmaDecompressor::MatchType> LzmaDecompressor::decode_match_type() |
882 | 23.6M | { |
883 | | // "The decoder calculates "state2" variable value to select exact variable from |
884 | | // "IsMatch" and "IsRep0Long" arrays." |
885 | 23.6M | u16 position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1); |
886 | 23.6M | u16 state2 = (m_state << maximum_number_of_position_bits) + position_state; |
887 | | |
888 | | // "The decoder uses the following code flow scheme to select exact |
889 | | // type of LITERAL or MATCH: |
890 | | // |
891 | | // IsMatch[state2] decode |
892 | | // 0 - the Literal" |
893 | 47.2M | if (TRY(decode_bit_with_probability(m_is_match_probabilities[state2])) == 0) { |
894 | 10.3M | dbgln_if(LZMA_DEBUG, "Decoded match type 'Literal'"); |
895 | 10.3M | return MatchType::Literal; |
896 | 10.3M | } |
897 | | |
898 | | // " 1 - the Match |
899 | | // IsRep[state] decode |
900 | | // 0 - Simple Match" |
901 | 26.6M | if (TRY(decode_bit_with_probability(m_is_rep_probabilities[m_state])) == 0) { |
902 | 1.31M | dbgln_if(LZMA_DEBUG, "Decoded match type 'SimpleMatch'"); |
903 | 1.31M | return MatchType::SimpleMatch; |
904 | 1.31M | } |
905 | | |
906 | | // " 1 - Rep Match |
907 | | // IsRepG0[state] decode |
908 | | // 0 - the distance is rep0" |
909 | 24.0M | if (TRY(decode_bit_with_probability(m_is_rep_g0_probabilities[m_state])) == 0) { |
910 | | // " IsRep0Long[state2] decode |
911 | | // 0 - Short Rep Match" |
912 | 6.27M | if (TRY(decode_bit_with_probability(m_is_rep0_long_probabilities[state2])) == 0) { |
913 | 3.49k | dbgln_if(LZMA_DEBUG, "Decoded match type 'ShortRepMatch'"); |
914 | 3.49k | return MatchType::ShortRepMatch; |
915 | 3.49k | } |
916 | | |
917 | | // " 1 - Rep Match 0" |
918 | 3.13M | dbgln_if(LZMA_DEBUG, "Decoded match type 'RepMatch0'"); |
919 | 3.13M | return MatchType::RepMatch0; |
920 | 3.13M | } |
921 | | |
922 | | // " 1 - |
923 | | // IsRepG1[state] decode |
924 | | // 0 - Rep Match 1" |
925 | 17.7M | if (TRY(decode_bit_with_probability(m_is_rep_g1_probabilities[m_state])) == 0) { |
926 | 82.5k | dbgln_if(LZMA_DEBUG, "Decoded match type 'RepMatch1'"); |
927 | 82.5k | return MatchType::RepMatch1; |
928 | 82.5k | } |
929 | | |
930 | | // " 1 - |
931 | | // IsRepG2[state] decode |
932 | | // 0 - Rep Match 2" |
933 | 17.5M | if (TRY(decode_bit_with_probability(m_is_rep_g2_probabilities[m_state])) == 0) { |
934 | 62.1k | dbgln_if(LZMA_DEBUG, "Decoded match type 'RepMatch2'"); |
935 | 62.1k | return MatchType::RepMatch2; |
936 | 62.1k | } |
937 | | |
938 | | // " 1 - Rep Match 3" |
939 | 8.73M | dbgln_if(LZMA_DEBUG, "Decoded match type 'RepMatch3'"); |
940 | 8.73M | return MatchType::RepMatch3; |
941 | 8.79M | } |
942 | | |
943 | | ErrorOr<void> LzmaCompressor::encode_match_type(MatchType match_type) |
944 | 6.05M | { |
945 | 6.05M | u16 position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1); |
946 | 6.05M | u16 state2 = (m_state << maximum_number_of_position_bits) + position_state; |
947 | | |
948 | 6.05M | if (match_type == MatchType::Literal) { |
949 | 1.40M | TRY(encode_bit_with_probability(m_is_match_probabilities[state2], 0)); |
950 | 1.40M | dbgln_if(LZMA_DEBUG, "Encoded match type 'Literal'"); |
951 | 1.40M | return {}; |
952 | 1.40M | } |
953 | 9.29M | TRY(encode_bit_with_probability(m_is_match_probabilities[state2], 1)); |
954 | | |
955 | 4.64M | if (match_type == MatchType::SimpleMatch) { |
956 | 1.31M | TRY(encode_bit_with_probability(m_is_rep_probabilities[m_state], 0)); |
957 | 1.31M | dbgln_if(LZMA_DEBUG, "Encoded match type 'SimpleMatch'"); |
958 | 1.31M | return {}; |
959 | 1.31M | } |
960 | 6.66M | TRY(encode_bit_with_probability(m_is_rep_probabilities[m_state], 1)); |
961 | | |
962 | 3.33M | if (match_type == MatchType::ShortRepMatch || match_type == MatchType::RepMatch0) { |
963 | 3.13M | TRY(encode_bit_with_probability(m_is_rep_g0_probabilities[m_state], 0)); |
964 | 3.13M | TRY(encode_bit_with_probability(m_is_rep0_long_probabilities[state2], match_type == MatchType::RepMatch0)); |
965 | | if constexpr (LZMA_DEBUG) { |
966 | | if (match_type == RepMatch0) |
967 | | dbgln("Encoded match type 'RepMatch0'"); |
968 | | else |
969 | | dbgln("Encoded match type 'ShortRepMatch'"); |
970 | | } |
971 | 3.13M | return {}; |
972 | 3.13M | } |
973 | 404k | TRY(encode_bit_with_probability(m_is_rep_g0_probabilities[m_state], 1)); |
974 | | |
975 | 202k | if (match_type == MatchType::RepMatch1) { |
976 | 79.8k | TRY(encode_bit_with_probability(m_is_rep_g1_probabilities[m_state], 0)); |
977 | 79.8k | dbgln_if(LZMA_DEBUG, "Encoded match type 'RepMatch1'"); |
978 | 79.8k | return {}; |
979 | 79.8k | } |
980 | 244k | TRY(encode_bit_with_probability(m_is_rep_g1_probabilities[m_state], 1)); |
981 | | |
982 | 122k | if (match_type == MatchType::RepMatch2) { |
983 | 60.3k | TRY(encode_bit_with_probability(m_is_rep_g2_probabilities[m_state], 0)); |
984 | 60.3k | dbgln_if(LZMA_DEBUG, "Encoded match type 'RepMatch2'"); |
985 | 60.3k | return {}; |
986 | 60.3k | } |
987 | 62.0k | TRY(encode_bit_with_probability(m_is_rep_g2_probabilities[m_state], 1)); |
988 | 62.0k | dbgln_if(LZMA_DEBUG, "Encoded match type 'RepMatch3'"); |
989 | 62.0k | return {}; |
990 | 62.0k | } |
991 | | |
992 | | ErrorOr<void> LzmaCompressor::encode_once() |
993 | 6.05M | { |
994 | | // Check if any of our existing match distances are currently usable. |
995 | 6.05M | Vector<size_t> const existing_distances { |
996 | 6.05M | m_rep0 + normalized_to_real_match_distance_offset, |
997 | 6.05M | m_rep1 + normalized_to_real_match_distance_offset, |
998 | 6.05M | m_rep2 + normalized_to_real_match_distance_offset, |
999 | 6.05M | m_rep3 + normalized_to_real_match_distance_offset, |
1000 | 6.05M | }; |
1001 | 6.05M | auto existing_distance_result = m_dictionary->find_copy_in_seekback(existing_distances, m_dictionary->used_space(), normalized_to_real_match_length_offset); |
1002 | | |
1003 | 6.05M | if (existing_distance_result.has_value()) { |
1004 | 3.33M | auto selected_match = existing_distance_result.release_value(); |
1005 | 3.33M | TRY(encode_existing_match(selected_match.distance, selected_match.length)); |
1006 | 0 | return {}; |
1007 | 3.33M | } |
1008 | | |
1009 | | // If we weren't able to find any viable existing offsets, we now have to search the rest of the dictionary for possible new offsets. |
1010 | 2.71M | auto new_distance_result = m_dictionary->find_copy_in_seekback(m_dictionary->used_space(), normalized_to_real_match_length_offset); |
1011 | | |
1012 | 2.71M | if (new_distance_result.has_value()) { |
1013 | 1.30M | auto selected_match = new_distance_result.release_value(); |
1014 | 1.30M | TRY(encode_new_match(selected_match.distance, selected_match.length)); |
1015 | 0 | return {}; |
1016 | 1.30M | } |
1017 | | |
1018 | | // If we weren't able to find any matches, we don't have any other choice than to encode the next byte as a literal. |
1019 | 1.40M | u8 next_byte { 0 }; |
1020 | 1.40M | TRY(m_dictionary->read({ &next_byte, sizeof(next_byte) })); |
1021 | 1.40M | TRY(encode_literal(next_byte)); |
1022 | 0 | return {}; |
1023 | 1.40M | } |
1024 | | |
1025 | | ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes) |
1026 | 606k | { |
1027 | 24.4M | while (m_dictionary->used_space() < bytes.size() && m_dictionary->empty_space() != 0) { |
1028 | 23.8M | if (m_found_end_of_stream_marker) |
1029 | 2.02k | break; |
1030 | | |
1031 | 23.8M | if (has_reached_expected_data_size()) { |
1032 | | // If the decoder is in a clean state, we assume that this is fine. |
1033 | 80 | if (is_range_decoder_in_clean_state()) |
1034 | 33 | break; |
1035 | | |
1036 | | // Otherwise, we give it one last try to find the end marker in the remaining data. |
1037 | 80 | } |
1038 | | |
1039 | 23.8M | auto copy_match_to_buffer = [&](u16 real_length) -> ErrorOr<void> { |
1040 | 13.5M | VERIFY(!m_leftover_match_length.has_value()); |
1041 | | |
1042 | 13.5M | if (m_options.uncompressed_size.has_value() && m_options.uncompressed_size.value() < m_total_processed_bytes + real_length) |
1043 | 4 | return Error::from_string_literal("Tried to copy match beyond expected uncompressed file size"); |
1044 | | |
1045 | 13.5M | auto copied_length = TRY(m_dictionary->copy_from_seekback(current_repetition_offset(), real_length)); |
1046 | | |
1047 | 0 | m_total_processed_bytes += copied_length; |
1048 | 13.5M | real_length -= copied_length; |
1049 | | |
1050 | 13.5M | if (real_length > 0) |
1051 | 230k | m_leftover_match_length = real_length; |
1052 | | |
1053 | 13.5M | return {}; |
1054 | 13.5M | }; |
1055 | | |
1056 | | // If we have a leftover part of a repeating match, we should finish that first. |
1057 | 23.8M | if (m_leftover_match_length.has_value()) { |
1058 | 230k | TRY(copy_match_to_buffer(m_leftover_match_length.release_value())); |
1059 | 0 | continue; |
1060 | 230k | } |
1061 | | |
1062 | 23.6M | auto const match_type = TRY(decode_match_type()); |
1063 | | |
1064 | | // If we are looking for EOS, but find another match type, the stream is also corrupted. |
1065 | 23.6M | if (has_reached_expected_data_size() && match_type != MatchType::SimpleMatch) |
1066 | 37 | return Error::from_string_literal("First match type after the expected uncompressed size is not a simple match"); |
1067 | | |
1068 | 23.6M | if (match_type == MatchType::Literal) { |
1069 | | // "At first the LZMA decoder must check that it doesn't exceed |
1070 | | // specified uncompressed size." |
1071 | | // This is already checked for at the beginning of the loop. |
1072 | | |
1073 | | // "Then it decodes literal value and puts it to sliding window." |
1074 | 10.3M | TRY(decode_literal_to_output_buffer()); |
1075 | | |
1076 | | // "Then the decoder must update the "state" value." |
1077 | 0 | update_state_after_literal(); |
1078 | 10.3M | continue; |
1079 | 10.3M | } |
1080 | | |
1081 | 13.3M | if (match_type == MatchType::SimpleMatch) { |
1082 | | // "The distance history table is updated with the following scheme:" |
1083 | 1.31M | m_rep3 = m_rep2; |
1084 | 1.31M | m_rep2 = m_rep1; |
1085 | 1.31M | m_rep1 = m_rep0; |
1086 | | |
1087 | | // "The zero-based length is decoded with "LenDecoder"." |
1088 | 1.31M | u16 normalized_length = TRY(decode_normalized_match_length(m_length_coder)); |
1089 | | |
1090 | | // "The state is update with UpdateState_Match function." |
1091 | 0 | update_state_after_match(); |
1092 | | |
1093 | | // "and the new "rep0" value is decoded with DecodeDistance." |
1094 | 1.31M | m_rep0 = TRY(decode_normalized_match_distance(normalized_length)); |
1095 | | |
1096 | | // "If the value of "rep0" is equal to 0xFFFFFFFF, it means that we have |
1097 | | // "End of stream" marker, so we can stop decoding and check finishing |
1098 | | // condition in Range Decoder" |
1099 | 1.31M | if (m_rep0 == end_of_stream_marker) { |
1100 | | // If we should reject end-of-stream markers, do so now. |
1101 | | // Note that this is not part of LZMA, as LZMA allows end-of-stream markers in all contexts, so pure LZMA should never set this option. |
1102 | 2.02k | if (m_options.reject_end_of_stream_marker) |
1103 | 0 | return Error::from_string_literal("An end-of-stream marker was found, but the LZMA stream is configured to reject them"); |
1104 | | |
1105 | | // The range decoder condition is checked after breaking out of the loop. |
1106 | 2.02k | m_found_end_of_stream_marker = true; |
1107 | 2.02k | continue; |
1108 | 2.02k | } |
1109 | | |
1110 | | // If we are looking for EOS, but haven't found it here, the stream is corrupted. |
1111 | 1.31M | if (has_reached_expected_data_size()) |
1112 | 1 | return Error::from_string_literal("First simple match after the expected uncompressed size is not the EOS marker"); |
1113 | | |
1114 | | // "If uncompressed size is defined, LZMA decoder must check that it doesn't |
1115 | | // exceed that specified uncompressed size." |
1116 | | // This is being checked for in the common "copy to buffer" implementation. |
1117 | | |
1118 | | // "Also the decoder must check that "rep0" value is not larger than dictionary size |
1119 | | // and is not larger than the number of already decoded bytes." |
1120 | 1.31M | if (current_repetition_offset() > m_dictionary->seekback_limit()) |
1121 | 180 | return Error::from_string_literal("rep0 value is larger than the possible lookback size"); |
1122 | | |
1123 | | // "Then the decoder must copy match bytes as described in |
1124 | | // "The match symbols copying" section." |
1125 | 1.31M | TRY(copy_match_to_buffer(normalized_length + normalized_to_real_match_length_offset)); |
1126 | | |
1127 | 0 | continue; |
1128 | 1.31M | } |
1129 | | |
1130 | 12.0M | if (match_type == MatchType::ShortRepMatch) { |
1131 | | // "LZMA doesn't update the distance history." |
1132 | | |
1133 | | // "If the subtype is "Short Rep Match", the decoder updates the state, puts |
1134 | | // the one byte from window to current position in window and goes to next |
1135 | | // MATCH/LITERAL symbol." |
1136 | 3.49k | update_state_after_short_rep(); |
1137 | | |
1138 | 3.49k | TRY(copy_match_to_buffer(1)); |
1139 | | |
1140 | 0 | continue; |
1141 | 3.49k | } |
1142 | | |
1143 | | // Note: We don't need to do anything specific for "Rep Match 0", we just need to make sure to not |
1144 | | // run the detection for other match types and to not switch around the distance history. |
1145 | | |
1146 | 12.0M | if (match_type == MatchType::RepMatch1) { |
1147 | 82.5k | u32 distance = m_rep1; |
1148 | 82.5k | m_rep1 = m_rep0; |
1149 | 82.5k | m_rep0 = distance; |
1150 | 82.5k | } |
1151 | | |
1152 | 12.0M | if (match_type == MatchType::RepMatch2) { |
1153 | 62.1k | u32 distance = m_rep2; |
1154 | 62.1k | m_rep2 = m_rep1; |
1155 | 62.1k | m_rep1 = m_rep0; |
1156 | 62.1k | m_rep0 = distance; |
1157 | 62.1k | } |
1158 | | |
1159 | 12.0M | if (match_type == MatchType::RepMatch3) { |
1160 | 8.73M | u32 distance = m_rep3; |
1161 | 8.73M | m_rep3 = m_rep2; |
1162 | 8.73M | m_rep2 = m_rep1; |
1163 | 8.73M | m_rep1 = m_rep0; |
1164 | 8.73M | m_rep0 = distance; |
1165 | 8.73M | } |
1166 | | |
1167 | | // "In other cases (Rep Match 0/1/2/3), it decodes the zero-based |
1168 | | // length of match with "RepLenDecoder" decoder." |
1169 | 12.0M | u16 normalized_length = TRY(decode_normalized_match_length(m_rep_length_coder)); |
1170 | | |
1171 | | // "Then it updates the state." |
1172 | 0 | update_state_after_rep(); |
1173 | | |
1174 | | // "Then the decoder must copy match bytes as described in |
1175 | | // "The Match symbols copying" section." |
1176 | 12.0M | TRY(copy_match_to_buffer(normalized_length + normalized_to_real_match_length_offset)); |
1177 | 12.0M | } |
1178 | | |
1179 | 605k | if (m_found_end_of_stream_marker || has_reached_expected_data_size()) { |
1180 | 2.06k | if (m_options.uncompressed_size.has_value() && m_total_processed_bytes < m_options.uncompressed_size.value()) |
1181 | 94 | return Error::from_string_literal("Found end-of-stream marker earlier than expected"); |
1182 | | |
1183 | 1.96k | if (!is_range_decoder_in_clean_state()) |
1184 | 44 | return Error::from_string_literal("LZMA stream ends in an unclean state"); |
1185 | 1.96k | } |
1186 | | |
1187 | 605k | return m_dictionary->read(bytes); |
1188 | 605k | } |
1189 | | |
1190 | | ErrorOr<size_t> LzmaDecompressor::write_some(ReadonlyBytes) |
1191 | 0 | { |
1192 | 0 | return Error::from_errno(EBADF); |
1193 | 0 | } |
1194 | | |
1195 | | bool LzmaDecompressor::is_eof() const |
1196 | 1.19M | { |
1197 | 1.19M | if (m_dictionary->used_space() > 0) |
1198 | 1.17M | return false; |
1199 | | |
1200 | 18.6k | if (has_reached_expected_data_size()) |
1201 | 39 | return true; |
1202 | | |
1203 | 18.5k | return m_found_end_of_stream_marker; |
1204 | 18.6k | } |
1205 | | |
1206 | | bool LzmaDecompressor::is_open() const |
1207 | 0 | { |
1208 | 0 | return true; |
1209 | 0 | } |
1210 | | |
1211 | | void LzmaDecompressor::close() |
1212 | 0 | { |
1213 | 0 | } |
1214 | | |
1215 | | ErrorOr<NonnullOwnPtr<LzmaCompressor>> LzmaCompressor::create_container(MaybeOwned<Stream> stream, LzmaCompressorOptions const& options) |
1216 | 1.88k | { |
1217 | 3.77k | auto dictionary = TRY(try_make<SearchableCircularBuffer>(TRY(SearchableCircularBuffer::create_empty(options.dictionary_size + largest_real_match_length)))); |
1218 | | |
1219 | | // "The LZMA Decoder uses (1 << (lc + lp)) tables with CProb values, where each table contains 0x300 CProb values." |
1220 | 1.88k | auto literal_probabilities = TRY(FixedArray<Probability>::create(literal_probability_table_size * (1 << (options.literal_context_bits + options.literal_position_bits)))); |
1221 | | |
1222 | 1.88k | auto header = TRY(LzmaHeader::from_compressor_options(options)); |
1223 | 1.88k | TRY(stream->write_value(header)); |
1224 | | |
1225 | 1.88k | auto compressor = TRY(adopt_nonnull_own_or_enomem(new (nothrow) LzmaCompressor(move(stream), options, move(dictionary), move(literal_probabilities)))); |
1226 | | |
1227 | 0 | return compressor; |
1228 | 1.88k | } |
1229 | | |
1230 | | LzmaCompressor::LzmaCompressor(MaybeOwned<AK::Stream> stream, Compress::LzmaCompressorOptions options, MaybeOwned<SearchableCircularBuffer> dictionary, FixedArray<Compress::LzmaState::Probability> literal_probabilities) |
1231 | 1.88k | : LzmaState(move(literal_probabilities)) |
1232 | 1.88k | , m_stream(move(stream)) |
1233 | 1.88k | , m_options(move(options)) |
1234 | 1.88k | , m_dictionary(move(dictionary)) |
1235 | 1.88k | { |
1236 | 1.88k | } |
1237 | | |
1238 | | ErrorOr<Bytes> LzmaCompressor::read_some(Bytes) |
1239 | 0 | { |
1240 | 0 | return Error::from_errno(EBADF); |
1241 | 0 | } |
1242 | | |
1243 | | ErrorOr<size_t> LzmaCompressor::write_some(ReadonlyBytes bytes) |
1244 | 5.93M | { |
1245 | | // Fill the input buffer until it's full or until we can't read any more data. |
1246 | 5.93M | size_t processed_bytes = min(bytes.size(), largest_real_match_length - m_dictionary->used_space()); |
1247 | 5.93M | bytes = bytes.trim(processed_bytes); |
1248 | | |
1249 | 11.8M | while (bytes.size() > 0) { |
1250 | 5.93M | auto const written_bytes = m_dictionary->write(bytes); |
1251 | 5.93M | bytes = bytes.slice(written_bytes); |
1252 | 5.93M | } |
1253 | | |
1254 | 5.93M | VERIFY(m_dictionary->used_space() <= largest_real_match_length); |
1255 | | |
1256 | 5.93M | if (m_options.uncompressed_size.has_value() && m_total_processed_bytes + m_dictionary->used_space() > m_options.uncompressed_size.value()) |
1257 | 0 | return Error::from_string_literal("Tried to compress more LZMA data than announced"); |
1258 | | |
1259 | 11.8M | TRY(encode_once()); |
1260 | | |
1261 | | // If we read enough data to reach the final uncompressed size, flush automatically. |
1262 | | // Flushing will handle encoding the remaining data for us and finalize the stream. |
1263 | 5.93M | if (m_options.uncompressed_size.has_value() && m_total_processed_bytes + m_dictionary->used_space() >= m_options.uncompressed_size.value()) |
1264 | 0 | TRY(flush()); |
1265 | | |
1266 | 5.93M | return processed_bytes; |
1267 | 11.8M | } |
1268 | | |
1269 | | ErrorOr<void> LzmaCompressor::flush() |
1270 | 1.88k | { |
1271 | 1.88k | if (m_has_flushed_data) |
1272 | 0 | return Error::from_string_literal("Flushed an LZMA stream twice"); |
1273 | | |
1274 | 122k | while (m_dictionary->used_space() > 0) |
1275 | 120k | TRY(encode_once()); |
1276 | | |
1277 | 1.88k | if (m_options.uncompressed_size.has_value() && m_total_processed_bytes < m_options.uncompressed_size.value()) |
1278 | 0 | return Error::from_string_literal("Flushing LZMA data with known but unreached uncompressed size"); |
1279 | | |
1280 | | // The LZMA specification technically also allows both a known size and an end-of-stream marker simultaneously, |
1281 | | // but LZMA2 rejects them, so skip emitting the end-of-stream marker if we know the uncompressed size. |
1282 | 1.88k | if (!m_options.uncompressed_size.has_value()) |
1283 | 1.88k | TRY(encode_normalized_simple_match(end_of_stream_marker, 0)); |
1284 | | |
1285 | | // Shifting the range encoder using the normal operation handles any pending overflows. |
1286 | 3.77k | TRY(shift_range_encoder()); |
1287 | | |
1288 | | // Now, the remaining bytes are the cached byte, the chain of 0xFF, and the upper 3 bytes of the current `code`. |
1289 | | // Incrementing the values does not have to be considered as no overflows are pending. The fourth byte is the |
1290 | | // null byte that we just shifted in, which should not be flushed as it would be extraneous junk data. |
1291 | 1.88k | TRY(m_stream->write_value<u8>(m_range_encoder_cached_byte)); |
1292 | 1.89k | for (size_t i = 0; i < m_range_encoder_ff_chain_length; i++) |
1293 | 6 | TRY(m_stream->write_value<u8>(0xFF)); |
1294 | 3.77k | TRY(m_stream->write_value<u8>(m_range_encoder_code >> 24)); |
1295 | 1.88k | TRY(m_stream->write_value<u8>(m_range_encoder_code >> 16)); |
1296 | 1.88k | TRY(m_stream->write_value<u8>(m_range_encoder_code >> 8)); |
1297 | | |
1298 | 0 | m_has_flushed_data = true; |
1299 | 1.88k | return {}; |
1300 | 1.88k | } |
1301 | | |
1302 | | bool LzmaCompressor::is_eof() const |
1303 | 0 | { |
1304 | 0 | return true; |
1305 | 0 | } |
1306 | | |
1307 | | bool LzmaCompressor::is_open() const |
1308 | 0 | { |
1309 | 0 | return !m_has_flushed_data; |
1310 | 0 | } |
1311 | | |
1312 | | void LzmaCompressor::close() |
1313 | 0 | { |
1314 | 0 | if (!m_has_flushed_data) { |
1315 | | // Note: We need a better API for specifying things like this. |
1316 | 0 | flush().release_value_but_fixme_should_propagate_errors(); |
1317 | 0 | } |
1318 | 0 | } |
1319 | | |
1320 | | LzmaCompressor::~LzmaCompressor() |
1321 | 1.88k | { |
1322 | 1.88k | if (!m_has_flushed_data) { |
1323 | | // Note: We need a better API for specifying things like this. |
1324 | 0 | flush().release_value_but_fixme_should_propagate_errors(); |
1325 | 0 | } |
1326 | 1.88k | } |
1327 | | |
1328 | | } |