/src/serenity/Userland/Libraries/LibCompress/Deflate.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2020, the SerenityOS developers. |
3 | | * Copyright (c) 2021, Idan Horowitz <idan.horowitz@serenityos.org> |
4 | | * |
5 | | * SPDX-License-Identifier: BSD-2-Clause |
6 | | */ |
7 | | |
8 | | #pragma once |
9 | | |
10 | | #include <AK/BitStream.h> |
11 | | #include <AK/ByteBuffer.h> |
12 | | #include <AK/CircularBuffer.h> |
13 | | #include <AK/Endian.h> |
14 | | #include <AK/Forward.h> |
15 | | #include <AK/MaybeOwned.h> |
16 | | #include <AK/Stream.h> |
17 | | #include <AK/Vector.h> |
18 | | #include <LibCompress/DeflateTables.h> |
19 | | |
20 | | namespace Compress { |
21 | | |
22 | | class CanonicalCode { |
23 | | public: |
24 | 175k | CanonicalCode() = default; |
25 | | ErrorOr<u32> read_symbol(LittleEndianInputBitStream&) const; |
26 | | ErrorOr<void> write_symbol(LittleEndianOutputBitStream&, u32) const; |
27 | | |
28 | | static CanonicalCode const& fixed_literal_codes(); |
29 | | static CanonicalCode const& fixed_distance_codes(); |
30 | | |
31 | | static ErrorOr<CanonicalCode> from_bytes(ReadonlyBytes); |
32 | | |
33 | | private: |
34 | | static constexpr size_t max_allowed_prefixed_code_length = 8; |
35 | | |
36 | | struct PrefixTableEntry { |
37 | | u16 symbol_value { 0 }; |
38 | | u16 code_length { 0 }; |
39 | | }; |
40 | | |
41 | | // Decompression - indexed by code |
42 | | Vector<u16, 286> m_symbol_values; |
43 | | |
44 | | Vector<u32, 16> m_first_symbol_of_length_after; |
45 | | Vector<u16, 16> m_offset_to_first_symbol_index; |
46 | | |
47 | | Array<PrefixTableEntry, 1 << max_allowed_prefixed_code_length> m_prefix_table {}; |
48 | | size_t m_max_prefixed_code_length { 0 }; |
49 | | |
50 | | // Compression - indexed by symbol |
51 | | // Deflate uses a maximum of 288 symbols (maximum of 32 for distances), |
52 | | // but this is also used by webp, which can use up to 256 + 24 + (1 << 11) == 2328 symbols. |
53 | | Vector<u16, 288> m_bit_codes {}; |
54 | | Vector<u16, 288> m_bit_code_lengths {}; |
55 | | }; |
56 | | |
57 | | ALWAYS_INLINE ErrorOr<void> CanonicalCode::write_symbol(LittleEndianOutputBitStream& stream, u32 symbol) const |
58 | 98.1M | { |
59 | 98.1M | auto code = m_bit_codes[symbol]; |
60 | 98.1M | auto length = m_bit_code_lengths[symbol]; |
61 | 98.1M | TRY(stream.write_bits(code, length)); |
62 | 98.1M | return {}; |
63 | 98.1M | } |
64 | | |
65 | | class DeflateDecompressor final : public Stream { |
66 | | private: |
67 | | class CompressedBlock { |
68 | | public: |
69 | | CompressedBlock(DeflateDecompressor&, CanonicalCode literal_codes, Optional<CanonicalCode> distance_codes); |
70 | | |
71 | | ErrorOr<bool> try_read_more(); |
72 | | |
73 | | private: |
74 | | bool m_eof { false }; |
75 | | |
76 | | DeflateDecompressor& m_decompressor; |
77 | | CanonicalCode m_literal_codes; |
78 | | Optional<CanonicalCode> m_distance_codes; |
79 | | }; |
80 | | |
81 | | class UncompressedBlock { |
82 | | public: |
83 | | UncompressedBlock(DeflateDecompressor&, size_t); |
84 | | |
85 | | ErrorOr<bool> try_read_more(); |
86 | | |
87 | | private: |
88 | | DeflateDecompressor& m_decompressor; |
89 | | size_t m_bytes_remaining; |
90 | | }; |
91 | | |
92 | | enum class State { |
93 | | Idle, |
94 | | ReadingCompressedBlock, |
95 | | ReadingUncompressedBlock |
96 | | }; |
97 | | |
98 | | public: |
99 | | friend CompressedBlock; |
100 | | friend UncompressedBlock; |
101 | | |
102 | | static ErrorOr<NonnullOwnPtr<DeflateDecompressor>> construct(MaybeOwned<LittleEndianInputBitStream> stream); |
103 | | ~DeflateDecompressor(); |
104 | | |
105 | | virtual ErrorOr<Bytes> read_some(Bytes) override; |
106 | | virtual ErrorOr<size_t> write_some(ReadonlyBytes) override; |
107 | | virtual bool is_eof() const override; |
108 | | virtual bool is_open() const override; |
109 | | virtual void close() override; |
110 | | |
111 | | static ErrorOr<ByteBuffer> decompress_all(ReadonlyBytes); |
112 | | |
113 | | private: |
114 | | DeflateDecompressor(MaybeOwned<LittleEndianInputBitStream> stream, CircularBuffer buffer); |
115 | | |
116 | | ErrorOr<u32> decode_length(u32); |
117 | | ErrorOr<u32> decode_distance(u32); |
118 | | ErrorOr<void> decode_codes(CanonicalCode& literal_code, Optional<CanonicalCode>& distance_code); |
119 | | |
120 | | static constexpr u16 max_back_reference_length = 258; |
121 | | |
122 | | bool m_read_final_block { false }; |
123 | | |
124 | | State m_state { State::Idle }; |
125 | | union { |
126 | | CompressedBlock m_compressed_block; |
127 | | UncompressedBlock m_uncompressed_block; |
128 | | }; |
129 | | |
130 | | MaybeOwned<LittleEndianInputBitStream> m_input_stream; |
131 | | CircularBuffer m_output_buffer; |
132 | | }; |
133 | | |
134 | | class DeflateCompressor final : public Stream { |
135 | | public: |
136 | | static constexpr size_t block_size = 32 * KiB - 1; // TODO: this can theoretically be increased to 64 KiB - 2 |
137 | | static constexpr size_t window_size = block_size * 2; |
138 | | static constexpr size_t hash_bits = 15; |
139 | | static constexpr size_t max_huffman_literals = 288; |
140 | | static constexpr size_t max_huffman_distances = 32; |
141 | | static constexpr size_t min_match_length = 4; // matches smaller than these are not worth the size of the back reference |
142 | | static constexpr size_t max_match_length = 258; // matches longer than these cannot be encoded using huffman codes |
143 | | static constexpr u16 empty_slot = UINT16_MAX; |
144 | | |
145 | | struct CompressionConstants { |
146 | | size_t good_match_length; // Once we find a match of at least this length (a good enough match) we reduce max_chain to lower processing time |
147 | | size_t max_lazy_length; // If the match is at least this long we dont defer matching to the next byte (which takes time) as its good enough |
148 | | size_t great_match_length; // Once we find a match of at least this length (a great match) we can just stop searching for longer ones |
149 | | size_t max_chain; // We only check the actual length of the max_chain closest matches |
150 | | }; |
151 | | |
152 | | // These constants were shamelessly "borrowed" from zlib |
153 | | static constexpr CompressionConstants compression_constants[] = { |
154 | | { 0, 0, 0, 0 }, |
155 | | { 4, 4, 8, 4 }, |
156 | | { 8, 16, 128, 128 }, |
157 | | { 32, 258, 258, 4096 }, |
158 | | { max_match_length, max_match_length, max_match_length, 1 << hash_bits } // disable all limits |
159 | | }; |
160 | | |
161 | | enum class CompressionLevel : int { |
162 | | STORE = 0, |
163 | | FAST, |
164 | | GOOD, |
165 | | GREAT, |
166 | | BEST // WARNING: this one can take an unreasonable amount of time! |
167 | | }; |
168 | | |
169 | | static ErrorOr<NonnullOwnPtr<DeflateCompressor>> construct(MaybeOwned<Stream>, CompressionLevel = CompressionLevel::GOOD); |
170 | | ~DeflateCompressor(); |
171 | | |
172 | | virtual ErrorOr<Bytes> read_some(Bytes) override; |
173 | | virtual ErrorOr<size_t> write_some(ReadonlyBytes) override; |
174 | | virtual bool is_eof() const override; |
175 | | virtual bool is_open() const override; |
176 | | virtual void close() override; |
177 | | ErrorOr<void> final_flush(); |
178 | | |
179 | | static ErrorOr<ByteBuffer> compress_all(ReadonlyBytes bytes, CompressionLevel = CompressionLevel::GOOD); |
180 | | |
181 | | private: |
182 | | DeflateCompressor(NonnullOwnPtr<LittleEndianOutputBitStream>, CompressionLevel = CompressionLevel::GOOD); |
183 | | |
184 | 39.2k | Bytes pending_block() { return { m_rolling_window + block_size, block_size }; } |
185 | | |
186 | | // LZ77 Compression |
187 | | static u16 hash_sequence(u8 const* bytes); |
188 | | size_t compare_match_candidate(size_t start, size_t candidate, size_t prev_match_length, size_t max_match_length); |
189 | | size_t find_back_match(size_t start, u16 hash, size_t previous_match_length, size_t max_match_length, size_t& match_position); |
190 | | void lz77_compress_block(); |
191 | | |
192 | | // Huffman Coding |
193 | | struct code_length_symbol { |
194 | | u8 symbol; |
195 | | u8 count; // used for special symbols 16-18 |
196 | | }; |
197 | | static u8 distance_to_base(u16 distance); |
198 | | size_t huffman_block_length(Array<u8, max_huffman_literals> const& literal_bit_lengths, Array<u8, max_huffman_distances> const& distance_bit_lengths); |
199 | | ErrorOr<void> write_huffman(CanonicalCode const& literal_code, Optional<CanonicalCode> const& distance_code); |
200 | | static size_t encode_huffman_lengths(ReadonlyBytes lengths, Array<code_length_symbol, max_huffman_literals + max_huffman_distances>& encoded_lengths); |
201 | | size_t encode_block_lengths(Array<u8, max_huffman_literals> const& literal_bit_lengths, Array<u8, max_huffman_distances> const& distance_bit_lengths, Array<code_length_symbol, max_huffman_literals + max_huffman_distances>& encoded_lengths, size_t& literal_code_count, size_t& distance_code_count); |
202 | | ErrorOr<void> write_dynamic_huffman(CanonicalCode const& literal_code, size_t literal_code_count, Optional<CanonicalCode> const& distance_code, size_t distance_code_count, Array<u8, 19> const& code_lengths_bit_lengths, size_t code_length_count, Array<code_length_symbol, max_huffman_literals + max_huffman_distances> const& encoded_lengths, size_t encoded_lengths_count); |
203 | | |
204 | | size_t uncompressed_block_length(); |
205 | | size_t fixed_block_length(); |
206 | | size_t dynamic_block_length(Array<u8, max_huffman_literals> const& literal_bit_lengths, Array<u8, max_huffman_distances> const& distance_bit_lengths, Array<u8, 19> const& code_lengths_bit_lengths, Array<u16, 19> const& code_lengths_frequencies, size_t code_lengths_count); |
207 | | ErrorOr<void> flush(); |
208 | | |
209 | | bool m_finished { false }; |
210 | | CompressionLevel m_compression_level; |
211 | | CompressionConstants m_compression_constants; |
212 | | NonnullOwnPtr<LittleEndianOutputBitStream> m_output_stream; |
213 | | |
214 | | u8 m_rolling_window[window_size]; |
215 | | size_t m_pending_block_size { 0 }; |
216 | | |
217 | | struct [[gnu::packed]] { |
218 | | u16 distance; // back reference length |
219 | | union { |
220 | | u16 literal; // literal byte or on of block symbol |
221 | | u16 length; // back reference length (if distance != 0) |
222 | | }; |
223 | | } m_symbol_buffer[block_size + 1]; |
224 | | size_t m_pending_symbol_size { 0 }; |
225 | | Array<u16, max_huffman_literals> m_symbol_frequencies; // there are 286 valid symbol values (symbols 286-287 never occur) |
226 | | Array<u16, max_huffman_distances> m_distance_frequencies; // there are 30 valid distance values (distances 30-31 never occur) |
227 | | |
228 | | // LZ77 Chained hash table |
229 | | u16 m_hash_head[1 << hash_bits]; |
230 | | u16 m_hash_prev[window_size]; |
231 | | }; |
232 | | |
233 | | } |