/src/serenity/Userland/Libraries/LibCompress/Lzw.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2018-2021, Andreas Kling <kling@serenityos.org> |
3 | | * Copyright (c) 2022, the SerenityOS developers. |
4 | | * |
5 | | * SPDX-License-Identifier: BSD-2-Clause |
6 | | */ |
7 | | |
8 | | #pragma once |
9 | | |
10 | | #include <AK/BitStream.h> |
11 | | #include <AK/Concepts.h> |
12 | | #include <AK/Debug.h> |
13 | | #include <AK/Format.h> |
14 | | #include <AK/IntegralMath.h> |
15 | | #include <AK/MemoryStream.h> |
16 | | #include <AK/Vector.h> |
17 | | |
18 | | namespace Compress { |
19 | | |
20 | | namespace Details { |
21 | | |
22 | | class LzwState { |
23 | | public: |
24 | | u16 add_control_code() |
25 | 987k | { |
26 | 987k | u16 const control_code = m_code_table.size(); |
27 | 987k | m_code_table.append(Vector<u8> {}); |
28 | 987k | m_original_code_table.append(Vector<u8> {}); |
29 | 987k | if (m_code_table.size() >= m_table_capacity && m_code_size < max_code_size) { |
30 | 981k | ++m_code_size; |
31 | 981k | ++m_original_code_size; |
32 | 981k | m_table_capacity *= 2; |
33 | 981k | } |
34 | 987k | return control_code; |
35 | 987k | } |
36 | | |
37 | | void reset() |
38 | 195k | { |
39 | 195k | m_code_table.clear(); |
40 | 195k | m_code_table.extend(m_original_code_table); |
41 | 195k | m_code_size = m_original_code_size; |
42 | 195k | m_table_capacity = AK::exp2<u32>(m_code_size); |
43 | 195k | } |
44 | | |
45 | | protected: |
46 | | static constexpr int max_code_size = 12; |
47 | | static constexpr int max_table_size = 1 << max_code_size; |
48 | | |
49 | | LzwState(u8 min_code_size, i32 offset_for_size_change) |
50 | 493k | : m_code_size(min_code_size) |
51 | 493k | , m_original_code_size(min_code_size) |
52 | 493k | , m_table_capacity(AK::exp2<u32>(min_code_size)) |
53 | 493k | , m_offset_for_size_change(offset_for_size_change) |
54 | 493k | { |
55 | 493k | init_code_table(); |
56 | 493k | } |
57 | | |
58 | | void init_code_table() |
59 | 493k | { |
60 | 493k | m_code_table.ensure_capacity(m_table_capacity); |
61 | 1.93M | for (u16 i = 0; i < m_table_capacity; ++i) { |
62 | 1.44M | m_code_table.unchecked_append({ (u8)i }); |
63 | 1.44M | } |
64 | 493k | m_original_code_table = m_code_table; |
65 | 493k | } |
66 | | |
67 | | void extend_code_table(Vector<u8> const& entry) |
68 | 7.78M | { |
69 | 7.78M | if (entry.size() > 1 && m_code_table.size() < max_table_size) { |
70 | 1.97M | m_code_table.append(entry); |
71 | 1.97M | if (m_code_table.size() >= (m_table_capacity + m_offset_for_size_change) && m_code_size < max_code_size) { |
72 | 24.0k | ++m_code_size; |
73 | 24.0k | m_table_capacity *= 2; |
74 | 24.0k | } |
75 | 1.97M | } |
76 | 7.78M | } |
77 | | |
78 | | Vector<Vector<u8>> m_code_table {}; |
79 | | Vector<Vector<u8>> m_original_code_table {}; |
80 | | |
81 | | u8 m_code_size { 0 }; |
82 | | u8 m_original_code_size { 0 }; |
83 | | |
84 | | u32 m_table_capacity { 0 }; |
85 | | i32 m_offset_for_size_change {}; |
86 | | }; |
87 | | |
88 | | } |
89 | | |
90 | | template<InputBitStream InputStream> |
91 | | class LzwDecompressor : private Details::LzwState { |
92 | | public: |
93 | | explicit LzwDecompressor(MaybeOwned<InputStream> lzw_stream, u8 min_code_size, i32 offset_for_size_change = 0) |
94 | 493k | : LzwState(min_code_size, offset_for_size_change) |
95 | 493k | , m_bit_stream(move(lzw_stream)) |
96 | | |
97 | 493k | { |
98 | 493k | } Compress::LzwDecompressor<AK::LittleEndianInputBitStream>::LzwDecompressor(AK::MaybeOwned<AK::LittleEndianInputBitStream>, unsigned char, int) Line | Count | Source | 94 | 490k | : LzwState(min_code_size, offset_for_size_change) | 95 | 490k | , m_bit_stream(move(lzw_stream)) | 96 | | | 97 | 490k | { | 98 | 490k | } |
Compress::LzwDecompressor<AK::BigEndianInputBitStream>::LzwDecompressor(AK::MaybeOwned<AK::BigEndianInputBitStream>, unsigned char, int) Line | Count | Source | 94 | 2.97k | : LzwState(min_code_size, offset_for_size_change) | 95 | 2.97k | , m_bit_stream(move(lzw_stream)) | 96 | | | 97 | 2.97k | { | 98 | 2.97k | } |
|
99 | | |
100 | | static ErrorOr<ByteBuffer> decompress_all(ReadonlyBytes bytes, u8 initial_code_size, i32 offset_for_size_change = 0) |
101 | 493k | { |
102 | 493k | auto memory_stream = make<FixedMemoryStream>(bytes); |
103 | 493k | auto lzw_stream = make<InputStream>(MaybeOwned<Stream>(move(memory_stream))); |
104 | 493k | LzwDecompressor lzw_decompressor { MaybeOwned<InputStream> { move(lzw_stream) }, initial_code_size, offset_for_size_change }; |
105 | | |
106 | 493k | ByteBuffer decompressed; |
107 | | |
108 | 493k | u16 const clear_code = lzw_decompressor.add_control_code(); |
109 | 493k | u16 const end_of_data_code = lzw_decompressor.add_control_code(); |
110 | | |
111 | 8.47M | while (true) { |
112 | 8.47M | auto const code = TRY(lzw_decompressor.next_code()); |
113 | | |
114 | 8.47M | if (code == clear_code) { |
115 | 195k | lzw_decompressor.reset(); |
116 | 195k | continue; |
117 | 195k | } |
118 | | |
119 | 8.28M | if (code == end_of_data_code) |
120 | 493k | break; |
121 | | |
122 | 15.5M | TRY(decompressed.try_append(lzw_decompressor.get_output())); |
123 | 15.5M | } |
124 | | |
125 | 493k | return decompressed; |
126 | 493k | } Compress::LzwDecompressor<AK::LittleEndianInputBitStream>::decompress_all(AK::Span<unsigned char const>, unsigned char, int) Line | Count | Source | 101 | 490k | { | 102 | 490k | auto memory_stream = make<FixedMemoryStream>(bytes); | 103 | 490k | auto lzw_stream = make<InputStream>(MaybeOwned<Stream>(move(memory_stream))); | 104 | 490k | LzwDecompressor lzw_decompressor { MaybeOwned<InputStream> { move(lzw_stream) }, initial_code_size, offset_for_size_change }; | 105 | | | 106 | 490k | ByteBuffer decompressed; | 107 | | | 108 | 490k | u16 const clear_code = lzw_decompressor.add_control_code(); | 109 | 490k | u16 const end_of_data_code = lzw_decompressor.add_control_code(); | 110 | | | 111 | 7.68M | while (true) { | 112 | 7.68M | auto const code = TRY(lzw_decompressor.next_code()); | 113 | | | 114 | 7.68M | if (code == clear_code) { | 115 | 194k | lzw_decompressor.reset(); | 116 | 194k | continue; | 117 | 194k | } | 118 | | | 119 | 7.48M | if (code == end_of_data_code) | 120 | 490k | break; | 121 | | | 122 | 13.9M | TRY(decompressed.try_append(lzw_decompressor.get_output())); | 123 | 13.9M | } | 124 | | | 125 | 490k | return decompressed; | 126 | 490k | } |
Compress::LzwDecompressor<AK::BigEndianInputBitStream>::decompress_all(AK::Span<unsigned char const>, unsigned char, int) Line | Count | Source | 101 | 2.97k | { | 102 | 2.97k | auto memory_stream = make<FixedMemoryStream>(bytes); | 103 | 2.97k | auto lzw_stream = make<InputStream>(MaybeOwned<Stream>(move(memory_stream))); | 104 | 2.97k | LzwDecompressor lzw_decompressor { MaybeOwned<InputStream> { move(lzw_stream) }, initial_code_size, offset_for_size_change }; | 105 | | | 106 | 2.97k | ByteBuffer decompressed; | 107 | | | 108 | 2.97k | u16 const clear_code = lzw_decompressor.add_control_code(); | 109 | 2.97k | u16 const end_of_data_code = lzw_decompressor.add_control_code(); | 110 | | | 111 | 791k | while (true) { | 112 | 791k | auto const code = TRY(lzw_decompressor.next_code()); | 113 | | | 114 | 791k | if (code == clear_code) { | 115 | 472 | lzw_decompressor.reset(); | 116 | 472 | continue; | 117 | 472 | } | 118 | | | 119 | 790k | if (code == end_of_data_code) | 120 | 2.96k | break; | 121 | | | 122 | 1.57M | TRY(decompressed.try_append(lzw_decompressor.get_output())); | 123 | 1.57M | } | 124 | | | 125 | 2.96k | return decompressed; | 126 | 2.97k | } |
|
127 | | |
128 | | void reset() |
129 | 195k | { |
130 | 195k | LzwState::reset(); |
131 | 195k | m_output.clear(); |
132 | 195k | } Compress::LzwDecompressor<AK::LittleEndianInputBitStream>::reset() Line | Count | Source | 129 | 194k | { | 130 | 194k | LzwState::reset(); | 131 | 194k | m_output.clear(); | 132 | 194k | } |
Compress::LzwDecompressor<AK::BigEndianInputBitStream>::reset() Line | Count | Source | 129 | 472 | { | 130 | 472 | LzwState::reset(); | 131 | 472 | m_output.clear(); | 132 | 472 | } |
|
133 | | |
134 | | ErrorOr<u16> next_code() |
135 | 8.47M | { |
136 | 8.47M | m_current_code = TRY(m_bit_stream->template read_bits<u16>(m_code_size)); |
137 | | |
138 | 8.47M | if (m_current_code > m_code_table.size()) { |
139 | 97 | dbgln_if(LZW_DEBUG, "Corrupted LZW stream, invalid code: {}, code table size: {}", |
140 | 97 | m_current_code, |
141 | 97 | m_code_table.size()); |
142 | 97 | return Error::from_string_literal("Corrupted LZW stream, invalid code"); |
143 | 8.47M | } else if (m_current_code == m_code_table.size() && m_output.is_empty()) { |
144 | 10 | dbgln_if(LZW_DEBUG, "Corrupted LZW stream, valid new code but output buffer is empty: {}, code table size: {}", |
145 | 10 | m_current_code, |
146 | 10 | m_code_table.size()); |
147 | 10 | return Error::from_string_literal("Corrupted LZW stream, valid new code but output buffer is empty"); |
148 | 10 | } |
149 | | |
150 | 8.47M | return m_current_code; |
151 | 8.47M | } Compress::LzwDecompressor<AK::LittleEndianInputBitStream>::next_code() Line | Count | Source | 135 | 7.68M | { | 136 | 7.68M | m_current_code = TRY(m_bit_stream->template read_bits<u16>(m_code_size)); | 137 | | | 138 | 7.68M | if (m_current_code > m_code_table.size()) { | 139 | 88 | dbgln_if(LZW_DEBUG, "Corrupted LZW stream, invalid code: {}, code table size: {}", | 140 | 88 | m_current_code, | 141 | 88 | m_code_table.size()); | 142 | 88 | return Error::from_string_literal("Corrupted LZW stream, invalid code"); | 143 | 7.68M | } else if (m_current_code == m_code_table.size() && m_output.is_empty()) { | 144 | 10 | dbgln_if(LZW_DEBUG, "Corrupted LZW stream, valid new code but output buffer is empty: {}, code table size: {}", | 145 | 10 | m_current_code, | 146 | 10 | m_code_table.size()); | 147 | 10 | return Error::from_string_literal("Corrupted LZW stream, valid new code but output buffer is empty"); | 148 | 10 | } | 149 | | | 150 | 7.68M | return m_current_code; | 151 | 7.68M | } |
Compress::LzwDecompressor<AK::BigEndianInputBitStream>::next_code() Line | Count | Source | 135 | 791k | { | 136 | 791k | m_current_code = TRY(m_bit_stream->template read_bits<u16>(m_code_size)); | 137 | | | 138 | 791k | if (m_current_code > m_code_table.size()) { | 139 | 9 | dbgln_if(LZW_DEBUG, "Corrupted LZW stream, invalid code: {}, code table size: {}", | 140 | 9 | m_current_code, | 141 | 9 | m_code_table.size()); | 142 | 9 | return Error::from_string_literal("Corrupted LZW stream, invalid code"); | 143 | 791k | } else if (m_current_code == m_code_table.size() && m_output.is_empty()) { | 144 | 0 | dbgln_if(LZW_DEBUG, "Corrupted LZW stream, valid new code but output buffer is empty: {}, code table size: {}", | 145 | 0 | m_current_code, | 146 | 0 | m_code_table.size()); | 147 | 0 | return Error::from_string_literal("Corrupted LZW stream, valid new code but output buffer is empty"); | 148 | 0 | } | 149 | | | 150 | 791k | return m_current_code; | 151 | 791k | } |
|
152 | | |
153 | | Vector<u8>& get_output() |
154 | 7.78M | { |
155 | 7.78M | VERIFY(m_current_code <= m_code_table.size()); |
156 | 7.78M | if (m_current_code < m_code_table.size()) { |
157 | 7.69M | Vector<u8> new_entry = m_output; |
158 | 7.69M | m_output = m_code_table.at(m_current_code); |
159 | 7.69M | new_entry.append(m_output[0]); |
160 | 7.69M | extend_code_table(new_entry); |
161 | 7.69M | } else if (m_current_code == m_code_table.size()) { |
162 | 89.5k | VERIFY(!m_output.is_empty()); |
163 | 89.5k | m_output.append(m_output[0]); |
164 | 89.5k | extend_code_table(m_output); |
165 | 89.5k | } |
166 | 7.78M | return m_output; |
167 | 7.78M | } Compress::LzwDecompressor<AK::LittleEndianInputBitStream>::get_output() Line | Count | Source | 154 | 6.99M | { | 155 | 6.99M | VERIFY(m_current_code <= m_code_table.size()); | 156 | 6.99M | if (m_current_code < m_code_table.size()) { | 157 | 6.92M | Vector<u8> new_entry = m_output; | 158 | 6.92M | m_output = m_code_table.at(m_current_code); | 159 | 6.92M | new_entry.append(m_output[0]); | 160 | 6.92M | extend_code_table(new_entry); | 161 | 6.92M | } else if (m_current_code == m_code_table.size()) { | 162 | 70.1k | VERIFY(!m_output.is_empty()); | 163 | 70.1k | m_output.append(m_output[0]); | 164 | 70.1k | extend_code_table(m_output); | 165 | 70.1k | } | 166 | 6.99M | return m_output; | 167 | 6.99M | } |
Compress::LzwDecompressor<AK::BigEndianInputBitStream>::get_output() Line | Count | Source | 154 | 787k | { | 155 | 787k | VERIFY(m_current_code <= m_code_table.size()); | 156 | 787k | if (m_current_code < m_code_table.size()) { | 157 | 768k | Vector<u8> new_entry = m_output; | 158 | 768k | m_output = m_code_table.at(m_current_code); | 159 | 768k | new_entry.append(m_output[0]); | 160 | 768k | extend_code_table(new_entry); | 161 | 768k | } else if (m_current_code == m_code_table.size()) { | 162 | 19.4k | VERIFY(!m_output.is_empty()); | 163 | 19.4k | m_output.append(m_output[0]); | 164 | 19.4k | extend_code_table(m_output); | 165 | 19.4k | } | 166 | 787k | return m_output; | 167 | 787k | } |
|
168 | | |
169 | | private: |
170 | | MaybeOwned<InputStream> m_bit_stream; |
171 | | |
172 | | u16 m_current_code { 0 }; |
173 | | Vector<u8> m_output {}; |
174 | | }; |
175 | | |
176 | | class LzwCompressor : private Details::LzwState { |
177 | | public: |
178 | | static ErrorOr<ByteBuffer> compress_all(ReadonlyBytes bytes, u8 initial_code_size) |
179 | 0 | { |
180 | 0 | LzwCompressor compressor { initial_code_size }; |
181 | 0 | AllocatingMemoryStream buffer; |
182 | 0 | LittleEndianOutputBitStream output_stream { MaybeOwned<Stream>(buffer) }; |
183 | 0 |
|
184 | 0 | u16 const clear_code = compressor.add_control_code(); |
185 | 0 | u16 const end_of_data_code = compressor.add_control_code(); |
186 | 0 |
|
187 | 0 | TRY(output_stream.write_bits(clear_code, compressor.m_code_size)); |
188 | 0 |
|
189 | 0 | u32 last_offset = 0; |
190 | 0 |
|
191 | 0 | while (last_offset < bytes.size()) { |
192 | 0 | ReadonlyBytes current_symbol {}; |
193 | 0 | u16 current_code {}; |
194 | 0 |
|
195 | 0 | if (compressor.m_code_table.size() == max_table_size - 2) { |
196 | 0 | TRY(output_stream.write_bits(clear_code, compressor.m_code_size)); |
197 | 0 | compressor.reset(); |
198 | 0 | } |
199 | 0 |
|
200 | 0 | bool found_symbol = false; |
201 | 0 |
|
202 | 0 | for (u32 symbol_size = 1; last_offset + symbol_size <= bytes.size(); ++symbol_size) { |
203 | 0 | current_symbol = bytes.slice(last_offset, symbol_size); |
204 | 0 | auto const new_code = compressor.code_for_symbol(current_symbol); |
205 | 0 |
|
206 | 0 | if (new_code.has_value()) { |
207 | 0 | current_code = *new_code; |
208 | 0 | } else { |
209 | 0 | found_symbol = true; |
210 | 0 | break; |
211 | 0 | } |
212 | 0 | } |
213 | 0 |
|
214 | 0 | TRY(output_stream.write_bits(current_code, compressor.m_code_size)); |
215 | 0 |
|
216 | 0 | if (found_symbol) { |
217 | 0 | compressor.extend_code_table(Vector(current_symbol)); |
218 | 0 | current_symbol = current_symbol.trim(current_symbol.size() - 1); |
219 | 0 | } |
220 | 0 | last_offset += current_symbol.size(); |
221 | 0 | } |
222 | 0 |
|
223 | 0 | TRY(output_stream.write_bits(end_of_data_code, compressor.m_code_size)); |
224 | 0 | TRY(output_stream.align_to_byte_boundary()); |
225 | 0 | TRY(output_stream.flush_buffer_to_stream()); |
226 | 0 |
|
227 | 0 | return TRY(buffer.read_until_eof()); |
228 | 0 | } |
229 | | |
230 | | private: |
231 | | LzwCompressor(u8 initial_code_size) |
232 | | : Details::LzwState(initial_code_size, 1) |
233 | 0 | { |
234 | 0 | } |
235 | | |
236 | | Optional<u16> code_for_symbol(ReadonlyBytes bytes) |
237 | 0 | { |
238 | 0 | for (u16 i = 0; i < m_code_table.size(); ++i) { |
239 | 0 | if (m_code_table[i].span() == bytes) |
240 | 0 | return i; |
241 | 0 | } |
242 | 0 |
|
243 | 0 | return OptionalNone {}; |
244 | 0 | } |
245 | | }; |
246 | | |
247 | | } |