/src/duckdb/src/common/gzip_file_system.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | #include "duckdb/common/gzip_file_system.hpp" |
2 | | #include "duckdb/common/exception.hpp" |
3 | | #include "duckdb/common/file_system.hpp" |
4 | | |
5 | | #include "miniz.hpp" |
6 | | #include "miniz_wrapper.hpp" |
7 | | |
8 | | #include "duckdb/common/limits.hpp" |
9 | | |
10 | | namespace duckdb { |
11 | | |
12 | | /* |
13 | | |
14 | | 0 2 bytes magic header 0x1f, 0x8b (\037 \213) |
15 | | 2 1 byte compression method |
16 | | 0: store (copied) |
17 | | 1: compress |
18 | | 2: pack |
19 | | 3: lzh |
20 | | 4..7: reserved |
21 | | 8: deflate |
22 | | 3 1 byte flags |
23 | | bit 0 set: file probably ascii text |
24 | | bit 1 set: continuation of multi-part gzip file, part number present |
25 | | bit 2 set: extra field present |
26 | | bit 3 set: original file name present |
27 | | bit 4 set: file comment present |
28 | | bit 5 set: file is encrypted, encryption header present |
29 | | bit 6,7: reserved |
30 | | 4 4 bytes file modification time in Unix format |
31 | | 8 1 byte extra flags (depend on compression method) |
32 | | 9 1 byte OS type |
33 | | [ |
34 | | 2 bytes optional part number (second part=1) |
35 | | ]? |
36 | | [ |
37 | | 2 bytes optional extra field length (e) |
38 | | (e)bytes optional extra field |
39 | | ]? |
40 | | [ |
41 | | bytes optional original file name, zero terminated |
42 | | ]? |
43 | | [ |
44 | | bytes optional file comment, zero terminated |
45 | | ]? |
46 | | [ |
47 | | 12 bytes optional encryption header |
48 | | ]? |
49 | | bytes compressed data |
50 | | 4 bytes crc32 |
51 | | 4 bytes uncompressed input size modulo 2^32 |
52 | | |
53 | | */ |
54 | | |
55 | 0 | static idx_t GZipConsumeString(FileHandle &input) { |
56 | 0 | idx_t size = 1; // terminator |
57 | 0 | char buffer[1]; |
58 | 0 | while (input.Read(buffer, 1) == 1) { |
59 | 0 | if (buffer[0] == '\0') { |
60 | 0 | break; |
61 | 0 | } |
62 | 0 | size++; |
63 | 0 | } |
64 | 0 | return size; |
65 | 0 | } |
66 | | |
67 | | struct MiniZStreamWrapper : public StreamWrapper { |
68 | | ~MiniZStreamWrapper() override; |
69 | | |
70 | | CompressedFile *file = nullptr; |
71 | | duckdb_miniz::mz_stream *mz_stream_ptr = nullptr; |
72 | | bool writing = false; |
73 | | duckdb_miniz::mz_ulong crc; |
74 | | idx_t total_size; |
75 | | |
76 | | public: |
77 | | void Initialize(CompressedFile &file, bool write) override; |
78 | | |
79 | | bool Read(StreamData &stream_data) override; |
80 | | void Write(CompressedFile &file, StreamData &stream_data, data_ptr_t buffer, int64_t nr_bytes) override; |
81 | | |
82 | | void Close() override; |
83 | | |
84 | | void FlushStream(); |
85 | | }; |
86 | | |
87 | 0 | MiniZStreamWrapper::~MiniZStreamWrapper() { |
88 | | // avoid closing if destroyed during stack unwinding |
89 | 0 | if (Exception::UncaughtException()) { |
90 | 0 | return; |
91 | 0 | } |
92 | 0 | try { |
93 | 0 | Close(); |
94 | 0 | } catch (...) { |
95 | 0 | } |
96 | 0 | } |
97 | | |
98 | 0 | void MiniZStreamWrapper::Initialize(CompressedFile &file, bool write) { |
99 | 0 | Close(); |
100 | 0 | this->file = &file; |
101 | 0 | mz_stream_ptr = new duckdb_miniz::mz_stream(); |
102 | 0 | memset(mz_stream_ptr, 0, sizeof(duckdb_miniz::mz_stream)); |
103 | 0 | this->writing = write; |
104 | | |
105 | | // TODO use custom alloc/free methods in miniz to throw exceptions on OOM |
106 | 0 | uint8_t gzip_hdr[GZIP_HEADER_MINSIZE]; |
107 | 0 | if (write) { |
108 | 0 | crc = MZ_CRC32_INIT; |
109 | 0 | total_size = 0; |
110 | |
|
111 | 0 | MiniZStream::InitializeGZIPHeader(gzip_hdr); |
112 | 0 | file.child_handle->Write(gzip_hdr, GZIP_HEADER_MINSIZE); |
113 | |
|
114 | 0 | auto ret = mz_deflateInit2((duckdb_miniz::mz_streamp)mz_stream_ptr, duckdb_miniz::MZ_DEFAULT_LEVEL, MZ_DEFLATED, |
115 | 0 | -MZ_DEFAULT_WINDOW_BITS, 1, 0); |
116 | 0 | if (ret != duckdb_miniz::MZ_OK) { |
117 | 0 | throw InternalException("Failed to initialize miniz"); |
118 | 0 | } |
119 | 0 | } else { |
120 | 0 | idx_t data_start = GZIP_HEADER_MINSIZE; |
121 | 0 | auto read_count = file.child_handle->Read(gzip_hdr, GZIP_HEADER_MINSIZE); |
122 | 0 | GZipFileSystem::VerifyGZIPHeader(gzip_hdr, read_count); |
123 | |
|
124 | 0 | if (gzip_hdr[3] & GZIP_FLAG_NAME) { |
125 | 0 | file.child_handle->Seek(data_start); |
126 | 0 | data_start += GZipConsumeString(*file.child_handle); |
127 | 0 | } |
128 | 0 | file.child_handle->Seek(data_start); |
129 | | // stream is now set to beginning of payload data |
130 | 0 | auto ret = duckdb_miniz::mz_inflateInit2((duckdb_miniz::mz_streamp)mz_stream_ptr, -MZ_DEFAULT_WINDOW_BITS); |
131 | 0 | if (ret != duckdb_miniz::MZ_OK) { |
132 | 0 | throw InternalException("Failed to initialize miniz"); |
133 | 0 | } |
134 | 0 | } |
135 | 0 | } |
136 | | |
137 | 0 | bool MiniZStreamWrapper::Read(StreamData &sd) { |
138 | | // actually decompress |
139 | 0 | mz_stream_ptr->next_in = (data_ptr_t)sd.in_buff_start; |
140 | 0 | D_ASSERT(sd.in_buff_end - sd.in_buff_start < NumericLimits<int32_t>::Maximum()); |
141 | 0 | mz_stream_ptr->avail_in = (uint32_t)(sd.in_buff_end - sd.in_buff_start); |
142 | 0 | mz_stream_ptr->next_out = (data_ptr_t)sd.out_buff_end; |
143 | 0 | mz_stream_ptr->avail_out = (uint32_t)((sd.out_buff.get() + sd.out_buf_size) - sd.out_buff_end); |
144 | 0 | auto ret = duckdb_miniz::mz_inflate(mz_stream_ptr, duckdb_miniz::MZ_NO_FLUSH); |
145 | 0 | if (ret != duckdb_miniz::MZ_OK && ret != duckdb_miniz::MZ_STREAM_END) { |
146 | 0 | throw IOException("Failed to decode gzip stream: %s", duckdb_miniz::mz_error(ret)); |
147 | 0 | } |
148 | | // update pointers following inflate() |
149 | 0 | sd.in_buff_start = (data_ptr_t)mz_stream_ptr->next_in; |
150 | 0 | sd.in_buff_end = sd.in_buff_start + mz_stream_ptr->avail_in; |
151 | 0 | sd.out_buff_end = (data_ptr_t)mz_stream_ptr->next_out; |
152 | 0 | D_ASSERT(sd.out_buff_end + mz_stream_ptr->avail_out == sd.out_buff.get() + sd.out_buf_size); |
153 | | // if stream ended, deallocate inflator |
154 | 0 | if (ret == duckdb_miniz::MZ_STREAM_END) { |
155 | 0 | Close(); |
156 | 0 | return true; |
157 | 0 | } |
158 | 0 | return false; |
159 | 0 | } |
160 | | |
161 | | void MiniZStreamWrapper::Write(CompressedFile &file, StreamData &sd, data_ptr_t uncompressed_data, |
162 | 0 | int64_t uncompressed_size) { |
163 | | // update the src and the total size |
164 | 0 | crc = duckdb_miniz::mz_crc32(crc, (const unsigned char *)uncompressed_data, uncompressed_size); |
165 | 0 | total_size += uncompressed_size; |
166 | |
|
167 | 0 | auto remaining = uncompressed_size; |
168 | 0 | while (remaining > 0) { |
169 | 0 | idx_t output_remaining = (sd.out_buff.get() + sd.out_buf_size) - sd.out_buff_start; |
170 | |
|
171 | 0 | mz_stream_ptr->next_in = (const unsigned char *)uncompressed_data; |
172 | 0 | mz_stream_ptr->avail_in = remaining; |
173 | 0 | mz_stream_ptr->next_out = sd.out_buff_start; |
174 | 0 | mz_stream_ptr->avail_out = output_remaining; |
175 | |
|
176 | 0 | auto res = mz_deflate(mz_stream_ptr, duckdb_miniz::MZ_NO_FLUSH); |
177 | 0 | if (res != duckdb_miniz::MZ_OK) { |
178 | 0 | D_ASSERT(res != duckdb_miniz::MZ_STREAM_END); |
179 | 0 | throw InternalException("Failed to compress GZIP block"); |
180 | 0 | } |
181 | 0 | sd.out_buff_start += output_remaining - mz_stream_ptr->avail_out; |
182 | 0 | if (mz_stream_ptr->avail_out == 0) { |
183 | | // no more output buffer available: flush |
184 | 0 | file.child_handle->Write(sd.out_buff.get(), sd.out_buff_start - sd.out_buff.get()); |
185 | 0 | sd.out_buff_start = sd.out_buff.get(); |
186 | 0 | } |
187 | 0 | idx_t written = remaining - mz_stream_ptr->avail_in; |
188 | 0 | uncompressed_data += written; |
189 | 0 | remaining = mz_stream_ptr->avail_in; |
190 | 0 | } |
191 | 0 | } |
192 | | |
193 | 0 | void MiniZStreamWrapper::FlushStream() { |
194 | 0 | auto &sd = file->stream_data; |
195 | 0 | mz_stream_ptr->next_in = nullptr; |
196 | 0 | mz_stream_ptr->avail_in = 0; |
197 | 0 | while (true) { |
198 | 0 | auto output_remaining = (sd.out_buff.get() + sd.out_buf_size) - sd.out_buff_start; |
199 | 0 | mz_stream_ptr->next_out = sd.out_buff_start; |
200 | 0 | mz_stream_ptr->avail_out = output_remaining; |
201 | |
|
202 | 0 | auto res = mz_deflate(mz_stream_ptr, duckdb_miniz::MZ_FINISH); |
203 | 0 | sd.out_buff_start += (output_remaining - mz_stream_ptr->avail_out); |
204 | 0 | if (sd.out_buff_start > sd.out_buff.get()) { |
205 | 0 | file->child_handle->Write(sd.out_buff.get(), sd.out_buff_start - sd.out_buff.get()); |
206 | 0 | sd.out_buff_start = sd.out_buff.get(); |
207 | 0 | } |
208 | 0 | if (res == duckdb_miniz::MZ_STREAM_END) { |
209 | 0 | break; |
210 | 0 | } |
211 | 0 | if (res != duckdb_miniz::MZ_OK) { |
212 | 0 | throw InternalException("Failed to compress GZIP block"); |
213 | 0 | } |
214 | 0 | } |
215 | 0 | } |
216 | | |
217 | 0 | void MiniZStreamWrapper::Close() { |
218 | 0 | if (!mz_stream_ptr) { |
219 | 0 | return; |
220 | 0 | } |
221 | 0 | if (writing) { |
222 | | // flush anything remaining in the stream |
223 | 0 | FlushStream(); |
224 | | |
225 | | // write the footer |
226 | 0 | unsigned char gzip_footer[MiniZStream::GZIP_FOOTER_SIZE]; |
227 | 0 | MiniZStream::InitializeGZIPFooter(gzip_footer, crc, total_size); |
228 | 0 | file->child_handle->Write(gzip_footer, MiniZStream::GZIP_FOOTER_SIZE); |
229 | |
|
230 | 0 | duckdb_miniz::mz_deflateEnd(mz_stream_ptr); |
231 | 0 | } else { |
232 | 0 | duckdb_miniz::mz_inflateEnd(mz_stream_ptr); |
233 | 0 | } |
234 | 0 | delete mz_stream_ptr; |
235 | 0 | mz_stream_ptr = nullptr; |
236 | 0 | file = nullptr; |
237 | 0 | } |
238 | | |
239 | | class GZipFile : public CompressedFile { |
240 | | public: |
241 | | GZipFile(unique_ptr<FileHandle> child_handle_p, const string &path, bool write) |
242 | 0 | : CompressedFile(gzip_fs, move(child_handle_p), path) { |
243 | 0 | Initialize(write); |
244 | 0 | } |
245 | | |
246 | | GZipFileSystem gzip_fs; |
247 | | }; |
248 | | |
249 | 0 | void GZipFileSystem::VerifyGZIPHeader(uint8_t gzip_hdr[], idx_t read_count) { |
250 | | // check for incorrectly formatted files |
251 | 0 | if (read_count != GZIP_HEADER_MINSIZE) { |
252 | 0 | throw IOException("Input is not a GZIP stream"); |
253 | 0 | } |
254 | 0 | if (gzip_hdr[0] != 0x1F || gzip_hdr[1] != 0x8B) { // magic header |
255 | 0 | throw IOException("Input is not a GZIP stream"); |
256 | 0 | } |
257 | 0 | if (gzip_hdr[2] != GZIP_COMPRESSION_DEFLATE) { // compression method |
258 | 0 | throw IOException("Unsupported GZIP compression method"); |
259 | 0 | } |
260 | 0 | if (gzip_hdr[3] & GZIP_FLAG_UNSUPPORTED) { |
261 | 0 | throw IOException("Unsupported GZIP archive"); |
262 | 0 | } |
263 | 0 | } |
264 | | |
265 | 0 | string GZipFileSystem::UncompressGZIPString(const string &in) { |
266 | | // decompress file |
267 | 0 | auto body_ptr = in.data(); |
268 | |
|
269 | 0 | auto mz_stream_ptr = new duckdb_miniz::mz_stream(); |
270 | 0 | memset(mz_stream_ptr, 0, sizeof(duckdb_miniz::mz_stream)); |
271 | |
|
272 | 0 | uint8_t gzip_hdr[GZIP_HEADER_MINSIZE]; |
273 | | |
274 | | // check for incorrectly formatted files |
275 | | |
276 | | // TODO this is mostly the same as gzip_file_system.cpp |
277 | 0 | if (in.size() < GZIP_HEADER_MINSIZE) { |
278 | 0 | throw IOException("Input is not a GZIP stream"); |
279 | 0 | } |
280 | 0 | memcpy(gzip_hdr, body_ptr, GZIP_HEADER_MINSIZE); |
281 | 0 | body_ptr += GZIP_HEADER_MINSIZE; |
282 | 0 | GZipFileSystem::VerifyGZIPHeader(gzip_hdr, GZIP_HEADER_MINSIZE); |
283 | |
|
284 | 0 | if (gzip_hdr[3] & GZIP_FLAG_NAME) { |
285 | 0 | char c; |
286 | 0 | do { |
287 | 0 | c = *body_ptr; |
288 | 0 | body_ptr++; |
289 | 0 | } while (c != '\0' && (idx_t)(body_ptr - in.data()) < in.size()); |
290 | 0 | } |
291 | | |
292 | | // stream is now set to beginning of payload data |
293 | 0 | auto status = duckdb_miniz::mz_inflateInit2(mz_stream_ptr, -MZ_DEFAULT_WINDOW_BITS); |
294 | 0 | if (status != duckdb_miniz::MZ_OK) { |
295 | 0 | throw InternalException("Failed to initialize miniz"); |
296 | 0 | } |
297 | | |
298 | 0 | auto bytes_remaining = in.size() - (body_ptr - in.data()); |
299 | 0 | mz_stream_ptr->next_in = (unsigned char *)body_ptr; |
300 | 0 | mz_stream_ptr->avail_in = bytes_remaining; |
301 | |
|
302 | 0 | unsigned char decompress_buffer[BUFSIZ]; |
303 | 0 | string decompressed; |
304 | |
|
305 | 0 | while (status == duckdb_miniz::MZ_OK) { |
306 | 0 | mz_stream_ptr->next_out = decompress_buffer; |
307 | 0 | mz_stream_ptr->avail_out = sizeof(decompress_buffer); |
308 | 0 | status = mz_inflate(mz_stream_ptr, duckdb_miniz::MZ_NO_FLUSH); |
309 | 0 | if (status != duckdb_miniz::MZ_STREAM_END && status != duckdb_miniz::MZ_OK) { |
310 | 0 | throw IOException("Failed to uncompress"); |
311 | 0 | } |
312 | 0 | decompressed.append((char *)decompress_buffer, mz_stream_ptr->total_out - decompressed.size()); |
313 | 0 | } |
314 | 0 | duckdb_miniz::mz_inflateEnd(mz_stream_ptr); |
315 | 0 | if (decompressed.empty()) { |
316 | 0 | throw IOException("Failed to uncompress"); |
317 | 0 | } |
318 | 0 | return decompressed; |
319 | 0 | } |
320 | | |
321 | 0 | unique_ptr<FileHandle> GZipFileSystem::OpenCompressedFile(unique_ptr<FileHandle> handle, bool write) { |
322 | 0 | auto path = handle->path; |
323 | 0 | return make_unique<GZipFile>(move(handle), path, write); |
324 | 0 | } |
325 | | |
326 | 0 | unique_ptr<StreamWrapper> GZipFileSystem::CreateStream() { |
327 | 0 | return make_unique<MiniZStreamWrapper>(); |
328 | 0 | } |
329 | | |
330 | 0 | idx_t GZipFileSystem::InBufferSize() { |
331 | 0 | return BUFFER_SIZE; |
332 | 0 | } |
333 | | |
334 | 0 | idx_t GZipFileSystem::OutBufferSize() { |
335 | 0 | return BUFFER_SIZE; |
336 | 0 | } |
337 | | |
338 | | } // namespace duckdb |