Coverage Report

Created: 2022-08-24 06:40

/src/duckdb/src/common/gzip_file_system.cpp
Line
Count
Source (jump to first uncovered line)
1
#include "duckdb/common/gzip_file_system.hpp"
2
#include "duckdb/common/exception.hpp"
3
#include "duckdb/common/file_system.hpp"
4
5
#include "miniz.hpp"
6
#include "miniz_wrapper.hpp"
7
8
#include "duckdb/common/limits.hpp"
9
10
namespace duckdb {
11
12
/*
13
14
  0      2 bytes  magic header  0x1f, 0x8b (\037 \213)
15
  2      1 byte   compression method
16
                     0: store (copied)
17
                     1: compress
18
                     2: pack
19
                     3: lzh
20
                     4..7: reserved
21
                     8: deflate
22
  3      1 byte   flags
23
                     bit 0 set: file probably ascii text
24
                     bit 1 set: continuation of multi-part gzip file, part number present
25
                     bit 2 set: extra field present
26
                     bit 3 set: original file name present
27
                     bit 4 set: file comment present
28
                     bit 5 set: file is encrypted, encryption header present
29
                     bit 6,7:   reserved
30
  4      4 bytes  file modification time in Unix format
31
  8      1 byte   extra flags (depend on compression method)
32
  9      1 byte   OS type
33
[
34
         2 bytes  optional part number (second part=1)
35
]?
36
[
37
         2 bytes  optional extra field length (e)
38
        (e)bytes  optional extra field
39
]?
40
[
41
           bytes  optional original file name, zero terminated
42
]?
43
[
44
           bytes  optional file comment, zero terminated
45
]?
46
[
47
        12 bytes  optional encryption header
48
]?
49
           bytes  compressed data
50
         4 bytes  crc32
51
         4 bytes  uncompressed input size modulo 2^32
52
53
 */
54
55
0
// Reads single bytes from `input` until a '\0' byte is read or a one-byte read
// no longer returns 1. Returns the number of non-zero bytes read plus one; the
// extra one accounts for the terminator and is counted even when the loop ends
// because the read came up short.
static idx_t GZipConsumeString(FileHandle &input) {
  char current;
  idx_t consumed = 1; // terminator
  for (;;) {
    if (input.Read(&current, 1) != 1) {
      // could not read another byte
      break;
    }
    if (current == '\0') {
      // reached the end of the zero-terminated string
      break;
    }
    consumed++;
  }
  return consumed;
}
66
67
struct MiniZStreamWrapper : public StreamWrapper {
68
  ~MiniZStreamWrapper() override;
69
70
  CompressedFile *file = nullptr;
71
  duckdb_miniz::mz_stream *mz_stream_ptr = nullptr;
72
  bool writing = false;
73
  duckdb_miniz::mz_ulong crc;
74
  idx_t total_size;
75
76
public:
77
  void Initialize(CompressedFile &file, bool write) override;
78
79
  bool Read(StreamData &stream_data) override;
80
  void Write(CompressedFile &file, StreamData &stream_data, data_ptr_t buffer, int64_t nr_bytes) override;
81
82
  void Close() override;
83
84
  void FlushStream();
85
};
86
87
0
// Closes the stream on destruction. While another exception is propagating the
// stream is NOT flushed and no footer is written, but the miniz state is still
// released so that the allocation made in Initialize() does not leak.
MiniZStreamWrapper::~MiniZStreamWrapper() {
  // avoid closing (flushing / writing the footer) if destroyed during stack unwinding
  if (!Exception::UncaughtException()) {
    try {
      Close();
    } catch (...) {
      // destructors must not throw; any remaining state is released below
    }
  }
  // Close() was skipped or threw before it could free the stream: release the
  // miniz resources without performing any further I/O.
  if (mz_stream_ptr) {
    if (writing) {
      duckdb_miniz::mz_deflateEnd(mz_stream_ptr);
    } else {
      duckdb_miniz::mz_inflateEnd(mz_stream_ptr);
    }
    delete mz_stream_ptr;
    mz_stream_ptr = nullptr;
  }
}
97
98
0
// Closes any stream that is currently open and opens a new one on `file`.
//
// write == true : resets the CRC / size counters, writes the minimal gzip header
//                 to the child handle and initializes a raw-deflate compressor.
// write == false: reads and verifies the gzip header, skips the optional
//                 original-file-name field, positions the child handle at the
//                 start of the payload and initializes a raw-inflate decompressor.
//
// Throws InternalException when miniz fails to initialize; exceptions raised by
// the header I/O or by VerifyGZIPHeader propagate to the caller.
void MiniZStreamWrapper::Initialize(CompressedFile &file, bool write) {
  Close();
  this->file = &file;
  this->writing = write;

  // Own the stream locally until it is fully initialized. If anything below
  // throws, the allocation is freed here and mz_stream_ptr stays null, so a
  // later Close() never operates on a stream miniz did not initialize.
  unique_ptr<duckdb_miniz::mz_stream> stream(new duckdb_miniz::mz_stream());
  memset(stream.get(), 0, sizeof(duckdb_miniz::mz_stream));

  // TODO use custom alloc/free methods in miniz to throw exceptions on OOM
  uint8_t gzip_hdr[GZIP_HEADER_MINSIZE];
  if (write) {
    crc = MZ_CRC32_INIT;
    total_size = 0;

    MiniZStream::InitializeGZIPHeader(gzip_hdr);
    file.child_handle->Write(gzip_hdr, GZIP_HEADER_MINSIZE);

    // negative window bits: raw deflate, the gzip framing is written by this class
    auto ret = mz_deflateInit2((duckdb_miniz::mz_streamp)stream.get(), duckdb_miniz::MZ_DEFAULT_LEVEL, MZ_DEFLATED,
                               -MZ_DEFAULT_WINDOW_BITS, 1, 0);
    if (ret != duckdb_miniz::MZ_OK) {
      throw InternalException("Failed to initialize miniz");
    }
  } else {
    idx_t data_start = GZIP_HEADER_MINSIZE;
    auto read_count = file.child_handle->Read(gzip_hdr, GZIP_HEADER_MINSIZE);
    GZipFileSystem::VerifyGZIPHeader(gzip_hdr, read_count);

    if (gzip_hdr[3] & GZIP_FLAG_NAME) {
      // skip over the zero-terminated original file name
      file.child_handle->Seek(data_start);
      data_start += GZipConsumeString(*file.child_handle);
    }
    file.child_handle->Seek(data_start);
    // stream is now set to beginning of payload data
    auto ret = duckdb_miniz::mz_inflateInit2((duckdb_miniz::mz_streamp)stream.get(), -MZ_DEFAULT_WINDOW_BITS);
    if (ret != duckdb_miniz::MZ_OK) {
      throw InternalException("Failed to initialize miniz");
    }
  }

  // fully initialized: hand ownership over to the wrapper
  mz_stream_ptr = stream.release();
}
136
137
0
// Runs one inflate step: consumes bytes from [sd.in_buff_start, sd.in_buff_end)
// and appends decompressed bytes at sd.out_buff_end, then updates the buffer
// pointers in `sd` to reflect what miniz consumed and produced.
// Returns true (after closing the stream) when the end of the compressed stream
// was reached, false otherwise. Throws IOException when miniz reports an error.
bool MiniZStreamWrapper::Read(StreamData &sd) {
  const auto out_capacity_end = sd.out_buff.get() + sd.out_buf_size;

  // hand the current input / output windows to miniz
  D_ASSERT(sd.in_buff_end - sd.in_buff_start < NumericLimits<int32_t>::Maximum());
  mz_stream_ptr->next_in = (data_ptr_t)sd.in_buff_start;
  mz_stream_ptr->avail_in = (uint32_t)(sd.in_buff_end - sd.in_buff_start);
  mz_stream_ptr->next_out = (data_ptr_t)sd.out_buff_end;
  mz_stream_ptr->avail_out = (uint32_t)(out_capacity_end - sd.out_buff_end);

  // actually decompress
  const auto status = duckdb_miniz::mz_inflate(mz_stream_ptr, duckdb_miniz::MZ_NO_FLUSH);
  const bool stream_finished = status == duckdb_miniz::MZ_STREAM_END;
  if (!stream_finished && status != duckdb_miniz::MZ_OK) {
    throw IOException("Failed to decode gzip stream: %s", duckdb_miniz::mz_error(status));
  }

  // update pointers following inflate()
  sd.in_buff_start = (data_ptr_t)mz_stream_ptr->next_in;
  sd.in_buff_end = sd.in_buff_start + mz_stream_ptr->avail_in;
  sd.out_buff_end = (data_ptr_t)mz_stream_ptr->next_out;
  D_ASSERT(sd.out_buff_end + mz_stream_ptr->avail_out == out_capacity_end);

  if (!stream_finished) {
    return false;
  }
  // stream ended: deallocate the inflator
  Close();
  return true;
}
160
161
void MiniZStreamWrapper::Write(CompressedFile &file, StreamData &sd, data_ptr_t uncompressed_data,
162
0
                               int64_t uncompressed_size) {
163
  // update the src and the total size
164
0
  crc = duckdb_miniz::mz_crc32(crc, (const unsigned char *)uncompressed_data, uncompressed_size);
165
0
  total_size += uncompressed_size;
166
167
0
  auto remaining = uncompressed_size;
168
0
  while (remaining > 0) {
169
0
    idx_t output_remaining = (sd.out_buff.get() + sd.out_buf_size) - sd.out_buff_start;
170
171
0
    mz_stream_ptr->next_in = (const unsigned char *)uncompressed_data;
172
0
    mz_stream_ptr->avail_in = remaining;
173
0
    mz_stream_ptr->next_out = sd.out_buff_start;
174
0
    mz_stream_ptr->avail_out = output_remaining;
175
176
0
    auto res = mz_deflate(mz_stream_ptr, duckdb_miniz::MZ_NO_FLUSH);
177
0
    if (res != duckdb_miniz::MZ_OK) {
178
0
      D_ASSERT(res != duckdb_miniz::MZ_STREAM_END);
179
0
      throw InternalException("Failed to compress GZIP block");
180
0
    }
181
0
    sd.out_buff_start += output_remaining - mz_stream_ptr->avail_out;
182
0
    if (mz_stream_ptr->avail_out == 0) {
183
      // no more output buffer available: flush
184
0
      file.child_handle->Write(sd.out_buff.get(), sd.out_buff_start - sd.out_buff.get());
185
0
      sd.out_buff_start = sd.out_buff.get();
186
0
    }
187
0
    idx_t written = remaining - mz_stream_ptr->avail_in;
188
0
    uncompressed_data += written;
189
0
    remaining = mz_stream_ptr->avail_in;
190
0
  }
191
0
}
192
193
0
void MiniZStreamWrapper::FlushStream() {
194
0
  auto &sd = file->stream_data;
195
0
  mz_stream_ptr->next_in = nullptr;
196
0
  mz_stream_ptr->avail_in = 0;
197
0
  while (true) {
198
0
    auto output_remaining = (sd.out_buff.get() + sd.out_buf_size) - sd.out_buff_start;
199
0
    mz_stream_ptr->next_out = sd.out_buff_start;
200
0
    mz_stream_ptr->avail_out = output_remaining;
201
202
0
    auto res = mz_deflate(mz_stream_ptr, duckdb_miniz::MZ_FINISH);
203
0
    sd.out_buff_start += (output_remaining - mz_stream_ptr->avail_out);
204
0
    if (sd.out_buff_start > sd.out_buff.get()) {
205
0
      file->child_handle->Write(sd.out_buff.get(), sd.out_buff_start - sd.out_buff.get());
206
0
      sd.out_buff_start = sd.out_buff.get();
207
0
    }
208
0
    if (res == duckdb_miniz::MZ_STREAM_END) {
209
0
      break;
210
0
    }
211
0
    if (res != duckdb_miniz::MZ_OK) {
212
0
      throw InternalException("Failed to compress GZIP block");
213
0
    }
214
0
  }
215
0
}
216
217
0
void MiniZStreamWrapper::Close() {
218
0
  if (!mz_stream_ptr) {
219
0
    return;
220
0
  }
221
0
  if (writing) {
222
    // flush anything remaining in the stream
223
0
    FlushStream();
224
225
    // write the footer
226
0
    unsigned char gzip_footer[MiniZStream::GZIP_FOOTER_SIZE];
227
0
    MiniZStream::InitializeGZIPFooter(gzip_footer, crc, total_size);
228
0
    file->child_handle->Write(gzip_footer, MiniZStream::GZIP_FOOTER_SIZE);
229
230
0
    duckdb_miniz::mz_deflateEnd(mz_stream_ptr);
231
0
  } else {
232
0
    duckdb_miniz::mz_inflateEnd(mz_stream_ptr);
233
0
  }
234
0
  delete mz_stream_ptr;
235
0
  mz_stream_ptr = nullptr;
236
0
  file = nullptr;
237
0
}
238
239
class GZipFile : public CompressedFile {
240
public:
241
  GZipFile(unique_ptr<FileHandle> child_handle_p, const string &path, bool write)
242
0
      : CompressedFile(gzip_fs, move(child_handle_p), path) {
243
0
    Initialize(write);
244
0
  }
245
246
  GZipFileSystem gzip_fs;
247
};
248
249
0
void GZipFileSystem::VerifyGZIPHeader(uint8_t gzip_hdr[], idx_t read_count) {
250
  // check for incorrectly formatted files
251
0
  if (read_count != GZIP_HEADER_MINSIZE) {
252
0
    throw IOException("Input is not a GZIP stream");
253
0
  }
254
0
  if (gzip_hdr[0] != 0x1F || gzip_hdr[1] != 0x8B) { // magic header
255
0
    throw IOException("Input is not a GZIP stream");
256
0
  }
257
0
  if (gzip_hdr[2] != GZIP_COMPRESSION_DEFLATE) { // compression method
258
0
    throw IOException("Unsupported GZIP compression method");
259
0
  }
260
0
  if (gzip_hdr[3] & GZIP_FLAG_UNSUPPORTED) {
261
0
    throw IOException("Unsupported GZIP archive");
262
0
  }
263
0
}
264
265
0
// Decompresses a complete gzip stream held in memory and returns the payload.
//
// `in` must start with a gzip header; the optional original-file-name field is
// skipped when its flag is set. Throws IOException when the input is too short,
// the header is rejected by VerifyGZIPHeader, inflating fails, or the result is
// empty; throws InternalException when miniz cannot be initialized.
string GZipFileSystem::UncompressGZIPString(const string &in) {
  // check for incorrectly formatted files

  // TODO this is mostly the same as gzip_file_system.cpp
  if (in.size() < GZIP_HEADER_MINSIZE) {
    throw IOException("Input is not a GZIP stream");
  }
  auto body_ptr = in.data();
  const auto end_ptr = in.data() + in.size();

  uint8_t gzip_hdr[GZIP_HEADER_MINSIZE];
  memcpy(gzip_hdr, body_ptr, GZIP_HEADER_MINSIZE);
  body_ptr += GZIP_HEADER_MINSIZE;
  GZipFileSystem::VerifyGZIPHeader(gzip_hdr, GZIP_HEADER_MINSIZE);

  if (gzip_hdr[3] & GZIP_FLAG_NAME) {
    // Skip the zero-terminated file name (terminator included) without ever
    // stepping past the end of the input, so the remaining-byte count computed
    // below cannot underflow.
    while (body_ptr < end_ptr) {
      const char c = *body_ptr;
      body_ptr++;
      if (c == '\0') {
        break;
      }
    }
  }

  // stream is now set to beginning of payload data
  // The stream lives on the stack: nothing to free besides the state that
  // mz_inflateEnd releases.
  duckdb_miniz::mz_stream stream;
  memset(&stream, 0, sizeof(duckdb_miniz::mz_stream));
  auto status = duckdb_miniz::mz_inflateInit2(&stream, -MZ_DEFAULT_WINDOW_BITS);
  if (status != duckdb_miniz::MZ_OK) {
    throw InternalException("Failed to initialize miniz");
  }

  auto bytes_remaining = (idx_t)(end_ptr - body_ptr);
  stream.next_in = (unsigned char *)body_ptr;
  stream.avail_in = bytes_remaining;

  unsigned char decompress_buffer[BUFSIZ];
  string decompressed;

  while (status == duckdb_miniz::MZ_OK) {
    stream.next_out = decompress_buffer;
    stream.avail_out = sizeof(decompress_buffer);
    status = duckdb_miniz::mz_inflate(&stream, duckdb_miniz::MZ_NO_FLUSH);
    if (status != duckdb_miniz::MZ_STREAM_END && status != duckdb_miniz::MZ_OK) {
      // release the inflate state before reporting the error
      duckdb_miniz::mz_inflateEnd(&stream);
      throw IOException("Failed to uncompress");
    }
    // append exactly what this call produced
    decompressed.append((char *)decompress_buffer, sizeof(decompress_buffer) - stream.avail_out);
  }
  duckdb_miniz::mz_inflateEnd(&stream);
  if (decompressed.empty()) {
    throw IOException("Failed to uncompress");
  }
  return decompressed;
}
320
321
0
// Wraps `handle` in a GZipFile opened for writing (write == true) or reading.
// The path is copied from the handle before ownership of the handle is moved.
unique_ptr<FileHandle> GZipFileSystem::OpenCompressedFile(unique_ptr<FileHandle> handle, bool write) {
  // copy first: `handle` is moved from in the call below
  auto file_path = handle->path;
  auto gzip_file = make_unique<GZipFile>(move(handle), file_path, write);
  return gzip_file;
}
325
326
0
// Creates a new, not yet initialized miniz-backed stream wrapper.
unique_ptr<StreamWrapper> GZipFileSystem::CreateStream() {
  auto wrapper = make_unique<MiniZStreamWrapper>();
  return wrapper;
}
329
330
0
// Returns the input buffer size reported by this file system (BUFFER_SIZE).
idx_t GZipFileSystem::InBufferSize() {
  const idx_t in_buffer_size = BUFFER_SIZE;
  return in_buffer_size;
}
333
334
0
// Returns the output buffer size reported by this file system (BUFFER_SIZE).
idx_t GZipFileSystem::OutBufferSize() {
  const idx_t out_buffer_size = BUFFER_SIZE;
  return out_buffer_size;
}
337
338
} // namespace duckdb