1import io
2try:
3 from os import PathLike
4except ImportError:
5 # For Python 3.5
6 class PathLike:
7 pass
8
9from pyzstd import ZstdCompressor, _ZstdFileReader, \
10 _ZstdFileWriter, _ZSTD_DStreamSizes
11
12__all__ = ('ZstdFile', 'open')
13
class _ZstdDecompressReader(io.RawIOBase):
    """Expose a _ZstdFileReader through the io.RawIOBase reader API.

    Instances are meant to be wrapped by io.BufferedReader. All the
    decompression work is delegated to the _ZstdFileReader object stored
    in ._decomp; this class only translates the RawIOBase calls.
    """

    def __init__(self, fp, zstd_dict, option, read_size):
        """Create a reader on top of the file object `fp`.

        The zstd_dict, option and read_size arguments are passed through
        to _ZstdFileReader unchanged.
        """
        self._fp = fp
        self._decomp = _ZstdFileReader(fp, zstd_dict, option, read_size)

    def close(self):
        """Drop the decompressor, then close this raw stream."""
        self._decomp = None
        return super().close()

    def readable(self):
        """This stream is always readable."""
        return True

    def seekable(self):
        """Return whether the underlying file object is seekable."""
        # Not every file-like object implements .seekable(), so it is
        # only queried on demand instead of being cached in .__init__().
        return self._fp.seekable()

    def tell(self):
        """Return the current position in the decompressed stream."""
        return self._decomp.pos

    def readinto(self, b):
        """Decompress into the writable buffer `b`."""
        return self._decomp.readinto(b)

    def readall(self):
        """Decompress and return everything up to the end of the data."""
        return self._decomp.readall()

    def seek(self, offset, whence=0):
        """Move to a new position in the decompressed stream.

        whence follows the usual convention: 0 (start), 1 (current
        position) or 2 (end). Any other value raises ValueError.
        Returns the new position.

        Seeking is emulated: moving backward restarts decompression from
        the beginning of the underlying file, and moving forward
        decompresses and discards the skipped data.

        io.BufferedReader may not call this method at all when the new
        position lies inside its own buffer.
        """
        decomp = self._decomp

        # Step 1: translate (offset, whence) to an absolute position.
        if whence == io.SEEK_CUR:
            target = decomp.pos + offset
        elif whence == io.SEEK_END:
            if decomp.size < 0:
                # The decompressed size isn't known yet, .forward(None)
                # is used to obtain it.
                decomp.forward(None)
            target = decomp.size + offset
        elif whence == io.SEEK_SET:
            target = offset
        else:
            raise ValueError("Invalid whence value: {}".format(whence))

        # Step 2: compute how many bytes have to be skipped forward.
        if target < decomp.pos:
            # Backward seek: rewind everything to the very beginning,
            # then skip forward up to the target.
            decomp.eof = False
            decomp.pos = 0
            decomp.reset_session()
            self._fp.seek(0)
            distance = target
        else:
            distance = target - decomp.pos

        # .forward() does nothing when distance <= 0.
        decomp.forward(distance)

        return decomp.pos
70
# Second item of pyzstd's _ZSTD_DStreamSizes. ZstdFile uses it as the
# io.BufferedReader buffer size and as the default size of .read1().
_ZSTD_DStreamOutSize = _ZSTD_DStreamSizes[1]

# Values of ZstdFile._mode: closed, open for reading, open for writing.
_MODE_CLOSED, _MODE_READ, _MODE_WRITE = 0, 1, 2
76
class ZstdFile(io.BufferedIOBase):
    """A file object providing transparent zstd (de)compression.

    A ZstdFile can act as a wrapper for an existing file object, or refer
    directly to a named file on disk.

    Note that ZstdFile provides a *binary* file interface - data read is
    returned as bytes, and data to be written should be an object that
    supports the Buffer Protocol.
    """
    # Flush modes accepted by .flush(), re-exported from ZstdCompressor.
    FLUSH_BLOCK = ZstdCompressor.FLUSH_BLOCK
    FLUSH_FRAME = ZstdCompressor.FLUSH_FRAME

    # Raw reader class instantiated in read mode. Kept as a class
    # attribute, presumably so that subclasses can substitute their own
    # reader.
    _READER_CLASS = _ZstdDecompressReader

    def __init__(self, filename, mode="r", *,
                 level_or_option=None, zstd_dict=None,
                 read_size=131075, write_size=131591):
        """Open a zstd compressed file in binary mode.

        filename can be either an actual file name (given as a str, bytes, or
        PathLike object), in which case the named file is opened, or it can be
        an existing file object to read from or write to.

        mode can be "r" for reading (default), "w" for (over)writing, "x" for
        creating exclusively, or "a" for appending. These can equivalently be
        given as "rb", "wb", "xb" and "ab" respectively.

        Parameters
        level_or_option: When it's an int object, it represents compression
            level. When it's a dict object, it contains advanced compression
            parameters. Note, in read mode (decompression), it can only be a
            dict object, that represents decompression option. It doesn't
            support int type compression level in this case.
        zstd_dict: A ZstdDict object, pre-trained dictionary for compression /
            decompression.
        read_size: In reading mode, this is bytes number that read from the
            underlying file object each time, default value is zstd's
            recommended value. If use with Network File System, increasing
            it may get better performance.
        write_size: In writing modes, this is output buffer's size, default
            value is zstd's recommended value. If use with Network File
            System, increasing it may get better performance.

        Raises
        TypeError: level_or_option has an unsupported type for the mode, or
            filename is neither a path nor a file object.
        ValueError: mode is invalid, or read_size / write_size is given in
            a mode where it doesn't apply.
        """
        # Initialize these attributes first, .close() (also invoked by
        # IOBase's cleanup code) relies on them.
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED

        # Read or write mode
        if mode in ("r", "rb"):
            if not isinstance(level_or_option, (type(None), dict)):
                raise TypeError(
                    ("In read mode (decompression), level_or_option argument "
                     "should be a dict object, that represents decompression "
                     "option. It doesn't support int type compression level "
                     "in this case."))
            # A non-default value means the caller passed the argument.
            if write_size != 131591:
                raise ValueError(
                    "write_size argument is only valid in write modes.")
            mode_code = _MODE_READ
        elif mode in ("w", "wb", "a", "ab", "x", "xb"):
            if not isinstance(level_or_option, (type(None), int, dict)):
                raise TypeError(("level_or_option argument "
                                 "should be int or dict object."))
            # A non-default value means the caller passed the argument.
            if read_size != 131075:
                raise ValueError(
                    "read_size argument is only valid in read mode.")
            mode_code = _MODE_WRITE
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        # File object
        if isinstance(filename, (str, bytes, PathLike)):
            # The underlying file is always opened in binary mode.
            if "b" not in mode:
                mode += "b"
            self._fp = io.open(filename, mode)
            # This object opened the file, so it's responsible for
            # closing it.
            self._closefp = True
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            self._fp = filename
        else:
            raise TypeError(("filename must be a str, bytes, "
                             "file or PathLike object"))

        # Set ._mode here for ._closefp in .close(). If the following code
        # fails, IOBase's cleanup code will call .close(), so that ._fp can
        # be closed.
        self._mode = mode_code

        # Reader or writer
        if mode_code == _MODE_READ:
            raw = self._READER_CLASS(
                self._fp,
                zstd_dict=zstd_dict,
                option=level_or_option,
                read_size=read_size)
            self._buffer = io.BufferedReader(raw, _ZSTD_DStreamOutSize)
        elif mode_code == _MODE_WRITE:
            # Number of uncompressed bytes written so far, for .tell()
            self._pos = 0
            self._writer = _ZstdFileWriter(
                self._fp,
                level_or_option=level_or_option,
                zstd_dict=zstd_dict,
                write_size=write_size)

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        if self._mode == _MODE_CLOSED:
            return

        try:
            # In .__init__ method, if fails after setting ._mode attribute,
            # these attributes don't exist.
            if hasattr(self, "_buffer"):
                try:
                    self._buffer.close()
                finally:
                    # Set to None for ._check_mode()
                    self._buffer = None
            elif hasattr(self, "_writer"):
                try:
                    # End the current zstd frame
                    self.flush(self.FLUSH_FRAME)
                finally:
                    # Set to None for ._check_mode()
                    self._writer = None
        finally:
            # Whatever happened above, release the underlying file object
            # and mark this object as closed.
            try:
                if self._closefp:
                    self._fp.close()
            finally:
                self._fp = None
                self._closefp = False
                self._mode = _MODE_CLOSED

    # None argument means the file should be closed
    def _check_mode(self, expected_mode=None):
        """Raise the exception that explains a failed operation.

        This method never returns normally:
        - If the file is closed, raise ValueError.
        - If expected_mode is _MODE_READ or _MODE_WRITE and the file is
          not in that mode, raise io.UnsupportedOperation.
        - Otherwise re-raise the exception currently being handled, so
          this method must be called from an `except` block, unless the
          file is known to be closed.
        """
        # If closed, raise ValueError.
        if self._mode == _MODE_CLOSED:
            raise ValueError("I/O operation on closed file")

        # Check _MODE_READ/_MODE_WRITE mode
        if expected_mode == _MODE_READ:
            if self._mode != _MODE_READ:
                raise io.UnsupportedOperation("File not open for reading")
        elif expected_mode == _MODE_WRITE:
            if self._mode != _MODE_WRITE:
                raise io.UnsupportedOperation("File not open for writing")

        # Re-raise other AttributeError exception
        raise

    # If modify this method, also modify SeekableZstdFile.write() method.
    def write(self, data):
        """Write a bytes-like object to the file.

        Returns the number of uncompressed bytes written, which is
        always the length of data in bytes. Note that due to buffering,
        the file on disk may not reflect the data written until .flush()
        or .close() is called.
        """
        # Compress & write
        try:
            input_size, _ = self._writer.write(data)
        except AttributeError:
            # ._writer is missing (read mode) or None (closed),
            # ._check_mode() always raises here.
            self._check_mode(_MODE_WRITE)

        self._pos += input_size
        return input_size

    # If modify this method, also modify SeekableZstdFile.flush() method.
    def flush(self, mode=FLUSH_BLOCK):
        """Flush remaining data to the underlying stream.

        The mode argument can be ZstdFile.FLUSH_BLOCK, ZstdFile.FLUSH_FRAME.
        Abuse of this method will reduce compression ratio, use it only when
        necessary.

        If the program is interrupted afterwards, all data can be recovered.
        To ensure saving to disk, also need to use os.fsync(fd).

        This method does nothing in reading mode.
        """
        if self._mode != _MODE_WRITE:
            # Like IOBase.flush(), do nothing in reading mode.
            # TextIOWrapper.close() relies on this behavior.
            if self._mode == _MODE_READ:
                return
            # Closed, raise ValueError.
            self._check_mode()

        # Flush zstd block/frame, and write.
        self._writer.flush(mode)

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b"" if the file is already at EOF.
        """
        if size is None:
            size = -1
        try:
            return self._buffer.read(size)
        except AttributeError:
            self._check_mode(_MODE_READ)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes, while trying to avoid
        making multiple reads from the underlying stream. Reads up to a
        buffer's worth of data if size is negative.

        Returns b"" if the file is at EOF.
        """
        if size < 0:
            size = _ZSTD_DStreamOutSize

        try:
            return self._buffer.read1(size)
        except AttributeError:
            self._check_mode(_MODE_READ)

    def readinto(self, b):
        """Read bytes into b.

        Returns the number of bytes read (0 for EOF).
        """
        try:
            return self._buffer.readinto(b)
        except AttributeError:
            self._check_mode(_MODE_READ)

    def readinto1(self, b):
        """Read bytes into b, while trying to avoid making multiple reads
        from the underlying stream.

        Returns the number of bytes read (0 for EOF).
        """
        try:
            return self._buffer.readinto1(b)
        except AttributeError:
            self._check_mode(_MODE_READ)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.
        """
        if size is None:
            size = -1
        try:
            return self._buffer.readline(size)
        except AttributeError:
            self._check_mode(_MODE_READ)

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Possible values for whence are:

            0: start of stream (default): offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the arguments,
        this operation may be extremely slow.
        """
        try:
            # BufferedReader.seek() checks seekable
            return self._buffer.seek(offset, whence)
        except AttributeError:
            self._check_mode(_MODE_READ)

    def peek(self, size=-1):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        # Relies on the undocumented fact that BufferedReader.peek() always
        # returns at least one byte (except at EOF)
        try:
            return self._buffer.peek(size)
        except AttributeError:
            self._check_mode(_MODE_READ)

    def __iter__(self):
        """Return self, the lines are produced by .__next__().

        Raises ValueError if the file is closed, io.UnsupportedOperation
        if it is not open for reading.
        """
        try:
            self._buffer
        except AttributeError:
            self._check_mode(_MODE_READ)
        return self

    def __next__(self):
        """Return the next line, raise StopIteration at EOF.

        Raises ValueError if the file is closed, io.UnsupportedOperation
        if it is not open for reading.
        """
        try:
            ret = self._buffer.readline()
        except AttributeError:
            # ._buffer is missing (write mode) or None (closed). Report it
            # like the other read methods do, rather than leaking a raw
            # AttributeError. ._check_mode() always raises here.
            self._check_mode(_MODE_READ)
        if ret:
            return ret
        raise StopIteration

    def tell(self):
        """Return the current file position."""
        if self._mode == _MODE_READ:
            return self._buffer.tell()
        elif self._mode == _MODE_WRITE:
            return self._pos

        # Closed, raise ValueError.
        self._check_mode()

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        try:
            return self._fp.fileno()
        except AttributeError:
            # Closed, raise ValueError.
            self._check_mode()

    @property
    def name(self):
        """Return the file name for the underlying file."""
        try:
            return self._fp.name
        except AttributeError:
            # If closed, raise ValueError. Otherwise the AttributeError
            # is re-raised.
            self._check_mode()

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def writable(self):
        """Return whether the file was opened for writing."""
        if self._mode == _MODE_WRITE:
            return True
        elif self._mode == _MODE_READ:
            return False

        # Closed, raise ValueError.
        self._check_mode()

    def readable(self):
        """Return whether the file was opened for reading."""
        if self._mode == _MODE_READ:
            return True
        elif self._mode == _MODE_WRITE:
            return False

        # Closed, raise ValueError.
        self._check_mode()

    def seekable(self):
        """Return whether the file supports seeking."""
        if self._mode == _MODE_READ:
            return self._buffer.seekable()
        elif self._mode == _MODE_WRITE:
            return False

        # Closed, raise ValueError.
        self._check_mode()
443
444
445# Copied from lzma module
def open(filename, mode="rb", *, level_or_option=None, zstd_dict=None,
         encoding=None, errors=None, newline=None):
    """Open a zstd compressed file in binary or text mode.

    filename can be either an actual file name (given as a str, bytes, or
    PathLike object), in which case the named file is opened, or it can be an
    existing file object to read from or write to.

    The mode parameter can be "r", "rb" (default), "w", "wb", "x", "xb", "a",
    "ab" for binary mode, or "rt", "wt", "xt", "at" for text mode.

    The level_or_option and zstd_dict parameters specify the settings, as for
    ZstdCompressor, ZstdDecompressor and ZstdFile.

    When using read mode (decompression), the level_or_option parameter can
    only be a dict object, that represents decompression option. It doesn't
    support int type compression level in this case.

    For binary mode, this function is equivalent to the ZstdFile constructor:
    ZstdFile(filename, mode, ...). In this case, the encoding, errors and
    newline parameters must not be provided.

    For text mode, a ZstdFile object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error handling
    behavior, and line ending(s).

    Raises ValueError if mode contains both "t" and "b", or if encoding,
    errors or newline is given in binary mode. Exceptions raised by the
    ZstdFile constructor and by io.TextIOWrapper are propagated.
    """
    text_mode = "t" in mode

    if text_mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # ZstdFile only knows binary modes, strip the "t" flag.
    zstd_mode = mode.replace("t", "")
    binary_file = ZstdFile(filename, zstd_mode,
                           level_or_option=level_or_option,
                           zstd_dict=zstd_dict)

    if not text_mode:
        return binary_file

    try:
        return io.TextIOWrapper(binary_file, encoding, errors, newline)
    except BaseException:
        # Wrapping failed (e.g. unknown encoding). Close the ZstdFile
        # right away instead of leaving it open until it is
        # garbage-collected.
        binary_file.close()
        raise