1"""Utilities for fast persistence of big data, with optional compression."""
2
3# Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org>
4# Copyright (c) 2009 Gael Varoquaux
5# License: BSD Style, 3 clauses.
6
7import pickle
8import os
9import warnings
10import io
11from pathlib import Path
12
13from .compressor import lz4, LZ4_NOT_INSTALLED_ERROR
14from .compressor import _COMPRESSORS, register_compressor, BinaryZlibFile
15from .compressor import (ZlibCompressorWrapper, GzipCompressorWrapper,
16 BZ2CompressorWrapper, LZMACompressorWrapper,
17 XZCompressorWrapper, LZ4CompressorWrapper)
from .numpy_pickle_utils import Unpickler, Pickler
from .numpy_pickle_utils import _read_fileobject, _write_fileobject
from .numpy_pickle_utils import _read_bytes, BUFFER_SIZE
from .numpy_pickle_utils import _ensure_native_byte_order
from .numpy_pickle_compat import load_compatibility
from .numpy_pickle_compat import NDArrayWrapper
# For compatibility with old versions of joblib, we need ZNDArrayWrapper
# to be visible in the current namespace.
# Explicitly skipping the next line from flake8 as it triggers an F401
# warning which we don't care about.
from .numpy_pickle_compat import ZNDArrayWrapper  # noqa
from .backports import make_memmap

# Register supported compressors
register_compressor('zlib', ZlibCompressorWrapper())
register_compressor('gzip', GzipCompressorWrapper())
register_compressor('bz2', BZ2CompressorWrapper())
register_compressor('lzma', LZMACompressorWrapper())
register_compressor('xz', XZCompressorWrapper())
register_compressor('lz4', LZ4CompressorWrapper())
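
# Additional compressors can be registered the same way. A minimal sketch,
# left as a comment to avoid registering a codec that does not exist;
# ``MyCodecFile`` is a hypothetical file-object class implementing the same
# interface as BinaryZlibFile:
#
#     from .compressor import CompressorWrapper
#     register_compressor('mycodec', CompressorWrapper(
#         obj=MyCodecFile, prefix=b'MYCODEC', extension='.myc'))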


###############################################################################
# Utility objects for persistence.

# For convenience, 16 bytes are used to be sure to cover all the possible
# dtypes' alignments. For reference, see:
# https://numpy.org/devdocs/dev/alignment.html
NUMPY_ARRAY_ALIGNMENT_BYTES = 16


class NumpyArrayWrapper(object):
    """An object to be persisted instead of numpy arrays.

    This object is used to hack into the pickle machinery and read numpy
    array data from our custom persistence format.
    More precisely, this object is used for:
    * carrying the information of the persisted array: subclass, shape,
      order, dtype. Those ndarray metadata are used to correctly reconstruct
      the array with low level numpy functions.
    * determining if memmap is allowed on the array.
    * reading the array bytes from a file.
    * reading the array using memory map from a file.
    * writing the array bytes to a file.

    Attributes
    ----------
    subclass: numpy.ndarray subclass
        Determine the subclass of the wrapped array.
    shape: numpy.ndarray shape
        Determine the shape of the wrapped array.
    order: {'C', 'F'}
        Determine the order of wrapped array data. 'C' is for C order, 'F' is
        for Fortran order.
    dtype: numpy.ndarray dtype
        Determine the data type of the wrapped array.
    allow_mmap: bool
        Determine if memory mapping is allowed on the wrapped array.
        Default: False.
    """

    def __init__(self, subclass, shape, order, dtype, allow_mmap=False,
                 numpy_array_alignment_bytes=NUMPY_ARRAY_ALIGNMENT_BYTES):
        """Constructor. Store the useful information for later."""
        self.subclass = subclass
        self.shape = shape
        self.order = order
        self.dtype = dtype
        self.allow_mmap = allow_mmap
        # We make numpy_array_alignment_bytes an instance attribute to allow
        # us to change our mind about the default alignment and still load
        # the old pickles (with the previous alignment) correctly.
        self.numpy_array_alignment_bytes = numpy_array_alignment_bytes

    def safe_get_numpy_array_alignment_bytes(self):
        # NumpyArrayWrapper instances loaded from joblib <= 1.1 pickles don't
        # have a numpy_array_alignment_bytes attribute.
        return getattr(self, 'numpy_array_alignment_bytes', None)

    def write_array(self, array, pickler):
        """Write array bytes to pickler file handle.

        This function is an adaptation of the numpy write_array function
        available in version 1.10.1 in numpy/lib/format.py.
        """
        # Set buffer size to 16 MiB to hide the Python loop overhead.
        buffersize = max(16 * 1024 ** 2 // array.itemsize, 1)
        if array.dtype.hasobject:
            # We contain Python objects so we cannot write out the data
            # directly. Instead, we will pickle it out with version 2 of the
            # pickle protocol.
            pickle.dump(array, pickler.file_handle, protocol=2)
        else:
            numpy_array_alignment_bytes = \
                self.safe_get_numpy_array_alignment_bytes()
            if numpy_array_alignment_bytes is not None:
                current_pos = pickler.file_handle.tell()
                pos_after_padding_byte = current_pos + 1
                padding_length = numpy_array_alignment_bytes - (
                    pos_after_padding_byte % numpy_array_alignment_bytes)
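                # Worked example (illustrative numbers, not from a real
                # pickle): with current_pos == 10 and 16-byte alignment,
                # pos_after_padding_byte == 11 and
                # padding_length == 16 - (11 % 16) == 5, so the array data
                # starts at position 11 + 5 == 16, a multiple of 16.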
                # A single byte is written that contains the padding length
                # in bytes.
                padding_length_byte = int.to_bytes(
                    padding_length, length=1, byteorder='little')
                pickler.file_handle.write(padding_length_byte)

                if padding_length != 0:
                    padding = b'\xff' * padding_length
                    pickler.file_handle.write(padding)

            for chunk in pickler.np.nditer(array,
                                           flags=['external_loop',
                                                  'buffered',
                                                  'zerosize_ok'],
                                           buffersize=buffersize,
                                           order=self.order):
                pickler.file_handle.write(chunk.tobytes('C'))

    def read_array(self, unpickler):
        """Read array from unpickler file handle.

        This function is an adaptation of the numpy read_array function
        available in version 1.10.1 in numpy/lib/format.py.
        """
        if len(self.shape) == 0:
            count = 1
        else:
            # joblib issue #859: we cast the elements of self.shape to int64
            # to prevent a potential overflow when computing their product.
            shape_int64 = [unpickler.np.int64(x) for x in self.shape]
            count = unpickler.np.multiply.reduce(shape_int64)
        # Now read the actual data.
        if self.dtype.hasobject:
            # The array contained Python objects. We need to unpickle the
            # data.
            array = pickle.load(unpickler.file_handle)
        else:
            numpy_array_alignment_bytes = \
                self.safe_get_numpy_array_alignment_bytes()
            if numpy_array_alignment_bytes is not None:
                padding_byte = unpickler.file_handle.read(1)
                padding_length = int.from_bytes(
                    padding_byte, byteorder='little')
                if padding_length != 0:
                    unpickler.file_handle.read(padding_length)

            # This is not a real file. We have to read it the
            # memory-intensive way.
            # The crc32 module fails on reads greater than 2 ** 32 bytes,
            # breaking large reads from gzip streams. Chunk reads to
            # BUFFER_SIZE bytes to avoid the issue and reduce the memory
            # overhead of the read. In the non-chunked case
            # count < max_read_count, so only one read is performed.
            max_read_count = BUFFER_SIZE // min(BUFFER_SIZE,
                                                self.dtype.itemsize)

            array = unpickler.np.empty(count, dtype=self.dtype)
            for i in range(0, count, max_read_count):
                read_count = min(max_read_count, count - i)
                read_size = int(read_count * self.dtype.itemsize)
                data = _read_bytes(unpickler.file_handle,
                                   read_size, "array data")
                array[i:i + read_count] = \
                    unpickler.np.frombuffer(data, dtype=self.dtype,
                                            count=read_count)
                del data

            if self.order == 'F':
                array.shape = self.shape[::-1]
                array = array.transpose()
            else:
                array.shape = self.shape

        # Detect byte order mismatch and swap as needed.
        return _ensure_native_byte_order(array)

    def read_mmap(self, unpickler):
        """Read an array using numpy memmap."""
        current_pos = unpickler.file_handle.tell()
        offset = current_pos
        numpy_array_alignment_bytes = \
            self.safe_get_numpy_array_alignment_bytes()

        if numpy_array_alignment_bytes is not None:
            padding_byte = unpickler.file_handle.read(1)
            padding_length = int.from_bytes(padding_byte, byteorder='little')
            # + 1 is for the padding byte
            offset += padding_length + 1
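            # Mirror of the worked example in write_array (illustrative
            # numbers): with current_pos == 10 and padding_length == 5, the
            # array data starts at offset 10 + 1 + 5 == 16, 16-byte aligned.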

        if unpickler.mmap_mode == 'w+':
            unpickler.mmap_mode = 'r+'

        marray = make_memmap(unpickler.filename,
                             dtype=self.dtype,
                             shape=self.shape,
                             order=self.order,
                             mode=unpickler.mmap_mode,
                             offset=offset)
        # Update the offset so that it corresponds to the end of the read
        # array.
        unpickler.file_handle.seek(offset + marray.nbytes)

        if (numpy_array_alignment_bytes is None and
                current_pos % NUMPY_ARRAY_ALIGNMENT_BYTES != 0):
            message = (
                f'The memmapped array {marray} loaded from the file '
                f'{unpickler.file_handle.name} is not byte aligned. '
                'This may cause segmentation faults if this memmapped array '
                'is used in some libraries like BLAS or PyTorch. '
                'To get rid of this warning, regenerate your pickle file '
                'with joblib >= 1.2.0. '
                'See https://github.com/joblib/joblib/issues/563 '
                'for more details'
            )
            warnings.warn(message)

        return _ensure_native_byte_order(marray)

    def read(self, unpickler):
        """Read the array corresponding to this wrapper.

        Use the unpickler to get all information to correctly read the array.

        Parameters
        ----------
        unpickler: NumpyUnpickler

        Returns
        -------
        array: numpy.ndarray

        """
        # When requested, only use memmap mode if allowed.
        if unpickler.mmap_mode is not None and self.allow_mmap:
            array = self.read_mmap(unpickler)
        else:
            array = self.read_array(unpickler)

        # Manage the array subclass case.
        if (hasattr(array, '__array_prepare__') and
                self.subclass not in (unpickler.np.ndarray,
                                      unpickler.np.memmap)):
            # We need to reconstruct another subclass.
            new_array = unpickler.np.core.multiarray._reconstruct(
                self.subclass, (0,), 'b')
            return new_array.__array_prepare__(array)
        else:
            return array

###############################################################################
# Pickler classes


class NumpyPickler(Pickler):
    """A pickler to persist big data efficiently.

    The main features of this object are:
    * persistence of numpy arrays in a single file.
    * optional compression, with special care taken to avoid memory copies.

    Attributes
    ----------
    fp: file
        File object handle used for serializing the input object.
    protocol: int, optional
        Pickle protocol used. Default is pickle.DEFAULT_PROTOCOL.
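
    Examples
    --------
    A minimal sketch of direct use; ``joblib.dump`` is the usual public
    entry point, and the in-memory buffer below is purely illustrative:

    >>> import io
    >>> import numpy as np  # doctest: +SKIP
    >>> buf = io.BytesIO()
    >>> NumpyPickler(buf).dump(np.arange(10))  # doctest: +SKIP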
282 """

    dispatch = Pickler.dispatch.copy()

    def __init__(self, fp, protocol=None):
        self.file_handle = fp
        self.buffered = isinstance(self.file_handle, BinaryZlibFile)

        # By default we want a pickle protocol that only changes with
        # the major python version and not the minor one
        if protocol is None:
            protocol = pickle.DEFAULT_PROTOCOL

        Pickler.__init__(self, self.file_handle, protocol=protocol)
        # delayed import of numpy, to avoid tight coupling
        try:
            import numpy as np
        except ImportError:
            np = None
        self.np = np

    def _create_array_wrapper(self, array):
        """Create and return a numpy array wrapper from a numpy array."""
        order = 'F' if (array.flags.f_contiguous and
                        not array.flags.c_contiguous) else 'C'
        allow_mmap = not self.buffered and not array.dtype.hasobject

        kwargs = {}
        try:
            self.file_handle.tell()
        except io.UnsupportedOperation:
            kwargs = {'numpy_array_alignment_bytes': None}

        wrapper = NumpyArrayWrapper(type(array),
                                    array.shape, order, array.dtype,
                                    allow_mmap=allow_mmap,
                                    **kwargs)

        return wrapper

    def save(self, obj):
        """Subclass the Pickler `save` method.

        This is a total abuse of the Pickler class in order to use the numpy
        persistence function `save` instead of the default pickle
        implementation. The numpy array is replaced by a custom wrapper in the
        pickle persistence stack and the serialized array is written right
        after in the file. Warning: the file produced does not follow the
        pickle format. As such it cannot be read with `pickle.load`.
        """
        if self.np is not None and type(obj) in (self.np.ndarray,
                                                 self.np.matrix,
                                                 self.np.memmap):
            if type(obj) is self.np.memmap:
                # Pickling doesn't work with memmapped arrays
                obj = self.np.asanyarray(obj)

            # The array wrapper is pickled instead of the real array.
            wrapper = self._create_array_wrapper(obj)
            Pickler.save(self, wrapper)

            # A framer was introduced with pickle protocol 4 and we want to
            # ensure the wrapper object is written before the numpy array
            # buffer in the pickle file.
            # See https://www.python.org/dev/peps/pep-3154/#framing to get
            # more information on the framer behavior.
            if self.proto >= 4:
                self.framer.commit_frame(force=True)

            # And then array bytes are written right after the wrapper.
            wrapper.write_array(obj, self)
            return

        return Pickler.save(self, obj)


class NumpyUnpickler(Unpickler):
    """A subclass of the Unpickler to unpickle our numpy pickles.

    Attributes
    ----------
    mmap_mode: str
        The memory map mode to use for reading numpy arrays.
    file_handle: file_like
        File object to unpickle from.
    filename: str
        Name of the file to unpickle from. It should correspond to
        file_handle. This parameter is required when using mmap_mode.
    np: module
        Reference to numpy module if numpy is installed else None.

    """

    dispatch = Unpickler.dispatch.copy()

    def __init__(self, filename, file_handle, mmap_mode=None):
        # The next line is for backward compatibility with pickles generated
        # with joblib versions less than 0.10.
        self._dirname = os.path.dirname(filename)

        self.mmap_mode = mmap_mode
        self.file_handle = file_handle
        # filename is required for numpy mmap mode.
        self.filename = filename
        self.compat_mode = False
        Unpickler.__init__(self, self.file_handle)
        try:
            import numpy as np
        except ImportError:
            np = None
        self.np = np

    def load_build(self):
        """Called to set the state of a newly created object.

        We capture it to replace our place-holder objects, NDArrayWrapper or
        NumpyArrayWrapper, by the array we are interested in. We replace them
        directly in the stack of the pickler.
        NDArrayWrapper is used for backward compatibility with joblib <= 0.9.
        """
        Unpickler.load_build(self)

        # For backward compatibility, we support NDArrayWrapper objects.
        if isinstance(self.stack[-1], (NDArrayWrapper, NumpyArrayWrapper)):
            if self.np is None:
                raise ImportError("Trying to unpickle an ndarray, "
                                  "but numpy didn't import correctly")
            array_wrapper = self.stack.pop()
            # If any NDArrayWrapper is found, we switch to compatibility
            # mode; this will be used to raise a DeprecationWarning to the
            # user at the end of the unpickling.
            if isinstance(array_wrapper, NDArrayWrapper):
                self.compat_mode = True
            self.stack.append(array_wrapper.read(self))

    # Be careful to register our new method.
    dispatch[pickle.BUILD[0]] = load_build


###############################################################################
# Utility functions

def dump(value, filename, compress=0, protocol=None, cache_size=None):
    """Persist an arbitrary Python object into one file.

    Read more in the :ref:`User Guide <persistence>`.

    Parameters
    ----------
    value: any Python object
        The object to store to disk.
    filename: str, pathlib.Path, or file object.
        The file object or path of the file in which it is to be stored.
        The compression method corresponding to one of the supported filename
        extensions ('.z', '.gz', '.bz2', '.xz' or '.lzma') will be used
        automatically.
    compress: int from 0 to 9 or bool or 2-tuple, optional
        Optional compression level for the data. 0 or False is no
        compression. Higher values mean more compression, but also slower
        read and write times. Using a value of 3 is often a good compromise.
        See the notes for more details.
        If compress is True, the compression level used is 3.
        If compress is a 2-tuple, the first element must be a string among
        the supported compressors (e.g. 'zlib', 'gzip', 'bz2', 'lzma',
        'xz'), and the second element must be an integer from 0 to 9,
        corresponding to the compression level.
    protocol: int, optional
        Pickle protocol, see pickle.dump documentation for more details.
    cache_size: positive int, optional
        This option is deprecated in 0.10 and has no effect.

    Returns
    -------
    filenames: list of strings
        The list of file names in which the data is stored. Since joblib
        0.10, the data is stored in a single file, so the list contains a
        single element.

    See Also
    --------
    joblib.load : corresponding loader

    Notes
    -----
    Memmapping on load cannot be used for compressed files. Thus
    using compression can significantly slow down loading. In
    addition, compressed files take up extra memory during
    dump and load.
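
    Examples
    --------
    A minimal sketch; the paths below are purely illustrative:

    >>> from joblib import dump
    >>> data = {'weights': [0.1, 0.2, 0.7]}
    >>> dump(data, '/tmp/data.joblib')  # doctest: +SKIP
    ['/tmp/data.joblib']
    >>> dump(data, '/tmp/data.joblib.gz', compress=('gzip', 3))  # doctest: +SKIP
    ['/tmp/data.joblib.gz']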

    """

    if isinstance(filename, Path):
        filename = str(filename)

    is_filename = isinstance(filename, str)
    is_fileobj = hasattr(filename, "write")

    compress_method = 'zlib'  # zlib is the default compression method.
    if compress is True:
        # By default, if compress is enabled, we want the default compress
        # level of the compressor.
        compress_level = None
    elif isinstance(compress, tuple):
        # A 2-tuple was set in compress.
        if len(compress) != 2:
            raise ValueError(
                'Compress argument tuple should contain exactly 2 elements: '
                '(compress method, compress level), you passed {}'
                .format(compress))
        compress_method, compress_level = compress
    elif isinstance(compress, str):
        compress_method = compress
        compress_level = None  # Use default compress level
        compress = (compress_method, compress_level)
    else:
        compress_level = compress

    if compress_method == 'lz4' and lz4 is None:
        raise ValueError(LZ4_NOT_INSTALLED_ERROR)

    if (compress_level is not None and
            compress_level is not False and
            compress_level not in range(10)):
        # Raise an error if an invalid compress level is given.
        raise ValueError(
            'Non valid compress level given: "{}". Possible values are '
            '{}.'.format(compress_level, list(range(10))))

    if compress_method not in _COMPRESSORS:
        # Raise an error if an unsupported compression method is given.
        raise ValueError(
            'Non valid compression method given: "{}". Possible values are '
            '{}.'.format(compress_method, _COMPRESSORS))

    if not is_filename and not is_fileobj:
        # People keep inverting arguments, and the resulting error is
        # incomprehensible.
        raise ValueError(
            'Second argument should be a filename or a file-like object, '
            '%s (type %s) was given.'
            % (filename, type(filename))
        )

    if is_filename and not isinstance(compress, tuple):
        # In case no explicit compression was requested using both a
        # compression method and level in a tuple and the filename has an
        # explicit extension, we select the corresponding compressor.

        # Unset the variable to be sure no compression level is set
        # afterwards.
        compress_method = None
        for name, compressor in _COMPRESSORS.items():
            if filename.endswith(compressor.extension):
                compress_method = name

        if compress_method in _COMPRESSORS and compress_level == 0:
            # We choose the default compress_level in case it was not given
            # as an argument (using compress).
            compress_level = None

    if cache_size is not None:
        # Cache size is deprecated starting from version 0.10.
        warnings.warn("Please do not set 'cache_size' in joblib.dump, "
                      "this parameter has no effect and will be removed. "
                      "You used 'cache_size={}'".format(cache_size),
                      DeprecationWarning, stacklevel=2)

    if compress_level != 0:
        with _write_fileobject(filename, compress=(compress_method,
                                                   compress_level)) as f:
            NumpyPickler(f, protocol=protocol).dump(value)
    elif is_filename:
        with open(filename, 'wb') as f:
            NumpyPickler(f, protocol=protocol).dump(value)
    else:
        NumpyPickler(filename, protocol=protocol).dump(value)

    # If the target container is a file object, nothing is returned.
    if is_fileobj:
        return

    # For compatibility, the list of created file names (e.g. with one
    # element after 0.10.0) is returned by default.
    return [filename]


def _unpickle(fobj, filename="", mmap_mode=None):
    """Internal unpickling function."""
    # We are careful to open the file handle early and keep it open to
    # avoid race-conditions on renames.
    # That said, if data is stored in companion files, which can be
    # the case with the old persistence format, moving the directory
    # will create a race when joblib tries to access the companion
    # files.
    unpickler = NumpyUnpickler(filename, fobj, mmap_mode=mmap_mode)
    obj = None
    try:
        obj = unpickler.load()
        if unpickler.compat_mode:
            warnings.warn("The file '%s' has been generated with a "
                          "joblib version less than 0.10. "
                          "Please regenerate this pickle file."
                          % filename,
                          DeprecationWarning, stacklevel=3)
    except UnicodeDecodeError as exc:
        # More user-friendly error message
        new_exc = ValueError(
            'You may be trying to read with '
            'python 3 a joblib pickle generated with python 2. '
            'This feature is not supported by joblib.')
        new_exc.__cause__ = exc
        raise new_exc
    return obj


def load_temporary_memmap(filename, mmap_mode, unlink_on_gc_collect):
    """Load a memory-mapped pickle and track it for optional cleanup."""
    from ._memmapping_reducer import JOBLIB_MMAPS, add_maybe_unlink_finalizer
    obj = load(filename, mmap_mode)
    JOBLIB_MMAPS.add(obj.filename)
    if unlink_on_gc_collect:
        add_maybe_unlink_finalizer(obj)
    return obj


def load(filename, mmap_mode=None):
    """Reconstruct a Python object from a file persisted with joblib.dump.

    Read more in the :ref:`User Guide <persistence>`.

    WARNING: joblib.load relies on the pickle module and can therefore
    execute arbitrary Python code. It should therefore never be used
    to load files from untrusted sources.

    Parameters
    ----------
    filename: str, pathlib.Path, or file object.
        The file object or path of the file from which to load the object.
    mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional
        If not None, the arrays are memory-mapped from the disk. This
        mode has no effect for compressed files. Note that in this
        case the reconstructed object might no longer match exactly
        the originally pickled object.

    Returns
    -------
    result: any Python object
        The object stored in the file.

    See Also
    --------
    joblib.dump : function to save an object

    Notes
    -----

    This function can load numpy array files saved separately during the
    dump. If the mmap_mode argument is given, it is passed to np.load and
    arrays are loaded as memmaps. As a consequence, the reconstructed
    object might not match the original pickled object. Note that if the
    file was saved with compression, the arrays cannot be memmapped.
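
    Examples
    --------
    A minimal sketch; the path below is purely illustrative:

    >>> from joblib import dump, load
    >>> dump([1, 2, 3], '/tmp/data.joblib')  # doctest: +SKIP
    ['/tmp/data.joblib']
    >>> load('/tmp/data.joblib')  # doctest: +SKIP
    [1, 2, 3]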
640 """
641 if Path is not None and isinstance(filename, Path):
642 filename = str(filename)
643
644 if hasattr(filename, "read"):
645 fobj = filename
646 filename = getattr(fobj, 'name', '')
647 with _read_fileobject(fobj, filename, mmap_mode) as fobj:
648 obj = _unpickle(fobj)
649 else:
650 with open(filename, 'rb') as f:
651 with _read_fileobject(f, filename, mmap_mode) as fobj:
652 if isinstance(fobj, str):
653 # if the returned file object is a string, this means we
654 # try to load a pickle file generated with an version of
655 # Joblib so we load it with joblib compatibility function.
656 return load_compatibility(fobj)
657
658 obj = _unpickle(fobj, filename, mmap_mode)
659 return obj