1"""Utilities for fast persistence of big data, with optional compression."""
2
3# Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org>
4# Copyright (c) 2009 Gael Varoquaux
5# License: BSD Style, 3 clauses.
6
7import pickle
8import os
9import warnings
10import io
11from pathlib import Path
12
13from .compressor import lz4, LZ4_NOT_INSTALLED_ERROR
14from .compressor import _COMPRESSORS, register_compressor, BinaryZlibFile
15from .compressor import (ZlibCompressorWrapper, GzipCompressorWrapper,
16 BZ2CompressorWrapper, LZMACompressorWrapper,
17 XZCompressorWrapper, LZ4CompressorWrapper)
from .numpy_pickle_utils import Unpickler, Pickler
from .numpy_pickle_utils import _read_fileobject, _write_fileobject
from .numpy_pickle_utils import _read_bytes, BUFFER_SIZE
from .numpy_pickle_utils import _ensure_native_byte_order
from .numpy_pickle_compat import load_compatibility
from .numpy_pickle_compat import NDArrayWrapper
# For compatibility with old versions of joblib, we need ZNDArrayWrapper
# to be visible in the current namespace.
# Explicitly skipping the next line from flake8 as it triggers an F401
# warning which we don't care about.
from .numpy_pickle_compat import ZNDArrayWrapper  # noqa
from .backports import make_memmap

# Register supported compressors
register_compressor('zlib', ZlibCompressorWrapper())
register_compressor('gzip', GzipCompressorWrapper())
register_compressor('bz2', BZ2CompressorWrapper())
register_compressor('lzma', LZMACompressorWrapper())
register_compressor('xz', XZCompressorWrapper())
register_compressor('lz4', LZ4CompressorWrapper())
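
# Additional compressors can be registered the same way. A minimal sketch,
# left as a comment to avoid registering a codec that does not exist;
# ``MyCodecFile`` is a hypothetical file-object class implementing the same
# interface as BinaryZlibFile:
#
#     from .compressor import CompressorWrapper
#     register_compressor('mycodec', CompressorWrapper(
#         obj=MyCodecFile, prefix=b'MYCODEC', extension='.myc'))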


###############################################################################
# Utility objects for persistence.

# For convenience, 16 bytes are used to be sure to cover all the possible
# dtypes' alignments. For reference, see:
# https://numpy.org/devdocs/dev/alignment.html
NUMPY_ARRAY_ALIGNMENT_BYTES = 16


class NumpyArrayWrapper(object):
    """An object to be persisted instead of numpy arrays.

    This object is used to hack into the pickle machinery and read numpy
    array data from our custom persistence format.
    More precisely, this object is used for:
    * carrying the information of the persisted array: subclass, shape,
      order, dtype. Those ndarray metadata are used to correctly reconstruct
      the array with low level numpy functions.
    * determining if memmap is allowed on the array.
    * reading the array bytes from a file.
    * reading the array using memory map from a file.
    * writing the array bytes to a file.

    Attributes
    ----------
    subclass: numpy.ndarray subclass
        Determine the subclass of the wrapped array.
    shape: numpy.ndarray shape
        Determine the shape of the wrapped array.
    order: {'C', 'F'}
        Determine the order of wrapped array data. 'C' is for C order, 'F' is
        for Fortran order.
    dtype: numpy.ndarray dtype
        Determine the data type of the wrapped array.
    allow_mmap: bool
        Determine if memory mapping is allowed on the wrapped array.
        Default: False.
    """

    def __init__(self, subclass, shape, order, dtype, allow_mmap=False,
                 numpy_array_alignment_bytes=NUMPY_ARRAY_ALIGNMENT_BYTES):
        """Constructor. Store the useful information for later."""
        self.subclass = subclass
        self.shape = shape
        self.order = order
        self.dtype = dtype
        self.allow_mmap = allow_mmap
        # We make numpy_array_alignment_bytes an instance attribute to allow
        # us to change our mind about the default alignment and still load
        # the old pickles (with the previous alignment) correctly.
        self.numpy_array_alignment_bytes = numpy_array_alignment_bytes

    def safe_get_numpy_array_alignment_bytes(self):
        # NumpyArrayWrapper instances loaded from joblib <= 1.1 pickles don't
        # have a numpy_array_alignment_bytes attribute.
        return getattr(self, 'numpy_array_alignment_bytes', None)

    def write_array(self, array, pickler):
        """Write array bytes to pickler file handle.

        This function is an adaptation of the numpy write_array function
        available in version 1.10.1 in numpy/lib/format.py.
        """
        # Set buffer size to 16 MiB to hide the Python loop overhead.
        buffersize = max(16 * 1024 ** 2 // array.itemsize, 1)
        if array.dtype.hasobject:
            # We contain Python objects so we cannot write out the data
            # directly. Instead, we will pickle it out with version 2 of the
            # pickle protocol.
            pickle.dump(array, pickler.file_handle, protocol=2)
        else:
            numpy_array_alignment_bytes = \
                self.safe_get_numpy_array_alignment_bytes()
            if numpy_array_alignment_bytes is not None:
                current_pos = pickler.file_handle.tell()
                pos_after_padding_byte = current_pos + 1
                padding_length = numpy_array_alignment_bytes - (
                    pos_after_padding_byte % numpy_array_alignment_bytes)
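                # Worked example (illustrative numbers, not from a real
                # pickle): with current_pos == 10 and 16-byte alignment,
                # pos_after_padding_byte == 11 and
                # padding_length == 16 - (11 % 16) == 5, so the array data
                # starts at position 11 + 5 == 16, a multiple of 16.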
                # A single byte is written that contains the padding length
                # in bytes.
                padding_length_byte = int.to_bytes(
                    padding_length, length=1, byteorder='little')
                pickler.file_handle.write(padding_length_byte)

                if padding_length != 0:
                    padding = b'\xff' * padding_length
                    pickler.file_handle.write(padding)

            for chunk in pickler.np.nditer(array,
                                           flags=['external_loop',
                                                  'buffered',
                                                  'zerosize_ok'],
                                           buffersize=buffersize,
                                           order=self.order):
                pickler.file_handle.write(chunk.tobytes('C'))

    def read_array(self, unpickler):
        """Read array from unpickler file handle.

        This function is an adaptation of the numpy read_array function
        available in version 1.10.1 in numpy/lib/format.py.
        """
        if len(self.shape) == 0:
            count = 1
        else:
            # joblib issue #859: we cast the elements of self.shape to int64
            # to prevent a potential overflow when computing their product.
            shape_int64 = [unpickler.np.int64(x) for x in self.shape]
            count = unpickler.np.multiply.reduce(shape_int64)
        # Now read the actual data.
        if self.dtype.hasobject:
            # The array contained Python objects. We need to unpickle the
            # data.
            array = pickle.load(unpickler.file_handle)
        else:
            numpy_array_alignment_bytes = \
                self.safe_get_numpy_array_alignment_bytes()
            if numpy_array_alignment_bytes is not None:
                padding_byte = unpickler.file_handle.read(1)
                padding_length = int.from_bytes(
                    padding_byte, byteorder='little')
                if padding_length != 0:
                    unpickler.file_handle.read(padding_length)

            # This is not a real file. We have to read it the
            # memory-intensive way.
            # The crc32 module fails on reads greater than 2 ** 32 bytes,
            # breaking large reads from gzip streams. Chunk reads to
            # BUFFER_SIZE bytes to avoid the issue and reduce the memory
            # overhead of the read. In the non-chunked case
            # count < max_read_count, so only one read is performed.
            max_read_count = BUFFER_SIZE // min(BUFFER_SIZE,
                                                self.dtype.itemsize)

            array = unpickler.np.empty(count, dtype=self.dtype)
            for i in range(0, count, max_read_count):
                read_count = min(max_read_count, count - i)
                read_size = int(read_count * self.dtype.itemsize)
                data = _read_bytes(unpickler.file_handle,
                                   read_size, "array data")
                array[i:i + read_count] = \
                    unpickler.np.frombuffer(data, dtype=self.dtype,
                                            count=read_count)
                del data

            if self.order == 'F':
                array.shape = self.shape[::-1]
                array = array.transpose()
            else:
                array.shape = self.shape

        # Detect byte order mismatch and swap as needed.
        return _ensure_native_byte_order(array)

    def read_mmap(self, unpickler):
        """Read an array using numpy memmap."""
        current_pos = unpickler.file_handle.tell()
        offset = current_pos
        numpy_array_alignment_bytes = \
            self.safe_get_numpy_array_alignment_bytes()

        if numpy_array_alignment_bytes is not None:
            padding_byte = unpickler.file_handle.read(1)
            padding_length = int.from_bytes(padding_byte, byteorder='little')
            # + 1 is for the padding byte
            offset += padding_length + 1
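            # Mirror of the worked example in write_array (illustrative
            # numbers): with current_pos == 10 and padding_length == 5, the
            # array data starts at offset 10 + 1 + 5 == 16, 16-byte aligned.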

        if unpickler.mmap_mode == 'w+':
            unpickler.mmap_mode = 'r+'

        marray = make_memmap(unpickler.filename,
                             dtype=self.dtype,
                             shape=self.shape,
                             order=self.order,
                             mode=unpickler.mmap_mode,
                             offset=offset)
        # Update the offset so that it corresponds to the end of the read
        # array.
        unpickler.file_handle.seek(offset + marray.nbytes)

        if (numpy_array_alignment_bytes is None and
                current_pos % NUMPY_ARRAY_ALIGNMENT_BYTES != 0):
            message = (
                f'The memmapped array {marray} loaded from the file '
                f'{unpickler.file_handle.name} is not byte aligned. '
                'This may cause segmentation faults if this memmapped array '
                'is used in some libraries like BLAS or PyTorch. '
                'To get rid of this warning, regenerate your pickle file '
                'with joblib >= 1.2.0. '
                'See https://github.com/joblib/joblib/issues/563 '
                'for more details'
            )
            warnings.warn(message)

        return _ensure_native_byte_order(marray)

    def read(self, unpickler):
        """Read the array corresponding to this wrapper.

        Use the unpickler to get all information to correctly read the array.

        Parameters
        ----------
        unpickler: NumpyUnpickler

        Returns
        -------
        array: numpy.ndarray

        """
        # When requested, only use memmap mode if allowed.
        if unpickler.mmap_mode is not None and self.allow_mmap:
            array = self.read_mmap(unpickler)
        else:
            array = self.read_array(unpickler)

        # Manage the array subclass case.
        if (hasattr(array, '__array_prepare__') and
                self.subclass not in (unpickler.np.ndarray,
                                      unpickler.np.memmap)):
            # We need to reconstruct another subclass.
            new_array = unpickler.np.core.multiarray._reconstruct(
                self.subclass, (0,), 'b')
            return new_array.__array_prepare__(array)
        else:
            return array

###############################################################################
# Pickler classes


class NumpyPickler(Pickler):
    """A pickler to persist big data efficiently.

    The main features of this object are:
    * persistence of numpy arrays in a single file.
    * optional compression, with special care taken to avoid memory copies.

    Attributes
    ----------
    fp: file
        File object handle used for serializing the input object.
    protocol: int, optional
        Pickle protocol used. Default is pickle.DEFAULT_PROTOCOL.
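
    Examples
    --------
    A minimal sketch of direct use; ``joblib.dump`` is the usual public
    entry point, and the in-memory buffer below is purely illustrative:

    >>> import io
    >>> import numpy as np  # doctest: +SKIP
    >>> buf = io.BytesIO()
    >>> NumpyPickler(buf).dump(np.arange(10))  # doctest: +SKIP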
282 """

    dispatch = Pickler.dispatch.copy()

    def __init__(self, fp, protocol=None):
        self.file_handle = fp
        self.buffered = isinstance(self.file_handle, BinaryZlibFile)

        # By default we want a pickle protocol that only changes with
        # the major python version and not the minor one
        if protocol is None:
            protocol = pickle.DEFAULT_PROTOCOL

        Pickler.__init__(self, self.file_handle, protocol=protocol)
        # delayed import of numpy, to avoid tight coupling
        try:
            import numpy as np
        except ImportError:
            np = None
        self.np = np

    def _create_array_wrapper(self, array):
        """Create and return a numpy array wrapper from a numpy array."""
        order = 'F' if (array.flags.f_contiguous and
                        not array.flags.c_contiguous) else 'C'
        allow_mmap = not self.buffered and not array.dtype.hasobject

        kwargs = {}
        try:
            self.file_handle.tell()
        except io.UnsupportedOperation:
            kwargs = {'numpy_array_alignment_bytes': None}

        wrapper = NumpyArrayWrapper(type(array),
                                    array.shape, order, array.dtype,
                                    allow_mmap=allow_mmap,
                                    **kwargs)

        return wrapper

    def save(self, obj):
        """Subclass the Pickler `save` method.

        This is a total abuse of the Pickler class in order to use the numpy
        persistence function `save` instead of the default pickle
        implementation. The numpy array is replaced by a custom wrapper in the
        pickle persistence stack and the serialized array is written right
        after in the file. Warning: the file produced does not follow the
        pickle format. As such it cannot be read with `pickle.load`.
        """
        if self.np is not None and type(obj) in (self.np.ndarray,
                                                 self.np.matrix,
                                                 self.np.memmap):
            if type(obj) is self.np.memmap:
                # Pickling doesn't work with memmapped arrays
                obj = self.np.asanyarray(obj)

            # The array wrapper is pickled instead of the real array.
            wrapper = self._create_array_wrapper(obj)
            Pickler.save(self, wrapper)

            # A framer was introduced with pickle protocol 4 and we want to
            # ensure the wrapper object is written before the numpy array
            # buffer in the pickle file.
            # See https://www.python.org/dev/peps/pep-3154/#framing to get
            # more information on the framer behavior.
            if self.proto >= 4:
                self.framer.commit_frame(force=True)

            # And then array bytes are written right after the wrapper.
            wrapper.write_array(obj, self)
            return

        return Pickler.save(self, obj)


class NumpyUnpickler(Unpickler):
    """A subclass of the Unpickler to unpickle our numpy pickles.

    Attributes
    ----------
    mmap_mode: str
        The memory map mode to use for reading numpy arrays.
    file_handle: file_like
        File object to unpickle from.
    filename: str
        Name of the file to unpickle from. It should correspond to
        file_handle. This parameter is required when using mmap_mode.
    np: module
        Reference to numpy module if numpy is installed else None.

    """

    dispatch = Unpickler.dispatch.copy()

    def __init__(self, filename, file_handle, mmap_mode=None):
        # The next line is for backward compatibility with pickles generated
        # with joblib versions less than 0.10.
        self._dirname = os.path.dirname(filename)

        self.mmap_mode = mmap_mode
        self.file_handle = file_handle
        # filename is required for numpy mmap mode.
        self.filename = filename
        self.compat_mode = False
        Unpickler.__init__(self, self.file_handle)
        try:
            import numpy as np
        except ImportError:
            np = None
        self.np = np

    def load_build(self):
        """Called to set the state of a newly created object.

        We capture it to replace our place-holder objects, NDArrayWrapper or
        NumpyArrayWrapper, by the array we are interested in. We replace them
        directly in the stack of the pickler.
        NDArrayWrapper is used for backward compatibility with joblib <= 0.9.
        """
        Unpickler.load_build(self)

        # For backward compatibility, we support NDArrayWrapper objects.
        if isinstance(self.stack[-1], (NDArrayWrapper, NumpyArrayWrapper)):
            if self.np is None:
                raise ImportError("Trying to unpickle an ndarray, "
                                  "but numpy didn't import correctly")
            array_wrapper = self.stack.pop()
            # If any NDArrayWrapper is found, we switch to compatibility
            # mode; this will be used to raise a DeprecationWarning to the
            # user at the end of the unpickling.
            if isinstance(array_wrapper, NDArrayWrapper):
                self.compat_mode = True
            self.stack.append(array_wrapper.read(self))

    # Be careful to register our new method.
    dispatch[pickle.BUILD[0]] = load_build


###############################################################################
# Utility functions

def dump(value, filename, compress=0, protocol=None, cache_size=None):
    """Persist an arbitrary Python object into one file.

    Read more in the :ref:`User Guide <persistence>`.

    Parameters
    ----------
    value: any Python object
        The object to store to disk.
    filename: str, pathlib.Path, or file object.
        The file object or path of the file in which it is to be stored.
        The compression method corresponding to one of the supported filename
        extensions ('.z', '.gz', '.bz2', '.xz' or '.lzma') will be used
        automatically.
    compress: int from 0 to 9 or bool or 2-tuple, optional
        Optional compression level for the data. 0 or False is no
        compression. Higher values mean more compression, but also slower
        read and write times. Using a value of 3 is often a good compromise.
        See the notes for more details.
        If compress is True, the compression level used is 3.
        If compress is a 2-tuple, the first element must be a string among
        the supported compressors (e.g. 'zlib', 'gzip', 'bz2', 'lzma',
        'xz'), and the second element must be an integer from 0 to 9,
        corresponding to the compression level.
    protocol: int, optional
        Pickle protocol, see pickle.dump documentation for more details.
    cache_size: positive int, optional
        This option is deprecated in 0.10 and has no effect.

    Returns
    -------
    filenames: list of strings
        The list of file names in which the data is stored. Since joblib
        0.10, the data is stored in a single file, so the list contains a
        single element.

    See Also
    --------
    joblib.load : corresponding loader

    Notes
    -----
    Memmapping on load cannot be used for compressed files. Thus
    using compression can significantly slow down loading. In
    addition, compressed files take up extra memory during
    dump and load.
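
    Examples
    --------
    A minimal sketch; the paths below are purely illustrative:

    >>> from joblib import dump
    >>> data = {'weights': [0.1, 0.2, 0.7]}
    >>> dump(data, '/tmp/data.joblib')  # doctest: +SKIP
    ['/tmp/data.joblib']
    >>> dump(data, '/tmp/data.joblib.gz', compress=('gzip', 3))  # doctest: +SKIP
    ['/tmp/data.joblib.gz']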

    """

    if isinstance(filename, Path):
        filename = str(filename)

    is_filename = isinstance(filename, str)
    is_fileobj = hasattr(filename, "write")

    compress_method = 'zlib'  # zlib is the default compression method.
    if compress is True:
        # By default, if compress is enabled, we want the default compress
        # level of the compressor.
        compress_level = None
    elif isinstance(compress, tuple):
        # A 2-tuple was set in compress.
        if len(compress) != 2:
            raise ValueError(
                'Compress argument tuple should contain exactly 2 elements: '
                '(compress method, compress level), you passed {}'
                .format(compress))
        compress_method, compress_level = compress
    elif isinstance(compress, str):
        compress_method = compress
        compress_level = None  # Use default compress level
        compress = (compress_method, compress_level)
    else:
        compress_level = compress

    if compress_method == 'lz4' and lz4 is None:
        raise ValueError(LZ4_NOT_INSTALLED_ERROR)

    if (compress_level is not None and
            compress_level is not False and
            compress_level not in range(10)):
        # Raise an error if an invalid compress level is given.
        raise ValueError(
            'Non valid compress level given: "{}". Possible values are '
            '{}.'.format(compress_level, list(range(10))))

    if compress_method not in _COMPRESSORS:
        # Raise an error if an unsupported compression method is given.
        raise ValueError(
            'Non valid compression method given: "{}". Possible values are '
            '{}.'.format(compress_method, _COMPRESSORS))

    if not is_filename and not is_fileobj:
        # People keep inverting arguments, and the resulting error is
        # incomprehensible.
        raise ValueError(
            'Second argument should be a filename or a file-like object, '
            '%s (type %s) was given.'
            % (filename, type(filename))
        )

    if is_filename and not isinstance(compress, tuple):
        # In case no explicit compression was requested using both a
        # compression method and level in a tuple and the filename has an
        # explicit extension, we select the corresponding compressor.

        # Unset the variable to be sure no compression level is set
        # afterwards.
        compress_method = None
        for name, compressor in _COMPRESSORS.items():
            if filename.endswith(compressor.extension):
                compress_method = name

        if compress_method in _COMPRESSORS and compress_level == 0:
            # We choose the default compress_level in case it was not given
            # as an argument (using compress).
            compress_level = None

    if cache_size is not None:
        # Cache size is deprecated starting from version 0.10.
        warnings.warn("Please do not set 'cache_size' in joblib.dump, "
                      "this parameter has no effect and will be removed. "
                      "You used 'cache_size={}'".format(cache_size),
                      DeprecationWarning, stacklevel=2)

    if compress_level != 0:
        with _write_fileobject(filename, compress=(compress_method,
                                                   compress_level)) as f:
            NumpyPickler(f, protocol=protocol).dump(value)
    elif is_filename:
        with open(filename, 'wb') as f:
            NumpyPickler(f, protocol=protocol).dump(value)
    else:
        NumpyPickler(filename, protocol=protocol).dump(value)

    # If the target container is a file object, nothing is returned.
    if is_fileobj:
        return

    # For compatibility, the list of created file names (e.g. with one
    # element after 0.10.0) is returned by default.
    return [filename]


def _unpickle(fobj, filename="", mmap_mode=None):
    """Internal unpickling function."""
    # We are careful to open the file handle early and keep it open to
    # avoid race-conditions on renames.
    # That said, if data is stored in companion files, which can be
    # the case with the old persistence format, moving the directory
    # will create a race when joblib tries to access the companion
    # files.
    unpickler = NumpyUnpickler(filename, fobj, mmap_mode=mmap_mode)
    obj = None
    try:
        obj = unpickler.load()
        if unpickler.compat_mode:
            warnings.warn("The file '%s' has been generated with a "
                          "joblib version less than 0.10. "
                          "Please regenerate this pickle file."
                          % filename,
                          DeprecationWarning, stacklevel=3)
    except UnicodeDecodeError as exc:
        # More user-friendly error message
        new_exc = ValueError(
            'You may be trying to read with '
            'python 3 a joblib pickle generated with python 2. '
            'This feature is not supported by joblib.')
        new_exc.__cause__ = exc
        raise new_exc
    return obj


def load_temporary_memmap(filename, mmap_mode, unlink_on_gc_collect):
    """Load a memory-mapped pickle and track it for optional cleanup."""
    from ._memmapping_reducer import JOBLIB_MMAPS, add_maybe_unlink_finalizer
    obj = load(filename, mmap_mode)
    JOBLIB_MMAPS.add(obj.filename)
    if unlink_on_gc_collect:
        add_maybe_unlink_finalizer(obj)
    return obj


def load(filename, mmap_mode=None):
    """Reconstruct a Python object from a file persisted with joblib.dump.

    Read more in the :ref:`User Guide <persistence>`.

    WARNING: joblib.load relies on the pickle module and can therefore
    execute arbitrary Python code. It should therefore never be used
    to load files from untrusted sources.

    Parameters
    ----------
    filename: str, pathlib.Path, or file object.
        The file object or path of the file from which to load the object.
    mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional
        If not None, the arrays are memory-mapped from the disk. This
        mode has no effect for compressed files. Note that in this
        case the reconstructed object might no longer match exactly
        the originally pickled object.

    Returns
    -------
    result: any Python object
        The object stored in the file.

    See Also
    --------
    joblib.dump : function to save an object

    Notes
    -----

    This function can load numpy array files saved separately during the
    dump. If the mmap_mode argument is given, it is passed to np.load and
    arrays are loaded as memmaps. As a consequence, the reconstructed
    object might not match the original pickled object. Note that if the
    file was saved with compression, the arrays cannot be memmapped.
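
    Examples
    --------
    A minimal sketch; the path below is purely illustrative:

    >>> from joblib import dump, load
    >>> dump([1, 2, 3], '/tmp/data.joblib')  # doctest: +SKIP
    ['/tmp/data.joblib']
    >>> load('/tmp/data.joblib')  # doctest: +SKIP
    [1, 2, 3]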
640 """
641 if Path is not None and isinstance(filename, Path):
642 filename = str(filename)
643
644 if hasattr(filename, "read"):
645 fobj = filename
646 filename = getattr(fobj, 'name', '')
647 with _read_fileobject(fobj, filename, mmap_mode) as fobj:
648 obj = _unpickle(fobj)
649 else:
650 with open(filename, 'rb') as f:
651 with _read_fileobject(f, filename, mmap_mode) as fobj:
652 if isinstance(fobj, str):
653 # if the returned file object is a string, this means we
654 # try to load a pickle file generated with an version of
655 # Joblib so we load it with joblib compatibility function.
656 return load_compatibility(fobj)
657
658 obj = _unpickle(fobj, filename, mmap_mode)
659 return obj