Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/joblib/memory.py: 24%

1"""

2A context object for caching a function's return value each time it

3is called with the same input arguments.

5"""

7# Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org>

9# License: BSD Style, 3 clauses.

12import asyncio

13import datetime

14import functools

15import inspect

16import logging

17import os

18import pathlib

19import pydoc

20import re

21import textwrap

22import time

23import tokenize

24import traceback

25import warnings

26import weakref

28from . import hashing

29from ._store_backends import CacheWarning # noqa

30from ._store_backends import FileSystemStoreBackend, StoreBackendBase

31from .func_inspect import (filter_args, format_call, format_signature,

32 get_func_code, get_func_name)

33from .logger import Logger, format_time, pformat

# Prefix of the header line that is prepended to stored function source;
# the text following it on that line is the function's first line number.
FIRST_LINE_TEXT = "# first line:"

# TODO: The following object should have a data store object as a sub
# object, and the interface to persist and query should be separated in
# the data store.
#
# This would enable creating 'Memory' objects with a different logic for
# pickling that would simply span a MemorizedFunc with the same
# store (or do we want to copy it to avoid cross-talks?), for instance to
# implement HDF5 pickling.

# TODO: Same remark for the logger, and probably use the Python logging
# mechanism.


def extract_first_line(func_code):
    """Separate the optional first-line header from stored function code.

    Parameters
    ----------
    func_code: str
        Function source text, possibly starting with a
        ``FIRST_LINE_TEXT`` header line.

    Returns
    -------
    func_code: str
        The source text without the header line (unchanged when there is
        no header).
    first_line: int
        The line number parsed from the header, or -1 when no header is
        present.
    """
    # Guard clause: text without the marker is returned untouched.
    if not func_code.startswith(FIRST_LINE_TEXT):
        return func_code, -1

    # Everything up to the first newline is the header; the rest is code.
    header, _, remaining_code = func_code.partition('\n')
    line_number = int(header[len(FIRST_LINE_TEXT):])
    return remaining_code, line_number

class JobLibCollisionWarning(UserWarning):
    """Warning category used when function names might collide."""

68_STORE_BACKENDS = {'local': FileSystemStoreBackend}

def register_store_backend(backend_name, backend):
    """Extend available store backends.

    The Memory, MemorizeResult and MemorizeFunc objects are designed to be
    agnostic to the type of store used behind. By default, the local file
    system is used but this function gives the possibility to extend joblib's
    memory pattern with other types of storage such as cloud storage (S3, GCS,
    OpenStack, HadoopFS, etc) or blob DBs.

    Parameters
    ----------
    backend_name: str
        The name identifying the store backend being registered. For example,
        'local' is used with FileSystemStoreBackend.
    backend: StoreBackendBase subclass
        The class itself (not an instance, and not its name) implementing
        the StoreBackendBase interface.

    Raises
    ------
    ValueError
        If ``backend_name`` is not a string, or if ``backend`` is not a
        class deriving from StoreBackendBase.
    """
    if not isinstance(backend_name, str):
        raise ValueError("Store backend name should be a string, "
                         "'{0}' given.".format(backend_name))

    # issubclass() raises TypeError when its first argument is not a class,
    # so make sure we have a class before asking about its ancestry. This
    # keeps the error type consistent (ValueError) for None, instances,
    # strings and any other non-class object.
    is_valid_backend = (isinstance(backend, type) and
                        issubclass(backend, StoreBackendBase))
    if not is_valid_backend:
        raise ValueError("Store backend should inherit "
                         "StoreBackendBase, "
                         "'{0}' given.".format(backend))

    _STORE_BACKENDS[backend_name] = backend

def _store_backend_factory(backend, location, verbose=0, backend_options=None):
    """Return the correct store object for the given location.

    Parameters
    ----------
    backend: str
        Name under which the wanted store backend class is registered in
        ``_STORE_BACKENDS``. Only used when ``location`` is a path.
    location: str, pathlib.Path, StoreBackendBase or None
        Where the store lives. A StoreBackendBase instance is returned
        unchanged; a path is used to configure a new backend instance.
    verbose: int, optional
        Verbosity level forwarded to the backend's ``configure``.
    backend_options: dict or None, optional
        Extra options forwarded to the backend's ``configure``. ``None``
        is replaced by an empty dict.

    Returns
    -------
    A configured store backend instance, or None when ``location`` is None
    or of an unsupported type (a UserWarning is emitted in the latter case).

    Raises
    ------
    TypeError
        If ``location`` is a path and no backend is registered under the
        name ``backend``.
    """
    if backend_options is None:
        backend_options = {}

    if isinstance(location, pathlib.Path):
        location = str(location)

    # An already-built store backend is used as is.
    if isinstance(location, StoreBackendBase):
        return location

    if isinstance(location, str):
        location = os.path.expanduser(location)

        # Look the backend class up by name in the registry.
        try:
            backend_cls = _STORE_BACKENDS.get(backend)
        except TypeError:
            # Unhashable names can never match a registered key.
            backend_cls = None

        # There is no implicit fallback: an unregistered backend name is
        # an error.
        if backend_cls is None:
            raise TypeError('Unknown location {0} or backend {1}'.format(
                location, backend))

        # The store backend is configured with the extra named parameters,
        # some of them are specific to the underlying store backend.
        obj = backend_cls()
        obj.configure(location, verbose=verbose,
                      backend_options=backend_options)
        return obj

    if location is not None:
        warnings.warn(
            "Instantiating a backend using a {} as a location is not "
            "supported by joblib. Returning None instead.".format(
                location.__class__.__name__), UserWarning)

    return None

138

139

def _build_func_identifier(func):
    """Return a roughly unique, path-like identifier for ``func``.

    The identifier is made of the module components and the function name
    reported by ``get_func_name``, joined with the OS path separator.
    """
    module_parts, name = get_func_name(func)
    # Keep the historical filesystem-like layout of the identifier.
    components = [*module_parts, name]
    return os.path.join(*components)


# An in-memory store to avoid looking at the disk-based function
# source code to check if a function definition has changed
_FUNCTION_HASHES = weakref.WeakKeyDictionary()

150

151

152###############################################################################

153# class `MemorizedResult`

154###############################################################################

class MemorizedResult(Logger):
    """Object representing a cached value.

    Parameters
    ----------
    location: str or store backend
        The location of joblib cache. Depends on the store backend used.

    call_id: sequence
        Identifier of the cached call. Its first item is exposed as
        ``func_id`` (and ``func``), its second item as ``args_id``.

    backend: str
        Type of store backend for reading/writing cache files.
        Default is 'local'.

    mmap_mode: {None, 'r+', 'r', 'w+', 'c'}
        The memmapping mode used when loading from cache numpy arrays. See
        numpy.load for the meaning of the different values.

    verbose: int
        verbosity level (0 means no message).

    timestamp, metadata: string
        for internal use only. When ``metadata`` is not given it is read
        from the store backend.

    Attributes
    ----------
    store_backend
        The store backend object built from ``backend`` and ``location``.

    metadata: dict
        Metadata associated with the cached call.

    duration
        The 'duration' entry of ``metadata``, or None when absent.
    """
    def __init__(self, location, call_id, backend='local', mmap_mode=None,
                 verbose=0, timestamp=None, metadata=None):
        Logger.__init__(self)
        self._call_id = call_id
        self.store_backend = _store_backend_factory(
            backend, location, verbose=verbose)
        self.mmap_mode = mmap_mode

        # Fall back on the stored metadata when none was handed over.
        if metadata is None:
            metadata = self.store_backend.get_metadata(self._call_id)
        self.metadata = metadata

        self.duration = self.metadata.get('duration', None)
        self.verbose = verbose
        self.timestamp = timestamp

    @property
    def func(self):
        """Alias of ``func_id``."""
        return self.func_id

    @property
    def func_id(self):
        """First item of the call identifier."""
        return self._call_id[0]

    @property
    def args_id(self):
        """Second item of the call identifier."""
        return self._call_id[1]

    @property
    def argument_hash(self):
        """Deprecated alias of ``args_id``; emits a DeprecationWarning."""
        warnings.warn(
            "The 'argument_hash' attribute has been deprecated in version "
            "0.12 and will be removed in version 0.14.\n"
            "Use `args_id` attribute instead.",
            DeprecationWarning, stacklevel=2)
        return self.args_id

    def get(self):
        """Read value from cache and return it.

        Raises
        ------
        KeyError
            If the store backend raises a ValueError while loading the
            item; the original error is chained as the cause.
        """
        try:
            return self.store_backend.load_item(
                self._call_id,
                timestamp=self.timestamp,
                metadata=self.metadata,
                verbose=self.verbose,
            )
        except ValueError as exc:
            item_path = os.path.join(
                self.store_backend.location, *self._call_id)
            raise KeyError(
                "Error while trying to load a MemorizedResult's value. "
                "It seems that this folder is corrupted : {}".format(
                    item_path)) from exc

    def clear(self):
        """Clear value from cache"""
        self.store_backend.clear_item(self._call_id)

    def __repr__(self):
        template = '{}(location="{}", func="{}", args_id="{}")'
        return template.format(
            self.__class__.__name__, self.store_backend.location,
            *self._call_id
        )

    def __getstate__(self):
        # The timestamp is blanked out of the pickled state.
        state = dict(self.__dict__)
        state['timestamp'] = None
        return state

253

254

class NotMemorizedResult:
    """Class representing an arbitrary value.

    This class is a replacement for MemorizedResult when there is no cache.
    The value is simply held in memory until ``clear`` is called.
    """
    __slots__ = ('value', 'valid')

    def __init__(self, value):
        self.value = value
        self.valid = True

    def get(self):
        """Return the held value, or raise KeyError once it was cleared."""
        if not self.valid:
            raise KeyError("No value stored.")
        return self.value

    def clear(self):
        """Drop the held value and mark the result as invalid."""
        self.valid = False
        self.value = None

    def __repr__(self):
        cls_name = self.__class__.__name__
        if not self.valid:
            return cls_name + ' with no value'
        return '{class_name}({value})'.format(
            class_name=cls_name, value=pformat(self.value))

    # __getstate__ and __setstate__ are required because of __slots__
    def __getstate__(self):
        return dict(valid=self.valid, value=self.value)

    def __setstate__(self, state):
        self.valid = state["valid"]
        self.value = state["value"]

291

292

293###############################################################################

294# class `NotMemorizedFunc`

295###############################################################################

class NotMemorizedFunc:
    """No-op object decorating a function.

    This class replaces MemorizedFunc when there is no cache. It provides an
    identical API but does not write anything on disk.

    Attributes
    ----------
    func: callable
        Original undecorated function.
    """
    # Should be a light as possible (for speed)
    def __init__(self, func):
        self.func = func

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def call_and_shelve(self, *args, **kwargs):
        """Call the function and wrap its output in a NotMemorizedResult."""
        value = self.func(*args, **kwargs)
        return NotMemorizedResult(value)

    def __repr__(self):
        cls_name = self.__class__.__name__
        return '{0}(func={1})'.format(cls_name, self.func)

    def clear(self, warn=True):
        """Do nothing: there is no cache to empty."""
        # Argument "warn" is for compatibility with MemorizedFunc.clear
        pass

    def call(self, *args, **kwargs):
        """Call the function; return its output and empty metadata."""
        value = self.func(*args, **kwargs)
        return value, {}

    def check_call_in_cache(self, *args, **kwargs):
        """Always False: nothing is ever cached."""
        return False

329

330

331###############################################################################

332# class `AsyncNotMemorizedFunc`

333###############################################################################

class AsyncNotMemorizedFunc(NotMemorizedFunc):
    """NotMemorizedFunc variant whose ``call_and_shelve`` awaits ``func``."""

    async def call_and_shelve(self, *args, **kwargs):
        """Await the function and wrap its output in a NotMemorizedResult."""
        value = await self.func(*args, **kwargs)
        return NotMemorizedResult(value)

337

338

339###############################################################################

340# class `MemorizedFunc`

341###############################################################################

class MemorizedFunc(Logger):
    """Callable object decorating a function for caching its return value
    each time it is called.

    Methods are provided to inspect the cache or clean it.

    Attributes
    ----------
    func: callable
        The original, undecorated, function.

    location: string
        The location of joblib cache. Depends on the store backend used.

    backend: str
        Type of store backend for reading/writing cache files.
        Default is 'local', in which case the location is the path to a
        disk storage.

    ignore: list or None
        List of variable names to ignore when choosing whether to
        recompute.

    mmap_mode: {None, 'r+', 'r', 'w+', 'c'}
        The memmapping mode used when loading from cache
        numpy arrays. See numpy.load for the meaning of the different
        values.

    compress: boolean, or integer
        Whether to zip the stored data on disk. If an integer is
        given, it should be between 1 and 9, and sets the amount
        of compression. Note that compressed arrays cannot be
        read by memmapping.

    verbose: int, optional
        The verbosity flag, controls messages that are issued as
        the function is evaluated.

    cache_validation_callback: callable, optional
        Callable to check if a result in cache is valid or is to be recomputed.
        When the function is called with arguments for which a cache exists,
        the callback is called with the cache entry's metadata as its sole
        argument. If it returns True, the cached result is returned, else the
        cache for these arguments is cleared and the result is recomputed.
    """
    # ------------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------------

    def __init__(self, func, location, backend='local', ignore=None,
                 mmap_mode=None, compress=False, verbose=1, timestamp=None,
                 cache_validation_callback=None):
        Logger.__init__(self)
        self.mmap_mode = mmap_mode
        self.compress = compress
        self.func = func
        self.cache_validation_callback = cache_validation_callback
        self.func_id = _build_func_identifier(func)
        self.ignore = ignore if ignore is not None else []
        self._verbose = verbose

        # retrieve store object from backend type and location.
        self.store_backend = _store_backend_factory(
            backend, location, verbose=verbose,
            backend_options=dict(compress=compress, mmap_mode=mmap_mode),
        )
        if self.store_backend is not None:
            # Create func directory on demand.
            self.store_backend.store_cached_func_code([self.func_id])

        self.timestamp = timestamp if timestamp is not None else time.time()
        try:
            functools.update_wrapper(self, func)
        except Exception:
            pass  # Objects like ufunc don't like that
        if inspect.isfunction(func):
            doc = pydoc.TextDoc().document(func)
            # Insert a blank line after the first line of the pydoc output
            doc = doc.replace('\n', '\n\n', 1)
            # Strip backspace-overprints for compatibility with autodoc
            doc = re.sub('\x08.', '', doc)
        else:
            # Pydoc does a poor job on other objects
            doc = func.__doc__
        self.__doc__ = 'Memoized version of %s' % doc

        self._func_code_info = None
        self._func_code_id = None

    def _is_in_cache_and_valid(self, call_id):
        """Check if the function call is cached and valid for given arguments.

        - Compare the function code with the one from the cached function,
          asserting if it has changed.
        - Check if the function call is present in the cache.
        - Call `cache_validation_callback` for user define cache validation.

        Returns True if the function call is in cache and can be used, and
        returns False otherwise.
        """
        # Check if the code of the function has changed
        if not self._check_previous_func_code(stacklevel=4):
            return False

        # Check if this specific call is in the cache
        if not self.store_backend.contains_item(call_id):
            return False

        # The metadata is only consumed by the user callback: do not read
        # it from the store when there is no callback to feed.
        validate = self.cache_validation_callback
        if validate is None:
            return True

        # Call the user defined cache validation callback
        metadata = self.store_backend.get_metadata(call_id)
        if not validate(metadata):
            self.store_backend.clear_item(call_id)
            return False

        return True

    def _cached_call(self, args, kwargs, shelving):
        """Call wrapped function and cache result, or read cache if available.

        This function returns the wrapped function output or a reference to
        the cached result.

        Arguments:
        ----------

        args, kwargs: list and dict
            input arguments for wrapped function

        shelving: bool
            True when called via the call_and_shelve function.


        Returns
        -------
        output: Output of the wrapped function if shelving is false, or a
            MemorizedResult reference to the value if shelving is true.
        metadata: dict containing the metadata associated with the call.
            Empty when the result comes from the cache.
        """
        args_id = self._get_args_id(*args, **kwargs)
        call_id = (self.func_id, args_id)
        _, func_name = get_func_name(self.func)
        func_info = self.store_backend.get_cached_func_info([self.func_id])
        location = func_info['location']

        if self._verbose >= 20:
            logging.basicConfig(level=logging.INFO)
            _, signature = format_signature(self.func, *args, **kwargs)
            self.info(
                textwrap.dedent(
                    f"""
                        Querying {func_name} with signature
                        {signature}.

                        (argument hash {args_id})

                        The store location is {location}.
                        """
                )
            )

        # Compare the function code with the previous to see if the
        # function code has changed and check if the results are present in
        # the cache.
        if self._is_in_cache_and_valid(call_id):
            if shelving:
                return self._get_memorized_result(call_id), {}

            try:
                start_time = time.time()
                output = self._load_item(call_id)
                if self._verbose > 4:
                    self._print_duration(time.time() - start_time,
                                         context='cache loaded ')
                return output, {}
            except Exception:
                # Loading failed: warn and fall through to a recomputation.
                # XXX: Should use an exception logger
                _, signature = format_signature(self.func, *args, **kwargs)
                self.warn('Exception while loading results for '
                          '{}\n {}'.format(signature, traceback.format_exc()))

        if self._verbose > 10:
            self.warn(
                f"Computing func {func_name}, argument hash {args_id} "
                f"in location {location}"
            )

        # Returns whatever _call returns: the output and the metadata
        return self._call(call_id, args, kwargs, shelving)

    @property
    def func_code_info(self):
        # 3-tuple property containing: the function source code, source file,
        # and first line of the code inside the source file
        if hasattr(self.func, '__code__'):
            if self._func_code_id is None:
                self._func_code_id = id(self.func.__code__)
            elif id(self.func.__code__) != self._func_code_id:
                # Be robust to dynamic reassignments of self.func.__code__
                self._func_code_info = None

        if self._func_code_info is None:
            # Cache the source code of self.func . Provided that get_func_code
            # (which should be called once on self) gets called in the process
            # in which self.func was defined, this caching mechanism prevents
            # undesired cache clearing when the cached function is called in
            # an environment where the introspection utilities get_func_code
            # relies on do not work (typically, in joblib child processes).
            # See #1035 for more info
            # TODO (pierreglaser): do the same with get_func_name?
            self._func_code_info = get_func_code(self.func)
        return self._func_code_info

    def call_and_shelve(self, *args, **kwargs):
        """Call wrapped function, cache result and return a reference.

        This method returns a reference to the cached result instead of the
        result itself. The reference object is small and pickeable, allowing
        to send or store it easily. Call .get() on reference object to get
        result.

        Returns
        -------
        cached_result: MemorizedResult or NotMemorizedResult
            reference to the value returned by the wrapped function. The
            class "NotMemorizedResult" is used when there is no cache
            activated (e.g. location=None in Memory).
        """
        # Return the wrapped output, without the metadata
        return self._cached_call(args, kwargs, shelving=True)[0]

    def __call__(self, *args, **kwargs):
        # Return the output, without the metadata
        return self._cached_call(args, kwargs, shelving=False)[0]

    def __getstate__(self):
        # Make sure self.func's source is introspected prior to being pickled -
        # code introspection utilities typically do not work inside child
        # processes
        _ = self.func_code_info

        # We don't store the timestamp when pickling, to avoid the hash
        # depending from it.
        state = self.__dict__.copy()
        state['timestamp'] = None

        # Invalidate the code id as id(obj) will be different in the child
        state['_func_code_id'] = None

        return state

    def check_call_in_cache(self, *args, **kwargs):
        """Check if function call is in the memory cache.

        Does not call the function or do any work besides func inspection
        and arg hashing.

        Returns
        -------
        is_call_in_cache: bool
            Whether or not the result of the function has been cached
            for the input arguments that have been passed.
        """
        call_id = (self.func_id, self._get_args_id(*args, **kwargs))
        return self.store_backend.contains_item(call_id)

    # ------------------------------------------------------------------------
    # Private interface
    # ------------------------------------------------------------------------

    def _get_args_id(self, *args, **kwargs):
        """Return the input parameter hash of a result."""
        return hashing.hash(filter_args(self.func, self.ignore, args, kwargs),
                            coerce_mmap=self.mmap_mode is not None)

    def _hash_func(self):
        """Hash a function to key the online cache"""
        func_code_h = hash(getattr(self.func, '__code__', None))
        return id(self.func), hash(self.func), func_code_h

    def _write_func_code(self, func_code, first_line):
        """ Write the function code and the filename to a file.
        """
        # We store the first line because the filename and the function
        # name is not always enough to identify a function: people
        # sometimes have several functions named the same way in a
        # file. This is bad practice, but joblib should be robust to bad
        # practice.
        func_code = '%s %i\n%s' % (FIRST_LINE_TEXT, first_line, func_code)
        self.store_backend.store_cached_func_code([self.func_id], func_code)

        # Also store in the in-memory store of function hashes
        is_named_callable = (hasattr(self.func, '__name__') and
                             self.func.__name__ != '<lambda>')
        if is_named_callable:
            # Don't do this for lambda functions or strange callable
            # objects, as it ends up being too fragile
            func_hash = self._hash_func()
            try:
                _FUNCTION_HASHES[self.func] = func_hash
            except TypeError:
                # Some callable are not hashable
                pass

    def _check_previous_func_code(self, stacklevel=2):
        """Tell whether the stored function code still matches ``self.func``.

        stacklevel is the depth a which this function is called, to
        issue useful warnings to the user.

        Returns True when the code is unchanged. Otherwise returns False,
        after either writing the code for the first time or clearing the
        function's cache.
        """
        # First check if our function is in the in-memory store.
        # Using the in-memory store not only makes things faster, but it
        # also renders us robust to variations of the files when the
        # in-memory version of the code does not vary
        try:
            if self.func in _FUNCTION_HASHES:
                # We use as an identifier the id of the function and its
                # hash. This is more likely to falsely change than have hash
                # collisions, thus we are on the safe side.
                func_hash = self._hash_func()
                if func_hash == _FUNCTION_HASHES[self.func]:
                    return True
        except TypeError:
            # Some callables are not hashable
            pass

        # Here, we go through some effort to be robust to dynamically
        # changing code and collision. We cannot inspect.getsource
        # because it is not reliable when using IPython's magic "%run".
        func_code, source_file, first_line = self.func_code_info
        try:
            old_func_code, old_first_line = extract_first_line(
                self.store_backend.get_cached_func_code([self.func_id]))
        except OSError:  # IOError is an alias of OSError
            # No readable stored code: store the current one.
            self._write_func_code(func_code, first_line)
            return False
        if old_func_code == func_code:
            return True

        # We have differing code, is this because we are referring to
        # different functions, or because the function we are referring to has
        # changed?

        _, func_name = get_func_name(self.func, resolv_alias=False,
                                     win_characters=False)
        if old_first_line == first_line == -1 or func_name == '<lambda>':
            if not first_line == -1:
                func_description = ("{0} ({1}:{2})"
                                    .format(func_name, source_file,
                                            first_line))
            else:
                func_description = func_name
            warnings.warn(JobLibCollisionWarning(
                "Cannot detect name collisions for function '{0}'"
                .format(func_description)), stacklevel=stacklevel)

        # Fetch the code at the old location and compare it. If it is the
        # same than the code store, we have a collision: the code in the
        # file has not changed, but the name we have is pointing to a new
        # code block.
        if not old_first_line == first_line and source_file is not None:
            if os.path.exists(source_file):
                _, func_name = get_func_name(self.func, resolv_alias=False)
                num_lines = len(func_code.split('\n'))
                with tokenize.open(source_file) as f:
                    on_disk_func_code = f.readlines()[
                        old_first_line - 1:old_first_line - 1 + num_lines - 1]
                on_disk_func_code = ''.join(on_disk_func_code)
                possible_collision = (on_disk_func_code.rstrip() ==
                                      old_func_code.rstrip())
            else:
                possible_collision = source_file.startswith('<doctest ')
            if possible_collision:
                warnings.warn(JobLibCollisionWarning(
                    'Possible name collisions between functions '
                    "'%s' (%s:%i) and '%s' (%s:%i)" %
                    (func_name, source_file, old_first_line,
                     func_name, source_file, first_line)),
                    stacklevel=stacklevel)

        # The function has changed, wipe the cache directory.
        # XXX: Should be using warnings, and giving stacklevel
        if self._verbose > 10:
            _, func_name = get_func_name(self.func, resolv_alias=False)
            self.warn("Function {0} (identified by {1}) has changed"
                      ".".format(func_name, self.func_id))
        self.clear(warn=True)
        return False

    def clear(self, warn=True):
        """Empty the function's cache."""
        func_id = self.func_id
        if self._verbose > 0 and warn:
            self.warn("Clearing function cache identified by %s" % func_id)
        self.store_backend.clear_path([func_id, ])

        # Store the current function code again after the wipe.
        func_code, _, first_line = self.func_code_info
        self._write_func_code(func_code, first_line)

    def call(self, *args, **kwargs):
        """Force the execution of the function with the given arguments.

        The output values will be persisted, i.e., the cache will be updated
        with any new values.

        Parameters
        ----------
        *args: arguments
            The arguments.
        **kwargs: keyword arguments
            Keyword arguments.

        Returns
        -------
        output : object
            The output of the function call.
        metadata : dict
            The metadata associated with the call.
        """
        call_id = (self.func_id, self._get_args_id(*args, **kwargs))

        # Return the output and the metadata
        return self._call(call_id, args, kwargs)

    def _call(self, call_id, args, kwargs, shelving=False):
        """Run the wrapped function and persist its output.

        Returns the output (or a MemorizedResult when ``shelving`` is true)
        and the metadata.
        """
        self._before_call(args, kwargs)
        start_time = time.time()
        output = self.func(*args, **kwargs)
        return self._after_call(call_id, args, kwargs, shelving,
                                output, start_time)

    def _before_call(self, args, kwargs):
        """Print the formatted call when verbosity is enabled."""
        if self._verbose > 0:
            print(format_call(self.func, args, kwargs))

    def _after_call(self, call_id, args, kwargs, shelving, output, start_time):
        """Persist ``output`` and the call metadata; return both.

        When ``shelving`` is true a MemorizedResult is returned in place of
        the output.
        """
        self.store_backend.dump_item(call_id, output, verbose=self._verbose)
        duration = time.time() - start_time
        if self._verbose > 0:
            self._print_duration(duration)
        metadata = self._persist_input(duration, call_id, args, kwargs)
        if shelving:
            return self._get_memorized_result(call_id, metadata), metadata

        if self.mmap_mode is not None:
            # Memmap the output at the first call to be consistent with
            # later calls
            output = self._load_item(call_id, metadata)
        return output, metadata

    def _persist_input(self, duration, call_id, args, kwargs,
                       this_duration_limit=0.5):
        """ Save a small summary of the call through the store backend.

            duration: float
                time taken by hashing input arguments, calling the wrapped
                function and persisting its output.

            call_id: tuple
                identifier of the call the metadata belongs to.

            args, kwargs: list and dict
                input arguments for wrapped function

            this_duration_limit: float
                Max execution time for this function before issuing a warning.

            Returns the metadata dict that was stored, with the keys
            "duration", "input_args" and "time".
        """
        start_time = time.time()
        argument_dict = filter_args(self.func, self.ignore,
                                    args, kwargs)

        input_repr = {k: repr(v) for k, v in argument_dict.items()}
        metadata = {
            "duration": duration, "input_args": input_repr, "time": start_time,
        }

        self.store_backend.store_metadata(call_id, metadata)

        this_duration = time.time() - start_time
        if this_duration > this_duration_limit:
            # This persistence should be fast. It will not be if repr() takes
            # time and its output is large, because json.dump will have to
            # write a large file. This should not be an issue with numpy arrays
            # for which repr() always output a short representation, but can
            # be with complex dictionaries. Fixing the problem should be a
            # matter of replacing repr() above by something smarter.
            warnings.warn("Persisting input arguments took %.2fs to run. "
                          "If this happens often in your code, it can cause "
                          "performance problems "
                          "(results will be correct in all cases). "
                          "The reason for this is probably some large input "
                          "arguments for a wrapped function."
                          % this_duration, stacklevel=5)
        return metadata

    def _get_memorized_result(self, call_id, metadata=None):
        """Build a MemorizedResult referencing ``call_id`` in our store."""
        return MemorizedResult(self.store_backend, call_id,
                               metadata=metadata, timestamp=self.timestamp,
                               verbose=self._verbose - 1)

    def _load_item(self, call_id, metadata=None):
        """Load the cached output of ``call_id`` from the store backend."""
        return self.store_backend.load_item(call_id, metadata=metadata,
                                            timestamp=self.timestamp,
                                            verbose=self._verbose)

    def _print_duration(self, duration, context=''):
        """Print the function name and a formatted duration.

        The line is left-padded with underscores up to 80 characters.
        """
        _, name = get_func_name(self.func)
        msg = f"{name} {context}- {format_time(duration)}"
        print(max(0, (80 - len(msg))) * '_' + msg)

    # ------------------------------------------------------------------------
    # Private `object` interface
    # ------------------------------------------------------------------------

    def __repr__(self):
        return '{class_name}(func={func}, location={location})'.format(
            class_name=self.__class__.__name__,
            func=self.func,
            location=self.store_backend.location,)

866

867

868###############################################################################

869# class `AsyncMemorizedFunc`

870###############################################################################

class AsyncMemorizedFunc(MemorizedFunc):
    """MemorizedFunc variant for functions whose call must be awaited.

    ``_cached_call`` yields a plain ``(output, metadata)`` pair on a cache
    hit and a coroutine (from the async ``_call``) otherwise; the public
    methods await the latter before returning.
    """

    async def __call__(self, *args, **kwargs):
        result = self._cached_call(args, kwargs, shelving=False)
        if asyncio.iscoroutine(result):
            result = await result
        return result[0]  # Don't return metadata

    async def call_and_shelve(self, *args, **kwargs):
        result = self._cached_call(args, kwargs, shelving=True)
        if asyncio.iscoroutine(result):
            result = await result
        return result[0]  # Don't return metadata

    async def call(self, *args, **kwargs):
        result = super().call(*args, **kwargs)
        if asyncio.iscoroutine(result):
            return await result
        return result

    async def _call(self, call_id, args, kwargs, shelving=False):
        """Await the wrapped function and persist its output."""
        self._before_call(args, kwargs)
        started_at = time.time()
        value = await self.func(*args, **kwargs)
        return self._after_call(
            call_id, args, kwargs, shelving, value, started_at
        )

893

894

895###############################################################################

896# class `Memory`

897###############################################################################

class Memory(Logger):
    """ A context object for caching a function's return value each time it
    is called with the same input arguments.

    All values are cached on the filesystem, in a deep directory
    structure.

    Read more in the :ref:`User Guide <memory>`.

    Parameters
    ----------
    location: str, pathlib.Path or None
        The path of the base directory to use as a data store
        or None. If None is given, no caching is done and
        the Memory object is completely transparent. This option
        replaces cachedir since version 0.12.

    backend: str, optional
        Type of store backend for reading/writing cache files.
        Default: 'local'.
        The 'local' backend is using regular filesystem operations to
        manipulate data (open, mv, etc) in the backend.

    mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional
        The memmapping mode used when loading numpy arrays from the
        cache. See numpy.load for the meaning of the arguments.

    compress: boolean, or integer, optional
        Whether to zip the stored data on disk. If an integer is
        given, it should be between 1 and 9, and sets the amount
        of compression. Note that compressed arrays cannot be
        read by memmapping.

    verbose: int, optional
        Verbosity flag, controls the debug messages that are issued
        as functions are evaluated.

    backend_options: dict, optional
        Contains a dictionary of named parameters used to configure
        the store backend.
    """
    # ------------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------------

    def __init__(self, location=None, backend='local', mmap_mode=None,
                 compress=False, verbose=1, backend_options=None):
        Logger.__init__(self)
        if backend_options is None:
            backend_options = {}

        self._verbose = verbose
        self.mmap_mode = mmap_mode
        self.timestamp = time.time()
        self.backend = backend
        self.compress = compress
        self.backend_options = backend_options

        if compress and mmap_mode is not None:
            # stacklevel=2 attributes the warning to the code that
            # instantiates the Memory object rather than to this line.
            warnings.warn('Compressed results cannot be memmapped',
                          stacklevel=2)

        self.location = location
        # Only a ``str`` location gets the 'joblib' sub-directory appended;
        # any other value is handed to the backend factory unchanged.
        store_location = location
        if isinstance(location, str):
            store_location = os.path.join(location, 'joblib')

        store_options = dict(compress=compress, mmap_mode=mmap_mode,
                             **backend_options)
        self.store_backend = _store_backend_factory(
            backend, store_location, verbose=self._verbose,
            backend_options=store_options)

    def cache(self, func=None, ignore=None, verbose=None, mmap_mode=False,
              cache_validation_callback=None):
        """ Decorates the given function func to only compute its return
            value for input arguments not cached on disk.

            Parameters
            ----------
            func: callable, optional
                The function to be decorated. When omitted, a partial
                application of this method is returned so that the other
                keyword arguments can be given in a decorator.
            ignore: list of strings
                A list of arguments name to ignore in the hashing
            verbose: integer, optional
                The verbosity mode of the function. By default that
                of the memory object is used.
            mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional
                The memmapping mode used when loading numpy arrays from
                the cache. See numpy.load for the meaning of the
                arguments. By default that of the memory object is used.
            cache_validation_callback: callable, optional
                Callable to validate whether or not the cache is valid. When
                the cached function is called with arguments for which a cache
                exists, this callable is called with the metadata of the cached
                result as its sole argument. If it returns True, then the
                cached result is returned, else the cache for these arguments
                is cleared and recomputed.

            Returns
            -------
            decorated_func: MemorizedFunc object
                The returned object is a MemorizedFunc object, that is
                callable (behaves like a function), but offers extra
                methods for cache lookup and management. See the
                documentation for :class:`joblib.memory.MemorizedFunc`.
                Coroutine functions are wrapped in an AsyncMemorizedFunc
                instead. When the memory has no store backend, a
                NotMemorizedFunc (or AsyncNotMemorizedFunc) is returned.

            Raises
            ------
            ValueError
                If cache_validation_callback is given but is not callable.
        """
        callback = cache_validation_callback
        if not (callback is None or callable(callback)):
            raise ValueError(
                "cache_validation_callback needs to be callable. "
                f"Got {cache_validation_callback}."
            )

        if func is None:
            # Partial application, to be able to specify extra keyword
            # arguments in decorators
            return functools.partial(
                self.cache,
                ignore=ignore,
                mmap_mode=mmap_mode,
                verbose=verbose,
                cache_validation_callback=cache_validation_callback,
            )

        if self.store_backend is None:
            # No store: wrap the function without caching anything.
            if asyncio.iscoroutinefunction(func):
                return AsyncNotMemorizedFunc(func)
            return NotMemorizedFunc(func)

        # ``None`` / ``False`` mean "fall back to the Memory object setting".
        effective_verbose = self._verbose if verbose is None else verbose
        effective_mmap = self.mmap_mode if mmap_mode is False else mmap_mode

        # Unwrap an already memorized function so that it is not wrapped
        # twice.
        target = func.func if isinstance(func, MemorizedFunc) else func

        if asyncio.iscoroutinefunction(target):
            wrapper_cls = AsyncMemorizedFunc
        else:
            wrapper_cls = MemorizedFunc
        return wrapper_cls(
            target, location=self.store_backend, backend=self.backend,
            ignore=ignore, mmap_mode=effective_mmap, compress=self.compress,
            verbose=effective_verbose, timestamp=self.timestamp,
            cache_validation_callback=cache_validation_callback
        )

    def clear(self, warn=True):
        """ Erase the complete cache directory.

            Parameters
            ----------
            warn: boolean, optional
                Whether to emit a warning through ``self.warn`` before
                flushing the cache.
        """
        if warn:
            self.warn('Flushing completely the cache')

        backend = self.store_backend
        if backend is not None:
            backend.clear()

            # As the cache is now completely empty, make sure the
            # _FUNCTION_HASHES cache is reset as well. Otherwise, for a
            # function that is present in this table, results cached after
            # this clear would be cache misses, as the function code is not
            # re-written.
            _FUNCTION_HASHES.clear()

    def reduce_size(self, bytes_limit=None, items_limit=None, age_limit=None):
        """Remove cache elements to make the cache fit its limits.

        The limitation can impose that the cache size fits in ``bytes_limit``,
        that the number of cache items is no more than ``items_limit``, and
        that all files in cache are not older than ``age_limit``.

        Nothing is done when the memory has no store backend or when no
        limit at all is given.

        Parameters
        ----------
        bytes_limit: int | str, optional
            Limit in bytes of the size of the cache. By default, the size of
            the cache is unlimited. When reducing the size of the cache,
            ``joblib`` keeps the most recently accessed items first. If a
            str is passed, it is converted to a number of bytes using units
            { K | M | G} for kilo, mega, giga.

        items_limit: int, optional
            Number of items to limit the cache to.  By default, the number of
            items in the cache is unlimited.  When reducing the size of the
            cache, ``joblib`` keeps the most recently accessed items first.

        age_limit: datetime.timedelta, optional
            Maximum age of items to limit the cache to.  When reducing the size
            of the cache, any items last accessed more than the given length of
            time ago are deleted.
        """
        if self.store_backend is None:
            # No cached results, there is nothing to reduce.
            return

        limits = (bytes_limit, items_limit, age_limit)
        if all(limit is None for limit in limits):
            # No limitation to impose.
            return

        # The store backend is in charge of actually enforcing the limits.
        self.store_backend.enforce_store_limits(*limits)

    def eval(self, func, *args, **kwargs):
        """ Eval function func with arguments `*args` and `**kwargs`,
            in the context of the memory.

            The function is called through ``self.cache(func)``, so it is
            only evaluated if the cache is not up to date. Without a store
            backend, the function is simply called directly.

        """
        if self.store_backend is None:
            return func(*args, **kwargs)
        cached_func = self.cache(func)
        return cached_func(*args, **kwargs)

    # ------------------------------------------------------------------------
    # Private `object` interface
    # ------------------------------------------------------------------------

    def __repr__(self):
        """Return ``ClassName(location=...)``; the location is that of the
        store backend, or None when there is no store backend.
        """
        cls_name = self.__class__.__name__
        backend = self.store_backend
        location = None if backend is None else backend.location
        return f'{cls_name}(location={location})'

    def __getstate__(self):
        """ We don't store the timestamp when pickling, to avoid the hash
            depending on it.
        """
        return {**self.__dict__, 'timestamp': None}

1122

1123

1124###############################################################################

1125# cache_validation_callback helpers

1126###############################################################################

1127

def expires_after(days=0, seconds=0, microseconds=0, milliseconds=0, minutes=0,
                  hours=0, weeks=0):
    """Helper cache_validation_callback to force recompute after a duration.

    Parameters
    ----------
    days, seconds, microseconds, milliseconds, minutes, hours, weeks: numbers
        arguments passed to a ``datetime.timedelta`` giving the maximum age
        of a cached result.

    Returns
    -------
    cache_validation_callback: callable
        Function taking the metadata of a cached result and returning True
        while the time elapsed since ``metadata['time']`` is strictly
        smaller than the given duration.
    """
    # The timedelta is immutable, so its length in seconds is computed once.
    max_age_seconds = datetime.timedelta(
        weeks=weeks, days=days, hours=hours, minutes=minutes,
        seconds=seconds, milliseconds=milliseconds,
        microseconds=microseconds,
    ).total_seconds()

    def cache_validation_callback(metadata):
        elapsed = time.time() - metadata['time']
        return elapsed < max_age_seconds

    return cache_validation_callback