Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/joblib/memory.py: 20%
415 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-12 06:31 +0000
« prev ^ index » next coverage.py v7.3.2, created at 2023-12-12 06:31 +0000
1"""
2A context object for caching a function's return value each time it
3is called with the same input arguments.
5"""
7# Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org>
8# Copyright (c) 2009 Gael Varoquaux
9# License: BSD Style, 3 clauses.
12from __future__ import with_statement
13import logging
14import os
15from textwrap import dedent
16import time
17import pathlib
18import pydoc
19import re
20import functools
21import traceback
22import warnings
23import inspect
24import weakref
25from datetime import timedelta
27from tokenize import open as open_py_source
29# Local imports
30from . import hashing
31from .func_inspect import get_func_code, get_func_name, filter_args
32from .func_inspect import format_call
33from .func_inspect import format_signature
34from .logger import Logger, format_time, pformat
35from ._store_backends import StoreBackendBase, FileSystemStoreBackend
36from ._store_backends import CacheWarning # noqa
FIRST_LINE_TEXT = "# first line:"

# TODO: The following object should have a data store object as a sub
# object, and the interface to persist and query should be separated in
# the data store.
#
# This would enable creating 'Memory' objects with a different logic for
# pickling that would simply span a MemorizedFunc with the same
# store (or do we want to copy it to avoid cross-talks?), for instance to
# implement HDF5 pickling.

# TODO: Same remark for the logger, and probably use the Python logging
# mechanism.


def extract_first_line(func_code):
    """Split the leading first-line marker off stored function source.

    Returns a ``(source, first_line)`` pair, where ``first_line`` is the
    integer encoded in the ``FIRST_LINE_TEXT`` marker, or -1 when no
    marker is present.
    """
    if not func_code.startswith(FIRST_LINE_TEXT):
        return func_code, -1
    # The marker occupies the whole first line; everything after the first
    # newline is the actual function source.
    marker, _, remainder = func_code.partition('\n')
    first_line = int(marker[len(FIRST_LINE_TEXT):])
    return remainder, first_line
class JobLibCollisionWarning(UserWarning):
    """Warning emitted when two differently-defined functions may be
    sharing the same cache entry because their names collide.
    """
# Registry mapping backend names to StoreBackendBase subclasses; extended
# at runtime via register_store_backend().  'local' is the built-in
# filesystem backend.
_STORE_BACKENDS = {'local': FileSystemStoreBackend}
def register_store_backend(backend_name, backend):
    """Register a new store backend for use by Memory objects.

    The Memory, MemorizeResult and MemorizeFunc objects are agnostic to the
    type of store used behind them.  The local file system is the default,
    but this hook lets third parties plug in other storage types such as
    cloud object stores (S3, GCS, OpenStack, HadoopFS, etc.) or blob DBs.

    Parameters
    ----------
    backend_name: str
        The name identifying the store backend being registered. For example,
        'local' is used with FileSystemStoreBackend.
    backend: StoreBackendBase subclass
        The class implementing the StoreBackendBase interface.

    Raises
    ------
    ValueError
        If ``backend_name`` is not a string, or ``backend`` does not inherit
        StoreBackendBase.
    """
    if not isinstance(backend_name, str):
        raise ValueError(
            "Store backend name should be a string, "
            "'{0}' given.".format(backend_name))
    if backend is None or not issubclass(backend, StoreBackendBase):
        raise ValueError(
            "Store backend should inherit StoreBackendBase, "
            "'{0}' given.".format(backend))

    _STORE_BACKENDS[backend_name] = backend
def _store_backend_factory(backend, location, verbose=0, backend_options=None):
    """Return the configured store object for the given location.

    Parameters
    ----------
    backend: str
        Name of a registered store backend (e.g. 'local').
    location: StoreBackendBase, pathlib.Path, str or None
        Pre-built backends are returned untouched; paths/strings are used to
        configure a fresh backend instance; anything else yields None.
    verbose: int
        Verbosity forwarded to the backend.
    backend_options: dict or None
        Extra named parameters, some specific to the underlying backend.
    """
    if backend_options is None:
        backend_options = {}

    if isinstance(location, pathlib.Path):
        location = str(location)

    if isinstance(location, StoreBackendBase):
        # An already-instantiated backend is passed through as-is.
        return location

    if isinstance(location, str):
        location = os.path.expanduser(location)
        # Look up the requested backend among the registered ones.
        backend_cls = _STORE_BACKENDS.get(backend)
        if backend_cls is None:
            raise TypeError('Unknown location {0} or backend {1}'.format(
                location, backend))
        # The store backend is configured with the extra named parameters,
        # some of them are specific to the underlying store backend.
        store = backend_cls()
        store.configure(location, verbose=verbose,
                        backend_options=backend_options)
        return store

    if location is not None:
        warnings.warn(
            "Instantiating a backend using a {} as a location is not "
            "supported by joblib. Returning None instead.".format(
                location.__class__.__name__), UserWarning)

    return None
def _get_func_fullname(func):
    """Return a filesystem-like path built from the function's module
    components and its name."""
    modules, funcname = get_func_name(func)
    return os.path.join(*modules, funcname)
151def _build_func_identifier(func):
152 """Build a roughly unique identifier for the cached function."""
153 parts = []
154 if isinstance(func, str):
155 parts.append(func)
156 else:
157 parts.append(_get_func_fullname(func))
159 # We reuse historical fs-like way of building a function identifier
160 return os.path.join(*parts)
163def _format_load_msg(func_id, args_id, timestamp=None, metadata=None):
164 """ Helper function to format the message when loading the results.
165 """
166 signature = ""
167 try:
168 if metadata is not None:
169 args = ", ".join(['%s=%s' % (name, value)
170 for name, value
171 in metadata['input_args'].items()])
172 signature = "%s(%s)" % (os.path.basename(func_id), args)
173 else:
174 signature = os.path.basename(func_id)
175 except KeyError:
176 pass
178 if timestamp is not None:
179 ts_string = "{0: <16}".format(format_time(time.time() - timestamp))
180 else:
181 ts_string = ""
182 return '[Memory]{0}: Loading {1}'.format(ts_string, str(signature))
# An in-memory store to avoid looking at the disk-based function
# source code to check if a function definition has changed.
# WeakKeyDictionary so that cached functions can still be garbage
# collected; entries disappear with their function.
_FUNCTION_HASHES = weakref.WeakKeyDictionary()
###############################################################################
# class `MemorizedResult`
###############################################################################
class MemorizedResult(Logger):
    """Object representing a cached value.

    Attributes
    ----------
    location: str
        The location of joblib cache. Depends on the store backend used.

    func: function or str
        function whose output is cached. The string case is intended only for
        instantiation based on the output of repr() on another instance.
        (namely eval(repr(memorized_instance)) works).

    argument_hash: str
        hash of the function arguments.

    backend: str
        Type of store backend for reading/writing cache files.
        Default is 'local'.

    mmap_mode: {None, 'r+', 'r', 'w+', 'c'}
        The memmapping mode used when loading from cache numpy arrays. See
        numpy.load for the meaning of the different values.

    verbose: int
        verbosity level (0 means no message).

    timestamp, metadata: string
        for internal use only.
    """
    def __init__(self, location, func, args_id, backend='local',
                 mmap_mode=None, verbose=0, timestamp=None, metadata=None):
        Logger.__init__(self)
        self.func_id = _build_func_identifier(func)
        # When rebuilt from repr() output, func is already the identifier
        # string; for callables, expose the computed identifier instead.
        if isinstance(func, str):
            self.func = func
        else:
            self.func = self.func_id
        self.args_id = args_id
        # NOTE(review): assumes a usable (non-None) store backend; a None
        # location would make the get_metadata call below fail — confirm
        # callers always pass a real location.
        self.store_backend = _store_backend_factory(backend, location,
                                                    verbose=verbose)
        self.mmap_mode = mmap_mode

        # Metadata can be handed over by the caller (MemorizedFunc) to save
        # a backend round-trip; otherwise fetch it from the store.
        if metadata is not None:
            self.metadata = metadata
        else:
            self.metadata = self.store_backend.get_metadata(
                [self.func_id, self.args_id])

        self.duration = self.metadata.get('duration', None)
        self.verbose = verbose
        self.timestamp = timestamp

    @property
    def argument_hash(self):
        # Deprecated read-only alias of args_id, kept for backward
        # compatibility with pre-0.12 pickles/callers.
        warnings.warn(
            "The 'argument_hash' attribute has been deprecated in version "
            "0.12 and will be removed in version 0.14.\n"
            "Use `args_id` attribute instead.",
            DeprecationWarning, stacklevel=2)
        return self.args_id

    def get(self):
        """Read value from cache and return it.

        Raises
        ------
        KeyError
            If the stored payload cannot be deserialized (corrupted cache
            folder); the original backend ValueError is chained as cause.
        """
        if self.verbose:
            msg = _format_load_msg(self.func_id, self.args_id,
                                   timestamp=self.timestamp,
                                   metadata=self.metadata)
        else:
            msg = None

        try:
            return self.store_backend.load_item(
                [self.func_id, self.args_id], msg=msg, verbose=self.verbose)
        except ValueError as exc:
            # Re-raise as KeyError pointing at the offending cache folder.
            new_exc = KeyError(
                "Error while trying to load a MemorizedResult's value. "
                "It seems that this folder is corrupted : {}".format(
                    os.path.join(
                        self.store_backend.location, self.func_id,
                        self.args_id)
                ))
            raise new_exc from exc

    def clear(self):
        """Clear value from cache"""
        self.store_backend.clear_item([self.func_id, self.args_id])

    def __repr__(self):
        return ('{class_name}(location="{location}", func="{func}", '
                'args_id="{args_id}")'
                .format(class_name=self.__class__.__name__,
                        location=self.store_backend.location,
                        func=self.func,
                        args_id=self.args_id
                        ))

    def __getstate__(self):
        state = self.__dict__.copy()
        # Drop the timestamp so pickles do not embed wall-clock state.
        state['timestamp'] = None
        return state
class NotMemorizedResult(object):
    """In-memory stand-in for MemorizedResult used when caching is disabled.

    Simply holds the computed value instead of persisting it on disk.
    """
    __slots__ = ('value', 'valid')

    def __init__(self, value):
        self.value = value
        self.valid = True

    def get(self):
        """Return the stored value, or raise KeyError once cleared."""
        if not self.valid:
            raise KeyError("No value stored.")
        return self.value

    def clear(self):
        """Forget the stored value."""
        self.valid = False
        self.value = None

    def __repr__(self):
        if not self.valid:
            return self.__class__.__name__ + ' with no value'
        return '{class_name}({value})'.format(
            class_name=self.__class__.__name__,
            value=pformat(self.value))

    # __slots__ removes __dict__, so pickling needs explicit state handling.
    def __getstate__(self):
        return {"valid": self.valid, "value": self.value}

    def __setstate__(self, state):
        self.valid = state["valid"]
        self.value = state["value"]
###############################################################################
# class `NotMemorizedFunc`
###############################################################################
class NotMemorizedFunc(object):
    """Transparent stand-in for MemorizedFunc when caching is disabled.

    Mirrors the MemorizedFunc API but forwards every call straight to the
    wrapped function and never touches the disk.

    Attributes
    ----------
    func: callable
        Original undecorated function.
    """
    # Kept as lightweight as possible: this object sits on the hot call path.
    def __init__(self, func):
        self.func = func

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def call_and_shelve(self, *args, **kwargs):
        # Wrap the plain result so callers can uniformly use .get()/.clear().
        return NotMemorizedResult(self.func(*args, **kwargs))

    def __repr__(self):
        return '{0}(func={1})'.format(self.__class__.__name__, self.func)

    def clear(self, warn=True):
        # "warn" is accepted purely for compatibility with
        # MemorizedFunc.clear; there is nothing to clear here.
        pass

    def call(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def check_call_in_cache(self, *args, **kwargs):
        # Nothing is ever cached by this class.
        return False
###############################################################################
# class `MemorizedFunc`
###############################################################################
class MemorizedFunc(Logger):
    """Callable object decorating a function for caching its return value
    each time it is called.

    Methods are provided to inspect the cache or clean it.

    Attributes
    ----------
    func: callable
        The original, undecorated, function.

    location: string
        The location of joblib cache. Depends on the store backend used.

    backend: str
        Type of store backend for reading/writing cache files.
        Default is 'local', in which case the location is the path to a
        disk storage.

    ignore: list or None
        List of variable names to ignore when choosing whether to
        recompute.

    mmap_mode: {None, 'r+', 'r', 'w+', 'c'}
        The memmapping mode used when loading from cache
        numpy arrays. See numpy.load for the meaning of the different
        values.

    compress: boolean, or integer
        Whether to zip the stored data on disk. If an integer is
        given, it should be between 1 and 9, and sets the amount
        of compression. Note that compressed arrays cannot be
        read by memmapping.

    verbose: int, optional
        The verbosity flag, controls messages that are issued as
        the function is evaluated.

    cache_validation_callback: callable, optional
        Callable to check if a result in cache is valid or is to be recomputed.
        When the function is called with arguments for which a cache exists,
        the callback is called with the cache entry's metadata as its sole
        argument. If it returns True, the cached result is returned, else the
        cache for these arguments is cleared and the result is recomputed.
    """
    # ------------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------------

    def __init__(self, func, location, backend='local', ignore=None,
                 mmap_mode=None, compress=False, verbose=1, timestamp=None,
                 cache_validation_callback=None):
        Logger.__init__(self)
        self.mmap_mode = mmap_mode
        self.compress = compress
        self.func = func
        self.cache_validation_callback = cache_validation_callback

        if ignore is None:
            ignore = []
        self.ignore = ignore
        self._verbose = verbose

        # retrieve store object from backend type and location.
        self.store_backend = _store_backend_factory(backend, location,
                                                    verbose=verbose,
                                                    backend_options=dict(
                                                        compress=compress,
                                                        mmap_mode=mmap_mode),
                                                    )
        if self.store_backend is not None:
            # Create func directory on demand.
            self.store_backend.store_cached_func_code([
                _build_func_identifier(self.func)
            ])

        if timestamp is None:
            timestamp = time.time()
        self.timestamp = timestamp
        try:
            functools.update_wrapper(self, func)
        except Exception:
            " Objects like ufunc don't like that "
        if inspect.isfunction(func):
            doc = pydoc.TextDoc().document(func)
            # Remove blank line
            doc = doc.replace('\n', '\n\n', 1)
            # Strip backspace-overprints for compatibility with autodoc
            doc = re.sub('\x08.', '', doc)
        else:
            # Pydoc does a poor job on other objects
            doc = func.__doc__
        self.__doc__ = 'Memoized version of %s' % doc

        # Lazy caches for func_code_info, filled on first access.
        self._func_code_info = None
        self._func_code_id = None

    def _is_in_cache_and_valid(self, path):
        """Check if the function call is cached and valid for given arguments.

        - Compare the function code with the one from the cached function,
        asserting if it has changed.
        - Check if the function call is present in the cache.
        - Call `cache_validation_callback` for user define cache validation.

        Returns True if the function call is in cache and can be used, and
        returns False otherwise.
        """
        # Check if the code of the function has changed
        if not self._check_previous_func_code(stacklevel=4):
            return False

        # Check if this specific call is in the cache
        if not self.store_backend.contains_item(path):
            return False

        # Call the user defined cache validation callback
        metadata = self.store_backend.get_metadata(path)
        if (self.cache_validation_callback is not None and
                not self.cache_validation_callback(metadata)):
            # The callback rejected the entry: drop it so it is recomputed.
            self.store_backend.clear_item(path)
            return False

        return True

    def _cached_call(self, args, kwargs, shelving=False):
        """Call wrapped function and cache result, or read cache if available.

        This function returns the wrapped function output and some metadata.

        Arguments:
        ----------

        args, kwargs: list and dict
            input arguments for wrapped function

        shelving: bool
            True when called via the call_and_shelve function.


        Returns
        -------
        output: value or tuple or None
            Output of the wrapped function.
            If shelving is True and the call has been already cached,
            output is None.

        argument_hash: string
            Hash of function arguments.

        metadata: dict
            Some metadata about wrapped function call (see _persist_input()).
        """
        func_id, args_id = self._get_output_identifiers(*args, **kwargs)
        metadata = None
        msg = None

        # Whether or not the memorized function must be called
        must_call = False

        if self._verbose >= 20:
            # Very verbose mode: log a full description of the query.
            logging.basicConfig(level=logging.INFO)
            _, name = get_func_name(self.func)
            location = self.store_backend.get_cached_func_info([func_id])[
                'location']
            _, signature = format_signature(self.func, *args, **kwargs)

            self.info(
                dedent(
                    f"""
                    Querying {name} with signature
                    {signature}.

                    (argument hash {args_id})

                    The store location is {location}.
                    """
                )
            )

        # Compare the function code with the previous to see if the
        # function code has changed and check if the results are present in
        # the cache.
        if self._is_in_cache_and_valid([func_id, args_id]):
            try:
                t0 = time.time()
                if self._verbose:
                    msg = _format_load_msg(func_id, args_id,
                                           timestamp=self.timestamp,
                                           metadata=metadata)

                if not shelving:
                    # When shelving, we do not need to load the output
                    out = self.store_backend.load_item(
                        [func_id, args_id],
                        msg=msg,
                        verbose=self._verbose)
                else:
                    out = None

                if self._verbose > 4:
                    t = time.time() - t0
                    _, name = get_func_name(self.func)
                    msg = '%s cache loaded - %s' % (name, format_time(t))
                    print(max(0, (80 - len(msg))) * '_' + msg)
            except Exception:
                # XXX: Should use an exception logger
                # Any failure while reading the cache falls back to
                # recomputing (best-effort cache, never a hard failure).
                _, signature = format_signature(self.func, *args, **kwargs)
                self.warn('Exception while loading results for '
                          '{}\n {}'.format(signature, traceback.format_exc()))

                must_call = True
        else:
            if self._verbose > 10:
                _, name = get_func_name(self.func)
                self.warn('Computing func {0}, argument hash {1} '
                          'in location {2}'
                          .format(name, args_id,
                                  self.store_backend.
                                  get_cached_func_info([func_id])['location']))
            must_call = True

        if must_call:
            out, metadata = self.call(*args, **kwargs)
            if self.mmap_mode is not None:
                # Memmap the output at the first call to be consistent with
                # later calls
                if self._verbose:
                    msg = _format_load_msg(func_id, args_id,
                                           timestamp=self.timestamp,
                                           metadata=metadata)
                out = self.store_backend.load_item([func_id, args_id], msg=msg,
                                                   verbose=self._verbose)

        return (out, args_id, metadata)

    @property
    def func_code_info(self):
        # 3-tuple property containing: the function source code, source file,
        # and first line of the code inside the source file
        if hasattr(self.func, '__code__'):
            if self._func_code_id is None:
                self._func_code_id = id(self.func.__code__)
            elif id(self.func.__code__) != self._func_code_id:
                # Be robust to dynamic reassignments of self.func.__code__
                self._func_code_info = None

        if self._func_code_info is None:
            # Cache the source code of self.func . Provided that get_func_code
            # (which should be called once on self) gets called in the process
            # in which self.func was defined, this caching mechanism prevents
            # undesired cache clearing when the cached function is called in
            # an environment where the introspection utilities get_func_code
            # relies on do not work (typically, in joblib child processes).
            # See #1035 for more info
            # TODO (pierreglaser): do the same with get_func_name?
            self._func_code_info = get_func_code(self.func)
        return self._func_code_info

    def call_and_shelve(self, *args, **kwargs):
        """Call wrapped function, cache result and return a reference.

        This method returns a reference to the cached result instead of the
        result itself. The reference object is small and pickeable, allowing
        to send or store it easily. Call .get() on reference object to get
        result.

        Returns
        -------
        cached_result: MemorizedResult or NotMemorizedResult
            reference to the value returned by the wrapped function. The
            class "NotMemorizedResult" is used when there is no cache
            activated (e.g. location=None in Memory).
        """
        # shelving=True skips loading the output from the store.
        _, args_id, metadata = self._cached_call(args, kwargs, shelving=True)
        return MemorizedResult(self.store_backend, self.func, args_id,
                               metadata=metadata, verbose=self._verbose - 1,
                               timestamp=self.timestamp)

    def __call__(self, *args, **kwargs):
        # Only the output is returned; args_id and metadata are dropped.
        return self._cached_call(args, kwargs)[0]

    def __getstate__(self):
        # Make sure self.func's source is introspected prior to being pickled -
        # code introspection utilities typically do not work inside child
        # processes
        _ = self.func_code_info

        # We don't store the timestamp when pickling, to avoid the hash
        # depending from it.
        state = self.__dict__.copy()
        state['timestamp'] = None

        # Invalidate the code id as id(obj) will be different in the child
        state['_func_code_id'] = None

        return state

    def check_call_in_cache(self, *args, **kwargs):
        """Check if function call is in the memory cache.

        Does not call the function or do any work besides func inspection
        and arg hashing.

        Returns
        -------
        is_call_in_cache: bool
            Whether or not the result of the function has been cached
            for the input arguments that have been passed.
        """
        func_id, args_id = self._get_output_identifiers(*args, **kwargs)
        # NOTE(review): other call sites pass a list here; the tuple appears
        # accepted by the backend — confirm contains_item handles both.
        return self.store_backend.contains_item((func_id, args_id))

    # ------------------------------------------------------------------------
    # Private interface
    # ------------------------------------------------------------------------

    def _get_argument_hash(self, *args, **kwargs):
        # Hash the call arguments after removing the ignored ones; memmapped
        # arrays are coerced so the hash does not depend on the mmap status.
        return hashing.hash(filter_args(self.func, self.ignore, args, kwargs),
                            coerce_mmap=(self.mmap_mode is not None))

    def _get_output_identifiers(self, *args, **kwargs):
        """Return the func identifier and input parameter hash of a result."""
        func_id = _build_func_identifier(self.func)
        argument_hash = self._get_argument_hash(*args, **kwargs)
        return func_id, argument_hash

    def _hash_func(self):
        """Hash a function to key the online cache"""
        # Triple of (identity, hash, code hash): more likely to falsely
        # change than to collide, which is the safe direction here.
        func_code_h = hash(getattr(self.func, '__code__', None))
        return id(self.func), hash(self.func), func_code_h

    def _write_func_code(self, func_code, first_line):
        """ Write the function code and the filename to a file.
        """
        # We store the first line because the filename and the function
        # name is not always enough to identify a function: people
        # sometimes have several functions named the same way in a
        # file. This is bad practice, but joblib should be robust to bad
        # practice.
        func_id = _build_func_identifier(self.func)
        func_code = u'%s %i\n%s' % (FIRST_LINE_TEXT, first_line, func_code)
        self.store_backend.store_cached_func_code([func_id], func_code)

        # Also store in the in-memory store of function hashes
        is_named_callable = False
        is_named_callable = (hasattr(self.func, '__name__') and
                             self.func.__name__ != '<lambda>')
        if is_named_callable:
            # Don't do this for lambda functions or strange callable
            # objects, as it ends up being too fragile
            func_hash = self._hash_func()
            try:
                _FUNCTION_HASHES[self.func] = func_hash
            except TypeError:
                # Some callable are not hashable
                pass

    def _check_previous_func_code(self, stacklevel=2):
        """
            stacklevel is the depth a which this function is called, to
            issue useful warnings to the user.
        """
        # First check if our function is in the in-memory store.
        # Using the in-memory store not only makes things faster, but it
        # also renders us robust to variations of the files when the
        # in-memory version of the code does not vary
        try:
            if self.func in _FUNCTION_HASHES:
                # We use as an identifier the id of the function and its
                # hash. This is more likely to falsely change than have hash
                # collisions, thus we are on the safe side.
                func_hash = self._hash_func()
                if func_hash == _FUNCTION_HASHES[self.func]:
                    return True
        except TypeError:
            # Some callables are not hashable
            pass

        # Here, we go through some effort to be robust to dynamically
        # changing code and collision. We cannot inspect.getsource
        # because it is not reliable when using IPython's magic "%run".
        func_code, source_file, first_line = self.func_code_info
        func_id = _build_func_identifier(self.func)

        try:
            old_func_code, old_first_line =\
                extract_first_line(
                    self.store_backend.get_cached_func_code([func_id]))
        except (IOError, OSError):  # some backend can also raise OSError
            # Nothing stored yet (or unreadable): record the current code.
            self._write_func_code(func_code, first_line)
            return False
        if old_func_code == func_code:
            return True

        # We have differing code, is this because we are referring to
        # different functions, or because the function we are referring to has
        # changed?

        _, func_name = get_func_name(self.func, resolv_alias=False,
                                     win_characters=False)
        if old_first_line == first_line == -1 or func_name == '<lambda>':
            if not first_line == -1:
                func_description = ("{0} ({1}:{2})"
                                    .format(func_name, source_file,
                                            first_line))
            else:
                func_description = func_name
            warnings.warn(JobLibCollisionWarning(
                "Cannot detect name collisions for function '{0}'"
                .format(func_description)), stacklevel=stacklevel)

        # Fetch the code at the old location and compare it. If it is the
        # same than the code store, we have a collision: the code in the
        # file has not changed, but the name we have is pointing to a new
        # code block.
        if not old_first_line == first_line and source_file is not None:
            possible_collision = False
            if os.path.exists(source_file):
                _, func_name = get_func_name(self.func, resolv_alias=False)
                num_lines = len(func_code.split('\n'))
                with open_py_source(source_file) as f:
                    on_disk_func_code = f.readlines()[
                        old_first_line - 1:old_first_line - 1 + num_lines - 1]
                on_disk_func_code = ''.join(on_disk_func_code)
                possible_collision = (on_disk_func_code.rstrip() ==
                                      old_func_code.rstrip())
            else:
                possible_collision = source_file.startswith('<doctest ')
            if possible_collision:
                warnings.warn(JobLibCollisionWarning(
                    'Possible name collisions between functions '
                    "'%s' (%s:%i) and '%s' (%s:%i)" %
                    (func_name, source_file, old_first_line,
                     func_name, source_file, first_line)),
                    stacklevel=stacklevel)

        # The function has changed, wipe the cache directory.
        # XXX: Should be using warnings, and giving stacklevel
        if self._verbose > 10:
            _, func_name = get_func_name(self.func, resolv_alias=False)
            self.warn("Function {0} (identified by {1}) has changed"
                      ".".format(func_name, func_id))
        self.clear(warn=True)
        return False

    def clear(self, warn=True):
        """Empty the function's cache."""
        func_id = _build_func_identifier(self.func)

        if self._verbose > 0 and warn:
            self.warn("Clearing function cache identified by %s" % func_id)
        self.store_backend.clear_path([func_id, ])

        # Re-record the current source so the (now empty) cache directory
        # stays associated with this version of the function.
        func_code, _, first_line = self.func_code_info
        self._write_func_code(func_code, first_line)

    def call(self, *args, **kwargs):
        """Force the execution of the function with the given arguments.

        The output values will be persisted, i.e., the cache will be updated
        with any new values.

        Parameters
        ----------
        *args: arguments
            The arguments.
        **kwargs: keyword arguments
            Keyword arguments.

        Returns
        -------
        output : object
            The output of the function call.
        metadata : dict
            The metadata associated with the call.
        """
        start_time = time.time()
        func_id, args_id = self._get_output_identifiers(*args, **kwargs)
        if self._verbose > 0:
            print(format_call(self.func, args, kwargs))
        output = self.func(*args, **kwargs)
        self.store_backend.dump_item(
            [func_id, args_id], output, verbose=self._verbose)

        duration = time.time() - start_time
        metadata = self._persist_input(duration, args, kwargs)

        if self._verbose > 0:
            _, name = get_func_name(self.func)
            msg = '%s - %s' % (name, format_time(duration))
            print(max(0, (80 - len(msg))) * '_' + msg)
        return output, metadata

    def _persist_input(self, duration, args, kwargs, this_duration_limit=0.5):
        """ Save a small summary of the call using json format in the
            output directory.

            output_dir: string
                directory where to write metadata.

            duration: float
                time taken by hashing input arguments, calling the wrapped
                function and persisting its output.

            args, kwargs: list and dict
                input arguments for wrapped function

            this_duration_limit: float
                Max execution time for this function before issuing a warning.
        """
        start_time = time.time()
        argument_dict = filter_args(self.func, self.ignore,
                                    args, kwargs)

        input_repr = dict((k, repr(v)) for k, v in argument_dict.items())
        # This can fail due to race-conditions with multiple
        # concurrent joblibs removing the file or the directory
        metadata = {
            "duration": duration, "input_args": input_repr, "time": start_time,
        }

        func_id, args_id = self._get_output_identifiers(*args, **kwargs)
        self.store_backend.store_metadata([func_id, args_id], metadata)

        this_duration = time.time() - start_time
        if this_duration > this_duration_limit:
            # This persistence should be fast. It will not be if repr() takes
            # time and its output is large, because json.dump will have to
            # write a large file. This should not be an issue with numpy arrays
            # for which repr() always output a short representation, but can
            # be with complex dictionaries. Fixing the problem should be a
            # matter of replacing repr() above by something smarter.
            # NOTE(review): the message below lacks a space after "run." —
            # left byte-identical here since it is a runtime string.
            warnings.warn("Persisting input arguments took %.2fs to run."
                          "If this happens often in your code, it can cause "
                          "performance problems "
                          "(results will be correct in all cases). "
                          "The reason for this is probably some large input "
                          "arguments for a wrapped function."
                          % this_duration, stacklevel=5)
        return metadata

    # ------------------------------------------------------------------------
    # Private `object` interface
    # ------------------------------------------------------------------------

    def __repr__(self):
        return '{class_name}(func={func}, location={location})'.format(
            class_name=self.__class__.__name__,
            func=self.func,
            location=self.store_backend.location,)
928###############################################################################
929# class `Memory`
930###############################################################################
class Memory(Logger):
    """ A context object for caching a function's return value each time it
        is called with the same input arguments.

        All values are cached on the filesystem, in a deep directory
        structure.

        Read more in the :ref:`User Guide <memory>`.

        Parameters
        ----------
        location: str, pathlib.Path or None
            Base directory of the data store. If None, no caching is done
            and the Memory object is completely transparent. This option
            replaces cachedir since version 0.12.

        backend: str, optional
            Type of store backend for reading/writing cache files.
            Default: 'local', which uses regular filesystem operations
            (open, mv, etc) to manipulate data in the backend.

        mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional
            The memmapping mode used when loading numpy arrays from the
            cache. See numpy.load for the meaning of the arguments.

        compress: boolean, or integer, optional
            Whether to zip the stored data on disk. If an integer is
            given, it should be between 1 and 9 and sets the amount of
            compression. Note that compressed arrays cannot be read by
            memmapping.

        verbose: int, optional
            Verbosity flag, controls the debug messages that are issued
            as functions are evaluated.

        bytes_limit: int | str, optional
            Limit in bytes of the size of the cache. By default the size
            of the cache is unlimited. When reducing the size of the
            cache, ``joblib`` keeps the most recently accessed items
            first. If a str is passed, it is converted to a number of
            bytes using units { K | M | G} for kilo, mega, giga.

            **Note:** You need to call :meth:`joblib.Memory.reduce_size`
            to actually reduce the cache size to be less than
            ``bytes_limit``.

            **Note:** This argument has been deprecated. One should give
            the value of ``bytes_limit`` directly in
            :meth:`joblib.Memory.reduce_size`.

        backend_options: dict, optional
            Contains a dictionary of named parameters used to configure
            the store backend.
    """
    # ------------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------------

    def __init__(self, location=None, backend='local',
                 mmap_mode=None, compress=False, verbose=1, bytes_limit=None,
                 backend_options=None):
        Logger.__init__(self)
        self._verbose = verbose
        self.mmap_mode = mmap_mode
        self.timestamp = time.time()
        if bytes_limit is not None:
            # Kept only for backward compatibility; the limit now belongs
            # to reduce_size().
            warnings.warn(
                "bytes_limit argument has been deprecated. It will be removed "
                "in version 1.5. Please pass its value directly to "
                "Memory.reduce_size.",
                category=DeprecationWarning
            )
        self.bytes_limit = bytes_limit
        self.backend = backend
        self.compress = compress
        self.backend_options = {} if backend_options is None else backend_options

        if compress and mmap_mode is not None:
            warnings.warn('Compressed results cannot be memmapped',
                          stacklevel=2)

        self.location = location
        if isinstance(location, str):
            # Namespace the cache under a 'joblib' subdirectory.
            location = os.path.join(location, 'joblib')

        self.store_backend = _store_backend_factory(
            backend, location, verbose=self._verbose,
            backend_options=dict(compress=compress, mmap_mode=mmap_mode,
                                 **self.backend_options))

    def cache(self, func=None, ignore=None, verbose=None, mmap_mode=False,
              cache_validation_callback=None):
        """ Decorates the given function func to only compute its return
            value for input arguments not cached on disk.

            Parameters
            ----------
            func: callable, optional
                The function to be decorated
            ignore: list of strings
                A list of arguments name to ignore in the hashing
            verbose: integer, optional
                The verbosity mode of the function. By default that
                of the memory object is used.
            mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional
                The memmapping mode used when loading from cache numpy
                arrays. See numpy.load for the meaning of the arguments.
                By default that of the memory object is used.
            cache_validation_callback: callable, optional
                Callable to validate whether or not the cache is valid.
                When the cached function is called with arguments for
                which a cache exists, this callable is called with the
                metadata of the cached result as its sole argument. If it
                returns True, the cached result is returned, else the
                cache for these arguments is cleared and recomputed.

            Returns
            -------
            decorated_func: MemorizedFunc object
                The returned object is a MemorizedFunc object, that is
                callable (behaves like a function), but offers extra
                methods for cache lookup and management. See the
                documentation for :class:`joblib.memory.MemorizedFunc`.
        """
        # Validate the callback up front so bad values fail even in the
        # deferred (decorator-factory) path below.
        if (cache_validation_callback is not None and
                not callable(cache_validation_callback)):
            raise ValueError(
                "cache_validation_callback needs to be callable. "
                f"Got {cache_validation_callback}."
            )

        if func is None:
            # Used as ``@memory.cache(...)``: return a partial that waits
            # for the function itself.
            return functools.partial(
                self.cache, ignore=ignore,
                mmap_mode=mmap_mode,
                verbose=verbose,
                cache_validation_callback=cache_validation_callback
            )

        if self.store_backend is None:
            # Caching disabled: wrap transparently, compute every call.
            return NotMemorizedFunc(func)

        verbose = self._verbose if verbose is None else verbose
        if mmap_mode is False:
            # ``False`` is the "not provided" sentinel (``None`` is a
            # valid explicit mode); fall back to the memory-level value.
            mmap_mode = self.mmap_mode
        if isinstance(func, MemorizedFunc):
            # Avoid double-wrapping an already memorized function.
            func = func.func

        return MemorizedFunc(
            func, location=self.store_backend, backend=self.backend,
            ignore=ignore, mmap_mode=mmap_mode, compress=self.compress,
            verbose=verbose, timestamp=self.timestamp,
            cache_validation_callback=cache_validation_callback
        )

    def clear(self, warn=True):
        """ Erase the complete cache directory.
        """
        if warn:
            self.warn('Flushing completely the cache')
        if self.store_backend is not None:
            self.store_backend.clear()

        # The on-disk store is gone, so the in-memory _FUNCTION_HASHES
        # table must be reset as well. Otherwise, for a function already
        # present in that table, results cached after this clear would
        # hit cache misses because the function code is not re-written.
        _FUNCTION_HASHES.clear()

    def reduce_size(self, bytes_limit=None, items_limit=None, age_limit=None):
        """Remove cache elements to make the cache fit its limits.

        The limitation can impose that the cache size fits in
        ``bytes_limit``, that the number of cache items is no more than
        ``items_limit``, and that all files in cache are not older than
        ``age_limit``.

        Parameters
        ----------
        bytes_limit: int | str, optional
            Limit in bytes of the size of the cache. By default the size
            of the cache is unlimited. When reducing the size of the
            cache, ``joblib`` keeps the most recently accessed items
            first. If a str is passed, it is converted to a number of
            bytes using units { K | M | G} for kilo, mega, giga.

        items_limit: int, optional
            Number of items to limit the cache to. By default, the
            number of items in the cache is unlimited. When reducing the
            size of the cache, ``joblib`` keeps the most recently
            accessed items first.

        age_limit: datetime.timedelta, optional
            Maximum age of items to limit the cache to. When reducing
            the size of the cache, any items last accessed more than the
            given length of time ago are deleted.
        """
        if bytes_limit is None:
            # Fall back to the (deprecated) constructor-level limit.
            bytes_limit = self.bytes_limit

        if self.store_backend is None:
            # No cached results: this function does nothing.
            return
        if all(lim is None for lim in (bytes_limit, items_limit, age_limit)):
            # No limitation to impose, returning.
            return

        # Defer the actual limit enforcement to the store backend.
        self.store_backend.enforce_store_limits(
            bytes_limit, items_limit, age_limit
        )

    def eval(self, func, *args, **kwargs):
        """ Eval function func with arguments `*args` and `**kwargs`,
            in the context of the memory.

            This method works similarly to the builtin `apply`, except
            that the function is called only if the cache is not
            up to date.
        """
        if self.store_backend is None:
            return func(*args, **kwargs)
        return self.cache(func)(*args, **kwargs)

    # ------------------------------------------------------------------------
    # Private `object` interface
    # ------------------------------------------------------------------------

    def __repr__(self):
        backend = self.store_backend
        location = None if backend is None else backend.location
        return f'{self.__class__.__name__}(location={location})'

    def __getstate__(self):
        """ We don't store the timestamp when pickling, to avoid the hash
            depending from it.
        """
        state = self.__dict__.copy()
        state['timestamp'] = None
        return state
1177###############################################################################
1178# cache_validation_callback helpers
1179###############################################################################
def expires_after(days=0, seconds=0, microseconds=0, milliseconds=0, minutes=0,
                  hours=0, weeks=0):
    """Helper cache_validation_callback to force recompute after a duration.

    Parameters
    ----------
    days, seconds, microseconds, milliseconds, minutes, hours, weeks: numbers
        argument passed to a timedelta.
    """
    # Collapse the timedelta to a plain float once; the closure only
    # needs the cutoff in seconds.
    max_age = timedelta(
        days=days, seconds=seconds, microseconds=microseconds,
        milliseconds=milliseconds, minutes=minutes, hours=hours, weeks=weeks
    ).total_seconds()

    def cache_validation_callback(metadata):
        """Return True while the cached result is younger than the cutoff."""
        return time.time() - metadata['time'] < max_age

    return cache_validation_callback