Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/joblib/memory.py: 24%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2A context object for caching a function's return value each time it
3is called with the same input arguments.
5"""
7# Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org>
8# Copyright (c) 2009 Gael Varoquaux
9# License: BSD Style, 3 clauses.
11import asyncio
12import datetime
13import functools
14import inspect
15import logging
16import os
17import pathlib
18import pydoc
19import re
20import textwrap
21import time
22import tokenize
23import traceback
24import warnings
25import weakref
27from . import hashing
28from ._store_backends import (
29 CacheWarning, # noqa
30 FileSystemStoreBackend,
31 StoreBackendBase,
32)
33from .func_inspect import (
34 filter_args,
35 format_call,
36 format_signature,
37 get_func_code,
38 get_func_name,
39)
40from .logger import Logger, format_time, pformat
# Marker prefixing the header line that records, in stored function code,
# the line number at which the function starts in its source file.
FIRST_LINE_TEXT = "# first line:"

# TODO: The following object should have a data store object as a sub
# object, and the interface to persist and query should be separated in
# the data store.
#
# This would enable creating 'Memory' objects with a different logic for
# pickling that would simply span a MemorizedFunc with the same
# store (or do we want to copy it to avoid cross-talks?), for instance to
# implement HDF5 pickling.

# TODO: Same remark for the logger, and probably use the Python logging
# mechanism.


def extract_first_line(func_code):
    """Split stored function code into its body and first-line number.

    Parameters
    ----------
    func_code: str
        Function code text, optionally starting with a header line of the
        form ``FIRST_LINE_TEXT`` followed by an integer.

    Returns
    -------
    func_code: str
        The code without the header line (unchanged when there is no
        header).
    first_line: int
        The line number read from the header, or -1 when there is no header
        or when the header does not hold a valid integer.
    """
    if not func_code.startswith(FIRST_LINE_TEXT):
        return func_code, -1

    header, _, body = func_code.partition("\n")
    try:
        first_line = int(header[len(FIRST_LINE_TEXT):])
    except ValueError:
        # A truncated or corrupted header must not crash the caller: treat
        # the first line as unknown, exactly like code stored without header.
        first_line = -1
    return body, first_line
class JobLibCollisionWarning(UserWarning):
    """Warning category signalling a possible name collision between functions."""
# Registry of the available store backends: maps a backend name to the store
# backend class instantiated for it. Only 'local' is defined here;
# register_store_backend() adds further entries.
_STORE_BACKENDS = {"local": FileSystemStoreBackend}
def register_store_backend(backend_name, backend):
    """Extend available store backends.

    The Memory, MemorizeResult and MemorizeFunc objects are designed to be
    agnostic to the type of store used behind. By default, the local file
    system is used but this function gives the possibility to extend joblib's
    memory pattern with other types of storage such as cloud storage (S3, GCS,
    OpenStack, HadoopFS, etc) or blob DBs.

    Parameters
    ----------
    backend_name: str
        The name identifying the store backend being registered. For example,
        'local' is used with FileSystemStoreBackend.
    backend: StoreBackendBase subclass
        A class (not an instance) that implements the StoreBackendBase
        interface.

    Raises
    ------
    ValueError
        If ``backend_name`` is not a string, or if ``backend`` is not a
        class deriving from StoreBackendBase.
    """
    if not isinstance(backend_name, str):
        raise ValueError(
            "Store backend name should be a string, '{0}' given.".format(backend_name)
        )
    # issubclass() raises TypeError when its first argument is not a class,
    # so non-class objects (None, instances, strings...) are rejected first
    # to report every invalid backend with the same ValueError.
    if not isinstance(backend, type) or not issubclass(backend, StoreBackendBase):
        raise ValueError(
            "Store backend should inherit StoreBackendBase, '{0}' given.".format(
                backend
            )
        )

    _STORE_BACKENDS[backend_name] = backend
def _store_backend_factory(backend, location, verbose=0, backend_options=None):
    """Return the store backend object to use for ``location``.

    - A StoreBackendBase instance given as ``location`` is returned as is.
    - A str (or pathlib.Path, converted to str) makes the class registered
      under the name ``backend`` be instantiated and configured with the
      user-expanded location; TypeError is raised when no registered name
      matches.
    - None yields None; any other type yields None after a UserWarning.
    """
    options = {} if backend_options is None else backend_options

    if isinstance(location, pathlib.Path):
        location = str(location)

    # An already built store backend is handed back untouched.
    if isinstance(location, StoreBackendBase):
        return location

    if not isinstance(location, str):
        if location is not None:
            warnings.warn(
                "Instantiating a backend using a {} as a location is not "
                "supported by joblib. Returning None instead.".format(
                    location.__class__.__name__
                ),
                UserWarning,
            )
        return None

    location = os.path.expanduser(location)

    # Look among the registered backends for the one matching the requested
    # backend name.
    store = None
    for registered_name, store_cls in _STORE_BACKENDS.items():
        if backend == registered_name:
            store = store_cls()

    # No registered backend carries that name: this is an error.
    if store is None:
        raise TypeError(
            "Unknown location {0} or backend {1}".format(location, backend)
        )

    # The store backend is configured with the extra named parameters,
    # some of them are specific to the underlying store backend.
    store.configure(location, verbose=verbose, backend_options=options)
    return store
def _build_func_identifier(func):
    """Return a roughly unique, path-like identifier for the cached function.

    The identifier joins the module parts and the function name reported by
    get_func_name with the OS path separator (historical fs-like layout).
    """
    module_parts, func_name = get_func_name(func)
    path_parts = [*module_parts, func_name]
    return os.path.join(*path_parts)
# An in-memory store to avoid looking at the disk-based function
# source code to check if a function definition has changed.
# Keys are the cached function objects, held through weak references
# (WeakKeyDictionary); values are the tuples built by
# MemorizedFunc._hash_func when the function code was last written.
_FUNCTION_HASHES = weakref.WeakKeyDictionary()
164###############################################################################
165# class `MemorizedResult`
166###############################################################################
class MemorizedResult(Logger):
    """Reference to a value held in a joblib cache.

    Parameters
    ----------
    location: str, pathlib.Path or store backend object
        Where the cache lives; handed to the store backend factory.

    call_id: sequence of two items
        Identifier of the cached call: the function identifier followed by
        the identifier (hash) of its arguments.

    backend: str
        Type of store backend for reading/writing cache files.
        Default is 'local'.

    mmap_mode: {None, 'r+', 'r', 'w+', 'c'}
        The memmapping mode used when loading from cache numpy arrays. See
        numpy.load for the meaning of the different values.

    verbose: int
        verbosity level (0 means no message).

    timestamp, metadata:
        for internal use only. When metadata is None it is read from the
        store backend.
    """

    def __init__(
        self,
        location,
        call_id,
        backend="local",
        mmap_mode=None,
        verbose=0,
        timestamp=None,
        metadata=None,
    ):
        Logger.__init__(self)
        self._call_id = call_id
        self.store_backend = _store_backend_factory(
            backend,
            location,
            verbose=verbose,
            backend_options={"mmap_mode": mmap_mode},
        )
        self.mmap_mode = mmap_mode

        # Fall back on the metadata recorded in the store when none is given.
        if metadata is None:
            metadata = self.store_backend.get_metadata(self._call_id)
        self.metadata = metadata

        self.duration = self.metadata.get("duration", None)
        self.verbose = verbose
        self.timestamp = timestamp

    @property
    def func(self):
        """Alias of ``func_id``."""
        return self.func_id

    @property
    def func_id(self):
        """First item of the call identifier: the function identifier."""
        return self._call_id[0]

    @property
    def args_id(self):
        """Second item of the call identifier: the arguments identifier."""
        return self._call_id[1]

    def get(self):
        """Load the value from the cache and return it.

        A ValueError raised while loading is re-raised as a KeyError
        (chained to the original error) naming the suspected cache folder.
        """
        backend = self.store_backend
        try:
            return backend.load_item(
                self._call_id,
                timestamp=self.timestamp,
                metadata=self.metadata,
                verbose=self.verbose,
            )
        except ValueError as exc:
            raise KeyError(
                "Error while trying to load a MemorizedResult's value. "
                "It seems that this folder is corrupted : {}".format(
                    os.path.join(self.store_backend.location, *self._call_id)
                )
            ) from exc

    def clear(self):
        """Remove the value from the cache."""
        self.store_backend.clear_item(self._call_id)

    def __repr__(self):
        cls_name = self.__class__.__name__
        return '{}(location="{}", func="{}", args_id="{}")'.format(
            cls_name, self.store_backend.location, *self._call_id
        )

    def __getstate__(self):
        # The timestamp is not part of the pickled state.
        return {**self.__dict__, "timestamp": None}
class NotMemorizedResult:
    """Plain holder for a value, standing in for MemorizedResult.

    Used when there is no cache: the value is simply kept on the instance
    until ``clear`` is called.
    """

    __slots__ = ("value", "valid")

    def __init__(self, value):
        self.value = value
        self.valid = True

    def get(self):
        """Return the held value, or raise KeyError once it was cleared."""
        if not self.valid:
            raise KeyError("No value stored.")
        return self.value

    def clear(self):
        """Drop the held value and mark the holder as invalid."""
        self.value = None
        self.valid = False

    def __repr__(self):
        cls_name = self.__class__.__name__
        if not self.valid:
            return cls_name + " with no value"
        return "{class_name}({value})".format(
            class_name=cls_name, value=pformat(self.value)
        )

    # __getstate__ and __setstate__ are required because of __slots__
    def __getstate__(self):
        return dict(valid=self.valid, value=self.value)

    def __setstate__(self, state):
        self.valid, self.value = state["valid"], state["value"]
311###############################################################################
312# class `NotMemorizedFunc`
313###############################################################################
class NotMemorizedFunc:
    """Transparent wrapper around a function, used when there is no cache.

    It mirrors the MemorizedFunc API without writing anything on disk.

    Attributes
    ----------
    func: callable
        Original undecorated function.
    """

    # Kept as light as possible (for speed)
    def __init__(self, func):
        self.func = func

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def call_and_shelve(self, *args, **kwargs):
        """Call the function and wrap its output in a NotMemorizedResult."""
        value = self.func(*args, **kwargs)
        return NotMemorizedResult(value)

    def __repr__(self):
        cls_name = self.__class__.__name__
        return "{0}(func={1})".format(cls_name, self.func)

    def clear(self, warn=True):
        """Do nothing; ``warn`` only mirrors MemorizedFunc.clear."""
        return None

    def call(self, *args, **kwargs):
        """Call the function; return its output and an empty metadata dict."""
        output = self.func(*args, **kwargs)
        return output, {}

    def check_call_in_cache(self, *args, **kwargs):
        """Always False: nothing is ever cached."""
        return False
350###############################################################################
351# class `AsyncNotMemorizedFunc`
352###############################################################################
class AsyncNotMemorizedFunc(NotMemorizedFunc):
    """NotMemorizedFunc variant whose ``call_and_shelve`` awaits the function."""

    async def call_and_shelve(self, *args, **kwargs):
        """Await the wrapped function and wrap its result in a NotMemorizedResult."""
        value = await self.func(*args, **kwargs)
        return NotMemorizedResult(value)
358###############################################################################
359# class `MemorizedFunc`
360###############################################################################
class MemorizedFunc(Logger):
    """Callable object decorating a function for caching its return value
    each time it is called.

    Methods are provided to inspect the cache or clean it.

    Attributes
    ----------
    func: callable
        The original, undecorated, function.

    location: string
        The location of joblib cache. Depends on the store backend used.

    backend: str
        Type of store backend for reading/writing cache files.
        Default is 'local', in which case the location is the path to a
        disk storage.

    ignore: list or None
        List of variable names to ignore when choosing whether to
        recompute.

    mmap_mode: {None, 'r+', 'r', 'w+', 'c'}
        The memmapping mode used when loading from cache
        numpy arrays. See numpy.load for the meaning of the different
        values.

    compress: boolean, or integer
        Whether to zip the stored data on disk. If an integer is
        given, it should be between 1 and 9, and sets the amount
        of compression. Note that compressed arrays cannot be
        read by memmapping.

    verbose: int, optional
        The verbosity flag, controls messages that are issued as
        the function is evaluated.

    cache_validation_callback: callable, optional
        Callable to check if a result in cache is valid or is to be recomputed.
        When the function is called with arguments for which a cache exists,
        the callback is called with the cache entry's metadata as its sole
        argument. If it returns True, the cached result is returned, else the
        cache for these arguments is cleared and the result is recomputed.
    """

    # ------------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------------

    def __init__(
        self,
        func,
        location,
        backend="local",
        ignore=None,
        mmap_mode=None,
        compress=False,
        verbose=1,
        timestamp=None,
        cache_validation_callback=None,
    ):
        Logger.__init__(self)
        self.mmap_mode = mmap_mode
        self.compress = compress
        self.func = func
        self.cache_validation_callback = cache_validation_callback
        self.func_id = _build_func_identifier(func)
        self.ignore = ignore if ignore is not None else []
        self._verbose = verbose

        # retrieve store object from backend type and location.
        self.store_backend = _store_backend_factory(
            backend,
            location,
            verbose=verbose,
            backend_options=dict(compress=compress, mmap_mode=mmap_mode),
        )
        if self.store_backend is not None:
            # Create func directory on demand.
            self.store_backend.store_cached_func_code([self.func_id])

        self.timestamp = timestamp if timestamp is not None else time.time()
        try:
            functools.update_wrapper(self, func)
        except Exception:
            pass  # Objects like ufunc don't like that
        if inspect.isfunction(func):
            doc = pydoc.TextDoc().document(func)
            # Insert a blank line after the first line of the rendered doc
            doc = doc.replace("\n", "\n\n", 1)
            # Strip backspace-overprints for compatibility with autodoc
            doc = re.sub("\x08.", "", doc)
        else:
            # Pydoc does a poor job on other objects
            doc = func.__doc__
        self.__doc__ = "Memoized version of %s" % doc

        # Lazily filled cache of get_func_code(self.func) and the id of the
        # code object it was computed for (see the func_code_info property).
        self._func_code_info = None
        self._func_code_id = None

    def _is_in_cache_and_valid(self, call_id):
        """Check if the function call is cached and valid for given arguments.

        - Compare the function code with the one from the cached function,
          asserting if it has changed.
        - Check if the function call is present in the cache.
        - Call `cache_validation_callback` for user define cache validation.

        Returns True if the function call is in cache and can be used, and
        returns False otherwise.
        """
        # Check if the code of the function has changed
        if not self._check_previous_func_code(stacklevel=4):
            return False

        # Check if this specific call is in the cache
        if not self.store_backend.contains_item(call_id):
            return False

        # Call the user defined cache validation callback; a rejected entry
        # is removed from the store.
        metadata = self.store_backend.get_metadata(call_id)
        if (
            self.cache_validation_callback is not None
            and not self.cache_validation_callback(metadata)
        ):
            self.store_backend.clear_item(call_id)
            return False

        return True

    def _cached_call(self, args, kwargs, shelving):
        """Call wrapped function and cache result, or read cache if available.

        This function returns the wrapped function output or a reference to
        the cached result.

        Arguments:
        ----------

        args, kwargs: list and dict
            input arguments for wrapped function

        shelving: bool
            True when called via the call_and_shelve function.


        Returns
        -------
        output: Output of the wrapped function if shelving is false, or a
            MemorizedResult reference to the value if shelving is true.
        metadata: dict containing the metadata associated with the call
            (empty when the result comes from the cache).
        """
        args_id = self._get_args_id(*args, **kwargs)
        call_id = (self.func_id, args_id)
        _, func_name = get_func_name(self.func)
        func_info = self.store_backend.get_cached_func_info([self.func_id])
        location = func_info["location"]

        if self._verbose >= 20:
            logging.basicConfig(level=logging.INFO)
            _, signature = format_signature(self.func, *args, **kwargs)
            self.info(
                textwrap.dedent(
                    f"""
                        Querying {func_name} with signature
                        {signature}.

                        (argument hash {args_id})

                        The store location is {location}.
                    """
                )
            )

        # Compare the function code with the previous to see if the
        # function code has changed and check if the results are present in
        # the cache.
        if self._is_in_cache_and_valid(call_id):
            if shelving:
                return self._get_memorized_result(call_id), {}

            try:
                start_time = time.time()
                output = self._load_item(call_id)
                if self._verbose > 4:
                    self._print_duration(
                        time.time() - start_time, context="cache loaded "
                    )
                return output, {}
            except Exception:
                # Best effort: a failed load is reported, then the result is
                # recomputed below.
                # XXX: Should use an exception logger
                _, signature = format_signature(self.func, *args, **kwargs)
                self.warn(
                    "Exception while loading results for {}\n {}".format(
                        signature, traceback.format_exc()
                    )
                )

        if self._verbose > 10:
            self.warn(
                f"Computing func {func_name}, argument hash {args_id} "
                f"in location {location}"
            )

        # Compute the result; _call returns both the output and the metadata
        return self._call(call_id, args, kwargs, shelving)

    @property
    def func_code_info(self):
        """Value of get_func_code(self.func), cached on the instance."""
        # 3-tuple property containing: the function source code, source file,
        # and first line of the code inside the source file
        if hasattr(self.func, "__code__"):
            if self._func_code_id is None:
                self._func_code_id = id(self.func.__code__)
            elif id(self.func.__code__) != self._func_code_id:
                # Be robust to dynamic reassignments of self.func.__code__
                self._func_code_info = None

        if self._func_code_info is None:
            # Cache the source code of self.func . Provided that get_func_code
            # (which should be called once on self) gets called in the process
            # in which self.func was defined, this caching mechanism prevents
            # undesired cache clearing when the cached function is called in
            # an environment where the introspection utilities get_func_code
            # relies on do not work (typically, in joblib child processes).
            # See #1035 for more info
            # TODO (pierreglaser): do the same with get_func_name?
            self._func_code_info = get_func_code(self.func)
        return self._func_code_info

    def call_and_shelve(self, *args, **kwargs):
        """Call wrapped function, cache result and return a reference.

        This method returns a reference to the cached result instead of the
        result itself. The reference object is small and picklable, allowing
        to send or store it easily. Call .get() on reference object to get
        result.

        Returns
        -------
        cached_result: MemorizedResult or NotMemorizedResult
            reference to the value returned by the wrapped function. The
            class "NotMemorizedResult" is used when there is no cache
            activated (e.g. location=None in Memory).
        """
        # Return the wrapped output, without the metadata
        return self._cached_call(args, kwargs, shelving=True)[0]

    def __call__(self, *args, **kwargs):
        """Return the function output, read from the cache when available."""
        # Return the output, without the metadata
        return self._cached_call(args, kwargs, shelving=False)[0]

    def __getstate__(self):
        # Make sure self.func's source is introspected prior to being pickled -
        # code introspection utilities typically do not work inside child
        # processes
        _ = self.func_code_info

        # We don't store the timestamp when pickling, to avoid the hash
        # depending from it.
        state = self.__dict__.copy()
        state["timestamp"] = None

        # Invalidate the code id as id(obj) will be different in the child
        state["_func_code_id"] = None

        return state

    def check_call_in_cache(self, *args, **kwargs):
        """Check if the function call is cached and valid for given arguments.

        Does not call the function or do any work besides function inspection
        and argument hashing.

        - Compare the function code with the one from the cached function,
          asserting if it has changed.
        - Check if the function call is present in the cache.
        - Call `cache_validation_callback` for user define cache validation.

        Returns
        -------
        is_call_in_cache: bool
            Whether or not the function call is in cache and can be used.
        """
        call_id = (self.func_id, self._get_args_id(*args, **kwargs))
        return self._is_in_cache_and_valid(call_id)

    # ------------------------------------------------------------------------
    # Private interface
    # ------------------------------------------------------------------------

    def _get_args_id(self, *args, **kwargs):
        """Return the input parameter hash of a result."""
        return hashing.hash(
            filter_args(self.func, self.ignore, args, kwargs),
            coerce_mmap=self.mmap_mode is not None,
        )

    def _hash_func(self):
        """Hash a function to key the online cache"""
        func_code_h = hash(getattr(self.func, "__code__", None))
        return id(self.func), hash(self.func), func_code_h

    def _write_func_code(self, func_code, first_line):
        """Write the function code and the filename to a file."""
        # We store the first line because the filename and the function
        # name is not always enough to identify a function: people
        # sometimes have several functions named the same way in a
        # file. This is bad practice, but joblib should be robust to bad
        # practice.
        func_code = "%s %i\n%s" % (FIRST_LINE_TEXT, first_line, func_code)
        self.store_backend.store_cached_func_code([self.func_id], func_code)

        # Also store in the in-memory store of function hashes
        is_named_callable = (
            hasattr(self.func, "__name__") and self.func.__name__ != "<lambda>"
        )
        if is_named_callable:
            # Don't do this for lambda functions or strange callable
            # objects, as it ends up being too fragile
            func_hash = self._hash_func()
            try:
                _FUNCTION_HASHES[self.func] = func_hash
            except TypeError:
                # Some callable are not hashable
                pass

    def _check_previous_func_code(self, stacklevel=2):
        """Tell whether the stored function code matches the current one.

        Returns True when the code is unchanged. Otherwise the current code
        is written (no code stored yet) or the function cache is cleared
        (code differs), and False is returned.

        stacklevel is the depth a which this function is called, to
        issue useful warnings to the user.
        """
        # First check if our function is in the in-memory store.
        # Using the in-memory store not only makes things faster, but it
        # also renders us robust to variations of the files when the
        # in-memory version of the code does not vary
        try:
            if self.func in _FUNCTION_HASHES:
                # We use as an identifier the id of the function and its
                # hash. This is more likely to falsely change than have hash
                # collisions, thus we are on the safe side.
                func_hash = self._hash_func()
                if func_hash == _FUNCTION_HASHES[self.func]:
                    return True
        except TypeError:
            # Some callables are not hashable
            pass

        # Here, we go through some effort to be robust to dynamically
        # changing code and collision. We cannot inspect.getsource
        # because it is not reliable when using IPython's magic "%run".
        func_code, source_file, first_line = self.func_code_info
        try:
            old_func_code, old_first_line = extract_first_line(
                self.store_backend.get_cached_func_code([self.func_id])
            )
        except (IOError, OSError):  # some backend can also raise OSError
            self._write_func_code(func_code, first_line)
            return False
        if old_func_code == func_code:
            return True

        # We have differing code, is this because we are referring to
        # different functions, or because the function we are referring to has
        # changed?

        _, func_name = get_func_name(
            self.func, resolv_alias=False, win_characters=False
        )
        if old_first_line == first_line == -1 or func_name == "<lambda>":
            if not first_line == -1:
                func_description = "{0} ({1}:{2})".format(
                    func_name, source_file, first_line
                )
            else:
                func_description = func_name
            warnings.warn(
                JobLibCollisionWarning(
                    "Cannot detect name collisions for function '{0}'".format(
                        func_description
                    )
                ),
                stacklevel=stacklevel,
            )

        # Fetch the code at the old location and compare it. If it is the
        # same than the code store, we have a collision: the code in the
        # file has not changed, but the name we have is pointing to a new
        # code block.
        if not old_first_line == first_line and source_file is not None:
            if os.path.exists(source_file):
                _, func_name = get_func_name(self.func, resolv_alias=False)
                num_lines = len(func_code.split("\n"))
                with tokenize.open(source_file) as f:
                    on_disk_func_code = f.readlines()[
                        old_first_line - 1 : old_first_line - 1 + num_lines - 1
                    ]
                on_disk_func_code = "".join(on_disk_func_code)
                possible_collision = (
                    on_disk_func_code.rstrip() == old_func_code.rstrip()
                )
            else:
                possible_collision = source_file.startswith("<doctest ")
            if possible_collision:
                warnings.warn(
                    JobLibCollisionWarning(
                        "Possible name collisions between functions "
                        "'%s' (%s:%i) and '%s' (%s:%i)"
                        % (
                            func_name,
                            source_file,
                            old_first_line,
                            func_name,
                            source_file,
                            first_line,
                        )
                    ),
                    stacklevel=stacklevel,
                )

        # The function has changed, wipe the cache directory.
        # XXX: Should be using warnings, and giving stacklevel
        if self._verbose > 10:
            _, func_name = get_func_name(self.func, resolv_alias=False)
            self.warn(
                "Function {0} (identified by {1}) has changed.".format(
                    func_name, self.func_id
                )
            )
        self.clear(warn=True)
        return False

    def clear(self, warn=True):
        """Empty the function's cache.

        The current function code is written back to the store afterwards.
        A message is emitted first when ``warn`` is true and verbosity > 0.
        """
        func_id = self.func_id
        if self._verbose > 0 and warn:
            self.warn("Clearing function cache identified by %s" % func_id)
        self.store_backend.clear_path(
            [
                func_id,
            ]
        )

        func_code, _, first_line = self.func_code_info
        self._write_func_code(func_code, first_line)

    def call(self, *args, **kwargs):
        """Force the execution of the function with the given arguments.

        The output values will be persisted, i.e., the cache will be updated
        with any new values.

        Parameters
        ----------
        *args: arguments
            The arguments.
        **kwargs: keyword arguments
            Keyword arguments.

        Returns
        -------
        output : object
            The output of the function call.
        metadata : dict
            The metadata associated with the call.
        """
        call_id = (self.func_id, self._get_args_id(*args, **kwargs))

        # Return the output and the metadata
        return self._call(call_id, args, kwargs)

    def _call(self, call_id, args, kwargs, shelving=False):
        """Run the wrapped function, persist its output, return (output, metadata)."""
        # Return the output and the metadata
        self._before_call(args, kwargs)
        start_time = time.time()
        output = self.func(*args, **kwargs)
        return self._after_call(call_id, args, kwargs, shelving, output, start_time)

    def _before_call(self, args, kwargs):
        """Print the formatted call when verbosity is enabled."""
        if self._verbose > 0:
            print(format_call(self.func, args, kwargs))

    def _after_call(self, call_id, args, kwargs, shelving, output, start_time):
        """Persist ``output`` and the call metadata; return (result, metadata).

        The result is a MemorizedResult when ``shelving`` is true, the output
        reloaded from the store when ``mmap_mode`` is set, else ``output``.
        """
        self.store_backend.dump_item(call_id, output, verbose=self._verbose)
        duration = time.time() - start_time
        if self._verbose > 0:
            self._print_duration(duration)
        metadata = self._persist_input(duration, call_id, args, kwargs)
        if shelving:
            return self._get_memorized_result(call_id, metadata), metadata

        if self.mmap_mode is not None:
            # Memmap the output at the first call to be consistent with
            # later calls
            output = self._load_item(call_id, metadata)
        return output, metadata

    def _persist_input(self, duration, call_id, args, kwargs, this_duration_limit=0.5):
        """Store a small summary of the call through the store backend.

        duration: float
            time elapsed while calling the wrapped function and dumping its
            output, as measured by the caller.

        call_id: tuple
            identifier of the call the metadata is stored for.

        args, kwargs: list and dict
            input arguments for wrapped function

        this_duration_limit: float
            Max execution time for this function before issuing a warning.

        Returns the metadata dict that was stored.
        """
        start_time = time.time()
        argument_dict = filter_args(self.func, self.ignore, args, kwargs)

        input_repr = {k: repr(v) for k, v in argument_dict.items()}
        metadata = {
            "duration": duration,
            "input_args": input_repr,
            "time": start_time,
        }

        # This can fail due to race-conditions with multiple
        # concurrent joblibs removing the file or the directory
        self.store_backend.store_metadata(call_id, metadata)

        this_duration = time.time() - start_time
        if this_duration > this_duration_limit:
            # This persistence should be fast. It will not be if repr() takes
            # time and its output is large, because json.dump will have to
            # write a large file. This should not be an issue with numpy arrays
            # for which repr() always output a short representation, but can
            # be with complex dictionaries. Fixing the problem should be a
            # matter of replacing repr() above by something smarter.
            warnings.warn(
                "Persisting input arguments took %.2fs to run. "
                "If this happens often in your code, it can cause "
                "performance problems "
                "(results will be correct in all cases). "
                "The reason for this is probably some large input "
                "arguments for a wrapped function." % this_duration,
                stacklevel=5,
            )
        return metadata

    def _get_memorized_result(self, call_id, metadata=None):
        """Build a MemorizedResult for ``call_id`` sharing this store backend."""
        return MemorizedResult(
            self.store_backend,
            call_id,
            metadata=metadata,
            timestamp=self.timestamp,
            verbose=self._verbose - 1,
        )

    def _load_item(self, call_id, metadata=None):
        """Load the stored output of ``call_id`` from the store backend."""
        return self.store_backend.load_item(
            call_id, metadata=metadata, timestamp=self.timestamp, verbose=self._verbose
        )

    def _print_duration(self, duration, context=""):
        """Print the function name, context and duration, left-padded with '_' to 80 columns."""
        _, name = get_func_name(self.func)
        msg = f"{name} {context}- {format_time(duration)}"
        print(max(0, (80 - len(msg))) * "_" + msg)

    # ------------------------------------------------------------------------
    # Private `object` interface
    # ------------------------------------------------------------------------

    def __repr__(self):
        return "{class_name}(func={func}, location={location})".format(
            class_name=self.__class__.__name__,
            func=self.func,
            location=self.store_backend.location,
        )
940###############################################################################
941# class `AsyncMemorizedFunc`
942###############################################################################
class AsyncMemorizedFunc(MemorizedFunc):
    """MemorizedFunc variant for functions whose call must be awaited.

    ``_call`` is a coroutine function here, so values obtained through the
    inherited machinery may be coroutines; each entry point awaits them
    when that is the case.
    """

    async def __call__(self, *args, **kwargs):
        result = self._cached_call(args, kwargs, shelving=False)
        if asyncio.iscoroutine(result):
            result = await result
        return result[0]  # Don't return metadata

    async def call_and_shelve(self, *args, **kwargs):
        result = self._cached_call(args, kwargs, shelving=True)
        if asyncio.iscoroutine(result):
            result = await result
        return result[0]  # Don't return metadata

    async def call(self, *args, **kwargs):
        result = super().call(*args, **kwargs)
        if asyncio.iscoroutine(result):
            return await result
        return result

    async def _call(self, call_id, args, kwargs, shelving=False):
        self._before_call(args, kwargs)
        started_at = time.time()
        value = await self.func(*args, **kwargs)
        return self._after_call(call_id, args, kwargs, shelving, value, started_at)
965###############################################################################
966# class `Memory`
967###############################################################################
class Memory(Logger):
    """A context object for caching a function's return value each time it
    is called with the same input arguments.

    All values are cached on the filesystem, in a deep directory
    structure.

    Read more in the :ref:`User Guide <memory>`.

    Parameters
    ----------
    location: str, pathlib.Path or None
        The path of the base directory to use as a data store
        or None. If None is given, no caching is done and
        the Memory object is completely transparent. This option
        replaces cachedir since version 0.12.

    backend: str, optional, default='local'
        Type of store backend for reading/writing cache files.
        The 'local' backend is using regular filesystem operations to
        manipulate data (open, mv, etc) in the backend.

    mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional
        The memmapping mode used when loading from cache
        numpy arrays. See numpy.load for the meaning of the
        arguments.

    compress: boolean, or integer, optional
        Whether to zip the stored data on disk. If an integer is
        given, it should be between 1 and 9, and sets the amount
        of compression. Note that compressed arrays cannot be
        read by memmapping.

    verbose: int, optional
        Verbosity flag, controls the debug messages that are issued
        as functions are evaluated.

    backend_options: dict, optional
        Contains a dictionary of named parameters used to configure
        the store backend.
    """

    # ------------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------------

    def __init__(
        self,
        location=None,
        backend="local",
        mmap_mode=None,
        compress=False,
        verbose=1,
        backend_options=None,
    ):
        Logger.__init__(self)
        self._verbose = verbose
        self.mmap_mode = mmap_mode
        self.timestamp = time.time()
        self.backend = backend
        self.compress = compress
        self.backend_options = {} if backend_options is None else backend_options

        if compress and mmap_mode is not None:
            warnings.warn("Compressed results cannot be memmapped", stacklevel=2)

        # The user-facing attribute keeps the value exactly as given.
        self.location = location

        # Only plain-string locations get the "joblib" sub-directory appended;
        # any other value is handed to the backend factory unchanged.
        if isinstance(location, str):
            store_location = os.path.join(location, "joblib")
        else:
            store_location = location

        store_options = dict(
            compress=compress, mmap_mode=mmap_mode, **self.backend_options
        )
        self.store_backend = _store_backend_factory(
            backend,
            store_location,
            verbose=self._verbose,
            backend_options=store_options,
        )

    def cache(
        self,
        func=None,
        ignore=None,
        verbose=None,
        mmap_mode=False,
        cache_validation_callback=None,
    ):
        """Decorates the given function func to only compute its return
        value for input arguments not cached on disk.

        Parameters
        ----------
        func: callable, optional
            The function to be decorated
        ignore: list of strings
            A list of arguments name to ignore in the hashing
        verbose: integer, optional
            The verbosity mode of the function. By default that
            of the memory object is used.
        mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional
            The memmapping mode used when loading from cache
            numpy arrays. See numpy.load for the meaning of the
            arguments. By default that of the memory object is used.
        cache_validation_callback: callable, optional
            Callable to validate whether or not the cache is valid. When
            the cached function is called with arguments for which a cache
            exists, this callable is called with the metadata of the cached
            result as its sole argument. If it returns True, then the
            cached result is returned, else the cache for these arguments
            is cleared and recomputed.

        Returns
        -------
        decorated_func: MemorizedFunc object
            The returned object is a MemorizedFunc object, that is
            callable (behaves like a function), but offers extra
            methods for cache lookup and management. See the
            documentation for :class:`joblib.memory.MemorizedFunc`.

        Raises
        ------
        ValueError
            If ``cache_validation_callback`` is given but is not callable.
        """
        callback_given = cache_validation_callback is not None
        if callback_given and not callable(cache_validation_callback):
            raise ValueError(
                "cache_validation_callback needs to be callable. "
                f"Got {cache_validation_callback}."
            )

        if func is None:
            # Used as ``@memory.cache(...)``: return a decorator that already
            # carries the keyword arguments and waits for the function.
            return functools.partial(
                self.cache,
                ignore=ignore,
                mmap_mode=mmap_mode,
                verbose=verbose,
                cache_validation_callback=cache_validation_callback,
            )

        if self.store_backend is None:
            # No store: wrap the function in a transparent pass-through.
            if inspect.iscoroutinefunction(func):
                passthrough_cls = AsyncNotMemorizedFunc
            else:
                passthrough_cls = NotMemorizedFunc
            return passthrough_cls(func)

        # ``None`` / ``False`` are the "not specified" markers for these two
        # options; fall back on the values held by the Memory object.
        effective_verbose = self._verbose if verbose is None else verbose
        effective_mmap_mode = self.mmap_mode if mmap_mode is False else mmap_mode

        # Avoid stacking wrappers: re-wrap the underlying function instead.
        if isinstance(func, MemorizedFunc):
            func = func.func

        if inspect.iscoroutinefunction(func):
            memorized_cls = AsyncMemorizedFunc
        else:
            memorized_cls = MemorizedFunc

        return memorized_cls(
            func,
            location=self.store_backend,
            backend=self.backend,
            ignore=ignore,
            mmap_mode=effective_mmap_mode,
            compress=self.compress,
            verbose=effective_verbose,
            timestamp=self.timestamp,
            cache_validation_callback=cache_validation_callback,
        )

    def clear(self, warn=True):
        """Erase the complete cache directory.

        Parameters
        ----------
        warn: bool, optional
            When true, a warning is logged before the cache is flushed.
        """
        if warn:
            self.warn("Flushing completely the cache")

        backend = self.store_backend
        if backend is None:
            return

        backend.clear()

        # The store is now empty, so the _FUNCTION_HASHES table must be reset
        # too. Otherwise, for a function still present in this table, results
        # cached after this clear would be cache misses because the function
        # code is not re-written.
        _FUNCTION_HASHES.clear()

    def reduce_size(self, bytes_limit=None, items_limit=None, age_limit=None):
        """Remove cache elements to make the cache fit its limits.

        The limitation can impose that the cache size fits in ``bytes_limit``,
        that the number of cache items is no more than ``items_limit``, and
        that all files in cache are not older than ``age_limit``.

        Parameters
        ----------
        bytes_limit: int | str, optional
            Limit in bytes of the size of the cache. By default, the size of
            the cache is unlimited. When reducing the size of the cache,
            ``joblib`` keeps the most recently accessed items first. If a
            str is passed, it is converted to a number of bytes using units
            { K | M | G} for kilo, mega, giga.

        items_limit: int, optional
            Number of items to limit the cache to. By default, the number of
            items in the cache is unlimited. When reducing the size of the
            cache, ``joblib`` keeps the most recently accessed items first.

        age_limit: datetime.timedelta, optional
            Maximum age of items to limit the cache to. When reducing the size
            of the cache, any items last accessed more than the given length of
            time ago are deleted. Example: to remove files older than 5 days,
            use datetime.timedelta(days=5). Negative timedelta are not
            accepted.
        """
        limits = (bytes_limit, items_limit, age_limit)

        # Nothing to do without a store, or when no limit was requested.
        if self.store_backend is None or all(limit is None for limit in limits):
            return

        # The store backend is responsible for actually enforcing the limits.
        self.store_backend.enforce_store_limits(*limits)

    def eval(self, func, *args, **kwargs):
        """Eval function func with arguments `*args` and `**kwargs`,
        in the context of the memory.

        This behaves like calling ``func(*args, **kwargs)`` directly, except
        that the function is called only if the cache is not up to date.
        """
        # Without a store the function is called directly, uncached.
        runner = func if self.store_backend is None else self.cache(func)
        return runner(*args, **kwargs)

    # ------------------------------------------------------------------------
    # Private `object` interface
    # ------------------------------------------------------------------------

    def __repr__(self):
        if self.store_backend is None:
            shown_location = None
        else:
            shown_location = self.store_backend.location
        return "{class_name}(location={location})".format(
            class_name=self.__class__.__name__,
            location=shown_location,
        )

    def __getstate__(self):
        """Return the pickling state with the timestamp blanked out, so
        that the hash of the object does not depend on it.
        """
        return {**self.__dict__, "timestamp": None}
1218###############################################################################
1219# cache_validation_callback helpers
1220###############################################################################
def expires_after(
    days=0, seconds=0, microseconds=0, milliseconds=0, minutes=0, hours=0, weeks=0
):
    """Helper cache_validation_callback to force recompute after a duration.

    Parameters
    ----------
    days, seconds, microseconds, milliseconds, minutes, hours, weeks: numbers
        argument passed to a timedelta.

    Returns
    -------
    cache_validation_callback: callable
        Function taking the metadata dict of a cached result and returning
        True while the time elapsed since ``metadata["time"]`` is strictly
        smaller than the requested duration.
    """
    # The duration is fixed once the helper is called, so convert it to
    # seconds a single time instead of on every validation.
    max_age_seconds = datetime.timedelta(
        weeks=weeks,
        days=days,
        hours=hours,
        minutes=minutes,
        seconds=seconds,
        milliseconds=milliseconds,
        microseconds=microseconds,
    ).total_seconds()

    def cache_validation_callback(metadata):
        elapsed = time.time() - metadata["time"]
        return elapsed < max_age_seconds

    return cache_validation_callback