Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/joblib/memory.py: 24%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2A context object for caching a function's return value each time it
3is called with the same input arguments.
5"""
7# Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org>
8# Copyright (c) 2009 Gael Varoquaux
9# License: BSD Style, 3 clauses.
11import asyncio
12import datetime
13import functools
14import inspect
15import logging
16import os
17import pathlib
18import pydoc
19import re
20import textwrap
21import time
22import tokenize
23import traceback
24import warnings
25import weakref
27from . import hashing
28from ._store_backends import (
29 CacheWarning, # noqa
30 FileSystemStoreBackend,
31 StoreBackendBase,
32)
33from .func_inspect import (
34 filter_args,
35 format_call,
36 format_signature,
37 get_func_code,
38 get_func_name,
39)
40from .logger import Logger, format_time, pformat
# Prefix of the header line that `_write_func_code` prepends to the stored
# function code to record the function's first line number;
# `extract_first_line` recognises and parses it.
FIRST_LINE_TEXT = "# first line:"
44# TODO: The following object should have a data store object as a sub
45# object, and the interface to persist and query should be separated in
46# the data store.
47#
48# This would enable creating 'Memory' objects with a different logic for
49# pickling that would simply span a MemorizedFunc with the same
50# store (or do we want to copy it to avoid cross-talks?), for instance to
51# implement HDF5 pickling.
53# TODO: Same remark for the logger, and probably use the Python logging
54# mechanism.
def extract_first_line(func_code):
    """Split a stored function source into its body and first-line number.

    Parameters
    ----------
    func_code: str
        Function source, optionally starting with a ``FIRST_LINE_TEXT``
        header line followed by an integer.

    Returns
    -------
    (code, first_line): tuple of (str, int)
        When the header is present, ``code`` is everything after the header
        line and ``first_line`` is the integer it carried. Otherwise the
        input is returned unchanged together with ``-1``.
    """
    if not func_code.startswith(FIRST_LINE_TEXT):
        return func_code, -1

    # The header occupies the whole first line; the rest is the actual code.
    header, _, remaining_code = func_code.partition("\n")
    return remaining_code, int(header[len(FIRST_LINE_TEXT) :])
class JobLibCollisionWarning(UserWarning):
    """Warning category signalling a possible clash between function names."""
# Registry mapping a backend name to its store backend class. Extended by
# `register_store_backend` and consulted by `_store_backend_factory`.
_STORE_BACKENDS = {"local": FileSystemStoreBackend}
def register_store_backend(backend_name, backend):
    """Extend available store backends.

    The Memory, MemorizeResult and MemorizeFunc objects are designed to be
    agnostic to the type of store used behind. By default, the local file
    system is used but this function gives the possibility to extend joblib's
    memory pattern with other types of storage such as cloud storage (S3, GCS,
    OpenStack, HadoopFS, etc) or blob DBs.

    Parameters
    ----------
    backend_name: str
        The name identifying the store backend being registered. For example,
        'local' is used with FileSystemStoreBackend.
    backend: StoreBackendBase subclass
        The class (not an instance) implementing the StoreBackendBase
        interface.

    Raises
    ------
    ValueError
        If ``backend_name`` is not a string, or if ``backend`` is not a class
        deriving from StoreBackendBase (this includes ``None``, instances and
        any other non-class object).
    """
    if not isinstance(backend_name, str):
        raise ValueError(
            "Store backend name should be a string, '{0}' given.".format(backend_name)
        )

    # issubclass() raises TypeError when its first argument is not a class,
    # so check that first to keep the ValueError contract for every
    # invalid backend.
    if not (inspect.isclass(backend) and issubclass(backend, StoreBackendBase)):
        raise ValueError(
            "Store backend should inherit StoreBackendBase, '{0}' given.".format(
                backend
            )
        )

    _STORE_BACKENDS[backend_name] = backend
def _store_backend_factory(backend, location, verbose=0, backend_options=None):
    """Return the correct store object for the given location.

    Parameters
    ----------
    backend: str
        Name of a backend registered in ``_STORE_BACKENDS``.
    location: str, pathlib.Path, StoreBackendBase instance or None
        Where the store lives. A StoreBackendBase instance is returned as is.
    verbose: int
        Forwarded to the backend's ``configure``.
    backend_options: dict or None
        Forwarded to the backend's ``configure``; defaults to an empty dict.

    Returns
    -------
    A configured store backend, the given StoreBackendBase instance, or None
    when ``location`` is None or of an unsupported type (a UserWarning is
    emitted in the latter case).

    Raises
    ------
    TypeError
        If ``location`` is a string/path and ``backend`` is not registered.
    """
    if backend_options is None:
        backend_options = {}

    if isinstance(location, pathlib.Path):
        location = str(location)

    if isinstance(location, StoreBackendBase):
        return location

    if isinstance(location, str):
        location = os.path.expanduser(location)

        # Look the backend class up directly in the registry.
        try:
            backend_cls = _STORE_BACKENDS.get(backend)
        except TypeError:
            # An unhashable name can never be a registry key.
            backend_cls = None
        obj = backend_cls() if backend_cls is not None else None

        # There is no implicit fallback: an unregistered backend name is an
        # error.
        if obj is None:
            raise TypeError(
                "Unknown location {0} or backend {1}".format(location, backend)
            )

        # The store backend is configured with the extra named parameters,
        # some of them are specific to the underlying store backend.
        obj.configure(location, verbose=verbose, backend_options=backend_options)
        return obj

    if location is not None:
        warnings.warn(
            "Instantiating a backend using a {} as a location is not "
            "supported by joblib. Returning None instead.".format(
                location.__class__.__name__
            ),
            UserWarning,
        )

    return None
def _build_func_identifier(func):
    """Build a roughly unique identifier for the cached function.

    The identifier is the module parts and the function name reported by
    ``get_func_name``, joined with the OS path separator (the historical
    filesystem-like layout).
    """
    module_parts, name = get_func_name(func)
    path_parts = [*module_parts, name]
    return os.path.join(*path_parts)
# An in-memory store to avoid looking at the disk-based function
# source code to check if a function definition has changed.
# Keys are the cached callables (held weakly); values are the tuples
# produced by `MemorizedFunc._hash_func`.
_FUNCTION_HASHES = weakref.WeakKeyDictionary()
164###############################################################################
165# class `MemorizedResult`
166###############################################################################
class MemorizedResult(Logger):
    """Object representing a cached value.

    Parameters
    ----------
    location: str, pathlib.Path or store backend instance
        Passed, with ``backend``, to the store backend factory to obtain
        ``store_backend``.

    call_id: sequence
        Identifier of the cached call. Its first item is exposed as
        ``func_id`` (and ``func``), its second item as ``args_id``.

    backend: str
        Type of store backend for reading/writing cache files.
        Default is 'local'.

    mmap_mode: {None, 'r+', 'r', 'w+', 'c'}
        Stored on the instance as ``mmap_mode``.

    verbose: int
        verbosity level (0 means no message).

    timestamp, metadata:
        for internal use only. When ``metadata`` is None it is read from the
        store backend for ``call_id``.

    Attributes
    ----------
    store_backend, mmap_mode, metadata, verbose, timestamp:
        See the parameters above.

    duration:
        The ``"duration"`` entry of ``metadata`` (None when absent).
    """

    def __init__(
        self,
        location,
        call_id,
        backend="local",
        mmap_mode=None,
        verbose=0,
        timestamp=None,
        metadata=None,
    ):
        Logger.__init__(self)
        self._call_id = call_id
        self.store_backend = _store_backend_factory(backend, location, verbose=verbose)
        self.mmap_mode = mmap_mode

        # Only hit the store when the caller did not hand the metadata over.
        if metadata is None:
            metadata = self.store_backend.get_metadata(self._call_id)
        self.metadata = metadata

        self.duration = self.metadata.get("duration", None)
        self.verbose = verbose
        self.timestamp = timestamp

    @property
    def func(self):
        """Alias of ``func_id``."""
        return self.func_id

    @property
    def func_id(self):
        """First item of the call identifier."""
        return self._call_id[0]

    @property
    def args_id(self):
        """Second item of the call identifier."""
        return self._call_id[1]

    def get(self):
        """Read value from cache and return it.

        A ValueError raised by the store backend while loading is re-raised
        as a KeyError (chained to the original exception).
        """
        try:
            return self.store_backend.load_item(
                self._call_id,
                timestamp=self.timestamp,
                metadata=self.metadata,
                verbose=self.verbose,
            )
        except ValueError as exc:
            item_path = os.path.join(self.store_backend.location, *self._call_id)
            raise KeyError(
                "Error while trying to load a MemorizedResult's value. "
                "It seems that this folder is corrupted : {}".format(item_path)
            ) from exc

    def clear(self):
        """Clear value from cache"""
        self.store_backend.clear_item(self._call_id)

    def __repr__(self):
        template = '{}(location="{}", func="{}", args_id="{}")'
        return template.format(
            self.__class__.__name__, self.store_backend.location, *self._call_id
        )

    def __getstate__(self):
        # The timestamp is dropped from the pickled state.
        return {**self.__dict__, "timestamp": None}
class NotMemorizedResult:
    """Class representing an arbitrary value.

    This class is a replacement for MemorizedResult when there is no cache:
    the value is simply kept on the instance until ``clear`` is called.
    """

    __slots__ = ("value", "valid")

    def __init__(self, value):
        self.value = value
        self.valid = True

    def get(self):
        """Return the held value, or raise KeyError once it was cleared."""
        if not self.valid:
            raise KeyError("No value stored.")
        return self.value

    def clear(self):
        """Drop the held value and mark the result as invalid."""
        self.valid = False
        self.value = None

    def __repr__(self):
        class_name = self.__class__.__name__
        if not self.valid:
            return class_name + " with no value"
        return "{class_name}({value})".format(
            class_name=class_name, value=pformat(self.value)
        )

    # __getstate__ and __setstate__ are required because of __slots__
    def __getstate__(self):
        return dict(valid=self.valid, value=self.value)

    def __setstate__(self, state):
        self.valid = state["valid"]
        self.value = state["value"]
306###############################################################################
307# class `NotMemorizedFunc`
308###############################################################################
class NotMemorizedFunc:
    """No-op object decorating a function.

    This class replaces MemorizedFunc when there is no cache. It provides an
    identical API but does not write anything on disk.

    Attributes
    ----------
    func: callable
        Original undecorated function.
    """

    # Should be as light as possible (for speed)
    def __init__(self, func):
        self.func = func

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def call_and_shelve(self, *args, **kwargs):
        """Call the function and wrap its output in a NotMemorizedResult."""
        value = self.func(*args, **kwargs)
        return NotMemorizedResult(value)

    def __repr__(self):
        return f"{self.__class__.__name__}(func={self.func})"

    def clear(self, warn=True):
        # Argument "warn" is for compatibility with MemorizedFunc.clear
        pass

    def call(self, *args, **kwargs):
        """Call the function; return its output and an empty metadata dict."""
        output = self.func(*args, **kwargs)
        return output, {}

    def check_call_in_cache(self, *args, **kwargs):
        """Always False: nothing is ever cached."""
        return False
345###############################################################################
346# class `AsyncNotMemorizedFunc`
347###############################################################################
class AsyncNotMemorizedFunc(NotMemorizedFunc):
    """NotMemorizedFunc variant whose ``call_and_shelve`` awaits the function."""

    async def call_and_shelve(self, *args, **kwargs):
        value = await self.func(*args, **kwargs)
        return NotMemorizedResult(value)
353###############################################################################
354# class `MemorizedFunc`
355###############################################################################
356class MemorizedFunc(Logger):
357 """Callable object decorating a function for caching its return value
358 each time it is called.
360 Methods are provided to inspect the cache or clean it.
362 Attributes
363 ----------
364 func: callable
365 The original, undecorated, function.
367 location: string
368 The location of joblib cache. Depends on the store backend used.
370 backend: str
371 Type of store backend for reading/writing cache files.
372 Default is 'local', in which case the location is the path to a
373 disk storage.
375 ignore: list or None
376 List of variable names to ignore when choosing whether to
377 recompute.
379 mmap_mode: {None, 'r+', 'r', 'w+', 'c'}
380 The memmapping mode used when loading from cache
381 numpy arrays. See numpy.load for the meaning of the different
382 values.
384 compress: boolean, or integer
385 Whether to zip the stored data on disk. If an integer is
386 given, it should be between 1 and 9, and sets the amount
387 of compression. Note that compressed arrays cannot be
388 read by memmapping.
390 verbose: int, optional
391 The verbosity flag, controls messages that are issued as
392 the function is evaluated.
394 cache_validation_callback: callable, optional
395 Callable to check if a result in cache is valid or is to be recomputed.
396 When the function is called with arguments for which a cache exists,
397 the callback is called with the cache entry's metadata as its sole
398 argument. If it returns True, the cached result is returned, else the
399 cache for these arguments is cleared and the result is recomputed.
400 """
402 # ------------------------------------------------------------------------
403 # Public interface
404 # ------------------------------------------------------------------------
def __init__(
    self,
    func,
    location,
    backend="local",
    ignore=None,
    mmap_mode=None,
    compress=False,
    verbose=1,
    timestamp=None,
    cache_validation_callback=None,
):
    """Wrap ``func`` and bind it to the store found at ``location``.

    ``ignore`` defaults to an empty list and ``timestamp`` to the current
    time. The instance docstring is built from the wrapped function's
    documentation, prefixed with "Memoized version of ".
    """
    Logger.__init__(self)
    self.mmap_mode = mmap_mode
    self.compress = compress
    self.func = func
    self.cache_validation_callback = cache_validation_callback
    self.func_id = _build_func_identifier(func)
    self.ignore = [] if ignore is None else ignore
    self._verbose = verbose

    # retrieve store object from backend type and location.
    store_options = dict(compress=compress, mmap_mode=mmap_mode)
    self.store_backend = _store_backend_factory(
        backend,
        location,
        verbose=verbose,
        backend_options=store_options,
    )
    if self.store_backend is not None:
        # Create func directory on demand.
        self.store_backend.store_cached_func_code([self.func_id])

    self.timestamp = time.time() if timestamp is None else timestamp
    try:
        functools.update_wrapper(self, func)
    except Exception:
        pass  # Objects like ufunc don't like that

    if inspect.isfunction(func):
        doc = pydoc.TextDoc().document(func)
        # Double the first newline of the rendered documentation
        doc = doc.replace("\n", "\n\n", 1)
        # Strip backspace-overprints for compatibility with autodoc
        doc = re.sub("\x08.", "", doc)
    else:
        # Pydoc does a poor job on other objects
        doc = func.__doc__
    self.__doc__ = "Memoized version of %s" % doc

    # Lazily filled by the `func_code_info` property.
    self._func_code_info = None
    self._func_code_id = None
457 def _is_in_cache_and_valid(self, call_id):
458 """Check if the function call is cached and valid for given arguments.
460 - Compare the function code with the one from the cached function,
461 asserting if it has changed.
462 - Check if the function call is present in the cache.
463 - Call `cache_validation_callback` for user define cache validation.
465 Returns True if the function call is in cache and can be used, and
466 returns False otherwise.
467 """
468 # Check if the code of the function has changed
469 if not self._check_previous_func_code(stacklevel=4):
470 return False
472 # Check if this specific call is in the cache
473 if not self.store_backend.contains_item(call_id):
474 return False
476 # Call the user defined cache validation callback
477 metadata = self.store_backend.get_metadata(call_id)
478 if (
479 self.cache_validation_callback is not None
480 and not self.cache_validation_callback(metadata)
481 ):
482 self.store_backend.clear_item(call_id)
483 return False
485 return True
def _cached_call(self, args, kwargs, shelving):
    """Call wrapped function and cache result, or read cache if available.

    Parameters
    ----------
    args, kwargs: list and dict
        input arguments for wrapped function

    shelving: bool
        True when called via the call_and_shelve function.

    Returns
    -------
    output: Output of the wrapped function if shelving is false, or a
        MemorizedResult reference to the value if shelving is true.
    metadata: dict containing the metadata associated with the call
        (empty when the result was served from the cache).
    """
    args_id = self._get_args_id(*args, **kwargs)
    call_id = (self.func_id, args_id)
    _, func_name = get_func_name(self.func)
    func_info = self.store_backend.get_cached_func_info([self.func_id])
    location = func_info["location"]

    if self._verbose >= 20:
        logging.basicConfig(level=logging.INFO)
        _, signature = format_signature(self.func, *args, **kwargs)
        self.info(
            textwrap.dedent(
                f"""
                    Querying {func_name} with signature
                    {signature}.

                    (argument hash {args_id})

                    The store location is {location}.
                    """
            )
        )

    # Compare the function code with the previous to see if the
    # function code has changed and check if the results are present in
    # the cache.
    usable_cache = self._is_in_cache_and_valid(call_id)
    if usable_cache and shelving:
        return self._get_memorized_result(call_id), {}

    if usable_cache:
        try:
            load_start = time.time()
            cached_output = self._load_item(call_id)
            if self._verbose > 4:
                self._print_duration(
                    time.time() - load_start, context="cache loaded "
                )
            return cached_output, {}
        except Exception:
            # XXX: Should use an exception logger
            # Loading failed: report it and fall through to recompute.
            _, signature = format_signature(self.func, *args, **kwargs)
            self.warn(
                "Exception while loading results for {}\n {}".format(
                    signature, traceback.format_exc()
                )
            )

    if self._verbose > 10:
        self.warn(
            f"Computing func {func_name}, argument hash {args_id} "
            f"in location {location}"
        )

    # Hand back whatever `_call` produces (the output and the metadata)
    return self._call(call_id, args, kwargs, shelving)
564 @property
565 def func_code_info(self):
566 # 3-tuple property containing: the function source code, source file,
567 # and first line of the code inside the source file
568 if hasattr(self.func, "__code__"):
569 if self._func_code_id is None:
570 self._func_code_id = id(self.func.__code__)
571 elif id(self.func.__code__) != self._func_code_id:
572 # Be robust to dynamic reassignments of self.func.__code__
573 self._func_code_info = None
575 if self._func_code_info is None:
576 # Cache the source code of self.func . Provided that get_func_code
577 # (which should be called once on self) gets called in the process
578 # in which self.func was defined, this caching mechanism prevents
579 # undesired cache clearing when the cached function is called in
580 # an environment where the introspection utilities get_func_code
581 # relies on do not work (typically, in joblib child processes).
582 # See #1035 for more info
583 # TODO (pierreglaser): do the same with get_func_name?
584 self._func_code_info = get_func_code(self.func)
585 return self._func_code_info
587 def call_and_shelve(self, *args, **kwargs):
588 """Call wrapped function, cache result and return a reference.
590 This method returns a reference to the cached result instead of the
591 result itself. The reference object is small and picklable, allowing
592 to send or store it easily. Call .get() on reference object to get
593 result.
595 Returns
596 -------
597 cached_result: MemorizedResult or NotMemorizedResult
598 reference to the value returned by the wrapped function. The
599 class "NotMemorizedResult" is used when there is no cache
600 activated (e.g. location=None in Memory).
601 """
602 # Return the wrapped output, without the metadata
603 return self._cached_call(args, kwargs, shelving=True)[0]
605 def __call__(self, *args, **kwargs):
606 # Return the output, without the metadata
607 return self._cached_call(args, kwargs, shelving=False)[0]
609 def __getstate__(self):
610 # Make sure self.func's source is introspected prior to being pickled -
611 # code introspection utilities typically do not work inside child
612 # processes
613 _ = self.func_code_info
615 # We don't store the timestamp when pickling, to avoid the hash
616 # depending from it.
617 state = self.__dict__.copy()
618 state["timestamp"] = None
620 # Invalidate the code id as id(obj) will be different in the child
621 state["_func_code_id"] = None
623 return state
625 def check_call_in_cache(self, *args, **kwargs):
626 """Check if the function call is cached and valid for given arguments.
628 Does not call the function or do any work besides function inspection
629 and argument hashing.
631 - Compare the function code with the one from the cached function,
632 asserting if it has changed.
633 - Check if the function call is present in the cache.
634 - Call `cache_validation_callback` for user define cache validation.
636 Returns
637 -------
638 is_call_in_cache: bool
639 Whether or not the function call is in cache and can be used.
640 """
641 call_id = (self.func_id, self._get_args_id(*args, **kwargs))
642 return self._is_in_cache_and_valid(call_id)
644 # ------------------------------------------------------------------------
645 # Private interface
646 # ------------------------------------------------------------------------
def _get_args_id(self, *args, **kwargs):
    """Return the input parameter hash of a result.

    The arguments are first filtered with ``filter_args`` (honouring
    ``self.ignore``) and then hashed; memmap coercion is enabled whenever
    ``self.mmap_mode`` is set.
    """
    filtered_args = filter_args(self.func, self.ignore, args, kwargs)
    coerce_mmap = self.mmap_mode is not None
    return hashing.hash(filtered_args, coerce_mmap=coerce_mmap)
655 def _hash_func(self):
656 """Hash a function to key the online cache"""
657 func_code_h = hash(getattr(self.func, "__code__", None))
658 return id(self.func), hash(self.func), func_code_h
def _write_func_code(self, func_code, first_line):
    """Write the function code and its first line number to the store.

    Parameters
    ----------
    func_code: str
        Source code of the function.
    first_line: int
        First line of the function inside its source file.

    The function is also recorded in the in-memory ``_FUNCTION_HASHES``
    store, on a best-effort basis: lambdas, callables without a
    ``__name__`` and callables that cannot be hashed or weakly referenced
    are left out.
    """
    # We store the first line because the filename and the function
    # name is not always enough to identify a function: people
    # sometimes have several functions named the same way in a
    # file. This is bad practice, but joblib should be robust to bad
    # practice.
    func_code = "%s %i\n%s" % (FIRST_LINE_TEXT, first_line, func_code)
    self.store_backend.store_cached_func_code([self.func_id], func_code)

    # Also store in the in-memory store of function hashes
    is_named_callable = (
        hasattr(self.func, "__name__") and self.func.__name__ != "<lambda>"
    )
    if is_named_callable:
        # Don't do this for lambda functions or strange callable
        # objects, as it ends up being too fragile
        try:
            # Hashing happens inside the try block: `_hash_func` itself
            # raises TypeError for unhashable callables, which must be
            # tolerated just like a failing dictionary insertion.
            _FUNCTION_HASHES[self.func] = self._hash_func()
        except TypeError:
            # Some callable are not hashable
            pass
def _check_previous_func_code(self, stacklevel=2):
    """Tell whether the stored function code still matches ``self.func``.

    stacklevel is the depth a which this function is called, to
    issue useful warnings to the user.

    Returns True when the code is unchanged. Otherwise returns False after
    either writing the code for the first time (nothing readable in the
    store) or clearing the function's cache (code differs); collision
    warnings may be emitted on the way.
    """
    # First check if our function is in the in-memory store.
    # Using the in-memory store not only makes things faster, but it
    # also renders us robust to variations of the files when the
    # in-memory version of the code does not vary
    try:
        if self.func in _FUNCTION_HASHES:
            # We use as an identifier the id of the function and its
            # hash. This is more likely to falsely change than have hash
            # collisions, thus we are on the safe side.
            current_hash = self._hash_func()
            if current_hash == _FUNCTION_HASHES[self.func]:
                return True
    except TypeError:
        # Some callables are not hashable
        pass

    # Here, we go through some effort to be robust to dynamically
    # changing code and collision. We cannot inspect.getsource
    # because it is not reliable when using IPython's magic "%run".
    func_code, source_file, first_line = self.func_code_info
    try:
        stored_code = self.store_backend.get_cached_func_code([self.func_id])
        old_func_code, old_first_line = extract_first_line(stored_code)
    except OSError:  # IOError is an alias of OSError
        self._write_func_code(func_code, first_line)
        return False
    if old_func_code == func_code:
        return True

    # We have differing code, is this because we are referring to
    # different functions, or because the function we are referring to has
    # changed?

    _, func_name = get_func_name(
        self.func, resolv_alias=False, win_characters=False
    )
    if old_first_line == first_line == -1 or func_name == "<lambda>":
        func_description = (
            "{0} ({1}:{2})".format(func_name, source_file, first_line)
            if first_line != -1
            else func_name
        )
        warnings.warn(
            JobLibCollisionWarning(
                "Cannot detect name collisions for function '{0}'".format(
                    func_description
                )
            ),
            stacklevel=stacklevel,
        )

    # Fetch the code at the old location and compare it. If it is the
    # same than the code store, we have a collision: the code in the
    # file has not changed, but the name we have is pointing to a new
    # code block.
    if old_first_line != first_line and source_file is not None:
        if os.path.exists(source_file):
            _, func_name = get_func_name(self.func, resolv_alias=False)
            num_lines = len(func_code.split("\n"))
            start = old_first_line - 1
            stop = start + num_lines - 1
            with tokenize.open(source_file) as source:
                on_disk_lines = source.readlines()[start:stop]
            on_disk_func_code = "".join(on_disk_lines)
            possible_collision = (
                on_disk_func_code.rstrip() == old_func_code.rstrip()
            )
        else:
            possible_collision = source_file.startswith("<doctest ")
        if possible_collision:
            warnings.warn(
                JobLibCollisionWarning(
                    "Possible name collisions between functions "
                    "'%s' (%s:%i) and '%s' (%s:%i)"
                    % (
                        func_name,
                        source_file,
                        old_first_line,
                        func_name,
                        source_file,
                        first_line,
                    )
                ),
                stacklevel=stacklevel,
            )

    # The function has changed, wipe the cache directory.
    # XXX: Should be using warnings, and giving stacklevel
    if self._verbose > 10:
        _, func_name = get_func_name(self.func, resolv_alias=False)
        self.warn(
            "Function {0} (identified by {1}) has changed.".format(
                func_name, self.func_id
            )
        )
    self.clear(warn=True)
    return False
789 def clear(self, warn=True):
790 """Empty the function's cache."""
791 func_id = self.func_id
792 if self._verbose > 0 and warn:
793 self.warn("Clearing function cache identified by %s" % func_id)
794 self.store_backend.clear_path(
795 [
796 func_id,
797 ]
798 )
800 func_code, _, first_line = self.func_code_info
801 self._write_func_code(func_code, first_line)
803 def call(self, *args, **kwargs):
804 """Force the execution of the function with the given arguments.
806 The output values will be persisted, i.e., the cache will be updated
807 with any new values.
809 Parameters
810 ----------
811 *args: arguments
812 The arguments.
813 **kwargs: keyword arguments
814 Keyword arguments.
816 Returns
817 -------
818 output : object
819 The output of the function call.
820 metadata : dict
821 The metadata associated with the call.
822 """
823 call_id = (self.func_id, self._get_args_id(*args, **kwargs))
825 # Return the output and the metadata
826 return self._call(call_id, args, kwargs)
828 def _call(self, call_id, args, kwargs, shelving=False):
829 # Return the output and the metadata
830 self._before_call(args, kwargs)
831 start_time = time.time()
832 output = self.func(*args, **kwargs)
833 return self._after_call(call_id, args, kwargs, shelving, output, start_time)
835 def _before_call(self, args, kwargs):
836 if self._verbose > 0:
837 print(format_call(self.func, args, kwargs))
839 def _after_call(self, call_id, args, kwargs, shelving, output, start_time):
840 self.store_backend.dump_item(call_id, output, verbose=self._verbose)
841 duration = time.time() - start_time
842 if self._verbose > 0:
843 self._print_duration(duration)
844 metadata = self._persist_input(duration, call_id, args, kwargs)
845 if shelving:
846 return self._get_memorized_result(call_id, metadata), metadata
848 if self.mmap_mode is not None:
849 # Memmap the output at the first call to be consistent with
850 # later calls
851 output = self._load_item(call_id, metadata)
852 return output, metadata
def _persist_input(self, duration, call_id, args, kwargs, this_duration_limit=0.5):
    """Save a small summary of the call as metadata in the store.

    Parameters
    ----------
    duration: float
        Elapsed time measured by the caller; stored under ``"duration"``.

    call_id: tuple
        Identifier of the call whose metadata is stored.

    args, kwargs: list and dict
        input arguments for wrapped function

    this_duration_limit: float
        Max execution time for this function before issuing a warning.

    Returns
    -------
    metadata: dict
        The stored metadata, with keys ``"duration"``, ``"input_args"``
        (the ``repr`` of each filtered argument) and ``"time"``.
    """
    start_time = time.time()
    argument_dict = filter_args(self.func, self.ignore, args, kwargs)

    input_repr = {name: repr(value) for name, value in argument_dict.items()}
    metadata = {
        "duration": duration,
        "input_args": input_repr,
        "time": start_time,
    }

    self.store_backend.store_metadata(call_id, metadata)

    this_duration = time.time() - start_time
    if this_duration > this_duration_limit:
        # This persistence should be fast. It will not be if repr() takes
        # time and its output is large, because json.dump will have to
        # write a large file. This should not be an issue with numpy arrays
        # for which repr() always output a short representation, but can
        # be with complex dictionaries. Fixing the problem should be a
        # matter of replacing repr() above by something smarter.
        warnings.warn(
            "Persisting input arguments took %.2fs to run. "
            "If this happens often in your code, it can cause "
            "performance problems "
            "(results will be correct in all cases). "
            "The reason for this is probably some large input "
            "arguments for a wrapped function." % this_duration,
            stacklevel=5,
        )
    return metadata
def _get_memorized_result(self, call_id, metadata=None):
    """Build a MemorizedResult for ``call_id`` bound to this store backend.

    The result gets this object's timestamp and a verbosity one level
    below this object's.
    """
    result_options = dict(
        metadata=metadata,
        timestamp=self.timestamp,
        verbose=self._verbose - 1,
    )
    return MemorizedResult(self.store_backend, call_id, **result_options)
913 def _load_item(self, call_id, metadata=None):
914 return self.store_backend.load_item(
915 call_id, metadata=metadata, timestamp=self.timestamp, verbose=self._verbose
916 )
def _print_duration(self, duration, context=""):
    """Print the function name, context and formatted duration.

    The message is left-padded with underscores up to 80 characters.
    """
    _, name = get_func_name(self.func)
    msg = f"{name} {context}- {format_time(duration)}"
    print(msg.rjust(80, "_"))
923 # ------------------------------------------------------------------------
924 # Private `object` interface
925 # ------------------------------------------------------------------------
927 def __repr__(self):
928 return "{class_name}(func={func}, location={location})".format(
929 class_name=self.__class__.__name__,
930 func=self.func,
931 location=self.store_backend.location,
932 )
935###############################################################################
936# class `AsyncMemorizedFunc`
937###############################################################################
class AsyncMemorizedFunc(MemorizedFunc):
    """MemorizedFunc variant for coroutine functions.

    ``_cached_call`` and ``call`` of the parent class may hand back either a
    ready ``(output, metadata)`` pair or the coroutine produced by the
    asynchronous ``_call``; the latter is awaited before use.
    """

    async def __call__(self, *args, **kwargs):
        out = self._cached_call(args, kwargs, shelving=False)
        if asyncio.iscoroutine(out):
            out = await out
        return out[0]  # Don't return metadata

    async def call_and_shelve(self, *args, **kwargs):
        out = self._cached_call(args, kwargs, shelving=True)
        if asyncio.iscoroutine(out):
            out = await out
        return out[0]  # Don't return metadata

    async def call(self, *args, **kwargs):
        out = super().call(*args, **kwargs)
        if asyncio.iscoroutine(out):
            return await out
        return out

    async def _call(self, call_id, args, kwargs, shelving=False):
        self._before_call(args, kwargs)
        started_at = time.time()
        result = await self.func(*args, **kwargs)
        return self._after_call(call_id, args, kwargs, shelving, result, started_at)
960###############################################################################
961# class `Memory`
962###############################################################################
class Memory(Logger):
    """A context object for caching a function's return value each time it
    is called with the same input arguments.

    All values are cached on the filesystem, in a deep directory
    structure.

    Read more in the :ref:`User Guide <memory>`.

    Parameters
    ----------
    location: str, pathlib.Path or None
        The path of the base directory to use as a data store
        or None. If None is given, no caching is done and
        the Memory object is completely transparent. When a str is
        given, a ``joblib`` sub-directory of that path is handed to the
        store backend. This option replaces cachedir since version 0.12.

    backend: str, optional, default='local'
        Type of store backend for reading/writing cache files.
        The 'local' backend is using regular filesystem operations to
        manipulate data (open, mv, etc) in the backend.

    mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional
        The memmapping mode used when loading from cache
        numpy arrays. See numpy.load for the meaning of the
        arguments.

    compress: boolean, or integer, optional
        Whether to zip the stored data on disk. If an integer is
        given, it should be between 1 and 9, and sets the amount
        of compression. Note that compressed arrays cannot be
        read by memmapping.

    verbose: int, optional
        Verbosity flag, controls the debug messages that are issued
        as functions are evaluated.

    backend_options: dict, optional
        Contains a dictionary of named parameters used to configure
        the store backend.
    """

    # ------------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------------

    def __init__(
        self,
        location=None,
        backend="local",
        mmap_mode=None,
        compress=False,
        verbose=1,
        backend_options=None,
    ):
        Logger.__init__(self)
        self._verbose = verbose
        self.mmap_mode = mmap_mode
        self.timestamp = time.time()
        self.backend = backend
        self.compress = compress
        # A fresh dict per instance when the caller supplied no options.
        options = {} if backend_options is None else backend_options
        self.backend_options = options

        if compress and mmap_mode is not None:
            warnings.warn("Compressed results cannot be memmapped", stacklevel=2)

        # The attribute keeps the location exactly as given; only str
        # locations get the "joblib" sub-directory appended for the backend.
        self.location = location
        store_root = (
            os.path.join(location, "joblib")
            if isinstance(location, str)
            else location
        )

        self.store_backend = _store_backend_factory(
            backend,
            store_root,
            verbose=self._verbose,
            backend_options=dict(compress=compress, mmap_mode=mmap_mode, **options),
        )

    def cache(
        self,
        func=None,
        ignore=None,
        verbose=None,
        mmap_mode=False,
        cache_validation_callback=None,
    ):
        """Decorates the given function func to only compute its return
        value for input arguments not cached on disk.

        Parameters
        ----------
        func: callable, optional
            The function to be decorated. When omitted, a partial
            application of this method carrying the other arguments is
            returned, so that it can be used as a decorator factory.
        ignore: list of strings
            A list of arguments name to ignore in the hashing
        verbose: integer, optional
            The verbosity mode of the function. By default that
            of the memory object is used.
        mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional
            The memmapping mode used when loading from cache
            numpy arrays. See numpy.load for the meaning of the
            arguments. By default that of the memory object is used.
        cache_validation_callback: callable, optional
            Callable to validate whether or not the cache is valid. When
            the cached function is called with arguments for which a cache
            exists, this callable is called with the metadata of the cached
            result as its sole argument. If it returns True, then the
            cached result is returned, else the cache for these arguments
            is cleared and recomputed.

        Returns
        -------
        decorated_func: MemorizedFunc object
            The returned object is a MemorizedFunc object, that is
            callable (behaves like a function), but offers extra
            methods for cache lookup and management. See the
            documentation for :class:`joblib.memory.MemorizedFunc`.
            Coroutine functions get the async variant of the wrapper, and
            a non-caching wrapper is returned when there is no store
            backend.

        Raises
        ------
        ValueError
            If cache_validation_callback is neither None nor callable.
        """
        if not (
            cache_validation_callback is None or callable(cache_validation_callback)
        ):
            raise ValueError(
                "cache_validation_callback needs to be callable. "
                f"Got {cache_validation_callback}."
            )

        if func is None:
            # Decorator-factory usage: remember the keyword arguments and
            # wait for the function to decorate.
            return functools.partial(
                self.cache,
                ignore=ignore,
                mmap_mode=mmap_mode,
                verbose=verbose,
                cache_validation_callback=cache_validation_callback,
            )

        if self.store_backend is None:
            # No store backend: wrap the function without any caching.
            if asyncio.iscoroutinefunction(func):
                return AsyncNotMemorizedFunc(func)
            return NotMemorizedFunc(func)

        # Unset options fall back to the settings of this Memory object.
        # False (not None) is the "unset" marker for mmap_mode, as None is
        # itself a valid memmapping mode.
        effective_verbose = self._verbose if verbose is None else verbose
        effective_mmap_mode = self.mmap_mode if mmap_mode is False else mmap_mode

        # Re-decorating an already memorized function wraps the underlying
        # function rather than the wrapper.
        target = func.func if isinstance(func, MemorizedFunc) else func

        wrapper_cls = MemorizedFunc
        if asyncio.iscoroutinefunction(target):
            wrapper_cls = AsyncMemorizedFunc
        return wrapper_cls(
            target,
            location=self.store_backend,
            backend=self.backend,
            ignore=ignore,
            mmap_mode=effective_mmap_mode,
            compress=self.compress,
            verbose=effective_verbose,
            timestamp=self.timestamp,
            cache_validation_callback=cache_validation_callback,
        )

    def clear(self, warn=True):
        """Erase the complete cache directory."""
        if warn:
            self.warn("Flushing completely the cache")
        backend = self.store_backend
        if backend is not None:
            backend.clear()

            # The store is now empty, so the _FUNCTION_HASHES table must be
            # reset as well. Otherwise, for a function still listed in that
            # table, results cached after this clear would get cache misses
            # because the function code is not re-written.
            _FUNCTION_HASHES.clear()

    def reduce_size(self, bytes_limit=None, items_limit=None, age_limit=None):
        """Remove cache elements to make the cache fit its limits.

        The limitation can impose that the cache size fits in ``bytes_limit``,
        that the number of cache items is no more than ``items_limit``, and
        that all files in cache are not older than ``age_limit``.

        Parameters
        ----------
        bytes_limit: int | str, optional
            Limit in bytes of the size of the cache. By default, the size of
            the cache is unlimited. When reducing the size of the cache,
            ``joblib`` keeps the most recently accessed items first. If a
            str is passed, it is converted to a number of bytes using units
            { K | M | G} for kilo, mega, giga.

        items_limit: int, optional
            Number of items to limit the cache to. By default, the number of
            items in the cache is unlimited. When reducing the size of the
            cache, ``joblib`` keeps the most recently accessed items first.

        age_limit: datetime.timedelta, optional
            Maximum age of items to limit the cache to. When reducing the size
            of the cache, any items last accessed more than the given length of
            time ago are deleted. Example: to remove files older than 5 days,
            use datetime.timedelta(days=5). Negative timedelta are not
            accepted.
        """
        backend = self.store_backend
        if backend is None:
            # Nothing is cached without a store backend.
            return

        limits = (bytes_limit, items_limit, age_limit)
        if all(limit is None for limit in limits):
            # No limit requested, nothing to enforce.
            return

        # The store backend is in charge of actually enforcing the limits.
        backend.enforce_store_limits(*limits)

    def eval(self, func, *args, **kwargs):
        """Eval function func with arguments `*args` and `**kwargs`,
        in the context of the memory.

        The function is called directly when there is no store backend;
        otherwise the call goes through the cached version of the function,
        so that it is only computed if the cache is not up to date.

        """
        runner = func if self.store_backend is None else self.cache(func)
        return runner(*args, **kwargs)

    # ------------------------------------------------------------------------
    # Private `object` interface
    # ------------------------------------------------------------------------

    def __repr__(self):
        backend = self.store_backend
        location = None if backend is None else backend.location
        return f"{self.__class__.__name__}(location={location})"

    def __getstate__(self):
        """Return the instance state with the timestamp blanked out.

        The timestamp is not stored when pickling, so that the hash does
        not depend on it.
        """
        return {**self.__dict__, "timestamp": None}
1213###############################################################################
1214# cache_validation_callback helpers
1215###############################################################################
def expires_after(
    days=0, seconds=0, microseconds=0, milliseconds=0, minutes=0, hours=0, weeks=0
):
    """Build a cache_validation_callback forcing a recompute after a duration.

    Parameters
    ----------
    days, seconds, microseconds, milliseconds, minutes, hours, weeks: numbers
        Components of the duration, passed as is to datetime.timedelta.

    Returns
    -------
    cache_validation_callback: callable
        Takes the metadata dict of a cached result and returns True while
        the time elapsed since ``metadata["time"]`` is strictly smaller
        than the duration, False otherwise.
    """
    # The duration is fixed once the helper is built: convert it to seconds
    # a single time instead of on every validation.
    max_age_seconds = datetime.timedelta(
        weeks=weeks,
        days=days,
        hours=hours,
        minutes=minutes,
        seconds=seconds,
        milliseconds=milliseconds,
        microseconds=microseconds,
    ).total_seconds()

    def cache_validation_callback(metadata):
        return time.time() - metadata["time"] < max_age_seconds

    return cache_validation_callback