Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/joblib/memory.py: 24%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

398 statements  

1""" 

2A context object for caching a function's return value each time it 

3is called with the same input arguments. 

4 

5""" 

6 

7# Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org> 

8# Copyright (c) 2009 Gael Varoquaux 

9# License: BSD Style, 3 clauses. 

10 

11import asyncio 

12import datetime 

13import functools 

14import inspect 

15import logging 

16import os 

17import pathlib 

18import pydoc 

19import re 

20import textwrap 

21import time 

22import tokenize 

23import traceback 

24import warnings 

25import weakref 

26 

27from . import hashing 

28from ._store_backends import ( 

29 CacheWarning, # noqa 

30 FileSystemStoreBackend, 

31 StoreBackendBase, 

32) 

33from .func_inspect import ( 

34 filter_args, 

35 format_call, 

36 format_signature, 

37 get_func_code, 

38 get_func_name, 

39) 

40from .logger import Logger, format_time, pformat 

41 

# Marker that prefixes the first line of stored function source; the text
# following it on that line is parsed as the function's first line number
# (see extract_first_line).
FIRST_LINE_TEXT = "# first line:"

43 

44# TODO: The following object should have a data store object as a sub 

45# object, and the interface to persist and query should be separated in 

46# the data store. 

47# 

48# This would enable creating 'Memory' objects with a different logic for 

49# pickling that would simply spawn a MemorizedFunc with the same 

50# store (or do we want to copy it to avoid cross-talks?), for instance to 

51# implement HDF5 pickling. 

52 

53# TODO: Same remark for the logger, and probably use the Python logging 

54# mechanism. 

55 

56 

def extract_first_line(func_code):
    """Split a stored function source into its code and first-line number.

    When ``func_code`` begins with ``FIRST_LINE_TEXT``, the remainder of that
    first line is parsed as an integer and the header line is dropped from
    the returned code. Otherwise the code is returned untouched together
    with ``-1``.

    Returns
    -------
    (func_code, first_line): tuple of str and int
    """
    if not func_code.startswith(FIRST_LINE_TEXT):
        return func_code, -1

    # Everything up to the first newline is the header; the rest is the code.
    header, _, body = func_code.partition("\n")
    first_line = int(header[len(FIRST_LINE_TEXT) :])
    return body, first_line

68 

69 

class JobLibCollisionWarning(UserWarning):
    """Warning category signalling a possible clash between function names."""

72 

73 

# Registry mapping a backend name to its store backend class; extended by
# register_store_backend and read by _store_backend_factory.
_STORE_BACKENDS = {"local": FileSystemStoreBackend}

75 

76 

def register_store_backend(backend_name, backend):
    """Extend available store backends.

    The Memory, MemorizeResult and MemorizeFunc objects are designed to be
    agnostic to the type of store used behind. By default, the local file
    system is used but this function gives the possibility to extend joblib's
    memory pattern with other types of storage such as cloud storage (S3, GCS,
    OpenStack, HadoopFS, etc) or blob DBs.

    Parameters
    ----------
    backend_name: str
        The name identifying the store backend being registered. For example,
        'local' is used with FileSystemStoreBackend.
    backend: StoreBackendBase subclass
        A class (not an instance) that implements the StoreBackendBase
        interface.

    Raises
    ------
    ValueError
        If ``backend_name`` is not a string, or if ``backend`` is not a class
        inheriting from StoreBackendBase.
    """
    if not isinstance(backend_name, str):
        raise ValueError(
            "Store backend name should be a string, '{0}' given.".format(backend_name)
        )
    # issubclass() raises TypeError when its first argument is not a class
    # (None, an instance, a string, ...), so make sure we have a class first
    # in order to always report invalid backends with a ValueError.
    if not isinstance(backend, type) or not issubclass(backend, StoreBackendBase):
        raise ValueError(
            "Store backend should inherit StoreBackendBase, '{0}' given.".format(
                backend
            )
        )

    _STORE_BACKENDS[backend_name] = backend

107 

108 

def _store_backend_factory(backend, location, verbose=0, backend_options=None):
    """Build (or pass through) the store backend matching ``location``.

    - A ``StoreBackendBase`` instance given as ``location`` is returned as is.
    - A string or ``pathlib.Path`` location selects the class registered under
      the name ``backend``, instantiates it and configures it with the
      location, ``verbose`` and ``backend_options``.
    - ``None`` yields ``None``; any other type also yields ``None`` after
      emitting a ``UserWarning``.

    Raises
    ------
    TypeError
        If ``location`` is a string/path and no backend is registered under
        the name ``backend``.
    """
    options = {} if backend_options is None else backend_options

    # Path objects are handled exactly like their string form.
    if isinstance(location, pathlib.Path):
        location = str(location)

    # An already instantiated store backend is handed back untouched.
    if isinstance(location, StoreBackendBase):
        return location

    if isinstance(location, str):
        store = None
        location = os.path.expanduser(location)
        # Look through the registered backends for the requested name.
        for registered_name, backend_cls in _STORE_BACKENDS.items():
            if backend == registered_name:
                store = backend_cls()

        # There is no fallback: an unregistered backend name is an error.
        if store is None:
            raise TypeError(
                "Unknown location {0} or backend {1}".format(location, backend)
            )

        # The store backend is configured with the extra named parameters,
        # some of which are specific to the underlying store backend.
        store.configure(location, verbose=verbose, backend_options=options)
        return store

    if location is not None:
        warnings.warn(
            "Instantiating a backend using a {} as a location is not "
            "supported by joblib. Returning None instead.".format(
                location.__class__.__name__
            ),
            UserWarning,
        )

    return None

150 

151 

def _build_func_identifier(func):
    """Return a roughly unique, path-like identifier for ``func``.

    The identifier joins the module components and the function name
    reported by ``get_func_name`` with the OS path separator, which is the
    historical filesystem-like layout.
    """
    module_parts, name = get_func_name(func)
    path_parts = [*module_parts, name]
    return os.path.join(*path_parts)

157 

158 

# In-memory record of function hashes, keyed weakly by the function object.
# It lets MemorizedFunc tell whether a function definition has changed
# without having to read the function source stored on disk.
_FUNCTION_HASHES = weakref.WeakKeyDictionary()

162 

163 

164############################################################################### 

165# class `MemorizedResult` 

166############################################################################### 

class MemorizedResult(Logger):
    """Reference to a value stored in the joblib cache.

    Parameters
    ----------
    location: str, pathlib.Path or store backend instance
        The location of joblib cache. Depends on the store backend used.

    call_id: sequence
        Identifier of the cached call. Its first item is exposed as
        ``func_id`` (and ``func``), its second item as ``args_id``.

    backend: str
        Type of store backend for reading/writing cache files.
        Default is 'local'.

    mmap_mode: {None, 'r+', 'r', 'w+', 'c'}
        The memmapping mode used when loading from cache numpy arrays. See
        numpy.load for the meaning of the different values.

    verbose: int
        verbosity level (0 means no message).

    timestamp, metadata:
        for internal use only. When ``metadata`` is not given it is read
        from the store backend.
    """

    def __init__(
        self,
        location,
        call_id,
        backend="local",
        mmap_mode=None,
        verbose=0,
        timestamp=None,
        metadata=None,
    ):
        Logger.__init__(self)
        # NOTE: attributes are assigned in a fixed order because the
        # instance __dict__ is what gets pickled (see __getstate__).
        self._call_id = call_id
        self.store_backend = _store_backend_factory(
            backend,
            location,
            verbose=verbose,
            backend_options={"mmap_mode": mmap_mode},
        )
        self.mmap_mode = mmap_mode

        # Fall back to the metadata persisted in the store when none is given.
        if metadata is None:
            metadata = self.store_backend.get_metadata(self._call_id)
        self.metadata = metadata

        self.duration = self.metadata.get("duration", None)
        self.verbose = verbose
        self.timestamp = timestamp

    @property
    def func(self):
        """Alias of ``func_id``."""
        return self.func_id

    @property
    def func_id(self):
        """First component of the call identifier."""
        return self._call_id[0]

    @property
    def args_id(self):
        """Second component of the call identifier."""
        return self._call_id[1]

    def get(self):
        """Read value from cache and return it.

        Raises
        ------
        KeyError
            If the store backend raises a ValueError while loading the item.
        """
        try:
            value = self.store_backend.load_item(
                self._call_id,
                timestamp=self.timestamp,
                metadata=self.metadata,
                verbose=self.verbose,
            )
        except ValueError as exc:
            corrupted_path = os.path.join(self.store_backend.location, *self._call_id)
            raise KeyError(
                "Error while trying to load a MemorizedResult's value. "
                "It seems that this folder is corrupted : {}".format(corrupted_path)
            ) from exc
        return value

    def clear(self):
        """Clear value from cache"""
        self.store_backend.clear_item(self._call_id)

    def __repr__(self):
        template = '{}(location="{}", func="{}", args_id="{}")'
        return template.format(
            self.__class__.__name__, self.store_backend.location, *self._call_id
        )

    def __getstate__(self):
        # The timestamp is blanked out in the pickled state.
        return {**self.__dict__, "timestamp": None}

270 

271 

class NotMemorizedResult:
    """Holder for an arbitrary in-memory value.

    Stands in for MemorizedResult when there is no cache: the value is kept
    on the object itself rather than in a store.
    """

    __slots__ = ("value", "valid")

    def __init__(self, value):
        self.value = value
        self.valid = True

    def get(self):
        """Return the held value, or raise KeyError once it was cleared."""
        if not self.valid:
            raise KeyError("No value stored.")
        return self.value

    def clear(self):
        """Drop the held value and mark the result as invalid."""
        self.value = None
        self.valid = False

    def __repr__(self):
        class_name = self.__class__.__name__
        if not self.valid:
            return class_name + " with no value"
        return f"{class_name}({pformat(self.value)})"

    # Explicit pickling support is needed because the class uses __slots__.
    def __getstate__(self):
        return dict(valid=self.valid, value=self.value)

    def __setstate__(self, state):
        self.valid = state["valid"]
        self.value = state["value"]

309 

310 

311############################################################################### 

312# class `NotMemorizedFunc` 

313############################################################################### 

class NotMemorizedFunc:
    """Pass-through wrapper around a function.

    Used in place of MemorizedFunc when there is no cache. It exposes the
    same methods but never writes anything on disk.

    Attributes
    ----------
    func: callable
        Original undecorated function.
    """

    # Kept deliberately minimal so that wrapping costs as little as possible.
    def __init__(self, func):
        self.func = func

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def call_and_shelve(self, *args, **kwargs):
        """Call the function and wrap its output in a NotMemorizedResult."""
        output = self.func(*args, **kwargs)
        return NotMemorizedResult(output)

    def __repr__(self):
        return f"{self.__class__.__name__}(func={self.func})"

    def clear(self, warn=True):
        """Do nothing; ``warn`` only mirrors MemorizedFunc.clear."""

    def call(self, *args, **kwargs):
        """Call the function and return ``(output, {})``."""
        output = self.func(*args, **kwargs)
        return output, {}

    def check_call_in_cache(self, *args, **kwargs):
        """Always False: nothing is ever cached."""
        return False

348 

349 

350############################################################################### 

351# class `AsyncNotMemorizedFunc` 

352############################################################################### 

class AsyncNotMemorizedFunc(NotMemorizedFunc):
    """NotMemorizedFunc variant for functions whose call must be awaited."""

    async def call_and_shelve(self, *args, **kwargs):
        """Await the wrapped function and wrap its output."""
        output = await self.func(*args, **kwargs)
        return NotMemorizedResult(output)

356 

357 

358############################################################################### 

359# class `MemorizedFunc` 

360############################################################################### 

361class MemorizedFunc(Logger): 

362 """Callable object decorating a function for caching its return value 

363 each time it is called. 

364 

365 Methods are provided to inspect the cache or clean it. 

366 

367 Attributes 

368 ---------- 

369 func: callable 

370 The original, undecorated, function. 

371 

372 location: string 

373 The location of joblib cache. Depends on the store backend used. 

374 

375 backend: str 

376 Type of store backend for reading/writing cache files. 

377 Default is 'local', in which case the location is the path to a 

378 disk storage. 

379 

380 ignore: list or None 

381 List of variable names to ignore when choosing whether to 

382 recompute. 

383 

384 mmap_mode: {None, 'r+', 'r', 'w+', 'c'} 

385 The memmapping mode used when loading from cache 

386 numpy arrays. See numpy.load for the meaning of the different 

387 values. 

388 

389 compress: boolean, or integer 

390 Whether to zip the stored data on disk. If an integer is 

391 given, it should be between 1 and 9, and sets the amount 

392 of compression. Note that compressed arrays cannot be 

393 read by memmapping. 

394 

395 verbose: int, optional 

396 The verbosity flag, controls messages that are issued as 

397 the function is evaluated. 

398 

399 cache_validation_callback: callable, optional 

400 Callable to check if a result in cache is valid or is to be recomputed. 

401 When the function is called with arguments for which a cache exists, 

402 the callback is called with the cache entry's metadata as its sole 

403 argument. If it returns True, the cached result is returned, else the 

404 cache for these arguments is cleared and the result is recomputed. 

405 """ 

406 

407 # ------------------------------------------------------------------------ 

408 # Public interface 

409 # ------------------------------------------------------------------------ 

410 

411 def __init__( 

412 self, 

413 func, 

414 location, 

415 backend="local", 

416 ignore=None, 

417 mmap_mode=None, 

418 compress=False, 

419 verbose=1, 

420 timestamp=None, 

421 cache_validation_callback=None, 

422 ): 

423 Logger.__init__(self) 

424 self.mmap_mode = mmap_mode 

425 self.compress = compress 

426 self.func = func 

427 self.cache_validation_callback = cache_validation_callback 

428 self.func_id = _build_func_identifier(func) 

429 self.ignore = ignore if ignore is not None else [] 

430 self._verbose = verbose 

431 

432 # retrieve store object from backend type and location. 

433 self.store_backend = _store_backend_factory( 

434 backend, 

435 location, 

436 verbose=verbose, 

437 backend_options=dict(compress=compress, mmap_mode=mmap_mode), 

438 ) 

439 if self.store_backend is not None: 

440 # Create func directory on demand. 

441 self.store_backend.store_cached_func_code([self.func_id]) 

442 

443 self.timestamp = timestamp if timestamp is not None else time.time() 

444 try: 

445 functools.update_wrapper(self, func) 

446 except Exception: 

447 pass # Objects like ufunc don't like that 

448 if inspect.isfunction(func): 

449 doc = pydoc.TextDoc().document(func) 

450 # Remove blank line 

451 doc = doc.replace("\n", "\n\n", 1) 

452 # Strip backspace-overprints for compatibility with autodoc 

453 doc = re.sub("\x08.", "", doc) 

454 else: 

455 # Pydoc does a poor job on other objects 

456 doc = func.__doc__ 

457 self.__doc__ = "Memoized version of %s" % doc 

458 

459 self._func_code_info = None 

460 self._func_code_id = None 

461 

462 def _is_in_cache_and_valid(self, call_id): 

463 """Check if the function call is cached and valid for given arguments. 

464 

465 - Compare the function code with the one from the cached function, 

466 asserting if it has changed. 

467 - Check if the function call is present in the cache. 

468 - Call `cache_validation_callback` for user define cache validation. 

469 

470 Returns True if the function call is in cache and can be used, and 

471 returns False otherwise. 

472 """ 

473 # Check if the code of the function has changed 

474 if not self._check_previous_func_code(stacklevel=4): 

475 return False 

476 

477 # Check if this specific call is in the cache 

478 if not self.store_backend.contains_item(call_id): 

479 return False 

480 

481 # Call the user defined cache validation callback 

482 metadata = self.store_backend.get_metadata(call_id) 

483 if ( 

484 self.cache_validation_callback is not None 

485 and not self.cache_validation_callback(metadata) 

486 ): 

487 self.store_backend.clear_item(call_id) 

488 return False 

489 

490 return True 

491 

492 def _cached_call(self, args, kwargs, shelving): 

493 """Call wrapped function and cache result, or read cache if available. 

494 

495 This function returns the wrapped function output or a reference to 

496 the cached result. 

497 

498 Arguments: 

499 ---------- 

500 

501 args, kwargs: list and dict 

502 input arguments for wrapped function 

503 

504 shelving: bool 

505 True when called via the call_and_shelve function. 

506 

507 

508 Returns 

509 ------- 

510 output: Output of the wrapped function if shelving is false, or a 

511 MemorizedResult reference to the value if shelving is true. 

512 metadata: dict containing the metadata associated with the call. 

513 """ 

514 args_id = self._get_args_id(*args, **kwargs) 

515 call_id = (self.func_id, args_id) 

516 _, func_name = get_func_name(self.func) 

517 func_info = self.store_backend.get_cached_func_info([self.func_id]) 

518 location = func_info["location"] 

519 

520 if self._verbose >= 20: 

521 logging.basicConfig(level=logging.INFO) 

522 _, signature = format_signature(self.func, *args, **kwargs) 

523 self.info( 

524 textwrap.dedent( 

525 f""" 

526 Querying {func_name} with signature 

527 {signature}. 

528 

529 (argument hash {args_id}) 

530 

531 The store location is {location}. 

532 """ 

533 ) 

534 ) 

535 

536 # Compare the function code with the previous to see if the 

537 # function code has changed and check if the results are present in 

538 # the cache. 

539 if self._is_in_cache_and_valid(call_id): 

540 if shelving: 

541 return self._get_memorized_result(call_id), {} 

542 

543 try: 

544 start_time = time.time() 

545 output = self._load_item(call_id) 

546 if self._verbose > 4: 

547 self._print_duration( 

548 time.time() - start_time, context="cache loaded " 

549 ) 

550 return output, {} 

551 except Exception: 

552 # XXX: Should use an exception logger 

553 _, signature = format_signature(self.func, *args, **kwargs) 

554 self.warn( 

555 "Exception while loading results for {}\n {}".format( 

556 signature, traceback.format_exc() 

557 ) 

558 ) 

559 

560 if self._verbose > 10: 

561 self.warn( 

562 f"Computing func {func_name}, argument hash {args_id} " 

563 f"in location {location}" 

564 ) 

565 

566 # Returns the output but not the metadata 

567 return self._call(call_id, args, kwargs, shelving) 

568 

569 @property 

570 def func_code_info(self): 

571 # 3-tuple property containing: the function source code, source file, 

572 # and first line of the code inside the source file 

573 if hasattr(self.func, "__code__"): 

574 if self._func_code_id is None: 

575 self._func_code_id = id(self.func.__code__) 

576 elif id(self.func.__code__) != self._func_code_id: 

577 # Be robust to dynamic reassignments of self.func.__code__ 

578 self._func_code_info = None 

579 

580 if self._func_code_info is None: 

581 # Cache the source code of self.func . Provided that get_func_code 

582 # (which should be called once on self) gets called in the process 

583 # in which self.func was defined, this caching mechanism prevents 

584 # undesired cache clearing when the cached function is called in 

585 # an environment where the introspection utilities get_func_code 

586 # relies on do not work (typically, in joblib child processes). 

587 # See #1035 for more info 

588 # TODO (pierreglaser): do the same with get_func_name? 

589 self._func_code_info = get_func_code(self.func) 

590 return self._func_code_info 

591 

592 def call_and_shelve(self, *args, **kwargs): 

593 """Call wrapped function, cache result and return a reference. 

594 

595 This method returns a reference to the cached result instead of the 

596 result itself. The reference object is small and picklable, allowing 

597 to send or store it easily. Call .get() on reference object to get 

598 result. 

599 

600 Returns 

601 ------- 

602 cached_result: MemorizedResult or NotMemorizedResult 

603 reference to the value returned by the wrapped function. The 

604 class "NotMemorizedResult" is used when there is no cache 

605 activated (e.g. location=None in Memory). 

606 """ 

607 # Return the wrapped output, without the metadata 

608 return self._cached_call(args, kwargs, shelving=True)[0] 

609 

610 def __call__(self, *args, **kwargs): 

611 # Return the output, without the metadata 

612 return self._cached_call(args, kwargs, shelving=False)[0] 

613 

614 def __getstate__(self): 

615 # Make sure self.func's source is introspected prior to being pickled - 

616 # code introspection utilities typically do not work inside child 

617 # processes 

618 _ = self.func_code_info 

619 

620 # We don't store the timestamp when pickling, to avoid the hash 

621 # depending from it. 

622 state = self.__dict__.copy() 

623 state["timestamp"] = None 

624 

625 # Invalidate the code id as id(obj) will be different in the child 

626 state["_func_code_id"] = None 

627 

628 return state 

629 

630 def check_call_in_cache(self, *args, **kwargs): 

631 """Check if the function call is cached and valid for given arguments. 

632 

633 Does not call the function or do any work besides function inspection 

634 and argument hashing. 

635 

636 - Compare the function code with the one from the cached function, 

637 asserting if it has changed. 

638 - Check if the function call is present in the cache. 

639 - Call `cache_validation_callback` for user define cache validation. 

640 

641 Returns 

642 ------- 

643 is_call_in_cache: bool 

644 Whether or not the function call is in cache and can be used. 

645 """ 

646 call_id = (self.func_id, self._get_args_id(*args, **kwargs)) 

647 return self._is_in_cache_and_valid(call_id) 

648 

649 # ------------------------------------------------------------------------ 

650 # Private interface 

651 # ------------------------------------------------------------------------ 

652 

653 def _get_args_id(self, *args, **kwargs): 

654 """Return the input parameter hash of a result.""" 

655 return hashing.hash( 

656 filter_args(self.func, self.ignore, args, kwargs), 

657 coerce_mmap=self.mmap_mode is not None, 

658 ) 

659 

660 def _hash_func(self): 

661 """Hash a function to key the online cache""" 

662 func_code_h = hash(getattr(self.func, "__code__", None)) 

663 return id(self.func), hash(self.func), func_code_h 

664 

665 def _write_func_code(self, func_code, first_line): 

666 """Write the function code and the filename to a file.""" 

667 # We store the first line because the filename and the function 

668 # name is not always enough to identify a function: people 

669 # sometimes have several functions named the same way in a 

670 # file. This is bad practice, but joblib should be robust to bad 

671 # practice. 

672 func_code = "%s %i\n%s" % (FIRST_LINE_TEXT, first_line, func_code) 

673 self.store_backend.store_cached_func_code([self.func_id], func_code) 

674 

675 # Also store in the in-memory store of function hashes 

676 is_named_callable = ( 

677 hasattr(self.func, "__name__") and self.func.__name__ != "<lambda>" 

678 ) 

679 if is_named_callable: 

680 # Don't do this for lambda functions or strange callable 

681 # objects, as it ends up being too fragile 

682 func_hash = self._hash_func() 

683 try: 

684 _FUNCTION_HASHES[self.func] = func_hash 

685 except TypeError: 

686 # Some callable are not hashable 

687 pass 

688 

689 def _check_previous_func_code(self, stacklevel=2): 

690 """ 

691 stacklevel is the depth a which this function is called, to 

692 issue useful warnings to the user. 

693 """ 

694 # First check if our function is in the in-memory store. 

695 # Using the in-memory store not only makes things faster, but it 

696 # also renders us robust to variations of the files when the 

697 # in-memory version of the code does not vary 

698 try: 

699 if self.func in _FUNCTION_HASHES: 

700 # We use as an identifier the id of the function and its 

701 # hash. This is more likely to falsely change than have hash 

702 # collisions, thus we are on the safe side. 

703 func_hash = self._hash_func() 

704 if func_hash == _FUNCTION_HASHES[self.func]: 

705 return True 

706 except TypeError: 

707 # Some callables are not hashable 

708 pass 

709 

710 # Here, we go through some effort to be robust to dynamically 

711 # changing code and collision. We cannot inspect.getsource 

712 # because it is not reliable when using IPython's magic "%run". 

713 func_code, source_file, first_line = self.func_code_info 

714 try: 

715 old_func_code, old_first_line = extract_first_line( 

716 self.store_backend.get_cached_func_code([self.func_id]) 

717 ) 

718 except (IOError, OSError): # some backend can also raise OSError 

719 self._write_func_code(func_code, first_line) 

720 return False 

721 if old_func_code == func_code: 

722 return True 

723 

724 # We have differing code, is this because we are referring to 

725 # different functions, or because the function we are referring to has 

726 # changed? 

727 

728 _, func_name = get_func_name( 

729 self.func, resolv_alias=False, win_characters=False 

730 ) 

731 if old_first_line == first_line == -1 or func_name == "<lambda>": 

732 if not first_line == -1: 

733 func_description = "{0} ({1}:{2})".format( 

734 func_name, source_file, first_line 

735 ) 

736 else: 

737 func_description = func_name 

738 warnings.warn( 

739 JobLibCollisionWarning( 

740 "Cannot detect name collisions for function '{0}'".format( 

741 func_description 

742 ) 

743 ), 

744 stacklevel=stacklevel, 

745 ) 

746 

747 # Fetch the code at the old location and compare it. If it is the 

748 # same than the code store, we have a collision: the code in the 

749 # file has not changed, but the name we have is pointing to a new 

750 # code block. 

751 if not old_first_line == first_line and source_file is not None: 

752 if os.path.exists(source_file): 

753 _, func_name = get_func_name(self.func, resolv_alias=False) 

754 num_lines = len(func_code.split("\n")) 

755 with tokenize.open(source_file) as f: 

756 on_disk_func_code = f.readlines()[ 

757 old_first_line - 1 : old_first_line - 1 + num_lines - 1 

758 ] 

759 on_disk_func_code = "".join(on_disk_func_code) 

760 possible_collision = ( 

761 on_disk_func_code.rstrip() == old_func_code.rstrip() 

762 ) 

763 else: 

764 possible_collision = source_file.startswith("<doctest ") 

765 if possible_collision: 

766 warnings.warn( 

767 JobLibCollisionWarning( 

768 "Possible name collisions between functions " 

769 "'%s' (%s:%i) and '%s' (%s:%i)" 

770 % ( 

771 func_name, 

772 source_file, 

773 old_first_line, 

774 func_name, 

775 source_file, 

776 first_line, 

777 ) 

778 ), 

779 stacklevel=stacklevel, 

780 ) 

781 

782 # The function has changed, wipe the cache directory. 

783 # XXX: Should be using warnings, and giving stacklevel 

784 if self._verbose > 10: 

785 _, func_name = get_func_name(self.func, resolv_alias=False) 

786 self.warn( 

787 "Function {0} (identified by {1}) has changed.".format( 

788 func_name, self.func_id 

789 ) 

790 ) 

791 self.clear(warn=True) 

792 return False 

793 

794 def clear(self, warn=True): 

795 """Empty the function's cache.""" 

796 func_id = self.func_id 

797 if self._verbose > 0 and warn: 

798 self.warn("Clearing function cache identified by %s" % func_id) 

799 self.store_backend.clear_path( 

800 [ 

801 func_id, 

802 ] 

803 ) 

804 

805 func_code, _, first_line = self.func_code_info 

806 self._write_func_code(func_code, first_line) 

807 

808 def call(self, *args, **kwargs): 

809 """Force the execution of the function with the given arguments. 

810 

811 The output values will be persisted, i.e., the cache will be updated 

812 with any new values. 

813 

814 Parameters 

815 ---------- 

816 *args: arguments 

817 The arguments. 

818 **kwargs: keyword arguments 

819 Keyword arguments. 

820 

821 Returns 

822 ------- 

823 output : object 

824 The output of the function call. 

825 metadata : dict 

826 The metadata associated with the call. 

827 """ 

828 call_id = (self.func_id, self._get_args_id(*args, **kwargs)) 

829 

830 # Return the output and the metadata 

831 return self._call(call_id, args, kwargs) 

832 

833 def _call(self, call_id, args, kwargs, shelving=False): 

834 # Return the output and the metadata 

835 self._before_call(args, kwargs) 

836 start_time = time.time() 

837 output = self.func(*args, **kwargs) 

838 return self._after_call(call_id, args, kwargs, shelving, output, start_time) 

839 

840 def _before_call(self, args, kwargs): 

841 if self._verbose > 0: 

842 print(format_call(self.func, args, kwargs)) 

843 

844 def _after_call(self, call_id, args, kwargs, shelving, output, start_time): 

845 self.store_backend.dump_item(call_id, output, verbose=self._verbose) 

846 duration = time.time() - start_time 

847 if self._verbose > 0: 

848 self._print_duration(duration) 

849 metadata = self._persist_input(duration, call_id, args, kwargs) 

850 if shelving: 

851 return self._get_memorized_result(call_id, metadata), metadata 

852 

853 if self.mmap_mode is not None: 

854 # Memmap the output at the first call to be consistent with 

855 # later calls 

856 output = self._load_item(call_id, metadata) 

857 return output, metadata 

858 

859 def _persist_input(self, duration, call_id, args, kwargs, this_duration_limit=0.5): 

860 """Save a small summary of the call using json format in the 

861 output directory. 

862 

863 output_dir: string 

864 directory where to write metadata. 

865 

866 duration: float 

867 time taken by hashing input arguments, calling the wrapped 

868 function and persisting its output. 

869 

870 args, kwargs: list and dict 

871 input arguments for wrapped function 

872 

873 this_duration_limit: float 

874 Max execution time for this function before issuing a warning. 

875 """ 

876 start_time = time.time() 

877 argument_dict = filter_args(self.func, self.ignore, args, kwargs) 

878 

879 input_repr = dict((k, repr(v)) for k, v in argument_dict.items()) 

880 # This can fail due to race-conditions with multiple 

881 # concurrent joblibs removing the file or the directory 

882 metadata = { 

883 "duration": duration, 

884 "input_args": input_repr, 

885 "time": start_time, 

886 } 

887 

888 self.store_backend.store_metadata(call_id, metadata) 

889 

890 this_duration = time.time() - start_time 

891 if this_duration > this_duration_limit: 

892 # This persistence should be fast. It will not be if repr() takes 

893 # time and its output is large, because json.dump will have to 

894 # write a large file. This should not be an issue with numpy arrays 

895 # for which repr() always output a short representation, but can 

896 # be with complex dictionaries. Fixing the problem should be a 

897 # matter of replacing repr() above by something smarter. 

898 warnings.warn( 

899 "Persisting input arguments took %.2fs to run." 

900 "If this happens often in your code, it can cause " 

901 "performance problems " 

902 "(results will be correct in all cases). " 

903 "The reason for this is probably some large input " 

904 "arguments for a wrapped function." % this_duration, 

905 stacklevel=5, 

906 ) 

907 return metadata 

908 

909 def _get_memorized_result(self, call_id, metadata=None): 

910 return MemorizedResult( 

911 self.store_backend, 

912 call_id, 

913 metadata=metadata, 

914 timestamp=self.timestamp, 

915 verbose=self._verbose - 1, 

916 ) 

917 

918 def _load_item(self, call_id, metadata=None): 

919 return self.store_backend.load_item( 

920 call_id, metadata=metadata, timestamp=self.timestamp, verbose=self._verbose 

921 ) 

922 

def _print_duration(self, duration, context=""):
    """Print the function name, ``context`` and formatted ``duration``.

    The message is left-padded with underscores up to 80 characters; longer
    messages are printed without padding.
    """
    _module, func_name = get_func_name(self.func)
    line = "{} {}- {}".format(func_name, context, format_time(duration))
    padding = "_" * max(0, 80 - len(line))
    print(padding + line)

927 

928 # ------------------------------------------------------------------------ 

929 # Private `object` interface 

930 # ------------------------------------------------------------------------ 

931 

def __repr__(self):
    """Return ``ClassName(func=..., location=...)`` describing this object."""
    fields = {
        "class_name": self.__class__.__name__,
        "func": self.func,
        "location": self.store_backend.location,
    }
    return "{class_name}(func={func}, location={location})".format(**fields)

938 

939 

940############################################################################### 

941# class `AsyncMemorizedFunc` 

942############################################################################### 

class AsyncMemorizedFunc(MemorizedFunc):
    """Coroutine flavour of ``MemorizedFunc``.

    Every entry point is an ``async def``. Values obtained from the parent
    machinery are awaited when they turn out to be coroutines and are used
    as-is otherwise.
    """

    async def __call__(self, *args, **kwargs):
        """Run the cached call and return its value, without the metadata."""
        result = self._cached_call(args, kwargs, shelving=False)
        if asyncio.iscoroutine(result):
            result = await result
        # First element is the value; the rest (metadata) is dropped.
        return result[0]

    async def call_and_shelve(self, *args, **kwargs):
        """Run the cached call in shelving mode, dropping the metadata."""
        result = self._cached_call(args, kwargs, shelving=True)
        if asyncio.iscoroutine(result):
            result = await result
        # First element is the value; the rest (metadata) is dropped.
        return result[0]

    async def call(self, *args, **kwargs):
        """Delegate to the parent ``call`` and await its result if needed."""
        result = super().call(*args, **kwargs)
        if asyncio.iscoroutine(result):
            return await result
        return result

    async def _call(self, call_id, args, kwargs, shelving=False):
        """Await the wrapped function between the before/after call hooks."""
        self._before_call(args, kwargs)
        started = time.time()
        value = await self.func(*args, **kwargs)
        return self._after_call(call_id, args, kwargs, shelving, value, started)

963 

964 

965############################################################################### 

966# class `Memory` 

967############################################################################### 

class Memory(Logger):
    """Context object that caches the return values of functions.

    A function wrapped through this object is only recomputed for input
    arguments that have no cached result. All values are cached on the
    filesystem, in a deep directory structure.

    Read more in the :ref:`User Guide <memory>`.

    Parameters
    ----------
    location: str, pathlib.Path or None
        Base directory of the data store, or None. With None, nothing is
        cached and the Memory object is completely transparent. This
        option replaces cachedir since version 0.12.

    backend: str, optional, default='local'
        Kind of store backend used to read and write cache files. The
        'local' backend relies on regular filesystem operations
        (open, mv, etc).

    mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional
        Memmapping mode used when numpy arrays are loaded from the cache.
        See numpy.load for the meaning of the values.

    compress: boolean, or integer, optional
        Whether to zip the data stored on disk. An integer should lie
        between 1 and 9 and sets the amount of compression. Compressed
        arrays cannot be read by memmapping.

    verbose: int, optional
        Verbosity flag controlling the debug messages issued while
        functions are evaluated.

    backend_options: dict, optional
        Named parameters used to configure the store backend.
    """

    # ------------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------------

    def __init__(
        self,
        location=None,
        backend="local",
        mmap_mode=None,
        compress=False,
        verbose=1,
        backend_options=None,
    ):
        Logger.__init__(self)
        self._verbose = verbose
        self.mmap_mode = mmap_mode
        self.timestamp = time.time()
        self.backend = backend
        self.compress = compress
        self.backend_options = {} if backend_options is None else backend_options

        if compress and mmap_mode is not None:
            warnings.warn("Compressed results cannot be memmapped", stacklevel=2)

        # The public attribute keeps the location exactly as given; only
        # string locations get a "joblib" sub-directory for the store.
        self.location = location
        store_root = (
            os.path.join(location, "joblib") if isinstance(location, str) else location
        )

        store_options = dict(
            compress=compress, mmap_mode=mmap_mode, **self.backend_options
        )
        self.store_backend = _store_backend_factory(
            backend,
            store_root,
            verbose=self._verbose,
            backend_options=store_options,
        )

    def cache(
        self,
        func=None,
        ignore=None,
        verbose=None,
        mmap_mode=False,
        cache_validation_callback=None,
    ):
        """Wrap ``func`` so that it is only computed for uncached inputs.

        Parameters
        ----------
        func: callable, optional
            The function to decorate. When omitted, a partial application
            of this method is returned so that the other keyword arguments
            can be given in decorator form.
        ignore: list of strings
            Names of the arguments to leave out of the hashing.
        verbose: integer, optional
            Verbosity of the wrapped function. Defaults to the verbosity
            of the memory object.
        mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional
            Memmapping mode used when numpy arrays are loaded from the
            cache. See numpy.load for the meaning of the values. Defaults
            to the mode of the memory object.
        cache_validation_callback: callable, optional
            Callable deciding whether an existing cache entry is valid.
            When the cached function is called with arguments for which a
            cache exists, it receives the metadata of the cached result as
            its sole argument. If it returns True the cached result is
            returned, else the cache for these arguments is cleared and
            recomputed.

        Returns
        -------
        decorated_func: MemorizedFunc object
            A callable object behaving like the function, with extra
            methods for cache lookup and management. See the
            documentation for :class:`joblib.memory.MemorizedFunc`.

        Raises
        ------
        ValueError
            If ``cache_validation_callback`` is neither None nor callable.
        """
        if not (cache_validation_callback is None or callable(cache_validation_callback)):
            raise ValueError(
                "cache_validation_callback needs to be callable. "
                f"Got {cache_validation_callback}."
            )

        if func is None:
            # Decorator-with-arguments form: hand back a partial that only
            # waits for the function itself.
            return functools.partial(
                self.cache,
                ignore=ignore,
                mmap_mode=mmap_mode,
                verbose=verbose,
                cache_validation_callback=cache_validation_callback,
            )

        if self.store_backend is None:
            # No store: return a pass-through wrapper instead.
            if inspect.iscoroutinefunction(func):
                return AsyncNotMemorizedFunc(func)
            return NotMemorizedFunc(func)

        if verbose is None:
            verbose = self._verbose
        # False (not None) means "not given", as None is a valid mode.
        if mmap_mode is False:
            mmap_mode = self.mmap_mode
        # Re-wrapping an already memorized function: use the raw function.
        if isinstance(func, MemorizedFunc):
            func = func.func

        if inspect.iscoroutinefunction(func):
            wrapper_cls = AsyncMemorizedFunc
        else:
            wrapper_cls = MemorizedFunc
        wrapper_options = dict(
            location=self.store_backend,
            backend=self.backend,
            ignore=ignore,
            mmap_mode=mmap_mode,
            compress=self.compress,
            verbose=verbose,
            timestamp=self.timestamp,
            cache_validation_callback=cache_validation_callback,
        )
        return wrapper_cls(func, **wrapper_options)

    def clear(self, warn=True):
        """Erase the complete cache directory."""
        if warn:
            self.warn("Flushing completely the cache")
        backend = self.store_backend
        if backend is not None:
            backend.clear()

            # The store is now empty, so the _FUNCTION_HASHES table must be
            # reset too. Otherwise a function still listed in it would get
            # cache misses for results cached after this clear, because its
            # code would not be written again.
            _FUNCTION_HASHES.clear()

    def reduce_size(self, bytes_limit=None, items_limit=None, age_limit=None):
        """Remove cache elements to make the cache fit its limits.

        The limits can require that the cache size fits in ``bytes_limit``,
        that the cache holds no more than ``items_limit`` items, and that no
        file in the cache is older than ``age_limit``.

        Parameters
        ----------
        bytes_limit: int | str, optional
            Limit in bytes of the size of the cache. By default, the size of
            the cache is unlimited. When reducing the size of the cache,
            ``joblib`` keeps the most recently accessed items first. If a
            str is passed, it is converted to a number of bytes using units
            { K | M | G} for kilo, mega, giga.

        items_limit: int, optional
            Number of items to limit the cache to. By default, the number of
            items in the cache is unlimited. When reducing the size of the
            cache, ``joblib`` keeps the most recently accessed items first.

        age_limit: datetime.timedelta, optional
            Maximum age of items to limit the cache to. When reducing the size
            of the cache, any items last accessed more than the given length of
            time ago are deleted. Example: to remove files older than 5 days,
            use datetime.timedelta(days=5). Negative timedelta are not
            accepted.
        """
        limits = (bytes_limit, items_limit, age_limit)
        # Nothing to do without a store, or when no limit was requested.
        if self.store_backend is None or all(limit is None for limit in limits):
            return

        # The store backend is in charge of actually enforcing the limits.
        self.store_backend.enforce_store_limits(*limits)

    def eval(self, func, *args, **kwargs):
        """Evaluate ``func(*args, **kwargs)`` in the context of the memory.

        This method works similarly to the builtin `apply`, except
        that the function is called only if the cache is not
        up to date.
        """
        target = func if self.store_backend is None else self.cache(func)
        return target(*args, **kwargs)

    # ------------------------------------------------------------------------
    # Private `object` interface
    # ------------------------------------------------------------------------

    def __repr__(self):
        backend = self.store_backend
        shown_location = None if backend is None else backend.location
        return "{class_name}(location={location})".format(
            class_name=self.__class__.__name__, location=shown_location
        )

    def __getstate__(self):
        """Return the pickling state with the timestamp blanked out.

        The timestamp is not stored so that the hash does not depend on it.
        """
        return {**self.__dict__, "timestamp": None}

1216 

1217 

1218############################################################################### 

1219# cache_validation_callback helpers 

1220############################################################################### 

1221 

1222 

def expires_after(
    days=0, seconds=0, microseconds=0, milliseconds=0, minutes=0, hours=0, weeks=0
):
    """Build a cache_validation_callback that forces recompute after a duration.

    Parameters
    ----------
    days, seconds, microseconds, milliseconds, minutes, hours, weeks: numbers
        Arguments forwarded to ``datetime.timedelta`` to define the maximum
        age of a cached result.

    Returns
    -------
    cache_validation_callback: callable
        Takes the metadata of a cached result and returns True while the
        time elapsed since ``metadata["time"]`` is strictly below the
        configured duration.
    """
    max_age_seconds = datetime.timedelta(
        weeks=weeks,
        days=days,
        hours=hours,
        minutes=minutes,
        seconds=seconds,
        milliseconds=milliseconds,
        microseconds=microseconds,
    ).total_seconds()

    def cache_validation_callback(metadata):
        return time.time() - metadata["time"] < max_age_seconds

    return cache_validation_callback