Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/joblib/memory.py: 20% (415 statements)
coverage.py v7.3.2, created at 2023-12-12 06:31 +0000

1""" 

2A context object for caching a function's return value each time it 

3is called with the same input arguments. 

4 

5""" 

6 

7# Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org> 

8# Copyright (c) 2009 Gael Varoquaux 

9# License: BSD Style, 3 clauses. 

10 

11 

12from __future__ import with_statement 

13import logging 

14import os 

15from textwrap import dedent 

16import time 

17import pathlib 

18import pydoc 

19import re 

20import functools 

21import traceback 

22import warnings 

23import inspect 

24import weakref 

25from datetime import timedelta 

26 

27from tokenize import open as open_py_source 

28 

29# Local imports 

30from . import hashing 

31from .func_inspect import get_func_code, get_func_name, filter_args 

32from .func_inspect import format_call 

33from .func_inspect import format_signature 

34from .logger import Logger, format_time, pformat 

35from ._store_backends import StoreBackendBase, FileSystemStoreBackend 

36from ._store_backends import CacheWarning # noqa 

37 

38 

39FIRST_LINE_TEXT = "# first line:" 

40 

41# TODO: The following object should have a data store object as a sub 

42# object, and the interface to persist and query should be separated in 

43# the data store. 

44# 

45# This would enable creating 'Memory' objects with a different logic for 

46# pickling that would simply span a MemorizedFunc with the same 

47# store (or do we want to copy it to avoid cross-talks?), for instance to 

48# implement HDF5 pickling. 

49 

50# TODO: Same remark for the logger, and probably use the Python logging 

51# mechanism. 

52 

53 

def extract_first_line(func_code):
    """ Extract the first line information from the function code
        text if available.
    """
    if func_code.startswith(FIRST_LINE_TEXT):
        func_code = func_code.split('\n')
        first_line = int(func_code[0][len(FIRST_LINE_TEXT):])
        func_code = '\n'.join(func_code[1:])
    else:
        first_line = -1
    return func_code, first_line


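# Illustrative sketch (not part of joblib): how extract_first_line splits the
# "# first line:" header that _write_func_code prepends to the stored source.
def _example_extract_first_line():
    stored = "# first line: 42\ndef f(x):\n    return x\n"
    body, first_line = extract_first_line(stored)
    assert first_line == 42
    assert body == "def f(x):\n    return x\n"

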

class JobLibCollisionWarning(UserWarning):
    """ Warn that there might be a collision between names of functions.
    """


_STORE_BACKENDS = {'local': FileSystemStoreBackend}



def register_store_backend(backend_name, backend):
    """Extend available store backends.

    The Memory, MemorizedResult and MemorizedFunc objects are designed to be
    agnostic to the type of store used underneath. By default, the local file
    system is used but this function gives the possibility to extend joblib's
    memory pattern with other types of storage such as cloud storage (S3, GCS,
    OpenStack, HadoopFS, etc) or blob DBs.

    Parameters
    ----------
    backend_name: str
        The name identifying the store backend being registered. For example,
        'local' is used with FileSystemStoreBackend.
    backend: StoreBackendBase subclass
        A class that implements the StoreBackendBase interface.

    """
    if not isinstance(backend_name, str):
        raise ValueError("Store backend name should be a string, "
                         "'{0}' given.".format(backend_name))
    if backend is None or not issubclass(backend, StoreBackendBase):
        raise ValueError("Store backend should inherit "
                         "StoreBackendBase, "
                         "'{0}' given.".format(backend))

    _STORE_BACKENDS[backend_name] = backend


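# Illustrative sketch (not part of joblib): registering a custom store
# backend.  The QuietFileSystemBackend subclass is hypothetical; it only
# tweaks the verbosity of the bundled FileSystemStoreBackend to show the
# registration hook.
def _example_register_custom_backend():
    class QuietFileSystemBackend(FileSystemStoreBackend):
        """Hypothetical backend identical to 'local', but always silent."""

        def configure(self, location, verbose=0, backend_options=None):
            # Ignore the verbosity requested by the Memory object.
            super().configure(location, verbose=0,
                              backend_options=backend_options)

    register_store_backend('quiet_local', QuietFileSystemBackend)
    # The new name can then be passed as ``backend=`` to Memory, e.g.
    # Memory('/tmp/joblib_example', backend='quiet_local').

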

def _store_backend_factory(backend, location, verbose=0, backend_options=None):
    """Return the correct store object for the given location."""
    if backend_options is None:
        backend_options = {}

    if isinstance(location, pathlib.Path):
        location = str(location)

    if isinstance(location, StoreBackendBase):
        return location
    elif isinstance(location, str):
        obj = None
        location = os.path.expanduser(location)
        # The location is given as a string: look in the registered backends
        # for one matching the given backend name.
        for backend_key, backend_obj in _STORE_BACKENDS.items():
            if backend == backend_key:
                obj = backend_obj()

        # If no matching backend could be found, fail explicitly instead of
        # silently falling back to the FileSystemStoreBackend.
        if obj is None:
            raise TypeError('Unknown location {0} or backend {1}'.format(
                            location, backend))

        # The store backend is configured with the extra named parameters,
        # some of them being specific to the underlying store backend.
        obj.configure(location, verbose=verbose,
                      backend_options=backend_options)
        return obj
    elif location is not None:
        warnings.warn(
            "Instantiating a backend using a {} as a location is not "
            "supported by joblib. Returning None instead.".format(
                location.__class__.__name__), UserWarning)

    return None


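# Illustrative sketch (not part of joblib): _store_backend_factory is the
# internal dispatcher used by Memory and MemorizedFunc.  A plain string
# location resolves to the registered 'local' backend; the path below is a
# hypothetical example and gets created on disk as a side effect.
def _example_store_backend_factory():
    store = _store_backend_factory('local', '/tmp/joblib_example')
    assert isinstance(store, FileSystemStoreBackend)

    # A location of None (caching disabled) simply yields None.
    assert _store_backend_factory('local', None) is None

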

def _get_func_fullname(func):
    """Compute the path part associated with a function."""
    modules, funcname = get_func_name(func)
    modules.append(funcname)
    return os.path.join(*modules)



def _build_func_identifier(func):
    """Build a roughly unique identifier for the cached function."""
    parts = []
    if isinstance(func, str):
        parts.append(func)
    else:
        parts.append(_get_func_fullname(func))

    # We reuse the historical fs-like way of building a function identifier
    return os.path.join(*parts)



def _format_load_msg(func_id, args_id, timestamp=None, metadata=None):
    """ Helper function to format the message when loading the results.
    """
    signature = ""
    try:
        if metadata is not None:
            args = ", ".join(['%s=%s' % (name, value)
                              for name, value
                              in metadata['input_args'].items()])
            signature = "%s(%s)" % (os.path.basename(func_id), args)
        else:
            signature = os.path.basename(func_id)
    except KeyError:
        pass

    if timestamp is not None:
        ts_string = "{0: <16}".format(format_time(time.time() - timestamp))
    else:
        ts_string = ""
    return '[Memory]{0}: Loading {1}'.format(ts_string, str(signature))



# An in-memory store to avoid looking at the disk-based function
# source code to check if a function definition has changed
_FUNCTION_HASHES = weakref.WeakKeyDictionary()


###############################################################################
# class `MemorizedResult`
###############################################################################

class MemorizedResult(Logger):
    """Object representing a cached value.

    Attributes
    ----------
    location: str
        The location of joblib cache. Depends on the store backend used.

    func: function or str
        function whose output is cached. The string case is intended only for
        instantiation based on the output of repr() on another instance.
        (namely eval(repr(memorized_instance)) works).

    argument_hash: str
        hash of the function arguments.

    backend: str
        Type of store backend for reading/writing cache files.
        Default is 'local'.

    mmap_mode: {None, 'r+', 'r', 'w+', 'c'}
        The memmapping mode used when loading numpy arrays from the cache.
        See numpy.load for the meaning of the different values.

    verbose: int
        verbosity level (0 means no message).

    timestamp, metadata: string
        for internal use only.
    """
    def __init__(self, location, func, args_id, backend='local',
                 mmap_mode=None, verbose=0, timestamp=None, metadata=None):
        Logger.__init__(self)
        self.func_id = _build_func_identifier(func)
        if isinstance(func, str):
            self.func = func
        else:
            self.func = self.func_id
        self.args_id = args_id
        self.store_backend = _store_backend_factory(backend, location,
                                                    verbose=verbose)
        self.mmap_mode = mmap_mode

        if metadata is not None:
            self.metadata = metadata
        else:
            self.metadata = self.store_backend.get_metadata(
                [self.func_id, self.args_id])

        self.duration = self.metadata.get('duration', None)
        self.verbose = verbose
        self.timestamp = timestamp

    @property
    def argument_hash(self):
        warnings.warn(
            "The 'argument_hash' attribute has been deprecated in version "
            "0.12 and will be removed in version 0.14.\n"
            "Use `args_id` attribute instead.",
            DeprecationWarning, stacklevel=2)
        return self.args_id

    def get(self):
        """Read value from cache and return it."""
        if self.verbose:
            msg = _format_load_msg(self.func_id, self.args_id,
                                   timestamp=self.timestamp,
                                   metadata=self.metadata)
        else:
            msg = None

        try:
            return self.store_backend.load_item(
                [self.func_id, self.args_id], msg=msg, verbose=self.verbose)
        except ValueError as exc:
            new_exc = KeyError(
                "Error while trying to load a MemorizedResult's value. "
                "It seems that this folder is corrupted: {}".format(
                    os.path.join(
                        self.store_backend.location, self.func_id,
                        self.args_id)
                ))
            raise new_exc from exc

    def clear(self):
        """Clear value from cache"""
        self.store_backend.clear_item([self.func_id, self.args_id])

    def __repr__(self):
        return ('{class_name}(location="{location}", func="{func}", '
                'args_id="{args_id}")'
                .format(class_name=self.__class__.__name__,
                        location=self.store_backend.location,
                        func=self.func,
                        args_id=self.args_id
                        ))

    def __getstate__(self):
        state = self.__dict__.copy()
        state['timestamp'] = None
        return state



class NotMemorizedResult(object):
    """Class representing an arbitrary value.

    This class is a replacement for MemorizedResult when there is no cache.
    """
    __slots__ = ('value', 'valid')

    def __init__(self, value):
        self.value = value
        self.valid = True

    def get(self):
        if self.valid:
            return self.value
        else:
            raise KeyError("No value stored.")

    def clear(self):
        self.valid = False
        self.value = None

    def __repr__(self):
        if self.valid:
            return ('{class_name}({value})'
                    .format(class_name=self.__class__.__name__,
                            value=pformat(self.value)))
        else:
            return self.__class__.__name__ + ' with no value'

    # __getstate__ and __setstate__ are required because of __slots__
    def __getstate__(self):
        return {"valid": self.valid, "value": self.value}

    def __setstate__(self, state):
        self.valid = state["valid"]
        self.value = state["value"]



###############################################################################
# class `NotMemorizedFunc`
###############################################################################
class NotMemorizedFunc(object):
    """No-op object decorating a function.

    This class replaces MemorizedFunc when there is no cache. It provides an
    identical API but does not write anything on disk.

    Attributes
    ----------
    func: callable
        Original undecorated function.
    """
    # Should be as light as possible (for speed)
    def __init__(self, func):
        self.func = func

    def __call__(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def call_and_shelve(self, *args, **kwargs):
        return NotMemorizedResult(self.func(*args, **kwargs))

    def __repr__(self):
        return '{0}(func={1})'.format(self.__class__.__name__, self.func)

    def clear(self, warn=True):
        # Argument "warn" is for compatibility with MemorizedFunc.clear
        pass

    def call(self, *args, **kwargs):
        return self.func(*args, **kwargs)

    def check_call_in_cache(self, *args, **kwargs):
        return False


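# Illustrative sketch (not part of joblib): with location=None, Memory is
# fully transparent and hands out the no-op NotMemorized* classes defined
# above.
def _example_no_cache():
    from joblib import Memory

    memory = Memory(location=None)
    cached = memory.cache(lambda x: x + 1)
    assert isinstance(cached, NotMemorizedFunc)

    handle = cached.call_and_shelve(1)
    assert isinstance(handle, NotMemorizedResult)
    assert handle.get() == 2

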

###############################################################################
# class `MemorizedFunc`
###############################################################################
class MemorizedFunc(Logger):
    """Callable object decorating a function for caching its return value
    each time it is called.

    Methods are provided to inspect the cache or clean it.

    Attributes
    ----------
    func: callable
        The original, undecorated, function.

    location: string
        The location of joblib cache. Depends on the store backend used.

    backend: str
        Type of store backend for reading/writing cache files.
        Default is 'local', in which case the location is the path to a
        disk storage.

    ignore: list or None
        List of variable names to ignore when choosing whether to
        recompute.

    mmap_mode: {None, 'r+', 'r', 'w+', 'c'}
        The memmapping mode used when loading numpy arrays from the
        cache. See numpy.load for the meaning of the different
        values.

    compress: boolean, or integer
        Whether to zip the stored data on disk. If an integer is
        given, it should be between 1 and 9, and sets the amount
        of compression. Note that compressed arrays cannot be
        read by memmapping.

    verbose: int, optional
        The verbosity flag, controls messages that are issued as
        the function is evaluated.

    cache_validation_callback: callable, optional
        Callable to check if a result in cache is valid or is to be recomputed.
        When the function is called with arguments for which a cache exists,
        the callback is called with the cache entry's metadata as its sole
        argument. If it returns True, the cached result is returned, else the
        cache for these arguments is cleared and the result is recomputed.
    """
    # ------------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------------

    def __init__(self, func, location, backend='local', ignore=None,
                 mmap_mode=None, compress=False, verbose=1, timestamp=None,
                 cache_validation_callback=None):
        Logger.__init__(self)
        self.mmap_mode = mmap_mode
        self.compress = compress
        self.func = func
        self.cache_validation_callback = cache_validation_callback

        if ignore is None:
            ignore = []
        self.ignore = ignore
        self._verbose = verbose

        # Retrieve the store object from the backend type and location.
        self.store_backend = _store_backend_factory(backend, location,
                                                    verbose=verbose,
                                                    backend_options=dict(
                                                        compress=compress,
                                                        mmap_mode=mmap_mode),
                                                    )
        if self.store_backend is not None:
            # Create the func directory on demand.
            self.store_backend.store_cached_func_code([
                _build_func_identifier(self.func)
            ])

        if timestamp is None:
            timestamp = time.time()
        self.timestamp = timestamp
        try:
            functools.update_wrapper(self, func)
        except Exception:
            # Objects like ufunc don't like functools.update_wrapper.
            pass
        if inspect.isfunction(func):
            doc = pydoc.TextDoc().document(func)
            # Remove the blank line
            doc = doc.replace('\n', '\n\n', 1)
            # Strip backspace-overprints for compatibility with autodoc
            doc = re.sub('\x08.', '', doc)
        else:
            # Pydoc does a poor job on other objects
            doc = func.__doc__
        self.__doc__ = 'Memoized version of %s' % doc

        self._func_code_info = None
        self._func_code_id = None


    def _is_in_cache_and_valid(self, path):
        """Check if the function call is cached and valid for given arguments.

        - Compare the function code with the one from the cached function,
          to detect if it has changed.
        - Check if the function call is present in the cache.
        - Call `cache_validation_callback` for user-defined cache validation.

        Returns True if the function call is in cache and can be used, and
        returns False otherwise.
        """
        # Check if the code of the function has changed
        if not self._check_previous_func_code(stacklevel=4):
            return False

        # Check if this specific call is in the cache
        if not self.store_backend.contains_item(path):
            return False

        # Call the user-defined cache validation callback
        metadata = self.store_backend.get_metadata(path)
        if (self.cache_validation_callback is not None and
                not self.cache_validation_callback(metadata)):
            self.store_backend.clear_item(path)
            return False

        return True


    def _cached_call(self, args, kwargs, shelving=False):
        """Call wrapped function and cache result, or read cache if available.

        This function returns the wrapped function output and some metadata.

        Parameters
        ----------
        args, kwargs: list and dict
            input arguments for wrapped function

        shelving: bool
            True when called via the call_and_shelve function.

        Returns
        -------
        output: value or tuple or None
            Output of the wrapped function.
            If shelving is True and the call has been already cached,
            output is None.

        argument_hash: string
            Hash of function arguments.

        metadata: dict
            Some metadata about wrapped function call (see _persist_input()).
        """
        func_id, args_id = self._get_output_identifiers(*args, **kwargs)
        metadata = None
        msg = None

        # Whether or not the memorized function must be called
        must_call = False

        if self._verbose >= 20:
            logging.basicConfig(level=logging.INFO)
            _, name = get_func_name(self.func)
            location = self.store_backend.get_cached_func_info([func_id])[
                'location']
            _, signature = format_signature(self.func, *args, **kwargs)

            self.info(
                dedent(
                    f"""
                    Querying {name} with signature
                    {signature}.

                    (argument hash {args_id})

                    The store location is {location}.
                    """
                )
            )

        # Compare the function code with the previous one to see if the
        # function code has changed, and check if the results are present in
        # the cache.
        if self._is_in_cache_and_valid([func_id, args_id]):
            try:
                t0 = time.time()
                if self._verbose:
                    msg = _format_load_msg(func_id, args_id,
                                           timestamp=self.timestamp,
                                           metadata=metadata)

                if not shelving:
                    # When shelving, we do not need to load the output
                    out = self.store_backend.load_item(
                        [func_id, args_id],
                        msg=msg,
                        verbose=self._verbose)
                else:
                    out = None

                if self._verbose > 4:
                    t = time.time() - t0
                    _, name = get_func_name(self.func)
                    msg = '%s cache loaded - %s' % (name, format_time(t))
                    print(max(0, (80 - len(msg))) * '_' + msg)
            except Exception:
                # XXX: Should use an exception logger
                _, signature = format_signature(self.func, *args, **kwargs)
                self.warn('Exception while loading results for '
                          '{}\n {}'.format(signature, traceback.format_exc()))

                must_call = True
        else:
            if self._verbose > 10:
                _, name = get_func_name(self.func)
                self.warn('Computing func {0}, argument hash {1} '
                          'in location {2}'
                          .format(name, args_id,
                                  self.store_backend.
                                  get_cached_func_info([func_id])['location']))
            must_call = True

        if must_call:
            out, metadata = self.call(*args, **kwargs)
            if self.mmap_mode is not None:
                # Memmap the output at the first call to be consistent with
                # later calls
                if self._verbose:
                    msg = _format_load_msg(func_id, args_id,
                                           timestamp=self.timestamp,
                                           metadata=metadata)
                out = self.store_backend.load_item([func_id, args_id], msg=msg,
                                                   verbose=self._verbose)

        return (out, args_id, metadata)


    @property
    def func_code_info(self):
        # 3-tuple property containing: the function source code, source file,
        # and first line of the code inside the source file
        if hasattr(self.func, '__code__'):
            if self._func_code_id is None:
                self._func_code_id = id(self.func.__code__)
            elif id(self.func.__code__) != self._func_code_id:
                # Be robust to dynamic reassignments of self.func.__code__
                self._func_code_info = None

        if self._func_code_info is None:
            # Cache the source code of self.func. Provided that get_func_code
            # (which should be called once on self) gets called in the process
            # in which self.func was defined, this caching mechanism prevents
            # undesired cache clearing when the cached function is called in
            # an environment where the introspection utilities that
            # get_func_code relies on do not work (typically, in joblib child
            # processes). See #1035 for more info
            # TODO (pierreglaser): do the same with get_func_name?
            self._func_code_info = get_func_code(self.func)
        return self._func_code_info


    def call_and_shelve(self, *args, **kwargs):
        """Call wrapped function, cache result and return a reference.

        This method returns a reference to the cached result instead of the
        result itself. The reference object is small and picklable, making it
        easy to send or store. Call .get() on the reference object to get the
        result.

        Returns
        -------
        cached_result: MemorizedResult or NotMemorizedResult
            reference to the value returned by the wrapped function. The
            class "NotMemorizedResult" is used when there is no cache
            activated (e.g. location=None in Memory).
        """
        _, args_id, metadata = self._cached_call(args, kwargs, shelving=True)
        return MemorizedResult(self.store_backend, self.func, args_id,
                               metadata=metadata, verbose=self._verbose - 1,
                               timestamp=self.timestamp)


    def __call__(self, *args, **kwargs):
        return self._cached_call(args, kwargs)[0]

    def __getstate__(self):
        # Make sure self.func's source is introspected prior to being pickled:
        # code introspection utilities typically do not work inside child
        # processes
        _ = self.func_code_info

        # We don't store the timestamp when pickling, to avoid the hash
        # depending on it.
        state = self.__dict__.copy()
        state['timestamp'] = None

        # Invalidate the code id as id(obj) will be different in the child
        state['_func_code_id'] = None

        return state


    def check_call_in_cache(self, *args, **kwargs):
        """Check if function call is in the memory cache.

        Does not call the function or do any work besides func inspection
        and arg hashing.

        Returns
        -------
        is_call_in_cache: bool
            Whether or not the result of the function has been cached
            for the input arguments that have been passed.
        """
        func_id, args_id = self._get_output_identifiers(*args, **kwargs)
        return self.store_backend.contains_item((func_id, args_id))


    # ------------------------------------------------------------------------
    # Private interface
    # ------------------------------------------------------------------------

    def _get_argument_hash(self, *args, **kwargs):
        return hashing.hash(filter_args(self.func, self.ignore, args, kwargs),
                            coerce_mmap=(self.mmap_mode is not None))

    def _get_output_identifiers(self, *args, **kwargs):
        """Return the func identifier and input parameter hash of a result."""
        func_id = _build_func_identifier(self.func)
        argument_hash = self._get_argument_hash(*args, **kwargs)
        return func_id, argument_hash

    def _hash_func(self):
        """Hash a function to key the online cache"""
        func_code_h = hash(getattr(self.func, '__code__', None))
        return id(self.func), hash(self.func), func_code_h


    def _write_func_code(self, func_code, first_line):
        """ Write the function code and the filename to a file.
        """
        # We store the first line because the filename and the function
        # name are not always enough to identify a function: people
        # sometimes have several functions named the same way in a
        # file. This is bad practice, but joblib should be robust to bad
        # practice.
        func_id = _build_func_identifier(self.func)
        func_code = u'%s %i\n%s' % (FIRST_LINE_TEXT, first_line, func_code)
        self.store_backend.store_cached_func_code([func_id], func_code)

        # Also store in the in-memory store of function hashes
        is_named_callable = (hasattr(self.func, '__name__') and
                             self.func.__name__ != '<lambda>')
        if is_named_callable:
            # Don't do this for lambda functions or strange callable
            # objects, as it ends up being too fragile
            func_hash = self._hash_func()
            try:
                _FUNCTION_HASHES[self.func] = func_hash
            except TypeError:
                # Some callables are not hashable
                pass


    def _check_previous_func_code(self, stacklevel=2):
        """
        stacklevel is the depth at which this function is called, to
        issue useful warnings to the user.
        """
        # First check if our function is in the in-memory store.
        # Using the in-memory store not only makes things faster, but it
        # also renders us robust to variations of the files when the
        # in-memory version of the code does not vary
        try:
            if self.func in _FUNCTION_HASHES:
                # We use as an identifier the id of the function and its
                # hash. This is more likely to falsely change than have hash
                # collisions, thus we are on the safe side.
                func_hash = self._hash_func()
                if func_hash == _FUNCTION_HASHES[self.func]:
                    return True
        except TypeError:
            # Some callables are not hashable
            pass

        # Here, we go through some effort to be robust to dynamically
        # changing code and collisions. We cannot use inspect.getsource
        # because it is not reliable when using IPython's magic "%run".
        func_code, source_file, first_line = self.func_code_info
        func_id = _build_func_identifier(self.func)

        try:
            old_func_code, old_first_line =\
                extract_first_line(
                    self.store_backend.get_cached_func_code([func_id]))
        except (IOError, OSError):  # some backends can also raise OSError
            self._write_func_code(func_code, first_line)
            return False
        if old_func_code == func_code:
            return True

        # We have differing code: is this because we are referring to
        # different functions, or because the function we are referring to has
        # changed?

        _, func_name = get_func_name(self.func, resolv_alias=False,
                                     win_characters=False)
        if old_first_line == first_line == -1 or func_name == '<lambda>':
            if not first_line == -1:
                func_description = ("{0} ({1}:{2})"
                                    .format(func_name, source_file,
                                            first_line))
            else:
                func_description = func_name
            warnings.warn(JobLibCollisionWarning(
                "Cannot detect name collisions for function '{0}'"
                .format(func_description)), stacklevel=stacklevel)

        # Fetch the code at the old location and compare it. If it is the
        # same as the stored code, we have a collision: the code in the
        # file has not changed, but the name we have is pointing to a new
        # code block.
        if not old_first_line == first_line and source_file is not None:
            possible_collision = False
            if os.path.exists(source_file):
                _, func_name = get_func_name(self.func, resolv_alias=False)
                num_lines = len(func_code.split('\n'))
                with open_py_source(source_file) as f:
                    on_disk_func_code = f.readlines()[
                        old_first_line - 1:old_first_line - 1 + num_lines - 1]
                on_disk_func_code = ''.join(on_disk_func_code)
                possible_collision = (on_disk_func_code.rstrip() ==
                                      old_func_code.rstrip())
            else:
                possible_collision = source_file.startswith('<doctest ')
            if possible_collision:
                warnings.warn(JobLibCollisionWarning(
                    'Possible name collisions between functions '
                    "'%s' (%s:%i) and '%s' (%s:%i)" %
                    (func_name, source_file, old_first_line,
                     func_name, source_file, first_line)),
                    stacklevel=stacklevel)

        # The function has changed, wipe the cache directory.
        # XXX: Should be using warnings, and giving stacklevel
        if self._verbose > 10:
            _, func_name = get_func_name(self.func, resolv_alias=False)
            self.warn("Function {0} (identified by {1}) has changed"
                      ".".format(func_name, func_id))
        self.clear(warn=True)
        return False


    def clear(self, warn=True):
        """Empty the function's cache."""
        func_id = _build_func_identifier(self.func)

        if self._verbose > 0 and warn:
            self.warn("Clearing function cache identified by %s" % func_id)
        self.store_backend.clear_path([func_id, ])

        func_code, _, first_line = self.func_code_info
        self._write_func_code(func_code, first_line)


    def call(self, *args, **kwargs):
        """Force the execution of the function with the given arguments.

        The output values will be persisted, i.e., the cache will be updated
        with any new values.

        Parameters
        ----------
        *args: arguments
            The arguments.
        **kwargs: keyword arguments
            Keyword arguments.

        Returns
        -------
        output : object
            The output of the function call.
        metadata : dict
            The metadata associated with the call.
        """
        start_time = time.time()
        func_id, args_id = self._get_output_identifiers(*args, **kwargs)
        if self._verbose > 0:
            print(format_call(self.func, args, kwargs))
        output = self.func(*args, **kwargs)
        self.store_backend.dump_item(
            [func_id, args_id], output, verbose=self._verbose)

        duration = time.time() - start_time
        metadata = self._persist_input(duration, args, kwargs)

        if self._verbose > 0:
            _, name = get_func_name(self.func)
            msg = '%s - %s' % (name, format_time(duration))
            print(max(0, (80 - len(msg))) * '_' + msg)
        return output, metadata


    def _persist_input(self, duration, args, kwargs, this_duration_limit=0.5):
        """ Save a small summary of the call using json format in the
            output directory.

            duration: float
                time taken by hashing input arguments, calling the wrapped
                function and persisting its output.

            args, kwargs: list and dict
                input arguments for wrapped function

            this_duration_limit: float
                Max execution time for this function before issuing a warning.
        """
        start_time = time.time()
        argument_dict = filter_args(self.func, self.ignore,
                                    args, kwargs)

        input_repr = dict((k, repr(v)) for k, v in argument_dict.items())
        # This can fail due to race-conditions with multiple
        # concurrent joblibs removing the file or the directory
        metadata = {
            "duration": duration, "input_args": input_repr, "time": start_time,
        }

        func_id, args_id = self._get_output_identifiers(*args, **kwargs)
        self.store_backend.store_metadata([func_id, args_id], metadata)

        this_duration = time.time() - start_time
        if this_duration > this_duration_limit:
            # This persistence should be fast. It will not be if repr() takes
            # time and its output is large, because json.dump will have to
            # write a large file. This should not be an issue with numpy
            # arrays, for which repr() always outputs a short representation,
            # but it can be with complex dictionaries. Fixing the problem
            # should be a matter of replacing repr() above by something
            # smarter.
            warnings.warn("Persisting input arguments took %.2fs to run. "
                          "If this happens often in your code, it can cause "
                          "performance problems "
                          "(results will be correct in all cases). "
                          "The reason for this is probably some large input "
                          "arguments for a wrapped function."
                          % this_duration, stacklevel=5)
        return metadata


916 

    # ------------------------------------------------------------------------
    # Private `object` interface
    # ------------------------------------------------------------------------

    def __repr__(self):
        return '{class_name}(func={func}, location={location})'.format(
            class_name=self.__class__.__name__,
            func=self.func,
            location=self.store_backend.location,)


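# Illustrative sketch (not part of joblib): functions decorated through
# Memory.cache are MemorizedFunc instances, so they expose call_and_shelve
# and check_call_in_cache.  The cache directory is a hypothetical example
# path.
def _example_memorized_func_usage():
    from joblib import Memory

    memory = Memory('/tmp/joblib_example', verbose=0)
    memory.clear(warn=False)                     # start from an empty cache

    @memory.cache
    def square(x):
        return x ** 2

    assert isinstance(square, MemorizedFunc)
    assert not square.check_call_in_cache(4)     # nothing computed yet

    handle = square.call_and_shelve(4)           # compute and persist
    assert handle.get() == 16                    # small, picklable reference
    assert square.check_call_in_cache(4)         # now present in the store

    handle.clear()                               # drop this single entry

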

###############################################################################
# class `Memory`
###############################################################################
class Memory(Logger):
    """ A context object for caching a function's return value each time it
    is called with the same input arguments.

    All values are cached on the filesystem, in a deep directory
    structure.

    Read more in the :ref:`User Guide <memory>`.

    Parameters
    ----------
    location: str, pathlib.Path or None
        The path of the base directory to use as a data store
        or None. If None is given, no caching is done and
        the Memory object is completely transparent. This option
        replaces cachedir since version 0.12.

    backend: str, optional
        Type of store backend for reading/writing cache files.
        Default: 'local'.
        The 'local' backend uses regular filesystem operations to
        manipulate data (open, mv, etc) in the backend.

    mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional
        The memmapping mode used when loading numpy arrays from the
        cache. See numpy.load for the meaning of the arguments.

    compress: boolean, or integer, optional
        Whether to zip the stored data on disk. If an integer is
        given, it should be between 1 and 9, and sets the amount
        of compression. Note that compressed arrays cannot be
        read by memmapping.

    verbose: int, optional
        Verbosity flag, controls the debug messages that are issued
        as functions are evaluated.

    bytes_limit: int | str, optional
        Limit in bytes of the size of the cache. By default, the size of
        the cache is unlimited. When reducing the size of the cache,
        ``joblib`` keeps the most recently accessed items first. If a
        str is passed, it is converted to a number of bytes using units
        { K | M | G} for kilo, mega, giga.

        **Note:** You need to call :meth:`joblib.Memory.reduce_size` to
        actually reduce the cache size to be less than ``bytes_limit``.

        **Note:** This argument has been deprecated. One should give the
        value of ``bytes_limit`` directly in
        :meth:`joblib.Memory.reduce_size`.

    backend_options: dict, optional
        Contains a dictionary of named parameters used to configure
        the store backend.
    """
    # ------------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------------

    def __init__(self, location=None, backend='local',
                 mmap_mode=None, compress=False, verbose=1, bytes_limit=None,
                 backend_options=None):
        Logger.__init__(self)
        self._verbose = verbose
        self.mmap_mode = mmap_mode
        self.timestamp = time.time()
        if bytes_limit is not None:
            warnings.warn(
                "bytes_limit argument has been deprecated. It will be removed "
                "in version 1.5. Please pass its value directly to "
                "Memory.reduce_size.",
                category=DeprecationWarning
            )
        self.bytes_limit = bytes_limit
        self.backend = backend
        self.compress = compress
        if backend_options is None:
            backend_options = {}
        self.backend_options = backend_options

        if compress and mmap_mode is not None:
            warnings.warn('Compressed results cannot be memmapped',
                          stacklevel=2)

        self.location = location
        if isinstance(location, str):
            location = os.path.join(location, 'joblib')

        self.store_backend = _store_backend_factory(
            backend, location, verbose=self._verbose,
            backend_options=dict(compress=compress, mmap_mode=mmap_mode,
                                 **backend_options))


    def cache(self, func=None, ignore=None, verbose=None, mmap_mode=False,
              cache_validation_callback=None):
        """ Decorates the given function func to only compute its return
            value for input arguments not cached on disk.

            Parameters
            ----------
            func: callable, optional
                The function to be decorated
            ignore: list of strings
                A list of argument names to ignore in the hashing
            verbose: integer, optional
                The verbosity mode of the function. By default that
                of the memory object is used.
            mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional
                The memmapping mode used when loading numpy arrays
                from the cache. See numpy.load for the meaning of the
                arguments. By default that of the memory object is used.
            cache_validation_callback: callable, optional
                Callable to validate whether or not the cache is valid. When
                the cached function is called with arguments for which a cache
                exists, this callable is called with the metadata of the cached
                result as its sole argument. If it returns True, then the
                cached result is returned, else the cache for these arguments
                is cleared and recomputed.

            Returns
            -------
            decorated_func: MemorizedFunc object
                The returned object is a MemorizedFunc object, that is
                callable (behaves like a function), but offers extra
                methods for cache lookup and management. See the
                documentation for :class:`joblib.memory.MemorizedFunc`.
        """
        if (cache_validation_callback is not None and
                not callable(cache_validation_callback)):
            raise ValueError(
                "cache_validation_callback needs to be callable. "
                f"Got {cache_validation_callback}."
            )
        if func is None:
            # Partial application, to be able to specify extra keyword
            # arguments in decorators
            return functools.partial(
                self.cache, ignore=ignore,
                mmap_mode=mmap_mode,
                verbose=verbose,
                cache_validation_callback=cache_validation_callback
            )
        if self.store_backend is None:
            return NotMemorizedFunc(func)
        if verbose is None:
            verbose = self._verbose
        if mmap_mode is False:
            mmap_mode = self.mmap_mode
        if isinstance(func, MemorizedFunc):
            func = func.func
        return MemorizedFunc(
            func, location=self.store_backend, backend=self.backend,
            ignore=ignore, mmap_mode=mmap_mode, compress=self.compress,
            verbose=verbose, timestamp=self.timestamp,
            cache_validation_callback=cache_validation_callback
        )


    def clear(self, warn=True):
        """ Erase the complete cache directory.
        """
        if warn:
            self.warn('Flushing completely the cache')
        if self.store_backend is not None:
            self.store_backend.clear()

        # As the cache is completely cleared, make sure the _FUNCTION_HASHES
        # cache is also reset. Otherwise, for a function still present in this
        # table, results cached after this clear would get cache misses
        # because the function code is not re-written.
        _FUNCTION_HASHES.clear()


    def reduce_size(self, bytes_limit=None, items_limit=None, age_limit=None):
        """Remove cache elements to make the cache fit its limits.

        The limitation can impose that the cache size fits in ``bytes_limit``,
        that the number of cache items is no more than ``items_limit``, and
        that all files in the cache are no older than ``age_limit``.

        Parameters
        ----------
        bytes_limit: int | str, optional
            Limit in bytes of the size of the cache. By default, the size of
            the cache is unlimited. When reducing the size of the cache,
            ``joblib`` keeps the most recently accessed items first. If a
            str is passed, it is converted to a number of bytes using units
            { K | M | G} for kilo, mega, giga.

        items_limit: int, optional
            Number of items to limit the cache to. By default, the number of
            items in the cache is unlimited. When reducing the size of the
            cache, ``joblib`` keeps the most recently accessed items first.

        age_limit: datetime.timedelta, optional
            Maximum age of items to limit the cache to. When reducing the size
            of the cache, any items last accessed more than the given length
            of time ago are deleted.
        """
        if bytes_limit is None:
            bytes_limit = self.bytes_limit

        if self.store_backend is None:
            # No cached results, this function does nothing.
            return

        if bytes_limit is None and items_limit is None and age_limit is None:
            # No limitation to impose, returning
            return

        # Defer the actual limit enforcement to the store backend.
        self.store_backend.enforce_store_limits(
            bytes_limit, items_limit, age_limit
        )


    def eval(self, func, *args, **kwargs):
        """ Eval function func with arguments `*args` and `**kwargs`,
            in the context of the memory.

            This method works similarly to the builtin `apply`, except
            that the function is called only if the cache is not
            up to date.

        """
        if self.store_backend is None:
            return func(*args, **kwargs)
        return self.cache(func)(*args, **kwargs)


    # ------------------------------------------------------------------------
    # Private `object` interface
    # ------------------------------------------------------------------------

    def __repr__(self):
        return '{class_name}(location={location})'.format(
            class_name=self.__class__.__name__,
            location=(None if self.store_backend is None
                      else self.store_backend.location))

    def __getstate__(self):
        """ We don't store the timestamp when pickling, to avoid the hash
            depending on it.
        """
        state = self.__dict__.copy()
        state['timestamp'] = None
        return state


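# Illustrative sketch (not part of joblib): the typical Memory workflow, plus
# an explicit reduce_size call.  The cache directory and the limits are
# hypothetical example values.
def _example_memory_usage():
    from datetime import timedelta

    from joblib import Memory

    memory = Memory('/tmp/joblib_example', verbose=0)

    @memory.cache(ignore=['verbose'])
    def expensive_sum(values, verbose=False):
        if verbose:
            print('summing...')
        return sum(values)

    data = list(range(1000))
    first = expensive_sum(data)                    # computed and persisted
    second = expensive_sum(data, verbose=True)     # cache hit: 'verbose' is
                                                   # ignored in the hash
    assert first == second

    # Trim the store: keep at most 100 MB / 50 items and drop anything
    # unused for more than a week.
    memory.reduce_size(bytes_limit='100M', items_limit=50,
                       age_limit=timedelta(weeks=1))

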

###############################################################################
# cache_validation_callback helpers
###############################################################################

def expires_after(days=0, seconds=0, microseconds=0, milliseconds=0, minutes=0,
                  hours=0, weeks=0):
    """Helper cache_validation_callback to force recompute after a duration.

    Parameters
    ----------
    days, seconds, microseconds, milliseconds, minutes, hours, weeks: numbers
        arguments passed to a timedelta.
    """
    delta = timedelta(
        days=days, seconds=seconds, microseconds=microseconds,
        milliseconds=milliseconds, minutes=minutes, hours=hours, weeks=weeks
    )

    def cache_validation_callback(metadata):
        computation_age = time.time() - metadata['time']
        return computation_age < delta.total_seconds()

    return cache_validation_callback
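

# Illustrative sketch (not part of joblib's memory.py): wiring expires_after
# into Memory.cache so cached results are recomputed once they are older than
# a day.  The cache directory is a hypothetical example path.
def _example_expires_after():
    import datetime

    from joblib import Memory

    memory = Memory('/tmp/joblib_example', verbose=0)

    @memory.cache(cache_validation_callback=expires_after(days=1))
    def daily_report():
        return datetime.datetime.now().isoformat()

    # Repeated calls within a day reuse the cached value; after that, the
    # callback returns False, the entry is cleared and the result recomputed.
    return daily_report()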