Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/numpy/lib/_datasource.py: 21%

175 statements  

« prev     ^ index     » next       coverage.py v7.4.4, created at 2024-04-09 06:12 +0000

1"""A file interface for handling local and remote data files. 

2 

3The goal of datasource is to abstract some of the file system operations 

4when dealing with data files so the researcher doesn't have to know all the 

5low-level details. Through datasource, a researcher can obtain and use a 

6file with one function call, regardless of location of the file. 

7 

8DataSource is meant to augment standard python libraries, not replace them. 

9It should work seamlessly with standard file IO operations and the os 

10module. 

11 

12DataSource files can originate locally or remotely: 

13 

14- local files : '/home/guido/src/local/data.txt' 

15- URLs (http, ftp, ...) : 'http://www.scipy.org/not/real/data.txt' 

16 

17DataSource files can also be compressed or uncompressed. Currently only 

18gzip, bz2 and xz are supported. 

19 

20Example:: 

21 

22 >>> # Create a DataSource, use os.curdir (default) for local storage. 

23 >>> from numpy import DataSource 

24 >>> ds = DataSource() 

25 >>> 

26 >>> # Open a remote file. 

27 >>> # DataSource downloads the file, stores it locally in: 

28 >>> # './www.google.com/index.html' 

29 >>> # opens the file and returns a file object. 

30 >>> fp = ds.open('http://www.google.com/') # doctest: +SKIP 

31 >>> 

32 >>> # Use the file as you normally would 

33 >>> fp.read() # doctest: +SKIP 

34 >>> fp.close() # doctest: +SKIP 

35 

36""" 

37import os 

38 

39from .._utils import set_module 

40 

41 

# Keep a handle on the builtin: this module defines its own ``open`` further
# down, which would otherwise shadow it for code that needs the real thing.
_open = open


def _check_mode(mode, encoding, newline):
    """Validate a file mode against the text-only arguments.

    A mode containing ``"t"`` is treated as text mode and must not also
    contain ``"b"``.  Any mode without ``"t"`` is treated as binary, in
    which case neither `encoding` nor `newline` may be given.

    Parameters
    ----------
    mode : str
        File open mode.
    encoding : str
        File encoding.
    newline : str
        Newline for text files.

    Raises
    ------
    ValueError
        If `mode` mixes ``"t"`` and ``"b"``, or if `encoding` or `newline`
        is supplied together with a mode that lacks ``"t"``.

    """
    is_text = "t" in mode

    if is_text and "b" in mode:
        raise ValueError("Invalid mode: %r" % (mode,))
    if is_text:
        # Text mode accepts any encoding/newline; nothing more to check.
        return

    if encoding is not None:
        raise ValueError("Argument 'encoding' not supported in binary mode")
    if newline is not None:
        raise ValueError("Argument 'newline' not supported in binary mode")

66 

67 

# Using a class instead of a module-level dictionary
# to reduce the initial 'import numpy' overhead by
# deferring the import of lzma, bz2 and gzip until needed

# TODO: .zip support, .tar support?
class _FileOpeners:
    """
    Container for different methods to open (un-)compressed files.

    `_FileOpeners` contains a dictionary that holds one method for each
    supported file format. Attribute lookup is implemented in such a way
    that an instance of `_FileOpeners` itself can be indexed with the keys
    of that dictionary. Currently uncompressed files as well as files
    compressed with ``gzip``, ``bz2`` or ``xz`` compression are supported.

    Notes
    -----
    `_file_openers`, an instance of `_FileOpeners`, is made available for
    use in the `_datasource` module.

    The compression modules are imported lazily, on the first call to
    `keys` or on the first item lookup; modules that cannot be imported
    are silently left out of the mapping.

    Examples
    --------
    >>> import gzip
    >>> np.lib._datasource._file_openers.keys()
    [None, '.bz2', '.gz', '.xz', '.lzma']
    >>> np.lib._datasource._file_openers['.gz'] is gzip.open
    True

    """

    def __init__(self):
        # Look the builtin up explicitly.  This module later binds its own
        # module-level ``open`` function, so the bare name ``open`` would
        # resolve to that function for any instance created after import.
        import builtins

        self._loaded = False
        self._file_openers = {None: builtins.open}

    def _load(self):
        """Import the optional compression modules once and register them."""
        if self._loaded:
            return

        try:
            import bz2
            self._file_openers[".bz2"] = bz2.open
        except ImportError:
            pass

        try:
            import gzip
            self._file_openers[".gz"] = gzip.open
        except ImportError:
            pass

        try:
            import lzma
            self._file_openers[".xz"] = lzma.open
            self._file_openers[".lzma"] = lzma.open
        except (ImportError, AttributeError):
            # There are incompatible backports of lzma that do not have the
            # lzma.open attribute, so catch that as well as ImportError.
            pass

        self._loaded = True

    def keys(self):
        """
        Return the keys of currently supported file openers.

        Parameters
        ----------
        None

        Returns
        -------
        keys : list
            The keys are None for uncompressed files and the file extension
            strings (i.e. ``'.gz'``, ``'.xz'``) for supported compression
            methods.

        """
        self._load()
        return list(self._file_openers.keys())

    def __getitem__(self, key):
        """Return the opener registered for `key`; raise KeyError if none."""
        self._load()
        return self._file_openers[key]

_file_openers = _FileOpeners()

153 

def open(path, mode='r', destpath=os.curdir, encoding=None, newline=None):
    """
    Open `path` with `mode` and return the file object.

    A throw-away `DataSource` rooted at `destpath` is created and its
    ``open`` method does the actual work, so URLs are downloaded into
    `destpath` and opened from there.

    Parameters
    ----------
    path : str or pathlib.Path
        Local file path or URL to open.
    mode : str, optional
        Mode to open `path`. Mode 'r' for reading, 'w' for writing, 'a' to
        append. Available modes depend on the type of object specified by
        path. Default is 'r'.
    destpath : str, optional
        Path to the directory where the source file gets downloaded to for
        use. If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.
    encoding : {None, str}, optional
        Open text file with given encoding. The default encoding will be
        what `open` uses.
    newline : {None, str}, optional
        Newline to use when reading text file.

    Returns
    -------
    out : file object
        The opened file.

    Notes
    -----
    Equivalent to ``DataSource(destpath).open(path, mode, ...)``.

    """
    return DataSource(destpath).open(
        path, mode, encoding=encoding, newline=newline
    )

193 

194 

195@set_module('numpy.lib.npyio') 

196class DataSource: 

197 """ 

198 DataSource(destpath='.') 

199 

200 A generic data source file (file, http, ftp, ...). 

201 

202 DataSources can be local files or remote files/URLs. The files may 

203 also be compressed or uncompressed. DataSource hides some of the 

204 low-level details of downloading the file, allowing you to simply pass 

205 in a valid file path (or URL) and obtain a file object. 

206 

207 Parameters 

208 ---------- 

209 destpath : str or None, optional 

210 Path to the directory where the source file gets downloaded to for 

211 use. If `destpath` is None, a temporary directory will be created. 

212 The default path is the current directory. 

213 

214 Notes 

215 ----- 

216 URLs require a scheme string (``http://``) to be used, without it they 

217 will fail:: 

218 

219 >>> repos = np.lib.npyio.DataSource() 

220 >>> repos.exists('www.google.com/index.html') 

221 False 

222 >>> repos.exists('http://www.google.com/index.html') 

223 True 

224 

225 Temporary directories are deleted when the DataSource is deleted. 

226 

227 Examples 

228 -------- 

229 :: 

230 

231 >>> ds = np.lib.npyio.DataSource('/home/guido') 

232 >>> urlname = 'http://www.google.com/' 

233 >>> gfile = ds.open('http://www.google.com/') 

234 >>> ds.abspath(urlname) 

235 '/home/guido/www.google.com/index.html' 

236 

237 >>> ds = np.lib.npyio.DataSource(None) # use with temporary file 

238 >>> ds.open('/home/guido/foobar.txt') 

239 <open file '/home/guido.foobar.txt', mode 'r' at 0x91d4430> 

240 >>> ds.abspath('/home/guido/foobar.txt') 

241 '/tmp/.../home/guido/foobar.txt' 

242 

243 """ 

244 

245 def __init__(self, destpath=os.curdir): 

246 """Create a DataSource with a local path at destpath.""" 

247 if destpath: 

248 self._destpath = os.path.abspath(destpath) 

249 self._istmpdest = False 

250 else: 

251 import tempfile # deferring import to improve startup time 

252 self._destpath = tempfile.mkdtemp() 

253 self._istmpdest = True 

254 

255 def __del__(self): 

256 # Remove temp directories 

257 if hasattr(self, '_istmpdest') and self._istmpdest: 

258 import shutil 

259 

260 shutil.rmtree(self._destpath) 

261 

262 def _iszip(self, filename): 

263 """Test if the filename is a zip file by looking at the file extension. 

264 

265 """ 

266 fname, ext = os.path.splitext(filename) 

267 return ext in _file_openers.keys() 

268 

269 def _iswritemode(self, mode): 

270 """Test if the given mode will open a file for writing.""" 

271 

272 # Currently only used to test the bz2 files. 

273 _writemodes = ("w", "+") 

274 for c in mode: 

275 if c in _writemodes: 

276 return True 

277 return False 

278 

279 def _splitzipext(self, filename): 

280 """Split zip extension from filename and return filename. 

281 

282 Returns 

283 ------- 

284 base, zip_ext : {tuple} 

285 

286 """ 

287 

288 if self._iszip(filename): 

289 return os.path.splitext(filename) 

290 else: 

291 return filename, None 

292 

293 def _possible_names(self, filename): 

294 """Return a tuple containing compressed filename variations.""" 

295 names = [filename] 

296 if not self._iszip(filename): 

297 for zipext in _file_openers.keys(): 

298 if zipext: 

299 names.append(filename+zipext) 

300 return names 

301 

302 def _isurl(self, path): 

303 """Test if path is a net location. Tests the scheme and netloc.""" 

304 

305 # We do this here to reduce the 'import numpy' initial import time. 

306 from urllib.parse import urlparse 

307 

308 # BUG : URLs require a scheme string ('http://') to be used. 

309 # www.google.com will fail. 

310 # Should we prepend the scheme for those that don't have it and 

311 # test that also? Similar to the way we append .gz and test for 

312 # for compressed versions of files. 

313 

314 scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path) 

315 return bool(scheme and netloc) 

316 

317 def _cache(self, path): 

318 """Cache the file specified by path. 

319 

320 Creates a copy of the file in the datasource cache. 

321 

322 """ 

323 # We import these here because importing them is slow and 

324 # a significant fraction of numpy's total import time. 

325 import shutil 

326 from urllib.request import urlopen 

327 

328 upath = self.abspath(path) 

329 

330 # ensure directory exists 

331 if not os.path.exists(os.path.dirname(upath)): 

332 os.makedirs(os.path.dirname(upath)) 

333 

334 # TODO: Doesn't handle compressed files! 

335 if self._isurl(path): 

336 with urlopen(path) as openedurl: 

337 with _open(upath, 'wb') as f: 

338 shutil.copyfileobj(openedurl, f) 

339 else: 

340 shutil.copyfile(path, upath) 

341 return upath 

342 

343 def _findfile(self, path): 

344 """Searches for ``path`` and returns full path if found. 

345 

346 If path is an URL, _findfile will cache a local copy and return the 

347 path to the cached file. If path is a local file, _findfile will 

348 return a path to that local file. 

349 

350 The search will include possible compressed versions of the file 

351 and return the first occurrence found. 

352 

353 """ 

354 

355 # Build list of possible local file paths 

356 if not self._isurl(path): 

357 # Valid local paths 

358 filelist = self._possible_names(path) 

359 # Paths in self._destpath 

360 filelist += self._possible_names(self.abspath(path)) 

361 else: 

362 # Cached URLs in self._destpath 

363 filelist = self._possible_names(self.abspath(path)) 

364 # Remote URLs 

365 filelist = filelist + self._possible_names(path) 

366 

367 for name in filelist: 

368 if self.exists(name): 

369 if self._isurl(name): 

370 name = self._cache(name) 

371 return name 

372 return None 

373 

374 def abspath(self, path): 

375 """ 

376 Return absolute path of file in the DataSource directory. 

377 

378 If `path` is an URL, then `abspath` will return either the location 

379 the file exists locally or the location it would exist when opened 

380 using the `open` method. 

381 

382 Parameters 

383 ---------- 

384 path : str or pathlib.Path 

385 Can be a local file or a remote URL. 

386 

387 Returns 

388 ------- 

389 out : str 

390 Complete path, including the `DataSource` destination directory. 

391 

392 Notes 

393 ----- 

394 The functionality is based on `os.path.abspath`. 

395 

396 """ 

397 # We do this here to reduce the 'import numpy' initial import time. 

398 from urllib.parse import urlparse 

399 

400 # TODO: This should be more robust. Handles case where path includes 

401 # the destpath, but not other sub-paths. Failing case: 

402 # path = /home/guido/datafile.txt 

403 # destpath = /home/alex/ 

404 # upath = self.abspath(path) 

405 # upath == '/home/alex/home/guido/datafile.txt' 

406 

407 # handle case where path includes self._destpath 

408 splitpath = path.split(self._destpath, 2) 

409 if len(splitpath) > 1: 

410 path = splitpath[1] 

411 scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path) 

412 netloc = self._sanitize_relative_path(netloc) 

413 upath = self._sanitize_relative_path(upath) 

414 return os.path.join(self._destpath, netloc, upath) 

415 

416 def _sanitize_relative_path(self, path): 

417 """Return a sanitised relative path for which 

418 os.path.abspath(os.path.join(base, path)).startswith(base) 

419 """ 

420 last = None 

421 path = os.path.normpath(path) 

422 while path != last: 

423 last = path 

424 # Note: os.path.join treats '/' as os.sep on Windows 

425 path = path.lstrip(os.sep).lstrip('/') 

426 path = path.lstrip(os.pardir).lstrip('..') 

427 drive, path = os.path.splitdrive(path) # for Windows 

428 return path 

429 

430 def exists(self, path): 

431 """ 

432 Test if path exists. 

433 

434 Test if `path` exists as (and in this order): 

435 

436 - a local file. 

437 - a remote URL that has been downloaded and stored locally in the 

438 `DataSource` directory. 

439 - a remote URL that has not been downloaded, but is valid and 

440 accessible. 

441 

442 Parameters 

443 ---------- 

444 path : str or pathlib.Path 

445 Can be a local file or a remote URL. 

446 

447 Returns 

448 ------- 

449 out : bool 

450 True if `path` exists. 

451 

452 Notes 

453 ----- 

454 When `path` is an URL, `exists` will return True if it's either 

455 stored locally in the `DataSource` directory, or is a valid remote 

456 URL. `DataSource` does not discriminate between the two, the file 

457 is accessible if it exists in either location. 

458 

459 """ 

460 

461 # First test for local path 

462 if os.path.exists(path): 

463 return True 

464 

465 # We import this here because importing urllib is slow and 

466 # a significant fraction of numpy's total import time. 

467 from urllib.request import urlopen 

468 from urllib.error import URLError 

469 

470 # Test cached url 

471 upath = self.abspath(path) 

472 if os.path.exists(upath): 

473 return True 

474 

475 # Test remote url 

476 if self._isurl(path): 

477 try: 

478 netfile = urlopen(path) 

479 netfile.close() 

480 del(netfile) 

481 return True 

482 except URLError: 

483 return False 

484 return False 

485 

486 def open(self, path, mode='r', encoding=None, newline=None): 

487 """ 

488 Open and return file-like object. 

489 

490 If `path` is an URL, it will be downloaded, stored in the 

491 `DataSource` directory and opened from there. 

492 

493 Parameters 

494 ---------- 

495 path : str or pathlib.Path 

496 Local file path or URL to open. 

497 mode : {'r', 'w', 'a'}, optional 

498 Mode to open `path`. Mode 'r' for reading, 'w' for writing, 

499 'a' to append. Available modes depend on the type of object 

500 specified by `path`. Default is 'r'. 

501 encoding : {None, str}, optional 

502 Open text file with given encoding. The default encoding will be 

503 what `open` uses. 

504 newline : {None, str}, optional 

505 Newline to use when reading text file. 

506 

507 Returns 

508 ------- 

509 out : file object 

510 File object. 

511 

512 """ 

513 

514 # TODO: There is no support for opening a file for writing which 

515 # doesn't exist yet (creating a file). Should there be? 

516 

517 # TODO: Add a ``subdir`` parameter for specifying the subdirectory 

518 # used to store URLs in self._destpath. 

519 

520 if self._isurl(path) and self._iswritemode(mode): 

521 raise ValueError("URLs are not writeable") 

522 

523 # NOTE: _findfile will fail on a new file opened for writing. 

524 found = self._findfile(path) 

525 if found: 

526 _fname, ext = self._splitzipext(found) 

527 if ext == 'bz2': 

528 mode.replace("+", "") 

529 return _file_openers[ext](found, mode=mode, 

530 encoding=encoding, newline=newline) 

531 else: 

532 raise FileNotFoundError(f"{path} not found.") 

533 

534 

class Repository (DataSource):
    """
    Repository(baseurl, destpath='.')

    A data repository where multiple DataSource's share a base
    URL/directory.

    `Repository` extends `DataSource` by prepending a base URL (or
    directory) to all the files it handles. Use `Repository` when you will
    be working with multiple files from one base URL.  Initialize
    `Repository` with the base URL, then refer to each file by its filename
    only.

    Parameters
    ----------
    baseurl : str
        Path to the local directory or remote location that contains the
        data files.
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Examples
    --------
    To analyze all files in the repository, do something like this
    (note: this is not self-contained code)::

        >>> repos = np.lib._datasource.Repository('/home/user/data/dir/')
        >>> for filename in filelist:
        ...     fp = repos.open(filename)
        ...     fp.analyze()
        ...     fp.close()

    Similarly you could use a URL for a repository::

        >>> repos = np.lib._datasource.Repository('http://www.xyz.edu/data')

    """

    def __init__(self, baseurl, destpath=os.curdir):
        """Create a Repository with a shared url or directory of baseurl."""
        DataSource.__init__(self, destpath=destpath)
        self._baseurl = baseurl

    def __del__(self):
        DataSource.__del__(self)

    def _fullpath(self, path):
        """Return complete path for path.  Prepends baseurl if necessary.

        `path` is returned unchanged (as a str) when it already contains
        the base URL; otherwise the base URL is joined in front of it.
        """
        # Accept os.PathLike objects as documented; str passes through.
        path = os.fspath(path)
        # A containment test rather than ``str.split``: it gives the same
        # answer for a non-empty base URL and does not raise for an empty one.
        if self._baseurl in path:
            return path  # path contains baseurl already
        return os.path.join(self._baseurl, path)

    def _findfile(self, path):
        """Extend DataSource method to prepend baseurl to ``path``."""
        return DataSource._findfile(self, self._fullpath(path))

    def abspath(self, path):
        """
        Return absolute path of file in the Repository directory.

        If `path` is an URL, then `abspath` will return either the location
        the file exists locally or the location it would exist when opened
        using the `open` method.

        Parameters
        ----------
        path : str or pathlib.Path
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        """
        return DataSource.abspath(self, self._fullpath(path))

    def exists(self, path):
        """
        Test if path exists prepending Repository base URL to path.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str or pathlib.Path
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is an URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """
        return DataSource.exists(self, self._fullpath(path))

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object prepending Repository base URL.

        If `path` is an URL, it will be downloaded, stored in the
        DataSource directory and opened from there.

        Parameters
        ----------
        path : str or pathlib.Path
            Local file path or URL to open. This may, but does not have to,
            include the `baseurl` with which the `Repository` was
            initialized.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.
        encoding : {None, str}, optional
            Open text file with given encoding. The default encoding will be
            what `open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        """
        return DataSource.open(self, self._fullpath(path), mode,
                               encoding=encoding, newline=newline)

    def listdir(self):
        """
        List files in the source Repository.

        Returns
        -------
        files : list of str or pathlib.Path
            List of file names (not containing a directory part).

        Raises
        ------
        NotImplementedError
            If the repository's base is an URL.

        Notes
        -----
        Does not currently work for remote repositories.

        """
        if self._isurl(self._baseurl):
            raise NotImplementedError(
                  "Directory listing of URLs, not supported yet.")
        return os.listdir(self._baseurl)