Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.10/site-packages/numpy/lib/_datasource.py: 22%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

172 statements  

1"""A file interface for handling local and remote data files. 

2 

3The goal of datasource is to abstract some of the file system operations 

4when dealing with data files so the researcher doesn't have to know all the 

5low-level details. Through datasource, a researcher can obtain and use a 

6file with one function call, regardless of location of the file. 

7 

8DataSource is meant to augment standard python libraries, not replace them. 

9It should work seamlessly with standard file IO operations and the os 

10module. 

11 

12DataSource files can originate locally or remotely: 

13 

14- local files : '/home/guido/src/local/data.txt' 

15- URLs (http, ftp, ...) : 'http://www.scipy.org/not/real/data.txt' 

16 

17DataSource files can also be compressed or uncompressed. Currently only 

18gzip, bz2 and xz are supported. 

19 

20Example:: 

21 

22 >>> # Create a DataSource, use os.curdir (default) for local storage. 

23 >>> from numpy import DataSource 

24 >>> ds = DataSource() 

25 >>> 

26 >>> # Open a remote file. 

27 >>> # DataSource downloads the file, stores it locally in: 

28 >>> # './www.google.com/index.html' 

29 >>> # opens the file and returns a file object. 

30 >>> fp = ds.open('http://www.google.com/') # doctest: +SKIP 

31 >>> 

32 >>> # Use the file as you normally would 

33 >>> fp.read() # doctest: +SKIP 

34 >>> fp.close() # doctest: +SKIP 

35 

36""" 

37import os 

38 

39from .._utils import set_module 

40 

41 

42_open = open 

43 

44 

def _check_mode(mode, encoding, newline):
    """Validate an open mode together with the text-only arguments.

    A mode may not request text (``"t"``) and binary (``"b"``) at the same
    time.  Any mode that does not contain ``"t"`` is treated as binary, and
    in that case neither `encoding` nor `newline` may be supplied.

    Parameters
    ----------
    mode : str
        File open mode.
    encoding : str
        File encoding.
    newline : str
        Newline for text files.

    Raises
    ------
    ValueError
        If `mode` contains both ``"t"`` and ``"b"``, or if `encoding` or
        `newline` is not None while `mode` lacks ``"t"``.

    """
    is_text = "t" in mode

    if is_text and "b" in mode:
        raise ValueError(f"Invalid mode: {mode!r}")
    if is_text:
        return

    # No "t" flag: the text-only arguments must be left at None.
    # `encoding` is checked before `newline`, so it wins if both are set.
    for arg_name, arg_value in (("encoding", encoding), ("newline", newline)):
        if arg_value is not None:
            raise ValueError(
                f"Argument {arg_name!r} not supported in binary mode")

66 

67 

68# Using a class instead of a module-level dictionary 

69# to reduce the initial 'import numpy' overhead by 

70# deferring the import of lzma, bz2 and gzip until needed 

71 

72# TODO: .zip support, .tar support? 

class _FileOpeners:
    """
    Container for different methods to open (un-)compressed files.

    `_FileOpeners` contains a dictionary that holds one method for each
    supported file format. Attribute lookup is implemented in such a way
    that an instance of `_FileOpeners` itself can be indexed with the keys
    of that dictionary. Currently uncompressed files as well as files
    compressed with ``gzip``, ``bz2`` or ``xz`` compression are supported.

    The compression modules are imported lazily, on the first call to
    `keys` or the first item lookup, so that creating an instance stays
    cheap.

    Notes
    -----
    `_file_openers`, an instance of `_FileOpeners`, is made available for
    use in the `_datasource` module.

    Examples
    --------
    >>> import gzip
    >>> np.lib._datasource._file_openers.keys()
    [None, '.bz2', '.gz', '.xz', '.lzma']
    >>> np.lib._datasource._file_openers['.gz'] is gzip.open
    True

    """

    def __init__(self):
        # Bind the builtin explicitly.  This module defines its own
        # ``open`` function further down, so a bare ``open`` would resolve
        # to that wrapper for any instance created after import.
        import builtins

        self._loaded = False
        self._file_openers = {None: builtins.open}

    def _load(self):
        """Register the compressed-file openers (runs at most once)."""
        if self._loaded:
            return

        # Each codec is optional: a missing module simply means that
        # extension is not offered, so the ImportError is ignored on purpose.
        try:
            import bz2
            self._file_openers[".bz2"] = bz2.open
        except ImportError:
            pass

        try:
            import gzip
            self._file_openers[".gz"] = gzip.open
        except ImportError:
            pass

        try:
            import lzma
            self._file_openers[".xz"] = lzma.open
            self._file_openers[".lzma"] = lzma.open
        except (ImportError, AttributeError):
            # There are incompatible backports of lzma that do not have the
            # lzma.open attribute, so catch that as well as ImportError.
            pass

        self._loaded = True

    def keys(self):
        """
        Return the keys of currently supported file openers.

        Parameters
        ----------
        None

        Returns
        -------
        keys : list
            The keys are None for uncompressed files and the file extension
            strings (i.e. ``'.gz'``, ``'.xz'``) for supported compression
            methods.

        """
        self._load()
        return list(self._file_openers)

    def __getitem__(self, key):
        """Return the opener registered for `key` (KeyError if unknown)."""
        self._load()
        return self._file_openers[key]

151 

152_file_openers = _FileOpeners() 

153 

def open(path, mode='r', destpath=os.curdir, encoding=None, newline=None):
    """
    Open `path` with `mode` and return the file object.

    If ``path`` is an URL, it will be downloaded, stored in the
    `DataSource` `destpath` directory and opened from there.

    Parameters
    ----------
    path : str or pathlib.Path
        Local file path or URL to open.
    mode : str, optional
        Mode to open `path`. Mode 'r' for reading, 'w' for writing, 'a' to
        append. Available modes depend on the type of object specified by
        path.  Default is 'r'.
    destpath : str, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.
    encoding : {None, str}, optional
        Open text file with given encoding. The default encoding will be
        what `open` uses.
    newline : {None, str}, optional
        Newline to use when reading text file.

    Returns
    -------
    out : file object
        The opened file.

    Notes
    -----
    This is a convenience function that instantiates a `DataSource` and
    returns the file object from ``DataSource.open(path)``.

    """
    # A throw-away DataSource rooted at `destpath` does all the work.
    return DataSource(destpath).open(
        path, mode, encoding=encoding, newline=newline)

193 

194 

@set_module('numpy.lib.npyio')
class DataSource:
    """
    DataSource(destpath='.')

    A generic data source file (file, http, ftp, ...).

    DataSources can be local files or remote files/URLs.  The files may
    also be compressed or uncompressed. DataSource hides some of the
    low-level details of downloading the file, allowing you to simply pass
    in a valid file path (or URL) and obtain a file object.

    Parameters
    ----------
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Notes
    -----
    URLs require a scheme string (``http://``) to be used, without it they
    will fail::

        >>> repos = np.lib.npyio.DataSource()
        >>> repos.exists('www.google.com/index.html')
        False
        >>> repos.exists('http://www.google.com/index.html')
        True

    Temporary directories are deleted when the DataSource is deleted.

    Examples
    --------
    ::

        >>> ds = np.lib.npyio.DataSource('/home/guido')
        >>> urlname = 'http://www.google.com/'
        >>> gfile = ds.open('http://www.google.com/')
        >>> ds.abspath(urlname)
        '/home/guido/www.google.com/index.html'

        >>> ds = np.lib.npyio.DataSource(None)  # use with temporary file
        >>> ds.open('/home/guido/foobar.txt')
        <open file '/home/guido/foobar.txt', mode 'r' at 0x91d4430>
        >>> ds.abspath('/home/guido/foobar.txt')
        '/tmp/.../home/guido/foobar.txt'

    """

    def __init__(self, destpath=os.curdir):
        """Create a DataSource with a local path at destpath."""
        if destpath:
            self._destpath = os.path.abspath(destpath)
            self._istmpdest = False
        else:
            import tempfile  # deferring import to improve startup time
            self._destpath = tempfile.mkdtemp()
            self._istmpdest = True

    def __del__(self):
        # Remove temp directories.  The attribute may be missing if
        # __init__ raised before setting it, hence the getattr default.
        if getattr(self, '_istmpdest', False):
            import shutil

            shutil.rmtree(self._destpath)

    def _iszip(self, filename):
        """Test if the filename is a zip file by looking at the file extension.

        """
        ext = os.path.splitext(filename)[1]
        return ext in _file_openers.keys()

    def _iswritemode(self, mode):
        """Test if the given mode will open a file for writing."""

        # Currently only used to test the bz2 files.
        _writemodes = ("w", "+")
        return any(c in _writemodes for c in mode)

    def _splitzipext(self, filename):
        """Split zip extension from filename and return filename.

        Returns
        -------
        base, zip_ext : {tuple}
            ``zip_ext`` is None when `filename` has no recognised
            compression extension.

        """

        if self._iszip(filename):
            return os.path.splitext(filename)
        return filename, None

    def _possible_names(self, filename):
        """Return a list containing compressed filename variations."""
        names = [filename]
        if not self._iszip(filename):
            # The None key (uncompressed) is falsy and therefore skipped.
            names.extend(filename + zipext
                         for zipext in _file_openers.keys() if zipext)
        return names

    def _isurl(self, path):
        """Test if path is a net location.  Tests the scheme and netloc."""

        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        # BUG : URLs require a scheme string ('http://') to be used.
        #       www.google.com will fail.
        #       Should we prepend the scheme for those that don't have it and
        #       test that also?  Similar to the way we append .gz and test
        #       for compressed versions of files.

        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        return bool(scheme and netloc)

    def _cache(self, path):
        """Cache the file specified by path.

        Creates a copy of the file in the datasource cache and returns the
        path of that copy.

        """
        # We import these here because importing them is slow and
        # a significant fraction of numpy's total import time.
        import shutil
        from urllib.request import urlopen

        upath = self.abspath(path)

        # ensure directory exists (exist_ok avoids a check-then-create race)
        os.makedirs(os.path.dirname(upath), exist_ok=True)

        # TODO: Doesn't handle compressed files!
        if self._isurl(path):
            with urlopen(path) as openedurl:
                with _open(upath, 'wb') as f:
                    shutil.copyfileobj(openedurl, f)
        else:
            shutil.copyfile(path, upath)
        return upath

    def _findfile(self, path):
        """Searches for ``path`` and returns full path if found.

        If path is an URL, _findfile will cache a local copy and return the
        path to the cached file.  If path is a local file, _findfile will
        return a path to that local file.

        The search will include possible compressed versions of the file
        and return the first occurrence found.  None is returned when no
        candidate exists.

        """

        # Build list of possible local file paths
        if not self._isurl(path):
            # Valid local paths
            filelist = self._possible_names(path)
            # Paths in self._destpath
            filelist += self._possible_names(self.abspath(path))
        else:
            # Cached URLs in self._destpath
            filelist = self._possible_names(self.abspath(path))
            # Remote URLs
            filelist = filelist + self._possible_names(path)

        for name in filelist:
            if self.exists(name):
                if self._isurl(name):
                    name = self._cache(name)
                return name
        return None

    def abspath(self, path):
        """
        Return absolute path of file in the DataSource directory.

        If `path` is an URL, then `abspath` will return either the location
        the file exists locally or the location it would exist when opened
        using the `open` method.

        Parameters
        ----------
        path : str or pathlib.Path
            Can be a local file or a remote URL.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        Notes
        -----
        The functionality is based on `os.path.abspath`.

        """
        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        # Accept os.PathLike objects; everything below uses str methods.
        path = os.fspath(path)

        # TODO:  This should be more robust.  Handles case where path includes
        #        the destpath, but not other sub-paths. Failing case:
        #        path = /home/guido/datafile.txt
        #        destpath = /home/alex/
        #        upath = self.abspath(path)
        #        upath == '/home/alex/home/guido/datafile.txt'

        # handle case where path includes self._destpath
        splitpath = path.split(self._destpath, 2)
        if len(splitpath) > 1:
            path = splitpath[1]
        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        netloc = self._sanitize_relative_path(netloc)
        upath = self._sanitize_relative_path(upath)
        return os.path.join(self._destpath, netloc, upath)

    def _sanitize_relative_path(self, path):
        """Return a sanitised relative path for which
        os.path.abspath(os.path.join(base, path)).startswith(base)
        """
        last = None
        path = os.path.normpath(path)
        # Strip leading separators, parent references and drive prefixes
        # repeatedly until a pass leaves the path unchanged.
        while path != last:
            last = path
            # Note: os.path.join treats '/' as os.sep on Windows
            path = path.lstrip(os.sep).lstrip('/')
            path = path.lstrip(os.pardir).removeprefix('..')
            drive, path = os.path.splitdrive(path)  # for Windows
        return path

    def exists(self, path):
        """
        Test if path exists.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str or pathlib.Path
            Can be a local file or a remote URL.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is an URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """
        # Accept os.PathLike objects; abspath/_isurl need a plain string.
        path = os.fspath(path)

        # First test for local path
        if os.path.exists(path):
            return True

        # Test cached url
        if os.path.exists(self.abspath(path)):
            return True

        # Test remote url
        if not self._isurl(path):
            return False

        # We import this here because importing urllib is slow and
        # a significant fraction of numpy's total import time.
        from urllib.request import urlopen
        from urllib.error import URLError

        try:
            netfile = urlopen(path)
            netfile.close()
        except URLError:
            return False
        return True

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object.

        If `path` is an URL, it will be downloaded, stored in the
        `DataSource` directory and opened from there.

        Parameters
        ----------
        path : str or pathlib.Path
            Local file path or URL to open.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.
        encoding : {None, str}, optional
            Open text file with given encoding. The default encoding will be
            what `open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        Raises
        ------
        ValueError
            If `path` is an URL and `mode` requests writing.
        FileNotFoundError
            If neither `path` nor a compressed variant of it can be found.

        """

        # TODO: There is no support for opening a file for writing which
        #       doesn't exist yet (creating a file).  Should there be?

        # TODO: Add a ``subdir`` parameter for specifying the subdirectory
        #       used to store URLs in self._destpath.

        # Accept os.PathLike objects; the helpers below need a plain string.
        path = os.fspath(path)

        if self._isurl(path) and self._iswritemode(mode):
            raise ValueError("URLs are not writeable")

        # NOTE: _findfile will fail on a new file opened for writing.
        found = self._findfile(path)
        if not found:
            raise FileNotFoundError(f"{path} not found.")

        _fname, ext = self._splitzipext(found)
        if ext == '.bz2':
            # bz2.open() offers no update ("+") modes, so drop the flag.
            # The extension includes its leading dot and str.replace
            # returns a new string, so the result must be rebound.
            mode = mode.replace("+", "")
        return _file_openers[ext](found, mode=mode,
                                  encoding=encoding, newline=newline)

530 

531 

class Repository(DataSource):
    """
    Repository(baseurl, destpath='.')

    A data repository where multiple DataSource's share a base
    URL/directory.

    `Repository` extends `DataSource` by prepending a base URL (or
    directory) to all the files it handles. Use `Repository` when you will
    be working with multiple files from one base URL.  Initialize
    `Repository` with the base URL, then refer to each file by its filename
    only.

    Parameters
    ----------
    baseurl : str
        Path to the local directory or remote location that contains the
        data files.
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Examples
    --------
    To analyze all files in the repository, do something like this
    (note: this is not self-contained code)::

        >>> repos = np.lib._datasource.Repository('/home/user/data/dir/')
        >>> for filename in filelist:
        ...     fp = repos.open(filename)
        ...     fp.analyze()
        ...     fp.close()

    Similarly you could use a URL for a repository::

        >>> repos = np.lib._datasource.Repository('http://www.xyz.edu/data')

    """

    def __init__(self, baseurl, destpath=os.curdir):
        """Create a Repository with a shared url or directory of baseurl."""
        super().__init__(destpath=destpath)
        self._baseurl = baseurl

    def __del__(self):
        super().__del__()

    def _fullpath(self, path):
        """Return complete path for path.  Prepends baseurl if necessary."""
        # Accept os.PathLike objects; the split below is a str method.
        path = os.fspath(path)
        # A single piece after splitting means baseurl does not occur in
        # path, so it still has to be prepended.
        if len(path.split(self._baseurl, 2)) == 1:
            return os.path.join(self._baseurl, path)
        return path  # path contains baseurl already

    def _findfile(self, path):
        """Extend DataSource method to prepend baseurl to ``path``."""
        return super()._findfile(self._fullpath(path))

    def abspath(self, path):
        """
        Return absolute path of file in the Repository directory.

        If `path` is an URL, then `abspath` will return either the location
        the file exists locally or the location it would exist when opened
        using the `open` method.

        Parameters
        ----------
        path : str or pathlib.Path
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        """
        return super().abspath(self._fullpath(path))

    def exists(self, path):
        """
        Test if path exists prepending Repository base URL to path.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str or pathlib.Path
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is an URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """
        return super().exists(self._fullpath(path))

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object prepending Repository base URL.

        If `path` is an URL, it will be downloaded, stored in the
        DataSource directory and opened from there.

        Parameters
        ----------
        path : str or pathlib.Path
            Local file path or URL to open. This may, but does not have to,
            include the `baseurl` with which the `Repository` was
            initialized.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.
        encoding : {None, str}, optional
            Open text file with given encoding. The default encoding will be
            what `open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        """
        return super().open(self._fullpath(path), mode,
                            encoding=encoding, newline=newline)

    def listdir(self):
        """
        List files in the source Repository.

        Returns
        -------
        files : list of str or pathlib.Path
            List of file names (not containing a directory part).

        Raises
        ------
        NotImplementedError
            If the repository's base is an URL.

        Notes
        -----
        Does not currently work for remote repositories.

        """
        if self._isurl(self._baseurl):
            raise NotImplementedError(
                "Directory listing of URLs, not supported yet.")
        return os.listdir(self._baseurl)