Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/numpy/lib/

1"""A file interface for handling local and remote data files.

3The goal of datasource is to abstract some of the file system operations

4when dealing with data files so the researcher doesn't have to know all the

5low-level details. Through datasource, a researcher can obtain and use a

6file with one function call, regardless of location of the file.

8DataSource is meant to augment standard python libraries, not replace them.

9It should work seamlessly with standard file IO operations and the os

10module.

12DataSource files can originate locally or remotely:

14- local files : '/home/guido/src/local/data.txt'

15- URLs (http, ftp, ...) : 'http://www.scipy.org/not/real/data.txt'

17DataSource files can also be compressed or uncompressed. Currently only

18gzip, bz2 and xz are supported.

20Example::

22 >>> # Create a DataSource, use os.curdir (default) for local storage.

23 >>> from numpy import DataSource

24 >>> ds = DataSource()

25 >>>

26 >>> # Open a remote file.

27 >>> # DataSource downloads the file, stores it locally in:

28 >>> # './www.google.com/index.html'

29 >>> # opens the file and returns a file object.

30 >>> fp = ds.open('http://www.google.com/') # doctest: +SKIP

31 >>>

32 >>> # Use the file as you normally would

33 >>> fp.read() # doctest: +SKIP

34 >>> fp.close() # doctest: +SKIP

36"""

37import os

38import io

40from numpy.core.overrides import set_module

43_open = open

46def _check_mode(mode, encoding, newline):

47 """Check mode and that encoding and newline are compatible.

49 Parameters

50 ----------

51 mode : str

52 File open mode.

53 encoding : str

54 File encoding.

55 newline : str

56 Newline for text files.

58 """

59 if "t" in mode:

60 if "b" in mode:

61 raise ValueError("Invalid mode: %r" % (mode,))

62 else:

63 if encoding is not None:

64 raise ValueError("Argument 'encoding' not supported in binary mode")

65 if newline is not None:

66 raise ValueError("Argument 'newline' not supported in binary mode")

69# Using a class instead of a module-level dictionary

70# to reduce the initial 'import numpy' overhead by

71# deferring the import of lzma, bz2 and gzip until needed

73# TODO: .zip support, .tar support?

74class _FileOpeners:

75 """

76 Container for different methods to open (un-)compressed files.

78 `_FileOpeners` contains a dictionary that holds one method for each

79 supported file format. Attribute lookup is implemented in such a way

80 that an instance of `_FileOpeners` itself can be indexed with the keys

81 of that dictionary. Currently uncompressed files as well as files

82 compressed with ``gzip``, ``bz2`` or ``xz`` compression are supported.

84 Notes

85 -----

86 `_file_openers`, an instance of `_FileOpeners`, is made available for

87 use in the `_datasource` module.

89 Examples

90 --------

91 >>> import gzip

92 >>> np.lib._datasource._file_openers.keys()

93 [None, '.bz2', '.gz', '.xz', '.lzma']

94 >>> np.lib._datasource._file_openers['.gz'] is gzip.open

95 True

97 """

99 def __init__(self):

100 self._loaded = False

101 self._file_openers = {None: io.open}

102

103 def _load(self):

104 if self._loaded:

105 return

106

107 try:

108 import bz2

109 self._file_openers[".bz2"] = bz2.open

110 except ImportError:

111 pass

112

113 try:

114 import gzip

115 self._file_openers[".gz"] = gzip.open

116 except ImportError:

117 pass

118

119 try:

120 import lzma

121 self._file_openers[".xz"] = lzma.open

122 self._file_openers[".lzma"] = lzma.open

123 except (ImportError, AttributeError):

124 # There are incompatible backports of lzma that do not have the

125 # lzma.open attribute, so catch that as well as ImportError.

126 pass

127

128 self._loaded = True

129

130 def keys(self):

131 """

132 Return the keys of currently supported file openers.

133

134 Parameters

135 ----------

136 None

137

138 Returns

139 -------

140 keys : list

141 The keys are None for uncompressed files and the file extension

142 strings (i.e. ``'.gz'``, ``'.xz'``) for supported compression

143 methods.

144

145 """

146 self._load()

147 return list(self._file_openers.keys())

148

149 def __getitem__(self, key):

150 self._load()

151 return self._file_openers[key]

152

153_file_openers = _FileOpeners()

154

def open(path, mode='r', destpath=os.curdir, encoding=None, newline=None):
    """
    Open `path` with `mode` and return the file object.

    Convenience wrapper: a `DataSource` rooted at `destpath` is created and
    its ``open`` method is called with the remaining arguments. If `path`
    is a URL, it will be downloaded, stored in the `DataSource` `destpath`
    directory and opened from there.

    Parameters
    ----------
    path : str
        Local file path or URL to open.
    mode : str, optional
        Mode to open `path`. Mode 'r' for reading, 'w' for writing, 'a' to
        append. Available modes depend on the type of object specified by
        path.  Default is 'r'.
    destpath : str, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.
    encoding : {None, str}, optional
        Open text file with given encoding. The default encoding will be
        what `io.open` uses.
    newline : {None, str}, optional
        Newline to use when reading text file.

    Returns
    -------
    out : file object
        The opened file.

    """
    source = DataSource(destpath)
    return source.open(path, mode, encoding=encoding, newline=newline)

194

195

@set_module('numpy')
class DataSource:
    """
    DataSource(destpath='.')

    A generic data source file (file, http, ftp, ...).

    DataSources can be local files or remote files/URLs.  The files may
    also be compressed or uncompressed. DataSource hides some of the
    low-level details of downloading the file, allowing you to simply pass
    in a valid file path (or URL) and obtain a file object.

    Parameters
    ----------
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Notes
    -----
    URLs require a scheme string (``http://``) to be used, without it they
    will fail::

        >>> repos = np.DataSource()
        >>> repos.exists('www.google.com/index.html')
        False
        >>> repos.exists('http://www.google.com/index.html')
        True

    Temporary directories are deleted when the DataSource is deleted.

    Examples
    --------
    ::

        >>> ds = np.DataSource('/home/guido')
        >>> urlname = 'http://www.google.com/'
        >>> gfile = ds.open('http://www.google.com/')
        >>> ds.abspath(urlname)
        '/home/guido/www.google.com/index.html'

        >>> ds = np.DataSource(None)  # use with temporary file
        >>> ds.open('/home/guido/foobar.txt')
        <open file '/home/guido/foobar.txt', mode 'r' at 0x91d4430>
        >>> ds.abspath('/home/guido/foobar.txt')
        '/tmp/.../home/guido/foobar.txt'

    """

    def __init__(self, destpath=os.curdir):
        """Create a DataSource with a local path at destpath."""
        if destpath:
            self._destpath = os.path.abspath(destpath)
            self._istmpdest = False
        else:
            import tempfile  # deferring import to improve startup time
            self._destpath = tempfile.mkdtemp()
            self._istmpdest = True

    def __del__(self):
        # Remove temp directories.  ``getattr`` with a default covers
        # instances whose ``__init__`` never got as far as setting the flag.
        if getattr(self, '_istmpdest', False):
            import shutil

            shutil.rmtree(self._destpath)

    def _iszip(self, filename):
        """Test if the filename is a zip file by looking at the file extension.

        Returns True when the extension of `filename` is one of the keys of
        `_file_openers`.

        """
        ext = os.path.splitext(filename)[1]
        return ext in _file_openers.keys()

    def _iswritemode(self, mode):
        """Test if the given mode will open a file for writing.

        Only the characters ``'w'`` and ``'+'`` are treated as write
        indicators.

        """
        # NOTE(review): 'a' and 'x' are not counted as write modes here, so
        # `open` does not reject them for URLs -- confirm whether intended.
        _writemodes = ("w", "+")
        return any(c in _writemodes for c in mode)

    def _splitzipext(self, filename):
        """Split zip extension from filename and return filename.

        Returns
        -------
        base, zip_ext : {tuple}
            ``os.path.splitext(filename)`` when the extension is a known
            compression extension, otherwise ``(filename, None)``.

        """
        if self._iszip(filename):
            return os.path.splitext(filename)
        else:
            return filename, None

    def _possible_names(self, filename):
        """Return a list containing compressed filename variations.

        The list starts with `filename` itself.  Unless `filename` already
        has a known compression extension, it is followed by `filename`
        with each known compression extension appended.

        """
        names = [filename]
        if not self._iszip(filename):
            # The ``None`` key (uncompressed) is falsy and is skipped.
            names.extend(filename + zipext
                         for zipext in _file_openers.keys() if zipext)
        return names

    def _isurl(self, path):
        """Test if path is a net location.  Tests the scheme and netloc."""

        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        # BUG : URLs require a scheme string ('http://') to be used.
        #       www.google.com will fail.
        #       Should we prepend the scheme for those that don't have it and
        #       test that also?  Similar to the way we append .gz and test
        #       for compressed versions of files.

        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        return bool(scheme and netloc)

    def _cache(self, path):
        """Cache the file specified by path.

        Creates a copy of the file in the datasource cache and returns the
        path of that copy.

        """
        # We import these here because importing them is slow and
        # a significant fraction of numpy's total import time.
        import shutil
        from urllib.request import urlopen

        upath = self.abspath(path)

        # Ensure the directory exists.  ``exist_ok`` avoids the race between
        # a separate existence check and the directory creation.
        os.makedirs(os.path.dirname(upath), exist_ok=True)

        # TODO: Doesn't handle compressed files!
        if self._isurl(path):
            with urlopen(path) as openedurl:
                with _open(upath, 'wb') as f:
                    shutil.copyfileobj(openedurl, f)
        else:
            shutil.copyfile(path, upath)
        return upath

    def _findfile(self, path):
        """Searches for ``path`` and returns full path if found.

        If path is a URL, _findfile will cache a local copy and return the
        path to the cached file.  If path is a local file, _findfile will
        return a path to that local file.

        The search will include possible compressed versions of the file
        and return the first occurrence found.  Returns None when no
        candidate exists.

        """

        # Build list of possible local file paths
        if not self._isurl(path):
            # Valid local paths
            filelist = self._possible_names(path)
            # Paths in self._destpath
            filelist += self._possible_names(self.abspath(path))
        else:
            # Cached URLs in self._destpath
            filelist = self._possible_names(self.abspath(path))
            # Remote URLs
            filelist = filelist + self._possible_names(path)

        for name in filelist:
            if self.exists(name):
                if self._isurl(name):
                    name = self._cache(name)
                return name
        return None

    def abspath(self, path):
        """
        Return absolute path of file in the DataSource directory.

        If `path` is a URL, then `abspath` will return either the location
        the file exists locally or the location it would exist when opened
        using the `open` method.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        Notes
        -----
        The functionality is based on `os.path.abspath`.

        """
        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        # TODO:  This should be more robust.  Handles case where path includes
        #        the destpath, but not other sub-paths. Failing case:
        #        path = /home/guido/datafile.txt
        #        destpath = /home/alex/
        #        upath = self.abspath(path)
        #        upath == '/home/alex/home/guido/datafile.txt'

        # handle case where path includes self._destpath
        splitpath = path.split(self._destpath, 2)
        if len(splitpath) > 1:
            path = splitpath[1]
        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        netloc = self._sanitize_relative_path(netloc)
        upath = self._sanitize_relative_path(upath)
        return os.path.join(self._destpath, netloc, upath)

    def _sanitize_relative_path(self, path):
        """Return a sanitised relative path for which
        os.path.abspath(os.path.join(base, path)).startswith(base)
        """
        last = None
        path = os.path.normpath(path)
        # Repeat until a full pass leaves the path unchanged.
        while path != last:
            last = path
            # Note: os.path.join treats '/' as os.sep on Windows
            path = path.lstrip(os.sep).lstrip('/')
            path = path.lstrip(os.pardir).lstrip('..')
            drive, path = os.path.splitdrive(path)  # for Windows
        return path

    def exists(self, path):
        """
        Test if path exists.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is a URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """

        # First test for local path
        if os.path.exists(path):
            return True

        # We import this here because importing urllib is slow and
        # a significant fraction of numpy's total import time.
        from urllib.request import urlopen
        from urllib.error import URLError

        # Test cached url
        upath = self.abspath(path)
        if os.path.exists(upath):
            return True

        # Test remote url
        if self._isurl(path):
            try:
                # The context manager closes the handle on every exit path.
                with urlopen(path):
                    return True
            except URLError:
                return False
        return False

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object.

        If `path` is a URL, it will be downloaded, stored in the
        `DataSource` directory and opened from there.

        Parameters
        ----------
        path : str
            Local file path or URL to open.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.
        encoding : {None, str}, optional
            Open text file with given encoding. The default encoding will be
            what `io.open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        Raises
        ------
        ValueError
            If `path` is a URL and `mode` is a write mode.
        FileNotFoundError
            If `path` (or a compressed variant of it) cannot be found.

        """

        # TODO: There is no support for opening a file for writing which
        #       doesn't exist yet (creating a file).  Should there be?

        # TODO: Add a ``subdir`` parameter for specifying the subdirectory
        #       used to store URLs in self._destpath.

        if self._isurl(path) and self._iswritemode(mode):
            raise ValueError("URLs are not writeable")

        # NOTE: _findfile will fail on a new file opened for writing.
        found = self._findfile(path)
        if not found:
            raise FileNotFoundError(f"{path} not found.")

        _fname, ext = self._splitzipext(found)
        if ext == '.bz2':
            # bz2 does not support update ('+') modes, so strip the flag.
            # `_splitzipext` returns the extension with its leading dot,
            # and ``str.replace`` returns a new string, so the result must
            # be rebound.
            mode = mode.replace("+", "")
        return _file_openers[ext](found, mode=mode,
                                  encoding=encoding, newline=newline)

534

535

class Repository(DataSource):
    """
    Repository(baseurl, destpath='.')

    A `DataSource` whose files all live under one base URL or directory.

    Every path handed to a `Repository` method has `baseurl` prepended
    (unless the path already contains it) before being passed on to the
    corresponding `DataSource` method.  Use `Repository` when working with
    several files from one location: initialize it with the base URL, then
    refer to each file by its filename only.

    Parameters
    ----------
    baseurl : str
        Path to the local directory or remote location that contains the
        data files.
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Examples
    --------
    To analyze all files in the repository, do something like this
    (note: this is not self-contained code)::

        >>> repos = np.lib._datasource.Repository('/home/user/data/dir/')
        >>> for filename in filelist:
        ...     fp = repos.open(filename)
        ...     fp.analyze()
        ...     fp.close()

    Similarly you could use a URL for a repository::

        >>> repos = np.lib._datasource.Repository('http://www.xyz.edu/data')

    """

    def __init__(self, baseurl, destpath=os.curdir):
        """Create a Repository with a shared url or directory of baseurl."""
        super().__init__(destpath=destpath)
        self._baseurl = baseurl

    def __del__(self):
        super().__del__()

    def _fullpath(self, path):
        """Return complete path for path.  Prepends baseurl if necessary."""
        pieces = path.split(self._baseurl, 2)
        if len(pieces) != 1:
            # path contains baseurl already
            return path
        return os.path.join(self._baseurl, path)

    def _findfile(self, path):
        """Extend DataSource method to prepend baseurl to ``path``."""
        full = self._fullpath(path)
        return super()._findfile(full)

    def abspath(self, path):
        """
        Return absolute path of file in the Repository directory.

        `baseurl` is prepended to `path` when it is not already part of it,
        and the result is resolved by ``DataSource.abspath``.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        """
        full = self._fullpath(path)
        return super().abspath(full)

    def exists(self, path):
        """
        Test if path exists prepending Repository base URL to path.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        For a URL, True is returned both when a copy is stored locally in
        the `DataSource` directory and when the remote URL is valid; the
        two cases are not distinguished.

        """
        full = self._fullpath(path)
        return super().exists(full)

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object prepending Repository base URL.

        If `path` is a URL, it will be downloaded, stored in the
        DataSource directory and opened from there.

        Parameters
        ----------
        path : str
            Local file path or URL to open. This may, but does not have to,
            include the `baseurl` with which the `Repository` was
            initialized.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.
        encoding : {None, str}, optional
            Open text file with given encoding. The default encoding will be
            what `io.open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        """
        full = self._fullpath(path)
        return super().open(full, mode, encoding=encoding, newline=newline)

    def listdir(self):
        """
        List files in the source Repository.

        Returns
        -------
        files : list of str
            List of file names (not containing a directory part).

        Raises
        ------
        NotImplementedError
            If the repository base is a URL.

        Notes
        -----
        Does not currently work for remote repositories.

        """
        if not self._isurl(self._baseurl):
            return os.listdir(self._baseurl)
        raise NotImplementedError(
            "Directory listing of URLs, not supported yet.")

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/numpy/lib/_datasource.py: 22%

176 statements