Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/numpy/lib/

1"""A file interface for handling local and remote data files.

3The goal of datasource is to abstract some of the file system operations

4when dealing with data files so the researcher doesn't have to know all the

5low-level details. Through datasource, a researcher can obtain and use a

6file with one function call, regardless of location of the file.

8DataSource is meant to augment standard python libraries, not replace them.

9It should work seamlessly with standard file IO operations and the os

10module.

12DataSource files can originate locally or remotely:

14- local files : '/home/guido/src/local/data.txt'

15- URLs (http, ftp, ...) : 'http://www.scipy.org/not/real/data.txt'

17DataSource files can also be compressed or uncompressed. Currently only

18gzip, bz2 and xz are supported.

20Example::

22 >>> # Create a DataSource, use os.curdir (default) for local storage.

23 >>> from numpy import DataSource

24 >>> ds = DataSource()

25 >>>

26 >>> # Open a remote file.

27 >>> # DataSource downloads the file, stores it locally in:

28 >>> # './www.google.com/index.html'

29 >>> # opens the file and returns a file object.

30 >>> fp = ds.open('http://www.google.com/') # doctest: +SKIP

31 >>>

32 >>> # Use the file as you normally would

33 >>> fp.read() # doctest: +SKIP

34 >>> fp.close() # doctest: +SKIP

36"""

37import os

39from .._utils import set_module

# Keep a reference to the builtin ``open``: this module later defines its own
# module-level ``open`` function, which shadows the builtin name here.
_open = open

def _check_mode(mode, encoding, newline):
    """Validate `mode` and its compatibility with `encoding` and `newline`.

    A mode without ``"t"`` is treated as binary, in which case neither
    `encoding` nor `newline` may be given. A mode containing both ``"t"``
    and ``"b"`` is rejected.

    Parameters
    ----------
    mode : str
        File open mode.
    encoding : str
        File encoding.
    newline : str
        Newline for text files.

    Raises
    ------
    ValueError
        If `mode` mixes text and binary flags, or if `encoding` or
        `newline` is supplied for a binary mode.

    """
    if "t" not in mode:
        # Binary mode: text-only arguments are not allowed.
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")
        return

    # Text mode was requested explicitly; it cannot also be binary.
    if "b" in mode:
        raise ValueError("Invalid mode: %r" % (mode,))

68# Using a class instead of a module-level dictionary

69# to reduce the initial 'import numpy' overhead by

70# deferring the import of lzma, bz2 and gzip until needed

72# TODO: .zip support, .tar support?

class _FileOpeners:
    """
    Lazy registry of callables used to open (un-)compressed files.

    The registry is a dictionary keyed by file extension, with ``None``
    standing for uncompressed files. Instances can be indexed directly with
    those keys. The compression modules (``bz2``, ``gzip``, ``lzma``) are
    only imported the first time the registry is queried, which keeps them
    out of the initial ``import numpy`` cost.

    Notes
    -----
    `_file_openers`, an instance of `_FileOpeners`, is made available for
    use in the `_datasource` module.

    Examples
    --------
    >>> import gzip
    >>> np.lib._datasource._file_openers.keys()
    [None, '.bz2', '.gz', '.xz', '.lzma']
    >>> np.lib._datasource._file_openers['.gz'] is gzip.open
    True

    """

    def __init__(self):
        # Only the plain opener is registered eagerly; see `_load`.
        self._file_openers = {None: open}
        self._loaded = False

    def _load(self):
        """Register the compressed-file openers (runs its body only once)."""
        if not self._loaded:
            try:
                import bz2
            except ImportError:
                pass
            else:
                self._file_openers[".bz2"] = bz2.open

            try:
                import gzip
            except ImportError:
                pass
            else:
                self._file_openers[".gz"] = gzip.open

            try:
                import lzma
                lzma_open = lzma.open
            except (ImportError, AttributeError):
                # There are incompatible backports of lzma that do not have
                # the lzma.open attribute, so catch that as well as
                # ImportError.
                pass
            else:
                self._file_openers[".xz"] = lzma_open
                self._file_openers[".lzma"] = lzma_open

            self._loaded = True

    def keys(self):
        """
        Return the keys of currently supported file openers.

        Returns
        -------
        keys : list
            The keys are None for uncompressed files and the file extension
            strings (i.e. ``'.gz'``, ``'.xz'``) for supported compression
            methods.

        """
        self._load()
        return list(self._file_openers)

    def __getitem__(self, key):
        """Return the opener registered for `key`, loading openers first."""
        self._load()
        return self._file_openers[key]

151

# Shared module-level registry; the compression modules are imported lazily
# the first time ``keys()`` is called or an item is looked up.
_file_openers = _FileOpeners()

153

def open(path, mode='r', destpath=os.curdir, encoding=None, newline=None):
    """
    Open `path` with `mode` and return the file object.

    If ``path`` is a URL, it will be downloaded, stored in the
    `DataSource` `destpath` directory and opened from there.

    Parameters
    ----------
    path : str or pathlib.Path
        Local file path or URL to open.
    mode : str, optional
        Mode to open `path`. Mode 'r' for reading, 'w' for writing, 'a' to
        append. Available modes depend on the type of object specified by
        path. Default is 'r'.
    destpath : str, optional
        Path to the directory where the source file gets downloaded to for
        use. If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.
    encoding : {None, str}, optional
        Open text file with given encoding. The default encoding will be
        what `open` uses.
    newline : {None, str}, optional
        Newline to use when reading text file.

    Returns
    -------
    out : file object
        The opened file.

    Notes
    -----
    This is a convenience function that instantiates a `DataSource` and
    returns the file object from ``DataSource.open(path)``.

    """
    source = DataSource(destpath)
    opened = source.open(path, mode, encoding=encoding, newline=newline)
    return opened

193

194

@set_module('numpy.lib.npyio')
class DataSource:
    """
    DataSource(destpath='.')

    A generic data source file (file, http, ftp, ...).

    DataSources can be local files or remote files/URLs.  The files may
    also be compressed or uncompressed. DataSource hides some of the
    low-level details of downloading the file, allowing you to simply pass
    in a valid file path (or URL) and obtain a file object.

    Parameters
    ----------
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Notes
    -----
    URLs require a scheme string (``http://``) to be used, without it they
    will fail::

        >>> repos = np.lib.npyio.DataSource()
        >>> repos.exists('www.google.com/index.html')
        False
        >>> repos.exists('http://www.google.com/index.html')
        True

    Temporary directories are deleted when the DataSource is deleted.

    Examples
    --------
    ::

        >>> ds = np.lib.npyio.DataSource('/home/guido')
        >>> urlname = 'http://www.google.com/'
        >>> gfile = ds.open('http://www.google.com/')
        >>> ds.abspath(urlname)
        '/home/guido/www.google.com/index.html'

        >>> ds = np.lib.npyio.DataSource(None)  # use with temporary file
        >>> ds.open('/home/guido/foobar.txt')
        <open file '/home/guido/foobar.txt', mode 'r' at 0x91d4430>
        >>> ds.abspath('/home/guido/foobar.txt')
        '/tmp/.../home/guido/foobar.txt'

    """

    def __init__(self, destpath=os.curdir):
        """Create a DataSource with a local path at destpath."""
        if destpath:
            self._destpath = os.path.abspath(destpath)
            self._istmpdest = False
        else:
            import tempfile  # deferring import to improve startup time
            self._destpath = tempfile.mkdtemp()
            self._istmpdest = True

    def __del__(self):
        # Remove temp directories.  ``hasattr`` guards against an instance
        # whose ``__init__`` did not get as far as setting the attribute.
        if hasattr(self, '_istmpdest') and self._istmpdest:
            import shutil

            shutil.rmtree(self._destpath)

    def _iszip(self, filename):
        """Test if the filename is a zip file by looking at the file extension.

        """
        _, ext = os.path.splitext(filename)
        return ext in _file_openers.keys()

    def _iswritemode(self, mode):
        """Test if the given mode will open a file for writing."""

        # Currently only used to test the bz2 files.
        _writemodes = ("w", "+")
        return any(c in _writemodes for c in mode)

    def _splitzipext(self, filename):
        """Split zip extension from filename and return filename.

        Returns
        -------
        base, zip_ext : {tuple}
            ``zip_ext`` includes the leading dot (e.g. ``'.gz'``), or is
            None when `filename` has no recognised compression extension.

        """

        if self._iszip(filename):
            return os.path.splitext(filename)
        else:
            return filename, None

    def _possible_names(self, filename):
        """Return a list containing compressed filename variations."""
        names = [filename]
        if not self._iszip(filename):
            for zipext in _file_openers.keys():
                # Skip the ``None`` key used for uncompressed files.
                if zipext:
                    names.append(filename + zipext)
        return names

    def _isurl(self, path):
        """Test if path is a net location.  Tests the scheme and netloc."""

        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        # BUG : URLs require a scheme string ('http://') to be used.
        #       www.google.com will fail.
        #       Should we prepend the scheme for those that don't have it and
        #       test that also?  Similar to the way we append .gz and test
        #       for compressed versions of files.

        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        return bool(scheme and netloc)

    def _cache(self, path):
        """Cache the file specified by path.

        Creates a copy of the file in the datasource cache and returns the
        path of that copy.

        """
        # We import these here because importing them is slow and
        # a significant fraction of numpy's total import time.
        import shutil
        from urllib.request import urlopen

        upath = self.abspath(path)

        # ensure directory exists
        if not os.path.exists(os.path.dirname(upath)):
            os.makedirs(os.path.dirname(upath))

        # TODO: Doesn't handle compressed files!
        if self._isurl(path):
            with urlopen(path) as openedurl:
                # ``_open`` is the builtin; ``open`` is shadowed in this
                # module.
                with _open(upath, 'wb') as f:
                    shutil.copyfileobj(openedurl, f)
        else:
            shutil.copyfile(path, upath)
        return upath

    def _findfile(self, path):
        """Searches for ``path`` and returns full path if found.

        If path is a URL, _findfile will cache a local copy and return the
        path to the cached file.  If path is a local file, _findfile will
        return a path to that local file.

        The search will include possible compressed versions of the file
        and return the first occurrence found.  Returns None when nothing
        is found.

        """

        # Build list of possible local file paths
        if not self._isurl(path):
            # Valid local paths
            filelist = self._possible_names(path)
            # Paths in self._destpath
            filelist += self._possible_names(self.abspath(path))
        else:
            # Cached URLs in self._destpath
            filelist = self._possible_names(self.abspath(path))
            # Remote URLs
            filelist = filelist + self._possible_names(path)

        for name in filelist:
            if self.exists(name):
                if self._isurl(name):
                    name = self._cache(name)
                return name
        return None

    def abspath(self, path):
        """
        Return absolute path of file in the DataSource directory.

        If `path` is a URL, then `abspath` will return either the location
        the file exists locally or the location it would exist when opened
        using the `open` method.

        Parameters
        ----------
        path : str or pathlib.Path
            Can be a local file or a remote URL.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        Notes
        -----
        The functionality is based on `os.path.abspath`.

        """
        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        # TODO:  This should be more robust.  Handles case where path includes
        #        the destpath, but not other sub-paths. Failing case:
        #        path = /home/guido/datafile.txt
        #        destpath = /home/alex/
        #        upath = self.abspath(path)
        #        upath == '/home/alex/home/guido/datafile.txt'

        # handle case where path includes self._destpath
        splitpath = path.split(self._destpath, 2)
        if len(splitpath) > 1:
            path = splitpath[1]
        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        netloc = self._sanitize_relative_path(netloc)
        upath = self._sanitize_relative_path(upath)
        return os.path.join(self._destpath, netloc, upath)

    def _sanitize_relative_path(self, path):
        """Return a sanitised relative path for which
        os.path.abspath(os.path.join(base, path)).startswith(base)
        """
        last = None
        path = os.path.normpath(path)
        # Repeat until a full pass leaves the path unchanged.
        while path != last:
            last = path
            # Note: os.path.join treats '/' as os.sep on Windows
            path = path.lstrip(os.sep).lstrip('/')
            path = path.lstrip(os.pardir).lstrip('..')
            drive, path = os.path.splitdrive(path)  # for Windows
        return path

    def exists(self, path):
        """
        Test if path exists.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str or pathlib.Path
            Can be a local file or a remote URL.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is a URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """

        # First test for local path
        if os.path.exists(path):
            return True

        # We import this here because importing urllib is slow and
        # a significant fraction of numpy's total import time.
        from urllib.request import urlopen
        from urllib.error import URLError

        # Test cached url
        upath = self.abspath(path)
        if os.path.exists(upath):
            return True

        # Test remote url
        if self._isurl(path):
            try:
                # Opening is enough to prove the URL is reachable; the
                # ``with`` block closes the connection again.
                with urlopen(path):
                    pass
                return True
            except URLError:
                return False
        return False

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object.

        If `path` is a URL, it will be downloaded, stored in the
        `DataSource` directory and opened from there.

        Parameters
        ----------
        path : str or pathlib.Path
            Local file path or URL to open.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.  For ``.bz2`` files any
            ``'+'`` is removed from `mode` before the file is opened.
        encoding : {None, str}, optional
            Open text file with given encoding. The default encoding will be
            what `open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        Raises
        ------
        ValueError
            If `path` is a URL and `mode` requests writing.
        FileNotFoundError
            If `path` (or a compressed variant of it) cannot be found.

        """

        # TODO: There is no support for opening a file for writing which
        #       doesn't exist yet (creating a file).  Should there be?

        # TODO: Add a ``subdir`` parameter for specifying the subdirectory
        #       used to store URLs in self._destpath.

        if self._isurl(path) and self._iswritemode(mode):
            raise ValueError("URLs are not writeable")

        # NOTE: _findfile will fail on a new file opened for writing.
        found = self._findfile(path)
        if not found:
            raise FileNotFoundError(f"{path} not found.")

        _fname, ext = self._splitzipext(found)
        # ``ext`` carries its leading dot, and ``str.replace`` returns a new
        # string, so the result has to be assigned back.  bz2 has no update
        # ('+') modes, so the flag is dropped for those files.
        if ext == '.bz2':
            mode = mode.replace("+", "")
        return _file_openers[ext](found, mode=mode,
                                  encoding=encoding, newline=newline)

533

534

class Repository(DataSource):
    """
    Repository(baseurl, destpath='.')

    A `DataSource` whose files all live under one common base URL or
    directory.

    Every path handed to a `Repository` is resolved relative to `baseurl`
    unless it already contains it, so individual files can be referred to
    by filename only. Use it when working with several files from the same
    location.

    Parameters
    ----------
    baseurl : str
        Path to the local directory or remote location that contains the
        data files.
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Examples
    --------
    To analyze all files in the repository, do something like this
    (note: this is not self-contained code)::

        >>> repos = np.lib._datasource.Repository('/home/user/data/dir/')
        >>> for filename in filelist:
        ...     fp = repos.open(filename)
        ...     fp.analyze()
        ...     fp.close()

    Similarly you could use a URL for a repository::

        >>> repos = np.lib._datasource.Repository('http://www.xyz.edu/data')

    """

    def __init__(self, baseurl, destpath=os.curdir):
        """Create a Repository with a shared url or directory of baseurl."""
        super().__init__(destpath=destpath)
        self._baseurl = baseurl

    def __del__(self):
        # Cleanup is entirely the base class's job.
        super().__del__()

    def _fullpath(self, path):
        """Return complete path for path.  Prepends baseurl if necessary."""
        pieces = path.split(self._baseurl, 2)
        # A single piece means baseurl does not occur anywhere in `path`.
        if len(pieces) != 1:
            return path  # path contains baseurl already
        return os.path.join(self._baseurl, path)

    def _findfile(self, path):
        """Extend DataSource method to prepend baseurl to ``path``."""
        resolved = self._fullpath(path)
        return super()._findfile(resolved)

    def abspath(self, path):
        """
        Return absolute path of file in the Repository directory.

        If `path` is a URL, then `abspath` will return either the location
        the file exists locally or the location it would exist when opened
        using the `open` method.

        Parameters
        ----------
        path : str or pathlib.Path
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        """
        resolved = self._fullpath(path)
        return super().abspath(resolved)

    def exists(self, path):
        """
        Test if path exists prepending Repository base URL to path.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str or pathlib.Path
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is a URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """
        resolved = self._fullpath(path)
        return super().exists(resolved)

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object prepending Repository base URL.

        If `path` is a URL, it will be downloaded, stored in the
        DataSource directory and opened from there.

        Parameters
        ----------
        path : str or pathlib.Path
            Local file path or URL to open. This may, but does not have to,
            include the `baseurl` with which the `Repository` was
            initialized.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.
        encoding : {None, str}, optional
            Open text file with given encoding. The default encoding will be
            what `open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        """
        resolved = self._fullpath(path)
        return super().open(resolved, mode,
                            encoding=encoding, newline=newline)

    def listdir(self):
        """
        List files in the source Repository.

        Returns
        -------
        files : list of str or pathlib.Path
            List of file names (not containing a directory part).

        Raises
        ------
        NotImplementedError
            If the repository's base location is a URL.

        Notes
        -----
        Does not currently work for remote repositories.

        """
        if self._isurl(self._baseurl):
            raise NotImplementedError(
                  "Directory listing of URLs, not supported yet.")
        return os.listdir(self._baseurl)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/numpy/lib/_datasource.py: 21%

175 statements