Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.10/site-packages/numpy/lib/

1"""A file interface for handling local and remote data files.

3The goal of datasource is to abstract some of the file system operations

4when dealing with data files so the researcher doesn't have to know all the

5low-level details. Through datasource, a researcher can obtain and use a

6file with one function call, regardless of location of the file.

8DataSource is meant to augment standard python libraries, not replace them.

9It should work seamlessly with standard file IO operations and the os

10module.

12DataSource files can originate locally or remotely:

14- local files : '/home/guido/src/local/data.txt'

15- URLs (http, ftp, ...) : 'http://www.scipy.org/not/real/data.txt'

17DataSource files can also be compressed or uncompressed. Currently only

18gzip, bz2 and xz are supported.

20Example::

22 >>> # Create a DataSource, use os.curdir (default) for local storage.

23 >>> from numpy import DataSource

24 >>> ds = DataSource()

25 >>>

26 >>> # Open a remote file.

27 >>> # DataSource downloads the file, stores it locally in:

28 >>> # './www.google.com/index.html'

29 >>> # opens the file and returns a file object.

30 >>> fp = ds.open('http://www.google.com/') # doctest: +SKIP

31 >>>

32 >>> # Use the file as you normally would

33 >>> fp.read() # doctest: +SKIP

34 >>> fp.close() # doctest: +SKIP

36"""

37import os

39from .._utils import set_module

42_open = open

def _check_mode(mode, encoding, newline):
    """Validate that `encoding` and `newline` are compatible with `mode`.

    Parameters
    ----------
    mode : str
        File open mode.
    encoding : str
        File encoding.
    newline : str
        Newline for text files.

    Raises
    ------
    ValueError
        If `mode` contains both "t" and "b", or if `mode` does not contain
        "t" and `encoding` or `newline` is not None.

    """
    if "t" in mode:
        # Explicit text mode: the only invalid combination is adding "b".
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
        return

    # No explicit "t": the mode is treated as binary, where text-only
    # arguments are rejected (encoding is checked before newline).
    if encoding is not None:
        raise ValueError("Argument 'encoding' not supported in binary mode")
    if newline is not None:
        raise ValueError("Argument 'newline' not supported in binary mode")

68# Using a class instead of a module-level dictionary

69# to reduce the initial 'import numpy' overhead by

70# deferring the import of lzma, bz2 and gzip until needed

72# TODO: .zip support, .tar support?

class _FileOpeners:
    """
    Container for different methods to open (un-)compressed files.

    `_FileOpeners` contains a dictionary that holds one method for each
    supported file format. Item lookup (``__getitem__``) is implemented in
    such a way that an instance of `_FileOpeners` itself can be indexed with
    the keys of that dictionary. Currently uncompressed files as well as
    files compressed with ``gzip``, ``bz2`` or ``xz`` compression are
    supported.

    Notes
    -----
    `_file_openers`, an instance of `_FileOpeners`, is made available for
    use in the `_datasource` module.

    The compression modules are imported lazily, on the first call to
    `keys` or the first item lookup, and only once per instance.

    Examples
    --------
    >>> import gzip
    >>> np.lib._datasource._file_openers.keys()
    [None, '.bz2', '.gz', '.xz', '.lzma']
    >>> np.lib._datasource._file_openers['.gz'] is gzip.open
    True

    """

    def __init__(self):
        # Bind the builtin explicitly.  A bare ``open`` is looked up in the
        # module globals when this runs, and this module defines its own
        # ``open`` function, so an instance created after import would
        # otherwise register that wrapper as the plain-file opener.
        import builtins

        self._loaded = False
        # ``None`` is the key for files without a compression extension.
        self._file_openers = {None: builtins.open}

    def _load(self):
        """Register the openers of the available compression modules.

        Does nothing after the first call. A module that cannot be imported
        is skipped, leaving its extension(s) unregistered.
        """
        if self._loaded:
            return

        try:
            import bz2
            self._file_openers[".bz2"] = bz2.open
        except ImportError:
            pass

        try:
            import gzip
            self._file_openers[".gz"] = gzip.open
        except ImportError:
            pass

        try:
            import lzma
            self._file_openers[".xz"] = lzma.open
            self._file_openers[".lzma"] = lzma.open
        except (ImportError, AttributeError):
            # There are incompatible backports of lzma that do not have the
            # lzma.open attribute, so catch that as well as ImportError.
            pass

        self._loaded = True

    def keys(self):
        """
        Return the keys of currently supported file openers.

        Parameters
        ----------
        None

        Returns
        -------
        keys : list
            The keys are None for uncompressed files and the file extension
            strings (i.e. ``'.gz'``, ``'.xz'``) for supported compression
            methods.

        """
        self._load()
        return list(self._file_openers.keys())

    def __getitem__(self, key):
        """Return the opener registered for `key`.

        Raises ``KeyError`` if no opener is registered for `key`.
        """
        self._load()
        return self._file_openers[key]


# Module-wide registry of openers, indexed by file extension.
_file_openers = _FileOpeners()

153

def open(path, mode='r', destpath=os.curdir, encoding=None, newline=None):
    """
    Open `path` with `mode` and return the resulting file object.

    When ``path`` is a URL, it is downloaded, stored in the `DataSource`
    `destpath` directory and opened from there.

    Parameters
    ----------
    path : str or pathlib.Path
        Local file path or URL to open.
    mode : str, optional
        Mode to open `path`. Mode 'r' for reading, 'w' for writing, 'a' to
        append. Available modes depend on the type of object specified by
        path. Default is 'r'.
    destpath : str, optional
        Path to the directory where the source file gets downloaded to for
        use. If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.
    encoding : {None, str}, optional
        Open text file with given encoding. The default encoding will be
        what `open` uses.
    newline : {None, str}, optional
        Newline to use when reading text file.

    Returns
    -------
    out : file object
        The opened file.

    Notes
    -----
    This is a convenience function: it builds a throw-away `DataSource`
    for `destpath` and returns whatever ``DataSource.open`` returns for
    `path`.

    """
    source = DataSource(destpath)
    fileobj = source.open(path, mode, encoding=encoding, newline=newline)
    return fileobj

193

194

@set_module('numpy.lib.npyio')
class DataSource:
    """
    DataSource(destpath='.')

    A generic data source file (file, http, ftp, ...).

    DataSources can be local files or remote files/URLs. The files may
    also be compressed or uncompressed. DataSource hides some of the
    low-level details of downloading the file, allowing you to simply pass
    in a valid file path (or URL) and obtain a file object.

    Parameters
    ----------
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use. If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Notes
    -----
    URLs require a scheme string (``http://``) to be used, without it they
    will fail::

        >>> repos = np.lib.npyio.DataSource()
        >>> repos.exists('www.google.com/index.html')
        False
        >>> repos.exists('http://www.google.com/index.html')
        True

    Temporary directories are deleted when the DataSource is deleted.

    Examples
    --------
    ::

        >>> ds = np.lib.npyio.DataSource('/home/guido')
        >>> urlname = 'http://www.google.com/'
        >>> gfile = ds.open('http://www.google.com/')
        >>> ds.abspath(urlname)
        '/home/guido/www.google.com/index.html'

        >>> ds = np.lib.npyio.DataSource(None)  # use with temporary file
        >>> ds.open('/home/guido/foobar.txt')
        <open file '/home/guido/foobar.txt', mode 'r' at 0x91d4430>
        >>> ds.abspath('/home/guido/foobar.txt')
        '/tmp/.../home/guido/foobar.txt'

    """

    def __init__(self, destpath=os.curdir):
        """Create a DataSource with a local path at destpath."""
        if destpath:
            self._destpath = os.path.abspath(destpath)
            self._istmpdest = False
        else:
            # Any falsy destpath (None, '') selects a temporary directory.
            import tempfile  # deferring import to improve startup time
            self._destpath = tempfile.mkdtemp()
            self._istmpdest = True

    def __del__(self):
        # Remove temp directories.  The attribute may be missing if
        # __init__ raised before setting it, hence the getattr default.
        if getattr(self, '_istmpdest', False):
            import shutil

            shutil.rmtree(self._destpath)

    def _iszip(self, filename):
        """Test if the filename is a zip file by looking at the file extension.

        """
        ext = os.path.splitext(filename)[1]
        return ext in _file_openers.keys()

    def _iswritemode(self, mode):
        """Test if the given mode will open a file for writing."""

        # Only "w" and "+" are treated as write modes here.
        _writemodes = ("w", "+")
        return any(c in _writemodes for c in mode)

    def _splitzipext(self, filename):
        """Split zip extension from filename and return filename.

        Returns
        -------
        base, zip_ext : {tuple}
            `zip_ext` includes the leading dot (e.g. ``'.gz'``) and is None
            when `filename` has no recognised compression extension.

        """

        if self._iszip(filename):
            return os.path.splitext(filename)
        else:
            return filename, None

    def _possible_names(self, filename):
        """Return a list containing compressed filename variations.

        The list always starts with `filename` itself. Unless `filename`
        already carries a recognised compression extension, it is followed
        by `filename` with each such extension appended.
        """
        names = [filename]
        if not self._iszip(filename):
            for zipext in _file_openers.keys():
                # Skip the None key used for uncompressed files.
                if zipext:
                    names.append(filename + zipext)
        return names

    def _isurl(self, path):
        """Test if path is a net location.  Tests the scheme and netloc."""

        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        # BUG : URLs require a scheme string ('http://') to be used.
        #       www.google.com will fail.
        #       Should we prepend the scheme for those that don't have it and
        #       test that also?  Similar to the way we append .gz and test
        #       for compressed versions of files.

        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        return bool(scheme and netloc)

    def _cache(self, path):
        """Cache the file specified by path.

        Creates a copy of the file in the datasource cache and returns the
        path of that copy.

        """
        # We import these here because importing them is slow and
        # a significant fraction of numpy's total import time.
        import shutil
        from urllib.request import urlopen

        upath = self.abspath(path)

        # ensure directory exists
        if not os.path.exists(os.path.dirname(upath)):
            os.makedirs(os.path.dirname(upath))

        # TODO: Doesn't handle compressed files!
        if self._isurl(path):
            with urlopen(path) as openedurl:
                # `_open` is the builtin; this module shadows `open`.
                with _open(upath, 'wb') as f:
                    shutil.copyfileobj(openedurl, f)
        else:
            shutil.copyfile(path, upath)
        return upath

    def _findfile(self, path):
        """Searches for ``path`` and returns full path if found.

        If path is an URL, _findfile will cache a local copy and return the
        path to the cached file.  If path is a local file, _findfile will
        return a path to that local file.

        The search will include possible compressed versions of the file
        and return the first occurrence found.  Returns None if nothing is
        found.

        """

        # Build list of possible local file paths
        if not self._isurl(path):
            # Valid local paths
            filelist = self._possible_names(path)
            # Paths in self._destpath
            filelist += self._possible_names(self.abspath(path))
        else:
            # Cached URLs in self._destpath
            filelist = self._possible_names(self.abspath(path))
            # Remote URLs
            filelist = filelist + self._possible_names(path)

        for name in filelist:
            if self.exists(name):
                if self._isurl(name):
                    name = self._cache(name)
                return name
        return None

    def abspath(self, path):
        """
        Return absolute path of file in the DataSource directory.

        If `path` is an URL, then `abspath` will return either the location
        the file exists locally or the location it would exist when opened
        using the `open` method.

        Parameters
        ----------
        path : str or pathlib.Path
            Can be a local file or a remote URL.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        Notes
        -----
        The functionality is based on `os.path.abspath`.

        """
        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        # TODO:  This should be more robust.  Handles case where path includes
        #        the destpath, but not other sub-paths. Failing case:
        #        path = /home/guido/datafile.txt
        #        destpath = /home/alex/
        #        upath = self.abspath(path)
        #        upath == '/home/alex/home/guido/datafile.txt'

        # handle case where path includes self._destpath
        splitpath = path.split(self._destpath, 2)
        if len(splitpath) > 1:
            path = splitpath[1]
        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        netloc = self._sanitize_relative_path(netloc)
        upath = self._sanitize_relative_path(upath)
        return os.path.join(self._destpath, netloc, upath)

    def _sanitize_relative_path(self, path):
        """Return a sanitised relative path for which
        os.path.abspath(os.path.join(base, path)).startswith(base)
        """
        last = None
        path = os.path.normpath(path)
        # Repeat until a pass leaves the path unchanged.
        while path != last:
            last = path
            # Note: os.path.join treats '/' as os.sep on Windows
            path = path.lstrip(os.sep).lstrip('/')
            path = path.lstrip(os.pardir).removeprefix('..')
            drive, path = os.path.splitdrive(path)  # for Windows
        return path

    def exists(self, path):
        """
        Test if path exists.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str or pathlib.Path
            Can be a local file or a remote URL.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is an URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """

        # First test for local path
        if os.path.exists(path):
            return True

        # We import this here because importing urllib is slow and
        # a significant fraction of numpy's total import time.
        from urllib.request import urlopen
        from urllib.error import URLError

        # Test cached url
        upath = self.abspath(path)
        if os.path.exists(upath):
            return True

        # Test remote url
        if self._isurl(path):
            try:
                # Opening is the whole test; the context manager closes
                # the connection again straight away.
                with urlopen(path):
                    return True
            except URLError:
                return False
        return False

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object.

        If `path` is an URL, it will be downloaded, stored in the
        `DataSource` directory and opened from there.

        Parameters
        ----------
        path : str or pathlib.Path
            Local file path or URL to open.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.
        encoding : {None, str}, optional
            Open text file with given encoding. The default encoding will be
            what `open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        Raises
        ------
        ValueError
            If `path` is a URL and `mode` is a write mode.
        FileNotFoundError
            If `path` (or a compressed variant of it) cannot be found.

        """

        # TODO: There is no support for opening a file for writing which
        #       doesn't exist yet (creating a file).  Should there be?

        # TODO: Add a ``subdir`` parameter for specifying the subdirectory
        #       used to store URLs in self._destpath.

        if self._isurl(path) and self._iswritemode(mode):
            raise ValueError("URLs are not writeable")

        # NOTE: _findfile will fail on a new file opened for writing.
        found = self._findfile(path)
        if not found:
            raise FileNotFoundError(f"{path} not found.")

        _fname, ext = self._splitzipext(found)
        if ext == '.bz2':
            # _splitzipext returns the extension with its leading dot, and
            # str.replace returns a new string, so the result must be
            # rebound.  bz2.open does not accept "+" modes.
            mode = mode.replace("+", "")
        return _file_openers[ext](found, mode=mode,
                                  encoding=encoding, newline=newline)

530

531

class Repository (DataSource):
    """
    Repository(baseurl, destpath='.')

    A `DataSource` whose files all live under one common base URL or
    directory.

    `Repository` builds on `DataSource` by prepending a base URL (or
    directory) to every path it is given. It is meant for working with
    several files from the same location: create the `Repository` once
    with the base URL, then address each file by its filename alone.

    Parameters
    ----------
    baseurl : str
        Path to the local directory or remote location that contains the
        data files.
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use. If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Examples
    --------
    To analyze all files in the repository, do something like this
    (note: this is not self-contained code)::

        >>> repos = np.lib._datasource.Repository('/home/user/data/dir/')
        >>> for filename in filelist:
        ...     fp = repos.open(filename)
        ...     fp.analyze()
        ...     fp.close()

    Similarly you could use a URL for a repository::

        >>> repos = np.lib._datasource.Repository('http://www.xyz.edu/data')

    """

    def __init__(self, baseurl, destpath=os.curdir):
        """Create a Repository with a shared url or directory of baseurl."""
        DataSource.__init__(self, destpath=destpath)
        self._baseurl = baseurl

    def __del__(self):
        # Clean-up is entirely the base class's job.
        DataSource.__del__(self)

    def _fullpath(self, path):
        """Return complete path for path.  Prepends baseurl if necessary."""
        pieces = path.split(self._baseurl, 2)
        if len(pieces) > 1:
            # baseurl already occurs in path: use it unchanged.
            return path
        return os.path.join(self._baseurl, path)

    def _findfile(self, path):
        """Extend DataSource method to prepend baseurl to ``path``."""
        full = self._fullpath(path)
        return DataSource._findfile(self, full)

    def abspath(self, path):
        """
        Return absolute path of file in the Repository directory.

        For a URL, the result is the location where the file is stored
        locally, or where it would be stored once opened with the `open`
        method.

        Parameters
        ----------
        path : str or pathlib.Path
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        """
        full = self._fullpath(path)
        return DataSource.abspath(self, full)

    def exists(self, path):
        """
        Test if path exists prepending Repository base URL to path.

        The checks are made in this order; `path` counts as existing if it
        is:

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str or pathlib.Path
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        For a URL, `exists` returns True both when the file is stored
        locally in the `DataSource` directory and when it is a valid
        remote URL. No distinction is made between the two cases.

        """
        full = self._fullpath(path)
        return DataSource.exists(self, full)

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object prepending Repository base URL.

        A URL is downloaded, stored in the DataSource directory and opened
        from there.

        Parameters
        ----------
        path : str or pathlib.Path
            Local file path or URL to open. This may, but does not have to,
            include the `baseurl` with which the `Repository` was
            initialized.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.
        encoding : {None, str}, optional
            Open text file with given encoding. The default encoding will be
            what `open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        """
        full = self._fullpath(path)
        return DataSource.open(self, full, mode,
                               encoding=encoding, newline=newline)

    def listdir(self):
        """
        List files in the source Repository.

        Returns
        -------
        files : list of str or pathlib.Path
            List of file names (not containing a directory part).

        Raises
        ------
        NotImplementedError
            If the repository's base location is a URL.

        Notes
        -----
        Does not currently work for remote repositories.

        """
        base = self._baseurl
        if not self._isurl(base):
            return os.listdir(base)
        raise NotImplementedError(
              "Directory listing of URLs, not supported yet.")

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.10/site-packages/numpy/lib/_datasource.py: 22%

172 statements