Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/numpy/lib/_datasource.py: 21%
175 statements
« prev ^ index » next coverage.py v7.4.4, created at 2024-04-09 06:12 +0000
« prev ^ index » next coverage.py v7.4.4, created at 2024-04-09 06:12 +0000
1"""A file interface for handling local and remote data files.
3The goal of datasource is to abstract some of the file system operations
4when dealing with data files so the researcher doesn't have to know all the
5low-level details. Through datasource, a researcher can obtain and use a
6file with one function call, regardless of location of the file.
8DataSource is meant to augment standard python libraries, not replace them.
9It should work seamlessly with standard file IO operations and the os
10module.
12DataSource files can originate locally or remotely:
14- local files : '/home/guido/src/local/data.txt'
15- URLs (http, ftp, ...) : 'http://www.scipy.org/not/real/data.txt'
17DataSource files can also be compressed or uncompressed. Currently only
18gzip, bz2 and xz are supported.
20Example::
22 >>> # Create a DataSource, use os.curdir (default) for local storage.
23 >>> from numpy import DataSource
24 >>> ds = DataSource()
25 >>>
26 >>> # Open a remote file.
27 >>> # DataSource downloads the file, stores it locally in:
28 >>> # './www.google.com/index.html'
29 >>> # opens the file and returns a file object.
30 >>> fp = ds.open('http://www.google.com/') # doctest: +SKIP
31 >>>
32 >>> # Use the file as you normally would
33 >>> fp.read() # doctest: +SKIP
34 >>> fp.close() # doctest: +SKIP
36"""
37import os
39from .._utils import set_module
# Keep a handle on the builtin ``open``: this module defines its own
# ``open`` function further down, and ``DataSource._cache`` needs the
# builtin to write downloaded files.
_open = open
def _check_mode(mode, encoding, newline):
    """Validate `mode` against the text-only arguments.

    Parameters
    ----------
    mode : str
        File open mode.
    encoding : str
        File encoding.
    newline : str
        Newline for text files.

    Raises
    ------
    ValueError
        If `mode` contains both "t" and "b", or if `mode` does not contain
        "t" and `encoding` or `newline` is not None.

    """
    if "t" in mode:
        # Text mode: the only thing to reject is a simultaneous "b".
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
        return

    # No explicit "t": the mode is treated as binary, so the text-only
    # arguments must be left unset.
    if encoding is not None:
        raise ValueError("Argument 'encoding' not supported in binary mode")
    if newline is not None:
        raise ValueError("Argument 'newline' not supported in binary mode")
68# Using a class instead of a module-level dictionary
69# to reduce the initial 'import numpy' overhead by
70# deferring the import of lzma, bz2 and gzip until needed
72# TODO: .zip support, .tar support?
class _FileOpeners:
    """
    Lazy registry of callables used to open (un-)compressed files.

    The registry maps a file extension to the function that opens files
    with that extension. An instance can be indexed directly with those
    extensions. Uncompressed files (key ``None``) and files compressed
    with ``gzip``, ``bz2`` or ``xz`` are supported; the compression modules
    are imported only on first use.

    Notes
    -----
    `_file_openers`, an instance of `_FileOpeners`, is made available for
    use in the `_datasource` module.

    Examples
    --------
    >>> import gzip
    >>> np.lib._datasource._file_openers.keys()
    [None, '.bz2', '.gz', '.xz', '.lzma']
    >>> np.lib._datasource._file_openers['.gz'] is gzip.open
    True

    """

    def __init__(self):
        # ``open`` is resolved when the instance is created, not when the
        # class is defined.
        self._file_openers = {None: open}
        self._loaded = False

    def _load(self):
        """Import the compression modules once and register their openers."""
        if self._loaded:
            return

        # (module name, extensions served, exceptions meaning "unavailable")
        # There are incompatible backports of lzma that do not have the
        # lzma.open attribute, so AttributeError is tolerated for lzma only.
        candidates = (
            ("bz2", (".bz2",), ImportError),
            ("gzip", (".gz",), ImportError),
            ("lzma", (".xz", ".lzma"), (ImportError, AttributeError)),
        )
        for module_name, extensions, unavailable in candidates:
            try:
                opener = __import__(module_name).open
            except unavailable:
                continue
            for extension in extensions:
                self._file_openers[extension] = opener

        self._loaded = True

    def keys(self):
        """
        Return the keys of currently supported file openers.

        Parameters
        ----------
        None

        Returns
        -------
        keys : list
            The keys are None for uncompressed files and the file extension
            strings (i.e. ``'.gz'``, ``'.xz'``) for supported compression
            methods.

        """
        self._load()
        return list(self._file_openers)

    def __getitem__(self, key):
        """Return the opener registered for extension `key`."""
        self._load()
        return self._file_openers[key]


_file_openers = _FileOpeners()
def open(path, mode='r', destpath=os.curdir, encoding=None, newline=None):
    """
    Open `path` with `mode` and return the file object.

    If ``path`` is an URL, it will be downloaded, stored in the
    `DataSource` `destpath` directory and opened from there.

    Parameters
    ----------
    path : str
        Local file path or URL to open.
    mode : str, optional
        Mode to open `path`. Mode 'r' for reading, 'w' for writing, 'a' to
        append. Available modes depend on the type of object specified by
        path. Default is 'r'.
    destpath : str, optional
        Path to the directory where the source file gets downloaded to for
        use. If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.
    encoding : {None, str}, optional
        Open text file with given encoding. The default encoding will be
        what `open` uses.
    newline : {None, str}, optional
        Newline to use when reading text file.

    Returns
    -------
    out : file object
        The opened file.

    Notes
    -----
    This is a convenience function that instantiates a `DataSource` and
    returns the file object from ``DataSource.open(path)``. The
    `DataSource` instance itself is not returned to the caller.

    """
    source = DataSource(destpath)
    opened = source.open(path, mode, encoding=encoding, newline=newline)
    return opened
@set_module('numpy.lib.npyio')
class DataSource:
    """
    DataSource(destpath='.')

    A generic data source file (file, http, ftp, ...).

    DataSources can be local files or remote files/URLs. The files may
    also be compressed or uncompressed. DataSource hides some of the
    low-level details of downloading the file, allowing you to simply pass
    in a valid file path (or URL) and obtain a file object.

    Parameters
    ----------
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use. If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Notes
    -----
    URLs require a scheme string (``http://``) to be used, without it they
    will fail::

        >>> repos = np.lib.npyio.DataSource()
        >>> repos.exists('www.google.com/index.html')
        False
        >>> repos.exists('http://www.google.com/index.html')
        True

    Temporary directories are deleted when the DataSource is deleted.

    Examples
    --------
    ::

        >>> ds = np.lib.npyio.DataSource('/home/guido')
        >>> urlname = 'http://www.google.com/'
        >>> gfile = ds.open('http://www.google.com/')
        >>> ds.abspath(urlname)
        '/home/guido/www.google.com/index.html'

        >>> ds = np.lib.npyio.DataSource(None)  # use with temporary file
        >>> ds.open('/home/guido/foobar.txt')
        <open file '/home/guido/foobar.txt', mode 'r' at 0x91d4430>
        >>> ds.abspath('/home/guido/foobar.txt')
        '/tmp/.../home/guido/foobar.txt'

    """

    def __init__(self, destpath=os.curdir):
        """Create a DataSource with a local path at destpath."""
        if destpath:
            self._destpath = os.path.abspath(destpath)
            self._istmpdest = False
        else:
            import tempfile  # deferring import to improve startup time
            self._destpath = tempfile.mkdtemp()
            self._istmpdest = True

    def __del__(self):
        # Remove temp directories. ``getattr`` with a default covers the
        # case where ``__init__`` failed before the attribute was set.
        if getattr(self, '_istmpdest', False):
            import shutil

            shutil.rmtree(self._destpath)

    def _iszip(self, filename):
        """Test if the filename is a zip file by looking at the file extension.

        """
        _fname, ext = os.path.splitext(filename)
        return ext in _file_openers.keys()

    def _iswritemode(self, mode):
        """Test if the given mode will open a file for writing."""

        # Currently only used to test the bz2 files.
        _writemodes = ("w", "+")
        return any(c in _writemodes for c in mode)

    def _splitzipext(self, filename):
        """Split zip extension from filename and return filename.

        Returns
        -------
        base, zip_ext : {tuple}
            ``zip_ext`` includes the leading dot (e.g. ``'.gz'``) and is
            None when `filename` has no recognised compression extension.

        """

        if self._iszip(filename):
            return os.path.splitext(filename)
        else:
            return filename, None

    def _possible_names(self, filename):
        """Return a list containing compressed filename variations."""
        names = [filename]
        if not self._iszip(filename):
            # Skip the ``None`` key, which stands for "uncompressed".
            names.extend(filename + zipext
                         for zipext in _file_openers.keys() if zipext)
        return names

    def _isurl(self, path):
        """Test if path is a net location.  Tests the scheme and netloc."""

        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        # BUG : URLs require a scheme string ('http://') to be used.
        #       www.google.com will fail.
        #       Should we prepend the scheme for those that don't have it and
        #       test that also?  Similar to the way we append .gz and test
        #       for compressed versions of files.

        parsed = urlparse(path)
        return bool(parsed.scheme and parsed.netloc)

    def _cache(self, path):
        """Cache the file specified by path.

        Creates a copy of the file in the datasource cache and returns the
        path of that copy.

        """
        # We import these here because importing them is slow and
        # a significant fraction of numpy's total import time.
        import shutil
        from urllib.request import urlopen

        upath = self.abspath(path)

        # Ensure the directory exists. ``exist_ok`` avoids the race between
        # checking for the directory and creating it.
        os.makedirs(os.path.dirname(upath), exist_ok=True)

        # TODO: Doesn't handle compressed files!
        if self._isurl(path):
            with urlopen(path) as openedurl:
                with _open(upath, 'wb') as f:
                    shutil.copyfileobj(openedurl, f)
        else:
            shutil.copyfile(path, upath)
        return upath

    def _findfile(self, path):
        """Searches for ``path`` and returns full path if found.

        If path is an URL, _findfile will cache a local copy and return the
        path to the cached file.  If path is a local file, _findfile will
        return a path to that local file.

        The search will include possible compressed versions of the file
        and return the first occurrence found.  Returns None when nothing
        is found.

        """

        # Build list of possible local file paths
        if not self._isurl(path):
            # Valid local paths
            filelist = self._possible_names(path)
            # Paths in self._destpath
            filelist += self._possible_names(self.abspath(path))
        else:
            # Cached URLs in self._destpath
            filelist = self._possible_names(self.abspath(path))
            # Remote URLs
            filelist = filelist + self._possible_names(path)

        for name in filelist:
            if self.exists(name):
                if self._isurl(name):
                    name = self._cache(name)
                return name
        return None

    def abspath(self, path):
        """
        Return absolute path of file in the DataSource directory.

        If `path` is an URL, then `abspath` will return either the location
        the file exists locally or the location it would exist when opened
        using the `open` method.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        Notes
        -----
        The functionality is based on `os.path.abspath`.

        """
        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        # TODO:  This should be more robust.  Handles case where path includes
        #        the destpath, but not other sub-paths. Failing case:
        #        path = /home/guido/datafile.txt
        #        destpath = /home/alex/
        #        upath = self.abspath(path)
        #        upath == '/home/alex/home/guido/datafile.txt'

        # handle case where path includes self._destpath
        # (``path`` must support ``str.split`` here)
        splitpath = path.split(self._destpath, 2)
        if len(splitpath) > 1:
            path = splitpath[1]
        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        netloc = self._sanitize_relative_path(netloc)
        upath = self._sanitize_relative_path(upath)
        return os.path.join(self._destpath, netloc, upath)

    def _sanitize_relative_path(self, path):
        """Return a sanitised relative path for which
        os.path.abspath(os.path.join(base, path)).startswith(base)
        """
        last = None
        path = os.path.normpath(path)
        # Repeat until a pass changes nothing, so that prefixes exposed by
        # an earlier strip are removed as well.
        while path != last:
            last = path
            # Note: os.path.join treats '/' as os.sep on Windows
            path = path.lstrip(os.sep).lstrip('/')
            # ``lstrip`` works on a character set, so every leading '.'
            # is removed here, not just '..' components.
            path = path.lstrip(os.pardir).lstrip('..')
            _drive, path = os.path.splitdrive(path)  # for Windows
        return path

    def exists(self, path):
        """
        Test if path exists.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is an URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """

        # First test for local path
        if os.path.exists(path):
            return True

        # We import this here because importing urllib is slow and
        # a significant fraction of numpy's total import time.
        from urllib.request import urlopen
        from urllib.error import URLError

        # Test cached url
        upath = self.abspath(path)
        if os.path.exists(upath):
            return True

        # Test remote url
        if self._isurl(path):
            try:
                # Only reachability matters; close the response right away.
                with urlopen(path):
                    pass
                return True
            except URLError:
                return False
        return False

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object.

        If `path` is an URL, it will be downloaded, stored in the
        `DataSource` directory and opened from there.

        Parameters
        ----------
        path : str
            Local file path or URL to open.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.  For ``.bz2`` files any
            '+' in the mode is dropped before opening.
        encoding : {None, str}, optional
            Open text file with given encoding. The default encoding will be
            what `open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        Raises
        ------
        ValueError
            If `path` is an URL and `mode` is a write mode.
        FileNotFoundError
            If `path` cannot be found locally, in the cache, or remotely.

        """

        # TODO: There is no support for opening a file for writing which
        #       doesn't exist yet (creating a file).  Should there be?

        # TODO: Add a ``subdir`` parameter for specifying the subdirectory
        #       used to store URLs in self._destpath.

        if self._isurl(path) and self._iswritemode(mode):
            raise ValueError("URLs are not writeable")

        # NOTE: _findfile will fail on a new file opened for writing.
        found = self._findfile(path)
        if not found:
            raise FileNotFoundError(f"{path} not found.")

        _fname, ext = self._splitzipext(found)
        # The extension returned by splitext includes the leading dot, and
        # str.replace returns a new string, so the result must be assigned.
        # bz2 does not accept update ('+') modes.
        if ext == '.bz2':
            mode = mode.replace("+", "")
        return _file_openers[ext](found, mode=mode,
                                  encoding=encoding, newline=newline)
class Repository(DataSource):
    """
    Repository(baseurl, destpath='.')

    A data repository where multiple DataSource's share a base
    URL/directory.

    `Repository` extends `DataSource` by prepending a base URL (or
    directory) to all the files it handles. Use `Repository` when you will
    be working with multiple files from one base URL.  Initialize
    `Repository` with the base URL, then refer to each file by its filename
    only.

    Parameters
    ----------
    baseurl : str
        Path to the local directory or remote location that contains the
        data files.
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use. If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Examples
    --------
    To analyze all files in the repository, do something like this
    (note: this is not self-contained code)::

        >>> repos = np.lib._datasource.Repository('/home/user/data/dir/')
        >>> for filename in filelist:
        ...     fp = repos.open(filename)
        ...     fp.analyze()
        ...     fp.close()

    Similarly you could use a URL for a repository::

        >>> repos = np.lib._datasource.Repository('http://www.xyz.edu/data')

    """

    def __init__(self, baseurl, destpath=os.curdir):
        """Create a Repository with a shared url or directory of baseurl."""
        DataSource.__init__(self, destpath=destpath)
        self._baseurl = baseurl

    def __del__(self):
        # Cleanup is entirely the base class's job.
        DataSource.__del__(self)

    def _fullpath(self, path):
        """Return complete path for path.  Prepends baseurl if necessary."""
        pieces = path.split(self._baseurl, 2)
        if len(pieces) != 1:
            # baseurl already occurs in path; use the path unchanged
            return path
        return os.path.join(self._baseurl, path)

    def _findfile(self, path):
        """Extend DataSource method to prepend baseurl to ``path``."""
        full = self._fullpath(path)
        return DataSource._findfile(self, full)

    def abspath(self, path):
        """
        Return absolute path of file in the Repository directory.

        If `path` is an URL, then `abspath` will return either the location
        the file exists locally or the location it would exist when opened
        using the `open` method.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        """
        full = self._fullpath(path)
        return DataSource.abspath(self, full)

    def exists(self, path):
        """
        Test if path exists prepending Repository base URL to path.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is an URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """
        full = self._fullpath(path)
        return DataSource.exists(self, full)

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object prepending Repository base URL.

        If `path` is an URL, it will be downloaded, stored in the
        DataSource directory and opened from there.

        Parameters
        ----------
        path : str
            Local file path or URL to open. This may, but does not have to,
            include the `baseurl` with which the `Repository` was
            initialized.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.
        encoding : {None, str}, optional
            Open text file with given encoding. The default encoding will be
            what `open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        """
        full = self._fullpath(path)
        return DataSource.open(self, full, mode,
                               encoding=encoding, newline=newline)

    def listdir(self):
        """
        List files in the source Repository.

        Returns
        -------
        files : list of str
            List of file names (not containing a directory part).

        Raises
        ------
        NotImplementedError
            If the repository's base location is an URL.

        Notes
        -----
        Does not currently work for remote repositories.

        """
        if not self._isurl(self._baseurl):
            return os.listdir(self._baseurl)
        raise NotImplementedError(
            "Directory listing of URLs, not supported yet.")