1"""A file interface for handling local and remote data files.
2
3The goal of datasource is to abstract some of the file system operations
4when dealing with data files so the researcher doesn't have to know all the
5low-level details. Through datasource, a researcher can obtain and use a
6file with one function call, regardless of location of the file.
7
8DataSource is meant to augment standard python libraries, not replace them.
9It should work seamlessly with standard file IO operations and the os
10module.
11
12DataSource files can originate locally or remotely:
13
14- local files : '/home/guido/src/local/data.txt'
15- URLs (http, ftp, ...) : 'http://www.scipy.org/not/real/data.txt'
16
17DataSource files can also be compressed or uncompressed. Currently only
18gzip, bz2 and xz are supported.
19
20Example::
21
22 >>> # Create a DataSource, use os.curdir (default) for local storage.
23 >>> from numpy import DataSource
24 >>> ds = DataSource()
25 >>>
26 >>> # Open a remote file.
27 >>> # DataSource downloads the file, stores it locally in:
28 >>> # './www.google.com/index.html'
29 >>> # opens the file and returns a file object.
30 >>> fp = ds.open('http://www.google.com/') # doctest: +SKIP
31 >>>
32 >>> # Use the file as you normally would
33 >>> fp.read() # doctest: +SKIP
34 >>> fp.close() # doctest: +SKIP
35
36"""
37import os
38
39from .._utils import set_module
40
41
# Keep a handle on the builtin ``open``: this module defines its own
# ``open`` function further down, which shadows the builtin name for any
# code that runs after the module has been imported.
_open = open
43
44
45def _check_mode(mode, encoding, newline):
46 """Check mode and that encoding and newline are compatible.
47
48 Parameters
49 ----------
50 mode : str
51 File open mode.
52 encoding : str
53 File encoding.
54 newline : str
55 Newline for text files.
56
57 """
58 if "t" in mode:
59 if "b" in mode:
60 raise ValueError("Invalid mode: %r" % (mode,))
61 else:
62 if encoding is not None:
63 raise ValueError("Argument 'encoding' not supported in binary mode")
64 if newline is not None:
65 raise ValueError("Argument 'newline' not supported in binary mode")
66
67
68# Using a class instead of a module-level dictionary
69# to reduce the initial 'import numpy' overhead by
70# deferring the import of lzma, bz2 and gzip until needed
71
72# TODO: .zip support, .tar support?
73class _FileOpeners:
74 """
75 Container for different methods to open (un-)compressed files.
76
77 `_FileOpeners` contains a dictionary that holds one method for each
78 supported file format. Attribute lookup is implemented in such a way
79 that an instance of `_FileOpeners` itself can be indexed with the keys
80 of that dictionary. Currently uncompressed files as well as files
81 compressed with ``gzip``, ``bz2`` or ``xz`` compression are supported.
82
83 Notes
84 -----
85 `_file_openers`, an instance of `_FileOpeners`, is made available for
86 use in the `_datasource` module.
87
88 Examples
89 --------
90 >>> import gzip
91 >>> np.lib._datasource._file_openers.keys()
92 [None, '.bz2', '.gz', '.xz', '.lzma']
93 >>> np.lib._datasource._file_openers['.gz'] is gzip.open
94 True
95
96 """
97
98 def __init__(self):
99 self._loaded = False
100 self._file_openers = {None: open}
101
102 def _load(self):
103 if self._loaded:
104 return
105
106 try:
107 import bz2
108 self._file_openers[".bz2"] = bz2.open
109 except ImportError:
110 pass
111
112 try:
113 import gzip
114 self._file_openers[".gz"] = gzip.open
115 except ImportError:
116 pass
117
118 try:
119 import lzma
120 self._file_openers[".xz"] = lzma.open
121 self._file_openers[".lzma"] = lzma.open
122 except (ImportError, AttributeError):
123 # There are incompatible backports of lzma that do not have the
124 # lzma.open attribute, so catch that as well as ImportError.
125 pass
126
127 self._loaded = True
128
129 def keys(self):
130 """
131 Return the keys of currently supported file openers.
132
133 Parameters
134 ----------
135 None
136
137 Returns
138 -------
139 keys : list
140 The keys are None for uncompressed files and the file extension
141 strings (i.e. ``'.gz'``, ``'.xz'``) for supported compression
142 methods.
143
144 """
145 self._load()
146 return list(self._file_openers.keys())
147
148 def __getitem__(self, key):
149 self._load()
150 return self._file_openers[key]
151
152_file_openers = _FileOpeners()
153
def open(path, mode='r', destpath=os.curdir, encoding=None, newline=None):
    """
    Open `path` with `mode` and return the file object.

    A URL is first downloaded into the `DataSource` `destpath` directory
    and the local copy is opened.

    Parameters
    ----------
    path : str or pathlib.Path
        Local file path or URL to open.
    mode : str, optional
        Mode to open `path`. Mode 'r' for reading, 'w' for writing, 'a' to
        append. Available modes depend on the type of object specified by
        path. Default is 'r'.
    destpath : str, optional
        Path to the directory where the source file gets downloaded to for
        use. If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.
    encoding : {None, str}, optional
        Open text file with given encoding. The default encoding will be
        what `open` uses.
    newline : {None, str}, optional
        Newline to use when reading text file.

    Returns
    -------
    out : file object
        The opened file.

    Notes
    -----
    Convenience wrapper: a `DataSource` is created for `destpath` and the
    result of its ``open`` method is returned unchanged.

    """
    return DataSource(destpath).open(
        path, mode, encoding=encoding, newline=newline
    )
193
194
@set_module('numpy.lib.npyio')
class DataSource:
    """
    DataSource(destpath='.')

    A generic data source file (file, http, ftp, ...).

    DataSources can be local files or remote files/URLs.  The files may
    also be compressed or uncompressed. DataSource hides some of the
    low-level details of downloading the file, allowing you to simply pass
    in a valid file path (or URL) and obtain a file object.

    Parameters
    ----------
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Notes
    -----
    URLs require a scheme string (``http://``) to be used, without it they
    will fail::

        >>> repos = np.lib.npyio.DataSource()
        >>> repos.exists('www.google.com/index.html')
        False
        >>> repos.exists('http://www.google.com/index.html')
        True

    Temporary directories are deleted when the DataSource is deleted.

    Examples
    --------
    ::

        >>> ds = np.lib.npyio.DataSource('/home/guido')
        >>> urlname = 'http://www.google.com/'
        >>> gfile = ds.open('http://www.google.com/')
        >>> ds.abspath(urlname)
        '/home/guido/www.google.com/index.html'

        >>> ds = np.lib.npyio.DataSource(None)  # use with temporary file
        >>> ds.open('/home/guido/foobar.txt')
        <open file '/home/guido/foobar.txt', mode 'r' at 0x91d4430>
        >>> ds.abspath('/home/guido/foobar.txt')
        '/tmp/.../home/guido/foobar.txt'

    """

    def __init__(self, destpath=os.curdir):
        """Create a DataSource with a local path at destpath."""
        if destpath:
            self._destpath = os.path.abspath(destpath)
            self._istmpdest = False
        else:
            import tempfile  # deferring import to improve startup time
            self._destpath = tempfile.mkdtemp()
            self._istmpdest = True

    def __del__(self):
        # Remove temp directories.  ``getattr`` covers instances whose
        # ``__init__`` never got as far as setting ``_istmpdest``.
        if getattr(self, '_istmpdest', False):
            import shutil

            # Best-effort cleanup: a finalizer has nobody to report to, so
            # a directory that is already gone must not raise here.
            shutil.rmtree(self._destpath, ignore_errors=True)

    @staticmethod
    def _as_fs_path(path):
        """Convert ``os.PathLike`` objects with `os.fspath`.

        Anything that is not ``os.PathLike`` (in particular ``str``) is
        returned unchanged.  The string operations used throughout this
        class (``split``, ``urlparse``, ``+``) do not accept
        `pathlib.Path` objects directly.
        """
        if isinstance(path, os.PathLike):
            return os.fspath(path)
        return path

    def _iszip(self, filename):
        """Test if the filename is a zip file by looking at the file extension.

        """
        ext = os.path.splitext(filename)[1]
        return ext in _file_openers.keys()

    def _iswritemode(self, mode):
        """Test if the given mode will open a file for writing."""

        # Used by `open` to reject write access to URLs.
        # NOTE(review): 'a' and 'x' are not treated as write modes here --
        # confirm whether that is intended.
        _writemodes = ("w", "+")
        return any(c in _writemodes for c in mode)

    def _splitzipext(self, filename):
        """Split zip extension from filename and return filename.

        Returns
        -------
        base, zip_ext : {tuple}
            ``zip_ext`` includes the leading dot (e.g. ``'.gz'``), or is
            None when `filename` has no supported compression extension.

        """

        if self._iszip(filename):
            return os.path.splitext(filename)
        else:
            return filename, None

    def _possible_names(self, filename):
        """Return a list containing compressed filename variations.

        The first entry is always `filename` itself.  If it does not
        already carry a supported compression extension, one entry per
        supported extension is appended.
        """
        names = [filename]
        if not self._iszip(filename):
            # The None key (uncompressed) is skipped by the truth test.
            names.extend(filename + zipext
                         for zipext in _file_openers.keys() if zipext)
        return names

    def _isurl(self, path):
        """Test if path is a net location.  Tests the scheme and netloc."""

        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        # BUG : URLs require a scheme string ('http://') to be used.
        #       www.google.com will fail.
        #       Should we prepend the scheme for those that don't have it and
        #       test that also?  Similar to the way we append .gz and test
        #       for compressed versions of files.

        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        return bool(scheme and netloc)

    def _cache(self, path):
        """Cache the file specified by path.

        Creates a copy of the file in the datasource cache and returns the
        path of that copy.

        """
        # We import these here because importing them is slow and
        # a significant fraction of numpy's total import time.
        import shutil
        from urllib.request import urlopen

        upath = self.abspath(path)

        # Ensure the directory exists; ``exist_ok`` avoids the window
        # between a separate existence check and the creation.
        os.makedirs(os.path.dirname(upath), exist_ok=True)

        # TODO: Doesn't handle compressed files!
        if self._isurl(path):
            with urlopen(path) as openedurl:
                with _open(upath, 'wb') as f:
                    shutil.copyfileobj(openedurl, f)
        else:
            shutil.copyfile(path, upath)
        return upath

    def _findfile(self, path):
        """Searches for ``path`` and returns full path if found.

        If path is an URL, _findfile will cache a local copy and return the
        path to the cached file.  If path is a local file, _findfile will
        return a path to that local file.

        The search will include possible compressed versions of the file
        and return the first occurrence found.  Returns None when nothing
        is found.

        """

        # Build list of possible local file paths
        if not self._isurl(path):
            # Valid local paths
            filelist = self._possible_names(path)
            # Paths in self._destpath
            filelist += self._possible_names(self.abspath(path))
        else:
            # Cached URLs in self._destpath
            filelist = self._possible_names(self.abspath(path))
            # Remote URLs
            filelist = filelist + self._possible_names(path)

        for name in filelist:
            if self.exists(name):
                if self._isurl(name):
                    name = self._cache(name)
                return name
        return None

    def abspath(self, path):
        """
        Return absolute path of file in the DataSource directory.

        If `path` is an URL, then `abspath` will return either the location
        the file exists locally or the location it would exist when opened
        using the `open` method.

        Parameters
        ----------
        path : str or pathlib.Path
            Can be a local file or a remote URL.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        Notes
        -----
        The functionality is based on `os.path.abspath`.

        """
        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        path = self._as_fs_path(path)

        # TODO:  This should be more robust.  Handles case where path includes
        #        the destpath, but not other sub-paths. Failing case:
        #        path = /home/guido/datafile.txt
        #        destpath = /home/alex/
        #        upath = self.abspath(path)
        #        upath == '/home/alex/home/guido/datafile.txt'

        # handle case where path includes self._destpath
        splitpath = path.split(self._destpath, 2)
        if len(splitpath) > 1:
            path = splitpath[1]
        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        netloc = self._sanitize_relative_path(netloc)
        upath = self._sanitize_relative_path(upath)
        return os.path.join(self._destpath, netloc, upath)

    def _sanitize_relative_path(self, path):
        """Return a sanitised relative path for which
        os.path.abspath(os.path.join(base, path)).startswith(base)
        """
        last = None
        path = os.path.normpath(path)
        # Repeat until a full pass changes nothing.
        while path != last:
            last = path
            # Note: os.path.join treats '/' as os.sep on Windows
            path = path.lstrip(os.sep).lstrip('/')
            path = path.lstrip(os.pardir).removeprefix('..')
            drive, path = os.path.splitdrive(path)  # for Windows
        return path

    def exists(self, path):
        """
        Test if path exists.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str or pathlib.Path
            Can be a local file or a remote URL.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is an URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """
        path = self._as_fs_path(path)

        # First test for local path
        if os.path.exists(path):
            return True

        # We import this here because importing urllib is slow and
        # a significant fraction of numpy's total import time.
        from urllib.request import urlopen
        from urllib.error import URLError

        # Test cached url
        upath = self.abspath(path)
        if os.path.exists(upath):
            return True

        # Test remote url
        if self._isurl(path):
            try:
                # Opening is the whole test; the handle is closed on exit.
                with urlopen(path):
                    return True
            except URLError:
                return False
        return False

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object.

        If `path` is an URL, it will be downloaded, stored in the
        `DataSource` directory and opened from there.

        Parameters
        ----------
        path : str or pathlib.Path
            Local file path or URL to open.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.  For ``.bz2`` files any
            ``'+'`` is removed from `mode` before opening.
        encoding : {None, str}, optional
            Open text file with given encoding. The default encoding will be
            what `open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        Raises
        ------
        ValueError
            If `path` is an URL and `mode` is a write mode.
        FileNotFoundError
            If `path` cannot be found.

        """

        # TODO: There is no support for opening a file for writing which
        #       doesn't exist yet (creating a file).  Should there be?

        # TODO: Add a ``subdir`` parameter for specifying the subdirectory
        #       used to store URLs in self._destpath.

        path = self._as_fs_path(path)

        if self._isurl(path) and self._iswritemode(mode):
            raise ValueError("URLs are not writeable")

        # NOTE: _findfile will fail on a new file opened for writing.
        found = self._findfile(path)
        if not found:
            raise FileNotFoundError(f"{path} not found.")

        _fname, ext = self._splitzipext(found)
        if ext == '.bz2':
            # ``_splitzipext`` returns the extension with its leading dot,
            # and ``str.replace`` returns a new string, so the result has
            # to be rebound for the '+' to actually be dropped.
            mode = mode.replace("+", "")
        return _file_openers[ext](found, mode=mode,
                                  encoding=encoding, newline=newline)
530
531
class Repository(DataSource):
    """
    Repository(baseurl, destpath='.')

    A data repository where multiple DataSource's share a base
    URL/directory.

    `Repository` extends `DataSource` by prepending a base URL (or
    directory) to all the files it handles. Use `Repository` when you will
    be working with multiple files from one base URL.  Initialize
    `Repository` with the base URL, then refer to each file by its filename
    only.

    Parameters
    ----------
    baseurl : str
        Path to the local directory or remote location that contains the
        data files.
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Examples
    --------
    To analyze all files in the repository, do something like this
    (note: this is not self-contained code)::

        >>> repos = np.lib._datasource.Repository('/home/user/data/dir/')
        >>> for filename in filelist:
        ...     fp = repos.open(filename)
        ...     fp.analyze()
        ...     fp.close()

    Similarly you could use a URL for a repository::

        >>> repos = np.lib._datasource.Repository('http://www.xyz.edu/data')

    """

    def __init__(self, baseurl, destpath=os.curdir):
        """Create a Repository with a shared url or directory of baseurl."""
        super().__init__(destpath=destpath)
        self._baseurl = baseurl

    def __del__(self):
        super().__del__()

    def _fullpath(self, path):
        """Return complete path for path.  Prepends baseurl if necessary.

        ``os.PathLike`` objects are converted with `os.fspath` first,
        because the containment test below relies on ``str.split``, which
        `pathlib.Path` does not provide.
        """
        if isinstance(path, os.PathLike):
            path = os.fspath(path)

        # More than one piece means baseurl already occurs in path.
        if len(path.split(self._baseurl, 2)) > 1:
            return path
        return os.path.join(self._baseurl, path)

    def _findfile(self, path):
        """Extend DataSource method to prepend baseurl to ``path``."""
        return super()._findfile(self._fullpath(path))

    def abspath(self, path):
        """
        Return absolute path of file in the Repository directory.

        If `path` is an URL, then `abspath` will return either the location
        the file exists locally or the location it would exist when opened
        using the `open` method.

        Parameters
        ----------
        path : str or pathlib.Path
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        """
        return super().abspath(self._fullpath(path))

    def exists(self, path):
        """
        Test if path exists prepending Repository base URL to path.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str or pathlib.Path
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is an URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """
        return super().exists(self._fullpath(path))

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object prepending Repository base URL.

        If `path` is an URL, it will be downloaded, stored in the
        DataSource directory and opened from there.

        Parameters
        ----------
        path : str or pathlib.Path
            Local file path or URL to open. This may, but does not have to,
            include the `baseurl` with which the `Repository` was
            initialized.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.
        encoding : {None, str}, optional
            Open text file with given encoding. The default encoding will be
            what `open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        """
        return super().open(self._fullpath(path), mode,
                            encoding=encoding, newline=newline)

    def listdir(self):
        """
        List files in the source Repository.

        Returns
        -------
        files : list of str
            List of file names (not containing a directory part), as
            returned by `os.listdir` for the base directory.

        Raises
        ------
        NotImplementedError
            If the repository's `baseurl` is a URL.

        Notes
        -----
        Does not currently work for remote repositories.

        """
        if self._isurl(self._baseurl):
            raise NotImplementedError(
                  "Directory listing of URLs, not supported yet.")
        return os.listdir(self._baseurl)