1"""A file interface for handling local and remote data files.
2
3The goal of datasource is to abstract some of the file system operations
4when dealing with data files so the researcher doesn't have to know all the
5low-level details. Through datasource, a researcher can obtain and use a
6file with one function call, regardless of location of the file.
7
8DataSource is meant to augment standard python libraries, not replace them.
9It should work seamlessly with standard file IO operations and the os
10module.
11
12DataSource files can originate locally or remotely:
13
14- local files : '/home/guido/src/local/data.txt'
15- URLs (http, ftp, ...) : 'http://www.scipy.org/not/real/data.txt'
16
17DataSource files can also be compressed or uncompressed. Currently only
18gzip, bz2 and xz are supported.
19
20Example::
21
22 >>> # Create a DataSource, use os.curdir (default) for local storage.
23 >>> from numpy import DataSource
24 >>> ds = DataSource()
25 >>>
26 >>> # Open a remote file.
27 >>> # DataSource downloads the file, stores it locally in:
28 >>> # './www.google.com/index.html'
29 >>> # opens the file and returns a file object.
30 >>> fp = ds.open('http://www.google.com/') # doctest: +SKIP
31 >>>
32 >>> # Use the file as you normally would
33 >>> fp.read() # doctest: +SKIP
34 >>> fp.close() # doctest: +SKIP
35
36"""
37import os
38import io
39
40from numpy.core.overrides import set_module
41
42
# Keep a reference to the builtin ``open``: this module defines its own
# module-level ``open`` function further down, which shadows the builtin
# name for all code in this file (``DataSource._cache`` uses ``_open``).
_open = open
44
45
def _check_mode(mode, encoding, newline):
    """Validate `mode` against the text-only arguments.

    A mode containing ``'t'`` is accepted unless it also contains ``'b'``.
    A mode without ``'t'`` is handled as binary, in which case neither
    `encoding` nor `newline` may be given.

    Parameters
    ----------
    mode : str
        File open mode.
    encoding : str
        File encoding.
    newline : str
        Newline for text files.

    Raises
    ------
    ValueError
        If `mode` contains both ``'t'`` and ``'b'``, or if `mode` has no
        ``'t'`` and `encoding` or `newline` is not None.

    """
    if "t" in mode:
        # Explicit text mode: the only possible conflict is a 'b' flag.
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
        return

    # No 't' flag: text-only arguments are rejected, encoding first.
    text_only_args = (
        (encoding, "Argument 'encoding' not supported in binary mode"),
        (newline, "Argument 'newline' not supported in binary mode"),
    )
    for value, message in text_only_args:
        if value is not None:
            raise ValueError(message)
67
68
69# Using a class instead of a module-level dictionary
70# to reduce the initial 'import numpy' overhead by
71# deferring the import of lzma, bz2 and gzip until needed
72
73# TODO: .zip support, .tar support?
class _FileOpeners:
    """
    Lazy registry mapping file extensions to "open" callables.

    The registry starts out with a single entry, ``None -> io.open``, used
    for uncompressed files. The compression modules (``bz2``, ``gzip`` and
    ``lzma``) are imported only on first access, and every module that
    imports successfully contributes its ``open`` function under the
    matching extension(s). Instances can be indexed like a dictionary.

    Notes
    -----
    `_file_openers`, an instance of `_FileOpeners`, is made available for
    use in the `_datasource` module.

    Examples
    --------
    >>> import gzip
    >>> np.lib._datasource._file_openers.keys()
    [None, '.bz2', '.gz', '.xz', '.lzma']
    >>> np.lib._datasource._file_openers['.gz'] is gzip.open
    True

    """

    def __init__(self):
        # Only the plain opener is registered eagerly; the rest is deferred
        # to `_load` so that importing this module stays cheap.
        self._file_openers = {None: io.open}
        self._loaded = False

    def _load(self):
        """Import the compression modules once and register their openers."""
        if self._loaded:
            return

        openers = self._file_openers

        try:
            import bz2
        except ImportError:
            pass
        else:
            openers[".bz2"] = bz2.open

        try:
            import gzip
        except ImportError:
            pass
        else:
            openers[".gz"] = gzip.open

        try:
            import lzma
            # There are incompatible backports of lzma that do not have the
            # lzma.open attribute, so the lookup stays inside the ``try``.
            lzma_open = lzma.open
        except (ImportError, AttributeError):
            pass
        else:
            openers[".xz"] = lzma_open
            openers[".lzma"] = lzma_open

        self._loaded = True

    def keys(self):
        """
        Return the keys of currently supported file openers.

        Returns
        -------
        keys : list
            ``None`` (for uncompressed files) followed by the extension
            strings (e.g. ``'.gz'``, ``'.xz'``) of every compression
            method whose module could be imported.

        """
        self._load()
        return list(self._file_openers)

    def __getitem__(self, key):
        """Return the opener registered for `key`, loading lazily first."""
        self._load()
        return self._file_openers[key]


_file_openers = _FileOpeners()
154
def open(path, mode='r', destpath=os.curdir, encoding=None, newline=None):
    """
    Open `path` with `mode` and return the file object.

    Convenience wrapper: a `DataSource` rooted at `destpath` is created and
    its ``open`` method is called with the remaining arguments. URLs are
    downloaded into the `DataSource` directory and opened from there.

    Parameters
    ----------
    path : str
        Local file path or URL to open.
    mode : str, optional
        Mode to open `path`. Mode 'r' for reading, 'w' for writing, 'a' to
        append. Available modes depend on the type of object specified by
        path. Default is 'r'.
    destpath : str, optional
        Path to the directory where the source file gets downloaded to for
        use. If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.
    encoding : {None, str}, optional
        Open text file with given encoding. The default encoding will be
        what `io.open` uses.
    newline : {None, str}, optional
        Newline to use when reading text file.

    Returns
    -------
    out : file object
        The opened file.

    """
    return DataSource(destpath).open(
        path, mode, encoding=encoding, newline=newline)
194
195
@set_module('numpy')
class DataSource:
    """
    DataSource(destpath='.')

    A generic data source file (file, http, ftp, ...).

    DataSources can be local files or remote files/URLs. The files may
    also be compressed or uncompressed. DataSource hides some of the
    low-level details of downloading the file, allowing you to simply pass
    in a valid file path (or URL) and obtain a file object.

    Parameters
    ----------
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use. If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Notes
    -----
    URLs require a scheme string (``http://``) to be used, without it they
    will fail::

        >>> repos = np.DataSource()
        >>> repos.exists('www.google.com/index.html')
        False
        >>> repos.exists('http://www.google.com/index.html')
        True

    Temporary directories are deleted when the DataSource is deleted.

    Examples
    --------
    ::

        >>> ds = np.DataSource('/home/guido')
        >>> urlname = 'http://www.google.com/'
        >>> gfile = ds.open('http://www.google.com/')
        >>> ds.abspath(urlname)
        '/home/guido/www.google.com/index.html'

        >>> ds = np.DataSource(None)  # use with temporary file
        >>> ds.open('/home/guido/foobar.txt')
        <open file '/home/guido/foobar.txt', mode 'r' at 0x91d4430>
        >>> ds.abspath('/home/guido/foobar.txt')
        '/tmp/.../home/guido/foobar.txt'

    """

    def __init__(self, destpath=os.curdir):
        """Create a DataSource with a local path at destpath."""
        if destpath:
            self._destpath = os.path.abspath(destpath)
            self._istmpdest = False
        else:
            # Any falsy destpath (None, '') selects a temporary directory.
            import tempfile  # deferring import to improve startup time
            self._destpath = tempfile.mkdtemp()
            self._istmpdest = True

    def __del__(self):
        # Remove temp directories. ``getattr`` with a default covers
        # instances on which ``__init__`` never set the flag.
        if getattr(self, '_istmpdest', False):
            import shutil

            shutil.rmtree(self._destpath)

    def _iszip(self, filename):
        """Test if the filename is a zip file by looking at the file extension.

        Returns True when the extension of `filename` is one of the keys of
        the module-level `_file_openers` registry.

        """
        _fname, ext = os.path.splitext(filename)
        return ext in _file_openers.keys()

    def _iswritemode(self, mode):
        """Test if the given mode will open a file for writing.

        Only the characters ``'w'`` and ``'+'`` are treated as write flags.

        """
        _writemodes = ("w", "+")
        return any(c in _writemodes for c in mode)

    def _splitzipext(self, filename):
        """Split zip extension from filename and return filename.

        Returns
        -------
        base, zip_ext : {tuple}
            ``zip_ext`` includes the leading dot (e.g. ``'.gz'``) and is
            None when `filename` has no supported compression extension.

        """
        if self._iszip(filename):
            return os.path.splitext(filename)
        return filename, None

    def _possible_names(self, filename):
        """Return a list containing compressed filename variations.

        The list always starts with `filename` itself. If `filename` does
        not already carry a supported compression extension, one candidate
        per supported extension is appended.

        """
        names = [filename]
        if not self._iszip(filename):
            # The ``None`` key (uncompressed opener) is skipped by the
            # truthiness test.
            names.extend(filename + zipext
                         for zipext in _file_openers.keys() if zipext)
        return names

    def _isurl(self, path):
        """Test if path is a net location. Tests the scheme and netloc."""

        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        # BUG : URLs require a scheme string ('http://') to be used.
        #       www.google.com will fail.
        #       Should we prepend the scheme for those that don't have it and
        #       test that also?  Similar to the way we append .gz and test
        #       for compressed versions of files.

        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        return bool(scheme and netloc)

    def _cache(self, path):
        """Cache the file specified by path.

        Creates a copy of the file in the datasource cache and returns the
        path of that copy.

        """
        # We import these here because importing them is slow and
        # a significant fraction of numpy's total import time.
        import shutil
        from urllib.request import urlopen

        upath = self.abspath(path)

        # Ensure the directory exists. ``exist_ok`` avoids the window
        # between a separate existence check and the creation.
        os.makedirs(os.path.dirname(upath), exist_ok=True)

        # TODO: Doesn't handle compressed files!
        if self._isurl(path):
            with urlopen(path) as openedurl:
                with _open(upath, 'wb') as f:
                    shutil.copyfileobj(openedurl, f)
        else:
            shutil.copyfile(path, upath)
        return upath

    def _findfile(self, path):
        """Searches for ``path`` and returns full path if found.

        If path is an URL, _findfile will cache a local copy and return the
        path to the cached file. If path is a local file, _findfile will
        return a path to that local file.

        The search will include possible compressed versions of the file
        and return the first occurrence found. None is returned when no
        candidate exists.

        """

        # Build list of possible local file paths
        if not self._isurl(path):
            # Valid local paths
            filelist = self._possible_names(path)
            # Paths in self._destpath
            filelist += self._possible_names(self.abspath(path))
        else:
            # Cached URLs in self._destpath
            filelist = self._possible_names(self.abspath(path))
            # Remote URLs
            filelist = filelist + self._possible_names(path)

        for name in filelist:
            if self.exists(name):
                if self._isurl(name):
                    name = self._cache(name)
                return name
        return None

    def abspath(self, path):
        """
        Return absolute path of file in the DataSource directory.

        If `path` is an URL, then `abspath` will return either the location
        the file exists locally or the location it would exist when opened
        using the `open` method.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        Notes
        -----
        The functionality is based on `os.path.abspath`.

        """
        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        # TODO:  This should be more robust.  Handles case where path includes
        #        the destpath, but not other sub-paths. Failing case:
        #        path = /home/guido/datafile.txt
        #        destpath = /home/alex/
        #        upath = self.abspath(path)
        #        upath == '/home/alex/home/guido/datafile.txt'

        # handle case where path includes self._destpath
        splitpath = path.split(self._destpath, 2)
        if len(splitpath) > 1:
            path = splitpath[1]
        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        netloc = self._sanitize_relative_path(netloc)
        upath = self._sanitize_relative_path(upath)
        return os.path.join(self._destpath, netloc, upath)

    def _sanitize_relative_path(self, path):
        """Return a sanitised relative path for which
        os.path.abspath(os.path.join(base, path)).startswith(base)
        """
        last = None
        path = os.path.normpath(path)
        # Repeat until a full pass leaves the path unchanged.
        while path != last:
            last = path
            # Note: os.path.join treats '/' as os.sep on Windows
            path = path.lstrip(os.sep).lstrip('/')
            # str.lstrip takes a set of characters, so this strips every
            # leading '.' character, not only whole '..' components.
            path = path.lstrip(os.pardir).lstrip('..')
            drive, path = os.path.splitdrive(path)  # for Windows
        return path

    def exists(self, path):
        """
        Test if path exists.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is an URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """

        # First test for local path
        if os.path.exists(path):
            return True

        # We import this here because importing urllib is slow and
        # a significant fraction of numpy's total import time.
        from urllib.request import urlopen
        from urllib.error import URLError

        # Test cached url
        upath = self.abspath(path)
        if os.path.exists(upath):
            return True

        # Test remote url
        if self._isurl(path):
            try:
                # Opening is the accessibility probe; the connection is
                # closed again right away.
                with urlopen(path):
                    return True
            except URLError:
                return False
        return False

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object.

        If `path` is an URL, it will be downloaded, stored in the
        `DataSource` directory and opened from there.

        Parameters
        ----------
        path : str
            Local file path or URL to open.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.
        encoding : {None, str}, optional
            Open text file with given encoding. The default encoding will be
            what `io.open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        Raises
        ------
        ValueError
            If `path` is an URL and `mode` is a write mode.
        FileNotFoundError
            If neither `path` nor a compressed variant of it can be found.

        Notes
        -----
        For ``.bz2`` files any ``'+'`` is removed from `mode` before the
        file is opened, because ``bz2.open`` rejects update modes.

        """

        # TODO: There is no support for opening a file for writing which
        #       doesn't exist yet (creating a file).  Should there be?

        # TODO: Add a ``subdir`` parameter for specifying the subdirectory
        #       used to store URLs in self._destpath.

        if self._isurl(path) and self._iswritemode(mode):
            raise ValueError("URLs are not writeable")

        # NOTE: _findfile will fail on a new file opened for writing.
        found = self._findfile(path)
        if not found:
            raise FileNotFoundError(f"{path} not found.")

        _fname, ext = self._splitzipext(found)
        # The extension returned by _splitzipext keeps its leading dot, and
        # str.replace returns a new string, so the result must be rebound.
        if ext == '.bz2':
            mode = mode.replace("+", "")
        return _file_openers[ext](found, mode=mode,
                                  encoding=encoding, newline=newline)
534
535
class Repository(DataSource):
    """
    Repository(baseurl, destpath='.')

    A `DataSource` whose files all live under one base URL or directory.

    Every path handed to a `Repository` is resolved against the base given
    at construction time, so individual files can be referred to by file
    name only. Use it when working with several files from one location.

    Parameters
    ----------
    baseurl : str
        Path to the local directory or remote location that contains the
        data files.
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use. If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Examples
    --------
    To analyze all files in the repository, do something like this
    (note: this is not self-contained code)::

        >>> repos = np.lib._datasource.Repository('/home/user/data/dir/')
        >>> for filename in filelist:
        ...     fp = repos.open(filename)
        ...     fp.analyze()
        ...     fp.close()

    Similarly you could use a URL for a repository::

        >>> repos = np.lib._datasource.Repository('http://www.xyz.edu/data')

    """

    def __init__(self, baseurl, destpath=os.curdir):
        """Create a Repository with a shared url or directory of baseurl."""
        super().__init__(destpath=destpath)
        self._baseurl = baseurl

    def __del__(self):
        super().__del__()

    def _fullpath(self, path):
        """Return complete path for path.  Prepends baseurl if necessary."""
        _before, match, _after = path.partition(self._baseurl)
        if match:
            # path contains baseurl already
            return path
        return os.path.join(self._baseurl, path)

    def _findfile(self, path):
        """Extend DataSource method to prepend baseurl to ``path``."""
        full = self._fullpath(path)
        return super()._findfile(full)

    def abspath(self, path):
        """
        Return absolute path of file in the Repository directory.

        The base URL is prepended to `path` when `path` does not already
        contain it; the result is then resolved by `DataSource.abspath`.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        """
        full = self._fullpath(path)
        return super().abspath(full)

    def exists(self, path):
        """
        Test if path exists prepending Repository base URL to path.

        The base URL is prepended to `path` when `path` does not already
        contain it; the check itself is done by `DataSource.exists`, which
        looks for (in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : bool
            True if `path` exists.

        """
        full = self._fullpath(path)
        return super().exists(full)

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object prepending Repository base URL.

        If `path` is an URL, it will be downloaded, stored in the
        DataSource directory and opened from there.

        Parameters
        ----------
        path : str
            Local file path or URL to open. This may, but does not have to,
            include the `baseurl` with which the `Repository` was
            initialized.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.
        encoding : {None, str}, optional
            Open text file with given encoding. The default encoding will be
            what `io.open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        """
        full = self._fullpath(path)
        return super().open(full, mode, encoding=encoding, newline=newline)

    def listdir(self):
        """
        List files in the source Repository.

        Returns
        -------
        files : list of str
            List of file names (not containing a directory part).

        Raises
        ------
        NotImplementedError
            If the base of the repository is an URL.

        Notes
        -----
        Does not currently work for remote repositories.

        """
        if not self._isurl(self._baseurl):
            return os.listdir(self._baseurl)
        raise NotImplementedError(
            "Directory listing of URLs, not supported yet.")