1"""
2IO related functions.
3"""
4import os
5import re
6import functools
7import itertools
8import warnings
9import weakref
10import contextlib
11import operator
12from operator import itemgetter
13from collections.abc import Mapping
14import pickle
15
16import numpy as np
17from . import format
18from ._datasource import DataSource
19from numpy._core import overrides
20from numpy._core.multiarray import packbits, unpackbits
21from numpy._core._multiarray_umath import _load_from_filelike
22from numpy._core.overrides import finalize_array_function_like, set_module
23from ._iotools import (
24 LineSplitter, NameValidator, StringConverter, ConverterError,
25 ConverterLockError, ConversionWarning, _is_string_like,
26 has_nested_fields, flatten_dtype, easy_dtype, _decode_line
27 )
28from numpy._utils import asunicode, asbytes
29
30
31__all__ = [
32 'savetxt', 'loadtxt', 'genfromtxt', 'load', 'save', 'savez',
33 'savez_compressed', 'packbits', 'unpackbits', 'fromregex'
34 ]
35
36
37array_function_dispatch = functools.partial(
38 overrides.array_function_dispatch, module='numpy')
39
40
41class BagObj:
42 """
43 BagObj(obj)
44
45 Convert attribute look-ups to getitems on the object passed in.
46
47 Parameters
48 ----------
49 obj : class instance
50 Object on which attribute look-up is performed.
51
52 Examples
53 --------
54 >>> import numpy as np
55 >>> from numpy.lib._npyio_impl import BagObj as BO
56 >>> class BagDemo:
57 ... def __getitem__(self, key): # An instance of BagObj(BagDemo)
58 ... # will call this method when any
59 ... # attribute look-up is required
60 ... result = "Doesn't matter what you want, "
61 ... return result + "you're gonna get this"
62 ...
63 >>> demo_obj = BagDemo()
64 >>> bagobj = BO(demo_obj)
65 >>> bagobj.hello_there
66 "Doesn't matter what you want, you're gonna get this"
67 >>> bagobj.I_can_be_anything
68 "Doesn't matter what you want, you're gonna get this"
69
70 """
71
72 def __init__(self, obj):
73 # Use weakref to make NpzFile objects collectable by refcount
74 self._obj = weakref.proxy(obj)
75
76 def __getattribute__(self, key):
77 try:
78 return object.__getattribute__(self, '_obj')[key]
79 except KeyError:
80 raise AttributeError(key) from None
81
82 def __dir__(self):
83 """
84 Enables dir(bagobj) to list the files in an NpzFile.
85
86 This also enables tab-completion in an interpreter or IPython.
87 """
88 return list(object.__getattribute__(self, '_obj').keys())
89
90
91def zipfile_factory(file, *args, **kwargs):
92 """
93 Create a ZipFile.
94
95 Allows for Zip64, and the `file` argument can accept file, str, or
96 pathlib.Path objects. `args` and `kwargs` are passed to the zipfile.ZipFile
97 constructor.
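
    Examples
    --------
    A minimal sketch of direct use (this helper is module-private; the
    temporary path below is only for illustration):

    >>> import tempfile, os
    >>> path = os.path.join(tempfile.mkdtemp(), "demo.zip")
    >>> zf = zipfile_factory(path, mode="w")
    >>> zf.writestr("x.txt", b"payload")
    >>> zf.close()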
98 """
99 if not hasattr(file, 'read'):
100 file = os.fspath(file)
101 import zipfile
102 kwargs['allowZip64'] = True
103 return zipfile.ZipFile(file, *args, **kwargs)
104
105
106@set_module('numpy.lib.npyio')
107class NpzFile(Mapping):
108 """
109 NpzFile(fid)
110
111 A dictionary-like object with lazy-loading of files in the zipped
112 archive provided on construction.
113
    `NpzFile` is used to load files in the NumPy ``.npz`` data archive
    format. It assumes that files in the archive have a ``.npy`` extension;
    other files are ignored.

    The arrays and file strings are lazily loaded on either
    getitem access using ``obj['key']`` or attribute lookup using
    ``obj.f.key``. A list of all files (without ``.npy`` extensions) can
    be obtained with ``obj.files`` and the ZipFile object itself using
    ``obj.zip``.

    Attributes
    ----------
    files : list of str
        List of all files in the archive with a ``.npy`` extension.
    zip : ZipFile instance
        The ZipFile object initialized with the zipped archive.
    f : BagObj instance
        An object on which attribute access can be performed as an
        alternative to getitem access on the `NpzFile` instance itself.
    allow_pickle : bool, optional
        Allow loading pickled data. Default: False
    pickle_kwargs : dict, optional
        Additional keyword arguments to pass on to pickle.load.
        These are only useful when loading object arrays saved on
        Python 2 when using Python 3.
    max_header_size : int, optional
        Maximum allowed size of the header.  Large headers may not be safe
        to load securely and thus require explicitly passing a larger value.
        See :py:func:`ast.literal_eval()` for details.
        This option is ignored when `allow_pickle` is passed.  In that case
        the file is by definition trusted and the limit is unnecessary.

    Parameters
    ----------
    fid : file, str, or pathlib.Path
        The zipped archive to open. This is either a file-like object
        or a string containing the path to the archive.
    own_fid : bool, optional
        Whether NpzFile should close the file handle.
        Requires that `fid` is a file-like object.

    Examples
    --------
    >>> import numpy as np
    >>> from tempfile import TemporaryFile
    >>> outfile = TemporaryFile()
    >>> x = np.arange(10)
    >>> y = np.sin(x)
    >>> np.savez(outfile, x=x, y=y)
    >>> _ = outfile.seek(0)

    >>> npz = np.load(outfile)
    >>> isinstance(npz, np.lib.npyio.NpzFile)
    True
    >>> npz
    NpzFile 'object' with keys: x, y
    >>> sorted(npz.files)
    ['x', 'y']
    >>> npz['x']  # getitem access
    array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    >>> npz.f.x  # attribute lookup
    array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

    """
    # Make __exit__ safe if zipfile_factory raises an exception
    zip = None
    fid = None
    _MAX_REPR_ARRAY_COUNT = 5

    def __init__(self, fid, own_fid=False, allow_pickle=False,
                 pickle_kwargs=None, *,
                 max_header_size=format._MAX_HEADER_SIZE):
        # Import is postponed to here since zipfile depends on gzip, an
        # optional component of the so-called standard library.
        _zip = zipfile_factory(fid)
        self._files = _zip.namelist()
        self.files = []
        self.allow_pickle = allow_pickle
        self.max_header_size = max_header_size
        self.pickle_kwargs = pickle_kwargs
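        # Strip the ``.npy`` suffix from member names below so keys match
        # the array names passed to ``savez``; members without the suffix
        # keep their full name.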
        for x in self._files:
            if x.endswith('.npy'):
                self.files.append(x[:-4])
            else:
                self.files.append(x)
        self.zip = _zip
        self.f = BagObj(self)
        if own_fid:
            self.fid = fid

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """
        Close the file.

        """
        if self.zip is not None:
            self.zip.close()
            self.zip = None
        if self.fid is not None:
            self.fid.close()
            self.fid = None
        self.f = None  # break reference cycle

    def __del__(self):
        self.close()

    # Implement the Mapping ABC
    def __iter__(self):
        return iter(self.files)

    def __len__(self):
        return len(self.files)

    def __getitem__(self, key):
        # FIXME: This seems like it will copy strings around
        #   more than is strictly necessary.  The zipfile
        #   will read the string and then
        #   the format.read_array will copy the string
        #   to another place in memory.
        #   It would be better if the zipfile could read
        #   (or at least uncompress) the data
        #   directly into the array memory.
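        # A key may name an archive member directly (including its ``.npy``
        # suffix) or an array stored by ``savez`` (without the suffix); in
        # the latter case the suffix is re-added to locate the member.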
        member = False
        if key in self._files:
            member = True
        elif key in self.files:
            member = True
            key += '.npy'
        if member:
            bytes = self.zip.open(key)
            magic = bytes.read(len(format.MAGIC_PREFIX))
            bytes.close()
            if magic == format.MAGIC_PREFIX:
                bytes = self.zip.open(key)
                return format.read_array(bytes,
                                         allow_pickle=self.allow_pickle,
                                         pickle_kwargs=self.pickle_kwargs,
                                         max_header_size=self.max_header_size)
            else:
                return self.zip.read(key)
        else:
            raise KeyError(f"{key} is not a file in the archive")

    def __contains__(self, key):
        return (key in self._files or key in self.files)

    def __repr__(self):
        # Get filename or default to `object`
        if isinstance(self.fid, str):
            filename = self.fid
        else:
            filename = getattr(self.fid, "name", "object")

        # Get the name of arrays
        array_names = ', '.join(self.files[:self._MAX_REPR_ARRAY_COUNT])
        if len(self.files) > self._MAX_REPR_ARRAY_COUNT:
            array_names += "..."
        return f"NpzFile {filename!r} with keys: {array_names}"

    # Work around problems with the docstrings in the Mapping methods
    # They contain a `->`, which confuses the type annotation interpretations
    # of sphinx-docs. See gh-25964

    def get(self, key, default=None, /):
        """
        D.get(k[,d]) returns D[k] if k in D, else d.  d defaults to None.
286 """
287 return Mapping.get(self, key, default)
288
289 def items(self):
290 """
291 D.items() returns a set-like object providing a view on the items
292 """
293 return Mapping.items(self)
294
295 def keys(self):
296 """
297 D.keys() returns a set-like object providing a view on the keys
298 """
299 return Mapping.keys(self)
300
301 def values(self):
302 """
303 D.values() returns a set-like object providing a view on the values
304 """
305 return Mapping.values(self)
306
307
308@set_module('numpy')
309def load(file, mmap_mode=None, allow_pickle=False, fix_imports=True,
310 encoding='ASCII', *, max_header_size=format._MAX_HEADER_SIZE):
311 """
312 Load arrays or pickled objects from ``.npy``, ``.npz`` or pickled files.
313
314 .. warning:: Loading files that contain object arrays uses the ``pickle``
315 module, which is not secure against erroneous or maliciously
316 constructed data. Consider passing ``allow_pickle=False`` to
317 load data that is known not to contain object arrays for the
318 safer handling of untrusted sources.
319
320 Parameters
321 ----------
322 file : file-like object, string, or pathlib.Path
323 The file to read. File-like objects must support the
324 ``seek()`` and ``read()`` methods and must always
325 be opened in binary mode. Pickled files require that the
326 file-like object support the ``readline()`` method as well.
327 mmap_mode : {None, 'r+', 'r', 'w+', 'c'}, optional
328 If not None, then memory-map the file, using the given mode (see
329 `numpy.memmap` for a detailed description of the modes). A
330 memory-mapped array is kept on disk. However, it can be accessed
331 and sliced like any ndarray. Memory mapping is especially useful
332 for accessing small fragments of large files without reading the
333 entire file into memory.
334 allow_pickle : bool, optional
335 Allow loading pickled object arrays stored in npy files. Reasons for
336 disallowing pickles include security, as loading pickled data can
337 execute arbitrary code. If pickles are disallowed, loading object
338 arrays will fail. Default: False
339 fix_imports : bool, optional
340 Only useful when loading Python 2 generated pickled files on Python 3,
341 which includes npy/npz files containing object arrays. If `fix_imports`
342 is True, pickle will try to map the old Python 2 names to the new names
343 used in Python 3.
344 encoding : str, optional
345 What encoding to use when reading Python 2 strings. Only useful when
346 loading Python 2 generated pickled files in Python 3, which includes
347 npy/npz files containing object arrays. Values other than 'latin1',
348 'ASCII', and 'bytes' are not allowed, as they can corrupt numerical
349 data. Default: 'ASCII'
350 max_header_size : int, optional
351 Maximum allowed size of the header. Large headers may not be safe
352 to load securely and thus require explicitly passing a larger value.
353 See :py:func:`ast.literal_eval()` for details.
354 This option is ignored when `allow_pickle` is passed. In that case
355 the file is by definition trusted and the limit is unnecessary.
356
357 Returns
358 -------
359 result : array, tuple, dict, etc.
360 Data stored in the file. For ``.npz`` files, the returned instance
361 of NpzFile class must be closed to avoid leaking file descriptors.
362
363 Raises
364 ------
365 OSError
366 If the input file does not exist or cannot be read.
367 UnpicklingError
368 If ``allow_pickle=True``, but the file cannot be loaded as a pickle.
369 ValueError
370 The file contains an object array, but ``allow_pickle=False`` given.
    EOFError
        When calling ``np.load`` multiple times on the same file handle,
        if all data has already been read.

    See Also
    --------
    save, savez, savez_compressed, loadtxt
    memmap : Create a memory-map to an array stored in a file on disk.
    lib.format.open_memmap : Create or load a memory-mapped ``.npy`` file.

    Notes
    -----
    - If the file contains pickle data, then whatever object is stored
      in the pickle is returned.
    - If the file is a ``.npy`` file, then a single array is returned.
    - If the file is a ``.npz`` file, then a dictionary-like object is
      returned, containing ``{filename: array}`` key-value pairs, one for
      each file in the archive.
    - If the file is a ``.npz`` file, the returned value supports the
      context manager protocol in a similar fashion to the open function::

        with load('foo.npz') as data:
            a = data['a']

      The underlying file descriptor is closed when exiting the 'with'
      block.

    Examples
    --------
    >>> import numpy as np

    Store data to disk, and load it again:

    >>> np.save('/tmp/123', np.array([[1, 2, 3], [4, 5, 6]]))
    >>> np.load('/tmp/123.npy')
    array([[1, 2, 3],
           [4, 5, 6]])

    Store compressed data to disk, and load it again:

    >>> a = np.array([[1, 2, 3], [4, 5, 6]])
    >>> b = np.array([1, 2])
    >>> np.savez('/tmp/123.npz', a=a, b=b)
    >>> data = np.load('/tmp/123.npz')
    >>> data['a']
    array([[1, 2, 3],
           [4, 5, 6]])
    >>> data['b']
    array([1, 2])
    >>> data.close()

    Mem-map the stored array, and then access the second row
    directly from disk:

    >>> X = np.load('/tmp/123.npy', mmap_mode='r')
    >>> X[1, :]
    memmap([4, 5, 6])
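
    Loading an object array requires opting in to pickle; only do this for
    files you trust. A minimal sketch (the ``/tmp`` path is illustrative):

    >>> obj = np.array([{'a': 1}], dtype=object)
    >>> np.save('/tmp/obj.npy', obj, allow_pickle=True)
    >>> np.load('/tmp/obj.npy', allow_pickle=True)
    array([{'a': 1}], dtype=object)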

    """
    if encoding not in ('ASCII', 'latin1', 'bytes'):
        # The 'encoding' value for pickle also affects what encoding
        # the serialized binary data of NumPy arrays is loaded
        # in. Pickle does not pass on the encoding information to
        # NumPy. The unpickling code in numpy._core.multiarray is
        # written to assume that unicode data appearing where binary
        # should be is in 'latin1'. 'bytes' is also safe, as is 'ASCII'.
        #
        # Other encoding values can corrupt binary data, and we
        # purposefully disallow them. For the same reason, the errors=
        # argument is not exposed, as values other than 'strict' can
        # similarly silently corrupt numerical data.
        raise ValueError("encoding must be 'ASCII', 'latin1', or 'bytes'")

    pickle_kwargs = dict(encoding=encoding, fix_imports=fix_imports)

    with contextlib.ExitStack() as stack:
        if hasattr(file, 'read'):
            fid = file
            own_fid = False
        else:
            fid = stack.enter_context(open(os.fspath(file), "rb"))
            own_fid = True

        # Code to distinguish between NumPy binary files and pickles.
        _ZIP_PREFIX = b'PK\x03\x04'
        _ZIP_SUFFIX = b'PK\x05\x06'  # empty zip files start with this
        N = len(format.MAGIC_PREFIX)
        magic = fid.read(N)
        if not magic:
            raise EOFError("No data left in file")
        # If the file size is less than N, we need to make sure not
        # to seek past the beginning of the file
        fid.seek(-min(N, len(magic)), 1)  # back-up
        if magic.startswith((_ZIP_PREFIX, _ZIP_SUFFIX)):
            # zip-file (assume .npz)
            # Potentially transfer file ownership to NpzFile
            stack.pop_all()
            ret = NpzFile(fid, own_fid=own_fid, allow_pickle=allow_pickle,
                          pickle_kwargs=pickle_kwargs,
                          max_header_size=max_header_size)
            return ret
        elif magic == format.MAGIC_PREFIX:
            # .npy file
            if mmap_mode:
                if allow_pickle:
                    max_header_size = 2**64
                return format.open_memmap(file, mode=mmap_mode,
                                          max_header_size=max_header_size)
            else:
                return format.read_array(fid, allow_pickle=allow_pickle,
                                         pickle_kwargs=pickle_kwargs,
                                         max_header_size=max_header_size)
        else:
            # Try a pickle
            if not allow_pickle:
                raise ValueError(
                    "This file contains pickled (object) data. If you trust "
                    "the file you can load it unsafely using the "
                    "`allow_pickle=` keyword argument or `pickle.load()`.")
            try:
                return pickle.load(fid, **pickle_kwargs)
            except Exception as e:
                raise pickle.UnpicklingError(
                    f"Failed to interpret file {file!r} as a pickle") from e


def _save_dispatcher(file, arr, allow_pickle=None, fix_imports=None):
    return (arr,)


@array_function_dispatch(_save_dispatcher)
def save(file, arr, allow_pickle=True, fix_imports=np._NoValue):
    """
    Save an array to a binary file in NumPy ``.npy`` format.

    Parameters
    ----------
    file : file, str, or pathlib.Path
        File or filename to which the data is saved.  If file is a file-object,
        then the filename is unchanged.  If file is a string or Path,
        a ``.npy`` extension will be appended to the filename if it does not
        already have one.
    arr : array_like
        Array data to be saved.
    allow_pickle : bool, optional
        Allow saving object arrays using Python pickles. Reasons for
        disallowing pickles include security (loading pickled data can execute
        arbitrary code) and portability (pickled objects may not be loadable
        on different Python installations, for example if the stored objects
        require libraries that are not available, and not all pickled data is
        compatible between different versions of Python).
        Default: True
    fix_imports : bool, optional
        The `fix_imports` flag is deprecated and has no effect.

        .. deprecated:: 2.1
            This flag is ignored since NumPy 1.17 and was only needed to
            support loading some files in Python 2 written in Python 3.

    See Also
    --------
    savez : Save several arrays into a ``.npz`` archive
    savetxt, load

    Notes
    -----
    For a description of the ``.npy`` format, see :py:mod:`numpy.lib.format`.

    Any data saved to the file is appended to the end of the file.

    Examples
    --------
    >>> import numpy as np

    >>> from tempfile import TemporaryFile
    >>> outfile = TemporaryFile()

    >>> x = np.arange(10)
    >>> np.save(outfile, x)

    >>> _ = outfile.seek(0)  # Only needed to simulate closing & reopening file
    >>> np.load(outfile)
    array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])


    >>> with open('test.npy', 'wb') as f:
    ...     np.save(f, np.array([1, 2]))
    ...     np.save(f, np.array([1, 3]))
    >>> with open('test.npy', 'rb') as f:
    ...     a = np.load(f)
    ...     b = np.load(f)
    >>> print(a, b)
    # [1 2] [1 3]
    """
    if fix_imports is not np._NoValue:
        # Deprecated 2024-05-16, NumPy 2.1
        warnings.warn(
            "The 'fix_imports' flag is deprecated and has no effect. "
            "(Deprecated in NumPy 2.1)",
            DeprecationWarning, stacklevel=2)
    if hasattr(file, 'write'):
        file_ctx = contextlib.nullcontext(file)
    else:
        file = os.fspath(file)
        if not file.endswith('.npy'):
            file = file + '.npy'
        file_ctx = open(file, "wb")

    with file_ctx as fid:
        arr = np.asanyarray(arr)
        format.write_array(fid, arr, allow_pickle=allow_pickle,
                           pickle_kwargs=dict(fix_imports=fix_imports))


def _savez_dispatcher(file, *args, allow_pickle=True, **kwds):
    yield from args
    yield from kwds.values()


@array_function_dispatch(_savez_dispatcher)
def savez(file, *args, allow_pickle=True, **kwds):
    """Save several arrays into a single file in uncompressed ``.npz`` format.

    Provide arrays as keyword arguments to store them under the
    corresponding name in the output file: ``savez(fn, x=x, y=y)``.

    If arrays are specified as positional arguments, i.e., ``savez(fn,
    x, y)``, their names will be `arr_0`, `arr_1`, etc.

    Parameters
    ----------
    file : file, str, or pathlib.Path
        Either the filename (string) or an open file (file-like object)
        where the data will be saved. If file is a string or a Path, the
        ``.npz`` extension will be appended to the filename if it is not
        already there.
    args : Arguments, optional
        Arrays to save to the file. Please use keyword arguments (see
        `kwds` below) to assign names to arrays.  Arrays specified as
        args will be named "arr_0", "arr_1", and so on.
    allow_pickle : bool, optional
        Allow saving object arrays using Python pickles. Reasons for
        disallowing pickles include security (loading pickled data can execute
        arbitrary code) and portability (pickled objects may not be loadable
        on different Python installations, for example if the stored objects
        require libraries that are not available, and not all pickled data is
        compatible between different versions of Python).
        Default: True
    kwds : Keyword arguments, optional
        Arrays to save to the file. Each array will be saved to the
        output file with its corresponding keyword name.

    Returns
    -------
    None

    See Also
    --------
    save : Save a single array to a binary file in NumPy format.
    savetxt : Save an array to a file as plain text.
    savez_compressed : Save several arrays into a compressed ``.npz`` archive

    Notes
    -----
    The ``.npz`` file format is a zipped archive of files named after the
    variables they contain.  The archive is not compressed and each file
    in the archive contains one variable in ``.npy`` format. For a
    description of the ``.npy`` format, see :py:mod:`numpy.lib.format`.

    When opening the saved ``.npz`` file with `load` a `~lib.npyio.NpzFile`
    object is returned. This is a dictionary-like object which can be queried
    for its list of arrays (with the ``.files`` attribute), and for the arrays
    themselves.

    Keys passed in `kwds` are used as filenames inside the ZIP archive.
    Therefore, keys should be valid filenames; e.g., avoid keys that begin with
    ``/`` or contain ``.``.

    When naming variables with keyword arguments, it is not possible to name a
    variable ``file``, as this would cause the ``file`` argument to be defined
    twice in the call to ``savez``.

    Examples
    --------
    >>> import numpy as np
    >>> from tempfile import TemporaryFile
    >>> outfile = TemporaryFile()
    >>> x = np.arange(10)
    >>> y = np.sin(x)

    Using `savez` with \\*args, the arrays are saved with default names.

    >>> np.savez(outfile, x, y)
    >>> _ = outfile.seek(0)  # Only needed to simulate closing & reopening file
    >>> npzfile = np.load(outfile)
    >>> npzfile.files
    ['arr_0', 'arr_1']
    >>> npzfile['arr_0']
    array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

    Using `savez` with \\**kwds, the arrays are saved with the keyword names.

    >>> outfile = TemporaryFile()
    >>> np.savez(outfile, x=x, y=y)
    >>> _ = outfile.seek(0)
    >>> npzfile = np.load(outfile)
    >>> sorted(npzfile.files)
    ['x', 'y']
    >>> npzfile['x']
    array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

    """
    _savez(file, args, kwds, False, allow_pickle=allow_pickle)


def _savez_compressed_dispatcher(file, *args, allow_pickle=True, **kwds):
    yield from args
    yield from kwds.values()


@array_function_dispatch(_savez_compressed_dispatcher)
def savez_compressed(file, *args, allow_pickle=True, **kwds):
    """
    Save several arrays into a single file in compressed ``.npz`` format.

    Provide arrays as keyword arguments to store them under the
    corresponding name in the output file: ``savez_compressed(fn, x=x, y=y)``.

    If arrays are specified as positional arguments, i.e.,
    ``savez_compressed(fn, x, y)``, their names will be `arr_0`, `arr_1`, etc.

    Parameters
    ----------
    file : file, str, or pathlib.Path
        Either the filename (string) or an open file (file-like object)
        where the data will be saved. If file is a string or a Path, the
        ``.npz`` extension will be appended to the filename if it is not
        already there.
    args : Arguments, optional
        Arrays to save to the file. Please use keyword arguments (see
        `kwds` below) to assign names to arrays.  Arrays specified as
        args will be named "arr_0", "arr_1", and so on.
    allow_pickle : bool, optional
        Allow saving object arrays using Python pickles. Reasons for
        disallowing pickles include security (loading pickled data can execute
        arbitrary code) and portability (pickled objects may not be loadable
        on different Python installations, for example if the stored objects
        require libraries that are not available, and not all pickled data is
        compatible between different versions of Python).
        Default: True
    kwds : Keyword arguments, optional
        Arrays to save to the file. Each array will be saved to the
        output file with its corresponding keyword name.

    Returns
    -------
    None

    See Also
    --------
    numpy.save : Save a single array to a binary file in NumPy format.
    numpy.savetxt : Save an array to a file as plain text.
    numpy.savez : Save several arrays into an uncompressed ``.npz`` file format
    numpy.load : Load the files created by savez_compressed.

    Notes
    -----
    The ``.npz`` file format is a zipped archive of files named after the
    variables they contain.  The archive is compressed with
    ``zipfile.ZIP_DEFLATED`` and each file in the archive contains one variable
    in ``.npy`` format. For a description of the ``.npy`` format, see
    :py:mod:`numpy.lib.format`.

    When opening the saved ``.npz`` file with `load` a `~lib.npyio.NpzFile`
    object is returned. This is a dictionary-like object which can be queried
    for its list of arrays (with the ``.files`` attribute), and for the arrays
    themselves.

    Examples
    --------
    >>> import numpy as np
    >>> test_array = np.random.rand(3, 2)
    >>> test_vector = np.random.rand(4)
    >>> np.savez_compressed('/tmp/123', a=test_array, b=test_vector)
    >>> loaded = np.load('/tmp/123.npz')
    >>> print(np.array_equal(test_array, loaded['a']))
    True
    >>> print(np.array_equal(test_vector, loaded['b']))
    True

    """
    _savez(file, args, kwds, True, allow_pickle=allow_pickle)


def _savez(file, args, kwds, compress, allow_pickle=True, pickle_kwargs=None):
    # Import is postponed to here since zipfile depends on gzip, an optional
    # component of the so-called standard library.
    import zipfile

    if not hasattr(file, 'write'):
        file = os.fspath(file)
        if not file.endswith('.npz'):
            file = file + '.npz'

    namedict = kwds
    for i, val in enumerate(args):
        key = 'arr_%d' % i
        if key in namedict.keys():
            raise ValueError(
                "Cannot use un-named variables and keyword %s" % key)
        namedict[key] = val

    if compress:
        compression = zipfile.ZIP_DEFLATED
    else:
        compression = zipfile.ZIP_STORED

    zipf = zipfile_factory(file, mode="w", compression=compression)
    try:
        for key, val in namedict.items():
            fname = key + '.npy'
            val = np.asanyarray(val)
            # always force zip64, gh-10776
            with zipf.open(fname, 'w', force_zip64=True) as fid:
                format.write_array(fid, val,
                                   allow_pickle=allow_pickle,
                                   pickle_kwargs=pickle_kwargs)
    finally:
        zipf.close()


def _ensure_ndmin_ndarray_check_param(ndmin):
    """Check that the `ndmin` parameter is supported by
    _ensure_ndmin_ndarray. Intended as cheap verification before
    running anything expensive, e.g. loadtxt or genfromtxt.
    """
    # Check correctness of the values of `ndmin`
    if ndmin not in [0, 1, 2]:
        raise ValueError(f"Illegal value of ndmin keyword: {ndmin}")


def _ensure_ndmin_ndarray(a, *, ndmin: int):
    """This is a helper function of loadtxt and genfromtxt to ensure
    proper minimum dimension as requested

    ndmin : int. Supported values: 0, 1, 2
    ^^ whenever this changes, keep in sync with
       _ensure_ndmin_ndarray_check_param
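
    A quick sketch of the intended behavior (names here are this module's
    private helpers):

    >>> _ensure_ndmin_ndarray(np.array(1.0), ndmin=1).shape
    (1,)
    >>> _ensure_ndmin_ndarray(np.array([1.0, 2.0]), ndmin=2).shape
    (2, 1)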
820 """
821 # Verify that the array has at least dimensions `ndmin`.
822 # Tweak the size and shape of the arrays - remove extraneous dimensions
823 if a.ndim > ndmin:
824 a = np.squeeze(a)
825 # and ensure we have the minimum number of dimensions asked for
826 # - has to be in this order for the odd case ndmin=1, a.squeeze().ndim=0
827 if a.ndim < ndmin:
828 if ndmin == 1:
829 a = np.atleast_1d(a)
830 elif ndmin == 2:
831 a = np.atleast_2d(a).T
832
833 return a
834
835
836# amount of lines loadtxt reads in one chunk, can be overridden for testing
837_loadtxt_chunksize = 50000
838
839
840def _check_nonneg_int(value, name="argument"):
841 try:
842 operator.index(value)
843 except TypeError:
844 raise TypeError(f"{name} must be an integer") from None
845 if value < 0:
846 raise ValueError(f"{name} must be nonnegative")
847
848
def _preprocess_comments(iterable, comments, encoding):
    """
    Generator that consumes a line-iterable and strips out the
    multiple (or multi-character) comments from lines.
    This is a pre-processing step to achieve feature parity with loadtxt
    (we assume that this is a niche feature).
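
    A small sketch of the behavior (with str input, so `encoding` is
    unused):

    >>> list(_preprocess_comments(["1 2 # x", "3 // y 4"], ["#", "//"], None))
    ['1 2 ', '3 ']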
855 """
856 for line in iterable:
857 if isinstance(line, bytes):
858 # Need to handle conversion here, or the splitting would fail
859 line = line.decode(encoding)
860
861 for c in comments:
862 line = line.split(c, 1)[0]
863
864 yield line
865
866
867# The number of rows we read in one go if confronted with a parametric dtype
868_loadtxt_chunksize = 50000
869
870
871def _read(fname, *, delimiter=',', comment='#', quote='"',
872 imaginary_unit='j', usecols=None, skiplines=0,
873 max_rows=None, converters=None, ndmin=None, unpack=False,
874 dtype=np.float64, encoding=None):
875 r"""
876 Read a NumPy array from a text file.
877 This is a helper function for loadtxt.
878
879 Parameters
880 ----------
881 fname : file, str, or pathlib.Path
882 The filename or the file to be read.
883 delimiter : str, optional
884 Field delimiter of the fields in line of the file.
885 Default is a comma, ','. If None any sequence of whitespace is
886 considered a delimiter.
887 comment : str or sequence of str or None, optional
888 Character that begins a comment. All text from the comment
889 character to the end of the line is ignored.
890 Multiple comments or multiple-character comment strings are supported,
891 but may be slower and `quote` must be empty if used.
892 Use None to disable all use of comments.
893 quote : str or None, optional
894 Character that is used to quote string fields. Default is '"'
895 (a double quote). Use None to disable quote support.
    imaginary_unit : str, optional
        Character that represents the imaginary unit `sqrt(-1)`.
        Default is 'j'.
    usecols : array_like, optional
        A one-dimensional array of integer column numbers.  These are the
        columns from the file to be included in the array.  If this value
        is not given, all the columns are used.
    skiplines : int, optional
        Number of lines to skip before interpreting the data in the file.
    max_rows : int, optional
        Maximum number of rows of data to read.  Default is to read the
        entire file.
    converters : dict or callable, optional
        A function to parse all column strings into the desired value, or
        a dictionary mapping column number to a parser function.
        E.g. if column 0 is a date string: ``converters = {0: datestr2num}``.
        Converters can also be used to provide a default value for missing
        data, e.g. ``converters = lambda s: float(s.strip() or 0)`` will
        convert empty fields to 0.
        Default: None
    ndmin : int, optional
        Minimum dimension of the array returned.
        Allowed values are 0, 1 or 2.  Default is 0.
    unpack : bool, optional
        If True, the returned array is transposed, so that arguments may be
        unpacked using ``x, y, z = read(...)``.  When used with a structured
        data-type, arrays are returned for each field.  Default is False.
    dtype : numpy data type
        A NumPy dtype instance, can be a structured dtype to map to the
        columns of the file.
    encoding : str, optional
        Encoding used to decode the input file. The special value 'bytes'
        (the default) enables backwards-compatible behavior for `converters`,
        ensuring that inputs to the converter functions are encoded
        bytes objects. The special value 'bytes' has no additional effect if
        ``converters=None``. If encoding is ``'bytes'`` or ``None``, the
        default system encoding is used.

    Returns
    -------
    ndarray
        NumPy array.
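
    Examples
    --------
    A minimal sketch of direct use (normally this is reached via `loadtxt`);
    note that `comment` must be given as a sequence of strings here, and
    `ndmin` must be passed explicitly:

    >>> _read(["1,2", "3,4"], dtype=np.float64, ndmin=0, comment=["#"])
    array([[1., 2.],
           [3., 4.]])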
938 """
939 # Handle special 'bytes' keyword for encoding
940 byte_converters = False
941 if encoding == 'bytes':
942 encoding = None
943 byte_converters = True
944
945 if dtype is None:
946 raise TypeError("a dtype must be provided.")
947 dtype = np.dtype(dtype)
948
949 read_dtype_via_object_chunks = None
950 if dtype.kind in 'SUM' and (
951 dtype == "S0" or dtype == "U0" or dtype == "M8" or dtype == 'm8'):
952 # This is a legacy "flexible" dtype. We do not truly support
953 # parametric dtypes currently (no dtype discovery step in the core),
954 # but have to support these for backward compatibility.
955 read_dtype_via_object_chunks = dtype
956 dtype = np.dtype(object)
957
958 if usecols is not None:
959 # Allow usecols to be a single int or a sequence of ints, the C-code
960 # handles the rest
961 try:
962 usecols = list(usecols)
963 except TypeError:
964 usecols = [usecols]
965
966 _ensure_ndmin_ndarray_check_param(ndmin)
967
968 if comment is None:
969 comments = None
970 else:
971 # assume comments are a sequence of strings
972 if "" in comment:
973 raise ValueError(
974 "comments cannot be an empty string. Use comments=None to "
975 "disable comments."
976 )
977 comments = tuple(comment)
978 comment = None
979 if len(comments) == 0:
980 comments = None # No comments at all
981 elif len(comments) == 1:
982 # If there is only one comment, and that comment has one character,
983 # the normal parsing can deal with it just fine.
984 if isinstance(comments[0], str) and len(comments[0]) == 1:
985 comment = comments[0]
986 comments = None
987 else:
988 # Input validation if there are multiple comment characters
989 if delimiter in comments:
990 raise TypeError(
991 f"Comment characters '{comments}' cannot include the "
992 f"delimiter '{delimiter}'"
993 )
994
995 # comment is now either a 1 or 0 character string or a tuple:
996 if comments is not None:
        # Note: An earlier version supported two-character comments (and
        # could have been extended to multiple characters); we assume this
        # is rare enough not to optimize for.
        if quote is not None:
            raise ValueError(
                "when multiple comments or a multi-character comment is "
                "given, quotes are not supported.  In this case quotechar "
                "must be set to None.")

    if len(imaginary_unit) != 1:
        raise ValueError('len(imaginary_unit) must be 1.')

    _check_nonneg_int(skiplines)
    if max_rows is not None:
        _check_nonneg_int(max_rows)
    else:
        # Passing -1 to the C code means "read the entire file".
        max_rows = -1

    fh_closing_ctx = contextlib.nullcontext()
    filelike = False
    try:
        if isinstance(fname, os.PathLike):
            fname = os.fspath(fname)
        if isinstance(fname, str):
            fh = np.lib._datasource.open(fname, 'rt', encoding=encoding)
            if encoding is None:
                encoding = getattr(fh, 'encoding', 'latin1')

            fh_closing_ctx = contextlib.closing(fh)
            data = fh
            filelike = True
        else:
            if encoding is None:
                encoding = getattr(fname, 'encoding', 'latin1')
            data = iter(fname)
    except TypeError as e:
        raise ValueError(
            f"fname must be a string, filehandle, list of strings,\n"
            f"or generator. Got {type(fname)} instead.") from e

    with fh_closing_ctx:
        if comments is not None:
            if filelike:
                data = iter(data)
                filelike = False
            data = _preprocess_comments(data, comments, encoding)

        if read_dtype_via_object_chunks is None:
            arr = _load_from_filelike(
                data, delimiter=delimiter, comment=comment, quote=quote,
                imaginary_unit=imaginary_unit,
                usecols=usecols, skiplines=skiplines, max_rows=max_rows,
                converters=converters, dtype=dtype,
                encoding=encoding, filelike=filelike,
                byte_converters=byte_converters)

        else:
            # This branch reads the file into chunks of object arrays and then
            # casts them to the desired actual dtype.  This ensures correct
            # string-length and datetime-unit discovery (like `arr.astype()`).
            # Due to chunking, certain error reports are less clear, currently.
            if filelike:
                data = iter(data)  # cannot chunk when reading from file
                filelike = False

            c_byte_converters = False
            if read_dtype_via_object_chunks == "S":
                c_byte_converters = True  # Use latin1 rather than ascii

            chunks = []
            while max_rows != 0:
                if max_rows < 0:
                    chunk_size = _loadtxt_chunksize
                else:
                    chunk_size = min(_loadtxt_chunksize, max_rows)

                next_arr = _load_from_filelike(
                    data, delimiter=delimiter, comment=comment, quote=quote,
                    imaginary_unit=imaginary_unit,
                    usecols=usecols, skiplines=skiplines, max_rows=chunk_size,
                    converters=converters, dtype=dtype,
                    encoding=encoding, filelike=filelike,
                    byte_converters=byte_converters,
                    c_byte_converters=c_byte_converters)
                # Cast here already.  We hope that this is better even for
                # large files because the storage is more compact.  It could
                # be adapted (in principle the concatenate could cast).
                chunks.append(next_arr.astype(read_dtype_via_object_chunks))

                skiplines = 0  # Only have to skip for the first chunk
                if max_rows >= 0:
                    max_rows -= chunk_size
                if len(next_arr) < chunk_size:
                    # There was less data than requested, so we are done.
                    break

            # Need at least one chunk, but if empty, the last one may have
            # the wrong shape.
            if len(chunks) > 1 and len(chunks[-1]) == 0:
                del chunks[-1]
            if len(chunks) == 1:
                arr = chunks[0]
            else:
                arr = np.concatenate(chunks, axis=0)

    # NOTE: ndmin works as advertised for structured dtypes, but normally
    #       these would return a 1D result plus the structured dimension,
    #       so ndmin=2 adds a third dimension even when no squeezing occurs.
    #       A `squeeze=False` could be a better solution (pandas uses squeeze).
    arr = _ensure_ndmin_ndarray(arr, ndmin=ndmin)

    if arr.shape:
        if arr.shape[0] == 0:
            warnings.warn(
                f'loadtxt: input contained no data: "{fname}"',
                category=UserWarning,
                stacklevel=3
            )

    if unpack:
        # Unpack structured dtypes if requested:
        dt = arr.dtype
        if dt.names is not None:
            # For structured arrays, return an array for each field.
            return [arr[field] for field in dt.names]
        else:
            return arr.T
    else:
        return arr


@finalize_array_function_like
@set_module('numpy')
def loadtxt(fname, dtype=float, comments='#', delimiter=None,
            converters=None, skiprows=0, usecols=None, unpack=False,
            ndmin=0, encoding=None, max_rows=None, *, quotechar=None,
            like=None):
    r"""
    Load data from a text file.

    Parameters
    ----------
    fname : file, str, pathlib.Path, list of str, generator
        File, filename, list, or generator to read.  If the filename
        extension is ``.gz`` or ``.bz2``, the file is first decompressed. Note
        that generators must return bytes or strings. The strings
        in a list or produced by a generator are treated as lines.
    dtype : data-type, optional
        Data-type of the resulting array; default: float.  If this is a
        structured data-type, the resulting array will be 1-dimensional, and
        each row will be interpreted as an element of the array.  In this
        case, the number of columns used must match the number of fields in
        the data-type.
    comments : str or sequence of str or None, optional
        The characters or list of characters used to indicate the start of a
        comment. None implies no comments. For backwards compatibility, byte
        strings will be decoded as 'latin1'. The default is '#'.
    delimiter : str, optional
        The character used to separate the values. For backwards compatibility,
        byte strings will be decoded as 'latin1'. The default is whitespace.

        .. versionchanged:: 1.23.0
            Only single character delimiters are supported. Newline characters
            cannot be used as the delimiter.

    converters : dict or callable, optional
        Converter functions to customize value parsing. If `converters` is
        callable, the function is applied to all columns, else it must be a
        dict that maps column number to a parser function.
        See examples for further details.
        Default: None.

        .. versionchanged:: 1.23.0
            The ability to pass a single callable to be applied to all columns
            was added.

    skiprows : int, optional
        Skip the first `skiprows` lines, including comments; default: 0.
    usecols : int or sequence, optional
        Which columns to read, with 0 being the first. For example,
        ``usecols = (1,4,5)`` will extract the 2nd, 5th and 6th columns.
        The default, None, results in all columns being read.
    unpack : bool, optional
        If True, the returned array is transposed, so that arguments may be
        unpacked using ``x, y, z = loadtxt(...)``.  When used with a
        structured data-type, arrays are returned for each field.
        Default is False.
    ndmin : int, optional
        The returned array will have at least `ndmin` dimensions.
        Otherwise mono-dimensional axes will be squeezed.
        Legal values: 0 (default), 1 or 2.
    encoding : str, optional
        Encoding used to decode the input file. Does not apply to input
        streams. The special value 'bytes' enables backward compatibility
        workarounds that ensure you receive byte arrays as results if possible
        and passes 'latin1' encoded strings to converters. Override this value
        to receive unicode arrays and pass strings as input to converters. If
        set to None the system default is used. The default value is None.

        .. versionchanged:: 2.0
            Before NumPy 2, the default was ``'bytes'`` for Python 2
            compatibility. The default is now ``None``.

    max_rows : int, optional
        Read `max_rows` rows of content after `skiprows` lines. The default is
        to read all the rows. Note that empty rows containing no data such as
        empty lines and comment lines are not counted towards `max_rows`,
        while such lines are counted in `skiprows`.

        .. versionchanged:: 1.23.0
            Lines containing no data, including comment lines (e.g., lines
            starting with '#' or as specified via `comments`) are not counted
            towards `max_rows`.
    quotechar : unicode character or None, optional
        The character used to denote the start and end of a quoted item.
        Occurrences of the delimiter or comment characters are ignored within
        a quoted item. The default value is ``quotechar=None``, which means
        quoting support is disabled.

        If two consecutive instances of `quotechar` are found within a quoted
        field, the first is treated as an escape character. See examples.

        .. versionadded:: 1.23.0
    ${ARRAY_FUNCTION_LIKE}

        .. versionadded:: 1.20.0

    Returns
    -------
    out : ndarray
        Data read from the text file.

    See Also
    --------
    load, fromstring, fromregex
    genfromtxt : Load data with missing values handled as specified.
    scipy.io.loadmat : reads MATLAB data files

    Notes
    -----
    This function aims to be a fast reader for simply formatted files.  The
    `genfromtxt` function provides more sophisticated handling of, e.g.,
    lines with missing values.

    Each row in the input text file must have the same number of values to be
    able to read all values.  If all rows do not have same number of values, a
    subset of up to n columns (where n is the least number of values present
    in all rows) can be read by specifying the columns via `usecols`.

    The strings produced by the Python float.hex method can be used as
    input for floats.

    Examples
    --------
    >>> import numpy as np
    >>> from io import StringIO   # StringIO behaves like a file object
    >>> c = StringIO("0 1\n2 3")
    >>> np.loadtxt(c)
    array([[0., 1.],
           [2., 3.]])

    >>> d = StringIO("M 21 72\nF 35 58")
    >>> np.loadtxt(d, dtype={'names': ('gender', 'age', 'weight'),
    ...                      'formats': ('S1', 'i4', 'f4')})
    array([(b'M', 21, 72.), (b'F', 35, 58.)],
          dtype=[('gender', 'S1'), ('age', '<i4'), ('weight', '<f4')])

    >>> c = StringIO("1,0,2\n3,0,4")
    >>> x, y = np.loadtxt(c, delimiter=',', usecols=(0, 2), unpack=True)
    >>> x
    array([1., 3.])
    >>> y
    array([2., 4.])

    The `converters` argument is used to specify functions to preprocess the
    text prior to parsing. `converters` can be a dictionary that maps
    preprocessing functions to each column:

    >>> s = StringIO("1.618, 2.296\n3.141, 4.669\n")
    >>> conv = {
    ...     0: lambda x: np.floor(float(x)),  # conversion fn for column 0
    ...     1: lambda x: np.ceil(float(x)),   # conversion fn for column 1
    ... }
    >>> np.loadtxt(s, delimiter=",", converters=conv)
    array([[1., 3.],
           [3., 5.]])

    `converters` can be a callable instead of a dictionary, in which case it
    is applied to all columns:

    >>> s = StringIO("0xDE 0xAD\n0xC0 0xDE")
    >>> import functools
    >>> conv = functools.partial(int, base=16)
    >>> np.loadtxt(s, converters=conv)
    array([[222., 173.],
           [192., 222.]])

    This example shows how `converters` can be used to convert a field
    with a trailing minus sign into a negative number.

    >>> s = StringIO("10.01 31.25-\n19.22 64.31\n17.57- 63.94")
    >>> def conv(fld):
    ...     return -float(fld[:-1]) if fld.endswith("-") else float(fld)
    ...
    >>> np.loadtxt(s, converters=conv)
    array([[ 10.01, -31.25],
           [ 19.22,  64.31],
           [-17.57,  63.94]])

    Using a callable as the converter can be particularly useful for handling
    values with different formatting, e.g. floats with underscores:

    >>> s = StringIO("1 2.7 100_000")
    >>> np.loadtxt(s, converters=float)
    array([1.e+00, 2.7e+00, 1.e+05])

    This idea can be extended to automatically handle values specified in
    many different formats, such as hex values:

    >>> def conv(val):
    ...     try:
    ...         return float(val)
    ...     except ValueError:
    ...         return float.fromhex(val)
    >>> s = StringIO("1, 2.5, 3_000, 0b4, 0x1.4000000000000p+2")
    >>> np.loadtxt(s, delimiter=",", converters=conv)
    array([1.0e+00, 2.5e+00, 3.0e+03, 1.8e+02, 5.0e+00])

    Or a format where the ``-`` sign comes after the number:

    >>> s = StringIO("10.01 31.25-\n19.22 64.31\n17.57- 63.94")
    >>> conv = lambda x: -float(x[:-1]) if x.endswith("-") else float(x)
    >>> np.loadtxt(s, converters=conv)
    array([[ 10.01, -31.25],
           [ 19.22,  64.31],
           [-17.57,  63.94]])

    Support for quoted fields is enabled with the `quotechar` parameter.
    Comment and delimiter characters are ignored when they appear within a
    quoted item delineated by `quotechar`:

    >>> s = StringIO('"alpha, #42", 10.0\n"beta, #64", 2.0\n')
    >>> dtype = np.dtype([("label", "U12"), ("value", float)])
    >>> np.loadtxt(s, dtype=dtype, delimiter=",", quotechar='"')
    array([('alpha, #42', 10.), ('beta, #64', 2.)],
          dtype=[('label', '<U12'), ('value', '<f8')])

    Quoted fields can be separated by multiple whitespace characters:

    >>> s = StringIO('"alpha, #42"       10.0\n"beta, #64" 2.0\n')
    >>> dtype = np.dtype([("label", "U12"), ("value", float)])
    >>> np.loadtxt(s, dtype=dtype, delimiter=None, quotechar='"')
    array([('alpha, #42', 10.), ('beta, #64', 2.)],
          dtype=[('label', '<U12'), ('value', '<f8')])

    Two consecutive quote characters within a quoted field are treated as a
    single escaped character:

    >>> s = StringIO('"Hello, my name is ""Monty""!"')
    >>> np.loadtxt(s, dtype="U", delimiter=",", quotechar='"')
    array('Hello, my name is "Monty"!', dtype='<U26')

    Read subset of columns when all rows do not contain equal number of values:

    >>> d = StringIO("1 2\n2 4\n3 9 12\n4 16 20")
    >>> np.loadtxt(d, usecols=(0, 1))
    array([[ 1.,  2.],
           [ 2.,  4.],
           [ 3.,  9.],
           [ 4., 16.]])

    """

    if like is not None:
        return _loadtxt_with_like(
            like, fname, dtype=dtype, comments=comments, delimiter=delimiter,
            converters=converters, skiprows=skiprows, usecols=usecols,
            unpack=unpack, ndmin=ndmin, encoding=encoding,
            max_rows=max_rows
        )

    if isinstance(delimiter, bytes):
        delimiter.decode("latin1")

    if dtype is None:
        dtype = np.float64

    comment = comments
    # Control character type conversions for Py3 convenience
    if comment is not None:
        if isinstance(comment, (str, bytes)):
            comment = [comment]
        comment = [
            x.decode('latin1') if isinstance(x, bytes) else x for x in comment]
    if isinstance(delimiter, bytes):
        delimiter = delimiter.decode('latin1')

    arr = _read(fname, dtype=dtype, comment=comment, delimiter=delimiter,
                converters=converters, skiplines=skiprows, usecols=usecols,
                unpack=unpack, ndmin=ndmin, encoding=encoding,
                max_rows=max_rows, quote=quotechar)

    return arr


_loadtxt_with_like = array_function_dispatch()(loadtxt)


def _savetxt_dispatcher(fname, X, fmt=None, delimiter=None, newline=None,
                        header=None, footer=None, comments=None,
                        encoding=None):
    return (X,)


@array_function_dispatch(_savetxt_dispatcher)
def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
            footer='', comments='# ', encoding=None):
    """
    Save an array to a text file.

    Parameters
    ----------
    fname : filename, file handle or pathlib.Path
        If the filename ends in ``.gz``, the file is automatically saved in
        compressed gzip format.  `loadtxt` understands gzipped files
        transparently.
    X : 1D or 2D array_like
        Data to be saved to a text file.
    fmt : str or sequence of strs, optional
        A single format (%10.5f), a sequence of formats, or a
        multi-format string, e.g. 'Iteration %d -- %10.5f', in which
        case `delimiter` is ignored. For complex `X`, the legal options
        for `fmt` are:

        * a single specifier, ``fmt='%.4e'``, resulting in numbers formatted
          like ``' (%s+%sj)' % (fmt, fmt)``
        * a full string specifying every real and imaginary part, e.g.
          ``' %.4e %+.4ej %.4e %+.4ej %.4e %+.4ej'`` for 3 columns
        * a list of specifiers, one per column - in this case, the real
          and imaginary part must have separate specifiers,
          e.g. ``['%.3e + %.3ej', '(%.15e%+.15ej)']`` for 2 columns
    delimiter : str, optional
        String or character separating columns.
    newline : str, optional
        String or character separating lines.
    header : str, optional
        String that will be written at the beginning of the file.
    footer : str, optional
        String that will be written at the end of the file.
    comments : str, optional
        String that will be prepended to the ``header`` and ``footer`` strings,
        to mark them as comments. Default: '# ', as expected by e.g.
        ``numpy.loadtxt``.
    encoding : {None, str}, optional
        Encoding used to encode the output file. Does not apply to output
        streams. If the encoding is something other than 'bytes' or 'latin1'
        you will not be able to load the file in NumPy versions < 1.14.
        Default is 'latin1'.

    See Also
    --------
    save : Save an array to a binary file in NumPy ``.npy`` format
    savez : Save several arrays into an uncompressed ``.npz`` archive
    savez_compressed : Save several arrays into a compressed ``.npz`` archive

    Notes
    -----
    Further explanation of the `fmt` parameter
    (``%[flag]width[.precision]specifier``):

    flags:
        ``-`` : left justify

        ``+`` : Forces to precede result with + or -.

        ``0`` : Left pad the number with zeros instead of space (see width).

    width:
        Minimum number of characters to be printed. The value is not truncated
        if it has more characters.

    precision:
        - For integer specifiers (eg. ``d,i,o,x``), the minimum number of
          digits.
        - For ``e, E`` and ``f`` specifiers, the number of digits to print
          after the decimal point.
        - For ``g`` and ``G``, the maximum number of significant digits.
        - For ``s``, the maximum number of characters.

    specifiers:
        ``c`` : character

        ``d`` or ``i`` : signed decimal integer

        ``e`` or ``E`` : scientific notation with ``e`` or ``E``.

        ``f`` : decimal floating point

        ``g,G`` : use the shorter of ``e,E`` or ``f``

        ``o`` : signed octal

        ``s`` : string of characters

        ``u`` : unsigned decimal integer

        ``x,X`` : unsigned hexadecimal integer

    This explanation of ``fmt`` is not complete, for an exhaustive
    specification see [1]_.

    References
    ----------
    .. [1] `Format Specification Mini-Language
           <https://docs.python.org/library/string.html#format-specification-mini-language>`_,
           Python Documentation.

    Examples
    --------
    >>> import numpy as np
    >>> x = y = z = np.arange(0.0, 5.0, 1.0)
    >>> np.savetxt('test.out', x, delimiter=',')   # X is an array
    >>> np.savetxt('test.out', (x, y, z))   # x,y,z equal sized 1D arrays
    >>> np.savetxt('test.out', x, fmt='%1.4e')   # use exponential notation
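
    A sketch of writing complex data with a single format specifier; each
    value is written as ``(real+imagj)``:

    >>> z = np.array([1 + 2j, 3 - 4j])
    >>> np.savetxt('test.out', z, fmt='%.2e')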

    """

    class WriteWrap:
        """Convert to bytes on bytestream inputs.

        """
        def __init__(self, fh, encoding):
            self.fh = fh
            self.encoding = encoding
            self.do_write = self.first_write

        def close(self):
            self.fh.close()

        def write(self, v):
            self.do_write(v)

        def write_bytes(self, v):
            if isinstance(v, bytes):
                self.fh.write(v)
            else:
                self.fh.write(v.encode(self.encoding))

        def write_normal(self, v):
            self.fh.write(asunicode(v))

        def first_write(self, v):
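            # Sniff the stream type on the first write: try text first and
            # fall back to bytes, then cache the working writer for later
            # calls.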
            try:
                self.write_normal(v)
                self.write = self.write_normal
            except TypeError:
                # input is probably a bytestream
                self.write_bytes(v)
                self.write = self.write_bytes

    own_fh = False
    if isinstance(fname, os.PathLike):
        fname = os.fspath(fname)
    if _is_string_like(fname):
        # datasource doesn't support creating a new file ...
        open(fname, 'wt').close()
        fh = np.lib._datasource.open(fname, 'wt', encoding=encoding)
        own_fh = True
    elif hasattr(fname, 'write'):
        # wrap to handle byte output streams
        fh = WriteWrap(fname, encoding or 'latin1')
    else:
        raise ValueError('fname must be a string or file handle')

    try:
        X = np.asarray(X)

        # Handle 1-dimensional arrays
        if X.ndim == 0 or X.ndim > 2:
            raise ValueError(
                "Expected 1D or 2D array, got %dD array instead" % X.ndim)
        elif X.ndim == 1:
            # Common case -- 1d array of numbers
            if X.dtype.names is None:
                X = np.atleast_2d(X).T
                ncol = 1

            # Complex dtype -- each field indicates a separate column
            else:
                ncol = len(X.dtype.names)
        else:
            ncol = X.shape[1]

        iscomplex_X = np.iscomplexobj(X)
        # `fmt` can be a string with multiple insertion points or a
        # list of formats.  E.g. '%10.5f\t%10d' or ('%10.5f', '%10d')
1594 if type(fmt) in (list, tuple):
1595 if len(fmt) != ncol:
1596 raise AttributeError('fmt has wrong shape. %s' % str(fmt))
1597 format = delimiter.join(fmt)
1598 elif isinstance(fmt, str):
1599 n_fmt_chars = fmt.count('%')
1600 error = ValueError('fmt has wrong number of %% formats: %s' % fmt)
1601 if n_fmt_chars == 1:
1602 if iscomplex_X:
1603 fmt = [' (%s+%sj)' % (fmt, fmt), ] * ncol
1604 else:
1605 fmt = [fmt, ] * ncol
1606 format = delimiter.join(fmt)
1607 elif iscomplex_X and n_fmt_chars != (2 * ncol):
1608 raise error
1609 elif ((not iscomplex_X) and n_fmt_chars != ncol):
1610 raise error
1611 else:
1612 format = fmt
1613 else:
1614 raise ValueError('invalid fmt: %r' % (fmt,))
1615
        if len(header) > 0:
            header = header.replace('\n', '\n' + comments)
            fh.write(comments + header + newline)
        if iscomplex_X:
            for row in X:
                row2 = []
                for number in row:
                    row2.append(number.real)
                    row2.append(number.imag)
                s = format % tuple(row2) + newline
                fh.write(s.replace('+-', '-'))
        else:
            for row in X:
                try:
                    v = format % tuple(row) + newline
                except TypeError as e:
                    raise TypeError("Mismatch between array dtype ('%s') and "
                                    "format specifier ('%s')"
                                    % (str(X.dtype), format)) from e
                fh.write(v)

        if len(footer) > 0:
            footer = footer.replace('\n', '\n' + comments)
            fh.write(comments + footer + newline)
    finally:
        if own_fh:
            fh.close()


@set_module('numpy')
def fromregex(file, regexp, dtype, encoding=None):
    r"""
    Construct an array from a text file, using regular expression parsing.

    The returned array is always a structured array, and is constructed from
    all matches of the regular expression in the file. Groups in the regular
    expression are converted to fields of the structured array.

    Parameters
    ----------
    file : file, str, or pathlib.Path
        Filename or file object to read.

        .. versionchanged:: 1.22.0
            Now accepts `os.PathLike` implementations.

    regexp : str or regexp
        Regular expression used to parse the file.
        Groups in the regular expression correspond to fields in the dtype.
    dtype : dtype or list of dtypes
        Dtype for the structured array; must be a structured datatype.
    encoding : str, optional
        Encoding used to decode the input file. Does not apply to input
        streams.

    Returns
    -------
    output : ndarray
        The output array, containing the part of the content of `file` that
        was matched by `regexp`. `output` is always a structured array.

    Raises
    ------
    TypeError
        When `dtype` is not a valid dtype for a structured array.

    See Also
    --------
    fromstring, loadtxt

    Notes
    -----
    Dtypes for structured arrays can be specified in several forms, but all
    forms specify at least the data type and field name. For details see
    `basics.rec`.

    Examples
    --------
    >>> import numpy as np
    >>> from io import StringIO
    >>> text = StringIO("1312 foo\n1534 bar\n444 qux")

    >>> regexp = r"(\d+)\s+(...)"  # match [digits, whitespace, anything]
    >>> output = np.fromregex(text, regexp,
    ...                       [('num', np.int64), ('key', 'S3')])
    >>> output
    array([(1312, b'foo'), (1534, b'bar'), ( 444, b'qux')],
          dtype=[('num', '<i8'), ('key', 'S3')])
    >>> output['num']
    array([1312, 1534,  444])
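
    A pattern with a single group yields a one-field structured array
    (an illustrative addition; it reuses ``text`` from above):

    >>> _ = text.seek(0)
    >>> out = np.fromregex(text, r"(\d+)\s+...", [('num', np.int64)])
    >>> out['num']
    array([1312, 1534,  444])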

    """
    own_fh = False
    if not hasattr(file, "read"):
        file = os.fspath(file)
        file = np.lib._datasource.open(file, 'rt', encoding=encoding)
        own_fh = True

    try:
        if not isinstance(dtype, np.dtype):
            dtype = np.dtype(dtype)
        if dtype.names is None:
            raise TypeError('dtype must be a structured datatype.')

        content = file.read()
        if isinstance(content, bytes) and isinstance(regexp, str):
            regexp = asbytes(regexp)

        if not hasattr(regexp, 'match'):
            regexp = re.compile(regexp)
        seq = regexp.findall(content)
        if seq and not isinstance(seq[0], tuple):
            # Only one group is in the regexp.
            # Create the new array as a single data-type and then
            # re-interpret as a single-field structured array.
            newdtype = np.dtype(dtype[dtype.names[0]])
            output = np.array(seq, dtype=newdtype)
            output.dtype = dtype
        else:
            output = np.array(seq, dtype=dtype)

        return output
    finally:
        if own_fh:
            file.close()


#####--------------------------------------------------------------------------
#---- --- ASCII functions ---
#####--------------------------------------------------------------------------


@finalize_array_function_like
@set_module('numpy')
def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
               skip_header=0, skip_footer=0, converters=None,
               missing_values=None, filling_values=None, usecols=None,
               names=None, excludelist=None,
               deletechars=''.join(sorted(NameValidator.defaultdeletechars)),
               replace_space='_', autostrip=False, case_sensitive=True,
               defaultfmt="f%i", unpack=None, usemask=False, loose=True,
               invalid_raise=True, max_rows=None, encoding=None,
               *, ndmin=0, like=None):
    """
    Load data from a text file, with missing values handled as specified.

    Each line past the first `skip_header` lines is split at the `delimiter`
    character, and characters following the `comments` character are discarded.

    Parameters
    ----------
    fname : file, str, pathlib.Path, list of str, generator
        File, filename, list, or generator to read. If the filename
        extension is ``.gz`` or ``.bz2``, the file is first decompressed. Note
        that generators must return bytes or strings. The strings
        in a list or produced by a generator are treated as lines.
    dtype : dtype, optional
        Data type of the resulting array.
        If None, the dtypes will be determined by the contents of each
        column, individually.
    comments : str, optional
        The character used to indicate the start of a comment.
        All the characters occurring on a line after a comment are discarded.
    delimiter : str, int, or sequence, optional
        The string used to separate values. By default, any consecutive
        whitespaces act as delimiter. An integer or sequence of integers
        can also be provided as width(s) of each field.
    skiprows : int, optional
        `skiprows` was removed in numpy 1.10. Please use `skip_header` instead.
    skip_header : int, optional
        The number of lines to skip at the beginning of the file.
    skip_footer : int, optional
        The number of lines to skip at the end of the file.
    converters : variable, optional
        The set of functions that convert the data of a column to a value.
        The converters can also be used to provide a default value
        for missing data: ``converters = {3: lambda s: float(s or 0)}``.
    missing : variable, optional
        `missing` was removed in numpy 1.10. Please use `missing_values`
        instead.
    missing_values : variable, optional
        The set of strings corresponding to missing data.
    filling_values : variable, optional
        The set of values to be used as default when the data are missing.
    usecols : sequence, optional
        Which columns to read, with 0 being the first. For example,
        ``usecols = (1, 4, 5)`` will extract the 2nd, 5th and 6th columns.
    names : {None, True, str, sequence}, optional
        If `names` is True, the field names are read from the first line after
        the first `skip_header` lines. This line can optionally be preceded
        by a comment delimiter. Any content before the comment delimiter is
        discarded. If `names` is a sequence or a single string of
        comma-separated names, the names will be used to define the field
        names in a structured dtype. If `names` is None, the names of the
        dtype fields will be used, if any.
    excludelist : sequence, optional
        A list of names to exclude. This list is appended to the default list
        ['return','file','print']. Excluded names are appended with an
        underscore: for example, `file` would become `file_`.
    deletechars : str, optional
        A string combining invalid characters that must be deleted from the
        names.
    defaultfmt : str, optional
        A format used to define default field names, such as "f%i" or "f_%02i".
    autostrip : bool, optional
        Whether to automatically strip white spaces from the variables.
    replace_space : char, optional
        Character(s) used in replacement of white spaces in the variable
        names. By default, use a '_'.
    case_sensitive : {True, False, 'upper', 'lower'}, optional
        If True, field names are case sensitive.
        If False or 'upper', field names are converted to upper case.
        If 'lower', field names are converted to lower case.
    unpack : bool, optional
        If True, the returned array is transposed, so that arguments may be
        unpacked using ``x, y, z = genfromtxt(...)``. When used with a
        structured data-type, arrays are returned for each field.
        Default is False.
    usemask : bool, optional
        If True, return a masked array.
        If False, return a regular array.
    loose : bool, optional
        If True, do not raise errors for invalid values.
    invalid_raise : bool, optional
        If True, an exception is raised if an inconsistency is detected in the
        number of columns.
        If False, a warning is emitted and the offending lines are skipped.
    max_rows : int, optional
        The maximum number of rows to read. Must not be used with skip_footer
        at the same time. If given, the value must be at least 1. Default is
        to read the entire file.
    encoding : str, optional
        Encoding used to decode the input file. Does not apply when `fname`
        is a file object. The special value 'bytes' enables backward
        compatibility workarounds that ensure that you receive byte arrays
        when possible and passes latin1 encoded strings to converters.
        Override this value to receive unicode arrays and pass strings
        as input to converters. If set to None the system default is used.

        .. versionchanged:: 2.0
            Before NumPy 2, the default was ``'bytes'`` for Python 2
            compatibility. The default is now ``None``.

    ndmin : int, optional
        Same parameter as `loadtxt`.

        .. versionadded:: 1.23.0
    ${ARRAY_FUNCTION_LIKE}

        .. versionadded:: 1.20.0

    Returns
    -------
    out : ndarray
        Data read from the text file. If `usemask` is True, this is a
        masked array.

    See Also
    --------
    numpy.loadtxt : equivalent function when no data is missing.

    Notes
    -----
    * When spaces are used as delimiters, or when no delimiter has been given
      as input, there should not be any missing data between two fields.
    * When variables are named (either by a flexible dtype or with a `names`
      sequence), there must not be any header in the file (else a ValueError
      exception is raised).
    * Individual values are not stripped of spaces by default.
      When using a custom converter, make sure the function does remove spaces.
    * Custom converters may receive unexpected values due to dtype
      discovery.

    References
    ----------
    .. [1] NumPy User Guide, section `I/O with NumPy
           <https://docs.scipy.org/doc/numpy/user/basics.io.genfromtxt.html>`_.

    Examples
    --------
    >>> from io import StringIO
    >>> import numpy as np

    Comma delimited file with mixed dtype

    >>> s = StringIO("1,1.3,abcde")
    >>> data = np.genfromtxt(s, dtype=[('myint','i8'),('myfloat','f8'),
    ...                      ('mystring','S5')], delimiter=",")
    >>> data
    array((1, 1.3, b'abcde'),
          dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', 'S5')])

    Using dtype = None

    >>> _ = s.seek(0)  # needed for StringIO example only
    >>> data = np.genfromtxt(s, dtype=None,
    ...                      names=['myint','myfloat','mystring'], delimiter=",")
    >>> data
    array((1, 1.3, 'abcde'),
          dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', '<U5')])

    Specifying dtype and names

    >>> _ = s.seek(0)
    >>> data = np.genfromtxt(s, dtype="i8,f8,S5",
    ...                      names=['myint','myfloat','mystring'], delimiter=",")
    >>> data
    array((1, 1.3, b'abcde'),
          dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', 'S5')])

    An example with fixed-width columns

    >>> s = StringIO("11.3abcde")
    >>> data = np.genfromtxt(s, dtype=None, names=['intvar','fltvar','strvar'],
    ...                      delimiter=[1,3,5])
    >>> data
    array((1, 1.3, 'abcde'),
          dtype=[('intvar', '<i8'), ('fltvar', '<f8'), ('strvar', '<U5')])

    An example to show comments

    >>> f = StringIO('''
    ... text,# of chars
    ... hello world,11
    ... numpy,5''')
    >>> np.genfromtxt(f, dtype='S12,S12', delimiter=',')
    array([(b'text', b''), (b'hello world', b'11'), (b'numpy', b'5')],
          dtype=[('f0', 'S12'), ('f1', 'S12')])
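
    An example with missing values, using a scalar `filling_values` (a small
    illustrative sketch; the default ``dtype=float`` applies):

    >>> s = StringIO('''1,,3
    ... 4,5,6''')
    >>> np.genfromtxt(s, delimiter=",", filling_values=-1)
    array([[ 1., -1.,  3.],
           [ 4.,  5.,  6.]])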

    """

    if like is not None:
        return _genfromtxt_with_like(
            like, fname, dtype=dtype, comments=comments, delimiter=delimiter,
            skip_header=skip_header, skip_footer=skip_footer,
            converters=converters, missing_values=missing_values,
            filling_values=filling_values, usecols=usecols, names=names,
            excludelist=excludelist, deletechars=deletechars,
            replace_space=replace_space, autostrip=autostrip,
            case_sensitive=case_sensitive, defaultfmt=defaultfmt,
            unpack=unpack, usemask=usemask, loose=loose,
            invalid_raise=invalid_raise, max_rows=max_rows, encoding=encoding,
            ndmin=ndmin,
        )

    _ensure_ndmin_ndarray_check_param(ndmin)

    if max_rows is not None:
        if skip_footer:
            raise ValueError(
                "The keywords 'skip_footer' and 'max_rows' cannot be "
                "specified at the same time.")
        if max_rows < 1:
            raise ValueError("'max_rows' must be at least 1.")

    if usemask:
        from numpy.ma import MaskedArray, make_mask_descr
    # Check the input dictionary of converters
    user_converters = converters or {}
    if not isinstance(user_converters, dict):
        raise TypeError(
            "The input argument 'converters' should be a valid dictionary "
            "(got '%s' instead)" % type(user_converters))

    if encoding == 'bytes':
        encoding = None
        byte_converters = True
    else:
        byte_converters = False

    # Initialize the filehandle, the LineSplitter and the NameValidator
    if isinstance(fname, os.PathLike):
        fname = os.fspath(fname)
    if isinstance(fname, str):
        fid = np.lib._datasource.open(fname, 'rt', encoding=encoding)
        fid_ctx = contextlib.closing(fid)
    else:
        fid = fname
        fid_ctx = contextlib.nullcontext(fid)
    try:
        fhd = iter(fid)
    except TypeError as e:
        raise TypeError(
            "fname must be a string, a filehandle, a sequence of strings,\n"
            f"or an iterator of strings. Got {type(fname)} instead."
        ) from e
    with fid_ctx:
        split_line = LineSplitter(delimiter=delimiter, comments=comments,
                                  autostrip=autostrip, encoding=encoding)
        validate_names = NameValidator(excludelist=excludelist,
                                       deletechars=deletechars,
                                       case_sensitive=case_sensitive,
                                       replace_space=replace_space)

        # Skip the first `skip_header` rows
        try:
            for i in range(skip_header):
                next(fhd)

            # Keep on until we find the first valid values
            first_values = None

            while not first_values:
                first_line = _decode_line(next(fhd), encoding)
                if (names is True) and (comments is not None):
                    if comments in first_line:
                        first_line = (
                            ''.join(first_line.split(comments)[1:]))
                first_values = split_line(first_line)
        except StopIteration:
            # return an empty array if the datafile is empty
            first_line = ''
            first_values = []
            warnings.warn(
                'genfromtxt: Empty input file: "%s"' % fname, stacklevel=2
            )

        # Should we take the first values as names?
        if names is True:
            fval = first_values[0].strip()
            if comments is not None:
                if fval in comments:
                    del first_values[0]

        # Check the columns to use: make sure `usecols` is a list
        if usecols is not None:
            try:
                usecols = [_.strip() for _ in usecols.split(",")]
            except AttributeError:
                try:
                    usecols = list(usecols)
                except TypeError:
                    usecols = [usecols, ]
        nbcols = len(usecols or first_values)

        # Check the names and overwrite the dtype.names if needed
        if names is True:
            names = validate_names([str(_.strip()) for _ in first_values])
            first_line = ''
        elif _is_string_like(names):
            names = validate_names([_.strip() for _ in names.split(',')])
        elif names:
            names = validate_names(names)
        # Get the dtype
        if dtype is not None:
            dtype = easy_dtype(dtype, defaultfmt=defaultfmt, names=names,
                               excludelist=excludelist,
                               deletechars=deletechars,
                               case_sensitive=case_sensitive,
                               replace_space=replace_space)
        # Make sure `names` is a list (for Python 2.5 compatibility)
        if names is not None:
            names = list(names)

        if usecols:
            for (i, current) in enumerate(usecols):
                # if usecols is a list of names, convert to a list of indices
                if _is_string_like(current):
                    usecols[i] = names.index(current)
                elif current < 0:
                    usecols[i] = current + len(first_values)
            # If the dtype is not None, make sure we update it
            if (dtype is not None) and (len(dtype) > nbcols):
                descr = dtype.descr
                dtype = np.dtype([descr[_] for _ in usecols])
                names = list(dtype.names)
            # If `names` is not None, update the names
            elif (names is not None) and (len(names) > nbcols):
                names = [names[_] for _ in usecols]
        elif (names is not None) and (dtype is not None):
            names = list(dtype.names)

        # Process the missing values ...............................
        # Rename missing_values for convenience
        user_missing_values = missing_values or ()
        if isinstance(user_missing_values, bytes):
            user_missing_values = user_missing_values.decode('latin1')

        # Define the list of missing_values (one column: one list)
        missing_values = [[''] for _ in range(nbcols)]

        # We have a dictionary: process it field by field
        if isinstance(user_missing_values, dict):
            # Loop on the items
            for (key, val) in user_missing_values.items():
                # Is the key a string?
                if _is_string_like(key):
                    try:
                        # Transform it into an integer
                        key = names.index(key)
                    except ValueError:
                        # We couldn't find it: the name must have been dropped
                        continue
                # Redefine the key as needed if it's a column number
                if usecols:
                    try:
                        key = usecols.index(key)
                    except ValueError:
                        pass
                # Transform the value into a list of strings
                if isinstance(val, (list, tuple)):
                    val = [str(_) for _ in val]
                else:
                    val = [str(val), ]
                # Add the value(s) to the current list of missing
                if key is None:
                    # None acts as default
                    for miss in missing_values:
                        miss.extend(val)
                else:
                    missing_values[key].extend(val)
        # We have a sequence: each item matches a column
        elif isinstance(user_missing_values, (list, tuple)):
            for (value, entry) in zip(user_missing_values, missing_values):
                value = str(value)
                if value not in entry:
                    entry.append(value)
        # We have a string: apply it to all entries
        elif isinstance(user_missing_values, str):
            user_value = user_missing_values.split(",")
            for entry in missing_values:
                entry.extend(user_value)
        # We have something else: apply it to all entries
        else:
            for entry in missing_values:
                entry.extend([str(user_missing_values)])

        # Process the filling_values ...............................
        # Rename the input for convenience
        user_filling_values = filling_values
        if user_filling_values is None:
            user_filling_values = []
        # Define the default
        filling_values = [None] * nbcols
        # We have a dictionary: update each entry individually
        if isinstance(user_filling_values, dict):
            for (key, val) in user_filling_values.items():
                if _is_string_like(key):
                    try:
                        # Transform it into an integer
                        key = names.index(key)
                    except ValueError:
                        # We couldn't find it: the name must have been dropped
                        continue
                # Redefine the key if it's a column number
                # and usecols is defined
                if usecols:
                    try:
                        key = usecols.index(key)
                    except ValueError:
                        pass
                # Add the value to the list
                filling_values[key] = val
        # We have a sequence: update on a one-to-one basis
        elif isinstance(user_filling_values, (list, tuple)):
            n = len(user_filling_values)
            if (n <= nbcols):
                filling_values[:n] = user_filling_values
            else:
                filling_values = user_filling_values[:nbcols]
        # We have something else: use it for all entries
        else:
            filling_values = [user_filling_values] * nbcols

        # Initialize the converters ................................
        if dtype is None:
            # Note: we can't use a [...]*nbcols, as we would have 3 times
            # the same converter, instead of 3 different converters.
            converters = [
                StringConverter(None, missing_values=miss, default=fill)
                for (miss, fill) in zip(missing_values, filling_values)
            ]
        else:
            dtype_flat = flatten_dtype(dtype, flatten_base=True)
            # Initialize the converters
            if len(dtype_flat) > 1:
                # Flexible type: get a converter from each dtype
                zipit = zip(dtype_flat, missing_values, filling_values)
                converters = [StringConverter(dt,
                                              locked=True,
                                              missing_values=miss,
                                              default=fill)
                              for (dt, miss, fill) in zipit]
            else:
                # Set to a default converter (but w/ different missing values)
                zipit = zip(missing_values, filling_values)
                converters = [StringConverter(dtype,
                                              locked=True,
                                              missing_values=miss,
                                              default=fill)
                              for (miss, fill) in zipit]
        # Update the converters to use the user-defined ones
        uc_update = []
        for (j, conv) in user_converters.items():
            # If the converter is specified by column names,
            # use the index instead
            if _is_string_like(j):
                try:
                    j = names.index(j)
                    i = j
                except ValueError:
                    continue
            elif usecols:
                try:
                    i = usecols.index(j)
                except ValueError:
                    # Unused converter specified
                    continue
            else:
                i = j
            # Find the value to test - first_line is not filtered by usecols:
            if len(first_line):
                testing_value = first_values[j]
            else:
                testing_value = None
            if conv is bytes:
                user_conv = asbytes
            elif byte_converters:
                # Converters may use decode to workaround numpy's old
                # behavior, so encode the string again before passing
                # to the user converter.
                def tobytes_first(x, conv):
                    if type(x) is bytes:
                        return conv(x)
                    return conv(x.encode("latin1"))
                user_conv = functools.partial(tobytes_first, conv=conv)
            else:
                user_conv = conv
            converters[i].update(user_conv, locked=True,
                                 testing_value=testing_value,
                                 default=filling_values[i],
                                 missing_values=missing_values[i],)
            uc_update.append((i, user_conv))
        # Make sure we have the corrected keys in user_converters...
        user_converters.update(uc_update)

        # Fixme: possible error as following variable never used.
        # miss_chars = [_.missing_values for _ in converters]

        # Initialize the output lists ...
        # ... rows
        rows = []
        append_to_rows = rows.append
        # ... masks
        if usemask:
            masks = []
            append_to_masks = masks.append
        # ... invalid
        invalid = []
        append_to_invalid = invalid.append

        # Parse each line
        for (i, line) in enumerate(itertools.chain([first_line, ], fhd)):
            values = split_line(line)
            nbvalues = len(values)
            # Skip an empty line
            if nbvalues == 0:
                continue
            if usecols:
                # Select only the columns we need
                try:
                    values = [values[_] for _ in usecols]
                except IndexError:
                    append_to_invalid((i + skip_header + 1, nbvalues))
                    continue
            elif nbvalues != nbcols:
                append_to_invalid((i + skip_header + 1, nbvalues))
                continue
            # Store the values
            append_to_rows(tuple(values))
            if usemask:
                append_to_masks(tuple([v.strip() in m
                                       for (v, m) in zip(values,
                                                         missing_values)]))
            if len(rows) == max_rows:
                break

        # Upgrade the converters (if needed)
        if dtype is None:
            for (i, converter) in enumerate(converters):
                current_column = [itemgetter(i)(_m) for _m in rows]
                try:
                    converter.iterupgrade(current_column)
                except ConverterLockError:
                    errmsg = "Converter #%i is locked and cannot be upgraded: " % i
                    current_column = map(itemgetter(i), rows)
                    for (j, value) in enumerate(current_column):
                        try:
                            converter.upgrade(value)
                        except (ConverterError, ValueError):
                            errmsg += "(occurred line #%i for value '%s')"
                            errmsg %= (j + 1 + skip_header, value)
                    raise ConverterError(errmsg)

        # Check that we don't have invalid values
        nbinvalid = len(invalid)
        if nbinvalid > 0:
            nbrows = len(rows) + nbinvalid - skip_footer
            # Construct the error message
            template = "    Line #%%i (got %%i columns instead of %i)" % nbcols
            if skip_footer > 0:
                nbinvalid_skipped = len([_ for _ in invalid
                                         if _[0] > nbrows + skip_header])
                invalid = invalid[:nbinvalid - nbinvalid_skipped]
                skip_footer -= nbinvalid_skipped
            errmsg = [template % (i, nb)
                      for (i, nb) in invalid]
            if len(errmsg):
                errmsg.insert(0, "Some errors were detected!")
                errmsg = "\n".join(errmsg)
                # Raise an exception?
                if invalid_raise:
                    raise ValueError(errmsg)
                # Issue a warning?
                else:
                    warnings.warn(errmsg, ConversionWarning, stacklevel=2)

        # Strip the last skip_footer data
        if skip_footer > 0:
            rows = rows[:-skip_footer]
            if usemask:
                masks = masks[:-skip_footer]

        # Convert each value according to the converter:
        # We want to modify the list in place to avoid creating a new one...
        if loose:
            rows = list(
                zip(*[[conv._loose_call(_r) for _r in map(itemgetter(i), rows)]
                      for (i, conv) in enumerate(converters)]))
        else:
            rows = list(
                zip(*[[conv._strict_call(_r) for _r in map(itemgetter(i), rows)]
                      for (i, conv) in enumerate(converters)]))

        # Reset the dtype
        data = rows
        if dtype is None:
            # Get the dtypes from the types of the converters
            column_types = [conv.type for conv in converters]
            # Find the columns with strings...
            strcolidx = [i for (i, v) in enumerate(column_types)
                         if v == np.str_]

            if byte_converters and strcolidx:
                # convert strings back to bytes for backward compatibility
                warnings.warn(
                    "Reading unicode strings without specifying the encoding "
                    "argument is deprecated. Set the encoding, use None for the "
                    "system default.",
                    np.exceptions.VisibleDeprecationWarning, stacklevel=2)

                def encode_unicode_cols(row_tup):
                    row = list(row_tup)
                    for i in strcolidx:
                        row[i] = row[i].encode('latin1')
                    return tuple(row)

                try:
                    data = [encode_unicode_cols(r) for r in data]
                except UnicodeEncodeError:
                    pass
                else:
                    for i in strcolidx:
                        column_types[i] = np.bytes_

            # Update string types to be the right length
            sized_column_types = column_types[:]
            for i, col_type in enumerate(column_types):
                if np.issubdtype(col_type, np.character):
                    n_chars = max(len(row[i]) for row in data)
                    sized_column_types[i] = (col_type, n_chars)

            if names is None:
                # If the dtype is uniform (before sizing strings)
                base = {
                    c_type
                    for c, c_type in zip(converters, column_types)
                    if c._checked}
                if len(base) == 1:
                    uniform_type, = base
                    (ddtype, mdtype) = (uniform_type, bool)
                else:
                    ddtype = [(defaultfmt % i, dt)
                              for (i, dt) in enumerate(sized_column_types)]
                    if usemask:
                        mdtype = [(defaultfmt % i, bool)
                                  for (i, dt) in enumerate(sized_column_types)]
            else:
                ddtype = list(zip(names, sized_column_types))
                mdtype = list(zip(names, [bool] * len(sized_column_types)))
            output = np.array(data, dtype=ddtype)
            if usemask:
                outputmask = np.array(masks, dtype=mdtype)
        else:
            # Overwrite the initial dtype names if needed
            if names and dtype.names is not None:
                dtype.names = names
            # Case 1. We have a structured type
            if len(dtype_flat) > 1:
                # Nested dtype, e.g. [('a', int), ('b', [('b0', int), ('b1', 'f4')])]
                # First, create the array using a flattened dtype:
                # [('a', int), ('b1', int), ('b2', float)]
                # Then, view the array using the specified dtype.
                if 'O' in (_.char for _ in dtype_flat):
                    if has_nested_fields(dtype):
                        raise NotImplementedError(
                            "Nested fields involving objects are not supported...")
                    else:
                        output = np.array(data, dtype=dtype)
                else:
                    rows = np.array(data, dtype=[('', _) for _ in dtype_flat])
                    output = rows.view(dtype)
                # Now, process the rowmasks the same way
                if usemask:
                    rowmasks = np.array(
                        masks, dtype=np.dtype([('', bool) for t in dtype_flat]))
                    # Construct the new dtype
                    mdtype = make_mask_descr(dtype)
                    outputmask = rowmasks.view(mdtype)
            # Case #2. We have a basic dtype
            else:
                # We used some user-defined converters
                if user_converters:
                    ishomogeneous = True
                    descr = []
                    for i, ttype in enumerate([conv.type for conv in converters]):
                        # Keep the dtype of the current converter
                        if i in user_converters:
                            ishomogeneous &= (ttype == dtype.type)
                            if np.issubdtype(ttype, np.character):
                                ttype = (ttype, max(len(row[i]) for row in data))
                            descr.append(('', ttype))
                        else:
                            descr.append(('', dtype))
                    # So we changed the dtype?
                    if not ishomogeneous:
                        # We have more than one field
                        if len(descr) > 1:
                            dtype = np.dtype(descr)
                        # We have only one field: drop the name if not needed.
                        else:
                            dtype = np.dtype(ttype)
                #
                output = np.array(data, dtype)
                if usemask:
                    if dtype.names is not None:
                        mdtype = [(_, bool) for _ in dtype.names]
                    else:
                        mdtype = bool
                    outputmask = np.array(masks, dtype=mdtype)
        # Try to take care of the missing data we missed
        names = output.dtype.names
        if usemask and names:
            for (name, conv) in zip(names, converters):
                missing_values = [conv(_) for _ in conv.missing_values
                                  if _ != '']
                for mval in missing_values:
                    outputmask[name] |= (output[name] == mval)
        # Construct the final array
        if usemask:
            output = output.view(MaskedArray)
            output._mask = outputmask

        output = _ensure_ndmin_ndarray(output, ndmin=ndmin)

        if unpack:
            if names is None:
                return output.T
            elif len(names) == 1:
                # squeeze single-name dtypes too
                return output[names[0]]
            else:
                # For structured arrays with multiple fields,
                # return an array for each field.
                return [output[field] for field in names]
        return output


_genfromtxt_with_like = array_function_dispatch()(genfromtxt)


def recfromtxt(fname, **kwargs):
    """
    Load ASCII data from a file and return it in a record array.

    If ``usemask=False`` a standard `recarray` is returned;
    if ``usemask=True`` a MaskedRecords array is returned.

    .. deprecated:: 2.0
        Use `numpy.genfromtxt` instead.

    Parameters
    ----------
    fname, kwargs : For a description of input parameters, see `genfromtxt`.

    See Also
    --------
    numpy.genfromtxt : generic function

    Notes
    -----
    By default, `dtype` is None, which means that the data-type of the output
    array will be determined from the data.
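
    Examples
    --------
    A roughly equivalent replacement (an illustrative sketch):

    >>> import numpy as np
    >>> from io import StringIO
    >>> rec = np.genfromtxt(StringIO("1 2"), dtype=None).view(np.recarray)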

    """

    # Deprecated in NumPy 2.0, 2023-07-11
    warnings.warn(
        "`recfromtxt` is deprecated, "
        "use `numpy.genfromtxt` instead. "
        "(deprecated in NumPy 2.0)",
        DeprecationWarning,
        stacklevel=2
    )

    kwargs.setdefault("dtype", None)
    usemask = kwargs.get('usemask', False)
    output = genfromtxt(fname, **kwargs)
    if usemask:
        from numpy.ma.mrecords import MaskedRecords
        output = output.view(MaskedRecords)
    else:
        output = output.view(np.recarray)
    return output

def recfromcsv(fname, **kwargs):
    """
    Load ASCII data stored in a comma-separated file.

    The returned array is a record array (if ``usemask=False``, see
    `recarray`) or a masked record array (if ``usemask=True``,
    see `ma.mrecords.MaskedRecords`).

    .. deprecated:: 2.0
        Use `numpy.genfromtxt` with comma as `delimiter` instead.

    Parameters
    ----------
    fname, kwargs : For a description of input parameters, see `genfromtxt`.

    See Also
    --------
    numpy.genfromtxt : generic function to load ASCII data.

    Notes
    -----
    By default, `dtype` is None, which means that the data-type of the output
    array will be determined from the data.
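
    Examples
    --------
    A roughly equivalent call with `genfromtxt`, matching the defaults set
    below (an illustrative sketch):

    >>> import numpy as np
    >>> from io import StringIO
    >>> s = StringIO('''A,B
    ... 1,2.5
    ... 3,4.5''')
    >>> rec = np.genfromtxt(s, delimiter=",", names=True, dtype=None,
    ...                     case_sensitive="lower").view(np.recarray)
    >>> rec.a
    array([1, 3])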

    """

    # Deprecated in NumPy 2.0, 2023-07-11
    warnings.warn(
        "`recfromcsv` is deprecated, "
        "use `numpy.genfromtxt` with comma as `delimiter` instead. "
        "(deprecated in NumPy 2.0)",
        DeprecationWarning,
        stacklevel=2
    )

    # Set default kwargs for genfromtxt as relevant to csv import.
    kwargs.setdefault("case_sensitive", "lower")
    kwargs.setdefault("names", True)
    kwargs.setdefault("delimiter", ",")
    kwargs.setdefault("dtype", None)
    output = genfromtxt(fname, **kwargs)

    usemask = kwargs.get("usemask", False)
    if usemask:
        from numpy.ma.mrecords import MaskedRecords
        output = output.view(MaskedRecords)
    else:
        output = output.view(np.recarray)
    return output