# This file is part of h5py, a Python interface to the HDF5 library.
#
# http://www.h5py.org
#
# Copyright 2008-2020 Andrew Collette and contributors
#
# License: Standard 3-clause BSD; see "license.txt" for full license terms
#          and contributor agreement.

"""
    Implements support for high-level dataset access.
"""

import posixpath as pp
import sys

import numpy

from .. import h5, h5s, h5t, h5r, h5d, h5p, h5fd, h5ds, _selector
from .base import HLObject, phil, with_phil, Empty, cached_property, find_item_type, array_for_new_object
from . import filters
from . import selections as sel
from . import selections2 as sel2
from .datatype import Datatype
from .compat import filename_decode
from .vds import VDSmap, vds_support

_LEGACY_GZIP_COMPRESSION_VALS = frozenset(range(10))
MPI = h5.get_config().mpi


def make_new_dset(parent, shape=None, dtype=None, data=None, name=None,
                  chunks=None, compression=None, shuffle=None,
                  fletcher32=None, maxshape=None, compression_opts=None,
                  fillvalue=None, scaleoffset=None, track_times=False,
                  external=None, track_order=None, dcpl=None, dapl=None,
                  efile_prefix=None, virtual_prefix=None, allow_unknown_filter=False,
                  rdcc_nslots=None, rdcc_nbytes=None, rdcc_w0=None):
    """ Return a new low-level dataset identifier """

    # Convert data to a C-contiguous ndarray
    if data is not None and not isinstance(data, Empty):
        data = array_for_new_object(data, specified_dtype=dtype)

    # Validate shape
    if shape is None:
        if data is None:
            if dtype is None:
                raise TypeError("One of data, shape or dtype must be specified")
            data = Empty(dtype)
        shape = data.shape
    else:
        shape = (shape,) if isinstance(shape, int) else tuple(shape)
        if data is not None and (numpy.product(shape, dtype=numpy.ulonglong) != numpy.product(data.shape, dtype=numpy.ulonglong)):
            raise ValueError("Shape tuple is incompatible with data")

    if isinstance(maxshape, int):
        maxshape = (maxshape,)
    tmp_shape = maxshape if maxshape is not None else shape

    # Validate chunk shape
    if isinstance(chunks, int) and not isinstance(chunks, bool):
        chunks = (chunks,)
    if isinstance(chunks, tuple) and any(
        chunk > dim for dim, chunk in zip(tmp_shape, chunks) if dim is not None
    ):
        errmsg = "Chunk shape must not be greater than data shape in any dimension. "\
                 "{} is not compatible with {}".format(chunks, shape)
        raise ValueError(errmsg)

    if isinstance(dtype, Datatype):
        # Named types are used as-is
        tid = dtype.id
        dtype = tid.dtype  # Following code needs this
    else:
        # Validate dtype
        if dtype is None and data is None:
            dtype = numpy.dtype("=f4")
        elif dtype is None and data is not None:
            dtype = data.dtype
        else:
            dtype = numpy.dtype(dtype)
        tid = h5t.py_create(dtype, logical=1)

    # Legacy
    if any((compression, shuffle, fletcher32, maxshape, scaleoffset)) and chunks is False:
        raise ValueError("Chunked format required for given storage options")

    # Legacy
    if compression is True:
        if compression_opts is None:
            compression_opts = 4
        compression = 'gzip'

    # Legacy
    if compression in _LEGACY_GZIP_COMPRESSION_VALS:
        if compression_opts is not None:
            raise TypeError("Conflict in compression options")
        compression_opts = compression
        compression = 'gzip'

    dcpl = filters.fill_dcpl(
        dcpl or h5p.create(h5p.DATASET_CREATE), shape, dtype,
        chunks, compression, compression_opts, shuffle, fletcher32,
        maxshape, scaleoffset, external, allow_unknown_filter)

    if fillvalue is not None:
        # prepare string-type dtypes for fillvalue
        string_info = h5t.check_string_dtype(dtype)
        if string_info is not None:
            # fake vlen dtype for fixed len string fillvalue
            # to not trigger unwanted encoding
            dtype = h5t.string_dtype(string_info.encoding)
            fillvalue = numpy.array(fillvalue, dtype=dtype)
        else:
            fillvalue = numpy.array(fillvalue)
        dcpl.set_fill_value(fillvalue)

    if track_times is None:
        # In case someone explicitly passes None for the default
        track_times = False
    if track_times in (True, False):
        dcpl.set_obj_track_times(track_times)
    else:
        raise TypeError("track_times must be either True or False")
    if track_order is True:
        dcpl.set_attr_creation_order(
            h5p.CRT_ORDER_TRACKED | h5p.CRT_ORDER_INDEXED)
    elif track_order is False:
        dcpl.set_attr_creation_order(0)
    elif track_order is not None:
        raise TypeError("track_order must be either True or False")

    if maxshape is not None:
        maxshape = tuple(m if m is not None else h5s.UNLIMITED for m in maxshape)

    if any([efile_prefix, virtual_prefix, rdcc_nbytes, rdcc_nslots, rdcc_w0]):
        dapl = dapl or h5p.create(h5p.DATASET_ACCESS)

    if efile_prefix is not None:
        dapl.set_efile_prefix(efile_prefix)

    if virtual_prefix is not None:
        dapl.set_virtual_prefix(virtual_prefix)

    if rdcc_nbytes or rdcc_nslots or rdcc_w0:
        cache_settings = list(dapl.get_chunk_cache())
        if rdcc_nslots is not None:
            cache_settings[0] = rdcc_nslots
        if rdcc_nbytes is not None:
            cache_settings[1] = rdcc_nbytes
        if rdcc_w0 is not None:
            cache_settings[2] = rdcc_w0
        dapl.set_chunk_cache(*cache_settings)

    if isinstance(data, Empty):
        sid = h5s.create(h5s.NULL)
    else:
        sid = h5s.create_simple(shape, maxshape)

    dset_id = h5d.create(parent.id, name, tid, sid, dcpl=dcpl, dapl=dapl)

    if (data is not None) and (not isinstance(data, Empty)):
        dset_id.write(h5s.ALL, h5s.ALL, data)

    return dset_id
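
# Hedged usage sketch (not part of this module): make_new_dset() is normally
# reached through the high-level Group.create_dataset() wrapper, which accepts
# the same keyword arguments. The file and dataset names are hypothetical.
#
#   >>> import h5py
#   >>> with h5py.File('example.h5', 'w') as f:
#   ...     dset = f.create_dataset('data', shape=(1000, 1000), dtype='f4',
#   ...                             chunks=(100, 100), compression='gzip',
#   ...                             maxshape=(None, 1000), fillvalue=0.0)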


def open_dset(parent, name, dapl=None, efile_prefix=None, virtual_prefix=None,
              rdcc_nslots=None, rdcc_nbytes=None, rdcc_w0=None, **kwds):
    """ Return an existing low-level dataset identifier """

    if any([efile_prefix, virtual_prefix, rdcc_nbytes, rdcc_nslots, rdcc_w0]):
        dapl = dapl or h5p.create(h5p.DATASET_ACCESS)

    if efile_prefix is not None:
        dapl.set_efile_prefix(efile_prefix)

    if virtual_prefix is not None:
        dapl.set_virtual_prefix(virtual_prefix)

    if rdcc_nbytes or rdcc_nslots or rdcc_w0:
        cache_settings = list(dapl.get_chunk_cache())
        if rdcc_nslots is not None:
            cache_settings[0] = rdcc_nslots
        if rdcc_nbytes is not None:
            cache_settings[1] = rdcc_nbytes
        if rdcc_w0 is not None:
            cache_settings[2] = rdcc_w0
        dapl.set_chunk_cache(*cache_settings)

    dset_id = h5d.open(parent.id, name, dapl=dapl)

    return dset_id
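
# Hedged sketch of what the dapl branch above constructs: a dataset access
# property list carrying custom chunk-cache settings, built with the low-level
# h5p API used in this module. The 16 MiB figure is illustrative only.
#
#   >>> dapl = h5p.create(h5p.DATASET_ACCESS)
#   >>> nslots, nbytes, w0 = dapl.get_chunk_cache()
#   >>> dapl.set_chunk_cache(nslots, 16 * 1024**2, w0)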


class AstypeWrapper:
    """Wrapper to convert data on reading from a dataset.
    """
    def __init__(self, dset, dtype):
        self._dset = dset
        self._dtype = numpy.dtype(dtype)

    def __getitem__(self, args):
        return self._dset.__getitem__(args, new_dtype=self._dtype)

    def __len__(self):
        """ Get the length of the underlying dataset

        >>> length = len(dataset.astype('f8'))
        """
        return len(self._dset)

    def __array__(self, dtype=None):
        data = self[:]
        if dtype is not None:
            data = data.astype(dtype)
        return data
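
# Hedged usage sketch: AstypeWrapper is obtained via Dataset.astype() and
# converts values to the requested dtype as they are read. ``dset`` is an
# assumed open numeric dataset.
#
#   >>> singles = dset.astype('f4')[0:100]          # read a slice as float32
#   >>> whole = numpy.asarray(dset.astype('i8'))    # __array__ reads everything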


class AsStrWrapper:
    """Wrapper to decode strings on reading the dataset"""
    def __init__(self, dset, encoding, errors='strict'):
        self._dset = dset
        if encoding is None:
            encoding = h5t.check_string_dtype(dset.dtype).encoding
        self.encoding = encoding
        self.errors = errors

    def __getitem__(self, args):
        bytes_arr = self._dset[args]
        # numpy.char.decode() seems like the obvious thing to use. But it only
        # accepts numpy string arrays, not object arrays of bytes (which we
        # return from HDF5 variable-length strings). And the numpy
        # implementation is not faster than doing it with a loop; in fact, by
        # not converting the result to a numpy unicode array, the
        # naive way can be faster! (Comparing with numpy 1.18.4, June 2020)
        if numpy.isscalar(bytes_arr):
            return bytes_arr.decode(self.encoding, self.errors)

        return numpy.array([
            b.decode(self.encoding, self.errors) for b in bytes_arr.flat
        ], dtype=object).reshape(bytes_arr.shape)

    def __len__(self):
        """ Get the length of the underlying dataset

        >>> length = len(dataset.asstr())
        """
        return len(self._dset)

    def __array__(self):
        return numpy.array([
            b.decode(self.encoding, self.errors) for b in self._dset
        ], dtype=object).reshape(self._dset.shape)
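
# Hedged usage sketch: AsStrWrapper is obtained via Dataset.asstr() and decodes
# HDF5 string data to Python str on read. ``dset`` is an assumed dataset with
# an HDF5 string dtype.
#
#   >>> labels = dset.asstr()[:]                       # use the stored encoding
#   >>> loose = dset.asstr('utf-8', errors='replace')[:10]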


class FieldsWrapper:
    """Wrapper to extract named fields from a dataset with a struct dtype"""
    extract_field = None

    def __init__(self, dset, prior_dtype, names):
        self._dset = dset
        if isinstance(names, str):
            self.extract_field = names
            names = [names]
        self.read_dtype = readtime_dtype(prior_dtype, names)

    def __array__(self, dtype=None):
        data = self[:]
        if dtype is not None:
            data = data.astype(dtype)
        return data

    def __getitem__(self, args):
        data = self._dset.__getitem__(args, new_dtype=self.read_dtype)
        if self.extract_field is not None:
            data = data[self.extract_field]
        return data

    def __len__(self):
        """ Get the length of the underlying dataset

        >>> length = len(dataset.fields(['x', 'y']))
        """
        return len(self._dset)


def readtime_dtype(basetype, names):
    """Make a NumPy compound dtype with a subset of available fields"""
    if basetype.names is None:  # Names provided, but not compound
        raise ValueError("Field names only allowed for compound types")

    for name in names:  # Check all names are legal
        if name not in basetype.names:
            raise ValueError("Field %s does not appear in this type." % name)

    return numpy.dtype([(name, basetype.fields[name][0]) for name in names])
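
# Hedged usage sketch: FieldsWrapper is obtained via Dataset.fields() and reads
# a subset of a compound dtype. ``dset`` is assumed to have fields 'x' and 'y'.
#
#   >>> xy = dset.fields(['x', 'y'])[:]   # compound result with two fields
#   >>> x = dset.fields('x')[:]           # plain array of the 'x' field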


if MPI:
    class CollectiveContext:

        """ Manages collective I/O in MPI mode """

        # We don't bother with _local as threads are forbidden in MPI mode

        def __init__(self, dset):
            self._dset = dset

        def __enter__(self):
            # pylint: disable=protected-access
            self._dset._dxpl.set_dxpl_mpio(h5fd.MPIO_COLLECTIVE)

        def __exit__(self, *args):
            # pylint: disable=protected-access
            self._dset._dxpl.set_dxpl_mpio(h5fd.MPIO_INDEPENDENT)


class ChunkIterator:
    """
    Class to iterate through list of chunks of a given dataset
    """
    def __init__(self, dset, source_sel=None):
        self._shape = dset.shape
        rank = len(dset.shape)

        if not dset.chunks:
            # can only use with chunked datasets
            raise TypeError("Chunked dataset required")

        self._layout = dset.chunks
        if source_sel is None:
            # select over entire dataset
            slices = []
            for dim in range(rank):
                slices.append(slice(0, self._shape[dim]))
            self._sel = tuple(slices)
        else:
            if isinstance(source_sel, slice):
                self._sel = (source_sel,)
            else:
                self._sel = source_sel
        if len(self._sel) != rank:
            raise ValueError("Invalid selection - selection region must have same rank as dataset")
        self._chunk_index = []
        for dim in range(rank):
            s = self._sel[dim]
            if s.start < 0 or s.stop > self._shape[dim] or s.stop <= s.start:
                raise ValueError("Invalid selection - selection region must be within dataset space")
            index = s.start // self._layout[dim]
            self._chunk_index.append(index)

    def __iter__(self):
        return self

    def __next__(self):
        rank = len(self._shape)
        slices = []
        if rank == 0 or self._chunk_index[0] * self._layout[0] >= self._sel[0].stop:
            # ran past the last chunk, end iteration
            raise StopIteration()

        for dim in range(rank):
            s = self._sel[dim]
            start = self._chunk_index[dim] * self._layout[dim]
            stop = (self._chunk_index[dim] + 1) * self._layout[dim]
            # adjust the start if this is an edge chunk
            if start < s.start:
                start = s.start
            if stop > s.stop:
                stop = s.stop  # trim to end of the selection
            s = slice(start, stop, 1)
            slices.append(s)

        # bump up the last index and carry forward if we run outside the selection
        dim = rank - 1
        while dim >= 0:
            s = self._sel[dim]
            self._chunk_index[dim] += 1

            chunk_end = self._chunk_index[dim] * self._layout[dim]
            if chunk_end < s.stop:
                # we still have room to extend along this dimension
                return tuple(slices)

            if dim > 0:
                # reset to the start and continue iterating with higher dimension
                self._chunk_index[dim] = 0
            dim -= 1
        return tuple(slices)
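
# Hedged usage sketch: ChunkIterator is normally obtained via
# Dataset.iter_chunks(); each iteration yields a tuple of slices covering one
# chunk, clipped to the selection. ``dset`` is an assumed chunked dataset.
#
#   >>> for chunk_slices in dset.iter_chunks():
#   ...     block = dset[chunk_slices]                 # process one chunk at a time
#   >>> for chunk_slices in dset.iter_chunks(numpy.s_[0:100, :]):
#   ...     pass                                       # restrict to a region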


class Dataset(HLObject):

    """
        Represents an HDF5 dataset
    """

    def astype(self, dtype):
        """ Get a wrapper allowing you to perform reads to a
        different destination type, e.g.:

        >>> double_precision = dataset.astype('f8')[0:100:2]
        """
        return AstypeWrapper(self, dtype)

    def asstr(self, encoding=None, errors='strict'):
        """Get a wrapper to read string data as Python strings:

        >>> str_array = dataset.asstr()[:]

        The parameters have the same meaning as in ``bytes.decode()``.
        If ``encoding`` is unspecified, it will use the encoding in the HDF5
        datatype (either ascii or utf-8).
        """
        string_info = h5t.check_string_dtype(self.dtype)
        if string_info is None:
            raise TypeError(
                "dset.asstr() can only be used on datasets with "
                "an HDF5 string datatype"
            )
        if encoding is None:
            encoding = string_info.encoding
        return AsStrWrapper(self, encoding, errors=errors)

    def fields(self, names, *, _prior_dtype=None):
        """Get a wrapper to read a subset of fields from a compound data type:

        >>> 2d_coords = dataset.fields(['x', 'y'])[:]

        If names is a string, a single field is extracted, and the resulting
        arrays will have that dtype. Otherwise, it should be an iterable,
        and the read data will have a compound dtype.
        """
        if _prior_dtype is None:
            _prior_dtype = self.dtype
        return FieldsWrapper(self, _prior_dtype, names)

    if MPI:
        @property
        @with_phil
        def collective(self):
            """ Context manager for MPI collective reads & writes """
            return CollectiveContext(self)

    @property
    def dims(self):
        """ Access dimension scales attached to this dataset. """
        from .dims import DimensionManager
        with phil:
            return DimensionManager(self)

    @property
    @with_phil
    def ndim(self):
        """Numpy-style attribute giving the number of dimensions"""
        return self.id.rank

    @property
    def shape(self):
        """Numpy-style shape tuple giving dataset dimensions"""
        if 'shape' in self._cache_props:
            return self._cache_props['shape']

        with phil:
            shape = self.id.shape

        # If the file is read-only, cache the shape to speed-up future uses.
        # This cache is invalidated by .refresh() when using SWMR.
        if self._readonly:
            self._cache_props['shape'] = shape
        return shape

    @shape.setter
    @with_phil
    def shape(self, shape):
        # pylint: disable=missing-docstring
        self.resize(shape)

    @property
    def size(self):
        """Numpy-style attribute giving the total dataset size"""
        if 'size' in self._cache_props:
            return self._cache_props['size']

        if self._is_empty:
            size = None
        else:
            size = numpy.prod(self.shape, dtype=numpy.intp)

        # If the file is read-only, cache the size to speed-up future uses.
        # This cache is invalidated by .refresh() when using SWMR.
        if self._readonly:
            self._cache_props['size'] = size
        return size

    @property
    def nbytes(self):
        """Numpy-style attribute giving the raw dataset size as the number of bytes"""
        size = self.size
        if size is None:  # if we are an empty 0-D array, then there are no bytes in the dataset
            return 0
        return self.dtype.itemsize * size

    @property
    def _selector(self):
        """Internal object for optimised selection of data"""
        if '_selector' in self._cache_props:
            return self._cache_props['_selector']

        slr = _selector.Selector(self.id.get_space())

        # If the file is read-only, cache the reader to speed up future uses.
        # This cache is invalidated by .refresh() when using SWMR.
        if self._readonly:
            self._cache_props['_selector'] = slr
        return slr

    @property
    def _fast_reader(self):
        """Internal object for optimised reading of data"""
        if '_fast_reader' in self._cache_props:
            return self._cache_props['_fast_reader']

        rdr = _selector.Reader(self.id)

        # If the file is read-only, cache the reader to speed up future uses.
        # This cache is invalidated by .refresh() when using SWMR.
        if self._readonly:
            self._cache_props['_fast_reader'] = rdr
        return rdr

    @property
    @with_phil
    def dtype(self):
        """Numpy dtype representing the datatype"""
        return self.id.dtype

    @property
    @with_phil
    def chunks(self):
        """Dataset chunks (or None)"""
        dcpl = self._dcpl
        if dcpl.get_layout() == h5d.CHUNKED:
            return dcpl.get_chunk()
        return None

    @property
    @with_phil
    def compression(self):
        """Compression strategy (or None)"""
        for x in ('gzip', 'lzf', 'szip'):
            if x in self._filters:
                return x
        return None

    @property
    @with_phil
    def compression_opts(self):
        """ Compression setting. Int(0-9) for gzip, 2-tuple for szip. """
        return self._filters.get(self.compression, None)

    @property
    @with_phil
    def shuffle(self):
        """Shuffle filter present (T/F)"""
        return 'shuffle' in self._filters

    @property
    @with_phil
    def fletcher32(self):
        """Fletcher32 filter is present (T/F)"""
        return 'fletcher32' in self._filters

    @property
    @with_phil
    def scaleoffset(self):
        """Scale/offset filter settings. For integer data types, this is
        the number of bits stored, or 0 for auto-detected. For floating
        point data types, this is the number of decimal places retained.
        If the scale/offset filter is not in use, this is None."""
        try:
            return self._filters['scaleoffset'][1]
        except KeyError:
            return None

    @property
    @with_phil
    def external(self):
        """External file settings. Returns a list of tuples of
        (name, offset, size) for each external file entry, or returns None
        if no external files are used."""
        count = self._dcpl.get_external_count()
        if count <= 0:
            return None
        ext_list = list()
        for x in range(count):
            (name, offset, size) = self._dcpl.get_external(x)
            ext_list.append((filename_decode(name), offset, size))
        return ext_list

    @property
    @with_phil
    def maxshape(self):
        """Shape up to which this dataset can be resized. Axes with value
        None have no resize limit. """
        space = self.id.get_space()
        dims = space.get_simple_extent_dims(True)
        if dims is None:
            return None

        return tuple(x if x != h5s.UNLIMITED else None for x in dims)

    @property
    @with_phil
    def fillvalue(self):
        """Fill value for this dataset (0 by default)"""
        arr = numpy.zeros((1,), dtype=self.dtype)
        self._dcpl.get_fill_value(arr)
        return arr[0]

    @cached_property
    @with_phil
    def _extent_type(self):
        """Get extent type for this dataset - SIMPLE, SCALAR or NULL"""
        return self.id.get_space().get_simple_extent_type()

    @cached_property
    def _is_empty(self):
        """Check if extent type is empty"""
        return self._extent_type == h5s.NULL

    @with_phil
    def __init__(self, bind, *, readonly=False):
        """ Create a new Dataset object by binding to a low-level DatasetID.
        """
        if not isinstance(bind, h5d.DatasetID):
            raise ValueError("%s is not a DatasetID" % bind)
        super().__init__(bind)

        self._dcpl = self.id.get_create_plist()
        self._dxpl = h5p.create(h5p.DATASET_XFER)
        self._filters = filters.get_filters(self._dcpl)
        self._readonly = readonly
        self._cache_props = {}

    def resize(self, size, axis=None):
        """ Resize the dataset, or the specified axis.

        The dataset must be stored in chunked format; it can be resized up to
        the "maximum shape" (keyword maxshape) specified at creation time.
        The rank of the dataset cannot be changed.

        "Size" should be a shape tuple, or if an axis is specified, an integer.

        BEWARE: This functions differently than the NumPy resize() method!
        The data is not "reshuffled" to fit in the new shape; each axis is
        grown or shrunk independently. The coordinates of existing data are
        fixed.
        """
        with phil:
            if self.chunks is None:
                raise TypeError("Only chunked datasets can be resized")

            if axis is not None:
                if not (axis >= 0 and axis < self.id.rank):
                    raise ValueError("Invalid axis (0 to %s allowed)" % (self.id.rank - 1))
                try:
                    newlen = int(size)
                except TypeError:
                    raise TypeError("Argument must be a single int if axis is specified")
                size = list(self.shape)
                size[axis] = newlen

            size = tuple(size)
            self.id.set_extent(size)
            #h5f.flush(self.id)  # THG recommends
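
    # Hedged usage sketch: resizing requires a chunked dataset created with a
    # maxshape; the dataset name and shapes below are hypothetical.
    #
    #   >>> dset = f.create_dataset('log', shape=(0,), maxshape=(None,),
    #   ...                         chunks=(1024,), dtype='f8')
    #   >>> dset.resize((5000,))            # grow to a new shape tuple
    #   >>> dset.resize(10000, axis=0)      # or grow a single axis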

    @with_phil
    def __len__(self):
        """ The size of the first axis. TypeError if scalar.

        Limited to 2**32 on 32-bit systems; Dataset.len() is preferred.
        """
        size = self.len()
        if size > sys.maxsize:
            raise OverflowError("Value too big for Python's __len__; use Dataset.len() instead.")
        return size

    def len(self):
        """ The size of the first axis. TypeError if scalar.

        Use of this method is preferred to len(dset), as Python's built-in
        len() cannot handle values greater than 2**32 on 32-bit systems.
        """
        with phil:
            shape = self.shape
            if len(shape) == 0:
                raise TypeError("Attempt to take len() of scalar dataset")
            return shape[0]

    @with_phil
    def __iter__(self):
        """ Iterate over the first axis. TypeError if scalar.

        BEWARE: Modifications to the yielded data are *NOT* written to file.
        """
        shape = self.shape
        if len(shape) == 0:
            raise TypeError("Can't iterate over a scalar dataset")
        for i in range(shape[0]):
            yield self[i]

    @with_phil
    def iter_chunks(self, sel=None):
        """ Return chunk iterator. If set, the sel argument is a slice or
        tuple of slices that defines the region to be used. If not set, the
        entire dataspace will be used for the iterator.

        For each chunk within the given region, the iterator yields a tuple of
        slices that gives the intersection of the given chunk with the
        selection area.

        A TypeError will be raised if the dataset is not chunked.

        A ValueError will be raised if the selection region is invalid.

        """
        return ChunkIterator(self, sel)

    @cached_property
    def _fast_read_ok(self):
        """Is this dataset suitable for simple reading"""
        return (
            self._extent_type == h5s.SIMPLE
            and isinstance(self.id.get_type(), (h5t.TypeIntegerID, h5t.TypeFloatID))
        )

    @with_phil
    def __getitem__(self, args, new_dtype=None):
        """ Read a slice from the HDF5 dataset.

        Takes slices and recarray-style field names (more than one is
        allowed!) in any order. Obeys basic NumPy rules, including
        broadcasting.

        Also supports:

        * Boolean "mask" array indexing
        """
        args = args if isinstance(args, tuple) else (args,)

        if self._fast_read_ok and (new_dtype is None):
            try:
                return self._fast_reader.read(args)
            except TypeError:
                pass  # Fall back to Python read pathway below

        if self._is_empty:
            # Check 'is Ellipsis' to avoid equality comparison with an array:
            # array equality returns an array, not a boolean.
            if args == () or (len(args) == 1 and args[0] is Ellipsis):
                return Empty(self.dtype)
            raise ValueError("Empty datasets cannot be sliced")

        # Sort field names from the rest of the args.
        names = tuple(x for x in args if isinstance(x, str))

        if names:
            # Read a subset of the fields in this structured dtype
            if len(names) == 1:
                names = names[0]  # Read with simpler dtype of this field
            args = tuple(x for x in args if not isinstance(x, str))
            return self.fields(names, _prior_dtype=new_dtype)[args]

        if new_dtype is None:
            new_dtype = self.dtype
        mtype = h5t.py_create(new_dtype)

        # === Special-case region references ====

        if len(args) == 1 and isinstance(args[0], h5r.RegionReference):

            obj = h5r.dereference(args[0], self.id)
            if obj != self.id:
                raise ValueError("Region reference must point to this dataset")

            sid = h5r.get_region(args[0], self.id)
            mshape = sel.guess_shape(sid)
            if mshape is None:
                # 0D with no data (NULL or deselected SCALAR)
                return Empty(new_dtype)
            out = numpy.zeros(mshape, dtype=new_dtype)
            if out.size == 0:
                return out

            sid_out = h5s.create_simple(mshape)
            sid_out.select_all()
            self.id.read(sid_out, sid, out, mtype)
            return out

        # === Check for zero-sized datasets =====

        if self.size == 0:
            # Check 'is Ellipsis' to avoid equality comparison with an array:
            # array equality returns an array, not a boolean.
            if args == () or (len(args) == 1 and args[0] is Ellipsis):
                return numpy.zeros(self.shape, dtype=new_dtype)

        # === Scalar dataspaces =================

        if self.shape == ():
            fspace = self.id.get_space()
            selection = sel2.select_read(fspace, args)
            if selection.mshape is None:
                arr = numpy.zeros((), dtype=new_dtype)
            else:
                arr = numpy.zeros(selection.mshape, dtype=new_dtype)
            for mspace, fspace in selection:
                self.id.read(mspace, fspace, arr, mtype)
            if selection.mshape is None:
                return arr[()]
            return arr

        # === Everything else ===================

        # Perform the dataspace selection.
        selection = sel.select(self.shape, args, dataset=self)

        if selection.nselect == 0:
            return numpy.zeros(selection.array_shape, dtype=new_dtype)

        arr = numpy.zeros(selection.array_shape, new_dtype, order='C')

        # Perform the actual read
        mspace = h5s.create_simple(selection.mshape)
        fspace = selection.id
        self.id.read(mspace, fspace, arr, mtype, dxpl=self._dxpl)

        # Patch up the output for NumPy
        if arr.shape == ():
            return arr[()]  # 0 dim array -> numpy scalar
        return arr
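
    # Hedged reading sketch: a few of the selection styles accepted above, for
    # an assumed 2-D numeric dataset ``dset``.
    #
    #   >>> a = dset[10:20, ::2]       # slices, with steps
    #   >>> b = dset[...]              # the whole dataset as a NumPy array
    #   >>> c = dset.astype('f8')[0]   # read a row through a type conversion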

    @with_phil
    def __setitem__(self, args, val):
        """ Write to the HDF5 dataset from a Numpy array.

        NumPy's broadcasting rules are honored, for "simple" indexing
        (slices and integers). For advanced indexing, the shapes must
        match.
        """
        args = args if isinstance(args, tuple) else (args,)

        # Sort field indices from the slicing
        names = tuple(x for x in args if isinstance(x, str))
        args = tuple(x for x in args if not isinstance(x, str))

        # Generally we try to avoid converting the arrays on the Python
        # side. However, for compound literals this is unavoidable.
        vlen = h5t.check_vlen_dtype(self.dtype)
        if vlen is not None and vlen not in (bytes, str):
            try:
                val = numpy.asarray(val, dtype=vlen)
            except ValueError:
                try:
                    val = numpy.array([numpy.array(x, dtype=vlen)
                                       for x in val], dtype=self.dtype)
                except ValueError:
                    pass
            if vlen == val.dtype:
                if val.ndim > 1:
                    tmp = numpy.empty(shape=val.shape[:-1], dtype=object)
                    tmp.ravel()[:] = [i for i in val.reshape(
                        (numpy.product(val.shape[:-1], dtype=numpy.ulonglong), val.shape[-1]))]
                else:
                    tmp = numpy.array([None], dtype=object)
                    tmp[0] = val
                val = tmp
        elif self.dtype.kind == "O" or \
            (self.dtype.kind == 'V' and \
            (not isinstance(val, numpy.ndarray) or val.dtype.kind != 'V') and \
            (self.dtype.subdtype is None)):
            if len(names) == 1 and self.dtype.fields is not None:
                # Single field selected for write, from a non-array source
                if not names[0] in self.dtype.fields:
                    raise ValueError("No such field for indexing: %s" % names[0])
                dtype = self.dtype.fields[names[0]][0]
                cast_compound = True
            else:
                dtype = self.dtype
                cast_compound = False

            val = numpy.asarray(val, dtype=dtype.base, order='C')
            if cast_compound:
                val = val.view(numpy.dtype([(names[0], dtype)]))
                val = val.reshape(val.shape[:len(val.shape) - len(dtype.shape)])
        elif (self.dtype.kind == 'S'
              and (h5t.check_string_dtype(self.dtype).encoding == 'utf-8')
              and (find_item_type(val) is str)
              ):
            # Writing str objects to a fixed-length UTF-8 string dataset.
            # Numpy's normal conversion only handles ASCII characters, but
            # when the destination is UTF-8, we want to allow any unicode.
            # This *doesn't* handle numpy fixed-length unicode data ('U' dtype),
            # as HDF5 has no equivalent, and converting fixed length UTF-32
            # to variable length UTF-8 would obscure what's going on.
            str_array = numpy.asarray(val, order='C', dtype=object)
            val = numpy.array([
                s.encode('utf-8') for s in str_array.flat
            ], dtype=self.dtype).reshape(str_array.shape)
        else:
            # If the input data is already an array, let HDF5 do the conversion.
            # If it's a list or similar, don't make numpy guess a dtype for it.
            dt = None if isinstance(val, numpy.ndarray) else self.dtype.base
            val = numpy.asarray(val, order='C', dtype=dt)

        # Check for array dtype compatibility and convert
        if self.dtype.subdtype is not None:
            shp = self.dtype.subdtype[1]
            valshp = val.shape[-len(shp):]
            if valshp != shp:  # Last dimension has to match
                raise TypeError("When writing to array types, last N dimensions have to match (got %s, but should be %s)" % (valshp, shp,))
            mtype = h5t.py_create(numpy.dtype((val.dtype, shp)))
            mshape = val.shape[0:len(val.shape)-len(shp)]

        # Make a compound memory type if field-name slicing is required
        elif len(names) != 0:

            mshape = val.shape

            # Catch common errors
            if self.dtype.fields is None:
                raise TypeError("Illegal slicing argument (not a compound dataset)")
            mismatch = [x for x in names if x not in self.dtype.fields]
            if len(mismatch) != 0:
                mismatch = ", ".join('"%s"' % x for x in mismatch)
                raise ValueError("Illegal slicing argument (fields %s not in dataset type)" % mismatch)

            # Write non-compound source into a single dataset field
            if len(names) == 1 and val.dtype.fields is None:
                subtype = h5t.py_create(val.dtype)
                mtype = h5t.create(h5t.COMPOUND, subtype.get_size())
                mtype.insert(self._e(names[0]), 0, subtype)

            # Make a new source type keeping only the requested fields
            else:
                fieldnames = [x for x in val.dtype.names if x in names]  # Keep source order
                mtype = h5t.create(h5t.COMPOUND, val.dtype.itemsize)
                for fieldname in fieldnames:
                    subtype = h5t.py_create(val.dtype.fields[fieldname][0])
                    offset = val.dtype.fields[fieldname][1]
                    mtype.insert(self._e(fieldname), offset, subtype)

        # Use mtype derived from array (let DatasetID.write figure it out)
        else:
            mshape = val.shape
            mtype = None

        # Perform the dataspace selection
        selection = sel.select(self.shape, args, dataset=self)

        if selection.nselect == 0:
            return

        # Broadcast scalars if necessary.
        # In order to avoid slow broadcasting filling the destination by
        # the scalar value, we create an intermediate array of the same
        # size as the destination buffer, provided that size is reasonable.
        # We assume as reasonable a size smaller than or equal to the dataset
        # chunk size, if any.
        # In case of dealing with a non-chunked destination dataset or with
        # a selection whose size is larger than the dataset chunk size, we fall
        # back to using an intermediate array of size equal to the last dimension
        # of the destination buffer.
        # The reasoning behind this is that it makes sense to assume the creator
        # of the dataset used an appropriate chunk size according to the available
        # memory. In any case, if we cannot afford to create an intermediate
        # array of the same size as the dataset chunk size, the user program has
        # little hope to go much further. Solves h5py issue #1067
        if mshape == () and selection.array_shape != ():
            if self.dtype.subdtype is not None:
                raise TypeError("Scalar broadcasting is not supported for array dtypes")
            if self.chunks and (numpy.prod(self.chunks, dtype=numpy.float64) >=
                                numpy.prod(selection.array_shape, dtype=numpy.float64)):
                val2 = numpy.empty(selection.array_shape, dtype=val.dtype)
            else:
                val2 = numpy.empty(selection.array_shape[-1], dtype=val.dtype)
            val2[...] = val
            val = val2
            mshape = val.shape

        # Perform the write, with broadcasting
        mspace = h5s.create_simple(selection.expand_shape(mshape))
        for fspace in selection.broadcast(mshape):
            self.id.write(mspace, fspace, val, mtype, dxpl=self._dxpl)
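
    # Hedged writing sketch: broadcasting applies for simple (slice/integer)
    # indexing. ``dset`` is an assumed writable 2-D dataset.
    #
    #   >>> dset[0, :] = numpy.arange(dset.shape[1])   # write one row
    #   >>> dset[10:20, 5:15] = 0.0                    # broadcast a scalar fill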

    def read_direct(self, dest, source_sel=None, dest_sel=None):
        """ Read data directly from HDF5 into an existing NumPy array.

        The destination array must be C-contiguous and writable.
        Selections must be the output of numpy.s_[<args>].

        Broadcasting is supported for simple indexing.
        """
        with phil:
            if self._is_empty:
                raise TypeError("Empty datasets have no numpy representation")
            if source_sel is None:
                source_sel = sel.SimpleSelection(self.shape)
            else:
                source_sel = sel.select(self.shape, source_sel, self)  # for numpy.s_
            fspace = source_sel.id

            if dest_sel is None:
                dest_sel = sel.SimpleSelection(dest.shape)
            else:
                dest_sel = sel.select(dest.shape, dest_sel)

            for mspace in dest_sel.broadcast(source_sel.array_shape):
                self.id.read(mspace, fspace, dest, dxpl=self._dxpl)

    def write_direct(self, source, source_sel=None, dest_sel=None):
        """ Write data directly to HDF5 from a NumPy array.

        The source array must be C-contiguous. Selections must be
        the output of numpy.s_[<args>].

        Broadcasting is supported for simple indexing.
        """
        with phil:
            if self._is_empty:
                raise TypeError("Empty datasets cannot be written to")
            if source_sel is None:
                source_sel = sel.SimpleSelection(source.shape)
            else:
                source_sel = sel.select(source.shape, source_sel)  # for numpy.s_
            mspace = source_sel.id

            if dest_sel is None:
                dest_sel = sel.SimpleSelection(self.shape)
            else:
                dest_sel = sel.select(self.shape, dest_sel, self)

            for fspace in dest_sel.broadcast(source_sel.array_shape):
                self.id.write(mspace, fspace, source, dxpl=self._dxpl)
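
    # Hedged sketch of the direct I/O paths above; the shapes and selections
    # are hypothetical.
    #
    #   >>> arr = numpy.empty((100, 100), dtype=dset.dtype)
    #   >>> dset.read_direct(arr, source_sel=numpy.s_[0:100, 0:100])
    #   >>> dset.write_direct(arr, dest_sel=numpy.s_[100:200, 0:100])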

    @with_phil
    def __array__(self, dtype=None):
        """ Create a Numpy array containing the whole dataset. DON'T THINK
        THIS MEANS DATASETS ARE INTERCHANGEABLE WITH ARRAYS. For one thing,
        you have to read the whole dataset every time this method is called.
        """
        arr = numpy.zeros(self.shape, dtype=self.dtype if dtype is None else dtype)

        # Special case for (0,)*-shape datasets
        if numpy.product(self.shape, dtype=numpy.ulonglong) == 0:
            return arr

        self.read_direct(arr)
        return arr

    @with_phil
    def __repr__(self):
        if not self:
            r = '<Closed HDF5 dataset>'
        else:
            if self.name is None:
                namestr = '("anonymous")'
            else:
                name = pp.basename(pp.normpath(self.name))
                namestr = '"%s"' % (name if name != '' else '/')
            r = '<HDF5 dataset %s: shape %s, type "%s">' % (
                namestr, self.shape, self.dtype.str
            )
        return r

    if hasattr(h5d.DatasetID, "refresh"):
        @with_phil
        def refresh(self):
            """ Refresh the dataset metadata by reloading from the file.

            This is part of the SWMR features and only exists when the HDF5
            library version is >= 1.9.178.
            """
            self._id.refresh()
            self._cache_props.clear()

    if hasattr(h5d.DatasetID, "flush"):
        @with_phil
        def flush(self):
            """ Flush the dataset data and metadata to the file.
            If the dataset is chunked, raw data chunks are written to the file.

            This is part of the SWMR features and only exists when the HDF5
            library version is >= 1.9.178.
            """
            self._id.flush()

    if vds_support:
        @property
        @with_phil
        def is_virtual(self):
            """Check if this is a virtual dataset"""
            return self._dcpl.get_layout() == h5d.VIRTUAL

        @with_phil
        def virtual_sources(self):
            """Get a list of the data mappings for a virtual dataset"""
            if not self.is_virtual:
                raise RuntimeError("Not a virtual dataset")
            dcpl = self._dcpl
            return [
                VDSmap(dcpl.get_virtual_vspace(j),
                       dcpl.get_virtual_filename(j),
                       dcpl.get_virtual_dsetname(j),
                       dcpl.get_virtual_srcspace(j))
                for j in range(dcpl.get_virtual_count())]

    @with_phil
    def make_scale(self, name=''):
        """Make this dataset an HDF5 dimension scale.

        You can then attach it to dimensions of other datasets like this::

            other_ds.dims[0].attach_scale(ds)

        You can optionally pass a name to associate with this scale.
        """
        h5ds.set_scale(self._id, self._e(name))
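
    # Hedged usage sketch for dimension scales: ``time`` and ``temperature``
    # are hypothetical datasets in the same file.
    #
    #   >>> f['time'].make_scale('time')
    #   >>> f['temperature'].dims[0].attach_scale(f['time'])
    #   >>> f['temperature'].dims[0].label = 'time'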

    @property
    @with_phil
    def is_scale(self):
        """Return ``True`` if this dataset is also a dimension scale.

        Return ``False`` otherwise.
        """
        return h5ds.is_scale(self._id)