# This file is part of h5py, a Python interface to the HDF5 library.
#
# http://www.h5py.org
#
# Copyright 2008-2020 Andrew Collette and contributors
#
# License: Standard 3-clause BSD; see "license.txt" for full license terms
#          and contributor agreement.

"""
    Implements support for high-level dataset access.
"""

import posixpath as pp
import sys

import numpy

from .. import h5, h5s, h5t, h5r, h5d, h5p, h5fd, h5ds, _selector
from .base import (
    array_for_new_object, cached_property, Empty, find_item_type, HLObject,
    phil, product, with_phil,
)
from . import filters
from . import selections as sel
from . import selections2 as sel2
from .datatype import Datatype
from .compat import filename_decode
from .vds import VDSmap, vds_support

_LEGACY_GZIP_COMPRESSION_VALS = frozenset(range(10))
MPI = h5.get_config().mpi


def make_new_dset(parent, shape=None, dtype=None, data=None, name=None,
                  chunks=None, compression=None, shuffle=None,
                  fletcher32=None, maxshape=None, compression_opts=None,
                  fillvalue=None, scaleoffset=None, track_times=False,
                  external=None, track_order=None, dcpl=None, dapl=None,
                  efile_prefix=None, virtual_prefix=None, allow_unknown_filter=False,
                  rdcc_nslots=None, rdcc_nbytes=None, rdcc_w0=None):
    """ Return a new low-level dataset identifier """

    # Convert data to a C-contiguous ndarray
    if data is not None and not isinstance(data, Empty):
        data = array_for_new_object(data, specified_dtype=dtype)

    # Validate shape
    if shape is None:
        if data is None:
            if dtype is None:
                raise TypeError("One of data, shape or dtype must be specified")
            data = Empty(dtype)
        shape = data.shape
    else:
        shape = (shape,) if isinstance(shape, int) else tuple(shape)
        if data is not None and (product(shape) != product(data.shape)):
            raise ValueError("Shape tuple is incompatible with data")

    if isinstance(maxshape, int):
        maxshape = (maxshape,)
    tmp_shape = maxshape if maxshape is not None else shape

    # Validate chunk shape
    if isinstance(chunks, int) and not isinstance(chunks, bool):
        chunks = (chunks,)
    if isinstance(chunks, tuple) and any(
        chunk > dim for dim, chunk in zip(tmp_shape, chunks) if dim is not None
    ):
        errmsg = "Chunk shape must not be greater than data shape in any dimension. "\
                 "{} is not compatible with {}".format(chunks, shape)
        raise ValueError(errmsg)

    if isinstance(dtype, Datatype):
        # Named types are used as-is
        tid = dtype.id
        dtype = tid.dtype  # Following code needs this
    else:
        # Validate dtype
        if dtype is None and data is None:
            dtype = numpy.dtype("=f4")
        elif dtype is None and data is not None:
            dtype = data.dtype
        else:
            dtype = numpy.dtype(dtype)
        tid = h5t.py_create(dtype, logical=1)

    # Legacy
    if any((compression, shuffle, fletcher32, maxshape, scaleoffset)) and chunks is False:
        raise ValueError("Chunked format required for given storage options")

    # Legacy
    if compression is True:
        if compression_opts is None:
            compression_opts = 4
        compression = 'gzip'

    # Legacy
    if compression in _LEGACY_GZIP_COMPRESSION_VALS:
        if compression_opts is not None:
            raise TypeError("Conflict in compression options")
        compression_opts = compression
        compression = 'gzip'
    dcpl = filters.fill_dcpl(
        dcpl or h5p.create(h5p.DATASET_CREATE), shape, dtype,
        chunks, compression, compression_opts, shuffle, fletcher32,
        maxshape, scaleoffset, external, allow_unknown_filter)

    if fillvalue is not None:
        # prepare string-type dtypes for fillvalue
        string_info = h5t.check_string_dtype(dtype)
        if string_info is not None:
            # fake vlen dtype for fixed len string fillvalue
            # to not trigger unwanted encoding
            dtype = h5t.string_dtype(string_info.encoding)
            fillvalue = numpy.array(fillvalue, dtype=dtype)
        else:
            fillvalue = numpy.array(fillvalue)
        dcpl.set_fill_value(fillvalue)

    if track_times is None:
        # In case someone explicitly passes None for the default
        track_times = False
    if track_times in (True, False):
        dcpl.set_obj_track_times(track_times)
    else:
        raise TypeError("track_times must be either True or False")
    if track_order is True:
        dcpl.set_attr_creation_order(
            h5p.CRT_ORDER_TRACKED | h5p.CRT_ORDER_INDEXED)
    elif track_order is False:
        dcpl.set_attr_creation_order(0)
    elif track_order is not None:
        raise TypeError("track_order must be either True or False")

    if maxshape is not None:
        maxshape = tuple(m if m is not None else h5s.UNLIMITED for m in maxshape)

    if any([efile_prefix, virtual_prefix, rdcc_nbytes, rdcc_nslots, rdcc_w0]):
        dapl = dapl or h5p.create(h5p.DATASET_ACCESS)

    if efile_prefix is not None:
        dapl.set_efile_prefix(efile_prefix)

    if virtual_prefix is not None:
        dapl.set_virtual_prefix(virtual_prefix)

    if rdcc_nbytes or rdcc_nslots or rdcc_w0:
        cache_settings = list(dapl.get_chunk_cache())
        if rdcc_nslots is not None:
            cache_settings[0] = rdcc_nslots
        if rdcc_nbytes is not None:
            cache_settings[1] = rdcc_nbytes
        if rdcc_w0 is not None:
            cache_settings[2] = rdcc_w0
        dapl.set_chunk_cache(*cache_settings)

    if isinstance(data, Empty):
        sid = h5s.create(h5s.NULL)
    else:
        sid = h5s.create_simple(shape, maxshape)

    dset_id = h5d.create(parent.id, name, tid, sid, dcpl=dcpl, dapl=dapl)

    if (data is not None) and (not isinstance(data, Empty)):
        dset_id.write(h5s.ALL, h5s.ALL, data)

    return dset_id
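
# Illustrative sketch (not part of the library): the public entry point for
# this function is Group.create_dataset(), which forwards its keyword
# arguments here.  The file and dataset names below are hypothetical.
#
#     >>> import h5py, numpy
#     >>> with h5py.File('example.h5', 'w') as f:
#     ...     dset = f.create_dataset('data', shape=(1000,), dtype='f4',
#     ...                             chunks=(100,), compression='gzip',
#     ...                             compression_opts=4)
#     ...     dset[:100] = numpy.arange(100, dtype='f4')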


def open_dset(parent, name, dapl=None, efile_prefix=None, virtual_prefix=None,
              rdcc_nslots=None, rdcc_nbytes=None, rdcc_w0=None, **kwds):
    """ Return an existing low-level dataset identifier """

    if any([efile_prefix, virtual_prefix, rdcc_nbytes, rdcc_nslots, rdcc_w0]):
        dapl = dapl or h5p.create(h5p.DATASET_ACCESS)

    if efile_prefix is not None:
        dapl.set_efile_prefix(efile_prefix)

    if virtual_prefix is not None:
        dapl.set_virtual_prefix(virtual_prefix)

    if rdcc_nbytes or rdcc_nslots or rdcc_w0:
        cache_settings = list(dapl.get_chunk_cache())
        if rdcc_nslots is not None:
            cache_settings[0] = rdcc_nslots
        if rdcc_nbytes is not None:
            cache_settings[1] = rdcc_nbytes
        if rdcc_w0 is not None:
            cache_settings[2] = rdcc_w0
        dapl.set_chunk_cache(*cache_settings)

    dset_id = h5d.open(parent.id, name, dapl=dapl)

    return dset_id
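
# Illustrative sketch (assumptions noted inline): the chunk-cache triple set
# above follows the (nslots, nbytes, w0) ordering returned by
# dapl.get_chunk_cache().  Opening an existing dataset with a 64 MiB chunk
# cache might look like this; the file and dataset names are hypothetical,
# and h5d.open() expects the name as bytes.
#
#     >>> import h5py
#     >>> f = h5py.File('example.h5', 'r')
#     >>> dsid = open_dset(f['/'], b'data', rdcc_nbytes=64 * 1024 ** 2,
#     ...                  rdcc_nslots=12421, rdcc_w0=0.75)
#     >>> dset = Dataset(dsid, readonly=True)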


class AstypeWrapper:
    """Wrapper to convert data on reading from a dataset.
    """
    def __init__(self, dset, dtype):
        self._dset = dset
        self._dtype = numpy.dtype(dtype)

    def __getitem__(self, args):
        return self._dset.__getitem__(args, new_dtype=self._dtype)

    def __len__(self):
        """ Get the length of the underlying dataset

        >>> length = len(dataset.astype('f8'))
        """
        return len(self._dset)

    def __array__(self, dtype=None, copy=True):
        if copy is False:
            raise ValueError(
                f"AstypeWrapper.__array__ received {copy=} "
                f"but memory allocation cannot be avoided on read"
            )

        data = self[:]
        if dtype is not None:
            return data.astype(dtype, copy=False)
        return data


class AsStrWrapper:
    """Wrapper to decode strings on reading the dataset"""
    def __init__(self, dset, encoding, errors='strict'):
        self._dset = dset
        if encoding is None:
            encoding = h5t.check_string_dtype(dset.dtype).encoding
        self.encoding = encoding
        self.errors = errors

    def __getitem__(self, args):
        bytes_arr = self._dset[args]
        # numpy.char.decode() seems like the obvious thing to use. But it only
        # accepts numpy string arrays, not object arrays of bytes (which we
        # return from HDF5 variable-length strings). And the numpy
        # implementation is not faster than doing it with a loop; in fact, by
        # not converting the result to a numpy unicode array, the
        # naive way can be faster! (Comparing with numpy 1.18.4, June 2020)
        if numpy.isscalar(bytes_arr):
            return bytes_arr.decode(self.encoding, self.errors)

        return numpy.array([
            b.decode(self.encoding, self.errors) for b in bytes_arr.flat
        ], dtype=object).reshape(bytes_arr.shape)

    def __len__(self):
        """ Get the length of the underlying dataset

        >>> length = len(dataset.asstr())
        """
        return len(self._dset)

    def __array__(self, dtype=None, copy=True):
        if dtype not in (None, object):
            raise TypeError(
                "AsStrWrapper.__array__ doesn't support the dtype argument"
            )
        if copy is False:
            raise ValueError(
                f"AsStrWrapper.__array__ received {copy=} "
                f"but memory allocation cannot be avoided on read"
            )
        return numpy.array([
            b.decode(self.encoding, self.errors) for b in self._dset
        ], dtype=object).reshape(self._dset.shape)
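
# Illustrative sketch: AsStrWrapper is normally obtained via Dataset.asstr()
# (defined further down) rather than constructed directly.  Reading an HDF5
# string dataset as Python str objects might look like this; the dataset name
# is hypothetical.
#
#     >>> names = f['names'].asstr()[:]                    # use the stored encoding
#     >>> names = f['names'].asstr('utf-8', 'replace')[:]  # explicit decode arguments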


class FieldsWrapper:
    """Wrapper to extract named fields from a dataset with a struct dtype"""
    extract_field = None

    def __init__(self, dset, prior_dtype, names):
        self._dset = dset
        if isinstance(names, str):
            self.extract_field = names
            names = [names]
        self.read_dtype = readtime_dtype(prior_dtype, names)

    def __array__(self, dtype=None, copy=True):
        if copy is False:
            raise ValueError(
                f"FieldsWrapper.__array__ received {copy=} "
                f"but memory allocation cannot be avoided on read"
            )
        data = self[:]
        if dtype is not None:
            return data.astype(dtype, copy=False)
        else:
            return data

    def __getitem__(self, args):
        data = self._dset.__getitem__(args, new_dtype=self.read_dtype)
        if self.extract_field is not None:
            data = data[self.extract_field]
        return data

    def __len__(self):
        """ Get the length of the underlying dataset

        >>> length = len(dataset.fields(['x', 'y']))
        """
        return len(self._dset)


def readtime_dtype(basetype, names):
    """Make a NumPy compound dtype with a subset of available fields"""
    if basetype.names is None:  # Names provided, but not compound
        raise ValueError("Field names only allowed for compound types")

    for name in names:  # Check all names are legal
        if name not in basetype.names:
            raise ValueError("Field %s does not appear in this type." % name)

    return numpy.dtype([(name, basetype.fields[name][0]) for name in names])
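
# Illustrative sketch: FieldsWrapper and readtime_dtype() back the
# Dataset.fields() method defined further down.  For a compound dataset with
# hypothetical fields 'x', 'y' and 'z', reading a subset of fields avoids
# transferring the unwanted columns:
#
#     >>> coords = dset.fields(['x', 'y'])[:]   # compound dtype with two fields
#     >>> xs = dset.fields('x')[:]              # plain array with the field's dtype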


if MPI:
    class CollectiveContext:

        """ Manages collective I/O in MPI mode """

        # We don't bother with _local as threads are forbidden in MPI mode

        def __init__(self, dset):
            self._dset = dset

        def __enter__(self):
            # pylint: disable=protected-access
            self._dset._dxpl.set_dxpl_mpio(h5fd.MPIO_COLLECTIVE)

        def __exit__(self, *args):
            # pylint: disable=protected-access
            self._dset._dxpl.set_dxpl_mpio(h5fd.MPIO_INDEPENDENT)


class ChunkIterator:
    """
    Class to iterate through list of chunks of a given dataset
    """
    def __init__(self, dset, source_sel=None):
        self._shape = dset.shape
        rank = len(dset.shape)

        if not dset.chunks:
            # can only use with chunked datasets
            raise TypeError("Chunked dataset required")

        self._layout = dset.chunks
        if source_sel is None:
            # select over entire dataset
            self._sel = tuple(
                slice(0, self._shape[dim])
                for dim in range(rank)
            )
        else:
            if isinstance(source_sel, slice):
                self._sel = (source_sel,)
            else:
                self._sel = source_sel
        if len(self._sel) != rank:
            raise ValueError("Invalid selection - selection region must have same rank as dataset")
        self._chunk_index = []
        for dim in range(rank):
            s = self._sel[dim]
            if s.start < 0 or s.stop > self._shape[dim] or s.stop <= s.start:
                raise ValueError("Invalid selection - selection region must be within dataset space")
            index = s.start // self._layout[dim]
            self._chunk_index.append(index)

    def __iter__(self):
        return self

    def __next__(self):
        rank = len(self._shape)
        slices = []
        if rank == 0 or self._chunk_index[0] * self._layout[0] >= self._sel[0].stop:
            # ran past the last chunk, end iteration
            raise StopIteration()

        for dim in range(rank):
            s = self._sel[dim]
            start = self._chunk_index[dim] * self._layout[dim]
            stop = (self._chunk_index[dim] + 1) * self._layout[dim]
            # adjust the start if this is an edge chunk
            if start < s.start:
                start = s.start
            if stop > s.stop:
                stop = s.stop  # trim to end of the selection
            s = slice(start, stop, 1)
            slices.append(s)

        # bump up the last index and carry forward if we run outside the selection
        dim = rank - 1
        while dim >= 0:
            s = self._sel[dim]
            self._chunk_index[dim] += 1

            chunk_end = self._chunk_index[dim] * self._layout[dim]
            if chunk_end < s.stop:
                # we still have room to extend along this dimension
                return tuple(slices)

            if dim > 0:
                # reset to the start and continue iterating with higher dimension
                self._chunk_index[dim] = s.start // self._layout[dim]
            dim -= 1
        return tuple(slices)


class Dataset(HLObject):

    """
        Represents an HDF5 dataset
    """

    def astype(self, dtype):
        """ Get a wrapper allowing you to perform reads to a
        different destination type, e.g.:

        >>> double_precision = dataset.astype('f8')[0:100:2]
        """
        return AstypeWrapper(self, dtype)

    def asstr(self, encoding=None, errors='strict'):
        """Get a wrapper to read string data as Python strings:

        >>> str_array = dataset.asstr()[:]

        The parameters have the same meaning as in ``bytes.decode()``.
        If ``encoding`` is unspecified, it will use the encoding in the HDF5
        datatype (either ascii or utf-8).
        """
        string_info = h5t.check_string_dtype(self.dtype)
        if string_info is None:
            raise TypeError(
                "dset.asstr() can only be used on datasets with "
                "an HDF5 string datatype"
            )
        if encoding is None:
            encoding = string_info.encoding
        return AsStrWrapper(self, encoding, errors=errors)

    def fields(self, names, *, _prior_dtype=None):
        """Get a wrapper to read a subset of fields from a compound data type:

        >>> coords_2d = dataset.fields(['x', 'y'])[:]

        If names is a string, a single field is extracted, and the resulting
        arrays will have that dtype. Otherwise, it should be an iterable,
        and the read data will have a compound dtype.
        """
        if _prior_dtype is None:
            _prior_dtype = self.dtype
        return FieldsWrapper(self, _prior_dtype, names)

    if MPI:
        @property
        @with_phil
        def collective(self):
            """ Context manager for MPI collective reads & writes """
            return CollectiveContext(self)

    @property
    def dims(self):
        """ Access dimension scales attached to this dataset. """
        from .dims import DimensionManager
        with phil:
            return DimensionManager(self)

    @property
    @with_phil
    def ndim(self):
        """Numpy-style attribute giving the number of dimensions"""
        return self.id.rank

    @property
    def shape(self):
        """Numpy-style shape tuple giving dataset dimensions"""
        if 'shape' in self._cache_props:
            return self._cache_props['shape']

        with phil:
            shape = self.id.shape

        # If the file is read-only, cache the shape to speed-up future uses.
        # This cache is invalidated by .refresh() when using SWMR.
        if self._readonly:
            self._cache_props['shape'] = shape
        return shape

    @shape.setter
    @with_phil
    def shape(self, shape):
        # pylint: disable=missing-docstring
        self.resize(shape)

    @property
    def size(self):
        """Numpy-style attribute giving the total dataset size"""
        if 'size' in self._cache_props:
            return self._cache_props['size']

        if self._is_empty:
            size = None
        else:
            size = product(self.shape)

        # If the file is read-only, cache the size to speed-up future uses.
        # This cache is invalidated by .refresh() when using SWMR.
        if self._readonly:
            self._cache_props['size'] = size
        return size

    @property
    def nbytes(self):
        """Numpy-style attribute giving the raw dataset size as the number of bytes"""
        size = self.size
        if size is None:  # if we are an empty 0-D array, then there are no bytes in the dataset
            return 0
        return self.dtype.itemsize * size

    @property
    def _selector(self):
        """Internal object for optimised selection of data"""
        if '_selector' in self._cache_props:
            return self._cache_props['_selector']

        slr = _selector.Selector(self.id.get_space())

        # If the file is read-only, cache the reader to speed up future uses.
        # This cache is invalidated by .refresh() when using SWMR.
        if self._readonly:
            self._cache_props['_selector'] = slr
        return slr

    @property
    def _fast_reader(self):
        """Internal object for optimised reading of data"""
        if '_fast_reader' in self._cache_props:
            return self._cache_props['_fast_reader']

        rdr = _selector.Reader(self.id)

        # If the file is read-only, cache the reader to speed up future uses.
        # This cache is invalidated by .refresh() when using SWMR.
        if self._readonly:
            self._cache_props['_fast_reader'] = rdr
        return rdr

    @property
    @with_phil
    def dtype(self):
        """Numpy dtype representing the datatype"""
        return self.id.dtype

    @property
    @with_phil
    def chunks(self):
        """Dataset chunks (or None)"""
        dcpl = self._dcpl
        if dcpl.get_layout() == h5d.CHUNKED:
            return dcpl.get_chunk()
        return None

    @property
    @with_phil
    def compression(self):
        """Compression strategy (or None)"""
        for x in ('gzip', 'lzf', 'szip'):
            if x in self._filters:
                return x
        return None

    @property
    @with_phil
    def compression_opts(self):
        """ Compression setting.  Int(0-9) for gzip, 2-tuple for szip. """
        return self._filters.get(self.compression, None)

    @property
    @with_phil
    def shuffle(self):
        """Shuffle filter present (T/F)"""
        return 'shuffle' in self._filters

    @property
    @with_phil
    def fletcher32(self):
        """Fletcher32 filter is present (T/F)"""
        return 'fletcher32' in self._filters

    @property
    @with_phil
    def scaleoffset(self):
        """Scale/offset filter settings. For integer data types, this is
        the number of bits stored, or 0 for auto-detected. For floating
        point data types, this is the number of decimal places retained.
        If the scale/offset filter is not in use, this is None."""
        try:
            return self._filters['scaleoffset'][1]
        except KeyError:
            return None

    @property
    @with_phil
    def external(self):
        """External file settings. Returns a list of tuples of
        (name, offset, size) for each external file entry, or returns None
        if no external files are used."""
        count = self._dcpl.get_external_count()
        if count <= 0:
            return None
        ext_list = list()
        for x in range(count):
            (name, offset, size) = self._dcpl.get_external(x)
            ext_list.append((filename_decode(name), offset, size))
        return ext_list

    @property
    @with_phil
    def maxshape(self):
        """Shape up to which this dataset can be resized.  Axes with value
        None have no resize limit. """
        space = self.id.get_space()
        dims = space.get_simple_extent_dims(True)
        if dims is None:
            return None

        return tuple(x if x != h5s.UNLIMITED else None for x in dims)

    @property
    @with_phil
    def fillvalue(self):
        """Fill value for this dataset (0 by default)"""
        arr = numpy.zeros((1,), dtype=self.dtype)
        self._dcpl.get_fill_value(arr)
        return arr[0]

    @cached_property
    @with_phil
    def _extent_type(self):
        """Get extent type for this dataset - SIMPLE, SCALAR or NULL"""
        return self.id.get_space().get_simple_extent_type()

    @cached_property
    def _is_empty(self):
        """Check if extent type is empty"""
        return self._extent_type == h5s.NULL

    @with_phil
    def __init__(self, bind, *, readonly=False):
        """ Create a new Dataset object by binding to a low-level DatasetID.
        """
        if not isinstance(bind, h5d.DatasetID):
            raise ValueError("%s is not a DatasetID" % bind)
        super().__init__(bind)

        self._dcpl = self.id.get_create_plist()
        self._dxpl = h5p.create(h5p.DATASET_XFER)
        self._filters = filters.get_filters(self._dcpl)
        self._readonly = readonly
        self._cache_props = {}

    def resize(self, size, axis=None):
        """ Resize the dataset, or the specified axis.

        The dataset must be stored in chunked format; it can be resized up to
        the "maximum shape" (keyword maxshape) specified at creation time.
        The rank of the dataset cannot be changed.

        "Size" should be a shape tuple, or if an axis is specified, an integer.

        BEWARE: This functions differently than the NumPy resize() method!
        The data is not "reshuffled" to fit in the new shape; each axis is
        grown or shrunk independently.  The coordinates of existing data are
        fixed.
        """
        with phil:
            if self.chunks is None:
                raise TypeError("Only chunked datasets can be resized")

            if axis is not None:
                if not (axis >= 0 and axis < self.id.rank):
                    raise ValueError("Invalid axis (0 to %s allowed)" % (self.id.rank - 1))
                try:
                    newlen = int(size)
                except TypeError:
                    raise TypeError("Argument must be a single int if axis is specified")
                size = list(self.shape)
                size[axis] = newlen

            size = tuple(size)
            self.id.set_extent(size)
            #h5f.flush(self.id)  # THG recommends
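
    # Illustrative sketch: resizing requires a chunked dataset created with a
    # maxshape.  The dataset name and sizes below are hypothetical.
    #
    #     >>> dset = f.create_dataset('log', shape=(0,), maxshape=(None,),
    #     ...                         chunks=(1024,), dtype='f8')
    #     >>> dset.resize((dset.shape[0] + 100,))   # grow by 100 rows
    #     >>> dset.resize(500, axis=0)              # or set a single axis directly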

    @with_phil
    def __len__(self):
        """ The size of the first axis.  TypeError if scalar.

        Limited to 2**32 on 32-bit systems; Dataset.len() is preferred.
        """
        size = self.len()
        if size > sys.maxsize:
            raise OverflowError("Value too big for Python's __len__; use Dataset.len() instead.")
        return size

    def len(self):
        """ The size of the first axis.  TypeError if scalar.

        Use of this method is preferred to len(dset), as Python's built-in
        len() cannot handle values greater than 2**32 on 32-bit systems.
        """
        with phil:
            shape = self.shape
            if len(shape) == 0:
                raise TypeError("Attempt to take len() of scalar dataset")
            return shape[0]

    @with_phil
    def __iter__(self):
        """ Iterate over the first axis.  TypeError if scalar.

        BEWARE: Modifications to the yielded data are *NOT* written to file.
        """
        shape = self.shape
        if len(shape) == 0:
            raise TypeError("Can't iterate over a scalar dataset")
        for i in range(shape[0]):
            yield self[i]

    @with_phil
    def iter_chunks(self, sel=None):
        """ Return chunk iterator.  If set, the sel argument is a slice or
        tuple of slices that defines the region to be used. If not set, the
        entire dataspace will be used for the iterator.

        For each chunk within the given region, the iterator yields a tuple of
        slices that gives the intersection of the given chunk with the
        selection area.

        A TypeError will be raised if the dataset is not chunked.

        A ValueError will be raised if the selection region is invalid.

        """
        return ChunkIterator(self, sel)
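
    # Illustrative sketch: iterating chunk-aligned blocks of a chunked dataset
    # (the dataset is hypothetical), processing one chunk at a time.
    #
    #     >>> total = 0.0
    #     >>> for chunk_slices in dset.iter_chunks():
    #     ...     total += dset[chunk_slices].sum()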

    @cached_property
    def _fast_read_ok(self):
        """Is this dataset suitable for simple reading"""
        return (
            self._extent_type == h5s.SIMPLE
            and isinstance(self.id.get_type(), (h5t.TypeIntegerID, h5t.TypeFloatID))
        )

    @with_phil
    def __getitem__(self, args, new_dtype=None):
        """ Read a slice from the HDF5 dataset.

        Takes slices and recarray-style field names (more than one is
        allowed!) in any order.  Obeys basic NumPy rules, including
        broadcasting.

        Also supports:

        * Boolean "mask" array indexing
        """
        args = args if isinstance(args, tuple) else (args,)

        if self._fast_read_ok and (new_dtype is None):
            try:
                return self._fast_reader.read(args)
            except TypeError:
                pass  # Fall back to Python read pathway below

        if self._is_empty:
            # Check 'is Ellipsis' to avoid equality comparison with an array:
            # array equality returns an array, not a boolean.
            if args == () or (len(args) == 1 and args[0] is Ellipsis):
                return Empty(self.dtype)
            raise ValueError("Empty datasets cannot be sliced")

        # Sort field names from the rest of the args.
        names = tuple(x for x in args if isinstance(x, str))

        if names:
            # Read a subset of the fields in this structured dtype
            if len(names) == 1:
                names = names[0]  # Read with simpler dtype of this field
            args = tuple(x for x in args if not isinstance(x, str))
            return self.fields(names, _prior_dtype=new_dtype)[args]

        if new_dtype is None:
            new_dtype = self.dtype
        mtype = h5t.py_create(new_dtype)

        # === Special-case region references ====

        if len(args) == 1 and isinstance(args[0], h5r.RegionReference):

            obj = h5r.dereference(args[0], self.id)
            if obj != self.id:
                raise ValueError("Region reference must point to this dataset")

            sid = h5r.get_region(args[0], self.id)
            mshape = sel.guess_shape(sid)
            if mshape is None:
                # 0D with no data (NULL or deselected SCALAR)
                return Empty(new_dtype)
            out = numpy.zeros(mshape, dtype=new_dtype)
            if out.size == 0:
                return out

            sid_out = h5s.create_simple(mshape)
            sid_out.select_all()
            self.id.read(sid_out, sid, out, mtype)
            return out

        # === Check for zero-sized datasets =====

        if self.size == 0:
            # Check 'is Ellipsis' to avoid equality comparison with an array:
            # array equality returns an array, not a boolean.
            if args == () or (len(args) == 1 and args[0] is Ellipsis):
                return numpy.zeros(self.shape, dtype=new_dtype)

        # === Scalar dataspaces =================

        if self.shape == ():
            fspace = self.id.get_space()
            selection = sel2.select_read(fspace, args)
            if selection.mshape is None:
                arr = numpy.zeros((), dtype=new_dtype)
            else:
                arr = numpy.zeros(selection.mshape, dtype=new_dtype)
            for mspace, fspace in selection:
                self.id.read(mspace, fspace, arr, mtype)
            if selection.mshape is None:
                return arr[()]
            return arr

        # === Everything else ===================

        # Perform the dataspace selection.
        selection = sel.select(self.shape, args, dataset=self)

        if selection.nselect == 0:
            return numpy.zeros(selection.array_shape, dtype=new_dtype)

        arr = numpy.zeros(selection.array_shape, new_dtype, order='C')

        # Perform the actual read
        mspace = h5s.create_simple(selection.mshape)
        fspace = selection.id
        self.id.read(mspace, fspace, arr, mtype, dxpl=self._dxpl)

        # Patch up the output for NumPy
        if arr.shape == ():
            return arr[()]  # 0 dim array -> numpy scalar
        return arr
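
    # Illustrative sketch of the read paths handled above (the dataset name is
    # hypothetical): hyperslab slicing, recarray-style field names and boolean
    # mask selection all go through __getitem__.
    #
    #     >>> block = dset[10:20, ::2]        # hyperslab selection
    #     >>> xs = dset['x', :100]            # single field of a compound type
    #     >>> big = dset[dset[:] > 0.5]       # boolean mask (reads the data twice)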

    @with_phil
    def __setitem__(self, args, val):
        """ Write to the HDF5 dataset from a Numpy array.

        NumPy's broadcasting rules are honored, for "simple" indexing
        (slices and integers).  For advanced indexing, the shapes must
        match.
        """
        args = args if isinstance(args, tuple) else (args,)

        # Sort field indices from the slicing
        names = tuple(x for x in args if isinstance(x, str))
        args = tuple(x for x in args if not isinstance(x, str))

        # Generally we try to avoid converting the arrays on the Python
        # side.  However, for compound literals this is unavoidable.
        vlen = h5t.check_vlen_dtype(self.dtype)
        if vlen is not None and vlen not in (bytes, str):
            try:
                val = numpy.asarray(val, dtype=vlen)
            except (ValueError, TypeError):
                try:
                    val = numpy.array([numpy.array(x, dtype=vlen)
                                       for x in val], dtype=self.dtype)
                except (ValueError, TypeError):
                    pass
            if vlen == val.dtype:
                if val.ndim > 1:
                    tmp = numpy.empty(shape=val.shape[:-1], dtype=object)
                    tmp.ravel()[:] = [i for i in val.reshape(
                        (product(val.shape[:-1]), val.shape[-1])
                    )]
                else:
                    tmp = numpy.array([None], dtype=object)
                    tmp[0] = val
                val = tmp
        elif self.dtype.kind == "O" or \
            (self.dtype.kind == 'V' and
             (not isinstance(val, numpy.ndarray) or val.dtype.kind != 'V') and
             (self.dtype.subdtype is None)):
            if len(names) == 1 and self.dtype.fields is not None:
                # Single field selected for write, from a non-array source
                if not names[0] in self.dtype.fields:
                    raise ValueError("No such field for indexing: %s" % names[0])
                dtype = self.dtype.fields[names[0]][0]
                cast_compound = True
            else:
                dtype = self.dtype
                cast_compound = False

            val = numpy.asarray(val, dtype=dtype.base, order='C')
            if cast_compound:
                val = val.view(numpy.dtype([(names[0], dtype)]))
                val = val.reshape(val.shape[:len(val.shape) - len(dtype.shape)])
        elif (self.dtype.kind == 'S'
              and (h5t.check_string_dtype(self.dtype).encoding == 'utf-8')
              and (find_item_type(val) is str)
        ):
            # Writing str objects to a fixed-length UTF-8 string dataset.
            # Numpy's normal conversion only handles ASCII characters, but
            # when the destination is UTF-8, we want to allow any unicode.
            # This *doesn't* handle numpy fixed-length unicode data ('U' dtype),
            # as HDF5 has no equivalent, and converting fixed length UTF-32
            # to variable length UTF-8 would obscure what's going on.
            str_array = numpy.asarray(val, order='C', dtype=object)
            val = numpy.array([
                s.encode('utf-8') for s in str_array.flat
            ], dtype=self.dtype).reshape(str_array.shape)
        else:
            # If the input data is already an array, let HDF5 do the conversion.
            # If it's a list or similar, don't make numpy guess a dtype for it.
            dt = None if isinstance(val, numpy.ndarray) else self.dtype.base
            val = numpy.asarray(val, order='C', dtype=dt)

        # Check for array dtype compatibility and convert
        if self.dtype.subdtype is not None:
            shp = self.dtype.subdtype[1]
            valshp = val.shape[-len(shp):]
            if valshp != shp:  # Last dimension has to match
                raise TypeError("When writing to array types, last N dimensions have to match (got %s, but should be %s)" % (valshp, shp,))
            mtype = h5t.py_create(numpy.dtype((val.dtype, shp)))
            mshape = val.shape[0:len(val.shape) - len(shp)]

        # Make a compound memory type if field-name slicing is required
        elif len(names) != 0:

            mshape = val.shape

            # Catch common errors
            if self.dtype.fields is None:
                raise TypeError("Illegal slicing argument (not a compound dataset)")
            mismatch = [x for x in names if x not in self.dtype.fields]
            if len(mismatch) != 0:
                mismatch = ", ".join('"%s"' % x for x in mismatch)
                raise ValueError("Illegal slicing argument (fields %s not in dataset type)" % mismatch)

            # Write non-compound source into a single dataset field
            if len(names) == 1 and val.dtype.fields is None:
                subtype = h5t.py_create(val.dtype)
                mtype = h5t.create(h5t.COMPOUND, subtype.get_size())
                mtype.insert(self._e(names[0]), 0, subtype)

            # Make a new source type keeping only the requested fields
            else:
                fieldnames = [x for x in val.dtype.names if x in names]  # Keep source order
                mtype = h5t.create(h5t.COMPOUND, val.dtype.itemsize)
                for fieldname in fieldnames:
                    subtype = h5t.py_create(val.dtype.fields[fieldname][0])
                    offset = val.dtype.fields[fieldname][1]
                    mtype.insert(self._e(fieldname), offset, subtype)

        # Use mtype derived from array (let DatasetID.write figure it out)
        else:
            mshape = val.shape
            mtype = None

        # Perform the dataspace selection
        selection = sel.select(self.shape, args, dataset=self)

        if selection.nselect == 0:
            return

        # Broadcast scalars if necessary.
        # In order to avoid slow broadcasting filling the destination by
        # the scalar value, we create an intermediate array of the same
        # size as the destination buffer, provided that size is reasonable.
        # We treat the size as reasonable if it is no larger than the
        # dataset chunk size, if any.
        # If the destination dataset is not chunked, or the selection is
        # larger than the dataset chunk size, we fall back to an intermediate
        # array whose size equals the last dimension of the destination
        # buffer.
        # The reasoning behind this is that the creator of the dataset can be
        # assumed to have chosen a chunk size appropriate for the available
        # memory.  In any case, if we cannot afford to create an intermediate
        # array of the same size as the dataset chunk size, the user program
        # has little hope of going much further.  Solves h5py issue #1067.
        if mshape == () and selection.array_shape != ():
            if self.dtype.subdtype is not None:
                raise TypeError("Scalar broadcasting is not supported for array dtypes")
            if self.chunks and (product(self.chunks) >= product(selection.array_shape)):
                val2 = numpy.empty(selection.array_shape, dtype=val.dtype)
            else:
                val2 = numpy.empty(selection.array_shape[-1], dtype=val.dtype)
            val2[...] = val
            val = val2
            mshape = val.shape

        # Perform the write, with broadcasting
        mspace = h5s.create_simple(selection.expand_shape(mshape))
        for fspace in selection.broadcast(mshape):
            self.id.write(mspace, fspace, val, mtype, dxpl=self._dxpl)

    def read_direct(self, dest, source_sel=None, dest_sel=None):
        """ Read data directly from HDF5 into an existing NumPy array.

        The destination array must be C-contiguous and writable.
        Selections must be the output of numpy.s_[<args>].

        Broadcasting is supported for simple indexing.
        """
        with phil:
            if self._is_empty:
                raise TypeError("Empty datasets have no numpy representation")
            if source_sel is None:
                source_sel = sel.SimpleSelection(self.shape)
            else:
                source_sel = sel.select(self.shape, source_sel, self)  # for numpy.s_
            fspace = source_sel.id

            if dest_sel is None:
                dest_sel = sel.SimpleSelection(dest.shape)
            else:
                dest_sel = sel.select(dest.shape, dest_sel)

            for mspace in dest_sel.broadcast(source_sel.array_shape):
                self.id.read(mspace, fspace, dest, dxpl=self._dxpl)

    def write_direct(self, source, source_sel=None, dest_sel=None):
        """ Write data directly to HDF5 from a NumPy array.

        The source array must be C-contiguous.  Selections must be
        the output of numpy.s_[<args>].

        Broadcasting is supported for simple indexing.
        """
        with phil:
            if self._is_empty:
                raise TypeError("Empty datasets cannot be written to")
            if source_sel is None:
                source_sel = sel.SimpleSelection(source.shape)
            else:
                source_sel = sel.select(source.shape, source_sel)  # for numpy.s_
            mspace = source_sel.id

            if dest_sel is None:
                dest_sel = sel.SimpleSelection(self.shape)
            else:
                dest_sel = sel.select(self.shape, dest_sel, self)

            for fspace in dest_sel.broadcast(source_sel.array_shape):
                self.id.write(mspace, fspace, source, dxpl=self._dxpl)
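
    # Illustrative sketch: read_direct()/write_direct() reuse an existing NumPy
    # buffer instead of allocating a new array on every read (names and shapes
    # are hypothetical).
    #
    #     >>> buf = numpy.empty((100,), dtype=dset.dtype)
    #     >>> dset.read_direct(buf, source_sel=numpy.s_[0:100], dest_sel=numpy.s_[:])
    #     >>> dset.write_direct(buf, source_sel=numpy.s_[:], dest_sel=numpy.s_[100:200])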

    @with_phil
    def __array__(self, dtype=None, copy=True):
        """ Create a Numpy array containing the whole dataset.  DON'T THINK
        THIS MEANS DATASETS ARE INTERCHANGEABLE WITH ARRAYS.  For one thing,
        you have to read the whole dataset every time this method is called.
        """
        if copy is False:
            raise ValueError(
                f"Dataset.__array__ received {copy=} "
                f"but memory allocation cannot be avoided on read"
            )
        arr = numpy.zeros(self.shape, dtype=self.dtype if dtype is None else dtype)

        # Special case for (0,)*-shape datasets
        if self.size == 0:
            return arr

        self.read_direct(arr)
        return arr

    @with_phil
    def __repr__(self):
        if not self:
            r = '<Closed HDF5 dataset>'
        else:
            if self.name is None:
                namestr = '("anonymous")'
            else:
                name = pp.basename(pp.normpath(self.name))
                namestr = '"%s"' % (name if name != '' else '/')
            r = '<HDF5 dataset %s: shape %s, type "%s">' % (
                namestr, self.shape, self.dtype.str
            )
        return r

    if hasattr(h5d.DatasetID, "refresh"):
        @with_phil
        def refresh(self):
            """ Refresh the dataset metadata by reloading from the file.

            This is part of the SWMR features and only exists when the HDF5
            library version is >= 1.9.178.
            """
            self._id.refresh()
            self._cache_props.clear()

    if hasattr(h5d.DatasetID, "flush"):
        @with_phil
        def flush(self):
            """ Flush the dataset data and metadata to the file.
            If the dataset is chunked, raw data chunks are written to the file.

            This is part of the SWMR features and only exists when the HDF5
            library version is >= 1.9.178.
            """
            self._id.flush()

    if vds_support:
        @property
        @with_phil
        def is_virtual(self):
            """Check if this is a virtual dataset"""
            return self._dcpl.get_layout() == h5d.VIRTUAL

        @with_phil
        def virtual_sources(self):
            """Get a list of the data mappings for a virtual dataset"""
            if not self.is_virtual:
                raise RuntimeError("Not a virtual dataset")
            dcpl = self._dcpl
            return [
                VDSmap(dcpl.get_virtual_vspace(j),
                       dcpl.get_virtual_filename(j),
                       dcpl.get_virtual_dsetname(j),
                       dcpl.get_virtual_srcspace(j))
                for j in range(dcpl.get_virtual_count())]

    @with_phil
    def make_scale(self, name=''):
        """Make this dataset an HDF5 dimension scale.

        You can then attach it to dimensions of other datasets like this::

            other_ds.dims[0].attach_scale(ds)

        You can optionally pass a name to associate with this scale.
        """
        h5ds.set_scale(self._id, self._e(name))

    @property
    @with_phil
    def is_scale(self):
        """Return ``True`` if this dataset is also a dimension scale.

        Return ``False`` otherwise.
        """
        return h5ds.is_scale(self._id)