Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/h5py/_hl/dataset.py: 21%
633 statements
coverage.py v7.4.0, created at 2024-01-03 07:57 +0000
# This file is part of h5py, a Python interface to the HDF5 library.
#
# http://www.h5py.org
#
# Copyright 2008-2020 Andrew Collette and contributors
#
# License: Standard 3-clause BSD; see "license.txt" for full license terms
# and contributor agreement.

"""
    Implements support for high-level dataset access.
"""

import posixpath as pp
import sys

import numpy

from .. import h5, h5s, h5t, h5r, h5d, h5p, h5fd, h5ds, _selector
from .base import (
    array_for_new_object, cached_property, Empty, find_item_type, HLObject,
    phil, product, with_phil,
)
from . import filters
from . import selections as sel
from . import selections2 as sel2
from .datatype import Datatype
from .compat import filename_decode
from .vds import VDSmap, vds_support

_LEGACY_GZIP_COMPRESSION_VALS = frozenset(range(10))
MPI = h5.get_config().mpi
def make_new_dset(parent, shape=None, dtype=None, data=None, name=None,
                  chunks=None, compression=None, shuffle=None,
                  fletcher32=None, maxshape=None, compression_opts=None,
                  fillvalue=None, scaleoffset=None, track_times=False,
                  external=None, track_order=None, dcpl=None, dapl=None,
                  efile_prefix=None, virtual_prefix=None, allow_unknown_filter=False,
                  rdcc_nslots=None, rdcc_nbytes=None, rdcc_w0=None):
    """ Return a new low-level dataset identifier """

    # Convert data to a C-contiguous ndarray
    if data is not None and not isinstance(data, Empty):
        data = array_for_new_object(data, specified_dtype=dtype)

    # Validate shape
    if shape is None:
        if data is None:
            if dtype is None:
                raise TypeError("One of data, shape or dtype must be specified")
            data = Empty(dtype)
        shape = data.shape
    else:
        shape = (shape,) if isinstance(shape, int) else tuple(shape)
        if data is not None and (product(shape) != product(data.shape)):
            raise ValueError("Shape tuple is incompatible with data")

    if isinstance(maxshape, int):
        maxshape = (maxshape,)
    tmp_shape = maxshape if maxshape is not None else shape

    # Validate chunk shape
    if isinstance(chunks, int) and not isinstance(chunks, bool):
        chunks = (chunks,)
    if isinstance(chunks, tuple) and any(
        chunk > dim for dim, chunk in zip(tmp_shape, chunks) if dim is not None
    ):
        errmsg = "Chunk shape must not be greater than data shape in any dimension. "\
                 "{} is not compatible with {}".format(chunks, shape)
        raise ValueError(errmsg)
    if isinstance(dtype, Datatype):
        # Named types are used as-is
        tid = dtype.id
        dtype = tid.dtype  # Following code needs this
    else:
        # Validate dtype
        if dtype is None and data is None:
            dtype = numpy.dtype("=f4")
        elif dtype is None and data is not None:
            dtype = data.dtype
        else:
            dtype = numpy.dtype(dtype)
        tid = h5t.py_create(dtype, logical=1)

    # Legacy
    if any((compression, shuffle, fletcher32, maxshape, scaleoffset)) and chunks is False:
        raise ValueError("Chunked format required for given storage options")

    # Legacy
    if compression is True:
        if compression_opts is None:
            compression_opts = 4
        compression = 'gzip'

    # Legacy
    if compression in _LEGACY_GZIP_COMPRESSION_VALS:
        if compression_opts is not None:
            raise TypeError("Conflict in compression options")
        compression_opts = compression
        compression = 'gzip'
    dcpl = filters.fill_dcpl(
        dcpl or h5p.create(h5p.DATASET_CREATE), shape, dtype,
        chunks, compression, compression_opts, shuffle, fletcher32,
        maxshape, scaleoffset, external, allow_unknown_filter)

    if fillvalue is not None:
        # prepare string-type dtypes for fillvalue
        string_info = h5t.check_string_dtype(dtype)
        if string_info is not None:
            # fake vlen dtype for fixed len string fillvalue
            # to not trigger unwanted encoding
            dtype = h5t.string_dtype(string_info.encoding)
            fillvalue = numpy.array(fillvalue, dtype=dtype)
        else:
            fillvalue = numpy.array(fillvalue)
        dcpl.set_fill_value(fillvalue)
    if track_times is None:
        # In case someone explicitly passes None for the default
        track_times = False
    if track_times in (True, False):
        dcpl.set_obj_track_times(track_times)
    else:
        raise TypeError("track_times must be either True or False")
    if track_order is True:
        dcpl.set_attr_creation_order(
            h5p.CRT_ORDER_TRACKED | h5p.CRT_ORDER_INDEXED)
    elif track_order is False:
        dcpl.set_attr_creation_order(0)
    elif track_order is not None:
        raise TypeError("track_order must be either True or False")

    if maxshape is not None:
        maxshape = tuple(m if m is not None else h5s.UNLIMITED for m in maxshape)

    if any([efile_prefix, virtual_prefix, rdcc_nbytes, rdcc_nslots, rdcc_w0]):
        dapl = dapl or h5p.create(h5p.DATASET_ACCESS)

    if efile_prefix is not None:
        dapl.set_efile_prefix(efile_prefix)

    if virtual_prefix is not None:
        dapl.set_virtual_prefix(virtual_prefix)

    if rdcc_nbytes or rdcc_nslots or rdcc_w0:
        cache_settings = list(dapl.get_chunk_cache())
        if rdcc_nslots is not None:
            cache_settings[0] = rdcc_nslots
        if rdcc_nbytes is not None:
            cache_settings[1] = rdcc_nbytes
        if rdcc_w0 is not None:
            cache_settings[2] = rdcc_w0
        dapl.set_chunk_cache(*cache_settings)

    if isinstance(data, Empty):
        sid = h5s.create(h5s.NULL)
    else:
        sid = h5s.create_simple(shape, maxshape)

    dset_id = h5d.create(parent.id, name, tid, sid, dcpl=dcpl, dapl=dapl)

    if (data is not None) and (not isinstance(data, Empty)):
        dset_id.write(h5s.ALL, h5s.ALL, data)

    return dset_id
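
# The keyword arguments handled above are normally supplied through the
# high-level h5py.File.create_dataset() call, which forwards them here.
# A minimal usage sketch follows; the file name, dataset name and sizes are
# purely illustrative.
def _example_make_chunked_dataset():
    import h5py
    import numpy as np

    with h5py.File("example.h5", "w") as f:
        dset = f.create_dataset(
            "data",
            shape=(1000, 1000),
            dtype="f4",
            chunks=(100, 100),       # chunked layout, required for compression/resizing
            compression="gzip",
            compression_opts=4,      # legacy form: compression=4 also maps to gzip level 4
            maxshape=(None, 1000),   # None is translated to h5s.UNLIMITED above
            fillvalue=0.0,
        )
        dset[0, :] = np.arange(1000, dtype="f4")
        return dset.shape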
def open_dset(parent, name, dapl=None, efile_prefix=None, virtual_prefix=None,
              rdcc_nslots=None, rdcc_nbytes=None, rdcc_w0=None, **kwds):
    """ Return an existing low-level dataset identifier """

    if any([efile_prefix, virtual_prefix, rdcc_nbytes, rdcc_nslots, rdcc_w0]):
        dapl = dapl or h5p.create(h5p.DATASET_ACCESS)

    if efile_prefix is not None:
        dapl.set_efile_prefix(efile_prefix)

    if virtual_prefix is not None:
        dapl.set_virtual_prefix(virtual_prefix)

    if rdcc_nbytes or rdcc_nslots or rdcc_w0:
        cache_settings = list(dapl.get_chunk_cache())
        if rdcc_nslots is not None:
            cache_settings[0] = rdcc_nslots
        if rdcc_nbytes is not None:
            cache_settings[1] = rdcc_nbytes
        if rdcc_w0 is not None:
            cache_settings[2] = rdcc_w0
        dapl.set_chunk_cache(*cache_settings)

    dset_id = h5d.open(parent.id, name, dapl=dapl)

    return dset_id
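
# A minimal sketch of what the chunk-cache handling above amounts to: the
# dataset access property list (dapl) carries a (nslots, nbytes, w0) triple,
# and indices 0/1/2 of cache_settings correspond to those three values.
# File and dataset names are illustrative; real code normally reaches
# open_dset() indirectly through the high-level h5py API.
def _example_open_with_larger_chunk_cache():
    import h5py
    from h5py import h5d, h5p

    with h5py.File("example.h5", "r") as f:
        dapl = h5p.create(h5p.DATASET_ACCESS)
        nslots, nbytes, w0 = dapl.get_chunk_cache()
        dapl.set_chunk_cache(nslots, 64 * 1024 * 1024, w0)  # grow the raw-chunk cache to 64 MiB
        dset_id = h5d.open(f.id, b"data", dapl=dapl)        # same call open_dset() makes above
        return dset_id.shape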
class AstypeWrapper:
    """Wrapper to convert data on reading from a dataset.
    """
    def __init__(self, dset, dtype):
        self._dset = dset
        self._dtype = numpy.dtype(dtype)

    def __getitem__(self, args):
        return self._dset.__getitem__(args, new_dtype=self._dtype)

    def __len__(self):
        """ Get the length of the underlying dataset

        >>> length = len(dataset.astype('f8'))
        """
        return len(self._dset)

    def __array__(self, dtype=None):
        data = self[:]
        if dtype is not None:
            data = data.astype(dtype)
        return data
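
# A minimal sketch of AstypeWrapper in use via Dataset.astype(): data stored as
# float32 is converted to float64 while it is read, without first materialising
# a float32 array in memory. The argument name is illustrative.
def _example_read_as_double(dset):
    # dset is assumed to be an h5py Dataset with a float32 datatype
    double_precision = dset.astype('f8')[0:100:2]   # conversion happens during the read
    return double_precision.dtype                   # numpy.dtype('float64')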
class AsStrWrapper:
    """Wrapper to decode strings on reading the dataset"""
    def __init__(self, dset, encoding, errors='strict'):
        self._dset = dset
        if encoding is None:
            encoding = h5t.check_string_dtype(dset.dtype).encoding
        self.encoding = encoding
        self.errors = errors

    def __getitem__(self, args):
        bytes_arr = self._dset[args]
        # numpy.char.decode() seems like the obvious thing to use. But it only
        # accepts numpy string arrays, not object arrays of bytes (which we
        # return from HDF5 variable-length strings). And the numpy
        # implementation is not faster than doing it with a loop; in fact, by
        # not converting the result to a numpy unicode array, the
        # naive way can be faster! (Comparing with numpy 1.18.4, June 2020)
        if numpy.isscalar(bytes_arr):
            return bytes_arr.decode(self.encoding, self.errors)

        return numpy.array([
            b.decode(self.encoding, self.errors) for b in bytes_arr.flat
        ], dtype=object).reshape(bytes_arr.shape)

    def __len__(self):
        """ Get the length of the underlying dataset

        >>> length = len(dataset.asstr())
        """
        return len(self._dset)

    def __array__(self):
        return numpy.array([
            b.decode(self.encoding, self.errors) for b in self._dset
        ], dtype=object).reshape(self._dset.shape)
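
# A minimal sketch of AsStrWrapper in use via Dataset.asstr(): HDF5 string data
# comes back as bytes by default, and .asstr() decodes it to Python str using
# the encoding recorded in the HDF5 datatype. Names below are illustrative.
def _example_read_strings():
    import h5py

    with h5py.File("strings.h5", "w") as f:
        dset = f.create_dataset("words", data=["alpha", "beta"],
                                dtype=h5py.string_dtype(encoding="utf-8"))
        raw = dset[0]            # b'alpha' (bytes)
        text = dset.asstr()[0]   # 'alpha' (str, decoded as utf-8)
        return raw, text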
class FieldsWrapper:
    """Wrapper to extract named fields from a dataset with a struct dtype"""
    extract_field = None

    def __init__(self, dset, prior_dtype, names):
        self._dset = dset
        if isinstance(names, str):
            self.extract_field = names
            names = [names]
        self.read_dtype = readtime_dtype(prior_dtype, names)

    def __array__(self, dtype=None):
        data = self[:]
        if dtype is not None:
            data = data.astype(dtype)
        return data

    def __getitem__(self, args):
        data = self._dset.__getitem__(args, new_dtype=self.read_dtype)
        if self.extract_field is not None:
            data = data[self.extract_field]
        return data

    def __len__(self):
        """ Get the length of the underlying dataset

        >>> length = len(dataset.fields(['x', 'y']))
        """
        return len(self._dset)


def readtime_dtype(basetype, names):
    """Make a NumPy compound dtype with a subset of available fields"""
    if basetype.names is None:  # Names provided, but not compound
        raise ValueError("Field names only allowed for compound types")

    for name in names:  # Check all names are legal
        if name not in basetype.names:
            raise ValueError("Field %s does not appear in this type." % name)

    return numpy.dtype([(name, basetype.fields[name][0]) for name in names])
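
# A minimal sketch of FieldsWrapper/readtime_dtype in use via Dataset.fields():
# only the requested fields of a compound dtype are read from the file.
# The field, file and dataset names are illustrative.
def _example_read_compound_fields():
    import h5py
    import numpy as np

    dt = np.dtype([('x', 'f8'), ('y', 'f8'), ('label', 'i4')])
    with h5py.File("points.h5", "w") as f:
        dset = f.create_dataset("points", shape=(100,), dtype=dt)
        coords = dset.fields(['x', 'y'])[:]   # compound dtype with just 'x' and 'y'
        xs = dset.fields('x')[:]              # plain float64 array for a single field
        return coords.dtype.names, xs.dtype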
if MPI:
    class CollectiveContext:

        """ Manages collective I/O in MPI mode """

        # We don't bother with _local as threads are forbidden in MPI mode

        def __init__(self, dset):
            self._dset = dset

        def __enter__(self):
            # pylint: disable=protected-access
            self._dset._dxpl.set_dxpl_mpio(h5fd.MPIO_COLLECTIVE)

        def __exit__(self, *args):
            # pylint: disable=protected-access
            self._dset._dxpl.set_dxpl_mpio(h5fd.MPIO_INDEPENDENT)
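
    # A minimal sketch of CollectiveContext in use via Dataset.collective.
    # It assumes h5py built with parallel HDF5 and a run under MPI
    # (e.g. mpiexec -n 4 python script.py); file and dataset names are illustrative.
    def _example_collective_write():
        import h5py
        from mpi4py import MPI as _MPI

        comm = _MPI.COMM_WORLD
        with h5py.File("parallel.h5", "w", driver="mpio", comm=comm) as f:
            dset = f.create_dataset("data", shape=(comm.size, 100), dtype="f8")
            with dset.collective:               # switches the transfer plist to MPIO_COLLECTIVE
                dset[comm.rank, :] = comm.rank  # every rank must take part in the write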
class ChunkIterator:
    """
    Class to iterate through list of chunks of a given dataset
    """
    def __init__(self, dset, source_sel=None):
        self._shape = dset.shape
        rank = len(dset.shape)

        if not dset.chunks:
            # can only use with chunked datasets
            raise TypeError("Chunked dataset required")

        self._layout = dset.chunks
        if source_sel is None:
            # select over entire dataset
            slices = []
            for dim in range(rank):
                slices.append(slice(0, self._shape[dim]))
            self._sel = tuple(slices)
        else:
            if isinstance(source_sel, slice):
                self._sel = (source_sel,)
            else:
                self._sel = source_sel
        if len(self._sel) != rank:
            raise ValueError("Invalid selection - selection region must have same rank as dataset")
        self._chunk_index = []
        for dim in range(rank):
            s = self._sel[dim]
            if s.start < 0 or s.stop > self._shape[dim] or s.stop <= s.start:
                raise ValueError("Invalid selection - selection region must be within dataset space")
            index = s.start // self._layout[dim]
            self._chunk_index.append(index)

    def __iter__(self):
        return self

    def __next__(self):
        rank = len(self._shape)
        slices = []
        if rank == 0 or self._chunk_index[0] * self._layout[0] >= self._sel[0].stop:
            # ran past the last chunk, end iteration
            raise StopIteration()

        for dim in range(rank):
            s = self._sel[dim]
            start = self._chunk_index[dim] * self._layout[dim]
            stop = (self._chunk_index[dim] + 1) * self._layout[dim]
            # adjust the start if this is an edge chunk
            if start < s.start:
                start = s.start
            if stop > s.stop:
                stop = s.stop  # trim to end of the selection
            s = slice(start, stop, 1)
            slices.append(s)

        # bump up the last index and carry forward if we run outside the selection
        dim = rank - 1
        while dim >= 0:
            s = self._sel[dim]
            self._chunk_index[dim] += 1

            chunk_end = self._chunk_index[dim] * self._layout[dim]
            if chunk_end < s.stop:
                # we still have room to extend along this dimension
                return tuple(slices)

            if dim > 0:
                # reset to the start and continue iterating with higher dimension
                self._chunk_index[dim] = 0
            dim -= 1
        return tuple(slices)
class Dataset(HLObject):

    """
        Represents an HDF5 dataset
    """

    def astype(self, dtype):
        """ Get a wrapper allowing you to perform reads to a
        different destination type, e.g.:

        >>> double_precision = dataset.astype('f8')[0:100:2]
        """
        return AstypeWrapper(self, dtype)

    def asstr(self, encoding=None, errors='strict'):
        """Get a wrapper to read string data as Python strings:

        >>> str_array = dataset.asstr()[:]

        The parameters have the same meaning as in ``bytes.decode()``.
        If ``encoding`` is unspecified, it will use the encoding in the HDF5
        datatype (either ascii or utf-8).
        """
        string_info = h5t.check_string_dtype(self.dtype)
        if string_info is None:
            raise TypeError(
                "dset.asstr() can only be used on datasets with "
                "an HDF5 string datatype"
            )
        if encoding is None:
            encoding = string_info.encoding
        return AsStrWrapper(self, encoding, errors=errors)

    def fields(self, names, *, _prior_dtype=None):
        """Get a wrapper to read a subset of fields from a compound data type:

        >>> 2d_coords = dataset.fields(['x', 'y'])[:]

        If names is a string, a single field is extracted, and the resulting
        arrays will have that dtype. Otherwise, it should be an iterable,
        and the read data will have a compound dtype.
        """
        if _prior_dtype is None:
            _prior_dtype = self.dtype
        return FieldsWrapper(self, _prior_dtype, names)
    if MPI:
        @property
        @with_phil
        def collective(self):
            """ Context manager for MPI collective reads & writes """
            return CollectiveContext(self)

    @property
    def dims(self):
        """ Access dimension scales attached to this dataset. """
        from .dims import DimensionManager
        with phil:
            return DimensionManager(self)

    @property
    @with_phil
    def ndim(self):
        """Numpy-style attribute giving the number of dimensions"""
        return self.id.rank

    @property
    def shape(self):
        """Numpy-style shape tuple giving dataset dimensions"""
        if 'shape' in self._cache_props:
            return self._cache_props['shape']

        with phil:
            shape = self.id.shape

        # If the file is read-only, cache the shape to speed-up future uses.
        # This cache is invalidated by .refresh() when using SWMR.
        if self._readonly:
            self._cache_props['shape'] = shape
        return shape

    @shape.setter
    @with_phil
    def shape(self, shape):
        # pylint: disable=missing-docstring
        self.resize(shape)

    @property
    def size(self):
        """Numpy-style attribute giving the total dataset size"""
        if 'size' in self._cache_props:
            return self._cache_props['size']

        if self._is_empty:
            size = None
        else:
            size = product(self.shape)

        # If the file is read-only, cache the size to speed-up future uses.
        # This cache is invalidated by .refresh() when using SWMR.
        if self._readonly:
            self._cache_props['size'] = size
        return size

    @property
    def nbytes(self):
        """Numpy-style attribute giving the raw dataset size as the number of bytes"""
        size = self.size
        if size is None:  # if we are an empty 0-D array, then there are no bytes in the dataset
            return 0
        return self.dtype.itemsize * size

    @property
    def _selector(self):
        """Internal object for optimised selection of data"""
        if '_selector' in self._cache_props:
            return self._cache_props['_selector']

        slr = _selector.Selector(self.id.get_space())

        # If the file is read-only, cache the reader to speed up future uses.
        # This cache is invalidated by .refresh() when using SWMR.
        if self._readonly:
            self._cache_props['_selector'] = slr
        return slr

    @property
    def _fast_reader(self):
        """Internal object for optimised reading of data"""
        if '_fast_reader' in self._cache_props:
            return self._cache_props['_fast_reader']

        rdr = _selector.Reader(self.id)

        # If the file is read-only, cache the reader to speed up future uses.
        # This cache is invalidated by .refresh() when using SWMR.
        if self._readonly:
            self._cache_props['_fast_reader'] = rdr
        return rdr
    @property
    @with_phil
    def dtype(self):
        """Numpy dtype representing the datatype"""
        return self.id.dtype

    @property
    @with_phil
    def chunks(self):
        """Dataset chunks (or None)"""
        dcpl = self._dcpl
        if dcpl.get_layout() == h5d.CHUNKED:
            return dcpl.get_chunk()
        return None

    @property
    @with_phil
    def compression(self):
        """Compression strategy (or None)"""
        for x in ('gzip', 'lzf', 'szip'):
            if x in self._filters:
                return x
        return None

    @property
    @with_phil
    def compression_opts(self):
        """ Compression setting. Int(0-9) for gzip, 2-tuple for szip. """
        return self._filters.get(self.compression, None)

    @property
    @with_phil
    def shuffle(self):
        """Shuffle filter present (T/F)"""
        return 'shuffle' in self._filters

    @property
    @with_phil
    def fletcher32(self):
        """Fletcher32 filter is present (T/F)"""
        return 'fletcher32' in self._filters

    @property
    @with_phil
    def scaleoffset(self):
        """Scale/offset filter settings. For integer data types, this is
        the number of bits stored, or 0 for auto-detected. For floating
        point data types, this is the number of decimal places retained.
        If the scale/offset filter is not in use, this is None."""
        try:
            return self._filters['scaleoffset'][1]
        except KeyError:
            return None

    @property
    @with_phil
    def external(self):
        """External file settings. Returns a list of tuples of
        (name, offset, size) for each external file entry, or returns None
        if no external files are used."""
        count = self._dcpl.get_external_count()
        if count <= 0:
            return None
        ext_list = list()
        for x in range(count):
            (name, offset, size) = self._dcpl.get_external(x)
            ext_list.append((filename_decode(name), offset, size))
        return ext_list

    @property
    @with_phil
    def maxshape(self):
        """Shape up to which this dataset can be resized. Axes with value
        None have no resize limit. """
        space = self.id.get_space()
        dims = space.get_simple_extent_dims(True)
        if dims is None:
            return None

        return tuple(x if x != h5s.UNLIMITED else None for x in dims)

    @property
    @with_phil
    def fillvalue(self):
        """Fill value for this dataset (0 by default)"""
        arr = numpy.zeros((1,), dtype=self.dtype)
        self._dcpl.get_fill_value(arr)
        return arr[0]
    @cached_property
    @with_phil
    def _extent_type(self):
        """Get extent type for this dataset - SIMPLE, SCALAR or NULL"""
        return self.id.get_space().get_simple_extent_type()

    @cached_property
    def _is_empty(self):
        """Check if extent type is empty"""
        return self._extent_type == h5s.NULL

    @with_phil
    def __init__(self, bind, *, readonly=False):
        """ Create a new Dataset object by binding to a low-level DatasetID.
        """
        if not isinstance(bind, h5d.DatasetID):
            raise ValueError("%s is not a DatasetID" % bind)
        super().__init__(bind)

        self._dcpl = self.id.get_create_plist()
        self._dxpl = h5p.create(h5p.DATASET_XFER)
        self._filters = filters.get_filters(self._dcpl)
        self._readonly = readonly
        self._cache_props = {}
    def resize(self, size, axis=None):
        """ Resize the dataset, or the specified axis.

        The dataset must be stored in chunked format; it can be resized up to
        the "maximum shape" (keyword maxshape) specified at creation time.
        The rank of the dataset cannot be changed.

        "Size" should be a shape tuple, or if an axis is specified, an integer.

        BEWARE: This functions differently from the NumPy resize() method!
        The data is not "reshuffled" to fit in the new shape; each axis is
        grown or shrunk independently. The coordinates of existing data are
        fixed.
        """
        with phil:
            if self.chunks is None:
                raise TypeError("Only chunked datasets can be resized")

            if axis is not None:
                if not (axis >= 0 and axis < self.id.rank):
                    raise ValueError("Invalid axis (0 to %s allowed)" % (self.id.rank - 1))
                try:
                    newlen = int(size)
                except TypeError:
                    raise TypeError("Argument must be a single int if axis is specified")
                size = list(self.shape)
                size[axis] = newlen

            size = tuple(size)
            self.id.set_extent(size)
            #h5f.flush(self.id)  # THG recommends
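    # A usage sketch for resize() (file and dataset names are illustrative):
    # append rows to a chunked dataset created with maxshape=(None,), growing
    # the first axis before each write.
    #
    #   >>> dset = f.create_dataset("log", shape=(0,), maxshape=(None,),
    #   ...                         chunks=(1024,), dtype='f8')
    #   >>> new = numpy.arange(100, dtype='f8')
    #   >>> dset.resize(dset.shape[0] + len(new), axis=0)
    #   >>> dset[-len(new):] = new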
    @with_phil
    def __len__(self):
        """ The size of the first axis. TypeError if scalar.

        Limited to 2**32 on 32-bit systems; Dataset.len() is preferred.
        """
        size = self.len()
        if size > sys.maxsize:
            raise OverflowError("Value too big for Python's __len__; use Dataset.len() instead.")
        return size

    def len(self):
        """ The size of the first axis. TypeError if scalar.

        Use of this method is preferred to len(dset), as Python's built-in
        len() cannot handle values greater than 2**32 on 32-bit systems.
        """
        with phil:
            shape = self.shape
            if len(shape) == 0:
                raise TypeError("Attempt to take len() of scalar dataset")
            return shape[0]

    @with_phil
    def __iter__(self):
        """ Iterate over the first axis. TypeError if scalar.

        BEWARE: Modifications to the yielded data are *NOT* written to file.
        """
        shape = self.shape
        if len(shape) == 0:
            raise TypeError("Can't iterate over a scalar dataset")
        for i in range(shape[0]):
            yield self[i]

    @with_phil
    def iter_chunks(self, sel=None):
        """ Return chunk iterator. If set, the sel argument is a slice or
        tuple of slices that defines the region to be used. If not set, the
        entire dataspace will be used for the iterator.

        For each chunk within the given region, the iterator yields a tuple of
        slices that gives the intersection of the given chunk with the
        selection area.

        A TypeError will be raised if the dataset is not chunked.

        A ValueError will be raised if the selection region is invalid.

        """
        return ChunkIterator(self, sel)
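    # A usage sketch for iter_chunks() (dataset name illustrative): process a
    # large chunked dataset one chunk at a time, so only a single chunk's worth
    # of data is resident in memory per iteration.
    #
    #   >>> for chunk_slices in dset.iter_chunks():
    #   ...     block = dset[chunk_slices]        # tuple of slices -> one chunk
    #   ...     dset[chunk_slices] = block * 2.0  # write the processed block back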
    @cached_property
    def _fast_read_ok(self):
        """Is this dataset suitable for simple reading"""
        return (
            self._extent_type == h5s.SIMPLE
            and isinstance(self.id.get_type(), (h5t.TypeIntegerID, h5t.TypeFloatID))
        )

    @with_phil
    def __getitem__(self, args, new_dtype=None):
        """ Read a slice from the HDF5 dataset.

        Takes slices and recarray-style field names (more than one is
        allowed!) in any order. Obeys basic NumPy rules, including
        broadcasting.

        Also supports:

        * Boolean "mask" array indexing
        """
        args = args if isinstance(args, tuple) else (args,)

        if self._fast_read_ok and (new_dtype is None):
            try:
                return self._fast_reader.read(args)
            except TypeError:
                pass  # Fall back to Python read pathway below

        if self._is_empty:
            # Check 'is Ellipsis' to avoid equality comparison with an array:
            # array equality returns an array, not a boolean.
            if args == () or (len(args) == 1 and args[0] is Ellipsis):
                return Empty(self.dtype)
            raise ValueError("Empty datasets cannot be sliced")

        # Sort field names from the rest of the args.
        names = tuple(x for x in args if isinstance(x, str))

        if names:
            # Read a subset of the fields in this structured dtype
            if len(names) == 1:
                names = names[0]  # Read with simpler dtype of this field
            args = tuple(x for x in args if not isinstance(x, str))
            return self.fields(names, _prior_dtype=new_dtype)[args]

        if new_dtype is None:
            new_dtype = self.dtype
        mtype = h5t.py_create(new_dtype)

        # === Special-case region references ====

        if len(args) == 1 and isinstance(args[0], h5r.RegionReference):

            obj = h5r.dereference(args[0], self.id)
            if obj != self.id:
                raise ValueError("Region reference must point to this dataset")

            sid = h5r.get_region(args[0], self.id)
            mshape = sel.guess_shape(sid)
            if mshape is None:
                # 0D with no data (NULL or deselected SCALAR)
                return Empty(new_dtype)
            out = numpy.zeros(mshape, dtype=new_dtype)
            if out.size == 0:
                return out

            sid_out = h5s.create_simple(mshape)
            sid_out.select_all()
            self.id.read(sid_out, sid, out, mtype)
            return out

        # === Check for zero-sized datasets =====

        if self.size == 0:
            # Check 'is Ellipsis' to avoid equality comparison with an array:
            # array equality returns an array, not a boolean.
            if args == () or (len(args) == 1 and args[0] is Ellipsis):
                return numpy.zeros(self.shape, dtype=new_dtype)

        # === Scalar dataspaces =================

        if self.shape == ():
            fspace = self.id.get_space()
            selection = sel2.select_read(fspace, args)
            if selection.mshape is None:
                arr = numpy.zeros((), dtype=new_dtype)
            else:
                arr = numpy.zeros(selection.mshape, dtype=new_dtype)
            for mspace, fspace in selection:
                self.id.read(mspace, fspace, arr, mtype)
            if selection.mshape is None:
                return arr[()]
            return arr

        # === Everything else ===================

        # Perform the dataspace selection.
        selection = sel.select(self.shape, args, dataset=self)

        if selection.nselect == 0:
            return numpy.zeros(selection.array_shape, dtype=new_dtype)

        arr = numpy.zeros(selection.array_shape, new_dtype, order='C')

        # Perform the actual read
        mspace = h5s.create_simple(selection.mshape)
        fspace = selection.id
        self.id.read(mspace, fspace, arr, mtype, dxpl=self._dxpl)

        # Patch up the output for NumPy
        if arr.shape == ():
            return arr[()]  # 0 dim array -> numpy scalar
        return arr
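    # A reading sketch for __getitem__ (names illustrative), covering cases
    # handled above: plain slicing, a boolean "mask" selection, and reading a
    # single field of a compound dtype by passing its name with the slice.
    #
    #   >>> block = dset[10:20, :]      # simple slice
    #   >>> mask = dset[:] > 0.5        # boolean array with the dataset's shape
    #   >>> positive = dset[mask]       # 1-D array of the selected elements
    #   >>> xs = dset[:100, 'x']        # field name mixed with slices (compound dtype)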
    @with_phil
    def __setitem__(self, args, val):
        """ Write to the HDF5 dataset from a Numpy array.

        NumPy's broadcasting rules are honored, for "simple" indexing
        (slices and integers). For advanced indexing, the shapes must
        match.
        """
        args = args if isinstance(args, tuple) else (args,)

        # Sort field indices from the slicing
        names = tuple(x for x in args if isinstance(x, str))
        args = tuple(x for x in args if not isinstance(x, str))

        # Generally we try to avoid converting the arrays on the Python
        # side. However, for compound literals this is unavoidable.
        vlen = h5t.check_vlen_dtype(self.dtype)
        if vlen is not None and vlen not in (bytes, str):
            try:
                val = numpy.asarray(val, dtype=vlen)
            except ValueError:
                try:
                    val = numpy.array([numpy.array(x, dtype=vlen)
                                       for x in val], dtype=self.dtype)
                except ValueError:
                    pass
            if vlen == val.dtype:
                if val.ndim > 1:
                    tmp = numpy.empty(shape=val.shape[:-1], dtype=object)
                    tmp.ravel()[:] = [i for i in val.reshape(
                        (product(val.shape[:-1]), val.shape[-1])
                    )]
                else:
                    tmp = numpy.array([None], dtype=object)
                    tmp[0] = val
                val = tmp
        elif self.dtype.kind == "O" or \
          (self.dtype.kind == 'V' and \
          (not isinstance(val, numpy.ndarray) or val.dtype.kind != 'V') and \
          (self.dtype.subdtype is None)):
            if len(names) == 1 and self.dtype.fields is not None:
                # Single field selected for write, from a non-array source
                if not names[0] in self.dtype.fields:
                    raise ValueError("No such field for indexing: %s" % names[0])
                dtype = self.dtype.fields[names[0]][0]
                cast_compound = True
            else:
                dtype = self.dtype
                cast_compound = False

            val = numpy.asarray(val, dtype=dtype.base, order='C')
            if cast_compound:
                val = val.view(numpy.dtype([(names[0], dtype)]))
                val = val.reshape(val.shape[:len(val.shape) - len(dtype.shape)])
        elif (self.dtype.kind == 'S'
              and (h5t.check_string_dtype(self.dtype).encoding == 'utf-8')
              and (find_item_type(val) is str)
              ):
            # Writing str objects to a fixed-length UTF-8 string dataset.
            # Numpy's normal conversion only handles ASCII characters, but
            # when the destination is UTF-8, we want to allow any unicode.
            # This *doesn't* handle numpy fixed-length unicode data ('U' dtype),
            # as HDF5 has no equivalent, and converting fixed length UTF-32
            # to variable length UTF-8 would obscure what's going on.
            str_array = numpy.asarray(val, order='C', dtype=object)
            val = numpy.array([
                s.encode('utf-8') for s in str_array.flat
            ], dtype=self.dtype).reshape(str_array.shape)
        else:
            # If the input data is already an array, let HDF5 do the conversion.
            # If it's a list or similar, don't make numpy guess a dtype for it.
            dt = None if isinstance(val, numpy.ndarray) else self.dtype.base
            val = numpy.asarray(val, order='C', dtype=dt)

        # Check for array dtype compatibility and convert
        if self.dtype.subdtype is not None:
            shp = self.dtype.subdtype[1]
            valshp = val.shape[-len(shp):]
            if valshp != shp:  # Last dimension has to match
                raise TypeError("When writing to array types, last N dimensions have to match (got %s, but should be %s)" % (valshp, shp,))
            mtype = h5t.py_create(numpy.dtype((val.dtype, shp)))
            mshape = val.shape[0:len(val.shape)-len(shp)]

        # Make a compound memory type if field-name slicing is required
        elif len(names) != 0:

            mshape = val.shape

            # Catch common errors
            if self.dtype.fields is None:
                raise TypeError("Illegal slicing argument (not a compound dataset)")
            mismatch = [x for x in names if x not in self.dtype.fields]
            if len(mismatch) != 0:
                mismatch = ", ".join('"%s"' % x for x in mismatch)
                raise ValueError("Illegal slicing argument (fields %s not in dataset type)" % mismatch)

            # Write non-compound source into a single dataset field
            if len(names) == 1 and val.dtype.fields is None:
                subtype = h5t.py_create(val.dtype)
                mtype = h5t.create(h5t.COMPOUND, subtype.get_size())
                mtype.insert(self._e(names[0]), 0, subtype)

            # Make a new source type keeping only the requested fields
            else:
                fieldnames = [x for x in val.dtype.names if x in names]  # Keep source order
                mtype = h5t.create(h5t.COMPOUND, val.dtype.itemsize)
                for fieldname in fieldnames:
                    subtype = h5t.py_create(val.dtype.fields[fieldname][0])
                    offset = val.dtype.fields[fieldname][1]
                    mtype.insert(self._e(fieldname), offset, subtype)

        # Use mtype derived from array (let DatasetID.write figure it out)
        else:
            mshape = val.shape
            mtype = None

        # Perform the dataspace selection
        selection = sel.select(self.shape, args, dataset=self)

        if selection.nselect == 0:
            return

        # Broadcast scalars if necessary.
        # To avoid slow broadcasting filling the destination with the scalar
        # value, we create an intermediate array of the same size as the
        # destination buffer, provided that size is reasonable. We treat a
        # size smaller than or equal to the dataset chunk size (if any) as
        # reasonable.
        # When the destination dataset is not chunked, or the selection is
        # larger than the dataset chunk size, we fall back to an intermediate
        # array of size equal to the last dimension of the destination buffer.
        # The reasoning is that the creator of the dataset presumably chose a
        # chunk size appropriate for the available memory. In any case, if we
        # cannot afford an intermediate array of the same size as the dataset
        # chunk size, the user program has little hope of getting much further.
        # Solves h5py issue #1067
        if mshape == () and selection.array_shape != ():
            if self.dtype.subdtype is not None:
                raise TypeError("Scalar broadcasting is not supported for array dtypes")
            if self.chunks and (product(self.chunks) >= product(selection.array_shape)):
                val2 = numpy.empty(selection.array_shape, dtype=val.dtype)
            else:
                val2 = numpy.empty(selection.array_shape[-1], dtype=val.dtype)
            val2[...] = val
            val = val2
            mshape = val.shape

        # Perform the write, with broadcasting
        mspace = h5s.create_simple(selection.expand_shape(mshape))
        for fspace in selection.broadcast(mshape):
            self.id.write(mspace, fspace, val, mtype, dxpl=self._dxpl)
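    # A writing sketch for __setitem__ (names illustrative), covering the main
    # paths above: broadcasting a scalar over a selection, writing an array to
    # a simple slice, and writing one field of a compound dtype by naming it.
    #
    #   >>> dset[0:100] = 42.0                         # scalar broadcast over the selection
    #   >>> dset[0:10] = numpy.arange(10, dtype='f8')  # array written to a simple slice
    #   >>> compound_dset[:50, 'x'] = numpy.zeros(50)  # write a single field of a compound dtype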
    def read_direct(self, dest, source_sel=None, dest_sel=None):
        """ Read data directly from HDF5 into an existing NumPy array.

        The destination array must be C-contiguous and writable.
        Selections must be the output of numpy.s_[<args>].

        Broadcasting is supported for simple indexing.
        """
        with phil:
            if self._is_empty:
                raise TypeError("Empty datasets have no numpy representation")
            if source_sel is None:
                source_sel = sel.SimpleSelection(self.shape)
            else:
                source_sel = sel.select(self.shape, source_sel, self)  # for numpy.s_
            fspace = source_sel.id

            if dest_sel is None:
                dest_sel = sel.SimpleSelection(dest.shape)
            else:
                dest_sel = sel.select(dest.shape, dest_sel)

            for mspace in dest_sel.broadcast(source_sel.array_shape):
                self.id.read(mspace, fspace, dest, dxpl=self._dxpl)

    def write_direct(self, source, source_sel=None, dest_sel=None):
        """ Write data directly to HDF5 from a NumPy array.

        The source array must be C-contiguous. Selections must be
        the output of numpy.s_[<args>].

        Broadcasting is supported for simple indexing.
        """
        with phil:
            if self._is_empty:
                raise TypeError("Empty datasets cannot be written to")
            if source_sel is None:
                source_sel = sel.SimpleSelection(source.shape)
            else:
                source_sel = sel.select(source.shape, source_sel)  # for numpy.s_
            mspace = source_sel.id

            if dest_sel is None:
                dest_sel = sel.SimpleSelection(self.shape)
            else:
                dest_sel = sel.select(self.shape, dest_sel, self)

            for fspace in dest_sel.broadcast(source_sel.array_shape):
                self.id.write(mspace, fspace, source, dxpl=self._dxpl)
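    # A sketch of read_direct()/write_direct() (array and dataset names are
    # illustrative): data is moved between an existing C-contiguous NumPy
    # buffer and a region of the dataset, with selections built via numpy.s_.
    #
    #   >>> buf = numpy.empty((100, 100), dtype=dset.dtype)
    #   >>> dset.read_direct(buf, source_sel=numpy.s_[0:100, 0:100])
    #   >>> dset.write_direct(buf, dest_sel=numpy.s_[100:200, 0:100])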
    @with_phil
    def __array__(self, dtype=None):
        """ Create a Numpy array containing the whole dataset. DON'T THINK
        THIS MEANS DATASETS ARE INTERCHANGEABLE WITH ARRAYS. For one thing,
        you have to read the whole dataset every time this method is called.
        """
        arr = numpy.zeros(self.shape, dtype=self.dtype if dtype is None else dtype)

        # Special case for (0,)*-shape datasets
        if self.size == 0:
            return arr

        self.read_direct(arr)
        return arr

    @with_phil
    def __repr__(self):
        if not self:
            r = '<Closed HDF5 dataset>'
        else:
            if self.name is None:
                namestr = '("anonymous")'
            else:
                name = pp.basename(pp.normpath(self.name))
                namestr = '"%s"' % (name if name != '' else '/')
            r = '<HDF5 dataset %s: shape %s, type "%s">' % (
                namestr, self.shape, self.dtype.str
            )
        return r
    if hasattr(h5d.DatasetID, "refresh"):
        @with_phil
        def refresh(self):
            """ Refresh the dataset metadata by reloading from the file.

            This is part of the SWMR features and only exists when the HDF5
            library version >= 1.9.178
            """
            self._id.refresh()
            self._cache_props.clear()

    if hasattr(h5d.DatasetID, "flush"):
        @with_phil
        def flush(self):
            """ Flush the dataset data and metadata to the file.
            If the dataset is chunked, raw data chunks are written to the file.

            This is part of the SWMR features and only exists when the HDF5
            library version >= 1.9.178
            """
            self._id.flush()
    if vds_support:
        @property
        @with_phil
        def is_virtual(self):
            """Check if this is a virtual dataset"""
            return self._dcpl.get_layout() == h5d.VIRTUAL

        @with_phil
        def virtual_sources(self):
            """Get a list of the data mappings for a virtual dataset"""
            if not self.is_virtual:
                raise RuntimeError("Not a virtual dataset")
            dcpl = self._dcpl
            return [
                VDSmap(dcpl.get_virtual_vspace(j),
                       dcpl.get_virtual_filename(j),
                       dcpl.get_virtual_dsetname(j),
                       dcpl.get_virtual_srcspace(j))
                for j in range(dcpl.get_virtual_count())]

    @with_phil
    def make_scale(self, name=''):
        """Make this dataset an HDF5 dimension scale.

        You can then attach it to dimensions of other datasets like this::

            other_ds.dims[0].attach_scale(ds)

        You can optionally pass a name to associate with this scale.
        """
        h5ds.set_scale(self._id, self._e(name))
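    # A sketch of dimension scales (dataset names illustrative): one dataset is
    # turned into a scale with make_scale() and attached to an axis of another
    # dataset through the .dims accessor defined above.
    #
    #   >>> xcoords = f.create_dataset("x", data=numpy.linspace(0., 1., 100))
    #   >>> xcoords.make_scale("x coordinate")
    #   >>> data = f.create_dataset("data", shape=(100, 3), dtype='f4')
    #   >>> data.dims[0].attach_scale(xcoords)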
    @property
    @with_phil
    def is_scale(self):
        """Return ``True`` if this dataset is also a dimension scale.

        Return ``False`` otherwise.
        """
        return h5ds.is_scale(self._id)