# This file is part of h5py, a Python interface to the HDF5 library.
#
# http://www.h5py.org
#
# Copyright 2008-2020 Andrew Collette and contributors
#
# License: Standard 3-clause BSD; see "license.txt" for full license terms
#          and contributor agreement.

"""
    Implements support for high-level dataset access.
"""

import posixpath as pp
import sys

import numpy

from .. import h5, h5s, h5t, h5r, h5d, h5p, h5fd, h5ds, _selector
from .base import (
    array_for_new_object, cached_property, Empty, find_item_type, HLObject,
    phil, product, with_phil,
)
from . import filters
from . import selections as sel
from . import selections2 as sel2
from .datatype import Datatype
from .compat import filename_decode
from .vds import VDSmap, vds_support

_LEGACY_GZIP_COMPRESSION_VALS = frozenset(range(10))
MPI = h5.get_config().mpi


def make_new_dset(parent, shape=None, dtype=None, data=None, name=None,
                  chunks=None, compression=None, shuffle=None,
                  fletcher32=None, maxshape=None, compression_opts=None,
                  fillvalue=None, scaleoffset=None, track_times=False,
                  external=None, track_order=None, dcpl=None, dapl=None,
                  efile_prefix=None, virtual_prefix=None, allow_unknown_filter=False,
                  rdcc_nslots=None, rdcc_nbytes=None, rdcc_w0=None):
    """ Return a new low-level dataset identifier """

    # Convert data to a C-contiguous ndarray
    if data is not None and not isinstance(data, Empty):
        data = array_for_new_object(data, specified_dtype=dtype)

    # Validate shape
    if shape is None:
        if data is None:
            if dtype is None:
                raise TypeError("One of data, shape or dtype must be specified")
            data = Empty(dtype)
        shape = data.shape
    else:
        shape = (shape,) if isinstance(shape, int) else tuple(shape)
        if data is not None and (product(shape) != product(data.shape)):
            raise ValueError("Shape tuple is incompatible with data")

    if isinstance(maxshape, int):
        maxshape = (maxshape,)
    tmp_shape = maxshape if maxshape is not None else shape

    # Validate chunk shape
    if isinstance(chunks, int) and not isinstance(chunks, bool):
        chunks = (chunks,)
    if isinstance(chunks, tuple) and any(
        chunk > dim for dim, chunk in zip(tmp_shape, chunks) if dim is not None
    ):
        errmsg = "Chunk shape must not be greater than data shape in any dimension. "\
                 "{} is not compatible with {}".format(chunks, shape)
        raise ValueError(errmsg)

    if isinstance(dtype, Datatype):
        # Named types are used as-is
        tid = dtype.id
        dtype = tid.dtype  # Following code needs this
    else:
        # Validate dtype
        if dtype is None and data is None:
            dtype = numpy.dtype("=f4")
        elif dtype is None and data is not None:
            dtype = data.dtype
        else:
            dtype = numpy.dtype(dtype)
        tid = h5t.py_create(dtype, logical=1)

    # Legacy
    if any((compression, shuffle, fletcher32, maxshape, scaleoffset)) and chunks is False:
        raise ValueError("Chunked format required for given storage options")

    # Legacy
    if compression is True:
        if compression_opts is None:
            compression_opts = 4
        compression = 'gzip'

    # Legacy
    if compression in _LEGACY_GZIP_COMPRESSION_VALS:
        if compression_opts is not None:
            raise TypeError("Conflict in compression options")
        compression_opts = compression
        compression = 'gzip'
    dcpl = filters.fill_dcpl(
        dcpl or h5p.create(h5p.DATASET_CREATE), shape, dtype,
        chunks, compression, compression_opts, shuffle, fletcher32,
        maxshape, scaleoffset, external, allow_unknown_filter)

    if fillvalue is not None:
        # prepare string-type dtypes for fillvalue
        string_info = h5t.check_string_dtype(dtype)
        if string_info is not None:
            # fake vlen dtype for fixed len string fillvalue
            # to not trigger unwanted encoding
            dtype = h5t.string_dtype(string_info.encoding)
            fillvalue = numpy.array(fillvalue, dtype=dtype)
        else:
            fillvalue = numpy.array(fillvalue)
        dcpl.set_fill_value(fillvalue)

    if track_times is None:
        # In case someone explicitly passes None for the default
        track_times = False
    if track_times in (True, False):
        dcpl.set_obj_track_times(track_times)
    else:
        raise TypeError("track_times must be either True or False")
    if track_order is True:
        dcpl.set_attr_creation_order(
            h5p.CRT_ORDER_TRACKED | h5p.CRT_ORDER_INDEXED)
    elif track_order is False:
        dcpl.set_attr_creation_order(0)
    elif track_order is not None:
        raise TypeError("track_order must be either True or False")

    if maxshape is not None:
        maxshape = tuple(m if m is not None else h5s.UNLIMITED for m in maxshape)

    if any([efile_prefix, virtual_prefix, rdcc_nbytes, rdcc_nslots, rdcc_w0]):
        dapl = dapl or h5p.create(h5p.DATASET_ACCESS)

    if efile_prefix is not None:
        dapl.set_efile_prefix(efile_prefix)

    if virtual_prefix is not None:
        dapl.set_virtual_prefix(virtual_prefix)

    if rdcc_nbytes or rdcc_nslots or rdcc_w0:
        cache_settings = list(dapl.get_chunk_cache())
        if rdcc_nslots is not None:
            cache_settings[0] = rdcc_nslots
        if rdcc_nbytes is not None:
            cache_settings[1] = rdcc_nbytes
        if rdcc_w0 is not None:
            cache_settings[2] = rdcc_w0
        dapl.set_chunk_cache(*cache_settings)

    if isinstance(data, Empty):
        sid = h5s.create(h5s.NULL)
    else:
        sid = h5s.create_simple(shape, maxshape)

    dset_id = h5d.create(parent.id, name, tid, sid, dcpl=dcpl, dapl=dapl)

    if (data is not None) and (not isinstance(data, Empty)):
        dset_id.write(h5s.ALL, h5s.ALL, data)

    return dset_id
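
# Illustrative sketch (not part of the library): the public entry point for
# this function is Group.create_dataset(), which forwards its keyword
# arguments here.  The file and dataset names below are hypothetical.
#
#     >>> import h5py, numpy
#     >>> with h5py.File('example.h5', 'w') as f:
#     ...     dset = f.create_dataset('data', shape=(1000,), dtype='f4',
#     ...                             chunks=(100,), compression='gzip',
#     ...                             compression_opts=4)
#     ...     dset[:100] = numpy.arange(100, dtype='f4')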


def open_dset(parent, name, dapl=None, efile_prefix=None, virtual_prefix=None,
              rdcc_nslots=None, rdcc_nbytes=None, rdcc_w0=None, **kwds):
    """ Return an existing low-level dataset identifier """

    if any([efile_prefix, virtual_prefix, rdcc_nbytes, rdcc_nslots, rdcc_w0]):
        dapl = dapl or h5p.create(h5p.DATASET_ACCESS)

    if efile_prefix is not None:
        dapl.set_efile_prefix(efile_prefix)

    if virtual_prefix is not None:
        dapl.set_virtual_prefix(virtual_prefix)

    if rdcc_nbytes or rdcc_nslots or rdcc_w0:
        cache_settings = list(dapl.get_chunk_cache())
        if rdcc_nslots is not None:
            cache_settings[0] = rdcc_nslots
        if rdcc_nbytes is not None:
            cache_settings[1] = rdcc_nbytes
        if rdcc_w0 is not None:
            cache_settings[2] = rdcc_w0
        dapl.set_chunk_cache(*cache_settings)

    dset_id = h5d.open(parent.id, name, dapl=dapl)

    return dset_id
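
# Illustrative sketch (assumptions noted inline): the chunk-cache triple set
# above follows the (nslots, nbytes, w0) ordering returned by
# dapl.get_chunk_cache().  Opening an existing dataset with a 64 MiB chunk
# cache might look like this; the file and dataset names are hypothetical,
# and h5d.open() expects the name as bytes.
#
#     >>> import h5py
#     >>> f = h5py.File('example.h5', 'r')
#     >>> dsid = open_dset(f['/'], b'data', rdcc_nbytes=64 * 1024 ** 2,
#     ...                  rdcc_nslots=12421, rdcc_w0=0.75)
#     >>> dset = Dataset(dsid, readonly=True)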


class AstypeWrapper:
    """Wrapper to convert data on reading from a dataset.
    """
    def __init__(self, dset, dtype):
        self._dset = dset
        self._dtype = numpy.dtype(dtype)

    def __getitem__(self, args):
        return self._dset.__getitem__(args, new_dtype=self._dtype)

    def __len__(self):
        """ Get the length of the underlying dataset

        >>> length = len(dataset.astype('f8'))
        """
        return len(self._dset)

    def __array__(self, dtype=None, copy=True):
        if copy is False:
            raise ValueError(
                f"AstypeWrapper.__array__ received {copy=} "
                f"but memory allocation cannot be avoided on read"
            )

        data = self[:]
        if dtype is not None:
            return data.astype(dtype, copy=False)
        return data


class AsStrWrapper:
    """Wrapper to decode strings on reading the dataset"""
    def __init__(self, dset, encoding, errors='strict'):
        self._dset = dset
        if encoding is None:
            encoding = h5t.check_string_dtype(dset.dtype).encoding
        self.encoding = encoding
        self.errors = errors

    def __getitem__(self, args):
        bytes_arr = self._dset[args]
        # numpy.char.decode() seems like the obvious thing to use. But it only
        # accepts numpy string arrays, not object arrays of bytes (which we
        # return from HDF5 variable-length strings). And the numpy
        # implementation is not faster than doing it with a loop; in fact, by
        # not converting the result to a numpy unicode array, the
        # naive way can be faster! (Comparing with numpy 1.18.4, June 2020)
        if numpy.isscalar(bytes_arr):
            return bytes_arr.decode(self.encoding, self.errors)

        return numpy.array([
            b.decode(self.encoding, self.errors) for b in bytes_arr.flat
        ], dtype=object).reshape(bytes_arr.shape)

    def __len__(self):
        """ Get the length of the underlying dataset

        >>> length = len(dataset.asstr())
        """
        return len(self._dset)

    def __array__(self, dtype=None, copy=True):
        if dtype not in (None, object):
            raise TypeError(
                "AsStrWrapper.__array__ doesn't support the dtype argument"
            )
        if copy is False:
            raise ValueError(
                f"AsStrWrapper.__array__ received {copy=} "
                f"but memory allocation cannot be avoided on read"
            )
        return numpy.array([
            b.decode(self.encoding, self.errors) for b in self._dset
        ], dtype=object).reshape(self._dset.shape)
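
# Illustrative sketch: AsStrWrapper is normally obtained via Dataset.asstr()
# (defined further down) rather than constructed directly.  Reading an HDF5
# string dataset as Python str objects might look like this; the dataset name
# is hypothetical.
#
#     >>> names = f['names'].asstr()[:]                    # use the stored encoding
#     >>> names = f['names'].asstr('utf-8', 'replace')[:]  # explicit decode arguments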


class FieldsWrapper:
    """Wrapper to extract named fields from a dataset with a struct dtype"""
    extract_field = None

    def __init__(self, dset, prior_dtype, names):
        self._dset = dset
        if isinstance(names, str):
            self.extract_field = names
            names = [names]
        self.read_dtype = readtime_dtype(prior_dtype, names)

    def __array__(self, dtype=None, copy=True):
        if copy is False:
            raise ValueError(
                f"FieldsWrapper.__array__ received {copy=} "
                f"but memory allocation cannot be avoided on read"
            )
        data = self[:]
        if dtype is not None:
            return data.astype(dtype, copy=False)
        else:
            return data

    def __getitem__(self, args):
        data = self._dset.__getitem__(args, new_dtype=self.read_dtype)
        if self.extract_field is not None:
            data = data[self.extract_field]
        return data

    def __len__(self):
        """ Get the length of the underlying dataset

        >>> length = len(dataset.fields(['x', 'y']))
        """
        return len(self._dset)


def readtime_dtype(basetype, names):
    """Make a NumPy compound dtype with a subset of available fields"""
    if basetype.names is None:  # Names provided, but not compound
        raise ValueError("Field names only allowed for compound types")

    for name in names:  # Check all names are legal
        if name not in basetype.names:
            raise ValueError("Field %s does not appear in this type." % name)

    return numpy.dtype([(name, basetype.fields[name][0]) for name in names])
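
# Illustrative sketch: FieldsWrapper and readtime_dtype() back the
# Dataset.fields() method defined further down.  For a compound dataset with
# hypothetical fields 'x', 'y' and 'z', reading a subset of fields avoids
# transferring the unwanted columns:
#
#     >>> coords = dset.fields(['x', 'y'])[:]   # compound dtype with two fields
#     >>> xs = dset.fields('x')[:]              # plain array with the field's dtype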


if MPI:
    class CollectiveContext:

        """ Manages collective I/O in MPI mode """

        # We don't bother with _local as threads are forbidden in MPI mode

        def __init__(self, dset):
            self._dset = dset

        def __enter__(self):
            # pylint: disable=protected-access
            self._dset._dxpl.set_dxpl_mpio(h5fd.MPIO_COLLECTIVE)

        def __exit__(self, *args):
            # pylint: disable=protected-access
            self._dset._dxpl.set_dxpl_mpio(h5fd.MPIO_INDEPENDENT)


class ChunkIterator:
    """
    Class to iterate through list of chunks of a given dataset
    """
    def __init__(self, dset, source_sel=None):
        self._shape = dset.shape
        rank = len(dset.shape)

        if not dset.chunks:
            # can only use with chunked datasets
            raise TypeError("Chunked dataset required")

        self._layout = dset.chunks
        if source_sel is None:
            # select over entire dataset
            self._sel = tuple(
                slice(0, self._shape[dim])
                for dim in range(rank)
            )
        else:
            if isinstance(source_sel, slice):
                self._sel = (source_sel,)
            else:
                self._sel = source_sel
        if len(self._sel) != rank:
            raise ValueError("Invalid selection - selection region must have same rank as dataset")
        self._chunk_index = []
        for dim in range(rank):
            s = self._sel[dim]
            if s.start < 0 or s.stop > self._shape[dim] or s.stop <= s.start:
                raise ValueError("Invalid selection - selection region must be within dataset space")
            index = s.start // self._layout[dim]
            self._chunk_index.append(index)

    def __iter__(self):
        return self

    def __next__(self):
        rank = len(self._shape)
        slices = []
        if rank == 0 or self._chunk_index[0] * self._layout[0] >= self._sel[0].stop:
            # ran past the last chunk, end iteration
            raise StopIteration()

        for dim in range(rank):
            s = self._sel[dim]
            start = self._chunk_index[dim] * self._layout[dim]
            stop = (self._chunk_index[dim] + 1) * self._layout[dim]
            # adjust the start if this is an edge chunk
            if start < s.start:
                start = s.start
            if stop > s.stop:
                stop = s.stop  # trim to end of the selection
            s = slice(start, stop, 1)
            slices.append(s)

        # bump up the last index and carry forward if we run outside the selection
        dim = rank - 1
        while dim >= 0:
            s = self._sel[dim]
            self._chunk_index[dim] += 1

            chunk_end = self._chunk_index[dim] * self._layout[dim]
            if chunk_end < s.stop:
                # we still have room to extend along this dimension
                return tuple(slices)

            if dim > 0:
                # reset to the start and continue iterating with higher dimension
                self._chunk_index[dim] = s.start // self._layout[dim]
            dim -= 1
        return tuple(slices)


class Dataset(HLObject):

    """
        Represents an HDF5 dataset
    """

    def astype(self, dtype):
        """ Get a wrapper allowing you to perform reads to a
        different destination type, e.g.:

        >>> double_precision = dataset.astype('f8')[0:100:2]
        """
        return AstypeWrapper(self, dtype)

    def asstr(self, encoding=None, errors='strict'):
        """Get a wrapper to read string data as Python strings:

        >>> str_array = dataset.asstr()[:]

        The parameters have the same meaning as in ``bytes.decode()``.
        If ``encoding`` is unspecified, it will use the encoding in the HDF5
        datatype (either ascii or utf-8).
        """
        string_info = h5t.check_string_dtype(self.dtype)
        if string_info is None:
            raise TypeError(
                "dset.asstr() can only be used on datasets with "
                "an HDF5 string datatype"
            )
        if encoding is None:
            encoding = string_info.encoding
        return AsStrWrapper(self, encoding, errors=errors)

    def fields(self, names, *, _prior_dtype=None):
        """Get a wrapper to read a subset of fields from a compound data type:

        >>> coords_2d = dataset.fields(['x', 'y'])[:]

        If names is a string, a single field is extracted, and the resulting
        arrays will have that dtype. Otherwise, it should be an iterable,
        and the read data will have a compound dtype.
        """
        if _prior_dtype is None:
            _prior_dtype = self.dtype
        return FieldsWrapper(self, _prior_dtype, names)

    if MPI:
        @property
        @with_phil
        def collective(self):
            """ Context manager for MPI collective reads & writes """
            return CollectiveContext(self)

    @property
    def dims(self):
        """ Access dimension scales attached to this dataset. """
        from .dims import DimensionManager
        with phil:
            return DimensionManager(self)

    @property
    @with_phil
    def ndim(self):
        """Numpy-style attribute giving the number of dimensions"""
        return self.id.rank

    @property
    def shape(self):
        """Numpy-style shape tuple giving dataset dimensions"""
        if 'shape' in self._cache_props:
            return self._cache_props['shape']

        with phil:
            shape = self.id.shape

        # If the file is read-only, cache the shape to speed-up future uses.
        # This cache is invalidated by .refresh() when using SWMR.
        if self._readonly:
            self._cache_props['shape'] = shape
        return shape

    @shape.setter
    @with_phil
    def shape(self, shape):
        # pylint: disable=missing-docstring
        self.resize(shape)

    @property
    def size(self):
        """Numpy-style attribute giving the total dataset size"""
        if 'size' in self._cache_props:
            return self._cache_props['size']

        if self._is_empty:
            size = None
        else:
            size = product(self.shape)

        # If the file is read-only, cache the size to speed-up future uses.
        # This cache is invalidated by .refresh() when using SWMR.
        if self._readonly:
            self._cache_props['size'] = size
        return size

    @property
    def nbytes(self):
        """Numpy-style attribute giving the raw dataset size as the number of bytes"""
        size = self.size
        if size is None:  # if we are an empty 0-D array, then there are no bytes in the dataset
            return 0
        return self.dtype.itemsize * size

    @property
    def _selector(self):
        """Internal object for optimised selection of data"""
        if '_selector' in self._cache_props:
            return self._cache_props['_selector']

        slr = _selector.Selector(self.id.get_space())

        # If the file is read-only, cache the reader to speed up future uses.
        # This cache is invalidated by .refresh() when using SWMR.
        if self._readonly:
            self._cache_props['_selector'] = slr
        return slr

    @property
    def _fast_reader(self):
        """Internal object for optimised reading of data"""
        if '_fast_reader' in self._cache_props:
            return self._cache_props['_fast_reader']

        rdr = _selector.Reader(self.id)

        # If the file is read-only, cache the reader to speed up future uses.
        # This cache is invalidated by .refresh() when using SWMR.
        if self._readonly:
            self._cache_props['_fast_reader'] = rdr
        return rdr

    @property
    @with_phil
    def dtype(self):
        """Numpy dtype representing the datatype"""
        return self.id.dtype

    @property
    @with_phil
    def chunks(self):
        """Dataset chunks (or None)"""
        dcpl = self._dcpl
        if dcpl.get_layout() == h5d.CHUNKED:
            return dcpl.get_chunk()
        return None

    @property
    @with_phil
    def compression(self):
        """Compression strategy (or None)"""
        for x in ('gzip', 'lzf', 'szip'):
            if x in self._filters:
                return x
        return None

    @property
    @with_phil
    def compression_opts(self):
        """ Compression setting.  Int(0-9) for gzip, 2-tuple for szip. """
        return self._filters.get(self.compression, None)

    @property
    @with_phil
    def shuffle(self):
        """Shuffle filter present (T/F)"""
        return 'shuffle' in self._filters

    @property
    @with_phil
    def fletcher32(self):
        """Fletcher32 filter is present (T/F)"""
        return 'fletcher32' in self._filters

    @property
    @with_phil
    def scaleoffset(self):
        """Scale/offset filter settings. For integer data types, this is
        the number of bits stored, or 0 for auto-detected. For floating
        point data types, this is the number of decimal places retained.
        If the scale/offset filter is not in use, this is None."""
        try:
            return self._filters['scaleoffset'][1]
        except KeyError:
            return None

    @property
    @with_phil
    def external(self):
        """External file settings. Returns a list of tuples of
        (name, offset, size) for each external file entry, or returns None
        if no external files are used."""
        count = self._dcpl.get_external_count()
        if count <= 0:
            return None
        ext_list = list()
        for x in range(count):
            (name, offset, size) = self._dcpl.get_external(x)
            ext_list.append((filename_decode(name), offset, size))
        return ext_list

    @property
    @with_phil
    def maxshape(self):
        """Shape up to which this dataset can be resized.  Axes with value
        None have no resize limit. """
        space = self.id.get_space()
        dims = space.get_simple_extent_dims(True)
        if dims is None:
            return None

        return tuple(x if x != h5s.UNLIMITED else None for x in dims)

    @property
    @with_phil
    def fillvalue(self):
        """Fill value for this dataset (0 by default)"""
        arr = numpy.zeros((1,), dtype=self.dtype)
        self._dcpl.get_fill_value(arr)
        return arr[0]

    @cached_property
    @with_phil
    def _extent_type(self):
        """Get extent type for this dataset - SIMPLE, SCALAR or NULL"""
        return self.id.get_space().get_simple_extent_type()

    @cached_property
    def _is_empty(self):
        """Check if extent type is empty"""
        return self._extent_type == h5s.NULL

    @with_phil
    def __init__(self, bind, *, readonly=False):
        """ Create a new Dataset object by binding to a low-level DatasetID.
        """
        if not isinstance(bind, h5d.DatasetID):
            raise ValueError("%s is not a DatasetID" % bind)
        super().__init__(bind)

        self._dcpl = self.id.get_create_plist()
        self._dxpl = h5p.create(h5p.DATASET_XFER)
        self._filters = filters.get_filters(self._dcpl)
        self._readonly = readonly
        self._cache_props = {}

    def resize(self, size, axis=None):
        """ Resize the dataset, or the specified axis.

        The dataset must be stored in chunked format; it can be resized up to
        the "maximum shape" (keyword maxshape) specified at creation time.
        The rank of the dataset cannot be changed.

        "Size" should be a shape tuple, or if an axis is specified, an integer.

        BEWARE: This functions differently than the NumPy resize() method!
        The data is not "reshuffled" to fit in the new shape; each axis is
        grown or shrunk independently.  The coordinates of existing data are
        fixed.
        """
        with phil:
            if self.chunks is None:
                raise TypeError("Only chunked datasets can be resized")

            if axis is not None:
                if not (axis >= 0 and axis < self.id.rank):
                    raise ValueError("Invalid axis (0 to %s allowed)" % (self.id.rank - 1))
                try:
                    newlen = int(size)
                except TypeError:
                    raise TypeError("Argument must be a single int if axis is specified")
                size = list(self.shape)
                size[axis] = newlen

            size = tuple(size)
            self.id.set_extent(size)
            #h5f.flush(self.id)  # THG recommends
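
    # Illustrative sketch: resizing requires a chunked dataset created with a
    # maxshape.  The dataset name and sizes below are hypothetical.
    #
    #     >>> dset = f.create_dataset('log', shape=(0,), maxshape=(None,),
    #     ...                         chunks=(1024,), dtype='f8')
    #     >>> dset.resize((dset.shape[0] + 100,))   # grow by 100 rows
    #     >>> dset.resize(500, axis=0)              # or set a single axis directly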

    @with_phil
    def __len__(self):
        """ The size of the first axis.  TypeError if scalar.

        Limited to 2**32 on 32-bit systems; Dataset.len() is preferred.
        """
        size = self.len()
        if size > sys.maxsize:
            raise OverflowError("Value too big for Python's __len__; use Dataset.len() instead.")
        return size

    def len(self):
        """ The size of the first axis.  TypeError if scalar.

        Use of this method is preferred to len(dset), as Python's built-in
        len() cannot handle values greater than 2**32 on 32-bit systems.
        """
        with phil:
            shape = self.shape
            if len(shape) == 0:
                raise TypeError("Attempt to take len() of scalar dataset")
            return shape[0]

    @with_phil
    def __iter__(self):
        """ Iterate over the first axis.  TypeError if scalar.

        BEWARE: Modifications to the yielded data are *NOT* written to file.
        """
        shape = self.shape
        if len(shape) == 0:
            raise TypeError("Can't iterate over a scalar dataset")
        for i in range(shape[0]):
            yield self[i]

    @with_phil
    def iter_chunks(self, sel=None):
        """ Return chunk iterator.  If set, the sel argument is a slice or
        tuple of slices that defines the region to be used. If not set, the
        entire dataspace will be used for the iterator.

        For each chunk within the given region, the iterator yields a tuple of
        slices that gives the intersection of the given chunk with the
        selection area.

        A TypeError will be raised if the dataset is not chunked.

        A ValueError will be raised if the selection region is invalid.

        """
        return ChunkIterator(self, sel)
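
    # Illustrative sketch: iterating chunk-aligned blocks of a chunked dataset
    # (the dataset is hypothetical), processing one chunk at a time.
    #
    #     >>> total = 0.0
    #     >>> for chunk_slices in dset.iter_chunks():
    #     ...     total += dset[chunk_slices].sum()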

    @cached_property
    def _fast_read_ok(self):
        """Is this dataset suitable for simple reading"""
        return (
            self._extent_type == h5s.SIMPLE
            and isinstance(self.id.get_type(), (h5t.TypeIntegerID, h5t.TypeFloatID))
        )

    @with_phil
    def __getitem__(self, args, new_dtype=None):
        """ Read a slice from the HDF5 dataset.

        Takes slices and recarray-style field names (more than one is
        allowed!) in any order.  Obeys basic NumPy rules, including
        broadcasting.

        Also supports:

        * Boolean "mask" array indexing
        """
        args = args if isinstance(args, tuple) else (args,)

        if self._fast_read_ok and (new_dtype is None):
            try:
                return self._fast_reader.read(args)
            except TypeError:
                pass  # Fall back to Python read pathway below

        if self._is_empty:
            # Check 'is Ellipsis' to avoid equality comparison with an array:
            # array equality returns an array, not a boolean.
            if args == () or (len(args) == 1 and args[0] is Ellipsis):
                return Empty(self.dtype)
            raise ValueError("Empty datasets cannot be sliced")

        # Sort field names from the rest of the args.
        names = tuple(x for x in args if isinstance(x, str))

        if names:
            # Read a subset of the fields in this structured dtype
            if len(names) == 1:
                names = names[0]  # Read with simpler dtype of this field
            args = tuple(x for x in args if not isinstance(x, str))
            return self.fields(names, _prior_dtype=new_dtype)[args]

        if new_dtype is None:
            new_dtype = self.dtype
        mtype = h5t.py_create(new_dtype)

        # === Special-case region references ====

        if len(args) == 1 and isinstance(args[0], h5r.RegionReference):

            obj = h5r.dereference(args[0], self.id)
            if obj != self.id:
                raise ValueError("Region reference must point to this dataset")

            sid = h5r.get_region(args[0], self.id)
            mshape = sel.guess_shape(sid)
            if mshape is None:
                # 0D with no data (NULL or deselected SCALAR)
                return Empty(new_dtype)
            out = numpy.zeros(mshape, dtype=new_dtype)
            if out.size == 0:
                return out

            sid_out = h5s.create_simple(mshape)
            sid_out.select_all()
            self.id.read(sid_out, sid, out, mtype)
            return out

        # === Check for zero-sized datasets =====

        if self.size == 0:
            # Check 'is Ellipsis' to avoid equality comparison with an array:
            # array equality returns an array, not a boolean.
            if args == () or (len(args) == 1 and args[0] is Ellipsis):
                return numpy.zeros(self.shape, dtype=new_dtype)

        # === Scalar dataspaces =================

        if self.shape == ():
            fspace = self.id.get_space()
            selection = sel2.select_read(fspace, args)
            if selection.mshape is None:
                arr = numpy.zeros((), dtype=new_dtype)
            else:
                arr = numpy.zeros(selection.mshape, dtype=new_dtype)
            for mspace, fspace in selection:
                self.id.read(mspace, fspace, arr, mtype)
            if selection.mshape is None:
                return arr[()]
            return arr

        # === Everything else ===================

        # Perform the dataspace selection.
        selection = sel.select(self.shape, args, dataset=self)

        if selection.nselect == 0:
            return numpy.zeros(selection.array_shape, dtype=new_dtype)

        arr = numpy.zeros(selection.array_shape, new_dtype, order='C')

        # Perform the actual read
        mspace = h5s.create_simple(selection.mshape)
        fspace = selection.id
        self.id.read(mspace, fspace, arr, mtype, dxpl=self._dxpl)

        # Patch up the output for NumPy
        if arr.shape == ():
            return arr[()]  # 0 dim array -> numpy scalar
        return arr
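
    # Illustrative sketch of the read paths handled above (the dataset name is
    # hypothetical): hyperslab slicing, recarray-style field names and boolean
    # mask selection all go through __getitem__.
    #
    #     >>> block = dset[10:20, ::2]        # hyperslab selection
    #     >>> xs = dset['x', :100]            # single field of a compound type
    #     >>> big = dset[dset[:] > 0.5]       # boolean mask (reads the data twice)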

    @with_phil
    def __setitem__(self, args, val):
        """ Write to the HDF5 dataset from a Numpy array.

        NumPy's broadcasting rules are honored, for "simple" indexing
        (slices and integers).  For advanced indexing, the shapes must
        match.
        """
        args = args if isinstance(args, tuple) else (args,)

        # Sort field indices from the slicing
        names = tuple(x for x in args if isinstance(x, str))
        args = tuple(x for x in args if not isinstance(x, str))

        # Generally we try to avoid converting the arrays on the Python
        # side.  However, for compound literals this is unavoidable.
        vlen = h5t.check_vlen_dtype(self.dtype)
        if vlen is not None and vlen not in (bytes, str):
            try:
                val = numpy.asarray(val, dtype=vlen)
            except (ValueError, TypeError):
                try:
                    val = numpy.array([numpy.array(x, dtype=vlen)
                                       for x in val], dtype=self.dtype)
                except (ValueError, TypeError):
                    pass
            if vlen == val.dtype:
                if val.ndim > 1:
                    tmp = numpy.empty(shape=val.shape[:-1], dtype=object)
                    tmp.ravel()[:] = [i for i in val.reshape(
                        (product(val.shape[:-1]), val.shape[-1])
                    )]
                else:
                    tmp = numpy.array([None], dtype=object)
                    tmp[0] = val
                val = tmp
        elif self.dtype.kind == "O" or \
            (self.dtype.kind == 'V' and
             (not isinstance(val, numpy.ndarray) or val.dtype.kind != 'V') and
             (self.dtype.subdtype is None)):
            if len(names) == 1 and self.dtype.fields is not None:
                # Single field selected for write, from a non-array source
                if not names[0] in self.dtype.fields:
                    raise ValueError("No such field for indexing: %s" % names[0])
                dtype = self.dtype.fields[names[0]][0]
                cast_compound = True
            else:
                dtype = self.dtype
                cast_compound = False

            val = numpy.asarray(val, dtype=dtype.base, order='C')
            if cast_compound:
                val = val.view(numpy.dtype([(names[0], dtype)]))
                val = val.reshape(val.shape[:len(val.shape) - len(dtype.shape)])
        elif (self.dtype.kind == 'S'
              and (h5t.check_string_dtype(self.dtype).encoding == 'utf-8')
              and (find_item_type(val) is str)
        ):
            # Writing str objects to a fixed-length UTF-8 string dataset.
            # Numpy's normal conversion only handles ASCII characters, but
            # when the destination is UTF-8, we want to allow any unicode.
            # This *doesn't* handle numpy fixed-length unicode data ('U' dtype),
            # as HDF5 has no equivalent, and converting fixed length UTF-32
            # to variable length UTF-8 would obscure what's going on.
            str_array = numpy.asarray(val, order='C', dtype=object)
            val = numpy.array([
                s.encode('utf-8') for s in str_array.flat
            ], dtype=self.dtype).reshape(str_array.shape)
        else:
            # If the input data is already an array, let HDF5 do the conversion.
            # If it's a list or similar, don't make numpy guess a dtype for it.
            dt = None if isinstance(val, numpy.ndarray) else self.dtype.base
            val = numpy.asarray(val, order='C', dtype=dt)

        # Check for array dtype compatibility and convert
        if self.dtype.subdtype is not None:
            shp = self.dtype.subdtype[1]
            valshp = val.shape[-len(shp):]
            if valshp != shp:  # Last dimension has to match
                raise TypeError("When writing to array types, last N dimensions have to match (got %s, but should be %s)" % (valshp, shp,))
            mtype = h5t.py_create(numpy.dtype((val.dtype, shp)))
            mshape = val.shape[0:len(val.shape) - len(shp)]

        # Make a compound memory type if field-name slicing is required
        elif len(names) != 0:

            mshape = val.shape

            # Catch common errors
            if self.dtype.fields is None:
                raise TypeError("Illegal slicing argument (not a compound dataset)")
            mismatch = [x for x in names if x not in self.dtype.fields]
            if len(mismatch) != 0:
                mismatch = ", ".join('"%s"' % x for x in mismatch)
                raise ValueError("Illegal slicing argument (fields %s not in dataset type)" % mismatch)

            # Write non-compound source into a single dataset field
            if len(names) == 1 and val.dtype.fields is None:
                subtype = h5t.py_create(val.dtype)
                mtype = h5t.create(h5t.COMPOUND, subtype.get_size())
                mtype.insert(self._e(names[0]), 0, subtype)

            # Make a new source type keeping only the requested fields
            else:
                fieldnames = [x for x in val.dtype.names if x in names]  # Keep source order
                mtype = h5t.create(h5t.COMPOUND, val.dtype.itemsize)
                for fieldname in fieldnames:
                    subtype = h5t.py_create(val.dtype.fields[fieldname][0])
                    offset = val.dtype.fields[fieldname][1]
                    mtype.insert(self._e(fieldname), offset, subtype)

        # Use mtype derived from array (let DatasetID.write figure it out)
        else:
            mshape = val.shape
            mtype = None

        # Perform the dataspace selection
        selection = sel.select(self.shape, args, dataset=self)

        if selection.nselect == 0:
            return

        # Broadcast scalars if necessary.
        # In order to avoid slow broadcasting filling the destination by
        # the scalar value, we create an intermediate array of the same
        # size as the destination buffer, provided that size is reasonable.
        # We treat the size as reasonable if it is no larger than the
        # dataset chunk size, if any.
        # If the destination dataset is not chunked, or the selection is
        # larger than the dataset chunk size, we fall back to an intermediate
        # array whose size equals the last dimension of the destination
        # buffer.
        # The reasoning behind this is that the creator of the dataset can be
        # assumed to have chosen a chunk size appropriate for the available
        # memory.  In any case, if we cannot afford to create an intermediate
        # array of the same size as the dataset chunk size, the user program
        # has little hope of going much further.  Solves h5py issue #1067.
        if mshape == () and selection.array_shape != ():
            if self.dtype.subdtype is not None:
                raise TypeError("Scalar broadcasting is not supported for array dtypes")
            if self.chunks and (product(self.chunks) >= product(selection.array_shape)):
                val2 = numpy.empty(selection.array_shape, dtype=val.dtype)
            else:
                val2 = numpy.empty(selection.array_shape[-1], dtype=val.dtype)
            val2[...] = val
            val = val2
            mshape = val.shape

        # Perform the write, with broadcasting
        mspace = h5s.create_simple(selection.expand_shape(mshape))
        for fspace in selection.broadcast(mshape):
            self.id.write(mspace, fspace, val, mtype, dxpl=self._dxpl)

    def read_direct(self, dest, source_sel=None, dest_sel=None):
        """ Read data directly from HDF5 into an existing NumPy array.

        The destination array must be C-contiguous and writable.
        Selections must be the output of numpy.s_[<args>].

        Broadcasting is supported for simple indexing.
        """
        with phil:
            if self._is_empty:
                raise TypeError("Empty datasets have no numpy representation")
            if source_sel is None:
                source_sel = sel.SimpleSelection(self.shape)
            else:
                source_sel = sel.select(self.shape, source_sel, self)  # for numpy.s_
            fspace = source_sel.id

            if dest_sel is None:
                dest_sel = sel.SimpleSelection(dest.shape)
            else:
                dest_sel = sel.select(dest.shape, dest_sel)

            for mspace in dest_sel.broadcast(source_sel.array_shape):
                self.id.read(mspace, fspace, dest, dxpl=self._dxpl)

    def write_direct(self, source, source_sel=None, dest_sel=None):
        """ Write data directly to HDF5 from a NumPy array.

        The source array must be C-contiguous.  Selections must be
        the output of numpy.s_[<args>].

        Broadcasting is supported for simple indexing.
        """
        with phil:
            if self._is_empty:
                raise TypeError("Empty datasets cannot be written to")
            if source_sel is None:
                source_sel = sel.SimpleSelection(source.shape)
            else:
                source_sel = sel.select(source.shape, source_sel)  # for numpy.s_
            mspace = source_sel.id

            if dest_sel is None:
                dest_sel = sel.SimpleSelection(self.shape)
            else:
                dest_sel = sel.select(self.shape, dest_sel, self)

            for fspace in dest_sel.broadcast(source_sel.array_shape):
                self.id.write(mspace, fspace, source, dxpl=self._dxpl)
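
    # Illustrative sketch: read_direct()/write_direct() reuse an existing NumPy
    # buffer instead of allocating a new array on every read (names and shapes
    # are hypothetical).
    #
    #     >>> buf = numpy.empty((100,), dtype=dset.dtype)
    #     >>> dset.read_direct(buf, source_sel=numpy.s_[0:100], dest_sel=numpy.s_[:])
    #     >>> dset.write_direct(buf, source_sel=numpy.s_[:], dest_sel=numpy.s_[100:200])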

    @with_phil
    def __array__(self, dtype=None, copy=True):
        """ Create a Numpy array containing the whole dataset.  DON'T THINK
        THIS MEANS DATASETS ARE INTERCHANGEABLE WITH ARRAYS.  For one thing,
        you have to read the whole dataset every time this method is called.
        """
        if copy is False:
            raise ValueError(
                f"Dataset.__array__ received {copy=} "
                f"but memory allocation cannot be avoided on read"
            )
        arr = numpy.zeros(self.shape, dtype=self.dtype if dtype is None else dtype)

        # Special case for (0,)*-shape datasets
        if self.size == 0:
            return arr

        self.read_direct(arr)
        return arr

    @with_phil
    def __repr__(self):
        if not self:
            r = '<Closed HDF5 dataset>'
        else:
            if self.name is None:
                namestr = '("anonymous")'
            else:
                name = pp.basename(pp.normpath(self.name))
                namestr = '"%s"' % (name if name != '' else '/')
            r = '<HDF5 dataset %s: shape %s, type "%s">' % (
                namestr, self.shape, self.dtype.str
            )
        return r

    if hasattr(h5d.DatasetID, "refresh"):
        @with_phil
        def refresh(self):
            """ Refresh the dataset metadata by reloading from the file.

            This is part of the SWMR features and only exists when the HDF5
            library version is >= 1.9.178.
            """
            self._id.refresh()
            self._cache_props.clear()

    if hasattr(h5d.DatasetID, "flush"):
        @with_phil
        def flush(self):
            """ Flush the dataset data and metadata to the file.
            If the dataset is chunked, raw data chunks are written to the file.

            This is part of the SWMR features and only exists when the HDF5
            library version is >= 1.9.178.
            """
            self._id.flush()

    if vds_support:
        @property
        @with_phil
        def is_virtual(self):
            """Check if this is a virtual dataset"""
            return self._dcpl.get_layout() == h5d.VIRTUAL

        @with_phil
        def virtual_sources(self):
            """Get a list of the data mappings for a virtual dataset"""
            if not self.is_virtual:
                raise RuntimeError("Not a virtual dataset")
            dcpl = self._dcpl
            return [
                VDSmap(dcpl.get_virtual_vspace(j),
                       dcpl.get_virtual_filename(j),
                       dcpl.get_virtual_dsetname(j),
                       dcpl.get_virtual_srcspace(j))
                for j in range(dcpl.get_virtual_count())]

    @with_phil
    def make_scale(self, name=''):
        """Make this dataset an HDF5 dimension scale.

        You can then attach it to dimensions of other datasets like this::

            other_ds.dims[0].attach_scale(ds)

        You can optionally pass a name to associate with this scale.
        """
        h5ds.set_scale(self._id, self._e(name))

    @property
    @with_phil
    def is_scale(self):
        """Return ``True`` if this dataset is also a dimension scale.

        Return ``False`` otherwise.
        """
        return h5ds.is_scale(self._id)