Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/h5py/_hl/dataset.py: 21%

633 statements  

coverage.py v7.2.7, created at 2023-06-07 06:30 +0000

1# This file is part of h5py, a Python interface to the HDF5 library. 

2# 

3# http://www.h5py.org 

4# 

5# Copyright 2008-2020 Andrew Collette and contributors 

6# 

7# License: Standard 3-clause BSD; see "license.txt" for full license terms 

8# and contributor agreement. 

9 

10""" 

11 Implements support for high-level dataset access. 

12""" 

13 

14import posixpath as pp 

15import sys 

16 

17import numpy 

18 

19from .. import h5, h5s, h5t, h5r, h5d, h5p, h5fd, h5ds, _selector 

20from .base import HLObject, phil, with_phil, Empty, cached_property, find_item_type, array_for_new_object 

21from . import filters 

22from . import selections as sel 

23from . import selections2 as sel2 

24from .datatype import Datatype 

25from .compat import filename_decode 

26from .vds import VDSmap, vds_support 

27 

28_LEGACY_GZIP_COMPRESSION_VALS = frozenset(range(10)) 

29MPI = h5.get_config().mpi 

30 

31 

32def make_new_dset(parent, shape=None, dtype=None, data=None, name=None, 

33 chunks=None, compression=None, shuffle=None, 

34 fletcher32=None, maxshape=None, compression_opts=None, 

35 fillvalue=None, scaleoffset=None, track_times=False, 

36 external=None, track_order=None, dcpl=None, dapl=None, 

37 efile_prefix=None, virtual_prefix=None, allow_unknown_filter=False, 

38 rdcc_nslots=None, rdcc_nbytes=None, rdcc_w0=None): 

39 """ Return a new low-level dataset identifier """ 

40 

41 # Convert data to a C-contiguous ndarray 

42 if data is not None and not isinstance(data, Empty): 

43 data = array_for_new_object(data, specified_dtype=dtype) 

44 

45 # Validate shape 

46 if shape is None: 

47 if data is None: 

48 if dtype is None: 

49 raise TypeError("One of data, shape or dtype must be specified") 

50 data = Empty(dtype) 

51 shape = data.shape 

52 else: 

53 shape = (shape,) if isinstance(shape, int) else tuple(shape) 

54 if data is not None and (numpy.product(shape, dtype=numpy.ulonglong) != numpy.product(data.shape, dtype=numpy.ulonglong)): 

55 raise ValueError("Shape tuple is incompatible with data") 

56 

57 if isinstance(maxshape, int): 

58 maxshape = (maxshape,) 

59 tmp_shape = maxshape if maxshape is not None else shape 

60 

61 # Validate chunk shape 

62 if isinstance(chunks, int) and not isinstance(chunks, bool): 

63 chunks = (chunks,) 

64 if isinstance(chunks, tuple) and any( 

65 chunk > dim for dim, chunk in zip(tmp_shape, chunks) if dim is not None 

66 ): 

67 errmsg = "Chunk shape must not be greater than data shape in any dimension. "\ 

68 "{} is not compatible with {}".format(chunks, shape) 

69 raise ValueError(errmsg) 

70 

71 if isinstance(dtype, Datatype): 

72 # Named types are used as-is 

73 tid = dtype.id 

74 dtype = tid.dtype # Following code needs this 

75 else: 

76 # Validate dtype 

77 if dtype is None and data is None: 

78 dtype = numpy.dtype("=f4") 

79 elif dtype is None and data is not None: 

80 dtype = data.dtype 

81 else: 

82 dtype = numpy.dtype(dtype) 

83 tid = h5t.py_create(dtype, logical=1) 

84 

85 # Legacy 

86 if any((compression, shuffle, fletcher32, maxshape, scaleoffset)) and chunks is False: 

87 raise ValueError("Chunked format required for given storage options") 

88 

89 # Legacy 

90 if compression is True: 

91 if compression_opts is None: 

92 compression_opts = 4 

93 compression = 'gzip' 

94 

95 # Legacy 

96 if compression in _LEGACY_GZIP_COMPRESSION_VALS: 

97 if compression_opts is not None: 

98 raise TypeError("Conflict in compression options") 

99 compression_opts = compression 

100 compression = 'gzip' 

101 dcpl = filters.fill_dcpl( 

102 dcpl or h5p.create(h5p.DATASET_CREATE), shape, dtype, 

103 chunks, compression, compression_opts, shuffle, fletcher32, 

104 maxshape, scaleoffset, external, allow_unknown_filter) 

105 

106 if fillvalue is not None: 

107 # prepare string-type dtypes for fillvalue 

108 string_info = h5t.check_string_dtype(dtype) 

109 if string_info is not None: 

110 # fake vlen dtype for fixed len string fillvalue 

111 # to not trigger unwanted encoding 

112 dtype = h5t.string_dtype(string_info.encoding) 

113 fillvalue = numpy.array(fillvalue, dtype=dtype) 

114 else: 

115 fillvalue = numpy.array(fillvalue) 

116 dcpl.set_fill_value(fillvalue) 

117 

118 if track_times is None: 

119 # In case someone explicitly passes None for the default 

120 track_times = False 

121 if track_times in (True, False): 

122 dcpl.set_obj_track_times(track_times) 

123 else: 

124 raise TypeError("track_times must be either True or False") 

125 if track_order is True: 

126 dcpl.set_attr_creation_order( 

127 h5p.CRT_ORDER_TRACKED | h5p.CRT_ORDER_INDEXED) 

128 elif track_order is False: 

129 dcpl.set_attr_creation_order(0) 

130 elif track_order is not None: 

131 raise TypeError("track_order must be either True or False") 

132 

133 if maxshape is not None: 

134 maxshape = tuple(m if m is not None else h5s.UNLIMITED for m in maxshape) 

135 

136 if any([efile_prefix, virtual_prefix, rdcc_nbytes, rdcc_nslots, rdcc_w0]): 

137 dapl = dapl or h5p.create(h5p.DATASET_ACCESS) 

138 

139 if efile_prefix is not None: 

140 dapl.set_efile_prefix(efile_prefix) 

141 

142 if virtual_prefix is not None: 

143 dapl.set_virtual_prefix(virtual_prefix) 

144 

145 if rdcc_nbytes or rdcc_nslots or rdcc_w0: 

146 cache_settings = list(dapl.get_chunk_cache()) 

147 if rdcc_nslots is not None: 

148 cache_settings[0] = rdcc_nslots 

149 if rdcc_nbytes is not None: 

150 cache_settings[1] = rdcc_nbytes 

151 if rdcc_w0 is not None: 

152 cache_settings[2] = rdcc_w0 

153 dapl.set_chunk_cache(*cache_settings) 

154 

155 if isinstance(data, Empty): 

156 sid = h5s.create(h5s.NULL) 

157 else: 

158 sid = h5s.create_simple(shape, maxshape) 

159 

160 dset_id = h5d.create(parent.id, name, tid, sid, dcpl=dcpl, dapl=dapl) 

161 

162 if (data is not None) and (not isinstance(data, Empty)): 

163 dset_id.write(h5s.ALL, h5s.ALL, data) 

164 

165 return dset_id 

166 
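Illustrative note (not part of the h5py source): a minimal sketch of how the storage options validated in make_new_dset above are usually supplied through the high-level create_dataset call, which delegates to this function. The file name "example.h5" and dataset name "measurements" are placeholders.

import h5py

with h5py.File("example.h5", "w") as f:
    # compression=True falls back to gzip level 4, and a bare int 0-9 is
    # treated as a gzip level (the "legacy" branches above).
    dset = f.create_dataset(
        "measurements",
        shape=(1000, 64),
        dtype="f4",
        chunks=(100, 64),        # must not exceed the shape in any dimension
        compression="gzip",
        compression_opts=4,
        maxshape=(None, 64),     # None becomes h5s.UNLIMITED for that axis
        fillvalue=-1.0,
    )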

167 

168def open_dset(parent, name, dapl=None, efile_prefix=None, virtual_prefix=None, 

169 rdcc_nslots=None, rdcc_nbytes=None, rdcc_w0=None, **kwds): 

170 """ Return an existing low-level dataset identifier """ 

171 

172 if any([efile_prefix, virtual_prefix, rdcc_nbytes, rdcc_nslots, rdcc_w0]): 

173 dapl = dapl or h5p.create(h5p.DATASET_ACCESS) 

174 

175 if efile_prefix is not None: 

176 dapl.set_efile_prefix(efile_prefix) 

177 

178 if virtual_prefix is not None: 

179 dapl.set_virtual_prefix(virtual_prefix) 

180 

181 if rdcc_nbytes or rdcc_nslots or rdcc_w0: 

182 cache_settings = list(dapl.get_chunk_cache()) 

183 if rdcc_nslots is not None: 

184 cache_settings[0] = rdcc_nslots 

185 if rdcc_nbytes is not None: 

186 cache_settings[1] = rdcc_nbytes 

187 if rdcc_w0 is not None: 

188 cache_settings[2] = rdcc_w0 

189 dapl.set_chunk_cache(*cache_settings) 

190 

191 dset_id = h5d.open(parent.id, name, dapl=dapl) 

192 

193 return dset_id 

194 
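Illustrative note (not part of the h5py source): a sketch of the per-dataset chunk-cache tuning that open_dset performs, expressed with the same low-level property-list calls it uses. The file name "example.h5" and dataset name b"measurements" are placeholders and assumed to already exist.

import h5py
from h5py import h5p, h5d

f = h5py.File("example.h5", "r")
dapl = h5p.create(h5p.DATASET_ACCESS)
nslots, nbytes, w0 = dapl.get_chunk_cache()           # library defaults
dapl.set_chunk_cache(nslots, 16 * 1024 * 1024, w0)    # e.g. a 16 MiB raw chunk cache
dset = h5py.Dataset(h5d.open(f.id, b"measurements", dapl=dapl))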

195 

196class AstypeWrapper: 

197 """Wrapper to convert data on reading from a dataset. 

198 """ 

199 def __init__(self, dset, dtype): 

200 self._dset = dset 

201 self._dtype = numpy.dtype(dtype) 

202 

203 def __getitem__(self, args): 

204 return self._dset.__getitem__(args, new_dtype=self._dtype) 

205 

206 def __len__(self): 

207 """ Get the length of the underlying dataset 

208 

209 >>> length = len(dataset.astype('f8')) 

210 """ 

211 return len(self._dset) 

212 

213 def __array__(self, dtype=None): 

214 data = self[:] 

215 if dtype is not None: 

216 data = data.astype(dtype) 

217 return data 

218 

219 

220class AsStrWrapper: 

221 """Wrapper to decode strings on reading the dataset""" 

222 def __init__(self, dset, encoding, errors='strict'): 

223 self._dset = dset 

224 if encoding is None: 

225 encoding = h5t.check_string_dtype(dset.dtype).encoding 

226 self.encoding = encoding 

227 self.errors = errors 

228 

229 def __getitem__(self, args): 

230 bytes_arr = self._dset[args] 

231 # numpy.char.decode() seems like the obvious thing to use. But it only 

232 # accepts numpy string arrays, not object arrays of bytes (which we 

233 # return from HDF5 variable-length strings). And the numpy 

234 # implementation is not faster than doing it with a loop; in fact, by 

235 # not converting the result to a numpy unicode array, the 

236 # naive way can be faster! (Comparing with numpy 1.18.4, June 2020) 

237 if numpy.isscalar(bytes_arr): 

238 return bytes_arr.decode(self.encoding, self.errors) 

239 

240 return numpy.array([ 

241 b.decode(self.encoding, self.errors) for b in bytes_arr.flat 

242 ], dtype=object).reshape(bytes_arr.shape) 

243 

244 def __len__(self): 

245 """ Get the length of the underlying dataset 

246 

247 >>> length = len(dataset.asstr()) 

248 """ 

249 return len(self._dset) 

250 

251 def __array__(self): 

252 return numpy.array([ 

253 b.decode(self.encoding, self.errors) for b in self._dset 

254 ], dtype=object).reshape(self._dset.shape) 

255 
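Illustrative note (not part of the h5py source): a sketch of reading string data through AsStrWrapper via Dataset.asstr(). The file name "strings.h5" is a placeholder.

import h5py

with h5py.File("strings.h5", "w") as f:
    ds = f.create_dataset("names", data=["alpha", "beta"])   # variable-length UTF-8 strings
    raw = ds[:]                        # bytes objects by default
    decoded = ds.asstr()[:]            # str objects, decoded per the HDF5 datatype
    first = ds.asstr(encoding="utf-8", errors="replace")[0]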

256 

257class FieldsWrapper: 

258 """Wrapper to extract named fields from a dataset with a struct dtype""" 

259 extract_field = None 

260 

261 def __init__(self, dset, prior_dtype, names): 

262 self._dset = dset 

263 if isinstance(names, str): 

264 self.extract_field = names 

265 names = [names] 

266 self.read_dtype = readtime_dtype(prior_dtype, names) 

267 

268 def __array__(self, dtype=None): 

269 data = self[:] 

270 if dtype is not None: 

271 data = data.astype(dtype) 

272 return data 

273 

274 def __getitem__(self, args): 

275 data = self._dset.__getitem__(args, new_dtype=self.read_dtype) 

276 if self.extract_field is not None: 

277 data = data[self.extract_field] 

278 return data 

279 

280 def __len__(self): 

281 """ Get the length of the underlying dataset 

282 

283 >>> length = len(dataset.fields(['x', 'y'])) 

284 """ 

285 return len(self._dset) 

286 

287 

288def readtime_dtype(basetype, names): 

289 """Make a NumPy compound dtype with a subset of available fields""" 

290 if basetype.names is None: # Names provided, but not compound 

291 raise ValueError("Field names only allowed for compound types") 

292 

293 for name in names: # Check all names are legal 

294 if name not in basetype.names: 

295 raise ValueError("Field %s does not appear in this type." % name) 

296 

297 return numpy.dtype([(name, basetype.fields[name][0]) for name in names]) 

298 
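Illustrative note (not part of the h5py source): a sketch of reading a subset of compound-type fields through FieldsWrapper via Dataset.fields(). The file and dataset names are placeholders.

import numpy as np
import h5py

compound = np.dtype([("x", "f8"), ("y", "f8"), ("label", "i4")])
with h5py.File("points.h5", "w") as f:
    ds = f.create_dataset("points", shape=(100,), dtype=compound)
    xy = ds.fields(["x", "y"])[:]      # compound dtype containing only x and y
    x_only = ds.fields("x")[:10]       # plain float64 array for a single field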

299 

300if MPI: 

301 class CollectiveContext: 

302 

303 """ Manages collective I/O in MPI mode """ 

304 

305 # We don't bother with _local as threads are forbidden in MPI mode 

306 

307 def __init__(self, dset): 

308 self._dset = dset 

309 

310 def __enter__(self): 

311 # pylint: disable=protected-access 

312 self._dset._dxpl.set_dxpl_mpio(h5fd.MPIO_COLLECTIVE) 

313 

314 def __exit__(self, *args): 

315 # pylint: disable=protected-access 

316 self._dset._dxpl.set_dxpl_mpio(h5fd.MPIO_INDEPENDENT) 

317 
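Illustrative note (not part of the h5py source): a sketch of collective I/O using CollectiveContext through the Dataset.collective property. It assumes h5py built against parallel HDF5 with mpi4py available; the file name "parallel.h5" is a placeholder.

from mpi4py import MPI
import h5py

comm = MPI.COMM_WORLD
with h5py.File("parallel.h5", "w", driver="mpio", comm=comm) as f:
    dset = f.create_dataset("data", shape=(comm.size, 1000), dtype="f8")
    with dset.collective:              # switches the transfer plist to MPIO_COLLECTIVE
        dset[comm.rank] = float(comm.rank)
    # on exit the context resets the transfer mode to MPIO_INDEPENDENT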

318 

319class ChunkIterator: 

320 """ 

321 Class to iterate through list of chunks of a given dataset 

322 """ 

323 def __init__(self, dset, source_sel=None): 

324 self._shape = dset.shape 

325 rank = len(dset.shape) 

326 

327 if not dset.chunks: 

328 # can only use with chunked datasets 

329 raise TypeError("Chunked dataset required") 

330 

331 self._layout = dset.chunks 

332 if source_sel is None: 

333 # select over entire dataset 

334 slices = [] 

335 for dim in range(rank): 

336 slices.append(slice(0, self._shape[dim])) 

337 self._sel = tuple(slices) 

338 else: 

339 if isinstance(source_sel, slice): 

340 self._sel = (source_sel,) 

341 else: 

342 self._sel = source_sel 

343 if len(self._sel) != rank: 

344 raise ValueError("Invalid selection - selection region must have same rank as dataset") 

345 self._chunk_index = [] 

346 for dim in range(rank): 

347 s = self._sel[dim] 

348 if s.start < 0 or s.stop > self._shape[dim] or s.stop <= s.start: 

349 raise ValueError("Invalid selection - selection region must be within dataset space") 

350 index = s.start // self._layout[dim] 

351 self._chunk_index.append(index) 

352 

353 def __iter__(self): 

354 return self 

355 

356 def __next__(self): 

357 rank = len(self._shape) 

358 slices = [] 

359 if rank == 0 or self._chunk_index[0] * self._layout[0] >= self._sel[0].stop: 

360 # ran past the last chunk, end iteration 

361 raise StopIteration() 

362 

363 for dim in range(rank): 

364 s = self._sel[dim] 

365 start = self._chunk_index[dim] * self._layout[dim] 

366 stop = (self._chunk_index[dim] + 1) * self._layout[dim] 

367 # adjust the start if this is an edge chunk 

368 if start < s.start: 

369 start = s.start 

370 if stop > s.stop: 

371 stop = s.stop # trim to end of the selection 

372 s = slice(start, stop, 1) 

373 slices.append(s) 

374 

375 # bump up the last index and carry forward if we run outside the selection 

376 dim = rank - 1 

377 while dim >= 0: 

378 s = self._sel[dim] 

379 self._chunk_index[dim] += 1 

380 

381 chunk_end = self._chunk_index[dim] * self._layout[dim] 

382 if chunk_end < s.stop: 

383                 # we still have room to extend along this dimension

384 return tuple(slices) 

385 

386 if dim > 0: 

387 # reset to the start and continue iterating with higher dimension 

388 self._chunk_index[dim] = 0 

389 dim -= 1 

390 return tuple(slices) 

391 
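Illustrative note (not part of the h5py source): a sketch of ChunkIterator as exposed through Dataset.iter_chunks(); the file name "chunked.h5" is a placeholder.

import numpy as np
import h5py

with h5py.File("chunked.h5", "w") as f:
    ds = f.create_dataset("data", shape=(1000, 1000), chunks=(100, 100), dtype="f4")
    total = 0.0
    for chunk_slices in ds.iter_chunks():          # a tuple of slices per chunk
        total += ds[chunk_slices].sum()
    # restrict iteration to a region; edge chunks are trimmed to the selection
    for chunk_slices in ds.iter_chunks(np.s_[0:250, 0:250]):
        ds[chunk_slices] = 1.0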

392 

393class Dataset(HLObject): 

394 

395 """ 

396 Represents an HDF5 dataset 

397 """ 

398 

399 def astype(self, dtype): 

400 """ Get a wrapper allowing you to perform reads to a 

401 different destination type, e.g.: 

402 

403 >>> double_precision = dataset.astype('f8')[0:100:2] 

404 """ 

405 return AstypeWrapper(self, dtype) 

406 

407 def asstr(self, encoding=None, errors='strict'): 

408 """Get a wrapper to read string data as Python strings: 

409 

410 >>> str_array = dataset.asstr()[:] 

411 

412 The parameters have the same meaning as in ``bytes.decode()``. 

413 If ``encoding`` is unspecified, it will use the encoding in the HDF5 

414 datatype (either ascii or utf-8). 

415 """ 

416 string_info = h5t.check_string_dtype(self.dtype) 

417 if string_info is None: 

418 raise TypeError( 

419 "dset.asstr() can only be used on datasets with " 

420 "an HDF5 string datatype" 

421 ) 

422 if encoding is None: 

423 encoding = string_info.encoding 

424 return AsStrWrapper(self, encoding, errors=errors) 

425 

426 def fields(self, names, *, _prior_dtype=None): 

427 """Get a wrapper to read a subset of fields from a compound data type: 

428 

429         >>> coords_2d = dataset.fields(['x', 'y'])[:]

430 

431 If names is a string, a single field is extracted, and the resulting 

432 arrays will have that dtype. Otherwise, it should be an iterable, 

433 and the read data will have a compound dtype. 

434 """ 

435 if _prior_dtype is None: 

436 _prior_dtype = self.dtype 

437 return FieldsWrapper(self, _prior_dtype, names) 

438 

439 if MPI: 

440 @property 

441 @with_phil 

442 def collective(self): 

443 """ Context manager for MPI collective reads & writes """ 

444 return CollectiveContext(self) 

445 

446 @property 

447 def dims(self): 

448 """ Access dimension scales attached to this dataset. """ 

449 from .dims import DimensionManager 

450 with phil: 

451 return DimensionManager(self) 

452 

453 @property 

454 @with_phil 

455 def ndim(self): 

456 """Numpy-style attribute giving the number of dimensions""" 

457 return self.id.rank 

458 

459 @property 

460 def shape(self): 

461 """Numpy-style shape tuple giving dataset dimensions""" 

462 if 'shape' in self._cache_props: 

463 return self._cache_props['shape'] 

464 

465 with phil: 

466 shape = self.id.shape 

467 

468 # If the file is read-only, cache the shape to speed-up future uses. 

469 # This cache is invalidated by .refresh() when using SWMR. 

470 if self._readonly: 

471 self._cache_props['shape'] = shape 

472 return shape 

473 

474 @shape.setter 

475 @with_phil 

476 def shape(self, shape): 

477 # pylint: disable=missing-docstring 

478 self.resize(shape) 

479 

480 @property 

481 def size(self): 

482 """Numpy-style attribute giving the total dataset size""" 

483 if 'size' in self._cache_props: 

484 return self._cache_props['size'] 

485 

486 if self._is_empty: 

487 size = None 

488 else: 

489 size = numpy.prod(self.shape, dtype=numpy.intp) 

490 

491 # If the file is read-only, cache the size to speed-up future uses. 

492 # This cache is invalidated by .refresh() when using SWMR. 

493 if self._readonly: 

494 self._cache_props['size'] = size 

495 return size 

496 

497 @property 

498 def nbytes(self): 

499 """Numpy-style attribute giving the raw dataset size as the number of bytes""" 

500 size = self.size 

501 if size is None: # if we are an empty 0-D array, then there are no bytes in the dataset 

502 return 0 

503 return self.dtype.itemsize * size 

504 

505 @property 

506 def _selector(self): 

507 """Internal object for optimised selection of data""" 

508 if '_selector' in self._cache_props: 

509 return self._cache_props['_selector'] 

510 

511 slr = _selector.Selector(self.id.get_space()) 

512 

513 # If the file is read-only, cache the reader to speed up future uses. 

514 # This cache is invalidated by .refresh() when using SWMR. 

515 if self._readonly: 

516 self._cache_props['_selector'] = slr 

517 return slr 

518 

519 @property 

520 def _fast_reader(self): 

521 """Internal object for optimised reading of data""" 

522 if '_fast_reader' in self._cache_props: 

523 return self._cache_props['_fast_reader'] 

524 

525 rdr = _selector.Reader(self.id) 

526 

527 # If the file is read-only, cache the reader to speed up future uses. 

528 # This cache is invalidated by .refresh() when using SWMR. 

529 if self._readonly: 

530 self._cache_props['_fast_reader'] = rdr 

531 return rdr 

532 

533 @property 

534 @with_phil 

535 def dtype(self): 

536 """Numpy dtype representing the datatype""" 

537 return self.id.dtype 

538 

539 @property 

540 @with_phil 

541 def chunks(self): 

542 """Dataset chunks (or None)""" 

543 dcpl = self._dcpl 

544 if dcpl.get_layout() == h5d.CHUNKED: 

545 return dcpl.get_chunk() 

546 return None 

547 

548 @property 

549 @with_phil 

550 def compression(self): 

551 """Compression strategy (or None)""" 

552 for x in ('gzip','lzf','szip'): 

553 if x in self._filters: 

554 return x 

555 return None 

556 

557 @property 

558 @with_phil 

559 def compression_opts(self): 

560 """ Compression setting. Int(0-9) for gzip, 2-tuple for szip. """ 

561 return self._filters.get(self.compression, None) 

562 

563 @property 

564 @with_phil 

565 def shuffle(self): 

566 """Shuffle filter present (T/F)""" 

567 return 'shuffle' in self._filters 

568 

569 @property 

570 @with_phil 

571 def fletcher32(self): 

572 """Fletcher32 filter is present (T/F)""" 

573 return 'fletcher32' in self._filters 

574 

575 @property 

576 @with_phil 

577 def scaleoffset(self): 

578 """Scale/offset filter settings. For integer data types, this is 

579 the number of bits stored, or 0 for auto-detected. For floating 

580 point data types, this is the number of decimal places retained. 

581 If the scale/offset filter is not in use, this is None.""" 

582 try: 

583 return self._filters['scaleoffset'][1] 

584 except KeyError: 

585 return None 

586 

587 @property 

588 @with_phil 

589 def external(self): 

590 """External file settings. Returns a list of tuples of 

591 (name, offset, size) for each external file entry, or returns None 

592 if no external files are used.""" 

593 count = self._dcpl.get_external_count() 

594 if count<=0: 

595 return None 

596 ext_list = list() 

597 for x in range(count): 

598 (name, offset, size) = self._dcpl.get_external(x) 

599 ext_list.append( (filename_decode(name), offset, size) ) 

600 return ext_list 

601 

602 @property 

603 @with_phil 

604 def maxshape(self): 

605 """Shape up to which this dataset can be resized. Axes with value 

606 None have no resize limit. """ 

607 space = self.id.get_space() 

608 dims = space.get_simple_extent_dims(True) 

609 if dims is None: 

610 return None 

611 

612 return tuple(x if x != h5s.UNLIMITED else None for x in dims) 

613 

614 @property 

615 @with_phil 

616 def fillvalue(self): 

617 """Fill value for this dataset (0 by default)""" 

618 arr = numpy.zeros((1,), dtype=self.dtype) 

619 self._dcpl.get_fill_value(arr) 

620 return arr[0] 

621 

622 @cached_property 

623 @with_phil 

624 def _extent_type(self): 

625 """Get extent type for this dataset - SIMPLE, SCALAR or NULL""" 

626 return self.id.get_space().get_simple_extent_type() 

627 

628 @cached_property 

629 def _is_empty(self): 

630 """Check if extent type is empty""" 

631 return self._extent_type == h5s.NULL 

632 

633 @with_phil 

634 def __init__(self, bind, *, readonly=False): 

635 """ Create a new Dataset object by binding to a low-level DatasetID. 

636 """ 

637 if not isinstance(bind, h5d.DatasetID): 

638 raise ValueError("%s is not a DatasetID" % bind) 

639 super().__init__(bind) 

640 

641 self._dcpl = self.id.get_create_plist() 

642 self._dxpl = h5p.create(h5p.DATASET_XFER) 

643 self._filters = filters.get_filters(self._dcpl) 

644 self._readonly = readonly 

645 self._cache_props = {} 

646 

647 def resize(self, size, axis=None): 

648 """ Resize the dataset, or the specified axis. 

649 

650 The dataset must be stored in chunked format; it can be resized up to 

651 the "maximum shape" (keyword maxshape) specified at creation time. 

652 The rank of the dataset cannot be changed. 

653 

654 "Size" should be a shape tuple, or if an axis is specified, an integer. 

655 

656 BEWARE: This functions differently than the NumPy resize() method! 

657 The data is not "reshuffled" to fit in the new shape; each axis is 

658 grown or shrunk independently. The coordinates of existing data are 

659 fixed. 

660 """ 

661 with phil: 

662 if self.chunks is None: 

663 raise TypeError("Only chunked datasets can be resized") 

664 

665 if axis is not None: 

666 if not (axis >=0 and axis < self.id.rank): 

667 raise ValueError("Invalid axis (0 to %s allowed)" % (self.id.rank-1)) 

668 try: 

669 newlen = int(size) 

670 except TypeError: 

671 raise TypeError("Argument must be a single int if axis is specified") 

672 size = list(self.shape) 

673 size[axis] = newlen 

674 

675 size = tuple(size) 

676 self.id.set_extent(size) 

677 #h5f.flush(self.id) # THG recommends 

678 
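Illustrative note (not part of the h5py source): a sketch of resizing a chunked dataset up to its maxshape, as described in the docstring above; the file name "growing.h5" is a placeholder.

import h5py

with h5py.File("growing.h5", "w") as f:
    ds = f.create_dataset("log", shape=(0,), maxshape=(None,), chunks=(1024,), dtype="i8")
    ds.resize((100,))          # grow the single axis to 100
    ds.resize(250, axis=0)     # equivalent form using the axis keyword
    ds.resize((50,))           # shrinking discards data beyond the new extent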

679 @with_phil 

680 def __len__(self): 

681 """ The size of the first axis. TypeError if scalar. 

682 

683 Limited to 2**32 on 32-bit systems; Dataset.len() is preferred. 

684 """ 

685 size = self.len() 

686 if size > sys.maxsize: 

687 raise OverflowError("Value too big for Python's __len__; use Dataset.len() instead.") 

688 return size 

689 

690 def len(self): 

691 """ The size of the first axis. TypeError if scalar. 

692 

693 Use of this method is preferred to len(dset), as Python's built-in 

694         len() cannot handle values greater than 2**32 on 32-bit systems.

695 """ 

696 with phil: 

697 shape = self.shape 

698 if len(shape) == 0: 

699 raise TypeError("Attempt to take len() of scalar dataset") 

700 return shape[0] 

701 

702 @with_phil 

703 def __iter__(self): 

704 """ Iterate over the first axis. TypeError if scalar. 

705 

706 BEWARE: Modifications to the yielded data are *NOT* written to file. 

707 """ 

708 shape = self.shape 

709 if len(shape) == 0: 

710 raise TypeError("Can't iterate over a scalar dataset") 

711 for i in range(shape[0]): 

712 yield self[i] 

713 

714 @with_phil 

715 def iter_chunks(self, sel=None): 

716 """ Return chunk iterator. If set, the sel argument is a slice or 

717 tuple of slices that defines the region to be used. If not set, the 

718 entire dataspace will be used for the iterator. 

719 

720 For each chunk within the given region, the iterator yields a tuple of 

721 slices that gives the intersection of the given chunk with the 

722 selection area. 

723 

724 A TypeError will be raised if the dataset is not chunked. 

725 

726 A ValueError will be raised if the selection region is invalid. 

727 

728 """ 

729 return ChunkIterator(self, sel) 

730 

731 @cached_property 

732 def _fast_read_ok(self): 

733 """Is this dataset suitable for simple reading""" 

734 return ( 

735 self._extent_type == h5s.SIMPLE 

736 and isinstance(self.id.get_type(), (h5t.TypeIntegerID, h5t.TypeFloatID)) 

737 ) 

738 

739 @with_phil 

740 def __getitem__(self, args, new_dtype=None): 

741 """ Read a slice from the HDF5 dataset. 

742 

743 Takes slices and recarray-style field names (more than one is 

744 allowed!) in any order. Obeys basic NumPy rules, including 

745 broadcasting. 

746 

747 Also supports: 

748 

749 * Boolean "mask" array indexing 

750 """ 

751 args = args if isinstance(args, tuple) else (args,) 

752 

753 if self._fast_read_ok and (new_dtype is None): 

754 try: 

755 return self._fast_reader.read(args) 

756 except TypeError: 

757 pass # Fall back to Python read pathway below 

758 

759 if self._is_empty: 

760 # Check 'is Ellipsis' to avoid equality comparison with an array: 

761 # array equality returns an array, not a boolean. 

762 if args == () or (len(args) == 1 and args[0] is Ellipsis): 

763 return Empty(self.dtype) 

764 raise ValueError("Empty datasets cannot be sliced") 

765 

766 # Sort field names from the rest of the args. 

767 names = tuple(x for x in args if isinstance(x, str)) 

768 

769 if names: 

770 # Read a subset of the fields in this structured dtype 

771 if len(names) == 1: 

772 names = names[0] # Read with simpler dtype of this field 

773 args = tuple(x for x in args if not isinstance(x, str)) 

774 return self.fields(names, _prior_dtype=new_dtype)[args] 

775 

776 if new_dtype is None: 

777 new_dtype = self.dtype 

778 mtype = h5t.py_create(new_dtype) 

779 

780 # === Special-case region references ==== 

781 

782 if len(args) == 1 and isinstance(args[0], h5r.RegionReference): 

783 

784 obj = h5r.dereference(args[0], self.id) 

785 if obj != self.id: 

786 raise ValueError("Region reference must point to this dataset") 

787 

788 sid = h5r.get_region(args[0], self.id) 

789 mshape = sel.guess_shape(sid) 

790 if mshape is None: 

791 # 0D with no data (NULL or deselected SCALAR) 

792 return Empty(new_dtype) 

793 out = numpy.zeros(mshape, dtype=new_dtype) 

794 if out.size == 0: 

795 return out 

796 

797 sid_out = h5s.create_simple(mshape) 

798 sid_out.select_all() 

799 self.id.read(sid_out, sid, out, mtype) 

800 return out 

801 

802 # === Check for zero-sized datasets ===== 

803 

804 if self.size == 0: 

805 # Check 'is Ellipsis' to avoid equality comparison with an array: 

806 # array equality returns an array, not a boolean. 

807 if args == () or (len(args) == 1 and args[0] is Ellipsis): 

808 return numpy.zeros(self.shape, dtype=new_dtype) 

809 

810 # === Scalar dataspaces ================= 

811 

812 if self.shape == (): 

813 fspace = self.id.get_space() 

814 selection = sel2.select_read(fspace, args) 

815 if selection.mshape is None: 

816 arr = numpy.zeros((), dtype=new_dtype) 

817 else: 

818 arr = numpy.zeros(selection.mshape, dtype=new_dtype) 

819 for mspace, fspace in selection: 

820 self.id.read(mspace, fspace, arr, mtype) 

821 if selection.mshape is None: 

822 return arr[()] 

823 return arr 

824 

825 # === Everything else =================== 

826 

827 # Perform the dataspace selection. 

828 selection = sel.select(self.shape, args, dataset=self) 

829 

830 if selection.nselect == 0: 

831 return numpy.zeros(selection.array_shape, dtype=new_dtype) 

832 

833 arr = numpy.zeros(selection.array_shape, new_dtype, order='C') 

834 

835 # Perform the actual read 

836 mspace = h5s.create_simple(selection.mshape) 

837 fspace = selection.id 

838 self.id.read(mspace, fspace, arr, mtype, dxpl=self._dxpl) 

839 

840 # Patch up the output for NumPy 

841 if arr.shape == (): 

842 return arr[()] # 0 dim array -> numpy scalar 

843 return arr 

844 
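Illustrative note (not part of the h5py source): a short sketch of read paths handled by __getitem__ above (plain slicing, boolean mask selection, whole-dataset reads, and on-the-fly dtype conversion); the file name "reads.h5" is a placeholder.

import numpy as np
import h5py

with h5py.File("reads.h5", "w") as f:
    ds = f.create_dataset("values", data=np.arange(10, dtype="i4"))
    doubled = ds.astype("f8")[::2]     # read with a destination dtype conversion
    masked = ds[ds[:] > 5]             # boolean "mask" array indexing
    everything = ds[()]                # empty tuple (or Ellipsis) reads the whole dataset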

845 @with_phil 

846 def __setitem__(self, args, val): 

847 """ Write to the HDF5 dataset from a Numpy array. 

848 

849 NumPy's broadcasting rules are honored, for "simple" indexing 

850 (slices and integers). For advanced indexing, the shapes must 

851 match. 

852 """ 

853 args = args if isinstance(args, tuple) else (args,) 

854 

855 # Sort field indices from the slicing 

856 names = tuple(x for x in args if isinstance(x, str)) 

857 args = tuple(x for x in args if not isinstance(x, str)) 

858 

859 # Generally we try to avoid converting the arrays on the Python 

860 # side. However, for compound literals this is unavoidable. 

861 vlen = h5t.check_vlen_dtype(self.dtype) 

862 if vlen is not None and vlen not in (bytes, str): 

863 try: 

864 val = numpy.asarray(val, dtype=vlen) 

865 except ValueError: 

866 try: 

867 val = numpy.array([numpy.array(x, dtype=vlen) 

868 for x in val], dtype=self.dtype) 

869 except ValueError: 

870 pass 

871 if vlen == val.dtype: 

872 if val.ndim > 1: 

873 tmp = numpy.empty(shape=val.shape[:-1], dtype=object) 

874 tmp.ravel()[:] = [i for i in val.reshape( 

875 (numpy.product(val.shape[:-1], dtype=numpy.ulonglong), val.shape[-1]))] 

876 else: 

877 tmp = numpy.array([None], dtype=object) 

878 tmp[0] = val 

879 val = tmp 

880 elif self.dtype.kind == "O" or \ 

881 (self.dtype.kind == 'V' and \ 

882 (not isinstance(val, numpy.ndarray) or val.dtype.kind != 'V') and \ 

883 (self.dtype.subdtype is None)): 

884 if len(names) == 1 and self.dtype.fields is not None: 

885 # Single field selected for write, from a non-array source 

886 if not names[0] in self.dtype.fields: 

887 raise ValueError("No such field for indexing: %s" % names[0]) 

888 dtype = self.dtype.fields[names[0]][0] 

889 cast_compound = True 

890 else: 

891 dtype = self.dtype 

892 cast_compound = False 

893 

894 val = numpy.asarray(val, dtype=dtype.base, order='C') 

895 if cast_compound: 

896 val = val.view(numpy.dtype([(names[0], dtype)])) 

897 val = val.reshape(val.shape[:len(val.shape) - len(dtype.shape)]) 

898 elif (self.dtype.kind == 'S' 

899 and (h5t.check_string_dtype(self.dtype).encoding == 'utf-8') 

900 and (find_item_type(val) is str) 

901 ): 

902 # Writing str objects to a fixed-length UTF-8 string dataset. 

903 # Numpy's normal conversion only handles ASCII characters, but 

904 # when the destination is UTF-8, we want to allow any unicode. 

905 # This *doesn't* handle numpy fixed-length unicode data ('U' dtype), 

906 # as HDF5 has no equivalent, and converting fixed length UTF-32 

907 # to variable length UTF-8 would obscure what's going on. 

908 str_array = numpy.asarray(val, order='C', dtype=object) 

909 val = numpy.array([ 

910 s.encode('utf-8') for s in str_array.flat 

911 ], dtype=self.dtype).reshape(str_array.shape) 

912 else: 

913 # If the input data is already an array, let HDF5 do the conversion. 

914 # If it's a list or similar, don't make numpy guess a dtype for it. 

915 dt = None if isinstance(val, numpy.ndarray) else self.dtype.base 

916 val = numpy.asarray(val, order='C', dtype=dt) 

917 

918 # Check for array dtype compatibility and convert 

919 if self.dtype.subdtype is not None: 

920 shp = self.dtype.subdtype[1] 

921 valshp = val.shape[-len(shp):] 

922 if valshp != shp: # Last dimension has to match 

923 raise TypeError("When writing to array types, last N dimensions have to match (got %s, but should be %s)" % (valshp, shp,)) 

924 mtype = h5t.py_create(numpy.dtype((val.dtype, shp))) 

925 mshape = val.shape[0:len(val.shape)-len(shp)] 

926 

927 # Make a compound memory type if field-name slicing is required 

928 elif len(names) != 0: 

929 

930 mshape = val.shape 

931 

932 # Catch common errors 

933 if self.dtype.fields is None: 

934 raise TypeError("Illegal slicing argument (not a compound dataset)") 

935 mismatch = [x for x in names if x not in self.dtype.fields] 

936 if len(mismatch) != 0: 

937 mismatch = ", ".join('"%s"'%x for x in mismatch) 

938 raise ValueError("Illegal slicing argument (fields %s not in dataset type)" % mismatch) 

939 

940 # Write non-compound source into a single dataset field 

941 if len(names) == 1 and val.dtype.fields is None: 

942 subtype = h5t.py_create(val.dtype) 

943 mtype = h5t.create(h5t.COMPOUND, subtype.get_size()) 

944 mtype.insert(self._e(names[0]), 0, subtype) 

945 

946 # Make a new source type keeping only the requested fields 

947 else: 

948 fieldnames = [x for x in val.dtype.names if x in names] # Keep source order 

949 mtype = h5t.create(h5t.COMPOUND, val.dtype.itemsize) 

950 for fieldname in fieldnames: 

951 subtype = h5t.py_create(val.dtype.fields[fieldname][0]) 

952 offset = val.dtype.fields[fieldname][1] 

953 mtype.insert(self._e(fieldname), offset, subtype) 

954 

955 # Use mtype derived from array (let DatasetID.write figure it out) 

956 else: 

957 mshape = val.shape 

958 mtype = None 

959 

960 # Perform the dataspace selection 

961 selection = sel.select(self.shape, args, dataset=self) 

962 

963 if selection.nselect == 0: 

964 return 

965 

966 # Broadcast scalars if necessary. 

967 # In order to avoid slow broadcasting filling the destination by 

968 # the scalar value, we create an intermediate array of the same 

969 # size as the destination buffer provided that size is reasonable. 

970         # We consider a size no larger than the dataset chunk size (if any)

971         # to be reasonable.

972 # In case of dealing with a non-chunked destination dataset or with 

973 # a selection whose size is larger than the dataset chunk size we fall 

974 # back to using an intermediate array of size equal to the last dimension 

975 # of the destination buffer. 

976         # The reasoning is that it makes sense to assume the creator of

977         # the dataset used an appropriate chunk size according to the available

978 # memory. In any case, if we cannot afford to create an intermediate 

979 # array of the same size as the dataset chunk size, the user program has 

980 # little hope to go much further. Solves h5py issue #1067 

981 if mshape == () and selection.array_shape != (): 

982 if self.dtype.subdtype is not None: 

983 raise TypeError("Scalar broadcasting is not supported for array dtypes") 

984 if self.chunks and (numpy.prod(self.chunks, dtype=numpy.float64) >= 

985 numpy.prod(selection.array_shape, dtype=numpy.float64)): 

986 val2 = numpy.empty(selection.array_shape, dtype=val.dtype) 

987 else: 

988 val2 = numpy.empty(selection.array_shape[-1], dtype=val.dtype) 

989 val2[...] = val 

990 val = val2 

991 mshape = val.shape 

992 

993 # Perform the write, with broadcasting 

994 mspace = h5s.create_simple(selection.expand_shape(mshape)) 

995 for fspace in selection.broadcast(mshape): 

996 self.id.write(mspace, fspace, val, mtype, dxpl=self._dxpl) 

997 
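Illustrative note (not part of the h5py source): a sketch of writes handled by __setitem__ above, including the scalar-broadcast path discussed in the comments (h5py issue #1067); the file name "writes.h5" is a placeholder.

import numpy as np
import h5py

with h5py.File("writes.h5", "w") as f:
    ds = f.create_dataset("grid", shape=(100, 100), chunks=(10, 10), dtype="f4")
    ds[0:10, 0:10] = 7.0          # scalar is expanded via an intermediate buffer
    ds[0, :] = np.arange(100)     # simple indexing follows NumPy broadcasting rules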

998 def read_direct(self, dest, source_sel=None, dest_sel=None): 

999 """ Read data directly from HDF5 into an existing NumPy array. 

1000 

1001 The destination array must be C-contiguous and writable. 

1002 Selections must be the output of numpy.s_[<args>]. 

1003 

1004 Broadcasting is supported for simple indexing. 

1005 """ 

1006 with phil: 

1007 if self._is_empty: 

1008 raise TypeError("Empty datasets have no numpy representation") 

1009 if source_sel is None: 

1010 source_sel = sel.SimpleSelection(self.shape) 

1011 else: 

1012 source_sel = sel.select(self.shape, source_sel, self) # for numpy.s_ 

1013 fspace = source_sel.id 

1014 

1015 if dest_sel is None: 

1016 dest_sel = sel.SimpleSelection(dest.shape) 

1017 else: 

1018 dest_sel = sel.select(dest.shape, dest_sel) 

1019 

1020 for mspace in dest_sel.broadcast(source_sel.array_shape): 

1021 self.id.read(mspace, fspace, dest, dxpl=self._dxpl) 

1022 

1023 def write_direct(self, source, source_sel=None, dest_sel=None): 

1024 """ Write data directly to HDF5 from a NumPy array. 

1025 

1026 The source array must be C-contiguous. Selections must be 

1027 the output of numpy.s_[<args>]. 

1028 

1029 Broadcasting is supported for simple indexing. 

1030 """ 

1031 with phil: 

1032 if self._is_empty: 

1033 raise TypeError("Empty datasets cannot be written to") 

1034 if source_sel is None: 

1035 source_sel = sel.SimpleSelection(source.shape) 

1036 else: 

1037 source_sel = sel.select(source.shape, source_sel) # for numpy.s_ 

1038 mspace = source_sel.id 

1039 

1040 if dest_sel is None: 

1041 dest_sel = sel.SimpleSelection(self.shape) 

1042 else: 

1043 dest_sel = sel.select(self.shape, dest_sel, self) 

1044 

1045 for fspace in dest_sel.broadcast(source_sel.array_shape): 

1046 self.id.write(mspace, fspace, source, dxpl=self._dxpl) 

1047 
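Illustrative note (not part of the h5py source): a sketch of read_direct/write_direct with numpy.s_ selections as described in the docstrings above; the file name "direct.h5" is a placeholder.

import numpy as np
import h5py

with h5py.File("direct.h5", "w") as f:
    ds = f.create_dataset("data", shape=(100, 50), dtype="f8")
    block = np.ones((10, 50))
    ds.write_direct(block, dest_sel=np.s_[0:10, :])    # write into a sub-region
    out = np.empty((10, 50))                           # must be C-contiguous and writable
    ds.read_direct(out, source_sel=np.s_[0:10, :])     # read back without an extra copy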

1048 @with_phil 

1049 def __array__(self, dtype=None): 

1050 """ Create a Numpy array containing the whole dataset. DON'T THINK 

1051 THIS MEANS DATASETS ARE INTERCHANGEABLE WITH ARRAYS. For one thing, 

1052 you have to read the whole dataset every time this method is called. 

1053 """ 

1054 arr = numpy.zeros(self.shape, dtype=self.dtype if dtype is None else dtype) 

1055 

1056 # Special case for (0,)*-shape datasets 

1057 if numpy.product(self.shape, dtype=numpy.ulonglong) == 0: 

1058 return arr 

1059 

1060 self.read_direct(arr) 

1061 return arr 

1062 

1063 @with_phil 

1064 def __repr__(self): 

1065 if not self: 

1066 r = '<Closed HDF5 dataset>' 

1067 else: 

1068 if self.name is None: 

1069 namestr = '("anonymous")' 

1070 else: 

1071 name = pp.basename(pp.normpath(self.name)) 

1072 namestr = '"%s"' % (name if name != '' else '/') 

1073 r = '<HDF5 dataset %s: shape %s, type "%s">' % ( 

1074 namestr, self.shape, self.dtype.str 

1075 ) 

1076 return r 

1077 

1078 if hasattr(h5d.DatasetID, "refresh"): 

1079 @with_phil 

1080 def refresh(self): 

1081 """ Refresh the dataset metadata by reloading from the file. 

1082 

1083             This is part of the SWMR features and only exists when the HDF5

1084 library version >=1.9.178 

1085 """ 

1086 self._id.refresh() 

1087 self._cache_props.clear() 

1088 

1089 if hasattr(h5d.DatasetID, "flush"): 

1090 @with_phil 

1091 def flush(self): 

1092 """ Flush the dataset data and metadata to the file. 

1093 If the dataset is chunked, raw data chunks are written to the file. 

1094 

1095             This is part of the SWMR features and only exists when the HDF5

1096 library version >=1.9.178 

1097 """ 

1098 self._id.flush() 

1099 
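Illustrative note (not part of the h5py source): a sketch of the SWMR flush/refresh cycle these methods support, assuming an HDF5 library new enough for SWMR; the file name "swmr.h5" is a placeholder, and the writer and reader would normally run in separate processes.

import h5py

# writer
with h5py.File("swmr.h5", "w", libver="latest") as f:
    ds = f.create_dataset("samples", shape=(0,), maxshape=(None,), chunks=(64,), dtype="f8")
    f.swmr_mode = True
    ds.resize((64,))
    ds[...] = 1.0
    ds.flush()        # make the new chunks visible to readers

# reader (concurrently)
with h5py.File("swmr.h5", "r", libver="latest", swmr=True) as f:
    ds = f["samples"]
    ds.refresh()      # reload metadata; also clears the cached shape/size
    latest = ds[-1]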

1100 if vds_support: 

1101 @property 

1102 @with_phil 

1103 def is_virtual(self): 

1104 """Check if this is a virtual dataset""" 

1105 return self._dcpl.get_layout() == h5d.VIRTUAL 

1106 

1107 @with_phil 

1108 def virtual_sources(self): 

1109 """Get a list of the data mappings for a virtual dataset""" 

1110 if not self.is_virtual: 

1111 raise RuntimeError("Not a virtual dataset") 

1112 dcpl = self._dcpl 

1113 return [ 

1114 VDSmap(dcpl.get_virtual_vspace(j), 

1115 dcpl.get_virtual_filename(j), 

1116 dcpl.get_virtual_dsetname(j), 

1117 dcpl.get_virtual_srcspace(j)) 

1118 for j in range(dcpl.get_virtual_count())] 

1119 
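Illustrative note (not part of the h5py source): a sketch of building a virtual dataset and inspecting it with is_virtual and virtual_sources(); the file names "raw_0.h5".."raw_3.h5" and "vds.h5" and the source dataset name "data" are placeholders.

import h5py

layout = h5py.VirtualLayout(shape=(4, 100), dtype="f8")
for i in range(4):
    layout[i] = h5py.VirtualSource("raw_%d.h5" % i, "data", shape=(100,))

with h5py.File("vds.h5", "w") as f:
    vds = f.create_virtual_dataset("combined", layout, fillvalue=0.0)
    print(vds.is_virtual)                  # True
    for mapping in vds.virtual_sources():  # VDSmap entries
        print(mapping.file_name, mapping.dset_name)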

1120 @with_phil 

1121 def make_scale(self, name=''): 

1122 """Make this dataset an HDF5 dimension scale. 

1123 

1124 You can then attach it to dimensions of other datasets like this:: 

1125 

1126 other_ds.dims[0].attach_scale(ds) 

1127 

1128 You can optionally pass a name to associate with this scale. 

1129 """ 

1130 h5ds.set_scale(self._id, self._e(name)) 

1131 

1132 @property 

1133 @with_phil 

1134 def is_scale(self): 

1135 """Return ``True`` if this dataset is also a dimension scale. 

1136 

1137 Return ``False`` otherwise. 

1138 """ 

1139 return h5ds.is_scale(self._id)
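Illustrative note (not part of the h5py source): a sketch of using make_scale and is_scale together with the dims interface; the file name "scales.h5" and dataset names are placeholders.

import numpy as np
import h5py

with h5py.File("scales.h5", "w") as f:
    temps = f.create_dataset("temperature", shape=(24, 10), dtype="f4")
    times = f.create_dataset("time", data=np.arange(24.0))
    times.make_scale("hours since start")    # turn this dataset into a dimension scale
    temps.dims[0].attach_scale(times)        # attach it to axis 0 of another dataset
    assert times.is_scale
    temps.dims[0].label = "time"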