Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/h5py/_hl/dataset.py: 21%

633 statements  

coverage.py v7.4.0, created at 2024-01-03 07:57 +0000

1# This file is part of h5py, a Python interface to the HDF5 library. 

2# 

3# http://www.h5py.org 

4# 

5# Copyright 2008-2020 Andrew Collette and contributors 

6# 

7# License: Standard 3-clause BSD; see "license.txt" for full license terms 

8# and contributor agreement. 

9 

10""" 

11 Implements support for high-level dataset access. 

12""" 

13 

14import posixpath as pp 

15import sys 

16 

17import numpy 

18 

19from .. import h5, h5s, h5t, h5r, h5d, h5p, h5fd, h5ds, _selector 

20from .base import ( 

21 array_for_new_object, cached_property, Empty, find_item_type, HLObject, 

22 phil, product, with_phil, 

23) 

24from . import filters 

25from . import selections as sel 

26from . import selections2 as sel2 

27from .datatype import Datatype 

28from .compat import filename_decode 

29from .vds import VDSmap, vds_support 

30 

31_LEGACY_GZIP_COMPRESSION_VALS = frozenset(range(10)) 

32MPI = h5.get_config().mpi 

33 

34 

35def make_new_dset(parent, shape=None, dtype=None, data=None, name=None, 

36 chunks=None, compression=None, shuffle=None, 

37 fletcher32=None, maxshape=None, compression_opts=None, 

38 fillvalue=None, scaleoffset=None, track_times=False, 

39 external=None, track_order=None, dcpl=None, dapl=None, 

40 efile_prefix=None, virtual_prefix=None, allow_unknown_filter=False, 

41 rdcc_nslots=None, rdcc_nbytes=None, rdcc_w0=None): 

42 """ Return a new low-level dataset identifier """ 

43 

44 # Convert data to a C-contiguous ndarray 

45 if data is not None and not isinstance(data, Empty): 

46 data = array_for_new_object(data, specified_dtype=dtype) 

47 

48 # Validate shape 

49 if shape is None: 

50 if data is None: 

51 if dtype is None: 

52 raise TypeError("One of data, shape or dtype must be specified") 

53 data = Empty(dtype) 

54 shape = data.shape 

55 else: 

56 shape = (shape,) if isinstance(shape, int) else tuple(shape) 

57 if data is not None and (product(shape) != product(data.shape)): 

58 raise ValueError("Shape tuple is incompatible with data") 

59 

60 if isinstance(maxshape, int): 

61 maxshape = (maxshape,) 

62 tmp_shape = maxshape if maxshape is not None else shape 

63 

64 # Validate chunk shape 

65 if isinstance(chunks, int) and not isinstance(chunks, bool): 

66 chunks = (chunks,) 

67 if isinstance(chunks, tuple) and any( 

68 chunk > dim for dim, chunk in zip(tmp_shape, chunks) if dim is not None 

69 ): 

70 errmsg = "Chunk shape must not be greater than data shape in any dimension. "\ 

71 "{} is not compatible with {}".format(chunks, shape) 

72 raise ValueError(errmsg) 

73 

74 if isinstance(dtype, Datatype): 

75 # Named types are used as-is 

76 tid = dtype.id 

77 dtype = tid.dtype # Following code needs this 

78 else: 

79 # Validate dtype 

80 if dtype is None and data is None: 

81 dtype = numpy.dtype("=f4") 

82 elif dtype is None and data is not None: 

83 dtype = data.dtype 

84 else: 

85 dtype = numpy.dtype(dtype) 

86 tid = h5t.py_create(dtype, logical=1) 

87 

88 # Legacy 

89 if any((compression, shuffle, fletcher32, maxshape, scaleoffset)) and chunks is False: 

90 raise ValueError("Chunked format required for given storage options") 

91 

92 # Legacy 

93 if compression is True: 

94 if compression_opts is None: 

95 compression_opts = 4 

96 compression = 'gzip' 

97 

98 # Legacy 

99 if compression in _LEGACY_GZIP_COMPRESSION_VALS: 

100 if compression_opts is not None: 

101 raise TypeError("Conflict in compression options") 

102 compression_opts = compression 

103 compression = 'gzip' 

104 dcpl = filters.fill_dcpl( 

105 dcpl or h5p.create(h5p.DATASET_CREATE), shape, dtype, 

106 chunks, compression, compression_opts, shuffle, fletcher32, 

107 maxshape, scaleoffset, external, allow_unknown_filter) 

108 

109 if fillvalue is not None: 

110 # prepare string-type dtypes for fillvalue 

111 string_info = h5t.check_string_dtype(dtype) 

112 if string_info is not None: 

113 # fake vlen dtype for fixed len string fillvalue 

114 # to not trigger unwanted encoding 

115 dtype = h5t.string_dtype(string_info.encoding) 

116 fillvalue = numpy.array(fillvalue, dtype=dtype) 

117 else: 

118 fillvalue = numpy.array(fillvalue) 

119 dcpl.set_fill_value(fillvalue) 

120 

121 if track_times is None: 

122 # In case someone explicitly passes None for the default 

123 track_times = False 

124 if track_times in (True, False): 

125 dcpl.set_obj_track_times(track_times) 

126 else: 

127 raise TypeError("track_times must be either True or False") 

128 if track_order is True: 

129 dcpl.set_attr_creation_order( 

130 h5p.CRT_ORDER_TRACKED | h5p.CRT_ORDER_INDEXED) 

131 elif track_order is False: 

132 dcpl.set_attr_creation_order(0) 

133 elif track_order is not None: 

134 raise TypeError("track_order must be either True or False") 

135 

136 if maxshape is not None: 

137 maxshape = tuple(m if m is not None else h5s.UNLIMITED for m in maxshape) 

138 

139 if any([efile_prefix, virtual_prefix, rdcc_nbytes, rdcc_nslots, rdcc_w0]): 

140 dapl = dapl or h5p.create(h5p.DATASET_ACCESS) 

141 

142 if efile_prefix is not None: 

143 dapl.set_efile_prefix(efile_prefix) 

144 

145 if virtual_prefix is not None: 

146 dapl.set_virtual_prefix(virtual_prefix) 

147 

148 if rdcc_nbytes or rdcc_nslots or rdcc_w0: 

149 cache_settings = list(dapl.get_chunk_cache()) 

150 if rdcc_nslots is not None: 

151 cache_settings[0] = rdcc_nslots 

152 if rdcc_nbytes is not None: 

153 cache_settings[1] = rdcc_nbytes 

154 if rdcc_w0 is not None: 

155 cache_settings[2] = rdcc_w0 

156 dapl.set_chunk_cache(*cache_settings) 

157 

158 if isinstance(data, Empty): 

159 sid = h5s.create(h5s.NULL) 

160 else: 

161 sid = h5s.create_simple(shape, maxshape) 

162 

163 dset_id = h5d.create(parent.id, name, tid, sid, dcpl=dcpl, dapl=dapl) 

164 

165 if (data is not None) and (not isinstance(data, Empty)): 

166 dset_id.write(h5s.ALL, h5s.ALL, data) 

167 

168 return dset_id 

169 
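make_new_dset() is not usually called directly; it is the low-level worker behind Group.create_dataset(). A minimal usage sketch through that public entry point (the file and dataset names here are illustrative, not taken from this module):

    import h5py
    import numpy

    with h5py.File("example.h5", "w") as f:              # hypothetical file name
        dset = f.create_dataset(
            "measurements", shape=(1000, 3), dtype="f4",
            chunks=(100, 3),                              # chunked layout enables compression and resizing
            compression="gzip", compression_opts=4,       # a bare int 0-9 is also treated as gzip (legacy form)
            maxshape=(None, 3), fillvalue=0.0,
        )
        dset[:10] = numpy.arange(30, dtype="f4").reshape(10, 3)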

170 

171def open_dset(parent, name, dapl=None, efile_prefix=None, virtual_prefix=None, 

172 rdcc_nslots=None, rdcc_nbytes=None, rdcc_w0=None, **kwds): 

173 """ Return an existing low-level dataset identifier """ 

174 

175 if any([efile_prefix, virtual_prefix, rdcc_nbytes, rdcc_nslots, rdcc_w0]): 

176 dapl = dapl or h5p.create(h5p.DATASET_ACCESS) 

177 

178 if efile_prefix is not None: 

179 dapl.set_efile_prefix(efile_prefix) 

180 

181 if virtual_prefix is not None: 

182 dapl.set_virtual_prefix(virtual_prefix) 

183 

184 if rdcc_nbytes or rdcc_nslots or rdcc_w0: 

185 cache_settings = list(dapl.get_chunk_cache()) 

186 if rdcc_nslots is not None: 

187 cache_settings[0] = rdcc_nslots 

188 if rdcc_nbytes is not None: 

189 cache_settings[1] = rdcc_nbytes 

190 if rdcc_w0 is not None: 

191 cache_settings[2] = rdcc_w0 

192 dapl.set_chunk_cache(*cache_settings) 

193 

194 dset_id = h5d.open(parent.id, name, dapl=dapl) 

195 

196 return dset_id 

197 

198 

199class AstypeWrapper: 

200 """Wrapper to convert data on reading from a dataset. 

201 """ 

202 def __init__(self, dset, dtype): 

203 self._dset = dset 

204 self._dtype = numpy.dtype(dtype) 

205 

206 def __getitem__(self, args): 

207 return self._dset.__getitem__(args, new_dtype=self._dtype) 

208 

209 def __len__(self): 

210 """ Get the length of the underlying dataset 

211 

212 >>> length = len(dataset.astype('f8')) 

213 """ 

214 return len(self._dset) 

215 

216 def __array__(self, dtype=None): 

217 data = self[:] 

218 if dtype is not None: 

219 data = data.astype(dtype) 

220 return data 

221 

222 

223class AsStrWrapper: 

224 """Wrapper to decode strings on reading the dataset""" 

225 def __init__(self, dset, encoding, errors='strict'): 

226 self._dset = dset 

227 if encoding is None: 

228 encoding = h5t.check_string_dtype(dset.dtype).encoding 

229 self.encoding = encoding 

230 self.errors = errors 

231 

232 def __getitem__(self, args): 

233 bytes_arr = self._dset[args] 

234 # numpy.char.decode() seems like the obvious thing to use. But it only 

235 # accepts numpy string arrays, not object arrays of bytes (which we 

236 # return from HDF5 variable-length strings). And the numpy 

237 # implementation is not faster than doing it with a loop; in fact, by 

238 # not converting the result to a numpy unicode array, the 

239 # naive way can be faster! (Comparing with numpy 1.18.4, June 2020) 

240 if numpy.isscalar(bytes_arr): 

241 return bytes_arr.decode(self.encoding, self.errors) 

242 

243 return numpy.array([ 

244 b.decode(self.encoding, self.errors) for b in bytes_arr.flat 

245 ], dtype=object).reshape(bytes_arr.shape) 

246 

247 def __len__(self): 

248 """ Get the length of the underlying dataset 

249 

250 >>> length = len(dataset.asstr()) 

251 """ 

252 return len(self._dset) 

253 

254 def __array__(self): 

255 return numpy.array([ 

256 b.decode(self.encoding, self.errors) for b in self._dset 

257 ], dtype=object).reshape(self._dset.shape) 

258 

259 

260class FieldsWrapper: 

261 """Wrapper to extract named fields from a dataset with a struct dtype""" 

262 extract_field = None 

263 

264 def __init__(self, dset, prior_dtype, names): 

265 self._dset = dset 

266 if isinstance(names, str): 

267 self.extract_field = names 

268 names = [names] 

269 self.read_dtype = readtime_dtype(prior_dtype, names) 

270 

271 def __array__(self, dtype=None): 

272 data = self[:] 

273 if dtype is not None: 

274 data = data.astype(dtype) 

275 return data 

276 

277 def __getitem__(self, args): 

278 data = self._dset.__getitem__(args, new_dtype=self.read_dtype) 

279 if self.extract_field is not None: 

280 data = data[self.extract_field] 

281 return data 

282 

283 def __len__(self): 

284 """ Get the length of the underlying dataset 

285 

286 >>> length = len(dataset.fields(['x', 'y'])) 

287 """ 

288 return len(self._dset) 

289 

290 

291def readtime_dtype(basetype, names): 

292 """Make a NumPy compound dtype with a subset of available fields""" 

293 if basetype.names is None: # Names provided, but not compound 

294 raise ValueError("Field names only allowed for compound types") 

295 

296 for name in names: # Check all names are legal 

297 if name not in basetype.names: 

298 raise ValueError("Field %s does not appear in this type." % name) 

299 

300 return numpy.dtype([(name, basetype.fields[name][0]) for name in names]) 

301 
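For reference, a small sketch of what readtime_dtype() produces for a compound base dtype (field names are illustrative):

    import numpy

    base = numpy.dtype([("x", "f8"), ("y", "f8"), ("z", "i4")])
    readtime_dtype(base, ["x", "z"])            # dtype([('x', '<f8'), ('z', '<i4')])
    readtime_dtype(numpy.dtype("f8"), ["x"])    # ValueError: field names only allowed for compound types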

302 

303if MPI: 

304 class CollectiveContext: 

305 

306 """ Manages collective I/O in MPI mode """ 

307 

308 # We don't bother with _local as threads are forbidden in MPI mode 

309 

310 def __init__(self, dset): 

311 self._dset = dset 

312 

313 def __enter__(self): 

314 # pylint: disable=protected-access 

315 self._dset._dxpl.set_dxpl_mpio(h5fd.MPIO_COLLECTIVE) 

316 

317 def __exit__(self, *args): 

318 # pylint: disable=protected-access 

319 self._dset._dxpl.set_dxpl_mpio(h5fd.MPIO_INDEPENDENT) 

320 

321 
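CollectiveContext is exposed as Dataset.collective further down (only when h5py is built with MPI support). A hedged sketch of collective I/O, assuming a parallel HDF5 build and mpi4py; the file name is made up:

    from mpi4py import MPI
    import h5py

    comm = MPI.COMM_WORLD
    with h5py.File("parallel.h5", "w", driver="mpio", comm=comm) as f:
        dset = f.create_dataset("ranks", (comm.size,), dtype="i8")
        with dset.collective:            # switches the transfer property list to MPIO_COLLECTIVE
            dset[comm.rank] = comm.rank  # every rank must take part in the collective write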

322class ChunkIterator: 

323 """ 

324 Class to iterate through list of chunks of a given dataset 

325 """ 

326 def __init__(self, dset, source_sel=None): 

327 self._shape = dset.shape 

328 rank = len(dset.shape) 

329 

330 if not dset.chunks: 

331 # can only use with chunked datasets 

332 raise TypeError("Chunked dataset required") 

333 

334 self._layout = dset.chunks 

335 if source_sel is None: 

336 # select over entire dataset 

337 slices = [] 

338 for dim in range(rank): 

339 slices.append(slice(0, self._shape[dim])) 

340 self._sel = tuple(slices) 

341 else: 

342 if isinstance(source_sel, slice): 

343 self._sel = (source_sel,) 

344 else: 

345 self._sel = source_sel 

346 if len(self._sel) != rank: 

347 raise ValueError("Invalid selection - selection region must have same rank as dataset") 

348 self._chunk_index = [] 

349 for dim in range(rank): 

350 s = self._sel[dim] 

351 if s.start < 0 or s.stop > self._shape[dim] or s.stop <= s.start: 

352 raise ValueError("Invalid selection - selection region must be within dataset space") 

353 index = s.start // self._layout[dim] 

354 self._chunk_index.append(index) 

355 

356 def __iter__(self): 

357 return self 

358 

359 def __next__(self): 

360 rank = len(self._shape) 

361 slices = [] 

362 if rank == 0 or self._chunk_index[0] * self._layout[0] >= self._sel[0].stop: 

363 # ran past the last chunk, end iteration 

364 raise StopIteration() 

365 

366 for dim in range(rank): 

367 s = self._sel[dim] 

368 start = self._chunk_index[dim] * self._layout[dim] 

369 stop = (self._chunk_index[dim] + 1) * self._layout[dim] 

370 # adjust the start if this is an edge chunk 

371 if start < s.start: 

372 start = s.start 

373 if stop > s.stop: 

374 stop = s.stop # trim to end of the selection 

375 s = slice(start, stop, 1) 

376 slices.append(s) 

377 

378 # bump up the last index and carry forward if we run outside the selection 

379 dim = rank - 1 

380 while dim >= 0: 

381 s = self._sel[dim] 

382 self._chunk_index[dim] += 1 

383 

384 chunk_end = self._chunk_index[dim] * self._layout[dim] 

385 if chunk_end < s.stop: 

386 # we still have room to extend along this dimension 

387 return tuple(slices) 

388 

389 if dim > 0: 

390 # reset to the start and continue iterating with higher dimension 

391 self._chunk_index[dim] = 0 

392 dim -= 1 

393 return tuple(slices) 

394 

395 

396class Dataset(HLObject): 

397 

398 """ 

399 Represents an HDF5 dataset 

400 """ 

401 

402 def astype(self, dtype): 

403 """ Get a wrapper allowing you to perform reads to a 

404 different destination type, e.g.: 

405 

406 >>> double_precision = dataset.astype('f8')[0:100:2] 

407 """ 

408 return AstypeWrapper(self, dtype) 

409 

410 def asstr(self, encoding=None, errors='strict'): 

411 """Get a wrapper to read string data as Python strings: 

412 

413 >>> str_array = dataset.asstr()[:] 

414 

415 The parameters have the same meaning as in ``bytes.decode()``. 

416 If ``encoding`` is unspecified, it will use the encoding in the HDF5 

417 datatype (either ascii or utf-8). 

418 """ 

419 string_info = h5t.check_string_dtype(self.dtype) 

420 if string_info is None: 

421 raise TypeError( 

422 "dset.asstr() can only be used on datasets with " 

423 "an HDF5 string datatype" 

424 ) 

425 if encoding is None: 

426 encoding = string_info.encoding 

427 return AsStrWrapper(self, encoding, errors=errors) 

428 

429 def fields(self, names, *, _prior_dtype=None): 

430 """Get a wrapper to read a subset of fields from a compound data type: 

431 

432 >>> coords_2d = dataset.fields(['x', 'y'])[:] 

433 

434 If names is a string, a single field is extracted, and the resulting 

435 arrays will have that dtype. Otherwise, it should be an iterable, 

436 and the read data will have a compound dtype. 

437 """ 

438 if _prior_dtype is None: 

439 _prior_dtype = self.dtype 

440 return FieldsWrapper(self, _prior_dtype, names) 

441 

442 if MPI: 

443 @property 

444 @with_phil 

445 def collective(self): 

446 """ Context manager for MPI collective reads & writes """ 

447 return CollectiveContext(self) 

448 

449 @property 

450 def dims(self): 

451 """ Access dimension scales attached to this dataset. """ 

452 from .dims import DimensionManager 

453 with phil: 

454 return DimensionManager(self) 

455 

456 @property 

457 @with_phil 

458 def ndim(self): 

459 """Numpy-style attribute giving the number of dimensions""" 

460 return self.id.rank 

461 

462 @property 

463 def shape(self): 

464 """Numpy-style shape tuple giving dataset dimensions""" 

465 if 'shape' in self._cache_props: 

466 return self._cache_props['shape'] 

467 

468 with phil: 

469 shape = self.id.shape 

470 

471 # If the file is read-only, cache the shape to speed-up future uses. 

472 # This cache is invalidated by .refresh() when using SWMR. 

473 if self._readonly: 

474 self._cache_props['shape'] = shape 

475 return shape 

476 

477 @shape.setter 

478 @with_phil 

479 def shape(self, shape): 

480 # pylint: disable=missing-docstring 

481 self.resize(shape) 

482 

483 @property 

484 def size(self): 

485 """Numpy-style attribute giving the total dataset size""" 

486 if 'size' in self._cache_props: 

487 return self._cache_props['size'] 

488 

489 if self._is_empty: 

490 size = None 

491 else: 

492 size = product(self.shape) 

493 

494 # If the file is read-only, cache the size to speed-up future uses. 

495 # This cache is invalidated by .refresh() when using SWMR. 

496 if self._readonly: 

497 self._cache_props['size'] = size 

498 return size 

499 

500 @property 

501 def nbytes(self): 

502 """Numpy-style attribute giving the raw dataset size as the number of bytes""" 

503 size = self.size 

504 if size is None: # if we are an empty 0-D array, then there are no bytes in the dataset 

505 return 0 

506 return self.dtype.itemsize * size 

507 

508 @property 

509 def _selector(self): 

510 """Internal object for optimised selection of data""" 

511 if '_selector' in self._cache_props: 

512 return self._cache_props['_selector'] 

513 

514 slr = _selector.Selector(self.id.get_space()) 

515 

516 # If the file is read-only, cache the reader to speed up future uses. 

517 # This cache is invalidated by .refresh() when using SWMR. 

518 if self._readonly: 

519 self._cache_props['_selector'] = slr 

520 return slr 

521 

522 @property 

523 def _fast_reader(self): 

524 """Internal object for optimised reading of data""" 

525 if '_fast_reader' in self._cache_props: 

526 return self._cache_props['_fast_reader'] 

527 

528 rdr = _selector.Reader(self.id) 

529 

530 # If the file is read-only, cache the reader to speed up future uses. 

531 # This cache is invalidated by .refresh() when using SWMR. 

532 if self._readonly: 

533 self._cache_props['_fast_reader'] = rdr 

534 return rdr 

535 

536 @property 

537 @with_phil 

538 def dtype(self): 

539 """Numpy dtype representing the datatype""" 

540 return self.id.dtype 

541 

542 @property 

543 @with_phil 

544 def chunks(self): 

545 """Dataset chunks (or None)""" 

546 dcpl = self._dcpl 

547 if dcpl.get_layout() == h5d.CHUNKED: 

548 return dcpl.get_chunk() 

549 return None 

550 

551 @property 

552 @with_phil 

553 def compression(self): 

554 """Compression strategy (or None)""" 

555 for x in ('gzip','lzf','szip'): 

556 if x in self._filters: 

557 return x 

558 return None 

559 

560 @property 

561 @with_phil 

562 def compression_opts(self): 

563 """ Compression setting. Int(0-9) for gzip, 2-tuple for szip. """ 

564 return self._filters.get(self.compression, None) 

565 

566 @property 

567 @with_phil 

568 def shuffle(self): 

569 """Shuffle filter present (T/F)""" 

570 return 'shuffle' in self._filters 

571 

572 @property 

573 @with_phil 

574 def fletcher32(self): 

575 """Fletcher32 filter is present (T/F)""" 

576 return 'fletcher32' in self._filters 

577 

578 @property 

579 @with_phil 

580 def scaleoffset(self): 

581 """Scale/offset filter settings. For integer data types, this is 

582 the number of bits stored, or 0 for auto-detected. For floating 

583 point data types, this is the number of decimal places retained. 

584 If the scale/offset filter is not in use, this is None.""" 

585 try: 

586 return self._filters['scaleoffset'][1] 

587 except KeyError: 

588 return None 

589 

590 @property 

591 @with_phil 

592 def external(self): 

593 """External file settings. Returns a list of tuples of 

594 (name, offset, size) for each external file entry, or returns None 

595 if no external files are used.""" 

596 count = self._dcpl.get_external_count() 

597 if count<=0: 

598 return None 

599 ext_list = list() 

600 for x in range(count): 

601 (name, offset, size) = self._dcpl.get_external(x) 

602 ext_list.append( (filename_decode(name), offset, size) ) 

603 return ext_list 

604 

605 @property 

606 @with_phil 

607 def maxshape(self): 

608 """Shape up to which this dataset can be resized. Axes with value 

609 None have no resize limit. """ 

610 space = self.id.get_space() 

611 dims = space.get_simple_extent_dims(True) 

612 if dims is None: 

613 return None 

614 

615 return tuple(x if x != h5s.UNLIMITED else None for x in dims) 

616 

617 @property 

618 @with_phil 

619 def fillvalue(self): 

620 """Fill value for this dataset (0 by default)""" 

621 arr = numpy.zeros((1,), dtype=self.dtype) 

622 self._dcpl.get_fill_value(arr) 

623 return arr[0] 

624 

625 @cached_property 

626 @with_phil 

627 def _extent_type(self): 

628 """Get extent type for this dataset - SIMPLE, SCALAR or NULL""" 

629 return self.id.get_space().get_simple_extent_type() 

630 

631 @cached_property 

632 def _is_empty(self): 

633 """Check if extent type is empty""" 

634 return self._extent_type == h5s.NULL 

635 

636 @with_phil 

637 def __init__(self, bind, *, readonly=False): 

638 """ Create a new Dataset object by binding to a low-level DatasetID. 

639 """ 

640 if not isinstance(bind, h5d.DatasetID): 

641 raise ValueError("%s is not a DatasetID" % bind) 

642 super().__init__(bind) 

643 

644 self._dcpl = self.id.get_create_plist() 

645 self._dxpl = h5p.create(h5p.DATASET_XFER) 

646 self._filters = filters.get_filters(self._dcpl) 

647 self._readonly = readonly 

648 self._cache_props = {} 

649 

650 def resize(self, size, axis=None): 

651 """ Resize the dataset, or the specified axis. 

652 

653 The dataset must be stored in chunked format; it can be resized up to 

654 the "maximum shape" (keyword maxshape) specified at creation time. 

655 The rank of the dataset cannot be changed. 

656 

657 "Size" should be a shape tuple, or if an axis is specified, an integer. 

658 

659 BEWARE: This functions differently than the NumPy resize() method! 

660 The data is not "reshuffled" to fit in the new shape; each axis is 

661 grown or shrunk independently. The coordinates of existing data are 

662 fixed. 

663 """ 

664 with phil: 

665 if self.chunks is None: 

666 raise TypeError("Only chunked datasets can be resized") 

667 

668 if axis is not None: 

669 if not (axis >=0 and axis < self.id.rank): 

670 raise ValueError("Invalid axis (0 to %s allowed)" % (self.id.rank-1)) 

671 try: 

672 newlen = int(size) 

673 except TypeError: 

674 raise TypeError("Argument must be a single int if axis is specified") 

675 size = list(self.shape) 

676 size[axis] = newlen 

677 

678 size = tuple(size) 

679 self.id.set_extent(size) 

680 #h5f.flush(self.id) # THG recommends 

681 
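A sketch of resizing (the dataset name is illustrative); this only works when maxshape was given at creation time:

    dset = f.create_dataset("growable", shape=(0, 3), maxshape=(None, 3), chunks=(256, 3))
    dset.resize(100, axis=0)     # grow only the first axis
    dset.resize((50, 3))         # or pass a full shape tuple; data outside the new extent is lost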

682 @with_phil 

683 def __len__(self): 

684 """ The size of the first axis. TypeError if scalar. 

685 

686 Limited to 2**32 on 32-bit systems; Dataset.len() is preferred. 

687 """ 

688 size = self.len() 

689 if size > sys.maxsize: 

690 raise OverflowError("Value too big for Python's __len__; use Dataset.len() instead.") 

691 return size 

692 

693 def len(self): 

694 """ The size of the first axis. TypeError if scalar. 

695 

696 Use of this method is preferred to len(dset), as Python's built-in 

697 len() cannot handle values greater than 2**32 on 32-bit systems. 

698 """ 

699 with phil: 

700 shape = self.shape 

701 if len(shape) == 0: 

702 raise TypeError("Attempt to take len() of scalar dataset") 

703 return shape[0] 

704 

705 @with_phil 

706 def __iter__(self): 

707 """ Iterate over the first axis. TypeError if scalar. 

708 

709 BEWARE: Modifications to the yielded data are *NOT* written to file. 

710 """ 

711 shape = self.shape 

712 if len(shape) == 0: 

713 raise TypeError("Can't iterate over a scalar dataset") 

714 for i in range(shape[0]): 

715 yield self[i] 

716 

717 @with_phil 

718 def iter_chunks(self, sel=None): 

719 """ Return chunk iterator. If set, the sel argument is a slice or 

720 tuple of slices that defines the region to be used. If not set, the 

721 entire dataspace will be used for the iterator. 

722 

723 For each chunk within the given region, the iterator yields a tuple of 

724 slices that gives the intersection of the given chunk with the 

725 selection area. 

726 

727 A TypeError will be raised if the dataset is not chunked. 

728 

729 A ValueError will be raised if the selection region is invalid. 

730 

731 """ 

732 return ChunkIterator(self, sel) 

733 
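A small sketch of iter_chunks() (names are illustrative, and `import h5py` is assumed); each yielded tuple of slices can be fed straight back into the dataset:

    with h5py.File("data.h5", "r") as f:
        dset = f["chunked_data"]               # must be a chunked dataset, else TypeError
        total = 0.0
        for chunk_sel in dset.iter_chunks():   # e.g. (slice(0, 100, 1), slice(0, 3, 1))
            total += dset[chunk_sel].sum()     # read one chunk-aligned block at a time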

734 @cached_property 

735 def _fast_read_ok(self): 

736 """Is this dataset suitable for simple reading""" 

737 return ( 

738 self._extent_type == h5s.SIMPLE 

739 and isinstance(self.id.get_type(), (h5t.TypeIntegerID, h5t.TypeFloatID)) 

740 ) 

741 

742 @with_phil 

743 def __getitem__(self, args, new_dtype=None): 

744 """ Read a slice from the HDF5 dataset. 

745 

746 Takes slices and recarray-style field names (more than one is 

747 allowed!) in any order. Obeys basic NumPy rules, including 

748 broadcasting. 

749 

750 Also supports: 

751 

752 * Boolean "mask" array indexing 

753 """ 

754 args = args if isinstance(args, tuple) else (args,) 

755 

756 if self._fast_read_ok and (new_dtype is None): 

757 try: 

758 return self._fast_reader.read(args) 

759 except TypeError: 

760 pass # Fall back to Python read pathway below 

761 

762 if self._is_empty: 

763 # Check 'is Ellipsis' to avoid equality comparison with an array: 

764 # array equality returns an array, not a boolean. 

765 if args == () or (len(args) == 1 and args[0] is Ellipsis): 

766 return Empty(self.dtype) 

767 raise ValueError("Empty datasets cannot be sliced") 

768 

769 # Sort field names from the rest of the args. 

770 names = tuple(x for x in args if isinstance(x, str)) 

771 

772 if names: 

773 # Read a subset of the fields in this structured dtype 

774 if len(names) == 1: 

775 names = names[0] # Read with simpler dtype of this field 

776 args = tuple(x for x in args if not isinstance(x, str)) 

777 return self.fields(names, _prior_dtype=new_dtype)[args] 

778 

779 if new_dtype is None: 

780 new_dtype = self.dtype 

781 mtype = h5t.py_create(new_dtype) 

782 

783 # === Special-case region references ==== 

784 

785 if len(args) == 1 and isinstance(args[0], h5r.RegionReference): 

786 

787 obj = h5r.dereference(args[0], self.id) 

788 if obj != self.id: 

789 raise ValueError("Region reference must point to this dataset") 

790 

791 sid = h5r.get_region(args[0], self.id) 

792 mshape = sel.guess_shape(sid) 

793 if mshape is None: 

794 # 0D with no data (NULL or deselected SCALAR) 

795 return Empty(new_dtype) 

796 out = numpy.zeros(mshape, dtype=new_dtype) 

797 if out.size == 0: 

798 return out 

799 

800 sid_out = h5s.create_simple(mshape) 

801 sid_out.select_all() 

802 self.id.read(sid_out, sid, out, mtype) 

803 return out 

804 

805 # === Check for zero-sized datasets ===== 

806 

807 if self.size == 0: 

808 # Check 'is Ellipsis' to avoid equality comparison with an array: 

809 # array equality returns an array, not a boolean. 

810 if args == () or (len(args) == 1 and args[0] is Ellipsis): 

811 return numpy.zeros(self.shape, dtype=new_dtype) 

812 

813 # === Scalar dataspaces ================= 

814 

815 if self.shape == (): 

816 fspace = self.id.get_space() 

817 selection = sel2.select_read(fspace, args) 

818 if selection.mshape is None: 

819 arr = numpy.zeros((), dtype=new_dtype) 

820 else: 

821 arr = numpy.zeros(selection.mshape, dtype=new_dtype) 

822 for mspace, fspace in selection: 

823 self.id.read(mspace, fspace, arr, mtype) 

824 if selection.mshape is None: 

825 return arr[()] 

826 return arr 

827 

828 # === Everything else =================== 

829 

830 # Perform the dataspace selection. 

831 selection = sel.select(self.shape, args, dataset=self) 

832 

833 if selection.nselect == 0: 

834 return numpy.zeros(selection.array_shape, dtype=new_dtype) 

835 

836 arr = numpy.zeros(selection.array_shape, new_dtype, order='C') 

837 

838 # Perform the actual read 

839 mspace = h5s.create_simple(selection.mshape) 

840 fspace = selection.id 

841 self.id.read(mspace, fspace, arr, mtype, dxpl=self._dxpl) 

842 

843 # Patch up the output for NumPy 

844 if arr.shape == (): 

845 return arr[()] # 0 dim array -> numpy scalar 

846 return arr 

847 
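A few illustrative reads exercising the paths above (the 2-D `dset` and the compound `compound_dset`, with its field names, are hypothetical):

    sub = dset[10:20]                        # simple slicing; fast path for plain int/float dtypes
    scalar = dset[0, 0]                      # 0-d results come back as numpy scalars
    masked = dset[dset[...] > 0.5]           # boolean "mask" selection (the mask is built in memory first)
    xy = compound_dset["x", "y", 0:100]      # field names mixed with slices, routed through fields()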

848 @with_phil 

849 def __setitem__(self, args, val): 

850 """ Write to the HDF5 dataset from a Numpy array. 

851 

852 NumPy's broadcasting rules are honored, for "simple" indexing 

853 (slices and integers). For advanced indexing, the shapes must 

854 match. 

855 """ 

856 args = args if isinstance(args, tuple) else (args,) 

857 

858 # Sort field indices from the slicing 

859 names = tuple(x for x in args if isinstance(x, str)) 

860 args = tuple(x for x in args if not isinstance(x, str)) 

861 

862 # Generally we try to avoid converting the arrays on the Python 

863 # side. However, for compound literals this is unavoidable. 

864 vlen = h5t.check_vlen_dtype(self.dtype) 

865 if vlen is not None and vlen not in (bytes, str): 

866 try: 

867 val = numpy.asarray(val, dtype=vlen) 

868 except ValueError: 

869 try: 

870 val = numpy.array([numpy.array(x, dtype=vlen) 

871 for x in val], dtype=self.dtype) 

872 except ValueError: 

873 pass 

874 if vlen == val.dtype: 

875 if val.ndim > 1: 

876 tmp = numpy.empty(shape=val.shape[:-1], dtype=object) 

877 tmp.ravel()[:] = [i for i in val.reshape( 

878 (product(val.shape[:-1]), val.shape[-1]) 

879 )] 

880 else: 

881 tmp = numpy.array([None], dtype=object) 

882 tmp[0] = val 

883 val = tmp 

884 elif self.dtype.kind == "O" or \ 

885 (self.dtype.kind == 'V' and \ 

886 (not isinstance(val, numpy.ndarray) or val.dtype.kind != 'V') and \ 

887 (self.dtype.subdtype is None)): 

888 if len(names) == 1 and self.dtype.fields is not None: 

889 # Single field selected for write, from a non-array source 

890 if not names[0] in self.dtype.fields: 

891 raise ValueError("No such field for indexing: %s" % names[0]) 

892 dtype = self.dtype.fields[names[0]][0] 

893 cast_compound = True 

894 else: 

895 dtype = self.dtype 

896 cast_compound = False 

897 

898 val = numpy.asarray(val, dtype=dtype.base, order='C') 

899 if cast_compound: 

900 val = val.view(numpy.dtype([(names[0], dtype)])) 

901 val = val.reshape(val.shape[:len(val.shape) - len(dtype.shape)]) 

902 elif (self.dtype.kind == 'S' 

903 and (h5t.check_string_dtype(self.dtype).encoding == 'utf-8') 

904 and (find_item_type(val) is str) 

905 ): 

906 # Writing str objects to a fixed-length UTF-8 string dataset. 

907 # Numpy's normal conversion only handles ASCII characters, but 

908 # when the destination is UTF-8, we want to allow any unicode. 

909 # This *doesn't* handle numpy fixed-length unicode data ('U' dtype), 

910 # as HDF5 has no equivalent, and converting fixed length UTF-32 

911 # to variable length UTF-8 would obscure what's going on. 

912 str_array = numpy.asarray(val, order='C', dtype=object) 

913 val = numpy.array([ 

914 s.encode('utf-8') for s in str_array.flat 

915 ], dtype=self.dtype).reshape(str_array.shape) 

916 else: 

917 # If the input data is already an array, let HDF5 do the conversion. 

918 # If it's a list or similar, don't make numpy guess a dtype for it. 

919 dt = None if isinstance(val, numpy.ndarray) else self.dtype.base 

920 val = numpy.asarray(val, order='C', dtype=dt) 

921 

922 # Check for array dtype compatibility and convert 

923 if self.dtype.subdtype is not None: 

924 shp = self.dtype.subdtype[1] 

925 valshp = val.shape[-len(shp):] 

926 if valshp != shp: # Last dimension has to match 

927 raise TypeError("When writing to array types, last N dimensions have to match (got %s, but should be %s)" % (valshp, shp,)) 

928 mtype = h5t.py_create(numpy.dtype((val.dtype, shp))) 

929 mshape = val.shape[0:len(val.shape)-len(shp)] 

930 

931 # Make a compound memory type if field-name slicing is required 

932 elif len(names) != 0: 

933 

934 mshape = val.shape 

935 

936 # Catch common errors 

937 if self.dtype.fields is None: 

938 raise TypeError("Illegal slicing argument (not a compound dataset)") 

939 mismatch = [x for x in names if x not in self.dtype.fields] 

940 if len(mismatch) != 0: 

941 mismatch = ", ".join('"%s"'%x for x in mismatch) 

942 raise ValueError("Illegal slicing argument (fields %s not in dataset type)" % mismatch) 

943 

944 # Write non-compound source into a single dataset field 

945 if len(names) == 1 and val.dtype.fields is None: 

946 subtype = h5t.py_create(val.dtype) 

947 mtype = h5t.create(h5t.COMPOUND, subtype.get_size()) 

948 mtype.insert(self._e(names[0]), 0, subtype) 

949 

950 # Make a new source type keeping only the requested fields 

951 else: 

952 fieldnames = [x for x in val.dtype.names if x in names] # Keep source order 

953 mtype = h5t.create(h5t.COMPOUND, val.dtype.itemsize) 

954 for fieldname in fieldnames: 

955 subtype = h5t.py_create(val.dtype.fields[fieldname][0]) 

956 offset = val.dtype.fields[fieldname][1] 

957 mtype.insert(self._e(fieldname), offset, subtype) 

958 

959 # Use mtype derived from array (let DatasetID.write figure it out) 

960 else: 

961 mshape = val.shape 

962 mtype = None 

963 

964 # Perform the dataspace selection 

965 selection = sel.select(self.shape, args, dataset=self) 

966 

967 if selection.nselect == 0: 

968 return 

969 

970 # Broadcast scalars if necessary. 

971 # In order to avoid slow broadcasting filling the destination by 

972 # the scalar value, we create an intermediate array of the same 

973 # size as the destination buffer provided that size is reasonable. 

974 # We assume as reasonable a size smaller or equal as the used dataset 

975 # chunk size if any. 

976 # In case of dealing with a non-chunked destination dataset or with 

977 # a selection whose size is larger than the dataset chunk size we fall 

978 # back to using an intermediate array of size equal to the last dimension 

979 # of the destination buffer. 

980 # The reasoning behind is that it makes sense to assume the creator of 

981 # the dataset used an appropriate chunk size according the available 

982 # memory. In any case, if we cannot afford to create an intermediate 

983 # array of the same size as the dataset chunk size, the user program has 

984 # little hope to go much further. Solves h5py issue #1067 

985 if mshape == () and selection.array_shape != (): 

986 if self.dtype.subdtype is not None: 

987 raise TypeError("Scalar broadcasting is not supported for array dtypes") 

988 if self.chunks and (product(self.chunks) >= product(selection.array_shape)): 

989 val2 = numpy.empty(selection.array_shape, dtype=val.dtype) 

990 else: 

991 val2 = numpy.empty(selection.array_shape[-1], dtype=val.dtype) 

992 val2[...] = val 

993 val = val2 

994 mshape = val.shape 

995 

996 # Perform the write, with broadcasting 

997 mspace = h5s.create_simple(selection.expand_shape(mshape)) 

998 for fspace in selection.broadcast(mshape): 

999 self.id.write(mspace, fspace, val, mtype, dxpl=self._dxpl) 

1000 
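Illustrative writes (again with hypothetical names); scalar assignments are staged through the chunk-sized buffer described in the comments above:

    dset[0:10] = numpy.arange(10)                         # slice assignment with broadcasting
    dset[...] = 0.0                                       # scalar broadcast over the whole selection
    compound_dset["x", 0:5] = [1.0, 2.0, 3.0, 4.0, 5.0]   # write a single field of a compound dtype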

1001 def read_direct(self, dest, source_sel=None, dest_sel=None): 

1002 """ Read data directly from HDF5 into an existing NumPy array. 

1003 

1004 The destination array must be C-contiguous and writable. 

1005 Selections must be the output of numpy.s_[<args>]. 

1006 

1007 Broadcasting is supported for simple indexing. 

1008 """ 

1009 with phil: 

1010 if self._is_empty: 

1011 raise TypeError("Empty datasets have no numpy representation") 

1012 if source_sel is None: 

1013 source_sel = sel.SimpleSelection(self.shape) 

1014 else: 

1015 source_sel = sel.select(self.shape, source_sel, self) # for numpy.s_ 

1016 fspace = source_sel.id 

1017 

1018 if dest_sel is None: 

1019 dest_sel = sel.SimpleSelection(dest.shape) 

1020 else: 

1021 dest_sel = sel.select(dest.shape, dest_sel) 

1022 

1023 for mspace in dest_sel.broadcast(source_sel.array_shape): 

1024 self.id.read(mspace, fspace, dest, dxpl=self._dxpl) 

1025 
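A sketch of read_direct(); selections are built with numpy.s_ as the docstring says (array and dataset names are illustrative, `numpy` imported as above):

    out = numpy.empty((50, 3), dtype=dset.dtype)
    dset.read_direct(out, source_sel=numpy.s_[100:150, :], dest_sel=numpy.s_[0:50, :])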

1026 def write_direct(self, source, source_sel=None, dest_sel=None): 

1027 """ Write data directly to HDF5 from a NumPy array. 

1028 

1029 The source array must be C-contiguous. Selections must be 

1030 the output of numpy.s_[<args>]. 

1031 

1032 Broadcasting is supported for simple indexing. 

1033 """ 

1034 with phil: 

1035 if self._is_empty: 

1036 raise TypeError("Empty datasets cannot be written to") 

1037 if source_sel is None: 

1038 source_sel = sel.SimpleSelection(source.shape) 

1039 else: 

1040 source_sel = sel.select(source.shape, source_sel) # for numpy.s_ 

1041 mspace = source_sel.id 

1042 

1043 if dest_sel is None: 

1044 dest_sel = sel.SimpleSelection(self.shape) 

1045 else: 

1046 dest_sel = sel.select(self.shape, dest_sel, self) 

1047 

1048 for fspace in dest_sel.broadcast(source_sel.array_shape): 

1049 self.id.write(mspace, fspace, source, dxpl=self._dxpl) 

1050 
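And the mirror image for write_direct(), pushing part of an existing array into the file without an intermediate copy:

    buf = numpy.ascontiguousarray(numpy.random.rand(50, 3))
    dset.write_direct(buf, source_sel=numpy.s_[0:25, :], dest_sel=numpy.s_[100:125, :])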

1051 @with_phil 

1052 def __array__(self, dtype=None): 

1053 """ Create a Numpy array containing the whole dataset. DON'T THINK 

1054 THIS MEANS DATASETS ARE INTERCHANGEABLE WITH ARRAYS. For one thing, 

1055 you have to read the whole dataset every time this method is called. 

1056 """ 

1057 arr = numpy.zeros(self.shape, dtype=self.dtype if dtype is None else dtype) 

1058 

1059 # Special case for (0,)*-shape datasets 

1060 if self.size == 0: 

1061 return arr 

1062 

1063 self.read_direct(arr) 

1064 return arr 

1065 

1066 @with_phil 

1067 def __repr__(self): 

1068 if not self: 

1069 r = '<Closed HDF5 dataset>' 

1070 else: 

1071 if self.name is None: 

1072 namestr = '("anonymous")' 

1073 else: 

1074 name = pp.basename(pp.normpath(self.name)) 

1075 namestr = '"%s"' % (name if name != '' else '/') 

1076 r = '<HDF5 dataset %s: shape %s, type "%s">' % ( 

1077 namestr, self.shape, self.dtype.str 

1078 ) 

1079 return r 

1080 

1081 if hasattr(h5d.DatasetID, "refresh"): 

1082 @with_phil 

1083 def refresh(self): 

1084 """ Refresh the dataset metadata by reloading from the file. 

1085 

1086 This is part of the SWMR features and only exists when the HDF5 

1087 library version is >= 1.9.178. 

1088 """ 

1089 self._id.refresh() 

1090 self._cache_props.clear() 

1091 

1092 if hasattr(h5d.DatasetID, "flush"): 

1093 @with_phil 

1094 def flush(self): 

1095 """ Flush the dataset data and metadata to the file. 

1096 If the dataset is chunked, raw data chunks are written to the file. 

1097 

1098 This is part of the SWMR features and only exists when the HDF5 

1099 library version is >= 1.9.178. 

1100 """ 

1101 self._id.flush() 

1102 

1103 if vds_support: 

1104 @property 

1105 @with_phil 

1106 def is_virtual(self): 

1107 """Check if this is a virtual dataset""" 

1108 return self._dcpl.get_layout() == h5d.VIRTUAL 

1109 

1110 @with_phil 

1111 def virtual_sources(self): 

1112 """Get a list of the data mappings for a virtual dataset""" 

1113 if not self.is_virtual: 

1114 raise RuntimeError("Not a virtual dataset") 

1115 dcpl = self._dcpl 

1116 return [ 

1117 VDSmap(dcpl.get_virtual_vspace(j), 

1118 dcpl.get_virtual_filename(j), 

1119 dcpl.get_virtual_dsetname(j), 

1120 dcpl.get_virtual_srcspace(j)) 

1121 for j in range(dcpl.get_virtual_count())] 

1122 
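Virtual datasets are normally assembled with the higher-level h5py.VirtualLayout / h5py.VirtualSource API rather than by touching the dcpl directly; a hedged sketch (file and dataset names are made up):

    layout = h5py.VirtualLayout(shape=(4, 100), dtype="f4")
    for i in range(4):
        layout[i] = h5py.VirtualSource(f"piece{i}.h5", "data", shape=(100,))
    with h5py.File("combined.h5", "w") as f:
        vds = f.create_virtual_dataset("vds", layout, fillvalue=-1)
        vds.is_virtual           # True
        vds.virtual_sources()    # list of VDSmap(vspace, file_name, dset_name, src_space)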

1123 @with_phil 

1124 def make_scale(self, name=''): 

1125 """Make this dataset an HDF5 dimension scale. 

1126 

1127 You can then attach it to dimensions of other datasets like this:: 

1128 

1129 other_ds.dims[0].attach_scale(ds) 

1130 

1131 You can optionally pass a name to associate with this scale. 

1132 """ 

1133 h5ds.set_scale(self._id, self._e(name)) 

1134 
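A short dimension-scale sketch following the docstring above (names are illustrative, `f` is an open writable file):

    time = f.create_dataset("time", data=numpy.arange(100.0))
    time.make_scale("time")
    signal = f.create_dataset("signal", shape=(100, 8), dtype="f4")
    signal.dims[0].attach_scale(time)
    signal.dims[0].label = "time"    # optional axis label via the dims interface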

1135 @property 

1136 @with_phil 

1137 def is_scale(self): 

1138 """Return ``True`` if this dataset is also a dimension scale. 

1139 

1140 Return ``False`` otherwise. 

1141 """ 

1142 return h5ds.is_scale(self._id)