Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/h5py/_hl/dataset.py: 21%


1# This file is part of h5py, a Python interface to the HDF5 library. 

2# 

3# http://www.h5py.org 

4# 

5# Copyright 2008-2020 Andrew Collette and contributors 

6# 

7# License: Standard 3-clause BSD; see "license.txt" for full license terms 

8# and contributor agreement. 

9 

10""" 

11 Implements support for high-level dataset access. 

12""" 

13 

14import posixpath as pp 

15import sys 

16 

17import numpy 

18 

19from .. import h5, h5s, h5t, h5r, h5d, h5p, h5fd, h5ds, _selector 

20from .base import ( 

21 array_for_new_object, cached_property, Empty, find_item_type, HLObject, 

22 phil, product, with_phil, 

23) 

24from . import filters 

25from . import selections as sel 

26from . import selections2 as sel2 

27from .datatype import Datatype 

28from .compat import filename_decode 

29from .vds import VDSmap, vds_support 

30 

31_LEGACY_GZIP_COMPRESSION_VALS = frozenset(range(10)) 

32MPI = h5.get_config().mpi 

33 

34 

35def make_new_dset(parent, shape=None, dtype=None, data=None, name=None, 

36 chunks=None, compression=None, shuffle=None, 

37 fletcher32=None, maxshape=None, compression_opts=None, 

38 fillvalue=None, scaleoffset=None, track_times=False, 

39 external=None, track_order=None, dcpl=None, dapl=None, 

40 efile_prefix=None, virtual_prefix=None, allow_unknown_filter=False, 

41 rdcc_nslots=None, rdcc_nbytes=None, rdcc_w0=None): 

42 """ Return a new low-level dataset identifier """ 

43 

44 # Convert data to a C-contiguous ndarray 

45 if data is not None and not isinstance(data, Empty): 

46 data = array_for_new_object(data, specified_dtype=dtype) 

47 

48 # Validate shape 

49 if shape is None: 

50 if data is None: 

51 if dtype is None: 

52 raise TypeError("One of data, shape or dtype must be specified") 

53 data = Empty(dtype) 

54 shape = data.shape 

55 else: 

56 shape = (shape,) if isinstance(shape, int) else tuple(shape) 

57 if data is not None and (product(shape) != product(data.shape)): 

58 raise ValueError("Shape tuple is incompatible with data") 

59 

60 if isinstance(maxshape, int): 

61 maxshape = (maxshape,) 

62 tmp_shape = maxshape if maxshape is not None else shape 

63 

64 # Validate chunk shape 

65 if isinstance(chunks, int) and not isinstance(chunks, bool): 

66 chunks = (chunks,) 

67 if isinstance(chunks, tuple) and any( 

68 chunk > dim for dim, chunk in zip(tmp_shape, chunks) if dim is not None 

69 ): 

70 errmsg = "Chunk shape must not be greater than data shape in any dimension. "\ 

71 "{} is not compatible with {}".format(chunks, shape) 

72 raise ValueError(errmsg) 

73 

74 if isinstance(dtype, Datatype): 

75 # Named types are used as-is 

76 tid = dtype.id 

77 dtype = tid.dtype # Following code needs this 

78 else: 

79 # Validate dtype 

80 if dtype is None and data is None: 

81 dtype = numpy.dtype("=f4") 

82 elif dtype is None and data is not None: 

83 dtype = data.dtype 

84 else: 

85 dtype = numpy.dtype(dtype) 

86 tid = h5t.py_create(dtype, logical=1) 

87 

88 # Legacy 

89 if any((compression, shuffle, fletcher32, maxshape, scaleoffset)) and chunks is False: 

90 raise ValueError("Chunked format required for given storage options") 

91 

92 # Legacy 

93 if compression is True: 

94 if compression_opts is None: 

95 compression_opts = 4 

96 compression = 'gzip' 

97 

98 # Legacy 

99 if compression in _LEGACY_GZIP_COMPRESSION_VALS: 

100 if compression_opts is not None: 

101 raise TypeError("Conflict in compression options") 

102 compression_opts = compression 

103 compression = 'gzip' 

104 dcpl = filters.fill_dcpl( 

105 dcpl or h5p.create(h5p.DATASET_CREATE), shape, dtype, 

106 chunks, compression, compression_opts, shuffle, fletcher32, 

107 maxshape, scaleoffset, external, allow_unknown_filter) 

108 

109 if fillvalue is not None: 

110 # prepare string-type dtypes for fillvalue 

111 string_info = h5t.check_string_dtype(dtype) 

112 if string_info is not None: 

113 # fake vlen dtype for fixed len string fillvalue 

114 # to not trigger unwanted encoding 

115 dtype = h5t.string_dtype(string_info.encoding) 

116 fillvalue = numpy.array(fillvalue, dtype=dtype) 

117 else: 

118 fillvalue = numpy.array(fillvalue) 

119 dcpl.set_fill_value(fillvalue) 

120 

121 if track_times is None: 

122 # In case someone explicitly passes None for the default 

123 track_times = False 

124 if track_times in (True, False): 

125 dcpl.set_obj_track_times(track_times) 

126 else: 

127 raise TypeError("track_times must be either True or False") 

128 if track_order is True: 

129 dcpl.set_attr_creation_order( 

130 h5p.CRT_ORDER_TRACKED | h5p.CRT_ORDER_INDEXED) 

131 elif track_order is False: 

132 dcpl.set_attr_creation_order(0) 

133 elif track_order is not None: 

134 raise TypeError("track_order must be either True or False") 

135 

136 if maxshape is not None: 

137 maxshape = tuple(m if m is not None else h5s.UNLIMITED for m in maxshape) 

138 

139 if any([efile_prefix, virtual_prefix, rdcc_nbytes, rdcc_nslots, rdcc_w0]): 

140 dapl = dapl or h5p.create(h5p.DATASET_ACCESS) 

141 

142 if efile_prefix is not None: 

143 dapl.set_efile_prefix(efile_prefix) 

144 

145 if virtual_prefix is not None: 

146 dapl.set_virtual_prefix(virtual_prefix) 

147 

148 if rdcc_nbytes or rdcc_nslots or rdcc_w0: 

149 cache_settings = list(dapl.get_chunk_cache()) 

150 if rdcc_nslots is not None: 

151 cache_settings[0] = rdcc_nslots 

152 if rdcc_nbytes is not None: 

153 cache_settings[1] = rdcc_nbytes 

154 if rdcc_w0 is not None: 

155 cache_settings[2] = rdcc_w0 

156 dapl.set_chunk_cache(*cache_settings) 

157 

158 if isinstance(data, Empty): 

159 sid = h5s.create(h5s.NULL) 

160 else: 

161 sid = h5s.create_simple(shape, maxshape) 

162 

163 dset_id = h5d.create(parent.id, name, tid, sid, dcpl=dcpl, dapl=dapl) 

164 

165 if (data is not None) and (not isinstance(data, Empty)): 

166 dset_id.write(h5s.ALL, h5s.ALL, data) 

167 

168 return dset_id 
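# --- Editor's usage sketch (not part of the original module) ---
# Group.create_dataset() forwards its keyword arguments to make_new_dset()
# above; assuming `f` is an open, writable h5py.File:
#
#     >>> dset = f.create_dataset("counts", shape=(1000,), dtype="i4",
#     ...                         chunks=(100,), maxshape=(None,),
#     ...                         compression="gzip", compression_opts=4)
#
# Per the legacy branches above, compression=True is shorthand for gzip
# level 4, and a bare integer 0-9 is interpreted as a gzip level.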

169 

170 

171def open_dset(parent, name, dapl=None, efile_prefix=None, virtual_prefix=None, 

172 rdcc_nslots=None, rdcc_nbytes=None, rdcc_w0=None, **kwds): 

173 """ Return an existing low-level dataset identifier """ 

174 

175 if any([efile_prefix, virtual_prefix, rdcc_nbytes, rdcc_nslots, rdcc_w0]): 

176 dapl = dapl or h5p.create(h5p.DATASET_ACCESS) 

177 

178 if efile_prefix is not None: 

179 dapl.set_efile_prefix(efile_prefix) 

180 

181 if virtual_prefix is not None: 

182 dapl.set_virtual_prefix(virtual_prefix) 

183 

184 if rdcc_nbytes or rdcc_nslots or rdcc_w0: 

185 cache_settings = list(dapl.get_chunk_cache()) 

186 if rdcc_nslots is not None: 

187 cache_settings[0] = rdcc_nslots 

188 if rdcc_nbytes is not None: 

189 cache_settings[1] = rdcc_nbytes 

190 if rdcc_w0 is not None: 

191 cache_settings[2] = rdcc_w0 

192 dapl.set_chunk_cache(*cache_settings) 

193 

194 dset_id = h5d.open(parent.id, name, dapl=dapl) 

195 

196 return dset_id 
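# --- Editor's note (assumption-laden sketch, not part of the original module) ---
# dapl.get_chunk_cache() returns (nslots, nbytes, w0), which is why the
# rdcc_* keywords overwrite indices 0, 1 and 2 above. Called directly
# (normally the high-level Group machinery does this), it might look like:
#
#     >>> dsid = open_dset(f["/"], b"counts", rdcc_nbytes=4 * 1024**2)
#     >>> dset = Dataset(dsid)
#
# where `f` is an assumed open h5py.File.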

197 

198 

199class AstypeWrapper: 

200 """Wrapper to convert data on reading from a dataset. 

201 """ 

202 def __init__(self, dset, dtype): 

203 self._dset = dset 

204 self._dtype = numpy.dtype(dtype) 

205 

206 def __getitem__(self, args): 

207 return self._dset.__getitem__(args, new_dtype=self._dtype) 

208 

209 def __len__(self): 

210 """ Get the length of the underlying dataset 

211 

212 >>> length = len(dataset.astype('f8')) 

213 """ 

214 return len(self._dset) 

215 

216 def __array__(self, dtype=None, copy=True): 

217 if copy is False: 

218 raise ValueError( 

219 f"AstypeWrapper.__array__ received {copy=} " 

220 f"but memory allocation cannot be avoided on read" 

221 ) 

222 

223 data = self[:] 

224 if dtype is not None: 

225 return data.astype(dtype, copy=False) 

226 return data 
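# --- Editor's usage sketch (not part of the original module) ---
# Assuming `dset` is an existing float32 Dataset:
#
#     >>> part = dset.astype('f8')[0:100]           # read a slice as float64
#     >>> whole = numpy.asarray(dset.astype('f8'))  # __array__ path, full read
#
# Passing copy=False to __array__ raises ValueError because reading from
# HDF5 always allocates a new buffer.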

227 

228 

229class AsStrWrapper: 

230 """Wrapper to decode strings on reading the dataset""" 

231 def __init__(self, dset, encoding, errors='strict'): 

232 self._dset = dset 

233 if encoding is None: 

234 encoding = h5t.check_string_dtype(dset.dtype).encoding 

235 self.encoding = encoding 

236 self.errors = errors 

237 

238 def __getitem__(self, args): 

239 bytes_arr = self._dset[args] 

240 # numpy.char.decode() seems like the obvious thing to use. But it only 

241 # accepts numpy string arrays, not object arrays of bytes (which we 

242 # return from HDF5 variable-length strings). And the numpy 

243 # implementation is not faster than doing it with a loop; in fact, by 

244 # not converting the result to a numpy unicode array, the 

245 # naive way can be faster! (Comparing with numpy 1.18.4, June 2020) 

246 if numpy.isscalar(bytes_arr): 

247 return bytes_arr.decode(self.encoding, self.errors) 

248 

249 return numpy.array([ 

250 b.decode(self.encoding, self.errors) for b in bytes_arr.flat 

251 ], dtype=object).reshape(bytes_arr.shape) 

252 

253 def __len__(self): 

254 """ Get the length of the underlying dataset 

255 

256 >>> length = len(dataset.asstr()) 

257 """ 

258 return len(self._dset) 

259 

260 def __array__(self, dtype=None, copy=True): 

261 if dtype not in (None, object): 

262 raise TypeError( 

263 "AsStrWrapper.__array__ doesn't support the dtype argument" 

264 ) 

265 if copy is False: 

266 raise ValueError( 

267 f"AsStrWrapper.__array__ received {copy=} " 

268 f"but memory allocation cannot be avoided on read" 

269 ) 

270 return numpy.array([ 

271 b.decode(self.encoding, self.errors) for b in self._dset 

272 ], dtype=object).reshape(self._dset.shape) 
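# --- Editor's usage sketch (not part of the original module) ---
# Assuming `dset` holds an HDF5 string datatype:
#
#     >>> names = dset.asstr()[:]                           # stored encoding
#     >>> names = dset.asstr('utf-8', errors='replace')[:]  # explicit override
#
# The result is an object array of Python str, decoded element by element.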

273 

274 

275class FieldsWrapper: 

276 """Wrapper to extract named fields from a dataset with a struct dtype""" 

277 extract_field = None 

278 

279 def __init__(self, dset, prior_dtype, names): 

280 self._dset = dset 

281 if isinstance(names, str): 

282 self.extract_field = names 

283 names = [names] 

284 self.read_dtype = readtime_dtype(prior_dtype, names) 

285 

286 def __array__(self, dtype=None, copy=True): 

287 if copy is False: 

288 raise ValueError( 

289 f"FieldsWrapper.__array__ received {copy=} " 

290 f"but memory allocation cannot be avoided on read" 

291 ) 

292 data = self[:] 

293 if dtype is not None: 

294 return data.astype(dtype, copy=False) 

295 else: 

296 return data 

297 

298 def __getitem__(self, args): 

299 data = self._dset.__getitem__(args, new_dtype=self.read_dtype) 

300 if self.extract_field is not None: 

301 data = data[self.extract_field] 

302 return data 

303 

304 def __len__(self): 

305 """ Get the length of the underlying dataset 

306 

307 >>> length = len(dataset.fields(['x', 'y'])) 

308 """ 

309 return len(self._dset) 

310 

311 

312def readtime_dtype(basetype, names): 

313 """Make a NumPy compound dtype with a subset of available fields""" 

314 if basetype.names is None: # Names provided, but not compound 

315 raise ValueError("Field names only allowed for compound types") 

316 

317 for name in names: # Check all names are legal 

318 if name not in basetype.names: 

319 raise ValueError("Field %s does not appear in this type." % name) 

320 

321 return numpy.dtype([(name, basetype.fields[name][0]) for name in names]) 
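# --- Editor's illustration (not part of the original module) ---
# readtime_dtype() keeps only the requested fields of a compound dtype;
# on a little-endian platform:
#
#     >>> full = numpy.dtype([('x', 'f4'), ('y', 'f4'), ('t', 'i8')])
#     >>> readtime_dtype(full, ['x', 'y'])
#     dtype([('x', '<f4'), ('y', '<f4')])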

322 

323 

324if MPI: 

325 class CollectiveContext: 

326 

327 """ Manages collective I/O in MPI mode """ 

328 

329 # We don't bother with _local as threads are forbidden in MPI mode 

330 

331 def __init__(self, dset): 

332 self._dset = dset 

333 

334 def __enter__(self): 

335 # pylint: disable=protected-access 

336 self._dset._dxpl.set_dxpl_mpio(h5fd.MPIO_COLLECTIVE) 

337 

338 def __exit__(self, *args): 

339 # pylint: disable=protected-access 

340 self._dset._dxpl.set_dxpl_mpio(h5fd.MPIO_INDEPENDENT) 
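# --- Editor's sketch (not part of the original module) ---
# Only meaningful in an MPI-enabled build with the file opened through the
# 'mpio' driver; `comm` is an assumed mpi4py communicator:
#
#     >>> f = h5py.File('parallel.h5', 'w', driver='mpio', comm=comm)
#     >>> dset = f.create_dataset('x', (comm.size,), dtype='i4')
#     >>> with dset.collective:
#     ...     dset[comm.rank] = comm.rank
#
# Entering the context switches the transfer property list to
# MPIO_COLLECTIVE; leaving it restores MPIO_INDEPENDENT.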

341 

342 

343class ChunkIterator: 

344 """ 

345 Class to iterate through list of chunks of a given dataset 

346 """ 

347 def __init__(self, dset, source_sel=None): 

348 self._shape = dset.shape 

349 rank = len(dset.shape) 

350 

351 if not dset.chunks: 

352 # can only use with chunked datasets 

353 raise TypeError("Chunked dataset required") 

354 

355 self._layout = dset.chunks 

356 if source_sel is None: 

357 # select over entire dataset 

358 self._sel = tuple( 

359 slice(0, self._shape[dim]) 

360 for dim in range(rank) 

361 ) 

362 else: 

363 if isinstance(source_sel, slice): 

364 self._sel = (source_sel,) 

365 else: 

366 self._sel = source_sel 

367 if len(self._sel) != rank: 

368 raise ValueError("Invalid selection - selection region must have same rank as dataset") 

369 self._chunk_index = [] 

370 for dim in range(rank): 

371 s = self._sel[dim] 

372 if s.start < 0 or s.stop > self._shape[dim] or s.stop <= s.start: 

373 raise ValueError("Invalid selection - selection region must be within dataset space") 

374 index = s.start // self._layout[dim] 

375 self._chunk_index.append(index) 

376 

377 def __iter__(self): 

378 return self 

379 

380 def __next__(self): 

381 rank = len(self._shape) 

382 slices = [] 

383 if rank == 0 or self._chunk_index[0] * self._layout[0] >= self._sel[0].stop: 

384 # ran past the last chunk, end iteration 

385 raise StopIteration() 

386 

387 for dim in range(rank): 

388 s = self._sel[dim] 

389 start = self._chunk_index[dim] * self._layout[dim] 

390 stop = (self._chunk_index[dim] + 1) * self._layout[dim] 

391 # adjust the start if this is an edge chunk 

392 if start < s.start: 

393 start = s.start 

394 if stop > s.stop: 

395 stop = s.stop # trim to end of the selection 

396 s = slice(start, stop, 1) 

397 slices.append(s) 

398 

399 # bump up the last index and carry forward if we run outside the selection 

400 dim = rank - 1 

401 while dim >= 0: 

402 s = self._sel[dim] 

403 self._chunk_index[dim] += 1 

404 

405 chunk_end = self._chunk_index[dim] * self._layout[dim] 

406 if chunk_end < s.stop: 

407 # we still have room to extend along this dimension

408 return tuple(slices) 

409 

410 if dim > 0: 

411 # reset to the start and continue iterating with higher dimension 

412 self._chunk_index[dim] = s.start // self._layout[dim] 

413 dim -= 1 

414 return tuple(slices) 

415 

416 

417class Dataset(HLObject): 

418 

419 """ 

420 Represents an HDF5 dataset 

421 """ 

422 

423 def astype(self, dtype): 

424 """ Get a wrapper allowing you to perform reads to a 

425 different destination type, e.g.: 

426 

427 >>> double_precision = dataset.astype('f8')[0:100:2] 

428 """ 

429 return AstypeWrapper(self, dtype) 

430 

431 def asstr(self, encoding=None, errors='strict'): 

432 """Get a wrapper to read string data as Python strings: 

433 

434 >>> str_array = dataset.asstr()[:] 

435 

436 The parameters have the same meaning as in ``bytes.decode()``. 

437 If ``encoding`` is unspecified, it will use the encoding in the HDF5 

438 datatype (either ascii or utf-8). 

439 """ 

440 string_info = h5t.check_string_dtype(self.dtype) 

441 if string_info is None: 

442 raise TypeError( 

443 "dset.asstr() can only be used on datasets with " 

444 "an HDF5 string datatype" 

445 ) 

446 if encoding is None: 

447 encoding = string_info.encoding 

448 return AsStrWrapper(self, encoding, errors=errors) 

449 

450 def fields(self, names, *, _prior_dtype=None): 

451 """Get a wrapper to read a subset of fields from a compound data type: 

452 

453 >>> coords_2d = dataset.fields(['x', 'y'])[:]

454 

455 If names is a string, a single field is extracted, and the resulting 

456 arrays will have that dtype. Otherwise, it should be an iterable, 

457 and the read data will have a compound dtype. 

458 """ 

459 if _prior_dtype is None: 

460 _prior_dtype = self.dtype 

461 return FieldsWrapper(self, _prior_dtype, names) 
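# --- Editor's usage sketch (not part of the original module) ---
# Assuming `dset` has a compound dtype with fields 'x' and 'y':
#
#     >>> xy = dset.fields(['x', 'y'])[:]   # compound dtype with two fields
#     >>> x = dset.fields('x')[:]           # plain array of that field's dtype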

462 

463 if MPI: 

464 @property 

465 @with_phil 

466 def collective(self): 

467 """ Context manager for MPI collective reads & writes """ 

468 return CollectiveContext(self) 

469 

470 @property 

471 def dims(self): 

472 """ Access dimension scales attached to this dataset. """ 

473 from .dims import DimensionManager 

474 with phil: 

475 return DimensionManager(self) 

476 

477 @property 

478 @with_phil 

479 def ndim(self): 

480 """Numpy-style attribute giving the number of dimensions""" 

481 return self.id.rank 

482 

483 @property 

484 def shape(self): 

485 """Numpy-style shape tuple giving dataset dimensions""" 

486 if 'shape' in self._cache_props: 

487 return self._cache_props['shape'] 

488 

489 with phil: 

490 shape = self.id.shape 

491 

492 # If the file is read-only, cache the shape to speed-up future uses. 

493 # This cache is invalidated by .refresh() when using SWMR. 

494 if self._readonly: 

495 self._cache_props['shape'] = shape 

496 return shape 

497 

498 @shape.setter 

499 @with_phil 

500 def shape(self, shape): 

501 # pylint: disable=missing-docstring 

502 self.resize(shape) 

503 

504 @property 

505 def size(self): 

506 """Numpy-style attribute giving the total dataset size""" 

507 if 'size' in self._cache_props: 

508 return self._cache_props['size'] 

509 

510 if self._is_empty: 

511 size = None 

512 else: 

513 size = product(self.shape) 

514 

515 # If the file is read-only, cache the size to speed-up future uses. 

516 # This cache is invalidated by .refresh() when using SWMR. 

517 if self._readonly: 

518 self._cache_props['size'] = size 

519 return size 

520 

521 @property 

522 def nbytes(self): 

523 """Numpy-style attribute giving the raw dataset size as the number of bytes""" 

524 size = self.size 

525 if size is None: # an empty dataset (null dataspace) stores no bytes

526 return 0 

527 return self.dtype.itemsize * size 

528 

529 @property 

530 def _selector(self): 

531 """Internal object for optimised selection of data""" 

532 if '_selector' in self._cache_props: 

533 return self._cache_props['_selector'] 

534 

535 slr = _selector.Selector(self.id.get_space()) 

536 

537 # If the file is read-only, cache the reader to speed up future uses. 

538 # This cache is invalidated by .refresh() when using SWMR. 

539 if self._readonly: 

540 self._cache_props['_selector'] = slr 

541 return slr 

542 

543 @property 

544 def _fast_reader(self): 

545 """Internal object for optimised reading of data""" 

546 if '_fast_reader' in self._cache_props: 

547 return self._cache_props['_fast_reader'] 

548 

549 rdr = _selector.Reader(self.id) 

550 

551 # If the file is read-only, cache the reader to speed up future uses. 

552 # This cache is invalidated by .refresh() when using SWMR. 

553 if self._readonly: 

554 self._cache_props['_fast_reader'] = rdr 

555 return rdr 

556 

557 @property 

558 @with_phil 

559 def dtype(self): 

560 """Numpy dtype representing the datatype""" 

561 return self.id.dtype 

562 

563 @property 

564 @with_phil 

565 def chunks(self): 

566 """Dataset chunks (or None)""" 

567 dcpl = self._dcpl 

568 if dcpl.get_layout() == h5d.CHUNKED: 

569 return dcpl.get_chunk() 

570 return None 

571 

572 @property 

573 @with_phil 

574 def compression(self): 

575 """Compression strategy (or None)""" 

576 for x in ('gzip','lzf','szip'): 

577 if x in self._filters: 

578 return x 

579 return None 

580 

581 @property 

582 @with_phil 

583 def compression_opts(self): 

584 """ Compression setting. Int(0-9) for gzip, 2-tuple for szip. """ 

585 return self._filters.get(self.compression, None) 

586 

587 @property 

588 @with_phil 

589 def shuffle(self): 

590 """Shuffle filter present (T/F)""" 

591 return 'shuffle' in self._filters 

592 

593 @property 

594 @with_phil 

595 def fletcher32(self): 

596 """Fletcher32 filter is present (T/F)""" 

597 return 'fletcher32' in self._filters 

598 

599 @property 

600 @with_phil 

601 def scaleoffset(self): 

602 """Scale/offset filter settings. For integer data types, this is 

603 the number of bits stored, or 0 for auto-detected. For floating 

604 point data types, this is the number of decimal places retained. 

605 If the scale/offset filter is not in use, this is None.""" 

606 try: 

607 return self._filters['scaleoffset'][1] 

608 except KeyError: 

609 return None 

610 

611 @property 

612 @with_phil 

613 def external(self): 

614 """External file settings. Returns a list of tuples of 

615 (name, offset, size) for each external file entry, or returns None 

616 if no external files are used.""" 

617 count = self._dcpl.get_external_count() 

618 if count<=0: 

619 return None 

620 ext_list = list() 

621 for x in range(count): 

622 (name, offset, size) = self._dcpl.get_external(x) 

623 ext_list.append( (filename_decode(name), offset, size) ) 

624 return ext_list 

625 

626 @property 

627 @with_phil 

628 def maxshape(self): 

629 """Shape up to which this dataset can be resized. Axes with value 

630 None have no resize limit. """ 

631 space = self.id.get_space() 

632 dims = space.get_simple_extent_dims(True) 

633 if dims is None: 

634 return None 

635 

636 return tuple(x if x != h5s.UNLIMITED else None for x in dims) 

637 

638 @property 

639 @with_phil 

640 def fillvalue(self): 

641 """Fill value for this dataset (0 by default)""" 

642 arr = numpy.zeros((1,), dtype=self.dtype) 

643 self._dcpl.get_fill_value(arr) 

644 return arr[0] 

645 

646 @cached_property 

647 @with_phil 

648 def _extent_type(self): 

649 """Get extent type for this dataset - SIMPLE, SCALAR or NULL""" 

650 return self.id.get_space().get_simple_extent_type() 

651 

652 @cached_property 

653 def _is_empty(self): 

654 """Check if extent type is empty""" 

655 return self._extent_type == h5s.NULL 

656 

657 @with_phil 

658 def __init__(self, bind, *, readonly=False): 

659 """ Create a new Dataset object by binding to a low-level DatasetID. 

660 """ 

661 if not isinstance(bind, h5d.DatasetID): 

662 raise ValueError("%s is not a DatasetID" % bind) 

663 super().__init__(bind) 

664 

665 self._dcpl = self.id.get_create_plist() 

666 self._dxpl = h5p.create(h5p.DATASET_XFER) 

667 self._filters = filters.get_filters(self._dcpl) 

668 self._readonly = readonly 

669 self._cache_props = {} 

670 

671 def resize(self, size, axis=None): 

672 """ Resize the dataset, or the specified axis. 

673 

674 The dataset must be stored in chunked format; it can be resized up to 

675 the "maximum shape" (keyword maxshape) specified at creation time. 

676 The rank of the dataset cannot be changed. 

677 

678 "Size" should be a shape tuple, or if an axis is specified, an integer. 

679 

680 BEWARE: This functions differently than the NumPy resize() method! 

681 The data is not "reshuffled" to fit in the new shape; each axis is 

682 grown or shrunk independently. The coordinates of existing data are 

683 fixed. 

684 """ 

685 with phil: 

686 if self.chunks is None: 

687 raise TypeError("Only chunked datasets can be resized") 

688 

689 if axis is not None: 

690 if not (axis >=0 and axis < self.id.rank): 

691 raise ValueError("Invalid axis (0 to %s allowed)" % (self.id.rank-1)) 

692 try: 

693 newlen = int(size) 

694 except TypeError: 

695 raise TypeError("Argument must be a single int if axis is specified") 

696 size = list(self.shape) 

697 size[axis] = newlen 

698 

699 size = tuple(size) 

700 self.id.set_extent(size) 

701 #h5f.flush(self.id) # THG recommends 
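# --- Editor's usage sketch (not part of the original module) ---
# Assuming `f` is an open, writable h5py.File:
#
#     >>> dset = f.create_dataset('log', shape=(0,), maxshape=(None,),
#     ...                         chunks=(1024,), dtype='f8')
#     >>> dset.resize((100,))           # shape tuple form
#     >>> dset.resize(200, axis=0)      # per-axis form
#
# Existing elements keep their coordinates; shrinking an axis discards data
# beyond the new extent.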

702 

703 @with_phil 

704 def __len__(self): 

705 """ The size of the first axis. TypeError if scalar. 

706 

707 Limited to 2**32 on 32-bit systems; Dataset.len() is preferred. 

708 """ 

709 size = self.len() 

710 if size > sys.maxsize: 

711 raise OverflowError("Value too big for Python's __len__; use Dataset.len() instead.") 

712 return size 

713 

714 def len(self): 

715 """ The size of the first axis. TypeError if scalar. 

716 

717 Use of this method is preferred to len(dset), as Python's built-in 

718 len() cannot handle values greater than 2**32 on 32-bit systems.

719 """ 

720 with phil: 

721 shape = self.shape 

722 if len(shape) == 0: 

723 raise TypeError("Attempt to take len() of scalar dataset") 

724 return shape[0] 

725 

726 @with_phil 

727 def __iter__(self): 

728 """ Iterate over the first axis. TypeError if scalar. 

729 

730 BEWARE: Modifications to the yielded data are *NOT* written to file. 

731 """ 

732 shape = self.shape 

733 if len(shape) == 0: 

734 raise TypeError("Can't iterate over a scalar dataset") 

735 for i in range(shape[0]): 

736 yield self[i] 

737 

738 @with_phil 

739 def iter_chunks(self, sel=None): 

740 """ Return chunk iterator. If set, the sel argument is a slice or 

741 tuple of slices that defines the region to be used. If not set, the 

742 entire dataspace will be used for the iterator. 

743 

744 For each chunk within the given region, the iterator yields a tuple of 

745 slices that gives the intersection of the given chunk with the 

746 selection area. 

747 

748 A TypeError will be raised if the dataset is not chunked. 

749 

750 A ValueError will be raised if the selection region is invalid. 

751 

752 """ 

753 return ChunkIterator(self, sel) 
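# --- Editor's usage sketch (not part of the original module) ---
# Assuming `dset` is a chunked 2-D dataset at least 500 rows long:
#
#     >>> for chunk_slices in dset.iter_chunks(numpy.s_[0:500, :]):
#     ...     block = dset[chunk_slices]
#
# Each yielded tuple of slices is the intersection of one stored chunk with
# the selection, so reads stay aligned with chunk boundaries.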

754 

755 @cached_property 

756 def _fast_read_ok(self): 

757 """Is this dataset suitable for simple reading""" 

758 return ( 

759 self._extent_type == h5s.SIMPLE 

760 and isinstance(self.id.get_type(), (h5t.TypeIntegerID, h5t.TypeFloatID)) 

761 ) 

762 

763 @with_phil 

764 def __getitem__(self, args, new_dtype=None): 

765 """ Read a slice from the HDF5 dataset. 

766 

767 Takes slices and recarray-style field names (more than one is 

768 allowed!) in any order. Obeys basic NumPy rules, including 

769 broadcasting. 

770 

771 Also supports: 

772 

773 * Boolean "mask" array indexing 

774 """ 

775 args = args if isinstance(args, tuple) else (args,) 

776 

777 if self._fast_read_ok and (new_dtype is None): 

778 try: 

779 return self._fast_reader.read(args) 

780 except TypeError: 

781 pass # Fall back to Python read pathway below 

782 

783 if self._is_empty: 

784 # Check 'is Ellipsis' to avoid equality comparison with an array: 

785 # array equality returns an array, not a boolean. 

786 if args == () or (len(args) == 1 and args[0] is Ellipsis): 

787 return Empty(self.dtype) 

788 raise ValueError("Empty datasets cannot be sliced") 

789 

790 # Sort field names from the rest of the args. 

791 names = tuple(x for x in args if isinstance(x, str)) 

792 

793 if names: 

794 # Read a subset of the fields in this structured dtype 

795 if len(names) == 1: 

796 names = names[0] # Read with simpler dtype of this field 

797 args = tuple(x for x in args if not isinstance(x, str)) 

798 return self.fields(names, _prior_dtype=new_dtype)[args] 

799 

800 if new_dtype is None: 

801 new_dtype = self.dtype 

802 mtype = h5t.py_create(new_dtype) 

803 

804 # === Special-case region references ==== 

805 

806 if len(args) == 1 and isinstance(args[0], h5r.RegionReference): 

807 

808 obj = h5r.dereference(args[0], self.id) 

809 if obj != self.id: 

810 raise ValueError("Region reference must point to this dataset") 

811 

812 sid = h5r.get_region(args[0], self.id) 

813 mshape = sel.guess_shape(sid) 

814 if mshape is None: 

815 # 0D with no data (NULL or deselected SCALAR) 

816 return Empty(new_dtype) 

817 out = numpy.zeros(mshape, dtype=new_dtype) 

818 if out.size == 0: 

819 return out 

820 

821 sid_out = h5s.create_simple(mshape) 

822 sid_out.select_all() 

823 self.id.read(sid_out, sid, out, mtype) 

824 return out 

825 

826 # === Check for zero-sized datasets ===== 

827 

828 if self.size == 0: 

829 # Check 'is Ellipsis' to avoid equality comparison with an array: 

830 # array equality returns an array, not a boolean. 

831 if args == () or (len(args) == 1 and args[0] is Ellipsis): 

832 return numpy.zeros(self.shape, dtype=new_dtype) 

833 

834 # === Scalar dataspaces ================= 

835 

836 if self.shape == (): 

837 fspace = self.id.get_space() 

838 selection = sel2.select_read(fspace, args) 

839 if selection.mshape is None: 

840 arr = numpy.zeros((), dtype=new_dtype) 

841 else: 

842 arr = numpy.zeros(selection.mshape, dtype=new_dtype) 

843 for mspace, fspace in selection: 

844 self.id.read(mspace, fspace, arr, mtype) 

845 if selection.mshape is None: 

846 return arr[()] 

847 return arr 

848 

849 # === Everything else =================== 

850 

851 # Perform the dataspace selection. 

852 selection = sel.select(self.shape, args, dataset=self) 

853 

854 if selection.nselect == 0: 

855 return numpy.zeros(selection.array_shape, dtype=new_dtype) 

856 

857 arr = numpy.zeros(selection.array_shape, new_dtype, order='C') 

858 

859 # Perform the actual read 

860 mspace = h5s.create_simple(selection.mshape) 

861 fspace = selection.id 

862 self.id.read(mspace, fspace, arr, mtype, dxpl=self._dxpl) 

863 

864 # Patch up the output for NumPy 

865 if arr.shape == (): 

866 return arr[()] # 0 dim array -> numpy scalar 

867 return arr 
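# --- Editor's usage sketch (not part of the original module) ---
# Assuming `dset` is a numeric dataset and `mask` an assumed boolean numpy
# array of matching shape:
#
#     >>> sub = dset[10:20]              # simple slice
#     >>> evens = dset[0:100:2]          # strided slice
#     >>> picked = dset[mask]            # boolean "mask" array indexing
#
# For compound dtypes, field names can be mixed into the index, e.g.
# dset['x', 0:100].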

868 

869 @with_phil 

870 def __setitem__(self, args, val): 

871 """ Write to the HDF5 dataset from a Numpy array. 

872 

873 NumPy's broadcasting rules are honored, for "simple" indexing 

874 (slices and integers). For advanced indexing, the shapes must 

875 match. 

876 """ 

877 args = args if isinstance(args, tuple) else (args,) 

878 

879 # Sort field indices from the slicing 

880 names = tuple(x for x in args if isinstance(x, str)) 

881 args = tuple(x for x in args if not isinstance(x, str)) 

882 

883 # Generally we try to avoid converting the arrays on the Python 

884 # side. However, for compound literals this is unavoidable. 

885 vlen = h5t.check_vlen_dtype(self.dtype) 

886 if vlen is not None and vlen not in (bytes, str): 

887 try: 

888 val = numpy.asarray(val, dtype=vlen) 

889 except (ValueError, TypeError): 

890 try: 

891 val = numpy.array([numpy.array(x, dtype=vlen) 

892 for x in val], dtype=self.dtype) 

893 except (ValueError, TypeError): 

894 pass 

895 if vlen == val.dtype: 

896 if val.ndim > 1: 

897 tmp = numpy.empty(shape=val.shape[:-1], dtype=object) 

898 tmp.ravel()[:] = [i for i in val.reshape( 

899 (product(val.shape[:-1]), val.shape[-1]) 

900 )] 

901 else: 

902 tmp = numpy.array([None], dtype=object) 

903 tmp[0] = val 

904 val = tmp 

905 elif self.dtype.kind == "O" or \ 

906 (self.dtype.kind == 'V' and \ 

907 (not isinstance(val, numpy.ndarray) or val.dtype.kind != 'V') and \ 

908 (self.dtype.subdtype is None)): 

909 if len(names) == 1 and self.dtype.fields is not None: 

910 # Single field selected for write, from a non-array source 

911 if not names[0] in self.dtype.fields: 

912 raise ValueError("No such field for indexing: %s" % names[0]) 

913 dtype = self.dtype.fields[names[0]][0] 

914 cast_compound = True 

915 else: 

916 dtype = self.dtype 

917 cast_compound = False 

918 

919 val = numpy.asarray(val, dtype=dtype.base, order='C') 

920 if cast_compound: 

921 val = val.view(numpy.dtype([(names[0], dtype)])) 

922 val = val.reshape(val.shape[:len(val.shape) - len(dtype.shape)]) 

923 elif (self.dtype.kind == 'S' 

924 and (h5t.check_string_dtype(self.dtype).encoding == 'utf-8') 

925 and (find_item_type(val) is str) 

926 ): 

927 # Writing str objects to a fixed-length UTF-8 string dataset. 

928 # Numpy's normal conversion only handles ASCII characters, but 

929 # when the destination is UTF-8, we want to allow any unicode. 

930 # This *doesn't* handle numpy fixed-length unicode data ('U' dtype), 

931 # as HDF5 has no equivalent, and converting fixed length UTF-32 

932 # to variable length UTF-8 would obscure what's going on. 

933 str_array = numpy.asarray(val, order='C', dtype=object) 

934 val = numpy.array([ 

935 s.encode('utf-8') for s in str_array.flat 

936 ], dtype=self.dtype).reshape(str_array.shape) 

937 else: 

938 # If the input data is already an array, let HDF5 do the conversion. 

939 # If it's a list or similar, don't make numpy guess a dtype for it. 

940 dt = None if isinstance(val, numpy.ndarray) else self.dtype.base 

941 val = numpy.asarray(val, order='C', dtype=dt) 

942 

943 # Check for array dtype compatibility and convert 

944 if self.dtype.subdtype is not None: 

945 shp = self.dtype.subdtype[1] 

946 valshp = val.shape[-len(shp):] 

947 if valshp != shp: # Last dimension has to match 

948 raise TypeError("When writing to array types, last N dimensions have to match (got %s, but should be %s)" % (valshp, shp,)) 

949 mtype = h5t.py_create(numpy.dtype((val.dtype, shp))) 

950 mshape = val.shape[0:len(val.shape)-len(shp)] 

951 

952 # Make a compound memory type if field-name slicing is required 

953 elif len(names) != 0: 

954 

955 mshape = val.shape 

956 

957 # Catch common errors 

958 if self.dtype.fields is None: 

959 raise TypeError("Illegal slicing argument (not a compound dataset)") 

960 mismatch = [x for x in names if x not in self.dtype.fields] 

961 if len(mismatch) != 0: 

962 mismatch = ", ".join('"%s"'%x for x in mismatch) 

963 raise ValueError("Illegal slicing argument (fields %s not in dataset type)" % mismatch) 

964 

965 # Write non-compound source into a single dataset field 

966 if len(names) == 1 and val.dtype.fields is None: 

967 subtype = h5t.py_create(val.dtype) 

968 mtype = h5t.create(h5t.COMPOUND, subtype.get_size()) 

969 mtype.insert(self._e(names[0]), 0, subtype) 

970 

971 # Make a new source type keeping only the requested fields 

972 else: 

973 fieldnames = [x for x in val.dtype.names if x in names] # Keep source order 

974 mtype = h5t.create(h5t.COMPOUND, val.dtype.itemsize) 

975 for fieldname in fieldnames: 

976 subtype = h5t.py_create(val.dtype.fields[fieldname][0]) 

977 offset = val.dtype.fields[fieldname][1] 

978 mtype.insert(self._e(fieldname), offset, subtype) 

979 

980 # Use mtype derived from array (let DatasetID.write figure it out) 

981 else: 

982 mshape = val.shape 

983 mtype = None 

984 

985 # Perform the dataspace selection 

986 selection = sel.select(self.shape, args, dataset=self) 

987 

988 if selection.nselect == 0: 

989 return 

990 

991 # Broadcast scalars if necessary. 

992 # In order to avoid slow broadcasting filling the destination by 

993 # the scalar value, we create an intermediate array of the same 

994 # size as the destination buffer provided that size is reasonable. 

995 # We assume as reasonable a size smaller or equal as the used dataset 

996 # chunk size if any. 

997 # In case of dealing with a non-chunked destination dataset or with 

998 # a selection whose size is larger than the dataset chunk size we fall 

999 # back to using an intermediate array of size equal to the last dimension 

1000 # of the destination buffer. 

1001 # The reasoning behind this is that it makes sense to assume the creator of

1002 # the dataset used an appropriate chunk size according to the available

1003 # memory. In any case, if we cannot afford to create an intermediate 

1004 # array of the same size as the dataset chunk size, the user program has 

1005 # little hope to go much further. Solves h5py issue #1067 

1006 if mshape == () and selection.array_shape != (): 

1007 if self.dtype.subdtype is not None: 

1008 raise TypeError("Scalar broadcasting is not supported for array dtypes") 

1009 if self.chunks and (product(self.chunks) >= product(selection.array_shape)): 

1010 val2 = numpy.empty(selection.array_shape, dtype=val.dtype) 

1011 else: 

1012 val2 = numpy.empty(selection.array_shape[-1], dtype=val.dtype) 

1013 val2[...] = val 

1014 val = val2 

1015 mshape = val.shape 

1016 

1017 # Perform the write, with broadcasting 

1018 mspace = h5s.create_simple(selection.expand_shape(mshape)) 

1019 for fspace in selection.broadcast(mshape): 

1020 self.id.write(mspace, fspace, val, mtype, dxpl=self._dxpl) 
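# --- Editor's usage sketch (not part of the original module) ---
# Scalar writes go through the intermediate-buffer path described in the
# comment above. Assuming `dset` is a writable 1-D float dataset and `cdset`
# a compound dataset with a field 'x':
#
#     >>> dset[0:100] = 0.0                    # scalar broadcast over a slice
#     >>> dset[10:20] = numpy.arange(10)       # shaped write
#     >>> cdset[0:10, 'x'] = numpy.ones(10)    # write a single field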

1021 

1022 def read_direct(self, dest, source_sel=None, dest_sel=None): 

1023 """ Read data directly from HDF5 into an existing NumPy array. 

1024 

1025 The destination array must be C-contiguous and writable. 

1026 Selections must be the output of numpy.s_[<args>]. 

1027 

1028 Broadcasting is supported for simple indexing. 

1029 """ 

1030 with phil: 

1031 if self._is_empty: 

1032 raise TypeError("Empty datasets have no numpy representation") 

1033 if source_sel is None: 

1034 source_sel = sel.SimpleSelection(self.shape) 

1035 else: 

1036 source_sel = sel.select(self.shape, source_sel, self) # for numpy.s_ 

1037 fspace = source_sel.id 

1038 

1039 if dest_sel is None: 

1040 dest_sel = sel.SimpleSelection(dest.shape) 

1041 else: 

1042 dest_sel = sel.select(dest.shape, dest_sel) 

1043 

1044 for mspace in dest_sel.broadcast(source_sel.array_shape): 

1045 self.id.read(mspace, fspace, dest, dxpl=self._dxpl) 
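# --- Editor's usage sketch (not part of the original module) ---
# Assuming `dset` is a 1-D float64 dataset with at least 100 elements:
#
#     >>> buf = numpy.empty((100,), dtype='f8')
#     >>> dset.read_direct(buf, numpy.s_[0:100], numpy.s_[0:100])
#
# The selections must come from numpy.s_[...]; the destination array is
# filled in place without an extra copy.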

1046 

1047 def write_direct(self, source, source_sel=None, dest_sel=None): 

1048 """ Write data directly to HDF5 from a NumPy array. 

1049 

1050 The source array must be C-contiguous. Selections must be 

1051 the output of numpy.s_[<args>]. 

1052 

1053 Broadcasting is supported for simple indexing. 

1054 """ 

1055 with phil: 

1056 if self._is_empty: 

1057 raise TypeError("Empty datasets cannot be written to") 

1058 if source_sel is None: 

1059 source_sel = sel.SimpleSelection(source.shape) 

1060 else: 

1061 source_sel = sel.select(source.shape, source_sel) # for numpy.s_ 

1062 mspace = source_sel.id 

1063 

1064 if dest_sel is None: 

1065 dest_sel = sel.SimpleSelection(self.shape) 

1066 else: 

1067 dest_sel = sel.select(self.shape, dest_sel, self) 

1068 

1069 for fspace in dest_sel.broadcast(source_sel.array_shape): 

1070 self.id.write(mspace, fspace, source, dxpl=self._dxpl) 

1071 

1072 @with_phil 

1073 def __array__(self, dtype=None, copy=True): 

1074 """ Create a Numpy array containing the whole dataset. DON'T THINK 

1075 THIS MEANS DATASETS ARE INTERCHANGEABLE WITH ARRAYS. For one thing, 

1076 you have to read the whole dataset every time this method is called. 

1077 """ 

1078 if copy is False: 

1079 raise ValueError( 

1080 f"Dataset.__array__ received {copy=} " 

1081 f"but memory allocation cannot be avoided on read" 

1082 ) 

1083 arr = numpy.zeros(self.shape, dtype=self.dtype if dtype is None else dtype) 

1084 

1085 # Special case for (0,)*-shape datasets 

1086 if self.size == 0: 

1087 return arr 

1088 

1089 self.read_direct(arr) 

1090 return arr 

1091 

1092 @with_phil 

1093 def __repr__(self): 

1094 if not self: 

1095 r = '<Closed HDF5 dataset>' 

1096 else: 

1097 if self.name is None: 

1098 namestr = '("anonymous")' 

1099 else: 

1100 name = pp.basename(pp.normpath(self.name)) 

1101 namestr = '"%s"' % (name if name != '' else '/') 

1102 r = '<HDF5 dataset %s: shape %s, type "%s">' % ( 

1103 namestr, self.shape, self.dtype.str 

1104 ) 

1105 return r 

1106 

1107 if hasattr(h5d.DatasetID, "refresh"): 

1108 @with_phil 

1109 def refresh(self): 

1110 """ Refresh the dataset metadata by reloading from the file. 

1111 

1112 This is part of the SWMR features and only exists when the HDF5

1113 library version is >= 1.9.178.

1114 """ 

1115 self._id.refresh() 

1116 self._cache_props.clear() 

1117 

1118 if hasattr(h5d.DatasetID, "flush"): 

1119 @with_phil 

1120 def flush(self): 

1121 """ Flush the dataset data and metadata to the file. 

1122 If the dataset is chunked, raw data chunks are written to the file. 

1123 

1124 This is part of the SWMR features and only exists when the HDF5

1125 library version is >= 1.9.178.

1126 """ 

1127 self._id.flush() 

1128 

1129 if vds_support: 

1130 @property 

1131 @with_phil 

1132 def is_virtual(self): 

1133 """Check if this is a virtual dataset""" 

1134 return self._dcpl.get_layout() == h5d.VIRTUAL 

1135 

1136 @with_phil 

1137 def virtual_sources(self): 

1138 """Get a list of the data mappings for a virtual dataset""" 

1139 if not self.is_virtual: 

1140 raise RuntimeError("Not a virtual dataset") 

1141 dcpl = self._dcpl 

1142 return [ 

1143 VDSmap(dcpl.get_virtual_vspace(j), 

1144 dcpl.get_virtual_filename(j), 

1145 dcpl.get_virtual_dsetname(j), 

1146 dcpl.get_virtual_srcspace(j)) 

1147 for j in range(dcpl.get_virtual_count())] 

1148 

1149 @with_phil 

1150 def make_scale(self, name=''): 

1151 """Make this dataset an HDF5 dimension scale. 

1152 

1153 You can then attach it to dimensions of other datasets like this:: 

1154 

1155 other_ds.dims[0].attach_scale(ds) 

1156 

1157 You can optionally pass a name to associate with this scale. 

1158 """ 

1159 h5ds.set_scale(self._id, self._e(name)) 
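# --- Editor's usage sketch (not part of the original module) ---
# Assuming `f` is an open, writable h5py.File containing a 1-D 'time'
# dataset and a 2-D 'temperature' dataset:
#
#     >>> f['time'].make_scale('time axis')
#     >>> f['temperature'].dims[0].attach_scale(f['time'])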

1160 

1161 @property 

1162 @with_phil 

1163 def is_scale(self): 

1164 """Return ``True`` if this dataset is also a dimension scale. 

1165 

1166 Return ``False`` otherwise. 

1167 """ 

1168 return h5ds.is_scale(self._id)