Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/h5py/_hl/dataset.py: 21%

633 statements  

coverage.py v7.2.7, created at 2023-06-07 06:30 +0000

1# This file is part of h5py, a Python interface to the HDF5 library. 

2# 

3# http://www.h5py.org 

4# 

5# Copyright 2008-2020 Andrew Collette and contributors 

6# 

7# License: Standard 3-clause BSD; see "license.txt" for full license terms 

8# and contributor agreement. 

9 

10""" 

11 Implements support for high-level dataset access. 

12""" 

13 

14import posixpath as pp 

15import sys 

16 

17import numpy 

18 

19from .. import h5, h5s, h5t, h5r, h5d, h5p, h5fd, h5ds, _selector 

20from .base import HLObject, phil, with_phil, Empty, cached_property, find_item_type, array_for_new_object 

21from . import filters 

22from . import selections as sel 

23from . import selections2 as sel2 

24from .datatype import Datatype 

25from .compat import filename_decode 

26from .vds import VDSmap, vds_support 

27 

28_LEGACY_GZIP_COMPRESSION_VALS = frozenset(range(10)) 

29MPI = h5.get_config().mpi 

30 

31 

32def make_new_dset(parent, shape=None, dtype=None, data=None, name=None, 

33 chunks=None, compression=None, shuffle=None, 

34 fletcher32=None, maxshape=None, compression_opts=None, 

35 fillvalue=None, scaleoffset=None, track_times=False, 

36 external=None, track_order=None, dcpl=None, dapl=None, 

37 efile_prefix=None, virtual_prefix=None, allow_unknown_filter=False, 

38 rdcc_nslots=None, rdcc_nbytes=None, rdcc_w0=None): 

39 """ Return a new low-level dataset identifier """ 

40 

41 # Convert data to a C-contiguous ndarray 

42 if data is not None and not isinstance(data, Empty): 

43 data = array_for_new_object(data, specified_dtype=dtype) 

44 

45 # Validate shape 

46 if shape is None: 

47 if data is None: 

48 if dtype is None: 

49 raise TypeError("One of data, shape or dtype must be specified") 

50 data = Empty(dtype) 

51 shape = data.shape 

52 else: 

53 shape = (shape,) if isinstance(shape, int) else tuple(shape) 

54 if data is not None and (numpy.product(shape, dtype=numpy.ulonglong) != numpy.product(data.shape, dtype=numpy.ulonglong)): 

55 raise ValueError("Shape tuple is incompatible with data") 

56 

57 if isinstance(maxshape, int): 

58 maxshape = (maxshape,) 

59 tmp_shape = maxshape if maxshape is not None else shape 

60 

61 # Validate chunk shape 

62 if isinstance(chunks, int) and not isinstance(chunks, bool): 

63 chunks = (chunks,) 

64 if isinstance(chunks, tuple) and any( 

65 chunk > dim for dim, chunk in zip(tmp_shape, chunks) if dim is not None 

66 ): 

67 errmsg = "Chunk shape must not be greater than data shape in any dimension. "\ 

68 "{} is not compatible with {}".format(chunks, shape) 

69 raise ValueError(errmsg) 

70 

71 if isinstance(dtype, Datatype): 

72 # Named types are used as-is 

73 tid = dtype.id 

74 dtype = tid.dtype # Following code needs this 

75 else: 

76 # Validate dtype 

77 if dtype is None and data is None: 

78 dtype = numpy.dtype("=f4") 

79 elif dtype is None and data is not None: 

80 dtype = data.dtype 

81 else: 

82 dtype = numpy.dtype(dtype) 

83 tid = h5t.py_create(dtype, logical=1) 

84 

85 # Legacy 

86 if any((compression, shuffle, fletcher32, maxshape, scaleoffset)) and chunks is False: 

87 raise ValueError("Chunked format required for given storage options") 

88 

89 # Legacy 

90 if compression is True: 

91 if compression_opts is None: 

92 compression_opts = 4 

93 compression = 'gzip' 

94 

95 # Legacy 

96 if compression in _LEGACY_GZIP_COMPRESSION_VALS: 

97 if compression_opts is not None: 

98 raise TypeError("Conflict in compression options") 

99 compression_opts = compression 

100 compression = 'gzip' 

101 dcpl = filters.fill_dcpl( 

102 dcpl or h5p.create(h5p.DATASET_CREATE), shape, dtype, 

103 chunks, compression, compression_opts, shuffle, fletcher32, 

104 maxshape, scaleoffset, external, allow_unknown_filter) 

105 

106 if fillvalue is not None: 

107 # prepare string-type dtypes for fillvalue 

108 string_info = h5t.check_string_dtype(dtype) 

109 if string_info is not None: 

110 # fake vlen dtype for fixed len string fillvalue 

111 # to not trigger unwanted encoding 

112 dtype = h5t.string_dtype(string_info.encoding) 

113 fillvalue = numpy.array(fillvalue, dtype=dtype) 

114 else: 

115 fillvalue = numpy.array(fillvalue) 

116 dcpl.set_fill_value(fillvalue) 

117 

118 if track_times is None: 

119 # In case someone explicitly passes None for the default 

120 track_times = False 

121 if track_times in (True, False): 

122 dcpl.set_obj_track_times(track_times) 

123 else: 

124 raise TypeError("track_times must be either True or False") 

125 if track_order is True: 

126 dcpl.set_attr_creation_order( 

127 h5p.CRT_ORDER_TRACKED | h5p.CRT_ORDER_INDEXED) 

128 elif track_order is False: 

129 dcpl.set_attr_creation_order(0) 

130 elif track_order is not None: 

131 raise TypeError("track_order must be either True or False") 

132 

133 if maxshape is not None: 

134 maxshape = tuple(m if m is not None else h5s.UNLIMITED for m in maxshape) 

135 

136 if any([efile_prefix, virtual_prefix, rdcc_nbytes, rdcc_nslots, rdcc_w0]): 

137 dapl = dapl or h5p.create(h5p.DATASET_ACCESS) 

138 

139 if efile_prefix is not None: 

140 dapl.set_efile_prefix(efile_prefix) 

141 

142 if virtual_prefix is not None: 

143 dapl.set_virtual_prefix(virtual_prefix) 

144 

145 if rdcc_nbytes or rdcc_nslots or rdcc_w0: 

146 cache_settings = list(dapl.get_chunk_cache()) 

147 if rdcc_nslots is not None: 

148 cache_settings[0] = rdcc_nslots 

149 if rdcc_nbytes is not None: 

150 cache_settings[1] = rdcc_nbytes 

151 if rdcc_w0 is not None: 

152 cache_settings[2] = rdcc_w0 

153 dapl.set_chunk_cache(*cache_settings) 

154 

155 if isinstance(data, Empty): 

156 sid = h5s.create(h5s.NULL) 

157 else: 

158 sid = h5s.create_simple(shape, maxshape) 

159 

160 dset_id = h5d.create(parent.id, name, tid, sid, dcpl=dcpl, dapl=dapl) 

161 

162 if (data is not None) and (not isinstance(data, Empty)): 

163 dset_id.write(h5s.ALL, h5s.ALL, data) 

164 

165 return dset_id 

166 
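Illustrative note (not part of the h5py source): a minimal sketch of how the storage options validated in make_new_dset above are usually supplied through the high-level create_dataset call, which delegates to this function. The file name "example.h5" and dataset name "measurements" are placeholders.

import h5py

with h5py.File("example.h5", "w") as f:
    # compression=True falls back to gzip level 4, and a bare int 0-9 is
    # treated as a gzip level (the "legacy" branches above).
    dset = f.create_dataset(
        "measurements",
        shape=(1000, 64),
        dtype="f4",
        chunks=(100, 64),        # must not exceed the shape in any dimension
        compression="gzip",
        compression_opts=4,
        maxshape=(None, 64),     # None becomes h5s.UNLIMITED for that axis
        fillvalue=-1.0,
    )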

167 

168def open_dset(parent, name, dapl=None, efile_prefix=None, virtual_prefix=None, 

169 rdcc_nslots=None, rdcc_nbytes=None, rdcc_w0=None, **kwds): 

170 """ Return an existing low-level dataset identifier """ 

171 

172 if any([efile_prefix, virtual_prefix, rdcc_nbytes, rdcc_nslots, rdcc_w0]): 

173 dapl = dapl or h5p.create(h5p.DATASET_ACCESS) 

174 

175 if efile_prefix is not None: 

176 dapl.set_efile_prefix(efile_prefix) 

177 

178 if virtual_prefix is not None: 

179 dapl.set_virtual_prefix(virtual_prefix) 

180 

181 if rdcc_nbytes or rdcc_nslots or rdcc_w0: 

182 cache_settings = list(dapl.get_chunk_cache()) 

183 if rdcc_nslots is not None: 

184 cache_settings[0] = rdcc_nslots 

185 if rdcc_nbytes is not None: 

186 cache_settings[1] = rdcc_nbytes 

187 if rdcc_w0 is not None: 

188 cache_settings[2] = rdcc_w0 

189 dapl.set_chunk_cache(*cache_settings) 

190 

191 dset_id = h5d.open(parent.id, name, dapl=dapl) 

192 

193 return dset_id 

194 
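Illustrative note (not part of the h5py source): a sketch of the per-dataset chunk-cache tuning that open_dset performs, expressed with the same low-level property-list calls it uses. The file name "example.h5" and dataset name b"measurements" are placeholders and assumed to already exist.

import h5py
from h5py import h5p, h5d

f = h5py.File("example.h5", "r")
dapl = h5p.create(h5p.DATASET_ACCESS)
nslots, nbytes, w0 = dapl.get_chunk_cache()           # library defaults
dapl.set_chunk_cache(nslots, 16 * 1024 * 1024, w0)    # e.g. a 16 MiB raw chunk cache
dset = h5py.Dataset(h5d.open(f.id, b"measurements", dapl=dapl))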

195 

196class AstypeWrapper: 

197 """Wrapper to convert data on reading from a dataset. 

198 """ 

199 def __init__(self, dset, dtype): 

200 self._dset = dset 

201 self._dtype = numpy.dtype(dtype) 

202 

203 def __getitem__(self, args): 

204 return self._dset.__getitem__(args, new_dtype=self._dtype) 

205 

206 def __len__(self): 

207 """ Get the length of the underlying dataset 

208 

209 >>> length = len(dataset.astype('f8')) 

210 """ 

211 return len(self._dset) 

212 

213 def __array__(self, dtype=None): 

214 data = self[:] 

215 if dtype is not None: 

216 data = data.astype(dtype) 

217 return data 

218 

219 

220class AsStrWrapper: 

221 """Wrapper to decode strings on reading the dataset""" 

222 def __init__(self, dset, encoding, errors='strict'): 

223 self._dset = dset 

224 if encoding is None: 

225 encoding = h5t.check_string_dtype(dset.dtype).encoding 

226 self.encoding = encoding 

227 self.errors = errors 

228 

229 def __getitem__(self, args): 

230 bytes_arr = self._dset[args] 

231 # numpy.char.decode() seems like the obvious thing to use. But it only 

232 # accepts numpy string arrays, not object arrays of bytes (which we 

233 # return from HDF5 variable-length strings). And the numpy 

234 # implementation is not faster than doing it with a loop; in fact, by 

235 # not converting the result to a numpy unicode array, the 

236 # naive way can be faster! (Comparing with numpy 1.18.4, June 2020) 

237 if numpy.isscalar(bytes_arr): 

238 return bytes_arr.decode(self.encoding, self.errors) 

239 

240 return numpy.array([ 

241 b.decode(self.encoding, self.errors) for b in bytes_arr.flat 

242 ], dtype=object).reshape(bytes_arr.shape) 

243 

244 def __len__(self): 

245 """ Get the length of the underlying dataset 

246 

247 >>> length = len(dataset.asstr()) 

248 """ 

249 return len(self._dset) 

250 

251 def __array__(self): 

252 return numpy.array([ 

253 b.decode(self.encoding, self.errors) for b in self._dset 

254 ], dtype=object).reshape(self._dset.shape) 

255 
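Illustrative note (not part of the h5py source): a sketch of reading string data through AsStrWrapper via Dataset.asstr(). The file name "strings.h5" is a placeholder.

import h5py

with h5py.File("strings.h5", "w") as f:
    ds = f.create_dataset("names", data=["alpha", "beta"])   # variable-length UTF-8 strings
    raw = ds[:]                        # bytes objects by default
    decoded = ds.asstr()[:]            # str objects, decoded per the HDF5 datatype
    first = ds.asstr(encoding="utf-8", errors="replace")[0]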

256 

257class FieldsWrapper: 

258 """Wrapper to extract named fields from a dataset with a struct dtype""" 

259 extract_field = None 

260 

261 def __init__(self, dset, prior_dtype, names): 

262 self._dset = dset 

263 if isinstance(names, str): 

264 self.extract_field = names 

265 names = [names] 

266 self.read_dtype = readtime_dtype(prior_dtype, names) 

267 

268 def __array__(self, dtype=None): 

269 data = self[:] 

270 if dtype is not None: 

271 data = data.astype(dtype) 

272 return data 

273 

274 def __getitem__(self, args): 

275 data = self._dset.__getitem__(args, new_dtype=self.read_dtype) 

276 if self.extract_field is not None: 

277 data = data[self.extract_field] 

278 return data 

279 

280 def __len__(self): 

281 """ Get the length of the underlying dataset 

282 

283 >>> length = len(dataset.fields(['x', 'y'])) 

284 """ 

285 return len(self._dset) 

286 

287 

288def readtime_dtype(basetype, names): 

289 """Make a NumPy compound dtype with a subset of available fields""" 

290 if basetype.names is None: # Names provided, but not compound 

291 raise ValueError("Field names only allowed for compound types") 

292 

293 for name in names: # Check all names are legal 

294 if name not in basetype.names: 

295 raise ValueError("Field %s does not appear in this type." % name) 

296 

297 return numpy.dtype([(name, basetype.fields[name][0]) for name in names]) 

298 
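Illustrative note (not part of the h5py source): a sketch of reading a subset of compound-type fields through FieldsWrapper via Dataset.fields(). The file and dataset names are placeholders.

import numpy as np
import h5py

compound = np.dtype([("x", "f8"), ("y", "f8"), ("label", "i4")])
with h5py.File("points.h5", "w") as f:
    ds = f.create_dataset("points", shape=(100,), dtype=compound)
    xy = ds.fields(["x", "y"])[:]      # compound dtype containing only x and y
    x_only = ds.fields("x")[:10]       # plain float64 array for a single field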

299 

300if MPI: 

301 class CollectiveContext: 

302 

303 """ Manages collective I/O in MPI mode """ 

304 

305 # We don't bother with _local as threads are forbidden in MPI mode 

306 

307 def __init__(self, dset): 

308 self._dset = dset 

309 

310 def __enter__(self): 

311 # pylint: disable=protected-access 

312 self._dset._dxpl.set_dxpl_mpio(h5fd.MPIO_COLLECTIVE) 

313 

314 def __exit__(self, *args): 

315 # pylint: disable=protected-access 

316 self._dset._dxpl.set_dxpl_mpio(h5fd.MPIO_INDEPENDENT) 

317 
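Illustrative note (not part of the h5py source): a sketch of collective I/O using CollectiveContext through the Dataset.collective property. It assumes h5py built against parallel HDF5 with mpi4py available; the file name "parallel.h5" is a placeholder.

from mpi4py import MPI
import h5py

comm = MPI.COMM_WORLD
with h5py.File("parallel.h5", "w", driver="mpio", comm=comm) as f:
    dset = f.create_dataset("data", shape=(comm.size, 1000), dtype="f8")
    with dset.collective:              # switches the transfer plist to MPIO_COLLECTIVE
        dset[comm.rank] = float(comm.rank)
    # on exit the context resets the transfer mode to MPIO_INDEPENDENT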

318 

319class ChunkIterator: 

320 """ 

321 Class to iterate through list of chunks of a given dataset 

322 """ 

323 def __init__(self, dset, source_sel=None): 

324 self._shape = dset.shape 

325 rank = len(dset.shape) 

326 

327 if not dset.chunks: 

328 # can only use with chunked datasets 

329 raise TypeError("Chunked dataset required") 

330 

331 self._layout = dset.chunks 

332 if source_sel is None: 

333 # select over entire dataset 

334 slices = [] 

335 for dim in range(rank): 

336 slices.append(slice(0, self._shape[dim])) 

337 self._sel = tuple(slices) 

338 else: 

339 if isinstance(source_sel, slice): 

340 self._sel = (source_sel,) 

341 else: 

342 self._sel = source_sel 

343 if len(self._sel) != rank: 

344 raise ValueError("Invalid selection - selection region must have same rank as dataset") 

345 self._chunk_index = [] 

346 for dim in range(rank): 

347 s = self._sel[dim] 

348 if s.start < 0 or s.stop > self._shape[dim] or s.stop <= s.start: 

349 raise ValueError("Invalid selection - selection region must be within dataset space") 

350 index = s.start // self._layout[dim] 

351 self._chunk_index.append(index) 

352 

353 def __iter__(self): 

354 return self 

355 

356 def __next__(self): 

357 rank = len(self._shape) 

358 slices = [] 

359 if rank == 0 or self._chunk_index[0] * self._layout[0] >= self._sel[0].stop: 

360 # ran past the last chunk, end iteration 

361 raise StopIteration() 

362 

363 for dim in range(rank): 

364 s = self._sel[dim] 

365 start = self._chunk_index[dim] * self._layout[dim] 

366 stop = (self._chunk_index[dim] + 1) * self._layout[dim] 

367 # adjust the start if this is an edge chunk 

368 if start < s.start: 

369 start = s.start 

370 if stop > s.stop: 

371 stop = s.stop # trim to end of the selection 

372 s = slice(start, stop, 1) 

373 slices.append(s) 

374 

375 # bump up the last index and carry forward if we run outside the selection 

376 dim = rank - 1 

377 while dim >= 0: 

378 s = self._sel[dim] 

379 self._chunk_index[dim] += 1 

380 

381 chunk_end = self._chunk_index[dim] * self._layout[dim] 

382 if chunk_end < s.stop: 

383                 # we still have room to extend along this dimension

384 return tuple(slices) 

385 

386 if dim > 0: 

387 # reset to the start and continue iterating with higher dimension 

388 self._chunk_index[dim] = 0 

389 dim -= 1 

390 return tuple(slices) 

391 
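Illustrative note (not part of the h5py source): a sketch of ChunkIterator as exposed through Dataset.iter_chunks(); the file name "chunked.h5" is a placeholder.

import numpy as np
import h5py

with h5py.File("chunked.h5", "w") as f:
    ds = f.create_dataset("data", shape=(1000, 1000), chunks=(100, 100), dtype="f4")
    total = 0.0
    for chunk_slices in ds.iter_chunks():          # a tuple of slices per chunk
        total += ds[chunk_slices].sum()
    # restrict iteration to a region; edge chunks are trimmed to the selection
    for chunk_slices in ds.iter_chunks(np.s_[0:250, 0:250]):
        ds[chunk_slices] = 1.0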

392 

393class Dataset(HLObject): 

394 

395 """ 

396 Represents an HDF5 dataset 

397 """ 

398 

399 def astype(self, dtype): 

400 """ Get a wrapper allowing you to perform reads to a 

401 different destination type, e.g.: 

402 

403 >>> double_precision = dataset.astype('f8')[0:100:2] 

404 """ 

405 return AstypeWrapper(self, dtype) 

406 

407 def asstr(self, encoding=None, errors='strict'): 

408 """Get a wrapper to read string data as Python strings: 

409 

410 >>> str_array = dataset.asstr()[:] 

411 

412 The parameters have the same meaning as in ``bytes.decode()``. 

413 If ``encoding`` is unspecified, it will use the encoding in the HDF5 

414 datatype (either ascii or utf-8). 

415 """ 

416 string_info = h5t.check_string_dtype(self.dtype) 

417 if string_info is None: 

418 raise TypeError( 

419 "dset.asstr() can only be used on datasets with " 

420 "an HDF5 string datatype" 

421 ) 

422 if encoding is None: 

423 encoding = string_info.encoding 

424 return AsStrWrapper(self, encoding, errors=errors) 

425 

426 def fields(self, names, *, _prior_dtype=None): 

427 """Get a wrapper to read a subset of fields from a compound data type: 

428 

429         >>> coords_2d = dataset.fields(['x', 'y'])[:]

430 

431 If names is a string, a single field is extracted, and the resulting 

432 arrays will have that dtype. Otherwise, it should be an iterable, 

433 and the read data will have a compound dtype. 

434 """ 

435 if _prior_dtype is None: 

436 _prior_dtype = self.dtype 

437 return FieldsWrapper(self, _prior_dtype, names) 

438 

439 if MPI: 

440 @property 

441 @with_phil 

442 def collective(self): 

443 """ Context manager for MPI collective reads & writes """ 

444 return CollectiveContext(self) 

445 

446 @property 

447 def dims(self): 

448 """ Access dimension scales attached to this dataset. """ 

449 from .dims import DimensionManager 

450 with phil: 

451 return DimensionManager(self) 

452 

453 @property 

454 @with_phil 

455 def ndim(self): 

456 """Numpy-style attribute giving the number of dimensions""" 

457 return self.id.rank 

458 

459 @property 

460 def shape(self): 

461 """Numpy-style shape tuple giving dataset dimensions""" 

462 if 'shape' in self._cache_props: 

463 return self._cache_props['shape'] 

464 

465 with phil: 

466 shape = self.id.shape 

467 

468 # If the file is read-only, cache the shape to speed-up future uses. 

469 # This cache is invalidated by .refresh() when using SWMR. 

470 if self._readonly: 

471 self._cache_props['shape'] = shape 

472 return shape 

473 

474 @shape.setter 

475 @with_phil 

476 def shape(self, shape): 

477 # pylint: disable=missing-docstring 

478 self.resize(shape) 

479 

480 @property 

481 def size(self): 

482 """Numpy-style attribute giving the total dataset size""" 

483 if 'size' in self._cache_props: 

484 return self._cache_props['size'] 

485 

486 if self._is_empty: 

487 size = None 

488 else: 

489 size = numpy.prod(self.shape, dtype=numpy.intp) 

490 

491 # If the file is read-only, cache the size to speed-up future uses. 

492 # This cache is invalidated by .refresh() when using SWMR. 

493 if self._readonly: 

494 self._cache_props['size'] = size 

495 return size 

496 

497 @property 

498 def nbytes(self): 

499 """Numpy-style attribute giving the raw dataset size as the number of bytes""" 

500 size = self.size 

501 if size is None: # if we are an empty 0-D array, then there are no bytes in the dataset 

502 return 0 

503 return self.dtype.itemsize * size 

504 

505 @property 

506 def _selector(self): 

507 """Internal object for optimised selection of data""" 

508 if '_selector' in self._cache_props: 

509 return self._cache_props['_selector'] 

510 

511 slr = _selector.Selector(self.id.get_space()) 

512 

513 # If the file is read-only, cache the reader to speed up future uses. 

514 # This cache is invalidated by .refresh() when using SWMR. 

515 if self._readonly: 

516 self._cache_props['_selector'] = slr 

517 return slr 

518 

519 @property 

520 def _fast_reader(self): 

521 """Internal object for optimised reading of data""" 

522 if '_fast_reader' in self._cache_props: 

523 return self._cache_props['_fast_reader'] 

524 

525 rdr = _selector.Reader(self.id) 

526 

527 # If the file is read-only, cache the reader to speed up future uses. 

528 # This cache is invalidated by .refresh() when using SWMR. 

529 if self._readonly: 

530 self._cache_props['_fast_reader'] = rdr 

531 return rdr 

532 

533 @property 

534 @with_phil 

535 def dtype(self): 

536 """Numpy dtype representing the datatype""" 

537 return self.id.dtype 

538 

539 @property 

540 @with_phil 

541 def chunks(self): 

542 """Dataset chunks (or None)""" 

543 dcpl = self._dcpl 

544 if dcpl.get_layout() == h5d.CHUNKED: 

545 return dcpl.get_chunk() 

546 return None 

547 

548 @property 

549 @with_phil 

550 def compression(self): 

551 """Compression strategy (or None)""" 

552 for x in ('gzip','lzf','szip'): 

553 if x in self._filters: 

554 return x 

555 return None 

556 

557 @property 

558 @with_phil 

559 def compression_opts(self): 

560 """ Compression setting. Int(0-9) for gzip, 2-tuple for szip. """ 

561 return self._filters.get(self.compression, None) 

562 

563 @property 

564 @with_phil 

565 def shuffle(self): 

566 """Shuffle filter present (T/F)""" 

567 return 'shuffle' in self._filters 

568 

569 @property 

570 @with_phil 

571 def fletcher32(self): 

572 """Fletcher32 filter is present (T/F)""" 

573 return 'fletcher32' in self._filters 

574 

575 @property 

576 @with_phil 

577 def scaleoffset(self): 

578 """Scale/offset filter settings. For integer data types, this is 

579 the number of bits stored, or 0 for auto-detected. For floating 

580 point data types, this is the number of decimal places retained. 

581 If the scale/offset filter is not in use, this is None.""" 

582 try: 

583 return self._filters['scaleoffset'][1] 

584 except KeyError: 

585 return None 

586 

587 @property 

588 @with_phil 

589 def external(self): 

590 """External file settings. Returns a list of tuples of 

591 (name, offset, size) for each external file entry, or returns None 

592 if no external files are used.""" 

593 count = self._dcpl.get_external_count() 

594 if count<=0: 

595 return None 

596 ext_list = list() 

597 for x in range(count): 

598 (name, offset, size) = self._dcpl.get_external(x) 

599 ext_list.append( (filename_decode(name), offset, size) ) 

600 return ext_list 

601 

602 @property 

603 @with_phil 

604 def maxshape(self): 

605 """Shape up to which this dataset can be resized. Axes with value 

606 None have no resize limit. """ 

607 space = self.id.get_space() 

608 dims = space.get_simple_extent_dims(True) 

609 if dims is None: 

610 return None 

611 

612 return tuple(x if x != h5s.UNLIMITED else None for x in dims) 

613 

614 @property 

615 @with_phil 

616 def fillvalue(self): 

617 """Fill value for this dataset (0 by default)""" 

618 arr = numpy.zeros((1,), dtype=self.dtype) 

619 self._dcpl.get_fill_value(arr) 

620 return arr[0] 

621 

622 @cached_property 

623 @with_phil 

624 def _extent_type(self): 

625 """Get extent type for this dataset - SIMPLE, SCALAR or NULL""" 

626 return self.id.get_space().get_simple_extent_type() 

627 

628 @cached_property 

629 def _is_empty(self): 

630 """Check if extent type is empty""" 

631 return self._extent_type == h5s.NULL 

632 

633 @with_phil 

634 def __init__(self, bind, *, readonly=False): 

635 """ Create a new Dataset object by binding to a low-level DatasetID. 

636 """ 

637 if not isinstance(bind, h5d.DatasetID): 

638 raise ValueError("%s is not a DatasetID" % bind) 

639 super().__init__(bind) 

640 

641 self._dcpl = self.id.get_create_plist() 

642 self._dxpl = h5p.create(h5p.DATASET_XFER) 

643 self._filters = filters.get_filters(self._dcpl) 

644 self._readonly = readonly 

645 self._cache_props = {} 

646 

647 def resize(self, size, axis=None): 

648 """ Resize the dataset, or the specified axis. 

649 

650 The dataset must be stored in chunked format; it can be resized up to 

651 the "maximum shape" (keyword maxshape) specified at creation time. 

652 The rank of the dataset cannot be changed. 

653 

654 "Size" should be a shape tuple, or if an axis is specified, an integer. 

655 

656 BEWARE: This functions differently than the NumPy resize() method! 

657 The data is not "reshuffled" to fit in the new shape; each axis is 

658 grown or shrunk independently. The coordinates of existing data are 

659 fixed. 

660 """ 

661 with phil: 

662 if self.chunks is None: 

663 raise TypeError("Only chunked datasets can be resized") 

664 

665 if axis is not None: 

666 if not (axis >=0 and axis < self.id.rank): 

667 raise ValueError("Invalid axis (0 to %s allowed)" % (self.id.rank-1)) 

668 try: 

669 newlen = int(size) 

670 except TypeError: 

671 raise TypeError("Argument must be a single int if axis is specified") 

672 size = list(self.shape) 

673 size[axis] = newlen 

674 

675 size = tuple(size) 

676 self.id.set_extent(size) 

677 #h5f.flush(self.id) # THG recommends 

678 
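Illustrative note (not part of the h5py source): a sketch of resizing a chunked dataset up to its maxshape, as described in the docstring above; the file name "growing.h5" is a placeholder.

import h5py

with h5py.File("growing.h5", "w") as f:
    ds = f.create_dataset("log", shape=(0,), maxshape=(None,), chunks=(1024,), dtype="i8")
    ds.resize((100,))          # grow the single axis to 100
    ds.resize(250, axis=0)     # equivalent form using the axis keyword
    ds.resize((50,))           # shrinking discards data beyond the new extent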

679 @with_phil 

680 def __len__(self): 

681 """ The size of the first axis. TypeError if scalar. 

682 

683 Limited to 2**32 on 32-bit systems; Dataset.len() is preferred. 

684 """ 

685 size = self.len() 

686 if size > sys.maxsize: 

687 raise OverflowError("Value too big for Python's __len__; use Dataset.len() instead.") 

688 return size 

689 

690 def len(self): 

691 """ The size of the first axis. TypeError if scalar. 

692 

693 Use of this method is preferred to len(dset), as Python's built-in 

694         len() cannot handle values greater than 2**32 on 32-bit systems.

695 """ 

696 with phil: 

697 shape = self.shape 

698 if len(shape) == 0: 

699 raise TypeError("Attempt to take len() of scalar dataset") 

700 return shape[0] 

701 

702 @with_phil 

703 def __iter__(self): 

704 """ Iterate over the first axis. TypeError if scalar. 

705 

706 BEWARE: Modifications to the yielded data are *NOT* written to file. 

707 """ 

708 shape = self.shape 

709 if len(shape) == 0: 

710 raise TypeError("Can't iterate over a scalar dataset") 

711 for i in range(shape[0]): 

712 yield self[i] 

713 

714 @with_phil 

715 def iter_chunks(self, sel=None): 

716 """ Return chunk iterator. If set, the sel argument is a slice or 

717 tuple of slices that defines the region to be used. If not set, the 

718 entire dataspace will be used for the iterator. 

719 

720 For each chunk within the given region, the iterator yields a tuple of 

721 slices that gives the intersection of the given chunk with the 

722 selection area. 

723 

724 A TypeError will be raised if the dataset is not chunked. 

725 

726 A ValueError will be raised if the selection region is invalid. 

727 

728 """ 

729 return ChunkIterator(self, sel) 

730 

731 @cached_property 

732 def _fast_read_ok(self): 

733 """Is this dataset suitable for simple reading""" 

734 return ( 

735 self._extent_type == h5s.SIMPLE 

736 and isinstance(self.id.get_type(), (h5t.TypeIntegerID, h5t.TypeFloatID)) 

737 ) 

738 

739 @with_phil 

740 def __getitem__(self, args, new_dtype=None): 

741 """ Read a slice from the HDF5 dataset. 

742 

743 Takes slices and recarray-style field names (more than one is 

744 allowed!) in any order. Obeys basic NumPy rules, including 

745 broadcasting. 

746 

747 Also supports: 

748 

749 * Boolean "mask" array indexing 

750 """ 

751 args = args if isinstance(args, tuple) else (args,) 

752 

753 if self._fast_read_ok and (new_dtype is None): 

754 try: 

755 return self._fast_reader.read(args) 

756 except TypeError: 

757 pass # Fall back to Python read pathway below 

758 

759 if self._is_empty: 

760 # Check 'is Ellipsis' to avoid equality comparison with an array: 

761 # array equality returns an array, not a boolean. 

762 if args == () or (len(args) == 1 and args[0] is Ellipsis): 

763 return Empty(self.dtype) 

764 raise ValueError("Empty datasets cannot be sliced") 

765 

766 # Sort field names from the rest of the args. 

767 names = tuple(x for x in args if isinstance(x, str)) 

768 

769 if names: 

770 # Read a subset of the fields in this structured dtype 

771 if len(names) == 1: 

772 names = names[0] # Read with simpler dtype of this field 

773 args = tuple(x for x in args if not isinstance(x, str)) 

774 return self.fields(names, _prior_dtype=new_dtype)[args] 

775 

776 if new_dtype is None: 

777 new_dtype = self.dtype 

778 mtype = h5t.py_create(new_dtype) 

779 

780 # === Special-case region references ==== 

781 

782 if len(args) == 1 and isinstance(args[0], h5r.RegionReference): 

783 

784 obj = h5r.dereference(args[0], self.id) 

785 if obj != self.id: 

786 raise ValueError("Region reference must point to this dataset") 

787 

788 sid = h5r.get_region(args[0], self.id) 

789 mshape = sel.guess_shape(sid) 

790 if mshape is None: 

791 # 0D with no data (NULL or deselected SCALAR) 

792 return Empty(new_dtype) 

793 out = numpy.zeros(mshape, dtype=new_dtype) 

794 if out.size == 0: 

795 return out 

796 

797 sid_out = h5s.create_simple(mshape) 

798 sid_out.select_all() 

799 self.id.read(sid_out, sid, out, mtype) 

800 return out 

801 

802 # === Check for zero-sized datasets ===== 

803 

804 if self.size == 0: 

805 # Check 'is Ellipsis' to avoid equality comparison with an array: 

806 # array equality returns an array, not a boolean. 

807 if args == () or (len(args) == 1 and args[0] is Ellipsis): 

808 return numpy.zeros(self.shape, dtype=new_dtype) 

809 

810 # === Scalar dataspaces ================= 

811 

812 if self.shape == (): 

813 fspace = self.id.get_space() 

814 selection = sel2.select_read(fspace, args) 

815 if selection.mshape is None: 

816 arr = numpy.zeros((), dtype=new_dtype) 

817 else: 

818 arr = numpy.zeros(selection.mshape, dtype=new_dtype) 

819 for mspace, fspace in selection: 

820 self.id.read(mspace, fspace, arr, mtype) 

821 if selection.mshape is None: 

822 return arr[()] 

823 return arr 

824 

825 # === Everything else =================== 

826 

827 # Perform the dataspace selection. 

828 selection = sel.select(self.shape, args, dataset=self) 

829 

830 if selection.nselect == 0: 

831 return numpy.zeros(selection.array_shape, dtype=new_dtype) 

832 

833 arr = numpy.zeros(selection.array_shape, new_dtype, order='C') 

834 

835 # Perform the actual read 

836 mspace = h5s.create_simple(selection.mshape) 

837 fspace = selection.id 

838 self.id.read(mspace, fspace, arr, mtype, dxpl=self._dxpl) 

839 

840 # Patch up the output for NumPy 

841 if arr.shape == (): 

842 return arr[()] # 0 dim array -> numpy scalar 

843 return arr 

844 
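Illustrative note (not part of the h5py source): a short sketch of read paths handled by __getitem__ above (plain slicing, boolean mask selection, whole-dataset reads, and on-the-fly dtype conversion); the file name "reads.h5" is a placeholder.

import numpy as np
import h5py

with h5py.File("reads.h5", "w") as f:
    ds = f.create_dataset("values", data=np.arange(10, dtype="i4"))
    doubled = ds.astype("f8")[::2]     # read with a destination dtype conversion
    masked = ds[ds[:] > 5]             # boolean "mask" array indexing
    everything = ds[()]                # empty tuple (or Ellipsis) reads the whole dataset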

845 @with_phil 

846 def __setitem__(self, args, val): 

847 """ Write to the HDF5 dataset from a Numpy array. 

848 

849 NumPy's broadcasting rules are honored, for "simple" indexing 

850 (slices and integers). For advanced indexing, the shapes must 

851 match. 

852 """ 

853 args = args if isinstance(args, tuple) else (args,) 

854 

855 # Sort field indices from the slicing 

856 names = tuple(x for x in args if isinstance(x, str)) 

857 args = tuple(x for x in args if not isinstance(x, str)) 

858 

859 # Generally we try to avoid converting the arrays on the Python 

860 # side. However, for compound literals this is unavoidable. 

861 vlen = h5t.check_vlen_dtype(self.dtype) 

862 if vlen is not None and vlen not in (bytes, str): 

863 try: 

864 val = numpy.asarray(val, dtype=vlen) 

865 except ValueError: 

866 try: 

867 val = numpy.array([numpy.array(x, dtype=vlen) 

868 for x in val], dtype=self.dtype) 

869 except ValueError: 

870 pass 

871 if vlen == val.dtype: 

872 if val.ndim > 1: 

873 tmp = numpy.empty(shape=val.shape[:-1], dtype=object) 

874 tmp.ravel()[:] = [i for i in val.reshape( 

875 (numpy.product(val.shape[:-1], dtype=numpy.ulonglong), val.shape[-1]))] 

876 else: 

877 tmp = numpy.array([None], dtype=object) 

878 tmp[0] = val 

879 val = tmp 

880 elif self.dtype.kind == "O" or \ 

881 (self.dtype.kind == 'V' and \ 

882 (not isinstance(val, numpy.ndarray) or val.dtype.kind != 'V') and \ 

883 (self.dtype.subdtype is None)): 

884 if len(names) == 1 and self.dtype.fields is not None: 

885 # Single field selected for write, from a non-array source 

886 if not names[0] in self.dtype.fields: 

887 raise ValueError("No such field for indexing: %s" % names[0]) 

888 dtype = self.dtype.fields[names[0]][0] 

889 cast_compound = True 

890 else: 

891 dtype = self.dtype 

892 cast_compound = False 

893 

894 val = numpy.asarray(val, dtype=dtype.base, order='C') 

895 if cast_compound: 

896 val = val.view(numpy.dtype([(names[0], dtype)])) 

897 val = val.reshape(val.shape[:len(val.shape) - len(dtype.shape)]) 

898 elif (self.dtype.kind == 'S' 

899 and (h5t.check_string_dtype(self.dtype).encoding == 'utf-8') 

900 and (find_item_type(val) is str) 

901 ): 

902 # Writing str objects to a fixed-length UTF-8 string dataset. 

903 # Numpy's normal conversion only handles ASCII characters, but 

904 # when the destination is UTF-8, we want to allow any unicode. 

905 # This *doesn't* handle numpy fixed-length unicode data ('U' dtype), 

906 # as HDF5 has no equivalent, and converting fixed length UTF-32 

907 # to variable length UTF-8 would obscure what's going on. 

908 str_array = numpy.asarray(val, order='C', dtype=object) 

909 val = numpy.array([ 

910 s.encode('utf-8') for s in str_array.flat 

911 ], dtype=self.dtype).reshape(str_array.shape) 

912 else: 

913 # If the input data is already an array, let HDF5 do the conversion. 

914 # If it's a list or similar, don't make numpy guess a dtype for it. 

915 dt = None if isinstance(val, numpy.ndarray) else self.dtype.base 

916 val = numpy.asarray(val, order='C', dtype=dt) 

917 

918 # Check for array dtype compatibility and convert 

919 if self.dtype.subdtype is not None: 

920 shp = self.dtype.subdtype[1] 

921 valshp = val.shape[-len(shp):] 

922 if valshp != shp: # Last dimension has to match 

923 raise TypeError("When writing to array types, last N dimensions have to match (got %s, but should be %s)" % (valshp, shp,)) 

924 mtype = h5t.py_create(numpy.dtype((val.dtype, shp))) 

925 mshape = val.shape[0:len(val.shape)-len(shp)] 

926 

927 # Make a compound memory type if field-name slicing is required 

928 elif len(names) != 0: 

929 

930 mshape = val.shape 

931 

932 # Catch common errors 

933 if self.dtype.fields is None: 

934 raise TypeError("Illegal slicing argument (not a compound dataset)") 

935 mismatch = [x for x in names if x not in self.dtype.fields] 

936 if len(mismatch) != 0: 

937 mismatch = ", ".join('"%s"'%x for x in mismatch) 

938 raise ValueError("Illegal slicing argument (fields %s not in dataset type)" % mismatch) 

939 

940 # Write non-compound source into a single dataset field 

941 if len(names) == 1 and val.dtype.fields is None: 

942 subtype = h5t.py_create(val.dtype) 

943 mtype = h5t.create(h5t.COMPOUND, subtype.get_size()) 

944 mtype.insert(self._e(names[0]), 0, subtype) 

945 

946 # Make a new source type keeping only the requested fields 

947 else: 

948 fieldnames = [x for x in val.dtype.names if x in names] # Keep source order 

949 mtype = h5t.create(h5t.COMPOUND, val.dtype.itemsize) 

950 for fieldname in fieldnames: 

951 subtype = h5t.py_create(val.dtype.fields[fieldname][0]) 

952 offset = val.dtype.fields[fieldname][1] 

953 mtype.insert(self._e(fieldname), offset, subtype) 

954 

955 # Use mtype derived from array (let DatasetID.write figure it out) 

956 else: 

957 mshape = val.shape 

958 mtype = None 

959 

960 # Perform the dataspace selection 

961 selection = sel.select(self.shape, args, dataset=self) 

962 

963 if selection.nselect == 0: 

964 return 

965 

966 # Broadcast scalars if necessary. 

967 # In order to avoid slow broadcasting filling the destination by 

968 # the scalar value, we create an intermediate array of the same 

969 # size as the destination buffer provided that size is reasonable. 

970         # We consider a size no larger than the dataset chunk size (if any)

971         # to be reasonable.

972 # In case of dealing with a non-chunked destination dataset or with 

973 # a selection whose size is larger than the dataset chunk size we fall 

974 # back to using an intermediate array of size equal to the last dimension 

975 # of the destination buffer. 

976         # The reasoning is that it makes sense to assume the creator of

977         # the dataset used an appropriate chunk size according to the available

978 # memory. In any case, if we cannot afford to create an intermediate 

979 # array of the same size as the dataset chunk size, the user program has 

980 # little hope to go much further. Solves h5py issue #1067 

981 if mshape == () and selection.array_shape != (): 

982 if self.dtype.subdtype is not None: 

983 raise TypeError("Scalar broadcasting is not supported for array dtypes") 

984 if self.chunks and (numpy.prod(self.chunks, dtype=numpy.float64) >= 

985 numpy.prod(selection.array_shape, dtype=numpy.float64)): 

986 val2 = numpy.empty(selection.array_shape, dtype=val.dtype) 

987 else: 

988 val2 = numpy.empty(selection.array_shape[-1], dtype=val.dtype) 

989 val2[...] = val 

990 val = val2 

991 mshape = val.shape 

992 

993 # Perform the write, with broadcasting 

994 mspace = h5s.create_simple(selection.expand_shape(mshape)) 

995 for fspace in selection.broadcast(mshape): 

996 self.id.write(mspace, fspace, val, mtype, dxpl=self._dxpl) 

997 
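Illustrative note (not part of the h5py source): a sketch of writes handled by __setitem__ above, including the scalar-broadcast path discussed in the comments (h5py issue #1067); the file name "writes.h5" is a placeholder.

import numpy as np
import h5py

with h5py.File("writes.h5", "w") as f:
    ds = f.create_dataset("grid", shape=(100, 100), chunks=(10, 10), dtype="f4")
    ds[0:10, 0:10] = 7.0          # scalar is expanded via an intermediate buffer
    ds[0, :] = np.arange(100)     # simple indexing follows NumPy broadcasting rules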

998 def read_direct(self, dest, source_sel=None, dest_sel=None): 

999 """ Read data directly from HDF5 into an existing NumPy array. 

1000 

1001 The destination array must be C-contiguous and writable. 

1002 Selections must be the output of numpy.s_[<args>]. 

1003 

1004 Broadcasting is supported for simple indexing. 

1005 """ 

1006 with phil: 

1007 if self._is_empty: 

1008 raise TypeError("Empty datasets have no numpy representation") 

1009 if source_sel is None: 

1010 source_sel = sel.SimpleSelection(self.shape) 

1011 else: 

1012 source_sel = sel.select(self.shape, source_sel, self) # for numpy.s_ 

1013 fspace = source_sel.id 

1014 

1015 if dest_sel is None: 

1016 dest_sel = sel.SimpleSelection(dest.shape) 

1017 else: 

1018 dest_sel = sel.select(dest.shape, dest_sel) 

1019 

1020 for mspace in dest_sel.broadcast(source_sel.array_shape): 

1021 self.id.read(mspace, fspace, dest, dxpl=self._dxpl) 

1022 

1023 def write_direct(self, source, source_sel=None, dest_sel=None): 

1024 """ Write data directly to HDF5 from a NumPy array. 

1025 

1026 The source array must be C-contiguous. Selections must be 

1027 the output of numpy.s_[<args>]. 

1028 

1029 Broadcasting is supported for simple indexing. 

1030 """ 

1031 with phil: 

1032 if self._is_empty: 

1033 raise TypeError("Empty datasets cannot be written to") 

1034 if source_sel is None: 

1035 source_sel = sel.SimpleSelection(source.shape) 

1036 else: 

1037 source_sel = sel.select(source.shape, source_sel) # for numpy.s_ 

1038 mspace = source_sel.id 

1039 

1040 if dest_sel is None: 

1041 dest_sel = sel.SimpleSelection(self.shape) 

1042 else: 

1043 dest_sel = sel.select(self.shape, dest_sel, self) 

1044 

1045 for fspace in dest_sel.broadcast(source_sel.array_shape): 

1046 self.id.write(mspace, fspace, source, dxpl=self._dxpl) 

1047 
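Illustrative note (not part of the h5py source): a sketch of read_direct/write_direct with numpy.s_ selections as described in the docstrings above; the file name "direct.h5" is a placeholder.

import numpy as np
import h5py

with h5py.File("direct.h5", "w") as f:
    ds = f.create_dataset("data", shape=(100, 50), dtype="f8")
    block = np.ones((10, 50))
    ds.write_direct(block, dest_sel=np.s_[0:10, :])    # write into a sub-region
    out = np.empty((10, 50))                           # must be C-contiguous and writable
    ds.read_direct(out, source_sel=np.s_[0:10, :])     # read back without an extra copy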

1048 @with_phil 

1049 def __array__(self, dtype=None): 

1050 """ Create a Numpy array containing the whole dataset. DON'T THINK 

1051 THIS MEANS DATASETS ARE INTERCHANGEABLE WITH ARRAYS. For one thing, 

1052 you have to read the whole dataset every time this method is called. 

1053 """ 

1054 arr = numpy.zeros(self.shape, dtype=self.dtype if dtype is None else dtype) 

1055 

1056 # Special case for (0,)*-shape datasets 

1057 if numpy.product(self.shape, dtype=numpy.ulonglong) == 0: 

1058 return arr 

1059 

1060 self.read_direct(arr) 

1061 return arr 

1062 

1063 @with_phil 

1064 def __repr__(self): 

1065 if not self: 

1066 r = '<Closed HDF5 dataset>' 

1067 else: 

1068 if self.name is None: 

1069 namestr = '("anonymous")' 

1070 else: 

1071 name = pp.basename(pp.normpath(self.name)) 

1072 namestr = '"%s"' % (name if name != '' else '/') 

1073 r = '<HDF5 dataset %s: shape %s, type "%s">' % ( 

1074 namestr, self.shape, self.dtype.str 

1075 ) 

1076 return r 

1077 

1078 if hasattr(h5d.DatasetID, "refresh"): 

1079 @with_phil 

1080 def refresh(self): 

1081 """ Refresh the dataset metadata by reloading from the file. 

1082 

1083             This is part of the SWMR features and only exists when the HDF5

1084 library version >=1.9.178 

1085 """ 

1086 self._id.refresh() 

1087 self._cache_props.clear() 

1088 

1089 if hasattr(h5d.DatasetID, "flush"): 

1090 @with_phil 

1091 def flush(self): 

1092 """ Flush the dataset data and metadata to the file. 

1093 If the dataset is chunked, raw data chunks are written to the file. 

1094 

1095             This is part of the SWMR features and only exists when the HDF5

1096 library version >=1.9.178 

1097 """ 

1098 self._id.flush() 

1099 
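Illustrative note (not part of the h5py source): a sketch of the SWMR flush/refresh cycle these methods support, assuming an HDF5 library new enough for SWMR; the file name "swmr.h5" is a placeholder, and the writer and reader would normally run in separate processes.

import h5py

# writer
with h5py.File("swmr.h5", "w", libver="latest") as f:
    ds = f.create_dataset("samples", shape=(0,), maxshape=(None,), chunks=(64,), dtype="f8")
    f.swmr_mode = True
    ds.resize((64,))
    ds[...] = 1.0
    ds.flush()        # make the new chunks visible to readers

# reader (concurrently)
with h5py.File("swmr.h5", "r", libver="latest", swmr=True) as f:
    ds = f["samples"]
    ds.refresh()      # reload metadata; also clears the cached shape/size
    latest = ds[-1]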

1100 if vds_support: 

1101 @property 

1102 @with_phil 

1103 def is_virtual(self): 

1104 """Check if this is a virtual dataset""" 

1105 return self._dcpl.get_layout() == h5d.VIRTUAL 

1106 

1107 @with_phil 

1108 def virtual_sources(self): 

1109 """Get a list of the data mappings for a virtual dataset""" 

1110 if not self.is_virtual: 

1111 raise RuntimeError("Not a virtual dataset") 

1112 dcpl = self._dcpl 

1113 return [ 

1114 VDSmap(dcpl.get_virtual_vspace(j), 

1115 dcpl.get_virtual_filename(j), 

1116 dcpl.get_virtual_dsetname(j), 

1117 dcpl.get_virtual_srcspace(j)) 

1118 for j in range(dcpl.get_virtual_count())] 

1119 
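Illustrative note (not part of the h5py source): a sketch of building a virtual dataset and inspecting it with is_virtual and virtual_sources(); the file names "raw_0.h5".."raw_3.h5" and "vds.h5" and the source dataset name "data" are placeholders.

import h5py

layout = h5py.VirtualLayout(shape=(4, 100), dtype="f8")
for i in range(4):
    layout[i] = h5py.VirtualSource("raw_%d.h5" % i, "data", shape=(100,))

with h5py.File("vds.h5", "w") as f:
    vds = f.create_virtual_dataset("combined", layout, fillvalue=0.0)
    print(vds.is_virtual)                  # True
    for mapping in vds.virtual_sources():  # VDSmap entries
        print(mapping.file_name, mapping.dset_name)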

1120 @with_phil 

1121 def make_scale(self, name=''): 

1122 """Make this dataset an HDF5 dimension scale. 

1123 

1124 You can then attach it to dimensions of other datasets like this:: 

1125 

1126 other_ds.dims[0].attach_scale(ds) 

1127 

1128 You can optionally pass a name to associate with this scale. 

1129 """ 

1130 h5ds.set_scale(self._id, self._e(name)) 

1131 

1132 @property 

1133 @with_phil 

1134 def is_scale(self): 

1135 """Return ``True`` if this dataset is also a dimension scale. 

1136 

1137 Return ``False`` otherwise. 

1138 """ 

1139 return h5ds.is_scale(self._id)
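Illustrative note (not part of the h5py source): a sketch of using make_scale and is_scale together with the dims interface; the file name "scales.h5" and dataset names are placeholders.

import numpy as np
import h5py

with h5py.File("scales.h5", "w") as f:
    temps = f.create_dataset("temperature", shape=(24, 10), dtype="f4")
    times = f.create_dataset("time", data=np.arange(24.0))
    times.make_scale("hours since start")    # turn this dataset into a dimension scale
    temps.dims[0].attach_scale(times)        # attach it to axis 0 of another dataset
    assert times.is_scale
    temps.dims[0].label = "time"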