Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/h5py/_hl/dataset.py: 21%

633 statements  

coverage.py v7.4.0, created at 2024-01-03 07:57 +0000

1# This file is part of h5py, a Python interface to the HDF5 library. 

2# 

3# http://www.h5py.org 

4# 

5# Copyright 2008-2020 Andrew Collette and contributors 

6# 

7# License: Standard 3-clause BSD; see "license.txt" for full license terms 

8# and contributor agreement. 

9 

10""" 

11 Implements support for high-level dataset access. 

12""" 

13 

14import posixpath as pp 

15import sys 

16 

17import numpy 

18 

19from .. import h5, h5s, h5t, h5r, h5d, h5p, h5fd, h5ds, _selector 

20from .base import ( 

21 array_for_new_object, cached_property, Empty, find_item_type, HLObject, 

22 phil, product, with_phil, 

23) 

24from . import filters 

25from . import selections as sel 

26from . import selections2 as sel2 

27from .datatype import Datatype 

28from .compat import filename_decode 

29from .vds import VDSmap, vds_support 

30 

31_LEGACY_GZIP_COMPRESSION_VALS = frozenset(range(10)) 

32MPI = h5.get_config().mpi 

33 

34 

35def make_new_dset(parent, shape=None, dtype=None, data=None, name=None, 

36 chunks=None, compression=None, shuffle=None, 

37 fletcher32=None, maxshape=None, compression_opts=None, 

38 fillvalue=None, scaleoffset=None, track_times=False, 

39 external=None, track_order=None, dcpl=None, dapl=None, 

40 efile_prefix=None, virtual_prefix=None, allow_unknown_filter=False, 

41 rdcc_nslots=None, rdcc_nbytes=None, rdcc_w0=None): 

42 """ Return a new low-level dataset identifier """ 

43 

44 # Convert data to a C-contiguous ndarray 

45 if data is not None and not isinstance(data, Empty): 

46 data = array_for_new_object(data, specified_dtype=dtype) 

47 

48 # Validate shape 

49 if shape is None: 

50 if data is None: 

51 if dtype is None: 

52 raise TypeError("One of data, shape or dtype must be specified") 

53 data = Empty(dtype) 

54 shape = data.shape 

55 else: 

56 shape = (shape,) if isinstance(shape, int) else tuple(shape) 

57 if data is not None and (product(shape) != product(data.shape)): 

58 raise ValueError("Shape tuple is incompatible with data") 

59 

60 if isinstance(maxshape, int): 

61 maxshape = (maxshape,) 

62 tmp_shape = maxshape if maxshape is not None else shape 

63 

64 # Validate chunk shape 

65 if isinstance(chunks, int) and not isinstance(chunks, bool): 

66 chunks = (chunks,) 

67 if isinstance(chunks, tuple) and any( 

68 chunk > dim for dim, chunk in zip(tmp_shape, chunks) if dim is not None 

69 ): 

70 errmsg = "Chunk shape must not be greater than data shape in any dimension. "\ 

71 "{} is not compatible with {}".format(chunks, shape) 

72 raise ValueError(errmsg) 

73 

74 if isinstance(dtype, Datatype): 

75 # Named types are used as-is 

76 tid = dtype.id 

77 dtype = tid.dtype # Following code needs this 

78 else: 

79 # Validate dtype 

80 if dtype is None and data is None: 

81 dtype = numpy.dtype("=f4") 

82 elif dtype is None and data is not None: 

83 dtype = data.dtype 

84 else: 

85 dtype = numpy.dtype(dtype) 

86 tid = h5t.py_create(dtype, logical=1) 

87 

88 # Legacy 

89 if any((compression, shuffle, fletcher32, maxshape, scaleoffset)) and chunks is False: 

90 raise ValueError("Chunked format required for given storage options") 

91 

92 # Legacy 

93 if compression is True: 

94 if compression_opts is None: 

95 compression_opts = 4 

96 compression = 'gzip' 

97 

98 # Legacy 

99 if compression in _LEGACY_GZIP_COMPRESSION_VALS: 

100 if compression_opts is not None: 

101 raise TypeError("Conflict in compression options") 

102 compression_opts = compression 

103 compression = 'gzip' 

104 dcpl = filters.fill_dcpl( 

105 dcpl or h5p.create(h5p.DATASET_CREATE), shape, dtype, 

106 chunks, compression, compression_opts, shuffle, fletcher32, 

107 maxshape, scaleoffset, external, allow_unknown_filter) 

108 

109 if fillvalue is not None: 

110 # prepare string-type dtypes for fillvalue 

111 string_info = h5t.check_string_dtype(dtype) 

112 if string_info is not None: 

113 # fake vlen dtype for fixed len string fillvalue 

114 # to not trigger unwanted encoding 

115 dtype = h5t.string_dtype(string_info.encoding) 

116 fillvalue = numpy.array(fillvalue, dtype=dtype) 

117 else: 

118 fillvalue = numpy.array(fillvalue) 

119 dcpl.set_fill_value(fillvalue) 

120 

121 if track_times is None: 

122 # In case someone explicitly passes None for the default 

123 track_times = False 

124 if track_times in (True, False): 

125 dcpl.set_obj_track_times(track_times) 

126 else: 

127 raise TypeError("track_times must be either True or False") 

128 if track_order is True: 

129 dcpl.set_attr_creation_order( 

130 h5p.CRT_ORDER_TRACKED | h5p.CRT_ORDER_INDEXED) 

131 elif track_order is False: 

132 dcpl.set_attr_creation_order(0) 

133 elif track_order is not None: 

134 raise TypeError("track_order must be either True or False") 

135 

136 if maxshape is not None: 

137 maxshape = tuple(m if m is not None else h5s.UNLIMITED for m in maxshape) 

138 

139 if any([efile_prefix, virtual_prefix, rdcc_nbytes, rdcc_nslots, rdcc_w0]): 

140 dapl = dapl or h5p.create(h5p.DATASET_ACCESS) 

141 

142 if efile_prefix is not None: 

143 dapl.set_efile_prefix(efile_prefix) 

144 

145 if virtual_prefix is not None: 

146 dapl.set_virtual_prefix(virtual_prefix) 

147 

148 if rdcc_nbytes or rdcc_nslots or rdcc_w0: 

149 cache_settings = list(dapl.get_chunk_cache()) 

150 if rdcc_nslots is not None: 

151 cache_settings[0] = rdcc_nslots 

152 if rdcc_nbytes is not None: 

153 cache_settings[1] = rdcc_nbytes 

154 if rdcc_w0 is not None: 

155 cache_settings[2] = rdcc_w0 

156 dapl.set_chunk_cache(*cache_settings) 

157 

158 if isinstance(data, Empty): 

159 sid = h5s.create(h5s.NULL) 

160 else: 

161 sid = h5s.create_simple(shape, maxshape) 

162 

163 dset_id = h5d.create(parent.id, name, tid, sid, dcpl=dcpl, dapl=dapl) 

164 

165 if (data is not None) and (not isinstance(data, Empty)): 

166 dset_id.write(h5s.ALL, h5s.ALL, data) 

167 

168 return dset_id 

169 
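make_new_dset() is not usually called directly; it is the low-level worker behind Group.create_dataset(). A minimal usage sketch through that public entry point (the file and dataset names here are illustrative, not taken from this module):

    import h5py
    import numpy

    with h5py.File("example.h5", "w") as f:              # hypothetical file name
        dset = f.create_dataset(
            "measurements", shape=(1000, 3), dtype="f4",
            chunks=(100, 3),                              # chunked layout enables compression and resizing
            compression="gzip", compression_opts=4,       # a bare int 0-9 is also treated as gzip (legacy form)
            maxshape=(None, 3), fillvalue=0.0,
        )
        dset[:10] = numpy.arange(30, dtype="f4").reshape(10, 3)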

170 

171def open_dset(parent, name, dapl=None, efile_prefix=None, virtual_prefix=None, 

172 rdcc_nslots=None, rdcc_nbytes=None, rdcc_w0=None, **kwds): 

173 """ Return an existing low-level dataset identifier """ 

174 

175 if any([efile_prefix, virtual_prefix, rdcc_nbytes, rdcc_nslots, rdcc_w0]): 

176 dapl = dapl or h5p.create(h5p.DATASET_ACCESS) 

177 

178 if efile_prefix is not None: 

179 dapl.set_efile_prefix(efile_prefix) 

180 

181 if virtual_prefix is not None: 

182 dapl.set_virtual_prefix(virtual_prefix) 

183 

184 if rdcc_nbytes or rdcc_nslots or rdcc_w0: 

185 cache_settings = list(dapl.get_chunk_cache()) 

186 if rdcc_nslots is not None: 

187 cache_settings[0] = rdcc_nslots 

188 if rdcc_nbytes is not None: 

189 cache_settings[1] = rdcc_nbytes 

190 if rdcc_w0 is not None: 

191 cache_settings[2] = rdcc_w0 

192 dapl.set_chunk_cache(*cache_settings) 

193 

194 dset_id = h5d.open(parent.id, name, dapl=dapl) 

195 

196 return dset_id 

197 

198 

199class AstypeWrapper: 

200 """Wrapper to convert data on reading from a dataset. 

201 """ 

202 def __init__(self, dset, dtype): 

203 self._dset = dset 

204 self._dtype = numpy.dtype(dtype) 

205 

206 def __getitem__(self, args): 

207 return self._dset.__getitem__(args, new_dtype=self._dtype) 

208 

209 def __len__(self): 

210 """ Get the length of the underlying dataset 

211 

212 >>> length = len(dataset.astype('f8')) 

213 """ 

214 return len(self._dset) 

215 

216 def __array__(self, dtype=None): 

217 data = self[:] 

218 if dtype is not None: 

219 data = data.astype(dtype) 

220 return data 

221 

222 

223class AsStrWrapper: 

224 """Wrapper to decode strings on reading the dataset""" 

225 def __init__(self, dset, encoding, errors='strict'): 

226 self._dset = dset 

227 if encoding is None: 

228 encoding = h5t.check_string_dtype(dset.dtype).encoding 

229 self.encoding = encoding 

230 self.errors = errors 

231 

232 def __getitem__(self, args): 

233 bytes_arr = self._dset[args] 

234 # numpy.char.decode() seems like the obvious thing to use. But it only 

235 # accepts numpy string arrays, not object arrays of bytes (which we 

236 # return from HDF5 variable-length strings). And the numpy 

237 # implementation is not faster than doing it with a loop; in fact, by 

238 # not converting the result to a numpy unicode array, the 

239 # naive way can be faster! (Comparing with numpy 1.18.4, June 2020) 

240 if numpy.isscalar(bytes_arr): 

241 return bytes_arr.decode(self.encoding, self.errors) 

242 

243 return numpy.array([ 

244 b.decode(self.encoding, self.errors) for b in bytes_arr.flat 

245 ], dtype=object).reshape(bytes_arr.shape) 

246 

247 def __len__(self): 

248 """ Get the length of the underlying dataset 

249 

250 >>> length = len(dataset.asstr()) 

251 """ 

252 return len(self._dset) 

253 

254 def __array__(self): 

255 return numpy.array([ 

256 b.decode(self.encoding, self.errors) for b in self._dset 

257 ], dtype=object).reshape(self._dset.shape) 

258 

259 

260class FieldsWrapper: 

261 """Wrapper to extract named fields from a dataset with a struct dtype""" 

262 extract_field = None 

263 

264 def __init__(self, dset, prior_dtype, names): 

265 self._dset = dset 

266 if isinstance(names, str): 

267 self.extract_field = names 

268 names = [names] 

269 self.read_dtype = readtime_dtype(prior_dtype, names) 

270 

271 def __array__(self, dtype=None): 

272 data = self[:] 

273 if dtype is not None: 

274 data = data.astype(dtype) 

275 return data 

276 

277 def __getitem__(self, args): 

278 data = self._dset.__getitem__(args, new_dtype=self.read_dtype) 

279 if self.extract_field is not None: 

280 data = data[self.extract_field] 

281 return data 

282 

283 def __len__(self): 

284 """ Get the length of the underlying dataset 

285 

286 >>> length = len(dataset.fields(['x', 'y'])) 

287 """ 

288 return len(self._dset) 

289 

290 

291def readtime_dtype(basetype, names): 

292 """Make a NumPy compound dtype with a subset of available fields""" 

293 if basetype.names is None: # Names provided, but not compound 

294 raise ValueError("Field names only allowed for compound types") 

295 

296 for name in names: # Check all names are legal 

297 if name not in basetype.names: 

298 raise ValueError("Field %s does not appear in this type." % name) 

299 

300 return numpy.dtype([(name, basetype.fields[name][0]) for name in names]) 

301 
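For reference, a small sketch of what readtime_dtype() produces for a compound base dtype (field names are illustrative):

    import numpy

    base = numpy.dtype([("x", "f8"), ("y", "f8"), ("z", "i4")])
    readtime_dtype(base, ["x", "z"])            # dtype([('x', '<f8'), ('z', '<i4')])
    readtime_dtype(numpy.dtype("f8"), ["x"])    # ValueError: field names only allowed for compound types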

302 

303if MPI: 

304 class CollectiveContext: 

305 

306 """ Manages collective I/O in MPI mode """ 

307 

308 # We don't bother with _local as threads are forbidden in MPI mode 

309 

310 def __init__(self, dset): 

311 self._dset = dset 

312 

313 def __enter__(self): 

314 # pylint: disable=protected-access 

315 self._dset._dxpl.set_dxpl_mpio(h5fd.MPIO_COLLECTIVE) 

316 

317 def __exit__(self, *args): 

318 # pylint: disable=protected-access 

319 self._dset._dxpl.set_dxpl_mpio(h5fd.MPIO_INDEPENDENT) 

320 

321 
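CollectiveContext is exposed as Dataset.collective further down (only when h5py is built with MPI support). A hedged sketch of collective I/O, assuming a parallel HDF5 build and mpi4py; the file name is made up:

    from mpi4py import MPI
    import h5py

    comm = MPI.COMM_WORLD
    with h5py.File("parallel.h5", "w", driver="mpio", comm=comm) as f:
        dset = f.create_dataset("ranks", (comm.size,), dtype="i8")
        with dset.collective:            # switches the transfer property list to MPIO_COLLECTIVE
            dset[comm.rank] = comm.rank  # every rank must take part in the collective write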

322class ChunkIterator: 

323 """ 

324 Class to iterate through list of chunks of a given dataset 

325 """ 

326 def __init__(self, dset, source_sel=None): 

327 self._shape = dset.shape 

328 rank = len(dset.shape) 

329 

330 if not dset.chunks: 

331 # can only use with chunked datasets 

332 raise TypeError("Chunked dataset required") 

333 

334 self._layout = dset.chunks 

335 if source_sel is None: 

336 # select over entire dataset 

337 slices = [] 

338 for dim in range(rank): 

339 slices.append(slice(0, self._shape[dim])) 

340 self._sel = tuple(slices) 

341 else: 

342 if isinstance(source_sel, slice): 

343 self._sel = (source_sel,) 

344 else: 

345 self._sel = source_sel 

346 if len(self._sel) != rank: 

347 raise ValueError("Invalid selection - selection region must have same rank as dataset") 

348 self._chunk_index = [] 

349 for dim in range(rank): 

350 s = self._sel[dim] 

351 if s.start < 0 or s.stop > self._shape[dim] or s.stop <= s.start: 

352 raise ValueError("Invalid selection - selection region must be within dataset space") 

353 index = s.start // self._layout[dim] 

354 self._chunk_index.append(index) 

355 

356 def __iter__(self): 

357 return self 

358 

359 def __next__(self): 

360 rank = len(self._shape) 

361 slices = [] 

362 if rank == 0 or self._chunk_index[0] * self._layout[0] >= self._sel[0].stop: 

363 # ran past the last chunk, end iteration 

364 raise StopIteration() 

365 

366 for dim in range(rank): 

367 s = self._sel[dim] 

368 start = self._chunk_index[dim] * self._layout[dim] 

369 stop = (self._chunk_index[dim] + 1) * self._layout[dim] 

370 # adjust the start if this is an edge chunk 

371 if start < s.start: 

372 start = s.start 

373 if stop > s.stop: 

374 stop = s.stop # trim to end of the selection 

375 s = slice(start, stop, 1) 

376 slices.append(s) 

377 

378 # bump up the last index and carry forward if we run outside the selection 

379 dim = rank - 1 

380 while dim >= 0: 

381 s = self._sel[dim] 

382 self._chunk_index[dim] += 1 

383 

384 chunk_end = self._chunk_index[dim] * self._layout[dim] 

385 if chunk_end < s.stop: 

386 # we still have room to extend along this dimension 

387 return tuple(slices) 

388 

389 if dim > 0: 

390 # reset to the start and continue iterating with higher dimension 

391 self._chunk_index[dim] = 0 

392 dim -= 1 

393 return tuple(slices) 

394 

395 

396class Dataset(HLObject): 

397 

398 """ 

399 Represents an HDF5 dataset 

400 """ 

401 

402 def astype(self, dtype): 

403 """ Get a wrapper allowing you to perform reads to a 

404 different destination type, e.g.: 

405 

406 >>> double_precision = dataset.astype('f8')[0:100:2] 

407 """ 

408 return AstypeWrapper(self, dtype) 

409 

410 def asstr(self, encoding=None, errors='strict'): 

411 """Get a wrapper to read string data as Python strings: 

412 

413 >>> str_array = dataset.asstr()[:] 

414 

415 The parameters have the same meaning as in ``bytes.decode()``. 

416 If ``encoding`` is unspecified, it will use the encoding in the HDF5 

417 datatype (either ascii or utf-8). 

418 """ 

419 string_info = h5t.check_string_dtype(self.dtype) 

420 if string_info is None: 

421 raise TypeError( 

422 "dset.asstr() can only be used on datasets with " 

423 "an HDF5 string datatype" 

424 ) 

425 if encoding is None: 

426 encoding = string_info.encoding 

427 return AsStrWrapper(self, encoding, errors=errors) 

428 

429 def fields(self, names, *, _prior_dtype=None): 

430 """Get a wrapper to read a subset of fields from a compound data type: 

431 

432 >>> coords_2d = dataset.fields(['x', 'y'])[:] 

433 

434 If names is a string, a single field is extracted, and the resulting 

435 arrays will have that dtype. Otherwise, it should be an iterable, 

436 and the read data will have a compound dtype. 

437 """ 

438 if _prior_dtype is None: 

439 _prior_dtype = self.dtype 

440 return FieldsWrapper(self, _prior_dtype, names) 

441 

442 if MPI: 

443 @property 

444 @with_phil 

445 def collective(self): 

446 """ Context manager for MPI collective reads & writes """ 

447 return CollectiveContext(self) 

448 

449 @property 

450 def dims(self): 

451 """ Access dimension scales attached to this dataset. """ 

452 from .dims import DimensionManager 

453 with phil: 

454 return DimensionManager(self) 

455 

456 @property 

457 @with_phil 

458 def ndim(self): 

459 """Numpy-style attribute giving the number of dimensions""" 

460 return self.id.rank 

461 

462 @property 

463 def shape(self): 

464 """Numpy-style shape tuple giving dataset dimensions""" 

465 if 'shape' in self._cache_props: 

466 return self._cache_props['shape'] 

467 

468 with phil: 

469 shape = self.id.shape 

470 

471 # If the file is read-only, cache the shape to speed-up future uses. 

472 # This cache is invalidated by .refresh() when using SWMR. 

473 if self._readonly: 

474 self._cache_props['shape'] = shape 

475 return shape 

476 

477 @shape.setter 

478 @with_phil 

479 def shape(self, shape): 

480 # pylint: disable=missing-docstring 

481 self.resize(shape) 

482 

483 @property 

484 def size(self): 

485 """Numpy-style attribute giving the total dataset size""" 

486 if 'size' in self._cache_props: 

487 return self._cache_props['size'] 

488 

489 if self._is_empty: 

490 size = None 

491 else: 

492 size = product(self.shape) 

493 

494 # If the file is read-only, cache the size to speed-up future uses. 

495 # This cache is invalidated by .refresh() when using SWMR. 

496 if self._readonly: 

497 self._cache_props['size'] = size 

498 return size 

499 

500 @property 

501 def nbytes(self): 

502 """Numpy-style attribute giving the raw dataset size as the number of bytes""" 

503 size = self.size 

504 if size is None: # if we are an empty 0-D array, then there are no bytes in the dataset 

505 return 0 

506 return self.dtype.itemsize * size 

507 

508 @property 

509 def _selector(self): 

510 """Internal object for optimised selection of data""" 

511 if '_selector' in self._cache_props: 

512 return self._cache_props['_selector'] 

513 

514 slr = _selector.Selector(self.id.get_space()) 

515 

516 # If the file is read-only, cache the reader to speed up future uses. 

517 # This cache is invalidated by .refresh() when using SWMR. 

518 if self._readonly: 

519 self._cache_props['_selector'] = slr 

520 return slr 

521 

522 @property 

523 def _fast_reader(self): 

524 """Internal object for optimised reading of data""" 

525 if '_fast_reader' in self._cache_props: 

526 return self._cache_props['_fast_reader'] 

527 

528 rdr = _selector.Reader(self.id) 

529 

530 # If the file is read-only, cache the reader to speed up future uses. 

531 # This cache is invalidated by .refresh() when using SWMR. 

532 if self._readonly: 

533 self._cache_props['_fast_reader'] = rdr 

534 return rdr 

535 

536 @property 

537 @with_phil 

538 def dtype(self): 

539 """Numpy dtype representing the datatype""" 

540 return self.id.dtype 

541 

542 @property 

543 @with_phil 

544 def chunks(self): 

545 """Dataset chunks (or None)""" 

546 dcpl = self._dcpl 

547 if dcpl.get_layout() == h5d.CHUNKED: 

548 return dcpl.get_chunk() 

549 return None 

550 

551 @property 

552 @with_phil 

553 def compression(self): 

554 """Compression strategy (or None)""" 

555 for x in ('gzip','lzf','szip'): 

556 if x in self._filters: 

557 return x 

558 return None 

559 

560 @property 

561 @with_phil 

562 def compression_opts(self): 

563 """ Compression setting. Int(0-9) for gzip, 2-tuple for szip. """ 

564 return self._filters.get(self.compression, None) 

565 

566 @property 

567 @with_phil 

568 def shuffle(self): 

569 """Shuffle filter present (T/F)""" 

570 return 'shuffle' in self._filters 

571 

572 @property 

573 @with_phil 

574 def fletcher32(self): 

575 """Fletcher32 filter is present (T/F)""" 

576 return 'fletcher32' in self._filters 

577 

578 @property 

579 @with_phil 

580 def scaleoffset(self): 

581 """Scale/offset filter settings. For integer data types, this is 

582 the number of bits stored, or 0 for auto-detected. For floating 

583 point data types, this is the number of decimal places retained. 

584 If the scale/offset filter is not in use, this is None.""" 

585 try: 

586 return self._filters['scaleoffset'][1] 

587 except KeyError: 

588 return None 

589 

590 @property 

591 @with_phil 

592 def external(self): 

593 """External file settings. Returns a list of tuples of 

594 (name, offset, size) for each external file entry, or returns None 

595 if no external files are used.""" 

596 count = self._dcpl.get_external_count() 

597 if count<=0: 

598 return None 

599 ext_list = list() 

600 for x in range(count): 

601 (name, offset, size) = self._dcpl.get_external(x) 

602 ext_list.append( (filename_decode(name), offset, size) ) 

603 return ext_list 

604 

605 @property 

606 @with_phil 

607 def maxshape(self): 

608 """Shape up to which this dataset can be resized. Axes with value 

609 None have no resize limit. """ 

610 space = self.id.get_space() 

611 dims = space.get_simple_extent_dims(True) 

612 if dims is None: 

613 return None 

614 

615 return tuple(x if x != h5s.UNLIMITED else None for x in dims) 

616 

617 @property 

618 @with_phil 

619 def fillvalue(self): 

620 """Fill value for this dataset (0 by default)""" 

621 arr = numpy.zeros((1,), dtype=self.dtype) 

622 self._dcpl.get_fill_value(arr) 

623 return arr[0] 

624 

625 @cached_property 

626 @with_phil 

627 def _extent_type(self): 

628 """Get extent type for this dataset - SIMPLE, SCALAR or NULL""" 

629 return self.id.get_space().get_simple_extent_type() 

630 

631 @cached_property 

632 def _is_empty(self): 

633 """Check if extent type is empty""" 

634 return self._extent_type == h5s.NULL 

635 

636 @with_phil 

637 def __init__(self, bind, *, readonly=False): 

638 """ Create a new Dataset object by binding to a low-level DatasetID. 

639 """ 

640 if not isinstance(bind, h5d.DatasetID): 

641 raise ValueError("%s is not a DatasetID" % bind) 

642 super().__init__(bind) 

643 

644 self._dcpl = self.id.get_create_plist() 

645 self._dxpl = h5p.create(h5p.DATASET_XFER) 

646 self._filters = filters.get_filters(self._dcpl) 

647 self._readonly = readonly 

648 self._cache_props = {} 

649 

650 def resize(self, size, axis=None): 

651 """ Resize the dataset, or the specified axis. 

652 

653 The dataset must be stored in chunked format; it can be resized up to 

654 the "maximum shape" (keyword maxshape) specified at creation time. 

655 The rank of the dataset cannot be changed. 

656 

657 "Size" should be a shape tuple, or if an axis is specified, an integer. 

658 

659 BEWARE: This functions differently than the NumPy resize() method! 

660 The data is not "reshuffled" to fit in the new shape; each axis is 

661 grown or shrunk independently. The coordinates of existing data are 

662 fixed. 

663 """ 

664 with phil: 

665 if self.chunks is None: 

666 raise TypeError("Only chunked datasets can be resized") 

667 

668 if axis is not None: 

669 if not (axis >=0 and axis < self.id.rank): 

670 raise ValueError("Invalid axis (0 to %s allowed)" % (self.id.rank-1)) 

671 try: 

672 newlen = int(size) 

673 except TypeError: 

674 raise TypeError("Argument must be a single int if axis is specified") 

675 size = list(self.shape) 

676 size[axis] = newlen 

677 

678 size = tuple(size) 

679 self.id.set_extent(size) 

680 #h5f.flush(self.id) # THG recommends 

681 
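A sketch of resizing (the dataset name is illustrative); this only works when maxshape was given at creation time:

    dset = f.create_dataset("growable", shape=(0, 3), maxshape=(None, 3), chunks=(256, 3))
    dset.resize(100, axis=0)     # grow only the first axis
    dset.resize((50, 3))         # or pass a full shape tuple; data outside the new extent is lost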

682 @with_phil 

683 def __len__(self): 

684 """ The size of the first axis. TypeError if scalar. 

685 

686 Limited to 2**32 on 32-bit systems; Dataset.len() is preferred. 

687 """ 

688 size = self.len() 

689 if size > sys.maxsize: 

690 raise OverflowError("Value too big for Python's __len__; use Dataset.len() instead.") 

691 return size 

692 

693 def len(self): 

694 """ The size of the first axis. TypeError if scalar. 

695 

696 Use of this method is preferred to len(dset), as Python's built-in 

697 len() cannot handle values greater than 2**32 on 32-bit systems. 

698 """ 

699 with phil: 

700 shape = self.shape 

701 if len(shape) == 0: 

702 raise TypeError("Attempt to take len() of scalar dataset") 

703 return shape[0] 

704 

705 @with_phil 

706 def __iter__(self): 

707 """ Iterate over the first axis. TypeError if scalar. 

708 

709 BEWARE: Modifications to the yielded data are *NOT* written to file. 

710 """ 

711 shape = self.shape 

712 if len(shape) == 0: 

713 raise TypeError("Can't iterate over a scalar dataset") 

714 for i in range(shape[0]): 

715 yield self[i] 

716 

717 @with_phil 

718 def iter_chunks(self, sel=None): 

719 """ Return chunk iterator. If set, the sel argument is a slice or 

720 tuple of slices that defines the region to be used. If not set, the 

721 entire dataspace will be used for the iterator. 

722 

723 For each chunk within the given region, the iterator yields a tuple of 

724 slices that gives the intersection of the given chunk with the 

725 selection area. 

726 

727 A TypeError will be raised if the dataset is not chunked. 

728 

729 A ValueError will be raised if the selection region is invalid. 

730 

731 """ 

732 return ChunkIterator(self, sel) 

733 
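A small sketch of iter_chunks() (names are illustrative, and `import h5py` is assumed); each yielded tuple of slices can be fed straight back into the dataset:

    with h5py.File("data.h5", "r") as f:
        dset = f["chunked_data"]               # must be a chunked dataset, else TypeError
        total = 0.0
        for chunk_sel in dset.iter_chunks():   # e.g. (slice(0, 100, 1), slice(0, 3, 1))
            total += dset[chunk_sel].sum()     # read one chunk-aligned block at a time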

734 @cached_property 

735 def _fast_read_ok(self): 

736 """Is this dataset suitable for simple reading""" 

737 return ( 

738 self._extent_type == h5s.SIMPLE 

739 and isinstance(self.id.get_type(), (h5t.TypeIntegerID, h5t.TypeFloatID)) 

740 ) 

741 

742 @with_phil 

743 def __getitem__(self, args, new_dtype=None): 

744 """ Read a slice from the HDF5 dataset. 

745 

746 Takes slices and recarray-style field names (more than one is 

747 allowed!) in any order. Obeys basic NumPy rules, including 

748 broadcasting. 

749 

750 Also supports: 

751 

752 * Boolean "mask" array indexing 

753 """ 

754 args = args if isinstance(args, tuple) else (args,) 

755 

756 if self._fast_read_ok and (new_dtype is None): 

757 try: 

758 return self._fast_reader.read(args) 

759 except TypeError: 

760 pass # Fall back to Python read pathway below 

761 

762 if self._is_empty: 

763 # Check 'is Ellipsis' to avoid equality comparison with an array: 

764 # array equality returns an array, not a boolean. 

765 if args == () or (len(args) == 1 and args[0] is Ellipsis): 

766 return Empty(self.dtype) 

767 raise ValueError("Empty datasets cannot be sliced") 

768 

769 # Sort field names from the rest of the args. 

770 names = tuple(x for x in args if isinstance(x, str)) 

771 

772 if names: 

773 # Read a subset of the fields in this structured dtype 

774 if len(names) == 1: 

775 names = names[0] # Read with simpler dtype of this field 

776 args = tuple(x for x in args if not isinstance(x, str)) 

777 return self.fields(names, _prior_dtype=new_dtype)[args] 

778 

779 if new_dtype is None: 

780 new_dtype = self.dtype 

781 mtype = h5t.py_create(new_dtype) 

782 

783 # === Special-case region references ==== 

784 

785 if len(args) == 1 and isinstance(args[0], h5r.RegionReference): 

786 

787 obj = h5r.dereference(args[0], self.id) 

788 if obj != self.id: 

789 raise ValueError("Region reference must point to this dataset") 

790 

791 sid = h5r.get_region(args[0], self.id) 

792 mshape = sel.guess_shape(sid) 

793 if mshape is None: 

794 # 0D with no data (NULL or deselected SCALAR) 

795 return Empty(new_dtype) 

796 out = numpy.zeros(mshape, dtype=new_dtype) 

797 if out.size == 0: 

798 return out 

799 

800 sid_out = h5s.create_simple(mshape) 

801 sid_out.select_all() 

802 self.id.read(sid_out, sid, out, mtype) 

803 return out 

804 

805 # === Check for zero-sized datasets ===== 

806 

807 if self.size == 0: 

808 # Check 'is Ellipsis' to avoid equality comparison with an array: 

809 # array equality returns an array, not a boolean. 

810 if args == () or (len(args) == 1 and args[0] is Ellipsis): 

811 return numpy.zeros(self.shape, dtype=new_dtype) 

812 

813 # === Scalar dataspaces ================= 

814 

815 if self.shape == (): 

816 fspace = self.id.get_space() 

817 selection = sel2.select_read(fspace, args) 

818 if selection.mshape is None: 

819 arr = numpy.zeros((), dtype=new_dtype) 

820 else: 

821 arr = numpy.zeros(selection.mshape, dtype=new_dtype) 

822 for mspace, fspace in selection: 

823 self.id.read(mspace, fspace, arr, mtype) 

824 if selection.mshape is None: 

825 return arr[()] 

826 return arr 

827 

828 # === Everything else =================== 

829 

830 # Perform the dataspace selection. 

831 selection = sel.select(self.shape, args, dataset=self) 

832 

833 if selection.nselect == 0: 

834 return numpy.zeros(selection.array_shape, dtype=new_dtype) 

835 

836 arr = numpy.zeros(selection.array_shape, new_dtype, order='C') 

837 

838 # Perform the actual read 

839 mspace = h5s.create_simple(selection.mshape) 

840 fspace = selection.id 

841 self.id.read(mspace, fspace, arr, mtype, dxpl=self._dxpl) 

842 

843 # Patch up the output for NumPy 

844 if arr.shape == (): 

845 return arr[()] # 0 dim array -> numpy scalar 

846 return arr 

847 
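A few illustrative reads exercising the paths above (the 2-D `dset` and the compound `compound_dset`, with its field names, are hypothetical):

    sub = dset[10:20]                        # simple slicing; fast path for plain int/float dtypes
    scalar = dset[0, 0]                      # 0-d results come back as numpy scalars
    masked = dset[dset[...] > 0.5]           # boolean "mask" selection (the mask is built in memory first)
    xy = compound_dset["x", "y", 0:100]      # field names mixed with slices, routed through fields()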

848 @with_phil 

849 def __setitem__(self, args, val): 

850 """ Write to the HDF5 dataset from a Numpy array. 

851 

852 NumPy's broadcasting rules are honored, for "simple" indexing 

853 (slices and integers). For advanced indexing, the shapes must 

854 match. 

855 """ 

856 args = args if isinstance(args, tuple) else (args,) 

857 

858 # Sort field indices from the slicing 

859 names = tuple(x for x in args if isinstance(x, str)) 

860 args = tuple(x for x in args if not isinstance(x, str)) 

861 

862 # Generally we try to avoid converting the arrays on the Python 

863 # side. However, for compound literals this is unavoidable. 

864 vlen = h5t.check_vlen_dtype(self.dtype) 

865 if vlen is not None and vlen not in (bytes, str): 

866 try: 

867 val = numpy.asarray(val, dtype=vlen) 

868 except ValueError: 

869 try: 

870 val = numpy.array([numpy.array(x, dtype=vlen) 

871 for x in val], dtype=self.dtype) 

872 except ValueError: 

873 pass 

874 if vlen == val.dtype: 

875 if val.ndim > 1: 

876 tmp = numpy.empty(shape=val.shape[:-1], dtype=object) 

877 tmp.ravel()[:] = [i for i in val.reshape( 

878 (product(val.shape[:-1]), val.shape[-1]) 

879 )] 

880 else: 

881 tmp = numpy.array([None], dtype=object) 

882 tmp[0] = val 

883 val = tmp 

884 elif self.dtype.kind == "O" or \ 

885 (self.dtype.kind == 'V' and \ 

886 (not isinstance(val, numpy.ndarray) or val.dtype.kind != 'V') and \ 

887 (self.dtype.subdtype is None)): 

888 if len(names) == 1 and self.dtype.fields is not None: 

889 # Single field selected for write, from a non-array source 

890 if not names[0] in self.dtype.fields: 

891 raise ValueError("No such field for indexing: %s" % names[0]) 

892 dtype = self.dtype.fields[names[0]][0] 

893 cast_compound = True 

894 else: 

895 dtype = self.dtype 

896 cast_compound = False 

897 

898 val = numpy.asarray(val, dtype=dtype.base, order='C') 

899 if cast_compound: 

900 val = val.view(numpy.dtype([(names[0], dtype)])) 

901 val = val.reshape(val.shape[:len(val.shape) - len(dtype.shape)]) 

902 elif (self.dtype.kind == 'S' 

903 and (h5t.check_string_dtype(self.dtype).encoding == 'utf-8') 

904 and (find_item_type(val) is str) 

905 ): 

906 # Writing str objects to a fixed-length UTF-8 string dataset. 

907 # Numpy's normal conversion only handles ASCII characters, but 

908 # when the destination is UTF-8, we want to allow any unicode. 

909 # This *doesn't* handle numpy fixed-length unicode data ('U' dtype), 

910 # as HDF5 has no equivalent, and converting fixed length UTF-32 

911 # to variable length UTF-8 would obscure what's going on. 

912 str_array = numpy.asarray(val, order='C', dtype=object) 

913 val = numpy.array([ 

914 s.encode('utf-8') for s in str_array.flat 

915 ], dtype=self.dtype).reshape(str_array.shape) 

916 else: 

917 # If the input data is already an array, let HDF5 do the conversion. 

918 # If it's a list or similar, don't make numpy guess a dtype for it. 

919 dt = None if isinstance(val, numpy.ndarray) else self.dtype.base 

920 val = numpy.asarray(val, order='C', dtype=dt) 

921 

922 # Check for array dtype compatibility and convert 

923 if self.dtype.subdtype is not None: 

924 shp = self.dtype.subdtype[1] 

925 valshp = val.shape[-len(shp):] 

926 if valshp != shp: # Last dimension has to match 

927 raise TypeError("When writing to array types, last N dimensions have to match (got %s, but should be %s)" % (valshp, shp,)) 

928 mtype = h5t.py_create(numpy.dtype((val.dtype, shp))) 

929 mshape = val.shape[0:len(val.shape)-len(shp)] 

930 

931 # Make a compound memory type if field-name slicing is required 

932 elif len(names) != 0: 

933 

934 mshape = val.shape 

935 

936 # Catch common errors 

937 if self.dtype.fields is None: 

938 raise TypeError("Illegal slicing argument (not a compound dataset)") 

939 mismatch = [x for x in names if x not in self.dtype.fields] 

940 if len(mismatch) != 0: 

941 mismatch = ", ".join('"%s"'%x for x in mismatch) 

942 raise ValueError("Illegal slicing argument (fields %s not in dataset type)" % mismatch) 

943 

944 # Write non-compound source into a single dataset field 

945 if len(names) == 1 and val.dtype.fields is None: 

946 subtype = h5t.py_create(val.dtype) 

947 mtype = h5t.create(h5t.COMPOUND, subtype.get_size()) 

948 mtype.insert(self._e(names[0]), 0, subtype) 

949 

950 # Make a new source type keeping only the requested fields 

951 else: 

952 fieldnames = [x for x in val.dtype.names if x in names] # Keep source order 

953 mtype = h5t.create(h5t.COMPOUND, val.dtype.itemsize) 

954 for fieldname in fieldnames: 

955 subtype = h5t.py_create(val.dtype.fields[fieldname][0]) 

956 offset = val.dtype.fields[fieldname][1] 

957 mtype.insert(self._e(fieldname), offset, subtype) 

958 

959 # Use mtype derived from array (let DatasetID.write figure it out) 

960 else: 

961 mshape = val.shape 

962 mtype = None 

963 

964 # Perform the dataspace selection 

965 selection = sel.select(self.shape, args, dataset=self) 

966 

967 if selection.nselect == 0: 

968 return 

969 

970 # Broadcast scalars if necessary. 

971 # In order to avoid slow broadcasting filling the destination by 

972 # the scalar value, we create an intermediate array of the same 

973 # size as the destination buffer provided that size is reasonable. 

974 # We assume as reasonable a size smaller or equal as the used dataset 

975 # chunk size if any. 

976 # In case of dealing with a non-chunked destination dataset or with 

977 # a selection whose size is larger than the dataset chunk size we fall 

978 # back to using an intermediate array of size equal to the last dimension 

979 # of the destination buffer. 

980 # The reasoning behind is that it makes sense to assume the creator of 

981 # the dataset used an appropriate chunk size according the available 

982 # memory. In any case, if we cannot afford to create an intermediate 

983 # array of the same size as the dataset chunk size, the user program has 

984 # little hope to go much further. Solves h5py issue #1067 

985 if mshape == () and selection.array_shape != (): 

986 if self.dtype.subdtype is not None: 

987 raise TypeError("Scalar broadcasting is not supported for array dtypes") 

988 if self.chunks and (product(self.chunks) >= product(selection.array_shape)): 

989 val2 = numpy.empty(selection.array_shape, dtype=val.dtype) 

990 else: 

991 val2 = numpy.empty(selection.array_shape[-1], dtype=val.dtype) 

992 val2[...] = val 

993 val = val2 

994 mshape = val.shape 

995 

996 # Perform the write, with broadcasting 

997 mspace = h5s.create_simple(selection.expand_shape(mshape)) 

998 for fspace in selection.broadcast(mshape): 

999 self.id.write(mspace, fspace, val, mtype, dxpl=self._dxpl) 

1000 
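Illustrative writes (again with hypothetical names); scalar assignments are staged through the chunk-sized buffer described in the comments above:

    dset[0:10] = numpy.arange(10)                         # slice assignment with broadcasting
    dset[...] = 0.0                                       # scalar broadcast over the whole selection
    compound_dset["x", 0:5] = [1.0, 2.0, 3.0, 4.0, 5.0]   # write a single field of a compound dtype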

1001 def read_direct(self, dest, source_sel=None, dest_sel=None): 

1002 """ Read data directly from HDF5 into an existing NumPy array. 

1003 

1004 The destination array must be C-contiguous and writable. 

1005 Selections must be the output of numpy.s_[<args>]. 

1006 

1007 Broadcasting is supported for simple indexing. 

1008 """ 

1009 with phil: 

1010 if self._is_empty: 

1011 raise TypeError("Empty datasets have no numpy representation") 

1012 if source_sel is None: 

1013 source_sel = sel.SimpleSelection(self.shape) 

1014 else: 

1015 source_sel = sel.select(self.shape, source_sel, self) # for numpy.s_ 

1016 fspace = source_sel.id 

1017 

1018 if dest_sel is None: 

1019 dest_sel = sel.SimpleSelection(dest.shape) 

1020 else: 

1021 dest_sel = sel.select(dest.shape, dest_sel) 

1022 

1023 for mspace in dest_sel.broadcast(source_sel.array_shape): 

1024 self.id.read(mspace, fspace, dest, dxpl=self._dxpl) 

1025 
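A sketch of read_direct(); selections are built with numpy.s_ as the docstring says (array and dataset names are illustrative, `numpy` imported as above):

    out = numpy.empty((50, 3), dtype=dset.dtype)
    dset.read_direct(out, source_sel=numpy.s_[100:150, :], dest_sel=numpy.s_[0:50, :])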

1026 def write_direct(self, source, source_sel=None, dest_sel=None): 

1027 """ Write data directly to HDF5 from a NumPy array. 

1028 

1029 The source array must be C-contiguous. Selections must be 

1030 the output of numpy.s_[<args>]. 

1031 

1032 Broadcasting is supported for simple indexing. 

1033 """ 

1034 with phil: 

1035 if self._is_empty: 

1036 raise TypeError("Empty datasets cannot be written to") 

1037 if source_sel is None: 

1038 source_sel = sel.SimpleSelection(source.shape) 

1039 else: 

1040 source_sel = sel.select(source.shape, source_sel) # for numpy.s_ 

1041 mspace = source_sel.id 

1042 

1043 if dest_sel is None: 

1044 dest_sel = sel.SimpleSelection(self.shape) 

1045 else: 

1046 dest_sel = sel.select(self.shape, dest_sel, self) 

1047 

1048 for fspace in dest_sel.broadcast(source_sel.array_shape): 

1049 self.id.write(mspace, fspace, source, dxpl=self._dxpl) 

1050 
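And the mirror image for write_direct(), pushing part of an existing array into the file without an intermediate copy:

    buf = numpy.ascontiguousarray(numpy.random.rand(50, 3))
    dset.write_direct(buf, source_sel=numpy.s_[0:25, :], dest_sel=numpy.s_[100:125, :])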

1051 @with_phil 

1052 def __array__(self, dtype=None): 

1053 """ Create a Numpy array containing the whole dataset. DON'T THINK 

1054 THIS MEANS DATASETS ARE INTERCHANGEABLE WITH ARRAYS. For one thing, 

1055 you have to read the whole dataset every time this method is called. 

1056 """ 

1057 arr = numpy.zeros(self.shape, dtype=self.dtype if dtype is None else dtype) 

1058 

1059 # Special case for (0,)*-shape datasets 

1060 if self.size == 0: 

1061 return arr 

1062 

1063 self.read_direct(arr) 

1064 return arr 

1065 

1066 @with_phil 

1067 def __repr__(self): 

1068 if not self: 

1069 r = '<Closed HDF5 dataset>' 

1070 else: 

1071 if self.name is None: 

1072 namestr = '("anonymous")' 

1073 else: 

1074 name = pp.basename(pp.normpath(self.name)) 

1075 namestr = '"%s"' % (name if name != '' else '/') 

1076 r = '<HDF5 dataset %s: shape %s, type "%s">' % ( 

1077 namestr, self.shape, self.dtype.str 

1078 ) 

1079 return r 

1080 

1081 if hasattr(h5d.DatasetID, "refresh"): 

1082 @with_phil 

1083 def refresh(self): 

1084 """ Refresh the dataset metadata by reloading from the file. 

1085 

1086 This is part of the SWMR features and only exists when the HDF5 

1087 library version is >= 1.9.178. 

1088 """ 

1089 self._id.refresh() 

1090 self._cache_props.clear() 

1091 

1092 if hasattr(h5d.DatasetID, "flush"): 

1093 @with_phil 

1094 def flush(self): 

1095 """ Flush the dataset data and metadata to the file. 

1096 If the dataset is chunked, raw data chunks are written to the file. 

1097 

1098 This is part of the SWMR features and only exists when the HDF5 

1099 library version is >= 1.9.178. 

1100 """ 

1101 self._id.flush() 

1102 

1103 if vds_support: 

1104 @property 

1105 @with_phil 

1106 def is_virtual(self): 

1107 """Check if this is a virtual dataset""" 

1108 return self._dcpl.get_layout() == h5d.VIRTUAL 

1109 

1110 @with_phil 

1111 def virtual_sources(self): 

1112 """Get a list of the data mappings for a virtual dataset""" 

1113 if not self.is_virtual: 

1114 raise RuntimeError("Not a virtual dataset") 

1115 dcpl = self._dcpl 

1116 return [ 

1117 VDSmap(dcpl.get_virtual_vspace(j), 

1118 dcpl.get_virtual_filename(j), 

1119 dcpl.get_virtual_dsetname(j), 

1120 dcpl.get_virtual_srcspace(j)) 

1121 for j in range(dcpl.get_virtual_count())] 

1122 
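Virtual datasets are normally assembled with the higher-level h5py.VirtualLayout / h5py.VirtualSource API rather than by touching the dcpl directly; a hedged sketch (file and dataset names are made up):

    layout = h5py.VirtualLayout(shape=(4, 100), dtype="f4")
    for i in range(4):
        layout[i] = h5py.VirtualSource(f"piece{i}.h5", "data", shape=(100,))
    with h5py.File("combined.h5", "w") as f:
        vds = f.create_virtual_dataset("vds", layout, fillvalue=-1)
        vds.is_virtual           # True
        vds.virtual_sources()    # list of VDSmap(vspace, file_name, dset_name, src_space)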

1123 @with_phil 

1124 def make_scale(self, name=''): 

1125 """Make this dataset an HDF5 dimension scale. 

1126 

1127 You can then attach it to dimensions of other datasets like this:: 

1128 

1129 other_ds.dims[0].attach_scale(ds) 

1130 

1131 You can optionally pass a name to associate with this scale. 

1132 """ 

1133 h5ds.set_scale(self._id, self._e(name)) 

1134 
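A short dimension-scale sketch following the docstring above (names are illustrative, `f` is an open writable file):

    time = f.create_dataset("time", data=numpy.arange(100.0))
    time.make_scale("time")
    signal = f.create_dataset("signal", shape=(100, 8), dtype="f4")
    signal.dims[0].attach_scale(time)
    signal.dims[0].label = "time"    # optional axis label via the dims interface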

1135 @property 

1136 @with_phil 

1137 def is_scale(self): 

1138 """Return ``True`` if this dataset is also a dimension scale. 

1139 

1140 Return ``False`` otherwise. 

1141 """ 

1142 return h5ds.is_scale(self._id)