Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/io/pytables.py: 19%

2276 statements  

1""" 

2High level interface to PyTables for reading and writing pandas data structures 

3to disk 

4""" 

5from __future__ import annotations 

6 

7from contextlib import suppress 

8import copy 

9from datetime import ( 

10 date, 

11 tzinfo, 

12) 

13import itertools 

14import os 

15import re 

16from textwrap import dedent 

17from typing import ( 

18 TYPE_CHECKING, 

19 Any, 

20 Callable, 

21 Final, 

22 Literal, 

23 cast, 

24 overload, 

25) 

26import warnings 

27 

28import numpy as np 

29 

30from pandas._config import ( 

31 config, 

32 get_option, 

33 using_copy_on_write, 

34 using_pyarrow_string_dtype, 

35) 

36 

37from pandas._libs import ( 

38 lib, 

39 writers as libwriters, 

40) 

41from pandas._libs.lib import is_string_array 

42from pandas._libs.tslibs import timezones 

43from pandas.compat._optional import import_optional_dependency 

44from pandas.compat.pickle_compat import patch_pickle 

45from pandas.errors import ( 

46 AttributeConflictWarning, 

47 ClosedFileError, 

48 IncompatibilityWarning, 

49 PerformanceWarning, 

50 PossibleDataLossError, 

51) 

52from pandas.util._decorators import cache_readonly 

53from pandas.util._exceptions import find_stack_level 

54 

55from pandas.core.dtypes.common import ( 

56 ensure_object, 

57 is_bool_dtype, 

58 is_complex_dtype, 

59 is_list_like, 

60 is_string_dtype, 

61 needs_i8_conversion, 

62) 

63from pandas.core.dtypes.dtypes import ( 

64 CategoricalDtype, 

65 DatetimeTZDtype, 

66 ExtensionDtype, 

67 PeriodDtype, 

68) 

69from pandas.core.dtypes.missing import array_equivalent 

70 

71from pandas import ( 

72 DataFrame, 

73 DatetimeIndex, 

74 Index, 

75 MultiIndex, 

76 PeriodIndex, 

77 RangeIndex, 

78 Series, 

79 TimedeltaIndex, 

80 concat, 

81 isna, 

82) 

83from pandas.core.arrays import ( 

84 Categorical, 

85 DatetimeArray, 

86 PeriodArray, 

87) 

88import pandas.core.common as com 

89from pandas.core.computation.pytables import ( 

90 PyTablesExpr, 

91 maybe_expression, 

92) 

93from pandas.core.construction import extract_array 

94from pandas.core.indexes.api import ensure_index 

95from pandas.core.internals import ( 

96 ArrayManager, 

97 BlockManager, 

98) 

99 

100from pandas.io.common import stringify_path 

101from pandas.io.formats.printing import ( 

102 adjoin, 

103 pprint_thing, 

104) 

105 

106if TYPE_CHECKING: 

107 from collections.abc import ( 

108 Hashable, 

109 Iterator, 

110 Sequence, 

111 ) 

112 from types import TracebackType 

113 

114 from tables import ( 

115 Col, 

116 File, 

117 Node, 

118 ) 

119 

120 from pandas._typing import ( 

121 AnyArrayLike, 

122 ArrayLike, 

123 AxisInt, 

124 DtypeArg, 

125 FilePath, 

126 Self, 

127 Shape, 

128 npt, 

129 ) 

130 

131 from pandas.core.internals import Block 

132 

133# versioning attribute 

134_version = "0.15.2" 

135 

136# encoding 

137_default_encoding = "UTF-8" 

138 

139 

140def _ensure_decoded(s): 

141 """if we have bytes, decode them to unicode""" 

142 if isinstance(s, np.bytes_): 

143 s = s.decode("UTF-8") 

144 return s 

145 

146 

147def _ensure_encoding(encoding: str | None) -> str: 

148 # set the encoding if we need 

149 if encoding is None: 

150 encoding = _default_encoding 

151 

152 return encoding 

153 

154 

155def _ensure_str(name): 

156 """ 

157 Ensure that an index / column name is a str (python 3); otherwise it 

158 may be np.string dtype. Non-string dtypes are passed through unchanged. 

159 

160 https://github.com/pandas-dev/pandas/issues/13492 

161 """ 

162 if isinstance(name, str): 

163 name = str(name) 

164 return name 

165 

166 

167Term = PyTablesExpr 

168 

169 

170def _ensure_term(where, scope_level: int): 

171 """ 

172 Ensure that the where is a Term or a list of Term. 

173 

174 This makes sure that we are capturing the scope of the variables that are 

175 passed; the terms are created here with a frame_level=2 (we are 2 levels down). 

176 """ 

177 # only consider list/tuple here as an ndarray is automatically a coordinate 

178 # list 

179 level = scope_level + 1 

180 if isinstance(where, (list, tuple)): 

181 where = [ 

182 Term(term, scope_level=level + 1) if maybe_expression(term) else term 

183 for term in where 

184 if term is not None 

185 ] 

186 elif maybe_expression(where): 

187 where = Term(where, scope_level=level) 

188 return where if where is None or len(where) else None 
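
A minimal sketch (not part of this module) of how a plain string criterion ends up wrapped as a Term / PyTablesExpr; the file name "demo.h5" and column names are made up for illustration:

    import pandas as pd

    df = pd.DataFrame({"A": range(5), "B": list("abcde")})
    with pd.HDFStore("demo.h5", mode="w") as store:
        store.append("df", df, data_columns=["B"])
        # the string below is converted into a Term internally before querying
        subset = store.select("df", where="B == 'c'")
    print(subset)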

189 

190 

191incompatibility_doc: Final = """ 

192where criteria is being ignored as this version [%s] is too old (or 

193not-defined), read the file in and write it out to a new file to upgrade (with 

194the copy_to method) 

195""" 

196 

197attribute_conflict_doc: Final = """ 

198the [%s] attribute of the existing index is [%s] which conflicts with the new 

199[%s], resetting the attribute to None 

200""" 

201 

202performance_doc: Final = """ 

203your performance may suffer as PyTables will pickle object types that it cannot 

204map directly to c-types [inferred_type->%s,key->%s] [items->%s] 

205""" 

206 

207# formats 

208_FORMAT_MAP = {"f": "fixed", "fixed": "fixed", "t": "table", "table": "table"} 

209 

210# axes map 

211_AXES_MAP = {DataFrame: [0]} 

212 

213# register our configuration options 

214dropna_doc: Final = """ 

215: boolean 

216 drop ALL nan rows when appending to a table 

217""" 

218format_doc: Final = """ 

219: format 

220 default format writing format, if None, then 

221 put will default to 'fixed' and append will default to 'table' 

222""" 

223 

224with config.config_prefix("io.hdf"): 

225 config.register_option("dropna_table", False, dropna_doc, validator=config.is_bool) 

226 config.register_option( 

227 "default_format", 

228 None, 

229 format_doc, 

230 validator=config.is_one_of_factory(["fixed", "table", None]), 

231 ) 
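
For context, a short sketch (not part of this module) of how the options registered above are driven from user code through the normal pandas option machinery:

    import pandas as pd

    pd.set_option("io.hdf.default_format", "table")  # put/append now default to 'table'
    pd.set_option("io.hdf.dropna_table", True)       # drop all-NaN rows when appending
    print(pd.get_option("io.hdf.default_format"))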

232 

233# oh the troubles to reduce import time 

234_table_mod = None 

235_table_file_open_policy_is_strict = False 

236 

237 

238def _tables(): 

239 global _table_mod 

240 global _table_file_open_policy_is_strict 

241 if _table_mod is None: 

242 import tables 

243 

244 _table_mod = tables 

245 

246 # set the file open policy 

247 # return the file open policy; this changes as of pytables 3.1 

248 # depending on the HDF5 version 

249 with suppress(AttributeError): 

250 _table_file_open_policy_is_strict = ( 

251 tables.file._FILE_OPEN_POLICY == "strict" 

252 ) 

253 

254 return _table_mod 

255 

256 

257# interface to/from ### 

258 

259 

260def to_hdf( 

261 path_or_buf: FilePath | HDFStore, 

262 key: str, 

263 value: DataFrame | Series, 

264 mode: str = "a", 

265 complevel: int | None = None, 

266 complib: str | None = None, 

267 append: bool = False, 

268 format: str | None = None, 

269 index: bool = True, 

270 min_itemsize: int | dict[str, int] | None = None, 

271 nan_rep=None, 

272 dropna: bool | None = None, 

273 data_columns: Literal[True] | list[str] | None = None, 

274 errors: str = "strict", 

275 encoding: str = "UTF-8", 

276) -> None: 

277 """store this object, close it if we opened it""" 

278 if append: 

279 f = lambda store: store.append( 

280 key, 

281 value, 

282 format=format, 

283 index=index, 

284 min_itemsize=min_itemsize, 

285 nan_rep=nan_rep, 

286 dropna=dropna, 

287 data_columns=data_columns, 

288 errors=errors, 

289 encoding=encoding, 

290 ) 

291 else: 

292 # NB: dropna is not passed to `put` 

293 f = lambda store: store.put( 

294 key, 

295 value, 

296 format=format, 

297 index=index, 

298 min_itemsize=min_itemsize, 

299 nan_rep=nan_rep, 

300 data_columns=data_columns, 

301 errors=errors, 

302 encoding=encoding, 

303 dropna=dropna, 

304 ) 

305 

306 path_or_buf = stringify_path(path_or_buf) 

307 if isinstance(path_or_buf, str): 

308 with HDFStore( 

309 path_or_buf, mode=mode, complevel=complevel, complib=complib 

310 ) as store: 

311 f(store) 

312 else: 

313 f(path_or_buf) 
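
A minimal usage sketch of the two code paths above (append vs. put), assuming a throwaway file name "store.h5":

    import pandas as pd

    df = pd.DataFrame({"x": [1, 2, 3]})
    # append=False -> HDFStore.put (fixed format by default)
    df.to_hdf("store.h5", key="fixed_df", mode="w")
    # append=True -> HDFStore.append (requires table format)
    df.to_hdf("store.h5", key="table_df", append=True, format="table")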

314 

315 

316def read_hdf( 

317 path_or_buf: FilePath | HDFStore, 

318 key=None, 

319 mode: str = "r", 

320 errors: str = "strict", 

321 where: str | list | None = None, 

322 start: int | None = None, 

323 stop: int | None = None, 

324 columns: list[str] | None = None, 

325 iterator: bool = False, 

326 chunksize: int | None = None, 

327 **kwargs, 

328): 

329 """ 

330 Read from the store, close it if we opened it. 

331 

332 Retrieve pandas object stored in file, optionally based on where 

333 criteria. 

334 

335 .. warning:: 

336 

337 Pandas uses PyTables for reading and writing HDF5 files, which allows 

338 serializing object-dtype data with pickle when using the "fixed" format. 

339 Loading pickled data received from untrusted sources can be unsafe. 

340 

341 See: https://docs.python.org/3/library/pickle.html for more. 

342 

343 Parameters 

344 ---------- 

345 path_or_buf : str, path object, pandas.HDFStore 

346 Any valid string path is acceptable. Only the local file system is 

347 supported; remote URLs and file-like objects are not. 

348 

349 If you want to pass in a path object, pandas accepts any 

350 ``os.PathLike``. 

351 

352 Alternatively, pandas accepts an open :class:`pandas.HDFStore` object. 

353 

354 key : object, optional 

355 The group identifier in the store. Can be omitted if the HDF file 

356 contains a single pandas object. 

357 mode : {'r', 'r+', 'a'}, default 'r' 

358 Mode to use when opening the file. Ignored if path_or_buf is a 

359 :class:`pandas.HDFStore`. Default is 'r'. 

360 errors : str, default 'strict' 

361 Specifies how encoding and decoding errors are to be handled. 

362 See the errors argument for :func:`open` for a full list 

363 of options. 

364 where : list, optional 

365 A list of Term (or convertible) objects. 

366 start : int, optional 

367 Row number to start selection. 

368 stop : int, optional 

369 Row number to stop selection. 

370 columns : list, optional 

371 A list of columns names to return. 

372 iterator : bool, optional 

373 Return an iterator object. 

374 chunksize : int, optional 

375 Number of rows to include in an iteration when using an iterator. 

376 **kwargs 

377 Additional keyword arguments passed to HDFStore. 

378 

379 Returns 

380 ------- 

381 object 

382 The selected object. Return type depends on the object stored. 

383 

384 See Also 

385 -------- 

386 DataFrame.to_hdf : Write a HDF file from a DataFrame. 

387 HDFStore : Low-level access to HDF files. 

388 

389 Examples 

390 -------- 

391 >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z']) # doctest: +SKIP 

392 >>> df.to_hdf('./store.h5', 'data') # doctest: +SKIP 

393 >>> reread = pd.read_hdf('./store.h5') # doctest: +SKIP 

394 """ 

395 if mode not in ["r", "r+", "a"]: 

396 raise ValueError( 

397 f"mode {mode} is not allowed while performing a read. " 

398 f"Allowed modes are r, r+ and a." 

399 ) 

400 # grab the scope 

401 if where is not None: 

402 where = _ensure_term(where, scope_level=1) 

403 

404 if isinstance(path_or_buf, HDFStore): 

405 if not path_or_buf.is_open: 

406 raise OSError("The HDFStore must be open for reading.") 

407 

408 store = path_or_buf 

409 auto_close = False 

410 else: 

411 path_or_buf = stringify_path(path_or_buf) 

412 if not isinstance(path_or_buf, str): 

413 raise NotImplementedError( 

414 "Support for generic buffers has not been implemented." 

415 ) 

416 try: 

417 exists = os.path.exists(path_or_buf) 

418 

419 # if filepath is too long 

420 except (TypeError, ValueError): 

421 exists = False 

422 

423 if not exists: 

424 raise FileNotFoundError(f"File {path_or_buf} does not exist") 

425 

426 store = HDFStore(path_or_buf, mode=mode, errors=errors, **kwargs) 

427 # can't auto open/close if we are using an iterator 

428 # so delegate to the iterator 

429 auto_close = True 

430 

431 try: 

432 if key is None: 

433 groups = store.groups() 

434 if len(groups) == 0: 

435 raise ValueError( 

436 "Dataset(s) incompatible with Pandas data types, " 

437 "not table, or no datasets found in HDF5 file." 

438 ) 

439 candidate_only_group = groups[0] 

440 

441 # For the HDF file to have only one dataset, all other groups 

442 # should then be metadata groups for that candidate group. (This 

443 # assumes that the groups() method enumerates parent groups 

444 # before their children.) 

445 for group_to_check in groups[1:]: 

446 if not _is_metadata_of(group_to_check, candidate_only_group): 

447 raise ValueError( 

448 "key must be provided when HDF5 " 

449 "file contains multiple datasets." 

450 ) 

451 key = candidate_only_group._v_pathname 

452 return store.select( 

453 key, 

454 where=where, 

455 start=start, 

456 stop=stop, 

457 columns=columns, 

458 iterator=iterator, 

459 chunksize=chunksize, 

460 auto_close=auto_close, 

461 ) 

462 except (ValueError, TypeError, LookupError): 

463 if not isinstance(path_or_buf, HDFStore): 

464 # if there is an error, close the store if we opened it. 

465 with suppress(AttributeError): 

466 store.close() 

467 

468 raise 
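
A hedged example of the key / where / columns handling described above; the file name and column names are illustrative:

    import pandas as pd

    df = pd.DataFrame({"A": range(10), "B": range(10)})
    df.to_hdf("data.h5", key="df", mode="w", format="table", data_columns=True)
    # key may be omitted because the file holds a single pandas object
    full = pd.read_hdf("data.h5")
    # where criteria only work against table-format stores
    part = pd.read_hdf("data.h5", "df", where="A > 5", columns=["B"])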

469 

470 

471def _is_metadata_of(group: Node, parent_group: Node) -> bool: 

472 """Check if a given group is a metadata group for a given parent_group.""" 

473 if group._v_depth <= parent_group._v_depth: 

474 return False 

475 

476 current = group 

477 while current._v_depth > 1: 

478 parent = current._v_parent 

479 if parent == parent_group and current._v_name == "meta": 

480 return True 

481 current = current._v_parent 

482 return False 

483 

484 

485class HDFStore: 

486 """ 

487 Dict-like IO interface for storing pandas objects in PyTables. 

488 

489 Either Fixed or Table format. 

490 

491 .. warning:: 

492 

493 Pandas uses PyTables for reading and writing HDF5 files, which allows 

494 serializing object-dtype data with pickle when using the "fixed" format. 

495 Loading pickled data received from untrusted sources can be unsafe. 

496 

497 See: https://docs.python.org/3/library/pickle.html for more. 

498 

499 Parameters 

500 ---------- 

501 path : str 

502 File path to HDF5 file. 

503 mode : {'a', 'w', 'r', 'r+'}, default 'a' 

504 

505 ``'r'`` 

506 Read-only; no data can be modified. 

507 ``'w'`` 

508 Write; a new file is created (an existing file with the same 

509 name would be deleted). 

510 ``'a'`` 

511 Append; an existing file is opened for reading and writing, 

512 and if the file does not exist it is created. 

513 ``'r+'`` 

514 It is similar to ``'a'``, but the file must already exist. 

515 complevel : int, 0-9, default None 

516 Specifies a compression level for data. 

517 A value of 0 or None disables compression. 

518 complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib' 

519 Specifies the compression library to be used. 

520 These additional compressors for Blosc are supported 

521 (default if no compressor specified: 'blosc:blosclz'): 

522 {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', 

523 'blosc:zlib', 'blosc:zstd'}. 

524 Specifying a compression library which is not available raises 

525 a ValueError. 

526 fletcher32 : bool, default False 

527 If applying compression use the fletcher32 checksum. 

528 **kwargs 

529 These parameters will be passed to the PyTables open_file method. 

530 

531 Examples 

532 -------- 

533 >>> bar = pd.DataFrame(np.random.randn(10, 4)) 

534 >>> store = pd.HDFStore('test.h5') 

535 >>> store['foo'] = bar # write to HDF5 

536 >>> bar = store['foo'] # retrieve 

537 >>> store.close() 

538 

539 **Create or load HDF5 file in-memory** 

540 

541 When passing the `driver` option to the PyTables open_file method through 

542 **kwargs, the HDF5 file is loaded or created in-memory and will only be 

543 written when closed: 

544 

545 >>> bar = pd.DataFrame(np.random.randn(10, 4)) 

546 >>> store = pd.HDFStore('test.h5', driver='H5FD_CORE') 

547 >>> store['foo'] = bar 

548 >>> store.close() # only now, data is written to disk 

549 """ 

550 

551 _handle: File | None 

552 _mode: str 

553 

554 def __init__( 

555 self, 

556 path, 

557 mode: str = "a", 

558 complevel: int | None = None, 

559 complib=None, 

560 fletcher32: bool = False, 

561 **kwargs, 

562 ) -> None: 

563 if "format" in kwargs: 

564 raise ValueError("format is not a defined argument for HDFStore") 

565 

566 tables = import_optional_dependency("tables") 

567 

568 if complib is not None and complib not in tables.filters.all_complibs: 

569 raise ValueError( 

570 f"complib only supports {tables.filters.all_complibs} compression." 

571 ) 

572 

573 if complib is None and complevel is not None: 

574 complib = tables.filters.default_complib 

575 

576 self._path = stringify_path(path) 

577 if mode is None: 

578 mode = "a" 

579 self._mode = mode 

580 self._handle = None 

581 self._complevel = complevel if complevel else 0 

582 self._complib = complib 

583 self._fletcher32 = fletcher32 

584 self._filters = None 

585 self.open(mode=mode, **kwargs) 

586 

587 def __fspath__(self) -> str: 

588 return self._path 

589 

590 @property 

591 def root(self): 

592 """return the root node""" 

593 self._check_if_open() 

594 assert self._handle is not None # for mypy 

595 return self._handle.root 

596 

597 @property 

598 def filename(self) -> str: 

599 return self._path 

600 

601 def __getitem__(self, key: str): 

602 return self.get(key) 

603 

604 def __setitem__(self, key: str, value) -> None: 

605 self.put(key, value) 

606 

607 def __delitem__(self, key: str) -> None: 

608 return self.remove(key) 

609 

610 def __getattr__(self, name: str): 

611 """allow attribute access to get stores""" 

612 try: 

613 return self.get(name) 

614 except (KeyError, ClosedFileError): 

615 pass 

616 raise AttributeError( 

617 f"'{type(self).__name__}' object has no attribute '{name}'" 

618 ) 

619 

620 def __contains__(self, key: str) -> bool: 

621 """ 

622 check for existence of this key 

623 can match the exact pathname or the pathname w/o the leading '/' 

624 """ 

625 node = self.get_node(key) 

626 if node is not None: 

627 name = node._v_pathname 

628 if key in (name, name[1:]): 

629 return True 

630 return False 

631 

632 def __len__(self) -> int: 

633 return len(self.groups()) 

634 

635 def __repr__(self) -> str: 

636 pstr = pprint_thing(self._path) 

637 return f"{type(self)}\nFile path: {pstr}\n" 

638 

639 def __enter__(self) -> Self: 

640 return self 

641 

642 def __exit__( 

643 self, 

644 exc_type: type[BaseException] | None, 

645 exc_value: BaseException | None, 

646 traceback: TracebackType | None, 

647 ) -> None: 

648 self.close() 

649 

650 def keys(self, include: str = "pandas") -> list[str]: 

651 """ 

652 Return a list of keys corresponding to objects stored in HDFStore. 

653 

654 Parameters 

655 ---------- 

656 

657 include : str, default 'pandas' 

658 When include equals 'pandas' return pandas objects. 

659 When include equals 'native' return native HDF5 Table objects. 

660 

661 Returns 

662 ------- 

663 list 

664 List of ABSOLUTE path-names (e.g. have the leading '/'). 

665 

666 Raises 

667 ------ 

668 raises ValueError if include has an illegal value 

669 

670 Examples 

671 -------- 

672 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) 

673 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP 

674 >>> store.put('data', df) # doctest: +SKIP 

675 >>> store.get('data') # doctest: +SKIP 

676 >>> print(store.keys()) # doctest: +SKIP 

677 ['/data1', '/data2'] 

678 >>> store.close() # doctest: +SKIP 

679 """ 

680 if include == "pandas": 

681 return [n._v_pathname for n in self.groups()] 

682 

683 elif include == "native": 

684 assert self._handle is not None # mypy 

685 return [ 

686 n._v_pathname for n in self._handle.walk_nodes("/", classname="Table") 

687 ] 

688 raise ValueError( 

689 f"`include` should be either 'pandas' or 'native' but is '{include}'" 

690 ) 

691 

692 def __iter__(self) -> Iterator[str]: 

693 return iter(self.keys()) 

694 

695 def items(self) -> Iterator[tuple[str, list]]: 

696 """ 

697 iterate on key->group 

698 """ 

699 for g in self.groups(): 

700 yield g._v_pathname, g 

701 

702 def open(self, mode: str = "a", **kwargs) -> None: 

703 """ 

704 Open the file in the specified mode 

705 

706 Parameters 

707 ---------- 

708 mode : {'a', 'w', 'r', 'r+'}, default 'a' 

709 See HDFStore docstring or tables.open_file for info about modes 

710 **kwargs 

711 These parameters will be passed to the PyTables open_file method. 

712 """ 

713 tables = _tables() 

714 

715 if self._mode != mode: 

716 # if we are changing a write mode to read, ok 

717 if self._mode in ["a", "w"] and mode in ["r", "r+"]: 

718 pass 

719 elif mode in ["w"]: 

720 # this would truncate, raise here 

721 if self.is_open: 

722 raise PossibleDataLossError( 

723 f"Re-opening the file [{self._path}] with mode [{self._mode}] " 

724 "will delete the current file!" 

725 ) 

726 

727 self._mode = mode 

728 

729 # close and reopen the handle 

730 if self.is_open: 

731 self.close() 

732 

733 if self._complevel and self._complevel > 0: 

734 self._filters = _tables().Filters( 

735 self._complevel, self._complib, fletcher32=self._fletcher32 

736 ) 

737 

738 if _table_file_open_policy_is_strict and self.is_open: 

739 msg = ( 

740 "Cannot open HDF5 file, which is already opened, " 

741 "even in read-only mode." 

742 ) 

743 raise ValueError(msg) 

744 

745 self._handle = tables.open_file(self._path, self._mode, **kwargs) 

746 

747 def close(self) -> None: 

748 """ 

749 Close the PyTables file handle 

750 """ 

751 if self._handle is not None: 

752 self._handle.close() 

753 self._handle = None 

754 

755 @property 

756 def is_open(self) -> bool: 

757 """ 

758 return a boolean indicating whether the file is open 

759 """ 

760 if self._handle is None: 

761 return False 

762 return bool(self._handle.isopen) 

763 

764 def flush(self, fsync: bool = False) -> None: 

765 """ 

766 Force all buffered modifications to be written to disk. 

767 

768 Parameters 

769 ---------- 

770 fsync : bool (default False) 

771 call ``os.fsync()`` on the file handle to force writing to disk. 

772 

773 Notes 

774 ----- 

775 Without ``fsync=True``, flushing may not guarantee that the OS writes 

776 to disk. With fsync, the operation will block until the OS claims the 

777 file has been written; however, other caching layers may still 

778 interfere. 

779 """ 

780 if self._handle is not None: 

781 self._handle.flush() 

782 if fsync: 

783 with suppress(OSError): 

784 os.fsync(self._handle.fileno()) 

785 

786 def get(self, key: str): 

787 """ 

788 Retrieve pandas object stored in file. 

789 

790 Parameters 

791 ---------- 

792 key : str 

793 

794 Returns 

795 ------- 

796 object 

797 Same type as object stored in file. 

798 

799 Examples 

800 -------- 

801 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) 

802 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP 

803 >>> store.put('data', df) # doctest: +SKIP 

804 >>> store.get('data') # doctest: +SKIP 

805 >>> store.close() # doctest: +SKIP 

806 """ 

807 with patch_pickle(): 

808 # GH#31167 Without this patch, pickle doesn't know how to unpickle 

809 # old DateOffset objects now that they are cdef classes. 

810 group = self.get_node(key) 

811 if group is None: 

812 raise KeyError(f"No object named {key} in the file") 

813 return self._read_group(group) 

814 

815 def select( 

816 self, 

817 key: str, 

818 where=None, 

819 start=None, 

820 stop=None, 

821 columns=None, 

822 iterator: bool = False, 

823 chunksize: int | None = None, 

824 auto_close: bool = False, 

825 ): 

826 """ 

827 Retrieve pandas object stored in file, optionally based on where criteria. 

828 

829 .. warning:: 

830 

831 Pandas uses PyTables for reading and writing HDF5 files, which allows 

832 serializing object-dtype data with pickle when using the "fixed" format. 

833 Loading pickled data received from untrusted sources can be unsafe. 

834 

835 See: https://docs.python.org/3/library/pickle.html for more. 

836 

837 Parameters 

838 ---------- 

839 key : str 

840 Object being retrieved from file. 

841 where : list or None 

842 List of Term (or convertible) objects, optional. 

843 start : int or None 

844 Row number to start selection. 

845 stop : int, default None 

846 Row number to stop selection. 

847 columns : list or None 

848 A list of columns that if not None, will limit the return columns. 

849 iterator : bool, default False 

850 Returns an iterator. 

851 chunksize : int or None 

852 Number of rows to include in iteration, return an iterator. 

853 auto_close : bool, default False 

854 Should automatically close the store when finished. 

855 

856 Returns 

857 ------- 

858 object 

859 Retrieved object from file. 

860 

861 Examples 

862 -------- 

863 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) 

864 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP 

865 >>> store.put('data', df) # doctest: +SKIP 

866 >>> store.get('data') # doctest: +SKIP 

867 >>> print(store.keys()) # doctest: +SKIP 

868 ['/data1', '/data2'] 

869 >>> store.select('/data1') # doctest: +SKIP 

870 A B 

871 0 1 2 

872 1 3 4 

873 >>> store.select('/data1', where='columns == A') # doctest: +SKIP 

874 A 

875 0 1 

876 1 3 

877 >>> store.close() # doctest: +SKIP 

878 """ 

879 group = self.get_node(key) 

880 if group is None: 

881 raise KeyError(f"No object named {key} in the file") 

882 

883 # create the storer and axes 

884 where = _ensure_term(where, scope_level=1) 

885 s = self._create_storer(group) 

886 s.infer_axes() 

887 

888 # function to call on iteration 

889 def func(_start, _stop, _where): 

890 return s.read(start=_start, stop=_stop, where=_where, columns=columns) 

891 

892 # create the iterator 

893 it = TableIterator( 

894 self, 

895 s, 

896 func, 

897 where=where, 

898 nrows=s.nrows, 

899 start=start, 

900 stop=stop, 

901 iterator=iterator, 

902 chunksize=chunksize, 

903 auto_close=auto_close, 

904 ) 

905 

906 return it.get_result() 

907 

908 def select_as_coordinates( 

909 self, 

910 key: str, 

911 where=None, 

912 start: int | None = None, 

913 stop: int | None = None, 

914 ): 

915 """ 

916 return the selection as an Index 

917 

918 .. warning:: 

919 

920 Pandas uses PyTables for reading and writing HDF5 files, which allows 

921 serializing object-dtype data with pickle when using the "fixed" format. 

922 Loading pickled data received from untrusted sources can be unsafe. 

923 

924 See: https://docs.python.org/3/library/pickle.html for more. 

925 

926 

927 Parameters 

928 ---------- 

929 key : str 

930 where : list of Term (or convertible) objects, optional 

931 start : integer (defaults to None), row number to start selection 

932 stop : integer (defaults to None), row number to stop selection 

933 """ 

934 where = _ensure_term(where, scope_level=1) 

935 tbl = self.get_storer(key) 

936 if not isinstance(tbl, Table): 

937 raise TypeError("can only read_coordinates with a table") 

938 return tbl.read_coordinates(where=where, start=start, stop=stop) 
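
An illustrative sketch (file and key names assumed) of reusing the returned coordinates in a subsequent select:

    import pandas as pd

    df = pd.DataFrame({"A": range(6)})
    with pd.HDFStore("coords.h5", mode="w") as store:
        store.append("df", df, data_columns=True)
        coords = store.select_as_coordinates("df", where="A >= 3")  # Index of row numbers
        rows = store.select("df", where=coords)                     # reuse as a where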

939 

940 def select_column( 

941 self, 

942 key: str, 

943 column: str, 

944 start: int | None = None, 

945 stop: int | None = None, 

946 ): 

947 """ 

948 return a single column from the table. This is generally only useful to 

949 select an indexable 

950 

951 .. warning:: 

952 

953 Pandas uses PyTables for reading and writing HDF5 files, which allows 

954 serializing object-dtype data with pickle when using the "fixed" format. 

955 Loading pickled data received from untrusted sources can be unsafe. 

956 

957 See: https://docs.python.org/3/library/pickle.html for more. 

958 

959 Parameters 

960 ---------- 

961 key : str 

962 column : str 

963 The column of interest. 

964 start : int or None, default None 

965 stop : int or None, default None 

966 

967 Raises 

968 ------ 

969 raises KeyError if the column is not found (or key is not a valid 

970 store) 

971 raises ValueError if the column can not be extracted individually (it 

972 is part of a data block) 

973 

974 """ 

975 tbl = self.get_storer(key) 

976 if not isinstance(tbl, Table): 

977 raise TypeError("can only read_column with a table") 

978 return tbl.read_column(column=column, start=start, stop=stop) 
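
A short sketch of select_column; note the column must be an indexable or a data column (file and column names are illustrative):

    import pandas as pd

    df = pd.DataFrame({"A": [1, 2, 3], "B": ["x", "y", "z"]})
    with pd.HDFStore("col.h5", mode="w") as store:
        store.append("df", df, data_columns=["B"])
        b = store.select_column("df", "B")        # works: B is a data column
        idx = store.select_column("df", "index")  # the index is always selectable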

979 

980 def select_as_multiple( 

981 self, 

982 keys, 

983 where=None, 

984 selector=None, 

985 columns=None, 

986 start=None, 

987 stop=None, 

988 iterator: bool = False, 

989 chunksize: int | None = None, 

990 auto_close: bool = False, 

991 ): 

992 """ 

993 Retrieve pandas objects from multiple tables. 

994 

995 .. warning:: 

996 

997 Pandas uses PyTables for reading and writing HDF5 files, which allows 

998 serializing object-dtype data with pickle when using the "fixed" format. 

999 Loading pickled data received from untrusted sources can be unsafe. 

1000 

1001 See: https://docs.python.org/3/library/pickle.html for more. 

1002 

1003 Parameters 

1004 ---------- 

1005 keys : a list of the tables 

1006 selector : the table to apply the where criteria (defaults to keys[0] 

1007 if not supplied) 

1008 columns : the columns I want back 

1009 start : integer (defaults to None), row number to start selection 

1010 stop : integer (defaults to None), row number to stop selection 

1011 iterator : bool, return an iterator, default False 

1012 chunksize : nrows to include in iteration, return an iterator 

1013 auto_close : bool, default False 

1014 Should automatically close the store when finished. 

1015 

1016 Raises 

1017 ------ 

1018 raises KeyError if keys or selector is not found or keys is empty 

1019 raises TypeError if keys is not a list or tuple 

1020 raises ValueError if the tables are not ALL THE SAME DIMENSIONS 

1021 """ 

1022 # default to single select 

1023 where = _ensure_term(where, scope_level=1) 

1024 if isinstance(keys, (list, tuple)) and len(keys) == 1: 

1025 keys = keys[0] 

1026 if isinstance(keys, str): 

1027 return self.select( 

1028 key=keys, 

1029 where=where, 

1030 columns=columns, 

1031 start=start, 

1032 stop=stop, 

1033 iterator=iterator, 

1034 chunksize=chunksize, 

1035 auto_close=auto_close, 

1036 ) 

1037 

1038 if not isinstance(keys, (list, tuple)): 

1039 raise TypeError("keys must be a list/tuple") 

1040 

1041 if not len(keys): 

1042 raise ValueError("keys must have a non-zero length") 

1043 

1044 if selector is None: 

1045 selector = keys[0] 

1046 

1047 # collect the tables 

1048 tbls = [self.get_storer(k) for k in keys] 

1049 s = self.get_storer(selector) 

1050 

1051 # validate rows 

1052 nrows = None 

1053 for t, k in itertools.chain([(s, selector)], zip(tbls, keys)): 

1054 if t is None: 

1055 raise KeyError(f"Invalid table [{k}]") 

1056 if not t.is_table: 

1057 raise TypeError( 

1058 f"object [{t.pathname}] is not a table, and cannot be used in all " 

1059 "select as multiple" 

1060 ) 

1061 

1062 if nrows is None: 

1063 nrows = t.nrows 

1064 elif t.nrows != nrows: 

1065 raise ValueError("all tables must have exactly the same nrows!") 

1066 

1067 # The isinstance checks here are redundant with the check above, 

1068 # but necessary for mypy; see GH#29757 

1069 _tbls = [x for x in tbls if isinstance(x, Table)] 

1070 

1071 # axis is the concentration axes 

1072 axis = {t.non_index_axes[0][0] for t in _tbls}.pop() 

1073 

1074 def func(_start, _stop, _where): 

1075 # retrieve the objs, _where is always passed as a set of 

1076 # coordinates here 

1077 objs = [ 

1078 t.read(where=_where, columns=columns, start=_start, stop=_stop) 

1079 for t in tbls 

1080 ] 

1081 

1082 # concat and return 

1083 return concat(objs, axis=axis, verify_integrity=False)._consolidate() 

1084 

1085 # create the iterator 

1086 it = TableIterator( 

1087 self, 

1088 s, 

1089 func, 

1090 where=where, 

1091 nrows=nrows, 

1092 start=start, 

1093 stop=stop, 

1094 iterator=iterator, 

1095 chunksize=chunksize, 

1096 auto_close=auto_close, 

1097 ) 

1098 

1099 return it.get_result(coordinates=True) 
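
A sketch of selecting across two previously appended tables that share the same rows; table and file names are made up:

    import pandas as pd

    df = pd.DataFrame({"A": range(4), "B": range(4), "C": range(4)})
    with pd.HDFStore("multi.h5", mode="w") as store:
        store.append("t1", df[["A"]], data_columns=True)
        store.append("t2", df[["B", "C"]])
        # the where clause is evaluated against the selector table only
        out = store.select_as_multiple(["t1", "t2"], where="A > 1", selector="t1")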

1100 

1101 def put( 

1102 self, 

1103 key: str, 

1104 value: DataFrame | Series, 

1105 format=None, 

1106 index: bool = True, 

1107 append: bool = False, 

1108 complib=None, 

1109 complevel: int | None = None, 

1110 min_itemsize: int | dict[str, int] | None = None, 

1111 nan_rep=None, 

1112 data_columns: Literal[True] | list[str] | None = None, 

1113 encoding=None, 

1114 errors: str = "strict", 

1115 track_times: bool = True, 

1116 dropna: bool = False, 

1117 ) -> None: 

1118 """ 

1119 Store object in HDFStore. 

1120 

1121 Parameters 

1122 ---------- 

1123 key : str 

1124 value : {Series, DataFrame} 

1125 format : 'fixed(f)|table(t)', default is 'fixed' 

1126 Format to use when storing object in HDFStore. Value can be one of: 

1127 

1128 ``'fixed'`` 

1129 Fixed format. Fast writing/reading. Not-appendable, nor searchable. 

1130 ``'table'`` 

1131 Table format. Write as a PyTables Table structure which may perform 

1132 worse but allow more flexible operations like searching / selecting 

1133 subsets of the data. 

1134 index : bool, default True 

1135 Write DataFrame index as a column. 

1136 append : bool, default False 

1137 This will force Table format and append the input data to the existing. 

1138 data_columns : list of columns or True, default None 

1139 List of columns to create as data columns, or True to use all columns. 

1140 See `here 

1141 <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__. 

1142 encoding : str, default None 

1143 Provide an encoding for strings. 

1144 track_times : bool, default True 

1145 Parameter is propagated to 'create_table' method of 'PyTables'. 

1146 If set to False it enables having the same h5 files (same hashes) 

1147 independent of creation time. 

1148 dropna : bool, default False, optional 

1149 Remove missing values. 

1150 

1151 Examples 

1152 -------- 

1153 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) 

1154 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP 

1155 >>> store.put('data', df) # doctest: +SKIP 

1156 """ 

1157 if format is None: 

1158 format = get_option("io.hdf.default_format") or "fixed" 

1159 format = self._validate_format(format) 

1160 self._write_to_group( 

1161 key, 

1162 value, 

1163 format=format, 

1164 index=index, 

1165 append=append, 

1166 complib=complib, 

1167 complevel=complevel, 

1168 min_itemsize=min_itemsize, 

1169 nan_rep=nan_rep, 

1170 data_columns=data_columns, 

1171 encoding=encoding, 

1172 errors=errors, 

1173 track_times=track_times, 

1174 dropna=dropna, 

1175 ) 

1176 

1177 def remove(self, key: str, where=None, start=None, stop=None) -> None: 

1178 """ 

1179 Remove pandas object partially by specifying the where condition 

1180 

1181 Parameters 

1182 ---------- 

1183 key : str 

1184 Node to remove or delete rows from 

1185 where : list of Term (or convertible) objects, optional 

1186 start : integer (defaults to None), row number to start selection 

1187 stop : integer (defaults to None), row number to stop selection 

1188 

1189 Returns 

1190 ------- 

1191 number of rows removed (or None if not a Table) 

1192 

1193 Raises 

1194 ------ 

1195 raises KeyError if key is not a valid store 

1196 

1197 """ 

1198 where = _ensure_term(where, scope_level=1) 

1199 try: 

1200 s = self.get_storer(key) 

1201 except KeyError: 

1202 # the key is not a valid store, re-raising KeyError 

1203 raise 

1204 except AssertionError: 

1205 # surface any assertion errors for e.g. debugging 

1206 raise 

1207 except Exception as err: 

1208 # In tests we get here with ClosedFileError, TypeError, and 

1209 # _table_mod.NoSuchNodeError. TODO: Catch only these? 

1210 

1211 if where is not None: 

1212 raise ValueError( 

1213 "trying to remove a node with a non-None where clause!" 

1214 ) from err 

1215 

1216 # we are actually trying to remove a node (with children) 

1217 node = self.get_node(key) 

1218 if node is not None: 

1219 node._f_remove(recursive=True) 

1220 return None 

1221 

1222 # remove the node 

1223 if com.all_none(where, start, stop): 

1224 s.group._f_remove(recursive=True) 

1225 

1226 # delete from the table 

1227 else: 

1228 if not s.is_table: 

1229 raise ValueError( 

1230 "can only remove with where on objects written as tables" 

1231 ) 

1232 return s.delete(where=where, start=start, stop=stop) 
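
A sketch of both removal modes described above (dropping a whole node vs. deleting a row subset of a table-format node); file and key names are illustrative:

    import pandas as pd

    df = pd.DataFrame({"A": range(5)})
    with pd.HDFStore("rm.h5", mode="w") as store:
        store.put("fixed_df", df)
        store.append("table_df", df, data_columns=True)
        store.remove("fixed_df")                 # drop the whole node
        store.remove("table_df", where="A > 2")  # delete matching rows only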

1233 

1234 def append( 

1235 self, 

1236 key: str, 

1237 value: DataFrame | Series, 

1238 format=None, 

1239 axes=None, 

1240 index: bool | list[str] = True, 

1241 append: bool = True, 

1242 complib=None, 

1243 complevel: int | None = None, 

1244 columns=None, 

1245 min_itemsize: int | dict[str, int] | None = None, 

1246 nan_rep=None, 

1247 chunksize: int | None = None, 

1248 expectedrows=None, 

1249 dropna: bool | None = None, 

1250 data_columns: Literal[True] | list[str] | None = None, 

1251 encoding=None, 

1252 errors: str = "strict", 

1253 ) -> None: 

1254 """ 

1255 Append to Table in file. 

1256 

1257 Node must already exist and be Table format. 

1258 

1259 Parameters 

1260 ---------- 

1261 key : str 

1262 value : {Series, DataFrame} 

1263 format : 'table' is the default 

1264 Format to use when storing object in HDFStore. Value can be one of: 

1265 

1266 ``'table'`` 

1267 Table format. Write as a PyTables Table structure which may perform 

1268 worse but allow more flexible operations like searching / selecting 

1269 subsets of the data. 

1270 index : bool, default True 

1271 Write DataFrame index as a column. 

1272 append : bool, default True 

1273 Append the input data to the existing. 

1274 data_columns : list of columns, or True, default None 

1275 List of columns to create as indexed data columns for on-disk 

1276 queries, or True to use all columns. By default only the axes 

1277 of the object are indexed. See `here 

1278 <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__. 

1279 min_itemsize : dict of columns that specify minimum str sizes 

1280 nan_rep : str to use as str nan representation 

1281 chunksize : size to chunk the writing 

1282 expectedrows : expected TOTAL row size of this table 

1283 encoding : default None, provide an encoding for str 

1284 dropna : bool, default False, optional 

1285 Do not write an ALL nan row to the store; settable 

1286 by the option 'io.hdf.dropna_table'. 

1287 

1288 Notes 

1289 ----- 

1290 Does *not* check if data being appended overlaps with existing 

1291 data in the table, so be careful 

1292 

1293 Examples 

1294 -------- 

1295 >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) 

1296 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP 

1297 >>> store.put('data', df1, format='table') # doctest: +SKIP 

1298 >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=['A', 'B']) 

1299 >>> store.append('data', df2) # doctest: +SKIP 

1300 >>> store.close() # doctest: +SKIP 

1301 A B 

1302 0 1 2 

1303 1 3 4 

1304 0 5 6 

1305 1 7 8 

1306 """ 

1307 if columns is not None: 

1308 raise TypeError( 

1309 "columns is not a supported keyword in append, try data_columns" 

1310 ) 

1311 

1312 if dropna is None: 

1313 dropna = get_option("io.hdf.dropna_table") 

1314 if format is None: 

1315 format = get_option("io.hdf.default_format") or "table" 

1316 format = self._validate_format(format) 

1317 self._write_to_group( 

1318 key, 

1319 value, 

1320 format=format, 

1321 axes=axes, 

1322 index=index, 

1323 append=append, 

1324 complib=complib, 

1325 complevel=complevel, 

1326 min_itemsize=min_itemsize, 

1327 nan_rep=nan_rep, 

1328 chunksize=chunksize, 

1329 expectedrows=expectedrows, 

1330 dropna=dropna, 

1331 data_columns=data_columns, 

1332 encoding=encoding, 

1333 errors=errors, 

1334 ) 

1335 

1336 def append_to_multiple( 

1337 self, 

1338 d: dict, 

1339 value, 

1340 selector, 

1341 data_columns=None, 

1342 axes=None, 

1343 dropna: bool = False, 

1344 **kwargs, 

1345 ) -> None: 

1346 """ 

1347 Append to multiple tables 

1348 

1349 Parameters 

1350 ---------- 

1351 d : a dict of table_name to table_columns, None is acceptable as the 

1352 values of one node (this will get all the remaining columns) 

1353 value : a pandas object 

1354 selector : a string that designates the indexable table; all of its 

1355 columns will be designed as data_columns, unless data_columns is 

1356 passed, in which case these are used 

1357 data_columns : list of columns to create as data columns, or True to 

1358 use all columns 

1359 dropna : if evaluates to True, drop a row from all tables if it is 

1360 all NaN in any single table. Default False. 

1361 

1362 Notes 

1363 ----- 

1364 axes parameter is currently not accepted 

1365 

1366 """ 

1367 if axes is not None: 

1368 raise TypeError( 

1369 "axes is currently not accepted as a parameter to append_to_multiple; " 

1370 "you can create the tables independently instead" 

1371 ) 

1372 

1373 if not isinstance(d, dict): 

1374 raise ValueError( 

1375 "append_to_multiple must have a dictionary specified as the " 

1376 "way to split the value" 

1377 ) 

1378 

1379 if selector not in d: 

1380 raise ValueError( 

1381 "append_to_multiple requires a selector that is in passed dict" 

1382 ) 

1383 

1384 # figure out the splitting axis (the non_index_axis) 

1385 axis = next(iter(set(range(value.ndim)) - set(_AXES_MAP[type(value)]))) 

1386 

1387 # figure out how to split the value 

1388 remain_key = None 

1389 remain_values: list = [] 

1390 for k, v in d.items(): 

1391 if v is None: 

1392 if remain_key is not None: 

1393 raise ValueError( 

1394 "append_to_multiple can only have one value in d that is None" 

1395 ) 

1396 remain_key = k 

1397 else: 

1398 remain_values.extend(v) 

1399 if remain_key is not None: 

1400 ordered = value.axes[axis] 

1401 ordd = ordered.difference(Index(remain_values)) 

1402 ordd = sorted(ordered.get_indexer(ordd)) 

1403 d[remain_key] = ordered.take(ordd) 

1404 

1405 # data_columns 

1406 if data_columns is None: 

1407 data_columns = d[selector] 

1408 

1409 # ensure rows are synchronized across the tables 

1410 if dropna: 

1411 idxs = (value[cols].dropna(how="all").index for cols in d.values()) 

1412 valid_index = next(idxs) 

1413 for index in idxs: 

1414 valid_index = valid_index.intersection(index) 

1415 value = value.loc[valid_index] 

1416 

1417 min_itemsize = kwargs.pop("min_itemsize", None) 

1418 

1419 # append 

1420 for k, v in d.items(): 

1421 dc = data_columns if k == selector else None 

1422 

1423 # compute the val 

1424 val = value.reindex(v, axis=axis) 

1425 

1426 filtered = ( 

1427 {key: value for (key, value) in min_itemsize.items() if key in v} 

1428 if min_itemsize is not None 

1429 else None 

1430 ) 

1431 self.append(k, val, data_columns=dc, min_itemsize=filtered, **kwargs) 
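
A sketch of splitting one frame across two tables while keeping a selector table, then querying them together; the key names and file name are made up:

    import pandas as pd

    df = pd.DataFrame({"A": range(4), "B": range(4), "C": range(4)})
    with pd.HDFStore("split.h5", mode="w") as store:
        # 'sel' receives column A and acts as the selector; 'rest' gets the remaining columns
        store.append_to_multiple({"sel": ["A"], "rest": None}, df, selector="sel")
        out = store.select_as_multiple(["sel", "rest"], where="A > 1", selector="sel")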

1432 

1433 def create_table_index( 

1434 self, 

1435 key: str, 

1436 columns=None, 

1437 optlevel: int | None = None, 

1438 kind: str | None = None, 

1439 ) -> None: 

1440 """ 

1441 Create a pytables index on the table. 

1442 

1443 Parameters 

1444 ---------- 

1445 key : str 

1446 columns : None, bool, or listlike[str] 

1447 Indicate which columns to create an index on. 

1448 

1449 * False : Do not create any indexes. 

1450 * True : Create indexes on all columns. 

1451 * None : Create indexes on all columns. 

1452 * listlike : Create indexes on the given columns. 

1453 

1454 optlevel : int or None, default None 

1455 Optimization level, if None, pytables defaults to 6. 

1456 kind : str or None, default None 

1457 Kind of index, if None, pytables defaults to "medium". 

1458 

1459 Raises 

1460 ------ 

1461 TypeError: raises if the node is not a table 

1462 """ 

1463 # version requirements 

1464 _tables() 

1465 s = self.get_storer(key) 

1466 if s is None: 

1467 return 

1468 

1469 if not isinstance(s, Table): 

1470 raise TypeError("cannot create table index on a Fixed format store") 

1471 s.create_index(columns=columns, optlevel=optlevel, kind=kind) 
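
A sketch of creating a PyTables index on a data column after appending with indexing turned off (file and column names assumed):

    import pandas as pd

    df = pd.DataFrame({"A": range(100), "B": range(100)})
    with pd.HDFStore("idx.h5", mode="w") as store:
        store.append("df", df, data_columns=["B"], index=False)
        store.create_table_index("df", columns=["B"], optlevel=9, kind="full")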

1472 

1473 def groups(self) -> list: 

1474 """ 

1475 Return a list of all the top-level nodes. 

1476 

1477 Each node returned is not a pandas storage object. 

1478 

1479 Returns 

1480 ------- 

1481 list 

1482 List of objects. 

1483 

1484 Examples 

1485 -------- 

1486 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) 

1487 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP 

1488 >>> store.put('data', df) # doctest: +SKIP 

1489 >>> print(store.groups()) # doctest: +SKIP 

1490 >>> store.close() # doctest: +SKIP 

1491 [/data (Group) '' 

1492 children := ['axis0' (Array), 'axis1' (Array), 'block0_values' (Array), 

1493 'block0_items' (Array)]] 

1494 """ 

1495 _tables() 

1496 self._check_if_open() 

1497 assert self._handle is not None # for mypy 

1498 assert _table_mod is not None # for mypy 

1499 return [ 

1500 g 

1501 for g in self._handle.walk_groups() 

1502 if ( 

1503 not isinstance(g, _table_mod.link.Link) 

1504 and ( 

1505 getattr(g._v_attrs, "pandas_type", None) 

1506 or getattr(g, "table", None) 

1507 or (isinstance(g, _table_mod.table.Table) and g._v_name != "table") 

1508 ) 

1509 ) 

1510 ] 

1511 

1512 def walk(self, where: str = "/") -> Iterator[tuple[str, list[str], list[str]]]: 

1513 """ 

1514 Walk the pytables group hierarchy for pandas objects. 

1515 

1516 This generator will yield the group path, subgroups and pandas object 

1517 names for each group. 

1518 

1519 Any non-pandas PyTables objects that are not a group will be ignored. 

1520 

1521 The `where` group itself is listed first (preorder), then each of its 

1522 child groups (following an alphanumerical order) is also traversed, 

1523 following the same procedure. 

1524 

1525 Parameters 

1526 ---------- 

1527 where : str, default "/" 

1528 Group where to start walking. 

1529 

1530 Yields 

1531 ------ 

1532 path : str 

1533 Full path to a group (without trailing '/'). 

1534 groups : list 

1535 Names (strings) of the groups contained in `path`. 

1536 leaves : list 

1537 Names (strings) of the pandas objects contained in `path`. 

1538 

1539 Examples 

1540 -------- 

1541 >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) 

1542 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP 

1543 >>> store.put('data', df1, format='table') # doctest: +SKIP 

1544 >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=['A', 'B']) 

1545 >>> store.append('data', df2) # doctest: +SKIP 

1546 >>> store.close() # doctest: +SKIP 

1547 >>> for group in store.walk(): # doctest: +SKIP 

1548 ... print(group) # doctest: +SKIP 

1549 >>> store.close() # doctest: +SKIP 

1550 """ 

1551 _tables() 

1552 self._check_if_open() 

1553 assert self._handle is not None # for mypy 

1554 assert _table_mod is not None # for mypy 

1555 

1556 for g in self._handle.walk_groups(where): 

1557 if getattr(g._v_attrs, "pandas_type", None) is not None: 

1558 continue 

1559 

1560 groups = [] 

1561 leaves = [] 

1562 for child in g._v_children.values(): 

1563 pandas_type = getattr(child._v_attrs, "pandas_type", None) 

1564 if pandas_type is None: 

1565 if isinstance(child, _table_mod.group.Group): 

1566 groups.append(child._v_name) 

1567 else: 

1568 leaves.append(child._v_name) 

1569 

1570 yield (g._v_pathname.rstrip("/"), groups, leaves) 

1571 

1572 def get_node(self, key: str) -> Node | None: 

1573 """return the node with the key or None if it does not exist""" 

1574 self._check_if_open() 

1575 if not key.startswith("/"): 

1576 key = "/" + key 

1577 

1578 assert self._handle is not None 

1579 assert _table_mod is not None # for mypy 

1580 try: 

1581 node = self._handle.get_node(self.root, key) 

1582 except _table_mod.exceptions.NoSuchNodeError: 

1583 return None 

1584 

1585 assert isinstance(node, _table_mod.Node), type(node) 

1586 return node 

1587 

1588 def get_storer(self, key: str) -> GenericFixed | Table: 

1589 """return the storer object for a key, raise if not in the file""" 

1590 group = self.get_node(key) 

1591 if group is None: 

1592 raise KeyError(f"No object named {key} in the file") 

1593 

1594 s = self._create_storer(group) 

1595 s.infer_axes() 

1596 return s 

1597 

1598 def copy( 

1599 self, 

1600 file, 

1601 mode: str = "w", 

1602 propindexes: bool = True, 

1603 keys=None, 

1604 complib=None, 

1605 complevel: int | None = None, 

1606 fletcher32: bool = False, 

1607 overwrite: bool = True, 

1608 ) -> HDFStore: 

1609 """ 

1610 Copy the existing store to a new file, updating in place. 

1611 

1612 Parameters 

1613 ---------- 

1614 propindexes : bool, default True 

1615 Restore indexes in copied file. 

1616 keys : list, optional 

1617 List of keys to include in the copy (defaults to all). 

1618 overwrite : bool, default True 

1619 Whether to overwrite (remove and replace) existing nodes in the new store. 

1620 mode, complib, complevel, fletcher32 same as in HDFStore.__init__ 

1621 

1622 Returns 

1623 ------- 

1624 open file handle of the new store 

1625 """ 

1626 new_store = HDFStore( 

1627 file, mode=mode, complib=complib, complevel=complevel, fletcher32=fletcher32 

1628 ) 

1629 if keys is None: 

1630 keys = list(self.keys()) 

1631 if not isinstance(keys, (tuple, list)): 

1632 keys = [keys] 

1633 for k in keys: 

1634 s = self.get_storer(k) 

1635 if s is not None: 

1636 if k in new_store: 

1637 if overwrite: 

1638 new_store.remove(k) 

1639 

1640 data = self.select(k) 

1641 if isinstance(s, Table): 

1642 index: bool | list[str] = False 

1643 if propindexes: 

1644 index = [a.name for a in s.axes if a.is_indexed] 

1645 new_store.append( 

1646 k, 

1647 data, 

1648 index=index, 

1649 data_columns=getattr(s, "data_columns", None), 

1650 encoding=s.encoding, 

1651 ) 

1652 else: 

1653 new_store.put(k, data, encoding=s.encoding) 

1654 

1655 return new_store 
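
A sketch of copying a store into a new, compressed file as the method above does; the file names are assumptions:

    import pandas as pd

    df = pd.DataFrame({"A": range(3)})
    with pd.HDFStore("src.h5", mode="w") as src:
        src.append("df", df)
        dst = src.copy("dst.h5", complib="blosc", complevel=9, propindexes=True)
        dst.close()  # copy returns an open handle to the new store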

1656 

1657 def info(self) -> str: 

1658 """ 

1659 Print detailed information on the store. 

1660 

1661 Returns 

1662 ------- 

1663 str 

1664 

1665 Examples 

1666 -------- 

1667 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) 

1668 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP 

1669 >>> store.put('data', df) # doctest: +SKIP 

1670 >>> print(store.info()) # doctest: +SKIP 

1671 >>> store.close() # doctest: +SKIP 

1672 <class 'pandas.io.pytables.HDFStore'> 

1673 File path: store.h5 

1674 /data frame (shape->[2,2]) 

1675 """ 

1676 path = pprint_thing(self._path) 

1677 output = f"{type(self)}\nFile path: {path}\n" 

1678 

1679 if self.is_open: 

1680 lkeys = sorted(self.keys()) 

1681 if len(lkeys): 

1682 keys = [] 

1683 values = [] 

1684 

1685 for k in lkeys: 

1686 try: 

1687 s = self.get_storer(k) 

1688 if s is not None: 

1689 keys.append(pprint_thing(s.pathname or k)) 

1690 values.append(pprint_thing(s or "invalid_HDFStore node")) 

1691 except AssertionError: 

1692 # surface any assertion errors for e.g. debugging 

1693 raise 

1694 except Exception as detail: 

1695 keys.append(k) 

1696 dstr = pprint_thing(detail) 

1697 values.append(f"[invalid_HDFStore node: {dstr}]") 

1698 

1699 output += adjoin(12, keys, values) 

1700 else: 

1701 output += "Empty" 

1702 else: 

1703 output += "File is CLOSED" 

1704 

1705 return output 

1706 

1707 # ------------------------------------------------------------------------ 

1708 # private methods 

1709 

1710 def _check_if_open(self) -> None: 

1711 if not self.is_open: 

1712 raise ClosedFileError(f"{self._path} file is not open!") 

1713 

1714 def _validate_format(self, format: str) -> str: 

1715 """validate / deprecate formats""" 

1716 # validate 

1717 try: 

1718 format = _FORMAT_MAP[format.lower()] 

1719 except KeyError as err: 

1720 raise TypeError(f"invalid HDFStore format specified [{format}]") from err 

1721 

1722 return format 

1723 

1724 def _create_storer( 

1725 self, 

1726 group, 

1727 format=None, 

1728 value: DataFrame | Series | None = None, 

1729 encoding: str = "UTF-8", 

1730 errors: str = "strict", 

1731 ) -> GenericFixed | Table: 

1732 """return a suitable class to operate""" 

1733 cls: type[GenericFixed | Table] 

1734 

1735 if value is not None and not isinstance(value, (Series, DataFrame)): 

1736 raise TypeError("value must be None, Series, or DataFrame") 

1737 

1738 pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None)) 

1739 tt = _ensure_decoded(getattr(group._v_attrs, "table_type", None)) 

1740 

1741 # infer the pt from the passed value 

1742 if pt is None: 

1743 if value is None: 

1744 _tables() 

1745 assert _table_mod is not None # for mypy 

1746 if getattr(group, "table", None) or isinstance( 

1747 group, _table_mod.table.Table 

1748 ): 

1749 pt = "frame_table" 

1750 tt = "generic_table" 

1751 else: 

1752 raise TypeError( 

1753 "cannot create a storer if the object is not existing " 

1754 "nor a value are passed" 

1755 ) 

1756 else: 

1757 if isinstance(value, Series): 

1758 pt = "series" 

1759 else: 

1760 pt = "frame" 

1761 

1762 # we are actually a table 

1763 if format == "table": 

1764 pt += "_table" 

1765 

1766 # a storer node 

1767 if "table" not in pt: 

1768 _STORER_MAP = {"series": SeriesFixed, "frame": FrameFixed} 

1769 try: 

1770 cls = _STORER_MAP[pt] 

1771 except KeyError as err: 

1772 raise TypeError( 

1773 f"cannot properly create the storer for: [_STORER_MAP] [group->" 

1774 f"{group},value->{type(value)},format->{format}" 

1775 ) from err 

1776 return cls(self, group, encoding=encoding, errors=errors) 

1777 

1778 # existing node (and must be a table) 

1779 if tt is None: 

1780 # if we are a writer, determine the tt 

1781 if value is not None: 

1782 if pt == "series_table": 

1783 index = getattr(value, "index", None) 

1784 if index is not None: 

1785 if index.nlevels == 1: 

1786 tt = "appendable_series" 

1787 elif index.nlevels > 1: 

1788 tt = "appendable_multiseries" 

1789 elif pt == "frame_table": 

1790 index = getattr(value, "index", None) 

1791 if index is not None: 

1792 if index.nlevels == 1: 

1793 tt = "appendable_frame" 

1794 elif index.nlevels > 1: 

1795 tt = "appendable_multiframe" 

1796 

1797 _TABLE_MAP = { 

1798 "generic_table": GenericTable, 

1799 "appendable_series": AppendableSeriesTable, 

1800 "appendable_multiseries": AppendableMultiSeriesTable, 

1801 "appendable_frame": AppendableFrameTable, 

1802 "appendable_multiframe": AppendableMultiFrameTable, 

1803 "worm": WORMTable, 

1804 } 

1805 try: 

1806 cls = _TABLE_MAP[tt] 

1807 except KeyError as err: 

1808 raise TypeError( 

1809 f"cannot properly create the storer for: [_TABLE_MAP] [group->" 

1810 f"{group},value->{type(value)},format->{format}" 

1811 ) from err 

1812 

1813 return cls(self, group, encoding=encoding, errors=errors) 

1814 

1815 def _write_to_group( 

1816 self, 

1817 key: str, 

1818 value: DataFrame | Series, 

1819 format, 

1820 axes=None, 

1821 index: bool | list[str] = True, 

1822 append: bool = False, 

1823 complib=None, 

1824 complevel: int | None = None, 

1825 fletcher32=None, 

1826 min_itemsize: int | dict[str, int] | None = None, 

1827 chunksize: int | None = None, 

1828 expectedrows=None, 

1829 dropna: bool = False, 

1830 nan_rep=None, 

1831 data_columns=None, 

1832 encoding=None, 

1833 errors: str = "strict", 

1834 track_times: bool = True, 

1835 ) -> None: 

1836 # we don't want to store a table node at all if our object is 0-len 

1837 # as there are no dtypes 

1838 if getattr(value, "empty", None) and (format == "table" or append): 

1839 return 

1840 

1841 group = self._identify_group(key, append) 

1842 

1843 s = self._create_storer(group, format, value, encoding=encoding, errors=errors) 

1844 if append: 

1845 # raise if we are trying to append to a Fixed format, 

1846 # or a table that exists (and we are putting) 

1847 if not s.is_table or (s.is_table and format == "fixed" and s.is_exists): 

1848 raise ValueError("Can only append to Tables") 

1849 if not s.is_exists: 

1850 s.set_object_info() 

1851 else: 

1852 s.set_object_info() 

1853 

1854 if not s.is_table and complib: 

1855 raise ValueError("Compression not supported on Fixed format stores") 

1856 

1857 # write the object 

1858 s.write( 

1859 obj=value, 

1860 axes=axes, 

1861 append=append, 

1862 complib=complib, 

1863 complevel=complevel, 

1864 fletcher32=fletcher32, 

1865 min_itemsize=min_itemsize, 

1866 chunksize=chunksize, 

1867 expectedrows=expectedrows, 

1868 dropna=dropna, 

1869 nan_rep=nan_rep, 

1870 data_columns=data_columns, 

1871 track_times=track_times, 

1872 ) 

1873 

1874 if isinstance(s, Table) and index: 

1875 s.create_index(columns=index) 

1876 

1877 def _read_group(self, group: Node): 

1878 s = self._create_storer(group) 

1879 s.infer_axes() 

1880 return s.read() 

1881 

1882 def _identify_group(self, key: str, append: bool) -> Node: 

1883 """Identify HDF5 group based on key, delete/create group if needed.""" 

1884 group = self.get_node(key) 

1885 

1886 # we make this assertion for mypy; the get_node call will already 

1887 # have raised if this is incorrect 

1888 assert self._handle is not None 

1889 

1890 # remove the node if we are not appending 

1891 if group is not None and not append: 

1892 self._handle.remove_node(group, recursive=True) 

1893 group = None 

1894 

1895 if group is None: 

1896 group = self._create_nodes_and_group(key) 

1897 

1898 return group 

1899 

1900 def _create_nodes_and_group(self, key: str) -> Node: 

1901 """Create nodes from key and return group name.""" 

1902 # assertion for mypy 

1903 assert self._handle is not None 

1904 

1905 paths = key.split("/") 

1906 # recursively create the groups 

1907 path = "/" 

1908 for p in paths: 

1909 if not len(p): 

1910 continue 

1911 new_path = path 

1912 if not path.endswith("/"): 

1913 new_path += "/" 

1914 new_path += p 

1915 group = self.get_node(new_path) 

1916 if group is None: 

1917 group = self._handle.create_group(path, p) 

1918 path = new_path 

1919 return group 
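# --- Illustrative usage sketch (not part of pandas source) -------------------
# A key containing slashes is split into nested HDF5 groups by this helper, so
# storing under "foo/bar/df" creates the intermediate groups /foo and /foo/bar
# if they do not yet exist. File name below is hypothetical:
#
#   >>> with pd.HDFStore("example.h5", mode="w") as store:
#   ...     store.put("foo/bar/df", pd.DataFrame({"a": [1, 2]}))
#   ...     store.get_node("foo/bar") is not None
#   True
# -----------------------------------------------------------------------------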

1920 

1921 

1922class TableIterator: 

1923 """ 

1924 Define the iteration interface on a table 

1925 

1926 Parameters 

1927 ---------- 

1928 store : HDFStore 

1929 s : the referred storer 

1930 func : the function to execute the query 

1931 where : the where of the query 

1932 nrows : the number of rows to iterate over

1933 start : the passed start value (default is None) 

1934 stop : the passed stop value (default is None) 

1935 iterator : bool, default False 

1936 Whether to use the default iterator. 

1937 chunksize : the passed chunking value (default is 100000) 

1938 auto_close : bool, default False 

1939 Whether to automatically close the store at the end of iteration. 

1940 """ 

1941 

1942 chunksize: int | None 

1943 store: HDFStore 

1944 s: GenericFixed | Table 

1945 

1946 def __init__( 

1947 self, 

1948 store: HDFStore, 

1949 s: GenericFixed | Table, 

1950 func, 

1951 where, 

1952 nrows, 

1953 start=None, 

1954 stop=None, 

1955 iterator: bool = False, 

1956 chunksize: int | None = None, 

1957 auto_close: bool = False, 

1958 ) -> None: 

1959 self.store = store 

1960 self.s = s 

1961 self.func = func 

1962 self.where = where 

1963 

1964 # set start/stop if they are not set if we are a table 

1965 if self.s.is_table: 

1966 if nrows is None: 

1967 nrows = 0 

1968 if start is None: 

1969 start = 0 

1970 if stop is None: 

1971 stop = nrows 

1972 stop = min(nrows, stop) 

1973 

1974 self.nrows = nrows 

1975 self.start = start 

1976 self.stop = stop 

1977 

1978 self.coordinates = None 

1979 if iterator or chunksize is not None: 

1980 if chunksize is None: 

1981 chunksize = 100000 

1982 self.chunksize = int(chunksize) 

1983 else: 

1984 self.chunksize = None 

1985 

1986 self.auto_close = auto_close 

1987 

1988 def __iter__(self) -> Iterator: 

1989 # iterate 

1990 current = self.start 

1991 if self.coordinates is None: 

1992 raise ValueError("Cannot iterate until get_result is called.") 

1993 while current < self.stop: 

1994 stop = min(current + self.chunksize, self.stop) 

1995 value = self.func(None, None, self.coordinates[current:stop]) 

1996 current = stop 

1997 if value is None or not len(value): 

1998 continue 

1999 

2000 yield value 

2001 

2002 self.close() 

2003 

2004 def close(self) -> None: 

2005 if self.auto_close: 

2006 self.store.close() 

2007 

2008 def get_result(self, coordinates: bool = False): 

2009 # return the actual iterator 

2010 if self.chunksize is not None: 

2011 if not isinstance(self.s, Table): 

2012 raise TypeError("can only use an iterator or chunksize on a table") 

2013 

2014 self.coordinates = self.s.read_coordinates(where=self.where) 

2015 

2016 return self 

2017 

2018 # if specified read via coordinates (necessary for multiple selections 

2019 if coordinates: 

2020 if not isinstance(self.s, Table): 

2021 raise TypeError("can only read_coordinates on a table") 

2022 where = self.s.read_coordinates( 

2023 where=self.where, start=self.start, stop=self.stop 

2024 ) 

2025 else: 

2026 where = self.where 

2027 

2028 # directly return the result 

2029 results = self.func(self.start, self.stop, where) 

2030 self.close() 

2031 return results 
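# --- Illustrative usage sketch (not part of pandas source) -------------------
# TableIterator is what backs chunked reads: with a table-format node, passing
# ``chunksize`` (or ``iterator=True``) to ``HDFStore.select`` returns an
# iterator of DataFrames instead of a single result. File and key names below
# are hypothetical:
#
#   >>> with pd.HDFStore("example.h5") as store:
#   ...     for chunk in store.select("df", chunksize=50_000):
#   ...         ...  # process each DataFrame chunk
# -----------------------------------------------------------------------------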

2032 

2033 

2034class IndexCol: 

2035 """ 

2036 an index column description class 

2037 

2038 Parameters 

2039 ---------- 

2040 axis : axis which I reference 

2041 values : the ndarray-like converted values

2042 kind : a string description of this type 

2043 typ : the pytables type 

2044 pos : the position in the pytables table

2045 

2046 """ 

2047 

2048 is_an_indexable: bool = True 

2049 is_data_indexable: bool = True 

2050 _info_fields = ["freq", "tz", "index_name"] 

2051 

2052 def __init__( 

2053 self, 

2054 name: str, 

2055 values=None, 

2056 kind=None, 

2057 typ=None, 

2058 cname: str | None = None, 

2059 axis=None, 

2060 pos=None, 

2061 freq=None, 

2062 tz=None, 

2063 index_name=None, 

2064 ordered=None, 

2065 table=None, 

2066 meta=None, 

2067 metadata=None, 

2068 ) -> None: 

2069 if not isinstance(name, str): 

2070 raise ValueError("`name` must be a str.") 

2071 

2072 self.values = values 

2073 self.kind = kind 

2074 self.typ = typ 

2075 self.name = name 

2076 self.cname = cname or name 

2077 self.axis = axis 

2078 self.pos = pos 

2079 self.freq = freq 

2080 self.tz = tz 

2081 self.index_name = index_name 

2082 self.ordered = ordered 

2083 self.table = table 

2084 self.meta = meta 

2085 self.metadata = metadata 

2086 

2087 if pos is not None: 

2088 self.set_pos(pos) 

2089 

2090 # These are ensured as long as the passed arguments match the 

2091 # constructor annotations. 

2092 assert isinstance(self.name, str) 

2093 assert isinstance(self.cname, str) 

2094 

2095 @property 

2096 def itemsize(self) -> int: 

2097 # Assumes self.typ has already been initialized 

2098 return self.typ.itemsize 

2099 

2100 @property 

2101 def kind_attr(self) -> str: 

2102 return f"{self.name}_kind" 

2103 

2104 def set_pos(self, pos: int) -> None: 

2105 """set the position of this column in the Table""" 

2106 self.pos = pos 

2107 if pos is not None and self.typ is not None: 

2108 self.typ._v_pos = pos 

2109 

2110 def __repr__(self) -> str: 

2111 temp = tuple( 

2112 map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind)) 

2113 ) 

2114 return ",".join( 

2115 [ 

2116 f"{key}->{value}" 

2117 for key, value in zip(["name", "cname", "axis", "pos", "kind"], temp) 

2118 ] 

2119 ) 

2120 

2121 def __eq__(self, other: object) -> bool: 

2122 """compare 2 col items""" 

2123 return all( 

2124 getattr(self, a, None) == getattr(other, a, None) 

2125 for a in ["name", "cname", "axis", "pos"] 

2126 ) 

2127 

2128 def __ne__(self, other) -> bool: 

2129 return not self.__eq__(other) 

2130 

2131 @property 

2132 def is_indexed(self) -> bool: 

2133 """return whether I am an indexed column""" 

2134 if not hasattr(self.table, "cols"): 

2135 # e.g. if infer hasn't been called yet, self.table will be None. 

2136 return False 

2137 return getattr(self.table.cols, self.cname).is_indexed 

2138 

2139 def convert( 

2140 self, values: np.ndarray, nan_rep, encoding: str, errors: str 

2141 ) -> tuple[np.ndarray, np.ndarray] | tuple[Index, Index]: 

2142 """ 

2143 Convert the data from this selection to the appropriate pandas type. 

2144 """ 

2145 assert isinstance(values, np.ndarray), type(values) 

2146 

2147 # values is a recarray 

2148 if values.dtype.fields is not None: 

2149 # Copy, otherwise values will be a view 

2150 # preventing the original recarray from being freed

2151 values = values[self.cname].copy() 

2152 

2153 val_kind = _ensure_decoded(self.kind) 

2154 values = _maybe_convert(values, val_kind, encoding, errors) 

2155 kwargs = {} 

2156 kwargs["name"] = _ensure_decoded(self.index_name) 

2157 

2158 if self.freq is not None: 

2159 kwargs["freq"] = _ensure_decoded(self.freq) 

2160 

2161 factory: type[Index | DatetimeIndex] = Index 

2162 if lib.is_np_dtype(values.dtype, "M") or isinstance( 

2163 values.dtype, DatetimeTZDtype 

2164 ): 

2165 factory = DatetimeIndex 

2166 elif values.dtype == "i8" and "freq" in kwargs: 

2167 # PeriodIndex data is stored as i8 

2168 # error: Incompatible types in assignment (expression has type 

2169 # "Callable[[Any, KwArg(Any)], PeriodIndex]", variable has type 

2170 # "Union[Type[Index], Type[DatetimeIndex]]") 

2171 factory = lambda x, **kwds: PeriodIndex.from_ordinals( # type: ignore[assignment] 

2172 x, freq=kwds.get("freq", None) 

2173 )._rename( 

2174 kwds["name"] 

2175 ) 

2176 

2177 # making an Index instance could throw a number of different errors 

2178 try: 

2179 new_pd_index = factory(values, **kwargs) 

2180 except ValueError: 

2181 # if the output freq is different from what we recorded,

2182 # it should be None (see also 'doc example part 2') 

2183 if "freq" in kwargs: 

2184 kwargs["freq"] = None 

2185 new_pd_index = factory(values, **kwargs) 

2186 final_pd_index = _set_tz(new_pd_index, self.tz) 

2187 return final_pd_index, final_pd_index 

2188 

2189 def take_data(self): 

2190 """return the values""" 

2191 return self.values 

2192 

2193 @property 

2194 def attrs(self): 

2195 return self.table._v_attrs 

2196 

2197 @property 

2198 def description(self): 

2199 return self.table.description 

2200 

2201 @property 

2202 def col(self): 

2203 """return my current col description""" 

2204 return getattr(self.description, self.cname, None) 

2205 

2206 @property 

2207 def cvalues(self): 

2208 """return my cython values""" 

2209 return self.values 

2210 

2211 def __iter__(self) -> Iterator: 

2212 return iter(self.values) 

2213 

2214 def maybe_set_size(self, min_itemsize=None) -> None: 

2215 """ 

2216 maybe set a string col itemsize: 

2217 min_itemsize can be an integer or a dict mapping this column's name

2218 to an integer size

2219 """ 

2220 if _ensure_decoded(self.kind) == "string": 

2221 if isinstance(min_itemsize, dict): 

2222 min_itemsize = min_itemsize.get(self.name) 

2223 

2224 if min_itemsize is not None and self.typ.itemsize < min_itemsize: 

2225 self.typ = _tables().StringCol(itemsize=min_itemsize, pos=self.pos) 
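# --- Illustrative usage sketch (not part of pandas source) -------------------
# String columns in a table get a fixed itemsize on first write, so later
# appends with longer strings fail unless the size was preset. ``min_itemsize``
# accepts an int or a dict keyed by column name (names below are hypothetical):
#
#   >>> store.append("df", df, min_itemsize={"city": 50})
# -----------------------------------------------------------------------------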

2226 

2227 def validate_names(self) -> None: 

2228 pass 

2229 

2230 def validate_and_set(self, handler: AppendableTable, append: bool) -> None: 

2231 self.table = handler.table 

2232 self.validate_col() 

2233 self.validate_attr(append) 

2234 self.validate_metadata(handler) 

2235 self.write_metadata(handler) 

2236 self.set_attr() 

2237 

2238 def validate_col(self, itemsize=None): 

2239 """validate this column: return the compared against itemsize""" 

2240 # validate this column for string truncation (or reset to the max size) 

2241 if _ensure_decoded(self.kind) == "string": 

2242 c = self.col 

2243 if c is not None: 

2244 if itemsize is None: 

2245 itemsize = self.itemsize 

2246 if c.itemsize < itemsize: 

2247 raise ValueError( 

2248 f"Trying to store a string with len [{itemsize}] in " 

2249 f"[{self.cname}] column but\nthis column has a limit of " 

2250 f"[{c.itemsize}]!\nConsider using min_itemsize to " 

2251 "preset the sizes on these columns" 

2252 ) 

2253 return c.itemsize 

2254 

2255 return None 

2256 

2257 def validate_attr(self, append: bool) -> None: 

2258 # check for backwards incompatibility 

2259 if append: 

2260 existing_kind = getattr(self.attrs, self.kind_attr, None) 

2261 if existing_kind is not None and existing_kind != self.kind: 

2262 raise TypeError( 

2263 f"incompatible kind in col [{existing_kind} - {self.kind}]" 

2264 ) 

2265 

2266 def update_info(self, info) -> None: 

2267 """ 

2268 set/update the info for this indexable with the key/value 

2269 if there is a conflict raise/warn as needed 

2270 """ 

2271 for key in self._info_fields: 

2272 value = getattr(self, key, None) 

2273 idx = info.setdefault(self.name, {}) 

2274 

2275 existing_value = idx.get(key) 

2276 if key in idx and value is not None and existing_value != value: 

2277 # frequency/name just warn 

2278 if key in ["freq", "index_name"]: 

2279 ws = attribute_conflict_doc % (key, existing_value, value) 

2280 warnings.warn( 

2281 ws, AttributeConflictWarning, stacklevel=find_stack_level() 

2282 ) 

2283 

2284 # reset 

2285 idx[key] = None 

2286 setattr(self, key, None) 

2287 

2288 else: 

2289 raise ValueError( 

2290 f"invalid info for [{self.name}] for [{key}], " 

2291 f"existing_value [{existing_value}] conflicts with " 

2292 f"new value [{value}]" 

2293 ) 

2294 elif value is not None or existing_value is not None: 

2295 idx[key] = value 
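# --- Illustrative sketch (not part of pandas source) -------------------------
# Appending a frame whose index name (or freq) differs from what is already
# stored hits the "frequency/name just warn" branch above: an
# AttributeConflictWarning is emitted and the conflicting attribute is reset to
# None rather than raising. Names below are hypothetical:
#
#   >>> store.append("df", df1)   # df1.index.name == "date"
#   >>> store.append("df", df2)   # df2.index.name == "timestamp"
#   ...                           # warns; the stored index_name becomes None
# -----------------------------------------------------------------------------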

2296 

2297 def set_info(self, info) -> None: 

2298 """set my state from the passed info""" 

2299 idx = info.get(self.name) 

2300 if idx is not None: 

2301 self.__dict__.update(idx) 

2302 

2303 def set_attr(self) -> None: 

2304 """set the kind for this column""" 

2305 setattr(self.attrs, self.kind_attr, self.kind) 

2306 

2307 def validate_metadata(self, handler: AppendableTable) -> None: 

2308 """validate that kind=category does not change the categories""" 

2309 if self.meta == "category": 

2310 new_metadata = self.metadata 

2311 cur_metadata = handler.read_metadata(self.cname) 

2312 if ( 

2313 new_metadata is not None 

2314 and cur_metadata is not None 

2315 and not array_equivalent( 

2316 new_metadata, cur_metadata, strict_nan=True, dtype_equal=True 

2317 ) 

2318 ): 

2319 raise ValueError( 

2320 "cannot append a categorical with " 

2321 "different categories to the existing" 

2322 ) 

2323 

2324 def write_metadata(self, handler: AppendableTable) -> None: 

2325 """set the meta data""" 

2326 if self.metadata is not None: 

2327 handler.write_metadata(self.cname, self.metadata) 

2328 

2329 

2330class GenericIndexCol(IndexCol): 

2331 """an index which is not represented in the data of the table""" 

2332 

2333 @property 

2334 def is_indexed(self) -> bool: 

2335 return False 

2336 

2337 def convert( 

2338 self, values: np.ndarray, nan_rep, encoding: str, errors: str 

2339 ) -> tuple[Index, Index]: 

2340 """ 

2341 Convert the data from this selection to the appropriate pandas type. 

2342 

2343 Parameters 

2344 ---------- 

2345 values : np.ndarray 

2346 nan_rep : str 

2347 encoding : str 

2348 errors : str 

2349 """ 

2350 assert isinstance(values, np.ndarray), type(values) 

2351 

2352 index = RangeIndex(len(values)) 

2353 return index, index 

2354 

2355 def set_attr(self) -> None: 

2356 pass 

2357 

2358 

2359class DataCol(IndexCol): 

2360 """ 

2361 a data holding column, by definition this is not indexable 

2362 

2363 Parameters 

2364 ---------- 

2365 data : the actual data 

2366 cname : the column name in the table to hold the data (typically 

2367 values) 

2368 meta : a string description of the metadata 

2369 metadata : the actual metadata 

2370 """ 

2371 

2372 is_an_indexable = False 

2373 is_data_indexable = False 

2374 _info_fields = ["tz", "ordered"] 

2375 

2376 def __init__( 

2377 self, 

2378 name: str, 

2379 values=None, 

2380 kind=None, 

2381 typ=None, 

2382 cname: str | None = None, 

2383 pos=None, 

2384 tz=None, 

2385 ordered=None, 

2386 table=None, 

2387 meta=None, 

2388 metadata=None, 

2389 dtype: DtypeArg | None = None, 

2390 data=None, 

2391 ) -> None: 

2392 super().__init__( 

2393 name=name, 

2394 values=values, 

2395 kind=kind, 

2396 typ=typ, 

2397 pos=pos, 

2398 cname=cname, 

2399 tz=tz, 

2400 ordered=ordered, 

2401 table=table, 

2402 meta=meta, 

2403 metadata=metadata, 

2404 ) 

2405 self.dtype = dtype 

2406 self.data = data 

2407 

2408 @property 

2409 def dtype_attr(self) -> str: 

2410 return f"{self.name}_dtype" 

2411 

2412 @property 

2413 def meta_attr(self) -> str: 

2414 return f"{self.name}_meta" 

2415 

2416 def __repr__(self) -> str: 

2417 temp = tuple( 

2418 map( 

2419 pprint_thing, (self.name, self.cname, self.dtype, self.kind, self.shape) 

2420 ) 

2421 ) 

2422 return ",".join( 

2423 [ 

2424 f"{key}->{value}" 

2425 for key, value in zip(["name", "cname", "dtype", "kind", "shape"], temp) 

2426 ] 

2427 ) 

2428 

2429 def __eq__(self, other: object) -> bool: 

2430 """compare 2 col items""" 

2431 return all( 

2432 getattr(self, a, None) == getattr(other, a, None) 

2433 for a in ["name", "cname", "dtype", "pos"] 

2434 ) 

2435 

2436 def set_data(self, data: ArrayLike) -> None: 

2437 assert data is not None 

2438 assert self.dtype is None 

2439 

2440 data, dtype_name = _get_data_and_dtype_name(data) 

2441 

2442 self.data = data 

2443 self.dtype = dtype_name 

2444 self.kind = _dtype_to_kind(dtype_name) 

2445 

2446 def take_data(self): 

2447 """return the data""" 

2448 return self.data 

2449 

2450 @classmethod 

2451 def _get_atom(cls, values: ArrayLike) -> Col: 

2452 """ 

2453 Get an appropriately typed and shaped pytables.Col object for values. 

2454 """ 

2455 dtype = values.dtype 

2456 # error: Item "ExtensionDtype" of "Union[ExtensionDtype, dtype[Any]]" has no 

2457 # attribute "itemsize" 

2458 itemsize = dtype.itemsize # type: ignore[union-attr] 

2459 

2460 shape = values.shape 

2461 if values.ndim == 1: 

2462 # EA, use block shape pretending it is 2D 

2463 # TODO(EA2D): not necessary with 2D EAs 

2464 shape = (1, values.size) 

2465 

2466 if isinstance(values, Categorical): 

2467 codes = values.codes 

2468 atom = cls.get_atom_data(shape, kind=codes.dtype.name) 

2469 elif lib.is_np_dtype(dtype, "M") or isinstance(dtype, DatetimeTZDtype): 

2470 atom = cls.get_atom_datetime64(shape) 

2471 elif lib.is_np_dtype(dtype, "m"): 

2472 atom = cls.get_atom_timedelta64(shape) 

2473 elif is_complex_dtype(dtype): 

2474 atom = _tables().ComplexCol(itemsize=itemsize, shape=shape[0]) 

2475 elif is_string_dtype(dtype): 

2476 atom = cls.get_atom_string(shape, itemsize) 

2477 else: 

2478 atom = cls.get_atom_data(shape, kind=dtype.name) 

2479 

2480 return atom 

2481 

2482 @classmethod 

2483 def get_atom_string(cls, shape, itemsize): 

2484 return _tables().StringCol(itemsize=itemsize, shape=shape[0]) 

2485 

2486 @classmethod 

2487 def get_atom_coltype(cls, kind: str) -> type[Col]: 

2488 """return the PyTables column class for this column""" 

2489 if kind.startswith("uint"): 

2490 k4 = kind[4:] 

2491 col_name = f"UInt{k4}Col" 

2492 elif kind.startswith("period"): 

2493 # we store as integer 

2494 col_name = "Int64Col" 

2495 else: 

2496 kcap = kind.capitalize() 

2497 col_name = f"{kcap}Col" 

2498 

2499 return getattr(_tables(), col_name) 
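# --- Illustrative mapping (comment only, not part of pandas source) -----------
# The kind string is translated to a PyTables column class by name, e.g.
#   "uint32"  -> tables.UInt32Col
#   "float64" -> tables.Float64Col
#   "period"  -> tables.Int64Col   (periods are stored as int64 ordinals)
# -----------------------------------------------------------------------------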

2500 

2501 @classmethod 

2502 def get_atom_data(cls, shape, kind: str) -> Col: 

2503 return cls.get_atom_coltype(kind=kind)(shape=shape[0]) 

2504 

2505 @classmethod 

2506 def get_atom_datetime64(cls, shape): 

2507 return _tables().Int64Col(shape=shape[0]) 

2508 

2509 @classmethod 

2510 def get_atom_timedelta64(cls, shape): 

2511 return _tables().Int64Col(shape=shape[0]) 

2512 

2513 @property 

2514 def shape(self): 

2515 return getattr(self.data, "shape", None) 

2516 

2517 @property 

2518 def cvalues(self): 

2519 """return my cython values""" 

2520 return self.data 

2521 

2522 def validate_attr(self, append) -> None: 

2523 """validate that we have the same order as the existing & same dtype""" 

2524 if append: 

2525 existing_fields = getattr(self.attrs, self.kind_attr, None) 

2526 if existing_fields is not None and existing_fields != list(self.values): 

2527 raise ValueError("appended items do not match existing items in table!") 

2528 

2529 existing_dtype = getattr(self.attrs, self.dtype_attr, None) 

2530 if existing_dtype is not None and existing_dtype != self.dtype: 

2531 raise ValueError( 

2532 "appended items dtype do not match existing items dtype in table!" 

2533 ) 

2534 

2535 def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): 

2536 """ 

2537 Convert the data from this selection to the appropriate pandas type. 

2538 

2539 Parameters 

2540 ---------- 

2541 values : np.ndarray 

2542 nan_rep : 

2543 encoding : str 

2544 errors : str 

2545 

2546 Returns 

2547 ------- 

2548 index : listlike to become an Index 

2549 data : ndarraylike to become a column 

2550 """ 

2551 assert isinstance(values, np.ndarray), type(values) 

2552 

2553 # values is a recarray 

2554 if values.dtype.fields is not None: 

2555 values = values[self.cname] 

2556 

2557 assert self.typ is not None 

2558 if self.dtype is None: 

2559 # Note: in tests we never have timedelta64 or datetime64, 

2560 # so the _get_data_and_dtype_name may be unnecessary 

2561 converted, dtype_name = _get_data_and_dtype_name(values) 

2562 kind = _dtype_to_kind(dtype_name) 

2563 else: 

2564 converted = values 

2565 dtype_name = self.dtype 

2566 kind = self.kind 

2567 

2568 assert isinstance(converted, np.ndarray) # for mypy 

2569 

2570 # use the meta if needed 

2571 meta = _ensure_decoded(self.meta) 

2572 metadata = self.metadata 

2573 ordered = self.ordered 

2574 tz = self.tz 

2575 

2576 assert dtype_name is not None 

2577 # convert to the correct dtype 

2578 dtype = _ensure_decoded(dtype_name) 

2579 

2580 # reverse converts 

2581 if dtype.startswith("datetime64"): 

2582 # recreate with tz if indicated 

2583 converted = _set_tz(converted, tz, coerce=True) 

2584 

2585 elif dtype == "timedelta64": 

2586 converted = np.asarray(converted, dtype="m8[ns]") 

2587 elif dtype == "date": 

2588 try: 

2589 converted = np.asarray( 

2590 [date.fromordinal(v) for v in converted], dtype=object 

2591 ) 

2592 except ValueError: 

2593 converted = np.asarray( 

2594 [date.fromtimestamp(v) for v in converted], dtype=object 

2595 ) 

2596 

2597 elif meta == "category": 

2598 # we have a categorical 

2599 categories = metadata 

2600 codes = converted.ravel() 

2601 

2602 # if we have stored a NaN in the categories 

2603 # then strip it; in theory we could have BOTH 

2604 # -1s in the codes and nulls :< 

2605 if categories is None: 

2606 # Handle case of NaN-only categorical columns in which case 

2607 # the categories are an empty array; when this is stored, 

2608 # pytables cannot write a zero-len array, so on readback 

2609 # the categories would be None and `read_hdf()` would fail. 

2610 categories = Index([], dtype=np.float64) 

2611 else: 

2612 mask = isna(categories) 

2613 if mask.any(): 

2614 categories = categories[~mask] 

2615 codes[codes != -1] -= mask.astype(int).cumsum()._values 

2616 

2617 converted = Categorical.from_codes( 

2618 codes, categories=categories, ordered=ordered, validate=False 

2619 ) 

2620 

2621 else: 

2622 try: 

2623 converted = converted.astype(dtype, copy=False) 

2624 except TypeError: 

2625 converted = converted.astype("O", copy=False) 

2626 

2627 # convert nans / decode 

2628 if _ensure_decoded(kind) == "string": 

2629 converted = _unconvert_string_array( 

2630 converted, nan_rep=nan_rep, encoding=encoding, errors=errors 

2631 ) 

2632 

2633 return self.values, converted 

2634 

2635 def set_attr(self) -> None: 

2636 """set the data for this column""" 

2637 setattr(self.attrs, self.kind_attr, self.values) 

2638 setattr(self.attrs, self.meta_attr, self.meta) 

2639 assert self.dtype is not None 

2640 setattr(self.attrs, self.dtype_attr, self.dtype) 

2641 

2642 

2643class DataIndexableCol(DataCol): 

2644 """represent a data column that can be indexed""" 

2645 

2646 is_data_indexable = True 

2647 

2648 def validate_names(self) -> None: 

2649 if not is_string_dtype(Index(self.values).dtype): 

2650 # TODO: should the message here be more specifically non-str? 

2651 raise ValueError("cannot have non-object label DataIndexableCol") 

2652 

2653 @classmethod 

2654 def get_atom_string(cls, shape, itemsize): 

2655 return _tables().StringCol(itemsize=itemsize) 

2656 

2657 @classmethod 

2658 def get_atom_data(cls, shape, kind: str) -> Col: 

2659 return cls.get_atom_coltype(kind=kind)() 

2660 

2661 @classmethod 

2662 def get_atom_datetime64(cls, shape): 

2663 return _tables().Int64Col() 

2664 

2665 @classmethod 

2666 def get_atom_timedelta64(cls, shape): 

2667 return _tables().Int64Col() 

2668 

2669 

2670class GenericDataIndexableCol(DataIndexableCol): 

2671 """represent a generic pytables data column""" 

2672 

2673 

2674class Fixed: 

2675 """ 

2676 represent an object in my store 

2677 facilitate read/write of various types of objects 

2678 this is an abstract base class 

2679 

2680 Parameters 

2681 ---------- 

2682 parent : HDFStore 

2683 group : Node 

2684 The group node where the table resides. 

2685 """ 

2686 

2687 pandas_kind: str 

2688 format_type: str = "fixed" # GH#30962 needed by dask 

2689 obj_type: type[DataFrame | Series] 

2690 ndim: int 

2691 parent: HDFStore 

2692 is_table: bool = False 

2693 

2694 def __init__( 

2695 self, 

2696 parent: HDFStore, 

2697 group: Node, 

2698 encoding: str | None = "UTF-8", 

2699 errors: str = "strict", 

2700 ) -> None: 

2701 assert isinstance(parent, HDFStore), type(parent) 

2702 assert _table_mod is not None # needed for mypy 

2703 assert isinstance(group, _table_mod.Node), type(group) 

2704 self.parent = parent 

2705 self.group = group 

2706 self.encoding = _ensure_encoding(encoding) 

2707 self.errors = errors 

2708 

2709 @property 

2710 def is_old_version(self) -> bool: 

2711 return self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1 

2712 

2713 @property 

2714 def version(self) -> tuple[int, int, int]: 

2715 """compute and set our version""" 

2716 version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None)) 

2717 try: 

2718 version = tuple(int(x) for x in version.split(".")) 

2719 if len(version) == 2: 

2720 version = version + (0,) 

2721 except AttributeError: 

2722 version = (0, 0, 0) 

2723 return version 
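# --- Illustrative parsing (comment only, not part of pandas source) -----------
# The stored "pandas_version" attribute is a string such as "0.15.2" and is
# parsed into a 3-tuple; a missing or malformed attribute falls back to (0, 0, 0):
#
#   "0.15.2" -> (0, 15, 2)
#   "0.10"   -> (0, 10, 0)
#   None     -> (0, 0, 0)
# -----------------------------------------------------------------------------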

2724 

2725 @property 

2726 def pandas_type(self): 

2727 return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None)) 

2728 

2729 def __repr__(self) -> str: 

2730 """return a pretty representation of myself""" 

2731 self.infer_axes() 

2732 s = self.shape 

2733 if s is not None: 

2734 if isinstance(s, (list, tuple)): 

2735 jshape = ",".join([pprint_thing(x) for x in s]) 

2736 s = f"[{jshape}]" 

2737 return f"{self.pandas_type:12.12} (shape->{s})" 

2738 return self.pandas_type 

2739 

2740 def set_object_info(self) -> None: 

2741 """set my pandas type & version""" 

2742 self.attrs.pandas_type = str(self.pandas_kind) 

2743 self.attrs.pandas_version = str(_version) 

2744 

2745 def copy(self) -> Fixed: 

2746 new_self = copy.copy(self) 

2747 return new_self 

2748 

2749 @property 

2750 def shape(self): 

2751 return self.nrows 

2752 

2753 @property 

2754 def pathname(self): 

2755 return self.group._v_pathname 

2756 

2757 @property 

2758 def _handle(self): 

2759 return self.parent._handle 

2760 

2761 @property 

2762 def _filters(self): 

2763 return self.parent._filters 

2764 

2765 @property 

2766 def _complevel(self) -> int: 

2767 return self.parent._complevel 

2768 

2769 @property 

2770 def _fletcher32(self) -> bool: 

2771 return self.parent._fletcher32 

2772 

2773 @property 

2774 def attrs(self): 

2775 return self.group._v_attrs 

2776 

2777 def set_attrs(self) -> None: 

2778 """set our object attributes""" 

2779 

2780 def get_attrs(self) -> None: 

2781 """get our object attributes""" 

2782 

2783 @property 

2784 def storable(self): 

2785 """return my storable""" 

2786 return self.group 

2787 

2788 @property 

2789 def is_exists(self) -> bool: 

2790 return False 

2791 

2792 @property 

2793 def nrows(self): 

2794 return getattr(self.storable, "nrows", None) 

2795 

2796 def validate(self, other) -> Literal[True] | None: 

2797 """validate against an existing storable""" 

2798 if other is None: 

2799 return None 

2800 return True 

2801 

2802 def validate_version(self, where=None) -> None: 

2803 """are we trying to operate on an old version?""" 

2804 

2805 def infer_axes(self) -> bool: 

2806 """ 

2807 infer the axes of my storer 

2808 return a boolean indicating if we have a valid storer or not 

2809 """ 

2810 s = self.storable 

2811 if s is None: 

2812 return False 

2813 self.get_attrs() 

2814 return True 

2815 

2816 def read( 

2817 self, 

2818 where=None, 

2819 columns=None, 

2820 start: int | None = None, 

2821 stop: int | None = None, 

2822 ): 

2823 raise NotImplementedError( 

2824 "cannot read on an abstract storer: subclasses should implement" 

2825 ) 

2826 

2827 def write(self, obj, **kwargs) -> None: 

2828 raise NotImplementedError( 

2829 "cannot write on an abstract storer: subclasses should implement" 

2830 ) 

2831 

2832 def delete( 

2833 self, where=None, start: int | None = None, stop: int | None = None 

2834 ) -> None: 

2835 """ 

2836 support fully deleting the node in its entirety (only) - where 

2837 specification must be None 

2838 """ 

2839 if com.all_none(where, start, stop): 

2840 self._handle.remove_node(self.group, recursive=True) 

2841 return None 

2842 

2843 raise TypeError("cannot delete on an abstract storer") 

2844 

2845 

2846class GenericFixed(Fixed): 

2847 """a generified fixed version""" 

2848 

2849 _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"} 

2850 _reverse_index_map = {v: k for k, v in _index_type_map.items()} 

2851 attributes: list[str] = [] 

2852 

2853 # indexer helpers 

2854 def _class_to_alias(self, cls) -> str: 

2855 return self._index_type_map.get(cls, "") 

2856 

2857 def _alias_to_class(self, alias): 

2858 if isinstance(alias, type): # pragma: no cover 

2859 # compat: for a short period of time master stored types 

2860 return alias 

2861 return self._reverse_index_map.get(alias, Index) 

2862 

2863 def _get_index_factory(self, attrs): 

2864 index_class = self._alias_to_class( 

2865 _ensure_decoded(getattr(attrs, "index_class", "")) 

2866 ) 

2867 

2868 factory: Callable 

2869 

2870 if index_class == DatetimeIndex: 

2871 

2872 def f(values, freq=None, tz=None): 

2873 # data are already in UTC, localize and convert if tz present 

2874 dta = DatetimeArray._simple_new( 

2875 values.values, dtype=values.dtype, freq=freq 

2876 ) 

2877 result = DatetimeIndex._simple_new(dta, name=None) 

2878 if tz is not None: 

2879 result = result.tz_localize("UTC").tz_convert(tz) 

2880 return result 

2881 

2882 factory = f 

2883 elif index_class == PeriodIndex: 

2884 

2885 def f(values, freq=None, tz=None): 

2886 dtype = PeriodDtype(freq) 

2887 parr = PeriodArray._simple_new(values, dtype=dtype) 

2888 return PeriodIndex._simple_new(parr, name=None) 

2889 

2890 factory = f 

2891 else: 

2892 factory = index_class 

2893 

2894 kwargs = {} 

2895 if "freq" in attrs: 

2896 kwargs["freq"] = attrs["freq"] 

2897 if index_class is Index: 

2898 # DTI/PI would be gotten by _alias_to_class 

2899 factory = TimedeltaIndex 

2900 

2901 if "tz" in attrs: 

2902 if isinstance(attrs["tz"], bytes): 

2903 # created by python2 

2904 kwargs["tz"] = attrs["tz"].decode("utf-8") 

2905 else: 

2906 # created by python3 

2907 kwargs["tz"] = attrs["tz"] 

2908 assert index_class is DatetimeIndex # just checking 

2909 

2910 return factory, kwargs 

2911 

2912 def validate_read(self, columns, where) -> None: 

2913 """ 

2914 raise if any keywords are passed which are not None

2915 """ 

2916 if columns is not None: 

2917 raise TypeError( 

2918 "cannot pass a column specification when reading " 

2919 "a Fixed format store. this store must be selected in its entirety" 

2920 ) 

2921 if where is not None: 

2922 raise TypeError( 

2923 "cannot pass a where specification when reading " 

2924 "from a Fixed format store. this store must be selected in its entirety" 

2925 ) 

2926 

2927 @property 

2928 def is_exists(self) -> bool: 

2929 return True 

2930 

2931 def set_attrs(self) -> None: 

2932 """set our object attributes""" 

2933 self.attrs.encoding = self.encoding 

2934 self.attrs.errors = self.errors 

2935 

2936 def get_attrs(self) -> None: 

2937 """retrieve our attributes""" 

2938 self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None)) 

2939 self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict")) 

2940 for n in self.attributes: 

2941 setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None))) 

2942 

2943 def write(self, obj, **kwargs) -> None: 

2944 self.set_attrs() 

2945 

2946 def read_array(self, key: str, start: int | None = None, stop: int | None = None): 

2947 """read an array for the specified node (off of group""" 

2948 import tables 

2949 

2950 node = getattr(self.group, key) 

2951 attrs = node._v_attrs 

2952 

2953 transposed = getattr(attrs, "transposed", False) 

2954 

2955 if isinstance(node, tables.VLArray): 

2956 ret = node[0][start:stop] 

2957 else: 

2958 dtype = _ensure_decoded(getattr(attrs, "value_type", None)) 

2959 shape = getattr(attrs, "shape", None) 

2960 

2961 if shape is not None: 

2962 # length 0 axis 

2963 ret = np.empty(shape, dtype=dtype) 

2964 else: 

2965 ret = node[start:stop] 

2966 

2967 if dtype and dtype.startswith("datetime64"): 

2968 # reconstruct a timezone if indicated 

2969 tz = getattr(attrs, "tz", None) 

2970 ret = _set_tz(ret, tz, coerce=True) 

2971 

2972 elif dtype == "timedelta64": 

2973 ret = np.asarray(ret, dtype="m8[ns]") 

2974 

2975 if transposed: 

2976 return ret.T 

2977 else: 

2978 return ret 

2979 

2980 def read_index( 

2981 self, key: str, start: int | None = None, stop: int | None = None 

2982 ) -> Index: 

2983 variety = _ensure_decoded(getattr(self.attrs, f"{key}_variety")) 

2984 

2985 if variety == "multi": 

2986 return self.read_multi_index(key, start=start, stop=stop) 

2987 elif variety == "regular": 

2988 node = getattr(self.group, key) 

2989 index = self.read_index_node(node, start=start, stop=stop) 

2990 return index 

2991 else: # pragma: no cover 

2992 raise TypeError(f"unrecognized index variety: {variety}") 

2993 

2994 def write_index(self, key: str, index: Index) -> None: 

2995 if isinstance(index, MultiIndex): 

2996 setattr(self.attrs, f"{key}_variety", "multi") 

2997 self.write_multi_index(key, index) 

2998 else: 

2999 setattr(self.attrs, f"{key}_variety", "regular") 

3000 converted = _convert_index("index", index, self.encoding, self.errors) 

3001 

3002 self.write_array(key, converted.values) 

3003 

3004 node = getattr(self.group, key) 

3005 node._v_attrs.kind = converted.kind 

3006 node._v_attrs.name = index.name 

3007 

3008 if isinstance(index, (DatetimeIndex, PeriodIndex)): 

3009 node._v_attrs.index_class = self._class_to_alias(type(index)) 

3010 

3011 if isinstance(index, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): 

3012 node._v_attrs.freq = index.freq 

3013 

3014 if isinstance(index, DatetimeIndex) and index.tz is not None: 

3015 node._v_attrs.tz = _get_tz(index.tz) 

3016 

3017 def write_multi_index(self, key: str, index: MultiIndex) -> None: 

3018 setattr(self.attrs, f"{key}_nlevels", index.nlevels) 

3019 

3020 for i, (lev, level_codes, name) in enumerate( 

3021 zip(index.levels, index.codes, index.names) 

3022 ): 

3023 # write the level 

3024 if isinstance(lev.dtype, ExtensionDtype): 

3025 raise NotImplementedError( 

3026 "Saving a MultiIndex with an extension dtype is not supported." 

3027 ) 

3028 level_key = f"{key}_level{i}" 

3029 conv_level = _convert_index(level_key, lev, self.encoding, self.errors) 

3030 self.write_array(level_key, conv_level.values) 

3031 node = getattr(self.group, level_key) 

3032 node._v_attrs.kind = conv_level.kind 

3033 node._v_attrs.name = name 

3034 

3035 # write the name 

3036 setattr(node._v_attrs, f"{key}_name{name}", name) 

3037 

3038 # write the labels 

3039 label_key = f"{key}_label{i}" 

3040 self.write_array(label_key, level_codes) 

3041 

3042 def read_multi_index( 

3043 self, key: str, start: int | None = None, stop: int | None = None 

3044 ) -> MultiIndex: 

3045 nlevels = getattr(self.attrs, f"{key}_nlevels") 

3046 

3047 levels = [] 

3048 codes = [] 

3049 names: list[Hashable] = [] 

3050 for i in range(nlevels): 

3051 level_key = f"{key}_level{i}" 

3052 node = getattr(self.group, level_key) 

3053 lev = self.read_index_node(node, start=start, stop=stop) 

3054 levels.append(lev) 

3055 names.append(lev.name) 

3056 

3057 label_key = f"{key}_label{i}" 

3058 level_codes = self.read_array(label_key, start=start, stop=stop) 

3059 codes.append(level_codes) 

3060 

3061 return MultiIndex( 

3062 levels=levels, codes=codes, names=names, verify_integrity=True 

3063 ) 

3064 

3065 def read_index_node( 

3066 self, node: Node, start: int | None = None, stop: int | None = None 

3067 ) -> Index: 

3068 data = node[start:stop] 

3069 # If the index was an empty array write_array_empty() will 

3070 # have written a sentinel. Here we replace it with the original. 

3071 if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0: 

3072 data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type) 

3073 kind = _ensure_decoded(node._v_attrs.kind) 

3074 name = None 

3075 

3076 if "name" in node._v_attrs: 

3077 name = _ensure_str(node._v_attrs.name) 

3078 name = _ensure_decoded(name) 

3079 

3080 attrs = node._v_attrs 

3081 factory, kwargs = self._get_index_factory(attrs) 

3082 

3083 if kind in ("date", "object"): 

3084 index = factory( 

3085 _unconvert_index( 

3086 data, kind, encoding=self.encoding, errors=self.errors 

3087 ), 

3088 dtype=object, 

3089 **kwargs, 

3090 ) 

3091 else: 

3092 index = factory( 

3093 _unconvert_index( 

3094 data, kind, encoding=self.encoding, errors=self.errors 

3095 ), 

3096 **kwargs, 

3097 ) 

3098 

3099 index.name = name 

3100 

3101 return index 

3102 

3103 def write_array_empty(self, key: str, value: ArrayLike) -> None: 

3104 """write a 0-len array""" 

3105 # ugly hack for length 0 axes 

3106 arr = np.empty((1,) * value.ndim) 

3107 self._handle.create_array(self.group, key, arr) 

3108 node = getattr(self.group, key) 

3109 node._v_attrs.value_type = str(value.dtype) 

3110 node._v_attrs.shape = value.shape 

3111 

3112 def write_array( 

3113 self, key: str, obj: AnyArrayLike, items: Index | None = None 

3114 ) -> None: 

3115 # TODO: we only have a few tests that get here, the only EA 

3116 # that gets passed is DatetimeArray, and we never have 

3117 # both self._filters and EA 

3118 

3119 value = extract_array(obj, extract_numpy=True) 

3120 

3121 if key in self.group: 

3122 self._handle.remove_node(self.group, key) 

3123 

3124 # Transform needed to interface with pytables row/col notation 

3125 empty_array = value.size == 0 

3126 transposed = False 

3127 

3128 if isinstance(value.dtype, CategoricalDtype): 

3129 raise NotImplementedError( 

3130 "Cannot store a category dtype in a HDF5 dataset that uses format=" 

3131 '"fixed". Use format="table".' 

3132 ) 

3133 if not empty_array: 

3134 if hasattr(value, "T"): 

3135 # ExtensionArrays (1d) may not have transpose. 

3136 value = value.T 

3137 transposed = True 

3138 

3139 atom = None 

3140 if self._filters is not None: 

3141 with suppress(ValueError): 

3142 # get the atom for this datatype 

3143 atom = _tables().Atom.from_dtype(value.dtype) 

3144 

3145 if atom is not None: 

3146 # We only get here if self._filters is non-None and 

3147 # the Atom.from_dtype call succeeded 

3148 

3149 # create an empty chunked array and fill it from value 

3150 if not empty_array: 

3151 ca = self._handle.create_carray( 

3152 self.group, key, atom, value.shape, filters=self._filters 

3153 ) 

3154 ca[:] = value 

3155 

3156 else: 

3157 self.write_array_empty(key, value) 

3158 

3159 elif value.dtype.type == np.object_: 

3160 # infer the type, warn if we have a non-string type here (for 

3161 # performance) 

3162 inferred_type = lib.infer_dtype(value, skipna=False) 

3163 if empty_array: 

3164 pass 

3165 elif inferred_type == "string": 

3166 pass 

3167 else: 

3168 ws = performance_doc % (inferred_type, key, items) 

3169 warnings.warn(ws, PerformanceWarning, stacklevel=find_stack_level()) 

3170 

3171 vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom()) 

3172 vlarr.append(value) 

3173 

3174 elif lib.is_np_dtype(value.dtype, "M"): 

3175 self._handle.create_array(self.group, key, value.view("i8")) 

3176 getattr(self.group, key)._v_attrs.value_type = str(value.dtype) 

3177 elif isinstance(value.dtype, DatetimeTZDtype): 

3178 # store as UTC 

3179 # with a zone 

3180 

3181 # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no 

3182 # attribute "asi8" 

3183 self._handle.create_array( 

3184 self.group, key, value.asi8 # type: ignore[union-attr] 

3185 ) 

3186 

3187 node = getattr(self.group, key) 

3188 # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no 

3189 # attribute "tz" 

3190 node._v_attrs.tz = _get_tz(value.tz) # type: ignore[union-attr] 

3191 node._v_attrs.value_type = f"datetime64[{value.dtype.unit}]" 

3192 elif lib.is_np_dtype(value.dtype, "m"): 

3193 self._handle.create_array(self.group, key, value.view("i8")) 

3194 getattr(self.group, key)._v_attrs.value_type = "timedelta64" 

3195 elif empty_array: 

3196 self.write_array_empty(key, value) 

3197 else: 

3198 self._handle.create_array(self.group, key, value) 

3199 

3200 getattr(self.group, key)._v_attrs.transposed = transposed 
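# --- Illustrative usage sketch (not part of pandas source) -------------------
# The CategoricalDtype check above means category data can only round-trip
# through the table format. Hypothetical frame with a categorical column:
#
#   >>> store.put("df", df_cat, format="fixed")   # raises NotImplementedError
#   >>> store.put("df", df_cat, format="table")   # ok
# -----------------------------------------------------------------------------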

3201 

3202 

3203class SeriesFixed(GenericFixed): 

3204 pandas_kind = "series" 

3205 attributes = ["name"] 

3206 

3207 name: Hashable 

3208 

3209 @property 

3210 def shape(self): 

3211 try: 

3212 return (len(self.group.values),) 

3213 except (TypeError, AttributeError): 

3214 return None 

3215 

3216 def read( 

3217 self, 

3218 where=None, 

3219 columns=None, 

3220 start: int | None = None, 

3221 stop: int | None = None, 

3222 ) -> Series: 

3223 self.validate_read(columns, where) 

3224 index = self.read_index("index", start=start, stop=stop) 

3225 values = self.read_array("values", start=start, stop=stop) 

3226 result = Series(values, index=index, name=self.name, copy=False) 

3227 if using_pyarrow_string_dtype() and is_string_array(values, skipna=True): 

3228 result = result.astype("string[pyarrow_numpy]") 

3229 return result 

3230 

3231 def write(self, obj, **kwargs) -> None: 

3232 super().write(obj, **kwargs) 

3233 self.write_index("index", obj.index) 

3234 self.write_array("values", obj) 

3235 self.attrs.name = obj.name 
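# --- Illustrative usage sketch (not part of pandas source) -------------------
# SeriesFixed handles the default fixed-format storage of a Series: fast
# whole-object reads, but no ``where``/``columns`` selection (see validate_read
# above). Key name below is hypothetical:
#
#   >>> store.put("s", pd.Series([1.0, 2.0], name="x"))   # format="fixed"
#   >>> store.get("s")                                     # full read only
# -----------------------------------------------------------------------------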

3236 

3237 

3238class BlockManagerFixed(GenericFixed): 

3239 attributes = ["ndim", "nblocks"] 

3240 

3241 nblocks: int 

3242 

3243 @property 

3244 def shape(self) -> Shape | None: 

3245 try: 

3246 ndim = self.ndim 

3247 

3248 # items 

3249 items = 0 

3250 for i in range(self.nblocks): 

3251 node = getattr(self.group, f"block{i}_items") 

3252 shape = getattr(node, "shape", None) 

3253 if shape is not None: 

3254 items += shape[0] 

3255 

3256 # data shape 

3257 node = self.group.block0_values 

3258 shape = getattr(node, "shape", None) 

3259 if shape is not None: 

3260 shape = list(shape[0 : (ndim - 1)]) 

3261 else: 

3262 shape = [] 

3263 

3264 shape.append(items) 

3265 

3266 return shape 

3267 except AttributeError: 

3268 return None 

3269 

3270 def read( 

3271 self, 

3272 where=None, 

3273 columns=None, 

3274 start: int | None = None, 

3275 stop: int | None = None, 

3276 ) -> DataFrame: 

3277 # start, stop applied to rows, so 0th axis only 

3278 self.validate_read(columns, where) 

3279 select_axis = self.obj_type()._get_block_manager_axis(0) 

3280 

3281 axes = [] 

3282 for i in range(self.ndim): 

3283 _start, _stop = (start, stop) if i == select_axis else (None, None) 

3284 ax = self.read_index(f"axis{i}", start=_start, stop=_stop) 

3285 axes.append(ax) 

3286 

3287 items = axes[0] 

3288 dfs = [] 

3289 

3290 for i in range(self.nblocks): 

3291 blk_items = self.read_index(f"block{i}_items") 

3292 values = self.read_array(f"block{i}_values", start=_start, stop=_stop) 

3293 

3294 columns = items[items.get_indexer(blk_items)] 

3295 df = DataFrame(values.T, columns=columns, index=axes[1], copy=False) 

3296 if using_pyarrow_string_dtype() and is_string_array(values, skipna=True): 

3297 df = df.astype("string[pyarrow_numpy]") 

3298 dfs.append(df) 

3299 

3300 if len(dfs) > 0: 

3301 out = concat(dfs, axis=1, copy=True) 

3302 if using_copy_on_write(): 

3303 # with CoW, concat ignores the copy keyword. Here, we still want 

3304 # to copy to enforce optimized column-major layout 

3305 out = out.copy() 

3306 out = out.reindex(columns=items, copy=False) 

3307 return out 

3308 

3309 return DataFrame(columns=axes[0], index=axes[1]) 

3310 

3311 def write(self, obj, **kwargs) -> None: 

3312 super().write(obj, **kwargs) 

3313 

3314 # TODO(ArrayManager) HDFStore relies on accessing the blocks 

3315 if isinstance(obj._mgr, ArrayManager): 

3316 obj = obj._as_manager("block") 

3317 

3318 data = obj._mgr 

3319 if not data.is_consolidated(): 

3320 data = data.consolidate() 

3321 

3322 self.attrs.ndim = data.ndim 

3323 for i, ax in enumerate(data.axes): 

3324 if i == 0 and (not ax.is_unique): 

3325 raise ValueError("Columns index has to be unique for fixed format") 

3326 self.write_index(f"axis{i}", ax) 

3327 

3328 # Supporting mixed-type DataFrame objects...nontrivial 

3329 self.attrs.nblocks = len(data.blocks) 

3330 for i, blk in enumerate(data.blocks): 

3331 # I have no idea why, but writing values before items fixed #2299 

3332 blk_items = data.items.take(blk.mgr_locs) 

3333 self.write_array(f"block{i}_values", blk.values, items=blk_items) 

3334 self.write_index(f"block{i}_items", blk_items) 

3335 

3336 

3337class FrameFixed(BlockManagerFixed): 

3338 pandas_kind = "frame" 

3339 obj_type = DataFrame 

3340 

3341 

3342class Table(Fixed): 

3343 """ 

3344 represent a table: 

3345 facilitate read/write of various types of tables 

3346 

3347 Attrs in Table Node 

3348 ------------------- 

3349 These are attributes that are stored in the main table node; they are

3350 necessary to recreate these tables when read back in. 

3351 

3352 index_axes : a list of tuples of the (original indexing axis and 

3353 index column) 

3354 non_index_axes: a list of tuples of the (original index axis and 

3355 columns on a non-indexing axis) 

3356 values_axes : a list of the columns which comprise the data of this 

3357 table 

3358 data_columns : a list of the columns that we are allowing indexing 

3359 (these become single columns in values_axes) 

3360 nan_rep : the string to use for nan representations for string 

3361 objects 

3362 levels : the names of levels 

3363 metadata : the names of the metadata columns 

3364 """ 

3365 

3366 pandas_kind = "wide_table" 

3367 format_type: str = "table" # GH#30962 needed by dask 

3368 table_type: str 

3369 levels: int | list[Hashable] = 1 

3370 is_table = True 

3371 

3372 metadata: list 

3373 

3374 def __init__( 

3375 self, 

3376 parent: HDFStore, 

3377 group: Node, 

3378 encoding: str | None = None, 

3379 errors: str = "strict", 

3380 index_axes: list[IndexCol] | None = None, 

3381 non_index_axes: list[tuple[AxisInt, Any]] | None = None, 

3382 values_axes: list[DataCol] | None = None, 

3383 data_columns: list | None = None, 

3384 info: dict | None = None, 

3385 nan_rep=None, 

3386 ) -> None: 

3387 super().__init__(parent, group, encoding=encoding, errors=errors) 

3388 self.index_axes = index_axes or [] 

3389 self.non_index_axes = non_index_axes or [] 

3390 self.values_axes = values_axes or [] 

3391 self.data_columns = data_columns or [] 

3392 self.info = info or {} 

3393 self.nan_rep = nan_rep 

3394 

3395 @property 

3396 def table_type_short(self) -> str: 

3397 return self.table_type.split("_")[0] 

3398 

3399 def __repr__(self) -> str: 

3400 """return a pretty representation of myself""" 

3401 self.infer_axes() 

3402 jdc = ",".join(self.data_columns) if len(self.data_columns) else "" 

3403 dc = f",dc->[{jdc}]" 

3404 

3405 ver = "" 

3406 if self.is_old_version: 

3407 jver = ".".join([str(x) for x in self.version]) 

3408 ver = f"[{jver}]" 

3409 

3410 jindex_axes = ",".join([a.name for a in self.index_axes]) 

3411 return ( 

3412 f"{self.pandas_type:12.12}{ver} " 

3413 f"(typ->{self.table_type_short},nrows->{self.nrows}," 

3414 f"ncols->{self.ncols},indexers->[{jindex_axes}]{dc})" 

3415 ) 

3416 

3417 def __getitem__(self, c: str): 

3418 """return the axis for c""" 

3419 for a in self.axes: 

3420 if c == a.name: 

3421 return a 

3422 return None 

3423 

3424 def validate(self, other) -> None: 

3425 """validate against an existing table""" 

3426 if other is None: 

3427 return 

3428 

3429 if other.table_type != self.table_type: 

3430 raise TypeError( 

3431 "incompatible table_type with existing " 

3432 f"[{other.table_type} - {self.table_type}]" 

3433 ) 

3434 

3435 for c in ["index_axes", "non_index_axes", "values_axes"]: 

3436 sv = getattr(self, c, None) 

3437 ov = getattr(other, c, None) 

3438 if sv != ov: 

3439 # show the error for the specific axes 

3440 # Argument 1 to "enumerate" has incompatible type 

3441 # "Optional[Any]"; expected "Iterable[Any]" [arg-type] 

3442 for i, sax in enumerate(sv): # type: ignore[arg-type] 

3443 # Value of type "Optional[Any]" is not indexable [index] 

3444 oax = ov[i] # type: ignore[index] 

3445 if sax != oax: 

3446 raise ValueError( 

3447 f"invalid combination of [{c}] on appending data " 

3448 f"[{sax}] vs current table [{oax}]" 

3449 ) 

3450 

3451 # should never get here 

3452 raise Exception( 

3453 f"invalid combination of [{c}] on appending data [{sv}] vs " 

3454 f"current table [{ov}]" 

3455 ) 

3456 

3457 @property 

3458 def is_multi_index(self) -> bool: 

3459 """the levels attribute is 1 or a list in the case of a multi-index""" 

3460 return isinstance(self.levels, list) 

3461 

3462 def validate_multiindex( 

3463 self, obj: DataFrame | Series 

3464 ) -> tuple[DataFrame, list[Hashable]]: 

3465 """ 

3466 validate that we can store the multi-index; reset and return the 

3467 new object 

3468 """ 

3469 levels = com.fill_missing_names(obj.index.names) 

3470 try: 

3471 reset_obj = obj.reset_index() 

3472 except ValueError as err: 

3473 raise ValueError( 

3474 "duplicate names/columns in the multi-index when storing as a table" 

3475 ) from err 

3476 assert isinstance(reset_obj, DataFrame) # for mypy 

3477 return reset_obj, levels 

3478 

3479 @property 

3480 def nrows_expected(self) -> int: 

3481 """based on our axes, compute the expected nrows""" 

3482 return np.prod([i.cvalues.shape[0] for i in self.index_axes]) 

3483 

3484 @property 

3485 def is_exists(self) -> bool: 

3486 """has this table been created""" 

3487 return "table" in self.group 

3488 

3489 @property 

3490 def storable(self): 

3491 return getattr(self.group, "table", None) 

3492 

3493 @property 

3494 def table(self): 

3495 """return the table group (this is my storable)""" 

3496 return self.storable 

3497 

3498 @property 

3499 def dtype(self): 

3500 return self.table.dtype 

3501 

3502 @property 

3503 def description(self): 

3504 return self.table.description 

3505 

3506 @property 

3507 def axes(self) -> itertools.chain[IndexCol]: 

3508 return itertools.chain(self.index_axes, self.values_axes) 

3509 

3510 @property 

3511 def ncols(self) -> int: 

3512 """the number of total columns in the values axes""" 

3513 return sum(len(a.values) for a in self.values_axes) 

3514 

3515 @property 

3516 def is_transposed(self) -> bool: 

3517 return False 

3518 

3519 @property 

3520 def data_orientation(self) -> tuple[int, ...]: 

3521 """return a tuple of my permutated axes, non_indexable at the front""" 

3522 return tuple( 

3523 itertools.chain( 

3524 [int(a[0]) for a in self.non_index_axes], 

3525 [int(a.axis) for a in self.index_axes], 

3526 ) 

3527 ) 

3528 

3529 def queryables(self) -> dict[str, Any]: 

3530 """return a dict of the kinds allowable columns for this object""" 

3531 # mypy doesn't recognize DataFrame._AXIS_NAMES, so we re-write it here 

3532 axis_names = {0: "index", 1: "columns"} 

3533 

3534 # compute the values_axes queryables 

3535 d1 = [(a.cname, a) for a in self.index_axes] 

3536 d2 = [(axis_names[axis], None) for axis, values in self.non_index_axes] 

3537 d3 = [ 

3538 (v.cname, v) for v in self.values_axes if v.name in set(self.data_columns) 

3539 ] 

3540 

3541 return dict(d1 + d2 + d3) 
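# --- Illustrative usage sketch (not part of pandas source) -------------------
# Only the index axes and the columns written as data_columns are queryable, so
# a ``where`` clause may reference them but not an ordinary values column
# (column names below are hypothetical):
#
#   >>> store.append("df", df, data_columns=["A"])
#   >>> store.select("df", where="A > 0")        # ok: A is a data column
#   >>> store.select("df", where="index > 5")    # ok: the index is always queryable
# -----------------------------------------------------------------------------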

3542 

3543 def index_cols(self): 

3544 """return a list of my index cols""" 

3545 # Note: each `i.cname` below is assured to be a str. 

3546 return [(i.axis, i.cname) for i in self.index_axes] 

3547 

3548 def values_cols(self) -> list[str]: 

3549 """return a list of my values cols""" 

3550 return [i.cname for i in self.values_axes] 

3551 

3552 def _get_metadata_path(self, key: str) -> str: 

3553 """return the metadata pathname for this key""" 

3554 group = self.group._v_pathname 

3555 return f"{group}/meta/{key}/meta" 

3556 

3557 def write_metadata(self, key: str, values: np.ndarray) -> None: 

3558 """ 

3559 Write out a metadata array to the key as a fixed-format Series. 

3560 

3561 Parameters 

3562 ---------- 

3563 key : str 

3564 values : ndarray 

3565 """ 

3566 self.parent.put( 

3567 self._get_metadata_path(key), 

3568 Series(values, copy=False), 

3569 format="table", 

3570 encoding=self.encoding, 

3571 errors=self.errors, 

3572 nan_rep=self.nan_rep, 

3573 ) 

3574 

3575 def read_metadata(self, key: str): 

3576 """return the meta data array for this key""" 

3577 if getattr(getattr(self.group, "meta", None), key, None) is not None: 

3578 return self.parent.select(self._get_metadata_path(key)) 

3579 return None 

3580 

3581 def set_attrs(self) -> None: 

3582 """set our table type & indexables""" 

3583 self.attrs.table_type = str(self.table_type) 

3584 self.attrs.index_cols = self.index_cols() 

3585 self.attrs.values_cols = self.values_cols() 

3586 self.attrs.non_index_axes = self.non_index_axes 

3587 self.attrs.data_columns = self.data_columns 

3588 self.attrs.nan_rep = self.nan_rep 

3589 self.attrs.encoding = self.encoding 

3590 self.attrs.errors = self.errors 

3591 self.attrs.levels = self.levels 

3592 self.attrs.info = self.info 

3593 

3594 def get_attrs(self) -> None: 

3595 """retrieve our attributes""" 

3596 self.non_index_axes = getattr(self.attrs, "non_index_axes", None) or [] 

3597 self.data_columns = getattr(self.attrs, "data_columns", None) or [] 

3598 self.info = getattr(self.attrs, "info", None) or {} 

3599 self.nan_rep = getattr(self.attrs, "nan_rep", None) 

3600 self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None)) 

3601 self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict")) 

3602 self.levels: list[Hashable] = getattr(self.attrs, "levels", None) or [] 

3603 self.index_axes = [a for a in self.indexables if a.is_an_indexable] 

3604 self.values_axes = [a for a in self.indexables if not a.is_an_indexable] 

3605 

3606 def validate_version(self, where=None) -> None: 

3607 """are we trying to operate on an old version?""" 

3608 if where is not None: 

3609 if self.is_old_version: 

3610 ws = incompatibility_doc % ".".join([str(x) for x in self.version]) 

3611 warnings.warn( 

3612 ws, 

3613 IncompatibilityWarning, 

3614 stacklevel=find_stack_level(), 

3615 ) 

3616 

3617 def validate_min_itemsize(self, min_itemsize) -> None: 

3618 """ 

3619 validate the min_itemsize doesn't contain items that are not in the 

3620 axes this needs data_columns to be defined 

3621 """ 

3622 if min_itemsize is None: 

3623 return 

3624 if not isinstance(min_itemsize, dict): 

3625 return 

3626 

3627 q = self.queryables() 

3628 for k in min_itemsize: 

3629 # ok, apply generally 

3630 if k == "values": 

3631 continue 

3632 if k not in q: 

3633 raise ValueError( 

3634 f"min_itemsize has the key [{k}] which is not an axis or " 

3635 "data_column" 

3636 ) 

3637 

3638 @cache_readonly 

3639 def indexables(self): 

3640 """create/cache the indexables if they don't exist""" 

3641 _indexables = [] 

3642 

3643 desc = self.description 

3644 table_attrs = self.table.attrs 

3645 

3646 # Note: each of the `name` kwargs below is a str, ensured 

3647 # by the definition in index_cols. 

3648 # index columns 

3649 for i, (axis, name) in enumerate(self.attrs.index_cols): 

3650 atom = getattr(desc, name) 

3651 md = self.read_metadata(name) 

3652 meta = "category" if md is not None else None 

3653 

3654 kind_attr = f"{name}_kind" 

3655 kind = getattr(table_attrs, kind_attr, None) 

3656 

3657 index_col = IndexCol( 

3658 name=name, 

3659 axis=axis, 

3660 pos=i, 

3661 kind=kind, 

3662 typ=atom, 

3663 table=self.table, 

3664 meta=meta, 

3665 metadata=md, 

3666 ) 

3667 _indexables.append(index_col) 

3668 

3669 # values columns 

3670 dc = set(self.data_columns) 

3671 base_pos = len(_indexables) 

3672 

3673 def f(i, c): 

3674 assert isinstance(c, str) 

3675 klass = DataCol 

3676 if c in dc: 

3677 klass = DataIndexableCol 

3678 

3679 atom = getattr(desc, c) 

3680 adj_name = _maybe_adjust_name(c, self.version) 

3681 

3682 # TODO: why kind_attr here? 

3683 values = getattr(table_attrs, f"{adj_name}_kind", None) 

3684 dtype = getattr(table_attrs, f"{adj_name}_dtype", None) 

3685 # Argument 1 to "_dtype_to_kind" has incompatible type 

3686 # "Optional[Any]"; expected "str" [arg-type] 

3687 kind = _dtype_to_kind(dtype) # type: ignore[arg-type] 

3688 

3689 md = self.read_metadata(c) 

3690 # TODO: figure out why these two versions of `meta` don't always match. 

3691 # meta = "category" if md is not None else None 

3692 meta = getattr(table_attrs, f"{adj_name}_meta", None) 

3693 

3694 obj = klass( 

3695 name=adj_name, 

3696 cname=c, 

3697 values=values, 

3698 kind=kind, 

3699 pos=base_pos + i, 

3700 typ=atom, 

3701 table=self.table, 

3702 meta=meta, 

3703 metadata=md, 

3704 dtype=dtype, 

3705 ) 

3706 return obj 

3707 

3708 # Note: the definition of `values_cols` ensures that each 

3709 # `c` below is a str. 

3710 _indexables.extend([f(i, c) for i, c in enumerate(self.attrs.values_cols)]) 

3711 

3712 return _indexables 

3713 

3714 def create_index( 

3715 self, columns=None, optlevel=None, kind: str | None = None 

3716 ) -> None: 

3717 """ 

3718 Create a pytables index on the specified columns. 

3719 

3720 Parameters 

3721 ---------- 

3722 columns : None, bool, or listlike[str] 

3723 Indicate which columns to create an index on. 

3724 

3725 * False : Do not create any indexes. 

3726 * True : Create indexes on all columns. 

3727 * None : Create indexes on all columns. 

3728 * listlike : Create indexes on the given columns. 

3729 

3730 optlevel : int or None, default None 

3731 Optimization level, if None, pytables defaults to 6. 

3732 kind : str or None, default None 

3733 Kind of index, if None, pytables defaults to "medium". 

3734 

3735 Raises 

3736 ------ 

3737 TypeError if trying to create an index on a complex-type column. 

3738 

3739 Notes 

3740 ----- 

3741 Cannot index Time64Col or ComplexCol. 

3742 Pytables must be >= 3.0. 

3743 """ 

3744 if not self.infer_axes(): 

3745 return 

3746 if columns is False: 

3747 return 

3748 

3749 # index all indexables and data_columns 

3750 if columns is None or columns is True: 

3751 columns = [a.cname for a in self.axes if a.is_data_indexable] 

3752 if not isinstance(columns, (tuple, list)): 

3753 columns = [columns] 

3754 

3755 kw = {} 

3756 if optlevel is not None: 

3757 kw["optlevel"] = optlevel 

3758 if kind is not None: 

3759 kw["kind"] = kind 

3760 

3761 table = self.table 

3762 for c in columns: 

3763 v = getattr(table.cols, c, None) 

3764 if v is not None: 

3765 # remove the index if the kind/optlevel have changed 

3766 if v.is_indexed: 

3767 index = v.index 

3768 cur_optlevel = index.optlevel 

3769 cur_kind = index.kind 

3770 

3771 if kind is not None and cur_kind != kind: 

3772 v.remove_index() 

3773 else: 

3774 kw["kind"] = cur_kind 

3775 

3776 if optlevel is not None and cur_optlevel != optlevel: 

3777 v.remove_index() 

3778 else: 

3779 kw["optlevel"] = cur_optlevel 

3780 

3781 # create the index 

3782 if not v.is_indexed: 

3783 if v.type.startswith("complex"): 

3784 raise TypeError( 

3785 "Columns containing complex values can be stored but " 

3786 "cannot be indexed when using table format. Either use " 

3787 "fixed format, set index=False, or do not include " 

3788 "the columns containing complex values to " 

3789 "data_columns when initializing the table." 

3790 ) 

3791 v.create_index(**kw) 

3792 elif c in self.non_index_axes[0][1]: 

3793 # GH 28156 

3794 raise AttributeError( 

3795 f"column {c} is not a data_column.\n" 

3796 f"In order to read column {c} you must reload the dataframe \n" 

3797 f"into HDFStore and include {c} with the data_columns argument." 

3798 ) 

3799 
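create_index is normally reached through the public HDFStore.create_table_index API. A hedged usage sketch follows, assuming PyTables is installed; the file name, key, and column names are made up for illustration.

import pandas as pd

df = pd.DataFrame({"A": range(5), "B": list("abcde")})
with pd.HDFStore("example.h5", mode="w") as store:
    # write as a table with "A" queryable, deferring PyTables index creation
    store.put("df", df, format="table", data_columns=["A"], index=False)
    # build a "full" index on all indexables and data columns
    store.create_table_index("df", columns=True, optlevel=9, kind="full")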

3800 def _read_axes( 

3801 self, where, start: int | None = None, stop: int | None = None 

3802 ) -> list[tuple[np.ndarray, np.ndarray] | tuple[Index, Index]]: 

3803 """ 

3804 Create the axes sniffed from the table. 

3805 

3806 Parameters 

3807 ---------- 

3808 where : list of Terms (or convertible to), optional 

3809 start : int or None, default None 

3810 stop : int or None, default None 

3811 

3812 Returns 

3813 ------- 

3814 List[Tuple[index_values, column_values]] 

3815 """ 

3816 # create the selection 

3817 selection = Selection(self, where=where, start=start, stop=stop) 

3818 values = selection.select() 

3819 

3820 results = [] 

3821 # convert the data 

3822 for a in self.axes: 

3823 a.set_info(self.info) 

3824 res = a.convert( 

3825 values, 

3826 nan_rep=self.nan_rep, 

3827 encoding=self.encoding, 

3828 errors=self.errors, 

3829 ) 

3830 results.append(res) 

3831 

3832 return results 

3833 

3834 @classmethod 

3835 def get_object(cls, obj, transposed: bool): 

3836 """return the data for this obj""" 

3837 return obj 

3838 

3839 def validate_data_columns(self, data_columns, min_itemsize, non_index_axes): 

3840 """ 

3841 take the input data_columns and min_itemsize and create a data 

3842 columns spec 

3843 """ 

3844 if not len(non_index_axes): 

3845 return [] 

3846 

3847 axis, axis_labels = non_index_axes[0] 

3848 info = self.info.get(axis, {}) 

3849 if info.get("type") == "MultiIndex" and data_columns: 

3850 raise ValueError( 

3851 f"cannot use a multi-index on axis [{axis}] with " 

3852 f"data_columns {data_columns}" 

3853 ) 

3854 

3855 # evaluate the passed data_columns, True == use all columns 

3856 # take only valid axis labels 

3857 if data_columns is True: 

3858 data_columns = list(axis_labels) 

3859 elif data_columns is None: 

3860 data_columns = [] 

3861 

3862 # if min_itemsize is a dict, add the keys (exclude 'values') 

3863 if isinstance(min_itemsize, dict): 

3864 existing_data_columns = set(data_columns) 

3865 data_columns = list(data_columns) # ensure we do not modify 

3866 data_columns.extend( 

3867 [ 

3868 k 

3869 for k in min_itemsize.keys() 

3870 if k != "values" and k not in existing_data_columns 

3871 ] 

3872 ) 

3873 

3874 # return valid columns in the order of our axis 

3875 return [c for c in data_columns if c in axis_labels] 

3876 

3877 def _create_axes( 

3878 self, 

3879 axes, 

3880 obj: DataFrame, 

3881 validate: bool = True, 

3882 nan_rep=None, 

3883 data_columns=None, 

3884 min_itemsize=None, 

3885 ): 

3886 """ 

3887 Create and return the axes. 

3888 

3889 Parameters 

3890 ---------- 

3891 axes: list or None 

3892 The names or numbers of the axes to create. 

3893 obj : DataFrame 

3894 The object to create axes on. 

3895 validate: bool, default True 

3896 Whether to validate the obj against an existing object already written. 

3897 nan_rep : 

3898 A value to use for string column nan_rep. 

3899 data_columns : List[str], True, or None, default None 

3900 Specify the columns that we want to create to allow indexing on. 

3901 

3902 * True : Use all available columns. 

3903 * None : Use no columns. 

3904 * List[str] : Use the specified columns. 

3905 

3906 min_itemsize: Dict[str, int] or None, default None 

3907 The min itemsize for a column in bytes. 

3908 """ 

3909 if not isinstance(obj, DataFrame): 

3910 group = self.group._v_name 

3911 raise TypeError( 

3912 f"cannot properly create the storer for: [group->{group}," 

3913 f"value->{type(obj)}]" 

3914 ) 

3915 

3916 # set the default axes if needed 

3917 if axes is None: 

3918 axes = [0] 

3919 

3920 # map axes to numbers 

3921 axes = [obj._get_axis_number(a) for a in axes] 

3922 

3923 # do we have an existing table (if so, use its axes & data_columns) 

3924 if self.infer_axes(): 

3925 table_exists = True 

3926 axes = [a.axis for a in self.index_axes] 

3927 data_columns = list(self.data_columns) 

3928 nan_rep = self.nan_rep 

3929 # TODO: do we always have validate=True here? 

3930 else: 

3931 table_exists = False 

3932 

3933 new_info = self.info 

3934 

3935 assert self.ndim == 2 # with next check, we must have len(axes) == 1 

3936 # currently only support ndim-1 indexers 

3937 if len(axes) != self.ndim - 1: 

3938 raise ValueError( 

3939 "currently only support ndim-1 indexers in an AppendableTable" 

3940 ) 

3941 

3942 # create according to the new data 

3943 new_non_index_axes: list = [] 

3944 

3945 # nan_representation 

3946 if nan_rep is None: 

3947 nan_rep = "nan" 

3948 

3949 # We construct the non-index-axis first, since that alters new_info 

3950 idx = next(x for x in [0, 1] if x not in axes) 

3951 

3952 a = obj.axes[idx] 

3953 # we might be able to change the axes on the appending data if necessary 

3954 append_axis = list(a) 

3955 if table_exists: 

3956 indexer = len(new_non_index_axes) # i.e. 0 

3957 exist_axis = self.non_index_axes[indexer][1] 

3958 if not array_equivalent( 

3959 np.array(append_axis), 

3960 np.array(exist_axis), 

3961 strict_nan=True, 

3962 dtype_equal=True, 

3963 ): 

3964 # ahah! -> reindex 

3965 if array_equivalent( 

3966 np.array(sorted(append_axis)), 

3967 np.array(sorted(exist_axis)), 

3968 strict_nan=True, 

3969 dtype_equal=True, 

3970 ): 

3971 append_axis = exist_axis 

3972 

3973 # the non_index_axes info 

3974 info = new_info.setdefault(idx, {}) 

3975 info["names"] = list(a.names) 

3976 info["type"] = type(a).__name__ 

3977 

3978 new_non_index_axes.append((idx, append_axis)) 

3979 

3980 # Now we can construct our new index axis 

3981 idx = axes[0] 

3982 a = obj.axes[idx] 

3983 axis_name = obj._get_axis_name(idx) 

3984 new_index = _convert_index(axis_name, a, self.encoding, self.errors) 

3985 new_index.axis = idx 

3986 

3987 # Because we are always 2D, there is only one new_index, so 

3988 # we know it will have pos=0 

3989 new_index.set_pos(0) 

3990 new_index.update_info(new_info) 

3991 new_index.maybe_set_size(min_itemsize) # check for column conflicts 

3992 

3993 new_index_axes = [new_index] 

3994 j = len(new_index_axes) # i.e. 1 

3995 assert j == 1 

3996 

3997 # reindex by our non_index_axes & compute data_columns 

3998 assert len(new_non_index_axes) == 1 

3999 for a in new_non_index_axes: 

4000 obj = _reindex_axis(obj, a[0], a[1]) 

4001 

4002 transposed = new_index.axis == 1 

4003 

4004 # figure out data_columns and get out blocks 

4005 data_columns = self.validate_data_columns( 

4006 data_columns, min_itemsize, new_non_index_axes 

4007 ) 

4008 

4009 frame = self.get_object(obj, transposed)._consolidate() 

4010 

4011 blocks, blk_items = self._get_blocks_and_items( 

4012 frame, table_exists, new_non_index_axes, self.values_axes, data_columns 

4013 ) 

4014 

4015 # add my values 

4016 vaxes = [] 

4017 for i, (blk, b_items) in enumerate(zip(blocks, blk_items)): 

4018 # the shape of the data column is given by the indexable axes 

4019 klass = DataCol 

4020 name = None 

4021 

4022 # we have a data_column 

4023 if data_columns and len(b_items) == 1 and b_items[0] in data_columns: 

4024 klass = DataIndexableCol 

4025 name = b_items[0] 

4026 if not (name is None or isinstance(name, str)): 

4027 # TODO: should the message here be more specifically non-str? 

4028 raise ValueError("cannot have non-object label DataIndexableCol") 

4029 

4030 # make sure that we match up the existing columns 

4031 # if we have an existing table 

4032 existing_col: DataCol | None 

4033 

4034 if table_exists and validate: 

4035 try: 

4036 existing_col = self.values_axes[i] 

4037 except (IndexError, KeyError) as err: 

4038 raise ValueError( 

4039 f"Incompatible appended table [{blocks}]" 

4040 f"with existing table [{self.values_axes}]" 

4041 ) from err 

4042 else: 

4043 existing_col = None 

4044 

4045 new_name = name or f"values_block_{i}" 

4046 data_converted = _maybe_convert_for_string_atom( 

4047 new_name, 

4048 blk.values, 

4049 existing_col=existing_col, 

4050 min_itemsize=min_itemsize, 

4051 nan_rep=nan_rep, 

4052 encoding=self.encoding, 

4053 errors=self.errors, 

4054 columns=b_items, 

4055 ) 

4056 adj_name = _maybe_adjust_name(new_name, self.version) 

4057 

4058 typ = klass._get_atom(data_converted) 

4059 kind = _dtype_to_kind(data_converted.dtype.name) 

4060 tz = None 

4061 if getattr(data_converted, "tz", None) is not None: 

4062 tz = _get_tz(data_converted.tz) 

4063 

4064 meta = metadata = ordered = None 

4065 if isinstance(data_converted.dtype, CategoricalDtype): 

4066 ordered = data_converted.ordered 

4067 meta = "category" 

4068 metadata = np.asarray(data_converted.categories).ravel() 

4069 

4070 data, dtype_name = _get_data_and_dtype_name(data_converted) 

4071 

4072 col = klass( 

4073 name=adj_name, 

4074 cname=new_name, 

4075 values=list(b_items), 

4076 typ=typ, 

4077 pos=j, 

4078 kind=kind, 

4079 tz=tz, 

4080 ordered=ordered, 

4081 meta=meta, 

4082 metadata=metadata, 

4083 dtype=dtype_name, 

4084 data=data, 

4085 ) 

4086 col.update_info(new_info) 

4087 

4088 vaxes.append(col) 

4089 

4090 j += 1 

4091 

4092 dcs = [col.name for col in vaxes if col.is_data_indexable] 

4093 

4094 new_table = type(self)( 

4095 parent=self.parent, 

4096 group=self.group, 

4097 encoding=self.encoding, 

4098 errors=self.errors, 

4099 index_axes=new_index_axes, 

4100 non_index_axes=new_non_index_axes, 

4101 values_axes=vaxes, 

4102 data_columns=dcs, 

4103 info=new_info, 

4104 nan_rep=nan_rep, 

4105 ) 

4106 if hasattr(self, "levels"): 

4107 # TODO: get this into constructor, only for appropriate subclass 

4108 new_table.levels = self.levels 

4109 

4110 new_table.validate_min_itemsize(min_itemsize) 

4111 

4112 if validate and table_exists: 

4113 new_table.validate(self) 

4114 

4115 return new_table 

4116 

4117 @staticmethod 

4118 def _get_blocks_and_items( 

4119 frame: DataFrame, 

4120 table_exists: bool, 

4121 new_non_index_axes, 

4122 values_axes, 

4123 data_columns, 

4124 ): 

4125 # Helper to clarify non-state-altering parts of _create_axes 

4126 

4127 # TODO(ArrayManager) HDFStore relies on accessing the blocks 

4128 if isinstance(frame._mgr, ArrayManager): 

4129 frame = frame._as_manager("block") 

4130 

4131 def get_blk_items(mgr): 

4132 return [mgr.items.take(blk.mgr_locs) for blk in mgr.blocks] 

4133 

4134 mgr = frame._mgr 

4135 mgr = cast(BlockManager, mgr) 

4136 blocks: list[Block] = list(mgr.blocks) 

4137 blk_items: list[Index] = get_blk_items(mgr) 

4138 

4139 if len(data_columns): 

4140 # TODO: prove that we only get here with axis == 1? 

4141 # It is the case in all extant tests, but NOT the case 

4142 # outside this `if len(data_columns)` check. 

4143 

4144 axis, axis_labels = new_non_index_axes[0] 

4145 new_labels = Index(axis_labels).difference(Index(data_columns)) 

4146 mgr = frame.reindex(new_labels, axis=axis)._mgr 

4147 mgr = cast(BlockManager, mgr) 

4148 

4149 blocks = list(mgr.blocks) 

4150 blk_items = get_blk_items(mgr) 

4151 for c in data_columns: 

4152 # This reindex would raise ValueError if we had a duplicate 

4153 # index, so we can infer that (as long as axis==1) we 

4154 # get a single column back, so a single block. 

4155 mgr = frame.reindex([c], axis=axis)._mgr 

4156 mgr = cast(BlockManager, mgr) 

4157 blocks.extend(mgr.blocks) 

4158 blk_items.extend(get_blk_items(mgr)) 

4159 

4160 # reorder the blocks in the same order as the existing table if we can 

4161 if table_exists: 

4162 by_items = { 

4163 tuple(b_items.tolist()): (b, b_items) 

4164 for b, b_items in zip(blocks, blk_items) 

4165 } 

4166 new_blocks: list[Block] = [] 

4167 new_blk_items = [] 

4168 for ea in values_axes: 

4169 items = tuple(ea.values) 

4170 try: 

4171 b, b_items = by_items.pop(items) 

4172 new_blocks.append(b) 

4173 new_blk_items.append(b_items) 

4174 except (IndexError, KeyError) as err: 

4175 jitems = ",".join([pprint_thing(item) for item in items]) 

4176 raise ValueError( 

4177 f"cannot match existing table structure for [{jitems}] " 

4178 "on appending data" 

4179 ) from err 

4180 blocks = new_blocks 

4181 blk_items = new_blk_items 

4182 

4183 return blocks, blk_items 

4184 

4185 def process_axes(self, obj, selection: Selection, columns=None) -> DataFrame: 

4186 """process axes filters""" 

4187 # make a copy to avoid side effects 

4188 if columns is not None: 

4189 columns = list(columns) 

4190 

4191 # make sure to include levels if we have them 

4192 if columns is not None and self.is_multi_index: 

4193 assert isinstance(self.levels, list) # assured by is_multi_index 

4194 for n in self.levels: 

4195 if n not in columns: 

4196 columns.insert(0, n) 

4197 

4198 # reorder by any non_index_axes & limit to the select columns 

4199 for axis, labels in self.non_index_axes: 

4200 obj = _reindex_axis(obj, axis, labels, columns) 

4201 

4202 def process_filter(field, filt, op): 

4203 for axis_name in obj._AXIS_ORDERS: 

4204 axis_number = obj._get_axis_number(axis_name) 

4205 axis_values = obj._get_axis(axis_name) 

4206 assert axis_number is not None 

4207 

4208 # see if the field is the name of an axis 

4209 if field == axis_name: 

4210 # if we have a multi-index, then need to include 

4211 # the levels 

4212 if self.is_multi_index: 

4213 filt = filt.union(Index(self.levels)) 

4214 

4215 takers = op(axis_values, filt) 

4216 return obj.loc(axis=axis_number)[takers] 

4217 

4218 # this might be the name of a field IN an axis 

4219 elif field in axis_values: 

4220 # we need to filter on this dimension 

4221 values = ensure_index(getattr(obj, field).values) 

4222 filt = ensure_index(filt) 

4223 

4224 # hack until we support reversed dim flags 

4225 if isinstance(obj, DataFrame): 

4226 axis_number = 1 - axis_number 

4227 

4228 takers = op(values, filt) 

4229 return obj.loc(axis=axis_number)[takers] 

4230 

4231 raise ValueError(f"cannot find the field [{field}] for filtering!") 

4232 

4233 # apply the selection filters (but keep in the same order) 

4234 if selection.filter is not None: 

4235 for field, op, filt in selection.filter.format(): 

4236 obj = process_filter(field, filt, op) 

4237 

4238 return obj 

4239 

4240 def create_description( 

4241 self, 

4242 complib, 

4243 complevel: int | None, 

4244 fletcher32: bool, 

4245 expectedrows: int | None, 

4246 ) -> dict[str, Any]: 

4247 """create the description of the table from the axes & values""" 

4248 # use the provided expectedrows if passed, otherwise estimate 

4249 if expectedrows is None: 

4250 expectedrows = max(self.nrows_expected, 10000) 

4251 

4252 d = {"name": "table", "expectedrows": expectedrows} 

4253 

4254 # description from the axes & values 

4255 d["description"] = {a.cname: a.typ for a in self.axes} 

4256 

4257 if complib: 

4258 if complevel is None: 

4259 complevel = self._complevel or 9 

4260 filters = _tables().Filters( 

4261 complevel=complevel, 

4262 complib=complib, 

4263 fletcher32=fletcher32 or self._fletcher32, 

4264 ) 

4265 d["filters"] = filters 

4266 elif self._filters is not None: 

4267 d["filters"] = self._filters 

4268 

4269 return d 

4270 

4271 def read_coordinates( 

4272 self, where=None, start: int | None = None, stop: int | None = None 

4273 ): 

4274 """ 

4275 select coordinates (row numbers) from a table; return the 

4276 coordinates object 

4277 """ 

4278 # validate the version 

4279 self.validate_version(where) 

4280 

4281 # infer the data kind 

4282 if not self.infer_axes(): 

4283 return False 

4284 

4285 # create the selection 

4286 selection = Selection(self, where=where, start=start, stop=stop) 

4287 coords = selection.select_coords() 

4288 if selection.filter is not None: 

4289 for field, op, filt in selection.filter.format(): 

4290 data = self.read_column( 

4291 field, start=coords.min(), stop=coords.max() + 1 

4292 ) 

4293 coords = coords[op(data.iloc[coords - coords.min()], filt).values] 

4294 

4295 return Index(coords) 

4296 

4297 def read_column( 

4298 self, 

4299 column: str, 

4300 where=None, 

4301 start: int | None = None, 

4302 stop: int | None = None, 

4303 ): 

4304 """ 

4305 return a single column from the table, generally only indexables 

4306 are interesting 

4307 """ 

4308 # validate the version 

4309 self.validate_version() 

4310 

4311 # infer the data kind 

4312 if not self.infer_axes(): 

4313 return False 

4314 

4315 if where is not None: 

4316 raise TypeError("read_column does not currently accept a where clause") 

4317 

4318 # find the axes 

4319 for a in self.axes: 

4320 if column == a.name: 

4321 if not a.is_data_indexable: 

4322 raise ValueError( 

4323 f"column [{column}] can not be extracted individually; " 

4324 "it is not data indexable" 

4325 ) 

4326 

4327 # column must be an indexable or a data column 

4328 c = getattr(self.table.cols, column) 

4329 a.set_info(self.info) 

4330 col_values = a.convert( 

4331 c[start:stop], 

4332 nan_rep=self.nan_rep, 

4333 encoding=self.encoding, 

4334 errors=self.errors, 

4335 ) 

4336 return Series(_set_tz(col_values[1], a.tz), name=column, copy=False) 

4337 

4338 raise KeyError(f"column [{column}] not found in the table") 

4339 
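read_column backs the public HDFStore.select_column API, which only works for indexables and data columns. A hedged sketch with made-up file, key, and column names:

import pandas as pd

df = pd.DataFrame({"A": range(5), "B": list("abcde")})
with pd.HDFStore("example.h5", mode="w") as store:
    store.put("df", df, format="table", data_columns=["A"])
    idx = store.select_column("df", "index")   # the stored index, as a Series
    a = store.select_column("df", "A")         # a data column
    # store.select_column("df", "B") would raise: "B" is not data indexable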

4340 

4341class WORMTable(Table): 

4342 """ 

4343 a write-once read-many table: this format DOES NOT ALLOW appending to a 

4344 table. Writing is a one-time operation; the data are stored in a format 

4345 that allows for searching the data on disk 

4346 """ 

4347 

4348 table_type = "worm" 

4349 

4350 def read( 

4351 self, 

4352 where=None, 

4353 columns=None, 

4354 start: int | None = None, 

4355 stop: int | None = None, 

4356 ): 

4357 """ 

4358 read the indices and the indexing array, calculate offset rows and return 

4359 """ 

4360 raise NotImplementedError("WORMTable needs to implement read") 

4361 

4362 def write(self, obj, **kwargs) -> None: 

4363 """ 

4364 write in a format that we can search later on (but cannot append 

4365 to): write out the indices and the values using _write_array 

4366 (e.g. a CArray), and create an indexing table so that we can search 

4367 """ 

4368 raise NotImplementedError("WORMTable needs to implement write") 

4369 

4370 

4371class AppendableTable(Table): 

4372 """support the new appendable table formats""" 

4373 

4374 table_type = "appendable" 

4375 

4376 # error: Signature of "write" incompatible with supertype "Fixed" 

4377 def write( # type: ignore[override] 

4378 self, 

4379 obj, 

4380 axes=None, 

4381 append: bool = False, 

4382 complib=None, 

4383 complevel=None, 

4384 fletcher32=None, 

4385 min_itemsize=None, 

4386 chunksize: int | None = None, 

4387 expectedrows=None, 

4388 dropna: bool = False, 

4389 nan_rep=None, 

4390 data_columns=None, 

4391 track_times: bool = True, 

4392 ) -> None: 

4393 if not append and self.is_exists: 

4394 self._handle.remove_node(self.group, "table") 

4395 

4396 # create the axes 

4397 table = self._create_axes( 

4398 axes=axes, 

4399 obj=obj, 

4400 validate=append, 

4401 min_itemsize=min_itemsize, 

4402 nan_rep=nan_rep, 

4403 data_columns=data_columns, 

4404 ) 

4405 

4406 for a in table.axes: 

4407 a.validate_names() 

4408 

4409 if not table.is_exists: 

4410 # create the table 

4411 options = table.create_description( 

4412 complib=complib, 

4413 complevel=complevel, 

4414 fletcher32=fletcher32, 

4415 expectedrows=expectedrows, 

4416 ) 

4417 

4418 # set the table attributes 

4419 table.set_attrs() 

4420 

4421 options["track_times"] = track_times 

4422 

4423 # create the table 

4424 table._handle.create_table(table.group, **options) 

4425 

4426 # update my info 

4427 table.attrs.info = table.info 

4428 

4429 # validate the axes and set the kinds 

4430 for a in table.axes: 

4431 a.validate_and_set(table, append) 

4432 

4433 # add the rows 

4434 table.write_data(chunksize, dropna=dropna) 

4435 

4436 def write_data(self, chunksize: int | None, dropna: bool = False) -> None: 

4437 """ 

4438 we form the data into a 2-d structure including indexes, values, and mask, then write chunk-by-chunk 

4439 """ 

4440 names = self.dtype.names 

4441 nrows = self.nrows_expected 

4442 

4443 # if dropna==True, then drop ALL nan rows 

4444 masks = [] 

4445 if dropna: 

4446 for a in self.values_axes: 

4447 # figure the mask: only do if we can successfully process this 

4448 # column, otherwise ignore the mask 

4449 mask = isna(a.data).all(axis=0) 

4450 if isinstance(mask, np.ndarray): 

4451 masks.append(mask.astype("u1", copy=False)) 

4452 

4453 # consolidate masks 

4454 if len(masks): 

4455 mask = masks[0] 

4456 for m in masks[1:]: 

4457 mask = mask & m 

4458 mask = mask.ravel() 

4459 else: 

4460 mask = None 

4461 

4462 # broadcast the indexes if needed 

4463 indexes = [a.cvalues for a in self.index_axes] 

4464 nindexes = len(indexes) 

4465 assert nindexes == 1, nindexes # ensures we don't need to broadcast 

4466 

4467 # transpose the values so first dimension is last 

4468 # reshape the values if needed 

4469 values = [a.take_data() for a in self.values_axes] 

4470 values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) for v in values] 

4471 bvalues = [] 

4472 for i, v in enumerate(values): 

4473 new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape 

4474 bvalues.append(v.reshape(new_shape)) 

4475 

4476 # write the chunks 

4477 if chunksize is None: 

4478 chunksize = 100000 

4479 

4480 rows = np.empty(min(chunksize, nrows), dtype=self.dtype) 

4481 chunks = nrows // chunksize + 1 

4482 for i in range(chunks): 

4483 start_i = i * chunksize 

4484 end_i = min((i + 1) * chunksize, nrows) 

4485 if start_i >= end_i: 

4486 break 

4487 

4488 self.write_data_chunk( 

4489 rows, 

4490 indexes=[a[start_i:end_i] for a in indexes], 

4491 mask=mask[start_i:end_i] if mask is not None else None, 

4492 values=[v[start_i:end_i] for v in bvalues], 

4493 ) 

4494 
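A worked sketch of the chunk boundaries computed by the loop above, keeping the default chunksize of 100000; the row count is made up for this example.

# Illustrative chunk arithmetic; nrows is invented for this example.
nrows, chunksize = 250_000, 100_000
chunks = nrows // chunksize + 1                 # 3
bounds = []
for i in range(chunks):
    start_i = i * chunksize
    end_i = min((i + 1) * chunksize, nrows)
    if start_i >= end_i:                        # only hit when nrows is an exact multiple
        break
    bounds.append((start_i, end_i))
assert bounds == [(0, 100_000), (100_000, 200_000), (200_000, 250_000)]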

4495 def write_data_chunk( 

4496 self, 

4497 rows: np.ndarray, 

4498 indexes: list[np.ndarray], 

4499 mask: npt.NDArray[np.bool_] | None, 

4500 values: list[np.ndarray], 

4501 ) -> None: 

4502 """ 

4503 Parameters 

4504 ---------- 

4505 rows : an empty memory space where we are putting the chunk 

4506 indexes : an array of the indexes 

4507 mask : an array of the masks 

4508 values : an array of the values 

4509 """ 

4510 # 0 len 

4511 for v in values: 

4512 if not np.prod(v.shape): 

4513 return 

4514 

4515 nrows = indexes[0].shape[0] 

4516 if nrows != len(rows): 

4517 rows = np.empty(nrows, dtype=self.dtype) 

4518 names = self.dtype.names 

4519 nindexes = len(indexes) 

4520 

4521 # indexes 

4522 for i, idx in enumerate(indexes): 

4523 rows[names[i]] = idx 

4524 

4525 # values 

4526 for i, v in enumerate(values): 

4527 rows[names[i + nindexes]] = v 

4528 

4529 # mask 

4530 if mask is not None: 

4531 m = ~mask.ravel().astype(bool, copy=False) 

4532 if not m.all(): 

4533 rows = rows[m] 

4534 

4535 if len(rows): 

4536 self.table.append(rows) 

4537 self.table.flush() 

4538 

4539 def delete(self, where=None, start: int | None = None, stop: int | None = None): 

4540 # delete all rows (and return the nrows) 

4541 if where is None or not len(where): 

4542 if start is None and stop is None: 

4543 nrows = self.nrows 

4544 self._handle.remove_node(self.group, recursive=True) 

4545 else: 

4546 # pytables<3.0 would remove a single row with stop=None 

4547 if stop is None: 

4548 stop = self.nrows 

4549 nrows = self.table.remove_rows(start=start, stop=stop) 

4550 self.table.flush() 

4551 return nrows 

4552 

4553 # infer the data kind 

4554 if not self.infer_axes(): 

4555 return None 

4556 

4557 # create the selection 

4558 table = self.table 

4559 selection = Selection(self, where, start=start, stop=stop) 

4560 values = selection.select_coords() 

4561 

4562 # delete the rows in reverse order 

4563 sorted_series = Series(values, copy=False).sort_values() 

4564 ln = len(sorted_series) 

4565 

4566 if ln: 

4567 # construct groups of consecutive rows 

4568 diff = sorted_series.diff() 

4569 groups = list(diff[diff > 1].index) 

4570 

4571 # 1 group 

4572 if not len(groups): 

4573 groups = [0] 

4574 

4575 # final element 

4576 if groups[-1] != ln: 

4577 groups.append(ln) 

4578 

4579 # initial element 

4580 if groups[0] != 0: 

4581 groups.insert(0, 0) 

4582 

4583 # we must remove in reverse order! 

4584 pg = groups.pop() 

4585 for g in reversed(groups): 

4586 rows = sorted_series.take(range(g, pg)) 

4587 table.remove_rows( 

4588 start=rows[rows.index[0]], stop=rows[rows.index[-1]] + 1 

4589 ) 

4590 pg = g 

4591 

4592 self.table.flush() 

4593 

4594 # return the number of rows removed 

4595 return ln 

4596 
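A small sketch of the run-grouping logic above on made-up row coordinates, showing how the diff() boundaries become groups that are removed from the end of the table backwards:

import pandas as pd

coords = pd.Series([2, 3, 4, 10, 11, 40])        # made-up sorted row numbers
diff = coords.diff()
groups = list(diff[diff > 1].index)               # [3, 5]: where a new run starts
# after appending len(coords) and prepending 0 -> boundaries [0, 3, 5, 6],
# i.e. runs [2, 3, 4], [10, 11] and [40], removed in reverse order so that
# earlier row numbers stay valid while later rows are deleted.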

4597 

4598class AppendableFrameTable(AppendableTable): 

4599 """support the new appendable table formats""" 

4600 

4601 pandas_kind = "frame_table" 

4602 table_type = "appendable_frame" 

4603 ndim = 2 

4604 obj_type: type[DataFrame | Series] = DataFrame 

4605 

4606 @property 

4607 def is_transposed(self) -> bool: 

4608 return self.index_axes[0].axis == 1 

4609 

4610 @classmethod 

4611 def get_object(cls, obj, transposed: bool): 

4612 """these are written transposed""" 

4613 if transposed: 

4614 obj = obj.T 

4615 return obj 

4616 

4617 def read( 

4618 self, 

4619 where=None, 

4620 columns=None, 

4621 start: int | None = None, 

4622 stop: int | None = None, 

4623 ): 

4624 # validate the version 

4625 self.validate_version(where) 

4626 

4627 # infer the data kind 

4628 if not self.infer_axes(): 

4629 return None 

4630 

4631 result = self._read_axes(where=where, start=start, stop=stop) 

4632 

4633 info = ( 

4634 self.info.get(self.non_index_axes[0][0], {}) 

4635 if len(self.non_index_axes) 

4636 else {} 

4637 ) 

4638 

4639 inds = [i for i, ax in enumerate(self.axes) if ax is self.index_axes[0]] 

4640 assert len(inds) == 1 

4641 ind = inds[0] 

4642 

4643 index = result[ind][0] 

4644 

4645 frames = [] 

4646 for i, a in enumerate(self.axes): 

4647 if a not in self.values_axes: 

4648 continue 

4649 index_vals, cvalues = result[i] 

4650 

4651 # we could have a multi-index constructor here 

4652 # ensure_index doesn't recognize our list-of-tuples here 

4653 if info.get("type") != "MultiIndex": 

4654 cols = Index(index_vals) 

4655 else: 

4656 cols = MultiIndex.from_tuples(index_vals) 

4657 

4658 names = info.get("names") 

4659 if names is not None: 

4660 cols.set_names(names, inplace=True) 

4661 

4662 if self.is_transposed: 

4663 values = cvalues 

4664 index_ = cols 

4665 cols_ = Index(index, name=getattr(index, "name", None)) 

4666 else: 

4667 values = cvalues.T 

4668 index_ = Index(index, name=getattr(index, "name", None)) 

4669 cols_ = cols 

4670 

4671 # if we have a DataIndexableCol, its shape will only be 1 dim 

4672 if values.ndim == 1 and isinstance(values, np.ndarray): 

4673 values = values.reshape((1, values.shape[0])) 

4674 

4675 if isinstance(values, np.ndarray): 

4676 df = DataFrame(values.T, columns=cols_, index=index_, copy=False) 

4677 elif isinstance(values, Index): 

4678 df = DataFrame(values, columns=cols_, index=index_) 

4679 else: 

4680 # Categorical 

4681 df = DataFrame._from_arrays([values], columns=cols_, index=index_) 

4682 if not (using_pyarrow_string_dtype() and values.dtype.kind == "O"): 

4683 assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) 

4684 if using_pyarrow_string_dtype() and is_string_array( 

4685 values, # type: ignore[arg-type] 

4686 skipna=True, 

4687 ): 

4688 df = df.astype("string[pyarrow_numpy]") 

4689 frames.append(df) 

4690 

4691 if len(frames) == 1: 

4692 df = frames[0] 

4693 else: 

4694 df = concat(frames, axis=1) 

4695 

4696 selection = Selection(self, where=where, start=start, stop=stop) 

4697 # apply the selection filters & axis orderings 

4698 df = self.process_axes(df, selection=selection, columns=columns) 

4699 return df 

4700 

4701 

4702class AppendableSeriesTable(AppendableFrameTable): 

4703 """support the new appendable table formats""" 

4704 

4705 pandas_kind = "series_table" 

4706 table_type = "appendable_series" 

4707 ndim = 2 

4708 obj_type = Series 

4709 

4710 @property 

4711 def is_transposed(self) -> bool: 

4712 return False 

4713 

4714 @classmethod 

4715 def get_object(cls, obj, transposed: bool): 

4716 return obj 

4717 

4718 # error: Signature of "write" incompatible with supertype "Fixed" 

4719 def write(self, obj, data_columns=None, **kwargs) -> None: # type: ignore[override] 

4720 """we are going to write this as a frame table""" 

4721 if not isinstance(obj, DataFrame): 

4722 name = obj.name or "values" 

4723 obj = obj.to_frame(name) 

4724 super().write(obj=obj, data_columns=obj.columns.tolist(), **kwargs) 

4725 

4726 def read( 

4727 self, 

4728 where=None, 

4729 columns=None, 

4730 start: int | None = None, 

4731 stop: int | None = None, 

4732 ) -> Series: 

4733 is_multi_index = self.is_multi_index 

4734 if columns is not None and is_multi_index: 

4735 assert isinstance(self.levels, list) # needed for mypy 

4736 for n in self.levels: 

4737 if n not in columns: 

4738 columns.insert(0, n) 

4739 s = super().read(where=where, columns=columns, start=start, stop=stop) 

4740 if is_multi_index: 

4741 s.set_index(self.levels, inplace=True) 

4742 

4743 s = s.iloc[:, 0] 

4744 

4745 # remove the default name 

4746 if s.name == "values": 

4747 s.name = None 

4748 return s 

4749 

4750 

4751class AppendableMultiSeriesTable(AppendableSeriesTable): 

4752 """support the new appendable table formats""" 

4753 

4754 pandas_kind = "series_table" 

4755 table_type = "appendable_multiseries" 

4756 

4757 # error: Signature of "write" incompatible with supertype "Fixed" 

4758 def write(self, obj, **kwargs) -> None: # type: ignore[override] 

4759 """we are going to write this as a frame table""" 

4760 name = obj.name or "values" 

4761 newobj, self.levels = self.validate_multiindex(obj) 

4762 assert isinstance(self.levels, list) # for mypy 

4763 cols = list(self.levels) 

4764 cols.append(name) 

4765 newobj.columns = Index(cols) 

4766 super().write(obj=newobj, **kwargs) 

4767 

4768 

4769class GenericTable(AppendableFrameTable): 

4770 """a table that read/writes the generic pytables table format""" 

4771 

4772 pandas_kind = "frame_table" 

4773 table_type = "generic_table" 

4774 ndim = 2 

4775 obj_type = DataFrame 

4776 levels: list[Hashable] 

4777 

4778 @property 

4779 def pandas_type(self) -> str: 

4780 return self.pandas_kind 

4781 

4782 @property 

4783 def storable(self): 

4784 return getattr(self.group, "table", None) or self.group 

4785 

4786 def get_attrs(self) -> None: 

4787 """retrieve our attributes""" 

4788 self.non_index_axes = [] 

4789 self.nan_rep = None 

4790 self.levels = [] 

4791 

4792 self.index_axes = [a for a in self.indexables if a.is_an_indexable] 

4793 self.values_axes = [a for a in self.indexables if not a.is_an_indexable] 

4794 self.data_columns = [a.name for a in self.values_axes] 

4795 

4796 @cache_readonly 

4797 def indexables(self): 

4798 """create the indexables from the table description""" 

4799 d = self.description 

4800 

4801 # TODO: can we get a typ for this? AFAICT it is the only place 

4802 # where we aren't passing one 

4803 # the index columns is just a simple index 

4804 md = self.read_metadata("index") 

4805 meta = "category" if md is not None else None 

4806 index_col = GenericIndexCol( 

4807 name="index", axis=0, table=self.table, meta=meta, metadata=md 

4808 ) 

4809 

4810 _indexables: list[GenericIndexCol | GenericDataIndexableCol] = [index_col] 

4811 

4812 for i, n in enumerate(d._v_names): 

4813 assert isinstance(n, str) 

4814 

4815 atom = getattr(d, n) 

4816 md = self.read_metadata(n) 

4817 meta = "category" if md is not None else None 

4818 dc = GenericDataIndexableCol( 

4819 name=n, 

4820 pos=i, 

4821 values=[n], 

4822 typ=atom, 

4823 table=self.table, 

4824 meta=meta, 

4825 metadata=md, 

4826 ) 

4827 _indexables.append(dc) 

4828 

4829 return _indexables 

4830 

4831 # error: Signature of "write" incompatible with supertype "AppendableTable" 

4832 def write(self, **kwargs) -> None: # type: ignore[override] 

4833 raise NotImplementedError("cannot write on a generic table") 

4834 

4835 

4836class AppendableMultiFrameTable(AppendableFrameTable): 

4837 """a frame with a multi-index""" 

4838 

4839 table_type = "appendable_multiframe" 

4840 obj_type = DataFrame 

4841 ndim = 2 

4842 _re_levels = re.compile(r"^level_\d+$") 

4843 

4844 @property 

4845 def table_type_short(self) -> str: 

4846 return "appendable_multi" 

4847 

4848 # error: Signature of "write" incompatible with supertype "Fixed" 

4849 def write(self, obj, data_columns=None, **kwargs) -> None: # type: ignore[override] 

4850 if data_columns is None: 

4851 data_columns = [] 

4852 elif data_columns is True: 

4853 data_columns = obj.columns.tolist() 

4854 obj, self.levels = self.validate_multiindex(obj) 

4855 assert isinstance(self.levels, list) # for mypy 

4856 for n in self.levels: 

4857 if n not in data_columns: 

4858 data_columns.insert(0, n) 

4859 super().write(obj=obj, data_columns=data_columns, **kwargs) 

4860 

4861 def read( 

4862 self, 

4863 where=None, 

4864 columns=None, 

4865 start: int | None = None, 

4866 stop: int | None = None, 

4867 ): 

4868 df = super().read(where=where, columns=columns, start=start, stop=stop) 

4869 df = df.set_index(self.levels) 

4870 

4871 # remove names for 'level_%d' 

4872 df.index = df.index.set_names( 

4873 [None if self._re_levels.search(name) else name for name in df.index.names] 

4874 ) 

4875 

4876 return df 

4877 

4878 

4879def _reindex_axis( 

4880 obj: DataFrame, axis: AxisInt, labels: Index, other=None 

4881) -> DataFrame: 

4882 ax = obj._get_axis(axis) 

4883 labels = ensure_index(labels) 

4884 

4885 # try not to reindex even if other is provided 

4886 # if it equals our current index 

4887 if other is not None: 

4888 other = ensure_index(other) 

4889 if (other is None or labels.equals(other)) and labels.equals(ax): 

4890 return obj 

4891 

4892 labels = ensure_index(labels.unique()) 

4893 if other is not None: 

4894 labels = ensure_index(other.unique()).intersection(labels, sort=False) 

4895 if not labels.equals(ax): 

4896 slicer: list[slice | Index] = [slice(None, None)] * obj.ndim 

4897 slicer[axis] = labels 

4898 obj = obj.loc[tuple(slicer)] 

4899 return obj 

4900 

4901 

4902# tz to/from coercion 

4903 

4904 

4905def _get_tz(tz: tzinfo) -> str | tzinfo: 

4906 """for a tz-aware type, return an encoded zone""" 

4907 zone = timezones.get_timezone(tz) 

4908 return zone 

4909 

4910 

4911@overload 

4912def _set_tz( 

4913 values: np.ndarray | Index, tz: str | tzinfo, coerce: bool = False 

4914) -> DatetimeIndex: 

4915 ... 

4916 

4917 

4918@overload 

4919def _set_tz(values: np.ndarray | Index, tz: None, coerce: bool = False) -> np.ndarray: 

4920 ... 

4921 

4922 

4923def _set_tz( 

4924 values: np.ndarray | Index, tz: str | tzinfo | None, coerce: bool = False 

4925) -> np.ndarray | DatetimeIndex: 

4926 """ 

4927 coerce the values to a DatetimeIndex if tz is set 

4928 preserve the input shape if possible 

4929 

4930 Parameters 

4931 ---------- 

4932 values : ndarray or Index 

4933 tz : str or tzinfo 

4934 coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray 

4935 """ 

4936 if isinstance(values, DatetimeIndex): 

4937 # If values is tzaware, the tz gets dropped in the values.ravel() 

4938 # call below (which returns an ndarray). So we are only non-lossy 

4939 # if `tz` matches `values.tz`. 

4940 assert values.tz is None or values.tz == tz 

4941 if values.tz is not None: 

4942 return values 

4943 

4944 if tz is not None: 

4945 if isinstance(values, DatetimeIndex): 

4946 name = values.name 

4947 else: 

4948 name = None 

4949 values = values.ravel() 

4950 

4951 tz = _ensure_decoded(tz) 

4952 values = DatetimeIndex(values, name=name) 

4953 values = values.tz_localize("UTC").tz_convert(tz) 

4954 elif coerce: 

4955 values = np.asarray(values, dtype="M8[ns]") 

4956 

4957 # error: Incompatible return value type (got "Union[ndarray, Index]", 

4958 # expected "Union[ndarray, DatetimeIndex]") 

4959 return values # type: ignore[return-value] 

4960 
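A hedged sketch of the localization performed above, importing the private helper from pandas.io.pytables (private API, subject to change); the timestamps and zone are made up and timezone data is assumed to be available:

import numpy as np
from pandas.io.pytables import _set_tz  # private helper, shown for illustration

stamps = np.array(["2021-01-01T00:00:00", "2021-01-02T00:00:00"], dtype="M8[ns]")
localized = _set_tz(stamps, tz="US/Eastern")
# naive stored values are interpreted as UTC, then converted to the target zone
assert str(localized.tz) == "US/Eastern"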

4961 

4962def _convert_index(name: str, index: Index, encoding: str, errors: str) -> IndexCol: 

4963 assert isinstance(name, str) 

4964 

4965 index_name = index.name 

4966 # error: Argument 1 to "_get_data_and_dtype_name" has incompatible type "Index"; 

4967 # expected "Union[ExtensionArray, ndarray]" 

4968 converted, dtype_name = _get_data_and_dtype_name(index) # type: ignore[arg-type] 

4969 kind = _dtype_to_kind(dtype_name) 

4970 atom = DataIndexableCol._get_atom(converted) 

4971 

4972 if ( 

4973 lib.is_np_dtype(index.dtype, "iu") 

4974 or needs_i8_conversion(index.dtype) 

4975 or is_bool_dtype(index.dtype) 

4976 ): 

4977 # Includes Index, RangeIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex, 

4978 # in which case "kind" is "integer", "integer", "datetime64", 

4979 # "timedelta64", and "integer", respectively. 

4980 return IndexCol( 

4981 name, 

4982 values=converted, 

4983 kind=kind, 

4984 typ=atom, 

4985 freq=getattr(index, "freq", None), 

4986 tz=getattr(index, "tz", None), 

4987 index_name=index_name, 

4988 ) 

4989 

4990 if isinstance(index, MultiIndex): 

4991 raise TypeError("MultiIndex not supported here!") 

4992 

4993 inferred_type = lib.infer_dtype(index, skipna=False) 

4994 # we won't get inferred_type of "datetime64" or "timedelta64" as these 

4995 # would go through the DatetimeIndex/TimedeltaIndex paths above 

4996 

4997 values = np.asarray(index) 

4998 

4999 if inferred_type == "date": 

5000 converted = np.asarray([v.toordinal() for v in values], dtype=np.int32) 

5001 return IndexCol( 

5002 name, converted, "date", _tables().Time32Col(), index_name=index_name 

5003 ) 

5004 elif inferred_type == "string": 

5005 converted = _convert_string_array(values, encoding, errors) 

5006 itemsize = converted.dtype.itemsize 

5007 return IndexCol( 

5008 name, 

5009 converted, 

5010 "string", 

5011 _tables().StringCol(itemsize), 

5012 index_name=index_name, 

5013 ) 

5014 

5015 elif inferred_type in ["integer", "floating"]: 

5016 return IndexCol( 

5017 name, values=converted, kind=kind, typ=atom, index_name=index_name 

5018 ) 

5019 else: 

5020 assert isinstance(converted, np.ndarray) and converted.dtype == object 

5021 assert kind == "object", kind 

5022 atom = _tables().ObjectAtom() 

5023 return IndexCol(name, converted, kind, atom, index_name=index_name) 

5024 

5025 

5026def _unconvert_index(data, kind: str, encoding: str, errors: str) -> np.ndarray | Index: 

5027 index: Index | np.ndarray 

5028 

5029 if kind.startswith("datetime64"): 

5030 if kind == "datetime64": 

5031 # created before we stored resolution information 

5032 index = DatetimeIndex(data) 

5033 else: 

5034 index = DatetimeIndex(data.view(kind)) 

5035 elif kind == "timedelta64": 

5036 index = TimedeltaIndex(data) 

5037 elif kind == "date": 

5038 try: 

5039 index = np.asarray([date.fromordinal(v) for v in data], dtype=object) 

5040 except ValueError: 

5041 index = np.asarray([date.fromtimestamp(v) for v in data], dtype=object) 

5042 elif kind in ("integer", "float", "bool"): 

5043 index = np.asarray(data) 

5044 elif kind == "string": 

5045 index = _unconvert_string_array( 

5046 data, nan_rep=None, encoding=encoding, errors=errors 

5047 ) 

5048 elif kind == "object": 

5049 index = np.asarray(data[0]) 

5050 else: # pragma: no cover 

5051 raise ValueError(f"unrecognized index type {kind}") 

5052 return index 

5053 

5054 

5055def _maybe_convert_for_string_atom( 

5056 name: str, 

5057 bvalues: ArrayLike, 

5058 existing_col, 

5059 min_itemsize, 

5060 nan_rep, 

5061 encoding, 

5062 errors, 

5063 columns: list[str], 

5064): 

5065 if bvalues.dtype != object: 

5066 return bvalues 

5067 

5068 bvalues = cast(np.ndarray, bvalues) 

5069 

5070 dtype_name = bvalues.dtype.name 

5071 inferred_type = lib.infer_dtype(bvalues, skipna=False) 

5072 

5073 if inferred_type == "date": 

5074 raise TypeError("[date] is not implemented as a table column") 

5075 if inferred_type == "datetime": 

5076 # after GH#8260 

5077 # this only would be hit for a multi-timezone dtype which is an error 

5078 raise TypeError( 

5079 "too many timezones in this block, create separate data columns" 

5080 ) 

5081 

5082 if not (inferred_type == "string" or dtype_name == "object"): 

5083 return bvalues 

5084 

5085 mask = isna(bvalues) 

5086 data = bvalues.copy() 

5087 data[mask] = nan_rep 

5088 

5089 # see if we have a valid string type 

5090 inferred_type = lib.infer_dtype(data, skipna=False) 

5091 if inferred_type != "string": 

5092 # we cannot serialize this data, so report an exception on a column 

5093 # by column basis 

5094 

5095 # expected behaviour: 

5096 # search block for a non-string object column by column 

5097 for i in range(data.shape[0]): 

5098 col = data[i] 

5099 inferred_type = lib.infer_dtype(col, skipna=False) 

5100 if inferred_type != "string": 

5101 error_column_label = columns[i] if len(columns) > i else f"No.{i}" 

5102 raise TypeError( 

5103 f"Cannot serialize the column [{error_column_label}]\n" 

5104 f"because its data contents are not [string] but " 

5105 f"[{inferred_type}] object dtype" 

5106 ) 

5107 

5108 # itemsize is the maximum length of a string (along any dimension) 

5109 

5110 data_converted = _convert_string_array(data, encoding, errors).reshape(data.shape) 

5111 itemsize = data_converted.itemsize 

5112 

5113 # specified min_itemsize? 

5114 if isinstance(min_itemsize, dict): 

5115 min_itemsize = int(min_itemsize.get(name) or min_itemsize.get("values") or 0) 

5116 itemsize = max(min_itemsize or 0, itemsize) 

5117 

5118 # check for column in the values conflicts 

5119 if existing_col is not None: 

5120 eci = existing_col.validate_col(itemsize) 

5121 if eci is not None and eci > itemsize: 

5122 itemsize = eci 

5123 

5124 data_converted = data_converted.astype(f"|S{itemsize}", copy=False) 

5125 return data_converted 

5126 

5127 

5128def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.ndarray: 

5129 """ 

5130 Take a string-like that is object dtype and coerce to a fixed size string type. 

5131 

5132 Parameters 

5133 ---------- 

5134 data : np.ndarray[object] 

5135 encoding : str 

5136 errors : str 

5137 Handler for encoding errors. 

5138 

5139 Returns 

5140 ------- 

5141 np.ndarray[fixed-length-string] 

5142 """ 

5143 # encode if needed 

5144 if len(data): 

5145 data = ( 

5146 Series(data.ravel(), copy=False) 

5147 .str.encode(encoding, errors) 

5148 ._values.reshape(data.shape) 

5149 ) 

5150 

5151 # create the sized dtype 

5152 ensured = ensure_object(data.ravel()) 

5153 itemsize = max(1, libwriters.max_len_string_array(ensured)) 

5154 

5155 data = np.asarray(data, dtype=f"S{itemsize}") 

5156 return data 

5157 

5158 

5159def _unconvert_string_array( 

5160 data: np.ndarray, nan_rep, encoding: str, errors: str 

5161) -> np.ndarray: 

5162 """ 

5163 Inverse of _convert_string_array. 

5164 

5165 Parameters 

5166 ---------- 

5167 data : np.ndarray[fixed-length-string] 

5168 nan_rep : the storage repr of NaN 

5169 encoding : str 

5170 errors : str 

5171 Handler for encoding errors. 

5172 

5173 Returns 

5174 ------- 

5175 np.ndarray[object] 

5176 Decoded data. 

5177 """ 

5178 shape = data.shape 

5179 data = np.asarray(data.ravel(), dtype=object) 

5180 

5181 if len(data): 

5182 itemsize = libwriters.max_len_string_array(ensure_object(data)) 

5183 dtype = f"U{itemsize}" 

5184 

5185 if isinstance(data[0], bytes): 

5186 data = Series(data, copy=False).str.decode(encoding, errors=errors)._values 

5187 else: 

5188 data = data.astype(dtype, copy=False).astype(object, copy=False) 

5189 

5190 if nan_rep is None: 

5191 nan_rep = "nan" 

5192 

5193 libwriters.string_array_replace_from_nan_rep(data, nan_rep) 

5194 return data.reshape(shape) 

5195 
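A round-trip sketch for the two string helpers above, importing them from pandas.io.pytables (private API) and assuming UTF-8; the input array is made up for illustration:

import numpy as np
from pandas.io.pytables import _convert_string_array, _unconvert_string_array

arr = np.array(["foo", "a-much-longer-string"], dtype=object)
packed = _convert_string_array(arr, encoding="UTF-8", errors="strict")
assert packed.dtype == np.dtype("S20")          # sized to the longest encoded string
restored = _unconvert_string_array(packed, nan_rep=None, encoding="UTF-8", errors="strict")
assert list(restored) == ["foo", "a-much-longer-string"]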

5196 

5197def _maybe_convert(values: np.ndarray, val_kind: str, encoding: str, errors: str): 

5198 assert isinstance(val_kind, str), type(val_kind) 

5199 if _need_convert(val_kind): 

5200 conv = _get_converter(val_kind, encoding, errors) 

5201 values = conv(values) 

5202 return values 

5203 

5204 

5205def _get_converter(kind: str, encoding: str, errors: str): 

5206 if kind == "datetime64": 

5207 return lambda x: np.asarray(x, dtype="M8[ns]") 

5208 elif "datetime64" in kind: 

5209 return lambda x: np.asarray(x, dtype=kind) 

5210 elif kind == "string": 

5211 return lambda x: _unconvert_string_array( 

5212 x, nan_rep=None, encoding=encoding, errors=errors 

5213 ) 

5214 else: # pragma: no cover 

5215 raise ValueError(f"invalid kind {kind}") 

5216 

5217 

5218def _need_convert(kind: str) -> bool: 

5219 if kind in ("datetime64", "string") or "datetime64" in kind: 

5220 return True 

5221 return False 

5222 

5223 

5224def _maybe_adjust_name(name: str, version: Sequence[int]) -> str: 

5225 """ 

5226 Prior to 0.10.1, values blocks were named like values_0 rather than 

5227 values_block_0; adjust the given name if necessary. 

5228 

5229 Parameters 

5230 ---------- 

5231 name : str 

5232 version : Tuple[int, int, int] 

5233 

5234 Returns 

5235 ------- 

5236 str 

5237 """ 

5238 if isinstance(version, str) or len(version) < 3: 

5239 raise ValueError("Version is incorrect, expected sequence of 3 integers.") 

5240 

5241 if version[0] == 0 and version[1] <= 10 and version[2] == 0: 

5242 m = re.search(r"values_block_(\d+)", name) 

5243 if m: 

5244 grp = m.groups()[0] 

5245 name = f"values_{grp}" 

5246 return name 

5247 
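A short sketch of the renaming rule above (private helper imported only for illustration): a file whose stored version matches the pre-0.10.1 check gets the short values_N name, while newer files keep values_block_N.

from pandas.io.pytables import _maybe_adjust_name  # private helper

assert _maybe_adjust_name("values_block_2", version=(0, 10, 0)) == "values_2"
assert _maybe_adjust_name("values_block_2", version=(0, 15, 2)) == "values_block_2"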

5248 

5249def _dtype_to_kind(dtype_str: str) -> str: 

5250 """ 

5251 Find the "kind" string describing the given dtype name. 

5252 """ 

5253 dtype_str = _ensure_decoded(dtype_str) 

5254 

5255 if dtype_str.startswith(("string", "bytes")): 

5256 kind = "string" 

5257 elif dtype_str.startswith("float"): 

5258 kind = "float" 

5259 elif dtype_str.startswith("complex"): 

5260 kind = "complex" 

5261 elif dtype_str.startswith(("int", "uint")): 

5262 kind = "integer" 

5263 elif dtype_str.startswith("datetime64"): 

5264 kind = dtype_str 

5265 elif dtype_str.startswith("timedelta"): 

5266 kind = "timedelta64" 

5267 elif dtype_str.startswith("bool"): 

5268 kind = "bool" 

5269 elif dtype_str.startswith("category"): 

5270 kind = "category" 

5271 elif dtype_str.startswith("period"): 

5272 # We store the `freq` attr so we can restore from integers 

5273 kind = "integer" 

5274 elif dtype_str == "object": 

5275 kind = "object" 

5276 else: 

5277 raise ValueError(f"cannot interpret dtype of [{dtype_str}]") 

5278 

5279 return kind 

5280 
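Illustrative expectations for the dtype-name to kind mapping above; these literals simply restate the branches of _dtype_to_kind, imported here as a private helper for illustration.

from pandas.io.pytables import _dtype_to_kind  # private helper

assert _dtype_to_kind("float64") == "float"
assert _dtype_to_kind("uint8") == "integer"
assert _dtype_to_kind("datetime64[ns]") == "datetime64[ns]"
assert _dtype_to_kind("timedelta64[ns]") == "timedelta64"
assert _dtype_to_kind("category") == "category"
assert _dtype_to_kind("period[M]") == "integer"   # freq is stored separately
assert _dtype_to_kind("object") == "object"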

5281 

5282def _get_data_and_dtype_name(data: ArrayLike): 

5283 """ 

5284 Convert the passed data into a storable form and a dtype string. 

5285 """ 

5286 if isinstance(data, Categorical): 

5287 data = data.codes 

5288 

5289 if isinstance(data.dtype, DatetimeTZDtype): 

5290 # For datetime64tz we need to drop the TZ in tests TODO: why? 

5291 dtype_name = f"datetime64[{data.dtype.unit}]" 

5292 else: 

5293 dtype_name = data.dtype.name 

5294 

5295 if data.dtype.kind in "mM": 

5296 data = np.asarray(data.view("i8")) 

5297 # TODO: we used to reshape for the dt64tz case, but no longer 

5298 # doing that doesn't seem to break anything. why? 

5299 

5300 elif isinstance(data, PeriodIndex): 

5301 data = data.asi8 

5302 

5303 data = np.asarray(data) 

5304 return data, dtype_name 

5305 

5306 

5307class Selection: 

5308 """ 

5309 Carries out a selection operation on a tables.Table object. 

5310 

5311 Parameters 

5312 ---------- 

5313 table : a Table object 

5314 where : list of Terms (or convertible to) 

5315 start, stop: indices to start and/or stop selection 

5316 

5317 """ 

5318 

5319 def __init__( 

5320 self, 

5321 table: Table, 

5322 where=None, 

5323 start: int | None = None, 

5324 stop: int | None = None, 

5325 ) -> None: 

5326 self.table = table 

5327 self.where = where 

5328 self.start = start 

5329 self.stop = stop 

5330 self.condition = None 

5331 self.filter = None 

5332 self.terms = None 

5333 self.coordinates = None 

5334 

5335 if is_list_like(where): 

5336 # see if we have a passed coordinate like 

5337 with suppress(ValueError): 

5338 inferred = lib.infer_dtype(where, skipna=False) 

5339 if inferred in ("integer", "boolean"): 

5340 where = np.asarray(where) 

5341 if where.dtype == np.bool_: 

5342 start, stop = self.start, self.stop 

5343 if start is None: 

5344 start = 0 

5345 if stop is None: 

5346 stop = self.table.nrows 

5347 self.coordinates = np.arange(start, stop)[where] 

5348 elif issubclass(where.dtype.type, np.integer): 

5349 if (self.start is not None and (where < self.start).any()) or ( 

5350 self.stop is not None and (where >= self.stop).any() 

5351 ): 

5352 raise ValueError( 

5353 "where must have index locations >= start and < stop" 

5354 ) 

5355 self.coordinates = where 

5356 

5357 if self.coordinates is None: 

5358 self.terms = self.generate(where) 

5359 

5360 # create the numexpr & the filter 

5361 if self.terms is not None: 

5362 self.condition, self.filter = self.terms.evaluate() 

5363 

5364 def generate(self, where): 

5365 """where can be a : dict,list,tuple,string""" 

5366 if where is None: 

5367 return None 

5368 

5369 q = self.table.queryables() 

5370 try: 

5371 return PyTablesExpr(where, queryables=q, encoding=self.table.encoding) 

5372 except NameError as err: 

5373 # raise a nice message, suggesting that the user should use 

5374 # data_columns 

5375 qkeys = ",".join(q.keys()) 

5376 msg = dedent( 

5377 f"""\ 

5378 The passed where expression: {where} 

5379 contains an invalid variable reference 

5380 all of the variable references must be a reference to 

5381 an axis (e.g. 'index' or 'columns'), or a data_column 

5382 The currently defined references are: {qkeys} 

5383 """ 

5384 ) 

5385 raise ValueError(msg) from err 

5386 

5387 def select(self): 

5388 """ 

5389 generate the selection 

5390 """ 

5391 if self.condition is not None: 

5392 return self.table.table.read_where( 

5393 self.condition.format(), start=self.start, stop=self.stop 

5394 ) 

5395 elif self.coordinates is not None: 

5396 return self.table.table.read_coordinates(self.coordinates) 

5397 return self.table.table.read(start=self.start, stop=self.stop) 

5398 

5399 def select_coords(self): 

5400 """ 

5401 generate the selection 

5402 """ 

5403 start, stop = self.start, self.stop 

5404 nrows = self.table.nrows 

5405 if start is None: 

5406 start = 0 

5407 elif start < 0: 

5408 start += nrows 

5409 if stop is None: 

5410 stop = nrows 

5411 elif stop < 0: 

5412 stop += nrows 

5413 

5414 if self.condition is not None: 

5415 return self.table.table.get_where_list( 

5416 self.condition.format(), start=start, stop=stop, sort=True 

5417 ) 

5418 elif self.coordinates is not None: 

5419 return self.coordinates 

5420 

5421 return np.arange(start, stop)
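To close, a hedged sketch of how a where clause reaches Selection through the public HDFStore.select API; the file, key, and column names are made up, and PyTables is assumed to be installed.

import pandas as pd

df = pd.DataFrame({"A": range(10), "B": range(10, 20)})
with pd.HDFStore("example.h5", mode="w") as store:
    store.put("df", df, format="table", data_columns=["A"])
    # a string expression is parsed into a PyTablesExpr by Selection.generate
    subset = store.select("df", where="(A > 3) & (A < 8)")
    # a list of integers is treated as row coordinates rather than parsed terms
    head = store.select("df", where=[0, 1, 2])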