Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/io/pytables.py: 19%


1""" 

2High level interface to PyTables for reading and writing pandas data structures 

3to disk 

4""" 

5from __future__ import annotations 

6 

7from contextlib import suppress 

8import copy 

9from datetime import ( 

10 date, 

11 tzinfo, 

12) 

13import itertools 

14import os 

15import re 

16from textwrap import dedent 

17from types import TracebackType 

18from typing import ( 

19 TYPE_CHECKING, 

20 Any, 

21 Callable, 

22 Final, 

23 Hashable, 

24 Iterator, 

25 Literal, 

26 Sequence, 

27 cast, 

28 overload, 

29) 

30import warnings 

31 

32import numpy as np 

33 

34from pandas._config import ( 

35 config, 

36 get_option, 

37) 

38 

39from pandas._libs import ( 

40 lib, 

41 writers as libwriters, 

42) 

43from pandas._libs.tslibs import timezones 

44from pandas._typing import ( 

45 AnyArrayLike, 

46 ArrayLike, 

47 AxisInt, 

48 DtypeArg, 

49 FilePath, 

50 Shape, 

51 npt, 

52) 

53from pandas.compat._optional import import_optional_dependency 

54from pandas.compat.pickle_compat import patch_pickle 

55from pandas.errors import ( 

56 AttributeConflictWarning, 

57 ClosedFileError, 

58 IncompatibilityWarning, 

59 PerformanceWarning, 

60 PossibleDataLossError, 

61) 

62from pandas.util._decorators import cache_readonly 

63from pandas.util._exceptions import find_stack_level 

64 

65from pandas.core.dtypes.common import ( 

66 ensure_object, 

67 is_bool_dtype, 

68 is_categorical_dtype, 

69 is_complex_dtype, 

70 is_datetime64_dtype, 

71 is_datetime64tz_dtype, 

72 is_extension_array_dtype, 

73 is_integer_dtype, 

74 is_list_like, 

75 is_object_dtype, 

76 is_string_dtype, 

77 is_timedelta64_dtype, 

78 needs_i8_conversion, 

79) 

80from pandas.core.dtypes.missing import array_equivalent 

81 

82from pandas import ( 

83 DataFrame, 

84 DatetimeIndex, 

85 Index, 

86 MultiIndex, 

87 PeriodIndex, 

88 RangeIndex, 

89 Series, 

90 TimedeltaIndex, 

91 concat, 

92 isna, 

93) 

94from pandas.core.arrays import ( 

95 Categorical, 

96 DatetimeArray, 

97 PeriodArray, 

98) 

99import pandas.core.common as com 

100from pandas.core.computation.pytables import ( 

101 PyTablesExpr, 

102 maybe_expression, 

103) 

104from pandas.core.construction import extract_array 

105from pandas.core.indexes.api import ensure_index 

106from pandas.core.internals import ( 

107 ArrayManager, 

108 BlockManager, 

109) 

110 

111from pandas.io.common import stringify_path 

112from pandas.io.formats.printing import ( 

113 adjoin, 

114 pprint_thing, 

115) 

116 

117if TYPE_CHECKING: 

118 from tables import ( 

119 Col, 

120 File, 

121 Node, 

122 ) 

123 

124 from pandas.core.internals import Block 

125 

126 

127# versioning attribute 

128_version = "0.15.2" 

129 

130# encoding 

131_default_encoding = "UTF-8" 

132 

133 

134def _ensure_decoded(s): 

135 """if we have bytes, decode them to unicode""" 

136 if isinstance(s, np.bytes_): 

137 s = s.decode("UTF-8") 

138 return s 

139 

140 

141def _ensure_encoding(encoding: str | None) -> str: 

142 # set the encoding if we need 

143 if encoding is None: 

144 encoding = _default_encoding 

145 

146 return encoding 

147 

148 

149def _ensure_str(name): 

150 """ 

151 Ensure that an index / column name is a str (python 3); otherwise it 

152 may be np.string dtype. Non-string dtypes are passed through unchanged. 

153 

154 https://github.com/pandas-dev/pandas/issues/13492 

155 """ 

156 if isinstance(name, str): 

157 name = str(name) 

158 return name 

159 

160 

161Term = PyTablesExpr 

162 

163 

164def _ensure_term(where, scope_level: int): 

165 """ 

166 Ensure that the where is a Term or a list of Term. 

167 

168 This makes sure that we are capturing the scope of variables that are 

169 passed; the terms are created here with a frame_level=2 (we are 2 levels down) 

170 """ 

171 # only consider list/tuple here as an ndarray is automatically a coordinate 

172 # list 

173 level = scope_level + 1 

174 if isinstance(where, (list, tuple)): 

175 where = [ 

176 Term(term, scope_level=level + 1) if maybe_expression(term) else term 

177 for term in where 

178 if term is not None 

179 ] 

180 elif maybe_expression(where): 

181 where = Term(where, scope_level=level) 

182 return where if where is None or len(where) else None 

183 

184 

185incompatibility_doc: Final = """ 

186where criteria is being ignored as this version [%s] is too old (or 

187not-defined), read the file in and write it out to a new file to upgrade (with 

188the copy_to method) 

189""" 

190 

191attribute_conflict_doc: Final = """ 

192the [%s] attribute of the existing index is [%s] which conflicts with the new 

193[%s], resetting the attribute to None 

194""" 

195 

196performance_doc: Final = """ 

197your performance may suffer as PyTables will pickle object types that it cannot 

198map directly to c-types [inferred_type->%s,key->%s] [items->%s] 

199""" 

200 

201# formats 

202_FORMAT_MAP = {"f": "fixed", "fixed": "fixed", "t": "table", "table": "table"} 

203 

204# axes map 

205_AXES_MAP = {DataFrame: [0]} 

206 

207# register our configuration options 

208dropna_doc: Final = """ 

209: boolean 

210 drop ALL nan rows when appending to a table 

211""" 

212format_doc: Final = """ 

213: format 

214 default format for writing; if None, then 

215 put will default to 'fixed' and append will default to 'table' 

216""" 

217 

218with config.config_prefix("io.hdf"): 

219 config.register_option("dropna_table", False, dropna_doc, validator=config.is_bool) 

220 config.register_option( 

221 "default_format", 

222 None, 

223 format_doc, 

224 validator=config.is_one_of_factory(["fixed", "table", None]), 

225 ) 

226 

227# oh the troubles to reduce import time 

228_table_mod = None 

229_table_file_open_policy_is_strict = False 

230 

231 

232def _tables(): 

233 global _table_mod 

234 global _table_file_open_policy_is_strict 

235 if _table_mod is None: 

236 import tables 

237 

238 _table_mod = tables 

239 

240 # set the file open policy 

241 # return the file open policy; this changes as of pytables 3.1 

242 # depending on the HDF5 version 

243 with suppress(AttributeError): 

244 _table_file_open_policy_is_strict = ( 

245 tables.file._FILE_OPEN_POLICY == "strict" 

246 ) 

247 

248 return _table_mod 

249 

250 

251# interface to/from ### 

252 

253 

254def to_hdf( 

255 path_or_buf: FilePath | HDFStore, 

256 key: str, 

257 value: DataFrame | Series, 

258 mode: str = "a", 

259 complevel: int | None = None, 

260 complib: str | None = None, 

261 append: bool = False, 

262 format: str | None = None, 

263 index: bool = True, 

264 min_itemsize: int | dict[str, int] | None = None, 

265 nan_rep=None, 

266 dropna: bool | None = None, 

267 data_columns: Literal[True] | list[str] | None = None, 

268 errors: str = "strict", 

269 encoding: str = "UTF-8", 

270) -> None: 

271 """store this object, close it if we opened it""" 

272 if append: 

273 f = lambda store: store.append( 

274 key, 

275 value, 

276 format=format, 

277 index=index, 

278 min_itemsize=min_itemsize, 

279 nan_rep=nan_rep, 

280 dropna=dropna, 

281 data_columns=data_columns, 

282 errors=errors, 

283 encoding=encoding, 

284 ) 

285 else: 

286 # NB: dropna is not passed to `put` 

287 f = lambda store: store.put( 

288 key, 

289 value, 

290 format=format, 

291 index=index, 

292 min_itemsize=min_itemsize, 

293 nan_rep=nan_rep, 

294 data_columns=data_columns, 

295 errors=errors, 

296 encoding=encoding, 

297 dropna=dropna, 

298 ) 

299 

300 path_or_buf = stringify_path(path_or_buf) 

301 if isinstance(path_or_buf, str): 

302 with HDFStore( 

303 path_or_buf, mode=mode, complevel=complevel, complib=complib 

304 ) as store: 

305 f(store) 

306 else: 

307 f(path_or_buf) 

308 
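# --- illustrative usage (added for exposition; not part of the pandas source) ---
# `to_hdf` is the module-level helper behind `DataFrame.to_hdf` / `Series.to_hdf`:
# it dispatches to `HDFStore.append` or `HDFStore.put` and closes the file if it
# opened one.  A minimal sketch; the file name "example.h5" is an assumption and
# `import pandas as pd` is assumed.
#
# >>> df = pd.DataFrame({"a": [1, 2, 3]})
# >>> df.to_hdf("example.h5", key="df", mode="w", format="table")  # doctest: +SKIP
# >>> df.to_hdf("example.h5", key="df", append=True)               # doctest: +SKIP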

309 

310def read_hdf( 

311 path_or_buf: FilePath | HDFStore, 

312 key=None, 

313 mode: str = "r", 

314 errors: str = "strict", 

315 where: str | list | None = None, 

316 start: int | None = None, 

317 stop: int | None = None, 

318 columns: list[str] | None = None, 

319 iterator: bool = False, 

320 chunksize: int | None = None, 

321 **kwargs, 

322): 

323 """ 

324 Read from the store, close it if we opened it. 

325 

326 Retrieve pandas object stored in file, optionally based on where 

327 criteria. 

328 

329 .. warning:: 

330 

331 Pandas uses PyTables for reading and writing HDF5 files, which allows 

332 serializing object-dtype data with pickle when using the "fixed" format. 

333 Loading pickled data received from untrusted sources can be unsafe. 

334 

335 See: https://docs.python.org/3/library/pickle.html for more. 

336 

337 Parameters 

338 ---------- 

339 path_or_buf : str, path object, pandas.HDFStore 

340 Any valid string path is acceptable. Only supports the local file system, 

341 remote URLs and file-like objects are not supported. 

342 

343 If you want to pass in a path object, pandas accepts any 

344 ``os.PathLike``. 

345 

346 Alternatively, pandas accepts an open :class:`pandas.HDFStore` object. 

347 

348 key : object, optional 

349 The group identifier in the store. Can be omitted if the HDF file 

350 contains a single pandas object. 

351 mode : {'r', 'r+', 'a'}, default 'r' 

352 Mode to use when opening the file. Ignored if path_or_buf is a 

353 :class:`pandas.HDFStore`. Default is 'r'. 

354 errors : str, default 'strict' 

355 Specifies how encoding and decoding errors are to be handled. 

356 See the errors argument for :func:`open` for a full list 

357 of options. 

358 where : list, optional 

359 A list of Term (or convertible) objects. 

360 start : int, optional 

361 Row number to start selection. 

362 stop : int, optional 

363 Row number to stop selection. 

364 columns : list, optional 

365 A list of columns names to return. 

366 iterator : bool, optional 

367 Return an iterator object. 

368 chunksize : int, optional 

369 Number of rows to include in an iteration when using an iterator. 

370 **kwargs 

371 Additional keyword arguments passed to HDFStore. 

372 

373 Returns 

374 ------- 

375 object 

376 The selected object. Return type depends on the object stored. 

377 

378 See Also 

379 -------- 

380 DataFrame.to_hdf : Write a HDF file from a DataFrame. 

381 HDFStore : Low-level access to HDF files. 

382 

383 Examples 

384 -------- 

385 >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z']) # doctest: +SKIP 

386 >>> df.to_hdf('./store.h5', 'data') # doctest: +SKIP 

387 >>> reread = pd.read_hdf('./store.h5') # doctest: +SKIP 

388 """ 

389 if mode not in ["r", "r+", "a"]: 

390 raise ValueError( 

391 f"mode {mode} is not allowed while performing a read. " 

392 f"Allowed modes are r, r+ and a." 

393 ) 

394 # grab the scope 

395 if where is not None: 

396 where = _ensure_term(where, scope_level=1) 

397 

398 if isinstance(path_or_buf, HDFStore): 

399 if not path_or_buf.is_open: 

400 raise OSError("The HDFStore must be open for reading.") 

401 

402 store = path_or_buf 

403 auto_close = False 

404 else: 

405 path_or_buf = stringify_path(path_or_buf) 

406 if not isinstance(path_or_buf, str): 

407 raise NotImplementedError( 

408 "Support for generic buffers has not been implemented." 

409 ) 

410 try: 

411 exists = os.path.exists(path_or_buf) 

412 

413 # if filepath is too long 

414 except (TypeError, ValueError): 

415 exists = False 

416 

417 if not exists: 

418 raise FileNotFoundError(f"File {path_or_buf} does not exist") 

419 

420 store = HDFStore(path_or_buf, mode=mode, errors=errors, **kwargs) 

421 # can't auto open/close if we are using an iterator 

422 # so delegate to the iterator 

423 auto_close = True 

424 

425 try: 

426 if key is None: 

427 groups = store.groups() 

428 if len(groups) == 0: 

429 raise ValueError( 

430 "Dataset(s) incompatible with Pandas data types, " 

431 "not table, or no datasets found in HDF5 file." 

432 ) 

433 candidate_only_group = groups[0] 

434 

435 # For the HDF file to have only one dataset, all other groups 

436 # should then be metadata groups for that candidate group. (This 

437 # assumes that the groups() method enumerates parent groups 

438 # before their children.) 

439 for group_to_check in groups[1:]: 

440 if not _is_metadata_of(group_to_check, candidate_only_group): 

441 raise ValueError( 

442 "key must be provided when HDF5 " 

443 "file contains multiple datasets." 

444 ) 

445 key = candidate_only_group._v_pathname 

446 return store.select( 

447 key, 

448 where=where, 

449 start=start, 

450 stop=stop, 

451 columns=columns, 

452 iterator=iterator, 

453 chunksize=chunksize, 

454 auto_close=auto_close, 

455 ) 

456 except (ValueError, TypeError, KeyError): 

457 if not isinstance(path_or_buf, HDFStore): 

458 # if there is an error, close the store if we opened it. 

459 with suppress(AttributeError): 

460 store.close() 

461 

462 raise 

463 
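# --- illustrative usage (added for exposition; not part of the pandas source) ---
# A sketch of reading back with a `where` filter and in chunks; assumes the key
# "df" was written in table format with "a" as a data column, and that
# `import pandas as pd` has been done.
#
# >>> pd.read_hdf("example.h5", "df", where="a > 1")                 # doctest: +SKIP
# >>> for chunk in pd.read_hdf("example.h5", "df", chunksize=1000):  # doctest: +SKIP
# ...     print(len(chunk))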

464 

465def _is_metadata_of(group: Node, parent_group: Node) -> bool: 

466 """Check if a given group is a metadata group for a given parent_group.""" 

467 if group._v_depth <= parent_group._v_depth: 

468 return False 

469 

470 current = group 

471 while current._v_depth > 1: 

472 parent = current._v_parent 

473 if parent == parent_group and current._v_name == "meta": 

474 return True 

475 current = current._v_parent 

476 return False 

477 

478 

479class HDFStore: 

480 """ 

481 Dict-like IO interface for storing pandas objects in PyTables. 

482 

483 Either Fixed or Table format. 

484 

485 .. warning:: 

486 

487 Pandas uses PyTables for reading and writing HDF5 files, which allows 

488 serializing object-dtype data with pickle when using the "fixed" format. 

489 Loading pickled data received from untrusted sources can be unsafe. 

490 

491 See: https://docs.python.org/3/library/pickle.html for more. 

492 

493 Parameters 

494 ---------- 

495 path : str 

496 File path to HDF5 file. 

497 mode : {'a', 'w', 'r', 'r+'}, default 'a' 

498 

499 ``'r'`` 

500 Read-only; no data can be modified. 

501 ``'w'`` 

502 Write; a new file is created (an existing file with the same 

503 name would be deleted). 

504 ``'a'`` 

505 Append; an existing file is opened for reading and writing, 

506 and if the file does not exist it is created. 

507 ``'r+'`` 

508 It is similar to ``'a'``, but the file must already exist. 

509 complevel : int, 0-9, default None 

510 Specifies a compression level for data. 

511 A value of 0 or None disables compression. 

512 complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib' 

513 Specifies the compression library to be used. 

514 As of v0.20.2 these additional compressors for Blosc are supported 

515 (default if no compressor specified: 'blosc:blosclz'): 

516 {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', 

517 'blosc:zlib', 'blosc:zstd'}. 

518 Specifying a compression library which is not available issues 

519 a ValueError. 

520 fletcher32 : bool, default False 

521 If applying compression use the fletcher32 checksum. 

522 **kwargs 

523 These parameters will be passed to the PyTables open_file method. 

524 

525 Examples 

526 -------- 

527 >>> bar = pd.DataFrame(np.random.randn(10, 4)) 

528 >>> store = pd.HDFStore('test.h5') 

529 >>> store['foo'] = bar # write to HDF5 

530 >>> bar = store['foo'] # retrieve 

531 >>> store.close() 

532 

533 **Create or load HDF5 file in-memory** 

534 

535 When passing the `driver` option to the PyTables open_file method through 

536 **kwargs, the HDF5 file is loaded or created in-memory and will only be 

537 written when closed: 

538 

539 >>> bar = pd.DataFrame(np.random.randn(10, 4)) 

540 >>> store = pd.HDFStore('test.h5', driver='H5FD_CORE') 

541 >>> store['foo'] = bar 

542 >>> store.close() # only now, data is written to disk 

543 """ 

544 

545 _handle: File | None 

546 _mode: str 

547 

548 def __init__( 

549 self, 

550 path, 

551 mode: str = "a", 

552 complevel: int | None = None, 

553 complib=None, 

554 fletcher32: bool = False, 

555 **kwargs, 

556 ) -> None: 

557 if "format" in kwargs: 

558 raise ValueError("format is not a defined argument for HDFStore") 

559 

560 tables = import_optional_dependency("tables") 

561 

562 if complib is not None and complib not in tables.filters.all_complibs: 

563 raise ValueError( 

564 f"complib only supports {tables.filters.all_complibs} compression." 

565 ) 

566 

567 if complib is None and complevel is not None: 

568 complib = tables.filters.default_complib 

569 

570 self._path = stringify_path(path) 

571 if mode is None: 

572 mode = "a" 

573 self._mode = mode 

574 self._handle = None 

575 self._complevel = complevel if complevel else 0 

576 self._complib = complib 

577 self._fletcher32 = fletcher32 

578 self._filters = None 

579 self.open(mode=mode, **kwargs) 

580 

581 def __fspath__(self) -> str: 

582 return self._path 

583 

584 @property 

585 def root(self): 

586 """return the root node""" 

587 self._check_if_open() 

588 assert self._handle is not None # for mypy 

589 return self._handle.root 

590 

591 @property 

592 def filename(self) -> str: 

593 return self._path 

594 

595 def __getitem__(self, key: str): 

596 return self.get(key) 

597 

598 def __setitem__(self, key: str, value) -> None: 

599 self.put(key, value) 

600 

601 def __delitem__(self, key: str) -> None: 

602 return self.remove(key) 

603 

604 def __getattr__(self, name: str): 

605 """allow attribute access to get stores""" 

606 try: 

607 return self.get(name) 

608 except (KeyError, ClosedFileError): 

609 pass 

610 raise AttributeError( 

611 f"'{type(self).__name__}' object has no attribute '{name}'" 

612 ) 

613 

614 def __contains__(self, key: str) -> bool: 

615 """ 

616 check for existence of this key 

617 can match the exact pathname or the pathname w/o the leading '/' 

618 """ 

619 node = self.get_node(key) 

620 if node is not None: 

621 name = node._v_pathname 

622 if key in (name, name[1:]): 

623 return True 

624 return False 

625 

626 def __len__(self) -> int: 

627 return len(self.groups()) 

628 

629 def __repr__(self) -> str: 

630 pstr = pprint_thing(self._path) 

631 return f"{type(self)}\nFile path: {pstr}\n" 

632 

633 def __enter__(self) -> HDFStore: 

634 return self 

635 

636 def __exit__( 

637 self, 

638 exc_type: type[BaseException] | None, 

639 exc_value: BaseException | None, 

640 traceback: TracebackType | None, 

641 ) -> None: 

642 self.close() 

643 

644 def keys(self, include: str = "pandas") -> list[str]: 

645 """ 

646 Return a list of keys corresponding to objects stored in HDFStore. 

647 

648 Parameters 

649 ---------- 

650 

651 include : str, default 'pandas' 

652 When include equals 'pandas', return pandas objects. 

653 When include equals 'native', return native HDF5 Table objects. 

654 

655 .. versionadded:: 1.1.0 

656 

657 Returns 

658 ------- 

659 list 

660 List of ABSOLUTE path-names (i.e. they have the leading '/'). 

661 

662 Raises 

663 ------ 

664 raises ValueError if include has an illegal value 

665 """ 

666 if include == "pandas": 

667 return [n._v_pathname for n in self.groups()] 

668 

669 elif include == "native": 

670 assert self._handle is not None # mypy 

671 return [ 

672 n._v_pathname for n in self._handle.walk_nodes("/", classname="Table") 

673 ] 

674 raise ValueError( 

675 f"`include` should be either 'pandas' or 'native' but is '{include}'" 

676 ) 

677 

678 def __iter__(self) -> Iterator[str]: 

679 return iter(self.keys()) 

680 

681 def items(self) -> Iterator[tuple[str, list]]: 

682 """ 

683 iterate on key->group 

684 """ 

685 for g in self.groups(): 

686 yield g._v_pathname, g 

687 
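# --- illustrative usage (added for exposition; not part of the pandas source) ---
# The dict-like surface: `in`, `keys()` and `items()` all resolve through
# `get_node()` / `get()`.  "store.h5" and the key "df" are assumptions.
#
# >>> with pd.HDFStore("store.h5") as store:      # doctest: +SKIP
# ...     "df" in store
# ...     list(store.keys())
# ...     [path for path, _ in store.items()]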

688 def open(self, mode: str = "a", **kwargs) -> None: 

689 """ 

690 Open the file in the specified mode 

691 

692 Parameters 

693 ---------- 

694 mode : {'a', 'w', 'r', 'r+'}, default 'a' 

695 See HDFStore docstring or tables.open_file for info about modes 

696 **kwargs 

697 These parameters will be passed to the PyTables open_file method. 

698 """ 

699 tables = _tables() 

700 

701 if self._mode != mode: 

702 # if we are changing a write mode to read, ok 

703 if self._mode in ["a", "w"] and mode in ["r", "r+"]: 

704 pass 

705 elif mode in ["w"]: 

706 # this would truncate, raise here 

707 if self.is_open: 

708 raise PossibleDataLossError( 

709 f"Re-opening the file [{self._path}] with mode [{self._mode}] " 

710 "will delete the current file!" 

711 ) 

712 

713 self._mode = mode 

714 

715 # close and reopen the handle 

716 if self.is_open: 

717 self.close() 

718 

719 if self._complevel and self._complevel > 0: 

720 self._filters = _tables().Filters( 

721 self._complevel, self._complib, fletcher32=self._fletcher32 

722 ) 

723 

724 if _table_file_open_policy_is_strict and self.is_open: 

725 msg = ( 

726 "Cannot open HDF5 file, which is already opened, " 

727 "even in read-only mode." 

728 ) 

729 raise ValueError(msg) 

730 

731 self._handle = tables.open_file(self._path, self._mode, **kwargs) 

732 

733 def close(self) -> None: 

734 """ 

735 Close the PyTables file handle 

736 """ 

737 if self._handle is not None: 

738 self._handle.close() 

739 self._handle = None 

740 

741 @property 

742 def is_open(self) -> bool: 

743 """ 

744 return a boolean indicating whether the file is open 

745 """ 

746 if self._handle is None: 

747 return False 

748 return bool(self._handle.isopen) 

749 

750 def flush(self, fsync: bool = False) -> None: 

751 """ 

752 Force all buffered modifications to be written to disk. 

753 

754 Parameters 

755 ---------- 

756 fsync : bool (default False) 

757 call ``os.fsync()`` on the file handle to force writing to disk. 

758 

759 Notes 

760 ----- 

761 Without ``fsync=True``, flushing may not guarantee that the OS writes 

762 to disk. With fsync, the operation will block until the OS claims the 

763 file has been written; however, other caching layers may still 

764 interfere. 

765 """ 

766 if self._handle is not None: 

767 self._handle.flush() 

768 if fsync: 

769 with suppress(OSError): 

770 os.fsync(self._handle.fileno()) 

771 

772 def get(self, key: str): 

773 """ 

774 Retrieve pandas object stored in file. 

775 

776 Parameters 

777 ---------- 

778 key : str 

779 

780 Returns 

781 ------- 

782 object 

783 Same type as object stored in file. 

784 """ 

785 with patch_pickle(): 

786 # GH#31167 Without this patch, pickle doesn't know how to unpickle 

787 # old DateOffset objects now that they are cdef classes. 

788 group = self.get_node(key) 

789 if group is None: 

790 raise KeyError(f"No object named {key} in the file") 

791 return self._read_group(group) 

792 

793 def select( 

794 self, 

795 key: str, 

796 where=None, 

797 start=None, 

798 stop=None, 

799 columns=None, 

800 iterator: bool = False, 

801 chunksize=None, 

802 auto_close: bool = False, 

803 ): 

804 """ 

805 Retrieve pandas object stored in file, optionally based on where criteria. 

806 

807 .. warning:: 

808 

809 Pandas uses PyTables for reading and writing HDF5 files, which allows 

810 serializing object-dtype data with pickle when using the "fixed" format. 

811 Loading pickled data received from untrusted sources can be unsafe. 

812 

813 See: https://docs.python.org/3/library/pickle.html for more. 

814 

815 Parameters 

816 ---------- 

817 key : str 

818 Object being retrieved from file. 

819 where : list or None 

820 List of Term (or convertible) objects, optional. 

821 start : int or None 

822 Row number to start selection. 

823 stop : int, default None 

824 Row number to stop selection. 

825 columns : list or None 

826 A list of columns that, if not None, will limit the returned columns. 

827 iterator : bool, default False 

828 Returns an iterator. 

829 chunksize : int or None 

830 Number of rows to include in an iteration, return an iterator. 

831 auto_close : bool, default False 

832 Should automatically close the store when finished. 

833 

834 Returns 

835 ------- 

836 object 

837 Retrieved object from file. 

838 """ 

839 group = self.get_node(key) 

840 if group is None: 

841 raise KeyError(f"No object named {key} in the file") 

842 

843 # create the storer and axes 

844 where = _ensure_term(where, scope_level=1) 

845 s = self._create_storer(group) 

846 s.infer_axes() 

847 

848 # function to call on iteration 

849 def func(_start, _stop, _where): 

850 return s.read(start=_start, stop=_stop, where=_where, columns=columns) 

851 

852 # create the iterator 

853 it = TableIterator( 

854 self, 

855 s, 

856 func, 

857 where=where, 

858 nrows=s.nrows, 

859 start=start, 

860 stop=stop, 

861 iterator=iterator, 

862 chunksize=chunksize, 

863 auto_close=auto_close, 

864 ) 

865 

866 return it.get_result() 

867 
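# --- illustrative usage (added for exposition; not part of the pandas source) ---
# `select` builds a TableIterator over the storer; with `iterator=True` or a
# `chunksize` it returns the iterator itself, otherwise the materialized result.
# Assumes `store` is an open HDFStore holding a table-format key "df" whose
# column "b" was written as a data column.
#
# >>> store.select("df", where="b == 'x'", columns=["a"])   # doctest: +SKIP
# >>> for chunk in store.select("df", chunksize=50_000):    # doctest: +SKIP
# ...     print(chunk.shape)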

868 def select_as_coordinates( 

869 self, 

870 key: str, 

871 where=None, 

872 start: int | None = None, 

873 stop: int | None = None, 

874 ): 

875 """ 

876 return the selection as an Index 

877 

878 .. warning:: 

879 

880 Pandas uses PyTables for reading and writing HDF5 files, which allows 

881 serializing object-dtype data with pickle when using the "fixed" format. 

882 Loading pickled data received from untrusted sources can be unsafe. 

883 

884 See: https://docs.python.org/3/library/pickle.html for more. 

885 

886 

887 Parameters 

888 ---------- 

889 key : str 

890 where : list of Term (or convertible) objects, optional 

891 start : integer (defaults to None), row number to start selection 

892 stop : integer (defaults to None), row number to stop selection 

893 """ 

894 where = _ensure_term(where, scope_level=1) 

895 tbl = self.get_storer(key) 

896 if not isinstance(tbl, Table): 

897 raise TypeError("can only read_coordinates with a table") 

898 return tbl.read_coordinates(where=where, start=start, stop=stop) 

899 

900 def select_column( 

901 self, 

902 key: str, 

903 column: str, 

904 start: int | None = None, 

905 stop: int | None = None, 

906 ): 

907 """ 

908 return a single column from the table. This is generally only useful to 

909 select an indexable 

910 

911 .. warning:: 

912 

913 Pandas uses PyTables for reading and writing HDF5 files, which allows 

914 serializing object-dtype data with pickle when using the "fixed" format. 

915 Loading pickled data received from untrusted sources can be unsafe. 

916 

917 See: https://docs.python.org/3/library/pickle.html for more. 

918 

919 Parameters 

920 ---------- 

921 key : str 

922 column : str 

923 The column of interest. 

924 start : int or None, default None 

925 stop : int or None, default None 

926 

927 Raises 

928 ------ 

929 raises KeyError if the column is not found (or key is not a valid 

930 store) 

931 raises ValueError if the column can not be extracted individually (it 

932 is part of a data block) 

933 

934 """ 

935 tbl = self.get_storer(key) 

936 if not isinstance(tbl, Table): 

937 raise TypeError("can only read_column with a table") 

938 return tbl.read_column(column=column, start=start, stop=stop) 

939 

940 def select_as_multiple( 

941 self, 

942 keys, 

943 where=None, 

944 selector=None, 

945 columns=None, 

946 start=None, 

947 stop=None, 

948 iterator: bool = False, 

949 chunksize=None, 

950 auto_close: bool = False, 

951 ): 

952 """ 

953 Retrieve pandas objects from multiple tables. 

954 

955 .. warning:: 

956 

957 Pandas uses PyTables for reading and writing HDF5 files, which allows 

958 serializing object-dtype data with pickle when using the "fixed" format. 

959 Loading pickled data received from untrusted sources can be unsafe. 

960 

961 See: https://docs.python.org/3/library/pickle.html for more. 

962 

963 Parameters 

964 ---------- 

965 keys : a list of the tables 

966 selector : the table to apply the where criteria (defaults to keys[0] 

967 if not supplied) 

968 columns : the columns I want back 

969 start : integer (defaults to None), row number to start selection 

970 stop : integer (defaults to None), row number to stop selection 

971 iterator : bool, return an iterator, default False 

972 chunksize : nrows to include in iteration, return an iterator 

973 auto_close : bool, default False 

974 Should automatically close the store when finished. 

975 

976 Raises 

977 ------ 

978 raises KeyError if keys or selector is not found or keys is empty 

979 raises TypeError if keys is not a list or tuple 

980 raises ValueError if the tables are not ALL THE SAME DIMENSIONS 

981 """ 

982 # default to single select 

983 where = _ensure_term(where, scope_level=1) 

984 if isinstance(keys, (list, tuple)) and len(keys) == 1: 

985 keys = keys[0] 

986 if isinstance(keys, str): 

987 return self.select( 

988 key=keys, 

989 where=where, 

990 columns=columns, 

991 start=start, 

992 stop=stop, 

993 iterator=iterator, 

994 chunksize=chunksize, 

995 auto_close=auto_close, 

996 ) 

997 

998 if not isinstance(keys, (list, tuple)): 

999 raise TypeError("keys must be a list/tuple") 

1000 

1001 if not len(keys): 

1002 raise ValueError("keys must have a non-zero length") 

1003 

1004 if selector is None: 

1005 selector = keys[0] 

1006 

1007 # collect the tables 

1008 tbls = [self.get_storer(k) for k in keys] 

1009 s = self.get_storer(selector) 

1010 

1011 # validate rows 

1012 nrows = None 

1013 for t, k in itertools.chain([(s, selector)], zip(tbls, keys)): 

1014 if t is None: 

1015 raise KeyError(f"Invalid table [{k}]") 

1016 if not t.is_table: 

1017 raise TypeError( 

1018 f"object [{t.pathname}] is not a table, and cannot be used in all " 

1019 "select as multiple" 

1020 ) 

1021 

1022 if nrows is None: 

1023 nrows = t.nrows 

1024 elif t.nrows != nrows: 

1025 raise ValueError("all tables must have exactly the same nrows!") 

1026 

1027 # The isinstance checks here are redundant with the check above, 

1028 # but necessary for mypy; see GH#29757 

1029 _tbls = [x for x in tbls if isinstance(x, Table)] 

1030 

1031 # axis is the concentration axes 

1032 # axis is the concatenation axis 

1033 

1034 def func(_start, _stop, _where): 

1035 # retrieve the objs, _where is always passed as a set of 

1036 # coordinates here 

1037 objs = [ 

1038 t.read(where=_where, columns=columns, start=_start, stop=_stop) 

1039 for t in tbls 

1040 ] 

1041 

1042 # concat and return 

1043 return concat(objs, axis=axis, verify_integrity=False)._consolidate() 

1044 

1045 # create the iterator 

1046 it = TableIterator( 

1047 self, 

1048 s, 

1049 func, 

1050 where=where, 

1051 nrows=nrows, 

1052 start=start, 

1053 stop=stop, 

1054 iterator=iterator, 

1055 chunksize=chunksize, 

1056 auto_close=auto_close, 

1057 ) 

1058 

1059 return it.get_result(coordinates=True) 

1060 

1061 def put( 

1062 self, 

1063 key: str, 

1064 value: DataFrame | Series, 

1065 format=None, 

1066 index: bool = True, 

1067 append: bool = False, 

1068 complib=None, 

1069 complevel: int | None = None, 

1070 min_itemsize: int | dict[str, int] | None = None, 

1071 nan_rep=None, 

1072 data_columns: Literal[True] | list[str] | None = None, 

1073 encoding=None, 

1074 errors: str = "strict", 

1075 track_times: bool = True, 

1076 dropna: bool = False, 

1077 ) -> None: 

1078 """ 

1079 Store object in HDFStore. 

1080 

1081 Parameters 

1082 ---------- 

1083 key : str 

1084 value : {Series, DataFrame} 

1085 format : 'fixed(f)|table(t)', default is 'fixed' 

1086 Format to use when storing object in HDFStore. Value can be one of: 

1087 

1088 ``'fixed'`` 

1089 Fixed format. Fast writing/reading. Not-appendable, nor searchable. 

1090 ``'table'`` 

1091 Table format. Write as a PyTables Table structure which may perform 

1092 worse but allow more flexible operations like searching / selecting 

1093 subsets of the data. 

1094 index : bool, default True 

1095 Write DataFrame index as a column. 

1096 append : bool, default False 

1097 This will force Table format, append the input data to the existing. 

1098 data_columns : list of columns or True, default None 

1099 List of columns to create as data columns, or True to use all columns. 

1100 See `here 

1101 <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__. 

1102 encoding : str, default None 

1103 Provide an encoding for strings. 

1104 track_times : bool, default True 

1105 Parameter is propagated to 'create_table' method of 'PyTables'. 

1106 If set to False it allows having identical h5 files (same hashes) 

1107 independent of creation time. 

1108 dropna : bool, default False, optional 

1109 Remove missing values. 

1110 

1111 .. versionadded:: 1.1.0 

1112 """ 

1113 if format is None: 

1114 format = get_option("io.hdf.default_format") or "fixed" 

1115 format = self._validate_format(format) 

1116 self._write_to_group( 

1117 key, 

1118 value, 

1119 format=format, 

1120 index=index, 

1121 append=append, 

1122 complib=complib, 

1123 complevel=complevel, 

1124 min_itemsize=min_itemsize, 

1125 nan_rep=nan_rep, 

1126 data_columns=data_columns, 

1127 encoding=encoding, 

1128 errors=errors, 

1129 track_times=track_times, 

1130 dropna=dropna, 

1131 ) 

1132 
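# --- illustrative usage (added for exposition; not part of the pandas source) ---
# `put` (re)writes a node: 'fixed' is fast but neither appendable nor queryable,
# 'table' enables where-based selection.  Key names here are assumptions.
#
# >>> store.put("fixed_df", df)                                      # doctest: +SKIP
# >>> store.put("table_df", df, format="table", data_columns=["a"])  # doctest: +SKIP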

1133 def remove(self, key: str, where=None, start=None, stop=None) -> None: 

1134 """ 

1135 Remove pandas object partially by specifying the where condition 

1136 

1137 Parameters 

1138 ---------- 

1139 key : str 

1140 Node to remove or delete rows from 

1141 where : list of Term (or convertible) objects, optional 

1142 start : integer (defaults to None), row number to start selection 

1143 stop : integer (defaults to None), row number to stop selection 

1144 

1145 Returns 

1146 ------- 

1147 number of rows removed (or None if not a Table) 

1148 

1149 Raises 

1150 ------ 

1151 raises KeyError if key is not a valid store 

1152 

1153 """ 

1154 where = _ensure_term(where, scope_level=1) 

1155 try: 

1156 s = self.get_storer(key) 

1157 except KeyError: 

1158 # the key is not a valid store, re-raising KeyError 

1159 raise 

1160 except AssertionError: 

1161 # surface any assertion errors for e.g. debugging 

1162 raise 

1163 except Exception as err: 

1164 # In tests we get here with ClosedFileError, TypeError, and 

1165 # _table_mod.NoSuchNodeError. TODO: Catch only these? 

1166 

1167 if where is not None: 

1168 raise ValueError( 

1169 "trying to remove a node with a non-None where clause!" 

1170 ) from err 

1171 

1172 # we are actually trying to remove a node (with children) 

1173 node = self.get_node(key) 

1174 if node is not None: 

1175 node._f_remove(recursive=True) 

1176 return None 

1177 

1178 # remove the node 

1179 if com.all_none(where, start, stop): 

1180 s.group._f_remove(recursive=True) 

1181 

1182 # delete from the table 

1183 else: 

1184 if not s.is_table: 

1185 raise ValueError( 

1186 "can only remove with where on objects written as tables" 

1187 ) 

1188 return s.delete(where=where, start=start, stop=stop) 

1189 
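# --- illustrative usage (added for exposition; not part of the pandas source) ---
# Removing a whole node versus deleting rows: a `where` clause only works on
# nodes written in table format.  Key names are assumptions.
#
# >>> store.remove("fixed_df")                               # doctest: +SKIP
# >>> store.remove("table_df", where="index > 100")          # doctest: +SKIP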

1190 def append( 

1191 self, 

1192 key: str, 

1193 value: DataFrame | Series, 

1194 format=None, 

1195 axes=None, 

1196 index: bool | list[str] = True, 

1197 append: bool = True, 

1198 complib=None, 

1199 complevel: int | None = None, 

1200 columns=None, 

1201 min_itemsize: int | dict[str, int] | None = None, 

1202 nan_rep=None, 

1203 chunksize=None, 

1204 expectedrows=None, 

1205 dropna: bool | None = None, 

1206 data_columns: Literal[True] | list[str] | None = None, 

1207 encoding=None, 

1208 errors: str = "strict", 

1209 ) -> None: 

1210 """ 

1211 Append to Table in file. 

1212 

1213 Node must already exist and be Table format. 

1214 

1215 Parameters 

1216 ---------- 

1217 key : str 

1218 value : {Series, DataFrame} 

1219 format : 'table' is the default 

1220 Format to use when storing object in HDFStore. Value can be one of: 

1221 

1222 ``'table'`` 

1223 Table format. Write as a PyTables Table structure which may perform 

1224 worse but allow more flexible operations like searching / selecting 

1225 subsets of the data. 

1226 index : bool, default True 

1227 Write DataFrame index as a column. 

1228 append : bool, default True 

1229 Append the input data to the existing. 

1230 data_columns : list of columns, or True, default None 

1231 List of columns to create as indexed data columns for on-disk 

1232 queries, or True to use all columns. By default only the axes 

1233 of the object are indexed. See `here 

1234 <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__. 

1235 min_itemsize : dict of columns that specify minimum str sizes 

1236 nan_rep : str to use as str nan representation 

1237 chunksize : size to chunk the writing 

1238 expectedrows : expected TOTAL row size of this table 

1239 encoding : default None, provide an encoding for str 

1240 dropna : bool, default False, optional 

1241 Do not write an ALL nan row to the store, settable 

1242 by the option 'io.hdf.dropna_table'. 

1243 

1244 Notes 

1245 ----- 

1246 Does *not* check if data being appended overlaps with existing 

1247 data in the table, so be careful 

1248 """ 

1249 if columns is not None: 

1250 raise TypeError( 

1251 "columns is not a supported keyword in append, try data_columns" 

1252 ) 

1253 

1254 if dropna is None: 

1255 dropna = get_option("io.hdf.dropna_table") 

1256 if format is None: 

1257 format = get_option("io.hdf.default_format") or "table" 

1258 format = self._validate_format(format) 

1259 self._write_to_group( 

1260 key, 

1261 value, 

1262 format=format, 

1263 axes=axes, 

1264 index=index, 

1265 append=append, 

1266 complib=complib, 

1267 complevel=complevel, 

1268 min_itemsize=min_itemsize, 

1269 nan_rep=nan_rep, 

1270 chunksize=chunksize, 

1271 expectedrows=expectedrows, 

1272 dropna=dropna, 

1273 data_columns=data_columns, 

1274 encoding=encoding, 

1275 errors=errors, 

1276 ) 

1277 
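# --- illustrative usage (added for exposition; not part of the pandas source) ---
# `append` forces table format and grows an existing table; `min_itemsize`
# reserves string width for longer values appended later.  Names are assumptions.
#
# >>> store.append("events", df1, data_columns=["code"],
# ...              min_itemsize={"code": 16})                # doctest: +SKIP
# >>> store.append("events", df2)                            # doctest: +SKIP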

1278 def append_to_multiple( 

1279 self, 

1280 d: dict, 

1281 value, 

1282 selector, 

1283 data_columns=None, 

1284 axes=None, 

1285 dropna: bool = False, 

1286 **kwargs, 

1287 ) -> None: 

1288 """ 

1289 Append to multiple tables 

1290 

1291 Parameters 

1292 ---------- 

1293 d : a dict of table_name to table_columns, None is acceptable as the 

1294 values of one node (this will get all the remaining columns) 

1295 value : a pandas object 

1296 selector : a string that designates the indexable table; all of its 

1297 columns will be designated as data_columns, unless data_columns is 

1298 passed, in which case these are used 

1299 data_columns : list of columns to create as data columns, or True to 

1300 use all columns 

1301 dropna : if it evaluates to True, drop rows from all tables if any single 

1302 row in each table is all NaN. Default False. 

1303 

1304 Notes 

1305 ----- 

1306 axes parameter is currently not accepted 

1307 

1308 """ 

1309 if axes is not None: 

1310 raise TypeError( 

1311 "axes is currently not accepted as a parameter to append_to_multiple; " 

1312 "you can create the tables independently instead" 

1313 ) 

1314 

1315 if not isinstance(d, dict): 

1316 raise ValueError( 

1317 "append_to_multiple must have a dictionary specified as the " 

1318 "way to split the value" 

1319 ) 

1320 

1321 if selector not in d: 

1322 raise ValueError( 

1323 "append_to_multiple requires a selector that is in passed dict" 

1324 ) 

1325 

1326 # figure out the splitting axis (the non_index_axis) 

1327 axis = list(set(range(value.ndim)) - set(_AXES_MAP[type(value)]))[0] 

1328 

1329 # figure out how to split the value 

1330 remain_key = None 

1331 remain_values: list = [] 

1332 for k, v in d.items(): 

1333 if v is None: 

1334 if remain_key is not None: 

1335 raise ValueError( 

1336 "append_to_multiple can only have one value in d that is None" 

1337 ) 

1338 remain_key = k 

1339 else: 

1340 remain_values.extend(v) 

1341 if remain_key is not None: 

1342 ordered = value.axes[axis] 

1343 ordd = ordered.difference(Index(remain_values)) 

1344 ordd = sorted(ordered.get_indexer(ordd)) 

1345 d[remain_key] = ordered.take(ordd) 

1346 

1347 # data_columns 

1348 if data_columns is None: 

1349 data_columns = d[selector] 

1350 

1351 # ensure rows are synchronized across the tables 

1352 if dropna: 

1353 idxs = (value[cols].dropna(how="all").index for cols in d.values()) 

1354 valid_index = next(idxs) 

1355 for index in idxs: 

1356 valid_index = valid_index.intersection(index) 

1357 value = value.loc[valid_index] 

1358 

1359 min_itemsize = kwargs.pop("min_itemsize", None) 

1360 

1361 # append 

1362 for k, v in d.items(): 

1363 dc = data_columns if k == selector else None 

1364 

1365 # compute the val 

1366 val = value.reindex(v, axis=axis) 

1367 

1368 filtered = ( 

1369 {key: value for (key, value) in min_itemsize.items() if key in v} 

1370 if min_itemsize is not None 

1371 else None 

1372 ) 

1373 self.append(k, val, data_columns=dc, min_itemsize=filtered, **kwargs) 

1374 
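# --- illustrative usage (added for exposition; not part of the pandas source) ---
# Splitting one frame across two tables and joining them back with
# `select_as_multiple`; the selector table carries the queryable columns.
# Key and column names are assumptions.
#
# >>> store.append_to_multiple(
# ...     {"t_sel": ["a", "b"], "t_rest": None}, df, selector="t_sel"
# ... )                                                      # doctest: +SKIP
# >>> store.select_as_multiple(
# ...     ["t_sel", "t_rest"], where="a > 0", selector="t_sel"
# ... )                                                      # doctest: +SKIP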

1375 def create_table_index( 

1376 self, 

1377 key: str, 

1378 columns=None, 

1379 optlevel: int | None = None, 

1380 kind: str | None = None, 

1381 ) -> None: 

1382 """ 

1383 Create a pytables index on the table. 

1384 

1385 Parameters 

1386 ---------- 

1387 key : str 

1388 columns : None, bool, or listlike[str] 

1389 Indicate which columns to create an index on. 

1390 

1391 * False : Do not create any indexes. 

1392 * True : Create indexes on all columns. 

1393 * None : Create indexes on all columns. 

1394 * listlike : Create indexes on the given columns. 

1395 

1396 optlevel : int or None, default None 

1397 Optimization level, if None, pytables defaults to 6. 

1398 kind : str or None, default None 

1399 Kind of index, if None, pytables defaults to "medium". 

1400 

1401 Raises 

1402 ------ 

1403 TypeError: raises if the node is not a table 

1404 """ 

1405 # version requirements 

1406 _tables() 

1407 s = self.get_storer(key) 

1408 if s is None: 

1409 return 

1410 

1411 if not isinstance(s, Table): 

1412 raise TypeError("cannot create table index on a Fixed format store") 

1413 s.create_index(columns=columns, optlevel=optlevel, kind=kind) 

1414 
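# --- illustrative usage (added for exposition; not part of the pandas source) ---
# Building a full PyTables index on a data column to speed up where-queries;
# the column must have been written as a data column.  Names are assumptions.
#
# >>> store.append("df", df, data_columns=["b"])             # doctest: +SKIP
# >>> store.create_table_index("df", columns=["b"],
# ...                          optlevel=9, kind="full")      # doctest: +SKIP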

1415 def groups(self) -> list: 

1416 """ 

1417 Return a list of all the top-level nodes. 

1418 

1419 Each node returned is not a pandas storage object. 

1420 

1421 Returns 

1422 ------- 

1423 list 

1424 List of objects. 

1425 """ 

1426 _tables() 

1427 self._check_if_open() 

1428 assert self._handle is not None # for mypy 

1429 assert _table_mod is not None # for mypy 

1430 return [ 

1431 g 

1432 for g in self._handle.walk_groups() 

1433 if ( 

1434 not isinstance(g, _table_mod.link.Link) 

1435 and ( 

1436 getattr(g._v_attrs, "pandas_type", None) 

1437 or getattr(g, "table", None) 

1438 or (isinstance(g, _table_mod.table.Table) and g._v_name != "table") 

1439 ) 

1440 ) 

1441 ] 

1442 

1443 def walk(self, where: str = "/") -> Iterator[tuple[str, list[str], list[str]]]: 

1444 """ 

1445 Walk the pytables group hierarchy for pandas objects. 

1446 

1447 This generator will yield the group path, subgroups and pandas object 

1448 names for each group. 

1449 

1450 Any non-pandas PyTables objects that are not a group will be ignored. 

1451 

1452 The `where` group itself is listed first (preorder), then each of its 

1453 child groups (following an alphanumerical order) is also traversed, 

1454 following the same procedure. 

1455 

1456 Parameters 

1457 ---------- 

1458 where : str, default "/" 

1459 Group where to start walking. 

1460 

1461 Yields 

1462 ------ 

1463 path : str 

1464 Full path to a group (without trailing '/'). 

1465 groups : list 

1466 Names (strings) of the groups contained in `path`. 

1467 leaves : list 

1468 Names (strings) of the pandas objects contained in `path`. 

1469 """ 

1470 _tables() 

1471 self._check_if_open() 

1472 assert self._handle is not None # for mypy 

1473 assert _table_mod is not None # for mypy 

1474 

1475 for g in self._handle.walk_groups(where): 

1476 if getattr(g._v_attrs, "pandas_type", None) is not None: 

1477 continue 

1478 

1479 groups = [] 

1480 leaves = [] 

1481 for child in g._v_children.values(): 

1482 pandas_type = getattr(child._v_attrs, "pandas_type", None) 

1483 if pandas_type is None: 

1484 if isinstance(child, _table_mod.group.Group): 

1485 groups.append(child._v_name) 

1486 else: 

1487 leaves.append(child._v_name) 

1488 

1489 yield (g._v_pathname.rstrip("/"), groups, leaves) 

1490 
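# --- illustrative usage (added for exposition; not part of the pandas source) ---
# `walk` yields (path, subgroups, pandas-object names), preorder from `where`.
#
# >>> for path, groups, leaves in store.walk("/"):           # doctest: +SKIP
# ...     for leaf in leaves:
# ...         print(f"{path}/{leaf}")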

1491 def get_node(self, key: str) -> Node | None: 

1492 """return the node with the key or None if it does not exist""" 

1493 self._check_if_open() 

1494 if not key.startswith("/"): 

1495 key = "/" + key 

1496 

1497 assert self._handle is not None 

1498 assert _table_mod is not None # for mypy 

1499 try: 

1500 node = self._handle.get_node(self.root, key) 

1501 except _table_mod.exceptions.NoSuchNodeError: 

1502 return None 

1503 

1504 assert isinstance(node, _table_mod.Node), type(node) 

1505 return node 

1506 

1507 def get_storer(self, key: str) -> GenericFixed | Table: 

1508 """return the storer object for a key, raise if not in the file""" 

1509 group = self.get_node(key) 

1510 if group is None: 

1511 raise KeyError(f"No object named {key} in the file") 

1512 

1513 s = self._create_storer(group) 

1514 s.infer_axes() 

1515 return s 

1516 

1517 def copy( 

1518 self, 

1519 file, 

1520 mode: str = "w", 

1521 propindexes: bool = True, 

1522 keys=None, 

1523 complib=None, 

1524 complevel: int | None = None, 

1525 fletcher32: bool = False, 

1526 overwrite: bool = True, 

1527 ) -> HDFStore: 

1528 """ 

1529 Copy the existing store to a new file, updating in place. 

1530 

1531 Parameters 

1532 ---------- 

1533 propindexes : bool, default True 

1534 Restore indexes in copied file. 

1535 keys : list, optional 

1536 List of keys to include in the copy (defaults to all). 

1537 overwrite : bool, default True 

1538 Whether to overwrite (remove and replace) existing nodes in the new store. 

1539 mode, complib, complevel, fletcher32 same as in HDFStore.__init__ 

1540 

1541 Returns 

1542 ------- 

1543 open file handle of the new store 

1544 """ 

1545 new_store = HDFStore( 

1546 file, mode=mode, complib=complib, complevel=complevel, fletcher32=fletcher32 

1547 ) 

1548 if keys is None: 

1549 keys = list(self.keys()) 

1550 if not isinstance(keys, (tuple, list)): 

1551 keys = [keys] 

1552 for k in keys: 

1553 s = self.get_storer(k) 

1554 if s is not None: 

1555 if k in new_store: 

1556 if overwrite: 

1557 new_store.remove(k) 

1558 

1559 data = self.select(k) 

1560 if isinstance(s, Table): 

1561 index: bool | list[str] = False 

1562 if propindexes: 

1563 index = [a.name for a in s.axes if a.is_indexed] 

1564 new_store.append( 

1565 k, 

1566 data, 

1567 index=index, 

1568 data_columns=getattr(s, "data_columns", None), 

1569 encoding=s.encoding, 

1570 ) 

1571 else: 

1572 new_store.put(k, data, encoding=s.encoding) 

1573 

1574 return new_store 

1575 
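# --- illustrative usage (added for exposition; not part of the pandas source) ---
# Rewriting the store into a new, compressed file; this is also the upgrade path
# suggested by `incompatibility_doc` above.  "new.h5" is an assumption.
#
# >>> new = store.copy("new.h5", complib="blosc", complevel=9,
# ...                  propindexes=True)                     # doctest: +SKIP
# >>> new.close()                                            # doctest: +SKIP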

1576 def info(self) -> str: 

1577 """ 

1578 Print detailed information on the store. 

1579 

1580 Returns 

1581 ------- 

1582 str 

1583 """ 

1584 path = pprint_thing(self._path) 

1585 output = f"{type(self)}\nFile path: {path}\n" 

1586 

1587 if self.is_open: 

1588 lkeys = sorted(self.keys()) 

1589 if len(lkeys): 

1590 keys = [] 

1591 values = [] 

1592 

1593 for k in lkeys: 

1594 try: 

1595 s = self.get_storer(k) 

1596 if s is not None: 

1597 keys.append(pprint_thing(s.pathname or k)) 

1598 values.append(pprint_thing(s or "invalid_HDFStore node")) 

1599 except AssertionError: 

1600 # surface any assertion errors for e.g. debugging 

1601 raise 

1602 except Exception as detail: 

1603 keys.append(k) 

1604 dstr = pprint_thing(detail) 

1605 values.append(f"[invalid_HDFStore node: {dstr}]") 

1606 

1607 output += adjoin(12, keys, values) 

1608 else: 

1609 output += "Empty" 

1610 else: 

1611 output += "File is CLOSED" 

1612 

1613 return output 

1614 

1615 # ------------------------------------------------------------------------ 

1616 # private methods 

1617 

1618 def _check_if_open(self): 

1619 if not self.is_open: 

1620 raise ClosedFileError(f"{self._path} file is not open!") 

1621 

1622 def _validate_format(self, format: str) -> str: 

1623 """validate / deprecate formats""" 

1624 # validate 

1625 try: 

1626 format = _FORMAT_MAP[format.lower()] 

1627 except KeyError as err: 

1628 raise TypeError(f"invalid HDFStore format specified [{format}]") from err 

1629 

1630 return format 

1631 

1632 def _create_storer( 

1633 self, 

1634 group, 

1635 format=None, 

1636 value: DataFrame | Series | None = None, 

1637 encoding: str = "UTF-8", 

1638 errors: str = "strict", 

1639 ) -> GenericFixed | Table: 

1640 """return a suitable class to operate""" 

1641 cls: type[GenericFixed] | type[Table] 

1642 

1643 if value is not None and not isinstance(value, (Series, DataFrame)): 

1644 raise TypeError("value must be None, Series, or DataFrame") 

1645 

1646 pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None)) 

1647 tt = _ensure_decoded(getattr(group._v_attrs, "table_type", None)) 

1648 

1649 # infer the pt from the passed value 

1650 if pt is None: 

1651 if value is None: 

1652 _tables() 

1653 assert _table_mod is not None # for mypy 

1654 if getattr(group, "table", None) or isinstance( 

1655 group, _table_mod.table.Table 

1656 ): 

1657 pt = "frame_table" 

1658 tt = "generic_table" 

1659 else: 

1660 raise TypeError( 

1661 "cannot create a storer if the object is not existing " 

1662 "nor a value are passed" 

1663 ) 

1664 else: 

1665 if isinstance(value, Series): 

1666 pt = "series" 

1667 else: 

1668 pt = "frame" 

1669 

1670 # we are actually a table 

1671 if format == "table": 

1672 pt += "_table" 

1673 

1674 # a storer node 

1675 if "table" not in pt: 

1676 _STORER_MAP = {"series": SeriesFixed, "frame": FrameFixed} 

1677 try: 

1678 cls = _STORER_MAP[pt] 

1679 except KeyError as err: 

1680 raise TypeError( 

1681 f"cannot properly create the storer for: [_STORER_MAP] [group->" 

1682 f"{group},value->{type(value)},format->{format}" 

1683 ) from err 

1684 return cls(self, group, encoding=encoding, errors=errors) 

1685 

1686 # existing node (and must be a table) 

1687 if tt is None: 

1688 # if we are a writer, determine the tt 

1689 if value is not None: 

1690 if pt == "series_table": 

1691 index = getattr(value, "index", None) 

1692 if index is not None: 

1693 if index.nlevels == 1: 

1694 tt = "appendable_series" 

1695 elif index.nlevels > 1: 

1696 tt = "appendable_multiseries" 

1697 elif pt == "frame_table": 

1698 index = getattr(value, "index", None) 

1699 if index is not None: 

1700 if index.nlevels == 1: 

1701 tt = "appendable_frame" 

1702 elif index.nlevels > 1: 

1703 tt = "appendable_multiframe" 

1704 

1705 _TABLE_MAP = { 

1706 "generic_table": GenericTable, 

1707 "appendable_series": AppendableSeriesTable, 

1708 "appendable_multiseries": AppendableMultiSeriesTable, 

1709 "appendable_frame": AppendableFrameTable, 

1710 "appendable_multiframe": AppendableMultiFrameTable, 

1711 "worm": WORMTable, 

1712 } 

1713 try: 

1714 cls = _TABLE_MAP[tt] 

1715 except KeyError as err: 

1716 raise TypeError( 

1717 f"cannot properly create the storer for: [_TABLE_MAP] [group->" 

1718 f"{group},value->{type(value)},format->{format}" 

1719 ) from err 

1720 

1721 return cls(self, group, encoding=encoding, errors=errors) 

1722 

1723 def _write_to_group( 

1724 self, 

1725 key: str, 

1726 value: DataFrame | Series, 

1727 format, 

1728 axes=None, 

1729 index: bool | list[str] = True, 

1730 append: bool = False, 

1731 complib=None, 

1732 complevel: int | None = None, 

1733 fletcher32=None, 

1734 min_itemsize: int | dict[str, int] | None = None, 

1735 chunksize=None, 

1736 expectedrows=None, 

1737 dropna: bool = False, 

1738 nan_rep=None, 

1739 data_columns=None, 

1740 encoding=None, 

1741 errors: str = "strict", 

1742 track_times: bool = True, 

1743 ) -> None: 

1744 # we don't want to store a table node at all if our object is 0-len 

1745 # as there are no dtypes 

1746 if getattr(value, "empty", None) and (format == "table" or append): 

1747 return 

1748 

1749 group = self._identify_group(key, append) 

1750 

1751 s = self._create_storer(group, format, value, encoding=encoding, errors=errors) 

1752 if append: 

1753 # raise if we are trying to append to a Fixed format, 

1754 # or a table that exists (and we are putting) 

1755 if not s.is_table or (s.is_table and format == "fixed" and s.is_exists): 

1756 raise ValueError("Can only append to Tables") 

1757 if not s.is_exists: 

1758 s.set_object_info() 

1759 else: 

1760 s.set_object_info() 

1761 

1762 if not s.is_table and complib: 

1763 raise ValueError("Compression not supported on Fixed format stores") 

1764 

1765 # write the object 

1766 s.write( 

1767 obj=value, 

1768 axes=axes, 

1769 append=append, 

1770 complib=complib, 

1771 complevel=complevel, 

1772 fletcher32=fletcher32, 

1773 min_itemsize=min_itemsize, 

1774 chunksize=chunksize, 

1775 expectedrows=expectedrows, 

1776 dropna=dropna, 

1777 nan_rep=nan_rep, 

1778 data_columns=data_columns, 

1779 track_times=track_times, 

1780 ) 

1781 

1782 if isinstance(s, Table) and index: 

1783 s.create_index(columns=index) 

1784 

1785 def _read_group(self, group: Node): 

1786 s = self._create_storer(group) 

1787 s.infer_axes() 

1788 return s.read() 

1789 

1790 def _identify_group(self, key: str, append: bool) -> Node: 

1791 """Identify HDF5 group based on key, delete/create group if needed.""" 

1792 group = self.get_node(key) 

1793 

1794 # we make this assertion for mypy; the get_node call will already 

1795 # have raised if this is incorrect 

1796 assert self._handle is not None 

1797 

1798 # remove the node if we are not appending 

1799 if group is not None and not append: 

1800 self._handle.remove_node(group, recursive=True) 

1801 group = None 

1802 

1803 if group is None: 

1804 group = self._create_nodes_and_group(key) 

1805 

1806 return group 

1807 

1808 def _create_nodes_and_group(self, key: str) -> Node: 

1809 """Create nodes from key and return group name.""" 

1810 # assertion for mypy 

1811 assert self._handle is not None 

1812 

1813 paths = key.split("/") 

1814 # recursively create the groups 

1815 path = "/" 

1816 for p in paths: 

1817 if not len(p): 

1818 continue 

1819 new_path = path 

1820 if not path.endswith("/"): 

1821 new_path += "/" 

1822 new_path += p 

1823 group = self.get_node(new_path) 

1824 if group is None: 

1825 group = self._handle.create_group(path, p) 

1826 path = new_path 

1827 return group 

1828 

1829 

1830class TableIterator: 

1831 """ 

1832 Define the iteration interface on a table 

1833 

1834 Parameters 

1835 ---------- 

1836 store : HDFStore 

1837 s : the referred storer 

1838 func : the function to execute the query 

1839 where : the where of the query 

1840 nrows : the rows to iterate on 

1841 start : the passed start value (default is None) 

1842 stop : the passed stop value (default is None) 

1843 iterator : bool, default False 

1844 Whether to use the default iterator. 

1845 chunksize : the passed chunking value (default is 100000) 

1846 auto_close : bool, default False 

1847 Whether to automatically close the store at the end of iteration. 

1848 """ 

1849 

1850 chunksize: int | None 

1851 store: HDFStore 

1852 s: GenericFixed | Table 

1853 

1854 def __init__( 

1855 self, 

1856 store: HDFStore, 

1857 s: GenericFixed | Table, 

1858 func, 

1859 where, 

1860 nrows, 

1861 start=None, 

1862 stop=None, 

1863 iterator: bool = False, 

1864 chunksize: int | None = None, 

1865 auto_close: bool = False, 

1866 ) -> None: 

1867 self.store = store 

1868 self.s = s 

1869 self.func = func 

1870 self.where = where 

1871 

1872 # set start/stop if they are not set and we are a table 

1873 if self.s.is_table: 

1874 if nrows is None: 

1875 nrows = 0 

1876 if start is None: 

1877 start = 0 

1878 if stop is None: 

1879 stop = nrows 

1880 stop = min(nrows, stop) 

1881 

1882 self.nrows = nrows 

1883 self.start = start 

1884 self.stop = stop 

1885 

1886 self.coordinates = None 

1887 if iterator or chunksize is not None: 

1888 if chunksize is None: 

1889 chunksize = 100000 

1890 self.chunksize = int(chunksize) 

1891 else: 

1892 self.chunksize = None 

1893 

1894 self.auto_close = auto_close 

1895 

1896 def __iter__(self) -> Iterator: 

1897 # iterate 

1898 current = self.start 

1899 if self.coordinates is None: 

1900 raise ValueError("Cannot iterate until get_result is called.") 

1901 while current < self.stop: 

1902 stop = min(current + self.chunksize, self.stop) 

1903 value = self.func(None, None, self.coordinates[current:stop]) 

1904 current = stop 

1905 if value is None or not len(value): 

1906 continue 

1907 

1908 yield value 

1909 

1910 self.close() 

1911 

1912 def close(self) -> None: 

1913 if self.auto_close: 

1914 self.store.close() 

1915 

1916 def get_result(self, coordinates: bool = False): 

1917 # return the actual iterator 

1918 if self.chunksize is not None: 

1919 if not isinstance(self.s, Table): 

1920 raise TypeError("can only use an iterator or chunksize on a table") 

1921 

1922 self.coordinates = self.s.read_coordinates(where=self.where) 

1923 

1924 return self 

1925 

1926        # if specified, read via coordinates (necessary for multiple selections)

1927 if coordinates: 

1928 if not isinstance(self.s, Table): 

1929 raise TypeError("can only read_coordinates on a table") 

1930 where = self.s.read_coordinates( 

1931 where=self.where, start=self.start, stop=self.stop 

1932 ) 

1933 else: 

1934 where = self.where 

1935 

1936 # directly return the result 

1937 results = self.func(self.start, self.stop, where) 

1938 self.close() 

1939 return results 

1940 

1941 
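TableIterator is what HDFStore.select hands back when iterator=True or a chunksize is passed: get_result first resolves the selection coordinates, and __iter__ then yields one DataFrame per chunksize-sized slice of those coordinates, closing the store afterwards if auto_close was requested. A hedged usage sketch (the file name is illustrative):

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": np.arange(1_000), "b": np.arange(1_000) % 7})
with pd.HDFStore("demo.h5", mode="w") as store:
    store.append("df", df)                      # table format, so chunked reads are allowed
    for chunk in store.select("df", chunksize=200):
        print(len(chunk))                       # five chunks of 200 rows each
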

1942class IndexCol: 

1943 """ 

1944 an index column description class 

1945 

1946 Parameters 

1947 ---------- 

1948 axis : axis which I reference 

1949 values : the ndarray like converted values 

1950 kind : a string description of this type 

1951 typ : the pytables type 

1952 pos : the position in the pytables 

1953 

1954 """ 

1955 

1956 is_an_indexable: bool = True 

1957 is_data_indexable: bool = True 

1958 _info_fields = ["freq", "tz", "index_name"] 

1959 

1960 def __init__( 

1961 self, 

1962 name: str, 

1963 values=None, 

1964 kind=None, 

1965 typ=None, 

1966 cname: str | None = None, 

1967 axis=None, 

1968 pos=None, 

1969 freq=None, 

1970 tz=None, 

1971 index_name=None, 

1972 ordered=None, 

1973 table=None, 

1974 meta=None, 

1975 metadata=None, 

1976 ) -> None: 

1977 if not isinstance(name, str): 

1978 raise ValueError("`name` must be a str.") 

1979 

1980 self.values = values 

1981 self.kind = kind 

1982 self.typ = typ 

1983 self.name = name 

1984 self.cname = cname or name 

1985 self.axis = axis 

1986 self.pos = pos 

1987 self.freq = freq 

1988 self.tz = tz 

1989 self.index_name = index_name 

1990 self.ordered = ordered 

1991 self.table = table 

1992 self.meta = meta 

1993 self.metadata = metadata 

1994 

1995 if pos is not None: 

1996 self.set_pos(pos) 

1997 

1998 # These are ensured as long as the passed arguments match the 

1999 # constructor annotations. 

2000 assert isinstance(self.name, str) 

2001 assert isinstance(self.cname, str) 

2002 

2003 @property 

2004 def itemsize(self) -> int: 

2005 # Assumes self.typ has already been initialized 

2006 return self.typ.itemsize 

2007 

2008 @property 

2009 def kind_attr(self) -> str: 

2010 return f"{self.name}_kind" 

2011 

2012 def set_pos(self, pos: int) -> None: 

2013 """set the position of this column in the Table""" 

2014 self.pos = pos 

2015 if pos is not None and self.typ is not None: 

2016 self.typ._v_pos = pos 

2017 

2018 def __repr__(self) -> str: 

2019 temp = tuple( 

2020 map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind)) 

2021 ) 

2022 return ",".join( 

2023 [ 

2024 f"{key}->{value}" 

2025 for key, value in zip(["name", "cname", "axis", "pos", "kind"], temp) 

2026 ] 

2027 ) 

2028 

2029 def __eq__(self, other: Any) -> bool: 

2030 """compare 2 col items""" 

2031 return all( 

2032 getattr(self, a, None) == getattr(other, a, None) 

2033 for a in ["name", "cname", "axis", "pos"] 

2034 ) 

2035 

2036 def __ne__(self, other) -> bool: 

2037 return not self.__eq__(other) 

2038 

2039 @property 

2040 def is_indexed(self) -> bool: 

2041 """return whether I am an indexed column""" 

2042 if not hasattr(self.table, "cols"): 

2043 # e.g. if infer hasn't been called yet, self.table will be None. 

2044 return False 

2045 return getattr(self.table.cols, self.cname).is_indexed 

2046 

2047 def convert( 

2048 self, values: np.ndarray, nan_rep, encoding: str, errors: str 

2049 ) -> tuple[np.ndarray, np.ndarray] | tuple[Index, Index]: 

2050 """ 

2051 Convert the data from this selection to the appropriate pandas type. 

2052 """ 

2053 assert isinstance(values, np.ndarray), type(values) 

2054 

2055 # values is a recarray 

2056 if values.dtype.fields is not None: 

2057 # Copy, otherwise values will be a view 

2058            # preventing the original recarray from being freed

2059 values = values[self.cname].copy() 

2060 

2061 val_kind = _ensure_decoded(self.kind) 

2062 values = _maybe_convert(values, val_kind, encoding, errors) 

2063 

2064 kwargs = {} 

2065 kwargs["name"] = _ensure_decoded(self.index_name) 

2066 

2067 if self.freq is not None: 

2068 kwargs["freq"] = _ensure_decoded(self.freq) 

2069 

2070 factory: type[Index] | type[DatetimeIndex] = Index 

2071 if is_datetime64_dtype(values.dtype) or is_datetime64tz_dtype(values.dtype): 

2072 factory = DatetimeIndex 

2073 elif values.dtype == "i8" and "freq" in kwargs: 

2074 # PeriodIndex data is stored as i8 

2075 # error: Incompatible types in assignment (expression has type 

2076 # "Callable[[Any, KwArg(Any)], PeriodIndex]", variable has type 

2077 # "Union[Type[Index], Type[DatetimeIndex]]") 

2078 factory = lambda x, **kwds: PeriodIndex( # type: ignore[assignment] 

2079 ordinal=x, **kwds 

2080 ) 

2081 

2082 # making an Index instance could throw a number of different errors 

2083 try: 

2084 new_pd_index = factory(values, **kwargs) 

2085 except ValueError: 

2086            # if the output freq is different from what we recorded,

2087 # it should be None (see also 'doc example part 2') 

2088 if "freq" in kwargs: 

2089 kwargs["freq"] = None 

2090 new_pd_index = factory(values, **kwargs) 

2091 final_pd_index = _set_tz(new_pd_index, self.tz) 

2092 return final_pd_index, final_pd_index 

2093 

2094 def take_data(self): 

2095 """return the values""" 

2096 return self.values 

2097 

2098 @property 

2099 def attrs(self): 

2100 return self.table._v_attrs 

2101 

2102 @property 

2103 def description(self): 

2104 return self.table.description 

2105 

2106 @property 

2107 def col(self): 

2108 """return my current col description""" 

2109 return getattr(self.description, self.cname, None) 

2110 

2111 @property 

2112 def cvalues(self): 

2113 """return my cython values""" 

2114 return self.values 

2115 

2116 def __iter__(self) -> Iterator: 

2117 return iter(self.values) 

2118 

2119 def maybe_set_size(self, min_itemsize=None) -> None: 

2120 """ 

2121 maybe set a string col itemsize: 

2122        min_itemsize can be an integer or a dict with this column's name

2123 with an integer size 

2124 """ 

2125 if _ensure_decoded(self.kind) == "string": 

2126 if isinstance(min_itemsize, dict): 

2127 min_itemsize = min_itemsize.get(self.name) 

2128 

2129 if min_itemsize is not None and self.typ.itemsize < min_itemsize: 

2130 self.typ = _tables().StringCol(itemsize=min_itemsize, pos=self.pos) 

2131 

2132 def validate_names(self) -> None: 

2133 pass 

2134 

2135 def validate_and_set(self, handler: AppendableTable, append: bool) -> None: 

2136 self.table = handler.table 

2137 self.validate_col() 

2138 self.validate_attr(append) 

2139 self.validate_metadata(handler) 

2140 self.write_metadata(handler) 

2141 self.set_attr() 

2142 

2143 def validate_col(self, itemsize=None): 

2144 """validate this column: return the compared against itemsize""" 

2145 # validate this column for string truncation (or reset to the max size) 

2146 if _ensure_decoded(self.kind) == "string": 

2147 c = self.col 

2148 if c is not None: 

2149 if itemsize is None: 

2150 itemsize = self.itemsize 

2151 if c.itemsize < itemsize: 

2152 raise ValueError( 

2153 f"Trying to store a string with len [{itemsize}] in " 

2154 f"[{self.cname}] column but\nthis column has a limit of " 

2155 f"[{c.itemsize}]!\nConsider using min_itemsize to " 

2156 "preset the sizes on these columns" 

2157 ) 

2158 return c.itemsize 

2159 

2160 return None 

2161 

2162 def validate_attr(self, append: bool) -> None: 

2163 # check for backwards incompatibility 

2164 if append: 

2165 existing_kind = getattr(self.attrs, self.kind_attr, None) 

2166 if existing_kind is not None and existing_kind != self.kind: 

2167 raise TypeError( 

2168 f"incompatible kind in col [{existing_kind} - {self.kind}]" 

2169 ) 

2170 

2171 def update_info(self, info) -> None: 

2172 """ 

2173 set/update the info for this indexable with the key/value 

2174 if there is a conflict raise/warn as needed 

2175 """ 

2176 for key in self._info_fields: 

2177 value = getattr(self, key, None) 

2178 idx = info.setdefault(self.name, {}) 

2179 

2180 existing_value = idx.get(key) 

2181 if key in idx and value is not None and existing_value != value: 

2182 # frequency/name just warn 

2183 if key in ["freq", "index_name"]: 

2184 ws = attribute_conflict_doc % (key, existing_value, value) 

2185 warnings.warn( 

2186 ws, AttributeConflictWarning, stacklevel=find_stack_level() 

2187 ) 

2188 

2189 # reset 

2190 idx[key] = None 

2191 setattr(self, key, None) 

2192 

2193 else: 

2194 raise ValueError( 

2195 f"invalid info for [{self.name}] for [{key}], " 

2196 f"existing_value [{existing_value}] conflicts with " 

2197 f"new value [{value}]" 

2198 ) 

2199 else: 

2200 if value is not None or existing_value is not None: 

2201 idx[key] = value 

2202 

2203 def set_info(self, info) -> None: 

2204 """set my state from the passed info""" 

2205 idx = info.get(self.name) 

2206 if idx is not None: 

2207 self.__dict__.update(idx) 

2208 

2209 def set_attr(self) -> None: 

2210 """set the kind for this column""" 

2211 setattr(self.attrs, self.kind_attr, self.kind) 

2212 

2213 def validate_metadata(self, handler: AppendableTable) -> None: 

2214 """validate that kind=category does not change the categories""" 

2215 if self.meta == "category": 

2216 new_metadata = self.metadata 

2217 cur_metadata = handler.read_metadata(self.cname) 

2218 if ( 

2219 new_metadata is not None 

2220 and cur_metadata is not None 

2221 and not array_equivalent(new_metadata, cur_metadata) 

2222 ): 

2223 raise ValueError( 

2224 "cannot append a categorical with " 

2225 "different categories to the existing" 

2226 ) 

2227 

2228 def write_metadata(self, handler: AppendableTable) -> None: 

2229 """set the meta data""" 

2230 if self.metadata is not None: 

2231 handler.write_metadata(self.cname, self.metadata) 

2232 

2233 
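IndexCol and the DataCol subclasses below are internal descriptors: name is the pandas-facing name, cname the column name inside the PyTables table, and kind_attr (plus dtype_attr/meta_attr on DataCol) gives the attribute name under which the metadata is stored on the table node. A small sketch that pokes at the descriptor directly, shown only to illustrate the naming scheme (internal API, not something user code normally touches):

from pandas.io.pytables import IndexCol

col = IndexCol(name="index", axis=0, pos=0, kind="integer")
print(col.kind_attr)   # 'index_kind' -- attribute written onto the table node
print(col)             # name->index,cname->index,axis->0,pos->0,kind->integer
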

2234class GenericIndexCol(IndexCol): 

2235 """an index which is not represented in the data of the table""" 

2236 

2237 @property 

2238 def is_indexed(self) -> bool: 

2239 return False 

2240 

2241 def convert( 

2242 self, values: np.ndarray, nan_rep, encoding: str, errors: str 

2243 ) -> tuple[Index, Index]: 

2244 """ 

2245 Convert the data from this selection to the appropriate pandas type. 

2246 

2247 Parameters 

2248 ---------- 

2249 values : np.ndarray 

2250 nan_rep : str 

2251 encoding : str 

2252 errors : str 

2253 """ 

2254 assert isinstance(values, np.ndarray), type(values) 

2255 

2256 index = RangeIndex(len(values)) 

2257 return index, index 

2258 

2259 def set_attr(self) -> None: 

2260 pass 

2261 

2262 

2263class DataCol(IndexCol): 

2264 """ 

2265 a data holding column, by definition this is not indexable 

2266 

2267 Parameters 

2268 ---------- 

2269 data : the actual data 

2270 cname : the column name in the table to hold the data (typically 

2271 values) 

2272 meta : a string description of the metadata 

2273 metadata : the actual metadata 

2274 """ 

2275 

2276 is_an_indexable = False 

2277 is_data_indexable = False 

2278 _info_fields = ["tz", "ordered"] 

2279 

2280 def __init__( 

2281 self, 

2282 name: str, 

2283 values=None, 

2284 kind=None, 

2285 typ=None, 

2286 cname: str | None = None, 

2287 pos=None, 

2288 tz=None, 

2289 ordered=None, 

2290 table=None, 

2291 meta=None, 

2292 metadata=None, 

2293 dtype: DtypeArg | None = None, 

2294 data=None, 

2295 ) -> None: 

2296 super().__init__( 

2297 name=name, 

2298 values=values, 

2299 kind=kind, 

2300 typ=typ, 

2301 pos=pos, 

2302 cname=cname, 

2303 tz=tz, 

2304 ordered=ordered, 

2305 table=table, 

2306 meta=meta, 

2307 metadata=metadata, 

2308 ) 

2309 self.dtype = dtype 

2310 self.data = data 

2311 

2312 @property 

2313 def dtype_attr(self) -> str: 

2314 return f"{self.name}_dtype" 

2315 

2316 @property 

2317 def meta_attr(self) -> str: 

2318 return f"{self.name}_meta" 

2319 

2320 def __repr__(self) -> str: 

2321 temp = tuple( 

2322 map( 

2323 pprint_thing, (self.name, self.cname, self.dtype, self.kind, self.shape) 

2324 ) 

2325 ) 

2326 return ",".join( 

2327 [ 

2328 f"{key}->{value}" 

2329 for key, value in zip(["name", "cname", "dtype", "kind", "shape"], temp) 

2330 ] 

2331 ) 

2332 

2333 def __eq__(self, other: Any) -> bool: 

2334 """compare 2 col items""" 

2335 return all( 

2336 getattr(self, a, None) == getattr(other, a, None) 

2337 for a in ["name", "cname", "dtype", "pos"] 

2338 ) 

2339 

2340 def set_data(self, data: ArrayLike) -> None: 

2341 assert data is not None 

2342 assert self.dtype is None 

2343 

2344 data, dtype_name = _get_data_and_dtype_name(data) 

2345 

2346 self.data = data 

2347 self.dtype = dtype_name 

2348 self.kind = _dtype_to_kind(dtype_name) 

2349 

2350 def take_data(self): 

2351 """return the data""" 

2352 return self.data 

2353 

2354 @classmethod 

2355 def _get_atom(cls, values: ArrayLike) -> Col: 

2356 """ 

2357 Get an appropriately typed and shaped pytables.Col object for values. 

2358 """ 

2359 dtype = values.dtype 

2360 # error: Item "ExtensionDtype" of "Union[ExtensionDtype, dtype[Any]]" has no 

2361 # attribute "itemsize" 

2362 itemsize = dtype.itemsize # type: ignore[union-attr] 

2363 

2364 shape = values.shape 

2365 if values.ndim == 1: 

2366 # EA, use block shape pretending it is 2D 

2367 # TODO(EA2D): not necessary with 2D EAs 

2368 shape = (1, values.size) 

2369 

2370 if isinstance(values, Categorical): 

2371 codes = values.codes 

2372 atom = cls.get_atom_data(shape, kind=codes.dtype.name) 

2373 elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): 

2374 atom = cls.get_atom_datetime64(shape) 

2375 elif is_timedelta64_dtype(dtype): 

2376 atom = cls.get_atom_timedelta64(shape) 

2377 elif is_complex_dtype(dtype): 

2378 atom = _tables().ComplexCol(itemsize=itemsize, shape=shape[0]) 

2379 elif is_string_dtype(dtype): 

2380 atom = cls.get_atom_string(shape, itemsize) 

2381 else: 

2382 atom = cls.get_atom_data(shape, kind=dtype.name) 

2383 

2384 return atom 

2385 

2386 @classmethod 

2387 def get_atom_string(cls, shape, itemsize): 

2388 return _tables().StringCol(itemsize=itemsize, shape=shape[0]) 

2389 

2390 @classmethod 

2391 def get_atom_coltype(cls, kind: str) -> type[Col]: 

2392 """return the PyTables column class for this column""" 

2393 if kind.startswith("uint"): 

2394 k4 = kind[4:] 

2395 col_name = f"UInt{k4}Col" 

2396 elif kind.startswith("period"): 

2397 # we store as integer 

2398 col_name = "Int64Col" 

2399 else: 

2400 kcap = kind.capitalize() 

2401 col_name = f"{kcap}Col" 

2402 

2403 return getattr(_tables(), col_name) 

2404 

2405 @classmethod 

2406 def get_atom_data(cls, shape, kind: str) -> Col: 

2407 return cls.get_atom_coltype(kind=kind)(shape=shape[0]) 

2408 

2409 @classmethod 

2410 def get_atom_datetime64(cls, shape): 

2411 return _tables().Int64Col(shape=shape[0]) 

2412 

2413 @classmethod 

2414 def get_atom_timedelta64(cls, shape): 

2415 return _tables().Int64Col(shape=shape[0]) 

2416 

2417 @property 

2418 def shape(self): 

2419 return getattr(self.data, "shape", None) 

2420 

2421 @property 

2422 def cvalues(self): 

2423 """return my cython values""" 

2424 return self.data 

2425 

2426 def validate_attr(self, append) -> None: 

2427 """validate that we have the same order as the existing & same dtype""" 

2428 if append: 

2429 existing_fields = getattr(self.attrs, self.kind_attr, None) 

2430 if existing_fields is not None and existing_fields != list(self.values): 

2431 raise ValueError("appended items do not match existing items in table!") 

2432 

2433 existing_dtype = getattr(self.attrs, self.dtype_attr, None) 

2434 if existing_dtype is not None and existing_dtype != self.dtype: 

2435 raise ValueError( 

2436 "appended items dtype do not match existing items dtype in table!" 

2437 ) 

2438 

2439 def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): 

2440 """ 

2441 Convert the data from this selection to the appropriate pandas type. 

2442 

2443 Parameters 

2444 ---------- 

2445 values : np.ndarray 

2446 nan_rep : 

2447 encoding : str 

2448 errors : str 

2449 

2450 Returns 

2451 ------- 

2452 index : listlike to become an Index 

2453 data : ndarraylike to become a column 

2454 """ 

2455 assert isinstance(values, np.ndarray), type(values) 

2456 

2457 # values is a recarray 

2458 if values.dtype.fields is not None: 

2459 values = values[self.cname] 

2460 

2461 assert self.typ is not None 

2462 if self.dtype is None: 

2463 # Note: in tests we never have timedelta64 or datetime64, 

2464 # so the _get_data_and_dtype_name may be unnecessary 

2465 converted, dtype_name = _get_data_and_dtype_name(values) 

2466 kind = _dtype_to_kind(dtype_name) 

2467 else: 

2468 converted = values 

2469 dtype_name = self.dtype 

2470 kind = self.kind 

2471 

2472 assert isinstance(converted, np.ndarray) # for mypy 

2473 

2474 # use the meta if needed 

2475 meta = _ensure_decoded(self.meta) 

2476 metadata = self.metadata 

2477 ordered = self.ordered 

2478 tz = self.tz 

2479 

2480 assert dtype_name is not None 

2481 # convert to the correct dtype 

2482 dtype = _ensure_decoded(dtype_name) 

2483 

2484 # reverse converts 

2485 if dtype == "datetime64": 

2486 # recreate with tz if indicated 

2487 converted = _set_tz(converted, tz, coerce=True) 

2488 

2489 elif dtype == "timedelta64": 

2490 converted = np.asarray(converted, dtype="m8[ns]") 

2491 elif dtype == "date": 

2492 try: 

2493 converted = np.asarray( 

2494 [date.fromordinal(v) for v in converted], dtype=object 

2495 ) 

2496 except ValueError: 

2497 converted = np.asarray( 

2498 [date.fromtimestamp(v) for v in converted], dtype=object 

2499 ) 

2500 

2501 elif meta == "category": 

2502 # we have a categorical 

2503 categories = metadata 

2504 codes = converted.ravel() 

2505 

2506 # if we have stored a NaN in the categories 

2507 # then strip it; in theory we could have BOTH 

2508 # -1s in the codes and nulls :< 

2509 if categories is None: 

2510 # Handle case of NaN-only categorical columns in which case 

2511 # the categories are an empty array; when this is stored, 

2512 # pytables cannot write a zero-len array, so on readback 

2513 # the categories would be None and `read_hdf()` would fail. 

2514 categories = Index([], dtype=np.float64) 

2515 else: 

2516 mask = isna(categories) 

2517 if mask.any(): 

2518 categories = categories[~mask] 

2519 codes[codes != -1] -= mask.astype(int).cumsum()._values 

2520 

2521 converted = Categorical.from_codes( 

2522 codes, categories=categories, ordered=ordered 

2523 ) 

2524 

2525 else: 

2526 try: 

2527 converted = converted.astype(dtype, copy=False) 

2528 except TypeError: 

2529 converted = converted.astype("O", copy=False) 

2530 

2531 # convert nans / decode 

2532 if _ensure_decoded(kind) == "string": 

2533 converted = _unconvert_string_array( 

2534 converted, nan_rep=nan_rep, encoding=encoding, errors=errors 

2535 ) 

2536 

2537 return self.values, converted 

2538 

2539 def set_attr(self) -> None: 

2540 """set the data for this column""" 

2541 setattr(self.attrs, self.kind_attr, self.values) 

2542 setattr(self.attrs, self.meta_attr, self.meta) 

2543 assert self.dtype is not None 

2544 setattr(self.attrs, self.dtype_attr, self.dtype) 

2545 

2546 
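get_atom_coltype above turns a dtype/kind string into the name of a PyTables column class: unsigned kinds map to UInt{n}Col, periods are stored as Int64Col ordinals, and everything else is simply capitalized. A pure-string mirror of that mapping, runnable without PyTables installed (the helper name is illustrative):

def coltype_name(kind):
    # mirror DataCol.get_atom_coltype's name mangling
    if kind.startswith("uint"):
        return f"UInt{kind[4:]}Col"
    if kind.startswith("period"):
        return "Int64Col"          # periods are stored as int64 ordinals
    return f"{kind.capitalize()}Col"

print(coltype_name("float64"))     # Float64Col
print(coltype_name("uint8"))       # UInt8Col
print(coltype_name("period[M]"))   # Int64Col
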

2547class DataIndexableCol(DataCol): 

2548 """represent a data column that can be indexed""" 

2549 

2550 is_data_indexable = True 

2551 

2552 def validate_names(self) -> None: 

2553 if not is_object_dtype(Index(self.values)): 

2554 # TODO: should the message here be more specifically non-str? 

2555 raise ValueError("cannot have non-object label DataIndexableCol") 

2556 

2557 @classmethod 

2558 def get_atom_string(cls, shape, itemsize): 

2559 return _tables().StringCol(itemsize=itemsize) 

2560 

2561 @classmethod 

2562 def get_atom_data(cls, shape, kind: str) -> Col: 

2563 return cls.get_atom_coltype(kind=kind)() 

2564 

2565 @classmethod 

2566 def get_atom_datetime64(cls, shape): 

2567 return _tables().Int64Col() 

2568 

2569 @classmethod 

2570 def get_atom_timedelta64(cls, shape): 

2571 return _tables().Int64Col() 

2572 

2573 

2574class GenericDataIndexableCol(DataIndexableCol): 

2575 """represent a generic pytables data column""" 

2576 

2577 
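DataIndexableCol is what a column becomes when it is listed in data_columns: it is written as its own column in the table and can therefore appear in where expressions and be indexed. A hedged usage sketch (file name is illustrative):

import pandas as pd

df = pd.DataFrame({"x": range(10), "y": list("aabbccddee")})
with pd.HDFStore("cols.h5", mode="w") as store:
    store.append("df", df, data_columns=["y"])      # 'y' becomes a DataIndexableCol
    print(store.select("df", where="y == 'c'"))     # queryable because it is a data column
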

2578class Fixed: 

2579 """ 

2580 represent an object in my store 

2581 facilitate read/write of various types of objects 

2582 this is an abstract base class 

2583 

2584 Parameters 

2585 ---------- 

2586 parent : HDFStore 

2587 group : Node 

2588 The group node where the table resides. 

2589 """ 

2590 

2591 pandas_kind: str 

2592 format_type: str = "fixed" # GH#30962 needed by dask 

2593 obj_type: type[DataFrame | Series] 

2594 ndim: int 

2595 parent: HDFStore 

2596 is_table: bool = False 

2597 

2598 def __init__( 

2599 self, 

2600 parent: HDFStore, 

2601 group: Node, 

2602 encoding: str | None = "UTF-8", 

2603 errors: str = "strict", 

2604 ) -> None: 

2605 assert isinstance(parent, HDFStore), type(parent) 

2606 assert _table_mod is not None # needed for mypy 

2607 assert isinstance(group, _table_mod.Node), type(group) 

2608 self.parent = parent 

2609 self.group = group 

2610 self.encoding = _ensure_encoding(encoding) 

2611 self.errors = errors 

2612 

2613 @property 

2614 def is_old_version(self) -> bool: 

2615 return self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1 

2616 

2617 @property 

2618 def version(self) -> tuple[int, int, int]: 

2619 """compute and set our version""" 

2620 version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None)) 

2621 try: 

2622 version = tuple(int(x) for x in version.split(".")) 

2623 if len(version) == 2: 

2624 version = version + (0,) 

2625 except AttributeError: 

2626 version = (0, 0, 0) 

2627 return version 

2628 

2629 @property 

2630 def pandas_type(self): 

2631 return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None)) 

2632 

2633 def __repr__(self) -> str: 

2634 """return a pretty representation of myself""" 

2635 self.infer_axes() 

2636 s = self.shape 

2637 if s is not None: 

2638 if isinstance(s, (list, tuple)): 

2639 jshape = ",".join([pprint_thing(x) for x in s]) 

2640 s = f"[{jshape}]" 

2641 return f"{self.pandas_type:12.12} (shape->{s})" 

2642 return self.pandas_type 

2643 

2644 def set_object_info(self) -> None: 

2645 """set my pandas type & version""" 

2646 self.attrs.pandas_type = str(self.pandas_kind) 

2647 self.attrs.pandas_version = str(_version) 

2648 

2649 def copy(self) -> Fixed: 

2650 new_self = copy.copy(self) 

2651 return new_self 

2652 

2653 @property 

2654 def shape(self): 

2655 return self.nrows 

2656 

2657 @property 

2658 def pathname(self): 

2659 return self.group._v_pathname 

2660 

2661 @property 

2662 def _handle(self): 

2663 return self.parent._handle 

2664 

2665 @property 

2666 def _filters(self): 

2667 return self.parent._filters 

2668 

2669 @property 

2670 def _complevel(self) -> int: 

2671 return self.parent._complevel 

2672 

2673 @property 

2674 def _fletcher32(self) -> bool: 

2675 return self.parent._fletcher32 

2676 

2677 @property 

2678 def attrs(self): 

2679 return self.group._v_attrs 

2680 

2681 def set_attrs(self) -> None: 

2682 """set our object attributes""" 

2683 

2684 def get_attrs(self) -> None: 

2685 """get our object attributes""" 

2686 

2687 @property 

2688 def storable(self): 

2689 """return my storable""" 

2690 return self.group 

2691 

2692 @property 

2693 def is_exists(self) -> bool: 

2694 return False 

2695 

2696 @property 

2697 def nrows(self): 

2698 return getattr(self.storable, "nrows", None) 

2699 

2700 def validate(self, other) -> Literal[True] | None: 

2701 """validate against an existing storable""" 

2702 if other is None: 

2703 return None 

2704 return True 

2705 

2706 def validate_version(self, where=None) -> None: 

2707 """are we trying to operate on an old version?""" 

2708 

2709 def infer_axes(self) -> bool: 

2710 """ 

2711 infer the axes of my storer 

2712 return a boolean indicating if we have a valid storer or not 

2713 """ 

2714 s = self.storable 

2715 if s is None: 

2716 return False 

2717 self.get_attrs() 

2718 return True 

2719 

2720 def read( 

2721 self, 

2722 where=None, 

2723 columns=None, 

2724 start: int | None = None, 

2725 stop: int | None = None, 

2726 ): 

2727 raise NotImplementedError( 

2728 "cannot read on an abstract storer: subclasses should implement" 

2729 ) 

2730 

2731 def write(self, **kwargs): 

2732 raise NotImplementedError( 

2733 "cannot write on an abstract storer: subclasses should implement" 

2734 ) 

2735 

2736 def delete( 

2737 self, where=None, start: int | None = None, stop: int | None = None 

2738 ) -> None: 

2739 """ 

2740 support fully deleting the node in its entirety (only) - where 

2741 specification must be None 

2742 """ 

2743 if com.all_none(where, start, stop): 

2744 self._handle.remove_node(self.group, recursive=True) 

2745 return None 

2746 

2747 raise TypeError("cannot delete on an abstract storer") 

2748 

2749 
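Fixed.version tolerates both two- and three-part version strings and falls back to (0, 0, 0) when the pandas_version attribute is missing; is_old_version then flags stores written before 0.10.1. A standalone mirror of that parsing logic, assuming the same inputs the property sees:

def parse_version(raw):
    # mirror Fixed.version: "0.15.2" -> (0, 15, 2); missing attribute -> (0, 0, 0)
    try:
        parts = tuple(int(x) for x in raw.split("."))
        if len(parts) == 2:
            parts = parts + (0,)
        return parts
    except AttributeError:
        return (0, 0, 0)

print(parse_version("0.15.2"))   # (0, 15, 2)
print(parse_version("0.10"))     # (0, 10, 0)
print(parse_version(None))       # (0, 0, 0) -- attribute absent on very old files
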

2750class GenericFixed(Fixed): 

2751 """a generified fixed version""" 

2752 

2753 _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"} 

2754 _reverse_index_map = {v: k for k, v in _index_type_map.items()} 

2755 attributes: list[str] = [] 

2756 

2757 # indexer helpers 

2758 def _class_to_alias(self, cls) -> str: 

2759 return self._index_type_map.get(cls, "") 

2760 

2761 def _alias_to_class(self, alias): 

2762 if isinstance(alias, type): # pragma: no cover 

2763 # compat: for a short period of time master stored types 

2764 return alias 

2765 return self._reverse_index_map.get(alias, Index) 

2766 

2767 def _get_index_factory(self, attrs): 

2768 index_class = self._alias_to_class( 

2769 _ensure_decoded(getattr(attrs, "index_class", "")) 

2770 ) 

2771 

2772 factory: Callable 

2773 

2774 if index_class == DatetimeIndex: 

2775 

2776 def f(values, freq=None, tz=None): 

2777 # data are already in UTC, localize and convert if tz present 

2778 dta = DatetimeArray._simple_new(values.values, freq=freq) 

2779 result = DatetimeIndex._simple_new(dta, name=None) 

2780 if tz is not None: 

2781 result = result.tz_localize("UTC").tz_convert(tz) 

2782 return result 

2783 

2784 factory = f 

2785 elif index_class == PeriodIndex: 

2786 

2787 def f(values, freq=None, tz=None): 

2788 parr = PeriodArray._simple_new(values, freq=freq) 

2789 return PeriodIndex._simple_new(parr, name=None) 

2790 

2791 factory = f 

2792 else: 

2793 factory = index_class 

2794 

2795 kwargs = {} 

2796 if "freq" in attrs: 

2797 kwargs["freq"] = attrs["freq"] 

2798 if index_class is Index: 

2799 # DTI/PI would be gotten by _alias_to_class 

2800 factory = TimedeltaIndex 

2801 

2802 if "tz" in attrs: 

2803 if isinstance(attrs["tz"], bytes): 

2804 # created by python2 

2805 kwargs["tz"] = attrs["tz"].decode("utf-8") 

2806 else: 

2807 # created by python3 

2808 kwargs["tz"] = attrs["tz"] 

2809 assert index_class is DatetimeIndex # just checking 

2810 

2811 return factory, kwargs 

2812 

2813 def validate_read(self, columns, where) -> None: 

2814 """ 

2815 raise if any keywords are passed which are not-None 

2816 """ 

2817 if columns is not None: 

2818 raise TypeError( 

2819 "cannot pass a column specification when reading " 

2820 "a Fixed format store. this store must be selected in its entirety" 

2821 ) 

2822 if where is not None: 

2823 raise TypeError( 

2824 "cannot pass a where specification when reading " 

2825 "from a Fixed format store. this store must be selected in its entirety" 

2826 ) 

2827 

2828 @property 

2829 def is_exists(self) -> bool: 

2830 return True 

2831 

2832 def set_attrs(self) -> None: 

2833 """set our object attributes""" 

2834 self.attrs.encoding = self.encoding 

2835 self.attrs.errors = self.errors 

2836 

2837 def get_attrs(self) -> None: 

2838 """retrieve our attributes""" 

2839 self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None)) 

2840 self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict")) 

2841 for n in self.attributes: 

2842 setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None))) 

2843 

2844 # error: Signature of "write" incompatible with supertype "Fixed" 

2845 def write(self, obj, **kwargs) -> None: # type: ignore[override] 

2846 self.set_attrs() 

2847 

2848 def read_array(self, key: str, start: int | None = None, stop: int | None = None): 

2849        """read an array for the specified node (off of group)"""

2850 import tables 

2851 

2852 node = getattr(self.group, key) 

2853 attrs = node._v_attrs 

2854 

2855 transposed = getattr(attrs, "transposed", False) 

2856 

2857 if isinstance(node, tables.VLArray): 

2858 ret = node[0][start:stop] 

2859 else: 

2860 dtype = _ensure_decoded(getattr(attrs, "value_type", None)) 

2861 shape = getattr(attrs, "shape", None) 

2862 

2863 if shape is not None: 

2864 # length 0 axis 

2865 ret = np.empty(shape, dtype=dtype) 

2866 else: 

2867 ret = node[start:stop] 

2868 

2869 if dtype == "datetime64": 

2870 # reconstruct a timezone if indicated 

2871 tz = getattr(attrs, "tz", None) 

2872 ret = _set_tz(ret, tz, coerce=True) 

2873 

2874 elif dtype == "timedelta64": 

2875 ret = np.asarray(ret, dtype="m8[ns]") 

2876 

2877 if transposed: 

2878 return ret.T 

2879 else: 

2880 return ret 

2881 

2882 def read_index( 

2883 self, key: str, start: int | None = None, stop: int | None = None 

2884 ) -> Index: 

2885 variety = _ensure_decoded(getattr(self.attrs, f"{key}_variety")) 

2886 

2887 if variety == "multi": 

2888 return self.read_multi_index(key, start=start, stop=stop) 

2889 elif variety == "regular": 

2890 node = getattr(self.group, key) 

2891 index = self.read_index_node(node, start=start, stop=stop) 

2892 return index 

2893 else: # pragma: no cover 

2894 raise TypeError(f"unrecognized index variety: {variety}") 

2895 

2896 def write_index(self, key: str, index: Index) -> None: 

2897 if isinstance(index, MultiIndex): 

2898 setattr(self.attrs, f"{key}_variety", "multi") 

2899 self.write_multi_index(key, index) 

2900 else: 

2901 setattr(self.attrs, f"{key}_variety", "regular") 

2902 converted = _convert_index("index", index, self.encoding, self.errors) 

2903 

2904 self.write_array(key, converted.values) 

2905 

2906 node = getattr(self.group, key) 

2907 node._v_attrs.kind = converted.kind 

2908 node._v_attrs.name = index.name 

2909 

2910 if isinstance(index, (DatetimeIndex, PeriodIndex)): 

2911 node._v_attrs.index_class = self._class_to_alias(type(index)) 

2912 

2913 if isinstance(index, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): 

2914 node._v_attrs.freq = index.freq 

2915 

2916 if isinstance(index, DatetimeIndex) and index.tz is not None: 

2917 node._v_attrs.tz = _get_tz(index.tz) 

2918 

2919 def write_multi_index(self, key: str, index: MultiIndex) -> None: 

2920 setattr(self.attrs, f"{key}_nlevels", index.nlevels) 

2921 

2922 for i, (lev, level_codes, name) in enumerate( 

2923 zip(index.levels, index.codes, index.names) 

2924 ): 

2925 # write the level 

2926 if is_extension_array_dtype(lev): 

2927 raise NotImplementedError( 

2928 "Saving a MultiIndex with an extension dtype is not supported." 

2929 ) 

2930 level_key = f"{key}_level{i}" 

2931 conv_level = _convert_index(level_key, lev, self.encoding, self.errors) 

2932 self.write_array(level_key, conv_level.values) 

2933 node = getattr(self.group, level_key) 

2934 node._v_attrs.kind = conv_level.kind 

2935 node._v_attrs.name = name 

2936 

2937 # write the name 

2938 setattr(node._v_attrs, f"{key}_name{name}", name) 

2939 

2940 # write the labels 

2941 label_key = f"{key}_label{i}" 

2942 self.write_array(label_key, level_codes) 

2943 

2944 def read_multi_index( 

2945 self, key: str, start: int | None = None, stop: int | None = None 

2946 ) -> MultiIndex: 

2947 nlevels = getattr(self.attrs, f"{key}_nlevels") 

2948 

2949 levels = [] 

2950 codes = [] 

2951 names: list[Hashable] = [] 

2952 for i in range(nlevels): 

2953 level_key = f"{key}_level{i}" 

2954 node = getattr(self.group, level_key) 

2955 lev = self.read_index_node(node, start=start, stop=stop) 

2956 levels.append(lev) 

2957 names.append(lev.name) 

2958 

2959 label_key = f"{key}_label{i}" 

2960 level_codes = self.read_array(label_key, start=start, stop=stop) 

2961 codes.append(level_codes) 

2962 

2963 return MultiIndex( 

2964 levels=levels, codes=codes, names=names, verify_integrity=True 

2965 ) 

2966 

2967 def read_index_node( 

2968 self, node: Node, start: int | None = None, stop: int | None = None 

2969 ) -> Index: 

2970 data = node[start:stop] 

2971 # If the index was an empty array write_array_empty() will 

2972 # have written a sentinel. Here we replace it with the original. 

2973 if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0: 

2974 data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type) 

2975 kind = _ensure_decoded(node._v_attrs.kind) 

2976 name = None 

2977 

2978 if "name" in node._v_attrs: 

2979 name = _ensure_str(node._v_attrs.name) 

2980 name = _ensure_decoded(name) 

2981 

2982 attrs = node._v_attrs 

2983 factory, kwargs = self._get_index_factory(attrs) 

2984 

2985 if kind in ("date", "object"): 

2986 index = factory( 

2987 _unconvert_index( 

2988 data, kind, encoding=self.encoding, errors=self.errors 

2989 ), 

2990 dtype=object, 

2991 **kwargs, 

2992 ) 

2993 else: 

2994 index = factory( 

2995 _unconvert_index( 

2996 data, kind, encoding=self.encoding, errors=self.errors 

2997 ), 

2998 **kwargs, 

2999 ) 

3000 

3001 index.name = name 

3002 

3003 return index 

3004 

3005 def write_array_empty(self, key: str, value: ArrayLike) -> None: 

3006 """write a 0-len array""" 

3007 # ugly hack for length 0 axes 

3008 arr = np.empty((1,) * value.ndim) 

3009 self._handle.create_array(self.group, key, arr) 

3010 node = getattr(self.group, key) 

3011 node._v_attrs.value_type = str(value.dtype) 

3012 node._v_attrs.shape = value.shape 

3013 

3014 def write_array( 

3015 self, key: str, obj: AnyArrayLike, items: Index | None = None 

3016 ) -> None: 

3017 # TODO: we only have a few tests that get here, the only EA 

3018 # that gets passed is DatetimeArray, and we never have 

3019 # both self._filters and EA 

3020 

3021 value = extract_array(obj, extract_numpy=True) 

3022 

3023 if key in self.group: 

3024 self._handle.remove_node(self.group, key) 

3025 

3026 # Transform needed to interface with pytables row/col notation 

3027 empty_array = value.size == 0 

3028 transposed = False 

3029 

3030 if is_categorical_dtype(value.dtype): 

3031 raise NotImplementedError( 

3032 "Cannot store a category dtype in a HDF5 dataset that uses format=" 

3033 '"fixed". Use format="table".' 

3034 ) 

3035 if not empty_array: 

3036 if hasattr(value, "T"): 

3037 # ExtensionArrays (1d) may not have transpose. 

3038 value = value.T 

3039 transposed = True 

3040 

3041 atom = None 

3042 if self._filters is not None: 

3043 with suppress(ValueError): 

3044 # get the atom for this datatype 

3045 atom = _tables().Atom.from_dtype(value.dtype) 

3046 

3047 if atom is not None: 

3048 # We only get here if self._filters is non-None and 

3049 # the Atom.from_dtype call succeeded 

3050 

3051 # create an empty chunked array and fill it from value 

3052 if not empty_array: 

3053 ca = self._handle.create_carray( 

3054 self.group, key, atom, value.shape, filters=self._filters 

3055 ) 

3056 ca[:] = value 

3057 

3058 else: 

3059 self.write_array_empty(key, value) 

3060 

3061 elif value.dtype.type == np.object_: 

3062 # infer the type, warn if we have a non-string type here (for 

3063 # performance) 

3064 inferred_type = lib.infer_dtype(value, skipna=False) 

3065 if empty_array: 

3066 pass 

3067 elif inferred_type == "string": 

3068 pass 

3069 else: 

3070 ws = performance_doc % (inferred_type, key, items) 

3071 warnings.warn(ws, PerformanceWarning, stacklevel=find_stack_level()) 

3072 

3073 vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom()) 

3074 vlarr.append(value) 

3075 

3076 elif is_datetime64_dtype(value.dtype): 

3077 self._handle.create_array(self.group, key, value.view("i8")) 

3078 getattr(self.group, key)._v_attrs.value_type = "datetime64" 

3079 elif is_datetime64tz_dtype(value.dtype): 

3080 # store as UTC 

3081 # with a zone 

3082 

3083 # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no 

3084 # attribute "asi8" 

3085 self._handle.create_array( 

3086 self.group, key, value.asi8 # type: ignore[union-attr] 

3087 ) 

3088 

3089 node = getattr(self.group, key) 

3090 # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no 

3091 # attribute "tz" 

3092 node._v_attrs.tz = _get_tz(value.tz) # type: ignore[union-attr] 

3093 node._v_attrs.value_type = "datetime64" 

3094 elif is_timedelta64_dtype(value.dtype): 

3095 self._handle.create_array(self.group, key, value.view("i8")) 

3096 getattr(self.group, key)._v_attrs.value_type = "timedelta64" 

3097 elif empty_array: 

3098 self.write_array_empty(key, value) 

3099 else: 

3100 self._handle.create_array(self.group, key, value) 

3101 

3102 getattr(self.group, key)._v_attrs.transposed = transposed 

3103 

3104 
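GenericFixed and its subclasses implement the format="fixed" path: the index and the value arrays are written wholesale, and validate_read above rejects any where/columns selection on the way back in. A hedged round-trip sketch (file name is illustrative):

import pandas as pd

s = pd.Series([1.0, 2.0, 3.0], index=pd.date_range("2024-01-01", periods=3), name="vals")
s.to_hdf("fixed.h5", key="s", mode="w", format="fixed")   # handled by SeriesFixed below
back = pd.read_hdf("fixed.h5", "s")
print(back.equals(s))                                      # True
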

3105class SeriesFixed(GenericFixed): 

3106 pandas_kind = "series" 

3107 attributes = ["name"] 

3108 

3109 name: Hashable 

3110 

3111 @property 

3112 def shape(self): 

3113 try: 

3114 return (len(self.group.values),) 

3115 except (TypeError, AttributeError): 

3116 return None 

3117 

3118 def read( 

3119 self, 

3120 where=None, 

3121 columns=None, 

3122 start: int | None = None, 

3123 stop: int | None = None, 

3124 ) -> Series: 

3125 self.validate_read(columns, where) 

3126 index = self.read_index("index", start=start, stop=stop) 

3127 values = self.read_array("values", start=start, stop=stop) 

3128 return Series(values, index=index, name=self.name, copy=False) 

3129 

3130 # error: Signature of "write" incompatible with supertype "Fixed" 

3131 def write(self, obj, **kwargs) -> None: # type: ignore[override] 

3132 super().write(obj, **kwargs) 

3133 self.write_index("index", obj.index) 

3134 self.write_array("values", obj) 

3135 self.attrs.name = obj.name 

3136 

3137 

3138class BlockManagerFixed(GenericFixed): 

3139 attributes = ["ndim", "nblocks"] 

3140 

3141 nblocks: int 

3142 

3143 @property 

3144 def shape(self) -> Shape | None: 

3145 try: 

3146 ndim = self.ndim 

3147 

3148 # items 

3149 items = 0 

3150 for i in range(self.nblocks): 

3151 node = getattr(self.group, f"block{i}_items") 

3152 shape = getattr(node, "shape", None) 

3153 if shape is not None: 

3154 items += shape[0] 

3155 

3156 # data shape 

3157 node = self.group.block0_values 

3158 shape = getattr(node, "shape", None) 

3159 if shape is not None: 

3160 shape = list(shape[0 : (ndim - 1)]) 

3161 else: 

3162 shape = [] 

3163 

3164 shape.append(items) 

3165 

3166 return shape 

3167 except AttributeError: 

3168 return None 

3169 

3170 def read( 

3171 self, 

3172 where=None, 

3173 columns=None, 

3174 start: int | None = None, 

3175 stop: int | None = None, 

3176 ) -> DataFrame: 

3177 # start, stop applied to rows, so 0th axis only 

3178 self.validate_read(columns, where) 

3179 select_axis = self.obj_type()._get_block_manager_axis(0) 

3180 

3181 axes = [] 

3182 for i in range(self.ndim): 

3183 _start, _stop = (start, stop) if i == select_axis else (None, None) 

3184 ax = self.read_index(f"axis{i}", start=_start, stop=_stop) 

3185 axes.append(ax) 

3186 

3187 items = axes[0] 

3188 dfs = [] 

3189 

3190 for i in range(self.nblocks): 

3191 blk_items = self.read_index(f"block{i}_items") 

3192 values = self.read_array(f"block{i}_values", start=_start, stop=_stop) 

3193 

3194 columns = items[items.get_indexer(blk_items)] 

3195 df = DataFrame(values.T, columns=columns, index=axes[1], copy=False) 

3196 dfs.append(df) 

3197 

3198 if len(dfs) > 0: 

3199 out = concat(dfs, axis=1, copy=True) 

3200 out = out.reindex(columns=items, copy=False) 

3201 return out 

3202 

3203 return DataFrame(columns=axes[0], index=axes[1]) 

3204 

3205 # error: Signature of "write" incompatible with supertype "Fixed" 

3206 def write(self, obj, **kwargs) -> None: # type: ignore[override] 

3207 super().write(obj, **kwargs) 

3208 

3209 # TODO(ArrayManager) HDFStore relies on accessing the blocks 

3210 if isinstance(obj._mgr, ArrayManager): 

3211 obj = obj._as_manager("block") 

3212 

3213 data = obj._mgr 

3214 if not data.is_consolidated(): 

3215 data = data.consolidate() 

3216 

3217 self.attrs.ndim = data.ndim 

3218 for i, ax in enumerate(data.axes): 

3219 if i == 0 and (not ax.is_unique): 

3220 raise ValueError("Columns index has to be unique for fixed format") 

3221 self.write_index(f"axis{i}", ax) 

3222 

3223 # Supporting mixed-type DataFrame objects...nontrivial 

3224 self.attrs.nblocks = len(data.blocks) 

3225 for i, blk in enumerate(data.blocks): 

3226 # I have no idea why, but writing values before items fixed #2299 

3227 blk_items = data.items.take(blk.mgr_locs) 

3228 self.write_array(f"block{i}_values", blk.values, items=blk_items) 

3229 self.write_index(f"block{i}_items", blk_items) 

3230 

3231 

3232class FrameFixed(BlockManagerFixed): 

3233 pandas_kind = "frame" 

3234 obj_type = DataFrame 

3235 

3236 
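BlockManagerFixed writes each consolidated block separately and reassembles the frame on read by locating each block's items within the columns axis; that lookup is why the write method above insists the columns index be unique for the fixed format. A small sketch of the failure mode (illustrative file name):

import pandas as pd

df = pd.DataFrame([[1, 2]], columns=["a", "a"])             # duplicate column labels
try:
    df.to_hdf("dupes.h5", key="df", mode="w", format="fixed")
except ValueError as err:
    print(err)      # Columns index has to be unique for fixed format
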

3237class Table(Fixed): 

3238 """ 

3239 represent a table: 

3240 facilitate read/write of various types of tables 

3241 

3242 Attrs in Table Node 

3243 ------------------- 

3244    These are attributes that are stored in the main table node; they are

3245 necessary to recreate these tables when read back in. 

3246 

3247 index_axes : a list of tuples of the (original indexing axis and 

3248 index column) 

3249 non_index_axes: a list of tuples of the (original index axis and 

3250 columns on a non-indexing axis) 

3251 values_axes : a list of the columns which comprise the data of this 

3252 table 

3253 data_columns : a list of the columns that we are allowing indexing 

3254 (these become single columns in values_axes) 

3255 nan_rep : the string to use for nan representations for string 

3256 objects 

3257 levels : the names of levels 

3258 metadata : the names of the metadata columns 

3259 """ 

3260 

3261 pandas_kind = "wide_table" 

3262 format_type: str = "table" # GH#30962 needed by dask 

3263 table_type: str 

3264 levels: int | list[Hashable] = 1 

3265 is_table = True 

3266 

3267 metadata: list 

3268 

3269 def __init__( 

3270 self, 

3271 parent: HDFStore, 

3272 group: Node, 

3273 encoding: str | None = None, 

3274 errors: str = "strict", 

3275 index_axes: list[IndexCol] | None = None, 

3276 non_index_axes: list[tuple[AxisInt, Any]] | None = None, 

3277 values_axes: list[DataCol] | None = None, 

3278 data_columns: list | None = None, 

3279 info: dict | None = None, 

3280 nan_rep=None, 

3281 ) -> None: 

3282 super().__init__(parent, group, encoding=encoding, errors=errors) 

3283 self.index_axes = index_axes or [] 

3284 self.non_index_axes = non_index_axes or [] 

3285 self.values_axes = values_axes or [] 

3286 self.data_columns = data_columns or [] 

3287 self.info = info or {} 

3288 self.nan_rep = nan_rep 

3289 

3290 @property 

3291 def table_type_short(self) -> str: 

3292 return self.table_type.split("_")[0] 

3293 

3294 def __repr__(self) -> str: 

3295 """return a pretty representation of myself""" 

3296 self.infer_axes() 

3297 jdc = ",".join(self.data_columns) if len(self.data_columns) else "" 

3298 dc = f",dc->[{jdc}]" 

3299 

3300 ver = "" 

3301 if self.is_old_version: 

3302 jver = ".".join([str(x) for x in self.version]) 

3303 ver = f"[{jver}]" 

3304 

3305 jindex_axes = ",".join([a.name for a in self.index_axes]) 

3306 return ( 

3307 f"{self.pandas_type:12.12}{ver} " 

3308 f"(typ->{self.table_type_short},nrows->{self.nrows}," 

3309 f"ncols->{self.ncols},indexers->[{jindex_axes}]{dc})" 

3310 ) 

3311 

3312 def __getitem__(self, c: str): 

3313 """return the axis for c""" 

3314 for a in self.axes: 

3315 if c == a.name: 

3316 return a 

3317 return None 

3318 

3319 def validate(self, other) -> None: 

3320 """validate against an existing table""" 

3321 if other is None: 

3322 return 

3323 

3324 if other.table_type != self.table_type: 

3325 raise TypeError( 

3326 "incompatible table_type with existing " 

3327 f"[{other.table_type} - {self.table_type}]" 

3328 ) 

3329 

3330 for c in ["index_axes", "non_index_axes", "values_axes"]: 

3331 sv = getattr(self, c, None) 

3332 ov = getattr(other, c, None) 

3333 if sv != ov: 

3334 # show the error for the specific axes 

3335 # Argument 1 to "enumerate" has incompatible type 

3336 # "Optional[Any]"; expected "Iterable[Any]" [arg-type] 

3337 for i, sax in enumerate(sv): # type: ignore[arg-type] 

3338 # Value of type "Optional[Any]" is not indexable [index] 

3339 oax = ov[i] # type: ignore[index] 

3340 if sax != oax: 

3341 raise ValueError( 

3342 f"invalid combination of [{c}] on appending data " 

3343 f"[{sax}] vs current table [{oax}]" 

3344 ) 

3345 

3346 # should never get here 

3347 raise Exception( 

3348 f"invalid combination of [{c}] on appending data [{sv}] vs " 

3349 f"current table [{ov}]" 

3350 ) 

3351 

3352 @property 

3353 def is_multi_index(self) -> bool: 

3354 """the levels attribute is 1 or a list in the case of a multi-index""" 

3355 return isinstance(self.levels, list) 

3356 

3357 def validate_multiindex( 

3358 self, obj: DataFrame | Series 

3359 ) -> tuple[DataFrame, list[Hashable]]: 

3360 """ 

3361 validate that we can store the multi-index; reset and return the 

3362 new object 

3363 """ 

3364 levels = com.fill_missing_names(obj.index.names) 

3365 try: 

3366 reset_obj = obj.reset_index() 

3367 except ValueError as err: 

3368 raise ValueError( 

3369 "duplicate names/columns in the multi-index when storing as a table" 

3370 ) from err 

3371 assert isinstance(reset_obj, DataFrame) # for mypy 

3372 return reset_obj, levels 

3373 
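validate_multiindex resets the index so the levels become ordinary columns, recording the (possibly filled-in) level names so the MultiIndex can be rebuilt on read. A hedged round-trip sketch in table format:

import pandas as pd

idx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=["grp", "n"])
df = pd.DataFrame({"val": [1.0, 2.0, 3.0, 4.0]}, index=idx)
df.to_hdf("mi.h5", key="df", mode="w", format="table")   # levels are reset into columns on write
back = pd.read_hdf("mi.h5", "df")
print(back.index.names)                                   # ['grp', 'n'] -- MultiIndex restored
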

3374 @property 

3375 def nrows_expected(self) -> int: 

3376 """based on our axes, compute the expected nrows""" 

3377 return np.prod([i.cvalues.shape[0] for i in self.index_axes]) 

3378 

3379 @property 

3380 def is_exists(self) -> bool: 

3381 """has this table been created""" 

3382 return "table" in self.group 

3383 

3384 @property 

3385 def storable(self): 

3386 return getattr(self.group, "table", None) 

3387 

3388 @property 

3389 def table(self): 

3390 """return the table group (this is my storable)""" 

3391 return self.storable 

3392 

3393 @property 

3394 def dtype(self): 

3395 return self.table.dtype 

3396 

3397 @property 

3398 def description(self): 

3399 return self.table.description 

3400 

3401 @property 

3402 def axes(self): 

3403 return itertools.chain(self.index_axes, self.values_axes) 

3404 

3405 @property 

3406 def ncols(self) -> int: 

3407 """the number of total columns in the values axes""" 

3408 return sum(len(a.values) for a in self.values_axes) 

3409 

3410 @property 

3411 def is_transposed(self) -> bool: 

3412 return False 

3413 

3414 @property 

3415 def data_orientation(self) -> tuple[int, ...]: 

3416        """return a tuple of my permuted axes, non_indexable at the front"""

3417 return tuple( 

3418 itertools.chain( 

3419 [int(a[0]) for a in self.non_index_axes], 

3420 [int(a.axis) for a in self.index_axes], 

3421 ) 

3422 ) 

3423 

3424 def queryables(self) -> dict[str, Any]: 

3425 """return a dict of the kinds allowable columns for this object""" 

3426 # mypy doesn't recognize DataFrame._AXIS_NAMES, so we re-write it here 

3427 axis_names = {0: "index", 1: "columns"} 

3428 

3429 # compute the values_axes queryables 

3430 d1 = [(a.cname, a) for a in self.index_axes] 

3431 d2 = [(axis_names[axis], None) for axis, values in self.non_index_axes] 

3432 d3 = [ 

3433 (v.cname, v) for v in self.values_axes if v.name in set(self.data_columns) 

3434 ] 

3435 

3436 return dict(d1 + d2 + d3) 

3437 

3438 def index_cols(self): 

3439 """return a list of my index cols""" 

3440 # Note: each `i.cname` below is assured to be a str. 

3441 return [(i.axis, i.cname) for i in self.index_axes] 

3442 

3443 def values_cols(self) -> list[str]: 

3444 """return a list of my values cols""" 

3445 return [i.cname for i in self.values_axes] 

3446 

3447 def _get_metadata_path(self, key: str) -> str: 

3448 """return the metadata pathname for this key""" 

3449 group = self.group._v_pathname 

3450 return f"{group}/meta/{key}/meta" 

3451 

3452 def write_metadata(self, key: str, values: np.ndarray) -> None: 

3453 """ 

3454 Write out a metadata array to the key as a fixed-format Series. 

3455 

3456 Parameters 

3457 ---------- 

3458 key : str 

3459 values : ndarray 

3460 """ 

3461 self.parent.put( 

3462 self._get_metadata_path(key), 

3463 Series(values, copy=False), 

3464 format="table", 

3465 encoding=self.encoding, 

3466 errors=self.errors, 

3467 nan_rep=self.nan_rep, 

3468 ) 

3469 

3470 def read_metadata(self, key: str): 

3471 """return the meta data array for this key""" 

3472 if getattr(getattr(self.group, "meta", None), key, None) is not None: 

3473 return self.parent.select(self._get_metadata_path(key)) 

3474 return None 

3475 

3476 def set_attrs(self) -> None: 

3477 """set our table type & indexables""" 

3478 self.attrs.table_type = str(self.table_type) 

3479 self.attrs.index_cols = self.index_cols() 

3480 self.attrs.values_cols = self.values_cols() 

3481 self.attrs.non_index_axes = self.non_index_axes 

3482 self.attrs.data_columns = self.data_columns 

3483 self.attrs.nan_rep = self.nan_rep 

3484 self.attrs.encoding = self.encoding 

3485 self.attrs.errors = self.errors 

3486 self.attrs.levels = self.levels 

3487 self.attrs.info = self.info 

3488 

3489 def get_attrs(self) -> None: 

3490 """retrieve our attributes""" 

3491 self.non_index_axes = getattr(self.attrs, "non_index_axes", None) or [] 

3492 self.data_columns = getattr(self.attrs, "data_columns", None) or [] 

3493 self.info = getattr(self.attrs, "info", None) or {} 

3494 self.nan_rep = getattr(self.attrs, "nan_rep", None) 

3495 self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None)) 

3496 self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict")) 

3497 self.levels: list[Hashable] = getattr(self.attrs, "levels", None) or [] 

3498 self.index_axes = [a for a in self.indexables if a.is_an_indexable] 

3499 self.values_axes = [a for a in self.indexables if not a.is_an_indexable] 

3500 

3501 def validate_version(self, where=None) -> None: 

3502 """are we trying to operate on an old version?""" 

3503 if where is not None: 

3504 if self.is_old_version: 

3505 ws = incompatibility_doc % ".".join([str(x) for x in self.version]) 

3506 warnings.warn( 

3507 ws, 

3508 IncompatibilityWarning, 

3509 stacklevel=find_stack_level(), 

3510 ) 

3511 

3512 def validate_min_itemsize(self, min_itemsize) -> None: 

3513 """ 

3514 validate the min_itemsize doesn't contain items that are not in the 

3515 axes this needs data_columns to be defined 

3516 """ 

3517 if min_itemsize is None: 

3518 return 

3519 if not isinstance(min_itemsize, dict): 

3520 return 

3521 

3522 q = self.queryables() 

3523 for k in min_itemsize: 

3524 # ok, apply generally 

3525 if k == "values": 

3526 continue 

3527 if k not in q: 

3528 raise ValueError( 

3529 f"min_itemsize has the key [{k}] which is not an axis or " 

3530 "data_column" 

3531 ) 

3532 
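validate_min_itemsize only checks that each min_itemsize key names an axis or data column (or the catch-all "values"); the sizing itself happens in the column classes above. The practical reason to pass it is to reserve room for longer strings in later appends, as in this hedged sketch:

import pandas as pd

df1 = pd.DataFrame({"name": ["ab", "cd"]})
df2 = pd.DataFrame({"name": ["a much longer string"]})
with pd.HDFStore("strings.h5", mode="w") as store:
    # reserve 50 bytes per value in 'name'; without this, the second append
    # would fail because the column was sized from the first chunk
    store.append("df", df1, data_columns=["name"], min_itemsize={"name": 50})
    store.append("df", df2)
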

3533 @cache_readonly 

3534 def indexables(self): 

3535 """create/cache the indexables if they don't exist""" 

3536 _indexables = [] 

3537 

3538 desc = self.description 

3539 table_attrs = self.table.attrs 

3540 

3541 # Note: each of the `name` kwargs below are str, ensured 

3542 # by the definition in index_cols. 

3543 # index columns 

3544 for i, (axis, name) in enumerate(self.attrs.index_cols): 

3545 atom = getattr(desc, name) 

3546 md = self.read_metadata(name) 

3547 meta = "category" if md is not None else None 

3548 

3549 kind_attr = f"{name}_kind" 

3550 kind = getattr(table_attrs, kind_attr, None) 

3551 

3552 index_col = IndexCol( 

3553 name=name, 

3554 axis=axis, 

3555 pos=i, 

3556 kind=kind, 

3557 typ=atom, 

3558 table=self.table, 

3559 meta=meta, 

3560 metadata=md, 

3561 ) 

3562 _indexables.append(index_col) 

3563 

3564 # values columns 

3565 dc = set(self.data_columns) 

3566 base_pos = len(_indexables) 

3567 

3568 def f(i, c): 

3569 assert isinstance(c, str) 

3570 klass = DataCol 

3571 if c in dc: 

3572 klass = DataIndexableCol 

3573 

3574 atom = getattr(desc, c) 

3575 adj_name = _maybe_adjust_name(c, self.version) 

3576 

3577 # TODO: why kind_attr here? 

3578 values = getattr(table_attrs, f"{adj_name}_kind", None) 

3579 dtype = getattr(table_attrs, f"{adj_name}_dtype", None) 

3580 # Argument 1 to "_dtype_to_kind" has incompatible type 

3581 # "Optional[Any]"; expected "str" [arg-type] 

3582 kind = _dtype_to_kind(dtype) # type: ignore[arg-type] 

3583 

3584 md = self.read_metadata(c) 

3585            # TODO: figure out why these two versions of `meta` don't always match.

3586 # meta = "category" if md is not None else None 

3587 meta = getattr(table_attrs, f"{adj_name}_meta", None) 

3588 

3589 obj = klass( 

3590 name=adj_name, 

3591 cname=c, 

3592 values=values, 

3593 kind=kind, 

3594 pos=base_pos + i, 

3595 typ=atom, 

3596 table=self.table, 

3597 meta=meta, 

3598 metadata=md, 

3599 dtype=dtype, 

3600 ) 

3601 return obj 

3602 

3603 # Note: the definition of `values_cols` ensures that each 

3604 # `c` below is a str. 

3605 _indexables.extend([f(i, c) for i, c in enumerate(self.attrs.values_cols)]) 

3606 

3607 return _indexables 

3608 

3609 def create_index( 

3610 self, columns=None, optlevel=None, kind: str | None = None 

3611 ) -> None: 

3612 """ 

3613 Create a pytables index on the specified columns. 

3614 

3615 Parameters 

3616 ---------- 

3617 columns : None, bool, or listlike[str] 

3618 Indicate which columns to create an index on. 

3619 

3620 * False : Do not create any indexes. 

3621 * True : Create indexes on all columns. 

3622 * None : Create indexes on all columns. 

3623 * listlike : Create indexes on the given columns. 

3624 

3625 optlevel : int or None, default None 

3626 Optimization level, if None, pytables defaults to 6. 

3627 kind : str or None, default None 

3628 Kind of index, if None, pytables defaults to "medium". 

3629 

3630 Raises 

3631 ------ 

3632 TypeError if trying to create an index on a complex-type column. 

3633 

3634 Notes 

3635 ----- 

3636 Cannot index Time64Col or ComplexCol. 

3637 Pytables must be >= 3.0. 

3638 """ 

3639 if not self.infer_axes(): 

3640 return 

3641 if columns is False: 

3642 return 

3643 

3644 # index all indexables and data_columns 

3645 if columns is None or columns is True: 

3646 columns = [a.cname for a in self.axes if a.is_data_indexable] 

3647 if not isinstance(columns, (tuple, list)): 

3648 columns = [columns] 

3649 

3650 kw = {} 

3651 if optlevel is not None: 

3652 kw["optlevel"] = optlevel 

3653 if kind is not None: 

3654 kw["kind"] = kind 

3655 

3656 table = self.table 

3657 for c in columns: 

3658 v = getattr(table.cols, c, None) 

3659 if v is not None: 

3660 # remove the index if the kind/optlevel have changed 

3661 if v.is_indexed: 

3662 index = v.index 

3663 cur_optlevel = index.optlevel 

3664 cur_kind = index.kind 

3665 

3666 if kind is not None and cur_kind != kind: 

3667 v.remove_index() 

3668 else: 

3669 kw["kind"] = cur_kind 

3670 

3671 if optlevel is not None and cur_optlevel != optlevel: 

3672 v.remove_index() 

3673 else: 

3674 kw["optlevel"] = cur_optlevel 

3675 

3676 # create the index 

3677 if not v.is_indexed: 

3678 if v.type.startswith("complex"): 

3679 raise TypeError( 

3680 "Columns containing complex values can be stored but " 

3681 "cannot be indexed when using table format. Either use " 

3682 "fixed format, set index=False, or do not include " 

3683 "the columns containing complex values to " 

3684 "data_columns when initializing the table." 

3685 ) 

3686 v.create_index(**kw) 

3687 elif c in self.non_index_axes[0][1]: 

3688 # GH 28156 

3689 raise AttributeError( 

3690 f"column {c} is not a data_column.\n" 

3691 f"In order to read column {c} you must reload the dataframe \n" 

3692 f"into HDFStore and include {c} with the data_columns argument." 

3693 ) 

3694 
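# --- editor's note: hedged usage sketch, standalone and not part of pandas ---
# create_index() is what backs HDFStore.create_table_index(); optlevel/kind are
# forwarded to the PyTables column index.  Sketch assuming PyTables is installed;
# "demo.h5" is a hypothetical file name.
import pandas as pd

df = pd.DataFrame({"A": range(100), "B": list("ab") * 50})
with pd.HDFStore("demo.h5", mode="w") as store:
    store.append("df", df, data_columns=["A"], index=False)   # defer index creation
    store.create_table_index("df", columns=["A"], optlevel=9, kind="full")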

3695 def _read_axes( 

3696 self, where, start: int | None = None, stop: int | None = None 

3697 ) -> list[tuple[ArrayLike, ArrayLike]]: 

3698 """ 

3699 Create the axes sniffed from the table. 

3700 

3701 Parameters 

3702 ---------- 

3703 where : list of Terms (or convertible to), optional

3704 start : int or None, default None 

3705 stop : int or None, default None 

3706 

3707 Returns 

3708 ------- 

3709 List[Tuple[index_values, column_values]] 

3710 """ 

3711 # create the selection 

3712 selection = Selection(self, where=where, start=start, stop=stop) 

3713 values = selection.select() 

3714 

3715 results = [] 

3716 # convert the data 

3717 for a in self.axes: 

3718 a.set_info(self.info) 

3719 res = a.convert( 

3720 values, 

3721 nan_rep=self.nan_rep, 

3722 encoding=self.encoding, 

3723 errors=self.errors, 

3724 ) 

3725 results.append(res) 

3726 

3727 return results 

3728 

3729 @classmethod 

3730 def get_object(cls, obj, transposed: bool): 

3731 """return the data for this obj""" 

3732 return obj 

3733 

3734 def validate_data_columns(self, data_columns, min_itemsize, non_index_axes): 

3735 """ 

3736 take the input data_columns and min_itemsize and create a data

3737 columns spec 

3738 """ 

3739 if not len(non_index_axes): 

3740 return [] 

3741 

3742 axis, axis_labels = non_index_axes[0] 

3743 info = self.info.get(axis, {}) 

3744 if info.get("type") == "MultiIndex" and data_columns: 

3745 raise ValueError( 

3746 f"cannot use a multi-index on axis [{axis}] with " 

3747 f"data_columns {data_columns}" 

3748 ) 

3749 

3750 # evaluate the passed data_columns, True == use all columns 

3751 # take only valid axis labels 

3752 if data_columns is True: 

3753 data_columns = list(axis_labels) 

3754 elif data_columns is None: 

3755 data_columns = [] 

3756 

3757 # if min_itemsize is a dict, add the keys (exclude 'values') 

3758 if isinstance(min_itemsize, dict): 

3759 existing_data_columns = set(data_columns) 

3760 data_columns = list(data_columns) # ensure we do not modify 

3761 data_columns.extend( 

3762 [ 

3763 k 

3764 for k in min_itemsize.keys() 

3765 if k != "values" and k not in existing_data_columns 

3766 ] 

3767 ) 

3768 

3769 # return valid columns in the order of our axis 

3770 return [c for c in data_columns if c in axis_labels] 

3771 
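# --- editor's note: hedged usage sketch, standalone and not part of pandas ---
# validate_data_columns() is why the keys of a min_itemsize dict implicitly
# become data columns.  "demo.h5" is a hypothetical file name.
import pandas as pd

df = pd.DataFrame({"A": range(3), "B": ["x", "y", "z"]})
with pd.HDFStore("demo.h5", mode="w") as store:
    # "B" becomes a queryable data column even though only min_itemsize names it
    store.append("df", df, min_itemsize={"B": 30})
    hits = store.select("df", where="B == 'y'")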

3772 def _create_axes( 

3773 self, 

3774 axes, 

3775 obj: DataFrame, 

3776 validate: bool = True, 

3777 nan_rep=None, 

3778 data_columns=None, 

3779 min_itemsize=None, 

3780 ): 

3781 """ 

3782 Create and return the axes. 

3783 

3784 Parameters 

3785 ---------- 

3786 axes: list or None 

3787 The names or numbers of the axes to create. 

3788 obj : DataFrame 

3789 The object to create axes on. 

3790 validate: bool, default True 

3791 Whether to validate the obj against an existing object already written. 

3792 nan_rep : 

3793 A value to use for string column nan_rep. 

3794 data_columns : List[str], True, or None, default None 

3795 Specify the columns that we want to create to allow indexing on. 

3796 

3797 * True : Use all available columns. 

3798 * None : Use no columns. 

3799 * List[str] : Use the specified columns. 

3800 

3801 min_itemsize: Dict[str, int] or None, default None 

3802 The min itemsize for a column in bytes. 

3803 """ 

3804 if not isinstance(obj, DataFrame): 

3805 group = self.group._v_name 

3806 raise TypeError( 

3807 f"cannot properly create the storer for: [group->{group}," 

3808 f"value->{type(obj)}]" 

3809 ) 

3810 

3811 # set the default axes if needed 

3812 if axes is None: 

3813 axes = [0] 

3814 

3815 # map axes to numbers 

3816 axes = [obj._get_axis_number(a) for a in axes] 

3817 

3818 # do we have an existing table (if so, use its axes & data_columns) 

3819 if self.infer_axes(): 

3820 table_exists = True 

3821 axes = [a.axis for a in self.index_axes] 

3822 data_columns = list(self.data_columns) 

3823 nan_rep = self.nan_rep 

3824 # TODO: do we always have validate=True here? 

3825 else: 

3826 table_exists = False 

3827 

3828 new_info = self.info 

3829 

3830 assert self.ndim == 2 # with next check, we must have len(axes) == 1 

3831 # currently only support ndim-1 axes

3832 if len(axes) != self.ndim - 1: 

3833 raise ValueError( 

3834 "currently only support ndim-1 indexers in an AppendableTable" 

3835 ) 

3836 

3837 # create according to the new data 

3838 new_non_index_axes: list = [] 

3839 

3840 # nan_representation 

3841 if nan_rep is None: 

3842 nan_rep = "nan" 

3843 

3844 # We construct the non-index-axis first, since that alters new_info 

3845 idx = [x for x in [0, 1] if x not in axes][0] 

3846 

3847 a = obj.axes[idx] 

3848 # we might be able to change the axes on the appending data if necessary 

3849 append_axis = list(a) 

3850 if table_exists: 

3851 indexer = len(new_non_index_axes) # i.e. 0 

3852 exist_axis = self.non_index_axes[indexer][1] 

3853 if not array_equivalent(np.array(append_axis), np.array(exist_axis)): 

3854 # ahah! -> reindex 

3855 if array_equivalent( 

3856 np.array(sorted(append_axis)), np.array(sorted(exist_axis)) 

3857 ): 

3858 append_axis = exist_axis 

3859 

3860 # the non_index_axes info 

3861 info = new_info.setdefault(idx, {}) 

3862 info["names"] = list(a.names) 

3863 info["type"] = type(a).__name__ 

3864 

3865 new_non_index_axes.append((idx, append_axis)) 

3866 

3867 # Now we can construct our new index axis 

3868 idx = axes[0] 

3869 a = obj.axes[idx] 

3870 axis_name = obj._get_axis_name(idx) 

3871 new_index = _convert_index(axis_name, a, self.encoding, self.errors) 

3872 new_index.axis = idx 

3873 

3874 # Because we are always 2D, there is only one new_index, so 

3875 # we know it will have pos=0 

3876 new_index.set_pos(0) 

3877 new_index.update_info(new_info) 

3878 new_index.maybe_set_size(min_itemsize) # check for column conflicts 

3879 

3880 new_index_axes = [new_index] 

3881 j = len(new_index_axes) # i.e. 1 

3882 assert j == 1 

3883 

3884 # reindex by our non_index_axes & compute data_columns 

3885 assert len(new_non_index_axes) == 1 

3886 for a in new_non_index_axes: 

3887 obj = _reindex_axis(obj, a[0], a[1]) 

3888 

3889 transposed = new_index.axis == 1 

3890 

3891 # figure out data_columns and get out blocks 

3892 data_columns = self.validate_data_columns( 

3893 data_columns, min_itemsize, new_non_index_axes 

3894 ) 

3895 

3896 frame = self.get_object(obj, transposed)._consolidate() 

3897 

3898 blocks, blk_items = self._get_blocks_and_items( 

3899 frame, table_exists, new_non_index_axes, self.values_axes, data_columns 

3900 ) 

3901 

3902 # add my values 

3903 vaxes = [] 

3904 for i, (blk, b_items) in enumerate(zip(blocks, blk_items)): 

3905 # shape of the data column are the indexable axes 

3906 klass = DataCol 

3907 name = None 

3908 

3909 # we have a data_column 

3910 if data_columns and len(b_items) == 1 and b_items[0] in data_columns: 

3911 klass = DataIndexableCol 

3912 name = b_items[0] 

3913 if not (name is None or isinstance(name, str)): 

3914 # TODO: should the message here be more specifically non-str? 

3915 raise ValueError("cannot have non-object label DataIndexableCol") 

3916 

3917 # make sure that we match up the existing columns 

3918 # if we have an existing table 

3919 existing_col: DataCol | None 

3920 

3921 if table_exists and validate: 

3922 try: 

3923 existing_col = self.values_axes[i] 

3924 except (IndexError, KeyError) as err: 

3925 raise ValueError( 

3926 f"Incompatible appended table [{blocks}]" 

3927 f"with existing table [{self.values_axes}]" 

3928 ) from err 

3929 else: 

3930 existing_col = None 

3931 

3932 new_name = name or f"values_block_{i}" 

3933 data_converted = _maybe_convert_for_string_atom( 

3934 new_name, 

3935 blk.values, 

3936 existing_col=existing_col, 

3937 min_itemsize=min_itemsize, 

3938 nan_rep=nan_rep, 

3939 encoding=self.encoding, 

3940 errors=self.errors, 

3941 columns=b_items, 

3942 ) 

3943 adj_name = _maybe_adjust_name(new_name, self.version) 

3944 

3945 typ = klass._get_atom(data_converted) 

3946 kind = _dtype_to_kind(data_converted.dtype.name) 

3947 tz = None 

3948 if getattr(data_converted, "tz", None) is not None: 

3949 tz = _get_tz(data_converted.tz) 

3950 

3951 meta = metadata = ordered = None 

3952 if is_categorical_dtype(data_converted.dtype): 

3953 ordered = data_converted.ordered 

3954 meta = "category" 

3955 metadata = np.array(data_converted.categories, copy=False).ravel() 

3956 

3957 data, dtype_name = _get_data_and_dtype_name(data_converted) 

3958 

3959 col = klass( 

3960 name=adj_name, 

3961 cname=new_name, 

3962 values=list(b_items), 

3963 typ=typ, 

3964 pos=j, 

3965 kind=kind, 

3966 tz=tz, 

3967 ordered=ordered, 

3968 meta=meta, 

3969 metadata=metadata, 

3970 dtype=dtype_name, 

3971 data=data, 

3972 ) 

3973 col.update_info(new_info) 

3974 

3975 vaxes.append(col) 

3976 

3977 j += 1 

3978 

3979 dcs = [col.name for col in vaxes if col.is_data_indexable] 

3980 

3981 new_table = type(self)( 

3982 parent=self.parent, 

3983 group=self.group, 

3984 encoding=self.encoding, 

3985 errors=self.errors, 

3986 index_axes=new_index_axes, 

3987 non_index_axes=new_non_index_axes, 

3988 values_axes=vaxes, 

3989 data_columns=dcs, 

3990 info=new_info, 

3991 nan_rep=nan_rep, 

3992 ) 

3993 if hasattr(self, "levels"): 

3994 # TODO: get this into constructor, only for appropriate subclass 

3995 new_table.levels = self.levels 

3996 

3997 new_table.validate_min_itemsize(min_itemsize) 

3998 

3999 if validate and table_exists: 

4000 new_table.validate(self) 

4001 

4002 return new_table 

4003 

4004 @staticmethod 

4005 def _get_blocks_and_items( 

4006 frame: DataFrame, 

4007 table_exists: bool, 

4008 new_non_index_axes, 

4009 values_axes, 

4010 data_columns, 

4011 ): 

4012 # Helper to clarify non-state-altering parts of _create_axes 

4013 

4014 # TODO(ArrayManager) HDFStore relies on accessing the blocks 

4015 if isinstance(frame._mgr, ArrayManager): 

4016 frame = frame._as_manager("block") 

4017 

4018 def get_blk_items(mgr): 

4019 return [mgr.items.take(blk.mgr_locs) for blk in mgr.blocks] 

4020 

4021 mgr = frame._mgr 

4022 mgr = cast(BlockManager, mgr) 

4023 blocks: list[Block] = list(mgr.blocks) 

4024 blk_items: list[Index] = get_blk_items(mgr) 

4025 

4026 if len(data_columns): 

4027 # TODO: prove that we only get here with axis == 1? 

4028 # It is the case in all extant tests, but NOT the case 

4029 # outside this `if len(data_columns)` check. 

4030 

4031 axis, axis_labels = new_non_index_axes[0] 

4032 new_labels = Index(axis_labels).difference(Index(data_columns)) 

4033 mgr = frame.reindex(new_labels, axis=axis)._mgr 

4034 mgr = cast(BlockManager, mgr) 

4035 

4036 blocks = list(mgr.blocks) 

4037 blk_items = get_blk_items(mgr) 

4038 for c in data_columns: 

4039 # This reindex would raise ValueError if we had a duplicate 

4040 # index, so we can infer that (as long as axis==1) we 

4041 # get a single column back, so a single block. 

4042 mgr = frame.reindex([c], axis=axis)._mgr 

4043 mgr = cast(BlockManager, mgr) 

4044 blocks.extend(mgr.blocks) 

4045 blk_items.extend(get_blk_items(mgr)) 

4046 

4047 # reorder the blocks in the same order as the existing table if we can 

4048 if table_exists: 

4049 by_items = { 

4050 tuple(b_items.tolist()): (b, b_items) 

4051 for b, b_items in zip(blocks, blk_items) 

4052 } 

4053 new_blocks: list[Block] = [] 

4054 new_blk_items = [] 

4055 for ea in values_axes: 

4056 items = tuple(ea.values) 

4057 try: 

4058 b, b_items = by_items.pop(items) 

4059 new_blocks.append(b) 

4060 new_blk_items.append(b_items) 

4061 except (IndexError, KeyError) as err: 

4062 jitems = ",".join([pprint_thing(item) for item in items]) 

4063 raise ValueError( 

4064 f"cannot match existing table structure for [{jitems}] " 

4065 "on appending data" 

4066 ) from err 

4067 blocks = new_blocks 

4068 blk_items = new_blk_items 

4069 

4070 return blocks, blk_items 

4071 

4072 def process_axes(self, obj, selection: Selection, columns=None) -> DataFrame: 

4073 """process axes filters""" 

4074 # make a copy to avoid side effects 

4075 if columns is not None: 

4076 columns = list(columns) 

4077 

4078 # make sure to include levels if we have them 

4079 if columns is not None and self.is_multi_index: 

4080 assert isinstance(self.levels, list) # assured by is_multi_index 

4081 for n in self.levels: 

4082 if n not in columns: 

4083 columns.insert(0, n) 

4084 

4085 # reorder by any non_index_axes & limit to the select columns 

4086 for axis, labels in self.non_index_axes: 

4087 obj = _reindex_axis(obj, axis, labels, columns) 

4088 

4089 def process_filter(field, filt, op): 

4090 for axis_name in obj._AXIS_ORDERS: 

4091 axis_number = obj._get_axis_number(axis_name) 

4092 axis_values = obj._get_axis(axis_name) 

4093 assert axis_number is not None 

4094 

4095 # see if the field is the name of an axis 

4096 if field == axis_name: 

4097 # if we have a multi-index, then need to include 

4098 # the levels 

4099 if self.is_multi_index: 

4100 filt = filt.union(Index(self.levels)) 

4101 

4102 takers = op(axis_values, filt) 

4103 return obj.loc(axis=axis_number)[takers] 

4104 

4105 # this might be the name of a field IN an axis

4106 elif field in axis_values: 

4107 # we need to filter on this dimension 

4108 values = ensure_index(getattr(obj, field).values) 

4109 filt = ensure_index(filt) 

4110 

4111 # hack until we support reversed dim flags 

4112 if isinstance(obj, DataFrame): 

4113 axis_number = 1 - axis_number 

4114 

4115 takers = op(values, filt) 

4116 return obj.loc(axis=axis_number)[takers] 

4117 

4118 raise ValueError(f"cannot find the field [{field}] for filtering!") 

4119 

4120 # apply the selection filters (but keep in the same order) 

4121 if selection.filter is not None: 

4122 for field, op, filt in selection.filter.format(): 

4123 obj = process_filter(field, filt, op) 

4124 

4125 return obj 

4126 

4127 def create_description( 

4128 self, 

4129 complib, 

4130 complevel: int | None, 

4131 fletcher32: bool, 

4132 expectedrows: int | None, 

4133 ) -> dict[str, Any]: 

4134 """create the description of the table from the axes & values""" 

4135 # provide the expected rows if it's passed

4136 if expectedrows is None: 

4137 expectedrows = max(self.nrows_expected, 10000) 

4138 

4139 d = {"name": "table", "expectedrows": expectedrows} 

4140 

4141 # description from the axes & values 

4142 d["description"] = {a.cname: a.typ for a in self.axes} 

4143 

4144 if complib: 

4145 if complevel is None: 

4146 complevel = self._complevel or 9 

4147 filters = _tables().Filters( 

4148 complevel=complevel, 

4149 complib=complib, 

4150 fletcher32=fletcher32 or self._fletcher32, 

4151 ) 

4152 d["filters"] = filters 

4153 elif self._filters is not None: 

4154 d["filters"] = self._filters 

4155 

4156 return d 

4157 
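# --- editor's note: hedged usage sketch, standalone and not part of pandas ---
# create_description() is where complib/complevel/fletcher32/expectedrows become
# the PyTables Filters and table options.  From the public API they arrive via
# to_hdf / HDFStore.append; the file names below are hypothetical.
import pandas as pd

df = pd.DataFrame({"A": range(1000)})
df.to_hdf("compressed.h5", key="df", format="table", complib="blosc", complevel=9)

with pd.HDFStore("pre_sized.h5", mode="w") as store:
    store.append("df", df, expectedrows=1_000_000)   # sizing hint for PyTables chunking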

4158 def read_coordinates( 

4159 self, where=None, start: int | None = None, stop: int | None = None 

4160 ): 

4161 """ 

4162 select coordinates (row numbers) from a table; return the 

4163 coordinates object 

4164 """ 

4165 # validate the version 

4166 self.validate_version(where) 

4167 

4168 # infer the data kind 

4169 if not self.infer_axes(): 

4170 return False 

4171 

4172 # create the selection 

4173 selection = Selection(self, where=where, start=start, stop=stop) 

4174 coords = selection.select_coords() 

4175 if selection.filter is not None: 

4176 for field, op, filt in selection.filter.format(): 

4177 data = self.read_column( 

4178 field, start=coords.min(), stop=coords.max() + 1 

4179 ) 

4180 coords = coords[op(data.iloc[coords - coords.min()], filt).values] 

4181 

4182 return Index(coords) 

4183 
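# --- editor's note: hedged usage sketch, standalone and not part of pandas ---
# read_coordinates() backs HDFStore.select_as_coordinates(): it returns the row
# numbers matching a where clause, which can then be fed back into select().
# "demo.h5" is a hypothetical file name.
import pandas as pd

df = pd.DataFrame({"A": range(10)})
with pd.HDFStore("demo.h5", mode="w") as store:
    store.append("df", df, data_columns=["A"])
    coords = store.select_as_coordinates("df", where="A > 6")   # Index of row numbers
    subset = store.select("df", where=coords)                   # reuse the coordinates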

4184 def read_column( 

4185 self, 

4186 column: str, 

4187 where=None, 

4188 start: int | None = None, 

4189 stop: int | None = None, 

4190 ): 

4191 """ 

4192 return a single column from the table, generally only indexables 

4193 are interesting 

4194 """ 

4195 # validate the version 

4196 self.validate_version() 

4197 

4198 # infer the data kind 

4199 if not self.infer_axes(): 

4200 return False 

4201 

4202 if where is not None: 

4203 raise TypeError("read_column does not currently accept a where clause") 

4204 

4205 # find the axes 

4206 for a in self.axes: 

4207 if column == a.name: 

4208 if not a.is_data_indexable: 

4209 raise ValueError( 

4210 f"column [{column}] can not be extracted individually; " 

4211 "it is not data indexable" 

4212 ) 

4213 

4214 # column must be an indexable or a data column 

4215 c = getattr(self.table.cols, column) 

4216 a.set_info(self.info) 

4217 col_values = a.convert( 

4218 c[start:stop], 

4219 nan_rep=self.nan_rep, 

4220 encoding=self.encoding, 

4221 errors=self.errors, 

4222 ) 

4223 return Series(_set_tz(col_values[1], a.tz), name=column, copy=False) 

4224 

4225 raise KeyError(f"column [{column}] not found in the table") 

4226 
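# --- editor's note: hedged usage sketch, standalone and not part of pandas ---
# read_column() backs HDFStore.select_column(); only indexables and data columns
# can be pulled out this way, otherwise the ValueError above is raised.
# "demo.h5" is a hypothetical file name.
import pandas as pd

df = pd.DataFrame({"A": range(5), "B": list("abcde")})
with pd.HDFStore("demo.h5", mode="w") as store:
    store.append("df", df, data_columns=["A"])
    idx = store.select_column("df", "index")   # the stored index, as a Series
    a = store.select_column("df", "A")         # works: "A" is a data column
    # store.select_column("df", "B")           # would raise: "B" is not data indexable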

4227 

4228class WORMTable(Table): 

4229 """ 

4230 a write-once read-many table: this format DOES NOT ALLOW appending to a 

4231 table. writing is a one-time operation; the data are stored in a format

4232 that allows for searching the data on disk 

4233 """ 

4234 

4235 table_type = "worm" 

4236 

4237 def read( 

4238 self, 

4239 where=None, 

4240 columns=None, 

4241 start: int | None = None, 

4242 stop: int | None = None, 

4243 ): 

4244 """ 

4245 read the indices and the indexing array, calculate offset rows and return 

4246 """ 

4247 raise NotImplementedError("WORMTable needs to implement read") 

4248 

4249 def write(self, **kwargs) -> None: 

4250 """ 

4251 write in a format that we can search later on (but cannot append 

4252 to): write out the indices and the values using _write_array 

4253 (e.g. a CArray) create an indexing table so that we can search 

4254 """ 

4255 raise NotImplementedError("WORMTable needs to implement write") 

4256 

4257 

4258class AppendableTable(Table): 

4259 """support the new appendable table formats""" 

4260 

4261 table_type = "appendable" 

4262 

4263 # error: Signature of "write" incompatible with supertype "Fixed" 

4264 def write( # type: ignore[override] 

4265 self, 

4266 obj, 

4267 axes=None, 

4268 append: bool = False, 

4269 complib=None, 

4270 complevel=None, 

4271 fletcher32=None, 

4272 min_itemsize=None, 

4273 chunksize=None, 

4274 expectedrows=None, 

4275 dropna: bool = False, 

4276 nan_rep=None, 

4277 data_columns=None, 

4278 track_times: bool = True, 

4279 ) -> None: 

4280 if not append and self.is_exists: 

4281 self._handle.remove_node(self.group, "table") 

4282 

4283 # create the axes 

4284 table = self._create_axes( 

4285 axes=axes, 

4286 obj=obj, 

4287 validate=append, 

4288 min_itemsize=min_itemsize, 

4289 nan_rep=nan_rep, 

4290 data_columns=data_columns, 

4291 ) 

4292 

4293 for a in table.axes: 

4294 a.validate_names() 

4295 

4296 if not table.is_exists: 

4297 # create the table 

4298 options = table.create_description( 

4299 complib=complib, 

4300 complevel=complevel, 

4301 fletcher32=fletcher32, 

4302 expectedrows=expectedrows, 

4303 ) 

4304 

4305 # set the table attributes 

4306 table.set_attrs() 

4307 

4308 options["track_times"] = track_times 

4309 

4310 # create the table 

4311 table._handle.create_table(table.group, **options) 

4312 

4313 # update my info 

4314 table.attrs.info = table.info 

4315 

4316 # validate the axes and set the kinds 

4317 for a in table.axes: 

4318 a.validate_and_set(table, append) 

4319 

4320 # add the rows 

4321 table.write_data(chunksize, dropna=dropna) 

4322 

4323 def write_data(self, chunksize: int | None, dropna: bool = False) -> None: 

4324 """ 

4325 we form the data into a 2-d including indexes, values and mask, and write it chunk-by-chunk

4326 """ 

4327 names = self.dtype.names 

4328 nrows = self.nrows_expected 

4329 

4330 # if dropna==True, then drop ALL nan rows 

4331 masks = [] 

4332 if dropna: 

4333 for a in self.values_axes: 

4334 # figure the mask: only do if we can successfully process this 

4335 # column, otherwise ignore the mask 

4336 mask = isna(a.data).all(axis=0) 

4337 if isinstance(mask, np.ndarray): 

4338 masks.append(mask.astype("u1", copy=False)) 

4339 

4340 # consolidate masks 

4341 if len(masks): 

4342 mask = masks[0] 

4343 for m in masks[1:]: 

4344 mask = mask & m 

4345 mask = mask.ravel() 

4346 else: 

4347 mask = None 

4348 

4349 # broadcast the indexes if needed 

4350 indexes = [a.cvalues for a in self.index_axes] 

4351 nindexes = len(indexes) 

4352 assert nindexes == 1, nindexes # ensures we don't need to broadcast

4353 

4354 # transpose the values so first dimension is last 

4355 # reshape the values if needed 

4356 values = [a.take_data() for a in self.values_axes] 

4357 values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) for v in values] 

4358 bvalues = [] 

4359 for i, v in enumerate(values): 

4360 new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape 

4361 bvalues.append(v.reshape(new_shape)) 

4362 

4363 # write the chunks 

4364 if chunksize is None: 

4365 chunksize = 100000 

4366 

4367 rows = np.empty(min(chunksize, nrows), dtype=self.dtype) 

4368 chunks = nrows // chunksize + 1 

4369 for i in range(chunks): 

4370 start_i = i * chunksize 

4371 end_i = min((i + 1) * chunksize, nrows) 

4372 if start_i >= end_i: 

4373 break 

4374 

4375 self.write_data_chunk( 

4376 rows, 

4377 indexes=[a[start_i:end_i] for a in indexes], 

4378 mask=mask[start_i:end_i] if mask is not None else None, 

4379 values=[v[start_i:end_i] for v in bvalues], 

4380 ) 

4381 
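# --- editor's note: hedged usage sketch, standalone and not part of pandas ---
# write_data() slices the row array into chunks of `chunksize` (default 100000)
# and appends each chunk via write_data_chunk().  The chunk size is exposed on
# HDFStore.append(); "big.h5" is a hypothetical file name.
import numpy as np
import pandas as pd

df = pd.DataFrame({"A": np.arange(250_000)})
with pd.HDFStore("big.h5", mode="w") as store:
    store.append("df", df, chunksize=50_000)   # written as 5 chunks of 50k rows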

4382 def write_data_chunk( 

4383 self, 

4384 rows: np.ndarray, 

4385 indexes: list[np.ndarray], 

4386 mask: npt.NDArray[np.bool_] | None, 

4387 values: list[np.ndarray], 

4388 ) -> None: 

4389 """ 

4390 Parameters 

4391 ---------- 

4392 rows : an empty memory space where we are putting the chunk 

4393 indexes : an array of the indexes 

4394 mask : an array of the masks 

4395 values : an array of the values 

4396 """ 

4397 # 0 len 

4398 for v in values: 

4399 if not np.prod(v.shape): 

4400 return 

4401 

4402 nrows = indexes[0].shape[0] 

4403 if nrows != len(rows): 

4404 rows = np.empty(nrows, dtype=self.dtype) 

4405 names = self.dtype.names 

4406 nindexes = len(indexes) 

4407 

4408 # indexes 

4409 for i, idx in enumerate(indexes): 

4410 rows[names[i]] = idx 

4411 

4412 # values 

4413 for i, v in enumerate(values): 

4414 rows[names[i + nindexes]] = v 

4415 

4416 # mask 

4417 if mask is not None: 

4418 m = ~mask.ravel().astype(bool, copy=False) 

4419 if not m.all(): 

4420 rows = rows[m] 

4421 

4422 if len(rows): 

4423 self.table.append(rows) 

4424 self.table.flush() 

4425 

4426 def delete(self, where=None, start: int | None = None, stop: int | None = None): 

4427 # delete all rows (and return the nrows) 

4428 if where is None or not len(where): 

4429 if start is None and stop is None: 

4430 nrows = self.nrows 

4431 self._handle.remove_node(self.group, recursive=True) 

4432 else: 

4433 # pytables<3.0 would remove a single row with stop=None 

4434 if stop is None: 

4435 stop = self.nrows 

4436 nrows = self.table.remove_rows(start=start, stop=stop) 

4437 self.table.flush() 

4438 return nrows 

4439 

4440 # infer the data kind 

4441 if not self.infer_axes(): 

4442 return None 

4443 

4444 # create the selection 

4445 table = self.table 

4446 selection = Selection(self, where, start=start, stop=stop) 

4447 values = selection.select_coords() 

4448 

4449 # delete the rows in reverse order 

4450 sorted_series = Series(values, copy=False).sort_values() 

4451 ln = len(sorted_series) 

4452 

4453 if ln: 

4454 # construct groups of consecutive rows 

4455 diff = sorted_series.diff() 

4456 groups = list(diff[diff > 1].index) 

4457 

4458 # 1 group 

4459 if not len(groups): 

4460 groups = [0] 

4461 

4462 # final element 

4463 if groups[-1] != ln: 

4464 groups.append(ln) 

4465 

4466 # initial element 

4467 if groups[0] != 0: 

4468 groups.insert(0, 0) 

4469 

4470 # we must remove in reverse order! 

4471 pg = groups.pop() 

4472 for g in reversed(groups): 

4473 rows = sorted_series.take(range(g, pg)) 

4474 table.remove_rows( 

4475 start=rows[rows.index[0]], stop=rows[rows.index[-1]] + 1 

4476 ) 

4477 pg = g 

4478 

4479 self.table.flush() 

4480 

4481 # return the number of rows removed 

4482 return ln 

4483 
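# --- editor's note: hedged usage sketch, standalone and not part of pandas ---
# delete() backs HDFStore.remove() for table formats: with no `where` the whole
# node is dropped, otherwise matching rows are removed in reverse-order groups
# as above.  "demo.h5" is a hypothetical file name.
import pandas as pd

df = pd.DataFrame({"A": range(10)})
with pd.HDFStore("demo.h5", mode="w") as store:
    store.append("df", df)
    n_removed = store.remove("df", where="index >= 7")   # number of rows removed
    remaining = store.select("df")                        # 7 rows left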

4484 

4485class AppendableFrameTable(AppendableTable): 

4486 """support the new appendable table formats""" 

4487 

4488 pandas_kind = "frame_table" 

4489 table_type = "appendable_frame" 

4490 ndim = 2 

4491 obj_type: type[DataFrame | Series] = DataFrame 

4492 

4493 @property 

4494 def is_transposed(self) -> bool: 

4495 return self.index_axes[0].axis == 1 

4496 

4497 @classmethod 

4498 def get_object(cls, obj, transposed: bool): 

4499 """these are written transposed""" 

4500 if transposed: 

4501 obj = obj.T 

4502 return obj 

4503 

4504 def read( 

4505 self, 

4506 where=None, 

4507 columns=None, 

4508 start: int | None = None, 

4509 stop: int | None = None, 

4510 ): 

4511 # validate the version 

4512 self.validate_version(where) 

4513 

4514 # infer the data kind 

4515 if not self.infer_axes(): 

4516 return None 

4517 

4518 result = self._read_axes(where=where, start=start, stop=stop) 

4519 

4520 info = ( 

4521 self.info.get(self.non_index_axes[0][0], {}) 

4522 if len(self.non_index_axes) 

4523 else {} 

4524 ) 

4525 

4526 inds = [i for i, ax in enumerate(self.axes) if ax is self.index_axes[0]] 

4527 assert len(inds) == 1 

4528 ind = inds[0] 

4529 

4530 index = result[ind][0] 

4531 

4532 frames = [] 

4533 for i, a in enumerate(self.axes): 

4534 if a not in self.values_axes: 

4535 continue 

4536 index_vals, cvalues = result[i] 

4537 

4538 # we could have a multi-index constructor here 

4539 # ensure_index doesn't recognize our list-of-tuples here

4540 if info.get("type") != "MultiIndex": 

4541 cols = Index(index_vals) 

4542 else: 

4543 cols = MultiIndex.from_tuples(index_vals) 

4544 

4545 names = info.get("names") 

4546 if names is not None: 

4547 cols.set_names(names, inplace=True) 

4548 

4549 if self.is_transposed: 

4550 values = cvalues 

4551 index_ = cols 

4552 cols_ = Index(index, name=getattr(index, "name", None)) 

4553 else: 

4554 values = cvalues.T 

4555 index_ = Index(index, name=getattr(index, "name", None)) 

4556 cols_ = cols 

4557 

4558 # if we have a DataIndexableCol, its shape will only be 1 dim 

4559 if values.ndim == 1 and isinstance(values, np.ndarray): 

4560 values = values.reshape((1, values.shape[0])) 

4561 

4562 if isinstance(values, np.ndarray): 

4563 df = DataFrame(values.T, columns=cols_, index=index_, copy=False) 

4564 elif isinstance(values, Index): 

4565 df = DataFrame(values, columns=cols_, index=index_) 

4566 else: 

4567 # Categorical 

4568 df = DataFrame._from_arrays([values], columns=cols_, index=index_) 

4569 assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) 

4570 frames.append(df) 

4571 

4572 if len(frames) == 1: 

4573 df = frames[0] 

4574 else: 

4575 df = concat(frames, axis=1) 

4576 

4577 selection = Selection(self, where=where, start=start, stop=stop) 

4578 # apply the selection filters & axis orderings 

4579 df = self.process_axes(df, selection=selection, columns=columns) 

4580 

4581 return df 

4582 

4583 

4584class AppendableSeriesTable(AppendableFrameTable): 

4585 """support the new appendable table formats""" 

4586 

4587 pandas_kind = "series_table" 

4588 table_type = "appendable_series" 

4589 ndim = 2 

4590 obj_type = Series 

4591 

4592 @property 

4593 def is_transposed(self) -> bool: 

4594 return False 

4595 

4596 @classmethod 

4597 def get_object(cls, obj, transposed: bool): 

4598 return obj 

4599 

4600 def write(self, obj, data_columns=None, **kwargs): 

4601 """we are going to write this as a frame table""" 

4602 if not isinstance(obj, DataFrame): 

4603 name = obj.name or "values" 

4604 obj = obj.to_frame(name) 

4605 return super().write(obj=obj, data_columns=obj.columns.tolist(), **kwargs) 

4606 

4607 def read( 

4608 self, 

4609 where=None, 

4610 columns=None, 

4611 start: int | None = None, 

4612 stop: int | None = None, 

4613 ) -> Series: 

4614 is_multi_index = self.is_multi_index 

4615 if columns is not None and is_multi_index: 

4616 assert isinstance(self.levels, list) # needed for mypy 

4617 for n in self.levels: 

4618 if n not in columns: 

4619 columns.insert(0, n) 

4620 s = super().read(where=where, columns=columns, start=start, stop=stop) 

4621 if is_multi_index: 

4622 s.set_index(self.levels, inplace=True) 

4623 

4624 s = s.iloc[:, 0] 

4625 

4626 # remove the default name 

4627 if s.name == "values": 

4628 s.name = None 

4629 return s 

4630 
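# --- editor's note: hedged usage sketch, standalone and not part of pandas ---
# AppendableSeriesTable writes a Series as a one-column frame (named "values"
# when the Series is unnamed) and strips that default name on read, as above.
# "series.h5" is a hypothetical file name.
import pandas as pd

s = pd.Series(range(3))                        # unnamed Series
s.to_hdf("series.h5", key="s", format="table")
back = pd.read_hdf("series.h5", "s")           # round-trips as a Series with name=None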

4631 

4632class AppendableMultiSeriesTable(AppendableSeriesTable): 

4633 """support the new appendable table formats""" 

4634 

4635 pandas_kind = "series_table" 

4636 table_type = "appendable_multiseries" 

4637 

4638 def write(self, obj, **kwargs): 

4639 """we are going to write this as a frame table""" 

4640 name = obj.name or "values" 

4641 newobj, self.levels = self.validate_multiindex(obj) 

4642 assert isinstance(self.levels, list) # for mypy 

4643 cols = list(self.levels) 

4644 cols.append(name) 

4645 newobj.columns = Index(cols) 

4646 return super().write(obj=newobj, **kwargs) 

4647 

4648 

4649class GenericTable(AppendableFrameTable): 

4650 """a table that read/writes the generic pytables table format""" 

4651 

4652 pandas_kind = "frame_table" 

4653 table_type = "generic_table" 

4654 ndim = 2 

4655 obj_type = DataFrame 

4656 levels: list[Hashable] 

4657 

4658 @property 

4659 def pandas_type(self) -> str: 

4660 return self.pandas_kind 

4661 

4662 @property 

4663 def storable(self): 

4664 return getattr(self.group, "table", None) or self.group 

4665 

4666 def get_attrs(self) -> None: 

4667 """retrieve our attributes""" 

4668 self.non_index_axes = [] 

4669 self.nan_rep = None 

4670 self.levels = [] 

4671 

4672 self.index_axes = [a for a in self.indexables if a.is_an_indexable] 

4673 self.values_axes = [a for a in self.indexables if not a.is_an_indexable] 

4674 self.data_columns = [a.name for a in self.values_axes] 

4675 

4676 @cache_readonly 

4677 def indexables(self): 

4678 """create the indexables from the table description""" 

4679 d = self.description 

4680 

4681 # TODO: can we get a typ for this? AFAICT it is the only place 

4682 # where we aren't passing one 

4683 # the index columns is just a simple index 

4684 md = self.read_metadata("index") 

4685 meta = "category" if md is not None else None 

4686 index_col = GenericIndexCol( 

4687 name="index", axis=0, table=self.table, meta=meta, metadata=md 

4688 ) 

4689 

4690 _indexables: list[GenericIndexCol | GenericDataIndexableCol] = [index_col] 

4691 

4692 for i, n in enumerate(d._v_names): 

4693 assert isinstance(n, str) 

4694 

4695 atom = getattr(d, n) 

4696 md = self.read_metadata(n) 

4697 meta = "category" if md is not None else None 

4698 dc = GenericDataIndexableCol( 

4699 name=n, 

4700 pos=i, 

4701 values=[n], 

4702 typ=atom, 

4703 table=self.table, 

4704 meta=meta, 

4705 metadata=md, 

4706 ) 

4707 _indexables.append(dc) 

4708 

4709 return _indexables 

4710 

4711 def write(self, **kwargs): 

4712 raise NotImplementedError("cannot write on a generic table")

4713 

4714 

4715class AppendableMultiFrameTable(AppendableFrameTable): 

4716 """a frame with a multi-index""" 

4717 

4718 table_type = "appendable_multiframe" 

4719 obj_type = DataFrame 

4720 ndim = 2 

4721 _re_levels = re.compile(r"^level_\d+$") 

4722 

4723 @property 

4724 def table_type_short(self) -> str: 

4725 return "appendable_multi" 

4726 

4727 def write(self, obj, data_columns=None, **kwargs): 

4728 if data_columns is None: 

4729 data_columns = [] 

4730 elif data_columns is True: 

4731 data_columns = obj.columns.tolist() 

4732 obj, self.levels = self.validate_multiindex(obj) 

4733 assert isinstance(self.levels, list) # for mypy 

4734 for n in self.levels: 

4735 if n not in data_columns: 

4736 data_columns.insert(0, n) 

4737 return super().write(obj=obj, data_columns=data_columns, **kwargs) 

4738 

4739 def read( 

4740 self, 

4741 where=None, 

4742 columns=None, 

4743 start: int | None = None, 

4744 stop: int | None = None, 

4745 ): 

4746 df = super().read(where=where, columns=columns, start=start, stop=stop) 

4747 df = df.set_index(self.levels) 

4748 

4749 # remove names for 'level_%d' 

4750 df.index = df.index.set_names( 

4751 [None if self._re_levels.search(name) else name for name in df.index.names] 

4752 ) 

4753 

4754 return df 

4755 
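# --- editor's note: hedged usage sketch, standalone and not part of pandas ---
# AppendableMultiFrameTable flattens the MultiIndex levels into data columns on
# write and restores them with set_index on read, so level names are queryable.
# "mi.h5" is a hypothetical file name.
import pandas as pd

idx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=["key", "num"])
df = pd.DataFrame({"x": range(4)}, index=idx)
with pd.HDFStore("mi.h5", mode="w") as store:
    store.append("mi", df)
    only_a = store.select("mi", where="key == 'a'")   # query on a level name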

4756 

4757def _reindex_axis( 

4758 obj: DataFrame, axis: AxisInt, labels: Index, other=None 

4759) -> DataFrame: 

4760 ax = obj._get_axis(axis) 

4761 labels = ensure_index(labels) 

4762 

4763 # try not to reindex even if other is provided 

4764 # if it equals our current index 

4765 if other is not None: 

4766 other = ensure_index(other) 

4767 if (other is None or labels.equals(other)) and labels.equals(ax): 

4768 return obj 

4769 

4770 labels = ensure_index(labels.unique()) 

4771 if other is not None: 

4772 labels = ensure_index(other.unique()).intersection(labels, sort=False) 

4773 if not labels.equals(ax): 

4774 slicer: list[slice | Index] = [slice(None, None)] * obj.ndim 

4775 slicer[axis] = labels 

4776 obj = obj.loc[tuple(slicer)] 

4777 return obj 

4778 

4779 

4780# tz to/from coercion 

4781 

4782 

4783def _get_tz(tz: tzinfo) -> str | tzinfo: 

4784 """for a tz-aware type, return an encoded zone""" 

4785 zone = timezones.get_timezone(tz) 

4786 return zone 

4787 

4788 

4789@overload 

4790def _set_tz( 

4791 values: np.ndarray | Index, tz: str | tzinfo, coerce: bool = False 

4792) -> DatetimeIndex: 

4793 ... 

4794 

4795 

4796@overload 

4797def _set_tz(values: np.ndarray | Index, tz: None, coerce: bool = False) -> np.ndarray: 

4798 ... 

4799 

4800 

4801def _set_tz( 

4802 values: np.ndarray | Index, tz: str | tzinfo | None, coerce: bool = False 

4803) -> np.ndarray | DatetimeIndex: 

4804 """ 

4805 coerce the values to a DatetimeIndex if tz is set 

4806 preserve the input shape if possible 

4807 

4808 Parameters 

4809 ---------- 

4810 values : ndarray or Index 

4811 tz : str or tzinfo 

4812 coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray 

4813 """ 

4814 if isinstance(values, DatetimeIndex): 

4815 # If values is tzaware, the tz gets dropped in the values.ravel() 

4816 # call below (which returns an ndarray). So we are only non-lossy 

4817 # if `tz` matches `values.tz`. 

4818 assert values.tz is None or values.tz == tz 

4819 

4820 if tz is not None: 

4821 if isinstance(values, DatetimeIndex): 

4822 name = values.name 

4823 values = values.asi8 

4824 else: 

4825 name = None 

4826 values = values.ravel() 

4827 

4828 tz = _ensure_decoded(tz) 

4829 values = DatetimeIndex(values, name=name) 

4830 values = values.tz_localize("UTC").tz_convert(tz) 

4831 elif coerce: 

4832 values = np.asarray(values, dtype="M8[ns]") 

4833 

4834 # error: Incompatible return value type (got "Union[ndarray, Index]", 

4835 # expected "Union[ndarray, DatetimeIndex]") 

4836 return values # type: ignore[return-value] 

4837 

4838 

4839def _convert_index(name: str, index: Index, encoding: str, errors: str) -> IndexCol: 

4840 assert isinstance(name, str) 

4841 

4842 index_name = index.name 

4843 # error: Argument 1 to "_get_data_and_dtype_name" has incompatible type "Index"; 

4844 # expected "Union[ExtensionArray, ndarray]" 

4845 converted, dtype_name = _get_data_and_dtype_name(index) # type: ignore[arg-type] 

4846 kind = _dtype_to_kind(dtype_name) 

4847 atom = DataIndexableCol._get_atom(converted) 

4848 

4849 if ( 

4850 (isinstance(index.dtype, np.dtype) and is_integer_dtype(index)) 

4851 or needs_i8_conversion(index.dtype) 

4852 or is_bool_dtype(index.dtype) 

4853 ): 

4854 # Includes Index, RangeIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex, 

4855 # in which case "kind" is "integer", "integer", "datetime64", 

4856 # "timedelta64", and "integer", respectively. 

4857 return IndexCol( 

4858 name, 

4859 values=converted, 

4860 kind=kind, 

4861 typ=atom, 

4862 freq=getattr(index, "freq", None), 

4863 tz=getattr(index, "tz", None), 

4864 index_name=index_name, 

4865 ) 

4866 

4867 if isinstance(index, MultiIndex): 

4868 raise TypeError("MultiIndex not supported here!") 

4869 

4870 inferred_type = lib.infer_dtype(index, skipna=False) 

4871 # we won't get inferred_type of "datetime64" or "timedelta64" as these 

4872 # would go through the DatetimeIndex/TimedeltaIndex paths above 

4873 

4874 values = np.asarray(index) 

4875 

4876 if inferred_type == "date": 

4877 converted = np.asarray([v.toordinal() for v in values], dtype=np.int32) 

4878 return IndexCol( 

4879 name, converted, "date", _tables().Time32Col(), index_name=index_name 

4880 ) 

4881 elif inferred_type == "string": 

4882 converted = _convert_string_array(values, encoding, errors) 

4883 itemsize = converted.dtype.itemsize 

4884 return IndexCol( 

4885 name, 

4886 converted, 

4887 "string", 

4888 _tables().StringCol(itemsize), 

4889 index_name=index_name, 

4890 ) 

4891 

4892 elif inferred_type in ["integer", "floating"]: 

4893 return IndexCol( 

4894 name, values=converted, kind=kind, typ=atom, index_name=index_name 

4895 ) 

4896 else: 

4897 assert isinstance(converted, np.ndarray) and converted.dtype == object 

4898 assert kind == "object", kind 

4899 atom = _tables().ObjectAtom() 

4900 return IndexCol(name, converted, kind, atom, index_name=index_name) 

4901 

4902 

4903def _unconvert_index(data, kind: str, encoding: str, errors: str) -> np.ndarray | Index: 

4904 index: Index | np.ndarray 

4905 

4906 if kind == "datetime64": 

4907 index = DatetimeIndex(data) 

4908 elif kind == "timedelta64": 

4909 index = TimedeltaIndex(data) 

4910 elif kind == "date": 

4911 try: 

4912 index = np.asarray([date.fromordinal(v) for v in data], dtype=object) 

4913 except ValueError: 

4914 index = np.asarray([date.fromtimestamp(v) for v in data], dtype=object) 

4915 elif kind in ("integer", "float", "bool"): 

4916 index = np.asarray(data) 

4917 elif kind == "string": 

4918 index = _unconvert_string_array( 

4919 data, nan_rep=None, encoding=encoding, errors=errors 

4920 ) 

4921 elif kind == "object": 

4922 index = np.asarray(data[0]) 

4923 else: # pragma: no cover 

4924 raise ValueError(f"unrecognized index type {kind}") 

4925 return index 

4926 

4927 

4928def _maybe_convert_for_string_atom( 

4929 name: str, 

4930 bvalues: ArrayLike, 

4931 existing_col, 

4932 min_itemsize, 

4933 nan_rep, 

4934 encoding, 

4935 errors, 

4936 columns: list[str], 

4937): 

4938 if bvalues.dtype != object: 

4939 return bvalues 

4940 

4941 bvalues = cast(np.ndarray, bvalues) 

4942 

4943 dtype_name = bvalues.dtype.name 

4944 inferred_type = lib.infer_dtype(bvalues, skipna=False) 

4945 

4946 if inferred_type == "date": 

4947 raise TypeError("[date] is not implemented as a table column") 

4948 if inferred_type == "datetime": 

4949 # after GH#8260 

4950 # this only would be hit for a multi-timezone dtype which is an error 

4951 raise TypeError( 

4952 "too many timezones in this block, create separate data columns" 

4953 ) 

4954 

4955 if not (inferred_type == "string" or dtype_name == "object"): 

4956 return bvalues 

4957 

4958 mask = isna(bvalues) 

4959 data = bvalues.copy() 

4960 data[mask] = nan_rep 

4961 

4962 # see if we have a valid string type 

4963 inferred_type = lib.infer_dtype(data, skipna=False) 

4964 if inferred_type != "string": 

4965 # we cannot serialize this data, so report an exception on a column 

4966 # by column basis 

4967 

4968 # expected behaviour: 

4969 # search block for a non-string object column by column 

4970 for i in range(data.shape[0]): 

4971 col = data[i] 

4972 inferred_type = lib.infer_dtype(col, skipna=False) 

4973 if inferred_type != "string": 

4974 error_column_label = columns[i] if len(columns) > i else f"No.{i}" 

4975 raise TypeError( 

4976 f"Cannot serialize the column [{error_column_label}]\n" 

4977 f"because its data contents are not [string] but " 

4978 f"[{inferred_type}] object dtype" 

4979 ) 

4980 

4981 # itemsize is the maximum length of a string (along any dimension) 

4982 

4983 data_converted = _convert_string_array(data, encoding, errors).reshape(data.shape) 

4984 itemsize = data_converted.itemsize 

4985 

4986 # specified min_itemsize? 

4987 if isinstance(min_itemsize, dict): 

4988 min_itemsize = int(min_itemsize.get(name) or min_itemsize.get("values") or 0) 

4989 itemsize = max(min_itemsize or 0, itemsize) 

4990 

4991 # check for column in the values conflicts 

4992 if existing_col is not None: 

4993 eci = existing_col.validate_col(itemsize) 

4994 if eci is not None and eci > itemsize: 

4995 itemsize = eci 

4996 

4997 data_converted = data_converted.astype(f"|S{itemsize}", copy=False) 

4998 return data_converted 

4999 
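# --- editor's note: hedged usage sketch, standalone and not part of pandas ---
# _maybe_convert_for_string_atom() fixes the byte width of string columns at
# first write; reserving extra room with min_itemsize lets later appends hold
# longer strings.  "strings.h5" is a hypothetical file name.
import pandas as pd

df1 = pd.DataFrame({"s": ["short"]})
df2 = pd.DataFrame({"s": ["a much longer string than the first one"]})
with pd.HDFStore("strings.h5", mode="w") as store:
    store.append("t", df1, min_itemsize={"s": 60})   # reserve 60 bytes for column "s"
    store.append("t", df2)                           # fits inside the reserved width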

5000 

5001def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.ndarray: 

5002 """ 

5003 Take a string-like that is object dtype and coerce to a fixed size string type. 

5004 

5005 Parameters 

5006 ---------- 

5007 data : np.ndarray[object] 

5008 encoding : str 

5009 errors : str 

5010 Handler for encoding errors. 

5011 

5012 Returns 

5013 ------- 

5014 np.ndarray[fixed-length-string] 

5015 """ 

5016 # encode if needed 

5017 if len(data): 

5018 data = ( 

5019 Series(data.ravel(), copy=False) 

5020 .str.encode(encoding, errors) 

5021 ._values.reshape(data.shape) 

5022 ) 

5023 

5024 # create the sized dtype 

5025 ensured = ensure_object(data.ravel()) 

5026 itemsize = max(1, libwriters.max_len_string_array(ensured)) 

5027 

5028 data = np.asarray(data, dtype=f"S{itemsize}") 

5029 return data 

5030 

5031 

5032def _unconvert_string_array( 

5033 data: np.ndarray, nan_rep, encoding: str, errors: str 

5034) -> np.ndarray: 

5035 """ 

5036 Inverse of _convert_string_array. 

5037 

5038 Parameters 

5039 ---------- 

5040 data : np.ndarray[fixed-length-string] 

5041 nan_rep : the storage repr of NaN 

5042 encoding : str 

5043 errors : str 

5044 Handler for encoding errors. 

5045 

5046 Returns 

5047 ------- 

5048 np.ndarray[object] 

5049 Decoded data. 

5050 """ 

5051 shape = data.shape 

5052 data = np.asarray(data.ravel(), dtype=object) 

5053 

5054 if len(data): 

5055 itemsize = libwriters.max_len_string_array(ensure_object(data)) 

5056 dtype = f"U{itemsize}" 

5057 

5058 if isinstance(data[0], bytes): 

5059 data = Series(data, copy=False).str.decode(encoding, errors=errors)._values 

5060 else: 

5061 data = data.astype(dtype, copy=False).astype(object, copy=False) 

5062 

5063 if nan_rep is None: 

5064 nan_rep = "nan" 

5065 

5066 libwriters.string_array_replace_from_nan_rep(data, nan_rep) 

5067 return data.reshape(shape) 

5068 
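# --- editor's note: hedged round-trip sketch, standalone and not part of pandas ---
# _convert_string_array / _unconvert_string_array are private helpers; calling
# them directly here is for illustration only.
import numpy as np
from pandas.io.pytables import _convert_string_array, _unconvert_string_array

raw = np.array(["ab", "c"], dtype=object)
packed = _convert_string_array(raw, encoding="UTF-8", errors="strict")   # fixed-width "S2"
back = _unconvert_string_array(packed, nan_rep=None, encoding="UTF-8", errors="strict")
assert list(back) == ["ab", "c"]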

5069 

5070def _maybe_convert(values: np.ndarray, val_kind: str, encoding: str, errors: str): 

5071 assert isinstance(val_kind, str), type(val_kind) 

5072 if _need_convert(val_kind): 

5073 conv = _get_converter(val_kind, encoding, errors) 

5074 values = conv(values) 

5075 return values 

5076 

5077 

5078def _get_converter(kind: str, encoding: str, errors: str): 

5079 if kind == "datetime64": 

5080 return lambda x: np.asarray(x, dtype="M8[ns]") 

5081 elif kind == "string": 

5082 return lambda x: _unconvert_string_array( 

5083 x, nan_rep=None, encoding=encoding, errors=errors 

5084 ) 

5085 else: # pragma: no cover 

5086 raise ValueError(f"invalid kind {kind}") 

5087 

5088 

5089def _need_convert(kind: str) -> bool: 

5090 if kind in ("datetime64", "string"): 

5091 return True 

5092 return False 

5093 

5094 

5095def _maybe_adjust_name(name: str, version: Sequence[int]) -> str: 

5096 """ 

5097 Prior to 0.10.1, we named values blocks like: values_block_0 and the

5098 name values_0; adjust the given name if necessary.

5099 

5100 Parameters 

5101 ---------- 

5102 name : str 

5103 version : Tuple[int, int, int] 

5104 

5105 Returns 

5106 ------- 

5107 str 

5108 """ 

5109 if isinstance(version, str) or len(version) < 3: 

5110 raise ValueError("Version is incorrect, expected sequence of 3 integers.") 

5111 

5112 if version[0] == 0 and version[1] <= 10 and version[2] == 0: 

5113 m = re.search(r"values_block_(\d+)", name) 

5114 if m: 

5115 grp = m.groups()[0] 

5116 name = f"values_{grp}" 

5117 return name 

5118 

5119 

5120def _dtype_to_kind(dtype_str: str) -> str: 

5121 """ 

5122 Find the "kind" string describing the given dtype name. 

5123 """ 

5124 dtype_str = _ensure_decoded(dtype_str) 

5125 

5126 if dtype_str.startswith("string") or dtype_str.startswith("bytes"): 

5127 kind = "string" 

5128 elif dtype_str.startswith("float"): 

5129 kind = "float" 

5130 elif dtype_str.startswith("complex"): 

5131 kind = "complex" 

5132 elif dtype_str.startswith("int") or dtype_str.startswith("uint"): 

5133 kind = "integer" 

5134 elif dtype_str.startswith("datetime64"): 

5135 kind = "datetime64" 

5136 elif dtype_str.startswith("timedelta"): 

5137 kind = "timedelta64" 

5138 elif dtype_str.startswith("bool"): 

5139 kind = "bool" 

5140 elif dtype_str.startswith("category"): 

5141 kind = "category" 

5142 elif dtype_str.startswith("period"): 

5143 # We store the `freq` attr so we can restore from integers 

5144 kind = "integer" 

5145 elif dtype_str == "object": 

5146 kind = "object" 

5147 else: 

5148 raise ValueError(f"cannot interpret dtype of [{dtype_str}]") 

5149 

5150 return kind 

5151 
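# --- editor's note: hedged sketch, standalone and not part of pandas ---
# A few concrete mappings implied by the branches above.  _dtype_to_kind is a
# private helper; importing it here is for illustration only.
from pandas.io.pytables import _dtype_to_kind

assert _dtype_to_kind("int64") == "integer"
assert _dtype_to_kind("datetime64[ns]") == "datetime64"
assert _dtype_to_kind("period[M]") == "integer"   # periods store integers plus a freq attr
assert _dtype_to_kind("object") == "object"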

5152 

5153def _get_data_and_dtype_name(data: ArrayLike): 

5154 """ 

5155 Convert the passed data into a storable form and a dtype string. 

5156 """ 

5157 if isinstance(data, Categorical): 

5158 data = data.codes 

5159 

5160 # For datetime64tz we need to drop the TZ in tests TODO: why? 

5161 dtype_name = data.dtype.name.split("[")[0] 

5162 

5163 if data.dtype.kind in ["m", "M"]: 

5164 data = np.asarray(data.view("i8")) 

5165 # TODO: we used to reshape for the dt64tz case, but no longer 

5166 # doing that doesn't seem to break anything. why? 

5167 

5168 elif isinstance(data, PeriodIndex): 

5169 data = data.asi8 

5170 

5171 data = np.asarray(data) 

5172 return data, dtype_name 

5173 

5174 

5175class Selection: 

5176 """ 

5177 Carries out a selection operation on a tables.Table object. 

5178 

5179 Parameters 

5180 ---------- 

5181 table : a Table object 

5182 where : list of Terms (or convertible to) 

5183 start, stop: indices to start and/or stop selection 

5184 

5185 """ 

5186 

5187 def __init__( 

5188 self, 

5189 table: Table, 

5190 where=None, 

5191 start: int | None = None, 

5192 stop: int | None = None, 

5193 ) -> None: 

5194 self.table = table 

5195 self.where = where 

5196 self.start = start 

5197 self.stop = stop 

5198 self.condition = None 

5199 self.filter = None 

5200 self.terms = None 

5201 self.coordinates = None 

5202 

5203 if is_list_like(where): 

5204 # see if we have a passed coordinate like 

5205 with suppress(ValueError): 

5206 inferred = lib.infer_dtype(where, skipna=False) 

5207 if inferred in ("integer", "boolean"): 

5208 where = np.asarray(where) 

5209 if where.dtype == np.bool_: 

5210 start, stop = self.start, self.stop 

5211 if start is None: 

5212 start = 0 

5213 if stop is None: 

5214 stop = self.table.nrows 

5215 self.coordinates = np.arange(start, stop)[where] 

5216 elif issubclass(where.dtype.type, np.integer): 

5217 if (self.start is not None and (where < self.start).any()) or ( 

5218 self.stop is not None and (where >= self.stop).any() 

5219 ): 

5220 raise ValueError( 

5221 "where must have index locations >= start and < stop" 

5222 ) 

5223 self.coordinates = where 

5224 

5225 if self.coordinates is None: 

5226 self.terms = self.generate(where) 

5227 

5228 # create the numexpr & the filter 

5229 if self.terms is not None: 

5230 self.condition, self.filter = self.terms.evaluate() 

5231 

5232 def generate(self, where): 

5233 """where can be a : dict,list,tuple,string""" 

5234 if where is None: 

5235 return None 

5236 

5237 q = self.table.queryables() 

5238 try: 

5239 return PyTablesExpr(where, queryables=q, encoding=self.table.encoding) 

5240 except NameError as err: 

5241 # raise a nice message, suggesting that the user should use 

5242 # data_columns 

5243 qkeys = ",".join(q.keys()) 

5244 msg = dedent( 

5245 f"""\ 

5246 The passed where expression: {where} 

5247 contains an invalid variable reference 

5248 all of the variable references must be a reference to 

5249 an axis (e.g. 'index' or 'columns'), or a data_column 

5250 The currently defined references are: {qkeys} 

5251 """ 

5252 ) 

5253 raise ValueError(msg) from err 

5254 

5255 def select(self): 

5256 """ 

5257 generate the selection 

5258 """ 

5259 if self.condition is not None: 

5260 return self.table.table.read_where( 

5261 self.condition.format(), start=self.start, stop=self.stop 

5262 ) 

5263 elif self.coordinates is not None: 

5264 return self.table.table.read_coordinates(self.coordinates) 

5265 return self.table.table.read(start=self.start, stop=self.stop) 

5266 

5267 def select_coords(self): 

5268 """ 

5269 generate the selection 

5270 """ 

5271 start, stop = self.start, self.stop 

5272 nrows = self.table.nrows 

5273 if start is None: 

5274 start = 0 

5275 elif start < 0: 

5276 start += nrows 

5277 if stop is None: 

5278 stop = nrows 

5279 elif stop < 0: 

5280 stop += nrows 

5281 

5282 if self.condition is not None: 

5283 return self.table.table.get_where_list( 

5284 self.condition.format(), start=start, stop=stop, sort=True 

5285 ) 

5286 elif self.coordinates is not None: 

5287 return self.coordinates 

5288 

5289 return np.arange(start, stop)
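# --- editor's note: hedged usage sketch, standalone and not part of pandas ---
# Selection accepts either a query string (compiled to a PyTablesExpr) or a
# list-like of integer/boolean coordinates, plus start/stop bounds.  All of
# these reach it through HDFStore.select(); "demo.h5" is a hypothetical file name.
import pandas as pd

df = pd.DataFrame({"A": range(10)})
with pd.HDFStore("demo.h5", mode="w") as store:
    store.append("df", df, data_columns=["A"])
    store.select("df", where="A > 5", start=0, stop=8)   # string -> PyTablesExpr condition
    store.select("df", where=[0, 2, 4])                  # integer list -> coordinates path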