"""
High level interface to PyTables for reading and writing pandas data structures
to disk
"""
5from __future__ import annotations
6
7from contextlib import suppress
8import copy
9from datetime import (
10 date,
11 tzinfo,
12)
13import itertools
14import os
15import re
16from textwrap import dedent
17from typing import (
18 TYPE_CHECKING,
19 Any,
20 Callable,
21 Final,
22 Literal,
23 cast,
24 overload,
25)
26import warnings
27
28import numpy as np
29
30from pandas._config import (
31 config,
32 get_option,
33 using_copy_on_write,
34 using_pyarrow_string_dtype,
35)
36
37from pandas._libs import (
38 lib,
39 writers as libwriters,
40)
41from pandas._libs.lib import is_string_array
42from pandas._libs.tslibs import timezones
43from pandas.compat._optional import import_optional_dependency
44from pandas.compat.pickle_compat import patch_pickle
45from pandas.errors import (
46 AttributeConflictWarning,
47 ClosedFileError,
48 IncompatibilityWarning,
49 PerformanceWarning,
50 PossibleDataLossError,
51)
52from pandas.util._decorators import cache_readonly
53from pandas.util._exceptions import find_stack_level
54
55from pandas.core.dtypes.common import (
56 ensure_object,
57 is_bool_dtype,
58 is_complex_dtype,
59 is_list_like,
60 is_string_dtype,
61 needs_i8_conversion,
62)
63from pandas.core.dtypes.dtypes import (
64 CategoricalDtype,
65 DatetimeTZDtype,
66 ExtensionDtype,
67 PeriodDtype,
68)
69from pandas.core.dtypes.missing import array_equivalent
70
71from pandas import (
72 DataFrame,
73 DatetimeIndex,
74 Index,
75 MultiIndex,
76 PeriodIndex,
77 RangeIndex,
78 Series,
79 TimedeltaIndex,
80 concat,
81 isna,
82)
83from pandas.core.arrays import (
84 Categorical,
85 DatetimeArray,
86 PeriodArray,
87)
88import pandas.core.common as com
89from pandas.core.computation.pytables import (
90 PyTablesExpr,
91 maybe_expression,
92)
93from pandas.core.construction import extract_array
94from pandas.core.indexes.api import ensure_index
95from pandas.core.internals import (
96 ArrayManager,
97 BlockManager,
98)
99
100from pandas.io.common import stringify_path
101from pandas.io.formats.printing import (
102 adjoin,
103 pprint_thing,
104)
105
106if TYPE_CHECKING:
107 from collections.abc import (
108 Hashable,
109 Iterator,
110 Sequence,
111 )
112 from types import TracebackType
113
114 from tables import (
115 Col,
116 File,
117 Node,
118 )
119
120 from pandas._typing import (
121 AnyArrayLike,
122 ArrayLike,
123 AxisInt,
124 DtypeArg,
125 FilePath,
126 Self,
127 Shape,
128 npt,
129 )
130
131 from pandas.core.internals import Block
132
133# versioning attribute
134_version = "0.15.2"
135
136# encoding
137_default_encoding = "UTF-8"
138
139
140def _ensure_decoded(s):
141 """if we have bytes, decode them to unicode"""
142 if isinstance(s, np.bytes_):
143 s = s.decode("UTF-8")
144 return s
145
146
147def _ensure_encoding(encoding: str | None) -> str:
148 # set the encoding if we need
149 if encoding is None:
150 encoding = _default_encoding
151
152 return encoding
153
154
155def _ensure_str(name):
156 """
157 Ensure that an index / column name is a str (python 3); otherwise they
158 may be np.string dtype. Non-string dtypes are passed through unchanged.
159
160 https://github.com/pandas-dev/pandas/issues/13492
161 """
162 if isinstance(name, str):
163 name = str(name)
164 return name
165
166
167Term = PyTablesExpr
168
169
170def _ensure_term(where, scope_level: int):
171 """
172 Ensure that the where is a Term or a list of Term.
173
    This makes sure that we are capturing the scope of the variables that
    are passed; the terms are created here with a frame_level=2 (we are
    2 levels down).
176 """
177 # only consider list/tuple here as an ndarray is automatically a coordinate
178 # list
179 level = scope_level + 1
180 if isinstance(where, (list, tuple)):
181 where = [
182 Term(term, scope_level=level + 1) if maybe_expression(term) else term
183 for term in where
184 if term is not None
185 ]
186 elif maybe_expression(where):
187 where = Term(where, scope_level=level)
188 return where if where is None or len(where) else None
189
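# Illustrative only: a ``where`` argument may be a string expression, a Term,
# or a list of these (the key and column names below are hypothetical), e.g.
#   store.select("df", where="index > 5")
#   store.select("df", where=[Term("index > 5"), "columns == ['A', 'B']"])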
190
191incompatibility_doc: Final = """
where criteria are being ignored as this version [%s] is too old (or
not-defined); read the file in and write it out to a new file to upgrade (with
the copy method)
195"""
196
197attribute_conflict_doc: Final = """
198the [%s] attribute of the existing index is [%s] which conflicts with the new
199[%s], resetting the attribute to None
200"""
201
202performance_doc: Final = """
203your performance may suffer as PyTables will pickle object types that it cannot
204map directly to c-types [inferred_type->%s,key->%s] [items->%s]
205"""
206
207# formats
208_FORMAT_MAP = {"f": "fixed", "fixed": "fixed", "t": "table", "table": "table"}
209
210# axes map
211_AXES_MAP = {DataFrame: [0]}
212
213# register our configuration options
214dropna_doc: Final = """
215: boolean
216 drop ALL nan rows when appending to a table
217"""
218format_doc: Final = """
219: format
    default writing format; if None, then
221 put will default to 'fixed' and append will default to 'table'
222"""
223
224with config.config_prefix("io.hdf"):
225 config.register_option("dropna_table", False, dropna_doc, validator=config.is_bool)
226 config.register_option(
227 "default_format",
228 None,
229 format_doc,
230 validator=config.is_one_of_factory(["fixed", "table", None]),
231 )
232
233# oh the troubles to reduce import time
234_table_mod = None
235_table_file_open_policy_is_strict = False
236
237
238def _tables():
239 global _table_mod
240 global _table_file_open_policy_is_strict
241 if _table_mod is None:
242 import tables
243
244 _table_mod = tables
245
246 # set the file open policy
247 # return the file open policy; this changes as of pytables 3.1
248 # depending on the HDF5 version
249 with suppress(AttributeError):
250 _table_file_open_policy_is_strict = (
251 tables.file._FILE_OPEN_POLICY == "strict"
252 )
253
254 return _table_mod
255
256
257# interface to/from ###
258
259
260def to_hdf(
261 path_or_buf: FilePath | HDFStore,
262 key: str,
263 value: DataFrame | Series,
264 mode: str = "a",
265 complevel: int | None = None,
266 complib: str | None = None,
267 append: bool = False,
268 format: str | None = None,
269 index: bool = True,
270 min_itemsize: int | dict[str, int] | None = None,
271 nan_rep=None,
272 dropna: bool | None = None,
273 data_columns: Literal[True] | list[str] | None = None,
274 errors: str = "strict",
275 encoding: str = "UTF-8",
276) -> None:
277 """store this object, close it if we opened it"""
278 if append:
279 f = lambda store: store.append(
280 key,
281 value,
282 format=format,
283 index=index,
284 min_itemsize=min_itemsize,
285 nan_rep=nan_rep,
286 dropna=dropna,
287 data_columns=data_columns,
288 errors=errors,
289 encoding=encoding,
290 )
291 else:
292 # NB: dropna is not passed to `put`
293 f = lambda store: store.put(
294 key,
295 value,
296 format=format,
297 index=index,
298 min_itemsize=min_itemsize,
299 nan_rep=nan_rep,
300 data_columns=data_columns,
301 errors=errors,
302 encoding=encoding,
303 dropna=dropna,
304 )
305
306 path_or_buf = stringify_path(path_or_buf)
307 if isinstance(path_or_buf, str):
308 with HDFStore(
309 path_or_buf, mode=mode, complevel=complevel, complib=complib
310 ) as store:
311 f(store)
312 else:
313 f(path_or_buf)
314
315
316def read_hdf(
317 path_or_buf: FilePath | HDFStore,
318 key=None,
319 mode: str = "r",
320 errors: str = "strict",
321 where: str | list | None = None,
322 start: int | None = None,
323 stop: int | None = None,
324 columns: list[str] | None = None,
325 iterator: bool = False,
326 chunksize: int | None = None,
327 **kwargs,
328):
329 """
330 Read from the store, close it if we opened it.
331
332 Retrieve pandas object stored in file, optionally based on where
333 criteria.
334
335 .. warning::
336
337 Pandas uses PyTables for reading and writing HDF5 files, which allows
338 serializing object-dtype data with pickle when using the "fixed" format.
339 Loading pickled data received from untrusted sources can be unsafe.
340
341 See: https://docs.python.org/3/library/pickle.html for more.
342
343 Parameters
344 ----------
345 path_or_buf : str, path object, pandas.HDFStore
        Any valid string path is acceptable. Only supports the local file system;
        remote URLs and file-like objects are not supported.
348
349 If you want to pass in a path object, pandas accepts any
350 ``os.PathLike``.
351
352 Alternatively, pandas accepts an open :class:`pandas.HDFStore` object.
353
354 key : object, optional
355 The group identifier in the store. Can be omitted if the HDF file
356 contains a single pandas object.
357 mode : {'r', 'r+', 'a'}, default 'r'
358 Mode to use when opening the file. Ignored if path_or_buf is a
359 :class:`pandas.HDFStore`. Default is 'r'.
360 errors : str, default 'strict'
361 Specifies how encoding and decoding errors are to be handled.
362 See the errors argument for :func:`open` for a full list
363 of options.
364 where : list, optional
365 A list of Term (or convertible) objects.
366 start : int, optional
367 Row number to start selection.
368 stop : int, optional
369 Row number to stop selection.
370 columns : list, optional
371 A list of columns names to return.
372 iterator : bool, optional
373 Return an iterator object.
374 chunksize : int, optional
375 Number of rows to include in an iteration when using an iterator.
376 **kwargs
377 Additional keyword arguments passed to HDFStore.
378
379 Returns
380 -------
381 object
382 The selected object. Return type depends on the object stored.
383
384 See Also
385 --------
386 DataFrame.to_hdf : Write a HDF file from a DataFrame.
387 HDFStore : Low-level access to HDF files.
388
389 Examples
390 --------
391 >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z']) # doctest: +SKIP
392 >>> df.to_hdf('./store.h5', 'data') # doctest: +SKIP
393 >>> reread = pd.read_hdf('./store.h5') # doctest: +SKIP
394 """
395 if mode not in ["r", "r+", "a"]:
396 raise ValueError(
397 f"mode {mode} is not allowed while performing a read. "
398 f"Allowed modes are r, r+ and a."
399 )
400 # grab the scope
401 if where is not None:
402 where = _ensure_term(where, scope_level=1)
403
404 if isinstance(path_or_buf, HDFStore):
405 if not path_or_buf.is_open:
406 raise OSError("The HDFStore must be open for reading.")
407
408 store = path_or_buf
409 auto_close = False
410 else:
411 path_or_buf = stringify_path(path_or_buf)
412 if not isinstance(path_or_buf, str):
413 raise NotImplementedError(
414 "Support for generic buffers has not been implemented."
415 )
416 try:
417 exists = os.path.exists(path_or_buf)
418
419 # if filepath is too long
420 except (TypeError, ValueError):
421 exists = False
422
423 if not exists:
424 raise FileNotFoundError(f"File {path_or_buf} does not exist")
425
426 store = HDFStore(path_or_buf, mode=mode, errors=errors, **kwargs)
427 # can't auto open/close if we are using an iterator
428 # so delegate to the iterator
429 auto_close = True
430
431 try:
432 if key is None:
433 groups = store.groups()
434 if len(groups) == 0:
435 raise ValueError(
436 "Dataset(s) incompatible with Pandas data types, "
437 "not table, or no datasets found in HDF5 file."
438 )
439 candidate_only_group = groups[0]
440
441 # For the HDF file to have only one dataset, all other groups
442 # should then be metadata groups for that candidate group. (This
443 # assumes that the groups() method enumerates parent groups
444 # before their children.)
445 for group_to_check in groups[1:]:
446 if not _is_metadata_of(group_to_check, candidate_only_group):
447 raise ValueError(
448 "key must be provided when HDF5 "
449 "file contains multiple datasets."
450 )
451 key = candidate_only_group._v_pathname
452 return store.select(
453 key,
454 where=where,
455 start=start,
456 stop=stop,
457 columns=columns,
458 iterator=iterator,
459 chunksize=chunksize,
460 auto_close=auto_close,
461 )
462 except (ValueError, TypeError, LookupError):
463 if not isinstance(path_or_buf, HDFStore):
464 # if there is an error, close the store if we opened it.
465 with suppress(AttributeError):
466 store.close()
467
468 raise
469
470
471def _is_metadata_of(group: Node, parent_group: Node) -> bool:
472 """Check if a given group is a metadata group for a given parent_group."""
473 if group._v_depth <= parent_group._v_depth:
474 return False
475
476 current = group
477 while current._v_depth > 1:
478 parent = current._v_parent
479 if parent == parent_group and current._v_name == "meta":
480 return True
481 current = current._v_parent
482 return False
483
484
485class HDFStore:
486 """
487 Dict-like IO interface for storing pandas objects in PyTables.
488
489 Either Fixed or Table format.
490
491 .. warning::
492
493 Pandas uses PyTables for reading and writing HDF5 files, which allows
494 serializing object-dtype data with pickle when using the "fixed" format.
495 Loading pickled data received from untrusted sources can be unsafe.
496
497 See: https://docs.python.org/3/library/pickle.html for more.
498
499 Parameters
500 ----------
501 path : str
502 File path to HDF5 file.
503 mode : {'a', 'w', 'r', 'r+'}, default 'a'
504
505 ``'r'``
506 Read-only; no data can be modified.
507 ``'w'``
508 Write; a new file is created (an existing file with the same
509 name would be deleted).
510 ``'a'``
511 Append; an existing file is opened for reading and writing,
512 and if the file does not exist it is created.
513 ``'r+'``
514 It is similar to ``'a'``, but the file must already exist.
515 complevel : int, 0-9, default None
516 Specifies a compression level for data.
517 A value of 0 or None disables compression.
518 complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
519 Specifies the compression library to be used.
520 These additional compressors for Blosc are supported
521 (default if no compressor specified: 'blosc:blosclz'):
522 {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
523 'blosc:zlib', 'blosc:zstd'}.
524 Specifying a compression library which is not available issues
525 a ValueError.
526 fletcher32 : bool, default False
527 If applying compression use the fletcher32 checksum.
528 **kwargs
529 These parameters will be passed to the PyTables open_file method.
530
531 Examples
532 --------
533 >>> bar = pd.DataFrame(np.random.randn(10, 4))
534 >>> store = pd.HDFStore('test.h5')
535 >>> store['foo'] = bar # write to HDF5
536 >>> bar = store['foo'] # retrieve
537 >>> store.close()
538
539 **Create or load HDF5 file in-memory**
540
541 When passing the `driver` option to the PyTables open_file method through
542 **kwargs, the HDF5 file is loaded or created in-memory and will only be
543 written when closed:
544
545 >>> bar = pd.DataFrame(np.random.randn(10, 4))
546 >>> store = pd.HDFStore('test.h5', driver='H5FD_CORE')
547 >>> store['foo'] = bar
548 >>> store.close() # only now, data is written to disk
549 """
550
551 _handle: File | None
552 _mode: str
553
554 def __init__(
555 self,
556 path,
557 mode: str = "a",
558 complevel: int | None = None,
559 complib=None,
560 fletcher32: bool = False,
561 **kwargs,
562 ) -> None:
563 if "format" in kwargs:
564 raise ValueError("format is not a defined argument for HDFStore")
565
566 tables = import_optional_dependency("tables")
567
568 if complib is not None and complib not in tables.filters.all_complibs:
569 raise ValueError(
570 f"complib only supports {tables.filters.all_complibs} compression."
571 )
572
573 if complib is None and complevel is not None:
574 complib = tables.filters.default_complib
575
576 self._path = stringify_path(path)
577 if mode is None:
578 mode = "a"
579 self._mode = mode
580 self._handle = None
581 self._complevel = complevel if complevel else 0
582 self._complib = complib
583 self._fletcher32 = fletcher32
584 self._filters = None
585 self.open(mode=mode, **kwargs)
586
587 def __fspath__(self) -> str:
588 return self._path
589
590 @property
591 def root(self):
592 """return the root node"""
593 self._check_if_open()
594 assert self._handle is not None # for mypy
595 return self._handle.root
596
597 @property
598 def filename(self) -> str:
599 return self._path
600
601 def __getitem__(self, key: str):
602 return self.get(key)
603
604 def __setitem__(self, key: str, value) -> None:
605 self.put(key, value)
606
607 def __delitem__(self, key: str) -> None:
608 return self.remove(key)
609
610 def __getattr__(self, name: str):
611 """allow attribute access to get stores"""
612 try:
613 return self.get(name)
614 except (KeyError, ClosedFileError):
615 pass
616 raise AttributeError(
617 f"'{type(self).__name__}' object has no attribute '{name}'"
618 )
619
620 def __contains__(self, key: str) -> bool:
621 """
622 check for existence of this key
        can match the exact pathname or the pathname w/o the leading '/'
624 """
625 node = self.get_node(key)
626 if node is not None:
627 name = node._v_pathname
628 if key in (name, name[1:]):
629 return True
630 return False
631
632 def __len__(self) -> int:
633 return len(self.groups())
634
635 def __repr__(self) -> str:
636 pstr = pprint_thing(self._path)
637 return f"{type(self)}\nFile path: {pstr}\n"
638
639 def __enter__(self) -> Self:
640 return self
641
642 def __exit__(
643 self,
644 exc_type: type[BaseException] | None,
645 exc_value: BaseException | None,
646 traceback: TracebackType | None,
647 ) -> None:
648 self.close()
649
650 def keys(self, include: str = "pandas") -> list[str]:
651 """
652 Return a list of keys corresponding to objects stored in HDFStore.
653
654 Parameters
655 ----------
656
        include : str, default 'pandas'
            When include equals 'pandas', return pandas objects.
            When include equals 'native', return native HDF5 Table objects.
660
661 Returns
662 -------
663 list
664 List of ABSOLUTE path-names (e.g. have the leading '/').
665
666 Raises
667 ------
        raises ValueError if include has an illegal value
669
670 Examples
671 --------
672 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
673 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
674 >>> store.put('data', df) # doctest: +SKIP
675 >>> store.get('data') # doctest: +SKIP
676 >>> print(store.keys()) # doctest: +SKIP
        ['/data']
678 >>> store.close() # doctest: +SKIP
679 """
680 if include == "pandas":
681 return [n._v_pathname for n in self.groups()]
682
683 elif include == "native":
684 assert self._handle is not None # mypy
685 return [
686 n._v_pathname for n in self._handle.walk_nodes("/", classname="Table")
687 ]
688 raise ValueError(
689 f"`include` should be either 'pandas' or 'native' but is '{include}'"
690 )
691
692 def __iter__(self) -> Iterator[str]:
693 return iter(self.keys())
694
695 def items(self) -> Iterator[tuple[str, list]]:
696 """
697 iterate on key->group
698 """
699 for g in self.groups():
700 yield g._v_pathname, g
701
702 def open(self, mode: str = "a", **kwargs) -> None:
703 """
704 Open the file in the specified mode
705
706 Parameters
707 ----------
708 mode : {'a', 'w', 'r', 'r+'}, default 'a'
709 See HDFStore docstring or tables.open_file for info about modes
710 **kwargs
711 These parameters will be passed to the PyTables open_file method.
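
        Examples
        --------
        A minimal sketch of reopening a store read-only; the file name is
        illustrative only.

        >>> store = pd.HDFStore("store.h5", 'w')  # doctest: +SKIP
        >>> store.close()  # doctest: +SKIP
        >>> store.open('r')  # doctest: +SKIP
        >>> store.is_open  # doctest: +SKIP
        True
        >>> store.close()  # doctest: +SKIP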
712 """
713 tables = _tables()
714
715 if self._mode != mode:
716 # if we are changing a write mode to read, ok
717 if self._mode in ["a", "w"] and mode in ["r", "r+"]:
718 pass
719 elif mode in ["w"]:
720 # this would truncate, raise here
721 if self.is_open:
722 raise PossibleDataLossError(
723 f"Re-opening the file [{self._path}] with mode [{self._mode}] "
724 "will delete the current file!"
725 )
726
727 self._mode = mode
728
729 # close and reopen the handle
730 if self.is_open:
731 self.close()
732
733 if self._complevel and self._complevel > 0:
734 self._filters = _tables().Filters(
735 self._complevel, self._complib, fletcher32=self._fletcher32
736 )
737
738 if _table_file_open_policy_is_strict and self.is_open:
739 msg = (
740 "Cannot open HDF5 file, which is already opened, "
741 "even in read-only mode."
742 )
743 raise ValueError(msg)
744
745 self._handle = tables.open_file(self._path, self._mode, **kwargs)
746
747 def close(self) -> None:
748 """
749 Close the PyTables file handle
750 """
751 if self._handle is not None:
752 self._handle.close()
753 self._handle = None
754
755 @property
756 def is_open(self) -> bool:
757 """
758 return a boolean indicating whether the file is open
759 """
760 if self._handle is None:
761 return False
762 return bool(self._handle.isopen)
763
764 def flush(self, fsync: bool = False) -> None:
765 """
766 Force all buffered modifications to be written to disk.
767
768 Parameters
769 ----------
770 fsync : bool (default False)
771 call ``os.fsync()`` on the file handle to force writing to disk.
772
773 Notes
774 -----
775 Without ``fsync=True``, flushing may not guarantee that the OS writes
776 to disk. With fsync, the operation will block until the OS claims the
777 file has been written; however, other caching layers may still
778 interfere.
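
        Examples
        --------
        A minimal sketch; the file name and key are illustrative only.

        >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
        >>> store = pd.HDFStore("store.h5", 'w')  # doctest: +SKIP
        >>> store.put('data', df)  # doctest: +SKIP
        >>> store.flush(fsync=True)  # doctest: +SKIP
        >>> store.close()  # doctest: +SKIP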
779 """
780 if self._handle is not None:
781 self._handle.flush()
782 if fsync:
783 with suppress(OSError):
784 os.fsync(self._handle.fileno())
785
786 def get(self, key: str):
787 """
788 Retrieve pandas object stored in file.
789
790 Parameters
791 ----------
792 key : str
793
794 Returns
795 -------
796 object
797 Same type as object stored in file.
798
799 Examples
800 --------
801 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
802 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
803 >>> store.put('data', df) # doctest: +SKIP
804 >>> store.get('data') # doctest: +SKIP
805 >>> store.close() # doctest: +SKIP
806 """
807 with patch_pickle():
808 # GH#31167 Without this patch, pickle doesn't know how to unpickle
809 # old DateOffset objects now that they are cdef classes.
810 group = self.get_node(key)
811 if group is None:
812 raise KeyError(f"No object named {key} in the file")
813 return self._read_group(group)
814
815 def select(
816 self,
817 key: str,
818 where=None,
819 start=None,
820 stop=None,
821 columns=None,
822 iterator: bool = False,
823 chunksize: int | None = None,
824 auto_close: bool = False,
825 ):
826 """
827 Retrieve pandas object stored in file, optionally based on where criteria.
828
829 .. warning::
830
831 Pandas uses PyTables for reading and writing HDF5 files, which allows
832 serializing object-dtype data with pickle when using the "fixed" format.
833 Loading pickled data received from untrusted sources can be unsafe.
834
835 See: https://docs.python.org/3/library/pickle.html for more.
836
837 Parameters
838 ----------
839 key : str
840 Object being retrieved from file.
841 where : list or None
842 List of Term (or convertible) objects, optional.
843 start : int or None
844 Row number to start selection.
845 stop : int, default None
846 Row number to stop selection.
847 columns : list or None
848 A list of columns that if not None, will limit the return columns.
        iterator : bool, default False
            Return an iterator object.
        chunksize : int or None
            Number of rows to include in an iteration, return an iterator.
        auto_close : bool, default False
            Should automatically close the store when finished.
855
856 Returns
857 -------
858 object
859 Retrieved object from file.
860
861 Examples
862 --------
863 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
864 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
865 >>> store.put('data', df) # doctest: +SKIP
866 >>> store.get('data') # doctest: +SKIP
867 >>> print(store.keys()) # doctest: +SKIP
868 ['/data1', '/data2']
869 >>> store.select('/data1') # doctest: +SKIP
870 A B
871 0 1 2
872 1 3 4
873 >>> store.select('/data1', where='columns == A') # doctest: +SKIP
874 A
875 0 1
876 1 3
877 >>> store.close() # doctest: +SKIP
878 """
879 group = self.get_node(key)
880 if group is None:
881 raise KeyError(f"No object named {key} in the file")
882
883 # create the storer and axes
884 where = _ensure_term(where, scope_level=1)
885 s = self._create_storer(group)
886 s.infer_axes()
887
888 # function to call on iteration
889 def func(_start, _stop, _where):
890 return s.read(start=_start, stop=_stop, where=_where, columns=columns)
891
892 # create the iterator
893 it = TableIterator(
894 self,
895 s,
896 func,
897 where=where,
898 nrows=s.nrows,
899 start=start,
900 stop=stop,
901 iterator=iterator,
902 chunksize=chunksize,
903 auto_close=auto_close,
904 )
905
906 return it.get_result()
907
908 def select_as_coordinates(
909 self,
910 key: str,
911 where=None,
912 start: int | None = None,
913 stop: int | None = None,
914 ):
915 """
916 return the selection as an Index
917
918 .. warning::
919
920 Pandas uses PyTables for reading and writing HDF5 files, which allows
921 serializing object-dtype data with pickle when using the "fixed" format.
922 Loading pickled data received from untrusted sources can be unsafe.
923
924 See: https://docs.python.org/3/library/pickle.html for more.
925
926
927 Parameters
928 ----------
929 key : str
930 where : list of Term (or convertible) objects, optional
931 start : integer (defaults to None), row number to start selection
932 stop : integer (defaults to None), row number to stop selection
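
        Examples
        --------
        A minimal sketch assuming a table-format node with a data column 'A';
        the file name, key, and column name are illustrative only.

        >>> df = pd.DataFrame({'A': range(5)})
        >>> store = pd.HDFStore("store.h5", 'w')  # doctest: +SKIP
        >>> store.append('data', df, data_columns=['A'])  # doctest: +SKIP
        >>> coords = store.select_as_coordinates('data', 'A > 2')  # doctest: +SKIP
        >>> store.select('data', where=coords)  # doctest: +SKIP
        >>> store.close()  # doctest: +SKIP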
933 """
934 where = _ensure_term(where, scope_level=1)
935 tbl = self.get_storer(key)
936 if not isinstance(tbl, Table):
937 raise TypeError("can only read_coordinates with a table")
938 return tbl.read_coordinates(where=where, start=start, stop=stop)
939
940 def select_column(
941 self,
942 key: str,
943 column: str,
944 start: int | None = None,
945 stop: int | None = None,
946 ):
947 """
        Return a single column from the table. This is generally only useful
        to select an indexable column.
950
951 .. warning::
952
953 Pandas uses PyTables for reading and writing HDF5 files, which allows
954 serializing object-dtype data with pickle when using the "fixed" format.
955 Loading pickled data received from untrusted sources can be unsafe.
956
957 See: https://docs.python.org/3/library/pickle.html for more.
958
959 Parameters
960 ----------
961 key : str
962 column : str
963 The column of interest.
964 start : int or None, default None
965 stop : int or None, default None
966
967 Raises
968 ------
969 raises KeyError if the column is not found (or key is not a valid
970 store)
971 raises ValueError if the column can not be extracted individually (it
972 is part of a data block)
973
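        Examples
        --------
        A minimal sketch; the key and column names are illustrative only, and
        the column must have been stored as a data column.

        >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        >>> store = pd.HDFStore("store.h5", 'w')  # doctest: +SKIP
        >>> store.append('data', df, data_columns=['A'])  # doctest: +SKIP
        >>> store.select_column('data', 'A')  # doctest: +SKIP
        >>> store.close()  # doctest: +SKIP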
974 """
975 tbl = self.get_storer(key)
976 if not isinstance(tbl, Table):
977 raise TypeError("can only read_column with a table")
978 return tbl.read_column(column=column, start=start, stop=stop)
979
980 def select_as_multiple(
981 self,
982 keys,
983 where=None,
984 selector=None,
985 columns=None,
986 start=None,
987 stop=None,
988 iterator: bool = False,
989 chunksize: int | None = None,
990 auto_close: bool = False,
991 ):
992 """
993 Retrieve pandas objects from multiple tables.
994
995 .. warning::
996
997 Pandas uses PyTables for reading and writing HDF5 files, which allows
998 serializing object-dtype data with pickle when using the "fixed" format.
999 Loading pickled data received from untrusted sources can be unsafe.
1000
1001 See: https://docs.python.org/3/library/pickle.html for more.
1002
1003 Parameters
1004 ----------
1005 keys : a list of the tables
1006 selector : the table to apply the where criteria (defaults to keys[0]
1007 if not supplied)
        columns : the columns to return
1009 start : integer (defaults to None), row number to start selection
1010 stop : integer (defaults to None), row number to stop selection
1011 iterator : bool, return an iterator, default False
1012 chunksize : nrows to include in iteration, return an iterator
1013 auto_close : bool, default False
1014 Should automatically close the store when finished.
1015
1016 Raises
1017 ------
1018 raises KeyError if keys or selector is not found or keys is empty
1019 raises TypeError if keys is not a list or tuple
1020 raises ValueError if the tables are not ALL THE SAME DIMENSIONS
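
        Examples
        --------
        A minimal sketch of selecting from two tables that share the same rows;
        the keys and column names are illustrative only.

        >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
        >>> store = pd.HDFStore("store.h5", 'w')  # doctest: +SKIP
        >>> store.append('table_a', df[['A']], data_columns=['A'])  # doctest: +SKIP
        >>> store.append('table_b', df[['B']])  # doctest: +SKIP
        >>> store.select_as_multiple(
        ...     ['table_a', 'table_b'], where='A > 1', selector='table_a'
        ... )  # doctest: +SKIP
        >>> store.close()  # doctest: +SKIP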
1021 """
1022 # default to single select
1023 where = _ensure_term(where, scope_level=1)
1024 if isinstance(keys, (list, tuple)) and len(keys) == 1:
1025 keys = keys[0]
1026 if isinstance(keys, str):
1027 return self.select(
1028 key=keys,
1029 where=where,
1030 columns=columns,
1031 start=start,
1032 stop=stop,
1033 iterator=iterator,
1034 chunksize=chunksize,
1035 auto_close=auto_close,
1036 )
1037
1038 if not isinstance(keys, (list, tuple)):
1039 raise TypeError("keys must be a list/tuple")
1040
1041 if not len(keys):
1042 raise ValueError("keys must have a non-zero length")
1043
1044 if selector is None:
1045 selector = keys[0]
1046
1047 # collect the tables
1048 tbls = [self.get_storer(k) for k in keys]
1049 s = self.get_storer(selector)
1050
1051 # validate rows
1052 nrows = None
1053 for t, k in itertools.chain([(s, selector)], zip(tbls, keys)):
1054 if t is None:
1055 raise KeyError(f"Invalid table [{k}]")
1056 if not t.is_table:
1057 raise TypeError(
                    f"object [{t.pathname}] is not a table, and cannot be used in "
                    "select_as_multiple"
1060 )
1061
1062 if nrows is None:
1063 nrows = t.nrows
1064 elif t.nrows != nrows:
1065 raise ValueError("all tables must have exactly the same nrows!")
1066
1067 # The isinstance checks here are redundant with the check above,
1068 # but necessary for mypy; see GH#29757
1069 _tbls = [x for x in tbls if isinstance(x, Table)]
1070
        # axis is the concatenation axis
1072 axis = {t.non_index_axes[0][0] for t in _tbls}.pop()
1073
1074 def func(_start, _stop, _where):
1075 # retrieve the objs, _where is always passed as a set of
1076 # coordinates here
1077 objs = [
1078 t.read(where=_where, columns=columns, start=_start, stop=_stop)
1079 for t in tbls
1080 ]
1081
1082 # concat and return
1083 return concat(objs, axis=axis, verify_integrity=False)._consolidate()
1084
1085 # create the iterator
1086 it = TableIterator(
1087 self,
1088 s,
1089 func,
1090 where=where,
1091 nrows=nrows,
1092 start=start,
1093 stop=stop,
1094 iterator=iterator,
1095 chunksize=chunksize,
1096 auto_close=auto_close,
1097 )
1098
1099 return it.get_result(coordinates=True)
1100
1101 def put(
1102 self,
1103 key: str,
1104 value: DataFrame | Series,
1105 format=None,
1106 index: bool = True,
1107 append: bool = False,
1108 complib=None,
1109 complevel: int | None = None,
1110 min_itemsize: int | dict[str, int] | None = None,
1111 nan_rep=None,
1112 data_columns: Literal[True] | list[str] | None = None,
1113 encoding=None,
1114 errors: str = "strict",
1115 track_times: bool = True,
1116 dropna: bool = False,
1117 ) -> None:
1118 """
1119 Store object in HDFStore.
1120
1121 Parameters
1122 ----------
1123 key : str
1124 value : {Series, DataFrame}
1125 format : 'fixed(f)|table(t)', default is 'fixed'
1126 Format to use when storing object in HDFStore. Value can be one of:
1127
1128 ``'fixed'``
1129 Fixed format. Fast writing/reading. Not-appendable, nor searchable.
1130 ``'table'``
1131 Table format. Write as a PyTables Table structure which may perform
1132 worse but allow more flexible operations like searching / selecting
1133 subsets of the data.
1134 index : bool, default True
1135 Write DataFrame index as a column.
1136 append : bool, default False
1137 This will force Table format, append the input data to the existing.
1138 data_columns : list of columns or True, default None
1139 List of columns to create as data columns, or True to use all columns.
1140 See `here
1141 <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
1142 encoding : str, default None
1143 Provide an encoding for strings.
1144 track_times : bool, default True
1145 Parameter is propagated to 'create_table' method of 'PyTables'.
            If set to False, it makes it possible to produce identical h5 files
            (same hashes) independent of creation time.
1148 dropna : bool, default False, optional
1149 Remove missing values.
1150
1151 Examples
1152 --------
1153 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
1154 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
1155 >>> store.put('data', df) # doctest: +SKIP
1156 """
1157 if format is None:
1158 format = get_option("io.hdf.default_format") or "fixed"
1159 format = self._validate_format(format)
1160 self._write_to_group(
1161 key,
1162 value,
1163 format=format,
1164 index=index,
1165 append=append,
1166 complib=complib,
1167 complevel=complevel,
1168 min_itemsize=min_itemsize,
1169 nan_rep=nan_rep,
1170 data_columns=data_columns,
1171 encoding=encoding,
1172 errors=errors,
1173 track_times=track_times,
1174 dropna=dropna,
1175 )
1176
1177 def remove(self, key: str, where=None, start=None, stop=None) -> None:
1178 """
1179 Remove pandas object partially by specifying the where condition
1180
1181 Parameters
1182 ----------
1183 key : str
1184 Node to remove or delete rows from
1185 where : list of Term (or convertible) objects, optional
1186 start : integer (defaults to None), row number to start selection
1187 stop : integer (defaults to None), row number to stop selection
1188
1189 Returns
1190 -------
1191 number of rows removed (or None if not a Table)
1192
1193 Raises
1194 ------
1195 raises KeyError if key is not a valid store
1196
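        Examples
        --------
        A minimal sketch; the key and where clause are illustrative only, and a
        where clause requires the node to be stored in table format.

        >>> df = pd.DataFrame({'A': range(5)})
        >>> store = pd.HDFStore("store.h5", 'w')  # doctest: +SKIP
        >>> store.append('data', df, data_columns=['A'])  # doctest: +SKIP
        >>> store.remove('data', where='A > 2')  # doctest: +SKIP
        >>> store.remove('data')  # doctest: +SKIP
        >>> store.close()  # doctest: +SKIP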
1197 """
1198 where = _ensure_term(where, scope_level=1)
1199 try:
1200 s = self.get_storer(key)
1201 except KeyError:
1202 # the key is not a valid store, re-raising KeyError
1203 raise
1204 except AssertionError:
1205 # surface any assertion errors for e.g. debugging
1206 raise
1207 except Exception as err:
1208 # In tests we get here with ClosedFileError, TypeError, and
1209 # _table_mod.NoSuchNodeError. TODO: Catch only these?
1210
1211 if where is not None:
1212 raise ValueError(
1213 "trying to remove a node with a non-None where clause!"
1214 ) from err
1215
1216 # we are actually trying to remove a node (with children)
1217 node = self.get_node(key)
1218 if node is not None:
1219 node._f_remove(recursive=True)
1220 return None
1221
1222 # remove the node
1223 if com.all_none(where, start, stop):
1224 s.group._f_remove(recursive=True)
1225
1226 # delete from the table
1227 else:
1228 if not s.is_table:
1229 raise ValueError(
1230 "can only remove with where on objects written as tables"
1231 )
1232 return s.delete(where=where, start=start, stop=stop)
1233
1234 def append(
1235 self,
1236 key: str,
1237 value: DataFrame | Series,
1238 format=None,
1239 axes=None,
1240 index: bool | list[str] = True,
1241 append: bool = True,
1242 complib=None,
1243 complevel: int | None = None,
1244 columns=None,
1245 min_itemsize: int | dict[str, int] | None = None,
1246 nan_rep=None,
1247 chunksize: int | None = None,
1248 expectedrows=None,
1249 dropna: bool | None = None,
1250 data_columns: Literal[True] | list[str] | None = None,
1251 encoding=None,
1252 errors: str = "strict",
1253 ) -> None:
1254 """
1255 Append to Table in file.
1256
1257 Node must already exist and be Table format.
1258
1259 Parameters
1260 ----------
1261 key : str
1262 value : {Series, DataFrame}
1263 format : 'table' is the default
1264 Format to use when storing object in HDFStore. Value can be one of:
1265
1266 ``'table'``
1267 Table format. Write as a PyTables Table structure which may perform
1268 worse but allow more flexible operations like searching / selecting
1269 subsets of the data.
1270 index : bool, default True
1271 Write DataFrame index as a column.
1272 append : bool, default True
1273 Append the input data to the existing.
1274 data_columns : list of columns, or True, default None
1275 List of columns to create as indexed data columns for on-disk
1276 queries, or True to use all columns. By default only the axes
1277 of the object are indexed. See `here
1278 <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
1279 min_itemsize : dict of columns that specify minimum str sizes
1280 nan_rep : str to use as str nan representation
1281 chunksize : size to chunk the writing
1282 expectedrows : expected TOTAL row size of this table
1283 encoding : default None, provide an encoding for str
1284 dropna : bool, default False, optional
            Do not write an ALL nan row to the store; settable
            by the option 'io.hdf.dropna_table'.
1287
1288 Notes
1289 -----
1290 Does *not* check if data being appended overlaps with existing
1291 data in the table, so be careful
1292
1293 Examples
1294 --------
1295 >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
1296 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
1297 >>> store.put('data', df1, format='table') # doctest: +SKIP
1298 >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=['A', 'B'])
1299 >>> store.append('data', df2) # doctest: +SKIP
1300 >>> store.close() # doctest: +SKIP
1301 A B
1302 0 1 2
1303 1 3 4
1304 0 5 6
1305 1 7 8
1306 """
1307 if columns is not None:
1308 raise TypeError(
1309 "columns is not a supported keyword in append, try data_columns"
1310 )
1311
1312 if dropna is None:
1313 dropna = get_option("io.hdf.dropna_table")
1314 if format is None:
1315 format = get_option("io.hdf.default_format") or "table"
1316 format = self._validate_format(format)
1317 self._write_to_group(
1318 key,
1319 value,
1320 format=format,
1321 axes=axes,
1322 index=index,
1323 append=append,
1324 complib=complib,
1325 complevel=complevel,
1326 min_itemsize=min_itemsize,
1327 nan_rep=nan_rep,
1328 chunksize=chunksize,
1329 expectedrows=expectedrows,
1330 dropna=dropna,
1331 data_columns=data_columns,
1332 encoding=encoding,
1333 errors=errors,
1334 )
1335
1336 def append_to_multiple(
1337 self,
1338 d: dict,
1339 value,
1340 selector,
1341 data_columns=None,
1342 axes=None,
1343 dropna: bool = False,
1344 **kwargs,
1345 ) -> None:
1346 """
1347 Append to multiple tables
1348
1349 Parameters
1350 ----------
1351 d : a dict of table_name to table_columns, None is acceptable as the
1352 values of one node (this will get all the remaining columns)
1353 value : a pandas object
1354 selector : a string that designates the indexable table; all of its
1355 columns will be designed as data_columns, unless data_columns is
1356 passed, in which case these are used
1357 data_columns : list of columns to create as data columns, or True to
1358 use all columns
        dropna : bool, default False
            If True, drop rows from all tables if any single row in each table
            has all NaN.
1361
1362 Notes
1363 -----
1364 axes parameter is currently not accepted
1365
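        Examples
        --------
        A minimal sketch; the table names and column split are illustrative only.

        >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]})
        >>> store = pd.HDFStore("store.h5", 'w')  # doctest: +SKIP
        >>> store.append_to_multiple(
        ...     {'table_ab': ['A', 'B'], 'table_c': None}, df, selector='table_ab'
        ... )  # doctest: +SKIP
        >>> store.select_as_multiple(
        ...     ['table_ab', 'table_c'], selector='table_ab'
        ... )  # doctest: +SKIP
        >>> store.close()  # doctest: +SKIP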
1366 """
1367 if axes is not None:
1368 raise TypeError(
1369 "axes is currently not accepted as a parameter to append_to_multiple; "
1370 "you can create the tables independently instead"
1371 )
1372
1373 if not isinstance(d, dict):
1374 raise ValueError(
1375 "append_to_multiple must have a dictionary specified as the "
1376 "way to split the value"
1377 )
1378
1379 if selector not in d:
1380 raise ValueError(
1381 "append_to_multiple requires a selector that is in passed dict"
1382 )
1383
1384 # figure out the splitting axis (the non_index_axis)
1385 axis = next(iter(set(range(value.ndim)) - set(_AXES_MAP[type(value)])))
1386
1387 # figure out how to split the value
1388 remain_key = None
1389 remain_values: list = []
1390 for k, v in d.items():
1391 if v is None:
1392 if remain_key is not None:
1393 raise ValueError(
1394 "append_to_multiple can only have one value in d that is None"
1395 )
1396 remain_key = k
1397 else:
1398 remain_values.extend(v)
1399 if remain_key is not None:
1400 ordered = value.axes[axis]
1401 ordd = ordered.difference(Index(remain_values))
1402 ordd = sorted(ordered.get_indexer(ordd))
1403 d[remain_key] = ordered.take(ordd)
1404
1405 # data_columns
1406 if data_columns is None:
1407 data_columns = d[selector]
1408
1409 # ensure rows are synchronized across the tables
1410 if dropna:
1411 idxs = (value[cols].dropna(how="all").index for cols in d.values())
1412 valid_index = next(idxs)
1413 for index in idxs:
1414 valid_index = valid_index.intersection(index)
1415 value = value.loc[valid_index]
1416
1417 min_itemsize = kwargs.pop("min_itemsize", None)
1418
1419 # append
1420 for k, v in d.items():
1421 dc = data_columns if k == selector else None
1422
1423 # compute the val
1424 val = value.reindex(v, axis=axis)
1425
1426 filtered = (
1427 {key: value for (key, value) in min_itemsize.items() if key in v}
1428 if min_itemsize is not None
1429 else None
1430 )
1431 self.append(k, val, data_columns=dc, min_itemsize=filtered, **kwargs)
1432
1433 def create_table_index(
1434 self,
1435 key: str,
1436 columns=None,
1437 optlevel: int | None = None,
1438 kind: str | None = None,
1439 ) -> None:
1440 """
1441 Create a pytables index on the table.
1442
1443 Parameters
1444 ----------
1445 key : str
1446 columns : None, bool, or listlike[str]
1447 Indicate which columns to create an index on.
1448
1449 * False : Do not create any indexes.
1450 * True : Create indexes on all columns.
1451 * None : Create indexes on all columns.
1452 * listlike : Create indexes on the given columns.
1453
1454 optlevel : int or None, default None
1455 Optimization level, if None, pytables defaults to 6.
1456 kind : str or None, default None
1457 Kind of index, if None, pytables defaults to "medium".
1458
1459 Raises
1460 ------
1461 TypeError: raises if the node is not a table
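
        Examples
        --------
        A minimal sketch; the key and columns are illustrative only, and the
        node must be stored in table format.

        >>> df = pd.DataFrame({'A': range(5), 'B': range(5)})
        >>> store = pd.HDFStore("store.h5", 'w')  # doctest: +SKIP
        >>> store.append('data', df, data_columns=['B'])  # doctest: +SKIP
        >>> store.create_table_index('data', columns=['B'],
        ...                          kind='full')  # doctest: +SKIP
        >>> store.close()  # doctest: +SKIP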
1462 """
1463 # version requirements
1464 _tables()
1465 s = self.get_storer(key)
1466 if s is None:
1467 return
1468
1469 if not isinstance(s, Table):
1470 raise TypeError("cannot create table index on a Fixed format store")
1471 s.create_index(columns=columns, optlevel=optlevel, kind=kind)
1472
1473 def groups(self) -> list:
1474 """
1475 Return a list of all the top-level nodes.
1476
1477 Each node returned is not a pandas storage object.
1478
1479 Returns
1480 -------
1481 list
1482 List of objects.
1483
1484 Examples
1485 --------
1486 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
1487 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
1488 >>> store.put('data', df) # doctest: +SKIP
1489 >>> print(store.groups()) # doctest: +SKIP
1490 >>> store.close() # doctest: +SKIP
1491 [/data (Group) ''
1492 children := ['axis0' (Array), 'axis1' (Array), 'block0_values' (Array),
1493 'block0_items' (Array)]]
1494 """
1495 _tables()
1496 self._check_if_open()
1497 assert self._handle is not None # for mypy
1498 assert _table_mod is not None # for mypy
1499 return [
1500 g
1501 for g in self._handle.walk_groups()
1502 if (
1503 not isinstance(g, _table_mod.link.Link)
1504 and (
1505 getattr(g._v_attrs, "pandas_type", None)
1506 or getattr(g, "table", None)
1507 or (isinstance(g, _table_mod.table.Table) and g._v_name != "table")
1508 )
1509 )
1510 ]
1511
1512 def walk(self, where: str = "/") -> Iterator[tuple[str, list[str], list[str]]]:
1513 """
1514 Walk the pytables group hierarchy for pandas objects.
1515
1516 This generator will yield the group path, subgroups and pandas object
1517 names for each group.
1518
1519 Any non-pandas PyTables objects that are not a group will be ignored.
1520
1521 The `where` group itself is listed first (preorder), then each of its
1522 child groups (following an alphanumerical order) is also traversed,
1523 following the same procedure.
1524
1525 Parameters
1526 ----------
1527 where : str, default "/"
1528 Group where to start walking.
1529
1530 Yields
1531 ------
1532 path : str
1533 Full path to a group (without trailing '/').
1534 groups : list
1535 Names (strings) of the groups contained in `path`.
1536 leaves : list
1537 Names (strings) of the pandas objects contained in `path`.
1538
1539 Examples
1540 --------
1541 >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
1542 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
1543 >>> store.put('data', df1, format='table') # doctest: +SKIP
1544 >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=['A', 'B'])
1545 >>> store.append('data', df2) # doctest: +SKIP
1546 >>> store.close() # doctest: +SKIP
1547 >>> for group in store.walk(): # doctest: +SKIP
1548 ... print(group) # doctest: +SKIP
1549 >>> store.close() # doctest: +SKIP
1550 """
1551 _tables()
1552 self._check_if_open()
1553 assert self._handle is not None # for mypy
1554 assert _table_mod is not None # for mypy
1555
1556 for g in self._handle.walk_groups(where):
1557 if getattr(g._v_attrs, "pandas_type", None) is not None:
1558 continue
1559
1560 groups = []
1561 leaves = []
1562 for child in g._v_children.values():
1563 pandas_type = getattr(child._v_attrs, "pandas_type", None)
1564 if pandas_type is None:
1565 if isinstance(child, _table_mod.group.Group):
1566 groups.append(child._v_name)
1567 else:
1568 leaves.append(child._v_name)
1569
1570 yield (g._v_pathname.rstrip("/"), groups, leaves)
1571
1572 def get_node(self, key: str) -> Node | None:
1573 """return the node with the key or None if it does not exist"""
1574 self._check_if_open()
1575 if not key.startswith("/"):
1576 key = "/" + key
1577
1578 assert self._handle is not None
1579 assert _table_mod is not None # for mypy
1580 try:
1581 node = self._handle.get_node(self.root, key)
1582 except _table_mod.exceptions.NoSuchNodeError:
1583 return None
1584
1585 assert isinstance(node, _table_mod.Node), type(node)
1586 return node
1587
1588 def get_storer(self, key: str) -> GenericFixed | Table:
1589 """return the storer object for a key, raise if not in the file"""
1590 group = self.get_node(key)
1591 if group is None:
1592 raise KeyError(f"No object named {key} in the file")
1593
1594 s = self._create_storer(group)
1595 s.infer_axes()
1596 return s
1597
1598 def copy(
1599 self,
1600 file,
1601 mode: str = "w",
1602 propindexes: bool = True,
1603 keys=None,
1604 complib=None,
1605 complevel: int | None = None,
1606 fletcher32: bool = False,
1607 overwrite: bool = True,
1608 ) -> HDFStore:
1609 """
        Copy the existing store to a new file and return a handle to the new store.
1611
1612 Parameters
1613 ----------
1614 propindexes : bool, default True
1615 Restore indexes in copied file.
1616 keys : list, optional
1617 List of keys to include in the copy (defaults to all).
1618 overwrite : bool, default True
1619 Whether to overwrite (remove and replace) existing nodes in the new store.
1620 mode, complib, complevel, fletcher32 same as in HDFStore.__init__
1621
1622 Returns
1623 -------
1624 open file handle of the new store
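
        Examples
        --------
        A minimal sketch; the file names and compression settings are
        illustrative only.

        >>> df = pd.DataFrame({'A': range(5)})
        >>> store = pd.HDFStore("store.h5", 'w')  # doctest: +SKIP
        >>> store.put('data', df, format='table')  # doctest: +SKIP
        >>> new_store = store.copy('copy.h5', complib='blosc',
        ...                        complevel=1)  # doctest: +SKIP
        >>> new_store.close()  # doctest: +SKIP
        >>> store.close()  # doctest: +SKIP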
1625 """
1626 new_store = HDFStore(
1627 file, mode=mode, complib=complib, complevel=complevel, fletcher32=fletcher32
1628 )
1629 if keys is None:
1630 keys = list(self.keys())
1631 if not isinstance(keys, (tuple, list)):
1632 keys = [keys]
1633 for k in keys:
1634 s = self.get_storer(k)
1635 if s is not None:
1636 if k in new_store:
1637 if overwrite:
1638 new_store.remove(k)
1639
1640 data = self.select(k)
1641 if isinstance(s, Table):
1642 index: bool | list[str] = False
1643 if propindexes:
1644 index = [a.name for a in s.axes if a.is_indexed]
1645 new_store.append(
1646 k,
1647 data,
1648 index=index,
1649 data_columns=getattr(s, "data_columns", None),
1650 encoding=s.encoding,
1651 )
1652 else:
1653 new_store.put(k, data, encoding=s.encoding)
1654
1655 return new_store
1656
1657 def info(self) -> str:
1658 """
1659 Print detailed information on the store.
1660
1661 Returns
1662 -------
1663 str
1664
1665 Examples
1666 --------
1667 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
1668 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
1669 >>> store.put('data', df) # doctest: +SKIP
1670 >>> print(store.info()) # doctest: +SKIP
1671 >>> store.close() # doctest: +SKIP
1672 <class 'pandas.io.pytables.HDFStore'>
1673 File path: store.h5
1674 /data frame (shape->[2,2])
1675 """
1676 path = pprint_thing(self._path)
1677 output = f"{type(self)}\nFile path: {path}\n"
1678
1679 if self.is_open:
1680 lkeys = sorted(self.keys())
1681 if len(lkeys):
1682 keys = []
1683 values = []
1684
1685 for k in lkeys:
1686 try:
1687 s = self.get_storer(k)
1688 if s is not None:
1689 keys.append(pprint_thing(s.pathname or k))
1690 values.append(pprint_thing(s or "invalid_HDFStore node"))
1691 except AssertionError:
1692 # surface any assertion errors for e.g. debugging
1693 raise
1694 except Exception as detail:
1695 keys.append(k)
1696 dstr = pprint_thing(detail)
1697 values.append(f"[invalid_HDFStore node: {dstr}]")
1698
1699 output += adjoin(12, keys, values)
1700 else:
1701 output += "Empty"
1702 else:
1703 output += "File is CLOSED"
1704
1705 return output
1706
1707 # ------------------------------------------------------------------------
1708 # private methods
1709
1710 def _check_if_open(self) -> None:
1711 if not self.is_open:
1712 raise ClosedFileError(f"{self._path} file is not open!")
1713
1714 def _validate_format(self, format: str) -> str:
1715 """validate / deprecate formats"""
1716 # validate
1717 try:
1718 format = _FORMAT_MAP[format.lower()]
1719 except KeyError as err:
1720 raise TypeError(f"invalid HDFStore format specified [{format}]") from err
1721
1722 return format
1723
1724 def _create_storer(
1725 self,
1726 group,
1727 format=None,
1728 value: DataFrame | Series | None = None,
1729 encoding: str = "UTF-8",
1730 errors: str = "strict",
1731 ) -> GenericFixed | Table:
1732 """return a suitable class to operate"""
1733 cls: type[GenericFixed | Table]
1734
1735 if value is not None and not isinstance(value, (Series, DataFrame)):
1736 raise TypeError("value must be None, Series, or DataFrame")
1737
1738 pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None))
1739 tt = _ensure_decoded(getattr(group._v_attrs, "table_type", None))
1740
1741 # infer the pt from the passed value
1742 if pt is None:
1743 if value is None:
1744 _tables()
1745 assert _table_mod is not None # for mypy
1746 if getattr(group, "table", None) or isinstance(
1747 group, _table_mod.table.Table
1748 ):
1749 pt = "frame_table"
1750 tt = "generic_table"
1751 else:
1752 raise TypeError(
                        "cannot create a storer if the object does not exist "
                        "and no value is passed"
1755 )
1756 else:
1757 if isinstance(value, Series):
1758 pt = "series"
1759 else:
1760 pt = "frame"
1761
1762 # we are actually a table
1763 if format == "table":
1764 pt += "_table"
1765
1766 # a storer node
1767 if "table" not in pt:
1768 _STORER_MAP = {"series": SeriesFixed, "frame": FrameFixed}
1769 try:
1770 cls = _STORER_MAP[pt]
1771 except KeyError as err:
1772 raise TypeError(
1773 f"cannot properly create the storer for: [_STORER_MAP] [group->"
                    f"{group},value->{type(value)},format->{format}]"
1775 ) from err
1776 return cls(self, group, encoding=encoding, errors=errors)
1777
1778 # existing node (and must be a table)
1779 if tt is None:
1780 # if we are a writer, determine the tt
1781 if value is not None:
1782 if pt == "series_table":
1783 index = getattr(value, "index", None)
1784 if index is not None:
1785 if index.nlevels == 1:
1786 tt = "appendable_series"
1787 elif index.nlevels > 1:
1788 tt = "appendable_multiseries"
1789 elif pt == "frame_table":
1790 index = getattr(value, "index", None)
1791 if index is not None:
1792 if index.nlevels == 1:
1793 tt = "appendable_frame"
1794 elif index.nlevels > 1:
1795 tt = "appendable_multiframe"
1796
1797 _TABLE_MAP = {
1798 "generic_table": GenericTable,
1799 "appendable_series": AppendableSeriesTable,
1800 "appendable_multiseries": AppendableMultiSeriesTable,
1801 "appendable_frame": AppendableFrameTable,
1802 "appendable_multiframe": AppendableMultiFrameTable,
1803 "worm": WORMTable,
1804 }
1805 try:
1806 cls = _TABLE_MAP[tt]
1807 except KeyError as err:
1808 raise TypeError(
1809 f"cannot properly create the storer for: [_TABLE_MAP] [group->"
                f"{group},value->{type(value)},format->{format}]"
1811 ) from err
1812
1813 return cls(self, group, encoding=encoding, errors=errors)
1814
1815 def _write_to_group(
1816 self,
1817 key: str,
1818 value: DataFrame | Series,
1819 format,
1820 axes=None,
1821 index: bool | list[str] = True,
1822 append: bool = False,
1823 complib=None,
1824 complevel: int | None = None,
1825 fletcher32=None,
1826 min_itemsize: int | dict[str, int] | None = None,
1827 chunksize: int | None = None,
1828 expectedrows=None,
1829 dropna: bool = False,
1830 nan_rep=None,
1831 data_columns=None,
1832 encoding=None,
1833 errors: str = "strict",
1834 track_times: bool = True,
1835 ) -> None:
1836 # we don't want to store a table node at all if our object is 0-len
        # as there are no dtypes
1838 if getattr(value, "empty", None) and (format == "table" or append):
1839 return
1840
1841 group = self._identify_group(key, append)
1842
1843 s = self._create_storer(group, format, value, encoding=encoding, errors=errors)
1844 if append:
1845 # raise if we are trying to append to a Fixed format,
1846 # or a table that exists (and we are putting)
1847 if not s.is_table or (s.is_table and format == "fixed" and s.is_exists):
1848 raise ValueError("Can only append to Tables")
1849 if not s.is_exists:
1850 s.set_object_info()
1851 else:
1852 s.set_object_info()
1853
1854 if not s.is_table and complib:
1855 raise ValueError("Compression not supported on Fixed format stores")
1856
1857 # write the object
1858 s.write(
1859 obj=value,
1860 axes=axes,
1861 append=append,
1862 complib=complib,
1863 complevel=complevel,
1864 fletcher32=fletcher32,
1865 min_itemsize=min_itemsize,
1866 chunksize=chunksize,
1867 expectedrows=expectedrows,
1868 dropna=dropna,
1869 nan_rep=nan_rep,
1870 data_columns=data_columns,
1871 track_times=track_times,
1872 )
1873
1874 if isinstance(s, Table) and index:
1875 s.create_index(columns=index)
1876
1877 def _read_group(self, group: Node):
1878 s = self._create_storer(group)
1879 s.infer_axes()
1880 return s.read()
1881
1882 def _identify_group(self, key: str, append: bool) -> Node:
1883 """Identify HDF5 group based on key, delete/create group if needed."""
1884 group = self.get_node(key)
1885
1886 # we make this assertion for mypy; the get_node call will already
1887 # have raised if this is incorrect
1888 assert self._handle is not None
1889
1890 # remove the node if we are not appending
1891 if group is not None and not append:
1892 self._handle.remove_node(group, recursive=True)
1893 group = None
1894
1895 if group is None:
1896 group = self._create_nodes_and_group(key)
1897
1898 return group
1899
1900 def _create_nodes_and_group(self, key: str) -> Node:
1901 """Create nodes from key and return group name."""
1902 # assertion for mypy
1903 assert self._handle is not None
1904
1905 paths = key.split("/")
1906 # recursively create the groups
1907 path = "/"
1908 for p in paths:
1909 if not len(p):
1910 continue
1911 new_path = path
1912 if not path.endswith("/"):
1913 new_path += "/"
1914 new_path += p
1915 group = self.get_node(new_path)
1916 if group is None:
1917 group = self._handle.create_group(path, p)
1918 path = new_path
1919 return group
1920
1921
1922class TableIterator:
1923 """
1924 Define the iteration interface on a table
1925
1926 Parameters
1927 ----------
1928 store : HDFStore
1929 s : the referred storer
1930 func : the function to execute the query
1931 where : the where of the query
1932 nrows : the rows to iterate on
1933 start : the passed start value (default is None)
1934 stop : the passed stop value (default is None)
1935 iterator : bool, default False
1936 Whether to use the default iterator.
1937 chunksize : the passed chunking value (default is 100000)
1938 auto_close : bool, default False
1939 Whether to automatically close the store at the end of iteration.
1940 """
1941
1942 chunksize: int | None
1943 store: HDFStore
1944 s: GenericFixed | Table
1945
1946 def __init__(
1947 self,
1948 store: HDFStore,
1949 s: GenericFixed | Table,
1950 func,
1951 where,
1952 nrows,
1953 start=None,
1954 stop=None,
1955 iterator: bool = False,
1956 chunksize: int | None = None,
1957 auto_close: bool = False,
1958 ) -> None:
1959 self.store = store
1960 self.s = s
1961 self.func = func
1962 self.where = where
1963
1964 # set start/stop if they are not set if we are a table
1965 if self.s.is_table:
1966 if nrows is None:
1967 nrows = 0
1968 if start is None:
1969 start = 0
1970 if stop is None:
1971 stop = nrows
1972 stop = min(nrows, stop)
1973
1974 self.nrows = nrows
1975 self.start = start
1976 self.stop = stop
1977
1978 self.coordinates = None
1979 if iterator or chunksize is not None:
1980 if chunksize is None:
1981 chunksize = 100000
1982 self.chunksize = int(chunksize)
1983 else:
1984 self.chunksize = None
1985
1986 self.auto_close = auto_close
1987
1988 def __iter__(self) -> Iterator:
1989 # iterate
1990 current = self.start
1991 if self.coordinates is None:
1992 raise ValueError("Cannot iterate until get_result is called.")
1993 while current < self.stop:
1994 stop = min(current + self.chunksize, self.stop)
1995 value = self.func(None, None, self.coordinates[current:stop])
1996 current = stop
1997 if value is None or not len(value):
1998 continue
1999
2000 yield value
2001
2002 self.close()
2003
2004 def close(self) -> None:
2005 if self.auto_close:
2006 self.store.close()
2007
2008 def get_result(self, coordinates: bool = False):
2009 # return the actual iterator
2010 if self.chunksize is not None:
2011 if not isinstance(self.s, Table):
2012 raise TypeError("can only use an iterator or chunksize on a table")
2013
2014 self.coordinates = self.s.read_coordinates(where=self.where)
2015
2016 return self
2017
2018        # if specified, read via coordinates (necessary for multiple selections)
2019 if coordinates:
2020 if not isinstance(self.s, Table):
2021 raise TypeError("can only read_coordinates on a table")
2022 where = self.s.read_coordinates(
2023 where=self.where, start=self.start, stop=self.stop
2024 )
2025 else:
2026 where = self.where
2027
2028 # directly return the result
2029 results = self.func(self.start, self.stop, where)
2030 self.close()
2031 return results
2032
2033
2034class IndexCol:
2035 """
2036 an index column description class
2037
2038 Parameters
2039 ----------
2040 axis : axis which I reference
2041    values : the ndarray-like converted values
2042 kind : a string description of this type
2043 typ : the pytables type
2044    pos    : the position of this column in the pytables table
2045
2046 """
2047
2048 is_an_indexable: bool = True
2049 is_data_indexable: bool = True
2050 _info_fields = ["freq", "tz", "index_name"]
2051
2052 def __init__(
2053 self,
2054 name: str,
2055 values=None,
2056 kind=None,
2057 typ=None,
2058 cname: str | None = None,
2059 axis=None,
2060 pos=None,
2061 freq=None,
2062 tz=None,
2063 index_name=None,
2064 ordered=None,
2065 table=None,
2066 meta=None,
2067 metadata=None,
2068 ) -> None:
2069 if not isinstance(name, str):
2070 raise ValueError("`name` must be a str.")
2071
2072 self.values = values
2073 self.kind = kind
2074 self.typ = typ
2075 self.name = name
2076 self.cname = cname or name
2077 self.axis = axis
2078 self.pos = pos
2079 self.freq = freq
2080 self.tz = tz
2081 self.index_name = index_name
2082 self.ordered = ordered
2083 self.table = table
2084 self.meta = meta
2085 self.metadata = metadata
2086
2087 if pos is not None:
2088 self.set_pos(pos)
2089
2090 # These are ensured as long as the passed arguments match the
2091 # constructor annotations.
2092 assert isinstance(self.name, str)
2093 assert isinstance(self.cname, str)
2094
2095 @property
2096 def itemsize(self) -> int:
2097 # Assumes self.typ has already been initialized
2098 return self.typ.itemsize
2099
2100 @property
2101 def kind_attr(self) -> str:
2102 return f"{self.name}_kind"
2103
2104 def set_pos(self, pos: int) -> None:
2105 """set the position of this column in the Table"""
2106 self.pos = pos
2107 if pos is not None and self.typ is not None:
2108 self.typ._v_pos = pos
2109
2110 def __repr__(self) -> str:
2111 temp = tuple(
2112 map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind))
2113 )
2114 return ",".join(
2115 [
2116 f"{key}->{value}"
2117 for key, value in zip(["name", "cname", "axis", "pos", "kind"], temp)
2118 ]
2119 )
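        # e.g. "name->index,cname->index,axis->0,pos->0,kind->datetime64"
        # (illustrative output; actual values depend on the stored column)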
2120
2121 def __eq__(self, other: object) -> bool:
2122 """compare 2 col items"""
2123 return all(
2124 getattr(self, a, None) == getattr(other, a, None)
2125 for a in ["name", "cname", "axis", "pos"]
2126 )
2127
2128 def __ne__(self, other) -> bool:
2129 return not self.__eq__(other)
2130
2131 @property
2132 def is_indexed(self) -> bool:
2133 """return whether I am an indexed column"""
2134 if not hasattr(self.table, "cols"):
2135 # e.g. if infer hasn't been called yet, self.table will be None.
2136 return False
2137 return getattr(self.table.cols, self.cname).is_indexed
2138
2139 def convert(
2140 self, values: np.ndarray, nan_rep, encoding: str, errors: str
2141 ) -> tuple[np.ndarray, np.ndarray] | tuple[Index, Index]:
2142 """
2143 Convert the data from this selection to the appropriate pandas type.
2144 """
2145 assert isinstance(values, np.ndarray), type(values)
2146
2147 # values is a recarray
2148 if values.dtype.fields is not None:
2149 # Copy, otherwise values will be a view
2150            # preventing the original recarray from being freed
2151 values = values[self.cname].copy()
2152
2153 val_kind = _ensure_decoded(self.kind)
2154 values = _maybe_convert(values, val_kind, encoding, errors)
2155 kwargs = {}
2156 kwargs["name"] = _ensure_decoded(self.index_name)
2157
2158 if self.freq is not None:
2159 kwargs["freq"] = _ensure_decoded(self.freq)
2160
2161 factory: type[Index | DatetimeIndex] = Index
2162 if lib.is_np_dtype(values.dtype, "M") or isinstance(
2163 values.dtype, DatetimeTZDtype
2164 ):
2165 factory = DatetimeIndex
2166 elif values.dtype == "i8" and "freq" in kwargs:
2167 # PeriodIndex data is stored as i8
2168 # error: Incompatible types in assignment (expression has type
2169 # "Callable[[Any, KwArg(Any)], PeriodIndex]", variable has type
2170 # "Union[Type[Index], Type[DatetimeIndex]]")
2171 factory = lambda x, **kwds: PeriodIndex.from_ordinals( # type: ignore[assignment]
2172 x, freq=kwds.get("freq", None)
2173 )._rename(
2174 kwds["name"]
2175 )
2176
2177 # making an Index instance could throw a number of different errors
2178 try:
2179 new_pd_index = factory(values, **kwargs)
2180 except ValueError:
2181            # if the output freq is different from what we recorded,
2182 # it should be None (see also 'doc example part 2')
2183 if "freq" in kwargs:
2184 kwargs["freq"] = None
2185 new_pd_index = factory(values, **kwargs)
2186 final_pd_index = _set_tz(new_pd_index, self.tz)
2187 return final_pd_index, final_pd_index
2188
2189 def take_data(self):
2190 """return the values"""
2191 return self.values
2192
2193 @property
2194 def attrs(self):
2195 return self.table._v_attrs
2196
2197 @property
2198 def description(self):
2199 return self.table.description
2200
2201 @property
2202 def col(self):
2203 """return my current col description"""
2204 return getattr(self.description, self.cname, None)
2205
2206 @property
2207 def cvalues(self):
2208 """return my cython values"""
2209 return self.values
2210
2211 def __iter__(self) -> Iterator:
2212 return iter(self.values)
2213
2214 def maybe_set_size(self, min_itemsize=None) -> None:
2215 """
2216 maybe set a string col itemsize:
2217        min_itemsize can be an integer, or a dict mapping this column's name
2218        to an integer size
2219 """
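        # e.g. min_itemsize=30 applies directly, while a dict such as
        # {"<column name>": 30} (placeholder key) only applies when its key
        # matches this column's name.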
2220 if _ensure_decoded(self.kind) == "string":
2221 if isinstance(min_itemsize, dict):
2222 min_itemsize = min_itemsize.get(self.name)
2223
2224 if min_itemsize is not None and self.typ.itemsize < min_itemsize:
2225 self.typ = _tables().StringCol(itemsize=min_itemsize, pos=self.pos)
2226
2227 def validate_names(self) -> None:
2228 pass
2229
2230 def validate_and_set(self, handler: AppendableTable, append: bool) -> None:
2231 self.table = handler.table
2232 self.validate_col()
2233 self.validate_attr(append)
2234 self.validate_metadata(handler)
2235 self.write_metadata(handler)
2236 self.set_attr()
2237
2238 def validate_col(self, itemsize=None):
2239        """validate this column: return the itemsize compared against"""
2240 # validate this column for string truncation (or reset to the max size)
2241 if _ensure_decoded(self.kind) == "string":
2242 c = self.col
2243 if c is not None:
2244 if itemsize is None:
2245 itemsize = self.itemsize
2246 if c.itemsize < itemsize:
2247 raise ValueError(
2248 f"Trying to store a string with len [{itemsize}] in "
2249 f"[{self.cname}] column but\nthis column has a limit of "
2250 f"[{c.itemsize}]!\nConsider using min_itemsize to "
2251 "preset the sizes on these columns"
2252 )
2253 return c.itemsize
2254
2255 return None
2256
2257 def validate_attr(self, append: bool) -> None:
2258 # check for backwards incompatibility
2259 if append:
2260 existing_kind = getattr(self.attrs, self.kind_attr, None)
2261 if existing_kind is not None and existing_kind != self.kind:
2262 raise TypeError(
2263 f"incompatible kind in col [{existing_kind} - {self.kind}]"
2264 )
2265
2266 def update_info(self, info) -> None:
2267 """
2268 set/update the info for this indexable with the key/value
2269 if there is a conflict raise/warn as needed
2270 """
2271 for key in self._info_fields:
2272 value = getattr(self, key, None)
2273 idx = info.setdefault(self.name, {})
2274
2275 existing_value = idx.get(key)
2276 if key in idx and value is not None and existing_value != value:
2277 # frequency/name just warn
2278 if key in ["freq", "index_name"]:
2279 ws = attribute_conflict_doc % (key, existing_value, value)
2280 warnings.warn(
2281 ws, AttributeConflictWarning, stacklevel=find_stack_level()
2282 )
2283
2284 # reset
2285 idx[key] = None
2286 setattr(self, key, None)
2287
2288 else:
2289 raise ValueError(
2290 f"invalid info for [{self.name}] for [{key}], "
2291 f"existing_value [{existing_value}] conflicts with "
2292 f"new value [{value}]"
2293 )
2294 elif value is not None or existing_value is not None:
2295 idx[key] = value
2296
2297 def set_info(self, info) -> None:
2298 """set my state from the passed info"""
2299 idx = info.get(self.name)
2300 if idx is not None:
2301 self.__dict__.update(idx)
2302
2303 def set_attr(self) -> None:
2304 """set the kind for this column"""
2305 setattr(self.attrs, self.kind_attr, self.kind)
2306
2307 def validate_metadata(self, handler: AppendableTable) -> None:
2308 """validate that kind=category does not change the categories"""
2309 if self.meta == "category":
2310 new_metadata = self.metadata
2311 cur_metadata = handler.read_metadata(self.cname)
2312 if (
2313 new_metadata is not None
2314 and cur_metadata is not None
2315 and not array_equivalent(
2316 new_metadata, cur_metadata, strict_nan=True, dtype_equal=True
2317 )
2318 ):
2319 raise ValueError(
2320 "cannot append a categorical with "
2321 "different categories to the existing"
2322 )
2323
2324 def write_metadata(self, handler: AppendableTable) -> None:
2325 """set the meta data"""
2326 if self.metadata is not None:
2327 handler.write_metadata(self.cname, self.metadata)
2328
2329
2330class GenericIndexCol(IndexCol):
2331 """an index which is not represented in the data of the table"""
2332
2333 @property
2334 def is_indexed(self) -> bool:
2335 return False
2336
2337 def convert(
2338 self, values: np.ndarray, nan_rep, encoding: str, errors: str
2339 ) -> tuple[Index, Index]:
2340 """
2341 Convert the data from this selection to the appropriate pandas type.
2342
2343 Parameters
2344 ----------
2345 values : np.ndarray
2346 nan_rep : str
2347 encoding : str
2348 errors : str
2349 """
2350 assert isinstance(values, np.ndarray), type(values)
2351
2352 index = RangeIndex(len(values))
2353 return index, index
2354
2355 def set_attr(self) -> None:
2356 pass
2357
2358
2359class DataCol(IndexCol):
2360 """
2361    a data-holding column; by definition this is not indexable
2362
2363 Parameters
2364 ----------
2365 data : the actual data
2366 cname : the column name in the table to hold the data (typically
2367 values)
2368 meta : a string description of the metadata
2369 metadata : the actual metadata
2370 """
2371
2372 is_an_indexable = False
2373 is_data_indexable = False
2374 _info_fields = ["tz", "ordered"]
2375
2376 def __init__(
2377 self,
2378 name: str,
2379 values=None,
2380 kind=None,
2381 typ=None,
2382 cname: str | None = None,
2383 pos=None,
2384 tz=None,
2385 ordered=None,
2386 table=None,
2387 meta=None,
2388 metadata=None,
2389 dtype: DtypeArg | None = None,
2390 data=None,
2391 ) -> None:
2392 super().__init__(
2393 name=name,
2394 values=values,
2395 kind=kind,
2396 typ=typ,
2397 pos=pos,
2398 cname=cname,
2399 tz=tz,
2400 ordered=ordered,
2401 table=table,
2402 meta=meta,
2403 metadata=metadata,
2404 )
2405 self.dtype = dtype
2406 self.data = data
2407
2408 @property
2409 def dtype_attr(self) -> str:
2410 return f"{self.name}_dtype"
2411
2412 @property
2413 def meta_attr(self) -> str:
2414 return f"{self.name}_meta"
2415
2416 def __repr__(self) -> str:
2417 temp = tuple(
2418 map(
2419 pprint_thing, (self.name, self.cname, self.dtype, self.kind, self.shape)
2420 )
2421 )
2422 return ",".join(
2423 [
2424 f"{key}->{value}"
2425 for key, value in zip(["name", "cname", "dtype", "kind", "shape"], temp)
2426 ]
2427 )
2428
2429 def __eq__(self, other: object) -> bool:
2430 """compare 2 col items"""
2431 return all(
2432 getattr(self, a, None) == getattr(other, a, None)
2433 for a in ["name", "cname", "dtype", "pos"]
2434 )
2435
2436 def set_data(self, data: ArrayLike) -> None:
2437 assert data is not None
2438 assert self.dtype is None
2439
2440 data, dtype_name = _get_data_and_dtype_name(data)
2441
2442 self.data = data
2443 self.dtype = dtype_name
2444 self.kind = _dtype_to_kind(dtype_name)
2445
2446 def take_data(self):
2447 """return the data"""
2448 return self.data
2449
2450 @classmethod
2451 def _get_atom(cls, values: ArrayLike) -> Col:
2452 """
2453 Get an appropriately typed and shaped pytables.Col object for values.
2454 """
2455 dtype = values.dtype
2456 # error: Item "ExtensionDtype" of "Union[ExtensionDtype, dtype[Any]]" has no
2457 # attribute "itemsize"
2458 itemsize = dtype.itemsize # type: ignore[union-attr]
2459
2460 shape = values.shape
2461 if values.ndim == 1:
2462 # EA, use block shape pretending it is 2D
2463 # TODO(EA2D): not necessary with 2D EAs
2464 shape = (1, values.size)
2465
2466 if isinstance(values, Categorical):
2467 codes = values.codes
2468 atom = cls.get_atom_data(shape, kind=codes.dtype.name)
2469 elif lib.is_np_dtype(dtype, "M") or isinstance(dtype, DatetimeTZDtype):
2470 atom = cls.get_atom_datetime64(shape)
2471 elif lib.is_np_dtype(dtype, "m"):
2472 atom = cls.get_atom_timedelta64(shape)
2473 elif is_complex_dtype(dtype):
2474 atom = _tables().ComplexCol(itemsize=itemsize, shape=shape[0])
2475 elif is_string_dtype(dtype):
2476 atom = cls.get_atom_string(shape, itemsize)
2477 else:
2478 atom = cls.get_atom_data(shape, kind=dtype.name)
2479
2480 return atom
2481
2482 @classmethod
2483 def get_atom_string(cls, shape, itemsize):
2484 return _tables().StringCol(itemsize=itemsize, shape=shape[0])
2485
2486 @classmethod
2487 def get_atom_coltype(cls, kind: str) -> type[Col]:
2488 """return the PyTables column class for this column"""
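        # e.g. kind="float64" -> Float64Col, kind="uint32" -> UInt32Col,
        # kind="period[M]" -> Int64Col (periods are stored as integers)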
2489 if kind.startswith("uint"):
2490 k4 = kind[4:]
2491 col_name = f"UInt{k4}Col"
2492 elif kind.startswith("period"):
2493 # we store as integer
2494 col_name = "Int64Col"
2495 else:
2496 kcap = kind.capitalize()
2497 col_name = f"{kcap}Col"
2498
2499 return getattr(_tables(), col_name)
2500
2501 @classmethod
2502 def get_atom_data(cls, shape, kind: str) -> Col:
2503 return cls.get_atom_coltype(kind=kind)(shape=shape[0])
2504
2505 @classmethod
2506 def get_atom_datetime64(cls, shape):
2507 return _tables().Int64Col(shape=shape[0])
2508
2509 @classmethod
2510 def get_atom_timedelta64(cls, shape):
2511 return _tables().Int64Col(shape=shape[0])
2512
2513 @property
2514 def shape(self):
2515 return getattr(self.data, "shape", None)
2516
2517 @property
2518 def cvalues(self):
2519 """return my cython values"""
2520 return self.data
2521
2522 def validate_attr(self, append) -> None:
2523 """validate that we have the same order as the existing & same dtype"""
2524 if append:
2525 existing_fields = getattr(self.attrs, self.kind_attr, None)
2526 if existing_fields is not None and existing_fields != list(self.values):
2527 raise ValueError("appended items do not match existing items in table!")
2528
2529 existing_dtype = getattr(self.attrs, self.dtype_attr, None)
2530 if existing_dtype is not None and existing_dtype != self.dtype:
2531 raise ValueError(
2532 "appended items dtype do not match existing items dtype in table!"
2533 )
2534
2535 def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
2536 """
2537 Convert the data from this selection to the appropriate pandas type.
2538
2539 Parameters
2540 ----------
2541 values : np.ndarray
2542 nan_rep :
2543 encoding : str
2544 errors : str
2545
2546 Returns
2547 -------
2548 index : listlike to become an Index
2549 data : ndarraylike to become a column
2550 """
2551 assert isinstance(values, np.ndarray), type(values)
2552
2553 # values is a recarray
2554 if values.dtype.fields is not None:
2555 values = values[self.cname]
2556
2557 assert self.typ is not None
2558 if self.dtype is None:
2559 # Note: in tests we never have timedelta64 or datetime64,
2560 # so the _get_data_and_dtype_name may be unnecessary
2561 converted, dtype_name = _get_data_and_dtype_name(values)
2562 kind = _dtype_to_kind(dtype_name)
2563 else:
2564 converted = values
2565 dtype_name = self.dtype
2566 kind = self.kind
2567
2568 assert isinstance(converted, np.ndarray) # for mypy
2569
2570 # use the meta if needed
2571 meta = _ensure_decoded(self.meta)
2572 metadata = self.metadata
2573 ordered = self.ordered
2574 tz = self.tz
2575
2576 assert dtype_name is not None
2577 # convert to the correct dtype
2578 dtype = _ensure_decoded(dtype_name)
2579
2580 # reverse converts
2581 if dtype.startswith("datetime64"):
2582 # recreate with tz if indicated
2583 converted = _set_tz(converted, tz, coerce=True)
2584
2585 elif dtype == "timedelta64":
2586 converted = np.asarray(converted, dtype="m8[ns]")
2587 elif dtype == "date":
2588 try:
2589 converted = np.asarray(
2590 [date.fromordinal(v) for v in converted], dtype=object
2591 )
2592 except ValueError:
2593 converted = np.asarray(
2594 [date.fromtimestamp(v) for v in converted], dtype=object
2595 )
2596
2597 elif meta == "category":
2598 # we have a categorical
2599 categories = metadata
2600 codes = converted.ravel()
2601
2602 # if we have stored a NaN in the categories
2603 # then strip it; in theory we could have BOTH
2604 # -1s in the codes and nulls :<
2605 if categories is None:
2606 # Handle case of NaN-only categorical columns in which case
2607 # the categories are an empty array; when this is stored,
2608 # pytables cannot write a zero-len array, so on readback
2609 # the categories would be None and `read_hdf()` would fail.
2610 categories = Index([], dtype=np.float64)
2611 else:
2612 mask = isna(categories)
2613 if mask.any():
2614 categories = categories[~mask]
2615 codes[codes != -1] -= mask.astype(int).cumsum()._values
2616
2617 converted = Categorical.from_codes(
2618 codes, categories=categories, ordered=ordered, validate=False
2619 )
2620
2621 else:
2622 try:
2623 converted = converted.astype(dtype, copy=False)
2624 except TypeError:
2625 converted = converted.astype("O", copy=False)
2626
2627 # convert nans / decode
2628 if _ensure_decoded(kind) == "string":
2629 converted = _unconvert_string_array(
2630 converted, nan_rep=nan_rep, encoding=encoding, errors=errors
2631 )
2632
2633 return self.values, converted
2634
2635 def set_attr(self) -> None:
2636        """set the kind, meta, and dtype attributes for this column"""
2637 setattr(self.attrs, self.kind_attr, self.values)
2638 setattr(self.attrs, self.meta_attr, self.meta)
2639 assert self.dtype is not None
2640 setattr(self.attrs, self.dtype_attr, self.dtype)
2641
2642
2643class DataIndexableCol(DataCol):
2644 """represent a data column that can be indexed"""
2645
2646 is_data_indexable = True
2647
2648 def validate_names(self) -> None:
2649 if not is_string_dtype(Index(self.values).dtype):
2650 # TODO: should the message here be more specifically non-str?
2651 raise ValueError("cannot have non-object label DataIndexableCol")
2652
2653 @classmethod
2654 def get_atom_string(cls, shape, itemsize):
2655 return _tables().StringCol(itemsize=itemsize)
2656
2657 @classmethod
2658 def get_atom_data(cls, shape, kind: str) -> Col:
2659 return cls.get_atom_coltype(kind=kind)()
2660
2661 @classmethod
2662 def get_atom_datetime64(cls, shape):
2663 return _tables().Int64Col()
2664
2665 @classmethod
2666 def get_atom_timedelta64(cls, shape):
2667 return _tables().Int64Col()
2668
2669
2670class GenericDataIndexableCol(DataIndexableCol):
2671 """represent a generic pytables data column"""
2672
2673
2674class Fixed:
2675 """
2676 represent an object in my store
2677 facilitate read/write of various types of objects
2678 this is an abstract base class
2679
2680 Parameters
2681 ----------
2682 parent : HDFStore
2683 group : Node
2684 The group node where the table resides.
2685 """
2686
2687 pandas_kind: str
2688 format_type: str = "fixed" # GH#30962 needed by dask
2689 obj_type: type[DataFrame | Series]
2690 ndim: int
2691 parent: HDFStore
2692 is_table: bool = False
2693
2694 def __init__(
2695 self,
2696 parent: HDFStore,
2697 group: Node,
2698 encoding: str | None = "UTF-8",
2699 errors: str = "strict",
2700 ) -> None:
2701 assert isinstance(parent, HDFStore), type(parent)
2702 assert _table_mod is not None # needed for mypy
2703 assert isinstance(group, _table_mod.Node), type(group)
2704 self.parent = parent
2705 self.group = group
2706 self.encoding = _ensure_encoding(encoding)
2707 self.errors = errors
2708
2709 @property
2710 def is_old_version(self) -> bool:
2711 return self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1
2712
2713 @property
2714 def version(self) -> tuple[int, int, int]:
2715 """compute and set our version"""
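        # e.g. a stored pandas_version of "0.15.2" parses to (0, 15, 2); a
        # two-part "0.10" becomes (0, 10, 0); a missing attribute falls back
        # to (0, 0, 0).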
2716 version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None))
2717 try:
2718 version = tuple(int(x) for x in version.split("."))
2719 if len(version) == 2:
2720 version = version + (0,)
2721 except AttributeError:
2722 version = (0, 0, 0)
2723 return version
2724
2725 @property
2726 def pandas_type(self):
2727 return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None))
2728
2729 def __repr__(self) -> str:
2730 """return a pretty representation of myself"""
2731 self.infer_axes()
2732 s = self.shape
2733 if s is not None:
2734 if isinstance(s, (list, tuple)):
2735 jshape = ",".join([pprint_thing(x) for x in s])
2736 s = f"[{jshape}]"
2737 return f"{self.pandas_type:12.12} (shape->{s})"
2738 return self.pandas_type
2739
2740 def set_object_info(self) -> None:
2741 """set my pandas type & version"""
2742 self.attrs.pandas_type = str(self.pandas_kind)
2743 self.attrs.pandas_version = str(_version)
2744
2745 def copy(self) -> Fixed:
2746 new_self = copy.copy(self)
2747 return new_self
2748
2749 @property
2750 def shape(self):
2751 return self.nrows
2752
2753 @property
2754 def pathname(self):
2755 return self.group._v_pathname
2756
2757 @property
2758 def _handle(self):
2759 return self.parent._handle
2760
2761 @property
2762 def _filters(self):
2763 return self.parent._filters
2764
2765 @property
2766 def _complevel(self) -> int:
2767 return self.parent._complevel
2768
2769 @property
2770 def _fletcher32(self) -> bool:
2771 return self.parent._fletcher32
2772
2773 @property
2774 def attrs(self):
2775 return self.group._v_attrs
2776
2777 def set_attrs(self) -> None:
2778 """set our object attributes"""
2779
2780 def get_attrs(self) -> None:
2781 """get our object attributes"""
2782
2783 @property
2784 def storable(self):
2785 """return my storable"""
2786 return self.group
2787
2788 @property
2789 def is_exists(self) -> bool:
2790 return False
2791
2792 @property
2793 def nrows(self):
2794 return getattr(self.storable, "nrows", None)
2795
2796 def validate(self, other) -> Literal[True] | None:
2797 """validate against an existing storable"""
2798 if other is None:
2799 return None
2800 return True
2801
2802 def validate_version(self, where=None) -> None:
2803 """are we trying to operate on an old version?"""
2804
2805 def infer_axes(self) -> bool:
2806 """
2807 infer the axes of my storer
2808 return a boolean indicating if we have a valid storer or not
2809 """
2810 s = self.storable
2811 if s is None:
2812 return False
2813 self.get_attrs()
2814 return True
2815
2816 def read(
2817 self,
2818 where=None,
2819 columns=None,
2820 start: int | None = None,
2821 stop: int | None = None,
2822 ):
2823 raise NotImplementedError(
2824 "cannot read on an abstract storer: subclasses should implement"
2825 )
2826
2827 def write(self, obj, **kwargs) -> None:
2828 raise NotImplementedError(
2829 "cannot write on an abstract storer: subclasses should implement"
2830 )
2831
2832 def delete(
2833 self, where=None, start: int | None = None, stop: int | None = None
2834 ) -> None:
2835 """
2836        support fully deleting the node in its entirety (only); the where
2837        specification must be None
2838 """
2839 if com.all_none(where, start, stop):
2840 self._handle.remove_node(self.group, recursive=True)
2841 return None
2842
2843 raise TypeError("cannot delete on an abstract storer")
2844
2845
2846class GenericFixed(Fixed):
2847 """a generified fixed version"""
2848
2849 _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"}
2850 _reverse_index_map = {v: k for k, v in _index_type_map.items()}
2851 attributes: list[str] = []
2852
2853 # indexer helpers
2854 def _class_to_alias(self, cls) -> str:
2855 return self._index_type_map.get(cls, "")
2856
2857 def _alias_to_class(self, alias):
2858 if isinstance(alias, type): # pragma: no cover
2859 # compat: for a short period of time master stored types
2860 return alias
2861 return self._reverse_index_map.get(alias, Index)
2862
2863 def _get_index_factory(self, attrs):
2864 index_class = self._alias_to_class(
2865 _ensure_decoded(getattr(attrs, "index_class", ""))
2866 )
2867
2868 factory: Callable
2869
2870 if index_class == DatetimeIndex:
2871
2872 def f(values, freq=None, tz=None):
2873 # data are already in UTC, localize and convert if tz present
2874 dta = DatetimeArray._simple_new(
2875 values.values, dtype=values.dtype, freq=freq
2876 )
2877 result = DatetimeIndex._simple_new(dta, name=None)
2878 if tz is not None:
2879 result = result.tz_localize("UTC").tz_convert(tz)
2880 return result
2881
2882 factory = f
2883 elif index_class == PeriodIndex:
2884
2885 def f(values, freq=None, tz=None):
2886 dtype = PeriodDtype(freq)
2887 parr = PeriodArray._simple_new(values, dtype=dtype)
2888 return PeriodIndex._simple_new(parr, name=None)
2889
2890 factory = f
2891 else:
2892 factory = index_class
2893
2894 kwargs = {}
2895 if "freq" in attrs:
2896 kwargs["freq"] = attrs["freq"]
2897 if index_class is Index:
2898            # DTI/PI would have been returned by _alias_to_class
2899 factory = TimedeltaIndex
2900
2901 if "tz" in attrs:
2902 if isinstance(attrs["tz"], bytes):
2903 # created by python2
2904 kwargs["tz"] = attrs["tz"].decode("utf-8")
2905 else:
2906 # created by python3
2907 kwargs["tz"] = attrs["tz"]
2908 assert index_class is DatetimeIndex # just checking
2909
2910 return factory, kwargs
2911
2912 def validate_read(self, columns, where) -> None:
2913 """
2914        raise if any keywords are passed which are not None
2915 """
2916 if columns is not None:
2917 raise TypeError(
2918 "cannot pass a column specification when reading "
2919 "a Fixed format store. this store must be selected in its entirety"
2920 )
2921 if where is not None:
2922 raise TypeError(
2923 "cannot pass a where specification when reading "
2924 "from a Fixed format store. this store must be selected in its entirety"
2925 )
2926
2927 @property
2928 def is_exists(self) -> bool:
2929 return True
2930
2931 def set_attrs(self) -> None:
2932 """set our object attributes"""
2933 self.attrs.encoding = self.encoding
2934 self.attrs.errors = self.errors
2935
2936 def get_attrs(self) -> None:
2937 """retrieve our attributes"""
2938 self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
2939 self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
2940 for n in self.attributes:
2941 setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None)))
2942
2943 def write(self, obj, **kwargs) -> None:
2944 self.set_attrs()
2945
2946 def read_array(self, key: str, start: int | None = None, stop: int | None = None):
2947        """read an array for the specified node (off of the group)"""
2948 import tables
2949
2950 node = getattr(self.group, key)
2951 attrs = node._v_attrs
2952
2953 transposed = getattr(attrs, "transposed", False)
2954
2955 if isinstance(node, tables.VLArray):
2956 ret = node[0][start:stop]
2957 else:
2958 dtype = _ensure_decoded(getattr(attrs, "value_type", None))
2959 shape = getattr(attrs, "shape", None)
2960
2961 if shape is not None:
2962 # length 0 axis
2963 ret = np.empty(shape, dtype=dtype)
2964 else:
2965 ret = node[start:stop]
2966
2967 if dtype and dtype.startswith("datetime64"):
2968 # reconstruct a timezone if indicated
2969 tz = getattr(attrs, "tz", None)
2970 ret = _set_tz(ret, tz, coerce=True)
2971
2972 elif dtype == "timedelta64":
2973 ret = np.asarray(ret, dtype="m8[ns]")
2974
2975 if transposed:
2976 return ret.T
2977 else:
2978 return ret
2979
2980 def read_index(
2981 self, key: str, start: int | None = None, stop: int | None = None
2982 ) -> Index:
2983 variety = _ensure_decoded(getattr(self.attrs, f"{key}_variety"))
2984
2985 if variety == "multi":
2986 return self.read_multi_index(key, start=start, stop=stop)
2987 elif variety == "regular":
2988 node = getattr(self.group, key)
2989 index = self.read_index_node(node, start=start, stop=stop)
2990 return index
2991 else: # pragma: no cover
2992 raise TypeError(f"unrecognized index variety: {variety}")
2993
2994 def write_index(self, key: str, index: Index) -> None:
2995 if isinstance(index, MultiIndex):
2996 setattr(self.attrs, f"{key}_variety", "multi")
2997 self.write_multi_index(key, index)
2998 else:
2999 setattr(self.attrs, f"{key}_variety", "regular")
3000 converted = _convert_index("index", index, self.encoding, self.errors)
3001
3002 self.write_array(key, converted.values)
3003
3004 node = getattr(self.group, key)
3005 node._v_attrs.kind = converted.kind
3006 node._v_attrs.name = index.name
3007
3008 if isinstance(index, (DatetimeIndex, PeriodIndex)):
3009 node._v_attrs.index_class = self._class_to_alias(type(index))
3010
3011 if isinstance(index, (DatetimeIndex, PeriodIndex, TimedeltaIndex)):
3012 node._v_attrs.freq = index.freq
3013
3014 if isinstance(index, DatetimeIndex) and index.tz is not None:
3015 node._v_attrs.tz = _get_tz(index.tz)
3016
3017 def write_multi_index(self, key: str, index: MultiIndex) -> None:
3018 setattr(self.attrs, f"{key}_nlevels", index.nlevels)
3019
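        # Resulting layout (a sketch for key="index" with two levels): the
        # group gains arrays "index_level0"/"index_level1" for the level
        # values and "index_label0"/"index_label1" for the codes, in addition
        # to the "index_nlevels" attribute written above.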
3020 for i, (lev, level_codes, name) in enumerate(
3021 zip(index.levels, index.codes, index.names)
3022 ):
3023 # write the level
3024 if isinstance(lev.dtype, ExtensionDtype):
3025 raise NotImplementedError(
3026 "Saving a MultiIndex with an extension dtype is not supported."
3027 )
3028 level_key = f"{key}_level{i}"
3029 conv_level = _convert_index(level_key, lev, self.encoding, self.errors)
3030 self.write_array(level_key, conv_level.values)
3031 node = getattr(self.group, level_key)
3032 node._v_attrs.kind = conv_level.kind
3033 node._v_attrs.name = name
3034
3035 # write the name
3036 setattr(node._v_attrs, f"{key}_name{name}", name)
3037
3038 # write the labels
3039 label_key = f"{key}_label{i}"
3040 self.write_array(label_key, level_codes)
3041
3042 def read_multi_index(
3043 self, key: str, start: int | None = None, stop: int | None = None
3044 ) -> MultiIndex:
3045 nlevels = getattr(self.attrs, f"{key}_nlevels")
3046
3047 levels = []
3048 codes = []
3049 names: list[Hashable] = []
3050 for i in range(nlevels):
3051 level_key = f"{key}_level{i}"
3052 node = getattr(self.group, level_key)
3053 lev = self.read_index_node(node, start=start, stop=stop)
3054 levels.append(lev)
3055 names.append(lev.name)
3056
3057 label_key = f"{key}_label{i}"
3058 level_codes = self.read_array(label_key, start=start, stop=stop)
3059 codes.append(level_codes)
3060
3061 return MultiIndex(
3062 levels=levels, codes=codes, names=names, verify_integrity=True
3063 )
3064
3065 def read_index_node(
3066 self, node: Node, start: int | None = None, stop: int | None = None
3067 ) -> Index:
3068 data = node[start:stop]
3069 # If the index was an empty array write_array_empty() will
3070 # have written a sentinel. Here we replace it with the original.
3071 if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0:
3072 data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type)
3073 kind = _ensure_decoded(node._v_attrs.kind)
3074 name = None
3075
3076 if "name" in node._v_attrs:
3077 name = _ensure_str(node._v_attrs.name)
3078 name = _ensure_decoded(name)
3079
3080 attrs = node._v_attrs
3081 factory, kwargs = self._get_index_factory(attrs)
3082
3083 if kind in ("date", "object"):
3084 index = factory(
3085 _unconvert_index(
3086 data, kind, encoding=self.encoding, errors=self.errors
3087 ),
3088 dtype=object,
3089 **kwargs,
3090 )
3091 else:
3092 index = factory(
3093 _unconvert_index(
3094 data, kind, encoding=self.encoding, errors=self.errors
3095 ),
3096 **kwargs,
3097 )
3098
3099 index.name = name
3100
3101 return index
3102
3103 def write_array_empty(self, key: str, value: ArrayLike) -> None:
3104 """write a 0-len array"""
3105 # ugly hack for length 0 axes
3106 arr = np.empty((1,) * value.ndim)
3107 self._handle.create_array(self.group, key, arr)
3108 node = getattr(self.group, key)
3109 node._v_attrs.value_type = str(value.dtype)
3110 node._v_attrs.shape = value.shape
3111
3112 def write_array(
3113 self, key: str, obj: AnyArrayLike, items: Index | None = None
3114 ) -> None:
3115 # TODO: we only have a few tests that get here, the only EA
3116 # that gets passed is DatetimeArray, and we never have
3117 # both self._filters and EA
3118
3119 value = extract_array(obj, extract_numpy=True)
3120
3121 if key in self.group:
3122 self._handle.remove_node(self.group, key)
3123
3124 # Transform needed to interface with pytables row/col notation
3125 empty_array = value.size == 0
3126 transposed = False
3127
3128 if isinstance(value.dtype, CategoricalDtype):
3129 raise NotImplementedError(
3130 "Cannot store a category dtype in a HDF5 dataset that uses format="
3131 '"fixed". Use format="table".'
3132 )
3133 if not empty_array:
3134 if hasattr(value, "T"):
3135 # ExtensionArrays (1d) may not have transpose.
3136 value = value.T
3137 transposed = True
3138
3139 atom = None
3140 if self._filters is not None:
3141 with suppress(ValueError):
3142 # get the atom for this datatype
3143 atom = _tables().Atom.from_dtype(value.dtype)
3144
3145 if atom is not None:
3146 # We only get here if self._filters is non-None and
3147 # the Atom.from_dtype call succeeded
3148
3149 # create an empty chunked array and fill it from value
3150 if not empty_array:
3151 ca = self._handle.create_carray(
3152 self.group, key, atom, value.shape, filters=self._filters
3153 )
3154 ca[:] = value
3155
3156 else:
3157 self.write_array_empty(key, value)
3158
3159 elif value.dtype.type == np.object_:
3160 # infer the type, warn if we have a non-string type here (for
3161 # performance)
3162 inferred_type = lib.infer_dtype(value, skipna=False)
3163 if empty_array:
3164 pass
3165 elif inferred_type == "string":
3166 pass
3167 else:
3168 ws = performance_doc % (inferred_type, key, items)
3169 warnings.warn(ws, PerformanceWarning, stacklevel=find_stack_level())
3170
3171 vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom())
3172 vlarr.append(value)
3173
3174 elif lib.is_np_dtype(value.dtype, "M"):
3175 self._handle.create_array(self.group, key, value.view("i8"))
3176 getattr(self.group, key)._v_attrs.value_type = str(value.dtype)
3177 elif isinstance(value.dtype, DatetimeTZDtype):
3178 # store as UTC
3179 # with a zone
3180
3181 # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no
3182 # attribute "asi8"
3183 self._handle.create_array(
3184 self.group, key, value.asi8 # type: ignore[union-attr]
3185 )
3186
3187 node = getattr(self.group, key)
3188 # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no
3189 # attribute "tz"
3190 node._v_attrs.tz = _get_tz(value.tz) # type: ignore[union-attr]
3191 node._v_attrs.value_type = f"datetime64[{value.dtype.unit}]"
3192 elif lib.is_np_dtype(value.dtype, "m"):
3193 self._handle.create_array(self.group, key, value.view("i8"))
3194 getattr(self.group, key)._v_attrs.value_type = "timedelta64"
3195 elif empty_array:
3196 self.write_array_empty(key, value)
3197 else:
3198 self._handle.create_array(self.group, key, value)
3199
3200 getattr(self.group, key)._v_attrs.transposed = transposed
3201
3202
3203class SeriesFixed(GenericFixed):
3204 pandas_kind = "series"
3205 attributes = ["name"]
3206
3207 name: Hashable
3208
3209 @property
3210 def shape(self):
3211 try:
3212 return (len(self.group.values),)
3213 except (TypeError, AttributeError):
3214 return None
3215
3216 def read(
3217 self,
3218 where=None,
3219 columns=None,
3220 start: int | None = None,
3221 stop: int | None = None,
3222 ) -> Series:
3223 self.validate_read(columns, where)
3224 index = self.read_index("index", start=start, stop=stop)
3225 values = self.read_array("values", start=start, stop=stop)
3226 result = Series(values, index=index, name=self.name, copy=False)
3227 if using_pyarrow_string_dtype() and is_string_array(values, skipna=True):
3228 result = result.astype("string[pyarrow_numpy]")
3229 return result
3230
3231 def write(self, obj, **kwargs) -> None:
3232 super().write(obj, **kwargs)
3233 self.write_index("index", obj.index)
3234 self.write_array("values", obj)
3235 self.attrs.name = obj.name
3236
3237
3238class BlockManagerFixed(GenericFixed):
3239 attributes = ["ndim", "nblocks"]
3240
3241 nblocks: int
3242
3243 @property
3244 def shape(self) -> Shape | None:
3245 try:
3246 ndim = self.ndim
3247
3248 # items
3249 items = 0
3250 for i in range(self.nblocks):
3251 node = getattr(self.group, f"block{i}_items")
3252 shape = getattr(node, "shape", None)
3253 if shape is not None:
3254 items += shape[0]
3255
3256 # data shape
3257 node = self.group.block0_values
3258 shape = getattr(node, "shape", None)
3259 if shape is not None:
3260 shape = list(shape[0 : (ndim - 1)])
3261 else:
3262 shape = []
3263
3264 shape.append(items)
3265
3266 return shape
3267 except AttributeError:
3268 return None
3269
3270 def read(
3271 self,
3272 where=None,
3273 columns=None,
3274 start: int | None = None,
3275 stop: int | None = None,
3276 ) -> DataFrame:
3277 # start, stop applied to rows, so 0th axis only
3278 self.validate_read(columns, where)
3279 select_axis = self.obj_type()._get_block_manager_axis(0)
3280
3281 axes = []
3282 for i in range(self.ndim):
3283 _start, _stop = (start, stop) if i == select_axis else (None, None)
3284 ax = self.read_index(f"axis{i}", start=_start, stop=_stop)
3285 axes.append(ax)
3286
3287 items = axes[0]
3288 dfs = []
3289
3290 for i in range(self.nblocks):
3291 blk_items = self.read_index(f"block{i}_items")
3292 values = self.read_array(f"block{i}_values", start=_start, stop=_stop)
3293
3294 columns = items[items.get_indexer(blk_items)]
3295 df = DataFrame(values.T, columns=columns, index=axes[1], copy=False)
3296 if using_pyarrow_string_dtype() and is_string_array(values, skipna=True):
3297 df = df.astype("string[pyarrow_numpy]")
3298 dfs.append(df)
3299
3300 if len(dfs) > 0:
3301 out = concat(dfs, axis=1, copy=True)
3302 if using_copy_on_write():
3303 # with CoW, concat ignores the copy keyword. Here, we still want
3304 # to copy to enforce optimized column-major layout
3305 out = out.copy()
3306 out = out.reindex(columns=items, copy=False)
3307 return out
3308
3309 return DataFrame(columns=axes[0], index=axes[1])
3310
3311 def write(self, obj, **kwargs) -> None:
3312 super().write(obj, **kwargs)
3313
3314 # TODO(ArrayManager) HDFStore relies on accessing the blocks
3315 if isinstance(obj._mgr, ArrayManager):
3316 obj = obj._as_manager("block")
3317
3318 data = obj._mgr
3319 if not data.is_consolidated():
3320 data = data.consolidate()
3321
3322 self.attrs.ndim = data.ndim
3323 for i, ax in enumerate(data.axes):
3324 if i == 0 and (not ax.is_unique):
3325 raise ValueError("Columns index has to be unique for fixed format")
3326 self.write_index(f"axis{i}", ax)
3327
3328 # Supporting mixed-type DataFrame objects...nontrivial
3329 self.attrs.nblocks = len(data.blocks)
3330 for i, blk in enumerate(data.blocks):
3331 # I have no idea why, but writing values before items fixed #2299
3332 blk_items = data.items.take(blk.mgr_locs)
3333 self.write_array(f"block{i}_values", blk.values, items=blk_items)
3334 self.write_index(f"block{i}_items", blk_items)
3335
3336
3337class FrameFixed(BlockManagerFixed):
3338 pandas_kind = "frame"
3339 obj_type = DataFrame
3340
3341
3342class Table(Fixed):
3343 """
3344 represent a table:
3345 facilitate read/write of various types of tables
3346
3347 Attrs in Table Node
3348 -------------------
3349    These are attributes that are stored in the main table node; they are
3350    necessary to recreate these tables when read back in.
3351
3352 index_axes : a list of tuples of the (original indexing axis and
3353 index column)
3354 non_index_axes: a list of tuples of the (original index axis and
3355 columns on a non-indexing axis)
3356 values_axes : a list of the columns which comprise the data of this
3357 table
3358 data_columns : a list of the columns that we are allowing indexing
3359 (these become single columns in values_axes)
3360 nan_rep : the string to use for nan representations for string
3361 objects
3362 levels : the names of levels
3363 metadata : the names of the metadata columns
3364 """
3365
3366 pandas_kind = "wide_table"
3367 format_type: str = "table" # GH#30962 needed by dask
3368 table_type: str
3369 levels: int | list[Hashable] = 1
3370 is_table = True
3371
3372 metadata: list
3373
3374 def __init__(
3375 self,
3376 parent: HDFStore,
3377 group: Node,
3378 encoding: str | None = None,
3379 errors: str = "strict",
3380 index_axes: list[IndexCol] | None = None,
3381 non_index_axes: list[tuple[AxisInt, Any]] | None = None,
3382 values_axes: list[DataCol] | None = None,
3383 data_columns: list | None = None,
3384 info: dict | None = None,
3385 nan_rep=None,
3386 ) -> None:
3387 super().__init__(parent, group, encoding=encoding, errors=errors)
3388 self.index_axes = index_axes or []
3389 self.non_index_axes = non_index_axes or []
3390 self.values_axes = values_axes or []
3391 self.data_columns = data_columns or []
3392 self.info = info or {}
3393 self.nan_rep = nan_rep
3394
3395 @property
3396 def table_type_short(self) -> str:
3397 return self.table_type.split("_")[0]
3398
3399 def __repr__(self) -> str:
3400 """return a pretty representation of myself"""
3401 self.infer_axes()
3402 jdc = ",".join(self.data_columns) if len(self.data_columns) else ""
3403 dc = f",dc->[{jdc}]"
3404
3405 ver = ""
3406 if self.is_old_version:
3407 jver = ".".join([str(x) for x in self.version])
3408 ver = f"[{jver}]"
3409
3410 jindex_axes = ",".join([a.name for a in self.index_axes])
3411 return (
3412 f"{self.pandas_type:12.12}{ver} "
3413 f"(typ->{self.table_type_short},nrows->{self.nrows},"
3414 f"ncols->{self.ncols},indexers->[{jindex_axes}]{dc})"
3415 )
3416
3417 def __getitem__(self, c: str):
3418 """return the axis for c"""
3419 for a in self.axes:
3420 if c == a.name:
3421 return a
3422 return None
3423
3424 def validate(self, other) -> None:
3425 """validate against an existing table"""
3426 if other is None:
3427 return
3428
3429 if other.table_type != self.table_type:
3430 raise TypeError(
3431 "incompatible table_type with existing "
3432 f"[{other.table_type} - {self.table_type}]"
3433 )
3434
3435 for c in ["index_axes", "non_index_axes", "values_axes"]:
3436 sv = getattr(self, c, None)
3437 ov = getattr(other, c, None)
3438 if sv != ov:
3439 # show the error for the specific axes
3440 # Argument 1 to "enumerate" has incompatible type
3441 # "Optional[Any]"; expected "Iterable[Any]" [arg-type]
3442 for i, sax in enumerate(sv): # type: ignore[arg-type]
3443 # Value of type "Optional[Any]" is not indexable [index]
3444 oax = ov[i] # type: ignore[index]
3445 if sax != oax:
3446 raise ValueError(
3447 f"invalid combination of [{c}] on appending data "
3448 f"[{sax}] vs current table [{oax}]"
3449 )
3450
3451 # should never get here
3452 raise Exception(
3453 f"invalid combination of [{c}] on appending data [{sv}] vs "
3454 f"current table [{ov}]"
3455 )
3456
3457 @property
3458 def is_multi_index(self) -> bool:
3459 """the levels attribute is 1 or a list in the case of a multi-index"""
3460 return isinstance(self.levels, list)
3461
3462 def validate_multiindex(
3463 self, obj: DataFrame | Series
3464 ) -> tuple[DataFrame, list[Hashable]]:
3465 """
3466 validate that we can store the multi-index; reset and return the
3467 new object
3468 """
3469 levels = com.fill_missing_names(obj.index.names)
3470 try:
3471 reset_obj = obj.reset_index()
3472 except ValueError as err:
3473 raise ValueError(
3474 "duplicate names/columns in the multi-index when storing as a table"
3475 ) from err
3476 assert isinstance(reset_obj, DataFrame) # for mypy
3477 return reset_obj, levels
3478
3479 @property
3480 def nrows_expected(self) -> int:
3481 """based on our axes, compute the expected nrows"""
3482 return np.prod([i.cvalues.shape[0] for i in self.index_axes])
3483
3484 @property
3485 def is_exists(self) -> bool:
3486 """has this table been created"""
3487 return "table" in self.group
3488
3489 @property
3490 def storable(self):
3491 return getattr(self.group, "table", None)
3492
3493 @property
3494 def table(self):
3495 """return the table group (this is my storable)"""
3496 return self.storable
3497
3498 @property
3499 def dtype(self):
3500 return self.table.dtype
3501
3502 @property
3503 def description(self):
3504 return self.table.description
3505
3506 @property
3507 def axes(self) -> itertools.chain[IndexCol]:
3508 return itertools.chain(self.index_axes, self.values_axes)
3509
3510 @property
3511 def ncols(self) -> int:
3512 """the number of total columns in the values axes"""
3513 return sum(len(a.values) for a in self.values_axes)
3514
3515 @property
3516 def is_transposed(self) -> bool:
3517 return False
3518
3519 @property
3520 def data_orientation(self) -> tuple[int, ...]:
3521        """return a tuple of my permuted axes, non_indexable at the front"""
3522 return tuple(
3523 itertools.chain(
3524 [int(a[0]) for a in self.non_index_axes],
3525 [int(a.axis) for a in self.index_axes],
3526 )
3527 )
3528
3529 def queryables(self) -> dict[str, Any]:
3530        """return a dict of the kinds of allowable columns for this object"""
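        # Illustrative result for a frame table with data_columns=["A"]:
        #   {"index": <IndexCol>, "columns": None, "A": <DataIndexableCol>}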
3531 # mypy doesn't recognize DataFrame._AXIS_NAMES, so we re-write it here
3532 axis_names = {0: "index", 1: "columns"}
3533
3534 # compute the values_axes queryables
3535 d1 = [(a.cname, a) for a in self.index_axes]
3536 d2 = [(axis_names[axis], None) for axis, values in self.non_index_axes]
3537 d3 = [
3538 (v.cname, v) for v in self.values_axes if v.name in set(self.data_columns)
3539 ]
3540
3541 return dict(d1 + d2 + d3)
3542
3543 def index_cols(self):
3544 """return a list of my index cols"""
3545 # Note: each `i.cname` below is assured to be a str.
3546 return [(i.axis, i.cname) for i in self.index_axes]
3547
3548 def values_cols(self) -> list[str]:
3549 """return a list of my values cols"""
3550 return [i.cname for i in self.values_axes]
3551
3552 def _get_metadata_path(self, key: str) -> str:
3553 """return the metadata pathname for this key"""
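        # e.g. a table stored at "/df" with key "values_block_1" yields
        # "/df/meta/values_block_1/meta" (illustrative key and path)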
3554 group = self.group._v_pathname
3555 return f"{group}/meta/{key}/meta"
3556
3557 def write_metadata(self, key: str, values: np.ndarray) -> None:
3558 """
3559 Write out a metadata array to the key as a fixed-format Series.
3560
3561 Parameters
3562 ----------
3563 key : str
3564 values : ndarray
3565 """
3566 self.parent.put(
3567 self._get_metadata_path(key),
3568 Series(values, copy=False),
3569 format="table",
3570 encoding=self.encoding,
3571 errors=self.errors,
3572 nan_rep=self.nan_rep,
3573 )
3574
3575 def read_metadata(self, key: str):
3576 """return the meta data array for this key"""
3577 if getattr(getattr(self.group, "meta", None), key, None) is not None:
3578 return self.parent.select(self._get_metadata_path(key))
3579 return None
3580
3581 def set_attrs(self) -> None:
3582 """set our table type & indexables"""
3583 self.attrs.table_type = str(self.table_type)
3584 self.attrs.index_cols = self.index_cols()
3585 self.attrs.values_cols = self.values_cols()
3586 self.attrs.non_index_axes = self.non_index_axes
3587 self.attrs.data_columns = self.data_columns
3588 self.attrs.nan_rep = self.nan_rep
3589 self.attrs.encoding = self.encoding
3590 self.attrs.errors = self.errors
3591 self.attrs.levels = self.levels
3592 self.attrs.info = self.info
3593
3594 def get_attrs(self) -> None:
3595 """retrieve our attributes"""
3596 self.non_index_axes = getattr(self.attrs, "non_index_axes", None) or []
3597 self.data_columns = getattr(self.attrs, "data_columns", None) or []
3598 self.info = getattr(self.attrs, "info", None) or {}
3599 self.nan_rep = getattr(self.attrs, "nan_rep", None)
3600 self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
3601 self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
3602 self.levels: list[Hashable] = getattr(self.attrs, "levels", None) or []
3603 self.index_axes = [a for a in self.indexables if a.is_an_indexable]
3604 self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
3605
3606 def validate_version(self, where=None) -> None:
3607 """are we trying to operate on an old version?"""
3608 if where is not None:
3609 if self.is_old_version:
3610 ws = incompatibility_doc % ".".join([str(x) for x in self.version])
3611 warnings.warn(
3612 ws,
3613 IncompatibilityWarning,
3614 stacklevel=find_stack_level(),
3615 )
3616
3617 def validate_min_itemsize(self, min_itemsize) -> None:
3618 """
3619        validate that min_itemsize doesn't contain items that are not in the
3620        axes; this needs data_columns to be defined
3621 """
3622 if min_itemsize is None:
3623 return
3624 if not isinstance(min_itemsize, dict):
3625 return
3626
3627 q = self.queryables()
3628 for k in min_itemsize:
3629 # ok, apply generally
3630 if k == "values":
3631 continue
3632 if k not in q:
3633 raise ValueError(
3634 f"min_itemsize has the key [{k}] which is not an axis or "
3635 "data_column"
3636 )
3637
3638 @cache_readonly
3639 def indexables(self):
3640 """create/cache the indexables if they don't exist"""
3641 _indexables = []
3642
3643 desc = self.description
3644 table_attrs = self.table.attrs
3645
3646 # Note: each of the `name` kwargs below are str, ensured
3647 # by the definition in index_cols.
3648 # index columns
3649 for i, (axis, name) in enumerate(self.attrs.index_cols):
3650 atom = getattr(desc, name)
3651 md = self.read_metadata(name)
3652 meta = "category" if md is not None else None
3653
3654 kind_attr = f"{name}_kind"
3655 kind = getattr(table_attrs, kind_attr, None)
3656
3657 index_col = IndexCol(
3658 name=name,
3659 axis=axis,
3660 pos=i,
3661 kind=kind,
3662 typ=atom,
3663 table=self.table,
3664 meta=meta,
3665 metadata=md,
3666 )
3667 _indexables.append(index_col)
3668
3669 # values columns
3670 dc = set(self.data_columns)
3671 base_pos = len(_indexables)
3672
3673 def f(i, c):
3674 assert isinstance(c, str)
3675 klass = DataCol
3676 if c in dc:
3677 klass = DataIndexableCol
3678
3679 atom = getattr(desc, c)
3680 adj_name = _maybe_adjust_name(c, self.version)
3681
3682 # TODO: why kind_attr here?
3683 values = getattr(table_attrs, f"{adj_name}_kind", None)
3684 dtype = getattr(table_attrs, f"{adj_name}_dtype", None)
3685 # Argument 1 to "_dtype_to_kind" has incompatible type
3686 # "Optional[Any]"; expected "str" [arg-type]
3687 kind = _dtype_to_kind(dtype) # type: ignore[arg-type]
3688
3689 md = self.read_metadata(c)
3690            # TODO: figure out why these two versions of `meta` don't always match.
3691 # meta = "category" if md is not None else None
3692 meta = getattr(table_attrs, f"{adj_name}_meta", None)
3693
3694 obj = klass(
3695 name=adj_name,
3696 cname=c,
3697 values=values,
3698 kind=kind,
3699 pos=base_pos + i,
3700 typ=atom,
3701 table=self.table,
3702 meta=meta,
3703 metadata=md,
3704 dtype=dtype,
3705 )
3706 return obj
3707
3708 # Note: the definition of `values_cols` ensures that each
3709 # `c` below is a str.
3710 _indexables.extend([f(i, c) for i, c in enumerate(self.attrs.values_cols)])
3711
3712 return _indexables
3713
3714 def create_index(
3715 self, columns=None, optlevel=None, kind: str | None = None
3716 ) -> None:
3717 """
3718 Create a pytables index on the specified columns.
3719
3720 Parameters
3721 ----------
3722 columns : None, bool, or listlike[str]
3723 Indicate which columns to create an index on.
3724
3725 * False : Do not create any indexes.
3726 * True : Create indexes on all columns.
3727 * None : Create indexes on all columns.
3728 * listlike : Create indexes on the given columns.
3729
3730 optlevel : int or None, default None
3731 Optimization level, if None, pytables defaults to 6.
3732 kind : str or None, default None
3733 Kind of index, if None, pytables defaults to "medium".
3734
3735 Raises
3736 ------
3737 TypeError if trying to create an index on a complex-type column.
3738
3739 Notes
3740 -----
3741 Cannot index Time64Col or ComplexCol.
3742 Pytables must be >= 3.0.
3743 """
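        # Typical entry point (editorial sketch): users reach this via
        # HDFStore.create_table_index, e.g.
        #   store.append("df", df, data_columns=["A"])
        #   store.create_table_index("df", columns=["A"], optlevel=9, kind="full")
        # which forwards the same columns/optlevel/kind arguments here.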
3744 if not self.infer_axes():
3745 return
3746 if columns is False:
3747 return
3748
3749 # index all indexables and data_columns
3750 if columns is None or columns is True:
3751 columns = [a.cname for a in self.axes if a.is_data_indexable]
3752 if not isinstance(columns, (tuple, list)):
3753 columns = [columns]
3754
3755 kw = {}
3756 if optlevel is not None:
3757 kw["optlevel"] = optlevel
3758 if kind is not None:
3759 kw["kind"] = kind
3760
3761 table = self.table
3762 for c in columns:
3763 v = getattr(table.cols, c, None)
3764 if v is not None:
3765 # remove the index if the kind/optlevel have changed
3766 if v.is_indexed:
3767 index = v.index
3768 cur_optlevel = index.optlevel
3769 cur_kind = index.kind
3770
3771 if kind is not None and cur_kind != kind:
3772 v.remove_index()
3773 else:
3774 kw["kind"] = cur_kind
3775
3776 if optlevel is not None and cur_optlevel != optlevel:
3777 v.remove_index()
3778 else:
3779 kw["optlevel"] = cur_optlevel
3780
3781 # create the index
3782 if not v.is_indexed:
3783 if v.type.startswith("complex"):
3784 raise TypeError(
3785 "Columns containing complex values can be stored but "
3786 "cannot be indexed when using table format. Either use "
3787 "fixed format, set index=False, or do not include "
3788 "the columns containing complex values to "
3789 "data_columns when initializing the table."
3790 )
3791 v.create_index(**kw)
3792 elif c in self.non_index_axes[0][1]:
3793 # GH 28156
3794 raise AttributeError(
3795 f"column {c} is not a data_column.\n"
3796 f"In order to read column {c} you must reload the dataframe \n"
3797 f"into HDFStore and include {c} with the data_columns argument."
3798 )
3799
3800 def _read_axes(
3801 self, where, start: int | None = None, stop: int | None = None
3802 ) -> list[tuple[np.ndarray, np.ndarray] | tuple[Index, Index]]:
3803 """
3804 Create the axes sniffed from the table.
3805
3806 Parameters
3807 ----------
3808 where : ???
3809 start : int or None, default None
3810 stop : int or None, default None
3811
3812 Returns
3813 -------
3814 List[Tuple[index_values, column_values]]
3815 """
3816 # create the selection
3817 selection = Selection(self, where=where, start=start, stop=stop)
3818 values = selection.select()
3819
3820 results = []
3821 # convert the data
3822 for a in self.axes:
3823 a.set_info(self.info)
3824 res = a.convert(
3825 values,
3826 nan_rep=self.nan_rep,
3827 encoding=self.encoding,
3828 errors=self.errors,
3829 )
3830 results.append(res)
3831
3832 return results
3833
3834 @classmethod
3835 def get_object(cls, obj, transposed: bool):
3836 """return the data for this obj"""
3837 return obj
3838
3839 def validate_data_columns(self, data_columns, min_itemsize, non_index_axes):
3840 """
3841        take the input data_columns and min_itemsize and create a data
3842 columns spec
3843 """
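        # e.g. data_columns=True expands to every label on the non-index axis,
        # while dict-style min_itemsize keys such as {"A": 32} (illustrative)
        # are appended to the data-column list below.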
3844 if not len(non_index_axes):
3845 return []
3846
3847 axis, axis_labels = non_index_axes[0]
3848 info = self.info.get(axis, {})
3849 if info.get("type") == "MultiIndex" and data_columns:
3850 raise ValueError(
3851 f"cannot use a multi-index on axis [{axis}] with "
3852 f"data_columns {data_columns}"
3853 )
3854
3855 # evaluate the passed data_columns, True == use all columns
3856 # take only valid axis labels
3857 if data_columns is True:
3858 data_columns = list(axis_labels)
3859 elif data_columns is None:
3860 data_columns = []
3861
3862 # if min_itemsize is a dict, add the keys (exclude 'values')
3863 if isinstance(min_itemsize, dict):
3864 existing_data_columns = set(data_columns)
3865 data_columns = list(data_columns) # ensure we do not modify
3866 data_columns.extend(
3867 [
3868 k
3869 for k in min_itemsize.keys()
3870 if k != "values" and k not in existing_data_columns
3871 ]
3872 )
3873
3874 # return valid columns in the order of our axis
3875 return [c for c in data_columns if c in axis_labels]
3876
3877 def _create_axes(
3878 self,
3879 axes,
3880 obj: DataFrame,
3881 validate: bool = True,
3882 nan_rep=None,
3883 data_columns=None,
3884 min_itemsize=None,
3885 ):
3886 """
3887 Create and return the axes.
3888
3889 Parameters
3890 ----------
3891        axes : list or None
3892 The names or numbers of the axes to create.
3893 obj : DataFrame
3894 The object to create axes on.
3895 validate: bool, default True
3896 Whether to validate the obj against an existing object already written.
3897 nan_rep :
3898 A value to use for string column nan_rep.
3899 data_columns : List[str], True, or None, default None
3900 Specify the columns that we want to create to allow indexing on.
3901
3902 * True : Use all available columns.
3903 * None : Use no columns.
3904 * List[str] : Use the specified columns.
3905
3906 min_itemsize: Dict[str, int] or None, default None
3907 The min itemsize for a column in bytes.
3908 """
3909 if not isinstance(obj, DataFrame):
3910 group = self.group._v_name
3911 raise TypeError(
3912 f"cannot properly create the storer for: [group->{group},"
3913 f"value->{type(obj)}]"
3914 )
3915
3916 # set the default axes if needed
3917 if axes is None:
3918 axes = [0]
3919
3920 # map axes to numbers
3921 axes = [obj._get_axis_number(a) for a in axes]
3922
3923 # do we have an existing table (if so, use its axes & data_columns)
3924 if self.infer_axes():
3925 table_exists = True
3926 axes = [a.axis for a in self.index_axes]
3927 data_columns = list(self.data_columns)
3928 nan_rep = self.nan_rep
3929 # TODO: do we always have validate=True here?
3930 else:
3931 table_exists = False
3932
3933 new_info = self.info
3934
3935 assert self.ndim == 2 # with next check, we must have len(axes) == 1
        # we currently only support ndim-1 indexers
3937 if len(axes) != self.ndim - 1:
3938 raise ValueError(
3939 "currently only support ndim-1 indexers in an AppendableTable"
3940 )
3941
3942 # create according to the new data
3943 new_non_index_axes: list = []
3944
3945 # nan_representation
3946 if nan_rep is None:
3947 nan_rep = "nan"
3948
3949 # We construct the non-index-axis first, since that alters new_info
3950 idx = next(x for x in [0, 1] if x not in axes)
3951
3952 a = obj.axes[idx]
3953 # we might be able to change the axes on the appending data if necessary
3954 append_axis = list(a)
3955 if table_exists:
3956 indexer = len(new_non_index_axes) # i.e. 0
3957 exist_axis = self.non_index_axes[indexer][1]
3958 if not array_equivalent(
3959 np.array(append_axis),
3960 np.array(exist_axis),
3961 strict_nan=True,
3962 dtype_equal=True,
3963 ):
                # same labels in a different order -> adopt the existing ordering
3965 if array_equivalent(
3966 np.array(sorted(append_axis)),
3967 np.array(sorted(exist_axis)),
3968 strict_nan=True,
3969 dtype_equal=True,
3970 ):
3971 append_axis = exist_axis
3972
3973 # the non_index_axes info
3974 info = new_info.setdefault(idx, {})
3975 info["names"] = list(a.names)
3976 info["type"] = type(a).__name__
3977
3978 new_non_index_axes.append((idx, append_axis))
3979
3980 # Now we can construct our new index axis
3981 idx = axes[0]
3982 a = obj.axes[idx]
3983 axis_name = obj._get_axis_name(idx)
3984 new_index = _convert_index(axis_name, a, self.encoding, self.errors)
3985 new_index.axis = idx
3986
3987 # Because we are always 2D, there is only one new_index, so
3988 # we know it will have pos=0
3989 new_index.set_pos(0)
3990 new_index.update_info(new_info)
3991 new_index.maybe_set_size(min_itemsize) # check for column conflicts
3992
3993 new_index_axes = [new_index]
3994 j = len(new_index_axes) # i.e. 1
3995 assert j == 1
3996
3997 # reindex by our non_index_axes & compute data_columns
3998 assert len(new_non_index_axes) == 1
3999 for a in new_non_index_axes:
4000 obj = _reindex_axis(obj, a[0], a[1])
4001
4002 transposed = new_index.axis == 1
4003
4004 # figure out data_columns and get out blocks
4005 data_columns = self.validate_data_columns(
4006 data_columns, min_itemsize, new_non_index_axes
4007 )
4008
4009 frame = self.get_object(obj, transposed)._consolidate()
4010
4011 blocks, blk_items = self._get_blocks_and_items(
4012 frame, table_exists, new_non_index_axes, self.values_axes, data_columns
4013 )
4014
4015 # add my values
4016 vaxes = []
4017 for i, (blk, b_items) in enumerate(zip(blocks, blk_items)):
            # the shape of the data column is given by the indexable axes
4019 klass = DataCol
4020 name = None
4021
4022 # we have a data_column
4023 if data_columns and len(b_items) == 1 and b_items[0] in data_columns:
4024 klass = DataIndexableCol
4025 name = b_items[0]
4026 if not (name is None or isinstance(name, str)):
4027 # TODO: should the message here be more specifically non-str?
4028 raise ValueError("cannot have non-object label DataIndexableCol")
4029
4030 # make sure that we match up the existing columns
4031 # if we have an existing table
4032 existing_col: DataCol | None
4033
4034 if table_exists and validate:
4035 try:
4036 existing_col = self.values_axes[i]
4037 except (IndexError, KeyError) as err:
4038 raise ValueError(
4039 f"Incompatible appended table [{blocks}]"
4040 f"with existing table [{self.values_axes}]"
4041 ) from err
4042 else:
4043 existing_col = None
4044
4045 new_name = name or f"values_block_{i}"
4046 data_converted = _maybe_convert_for_string_atom(
4047 new_name,
4048 blk.values,
4049 existing_col=existing_col,
4050 min_itemsize=min_itemsize,
4051 nan_rep=nan_rep,
4052 encoding=self.encoding,
4053 errors=self.errors,
4054 columns=b_items,
4055 )
4056 adj_name = _maybe_adjust_name(new_name, self.version)
4057
4058 typ = klass._get_atom(data_converted)
4059 kind = _dtype_to_kind(data_converted.dtype.name)
4060 tz = None
4061 if getattr(data_converted, "tz", None) is not None:
4062 tz = _get_tz(data_converted.tz)
4063
4064 meta = metadata = ordered = None
4065 if isinstance(data_converted.dtype, CategoricalDtype):
4066 ordered = data_converted.ordered
4067 meta = "category"
4068 metadata = np.asarray(data_converted.categories).ravel()
4069
4070 data, dtype_name = _get_data_and_dtype_name(data_converted)
4071
4072 col = klass(
4073 name=adj_name,
4074 cname=new_name,
4075 values=list(b_items),
4076 typ=typ,
4077 pos=j,
4078 kind=kind,
4079 tz=tz,
4080 ordered=ordered,
4081 meta=meta,
4082 metadata=metadata,
4083 dtype=dtype_name,
4084 data=data,
4085 )
4086 col.update_info(new_info)
4087
4088 vaxes.append(col)
4089
4090 j += 1
4091
4092 dcs = [col.name for col in vaxes if col.is_data_indexable]
4093
4094 new_table = type(self)(
4095 parent=self.parent,
4096 group=self.group,
4097 encoding=self.encoding,
4098 errors=self.errors,
4099 index_axes=new_index_axes,
4100 non_index_axes=new_non_index_axes,
4101 values_axes=vaxes,
4102 data_columns=dcs,
4103 info=new_info,
4104 nan_rep=nan_rep,
4105 )
4106 if hasattr(self, "levels"):
4107 # TODO: get this into constructor, only for appropriate subclass
4108 new_table.levels = self.levels
4109
4110 new_table.validate_min_itemsize(min_itemsize)
4111
4112 if validate and table_exists:
4113 new_table.validate(self)
4114
4115 return new_table
4116
4117 @staticmethod
4118 def _get_blocks_and_items(
4119 frame: DataFrame,
4120 table_exists: bool,
4121 new_non_index_axes,
4122 values_axes,
4123 data_columns,
4124 ):
4125 # Helper to clarify non-state-altering parts of _create_axes
4126
4127 # TODO(ArrayManager) HDFStore relies on accessing the blocks
4128 if isinstance(frame._mgr, ArrayManager):
4129 frame = frame._as_manager("block")
4130
4131 def get_blk_items(mgr):
4132 return [mgr.items.take(blk.mgr_locs) for blk in mgr.blocks]
4133
4134 mgr = frame._mgr
4135 mgr = cast(BlockManager, mgr)
4136 blocks: list[Block] = list(mgr.blocks)
4137 blk_items: list[Index] = get_blk_items(mgr)
4138
4139 if len(data_columns):
4140 # TODO: prove that we only get here with axis == 1?
4141 # It is the case in all extant tests, but NOT the case
4142 # outside this `if len(data_columns)` check.
4143
4144 axis, axis_labels = new_non_index_axes[0]
4145 new_labels = Index(axis_labels).difference(Index(data_columns))
4146 mgr = frame.reindex(new_labels, axis=axis)._mgr
4147 mgr = cast(BlockManager, mgr)
4148
4149 blocks = list(mgr.blocks)
4150 blk_items = get_blk_items(mgr)
4151 for c in data_columns:
4152 # This reindex would raise ValueError if we had a duplicate
4153 # index, so we can infer that (as long as axis==1) we
4154 # get a single column back, so a single block.
4155 mgr = frame.reindex([c], axis=axis)._mgr
4156 mgr = cast(BlockManager, mgr)
4157 blocks.extend(mgr.blocks)
4158 blk_items.extend(get_blk_items(mgr))
4159
4160 # reorder the blocks in the same order as the existing table if we can
4161 if table_exists:
4162 by_items = {
4163 tuple(b_items.tolist()): (b, b_items)
4164 for b, b_items in zip(blocks, blk_items)
4165 }
4166 new_blocks: list[Block] = []
4167 new_blk_items = []
4168 for ea in values_axes:
4169 items = tuple(ea.values)
4170 try:
4171 b, b_items = by_items.pop(items)
4172 new_blocks.append(b)
4173 new_blk_items.append(b_items)
4174 except (IndexError, KeyError) as err:
4175 jitems = ",".join([pprint_thing(item) for item in items])
4176 raise ValueError(
4177 f"cannot match existing table structure for [{jitems}] "
4178 "on appending data"
4179 ) from err
4180 blocks = new_blocks
4181 blk_items = new_blk_items
4182
4183 return blocks, blk_items
4184
4185 def process_axes(self, obj, selection: Selection, columns=None) -> DataFrame:
4186 """process axes filters"""
4187 # make a copy to avoid side effects
4188 if columns is not None:
4189 columns = list(columns)
4190
4191 # make sure to include levels if we have them
4192 if columns is not None and self.is_multi_index:
4193 assert isinstance(self.levels, list) # assured by is_multi_index
4194 for n in self.levels:
4195 if n not in columns:
4196 columns.insert(0, n)
4197
4198 # reorder by any non_index_axes & limit to the select columns
4199 for axis, labels in self.non_index_axes:
4200 obj = _reindex_axis(obj, axis, labels, columns)
4201
4202 def process_filter(field, filt, op):
4203 for axis_name in obj._AXIS_ORDERS:
4204 axis_number = obj._get_axis_number(axis_name)
4205 axis_values = obj._get_axis(axis_name)
4206 assert axis_number is not None
4207
4208 # see if the field is the name of an axis
4209 if field == axis_name:
                    # if we have a multi-index, then we need to include
                    # the levels
4212 if self.is_multi_index:
4213 filt = filt.union(Index(self.levels))
4214
4215 takers = op(axis_values, filt)
4216 return obj.loc(axis=axis_number)[takers]
4217
                # this might be the name of a field IN an axis
4219 elif field in axis_values:
4220 # we need to filter on this dimension
4221 values = ensure_index(getattr(obj, field).values)
4222 filt = ensure_index(filt)
4223
4224 # hack until we support reversed dim flags
4225 if isinstance(obj, DataFrame):
4226 axis_number = 1 - axis_number
4227
4228 takers = op(values, filt)
4229 return obj.loc(axis=axis_number)[takers]
4230
4231 raise ValueError(f"cannot find the field [{field}] for filtering!")
4232
4233 # apply the selection filters (but keep in the same order)
4234 if selection.filter is not None:
4235 for field, op, filt in selection.filter.format():
4236 obj = process_filter(field, filt, op)
4237
4238 return obj
4239
4240 def create_description(
4241 self,
4242 complib,
4243 complevel: int | None,
4244 fletcher32: bool,
4245 expectedrows: int | None,
4246 ) -> dict[str, Any]:
4247 """create the description of the table from the axes & values"""
        # use the provided expectedrows if it is passed
4249 if expectedrows is None:
4250 expectedrows = max(self.nrows_expected, 10000)
4251
4252 d = {"name": "table", "expectedrows": expectedrows}
4253
4254 # description from the axes & values
4255 d["description"] = {a.cname: a.typ for a in self.axes}
4256
4257 if complib:
4258 if complevel is None:
4259 complevel = self._complevel or 9
4260 filters = _tables().Filters(
4261 complevel=complevel,
4262 complib=complib,
4263 fletcher32=fletcher32 or self._fletcher32,
4264 )
4265 d["filters"] = filters
4266 elif self._filters is not None:
4267 d["filters"] = self._filters
4268
4269 return d
4270
4271 def read_coordinates(
4272 self, where=None, start: int | None = None, stop: int | None = None
4273 ):
4274 """
4275 select coordinates (row numbers) from a table; return the
4276 coordinates object
4277 """
4278 # validate the version
4279 self.validate_version(where)
4280
4281 # infer the data kind
4282 if not self.infer_axes():
4283 return False
4284
4285 # create the selection
4286 selection = Selection(self, where=where, start=start, stop=stop)
4287 coords = selection.select_coords()
4288 if selection.filter is not None:
4289 for field, op, filt in selection.filter.format():
4290 data = self.read_column(
4291 field, start=coords.min(), stop=coords.max() + 1
4292 )
4293 coords = coords[op(data.iloc[coords - coords.min()], filt).values]
4294
4295 return Index(coords)
4296
4297 def read_column(
4298 self,
4299 column: str,
4300 where=None,
4301 start: int | None = None,
4302 stop: int | None = None,
4303 ):
4304 """
        return a single column from the table; generally only indexables
        or data columns are interesting
4307 """
4308 # validate the version
4309 self.validate_version()
4310
4311 # infer the data kind
4312 if not self.infer_axes():
4313 return False
4314
4315 if where is not None:
4316 raise TypeError("read_column does not currently accept a where clause")
4317
4318 # find the axes
4319 for a in self.axes:
4320 if column == a.name:
4321 if not a.is_data_indexable:
4322 raise ValueError(
4323 f"column [{column}] can not be extracted individually; "
4324 "it is not data indexable"
4325 )
4326
4327 # column must be an indexable or a data column
4328 c = getattr(self.table.cols, column)
4329 a.set_info(self.info)
4330 col_values = a.convert(
4331 c[start:stop],
4332 nan_rep=self.nan_rep,
4333 encoding=self.encoding,
4334 errors=self.errors,
4335 )
4336 return Series(_set_tz(col_values[1], a.tz), name=column, copy=False)
4337
4338 raise KeyError(f"column [{column}] not found in the table")
4339
4340
4341class WORMTable(Table):
4342 """
    a write-once read-many table: this format DOES NOT ALLOW appending to a
    table. Writing is a one-time operation; the data are stored in a format
    that allows for searching the data on disk
4346 """
4347
4348 table_type = "worm"
4349
4350 def read(
4351 self,
4352 where=None,
4353 columns=None,
4354 start: int | None = None,
4355 stop: int | None = None,
4356 ):
4357 """
4358 read the indices and the indexing array, calculate offset rows and return
4359 """
4360 raise NotImplementedError("WORMTable needs to implement read")
4361
4362 def write(self, obj, **kwargs) -> None:
4363 """
        write in a format that we can search later on (but cannot append
        to): write out the indices and the values using _write_array
        (e.g. a CArray), and create an indexing table so that we can search
4367 """
4368 raise NotImplementedError("WORMTable needs to implement write")
4369
4370
4371class AppendableTable(Table):
4372 """support the new appendable table formats"""
4373
4374 table_type = "appendable"
4375
4376 # error: Signature of "write" incompatible with supertype "Fixed"
4377 def write( # type: ignore[override]
4378 self,
4379 obj,
4380 axes=None,
4381 append: bool = False,
4382 complib=None,
4383 complevel=None,
4384 fletcher32=None,
4385 min_itemsize=None,
4386 chunksize: int | None = None,
4387 expectedrows=None,
4388 dropna: bool = False,
4389 nan_rep=None,
4390 data_columns=None,
4391 track_times: bool = True,
4392 ) -> None:
4393 if not append and self.is_exists:
4394 self._handle.remove_node(self.group, "table")
4395
4396 # create the axes
4397 table = self._create_axes(
4398 axes=axes,
4399 obj=obj,
4400 validate=append,
4401 min_itemsize=min_itemsize,
4402 nan_rep=nan_rep,
4403 data_columns=data_columns,
4404 )
4405
4406 for a in table.axes:
4407 a.validate_names()
4408
4409 if not table.is_exists:
4410 # create the table
4411 options = table.create_description(
4412 complib=complib,
4413 complevel=complevel,
4414 fletcher32=fletcher32,
4415 expectedrows=expectedrows,
4416 )
4417
4418 # set the table attributes
4419 table.set_attrs()
4420
4421 options["track_times"] = track_times
4422
4423 # create the table
4424 table._handle.create_table(table.group, **options)
4425
4426 # update my info
4427 table.attrs.info = table.info
4428
4429 # validate the axes and set the kinds
4430 for a in table.axes:
4431 a.validate_and_set(table, append)
4432
4433 # add the rows
4434 table.write_data(chunksize, dropna=dropna)
4435
4436 def write_data(self, chunksize: int | None, dropna: bool = False) -> None:
4437 """
        we form the data into a 2-d array including indexes, values and mask, and write it chunk-by-chunk
4439 """
4440 names = self.dtype.names
4441 nrows = self.nrows_expected
4442
        # if dropna==True, then drop rows where ALL values are nan
4444 masks = []
4445 if dropna:
4446 for a in self.values_axes:
4447 # figure the mask: only do if we can successfully process this
4448 # column, otherwise ignore the mask
4449 mask = isna(a.data).all(axis=0)
4450 if isinstance(mask, np.ndarray):
4451 masks.append(mask.astype("u1", copy=False))
4452
4453 # consolidate masks
4454 if len(masks):
4455 mask = masks[0]
4456 for m in masks[1:]:
4457 mask = mask & m
4458 mask = mask.ravel()
4459 else:
4460 mask = None
4461
4462 # broadcast the indexes if needed
4463 indexes = [a.cvalues for a in self.index_axes]
4464 nindexes = len(indexes)
        assert nindexes == 1, nindexes  # ensures we don't need to broadcast
4466
4467 # transpose the values so first dimension is last
4468 # reshape the values if needed
4469 values = [a.take_data() for a in self.values_axes]
4470 values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) for v in values]
4471 bvalues = []
4472 for i, v in enumerate(values):
4473 new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape
4474 bvalues.append(v.reshape(new_shape))
4475
4476 # write the chunks
4477 if chunksize is None:
4478 chunksize = 100000
4479
4480 rows = np.empty(min(chunksize, nrows), dtype=self.dtype)
4481 chunks = nrows // chunksize + 1
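        # e.g. nrows=250_000 with chunksize=100_000 writes the row slices
        # [0:100000), [100000:200000), [200000:250000) (illustrative)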
4482 for i in range(chunks):
4483 start_i = i * chunksize
4484 end_i = min((i + 1) * chunksize, nrows)
4485 if start_i >= end_i:
4486 break
4487
4488 self.write_data_chunk(
4489 rows,
4490 indexes=[a[start_i:end_i] for a in indexes],
4491 mask=mask[start_i:end_i] if mask is not None else None,
4492 values=[v[start_i:end_i] for v in bvalues],
4493 )
4494
4495 def write_data_chunk(
4496 self,
4497 rows: np.ndarray,
4498 indexes: list[np.ndarray],
4499 mask: npt.NDArray[np.bool_] | None,
4500 values: list[np.ndarray],
4501 ) -> None:
4502 """
4503 Parameters
4504 ----------
        rows : np.ndarray
            An empty record array used as scratch space for the chunk.
        indexes : list of np.ndarray
            The index values for the chunk.
        mask : np.ndarray or None
            Marks rows to drop (rows where all values are NaN when dropna=True).
        values : list of np.ndarray
            The value arrays for the chunk.
4509 """
4510 # 0 len
4511 for v in values:
4512 if not np.prod(v.shape):
4513 return
4514
4515 nrows = indexes[0].shape[0]
4516 if nrows != len(rows):
4517 rows = np.empty(nrows, dtype=self.dtype)
4518 names = self.dtype.names
4519 nindexes = len(indexes)
4520
4521 # indexes
4522 for i, idx in enumerate(indexes):
4523 rows[names[i]] = idx
4524
4525 # values
4526 for i, v in enumerate(values):
4527 rows[names[i + nindexes]] = v
4528
4529 # mask
4530 if mask is not None:
4531 m = ~mask.ravel().astype(bool, copy=False)
4532 if not m.all():
4533 rows = rows[m]
4534
4535 if len(rows):
4536 self.table.append(rows)
4537 self.table.flush()
4538
4539 def delete(self, where=None, start: int | None = None, stop: int | None = None):
4540 # delete all rows (and return the nrows)
4541 if where is None or not len(where):
4542 if start is None and stop is None:
4543 nrows = self.nrows
4544 self._handle.remove_node(self.group, recursive=True)
4545 else:
4546 # pytables<3.0 would remove a single row with stop=None
4547 if stop is None:
4548 stop = self.nrows
4549 nrows = self.table.remove_rows(start=start, stop=stop)
4550 self.table.flush()
4551 return nrows
4552
4553 # infer the data kind
4554 if not self.infer_axes():
4555 return None
4556
4557 # create the selection
4558 table = self.table
4559 selection = Selection(self, where, start=start, stop=stop)
4560 values = selection.select_coords()
4561
4562 # delete the rows in reverse order
4563 sorted_series = Series(values, copy=False).sort_values()
4564 ln = len(sorted_series)
4565
4566 if ln:
4567 # construct groups of consecutive rows
4568 diff = sorted_series.diff()
4569 groups = list(diff[diff > 1].index)
4570
4571 # 1 group
4572 if not len(groups):
4573 groups = [0]
4574
4575 # final element
4576 if groups[-1] != ln:
4577 groups.append(ln)
4578
4579 # initial element
4580 if groups[0] != 0:
4581 groups.insert(0, 0)
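            # e.g. sorted coordinates [2, 3, 4, 8, 9] -> groups == [0, 3, 5];
            # rows 8..9 are removed first, then rows 2..4 (illustrative)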
4582
4583 # we must remove in reverse order!
4584 pg = groups.pop()
4585 for g in reversed(groups):
4586 rows = sorted_series.take(range(g, pg))
4587 table.remove_rows(
4588 start=rows[rows.index[0]], stop=rows[rows.index[-1]] + 1
4589 )
4590 pg = g
4591
4592 self.table.flush()
4593
4594 # return the number of rows removed
4595 return ln
4596
4597
4598class AppendableFrameTable(AppendableTable):
4599 """support the new appendable table formats"""
4600
4601 pandas_kind = "frame_table"
4602 table_type = "appendable_frame"
4603 ndim = 2
4604 obj_type: type[DataFrame | Series] = DataFrame
4605
4606 @property
4607 def is_transposed(self) -> bool:
4608 return self.index_axes[0].axis == 1
4609
4610 @classmethod
4611 def get_object(cls, obj, transposed: bool):
4612 """these are written transposed"""
4613 if transposed:
4614 obj = obj.T
4615 return obj
4616
4617 def read(
4618 self,
4619 where=None,
4620 columns=None,
4621 start: int | None = None,
4622 stop: int | None = None,
4623 ):
4624 # validate the version
4625 self.validate_version(where)
4626
4627 # infer the data kind
4628 if not self.infer_axes():
4629 return None
4630
4631 result = self._read_axes(where=where, start=start, stop=stop)
4632
4633 info = (
4634 self.info.get(self.non_index_axes[0][0], {})
4635 if len(self.non_index_axes)
4636 else {}
4637 )
4638
4639 inds = [i for i, ax in enumerate(self.axes) if ax is self.index_axes[0]]
4640 assert len(inds) == 1
4641 ind = inds[0]
4642
4643 index = result[ind][0]
4644
4645 frames = []
4646 for i, a in enumerate(self.axes):
4647 if a not in self.values_axes:
4648 continue
4649 index_vals, cvalues = result[i]
4650
4651 # we could have a multi-index constructor here
            # ensure_index doesn't recognize our list-of-tuples here
4653 if info.get("type") != "MultiIndex":
4654 cols = Index(index_vals)
4655 else:
4656 cols = MultiIndex.from_tuples(index_vals)
4657
4658 names = info.get("names")
4659 if names is not None:
4660 cols.set_names(names, inplace=True)
4661
4662 if self.is_transposed:
4663 values = cvalues
4664 index_ = cols
4665 cols_ = Index(index, name=getattr(index, "name", None))
4666 else:
4667 values = cvalues.T
4668 index_ = Index(index, name=getattr(index, "name", None))
4669 cols_ = cols
4670
4671 # if we have a DataIndexableCol, its shape will only be 1 dim
4672 if values.ndim == 1 and isinstance(values, np.ndarray):
4673 values = values.reshape((1, values.shape[0]))
4674
4675 if isinstance(values, np.ndarray):
4676 df = DataFrame(values.T, columns=cols_, index=index_, copy=False)
4677 elif isinstance(values, Index):
4678 df = DataFrame(values, columns=cols_, index=index_)
4679 else:
4680 # Categorical
4681 df = DataFrame._from_arrays([values], columns=cols_, index=index_)
4682 if not (using_pyarrow_string_dtype() and values.dtype.kind == "O"):
4683 assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
4684 if using_pyarrow_string_dtype() and is_string_array(
4685 values, # type: ignore[arg-type]
4686 skipna=True,
4687 ):
4688 df = df.astype("string[pyarrow_numpy]")
4689 frames.append(df)
4690
4691 if len(frames) == 1:
4692 df = frames[0]
4693 else:
4694 df = concat(frames, axis=1)
4695
4696 selection = Selection(self, where=where, start=start, stop=stop)
4697 # apply the selection filters & axis orderings
4698 df = self.process_axes(df, selection=selection, columns=columns)
4699 return df
4700
4701
4702class AppendableSeriesTable(AppendableFrameTable):
4703 """support the new appendable table formats"""
4704
4705 pandas_kind = "series_table"
4706 table_type = "appendable_series"
4707 ndim = 2
4708 obj_type = Series
4709
4710 @property
4711 def is_transposed(self) -> bool:
4712 return False
4713
4714 @classmethod
4715 def get_object(cls, obj, transposed: bool):
4716 return obj
4717
4718 # error: Signature of "write" incompatible with supertype "Fixed"
4719 def write(self, obj, data_columns=None, **kwargs) -> None: # type: ignore[override]
4720 """we are going to write this as a frame table"""
4721 if not isinstance(obj, DataFrame):
4722 name = obj.name or "values"
4723 obj = obj.to_frame(name)
4724 super().write(obj=obj, data_columns=obj.columns.tolist(), **kwargs)
4725
4726 def read(
4727 self,
4728 where=None,
4729 columns=None,
4730 start: int | None = None,
4731 stop: int | None = None,
4732 ) -> Series:
4733 is_multi_index = self.is_multi_index
4734 if columns is not None and is_multi_index:
4735 assert isinstance(self.levels, list) # needed for mypy
4736 for n in self.levels:
4737 if n not in columns:
4738 columns.insert(0, n)
4739 s = super().read(where=where, columns=columns, start=start, stop=stop)
4740 if is_multi_index:
4741 s.set_index(self.levels, inplace=True)
4742
4743 s = s.iloc[:, 0]
4744
4745 # remove the default name
4746 if s.name == "values":
4747 s.name = None
4748 return s
4749
4750
4751class AppendableMultiSeriesTable(AppendableSeriesTable):
4752 """support the new appendable table formats"""
4753
4754 pandas_kind = "series_table"
4755 table_type = "appendable_multiseries"
4756
4757 # error: Signature of "write" incompatible with supertype "Fixed"
4758 def write(self, obj, **kwargs) -> None: # type: ignore[override]
4759 """we are going to write this as a frame table"""
4760 name = obj.name or "values"
4761 newobj, self.levels = self.validate_multiindex(obj)
4762 assert isinstance(self.levels, list) # for mypy
4763 cols = list(self.levels)
4764 cols.append(name)
4765 newobj.columns = Index(cols)
4766 super().write(obj=newobj, **kwargs)
4767
4768
4769class GenericTable(AppendableFrameTable):
4770 """a table that read/writes the generic pytables table format"""
4771
4772 pandas_kind = "frame_table"
4773 table_type = "generic_table"
4774 ndim = 2
4775 obj_type = DataFrame
4776 levels: list[Hashable]
4777
4778 @property
4779 def pandas_type(self) -> str:
4780 return self.pandas_kind
4781
4782 @property
4783 def storable(self):
4784 return getattr(self.group, "table", None) or self.group
4785
4786 def get_attrs(self) -> None:
4787 """retrieve our attributes"""
4788 self.non_index_axes = []
4789 self.nan_rep = None
4790 self.levels = []
4791
4792 self.index_axes = [a for a in self.indexables if a.is_an_indexable]
4793 self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
4794 self.data_columns = [a.name for a in self.values_axes]
4795
4796 @cache_readonly
4797 def indexables(self):
4798 """create the indexables from the table description"""
4799 d = self.description
4800
4801 # TODO: can we get a typ for this? AFAICT it is the only place
4802 # where we aren't passing one
        # the index column is just a simple index
4804 md = self.read_metadata("index")
4805 meta = "category" if md is not None else None
4806 index_col = GenericIndexCol(
4807 name="index", axis=0, table=self.table, meta=meta, metadata=md
4808 )
4809
4810 _indexables: list[GenericIndexCol | GenericDataIndexableCol] = [index_col]
4811
4812 for i, n in enumerate(d._v_names):
4813 assert isinstance(n, str)
4814
4815 atom = getattr(d, n)
4816 md = self.read_metadata(n)
4817 meta = "category" if md is not None else None
4818 dc = GenericDataIndexableCol(
4819 name=n,
4820 pos=i,
4821 values=[n],
4822 typ=atom,
4823 table=self.table,
4824 meta=meta,
4825 metadata=md,
4826 )
4827 _indexables.append(dc)
4828
4829 return _indexables
4830
4831 # error: Signature of "write" incompatible with supertype "AppendableTable"
4832 def write(self, **kwargs) -> None: # type: ignore[override]
4833 raise NotImplementedError("cannot write on an generic table")
4834
4835
4836class AppendableMultiFrameTable(AppendableFrameTable):
4837 """a frame with a multi-index"""
4838
4839 table_type = "appendable_multiframe"
4840 obj_type = DataFrame
4841 ndim = 2
4842 _re_levels = re.compile(r"^level_\d+$")
4843
4844 @property
4845 def table_type_short(self) -> str:
4846 return "appendable_multi"
4847
4848 # error: Signature of "write" incompatible with supertype "Fixed"
4849 def write(self, obj, data_columns=None, **kwargs) -> None: # type: ignore[override]
4850 if data_columns is None:
4851 data_columns = []
4852 elif data_columns is True:
4853 data_columns = obj.columns.tolist()
4854 obj, self.levels = self.validate_multiindex(obj)
4855 assert isinstance(self.levels, list) # for mypy
4856 for n in self.levels:
4857 if n not in data_columns:
4858 data_columns.insert(0, n)
4859 super().write(obj=obj, data_columns=data_columns, **kwargs)
4860
4861 def read(
4862 self,
4863 where=None,
4864 columns=None,
4865 start: int | None = None,
4866 stop: int | None = None,
4867 ):
4868 df = super().read(where=where, columns=columns, start=start, stop=stop)
4869 df = df.set_index(self.levels)
4870
4871 # remove names for 'level_%d'
4872 df.index = df.index.set_names(
4873 [None if self._re_levels.search(name) else name for name in df.index.names]
4874 )
4875
4876 return df
4877
4878
4879def _reindex_axis(
4880 obj: DataFrame, axis: AxisInt, labels: Index, other=None
4881) -> DataFrame:
4882 ax = obj._get_axis(axis)
4883 labels = ensure_index(labels)
4884
4885 # try not to reindex even if other is provided
4886 # if it equals our current index
4887 if other is not None:
4888 other = ensure_index(other)
4889 if (other is None or labels.equals(other)) and labels.equals(ax):
4890 return obj
4891
4892 labels = ensure_index(labels.unique())
4893 if other is not None:
4894 labels = ensure_index(other.unique()).intersection(labels, sort=False)
4895 if not labels.equals(ax):
4896 slicer: list[slice | Index] = [slice(None, None)] * obj.ndim
4897 slicer[axis] = labels
4898 obj = obj.loc[tuple(slicer)]
4899 return obj
4900
4901
4902# tz to/from coercion
4903
4904
4905def _get_tz(tz: tzinfo) -> str | tzinfo:
4906 """for a tz-aware type, return an encoded zone"""
4907 zone = timezones.get_timezone(tz)
4908 return zone
4909
4910
4911@overload
4912def _set_tz(
4913 values: np.ndarray | Index, tz: str | tzinfo, coerce: bool = False
4914) -> DatetimeIndex:
4915 ...
4916
4917
4918@overload
4919def _set_tz(values: np.ndarray | Index, tz: None, coerce: bool = False) -> np.ndarray:
4920 ...
4921
4922
4923def _set_tz(
4924 values: np.ndarray | Index, tz: str | tzinfo | None, coerce: bool = False
4925) -> np.ndarray | DatetimeIndex:
4926 """
    coerce the values to a DatetimeIndex if tz is set;
    preserve the input shape if possible

    Parameters
    ----------
    values : ndarray or Index
    tz : str, tzinfo, or None
    coerce : bool, default False
        If no timezone is passed, coerce to an M8[ns] ndarray.
4935 """
4936 if isinstance(values, DatetimeIndex):
4937 # If values is tzaware, the tz gets dropped in the values.ravel()
4938 # call below (which returns an ndarray). So we are only non-lossy
4939 # if `tz` matches `values.tz`.
4940 assert values.tz is None or values.tz == tz
4941 if values.tz is not None:
4942 return values
4943
4944 if tz is not None:
4945 if isinstance(values, DatetimeIndex):
4946 name = values.name
4947 else:
4948 name = None
4949 values = values.ravel()
4950
4951 tz = _ensure_decoded(tz)
4952 values = DatetimeIndex(values, name=name)
4953 values = values.tz_localize("UTC").tz_convert(tz)
4954 elif coerce:
4955 values = np.asarray(values, dtype="M8[ns]")
4956
4957 # error: Incompatible return value type (got "Union[ndarray, Index]",
4958 # expected "Union[ndarray, DatetimeIndex]")
4959 return values # type: ignore[return-value]
4960
4961
4962def _convert_index(name: str, index: Index, encoding: str, errors: str) -> IndexCol:
4963 assert isinstance(name, str)
4964
4965 index_name = index.name
4966 # error: Argument 1 to "_get_data_and_dtype_name" has incompatible type "Index";
4967 # expected "Union[ExtensionArray, ndarray]"
4968 converted, dtype_name = _get_data_and_dtype_name(index) # type: ignore[arg-type]
4969 kind = _dtype_to_kind(dtype_name)
4970 atom = DataIndexableCol._get_atom(converted)
4971
4972 if (
4973 lib.is_np_dtype(index.dtype, "iu")
4974 or needs_i8_conversion(index.dtype)
4975 or is_bool_dtype(index.dtype)
4976 ):
4977 # Includes Index, RangeIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex,
4978 # in which case "kind" is "integer", "integer", "datetime64",
4979 # "timedelta64", and "integer", respectively.
4980 return IndexCol(
4981 name,
4982 values=converted,
4983 kind=kind,
4984 typ=atom,
4985 freq=getattr(index, "freq", None),
4986 tz=getattr(index, "tz", None),
4987 index_name=index_name,
4988 )
4989
4990 if isinstance(index, MultiIndex):
4991 raise TypeError("MultiIndex not supported here!")
4992
4993 inferred_type = lib.infer_dtype(index, skipna=False)
4994 # we won't get inferred_type of "datetime64" or "timedelta64" as these
4995 # would go through the DatetimeIndex/TimedeltaIndex paths above
4996
4997 values = np.asarray(index)
4998
4999 if inferred_type == "date":
5000 converted = np.asarray([v.toordinal() for v in values], dtype=np.int32)
5001 return IndexCol(
5002 name, converted, "date", _tables().Time32Col(), index_name=index_name
5003 )
5004 elif inferred_type == "string":
5005 converted = _convert_string_array(values, encoding, errors)
5006 itemsize = converted.dtype.itemsize
5007 return IndexCol(
5008 name,
5009 converted,
5010 "string",
5011 _tables().StringCol(itemsize),
5012 index_name=index_name,
5013 )
5014
5015 elif inferred_type in ["integer", "floating"]:
5016 return IndexCol(
5017 name, values=converted, kind=kind, typ=atom, index_name=index_name
5018 )
5019 else:
5020 assert isinstance(converted, np.ndarray) and converted.dtype == object
5021 assert kind == "object", kind
5022 atom = _tables().ObjectAtom()
5023 return IndexCol(name, converted, kind, atom, index_name=index_name)
5024
5025
5026def _unconvert_index(data, kind: str, encoding: str, errors: str) -> np.ndarray | Index:
5027 index: Index | np.ndarray
5028
5029 if kind.startswith("datetime64"):
5030 if kind == "datetime64":
5031 # created before we stored resolution information
5032 index = DatetimeIndex(data)
5033 else:
5034 index = DatetimeIndex(data.view(kind))
5035 elif kind == "timedelta64":
5036 index = TimedeltaIndex(data)
5037 elif kind == "date":
5038 try:
5039 index = np.asarray([date.fromordinal(v) for v in data], dtype=object)
5040 except ValueError:
5041 index = np.asarray([date.fromtimestamp(v) for v in data], dtype=object)
5042 elif kind in ("integer", "float", "bool"):
5043 index = np.asarray(data)
5044 elif kind in ("string"):
5045 index = _unconvert_string_array(
5046 data, nan_rep=None, encoding=encoding, errors=errors
5047 )
5048 elif kind == "object":
5049 index = np.asarray(data[0])
5050 else: # pragma: no cover
5051 raise ValueError(f"unrecognized index type {kind}")
5052 return index
5053
5054
5055def _maybe_convert_for_string_atom(
5056 name: str,
5057 bvalues: ArrayLike,
5058 existing_col,
5059 min_itemsize,
5060 nan_rep,
5061 encoding,
5062 errors,
5063 columns: list[str],
5064):
5065 if bvalues.dtype != object:
5066 return bvalues
5067
5068 bvalues = cast(np.ndarray, bvalues)
5069
5070 dtype_name = bvalues.dtype.name
5071 inferred_type = lib.infer_dtype(bvalues, skipna=False)
5072
5073 if inferred_type == "date":
5074 raise TypeError("[date] is not implemented as a table column")
5075 if inferred_type == "datetime":
5076 # after GH#8260
5077 # this only would be hit for a multi-timezone dtype which is an error
5078 raise TypeError(
5079 "too many timezones in this block, create separate data columns"
5080 )
5081
5082 if not (inferred_type == "string" or dtype_name == "object"):
5083 return bvalues
5084
5085 mask = isna(bvalues)
5086 data = bvalues.copy()
5087 data[mask] = nan_rep
5088
5089 # see if we have a valid string type
5090 inferred_type = lib.infer_dtype(data, skipna=False)
5091 if inferred_type != "string":
5092 # we cannot serialize this data, so report an exception on a column
5093 # by column basis
5094
5095 # expected behaviour:
5096 # search block for a non-string object column by column
5097 for i in range(data.shape[0]):
5098 col = data[i]
5099 inferred_type = lib.infer_dtype(col, skipna=False)
5100 if inferred_type != "string":
5101 error_column_label = columns[i] if len(columns) > i else f"No.{i}"
5102 raise TypeError(
5103 f"Cannot serialize the column [{error_column_label}]\n"
5104 f"because its data contents are not [string] but "
5105 f"[{inferred_type}] object dtype"
5106 )
5107
5108 # itemsize is the maximum length of a string (along any dimension)
5109
5110 data_converted = _convert_string_array(data, encoding, errors).reshape(data.shape)
5111 itemsize = data_converted.itemsize
5112
5113 # specified min_itemsize?
5114 if isinstance(min_itemsize, dict):
5115 min_itemsize = int(min_itemsize.get(name) or min_itemsize.get("values") or 0)
5116 itemsize = max(min_itemsize or 0, itemsize)
5117
5118 # check for column in the values conflicts
5119 if existing_col is not None:
5120 eci = existing_col.validate_col(itemsize)
5121 if eci is not None and eci > itemsize:
5122 itemsize = eci
5123
5124 data_converted = data_converted.astype(f"|S{itemsize}", copy=False)
5125 return data_converted
5126
5127
5128def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.ndarray:
5129 """
5130 Take a string-like that is object dtype and coerce to a fixed size string type.
5131
5132 Parameters
5133 ----------
5134 data : np.ndarray[object]
5135 encoding : str
5136 errors : str
5137 Handler for encoding errors.
5138
5139 Returns
5140 -------
5141 np.ndarray[fixed-length-string]
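
    Examples
    --------
    Illustrative only:

    >>> _convert_string_array(np.array(["ab", "c"], dtype=object), "UTF-8", "strict")
    array([b'ab', b'c'], dtype='|S2')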
5142 """
5143 # encode if needed
5144 if len(data):
5145 data = (
5146 Series(data.ravel(), copy=False)
5147 .str.encode(encoding, errors)
5148 ._values.reshape(data.shape)
5149 )
5150
5151 # create the sized dtype
5152 ensured = ensure_object(data.ravel())
5153 itemsize = max(1, libwriters.max_len_string_array(ensured))
5154
5155 data = np.asarray(data, dtype=f"S{itemsize}")
5156 return data
5157
5158
5159def _unconvert_string_array(
5160 data: np.ndarray, nan_rep, encoding: str, errors: str
5161) -> np.ndarray:
5162 """
5163 Inverse of _convert_string_array.
5164
5165 Parameters
5166 ----------
5167 data : np.ndarray[fixed-length-string]
5168 nan_rep : the storage repr of NaN
5169 encoding : str
5170 errors : str
5171 Handler for encoding errors.
5172
5173 Returns
5174 -------
5175 np.ndarray[object]
5176 Decoded data.
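
    Examples
    --------
    Illustrative only:

    >>> _unconvert_string_array(
    ...     np.array([b"ab", b"nan"], dtype="S3"), "nan", "UTF-8", "strict"
    ... )
    array(['ab', nan], dtype=object)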
5177 """
5178 shape = data.shape
5179 data = np.asarray(data.ravel(), dtype=object)
5180
5181 if len(data):
5182 itemsize = libwriters.max_len_string_array(ensure_object(data))
5183 dtype = f"U{itemsize}"
5184
5185 if isinstance(data[0], bytes):
5186 data = Series(data, copy=False).str.decode(encoding, errors=errors)._values
5187 else:
5188 data = data.astype(dtype, copy=False).astype(object, copy=False)
5189
5190 if nan_rep is None:
5191 nan_rep = "nan"
5192
5193 libwriters.string_array_replace_from_nan_rep(data, nan_rep)
5194 return data.reshape(shape)
5195
5196
5197def _maybe_convert(values: np.ndarray, val_kind: str, encoding: str, errors: str):
5198 assert isinstance(val_kind, str), type(val_kind)
5199 if _need_convert(val_kind):
5200 conv = _get_converter(val_kind, encoding, errors)
5201 values = conv(values)
5202 return values
5203
5204
5205def _get_converter(kind: str, encoding: str, errors: str):
5206 if kind == "datetime64":
5207 return lambda x: np.asarray(x, dtype="M8[ns]")
5208 elif "datetime64" in kind:
5209 return lambda x: np.asarray(x, dtype=kind)
5210 elif kind == "string":
5211 return lambda x: _unconvert_string_array(
5212 x, nan_rep=None, encoding=encoding, errors=errors
5213 )
5214 else: # pragma: no cover
5215 raise ValueError(f"invalid kind {kind}")
5216
5217
5218def _need_convert(kind: str) -> bool:
5219 if kind in ("datetime64", "string") or "datetime64" in kind:
5220 return True
5221 return False
5222
5223
5224def _maybe_adjust_name(name: str, version: Sequence[int]) -> str:
5225 """
    Prior to 0.10.1, we named values blocks like ``values_0`` rather than
    ``values_block_0``; adjust the given name if necessary.
5228
5229 Parameters
5230 ----------
5231 name : str
5232 version : Tuple[int, int, int]
5233
5234 Returns
5235 -------
5236 str
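
    Examples
    --------
    Illustrative only:

    >>> _maybe_adjust_name("values_block_0", (0, 10, 0))
    'values_0'
    >>> _maybe_adjust_name("values_block_0", (1, 2, 3))
    'values_block_0'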
5237 """
5238 if isinstance(version, str) or len(version) < 3:
5239 raise ValueError("Version is incorrect, expected sequence of 3 integers.")
5240
5241 if version[0] == 0 and version[1] <= 10 and version[2] == 0:
5242 m = re.search(r"values_block_(\d+)", name)
5243 if m:
5244 grp = m.groups()[0]
5245 name = f"values_{grp}"
5246 return name
5247
5248
5249def _dtype_to_kind(dtype_str: str) -> str:
5250 """
5251 Find the "kind" string describing the given dtype name.
5252 """
5253 dtype_str = _ensure_decoded(dtype_str)
5254
5255 if dtype_str.startswith(("string", "bytes")):
5256 kind = "string"
5257 elif dtype_str.startswith("float"):
5258 kind = "float"
5259 elif dtype_str.startswith("complex"):
5260 kind = "complex"
5261 elif dtype_str.startswith(("int", "uint")):
5262 kind = "integer"
5263 elif dtype_str.startswith("datetime64"):
5264 kind = dtype_str
5265 elif dtype_str.startswith("timedelta"):
5266 kind = "timedelta64"
5267 elif dtype_str.startswith("bool"):
5268 kind = "bool"
5269 elif dtype_str.startswith("category"):
5270 kind = "category"
5271 elif dtype_str.startswith("period"):
5272 # We store the `freq` attr so we can restore from integers
5273 kind = "integer"
5274 elif dtype_str == "object":
5275 kind = "object"
5276 else:
5277 raise ValueError(f"cannot interpret dtype of [{dtype_str}]")
5278
5279 return kind
5280
5281
5282def _get_data_and_dtype_name(data: ArrayLike):
5283 """
5284 Convert the passed data into a storable form and a dtype string.
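
    Examples
    --------
    Illustrative only:

    >>> data, name = _get_data_and_dtype_name(np.array([1, 2], dtype="int64"))
    >>> data
    array([1, 2])
    >>> name
    'int64'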
5285 """
5286 if isinstance(data, Categorical):
5287 data = data.codes
5288
5289 if isinstance(data.dtype, DatetimeTZDtype):
5290 # For datetime64tz we need to drop the TZ in tests TODO: why?
5291 dtype_name = f"datetime64[{data.dtype.unit}]"
5292 else:
5293 dtype_name = data.dtype.name
5294
5295 if data.dtype.kind in "mM":
5296 data = np.asarray(data.view("i8"))
5297 # TODO: we used to reshape for the dt64tz case, but no longer
5298 # doing that doesn't seem to break anything. why?
5299
5300 elif isinstance(data, PeriodIndex):
5301 data = data.asi8
5302
5303 data = np.asarray(data)
5304 return data, dtype_name
5305
5306
5307class Selection:
5308 """
5309 Carries out a selection operation on a tables.Table object.
5310
5311 Parameters
5312 ----------
5313 table : a Table object
5314 where : list of Terms (or convertible to)
5315 start, stop: indices to start and/or stop selection
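
    Notes
    -----
    Constructed internally by the Table readers/writers, e.g.
    ``Selection(table, where="index > 5", start=0, stop=100)`` (illustrative).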
5316
5317 """
5318
5319 def __init__(
5320 self,
5321 table: Table,
5322 where=None,
5323 start: int | None = None,
5324 stop: int | None = None,
5325 ) -> None:
5326 self.table = table
5327 self.where = where
5328 self.start = start
5329 self.stop = stop
5330 self.condition = None
5331 self.filter = None
5332 self.terms = None
5333 self.coordinates = None
5334
5335 if is_list_like(where):
            # see if we have a passed coordinate-like
5337 with suppress(ValueError):
5338 inferred = lib.infer_dtype(where, skipna=False)
5339 if inferred in ("integer", "boolean"):
5340 where = np.asarray(where)
5341 if where.dtype == np.bool_:
5342 start, stop = self.start, self.stop
5343 if start is None:
5344 start = 0
5345 if stop is None:
5346 stop = self.table.nrows
5347 self.coordinates = np.arange(start, stop)[where]
5348 elif issubclass(where.dtype.type, np.integer):
5349 if (self.start is not None and (where < self.start).any()) or (
5350 self.stop is not None and (where >= self.stop).any()
5351 ):
5352 raise ValueError(
5353 "where must have index locations >= start and < stop"
5354 )
5355 self.coordinates = where
5356
5357 if self.coordinates is None:
5358 self.terms = self.generate(where)
5359
5360 # create the numexpr & the filter
5361 if self.terms is not None:
5362 self.condition, self.filter = self.terms.evaluate()
5363
5364 def generate(self, where):
5365 """where can be a : dict,list,tuple,string"""
5366 if where is None:
5367 return None
5368
5369 q = self.table.queryables()
5370 try:
5371 return PyTablesExpr(where, queryables=q, encoding=self.table.encoding)
5372 except NameError as err:
5373 # raise a nice message, suggesting that the user should use
5374 # data_columns
5375 qkeys = ",".join(q.keys())
5376 msg = dedent(
5377 f"""\
5378 The passed where expression: {where}
5379 contains an invalid variable reference
5380 all of the variable references must be a reference to
5381 an axis (e.g. 'index' or 'columns'), or a data_column
5382 The currently defined references are: {qkeys}
5383 """
5384 )
5385 raise ValueError(msg) from err
5386
5387 def select(self):
5388 """
5389 generate the selection
5390 """
5391 if self.condition is not None:
5392 return self.table.table.read_where(
5393 self.condition.format(), start=self.start, stop=self.stop
5394 )
5395 elif self.coordinates is not None:
5396 return self.table.table.read_coordinates(self.coordinates)
5397 return self.table.table.read(start=self.start, stop=self.stop)
5398
5399 def select_coords(self):
5400 """
5401 generate the selection
5402 """
5403 start, stop = self.start, self.stop
5404 nrows = self.table.nrows
5405 if start is None:
5406 start = 0
5407 elif start < 0:
5408 start += nrows
5409 if stop is None:
5410 stop = nrows
5411 elif stop < 0:
5412 stop += nrows
5413
5414 if self.condition is not None:
5415 return self.table.table.get_where_list(
5416 self.condition.format(), start=start, stop=stop, sort=True
5417 )
5418 elif self.coordinates is not None:
5419 return self.coordinates
5420
5421 return np.arange(start, stop)