1"""
2High level interface to PyTables for reading and writing pandas data structures
3to disk
4"""
5from __future__ import annotations
6
7from contextlib import suppress
8import copy
9from datetime import (
10 date,
11 tzinfo,
12)
13import itertools
14import os
15import re
16from textwrap import dedent
17from types import TracebackType
18from typing import (
19 TYPE_CHECKING,
20 Any,
21 Callable,
22 Final,
23 Hashable,
24 Iterator,
25 Literal,
26 Sequence,
27 cast,
28 overload,
29)
30import warnings
31
32import numpy as np
33
34from pandas._config import (
35 config,
36 get_option,
37)
38
39from pandas._libs import (
40 lib,
41 writers as libwriters,
42)
43from pandas._libs.tslibs import timezones
44from pandas._typing import (
45 AnyArrayLike,
46 ArrayLike,
47 AxisInt,
48 DtypeArg,
49 FilePath,
50 Shape,
51 npt,
52)
53from pandas.compat._optional import import_optional_dependency
54from pandas.compat.pickle_compat import patch_pickle
55from pandas.errors import (
56 AttributeConflictWarning,
57 ClosedFileError,
58 IncompatibilityWarning,
59 PerformanceWarning,
60 PossibleDataLossError,
61)
62from pandas.util._decorators import cache_readonly
63from pandas.util._exceptions import find_stack_level
64
65from pandas.core.dtypes.common import (
66 ensure_object,
67 is_bool_dtype,
68 is_categorical_dtype,
69 is_complex_dtype,
70 is_datetime64_dtype,
71 is_datetime64tz_dtype,
72 is_extension_array_dtype,
73 is_integer_dtype,
74 is_list_like,
75 is_object_dtype,
76 is_string_dtype,
77 is_timedelta64_dtype,
78 needs_i8_conversion,
79)
80from pandas.core.dtypes.missing import array_equivalent
81
82from pandas import (
83 DataFrame,
84 DatetimeIndex,
85 Index,
86 MultiIndex,
87 PeriodIndex,
88 RangeIndex,
89 Series,
90 TimedeltaIndex,
91 concat,
92 isna,
93)
94from pandas.core.arrays import (
95 Categorical,
96 DatetimeArray,
97 PeriodArray,
98)
99import pandas.core.common as com
100from pandas.core.computation.pytables import (
101 PyTablesExpr,
102 maybe_expression,
103)
104from pandas.core.construction import extract_array
105from pandas.core.indexes.api import ensure_index
106from pandas.core.internals import (
107 ArrayManager,
108 BlockManager,
109)
110
111from pandas.io.common import stringify_path
112from pandas.io.formats.printing import (
113 adjoin,
114 pprint_thing,
115)
116
117if TYPE_CHECKING:
118 from tables import (
119 Col,
120 File,
121 Node,
122 )
123
124 from pandas.core.internals import Block
125
126
127# versioning attribute
128_version = "0.15.2"
129
130# encoding
131_default_encoding = "UTF-8"
132
133
134def _ensure_decoded(s):
135 """if we have bytes, decode them to unicode"""
136 if isinstance(s, np.bytes_):
137 s = s.decode("UTF-8")
138 return s
139
140
141def _ensure_encoding(encoding: str | None) -> str:
142 # set the encoding if we need
143 if encoding is None:
144 encoding = _default_encoding
145
146 return encoding
147
148
149def _ensure_str(name):
150 """
    Ensure that an index / column name is a str (python 3); otherwise it
    may be np.string dtype. Non-string dtypes are passed through unchanged.
153
154 https://github.com/pandas-dev/pandas/issues/13492
155 """
156 if isinstance(name, str):
157 name = str(name)
158 return name
159
160
161Term = PyTablesExpr
162
163
164def _ensure_term(where, scope_level: int):
165 """
166 Ensure that the where is a Term or a list of Term.
167
    This makes sure that we capture the scope of variables that are
    passed; the terms are created here with a frame_level=2 (we are 2 levels down).
170 """
171 # only consider list/tuple here as an ndarray is automatically a coordinate
172 # list
173 level = scope_level + 1
174 if isinstance(where, (list, tuple)):
175 where = [
176 Term(term, scope_level=level + 1) if maybe_expression(term) else term
177 for term in where
178 if term is not None
179 ]
180 elif maybe_expression(where):
181 where = Term(where, scope_level=level)
182 return where if where is None or len(where) else None
183
184
185incompatibility_doc: Final = """
186where criteria is being ignored as this version [%s] is too old (or
187not-defined), read the file in and write it out to a new file to upgrade (with
the copy method)
189"""
190
191attribute_conflict_doc: Final = """
192the [%s] attribute of the existing index is [%s] which conflicts with the new
193[%s], resetting the attribute to None
194"""
195
196performance_doc: Final = """
197your performance may suffer as PyTables will pickle object types that it cannot
198map directly to c-types [inferred_type->%s,key->%s] [items->%s]
199"""
200
201# formats
202_FORMAT_MAP = {"f": "fixed", "fixed": "fixed", "t": "table", "table": "table"}
203
204# axes map
205_AXES_MAP = {DataFrame: [0]}
206
207# register our configuration options
208dropna_doc: Final = """
209: boolean
210 drop ALL nan rows when appending to a table
211"""
212format_doc: Final = """
213: format
    default format for writing; if None, then
    put will default to 'fixed' and append will default to 'table'
216"""
217
218with config.config_prefix("io.hdf"):
219 config.register_option("dropna_table", False, dropna_doc, validator=config.is_bool)
220 config.register_option(
221 "default_format",
222 None,
223 format_doc,
224 validator=config.is_one_of_factory(["fixed", "table", None]),
225 )
226
227# oh the troubles to reduce import time
228_table_mod = None
229_table_file_open_policy_is_strict = False
230
231
232def _tables():
233 global _table_mod
234 global _table_file_open_policy_is_strict
235 if _table_mod is None:
236 import tables
237
238 _table_mod = tables
239
240 # set the file open policy
241 # return the file open policy; this changes as of pytables 3.1
242 # depending on the HDF5 version
243 with suppress(AttributeError):
244 _table_file_open_policy_is_strict = (
245 tables.file._FILE_OPEN_POLICY == "strict"
246 )
247
248 return _table_mod
249
250
251# interface to/from ###
252
253
254def to_hdf(
255 path_or_buf: FilePath | HDFStore,
256 key: str,
257 value: DataFrame | Series,
258 mode: str = "a",
259 complevel: int | None = None,
260 complib: str | None = None,
261 append: bool = False,
262 format: str | None = None,
263 index: bool = True,
264 min_itemsize: int | dict[str, int] | None = None,
265 nan_rep=None,
266 dropna: bool | None = None,
267 data_columns: Literal[True] | list[str] | None = None,
268 errors: str = "strict",
269 encoding: str = "UTF-8",
270) -> None:
271 """store this object, close it if we opened it"""
272 if append:
273 f = lambda store: store.append(
274 key,
275 value,
276 format=format,
277 index=index,
278 min_itemsize=min_itemsize,
279 nan_rep=nan_rep,
280 dropna=dropna,
281 data_columns=data_columns,
282 errors=errors,
283 encoding=encoding,
284 )
285 else:
286 # NB: dropna is not passed to `put`
287 f = lambda store: store.put(
288 key,
289 value,
290 format=format,
291 index=index,
292 min_itemsize=min_itemsize,
293 nan_rep=nan_rep,
294 data_columns=data_columns,
295 errors=errors,
296 encoding=encoding,
297 dropna=dropna,
298 )
299
300 path_or_buf = stringify_path(path_or_buf)
301 if isinstance(path_or_buf, str):
302 with HDFStore(
303 path_or_buf, mode=mode, complevel=complevel, complib=complib
304 ) as store:
305 f(store)
306 else:
307 f(path_or_buf)
308
309
310def read_hdf(
311 path_or_buf: FilePath | HDFStore,
312 key=None,
313 mode: str = "r",
314 errors: str = "strict",
315 where: str | list | None = None,
316 start: int | None = None,
317 stop: int | None = None,
318 columns: list[str] | None = None,
319 iterator: bool = False,
320 chunksize: int | None = None,
321 **kwargs,
322):
323 """
324 Read from the store, close it if we opened it.
325
326 Retrieve pandas object stored in file, optionally based on where
327 criteria.
328
329 .. warning::
330
331 Pandas uses PyTables for reading and writing HDF5 files, which allows
332 serializing object-dtype data with pickle when using the "fixed" format.
333 Loading pickled data received from untrusted sources can be unsafe.
334
335 See: https://docs.python.org/3/library/pickle.html for more.
336
337 Parameters
338 ----------
339 path_or_buf : str, path object, pandas.HDFStore
        Any valid string path is acceptable. Only the local file system is
        supported; remote URLs and file-like objects are not.
342
343 If you want to pass in a path object, pandas accepts any
344 ``os.PathLike``.
345
346 Alternatively, pandas accepts an open :class:`pandas.HDFStore` object.
347
348 key : object, optional
349 The group identifier in the store. Can be omitted if the HDF file
350 contains a single pandas object.
351 mode : {'r', 'r+', 'a'}, default 'r'
352 Mode to use when opening the file. Ignored if path_or_buf is a
353 :class:`pandas.HDFStore`. Default is 'r'.
354 errors : str, default 'strict'
355 Specifies how encoding and decoding errors are to be handled.
356 See the errors argument for :func:`open` for a full list
357 of options.
358 where : list, optional
359 A list of Term (or convertible) objects.
360 start : int, optional
361 Row number to start selection.
362 stop : int, optional
363 Row number to stop selection.
364 columns : list, optional
365 A list of columns names to return.
366 iterator : bool, optional
367 Return an iterator object.
368 chunksize : int, optional
369 Number of rows to include in an iteration when using an iterator.
370 **kwargs
371 Additional keyword arguments passed to HDFStore.
372
373 Returns
374 -------
375 object
376 The selected object. Return type depends on the object stored.
377
378 See Also
379 --------
380 DataFrame.to_hdf : Write a HDF file from a DataFrame.
381 HDFStore : Low-level access to HDF files.
382
383 Examples
384 --------
385 >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z']) # doctest: +SKIP
386 >>> df.to_hdf('./store.h5', 'data') # doctest: +SKIP
387 >>> reread = pd.read_hdf('./store.h5') # doctest: +SKIP
388 """
389 if mode not in ["r", "r+", "a"]:
390 raise ValueError(
391 f"mode {mode} is not allowed while performing a read. "
392 f"Allowed modes are r, r+ and a."
393 )
394 # grab the scope
395 if where is not None:
396 where = _ensure_term(where, scope_level=1)
397
398 if isinstance(path_or_buf, HDFStore):
399 if not path_or_buf.is_open:
400 raise OSError("The HDFStore must be open for reading.")
401
402 store = path_or_buf
403 auto_close = False
404 else:
405 path_or_buf = stringify_path(path_or_buf)
406 if not isinstance(path_or_buf, str):
407 raise NotImplementedError(
408 "Support for generic buffers has not been implemented."
409 )
410 try:
411 exists = os.path.exists(path_or_buf)
412
413 # if filepath is too long
414 except (TypeError, ValueError):
415 exists = False
416
417 if not exists:
418 raise FileNotFoundError(f"File {path_or_buf} does not exist")
419
420 store = HDFStore(path_or_buf, mode=mode, errors=errors, **kwargs)
421 # can't auto open/close if we are using an iterator
422 # so delegate to the iterator
423 auto_close = True
424
425 try:
426 if key is None:
427 groups = store.groups()
428 if len(groups) == 0:
429 raise ValueError(
430 "Dataset(s) incompatible with Pandas data types, "
431 "not table, or no datasets found in HDF5 file."
432 )
433 candidate_only_group = groups[0]
434
435 # For the HDF file to have only one dataset, all other groups
436 # should then be metadata groups for that candidate group. (This
437 # assumes that the groups() method enumerates parent groups
438 # before their children.)
439 for group_to_check in groups[1:]:
440 if not _is_metadata_of(group_to_check, candidate_only_group):
441 raise ValueError(
442 "key must be provided when HDF5 "
443 "file contains multiple datasets."
444 )
445 key = candidate_only_group._v_pathname
446 return store.select(
447 key,
448 where=where,
449 start=start,
450 stop=stop,
451 columns=columns,
452 iterator=iterator,
453 chunksize=chunksize,
454 auto_close=auto_close,
455 )
456 except (ValueError, TypeError, KeyError):
457 if not isinstance(path_or_buf, HDFStore):
458 # if there is an error, close the store if we opened it.
459 with suppress(AttributeError):
460 store.close()
461
462 raise
463
464
465def _is_metadata_of(group: Node, parent_group: Node) -> bool:
466 """Check if a given group is a metadata group for a given parent_group."""
467 if group._v_depth <= parent_group._v_depth:
468 return False
469
470 current = group
471 while current._v_depth > 1:
472 parent = current._v_parent
473 if parent == parent_group and current._v_name == "meta":
474 return True
475 current = current._v_parent
476 return False
477
478
479class HDFStore:
480 """
481 Dict-like IO interface for storing pandas objects in PyTables.
482
483 Either Fixed or Table format.
484
485 .. warning::
486
487 Pandas uses PyTables for reading and writing HDF5 files, which allows
488 serializing object-dtype data with pickle when using the "fixed" format.
489 Loading pickled data received from untrusted sources can be unsafe.
490
491 See: https://docs.python.org/3/library/pickle.html for more.
492
493 Parameters
494 ----------
495 path : str
496 File path to HDF5 file.
497 mode : {'a', 'w', 'r', 'r+'}, default 'a'
498
499 ``'r'``
500 Read-only; no data can be modified.
501 ``'w'``
502 Write; a new file is created (an existing file with the same
503 name would be deleted).
504 ``'a'``
505 Append; an existing file is opened for reading and writing,
506 and if the file does not exist it is created.
507 ``'r+'``
508 It is similar to ``'a'``, but the file must already exist.
509 complevel : int, 0-9, default None
510 Specifies a compression level for data.
511 A value of 0 or None disables compression.
512 complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
513 Specifies the compression library to be used.
514 As of v0.20.2 these additional compressors for Blosc are supported
515 (default if no compressor specified: 'blosc:blosclz'):
516 {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
517 'blosc:zlib', 'blosc:zstd'}.
518 Specifying a compression library which is not available issues
519 a ValueError.
520 fletcher32 : bool, default False
521 If applying compression use the fletcher32 checksum.
522 **kwargs
523 These parameters will be passed to the PyTables open_file method.
524
525 Examples
526 --------
527 >>> bar = pd.DataFrame(np.random.randn(10, 4))
528 >>> store = pd.HDFStore('test.h5')
529 >>> store['foo'] = bar # write to HDF5
530 >>> bar = store['foo'] # retrieve
531 >>> store.close()
532
533 **Create or load HDF5 file in-memory**
534
535 When passing the `driver` option to the PyTables open_file method through
536 **kwargs, the HDF5 file is loaded or created in-memory and will only be
537 written when closed:
538
539 >>> bar = pd.DataFrame(np.random.randn(10, 4))
540 >>> store = pd.HDFStore('test.h5', driver='H5FD_CORE')
541 >>> store['foo'] = bar
542 >>> store.close() # only now, data is written to disk
543 """
544
545 _handle: File | None
546 _mode: str
547
548 def __init__(
549 self,
550 path,
551 mode: str = "a",
552 complevel: int | None = None,
553 complib=None,
554 fletcher32: bool = False,
555 **kwargs,
556 ) -> None:
557 if "format" in kwargs:
558 raise ValueError("format is not a defined argument for HDFStore")
559
560 tables = import_optional_dependency("tables")
561
562 if complib is not None and complib not in tables.filters.all_complibs:
563 raise ValueError(
564 f"complib only supports {tables.filters.all_complibs} compression."
565 )
566
567 if complib is None and complevel is not None:
568 complib = tables.filters.default_complib
569
570 self._path = stringify_path(path)
571 if mode is None:
572 mode = "a"
573 self._mode = mode
574 self._handle = None
575 self._complevel = complevel if complevel else 0
576 self._complib = complib
577 self._fletcher32 = fletcher32
578 self._filters = None
579 self.open(mode=mode, **kwargs)
580
581 def __fspath__(self) -> str:
582 return self._path
583
584 @property
585 def root(self):
586 """return the root node"""
587 self._check_if_open()
588 assert self._handle is not None # for mypy
589 return self._handle.root
590
591 @property
592 def filename(self) -> str:
593 return self._path
594
595 def __getitem__(self, key: str):
596 return self.get(key)
597
598 def __setitem__(self, key: str, value) -> None:
599 self.put(key, value)
600
601 def __delitem__(self, key: str) -> None:
602 return self.remove(key)
603
604 def __getattr__(self, name: str):
605 """allow attribute access to get stores"""
606 try:
607 return self.get(name)
608 except (KeyError, ClosedFileError):
609 pass
610 raise AttributeError(
611 f"'{type(self).__name__}' object has no attribute '{name}'"
612 )
613
614 def __contains__(self, key: str) -> bool:
615 """
616 check for existence of this key
        can match the exact pathname or the pathname w/o the leading '/'
618 """
619 node = self.get_node(key)
620 if node is not None:
621 name = node._v_pathname
622 if key in (name, name[1:]):
623 return True
624 return False
625
626 def __len__(self) -> int:
627 return len(self.groups())
628
629 def __repr__(self) -> str:
630 pstr = pprint_thing(self._path)
631 return f"{type(self)}\nFile path: {pstr}\n"
632
633 def __enter__(self) -> HDFStore:
634 return self
635
636 def __exit__(
637 self,
638 exc_type: type[BaseException] | None,
639 exc_value: BaseException | None,
640 traceback: TracebackType | None,
641 ) -> None:
642 self.close()
643
644 def keys(self, include: str = "pandas") -> list[str]:
645 """
646 Return a list of keys corresponding to objects stored in HDFStore.
647
648 Parameters
649 ----------
651 include : str, default 'pandas'
            When include equals 'pandas' return pandas objects.
            When include equals 'native' return native HDF5 Table objects.
654
655 .. versionadded:: 1.1.0
656
657 Returns
658 -------
659 list
            List of ABSOLUTE path-names (i.e. they have the leading '/').
661
662 Raises
663 ------
        raises ValueError if include has an illegal value
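
        Examples
        --------
        A minimal, hypothetical sketch (the file name and key below are
        placeholders):

        >>> store = pd.HDFStore("store.h5")  # doctest: +SKIP
        >>> store.put("data", pd.DataFrame([[1, 2]]))  # doctest: +SKIP
        >>> store.keys()  # doctest: +SKIP
        ['/data']
        >>> store.close()  # doctest: +SKIP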
665 """
666 if include == "pandas":
667 return [n._v_pathname for n in self.groups()]
668
669 elif include == "native":
670 assert self._handle is not None # mypy
671 return [
672 n._v_pathname for n in self._handle.walk_nodes("/", classname="Table")
673 ]
674 raise ValueError(
675 f"`include` should be either 'pandas' or 'native' but is '{include}'"
676 )
677
678 def __iter__(self) -> Iterator[str]:
679 return iter(self.keys())
680
681 def items(self) -> Iterator[tuple[str, list]]:
682 """
683 iterate on key->group
684 """
685 for g in self.groups():
686 yield g._v_pathname, g
687
688 def open(self, mode: str = "a", **kwargs) -> None:
689 """
690 Open the file in the specified mode
691
692 Parameters
693 ----------
694 mode : {'a', 'w', 'r', 'r+'}, default 'a'
695 See HDFStore docstring or tables.open_file for info about modes
696 **kwargs
697 These parameters will be passed to the PyTables open_file method.
698 """
699 tables = _tables()
700
701 if self._mode != mode:
702 # if we are changing a write mode to read, ok
703 if self._mode in ["a", "w"] and mode in ["r", "r+"]:
704 pass
705 elif mode in ["w"]:
706 # this would truncate, raise here
707 if self.is_open:
708 raise PossibleDataLossError(
709 f"Re-opening the file [{self._path}] with mode [{self._mode}] "
710 "will delete the current file!"
711 )
712
713 self._mode = mode
714
715 # close and reopen the handle
716 if self.is_open:
717 self.close()
718
719 if self._complevel and self._complevel > 0:
720 self._filters = _tables().Filters(
721 self._complevel, self._complib, fletcher32=self._fletcher32
722 )
723
724 if _table_file_open_policy_is_strict and self.is_open:
725 msg = (
726 "Cannot open HDF5 file, which is already opened, "
727 "even in read-only mode."
728 )
729 raise ValueError(msg)
730
731 self._handle = tables.open_file(self._path, self._mode, **kwargs)
732
733 def close(self) -> None:
734 """
735 Close the PyTables file handle
736 """
737 if self._handle is not None:
738 self._handle.close()
739 self._handle = None
740
741 @property
742 def is_open(self) -> bool:
743 """
744 return a boolean indicating whether the file is open
745 """
746 if self._handle is None:
747 return False
748 return bool(self._handle.isopen)
749
750 def flush(self, fsync: bool = False) -> None:
751 """
752 Force all buffered modifications to be written to disk.
753
754 Parameters
755 ----------
756 fsync : bool (default False)
757 call ``os.fsync()`` on the file handle to force writing to disk.
758
759 Notes
760 -----
761 Without ``fsync=True``, flushing may not guarantee that the OS writes
762 to disk. With fsync, the operation will block until the OS claims the
763 file has been written; however, other caching layers may still
764 interfere.
765 """
766 if self._handle is not None:
767 self._handle.flush()
768 if fsync:
769 with suppress(OSError):
770 os.fsync(self._handle.fileno())
771
772 def get(self, key: str):
773 """
774 Retrieve pandas object stored in file.
775
776 Parameters
777 ----------
778 key : str
779
780 Returns
781 -------
782 object
783 Same type as object stored in file.
784 """
785 with patch_pickle():
786 # GH#31167 Without this patch, pickle doesn't know how to unpickle
787 # old DateOffset objects now that they are cdef classes.
788 group = self.get_node(key)
789 if group is None:
790 raise KeyError(f"No object named {key} in the file")
791 return self._read_group(group)
792
793 def select(
794 self,
795 key: str,
796 where=None,
797 start=None,
798 stop=None,
799 columns=None,
800 iterator: bool = False,
801 chunksize=None,
802 auto_close: bool = False,
803 ):
804 """
805 Retrieve pandas object stored in file, optionally based on where criteria.
806
807 .. warning::
808
809 Pandas uses PyTables for reading and writing HDF5 files, which allows
810 serializing object-dtype data with pickle when using the "fixed" format.
811 Loading pickled data received from untrusted sources can be unsafe.
812
813 See: https://docs.python.org/3/library/pickle.html for more.
814
815 Parameters
816 ----------
817 key : str
818 Object being retrieved from file.
819 where : list or None
820 List of Term (or convertible) objects, optional.
821 start : int or None
822 Row number to start selection.
823 stop : int, default None
824 Row number to stop selection.
825 columns : list or None
826 A list of columns that if not None, will limit the return columns.
827 iterator : bool or False
828 Returns an iterator.
829 chunksize : int or None
            Number of rows to include in iteration, return an iterator.
831 auto_close : bool or False
832 Should automatically close the store when finished.
833
834 Returns
835 -------
836 object
837 Retrieved object from file.
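
        Examples
        --------
        A minimal, hypothetical sketch; ``where`` selection only works on
        objects written in ``'table'`` format:

        >>> df = pd.DataFrame({"A": [1, 2, 3]})  # doctest: +SKIP
        >>> store = pd.HDFStore("store.h5")  # doctest: +SKIP
        >>> store.put("df", df, format="table")  # doctest: +SKIP
        >>> store.select("df", where="index > 1")  # doctest: +SKIP
        >>> store.close()  # doctest: +SKIP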
838 """
839 group = self.get_node(key)
840 if group is None:
841 raise KeyError(f"No object named {key} in the file")
842
843 # create the storer and axes
844 where = _ensure_term(where, scope_level=1)
845 s = self._create_storer(group)
846 s.infer_axes()
847
848 # function to call on iteration
849 def func(_start, _stop, _where):
850 return s.read(start=_start, stop=_stop, where=_where, columns=columns)
851
852 # create the iterator
853 it = TableIterator(
854 self,
855 s,
856 func,
857 where=where,
858 nrows=s.nrows,
859 start=start,
860 stop=stop,
861 iterator=iterator,
862 chunksize=chunksize,
863 auto_close=auto_close,
864 )
865
866 return it.get_result()
867
868 def select_as_coordinates(
869 self,
870 key: str,
871 where=None,
872 start: int | None = None,
873 stop: int | None = None,
874 ):
875 """
876 return the selection as an Index
877
878 .. warning::
879
880 Pandas uses PyTables for reading and writing HDF5 files, which allows
881 serializing object-dtype data with pickle when using the "fixed" format.
882 Loading pickled data received from untrusted sources can be unsafe.
883
884 See: https://docs.python.org/3/library/pickle.html for more.
885
886
887 Parameters
888 ----------
889 key : str
890 where : list of Term (or convertible) objects, optional
891 start : integer (defaults to None), row number to start selection
892 stop : integer (defaults to None), row number to stop selection
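
        Examples
        --------
        A hypothetical sketch; the returned coordinates can be passed back to
        ``select`` as a ``where`` (``"df"`` is assumed to be a previously
        appended table with a data column ``A``):

        >>> coords = store.select_as_coordinates("df", where="A > 1")  # doctest: +SKIP
        >>> store.select("df", where=coords)  # doctest: +SKIP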
893 """
894 where = _ensure_term(where, scope_level=1)
895 tbl = self.get_storer(key)
896 if not isinstance(tbl, Table):
897 raise TypeError("can only read_coordinates with a table")
898 return tbl.read_coordinates(where=where, start=start, stop=stop)
899
900 def select_column(
901 self,
902 key: str,
903 column: str,
904 start: int | None = None,
905 stop: int | None = None,
906 ):
907 """
908 return a single column from the table. This is generally only useful to
909 select an indexable
910
911 .. warning::
912
913 Pandas uses PyTables for reading and writing HDF5 files, which allows
914 serializing object-dtype data with pickle when using the "fixed" format.
915 Loading pickled data received from untrusted sources can be unsafe.
916
917 See: https://docs.python.org/3/library/pickle.html for more.
918
919 Parameters
920 ----------
921 key : str
922 column : str
923 The column of interest.
924 start : int or None, default None
925 stop : int or None, default None
926
927 Raises
928 ------
929 raises KeyError if the column is not found (or key is not a valid
930 store)
931 raises ValueError if the column can not be extracted individually (it
932 is part of a data block)
933
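        Examples
        --------
        A hypothetical sketch; the column must be an indexable or a data
        column of a table written with ``append``:

        >>> store.append("df", df, data_columns=["A"])  # doctest: +SKIP
        >>> store.select_column("df", "A")  # doctest: +SKIP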
934 """
935 tbl = self.get_storer(key)
936 if not isinstance(tbl, Table):
937 raise TypeError("can only read_column with a table")
938 return tbl.read_column(column=column, start=start, stop=stop)
939
940 def select_as_multiple(
941 self,
942 keys,
943 where=None,
944 selector=None,
945 columns=None,
946 start=None,
947 stop=None,
948 iterator: bool = False,
949 chunksize=None,
950 auto_close: bool = False,
951 ):
952 """
953 Retrieve pandas objects from multiple tables.
954
955 .. warning::
956
957 Pandas uses PyTables for reading and writing HDF5 files, which allows
958 serializing object-dtype data with pickle when using the "fixed" format.
959 Loading pickled data received from untrusted sources can be unsafe.
960
961 See: https://docs.python.org/3/library/pickle.html for more.
962
963 Parameters
964 ----------
965 keys : a list of the tables
966 selector : the table to apply the where criteria (defaults to keys[0]
967 if not supplied)
968 columns : the columns I want back
969 start : integer (defaults to None), row number to start selection
970 stop : integer (defaults to None), row number to stop selection
971 iterator : bool, return an iterator, default False
972 chunksize : nrows to include in iteration, return an iterator
973 auto_close : bool, default False
974 Should automatically close the store when finished.
975
976 Raises
977 ------
978 raises KeyError if keys or selector is not found or keys is empty
979 raises TypeError if keys is not a list or tuple
980 raises ValueError if the tables are not ALL THE SAME DIMENSIONS
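
        Examples
        --------
        A hypothetical sketch with two previously written tables that share
        the same index:

        >>> store.select_as_multiple(  # doctest: +SKIP
        ...     ["df1", "df2"], where="index > 1", selector="df1"
        ... )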
981 """
982 # default to single select
983 where = _ensure_term(where, scope_level=1)
984 if isinstance(keys, (list, tuple)) and len(keys) == 1:
985 keys = keys[0]
986 if isinstance(keys, str):
987 return self.select(
988 key=keys,
989 where=where,
990 columns=columns,
991 start=start,
992 stop=stop,
993 iterator=iterator,
994 chunksize=chunksize,
995 auto_close=auto_close,
996 )
997
998 if not isinstance(keys, (list, tuple)):
999 raise TypeError("keys must be a list/tuple")
1000
1001 if not len(keys):
1002 raise ValueError("keys must have a non-zero length")
1003
1004 if selector is None:
1005 selector = keys[0]
1006
1007 # collect the tables
1008 tbls = [self.get_storer(k) for k in keys]
1009 s = self.get_storer(selector)
1010
1011 # validate rows
1012 nrows = None
1013 for t, k in itertools.chain([(s, selector)], zip(tbls, keys)):
1014 if t is None:
1015 raise KeyError(f"Invalid table [{k}]")
1016 if not t.is_table:
1017 raise TypeError(
1018 f"object [{t.pathname}] is not a table, and cannot be used in all "
1019 "select as multiple"
1020 )
1021
1022 if nrows is None:
1023 nrows = t.nrows
1024 elif t.nrows != nrows:
1025 raise ValueError("all tables must have exactly the same nrows!")
1026
1027 # The isinstance checks here are redundant with the check above,
1028 # but necessary for mypy; see GH#29757
1029 _tbls = [x for x in tbls if isinstance(x, Table)]
1030
        # axis is the concatenation axis
1032 axis = {t.non_index_axes[0][0] for t in _tbls}.pop()
1033
1034 def func(_start, _stop, _where):
1035 # retrieve the objs, _where is always passed as a set of
1036 # coordinates here
1037 objs = [
1038 t.read(where=_where, columns=columns, start=_start, stop=_stop)
1039 for t in tbls
1040 ]
1041
1042 # concat and return
1043 return concat(objs, axis=axis, verify_integrity=False)._consolidate()
1044
1045 # create the iterator
1046 it = TableIterator(
1047 self,
1048 s,
1049 func,
1050 where=where,
1051 nrows=nrows,
1052 start=start,
1053 stop=stop,
1054 iterator=iterator,
1055 chunksize=chunksize,
1056 auto_close=auto_close,
1057 )
1058
1059 return it.get_result(coordinates=True)
1060
1061 def put(
1062 self,
1063 key: str,
1064 value: DataFrame | Series,
1065 format=None,
1066 index: bool = True,
1067 append: bool = False,
1068 complib=None,
1069 complevel: int | None = None,
1070 min_itemsize: int | dict[str, int] | None = None,
1071 nan_rep=None,
1072 data_columns: Literal[True] | list[str] | None = None,
1073 encoding=None,
1074 errors: str = "strict",
1075 track_times: bool = True,
1076 dropna: bool = False,
1077 ) -> None:
1078 """
1079 Store object in HDFStore.
1080
1081 Parameters
1082 ----------
1083 key : str
1084 value : {Series, DataFrame}
1085 format : 'fixed(f)|table(t)', default is 'fixed'
1086 Format to use when storing object in HDFStore. Value can be one of:
1087
1088 ``'fixed'``
1089 Fixed format. Fast writing/reading. Not-appendable, nor searchable.
1090 ``'table'``
1091 Table format. Write as a PyTables Table structure which may perform
1092 worse but allow more flexible operations like searching / selecting
1093 subsets of the data.
1094 index : bool, default True
1095 Write DataFrame index as a column.
1096 append : bool, default False
1097 This will force Table format, append the input data to the existing.
1098 data_columns : list of columns or True, default None
1099 List of columns to create as data columns, or True to use all columns.
1100 See `here
1101 <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
1102 encoding : str, default None
1103 Provide an encoding for strings.
1104 track_times : bool, default True
1105 Parameter is propagated to 'create_table' method of 'PyTables'.
            If set to False it allows generating identical h5 files (same hashes)
            independent of creation time.
1108 dropna : bool, default False, optional
1109 Remove missing values.
1110
1111 .. versionadded:: 1.1.0
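
        Examples
        --------
        A minimal, hypothetical sketch; ``format="table"`` is needed if the
        object should later be queryable with ``where``:

        >>> store = pd.HDFStore("store.h5")  # doctest: +SKIP
        >>> store.put("df", pd.DataFrame({"A": [1, 2]}), format="table")  # doctest: +SKIP
        >>> store.close()  # doctest: +SKIP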
1112 """
1113 if format is None:
1114 format = get_option("io.hdf.default_format") or "fixed"
1115 format = self._validate_format(format)
1116 self._write_to_group(
1117 key,
1118 value,
1119 format=format,
1120 index=index,
1121 append=append,
1122 complib=complib,
1123 complevel=complevel,
1124 min_itemsize=min_itemsize,
1125 nan_rep=nan_rep,
1126 data_columns=data_columns,
1127 encoding=encoding,
1128 errors=errors,
1129 track_times=track_times,
1130 dropna=dropna,
1131 )
1132
1133 def remove(self, key: str, where=None, start=None, stop=None) -> None:
1134 """
1135 Remove pandas object partially by specifying the where condition
1136
1137 Parameters
1138 ----------
1139 key : str
1140 Node to remove or delete rows from
1141 where : list of Term (or convertible) objects, optional
1142 start : integer (defaults to None), row number to start selection
1143 stop : integer (defaults to None), row number to stop selection
1144
1145 Returns
1146 -------
1147 number of rows removed (or None if not a Table)
1148
1149 Raises
1150 ------
1151 raises KeyError if key is not a valid store
1152
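        Examples
        --------
        A hypothetical sketch; removing rows with ``where`` only works on
        table-format nodes, while omitting ``where`` removes the whole node:

        >>> store.remove("df", where="index > 2")  # doctest: +SKIP
        >>> store.remove("df")  # doctest: +SKIP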
1153 """
1154 where = _ensure_term(where, scope_level=1)
1155 try:
1156 s = self.get_storer(key)
1157 except KeyError:
1158 # the key is not a valid store, re-raising KeyError
1159 raise
1160 except AssertionError:
1161 # surface any assertion errors for e.g. debugging
1162 raise
1163 except Exception as err:
1164 # In tests we get here with ClosedFileError, TypeError, and
1165 # _table_mod.NoSuchNodeError. TODO: Catch only these?
1166
1167 if where is not None:
1168 raise ValueError(
1169 "trying to remove a node with a non-None where clause!"
1170 ) from err
1171
1172 # we are actually trying to remove a node (with children)
1173 node = self.get_node(key)
1174 if node is not None:
1175 node._f_remove(recursive=True)
1176 return None
1177
1178 # remove the node
1179 if com.all_none(where, start, stop):
1180 s.group._f_remove(recursive=True)
1181
1182 # delete from the table
1183 else:
1184 if not s.is_table:
1185 raise ValueError(
1186 "can only remove with where on objects written as tables"
1187 )
1188 return s.delete(where=where, start=start, stop=stop)
1189
1190 def append(
1191 self,
1192 key: str,
1193 value: DataFrame | Series,
1194 format=None,
1195 axes=None,
1196 index: bool | list[str] = True,
1197 append: bool = True,
1198 complib=None,
1199 complevel: int | None = None,
1200 columns=None,
1201 min_itemsize: int | dict[str, int] | None = None,
1202 nan_rep=None,
1203 chunksize=None,
1204 expectedrows=None,
1205 dropna: bool | None = None,
1206 data_columns: Literal[True] | list[str] | None = None,
1207 encoding=None,
1208 errors: str = "strict",
1209 ) -> None:
1210 """
1211 Append to Table in file.
1212
1213 Node must already exist and be Table format.
1214
1215 Parameters
1216 ----------
1217 key : str
1218 value : {Series, DataFrame}
1219 format : 'table' is the default
1220 Format to use when storing object in HDFStore. Value can be one of:
1221
1222 ``'table'``
1223 Table format. Write as a PyTables Table structure which may perform
1224 worse but allow more flexible operations like searching / selecting
1225 subsets of the data.
1226 index : bool, default True
1227 Write DataFrame index as a column.
1228 append : bool, default True
1229 Append the input data to the existing.
1230 data_columns : list of columns, or True, default None
1231 List of columns to create as indexed data columns for on-disk
1232 queries, or True to use all columns. By default only the axes
1233 of the object are indexed. See `here
1234 <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
1235 min_itemsize : dict of columns that specify minimum str sizes
1236 nan_rep : str to use as str nan representation
1237 chunksize : size to chunk the writing
1238 expectedrows : expected TOTAL row size of this table
1239 encoding : default None, provide an encoding for str
1240 dropna : bool, default False, optional
            Do not write an ALL nan row to the store, settable
            by the option 'io.hdf.dropna_table'.
1243
1244 Notes
1245 -----
1246 Does *not* check if data being appended overlaps with existing
1247 data in the table, so be careful
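
        Examples
        --------
        A minimal, hypothetical sketch; repeated calls with the same key grow
        the table:

        >>> store.append("df", pd.DataFrame({"A": [1, 2]}))  # doctest: +SKIP
        >>> store.append("df", pd.DataFrame({"A": [3, 4]}))  # doctest: +SKIP
        >>> store.select("df")  # doctest: +SKIP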
1248 """
1249 if columns is not None:
1250 raise TypeError(
1251 "columns is not a supported keyword in append, try data_columns"
1252 )
1253
1254 if dropna is None:
1255 dropna = get_option("io.hdf.dropna_table")
1256 if format is None:
1257 format = get_option("io.hdf.default_format") or "table"
1258 format = self._validate_format(format)
1259 self._write_to_group(
1260 key,
1261 value,
1262 format=format,
1263 axes=axes,
1264 index=index,
1265 append=append,
1266 complib=complib,
1267 complevel=complevel,
1268 min_itemsize=min_itemsize,
1269 nan_rep=nan_rep,
1270 chunksize=chunksize,
1271 expectedrows=expectedrows,
1272 dropna=dropna,
1273 data_columns=data_columns,
1274 encoding=encoding,
1275 errors=errors,
1276 )
1277
1278 def append_to_multiple(
1279 self,
1280 d: dict,
1281 value,
1282 selector,
1283 data_columns=None,
1284 axes=None,
1285 dropna: bool = False,
1286 **kwargs,
1287 ) -> None:
1288 """
1289 Append to multiple tables
1290
1291 Parameters
1292 ----------
1293 d : a dict of table_name to table_columns, None is acceptable as the
1294 values of one node (this will get all the remaining columns)
1295 value : a pandas object
1296 selector : a string that designates the indexable table; all of its
            columns will be designated as data_columns, unless data_columns is
1298 passed, in which case these are used
1299 data_columns : list of columns to create as data columns, or True to
1300 use all columns
        dropna : if True, drop rows from all tables if any single
1302 row in each table has all NaN. Default False.
1303
1304 Notes
1305 -----
1306 axes parameter is currently not accepted
1307
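        Examples
        --------
        A hypothetical sketch splitting a frame column-wise across two tables
        (``None`` collects the remaining columns):

        >>> store.append_to_multiple(  # doctest: +SKIP
        ...     {"df1": ["A", "B"], "df2": None}, df, selector="df1"
        ... )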
1308 """
1309 if axes is not None:
1310 raise TypeError(
1311 "axes is currently not accepted as a parameter to append_to_multiple; "
1312 "you can create the tables independently instead"
1313 )
1314
1315 if not isinstance(d, dict):
1316 raise ValueError(
1317 "append_to_multiple must have a dictionary specified as the "
1318 "way to split the value"
1319 )
1320
1321 if selector not in d:
1322 raise ValueError(
1323 "append_to_multiple requires a selector that is in passed dict"
1324 )
1325
1326 # figure out the splitting axis (the non_index_axis)
1327 axis = list(set(range(value.ndim)) - set(_AXES_MAP[type(value)]))[0]
1328
1329 # figure out how to split the value
1330 remain_key = None
1331 remain_values: list = []
1332 for k, v in d.items():
1333 if v is None:
1334 if remain_key is not None:
1335 raise ValueError(
1336 "append_to_multiple can only have one value in d that is None"
1337 )
1338 remain_key = k
1339 else:
1340 remain_values.extend(v)
1341 if remain_key is not None:
1342 ordered = value.axes[axis]
1343 ordd = ordered.difference(Index(remain_values))
1344 ordd = sorted(ordered.get_indexer(ordd))
1345 d[remain_key] = ordered.take(ordd)
1346
1347 # data_columns
1348 if data_columns is None:
1349 data_columns = d[selector]
1350
1351 # ensure rows are synchronized across the tables
1352 if dropna:
1353 idxs = (value[cols].dropna(how="all").index for cols in d.values())
1354 valid_index = next(idxs)
1355 for index in idxs:
1356 valid_index = valid_index.intersection(index)
1357 value = value.loc[valid_index]
1358
1359 min_itemsize = kwargs.pop("min_itemsize", None)
1360
1361 # append
1362 for k, v in d.items():
1363 dc = data_columns if k == selector else None
1364
1365 # compute the val
1366 val = value.reindex(v, axis=axis)
1367
1368 filtered = (
1369 {key: value for (key, value) in min_itemsize.items() if key in v}
1370 if min_itemsize is not None
1371 else None
1372 )
1373 self.append(k, val, data_columns=dc, min_itemsize=filtered, **kwargs)
1374
1375 def create_table_index(
1376 self,
1377 key: str,
1378 columns=None,
1379 optlevel: int | None = None,
1380 kind: str | None = None,
1381 ) -> None:
1382 """
1383 Create a pytables index on the table.
1384
1385 Parameters
1386 ----------
1387 key : str
1388 columns : None, bool, or listlike[str]
1389 Indicate which columns to create an index on.
1390
1391 * False : Do not create any indexes.
1392 * True : Create indexes on all columns.
1393 * None : Create indexes on all columns.
1394 * listlike : Create indexes on the given columns.
1395
1396 optlevel : int or None, default None
1397 Optimization level, if None, pytables defaults to 6.
1398 kind : str or None, default None
1399 Kind of index, if None, pytables defaults to "medium".
1400
1401 Raises
1402 ------
1403 TypeError: raises if the node is not a table
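
        Examples
        --------
        A hypothetical sketch; only indexables and data columns of a
        table-format node can be indexed:

        >>> store.append("df", df, data_columns=["A"], index=False)  # doctest: +SKIP
        >>> store.create_table_index("df", columns=["A"], kind="full")  # doctest: +SKIP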
1404 """
1405 # version requirements
1406 _tables()
1407 s = self.get_storer(key)
1408 if s is None:
1409 return
1410
1411 if not isinstance(s, Table):
1412 raise TypeError("cannot create table index on a Fixed format store")
1413 s.create_index(columns=columns, optlevel=optlevel, kind=kind)
1414
1415 def groups(self) -> list:
1416 """
1417 Return a list of all the top-level nodes.
1418
        Each node returned is a PyTables group node, not a pandas storage object.
1420
1421 Returns
1422 -------
1423 list
1424 List of objects.
1425 """
1426 _tables()
1427 self._check_if_open()
1428 assert self._handle is not None # for mypy
1429 assert _table_mod is not None # for mypy
1430 return [
1431 g
1432 for g in self._handle.walk_groups()
1433 if (
1434 not isinstance(g, _table_mod.link.Link)
1435 and (
1436 getattr(g._v_attrs, "pandas_type", None)
1437 or getattr(g, "table", None)
1438 or (isinstance(g, _table_mod.table.Table) and g._v_name != "table")
1439 )
1440 )
1441 ]
1442
1443 def walk(self, where: str = "/") -> Iterator[tuple[str, list[str], list[str]]]:
1444 """
1445 Walk the pytables group hierarchy for pandas objects.
1446
1447 This generator will yield the group path, subgroups and pandas object
1448 names for each group.
1449
1450 Any non-pandas PyTables objects that are not a group will be ignored.
1451
1452 The `where` group itself is listed first (preorder), then each of its
1453 child groups (following an alphanumerical order) is also traversed,
1454 following the same procedure.
1455
1456 Parameters
1457 ----------
1458 where : str, default "/"
1459 Group where to start walking.
1460
1461 Yields
1462 ------
1463 path : str
1464 Full path to a group (without trailing '/').
1465 groups : list
1466 Names (strings) of the groups contained in `path`.
1467 leaves : list
1468 Names (strings) of the pandas objects contained in `path`.
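
        Examples
        --------
        A hypothetical sketch; nested keys create intermediate groups that
        ``walk`` reports as subgroups:

        >>> store.put("foo/bar", pd.DataFrame({"A": [1]}))  # doctest: +SKIP
        >>> for path, subgroups, leaves in store.walk():  # doctest: +SKIP
        ...     print(path, subgroups, leaves)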
1469 """
1470 _tables()
1471 self._check_if_open()
1472 assert self._handle is not None # for mypy
1473 assert _table_mod is not None # for mypy
1474
1475 for g in self._handle.walk_groups(where):
1476 if getattr(g._v_attrs, "pandas_type", None) is not None:
1477 continue
1478
1479 groups = []
1480 leaves = []
1481 for child in g._v_children.values():
1482 pandas_type = getattr(child._v_attrs, "pandas_type", None)
1483 if pandas_type is None:
1484 if isinstance(child, _table_mod.group.Group):
1485 groups.append(child._v_name)
1486 else:
1487 leaves.append(child._v_name)
1488
1489 yield (g._v_pathname.rstrip("/"), groups, leaves)
1490
1491 def get_node(self, key: str) -> Node | None:
1492 """return the node with the key or None if it does not exist"""
1493 self._check_if_open()
1494 if not key.startswith("/"):
1495 key = "/" + key
1496
1497 assert self._handle is not None
1498 assert _table_mod is not None # for mypy
1499 try:
1500 node = self._handle.get_node(self.root, key)
1501 except _table_mod.exceptions.NoSuchNodeError:
1502 return None
1503
1504 assert isinstance(node, _table_mod.Node), type(node)
1505 return node
1506
1507 def get_storer(self, key: str) -> GenericFixed | Table:
1508 """return the storer object for a key, raise if not in the file"""
1509 group = self.get_node(key)
1510 if group is None:
1511 raise KeyError(f"No object named {key} in the file")
1512
1513 s = self._create_storer(group)
1514 s.infer_axes()
1515 return s
1516
1517 def copy(
1518 self,
1519 file,
1520 mode: str = "w",
1521 propindexes: bool = True,
1522 keys=None,
1523 complib=None,
1524 complevel: int | None = None,
1525 fletcher32: bool = False,
1526 overwrite: bool = True,
1527 ) -> HDFStore:
1528 """
1529 Copy the existing store to a new file, updating in place.
1530
1531 Parameters
1532 ----------
1533 propindexes : bool, default True
1534 Restore indexes in copied file.
1535 keys : list, optional
1536 List of keys to include in the copy (defaults to all).
1537 overwrite : bool, default True
1538 Whether to overwrite (remove and replace) existing nodes in the new store.
1539 mode, complib, complevel, fletcher32 same as in HDFStore.__init__
1540
1541 Returns
1542 -------
1543 open file handle of the new store
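
        Examples
        --------
        A hypothetical sketch; the returned store is open and should be
        closed by the caller:

        >>> new_store = store.copy("copy.h5")  # doctest: +SKIP
        >>> new_store.close()  # doctest: +SKIP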
1544 """
1545 new_store = HDFStore(
1546 file, mode=mode, complib=complib, complevel=complevel, fletcher32=fletcher32
1547 )
1548 if keys is None:
1549 keys = list(self.keys())
1550 if not isinstance(keys, (tuple, list)):
1551 keys = [keys]
1552 for k in keys:
1553 s = self.get_storer(k)
1554 if s is not None:
1555 if k in new_store:
1556 if overwrite:
1557 new_store.remove(k)
1558
1559 data = self.select(k)
1560 if isinstance(s, Table):
1561 index: bool | list[str] = False
1562 if propindexes:
1563 index = [a.name for a in s.axes if a.is_indexed]
1564 new_store.append(
1565 k,
1566 data,
1567 index=index,
1568 data_columns=getattr(s, "data_columns", None),
1569 encoding=s.encoding,
1570 )
1571 else:
1572 new_store.put(k, data, encoding=s.encoding)
1573
1574 return new_store
1575
1576 def info(self) -> str:
1577 """
        Return detailed information on the store.
1579
1580 Returns
1581 -------
1582 str
1583 """
1584 path = pprint_thing(self._path)
1585 output = f"{type(self)}\nFile path: {path}\n"
1586
1587 if self.is_open:
1588 lkeys = sorted(self.keys())
1589 if len(lkeys):
1590 keys = []
1591 values = []
1592
1593 for k in lkeys:
1594 try:
1595 s = self.get_storer(k)
1596 if s is not None:
1597 keys.append(pprint_thing(s.pathname or k))
1598 values.append(pprint_thing(s or "invalid_HDFStore node"))
1599 except AssertionError:
1600 # surface any assertion errors for e.g. debugging
1601 raise
1602 except Exception as detail:
1603 keys.append(k)
1604 dstr = pprint_thing(detail)
1605 values.append(f"[invalid_HDFStore node: {dstr}]")
1606
1607 output += adjoin(12, keys, values)
1608 else:
1609 output += "Empty"
1610 else:
1611 output += "File is CLOSED"
1612
1613 return output
1614
1615 # ------------------------------------------------------------------------
1616 # private methods
1617
1618 def _check_if_open(self):
1619 if not self.is_open:
1620 raise ClosedFileError(f"{self._path} file is not open!")
1621
1622 def _validate_format(self, format: str) -> str:
1623 """validate / deprecate formats"""
1624 # validate
1625 try:
1626 format = _FORMAT_MAP[format.lower()]
1627 except KeyError as err:
1628 raise TypeError(f"invalid HDFStore format specified [{format}]") from err
1629
1630 return format
1631
1632 def _create_storer(
1633 self,
1634 group,
1635 format=None,
1636 value: DataFrame | Series | None = None,
1637 encoding: str = "UTF-8",
1638 errors: str = "strict",
1639 ) -> GenericFixed | Table:
1640 """return a suitable class to operate"""
1641 cls: type[GenericFixed] | type[Table]
1642
1643 if value is not None and not isinstance(value, (Series, DataFrame)):
1644 raise TypeError("value must be None, Series, or DataFrame")
1645
1646 pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None))
1647 tt = _ensure_decoded(getattr(group._v_attrs, "table_type", None))
1648
1649 # infer the pt from the passed value
1650 if pt is None:
1651 if value is None:
1652 _tables()
1653 assert _table_mod is not None # for mypy
1654 if getattr(group, "table", None) or isinstance(
1655 group, _table_mod.table.Table
1656 ):
1657 pt = "frame_table"
1658 tt = "generic_table"
1659 else:
1660 raise TypeError(
1661 "cannot create a storer if the object is not existing "
1662 "nor a value are passed"
1663 )
1664 else:
1665 if isinstance(value, Series):
1666 pt = "series"
1667 else:
1668 pt = "frame"
1669
1670 # we are actually a table
1671 if format == "table":
1672 pt += "_table"
1673
1674 # a storer node
1675 if "table" not in pt:
1676 _STORER_MAP = {"series": SeriesFixed, "frame": FrameFixed}
1677 try:
1678 cls = _STORER_MAP[pt]
1679 except KeyError as err:
1680 raise TypeError(
1681 f"cannot properly create the storer for: [_STORER_MAP] [group->"
1682 f"{group},value->{type(value)},format->{format}"
1683 ) from err
1684 return cls(self, group, encoding=encoding, errors=errors)
1685
1686 # existing node (and must be a table)
1687 if tt is None:
1688 # if we are a writer, determine the tt
1689 if value is not None:
1690 if pt == "series_table":
1691 index = getattr(value, "index", None)
1692 if index is not None:
1693 if index.nlevels == 1:
1694 tt = "appendable_series"
1695 elif index.nlevels > 1:
1696 tt = "appendable_multiseries"
1697 elif pt == "frame_table":
1698 index = getattr(value, "index", None)
1699 if index is not None:
1700 if index.nlevels == 1:
1701 tt = "appendable_frame"
1702 elif index.nlevels > 1:
1703 tt = "appendable_multiframe"
1704
1705 _TABLE_MAP = {
1706 "generic_table": GenericTable,
1707 "appendable_series": AppendableSeriesTable,
1708 "appendable_multiseries": AppendableMultiSeriesTable,
1709 "appendable_frame": AppendableFrameTable,
1710 "appendable_multiframe": AppendableMultiFrameTable,
1711 "worm": WORMTable,
1712 }
1713 try:
1714 cls = _TABLE_MAP[tt]
1715 except KeyError as err:
1716 raise TypeError(
1717 f"cannot properly create the storer for: [_TABLE_MAP] [group->"
1718 f"{group},value->{type(value)},format->{format}"
1719 ) from err
1720
1721 return cls(self, group, encoding=encoding, errors=errors)
1722
1723 def _write_to_group(
1724 self,
1725 key: str,
1726 value: DataFrame | Series,
1727 format,
1728 axes=None,
1729 index: bool | list[str] = True,
1730 append: bool = False,
1731 complib=None,
1732 complevel: int | None = None,
1733 fletcher32=None,
1734 min_itemsize: int | dict[str, int] | None = None,
1735 chunksize=None,
1736 expectedrows=None,
1737 dropna: bool = False,
1738 nan_rep=None,
1739 data_columns=None,
1740 encoding=None,
1741 errors: str = "strict",
1742 track_times: bool = True,
1743 ) -> None:
1744 # we don't want to store a table node at all if our object is 0-len
        # as there are no dtypes
1746 if getattr(value, "empty", None) and (format == "table" or append):
1747 return
1748
1749 group = self._identify_group(key, append)
1750
1751 s = self._create_storer(group, format, value, encoding=encoding, errors=errors)
1752 if append:
1753 # raise if we are trying to append to a Fixed format,
1754 # or a table that exists (and we are putting)
1755 if not s.is_table or (s.is_table and format == "fixed" and s.is_exists):
1756 raise ValueError("Can only append to Tables")
1757 if not s.is_exists:
1758 s.set_object_info()
1759 else:
1760 s.set_object_info()
1761
1762 if not s.is_table and complib:
1763 raise ValueError("Compression not supported on Fixed format stores")
1764
1765 # write the object
1766 s.write(
1767 obj=value,
1768 axes=axes,
1769 append=append,
1770 complib=complib,
1771 complevel=complevel,
1772 fletcher32=fletcher32,
1773 min_itemsize=min_itemsize,
1774 chunksize=chunksize,
1775 expectedrows=expectedrows,
1776 dropna=dropna,
1777 nan_rep=nan_rep,
1778 data_columns=data_columns,
1779 track_times=track_times,
1780 )
1781
1782 if isinstance(s, Table) and index:
1783 s.create_index(columns=index)
1784
1785 def _read_group(self, group: Node):
1786 s = self._create_storer(group)
1787 s.infer_axes()
1788 return s.read()
1789
1790 def _identify_group(self, key: str, append: bool) -> Node:
1791 """Identify HDF5 group based on key, delete/create group if needed."""
1792 group = self.get_node(key)
1793
1794 # we make this assertion for mypy; the get_node call will already
1795 # have raised if this is incorrect
1796 assert self._handle is not None
1797
1798 # remove the node if we are not appending
1799 if group is not None and not append:
1800 self._handle.remove_node(group, recursive=True)
1801 group = None
1802
1803 if group is None:
1804 group = self._create_nodes_and_group(key)
1805
1806 return group
1807
1808 def _create_nodes_and_group(self, key: str) -> Node:
1809 """Create nodes from key and return group name."""
1810 # assertion for mypy
1811 assert self._handle is not None
1812
1813 paths = key.split("/")
1814 # recursively create the groups
1815 path = "/"
1816 for p in paths:
1817 if not len(p):
1818 continue
1819 new_path = path
1820 if not path.endswith("/"):
1821 new_path += "/"
1822 new_path += p
1823 group = self.get_node(new_path)
1824 if group is None:
1825 group = self._handle.create_group(path, p)
1826 path = new_path
1827 return group
1828
1829
1830class TableIterator:
1831 """
1832 Define the iteration interface on a table
1833
1834 Parameters
1835 ----------
1836 store : HDFStore
1837 s : the referred storer
1838 func : the function to execute the query
1839 where : the where of the query
1840 nrows : the rows to iterate on
1841 start : the passed start value (default is None)
1842 stop : the passed stop value (default is None)
1843 iterator : bool, default False
1844 Whether to use the default iterator.
1845 chunksize : the passed chunking value (default is 100000)
1846 auto_close : bool, default False
1847 Whether to automatically close the store at the end of iteration.
1848 """
1849
1850 chunksize: int | None
1851 store: HDFStore
1852 s: GenericFixed | Table
1853
1854 def __init__(
1855 self,
1856 store: HDFStore,
1857 s: GenericFixed | Table,
1858 func,
1859 where,
1860 nrows,
1861 start=None,
1862 stop=None,
1863 iterator: bool = False,
1864 chunksize: int | None = None,
1865 auto_close: bool = False,
1866 ) -> None:
1867 self.store = store
1868 self.s = s
1869 self.func = func
1870 self.where = where
1871
        # if we are a table, set start/stop if they are not already set
1873 if self.s.is_table:
1874 if nrows is None:
1875 nrows = 0
1876 if start is None:
1877 start = 0
1878 if stop is None:
1879 stop = nrows
1880 stop = min(nrows, stop)
1881
1882 self.nrows = nrows
1883 self.start = start
1884 self.stop = stop
1885
1886 self.coordinates = None
1887 if iterator or chunksize is not None:
1888 if chunksize is None:
1889 chunksize = 100000
1890 self.chunksize = int(chunksize)
1891 else:
1892 self.chunksize = None
1893
1894 self.auto_close = auto_close
1895
1896 def __iter__(self) -> Iterator:
1897 # iterate
1898 current = self.start
1899 if self.coordinates is None:
1900 raise ValueError("Cannot iterate until get_result is called.")
1901 while current < self.stop:
1902 stop = min(current + self.chunksize, self.stop)
1903 value = self.func(None, None, self.coordinates[current:stop])
1904 current = stop
1905 if value is None or not len(value):
1906 continue
1907
1908 yield value
1909
1910 self.close()
1911
1912 def close(self) -> None:
1913 if self.auto_close:
1914 self.store.close()
1915
1916 def get_result(self, coordinates: bool = False):
1917 # return the actual iterator
1918 if self.chunksize is not None:
1919 if not isinstance(self.s, Table):
1920 raise TypeError("can only use an iterator or chunksize on a table")
1921
1922 self.coordinates = self.s.read_coordinates(where=self.where)
1923
1924 return self
1925
1926 # if specified read via coordinates (necessary for multiple selections
1927 if coordinates:
1928 if not isinstance(self.s, Table):
1929 raise TypeError("can only read_coordinates on a table")
1930 where = self.s.read_coordinates(
1931 where=self.where, start=self.start, stop=self.stop
1932 )
1933 else:
1934 where = self.where
1935
1936 # directly return the result
1937 results = self.func(self.start, self.stop, where)
1938 self.close()
1939 return results
1940
1941
1942class IndexCol:
1943 """
1944 an index column description class
1945
1946 Parameters
1947 ----------
1948 axis : axis which I reference
1949 values : the ndarray like converted values
1950 kind : a string description of this type
1951 typ : the pytables type
1952 pos : the position in the pytables
1953
1954 """
1955
1956 is_an_indexable: bool = True
1957 is_data_indexable: bool = True
1958 _info_fields = ["freq", "tz", "index_name"]
1959
1960 def __init__(
1961 self,
1962 name: str,
1963 values=None,
1964 kind=None,
1965 typ=None,
1966 cname: str | None = None,
1967 axis=None,
1968 pos=None,
1969 freq=None,
1970 tz=None,
1971 index_name=None,
1972 ordered=None,
1973 table=None,
1974 meta=None,
1975 metadata=None,
1976 ) -> None:
1977 if not isinstance(name, str):
1978 raise ValueError("`name` must be a str.")
1979
1980 self.values = values
1981 self.kind = kind
1982 self.typ = typ
1983 self.name = name
1984 self.cname = cname or name
1985 self.axis = axis
1986 self.pos = pos
1987 self.freq = freq
1988 self.tz = tz
1989 self.index_name = index_name
1990 self.ordered = ordered
1991 self.table = table
1992 self.meta = meta
1993 self.metadata = metadata
1994
1995 if pos is not None:
1996 self.set_pos(pos)
1997
1998 # These are ensured as long as the passed arguments match the
1999 # constructor annotations.
2000 assert isinstance(self.name, str)
2001 assert isinstance(self.cname, str)
2002
2003 @property
2004 def itemsize(self) -> int:
2005 # Assumes self.typ has already been initialized
2006 return self.typ.itemsize
2007
2008 @property
2009 def kind_attr(self) -> str:
2010 return f"{self.name}_kind"
2011
2012 def set_pos(self, pos: int) -> None:
2013 """set the position of this column in the Table"""
2014 self.pos = pos
2015 if pos is not None and self.typ is not None:
2016 self.typ._v_pos = pos
2017
2018 def __repr__(self) -> str:
2019 temp = tuple(
2020 map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind))
2021 )
2022 return ",".join(
2023 [
2024 f"{key}->{value}"
2025 for key, value in zip(["name", "cname", "axis", "pos", "kind"], temp)
2026 ]
2027 )
2028
2029 def __eq__(self, other: Any) -> bool:
2030 """compare 2 col items"""
2031 return all(
2032 getattr(self, a, None) == getattr(other, a, None)
2033 for a in ["name", "cname", "axis", "pos"]
2034 )
2035
2036 def __ne__(self, other) -> bool:
2037 return not self.__eq__(other)
2038
2039 @property
2040 def is_indexed(self) -> bool:
2041 """return whether I am an indexed column"""
2042 if not hasattr(self.table, "cols"):
2043 # e.g. if infer hasn't been called yet, self.table will be None.
2044 return False
2045 return getattr(self.table.cols, self.cname).is_indexed
2046
2047 def convert(
2048 self, values: np.ndarray, nan_rep, encoding: str, errors: str
2049 ) -> tuple[np.ndarray, np.ndarray] | tuple[Index, Index]:
2050 """
2051 Convert the data from this selection to the appropriate pandas type.
2052 """
2053 assert isinstance(values, np.ndarray), type(values)
2054
2055 # values is a recarray
2056 if values.dtype.fields is not None:
2057 # Copy, otherwise values will be a view
2058            # preventing the original recarray from being freed
2059 values = values[self.cname].copy()
2060
2061 val_kind = _ensure_decoded(self.kind)
2062 values = _maybe_convert(values, val_kind, encoding, errors)
2063
2064 kwargs = {}
2065 kwargs["name"] = _ensure_decoded(self.index_name)
2066
2067 if self.freq is not None:
2068 kwargs["freq"] = _ensure_decoded(self.freq)
2069
2070 factory: type[Index] | type[DatetimeIndex] = Index
2071 if is_datetime64_dtype(values.dtype) or is_datetime64tz_dtype(values.dtype):
2072 factory = DatetimeIndex
2073 elif values.dtype == "i8" and "freq" in kwargs:
2074 # PeriodIndex data is stored as i8
2075 # error: Incompatible types in assignment (expression has type
2076 # "Callable[[Any, KwArg(Any)], PeriodIndex]", variable has type
2077 # "Union[Type[Index], Type[DatetimeIndex]]")
2078 factory = lambda x, **kwds: PeriodIndex( # type: ignore[assignment]
2079 ordinal=x, **kwds
2080 )
2081
2082 # making an Index instance could throw a number of different errors
2083 try:
2084 new_pd_index = factory(values, **kwargs)
2085 except ValueError:
2086            # if the output freq is different from what we recorded,
2087 # it should be None (see also 'doc example part 2')
2088 if "freq" in kwargs:
2089 kwargs["freq"] = None
2090 new_pd_index = factory(values, **kwargs)
2091 final_pd_index = _set_tz(new_pd_index, self.tz)
2092 return final_pd_index, final_pd_index
2093
2094 def take_data(self):
2095 """return the values"""
2096 return self.values
2097
2098 @property
2099 def attrs(self):
2100 return self.table._v_attrs
2101
2102 @property
2103 def description(self):
2104 return self.table.description
2105
2106 @property
2107 def col(self):
2108 """return my current col description"""
2109 return getattr(self.description, self.cname, None)
2110
2111 @property
2112 def cvalues(self):
2113 """return my cython values"""
2114 return self.values
2115
2116 def __iter__(self) -> Iterator:
2117 return iter(self.values)
2118
2119 def maybe_set_size(self, min_itemsize=None) -> None:
2120 """
2121 maybe set a string col itemsize:
2122            min_itemsize can be an integer or a dict with this column's name
2123 with an integer size
2124 """
2125 if _ensure_decoded(self.kind) == "string":
2126 if isinstance(min_itemsize, dict):
2127 min_itemsize = min_itemsize.get(self.name)
2128
2129 if min_itemsize is not None and self.typ.itemsize < min_itemsize:
2130 self.typ = _tables().StringCol(itemsize=min_itemsize, pos=self.pos)
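
    # Editor's note: an illustrative sketch (assumed public-API usage, hypothetical
    # file/column names) of how ``min_itemsize`` pre-sizes string columns so that
    # longer strings can still be appended later.
    #
    #     import pandas as pd
    #
    #     with pd.HDFStore("example.h5") as store:
    #         store.append("df", pd.DataFrame({"A": ["ab"]}), min_itemsize={"A": 50})
    #         # without min_itemsize the column width would be 2 and this would raise
    #         store.append("df", pd.DataFrame({"A": ["a considerably longer string"]}))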
2131
2132 def validate_names(self) -> None:
2133 pass
2134
2135 def validate_and_set(self, handler: AppendableTable, append: bool) -> None:
2136 self.table = handler.table
2137 self.validate_col()
2138 self.validate_attr(append)
2139 self.validate_metadata(handler)
2140 self.write_metadata(handler)
2141 self.set_attr()
2142
2143 def validate_col(self, itemsize=None):
2144        """validate this column: return the compared-against itemsize"""
2145 # validate this column for string truncation (or reset to the max size)
2146 if _ensure_decoded(self.kind) == "string":
2147 c = self.col
2148 if c is not None:
2149 if itemsize is None:
2150 itemsize = self.itemsize
2151 if c.itemsize < itemsize:
2152 raise ValueError(
2153 f"Trying to store a string with len [{itemsize}] in "
2154 f"[{self.cname}] column but\nthis column has a limit of "
2155 f"[{c.itemsize}]!\nConsider using min_itemsize to "
2156 "preset the sizes on these columns"
2157 )
2158 return c.itemsize
2159
2160 return None
2161
2162 def validate_attr(self, append: bool) -> None:
2163 # check for backwards incompatibility
2164 if append:
2165 existing_kind = getattr(self.attrs, self.kind_attr, None)
2166 if existing_kind is not None and existing_kind != self.kind:
2167 raise TypeError(
2168 f"incompatible kind in col [{existing_kind} - {self.kind}]"
2169 )
2170
2171 def update_info(self, info) -> None:
2172 """
2173 set/update the info for this indexable with the key/value
2174 if there is a conflict raise/warn as needed
2175 """
2176 for key in self._info_fields:
2177 value = getattr(self, key, None)
2178 idx = info.setdefault(self.name, {})
2179
2180 existing_value = idx.get(key)
2181 if key in idx and value is not None and existing_value != value:
2182 # frequency/name just warn
2183 if key in ["freq", "index_name"]:
2184 ws = attribute_conflict_doc % (key, existing_value, value)
2185 warnings.warn(
2186 ws, AttributeConflictWarning, stacklevel=find_stack_level()
2187 )
2188
2189 # reset
2190 idx[key] = None
2191 setattr(self, key, None)
2192
2193 else:
2194 raise ValueError(
2195 f"invalid info for [{self.name}] for [{key}], "
2196 f"existing_value [{existing_value}] conflicts with "
2197 f"new value [{value}]"
2198 )
2199 else:
2200 if value is not None or existing_value is not None:
2201 idx[key] = value
2202
2203 def set_info(self, info) -> None:
2204 """set my state from the passed info"""
2205 idx = info.get(self.name)
2206 if idx is not None:
2207 self.__dict__.update(idx)
2208
2209 def set_attr(self) -> None:
2210 """set the kind for this column"""
2211 setattr(self.attrs, self.kind_attr, self.kind)
2212
2213 def validate_metadata(self, handler: AppendableTable) -> None:
2214 """validate that kind=category does not change the categories"""
2215 if self.meta == "category":
2216 new_metadata = self.metadata
2217 cur_metadata = handler.read_metadata(self.cname)
2218 if (
2219 new_metadata is not None
2220 and cur_metadata is not None
2221 and not array_equivalent(new_metadata, cur_metadata)
2222 ):
2223 raise ValueError(
2224 "cannot append a categorical with "
2225 "different categories to the existing"
2226 )
2227
2228 def write_metadata(self, handler: AppendableTable) -> None:
2229        """write the metadata for this column"""
2230 if self.metadata is not None:
2231 handler.write_metadata(self.cname, self.metadata)
2232
2233
2234class GenericIndexCol(IndexCol):
2235 """an index which is not represented in the data of the table"""
2236
2237 @property
2238 def is_indexed(self) -> bool:
2239 return False
2240
2241 def convert(
2242 self, values: np.ndarray, nan_rep, encoding: str, errors: str
2243 ) -> tuple[Index, Index]:
2244 """
2245 Convert the data from this selection to the appropriate pandas type.
2246
2247 Parameters
2248 ----------
2249 values : np.ndarray
2250 nan_rep : str
2251 encoding : str
2252 errors : str
2253 """
2254 assert isinstance(values, np.ndarray), type(values)
2255
2256 index = RangeIndex(len(values))
2257 return index, index
2258
2259 def set_attr(self) -> None:
2260 pass
2261
2262
2263class DataCol(IndexCol):
2264 """
2265    a data-holding column; by definition this is not indexable
2266
2267 Parameters
2268 ----------
2269 data : the actual data
2270 cname : the column name in the table to hold the data (typically
2271 values)
2272 meta : a string description of the metadata
2273 metadata : the actual metadata
2274 """
2275
2276 is_an_indexable = False
2277 is_data_indexable = False
2278 _info_fields = ["tz", "ordered"]
2279
2280 def __init__(
2281 self,
2282 name: str,
2283 values=None,
2284 kind=None,
2285 typ=None,
2286 cname: str | None = None,
2287 pos=None,
2288 tz=None,
2289 ordered=None,
2290 table=None,
2291 meta=None,
2292 metadata=None,
2293 dtype: DtypeArg | None = None,
2294 data=None,
2295 ) -> None:
2296 super().__init__(
2297 name=name,
2298 values=values,
2299 kind=kind,
2300 typ=typ,
2301 pos=pos,
2302 cname=cname,
2303 tz=tz,
2304 ordered=ordered,
2305 table=table,
2306 meta=meta,
2307 metadata=metadata,
2308 )
2309 self.dtype = dtype
2310 self.data = data
2311
2312 @property
2313 def dtype_attr(self) -> str:
2314 return f"{self.name}_dtype"
2315
2316 @property
2317 def meta_attr(self) -> str:
2318 return f"{self.name}_meta"
2319
2320 def __repr__(self) -> str:
2321 temp = tuple(
2322 map(
2323 pprint_thing, (self.name, self.cname, self.dtype, self.kind, self.shape)
2324 )
2325 )
2326 return ",".join(
2327 [
2328 f"{key}->{value}"
2329 for key, value in zip(["name", "cname", "dtype", "kind", "shape"], temp)
2330 ]
2331 )
2332
2333 def __eq__(self, other: Any) -> bool:
2334 """compare 2 col items"""
2335 return all(
2336 getattr(self, a, None) == getattr(other, a, None)
2337 for a in ["name", "cname", "dtype", "pos"]
2338 )
2339
2340 def set_data(self, data: ArrayLike) -> None:
2341 assert data is not None
2342 assert self.dtype is None
2343
2344 data, dtype_name = _get_data_and_dtype_name(data)
2345
2346 self.data = data
2347 self.dtype = dtype_name
2348 self.kind = _dtype_to_kind(dtype_name)
2349
2350 def take_data(self):
2351 """return the data"""
2352 return self.data
2353
2354 @classmethod
2355 def _get_atom(cls, values: ArrayLike) -> Col:
2356 """
2357 Get an appropriately typed and shaped pytables.Col object for values.
2358 """
2359 dtype = values.dtype
2360 # error: Item "ExtensionDtype" of "Union[ExtensionDtype, dtype[Any]]" has no
2361 # attribute "itemsize"
2362 itemsize = dtype.itemsize # type: ignore[union-attr]
2363
2364 shape = values.shape
2365 if values.ndim == 1:
2366 # EA, use block shape pretending it is 2D
2367 # TODO(EA2D): not necessary with 2D EAs
2368 shape = (1, values.size)
2369
2370 if isinstance(values, Categorical):
2371 codes = values.codes
2372 atom = cls.get_atom_data(shape, kind=codes.dtype.name)
2373 elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
2374 atom = cls.get_atom_datetime64(shape)
2375 elif is_timedelta64_dtype(dtype):
2376 atom = cls.get_atom_timedelta64(shape)
2377 elif is_complex_dtype(dtype):
2378 atom = _tables().ComplexCol(itemsize=itemsize, shape=shape[0])
2379 elif is_string_dtype(dtype):
2380 atom = cls.get_atom_string(shape, itemsize)
2381 else:
2382 atom = cls.get_atom_data(shape, kind=dtype.name)
2383
2384 return atom
2385
2386 @classmethod
2387 def get_atom_string(cls, shape, itemsize):
2388 return _tables().StringCol(itemsize=itemsize, shape=shape[0])
2389
2390 @classmethod
2391 def get_atom_coltype(cls, kind: str) -> type[Col]:
2392 """return the PyTables column class for this column"""
2393 if kind.startswith("uint"):
2394 k4 = kind[4:]
2395 col_name = f"UInt{k4}Col"
2396 elif kind.startswith("period"):
2397 # we store as integer
2398 col_name = "Int64Col"
2399 else:
2400 kcap = kind.capitalize()
2401 col_name = f"{kcap}Col"
2402
2403 return getattr(_tables(), col_name)
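
    # Editor's note: a small illustrative trace (derived from the method above, not an
    # official mapping) of the kind -> PyTables column class naming scheme, assuming
    # ``tables`` is importable.
    #
    #     import tables
    #
    #     tables.UInt32Col    # what kind="uint32" resolves to
    #     tables.Float64Col   # what kind="float64" resolves to
    #     tables.Int64Col     # what kind="period[...]" resolves to (stored as i8)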
2404
2405 @classmethod
2406 def get_atom_data(cls, shape, kind: str) -> Col:
2407 return cls.get_atom_coltype(kind=kind)(shape=shape[0])
2408
2409 @classmethod
2410 def get_atom_datetime64(cls, shape):
2411 return _tables().Int64Col(shape=shape[0])
2412
2413 @classmethod
2414 def get_atom_timedelta64(cls, shape):
2415 return _tables().Int64Col(shape=shape[0])
2416
2417 @property
2418 def shape(self):
2419 return getattr(self.data, "shape", None)
2420
2421 @property
2422 def cvalues(self):
2423 """return my cython values"""
2424 return self.data
2425
2426 def validate_attr(self, append) -> None:
2427 """validate that we have the same order as the existing & same dtype"""
2428 if append:
2429 existing_fields = getattr(self.attrs, self.kind_attr, None)
2430 if existing_fields is not None and existing_fields != list(self.values):
2431 raise ValueError("appended items do not match existing items in table!")
2432
2433 existing_dtype = getattr(self.attrs, self.dtype_attr, None)
2434 if existing_dtype is not None and existing_dtype != self.dtype:
2435 raise ValueError(
2436                    "appended items dtype does not match existing items dtype in table!"
2437 )
2438
2439 def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
2440 """
2441 Convert the data from this selection to the appropriate pandas type.
2442
2443 Parameters
2444 ----------
2445 values : np.ndarray
2446        nan_rep : the string used to represent NaN in string columns
2447 encoding : str
2448 errors : str
2449
2450 Returns
2451 -------
2452 index : listlike to become an Index
2453 data : ndarraylike to become a column
2454 """
2455 assert isinstance(values, np.ndarray), type(values)
2456
2457 # values is a recarray
2458 if values.dtype.fields is not None:
2459 values = values[self.cname]
2460
2461 assert self.typ is not None
2462 if self.dtype is None:
2463 # Note: in tests we never have timedelta64 or datetime64,
2464 # so the _get_data_and_dtype_name may be unnecessary
2465 converted, dtype_name = _get_data_and_dtype_name(values)
2466 kind = _dtype_to_kind(dtype_name)
2467 else:
2468 converted = values
2469 dtype_name = self.dtype
2470 kind = self.kind
2471
2472 assert isinstance(converted, np.ndarray) # for mypy
2473
2474 # use the meta if needed
2475 meta = _ensure_decoded(self.meta)
2476 metadata = self.metadata
2477 ordered = self.ordered
2478 tz = self.tz
2479
2480 assert dtype_name is not None
2481 # convert to the correct dtype
2482 dtype = _ensure_decoded(dtype_name)
2483
2484 # reverse converts
2485 if dtype == "datetime64":
2486 # recreate with tz if indicated
2487 converted = _set_tz(converted, tz, coerce=True)
2488
2489 elif dtype == "timedelta64":
2490 converted = np.asarray(converted, dtype="m8[ns]")
2491 elif dtype == "date":
2492 try:
2493 converted = np.asarray(
2494 [date.fromordinal(v) for v in converted], dtype=object
2495 )
2496 except ValueError:
2497 converted = np.asarray(
2498 [date.fromtimestamp(v) for v in converted], dtype=object
2499 )
2500
2501 elif meta == "category":
2502 # we have a categorical
2503 categories = metadata
2504 codes = converted.ravel()
2505
2506 # if we have stored a NaN in the categories
2507 # then strip it; in theory we could have BOTH
2508 # -1s in the codes and nulls :<
2509 if categories is None:
2510 # Handle case of NaN-only categorical columns in which case
2511 # the categories are an empty array; when this is stored,
2512 # pytables cannot write a zero-len array, so on readback
2513 # the categories would be None and `read_hdf()` would fail.
2514 categories = Index([], dtype=np.float64)
2515 else:
2516 mask = isna(categories)
2517 if mask.any():
2518 categories = categories[~mask]
2519 codes[codes != -1] -= mask.astype(int).cumsum()._values
2520
2521 converted = Categorical.from_codes(
2522 codes, categories=categories, ordered=ordered
2523 )
2524
2525 else:
2526 try:
2527 converted = converted.astype(dtype, copy=False)
2528 except TypeError:
2529 converted = converted.astype("O", copy=False)
2530
2531 # convert nans / decode
2532 if _ensure_decoded(kind) == "string":
2533 converted = _unconvert_string_array(
2534 converted, nan_rep=nan_rep, encoding=encoding, errors=errors
2535 )
2536
2537 return self.values, converted
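
    # Editor's note: the NaN-only-categories branch above can be exercised from the
    # public API; a sketch under assumed, hypothetical names is:
    #
    #     import numpy as np
    #     import pandas as pd
    #
    #     df = pd.DataFrame({"A": pd.Categorical([np.nan, np.nan])})
    #     df.to_hdf("example.h5", key="df", format="table")
    #     pd.read_hdf("example.h5", "df")   # round-trips as an all-NaN categorical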
2538
2539 def set_attr(self) -> None:
2540        """set the values, meta, and dtype attributes for this column"""
2541 setattr(self.attrs, self.kind_attr, self.values)
2542 setattr(self.attrs, self.meta_attr, self.meta)
2543 assert self.dtype is not None
2544 setattr(self.attrs, self.dtype_attr, self.dtype)
2545
2546
2547class DataIndexableCol(DataCol):
2548 """represent a data column that can be indexed"""
2549
2550 is_data_indexable = True
2551
2552 def validate_names(self) -> None:
2553 if not is_object_dtype(Index(self.values)):
2554 # TODO: should the message here be more specifically non-str?
2555 raise ValueError("cannot have non-object label DataIndexableCol")
2556
2557 @classmethod
2558 def get_atom_string(cls, shape, itemsize):
2559 return _tables().StringCol(itemsize=itemsize)
2560
2561 @classmethod
2562 def get_atom_data(cls, shape, kind: str) -> Col:
2563 return cls.get_atom_coltype(kind=kind)()
2564
2565 @classmethod
2566 def get_atom_datetime64(cls, shape):
2567 return _tables().Int64Col()
2568
2569 @classmethod
2570 def get_atom_timedelta64(cls, shape):
2571 return _tables().Int64Col()
2572
2573
2574class GenericDataIndexableCol(DataIndexableCol):
2575 """represent a generic pytables data column"""
2576
2577
2578class Fixed:
2579 """
2580 represent an object in my store
2581 facilitate read/write of various types of objects
2582 this is an abstract base class
2583
2584 Parameters
2585 ----------
2586 parent : HDFStore
2587 group : Node
2588 The group node where the table resides.
2589 """
2590
2591 pandas_kind: str
2592 format_type: str = "fixed" # GH#30962 needed by dask
2593 obj_type: type[DataFrame | Series]
2594 ndim: int
2595 parent: HDFStore
2596 is_table: bool = False
2597
2598 def __init__(
2599 self,
2600 parent: HDFStore,
2601 group: Node,
2602 encoding: str | None = "UTF-8",
2603 errors: str = "strict",
2604 ) -> None:
2605 assert isinstance(parent, HDFStore), type(parent)
2606 assert _table_mod is not None # needed for mypy
2607 assert isinstance(group, _table_mod.Node), type(group)
2608 self.parent = parent
2609 self.group = group
2610 self.encoding = _ensure_encoding(encoding)
2611 self.errors = errors
2612
2613 @property
2614 def is_old_version(self) -> bool:
2615 return self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1
2616
2617 @property
2618 def version(self) -> tuple[int, int, int]:
2619        """compute our version from the stored pandas_version attribute"""
2620 version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None))
2621 try:
2622 version = tuple(int(x) for x in version.split("."))
2623 if len(version) == 2:
2624 version = version + (0,)
2625 except AttributeError:
2626 version = (0, 0, 0)
2627 return version
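
    # Editor's note: illustrative mappings (derived from the parsing above) from the
    # stored ``pandas_version`` string to the tuple returned:
    #
    #     "0.15.2"  -> (0, 15, 2)
    #     "0.10"    -> (0, 10, 0)    # two-part versions are padded with a trailing 0
    #     missing   -> (0, 0, 0)     # the AttributeError fallback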
2628
2629 @property
2630 def pandas_type(self):
2631 return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None))
2632
2633 def __repr__(self) -> str:
2634 """return a pretty representation of myself"""
2635 self.infer_axes()
2636 s = self.shape
2637 if s is not None:
2638 if isinstance(s, (list, tuple)):
2639 jshape = ",".join([pprint_thing(x) for x in s])
2640 s = f"[{jshape}]"
2641 return f"{self.pandas_type:12.12} (shape->{s})"
2642 return self.pandas_type
2643
2644 def set_object_info(self) -> None:
2645 """set my pandas type & version"""
2646 self.attrs.pandas_type = str(self.pandas_kind)
2647 self.attrs.pandas_version = str(_version)
2648
2649 def copy(self) -> Fixed:
2650 new_self = copy.copy(self)
2651 return new_self
2652
2653 @property
2654 def shape(self):
2655 return self.nrows
2656
2657 @property
2658 def pathname(self):
2659 return self.group._v_pathname
2660
2661 @property
2662 def _handle(self):
2663 return self.parent._handle
2664
2665 @property
2666 def _filters(self):
2667 return self.parent._filters
2668
2669 @property
2670 def _complevel(self) -> int:
2671 return self.parent._complevel
2672
2673 @property
2674 def _fletcher32(self) -> bool:
2675 return self.parent._fletcher32
2676
2677 @property
2678 def attrs(self):
2679 return self.group._v_attrs
2680
2681 def set_attrs(self) -> None:
2682 """set our object attributes"""
2683
2684 def get_attrs(self) -> None:
2685 """get our object attributes"""
2686
2687 @property
2688 def storable(self):
2689 """return my storable"""
2690 return self.group
2691
2692 @property
2693 def is_exists(self) -> bool:
2694 return False
2695
2696 @property
2697 def nrows(self):
2698 return getattr(self.storable, "nrows", None)
2699
2700 def validate(self, other) -> Literal[True] | None:
2701 """validate against an existing storable"""
2702 if other is None:
2703 return None
2704 return True
2705
2706 def validate_version(self, where=None) -> None:
2707 """are we trying to operate on an old version?"""
2708
2709 def infer_axes(self) -> bool:
2710 """
2711 infer the axes of my storer
2712 return a boolean indicating if we have a valid storer or not
2713 """
2714 s = self.storable
2715 if s is None:
2716 return False
2717 self.get_attrs()
2718 return True
2719
2720 def read(
2721 self,
2722 where=None,
2723 columns=None,
2724 start: int | None = None,
2725 stop: int | None = None,
2726 ):
2727 raise NotImplementedError(
2728 "cannot read on an abstract storer: subclasses should implement"
2729 )
2730
2731 def write(self, **kwargs):
2732 raise NotImplementedError(
2733 "cannot write on an abstract storer: subclasses should implement"
2734 )
2735
2736 def delete(
2737 self, where=None, start: int | None = None, stop: int | None = None
2738 ) -> None:
2739 """
2740        support fully deleting the node in its entirety (only); the where
2741        specification must be None
2742 """
2743 if com.all_none(where, start, stop):
2744 self._handle.remove_node(self.group, recursive=True)
2745 return None
2746
2747 raise TypeError("cannot delete on an abstract storer")
2748
2749
2750class GenericFixed(Fixed):
2751 """a generified fixed version"""
2752
2753 _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"}
2754 _reverse_index_map = {v: k for k, v in _index_type_map.items()}
2755 attributes: list[str] = []
2756
2757 # indexer helpers
2758 def _class_to_alias(self, cls) -> str:
2759 return self._index_type_map.get(cls, "")
2760
2761 def _alias_to_class(self, alias):
2762 if isinstance(alias, type): # pragma: no cover
2763 # compat: for a short period of time master stored types
2764 return alias
2765 return self._reverse_index_map.get(alias, Index)
2766
2767 def _get_index_factory(self, attrs):
2768 index_class = self._alias_to_class(
2769 _ensure_decoded(getattr(attrs, "index_class", ""))
2770 )
2771
2772 factory: Callable
2773
2774 if index_class == DatetimeIndex:
2775
2776 def f(values, freq=None, tz=None):
2777 # data are already in UTC, localize and convert if tz present
2778 dta = DatetimeArray._simple_new(values.values, freq=freq)
2779 result = DatetimeIndex._simple_new(dta, name=None)
2780 if tz is not None:
2781 result = result.tz_localize("UTC").tz_convert(tz)
2782 return result
2783
2784 factory = f
2785 elif index_class == PeriodIndex:
2786
2787 def f(values, freq=None, tz=None):
2788 parr = PeriodArray._simple_new(values, freq=freq)
2789 return PeriodIndex._simple_new(parr, name=None)
2790
2791 factory = f
2792 else:
2793 factory = index_class
2794
2795 kwargs = {}
2796 if "freq" in attrs:
2797 kwargs["freq"] = attrs["freq"]
2798 if index_class is Index:
2799 # DTI/PI would be gotten by _alias_to_class
2800 factory = TimedeltaIndex
2801
2802 if "tz" in attrs:
2803 if isinstance(attrs["tz"], bytes):
2804 # created by python2
2805 kwargs["tz"] = attrs["tz"].decode("utf-8")
2806 else:
2807 # created by python3
2808 kwargs["tz"] = attrs["tz"]
2809 assert index_class is DatetimeIndex # just checking
2810
2811 return factory, kwargs
2812
2813 def validate_read(self, columns, where) -> None:
2814 """
2815        raise if any keywords are passed which are not None
2816 """
2817 if columns is not None:
2818 raise TypeError(
2819 "cannot pass a column specification when reading "
2820 "a Fixed format store. this store must be selected in its entirety"
2821 )
2822 if where is not None:
2823 raise TypeError(
2824 "cannot pass a where specification when reading "
2825 "from a Fixed format store. this store must be selected in its entirety"
2826 )
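
    # Editor's note: a behavioural sketch (assumed public-API usage, hypothetical
    # file/key names) of the restriction enforced above for fixed-format stores.
    #
    #     import pandas as pd
    #
    #     df = pd.DataFrame({"A": range(5)})
    #     df.to_hdf("example.h5", key="fixed_df", format="fixed")
    #     pd.read_hdf("example.h5", "fixed_df")                      # ok: whole object
    #     pd.read_hdf("example.h5", "fixed_df", where="index > 2")   # raises TypeError
    #     df.to_hdf("example.h5", key="table_df", format="table")    # use this to query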
2827
2828 @property
2829 def is_exists(self) -> bool:
2830 return True
2831
2832 def set_attrs(self) -> None:
2833 """set our object attributes"""
2834 self.attrs.encoding = self.encoding
2835 self.attrs.errors = self.errors
2836
2837 def get_attrs(self) -> None:
2838 """retrieve our attributes"""
2839 self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
2840 self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
2841 for n in self.attributes:
2842 setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None)))
2843
2844 # error: Signature of "write" incompatible with supertype "Fixed"
2845 def write(self, obj, **kwargs) -> None: # type: ignore[override]
2846 self.set_attrs()
2847
2848 def read_array(self, key: str, start: int | None = None, stop: int | None = None):
2849        """read an array for the specified node (off of the group)"""
2850 import tables
2851
2852 node = getattr(self.group, key)
2853 attrs = node._v_attrs
2854
2855 transposed = getattr(attrs, "transposed", False)
2856
2857 if isinstance(node, tables.VLArray):
2858 ret = node[0][start:stop]
2859 else:
2860 dtype = _ensure_decoded(getattr(attrs, "value_type", None))
2861 shape = getattr(attrs, "shape", None)
2862
2863 if shape is not None:
2864 # length 0 axis
2865 ret = np.empty(shape, dtype=dtype)
2866 else:
2867 ret = node[start:stop]
2868
2869 if dtype == "datetime64":
2870 # reconstruct a timezone if indicated
2871 tz = getattr(attrs, "tz", None)
2872 ret = _set_tz(ret, tz, coerce=True)
2873
2874 elif dtype == "timedelta64":
2875 ret = np.asarray(ret, dtype="m8[ns]")
2876
2877 if transposed:
2878 return ret.T
2879 else:
2880 return ret
2881
2882 def read_index(
2883 self, key: str, start: int | None = None, stop: int | None = None
2884 ) -> Index:
2885 variety = _ensure_decoded(getattr(self.attrs, f"{key}_variety"))
2886
2887 if variety == "multi":
2888 return self.read_multi_index(key, start=start, stop=stop)
2889 elif variety == "regular":
2890 node = getattr(self.group, key)
2891 index = self.read_index_node(node, start=start, stop=stop)
2892 return index
2893 else: # pragma: no cover
2894 raise TypeError(f"unrecognized index variety: {variety}")
2895
2896 def write_index(self, key: str, index: Index) -> None:
2897 if isinstance(index, MultiIndex):
2898 setattr(self.attrs, f"{key}_variety", "multi")
2899 self.write_multi_index(key, index)
2900 else:
2901 setattr(self.attrs, f"{key}_variety", "regular")
2902 converted = _convert_index("index", index, self.encoding, self.errors)
2903
2904 self.write_array(key, converted.values)
2905
2906 node = getattr(self.group, key)
2907 node._v_attrs.kind = converted.kind
2908 node._v_attrs.name = index.name
2909
2910 if isinstance(index, (DatetimeIndex, PeriodIndex)):
2911 node._v_attrs.index_class = self._class_to_alias(type(index))
2912
2913 if isinstance(index, (DatetimeIndex, PeriodIndex, TimedeltaIndex)):
2914 node._v_attrs.freq = index.freq
2915
2916 if isinstance(index, DatetimeIndex) and index.tz is not None:
2917 node._v_attrs.tz = _get_tz(index.tz)
2918
2919 def write_multi_index(self, key: str, index: MultiIndex) -> None:
2920 setattr(self.attrs, f"{key}_nlevels", index.nlevels)
2921
2922 for i, (lev, level_codes, name) in enumerate(
2923 zip(index.levels, index.codes, index.names)
2924 ):
2925 # write the level
2926 if is_extension_array_dtype(lev):
2927 raise NotImplementedError(
2928 "Saving a MultiIndex with an extension dtype is not supported."
2929 )
2930 level_key = f"{key}_level{i}"
2931 conv_level = _convert_index(level_key, lev, self.encoding, self.errors)
2932 self.write_array(level_key, conv_level.values)
2933 node = getattr(self.group, level_key)
2934 node._v_attrs.kind = conv_level.kind
2935 node._v_attrs.name = name
2936
2937 # write the name
2938 setattr(node._v_attrs, f"{key}_name{name}", name)
2939
2940 # write the labels
2941 label_key = f"{key}_label{i}"
2942 self.write_array(label_key, level_codes)
2943
2944 def read_multi_index(
2945 self, key: str, start: int | None = None, stop: int | None = None
2946 ) -> MultiIndex:
2947 nlevels = getattr(self.attrs, f"{key}_nlevels")
2948
2949 levels = []
2950 codes = []
2951 names: list[Hashable] = []
2952 for i in range(nlevels):
2953 level_key = f"{key}_level{i}"
2954 node = getattr(self.group, level_key)
2955 lev = self.read_index_node(node, start=start, stop=stop)
2956 levels.append(lev)
2957 names.append(lev.name)
2958
2959 label_key = f"{key}_label{i}"
2960 level_codes = self.read_array(label_key, start=start, stop=stop)
2961 codes.append(level_codes)
2962
2963 return MultiIndex(
2964 levels=levels, codes=codes, names=names, verify_integrity=True
2965 )
2966
2967 def read_index_node(
2968 self, node: Node, start: int | None = None, stop: int | None = None
2969 ) -> Index:
2970 data = node[start:stop]
2971 # If the index was an empty array write_array_empty() will
2972 # have written a sentinel. Here we replace it with the original.
2973 if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0:
2974 data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type)
2975 kind = _ensure_decoded(node._v_attrs.kind)
2976 name = None
2977
2978 if "name" in node._v_attrs:
2979 name = _ensure_str(node._v_attrs.name)
2980 name = _ensure_decoded(name)
2981
2982 attrs = node._v_attrs
2983 factory, kwargs = self._get_index_factory(attrs)
2984
2985 if kind in ("date", "object"):
2986 index = factory(
2987 _unconvert_index(
2988 data, kind, encoding=self.encoding, errors=self.errors
2989 ),
2990 dtype=object,
2991 **kwargs,
2992 )
2993 else:
2994 index = factory(
2995 _unconvert_index(
2996 data, kind, encoding=self.encoding, errors=self.errors
2997 ),
2998 **kwargs,
2999 )
3000
3001 index.name = name
3002
3003 return index
3004
3005 def write_array_empty(self, key: str, value: ArrayLike) -> None:
3006 """write a 0-len array"""
3007 # ugly hack for length 0 axes
3008 arr = np.empty((1,) * value.ndim)
3009 self._handle.create_array(self.group, key, arr)
3010 node = getattr(self.group, key)
3011 node._v_attrs.value_type = str(value.dtype)
3012 node._v_attrs.shape = value.shape
3013
3014 def write_array(
3015 self, key: str, obj: AnyArrayLike, items: Index | None = None
3016 ) -> None:
3017 # TODO: we only have a few tests that get here, the only EA
3018 # that gets passed is DatetimeArray, and we never have
3019 # both self._filters and EA
3020
3021 value = extract_array(obj, extract_numpy=True)
3022
3023 if key in self.group:
3024 self._handle.remove_node(self.group, key)
3025
3026 # Transform needed to interface with pytables row/col notation
3027 empty_array = value.size == 0
3028 transposed = False
3029
3030 if is_categorical_dtype(value.dtype):
3031 raise NotImplementedError(
3032 "Cannot store a category dtype in a HDF5 dataset that uses format="
3033 '"fixed". Use format="table".'
3034 )
3035 if not empty_array:
3036 if hasattr(value, "T"):
3037 # ExtensionArrays (1d) may not have transpose.
3038 value = value.T
3039 transposed = True
3040
3041 atom = None
3042 if self._filters is not None:
3043 with suppress(ValueError):
3044 # get the atom for this datatype
3045 atom = _tables().Atom.from_dtype(value.dtype)
3046
3047 if atom is not None:
3048 # We only get here if self._filters is non-None and
3049 # the Atom.from_dtype call succeeded
3050
3051 # create an empty chunked array and fill it from value
3052 if not empty_array:
3053 ca = self._handle.create_carray(
3054 self.group, key, atom, value.shape, filters=self._filters
3055 )
3056 ca[:] = value
3057
3058 else:
3059 self.write_array_empty(key, value)
3060
3061 elif value.dtype.type == np.object_:
3062 # infer the type, warn if we have a non-string type here (for
3063 # performance)
3064 inferred_type = lib.infer_dtype(value, skipna=False)
3065 if empty_array:
3066 pass
3067 elif inferred_type == "string":
3068 pass
3069 else:
3070 ws = performance_doc % (inferred_type, key, items)
3071 warnings.warn(ws, PerformanceWarning, stacklevel=find_stack_level())
3072
3073 vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom())
3074 vlarr.append(value)
3075
3076 elif is_datetime64_dtype(value.dtype):
3077 self._handle.create_array(self.group, key, value.view("i8"))
3078 getattr(self.group, key)._v_attrs.value_type = "datetime64"
3079 elif is_datetime64tz_dtype(value.dtype):
3080 # store as UTC
3081 # with a zone
3082
3083 # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no
3084 # attribute "asi8"
3085 self._handle.create_array(
3086 self.group, key, value.asi8 # type: ignore[union-attr]
3087 )
3088
3089 node = getattr(self.group, key)
3090 # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no
3091 # attribute "tz"
3092 node._v_attrs.tz = _get_tz(value.tz) # type: ignore[union-attr]
3093 node._v_attrs.value_type = "datetime64"
3094 elif is_timedelta64_dtype(value.dtype):
3095 self._handle.create_array(self.group, key, value.view("i8"))
3096 getattr(self.group, key)._v_attrs.value_type = "timedelta64"
3097 elif empty_array:
3098 self.write_array_empty(key, value)
3099 else:
3100 self._handle.create_array(self.group, key, value)
3101
3102 getattr(self.group, key)._v_attrs.transposed = transposed
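
    # Editor's note: a short sketch (assumed public-API usage, hypothetical names) of
    # the category-dtype restriction raised above.
    #
    #     import pandas as pd
    #
    #     df = pd.DataFrame({"A": pd.Categorical(["x", "y", "x"])})
    #     df.to_hdf("example.h5", key="cat", format="fixed")   # NotImplementedError
    #     df.to_hdf("example.h5", key="cat", format="table")   # works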
3103
3104
3105class SeriesFixed(GenericFixed):
3106 pandas_kind = "series"
3107 attributes = ["name"]
3108
3109 name: Hashable
3110
3111 @property
3112 def shape(self):
3113 try:
3114 return (len(self.group.values),)
3115 except (TypeError, AttributeError):
3116 return None
3117
3118 def read(
3119 self,
3120 where=None,
3121 columns=None,
3122 start: int | None = None,
3123 stop: int | None = None,
3124 ) -> Series:
3125 self.validate_read(columns, where)
3126 index = self.read_index("index", start=start, stop=stop)
3127 values = self.read_array("values", start=start, stop=stop)
3128 return Series(values, index=index, name=self.name, copy=False)
3129
3130 # error: Signature of "write" incompatible with supertype "Fixed"
3131 def write(self, obj, **kwargs) -> None: # type: ignore[override]
3132 super().write(obj, **kwargs)
3133 self.write_index("index", obj.index)
3134 self.write_array("values", obj)
3135 self.attrs.name = obj.name
3136
3137
3138class BlockManagerFixed(GenericFixed):
3139 attributes = ["ndim", "nblocks"]
3140
3141 nblocks: int
3142
3143 @property
3144 def shape(self) -> Shape | None:
3145 try:
3146 ndim = self.ndim
3147
3148 # items
3149 items = 0
3150 for i in range(self.nblocks):
3151 node = getattr(self.group, f"block{i}_items")
3152 shape = getattr(node, "shape", None)
3153 if shape is not None:
3154 items += shape[0]
3155
3156 # data shape
3157 node = self.group.block0_values
3158 shape = getattr(node, "shape", None)
3159 if shape is not None:
3160 shape = list(shape[0 : (ndim - 1)])
3161 else:
3162 shape = []
3163
3164 shape.append(items)
3165
3166 return shape
3167 except AttributeError:
3168 return None
3169
3170 def read(
3171 self,
3172 where=None,
3173 columns=None,
3174 start: int | None = None,
3175 stop: int | None = None,
3176 ) -> DataFrame:
3177 # start, stop applied to rows, so 0th axis only
3178 self.validate_read(columns, where)
3179 select_axis = self.obj_type()._get_block_manager_axis(0)
3180
3181 axes = []
3182 for i in range(self.ndim):
3183 _start, _stop = (start, stop) if i == select_axis else (None, None)
3184 ax = self.read_index(f"axis{i}", start=_start, stop=_stop)
3185 axes.append(ax)
3186
3187 items = axes[0]
3188 dfs = []
3189
3190 for i in range(self.nblocks):
3191 blk_items = self.read_index(f"block{i}_items")
3192 values = self.read_array(f"block{i}_values", start=_start, stop=_stop)
3193
3194 columns = items[items.get_indexer(blk_items)]
3195 df = DataFrame(values.T, columns=columns, index=axes[1], copy=False)
3196 dfs.append(df)
3197
3198 if len(dfs) > 0:
3199 out = concat(dfs, axis=1, copy=True)
3200 out = out.reindex(columns=items, copy=False)
3201 return out
3202
3203 return DataFrame(columns=axes[0], index=axes[1])
3204
3205 # error: Signature of "write" incompatible with supertype "Fixed"
3206 def write(self, obj, **kwargs) -> None: # type: ignore[override]
3207 super().write(obj, **kwargs)
3208
3209 # TODO(ArrayManager) HDFStore relies on accessing the blocks
3210 if isinstance(obj._mgr, ArrayManager):
3211 obj = obj._as_manager("block")
3212
3213 data = obj._mgr
3214 if not data.is_consolidated():
3215 data = data.consolidate()
3216
3217 self.attrs.ndim = data.ndim
3218 for i, ax in enumerate(data.axes):
3219 if i == 0 and (not ax.is_unique):
3220 raise ValueError("Columns index has to be unique for fixed format")
3221 self.write_index(f"axis{i}", ax)
3222
3223 # Supporting mixed-type DataFrame objects...nontrivial
3224 self.attrs.nblocks = len(data.blocks)
3225 for i, blk in enumerate(data.blocks):
3226 # I have no idea why, but writing values before items fixed #2299
3227 blk_items = data.items.take(blk.mgr_locs)
3228 self.write_array(f"block{i}_values", blk.values, items=blk_items)
3229 self.write_index(f"block{i}_items", blk_items)
3230
3231
3232class FrameFixed(BlockManagerFixed):
3233 pandas_kind = "frame"
3234 obj_type = DataFrame
3235
3236
3237class Table(Fixed):
3238 """
3239 represent a table:
3240 facilitate read/write of various types of tables
3241
3242 Attrs in Table Node
3243 -------------------
3244    These are attributes that are stored in the main table node; they are
3245 necessary to recreate these tables when read back in.
3246
3247 index_axes : a list of tuples of the (original indexing axis and
3248 index column)
3249 non_index_axes: a list of tuples of the (original index axis and
3250 columns on a non-indexing axis)
3251 values_axes : a list of the columns which comprise the data of this
3252 table
3253 data_columns : a list of the columns that we are allowing indexing
3254 (these become single columns in values_axes)
3255 nan_rep : the string to use for nan representations for string
3256 objects
3257 levels : the names of levels
3258 metadata : the names of the metadata columns
3259 """
3260
3261 pandas_kind = "wide_table"
3262 format_type: str = "table" # GH#30962 needed by dask
3263 table_type: str
3264 levels: int | list[Hashable] = 1
3265 is_table = True
3266
3267 metadata: list
3268
3269 def __init__(
3270 self,
3271 parent: HDFStore,
3272 group: Node,
3273 encoding: str | None = None,
3274 errors: str = "strict",
3275 index_axes: list[IndexCol] | None = None,
3276 non_index_axes: list[tuple[AxisInt, Any]] | None = None,
3277 values_axes: list[DataCol] | None = None,
3278 data_columns: list | None = None,
3279 info: dict | None = None,
3280 nan_rep=None,
3281 ) -> None:
3282 super().__init__(parent, group, encoding=encoding, errors=errors)
3283 self.index_axes = index_axes or []
3284 self.non_index_axes = non_index_axes or []
3285 self.values_axes = values_axes or []
3286 self.data_columns = data_columns or []
3287 self.info = info or {}
3288 self.nan_rep = nan_rep
3289
3290 @property
3291 def table_type_short(self) -> str:
3292 return self.table_type.split("_")[0]
3293
3294 def __repr__(self) -> str:
3295 """return a pretty representation of myself"""
3296 self.infer_axes()
3297 jdc = ",".join(self.data_columns) if len(self.data_columns) else ""
3298 dc = f",dc->[{jdc}]"
3299
3300 ver = ""
3301 if self.is_old_version:
3302 jver = ".".join([str(x) for x in self.version])
3303 ver = f"[{jver}]"
3304
3305 jindex_axes = ",".join([a.name for a in self.index_axes])
3306 return (
3307 f"{self.pandas_type:12.12}{ver} "
3308 f"(typ->{self.table_type_short},nrows->{self.nrows},"
3309 f"ncols->{self.ncols},indexers->[{jindex_axes}]{dc})"
3310 )
3311
3312 def __getitem__(self, c: str):
3313 """return the axis for c"""
3314 for a in self.axes:
3315 if c == a.name:
3316 return a
3317 return None
3318
3319 def validate(self, other) -> None:
3320 """validate against an existing table"""
3321 if other is None:
3322 return
3323
3324 if other.table_type != self.table_type:
3325 raise TypeError(
3326 "incompatible table_type with existing "
3327 f"[{other.table_type} - {self.table_type}]"
3328 )
3329
3330 for c in ["index_axes", "non_index_axes", "values_axes"]:
3331 sv = getattr(self, c, None)
3332 ov = getattr(other, c, None)
3333 if sv != ov:
3334 # show the error for the specific axes
3335 # Argument 1 to "enumerate" has incompatible type
3336 # "Optional[Any]"; expected "Iterable[Any]" [arg-type]
3337 for i, sax in enumerate(sv): # type: ignore[arg-type]
3338 # Value of type "Optional[Any]" is not indexable [index]
3339 oax = ov[i] # type: ignore[index]
3340 if sax != oax:
3341 raise ValueError(
3342 f"invalid combination of [{c}] on appending data "
3343 f"[{sax}] vs current table [{oax}]"
3344 )
3345
3346 # should never get here
3347 raise Exception(
3348 f"invalid combination of [{c}] on appending data [{sv}] vs "
3349 f"current table [{ov}]"
3350 )
3351
3352 @property
3353 def is_multi_index(self) -> bool:
3354 """the levels attribute is 1 or a list in the case of a multi-index"""
3355 return isinstance(self.levels, list)
3356
3357 def validate_multiindex(
3358 self, obj: DataFrame | Series
3359 ) -> tuple[DataFrame, list[Hashable]]:
3360 """
3361 validate that we can store the multi-index; reset and return the
3362 new object
3363 """
3364 levels = com.fill_missing_names(obj.index.names)
3365 try:
3366 reset_obj = obj.reset_index()
3367 except ValueError as err:
3368 raise ValueError(
3369 "duplicate names/columns in the multi-index when storing as a table"
3370 ) from err
3371 assert isinstance(reset_obj, DataFrame) # for mypy
3372 return reset_obj, levels
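
    # Editor's note: an illustrative sketch (assumed public-API usage, hypothetical
    # names) of storing a MultiIndex frame as a table; the index is reset into
    # ordinary columns on write and rebuilt on read, and the levels become queryable.
    #
    #     import pandas as pd
    #
    #     idx = pd.MultiIndex.from_product(
    #         [["a", "b"], [1, 2]], names=["outer", "inner"]
    #     )
    #     df = pd.DataFrame({"v": range(4)}, index=idx)
    #     with pd.HDFStore("example.h5") as store:
    #         store.append("mi", df)
    #         back = store.select("mi", where="outer == 'a'")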
3373
3374 @property
3375 def nrows_expected(self) -> int:
3376 """based on our axes, compute the expected nrows"""
3377 return np.prod([i.cvalues.shape[0] for i in self.index_axes])
3378
3379 @property
3380 def is_exists(self) -> bool:
3381 """has this table been created"""
3382 return "table" in self.group
3383
3384 @property
3385 def storable(self):
3386 return getattr(self.group, "table", None)
3387
3388 @property
3389 def table(self):
3390 """return the table group (this is my storable)"""
3391 return self.storable
3392
3393 @property
3394 def dtype(self):
3395 return self.table.dtype
3396
3397 @property
3398 def description(self):
3399 return self.table.description
3400
3401 @property
3402 def axes(self):
3403 return itertools.chain(self.index_axes, self.values_axes)
3404
3405 @property
3406 def ncols(self) -> int:
3407        """the total number of columns in the values axes"""
3408 return sum(len(a.values) for a in self.values_axes)
3409
3410 @property
3411 def is_transposed(self) -> bool:
3412 return False
3413
3414 @property
3415 def data_orientation(self) -> tuple[int, ...]:
3416        """return a tuple of my permuted axes, non_indexable at the front"""
3417 return tuple(
3418 itertools.chain(
3419 [int(a[0]) for a in self.non_index_axes],
3420 [int(a.axis) for a in self.index_axes],
3421 )
3422 )
3423
3424 def queryables(self) -> dict[str, Any]:
3425        """return a dict of the allowable column kinds for this object"""
3426 # mypy doesn't recognize DataFrame._AXIS_NAMES, so we re-write it here
3427 axis_names = {0: "index", 1: "columns"}
3428
3429 # compute the values_axes queryables
3430 d1 = [(a.cname, a) for a in self.index_axes]
3431 d2 = [(axis_names[axis], None) for axis, values in self.non_index_axes]
3432 d3 = [
3433 (v.cname, v) for v in self.values_axes if v.name in set(self.data_columns)
3434 ]
3435
3436 return dict(d1 + d2 + d3)
3437
3438 def index_cols(self):
3439 """return a list of my index cols"""
3440 # Note: each `i.cname` below is assured to be a str.
3441 return [(i.axis, i.cname) for i in self.index_axes]
3442
3443 def values_cols(self) -> list[str]:
3444 """return a list of my values cols"""
3445 return [i.cname for i in self.values_axes]
3446
3447 def _get_metadata_path(self, key: str) -> str:
3448 """return the metadata pathname for this key"""
3449 group = self.group._v_pathname
3450 return f"{group}/meta/{key}/meta"
3451
3452 def write_metadata(self, key: str, values: np.ndarray) -> None:
3453 """
3454 Write out a metadata array to the key as a fixed-format Series.
3455
3456 Parameters
3457 ----------
3458 key : str
3459 values : ndarray
3460 """
3461 self.parent.put(
3462 self._get_metadata_path(key),
3463 Series(values, copy=False),
3464 format="table",
3465 encoding=self.encoding,
3466 errors=self.errors,
3467 nan_rep=self.nan_rep,
3468 )
3469
3470 def read_metadata(self, key: str):
3471 """return the meta data array for this key"""
3472 if getattr(getattr(self.group, "meta", None), key, None) is not None:
3473 return self.parent.select(self._get_metadata_path(key))
3474 return None
3475
3476 def set_attrs(self) -> None:
3477 """set our table type & indexables"""
3478 self.attrs.table_type = str(self.table_type)
3479 self.attrs.index_cols = self.index_cols()
3480 self.attrs.values_cols = self.values_cols()
3481 self.attrs.non_index_axes = self.non_index_axes
3482 self.attrs.data_columns = self.data_columns
3483 self.attrs.nan_rep = self.nan_rep
3484 self.attrs.encoding = self.encoding
3485 self.attrs.errors = self.errors
3486 self.attrs.levels = self.levels
3487 self.attrs.info = self.info
3488
3489 def get_attrs(self) -> None:
3490 """retrieve our attributes"""
3491 self.non_index_axes = getattr(self.attrs, "non_index_axes", None) or []
3492 self.data_columns = getattr(self.attrs, "data_columns", None) or []
3493 self.info = getattr(self.attrs, "info", None) or {}
3494 self.nan_rep = getattr(self.attrs, "nan_rep", None)
3495 self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
3496 self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
3497 self.levels: list[Hashable] = getattr(self.attrs, "levels", None) or []
3498 self.index_axes = [a for a in self.indexables if a.is_an_indexable]
3499 self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
3500
3501 def validate_version(self, where=None) -> None:
3502 """are we trying to operate on an old version?"""
3503 if where is not None:
3504 if self.is_old_version:
3505 ws = incompatibility_doc % ".".join([str(x) for x in self.version])
3506 warnings.warn(
3507 ws,
3508 IncompatibilityWarning,
3509 stacklevel=find_stack_level(),
3510 )
3511
3512 def validate_min_itemsize(self, min_itemsize) -> None:
3513 """
3514        validate that min_itemsize doesn't contain items that are not in the
3515        axes; this needs data_columns to be defined
3516 """
3517 if min_itemsize is None:
3518 return
3519 if not isinstance(min_itemsize, dict):
3520 return
3521
3522 q = self.queryables()
3523 for k in min_itemsize:
3524 # ok, apply generally
3525 if k == "values":
3526 continue
3527 if k not in q:
3528 raise ValueError(
3529 f"min_itemsize has the key [{k}] which is not an axis or "
3530 "data_column"
3531 )
3532
3533 @cache_readonly
3534 def indexables(self):
3535 """create/cache the indexables if they don't exist"""
3536 _indexables = []
3537
3538 desc = self.description
3539 table_attrs = self.table.attrs
3540
3541 # Note: each of the `name` kwargs below are str, ensured
3542 # by the definition in index_cols.
3543 # index columns
3544 for i, (axis, name) in enumerate(self.attrs.index_cols):
3545 atom = getattr(desc, name)
3546 md = self.read_metadata(name)
3547 meta = "category" if md is not None else None
3548
3549 kind_attr = f"{name}_kind"
3550 kind = getattr(table_attrs, kind_attr, None)
3551
3552 index_col = IndexCol(
3553 name=name,
3554 axis=axis,
3555 pos=i,
3556 kind=kind,
3557 typ=atom,
3558 table=self.table,
3559 meta=meta,
3560 metadata=md,
3561 )
3562 _indexables.append(index_col)
3563
3564 # values columns
3565 dc = set(self.data_columns)
3566 base_pos = len(_indexables)
3567
3568 def f(i, c):
3569 assert isinstance(c, str)
3570 klass = DataCol
3571 if c in dc:
3572 klass = DataIndexableCol
3573
3574 atom = getattr(desc, c)
3575 adj_name = _maybe_adjust_name(c, self.version)
3576
3577 # TODO: why kind_attr here?
3578 values = getattr(table_attrs, f"{adj_name}_kind", None)
3579 dtype = getattr(table_attrs, f"{adj_name}_dtype", None)
3580 # Argument 1 to "_dtype_to_kind" has incompatible type
3581 # "Optional[Any]"; expected "str" [arg-type]
3582 kind = _dtype_to_kind(dtype) # type: ignore[arg-type]
3583
3584 md = self.read_metadata(c)
3585            # TODO: figure out why these two versions of `meta` don't always match.
3586 # meta = "category" if md is not None else None
3587 meta = getattr(table_attrs, f"{adj_name}_meta", None)
3588
3589 obj = klass(
3590 name=adj_name,
3591 cname=c,
3592 values=values,
3593 kind=kind,
3594 pos=base_pos + i,
3595 typ=atom,
3596 table=self.table,
3597 meta=meta,
3598 metadata=md,
3599 dtype=dtype,
3600 )
3601 return obj
3602
3603 # Note: the definition of `values_cols` ensures that each
3604 # `c` below is a str.
3605 _indexables.extend([f(i, c) for i, c in enumerate(self.attrs.values_cols)])
3606
3607 return _indexables
3608
3609 def create_index(
3610 self, columns=None, optlevel=None, kind: str | None = None
3611 ) -> None:
3612 """
3613 Create a pytables index on the specified columns.
3614
3615 Parameters
3616 ----------
3617 columns : None, bool, or listlike[str]
3618 Indicate which columns to create an index on.
3619
3620 * False : Do not create any indexes.
3621 * True : Create indexes on all columns.
3622 * None : Create indexes on all columns.
3623 * listlike : Create indexes on the given columns.
3624
3625 optlevel : int or None, default None
3626 Optimization level, if None, pytables defaults to 6.
3627 kind : str or None, default None
3628 Kind of index, if None, pytables defaults to "medium".
3629
3630 Raises
3631 ------
3632 TypeError if trying to create an index on a complex-type column.
3633
3634 Notes
3635 -----
3636 Cannot index Time64Col or ComplexCol.
3637        PyTables must be >= 3.0.
3638 """
3639 if not self.infer_axes():
3640 return
3641 if columns is False:
3642 return
3643
3644 # index all indexables and data_columns
3645 if columns is None or columns is True:
3646 columns = [a.cname for a in self.axes if a.is_data_indexable]
3647 if not isinstance(columns, (tuple, list)):
3648 columns = [columns]
3649
3650 kw = {}
3651 if optlevel is not None:
3652 kw["optlevel"] = optlevel
3653 if kind is not None:
3654 kw["kind"] = kind
3655
3656 table = self.table
3657 for c in columns:
3658 v = getattr(table.cols, c, None)
3659 if v is not None:
3660 # remove the index if the kind/optlevel have changed
3661 if v.is_indexed:
3662 index = v.index
3663 cur_optlevel = index.optlevel
3664 cur_kind = index.kind
3665
3666 if kind is not None and cur_kind != kind:
3667 v.remove_index()
3668 else:
3669 kw["kind"] = cur_kind
3670
3671 if optlevel is not None and cur_optlevel != optlevel:
3672 v.remove_index()
3673 else:
3674 kw["optlevel"] = cur_optlevel
3675
3676 # create the index
3677 if not v.is_indexed:
3678 if v.type.startswith("complex"):
3679 raise TypeError(
3680 "Columns containing complex values can be stored but "
3681 "cannot be indexed when using table format. Either use "
3682 "fixed format, set index=False, or do not include "
3683 "the columns containing complex values to "
3684 "data_columns when initializing the table."
3685 )
3686 v.create_index(**kw)
3687 elif c in self.non_index_axes[0][1]:
3688 # GH 28156
3689 raise AttributeError(
3690 f"column {c} is not a data_column.\n"
3691 f"In order to read column {c} you must reload the dataframe \n"
3692 f"into HDFStore and include {c} with the data_columns argument."
3693 )
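
    # Editor's note: how this surfaces through the public API (an assumed usage
    # sketch with hypothetical names); deferring index creation with ``index=False``
    # and building it once at the end is a common pattern for repeated appends.
    #
    #     import pandas as pd
    #
    #     with pd.HDFStore("example.h5") as store:
    #         for chunk in chunks:   # ``chunks`` is a hypothetical iterable of frames
    #             store.append("df", chunk, data_columns=["B"], index=False)
    #         store.create_table_index("df", columns=["B"], optlevel=9, kind="full")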
3694
3695 def _read_axes(
3696 self, where, start: int | None = None, stop: int | None = None
3697 ) -> list[tuple[ArrayLike, ArrayLike]]:
3698 """
3699 Create the axes sniffed from the table.
3700
3701 Parameters
3702 ----------
3703 where : ???
3704 start : int or None, default None
3705 stop : int or None, default None
3706
3707 Returns
3708 -------
3709 List[Tuple[index_values, column_values]]
3710 """
3711 # create the selection
3712 selection = Selection(self, where=where, start=start, stop=stop)
3713 values = selection.select()
3714
3715 results = []
3716 # convert the data
3717 for a in self.axes:
3718 a.set_info(self.info)
3719 res = a.convert(
3720 values,
3721 nan_rep=self.nan_rep,
3722 encoding=self.encoding,
3723 errors=self.errors,
3724 )
3725 results.append(res)
3726
3727 return results
3728
3729 @classmethod
3730 def get_object(cls, obj, transposed: bool):
3731 """return the data for this obj"""
3732 return obj
3733
3734 def validate_data_columns(self, data_columns, min_itemsize, non_index_axes):
3735 """
3736        take the input data_columns and min_itemsize and create a data
3737 columns spec
3738 """
3739 if not len(non_index_axes):
3740 return []
3741
3742 axis, axis_labels = non_index_axes[0]
3743 info = self.info.get(axis, {})
3744 if info.get("type") == "MultiIndex" and data_columns:
3745 raise ValueError(
3746 f"cannot use a multi-index on axis [{axis}] with "
3747 f"data_columns {data_columns}"
3748 )
3749
3750 # evaluate the passed data_columns, True == use all columns
3751 # take only valid axis labels
3752 if data_columns is True:
3753 data_columns = list(axis_labels)
3754 elif data_columns is None:
3755 data_columns = []
3756
3757 # if min_itemsize is a dict, add the keys (exclude 'values')
3758 if isinstance(min_itemsize, dict):
3759 existing_data_columns = set(data_columns)
3760 data_columns = list(data_columns) # ensure we do not modify
3761 data_columns.extend(
3762 [
3763 k
3764 for k in min_itemsize.keys()
3765 if k != "values" and k not in existing_data_columns
3766 ]
3767 )
3768
3769 # return valid columns in the order of our axis
3770 return [c for c in data_columns if c in axis_labels]
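
    # Editor's note: an assumed public-API sketch (hypothetical names) of the
    # data_columns spec handled above: True indexes every column, a list indexes only
    # the named columns, and dict keys of min_itemsize are added automatically.
    #
    #     import pandas as pd
    #
    #     df = pd.DataFrame({"A": range(3), "B": list("xyz")})
    #     with pd.HDFStore("example.h5") as store:
    #         store.append("all_cols", df, data_columns=True)
    #         store.append("some_cols", df, data_columns=["A"], min_itemsize={"B": 20})
    #         # both A and B are queryable on "some_cols"; B via the min_itemsize key
    #         store.select("some_cols", where="A > 1 and B == 'z'")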
3771
3772 def _create_axes(
3773 self,
3774 axes,
3775 obj: DataFrame,
3776 validate: bool = True,
3777 nan_rep=None,
3778 data_columns=None,
3779 min_itemsize=None,
3780 ):
3781 """
3782 Create and return the axes.
3783
3784 Parameters
3785 ----------
3786 axes: list or None
3787 The names or numbers of the axes to create.
3788 obj : DataFrame
3789 The object to create axes on.
3790 validate: bool, default True
3791 Whether to validate the obj against an existing object already written.
3792 nan_rep :
3793 A value to use for string column nan_rep.
3794 data_columns : List[str], True, or None, default None
3795 Specify the columns that we want to create to allow indexing on.
3796
3797 * True : Use all available columns.
3798 * None : Use no columns.
3799 * List[str] : Use the specified columns.
3800
3801 min_itemsize: Dict[str, int] or None, default None
3802 The min itemsize for a column in bytes.
3803 """
3804 if not isinstance(obj, DataFrame):
3805 group = self.group._v_name
3806 raise TypeError(
3807 f"cannot properly create the storer for: [group->{group},"
3808 f"value->{type(obj)}]"
3809 )
3810
3811 # set the default axes if needed
3812 if axes is None:
3813 axes = [0]
3814
3815 # map axes to numbers
3816 axes = [obj._get_axis_number(a) for a in axes]
3817
3818 # do we have an existing table (if so, use its axes & data_columns)
3819 if self.infer_axes():
3820 table_exists = True
3821 axes = [a.axis for a in self.index_axes]
3822 data_columns = list(self.data_columns)
3823 nan_rep = self.nan_rep
3824 # TODO: do we always have validate=True here?
3825 else:
3826 table_exists = False
3827
3828 new_info = self.info
3829
3830 assert self.ndim == 2 # with next check, we must have len(axes) == 1
3831        # currently only support indexers on ndim-1 axes
3832 if len(axes) != self.ndim - 1:
3833 raise ValueError(
3834 "currently only support ndim-1 indexers in an AppendableTable"
3835 )
3836
3837 # create according to the new data
3838 new_non_index_axes: list = []
3839
3840 # nan_representation
3841 if nan_rep is None:
3842 nan_rep = "nan"
3843
3844 # We construct the non-index-axis first, since that alters new_info
3845 idx = [x for x in [0, 1] if x not in axes][0]
3846
3847 a = obj.axes[idx]
3848 # we might be able to change the axes on the appending data if necessary
3849 append_axis = list(a)
3850 if table_exists:
3851 indexer = len(new_non_index_axes) # i.e. 0
3852 exist_axis = self.non_index_axes[indexer][1]
3853 if not array_equivalent(np.array(append_axis), np.array(exist_axis)):
3854 # ahah! -> reindex
3855 if array_equivalent(
3856 np.array(sorted(append_axis)), np.array(sorted(exist_axis))
3857 ):
3858 append_axis = exist_axis
3859
3860 # the non_index_axes info
3861 info = new_info.setdefault(idx, {})
3862 info["names"] = list(a.names)
3863 info["type"] = type(a).__name__
3864
3865 new_non_index_axes.append((idx, append_axis))
3866
3867 # Now we can construct our new index axis
3868 idx = axes[0]
3869 a = obj.axes[idx]
3870 axis_name = obj._get_axis_name(idx)
3871 new_index = _convert_index(axis_name, a, self.encoding, self.errors)
3872 new_index.axis = idx
3873
3874 # Because we are always 2D, there is only one new_index, so
3875 # we know it will have pos=0
3876 new_index.set_pos(0)
3877 new_index.update_info(new_info)
3878 new_index.maybe_set_size(min_itemsize) # check for column conflicts
3879
3880 new_index_axes = [new_index]
3881 j = len(new_index_axes) # i.e. 1
3882 assert j == 1
3883
3884 # reindex by our non_index_axes & compute data_columns
3885 assert len(new_non_index_axes) == 1
3886 for a in new_non_index_axes:
3887 obj = _reindex_axis(obj, a[0], a[1])
3888
3889 transposed = new_index.axis == 1
3890
3891 # figure out data_columns and get out blocks
3892 data_columns = self.validate_data_columns(
3893 data_columns, min_itemsize, new_non_index_axes
3894 )
3895
3896 frame = self.get_object(obj, transposed)._consolidate()
3897
3898 blocks, blk_items = self._get_blocks_and_items(
3899 frame, table_exists, new_non_index_axes, self.values_axes, data_columns
3900 )
3901
3902 # add my values
3903 vaxes = []
3904 for i, (blk, b_items) in enumerate(zip(blocks, blk_items)):
3905            # the shape of each data column is given by the indexable axes
3906 klass = DataCol
3907 name = None
3908
3909 # we have a data_column
3910 if data_columns and len(b_items) == 1 and b_items[0] in data_columns:
3911 klass = DataIndexableCol
3912 name = b_items[0]
3913 if not (name is None or isinstance(name, str)):
3914 # TODO: should the message here be more specifically non-str?
3915 raise ValueError("cannot have non-object label DataIndexableCol")
3916
3917 # make sure that we match up the existing columns
3918 # if we have an existing table
3919 existing_col: DataCol | None
3920
3921 if table_exists and validate:
3922 try:
3923 existing_col = self.values_axes[i]
3924 except (IndexError, KeyError) as err:
3925 raise ValueError(
3926 f"Incompatible appended table [{blocks}]"
3927 f"with existing table [{self.values_axes}]"
3928 ) from err
3929 else:
3930 existing_col = None
3931
3932 new_name = name or f"values_block_{i}"
3933 data_converted = _maybe_convert_for_string_atom(
3934 new_name,
3935 blk.values,
3936 existing_col=existing_col,
3937 min_itemsize=min_itemsize,
3938 nan_rep=nan_rep,
3939 encoding=self.encoding,
3940 errors=self.errors,
3941 columns=b_items,
3942 )
3943 adj_name = _maybe_adjust_name(new_name, self.version)
3944
3945 typ = klass._get_atom(data_converted)
3946 kind = _dtype_to_kind(data_converted.dtype.name)
3947 tz = None
3948 if getattr(data_converted, "tz", None) is not None:
3949 tz = _get_tz(data_converted.tz)
3950
3951 meta = metadata = ordered = None
3952 if is_categorical_dtype(data_converted.dtype):
3953 ordered = data_converted.ordered
3954 meta = "category"
3955 metadata = np.array(data_converted.categories, copy=False).ravel()
3956
3957 data, dtype_name = _get_data_and_dtype_name(data_converted)
3958
3959 col = klass(
3960 name=adj_name,
3961 cname=new_name,
3962 values=list(b_items),
3963 typ=typ,
3964 pos=j,
3965 kind=kind,
3966 tz=tz,
3967 ordered=ordered,
3968 meta=meta,
3969 metadata=metadata,
3970 dtype=dtype_name,
3971 data=data,
3972 )
3973 col.update_info(new_info)
3974
3975 vaxes.append(col)
3976
3977 j += 1
3978
3979 dcs = [col.name for col in vaxes if col.is_data_indexable]
3980
3981 new_table = type(self)(
3982 parent=self.parent,
3983 group=self.group,
3984 encoding=self.encoding,
3985 errors=self.errors,
3986 index_axes=new_index_axes,
3987 non_index_axes=new_non_index_axes,
3988 values_axes=vaxes,
3989 data_columns=dcs,
3990 info=new_info,
3991 nan_rep=nan_rep,
3992 )
3993 if hasattr(self, "levels"):
3994 # TODO: get this into constructor, only for appropriate subclass
3995 new_table.levels = self.levels
3996
3997 new_table.validate_min_itemsize(min_itemsize)
3998
3999 if validate and table_exists:
4000 new_table.validate(self)
4001
4002 return new_table
4003
4004 @staticmethod
4005 def _get_blocks_and_items(
4006 frame: DataFrame,
4007 table_exists: bool,
4008 new_non_index_axes,
4009 values_axes,
4010 data_columns,
4011 ):
4012 # Helper to clarify non-state-altering parts of _create_axes
4013
4014 # TODO(ArrayManager) HDFStore relies on accessing the blocks
4015 if isinstance(frame._mgr, ArrayManager):
4016 frame = frame._as_manager("block")
4017
4018 def get_blk_items(mgr):
4019 return [mgr.items.take(blk.mgr_locs) for blk in mgr.blocks]
4020
4021 mgr = frame._mgr
4022 mgr = cast(BlockManager, mgr)
4023 blocks: list[Block] = list(mgr.blocks)
4024 blk_items: list[Index] = get_blk_items(mgr)
4025
4026 if len(data_columns):
4027 # TODO: prove that we only get here with axis == 1?
4028 # It is the case in all extant tests, but NOT the case
4029 # outside this `if len(data_columns)` check.
4030
4031 axis, axis_labels = new_non_index_axes[0]
4032 new_labels = Index(axis_labels).difference(Index(data_columns))
4033 mgr = frame.reindex(new_labels, axis=axis)._mgr
4034 mgr = cast(BlockManager, mgr)
4035
4036 blocks = list(mgr.blocks)
4037 blk_items = get_blk_items(mgr)
4038 for c in data_columns:
4039 # This reindex would raise ValueError if we had a duplicate
4040 # index, so we can infer that (as long as axis==1) we
4041 # get a single column back, so a single block.
4042 mgr = frame.reindex([c], axis=axis)._mgr
4043 mgr = cast(BlockManager, mgr)
4044 blocks.extend(mgr.blocks)
4045 blk_items.extend(get_blk_items(mgr))
4046
4047 # reorder the blocks in the same order as the existing table if we can
4048 if table_exists:
4049 by_items = {
4050 tuple(b_items.tolist()): (b, b_items)
4051 for b, b_items in zip(blocks, blk_items)
4052 }
4053 new_blocks: list[Block] = []
4054 new_blk_items = []
4055 for ea in values_axes:
4056 items = tuple(ea.values)
4057 try:
4058 b, b_items = by_items.pop(items)
4059 new_blocks.append(b)
4060 new_blk_items.append(b_items)
4061 except (IndexError, KeyError) as err:
4062 jitems = ",".join([pprint_thing(item) for item in items])
4063 raise ValueError(
4064 f"cannot match existing table structure for [{jitems}] "
4065 "on appending data"
4066 ) from err
4067 blocks = new_blocks
4068 blk_items = new_blk_items
4069
4070 return blocks, blk_items
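    # Illustrative sketch (hypothetical frame): with columns ["A", "B", "C"],
    # data_columns=["B"], and "A"/"C" sharing a dtype, the non-data columns are
    # consolidated into one block while each data column gets its own block:
    #   blocks    ~ [block for ["A", "C"], block for ["B"]]
    #   blk_items ~ [Index(["A", "C"]), Index(["B"])]
    # On append, the blocks are then reordered to match the ordering already
    # recorded in the existing table's ``values_axes``.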
4071
4072 def process_axes(self, obj, selection: Selection, columns=None) -> DataFrame:
4073 """process axes filters"""
4074 # make a copy to avoid side effects
4075 if columns is not None:
4076 columns = list(columns)
4077
4078 # make sure to include levels if we have them
4079 if columns is not None and self.is_multi_index:
4080 assert isinstance(self.levels, list) # assured by is_multi_index
4081 for n in self.levels:
4082 if n not in columns:
4083 columns.insert(0, n)
4084
4085 # reorder by any non_index_axes & limit to the select columns
4086 for axis, labels in self.non_index_axes:
4087 obj = _reindex_axis(obj, axis, labels, columns)
4088
4089 def process_filter(field, filt, op):
4090 for axis_name in obj._AXIS_ORDERS:
4091 axis_number = obj._get_axis_number(axis_name)
4092 axis_values = obj._get_axis(axis_name)
4093 assert axis_number is not None
4094
4095 # see if the field is the name of an axis
4096 if field == axis_name:
4097 # if we have a multi-index, then need to include
4098 # the levels
4099 if self.is_multi_index:
4100 filt = filt.union(Index(self.levels))
4101
4102 takers = op(axis_values, filt)
4103 return obj.loc(axis=axis_number)[takers]
4104
                # this might be the name of a field IN an axis
4106 elif field in axis_values:
4107 # we need to filter on this dimension
4108 values = ensure_index(getattr(obj, field).values)
4109 filt = ensure_index(filt)
4110
4111 # hack until we support reversed dim flags
4112 if isinstance(obj, DataFrame):
4113 axis_number = 1 - axis_number
4114
4115 takers = op(values, filt)
4116 return obj.loc(axis=axis_number)[takers]
4117
4118 raise ValueError(f"cannot find the field [{field}] for filtering!")
4119
4120 # apply the selection filters (but keep in the same order)
4121 if selection.filter is not None:
4122 for field, op, filt in selection.filter.format():
4123 obj = process_filter(field, filt, op)
4124
4125 return obj
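    # Illustrative sketch (hedged, hypothetical query): a where clause such as
    # ``"columns=['A', 'B']"`` does not become a numexpr condition but a filter
    # term, roughly ``("columns", <op>, Index(['A', 'B']))``; ``process_filter``
    # matches the field against the object's axis names (or a data column) and
    # returns ``obj`` restricted to the matching labels.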
4126
4127 def create_description(
4128 self,
4129 complib,
4130 complevel: int | None,
4131 fletcher32: bool,
4132 expectedrows: int | None,
4133 ) -> dict[str, Any]:
4134 """create the description of the table from the axes & values"""
        # provide expectedrows if it was passed
4136 if expectedrows is None:
4137 expectedrows = max(self.nrows_expected, 10000)
4138
4139 d = {"name": "table", "expectedrows": expectedrows}
4140
4141 # description from the axes & values
4142 d["description"] = {a.cname: a.typ for a in self.axes}
4143
4144 if complib:
4145 if complevel is None:
4146 complevel = self._complevel or 9
4147 filters = _tables().Filters(
4148 complevel=complevel,
4149 complib=complib,
4150 fletcher32=fletcher32 or self._fletcher32,
4151 )
4152 d["filters"] = filters
4153 elif self._filters is not None:
4154 d["filters"] = self._filters
4155
4156 return d
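    # Illustrative sketch (hypothetical values): the options dict returned here
    # is passed straight to PyTables' ``create_table`` and looks roughly like
    #   {
    #       "name": "table",
    #       "expectedrows": 10000,
    #       "description": {"index": Int64Col(...), "values_block_0": Float64Col(...)},
    #       "filters": tables.Filters(complevel=9, complib="zlib"),  # if compression
    #   }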
4157
4158 def read_coordinates(
4159 self, where=None, start: int | None = None, stop: int | None = None
4160 ):
4161 """
4162 select coordinates (row numbers) from a table; return the
4163 coordinates object
4164 """
4165 # validate the version
4166 self.validate_version(where)
4167
4168 # infer the data kind
4169 if not self.infer_axes():
4170 return False
4171
4172 # create the selection
4173 selection = Selection(self, where=where, start=start, stop=stop)
4174 coords = selection.select_coords()
4175 if selection.filter is not None:
4176 for field, op, filt in selection.filter.format():
4177 data = self.read_column(
4178 field, start=coords.min(), stop=coords.max() + 1
4179 )
4180 coords = coords[op(data.iloc[coords - coords.min()], filt).values]
4181
4182 return Index(coords)
4183
4184 def read_column(
4185 self,
4186 column: str,
4187 where=None,
4188 start: int | None = None,
4189 stop: int | None = None,
4190 ):
4191 """
        return a single column from the table; generally only indexables
        are interesting
4194 """
4195 # validate the version
4196 self.validate_version()
4197
4198 # infer the data kind
4199 if not self.infer_axes():
4200 return False
4201
4202 if where is not None:
4203 raise TypeError("read_column does not currently accept a where clause")
4204
4205 # find the axes
4206 for a in self.axes:
4207 if column == a.name:
4208 if not a.is_data_indexable:
4209 raise ValueError(
4210 f"column [{column}] can not be extracted individually; "
4211 "it is not data indexable"
4212 )
4213
4214 # column must be an indexable or a data column
4215 c = getattr(self.table.cols, column)
4216 a.set_info(self.info)
4217 col_values = a.convert(
4218 c[start:stop],
4219 nan_rep=self.nan_rep,
4220 encoding=self.encoding,
4221 errors=self.errors,
4222 )
4223 return Series(_set_tz(col_values[1], a.tz), name=column, copy=False)
4224
4225 raise KeyError(f"column [{column}] not found in the table")
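    # Illustrative usage sketch (hedged): the public counterpart of this method
    # is ``HDFStore.select_column``, e.g.
    #   >>> store.select_column("df", "index")   # hypothetical store/key
    # which returns the stored index as a Series; asking for a column that is
    # not data-indexable raises ValueError as above.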
4226
4227
4228class WORMTable(Table):
4229 """
    a write-once read-many table: this format DOES NOT ALLOW appending to a
    table. Writing is a one-time operation; the data are stored in a format
    that allows for searching the data on disk
4233 """
4234
4235 table_type = "worm"
4236
4237 def read(
4238 self,
4239 where=None,
4240 columns=None,
4241 start: int | None = None,
4242 stop: int | None = None,
4243 ):
4244 """
4245 read the indices and the indexing array, calculate offset rows and return
4246 """
4247 raise NotImplementedError("WORMTable needs to implement read")
4248
4249 def write(self, **kwargs) -> None:
4250 """
4251 write in a format that we can search later on (but cannot append
4252 to): write out the indices and the values using _write_array
        (e.g. a CArray), then create an indexing table so that we can search
4254 """
4255 raise NotImplementedError("WORMTable needs to implement write")
4256
4257
4258class AppendableTable(Table):
4259 """support the new appendable table formats"""
4260
4261 table_type = "appendable"
4262
4263 # error: Signature of "write" incompatible with supertype "Fixed"
4264 def write( # type: ignore[override]
4265 self,
4266 obj,
4267 axes=None,
4268 append: bool = False,
4269 complib=None,
4270 complevel=None,
4271 fletcher32=None,
4272 min_itemsize=None,
4273 chunksize=None,
4274 expectedrows=None,
4275 dropna: bool = False,
4276 nan_rep=None,
4277 data_columns=None,
4278 track_times: bool = True,
4279 ) -> None:
4280 if not append and self.is_exists:
4281 self._handle.remove_node(self.group, "table")
4282
4283 # create the axes
4284 table = self._create_axes(
4285 axes=axes,
4286 obj=obj,
4287 validate=append,
4288 min_itemsize=min_itemsize,
4289 nan_rep=nan_rep,
4290 data_columns=data_columns,
4291 )
4292
4293 for a in table.axes:
4294 a.validate_names()
4295
4296 if not table.is_exists:
4297 # create the table
4298 options = table.create_description(
4299 complib=complib,
4300 complevel=complevel,
4301 fletcher32=fletcher32,
4302 expectedrows=expectedrows,
4303 )
4304
4305 # set the table attributes
4306 table.set_attrs()
4307
4308 options["track_times"] = track_times
4309
4310 # create the table
4311 table._handle.create_table(table.group, **options)
4312
4313 # update my info
4314 table.attrs.info = table.info
4315
4316 # validate the axes and set the kinds
4317 for a in table.axes:
4318 a.validate_and_set(table, append)
4319
4320 # add the rows
4321 table.write_data(chunksize, dropna=dropna)
4322
4323 def write_data(self, chunksize: int | None, dropna: bool = False) -> None:
4324 """
        form the data into a 2-d table including indexes, values and mask,
        then write it out chunk-by-chunk
4326 """
4327 names = self.dtype.names
4328 nrows = self.nrows_expected
4329
4330 # if dropna==True, then drop ALL nan rows
4331 masks = []
4332 if dropna:
4333 for a in self.values_axes:
4334 # figure the mask: only do if we can successfully process this
4335 # column, otherwise ignore the mask
4336 mask = isna(a.data).all(axis=0)
4337 if isinstance(mask, np.ndarray):
4338 masks.append(mask.astype("u1", copy=False))
4339
4340 # consolidate masks
4341 if len(masks):
4342 mask = masks[0]
4343 for m in masks[1:]:
4344 mask = mask & m
4345 mask = mask.ravel()
4346 else:
4347 mask = None
4348
4349 # broadcast the indexes if needed
4350 indexes = [a.cvalues for a in self.index_axes]
4351 nindexes = len(indexes)
        assert nindexes == 1, nindexes  # ensures we don't need to broadcast
4353
4354 # transpose the values so first dimension is last
4355 # reshape the values if needed
4356 values = [a.take_data() for a in self.values_axes]
4357 values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) for v in values]
4358 bvalues = []
4359 for i, v in enumerate(values):
4360 new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape
4361 bvalues.append(v.reshape(new_shape))
4362
4363 # write the chunks
4364 if chunksize is None:
4365 chunksize = 100000
4366
4367 rows = np.empty(min(chunksize, nrows), dtype=self.dtype)
4368 chunks = nrows // chunksize + 1
4369 for i in range(chunks):
4370 start_i = i * chunksize
4371 end_i = min((i + 1) * chunksize, nrows)
4372 if start_i >= end_i:
4373 break
4374
4375 self.write_data_chunk(
4376 rows,
4377 indexes=[a[start_i:end_i] for a in indexes],
4378 mask=mask[start_i:end_i] if mask is not None else None,
4379 values=[v[start_i:end_i] for v in bvalues],
4380 )
4381
4382 def write_data_chunk(
4383 self,
4384 rows: np.ndarray,
4385 indexes: list[np.ndarray],
4386 mask: npt.NDArray[np.bool_] | None,
4387 values: list[np.ndarray],
4388 ) -> None:
4389 """
4390 Parameters
4391 ----------
4392 rows : an empty memory space where we are putting the chunk
4393 indexes : an array of the indexes
4394 mask : an array of the masks
4395 values : an array of the values
4396 """
4397 # 0 len
4398 for v in values:
4399 if not np.prod(v.shape):
4400 return
4401
4402 nrows = indexes[0].shape[0]
4403 if nrows != len(rows):
4404 rows = np.empty(nrows, dtype=self.dtype)
4405 names = self.dtype.names
4406 nindexes = len(indexes)
4407
4408 # indexes
4409 for i, idx in enumerate(indexes):
4410 rows[names[i]] = idx
4411
4412 # values
4413 for i, v in enumerate(values):
4414 rows[names[i + nindexes]] = v
4415
4416 # mask
4417 if mask is not None:
4418 m = ~mask.ravel().astype(bool, copy=False)
4419 if not m.all():
4420 rows = rows[m]
4421
4422 if len(rows):
4423 self.table.append(rows)
4424 self.table.flush()
4425
4426 def delete(self, where=None, start: int | None = None, stop: int | None = None):
4427 # delete all rows (and return the nrows)
4428 if where is None or not len(where):
4429 if start is None and stop is None:
4430 nrows = self.nrows
4431 self._handle.remove_node(self.group, recursive=True)
4432 else:
4433 # pytables<3.0 would remove a single row with stop=None
4434 if stop is None:
4435 stop = self.nrows
4436 nrows = self.table.remove_rows(start=start, stop=stop)
4437 self.table.flush()
4438 return nrows
4439
4440 # infer the data kind
4441 if not self.infer_axes():
4442 return None
4443
4444 # create the selection
4445 table = self.table
4446 selection = Selection(self, where, start=start, stop=stop)
4447 values = selection.select_coords()
4448
4449 # delete the rows in reverse order
4450 sorted_series = Series(values, copy=False).sort_values()
4451 ln = len(sorted_series)
4452
4453 if ln:
4454 # construct groups of consecutive rows
4455 diff = sorted_series.diff()
4456 groups = list(diff[diff > 1].index)
4457
4458 # 1 group
4459 if not len(groups):
4460 groups = [0]
4461
4462 # final element
4463 if groups[-1] != ln:
4464 groups.append(ln)
4465
4466 # initial element
4467 if groups[0] != 0:
4468 groups.insert(0, 0)
4469
4470 # we must remove in reverse order!
4471 pg = groups.pop()
4472 for g in reversed(groups):
4473 rows = sorted_series.take(range(g, pg))
4474 table.remove_rows(
4475 start=rows[rows.index[0]], stop=rows[rows.index[-1]] + 1
4476 )
4477 pg = g
4478
4479 self.table.flush()
4480
4481 # return the number of rows removed
4482 return ln
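    # Illustrative sketch (assuming the selected coordinates are already in
    # row order): for coordinates [2, 3, 4, 10, 11] the diff-based grouping
    # yields the consecutive runs [2, 3, 4] and [10, 11]; rows 10-11 are
    # removed first and rows 2-4 second, so earlier removals cannot shift the
    # row numbers of runs that are still pending.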
4483
4484
4485class AppendableFrameTable(AppendableTable):
4486 """support the new appendable table formats"""
4487
4488 pandas_kind = "frame_table"
4489 table_type = "appendable_frame"
4490 ndim = 2
4491 obj_type: type[DataFrame | Series] = DataFrame
4492
4493 @property
4494 def is_transposed(self) -> bool:
4495 return self.index_axes[0].axis == 1
4496
4497 @classmethod
4498 def get_object(cls, obj, transposed: bool):
4499 """these are written transposed"""
4500 if transposed:
4501 obj = obj.T
4502 return obj
4503
4504 def read(
4505 self,
4506 where=None,
4507 columns=None,
4508 start: int | None = None,
4509 stop: int | None = None,
4510 ):
4511 # validate the version
4512 self.validate_version(where)
4513
4514 # infer the data kind
4515 if not self.infer_axes():
4516 return None
4517
4518 result = self._read_axes(where=where, start=start, stop=stop)
4519
4520 info = (
4521 self.info.get(self.non_index_axes[0][0], {})
4522 if len(self.non_index_axes)
4523 else {}
4524 )
4525
4526 inds = [i for i, ax in enumerate(self.axes) if ax is self.index_axes[0]]
4527 assert len(inds) == 1
4528 ind = inds[0]
4529
4530 index = result[ind][0]
4531
4532 frames = []
4533 for i, a in enumerate(self.axes):
4534 if a not in self.values_axes:
4535 continue
4536 index_vals, cvalues = result[i]
4537
4538 # we could have a multi-index constructor here
            # ensure_index doesn't recognize our list-of-tuples here
4540 if info.get("type") != "MultiIndex":
4541 cols = Index(index_vals)
4542 else:
4543 cols = MultiIndex.from_tuples(index_vals)
4544
4545 names = info.get("names")
4546 if names is not None:
4547 cols.set_names(names, inplace=True)
4548
4549 if self.is_transposed:
4550 values = cvalues
4551 index_ = cols
4552 cols_ = Index(index, name=getattr(index, "name", None))
4553 else:
4554 values = cvalues.T
4555 index_ = Index(index, name=getattr(index, "name", None))
4556 cols_ = cols
4557
4558 # if we have a DataIndexableCol, its shape will only be 1 dim
4559 if values.ndim == 1 and isinstance(values, np.ndarray):
4560 values = values.reshape((1, values.shape[0]))
4561
4562 if isinstance(values, np.ndarray):
4563 df = DataFrame(values.T, columns=cols_, index=index_, copy=False)
4564 elif isinstance(values, Index):
4565 df = DataFrame(values, columns=cols_, index=index_)
4566 else:
4567 # Categorical
4568 df = DataFrame._from_arrays([values], columns=cols_, index=index_)
4569 assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
4570 frames.append(df)
4571
4572 if len(frames) == 1:
4573 df = frames[0]
4574 else:
4575 df = concat(frames, axis=1)
4576
4577 selection = Selection(self, where=where, start=start, stop=stop)
4578 # apply the selection filters & axis orderings
4579 df = self.process_axes(df, selection=selection, columns=columns)
4580
4581 return df
4582
4583
4584class AppendableSeriesTable(AppendableFrameTable):
4585 """support the new appendable table formats"""
4586
4587 pandas_kind = "series_table"
4588 table_type = "appendable_series"
4589 ndim = 2
4590 obj_type = Series
4591
4592 @property
4593 def is_transposed(self) -> bool:
4594 return False
4595
4596 @classmethod
4597 def get_object(cls, obj, transposed: bool):
4598 return obj
4599
4600 def write(self, obj, data_columns=None, **kwargs):
4601 """we are going to write this as a frame table"""
4602 if not isinstance(obj, DataFrame):
4603 name = obj.name or "values"
4604 obj = obj.to_frame(name)
4605 return super().write(obj=obj, data_columns=obj.columns.tolist(), **kwargs)
4606
4607 def read(
4608 self,
4609 where=None,
4610 columns=None,
4611 start: int | None = None,
4612 stop: int | None = None,
4613 ) -> Series:
4614 is_multi_index = self.is_multi_index
4615 if columns is not None and is_multi_index:
4616 assert isinstance(self.levels, list) # needed for mypy
4617 for n in self.levels:
4618 if n not in columns:
4619 columns.insert(0, n)
4620 s = super().read(where=where, columns=columns, start=start, stop=stop)
4621 if is_multi_index:
4622 s.set_index(self.levels, inplace=True)
4623
4624 s = s.iloc[:, 0]
4625
4626 # remove the default name
4627 if s.name == "values":
4628 s.name = None
4629 return s
4630
4631
4632class AppendableMultiSeriesTable(AppendableSeriesTable):
4633 """support the new appendable table formats"""
4634
4635 pandas_kind = "series_table"
4636 table_type = "appendable_multiseries"
4637
4638 def write(self, obj, **kwargs):
4639 """we are going to write this as a frame table"""
4640 name = obj.name or "values"
4641 newobj, self.levels = self.validate_multiindex(obj)
4642 assert isinstance(self.levels, list) # for mypy
4643 cols = list(self.levels)
4644 cols.append(name)
4645 newobj.columns = Index(cols)
4646 return super().write(obj=newobj, **kwargs)
4647
4648
4649class GenericTable(AppendableFrameTable):
4650 """a table that read/writes the generic pytables table format"""
4651
4652 pandas_kind = "frame_table"
4653 table_type = "generic_table"
4654 ndim = 2
4655 obj_type = DataFrame
4656 levels: list[Hashable]
4657
4658 @property
4659 def pandas_type(self) -> str:
4660 return self.pandas_kind
4661
4662 @property
4663 def storable(self):
4664 return getattr(self.group, "table", None) or self.group
4665
4666 def get_attrs(self) -> None:
4667 """retrieve our attributes"""
4668 self.non_index_axes = []
4669 self.nan_rep = None
4670 self.levels = []
4671
4672 self.index_axes = [a for a in self.indexables if a.is_an_indexable]
4673 self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
4674 self.data_columns = [a.name for a in self.values_axes]
4675
4676 @cache_readonly
4677 def indexables(self):
4678 """create the indexables from the table description"""
4679 d = self.description
4680
4681 # TODO: can we get a typ for this? AFAICT it is the only place
4682 # where we aren't passing one
4683 # the index columns is just a simple index
4684 md = self.read_metadata("index")
4685 meta = "category" if md is not None else None
4686 index_col = GenericIndexCol(
4687 name="index", axis=0, table=self.table, meta=meta, metadata=md
4688 )
4689
4690 _indexables: list[GenericIndexCol | GenericDataIndexableCol] = [index_col]
4691
4692 for i, n in enumerate(d._v_names):
4693 assert isinstance(n, str)
4694
4695 atom = getattr(d, n)
4696 md = self.read_metadata(n)
4697 meta = "category" if md is not None else None
4698 dc = GenericDataIndexableCol(
4699 name=n,
4700 pos=i,
4701 values=[n],
4702 typ=atom,
4703 table=self.table,
4704 meta=meta,
4705 metadata=md,
4706 )
4707 _indexables.append(dc)
4708
4709 return _indexables
4710
4711 def write(self, **kwargs):
        raise NotImplementedError("cannot write on a generic table")
4713
4714
4715class AppendableMultiFrameTable(AppendableFrameTable):
4716 """a frame with a multi-index"""
4717
4718 table_type = "appendable_multiframe"
4719 obj_type = DataFrame
4720 ndim = 2
4721 _re_levels = re.compile(r"^level_\d+$")
4722
4723 @property
4724 def table_type_short(self) -> str:
4725 return "appendable_multi"
4726
4727 def write(self, obj, data_columns=None, **kwargs):
4728 if data_columns is None:
4729 data_columns = []
4730 elif data_columns is True:
4731 data_columns = obj.columns.tolist()
4732 obj, self.levels = self.validate_multiindex(obj)
4733 assert isinstance(self.levels, list) # for mypy
4734 for n in self.levels:
4735 if n not in data_columns:
4736 data_columns.insert(0, n)
4737 return super().write(obj=obj, data_columns=data_columns, **kwargs)
4738
4739 def read(
4740 self,
4741 where=None,
4742 columns=None,
4743 start: int | None = None,
4744 stop: int | None = None,
4745 ):
4746 df = super().read(where=where, columns=columns, start=start, stop=stop)
4747 df = df.set_index(self.levels)
4748
4749 # remove names for 'level_%d'
4750 df.index = df.index.set_names(
4751 [None if self._re_levels.search(name) else name for name in df.index.names]
4752 )
4753
4754 return df
4755
4756
4757def _reindex_axis(
4758 obj: DataFrame, axis: AxisInt, labels: Index, other=None
4759) -> DataFrame:
4760 ax = obj._get_axis(axis)
4761 labels = ensure_index(labels)
4762
4763 # try not to reindex even if other is provided
4764 # if it equals our current index
4765 if other is not None:
4766 other = ensure_index(other)
4767 if (other is None or labels.equals(other)) and labels.equals(ax):
4768 return obj
4769
4770 labels = ensure_index(labels.unique())
4771 if other is not None:
4772 labels = ensure_index(other.unique()).intersection(labels, sort=False)
4773 if not labels.equals(ax):
4774 slicer: list[slice | Index] = [slice(None, None)] * obj.ndim
4775 slicer[axis] = labels
4776 obj = obj.loc[tuple(slicer)]
4777 return obj
4778
4779
4780# tz to/from coercion
4781
4782
4783def _get_tz(tz: tzinfo) -> str | tzinfo:
4784 """for a tz-aware type, return an encoded zone"""
4785 zone = timezones.get_timezone(tz)
4786 return zone
4787
4788
4789@overload
4790def _set_tz(
4791 values: np.ndarray | Index, tz: str | tzinfo, coerce: bool = False
4792) -> DatetimeIndex:
4793 ...
4794
4795
4796@overload
4797def _set_tz(values: np.ndarray | Index, tz: None, coerce: bool = False) -> np.ndarray:
4798 ...
4799
4800
4801def _set_tz(
4802 values: np.ndarray | Index, tz: str | tzinfo | None, coerce: bool = False
4803) -> np.ndarray | DatetimeIndex:
4804 """
4805 coerce the values to a DatetimeIndex if tz is set
4806 preserve the input shape if possible
4807
4808 Parameters
4809 ----------
4810 values : ndarray or Index
4811 tz : str or tzinfo
4812 coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray
4813 """
4814 if isinstance(values, DatetimeIndex):
4815 # If values is tzaware, the tz gets dropped in the values.ravel()
4816 # call below (which returns an ndarray). So we are only non-lossy
4817 # if `tz` matches `values.tz`.
4818 assert values.tz is None or values.tz == tz
4819
4820 if tz is not None:
4821 if isinstance(values, DatetimeIndex):
4822 name = values.name
4823 values = values.asi8
4824 else:
4825 name = None
4826 values = values.ravel()
4827
4828 tz = _ensure_decoded(tz)
4829 values = DatetimeIndex(values, name=name)
4830 values = values.tz_localize("UTC").tz_convert(tz)
4831 elif coerce:
4832 values = np.asarray(values, dtype="M8[ns]")
4833
4834 # error: Incompatible return value type (got "Union[ndarray, Index]",
4835 # expected "Union[ndarray, DatetimeIndex]")
4836 return values # type: ignore[return-value]
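# Illustrative sketch (hypothetical values): i8 values read back from a table
# are re-localized when a zone was stored, roughly
#   >>> _set_tz(np.array([0]), tz="US/Eastern")   # epoch in i8 nanoseconds
#   DatetimeIndex(['1969-12-31 19:00:00-05:00'], ...)
# while ``tz=None`` with ``coerce=True`` just returns an M8[ns] ndarray.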
4837
4838
4839def _convert_index(name: str, index: Index, encoding: str, errors: str) -> IndexCol:
4840 assert isinstance(name, str)
4841
4842 index_name = index.name
4843 # error: Argument 1 to "_get_data_and_dtype_name" has incompatible type "Index";
4844 # expected "Union[ExtensionArray, ndarray]"
4845 converted, dtype_name = _get_data_and_dtype_name(index) # type: ignore[arg-type]
4846 kind = _dtype_to_kind(dtype_name)
4847 atom = DataIndexableCol._get_atom(converted)
4848
4849 if (
4850 (isinstance(index.dtype, np.dtype) and is_integer_dtype(index))
4851 or needs_i8_conversion(index.dtype)
4852 or is_bool_dtype(index.dtype)
4853 ):
4854 # Includes Index, RangeIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex,
4855 # in which case "kind" is "integer", "integer", "datetime64",
4856 # "timedelta64", and "integer", respectively.
4857 return IndexCol(
4858 name,
4859 values=converted,
4860 kind=kind,
4861 typ=atom,
4862 freq=getattr(index, "freq", None),
4863 tz=getattr(index, "tz", None),
4864 index_name=index_name,
4865 )
4866
4867 if isinstance(index, MultiIndex):
4868 raise TypeError("MultiIndex not supported here!")
4869
4870 inferred_type = lib.infer_dtype(index, skipna=False)
4871 # we won't get inferred_type of "datetime64" or "timedelta64" as these
4872 # would go through the DatetimeIndex/TimedeltaIndex paths above
4873
4874 values = np.asarray(index)
4875
4876 if inferred_type == "date":
4877 converted = np.asarray([v.toordinal() for v in values], dtype=np.int32)
4878 return IndexCol(
4879 name, converted, "date", _tables().Time32Col(), index_name=index_name
4880 )
4881 elif inferred_type == "string":
4882 converted = _convert_string_array(values, encoding, errors)
4883 itemsize = converted.dtype.itemsize
4884 return IndexCol(
4885 name,
4886 converted,
4887 "string",
4888 _tables().StringCol(itemsize),
4889 index_name=index_name,
4890 )
4891
4892 elif inferred_type in ["integer", "floating"]:
4893 return IndexCol(
4894 name, values=converted, kind=kind, typ=atom, index_name=index_name
4895 )
4896 else:
4897 assert isinstance(converted, np.ndarray) and converted.dtype == object
4898 assert kind == "object", kind
4899 atom = _tables().ObjectAtom()
4900 return IndexCol(name, converted, kind, atom, index_name=index_name)
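# Illustrative sketch (hypothetical indexes) of the branches above:
#   DatetimeIndex/TimedeltaIndex/PeriodIndex -> first branch; i8 values with
#       kind "datetime64"/"timedelta64"/"integer", tz/freq kept as attributes
#   Index of datetime.date objects           -> ordinals in a Time32Col, kind "date"
#   Index(["a", "bb"])                       -> fixed-width StringCol(2), kind "string"
#   remaining object dtype                   -> ObjectAtom (pickled), kind "object"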
4901
4902
4903def _unconvert_index(data, kind: str, encoding: str, errors: str) -> np.ndarray | Index:
4904 index: Index | np.ndarray
4905
4906 if kind == "datetime64":
4907 index = DatetimeIndex(data)
4908 elif kind == "timedelta64":
4909 index = TimedeltaIndex(data)
4910 elif kind == "date":
4911 try:
4912 index = np.asarray([date.fromordinal(v) for v in data], dtype=object)
4913 except ValueError:
4914 index = np.asarray([date.fromtimestamp(v) for v in data], dtype=object)
4915 elif kind in ("integer", "float", "bool"):
4916 index = np.asarray(data)
4917 elif kind in ("string"):
4918 index = _unconvert_string_array(
4919 data, nan_rep=None, encoding=encoding, errors=errors
4920 )
4921 elif kind == "object":
4922 index = np.asarray(data[0])
4923 else: # pragma: no cover
4924 raise ValueError(f"unrecognized index type {kind}")
4925 return index
4926
4927
4928def _maybe_convert_for_string_atom(
4929 name: str,
4930 bvalues: ArrayLike,
4931 existing_col,
4932 min_itemsize,
4933 nan_rep,
4934 encoding,
4935 errors,
4936 columns: list[str],
4937):
4938 if bvalues.dtype != object:
4939 return bvalues
4940
4941 bvalues = cast(np.ndarray, bvalues)
4942
4943 dtype_name = bvalues.dtype.name
4944 inferred_type = lib.infer_dtype(bvalues, skipna=False)
4945
4946 if inferred_type == "date":
4947 raise TypeError("[date] is not implemented as a table column")
4948 if inferred_type == "datetime":
4949 # after GH#8260
4950 # this only would be hit for a multi-timezone dtype which is an error
4951 raise TypeError(
4952 "too many timezones in this block, create separate data columns"
4953 )
4954
4955 if not (inferred_type == "string" or dtype_name == "object"):
4956 return bvalues
4957
4958 mask = isna(bvalues)
4959 data = bvalues.copy()
4960 data[mask] = nan_rep
4961
4962 # see if we have a valid string type
4963 inferred_type = lib.infer_dtype(data, skipna=False)
4964 if inferred_type != "string":
4965 # we cannot serialize this data, so report an exception on a column
4966 # by column basis
4967
4968 # expected behaviour:
4969 # search block for a non-string object column by column
4970 for i in range(data.shape[0]):
4971 col = data[i]
4972 inferred_type = lib.infer_dtype(col, skipna=False)
4973 if inferred_type != "string":
4974 error_column_label = columns[i] if len(columns) > i else f"No.{i}"
4975 raise TypeError(
4976 f"Cannot serialize the column [{error_column_label}]\n"
4977 f"because its data contents are not [string] but "
4978 f"[{inferred_type}] object dtype"
4979 )
4980
4981 # itemsize is the maximum length of a string (along any dimension)
4982
4983 data_converted = _convert_string_array(data, encoding, errors).reshape(data.shape)
4984 itemsize = data_converted.itemsize
4985
4986 # specified min_itemsize?
4987 if isinstance(min_itemsize, dict):
4988 min_itemsize = int(min_itemsize.get(name) or min_itemsize.get("values") or 0)
4989 itemsize = max(min_itemsize or 0, itemsize)
4990
4991 # check for column in the values conflicts
4992 if existing_col is not None:
4993 eci = existing_col.validate_col(itemsize)
4994 if eci is not None and eci > itemsize:
4995 itemsize = eci
4996
4997 data_converted = data_converted.astype(f"|S{itemsize}", copy=False)
4998 return data_converted
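# Illustrative sketch (hypothetical sizes) of the itemsize resolution above:
# if the longest encoded string in the block is 4 bytes, min_itemsize is
# {"values": 10}, and the existing column was written with itemsize 12, then
#   itemsize = max(10, 4) -> 10, bumped to 12 by existing_col.validate_col,
# and the block is returned as a "|S12" fixed-width byte array.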
4999
5000
5001def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.ndarray:
5002 """
5003 Take a string-like that is object dtype and coerce to a fixed size string type.
5004
5005 Parameters
5006 ----------
5007 data : np.ndarray[object]
5008 encoding : str
5009 errors : str
5010 Handler for encoding errors.
5011
5012 Returns
5013 -------
5014 np.ndarray[fixed-length-string]
5015 """
5016 # encode if needed
5017 if len(data):
5018 data = (
5019 Series(data.ravel(), copy=False)
5020 .str.encode(encoding, errors)
5021 ._values.reshape(data.shape)
5022 )
5023
5024 # create the sized dtype
5025 ensured = ensure_object(data.ravel())
5026 itemsize = max(1, libwriters.max_len_string_array(ensured))
5027
5028 data = np.asarray(data, dtype=f"S{itemsize}")
5029 return data
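# Illustrative sketch:
#   >>> arr = np.array(["a", "bb", "ccc"], dtype=object)
#   >>> _convert_string_array(arr, "UTF-8", "strict")
#   array([b'a', b'bb', b'ccc'], dtype='|S3')
# i.e. elements are encoded and the result is widened to the longest element.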
5030
5031
5032def _unconvert_string_array(
5033 data: np.ndarray, nan_rep, encoding: str, errors: str
5034) -> np.ndarray:
5035 """
5036 Inverse of _convert_string_array.
5037
5038 Parameters
5039 ----------
5040 data : np.ndarray[fixed-length-string]
5041 nan_rep : the storage repr of NaN
5042 encoding : str
5043 errors : str
5044 Handler for encoding errors.
5045
5046 Returns
5047 -------
5048 np.ndarray[object]
5049 Decoded data.
5050 """
5051 shape = data.shape
5052 data = np.asarray(data.ravel(), dtype=object)
5053
5054 if len(data):
5055 itemsize = libwriters.max_len_string_array(ensure_object(data))
5056 dtype = f"U{itemsize}"
5057
5058 if isinstance(data[0], bytes):
5059 data = Series(data, copy=False).str.decode(encoding, errors=errors)._values
5060 else:
5061 data = data.astype(dtype, copy=False).astype(object, copy=False)
5062
5063 if nan_rep is None:
5064 nan_rep = "nan"
5065
5066 libwriters.string_array_replace_from_nan_rep(data, nan_rep)
5067 return data.reshape(shape)
5068
5069
5070def _maybe_convert(values: np.ndarray, val_kind: str, encoding: str, errors: str):
5071 assert isinstance(val_kind, str), type(val_kind)
5072 if _need_convert(val_kind):
5073 conv = _get_converter(val_kind, encoding, errors)
5074 values = conv(values)
5075 return values
5076
5077
5078def _get_converter(kind: str, encoding: str, errors: str):
5079 if kind == "datetime64":
5080 return lambda x: np.asarray(x, dtype="M8[ns]")
5081 elif kind == "string":
5082 return lambda x: _unconvert_string_array(
5083 x, nan_rep=None, encoding=encoding, errors=errors
5084 )
5085 else: # pragma: no cover
5086 raise ValueError(f"invalid kind {kind}")
5087
5088
5089def _need_convert(kind: str) -> bool:
5090 if kind in ("datetime64", "string"):
5091 return True
5092 return False
5093
5094
5095def _maybe_adjust_name(name: str, version: Sequence[int]) -> str:
5096 """
    Prior to 0.10.1, values blocks were named like ``values_0`` rather than
    ``values_block_0``; adjust the given name if necessary to match.
5099
5100 Parameters
5101 ----------
5102 name : str
    version : Sequence[int]
5104
5105 Returns
5106 -------
5107 str
5108 """
5109 if isinstance(version, str) or len(version) < 3:
5110 raise ValueError("Version is incorrect, expected sequence of 3 integers.")
5111
5112 if version[0] == 0 and version[1] <= 10 and version[2] == 0:
5113 m = re.search(r"values_block_(\d+)", name)
5114 if m:
5115 grp = m.groups()[0]
5116 name = f"values_{grp}"
5117 return name
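# Illustrative sketch of the version check above:
#   >>> _maybe_adjust_name("values_block_0", (0, 10, 0))
#   'values_0'
#   >>> _maybe_adjust_name("values_block_0", (0, 15, 2))
#   'values_block_0'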
5118
5119
5120def _dtype_to_kind(dtype_str: str) -> str:
5121 """
5122 Find the "kind" string describing the given dtype name.
5123 """
5124 dtype_str = _ensure_decoded(dtype_str)
5125
5126 if dtype_str.startswith("string") or dtype_str.startswith("bytes"):
5127 kind = "string"
5128 elif dtype_str.startswith("float"):
5129 kind = "float"
5130 elif dtype_str.startswith("complex"):
5131 kind = "complex"
5132 elif dtype_str.startswith("int") or dtype_str.startswith("uint"):
5133 kind = "integer"
5134 elif dtype_str.startswith("datetime64"):
5135 kind = "datetime64"
5136 elif dtype_str.startswith("timedelta"):
5137 kind = "timedelta64"
5138 elif dtype_str.startswith("bool"):
5139 kind = "bool"
5140 elif dtype_str.startswith("category"):
5141 kind = "category"
5142 elif dtype_str.startswith("period"):
5143 # We store the `freq` attr so we can restore from integers
5144 kind = "integer"
5145 elif dtype_str == "object":
5146 kind = "object"
5147 else:
5148 raise ValueError(f"cannot interpret dtype of [{dtype_str}]")
5149
5150 return kind
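# Illustrative sketch of the mapping above:
#   >>> _dtype_to_kind("float64")
#   'float'
#   >>> _dtype_to_kind("datetime64[ns, UTC]")
#   'datetime64'
#   >>> _dtype_to_kind("period[M]")  # stored as integers; freq restored separately
#   'integer'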
5151
5152
5153def _get_data_and_dtype_name(data: ArrayLike):
5154 """
5155 Convert the passed data into a storable form and a dtype string.
5156 """
5157 if isinstance(data, Categorical):
5158 data = data.codes
5159
    # For datetime64tz we need to drop the TZ in tests. TODO: why?
5161 dtype_name = data.dtype.name.split("[")[0]
5162
5163 if data.dtype.kind in ["m", "M"]:
5164 data = np.asarray(data.view("i8"))
5165 # TODO: we used to reshape for the dt64tz case, but no longer
5166 # doing that doesn't seem to break anything. why?
5167
5168 elif isinstance(data, PeriodIndex):
5169 data = data.asi8
5170
5171 data = np.asarray(data)
5172 return data, dtype_name
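# Illustrative sketch (hypothetical arrays): datetime-like data are returned as
# their i8 view with the unit/tz stripped from the dtype name, e.g. a
# ``datetime64[ns, UTC]`` array yields (int64 ndarray, "datetime64"); a
# Categorical is reduced to its integer codes before being stored.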
5173
5174
5175class Selection:
5176 """
5177 Carries out a selection operation on a tables.Table object.
5178
5179 Parameters
5180 ----------
5181 table : a Table object
5182 where : list of Terms (or convertible to)
5183 start, stop: indices to start and/or stop selection
5184
5185 """
5186
5187 def __init__(
5188 self,
5189 table: Table,
5190 where=None,
5191 start: int | None = None,
5192 stop: int | None = None,
5193 ) -> None:
5194 self.table = table
5195 self.where = where
5196 self.start = start
5197 self.stop = stop
5198 self.condition = None
5199 self.filter = None
5200 self.terms = None
5201 self.coordinates = None
5202
5203 if is_list_like(where):
            # see if we were passed coordinate-like values (mask or integer locations)
5205 with suppress(ValueError):
5206 inferred = lib.infer_dtype(where, skipna=False)
5207 if inferred in ("integer", "boolean"):
5208 where = np.asarray(where)
5209 if where.dtype == np.bool_:
5210 start, stop = self.start, self.stop
5211 if start is None:
5212 start = 0
5213 if stop is None:
5214 stop = self.table.nrows
5215 self.coordinates = np.arange(start, stop)[where]
5216 elif issubclass(where.dtype.type, np.integer):
5217 if (self.start is not None and (where < self.start).any()) or (
5218 self.stop is not None and (where >= self.stop).any()
5219 ):
5220 raise ValueError(
5221 "where must have index locations >= start and < stop"
5222 )
5223 self.coordinates = where
5224
5225 if self.coordinates is None:
5226 self.terms = self.generate(where)
5227
5228 # create the numexpr & the filter
5229 if self.terms is not None:
5230 self.condition, self.filter = self.terms.evaluate()
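    # Illustrative sketch (hypothetical inputs) of the accepted ``where`` forms:
    #   Selection(tbl, where="index > 5")   -> parsed into Terms -> condition/filter
    #   Selection(tbl, where=[0, 3, 7])     -> taken as explicit row coordinates
    #   Selection(tbl, where=bool_mask)     -> positions of True within [start, stop)
    # Only the expression form produces a numexpr condition; the other two
    # short-circuit into ``self.coordinates``.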
5231
5232 def generate(self, where):
5233 """where can be a : dict,list,tuple,string"""
5234 if where is None:
5235 return None
5236
5237 q = self.table.queryables()
5238 try:
5239 return PyTablesExpr(where, queryables=q, encoding=self.table.encoding)
5240 except NameError as err:
5241 # raise a nice message, suggesting that the user should use
5242 # data_columns
5243 qkeys = ",".join(q.keys())
5244 msg = dedent(
5245 f"""\
5246 The passed where expression: {where}
                contains an invalid variable reference.
                All of the variable references must be a reference to
                an axis (e.g. 'index' or 'columns') or a data_column.
                The currently defined references are: {qkeys}
5251 """
5252 )
5253 raise ValueError(msg) from err
5254
5255 def select(self):
5256 """
5257 generate the selection
5258 """
5259 if self.condition is not None:
5260 return self.table.table.read_where(
5261 self.condition.format(), start=self.start, stop=self.stop
5262 )
5263 elif self.coordinates is not None:
5264 return self.table.table.read_coordinates(self.coordinates)
5265 return self.table.table.read(start=self.start, stop=self.stop)
5266
5267 def select_coords(self):
5268 """
5269 generate the selection
5270 """
5271 start, stop = self.start, self.stop
5272 nrows = self.table.nrows
5273 if start is None:
5274 start = 0
5275 elif start < 0:
5276 start += nrows
5277 if stop is None:
5278 stop = nrows
5279 elif stop < 0:
5280 stop += nrows
5281
5282 if self.condition is not None:
5283 return self.table.table.get_where_list(
5284 self.condition.format(), start=start, stop=stop, sort=True
5285 )
5286 elif self.coordinates is not None:
5287 return self.coordinates
5288
5289 return np.arange(start, stop)
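    # Illustrative sketch (hypothetical sizes): negative bounds are normalized
    # against the table length, e.g. with nrows=100, start=-10, stop=None the
    # selection covers rows 90..99; without a condition or explicit coordinates
    # this simply returns ``np.arange(90, 100)``.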