"""
High level interface to PyTables for reading and writing pandas data structures
to disk
"""
5from __future__ import annotations
6
7from contextlib import suppress
8import copy
9from datetime import (
10 date,
11 tzinfo,
12)
13import itertools
14import os
15import re
16from textwrap import dedent
17from typing import (
18 TYPE_CHECKING,
19 Any,
20 Callable,
21 Final,
22 Literal,
23 cast,
24 overload,
25)
26import warnings
27
28import numpy as np
29
30from pandas._config import (
31 config,
32 get_option,
33 using_copy_on_write,
34 using_pyarrow_string_dtype,
35)
36
37from pandas._libs import (
38 lib,
39 writers as libwriters,
40)
41from pandas._libs.lib import is_string_array
42from pandas._libs.tslibs import timezones
43from pandas.compat._optional import import_optional_dependency
44from pandas.compat.pickle_compat import patch_pickle
45from pandas.errors import (
46 AttributeConflictWarning,
47 ClosedFileError,
48 IncompatibilityWarning,
49 PerformanceWarning,
50 PossibleDataLossError,
51)
52from pandas.util._decorators import cache_readonly
53from pandas.util._exceptions import find_stack_level
54
55from pandas.core.dtypes.common import (
56 ensure_object,
57 is_bool_dtype,
58 is_complex_dtype,
59 is_list_like,
60 is_string_dtype,
61 needs_i8_conversion,
62)
63from pandas.core.dtypes.dtypes import (
64 CategoricalDtype,
65 DatetimeTZDtype,
66 ExtensionDtype,
67 PeriodDtype,
68)
69from pandas.core.dtypes.missing import array_equivalent
70
71from pandas import (
72 DataFrame,
73 DatetimeIndex,
74 Index,
75 MultiIndex,
76 PeriodIndex,
77 RangeIndex,
78 Series,
79 TimedeltaIndex,
80 concat,
81 isna,
82)
83from pandas.core.arrays import (
84 Categorical,
85 DatetimeArray,
86 PeriodArray,
87)
88import pandas.core.common as com
89from pandas.core.computation.pytables import (
90 PyTablesExpr,
91 maybe_expression,
92)
93from pandas.core.construction import extract_array
94from pandas.core.indexes.api import ensure_index
95from pandas.core.internals import (
96 ArrayManager,
97 BlockManager,
98)
99
100from pandas.io.common import stringify_path
101from pandas.io.formats.printing import (
102 adjoin,
103 pprint_thing,
104)
105
106if TYPE_CHECKING:
107 from collections.abc import (
108 Hashable,
109 Iterator,
110 Sequence,
111 )
112 from types import TracebackType
113
114 from tables import (
115 Col,
116 File,
117 Node,
118 )
119
120 from pandas._typing import (
121 AnyArrayLike,
122 ArrayLike,
123 AxisInt,
124 DtypeArg,
125 FilePath,
126 Self,
127 Shape,
128 npt,
129 )
130
131 from pandas.core.internals import Block
132
133# versioning attribute
134_version = "0.15.2"
135
136# encoding
137_default_encoding = "UTF-8"
138
139
140def _ensure_decoded(s):
141 """if we have bytes, decode them to unicode"""
142 if isinstance(s, np.bytes_):
143 s = s.decode("UTF-8")
144 return s
145
146
147def _ensure_encoding(encoding: str | None) -> str:
148 # set the encoding if we need
149 if encoding is None:
150 encoding = _default_encoding
151
152 return encoding
153
154
155def _ensure_str(name):
156 """
157 Ensure that an index / column name is a str (python 3); otherwise they
158 may be np.string dtype. Non-string dtypes are passed through unchanged.
159
160 https://github.com/pandas-dev/pandas/issues/13492
161 """
162 if isinstance(name, str):
163 name = str(name)
164 return name
165
166
167Term = PyTablesExpr
168
169
170def _ensure_term(where, scope_level: int):
171 """
172 Ensure that the where is a Term or a list of Term.
173
    This makes sure that we are capturing the scope of the variables that
    are passed; the terms are created here with a frame_level=2 (we are
    2 levels down).
176 """
177 # only consider list/tuple here as an ndarray is automatically a coordinate
178 # list
179 level = scope_level + 1
180 if isinstance(where, (list, tuple)):
181 where = [
182 Term(term, scope_level=level + 1) if maybe_expression(term) else term
183 for term in where
184 if term is not None
185 ]
186 elif maybe_expression(where):
187 where = Term(where, scope_level=level)
188 return where if where is None or len(where) else None
189
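# Illustrative only: a ``where`` argument may be a string expression, a Term,
# or a list of these (the key and column names below are hypothetical), e.g.
#   store.select("df", where="index > 5")
#   store.select("df", where=[Term("index > 5"), "columns == ['A', 'B']"])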
190
191incompatibility_doc: Final = """
where criteria are being ignored as this version [%s] is too old (or
not-defined); read the file in and write it out to a new file to upgrade (with
the copy method)
195"""
196
197attribute_conflict_doc: Final = """
198the [%s] attribute of the existing index is [%s] which conflicts with the new
199[%s], resetting the attribute to None
200"""
201
202performance_doc: Final = """
203your performance may suffer as PyTables will pickle object types that it cannot
204map directly to c-types [inferred_type->%s,key->%s] [items->%s]
205"""
206
207# formats
208_FORMAT_MAP = {"f": "fixed", "fixed": "fixed", "t": "table", "table": "table"}
209
210# axes map
211_AXES_MAP = {DataFrame: [0]}
212
213# register our configuration options
214dropna_doc: Final = """
215: boolean
216 drop ALL nan rows when appending to a table
217"""
218format_doc: Final = """
219: format
    default writing format; if None, then
221 put will default to 'fixed' and append will default to 'table'
222"""
223
224with config.config_prefix("io.hdf"):
225 config.register_option("dropna_table", False, dropna_doc, validator=config.is_bool)
226 config.register_option(
227 "default_format",
228 None,
229 format_doc,
230 validator=config.is_one_of_factory(["fixed", "table", None]),
231 )
232
233# oh the troubles to reduce import time
234_table_mod = None
235_table_file_open_policy_is_strict = False
236
237
238def _tables():
239 global _table_mod
240 global _table_file_open_policy_is_strict
241 if _table_mod is None:
242 import tables
243
244 _table_mod = tables
245
246 # set the file open policy
247 # return the file open policy; this changes as of pytables 3.1
248 # depending on the HDF5 version
249 with suppress(AttributeError):
250 _table_file_open_policy_is_strict = (
251 tables.file._FILE_OPEN_POLICY == "strict"
252 )
253
254 return _table_mod
255
256
257# interface to/from ###
258
259
260def to_hdf(
261 path_or_buf: FilePath | HDFStore,
262 key: str,
263 value: DataFrame | Series,
264 mode: str = "a",
265 complevel: int | None = None,
266 complib: str | None = None,
267 append: bool = False,
268 format: str | None = None,
269 index: bool = True,
270 min_itemsize: int | dict[str, int] | None = None,
271 nan_rep=None,
272 dropna: bool | None = None,
273 data_columns: Literal[True] | list[str] | None = None,
274 errors: str = "strict",
275 encoding: str = "UTF-8",
276) -> None:
277 """store this object, close it if we opened it"""
278 if append:
279 f = lambda store: store.append(
280 key,
281 value,
282 format=format,
283 index=index,
284 min_itemsize=min_itemsize,
285 nan_rep=nan_rep,
286 dropna=dropna,
287 data_columns=data_columns,
288 errors=errors,
289 encoding=encoding,
290 )
291 else:
292 # NB: dropna is not passed to `put`
293 f = lambda store: store.put(
294 key,
295 value,
296 format=format,
297 index=index,
298 min_itemsize=min_itemsize,
299 nan_rep=nan_rep,
300 data_columns=data_columns,
301 errors=errors,
302 encoding=encoding,
303 dropna=dropna,
304 )
305
306 path_or_buf = stringify_path(path_or_buf)
307 if isinstance(path_or_buf, str):
308 with HDFStore(
309 path_or_buf, mode=mode, complevel=complevel, complib=complib
310 ) as store:
311 f(store)
312 else:
313 f(path_or_buf)
314
315
316def read_hdf(
317 path_or_buf: FilePath | HDFStore,
318 key=None,
319 mode: str = "r",
320 errors: str = "strict",
321 where: str | list | None = None,
322 start: int | None = None,
323 stop: int | None = None,
324 columns: list[str] | None = None,
325 iterator: bool = False,
326 chunksize: int | None = None,
327 **kwargs,
328):
329 """
330 Read from the store, close it if we opened it.
331
332 Retrieve pandas object stored in file, optionally based on where
333 criteria.
334
335 .. warning::
336
337 Pandas uses PyTables for reading and writing HDF5 files, which allows
338 serializing object-dtype data with pickle when using the "fixed" format.
339 Loading pickled data received from untrusted sources can be unsafe.
340
341 See: https://docs.python.org/3/library/pickle.html for more.
342
343 Parameters
344 ----------
345 path_or_buf : str, path object, pandas.HDFStore
        Any valid string path is acceptable. Only supports the local file system;
        remote URLs and file-like objects are not supported.
348
349 If you want to pass in a path object, pandas accepts any
350 ``os.PathLike``.
351
352 Alternatively, pandas accepts an open :class:`pandas.HDFStore` object.
353
354 key : object, optional
355 The group identifier in the store. Can be omitted if the HDF file
356 contains a single pandas object.
357 mode : {'r', 'r+', 'a'}, default 'r'
358 Mode to use when opening the file. Ignored if path_or_buf is a
359 :class:`pandas.HDFStore`. Default is 'r'.
360 errors : str, default 'strict'
361 Specifies how encoding and decoding errors are to be handled.
362 See the errors argument for :func:`open` for a full list
363 of options.
364 where : list, optional
365 A list of Term (or convertible) objects.
366 start : int, optional
367 Row number to start selection.
368 stop : int, optional
369 Row number to stop selection.
370 columns : list, optional
371 A list of columns names to return.
372 iterator : bool, optional
373 Return an iterator object.
374 chunksize : int, optional
375 Number of rows to include in an iteration when using an iterator.
376 **kwargs
377 Additional keyword arguments passed to HDFStore.
378
379 Returns
380 -------
381 object
382 The selected object. Return type depends on the object stored.
383
384 See Also
385 --------
386 DataFrame.to_hdf : Write a HDF file from a DataFrame.
387 HDFStore : Low-level access to HDF files.
388
389 Examples
390 --------
391 >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z']) # doctest: +SKIP
392 >>> df.to_hdf('./store.h5', 'data') # doctest: +SKIP
393 >>> reread = pd.read_hdf('./store.h5') # doctest: +SKIP
394 """
395 if mode not in ["r", "r+", "a"]:
396 raise ValueError(
397 f"mode {mode} is not allowed while performing a read. "
398 f"Allowed modes are r, r+ and a."
399 )
400 # grab the scope
401 if where is not None:
402 where = _ensure_term(where, scope_level=1)
403
404 if isinstance(path_or_buf, HDFStore):
405 if not path_or_buf.is_open:
406 raise OSError("The HDFStore must be open for reading.")
407
408 store = path_or_buf
409 auto_close = False
410 else:
411 path_or_buf = stringify_path(path_or_buf)
412 if not isinstance(path_or_buf, str):
413 raise NotImplementedError(
414 "Support for generic buffers has not been implemented."
415 )
416 try:
417 exists = os.path.exists(path_or_buf)
418
419 # if filepath is too long
420 except (TypeError, ValueError):
421 exists = False
422
423 if not exists:
424 raise FileNotFoundError(f"File {path_or_buf} does not exist")
425
426 store = HDFStore(path_or_buf, mode=mode, errors=errors, **kwargs)
427 # can't auto open/close if we are using an iterator
428 # so delegate to the iterator
429 auto_close = True
430
431 try:
432 if key is None:
433 groups = store.groups()
434 if len(groups) == 0:
435 raise ValueError(
436 "Dataset(s) incompatible with Pandas data types, "
437 "not table, or no datasets found in HDF5 file."
438 )
439 candidate_only_group = groups[0]
440
441 # For the HDF file to have only one dataset, all other groups
442 # should then be metadata groups for that candidate group. (This
443 # assumes that the groups() method enumerates parent groups
444 # before their children.)
445 for group_to_check in groups[1:]:
446 if not _is_metadata_of(group_to_check, candidate_only_group):
447 raise ValueError(
448 "key must be provided when HDF5 "
449 "file contains multiple datasets."
450 )
451 key = candidate_only_group._v_pathname
452 return store.select(
453 key,
454 where=where,
455 start=start,
456 stop=stop,
457 columns=columns,
458 iterator=iterator,
459 chunksize=chunksize,
460 auto_close=auto_close,
461 )
462 except (ValueError, TypeError, LookupError):
463 if not isinstance(path_or_buf, HDFStore):
464 # if there is an error, close the store if we opened it.
465 with suppress(AttributeError):
466 store.close()
467
468 raise
469
470
471def _is_metadata_of(group: Node, parent_group: Node) -> bool:
472 """Check if a given group is a metadata group for a given parent_group."""
473 if group._v_depth <= parent_group._v_depth:
474 return False
475
476 current = group
477 while current._v_depth > 1:
478 parent = current._v_parent
479 if parent == parent_group and current._v_name == "meta":
480 return True
481 current = current._v_parent
482 return False
483
484
485class HDFStore:
486 """
487 Dict-like IO interface for storing pandas objects in PyTables.
488
489 Either Fixed or Table format.
490
491 .. warning::
492
493 Pandas uses PyTables for reading and writing HDF5 files, which allows
494 serializing object-dtype data with pickle when using the "fixed" format.
495 Loading pickled data received from untrusted sources can be unsafe.
496
497 See: https://docs.python.org/3/library/pickle.html for more.
498
499 Parameters
500 ----------
501 path : str
502 File path to HDF5 file.
503 mode : {'a', 'w', 'r', 'r+'}, default 'a'
504
505 ``'r'``
506 Read-only; no data can be modified.
507 ``'w'``
508 Write; a new file is created (an existing file with the same
509 name would be deleted).
510 ``'a'``
511 Append; an existing file is opened for reading and writing,
512 and if the file does not exist it is created.
513 ``'r+'``
514 It is similar to ``'a'``, but the file must already exist.
515 complevel : int, 0-9, default None
516 Specifies a compression level for data.
517 A value of 0 or None disables compression.
518 complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
519 Specifies the compression library to be used.
520 These additional compressors for Blosc are supported
521 (default if no compressor specified: 'blosc:blosclz'):
522 {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
523 'blosc:zlib', 'blosc:zstd'}.
524 Specifying a compression library which is not available issues
525 a ValueError.
526 fletcher32 : bool, default False
527 If applying compression use the fletcher32 checksum.
528 **kwargs
529 These parameters will be passed to the PyTables open_file method.
530
531 Examples
532 --------
533 >>> bar = pd.DataFrame(np.random.randn(10, 4))
534 >>> store = pd.HDFStore('test.h5')
535 >>> store['foo'] = bar # write to HDF5
536 >>> bar = store['foo'] # retrieve
537 >>> store.close()
538
539 **Create or load HDF5 file in-memory**
540
541 When passing the `driver` option to the PyTables open_file method through
542 **kwargs, the HDF5 file is loaded or created in-memory and will only be
543 written when closed:
544
545 >>> bar = pd.DataFrame(np.random.randn(10, 4))
546 >>> store = pd.HDFStore('test.h5', driver='H5FD_CORE')
547 >>> store['foo'] = bar
548 >>> store.close() # only now, data is written to disk
549 """
550
551 _handle: File | None
552 _mode: str
553
554 def __init__(
555 self,
556 path,
557 mode: str = "a",
558 complevel: int | None = None,
559 complib=None,
560 fletcher32: bool = False,
561 **kwargs,
562 ) -> None:
563 if "format" in kwargs:
564 raise ValueError("format is not a defined argument for HDFStore")
565
566 tables = import_optional_dependency("tables")
567
568 if complib is not None and complib not in tables.filters.all_complibs:
569 raise ValueError(
570 f"complib only supports {tables.filters.all_complibs} compression."
571 )
572
573 if complib is None and complevel is not None:
574 complib = tables.filters.default_complib
575
576 self._path = stringify_path(path)
577 if mode is None:
578 mode = "a"
579 self._mode = mode
580 self._handle = None
581 self._complevel = complevel if complevel else 0
582 self._complib = complib
583 self._fletcher32 = fletcher32
584 self._filters = None
585 self.open(mode=mode, **kwargs)
586
587 def __fspath__(self) -> str:
588 return self._path
589
590 @property
591 def root(self):
592 """return the root node"""
593 self._check_if_open()
594 assert self._handle is not None # for mypy
595 return self._handle.root
596
597 @property
598 def filename(self) -> str:
599 return self._path
600
601 def __getitem__(self, key: str):
602 return self.get(key)
603
604 def __setitem__(self, key: str, value) -> None:
605 self.put(key, value)
606
607 def __delitem__(self, key: str) -> None:
608 return self.remove(key)
609
610 def __getattr__(self, name: str):
611 """allow attribute access to get stores"""
612 try:
613 return self.get(name)
614 except (KeyError, ClosedFileError):
615 pass
616 raise AttributeError(
617 f"'{type(self).__name__}' object has no attribute '{name}'"
618 )
619
620 def __contains__(self, key: str) -> bool:
621 """
622 check for existence of this key
        can match the exact pathname or the pathname w/o the leading '/'
624 """
625 node = self.get_node(key)
626 if node is not None:
627 name = node._v_pathname
628 if key in (name, name[1:]):
629 return True
630 return False
631
632 def __len__(self) -> int:
633 return len(self.groups())
634
635 def __repr__(self) -> str:
636 pstr = pprint_thing(self._path)
637 return f"{type(self)}\nFile path: {pstr}\n"
638
639 def __enter__(self) -> Self:
640 return self
641
642 def __exit__(
643 self,
644 exc_type: type[BaseException] | None,
645 exc_value: BaseException | None,
646 traceback: TracebackType | None,
647 ) -> None:
648 self.close()
649
650 def keys(self, include: str = "pandas") -> list[str]:
651 """
652 Return a list of keys corresponding to objects stored in HDFStore.
653
654 Parameters
655 ----------
656
        include : str, default 'pandas'
            When include equals 'pandas', return pandas objects.
            When include equals 'native', return native HDF5 Table objects.
660
661 Returns
662 -------
663 list
664 List of ABSOLUTE path-names (e.g. have the leading '/').
665
666 Raises
667 ------
        raises ValueError if include has an illegal value
669
670 Examples
671 --------
672 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
673 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
674 >>> store.put('data', df) # doctest: +SKIP
675 >>> store.get('data') # doctest: +SKIP
676 >>> print(store.keys()) # doctest: +SKIP
        ['/data']
678 >>> store.close() # doctest: +SKIP
679 """
680 if include == "pandas":
681 return [n._v_pathname for n in self.groups()]
682
683 elif include == "native":
684 assert self._handle is not None # mypy
685 return [
686 n._v_pathname for n in self._handle.walk_nodes("/", classname="Table")
687 ]
688 raise ValueError(
689 f"`include` should be either 'pandas' or 'native' but is '{include}'"
690 )
691
692 def __iter__(self) -> Iterator[str]:
693 return iter(self.keys())
694
695 def items(self) -> Iterator[tuple[str, list]]:
696 """
697 iterate on key->group
698 """
699 for g in self.groups():
700 yield g._v_pathname, g
701
702 def open(self, mode: str = "a", **kwargs) -> None:
703 """
704 Open the file in the specified mode
705
706 Parameters
707 ----------
708 mode : {'a', 'w', 'r', 'r+'}, default 'a'
709 See HDFStore docstring or tables.open_file for info about modes
710 **kwargs
711 These parameters will be passed to the PyTables open_file method.
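
        Examples
        --------
        A minimal sketch of reopening a store read-only; the file name is
        illustrative only.

        >>> store = pd.HDFStore("store.h5", 'w')  # doctest: +SKIP
        >>> store.close()  # doctest: +SKIP
        >>> store.open('r')  # doctest: +SKIP
        >>> store.is_open  # doctest: +SKIP
        True
        >>> store.close()  # doctest: +SKIP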
712 """
713 tables = _tables()
714
715 if self._mode != mode:
716 # if we are changing a write mode to read, ok
717 if self._mode in ["a", "w"] and mode in ["r", "r+"]:
718 pass
719 elif mode in ["w"]:
720 # this would truncate, raise here
721 if self.is_open:
722 raise PossibleDataLossError(
723 f"Re-opening the file [{self._path}] with mode [{self._mode}] "
724 "will delete the current file!"
725 )
726
727 self._mode = mode
728
729 # close and reopen the handle
730 if self.is_open:
731 self.close()
732
733 if self._complevel and self._complevel > 0:
734 self._filters = _tables().Filters(
735 self._complevel, self._complib, fletcher32=self._fletcher32
736 )
737
738 if _table_file_open_policy_is_strict and self.is_open:
739 msg = (
740 "Cannot open HDF5 file, which is already opened, "
741 "even in read-only mode."
742 )
743 raise ValueError(msg)
744
745 self._handle = tables.open_file(self._path, self._mode, **kwargs)
746
747 def close(self) -> None:
748 """
749 Close the PyTables file handle
750 """
751 if self._handle is not None:
752 self._handle.close()
753 self._handle = None
754
755 @property
756 def is_open(self) -> bool:
757 """
758 return a boolean indicating whether the file is open
759 """
760 if self._handle is None:
761 return False
762 return bool(self._handle.isopen)
763
764 def flush(self, fsync: bool = False) -> None:
765 """
766 Force all buffered modifications to be written to disk.
767
768 Parameters
769 ----------
770 fsync : bool (default False)
771 call ``os.fsync()`` on the file handle to force writing to disk.
772
773 Notes
774 -----
775 Without ``fsync=True``, flushing may not guarantee that the OS writes
776 to disk. With fsync, the operation will block until the OS claims the
777 file has been written; however, other caching layers may still
778 interfere.
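
        Examples
        --------
        A minimal sketch; the file name and key are illustrative only.

        >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
        >>> store = pd.HDFStore("store.h5", 'w')  # doctest: +SKIP
        >>> store.put('data', df)  # doctest: +SKIP
        >>> store.flush(fsync=True)  # doctest: +SKIP
        >>> store.close()  # doctest: +SKIP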
779 """
780 if self._handle is not None:
781 self._handle.flush()
782 if fsync:
783 with suppress(OSError):
784 os.fsync(self._handle.fileno())
785
786 def get(self, key: str):
787 """
788 Retrieve pandas object stored in file.
789
790 Parameters
791 ----------
792 key : str
793
794 Returns
795 -------
796 object
797 Same type as object stored in file.
798
799 Examples
800 --------
801 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
802 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
803 >>> store.put('data', df) # doctest: +SKIP
804 >>> store.get('data') # doctest: +SKIP
805 >>> store.close() # doctest: +SKIP
806 """
807 with patch_pickle():
808 # GH#31167 Without this patch, pickle doesn't know how to unpickle
809 # old DateOffset objects now that they are cdef classes.
810 group = self.get_node(key)
811 if group is None:
812 raise KeyError(f"No object named {key} in the file")
813 return self._read_group(group)
814
815 def select(
816 self,
817 key: str,
818 where=None,
819 start=None,
820 stop=None,
821 columns=None,
822 iterator: bool = False,
823 chunksize: int | None = None,
824 auto_close: bool = False,
825 ):
826 """
827 Retrieve pandas object stored in file, optionally based on where criteria.
828
829 .. warning::
830
831 Pandas uses PyTables for reading and writing HDF5 files, which allows
832 serializing object-dtype data with pickle when using the "fixed" format.
833 Loading pickled data received from untrusted sources can be unsafe.
834
835 See: https://docs.python.org/3/library/pickle.html for more.
836
837 Parameters
838 ----------
839 key : str
840 Object being retrieved from file.
841 where : list or None
842 List of Term (or convertible) objects, optional.
843 start : int or None
844 Row number to start selection.
845 stop : int, default None
846 Row number to stop selection.
847 columns : list or None
848 A list of columns that if not None, will limit the return columns.
        iterator : bool, default False
            Return an iterator object.
        chunksize : int or None
            Number of rows to include in an iteration, return an iterator.
        auto_close : bool, default False
            Should automatically close the store when finished.
855
856 Returns
857 -------
858 object
859 Retrieved object from file.
860
861 Examples
862 --------
863 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
864 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
865 >>> store.put('data', df) # doctest: +SKIP
866 >>> store.get('data') # doctest: +SKIP
867 >>> print(store.keys()) # doctest: +SKIP
868 ['/data1', '/data2']
869 >>> store.select('/data1') # doctest: +SKIP
870 A B
871 0 1 2
872 1 3 4
873 >>> store.select('/data1', where='columns == A') # doctest: +SKIP
874 A
875 0 1
876 1 3
877 >>> store.close() # doctest: +SKIP
878 """
879 group = self.get_node(key)
880 if group is None:
881 raise KeyError(f"No object named {key} in the file")
882
883 # create the storer and axes
884 where = _ensure_term(where, scope_level=1)
885 s = self._create_storer(group)
886 s.infer_axes()
887
888 # function to call on iteration
889 def func(_start, _stop, _where):
890 return s.read(start=_start, stop=_stop, where=_where, columns=columns)
891
892 # create the iterator
893 it = TableIterator(
894 self,
895 s,
896 func,
897 where=where,
898 nrows=s.nrows,
899 start=start,
900 stop=stop,
901 iterator=iterator,
902 chunksize=chunksize,
903 auto_close=auto_close,
904 )
905
906 return it.get_result()
907
908 def select_as_coordinates(
909 self,
910 key: str,
911 where=None,
912 start: int | None = None,
913 stop: int | None = None,
914 ):
915 """
916 return the selection as an Index
917
918 .. warning::
919
920 Pandas uses PyTables for reading and writing HDF5 files, which allows
921 serializing object-dtype data with pickle when using the "fixed" format.
922 Loading pickled data received from untrusted sources can be unsafe.
923
924 See: https://docs.python.org/3/library/pickle.html for more.
925
926
927 Parameters
928 ----------
929 key : str
930 where : list of Term (or convertible) objects, optional
931 start : integer (defaults to None), row number to start selection
932 stop : integer (defaults to None), row number to stop selection
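
        Examples
        --------
        A minimal sketch assuming a table-format node with a data column 'A';
        the file name, key, and column name are illustrative only.

        >>> df = pd.DataFrame({'A': range(5)})
        >>> store = pd.HDFStore("store.h5", 'w')  # doctest: +SKIP
        >>> store.append('data', df, data_columns=['A'])  # doctest: +SKIP
        >>> coords = store.select_as_coordinates('data', 'A > 2')  # doctest: +SKIP
        >>> store.select('data', where=coords)  # doctest: +SKIP
        >>> store.close()  # doctest: +SKIP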
933 """
934 where = _ensure_term(where, scope_level=1)
935 tbl = self.get_storer(key)
936 if not isinstance(tbl, Table):
937 raise TypeError("can only read_coordinates with a table")
938 return tbl.read_coordinates(where=where, start=start, stop=stop)
939
940 def select_column(
941 self,
942 key: str,
943 column: str,
944 start: int | None = None,
945 stop: int | None = None,
946 ):
947 """
        Return a single column from the table. This is generally only useful
        to select an indexable column.
950
951 .. warning::
952
953 Pandas uses PyTables for reading and writing HDF5 files, which allows
954 serializing object-dtype data with pickle when using the "fixed" format.
955 Loading pickled data received from untrusted sources can be unsafe.
956
957 See: https://docs.python.org/3/library/pickle.html for more.
958
959 Parameters
960 ----------
961 key : str
962 column : str
963 The column of interest.
964 start : int or None, default None
965 stop : int or None, default None
966
967 Raises
968 ------
969 raises KeyError if the column is not found (or key is not a valid
970 store)
971 raises ValueError if the column can not be extracted individually (it
972 is part of a data block)
973
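        Examples
        --------
        A minimal sketch; the key and column names are illustrative only, and
        the column must have been stored as a data column.

        >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        >>> store = pd.HDFStore("store.h5", 'w')  # doctest: +SKIP
        >>> store.append('data', df, data_columns=['A'])  # doctest: +SKIP
        >>> store.select_column('data', 'A')  # doctest: +SKIP
        >>> store.close()  # doctest: +SKIP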
974 """
975 tbl = self.get_storer(key)
976 if not isinstance(tbl, Table):
977 raise TypeError("can only read_column with a table")
978 return tbl.read_column(column=column, start=start, stop=stop)
979
980 def select_as_multiple(
981 self,
982 keys,
983 where=None,
984 selector=None,
985 columns=None,
986 start=None,
987 stop=None,
988 iterator: bool = False,
989 chunksize: int | None = None,
990 auto_close: bool = False,
991 ):
992 """
993 Retrieve pandas objects from multiple tables.
994
995 .. warning::
996
997 Pandas uses PyTables for reading and writing HDF5 files, which allows
998 serializing object-dtype data with pickle when using the "fixed" format.
999 Loading pickled data received from untrusted sources can be unsafe.
1000
1001 See: https://docs.python.org/3/library/pickle.html for more.
1002
1003 Parameters
1004 ----------
1005 keys : a list of the tables
1006 selector : the table to apply the where criteria (defaults to keys[0]
1007 if not supplied)
        columns : the columns to return
1009 start : integer (defaults to None), row number to start selection
1010 stop : integer (defaults to None), row number to stop selection
1011 iterator : bool, return an iterator, default False
1012 chunksize : nrows to include in iteration, return an iterator
1013 auto_close : bool, default False
1014 Should automatically close the store when finished.
1015
1016 Raises
1017 ------
1018 raises KeyError if keys or selector is not found or keys is empty
1019 raises TypeError if keys is not a list or tuple
1020 raises ValueError if the tables are not ALL THE SAME DIMENSIONS
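
        Examples
        --------
        A minimal sketch of selecting from two tables that share the same rows;
        the keys and column names are illustrative only.

        >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
        >>> store = pd.HDFStore("store.h5", 'w')  # doctest: +SKIP
        >>> store.append('table_a', df[['A']], data_columns=['A'])  # doctest: +SKIP
        >>> store.append('table_b', df[['B']])  # doctest: +SKIP
        >>> store.select_as_multiple(
        ...     ['table_a', 'table_b'], where='A > 1', selector='table_a'
        ... )  # doctest: +SKIP
        >>> store.close()  # doctest: +SKIP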
1021 """
1022 # default to single select
1023 where = _ensure_term(where, scope_level=1)
1024 if isinstance(keys, (list, tuple)) and len(keys) == 1:
1025 keys = keys[0]
1026 if isinstance(keys, str):
1027 return self.select(
1028 key=keys,
1029 where=where,
1030 columns=columns,
1031 start=start,
1032 stop=stop,
1033 iterator=iterator,
1034 chunksize=chunksize,
1035 auto_close=auto_close,
1036 )
1037
1038 if not isinstance(keys, (list, tuple)):
1039 raise TypeError("keys must be a list/tuple")
1040
1041 if not len(keys):
1042 raise ValueError("keys must have a non-zero length")
1043
1044 if selector is None:
1045 selector = keys[0]
1046
1047 # collect the tables
1048 tbls = [self.get_storer(k) for k in keys]
1049 s = self.get_storer(selector)
1050
1051 # validate rows
1052 nrows = None
1053 for t, k in itertools.chain([(s, selector)], zip(tbls, keys)):
1054 if t is None:
1055 raise KeyError(f"Invalid table [{k}]")
1056 if not t.is_table:
1057 raise TypeError(
                    f"object [{t.pathname}] is not a table, and cannot be used in "
                    "select_as_multiple"
1060 )
1061
1062 if nrows is None:
1063 nrows = t.nrows
1064 elif t.nrows != nrows:
1065 raise ValueError("all tables must have exactly the same nrows!")
1066
1067 # The isinstance checks here are redundant with the check above,
1068 # but necessary for mypy; see GH#29757
1069 _tbls = [x for x in tbls if isinstance(x, Table)]
1070
        # axis is the concatenation axis
1072 axis = {t.non_index_axes[0][0] for t in _tbls}.pop()
1073
1074 def func(_start, _stop, _where):
1075 # retrieve the objs, _where is always passed as a set of
1076 # coordinates here
1077 objs = [
1078 t.read(where=_where, columns=columns, start=_start, stop=_stop)
1079 for t in tbls
1080 ]
1081
1082 # concat and return
1083 return concat(objs, axis=axis, verify_integrity=False)._consolidate()
1084
1085 # create the iterator
1086 it = TableIterator(
1087 self,
1088 s,
1089 func,
1090 where=where,
1091 nrows=nrows,
1092 start=start,
1093 stop=stop,
1094 iterator=iterator,
1095 chunksize=chunksize,
1096 auto_close=auto_close,
1097 )
1098
1099 return it.get_result(coordinates=True)
1100
1101 def put(
1102 self,
1103 key: str,
1104 value: DataFrame | Series,
1105 format=None,
1106 index: bool = True,
1107 append: bool = False,
1108 complib=None,
1109 complevel: int | None = None,
1110 min_itemsize: int | dict[str, int] | None = None,
1111 nan_rep=None,
1112 data_columns: Literal[True] | list[str] | None = None,
1113 encoding=None,
1114 errors: str = "strict",
1115 track_times: bool = True,
1116 dropna: bool = False,
1117 ) -> None:
1118 """
1119 Store object in HDFStore.
1120
1121 Parameters
1122 ----------
1123 key : str
1124 value : {Series, DataFrame}
1125 format : 'fixed(f)|table(t)', default is 'fixed'
1126 Format to use when storing object in HDFStore. Value can be one of:
1127
1128 ``'fixed'``
1129 Fixed format. Fast writing/reading. Not-appendable, nor searchable.
1130 ``'table'``
1131 Table format. Write as a PyTables Table structure which may perform
1132 worse but allow more flexible operations like searching / selecting
1133 subsets of the data.
1134 index : bool, default True
1135 Write DataFrame index as a column.
1136 append : bool, default False
1137 This will force Table format, append the input data to the existing.
1138 data_columns : list of columns or True, default None
1139 List of columns to create as data columns, or True to use all columns.
1140 See `here
1141 <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
1142 encoding : str, default None
1143 Provide an encoding for strings.
1144 track_times : bool, default True
1145 Parameter is propagated to 'create_table' method of 'PyTables'.
            If set to False, it makes it possible to produce identical h5 files
            (same hashes) independent of creation time.
1148 dropna : bool, default False, optional
1149 Remove missing values.
1150
1151 Examples
1152 --------
1153 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
1154 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
1155 >>> store.put('data', df) # doctest: +SKIP
1156 """
1157 if format is None:
1158 format = get_option("io.hdf.default_format") or "fixed"
1159 format = self._validate_format(format)
1160 self._write_to_group(
1161 key,
1162 value,
1163 format=format,
1164 index=index,
1165 append=append,
1166 complib=complib,
1167 complevel=complevel,
1168 min_itemsize=min_itemsize,
1169 nan_rep=nan_rep,
1170 data_columns=data_columns,
1171 encoding=encoding,
1172 errors=errors,
1173 track_times=track_times,
1174 dropna=dropna,
1175 )
1176
1177 def remove(self, key: str, where=None, start=None, stop=None) -> None:
1178 """
1179 Remove pandas object partially by specifying the where condition
1180
1181 Parameters
1182 ----------
1183 key : str
1184 Node to remove or delete rows from
1185 where : list of Term (or convertible) objects, optional
1186 start : integer (defaults to None), row number to start selection
1187 stop : integer (defaults to None), row number to stop selection
1188
1189 Returns
1190 -------
1191 number of rows removed (or None if not a Table)
1192
1193 Raises
1194 ------
1195 raises KeyError if key is not a valid store
1196
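        Examples
        --------
        A minimal sketch; the key and where clause are illustrative only, and a
        where clause requires the node to be stored in table format.

        >>> df = pd.DataFrame({'A': range(5)})
        >>> store = pd.HDFStore("store.h5", 'w')  # doctest: +SKIP
        >>> store.append('data', df, data_columns=['A'])  # doctest: +SKIP
        >>> store.remove('data', where='A > 2')  # doctest: +SKIP
        >>> store.remove('data')  # doctest: +SKIP
        >>> store.close()  # doctest: +SKIP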
1197 """
1198 where = _ensure_term(where, scope_level=1)
1199 try:
1200 s = self.get_storer(key)
1201 except KeyError:
1202 # the key is not a valid store, re-raising KeyError
1203 raise
1204 except AssertionError:
1205 # surface any assertion errors for e.g. debugging
1206 raise
1207 except Exception as err:
1208 # In tests we get here with ClosedFileError, TypeError, and
1209 # _table_mod.NoSuchNodeError. TODO: Catch only these?
1210
1211 if where is not None:
1212 raise ValueError(
1213 "trying to remove a node with a non-None where clause!"
1214 ) from err
1215
1216 # we are actually trying to remove a node (with children)
1217 node = self.get_node(key)
1218 if node is not None:
1219 node._f_remove(recursive=True)
1220 return None
1221
1222 # remove the node
1223 if com.all_none(where, start, stop):
1224 s.group._f_remove(recursive=True)
1225
1226 # delete from the table
1227 else:
1228 if not s.is_table:
1229 raise ValueError(
1230 "can only remove with where on objects written as tables"
1231 )
1232 return s.delete(where=where, start=start, stop=stop)
1233
1234 def append(
1235 self,
1236 key: str,
1237 value: DataFrame | Series,
1238 format=None,
1239 axes=None,
1240 index: bool | list[str] = True,
1241 append: bool = True,
1242 complib=None,
1243 complevel: int | None = None,
1244 columns=None,
1245 min_itemsize: int | dict[str, int] | None = None,
1246 nan_rep=None,
1247 chunksize: int | None = None,
1248 expectedrows=None,
1249 dropna: bool | None = None,
1250 data_columns: Literal[True] | list[str] | None = None,
1251 encoding=None,
1252 errors: str = "strict",
1253 ) -> None:
1254 """
1255 Append to Table in file.
1256
1257 Node must already exist and be Table format.
1258
1259 Parameters
1260 ----------
1261 key : str
1262 value : {Series, DataFrame}
1263 format : 'table' is the default
1264 Format to use when storing object in HDFStore. Value can be one of:
1265
1266 ``'table'``
1267 Table format. Write as a PyTables Table structure which may perform
1268 worse but allow more flexible operations like searching / selecting
1269 subsets of the data.
1270 index : bool, default True
1271 Write DataFrame index as a column.
1272 append : bool, default True
1273 Append the input data to the existing.
1274 data_columns : list of columns, or True, default None
1275 List of columns to create as indexed data columns for on-disk
1276 queries, or True to use all columns. By default only the axes
1277 of the object are indexed. See `here
1278 <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
1279 min_itemsize : dict of columns that specify minimum str sizes
1280 nan_rep : str to use as str nan representation
1281 chunksize : size to chunk the writing
1282 expectedrows : expected TOTAL row size of this table
1283 encoding : default None, provide an encoding for str
1284 dropna : bool, default False, optional
            Do not write an ALL nan row to the store; settable
            by the option 'io.hdf.dropna_table'.
1287
1288 Notes
1289 -----
1290 Does *not* check if data being appended overlaps with existing
1291 data in the table, so be careful
1292
1293 Examples
1294 --------
1295 >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
1296 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
1297 >>> store.put('data', df1, format='table') # doctest: +SKIP
1298 >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=['A', 'B'])
1299 >>> store.append('data', df2) # doctest: +SKIP
1300 >>> store.close() # doctest: +SKIP
1301 A B
1302 0 1 2
1303 1 3 4
1304 0 5 6
1305 1 7 8
1306 """
1307 if columns is not None:
1308 raise TypeError(
1309 "columns is not a supported keyword in append, try data_columns"
1310 )
1311
1312 if dropna is None:
1313 dropna = get_option("io.hdf.dropna_table")
1314 if format is None:
1315 format = get_option("io.hdf.default_format") or "table"
1316 format = self._validate_format(format)
1317 self._write_to_group(
1318 key,
1319 value,
1320 format=format,
1321 axes=axes,
1322 index=index,
1323 append=append,
1324 complib=complib,
1325 complevel=complevel,
1326 min_itemsize=min_itemsize,
1327 nan_rep=nan_rep,
1328 chunksize=chunksize,
1329 expectedrows=expectedrows,
1330 dropna=dropna,
1331 data_columns=data_columns,
1332 encoding=encoding,
1333 errors=errors,
1334 )
1335
1336 def append_to_multiple(
1337 self,
1338 d: dict,
1339 value,
1340 selector,
1341 data_columns=None,
1342 axes=None,
1343 dropna: bool = False,
1344 **kwargs,
1345 ) -> None:
1346 """
1347 Append to multiple tables
1348
1349 Parameters
1350 ----------
1351 d : a dict of table_name to table_columns, None is acceptable as the
1352 values of one node (this will get all the remaining columns)
1353 value : a pandas object
1354 selector : a string that designates the indexable table; all of its
1355 columns will be designed as data_columns, unless data_columns is
1356 passed, in which case these are used
1357 data_columns : list of columns to create as data columns, or True to
1358 use all columns
        dropna : bool, default False
            If True, drop rows from all tables if any single row in each table
            has all NaN.
1361
1362 Notes
1363 -----
1364 axes parameter is currently not accepted
1365
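        Examples
        --------
        A minimal sketch; the table names and column split are illustrative only.

        >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]})
        >>> store = pd.HDFStore("store.h5", 'w')  # doctest: +SKIP
        >>> store.append_to_multiple(
        ...     {'table_ab': ['A', 'B'], 'table_c': None}, df, selector='table_ab'
        ... )  # doctest: +SKIP
        >>> store.select_as_multiple(
        ...     ['table_ab', 'table_c'], selector='table_ab'
        ... )  # doctest: +SKIP
        >>> store.close()  # doctest: +SKIP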
1366 """
1367 if axes is not None:
1368 raise TypeError(
1369 "axes is currently not accepted as a parameter to append_to_multiple; "
1370 "you can create the tables independently instead"
1371 )
1372
1373 if not isinstance(d, dict):
1374 raise ValueError(
1375 "append_to_multiple must have a dictionary specified as the "
1376 "way to split the value"
1377 )
1378
1379 if selector not in d:
1380 raise ValueError(
1381 "append_to_multiple requires a selector that is in passed dict"
1382 )
1383
1384 # figure out the splitting axis (the non_index_axis)
1385 axis = next(iter(set(range(value.ndim)) - set(_AXES_MAP[type(value)])))
1386
1387 # figure out how to split the value
1388 remain_key = None
1389 remain_values: list = []
1390 for k, v in d.items():
1391 if v is None:
1392 if remain_key is not None:
1393 raise ValueError(
1394 "append_to_multiple can only have one value in d that is None"
1395 )
1396 remain_key = k
1397 else:
1398 remain_values.extend(v)
1399 if remain_key is not None:
1400 ordered = value.axes[axis]
1401 ordd = ordered.difference(Index(remain_values))
1402 ordd = sorted(ordered.get_indexer(ordd))
1403 d[remain_key] = ordered.take(ordd)
1404
1405 # data_columns
1406 if data_columns is None:
1407 data_columns = d[selector]
1408
1409 # ensure rows are synchronized across the tables
1410 if dropna:
1411 idxs = (value[cols].dropna(how="all").index for cols in d.values())
1412 valid_index = next(idxs)
1413 for index in idxs:
1414 valid_index = valid_index.intersection(index)
1415 value = value.loc[valid_index]
1416
1417 min_itemsize = kwargs.pop("min_itemsize", None)
1418
1419 # append
1420 for k, v in d.items():
1421 dc = data_columns if k == selector else None
1422
1423 # compute the val
1424 val = value.reindex(v, axis=axis)
1425
1426 filtered = (
1427 {key: value for (key, value) in min_itemsize.items() if key in v}
1428 if min_itemsize is not None
1429 else None
1430 )
1431 self.append(k, val, data_columns=dc, min_itemsize=filtered, **kwargs)
1432
1433 def create_table_index(
1434 self,
1435 key: str,
1436 columns=None,
1437 optlevel: int | None = None,
1438 kind: str | None = None,
1439 ) -> None:
1440 """
1441 Create a pytables index on the table.
1442
1443 Parameters
1444 ----------
1445 key : str
1446 columns : None, bool, or listlike[str]
1447 Indicate which columns to create an index on.
1448
1449 * False : Do not create any indexes.
1450 * True : Create indexes on all columns.
1451 * None : Create indexes on all columns.
1452 * listlike : Create indexes on the given columns.
1453
1454 optlevel : int or None, default None
1455 Optimization level, if None, pytables defaults to 6.
1456 kind : str or None, default None
1457 Kind of index, if None, pytables defaults to "medium".
1458
1459 Raises
1460 ------
1461 TypeError: raises if the node is not a table
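
        Examples
        --------
        A minimal sketch; the key and columns are illustrative only, and the
        node must be stored in table format.

        >>> df = pd.DataFrame({'A': range(5), 'B': range(5)})
        >>> store = pd.HDFStore("store.h5", 'w')  # doctest: +SKIP
        >>> store.append('data', df, data_columns=['B'])  # doctest: +SKIP
        >>> store.create_table_index('data', columns=['B'],
        ...                          kind='full')  # doctest: +SKIP
        >>> store.close()  # doctest: +SKIP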
1462 """
1463 # version requirements
1464 _tables()
1465 s = self.get_storer(key)
1466 if s is None:
1467 return
1468
1469 if not isinstance(s, Table):
1470 raise TypeError("cannot create table index on a Fixed format store")
1471 s.create_index(columns=columns, optlevel=optlevel, kind=kind)
1472
1473 def groups(self) -> list:
1474 """
1475 Return a list of all the top-level nodes.
1476
1477 Each node returned is not a pandas storage object.
1478
1479 Returns
1480 -------
1481 list
1482 List of objects.
1483
1484 Examples
1485 --------
1486 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
1487 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
1488 >>> store.put('data', df) # doctest: +SKIP
1489 >>> print(store.groups()) # doctest: +SKIP
1490 >>> store.close() # doctest: +SKIP
1491 [/data (Group) ''
1492 children := ['axis0' (Array), 'axis1' (Array), 'block0_values' (Array),
1493 'block0_items' (Array)]]
1494 """
1495 _tables()
1496 self._check_if_open()
1497 assert self._handle is not None # for mypy
1498 assert _table_mod is not None # for mypy
1499 return [
1500 g
1501 for g in self._handle.walk_groups()
1502 if (
1503 not isinstance(g, _table_mod.link.Link)
1504 and (
1505 getattr(g._v_attrs, "pandas_type", None)
1506 or getattr(g, "table", None)
1507 or (isinstance(g, _table_mod.table.Table) and g._v_name != "table")
1508 )
1509 )
1510 ]
1511
1512 def walk(self, where: str = "/") -> Iterator[tuple[str, list[str], list[str]]]:
1513 """
1514 Walk the pytables group hierarchy for pandas objects.
1515
1516 This generator will yield the group path, subgroups and pandas object
1517 names for each group.
1518
1519 Any non-pandas PyTables objects that are not a group will be ignored.
1520
1521 The `where` group itself is listed first (preorder), then each of its
1522 child groups (following an alphanumerical order) is also traversed,
1523 following the same procedure.
1524
1525 Parameters
1526 ----------
1527 where : str, default "/"
1528 Group where to start walking.
1529
1530 Yields
1531 ------
1532 path : str
1533 Full path to a group (without trailing '/').
1534 groups : list
1535 Names (strings) of the groups contained in `path`.
1536 leaves : list
1537 Names (strings) of the pandas objects contained in `path`.
1538
1539 Examples
1540 --------
1541 >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
1542 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
1543 >>> store.put('data', df1, format='table') # doctest: +SKIP
1544 >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=['A', 'B'])
1545 >>> store.append('data', df2) # doctest: +SKIP
1546 >>> store.close() # doctest: +SKIP
1547 >>> for group in store.walk(): # doctest: +SKIP
1548 ... print(group) # doctest: +SKIP
1549 >>> store.close() # doctest: +SKIP
1550 """
1551 _tables()
1552 self._check_if_open()
1553 assert self._handle is not None # for mypy
1554 assert _table_mod is not None # for mypy
1555
1556 for g in self._handle.walk_groups(where):
1557 if getattr(g._v_attrs, "pandas_type", None) is not None:
1558 continue
1559
1560 groups = []
1561 leaves = []
1562 for child in g._v_children.values():
1563 pandas_type = getattr(child._v_attrs, "pandas_type", None)
1564 if pandas_type is None:
1565 if isinstance(child, _table_mod.group.Group):
1566 groups.append(child._v_name)
1567 else:
1568 leaves.append(child._v_name)
1569
1570 yield (g._v_pathname.rstrip("/"), groups, leaves)
1571
1572 def get_node(self, key: str) -> Node | None:
1573 """return the node with the key or None if it does not exist"""
1574 self._check_if_open()
1575 if not key.startswith("/"):
1576 key = "/" + key
1577
1578 assert self._handle is not None
1579 assert _table_mod is not None # for mypy
1580 try:
1581 node = self._handle.get_node(self.root, key)
1582 except _table_mod.exceptions.NoSuchNodeError:
1583 return None
1584
1585 assert isinstance(node, _table_mod.Node), type(node)
1586 return node
1587
1588 def get_storer(self, key: str) -> GenericFixed | Table:
1589 """return the storer object for a key, raise if not in the file"""
1590 group = self.get_node(key)
1591 if group is None:
1592 raise KeyError(f"No object named {key} in the file")
1593
1594 s = self._create_storer(group)
1595 s.infer_axes()
1596 return s
1597
1598 def copy(
1599 self,
1600 file,
1601 mode: str = "w",
1602 propindexes: bool = True,
1603 keys=None,
1604 complib=None,
1605 complevel: int | None = None,
1606 fletcher32: bool = False,
1607 overwrite: bool = True,
1608 ) -> HDFStore:
1609 """
        Copy the existing store to a new file and return a handle to the new store.
1611
1612 Parameters
1613 ----------
1614 propindexes : bool, default True
1615 Restore indexes in copied file.
1616 keys : list, optional
1617 List of keys to include in the copy (defaults to all).
1618 overwrite : bool, default True
1619 Whether to overwrite (remove and replace) existing nodes in the new store.
1620 mode, complib, complevel, fletcher32 same as in HDFStore.__init__
1621
1622 Returns
1623 -------
1624 open file handle of the new store
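
        Examples
        --------
        A minimal sketch; the file names and compression settings are
        illustrative only.

        >>> df = pd.DataFrame({'A': range(5)})
        >>> store = pd.HDFStore("store.h5", 'w')  # doctest: +SKIP
        >>> store.put('data', df, format='table')  # doctest: +SKIP
        >>> new_store = store.copy('copy.h5', complib='blosc',
        ...                        complevel=1)  # doctest: +SKIP
        >>> new_store.close()  # doctest: +SKIP
        >>> store.close()  # doctest: +SKIP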
1625 """
1626 new_store = HDFStore(
1627 file, mode=mode, complib=complib, complevel=complevel, fletcher32=fletcher32
1628 )
1629 if keys is None:
1630 keys = list(self.keys())
1631 if not isinstance(keys, (tuple, list)):
1632 keys = [keys]
1633 for k in keys:
1634 s = self.get_storer(k)
1635 if s is not None:
1636 if k in new_store:
1637 if overwrite:
1638 new_store.remove(k)
1639
1640 data = self.select(k)
1641 if isinstance(s, Table):
1642 index: bool | list[str] = False
1643 if propindexes:
1644 index = [a.name for a in s.axes if a.is_indexed]
1645 new_store.append(
1646 k,
1647 data,
1648 index=index,
1649 data_columns=getattr(s, "data_columns", None),
1650 encoding=s.encoding,
1651 )
1652 else:
1653 new_store.put(k, data, encoding=s.encoding)
1654
1655 return new_store
1656
1657 def info(self) -> str:
1658 """
1659 Print detailed information on the store.
1660
1661 Returns
1662 -------
1663 str
1664
1665 Examples
1666 --------
1667 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
1668 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
1669 >>> store.put('data', df) # doctest: +SKIP
1670 >>> print(store.info()) # doctest: +SKIP
1671 >>> store.close() # doctest: +SKIP
1672 <class 'pandas.io.pytables.HDFStore'>
1673 File path: store.h5
1674 /data frame (shape->[2,2])
1675 """
1676 path = pprint_thing(self._path)
1677 output = f"{type(self)}\nFile path: {path}\n"
1678
1679 if self.is_open:
1680 lkeys = sorted(self.keys())
1681 if len(lkeys):
1682 keys = []
1683 values = []
1684
1685 for k in lkeys:
1686 try:
1687 s = self.get_storer(k)
1688 if s is not None:
1689 keys.append(pprint_thing(s.pathname or k))
1690 values.append(pprint_thing(s or "invalid_HDFStore node"))
1691 except AssertionError:
1692 # surface any assertion errors for e.g. debugging
1693 raise
1694 except Exception as detail:
1695 keys.append(k)
1696 dstr = pprint_thing(detail)
1697 values.append(f"[invalid_HDFStore node: {dstr}]")
1698
1699 output += adjoin(12, keys, values)
1700 else:
1701 output += "Empty"
1702 else:
1703 output += "File is CLOSED"
1704
1705 return output
1706
1707 # ------------------------------------------------------------------------
1708 # private methods
1709
1710 def _check_if_open(self) -> None:
1711 if not self.is_open:
1712 raise ClosedFileError(f"{self._path} file is not open!")
1713
1714 def _validate_format(self, format: str) -> str:
1715 """validate / deprecate formats"""
1716 # validate
1717 try:
1718 format = _FORMAT_MAP[format.lower()]
1719 except KeyError as err:
1720 raise TypeError(f"invalid HDFStore format specified [{format}]") from err
1721
1722 return format
1723
1724 def _create_storer(
1725 self,
1726 group,
1727 format=None,
1728 value: DataFrame | Series | None = None,
1729 encoding: str = "UTF-8",
1730 errors: str = "strict",
1731 ) -> GenericFixed | Table:
1732 """return a suitable class to operate"""
1733 cls: type[GenericFixed | Table]
1734
1735 if value is not None and not isinstance(value, (Series, DataFrame)):
1736 raise TypeError("value must be None, Series, or DataFrame")
1737
1738 pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None))
1739 tt = _ensure_decoded(getattr(group._v_attrs, "table_type", None))
1740
1741 # infer the pt from the passed value
1742 if pt is None:
1743 if value is None:
1744 _tables()
1745 assert _table_mod is not None # for mypy
1746 if getattr(group, "table", None) or isinstance(
1747 group, _table_mod.table.Table
1748 ):
1749 pt = "frame_table"
1750 tt = "generic_table"
1751 else:
1752 raise TypeError(
                        "cannot create a storer if the object does not exist "
                        "and no value is passed"
1755 )
1756 else:
1757 if isinstance(value, Series):
1758 pt = "series"
1759 else:
1760 pt = "frame"
1761
1762 # we are actually a table
1763 if format == "table":
1764 pt += "_table"
1765
1766 # a storer node
1767 if "table" not in pt:
1768 _STORER_MAP = {"series": SeriesFixed, "frame": FrameFixed}
1769 try:
1770 cls = _STORER_MAP[pt]
1771 except KeyError as err:
1772 raise TypeError(
1773 f"cannot properly create the storer for: [_STORER_MAP] [group->"
                    f"{group},value->{type(value)},format->{format}]"
1775 ) from err
1776 return cls(self, group, encoding=encoding, errors=errors)
1777
1778 # existing node (and must be a table)
1779 if tt is None:
1780 # if we are a writer, determine the tt
1781 if value is not None:
1782 if pt == "series_table":
1783 index = getattr(value, "index", None)
1784 if index is not None:
1785 if index.nlevels == 1:
1786 tt = "appendable_series"
1787 elif index.nlevels > 1:
1788 tt = "appendable_multiseries"
1789 elif pt == "frame_table":
1790 index = getattr(value, "index", None)
1791 if index is not None:
1792 if index.nlevels == 1:
1793 tt = "appendable_frame"
1794 elif index.nlevels > 1:
1795 tt = "appendable_multiframe"
1796
1797 _TABLE_MAP = {
1798 "generic_table": GenericTable,
1799 "appendable_series": AppendableSeriesTable,
1800 "appendable_multiseries": AppendableMultiSeriesTable,
1801 "appendable_frame": AppendableFrameTable,
1802 "appendable_multiframe": AppendableMultiFrameTable,
1803 "worm": WORMTable,
1804 }
1805 try:
1806 cls = _TABLE_MAP[tt]
1807 except KeyError as err:
1808 raise TypeError(
1809 f"cannot properly create the storer for: [_TABLE_MAP] [group->"
                f"{group},value->{type(value)},format->{format}]"
1811 ) from err
1812
1813 return cls(self, group, encoding=encoding, errors=errors)
1814
1815 def _write_to_group(
1816 self,
1817 key: str,
1818 value: DataFrame | Series,
1819 format,
1820 axes=None,
1821 index: bool | list[str] = True,
1822 append: bool = False,
1823 complib=None,
1824 complevel: int | None = None,
1825 fletcher32=None,
1826 min_itemsize: int | dict[str, int] | None = None,
1827 chunksize: int | None = None,
1828 expectedrows=None,
1829 dropna: bool = False,
1830 nan_rep=None,
1831 data_columns=None,
1832 encoding=None,
1833 errors: str = "strict",
1834 track_times: bool = True,
1835 ) -> None:
1836 # we don't want to store a table node at all if our object is 0-len
        # as there are no dtypes
1838 if getattr(value, "empty", None) and (format == "table" or append):
1839 return
1840
1841 group = self._identify_group(key, append)
1842
1843 s = self._create_storer(group, format, value, encoding=encoding, errors=errors)
1844 if append:
1845 # raise if we are trying to append to a Fixed format,
1846 # or a table that exists (and we are putting)
1847 if not s.is_table or (s.is_table and format == "fixed" and s.is_exists):
1848 raise ValueError("Can only append to Tables")
1849 if not s.is_exists:
1850 s.set_object_info()
1851 else:
1852 s.set_object_info()
1853
1854 if not s.is_table and complib:
1855 raise ValueError("Compression not supported on Fixed format stores")
1856
1857 # write the object
1858 s.write(
1859 obj=value,
1860 axes=axes,
1861 append=append,
1862 complib=complib,
1863 complevel=complevel,
1864 fletcher32=fletcher32,
1865 min_itemsize=min_itemsize,
1866 chunksize=chunksize,
1867 expectedrows=expectedrows,
1868 dropna=dropna,
1869 nan_rep=nan_rep,
1870 data_columns=data_columns,
1871 track_times=track_times,
1872 )
1873
1874 if isinstance(s, Table) and index:
1875 s.create_index(columns=index)
1876
1877 def _read_group(self, group: Node):
1878 s = self._create_storer(group)
1879 s.infer_axes()
1880 return s.read()
1881
1882 def _identify_group(self, key: str, append: bool) -> Node:
1883 """Identify HDF5 group based on key, delete/create group if needed."""
1884 group = self.get_node(key)
1885
1886 # we make this assertion for mypy; the get_node call will already
1887 # have raised if this is incorrect
1888 assert self._handle is not None
1889
1890 # remove the node if we are not appending
1891 if group is not None and not append:
1892 self._handle.remove_node(group, recursive=True)
1893 group = None
1894
1895 if group is None:
1896 group = self._create_nodes_and_group(key)
1897
1898 return group
1899
1900 def _create_nodes_and_group(self, key: str) -> Node:
1901 """Create nodes from key and return group name."""
1902 # assertion for mypy
1903 assert self._handle is not None
1904
1905 paths = key.split("/")
1906 # recursively create the groups
1907 path = "/"
1908 for p in paths:
1909 if not len(p):
1910 continue
1911 new_path = path
1912 if not path.endswith("/"):
1913 new_path += "/"
1914 new_path += p
1915 group = self.get_node(new_path)
1916 if group is None:
1917 group = self._handle.create_group(path, p)
1918 path = new_path
1919 return group
1920
1921
1922class TableIterator:
1923 """
1924 Define the iteration interface on a table
1925
1926 Parameters
1927 ----------
1928 store : HDFStore
1929 s : the referred storer
1930 func : the function to execute the query
1931 where : the where of the query
1932 nrows : the rows to iterate on
1933 start : the passed start value (default is None)
1934 stop : the passed stop value (default is None)
1935 iterator : bool, default False
1936 Whether to use the default iterator.
1937 chunksize : the passed chunking value (default is 100000)
1938 auto_close : bool, default False
1939 Whether to automatically close the store at the end of iteration.
1940 """
1941
1942 chunksize: int | None
1943 store: HDFStore
1944 s: GenericFixed | Table
1945
1946 def __init__(
1947 self,
1948 store: HDFStore,
1949 s: GenericFixed | Table,
1950 func,
1951 where,
1952 nrows,
1953 start=None,
1954 stop=None,
1955 iterator: bool = False,
1956 chunksize: int | None = None,
1957 auto_close: bool = False,
1958 ) -> None:
1959 self.store = store
1960 self.s = s
1961 self.func = func
1962 self.where = where
1963
1964 # set start/stop if they are not set if we are a table
1965 if self.s.is_table:
1966 if nrows is None:
1967 nrows = 0
1968 if start is None:
1969 start = 0
1970 if stop is None:
1971 stop = nrows
1972 stop = min(nrows, stop)
1973
1974 self.nrows = nrows
1975 self.start = start
1976 self.stop = stop
1977
1978 self.coordinates = None
1979 if iterator or chunksize is not None:
1980 if chunksize is None:
1981 chunksize = 100000
1982 self.chunksize = int(chunksize)
1983 else:
1984 self.chunksize = None
1985
1986 self.auto_close = auto_close
1987
1988 def __iter__(self) -> Iterator:
1989 # iterate
1990 current = self.start
1991 if self.coordinates is None:
1992 raise ValueError("Cannot iterate until get_result is called.")
1993 while current < self.stop:
1994 stop = min(current + self.chunksize, self.stop)
1995 value = self.func(None, None, self.coordinates[current:stop])
1996 current = stop
1997 if value is None or not len(value):
1998 continue
1999
2000 yield value
2001
2002 self.close()
2003
2004 def close(self) -> None:
2005 if self.auto_close:
2006 self.store.close()
2007
2008 def get_result(self, coordinates: bool = False):
2009 # return the actual iterator
2010 if self.chunksize is not None:
2011 if not isinstance(self.s, Table):
2012 raise TypeError("can only use an iterator or chunksize on a table")
2013
2014 self.coordinates = self.s.read_coordinates(where=self.where)
2015
2016 return self
2017
2018        # if specified, read via coordinates (necessary for multiple selections)
2019 if coordinates:
2020 if not isinstance(self.s, Table):
2021 raise TypeError("can only read_coordinates on a table")
2022 where = self.s.read_coordinates(
2023 where=self.where, start=self.start, stop=self.stop
2024 )
2025 else:
2026 where = self.where
2027
2028 # directly return the result
2029 results = self.func(self.start, self.stop, where)
2030 self.close()
2031 return results
2032
2033
2034class IndexCol:
2035 """
2036 an index column description class
2037
2038 Parameters
2039 ----------
2040 axis : axis which I reference
2041    values : the ndarray-like converted values
2042 kind : a string description of this type
2043 typ : the pytables type
2044    pos    : the position of this column in the pytables table
2045
2046 """
2047
2048 is_an_indexable: bool = True
2049 is_data_indexable: bool = True
2050 _info_fields = ["freq", "tz", "index_name"]
2051
2052 def __init__(
2053 self,
2054 name: str,
2055 values=None,
2056 kind=None,
2057 typ=None,
2058 cname: str | None = None,
2059 axis=None,
2060 pos=None,
2061 freq=None,
2062 tz=None,
2063 index_name=None,
2064 ordered=None,
2065 table=None,
2066 meta=None,
2067 metadata=None,
2068 ) -> None:
2069 if not isinstance(name, str):
2070 raise ValueError("`name` must be a str.")
2071
2072 self.values = values
2073 self.kind = kind
2074 self.typ = typ
2075 self.name = name
2076 self.cname = cname or name
2077 self.axis = axis
2078 self.pos = pos
2079 self.freq = freq
2080 self.tz = tz
2081 self.index_name = index_name
2082 self.ordered = ordered
2083 self.table = table
2084 self.meta = meta
2085 self.metadata = metadata
2086
2087 if pos is not None:
2088 self.set_pos(pos)
2089
2090 # These are ensured as long as the passed arguments match the
2091 # constructor annotations.
2092 assert isinstance(self.name, str)
2093 assert isinstance(self.cname, str)
2094
2095 @property
2096 def itemsize(self) -> int:
2097 # Assumes self.typ has already been initialized
2098 return self.typ.itemsize
2099
2100 @property
2101 def kind_attr(self) -> str:
2102 return f"{self.name}_kind"
2103
2104 def set_pos(self, pos: int) -> None:
2105 """set the position of this column in the Table"""
2106 self.pos = pos
2107 if pos is not None and self.typ is not None:
2108 self.typ._v_pos = pos
2109
2110 def __repr__(self) -> str:
2111 temp = tuple(
2112 map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind))
2113 )
2114 return ",".join(
2115 [
2116 f"{key}->{value}"
2117 for key, value in zip(["name", "cname", "axis", "pos", "kind"], temp)
2118 ]
2119 )
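        # e.g. "name->index,cname->index,axis->0,pos->0,kind->datetime64"
        # (illustrative output; actual values depend on the stored column)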
2120
2121 def __eq__(self, other: object) -> bool:
2122 """compare 2 col items"""
2123 return all(
2124 getattr(self, a, None) == getattr(other, a, None)
2125 for a in ["name", "cname", "axis", "pos"]
2126 )
2127
2128 def __ne__(self, other) -> bool:
2129 return not self.__eq__(other)
2130
2131 @property
2132 def is_indexed(self) -> bool:
2133 """return whether I am an indexed column"""
2134 if not hasattr(self.table, "cols"):
2135 # e.g. if infer hasn't been called yet, self.table will be None.
2136 return False
2137 return getattr(self.table.cols, self.cname).is_indexed
2138
2139 def convert(
2140 self, values: np.ndarray, nan_rep, encoding: str, errors: str
2141 ) -> tuple[np.ndarray, np.ndarray] | tuple[Index, Index]:
2142 """
2143 Convert the data from this selection to the appropriate pandas type.
2144 """
2145 assert isinstance(values, np.ndarray), type(values)
2146
2147 # values is a recarray
2148 if values.dtype.fields is not None:
2149 # Copy, otherwise values will be a view
2150            # preventing the original recarray from being freed
2151 values = values[self.cname].copy()
2152
2153 val_kind = _ensure_decoded(self.kind)
2154 values = _maybe_convert(values, val_kind, encoding, errors)
2155 kwargs = {}
2156 kwargs["name"] = _ensure_decoded(self.index_name)
2157
2158 if self.freq is not None:
2159 kwargs["freq"] = _ensure_decoded(self.freq)
2160
2161 factory: type[Index | DatetimeIndex] = Index
2162 if lib.is_np_dtype(values.dtype, "M") or isinstance(
2163 values.dtype, DatetimeTZDtype
2164 ):
2165 factory = DatetimeIndex
2166 elif values.dtype == "i8" and "freq" in kwargs:
2167 # PeriodIndex data is stored as i8
2168 # error: Incompatible types in assignment (expression has type
2169 # "Callable[[Any, KwArg(Any)], PeriodIndex]", variable has type
2170 # "Union[Type[Index], Type[DatetimeIndex]]")
2171 factory = lambda x, **kwds: PeriodIndex.from_ordinals( # type: ignore[assignment]
2172 x, freq=kwds.get("freq", None)
2173 )._rename(
2174 kwds["name"]
2175 )
2176
2177 # making an Index instance could throw a number of different errors
2178 try:
2179 new_pd_index = factory(values, **kwargs)
2180 except ValueError:
2181            # if the output freq is different from what we recorded,
2182 # it should be None (see also 'doc example part 2')
2183 if "freq" in kwargs:
2184 kwargs["freq"] = None
2185 new_pd_index = factory(values, **kwargs)
2186 final_pd_index = _set_tz(new_pd_index, self.tz)
2187 return final_pd_index, final_pd_index
2188
2189 def take_data(self):
2190 """return the values"""
2191 return self.values
2192
2193 @property
2194 def attrs(self):
2195 return self.table._v_attrs
2196
2197 @property
2198 def description(self):
2199 return self.table.description
2200
2201 @property
2202 def col(self):
2203 """return my current col description"""
2204 return getattr(self.description, self.cname, None)
2205
2206 @property
2207 def cvalues(self):
2208 """return my cython values"""
2209 return self.values
2210
2211 def __iter__(self) -> Iterator:
2212 return iter(self.values)
2213
2214 def maybe_set_size(self, min_itemsize=None) -> None:
2215 """
2216 maybe set a string col itemsize:
2217        min_itemsize can be an integer, or a dict mapping this column's name
2218        to an integer size
2219 """
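        # e.g. min_itemsize=30 applies directly, while a dict such as
        # {"<column name>": 30} (placeholder key) only applies when its key
        # matches this column's name.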
2220 if _ensure_decoded(self.kind) == "string":
2221 if isinstance(min_itemsize, dict):
2222 min_itemsize = min_itemsize.get(self.name)
2223
2224 if min_itemsize is not None and self.typ.itemsize < min_itemsize:
2225 self.typ = _tables().StringCol(itemsize=min_itemsize, pos=self.pos)
2226
2227 def validate_names(self) -> None:
2228 pass
2229
2230 def validate_and_set(self, handler: AppendableTable, append: bool) -> None:
2231 self.table = handler.table
2232 self.validate_col()
2233 self.validate_attr(append)
2234 self.validate_metadata(handler)
2235 self.write_metadata(handler)
2236 self.set_attr()
2237
2238 def validate_col(self, itemsize=None):
2239        """validate this column: return the itemsize compared against"""
2240 # validate this column for string truncation (or reset to the max size)
2241 if _ensure_decoded(self.kind) == "string":
2242 c = self.col
2243 if c is not None:
2244 if itemsize is None:
2245 itemsize = self.itemsize
2246 if c.itemsize < itemsize:
2247 raise ValueError(
2248 f"Trying to store a string with len [{itemsize}] in "
2249 f"[{self.cname}] column but\nthis column has a limit of "
2250 f"[{c.itemsize}]!\nConsider using min_itemsize to "
2251 "preset the sizes on these columns"
2252 )
2253 return c.itemsize
2254
2255 return None
2256
2257 def validate_attr(self, append: bool) -> None:
2258 # check for backwards incompatibility
2259 if append:
2260 existing_kind = getattr(self.attrs, self.kind_attr, None)
2261 if existing_kind is not None and existing_kind != self.kind:
2262 raise TypeError(
2263 f"incompatible kind in col [{existing_kind} - {self.kind}]"
2264 )
2265
2266 def update_info(self, info) -> None:
2267 """
2268 set/update the info for this indexable with the key/value
2269 if there is a conflict raise/warn as needed
2270 """
2271 for key in self._info_fields:
2272 value = getattr(self, key, None)
2273 idx = info.setdefault(self.name, {})
2274
2275 existing_value = idx.get(key)
2276 if key in idx and value is not None and existing_value != value:
2277 # frequency/name just warn
2278 if key in ["freq", "index_name"]:
2279 ws = attribute_conflict_doc % (key, existing_value, value)
2280 warnings.warn(
2281 ws, AttributeConflictWarning, stacklevel=find_stack_level()
2282 )
2283
2284 # reset
2285 idx[key] = None
2286 setattr(self, key, None)
2287
2288 else:
2289 raise ValueError(
2290 f"invalid info for [{self.name}] for [{key}], "
2291 f"existing_value [{existing_value}] conflicts with "
2292 f"new value [{value}]"
2293 )
2294 elif value is not None or existing_value is not None:
2295 idx[key] = value
2296
2297 def set_info(self, info) -> None:
2298 """set my state from the passed info"""
2299 idx = info.get(self.name)
2300 if idx is not None:
2301 self.__dict__.update(idx)
2302
2303 def set_attr(self) -> None:
2304 """set the kind for this column"""
2305 setattr(self.attrs, self.kind_attr, self.kind)
2306
2307 def validate_metadata(self, handler: AppendableTable) -> None:
2308 """validate that kind=category does not change the categories"""
2309 if self.meta == "category":
2310 new_metadata = self.metadata
2311 cur_metadata = handler.read_metadata(self.cname)
2312 if (
2313 new_metadata is not None
2314 and cur_metadata is not None
2315 and not array_equivalent(
2316 new_metadata, cur_metadata, strict_nan=True, dtype_equal=True
2317 )
2318 ):
2319 raise ValueError(
2320 "cannot append a categorical with "
2321 "different categories to the existing"
2322 )
2323
2324 def write_metadata(self, handler: AppendableTable) -> None:
2325 """set the meta data"""
2326 if self.metadata is not None:
2327 handler.write_metadata(self.cname, self.metadata)
2328
2329
2330class GenericIndexCol(IndexCol):
2331 """an index which is not represented in the data of the table"""
2332
2333 @property
2334 def is_indexed(self) -> bool:
2335 return False
2336
2337 def convert(
2338 self, values: np.ndarray, nan_rep, encoding: str, errors: str
2339 ) -> tuple[Index, Index]:
2340 """
2341 Convert the data from this selection to the appropriate pandas type.
2342
2343 Parameters
2344 ----------
2345 values : np.ndarray
2346 nan_rep : str
2347 encoding : str
2348 errors : str
2349 """
2350 assert isinstance(values, np.ndarray), type(values)
2351
2352 index = RangeIndex(len(values))
2353 return index, index
2354
2355 def set_attr(self) -> None:
2356 pass
2357
2358
2359class DataCol(IndexCol):
2360 """
2361    a data-holding column; by definition this is not indexable
2362
2363 Parameters
2364 ----------
2365 data : the actual data
2366 cname : the column name in the table to hold the data (typically
2367 values)
2368 meta : a string description of the metadata
2369 metadata : the actual metadata
2370 """
2371
2372 is_an_indexable = False
2373 is_data_indexable = False
2374 _info_fields = ["tz", "ordered"]
2375
2376 def __init__(
2377 self,
2378 name: str,
2379 values=None,
2380 kind=None,
2381 typ=None,
2382 cname: str | None = None,
2383 pos=None,
2384 tz=None,
2385 ordered=None,
2386 table=None,
2387 meta=None,
2388 metadata=None,
2389 dtype: DtypeArg | None = None,
2390 data=None,
2391 ) -> None:
2392 super().__init__(
2393 name=name,
2394 values=values,
2395 kind=kind,
2396 typ=typ,
2397 pos=pos,
2398 cname=cname,
2399 tz=tz,
2400 ordered=ordered,
2401 table=table,
2402 meta=meta,
2403 metadata=metadata,
2404 )
2405 self.dtype = dtype
2406 self.data = data
2407
2408 @property
2409 def dtype_attr(self) -> str:
2410 return f"{self.name}_dtype"
2411
2412 @property
2413 def meta_attr(self) -> str:
2414 return f"{self.name}_meta"
2415
2416 def __repr__(self) -> str:
2417 temp = tuple(
2418 map(
2419 pprint_thing, (self.name, self.cname, self.dtype, self.kind, self.shape)
2420 )
2421 )
2422 return ",".join(
2423 [
2424 f"{key}->{value}"
2425 for key, value in zip(["name", "cname", "dtype", "kind", "shape"], temp)
2426 ]
2427 )
2428
2429 def __eq__(self, other: object) -> bool:
2430 """compare 2 col items"""
2431 return all(
2432 getattr(self, a, None) == getattr(other, a, None)
2433 for a in ["name", "cname", "dtype", "pos"]
2434 )
2435
2436 def set_data(self, data: ArrayLike) -> None:
2437 assert data is not None
2438 assert self.dtype is None
2439
2440 data, dtype_name = _get_data_and_dtype_name(data)
2441
2442 self.data = data
2443 self.dtype = dtype_name
2444 self.kind = _dtype_to_kind(dtype_name)
2445
2446 def take_data(self):
2447 """return the data"""
2448 return self.data
2449
2450 @classmethod
2451 def _get_atom(cls, values: ArrayLike) -> Col:
2452 """
2453 Get an appropriately typed and shaped pytables.Col object for values.
2454 """
2455 dtype = values.dtype
2456 # error: Item "ExtensionDtype" of "Union[ExtensionDtype, dtype[Any]]" has no
2457 # attribute "itemsize"
2458 itemsize = dtype.itemsize # type: ignore[union-attr]
2459
2460 shape = values.shape
2461 if values.ndim == 1:
2462 # EA, use block shape pretending it is 2D
2463 # TODO(EA2D): not necessary with 2D EAs
2464 shape = (1, values.size)
2465
2466 if isinstance(values, Categorical):
2467 codes = values.codes
2468 atom = cls.get_atom_data(shape, kind=codes.dtype.name)
2469 elif lib.is_np_dtype(dtype, "M") or isinstance(dtype, DatetimeTZDtype):
2470 atom = cls.get_atom_datetime64(shape)
2471 elif lib.is_np_dtype(dtype, "m"):
2472 atom = cls.get_atom_timedelta64(shape)
2473 elif is_complex_dtype(dtype):
2474 atom = _tables().ComplexCol(itemsize=itemsize, shape=shape[0])
2475 elif is_string_dtype(dtype):
2476 atom = cls.get_atom_string(shape, itemsize)
2477 else:
2478 atom = cls.get_atom_data(shape, kind=dtype.name)
2479
2480 return atom
2481
2482 @classmethod
2483 def get_atom_string(cls, shape, itemsize):
2484 return _tables().StringCol(itemsize=itemsize, shape=shape[0])
2485
2486 @classmethod
2487 def get_atom_coltype(cls, kind: str) -> type[Col]:
2488 """return the PyTables column class for this column"""
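        # e.g. kind="float64" -> Float64Col, kind="uint32" -> UInt32Col,
        # kind="period[M]" -> Int64Col (periods are stored as integers)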
2489 if kind.startswith("uint"):
2490 k4 = kind[4:]
2491 col_name = f"UInt{k4}Col"
2492 elif kind.startswith("period"):
2493 # we store as integer
2494 col_name = "Int64Col"
2495 else:
2496 kcap = kind.capitalize()
2497 col_name = f"{kcap}Col"
2498
2499 return getattr(_tables(), col_name)
2500
2501 @classmethod
2502 def get_atom_data(cls, shape, kind: str) -> Col:
2503 return cls.get_atom_coltype(kind=kind)(shape=shape[0])
2504
2505 @classmethod
2506 def get_atom_datetime64(cls, shape):
2507 return _tables().Int64Col(shape=shape[0])
2508
2509 @classmethod
2510 def get_atom_timedelta64(cls, shape):
2511 return _tables().Int64Col(shape=shape[0])
2512
2513 @property
2514 def shape(self):
2515 return getattr(self.data, "shape", None)
2516
2517 @property
2518 def cvalues(self):
2519 """return my cython values"""
2520 return self.data
2521
2522 def validate_attr(self, append) -> None:
2523 """validate that we have the same order as the existing & same dtype"""
2524 if append:
2525 existing_fields = getattr(self.attrs, self.kind_attr, None)
2526 if existing_fields is not None and existing_fields != list(self.values):
2527 raise ValueError("appended items do not match existing items in table!")
2528
2529 existing_dtype = getattr(self.attrs, self.dtype_attr, None)
2530 if existing_dtype is not None and existing_dtype != self.dtype:
2531 raise ValueError(
2532 "appended items dtype do not match existing items dtype in table!"
2533 )
2534
2535 def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
2536 """
2537 Convert the data from this selection to the appropriate pandas type.
2538
2539 Parameters
2540 ----------
2541 values : np.ndarray
2542 nan_rep :
2543 encoding : str
2544 errors : str
2545
2546 Returns
2547 -------
2548 index : listlike to become an Index
2549 data : ndarraylike to become a column
2550 """
2551 assert isinstance(values, np.ndarray), type(values)
2552
2553 # values is a recarray
2554 if values.dtype.fields is not None:
2555 values = values[self.cname]
2556
2557 assert self.typ is not None
2558 if self.dtype is None:
2559 # Note: in tests we never have timedelta64 or datetime64,
2560 # so the _get_data_and_dtype_name may be unnecessary
2561 converted, dtype_name = _get_data_and_dtype_name(values)
2562 kind = _dtype_to_kind(dtype_name)
2563 else:
2564 converted = values
2565 dtype_name = self.dtype
2566 kind = self.kind
2567
2568 assert isinstance(converted, np.ndarray) # for mypy
2569
2570 # use the meta if needed
2571 meta = _ensure_decoded(self.meta)
2572 metadata = self.metadata
2573 ordered = self.ordered
2574 tz = self.tz
2575
2576 assert dtype_name is not None
2577 # convert to the correct dtype
2578 dtype = _ensure_decoded(dtype_name)
2579
2580 # reverse converts
2581 if dtype.startswith("datetime64"):
2582 # recreate with tz if indicated
2583 converted = _set_tz(converted, tz, coerce=True)
2584
2585 elif dtype == "timedelta64":
2586 converted = np.asarray(converted, dtype="m8[ns]")
2587 elif dtype == "date":
2588 try:
2589 converted = np.asarray(
2590 [date.fromordinal(v) for v in converted], dtype=object
2591 )
2592 except ValueError:
2593 converted = np.asarray(
2594 [date.fromtimestamp(v) for v in converted], dtype=object
2595 )
2596
2597 elif meta == "category":
2598 # we have a categorical
2599 categories = metadata
2600 codes = converted.ravel()
2601
2602 # if we have stored a NaN in the categories
2603 # then strip it; in theory we could have BOTH
2604 # -1s in the codes and nulls :<
2605 if categories is None:
2606 # Handle case of NaN-only categorical columns in which case
2607 # the categories are an empty array; when this is stored,
2608 # pytables cannot write a zero-len array, so on readback
2609 # the categories would be None and `read_hdf()` would fail.
2610 categories = Index([], dtype=np.float64)
2611 else:
2612 mask = isna(categories)
2613 if mask.any():
2614 categories = categories[~mask]
2615 codes[codes != -1] -= mask.astype(int).cumsum()._values
2616
2617 converted = Categorical.from_codes(
2618 codes, categories=categories, ordered=ordered, validate=False
2619 )
2620
2621 else:
2622 try:
2623 converted = converted.astype(dtype, copy=False)
2624 except TypeError:
2625 converted = converted.astype("O", copy=False)
2626
2627 # convert nans / decode
2628 if _ensure_decoded(kind) == "string":
2629 converted = _unconvert_string_array(
2630 converted, nan_rep=nan_rep, encoding=encoding, errors=errors
2631 )
2632
2633 return self.values, converted
2634
2635 def set_attr(self) -> None:
2636        """set the kind, meta, and dtype attributes for this column"""
2637 setattr(self.attrs, self.kind_attr, self.values)
2638 setattr(self.attrs, self.meta_attr, self.meta)
2639 assert self.dtype is not None
2640 setattr(self.attrs, self.dtype_attr, self.dtype)
2641
2642
2643class DataIndexableCol(DataCol):
2644 """represent a data column that can be indexed"""
2645
2646 is_data_indexable = True
2647
2648 def validate_names(self) -> None:
2649 if not is_string_dtype(Index(self.values).dtype):
2650 # TODO: should the message here be more specifically non-str?
2651 raise ValueError("cannot have non-object label DataIndexableCol")
2652
2653 @classmethod
2654 def get_atom_string(cls, shape, itemsize):
2655 return _tables().StringCol(itemsize=itemsize)
2656
2657 @classmethod
2658 def get_atom_data(cls, shape, kind: str) -> Col:
2659 return cls.get_atom_coltype(kind=kind)()
2660
2661 @classmethod
2662 def get_atom_datetime64(cls, shape):
2663 return _tables().Int64Col()
2664
2665 @classmethod
2666 def get_atom_timedelta64(cls, shape):
2667 return _tables().Int64Col()
2668
2669
2670class GenericDataIndexableCol(DataIndexableCol):
2671 """represent a generic pytables data column"""
2672
2673
2674class Fixed:
2675 """
2676 represent an object in my store
2677 facilitate read/write of various types of objects
2678 this is an abstract base class
2679
2680 Parameters
2681 ----------
2682 parent : HDFStore
2683 group : Node
2684 The group node where the table resides.
2685 """
2686
2687 pandas_kind: str
2688 format_type: str = "fixed" # GH#30962 needed by dask
2689 obj_type: type[DataFrame | Series]
2690 ndim: int
2691 parent: HDFStore
2692 is_table: bool = False
2693
2694 def __init__(
2695 self,
2696 parent: HDFStore,
2697 group: Node,
2698 encoding: str | None = "UTF-8",
2699 errors: str = "strict",
2700 ) -> None:
2701 assert isinstance(parent, HDFStore), type(parent)
2702 assert _table_mod is not None # needed for mypy
2703 assert isinstance(group, _table_mod.Node), type(group)
2704 self.parent = parent
2705 self.group = group
2706 self.encoding = _ensure_encoding(encoding)
2707 self.errors = errors
2708
2709 @property
2710 def is_old_version(self) -> bool:
2711 return self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1
2712
2713 @property
2714 def version(self) -> tuple[int, int, int]:
2715 """compute and set our version"""
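        # e.g. a stored pandas_version of "0.15.2" parses to (0, 15, 2); a
        # two-part "0.10" becomes (0, 10, 0); a missing attribute falls back
        # to (0, 0, 0).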
2716 version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None))
2717 try:
2718 version = tuple(int(x) for x in version.split("."))
2719 if len(version) == 2:
2720 version = version + (0,)
2721 except AttributeError:
2722 version = (0, 0, 0)
2723 return version
2724
2725 @property
2726 def pandas_type(self):
2727 return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None))
2728
2729 def __repr__(self) -> str:
2730 """return a pretty representation of myself"""
2731 self.infer_axes()
2732 s = self.shape
2733 if s is not None:
2734 if isinstance(s, (list, tuple)):
2735 jshape = ",".join([pprint_thing(x) for x in s])
2736 s = f"[{jshape}]"
2737 return f"{self.pandas_type:12.12} (shape->{s})"
2738 return self.pandas_type
2739
2740 def set_object_info(self) -> None:
2741 """set my pandas type & version"""
2742 self.attrs.pandas_type = str(self.pandas_kind)
2743 self.attrs.pandas_version = str(_version)
2744
2745 def copy(self) -> Fixed:
2746 new_self = copy.copy(self)
2747 return new_self
2748
2749 @property
2750 def shape(self):
2751 return self.nrows
2752
2753 @property
2754 def pathname(self):
2755 return self.group._v_pathname
2756
2757 @property
2758 def _handle(self):
2759 return self.parent._handle
2760
2761 @property
2762 def _filters(self):
2763 return self.parent._filters
2764
2765 @property
2766 def _complevel(self) -> int:
2767 return self.parent._complevel
2768
2769 @property
2770 def _fletcher32(self) -> bool:
2771 return self.parent._fletcher32
2772
2773 @property
2774 def attrs(self):
2775 return self.group._v_attrs
2776
2777 def set_attrs(self) -> None:
2778 """set our object attributes"""
2779
2780 def get_attrs(self) -> None:
2781 """get our object attributes"""
2782
2783 @property
2784 def storable(self):
2785 """return my storable"""
2786 return self.group
2787
2788 @property
2789 def is_exists(self) -> bool:
2790 return False
2791
2792 @property
2793 def nrows(self):
2794 return getattr(self.storable, "nrows", None)
2795
2796 def validate(self, other) -> Literal[True] | None:
2797 """validate against an existing storable"""
2798 if other is None:
2799 return None
2800 return True
2801
2802 def validate_version(self, where=None) -> None:
2803 """are we trying to operate on an old version?"""
2804
2805 def infer_axes(self) -> bool:
2806 """
2807 infer the axes of my storer
2808 return a boolean indicating if we have a valid storer or not
2809 """
2810 s = self.storable
2811 if s is None:
2812 return False
2813 self.get_attrs()
2814 return True
2815
2816 def read(
2817 self,
2818 where=None,
2819 columns=None,
2820 start: int | None = None,
2821 stop: int | None = None,
2822 ):
2823 raise NotImplementedError(
2824 "cannot read on an abstract storer: subclasses should implement"
2825 )
2826
2827 def write(self, obj, **kwargs) -> None:
2828 raise NotImplementedError(
2829 "cannot write on an abstract storer: subclasses should implement"
2830 )
2831
2832 def delete(
2833 self, where=None, start: int | None = None, stop: int | None = None
2834 ) -> None:
2835 """
2836        support fully deleting the node in its entirety (only); the where
2837        specification must be None
2838 """
2839 if com.all_none(where, start, stop):
2840 self._handle.remove_node(self.group, recursive=True)
2841 return None
2842
2843 raise TypeError("cannot delete on an abstract storer")
2844
2845
2846class GenericFixed(Fixed):
2847 """a generified fixed version"""
2848
2849 _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"}
2850 _reverse_index_map = {v: k for k, v in _index_type_map.items()}
2851 attributes: list[str] = []
2852
2853 # indexer helpers
2854 def _class_to_alias(self, cls) -> str:
2855 return self._index_type_map.get(cls, "")
2856
2857 def _alias_to_class(self, alias):
2858 if isinstance(alias, type): # pragma: no cover
2859 # compat: for a short period of time master stored types
2860 return alias
2861 return self._reverse_index_map.get(alias, Index)
2862
2863 def _get_index_factory(self, attrs):
2864 index_class = self._alias_to_class(
2865 _ensure_decoded(getattr(attrs, "index_class", ""))
2866 )
2867
2868 factory: Callable
2869
2870 if index_class == DatetimeIndex:
2871
2872 def f(values, freq=None, tz=None):
2873 # data are already in UTC, localize and convert if tz present
2874 dta = DatetimeArray._simple_new(
2875 values.values, dtype=values.dtype, freq=freq
2876 )
2877 result = DatetimeIndex._simple_new(dta, name=None)
2878 if tz is not None:
2879 result = result.tz_localize("UTC").tz_convert(tz)
2880 return result
2881
2882 factory = f
2883 elif index_class == PeriodIndex:
2884
2885 def f(values, freq=None, tz=None):
2886 dtype = PeriodDtype(freq)
2887 parr = PeriodArray._simple_new(values, dtype=dtype)
2888 return PeriodIndex._simple_new(parr, name=None)
2889
2890 factory = f
2891 else:
2892 factory = index_class
2893
2894 kwargs = {}
2895 if "freq" in attrs:
2896 kwargs["freq"] = attrs["freq"]
2897 if index_class is Index:
2898            # DTI/PI would have been returned by _alias_to_class
2899 factory = TimedeltaIndex
2900
2901 if "tz" in attrs:
2902 if isinstance(attrs["tz"], bytes):
2903 # created by python2
2904 kwargs["tz"] = attrs["tz"].decode("utf-8")
2905 else:
2906 # created by python3
2907 kwargs["tz"] = attrs["tz"]
2908 assert index_class is DatetimeIndex # just checking
2909
2910 return factory, kwargs
2911
2912 def validate_read(self, columns, where) -> None:
2913 """
2914        raise if any keywords are passed which are not None
2915 """
2916 if columns is not None:
2917 raise TypeError(
2918 "cannot pass a column specification when reading "
2919 "a Fixed format store. this store must be selected in its entirety"
2920 )
2921 if where is not None:
2922 raise TypeError(
2923 "cannot pass a where specification when reading "
2924 "from a Fixed format store. this store must be selected in its entirety"
2925 )
2926
2927 @property
2928 def is_exists(self) -> bool:
2929 return True
2930
2931 def set_attrs(self) -> None:
2932 """set our object attributes"""
2933 self.attrs.encoding = self.encoding
2934 self.attrs.errors = self.errors
2935
2936 def get_attrs(self) -> None:
2937 """retrieve our attributes"""
2938 self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
2939 self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
2940 for n in self.attributes:
2941 setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None)))
2942
2943 def write(self, obj, **kwargs) -> None:
2944 self.set_attrs()
2945
2946 def read_array(self, key: str, start: int | None = None, stop: int | None = None):
2947        """read an array for the specified node (off of the group)"""
2948 import tables
2949
2950 node = getattr(self.group, key)
2951 attrs = node._v_attrs
2952
2953 transposed = getattr(attrs, "transposed", False)
2954
2955 if isinstance(node, tables.VLArray):
2956 ret = node[0][start:stop]
2957 else:
2958 dtype = _ensure_decoded(getattr(attrs, "value_type", None))
2959 shape = getattr(attrs, "shape", None)
2960
2961 if shape is not None:
2962 # length 0 axis
2963 ret = np.empty(shape, dtype=dtype)
2964 else:
2965 ret = node[start:stop]
2966
2967 if dtype and dtype.startswith("datetime64"):
2968 # reconstruct a timezone if indicated
2969 tz = getattr(attrs, "tz", None)
2970 ret = _set_tz(ret, tz, coerce=True)
2971
2972 elif dtype == "timedelta64":
2973 ret = np.asarray(ret, dtype="m8[ns]")
2974
2975 if transposed:
2976 return ret.T
2977 else:
2978 return ret
2979
2980 def read_index(
2981 self, key: str, start: int | None = None, stop: int | None = None
2982 ) -> Index:
2983 variety = _ensure_decoded(getattr(self.attrs, f"{key}_variety"))
2984
2985 if variety == "multi":
2986 return self.read_multi_index(key, start=start, stop=stop)
2987 elif variety == "regular":
2988 node = getattr(self.group, key)
2989 index = self.read_index_node(node, start=start, stop=stop)
2990 return index
2991 else: # pragma: no cover
2992 raise TypeError(f"unrecognized index variety: {variety}")
2993
2994 def write_index(self, key: str, index: Index) -> None:
2995 if isinstance(index, MultiIndex):
2996 setattr(self.attrs, f"{key}_variety", "multi")
2997 self.write_multi_index(key, index)
2998 else:
2999 setattr(self.attrs, f"{key}_variety", "regular")
3000 converted = _convert_index("index", index, self.encoding, self.errors)
3001
3002 self.write_array(key, converted.values)
3003
3004 node = getattr(self.group, key)
3005 node._v_attrs.kind = converted.kind
3006 node._v_attrs.name = index.name
3007
3008 if isinstance(index, (DatetimeIndex, PeriodIndex)):
3009 node._v_attrs.index_class = self._class_to_alias(type(index))
3010
3011 if isinstance(index, (DatetimeIndex, PeriodIndex, TimedeltaIndex)):
3012 node._v_attrs.freq = index.freq
3013
3014 if isinstance(index, DatetimeIndex) and index.tz is not None:
3015 node._v_attrs.tz = _get_tz(index.tz)
3016
3017 def write_multi_index(self, key: str, index: MultiIndex) -> None:
3018 setattr(self.attrs, f"{key}_nlevels", index.nlevels)
3019
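        # Resulting layout (a sketch for key="index" with two levels): the
        # group gains arrays "index_level0"/"index_level1" for the level
        # values and "index_label0"/"index_label1" for the codes, in addition
        # to the "index_nlevels" attribute written above.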
3020 for i, (lev, level_codes, name) in enumerate(
3021 zip(index.levels, index.codes, index.names)
3022 ):
3023 # write the level
3024 if isinstance(lev.dtype, ExtensionDtype):
3025 raise NotImplementedError(
3026 "Saving a MultiIndex with an extension dtype is not supported."
3027 )
3028 level_key = f"{key}_level{i}"
3029 conv_level = _convert_index(level_key, lev, self.encoding, self.errors)
3030 self.write_array(level_key, conv_level.values)
3031 node = getattr(self.group, level_key)
3032 node._v_attrs.kind = conv_level.kind
3033 node._v_attrs.name = name
3034
3035 # write the name
3036 setattr(node._v_attrs, f"{key}_name{name}", name)
3037
3038 # write the labels
3039 label_key = f"{key}_label{i}"
3040 self.write_array(label_key, level_codes)
3041
3042 def read_multi_index(
3043 self, key: str, start: int | None = None, stop: int | None = None
3044 ) -> MultiIndex:
3045 nlevels = getattr(self.attrs, f"{key}_nlevels")
3046
3047 levels = []
3048 codes = []
3049 names: list[Hashable] = []
3050 for i in range(nlevels):
3051 level_key = f"{key}_level{i}"
3052 node = getattr(self.group, level_key)
3053 lev = self.read_index_node(node, start=start, stop=stop)
3054 levels.append(lev)
3055 names.append(lev.name)
3056
3057 label_key = f"{key}_label{i}"
3058 level_codes = self.read_array(label_key, start=start, stop=stop)
3059 codes.append(level_codes)
3060
3061 return MultiIndex(
3062 levels=levels, codes=codes, names=names, verify_integrity=True
3063 )
3064
3065 def read_index_node(
3066 self, node: Node, start: int | None = None, stop: int | None = None
3067 ) -> Index:
3068 data = node[start:stop]
3069 # If the index was an empty array write_array_empty() will
3070 # have written a sentinel. Here we replace it with the original.
3071 if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0:
3072 data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type)
3073 kind = _ensure_decoded(node._v_attrs.kind)
3074 name = None
3075
3076 if "name" in node._v_attrs:
3077 name = _ensure_str(node._v_attrs.name)
3078 name = _ensure_decoded(name)
3079
3080 attrs = node._v_attrs
3081 factory, kwargs = self._get_index_factory(attrs)
3082
3083 if kind in ("date", "object"):
3084 index = factory(
3085 _unconvert_index(
3086 data, kind, encoding=self.encoding, errors=self.errors
3087 ),
3088 dtype=object,
3089 **kwargs,
3090 )
3091 else:
3092 index = factory(
3093 _unconvert_index(
3094 data, kind, encoding=self.encoding, errors=self.errors
3095 ),
3096 **kwargs,
3097 )
3098
3099 index.name = name
3100
3101 return index
3102
3103 def write_array_empty(self, key: str, value: ArrayLike) -> None:
3104 """write a 0-len array"""
3105 # ugly hack for length 0 axes
3106 arr = np.empty((1,) * value.ndim)
3107 self._handle.create_array(self.group, key, arr)
3108 node = getattr(self.group, key)
3109 node._v_attrs.value_type = str(value.dtype)
3110 node._v_attrs.shape = value.shape
3111
3112 def write_array(
3113 self, key: str, obj: AnyArrayLike, items: Index | None = None
3114 ) -> None:
3115 # TODO: we only have a few tests that get here, the only EA
3116 # that gets passed is DatetimeArray, and we never have
3117 # both self._filters and EA
3118
3119 value = extract_array(obj, extract_numpy=True)
3120
3121 if key in self.group:
3122 self._handle.remove_node(self.group, key)
3123
3124 # Transform needed to interface with pytables row/col notation
3125 empty_array = value.size == 0
3126 transposed = False
3127
3128 if isinstance(value.dtype, CategoricalDtype):
3129 raise NotImplementedError(
3130 "Cannot store a category dtype in a HDF5 dataset that uses format="
3131 '"fixed". Use format="table".'
3132 )
3133 if not empty_array:
3134 if hasattr(value, "T"):
3135 # ExtensionArrays (1d) may not have transpose.
3136 value = value.T
3137 transposed = True
3138
3139 atom = None
3140 if self._filters is not None:
3141 with suppress(ValueError):
3142 # get the atom for this datatype
3143 atom = _tables().Atom.from_dtype(value.dtype)
3144
3145 if atom is not None:
3146 # We only get here if self._filters is non-None and
3147 # the Atom.from_dtype call succeeded
3148
3149 # create an empty chunked array and fill it from value
3150 if not empty_array:
3151 ca = self._handle.create_carray(
3152 self.group, key, atom, value.shape, filters=self._filters
3153 )
3154 ca[:] = value
3155
3156 else:
3157 self.write_array_empty(key, value)
3158
3159 elif value.dtype.type == np.object_:
3160 # infer the type, warn if we have a non-string type here (for
3161 # performance)
3162 inferred_type = lib.infer_dtype(value, skipna=False)
3163 if empty_array:
3164 pass
3165 elif inferred_type == "string":
3166 pass
3167 else:
3168 ws = performance_doc % (inferred_type, key, items)
3169 warnings.warn(ws, PerformanceWarning, stacklevel=find_stack_level())
3170
3171 vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom())
3172 vlarr.append(value)
3173
3174 elif lib.is_np_dtype(value.dtype, "M"):
3175 self._handle.create_array(self.group, key, value.view("i8"))
3176 getattr(self.group, key)._v_attrs.value_type = str(value.dtype)
3177 elif isinstance(value.dtype, DatetimeTZDtype):
3178 # store as UTC
3179 # with a zone
3180
3181 # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no
3182 # attribute "asi8"
3183 self._handle.create_array(
3184 self.group, key, value.asi8 # type: ignore[union-attr]
3185 )
3186
3187 node = getattr(self.group, key)
3188 # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no
3189 # attribute "tz"
3190 node._v_attrs.tz = _get_tz(value.tz) # type: ignore[union-attr]
3191 node._v_attrs.value_type = f"datetime64[{value.dtype.unit}]"
3192 elif lib.is_np_dtype(value.dtype, "m"):
3193 self._handle.create_array(self.group, key, value.view("i8"))
3194 getattr(self.group, key)._v_attrs.value_type = "timedelta64"
3195 elif empty_array:
3196 self.write_array_empty(key, value)
3197 else:
3198 self._handle.create_array(self.group, key, value)
3199
3200 getattr(self.group, key)._v_attrs.transposed = transposed
3201
3202
3203class SeriesFixed(GenericFixed):
3204 pandas_kind = "series"
3205 attributes = ["name"]
3206
3207 name: Hashable
3208
3209 @property
3210 def shape(self):
3211 try:
3212 return (len(self.group.values),)
3213 except (TypeError, AttributeError):
3214 return None
3215
3216 def read(
3217 self,
3218 where=None,
3219 columns=None,
3220 start: int | None = None,
3221 stop: int | None = None,
3222 ) -> Series:
3223 self.validate_read(columns, where)
3224 index = self.read_index("index", start=start, stop=stop)
3225 values = self.read_array("values", start=start, stop=stop)
3226 result = Series(values, index=index, name=self.name, copy=False)
3227 if using_pyarrow_string_dtype() and is_string_array(values, skipna=True):
3228 result = result.astype("string[pyarrow_numpy]")
3229 return result
3230
3231 def write(self, obj, **kwargs) -> None:
3232 super().write(obj, **kwargs)
3233 self.write_index("index", obj.index)
3234 self.write_array("values", obj)
3235 self.attrs.name = obj.name
3236
3237
3238class BlockManagerFixed(GenericFixed):
3239 attributes = ["ndim", "nblocks"]
3240
3241 nblocks: int
3242
3243 @property
3244 def shape(self) -> Shape | None:
3245 try:
3246 ndim = self.ndim
3247
3248 # items
3249 items = 0
3250 for i in range(self.nblocks):
3251 node = getattr(self.group, f"block{i}_items")
3252 shape = getattr(node, "shape", None)
3253 if shape is not None:
3254 items += shape[0]
3255
3256 # data shape
3257 node = self.group.block0_values
3258 shape = getattr(node, "shape", None)
3259 if shape is not None:
3260 shape = list(shape[0 : (ndim - 1)])
3261 else:
3262 shape = []
3263
3264 shape.append(items)
3265
3266 return shape
3267 except AttributeError:
3268 return None
3269
3270 def read(
3271 self,
3272 where=None,
3273 columns=None,
3274 start: int | None = None,
3275 stop: int | None = None,
3276 ) -> DataFrame:
3277 # start, stop applied to rows, so 0th axis only
3278 self.validate_read(columns, where)
3279 select_axis = self.obj_type()._get_block_manager_axis(0)
3280
3281 axes = []
3282 for i in range(self.ndim):
3283 _start, _stop = (start, stop) if i == select_axis else (None, None)
3284 ax = self.read_index(f"axis{i}", start=_start, stop=_stop)
3285 axes.append(ax)
3286
3287 items = axes[0]
3288 dfs = []
3289
3290 for i in range(self.nblocks):
3291 blk_items = self.read_index(f"block{i}_items")
3292 values = self.read_array(f"block{i}_values", start=_start, stop=_stop)
3293
3294 columns = items[items.get_indexer(blk_items)]
3295 df = DataFrame(values.T, columns=columns, index=axes[1], copy=False)
3296 if using_pyarrow_string_dtype() and is_string_array(values, skipna=True):
3297 df = df.astype("string[pyarrow_numpy]")
3298 dfs.append(df)
3299
3300 if len(dfs) > 0:
3301 out = concat(dfs, axis=1, copy=True)
3302 if using_copy_on_write():
3303 # with CoW, concat ignores the copy keyword. Here, we still want
3304 # to copy to enforce optimized column-major layout
3305 out = out.copy()
3306 out = out.reindex(columns=items, copy=False)
3307 return out
3308
3309 return DataFrame(columns=axes[0], index=axes[1])
3310
3311 def write(self, obj, **kwargs) -> None:
3312 super().write(obj, **kwargs)
3313
3314 # TODO(ArrayManager) HDFStore relies on accessing the blocks
3315 if isinstance(obj._mgr, ArrayManager):
3316 obj = obj._as_manager("block")
3317
3318 data = obj._mgr
3319 if not data.is_consolidated():
3320 data = data.consolidate()
3321
3322 self.attrs.ndim = data.ndim
3323 for i, ax in enumerate(data.axes):
3324 if i == 0 and (not ax.is_unique):
3325 raise ValueError("Columns index has to be unique for fixed format")
3326 self.write_index(f"axis{i}", ax)
3327
3328 # Supporting mixed-type DataFrame objects...nontrivial
3329 self.attrs.nblocks = len(data.blocks)
3330 for i, blk in enumerate(data.blocks):
3331 # I have no idea why, but writing values before items fixed #2299
3332 blk_items = data.items.take(blk.mgr_locs)
3333 self.write_array(f"block{i}_values", blk.values, items=blk_items)
3334 self.write_index(f"block{i}_items", blk_items)
3335
3336
3337class FrameFixed(BlockManagerFixed):
3338 pandas_kind = "frame"
3339 obj_type = DataFrame
3340
3341
3342class Table(Fixed):
3343 """
3344 represent a table:
3345 facilitate read/write of various types of tables
3346
3347 Attrs in Table Node
3348 -------------------
3349    These are attributes that are stored in the main table node; they are
3350    necessary to recreate these tables when read back in.
3351
3352 index_axes : a list of tuples of the (original indexing axis and
3353 index column)
3354 non_index_axes: a list of tuples of the (original index axis and
3355 columns on a non-indexing axis)
3356 values_axes : a list of the columns which comprise the data of this
3357 table
3358 data_columns : a list of the columns that we are allowing indexing
3359 (these become single columns in values_axes)
3360 nan_rep : the string to use for nan representations for string
3361 objects
3362 levels : the names of levels
3363 metadata : the names of the metadata columns
3364 """
3365
3366 pandas_kind = "wide_table"
3367 format_type: str = "table" # GH#30962 needed by dask
3368 table_type: str
3369 levels: int | list[Hashable] = 1
3370 is_table = True
3371
3372 metadata: list
3373
3374 def __init__(
3375 self,
3376 parent: HDFStore,
3377 group: Node,
3378 encoding: str | None = None,
3379 errors: str = "strict",
3380 index_axes: list[IndexCol] | None = None,
3381 non_index_axes: list[tuple[AxisInt, Any]] | None = None,
3382 values_axes: list[DataCol] | None = None,
3383 data_columns: list | None = None,
3384 info: dict | None = None,
3385 nan_rep=None,
3386 ) -> None:
3387 super().__init__(parent, group, encoding=encoding, errors=errors)
3388 self.index_axes = index_axes or []
3389 self.non_index_axes = non_index_axes or []
3390 self.values_axes = values_axes or []
3391 self.data_columns = data_columns or []
3392 self.info = info or {}
3393 self.nan_rep = nan_rep
3394
3395 @property
3396 def table_type_short(self) -> str:
3397 return self.table_type.split("_")[0]
3398
3399 def __repr__(self) -> str:
3400 """return a pretty representation of myself"""
3401 self.infer_axes()
3402 jdc = ",".join(self.data_columns) if len(self.data_columns) else ""
3403 dc = f",dc->[{jdc}]"
3404
3405 ver = ""
3406 if self.is_old_version:
3407 jver = ".".join([str(x) for x in self.version])
3408 ver = f"[{jver}]"
3409
3410 jindex_axes = ",".join([a.name for a in self.index_axes])
3411 return (
3412 f"{self.pandas_type:12.12}{ver} "
3413 f"(typ->{self.table_type_short},nrows->{self.nrows},"
3414 f"ncols->{self.ncols},indexers->[{jindex_axes}]{dc})"
3415 )
3416
3417 def __getitem__(self, c: str):
3418 """return the axis for c"""
3419 for a in self.axes:
3420 if c == a.name:
3421 return a
3422 return None
3423
3424 def validate(self, other) -> None:
3425 """validate against an existing table"""
3426 if other is None:
3427 return
3428
3429 if other.table_type != self.table_type:
3430 raise TypeError(
3431 "incompatible table_type with existing "
3432 f"[{other.table_type} - {self.table_type}]"
3433 )
3434
3435 for c in ["index_axes", "non_index_axes", "values_axes"]:
3436 sv = getattr(self, c, None)
3437 ov = getattr(other, c, None)
3438 if sv != ov:
3439 # show the error for the specific axes
3440 # Argument 1 to "enumerate" has incompatible type
3441 # "Optional[Any]"; expected "Iterable[Any]" [arg-type]
3442 for i, sax in enumerate(sv): # type: ignore[arg-type]
3443 # Value of type "Optional[Any]" is not indexable [index]
3444 oax = ov[i] # type: ignore[index]
3445 if sax != oax:
3446 raise ValueError(
3447 f"invalid combination of [{c}] on appending data "
3448 f"[{sax}] vs current table [{oax}]"
3449 )
3450
3451 # should never get here
3452 raise Exception(
3453 f"invalid combination of [{c}] on appending data [{sv}] vs "
3454 f"current table [{ov}]"
3455 )
3456
3457 @property
3458 def is_multi_index(self) -> bool:
3459 """the levels attribute is 1 or a list in the case of a multi-index"""
3460 return isinstance(self.levels, list)
3461
3462 def validate_multiindex(
3463 self, obj: DataFrame | Series
3464 ) -> tuple[DataFrame, list[Hashable]]:
3465 """
3466 validate that we can store the multi-index; reset and return the
3467 new object
3468 """
3469 levels = com.fill_missing_names(obj.index.names)
3470 try:
3471 reset_obj = obj.reset_index()
3472 except ValueError as err:
3473 raise ValueError(
3474 "duplicate names/columns in the multi-index when storing as a table"
3475 ) from err
3476 assert isinstance(reset_obj, DataFrame) # for mypy
3477 return reset_obj, levels
3478
3479 @property
3480 def nrows_expected(self) -> int:
3481 """based on our axes, compute the expected nrows"""
3482 return np.prod([i.cvalues.shape[0] for i in self.index_axes])
3483
3484 @property
3485 def is_exists(self) -> bool:
3486 """has this table been created"""
3487 return "table" in self.group
3488
3489 @property
3490 def storable(self):
3491 return getattr(self.group, "table", None)
3492
3493 @property
3494 def table(self):
3495 """return the table group (this is my storable)"""
3496 return self.storable
3497
3498 @property
3499 def dtype(self):
3500 return self.table.dtype
3501
3502 @property
3503 def description(self):
3504 return self.table.description
3505
3506 @property
3507 def axes(self) -> itertools.chain[IndexCol]:
3508 return itertools.chain(self.index_axes, self.values_axes)
3509
3510 @property
3511 def ncols(self) -> int:
3512 """the number of total columns in the values axes"""
3513 return sum(len(a.values) for a in self.values_axes)
3514
3515 @property
3516 def is_transposed(self) -> bool:
3517 return False
3518
3519 @property
3520 def data_orientation(self) -> tuple[int, ...]:
3521        """return a tuple of my permuted axes, non_indexable at the front"""
3522 return tuple(
3523 itertools.chain(
3524 [int(a[0]) for a in self.non_index_axes],
3525 [int(a.axis) for a in self.index_axes],
3526 )
3527 )
3528
3529 def queryables(self) -> dict[str, Any]:
3530        """return a dict of the kinds of allowable columns for this object"""
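        # Illustrative result for a frame table with data_columns=["A"]:
        #   {"index": <IndexCol>, "columns": None, "A": <DataIndexableCol>}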
3531 # mypy doesn't recognize DataFrame._AXIS_NAMES, so we re-write it here
3532 axis_names = {0: "index", 1: "columns"}
3533
3534 # compute the values_axes queryables
3535 d1 = [(a.cname, a) for a in self.index_axes]
3536 d2 = [(axis_names[axis], None) for axis, values in self.non_index_axes]
3537 d3 = [
3538 (v.cname, v) for v in self.values_axes if v.name in set(self.data_columns)
3539 ]
3540
3541 return dict(d1 + d2 + d3)
3542
3543 def index_cols(self):
3544 """return a list of my index cols"""
3545 # Note: each `i.cname` below is assured to be a str.
3546 return [(i.axis, i.cname) for i in self.index_axes]
3547
3548 def values_cols(self) -> list[str]:
3549 """return a list of my values cols"""
3550 return [i.cname for i in self.values_axes]
3551
3552 def _get_metadata_path(self, key: str) -> str:
3553 """return the metadata pathname for this key"""
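        # e.g. a table stored at "/df" with key "values_block_1" yields
        # "/df/meta/values_block_1/meta" (illustrative key and path)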
3554 group = self.group._v_pathname
3555 return f"{group}/meta/{key}/meta"
3556
3557 def write_metadata(self, key: str, values: np.ndarray) -> None:
3558 """
3559 Write out a metadata array to the key as a fixed-format Series.
3560
3561 Parameters
3562 ----------
3563 key : str
3564 values : ndarray
3565 """
3566 self.parent.put(
3567 self._get_metadata_path(key),
3568 Series(values, copy=False),
3569 format="table",
3570 encoding=self.encoding,
3571 errors=self.errors,
3572 nan_rep=self.nan_rep,
3573 )
3574
3575 def read_metadata(self, key: str):
3576 """return the meta data array for this key"""
3577 if getattr(getattr(self.group, "meta", None), key, None) is not None:
3578 return self.parent.select(self._get_metadata_path(key))
3579 return None
3580
3581 def set_attrs(self) -> None:
3582 """set our table type & indexables"""
3583 self.attrs.table_type = str(self.table_type)
3584 self.attrs.index_cols = self.index_cols()
3585 self.attrs.values_cols = self.values_cols()
3586 self.attrs.non_index_axes = self.non_index_axes
3587 self.attrs.data_columns = self.data_columns
3588 self.attrs.nan_rep = self.nan_rep
3589 self.attrs.encoding = self.encoding
3590 self.attrs.errors = self.errors
3591 self.attrs.levels = self.levels
3592 self.attrs.info = self.info
3593
3594 def get_attrs(self) -> None:
3595 """retrieve our attributes"""
3596 self.non_index_axes = getattr(self.attrs, "non_index_axes", None) or []
3597 self.data_columns = getattr(self.attrs, "data_columns", None) or []
3598 self.info = getattr(self.attrs, "info", None) or {}
3599 self.nan_rep = getattr(self.attrs, "nan_rep", None)
3600 self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
3601 self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
3602 self.levels: list[Hashable] = getattr(self.attrs, "levels", None) or []
3603 self.index_axes = [a for a in self.indexables if a.is_an_indexable]
3604 self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
3605
3606 def validate_version(self, where=None) -> None:
3607 """are we trying to operate on an old version?"""
3608 if where is not None:
3609 if self.is_old_version:
3610 ws = incompatibility_doc % ".".join([str(x) for x in self.version])
3611 warnings.warn(
3612 ws,
3613 IncompatibilityWarning,
3614 stacklevel=find_stack_level(),
3615 )
3616
3617 def validate_min_itemsize(self, min_itemsize) -> None:
3618 """
3619        validate that min_itemsize doesn't contain items that are not in the
3620        axes; this needs data_columns to be defined
3621 """
3622 if min_itemsize is None:
3623 return
3624 if not isinstance(min_itemsize, dict):
3625 return
3626
3627 q = self.queryables()
3628 for k in min_itemsize:
3629 # ok, apply generally
3630 if k == "values":
3631 continue
3632 if k not in q:
3633 raise ValueError(
3634 f"min_itemsize has the key [{k}] which is not an axis or "
3635 "data_column"
3636 )
3637
3638 @cache_readonly
3639 def indexables(self):
3640 """create/cache the indexables if they don't exist"""
3641 _indexables = []
3642
3643 desc = self.description
3644 table_attrs = self.table.attrs
3645
3646 # Note: each of the `name` kwargs below are str, ensured
3647 # by the definition in index_cols.
3648 # index columns
3649 for i, (axis, name) in enumerate(self.attrs.index_cols):
3650 atom = getattr(desc, name)
3651 md = self.read_metadata(name)
3652 meta = "category" if md is not None else None
3653
3654 kind_attr = f"{name}_kind"
3655 kind = getattr(table_attrs, kind_attr, None)
3656
3657 index_col = IndexCol(
3658 name=name,
3659 axis=axis,
3660 pos=i,
3661 kind=kind,
3662 typ=atom,
3663 table=self.table,
3664 meta=meta,
3665 metadata=md,
3666 )
3667 _indexables.append(index_col)
3668
3669 # values columns
3670 dc = set(self.data_columns)
3671 base_pos = len(_indexables)
3672
3673 def f(i, c):
3674 assert isinstance(c, str)
3675 klass = DataCol
3676 if c in dc:
3677 klass = DataIndexableCol
3678
3679 atom = getattr(desc, c)
3680 adj_name = _maybe_adjust_name(c, self.version)
3681
3682 # TODO: why kind_attr here?
3683 values = getattr(table_attrs, f"{adj_name}_kind", None)
3684 dtype = getattr(table_attrs, f"{adj_name}_dtype", None)
3685 # Argument 1 to "_dtype_to_kind" has incompatible type
3686 # "Optional[Any]"; expected "str" [arg-type]
3687 kind = _dtype_to_kind(dtype) # type: ignore[arg-type]
3688
3689 md = self.read_metadata(c)
3690            # TODO: figure out why these two versions of `meta` don't always match.
3691 # meta = "category" if md is not None else None
3692 meta = getattr(table_attrs, f"{adj_name}_meta", None)
3693
3694 obj = klass(
3695 name=adj_name,
3696 cname=c,
3697 values=values,
3698 kind=kind,
3699 pos=base_pos + i,
3700 typ=atom,
3701 table=self.table,
3702 meta=meta,
3703 metadata=md,
3704 dtype=dtype,
3705 )
3706 return obj
3707
3708 # Note: the definition of `values_cols` ensures that each
3709 # `c` below is a str.
3710 _indexables.extend([f(i, c) for i, c in enumerate(self.attrs.values_cols)])
3711
3712 return _indexables
3713
3714 def create_index(
3715 self, columns=None, optlevel=None, kind: str | None = None
3716 ) -> None:
3717 """
3718 Create a pytables index on the specified columns.
3719
3720 Parameters
3721 ----------
3722 columns : None, bool, or listlike[str]
3723 Indicate which columns to create an index on.
3724
3725 * False : Do not create any indexes.
3726 * True : Create indexes on all columns.
3727 * None : Create indexes on all columns.
3728 * listlike : Create indexes on the given columns.
3729
3730 optlevel : int or None, default None
3731 Optimization level, if None, pytables defaults to 6.
3732 kind : str or None, default None
3733 Kind of index, if None, pytables defaults to "medium".
3734
3735 Raises
3736 ------
3737 TypeError if trying to create an index on a complex-type column.
3738
3739 Notes
3740 -----
3741 Cannot index Time64Col or ComplexCol.
3742 Pytables must be >= 3.0.
3743 """
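        # Typical entry point (editorial sketch): users reach this via
        # HDFStore.create_table_index, e.g.
        #   store.append("df", df, data_columns=["A"])
        #   store.create_table_index("df", columns=["A"], optlevel=9, kind="full")
        # which forwards the same columns/optlevel/kind arguments here.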
3744 if not self.infer_axes():
3745 return
3746 if columns is False:
3747 return
3748
3749 # index all indexables and data_columns
3750 if columns is None or columns is True:
3751 columns = [a.cname for a in self.axes if a.is_data_indexable]
3752 if not isinstance(columns, (tuple, list)):
3753 columns = [columns]
3754
3755 kw = {}
3756 if optlevel is not None:
3757 kw["optlevel"] = optlevel
3758 if kind is not None:
3759 kw["kind"] = kind
3760
3761 table = self.table
3762 for c in columns:
3763 v = getattr(table.cols, c, None)
3764 if v is not None:
3765 # remove the index if the kind/optlevel have changed
3766 if v.is_indexed:
3767 index = v.index
3768 cur_optlevel = index.optlevel
3769 cur_kind = index.kind
3770
3771 if kind is not None and cur_kind != kind:
3772 v.remove_index()
3773 else:
3774 kw["kind"] = cur_kind
3775
3776 if optlevel is not None and cur_optlevel != optlevel:
3777 v.remove_index()
3778 else:
3779 kw["optlevel"] = cur_optlevel
3780
3781 # create the index
3782 if not v.is_indexed:
3783 if v.type.startswith("complex"):
3784 raise TypeError(
3785 "Columns containing complex values can be stored but "
3786 "cannot be indexed when using table format. Either use "
3787 "fixed format, set index=False, or do not include "
3788 "the columns containing complex values to "
3789 "data_columns when initializing the table."
3790 )
3791 v.create_index(**kw)
3792 elif c in self.non_index_axes[0][1]:
3793 # GH 28156
3794 raise AttributeError(
3795 f"column {c} is not a data_column.\n"
3796 f"In order to read column {c} you must reload the dataframe \n"
3797 f"into HDFStore and include {c} with the data_columns argument."
3798 )
3799
3800 def _read_axes(
3801 self, where, start: int | None = None, stop: int | None = None
3802 ) -> list[tuple[np.ndarray, np.ndarray] | tuple[Index, Index]]:
3803 """
3804 Create the axes sniffed from the table.
3805
3806 Parameters
3807 ----------
3808 where : ???
3809 start : int or None, default None
3810 stop : int or None, default None
3811
3812 Returns
3813 -------
3814 List[Tuple[index_values, column_values]]
3815 """
3816 # create the selection
3817 selection = Selection(self, where=where, start=start, stop=stop)
3818 values = selection.select()
3819
3820 results = []
3821 # convert the data
3822 for a in self.axes:
3823 a.set_info(self.info)
3824 res = a.convert(
3825 values,
3826 nan_rep=self.nan_rep,
3827 encoding=self.encoding,
3828 errors=self.errors,
3829 )
3830 results.append(res)
3831
3832 return results
3833
3834 @classmethod
3835 def get_object(cls, obj, transposed: bool):
3836 """return the data for this obj"""
3837 return obj
3838
3839 def validate_data_columns(self, data_columns, min_itemsize, non_index_axes):
3840 """
3841        take the input data_columns and min_itemsize and create a data
3842 columns spec
3843 """
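        # e.g. data_columns=True expands to every label on the non-index axis,
        # while dict-style min_itemsize keys such as {"A": 32} (illustrative)
        # are appended to the data-column list below.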
3844 if not len(non_index_axes):
3845 return []
3846
3847 axis, axis_labels = non_index_axes[0]
3848 info = self.info.get(axis, {})
3849 if info.get("type") == "MultiIndex" and data_columns:
3850 raise ValueError(
3851 f"cannot use a multi-index on axis [{axis}] with "
3852 f"data_columns {data_columns}"
3853 )
3854
3855 # evaluate the passed data_columns, True == use all columns
3856 # take only valid axis labels
3857 if data_columns is True:
3858 data_columns = list(axis_labels)
3859 elif data_columns is None:
3860 data_columns = []
3861
3862 # if min_itemsize is a dict, add the keys (exclude 'values')
3863 if isinstance(min_itemsize, dict):
3864 existing_data_columns = set(data_columns)
3865 data_columns = list(data_columns) # ensure we do not modify
3866 data_columns.extend(
3867 [
3868 k
3869 for k in min_itemsize.keys()
3870 if k != "values" and k not in existing_data_columns
3871 ]
3872 )
3873
3874 # return valid columns in the order of our axis
3875 return [c for c in data_columns if c in axis_labels]
3876
3877 def _create_axes(
3878 self,
3879 axes,
3880 obj: DataFrame,
3881 validate: bool = True,
3882 nan_rep=None,
3883 data_columns=None,
3884 min_itemsize=None,
3885 ):
3886 """
3887 Create and return the axes.
3888
3889 Parameters
3890 ----------
3891        axes : list or None
3892 The names or numbers of the axes to create.
3893 obj : DataFrame
3894 The object to create axes on.
3895 validate: bool, default True
3896 Whether to validate the obj against an existing object already written.
3897 nan_rep :
3898 A value to use for string column nan_rep.
3899 data_columns : List[str], True, or None, default None
3900 Specify the columns that we want to create to allow indexing on.
3901
3902 * True : Use all available columns.
3903 * None : Use no columns.
3904 * List[str] : Use the specified columns.
3905
3906 min_itemsize: Dict[str, int] or None, default None
3907 The min itemsize for a column in bytes.
3908 """
3909 if not isinstance(obj, DataFrame):
3910 group = self.group._v_name
3911 raise TypeError(
3912 f"cannot properly create the storer for: [group->{group},"
3913 f"value->{type(obj)}]"
3914 )
3915
3916 # set the default axes if needed
3917 if axes is None:
3918 axes = [0]
3919
3920 # map axes to numbers
3921 axes = [obj._get_axis_number(a) for a in axes]
3922
3923 # do we have an existing table (if so, use its axes & data_columns)
3924 if self.infer_axes():
3925 table_exists = True
3926 axes = [a.axis for a in self.index_axes]
3927 data_columns = list(self.data_columns)
3928 nan_rep = self.nan_rep
3929 # TODO: do we always have validate=True here?
3930 else:
3931 table_exists = False
3932
3933 new_info = self.info
3934
3935 assert self.ndim == 2 # with next check, we must have len(axes) == 1
        # we currently only support ndim-1 indexers
3937 if len(axes) != self.ndim - 1:
3938 raise ValueError(
3939 "currently only support ndim-1 indexers in an AppendableTable"
3940 )
3941
3942 # create according to the new data
3943 new_non_index_axes: list = []
3944
3945 # nan_representation
3946 if nan_rep is None:
3947 nan_rep = "nan"
3948
3949 # We construct the non-index-axis first, since that alters new_info
3950 idx = next(x for x in [0, 1] if x not in axes)
3951
3952 a = obj.axes[idx]
3953 # we might be able to change the axes on the appending data if necessary
3954 append_axis = list(a)
3955 if table_exists:
3956 indexer = len(new_non_index_axes) # i.e. 0
3957 exist_axis = self.non_index_axes[indexer][1]
3958 if not array_equivalent(
3959 np.array(append_axis),
3960 np.array(exist_axis),
3961 strict_nan=True,
3962 dtype_equal=True,
3963 ):
                # same labels in a different order -> adopt the existing ordering
3965 if array_equivalent(
3966 np.array(sorted(append_axis)),
3967 np.array(sorted(exist_axis)),
3968 strict_nan=True,
3969 dtype_equal=True,
3970 ):
3971 append_axis = exist_axis
3972
3973 # the non_index_axes info
3974 info = new_info.setdefault(idx, {})
3975 info["names"] = list(a.names)
3976 info["type"] = type(a).__name__
3977
3978 new_non_index_axes.append((idx, append_axis))
3979
3980 # Now we can construct our new index axis
3981 idx = axes[0]
3982 a = obj.axes[idx]
3983 axis_name = obj._get_axis_name(idx)
3984 new_index = _convert_index(axis_name, a, self.encoding, self.errors)
3985 new_index.axis = idx
3986
3987 # Because we are always 2D, there is only one new_index, so
3988 # we know it will have pos=0
3989 new_index.set_pos(0)
3990 new_index.update_info(new_info)
3991 new_index.maybe_set_size(min_itemsize) # check for column conflicts
3992
3993 new_index_axes = [new_index]
3994 j = len(new_index_axes) # i.e. 1
3995 assert j == 1
3996
3997 # reindex by our non_index_axes & compute data_columns
3998 assert len(new_non_index_axes) == 1
3999 for a in new_non_index_axes:
4000 obj = _reindex_axis(obj, a[0], a[1])
4001
4002 transposed = new_index.axis == 1
4003
4004 # figure out data_columns and get out blocks
4005 data_columns = self.validate_data_columns(
4006 data_columns, min_itemsize, new_non_index_axes
4007 )
4008
4009 frame = self.get_object(obj, transposed)._consolidate()
4010
4011 blocks, blk_items = self._get_blocks_and_items(
4012 frame, table_exists, new_non_index_axes, self.values_axes, data_columns
4013 )
4014
4015 # add my values
4016 vaxes = []
4017 for i, (blk, b_items) in enumerate(zip(blocks, blk_items)):
            # the shape of the data column is given by the indexable axes
4019 klass = DataCol
4020 name = None
4021
4022 # we have a data_column
4023 if data_columns and len(b_items) == 1 and b_items[0] in data_columns:
4024 klass = DataIndexableCol
4025 name = b_items[0]
4026 if not (name is None or isinstance(name, str)):
4027 # TODO: should the message here be more specifically non-str?
4028 raise ValueError("cannot have non-object label DataIndexableCol")
4029
4030 # make sure that we match up the existing columns
4031 # if we have an existing table
4032 existing_col: DataCol | None
4033
4034 if table_exists and validate:
4035 try:
4036 existing_col = self.values_axes[i]
4037 except (IndexError, KeyError) as err:
4038 raise ValueError(
4039 f"Incompatible appended table [{blocks}]"
4040 f"with existing table [{self.values_axes}]"
4041 ) from err
4042 else:
4043 existing_col = None
4044
4045 new_name = name or f"values_block_{i}"
4046 data_converted = _maybe_convert_for_string_atom(
4047 new_name,
4048 blk.values,
4049 existing_col=existing_col,
4050 min_itemsize=min_itemsize,
4051 nan_rep=nan_rep,
4052 encoding=self.encoding,
4053 errors=self.errors,
4054 columns=b_items,
4055 )
4056 adj_name = _maybe_adjust_name(new_name, self.version)
4057
4058 typ = klass._get_atom(data_converted)
4059 kind = _dtype_to_kind(data_converted.dtype.name)
4060 tz = None
4061 if getattr(data_converted, "tz", None) is not None:
4062 tz = _get_tz(data_converted.tz)
4063
4064 meta = metadata = ordered = None
4065 if isinstance(data_converted.dtype, CategoricalDtype):
4066 ordered = data_converted.ordered
4067 meta = "category"
4068 metadata = np.asarray(data_converted.categories).ravel()
4069
4070 data, dtype_name = _get_data_and_dtype_name(data_converted)
4071
4072 col = klass(
4073 name=adj_name,
4074 cname=new_name,
4075 values=list(b_items),
4076 typ=typ,
4077 pos=j,
4078 kind=kind,
4079 tz=tz,
4080 ordered=ordered,
4081 meta=meta,
4082 metadata=metadata,
4083 dtype=dtype_name,
4084 data=data,
4085 )
4086 col.update_info(new_info)
4087
4088 vaxes.append(col)
4089
4090 j += 1
4091
4092 dcs = [col.name for col in vaxes if col.is_data_indexable]
4093
4094 new_table = type(self)(
4095 parent=self.parent,
4096 group=self.group,
4097 encoding=self.encoding,
4098 errors=self.errors,
4099 index_axes=new_index_axes,
4100 non_index_axes=new_non_index_axes,
4101 values_axes=vaxes,
4102 data_columns=dcs,
4103 info=new_info,
4104 nan_rep=nan_rep,
4105 )
4106 if hasattr(self, "levels"):
4107 # TODO: get this into constructor, only for appropriate subclass
4108 new_table.levels = self.levels
4109
4110 new_table.validate_min_itemsize(min_itemsize)
4111
4112 if validate and table_exists:
4113 new_table.validate(self)
4114
4115 return new_table
4116
4117 @staticmethod
4118 def _get_blocks_and_items(
4119 frame: DataFrame,
4120 table_exists: bool,
4121 new_non_index_axes,
4122 values_axes,
4123 data_columns,
4124 ):
4125 # Helper to clarify non-state-altering parts of _create_axes
4126
4127 # TODO(ArrayManager) HDFStore relies on accessing the blocks
4128 if isinstance(frame._mgr, ArrayManager):
4129 frame = frame._as_manager("block")
4130
4131 def get_blk_items(mgr):
4132 return [mgr.items.take(blk.mgr_locs) for blk in mgr.blocks]
4133
4134 mgr = frame._mgr
4135 mgr = cast(BlockManager, mgr)
4136 blocks: list[Block] = list(mgr.blocks)
4137 blk_items: list[Index] = get_blk_items(mgr)
4138
4139 if len(data_columns):
4140 # TODO: prove that we only get here with axis == 1?
4141 # It is the case in all extant tests, but NOT the case
4142 # outside this `if len(data_columns)` check.
4143
4144 axis, axis_labels = new_non_index_axes[0]
4145 new_labels = Index(axis_labels).difference(Index(data_columns))
4146 mgr = frame.reindex(new_labels, axis=axis)._mgr
4147 mgr = cast(BlockManager, mgr)
4148
4149 blocks = list(mgr.blocks)
4150 blk_items = get_blk_items(mgr)
4151 for c in data_columns:
4152 # This reindex would raise ValueError if we had a duplicate
4153 # index, so we can infer that (as long as axis==1) we
4154 # get a single column back, so a single block.
4155 mgr = frame.reindex([c], axis=axis)._mgr
4156 mgr = cast(BlockManager, mgr)
4157 blocks.extend(mgr.blocks)
4158 blk_items.extend(get_blk_items(mgr))
4159
4160 # reorder the blocks in the same order as the existing table if we can
4161 if table_exists:
4162 by_items = {
4163 tuple(b_items.tolist()): (b, b_items)
4164 for b, b_items in zip(blocks, blk_items)
4165 }
4166 new_blocks: list[Block] = []
4167 new_blk_items = []
4168 for ea in values_axes:
4169 items = tuple(ea.values)
4170 try:
4171 b, b_items = by_items.pop(items)
4172 new_blocks.append(b)
4173 new_blk_items.append(b_items)
4174 except (IndexError, KeyError) as err:
4175 jitems = ",".join([pprint_thing(item) for item in items])
4176 raise ValueError(
4177 f"cannot match existing table structure for [{jitems}] "
4178 "on appending data"
4179 ) from err
4180 blocks = new_blocks
4181 blk_items = new_blk_items
4182
4183 return blocks, blk_items
4184
4185 def process_axes(self, obj, selection: Selection, columns=None) -> DataFrame:
4186 """process axes filters"""
4187 # make a copy to avoid side effects
4188 if columns is not None:
4189 columns = list(columns)
4190
4191 # make sure to include levels if we have them
4192 if columns is not None and self.is_multi_index:
4193 assert isinstance(self.levels, list) # assured by is_multi_index
4194 for n in self.levels:
4195 if n not in columns:
4196 columns.insert(0, n)
4197
4198 # reorder by any non_index_axes & limit to the select columns
4199 for axis, labels in self.non_index_axes:
4200 obj = _reindex_axis(obj, axis, labels, columns)
4201
4202 def process_filter(field, filt, op):
4203 for axis_name in obj._AXIS_ORDERS:
4204 axis_number = obj._get_axis_number(axis_name)
4205 axis_values = obj._get_axis(axis_name)
4206 assert axis_number is not None
4207
4208 # see if the field is the name of an axis
4209 if field == axis_name:
                    # if we have a multi-index, then we need to include
                    # the levels
4212 if self.is_multi_index:
4213 filt = filt.union(Index(self.levels))
4214
4215 takers = op(axis_values, filt)
4216 return obj.loc(axis=axis_number)[takers]
4217
                # this might be the name of a field IN an axis
4219 elif field in axis_values:
4220 # we need to filter on this dimension
4221 values = ensure_index(getattr(obj, field).values)
4222 filt = ensure_index(filt)
4223
4224 # hack until we support reversed dim flags
4225 if isinstance(obj, DataFrame):
4226 axis_number = 1 - axis_number
4227
4228 takers = op(values, filt)
4229 return obj.loc(axis=axis_number)[takers]
4230
4231 raise ValueError(f"cannot find the field [{field}] for filtering!")
4232
4233 # apply the selection filters (but keep in the same order)
4234 if selection.filter is not None:
4235 for field, op, filt in selection.filter.format():
4236 obj = process_filter(field, filt, op)
4237
4238 return obj
4239
4240 def create_description(
4241 self,
4242 complib,
4243 complevel: int | None,
4244 fletcher32: bool,
4245 expectedrows: int | None,
4246 ) -> dict[str, Any]:
4247 """create the description of the table from the axes & values"""
        # use the provided expectedrows if it is passed
4249 if expectedrows is None:
4250 expectedrows = max(self.nrows_expected, 10000)
4251
4252 d = {"name": "table", "expectedrows": expectedrows}
4253
4254 # description from the axes & values
4255 d["description"] = {a.cname: a.typ for a in self.axes}
4256
4257 if complib:
4258 if complevel is None:
4259 complevel = self._complevel or 9
4260 filters = _tables().Filters(
4261 complevel=complevel,
4262 complib=complib,
4263 fletcher32=fletcher32 or self._fletcher32,
4264 )
4265 d["filters"] = filters
4266 elif self._filters is not None:
4267 d["filters"] = self._filters
4268
4269 return d
4270
4271 def read_coordinates(
4272 self, where=None, start: int | None = None, stop: int | None = None
4273 ):
4274 """
4275 select coordinates (row numbers) from a table; return the
4276 coordinates object
4277 """
4278 # validate the version
4279 self.validate_version(where)
4280
4281 # infer the data kind
4282 if not self.infer_axes():
4283 return False
4284
4285 # create the selection
4286 selection = Selection(self, where=where, start=start, stop=stop)
4287 coords = selection.select_coords()
4288 if selection.filter is not None:
4289 for field, op, filt in selection.filter.format():
4290 data = self.read_column(
4291 field, start=coords.min(), stop=coords.max() + 1
4292 )
4293 coords = coords[op(data.iloc[coords - coords.min()], filt).values]
4294
4295 return Index(coords)
4296
4297 def read_column(
4298 self,
4299 column: str,
4300 where=None,
4301 start: int | None = None,
4302 stop: int | None = None,
4303 ):
4304 """
        return a single column from the table; generally only indexables
        or data columns are interesting
4307 """
4308 # validate the version
4309 self.validate_version()
4310
4311 # infer the data kind
4312 if not self.infer_axes():
4313 return False
4314
4315 if where is not None:
4316 raise TypeError("read_column does not currently accept a where clause")
4317
4318 # find the axes
4319 for a in self.axes:
4320 if column == a.name:
4321 if not a.is_data_indexable:
4322 raise ValueError(
4323 f"column [{column}] can not be extracted individually; "
4324 "it is not data indexable"
4325 )
4326
4327 # column must be an indexable or a data column
4328 c = getattr(self.table.cols, column)
4329 a.set_info(self.info)
4330 col_values = a.convert(
4331 c[start:stop],
4332 nan_rep=self.nan_rep,
4333 encoding=self.encoding,
4334 errors=self.errors,
4335 )
4336 return Series(_set_tz(col_values[1], a.tz), name=column, copy=False)
4337
4338 raise KeyError(f"column [{column}] not found in the table")
4339
4340
4341class WORMTable(Table):
4342 """
    a write-once read-many table: this format DOES NOT ALLOW appending to a
    table. Writing is a one-time operation; the data are stored in a format
    that allows for searching the data on disk
4346 """
4347
4348 table_type = "worm"
4349
4350 def read(
4351 self,
4352 where=None,
4353 columns=None,
4354 start: int | None = None,
4355 stop: int | None = None,
4356 ):
4357 """
4358 read the indices and the indexing array, calculate offset rows and return
4359 """
4360 raise NotImplementedError("WORMTable needs to implement read")
4361
4362 def write(self, obj, **kwargs) -> None:
4363 """
        write in a format that we can search later on (but cannot append
        to): write out the indices and the values using _write_array
        (e.g. a CArray), and create an indexing table so that we can search
4367 """
4368 raise NotImplementedError("WORMTable needs to implement write")
4369
4370
4371class AppendableTable(Table):
4372 """support the new appendable table formats"""
4373
4374 table_type = "appendable"
4375
4376 # error: Signature of "write" incompatible with supertype "Fixed"
4377 def write( # type: ignore[override]
4378 self,
4379 obj,
4380 axes=None,
4381 append: bool = False,
4382 complib=None,
4383 complevel=None,
4384 fletcher32=None,
4385 min_itemsize=None,
4386 chunksize: int | None = None,
4387 expectedrows=None,
4388 dropna: bool = False,
4389 nan_rep=None,
4390 data_columns=None,
4391 track_times: bool = True,
4392 ) -> None:
4393 if not append and self.is_exists:
4394 self._handle.remove_node(self.group, "table")
4395
4396 # create the axes
4397 table = self._create_axes(
4398 axes=axes,
4399 obj=obj,
4400 validate=append,
4401 min_itemsize=min_itemsize,
4402 nan_rep=nan_rep,
4403 data_columns=data_columns,
4404 )
4405
4406 for a in table.axes:
4407 a.validate_names()
4408
4409 if not table.is_exists:
4410 # create the table
4411 options = table.create_description(
4412 complib=complib,
4413 complevel=complevel,
4414 fletcher32=fletcher32,
4415 expectedrows=expectedrows,
4416 )
4417
4418 # set the table attributes
4419 table.set_attrs()
4420
4421 options["track_times"] = track_times
4422
4423 # create the table
4424 table._handle.create_table(table.group, **options)
4425
4426 # update my info
4427 table.attrs.info = table.info
4428
4429 # validate the axes and set the kinds
4430 for a in table.axes:
4431 a.validate_and_set(table, append)
4432
4433 # add the rows
4434 table.write_data(chunksize, dropna=dropna)
4435
4436 def write_data(self, chunksize: int | None, dropna: bool = False) -> None:
4437 """
        we form the data into a 2-d array including indexes, values and mask, and write it chunk-by-chunk
4439 """
4440 names = self.dtype.names
4441 nrows = self.nrows_expected
4442
        # if dropna==True, then drop rows where ALL values are nan
4444 masks = []
4445 if dropna:
4446 for a in self.values_axes:
4447 # figure the mask: only do if we can successfully process this
4448 # column, otherwise ignore the mask
4449 mask = isna(a.data).all(axis=0)
4450 if isinstance(mask, np.ndarray):
4451 masks.append(mask.astype("u1", copy=False))
4452
4453 # consolidate masks
4454 if len(masks):
4455 mask = masks[0]
4456 for m in masks[1:]:
4457 mask = mask & m
4458 mask = mask.ravel()
4459 else:
4460 mask = None
4461
4462 # broadcast the indexes if needed
4463 indexes = [a.cvalues for a in self.index_axes]
4464 nindexes = len(indexes)
        assert nindexes == 1, nindexes  # ensures we don't need to broadcast
4466
4467 # transpose the values so first dimension is last
4468 # reshape the values if needed
4469 values = [a.take_data() for a in self.values_axes]
4470 values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) for v in values]
4471 bvalues = []
4472 for i, v in enumerate(values):
4473 new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape
4474 bvalues.append(v.reshape(new_shape))
4475
4476 # write the chunks
4477 if chunksize is None:
4478 chunksize = 100000
4479
4480 rows = np.empty(min(chunksize, nrows), dtype=self.dtype)
4481 chunks = nrows // chunksize + 1
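        # e.g. nrows=250_000 with chunksize=100_000 writes the row slices
        # [0:100000), [100000:200000), [200000:250000) (illustrative)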
4482 for i in range(chunks):
4483 start_i = i * chunksize
4484 end_i = min((i + 1) * chunksize, nrows)
4485 if start_i >= end_i:
4486 break
4487
4488 self.write_data_chunk(
4489 rows,
4490 indexes=[a[start_i:end_i] for a in indexes],
4491 mask=mask[start_i:end_i] if mask is not None else None,
4492 values=[v[start_i:end_i] for v in bvalues],
4493 )
4494
4495 def write_data_chunk(
4496 self,
4497 rows: np.ndarray,
4498 indexes: list[np.ndarray],
4499 mask: npt.NDArray[np.bool_] | None,
4500 values: list[np.ndarray],
4501 ) -> None:
4502 """
4503 Parameters
4504 ----------
        rows : np.ndarray
            An empty record array used as scratch space for the chunk.
        indexes : list of np.ndarray
            The index values for the chunk.
        mask : np.ndarray or None
            Marks rows to drop (rows where all values are NaN when dropna=True).
        values : list of np.ndarray
            The value arrays for the chunk.
4509 """
4510 # 0 len
4511 for v in values:
4512 if not np.prod(v.shape):
4513 return
4514
4515 nrows = indexes[0].shape[0]
4516 if nrows != len(rows):
4517 rows = np.empty(nrows, dtype=self.dtype)
4518 names = self.dtype.names
4519 nindexes = len(indexes)
4520
4521 # indexes
4522 for i, idx in enumerate(indexes):
4523 rows[names[i]] = idx
4524
4525 # values
4526 for i, v in enumerate(values):
4527 rows[names[i + nindexes]] = v
4528
4529 # mask
4530 if mask is not None:
4531 m = ~mask.ravel().astype(bool, copy=False)
4532 if not m.all():
4533 rows = rows[m]
4534
4535 if len(rows):
4536 self.table.append(rows)
4537 self.table.flush()
4538
4539 def delete(self, where=None, start: int | None = None, stop: int | None = None):
4540 # delete all rows (and return the nrows)
4541 if where is None or not len(where):
4542 if start is None and stop is None:
4543 nrows = self.nrows
4544 self._handle.remove_node(self.group, recursive=True)
4545 else:
4546 # pytables<3.0 would remove a single row with stop=None
4547 if stop is None:
4548 stop = self.nrows
4549 nrows = self.table.remove_rows(start=start, stop=stop)
4550 self.table.flush()
4551 return nrows
4552
4553 # infer the data kind
4554 if not self.infer_axes():
4555 return None
4556
4557 # create the selection
4558 table = self.table
4559 selection = Selection(self, where, start=start, stop=stop)
4560 values = selection.select_coords()
4561
4562 # delete the rows in reverse order
4563 sorted_series = Series(values, copy=False).sort_values()
4564 ln = len(sorted_series)
4565
4566 if ln:
4567 # construct groups of consecutive rows
4568 diff = sorted_series.diff()
4569 groups = list(diff[diff > 1].index)
4570
4571 # 1 group
4572 if not len(groups):
4573 groups = [0]
4574
4575 # final element
4576 if groups[-1] != ln:
4577 groups.append(ln)
4578
4579 # initial element
4580 if groups[0] != 0:
4581 groups.insert(0, 0)
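            # e.g. sorted coordinates [2, 3, 4, 8, 9] -> groups == [0, 3, 5];
            # rows 8..9 are removed first, then rows 2..4 (illustrative)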
4582
4583 # we must remove in reverse order!
4584 pg = groups.pop()
4585 for g in reversed(groups):
4586 rows = sorted_series.take(range(g, pg))
4587 table.remove_rows(
4588 start=rows[rows.index[0]], stop=rows[rows.index[-1]] + 1
4589 )
4590 pg = g
4591
4592 self.table.flush()
4593
4594 # return the number of rows removed
4595 return ln
4596
4597
4598class AppendableFrameTable(AppendableTable):
4599 """support the new appendable table formats"""
4600
4601 pandas_kind = "frame_table"
4602 table_type = "appendable_frame"
4603 ndim = 2
4604 obj_type: type[DataFrame | Series] = DataFrame
4605
4606 @property
4607 def is_transposed(self) -> bool:
4608 return self.index_axes[0].axis == 1
4609
4610 @classmethod
4611 def get_object(cls, obj, transposed: bool):
4612 """these are written transposed"""
4613 if transposed:
4614 obj = obj.T
4615 return obj
4616
4617 def read(
4618 self,
4619 where=None,
4620 columns=None,
4621 start: int | None = None,
4622 stop: int | None = None,
4623 ):
4624 # validate the version
4625 self.validate_version(where)
4626
4627 # infer the data kind
4628 if not self.infer_axes():
4629 return None
4630
4631 result = self._read_axes(where=where, start=start, stop=stop)
4632
4633 info = (
4634 self.info.get(self.non_index_axes[0][0], {})
4635 if len(self.non_index_axes)
4636 else {}
4637 )
4638
4639 inds = [i for i, ax in enumerate(self.axes) if ax is self.index_axes[0]]
4640 assert len(inds) == 1
4641 ind = inds[0]
4642
4643 index = result[ind][0]
4644
4645 frames = []
4646 for i, a in enumerate(self.axes):
4647 if a not in self.values_axes:
4648 continue
4649 index_vals, cvalues = result[i]
4650
4651 # we could have a multi-index constructor here
            # ensure_index doesn't recognize our list-of-tuples here
4653 if info.get("type") != "MultiIndex":
4654 cols = Index(index_vals)
4655 else:
4656 cols = MultiIndex.from_tuples(index_vals)
4657
4658 names = info.get("names")
4659 if names is not None:
4660 cols.set_names(names, inplace=True)
4661
4662 if self.is_transposed:
4663 values = cvalues
4664 index_ = cols
4665 cols_ = Index(index, name=getattr(index, "name", None))
4666 else:
4667 values = cvalues.T
4668 index_ = Index(index, name=getattr(index, "name", None))
4669 cols_ = cols
4670
4671 # if we have a DataIndexableCol, its shape will only be 1 dim
4672 if values.ndim == 1 and isinstance(values, np.ndarray):
4673 values = values.reshape((1, values.shape[0]))
4674
4675 if isinstance(values, np.ndarray):
4676 df = DataFrame(values.T, columns=cols_, index=index_, copy=False)
4677 elif isinstance(values, Index):
4678 df = DataFrame(values, columns=cols_, index=index_)
4679 else:
4680 # Categorical
4681 df = DataFrame._from_arrays([values], columns=cols_, index=index_)
4682 if not (using_pyarrow_string_dtype() and values.dtype.kind == "O"):
4683 assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
4684 if using_pyarrow_string_dtype() and is_string_array(
4685 values, # type: ignore[arg-type]
4686 skipna=True,
4687 ):
4688 df = df.astype("string[pyarrow_numpy]")
4689 frames.append(df)
4690
4691 if len(frames) == 1:
4692 df = frames[0]
4693 else:
4694 df = concat(frames, axis=1)
4695
4696 selection = Selection(self, where=where, start=start, stop=stop)
4697 # apply the selection filters & axis orderings
4698 df = self.process_axes(df, selection=selection, columns=columns)
4699 return df
4700
4701
4702class AppendableSeriesTable(AppendableFrameTable):
4703 """support the new appendable table formats"""
4704
4705 pandas_kind = "series_table"
4706 table_type = "appendable_series"
4707 ndim = 2
4708 obj_type = Series
4709
4710 @property
4711 def is_transposed(self) -> bool:
4712 return False
4713
4714 @classmethod
4715 def get_object(cls, obj, transposed: bool):
4716 return obj
4717
4718 # error: Signature of "write" incompatible with supertype "Fixed"
4719 def write(self, obj, data_columns=None, **kwargs) -> None: # type: ignore[override]
4720 """we are going to write this as a frame table"""
4721 if not isinstance(obj, DataFrame):
4722 name = obj.name or "values"
4723 obj = obj.to_frame(name)
4724 super().write(obj=obj, data_columns=obj.columns.tolist(), **kwargs)
4725
4726 def read(
4727 self,
4728 where=None,
4729 columns=None,
4730 start: int | None = None,
4731 stop: int | None = None,
4732 ) -> Series:
4733 is_multi_index = self.is_multi_index
4734 if columns is not None and is_multi_index:
4735 assert isinstance(self.levels, list) # needed for mypy
4736 for n in self.levels:
4737 if n not in columns:
4738 columns.insert(0, n)
4739 s = super().read(where=where, columns=columns, start=start, stop=stop)
4740 if is_multi_index:
4741 s.set_index(self.levels, inplace=True)
4742
4743 s = s.iloc[:, 0]
4744
4745 # remove the default name
4746 if s.name == "values":
4747 s.name = None
4748 return s
4749
4750
4751class AppendableMultiSeriesTable(AppendableSeriesTable):
4752 """support the new appendable table formats"""
4753
4754 pandas_kind = "series_table"
4755 table_type = "appendable_multiseries"
4756
4757 # error: Signature of "write" incompatible with supertype "Fixed"
4758 def write(self, obj, **kwargs) -> None: # type: ignore[override]
4759 """we are going to write this as a frame table"""
4760 name = obj.name or "values"
4761 newobj, self.levels = self.validate_multiindex(obj)
4762 assert isinstance(self.levels, list) # for mypy
4763 cols = list(self.levels)
4764 cols.append(name)
4765 newobj.columns = Index(cols)
4766 super().write(obj=newobj, **kwargs)
4767
4768
4769class GenericTable(AppendableFrameTable):
4770 """a table that read/writes the generic pytables table format"""
4771
4772 pandas_kind = "frame_table"
4773 table_type = "generic_table"
4774 ndim = 2
4775 obj_type = DataFrame
4776 levels: list[Hashable]
4777
4778 @property
4779 def pandas_type(self) -> str:
4780 return self.pandas_kind
4781
4782 @property
4783 def storable(self):
4784 return getattr(self.group, "table", None) or self.group
4785
4786 def get_attrs(self) -> None:
4787 """retrieve our attributes"""
4788 self.non_index_axes = []
4789 self.nan_rep = None
4790 self.levels = []
4791
4792 self.index_axes = [a for a in self.indexables if a.is_an_indexable]
4793 self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
4794 self.data_columns = [a.name for a in self.values_axes]
4795
4796 @cache_readonly
4797 def indexables(self):
4798 """create the indexables from the table description"""
4799 d = self.description
4800
4801 # TODO: can we get a typ for this? AFAICT it is the only place
4802 # where we aren't passing one
        # the index column is just a simple index
4804 md = self.read_metadata("index")
4805 meta = "category" if md is not None else None
4806 index_col = GenericIndexCol(
4807 name="index", axis=0, table=self.table, meta=meta, metadata=md
4808 )
4809
4810 _indexables: list[GenericIndexCol | GenericDataIndexableCol] = [index_col]
4811
4812 for i, n in enumerate(d._v_names):
4813 assert isinstance(n, str)
4814
4815 atom = getattr(d, n)
4816 md = self.read_metadata(n)
4817 meta = "category" if md is not None else None
4818 dc = GenericDataIndexableCol(
4819 name=n,
4820 pos=i,
4821 values=[n],
4822 typ=atom,
4823 table=self.table,
4824 meta=meta,
4825 metadata=md,
4826 )
4827 _indexables.append(dc)
4828
4829 return _indexables
4830
4831 # error: Signature of "write" incompatible with supertype "AppendableTable"
4832 def write(self, **kwargs) -> None: # type: ignore[override]
4833 raise NotImplementedError("cannot write on an generic table")
4834
4835
4836class AppendableMultiFrameTable(AppendableFrameTable):
4837 """a frame with a multi-index"""
4838
4839 table_type = "appendable_multiframe"
4840 obj_type = DataFrame
4841 ndim = 2
4842 _re_levels = re.compile(r"^level_\d+$")
4843
4844 @property
4845 def table_type_short(self) -> str:
4846 return "appendable_multi"
4847
4848 # error: Signature of "write" incompatible with supertype "Fixed"
4849 def write(self, obj, data_columns=None, **kwargs) -> None: # type: ignore[override]
4850 if data_columns is None:
4851 data_columns = []
4852 elif data_columns is True:
4853 data_columns = obj.columns.tolist()
4854 obj, self.levels = self.validate_multiindex(obj)
4855 assert isinstance(self.levels, list) # for mypy
4856 for n in self.levels:
4857 if n not in data_columns:
4858 data_columns.insert(0, n)
4859 super().write(obj=obj, data_columns=data_columns, **kwargs)
4860
4861 def read(
4862 self,
4863 where=None,
4864 columns=None,
4865 start: int | None = None,
4866 stop: int | None = None,
4867 ):
4868 df = super().read(where=where, columns=columns, start=start, stop=stop)
4869 df = df.set_index(self.levels)
4870
4871 # remove names for 'level_%d'
4872 df.index = df.index.set_names(
4873 [None if self._re_levels.search(name) else name for name in df.index.names]
4874 )
4875
4876 return df
4877
4878
4879def _reindex_axis(
4880 obj: DataFrame, axis: AxisInt, labels: Index, other=None
4881) -> DataFrame:
4882 ax = obj._get_axis(axis)
4883 labels = ensure_index(labels)
4884
4885 # try not to reindex even if other is provided
4886 # if it equals our current index
4887 if other is not None:
4888 other = ensure_index(other)
4889 if (other is None or labels.equals(other)) and labels.equals(ax):
4890 return obj
4891
4892 labels = ensure_index(labels.unique())
4893 if other is not None:
4894 labels = ensure_index(other.unique()).intersection(labels, sort=False)
4895 if not labels.equals(ax):
4896 slicer: list[slice | Index] = [slice(None, None)] * obj.ndim
4897 slicer[axis] = labels
4898 obj = obj.loc[tuple(slicer)]
4899 return obj
4900
4901
4902# tz to/from coercion
4903
4904
4905def _get_tz(tz: tzinfo) -> str | tzinfo:
4906 """for a tz-aware type, return an encoded zone"""
4907 zone = timezones.get_timezone(tz)
4908 return zone
4909
4910
4911@overload
4912def _set_tz(
4913 values: np.ndarray | Index, tz: str | tzinfo, coerce: bool = False
4914) -> DatetimeIndex:
4915 ...
4916
4917
4918@overload
4919def _set_tz(values: np.ndarray | Index, tz: None, coerce: bool = False) -> np.ndarray:
4920 ...
4921
4922
4923def _set_tz(
4924 values: np.ndarray | Index, tz: str | tzinfo | None, coerce: bool = False
4925) -> np.ndarray | DatetimeIndex:
4926 """
    coerce the values to a DatetimeIndex if tz is set;
    preserve the input shape if possible

    Parameters
    ----------
    values : ndarray or Index
    tz : str, tzinfo, or None
    coerce : bool, default False
        If no timezone is passed, coerce to an M8[ns] ndarray.
4935 """
4936 if isinstance(values, DatetimeIndex):
4937 # If values is tzaware, the tz gets dropped in the values.ravel()
4938 # call below (which returns an ndarray). So we are only non-lossy
4939 # if `tz` matches `values.tz`.
4940 assert values.tz is None or values.tz == tz
4941 if values.tz is not None:
4942 return values
4943
4944 if tz is not None:
4945 if isinstance(values, DatetimeIndex):
4946 name = values.name
4947 else:
4948 name = None
4949 values = values.ravel()
4950
4951 tz = _ensure_decoded(tz)
4952 values = DatetimeIndex(values, name=name)
4953 values = values.tz_localize("UTC").tz_convert(tz)
4954 elif coerce:
4955 values = np.asarray(values, dtype="M8[ns]")
4956
4957 # error: Incompatible return value type (got "Union[ndarray, Index]",
4958 # expected "Union[ndarray, DatetimeIndex]")
4959 return values # type: ignore[return-value]
4960
4961
4962def _convert_index(name: str, index: Index, encoding: str, errors: str) -> IndexCol:
4963 assert isinstance(name, str)
4964
4965 index_name = index.name
4966 # error: Argument 1 to "_get_data_and_dtype_name" has incompatible type "Index";
4967 # expected "Union[ExtensionArray, ndarray]"
4968 converted, dtype_name = _get_data_and_dtype_name(index) # type: ignore[arg-type]
4969 kind = _dtype_to_kind(dtype_name)
4970 atom = DataIndexableCol._get_atom(converted)
4971
4972 if (
4973 lib.is_np_dtype(index.dtype, "iu")
4974 or needs_i8_conversion(index.dtype)
4975 or is_bool_dtype(index.dtype)
4976 ):
4977 # Includes Index, RangeIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex,
4978 # in which case "kind" is "integer", "integer", "datetime64",
4979 # "timedelta64", and "integer", respectively.
4980 return IndexCol(
4981 name,
4982 values=converted,
4983 kind=kind,
4984 typ=atom,
4985 freq=getattr(index, "freq", None),
4986 tz=getattr(index, "tz", None),
4987 index_name=index_name,
4988 )
4989
4990 if isinstance(index, MultiIndex):
4991 raise TypeError("MultiIndex not supported here!")
4992
4993 inferred_type = lib.infer_dtype(index, skipna=False)
4994 # we won't get inferred_type of "datetime64" or "timedelta64" as these
4995 # would go through the DatetimeIndex/TimedeltaIndex paths above
4996
4997 values = np.asarray(index)
4998
4999 if inferred_type == "date":
5000 converted = np.asarray([v.toordinal() for v in values], dtype=np.int32)
5001 return IndexCol(
5002 name, converted, "date", _tables().Time32Col(), index_name=index_name
5003 )
5004 elif inferred_type == "string":
5005 converted = _convert_string_array(values, encoding, errors)
5006 itemsize = converted.dtype.itemsize
5007 return IndexCol(
5008 name,
5009 converted,
5010 "string",
5011 _tables().StringCol(itemsize),
5012 index_name=index_name,
5013 )
5014
5015 elif inferred_type in ["integer", "floating"]:
5016 return IndexCol(
5017 name, values=converted, kind=kind, typ=atom, index_name=index_name
5018 )
5019 else:
5020 assert isinstance(converted, np.ndarray) and converted.dtype == object
5021 assert kind == "object", kind
5022 atom = _tables().ObjectAtom()
5023 return IndexCol(name, converted, kind, atom, index_name=index_name)
5024
5025
5026def _unconvert_index(data, kind: str, encoding: str, errors: str) -> np.ndarray | Index:
5027 index: Index | np.ndarray
5028
5029 if kind.startswith("datetime64"):
5030 if kind == "datetime64":
5031 # created before we stored resolution information
5032 index = DatetimeIndex(data)
5033 else:
5034 index = DatetimeIndex(data.view(kind))
5035 elif kind == "timedelta64":
5036 index = TimedeltaIndex(data)
5037 elif kind == "date":
5038 try:
5039 index = np.asarray([date.fromordinal(v) for v in data], dtype=object)
5040 except ValueError:
5041 index = np.asarray([date.fromtimestamp(v) for v in data], dtype=object)
5042 elif kind in ("integer", "float", "bool"):
5043 index = np.asarray(data)
5044 elif kind in ("string"):
5045 index = _unconvert_string_array(
5046 data, nan_rep=None, encoding=encoding, errors=errors
5047 )
5048 elif kind == "object":
5049 index = np.asarray(data[0])
5050 else: # pragma: no cover
5051 raise ValueError(f"unrecognized index type {kind}")
5052 return index
5053
5054
5055def _maybe_convert_for_string_atom(
5056 name: str,
5057 bvalues: ArrayLike,
5058 existing_col,
5059 min_itemsize,
5060 nan_rep,
5061 encoding,
5062 errors,
5063 columns: list[str],
5064):
5065 if bvalues.dtype != object:
5066 return bvalues
5067
5068 bvalues = cast(np.ndarray, bvalues)
5069
5070 dtype_name = bvalues.dtype.name
5071 inferred_type = lib.infer_dtype(bvalues, skipna=False)
5072
5073 if inferred_type == "date":
5074 raise TypeError("[date] is not implemented as a table column")
5075 if inferred_type == "datetime":
5076 # after GH#8260
5077 # this only would be hit for a multi-timezone dtype which is an error
5078 raise TypeError(
5079 "too many timezones in this block, create separate data columns"
5080 )
5081
5082 if not (inferred_type == "string" or dtype_name == "object"):
5083 return bvalues
5084
5085 mask = isna(bvalues)
5086 data = bvalues.copy()
5087 data[mask] = nan_rep
5088
5089 # see if we have a valid string type
5090 inferred_type = lib.infer_dtype(data, skipna=False)
5091 if inferred_type != "string":
5092 # we cannot serialize this data, so report an exception on a column
5093 # by column basis
5094
5095 # expected behaviour:
5096 # search block for a non-string object column by column
5097 for i in range(data.shape[0]):
5098 col = data[i]
5099 inferred_type = lib.infer_dtype(col, skipna=False)
5100 if inferred_type != "string":
5101 error_column_label = columns[i] if len(columns) > i else f"No.{i}"
5102 raise TypeError(
5103 f"Cannot serialize the column [{error_column_label}]\n"
5104 f"because its data contents are not [string] but "
5105 f"[{inferred_type}] object dtype"
5106 )
5107
5108 # itemsize is the maximum length of a string (along any dimension)
5109
5110 data_converted = _convert_string_array(data, encoding, errors).reshape(data.shape)
5111 itemsize = data_converted.itemsize
5112
5113 # specified min_itemsize?
5114 if isinstance(min_itemsize, dict):
5115 min_itemsize = int(min_itemsize.get(name) or min_itemsize.get("values") or 0)
5116 itemsize = max(min_itemsize or 0, itemsize)
5117
5118 # check for column in the values conflicts
5119 if existing_col is not None:
5120 eci = existing_col.validate_col(itemsize)
5121 if eci is not None and eci > itemsize:
5122 itemsize = eci
5123
5124 data_converted = data_converted.astype(f"|S{itemsize}", copy=False)
5125 return data_converted
5126
5127
5128def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.ndarray:
5129 """
5130 Take a string-like that is object dtype and coerce to a fixed size string type.
5131
5132 Parameters
5133 ----------
5134 data : np.ndarray[object]
5135 encoding : str
5136 errors : str
5137 Handler for encoding errors.
5138
5139 Returns
5140 -------
5141 np.ndarray[fixed-length-string]
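
    Examples
    --------
    Illustrative only:

    >>> _convert_string_array(np.array(["ab", "c"], dtype=object), "UTF-8", "strict")
    array([b'ab', b'c'], dtype='|S2')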
5142 """
5143 # encode if needed
5144 if len(data):
5145 data = (
5146 Series(data.ravel(), copy=False)
5147 .str.encode(encoding, errors)
5148 ._values.reshape(data.shape)
5149 )
5150
5151 # create the sized dtype
5152 ensured = ensure_object(data.ravel())
5153 itemsize = max(1, libwriters.max_len_string_array(ensured))
5154
5155 data = np.asarray(data, dtype=f"S{itemsize}")
5156 return data
5157
5158
5159def _unconvert_string_array(
5160 data: np.ndarray, nan_rep, encoding: str, errors: str
5161) -> np.ndarray:
5162 """
5163 Inverse of _convert_string_array.
5164
5165 Parameters
5166 ----------
5167 data : np.ndarray[fixed-length-string]
5168 nan_rep : the storage repr of NaN
5169 encoding : str
5170 errors : str
5171 Handler for encoding errors.
5172
5173 Returns
5174 -------
5175 np.ndarray[object]
5176 Decoded data.
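
    Examples
    --------
    Illustrative only:

    >>> _unconvert_string_array(
    ...     np.array([b"ab", b"nan"], dtype="S3"), "nan", "UTF-8", "strict"
    ... )
    array(['ab', nan], dtype=object)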
5177 """
5178 shape = data.shape
5179 data = np.asarray(data.ravel(), dtype=object)
5180
5181 if len(data):
5182 itemsize = libwriters.max_len_string_array(ensure_object(data))
5183 dtype = f"U{itemsize}"
5184
5185 if isinstance(data[0], bytes):
5186 data = Series(data, copy=False).str.decode(encoding, errors=errors)._values
5187 else:
5188 data = data.astype(dtype, copy=False).astype(object, copy=False)
5189
5190 if nan_rep is None:
5191 nan_rep = "nan"
5192
5193 libwriters.string_array_replace_from_nan_rep(data, nan_rep)
5194 return data.reshape(shape)
5195
5196
5197def _maybe_convert(values: np.ndarray, val_kind: str, encoding: str, errors: str):
5198 assert isinstance(val_kind, str), type(val_kind)
5199 if _need_convert(val_kind):
5200 conv = _get_converter(val_kind, encoding, errors)
5201 values = conv(values)
5202 return values
5203
5204
5205def _get_converter(kind: str, encoding: str, errors: str):
5206 if kind == "datetime64":
5207 return lambda x: np.asarray(x, dtype="M8[ns]")
5208 elif "datetime64" in kind:
5209 return lambda x: np.asarray(x, dtype=kind)
5210 elif kind == "string":
5211 return lambda x: _unconvert_string_array(
5212 x, nan_rep=None, encoding=encoding, errors=errors
5213 )
5214 else: # pragma: no cover
5215 raise ValueError(f"invalid kind {kind}")
5216
5217
5218def _need_convert(kind: str) -> bool:
5219 if kind in ("datetime64", "string") or "datetime64" in kind:
5220 return True
5221 return False
5222
5223
5224def _maybe_adjust_name(name: str, version: Sequence[int]) -> str:
5225 """
    Prior to 0.10.1, we named values blocks like ``values_0`` rather than
    ``values_block_0``; adjust the given name if necessary.
5228
5229 Parameters
5230 ----------
5231 name : str
5232 version : Tuple[int, int, int]
5233
5234 Returns
5235 -------
5236 str
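
    Examples
    --------
    Illustrative only:

    >>> _maybe_adjust_name("values_block_0", (0, 10, 0))
    'values_0'
    >>> _maybe_adjust_name("values_block_0", (1, 2, 3))
    'values_block_0'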
5237 """
5238 if isinstance(version, str) or len(version) < 3:
5239 raise ValueError("Version is incorrect, expected sequence of 3 integers.")
5240
5241 if version[0] == 0 and version[1] <= 10 and version[2] == 0:
5242 m = re.search(r"values_block_(\d+)", name)
5243 if m:
5244 grp = m.groups()[0]
5245 name = f"values_{grp}"
5246 return name
5247
5248
5249def _dtype_to_kind(dtype_str: str) -> str:
5250 """
5251 Find the "kind" string describing the given dtype name.
5252 """
5253 dtype_str = _ensure_decoded(dtype_str)
5254
5255 if dtype_str.startswith(("string", "bytes")):
5256 kind = "string"
5257 elif dtype_str.startswith("float"):
5258 kind = "float"
5259 elif dtype_str.startswith("complex"):
5260 kind = "complex"
5261 elif dtype_str.startswith(("int", "uint")):
5262 kind = "integer"
5263 elif dtype_str.startswith("datetime64"):
5264 kind = dtype_str
5265 elif dtype_str.startswith("timedelta"):
5266 kind = "timedelta64"
5267 elif dtype_str.startswith("bool"):
5268 kind = "bool"
5269 elif dtype_str.startswith("category"):
5270 kind = "category"
5271 elif dtype_str.startswith("period"):
5272 # We store the `freq` attr so we can restore from integers
5273 kind = "integer"
5274 elif dtype_str == "object":
5275 kind = "object"
5276 else:
5277 raise ValueError(f"cannot interpret dtype of [{dtype_str}]")
5278
5279 return kind
5280
5281
5282def _get_data_and_dtype_name(data: ArrayLike):
5283 """
5284 Convert the passed data into a storable form and a dtype string.
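
    Examples
    --------
    Illustrative only:

    >>> data, name = _get_data_and_dtype_name(np.array([1, 2], dtype="int64"))
    >>> data
    array([1, 2])
    >>> name
    'int64'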
5285 """
5286 if isinstance(data, Categorical):
5287 data = data.codes
5288
5289 if isinstance(data.dtype, DatetimeTZDtype):
5290 # For datetime64tz we need to drop the TZ in tests TODO: why?
5291 dtype_name = f"datetime64[{data.dtype.unit}]"
5292 else:
5293 dtype_name = data.dtype.name
5294
5295 if data.dtype.kind in "mM":
5296 data = np.asarray(data.view("i8"))
5297 # TODO: we used to reshape for the dt64tz case, but no longer
5298 # doing that doesn't seem to break anything. why?
5299
5300 elif isinstance(data, PeriodIndex):
5301 data = data.asi8
5302
5303 data = np.asarray(data)
5304 return data, dtype_name
5305
5306
5307class Selection:
5308 """
5309 Carries out a selection operation on a tables.Table object.
5310
5311 Parameters
5312 ----------
5313 table : a Table object
5314 where : list of Terms (or convertible to)
5315 start, stop: indices to start and/or stop selection
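
    Notes
    -----
    Constructed internally by the Table readers/writers, e.g.
    ``Selection(table, where="index > 5", start=0, stop=100)`` (illustrative).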
5316
5317 """
5318
5319 def __init__(
5320 self,
5321 table: Table,
5322 where=None,
5323 start: int | None = None,
5324 stop: int | None = None,
5325 ) -> None:
5326 self.table = table
5327 self.where = where
5328 self.start = start
5329 self.stop = stop
5330 self.condition = None
5331 self.filter = None
5332 self.terms = None
5333 self.coordinates = None
5334
5335 if is_list_like(where):
            # see if we have a passed coordinate-like
5337 with suppress(ValueError):
5338 inferred = lib.infer_dtype(where, skipna=False)
5339 if inferred in ("integer", "boolean"):
5340 where = np.asarray(where)
5341 if where.dtype == np.bool_:
5342 start, stop = self.start, self.stop
5343 if start is None:
5344 start = 0
5345 if stop is None:
5346 stop = self.table.nrows
5347 self.coordinates = np.arange(start, stop)[where]
5348 elif issubclass(where.dtype.type, np.integer):
5349 if (self.start is not None and (where < self.start).any()) or (
5350 self.stop is not None and (where >= self.stop).any()
5351 ):
5352 raise ValueError(
5353 "where must have index locations >= start and < stop"
5354 )
5355 self.coordinates = where
5356
5357 if self.coordinates is None:
5358 self.terms = self.generate(where)
5359
5360 # create the numexpr & the filter
5361 if self.terms is not None:
5362 self.condition, self.filter = self.terms.evaluate()
5363
5364 def generate(self, where):
5365 """where can be a : dict,list,tuple,string"""
5366 if where is None:
5367 return None
5368
5369 q = self.table.queryables()
5370 try:
5371 return PyTablesExpr(where, queryables=q, encoding=self.table.encoding)
5372 except NameError as err:
5373 # raise a nice message, suggesting that the user should use
5374 # data_columns
5375 qkeys = ",".join(q.keys())
5376 msg = dedent(
5377 f"""\
5378 The passed where expression: {where}
5379 contains an invalid variable reference
5380 all of the variable references must be a reference to
5381 an axis (e.g. 'index' or 'columns'), or a data_column
5382 The currently defined references are: {qkeys}
5383 """
5384 )
5385 raise ValueError(msg) from err
5386
5387 def select(self):
5388 """
5389 generate the selection
5390 """
5391 if self.condition is not None:
5392 return self.table.table.read_where(
5393 self.condition.format(), start=self.start, stop=self.stop
5394 )
5395 elif self.coordinates is not None:
5396 return self.table.table.read_coordinates(self.coordinates)
5397 return self.table.table.read(start=self.start, stop=self.stop)
5398
5399 def select_coords(self):
5400 """
5401 generate the selection
5402 """
5403 start, stop = self.start, self.stop
5404 nrows = self.table.nrows
5405 if start is None:
5406 start = 0
5407 elif start < 0:
5408 start += nrows
5409 if stop is None:
5410 stop = nrows
5411 elif stop < 0:
5412 stop += nrows
5413
5414 if self.condition is not None:
5415 return self.table.table.get_where_list(
5416 self.condition.format(), start=start, stop=stop, sort=True
5417 )
5418 elif self.coordinates is not None:
5419 return self.coordinates
5420
5421 return np.arange(start, stop)