Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/io/pytables.py: 19%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2High level interface to PyTables for reading and writing pandas data structures
3to disk
4"""
5from __future__ import annotations
7from contextlib import suppress
8import copy
9from datetime import (
10 date,
11 tzinfo,
12)
13import itertools
14import os
15import re
16from textwrap import dedent
17from typing import (
18 TYPE_CHECKING,
19 Any,
20 Callable,
21 Final,
22 Literal,
23 cast,
24 overload,
25)
26import warnings
28import numpy as np
30from pandas._config import (
31 config,
32 get_option,
33 using_copy_on_write,
34 using_pyarrow_string_dtype,
35)
37from pandas._libs import (
38 lib,
39 writers as libwriters,
40)
41from pandas._libs.lib import is_string_array
42from pandas._libs.tslibs import timezones
43from pandas.compat._optional import import_optional_dependency
44from pandas.compat.pickle_compat import patch_pickle
45from pandas.errors import (
46 AttributeConflictWarning,
47 ClosedFileError,
48 IncompatibilityWarning,
49 PerformanceWarning,
50 PossibleDataLossError,
51)
52from pandas.util._decorators import cache_readonly
53from pandas.util._exceptions import find_stack_level
55from pandas.core.dtypes.common import (
56 ensure_object,
57 is_bool_dtype,
58 is_complex_dtype,
59 is_list_like,
60 is_string_dtype,
61 needs_i8_conversion,
62)
63from pandas.core.dtypes.dtypes import (
64 CategoricalDtype,
65 DatetimeTZDtype,
66 ExtensionDtype,
67 PeriodDtype,
68)
69from pandas.core.dtypes.missing import array_equivalent
71from pandas import (
72 DataFrame,
73 DatetimeIndex,
74 Index,
75 MultiIndex,
76 PeriodIndex,
77 RangeIndex,
78 Series,
79 TimedeltaIndex,
80 concat,
81 isna,
82)
83from pandas.core.arrays import (
84 Categorical,
85 DatetimeArray,
86 PeriodArray,
87)
88import pandas.core.common as com
89from pandas.core.computation.pytables import (
90 PyTablesExpr,
91 maybe_expression,
92)
93from pandas.core.construction import extract_array
94from pandas.core.indexes.api import ensure_index
95from pandas.core.internals import (
96 ArrayManager,
97 BlockManager,
98)
100from pandas.io.common import stringify_path
101from pandas.io.formats.printing import (
102 adjoin,
103 pprint_thing,
104)
106if TYPE_CHECKING:
107 from collections.abc import (
108 Hashable,
109 Iterator,
110 Sequence,
111 )
112 from types import TracebackType
114 from tables import (
115 Col,
116 File,
117 Node,
118 )
120 from pandas._typing import (
121 AnyArrayLike,
122 ArrayLike,
123 AxisInt,
124 DtypeArg,
125 FilePath,
126 Self,
127 Shape,
128 npt,
129 )
131 from pandas.core.internals import Block
133# versioning attribute
134_version = "0.15.2"
136# encoding
137_default_encoding = "UTF-8"
140def _ensure_decoded(s):
141 """if we have bytes, decode them to unicode"""
142 if isinstance(s, np.bytes_):
143 s = s.decode("UTF-8")
144 return s
147def _ensure_encoding(encoding: str | None) -> str:
148 # set the encoding if we need
149 if encoding is None:
150 encoding = _default_encoding
152 return encoding
155def _ensure_str(name):
156 """
157 Ensure that an index / column name is a str (python 3); otherwise they
158 may be np.string dtype. Non-string dtypes are passed through unchanged.
160 https://github.com/pandas-dev/pandas/issues/13492
161 """
162 if isinstance(name, str):
163 name = str(name)
164 return name
167Term = PyTablesExpr
170def _ensure_term(where, scope_level: int):
171 """
172 Ensure that the where is a Term or a list of Term.
174 This makes sure that we are capturing the scope of variables that are
175 passed create the terms here with a frame_level=2 (we are 2 levels down)
176 """
177 # only consider list/tuple here as an ndarray is automatically a coordinate
178 # list
179 level = scope_level + 1
180 if isinstance(where, (list, tuple)):
181 where = [
182 Term(term, scope_level=level + 1) if maybe_expression(term) else term
183 for term in where
184 if term is not None
185 ]
186 elif maybe_expression(where):
187 where = Term(where, scope_level=level)
188 return where if where is None or len(where) else None
191incompatibility_doc: Final = """
192where criteria is being ignored as this version [%s] is too old (or
193not-defined), read the file in and write it out to a new file to upgrade (with
194the copy_to method)
195"""
197attribute_conflict_doc: Final = """
198the [%s] attribute of the existing index is [%s] which conflicts with the new
199[%s], resetting the attribute to None
200"""
202performance_doc: Final = """
203your performance may suffer as PyTables will pickle object types that it cannot
204map directly to c-types [inferred_type->%s,key->%s] [items->%s]
205"""
207# formats
208_FORMAT_MAP = {"f": "fixed", "fixed": "fixed", "t": "table", "table": "table"}
210# axes map
211_AXES_MAP = {DataFrame: [0]}
213# register our configuration options
214dropna_doc: Final = """
215: boolean
216 drop ALL nan rows when appending to a table
217"""
218format_doc: Final = """
219: format
220 default format writing format, if None, then
221 put will default to 'fixed' and append will default to 'table'
222"""
224with config.config_prefix("io.hdf"):
225 config.register_option("dropna_table", False, dropna_doc, validator=config.is_bool)
226 config.register_option(
227 "default_format",
228 None,
229 format_doc,
230 validator=config.is_one_of_factory(["fixed", "table", None]),
231 )
233# oh the troubles to reduce import time
234_table_mod = None
235_table_file_open_policy_is_strict = False
238def _tables():
239 global _table_mod
240 global _table_file_open_policy_is_strict
241 if _table_mod is None:
242 import tables
244 _table_mod = tables
246 # set the file open policy
247 # return the file open policy; this changes as of pytables 3.1
248 # depending on the HDF5 version
249 with suppress(AttributeError):
250 _table_file_open_policy_is_strict = (
251 tables.file._FILE_OPEN_POLICY == "strict"
252 )
254 return _table_mod
257# interface to/from ###
260def to_hdf(
261 path_or_buf: FilePath | HDFStore,
262 key: str,
263 value: DataFrame | Series,
264 mode: str = "a",
265 complevel: int | None = None,
266 complib: str | None = None,
267 append: bool = False,
268 format: str | None = None,
269 index: bool = True,
270 min_itemsize: int | dict[str, int] | None = None,
271 nan_rep=None,
272 dropna: bool | None = None,
273 data_columns: Literal[True] | list[str] | None = None,
274 errors: str = "strict",
275 encoding: str = "UTF-8",
276) -> None:
277 """store this object, close it if we opened it"""
278 if append:
279 f = lambda store: store.append(
280 key,
281 value,
282 format=format,
283 index=index,
284 min_itemsize=min_itemsize,
285 nan_rep=nan_rep,
286 dropna=dropna,
287 data_columns=data_columns,
288 errors=errors,
289 encoding=encoding,
290 )
291 else:
292 # NB: dropna is not passed to `put`
293 f = lambda store: store.put(
294 key,
295 value,
296 format=format,
297 index=index,
298 min_itemsize=min_itemsize,
299 nan_rep=nan_rep,
300 data_columns=data_columns,
301 errors=errors,
302 encoding=encoding,
303 dropna=dropna,
304 )
306 path_or_buf = stringify_path(path_or_buf)
307 if isinstance(path_or_buf, str):
308 with HDFStore(
309 path_or_buf, mode=mode, complevel=complevel, complib=complib
310 ) as store:
311 f(store)
312 else:
313 f(path_or_buf)
316def read_hdf(
317 path_or_buf: FilePath | HDFStore,
318 key=None,
319 mode: str = "r",
320 errors: str = "strict",
321 where: str | list | None = None,
322 start: int | None = None,
323 stop: int | None = None,
324 columns: list[str] | None = None,
325 iterator: bool = False,
326 chunksize: int | None = None,
327 **kwargs,
328):
329 """
330 Read from the store, close it if we opened it.
332 Retrieve pandas object stored in file, optionally based on where
333 criteria.
335 .. warning::
337 Pandas uses PyTables for reading and writing HDF5 files, which allows
338 serializing object-dtype data with pickle when using the "fixed" format.
339 Loading pickled data received from untrusted sources can be unsafe.
341 See: https://docs.python.org/3/library/pickle.html for more.
343 Parameters
344 ----------
345 path_or_buf : str, path object, pandas.HDFStore
346 Any valid string path is acceptable. Only supports the local file system,
347 remote URLs and file-like objects are not supported.
349 If you want to pass in a path object, pandas accepts any
350 ``os.PathLike``.
352 Alternatively, pandas accepts an open :class:`pandas.HDFStore` object.
354 key : object, optional
355 The group identifier in the store. Can be omitted if the HDF file
356 contains a single pandas object.
357 mode : {'r', 'r+', 'a'}, default 'r'
358 Mode to use when opening the file. Ignored if path_or_buf is a
359 :class:`pandas.HDFStore`. Default is 'r'.
360 errors : str, default 'strict'
361 Specifies how encoding and decoding errors are to be handled.
362 See the errors argument for :func:`open` for a full list
363 of options.
364 where : list, optional
365 A list of Term (or convertible) objects.
366 start : int, optional
367 Row number to start selection.
368 stop : int, optional
369 Row number to stop selection.
370 columns : list, optional
371 A list of columns names to return.
372 iterator : bool, optional
373 Return an iterator object.
374 chunksize : int, optional
375 Number of rows to include in an iteration when using an iterator.
376 **kwargs
377 Additional keyword arguments passed to HDFStore.
379 Returns
380 -------
381 object
382 The selected object. Return type depends on the object stored.
384 See Also
385 --------
386 DataFrame.to_hdf : Write a HDF file from a DataFrame.
387 HDFStore : Low-level access to HDF files.
389 Examples
390 --------
391 >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z']) # doctest: +SKIP
392 >>> df.to_hdf('./store.h5', 'data') # doctest: +SKIP
393 >>> reread = pd.read_hdf('./store.h5') # doctest: +SKIP
394 """
395 if mode not in ["r", "r+", "a"]:
396 raise ValueError(
397 f"mode {mode} is not allowed while performing a read. "
398 f"Allowed modes are r, r+ and a."
399 )
400 # grab the scope
401 if where is not None:
402 where = _ensure_term(where, scope_level=1)
404 if isinstance(path_or_buf, HDFStore):
405 if not path_or_buf.is_open:
406 raise OSError("The HDFStore must be open for reading.")
408 store = path_or_buf
409 auto_close = False
410 else:
411 path_or_buf = stringify_path(path_or_buf)
412 if not isinstance(path_or_buf, str):
413 raise NotImplementedError(
414 "Support for generic buffers has not been implemented."
415 )
416 try:
417 exists = os.path.exists(path_or_buf)
419 # if filepath is too long
420 except (TypeError, ValueError):
421 exists = False
423 if not exists:
424 raise FileNotFoundError(f"File {path_or_buf} does not exist")
426 store = HDFStore(path_or_buf, mode=mode, errors=errors, **kwargs)
427 # can't auto open/close if we are using an iterator
428 # so delegate to the iterator
429 auto_close = True
431 try:
432 if key is None:
433 groups = store.groups()
434 if len(groups) == 0:
435 raise ValueError(
436 "Dataset(s) incompatible with Pandas data types, "
437 "not table, or no datasets found in HDF5 file."
438 )
439 candidate_only_group = groups[0]
441 # For the HDF file to have only one dataset, all other groups
442 # should then be metadata groups for that candidate group. (This
443 # assumes that the groups() method enumerates parent groups
444 # before their children.)
445 for group_to_check in groups[1:]:
446 if not _is_metadata_of(group_to_check, candidate_only_group):
447 raise ValueError(
448 "key must be provided when HDF5 "
449 "file contains multiple datasets."
450 )
451 key = candidate_only_group._v_pathname
452 return store.select(
453 key,
454 where=where,
455 start=start,
456 stop=stop,
457 columns=columns,
458 iterator=iterator,
459 chunksize=chunksize,
460 auto_close=auto_close,
461 )
462 except (ValueError, TypeError, LookupError):
463 if not isinstance(path_or_buf, HDFStore):
464 # if there is an error, close the store if we opened it.
465 with suppress(AttributeError):
466 store.close()
468 raise
471def _is_metadata_of(group: Node, parent_group: Node) -> bool:
472 """Check if a given group is a metadata group for a given parent_group."""
473 if group._v_depth <= parent_group._v_depth:
474 return False
476 current = group
477 while current._v_depth > 1:
478 parent = current._v_parent
479 if parent == parent_group and current._v_name == "meta":
480 return True
481 current = current._v_parent
482 return False
485class HDFStore:
486 """
487 Dict-like IO interface for storing pandas objects in PyTables.
489 Either Fixed or Table format.
491 .. warning::
493 Pandas uses PyTables for reading and writing HDF5 files, which allows
494 serializing object-dtype data with pickle when using the "fixed" format.
495 Loading pickled data received from untrusted sources can be unsafe.
497 See: https://docs.python.org/3/library/pickle.html for more.
499 Parameters
500 ----------
501 path : str
502 File path to HDF5 file.
503 mode : {'a', 'w', 'r', 'r+'}, default 'a'
505 ``'r'``
506 Read-only; no data can be modified.
507 ``'w'``
508 Write; a new file is created (an existing file with the same
509 name would be deleted).
510 ``'a'``
511 Append; an existing file is opened for reading and writing,
512 and if the file does not exist it is created.
513 ``'r+'``
514 It is similar to ``'a'``, but the file must already exist.
515 complevel : int, 0-9, default None
516 Specifies a compression level for data.
517 A value of 0 or None disables compression.
518 complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
519 Specifies the compression library to be used.
520 These additional compressors for Blosc are supported
521 (default if no compressor specified: 'blosc:blosclz'):
522 {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
523 'blosc:zlib', 'blosc:zstd'}.
524 Specifying a compression library which is not available issues
525 a ValueError.
526 fletcher32 : bool, default False
527 If applying compression use the fletcher32 checksum.
528 **kwargs
529 These parameters will be passed to the PyTables open_file method.
531 Examples
532 --------
533 >>> bar = pd.DataFrame(np.random.randn(10, 4))
534 >>> store = pd.HDFStore('test.h5')
535 >>> store['foo'] = bar # write to HDF5
536 >>> bar = store['foo'] # retrieve
537 >>> store.close()
539 **Create or load HDF5 file in-memory**
541 When passing the `driver` option to the PyTables open_file method through
542 **kwargs, the HDF5 file is loaded or created in-memory and will only be
543 written when closed:
545 >>> bar = pd.DataFrame(np.random.randn(10, 4))
546 >>> store = pd.HDFStore('test.h5', driver='H5FD_CORE')
547 >>> store['foo'] = bar
548 >>> store.close() # only now, data is written to disk
549 """
551 _handle: File | None
552 _mode: str
554 def __init__(
555 self,
556 path,
557 mode: str = "a",
558 complevel: int | None = None,
559 complib=None,
560 fletcher32: bool = False,
561 **kwargs,
562 ) -> None:
563 if "format" in kwargs:
564 raise ValueError("format is not a defined argument for HDFStore")
566 tables = import_optional_dependency("tables")
568 if complib is not None and complib not in tables.filters.all_complibs:
569 raise ValueError(
570 f"complib only supports {tables.filters.all_complibs} compression."
571 )
573 if complib is None and complevel is not None:
574 complib = tables.filters.default_complib
576 self._path = stringify_path(path)
577 if mode is None:
578 mode = "a"
579 self._mode = mode
580 self._handle = None
581 self._complevel = complevel if complevel else 0
582 self._complib = complib
583 self._fletcher32 = fletcher32
584 self._filters = None
585 self.open(mode=mode, **kwargs)
587 def __fspath__(self) -> str:
588 return self._path
590 @property
591 def root(self):
592 """return the root node"""
593 self._check_if_open()
594 assert self._handle is not None # for mypy
595 return self._handle.root
597 @property
598 def filename(self) -> str:
599 return self._path
601 def __getitem__(self, key: str):
602 return self.get(key)
604 def __setitem__(self, key: str, value) -> None:
605 self.put(key, value)
607 def __delitem__(self, key: str) -> None:
608 return self.remove(key)
610 def __getattr__(self, name: str):
611 """allow attribute access to get stores"""
612 try:
613 return self.get(name)
614 except (KeyError, ClosedFileError):
615 pass
616 raise AttributeError(
617 f"'{type(self).__name__}' object has no attribute '{name}'"
618 )
620 def __contains__(self, key: str) -> bool:
621 """
622 check for existence of this key
623 can match the exact pathname or the pathnm w/o the leading '/'
624 """
625 node = self.get_node(key)
626 if node is not None:
627 name = node._v_pathname
628 if key in (name, name[1:]):
629 return True
630 return False
632 def __len__(self) -> int:
633 return len(self.groups())
635 def __repr__(self) -> str:
636 pstr = pprint_thing(self._path)
637 return f"{type(self)}\nFile path: {pstr}\n"
639 def __enter__(self) -> Self:
640 return self
642 def __exit__(
643 self,
644 exc_type: type[BaseException] | None,
645 exc_value: BaseException | None,
646 traceback: TracebackType | None,
647 ) -> None:
648 self.close()
650 def keys(self, include: str = "pandas") -> list[str]:
651 """
652 Return a list of keys corresponding to objects stored in HDFStore.
654 Parameters
655 ----------
657 include : str, default 'pandas'
658 When kind equals 'pandas' return pandas objects.
659 When kind equals 'native' return native HDF5 Table objects.
661 Returns
662 -------
663 list
664 List of ABSOLUTE path-names (e.g. have the leading '/').
666 Raises
667 ------
668 raises ValueError if kind has an illegal value
670 Examples
671 --------
672 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
673 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
674 >>> store.put('data', df) # doctest: +SKIP
675 >>> store.get('data') # doctest: +SKIP
676 >>> print(store.keys()) # doctest: +SKIP
677 ['/data1', '/data2']
678 >>> store.close() # doctest: +SKIP
679 """
680 if include == "pandas":
681 return [n._v_pathname for n in self.groups()]
683 elif include == "native":
684 assert self._handle is not None # mypy
685 return [
686 n._v_pathname for n in self._handle.walk_nodes("/", classname="Table")
687 ]
688 raise ValueError(
689 f"`include` should be either 'pandas' or 'native' but is '{include}'"
690 )
692 def __iter__(self) -> Iterator[str]:
693 return iter(self.keys())
695 def items(self) -> Iterator[tuple[str, list]]:
696 """
697 iterate on key->group
698 """
699 for g in self.groups():
700 yield g._v_pathname, g
702 def open(self, mode: str = "a", **kwargs) -> None:
703 """
704 Open the file in the specified mode
706 Parameters
707 ----------
708 mode : {'a', 'w', 'r', 'r+'}, default 'a'
709 See HDFStore docstring or tables.open_file for info about modes
710 **kwargs
711 These parameters will be passed to the PyTables open_file method.
712 """
713 tables = _tables()
715 if self._mode != mode:
716 # if we are changing a write mode to read, ok
717 if self._mode in ["a", "w"] and mode in ["r", "r+"]:
718 pass
719 elif mode in ["w"]:
720 # this would truncate, raise here
721 if self.is_open:
722 raise PossibleDataLossError(
723 f"Re-opening the file [{self._path}] with mode [{self._mode}] "
724 "will delete the current file!"
725 )
727 self._mode = mode
729 # close and reopen the handle
730 if self.is_open:
731 self.close()
733 if self._complevel and self._complevel > 0:
734 self._filters = _tables().Filters(
735 self._complevel, self._complib, fletcher32=self._fletcher32
736 )
738 if _table_file_open_policy_is_strict and self.is_open:
739 msg = (
740 "Cannot open HDF5 file, which is already opened, "
741 "even in read-only mode."
742 )
743 raise ValueError(msg)
745 self._handle = tables.open_file(self._path, self._mode, **kwargs)
747 def close(self) -> None:
748 """
749 Close the PyTables file handle
750 """
751 if self._handle is not None:
752 self._handle.close()
753 self._handle = None
755 @property
756 def is_open(self) -> bool:
757 """
758 return a boolean indicating whether the file is open
759 """
760 if self._handle is None:
761 return False
762 return bool(self._handle.isopen)
764 def flush(self, fsync: bool = False) -> None:
765 """
766 Force all buffered modifications to be written to disk.
768 Parameters
769 ----------
770 fsync : bool (default False)
771 call ``os.fsync()`` on the file handle to force writing to disk.
773 Notes
774 -----
775 Without ``fsync=True``, flushing may not guarantee that the OS writes
776 to disk. With fsync, the operation will block until the OS claims the
777 file has been written; however, other caching layers may still
778 interfere.
779 """
780 if self._handle is not None:
781 self._handle.flush()
782 if fsync:
783 with suppress(OSError):
784 os.fsync(self._handle.fileno())
786 def get(self, key: str):
787 """
788 Retrieve pandas object stored in file.
790 Parameters
791 ----------
792 key : str
794 Returns
795 -------
796 object
797 Same type as object stored in file.
799 Examples
800 --------
801 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
802 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
803 >>> store.put('data', df) # doctest: +SKIP
804 >>> store.get('data') # doctest: +SKIP
805 >>> store.close() # doctest: +SKIP
806 """
807 with patch_pickle():
808 # GH#31167 Without this patch, pickle doesn't know how to unpickle
809 # old DateOffset objects now that they are cdef classes.
810 group = self.get_node(key)
811 if group is None:
812 raise KeyError(f"No object named {key} in the file")
813 return self._read_group(group)
815 def select(
816 self,
817 key: str,
818 where=None,
819 start=None,
820 stop=None,
821 columns=None,
822 iterator: bool = False,
823 chunksize: int | None = None,
824 auto_close: bool = False,
825 ):
826 """
827 Retrieve pandas object stored in file, optionally based on where criteria.
829 .. warning::
831 Pandas uses PyTables for reading and writing HDF5 files, which allows
832 serializing object-dtype data with pickle when using the "fixed" format.
833 Loading pickled data received from untrusted sources can be unsafe.
835 See: https://docs.python.org/3/library/pickle.html for more.
837 Parameters
838 ----------
839 key : str
840 Object being retrieved from file.
841 where : list or None
842 List of Term (or convertible) objects, optional.
843 start : int or None
844 Row number to start selection.
845 stop : int, default None
846 Row number to stop selection.
847 columns : list or None
848 A list of columns that if not None, will limit the return columns.
849 iterator : bool or False
850 Returns an iterator.
851 chunksize : int or None
852 Number or rows to include in iteration, return an iterator.
853 auto_close : bool or False
854 Should automatically close the store when finished.
856 Returns
857 -------
858 object
859 Retrieved object from file.
861 Examples
862 --------
863 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
864 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
865 >>> store.put('data', df) # doctest: +SKIP
866 >>> store.get('data') # doctest: +SKIP
867 >>> print(store.keys()) # doctest: +SKIP
868 ['/data1', '/data2']
869 >>> store.select('/data1') # doctest: +SKIP
870 A B
871 0 1 2
872 1 3 4
873 >>> store.select('/data1', where='columns == A') # doctest: +SKIP
874 A
875 0 1
876 1 3
877 >>> store.close() # doctest: +SKIP
878 """
879 group = self.get_node(key)
880 if group is None:
881 raise KeyError(f"No object named {key} in the file")
883 # create the storer and axes
884 where = _ensure_term(where, scope_level=1)
885 s = self._create_storer(group)
886 s.infer_axes()
888 # function to call on iteration
889 def func(_start, _stop, _where):
890 return s.read(start=_start, stop=_stop, where=_where, columns=columns)
892 # create the iterator
893 it = TableIterator(
894 self,
895 s,
896 func,
897 where=where,
898 nrows=s.nrows,
899 start=start,
900 stop=stop,
901 iterator=iterator,
902 chunksize=chunksize,
903 auto_close=auto_close,
904 )
906 return it.get_result()
908 def select_as_coordinates(
909 self,
910 key: str,
911 where=None,
912 start: int | None = None,
913 stop: int | None = None,
914 ):
915 """
916 return the selection as an Index
918 .. warning::
920 Pandas uses PyTables for reading and writing HDF5 files, which allows
921 serializing object-dtype data with pickle when using the "fixed" format.
922 Loading pickled data received from untrusted sources can be unsafe.
924 See: https://docs.python.org/3/library/pickle.html for more.
927 Parameters
928 ----------
929 key : str
930 where : list of Term (or convertible) objects, optional
931 start : integer (defaults to None), row number to start selection
932 stop : integer (defaults to None), row number to stop selection
933 """
934 where = _ensure_term(where, scope_level=1)
935 tbl = self.get_storer(key)
936 if not isinstance(tbl, Table):
937 raise TypeError("can only read_coordinates with a table")
938 return tbl.read_coordinates(where=where, start=start, stop=stop)
940 def select_column(
941 self,
942 key: str,
943 column: str,
944 start: int | None = None,
945 stop: int | None = None,
946 ):
947 """
948 return a single column from the table. This is generally only useful to
949 select an indexable
951 .. warning::
953 Pandas uses PyTables for reading and writing HDF5 files, which allows
954 serializing object-dtype data with pickle when using the "fixed" format.
955 Loading pickled data received from untrusted sources can be unsafe.
957 See: https://docs.python.org/3/library/pickle.html for more.
959 Parameters
960 ----------
961 key : str
962 column : str
963 The column of interest.
964 start : int or None, default None
965 stop : int or None, default None
967 Raises
968 ------
969 raises KeyError if the column is not found (or key is not a valid
970 store)
971 raises ValueError if the column can not be extracted individually (it
972 is part of a data block)
974 """
975 tbl = self.get_storer(key)
976 if not isinstance(tbl, Table):
977 raise TypeError("can only read_column with a table")
978 return tbl.read_column(column=column, start=start, stop=stop)
980 def select_as_multiple(
981 self,
982 keys,
983 where=None,
984 selector=None,
985 columns=None,
986 start=None,
987 stop=None,
988 iterator: bool = False,
989 chunksize: int | None = None,
990 auto_close: bool = False,
991 ):
992 """
993 Retrieve pandas objects from multiple tables.
995 .. warning::
997 Pandas uses PyTables for reading and writing HDF5 files, which allows
998 serializing object-dtype data with pickle when using the "fixed" format.
999 Loading pickled data received from untrusted sources can be unsafe.
1001 See: https://docs.python.org/3/library/pickle.html for more.
1003 Parameters
1004 ----------
1005 keys : a list of the tables
1006 selector : the table to apply the where criteria (defaults to keys[0]
1007 if not supplied)
1008 columns : the columns I want back
1009 start : integer (defaults to None), row number to start selection
1010 stop : integer (defaults to None), row number to stop selection
1011 iterator : bool, return an iterator, default False
1012 chunksize : nrows to include in iteration, return an iterator
1013 auto_close : bool, default False
1014 Should automatically close the store when finished.
1016 Raises
1017 ------
1018 raises KeyError if keys or selector is not found or keys is empty
1019 raises TypeError if keys is not a list or tuple
1020 raises ValueError if the tables are not ALL THE SAME DIMENSIONS
1021 """
1022 # default to single select
1023 where = _ensure_term(where, scope_level=1)
1024 if isinstance(keys, (list, tuple)) and len(keys) == 1:
1025 keys = keys[0]
1026 if isinstance(keys, str):
1027 return self.select(
1028 key=keys,
1029 where=where,
1030 columns=columns,
1031 start=start,
1032 stop=stop,
1033 iterator=iterator,
1034 chunksize=chunksize,
1035 auto_close=auto_close,
1036 )
1038 if not isinstance(keys, (list, tuple)):
1039 raise TypeError("keys must be a list/tuple")
1041 if not len(keys):
1042 raise ValueError("keys must have a non-zero length")
1044 if selector is None:
1045 selector = keys[0]
1047 # collect the tables
1048 tbls = [self.get_storer(k) for k in keys]
1049 s = self.get_storer(selector)
1051 # validate rows
1052 nrows = None
1053 for t, k in itertools.chain([(s, selector)], zip(tbls, keys)):
1054 if t is None:
1055 raise KeyError(f"Invalid table [{k}]")
1056 if not t.is_table:
1057 raise TypeError(
1058 f"object [{t.pathname}] is not a table, and cannot be used in all "
1059 "select as multiple"
1060 )
1062 if nrows is None:
1063 nrows = t.nrows
1064 elif t.nrows != nrows:
1065 raise ValueError("all tables must have exactly the same nrows!")
1067 # The isinstance checks here are redundant with the check above,
1068 # but necessary for mypy; see GH#29757
1069 _tbls = [x for x in tbls if isinstance(x, Table)]
1071 # axis is the concentration axes
1072 axis = {t.non_index_axes[0][0] for t in _tbls}.pop()
1074 def func(_start, _stop, _where):
1075 # retrieve the objs, _where is always passed as a set of
1076 # coordinates here
1077 objs = [
1078 t.read(where=_where, columns=columns, start=_start, stop=_stop)
1079 for t in tbls
1080 ]
1082 # concat and return
1083 return concat(objs, axis=axis, verify_integrity=False)._consolidate()
1085 # create the iterator
1086 it = TableIterator(
1087 self,
1088 s,
1089 func,
1090 where=where,
1091 nrows=nrows,
1092 start=start,
1093 stop=stop,
1094 iterator=iterator,
1095 chunksize=chunksize,
1096 auto_close=auto_close,
1097 )
1099 return it.get_result(coordinates=True)
1101 def put(
1102 self,
1103 key: str,
1104 value: DataFrame | Series,
1105 format=None,
1106 index: bool = True,
1107 append: bool = False,
1108 complib=None,
1109 complevel: int | None = None,
1110 min_itemsize: int | dict[str, int] | None = None,
1111 nan_rep=None,
1112 data_columns: Literal[True] | list[str] | None = None,
1113 encoding=None,
1114 errors: str = "strict",
1115 track_times: bool = True,
1116 dropna: bool = False,
1117 ) -> None:
1118 """
1119 Store object in HDFStore.
1121 Parameters
1122 ----------
1123 key : str
1124 value : {Series, DataFrame}
1125 format : 'fixed(f)|table(t)', default is 'fixed'
1126 Format to use when storing object in HDFStore. Value can be one of:
1128 ``'fixed'``
1129 Fixed format. Fast writing/reading. Not-appendable, nor searchable.
1130 ``'table'``
1131 Table format. Write as a PyTables Table structure which may perform
1132 worse but allow more flexible operations like searching / selecting
1133 subsets of the data.
1134 index : bool, default True
1135 Write DataFrame index as a column.
1136 append : bool, default False
1137 This will force Table format, append the input data to the existing.
1138 data_columns : list of columns or True, default None
1139 List of columns to create as data columns, or True to use all columns.
1140 See `here
1141 <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
1142 encoding : str, default None
1143 Provide an encoding for strings.
1144 track_times : bool, default True
1145 Parameter is propagated to 'create_table' method of 'PyTables'.
1146 If set to False it enables to have the same h5 files (same hashes)
1147 independent on creation time.
1148 dropna : bool, default False, optional
1149 Remove missing values.
1151 Examples
1152 --------
1153 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
1154 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
1155 >>> store.put('data', df) # doctest: +SKIP
1156 """
1157 if format is None:
1158 format = get_option("io.hdf.default_format") or "fixed"
1159 format = self._validate_format(format)
1160 self._write_to_group(
1161 key,
1162 value,
1163 format=format,
1164 index=index,
1165 append=append,
1166 complib=complib,
1167 complevel=complevel,
1168 min_itemsize=min_itemsize,
1169 nan_rep=nan_rep,
1170 data_columns=data_columns,
1171 encoding=encoding,
1172 errors=errors,
1173 track_times=track_times,
1174 dropna=dropna,
1175 )
1177 def remove(self, key: str, where=None, start=None, stop=None) -> None:
1178 """
1179 Remove pandas object partially by specifying the where condition
1181 Parameters
1182 ----------
1183 key : str
1184 Node to remove or delete rows from
1185 where : list of Term (or convertible) objects, optional
1186 start : integer (defaults to None), row number to start selection
1187 stop : integer (defaults to None), row number to stop selection
1189 Returns
1190 -------
1191 number of rows removed (or None if not a Table)
1193 Raises
1194 ------
1195 raises KeyError if key is not a valid store
1197 """
1198 where = _ensure_term(where, scope_level=1)
1199 try:
1200 s = self.get_storer(key)
1201 except KeyError:
1202 # the key is not a valid store, re-raising KeyError
1203 raise
1204 except AssertionError:
1205 # surface any assertion errors for e.g. debugging
1206 raise
1207 except Exception as err:
1208 # In tests we get here with ClosedFileError, TypeError, and
1209 # _table_mod.NoSuchNodeError. TODO: Catch only these?
1211 if where is not None:
1212 raise ValueError(
1213 "trying to remove a node with a non-None where clause!"
1214 ) from err
1216 # we are actually trying to remove a node (with children)
1217 node = self.get_node(key)
1218 if node is not None:
1219 node._f_remove(recursive=True)
1220 return None
1222 # remove the node
1223 if com.all_none(where, start, stop):
1224 s.group._f_remove(recursive=True)
1226 # delete from the table
1227 else:
1228 if not s.is_table:
1229 raise ValueError(
1230 "can only remove with where on objects written as tables"
1231 )
1232 return s.delete(where=where, start=start, stop=stop)
1234 def append(
1235 self,
1236 key: str,
1237 value: DataFrame | Series,
1238 format=None,
1239 axes=None,
1240 index: bool | list[str] = True,
1241 append: bool = True,
1242 complib=None,
1243 complevel: int | None = None,
1244 columns=None,
1245 min_itemsize: int | dict[str, int] | None = None,
1246 nan_rep=None,
1247 chunksize: int | None = None,
1248 expectedrows=None,
1249 dropna: bool | None = None,
1250 data_columns: Literal[True] | list[str] | None = None,
1251 encoding=None,
1252 errors: str = "strict",
1253 ) -> None:
1254 """
1255 Append to Table in file.
1257 Node must already exist and be Table format.
1259 Parameters
1260 ----------
1261 key : str
1262 value : {Series, DataFrame}
1263 format : 'table' is the default
1264 Format to use when storing object in HDFStore. Value can be one of:
1266 ``'table'``
1267 Table format. Write as a PyTables Table structure which may perform
1268 worse but allow more flexible operations like searching / selecting
1269 subsets of the data.
1270 index : bool, default True
1271 Write DataFrame index as a column.
1272 append : bool, default True
1273 Append the input data to the existing.
1274 data_columns : list of columns, or True, default None
1275 List of columns to create as indexed data columns for on-disk
1276 queries, or True to use all columns. By default only the axes
1277 of the object are indexed. See `here
1278 <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
1279 min_itemsize : dict of columns that specify minimum str sizes
1280 nan_rep : str to use as str nan representation
1281 chunksize : size to chunk the writing
1282 expectedrows : expected TOTAL row size of this table
1283 encoding : default None, provide an encoding for str
1284 dropna : bool, default False, optional
1285 Do not write an ALL nan row to the store settable
1286 by the option 'io.hdf.dropna_table'.
1288 Notes
1289 -----
1290 Does *not* check if data being appended overlaps with existing
1291 data in the table, so be careful
1293 Examples
1294 --------
1295 >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
1296 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
1297 >>> store.put('data', df1, format='table') # doctest: +SKIP
1298 >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=['A', 'B'])
1299 >>> store.append('data', df2) # doctest: +SKIP
1300 >>> store.close() # doctest: +SKIP
1301 A B
1302 0 1 2
1303 1 3 4
1304 0 5 6
1305 1 7 8
1306 """
1307 if columns is not None:
1308 raise TypeError(
1309 "columns is not a supported keyword in append, try data_columns"
1310 )
1312 if dropna is None:
1313 dropna = get_option("io.hdf.dropna_table")
1314 if format is None:
1315 format = get_option("io.hdf.default_format") or "table"
1316 format = self._validate_format(format)
1317 self._write_to_group(
1318 key,
1319 value,
1320 format=format,
1321 axes=axes,
1322 index=index,
1323 append=append,
1324 complib=complib,
1325 complevel=complevel,
1326 min_itemsize=min_itemsize,
1327 nan_rep=nan_rep,
1328 chunksize=chunksize,
1329 expectedrows=expectedrows,
1330 dropna=dropna,
1331 data_columns=data_columns,
1332 encoding=encoding,
1333 errors=errors,
1334 )
1336 def append_to_multiple(
1337 self,
1338 d: dict,
1339 value,
1340 selector,
1341 data_columns=None,
1342 axes=None,
1343 dropna: bool = False,
1344 **kwargs,
1345 ) -> None:
1346 """
1347 Append to multiple tables
1349 Parameters
1350 ----------
1351 d : a dict of table_name to table_columns, None is acceptable as the
1352 values of one node (this will get all the remaining columns)
1353 value : a pandas object
1354 selector : a string that designates the indexable table; all of its
1355 columns will be designed as data_columns, unless data_columns is
1356 passed, in which case these are used
1357 data_columns : list of columns to create as data columns, or True to
1358 use all columns
1359 dropna : if evaluates to True, drop rows from all tables if any single
1360 row in each table has all NaN. Default False.
1362 Notes
1363 -----
1364 axes parameter is currently not accepted
1366 """
1367 if axes is not None:
1368 raise TypeError(
1369 "axes is currently not accepted as a parameter to append_to_multiple; "
1370 "you can create the tables independently instead"
1371 )
1373 if not isinstance(d, dict):
1374 raise ValueError(
1375 "append_to_multiple must have a dictionary specified as the "
1376 "way to split the value"
1377 )
1379 if selector not in d:
1380 raise ValueError(
1381 "append_to_multiple requires a selector that is in passed dict"
1382 )
1384 # figure out the splitting axis (the non_index_axis)
1385 axis = next(iter(set(range(value.ndim)) - set(_AXES_MAP[type(value)])))
1387 # figure out how to split the value
1388 remain_key = None
1389 remain_values: list = []
1390 for k, v in d.items():
1391 if v is None:
1392 if remain_key is not None:
1393 raise ValueError(
1394 "append_to_multiple can only have one value in d that is None"
1395 )
1396 remain_key = k
1397 else:
1398 remain_values.extend(v)
1399 if remain_key is not None:
1400 ordered = value.axes[axis]
1401 ordd = ordered.difference(Index(remain_values))
1402 ordd = sorted(ordered.get_indexer(ordd))
1403 d[remain_key] = ordered.take(ordd)
1405 # data_columns
1406 if data_columns is None:
1407 data_columns = d[selector]
1409 # ensure rows are synchronized across the tables
1410 if dropna:
1411 idxs = (value[cols].dropna(how="all").index for cols in d.values())
1412 valid_index = next(idxs)
1413 for index in idxs:
1414 valid_index = valid_index.intersection(index)
1415 value = value.loc[valid_index]
1417 min_itemsize = kwargs.pop("min_itemsize", None)
1419 # append
1420 for k, v in d.items():
1421 dc = data_columns if k == selector else None
1423 # compute the val
1424 val = value.reindex(v, axis=axis)
1426 filtered = (
1427 {key: value for (key, value) in min_itemsize.items() if key in v}
1428 if min_itemsize is not None
1429 else None
1430 )
1431 self.append(k, val, data_columns=dc, min_itemsize=filtered, **kwargs)
1433 def create_table_index(
1434 self,
1435 key: str,
1436 columns=None,
1437 optlevel: int | None = None,
1438 kind: str | None = None,
1439 ) -> None:
1440 """
1441 Create a pytables index on the table.
1443 Parameters
1444 ----------
1445 key : str
1446 columns : None, bool, or listlike[str]
1447 Indicate which columns to create an index on.
1449 * False : Do not create any indexes.
1450 * True : Create indexes on all columns.
1451 * None : Create indexes on all columns.
1452 * listlike : Create indexes on the given columns.
1454 optlevel : int or None, default None
1455 Optimization level, if None, pytables defaults to 6.
1456 kind : str or None, default None
1457 Kind of index, if None, pytables defaults to "medium".
1459 Raises
1460 ------
1461 TypeError: raises if the node is not a table
1462 """
1463 # version requirements
1464 _tables()
1465 s = self.get_storer(key)
1466 if s is None:
1467 return
1469 if not isinstance(s, Table):
1470 raise TypeError("cannot create table index on a Fixed format store")
1471 s.create_index(columns=columns, optlevel=optlevel, kind=kind)
1473 def groups(self) -> list:
1474 """
1475 Return a list of all the top-level nodes.
1477 Each node returned is not a pandas storage object.
1479 Returns
1480 -------
1481 list
1482 List of objects.
1484 Examples
1485 --------
1486 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
1487 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
1488 >>> store.put('data', df) # doctest: +SKIP
1489 >>> print(store.groups()) # doctest: +SKIP
1490 >>> store.close() # doctest: +SKIP
1491 [/data (Group) ''
1492 children := ['axis0' (Array), 'axis1' (Array), 'block0_values' (Array),
1493 'block0_items' (Array)]]
1494 """
1495 _tables()
1496 self._check_if_open()
1497 assert self._handle is not None # for mypy
1498 assert _table_mod is not None # for mypy
1499 return [
1500 g
1501 for g in self._handle.walk_groups()
1502 if (
1503 not isinstance(g, _table_mod.link.Link)
1504 and (
1505 getattr(g._v_attrs, "pandas_type", None)
1506 or getattr(g, "table", None)
1507 or (isinstance(g, _table_mod.table.Table) and g._v_name != "table")
1508 )
1509 )
1510 ]
1512 def walk(self, where: str = "/") -> Iterator[tuple[str, list[str], list[str]]]:
1513 """
1514 Walk the pytables group hierarchy for pandas objects.
1516 This generator will yield the group path, subgroups and pandas object
1517 names for each group.
1519 Any non-pandas PyTables objects that are not a group will be ignored.
1521 The `where` group itself is listed first (preorder), then each of its
1522 child groups (following an alphanumerical order) is also traversed,
1523 following the same procedure.
1525 Parameters
1526 ----------
1527 where : str, default "/"
1528 Group where to start walking.
1530 Yields
1531 ------
1532 path : str
1533 Full path to a group (without trailing '/').
1534 groups : list
1535 Names (strings) of the groups contained in `path`.
1536 leaves : list
1537 Names (strings) of the pandas objects contained in `path`.
1539 Examples
1540 --------
1541 >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
1542 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
1543 >>> store.put('data', df1, format='table') # doctest: +SKIP
1544 >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=['A', 'B'])
1545 >>> store.append('data', df2) # doctest: +SKIP
1546 >>> store.close() # doctest: +SKIP
1547 >>> for group in store.walk(): # doctest: +SKIP
1548 ... print(group) # doctest: +SKIP
1549 >>> store.close() # doctest: +SKIP
1550 """
1551 _tables()
1552 self._check_if_open()
1553 assert self._handle is not None # for mypy
1554 assert _table_mod is not None # for mypy
1556 for g in self._handle.walk_groups(where):
1557 if getattr(g._v_attrs, "pandas_type", None) is not None:
1558 continue
1560 groups = []
1561 leaves = []
1562 for child in g._v_children.values():
1563 pandas_type = getattr(child._v_attrs, "pandas_type", None)
1564 if pandas_type is None:
1565 if isinstance(child, _table_mod.group.Group):
1566 groups.append(child._v_name)
1567 else:
1568 leaves.append(child._v_name)
1570 yield (g._v_pathname.rstrip("/"), groups, leaves)
1572 def get_node(self, key: str) -> Node | None:
1573 """return the node with the key or None if it does not exist"""
1574 self._check_if_open()
1575 if not key.startswith("/"):
1576 key = "/" + key
1578 assert self._handle is not None
1579 assert _table_mod is not None # for mypy
1580 try:
1581 node = self._handle.get_node(self.root, key)
1582 except _table_mod.exceptions.NoSuchNodeError:
1583 return None
1585 assert isinstance(node, _table_mod.Node), type(node)
1586 return node
1588 def get_storer(self, key: str) -> GenericFixed | Table:
1589 """return the storer object for a key, raise if not in the file"""
1590 group = self.get_node(key)
1591 if group is None:
1592 raise KeyError(f"No object named {key} in the file")
1594 s = self._create_storer(group)
1595 s.infer_axes()
1596 return s
1598 def copy(
1599 self,
1600 file,
1601 mode: str = "w",
1602 propindexes: bool = True,
1603 keys=None,
1604 complib=None,
1605 complevel: int | None = None,
1606 fletcher32: bool = False,
1607 overwrite: bool = True,
1608 ) -> HDFStore:
1609 """
1610 Copy the existing store to a new file, updating in place.
1612 Parameters
1613 ----------
1614 propindexes : bool, default True
1615 Restore indexes in copied file.
1616 keys : list, optional
1617 List of keys to include in the copy (defaults to all).
1618 overwrite : bool, default True
1619 Whether to overwrite (remove and replace) existing nodes in the new store.
1620 mode, complib, complevel, fletcher32 same as in HDFStore.__init__
1622 Returns
1623 -------
1624 open file handle of the new store
1625 """
1626 new_store = HDFStore(
1627 file, mode=mode, complib=complib, complevel=complevel, fletcher32=fletcher32
1628 )
1629 if keys is None:
1630 keys = list(self.keys())
1631 if not isinstance(keys, (tuple, list)):
1632 keys = [keys]
1633 for k in keys:
1634 s = self.get_storer(k)
1635 if s is not None:
1636 if k in new_store:
1637 if overwrite:
1638 new_store.remove(k)
1640 data = self.select(k)
1641 if isinstance(s, Table):
1642 index: bool | list[str] = False
1643 if propindexes:
1644 index = [a.name for a in s.axes if a.is_indexed]
1645 new_store.append(
1646 k,
1647 data,
1648 index=index,
1649 data_columns=getattr(s, "data_columns", None),
1650 encoding=s.encoding,
1651 )
1652 else:
1653 new_store.put(k, data, encoding=s.encoding)
1655 return new_store
1657 def info(self) -> str:
1658 """
1659 Print detailed information on the store.
1661 Returns
1662 -------
1663 str
1665 Examples
1666 --------
1667 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
1668 >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
1669 >>> store.put('data', df) # doctest: +SKIP
1670 >>> print(store.info()) # doctest: +SKIP
1671 >>> store.close() # doctest: +SKIP
1672 <class 'pandas.io.pytables.HDFStore'>
1673 File path: store.h5
1674 /data frame (shape->[2,2])
1675 """
1676 path = pprint_thing(self._path)
1677 output = f"{type(self)}\nFile path: {path}\n"
1679 if self.is_open:
1680 lkeys = sorted(self.keys())
1681 if len(lkeys):
1682 keys = []
1683 values = []
1685 for k in lkeys:
1686 try:
1687 s = self.get_storer(k)
1688 if s is not None:
1689 keys.append(pprint_thing(s.pathname or k))
1690 values.append(pprint_thing(s or "invalid_HDFStore node"))
1691 except AssertionError:
1692 # surface any assertion errors for e.g. debugging
1693 raise
1694 except Exception as detail:
1695 keys.append(k)
1696 dstr = pprint_thing(detail)
1697 values.append(f"[invalid_HDFStore node: {dstr}]")
1699 output += adjoin(12, keys, values)
1700 else:
1701 output += "Empty"
1702 else:
1703 output += "File is CLOSED"
1705 return output
1707 # ------------------------------------------------------------------------
1708 # private methods
1710 def _check_if_open(self) -> None:
1711 if not self.is_open:
1712 raise ClosedFileError(f"{self._path} file is not open!")
1714 def _validate_format(self, format: str) -> str:
1715 """validate / deprecate formats"""
1716 # validate
1717 try:
1718 format = _FORMAT_MAP[format.lower()]
1719 except KeyError as err:
1720 raise TypeError(f"invalid HDFStore format specified [{format}]") from err
1722 return format
1724 def _create_storer(
1725 self,
1726 group,
1727 format=None,
1728 value: DataFrame | Series | None = None,
1729 encoding: str = "UTF-8",
1730 errors: str = "strict",
1731 ) -> GenericFixed | Table:
1732 """return a suitable class to operate"""
1733 cls: type[GenericFixed | Table]
1735 if value is not None and not isinstance(value, (Series, DataFrame)):
1736 raise TypeError("value must be None, Series, or DataFrame")
1738 pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None))
1739 tt = _ensure_decoded(getattr(group._v_attrs, "table_type", None))
1741 # infer the pt from the passed value
1742 if pt is None:
1743 if value is None:
1744 _tables()
1745 assert _table_mod is not None # for mypy
1746 if getattr(group, "table", None) or isinstance(
1747 group, _table_mod.table.Table
1748 ):
1749 pt = "frame_table"
1750 tt = "generic_table"
1751 else:
1752 raise TypeError(
1753 "cannot create a storer if the object is not existing "
1754 "nor a value are passed"
1755 )
1756 else:
1757 if isinstance(value, Series):
1758 pt = "series"
1759 else:
1760 pt = "frame"
1762 # we are actually a table
1763 if format == "table":
1764 pt += "_table"
1766 # a storer node
1767 if "table" not in pt:
1768 _STORER_MAP = {"series": SeriesFixed, "frame": FrameFixed}
1769 try:
1770 cls = _STORER_MAP[pt]
1771 except KeyError as err:
1772 raise TypeError(
1773 f"cannot properly create the storer for: [_STORER_MAP] [group->"
1774 f"{group},value->{type(value)},format->{format}"
1775 ) from err
1776 return cls(self, group, encoding=encoding, errors=errors)
1778 # existing node (and must be a table)
1779 if tt is None:
1780 # if we are a writer, determine the tt
1781 if value is not None:
1782 if pt == "series_table":
1783 index = getattr(value, "index", None)
1784 if index is not None:
1785 if index.nlevels == 1:
1786 tt = "appendable_series"
1787 elif index.nlevels > 1:
1788 tt = "appendable_multiseries"
1789 elif pt == "frame_table":
1790 index = getattr(value, "index", None)
1791 if index is not None:
1792 if index.nlevels == 1:
1793 tt = "appendable_frame"
1794 elif index.nlevels > 1:
1795 tt = "appendable_multiframe"
1797 _TABLE_MAP = {
1798 "generic_table": GenericTable,
1799 "appendable_series": AppendableSeriesTable,
1800 "appendable_multiseries": AppendableMultiSeriesTable,
1801 "appendable_frame": AppendableFrameTable,
1802 "appendable_multiframe": AppendableMultiFrameTable,
1803 "worm": WORMTable,
1804 }
1805 try:
1806 cls = _TABLE_MAP[tt]
1807 except KeyError as err:
1808 raise TypeError(
1809 f"cannot properly create the storer for: [_TABLE_MAP] [group->"
1810 f"{group},value->{type(value)},format->{format}"
1811 ) from err
1813 return cls(self, group, encoding=encoding, errors=errors)
1815 def _write_to_group(
1816 self,
1817 key: str,
1818 value: DataFrame | Series,
1819 format,
1820 axes=None,
1821 index: bool | list[str] = True,
1822 append: bool = False,
1823 complib=None,
1824 complevel: int | None = None,
1825 fletcher32=None,
1826 min_itemsize: int | dict[str, int] | None = None,
1827 chunksize: int | None = None,
1828 expectedrows=None,
1829 dropna: bool = False,
1830 nan_rep=None,
1831 data_columns=None,
1832 encoding=None,
1833 errors: str = "strict",
1834 track_times: bool = True,
1835 ) -> None:
1836 # we don't want to store a table node at all if our object is 0-len
1837 # as there are not dtypes
1838 if getattr(value, "empty", None) and (format == "table" or append):
1839 return
1841 group = self._identify_group(key, append)
1843 s = self._create_storer(group, format, value, encoding=encoding, errors=errors)
1844 if append:
1845 # raise if we are trying to append to a Fixed format,
1846 # or a table that exists (and we are putting)
1847 if not s.is_table or (s.is_table and format == "fixed" and s.is_exists):
1848 raise ValueError("Can only append to Tables")
1849 if not s.is_exists:
1850 s.set_object_info()
1851 else:
1852 s.set_object_info()
1854 if not s.is_table and complib:
1855 raise ValueError("Compression not supported on Fixed format stores")
1857 # write the object
1858 s.write(
1859 obj=value,
1860 axes=axes,
1861 append=append,
1862 complib=complib,
1863 complevel=complevel,
1864 fletcher32=fletcher32,
1865 min_itemsize=min_itemsize,
1866 chunksize=chunksize,
1867 expectedrows=expectedrows,
1868 dropna=dropna,
1869 nan_rep=nan_rep,
1870 data_columns=data_columns,
1871 track_times=track_times,
1872 )
1874 if isinstance(s, Table) and index:
1875 s.create_index(columns=index)
1877 def _read_group(self, group: Node):
1878 s = self._create_storer(group)
1879 s.infer_axes()
1880 return s.read()
1882 def _identify_group(self, key: str, append: bool) -> Node:
1883 """Identify HDF5 group based on key, delete/create group if needed."""
1884 group = self.get_node(key)
1886 # we make this assertion for mypy; the get_node call will already
1887 # have raised if this is incorrect
1888 assert self._handle is not None
1890 # remove the node if we are not appending
1891 if group is not None and not append:
1892 self._handle.remove_node(group, recursive=True)
1893 group = None
1895 if group is None:
1896 group = self._create_nodes_and_group(key)
1898 return group
1900 def _create_nodes_and_group(self, key: str) -> Node:
1901 """Create nodes from key and return group name."""
1902 # assertion for mypy
1903 assert self._handle is not None
1905 paths = key.split("/")
1906 # recursively create the groups
1907 path = "/"
1908 for p in paths:
1909 if not len(p):
1910 continue
1911 new_path = path
1912 if not path.endswith("/"):
1913 new_path += "/"
1914 new_path += p
1915 group = self.get_node(new_path)
1916 if group is None:
1917 group = self._handle.create_group(path, p)
1918 path = new_path
1919 return group
1922class TableIterator:
1923 """
1924 Define the iteration interface on a table
1926 Parameters
1927 ----------
1928 store : HDFStore
1929 s : the referred storer
1930 func : the function to execute the query
1931 where : the where of the query
1932 nrows : the rows to iterate on
1933 start : the passed start value (default is None)
1934 stop : the passed stop value (default is None)
1935 iterator : bool, default False
1936 Whether to use the default iterator.
1937 chunksize : the passed chunking value (default is 100000)
1938 auto_close : bool, default False
1939 Whether to automatically close the store at the end of iteration.
1940 """
1942 chunksize: int | None
1943 store: HDFStore
1944 s: GenericFixed | Table
1946 def __init__(
1947 self,
1948 store: HDFStore,
1949 s: GenericFixed | Table,
1950 func,
1951 where,
1952 nrows,
1953 start=None,
1954 stop=None,
1955 iterator: bool = False,
1956 chunksize: int | None = None,
1957 auto_close: bool = False,
1958 ) -> None:
1959 self.store = store
1960 self.s = s
1961 self.func = func
1962 self.where = where
1964 # set start/stop if they are not set if we are a table
1965 if self.s.is_table:
1966 if nrows is None:
1967 nrows = 0
1968 if start is None:
1969 start = 0
1970 if stop is None:
1971 stop = nrows
1972 stop = min(nrows, stop)
1974 self.nrows = nrows
1975 self.start = start
1976 self.stop = stop
1978 self.coordinates = None
1979 if iterator or chunksize is not None:
1980 if chunksize is None:
1981 chunksize = 100000
1982 self.chunksize = int(chunksize)
1983 else:
1984 self.chunksize = None
1986 self.auto_close = auto_close
1988 def __iter__(self) -> Iterator:
1989 # iterate
1990 current = self.start
1991 if self.coordinates is None:
1992 raise ValueError("Cannot iterate until get_result is called.")
1993 while current < self.stop:
1994 stop = min(current + self.chunksize, self.stop)
1995 value = self.func(None, None, self.coordinates[current:stop])
1996 current = stop
1997 if value is None or not len(value):
1998 continue
2000 yield value
2002 self.close()
2004 def close(self) -> None:
2005 if self.auto_close:
2006 self.store.close()
2008 def get_result(self, coordinates: bool = False):
2009 # return the actual iterator
2010 if self.chunksize is not None:
2011 if not isinstance(self.s, Table):
2012 raise TypeError("can only use an iterator or chunksize on a table")
2014 self.coordinates = self.s.read_coordinates(where=self.where)
2016 return self
2018 # if specified read via coordinates (necessary for multiple selections
2019 if coordinates:
2020 if not isinstance(self.s, Table):
2021 raise TypeError("can only read_coordinates on a table")
2022 where = self.s.read_coordinates(
2023 where=self.where, start=self.start, stop=self.stop
2024 )
2025 else:
2026 where = self.where
2028 # directly return the result
2029 results = self.func(self.start, self.stop, where)
2030 self.close()
2031 return results
2034class IndexCol:
2035 """
2036 an index column description class
2038 Parameters
2039 ----------
2040 axis : axis which I reference
2041 values : the ndarray like converted values
2042 kind : a string description of this type
2043 typ : the pytables type
2044 pos : the position in the pytables
2046 """
2048 is_an_indexable: bool = True
2049 is_data_indexable: bool = True
2050 _info_fields = ["freq", "tz", "index_name"]
2052 def __init__(
2053 self,
2054 name: str,
2055 values=None,
2056 kind=None,
2057 typ=None,
2058 cname: str | None = None,
2059 axis=None,
2060 pos=None,
2061 freq=None,
2062 tz=None,
2063 index_name=None,
2064 ordered=None,
2065 table=None,
2066 meta=None,
2067 metadata=None,
2068 ) -> None:
2069 if not isinstance(name, str):
2070 raise ValueError("`name` must be a str.")
2072 self.values = values
2073 self.kind = kind
2074 self.typ = typ
2075 self.name = name
2076 self.cname = cname or name
2077 self.axis = axis
2078 self.pos = pos
2079 self.freq = freq
2080 self.tz = tz
2081 self.index_name = index_name
2082 self.ordered = ordered
2083 self.table = table
2084 self.meta = meta
2085 self.metadata = metadata
2087 if pos is not None:
2088 self.set_pos(pos)
2090 # These are ensured as long as the passed arguments match the
2091 # constructor annotations.
2092 assert isinstance(self.name, str)
2093 assert isinstance(self.cname, str)
2095 @property
2096 def itemsize(self) -> int:
2097 # Assumes self.typ has already been initialized
2098 return self.typ.itemsize
2100 @property
2101 def kind_attr(self) -> str:
2102 return f"{self.name}_kind"
2104 def set_pos(self, pos: int) -> None:
2105 """set the position of this column in the Table"""
2106 self.pos = pos
2107 if pos is not None and self.typ is not None:
2108 self.typ._v_pos = pos
2110 def __repr__(self) -> str:
2111 temp = tuple(
2112 map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind))
2113 )
2114 return ",".join(
2115 [
2116 f"{key}->{value}"
2117 for key, value in zip(["name", "cname", "axis", "pos", "kind"], temp)
2118 ]
2119 )
2121 def __eq__(self, other: object) -> bool:
2122 """compare 2 col items"""
2123 return all(
2124 getattr(self, a, None) == getattr(other, a, None)
2125 for a in ["name", "cname", "axis", "pos"]
2126 )
2128 def __ne__(self, other) -> bool:
2129 return not self.__eq__(other)
2131 @property
2132 def is_indexed(self) -> bool:
2133 """return whether I am an indexed column"""
2134 if not hasattr(self.table, "cols"):
2135 # e.g. if infer hasn't been called yet, self.table will be None.
2136 return False
2137 return getattr(self.table.cols, self.cname).is_indexed
2139 def convert(
2140 self, values: np.ndarray, nan_rep, encoding: str, errors: str
2141 ) -> tuple[np.ndarray, np.ndarray] | tuple[Index, Index]:
2142 """
2143 Convert the data from this selection to the appropriate pandas type.
2144 """
2145 assert isinstance(values, np.ndarray), type(values)
2147 # values is a recarray
2148 if values.dtype.fields is not None:
2149 # Copy, otherwise values will be a view
2150 # preventing the original recarry from being free'ed
2151 values = values[self.cname].copy()
2153 val_kind = _ensure_decoded(self.kind)
2154 values = _maybe_convert(values, val_kind, encoding, errors)
2155 kwargs = {}
2156 kwargs["name"] = _ensure_decoded(self.index_name)
2158 if self.freq is not None:
2159 kwargs["freq"] = _ensure_decoded(self.freq)
2161 factory: type[Index | DatetimeIndex] = Index
2162 if lib.is_np_dtype(values.dtype, "M") or isinstance(
2163 values.dtype, DatetimeTZDtype
2164 ):
2165 factory = DatetimeIndex
2166 elif values.dtype == "i8" and "freq" in kwargs:
2167 # PeriodIndex data is stored as i8
2168 # error: Incompatible types in assignment (expression has type
2169 # "Callable[[Any, KwArg(Any)], PeriodIndex]", variable has type
2170 # "Union[Type[Index], Type[DatetimeIndex]]")
2171 factory = lambda x, **kwds: PeriodIndex.from_ordinals( # type: ignore[assignment]
2172 x, freq=kwds.get("freq", None)
2173 )._rename(
2174 kwds["name"]
2175 )
2177 # making an Index instance could throw a number of different errors
2178 try:
2179 new_pd_index = factory(values, **kwargs)
2180 except ValueError:
2181 # if the output freq is different that what we recorded,
2182 # it should be None (see also 'doc example part 2')
2183 if "freq" in kwargs:
2184 kwargs["freq"] = None
2185 new_pd_index = factory(values, **kwargs)
2186 final_pd_index = _set_tz(new_pd_index, self.tz)
2187 return final_pd_index, final_pd_index
2189 def take_data(self):
2190 """return the values"""
2191 return self.values
2193 @property
2194 def attrs(self):
2195 return self.table._v_attrs
2197 @property
2198 def description(self):
2199 return self.table.description
2201 @property
2202 def col(self):
2203 """return my current col description"""
2204 return getattr(self.description, self.cname, None)
2206 @property
2207 def cvalues(self):
2208 """return my cython values"""
2209 return self.values
2211 def __iter__(self) -> Iterator:
2212 return iter(self.values)
2214 def maybe_set_size(self, min_itemsize=None) -> None:
2215 """
2216 maybe set a string col itemsize:
2217 min_itemsize can be an integer or a dict with this columns name
2218 with an integer size
2219 """
2220 if _ensure_decoded(self.kind) == "string":
2221 if isinstance(min_itemsize, dict):
2222 min_itemsize = min_itemsize.get(self.name)
2224 if min_itemsize is not None and self.typ.itemsize < min_itemsize:
2225 self.typ = _tables().StringCol(itemsize=min_itemsize, pos=self.pos)
2227 def validate_names(self) -> None:
2228 pass
2230 def validate_and_set(self, handler: AppendableTable, append: bool) -> None:
2231 self.table = handler.table
2232 self.validate_col()
2233 self.validate_attr(append)
2234 self.validate_metadata(handler)
2235 self.write_metadata(handler)
2236 self.set_attr()
2238 def validate_col(self, itemsize=None):
2239 """validate this column: return the compared against itemsize"""
2240 # validate this column for string truncation (or reset to the max size)
2241 if _ensure_decoded(self.kind) == "string":
2242 c = self.col
2243 if c is not None:
2244 if itemsize is None:
2245 itemsize = self.itemsize
2246 if c.itemsize < itemsize:
2247 raise ValueError(
2248 f"Trying to store a string with len [{itemsize}] in "
2249 f"[{self.cname}] column but\nthis column has a limit of "
2250 f"[{c.itemsize}]!\nConsider using min_itemsize to "
2251 "preset the sizes on these columns"
2252 )
2253 return c.itemsize
2255 return None
2257 def validate_attr(self, append: bool) -> None:
2258 # check for backwards incompatibility
2259 if append:
2260 existing_kind = getattr(self.attrs, self.kind_attr, None)
2261 if existing_kind is not None and existing_kind != self.kind:
2262 raise TypeError(
2263 f"incompatible kind in col [{existing_kind} - {self.kind}]"
2264 )
2266 def update_info(self, info) -> None:
2267 """
2268 set/update the info for this indexable with the key/value
2269 if there is a conflict raise/warn as needed
2270 """
2271 for key in self._info_fields:
2272 value = getattr(self, key, None)
2273 idx = info.setdefault(self.name, {})
2275 existing_value = idx.get(key)
2276 if key in idx and value is not None and existing_value != value:
2277 # frequency/name just warn
2278 if key in ["freq", "index_name"]:
2279 ws = attribute_conflict_doc % (key, existing_value, value)
2280 warnings.warn(
2281 ws, AttributeConflictWarning, stacklevel=find_stack_level()
2282 )
2284 # reset
2285 idx[key] = None
2286 setattr(self, key, None)
2288 else:
2289 raise ValueError(
2290 f"invalid info for [{self.name}] for [{key}], "
2291 f"existing_value [{existing_value}] conflicts with "
2292 f"new value [{value}]"
2293 )
2294 elif value is not None or existing_value is not None:
2295 idx[key] = value
2297 def set_info(self, info) -> None:
2298 """set my state from the passed info"""
2299 idx = info.get(self.name)
2300 if idx is not None:
2301 self.__dict__.update(idx)
2303 def set_attr(self) -> None:
2304 """set the kind for this column"""
2305 setattr(self.attrs, self.kind_attr, self.kind)
2307 def validate_metadata(self, handler: AppendableTable) -> None:
2308 """validate that kind=category does not change the categories"""
2309 if self.meta == "category":
2310 new_metadata = self.metadata
2311 cur_metadata = handler.read_metadata(self.cname)
2312 if (
2313 new_metadata is not None
2314 and cur_metadata is not None
2315 and not array_equivalent(
2316 new_metadata, cur_metadata, strict_nan=True, dtype_equal=True
2317 )
2318 ):
2319 raise ValueError(
2320 "cannot append a categorical with "
2321 "different categories to the existing"
2322 )
2324 def write_metadata(self, handler: AppendableTable) -> None:
2325 """set the meta data"""
2326 if self.metadata is not None:
2327 handler.write_metadata(self.cname, self.metadata)
2330class GenericIndexCol(IndexCol):
2331 """an index which is not represented in the data of the table"""
2333 @property
2334 def is_indexed(self) -> bool:
2335 return False
2337 def convert(
2338 self, values: np.ndarray, nan_rep, encoding: str, errors: str
2339 ) -> tuple[Index, Index]:
2340 """
2341 Convert the data from this selection to the appropriate pandas type.
2343 Parameters
2344 ----------
2345 values : np.ndarray
2346 nan_rep : str
2347 encoding : str
2348 errors : str
2349 """
2350 assert isinstance(values, np.ndarray), type(values)
2352 index = RangeIndex(len(values))
2353 return index, index
2355 def set_attr(self) -> None:
2356 pass
2359class DataCol(IndexCol):
2360 """
2361 a data holding column, by definition this is not indexable
2363 Parameters
2364 ----------
2365 data : the actual data
2366 cname : the column name in the table to hold the data (typically
2367 values)
2368 meta : a string description of the metadata
2369 metadata : the actual metadata
2370 """
2372 is_an_indexable = False
2373 is_data_indexable = False
2374 _info_fields = ["tz", "ordered"]
2376 def __init__(
2377 self,
2378 name: str,
2379 values=None,
2380 kind=None,
2381 typ=None,
2382 cname: str | None = None,
2383 pos=None,
2384 tz=None,
2385 ordered=None,
2386 table=None,
2387 meta=None,
2388 metadata=None,
2389 dtype: DtypeArg | None = None,
2390 data=None,
2391 ) -> None:
2392 super().__init__(
2393 name=name,
2394 values=values,
2395 kind=kind,
2396 typ=typ,
2397 pos=pos,
2398 cname=cname,
2399 tz=tz,
2400 ordered=ordered,
2401 table=table,
2402 meta=meta,
2403 metadata=metadata,
2404 )
2405 self.dtype = dtype
2406 self.data = data
2408 @property
2409 def dtype_attr(self) -> str:
2410 return f"{self.name}_dtype"
2412 @property
2413 def meta_attr(self) -> str:
2414 return f"{self.name}_meta"
2416 def __repr__(self) -> str:
2417 temp = tuple(
2418 map(
2419 pprint_thing, (self.name, self.cname, self.dtype, self.kind, self.shape)
2420 )
2421 )
2422 return ",".join(
2423 [
2424 f"{key}->{value}"
2425 for key, value in zip(["name", "cname", "dtype", "kind", "shape"], temp)
2426 ]
2427 )
2429 def __eq__(self, other: object) -> bool:
2430 """compare 2 col items"""
2431 return all(
2432 getattr(self, a, None) == getattr(other, a, None)
2433 for a in ["name", "cname", "dtype", "pos"]
2434 )
2436 def set_data(self, data: ArrayLike) -> None:
2437 assert data is not None
2438 assert self.dtype is None
2440 data, dtype_name = _get_data_and_dtype_name(data)
2442 self.data = data
2443 self.dtype = dtype_name
2444 self.kind = _dtype_to_kind(dtype_name)
2446 def take_data(self):
2447 """return the data"""
2448 return self.data
2450 @classmethod
2451 def _get_atom(cls, values: ArrayLike) -> Col:
2452 """
2453 Get an appropriately typed and shaped pytables.Col object for values.
2454 """
2455 dtype = values.dtype
2456 # error: Item "ExtensionDtype" of "Union[ExtensionDtype, dtype[Any]]" has no
2457 # attribute "itemsize"
2458 itemsize = dtype.itemsize # type: ignore[union-attr]
2460 shape = values.shape
2461 if values.ndim == 1:
2462 # EA, use block shape pretending it is 2D
2463 # TODO(EA2D): not necessary with 2D EAs
2464 shape = (1, values.size)
2466 if isinstance(values, Categorical):
2467 codes = values.codes
2468 atom = cls.get_atom_data(shape, kind=codes.dtype.name)
2469 elif lib.is_np_dtype(dtype, "M") or isinstance(dtype, DatetimeTZDtype):
2470 atom = cls.get_atom_datetime64(shape)
2471 elif lib.is_np_dtype(dtype, "m"):
2472 atom = cls.get_atom_timedelta64(shape)
2473 elif is_complex_dtype(dtype):
2474 atom = _tables().ComplexCol(itemsize=itemsize, shape=shape[0])
2475 elif is_string_dtype(dtype):
2476 atom = cls.get_atom_string(shape, itemsize)
2477 else:
2478 atom = cls.get_atom_data(shape, kind=dtype.name)
2480 return atom
2482 @classmethod
2483 def get_atom_string(cls, shape, itemsize):
2484 return _tables().StringCol(itemsize=itemsize, shape=shape[0])
2486 @classmethod
2487 def get_atom_coltype(cls, kind: str) -> type[Col]:
2488 """return the PyTables column class for this column"""
2489 if kind.startswith("uint"):
2490 k4 = kind[4:]
2491 col_name = f"UInt{k4}Col"
2492 elif kind.startswith("period"):
2493 # we store as integer
2494 col_name = "Int64Col"
2495 else:
2496 kcap = kind.capitalize()
2497 col_name = f"{kcap}Col"
2499 return getattr(_tables(), col_name)
2501 @classmethod
2502 def get_atom_data(cls, shape, kind: str) -> Col:
2503 return cls.get_atom_coltype(kind=kind)(shape=shape[0])
2505 @classmethod
2506 def get_atom_datetime64(cls, shape):
2507 return _tables().Int64Col(shape=shape[0])
2509 @classmethod
2510 def get_atom_timedelta64(cls, shape):
2511 return _tables().Int64Col(shape=shape[0])
2513 @property
2514 def shape(self):
2515 return getattr(self.data, "shape", None)
2517 @property
2518 def cvalues(self):
2519 """return my cython values"""
2520 return self.data
2522 def validate_attr(self, append) -> None:
2523 """validate that we have the same order as the existing & same dtype"""
2524 if append:
2525 existing_fields = getattr(self.attrs, self.kind_attr, None)
2526 if existing_fields is not None and existing_fields != list(self.values):
2527 raise ValueError("appended items do not match existing items in table!")
2529 existing_dtype = getattr(self.attrs, self.dtype_attr, None)
2530 if existing_dtype is not None and existing_dtype != self.dtype:
2531 raise ValueError(
2532 "appended items dtype do not match existing items dtype in table!"
2533 )
2535 def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
2536 """
2537 Convert the data from this selection to the appropriate pandas type.
2539 Parameters
2540 ----------
2541 values : np.ndarray
2542 nan_rep :
2543 encoding : str
2544 errors : str
2546 Returns
2547 -------
2548 index : listlike to become an Index
2549 data : ndarraylike to become a column
2550 """
2551 assert isinstance(values, np.ndarray), type(values)
2553 # values is a recarray
2554 if values.dtype.fields is not None:
2555 values = values[self.cname]
2557 assert self.typ is not None
2558 if self.dtype is None:
2559 # Note: in tests we never have timedelta64 or datetime64,
2560 # so the _get_data_and_dtype_name may be unnecessary
2561 converted, dtype_name = _get_data_and_dtype_name(values)
2562 kind = _dtype_to_kind(dtype_name)
2563 else:
2564 converted = values
2565 dtype_name = self.dtype
2566 kind = self.kind
2568 assert isinstance(converted, np.ndarray) # for mypy
2570 # use the meta if needed
2571 meta = _ensure_decoded(self.meta)
2572 metadata = self.metadata
2573 ordered = self.ordered
2574 tz = self.tz
2576 assert dtype_name is not None
2577 # convert to the correct dtype
2578 dtype = _ensure_decoded(dtype_name)
2580 # reverse converts
2581 if dtype.startswith("datetime64"):
2582 # recreate with tz if indicated
2583 converted = _set_tz(converted, tz, coerce=True)
2585 elif dtype == "timedelta64":
2586 converted = np.asarray(converted, dtype="m8[ns]")
2587 elif dtype == "date":
2588 try:
2589 converted = np.asarray(
2590 [date.fromordinal(v) for v in converted], dtype=object
2591 )
2592 except ValueError:
2593 converted = np.asarray(
2594 [date.fromtimestamp(v) for v in converted], dtype=object
2595 )
2597 elif meta == "category":
2598 # we have a categorical
2599 categories = metadata
2600 codes = converted.ravel()
2602 # if we have stored a NaN in the categories
2603 # then strip it; in theory we could have BOTH
2604 # -1s in the codes and nulls :<
2605 if categories is None:
2606 # Handle case of NaN-only categorical columns in which case
2607 # the categories are an empty array; when this is stored,
2608 # pytables cannot write a zero-len array, so on readback
2609 # the categories would be None and `read_hdf()` would fail.
2610 categories = Index([], dtype=np.float64)
2611 else:
2612 mask = isna(categories)
2613 if mask.any():
2614 categories = categories[~mask]
2615 codes[codes != -1] -= mask.astype(int).cumsum()._values
2617 converted = Categorical.from_codes(
2618 codes, categories=categories, ordered=ordered, validate=False
2619 )
2621 else:
2622 try:
2623 converted = converted.astype(dtype, copy=False)
2624 except TypeError:
2625 converted = converted.astype("O", copy=False)
2627 # convert nans / decode
2628 if _ensure_decoded(kind) == "string":
2629 converted = _unconvert_string_array(
2630 converted, nan_rep=nan_rep, encoding=encoding, errors=errors
2631 )
2633 return self.values, converted
2635 def set_attr(self) -> None:
2636 """set the data for this column"""
2637 setattr(self.attrs, self.kind_attr, self.values)
2638 setattr(self.attrs, self.meta_attr, self.meta)
2639 assert self.dtype is not None
2640 setattr(self.attrs, self.dtype_attr, self.dtype)
2643class DataIndexableCol(DataCol):
2644 """represent a data column that can be indexed"""
2646 is_data_indexable = True
2648 def validate_names(self) -> None:
2649 if not is_string_dtype(Index(self.values).dtype):
2650 # TODO: should the message here be more specifically non-str?
2651 raise ValueError("cannot have non-object label DataIndexableCol")
2653 @classmethod
2654 def get_atom_string(cls, shape, itemsize):
2655 return _tables().StringCol(itemsize=itemsize)
2657 @classmethod
2658 def get_atom_data(cls, shape, kind: str) -> Col:
2659 return cls.get_atom_coltype(kind=kind)()
2661 @classmethod
2662 def get_atom_datetime64(cls, shape):
2663 return _tables().Int64Col()
2665 @classmethod
2666 def get_atom_timedelta64(cls, shape):
2667 return _tables().Int64Col()
2670class GenericDataIndexableCol(DataIndexableCol):
2671 """represent a generic pytables data column"""
2674class Fixed:
2675 """
2676 represent an object in my store
2677 facilitate read/write of various types of objects
2678 this is an abstract base class
2680 Parameters
2681 ----------
2682 parent : HDFStore
2683 group : Node
2684 The group node where the table resides.
2685 """
2687 pandas_kind: str
2688 format_type: str = "fixed" # GH#30962 needed by dask
2689 obj_type: type[DataFrame | Series]
2690 ndim: int
2691 parent: HDFStore
2692 is_table: bool = False
2694 def __init__(
2695 self,
2696 parent: HDFStore,
2697 group: Node,
2698 encoding: str | None = "UTF-8",
2699 errors: str = "strict",
2700 ) -> None:
2701 assert isinstance(parent, HDFStore), type(parent)
2702 assert _table_mod is not None # needed for mypy
2703 assert isinstance(group, _table_mod.Node), type(group)
2704 self.parent = parent
2705 self.group = group
2706 self.encoding = _ensure_encoding(encoding)
2707 self.errors = errors
2709 @property
2710 def is_old_version(self) -> bool:
2711 return self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1
2713 @property
2714 def version(self) -> tuple[int, int, int]:
2715 """compute and set our version"""
2716 version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None))
2717 try:
2718 version = tuple(int(x) for x in version.split("."))
2719 if len(version) == 2:
2720 version = version + (0,)
2721 except AttributeError:
2722 version = (0, 0, 0)
2723 return version
2725 @property
2726 def pandas_type(self):
2727 return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None))
2729 def __repr__(self) -> str:
2730 """return a pretty representation of myself"""
2731 self.infer_axes()
2732 s = self.shape
2733 if s is not None:
2734 if isinstance(s, (list, tuple)):
2735 jshape = ",".join([pprint_thing(x) for x in s])
2736 s = f"[{jshape}]"
2737 return f"{self.pandas_type:12.12} (shape->{s})"
2738 return self.pandas_type
2740 def set_object_info(self) -> None:
2741 """set my pandas type & version"""
2742 self.attrs.pandas_type = str(self.pandas_kind)
2743 self.attrs.pandas_version = str(_version)
2745 def copy(self) -> Fixed:
2746 new_self = copy.copy(self)
2747 return new_self
2749 @property
2750 def shape(self):
2751 return self.nrows
2753 @property
2754 def pathname(self):
2755 return self.group._v_pathname
2757 @property
2758 def _handle(self):
2759 return self.parent._handle
2761 @property
2762 def _filters(self):
2763 return self.parent._filters
2765 @property
2766 def _complevel(self) -> int:
2767 return self.parent._complevel
2769 @property
2770 def _fletcher32(self) -> bool:
2771 return self.parent._fletcher32
2773 @property
2774 def attrs(self):
2775 return self.group._v_attrs
2777 def set_attrs(self) -> None:
2778 """set our object attributes"""
2780 def get_attrs(self) -> None:
2781 """get our object attributes"""
2783 @property
2784 def storable(self):
2785 """return my storable"""
2786 return self.group
2788 @property
2789 def is_exists(self) -> bool:
2790 return False
2792 @property
2793 def nrows(self):
2794 return getattr(self.storable, "nrows", None)
2796 def validate(self, other) -> Literal[True] | None:
2797 """validate against an existing storable"""
2798 if other is None:
2799 return None
2800 return True
2802 def validate_version(self, where=None) -> None:
2803 """are we trying to operate on an old version?"""
2805 def infer_axes(self) -> bool:
2806 """
2807 infer the axes of my storer
2808 return a boolean indicating if we have a valid storer or not
2809 """
2810 s = self.storable
2811 if s is None:
2812 return False
2813 self.get_attrs()
2814 return True
2816 def read(
2817 self,
2818 where=None,
2819 columns=None,
2820 start: int | None = None,
2821 stop: int | None = None,
2822 ):
2823 raise NotImplementedError(
2824 "cannot read on an abstract storer: subclasses should implement"
2825 )
2827 def write(self, obj, **kwargs) -> None:
2828 raise NotImplementedError(
2829 "cannot write on an abstract storer: subclasses should implement"
2830 )
2832 def delete(
2833 self, where=None, start: int | None = None, stop: int | None = None
2834 ) -> None:
2835 """
2836 support fully deleting the node in its entirety (only) - where
2837 specification must be None
2838 """
2839 if com.all_none(where, start, stop):
2840 self._handle.remove_node(self.group, recursive=True)
2841 return None
2843 raise TypeError("cannot delete on an abstract storer")
2846class GenericFixed(Fixed):
2847 """a generified fixed version"""
2849 _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"}
2850 _reverse_index_map = {v: k for k, v in _index_type_map.items()}
2851 attributes: list[str] = []
2853 # indexer helpers
2854 def _class_to_alias(self, cls) -> str:
2855 return self._index_type_map.get(cls, "")
2857 def _alias_to_class(self, alias):
2858 if isinstance(alias, type): # pragma: no cover
2859 # compat: for a short period of time master stored types
2860 return alias
2861 return self._reverse_index_map.get(alias, Index)
2863 def _get_index_factory(self, attrs):
2864 index_class = self._alias_to_class(
2865 _ensure_decoded(getattr(attrs, "index_class", ""))
2866 )
2868 factory: Callable
2870 if index_class == DatetimeIndex:
2872 def f(values, freq=None, tz=None):
2873 # data are already in UTC, localize and convert if tz present
2874 dta = DatetimeArray._simple_new(
2875 values.values, dtype=values.dtype, freq=freq
2876 )
2877 result = DatetimeIndex._simple_new(dta, name=None)
2878 if tz is not None:
2879 result = result.tz_localize("UTC").tz_convert(tz)
2880 return result
2882 factory = f
2883 elif index_class == PeriodIndex:
2885 def f(values, freq=None, tz=None):
2886 dtype = PeriodDtype(freq)
2887 parr = PeriodArray._simple_new(values, dtype=dtype)
2888 return PeriodIndex._simple_new(parr, name=None)
2890 factory = f
2891 else:
2892 factory = index_class
2894 kwargs = {}
2895 if "freq" in attrs:
2896 kwargs["freq"] = attrs["freq"]
2897 if index_class is Index:
2898 # DTI/PI would be gotten by _alias_to_class
2899 factory = TimedeltaIndex
2901 if "tz" in attrs:
2902 if isinstance(attrs["tz"], bytes):
2903 # created by python2
2904 kwargs["tz"] = attrs["tz"].decode("utf-8")
2905 else:
2906 # created by python3
2907 kwargs["tz"] = attrs["tz"]
2908 assert index_class is DatetimeIndex # just checking
2910 return factory, kwargs
2912 def validate_read(self, columns, where) -> None:
2913 """
2914 raise if any keywords are passed which are not-None
2915 """
2916 if columns is not None:
2917 raise TypeError(
2918 "cannot pass a column specification when reading "
2919 "a Fixed format store. this store must be selected in its entirety"
2920 )
2921 if where is not None:
2922 raise TypeError(
2923 "cannot pass a where specification when reading "
2924 "from a Fixed format store. this store must be selected in its entirety"
2925 )
2927 @property
2928 def is_exists(self) -> bool:
2929 return True
2931 def set_attrs(self) -> None:
2932 """set our object attributes"""
2933 self.attrs.encoding = self.encoding
2934 self.attrs.errors = self.errors
2936 def get_attrs(self) -> None:
2937 """retrieve our attributes"""
2938 self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
2939 self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
2940 for n in self.attributes:
2941 setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None)))
2943 def write(self, obj, **kwargs) -> None:
2944 self.set_attrs()
2946 def read_array(self, key: str, start: int | None = None, stop: int | None = None):
2947 """read an array for the specified node (off of group"""
2948 import tables
2950 node = getattr(self.group, key)
2951 attrs = node._v_attrs
2953 transposed = getattr(attrs, "transposed", False)
2955 if isinstance(node, tables.VLArray):
2956 ret = node[0][start:stop]
2957 else:
2958 dtype = _ensure_decoded(getattr(attrs, "value_type", None))
2959 shape = getattr(attrs, "shape", None)
2961 if shape is not None:
2962 # length 0 axis
2963 ret = np.empty(shape, dtype=dtype)
2964 else:
2965 ret = node[start:stop]
2967 if dtype and dtype.startswith("datetime64"):
2968 # reconstruct a timezone if indicated
2969 tz = getattr(attrs, "tz", None)
2970 ret = _set_tz(ret, tz, coerce=True)
2972 elif dtype == "timedelta64":
2973 ret = np.asarray(ret, dtype="m8[ns]")
2975 if transposed:
2976 return ret.T
2977 else:
2978 return ret
2980 def read_index(
2981 self, key: str, start: int | None = None, stop: int | None = None
2982 ) -> Index:
2983 variety = _ensure_decoded(getattr(self.attrs, f"{key}_variety"))
2985 if variety == "multi":
2986 return self.read_multi_index(key, start=start, stop=stop)
2987 elif variety == "regular":
2988 node = getattr(self.group, key)
2989 index = self.read_index_node(node, start=start, stop=stop)
2990 return index
2991 else: # pragma: no cover
2992 raise TypeError(f"unrecognized index variety: {variety}")
2994 def write_index(self, key: str, index: Index) -> None:
2995 if isinstance(index, MultiIndex):
2996 setattr(self.attrs, f"{key}_variety", "multi")
2997 self.write_multi_index(key, index)
2998 else:
2999 setattr(self.attrs, f"{key}_variety", "regular")
3000 converted = _convert_index("index", index, self.encoding, self.errors)
3002 self.write_array(key, converted.values)
3004 node = getattr(self.group, key)
3005 node._v_attrs.kind = converted.kind
3006 node._v_attrs.name = index.name
3008 if isinstance(index, (DatetimeIndex, PeriodIndex)):
3009 node._v_attrs.index_class = self._class_to_alias(type(index))
3011 if isinstance(index, (DatetimeIndex, PeriodIndex, TimedeltaIndex)):
3012 node._v_attrs.freq = index.freq
3014 if isinstance(index, DatetimeIndex) and index.tz is not None:
3015 node._v_attrs.tz = _get_tz(index.tz)
3017 def write_multi_index(self, key: str, index: MultiIndex) -> None:
3018 setattr(self.attrs, f"{key}_nlevels", index.nlevels)
3020 for i, (lev, level_codes, name) in enumerate(
3021 zip(index.levels, index.codes, index.names)
3022 ):
3023 # write the level
3024 if isinstance(lev.dtype, ExtensionDtype):
3025 raise NotImplementedError(
3026 "Saving a MultiIndex with an extension dtype is not supported."
3027 )
3028 level_key = f"{key}_level{i}"
3029 conv_level = _convert_index(level_key, lev, self.encoding, self.errors)
3030 self.write_array(level_key, conv_level.values)
3031 node = getattr(self.group, level_key)
3032 node._v_attrs.kind = conv_level.kind
3033 node._v_attrs.name = name
3035 # write the name
3036 setattr(node._v_attrs, f"{key}_name{name}", name)
3038 # write the labels
3039 label_key = f"{key}_label{i}"
3040 self.write_array(label_key, level_codes)
3042 def read_multi_index(
3043 self, key: str, start: int | None = None, stop: int | None = None
3044 ) -> MultiIndex:
3045 nlevels = getattr(self.attrs, f"{key}_nlevels")
3047 levels = []
3048 codes = []
3049 names: list[Hashable] = []
3050 for i in range(nlevels):
3051 level_key = f"{key}_level{i}"
3052 node = getattr(self.group, level_key)
3053 lev = self.read_index_node(node, start=start, stop=stop)
3054 levels.append(lev)
3055 names.append(lev.name)
3057 label_key = f"{key}_label{i}"
3058 level_codes = self.read_array(label_key, start=start, stop=stop)
3059 codes.append(level_codes)
3061 return MultiIndex(
3062 levels=levels, codes=codes, names=names, verify_integrity=True
3063 )
3065 def read_index_node(
3066 self, node: Node, start: int | None = None, stop: int | None = None
3067 ) -> Index:
3068 data = node[start:stop]
3069 # If the index was an empty array write_array_empty() will
3070 # have written a sentinel. Here we replace it with the original.
3071 if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0:
3072 data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type)
3073 kind = _ensure_decoded(node._v_attrs.kind)
3074 name = None
3076 if "name" in node._v_attrs:
3077 name = _ensure_str(node._v_attrs.name)
3078 name = _ensure_decoded(name)
3080 attrs = node._v_attrs
3081 factory, kwargs = self._get_index_factory(attrs)
3083 if kind in ("date", "object"):
3084 index = factory(
3085 _unconvert_index(
3086 data, kind, encoding=self.encoding, errors=self.errors
3087 ),
3088 dtype=object,
3089 **kwargs,
3090 )
3091 else:
3092 index = factory(
3093 _unconvert_index(
3094 data, kind, encoding=self.encoding, errors=self.errors
3095 ),
3096 **kwargs,
3097 )
3099 index.name = name
3101 return index
3103 def write_array_empty(self, key: str, value: ArrayLike) -> None:
3104 """write a 0-len array"""
3105 # ugly hack for length 0 axes
3106 arr = np.empty((1,) * value.ndim)
3107 self._handle.create_array(self.group, key, arr)
3108 node = getattr(self.group, key)
3109 node._v_attrs.value_type = str(value.dtype)
3110 node._v_attrs.shape = value.shape
3112 def write_array(
3113 self, key: str, obj: AnyArrayLike, items: Index | None = None
3114 ) -> None:
3115 # TODO: we only have a few tests that get here, the only EA
3116 # that gets passed is DatetimeArray, and we never have
3117 # both self._filters and EA
3119 value = extract_array(obj, extract_numpy=True)
3121 if key in self.group:
3122 self._handle.remove_node(self.group, key)
3124 # Transform needed to interface with pytables row/col notation
3125 empty_array = value.size == 0
3126 transposed = False
3128 if isinstance(value.dtype, CategoricalDtype):
3129 raise NotImplementedError(
3130 "Cannot store a category dtype in a HDF5 dataset that uses format="
3131 '"fixed". Use format="table".'
3132 )
3133 if not empty_array:
3134 if hasattr(value, "T"):
3135 # ExtensionArrays (1d) may not have transpose.
3136 value = value.T
3137 transposed = True
3139 atom = None
3140 if self._filters is not None:
3141 with suppress(ValueError):
3142 # get the atom for this datatype
3143 atom = _tables().Atom.from_dtype(value.dtype)
3145 if atom is not None:
3146 # We only get here if self._filters is non-None and
3147 # the Atom.from_dtype call succeeded
3149 # create an empty chunked array and fill it from value
3150 if not empty_array:
3151 ca = self._handle.create_carray(
3152 self.group, key, atom, value.shape, filters=self._filters
3153 )
3154 ca[:] = value
3156 else:
3157 self.write_array_empty(key, value)
3159 elif value.dtype.type == np.object_:
3160 # infer the type, warn if we have a non-string type here (for
3161 # performance)
3162 inferred_type = lib.infer_dtype(value, skipna=False)
3163 if empty_array:
3164 pass
3165 elif inferred_type == "string":
3166 pass
3167 else:
3168 ws = performance_doc % (inferred_type, key, items)
3169 warnings.warn(ws, PerformanceWarning, stacklevel=find_stack_level())
3171 vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom())
3172 vlarr.append(value)
3174 elif lib.is_np_dtype(value.dtype, "M"):
3175 self._handle.create_array(self.group, key, value.view("i8"))
3176 getattr(self.group, key)._v_attrs.value_type = str(value.dtype)
3177 elif isinstance(value.dtype, DatetimeTZDtype):
3178 # store as UTC
3179 # with a zone
3181 # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no
3182 # attribute "asi8"
3183 self._handle.create_array(
3184 self.group, key, value.asi8 # type: ignore[union-attr]
3185 )
3187 node = getattr(self.group, key)
3188 # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no
3189 # attribute "tz"
3190 node._v_attrs.tz = _get_tz(value.tz) # type: ignore[union-attr]
3191 node._v_attrs.value_type = f"datetime64[{value.dtype.unit}]"
3192 elif lib.is_np_dtype(value.dtype, "m"):
3193 self._handle.create_array(self.group, key, value.view("i8"))
3194 getattr(self.group, key)._v_attrs.value_type = "timedelta64"
3195 elif empty_array:
3196 self.write_array_empty(key, value)
3197 else:
3198 self._handle.create_array(self.group, key, value)
3200 getattr(self.group, key)._v_attrs.transposed = transposed
3203class SeriesFixed(GenericFixed):
3204 pandas_kind = "series"
3205 attributes = ["name"]
3207 name: Hashable
3209 @property
3210 def shape(self):
3211 try:
3212 return (len(self.group.values),)
3213 except (TypeError, AttributeError):
3214 return None
3216 def read(
3217 self,
3218 where=None,
3219 columns=None,
3220 start: int | None = None,
3221 stop: int | None = None,
3222 ) -> Series:
3223 self.validate_read(columns, where)
3224 index = self.read_index("index", start=start, stop=stop)
3225 values = self.read_array("values", start=start, stop=stop)
3226 result = Series(values, index=index, name=self.name, copy=False)
3227 if using_pyarrow_string_dtype() and is_string_array(values, skipna=True):
3228 result = result.astype("string[pyarrow_numpy]")
3229 return result
3231 def write(self, obj, **kwargs) -> None:
3232 super().write(obj, **kwargs)
3233 self.write_index("index", obj.index)
3234 self.write_array("values", obj)
3235 self.attrs.name = obj.name
3238class BlockManagerFixed(GenericFixed):
3239 attributes = ["ndim", "nblocks"]
3241 nblocks: int
3243 @property
3244 def shape(self) -> Shape | None:
3245 try:
3246 ndim = self.ndim
3248 # items
3249 items = 0
3250 for i in range(self.nblocks):
3251 node = getattr(self.group, f"block{i}_items")
3252 shape = getattr(node, "shape", None)
3253 if shape is not None:
3254 items += shape[0]
3256 # data shape
3257 node = self.group.block0_values
3258 shape = getattr(node, "shape", None)
3259 if shape is not None:
3260 shape = list(shape[0 : (ndim - 1)])
3261 else:
3262 shape = []
3264 shape.append(items)
3266 return shape
3267 except AttributeError:
3268 return None
3270 def read(
3271 self,
3272 where=None,
3273 columns=None,
3274 start: int | None = None,
3275 stop: int | None = None,
3276 ) -> DataFrame:
3277 # start, stop applied to rows, so 0th axis only
3278 self.validate_read(columns, where)
3279 select_axis = self.obj_type()._get_block_manager_axis(0)
3281 axes = []
3282 for i in range(self.ndim):
3283 _start, _stop = (start, stop) if i == select_axis else (None, None)
3284 ax = self.read_index(f"axis{i}", start=_start, stop=_stop)
3285 axes.append(ax)
3287 items = axes[0]
3288 dfs = []
3290 for i in range(self.nblocks):
3291 blk_items = self.read_index(f"block{i}_items")
3292 values = self.read_array(f"block{i}_values", start=_start, stop=_stop)
3294 columns = items[items.get_indexer(blk_items)]
3295 df = DataFrame(values.T, columns=columns, index=axes[1], copy=False)
3296 if using_pyarrow_string_dtype() and is_string_array(values, skipna=True):
3297 df = df.astype("string[pyarrow_numpy]")
3298 dfs.append(df)
3300 if len(dfs) > 0:
3301 out = concat(dfs, axis=1, copy=True)
3302 if using_copy_on_write():
3303 # with CoW, concat ignores the copy keyword. Here, we still want
3304 # to copy to enforce optimized column-major layout
3305 out = out.copy()
3306 out = out.reindex(columns=items, copy=False)
3307 return out
3309 return DataFrame(columns=axes[0], index=axes[1])
3311 def write(self, obj, **kwargs) -> None:
3312 super().write(obj, **kwargs)
3314 # TODO(ArrayManager) HDFStore relies on accessing the blocks
3315 if isinstance(obj._mgr, ArrayManager):
3316 obj = obj._as_manager("block")
3318 data = obj._mgr
3319 if not data.is_consolidated():
3320 data = data.consolidate()
3322 self.attrs.ndim = data.ndim
3323 for i, ax in enumerate(data.axes):
3324 if i == 0 and (not ax.is_unique):
3325 raise ValueError("Columns index has to be unique for fixed format")
3326 self.write_index(f"axis{i}", ax)
3328 # Supporting mixed-type DataFrame objects...nontrivial
3329 self.attrs.nblocks = len(data.blocks)
3330 for i, blk in enumerate(data.blocks):
3331 # I have no idea why, but writing values before items fixed #2299
3332 blk_items = data.items.take(blk.mgr_locs)
3333 self.write_array(f"block{i}_values", blk.values, items=blk_items)
3334 self.write_index(f"block{i}_items", blk_items)
3337class FrameFixed(BlockManagerFixed):
3338 pandas_kind = "frame"
3339 obj_type = DataFrame
3342class Table(Fixed):
3343 """
3344 represent a table:
3345 facilitate read/write of various types of tables
3347 Attrs in Table Node
3348 -------------------
3349 These are attributes that are store in the main table node, they are
3350 necessary to recreate these tables when read back in.
3352 index_axes : a list of tuples of the (original indexing axis and
3353 index column)
3354 non_index_axes: a list of tuples of the (original index axis and
3355 columns on a non-indexing axis)
3356 values_axes : a list of the columns which comprise the data of this
3357 table
3358 data_columns : a list of the columns that we are allowing indexing
3359 (these become single columns in values_axes)
3360 nan_rep : the string to use for nan representations for string
3361 objects
3362 levels : the names of levels
3363 metadata : the names of the metadata columns
3364 """
3366 pandas_kind = "wide_table"
3367 format_type: str = "table" # GH#30962 needed by dask
3368 table_type: str
3369 levels: int | list[Hashable] = 1
3370 is_table = True
3372 metadata: list
3374 def __init__(
3375 self,
3376 parent: HDFStore,
3377 group: Node,
3378 encoding: str | None = None,
3379 errors: str = "strict",
3380 index_axes: list[IndexCol] | None = None,
3381 non_index_axes: list[tuple[AxisInt, Any]] | None = None,
3382 values_axes: list[DataCol] | None = None,
3383 data_columns: list | None = None,
3384 info: dict | None = None,
3385 nan_rep=None,
3386 ) -> None:
3387 super().__init__(parent, group, encoding=encoding, errors=errors)
3388 self.index_axes = index_axes or []
3389 self.non_index_axes = non_index_axes or []
3390 self.values_axes = values_axes or []
3391 self.data_columns = data_columns or []
3392 self.info = info or {}
3393 self.nan_rep = nan_rep
3395 @property
3396 def table_type_short(self) -> str:
3397 return self.table_type.split("_")[0]
3399 def __repr__(self) -> str:
3400 """return a pretty representation of myself"""
3401 self.infer_axes()
3402 jdc = ",".join(self.data_columns) if len(self.data_columns) else ""
3403 dc = f",dc->[{jdc}]"
3405 ver = ""
3406 if self.is_old_version:
3407 jver = ".".join([str(x) for x in self.version])
3408 ver = f"[{jver}]"
3410 jindex_axes = ",".join([a.name for a in self.index_axes])
3411 return (
3412 f"{self.pandas_type:12.12}{ver} "
3413 f"(typ->{self.table_type_short},nrows->{self.nrows},"
3414 f"ncols->{self.ncols},indexers->[{jindex_axes}]{dc})"
3415 )
3417 def __getitem__(self, c: str):
3418 """return the axis for c"""
3419 for a in self.axes:
3420 if c == a.name:
3421 return a
3422 return None
3424 def validate(self, other) -> None:
3425 """validate against an existing table"""
3426 if other is None:
3427 return
3429 if other.table_type != self.table_type:
3430 raise TypeError(
3431 "incompatible table_type with existing "
3432 f"[{other.table_type} - {self.table_type}]"
3433 )
3435 for c in ["index_axes", "non_index_axes", "values_axes"]:
3436 sv = getattr(self, c, None)
3437 ov = getattr(other, c, None)
3438 if sv != ov:
3439 # show the error for the specific axes
3440 # Argument 1 to "enumerate" has incompatible type
3441 # "Optional[Any]"; expected "Iterable[Any]" [arg-type]
3442 for i, sax in enumerate(sv): # type: ignore[arg-type]
3443 # Value of type "Optional[Any]" is not indexable [index]
3444 oax = ov[i] # type: ignore[index]
3445 if sax != oax:
3446 raise ValueError(
3447 f"invalid combination of [{c}] on appending data "
3448 f"[{sax}] vs current table [{oax}]"
3449 )
3451 # should never get here
3452 raise Exception(
3453 f"invalid combination of [{c}] on appending data [{sv}] vs "
3454 f"current table [{ov}]"
3455 )
3457 @property
3458 def is_multi_index(self) -> bool:
3459 """the levels attribute is 1 or a list in the case of a multi-index"""
3460 return isinstance(self.levels, list)
3462 def validate_multiindex(
3463 self, obj: DataFrame | Series
3464 ) -> tuple[DataFrame, list[Hashable]]:
3465 """
3466 validate that we can store the multi-index; reset and return the
3467 new object
3468 """
3469 levels = com.fill_missing_names(obj.index.names)
3470 try:
3471 reset_obj = obj.reset_index()
3472 except ValueError as err:
3473 raise ValueError(
3474 "duplicate names/columns in the multi-index when storing as a table"
3475 ) from err
3476 assert isinstance(reset_obj, DataFrame) # for mypy
3477 return reset_obj, levels
3479 @property
3480 def nrows_expected(self) -> int:
3481 """based on our axes, compute the expected nrows"""
3482 return np.prod([i.cvalues.shape[0] for i in self.index_axes])
3484 @property
3485 def is_exists(self) -> bool:
3486 """has this table been created"""
3487 return "table" in self.group
3489 @property
3490 def storable(self):
3491 return getattr(self.group, "table", None)
3493 @property
3494 def table(self):
3495 """return the table group (this is my storable)"""
3496 return self.storable
3498 @property
3499 def dtype(self):
3500 return self.table.dtype
3502 @property
3503 def description(self):
3504 return self.table.description
3506 @property
3507 def axes(self) -> itertools.chain[IndexCol]:
3508 return itertools.chain(self.index_axes, self.values_axes)
3510 @property
3511 def ncols(self) -> int:
3512 """the number of total columns in the values axes"""
3513 return sum(len(a.values) for a in self.values_axes)
3515 @property
3516 def is_transposed(self) -> bool:
3517 return False
3519 @property
3520 def data_orientation(self) -> tuple[int, ...]:
3521 """return a tuple of my permutated axes, non_indexable at the front"""
3522 return tuple(
3523 itertools.chain(
3524 [int(a[0]) for a in self.non_index_axes],
3525 [int(a.axis) for a in self.index_axes],
3526 )
3527 )
3529 def queryables(self) -> dict[str, Any]:
3530 """return a dict of the kinds allowable columns for this object"""
3531 # mypy doesn't recognize DataFrame._AXIS_NAMES, so we re-write it here
3532 axis_names = {0: "index", 1: "columns"}
3534 # compute the values_axes queryables
3535 d1 = [(a.cname, a) for a in self.index_axes]
3536 d2 = [(axis_names[axis], None) for axis, values in self.non_index_axes]
3537 d3 = [
3538 (v.cname, v) for v in self.values_axes if v.name in set(self.data_columns)
3539 ]
3541 return dict(d1 + d2 + d3)
3543 def index_cols(self):
3544 """return a list of my index cols"""
3545 # Note: each `i.cname` below is assured to be a str.
3546 return [(i.axis, i.cname) for i in self.index_axes]
3548 def values_cols(self) -> list[str]:
3549 """return a list of my values cols"""
3550 return [i.cname for i in self.values_axes]
3552 def _get_metadata_path(self, key: str) -> str:
3553 """return the metadata pathname for this key"""
3554 group = self.group._v_pathname
3555 return f"{group}/meta/{key}/meta"
3557 def write_metadata(self, key: str, values: np.ndarray) -> None:
3558 """
3559 Write out a metadata array to the key as a fixed-format Series.
3561 Parameters
3562 ----------
3563 key : str
3564 values : ndarray
3565 """
3566 self.parent.put(
3567 self._get_metadata_path(key),
3568 Series(values, copy=False),
3569 format="table",
3570 encoding=self.encoding,
3571 errors=self.errors,
3572 nan_rep=self.nan_rep,
3573 )
3575 def read_metadata(self, key: str):
3576 """return the meta data array for this key"""
3577 if getattr(getattr(self.group, "meta", None), key, None) is not None:
3578 return self.parent.select(self._get_metadata_path(key))
3579 return None
3581 def set_attrs(self) -> None:
3582 """set our table type & indexables"""
3583 self.attrs.table_type = str(self.table_type)
3584 self.attrs.index_cols = self.index_cols()
3585 self.attrs.values_cols = self.values_cols()
3586 self.attrs.non_index_axes = self.non_index_axes
3587 self.attrs.data_columns = self.data_columns
3588 self.attrs.nan_rep = self.nan_rep
3589 self.attrs.encoding = self.encoding
3590 self.attrs.errors = self.errors
3591 self.attrs.levels = self.levels
3592 self.attrs.info = self.info
3594 def get_attrs(self) -> None:
3595 """retrieve our attributes"""
3596 self.non_index_axes = getattr(self.attrs, "non_index_axes", None) or []
3597 self.data_columns = getattr(self.attrs, "data_columns", None) or []
3598 self.info = getattr(self.attrs, "info", None) or {}
3599 self.nan_rep = getattr(self.attrs, "nan_rep", None)
3600 self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
3601 self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
3602 self.levels: list[Hashable] = getattr(self.attrs, "levels", None) or []
3603 self.index_axes = [a for a in self.indexables if a.is_an_indexable]
3604 self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
3606 def validate_version(self, where=None) -> None:
3607 """are we trying to operate on an old version?"""
3608 if where is not None:
3609 if self.is_old_version:
3610 ws = incompatibility_doc % ".".join([str(x) for x in self.version])
3611 warnings.warn(
3612 ws,
3613 IncompatibilityWarning,
3614 stacklevel=find_stack_level(),
3615 )
3617 def validate_min_itemsize(self, min_itemsize) -> None:
3618 """
3619 validate the min_itemsize doesn't contain items that are not in the
3620 axes this needs data_columns to be defined
3621 """
3622 if min_itemsize is None:
3623 return
3624 if not isinstance(min_itemsize, dict):
3625 return
3627 q = self.queryables()
3628 for k in min_itemsize:
3629 # ok, apply generally
3630 if k == "values":
3631 continue
3632 if k not in q:
3633 raise ValueError(
3634 f"min_itemsize has the key [{k}] which is not an axis or "
3635 "data_column"
3636 )
3638 @cache_readonly
3639 def indexables(self):
3640 """create/cache the indexables if they don't exist"""
3641 _indexables = []
3643 desc = self.description
3644 table_attrs = self.table.attrs
3646 # Note: each of the `name` kwargs below are str, ensured
3647 # by the definition in index_cols.
3648 # index columns
3649 for i, (axis, name) in enumerate(self.attrs.index_cols):
3650 atom = getattr(desc, name)
3651 md = self.read_metadata(name)
3652 meta = "category" if md is not None else None
3654 kind_attr = f"{name}_kind"
3655 kind = getattr(table_attrs, kind_attr, None)
3657 index_col = IndexCol(
3658 name=name,
3659 axis=axis,
3660 pos=i,
3661 kind=kind,
3662 typ=atom,
3663 table=self.table,
3664 meta=meta,
3665 metadata=md,
3666 )
3667 _indexables.append(index_col)
3669 # values columns
3670 dc = set(self.data_columns)
3671 base_pos = len(_indexables)
3673 def f(i, c):
3674 assert isinstance(c, str)
3675 klass = DataCol
3676 if c in dc:
3677 klass = DataIndexableCol
3679 atom = getattr(desc, c)
3680 adj_name = _maybe_adjust_name(c, self.version)
3682 # TODO: why kind_attr here?
3683 values = getattr(table_attrs, f"{adj_name}_kind", None)
3684 dtype = getattr(table_attrs, f"{adj_name}_dtype", None)
3685 # Argument 1 to "_dtype_to_kind" has incompatible type
3686 # "Optional[Any]"; expected "str" [arg-type]
3687 kind = _dtype_to_kind(dtype) # type: ignore[arg-type]
3689 md = self.read_metadata(c)
3690 # TODO: figure out why these two versions of `meta` dont always match.
3691 # meta = "category" if md is not None else None
3692 meta = getattr(table_attrs, f"{adj_name}_meta", None)
3694 obj = klass(
3695 name=adj_name,
3696 cname=c,
3697 values=values,
3698 kind=kind,
3699 pos=base_pos + i,
3700 typ=atom,
3701 table=self.table,
3702 meta=meta,
3703 metadata=md,
3704 dtype=dtype,
3705 )
3706 return obj
3708 # Note: the definition of `values_cols` ensures that each
3709 # `c` below is a str.
3710 _indexables.extend([f(i, c) for i, c in enumerate(self.attrs.values_cols)])
3712 return _indexables
3714 def create_index(
3715 self, columns=None, optlevel=None, kind: str | None = None
3716 ) -> None:
3717 """
3718 Create a pytables index on the specified columns.
3720 Parameters
3721 ----------
3722 columns : None, bool, or listlike[str]
3723 Indicate which columns to create an index on.
3725 * False : Do not create any indexes.
3726 * True : Create indexes on all columns.
3727 * None : Create indexes on all columns.
3728 * listlike : Create indexes on the given columns.
3730 optlevel : int or None, default None
3731 Optimization level, if None, pytables defaults to 6.
3732 kind : str or None, default None
3733 Kind of index, if None, pytables defaults to "medium".
3735 Raises
3736 ------
3737 TypeError if trying to create an index on a complex-type column.
3739 Notes
3740 -----
3741 Cannot index Time64Col or ComplexCol.
3742 Pytables must be >= 3.0.
3743 """
3744 if not self.infer_axes():
3745 return
3746 if columns is False:
3747 return
3749 # index all indexables and data_columns
3750 if columns is None or columns is True:
3751 columns = [a.cname for a in self.axes if a.is_data_indexable]
3752 if not isinstance(columns, (tuple, list)):
3753 columns = [columns]
3755 kw = {}
3756 if optlevel is not None:
3757 kw["optlevel"] = optlevel
3758 if kind is not None:
3759 kw["kind"] = kind
3761 table = self.table
3762 for c in columns:
3763 v = getattr(table.cols, c, None)
3764 if v is not None:
3765 # remove the index if the kind/optlevel have changed
3766 if v.is_indexed:
3767 index = v.index
3768 cur_optlevel = index.optlevel
3769 cur_kind = index.kind
3771 if kind is not None and cur_kind != kind:
3772 v.remove_index()
3773 else:
3774 kw["kind"] = cur_kind
3776 if optlevel is not None and cur_optlevel != optlevel:
3777 v.remove_index()
3778 else:
3779 kw["optlevel"] = cur_optlevel
3781 # create the index
3782 if not v.is_indexed:
3783 if v.type.startswith("complex"):
3784 raise TypeError(
3785 "Columns containing complex values can be stored but "
3786 "cannot be indexed when using table format. Either use "
3787 "fixed format, set index=False, or do not include "
3788 "the columns containing complex values to "
3789 "data_columns when initializing the table."
3790 )
3791 v.create_index(**kw)
3792 elif c in self.non_index_axes[0][1]:
3793 # GH 28156
3794 raise AttributeError(
3795 f"column {c} is not a data_column.\n"
3796 f"In order to read column {c} you must reload the dataframe \n"
3797 f"into HDFStore and include {c} with the data_columns argument."
3798 )
3800 def _read_axes(
3801 self, where, start: int | None = None, stop: int | None = None
3802 ) -> list[tuple[np.ndarray, np.ndarray] | tuple[Index, Index]]:
3803 """
3804 Create the axes sniffed from the table.
3806 Parameters
3807 ----------
3808 where : ???
3809 start : int or None, default None
3810 stop : int or None, default None
3812 Returns
3813 -------
3814 List[Tuple[index_values, column_values]]
3815 """
3816 # create the selection
3817 selection = Selection(self, where=where, start=start, stop=stop)
3818 values = selection.select()
3820 results = []
3821 # convert the data
3822 for a in self.axes:
3823 a.set_info(self.info)
3824 res = a.convert(
3825 values,
3826 nan_rep=self.nan_rep,
3827 encoding=self.encoding,
3828 errors=self.errors,
3829 )
3830 results.append(res)
3832 return results
3834 @classmethod
3835 def get_object(cls, obj, transposed: bool):
3836 """return the data for this obj"""
3837 return obj
3839 def validate_data_columns(self, data_columns, min_itemsize, non_index_axes):
3840 """
3841 take the input data_columns and min_itemize and create a data
3842 columns spec
3843 """
3844 if not len(non_index_axes):
3845 return []
3847 axis, axis_labels = non_index_axes[0]
3848 info = self.info.get(axis, {})
3849 if info.get("type") == "MultiIndex" and data_columns:
3850 raise ValueError(
3851 f"cannot use a multi-index on axis [{axis}] with "
3852 f"data_columns {data_columns}"
3853 )
3855 # evaluate the passed data_columns, True == use all columns
3856 # take only valid axis labels
3857 if data_columns is True:
3858 data_columns = list(axis_labels)
3859 elif data_columns is None:
3860 data_columns = []
3862 # if min_itemsize is a dict, add the keys (exclude 'values')
3863 if isinstance(min_itemsize, dict):
3864 existing_data_columns = set(data_columns)
3865 data_columns = list(data_columns) # ensure we do not modify
3866 data_columns.extend(
3867 [
3868 k
3869 for k in min_itemsize.keys()
3870 if k != "values" and k not in existing_data_columns
3871 ]
3872 )
3874 # return valid columns in the order of our axis
3875 return [c for c in data_columns if c in axis_labels]
3877 def _create_axes(
3878 self,
3879 axes,
3880 obj: DataFrame,
3881 validate: bool = True,
3882 nan_rep=None,
3883 data_columns=None,
3884 min_itemsize=None,
3885 ):
3886 """
3887 Create and return the axes.
3889 Parameters
3890 ----------
3891 axes: list or None
3892 The names or numbers of the axes to create.
3893 obj : DataFrame
3894 The object to create axes on.
3895 validate: bool, default True
3896 Whether to validate the obj against an existing object already written.
3897 nan_rep :
3898 A value to use for string column nan_rep.
3899 data_columns : List[str], True, or None, default None
3900 Specify the columns that we want to create to allow indexing on.
3902 * True : Use all available columns.
3903 * None : Use no columns.
3904 * List[str] : Use the specified columns.
3906 min_itemsize: Dict[str, int] or None, default None
3907 The min itemsize for a column in bytes.
3908 """
3909 if not isinstance(obj, DataFrame):
3910 group = self.group._v_name
3911 raise TypeError(
3912 f"cannot properly create the storer for: [group->{group},"
3913 f"value->{type(obj)}]"
3914 )
3916 # set the default axes if needed
3917 if axes is None:
3918 axes = [0]
3920 # map axes to numbers
3921 axes = [obj._get_axis_number(a) for a in axes]
3923 # do we have an existing table (if so, use its axes & data_columns)
3924 if self.infer_axes():
3925 table_exists = True
3926 axes = [a.axis for a in self.index_axes]
3927 data_columns = list(self.data_columns)
3928 nan_rep = self.nan_rep
3929 # TODO: do we always have validate=True here?
3930 else:
3931 table_exists = False
3933 new_info = self.info
3935 assert self.ndim == 2 # with next check, we must have len(axes) == 1
3936 # currently support on ndim-1 axes
3937 if len(axes) != self.ndim - 1:
3938 raise ValueError(
3939 "currently only support ndim-1 indexers in an AppendableTable"
3940 )
3942 # create according to the new data
3943 new_non_index_axes: list = []
3945 # nan_representation
3946 if nan_rep is None:
3947 nan_rep = "nan"
3949 # We construct the non-index-axis first, since that alters new_info
3950 idx = next(x for x in [0, 1] if x not in axes)
3952 a = obj.axes[idx]
3953 # we might be able to change the axes on the appending data if necessary
3954 append_axis = list(a)
3955 if table_exists:
3956 indexer = len(new_non_index_axes) # i.e. 0
3957 exist_axis = self.non_index_axes[indexer][1]
3958 if not array_equivalent(
3959 np.array(append_axis),
3960 np.array(exist_axis),
3961 strict_nan=True,
3962 dtype_equal=True,
3963 ):
3964 # ahah! -> reindex
3965 if array_equivalent(
3966 np.array(sorted(append_axis)),
3967 np.array(sorted(exist_axis)),
3968 strict_nan=True,
3969 dtype_equal=True,
3970 ):
3971 append_axis = exist_axis
3973 # the non_index_axes info
3974 info = new_info.setdefault(idx, {})
3975 info["names"] = list(a.names)
3976 info["type"] = type(a).__name__
3978 new_non_index_axes.append((idx, append_axis))
3980 # Now we can construct our new index axis
3981 idx = axes[0]
3982 a = obj.axes[idx]
3983 axis_name = obj._get_axis_name(idx)
3984 new_index = _convert_index(axis_name, a, self.encoding, self.errors)
3985 new_index.axis = idx
3987 # Because we are always 2D, there is only one new_index, so
3988 # we know it will have pos=0
3989 new_index.set_pos(0)
3990 new_index.update_info(new_info)
3991 new_index.maybe_set_size(min_itemsize) # check for column conflicts
3993 new_index_axes = [new_index]
3994 j = len(new_index_axes) # i.e. 1
3995 assert j == 1
3997 # reindex by our non_index_axes & compute data_columns
3998 assert len(new_non_index_axes) == 1
3999 for a in new_non_index_axes:
4000 obj = _reindex_axis(obj, a[0], a[1])
4002 transposed = new_index.axis == 1
4004 # figure out data_columns and get out blocks
4005 data_columns = self.validate_data_columns(
4006 data_columns, min_itemsize, new_non_index_axes
4007 )
4009 frame = self.get_object(obj, transposed)._consolidate()
4011 blocks, blk_items = self._get_blocks_and_items(
4012 frame, table_exists, new_non_index_axes, self.values_axes, data_columns
4013 )
4015 # add my values
4016 vaxes = []
4017 for i, (blk, b_items) in enumerate(zip(blocks, blk_items)):
4018 # shape of the data column are the indexable axes
4019 klass = DataCol
4020 name = None
4022 # we have a data_column
4023 if data_columns and len(b_items) == 1 and b_items[0] in data_columns:
4024 klass = DataIndexableCol
4025 name = b_items[0]
4026 if not (name is None or isinstance(name, str)):
4027 # TODO: should the message here be more specifically non-str?
4028 raise ValueError("cannot have non-object label DataIndexableCol")
4030 # make sure that we match up the existing columns
4031 # if we have an existing table
4032 existing_col: DataCol | None
4034 if table_exists and validate:
4035 try:
4036 existing_col = self.values_axes[i]
4037 except (IndexError, KeyError) as err:
4038 raise ValueError(
4039 f"Incompatible appended table [{blocks}]"
4040 f"with existing table [{self.values_axes}]"
4041 ) from err
4042 else:
4043 existing_col = None
4045 new_name = name or f"values_block_{i}"
4046 data_converted = _maybe_convert_for_string_atom(
4047 new_name,
4048 blk.values,
4049 existing_col=existing_col,
4050 min_itemsize=min_itemsize,
4051 nan_rep=nan_rep,
4052 encoding=self.encoding,
4053 errors=self.errors,
4054 columns=b_items,
4055 )
4056 adj_name = _maybe_adjust_name(new_name, self.version)
4058 typ = klass._get_atom(data_converted)
4059 kind = _dtype_to_kind(data_converted.dtype.name)
4060 tz = None
4061 if getattr(data_converted, "tz", None) is not None:
4062 tz = _get_tz(data_converted.tz)
4064 meta = metadata = ordered = None
4065 if isinstance(data_converted.dtype, CategoricalDtype):
4066 ordered = data_converted.ordered
4067 meta = "category"
4068 metadata = np.asarray(data_converted.categories).ravel()
4070 data, dtype_name = _get_data_and_dtype_name(data_converted)
4072 col = klass(
4073 name=adj_name,
4074 cname=new_name,
4075 values=list(b_items),
4076 typ=typ,
4077 pos=j,
4078 kind=kind,
4079 tz=tz,
4080 ordered=ordered,
4081 meta=meta,
4082 metadata=metadata,
4083 dtype=dtype_name,
4084 data=data,
4085 )
4086 col.update_info(new_info)
4088 vaxes.append(col)
4090 j += 1
4092 dcs = [col.name for col in vaxes if col.is_data_indexable]
4094 new_table = type(self)(
4095 parent=self.parent,
4096 group=self.group,
4097 encoding=self.encoding,
4098 errors=self.errors,
4099 index_axes=new_index_axes,
4100 non_index_axes=new_non_index_axes,
4101 values_axes=vaxes,
4102 data_columns=dcs,
4103 info=new_info,
4104 nan_rep=nan_rep,
4105 )
4106 if hasattr(self, "levels"):
4107 # TODO: get this into constructor, only for appropriate subclass
4108 new_table.levels = self.levels
4110 new_table.validate_min_itemsize(min_itemsize)
4112 if validate and table_exists:
4113 new_table.validate(self)
4115 return new_table
4117 @staticmethod
4118 def _get_blocks_and_items(
4119 frame: DataFrame,
4120 table_exists: bool,
4121 new_non_index_axes,
4122 values_axes,
4123 data_columns,
4124 ):
4125 # Helper to clarify non-state-altering parts of _create_axes
4127 # TODO(ArrayManager) HDFStore relies on accessing the blocks
4128 if isinstance(frame._mgr, ArrayManager):
4129 frame = frame._as_manager("block")
4131 def get_blk_items(mgr):
4132 return [mgr.items.take(blk.mgr_locs) for blk in mgr.blocks]
4134 mgr = frame._mgr
4135 mgr = cast(BlockManager, mgr)
4136 blocks: list[Block] = list(mgr.blocks)
4137 blk_items: list[Index] = get_blk_items(mgr)
4139 if len(data_columns):
4140 # TODO: prove that we only get here with axis == 1?
4141 # It is the case in all extant tests, but NOT the case
4142 # outside this `if len(data_columns)` check.
4144 axis, axis_labels = new_non_index_axes[0]
4145 new_labels = Index(axis_labels).difference(Index(data_columns))
4146 mgr = frame.reindex(new_labels, axis=axis)._mgr
4147 mgr = cast(BlockManager, mgr)
4149 blocks = list(mgr.blocks)
4150 blk_items = get_blk_items(mgr)
4151 for c in data_columns:
4152 # This reindex would raise ValueError if we had a duplicate
4153 # index, so we can infer that (as long as axis==1) we
4154 # get a single column back, so a single block.
4155 mgr = frame.reindex([c], axis=axis)._mgr
4156 mgr = cast(BlockManager, mgr)
4157 blocks.extend(mgr.blocks)
4158 blk_items.extend(get_blk_items(mgr))
4160 # reorder the blocks in the same order as the existing table if we can
4161 if table_exists:
4162 by_items = {
4163 tuple(b_items.tolist()): (b, b_items)
4164 for b, b_items in zip(blocks, blk_items)
4165 }
4166 new_blocks: list[Block] = []
4167 new_blk_items = []
4168 for ea in values_axes:
4169 items = tuple(ea.values)
4170 try:
4171 b, b_items = by_items.pop(items)
4172 new_blocks.append(b)
4173 new_blk_items.append(b_items)
4174 except (IndexError, KeyError) as err:
4175 jitems = ",".join([pprint_thing(item) for item in items])
4176 raise ValueError(
4177 f"cannot match existing table structure for [{jitems}] "
4178 "on appending data"
4179 ) from err
4180 blocks = new_blocks
4181 blk_items = new_blk_items
4183 return blocks, blk_items
4185 def process_axes(self, obj, selection: Selection, columns=None) -> DataFrame:
4186 """process axes filters"""
4187 # make a copy to avoid side effects
4188 if columns is not None:
4189 columns = list(columns)
4191 # make sure to include levels if we have them
4192 if columns is not None and self.is_multi_index:
4193 assert isinstance(self.levels, list) # assured by is_multi_index
4194 for n in self.levels:
4195 if n not in columns:
4196 columns.insert(0, n)
4198 # reorder by any non_index_axes & limit to the select columns
4199 for axis, labels in self.non_index_axes:
4200 obj = _reindex_axis(obj, axis, labels, columns)
4202 def process_filter(field, filt, op):
4203 for axis_name in obj._AXIS_ORDERS:
4204 axis_number = obj._get_axis_number(axis_name)
4205 axis_values = obj._get_axis(axis_name)
4206 assert axis_number is not None
4208 # see if the field is the name of an axis
4209 if field == axis_name:
4210 # if we have a multi-index, then need to include
4211 # the levels
4212 if self.is_multi_index:
4213 filt = filt.union(Index(self.levels))
4215 takers = op(axis_values, filt)
4216 return obj.loc(axis=axis_number)[takers]
4218 # this might be the name of a file IN an axis
4219 elif field in axis_values:
4220 # we need to filter on this dimension
4221 values = ensure_index(getattr(obj, field).values)
4222 filt = ensure_index(filt)
4224 # hack until we support reversed dim flags
4225 if isinstance(obj, DataFrame):
4226 axis_number = 1 - axis_number
4228 takers = op(values, filt)
4229 return obj.loc(axis=axis_number)[takers]
4231 raise ValueError(f"cannot find the field [{field}] for filtering!")
4233 # apply the selection filters (but keep in the same order)
4234 if selection.filter is not None:
4235 for field, op, filt in selection.filter.format():
4236 obj = process_filter(field, filt, op)
4238 return obj
4240 def create_description(
4241 self,
4242 complib,
4243 complevel: int | None,
4244 fletcher32: bool,
4245 expectedrows: int | None,
4246 ) -> dict[str, Any]:
4247 """create the description of the table from the axes & values"""
4248 # provided expected rows if its passed
4249 if expectedrows is None:
4250 expectedrows = max(self.nrows_expected, 10000)
4252 d = {"name": "table", "expectedrows": expectedrows}
4254 # description from the axes & values
4255 d["description"] = {a.cname: a.typ for a in self.axes}
4257 if complib:
4258 if complevel is None:
4259 complevel = self._complevel or 9
4260 filters = _tables().Filters(
4261 complevel=complevel,
4262 complib=complib,
4263 fletcher32=fletcher32 or self._fletcher32,
4264 )
4265 d["filters"] = filters
4266 elif self._filters is not None:
4267 d["filters"] = self._filters
4269 return d
4271 def read_coordinates(
4272 self, where=None, start: int | None = None, stop: int | None = None
4273 ):
4274 """
4275 select coordinates (row numbers) from a table; return the
4276 coordinates object
4277 """
4278 # validate the version
4279 self.validate_version(where)
4281 # infer the data kind
4282 if not self.infer_axes():
4283 return False
4285 # create the selection
4286 selection = Selection(self, where=where, start=start, stop=stop)
4287 coords = selection.select_coords()
4288 if selection.filter is not None:
4289 for field, op, filt in selection.filter.format():
4290 data = self.read_column(
4291 field, start=coords.min(), stop=coords.max() + 1
4292 )
4293 coords = coords[op(data.iloc[coords - coords.min()], filt).values]
4295 return Index(coords)
4297 def read_column(
4298 self,
4299 column: str,
4300 where=None,
4301 start: int | None = None,
4302 stop: int | None = None,
4303 ):
4304 """
4305 return a single column from the table, generally only indexables
4306 are interesting
4307 """
4308 # validate the version
4309 self.validate_version()
4311 # infer the data kind
4312 if not self.infer_axes():
4313 return False
4315 if where is not None:
4316 raise TypeError("read_column does not currently accept a where clause")
4318 # find the axes
4319 for a in self.axes:
4320 if column == a.name:
4321 if not a.is_data_indexable:
4322 raise ValueError(
4323 f"column [{column}] can not be extracted individually; "
4324 "it is not data indexable"
4325 )
4327 # column must be an indexable or a data column
4328 c = getattr(self.table.cols, column)
4329 a.set_info(self.info)
4330 col_values = a.convert(
4331 c[start:stop],
4332 nan_rep=self.nan_rep,
4333 encoding=self.encoding,
4334 errors=self.errors,
4335 )
4336 return Series(_set_tz(col_values[1], a.tz), name=column, copy=False)
4338 raise KeyError(f"column [{column}] not found in the table")
4341class WORMTable(Table):
4342 """
4343 a write-once read-many table: this format DOES NOT ALLOW appending to a
4344 table. writing is a one-time operation the data are stored in a format
4345 that allows for searching the data on disk
4346 """
4348 table_type = "worm"
4350 def read(
4351 self,
4352 where=None,
4353 columns=None,
4354 start: int | None = None,
4355 stop: int | None = None,
4356 ):
4357 """
4358 read the indices and the indexing array, calculate offset rows and return
4359 """
4360 raise NotImplementedError("WORMTable needs to implement read")
4362 def write(self, obj, **kwargs) -> None:
4363 """
4364 write in a format that we can search later on (but cannot append
4365 to): write out the indices and the values using _write_array
4366 (e.g. a CArray) create an indexing table so that we can search
4367 """
4368 raise NotImplementedError("WORMTable needs to implement write")
4371class AppendableTable(Table):
4372 """support the new appendable table formats"""
4374 table_type = "appendable"
4376 # error: Signature of "write" incompatible with supertype "Fixed"
4377 def write( # type: ignore[override]
4378 self,
4379 obj,
4380 axes=None,
4381 append: bool = False,
4382 complib=None,
4383 complevel=None,
4384 fletcher32=None,
4385 min_itemsize=None,
4386 chunksize: int | None = None,
4387 expectedrows=None,
4388 dropna: bool = False,
4389 nan_rep=None,
4390 data_columns=None,
4391 track_times: bool = True,
4392 ) -> None:
4393 if not append and self.is_exists:
4394 self._handle.remove_node(self.group, "table")
4396 # create the axes
4397 table = self._create_axes(
4398 axes=axes,
4399 obj=obj,
4400 validate=append,
4401 min_itemsize=min_itemsize,
4402 nan_rep=nan_rep,
4403 data_columns=data_columns,
4404 )
4406 for a in table.axes:
4407 a.validate_names()
4409 if not table.is_exists:
4410 # create the table
4411 options = table.create_description(
4412 complib=complib,
4413 complevel=complevel,
4414 fletcher32=fletcher32,
4415 expectedrows=expectedrows,
4416 )
4418 # set the table attributes
4419 table.set_attrs()
4421 options["track_times"] = track_times
4423 # create the table
4424 table._handle.create_table(table.group, **options)
4426 # update my info
4427 table.attrs.info = table.info
4429 # validate the axes and set the kinds
4430 for a in table.axes:
4431 a.validate_and_set(table, append)
4433 # add the rows
4434 table.write_data(chunksize, dropna=dropna)
4436 def write_data(self, chunksize: int | None, dropna: bool = False) -> None:
4437 """
4438 we form the data into a 2-d including indexes,values,mask write chunk-by-chunk
4439 """
4440 names = self.dtype.names
4441 nrows = self.nrows_expected
4443 # if dropna==True, then drop ALL nan rows
4444 masks = []
4445 if dropna:
4446 for a in self.values_axes:
4447 # figure the mask: only do if we can successfully process this
4448 # column, otherwise ignore the mask
4449 mask = isna(a.data).all(axis=0)
4450 if isinstance(mask, np.ndarray):
4451 masks.append(mask.astype("u1", copy=False))
4453 # consolidate masks
4454 if len(masks):
4455 mask = masks[0]
4456 for m in masks[1:]:
4457 mask = mask & m
4458 mask = mask.ravel()
4459 else:
4460 mask = None
4462 # broadcast the indexes if needed
4463 indexes = [a.cvalues for a in self.index_axes]
4464 nindexes = len(indexes)
4465 assert nindexes == 1, nindexes # ensures we dont need to broadcast
4467 # transpose the values so first dimension is last
4468 # reshape the values if needed
4469 values = [a.take_data() for a in self.values_axes]
4470 values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) for v in values]
4471 bvalues = []
4472 for i, v in enumerate(values):
4473 new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape
4474 bvalues.append(v.reshape(new_shape))
4476 # write the chunks
4477 if chunksize is None:
4478 chunksize = 100000
4480 rows = np.empty(min(chunksize, nrows), dtype=self.dtype)
4481 chunks = nrows // chunksize + 1
4482 for i in range(chunks):
4483 start_i = i * chunksize
4484 end_i = min((i + 1) * chunksize, nrows)
4485 if start_i >= end_i:
4486 break
4488 self.write_data_chunk(
4489 rows,
4490 indexes=[a[start_i:end_i] for a in indexes],
4491 mask=mask[start_i:end_i] if mask is not None else None,
4492 values=[v[start_i:end_i] for v in bvalues],
4493 )
4495 def write_data_chunk(
4496 self,
4497 rows: np.ndarray,
4498 indexes: list[np.ndarray],
4499 mask: npt.NDArray[np.bool_] | None,
4500 values: list[np.ndarray],
4501 ) -> None:
4502 """
4503 Parameters
4504 ----------
4505 rows : an empty memory space where we are putting the chunk
4506 indexes : an array of the indexes
4507 mask : an array of the masks
4508 values : an array of the values
4509 """
4510 # 0 len
4511 for v in values:
4512 if not np.prod(v.shape):
4513 return
4515 nrows = indexes[0].shape[0]
4516 if nrows != len(rows):
4517 rows = np.empty(nrows, dtype=self.dtype)
4518 names = self.dtype.names
4519 nindexes = len(indexes)
4521 # indexes
4522 for i, idx in enumerate(indexes):
4523 rows[names[i]] = idx
4525 # values
4526 for i, v in enumerate(values):
4527 rows[names[i + nindexes]] = v
4529 # mask
4530 if mask is not None:
4531 m = ~mask.ravel().astype(bool, copy=False)
4532 if not m.all():
4533 rows = rows[m]
4535 if len(rows):
4536 self.table.append(rows)
4537 self.table.flush()
4539 def delete(self, where=None, start: int | None = None, stop: int | None = None):
4540 # delete all rows (and return the nrows)
4541 if where is None or not len(where):
4542 if start is None and stop is None:
4543 nrows = self.nrows
4544 self._handle.remove_node(self.group, recursive=True)
4545 else:
4546 # pytables<3.0 would remove a single row with stop=None
4547 if stop is None:
4548 stop = self.nrows
4549 nrows = self.table.remove_rows(start=start, stop=stop)
4550 self.table.flush()
4551 return nrows
4553 # infer the data kind
4554 if not self.infer_axes():
4555 return None
4557 # create the selection
4558 table = self.table
4559 selection = Selection(self, where, start=start, stop=stop)
4560 values = selection.select_coords()
4562 # delete the rows in reverse order
4563 sorted_series = Series(values, copy=False).sort_values()
4564 ln = len(sorted_series)
4566 if ln:
4567 # construct groups of consecutive rows
4568 diff = sorted_series.diff()
4569 groups = list(diff[diff > 1].index)
4571 # 1 group
4572 if not len(groups):
4573 groups = [0]
4575 # final element
4576 if groups[-1] != ln:
4577 groups.append(ln)
4579 # initial element
4580 if groups[0] != 0:
4581 groups.insert(0, 0)
4583 # we must remove in reverse order!
4584 pg = groups.pop()
4585 for g in reversed(groups):
4586 rows = sorted_series.take(range(g, pg))
4587 table.remove_rows(
4588 start=rows[rows.index[0]], stop=rows[rows.index[-1]] + 1
4589 )
4590 pg = g
4592 self.table.flush()
4594 # return the number of rows removed
4595 return ln
4598class AppendableFrameTable(AppendableTable):
4599 """support the new appendable table formats"""
4601 pandas_kind = "frame_table"
4602 table_type = "appendable_frame"
4603 ndim = 2
4604 obj_type: type[DataFrame | Series] = DataFrame
4606 @property
4607 def is_transposed(self) -> bool:
4608 return self.index_axes[0].axis == 1
4610 @classmethod
4611 def get_object(cls, obj, transposed: bool):
4612 """these are written transposed"""
4613 if transposed:
4614 obj = obj.T
4615 return obj
4617 def read(
4618 self,
4619 where=None,
4620 columns=None,
4621 start: int | None = None,
4622 stop: int | None = None,
4623 ):
4624 # validate the version
4625 self.validate_version(where)
4627 # infer the data kind
4628 if not self.infer_axes():
4629 return None
4631 result = self._read_axes(where=where, start=start, stop=stop)
4633 info = (
4634 self.info.get(self.non_index_axes[0][0], {})
4635 if len(self.non_index_axes)
4636 else {}
4637 )
4639 inds = [i for i, ax in enumerate(self.axes) if ax is self.index_axes[0]]
4640 assert len(inds) == 1
4641 ind = inds[0]
4643 index = result[ind][0]
4645 frames = []
4646 for i, a in enumerate(self.axes):
4647 if a not in self.values_axes:
4648 continue
4649 index_vals, cvalues = result[i]
4651 # we could have a multi-index constructor here
4652 # ensure_index doesn't recognized our list-of-tuples here
4653 if info.get("type") != "MultiIndex":
4654 cols = Index(index_vals)
4655 else:
4656 cols = MultiIndex.from_tuples(index_vals)
4658 names = info.get("names")
4659 if names is not None:
4660 cols.set_names(names, inplace=True)
4662 if self.is_transposed:
4663 values = cvalues
4664 index_ = cols
4665 cols_ = Index(index, name=getattr(index, "name", None))
4666 else:
4667 values = cvalues.T
4668 index_ = Index(index, name=getattr(index, "name", None))
4669 cols_ = cols
4671 # if we have a DataIndexableCol, its shape will only be 1 dim
4672 if values.ndim == 1 and isinstance(values, np.ndarray):
4673 values = values.reshape((1, values.shape[0]))
4675 if isinstance(values, np.ndarray):
4676 df = DataFrame(values.T, columns=cols_, index=index_, copy=False)
4677 elif isinstance(values, Index):
4678 df = DataFrame(values, columns=cols_, index=index_)
4679 else:
4680 # Categorical
4681 df = DataFrame._from_arrays([values], columns=cols_, index=index_)
4682 if not (using_pyarrow_string_dtype() and values.dtype.kind == "O"):
4683 assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
4684 if using_pyarrow_string_dtype() and is_string_array(
4685 values, # type: ignore[arg-type]
4686 skipna=True,
4687 ):
4688 df = df.astype("string[pyarrow_numpy]")
4689 frames.append(df)
4691 if len(frames) == 1:
4692 df = frames[0]
4693 else:
4694 df = concat(frames, axis=1)
4696 selection = Selection(self, where=where, start=start, stop=stop)
4697 # apply the selection filters & axis orderings
4698 df = self.process_axes(df, selection=selection, columns=columns)
4699 return df
4702class AppendableSeriesTable(AppendableFrameTable):
4703 """support the new appendable table formats"""
4705 pandas_kind = "series_table"
4706 table_type = "appendable_series"
4707 ndim = 2
4708 obj_type = Series
4710 @property
4711 def is_transposed(self) -> bool:
4712 return False
4714 @classmethod
4715 def get_object(cls, obj, transposed: bool):
4716 return obj
4718 # error: Signature of "write" incompatible with supertype "Fixed"
4719 def write(self, obj, data_columns=None, **kwargs) -> None: # type: ignore[override]
4720 """we are going to write this as a frame table"""
4721 if not isinstance(obj, DataFrame):
4722 name = obj.name or "values"
4723 obj = obj.to_frame(name)
4724 super().write(obj=obj, data_columns=obj.columns.tolist(), **kwargs)
4726 def read(
4727 self,
4728 where=None,
4729 columns=None,
4730 start: int | None = None,
4731 stop: int | None = None,
4732 ) -> Series:
4733 is_multi_index = self.is_multi_index
4734 if columns is not None and is_multi_index:
4735 assert isinstance(self.levels, list) # needed for mypy
4736 for n in self.levels:
4737 if n not in columns:
4738 columns.insert(0, n)
4739 s = super().read(where=where, columns=columns, start=start, stop=stop)
4740 if is_multi_index:
4741 s.set_index(self.levels, inplace=True)
4743 s = s.iloc[:, 0]
4745 # remove the default name
4746 if s.name == "values":
4747 s.name = None
4748 return s
4751class AppendableMultiSeriesTable(AppendableSeriesTable):
4752 """support the new appendable table formats"""
4754 pandas_kind = "series_table"
4755 table_type = "appendable_multiseries"
4757 # error: Signature of "write" incompatible with supertype "Fixed"
4758 def write(self, obj, **kwargs) -> None: # type: ignore[override]
4759 """we are going to write this as a frame table"""
4760 name = obj.name or "values"
4761 newobj, self.levels = self.validate_multiindex(obj)
4762 assert isinstance(self.levels, list) # for mypy
4763 cols = list(self.levels)
4764 cols.append(name)
4765 newobj.columns = Index(cols)
4766 super().write(obj=newobj, **kwargs)
4769class GenericTable(AppendableFrameTable):
4770 """a table that read/writes the generic pytables table format"""
4772 pandas_kind = "frame_table"
4773 table_type = "generic_table"
4774 ndim = 2
4775 obj_type = DataFrame
4776 levels: list[Hashable]
4778 @property
4779 def pandas_type(self) -> str:
4780 return self.pandas_kind
4782 @property
4783 def storable(self):
4784 return getattr(self.group, "table", None) or self.group
4786 def get_attrs(self) -> None:
4787 """retrieve our attributes"""
4788 self.non_index_axes = []
4789 self.nan_rep = None
4790 self.levels = []
4792 self.index_axes = [a for a in self.indexables if a.is_an_indexable]
4793 self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
4794 self.data_columns = [a.name for a in self.values_axes]
4796 @cache_readonly
4797 def indexables(self):
4798 """create the indexables from the table description"""
4799 d = self.description
4801 # TODO: can we get a typ for this? AFAICT it is the only place
4802 # where we aren't passing one
4803 # the index columns is just a simple index
4804 md = self.read_metadata("index")
4805 meta = "category" if md is not None else None
4806 index_col = GenericIndexCol(
4807 name="index", axis=0, table=self.table, meta=meta, metadata=md
4808 )
4810 _indexables: list[GenericIndexCol | GenericDataIndexableCol] = [index_col]
4812 for i, n in enumerate(d._v_names):
4813 assert isinstance(n, str)
4815 atom = getattr(d, n)
4816 md = self.read_metadata(n)
4817 meta = "category" if md is not None else None
4818 dc = GenericDataIndexableCol(
4819 name=n,
4820 pos=i,
4821 values=[n],
4822 typ=atom,
4823 table=self.table,
4824 meta=meta,
4825 metadata=md,
4826 )
4827 _indexables.append(dc)
4829 return _indexables
4831 # error: Signature of "write" incompatible with supertype "AppendableTable"
4832 def write(self, **kwargs) -> None: # type: ignore[override]
4833 raise NotImplementedError("cannot write on an generic table")
4836class AppendableMultiFrameTable(AppendableFrameTable):
4837 """a frame with a multi-index"""
4839 table_type = "appendable_multiframe"
4840 obj_type = DataFrame
4841 ndim = 2
4842 _re_levels = re.compile(r"^level_\d+$")
4844 @property
4845 def table_type_short(self) -> str:
4846 return "appendable_multi"
4848 # error: Signature of "write" incompatible with supertype "Fixed"
4849 def write(self, obj, data_columns=None, **kwargs) -> None: # type: ignore[override]
4850 if data_columns is None:
4851 data_columns = []
4852 elif data_columns is True:
4853 data_columns = obj.columns.tolist()
4854 obj, self.levels = self.validate_multiindex(obj)
4855 assert isinstance(self.levels, list) # for mypy
4856 for n in self.levels:
4857 if n not in data_columns:
4858 data_columns.insert(0, n)
4859 super().write(obj=obj, data_columns=data_columns, **kwargs)
4861 def read(
4862 self,
4863 where=None,
4864 columns=None,
4865 start: int | None = None,
4866 stop: int | None = None,
4867 ):
4868 df = super().read(where=where, columns=columns, start=start, stop=stop)
4869 df = df.set_index(self.levels)
4871 # remove names for 'level_%d'
4872 df.index = df.index.set_names(
4873 [None if self._re_levels.search(name) else name for name in df.index.names]
4874 )
4876 return df
4879def _reindex_axis(
4880 obj: DataFrame, axis: AxisInt, labels: Index, other=None
4881) -> DataFrame:
4882 ax = obj._get_axis(axis)
4883 labels = ensure_index(labels)
4885 # try not to reindex even if other is provided
4886 # if it equals our current index
4887 if other is not None:
4888 other = ensure_index(other)
4889 if (other is None or labels.equals(other)) and labels.equals(ax):
4890 return obj
4892 labels = ensure_index(labels.unique())
4893 if other is not None:
4894 labels = ensure_index(other.unique()).intersection(labels, sort=False)
4895 if not labels.equals(ax):
4896 slicer: list[slice | Index] = [slice(None, None)] * obj.ndim
4897 slicer[axis] = labels
4898 obj = obj.loc[tuple(slicer)]
4899 return obj
4902# tz to/from coercion
4905def _get_tz(tz: tzinfo) -> str | tzinfo:
4906 """for a tz-aware type, return an encoded zone"""
4907 zone = timezones.get_timezone(tz)
4908 return zone
4911@overload
4912def _set_tz(
4913 values: np.ndarray | Index, tz: str | tzinfo, coerce: bool = False
4914) -> DatetimeIndex:
4915 ...
4918@overload
4919def _set_tz(values: np.ndarray | Index, tz: None, coerce: bool = False) -> np.ndarray:
4920 ...
4923def _set_tz(
4924 values: np.ndarray | Index, tz: str | tzinfo | None, coerce: bool = False
4925) -> np.ndarray | DatetimeIndex:
4926 """
4927 coerce the values to a DatetimeIndex if tz is set
4928 preserve the input shape if possible
4930 Parameters
4931 ----------
4932 values : ndarray or Index
4933 tz : str or tzinfo
4934 coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray
4935 """
4936 if isinstance(values, DatetimeIndex):
4937 # If values is tzaware, the tz gets dropped in the values.ravel()
4938 # call below (which returns an ndarray). So we are only non-lossy
4939 # if `tz` matches `values.tz`.
4940 assert values.tz is None or values.tz == tz
4941 if values.tz is not None:
4942 return values
4944 if tz is not None:
4945 if isinstance(values, DatetimeIndex):
4946 name = values.name
4947 else:
4948 name = None
4949 values = values.ravel()
4951 tz = _ensure_decoded(tz)
4952 values = DatetimeIndex(values, name=name)
4953 values = values.tz_localize("UTC").tz_convert(tz)
4954 elif coerce:
4955 values = np.asarray(values, dtype="M8[ns]")
4957 # error: Incompatible return value type (got "Union[ndarray, Index]",
4958 # expected "Union[ndarray, DatetimeIndex]")
4959 return values # type: ignore[return-value]
4962def _convert_index(name: str, index: Index, encoding: str, errors: str) -> IndexCol:
4963 assert isinstance(name, str)
4965 index_name = index.name
4966 # error: Argument 1 to "_get_data_and_dtype_name" has incompatible type "Index";
4967 # expected "Union[ExtensionArray, ndarray]"
4968 converted, dtype_name = _get_data_and_dtype_name(index) # type: ignore[arg-type]
4969 kind = _dtype_to_kind(dtype_name)
4970 atom = DataIndexableCol._get_atom(converted)
4972 if (
4973 lib.is_np_dtype(index.dtype, "iu")
4974 or needs_i8_conversion(index.dtype)
4975 or is_bool_dtype(index.dtype)
4976 ):
4977 # Includes Index, RangeIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex,
4978 # in which case "kind" is "integer", "integer", "datetime64",
4979 # "timedelta64", and "integer", respectively.
4980 return IndexCol(
4981 name,
4982 values=converted,
4983 kind=kind,
4984 typ=atom,
4985 freq=getattr(index, "freq", None),
4986 tz=getattr(index, "tz", None),
4987 index_name=index_name,
4988 )
4990 if isinstance(index, MultiIndex):
4991 raise TypeError("MultiIndex not supported here!")
4993 inferred_type = lib.infer_dtype(index, skipna=False)
4994 # we won't get inferred_type of "datetime64" or "timedelta64" as these
4995 # would go through the DatetimeIndex/TimedeltaIndex paths above
4997 values = np.asarray(index)
4999 if inferred_type == "date":
5000 converted = np.asarray([v.toordinal() for v in values], dtype=np.int32)
5001 return IndexCol(
5002 name, converted, "date", _tables().Time32Col(), index_name=index_name
5003 )
5004 elif inferred_type == "string":
5005 converted = _convert_string_array(values, encoding, errors)
5006 itemsize = converted.dtype.itemsize
5007 return IndexCol(
5008 name,
5009 converted,
5010 "string",
5011 _tables().StringCol(itemsize),
5012 index_name=index_name,
5013 )
5015 elif inferred_type in ["integer", "floating"]:
5016 return IndexCol(
5017 name, values=converted, kind=kind, typ=atom, index_name=index_name
5018 )
5019 else:
5020 assert isinstance(converted, np.ndarray) and converted.dtype == object
5021 assert kind == "object", kind
5022 atom = _tables().ObjectAtom()
5023 return IndexCol(name, converted, kind, atom, index_name=index_name)
5026def _unconvert_index(data, kind: str, encoding: str, errors: str) -> np.ndarray | Index:
5027 index: Index | np.ndarray
5029 if kind.startswith("datetime64"):
5030 if kind == "datetime64":
5031 # created before we stored resolution information
5032 index = DatetimeIndex(data)
5033 else:
5034 index = DatetimeIndex(data.view(kind))
5035 elif kind == "timedelta64":
5036 index = TimedeltaIndex(data)
5037 elif kind == "date":
5038 try:
5039 index = np.asarray([date.fromordinal(v) for v in data], dtype=object)
5040 except ValueError:
5041 index = np.asarray([date.fromtimestamp(v) for v in data], dtype=object)
5042 elif kind in ("integer", "float", "bool"):
5043 index = np.asarray(data)
5044 elif kind in ("string"):
5045 index = _unconvert_string_array(
5046 data, nan_rep=None, encoding=encoding, errors=errors
5047 )
5048 elif kind == "object":
5049 index = np.asarray(data[0])
5050 else: # pragma: no cover
5051 raise ValueError(f"unrecognized index type {kind}")
5052 return index
5055def _maybe_convert_for_string_atom(
5056 name: str,
5057 bvalues: ArrayLike,
5058 existing_col,
5059 min_itemsize,
5060 nan_rep,
5061 encoding,
5062 errors,
5063 columns: list[str],
5064):
5065 if bvalues.dtype != object:
5066 return bvalues
5068 bvalues = cast(np.ndarray, bvalues)
5070 dtype_name = bvalues.dtype.name
5071 inferred_type = lib.infer_dtype(bvalues, skipna=False)
5073 if inferred_type == "date":
5074 raise TypeError("[date] is not implemented as a table column")
5075 if inferred_type == "datetime":
5076 # after GH#8260
5077 # this only would be hit for a multi-timezone dtype which is an error
5078 raise TypeError(
5079 "too many timezones in this block, create separate data columns"
5080 )
5082 if not (inferred_type == "string" or dtype_name == "object"):
5083 return bvalues
5085 mask = isna(bvalues)
5086 data = bvalues.copy()
5087 data[mask] = nan_rep
5089 # see if we have a valid string type
5090 inferred_type = lib.infer_dtype(data, skipna=False)
5091 if inferred_type != "string":
5092 # we cannot serialize this data, so report an exception on a column
5093 # by column basis
5095 # expected behaviour:
5096 # search block for a non-string object column by column
5097 for i in range(data.shape[0]):
5098 col = data[i]
5099 inferred_type = lib.infer_dtype(col, skipna=False)
5100 if inferred_type != "string":
5101 error_column_label = columns[i] if len(columns) > i else f"No.{i}"
5102 raise TypeError(
5103 f"Cannot serialize the column [{error_column_label}]\n"
5104 f"because its data contents are not [string] but "
5105 f"[{inferred_type}] object dtype"
5106 )
5108 # itemsize is the maximum length of a string (along any dimension)
5110 data_converted = _convert_string_array(data, encoding, errors).reshape(data.shape)
5111 itemsize = data_converted.itemsize
5113 # specified min_itemsize?
5114 if isinstance(min_itemsize, dict):
5115 min_itemsize = int(min_itemsize.get(name) or min_itemsize.get("values") or 0)
5116 itemsize = max(min_itemsize or 0, itemsize)
5118 # check for column in the values conflicts
5119 if existing_col is not None:
5120 eci = existing_col.validate_col(itemsize)
5121 if eci is not None and eci > itemsize:
5122 itemsize = eci
5124 data_converted = data_converted.astype(f"|S{itemsize}", copy=False)
5125 return data_converted
5128def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.ndarray:
5129 """
5130 Take a string-like that is object dtype and coerce to a fixed size string type.
5132 Parameters
5133 ----------
5134 data : np.ndarray[object]
5135 encoding : str
5136 errors : str
5137 Handler for encoding errors.
5139 Returns
5140 -------
5141 np.ndarray[fixed-length-string]
5142 """
5143 # encode if needed
5144 if len(data):
5145 data = (
5146 Series(data.ravel(), copy=False)
5147 .str.encode(encoding, errors)
5148 ._values.reshape(data.shape)
5149 )
5151 # create the sized dtype
5152 ensured = ensure_object(data.ravel())
5153 itemsize = max(1, libwriters.max_len_string_array(ensured))
5155 data = np.asarray(data, dtype=f"S{itemsize}")
5156 return data
5159def _unconvert_string_array(
5160 data: np.ndarray, nan_rep, encoding: str, errors: str
5161) -> np.ndarray:
5162 """
5163 Inverse of _convert_string_array.
5165 Parameters
5166 ----------
5167 data : np.ndarray[fixed-length-string]
5168 nan_rep : the storage repr of NaN
5169 encoding : str
5170 errors : str
5171 Handler for encoding errors.
5173 Returns
5174 -------
5175 np.ndarray[object]
5176 Decoded data.
5177 """
5178 shape = data.shape
5179 data = np.asarray(data.ravel(), dtype=object)
5181 if len(data):
5182 itemsize = libwriters.max_len_string_array(ensure_object(data))
5183 dtype = f"U{itemsize}"
5185 if isinstance(data[0], bytes):
5186 data = Series(data, copy=False).str.decode(encoding, errors=errors)._values
5187 else:
5188 data = data.astype(dtype, copy=False).astype(object, copy=False)
5190 if nan_rep is None:
5191 nan_rep = "nan"
5193 libwriters.string_array_replace_from_nan_rep(data, nan_rep)
5194 return data.reshape(shape)
5197def _maybe_convert(values: np.ndarray, val_kind: str, encoding: str, errors: str):
5198 assert isinstance(val_kind, str), type(val_kind)
5199 if _need_convert(val_kind):
5200 conv = _get_converter(val_kind, encoding, errors)
5201 values = conv(values)
5202 return values
5205def _get_converter(kind: str, encoding: str, errors: str):
5206 if kind == "datetime64":
5207 return lambda x: np.asarray(x, dtype="M8[ns]")
5208 elif "datetime64" in kind:
5209 return lambda x: np.asarray(x, dtype=kind)
5210 elif kind == "string":
5211 return lambda x: _unconvert_string_array(
5212 x, nan_rep=None, encoding=encoding, errors=errors
5213 )
5214 else: # pragma: no cover
5215 raise ValueError(f"invalid kind {kind}")
5218def _need_convert(kind: str) -> bool:
5219 if kind in ("datetime64", "string") or "datetime64" in kind:
5220 return True
5221 return False
5224def _maybe_adjust_name(name: str, version: Sequence[int]) -> str:
5225 """
5226 Prior to 0.10.1, we named values blocks like: values_block_0 an the
5227 name values_0, adjust the given name if necessary.
5229 Parameters
5230 ----------
5231 name : str
5232 version : Tuple[int, int, int]
5234 Returns
5235 -------
5236 str
5237 """
5238 if isinstance(version, str) or len(version) < 3:
5239 raise ValueError("Version is incorrect, expected sequence of 3 integers.")
5241 if version[0] == 0 and version[1] <= 10 and version[2] == 0:
5242 m = re.search(r"values_block_(\d+)", name)
5243 if m:
5244 grp = m.groups()[0]
5245 name = f"values_{grp}"
5246 return name
5249def _dtype_to_kind(dtype_str: str) -> str:
5250 """
5251 Find the "kind" string describing the given dtype name.
5252 """
5253 dtype_str = _ensure_decoded(dtype_str)
5255 if dtype_str.startswith(("string", "bytes")):
5256 kind = "string"
5257 elif dtype_str.startswith("float"):
5258 kind = "float"
5259 elif dtype_str.startswith("complex"):
5260 kind = "complex"
5261 elif dtype_str.startswith(("int", "uint")):
5262 kind = "integer"
5263 elif dtype_str.startswith("datetime64"):
5264 kind = dtype_str
5265 elif dtype_str.startswith("timedelta"):
5266 kind = "timedelta64"
5267 elif dtype_str.startswith("bool"):
5268 kind = "bool"
5269 elif dtype_str.startswith("category"):
5270 kind = "category"
5271 elif dtype_str.startswith("period"):
5272 # We store the `freq` attr so we can restore from integers
5273 kind = "integer"
5274 elif dtype_str == "object":
5275 kind = "object"
5276 else:
5277 raise ValueError(f"cannot interpret dtype of [{dtype_str}]")
5279 return kind
5282def _get_data_and_dtype_name(data: ArrayLike):
5283 """
5284 Convert the passed data into a storable form and a dtype string.
5285 """
5286 if isinstance(data, Categorical):
5287 data = data.codes
5289 if isinstance(data.dtype, DatetimeTZDtype):
5290 # For datetime64tz we need to drop the TZ in tests TODO: why?
5291 dtype_name = f"datetime64[{data.dtype.unit}]"
5292 else:
5293 dtype_name = data.dtype.name
5295 if data.dtype.kind in "mM":
5296 data = np.asarray(data.view("i8"))
5297 # TODO: we used to reshape for the dt64tz case, but no longer
5298 # doing that doesn't seem to break anything. why?
5300 elif isinstance(data, PeriodIndex):
5301 data = data.asi8
5303 data = np.asarray(data)
5304 return data, dtype_name
5307class Selection:
5308 """
5309 Carries out a selection operation on a tables.Table object.
5311 Parameters
5312 ----------
5313 table : a Table object
5314 where : list of Terms (or convertible to)
5315 start, stop: indices to start and/or stop selection
5317 """
5319 def __init__(
5320 self,
5321 table: Table,
5322 where=None,
5323 start: int | None = None,
5324 stop: int | None = None,
5325 ) -> None:
5326 self.table = table
5327 self.where = where
5328 self.start = start
5329 self.stop = stop
5330 self.condition = None
5331 self.filter = None
5332 self.terms = None
5333 self.coordinates = None
5335 if is_list_like(where):
5336 # see if we have a passed coordinate like
5337 with suppress(ValueError):
5338 inferred = lib.infer_dtype(where, skipna=False)
5339 if inferred in ("integer", "boolean"):
5340 where = np.asarray(where)
5341 if where.dtype == np.bool_:
5342 start, stop = self.start, self.stop
5343 if start is None:
5344 start = 0
5345 if stop is None:
5346 stop = self.table.nrows
5347 self.coordinates = np.arange(start, stop)[where]
5348 elif issubclass(where.dtype.type, np.integer):
5349 if (self.start is not None and (where < self.start).any()) or (
5350 self.stop is not None and (where >= self.stop).any()
5351 ):
5352 raise ValueError(
5353 "where must have index locations >= start and < stop"
5354 )
5355 self.coordinates = where
5357 if self.coordinates is None:
5358 self.terms = self.generate(where)
5360 # create the numexpr & the filter
5361 if self.terms is not None:
5362 self.condition, self.filter = self.terms.evaluate()
5364 def generate(self, where):
5365 """where can be a : dict,list,tuple,string"""
5366 if where is None:
5367 return None
5369 q = self.table.queryables()
5370 try:
5371 return PyTablesExpr(where, queryables=q, encoding=self.table.encoding)
5372 except NameError as err:
5373 # raise a nice message, suggesting that the user should use
5374 # data_columns
5375 qkeys = ",".join(q.keys())
5376 msg = dedent(
5377 f"""\
5378 The passed where expression: {where}
5379 contains an invalid variable reference
5380 all of the variable references must be a reference to
5381 an axis (e.g. 'index' or 'columns'), or a data_column
5382 The currently defined references are: {qkeys}
5383 """
5384 )
5385 raise ValueError(msg) from err
5387 def select(self):
5388 """
5389 generate the selection
5390 """
5391 if self.condition is not None:
5392 return self.table.table.read_where(
5393 self.condition.format(), start=self.start, stop=self.stop
5394 )
5395 elif self.coordinates is not None:
5396 return self.table.table.read_coordinates(self.coordinates)
5397 return self.table.table.read(start=self.start, stop=self.stop)
5399 def select_coords(self):
5400 """
5401 generate the selection
5402 """
5403 start, stop = self.start, self.stop
5404 nrows = self.table.nrows
5405 if start is None:
5406 start = 0
5407 elif start < 0:
5408 start += nrows
5409 if stop is None:
5410 stop = nrows
5411 elif stop < 0:
5412 stop += nrows
5414 if self.condition is not None:
5415 return self.table.table.get_where_list(
5416 self.condition.format(), start=start, stop=stop, sort=True
5417 )
5418 elif self.coordinates is not None:
5419 return self.coordinates
5421 return np.arange(start, stop)