1"""
2High level interface to PyTables for reading and writing pandas data structures
3to disk
4"""
5from __future__ import annotations
6
7from contextlib import suppress
8import copy
9from datetime import (
10 date,
11 tzinfo,
12)
13import itertools
14import os
15import re
16from textwrap import dedent
17from types import TracebackType
18from typing import (
19 TYPE_CHECKING,
20 Any,
21 Callable,
22 Final,
23 Hashable,
24 Iterator,
25 Literal,
26 Sequence,
27 cast,
28 overload,
29)
30import warnings
31
32import numpy as np
33
34from pandas._config import (
35 config,
36 get_option,
37)
38
39from pandas._libs import (
40 lib,
41 writers as libwriters,
42)
43from pandas._libs.tslibs import timezones
44from pandas._typing import (
45 AnyArrayLike,
46 ArrayLike,
47 AxisInt,
48 DtypeArg,
49 FilePath,
50 Shape,
51 npt,
52)
53from pandas.compat._optional import import_optional_dependency
54from pandas.compat.pickle_compat import patch_pickle
55from pandas.errors import (
56 AttributeConflictWarning,
57 ClosedFileError,
58 IncompatibilityWarning,
59 PerformanceWarning,
60 PossibleDataLossError,
61)
62from pandas.util._decorators import cache_readonly
63from pandas.util._exceptions import find_stack_level
64
65from pandas.core.dtypes.common import (
66 ensure_object,
67 is_bool_dtype,
68 is_categorical_dtype,
69 is_complex_dtype,
70 is_datetime64_dtype,
71 is_datetime64tz_dtype,
72 is_extension_array_dtype,
73 is_integer_dtype,
74 is_list_like,
75 is_object_dtype,
76 is_string_dtype,
77 is_timedelta64_dtype,
78 needs_i8_conversion,
79)
80from pandas.core.dtypes.missing import array_equivalent
81
82from pandas import (
83 DataFrame,
84 DatetimeIndex,
85 Index,
86 MultiIndex,
87 PeriodIndex,
88 RangeIndex,
89 Series,
90 TimedeltaIndex,
91 concat,
92 isna,
93)
94from pandas.core.arrays import (
95 Categorical,
96 DatetimeArray,
97 PeriodArray,
98)
99import pandas.core.common as com
100from pandas.core.computation.pytables import (
101 PyTablesExpr,
102 maybe_expression,
103)
104from pandas.core.construction import extract_array
105from pandas.core.indexes.api import ensure_index
106from pandas.core.internals import (
107 ArrayManager,
108 BlockManager,
109)
110
111from pandas.io.common import stringify_path
112from pandas.io.formats.printing import (
113 adjoin,
114 pprint_thing,
115)
116
117if TYPE_CHECKING:
118 from tables import (
119 Col,
120 File,
121 Node,
122 )
123
124 from pandas.core.internals import Block
125
126
127# versioning attribute
128_version = "0.15.2"
129
130# encoding
131_default_encoding = "UTF-8"
132
133
134def _ensure_decoded(s):
135 """if we have bytes, decode them to unicode"""
136 if isinstance(s, np.bytes_):
137 s = s.decode("UTF-8")
138 return s
139
140
141def _ensure_encoding(encoding: str | None) -> str:
142 # set the encoding if we need
143 if encoding is None:
144 encoding = _default_encoding
145
146 return encoding
147
148
149def _ensure_str(name):
150 """
    Ensure that an index / column name is a str (python 3); otherwise it
    may be np.string dtype. Non-string dtypes are passed through unchanged.
153
154 https://github.com/pandas-dev/pandas/issues/13492
155 """
156 if isinstance(name, str):
157 name = str(name)
158 return name
159
160
161Term = PyTablesExpr
162
163
164def _ensure_term(where, scope_level: int):
165 """
166 Ensure that the where is a Term or a list of Term.
167
    This makes sure that we capture the scope of variables that are
    passed; the terms are created here with a frame_level=2 (we are 2 levels down).
170 """
171 # only consider list/tuple here as an ndarray is automatically a coordinate
172 # list
173 level = scope_level + 1
174 if isinstance(where, (list, tuple)):
175 where = [
176 Term(term, scope_level=level + 1) if maybe_expression(term) else term
177 for term in where
178 if term is not None
179 ]
180 elif maybe_expression(where):
181 where = Term(where, scope_level=level)
182 return where if where is None or len(where) else None
183
184
185incompatibility_doc: Final = """
186where criteria is being ignored as this version [%s] is too old (or
187not-defined), read the file in and write it out to a new file to upgrade (with
the copy method)
189"""
190
191attribute_conflict_doc: Final = """
192the [%s] attribute of the existing index is [%s] which conflicts with the new
193[%s], resetting the attribute to None
194"""
195
196performance_doc: Final = """
197your performance may suffer as PyTables will pickle object types that it cannot
198map directly to c-types [inferred_type->%s,key->%s] [items->%s]
199"""
200
201# formats
202_FORMAT_MAP = {"f": "fixed", "fixed": "fixed", "t": "table", "table": "table"}
203
204# axes map
205_AXES_MAP = {DataFrame: [0]}
206
207# register our configuration options
208dropna_doc: Final = """
209: boolean
210 drop ALL nan rows when appending to a table
211"""
212format_doc: Final = """
213: format
    default format for writing; if None, then
    put will default to 'fixed' and append will default to 'table'
216"""
217
218with config.config_prefix("io.hdf"):
219 config.register_option("dropna_table", False, dropna_doc, validator=config.is_bool)
220 config.register_option(
221 "default_format",
222 None,
223 format_doc,
224 validator=config.is_one_of_factory(["fixed", "table", None]),
225 )
226
227# oh the troubles to reduce import time
228_table_mod = None
229_table_file_open_policy_is_strict = False
230
231
232def _tables():
233 global _table_mod
234 global _table_file_open_policy_is_strict
235 if _table_mod is None:
236 import tables
237
238 _table_mod = tables
239
240 # set the file open policy
241 # return the file open policy; this changes as of pytables 3.1
242 # depending on the HDF5 version
243 with suppress(AttributeError):
244 _table_file_open_policy_is_strict = (
245 tables.file._FILE_OPEN_POLICY == "strict"
246 )
247
248 return _table_mod
249
250
251# interface to/from ###
252
253
254def to_hdf(
255 path_or_buf: FilePath | HDFStore,
256 key: str,
257 value: DataFrame | Series,
258 mode: str = "a",
259 complevel: int | None = None,
260 complib: str | None = None,
261 append: bool = False,
262 format: str | None = None,
263 index: bool = True,
264 min_itemsize: int | dict[str, int] | None = None,
265 nan_rep=None,
266 dropna: bool | None = None,
267 data_columns: Literal[True] | list[str] | None = None,
268 errors: str = "strict",
269 encoding: str = "UTF-8",
270) -> None:
271 """store this object, close it if we opened it"""
272 if append:
273 f = lambda store: store.append(
274 key,
275 value,
276 format=format,
277 index=index,
278 min_itemsize=min_itemsize,
279 nan_rep=nan_rep,
280 dropna=dropna,
281 data_columns=data_columns,
282 errors=errors,
283 encoding=encoding,
284 )
285 else:
286 # NB: dropna is not passed to `put`
287 f = lambda store: store.put(
288 key,
289 value,
290 format=format,
291 index=index,
292 min_itemsize=min_itemsize,
293 nan_rep=nan_rep,
294 data_columns=data_columns,
295 errors=errors,
296 encoding=encoding,
297 dropna=dropna,
298 )
299
300 path_or_buf = stringify_path(path_or_buf)
301 if isinstance(path_or_buf, str):
302 with HDFStore(
303 path_or_buf, mode=mode, complevel=complevel, complib=complib
304 ) as store:
305 f(store)
306 else:
307 f(path_or_buf)
308
309
310def read_hdf(
311 path_or_buf: FilePath | HDFStore,
312 key=None,
313 mode: str = "r",
314 errors: str = "strict",
315 where: str | list | None = None,
316 start: int | None = None,
317 stop: int | None = None,
318 columns: list[str] | None = None,
319 iterator: bool = False,
320 chunksize: int | None = None,
321 **kwargs,
322):
323 """
324 Read from the store, close it if we opened it.
325
326 Retrieve pandas object stored in file, optionally based on where
327 criteria.
328
329 .. warning::
330
331 Pandas uses PyTables for reading and writing HDF5 files, which allows
332 serializing object-dtype data with pickle when using the "fixed" format.
333 Loading pickled data received from untrusted sources can be unsafe.
334
335 See: https://docs.python.org/3/library/pickle.html for more.
336
337 Parameters
338 ----------
339 path_or_buf : str, path object, pandas.HDFStore
        Any valid string path is acceptable. Only the local file system is
        supported; remote URLs and file-like objects are not.
342
343 If you want to pass in a path object, pandas accepts any
344 ``os.PathLike``.
345
346 Alternatively, pandas accepts an open :class:`pandas.HDFStore` object.
347
348 key : object, optional
349 The group identifier in the store. Can be omitted if the HDF file
350 contains a single pandas object.
351 mode : {'r', 'r+', 'a'}, default 'r'
352 Mode to use when opening the file. Ignored if path_or_buf is a
353 :class:`pandas.HDFStore`. Default is 'r'.
354 errors : str, default 'strict'
355 Specifies how encoding and decoding errors are to be handled.
356 See the errors argument for :func:`open` for a full list
357 of options.
358 where : list, optional
359 A list of Term (or convertible) objects.
360 start : int, optional
361 Row number to start selection.
362 stop : int, optional
363 Row number to stop selection.
364 columns : list, optional
365 A list of columns names to return.
366 iterator : bool, optional
367 Return an iterator object.
368 chunksize : int, optional
369 Number of rows to include in an iteration when using an iterator.
370 **kwargs
371 Additional keyword arguments passed to HDFStore.
372
373 Returns
374 -------
375 object
376 The selected object. Return type depends on the object stored.
377
378 See Also
379 --------
380 DataFrame.to_hdf : Write a HDF file from a DataFrame.
381 HDFStore : Low-level access to HDF files.
382
383 Examples
384 --------
385 >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z']) # doctest: +SKIP
386 >>> df.to_hdf('./store.h5', 'data') # doctest: +SKIP
387 >>> reread = pd.read_hdf('./store.h5') # doctest: +SKIP
388 """
389 if mode not in ["r", "r+", "a"]:
390 raise ValueError(
391 f"mode {mode} is not allowed while performing a read. "
392 f"Allowed modes are r, r+ and a."
393 )
394 # grab the scope
395 if where is not None:
396 where = _ensure_term(where, scope_level=1)
397
398 if isinstance(path_or_buf, HDFStore):
399 if not path_or_buf.is_open:
400 raise OSError("The HDFStore must be open for reading.")
401
402 store = path_or_buf
403 auto_close = False
404 else:
405 path_or_buf = stringify_path(path_or_buf)
406 if not isinstance(path_or_buf, str):
407 raise NotImplementedError(
408 "Support for generic buffers has not been implemented."
409 )
410 try:
411 exists = os.path.exists(path_or_buf)
412
413 # if filepath is too long
414 except (TypeError, ValueError):
415 exists = False
416
417 if not exists:
418 raise FileNotFoundError(f"File {path_or_buf} does not exist")
419
420 store = HDFStore(path_or_buf, mode=mode, errors=errors, **kwargs)
421 # can't auto open/close if we are using an iterator
422 # so delegate to the iterator
423 auto_close = True
424
425 try:
426 if key is None:
427 groups = store.groups()
428 if len(groups) == 0:
429 raise ValueError(
430 "Dataset(s) incompatible with Pandas data types, "
431 "not table, or no datasets found in HDF5 file."
432 )
433 candidate_only_group = groups[0]
434
435 # For the HDF file to have only one dataset, all other groups
436 # should then be metadata groups for that candidate group. (This
437 # assumes that the groups() method enumerates parent groups
438 # before their children.)
439 for group_to_check in groups[1:]:
440 if not _is_metadata_of(group_to_check, candidate_only_group):
441 raise ValueError(
442 "key must be provided when HDF5 "
443 "file contains multiple datasets."
444 )
445 key = candidate_only_group._v_pathname
446 return store.select(
447 key,
448 where=where,
449 start=start,
450 stop=stop,
451 columns=columns,
452 iterator=iterator,
453 chunksize=chunksize,
454 auto_close=auto_close,
455 )
456 except (ValueError, TypeError, KeyError):
457 if not isinstance(path_or_buf, HDFStore):
458 # if there is an error, close the store if we opened it.
459 with suppress(AttributeError):
460 store.close()
461
462 raise
463
464
465def _is_metadata_of(group: Node, parent_group: Node) -> bool:
466 """Check if a given group is a metadata group for a given parent_group."""
467 if group._v_depth <= parent_group._v_depth:
468 return False
469
470 current = group
471 while current._v_depth > 1:
472 parent = current._v_parent
473 if parent == parent_group and current._v_name == "meta":
474 return True
475 current = current._v_parent
476 return False
477
478
479class HDFStore:
480 """
481 Dict-like IO interface for storing pandas objects in PyTables.
482
483 Either Fixed or Table format.
484
485 .. warning::
486
487 Pandas uses PyTables for reading and writing HDF5 files, which allows
488 serializing object-dtype data with pickle when using the "fixed" format.
489 Loading pickled data received from untrusted sources can be unsafe.
490
491 See: https://docs.python.org/3/library/pickle.html for more.
492
493 Parameters
494 ----------
495 path : str
496 File path to HDF5 file.
497 mode : {'a', 'w', 'r', 'r+'}, default 'a'
498
499 ``'r'``
500 Read-only; no data can be modified.
501 ``'w'``
502 Write; a new file is created (an existing file with the same
503 name would be deleted).
504 ``'a'``
505 Append; an existing file is opened for reading and writing,
506 and if the file does not exist it is created.
507 ``'r+'``
508 It is similar to ``'a'``, but the file must already exist.
509 complevel : int, 0-9, default None
510 Specifies a compression level for data.
511 A value of 0 or None disables compression.
512 complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
513 Specifies the compression library to be used.
514 As of v0.20.2 these additional compressors for Blosc are supported
515 (default if no compressor specified: 'blosc:blosclz'):
516 {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
517 'blosc:zlib', 'blosc:zstd'}.
518 Specifying a compression library which is not available issues
519 a ValueError.
520 fletcher32 : bool, default False
521 If applying compression use the fletcher32 checksum.
522 **kwargs
523 These parameters will be passed to the PyTables open_file method.
524
525 Examples
526 --------
527 >>> bar = pd.DataFrame(np.random.randn(10, 4))
528 >>> store = pd.HDFStore('test.h5')
529 >>> store['foo'] = bar # write to HDF5
530 >>> bar = store['foo'] # retrieve
531 >>> store.close()
532
533 **Create or load HDF5 file in-memory**
534
535 When passing the `driver` option to the PyTables open_file method through
536 **kwargs, the HDF5 file is loaded or created in-memory and will only be
537 written when closed:
538
539 >>> bar = pd.DataFrame(np.random.randn(10, 4))
540 >>> store = pd.HDFStore('test.h5', driver='H5FD_CORE')
541 >>> store['foo'] = bar
542 >>> store.close() # only now, data is written to disk
543 """
544
545 _handle: File | None
546 _mode: str
547
548 def __init__(
549 self,
550 path,
551 mode: str = "a",
552 complevel: int | None = None,
553 complib=None,
554 fletcher32: bool = False,
555 **kwargs,
556 ) -> None:
557 if "format" in kwargs:
558 raise ValueError("format is not a defined argument for HDFStore")
559
560 tables = import_optional_dependency("tables")
561
562 if complib is not None and complib not in tables.filters.all_complibs:
563 raise ValueError(
564 f"complib only supports {tables.filters.all_complibs} compression."
565 )
566
567 if complib is None and complevel is not None:
568 complib = tables.filters.default_complib
569
570 self._path = stringify_path(path)
571 if mode is None:
572 mode = "a"
573 self._mode = mode
574 self._handle = None
575 self._complevel = complevel if complevel else 0
576 self._complib = complib
577 self._fletcher32 = fletcher32
578 self._filters = None
579 self.open(mode=mode, **kwargs)
580
581 def __fspath__(self) -> str:
582 return self._path
583
584 @property
585 def root(self):
586 """return the root node"""
587 self._check_if_open()
588 assert self._handle is not None # for mypy
589 return self._handle.root
590
591 @property
592 def filename(self) -> str:
593 return self._path
594
595 def __getitem__(self, key: str):
596 return self.get(key)
597
598 def __setitem__(self, key: str, value) -> None:
599 self.put(key, value)
600
601 def __delitem__(self, key: str) -> None:
602 return self.remove(key)
603
604 def __getattr__(self, name: str):
605 """allow attribute access to get stores"""
606 try:
607 return self.get(name)
608 except (KeyError, ClosedFileError):
609 pass
610 raise AttributeError(
611 f"'{type(self).__name__}' object has no attribute '{name}'"
612 )
613
614 def __contains__(self, key: str) -> bool:
615 """
616 check for existence of this key
        can match the exact pathname or the pathname w/o the leading '/'
618 """
619 node = self.get_node(key)
620 if node is not None:
621 name = node._v_pathname
622 if key in (name, name[1:]):
623 return True
624 return False
625
626 def __len__(self) -> int:
627 return len(self.groups())
628
629 def __repr__(self) -> str:
630 pstr = pprint_thing(self._path)
631 return f"{type(self)}\nFile path: {pstr}\n"
632
633 def __enter__(self) -> HDFStore:
634 return self
635
636 def __exit__(
637 self,
638 exc_type: type[BaseException] | None,
639 exc_value: BaseException | None,
640 traceback: TracebackType | None,
641 ) -> None:
642 self.close()
643
644 def keys(self, include: str = "pandas") -> list[str]:
645 """
646 Return a list of keys corresponding to objects stored in HDFStore.
647
648 Parameters
649 ----------
651 include : str, default 'pandas'
            When include equals 'pandas' return pandas objects.
            When include equals 'native' return native HDF5 Table objects.
654
655 .. versionadded:: 1.1.0
656
657 Returns
658 -------
659 list
            List of ABSOLUTE path-names (i.e. they have the leading '/').
661
662 Raises
663 ------
        raises ValueError if include has an illegal value
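
        Examples
        --------
        A minimal, hypothetical sketch (the file name and key below are
        placeholders):

        >>> store = pd.HDFStore("store.h5")  # doctest: +SKIP
        >>> store.put("data", pd.DataFrame([[1, 2]]))  # doctest: +SKIP
        >>> store.keys()  # doctest: +SKIP
        ['/data']
        >>> store.close()  # doctest: +SKIP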
665 """
666 if include == "pandas":
667 return [n._v_pathname for n in self.groups()]
668
669 elif include == "native":
670 assert self._handle is not None # mypy
671 return [
672 n._v_pathname for n in self._handle.walk_nodes("/", classname="Table")
673 ]
674 raise ValueError(
675 f"`include` should be either 'pandas' or 'native' but is '{include}'"
676 )
677
678 def __iter__(self) -> Iterator[str]:
679 return iter(self.keys())
680
681 def items(self) -> Iterator[tuple[str, list]]:
682 """
683 iterate on key->group
684 """
685 for g in self.groups():
686 yield g._v_pathname, g
687
688 def open(self, mode: str = "a", **kwargs) -> None:
689 """
690 Open the file in the specified mode
691
692 Parameters
693 ----------
694 mode : {'a', 'w', 'r', 'r+'}, default 'a'
695 See HDFStore docstring or tables.open_file for info about modes
696 **kwargs
697 These parameters will be passed to the PyTables open_file method.
698 """
699 tables = _tables()
700
701 if self._mode != mode:
702 # if we are changing a write mode to read, ok
703 if self._mode in ["a", "w"] and mode in ["r", "r+"]:
704 pass
705 elif mode in ["w"]:
706 # this would truncate, raise here
707 if self.is_open:
708 raise PossibleDataLossError(
709 f"Re-opening the file [{self._path}] with mode [{self._mode}] "
710 "will delete the current file!"
711 )
712
713 self._mode = mode
714
715 # close and reopen the handle
716 if self.is_open:
717 self.close()
718
719 if self._complevel and self._complevel > 0:
720 self._filters = _tables().Filters(
721 self._complevel, self._complib, fletcher32=self._fletcher32
722 )
723
724 if _table_file_open_policy_is_strict and self.is_open:
725 msg = (
726 "Cannot open HDF5 file, which is already opened, "
727 "even in read-only mode."
728 )
729 raise ValueError(msg)
730
731 self._handle = tables.open_file(self._path, self._mode, **kwargs)
732
733 def close(self) -> None:
734 """
735 Close the PyTables file handle
736 """
737 if self._handle is not None:
738 self._handle.close()
739 self._handle = None
740
741 @property
742 def is_open(self) -> bool:
743 """
744 return a boolean indicating whether the file is open
745 """
746 if self._handle is None:
747 return False
748 return bool(self._handle.isopen)
749
750 def flush(self, fsync: bool = False) -> None:
751 """
752 Force all buffered modifications to be written to disk.
753
754 Parameters
755 ----------
756 fsync : bool (default False)
757 call ``os.fsync()`` on the file handle to force writing to disk.
758
759 Notes
760 -----
761 Without ``fsync=True``, flushing may not guarantee that the OS writes
762 to disk. With fsync, the operation will block until the OS claims the
763 file has been written; however, other caching layers may still
764 interfere.
765 """
766 if self._handle is not None:
767 self._handle.flush()
768 if fsync:
769 with suppress(OSError):
770 os.fsync(self._handle.fileno())
771
772 def get(self, key: str):
773 """
774 Retrieve pandas object stored in file.
775
776 Parameters
777 ----------
778 key : str
779
780 Returns
781 -------
782 object
783 Same type as object stored in file.
784 """
785 with patch_pickle():
786 # GH#31167 Without this patch, pickle doesn't know how to unpickle
787 # old DateOffset objects now that they are cdef classes.
788 group = self.get_node(key)
789 if group is None:
790 raise KeyError(f"No object named {key} in the file")
791 return self._read_group(group)
792
793 def select(
794 self,
795 key: str,
796 where=None,
797 start=None,
798 stop=None,
799 columns=None,
800 iterator: bool = False,
801 chunksize=None,
802 auto_close: bool = False,
803 ):
804 """
805 Retrieve pandas object stored in file, optionally based on where criteria.
806
807 .. warning::
808
809 Pandas uses PyTables for reading and writing HDF5 files, which allows
810 serializing object-dtype data with pickle when using the "fixed" format.
811 Loading pickled data received from untrusted sources can be unsafe.
812
813 See: https://docs.python.org/3/library/pickle.html for more.
814
815 Parameters
816 ----------
817 key : str
818 Object being retrieved from file.
819 where : list or None
820 List of Term (or convertible) objects, optional.
821 start : int or None
822 Row number to start selection.
823 stop : int, default None
824 Row number to stop selection.
825 columns : list or None
826 A list of columns that if not None, will limit the return columns.
827 iterator : bool or False
828 Returns an iterator.
829 chunksize : int or None
            Number of rows to include in iteration, return an iterator.
831 auto_close : bool or False
832 Should automatically close the store when finished.
833
834 Returns
835 -------
836 object
837 Retrieved object from file.
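
        Examples
        --------
        A minimal, hypothetical sketch; ``where`` selection only works on
        objects written in ``'table'`` format:

        >>> df = pd.DataFrame({"A": [1, 2, 3]})  # doctest: +SKIP
        >>> store = pd.HDFStore("store.h5")  # doctest: +SKIP
        >>> store.put("df", df, format="table")  # doctest: +SKIP
        >>> store.select("df", where="index > 1")  # doctest: +SKIP
        >>> store.close()  # doctest: +SKIP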
838 """
839 group = self.get_node(key)
840 if group is None:
841 raise KeyError(f"No object named {key} in the file")
842
843 # create the storer and axes
844 where = _ensure_term(where, scope_level=1)
845 s = self._create_storer(group)
846 s.infer_axes()
847
848 # function to call on iteration
849 def func(_start, _stop, _where):
850 return s.read(start=_start, stop=_stop, where=_where, columns=columns)
851
852 # create the iterator
853 it = TableIterator(
854 self,
855 s,
856 func,
857 where=where,
858 nrows=s.nrows,
859 start=start,
860 stop=stop,
861 iterator=iterator,
862 chunksize=chunksize,
863 auto_close=auto_close,
864 )
865
866 return it.get_result()
867
868 def select_as_coordinates(
869 self,
870 key: str,
871 where=None,
872 start: int | None = None,
873 stop: int | None = None,
874 ):
875 """
876 return the selection as an Index
877
878 .. warning::
879
880 Pandas uses PyTables for reading and writing HDF5 files, which allows
881 serializing object-dtype data with pickle when using the "fixed" format.
882 Loading pickled data received from untrusted sources can be unsafe.
883
884 See: https://docs.python.org/3/library/pickle.html for more.
885
886
887 Parameters
888 ----------
889 key : str
890 where : list of Term (or convertible) objects, optional
891 start : integer (defaults to None), row number to start selection
892 stop : integer (defaults to None), row number to stop selection
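
        Examples
        --------
        A hypothetical sketch; the returned coordinates can be passed back to
        ``select`` as a ``where`` (``"df"`` is assumed to be a previously
        appended table with a data column ``A``):

        >>> coords = store.select_as_coordinates("df", where="A > 1")  # doctest: +SKIP
        >>> store.select("df", where=coords)  # doctest: +SKIP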
893 """
894 where = _ensure_term(where, scope_level=1)
895 tbl = self.get_storer(key)
896 if not isinstance(tbl, Table):
897 raise TypeError("can only read_coordinates with a table")
898 return tbl.read_coordinates(where=where, start=start, stop=stop)
899
900 def select_column(
901 self,
902 key: str,
903 column: str,
904 start: int | None = None,
905 stop: int | None = None,
906 ):
907 """
908 return a single column from the table. This is generally only useful to
909 select an indexable
910
911 .. warning::
912
913 Pandas uses PyTables for reading and writing HDF5 files, which allows
914 serializing object-dtype data with pickle when using the "fixed" format.
915 Loading pickled data received from untrusted sources can be unsafe.
916
917 See: https://docs.python.org/3/library/pickle.html for more.
918
919 Parameters
920 ----------
921 key : str
922 column : str
923 The column of interest.
924 start : int or None, default None
925 stop : int or None, default None
926
927 Raises
928 ------
929 raises KeyError if the column is not found (or key is not a valid
930 store)
931 raises ValueError if the column can not be extracted individually (it
932 is part of a data block)
933
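        Examples
        --------
        A hypothetical sketch; the column must be an indexable or a data
        column of a table written with ``append``:

        >>> store.append("df", df, data_columns=["A"])  # doctest: +SKIP
        >>> store.select_column("df", "A")  # doctest: +SKIP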
934 """
935 tbl = self.get_storer(key)
936 if not isinstance(tbl, Table):
937 raise TypeError("can only read_column with a table")
938 return tbl.read_column(column=column, start=start, stop=stop)
939
940 def select_as_multiple(
941 self,
942 keys,
943 where=None,
944 selector=None,
945 columns=None,
946 start=None,
947 stop=None,
948 iterator: bool = False,
949 chunksize=None,
950 auto_close: bool = False,
951 ):
952 """
953 Retrieve pandas objects from multiple tables.
954
955 .. warning::
956
957 Pandas uses PyTables for reading and writing HDF5 files, which allows
958 serializing object-dtype data with pickle when using the "fixed" format.
959 Loading pickled data received from untrusted sources can be unsafe.
960
961 See: https://docs.python.org/3/library/pickle.html for more.
962
963 Parameters
964 ----------
965 keys : a list of the tables
966 selector : the table to apply the where criteria (defaults to keys[0]
967 if not supplied)
968 columns : the columns I want back
969 start : integer (defaults to None), row number to start selection
970 stop : integer (defaults to None), row number to stop selection
971 iterator : bool, return an iterator, default False
972 chunksize : nrows to include in iteration, return an iterator
973 auto_close : bool, default False
974 Should automatically close the store when finished.
975
976 Raises
977 ------
978 raises KeyError if keys or selector is not found or keys is empty
979 raises TypeError if keys is not a list or tuple
980 raises ValueError if the tables are not ALL THE SAME DIMENSIONS
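
        Examples
        --------
        A hypothetical sketch with two previously written tables that share
        the same index:

        >>> store.select_as_multiple(  # doctest: +SKIP
        ...     ["df1", "df2"], where="index > 1", selector="df1"
        ... )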
981 """
982 # default to single select
983 where = _ensure_term(where, scope_level=1)
984 if isinstance(keys, (list, tuple)) and len(keys) == 1:
985 keys = keys[0]
986 if isinstance(keys, str):
987 return self.select(
988 key=keys,
989 where=where,
990 columns=columns,
991 start=start,
992 stop=stop,
993 iterator=iterator,
994 chunksize=chunksize,
995 auto_close=auto_close,
996 )
997
998 if not isinstance(keys, (list, tuple)):
999 raise TypeError("keys must be a list/tuple")
1000
1001 if not len(keys):
1002 raise ValueError("keys must have a non-zero length")
1003
1004 if selector is None:
1005 selector = keys[0]
1006
1007 # collect the tables
1008 tbls = [self.get_storer(k) for k in keys]
1009 s = self.get_storer(selector)
1010
1011 # validate rows
1012 nrows = None
1013 for t, k in itertools.chain([(s, selector)], zip(tbls, keys)):
1014 if t is None:
1015 raise KeyError(f"Invalid table [{k}]")
1016 if not t.is_table:
1017 raise TypeError(
1018 f"object [{t.pathname}] is not a table, and cannot be used in all "
1019 "select as multiple"
1020 )
1021
1022 if nrows is None:
1023 nrows = t.nrows
1024 elif t.nrows != nrows:
1025 raise ValueError("all tables must have exactly the same nrows!")
1026
1027 # The isinstance checks here are redundant with the check above,
1028 # but necessary for mypy; see GH#29757
1029 _tbls = [x for x in tbls if isinstance(x, Table)]
1030
        # axis is the concatenation axis
1032 axis = {t.non_index_axes[0][0] for t in _tbls}.pop()
1033
1034 def func(_start, _stop, _where):
1035 # retrieve the objs, _where is always passed as a set of
1036 # coordinates here
1037 objs = [
1038 t.read(where=_where, columns=columns, start=_start, stop=_stop)
1039 for t in tbls
1040 ]
1041
1042 # concat and return
1043 return concat(objs, axis=axis, verify_integrity=False)._consolidate()
1044
1045 # create the iterator
1046 it = TableIterator(
1047 self,
1048 s,
1049 func,
1050 where=where,
1051 nrows=nrows,
1052 start=start,
1053 stop=stop,
1054 iterator=iterator,
1055 chunksize=chunksize,
1056 auto_close=auto_close,
1057 )
1058
1059 return it.get_result(coordinates=True)
1060
1061 def put(
1062 self,
1063 key: str,
1064 value: DataFrame | Series,
1065 format=None,
1066 index: bool = True,
1067 append: bool = False,
1068 complib=None,
1069 complevel: int | None = None,
1070 min_itemsize: int | dict[str, int] | None = None,
1071 nan_rep=None,
1072 data_columns: Literal[True] | list[str] | None = None,
1073 encoding=None,
1074 errors: str = "strict",
1075 track_times: bool = True,
1076 dropna: bool = False,
1077 ) -> None:
1078 """
1079 Store object in HDFStore.
1080
1081 Parameters
1082 ----------
1083 key : str
1084 value : {Series, DataFrame}
1085 format : 'fixed(f)|table(t)', default is 'fixed'
1086 Format to use when storing object in HDFStore. Value can be one of:
1087
1088 ``'fixed'``
1089 Fixed format. Fast writing/reading. Not-appendable, nor searchable.
1090 ``'table'``
1091 Table format. Write as a PyTables Table structure which may perform
1092 worse but allow more flexible operations like searching / selecting
1093 subsets of the data.
1094 index : bool, default True
1095 Write DataFrame index as a column.
1096 append : bool, default False
1097 This will force Table format, append the input data to the existing.
1098 data_columns : list of columns or True, default None
1099 List of columns to create as data columns, or True to use all columns.
1100 See `here
1101 <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
1102 encoding : str, default None
1103 Provide an encoding for strings.
1104 track_times : bool, default True
1105 Parameter is propagated to 'create_table' method of 'PyTables'.
            If set to False it allows generating identical h5 files (same hashes)
            independent of creation time.
1108 dropna : bool, default False, optional
1109 Remove missing values.
1110
1111 .. versionadded:: 1.1.0
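
        Examples
        --------
        A minimal, hypothetical sketch; ``format="table"`` is needed if the
        object should later be queryable with ``where``:

        >>> store = pd.HDFStore("store.h5")  # doctest: +SKIP
        >>> store.put("df", pd.DataFrame({"A": [1, 2]}), format="table")  # doctest: +SKIP
        >>> store.close()  # doctest: +SKIP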
1112 """
1113 if format is None:
1114 format = get_option("io.hdf.default_format") or "fixed"
1115 format = self._validate_format(format)
1116 self._write_to_group(
1117 key,
1118 value,
1119 format=format,
1120 index=index,
1121 append=append,
1122 complib=complib,
1123 complevel=complevel,
1124 min_itemsize=min_itemsize,
1125 nan_rep=nan_rep,
1126 data_columns=data_columns,
1127 encoding=encoding,
1128 errors=errors,
1129 track_times=track_times,
1130 dropna=dropna,
1131 )
1132
1133 def remove(self, key: str, where=None, start=None, stop=None) -> None:
1134 """
1135 Remove pandas object partially by specifying the where condition
1136
1137 Parameters
1138 ----------
1139 key : str
1140 Node to remove or delete rows from
1141 where : list of Term (or convertible) objects, optional
1142 start : integer (defaults to None), row number to start selection
1143 stop : integer (defaults to None), row number to stop selection
1144
1145 Returns
1146 -------
1147 number of rows removed (or None if not a Table)
1148
1149 Raises
1150 ------
1151 raises KeyError if key is not a valid store
1152
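        Examples
        --------
        A hypothetical sketch; removing rows with ``where`` only works on
        table-format nodes, while omitting ``where`` removes the whole node:

        >>> store.remove("df", where="index > 2")  # doctest: +SKIP
        >>> store.remove("df")  # doctest: +SKIP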
1153 """
1154 where = _ensure_term(where, scope_level=1)
1155 try:
1156 s = self.get_storer(key)
1157 except KeyError:
1158 # the key is not a valid store, re-raising KeyError
1159 raise
1160 except AssertionError:
1161 # surface any assertion errors for e.g. debugging
1162 raise
1163 except Exception as err:
1164 # In tests we get here with ClosedFileError, TypeError, and
1165 # _table_mod.NoSuchNodeError. TODO: Catch only these?
1166
1167 if where is not None:
1168 raise ValueError(
1169 "trying to remove a node with a non-None where clause!"
1170 ) from err
1171
1172 # we are actually trying to remove a node (with children)
1173 node = self.get_node(key)
1174 if node is not None:
1175 node._f_remove(recursive=True)
1176 return None
1177
1178 # remove the node
1179 if com.all_none(where, start, stop):
1180 s.group._f_remove(recursive=True)
1181
1182 # delete from the table
1183 else:
1184 if not s.is_table:
1185 raise ValueError(
1186 "can only remove with where on objects written as tables"
1187 )
1188 return s.delete(where=where, start=start, stop=stop)
1189
1190 def append(
1191 self,
1192 key: str,
1193 value: DataFrame | Series,
1194 format=None,
1195 axes=None,
1196 index: bool | list[str] = True,
1197 append: bool = True,
1198 complib=None,
1199 complevel: int | None = None,
1200 columns=None,
1201 min_itemsize: int | dict[str, int] | None = None,
1202 nan_rep=None,
1203 chunksize=None,
1204 expectedrows=None,
1205 dropna: bool | None = None,
1206 data_columns: Literal[True] | list[str] | None = None,
1207 encoding=None,
1208 errors: str = "strict",
1209 ) -> None:
1210 """
1211 Append to Table in file.
1212
1213 Node must already exist and be Table format.
1214
1215 Parameters
1216 ----------
1217 key : str
1218 value : {Series, DataFrame}
1219 format : 'table' is the default
1220 Format to use when storing object in HDFStore. Value can be one of:
1221
1222 ``'table'``
1223 Table format. Write as a PyTables Table structure which may perform
1224 worse but allow more flexible operations like searching / selecting
1225 subsets of the data.
1226 index : bool, default True
1227 Write DataFrame index as a column.
1228 append : bool, default True
1229 Append the input data to the existing.
1230 data_columns : list of columns, or True, default None
1231 List of columns to create as indexed data columns for on-disk
1232 queries, or True to use all columns. By default only the axes
1233 of the object are indexed. See `here
1234 <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
1235 min_itemsize : dict of columns that specify minimum str sizes
1236 nan_rep : str to use as str nan representation
1237 chunksize : size to chunk the writing
1238 expectedrows : expected TOTAL row size of this table
1239 encoding : default None, provide an encoding for str
1240 dropna : bool, default False, optional
            Do not write an ALL nan row to the store, settable
            by the option 'io.hdf.dropna_table'.
1243
1244 Notes
1245 -----
1246 Does *not* check if data being appended overlaps with existing
1247 data in the table, so be careful
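
        Examples
        --------
        A minimal, hypothetical sketch; repeated calls with the same key grow
        the table:

        >>> store.append("df", pd.DataFrame({"A": [1, 2]}))  # doctest: +SKIP
        >>> store.append("df", pd.DataFrame({"A": [3, 4]}))  # doctest: +SKIP
        >>> store.select("df")  # doctest: +SKIP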
1248 """
1249 if columns is not None:
1250 raise TypeError(
1251 "columns is not a supported keyword in append, try data_columns"
1252 )
1253
1254 if dropna is None:
1255 dropna = get_option("io.hdf.dropna_table")
1256 if format is None:
1257 format = get_option("io.hdf.default_format") or "table"
1258 format = self._validate_format(format)
1259 self._write_to_group(
1260 key,
1261 value,
1262 format=format,
1263 axes=axes,
1264 index=index,
1265 append=append,
1266 complib=complib,
1267 complevel=complevel,
1268 min_itemsize=min_itemsize,
1269 nan_rep=nan_rep,
1270 chunksize=chunksize,
1271 expectedrows=expectedrows,
1272 dropna=dropna,
1273 data_columns=data_columns,
1274 encoding=encoding,
1275 errors=errors,
1276 )
1277
1278 def append_to_multiple(
1279 self,
1280 d: dict,
1281 value,
1282 selector,
1283 data_columns=None,
1284 axes=None,
1285 dropna: bool = False,
1286 **kwargs,
1287 ) -> None:
1288 """
1289 Append to multiple tables
1290
1291 Parameters
1292 ----------
1293 d : a dict of table_name to table_columns, None is acceptable as the
1294 values of one node (this will get all the remaining columns)
1295 value : a pandas object
1296 selector : a string that designates the indexable table; all of its
            columns will be designated as data_columns, unless data_columns is
1298 passed, in which case these are used
1299 data_columns : list of columns to create as data columns, or True to
1300 use all columns
        dropna : if True, drop rows from all tables if any single
1302 row in each table has all NaN. Default False.
1303
1304 Notes
1305 -----
1306 axes parameter is currently not accepted
1307
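        Examples
        --------
        A hypothetical sketch splitting a frame column-wise across two tables
        (``None`` collects the remaining columns):

        >>> store.append_to_multiple(  # doctest: +SKIP
        ...     {"df1": ["A", "B"], "df2": None}, df, selector="df1"
        ... )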
1308 """
1309 if axes is not None:
1310 raise TypeError(
1311 "axes is currently not accepted as a parameter to append_to_multiple; "
1312 "you can create the tables independently instead"
1313 )
1314
1315 if not isinstance(d, dict):
1316 raise ValueError(
1317 "append_to_multiple must have a dictionary specified as the "
1318 "way to split the value"
1319 )
1320
1321 if selector not in d:
1322 raise ValueError(
1323 "append_to_multiple requires a selector that is in passed dict"
1324 )
1325
1326 # figure out the splitting axis (the non_index_axis)
1327 axis = list(set(range(value.ndim)) - set(_AXES_MAP[type(value)]))[0]
1328
1329 # figure out how to split the value
1330 remain_key = None
1331 remain_values: list = []
1332 for k, v in d.items():
1333 if v is None:
1334 if remain_key is not None:
1335 raise ValueError(
1336 "append_to_multiple can only have one value in d that is None"
1337 )
1338 remain_key = k
1339 else:
1340 remain_values.extend(v)
1341 if remain_key is not None:
1342 ordered = value.axes[axis]
1343 ordd = ordered.difference(Index(remain_values))
1344 ordd = sorted(ordered.get_indexer(ordd))
1345 d[remain_key] = ordered.take(ordd)
1346
1347 # data_columns
1348 if data_columns is None:
1349 data_columns = d[selector]
1350
1351 # ensure rows are synchronized across the tables
1352 if dropna:
1353 idxs = (value[cols].dropna(how="all").index for cols in d.values())
1354 valid_index = next(idxs)
1355 for index in idxs:
1356 valid_index = valid_index.intersection(index)
1357 value = value.loc[valid_index]
1358
1359 min_itemsize = kwargs.pop("min_itemsize", None)
1360
1361 # append
1362 for k, v in d.items():
1363 dc = data_columns if k == selector else None
1364
1365 # compute the val
1366 val = value.reindex(v, axis=axis)
1367
1368 filtered = (
1369 {key: value for (key, value) in min_itemsize.items() if key in v}
1370 if min_itemsize is not None
1371 else None
1372 )
1373 self.append(k, val, data_columns=dc, min_itemsize=filtered, **kwargs)
1374
1375 def create_table_index(
1376 self,
1377 key: str,
1378 columns=None,
1379 optlevel: int | None = None,
1380 kind: str | None = None,
1381 ) -> None:
1382 """
1383 Create a pytables index on the table.
1384
1385 Parameters
1386 ----------
1387 key : str
1388 columns : None, bool, or listlike[str]
1389 Indicate which columns to create an index on.
1390
1391 * False : Do not create any indexes.
1392 * True : Create indexes on all columns.
1393 * None : Create indexes on all columns.
1394 * listlike : Create indexes on the given columns.
1395
1396 optlevel : int or None, default None
1397 Optimization level, if None, pytables defaults to 6.
1398 kind : str or None, default None
1399 Kind of index, if None, pytables defaults to "medium".
1400
1401 Raises
1402 ------
1403 TypeError: raises if the node is not a table
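
        Examples
        --------
        A hypothetical sketch; only indexables and data columns of a
        table-format node can be indexed:

        >>> store.append("df", df, data_columns=["A"], index=False)  # doctest: +SKIP
        >>> store.create_table_index("df", columns=["A"], kind="full")  # doctest: +SKIP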
1404 """
1405 # version requirements
1406 _tables()
1407 s = self.get_storer(key)
1408 if s is None:
1409 return
1410
1411 if not isinstance(s, Table):
1412 raise TypeError("cannot create table index on a Fixed format store")
1413 s.create_index(columns=columns, optlevel=optlevel, kind=kind)
1414
1415 def groups(self) -> list:
1416 """
1417 Return a list of all the top-level nodes.
1418
        Each node returned is a PyTables group node, not a pandas storage object.
1420
1421 Returns
1422 -------
1423 list
1424 List of objects.
1425 """
1426 _tables()
1427 self._check_if_open()
1428 assert self._handle is not None # for mypy
1429 assert _table_mod is not None # for mypy
1430 return [
1431 g
1432 for g in self._handle.walk_groups()
1433 if (
1434 not isinstance(g, _table_mod.link.Link)
1435 and (
1436 getattr(g._v_attrs, "pandas_type", None)
1437 or getattr(g, "table", None)
1438 or (isinstance(g, _table_mod.table.Table) and g._v_name != "table")
1439 )
1440 )
1441 ]
1442
1443 def walk(self, where: str = "/") -> Iterator[tuple[str, list[str], list[str]]]:
1444 """
1445 Walk the pytables group hierarchy for pandas objects.
1446
1447 This generator will yield the group path, subgroups and pandas object
1448 names for each group.
1449
1450 Any non-pandas PyTables objects that are not a group will be ignored.
1451
1452 The `where` group itself is listed first (preorder), then each of its
1453 child groups (following an alphanumerical order) is also traversed,
1454 following the same procedure.
1455
1456 Parameters
1457 ----------
1458 where : str, default "/"
1459 Group where to start walking.
1460
1461 Yields
1462 ------
1463 path : str
1464 Full path to a group (without trailing '/').
1465 groups : list
1466 Names (strings) of the groups contained in `path`.
1467 leaves : list
1468 Names (strings) of the pandas objects contained in `path`.
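
        Examples
        --------
        A hypothetical sketch; nested keys create intermediate groups that
        ``walk`` reports as subgroups:

        >>> store.put("foo/bar", pd.DataFrame({"A": [1]}))  # doctest: +SKIP
        >>> for path, subgroups, leaves in store.walk():  # doctest: +SKIP
        ...     print(path, subgroups, leaves)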
1469 """
1470 _tables()
1471 self._check_if_open()
1472 assert self._handle is not None # for mypy
1473 assert _table_mod is not None # for mypy
1474
1475 for g in self._handle.walk_groups(where):
1476 if getattr(g._v_attrs, "pandas_type", None) is not None:
1477 continue
1478
1479 groups = []
1480 leaves = []
1481 for child in g._v_children.values():
1482 pandas_type = getattr(child._v_attrs, "pandas_type", None)
1483 if pandas_type is None:
1484 if isinstance(child, _table_mod.group.Group):
1485 groups.append(child._v_name)
1486 else:
1487 leaves.append(child._v_name)
1488
1489 yield (g._v_pathname.rstrip("/"), groups, leaves)
1490
1491 def get_node(self, key: str) -> Node | None:
1492 """return the node with the key or None if it does not exist"""
1493 self._check_if_open()
1494 if not key.startswith("/"):
1495 key = "/" + key
1496
1497 assert self._handle is not None
1498 assert _table_mod is not None # for mypy
1499 try:
1500 node = self._handle.get_node(self.root, key)
1501 except _table_mod.exceptions.NoSuchNodeError:
1502 return None
1503
1504 assert isinstance(node, _table_mod.Node), type(node)
1505 return node
1506
1507 def get_storer(self, key: str) -> GenericFixed | Table:
1508 """return the storer object for a key, raise if not in the file"""
1509 group = self.get_node(key)
1510 if group is None:
1511 raise KeyError(f"No object named {key} in the file")
1512
1513 s = self._create_storer(group)
1514 s.infer_axes()
1515 return s
1516
1517 def copy(
1518 self,
1519 file,
1520 mode: str = "w",
1521 propindexes: bool = True,
1522 keys=None,
1523 complib=None,
1524 complevel: int | None = None,
1525 fletcher32: bool = False,
1526 overwrite: bool = True,
1527 ) -> HDFStore:
1528 """
1529 Copy the existing store to a new file, updating in place.
1530
1531 Parameters
1532 ----------
1533 propindexes : bool, default True
1534 Restore indexes in copied file.
1535 keys : list, optional
1536 List of keys to include in the copy (defaults to all).
1537 overwrite : bool, default True
1538 Whether to overwrite (remove and replace) existing nodes in the new store.
1539 mode, complib, complevel, fletcher32 same as in HDFStore.__init__
1540
1541 Returns
1542 -------
1543 open file handle of the new store
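
        Examples
        --------
        A hypothetical sketch; the returned store is open and should be
        closed by the caller:

        >>> new_store = store.copy("copy.h5")  # doctest: +SKIP
        >>> new_store.close()  # doctest: +SKIP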
1544 """
1545 new_store = HDFStore(
1546 file, mode=mode, complib=complib, complevel=complevel, fletcher32=fletcher32
1547 )
1548 if keys is None:
1549 keys = list(self.keys())
1550 if not isinstance(keys, (tuple, list)):
1551 keys = [keys]
1552 for k in keys:
1553 s = self.get_storer(k)
1554 if s is not None:
1555 if k in new_store:
1556 if overwrite:
1557 new_store.remove(k)
1558
1559 data = self.select(k)
1560 if isinstance(s, Table):
1561 index: bool | list[str] = False
1562 if propindexes:
1563 index = [a.name for a in s.axes if a.is_indexed]
1564 new_store.append(
1565 k,
1566 data,
1567 index=index,
1568 data_columns=getattr(s, "data_columns", None),
1569 encoding=s.encoding,
1570 )
1571 else:
1572 new_store.put(k, data, encoding=s.encoding)
1573
1574 return new_store
1575
1576 def info(self) -> str:
1577 """
        Return detailed information on the store.
1579
1580 Returns
1581 -------
1582 str
1583 """
1584 path = pprint_thing(self._path)
1585 output = f"{type(self)}\nFile path: {path}\n"
1586
1587 if self.is_open:
1588 lkeys = sorted(self.keys())
1589 if len(lkeys):
1590 keys = []
1591 values = []
1592
1593 for k in lkeys:
1594 try:
1595 s = self.get_storer(k)
1596 if s is not None:
1597 keys.append(pprint_thing(s.pathname or k))
1598 values.append(pprint_thing(s or "invalid_HDFStore node"))
1599 except AssertionError:
1600 # surface any assertion errors for e.g. debugging
1601 raise
1602 except Exception as detail:
1603 keys.append(k)
1604 dstr = pprint_thing(detail)
1605 values.append(f"[invalid_HDFStore node: {dstr}]")
1606
1607 output += adjoin(12, keys, values)
1608 else:
1609 output += "Empty"
1610 else:
1611 output += "File is CLOSED"
1612
1613 return output
1614
1615 # ------------------------------------------------------------------------
1616 # private methods
1617
1618 def _check_if_open(self):
1619 if not self.is_open:
1620 raise ClosedFileError(f"{self._path} file is not open!")
1621
1622 def _validate_format(self, format: str) -> str:
1623 """validate / deprecate formats"""
1624 # validate
1625 try:
1626 format = _FORMAT_MAP[format.lower()]
1627 except KeyError as err:
1628 raise TypeError(f"invalid HDFStore format specified [{format}]") from err
1629
1630 return format
1631
1632 def _create_storer(
1633 self,
1634 group,
1635 format=None,
1636 value: DataFrame | Series | None = None,
1637 encoding: str = "UTF-8",
1638 errors: str = "strict",
1639 ) -> GenericFixed | Table:
1640 """return a suitable class to operate"""
1641 cls: type[GenericFixed] | type[Table]
1642
1643 if value is not None and not isinstance(value, (Series, DataFrame)):
1644 raise TypeError("value must be None, Series, or DataFrame")
1645
1646 pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None))
1647 tt = _ensure_decoded(getattr(group._v_attrs, "table_type", None))
1648
1649 # infer the pt from the passed value
1650 if pt is None:
1651 if value is None:
1652 _tables()
1653 assert _table_mod is not None # for mypy
1654 if getattr(group, "table", None) or isinstance(
1655 group, _table_mod.table.Table
1656 ):
1657 pt = "frame_table"
1658 tt = "generic_table"
1659 else:
1660 raise TypeError(
1661 "cannot create a storer if the object is not existing "
1662 "nor a value are passed"
1663 )
1664 else:
1665 if isinstance(value, Series):
1666 pt = "series"
1667 else:
1668 pt = "frame"
1669
1670 # we are actually a table
1671 if format == "table":
1672 pt += "_table"
1673
1674 # a storer node
1675 if "table" not in pt:
1676 _STORER_MAP = {"series": SeriesFixed, "frame": FrameFixed}
1677 try:
1678 cls = _STORER_MAP[pt]
1679 except KeyError as err:
1680 raise TypeError(
1681 f"cannot properly create the storer for: [_STORER_MAP] [group->"
1682 f"{group},value->{type(value)},format->{format}"
1683 ) from err
1684 return cls(self, group, encoding=encoding, errors=errors)
1685
1686 # existing node (and must be a table)
1687 if tt is None:
1688 # if we are a writer, determine the tt
1689 if value is not None:
1690 if pt == "series_table":
1691 index = getattr(value, "index", None)
1692 if index is not None:
1693 if index.nlevels == 1:
1694 tt = "appendable_series"
1695 elif index.nlevels > 1:
1696 tt = "appendable_multiseries"
1697 elif pt == "frame_table":
1698 index = getattr(value, "index", None)
1699 if index is not None:
1700 if index.nlevels == 1:
1701 tt = "appendable_frame"
1702 elif index.nlevels > 1:
1703 tt = "appendable_multiframe"
1704
1705 _TABLE_MAP = {
1706 "generic_table": GenericTable,
1707 "appendable_series": AppendableSeriesTable,
1708 "appendable_multiseries": AppendableMultiSeriesTable,
1709 "appendable_frame": AppendableFrameTable,
1710 "appendable_multiframe": AppendableMultiFrameTable,
1711 "worm": WORMTable,
1712 }
1713 try:
1714 cls = _TABLE_MAP[tt]
1715 except KeyError as err:
1716 raise TypeError(
1717 f"cannot properly create the storer for: [_TABLE_MAP] [group->"
1718 f"{group},value->{type(value)},format->{format}"
1719 ) from err
1720
1721 return cls(self, group, encoding=encoding, errors=errors)
1722
1723 def _write_to_group(
1724 self,
1725 key: str,
1726 value: DataFrame | Series,
1727 format,
1728 axes=None,
1729 index: bool | list[str] = True,
1730 append: bool = False,
1731 complib=None,
1732 complevel: int | None = None,
1733 fletcher32=None,
1734 min_itemsize: int | dict[str, int] | None = None,
1735 chunksize=None,
1736 expectedrows=None,
1737 dropna: bool = False,
1738 nan_rep=None,
1739 data_columns=None,
1740 encoding=None,
1741 errors: str = "strict",
1742 track_times: bool = True,
1743 ) -> None:
1744 # we don't want to store a table node at all if our object is 0-len
        # as there are no dtypes
1746 if getattr(value, "empty", None) and (format == "table" or append):
1747 return
1748
1749 group = self._identify_group(key, append)
1750
1751 s = self._create_storer(group, format, value, encoding=encoding, errors=errors)
1752 if append:
1753 # raise if we are trying to append to a Fixed format,
1754 # or a table that exists (and we are putting)
1755 if not s.is_table or (s.is_table and format == "fixed" and s.is_exists):
1756 raise ValueError("Can only append to Tables")
1757 if not s.is_exists:
1758 s.set_object_info()
1759 else:
1760 s.set_object_info()
1761
1762 if not s.is_table and complib:
1763 raise ValueError("Compression not supported on Fixed format stores")
1764
1765 # write the object
1766 s.write(
1767 obj=value,
1768 axes=axes,
1769 append=append,
1770 complib=complib,
1771 complevel=complevel,
1772 fletcher32=fletcher32,
1773 min_itemsize=min_itemsize,
1774 chunksize=chunksize,
1775 expectedrows=expectedrows,
1776 dropna=dropna,
1777 nan_rep=nan_rep,
1778 data_columns=data_columns,
1779 track_times=track_times,
1780 )
1781
1782 if isinstance(s, Table) and index:
1783 s.create_index(columns=index)
1784
1785 def _read_group(self, group: Node):
1786 s = self._create_storer(group)
1787 s.infer_axes()
1788 return s.read()
1789
1790 def _identify_group(self, key: str, append: bool) -> Node:
1791 """Identify HDF5 group based on key, delete/create group if needed."""
1792 group = self.get_node(key)
1793
1794 # we make this assertion for mypy; the get_node call will already
1795 # have raised if this is incorrect
1796 assert self._handle is not None
1797
1798 # remove the node if we are not appending
1799 if group is not None and not append:
1800 self._handle.remove_node(group, recursive=True)
1801 group = None
1802
1803 if group is None:
1804 group = self._create_nodes_and_group(key)
1805
1806 return group
1807
1808 def _create_nodes_and_group(self, key: str) -> Node:
1809 """Create nodes from key and return group name."""
1810 # assertion for mypy
1811 assert self._handle is not None
1812
1813 paths = key.split("/")
1814 # recursively create the groups
1815 path = "/"
1816 for p in paths:
1817 if not len(p):
1818 continue
1819 new_path = path
1820 if not path.endswith("/"):
1821 new_path += "/"
1822 new_path += p
1823 group = self.get_node(new_path)
1824 if group is None:
1825 group = self._handle.create_group(path, p)
1826 path = new_path
1827 return group
1828
1829
1830class TableIterator:
1831 """
1832 Define the iteration interface on a table
1833
1834 Parameters
1835 ----------
1836 store : HDFStore
1837 s : the referred storer
1838 func : the function to execute the query
1839 where : the where of the query
1840 nrows : the rows to iterate on
1841 start : the passed start value (default is None)
1842 stop : the passed stop value (default is None)
1843 iterator : bool, default False
1844 Whether to use the default iterator.
1845 chunksize : the passed chunking value (default is 100000)
1846 auto_close : bool, default False
1847 Whether to automatically close the store at the end of iteration.
1848 """
1849
1850 chunksize: int | None
1851 store: HDFStore
1852 s: GenericFixed | Table
1853
1854 def __init__(
1855 self,
1856 store: HDFStore,
1857 s: GenericFixed | Table,
1858 func,
1859 where,
1860 nrows,
1861 start=None,
1862 stop=None,
1863 iterator: bool = False,
1864 chunksize: int | None = None,
1865 auto_close: bool = False,
1866 ) -> None:
1867 self.store = store
1868 self.s = s
1869 self.func = func
1870 self.where = where
1871
        # if we are a table, set start/stop if they are not already set
1873 if self.s.is_table:
1874 if nrows is None:
1875 nrows = 0
1876 if start is None:
1877 start = 0
1878 if stop is None:
1879 stop = nrows
1880 stop = min(nrows, stop)
1881
1882 self.nrows = nrows
1883 self.start = start
1884 self.stop = stop
1885
1886 self.coordinates = None
1887 if iterator or chunksize is not None:
1888 if chunksize is None:
1889 chunksize = 100000
1890 self.chunksize = int(chunksize)
1891 else:
1892 self.chunksize = None
1893
1894 self.auto_close = auto_close
1895
1896 def __iter__(self) -> Iterator:
1897 # iterate
1898 current = self.start
1899 if self.coordinates is None:
1900 raise ValueError("Cannot iterate until get_result is called.")
1901 while current < self.stop:
1902 stop = min(current + self.chunksize, self.stop)
1903 value = self.func(None, None, self.coordinates[current:stop])
1904 current = stop
1905 if value is None or not len(value):
1906 continue
1907
1908 yield value
1909
1910 self.close()
1911
1912 def close(self) -> None:
1913 if self.auto_close:
1914 self.store.close()
1915
1916 def get_result(self, coordinates: bool = False):
1917 # return the actual iterator
1918 if self.chunksize is not None:
1919 if not isinstance(self.s, Table):
1920 raise TypeError("can only use an iterator or chunksize on a table")
1921
1922 self.coordinates = self.s.read_coordinates(where=self.where)
1923
1924 return self
1925
1926 # if specified read via coordinates (necessary for multiple selections
1927 if coordinates:
1928 if not isinstance(self.s, Table):
1929 raise TypeError("can only read_coordinates on a table")
1930 where = self.s.read_coordinates(
1931 where=self.where, start=self.start, stop=self.stop
1932 )
1933 else:
1934 where = self.where
1935
1936 # directly return the result
1937 results = self.func(self.start, self.stop, where)
1938 self.close()
1939 return results
1940
1941
1942class IndexCol:
1943 """
1944 an index column description class
1945
1946 Parameters
1947 ----------
1948 axis : axis which I reference
1949 values : the ndarray like converted values
1950 kind : a string description of this type
1951 typ : the pytables type
1952 pos : the position in the pytables
1953
1954 """
1955
1956 is_an_indexable: bool = True
1957 is_data_indexable: bool = True
1958 _info_fields = ["freq", "tz", "index_name"]
1959
1960 def __init__(
1961 self,
1962 name: str,
1963 values=None,
1964 kind=None,
1965 typ=None,
1966 cname: str | None = None,
1967 axis=None,
1968 pos=None,
1969 freq=None,
1970 tz=None,
1971 index_name=None,
1972 ordered=None,
1973 table=None,
1974 meta=None,
1975 metadata=None,
1976 ) -> None:
1977 if not isinstance(name, str):
1978 raise ValueError("`name` must be a str.")
1979
1980 self.values = values
1981 self.kind = kind
1982 self.typ = typ
1983 self.name = name
1984 self.cname = cname or name
1985 self.axis = axis
1986 self.pos = pos
1987 self.freq = freq
1988 self.tz = tz
1989 self.index_name = index_name
1990 self.ordered = ordered
1991 self.table = table
1992 self.meta = meta
1993 self.metadata = metadata
1994
1995 if pos is not None:
1996 self.set_pos(pos)
1997
1998 # These are ensured as long as the passed arguments match the
1999 # constructor annotations.
2000 assert isinstance(self.name, str)
2001 assert isinstance(self.cname, str)
2002
2003 @property
2004 def itemsize(self) -> int:
2005 # Assumes self.typ has already been initialized
2006 return self.typ.itemsize
2007
2008 @property
2009 def kind_attr(self) -> str:
2010 return f"{self.name}_kind"
2011
2012 def set_pos(self, pos: int) -> None:
2013 """set the position of this column in the Table"""
2014 self.pos = pos
2015 if pos is not None and self.typ is not None:
2016 self.typ._v_pos = pos
2017
2018 def __repr__(self) -> str:
2019 temp = tuple(
2020 map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind))
2021 )
2022 return ",".join(
2023 [
2024 f"{key}->{value}"
2025 for key, value in zip(["name", "cname", "axis", "pos", "kind"], temp)
2026 ]
2027 )
2028
2029 def __eq__(self, other: Any) -> bool:
2030 """compare 2 col items"""
2031 return all(
2032 getattr(self, a, None) == getattr(other, a, None)
2033 for a in ["name", "cname", "axis", "pos"]
2034 )
2035
2036 def __ne__(self, other) -> bool:
2037 return not self.__eq__(other)
2038
2039 @property
2040 def is_indexed(self) -> bool:
2041 """return whether I am an indexed column"""
2042 if not hasattr(self.table, "cols"):
2043 # e.g. if infer hasn't been called yet, self.table will be None.
2044 return False
2045 return getattr(self.table.cols, self.cname).is_indexed
2046
2047 def convert(
2048 self, values: np.ndarray, nan_rep, encoding: str, errors: str
2049 ) -> tuple[np.ndarray, np.ndarray] | tuple[Index, Index]:
2050 """
2051 Convert the data from this selection to the appropriate pandas type.
2052 """
2053 assert isinstance(values, np.ndarray), type(values)
2054
2055 # values is a recarray
2056 if values.dtype.fields is not None:
2057 # Copy, otherwise values will be a view
2058            # preventing the original recarray from being freed
2059 values = values[self.cname].copy()
2060
2061 val_kind = _ensure_decoded(self.kind)
2062 values = _maybe_convert(values, val_kind, encoding, errors)
2063
2064 kwargs = {}
2065 kwargs["name"] = _ensure_decoded(self.index_name)
2066
2067 if self.freq is not None:
2068 kwargs["freq"] = _ensure_decoded(self.freq)
2069
2070 factory: type[Index] | type[DatetimeIndex] = Index
2071 if is_datetime64_dtype(values.dtype) or is_datetime64tz_dtype(values.dtype):
2072 factory = DatetimeIndex
2073 elif values.dtype == "i8" and "freq" in kwargs:
2074 # PeriodIndex data is stored as i8
2075 # error: Incompatible types in assignment (expression has type
2076 # "Callable[[Any, KwArg(Any)], PeriodIndex]", variable has type
2077 # "Union[Type[Index], Type[DatetimeIndex]]")
2078 factory = lambda x, **kwds: PeriodIndex( # type: ignore[assignment]
2079 ordinal=x, **kwds
2080 )
2081
2082 # making an Index instance could throw a number of different errors
2083 try:
2084 new_pd_index = factory(values, **kwargs)
2085 except ValueError:
2086            # if the output freq is different from what we recorded,
2087 # it should be None (see also 'doc example part 2')
2088 if "freq" in kwargs:
2089 kwargs["freq"] = None
2090 new_pd_index = factory(values, **kwargs)
2091 final_pd_index = _set_tz(new_pd_index, self.tz)
2092 return final_pd_index, final_pd_index
2093
2094 def take_data(self):
2095 """return the values"""
2096 return self.values
2097
2098 @property
2099 def attrs(self):
2100 return self.table._v_attrs
2101
2102 @property
2103 def description(self):
2104 return self.table.description
2105
2106 @property
2107 def col(self):
2108 """return my current col description"""
2109 return getattr(self.description, self.cname, None)
2110
2111 @property
2112 def cvalues(self):
2113 """return my cython values"""
2114 return self.values
2115
2116 def __iter__(self) -> Iterator:
2117 return iter(self.values)
2118
2119 def maybe_set_size(self, min_itemsize=None) -> None:
2120 """
2121 maybe set a string col itemsize:
2122            min_itemsize can be an integer or a dict with this column's name
2123 with an integer size
2124 """
2125 if _ensure_decoded(self.kind) == "string":
2126 if isinstance(min_itemsize, dict):
2127 min_itemsize = min_itemsize.get(self.name)
2128
2129 if min_itemsize is not None and self.typ.itemsize < min_itemsize:
2130 self.typ = _tables().StringCol(itemsize=min_itemsize, pos=self.pos)
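
    # Editor's note: an illustrative sketch (assumed public-API usage, hypothetical
    # file/column names) of how ``min_itemsize`` pre-sizes string columns so that
    # longer strings can still be appended later.
    #
    #     import pandas as pd
    #
    #     with pd.HDFStore("example.h5") as store:
    #         store.append("df", pd.DataFrame({"A": ["ab"]}), min_itemsize={"A": 50})
    #         # without min_itemsize the column width would be 2 and this would raise
    #         store.append("df", pd.DataFrame({"A": ["a considerably longer string"]}))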
2131
2132 def validate_names(self) -> None:
2133 pass
2134
2135 def validate_and_set(self, handler: AppendableTable, append: bool) -> None:
2136 self.table = handler.table
2137 self.validate_col()
2138 self.validate_attr(append)
2139 self.validate_metadata(handler)
2140 self.write_metadata(handler)
2141 self.set_attr()
2142
2143 def validate_col(self, itemsize=None):
2144        """validate this column: return the compared-against itemsize"""
2145 # validate this column for string truncation (or reset to the max size)
2146 if _ensure_decoded(self.kind) == "string":
2147 c = self.col
2148 if c is not None:
2149 if itemsize is None:
2150 itemsize = self.itemsize
2151 if c.itemsize < itemsize:
2152 raise ValueError(
2153 f"Trying to store a string with len [{itemsize}] in "
2154 f"[{self.cname}] column but\nthis column has a limit of "
2155 f"[{c.itemsize}]!\nConsider using min_itemsize to "
2156 "preset the sizes on these columns"
2157 )
2158 return c.itemsize
2159
2160 return None
2161
2162 def validate_attr(self, append: bool) -> None:
2163 # check for backwards incompatibility
2164 if append:
2165 existing_kind = getattr(self.attrs, self.kind_attr, None)
2166 if existing_kind is not None and existing_kind != self.kind:
2167 raise TypeError(
2168 f"incompatible kind in col [{existing_kind} - {self.kind}]"
2169 )
2170
2171 def update_info(self, info) -> None:
2172 """
2173 set/update the info for this indexable with the key/value
2174 if there is a conflict raise/warn as needed
2175 """
2176 for key in self._info_fields:
2177 value = getattr(self, key, None)
2178 idx = info.setdefault(self.name, {})
2179
2180 existing_value = idx.get(key)
2181 if key in idx and value is not None and existing_value != value:
2182 # frequency/name just warn
2183 if key in ["freq", "index_name"]:
2184 ws = attribute_conflict_doc % (key, existing_value, value)
2185 warnings.warn(
2186 ws, AttributeConflictWarning, stacklevel=find_stack_level()
2187 )
2188
2189 # reset
2190 idx[key] = None
2191 setattr(self, key, None)
2192
2193 else:
2194 raise ValueError(
2195 f"invalid info for [{self.name}] for [{key}], "
2196 f"existing_value [{existing_value}] conflicts with "
2197 f"new value [{value}]"
2198 )
2199 else:
2200 if value is not None or existing_value is not None:
2201 idx[key] = value
2202
2203 def set_info(self, info) -> None:
2204 """set my state from the passed info"""
2205 idx = info.get(self.name)
2206 if idx is not None:
2207 self.__dict__.update(idx)
2208
2209 def set_attr(self) -> None:
2210 """set the kind for this column"""
2211 setattr(self.attrs, self.kind_attr, self.kind)
2212
2213 def validate_metadata(self, handler: AppendableTable) -> None:
2214 """validate that kind=category does not change the categories"""
2215 if self.meta == "category":
2216 new_metadata = self.metadata
2217 cur_metadata = handler.read_metadata(self.cname)
2218 if (
2219 new_metadata is not None
2220 and cur_metadata is not None
2221 and not array_equivalent(new_metadata, cur_metadata)
2222 ):
2223 raise ValueError(
2224 "cannot append a categorical with "
2225 "different categories to the existing"
2226 )
2227
2228 def write_metadata(self, handler: AppendableTable) -> None:
2229        """write the metadata for this column"""
2230 if self.metadata is not None:
2231 handler.write_metadata(self.cname, self.metadata)
2232
2233
2234class GenericIndexCol(IndexCol):
2235 """an index which is not represented in the data of the table"""
2236
2237 @property
2238 def is_indexed(self) -> bool:
2239 return False
2240
2241 def convert(
2242 self, values: np.ndarray, nan_rep, encoding: str, errors: str
2243 ) -> tuple[Index, Index]:
2244 """
2245 Convert the data from this selection to the appropriate pandas type.
2246
2247 Parameters
2248 ----------
2249 values : np.ndarray
2250 nan_rep : str
2251 encoding : str
2252 errors : str
2253 """
2254 assert isinstance(values, np.ndarray), type(values)
2255
2256 index = RangeIndex(len(values))
2257 return index, index
2258
2259 def set_attr(self) -> None:
2260 pass
2261
2262
2263class DataCol(IndexCol):
2264 """
2265    a data-holding column; by definition this is not indexable
2266
2267 Parameters
2268 ----------
2269 data : the actual data
2270 cname : the column name in the table to hold the data (typically
2271 values)
2272 meta : a string description of the metadata
2273 metadata : the actual metadata
2274 """
2275
2276 is_an_indexable = False
2277 is_data_indexable = False
2278 _info_fields = ["tz", "ordered"]
2279
2280 def __init__(
2281 self,
2282 name: str,
2283 values=None,
2284 kind=None,
2285 typ=None,
2286 cname: str | None = None,
2287 pos=None,
2288 tz=None,
2289 ordered=None,
2290 table=None,
2291 meta=None,
2292 metadata=None,
2293 dtype: DtypeArg | None = None,
2294 data=None,
2295 ) -> None:
2296 super().__init__(
2297 name=name,
2298 values=values,
2299 kind=kind,
2300 typ=typ,
2301 pos=pos,
2302 cname=cname,
2303 tz=tz,
2304 ordered=ordered,
2305 table=table,
2306 meta=meta,
2307 metadata=metadata,
2308 )
2309 self.dtype = dtype
2310 self.data = data
2311
2312 @property
2313 def dtype_attr(self) -> str:
2314 return f"{self.name}_dtype"
2315
2316 @property
2317 def meta_attr(self) -> str:
2318 return f"{self.name}_meta"
2319
2320 def __repr__(self) -> str:
2321 temp = tuple(
2322 map(
2323 pprint_thing, (self.name, self.cname, self.dtype, self.kind, self.shape)
2324 )
2325 )
2326 return ",".join(
2327 [
2328 f"{key}->{value}"
2329 for key, value in zip(["name", "cname", "dtype", "kind", "shape"], temp)
2330 ]
2331 )
2332
2333 def __eq__(self, other: Any) -> bool:
2334 """compare 2 col items"""
2335 return all(
2336 getattr(self, a, None) == getattr(other, a, None)
2337 for a in ["name", "cname", "dtype", "pos"]
2338 )
2339
2340 def set_data(self, data: ArrayLike) -> None:
2341 assert data is not None
2342 assert self.dtype is None
2343
2344 data, dtype_name = _get_data_and_dtype_name(data)
2345
2346 self.data = data
2347 self.dtype = dtype_name
2348 self.kind = _dtype_to_kind(dtype_name)
2349
2350 def take_data(self):
2351 """return the data"""
2352 return self.data
2353
2354 @classmethod
2355 def _get_atom(cls, values: ArrayLike) -> Col:
2356 """
2357 Get an appropriately typed and shaped pytables.Col object for values.
2358 """
2359 dtype = values.dtype
2360 # error: Item "ExtensionDtype" of "Union[ExtensionDtype, dtype[Any]]" has no
2361 # attribute "itemsize"
2362 itemsize = dtype.itemsize # type: ignore[union-attr]
2363
2364 shape = values.shape
2365 if values.ndim == 1:
2366 # EA, use block shape pretending it is 2D
2367 # TODO(EA2D): not necessary with 2D EAs
2368 shape = (1, values.size)
2369
2370 if isinstance(values, Categorical):
2371 codes = values.codes
2372 atom = cls.get_atom_data(shape, kind=codes.dtype.name)
2373 elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
2374 atom = cls.get_atom_datetime64(shape)
2375 elif is_timedelta64_dtype(dtype):
2376 atom = cls.get_atom_timedelta64(shape)
2377 elif is_complex_dtype(dtype):
2378 atom = _tables().ComplexCol(itemsize=itemsize, shape=shape[0])
2379 elif is_string_dtype(dtype):
2380 atom = cls.get_atom_string(shape, itemsize)
2381 else:
2382 atom = cls.get_atom_data(shape, kind=dtype.name)
2383
2384 return atom
2385
2386 @classmethod
2387 def get_atom_string(cls, shape, itemsize):
2388 return _tables().StringCol(itemsize=itemsize, shape=shape[0])
2389
2390 @classmethod
2391 def get_atom_coltype(cls, kind: str) -> type[Col]:
2392 """return the PyTables column class for this column"""
2393 if kind.startswith("uint"):
2394 k4 = kind[4:]
2395 col_name = f"UInt{k4}Col"
2396 elif kind.startswith("period"):
2397 # we store as integer
2398 col_name = "Int64Col"
2399 else:
2400 kcap = kind.capitalize()
2401 col_name = f"{kcap}Col"
2402
2403 return getattr(_tables(), col_name)
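
    # Editor's note: a small illustrative trace (derived from the method above, not an
    # official mapping) of the kind -> PyTables column class naming scheme, assuming
    # ``tables`` is importable.
    #
    #     import tables
    #
    #     tables.UInt32Col    # what kind="uint32" resolves to
    #     tables.Float64Col   # what kind="float64" resolves to
    #     tables.Int64Col     # what kind="period[...]" resolves to (stored as i8)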
2404
2405 @classmethod
2406 def get_atom_data(cls, shape, kind: str) -> Col:
2407 return cls.get_atom_coltype(kind=kind)(shape=shape[0])
2408
2409 @classmethod
2410 def get_atom_datetime64(cls, shape):
2411 return _tables().Int64Col(shape=shape[0])
2412
2413 @classmethod
2414 def get_atom_timedelta64(cls, shape):
2415 return _tables().Int64Col(shape=shape[0])
2416
2417 @property
2418 def shape(self):
2419 return getattr(self.data, "shape", None)
2420
2421 @property
2422 def cvalues(self):
2423 """return my cython values"""
2424 return self.data
2425
2426 def validate_attr(self, append) -> None:
2427 """validate that we have the same order as the existing & same dtype"""
2428 if append:
2429 existing_fields = getattr(self.attrs, self.kind_attr, None)
2430 if existing_fields is not None and existing_fields != list(self.values):
2431 raise ValueError("appended items do not match existing items in table!")
2432
2433 existing_dtype = getattr(self.attrs, self.dtype_attr, None)
2434 if existing_dtype is not None and existing_dtype != self.dtype:
2435 raise ValueError(
2436                    "appended items dtype does not match existing items dtype in table!"
2437 )
2438
2439 def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
2440 """
2441 Convert the data from this selection to the appropriate pandas type.
2442
2443 Parameters
2444 ----------
2445 values : np.ndarray
2446        nan_rep : the string used to represent NaN in string columns
2447 encoding : str
2448 errors : str
2449
2450 Returns
2451 -------
2452 index : listlike to become an Index
2453 data : ndarraylike to become a column
2454 """
2455 assert isinstance(values, np.ndarray), type(values)
2456
2457 # values is a recarray
2458 if values.dtype.fields is not None:
2459 values = values[self.cname]
2460
2461 assert self.typ is not None
2462 if self.dtype is None:
2463 # Note: in tests we never have timedelta64 or datetime64,
2464 # so the _get_data_and_dtype_name may be unnecessary
2465 converted, dtype_name = _get_data_and_dtype_name(values)
2466 kind = _dtype_to_kind(dtype_name)
2467 else:
2468 converted = values
2469 dtype_name = self.dtype
2470 kind = self.kind
2471
2472 assert isinstance(converted, np.ndarray) # for mypy
2473
2474 # use the meta if needed
2475 meta = _ensure_decoded(self.meta)
2476 metadata = self.metadata
2477 ordered = self.ordered
2478 tz = self.tz
2479
2480 assert dtype_name is not None
2481 # convert to the correct dtype
2482 dtype = _ensure_decoded(dtype_name)
2483
2484 # reverse converts
2485 if dtype == "datetime64":
2486 # recreate with tz if indicated
2487 converted = _set_tz(converted, tz, coerce=True)
2488
2489 elif dtype == "timedelta64":
2490 converted = np.asarray(converted, dtype="m8[ns]")
2491 elif dtype == "date":
2492 try:
2493 converted = np.asarray(
2494 [date.fromordinal(v) for v in converted], dtype=object
2495 )
2496 except ValueError:
2497 converted = np.asarray(
2498 [date.fromtimestamp(v) for v in converted], dtype=object
2499 )
2500
2501 elif meta == "category":
2502 # we have a categorical
2503 categories = metadata
2504 codes = converted.ravel()
2505
2506 # if we have stored a NaN in the categories
2507 # then strip it; in theory we could have BOTH
2508 # -1s in the codes and nulls :<
2509 if categories is None:
2510 # Handle case of NaN-only categorical columns in which case
2511 # the categories are an empty array; when this is stored,
2512 # pytables cannot write a zero-len array, so on readback
2513 # the categories would be None and `read_hdf()` would fail.
2514 categories = Index([], dtype=np.float64)
2515 else:
2516 mask = isna(categories)
2517 if mask.any():
2518 categories = categories[~mask]
2519 codes[codes != -1] -= mask.astype(int).cumsum()._values
2520
2521 converted = Categorical.from_codes(
2522 codes, categories=categories, ordered=ordered
2523 )
2524
2525 else:
2526 try:
2527 converted = converted.astype(dtype, copy=False)
2528 except TypeError:
2529 converted = converted.astype("O", copy=False)
2530
2531 # convert nans / decode
2532 if _ensure_decoded(kind) == "string":
2533 converted = _unconvert_string_array(
2534 converted, nan_rep=nan_rep, encoding=encoding, errors=errors
2535 )
2536
2537 return self.values, converted
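
    # Editor's note: the NaN-only-categories branch above can be exercised from the
    # public API; a sketch under assumed, hypothetical names is:
    #
    #     import numpy as np
    #     import pandas as pd
    #
    #     df = pd.DataFrame({"A": pd.Categorical([np.nan, np.nan])})
    #     df.to_hdf("example.h5", key="df", format="table")
    #     pd.read_hdf("example.h5", "df")   # round-trips as an all-NaN categorical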
2538
2539 def set_attr(self) -> None:
2540        """set the values, meta, and dtype attributes for this column"""
2541 setattr(self.attrs, self.kind_attr, self.values)
2542 setattr(self.attrs, self.meta_attr, self.meta)
2543 assert self.dtype is not None
2544 setattr(self.attrs, self.dtype_attr, self.dtype)
2545
2546
2547class DataIndexableCol(DataCol):
2548 """represent a data column that can be indexed"""
2549
2550 is_data_indexable = True
2551
2552 def validate_names(self) -> None:
2553 if not is_object_dtype(Index(self.values)):
2554 # TODO: should the message here be more specifically non-str?
2555 raise ValueError("cannot have non-object label DataIndexableCol")
2556
2557 @classmethod
2558 def get_atom_string(cls, shape, itemsize):
2559 return _tables().StringCol(itemsize=itemsize)
2560
2561 @classmethod
2562 def get_atom_data(cls, shape, kind: str) -> Col:
2563 return cls.get_atom_coltype(kind=kind)()
2564
2565 @classmethod
2566 def get_atom_datetime64(cls, shape):
2567 return _tables().Int64Col()
2568
2569 @classmethod
2570 def get_atom_timedelta64(cls, shape):
2571 return _tables().Int64Col()
2572
2573
2574class GenericDataIndexableCol(DataIndexableCol):
2575 """represent a generic pytables data column"""
2576
2577
2578class Fixed:
2579 """
2580 represent an object in my store
2581 facilitate read/write of various types of objects
2582 this is an abstract base class
2583
2584 Parameters
2585 ----------
2586 parent : HDFStore
2587 group : Node
2588 The group node where the table resides.
2589 """
2590
2591 pandas_kind: str
2592 format_type: str = "fixed" # GH#30962 needed by dask
2593 obj_type: type[DataFrame | Series]
2594 ndim: int
2595 parent: HDFStore
2596 is_table: bool = False
2597
2598 def __init__(
2599 self,
2600 parent: HDFStore,
2601 group: Node,
2602 encoding: str | None = "UTF-8",
2603 errors: str = "strict",
2604 ) -> None:
2605 assert isinstance(parent, HDFStore), type(parent)
2606 assert _table_mod is not None # needed for mypy
2607 assert isinstance(group, _table_mod.Node), type(group)
2608 self.parent = parent
2609 self.group = group
2610 self.encoding = _ensure_encoding(encoding)
2611 self.errors = errors
2612
2613 @property
2614 def is_old_version(self) -> bool:
2615 return self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1
2616
2617 @property
2618 def version(self) -> tuple[int, int, int]:
2619        """compute our version from the stored pandas_version attribute"""
2620 version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None))
2621 try:
2622 version = tuple(int(x) for x in version.split("."))
2623 if len(version) == 2:
2624 version = version + (0,)
2625 except AttributeError:
2626 version = (0, 0, 0)
2627 return version
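
    # Editor's note: illustrative mappings (derived from the parsing above) from the
    # stored ``pandas_version`` string to the tuple returned:
    #
    #     "0.15.2"  -> (0, 15, 2)
    #     "0.10"    -> (0, 10, 0)    # two-part versions are padded with a trailing 0
    #     missing   -> (0, 0, 0)     # the AttributeError fallback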
2628
2629 @property
2630 def pandas_type(self):
2631 return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None))
2632
2633 def __repr__(self) -> str:
2634 """return a pretty representation of myself"""
2635 self.infer_axes()
2636 s = self.shape
2637 if s is not None:
2638 if isinstance(s, (list, tuple)):
2639 jshape = ",".join([pprint_thing(x) for x in s])
2640 s = f"[{jshape}]"
2641 return f"{self.pandas_type:12.12} (shape->{s})"
2642 return self.pandas_type
2643
2644 def set_object_info(self) -> None:
2645 """set my pandas type & version"""
2646 self.attrs.pandas_type = str(self.pandas_kind)
2647 self.attrs.pandas_version = str(_version)
2648
2649 def copy(self) -> Fixed:
2650 new_self = copy.copy(self)
2651 return new_self
2652
2653 @property
2654 def shape(self):
2655 return self.nrows
2656
2657 @property
2658 def pathname(self):
2659 return self.group._v_pathname
2660
2661 @property
2662 def _handle(self):
2663 return self.parent._handle
2664
2665 @property
2666 def _filters(self):
2667 return self.parent._filters
2668
2669 @property
2670 def _complevel(self) -> int:
2671 return self.parent._complevel
2672
2673 @property
2674 def _fletcher32(self) -> bool:
2675 return self.parent._fletcher32
2676
2677 @property
2678 def attrs(self):
2679 return self.group._v_attrs
2680
2681 def set_attrs(self) -> None:
2682 """set our object attributes"""
2683
2684 def get_attrs(self) -> None:
2685 """get our object attributes"""
2686
2687 @property
2688 def storable(self):
2689 """return my storable"""
2690 return self.group
2691
2692 @property
2693 def is_exists(self) -> bool:
2694 return False
2695
2696 @property
2697 def nrows(self):
2698 return getattr(self.storable, "nrows", None)
2699
2700 def validate(self, other) -> Literal[True] | None:
2701 """validate against an existing storable"""
2702 if other is None:
2703 return None
2704 return True
2705
2706 def validate_version(self, where=None) -> None:
2707 """are we trying to operate on an old version?"""
2708
2709 def infer_axes(self) -> bool:
2710 """
2711 infer the axes of my storer
2712 return a boolean indicating if we have a valid storer or not
2713 """
2714 s = self.storable
2715 if s is None:
2716 return False
2717 self.get_attrs()
2718 return True
2719
2720 def read(
2721 self,
2722 where=None,
2723 columns=None,
2724 start: int | None = None,
2725 stop: int | None = None,
2726 ):
2727 raise NotImplementedError(
2728 "cannot read on an abstract storer: subclasses should implement"
2729 )
2730
2731 def write(self, **kwargs):
2732 raise NotImplementedError(
2733 "cannot write on an abstract storer: subclasses should implement"
2734 )
2735
2736 def delete(
2737 self, where=None, start: int | None = None, stop: int | None = None
2738 ) -> None:
2739 """
2740        support fully deleting the node in its entirety (only); the where
2741        specification must be None
2742 """
2743 if com.all_none(where, start, stop):
2744 self._handle.remove_node(self.group, recursive=True)
2745 return None
2746
2747 raise TypeError("cannot delete on an abstract storer")
2748
2749
2750class GenericFixed(Fixed):
2751 """a generified fixed version"""
2752
2753 _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"}
2754 _reverse_index_map = {v: k for k, v in _index_type_map.items()}
2755 attributes: list[str] = []
2756
2757 # indexer helpers
2758 def _class_to_alias(self, cls) -> str:
2759 return self._index_type_map.get(cls, "")
2760
2761 def _alias_to_class(self, alias):
2762 if isinstance(alias, type): # pragma: no cover
2763 # compat: for a short period of time master stored types
2764 return alias
2765 return self._reverse_index_map.get(alias, Index)
2766
2767 def _get_index_factory(self, attrs):
2768 index_class = self._alias_to_class(
2769 _ensure_decoded(getattr(attrs, "index_class", ""))
2770 )
2771
2772 factory: Callable
2773
2774 if index_class == DatetimeIndex:
2775
2776 def f(values, freq=None, tz=None):
2777 # data are already in UTC, localize and convert if tz present
2778 dta = DatetimeArray._simple_new(values.values, freq=freq)
2779 result = DatetimeIndex._simple_new(dta, name=None)
2780 if tz is not None:
2781 result = result.tz_localize("UTC").tz_convert(tz)
2782 return result
2783
2784 factory = f
2785 elif index_class == PeriodIndex:
2786
2787 def f(values, freq=None, tz=None):
2788 parr = PeriodArray._simple_new(values, freq=freq)
2789 return PeriodIndex._simple_new(parr, name=None)
2790
2791 factory = f
2792 else:
2793 factory = index_class
2794
2795 kwargs = {}
2796 if "freq" in attrs:
2797 kwargs["freq"] = attrs["freq"]
2798 if index_class is Index:
2799 # DTI/PI would be gotten by _alias_to_class
2800 factory = TimedeltaIndex
2801
2802 if "tz" in attrs:
2803 if isinstance(attrs["tz"], bytes):
2804 # created by python2
2805 kwargs["tz"] = attrs["tz"].decode("utf-8")
2806 else:
2807 # created by python3
2808 kwargs["tz"] = attrs["tz"]
2809 assert index_class is DatetimeIndex # just checking
2810
2811 return factory, kwargs
2812
2813 def validate_read(self, columns, where) -> None:
2814 """
2815        raise if any keywords are passed which are not None
2816 """
2817 if columns is not None:
2818 raise TypeError(
2819 "cannot pass a column specification when reading "
2820 "a Fixed format store. this store must be selected in its entirety"
2821 )
2822 if where is not None:
2823 raise TypeError(
2824 "cannot pass a where specification when reading "
2825 "from a Fixed format store. this store must be selected in its entirety"
2826 )
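
    # Editor's note: a behavioural sketch (assumed public-API usage, hypothetical
    # file/key names) of the restriction enforced above for fixed-format stores.
    #
    #     import pandas as pd
    #
    #     df = pd.DataFrame({"A": range(5)})
    #     df.to_hdf("example.h5", key="fixed_df", format="fixed")
    #     pd.read_hdf("example.h5", "fixed_df")                      # ok: whole object
    #     pd.read_hdf("example.h5", "fixed_df", where="index > 2")   # raises TypeError
    #     df.to_hdf("example.h5", key="table_df", format="table")    # use this to query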
2827
2828 @property
2829 def is_exists(self) -> bool:
2830 return True
2831
2832 def set_attrs(self) -> None:
2833 """set our object attributes"""
2834 self.attrs.encoding = self.encoding
2835 self.attrs.errors = self.errors
2836
2837 def get_attrs(self) -> None:
2838 """retrieve our attributes"""
2839 self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
2840 self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
2841 for n in self.attributes:
2842 setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None)))
2843
2844 # error: Signature of "write" incompatible with supertype "Fixed"
2845 def write(self, obj, **kwargs) -> None: # type: ignore[override]
2846 self.set_attrs()
2847
2848 def read_array(self, key: str, start: int | None = None, stop: int | None = None):
2849        """read an array for the specified node (off of the group)"""
2850 import tables
2851
2852 node = getattr(self.group, key)
2853 attrs = node._v_attrs
2854
2855 transposed = getattr(attrs, "transposed", False)
2856
2857 if isinstance(node, tables.VLArray):
2858 ret = node[0][start:stop]
2859 else:
2860 dtype = _ensure_decoded(getattr(attrs, "value_type", None))
2861 shape = getattr(attrs, "shape", None)
2862
2863 if shape is not None:
2864 # length 0 axis
2865 ret = np.empty(shape, dtype=dtype)
2866 else:
2867 ret = node[start:stop]
2868
2869 if dtype == "datetime64":
2870 # reconstruct a timezone if indicated
2871 tz = getattr(attrs, "tz", None)
2872 ret = _set_tz(ret, tz, coerce=True)
2873
2874 elif dtype == "timedelta64":
2875 ret = np.asarray(ret, dtype="m8[ns]")
2876
2877 if transposed:
2878 return ret.T
2879 else:
2880 return ret
2881
2882 def read_index(
2883 self, key: str, start: int | None = None, stop: int | None = None
2884 ) -> Index:
2885 variety = _ensure_decoded(getattr(self.attrs, f"{key}_variety"))
2886
2887 if variety == "multi":
2888 return self.read_multi_index(key, start=start, stop=stop)
2889 elif variety == "regular":
2890 node = getattr(self.group, key)
2891 index = self.read_index_node(node, start=start, stop=stop)
2892 return index
2893 else: # pragma: no cover
2894 raise TypeError(f"unrecognized index variety: {variety}")
2895
2896 def write_index(self, key: str, index: Index) -> None:
2897 if isinstance(index, MultiIndex):
2898 setattr(self.attrs, f"{key}_variety", "multi")
2899 self.write_multi_index(key, index)
2900 else:
2901 setattr(self.attrs, f"{key}_variety", "regular")
2902 converted = _convert_index("index", index, self.encoding, self.errors)
2903
2904 self.write_array(key, converted.values)
2905
2906 node = getattr(self.group, key)
2907 node._v_attrs.kind = converted.kind
2908 node._v_attrs.name = index.name
2909
2910 if isinstance(index, (DatetimeIndex, PeriodIndex)):
2911 node._v_attrs.index_class = self._class_to_alias(type(index))
2912
2913 if isinstance(index, (DatetimeIndex, PeriodIndex, TimedeltaIndex)):
2914 node._v_attrs.freq = index.freq
2915
2916 if isinstance(index, DatetimeIndex) and index.tz is not None:
2917 node._v_attrs.tz = _get_tz(index.tz)
2918
2919 def write_multi_index(self, key: str, index: MultiIndex) -> None:
2920 setattr(self.attrs, f"{key}_nlevels", index.nlevels)
2921
2922 for i, (lev, level_codes, name) in enumerate(
2923 zip(index.levels, index.codes, index.names)
2924 ):
2925 # write the level
2926 if is_extension_array_dtype(lev):
2927 raise NotImplementedError(
2928 "Saving a MultiIndex with an extension dtype is not supported."
2929 )
2930 level_key = f"{key}_level{i}"
2931 conv_level = _convert_index(level_key, lev, self.encoding, self.errors)
2932 self.write_array(level_key, conv_level.values)
2933 node = getattr(self.group, level_key)
2934 node._v_attrs.kind = conv_level.kind
2935 node._v_attrs.name = name
2936
2937 # write the name
2938 setattr(node._v_attrs, f"{key}_name{name}", name)
2939
2940 # write the labels
2941 label_key = f"{key}_label{i}"
2942 self.write_array(label_key, level_codes)
2943
2944 def read_multi_index(
2945 self, key: str, start: int | None = None, stop: int | None = None
2946 ) -> MultiIndex:
2947 nlevels = getattr(self.attrs, f"{key}_nlevels")
2948
2949 levels = []
2950 codes = []
2951 names: list[Hashable] = []
2952 for i in range(nlevels):
2953 level_key = f"{key}_level{i}"
2954 node = getattr(self.group, level_key)
2955 lev = self.read_index_node(node, start=start, stop=stop)
2956 levels.append(lev)
2957 names.append(lev.name)
2958
2959 label_key = f"{key}_label{i}"
2960 level_codes = self.read_array(label_key, start=start, stop=stop)
2961 codes.append(level_codes)
2962
2963 return MultiIndex(
2964 levels=levels, codes=codes, names=names, verify_integrity=True
2965 )
2966
2967 def read_index_node(
2968 self, node: Node, start: int | None = None, stop: int | None = None
2969 ) -> Index:
2970 data = node[start:stop]
2971 # If the index was an empty array write_array_empty() will
2972 # have written a sentinel. Here we replace it with the original.
2973 if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0:
2974 data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type)
2975 kind = _ensure_decoded(node._v_attrs.kind)
2976 name = None
2977
2978 if "name" in node._v_attrs:
2979 name = _ensure_str(node._v_attrs.name)
2980 name = _ensure_decoded(name)
2981
2982 attrs = node._v_attrs
2983 factory, kwargs = self._get_index_factory(attrs)
2984
2985 if kind in ("date", "object"):
2986 index = factory(
2987 _unconvert_index(
2988 data, kind, encoding=self.encoding, errors=self.errors
2989 ),
2990 dtype=object,
2991 **kwargs,
2992 )
2993 else:
2994 index = factory(
2995 _unconvert_index(
2996 data, kind, encoding=self.encoding, errors=self.errors
2997 ),
2998 **kwargs,
2999 )
3000
3001 index.name = name
3002
3003 return index
3004
3005 def write_array_empty(self, key: str, value: ArrayLike) -> None:
3006 """write a 0-len array"""
3007 # ugly hack for length 0 axes
3008 arr = np.empty((1,) * value.ndim)
3009 self._handle.create_array(self.group, key, arr)
3010 node = getattr(self.group, key)
3011 node._v_attrs.value_type = str(value.dtype)
3012 node._v_attrs.shape = value.shape
3013
3014 def write_array(
3015 self, key: str, obj: AnyArrayLike, items: Index | None = None
3016 ) -> None:
3017 # TODO: we only have a few tests that get here, the only EA
3018 # that gets passed is DatetimeArray, and we never have
3019 # both self._filters and EA
3020
3021 value = extract_array(obj, extract_numpy=True)
3022
3023 if key in self.group:
3024 self._handle.remove_node(self.group, key)
3025
3026 # Transform needed to interface with pytables row/col notation
3027 empty_array = value.size == 0
3028 transposed = False
3029
3030 if is_categorical_dtype(value.dtype):
3031 raise NotImplementedError(
3032 "Cannot store a category dtype in a HDF5 dataset that uses format="
3033 '"fixed". Use format="table".'
3034 )
3035 if not empty_array:
3036 if hasattr(value, "T"):
3037 # ExtensionArrays (1d) may not have transpose.
3038 value = value.T
3039 transposed = True
3040
3041 atom = None
3042 if self._filters is not None:
3043 with suppress(ValueError):
3044 # get the atom for this datatype
3045 atom = _tables().Atom.from_dtype(value.dtype)
3046
3047 if atom is not None:
3048 # We only get here if self._filters is non-None and
3049 # the Atom.from_dtype call succeeded
3050
3051 # create an empty chunked array and fill it from value
3052 if not empty_array:
3053 ca = self._handle.create_carray(
3054 self.group, key, atom, value.shape, filters=self._filters
3055 )
3056 ca[:] = value
3057
3058 else:
3059 self.write_array_empty(key, value)
3060
3061 elif value.dtype.type == np.object_:
3062 # infer the type, warn if we have a non-string type here (for
3063 # performance)
3064 inferred_type = lib.infer_dtype(value, skipna=False)
3065 if empty_array:
3066 pass
3067 elif inferred_type == "string":
3068 pass
3069 else:
3070 ws = performance_doc % (inferred_type, key, items)
3071 warnings.warn(ws, PerformanceWarning, stacklevel=find_stack_level())
3072
3073 vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom())
3074 vlarr.append(value)
3075
3076 elif is_datetime64_dtype(value.dtype):
3077 self._handle.create_array(self.group, key, value.view("i8"))
3078 getattr(self.group, key)._v_attrs.value_type = "datetime64"
3079 elif is_datetime64tz_dtype(value.dtype):
3080 # store as UTC
3081 # with a zone
3082
3083 # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no
3084 # attribute "asi8"
3085 self._handle.create_array(
3086 self.group, key, value.asi8 # type: ignore[union-attr]
3087 )
3088
3089 node = getattr(self.group, key)
3090 # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no
3091 # attribute "tz"
3092 node._v_attrs.tz = _get_tz(value.tz) # type: ignore[union-attr]
3093 node._v_attrs.value_type = "datetime64"
3094 elif is_timedelta64_dtype(value.dtype):
3095 self._handle.create_array(self.group, key, value.view("i8"))
3096 getattr(self.group, key)._v_attrs.value_type = "timedelta64"
3097 elif empty_array:
3098 self.write_array_empty(key, value)
3099 else:
3100 self._handle.create_array(self.group, key, value)
3101
3102 getattr(self.group, key)._v_attrs.transposed = transposed
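
    # Editor's note: a short sketch (assumed public-API usage, hypothetical names) of
    # the category-dtype restriction raised above.
    #
    #     import pandas as pd
    #
    #     df = pd.DataFrame({"A": pd.Categorical(["x", "y", "x"])})
    #     df.to_hdf("example.h5", key="cat", format="fixed")   # NotImplementedError
    #     df.to_hdf("example.h5", key="cat", format="table")   # works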
3103
3104
3105class SeriesFixed(GenericFixed):
3106 pandas_kind = "series"
3107 attributes = ["name"]
3108
3109 name: Hashable
3110
3111 @property
3112 def shape(self):
3113 try:
3114 return (len(self.group.values),)
3115 except (TypeError, AttributeError):
3116 return None
3117
3118 def read(
3119 self,
3120 where=None,
3121 columns=None,
3122 start: int | None = None,
3123 stop: int | None = None,
3124 ) -> Series:
3125 self.validate_read(columns, where)
3126 index = self.read_index("index", start=start, stop=stop)
3127 values = self.read_array("values", start=start, stop=stop)
3128 return Series(values, index=index, name=self.name, copy=False)
3129
3130 # error: Signature of "write" incompatible with supertype "Fixed"
3131 def write(self, obj, **kwargs) -> None: # type: ignore[override]
3132 super().write(obj, **kwargs)
3133 self.write_index("index", obj.index)
3134 self.write_array("values", obj)
3135 self.attrs.name = obj.name
3136
3137
3138class BlockManagerFixed(GenericFixed):
3139 attributes = ["ndim", "nblocks"]
3140
3141 nblocks: int
3142
3143 @property
3144 def shape(self) -> Shape | None:
3145 try:
3146 ndim = self.ndim
3147
3148 # items
3149 items = 0
3150 for i in range(self.nblocks):
3151 node = getattr(self.group, f"block{i}_items")
3152 shape = getattr(node, "shape", None)
3153 if shape is not None:
3154 items += shape[0]
3155
3156 # data shape
3157 node = self.group.block0_values
3158 shape = getattr(node, "shape", None)
3159 if shape is not None:
3160 shape = list(shape[0 : (ndim - 1)])
3161 else:
3162 shape = []
3163
3164 shape.append(items)
3165
3166 return shape
3167 except AttributeError:
3168 return None
3169
3170 def read(
3171 self,
3172 where=None,
3173 columns=None,
3174 start: int | None = None,
3175 stop: int | None = None,
3176 ) -> DataFrame:
3177 # start, stop applied to rows, so 0th axis only
3178 self.validate_read(columns, where)
3179 select_axis = self.obj_type()._get_block_manager_axis(0)
3180
3181 axes = []
3182 for i in range(self.ndim):
3183 _start, _stop = (start, stop) if i == select_axis else (None, None)
3184 ax = self.read_index(f"axis{i}", start=_start, stop=_stop)
3185 axes.append(ax)
3186
3187 items = axes[0]
3188 dfs = []
3189
3190 for i in range(self.nblocks):
3191 blk_items = self.read_index(f"block{i}_items")
3192 values = self.read_array(f"block{i}_values", start=_start, stop=_stop)
3193
3194 columns = items[items.get_indexer(blk_items)]
3195 df = DataFrame(values.T, columns=columns, index=axes[1], copy=False)
3196 dfs.append(df)
3197
3198 if len(dfs) > 0:
3199 out = concat(dfs, axis=1, copy=True)
3200 out = out.reindex(columns=items, copy=False)
3201 return out
3202
3203 return DataFrame(columns=axes[0], index=axes[1])
3204
3205 # error: Signature of "write" incompatible with supertype "Fixed"
3206 def write(self, obj, **kwargs) -> None: # type: ignore[override]
3207 super().write(obj, **kwargs)
3208
3209 # TODO(ArrayManager) HDFStore relies on accessing the blocks
3210 if isinstance(obj._mgr, ArrayManager):
3211 obj = obj._as_manager("block")
3212
3213 data = obj._mgr
3214 if not data.is_consolidated():
3215 data = data.consolidate()
3216
3217 self.attrs.ndim = data.ndim
3218 for i, ax in enumerate(data.axes):
3219 if i == 0 and (not ax.is_unique):
3220 raise ValueError("Columns index has to be unique for fixed format")
3221 self.write_index(f"axis{i}", ax)
3222
3223 # Supporting mixed-type DataFrame objects...nontrivial
3224 self.attrs.nblocks = len(data.blocks)
3225 for i, blk in enumerate(data.blocks):
3226 # I have no idea why, but writing values before items fixed #2299
3227 blk_items = data.items.take(blk.mgr_locs)
3228 self.write_array(f"block{i}_values", blk.values, items=blk_items)
3229 self.write_index(f"block{i}_items", blk_items)
3230
3231
3232class FrameFixed(BlockManagerFixed):
3233 pandas_kind = "frame"
3234 obj_type = DataFrame
3235
3236
3237class Table(Fixed):
3238 """
3239 represent a table:
3240 facilitate read/write of various types of tables
3241
3242 Attrs in Table Node
3243 -------------------
3244    These are attributes that are stored in the main table node; they are
3245 necessary to recreate these tables when read back in.
3246
3247 index_axes : a list of tuples of the (original indexing axis and
3248 index column)
3249 non_index_axes: a list of tuples of the (original index axis and
3250 columns on a non-indexing axis)
3251 values_axes : a list of the columns which comprise the data of this
3252 table
3253 data_columns : a list of the columns that we are allowing indexing
3254 (these become single columns in values_axes)
3255 nan_rep : the string to use for nan representations for string
3256 objects
3257 levels : the names of levels
3258 metadata : the names of the metadata columns
3259 """
3260
3261 pandas_kind = "wide_table"
3262 format_type: str = "table" # GH#30962 needed by dask
3263 table_type: str
3264 levels: int | list[Hashable] = 1
3265 is_table = True
3266
3267 metadata: list
3268
3269 def __init__(
3270 self,
3271 parent: HDFStore,
3272 group: Node,
3273 encoding: str | None = None,
3274 errors: str = "strict",
3275 index_axes: list[IndexCol] | None = None,
3276 non_index_axes: list[tuple[AxisInt, Any]] | None = None,
3277 values_axes: list[DataCol] | None = None,
3278 data_columns: list | None = None,
3279 info: dict | None = None,
3280 nan_rep=None,
3281 ) -> None:
3282 super().__init__(parent, group, encoding=encoding, errors=errors)
3283 self.index_axes = index_axes or []
3284 self.non_index_axes = non_index_axes or []
3285 self.values_axes = values_axes or []
3286 self.data_columns = data_columns or []
3287 self.info = info or {}
3288 self.nan_rep = nan_rep
3289
3290 @property
3291 def table_type_short(self) -> str:
3292 return self.table_type.split("_")[0]
3293
3294 def __repr__(self) -> str:
3295 """return a pretty representation of myself"""
3296 self.infer_axes()
3297 jdc = ",".join(self.data_columns) if len(self.data_columns) else ""
3298 dc = f",dc->[{jdc}]"
3299
3300 ver = ""
3301 if self.is_old_version:
3302 jver = ".".join([str(x) for x in self.version])
3303 ver = f"[{jver}]"
3304
3305 jindex_axes = ",".join([a.name for a in self.index_axes])
3306 return (
3307 f"{self.pandas_type:12.12}{ver} "
3308 f"(typ->{self.table_type_short},nrows->{self.nrows},"
3309 f"ncols->{self.ncols},indexers->[{jindex_axes}]{dc})"
3310 )
3311
3312 def __getitem__(self, c: str):
3313 """return the axis for c"""
3314 for a in self.axes:
3315 if c == a.name:
3316 return a
3317 return None
3318
3319 def validate(self, other) -> None:
3320 """validate against an existing table"""
3321 if other is None:
3322 return
3323
3324 if other.table_type != self.table_type:
3325 raise TypeError(
3326 "incompatible table_type with existing "
3327 f"[{other.table_type} - {self.table_type}]"
3328 )
3329
3330 for c in ["index_axes", "non_index_axes", "values_axes"]:
3331 sv = getattr(self, c, None)
3332 ov = getattr(other, c, None)
3333 if sv != ov:
3334 # show the error for the specific axes
3335 # Argument 1 to "enumerate" has incompatible type
3336 # "Optional[Any]"; expected "Iterable[Any]" [arg-type]
3337 for i, sax in enumerate(sv): # type: ignore[arg-type]
3338 # Value of type "Optional[Any]" is not indexable [index]
3339 oax = ov[i] # type: ignore[index]
3340 if sax != oax:
3341 raise ValueError(
3342 f"invalid combination of [{c}] on appending data "
3343 f"[{sax}] vs current table [{oax}]"
3344 )
3345
3346 # should never get here
3347 raise Exception(
3348 f"invalid combination of [{c}] on appending data [{sv}] vs "
3349 f"current table [{ov}]"
3350 )
3351
3352 @property
3353 def is_multi_index(self) -> bool:
3354 """the levels attribute is 1 or a list in the case of a multi-index"""
3355 return isinstance(self.levels, list)
3356
3357 def validate_multiindex(
3358 self, obj: DataFrame | Series
3359 ) -> tuple[DataFrame, list[Hashable]]:
3360 """
3361 validate that we can store the multi-index; reset and return the
3362 new object
3363 """
3364 levels = com.fill_missing_names(obj.index.names)
3365 try:
3366 reset_obj = obj.reset_index()
3367 except ValueError as err:
3368 raise ValueError(
3369 "duplicate names/columns in the multi-index when storing as a table"
3370 ) from err
3371 assert isinstance(reset_obj, DataFrame) # for mypy
3372 return reset_obj, levels
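
    # Editor's note: an illustrative sketch (assumed public-API usage, hypothetical
    # names) of storing a MultiIndex frame as a table; the index is reset into
    # ordinary columns on write and rebuilt on read, and the levels become queryable.
    #
    #     import pandas as pd
    #
    #     idx = pd.MultiIndex.from_product(
    #         [["a", "b"], [1, 2]], names=["outer", "inner"]
    #     )
    #     df = pd.DataFrame({"v": range(4)}, index=idx)
    #     with pd.HDFStore("example.h5") as store:
    #         store.append("mi", df)
    #         back = store.select("mi", where="outer == 'a'")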
3373
3374 @property
3375 def nrows_expected(self) -> int:
3376 """based on our axes, compute the expected nrows"""
3377 return np.prod([i.cvalues.shape[0] for i in self.index_axes])
3378
3379 @property
3380 def is_exists(self) -> bool:
3381 """has this table been created"""
3382 return "table" in self.group
3383
3384 @property
3385 def storable(self):
3386 return getattr(self.group, "table", None)
3387
3388 @property
3389 def table(self):
3390 """return the table group (this is my storable)"""
3391 return self.storable
3392
3393 @property
3394 def dtype(self):
3395 return self.table.dtype
3396
3397 @property
3398 def description(self):
3399 return self.table.description
3400
3401 @property
3402 def axes(self):
3403 return itertools.chain(self.index_axes, self.values_axes)
3404
3405 @property
3406 def ncols(self) -> int:
3407        """the total number of columns in the values axes"""
3408 return sum(len(a.values) for a in self.values_axes)
3409
3410 @property
3411 def is_transposed(self) -> bool:
3412 return False
3413
3414 @property
3415 def data_orientation(self) -> tuple[int, ...]:
3416        """return a tuple of my permuted axes, non_indexable at the front"""
3417 return tuple(
3418 itertools.chain(
3419 [int(a[0]) for a in self.non_index_axes],
3420 [int(a.axis) for a in self.index_axes],
3421 )
3422 )
3423
3424 def queryables(self) -> dict[str, Any]:
3425        """return a dict of the allowable column kinds for this object"""
3426 # mypy doesn't recognize DataFrame._AXIS_NAMES, so we re-write it here
3427 axis_names = {0: "index", 1: "columns"}
3428
3429 # compute the values_axes queryables
3430 d1 = [(a.cname, a) for a in self.index_axes]
3431 d2 = [(axis_names[axis], None) for axis, values in self.non_index_axes]
3432 d3 = [
3433 (v.cname, v) for v in self.values_axes if v.name in set(self.data_columns)
3434 ]
3435
3436 return dict(d1 + d2 + d3)
3437
3438 def index_cols(self):
3439 """return a list of my index cols"""
3440 # Note: each `i.cname` below is assured to be a str.
3441 return [(i.axis, i.cname) for i in self.index_axes]
3442
3443 def values_cols(self) -> list[str]:
3444 """return a list of my values cols"""
3445 return [i.cname for i in self.values_axes]
3446
3447 def _get_metadata_path(self, key: str) -> str:
3448 """return the metadata pathname for this key"""
3449 group = self.group._v_pathname
3450 return f"{group}/meta/{key}/meta"
3451
3452 def write_metadata(self, key: str, values: np.ndarray) -> None:
3453 """
3454 Write out a metadata array to the key as a fixed-format Series.
3455
3456 Parameters
3457 ----------
3458 key : str
3459 values : ndarray
3460 """
3461 self.parent.put(
3462 self._get_metadata_path(key),
3463 Series(values, copy=False),
3464 format="table",
3465 encoding=self.encoding,
3466 errors=self.errors,
3467 nan_rep=self.nan_rep,
3468 )
3469
3470 def read_metadata(self, key: str):
3471 """return the meta data array for this key"""
3472 if getattr(getattr(self.group, "meta", None), key, None) is not None:
3473 return self.parent.select(self._get_metadata_path(key))
3474 return None
3475
3476 def set_attrs(self) -> None:
3477 """set our table type & indexables"""
3478 self.attrs.table_type = str(self.table_type)
3479 self.attrs.index_cols = self.index_cols()
3480 self.attrs.values_cols = self.values_cols()
3481 self.attrs.non_index_axes = self.non_index_axes
3482 self.attrs.data_columns = self.data_columns
3483 self.attrs.nan_rep = self.nan_rep
3484 self.attrs.encoding = self.encoding
3485 self.attrs.errors = self.errors
3486 self.attrs.levels = self.levels
3487 self.attrs.info = self.info
3488
3489 def get_attrs(self) -> None:
3490 """retrieve our attributes"""
3491 self.non_index_axes = getattr(self.attrs, "non_index_axes", None) or []
3492 self.data_columns = getattr(self.attrs, "data_columns", None) or []
3493 self.info = getattr(self.attrs, "info", None) or {}
3494 self.nan_rep = getattr(self.attrs, "nan_rep", None)
3495 self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
3496 self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
3497 self.levels: list[Hashable] = getattr(self.attrs, "levels", None) or []
3498 self.index_axes = [a for a in self.indexables if a.is_an_indexable]
3499 self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
3500
3501 def validate_version(self, where=None) -> None:
3502 """are we trying to operate on an old version?"""
3503 if where is not None:
3504 if self.is_old_version:
3505 ws = incompatibility_doc % ".".join([str(x) for x in self.version])
3506 warnings.warn(
3507 ws,
3508 IncompatibilityWarning,
3509 stacklevel=find_stack_level(),
3510 )
3511
3512 def validate_min_itemsize(self, min_itemsize) -> None:
3513 """
3514        validate that min_itemsize doesn't contain items that are not in the
3515        axes; this needs data_columns to be defined
3516 """
3517 if min_itemsize is None:
3518 return
3519 if not isinstance(min_itemsize, dict):
3520 return
3521
3522 q = self.queryables()
3523 for k in min_itemsize:
3524 # ok, apply generally
3525 if k == "values":
3526 continue
3527 if k not in q:
3528 raise ValueError(
3529 f"min_itemsize has the key [{k}] which is not an axis or "
3530 "data_column"
3531 )
3532
3533 @cache_readonly
3534 def indexables(self):
3535 """create/cache the indexables if they don't exist"""
3536 _indexables = []
3537
3538 desc = self.description
3539 table_attrs = self.table.attrs
3540
3541 # Note: each of the `name` kwargs below are str, ensured
3542 # by the definition in index_cols.
3543 # index columns
3544 for i, (axis, name) in enumerate(self.attrs.index_cols):
3545 atom = getattr(desc, name)
3546 md = self.read_metadata(name)
3547 meta = "category" if md is not None else None
3548
3549 kind_attr = f"{name}_kind"
3550 kind = getattr(table_attrs, kind_attr, None)
3551
3552 index_col = IndexCol(
3553 name=name,
3554 axis=axis,
3555 pos=i,
3556 kind=kind,
3557 typ=atom,
3558 table=self.table,
3559 meta=meta,
3560 metadata=md,
3561 )
3562 _indexables.append(index_col)
3563
3564 # values columns
3565 dc = set(self.data_columns)
3566 base_pos = len(_indexables)
3567
3568 def f(i, c):
3569 assert isinstance(c, str)
3570 klass = DataCol
3571 if c in dc:
3572 klass = DataIndexableCol
3573
3574 atom = getattr(desc, c)
3575 adj_name = _maybe_adjust_name(c, self.version)
3576
3577 # TODO: why kind_attr here?
3578 values = getattr(table_attrs, f"{adj_name}_kind", None)
3579 dtype = getattr(table_attrs, f"{adj_name}_dtype", None)
3580 # Argument 1 to "_dtype_to_kind" has incompatible type
3581 # "Optional[Any]"; expected "str" [arg-type]
3582 kind = _dtype_to_kind(dtype) # type: ignore[arg-type]
3583
3584 md = self.read_metadata(c)
3585            # TODO: figure out why these two versions of `meta` don't always match.
3586 # meta = "category" if md is not None else None
3587 meta = getattr(table_attrs, f"{adj_name}_meta", None)
3588
3589 obj = klass(
3590 name=adj_name,
3591 cname=c,
3592 values=values,
3593 kind=kind,
3594 pos=base_pos + i,
3595 typ=atom,
3596 table=self.table,
3597 meta=meta,
3598 metadata=md,
3599 dtype=dtype,
3600 )
3601 return obj
3602
3603 # Note: the definition of `values_cols` ensures that each
3604 # `c` below is a str.
3605 _indexables.extend([f(i, c) for i, c in enumerate(self.attrs.values_cols)])
3606
3607 return _indexables
3608
3609 def create_index(
3610 self, columns=None, optlevel=None, kind: str | None = None
3611 ) -> None:
3612 """
3613 Create a pytables index on the specified columns.
3614
3615 Parameters
3616 ----------
3617 columns : None, bool, or listlike[str]
3618 Indicate which columns to create an index on.
3619
3620 * False : Do not create any indexes.
3621 * True : Create indexes on all columns.
3622 * None : Create indexes on all columns.
3623 * listlike : Create indexes on the given columns.
3624
3625 optlevel : int or None, default None
3626 Optimization level, if None, pytables defaults to 6.
3627 kind : str or None, default None
3628 Kind of index, if None, pytables defaults to "medium".
3629
3630 Raises
3631 ------
3632 TypeError if trying to create an index on a complex-type column.
3633
3634 Notes
3635 -----
3636 Cannot index Time64Col or ComplexCol.
3637        PyTables must be >= 3.0.
3638 """
3639 if not self.infer_axes():
3640 return
3641 if columns is False:
3642 return
3643
3644 # index all indexables and data_columns
3645 if columns is None or columns is True:
3646 columns = [a.cname for a in self.axes if a.is_data_indexable]
3647 if not isinstance(columns, (tuple, list)):
3648 columns = [columns]
3649
3650 kw = {}
3651 if optlevel is not None:
3652 kw["optlevel"] = optlevel
3653 if kind is not None:
3654 kw["kind"] = kind
3655
3656 table = self.table
3657 for c in columns:
3658 v = getattr(table.cols, c, None)
3659 if v is not None:
3660 # remove the index if the kind/optlevel have changed
3661 if v.is_indexed:
3662 index = v.index
3663 cur_optlevel = index.optlevel
3664 cur_kind = index.kind
3665
3666 if kind is not None and cur_kind != kind:
3667 v.remove_index()
3668 else:
3669 kw["kind"] = cur_kind
3670
3671 if optlevel is not None and cur_optlevel != optlevel:
3672 v.remove_index()
3673 else:
3674 kw["optlevel"] = cur_optlevel
3675
3676 # create the index
3677 if not v.is_indexed:
3678 if v.type.startswith("complex"):
3679 raise TypeError(
3680 "Columns containing complex values can be stored but "
3681 "cannot be indexed when using table format. Either use "
3682 "fixed format, set index=False, or do not include "
3683 "the columns containing complex values to "
3684 "data_columns when initializing the table."
3685 )
3686 v.create_index(**kw)
3687 elif c in self.non_index_axes[0][1]:
3688 # GH 28156
3689 raise AttributeError(
3690 f"column {c} is not a data_column.\n"
3691 f"In order to read column {c} you must reload the dataframe \n"
3692 f"into HDFStore and include {c} with the data_columns argument."
3693 )
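
    # Editor's note: how this surfaces through the public API (an assumed usage
    # sketch with hypothetical names); deferring index creation with ``index=False``
    # and building it once at the end is a common pattern for repeated appends.
    #
    #     import pandas as pd
    #
    #     with pd.HDFStore("example.h5") as store:
    #         for chunk in chunks:   # ``chunks`` is a hypothetical iterable of frames
    #             store.append("df", chunk, data_columns=["B"], index=False)
    #         store.create_table_index("df", columns=["B"], optlevel=9, kind="full")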
3694
3695 def _read_axes(
3696 self, where, start: int | None = None, stop: int | None = None
3697 ) -> list[tuple[ArrayLike, ArrayLike]]:
3698 """
3699 Create the axes sniffed from the table.
3700
3701 Parameters
3702 ----------
3703 where : ???
3704 start : int or None, default None
3705 stop : int or None, default None
3706
3707 Returns
3708 -------
3709 List[Tuple[index_values, column_values]]
3710 """
3711 # create the selection
3712 selection = Selection(self, where=where, start=start, stop=stop)
3713 values = selection.select()
3714
3715 results = []
3716 # convert the data
3717 for a in self.axes:
3718 a.set_info(self.info)
3719 res = a.convert(
3720 values,
3721 nan_rep=self.nan_rep,
3722 encoding=self.encoding,
3723 errors=self.errors,
3724 )
3725 results.append(res)
3726
3727 return results
3728
3729 @classmethod
3730 def get_object(cls, obj, transposed: bool):
3731 """return the data for this obj"""
3732 return obj
3733
3734 def validate_data_columns(self, data_columns, min_itemsize, non_index_axes):
3735 """
3736        take the input data_columns and min_itemsize and create a data
3737 columns spec
3738 """
3739 if not len(non_index_axes):
3740 return []
3741
3742 axis, axis_labels = non_index_axes[0]
3743 info = self.info.get(axis, {})
3744 if info.get("type") == "MultiIndex" and data_columns:
3745 raise ValueError(
3746 f"cannot use a multi-index on axis [{axis}] with "
3747 f"data_columns {data_columns}"
3748 )
3749
3750 # evaluate the passed data_columns, True == use all columns
3751 # take only valid axis labels
3752 if data_columns is True:
3753 data_columns = list(axis_labels)
3754 elif data_columns is None:
3755 data_columns = []
3756
3757 # if min_itemsize is a dict, add the keys (exclude 'values')
3758 if isinstance(min_itemsize, dict):
3759 existing_data_columns = set(data_columns)
3760 data_columns = list(data_columns) # ensure we do not modify
3761 data_columns.extend(
3762 [
3763 k
3764 for k in min_itemsize.keys()
3765 if k != "values" and k not in existing_data_columns
3766 ]
3767 )
3768
3769 # return valid columns in the order of our axis
3770 return [c for c in data_columns if c in axis_labels]
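
    # Editor's note: an assumed public-API sketch (hypothetical names) of the
    # data_columns spec handled above: True indexes every column, a list indexes only
    # the named columns, and dict keys of min_itemsize are added automatically.
    #
    #     import pandas as pd
    #
    #     df = pd.DataFrame({"A": range(3), "B": list("xyz")})
    #     with pd.HDFStore("example.h5") as store:
    #         store.append("all_cols", df, data_columns=True)
    #         store.append("some_cols", df, data_columns=["A"], min_itemsize={"B": 20})
    #         # both A and B are queryable on "some_cols"; B via the min_itemsize key
    #         store.select("some_cols", where="A > 1 and B == 'z'")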
3771
3772 def _create_axes(
3773 self,
3774 axes,
3775 obj: DataFrame,
3776 validate: bool = True,
3777 nan_rep=None,
3778 data_columns=None,
3779 min_itemsize=None,
3780 ):
3781 """
3782 Create and return the axes.
3783
3784 Parameters
3785 ----------
3786 axes: list or None
3787 The names or numbers of the axes to create.
3788 obj : DataFrame
3789 The object to create axes on.
3790 validate: bool, default True
3791 Whether to validate the obj against an existing object already written.
3792 nan_rep :
3793 A value to use for string column nan_rep.
3794 data_columns : List[str], True, or None, default None
3795 Specify the columns that we want to create to allow indexing on.
3796
3797 * True : Use all available columns.
3798 * None : Use no columns.
3799 * List[str] : Use the specified columns.
3800
3801 min_itemsize: Dict[str, int] or None, default None
3802 The min itemsize for a column in bytes.
3803 """
3804 if not isinstance(obj, DataFrame):
3805 group = self.group._v_name
3806 raise TypeError(
3807 f"cannot properly create the storer for: [group->{group},"
3808 f"value->{type(obj)}]"
3809 )
3810
3811 # set the default axes if needed
3812 if axes is None:
3813 axes = [0]
3814
3815 # map axes to numbers
3816 axes = [obj._get_axis_number(a) for a in axes]
3817
3818 # do we have an existing table (if so, use its axes & data_columns)
3819 if self.infer_axes():
3820 table_exists = True
3821 axes = [a.axis for a in self.index_axes]
3822 data_columns = list(self.data_columns)
3823 nan_rep = self.nan_rep
3824 # TODO: do we always have validate=True here?
3825 else:
3826 table_exists = False
3827
3828 new_info = self.info
3829
3830 assert self.ndim == 2 # with next check, we must have len(axes) == 1
3831        # currently only support indexers on ndim-1 axes
3832 if len(axes) != self.ndim - 1:
3833 raise ValueError(
3834 "currently only support ndim-1 indexers in an AppendableTable"
3835 )
3836
3837 # create according to the new data
3838 new_non_index_axes: list = []
3839
3840 # nan_representation
3841 if nan_rep is None:
3842 nan_rep = "nan"
3843
3844 # We construct the non-index-axis first, since that alters new_info
3845 idx = [x for x in [0, 1] if x not in axes][0]
3846
3847 a = obj.axes[idx]
3848 # we might be able to change the axes on the appending data if necessary
3849 append_axis = list(a)
3850 if table_exists:
3851 indexer = len(new_non_index_axes) # i.e. 0
3852 exist_axis = self.non_index_axes[indexer][1]
3853 if not array_equivalent(np.array(append_axis), np.array(exist_axis)):
3854 # ahah! -> reindex
3855 if array_equivalent(
3856 np.array(sorted(append_axis)), np.array(sorted(exist_axis))
3857 ):
3858 append_axis = exist_axis
3859
3860 # the non_index_axes info
3861 info = new_info.setdefault(idx, {})
3862 info["names"] = list(a.names)
3863 info["type"] = type(a).__name__
3864
3865 new_non_index_axes.append((idx, append_axis))
3866
3867 # Now we can construct our new index axis
3868 idx = axes[0]
3869 a = obj.axes[idx]
3870 axis_name = obj._get_axis_name(idx)
3871 new_index = _convert_index(axis_name, a, self.encoding, self.errors)
3872 new_index.axis = idx
3873
3874 # Because we are always 2D, there is only one new_index, so
3875 # we know it will have pos=0
3876 new_index.set_pos(0)
3877 new_index.update_info(new_info)
3878 new_index.maybe_set_size(min_itemsize) # check for column conflicts
3879
3880 new_index_axes = [new_index]
3881 j = len(new_index_axes) # i.e. 1
3882 assert j == 1
3883
3884 # reindex by our non_index_axes & compute data_columns
3885 assert len(new_non_index_axes) == 1
3886 for a in new_non_index_axes:
3887 obj = _reindex_axis(obj, a[0], a[1])
3888
3889 transposed = new_index.axis == 1
3890
3891 # figure out data_columns and get out blocks
3892 data_columns = self.validate_data_columns(
3893 data_columns, min_itemsize, new_non_index_axes
3894 )
3895
3896 frame = self.get_object(obj, transposed)._consolidate()
3897
3898 blocks, blk_items = self._get_blocks_and_items(
3899 frame, table_exists, new_non_index_axes, self.values_axes, data_columns
3900 )
3901
3902 # add my values
3903 vaxes = []
3904 for i, (blk, b_items) in enumerate(zip(blocks, blk_items)):
3905            # the shape of each data column is given by the indexable axes
3906 klass = DataCol
3907 name = None
3908
3909 # we have a data_column
3910 if data_columns and len(b_items) == 1 and b_items[0] in data_columns:
3911 klass = DataIndexableCol
3912 name = b_items[0]
3913 if not (name is None or isinstance(name, str)):
3914 # TODO: should the message here be more specifically non-str?
3915 raise ValueError("cannot have non-object label DataIndexableCol")
3916
3917 # make sure that we match up the existing columns
3918 # if we have an existing table
3919 existing_col: DataCol | None
3920
3921 if table_exists and validate:
3922 try:
3923 existing_col = self.values_axes[i]
3924 except (IndexError, KeyError) as err:
3925 raise ValueError(
3926 f"Incompatible appended table [{blocks}]"
3927 f"with existing table [{self.values_axes}]"
3928 ) from err
3929 else:
3930 existing_col = None
3931
3932 new_name = name or f"values_block_{i}"
3933 data_converted = _maybe_convert_for_string_atom(
3934 new_name,
3935 blk.values,
3936 existing_col=existing_col,
3937 min_itemsize=min_itemsize,
3938 nan_rep=nan_rep,
3939 encoding=self.encoding,
3940 errors=self.errors,
3941 columns=b_items,
3942 )
3943 adj_name = _maybe_adjust_name(new_name, self.version)
3944
3945 typ = klass._get_atom(data_converted)
3946 kind = _dtype_to_kind(data_converted.dtype.name)
3947 tz = None
3948 if getattr(data_converted, "tz", None) is not None:
3949 tz = _get_tz(data_converted.tz)
3950
3951 meta = metadata = ordered = None
3952 if is_categorical_dtype(data_converted.dtype):
3953 ordered = data_converted.ordered
3954 meta = "category"
3955 metadata = np.array(data_converted.categories, copy=False).ravel()
3956
3957 data, dtype_name = _get_data_and_dtype_name(data_converted)
3958
3959 col = klass(
3960 name=adj_name,
3961 cname=new_name,
3962 values=list(b_items),
3963 typ=typ,
3964 pos=j,
3965 kind=kind,
3966 tz=tz,
3967 ordered=ordered,
3968 meta=meta,
3969 metadata=metadata,
3970 dtype=dtype_name,
3971 data=data,
3972 )
3973 col.update_info(new_info)
3974
3975 vaxes.append(col)
3976
3977 j += 1
3978
3979 dcs = [col.name for col in vaxes if col.is_data_indexable]
3980
3981 new_table = type(self)(
3982 parent=self.parent,
3983 group=self.group,
3984 encoding=self.encoding,
3985 errors=self.errors,
3986 index_axes=new_index_axes,
3987 non_index_axes=new_non_index_axes,
3988 values_axes=vaxes,
3989 data_columns=dcs,
3990 info=new_info,
3991 nan_rep=nan_rep,
3992 )
3993 if hasattr(self, "levels"):
3994 # TODO: get this into constructor, only for appropriate subclass
3995 new_table.levels = self.levels
3996
3997 new_table.validate_min_itemsize(min_itemsize)
3998
3999 if validate and table_exists:
4000 new_table.validate(self)
4001
4002 return new_table
4003
4004 @staticmethod
4005 def _get_blocks_and_items(
4006 frame: DataFrame,
4007 table_exists: bool,
4008 new_non_index_axes,
4009 values_axes,
4010 data_columns,
4011 ):
4012 # Helper to clarify non-state-altering parts of _create_axes
4013
4014 # TODO(ArrayManager) HDFStore relies on accessing the blocks
4015 if isinstance(frame._mgr, ArrayManager):
4016 frame = frame._as_manager("block")
4017
4018 def get_blk_items(mgr):
4019 return [mgr.items.take(blk.mgr_locs) for blk in mgr.blocks]
4020
4021 mgr = frame._mgr
4022 mgr = cast(BlockManager, mgr)
4023 blocks: list[Block] = list(mgr.blocks)
4024 blk_items: list[Index] = get_blk_items(mgr)
4025
4026 if len(data_columns):
4027 # TODO: prove that we only get here with axis == 1?
4028 # It is the case in all extant tests, but NOT the case
4029 # outside this `if len(data_columns)` check.
4030
4031 axis, axis_labels = new_non_index_axes[0]
4032 new_labels = Index(axis_labels).difference(Index(data_columns))
4033 mgr = frame.reindex(new_labels, axis=axis)._mgr
4034 mgr = cast(BlockManager, mgr)
4035
4036 blocks = list(mgr.blocks)
4037 blk_items = get_blk_items(mgr)
4038 for c in data_columns:
4039 # This reindex would raise ValueError if we had a duplicate
4040 # index, so we can infer that (as long as axis==1) we
4041 # get a single column back, so a single block.
4042 mgr = frame.reindex([c], axis=axis)._mgr
4043 mgr = cast(BlockManager, mgr)
4044 blocks.extend(mgr.blocks)
4045 blk_items.extend(get_blk_items(mgr))
4046
4047 # reorder the blocks in the same order as the existing table if we can
4048 if table_exists:
4049 by_items = {
4050 tuple(b_items.tolist()): (b, b_items)
4051 for b, b_items in zip(blocks, blk_items)
4052 }
4053 new_blocks: list[Block] = []
4054 new_blk_items = []
4055 for ea in values_axes:
4056 items = tuple(ea.values)
4057 try:
4058 b, b_items = by_items.pop(items)
4059 new_blocks.append(b)
4060 new_blk_items.append(b_items)
4061 except (IndexError, KeyError) as err:
4062 jitems = ",".join([pprint_thing(item) for item in items])
4063 raise ValueError(
4064 f"cannot match existing table structure for [{jitems}] "
4065 "on appending data"
4066 ) from err
4067 blocks = new_blocks
4068 blk_items = new_blk_items
4069
4070 return blocks, blk_items
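    # Illustrative sketch (hypothetical frame): with columns ["A", "B", "C"],
    # data_columns=["B"], and "A"/"C" sharing a dtype, the non-data columns are
    # consolidated into one block while each data column gets its own block:
    #   blocks    ~ [block for ["A", "C"], block for ["B"]]
    #   blk_items ~ [Index(["A", "C"]), Index(["B"])]
    # On append, the blocks are then reordered to match the ordering already
    # recorded in the existing table's ``values_axes``.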
4071
4072 def process_axes(self, obj, selection: Selection, columns=None) -> DataFrame:
4073 """process axes filters"""
4074 # make a copy to avoid side effects
4075 if columns is not None:
4076 columns = list(columns)
4077
4078 # make sure to include levels if we have them
4079 if columns is not None and self.is_multi_index:
4080 assert isinstance(self.levels, list) # assured by is_multi_index
4081 for n in self.levels:
4082 if n not in columns:
4083 columns.insert(0, n)
4084
4085 # reorder by any non_index_axes & limit to the select columns
4086 for axis, labels in self.non_index_axes:
4087 obj = _reindex_axis(obj, axis, labels, columns)
4088
4089 def process_filter(field, filt, op):
4090 for axis_name in obj._AXIS_ORDERS:
4091 axis_number = obj._get_axis_number(axis_name)
4092 axis_values = obj._get_axis(axis_name)
4093 assert axis_number is not None
4094
4095 # see if the field is the name of an axis
4096 if field == axis_name:
4097 # if we have a multi-index, then need to include
4098 # the levels
4099 if self.is_multi_index:
4100 filt = filt.union(Index(self.levels))
4101
4102 takers = op(axis_values, filt)
4103 return obj.loc(axis=axis_number)[takers]
4104
                # this might be the name of a field IN an axis
4106 elif field in axis_values:
4107 # we need to filter on this dimension
4108 values = ensure_index(getattr(obj, field).values)
4109 filt = ensure_index(filt)
4110
4111 # hack until we support reversed dim flags
4112 if isinstance(obj, DataFrame):
4113 axis_number = 1 - axis_number
4114
4115 takers = op(values, filt)
4116 return obj.loc(axis=axis_number)[takers]
4117
4118 raise ValueError(f"cannot find the field [{field}] for filtering!")
4119
4120 # apply the selection filters (but keep in the same order)
4121 if selection.filter is not None:
4122 for field, op, filt in selection.filter.format():
4123 obj = process_filter(field, filt, op)
4124
4125 return obj
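    # Illustrative sketch (hedged, hypothetical query): a where clause such as
    # ``"columns=['A', 'B']"`` does not become a numexpr condition but a filter
    # term, roughly ``("columns", <op>, Index(['A', 'B']))``; ``process_filter``
    # matches the field against the object's axis names (or a data column) and
    # returns ``obj`` restricted to the matching labels.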
4126
4127 def create_description(
4128 self,
4129 complib,
4130 complevel: int | None,
4131 fletcher32: bool,
4132 expectedrows: int | None,
4133 ) -> dict[str, Any]:
4134 """create the description of the table from the axes & values"""
        # provide expectedrows if it was passed
4136 if expectedrows is None:
4137 expectedrows = max(self.nrows_expected, 10000)
4138
4139 d = {"name": "table", "expectedrows": expectedrows}
4140
4141 # description from the axes & values
4142 d["description"] = {a.cname: a.typ for a in self.axes}
4143
4144 if complib:
4145 if complevel is None:
4146 complevel = self._complevel or 9
4147 filters = _tables().Filters(
4148 complevel=complevel,
4149 complib=complib,
4150 fletcher32=fletcher32 or self._fletcher32,
4151 )
4152 d["filters"] = filters
4153 elif self._filters is not None:
4154 d["filters"] = self._filters
4155
4156 return d
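    # Illustrative sketch (hypothetical values): the options dict returned here
    # is passed straight to PyTables' ``create_table`` and looks roughly like
    #   {
    #       "name": "table",
    #       "expectedrows": 10000,
    #       "description": {"index": Int64Col(...), "values_block_0": Float64Col(...)},
    #       "filters": tables.Filters(complevel=9, complib="zlib"),  # if compression
    #   }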
4157
4158 def read_coordinates(
4159 self, where=None, start: int | None = None, stop: int | None = None
4160 ):
4161 """
4162 select coordinates (row numbers) from a table; return the
4163 coordinates object
4164 """
4165 # validate the version
4166 self.validate_version(where)
4167
4168 # infer the data kind
4169 if not self.infer_axes():
4170 return False
4171
4172 # create the selection
4173 selection = Selection(self, where=where, start=start, stop=stop)
4174 coords = selection.select_coords()
4175 if selection.filter is not None:
4176 for field, op, filt in selection.filter.format():
4177 data = self.read_column(
4178 field, start=coords.min(), stop=coords.max() + 1
4179 )
4180 coords = coords[op(data.iloc[coords - coords.min()], filt).values]
4181
4182 return Index(coords)
4183
4184 def read_column(
4185 self,
4186 column: str,
4187 where=None,
4188 start: int | None = None,
4189 stop: int | None = None,
4190 ):
4191 """
        return a single column from the table; generally only indexables
        are interesting
4194 """
4195 # validate the version
4196 self.validate_version()
4197
4198 # infer the data kind
4199 if not self.infer_axes():
4200 return False
4201
4202 if where is not None:
4203 raise TypeError("read_column does not currently accept a where clause")
4204
4205 # find the axes
4206 for a in self.axes:
4207 if column == a.name:
4208 if not a.is_data_indexable:
4209 raise ValueError(
4210 f"column [{column}] can not be extracted individually; "
4211 "it is not data indexable"
4212 )
4213
4214 # column must be an indexable or a data column
4215 c = getattr(self.table.cols, column)
4216 a.set_info(self.info)
4217 col_values = a.convert(
4218 c[start:stop],
4219 nan_rep=self.nan_rep,
4220 encoding=self.encoding,
4221 errors=self.errors,
4222 )
4223 return Series(_set_tz(col_values[1], a.tz), name=column, copy=False)
4224
4225 raise KeyError(f"column [{column}] not found in the table")
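    # Illustrative usage sketch (hedged): the public counterpart of this method
    # is ``HDFStore.select_column``, e.g.
    #   >>> store.select_column("df", "index")   # hypothetical store/key
    # which returns the stored index as a Series; asking for a column that is
    # not data-indexable raises ValueError as above.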
4226
4227
4228class WORMTable(Table):
4229 """
    a write-once read-many table: this format DOES NOT ALLOW appending to a
    table. Writing is a one-time operation; the data are stored in a format
    that allows for searching the data on disk
4233 """
4234
4235 table_type = "worm"
4236
4237 def read(
4238 self,
4239 where=None,
4240 columns=None,
4241 start: int | None = None,
4242 stop: int | None = None,
4243 ):
4244 """
4245 read the indices and the indexing array, calculate offset rows and return
4246 """
4247 raise NotImplementedError("WORMTable needs to implement read")
4248
4249 def write(self, **kwargs) -> None:
4250 """
4251 write in a format that we can search later on (but cannot append
4252 to): write out the indices and the values using _write_array
        (e.g. a CArray), then create an indexing table so that we can search
4254 """
4255 raise NotImplementedError("WORMTable needs to implement write")
4256
4257
4258class AppendableTable(Table):
4259 """support the new appendable table formats"""
4260
4261 table_type = "appendable"
4262
4263 # error: Signature of "write" incompatible with supertype "Fixed"
4264 def write( # type: ignore[override]
4265 self,
4266 obj,
4267 axes=None,
4268 append: bool = False,
4269 complib=None,
4270 complevel=None,
4271 fletcher32=None,
4272 min_itemsize=None,
4273 chunksize=None,
4274 expectedrows=None,
4275 dropna: bool = False,
4276 nan_rep=None,
4277 data_columns=None,
4278 track_times: bool = True,
4279 ) -> None:
4280 if not append and self.is_exists:
4281 self._handle.remove_node(self.group, "table")
4282
4283 # create the axes
4284 table = self._create_axes(
4285 axes=axes,
4286 obj=obj,
4287 validate=append,
4288 min_itemsize=min_itemsize,
4289 nan_rep=nan_rep,
4290 data_columns=data_columns,
4291 )
4292
4293 for a in table.axes:
4294 a.validate_names()
4295
4296 if not table.is_exists:
4297 # create the table
4298 options = table.create_description(
4299 complib=complib,
4300 complevel=complevel,
4301 fletcher32=fletcher32,
4302 expectedrows=expectedrows,
4303 )
4304
4305 # set the table attributes
4306 table.set_attrs()
4307
4308 options["track_times"] = track_times
4309
4310 # create the table
4311 table._handle.create_table(table.group, **options)
4312
4313 # update my info
4314 table.attrs.info = table.info
4315
4316 # validate the axes and set the kinds
4317 for a in table.axes:
4318 a.validate_and_set(table, append)
4319
4320 # add the rows
4321 table.write_data(chunksize, dropna=dropna)
4322
4323 def write_data(self, chunksize: int | None, dropna: bool = False) -> None:
4324 """
        form the data into a 2-d table including indexes, values and mask,
        then write it out chunk-by-chunk
4326 """
4327 names = self.dtype.names
4328 nrows = self.nrows_expected
4329
4330 # if dropna==True, then drop ALL nan rows
4331 masks = []
4332 if dropna:
4333 for a in self.values_axes:
4334 # figure the mask: only do if we can successfully process this
4335 # column, otherwise ignore the mask
4336 mask = isna(a.data).all(axis=0)
4337 if isinstance(mask, np.ndarray):
4338 masks.append(mask.astype("u1", copy=False))
4339
4340 # consolidate masks
4341 if len(masks):
4342 mask = masks[0]
4343 for m in masks[1:]:
4344 mask = mask & m
4345 mask = mask.ravel()
4346 else:
4347 mask = None
4348
4349 # broadcast the indexes if needed
4350 indexes = [a.cvalues for a in self.index_axes]
4351 nindexes = len(indexes)
        assert nindexes == 1, nindexes  # ensures we don't need to broadcast
4353
4354 # transpose the values so first dimension is last
4355 # reshape the values if needed
4356 values = [a.take_data() for a in self.values_axes]
4357 values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) for v in values]
4358 bvalues = []
4359 for i, v in enumerate(values):
4360 new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape
4361 bvalues.append(v.reshape(new_shape))
4362
4363 # write the chunks
4364 if chunksize is None:
4365 chunksize = 100000
4366
4367 rows = np.empty(min(chunksize, nrows), dtype=self.dtype)
4368 chunks = nrows // chunksize + 1
4369 for i in range(chunks):
4370 start_i = i * chunksize
4371 end_i = min((i + 1) * chunksize, nrows)
4372 if start_i >= end_i:
4373 break
4374
4375 self.write_data_chunk(
4376 rows,
4377 indexes=[a[start_i:end_i] for a in indexes],
4378 mask=mask[start_i:end_i] if mask is not None else None,
4379 values=[v[start_i:end_i] for v in bvalues],
4380 )
4381
4382 def write_data_chunk(
4383 self,
4384 rows: np.ndarray,
4385 indexes: list[np.ndarray],
4386 mask: npt.NDArray[np.bool_] | None,
4387 values: list[np.ndarray],
4388 ) -> None:
4389 """
4390 Parameters
4391 ----------
4392 rows : an empty memory space where we are putting the chunk
4393 indexes : an array of the indexes
4394 mask : an array of the masks
4395 values : an array of the values
4396 """
4397 # 0 len
4398 for v in values:
4399 if not np.prod(v.shape):
4400 return
4401
4402 nrows = indexes[0].shape[0]
4403 if nrows != len(rows):
4404 rows = np.empty(nrows, dtype=self.dtype)
4405 names = self.dtype.names
4406 nindexes = len(indexes)
4407
4408 # indexes
4409 for i, idx in enumerate(indexes):
4410 rows[names[i]] = idx
4411
4412 # values
4413 for i, v in enumerate(values):
4414 rows[names[i + nindexes]] = v
4415
4416 # mask
4417 if mask is not None:
4418 m = ~mask.ravel().astype(bool, copy=False)
4419 if not m.all():
4420 rows = rows[m]
4421
4422 if len(rows):
4423 self.table.append(rows)
4424 self.table.flush()
4425
4426 def delete(self, where=None, start: int | None = None, stop: int | None = None):
4427 # delete all rows (and return the nrows)
4428 if where is None or not len(where):
4429 if start is None and stop is None:
4430 nrows = self.nrows
4431 self._handle.remove_node(self.group, recursive=True)
4432 else:
4433 # pytables<3.0 would remove a single row with stop=None
4434 if stop is None:
4435 stop = self.nrows
4436 nrows = self.table.remove_rows(start=start, stop=stop)
4437 self.table.flush()
4438 return nrows
4439
4440 # infer the data kind
4441 if not self.infer_axes():
4442 return None
4443
4444 # create the selection
4445 table = self.table
4446 selection = Selection(self, where, start=start, stop=stop)
4447 values = selection.select_coords()
4448
4449 # delete the rows in reverse order
4450 sorted_series = Series(values, copy=False).sort_values()
4451 ln = len(sorted_series)
4452
4453 if ln:
4454 # construct groups of consecutive rows
4455 diff = sorted_series.diff()
4456 groups = list(diff[diff > 1].index)
4457
4458 # 1 group
4459 if not len(groups):
4460 groups = [0]
4461
4462 # final element
4463 if groups[-1] != ln:
4464 groups.append(ln)
4465
4466 # initial element
4467 if groups[0] != 0:
4468 groups.insert(0, 0)
4469
4470 # we must remove in reverse order!
4471 pg = groups.pop()
4472 for g in reversed(groups):
4473 rows = sorted_series.take(range(g, pg))
4474 table.remove_rows(
4475 start=rows[rows.index[0]], stop=rows[rows.index[-1]] + 1
4476 )
4477 pg = g
4478
4479 self.table.flush()
4480
4481 # return the number of rows removed
4482 return ln
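    # Illustrative sketch (assuming the selected coordinates are already in
    # row order): for coordinates [2, 3, 4, 10, 11] the diff-based grouping
    # yields the consecutive runs [2, 3, 4] and [10, 11]; rows 10-11 are
    # removed first and rows 2-4 second, so earlier removals cannot shift the
    # row numbers of runs that are still pending.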
4483
4484
4485class AppendableFrameTable(AppendableTable):
4486 """support the new appendable table formats"""
4487
4488 pandas_kind = "frame_table"
4489 table_type = "appendable_frame"
4490 ndim = 2
4491 obj_type: type[DataFrame | Series] = DataFrame
4492
4493 @property
4494 def is_transposed(self) -> bool:
4495 return self.index_axes[0].axis == 1
4496
4497 @classmethod
4498 def get_object(cls, obj, transposed: bool):
4499 """these are written transposed"""
4500 if transposed:
4501 obj = obj.T
4502 return obj
4503
4504 def read(
4505 self,
4506 where=None,
4507 columns=None,
4508 start: int | None = None,
4509 stop: int | None = None,
4510 ):
4511 # validate the version
4512 self.validate_version(where)
4513
4514 # infer the data kind
4515 if not self.infer_axes():
4516 return None
4517
4518 result = self._read_axes(where=where, start=start, stop=stop)
4519
4520 info = (
4521 self.info.get(self.non_index_axes[0][0], {})
4522 if len(self.non_index_axes)
4523 else {}
4524 )
4525
4526 inds = [i for i, ax in enumerate(self.axes) if ax is self.index_axes[0]]
4527 assert len(inds) == 1
4528 ind = inds[0]
4529
4530 index = result[ind][0]
4531
4532 frames = []
4533 for i, a in enumerate(self.axes):
4534 if a not in self.values_axes:
4535 continue
4536 index_vals, cvalues = result[i]
4537
4538 # we could have a multi-index constructor here
            # ensure_index doesn't recognize our list-of-tuples here
4540 if info.get("type") != "MultiIndex":
4541 cols = Index(index_vals)
4542 else:
4543 cols = MultiIndex.from_tuples(index_vals)
4544
4545 names = info.get("names")
4546 if names is not None:
4547 cols.set_names(names, inplace=True)
4548
4549 if self.is_transposed:
4550 values = cvalues
4551 index_ = cols
4552 cols_ = Index(index, name=getattr(index, "name", None))
4553 else:
4554 values = cvalues.T
4555 index_ = Index(index, name=getattr(index, "name", None))
4556 cols_ = cols
4557
4558 # if we have a DataIndexableCol, its shape will only be 1 dim
4559 if values.ndim == 1 and isinstance(values, np.ndarray):
4560 values = values.reshape((1, values.shape[0]))
4561
4562 if isinstance(values, np.ndarray):
4563 df = DataFrame(values.T, columns=cols_, index=index_, copy=False)
4564 elif isinstance(values, Index):
4565 df = DataFrame(values, columns=cols_, index=index_)
4566 else:
4567 # Categorical
4568 df = DataFrame._from_arrays([values], columns=cols_, index=index_)
4569 assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
4570 frames.append(df)
4571
4572 if len(frames) == 1:
4573 df = frames[0]
4574 else:
4575 df = concat(frames, axis=1)
4576
4577 selection = Selection(self, where=where, start=start, stop=stop)
4578 # apply the selection filters & axis orderings
4579 df = self.process_axes(df, selection=selection, columns=columns)
4580
4581 return df
4582
4583
4584class AppendableSeriesTable(AppendableFrameTable):
4585 """support the new appendable table formats"""
4586
4587 pandas_kind = "series_table"
4588 table_type = "appendable_series"
4589 ndim = 2
4590 obj_type = Series
4591
4592 @property
4593 def is_transposed(self) -> bool:
4594 return False
4595
4596 @classmethod
4597 def get_object(cls, obj, transposed: bool):
4598 return obj
4599
4600 def write(self, obj, data_columns=None, **kwargs):
4601 """we are going to write this as a frame table"""
4602 if not isinstance(obj, DataFrame):
4603 name = obj.name or "values"
4604 obj = obj.to_frame(name)
4605 return super().write(obj=obj, data_columns=obj.columns.tolist(), **kwargs)
4606
4607 def read(
4608 self,
4609 where=None,
4610 columns=None,
4611 start: int | None = None,
4612 stop: int | None = None,
4613 ) -> Series:
4614 is_multi_index = self.is_multi_index
4615 if columns is not None and is_multi_index:
4616 assert isinstance(self.levels, list) # needed for mypy
4617 for n in self.levels:
4618 if n not in columns:
4619 columns.insert(0, n)
4620 s = super().read(where=where, columns=columns, start=start, stop=stop)
4621 if is_multi_index:
4622 s.set_index(self.levels, inplace=True)
4623
4624 s = s.iloc[:, 0]
4625
4626 # remove the default name
4627 if s.name == "values":
4628 s.name = None
4629 return s
4630
4631
4632class AppendableMultiSeriesTable(AppendableSeriesTable):
4633 """support the new appendable table formats"""
4634
4635 pandas_kind = "series_table"
4636 table_type = "appendable_multiseries"
4637
4638 def write(self, obj, **kwargs):
4639 """we are going to write this as a frame table"""
4640 name = obj.name or "values"
4641 newobj, self.levels = self.validate_multiindex(obj)
4642 assert isinstance(self.levels, list) # for mypy
4643 cols = list(self.levels)
4644 cols.append(name)
4645 newobj.columns = Index(cols)
4646 return super().write(obj=newobj, **kwargs)
4647
4648
4649class GenericTable(AppendableFrameTable):
4650 """a table that read/writes the generic pytables table format"""
4651
4652 pandas_kind = "frame_table"
4653 table_type = "generic_table"
4654 ndim = 2
4655 obj_type = DataFrame
4656 levels: list[Hashable]
4657
4658 @property
4659 def pandas_type(self) -> str:
4660 return self.pandas_kind
4661
4662 @property
4663 def storable(self):
4664 return getattr(self.group, "table", None) or self.group
4665
4666 def get_attrs(self) -> None:
4667 """retrieve our attributes"""
4668 self.non_index_axes = []
4669 self.nan_rep = None
4670 self.levels = []
4671
4672 self.index_axes = [a for a in self.indexables if a.is_an_indexable]
4673 self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
4674 self.data_columns = [a.name for a in self.values_axes]
4675
4676 @cache_readonly
4677 def indexables(self):
4678 """create the indexables from the table description"""
4679 d = self.description
4680
4681 # TODO: can we get a typ for this? AFAICT it is the only place
4682 # where we aren't passing one
4683 # the index columns is just a simple index
4684 md = self.read_metadata("index")
4685 meta = "category" if md is not None else None
4686 index_col = GenericIndexCol(
4687 name="index", axis=0, table=self.table, meta=meta, metadata=md
4688 )
4689
4690 _indexables: list[GenericIndexCol | GenericDataIndexableCol] = [index_col]
4691
4692 for i, n in enumerate(d._v_names):
4693 assert isinstance(n, str)
4694
4695 atom = getattr(d, n)
4696 md = self.read_metadata(n)
4697 meta = "category" if md is not None else None
4698 dc = GenericDataIndexableCol(
4699 name=n,
4700 pos=i,
4701 values=[n],
4702 typ=atom,
4703 table=self.table,
4704 meta=meta,
4705 metadata=md,
4706 )
4707 _indexables.append(dc)
4708
4709 return _indexables
4710
4711 def write(self, **kwargs):
        raise NotImplementedError("cannot write on a generic table")
4713
4714
4715class AppendableMultiFrameTable(AppendableFrameTable):
4716 """a frame with a multi-index"""
4717
4718 table_type = "appendable_multiframe"
4719 obj_type = DataFrame
4720 ndim = 2
4721 _re_levels = re.compile(r"^level_\d+$")
4722
4723 @property
4724 def table_type_short(self) -> str:
4725 return "appendable_multi"
4726
4727 def write(self, obj, data_columns=None, **kwargs):
4728 if data_columns is None:
4729 data_columns = []
4730 elif data_columns is True:
4731 data_columns = obj.columns.tolist()
4732 obj, self.levels = self.validate_multiindex(obj)
4733 assert isinstance(self.levels, list) # for mypy
4734 for n in self.levels:
4735 if n not in data_columns:
4736 data_columns.insert(0, n)
4737 return super().write(obj=obj, data_columns=data_columns, **kwargs)
4738
4739 def read(
4740 self,
4741 where=None,
4742 columns=None,
4743 start: int | None = None,
4744 stop: int | None = None,
4745 ):
4746 df = super().read(where=where, columns=columns, start=start, stop=stop)
4747 df = df.set_index(self.levels)
4748
4749 # remove names for 'level_%d'
4750 df.index = df.index.set_names(
4751 [None if self._re_levels.search(name) else name for name in df.index.names]
4752 )
4753
4754 return df
4755
4756
4757def _reindex_axis(
4758 obj: DataFrame, axis: AxisInt, labels: Index, other=None
4759) -> DataFrame:
4760 ax = obj._get_axis(axis)
4761 labels = ensure_index(labels)
4762
4763 # try not to reindex even if other is provided
4764 # if it equals our current index
4765 if other is not None:
4766 other = ensure_index(other)
4767 if (other is None or labels.equals(other)) and labels.equals(ax):
4768 return obj
4769
4770 labels = ensure_index(labels.unique())
4771 if other is not None:
4772 labels = ensure_index(other.unique()).intersection(labels, sort=False)
4773 if not labels.equals(ax):
4774 slicer: list[slice | Index] = [slice(None, None)] * obj.ndim
4775 slicer[axis] = labels
4776 obj = obj.loc[tuple(slicer)]
4777 return obj
4778
4779
4780# tz to/from coercion
4781
4782
4783def _get_tz(tz: tzinfo) -> str | tzinfo:
4784 """for a tz-aware type, return an encoded zone"""
4785 zone = timezones.get_timezone(tz)
4786 return zone
4787
4788
4789@overload
4790def _set_tz(
4791 values: np.ndarray | Index, tz: str | tzinfo, coerce: bool = False
4792) -> DatetimeIndex:
4793 ...
4794
4795
4796@overload
4797def _set_tz(values: np.ndarray | Index, tz: None, coerce: bool = False) -> np.ndarray:
4798 ...
4799
4800
4801def _set_tz(
4802 values: np.ndarray | Index, tz: str | tzinfo | None, coerce: bool = False
4803) -> np.ndarray | DatetimeIndex:
4804 """
4805 coerce the values to a DatetimeIndex if tz is set
4806 preserve the input shape if possible
4807
4808 Parameters
4809 ----------
4810 values : ndarray or Index
4811 tz : str or tzinfo
4812 coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray
4813 """
4814 if isinstance(values, DatetimeIndex):
4815 # If values is tzaware, the tz gets dropped in the values.ravel()
4816 # call below (which returns an ndarray). So we are only non-lossy
4817 # if `tz` matches `values.tz`.
4818 assert values.tz is None or values.tz == tz
4819
4820 if tz is not None:
4821 if isinstance(values, DatetimeIndex):
4822 name = values.name
4823 values = values.asi8
4824 else:
4825 name = None
4826 values = values.ravel()
4827
4828 tz = _ensure_decoded(tz)
4829 values = DatetimeIndex(values, name=name)
4830 values = values.tz_localize("UTC").tz_convert(tz)
4831 elif coerce:
4832 values = np.asarray(values, dtype="M8[ns]")
4833
4834 # error: Incompatible return value type (got "Union[ndarray, Index]",
4835 # expected "Union[ndarray, DatetimeIndex]")
4836 return values # type: ignore[return-value]
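# Illustrative sketch (hypothetical values): i8 values read back from a table
# are re-localized when a zone was stored, roughly
#   >>> _set_tz(np.array([0]), tz="US/Eastern")   # epoch in i8 nanoseconds
#   DatetimeIndex(['1969-12-31 19:00:00-05:00'], ...)
# while ``tz=None`` with ``coerce=True`` just returns an M8[ns] ndarray.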
4837
4838
4839def _convert_index(name: str, index: Index, encoding: str, errors: str) -> IndexCol:
4840 assert isinstance(name, str)
4841
4842 index_name = index.name
4843 # error: Argument 1 to "_get_data_and_dtype_name" has incompatible type "Index";
4844 # expected "Union[ExtensionArray, ndarray]"
4845 converted, dtype_name = _get_data_and_dtype_name(index) # type: ignore[arg-type]
4846 kind = _dtype_to_kind(dtype_name)
4847 atom = DataIndexableCol._get_atom(converted)
4848
4849 if (
4850 (isinstance(index.dtype, np.dtype) and is_integer_dtype(index))
4851 or needs_i8_conversion(index.dtype)
4852 or is_bool_dtype(index.dtype)
4853 ):
4854 # Includes Index, RangeIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex,
4855 # in which case "kind" is "integer", "integer", "datetime64",
4856 # "timedelta64", and "integer", respectively.
4857 return IndexCol(
4858 name,
4859 values=converted,
4860 kind=kind,
4861 typ=atom,
4862 freq=getattr(index, "freq", None),
4863 tz=getattr(index, "tz", None),
4864 index_name=index_name,
4865 )
4866
4867 if isinstance(index, MultiIndex):
4868 raise TypeError("MultiIndex not supported here!")
4869
4870 inferred_type = lib.infer_dtype(index, skipna=False)
4871 # we won't get inferred_type of "datetime64" or "timedelta64" as these
4872 # would go through the DatetimeIndex/TimedeltaIndex paths above
4873
4874 values = np.asarray(index)
4875
4876 if inferred_type == "date":
4877 converted = np.asarray([v.toordinal() for v in values], dtype=np.int32)
4878 return IndexCol(
4879 name, converted, "date", _tables().Time32Col(), index_name=index_name
4880 )
4881 elif inferred_type == "string":
4882 converted = _convert_string_array(values, encoding, errors)
4883 itemsize = converted.dtype.itemsize
4884 return IndexCol(
4885 name,
4886 converted,
4887 "string",
4888 _tables().StringCol(itemsize),
4889 index_name=index_name,
4890 )
4891
4892 elif inferred_type in ["integer", "floating"]:
4893 return IndexCol(
4894 name, values=converted, kind=kind, typ=atom, index_name=index_name
4895 )
4896 else:
4897 assert isinstance(converted, np.ndarray) and converted.dtype == object
4898 assert kind == "object", kind
4899 atom = _tables().ObjectAtom()
4900 return IndexCol(name, converted, kind, atom, index_name=index_name)
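# Illustrative sketch (hypothetical indexes) of the branches above:
#   DatetimeIndex/TimedeltaIndex/PeriodIndex -> first branch; i8 values with
#       kind "datetime64"/"timedelta64"/"integer", tz/freq kept as attributes
#   Index of datetime.date objects           -> ordinals in a Time32Col, kind "date"
#   Index(["a", "bb"])                       -> fixed-width StringCol(2), kind "string"
#   remaining object dtype                   -> ObjectAtom (pickled), kind "object"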
4901
4902
4903def _unconvert_index(data, kind: str, encoding: str, errors: str) -> np.ndarray | Index:
4904 index: Index | np.ndarray
4905
4906 if kind == "datetime64":
4907 index = DatetimeIndex(data)
4908 elif kind == "timedelta64":
4909 index = TimedeltaIndex(data)
4910 elif kind == "date":
4911 try:
4912 index = np.asarray([date.fromordinal(v) for v in data], dtype=object)
4913 except ValueError:
4914 index = np.asarray([date.fromtimestamp(v) for v in data], dtype=object)
4915 elif kind in ("integer", "float", "bool"):
4916 index = np.asarray(data)
4917 elif kind in ("string"):
4918 index = _unconvert_string_array(
4919 data, nan_rep=None, encoding=encoding, errors=errors
4920 )
4921 elif kind == "object":
4922 index = np.asarray(data[0])
4923 else: # pragma: no cover
4924 raise ValueError(f"unrecognized index type {kind}")
4925 return index
4926
4927
4928def _maybe_convert_for_string_atom(
4929 name: str,
4930 bvalues: ArrayLike,
4931 existing_col,
4932 min_itemsize,
4933 nan_rep,
4934 encoding,
4935 errors,
4936 columns: list[str],
4937):
4938 if bvalues.dtype != object:
4939 return bvalues
4940
4941 bvalues = cast(np.ndarray, bvalues)
4942
4943 dtype_name = bvalues.dtype.name
4944 inferred_type = lib.infer_dtype(bvalues, skipna=False)
4945
4946 if inferred_type == "date":
4947 raise TypeError("[date] is not implemented as a table column")
4948 if inferred_type == "datetime":
4949 # after GH#8260
4950 # this only would be hit for a multi-timezone dtype which is an error
4951 raise TypeError(
4952 "too many timezones in this block, create separate data columns"
4953 )
4954
4955 if not (inferred_type == "string" or dtype_name == "object"):
4956 return bvalues
4957
4958 mask = isna(bvalues)
4959 data = bvalues.copy()
4960 data[mask] = nan_rep
4961
4962 # see if we have a valid string type
4963 inferred_type = lib.infer_dtype(data, skipna=False)
4964 if inferred_type != "string":
4965 # we cannot serialize this data, so report an exception on a column
4966 # by column basis
4967
4968 # expected behaviour:
4969 # search block for a non-string object column by column
4970 for i in range(data.shape[0]):
4971 col = data[i]
4972 inferred_type = lib.infer_dtype(col, skipna=False)
4973 if inferred_type != "string":
4974 error_column_label = columns[i] if len(columns) > i else f"No.{i}"
4975 raise TypeError(
4976 f"Cannot serialize the column [{error_column_label}]\n"
4977 f"because its data contents are not [string] but "
4978 f"[{inferred_type}] object dtype"
4979 )
4980
4981 # itemsize is the maximum length of a string (along any dimension)
4982
4983 data_converted = _convert_string_array(data, encoding, errors).reshape(data.shape)
4984 itemsize = data_converted.itemsize
4985
4986 # specified min_itemsize?
4987 if isinstance(min_itemsize, dict):
4988 min_itemsize = int(min_itemsize.get(name) or min_itemsize.get("values") or 0)
4989 itemsize = max(min_itemsize or 0, itemsize)
4990
4991 # check for column in the values conflicts
4992 if existing_col is not None:
4993 eci = existing_col.validate_col(itemsize)
4994 if eci is not None and eci > itemsize:
4995 itemsize = eci
4996
4997 data_converted = data_converted.astype(f"|S{itemsize}", copy=False)
4998 return data_converted
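# Illustrative sketch (hypothetical sizes) of the itemsize resolution above:
# if the longest encoded string in the block is 4 bytes, min_itemsize is
# {"values": 10}, and the existing column was written with itemsize 12, then
#   itemsize = max(10, 4) -> 10, bumped to 12 by existing_col.validate_col,
# and the block is returned as a "|S12" fixed-width byte array.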
4999
5000
5001def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.ndarray:
5002 """
5003 Take a string-like that is object dtype and coerce to a fixed size string type.
5004
5005 Parameters
5006 ----------
5007 data : np.ndarray[object]
5008 encoding : str
5009 errors : str
5010 Handler for encoding errors.
5011
5012 Returns
5013 -------
5014 np.ndarray[fixed-length-string]
5015 """
5016 # encode if needed
5017 if len(data):
5018 data = (
5019 Series(data.ravel(), copy=False)
5020 .str.encode(encoding, errors)
5021 ._values.reshape(data.shape)
5022 )
5023
5024 # create the sized dtype
5025 ensured = ensure_object(data.ravel())
5026 itemsize = max(1, libwriters.max_len_string_array(ensured))
5027
5028 data = np.asarray(data, dtype=f"S{itemsize}")
5029 return data
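# Illustrative sketch:
#   >>> arr = np.array(["a", "bb", "ccc"], dtype=object)
#   >>> _convert_string_array(arr, "UTF-8", "strict")
#   array([b'a', b'bb', b'ccc'], dtype='|S3')
# i.e. elements are encoded and the result is widened to the longest element.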
5030
5031
5032def _unconvert_string_array(
5033 data: np.ndarray, nan_rep, encoding: str, errors: str
5034) -> np.ndarray:
5035 """
5036 Inverse of _convert_string_array.
5037
5038 Parameters
5039 ----------
5040 data : np.ndarray[fixed-length-string]
5041 nan_rep : the storage repr of NaN
5042 encoding : str
5043 errors : str
5044 Handler for encoding errors.
5045
5046 Returns
5047 -------
5048 np.ndarray[object]
5049 Decoded data.
5050 """
5051 shape = data.shape
5052 data = np.asarray(data.ravel(), dtype=object)
5053
5054 if len(data):
5055 itemsize = libwriters.max_len_string_array(ensure_object(data))
5056 dtype = f"U{itemsize}"
5057
5058 if isinstance(data[0], bytes):
5059 data = Series(data, copy=False).str.decode(encoding, errors=errors)._values
5060 else:
5061 data = data.astype(dtype, copy=False).astype(object, copy=False)
5062
5063 if nan_rep is None:
5064 nan_rep = "nan"
5065
5066 libwriters.string_array_replace_from_nan_rep(data, nan_rep)
5067 return data.reshape(shape)
5068
5069
5070def _maybe_convert(values: np.ndarray, val_kind: str, encoding: str, errors: str):
5071 assert isinstance(val_kind, str), type(val_kind)
5072 if _need_convert(val_kind):
5073 conv = _get_converter(val_kind, encoding, errors)
5074 values = conv(values)
5075 return values
5076
5077
5078def _get_converter(kind: str, encoding: str, errors: str):
5079 if kind == "datetime64":
5080 return lambda x: np.asarray(x, dtype="M8[ns]")
5081 elif kind == "string":
5082 return lambda x: _unconvert_string_array(
5083 x, nan_rep=None, encoding=encoding, errors=errors
5084 )
5085 else: # pragma: no cover
5086 raise ValueError(f"invalid kind {kind}")
5087
5088
5089def _need_convert(kind: str) -> bool:
5090 if kind in ("datetime64", "string"):
5091 return True
5092 return False
5093
5094
5095def _maybe_adjust_name(name: str, version: Sequence[int]) -> str:
5096 """
    Prior to 0.10.1, values blocks were named like ``values_0`` rather than
    ``values_block_0``; adjust the given name if necessary to match.
5099
5100 Parameters
5101 ----------
5102 name : str
    version : Sequence[int]
5104
5105 Returns
5106 -------
5107 str
5108 """
5109 if isinstance(version, str) or len(version) < 3:
5110 raise ValueError("Version is incorrect, expected sequence of 3 integers.")
5111
5112 if version[0] == 0 and version[1] <= 10 and version[2] == 0:
5113 m = re.search(r"values_block_(\d+)", name)
5114 if m:
5115 grp = m.groups()[0]
5116 name = f"values_{grp}"
5117 return name
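# Illustrative sketch of the version check above:
#   >>> _maybe_adjust_name("values_block_0", (0, 10, 0))
#   'values_0'
#   >>> _maybe_adjust_name("values_block_0", (0, 15, 2))
#   'values_block_0'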
5118
5119
5120def _dtype_to_kind(dtype_str: str) -> str:
5121 """
5122 Find the "kind" string describing the given dtype name.
5123 """
5124 dtype_str = _ensure_decoded(dtype_str)
5125
5126 if dtype_str.startswith("string") or dtype_str.startswith("bytes"):
5127 kind = "string"
5128 elif dtype_str.startswith("float"):
5129 kind = "float"
5130 elif dtype_str.startswith("complex"):
5131 kind = "complex"
5132 elif dtype_str.startswith("int") or dtype_str.startswith("uint"):
5133 kind = "integer"
5134 elif dtype_str.startswith("datetime64"):
5135 kind = "datetime64"
5136 elif dtype_str.startswith("timedelta"):
5137 kind = "timedelta64"
5138 elif dtype_str.startswith("bool"):
5139 kind = "bool"
5140 elif dtype_str.startswith("category"):
5141 kind = "category"
5142 elif dtype_str.startswith("period"):
5143 # We store the `freq` attr so we can restore from integers
5144 kind = "integer"
5145 elif dtype_str == "object":
5146 kind = "object"
5147 else:
5148 raise ValueError(f"cannot interpret dtype of [{dtype_str}]")
5149
5150 return kind
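# Illustrative sketch of the mapping above:
#   >>> _dtype_to_kind("float64")
#   'float'
#   >>> _dtype_to_kind("datetime64[ns, UTC]")
#   'datetime64'
#   >>> _dtype_to_kind("period[M]")  # stored as integers; freq restored separately
#   'integer'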
5151
5152
5153def _get_data_and_dtype_name(data: ArrayLike):
5154 """
5155 Convert the passed data into a storable form and a dtype string.
5156 """
5157 if isinstance(data, Categorical):
5158 data = data.codes
5159
    # For datetime64tz we need to drop the TZ in tests. TODO: why?
5161 dtype_name = data.dtype.name.split("[")[0]
5162
5163 if data.dtype.kind in ["m", "M"]:
5164 data = np.asarray(data.view("i8"))
5165 # TODO: we used to reshape for the dt64tz case, but no longer
5166 # doing that doesn't seem to break anything. why?
5167
5168 elif isinstance(data, PeriodIndex):
5169 data = data.asi8
5170
5171 data = np.asarray(data)
5172 return data, dtype_name
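# Illustrative sketch (hypothetical arrays): datetime-like data are returned as
# their i8 view with the unit/tz stripped from the dtype name, e.g. a
# ``datetime64[ns, UTC]`` array yields (int64 ndarray, "datetime64"); a
# Categorical is reduced to its integer codes before being stored.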
5173
5174
5175class Selection:
5176 """
5177 Carries out a selection operation on a tables.Table object.
5178
5179 Parameters
5180 ----------
5181 table : a Table object
5182 where : list of Terms (or convertible to)
5183 start, stop: indices to start and/or stop selection
5184
5185 """
5186
5187 def __init__(
5188 self,
5189 table: Table,
5190 where=None,
5191 start: int | None = None,
5192 stop: int | None = None,
5193 ) -> None:
5194 self.table = table
5195 self.where = where
5196 self.start = start
5197 self.stop = stop
5198 self.condition = None
5199 self.filter = None
5200 self.terms = None
5201 self.coordinates = None
5202
5203 if is_list_like(where):
            # see if we were passed coordinate-like values (mask or integer locations)
5205 with suppress(ValueError):
5206 inferred = lib.infer_dtype(where, skipna=False)
5207 if inferred in ("integer", "boolean"):
5208 where = np.asarray(where)
5209 if where.dtype == np.bool_:
5210 start, stop = self.start, self.stop
5211 if start is None:
5212 start = 0
5213 if stop is None:
5214 stop = self.table.nrows
5215 self.coordinates = np.arange(start, stop)[where]
5216 elif issubclass(where.dtype.type, np.integer):
5217 if (self.start is not None and (where < self.start).any()) or (
5218 self.stop is not None and (where >= self.stop).any()
5219 ):
5220 raise ValueError(
5221 "where must have index locations >= start and < stop"
5222 )
5223 self.coordinates = where
5224
5225 if self.coordinates is None:
5226 self.terms = self.generate(where)
5227
5228 # create the numexpr & the filter
5229 if self.terms is not None:
5230 self.condition, self.filter = self.terms.evaluate()
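    # Illustrative sketch (hypothetical inputs) of the accepted ``where`` forms:
    #   Selection(tbl, where="index > 5")   -> parsed into Terms -> condition/filter
    #   Selection(tbl, where=[0, 3, 7])     -> taken as explicit row coordinates
    #   Selection(tbl, where=bool_mask)     -> positions of True within [start, stop)
    # Only the expression form produces a numexpr condition; the other two
    # short-circuit into ``self.coordinates``.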
5231
5232 def generate(self, where):
5233 """where can be a : dict,list,tuple,string"""
5234 if where is None:
5235 return None
5236
5237 q = self.table.queryables()
5238 try:
5239 return PyTablesExpr(where, queryables=q, encoding=self.table.encoding)
5240 except NameError as err:
5241 # raise a nice message, suggesting that the user should use
5242 # data_columns
5243 qkeys = ",".join(q.keys())
5244 msg = dedent(
5245 f"""\
5246 The passed where expression: {where}
                contains an invalid variable reference.
                All of the variable references must be a reference to
                an axis (e.g. 'index' or 'columns') or a data_column.
                The currently defined references are: {qkeys}
5251 """
5252 )
5253 raise ValueError(msg) from err
5254
5255 def select(self):
5256 """
5257 generate the selection
5258 """
5259 if self.condition is not None:
5260 return self.table.table.read_where(
5261 self.condition.format(), start=self.start, stop=self.stop
5262 )
5263 elif self.coordinates is not None:
5264 return self.table.table.read_coordinates(self.coordinates)
5265 return self.table.table.read(start=self.start, stop=self.stop)
5266
5267 def select_coords(self):
5268 """
5269 generate the selection
5270 """
5271 start, stop = self.start, self.stop
5272 nrows = self.table.nrows
5273 if start is None:
5274 start = 0
5275 elif start < 0:
5276 start += nrows
5277 if stop is None:
5278 stop = nrows
5279 elif stop < 0:
5280 stop += nrows
5281
5282 if self.condition is not None:
5283 return self.table.table.get_where_list(
5284 self.condition.format(), start=start, stop=stop, sort=True
5285 )
5286 elif self.coordinates is not None:
5287 return self.coordinates
5288
5289 return np.arange(start, stop)
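    # Illustrative sketch (hypothetical sizes): negative bounds are normalized
    # against the table length, e.g. with nrows=100, start=-10, stop=None the
    # selection covers rows 90..99; without a condition or explicit coordinates
    # this simply returns ``np.arange(90, 100)``.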