1"""Here is defined the Table class."""
3import functools
4import math
5import operator
6import platform
7import sys
8import warnings
9from pathlib import Path
11from time import perf_counter as clock
13import numexpr as ne
14import numpy as np
16from . import tableextension
17from .lrucacheextension import ObjectCache, NumCache
18from .atom import Atom
19from .conditions import compile_condition
20from .flavor import flavor_of, array_as_internal, internal_to_flavor
21from .utils import is_idx, lazyattr, SizeType, NailedDict as CacheDict
22from .leaf import Leaf
23from .description import (IsDescription, Description, Col, descr_from_dtype)
24from .exceptions import (
25 NodeError, HDF5ExtError, PerformanceWarning, OldIndexWarning,
26 NoSuchNodeError)
27from .utilsextension import get_nested_field
29from .path import join_path, split_path
30from .index import (
31 OldIndex, default_index_filters, default_auto_index, Index, IndexesDescG,
32 IndexesTableG)
35profile = False
36# profile = True # Uncomment for profiling
37if profile:
38 from .utils import show_stats


# 2.2: Added support for complex types.  Introduced in version 0.9.
# 2.2.1: Added support for time types.
# 2.3: Changed the indexes naming schema.
# 2.4: Changed indexes naming schema (again).
# 2.5: Added the FIELD_%d_FILL attributes.
# 2.6: Added the FLAVOR attribute (optional).
# 2.7: Numeric and numarray flavors are gone.

obversion = "2.7"  # The Table VERSION number


# Maps NumPy types to the types used by Numexpr.
_nxtype_from_nptype = {
    np.bool_: bool,
    np.int8: ne.necompiler.int_,
    np.int16: ne.necompiler.int_,
    np.int32: ne.necompiler.int_,
    np.int64: ne.necompiler.long_,
    np.uint8: ne.necompiler.int_,
    np.uint16: ne.necompiler.int_,
    np.uint32: ne.necompiler.long_,
    np.uint64: ne.necompiler.long_,
    np.float32: float,
    np.float64: ne.necompiler.double,
    np.complex64: complex,
    np.complex128: complex,
    np.bytes_: bytes,
}

_nxtype_from_nptype[np.str_] = str

if hasattr(np, 'float16'):
    _nxtype_from_nptype[np.float16] = float  # XXX: check
if hasattr(np, 'float96'):
    _nxtype_from_nptype[np.float96] = ne.necompiler.double  # XXX: check
if hasattr(np, 'float128'):
    _nxtype_from_nptype[np.float128] = ne.necompiler.double  # XXX: check
if hasattr(np, 'complex192'):
    _nxtype_from_nptype[np.complex192] = complex  # XXX: check
if hasattr(np, 'complex256'):
    _nxtype_from_nptype[np.complex256] = complex  # XXX: check
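
# Usage sketch (illustrative only): Table._compile_condition() below turns a
# column's NumPy scalar type into its Numexpr counterpart with a plain
# dictionary lookup, e.g. _nxtype_from_nptype[np.int32] -> ne.necompiler.int_.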

# The NumPy scalar type corresponding to `SizeType`.
_npsizetype = np.array(SizeType(0)).dtype.type


def _index_name_of(node):
    return '_i_%s' % node._v_name


def _index_pathname_of(node):
    nodeParentPath = split_path(node._v_pathname)[0]
    return join_path(nodeParentPath, _index_name_of(node))


def _index_pathname_of_column(table, colpathname):
    return join_path(_index_pathname_of(table), colpathname)


# The next are versions that work with just paths (i.e. we don't need
# a node instance for using them, which can be critical in certain
# situations)


def _index_name_of_(nodeName):
    return '_i_%s' % nodeName


def _index_pathname_of_(nodePath):
    nodeParentPath, nodeName = split_path(nodePath)
    return join_path(nodeParentPath, _index_name_of_(nodeName))


def _index_pathname_of_column_(tablePath, colpathname):
    return join_path(_index_pathname_of_(tablePath), colpathname)
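
# A quick illustration of how the helpers above compose index paths
# (hypothetical node names, shown only as a sketch):
#
#   _index_name_of_('mytable')                        -> '_i_mytable'
#   _index_pathname_of_('/group/mytable')             -> '/group/_i_mytable'
#   _index_pathname_of_column_('/group/mytable', 'x') -> '/group/_i_mytable/x'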


def restorecache(self):
    # Define a cache for sparse table reads
    params = self._v_file.params
    chunksize = self._v_chunkshape[0]
    nslots = params['TABLE_MAX_SIZE'] / (chunksize * self._v_dtype.itemsize)
    self._chunkcache = NumCache((nslots, chunksize), self._v_dtype,
                                'table chunk cache')
    self._seqcache = ObjectCache(params['ITERSEQ_MAX_SLOTS'],
                                 params['ITERSEQ_MAX_SIZE'],
                                 'Iter sequence cache')
    self._dirtycache = False
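
# Cache-sizing arithmetic for restorecache() (illustrative numbers, not
# enforced defaults): with TABLE_MAX_SIZE = 1 MiB, a chunkshape of (512,)
# and 16-byte rows, nslots = 1048576 / (512 * 16) = 128 chunk slots.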


def _table__where_indexed(self, compiled, condition, condvars,
                          start, stop, step):
    if profile:
        tref = clock()
    if profile:
        show_stats("Entering table_whereIndexed", tref)
    self._use_index = True
    # Clean the table caches for indexed queries if needed
    if self._dirtycache:
        restorecache(self)

    # Get the values in expression that are not columns
    values = []
    for key, value in condvars.items():
        if isinstance(value, np.ndarray):
            values.append((key, value.item()))
    # Build a key for the sequence cache
    seqkey = (condition, tuple(values), (start, stop, step))
    # Do a lookup in sequential cache for this query
    nslot = self._seqcache.getslot(seqkey)
    if nslot >= 0:
        # Get the row sequence from the cache
        seq = self._seqcache.getitem(nslot)
        if len(seq) == 0:
            return iter([])
        # seq is a list.
        seq = np.array(seq, dtype='int64')
        # Correct the ranges in cached sequence
        if (start, stop, step) != (0, self.nrows, 1):
            seq = seq[(seq >= start) & (seq < stop) &
                      ((seq - start) % step == 0)]
        return self.itersequence(seq)
    else:
        # No luck.  self._seqcache will be populated
        # in the iterator if possible.  (Row._finish_riterator)
        self._seqcache_key = seqkey

    # Compute the chunkmap for every index in indexed expression
    idxexprs = compiled.index_expressions
    strexpr = compiled.string_expression
    cmvars = {}
    tcoords = 0
    for i, idxexpr in enumerate(idxexprs):
        var, ops, lims = idxexpr
        col = condvars[var]
        index = col.index
        assert index is not None, "the chosen column is not indexed"
        assert not index.dirty, "the chosen column has a dirty index"

        # Get the number of rows that the indexed condition yields.
        range_ = index.get_lookup_range(ops, lims)
        ncoords = index.search(range_)
        tcoords += ncoords
        if index.reduction == 1 and ncoords == 0:
            # No values from index condition, thus the chunkmap should
            # be empty
            nrowsinchunk = self.chunkshape[0]
            nchunks = math.ceil(self.nrows / nrowsinchunk)
            chunkmap = np.zeros(shape=nchunks, dtype="bool")
        else:
            # Get the chunkmap from the index
            chunkmap = index.get_chunkmap()
        # Assign the chunkmap to the cmvars dictionary
        cmvars["e%d" % i] = chunkmap

    if index.reduction == 1 and tcoords == 0:
        # No candidates found in any indexed expression component,
        # so leave now
        self._seqcache.setitem(seqkey, [], 1)
        return iter([])

    # Compute the final chunkmap
    chunkmap = ne.evaluate(strexpr, cmvars)
    if not chunkmap.any():
        # The chunkmap is all False, so the result is empty
        self._seqcache.setitem(seqkey, [], 1)
        return iter([])

    if profile:
        show_stats("Exiting table_whereIndexed", tref)
    return chunkmap
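
# How the final chunkmap above is obtained (a hedged sketch): for a condition
# such as '(c1 > 0) & (c2 < 10)' with both columns indexed, the loop stores
# one boolean chunk-level bitmap per index component in cmvars ('e0', 'e1',
# ...) and compiled.string_expression combines them (e.g. "(e0 & e1)"), so
# ne.evaluate() yields a single boolean map of the chunks worth reading.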


def create_indexes_table(table):
    itgroup = IndexesTableG(
        table._v_parent, _index_name_of(table),
        "Indexes container for table " + table._v_pathname, new=True)
    return itgroup


def create_indexes_descr(igroup, dname, iname, filters):
    idgroup = IndexesDescG(
        igroup, iname,
        "Indexes container for sub-description " + dname,
        filters=filters, new=True)
    return idgroup


def _column__create_index(self, optlevel, kind, filters, tmp_dir,
                          blocksizes, verbose):
    name = self.name
    table = self.table
    dtype = self.dtype
    descr = self.descr
    index = self.index
    get_node = table._v_file._get_node

    # Warn if the index already exists
    if index:
        raise ValueError("%s for column '%s' already exists.  If you want "
                         "to re-create it, please use the reindex() method "
                         "instead" % (str(index), str(self.pathname)))

    # Check that the datatype is indexable.
    if dtype.str[1:] == 'u8':
        raise NotImplementedError(
            "indexing 64-bit unsigned integer columns "
            "is not supported yet, sorry")
    if dtype.kind == 'c':
        raise TypeError("complex columns can not be indexed")
    if dtype.shape != ():
        raise TypeError("multidimensional columns can not be indexed")

    # Get the indexes group for table, and if it does not exist, create it
    try:
        itgroup = get_node(_index_pathname_of(table))
    except NoSuchNodeError:
        itgroup = create_indexes_table(table)

    # Create the necessary intermediate groups for descriptors
    idgroup = itgroup
    dname = ""
    pathname = descr._v_pathname
    if pathname != '':
        inames = pathname.split('/')
        for iname in inames:
            if dname == '':
                dname = iname
            else:
                dname += '/' + iname
            try:
                idgroup = get_node(f'{itgroup._v_pathname}/{dname}')
            except NoSuchNodeError:
                idgroup = create_indexes_descr(idgroup, dname, iname, filters)

    # Create the atom
    assert dtype.shape == ()
    atom = Atom.from_dtype(np.dtype((dtype, (0,))))

    # Protection on tables larger than the expected rows (perhaps the
    # user forgot to pass this parameter to the Table constructor?)
    expectedrows = table._v_expectedrows
    if table.nrows > expectedrows:
        expectedrows = table.nrows

    # Create the index itself
    index = Index(
        idgroup, name, atom=atom,
        title="Index for %s column" % name,
        kind=kind,
        optlevel=optlevel,
        filters=filters,
        tmp_dir=tmp_dir,
        expectedrows=expectedrows,
        byteorder=table.byteorder,
        blocksizes=blocksizes)

    table._set_column_indexing(self.pathname, True)

    # Feed the index with values

    # Add rows to the index if necessary
    if table.nrows > 0:
        indexedrows = table._add_rows_to_index(
            self.pathname, 0, table.nrows, lastrow=True, update=False)
    else:
        indexedrows = 0
    index.dirty = False
    table._indexedrows = indexedrows
    table._unsaved_indexedrows = table.nrows - indexedrows

    # Optimize the index that has been already filled-up
    index.optimize(verbose=verbose)

    # We cannot do a flush here because when reindexing during a
    # flush, the indexes are created anew, and that creates a nested
    # call to flush().
    # table.flush()

    return indexedrows


class _ColIndexes(dict):
    """Provides a nice representation of column indexes."""

    def __repr__(self):
        """Gives a detailed Description column representation."""

        rep = [f'  "{k}": {v}' for k, v in self.items()]
        return '{\n  %s}' % (',\n  '.join(rep))


class Table(tableextension.Table, Leaf):
    """This class represents heterogeneous datasets in an HDF5 file.

    Tables are leaves (see the Leaf class in :ref:`LeafClassDescr`) whose
    data consists of a unidimensional sequence of *rows*, where each row
    contains one or more *fields*.  Fields have an associated unique *name*
    and *position*, with the first field having position 0.  All rows have
    the same fields, which are arranged in *columns*.

    Fields can have any type supported by the Col class (see
    :ref:`ColClassDescr`) and its descendants, which support
    multidimensional data.  Moreover, a field can be *nested* (to an
    arbitrary depth), meaning that it includes further fields inside.  A
    field named x inside a nested field a in a table can be accessed as the
    field a/x (its *path name*) from the table.

    The structure of a table is declared by its description, which is made
    available in the Table.description attribute (see :class:`Table`).

    This class provides new methods to read, write and search table data
    efficiently.  It also provides special Python methods to allow accessing
    the table as a normal sequence or array (with extended slicing
    supported).

    PyTables supports *in-kernel* searches working simultaneously on several
    columns using complex conditions.  These are faster than selections
    using Python expressions.  See the :meth:`Table.where` method for more
    information on in-kernel searches.

    Non-nested columns can be *indexed*.  Searching an indexed column can be
    several times faster than searching a non-indexed one.  Search methods
    automatically take advantage of indexing where available.

    When iterating a table, an object from the Row (see :ref:`RowClassDescr`)
    class is used.  This object allows reading and writing data one row at a
    time, as well as performing queries which are not supported by in-kernel
    syntax (at a much lower speed, of course).

    Objects of this class support access to individual columns via *natural
    naming* through the :attr:`Table.cols` accessor.  Nested columns are
    mapped to Cols instances, and non-nested ones to Column instances.
    See the Column class in :ref:`ColumnClassDescr` for examples of this
    feature.

    Parameters
    ----------
    parentnode
        The parent :class:`Group` object.

        .. versionchanged:: 3.0
           Renamed from *parentNode* to *parentnode*.

    name : str
        The name of this node in its parent group.
    description
        An IsDescription subclass or a dictionary where the keys are the
        field names, and the values the type definitions.  In addition, a
        pure NumPy dtype is accepted.  If None, the table metadata is read
        from disk, else, it's taken from previous parameters.
    title
        Sets a TITLE attribute on the HDF5 table entity.
    filters : Filters
        An instance of the Filters class that provides information about the
        desired I/O filters to be applied during the life of this object.
    expectedrows
        A user estimate about the number of rows that will be on table.  If
        not provided, the default value is ``EXPECTED_ROWS_TABLE`` (see
        ``tables/parameters.py``).  If you plan to save bigger tables, try
        providing a guess; this will optimize the HDF5 B-Tree creation and
        management process time and memory used.
    chunkshape
        The shape of the data chunk to be read or written as a single HDF5
        I/O operation.  The filters are applied to those chunks of data.
        Its rank for tables has to be 1.  If ``None``, a sensible value is
        calculated based on the `expectedrows` parameter (which is
        recommended).
    byteorder
        The byteorder of the data *on-disk*, specified as 'little' or
        'big'.  If this is not specified, the byteorder is that of the
        platform, unless you passed a recarray as the `description`, in
        which case the recarray byteorder will be chosen.
    track_times
        Whether time data associated with the leaf are recorded (object
        access time, raw data modification time, metadata change time,
        object birth time); default True.  Semantics of these times depend
        on their implementation in the HDF5 library: refer to documentation
        of the H5O_info_t data structure.  As of HDF5 1.8.15, only ctime
        (metadata change time) is implemented.

        .. versionadded:: 3.4.3

    Notes
    -----
    The instance variables below are provided in addition to those in
    Leaf (see :ref:`LeafClassDescr`).  Please note that there are several
    col* dictionaries to ease retrieving information about a column
    directly by its path name, avoiding the need to walk through
    Table.description or Table.cols.

    .. rubric:: Table attributes

    .. attribute:: coldescrs

        Maps the name of a column to its Col description (see
        :ref:`ColClassDescr`).

    .. attribute:: coldflts

        Maps the name of a column to its default value.

    .. attribute:: coldtypes

        Maps the name of a column to its NumPy data type.

    .. attribute:: colindexed

        Is the column whose name is used as a key indexed?

    .. attribute:: colinstances

        Maps the name of a column to its Column (see
        :ref:`ColumnClassDescr`) or Cols (see :ref:`ColsClassDescr`)
        instance.

    .. attribute:: colnames

        A list containing the names of *top-level* columns in the table.

    .. attribute:: colpathnames

        A list containing the pathnames of *bottom-level* columns in
        the table.

        These are the leaf columns obtained when walking the table
        description left-to-right, bottom-first.  Columns inside a
        nested column have slashes (/) separating name components in
        their pathname.

    .. attribute:: cols

        A Cols instance that provides *natural naming* access to
        non-nested (Column, see :ref:`ColumnClassDescr`) and nested
        (Cols, see :ref:`ColsClassDescr`) columns.

    .. attribute:: coltypes

        Maps the name of a column to its PyTables data type.

    .. attribute:: description

        A Description instance (see :ref:`DescriptionClassDescr`)
        reflecting the structure of the table.

    .. attribute:: extdim

        The index of the enlargeable dimension (always 0 for tables).

    .. attribute:: indexed

        Does this table have any indexed columns?

    .. attribute:: nrows

        The current number of rows in the table.
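
    Examples
    --------
    A minimal usage sketch (the file name, table name and columns below are
    hypothetical)::

        import tables as tb

        class Particle(tb.IsDescription):
            name = tb.StringCol(16)
            energy = tb.Float64Col()

        with tb.open_file('demo.h5', mode='w') as h5file:
            table = h5file.create_table('/', 'particles', Particle)
            row = table.row
            for i in range(10):
                row['name'] = ('p%d' % i).encode('ascii')
                row['energy'] = float(i)
                row.append()
            table.flush()
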
495 """

    # Class identifier.
    _c_classid = 'TABLE'

    @lazyattr
    def row(self):
        """The associated Row instance (see :ref:`RowClassDescr`)."""

        return tableextension.Row(self)

    @lazyattr
    def dtype(self):
        """The NumPy ``dtype`` that most closely matches this table."""

        return self.description._v_dtype

    @property
    def shape(self):
        """The shape of this table."""
        return (self.nrows,)

    @property
    def rowsize(self):
        """The size in bytes of each row in the table."""
        return self.description._v_dtype.itemsize

    @property
    def size_in_memory(self):
        """The size of this table's data in bytes when it is fully loaded
        into memory.  This may be used in combination with size_on_disk to
        calculate the compression ratio of the data."""
        return self.nrows * self.rowsize

    @lazyattr
    def _v_iobuf(self):
        """A buffer for doing I/O."""

        return self._get_container(self.nrowsinbuf)

    @lazyattr
    def _v_wdflts(self):
        """The defaults for writing in recarray format."""

        # First, do a check to see whether we need to set default values
        # different from 0 or not.
        for coldflt in self.coldflts.values():
            if isinstance(coldflt, np.ndarray) or coldflt:
                break
        else:
            # No default different from 0 found.  Returning None.
            return None
        wdflts = self._get_container(1)
        for colname, coldflt in self.coldflts.items():
            ra = get_nested_field(wdflts, colname)
            ra[:] = coldflt
        return wdflts

    @lazyattr
    def _colunaligned(self):
        """The pathnames of unaligned, *unidimensional* columns."""
        colunaligned, rarr = [], self._get_container(0)
        for colpathname in self.colpathnames:
            carr = get_nested_field(rarr, colpathname)
            if not carr.flags.aligned and carr.ndim == 1:
                colunaligned.append(colpathname)
        return frozenset(colunaligned)

    # **************** WARNING! ***********************
    # This function can be called during the destruction time of a table,
    # so measures have been taken so that it doesn't have to revive
    # another node (which can fool the LRU cache).  The solution devised
    # has been to add a cache for autoindex (Table._autoindex), populate
    # it in creation time of the cache (which is a safe period) and then
    # update the cache whenever it changes.
    # This solves the error when running test_indexes.py ManyNodesTestCase.
    # F. Alted 2007-04-20
    # **************************************************

    @property
    def autoindex(self):
        """Automatically keep column indexes up to date?

        Setting this value states whether existing indexes should be
        automatically updated after an append operation or recomputed
        after an index-invalidating operation (i.e. removal and
        modification of rows).  The default is true.

        This value takes effect whenever a column is altered.  If you
        don't have automatic indexing activated and you want to do an
        immediate update, use `Table.flush_rows_to_index()`; for an
        immediate reindexing of invalidated indexes, use
        `Table.reindex_dirty()`.

        This value is persistent.

        .. versionchanged:: 3.0
           The *autoIndex* property has been renamed into *autoindex*.
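
        Example (a sketch; ``table`` is assumed to be an already indexed
        table and ``more_rows`` a compatible sequence of rows)::

            table.autoindex = False      # defer index maintenance
            table.append(more_rows)      # appends no longer touch indexes
            table.flush_rows_to_index()  # bring the indexes up to date
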
592 """

        if self._autoindex is None:
            try:
                indexgroup = self._v_file._get_node(_index_pathname_of(self))
            except NoSuchNodeError:
                self._autoindex = default_auto_index  # update cache
                return self._autoindex
            else:
                self._autoindex = indexgroup.auto  # update cache
                return self._autoindex
        else:
            # The value is in cache, return it
            return self._autoindex

    @autoindex.setter
    def autoindex(self, auto):
        auto = bool(auto)
        try:
            indexgroup = self._v_file._get_node(_index_pathname_of(self))
        except NoSuchNodeError:
            indexgroup = create_indexes_table(self)
        indexgroup.auto = auto
        # Update the cache in table instance as well
        self._autoindex = auto

    @property
    def indexedcolpathnames(self):
        """List of pathnames of indexed columns in the table."""
        return [_colpname
                for _colpname in self.colpathnames
                if self.colindexed[_colpname]]

    @property
    def colindexes(self):
        """A dictionary with the indexes of the indexed columns."""
        return _ColIndexes((_colpname, self.cols._f_col(_colpname).index)
                           for _colpname in self.colpathnames
                           if self.colindexed[_colpname])

    @property
    def _dirtyindexes(self):
        """Whether some index in table is dirty."""
        return self._condition_cache._nailcount > 0

    def __init__(self, parentnode, name,
                 description=None, title="", filters=None,
                 expectedrows=None, chunkshape=None,
                 byteorder=None, _log=True, track_times=True):

        self._v_new = new = description is not None
        """Is this the first time the node has been created?"""
        self._v_new_title = title
        """New title for this node."""
        self._v_new_filters = filters
        """New filter properties for this node."""
        self.extdim = 0  # Tables only have one dimension currently
        """The index of the enlargeable dimension (always 0 for tables)."""
        self._v_recarray = None
        """A structured array to be stored in the table."""
        self._rabyteorder = None
        """The computed byteorder of the self._v_recarray."""
        if expectedrows is None:
            expectedrows = parentnode._v_file.params['EXPECTED_ROWS_TABLE']
        self._v_expectedrows = expectedrows
        """The expected number of rows to be stored in the table."""
        self.nrows = SizeType(0)
        """The current number of rows in the table."""
        self.description = None
        """A Description instance (see :ref:`DescriptionClassDescr`)
        reflecting the structure of the table."""
        self._time64colnames = []
        """The names of ``Time64`` columns."""
        self._strcolnames = []
        """The names of ``String`` columns."""
        self._colenums = {}
        """Maps the name of an enumerated column to its ``Enum`` instance."""
        self._v_chunkshape = None
        """Private storage for the `chunkshape` property of the leaf."""

        self.indexed = False
        """Does this table have any indexed columns?"""
        self._indexedrows = 0
        """Number of rows indexed on disk."""
        self._unsaved_indexedrows = 0
        """Number of rows indexed in memory but still not on disk."""
        self._listoldindexes = []
        """The list of columns with old indexes."""
        self._autoindex = None
        """Private variable that caches the value for autoindex."""

        self.colnames = []
        """A list containing the names of *top-level* columns in the
        table."""
        self.colpathnames = []
        """A list containing the pathnames of *bottom-level* columns in the
        table.

        These are the leaf columns obtained when walking the
        table description left-to-right, bottom-first.  Columns inside a
        nested column have slashes (/) separating name components in
        their pathname.
        """
        self.colinstances = {}
        """Maps the name of a column to its Column (see
        :ref:`ColumnClassDescr`) or Cols (see :ref:`ColsClassDescr`)
        instance."""
        self.coldescrs = {}
        """Maps the name of a column to its Col description (see
        :ref:`ColClassDescr`)."""
        self.coltypes = {}
        """Maps the name of a column to its PyTables data type."""
        self.coldtypes = {}
        """Maps the name of a column to its NumPy data type."""
        self.coldflts = {}
        """Maps the name of a column to its default value."""
        self.colindexed = {}
        """Is the column whose name is used as a key indexed?"""

        self._use_index = False
        """Whether an index can be used or not in a search.  Boolean."""
        self._where_condition = None
        """Condition function and argument list for selection of values."""
        self._seqcache_key = None
        """The key under which to save a query's results (list of row
        indexes) or None to not save."""
        max_slots = parentnode._v_file.params['COND_CACHE_SLOTS']
        self._condition_cache = CacheDict(max_slots)
        """Cache of already compiled conditions."""
        self._exprvars_cache = {}
        """Cache of variables participating in numexpr expressions."""
        self._enabled_indexing_in_queries = True
        """Is indexing enabled in queries?  *Use only for testing.*"""
        self._empty_array_cache = {}
        """Cache of empty arrays."""

        self._v_dtype = None
        """The NumPy datatype for this table."""
        self.cols = None
        """
        A Cols instance that provides *natural naming* access to non-nested
        (Column, see :ref:`ColumnClassDescr`) and nested (Cols, see
        :ref:`ColsClassDescr`) columns.
        """
        self._dirtycache = True
        """Whether the data caches are dirty or not.  Initially set to
        yes."""
        self._descflavor = None
        """Temporarily keeps the flavor of a description with data."""

        # Initialize this object in case it is a new Table

        # Try purely descriptive description objects.
        if new and isinstance(description, dict):
            # Dictionary case
            self.description = Description(
                description, ptparams=parentnode._v_file.params)
        elif new and (type(description) == type(IsDescription)
                      and issubclass(description, IsDescription)):
            # IsDescription subclass case
            descr = description()
            self.description = Description(
                descr.columns, ptparams=parentnode._v_file.params)
        elif new and isinstance(description, Description):
            # It is a Description instance already
            self.description = description

        # No description yet?
        if new and self.description is None:
            # Try NumPy dtype instances
            if isinstance(description, np.dtype):
                tup = descr_from_dtype(
                    description, ptparams=parentnode._v_file.params)
                self.description, self._rabyteorder = tup

        # No description yet?
        if new and self.description is None:
            # Try structured array description objects.
            try:
                self._descflavor = flavor = flavor_of(description)
            except TypeError:  # probably not an array
                pass
            else:
                if flavor == 'python':
                    nparray = np.rec.array(description)
                else:
                    nparray = array_as_internal(description, flavor)
                self.nrows = nrows = SizeType(nparray.size)
                # If `self._v_recarray` is set, it will be used as the
                # initial buffer.
                if nrows > 0:
                    self._v_recarray = nparray
                tup = descr_from_dtype(
                    nparray.dtype, ptparams=parentnode._v_file.params)
                self.description, self._rabyteorder = tup

        # No description yet?
        if new and self.description is None:
            raise TypeError(
                "the ``description`` argument is not of a supported type: "
                "``IsDescription`` subclass, ``Description`` instance, "
                "dictionary, or structured array")

        # Check the chunkshape parameter
        if new and chunkshape is not None:
            if isinstance(chunkshape, (int, np.integer)):
                chunkshape = (chunkshape,)
            try:
                chunkshape = tuple(chunkshape)
            except TypeError:
                raise TypeError(
                    "`chunkshape` parameter must be an integer or sequence "
                    "and you passed a %s" % type(chunkshape))
            if len(chunkshape) != 1:
                raise ValueError("`chunkshape` rank (length) must be 1: %r"
                                 % (chunkshape,))
            self._v_chunkshape = tuple(SizeType(s) for s in chunkshape)

        super().__init__(parentnode, name, new, filters, byteorder, _log,
                         track_times)

    def _g_post_init_hook(self):
        # We are putting here the index-related issues
        # as well as filling general info for table.
        # This is needed because we need the index objects created first.

        # First, get back the flavor of input data (if any) for
        # `Leaf._g_post_init_hook()`.
        self._flavor, self._descflavor = self._descflavor, None
        super()._g_post_init_hook()

        self.blosc2_support_write = (
            (self.byteorder == sys.byteorder) and
            (self.filters.complib is not None) and
            (self.filters.complib.startswith("blosc2")))
        # For reading, Windows does not support re-opening a file twice
        # in non-read-only mode (for good reason), so we cannot use the
        # blosc2 optimization
        self.blosc2_support_read = (
            self.blosc2_support_write and
            ((platform.system().lower() != 'windows') or
             (self._v_file.mode == 'r'))
        )

        # Create a cols accessor.
        self.cols = Cols(self, self.description)

        # Place the `Cols` and `Column` objects into `self.colinstances`.
        colinstances, cols = self.colinstances, self.cols
        for colpathname in self.description._v_pathnames:
            colinstances[colpathname] = cols._g_col(colpathname)

        if self._v_new:
            # Columns are never indexed on creation.
            self.colindexed = {cpn: False for cpn in self.colpathnames}
            return

        # The following code is only for opened tables.

        # Does the indexes group exist?
        indexesgrouppath = _index_pathname_of(self)
        igroup = indexesgrouppath in self._v_file
        oldindexes = False
        for colobj in self.description._f_walk(type="Col"):
            colname = colobj._v_pathname
            # Is this column indexed?
            if igroup:
                indexname = _index_pathname_of_column(self, colname)
                indexed = indexname in self._v_file
                self.colindexed[colname] = indexed
                if indexed:
                    column = self.cols._g_col(colname)
                    indexobj = column.index
                    if isinstance(indexobj, OldIndex):
                        indexed = False  # Not a valid index
                        oldindexes = True
                        self._listoldindexes.append(colname)
                    else:
                        # Tell the condition cache about columns with dirty
                        # indexes.
                        if indexobj.dirty:
                            self._condition_cache.nail()
            else:
                indexed = False
                self.colindexed[colname] = False
            if indexed:
                self.indexed = True

        if oldindexes:  # this should only appear under 2.x Pro
            warnings.warn(
                "table ``%s`` has column indexes with PyTables 1.x format. "
                "Unfortunately, this format is not supported in "
                "PyTables 2.x series.  Note that you can use the "
                "``ptrepack`` utility in order to recreate the indexes. "
                "The 1.x indexed columns found are: %s" %
                (self._v_pathname, self._listoldindexes),
                OldIndexWarning)

        # It does not matter to which column 'indexobj' belongs,
        # since their respective index objects share
        # the same number of elements.
        if self.indexed:
            self._indexedrows = indexobj.nelements
            self._unsaved_indexedrows = self.nrows - self._indexedrows
            # Put the autoindex value in a cache variable
            self._autoindex = self.autoindex

    def _calc_nrowsinbuf(self):
        """Calculate the number of rows that fit in a PyTables buffer."""

        params = self._v_file.params
        # Compute the nrowsinbuf
        rowsize = self.rowsize
        buffersize = params['IO_BUFFER_SIZE']
        if rowsize != 0:
            nrowsinbuf = buffersize // rowsize
            # The number of rows in buffer needs to be an exact multiple of
            # chunkshape[0] for queries using indexed columns.
            # Fixes #319 and probably #409 too.
            nrowsinbuf -= nrowsinbuf % self.chunkshape[0]
        else:
            nrowsinbuf = 1

        # tableextension.pyx performs an assertion
        # to make sure nrowsinbuf is greater than or
        # equal to the chunksize.
        # See gh-206 and gh-238
        if self.chunkshape is not None:
            if nrowsinbuf < self.chunkshape[0]:
                nrowsinbuf = self.chunkshape[0]

        # Safeguard against row sizes being extremely large
        if nrowsinbuf == 0:
            nrowsinbuf = 1
        # If rowsize is too large, issue a Performance warning
        maxrowsize = params['BUFFER_TIMES'] * buffersize
        if rowsize > maxrowsize:
            warnings.warn("""\
The Table ``%s`` is exceeding the maximum recommended rowsize (%d bytes);
be ready to see PyTables asking for *lots* of memory and possibly slow
I/O.  You may want to reduce the rowsize by trimming the value of
dimensions that are orthogonal (and preferably close) to the *main*
dimension of this leaf.  Alternatively, in case you have specified a
very small/large chunksize, you may want to increase/decrease it."""
                          % (self._v_pathname, maxrowsize),
                          PerformanceWarning)
        return nrowsinbuf
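
    # A worked sizing example for the method above (illustrative numbers,
    # not enforced defaults): with IO_BUFFER_SIZE = 1 MiB and rowsize = 64
    # bytes, nrowsinbuf starts at 1048576 // 64 = 16384 rows; with a
    # chunkshape of (1365,) it is then trimmed down to the exact multiple
    # 16380 (= 12 * 1365).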

    def _getemptyarray(self, dtype):
        # Acts as a cache for empty arrays
        key = dtype
        if key in self._empty_array_cache:
            return self._empty_array_cache[key]
        else:
            self._empty_array_cache[key] = arr = np.empty(shape=0, dtype=key)
            return arr

    def _get_container(self, shape):
        """Get the appropriate buffer for data depending on table
        nestedness."""

        # This is *much* faster than the numpy.rec.array counterpart
        return np.empty(shape=shape, dtype=self._v_dtype)

    def _get_type_col_names(self, type_):
        """Returns a list containing 'type_' column names."""

        return [colobj._v_pathname
                for colobj in self.description._f_walk('Col')
                if colobj.type == type_]

    def _get_enum_map(self):
        """Return mapping from enumerated column names to `Enum`
        instances."""

        enumMap = {}
        for colobj in self.description._f_walk('Col'):
            if colobj.kind == 'enum':
                enumMap[colobj._v_pathname] = colobj.enum
        return enumMap

    def _g_create(self):
        """Create a new table on disk."""

        # Warning against assigning too many columns...
        # F. Alted 2005-06-05
        maxColumns = self._v_file.params['MAX_COLUMNS']
        if (len(self.description._v_names) > maxColumns):
            warnings.warn(
                "table ``%s`` is exceeding the recommended "
                "maximum number of columns (%d); "
                "be ready to see PyTables asking for *lots* of memory "
                "and possibly slow I/O" % (self._v_pathname, maxColumns),
                PerformanceWarning)

        # 1. Create the HDF5 table (some parameters need to be computed).

        # Fix the byteorder of the recarray and update the number of
        # expected rows if necessary
        if self._v_recarray is not None:
            self._v_recarray = self._g_fix_byteorder_data(self._v_recarray,
                                                          self._rabyteorder)
            if len(self._v_recarray) > self._v_expectedrows:
                self._v_expectedrows = len(self._v_recarray)
        # Compute a sensible chunkshape
        if self._v_chunkshape is None:
            self._v_chunkshape = self._calc_chunkshape(
                self._v_expectedrows, self.rowsize, self.rowsize)
        # Correct the byteorder, if still needed
        if self.byteorder is None:
            self.byteorder = sys.byteorder

        # Cache some data which is already in the description.
        # This needs to happen before creation time in order
        # to be able to populate the self._v_wdflts
        self._cache_description_data()

        # After creating the table, ``self._v_objectid`` needs to be
        # set because it is needed for setting attributes afterwards.
        self._v_objectid = self._create_table(
            self._v_new_title, self.filters.complib or '', obversion)
        self._v_recarray = None  # not useful anymore
        self._rabyteorder = None  # not useful anymore

        # 2. Compute or get chunk shape and buffer size parameters.
        self.nrowsinbuf = self._calc_nrowsinbuf()

        # 3. Get field fill attributes from the table description and
        #    set them on disk.
        if self._v_file.params['PYTABLES_SYS_ATTRS']:
            set_attr = self._v_attrs._g__setattr
            for i, colobj in enumerate(self.description._f_walk(type="Col")):
                fieldname = "FIELD_%d_FILL" % i
                set_attr(fieldname, colobj.dflt)

        return self._v_objectid

    def _g_open(self):
        """Opens a table from disk and reads its metadata.

        Creates a user description on the fly to ease access to the
        actual data.

        """

        # 1. Open the HDF5 table and get some data from it.
        self._v_objectid, description, chunksize = self._get_info()
        self._v_expectedrows = self.nrows  # the actual number of rows

        # 2. Create an instance description to host the record fields.
        validate = not self._v_file._isPTFile  # only for non-PyTables files
        self.description = Description(description, validate=validate,
                                       ptparams=self._v_file.params)

        # 3. Compute or get chunk shape and buffer size parameters.
        if chunksize == 0:
            self._v_chunkshape = self._calc_chunkshape(
                self._v_expectedrows, self.rowsize, self.rowsize)
        else:
            self._v_chunkshape = (chunksize,)
        self.nrowsinbuf = self._calc_nrowsinbuf()

        # 4. If there are field fill attributes, get them from disk and
        #    set them in the table description.
        if self._v_file.params['PYTABLES_SYS_ATTRS']:
            if "FIELD_0_FILL" in self._v_attrs._f_list("sys"):
                i = 0
                get_attr = self._v_attrs.__getattr__
                for objcol in self.description._f_walk(type="Col"):
                    colname = objcol._v_pathname
                    # Get the default values for each column
                    fieldname = "FIELD_%s_FILL" % i
                    defval = get_attr(fieldname)
                    if defval is not None:
                        objcol.dflt = defval
                    else:
                        warnings.warn("could not load default value "
                                      "for the ``%s`` column of table "
                                      "``%s``; using ``%r`` instead"
                                      % (colname, self._v_pathname,
                                         objcol.dflt))
                        defval = objcol.dflt
                    i += 1

                # Set also the correct value in the desc._v_dflts dictionary
                for descr in self.description._f_walk(type="Description"):
                    for name in descr._v_names:
                        objcol = descr._v_colobjects[name]
                        if isinstance(objcol, Col):
                            descr._v_dflts[objcol._v_name] = objcol.dflt

        # 5. Cache some data which is already in the description.
        self._cache_description_data()

        return self._v_objectid

    def _cache_description_data(self):
        """Cache some data which is already in the description.

        Some information is extracted from `self.description` to build
        some useful (but redundant) structures:

        * `self.colnames`
        * `self.colpathnames`
        * `self.coldescrs`
        * `self.coltypes`
        * `self.coldtypes`
        * `self.coldflts`
        * `self._v_dtype`
        * `self._time64colnames`
        * `self._strcolnames`
        * `self._colenums`

        """

        self.colnames = list(self.description._v_names)
        self.colpathnames = [
            col._v_pathname for col in self.description._f_walk()
            if not hasattr(col, '_v_names')]  # bottom-level

        # Find ``time64`` column names.
        self._time64colnames = self._get_type_col_names('time64')
        # Find ``string`` column names.
        self._strcolnames = self._get_type_col_names('string')
        # Get a mapping of enumerated columns to their `Enum` instances.
        self._colenums = self._get_enum_map()

        # Get info about columns
        for colobj in self.description._f_walk(type="Col"):
            colname = colobj._v_pathname
            # Get the column types, types and defaults
            self.coldescrs[colname] = colobj
            self.coltypes[colname] = colobj.type
            self.coldtypes[colname] = colobj.dtype
            self.coldflts[colname] = colobj.dflt

        # Assign _v_dtype for this table
        self._v_dtype = self.description._v_dtype

    def _get_column_instance(self, colpathname):
        """Get the instance of the column with the given `colpathname`.

        If the column does not exist in the table, a `KeyError` is
        raised.

        """

        try:
            return functools.reduce(
                getattr, colpathname.split('/'), self.description)
        except AttributeError:
            raise KeyError("table ``%s`` does not have a column named "
                           "``%s``" % (self._v_pathname, colpathname))

    _check_column = _get_column_instance

    def _disable_indexing_in_queries(self):
        """Force queries not to use indexing.

        *Use only for testing.*

        """

        if not self._enabled_indexing_in_queries:
            return  # already disabled
        # The nail avoids setting/getting compiled conditions in/from
        # the cache where indexing is used.
        self._condition_cache.nail()
        self._enabled_indexing_in_queries = False

    def _enable_indexing_in_queries(self):
        """Allow queries to use indexing.

        *Use only for testing.*

        """

        if self._enabled_indexing_in_queries:
            return  # already enabled
        self._condition_cache.unnail()
        self._enabled_indexing_in_queries = True

    def _required_expr_vars(self, expression, uservars, depth=1):
        """Get the variables required by the `expression`.

        A new dictionary defining the variables used in the `expression`
        is returned.  Required variables are first looked up in the
        `uservars` mapping, then in the set of top-level columns of the
        table.  Unknown variables cause a `NameError` to be raised.

        When `uservars` is `None`, the local and global namespace where
        the API callable which uses this method is called is sought
        instead.  This mechanism will not work as expected if this
        method is not used *directly* from an API callable.  To disable
        this mechanism, just specify a mapping as `uservars`.

        Nested columns and columns from other tables are not allowed
        (`TypeError` and `ValueError` are raised, respectively).  Also,
        non-column variable values are converted to NumPy arrays.

        `depth` specifies the depth of the frame in order to reach local
        or global variables.

        """

        # Get the names of variables used in the expression.
        exprvarscache = self._exprvars_cache
        if expression not in exprvarscache:
            # Protection against growing the cache too much
            if len(exprvarscache) > 256:
                # Remove 10 (arbitrary) elements from the cache
                for k in list(exprvarscache)[:10]:
                    del exprvarscache[k]
            cexpr = compile(expression, '<string>', 'eval')
            exprvars = [var for var in cexpr.co_names
                        if var not in ['None', 'False', 'True']
                        and var not in ne.expressions.functions]
            exprvarscache[expression] = exprvars
        else:
            exprvars = exprvarscache[expression]

        # Get the local and global variable mappings of the user frame
        # if no mapping has been explicitly given for user variables.
        user_locals, user_globals = {}, {}
        if uservars is None:
            # We use specified depth to get the frame where the API
            # callable using this method is called.  For instance:
            #
            # * ``table._required_expr_vars()`` (depth 0) is called by
            # * ``table._where()`` (depth 1) is called by
            # * ``table.where()`` (depth 2) is called by
            # * user-space functions (depth 3)
            user_frame = sys._getframe(depth)
            user_locals = user_frame.f_locals
            user_globals = user_frame.f_globals

        colinstances = self.colinstances
        tblfile, tblpath = self._v_file, self._v_pathname
        # Look for the required variables first among the ones
        # explicitly provided by the user, then among implicit columns,
        # then among external variables (only if no explicit variables).
        reqvars = {}
        for var in exprvars:
            # Get the value.
            if uservars is not None and var in uservars:
                val = uservars[var]
            elif var in colinstances:
                val = colinstances[var]
            elif uservars is None and var in user_locals:
                val = user_locals[var]
            elif uservars is None and var in user_globals:
                val = user_globals[var]
            else:
                raise NameError("name ``%s`` is not defined" % var)

            # Check the value.
            if hasattr(val, 'pathname'):  # non-nested column
                if val.shape[1:] != ():
                    raise NotImplementedError(
                        "variable ``%s`` refers to "
                        "a multidimensional column, "
                        "not yet supported in conditions, sorry" % var)
                if (val._table_file is not tblfile or
                        val._table_path != tblpath):
                    raise ValueError("variable ``%s`` refers to a column "
                                     "which is not part of table ``%s``"
                                     % (var, tblpath))
                if val.dtype.str[1:] == 'u8':
                    raise NotImplementedError(
                        "variable ``%s`` refers to "
                        "a 64-bit unsigned integer column, "
                        "not yet supported in conditions, sorry; "
                        "please use regular Python selections" % var)
            elif hasattr(val, '_v_colpathnames'):  # nested column
                raise TypeError(
                    "variable ``%s`` refers to a nested column, "
                    "not allowed in conditions" % var)
            else:  # only non-column values are converted to arrays
                # XXX: not 100% sure about this
                if isinstance(val, str):
                    val = np.asarray(val.encode('ascii'))
                else:
                    val = np.asarray(val)
            reqvars[var] = val
        return reqvars

    def _get_condition_key(self, condition, condvars):
        """Get the condition cache key for `condition` with `condvars`.

        Currently, the key is a tuple of `condition`, column variable
        names, normal variable names, column paths and variable types
        (all are tuples).

        """

        # Variable names for column and normal variables.
        colnames, varnames = [], []
        # Column paths and types for each of the previous variables.
        colpaths, vartypes = [], []
        for (var, val) in condvars.items():
            if hasattr(val, 'pathname'):  # column
                colnames.append(var)
                colpaths.append(val.pathname)
            else:  # array
                try:
                    varnames.append(var)
                    vartypes.append(ne.necompiler.getType(val))  # expensive
                except ValueError:
                    # This is more clear than the error given by Numexpr.
                    raise TypeError("variable ``%s`` has data type ``%s``, "
                                    "not allowed in conditions"
                                    % (var, val.dtype.name))
        colnames, varnames = tuple(colnames), tuple(varnames)
        colpaths, vartypes = tuple(colpaths), tuple(vartypes)
        condkey = (condition, colnames, varnames, colpaths, vartypes)
        return condkey

    def _compile_condition(self, condition, condvars):
        """Compile the `condition` and extract usable index conditions.

        This method returns an instance of ``CompiledCondition``.  See
        the ``compile_condition()`` function in the ``conditions``
        module for more information about the compilation process.

        This method makes use of the condition cache when possible.

        """

        # Look up the condition in the condition cache.
        condcache = self._condition_cache
        condkey = self._get_condition_key(condition, condvars)
        compiled = condcache.get(condkey)
        if compiled:
            return compiled.with_replaced_vars(condvars)  # bingo!

        # Bad luck, the condition must be parsed and compiled.
        # Fortunately, the key provides some valuable information. ;)
        (condition, colnames, varnames, colpaths, vartypes) = condkey

        # Extract more information from referenced columns.

        # start with normal variables
        typemap = dict(list(zip(varnames, vartypes)))
        indexedcols = []
        for colname in colnames:
            col = condvars[colname]

            # Extract types from *all* the given variables.
            coltype = col.dtype.type
            typemap[colname] = _nxtype_from_nptype[coltype]

            # Get the set of columns with usable indexes.
            if (self._enabled_indexing_in_queries  # no in-kernel searches
                    and self.colindexed[col.pathname]
                    and not col.index.dirty):
                indexedcols.append(colname)

        indexedcols = frozenset(indexedcols)
        # Now let ``compile_condition()`` do the Numexpr-related job.
        compiled = compile_condition(condition, typemap, indexedcols)

        # Check that there actually are columns in the condition.
        if not set(compiled.parameters).intersection(set(colnames)):
            raise ValueError("there are no columns taking part "
                             "in condition ``%s``" % (condition,))

        # Store the compiled condition in the cache and return it.
        condcache[condkey] = compiled
        return compiled.with_replaced_vars(condvars)

    def will_query_use_indexing(self, condition, condvars=None):
        """Will a query for the condition use indexing?

        The meaning of the condition and *condvars* arguments is the same
        as in the :meth:`Table.where` method.  If the condition can use
        indexing, this method returns a frozenset with the path names of
        the columns whose index is usable.  Otherwise, it returns an
        empty frozenset.

        This method is mainly intended for testing.  Keep in mind that
        changing the set of indexed columns or their dirtiness may make
        this method return different values for the same arguments at
        different times.
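
        Example (a sketch with hypothetical column names)::

            # Returns e.g. frozenset({'col1'}) if only col1 has a usable
            # index for this condition.
            table.will_query_use_indexing('(col1 > 0) & (col2 < 10)')
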
1371 """

        # Compile the condition and extract usable index conditions.
        condvars = self._required_expr_vars(condition, condvars, depth=2)
        compiled = self._compile_condition(condition, condvars)
        # Return the columns in indexed expressions
        idxcols = [condvars[var].pathname
                   for var in compiled.index_variables]
        return frozenset(idxcols)

    def where(self, condition, condvars=None,
              start=None, stop=None, step=None):
        r"""Iterate over values fulfilling a condition.

        This method returns a Row iterator (see :ref:`RowClassDescr`)
        which only selects rows in the table that satisfy the given
        condition (an expression-like string).

        The condvars mapping may be used to define the variable names
        appearing in the condition.  condvars should consist of
        identifier-like strings pointing to Column (see
        :ref:`ColumnClassDescr`) instances *of this table*, or to other
        values (which will be converted to arrays).  A default set of
        condition variables is provided where each top-level, non-nested
        column with an identifier-like name appears.  Variables in
        condvars override the default ones.

        When condvars is not provided or None, the current local and
        global namespace is sought instead of condvars.  The previous
        mechanism is mostly intended for interactive usage.  To disable
        it, just specify a (maybe empty) mapping as condvars.

        If a range is supplied (by setting some of the start, stop or
        step parameters), only the rows in that range and fulfilling the
        condition are used.  The meaning of the start, stop and step
        parameters is the same as for Python slices.

        When possible, indexed columns participating in the condition
        will be used to speed up the search.  It is recommended that you
        place the indexed columns as far to the left of the condition as
        possible.  Anyway, this method always has better performance than
        regular Python selections on the table.

        You can mix this method with regular Python selections in order
        to support even more complex queries.  It is strongly recommended
        that you pass the most restrictive condition as the parameter to
        this method if you want to achieve maximum performance.

        .. warning::

            When in the middle of a table row iterator, you should not
            use methods that can change the number of rows in the table
            (like :meth:`Table.append` or :meth:`Table.remove_rows`) or
            unexpected errors will happen.

        Examples
        --------

        ::

            passvalues = [row['col3'] for row in
                          table.where('(col1 > 0) & (col2 <= 20)', step=5)
                          if your_function(row['col2'])]
            print("Values that pass the cuts:", passvalues)

        .. note::

            Special care should be taken when the query condition includes
            string literals.

            Let's assume that the table ``table`` has the following
            structure::

                class Record(IsDescription):
                    col1 = StringCol(4)  # 4-character string of bytes
                    col2 = IntCol()
                    col3 = FloatCol()

            The type of "col1" corresponds to strings of bytes.

            Any condition involving "col1" should be written using the
            appropriate type for string literals in order to avoid
            :exc:`TypeError`\ s.

            The code below will fail with a :exc:`TypeError`::

                condition = 'col1 == "AAAA"'
                for record in table.where(condition):  # TypeError in Python 3
                    # do something with "record"

            The reason is that in Python 3 "condition" implies a comparison
            between a string of bytes ("col1" contents) and a unicode
            literal ("AAAA").

            The correct way to write the condition is::

                condition = 'col1 == b"AAAA"'

        .. versionchanged:: 3.0
           The start, stop and step parameters now behave like in slice.

        """

        return self._where(condition, condvars, start, stop, step)

    def _where(self, condition, condvars, start=None, stop=None, step=None):
        """Low-level counterpart of `self.where()`."""

        if profile:
            tref = clock()
        if profile:
            show_stats("Entering table._where", tref)
        # Adjust the slice to be used.
        (start, stop, step) = self._process_range_read(start, stop, step)
        if start >= stop:  # empty range, reset conditions
            self._use_index = False
            self._where_condition = None
            return iter([])

        # Compile the condition and extract usable index conditions.
        condvars = self._required_expr_vars(condition, condvars, depth=3)
        compiled = self._compile_condition(condition, condvars)

        # Can we use indexes?
        if compiled.index_expressions:
            chunkmap = _table__where_indexed(
                self, compiled, condition, condvars, start, stop, step)
            if not isinstance(chunkmap, np.ndarray):
                # If it is not a NumPy array it should be an iterator
                # Reset conditions
                self._use_index = False
                self._where_condition = None
                # ...and return the iterator
                return chunkmap
        else:
            chunkmap = None  # default to an in-kernel query

        args = [condvars[param] for param in compiled.parameters]
        self._where_condition = (compiled.function, args, compiled.kwargs)
        row = tableextension.Row(self)
        if profile:
            show_stats("Exiting table._where", tref)
        return row._iter(start, stop, step, chunkmap=chunkmap)

    def read_where(self, condition, condvars=None, field=None,
                   start=None, stop=None, step=None):
        """Read table data fulfilling the given *condition*.

        This method is similar to :meth:`Table.read`; its common arguments
        and return values have the same meanings.  However, only the rows
        fulfilling the *condition* are included in the result.

        The meaning of the other arguments is the same as in the
        :meth:`Table.where` method.
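
        Example (a sketch with hypothetical column names)::

            energies = table.read_where('(name == b"helium") & (energy > 0)',
                                        field='energy')
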
1524 """

        self._g_check_open()
        coords = [p.nrow for p in
                  self._where(condition, condvars, start, stop, step)]
        self._where_condition = None  # reset the conditions
        if len(coords) > 1:
            cstart, cstop = coords[0], coords[-1] + 1
            if cstop - cstart == len(coords):
                # Chances for monotonically increasing row values.  Refine.
                inc_seq = np.alltrue(
                    np.arange(cstart, cstop) == np.array(coords))
                if inc_seq:
                    return self.read(cstart, cstop, field=field)
        return self.read_coordinates(coords, field)

    def append_where(self, dstTable, condition=None, condvars=None,
                     start=None, stop=None, step=None):
        """Append rows fulfilling the condition to the dstTable table.

        dstTable must be capable of taking the rows resulting from the
        query, i.e. it must have columns with the expected names and
        compatible types.  The meaning of the other arguments is the same
        as in the :meth:`Table.where` method.

        The number of rows appended to dstTable is returned as a result.

        .. versionchanged:: 3.0
           The *whereAppend* method has been renamed into *append_where*.
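
        Example (a sketch; ``dst`` is assumed to be a table with a
        compatible description)::

            nrows = table.append_where(dst, 'energy > 100')
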
1554 """

        self._g_check_open()

        # Check that the destination file is not in read-only mode.
        dstTable._v_file._check_writable()

        # Row objects do not support nested columns, so we must iterate
        # over the flat column paths.  When rows support nesting,
        # ``self.colnames`` can be directly iterated upon.
        colNames = [colName for colName in self.colpathnames]
        dstRow = dstTable.row
        nrows = 0
        if condition is not None:
            srcRows = self._where(condition, condvars, start, stop, step)
        else:
            srcRows = self.iterrows(start, stop, step)
        for srcRow in srcRows:
            for colName in colNames:
                dstRow[colName] = srcRow[colName]
            dstRow.append()
            nrows += 1
        dstTable.flush()
        return nrows

    def get_where_list(self, condition, condvars=None, sort=False,
                       start=None, stop=None, step=None):
        """Get the row coordinates fulfilling the given condition.

        The coordinates are returned as a list of the current flavor.
        sort means that you want to retrieve the coordinates ordered.
        The default is to not sort them.

        The meaning of the other arguments is the same as in the
        :meth:`Table.where` method.
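
        Example (a sketch with a hypothetical column name)::

            coords = table.get_where_list('energy > 100', sort=True)
            rows = table.read_coordinates(coords)
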
1590 """

        self._g_check_open()

        coords = [p.nrow for p in
                  self._where(condition, condvars, start, stop, step)]
        coords = np.array(coords, dtype=SizeType)
        # Reset the conditions
        self._where_condition = None
        if sort:
            coords = np.sort(coords)
        return internal_to_flavor(coords, self.flavor)
1603 def itersequence(self, sequence):
1604 """Iterate over a sequence of row coordinates."""
1606 if not hasattr(sequence, '__getitem__'):
1607 raise TypeError("Wrong 'sequence' parameter type. Only sequences "
1608 "are suported.")
1609 # start, stop and step are necessary for the new iterator for
1610 # coordinates, and perhaps it would be useful to add them as
1611 # parameters in the future (not now, because I've just removed
1612 # the `sort` argument for 2.1).
1613 #
1614 # *Important note*: Negative values for step are not supported
1615 # for the general case, but only for the itersorted() and
1616 # read_sorted() purposes! The self._process_range_read will raise
1617 an appropriate error.
1618 # F. Alted 2008-09-18
1619 # A.V. 20130513: _process_range_read --> _process_range
1620 (start, stop, step) = self._process_range(None, None, None)
1621 if (start > stop) or (len(sequence) == 0):
1622 return iter([])
1623 row = tableextension.Row(self)
1624 return row._iter(start, stop, step, coords=sequence)
1626 def _check_sortby_csi(self, sortby, checkCSI):
1627 if isinstance(sortby, Column):
1628 icol = sortby
1629 elif isinstance(sortby, str):
1630 icol = self.cols._f_col(sortby)
1631 else:
1632 raise TypeError(
1633 "`sortby` can only be a `Column` or string object, "
1634 "but you passed an object of type: %s" % type(sortby))
1635 if icol.is_indexed and icol.index.kind == "full":
1636 if checkCSI and not icol.index.is_csi:
1637 # The index exists, but it is not a CSI one.
1638 raise ValueError(
1639 "Field `%s` must have associated a CSI index "
1640 "in table `%s`, but the existing one is not. "
1641 % (sortby, self))
1642 return icol.index
1643 else:
1644 raise ValueError(
1645 "Field `%s` must have associated a 'full' index "
1646 "in table `%s`." % (sortby, self))
1648 def itersorted(self, sortby, checkCSI=False,
1649 start=None, stop=None, step=None):
1650 """Iterate table data following the order of the index of sortby
1651 column.
1653 The sortby column must have associated a full index. If you want to
1654 ensure a fully sorted order, the index must be a CSI one. You may want
1655 to use the checkCSI argument in order to explicitly check for the
1656 existence of a CSI index.
1658 The meaning of the start, stop and step arguments is the same as in
1659 :meth:`Table.read`.
1661 .. versionchanged:: 3.0
1662 If the *start* parameter is provided and *stop* is None then the
1663 table is iterated from *start* to the last line.
1664 In PyTables < 3.0 only one element was returned.
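Examples
--------
A minimal sketch, assuming a column named pressure that already has a
full index (e.g. created with table.cols.pressure.create_index(kind='full'))::
for row in table.itersorted('pressure'):
print(row['name'])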
1666 """
1668 index = self._check_sortby_csi(sortby, checkCSI)
1669 # Adjust the slice to be used.
1670 (start, stop, step) = self._process_range(start, stop, step,
1671 warn_negstep=False)
1672 if (start > stop and 0 < step) or (start < stop and 0 > step):
1673 # Fall-back action is to return an empty iterator
1674 return iter([])
1675 row = tableextension.Row(self)
1676 return row._iter(start, stop, step, coords=index)
1678 def read_sorted(self, sortby, checkCSI=False, field=None,
1679 start=None, stop=None, step=None):
1680 """Read table data following the order of the index of sortby column.
1682 The sortby column must have a full index associated with it. If you
1683 want to ensure a fully sorted order, the index must be a CSI one. You
1684 may want to use the checkCSI argument to explicitly check for the
1685 existence of a CSI index.
1687 If field is supplied only the named column will be selected. If the
1688 column is not nested, an *array* of the current flavor will be
1689 returned; if it is, a *structured array* will be used instead. If no
1690 field is specified, all the columns will be returned in a structured
1691 array of the current flavor.
1693 The meaning of the start, stop and step arguments is the same as in
1694 :meth:`Table.read`.
1696 .. versionchanged:: 3.0
1697 The start, stop and step parameters now behave like in slice.
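Examples
--------
A minimal sketch, assuming a column named pressure with a CSI index
(e.g. created with table.cols.pressure.create_csindex())::
data = table.read_sorted('pressure', checkCSI=True)
rdata = table.read_sorted('pressure', step=-1) # reverse order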
1699 """
1701 self._g_check_open()
1702 index = self._check_sortby_csi(sortby, checkCSI)
1703 coords = index[start:stop:step]
1704 return self.read_coordinates(coords, field)
1706 def iterrows(self, start=None, stop=None, step=None):
1707 """Iterate over the table using a Row instance.
1709 If a range is not supplied, *all the rows* in the table are iterated
1710 upon - you can also use the :meth:`Table.__iter__` special method for
1711 that purpose. If you want to iterate over a given *range of rows* in
1712 the table, you may use the start, stop and step parameters.
1714 .. warning::
1716 When in the middle of a table row iterator, you should not
1717 use methods that can change the number of rows in the table
1718 (like :meth:`Table.append` or :meth:`Table.remove_rows`) or
1719 unexpected errors will happen.
1721 See Also
1722 --------
1723 tableextension.Row : the table row iterator and field accessor
1725 Examples
1726 --------
1728 ::
1730 result = [ row['var2'] for row in table.iterrows(step=5)
1731 if row['var1'] <= 20 ]
1733 .. versionchanged:: 3.0
1734 If the *start* parameter is provided and *stop* is None then the
1735 table is iterated from *start* to the last line.
1736 In PyTables < 3.0 only one element was returned.
1738 """
1739 (start, stop, step) = self._process_range(start, stop, step,
1740 warn_negstep=False)
1741 if (start > stop and 0 < step) or (start < stop and 0 > step):
1742 # Fall-back action is to return an empty iterator
1743 return iter([])
1744 row = tableextension.Row(self)
1745 return row._iter(start, stop, step)
1747 def __iter__(self):
1748 """Iterate over the table using a Row instance.
1750 This is equivalent to calling :meth:`Table.iterrows` with default
1751 arguments, i.e. it iterates over *all the rows* in the table.
1753 See Also
1754 --------
1755 tableextension.Row : the table row iterator and field accessor
1757 Examples
1758 --------
1760 ::
1762 result = [ row['var2'] for row in table if row['var1'] <= 20 ]
1764 Which is equivalent to::
1766 result = [ row['var2'] for row in table.iterrows()
1767 if row['var1'] <= 20 ]
1769 """
1771 return self.iterrows()
1773 def _read(self, start, stop, step, field=None, out=None):
1774 """Read a range of rows and return an in-memory object."""
1776 select_field = None
1777 if field:
1778 if field not in self.coldtypes:
1779 if field in self.description._v_names:
1780 # Remember to select this field
1781 select_field = field
1782 field = None
1783 else:
1784 raise KeyError(("Field {} not found in table "
1785 "{}").format(field, self))
1786 else:
1787 # The column hangs directly from the top
1788 dtype_field = self.coldtypes[field]
1790 # Return an empty array if the selected range is empty
1791 if (start >= stop and 0 < step) or (start <= stop and 0 > step):
1792 if field is None:
1793 nra = self._get_container(0)
1794 return nra
1795 return np.empty(shape=0, dtype=dtype_field)
1797 nrows = len(range(start, stop, step))
1799 if out is None:
1800 # Compute the shape of the resulting column object
1801 if field:
1802 # Create a container for the results
1803 result = np.empty(shape=nrows, dtype=dtype_field)
1804 else:
1805 # Recarray case
1806 result = self._get_container(nrows)
1807 else:
1808 # there is no fast way to byteswap, since different columns may
1809 # have different byteorders
1810 if not out.dtype.isnative:
1811 raise ValueError("output array must be in system's byteorder "
1812 "or results will be incorrect")
1813 if field:
1814 bytes_required = dtype_field.itemsize * nrows
1815 else:
1816 bytes_required = self.rowsize * nrows
1817 if bytes_required != out.nbytes:
1818 raise ValueError(f'output array size invalid, got {out.nbytes}'
1819 f' bytes, need {bytes_required} bytes')
1820 if not out.flags['C_CONTIGUOUS']:
1821 raise ValueError('output array not C contiguous')
1822 result = out
1824 # Call the routine to fill-up the resulting array
1825 if step == 1 and not field:
1826 # This optimization works three times faster than
1827 # the row._fill_col method (up to 170 MB/s on a Pentium 4 @ 2 GHz)
1828 self._read_records(start, stop - start, result)
1829 # Warning!: _read_field_name should not be used until
1830 # H5TBread_fields_name in tableextension will be finished
1831 # F. Alted 2005/05/26
1832 # XXX Shall we implement this for PyTables 2.0??
1833 elif field and step > 15 and False: # deliberately disabled
1834 # For step>15, this seems to work always faster than row._fill_col.
1835 self._read_field_name(result, start, stop, step, field)
1836 else:
1837 self.row._fill_col(result, start, stop, step, field)
1839 if select_field:
1840 return result[select_field]
1841 else:
1842 return result
1844 def read(self, start=None, stop=None, step=None, field=None, out=None):
1845 """Get data in the table as a (record) array.
1847 The start, stop and step parameters can be used to select only
1848 a *range of rows* in the table. Their meanings are the same as
1849 in the built-in Python slices.
1851 If field is supplied only the named column will be selected.
1852 If the column is not nested, an *array* of the current flavor
1853 will be returned; if it is, a *structured array* will be used
1854 instead. If no field is specified, all the columns will be
1855 returned in a structured array of the current flavor.
1857 Columns under a nested column can be specified in the field
1858 parameter by using a slash character (/) as a separator (e.g.
1859 'position/x').
1861 The out parameter may be used to specify a NumPy array to
1862 receive the output data. Note that the array must have the
1863 same size as the data selected with the other parameters.
1864 Note that the array's datatype is not checked and no type
1865 casting is performed, so if it does not match the datatype on
1866 disk, the output will not be correct.
1868 When specifying a single nested column with the field parameter,
1869 and supplying an output buffer with the out parameter, the
1870 output buffer must contain all columns in the table.
1871 The data in all columns will be read into the output buffer.
1872 However, only the specified nested column will be returned from
1873 the method call.
1875 When data is read from disk in NumPy format, the output will be
1876 in the current system's byteorder, regardless of how it is
1877 stored on disk. If the out parameter is specified, the output
1878 array also must be in the current system's byteorder.
1880 .. versionchanged:: 3.0
1881 Added the *out* parameter. Also the start, stop and step
1882 parameters now behave like in slice.
1884 Examples
1885 --------
1887 Reading the entire table::
1889 t.read()
1891 Reading record n. 6::
1893 t.read(6, 7)
1895 Reading from record n. 6 to the end of the table::
1897 t.read(6)
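Reading into a preallocated buffer (a sketch; it assumes the table has
at least 10 rows and that its flavor is 'numpy')::
import numpy as np
buf = np.empty(10, dtype=t.dtype)
t.read(0, 10, out=buf)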
1899 """
1901 self._g_check_open()
1903 if field:
1904 self._check_column(field)
1906 if out is not None and self.flavor != 'numpy':
1907 msg = ("Optional 'out' argument may only be supplied if array "
1908 "flavor is 'numpy', currently is {}").format(self.flavor)
1909 raise TypeError(msg)
1911 start, stop, step = self._process_range(start, stop, step,
1912 warn_negstep=False)
1914 arr = self._read(start, stop, step, field, out)
1915 return internal_to_flavor(arr, self.flavor)
1917 def _read_coordinates(self, coords, field=None):
1918 """Private part of `read_coordinates()` with no flavor conversion."""
1920 coords = self._point_selection(coords)
1922 ncoords = len(coords)
1923 # Create a read buffer only if needed
1924 if field is None or ncoords > 0:
1925 # Doing a copy is faster when ncoords is small (<1000)
1926 if ncoords < min(1000, self.nrowsinbuf):
1927 result = self._v_iobuf[:ncoords].copy()
1928 else:
1929 result = self._get_container(ncoords)
1931 # Do the real read
1932 if ncoords > 0:
1933 # Turn coords into an array of coordinate indexes, if necessary
1934 if not (isinstance(coords, np.ndarray) and
1935 coords.dtype.type is _npsizetype and
1936 coords.flags.contiguous and
1937 coords.flags.aligned):
1938 # Get a contiguous and aligned coordinate array
1939 coords = np.array(coords, dtype=SizeType)
1940 self._read_elements(coords, result)
1942 # Do the final conversions, if needed
1943 if field:
1944 if ncoords > 0:
1945 result = get_nested_field(result, field)
1946 else:
1947 # Get an empty array from the cache
1948 result = self._getemptyarray(self.coldtypes[field])
1949 return result
1951 def read_coordinates(self, coords, field=None):
1952 """Get a set of rows given their indexes as a (record) array.
1954 This method works much like the :meth:`Table.read` method, but it uses
1955 a sequence (coords) of row indexes to select the wanted columns,
1956 instead of a column range.
1958 The selected rows are returned in an array or structured array of the
1959 current flavor.
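Examples
--------
A minimal sketch; the coordinates and the column name are illustrative
assumptions::
rows = table.read_coordinates([2, 5, 7])
pressures = table.read_coordinates([2, 5, 7], field='pressure')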
1961 """
1963 self._g_check_open()
1964 result = self._read_coordinates(coords, field)
1965 return internal_to_flavor(result, self.flavor)
1967 def get_enum(self, colname):
1968 """Get the enumerated type associated with the named column.
1970 If the column named colname (a string) exists and is of an enumerated
1971 type, the corresponding Enum instance (see :ref:`EnumClassDescr`) is
1972 returned. If it is not of an enumerated type, a TypeError is raised. If
1973 the column does not exist, a KeyError is raised.
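Examples
--------
A minimal sketch, assuming a column named status declared with an
EnumCol type::
enum = table.get_enum('status')
active = enum['active'] # concrete value for the (assumed) 'active' name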
1975 """
1977 self._check_column(colname)
1979 try:
1980 return self._colenums[colname]
1981 except KeyError:
1982 raise TypeError(
1983 "column ``%s`` of table ``%s`` is not of an enumerated type"
1984 % (colname, self._v_pathname))
1986 def col(self, name):
1987 """Get a column from the table.
1989 If a column called name exists in the table, it is read and returned as
1990 a NumPy object. If it does not exist, a KeyError is raised.
1992 Examples
1993 --------
1995 ::
1997 narray = table.col('var2')
1999 That statement is equivalent to::
2001 narray = table.read(field='var2')
2003 Here you can see how this method can be used as a shorthand for the
2004 :meth:`Table.read` method.
2006 """
2008 return self.read(field=name)
2010 def __getitem__(self, key):
2011 """Get a row or a range of rows from the table.
2013 If key argument is an integer, the corresponding table row is returned
2014 as a record of the current flavor. If key is a slice, the range of rows
2015 determined by it is returned as a structured array of the current
2016 flavor.
2018 In addition, NumPy-style point selections are supported. In
2019 particular, if key is a list of row coordinates, the set of rows
2020 determined by it is returned. Furthermore, if key is an array of
2021 boolean values, only the coordinates where key is True are returned.
2022 Note that for the latter to work, the key list must contain exactly
2023 as many entries as the table has rows.
2025 Examples
2026 --------
2028 ::
2030 record = table[4]
2031 recarray = table[4:1000:2]
2032 recarray = table[[4,1000]] # only retrieves rows 4 and 1000
2033 recarray = table[[True, False, ..., True]]
2035 Those statements are equivalent to::
2037 record = table.read(start=4)[0]
2038 recarray = table.read(start=4, stop=1000, step=2)
2039 recarray = table.read_coordinates([4,1000])
2040 recarray = table.read_coordinates([True, False, ..., True])
2042 Here, you can see how indexing can be used as a shorthand for the
2043 :meth:`Table.read` and :meth:`Table.read_coordinates` methods.
2045 """
2047 self._g_check_open()
2049 if is_idx(key):
2050 key = operator.index(key)
2052 # Index out of range protection
2053 if key >= self.nrows:
2054 raise IndexError("Index out of range")
2055 if key < 0:
2056 # To support negative values
2057 key += self.nrows
2058 (start, stop, step) = self._process_range(key, key + 1, 1)
2059 return self.read(start, stop, step)[0]
2060 elif isinstance(key, slice):
2061 (start, stop, step) = self._process_range(
2062 key.start, key.stop, key.step)
2063 return self.read(start, stop, step)
2064 # Try with a boolean or point selection
2065 elif type(key) in (list, tuple) or isinstance(key, np.ndarray):
2066 return self._read_coordinates(key, None)
2067 else:
2068 raise IndexError(f"Invalid index or slice: {key!r}")
2070 def __setitem__(self, key, value):
2071 """Set a row or a range of rows in the table.
2073 It takes different actions depending on the type of the *key*
2074 parameter: if it is an integer, the corresponding table row is
2075 set to *value* (a record or sequence capable of being converted
2076 to the table structure). If *key* is a slice, the row slice
2077 determined by it is set to *value* (a record array or sequence
2078 capable of being converted to the table structure).
2080 In addition, NumPy-style point selections are supported. In
2081 particular, if key is a list of row coordinates, the set of rows
2082 determined by it is set to value. Furthermore, if key is an array of
2083 boolean values, only the coordinates where key is True are set to
2084 values from value. Note that for the latter to work, the key list
2085 must contain exactly as many entries as the table has rows.
2087 Examples
2088 --------
2090 ::
2092 # Modify just one existing row
2093 table[2] = [456,'db2',1.2]
2095 # Modify two existing rows
2096 rows = numpy.rec.array([[457,'db1',1.2],[6,'de2',1.3]],
2097 formats='i4,a3,f8')
2098 table[1:5:2] = rows # modify a table slice
2099 table[[1,3]] = rows # only modifies rows 1 and 3
2100 table[[True,False,True]] = rows # only modifies rows 0 and 2
2102 Which is equivalent to::
2104 table.modify_rows(start=2, rows=[456,'db2',1.2])
2105 rows = numpy.rec.array([[457,'db1',1.2],[6,'de2',1.3]],
2106 formats='i4,a3,f8')
2107 table.modify_rows(start=1, stop=5, step=2, rows=rows)
2108 table.modify_coordinates([1,3], rows)
2109 table.modify_coordinates([True, False, True], rows)
2111 Here, you can see how indexing can be used as a shorthand for the
2112 :meth:`Table.modify_rows` and :meth:`Table.modify_coordinates`
2113 methods.
2115 """
2117 self._g_check_open()
2118 self._v_file._check_writable()
2120 if is_idx(key):
2121 key = operator.index(key)
2123 # Index out of range protection
2124 if key >= self.nrows:
2125 raise IndexError("Index out of range")
2126 if key < 0:
2127 # To support negative values
2128 key += self.nrows
2129 return self.modify_rows(key, key + 1, 1, [value])
2130 elif isinstance(key, slice):
2131 (start, stop, step) = self._process_range(
2132 key.start, key.stop, key.step)
2133 return self.modify_rows(start, stop, step, value)
2134 # Try with a boolean or point selection
2135 elif type(key) in (list, tuple) or isinstance(key, np.ndarray):
2136 return self.modify_coordinates(key, value)
2137 else:
2138 raise IndexError(f"Invalid index or slice: {key!r}")
2140 def _save_buffered_rows(self, wbufRA, lenrows):
2141 """Update the indexes after a flushing of rows."""
2143 self._open_append(wbufRA)
2144 self._append_records(lenrows)
2145 self._close_append()
2146 if self.indexed:
2147 self._unsaved_indexedrows += lenrows
2148 # The table caches for indexed queries are dirty now
2149 self._dirtycache = True
2150 if self.autoindex:
2151 # Flush the unindexed rows
2152 self.flush_rows_to_index(_lastrow=False)
2153 else:
2154 # All the columns are dirty now
2155 self._mark_columns_as_dirty(self.colpathnames)
2157 def append(self, rows):
2158 """Append a sequence of rows to the end of the table.
2160 The rows argument may be any object which can be converted to
2161 a structured array compliant with the table structure
2162 (otherwise, a ValueError is raised). This includes NumPy
2163 structured arrays, lists of tuples or array records, and a
2164 string or Python buffer.
2166 Examples
2167 --------
2169 ::
2171 import tables as tb
2173 class Particle(tb.IsDescription):
2174 name = tb.StringCol(16, pos=1) # 16-character String
2175 lati = tb.IntCol(pos=2) # integer
2176 longi = tb.IntCol(pos=3) # integer
2177 pressure = tb.Float32Col(pos=4) # float (single-precision)
2178 temperature = tb.FloatCol(pos=5) # double (double-precision)
2180 fileh = tb.open_file('test4.h5', mode='w')
2181 table = fileh.create_table(fileh.root, 'table', Particle,
2182 "A table")
2184 # Append several rows in only one call
2185 table.append([("Particle: 10", 10, 0, 10 * 10, 10**2),
2186 ("Particle: 11", 11, -1, 11 * 11, 11**2),
2187 ("Particle: 12", 12, -2, 12 * 12, 12**2)])
2188 fileh.close()
2190 """
2192 self._g_check_open()
2193 self._v_file._check_writable()
2195 if not self._chunked:
2196 raise HDF5ExtError(
2197 "You cannot append rows to a non-chunked table.", h5bt=False)
2199 if (hasattr(rows, "dtype") and
2200 not self.description._v_is_nested and
2201 rows.dtype == self.dtype):
2202 # Shortcut for compliant arrays
2203 # (for some reason, not valid for nested types)
2204 wbufRA = rows
2205 else:
2206 # Try to convert the object into a recarray compliant with table
2207 try:
2208 iflavor = flavor_of(rows)
2209 if iflavor != 'python':
2210 rows = array_as_internal(rows, iflavor)
2211 # Works for Python structures and always copies the original,
2212 # so the resulting object is safe for in-place conversion.
2213 wbufRA = np.rec.array(rows, dtype=self._v_dtype)
2214 except Exception as exc: # XXX
2215 raise ValueError("rows parameter cannot be converted into a "
2216 "recarray object compliant with table '%s'. "
2217 "The error was: <%s>" % (str(self), exc))
2218 lenrows = wbufRA.shape[0]
2219 # If the number of rows to append is zero, don't do anything else
2220 if lenrows > 0:
2221 # Save write buffer to disk
2222 self._save_buffered_rows(wbufRA, lenrows)
2224 def _conv_to_recarr(self, obj):
2225 """Try to convert the object into a recarray."""
2227 try:
2228 iflavor = flavor_of(obj)
2229 if iflavor != 'python':
2230 obj = array_as_internal(obj, iflavor)
2231 if hasattr(obj, "shape") and obj.shape == ():
2232 # To allow conversion of scalars (void type) into arrays.
2233 # See http://projects.scipy.org/scipy/numpy/ticket/315
2234 # for discussion on how to pass buffers to constructors
2235 # See also http://projects.scipy.org/scipy/numpy/ticket/348
2236 recarr = np.array([obj], dtype=self._v_dtype)
2237 else:
2238 # Works for Python structures and always copies the original,
2239 # so the resulting object is safe for in-place conversion.
2240 recarr = np.rec.array(obj, dtype=self._v_dtype)
2241 except Exception as exc: # XXX
2242 raise ValueError("Object cannot be converted into a recarray "
2243 "object compliant with table format '%s'. "
2244 "The error was: <%s>" %
2245 (self.description._v_nested_descr, exc))
2247 return recarr
2249 def modify_coordinates(self, coords, rows):
2250 """Modify a series of rows in positions specified in coords.
2252 The values in the selected rows will be modified with the data given in
2253 rows. This method returns the number of rows modified.
2255 The possible values for the rows argument are the same as in
2256 :meth:`Table.append`.
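Examples
--------
A minimal sketch, assuming the same three-column layout used in the
:meth:`Table.__setitem__` examples::
rows = numpy.rec.array([[457,'db1',1.2],[6,'de2',1.3]],
formats='i4,a3,f8')
table.modify_coordinates([1,3], rows)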
2258 """
2260 if rows is None: # Nothing to be done
2261 return SizeType(0)
2263 # Convert the coordinates to something expected by HDF5
2264 coords = self._point_selection(coords)
2266 lcoords = len(coords)
2267 if len(rows) < lcoords:
2268 raise ValueError("The value has not enough elements to fill-in "
2269 "the specified range")
2271 # Convert rows into a recarray
2272 recarr = self._conv_to_recarr(rows)
2274 if len(coords) > 0:
2275 # Do the actual update of rows
2276 self._update_elements(lcoords, coords, recarr)
2278 # Redo the index if needed
2279 self._reindex(self.colpathnames)
2281 return SizeType(lcoords)
2283 def modify_rows(self, start=None, stop=None, step=None, rows=None):
2284 """Modify a series of rows in the slice [start:stop:step].
2286 The values in the selected rows will be modified with the data given in
2287 rows. This method returns the number of rows modified. Should the
2288 modification exceed the length of the table, an IndexError is raised
2289 before changing data.
2291 The possible values for the rows argument are the same as in
2292 :meth:`Table.append`.
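Examples
--------
A minimal sketch, assuming the same three-column layout used in the
:meth:`Table.__setitem__` examples::
rows = numpy.rec.array([[457,'db1',1.2],[6,'de2',1.3]],
formats='i4,a3,f8')
table.modify_rows(start=1, stop=5, step=2, rows=rows)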
2294 """
2296 if step is None:
2297 step = 1
2298 if rows is None: # Nothing to be done
2299 return SizeType(0)
2300 if start is None:
2301 start = 0
2303 if start < 0:
2304 raise ValueError("'start' must have a positive value.")
2305 if step < 1:
2306 raise ValueError(
2307 "'step' must have a value greater or equal than 1.")
2308 if stop is None:
2309 # compute the stop value. start + len(rows)*step does not work
2310 stop = start + (len(rows) - 1) * step + 1
2312 (start, stop, step) = self._process_range(start, stop, step)
2313 if stop > self.nrows:
2314 raise IndexError("This modification will exceed the length of "
2315 "the table. Giving up.")
2316 # Compute the number of rows to read.
2317 nrows = len(range(start, stop, step))
2318 if len(rows) != nrows:
2319 raise ValueError("The value has different elements than the "
2320 "specified range")
2322 # Convert rows into a recarray
2323 recarr = self._conv_to_recarr(rows)
2325 lenrows = len(recarr)
2326 if start + lenrows > self.nrows:
2327 raise IndexError("This modification will exceed the length of the "
2328 "table. Giving up.")
2330 # Do the actual update
2331 self._update_records(start, stop, step, recarr)
2333 # Redo the index if needed
2334 self._reindex(self.colpathnames)
2336 return SizeType(lenrows)
2338 def modify_column(self, start=None, stop=None, step=None,
2339 column=None, colname=None):
2340 """Modify one single column in the row slice [start:stop:step].
2342 The colname argument specifies the name of the column in the
2343 table to be modified with the data given in column. This
2344 method returns the number of rows modified. Should the
2345 modification exceed the length of the table, an IndexError is
2346 raised before changing data.
2348 The *column* argument may be any object which can be converted
2349 to a (record) array compliant with the structure of the column
2350 to be modified (otherwise, a ValueError is raised). This
2351 includes NumPy (record) arrays, lists of scalars, tuples or
2352 array records, and a string or Python buffer.
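Examples
--------
A minimal sketch, assuming a float column named pressure::
table.modify_column(start=0, stop=3, colname='pressure',
column=[1.0, 2.0, 3.0])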
2354 """
2355 if step is None:
2356 step = 1
2357 if not isinstance(colname, str):
2358 raise TypeError("The 'colname' parameter must be a string.")
2359 self._v_file._check_writable()
2361 if column is None: # Nothing to be done
2362 return SizeType(0)
2363 if start is None:
2364 start = 0
2366 if start < 0:
2367 raise ValueError("'start' must have a positive value.")
2368 if step < 1:
2369 raise ValueError(
2370 "'step' must have a value greater or equal than 1.")
2371 # Get the column format to be modified:
2372 objcol = self._get_column_instance(colname)
2373 descr = [objcol._v_parent._v_nested_descr[objcol._v_pos]]
2374 # Try to convert the column object into a NumPy ndarray
2375 try:
2376 # If the column is a recarray (or kind of), convert into ndarray
2377 if hasattr(column, 'dtype') and column.dtype.kind == 'V':
2378 column = np.rec.array(column, dtype=descr).field(0)
2379 else:
2380 # Make sure the result is always a *copy* of the original,
2381 # so the resulting object is safe for in-place conversion.
2382 iflavor = flavor_of(column)
2383 column = array_as_internal(column, iflavor)
2384 except Exception as exc: # XXX
2385 raise ValueError("column parameter cannot be converted into a "
2386 "ndarray object compliant with specified column "
2387 "'%s'. The error was: <%s>" % (str(column), exc))
2389 # Get rid of singleton (length-1) dimensions
2390 column = column.squeeze()
2391 if column.shape == ():
2392 # Oops, stripped off too many dimensions
2393 column.shape = (1,)
2395 if stop is None:
2396 # compute the stop value. start + len(rows)*step does not work
2397 stop = start + (len(column) - 1) * step + 1
2398 (start, stop, step) = self._process_range(start, stop, step)
2399 if stop > self.nrows:
2400 raise IndexError("This modification will exceed the length of "
2401 "the table. Giving up.")
2402 # Compute the number of rows to read.
2403 nrows = len(range(start, stop, step))
2404 if len(column) < nrows:
2405 raise ValueError("The value has not enough elements to fill-in "
2406 "the specified range")
2407 # Now, read the original values:
2408 mod_recarr = self._read(start, stop, step)
2409 # Modify the appropriate column in the original recarray
2410 mod_col = get_nested_field(mod_recarr, colname)
2411 mod_col[:] = column
2412 # Save the modified rows in the table
2413 self._update_records(start, stop, step, mod_recarr)
2414 # Redo the index if needed
2415 self._reindex([colname])
2417 return SizeType(nrows)
2419 def modify_columns(self, start=None, stop=None, step=None,
2420 columns=None, names=None):
2421 """Modify a series of columns in the row slice [start:stop:step].
2423 The names argument specifies the names of the columns in the
2424 table to be modified with the data given in columns. This
2425 method returns the number of rows modified. Should the
2426 modification exceed the length of the table, an IndexError
2427 is raised before changing data.
2429 The columns argument may be any object which can be converted
2430 to a structured array compliant with the structure of the
2431 columns to be modified (otherwise, a ValueError is raised).
2432 This includes NumPy structured arrays, lists of tuples or array
2433 records, and a string or Python buffer.
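Examples
--------
A minimal sketch, assuming an int column var1 and a string column
var2::
table.modify_columns(start=0, columns=[[456, 457], ['db1', 'db2']],
names=['var1', 'var2'])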
2435 """
2436 if step is None:
2437 step = 1
2438 if type(names) not in (list, tuple):
2439 raise TypeError("The 'names' parameter must be a list of strings.")
2441 if columns is None: # Nothing to be done
2442 return SizeType(0)
2443 if start is None:
2444 start = 0
2445 if start < 0:
2446 raise ValueError("'start' must have a positive value.")
2447 if step < 1:
2448 raise ValueError("'step' must have a value greater or "
2449 "equal than 1.")
2450 descr = []
2451 for colname in names:
2452 objcol = self._get_column_instance(colname)
2453 descr.append(objcol._v_parent._v_nested_descr[objcol._v_pos])
2454 # descr.append(objcol._v_parent._v_dtype[objcol._v_pos])
2455 # Try to convert the columns object into a recarray
2456 try:
2457 # Make sure the result is always a *copy* of the original,
2458 # so the resulting object is safe for in-place conversion.
2459 iflavor = flavor_of(columns)
2460 if iflavor != 'python':
2461 columns = array_as_internal(columns, iflavor)
2462 recarray = np.rec.array(columns, dtype=descr)
2463 else:
2464 recarray = np.rec.fromarrays(columns, dtype=descr)
2465 except Exception as exc: # XXX
2466 raise ValueError("columns parameter cannot be converted into a "
2467 "recarray object compliant with table '%s'. "
2468 "The error was: <%s>" % (str(self), exc))
2470 if stop is None:
2471 # compute the stop value. start + len(rows)*step does not work
2472 stop = start + (len(recarray) - 1) * step + 1
2473 (start, stop, step) = self._process_range(start, stop, step)
2474 if stop > self.nrows:
2475 raise IndexError("This modification will exceed the length of "
2476 "the table. Giving up.")
2477 # Compute the number of rows to read.
2478 nrows = len(range(start, stop, step))
2479 if len(recarray) < nrows:
2480 raise ValueError("The value has not enough elements to fill-in "
2481 "the specified range")
2482 # Now, read the original values:
2483 mod_recarr = self._read(start, stop, step)
2484 # Modify the appropriate columns in the original recarray
2485 for i, name in enumerate(recarray.dtype.names):
2486 mod_col = get_nested_field(mod_recarr, names[i])
2487 mod_col[:] = recarray[name].squeeze()
2488 # Save the modified rows in the table
2489 self._update_records(start, stop, step, mod_recarr)
2490 # Redo the index if needed
2491 self._reindex(names)
2493 return SizeType(nrows)
2495 def flush_rows_to_index(self, _lastrow=True):
2496 """Add remaining rows in buffers to non-dirty indexes.
2498 This can be useful when you have chosen non-automatic indexing
2499 for the table (see the :attr:`Table.autoindex` property in
2500 :class:`Table`) and you want to update the indexes on it.
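Examples
--------
A minimal sketch of the manual-indexing workflow; new_rows stands for
an assumed sequence of table-compatible rows::
table.autoindex = False
table.append(new_rows)
table.flush_rows_to_index() # bring the indexes up to date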
2502 """
2504 rowsadded = 0
2505 if self.indexed:
2506 # Update the number of unsaved indexed rows
2507 start = self._indexedrows
2508 nrows = self._unsaved_indexedrows
2509 for (colname, colindexed) in self.colindexed.items():
2510 if colindexed:
2511 col = self.cols._g_col(colname)
2512 if nrows > 0 and not col.index.dirty:
2513 rowsadded = self._add_rows_to_index(
2514 colname, start, nrows, _lastrow, update=True)
2515 self._unsaved_indexedrows -= rowsadded
2516 self._indexedrows += rowsadded
2517 return rowsadded
2519 def _add_rows_to_index(self, colname, start, nrows, lastrow, update):
2520 """Add more elements to the existing index."""
2522 # This method really belongs to Column, but since it makes extensive
2523 # use of the table, it gets dangerous when closing the file, since the
2524 # column may be accessing a table which is being destroyed.
2525 index = self.cols._g_col(colname).index
2526 slicesize = index.slicesize
2527 # The next loop does not rely on xrange so that it can
2528 # deal with long ints (i.e. more than 32-bit integers)
2529 # This allows indexing columns with more than 2**31 rows
2530 # F. Alted 2005-05-09
2531 startLR = index.sorted.nrows * slicesize
2532 indexedrows = startLR - start
2533 stop = start + nrows - slicesize + 1
2534 while startLR < stop:
2535 index.append(
2536 [self._read(startLR, startLR + slicesize, 1, colname)],
2537 update=update)
2538 indexedrows += slicesize
2539 startLR += slicesize
2540 # index the remaining rows in last row
2541 if lastrow and startLR < self.nrows:
2542 index.append_last_row(
2543 [self._read(startLR, self.nrows, 1, colname)],
2544 update=update)
2545 indexedrows += self.nrows - startLR
2546 return indexedrows
2548 def remove_rows(self, start=None, stop=None, step=None):
2549 """Remove a range of rows in the table.
2551 If only start is supplied, that row and all following will be deleted.
2552 If a range is supplied, i.e. both the start and stop parameters are
2553 passed, all the rows in the range are removed.
2555 .. versionchanged:: 3.0
2556 The start, stop and step parameters now behave like in slice.
2558 .. seealso:: remove_row()
2560 Parameters
2561 ----------
2562 start : int
2563 Sets the starting row to be removed. It accepts negative values
2564 meaning that the count starts from the end. A value of 0 means the
2565 first row.
2566 stop : int
2567 Sets the last row to be removed to stop-1, i.e. the end point is
2568 omitted (in the Python range() tradition). Negative values are also
2569 accepted. If None all rows after start will be removed.
2570 step : int
2571 The step size between rows to remove.
2573 .. versionadded:: 3.0
2575 Examples
2576 --------
2578 Removing rows from 5 to 10 (excluded)::
2580 t.remove_rows(5, 10)
2582 Removing all rows starting from the 10th::
2584 t.remove_rows(10)
2586 Removing the 6th row::
2588 t.remove_rows(6, 7)
2590 .. note::
2592 Removing a single row can be done using the specific
2593 :meth:`remove_row` method.
2595 """
2597 (start, stop, step) = self._process_range(start, stop, step)
2598 nrows = self._remove_rows(start, stop, step)
2599 # remove_rows is an index-invalidating operation
2600 self._reindex(self.colpathnames)
2602 return SizeType(nrows)
2604 def remove_row(self, n):
2605 """Removes a row from the table.
2607 Parameters
2608 ----------
2609 n : int
2610 The index of the row to remove.
2613 .. versionadded:: 3.0
2615 Examples
2616 --------
2618 Remove row 15::
2620 table.remove_row(15)
2622 Which is equivalent to::
2624 table.remove_rows(15, 16)
2626 .. warning::
2628 This is not equivalent to::
2630 table.remove_rows(15)
2632 """
2634 self.remove_rows(start=n, stop=n + 1)
2636 def _g_update_dependent(self):
2637 super()._g_update_dependent()
2639 # Update the new path in columns
2640 self.cols._g_update_table_location(self)
2642 # Update the new path in the Row instance, if cached. Fixes #224.
2643 if 'row' in self.__dict__:
2644 self.__dict__['row'] = tableextension.Row(self)
2646 def _g_move(self, newparent, newname):
2647 """Move this node in the hierarchy.
2649 This overloads the Node._g_move() method.
2651 """
2653 itgpathname = _index_pathname_of(self)
2655 # First, move the table to the new location.
2656 super()._g_move(newparent, newname)
2658 # Then move the associated index group (if any).
2659 try:
2660 itgroup = self._v_file._get_node(itgpathname)
2661 except NoSuchNodeError:
2662 pass
2663 else:
2664 newigroup = self._v_parent
2665 newiname = _index_name_of(self)
2666 itgroup._g_move(newigroup, newiname)
2668 def _g_remove(self, recursive=False, force=False):
2669 # Remove the associated index group (if any).
2670 itgpathname = _index_pathname_of(self)
2671 try:
2672 itgroup = self._v_file._get_node(itgpathname)
2673 except NoSuchNodeError:
2674 pass
2675 else:
2676 itgroup._f_remove(recursive=True)
2677 self.indexed = False # there are no more indexes
2679 # Remove the leaf itself from the hierarchy.
2680 super()._g_remove(recursive, force)
2682 def _set_column_indexing(self, colpathname, indexed):
2683 """Mark the referred column as indexed or non-indexed."""
2685 colindexed = self.colindexed
2686 isindexed, wasindexed = bool(indexed), colindexed[colpathname]
2687 if isindexed == wasindexed:
2688 return # indexing state is unchanged
2690 # Changing the set of indexed columns invalidates the condition cache
2691 self._condition_cache.clear()
2692 colindexed[colpathname] = isindexed
2693 self.indexed = any(colindexed.values()) # logical OR over all columns
2695 def _mark_columns_as_dirty(self, colnames):
2696 """Mark column indexes in `colnames` as dirty."""
2698 assert len(colnames) > 0
2699 if self.indexed:
2700 colindexed, cols = self.colindexed, self.cols
2701 # Mark the proper indexes as dirty
2702 for colname in colnames:
2703 if colindexed[colname]:
2704 col = cols._g_col(colname)
2705 col.index.dirty = True
2707 def _reindex(self, colnames):
2708 """Re-index columns in `colnames` if automatic indexing is true."""
2710 if self.indexed:
2711 colindexed, cols = self.colindexed, self.cols
2712 colstoindex = []
2713 # Mark the proper indexes as dirty
2714 for colname in colnames:
2715 if colindexed[colname]:
2716 col = cols._g_col(colname)
2717 col.index.dirty = True
2718 colstoindex.append(colname)
2719 # Now, re-index the dirty ones
2720 if self.autoindex and colstoindex:
2721 self._do_reindex(dirty=True)
2722 # The table caches for indexed queries are dirty now
2723 self._dirtycache = True
2725 def _do_reindex(self, dirty):
2726 """Common code for `reindex()` and `reindex_dirty()`."""
2728 indexedrows = 0
2729 for (colname, colindexed) in self.colindexed.items():
2730 if colindexed:
2731 indexcol = self.cols._g_col(colname)
2732 indexedrows = indexcol._do_reindex(dirty)
2733 # Update counters in case some column has been updated
2734 if indexedrows > 0:
2735 self._indexedrows = indexedrows
2736 self._unsaved_indexedrows = self.nrows - indexedrows
2738 return SizeType(indexedrows)
2740 def reindex(self):
2741 """Recompute all the existing indexes in the table.
2743 This can be useful when you suspect that, for any reason, the
2744 index information for columns is no longer valid and want to
2745 rebuild the indexes on it.
2747 """
2749 self._do_reindex(dirty=False)
2751 def reindex_dirty(self):
2752 """Recompute the existing indexes in table, *if* they are dirty.
2754 This can be useful when you have set :attr:`Table.autoindex`
2755 (see :class:`Table`) to false for the table and you want to
2756 update the indexes after an index-invalidating operation
2757 (:meth:`Table.remove_rows`, for example).
2759 """
2761 self._do_reindex(dirty=True)
2763 def _g_copy_rows(self, object, start, stop, step, sortby, checkCSI):
2764 """Copy rows from self to object"""
2765 if sortby is None:
2766 self._g_copy_rows_optim(object, start, stop, step)
2767 return
2768 lenbuf = self.nrowsinbuf
2769 absstep = step
2770 if step < 0:
2771 absstep = -step
2772 start, stop = stop + 1, start + 1
2773 if sortby is not None:
2774 index = self._check_sortby_csi(sortby, checkCSI)
2775 for start2 in range(start, stop, absstep * lenbuf):
2776 stop2 = start2 + absstep * lenbuf
2777 if stop2 > stop:
2778 stop2 = stop
2779 # The next 'if' is not needed, but it doesn't hurt either
2780 if sortby is None:
2781 rows = self[start2:stop2:step]
2782 else:
2783 coords = index[start2:stop2:step]
2784 rows = self.read_coordinates(coords)
2785 # Save the records on disk
2786 object.append(rows)
2787 object.flush()
2789 def _g_copy_rows_optim(self, object, start, stop, step):
2790 """Copy rows from self to object (optimized version)"""
2792 nrowsinbuf = self.nrowsinbuf
2793 object._open_append(self._v_iobuf)
2794 nrowsdest = object.nrows
2795 for start2 in range(start, stop, step * nrowsinbuf):
2796 # Save the records on disk
2797 stop2 = start2 + step * nrowsinbuf
2798 if stop2 > stop:
2799 stop2 = stop
2800 # Optimized version (it saves some conversions)
2801 nrows = ((stop2 - start2 - 1) // step) + 1
2802 self.row._fill_col(self._v_iobuf, start2, stop2, step, None)
2803 # The output buffer is created anew,
2804 # so the operation is safe for in-place conversion.
2805 object._append_records(nrows)
2806 nrowsdest += nrows
2807 object._close_append()
2809 def _g_prop_indexes(self, other):
2810 """Generate index in `other` table for every indexed column here."""
2812 oldcols, newcols = self.colinstances, other.colinstances
2813 for colname in newcols:
2814 if isinstance(oldcols[colname], Column):
2815 oldcolindexed = oldcols[colname].is_indexed
2816 if oldcolindexed:
2817 oldcolindex = oldcols[colname].index
2818 newcol = newcols[colname]
2819 newcol.create_index(
2820 kind=oldcolindex.kind, optlevel=oldcolindex.optlevel,
2821 filters=oldcolindex.filters, tmp_dir=None)
2823 def _g_copy_with_stats(self, group, name, start, stop, step,
2824 title, filters, chunkshape, _log, **kwargs):
2825 """Private part of Leaf.copy() for each kind of leaf."""
2827 # Get the private args for the Table flavor of copy()
2828 sortby = kwargs.pop('sortby', None)
2829 propindexes = kwargs.pop('propindexes', False)
2830 checkCSI = kwargs.pop('checkCSI', False)
2831 # Compute the correct indices.
2832 (start, stop, step) = self._process_range_read(
2833 start, stop, step, warn_negstep=sortby is None)
2834 # And the number of final rows
2835 nrows = len(range(start, stop, step))
2836 # Create the new table and copy the selected data.
2837 newtable = Table(group, name, self.description, title=title,
2838 filters=filters, expectedrows=nrows,
2839 chunkshape=chunkshape,
2840 _log=_log)
2841 self._g_copy_rows(newtable, start, stop, step, sortby, checkCSI)
2842 nbytes = newtable.nrows * newtable.rowsize
2843 # Generate equivalent indexes in the new table, if required.
2844 if propindexes and self.indexed:
2845 self._g_prop_indexes(newtable)
2846 return (newtable, nbytes)
2848 # This overloading of copy is needed here in order to document
2849 # the additional keywords for the Table case.
2850 def copy(self, newparent=None, newname=None, overwrite=False,
2851 createparents=False, **kwargs):
2852 """Copy this table and return the new one.
2854 This method has the behavior and keywords described in
2855 :meth:`Leaf.copy`. Moreover, it recognises the following additional
2856 keyword arguments.
2858 Parameters
2859 ----------
2860 sortby
2861 If specified, and sortby corresponds to a column with an index,
2862 then the copy will be sorted by this index. If you want to ensure
2863 a fully sorted order, the index must be a CSI one. A reverse
2864 sorted copy can be achieved by specifying a negative value for the
2865 step keyword. If sortby is omitted or None, the original table
2866 order is used.
2867 checkCSI
2868 If true and a CSI index does not exist for the sortby column, an
2869 error will be raised. If false (the default), it does nothing.
2870 You can use this flag in order to explicitly check for the
2871 existence of a CSI index.
2872 propindexes
2873 If true, the existing indexes in the source table are propagated
2874 (created) to the new one. If false (the default), the indexes are
2875 not propagated.
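Examples
--------
A minimal sketch, assuming a column named pressure with a CSI index::
newtable = table.copy(newname='tablecopy', sortby='pressure',
checkCSI=True, propindexes=True)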
2877 """
2879 return super().copy(
2880 newparent, newname, overwrite, createparents, **kwargs)
2882 def flush(self):
2883 """Flush the table buffers."""
2885 if self._v_file._iswritable():
2886 # Flush rows that remain to be appended
2887 if 'row' in self.__dict__:
2888 self.row._flush_buffered_rows()
2889 if self.indexed and self.autoindex:
2890 # Flush any unindexed row
2891 rowsadded = self.flush_rows_to_index(_lastrow=True)
2892 assert rowsadded <= 0 or self._indexedrows == self.nrows, \
2893 ("internal error: the number of indexed rows (%d) "
2894 "and rows in the table (%d) is not equal; "
2895 "please report this to the authors."
2896 % (self._indexedrows, self.nrows))
2897 if self._dirtyindexes:
2898 # Finally, re-index any dirty column
2899 self.reindex_dirty()
2901 super().flush()
2903 def _g_pre_kill_hook(self):
2904 """Code to be called before killing the node."""
2906 # Flush the buffers before to clean-up them
2907 # self.flush()
2908 # It seems that flushing during the __del__ phase is a sure recipe for
2909 # bringing all kind of problems:
2910 # 1. Illegal Instruction
2911 # 2. Malloc(): trying to call free() twice
2912 # 3. Bus Error
2913 # 4. Segmentation fault
2914 # So, the best would be doing *nothing* at all in this __del__ phase.
2915 # As a consequence, the I/O will not be cleaned until a call to
2916 # Table.flush() would be done. This could lead to a potentially large
2917 # memory consumption.
2918 # NOTE: Users should make a call to Table.flush() whenever they
2919 # have finished working with their table.
2920 # I've added a Performance warning in order to compel the user to
2921 # call self.flush() before the table is being preempted.
2922 # F. Alted 2006-08-03
2923 if (('row' in self.__dict__ and self.row._get_unsaved_nrows() > 0) or
2924 (self.indexed and self.autoindex and
2925 (self._unsaved_indexedrows > 0 or self._dirtyindexes))):
2926 warnings.warn(("table ``%s`` is being preempted from alive nodes "
2927 "without its buffers being flushed or with some "
2928 "index being dirty. This may lead to very "
2929 "ineficient use of resources and even to fatal "
2930 "errors in certain situations. Please do a call "
2931 "to the .flush() or .reindex_dirty() methods on "
2932 "this table before start using other nodes.")
2933 % (self._v_pathname), PerformanceWarning)
2934 # Get rid of the IO buffers (if they have been created at all)
2935 mydict = self.__dict__
2936 if '_v_iobuf' in mydict:
2937 del mydict['_v_iobuf']
2938 if '_v_wdflts' in mydict:
2939 del mydict['_v_wdflts']
2941 def _f_close(self, flush=True):
2942 if not self._v_isopen:
2943 return # the node is already closed
2945 # .. note::
2946 #
2947 # As long as ``Table`` objects access their indices on closing,
2948 # ``File.close()`` will need to make *two separate passes*
2949 # to first close ``Table`` objects and then ``Index`` hierarchies.
2950 #
2952 # Flush right now so the row object does not get in the middle.
2953 if flush:
2954 self.flush()
2956 # Some warnings can be issued after calling `self._g_set_location()`
2957 # in `self.__init__()`. If warnings are turned into exceptions,
2958 # `self._g_post_init_hook` may not be called and `self.cols` not set.
2959 # One example of this is
2960 # ``test_create.createTestCase.test05_maxFieldsExceeded()``.
2961 cols = self.cols
2962 if cols is not None:
2963 cols._g_close()
2965 # Clean address cache
2966 self._clean_chunk_addrs()
2968 # Close myself as a leaf.
2969 super()._f_close(False)
2971 def __repr__(self):
2972 """This provides column metainfo in addition to standard __str__"""
2974 if self.indexed:
2975 format = """\
2976%s
2977 description := %r
2978 byteorder := %r
2979 chunkshape := %r
2980 autoindex := %r
2981 colindexes := %r"""
2982 return format % (str(self), self.description, self.byteorder,
2983 self.chunkshape, self.autoindex,
2984 _ColIndexes(self.colindexes))
2985 else:
2986 return """\
2987%s
2988 description := %r
2989 byteorder := %r
2990 chunkshape := %r""" % \
2991 (str(self), self.description, self.byteorder, self.chunkshape)
2994class Cols:
2995 """Container for columns in a table or nested column.
2997 This class is used as an *accessor* to the columns in a table or nested
2998 column. It supports the *natural naming* convention, so that you can
2999 access the different columns as attributes which lead to Column instances
3000 (for non-nested columns) or other Cols instances (for nested columns).
3002 For instance, if table.cols is a Cols instance with a column named col1
3003 under it, the latter can be accessed as table.cols.col1. If col1 is nested
3004 and contains a col2 column, this can be accessed as table.cols.col1.col2
3005 and so on. Because of natural naming, the names of members start with
3006 special prefixes, like in the Group class (see :ref:`GroupClassDescr`).
3008 Like the Column class (see :ref:`ColumnClassDescr`), Cols supports item
3009 access to read and write ranges of values in the table or nested column.
3012 .. rubric:: Cols attributes
3014 .. attribute:: _v_colnames
3016 A list of the names of the columns hanging directly
3017 from the associated table or nested column. The order of
3018 the names matches the order of their respective columns in
3019 the containing table.
3021 .. attribute:: _v_colpathnames
3023 A list of the pathnames of all the columns under the
3024 associated table or nested column (in preorder). If it does
3025 not contain nested columns, this is exactly the same as the
3026 :attr:`Cols._v_colnames` attribute.
3028 .. attribute:: _v_desc
3030 The associated Description instance (see
3031 :ref:`DescriptionClassDescr`).
3033 """
3035 @property
3036 def _v_table(self):
3037 """The parent Table instance (see :ref:`TableClassDescr`)."""
3038 return self._v__tableFile._get_node(self._v__tablePath)
3040 def __init__(self, table, desc):
3041 myDict = self.__dict__
3042 myDict['_v__tableFile'] = table._v_file
3043 myDict['_v__tablePath'] = table._v_pathname
3044 myDict['_v_desc'] = desc
3045 myDict['_v_colnames'] = desc._v_names
3046 myDict['_v_colpathnames'] = table.description._v_pathnames
3047 # Put the column in the local dictionary
3048 for name in desc._v_names:
3049 if name in desc._v_types:
3050 myDict[name] = Column(table, name, desc)
3051 else:
3052 myDict[name] = Cols(table, desc._v_colobjects[name])
3054 def _g_update_table_location(self, table):
3055 """Updates the location information about the associated `table`."""
3057 myDict = self.__dict__
3058 myDict['_v__tableFile'] = table._v_file
3059 myDict['_v__tablePath'] = table._v_pathname
3061 # Update the locations in individual columns.
3062 for colname in self._v_colnames:
3063 myDict[colname]._g_update_table_location(table)
3065 def __len__(self):
3066 """Get the number of top level columns in table."""
3068 return len(self._v_colnames)
3070 def _f_col(self, colname):
3071 """Get an accessor to the column colname.
3073 This method returns a Column instance (see :ref:`ColumnClassDescr`) if
3074 the requested column is not nested, and a Cols instance (see
3075 :ref:`ColsClassDescr`) if it is. You may use full column pathnames in
3076 colname.
3078 Calling cols._f_col('col1/col2') is equivalent to using cols.col1.col2.
3079 However, the first syntax is better suited to programmatic use. It is
3080 also better if you want to access columns with names that are not valid
3081 Python identifiers.
3083 """
3085 if not isinstance(colname, str):
3086 raise TypeError("Parameter can only be an string. You passed "
3087 "object: %s" % colname)
3088 if ((colname.find('/') > -1 and
3089 colname not in self._v_colpathnames) and
3090 colname not in self._v_colnames):
3091 raise KeyError(("Cols accessor ``%s.cols%s`` does not have a "
3092 "column named ``%s``")
3093 % (self._v__tablePath, self._v_desc._v_pathname,
3094 colname))
3096 return self._g_col(colname)
3098 def _g_col(self, colname):
3099 """Like `self._f_col()` but it does not check arguments."""
3101 # Get the Column or Description object
3102 inames = colname.split('/')
3103 cols = self
3104 for iname in inames:
3105 cols = cols.__dict__[iname]
3106 return cols
3108 def __getitem__(self, key):
3109 """Get a row or a range of rows from a table or nested column.
3111 If key argument is an integer, the corresponding nested type row is
3112 returned as a record of the current flavor. If key is a slice, the
3113 range of rows determined by it is returned as a structured array of the
3114 current flavor.
3116 Examples
3117 --------
3119 ::
3121 record = table.cols[4] # equivalent to table[4]
3122 recarray = table.cols.Info[4:1000:2]
3124 Those statements are equivalent to::
3126 nrecord = table.read(start=4)[0]
3127 nrecarray = table.read(start=4, stop=1000, step=2).field('Info')
3129 Here you can see how a mix of natural naming, indexing and slicing can
3130 be used as shorthands for the :meth:`Table.read` method.
3132 """
3133 table = self._v_table
3134 nrows = table.nrows
3135 if is_idx(key):
3136 key = operator.index(key)
3138 # Index out of range protection
3139 if key >= nrows:
3140 raise IndexError("Index out of range")
3141 if key < 0:
3142 # To support negative values
3143 key += nrows
3144 (start, stop, step) = table._process_range(key, key + 1, 1)
3145 colgroup = self._v_desc._v_pathname
3146 if colgroup == "": # The root group
3147 return table.read(start, stop, step)[0]
3148 else:
3149 crecord = table.read(start, stop, step)[0]
3150 return crecord[colgroup]
3151 elif isinstance(key, slice):
3152 (start, stop, step) = table._process_range(
3153 key.start, key.stop, key.step)
3154 colgroup = self._v_desc._v_pathname
3155 if colgroup == "": # The root group
3156 return table.read(start, stop, step)
3157 else:
3158 crecarray = table.read(start, stop, step)
3159 if hasattr(crecarray, "field"):
3160 return crecarray.field(colgroup) # RecArray case
3161 else:
3162 return get_nested_field(crecarray, colgroup) # numpy case
3163 else:
3164 raise TypeError(f"invalid index or slice: {key!r}")
3166 def __setitem__(self, key, value):
3167 """Set a row or a range of rows in a table or nested column.
3169 If key argument is an integer, the corresponding row is set to
3170 value. If key is a slice, the range of rows determined by it is set to
3171 value.
3173 Examples
3174 --------
3176 ::
3178 table.cols[4] = record
3179 table.cols.Info[4:1000:2] = recarray
3181 Those statements are equivalent to::
3183 table.modify_rows(4, rows=record)
3184 table.modify_column(4, 1000, 2, colname='Info', column=recarray)
3186 Here you can see how a mix of natural naming, indexing and slicing
3187 can be used as shorthands for the :meth:`Table.modify_rows` and
3188 :meth:`Table.modify_column` methods.
3190 """
3192 table = self._v_table
3193 nrows = table.nrows
3194 if is_idx(key):
3195 key = operator.index(key)
3197 # Index out of range protection
3198 if key >= nrows:
3199 raise IndexError("Index out of range")
3200 if key < 0:
3201 # To support negative values
3202 key += nrows
3203 (start, stop, step) = table._process_range(key, key + 1, 1)
3204 elif isinstance(key, slice):
3205 (start, stop, step) = table._process_range(
3206 key.start, key.stop, key.step)
3207 else:
3208 raise TypeError(f"invalid index or slice: {key!r}")
3210 # Actually modify the correct columns
3211 colgroup = self._v_desc._v_pathname
3212 if colgroup == "": # The root group
3213 table.modify_rows(start, stop, step, rows=value)
3214 else:
3215 table.modify_column(
3216 start, stop, step, colname=colgroup, column=value)
3218 def _g_close(self):
3219 # First, close the columns (i.e. any open indexes)
3220 for col in self._v_colnames:
3221 colobj = self._g_col(col)
3222 if isinstance(colobj, Column):
3223 colobj.close()
3224 # Delete the reference to column
3225 del self.__dict__[col]
3226 else:
3227 colobj._g_close()
3229 self.__dict__.clear()
3231 def __str__(self):
3232 """The string representation for this object."""
3234 # The pathname
3235 descpathname = self._v_desc._v_pathname
3236 if descpathname:
3237 descpathname = "." + descpathname
3238 return (f"{self._v__tablePath}.cols{descpathname} "
3239 f"({self.__class__.__name__}), "
3240 f"{len(self._v_colnames)} columns")
3242 def __repr__(self):
3243 """A detailed string representation for this object."""
3245 lines = [f'{self!s}']
3246 for name in self._v_colnames:
3247 # Get this class name
3248 classname = getattr(self, name).__class__.__name__
3249 # The type
3250 if name in self._v_desc._v_dtypes:
3251 tcol = self._v_desc._v_dtypes[name]
3252 # The shape for this column
3253 shape = (self._v_table.nrows,) + \
3254 self._v_desc._v_dtypes[name].shape
3255 else:
3256 tcol = "Description"
3257 # Description doesn't have a shape currently
3258 shape = ()
3259 lines.append(f" {name} ({classname}{shape}, {tcol})")
3260 return '\n'.join(lines) + '\n'
3263class Column:
3264 """Accessor for a non-nested column in a table.
3266 Each instance of this class is associated with one *non-nested* column of a
3267 table. These instances are mainly used to read and write data from the
3268 table columns using item access (like the Cols class - see
3269 :ref:`ColsClassDescr`), but there are a few other associated methods to
3270 deal with indexes.
3272 .. rubric:: Column attributes
3274 .. attribute:: descr
3276 The Description (see :ref:`DescriptionClassDescr`) instance of the
3277 parent table or nested column.
3279 .. attribute:: name
3281 The name of the associated column.
3283 .. attribute:: pathname
3285 The complete pathname of the associated column (the same as
3286 Column.name if the column is not inside a nested column).
3288 Parameters
3289 ----------
3290 table
3291 The parent table instance
3292 name
3293 The name of the column that is associated with this object
3294 descr
3295 The parent description object
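Examples
--------

A minimal usage sketch; the file, table and column names here are
hypothetical::

    import tables as tb

    with tb.open_file('example.h5') as h5f:
        table = h5f.root.readout
        col = table.cols.pressure     # a Column instance
        print(col.dtype, col.shape)   # NumPy dtype and (nrows, ...) shape
        first = col[0]                # read a single element
        block = col[10:20]            # read a range of rows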
3297 """
3299 @lazyattr
3300 def dtype(self):
3301 """The NumPy dtype that most closely matches this column."""
3303 return self.descr._v_dtypes[self.name].base # Get rid of shape info
3305 @lazyattr
3306 def type(self):
3307 """The PyTables type of the column (a string)."""
3309 return self.descr._v_types[self.name]
3311 @property
3312 def table(self):
3313 """The parent Table instance (see :ref:`TableClassDescr`)."""
3314 return self._table_file._get_node(self._table_path)
3316 @property
3317 def index(self):
3318 """The Index instance (see :ref:`IndexClassDescr`) associated with this
3319 column (None if the column is not indexed)."""
3320 indexPath = _index_pathname_of_column_(self._table_path, self.pathname)
3321 try:
3322 index = self._table_file._get_node(indexPath)
3323 except NodeError:
3324 index = None # The column is not indexed
3325 return index
3327 @lazyattr
3328 def _itemtype(self):
3329 return self.descr._v_dtypes[self.name]
3331 @property
3332 def shape(self):
3333 """The shape of this column."""
3334 return (self.table.nrows,) + self.descr._v_dtypes[self.name].shape
3336 @property
3337 def is_indexed(self):
3338 """True if the column is indexed, False otherwise."""
3339 return self.index is not None
3344 @property
3345 def maindim(self):
3346 """The dimension along which iterators work. Its value is 0 (i.e. the
3347 first dimension)."""
3348 return 0
3350 def __init__(self, table, name, descr):
3351 self._table_file = table._v_file
3352 self._table_path = table._v_pathname
3353 self.name = name
3354 """The name of the associated column."""
3355 self.pathname = descr._v_colobjects[name]._v_pathname
3356 """The complete pathname of the associated column (the same as
3357 Column.name if the column is not inside a nested column)."""
3358 self.descr = descr
3359 """The Description (see :ref:`DescriptionClassDescr`) instance of the
3360 parent table or nested column."""
3362 def _g_update_table_location(self, table):
3363 """Update the location information for the associated `table`."""
3365 self._table_file = table._v_file
3366 self._table_path = table._v_pathname
3368 def __len__(self):
3369 """Get the number of elements in the column.
3371 This is equal to the number of rows in the parent table.
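Examples
--------

For instance (the column name is illustrative)::

    assert len(table.cols.energy) == table.nrows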
3373 """
3375 return self.table.nrows
3377 def __getitem__(self, key):
3378 """Get a row or a range of rows from a column.
3380 If the key argument is an integer, the corresponding column element
3381 is returned as an object of the current flavor. If key is a slice, the
3382 range of elements determined by it is returned as an array of the
3383 current flavor.
3385 Examples
3386 --------
3388 ::
3390 print("Column handlers:")
3391 for name in table.colnames:
3392 print(table.cols._f_col(name))
3393 print("Select table.cols.name[1]-->", table.cols.name[1])
3394 print("Select table.cols.name[1:2]-->", table.cols.name[1:2])
3395 print("Select table.cols.name[:]-->", table.cols.name[:])
3396 print("Select table.cols._f_col('name')[:]-->",
3397 table.cols._f_col('name')[:])
3399 The output of this for an example table would be::
3401 Column handlers:
3402 /table.cols.name (Column(), string, idx=None)
3403 /table.cols.lati (Column(), int32, idx=None)
3404 /table.cols.longi (Column(), int32, idx=None)
3405 /table.cols.vector (Column(2,), int32, idx=None)
3406 /table.cols.matrix2D (Column(2, 2), float64, idx=None)
3407 Select table.cols.name[1]--> Particle: 11
3408 Select table.cols.name[1:2]--> ['Particle: 11']
3409 Select table.cols.name[:]--> ['Particle: 10'
3410 'Particle: 11' 'Particle: 12'
3411 'Particle: 13' 'Particle: 14']
3412 Select table.cols._f_col('name')[:]--> ['Particle: 10'
3413 'Particle: 11' 'Particle: 12'
3414 'Particle: 13' 'Particle: 14']
3416 See the :file:`examples/table2.py` file for a more complete example.
3418 """
3420 table = self.table
3422 # Generalized key support not there yet, but at least allow
3423 # for a tuple with a single element (the main dimension).
3424 # (key,) --> key
3425 if isinstance(key, tuple) and len(key) == 1:
3426 key = key[0]
3428 if is_idx(key):
3429 key = operator.index(key)
3431 # Index out of range protection
3432 if key >= table.nrows:
3433 raise IndexError("Index out of range")
3434 if key < 0:
3435 # To support negative values
3436 key += table.nrows
3437 (start, stop, step) = table._process_range(key, key + 1, 1)
3438 return table.read(start, stop, step, self.pathname)[0]
3439 elif isinstance(key, slice):
3440 (start, stop, step) = table._process_range(
3441 key.start, key.stop, key.step)
3442 return table.read(start, stop, step, self.pathname)
3443 else:
3444 raise TypeError(
3445 f"invalid index or slice: {key!r}")
3447 def __iter__(self):
3448 """Iterate through all items in the column.
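Rows are read internally in buffer-sized chunks, so iterating does
not load the whole column into memory at once.

Examples
--------

A brief sketch; the column name is illustrative::

    total = 0.0
    for value in table.cols.energy:
        total += value
"""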
3450 table = self.table
3451 itemsize = self.dtype.itemsize
3452 nrowsinbuf = table._v_file.params['IO_BUFFER_SIZE'] // itemsize
3453 buf = np.empty((nrowsinbuf, ), self._itemtype)
3454 max_row = len(self)
3455 for start_row in range(0, len(self), nrowsinbuf):
3456 end_row = min(start_row + nrowsinbuf, max_row)
3457 buf_slice = buf[0:end_row - start_row]
3458 table.read(start_row, end_row, 1, field=self.pathname,
3459 out=buf_slice)
3460 yield from buf_slice
3462 def __setitem__(self, key, value):
3463 """Set a row or a range of rows in a column.
3465 If the key argument is an integer, the corresponding element is set to
3466 value. If key is a slice, the range of elements determined by it is
3467 set to value.
3469 Examples
3470 --------
3472 ::
3474 # Modify row 1
3475 table.cols.col1[1] = -1
3477 # Modify rows 1 and 3
3478 table.cols.col1[1::2] = [2,3]
3480 Which is equivalent to::
3482 # Modify row 1
3483 table.modify_columns(start=1, columns=[[-1]], names=['col1'])
3485 # Modify rows 1 and 3
3486 columns = numpy.rec.fromarrays([[2,3]], formats='i4')
3487 table.modify_columns(start=1, step=2, columns=columns,
3488 names=['col1'])
3490 """
3492 table = self.table
3493 table._v_file._check_writable()
3495 # Generalized key support not there yet, but at least allow
3496 # for a tuple with a single element (the main dimension).
3497 # (key,) --> key
3498 if isinstance(key, tuple) and len(key) == 1:
3499 key = key[0]
3501 if is_idx(key):
3502 key = operator.index(key)
3504 # Index out of range protection
3505 if key >= table.nrows:
3506 raise IndexError("Index out of range")
3507 if key < 0:
3508 # To support negative values
3509 key += table.nrows
3510 return table.modify_column(key, key + 1, 1,
3511 [[value]], self.pathname)
3512 elif isinstance(key, slice):
3513 (start, stop, step) = table._process_range(
3514 key.start, key.stop, key.step)
3515 return table.modify_column(start, stop, step,
3516 value, self.pathname)
3517 else:
3518 raise TypeError(f"invalid index or slice: {key!r}")
3520 def create_index(self, optlevel=6, kind="medium", filters=None,
3521 tmp_dir=None, _blocksizes=None, _testmode=False,
3522 _verbose=False):
3523 """Create an index for this column.
3525 .. warning::
3527 In some situations it is useful to get a completely sorted
3528 index (CSI). For those cases, it is best to use the
3529 :meth:`Column.create_csindex` method instead.
3531 Parameters
3532 ----------
3533 optlevel : int
3534 The optimization level for building the index. Levels range
3535 from 0 (no optimization) to 9 (maximum optimization). Higher
3536 optimization levels mean a better chance of reducing the entropy
3537 of the index, at the price of more CPU, memory and I/O
3538 resources for creating the index.
3539 kind : str
3540 The kind of index to build. It can be 'ultralight',
3541 'light', 'medium' or 'full'. Lighter kinds ('ultralight'
3542 and 'light') mean that the index takes less space on disk, but
3543 queries run slower. Heavier kinds ('medium' and 'full') mean a
3544 better chance of reducing the entropy of the index (increasing
3545 query speed) at the price of more disk space as well as
3546 more CPU, memory and I/O resources for creating the index.
3548 Note that selecting a full kind with an optlevel of 9 (the maximum)
3549 guarantees the creation of an index with zero entropy, that is, a
3550 completely sorted index (CSI) - provided that the number of rows in
3551 the table does not exceed 2**48 (about 280 trillion rows). See the
3552 :meth:`Column.create_csindex` method for a more direct way to
3553 create a CSI index.
3554 filters : Filters
3555 Specify the Filters instance used to compress the index. If None,
3556 default index filters will be used (currently, zlib level 1 with
3557 shuffling).
3558 tmp_dir
3559 When kind is other than 'ultralight', a temporary file is created
3560 during the index build process. You can use the tmp_dir argument
3561 to specify the directory for this temporary file. The default is
3562 to create it in the same directory as the file containing the
3563 original table.
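Examples
--------

A minimal usage sketch; the table and column names are illustrative::

    col = table.cols.energy
    indexedrows = col.create_index(optlevel=6, kind='medium')
    assert col.is_indexed
    # Queries issued through Table.where() may now use the index:
    hot = [r['name'] for r in table.where('energy > 100')]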
3565 """
3567 kinds = ['ultralight', 'light', 'medium', 'full']
3568 if kind not in kinds:
3569 raise ValueError(f"kind must be one of {kinds}")
3570 if (not isinstance(optlevel, int) or
3571 (optlevel < 0 or optlevel > 9)):
3572 raise ValueError("Optimization level must be an integer in the "
3573 "range 0-9")
3574 if filters is None:
3575 filters = default_index_filters
3576 if tmp_dir is None:
3577 tmp_dir = str(Path(self._table_file.filename).parent)
3578 else:
3579 if not Path(tmp_dir).is_dir():
3580 raise ValueError(
3581 f"Temporary directory '{tmp_dir}' does not exist"
3582 )
3583 if (_blocksizes is not None and
3584 (not isinstance(_blocksizes, tuple) or len(_blocksizes) != 4)):
3585 raise ValueError("_blocksizes must be a tuple with exactly 4 "
3586 "elements")
3587 idxrows = _column__create_index(self, optlevel, kind, filters,
3588 tmp_dir, _blocksizes, _verbose)
3589 return SizeType(idxrows)
3591 def create_csindex(self, filters=None, tmp_dir=None,
3592 _blocksizes=None, _testmode=False, _verbose=False):
3593 """Create a completely sorted index (CSI) for this column.
3595 This method guarantees the creation of an index with zero entropy,
3596 that is, a completely sorted index (CSI) -- provided that the number
3597 of rows in the table does not exceed 2**48 (about 280 trillion
3598 rows). A CSI index is needed by some table methods (like
3599 :meth:`Table.itersorted` or :meth:`Table.read_sorted`) in order to
3600 ensure completely sorted results.
3602 For the meaning of the filters and tmp_dir arguments, see
3603 :meth:`Column.create_index`.
3605 Notes
3606 -----
3607 This method is equivalent to
3608 Column.create_index(optlevel=9, kind='full', ...).
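Examples
--------

A brief sketch; the column name is illustrative::

    table.cols.timestamp.create_csindex()
    # A CSI ensures completely sorted results from read_sorted():
    ordered = table.read_sorted('timestamp')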
3610 """
3612 return self.create_index(
3613 kind='full', optlevel=9, filters=filters, tmp_dir=tmp_dir,
3614 _blocksizes=_blocksizes, _testmode=_testmode, _verbose=_verbose)
3616 def _do_reindex(self, dirty):
3617 """Common code for the reindex() and reindex_dirty() methods."""
3619 index = self.index
3620 dodirty = True
3621 if dirty and (index is None or not index.dirty):
3622 dodirty = False
3623 if index is not None and dodirty:
3624 self._table_file._check_writable()
3625 # Get the old index parameters
3626 kind = index.kind
3627 optlevel = index.optlevel
3628 filters = index.filters
3629 # We *need* to tell the index that it is going to be undirty.
3630 # This is needed here so as to unnail() the condition cache.
3631 index.dirty = False
3632 # Delete the existing Index
3633 index._f_remove()
3634 # Create a new Index with the previous parameters
3635 return SizeType(self.create_index(
3636 kind=kind, optlevel=optlevel, filters=filters))
3637 else:
3638 return SizeType(0)  # Not indexed, or the index was already clean
3640 def reindex(self):
3641 """Recompute the index associated with this column.
3643 This can be useful when you suspect that, for any reason,
3644 the index information is no longer valid and you want to rebuild it.
3646 This method does nothing if the column is not indexed.
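Examples
--------

For instance, to rebuild a possibly stale index (the column name is
illustrative)::

    table.cols.energy.reindex()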
3648 """
3650 self._do_reindex(dirty=False)
3652 def reindex_dirty(self):
3653 """Recompute the associated index only if it is dirty.
3655 This can be useful when you have set :attr:`Table.autoindex` to false
3656 for the table and you want to update the column's index after an
3657 invalidating index operation (like :meth:`Table.remove_rows`).
3659 This method does nothing if the column is not indexed.
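Examples
--------

A sketch of the intended workflow; the column name is illustrative::

    table.autoindex = False            # defer automatic reindexing
    table.remove_rows(0, 10)           # leaves existing indexes dirty
    table.cols.energy.reindex_dirty()  # rebuild only if actually dirty
    table.autoindex = True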
3661 """
3663 self._do_reindex(dirty=True)
3665 def remove_index(self):
3666 """Remove the index associated with this column.
3668 This method does nothing if the column is not indexed. The removed
3669 index can be created again by calling the :meth:`Column.create_index`
3670 method.
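Examples
--------

A brief sketch; the column name is illustrative::

    col = table.cols.energy
    col.remove_index()
    assert not col.is_indexed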
3672 """
3674 self._table_file._check_writable()
3676 # Remove the index if it exists.
3677 if self.is_indexed:
3678 index = self.index
3679 index._f_remove()
3680 self.table._set_column_indexing(self.pathname, False)
3682 def close(self):
3683 """Close this column."""
3685 self.__dict__.clear()
3687 def __str__(self):
3688 """The string representation for this object."""
3690 return (f"{self._table_path}.cols.{self.pathname.replace('/', '.')} "
3691 f"({self.__class__.__name__}{self.shape}, "
3692 f"{self.descr._v_types[self.name]}, idx={self.index})")
3694 def __repr__(self):
3695 """A detailed string representation for this object."""
3697 return str(self)