1"""Here is defined the Table class."""
3import functools
4import math
5import operator
6import platform
7import sys
8import warnings
9from pathlib import Path
11from time import perf_counter as clock
13import numexpr as ne
14import numpy as np
16from . import tableextension
17from .lrucacheextension import ObjectCache, NumCache
18from .atom import Atom
19from .conditions import compile_condition
20from .flavor import flavor_of, array_as_internal, internal_to_flavor
21from .utils import is_idx, lazyattr, SizeType, NailedDict as CacheDict
22from .leaf import Leaf
23from .description import (IsDescription, Description, Col, descr_from_dtype)
24from .exceptions import (
25 NodeError, HDF5ExtError, PerformanceWarning, OldIndexWarning,
26 NoSuchNodeError)
27from .utilsextension import get_nested_field
29from .path import join_path, split_path
30from .index import (
31 OldIndex, default_index_filters, default_auto_index, Index, IndexesDescG,
32 IndexesTableG)
35profile = False
36# profile = True # Uncomment for profiling
37if profile:
38 from .utils import show_stats


# 2.2: Added support for complex types.  Introduced in version 0.9.
# 2.2.1: Added support for time types.
# 2.3: Changed the indexes naming schema.
# 2.4: Changed indexes naming schema (again).
# 2.5: Added the FIELD_%d_FILL attributes.
# 2.6: Added the FLAVOR attribute (optional).
# 2.7: Numeric and numarray flavors are gone.

obversion = "2.7"  # The Table VERSION number


# Maps NumPy types to the types used by Numexpr.
_nxtype_from_nptype = {
    np.bool_: bool,
    np.int8: ne.necompiler.int_,
    np.int16: ne.necompiler.int_,
    np.int32: ne.necompiler.int_,
    np.int64: ne.necompiler.long_,
    np.uint8: ne.necompiler.int_,
    np.uint16: ne.necompiler.int_,
    np.uint32: ne.necompiler.long_,
    np.uint64: ne.necompiler.long_,
    np.float32: float,
    np.float64: ne.necompiler.double,
    np.complex64: complex,
    np.complex128: complex,
    np.bytes_: bytes,
}

_nxtype_from_nptype[np.str_] = str

if hasattr(np, 'float16'):
    _nxtype_from_nptype[np.float16] = float  # XXX: check
if hasattr(np, 'float96'):
    _nxtype_from_nptype[np.float96] = ne.necompiler.double  # XXX: check
if hasattr(np, 'float128'):
    _nxtype_from_nptype[np.float128] = ne.necompiler.double  # XXX: check
if hasattr(np, 'complex192'):
    _nxtype_from_nptype[np.complex192] = complex  # XXX: check
if hasattr(np, 'complex256'):
    _nxtype_from_nptype[np.complex256] = complex  # XXX: check
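
# Usage sketch (illustrative only): Table._compile_condition() below turns a
# column's NumPy scalar type into its Numexpr counterpart with a plain
# dictionary lookup, e.g. _nxtype_from_nptype[np.int32] -> ne.necompiler.int_.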

# The NumPy scalar type corresponding to `SizeType`.
_npsizetype = np.array(SizeType(0)).dtype.type


def _index_name_of(node):
    return '_i_%s' % node._v_name


def _index_pathname_of(node):
    nodeParentPath = split_path(node._v_pathname)[0]
    return join_path(nodeParentPath, _index_name_of(node))


def _index_pathname_of_column(table, colpathname):
    return join_path(_index_pathname_of(table), colpathname)


# The next are versions that work with just paths (i.e. we don't need
# a node instance for using them, which can be critical in certain
# situations)


def _index_name_of_(nodeName):
    return '_i_%s' % nodeName


def _index_pathname_of_(nodePath):
    nodeParentPath, nodeName = split_path(nodePath)
    return join_path(nodeParentPath, _index_name_of_(nodeName))


def _index_pathname_of_column_(tablePath, colpathname):
    return join_path(_index_pathname_of_(tablePath), colpathname)
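
# A quick illustration of how the helpers above compose index paths
# (hypothetical node names, shown only as a sketch):
#
#   _index_name_of_('mytable')                        -> '_i_mytable'
#   _index_pathname_of_('/group/mytable')             -> '/group/_i_mytable'
#   _index_pathname_of_column_('/group/mytable', 'x') -> '/group/_i_mytable/x'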


def restorecache(self):
    # Define a cache for sparse table reads
    params = self._v_file.params
    chunksize = self._v_chunkshape[0]
    nslots = params['TABLE_MAX_SIZE'] / (chunksize * self._v_dtype.itemsize)
    self._chunkcache = NumCache((nslots, chunksize), self._v_dtype,
                                'table chunk cache')
    self._seqcache = ObjectCache(params['ITERSEQ_MAX_SLOTS'],
                                 params['ITERSEQ_MAX_SIZE'],
                                 'Iter sequence cache')
    self._dirtycache = False
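
# Cache-sizing arithmetic for restorecache() (illustrative numbers, not
# enforced defaults): with TABLE_MAX_SIZE = 1 MiB, a chunkshape of (512,)
# and 16-byte rows, nslots = 1048576 / (512 * 16) = 128 chunk slots.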


def _table__where_indexed(self, compiled, condition, condvars,
                          start, stop, step):
    if profile:
        tref = clock()
    if profile:
        show_stats("Entering table_whereIndexed", tref)
    self._use_index = True
    # Clean the table caches for indexed queries if needed
    if self._dirtycache:
        restorecache(self)

    # Get the values in expression that are not columns
    values = []
    for key, value in condvars.items():
        if isinstance(value, np.ndarray):
            values.append((key, value.item()))
    # Build a key for the sequence cache
    seqkey = (condition, tuple(values), (start, stop, step))
    # Do a lookup in sequential cache for this query
    nslot = self._seqcache.getslot(seqkey)
    if nslot >= 0:
        # Get the row sequence from the cache
        seq = self._seqcache.getitem(nslot)
        if len(seq) == 0:
            return iter([])
        # seq is a list.
        seq = np.array(seq, dtype='int64')
        # Correct the ranges in cached sequence
        if (start, stop, step) != (0, self.nrows, 1):
            seq = seq[(seq >= start) & (seq < stop) &
                      ((seq - start) % step == 0)]
        return self.itersequence(seq)
    else:
        # No luck.  self._seqcache will be populated
        # in the iterator if possible.  (Row._finish_riterator)
        self._seqcache_key = seqkey

    # Compute the chunkmap for every index in indexed expression
    idxexprs = compiled.index_expressions
    strexpr = compiled.string_expression
    cmvars = {}
    tcoords = 0
    for i, idxexpr in enumerate(idxexprs):
        var, ops, lims = idxexpr
        col = condvars[var]
        index = col.index
        assert index is not None, "the chosen column is not indexed"
        assert not index.dirty, "the chosen column has a dirty index"

        # Get the number of rows that the indexed condition yields.
        range_ = index.get_lookup_range(ops, lims)
        ncoords = index.search(range_)
        tcoords += ncoords
        if index.reduction == 1 and ncoords == 0:
            # No values from index condition, thus the chunkmap should
            # be empty
            nrowsinchunk = self.chunkshape[0]
            nchunks = math.ceil(self.nrows / nrowsinchunk)
            chunkmap = np.zeros(shape=nchunks, dtype="bool")
        else:
            # Get the chunkmap from the index
            chunkmap = index.get_chunkmap()
        # Assign the chunkmap to the cmvars dictionary
        cmvars["e%d" % i] = chunkmap

    if index.reduction == 1 and tcoords == 0:
        # No candidates found in any indexed expression component,
        # so leave now
        self._seqcache.setitem(seqkey, [], 1)
        return iter([])

    # Compute the final chunkmap
    chunkmap = ne.evaluate(strexpr, cmvars)
    if not chunkmap.any():
        # The chunkmap is all False, so the result is empty
        self._seqcache.setitem(seqkey, [], 1)
        return iter([])

    if profile:
        show_stats("Exiting table_whereIndexed", tref)
    return chunkmap
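
# How the final chunkmap above is obtained (a hedged sketch): for a condition
# such as '(c1 > 0) & (c2 < 10)' with both columns indexed, the loop stores
# one boolean chunk-level bitmap per index component in cmvars ('e0', 'e1',
# ...) and compiled.string_expression combines them (e.g. "(e0 & e1)"), so
# ne.evaluate() yields a single boolean map of the chunks worth reading.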


def create_indexes_table(table):
    itgroup = IndexesTableG(
        table._v_parent, _index_name_of(table),
        "Indexes container for table " + table._v_pathname, new=True)
    return itgroup


def create_indexes_descr(igroup, dname, iname, filters):
    idgroup = IndexesDescG(
        igroup, iname,
        "Indexes container for sub-description " + dname,
        filters=filters, new=True)
    return idgroup


def _column__create_index(self, optlevel, kind, filters, tmp_dir,
                          blocksizes, verbose):
    name = self.name
    table = self.table
    dtype = self.dtype
    descr = self.descr
    index = self.index
    get_node = table._v_file._get_node

    # Warn if the index already exists
    if index:
        raise ValueError("%s for column '%s' already exists.  If you want "
                         "to re-create it, please use the reindex() method "
                         "instead" % (str(index), str(self.pathname)))

    # Check that the datatype is indexable.
    if dtype.str[1:] == 'u8':
        raise NotImplementedError(
            "indexing 64-bit unsigned integer columns "
            "is not supported yet, sorry")
    if dtype.kind == 'c':
        raise TypeError("complex columns can not be indexed")
    if dtype.shape != ():
        raise TypeError("multidimensional columns can not be indexed")

    # Get the indexes group for table, and if it does not exist, create it
    try:
        itgroup = get_node(_index_pathname_of(table))
    except NoSuchNodeError:
        itgroup = create_indexes_table(table)

    # Create the necessary intermediate groups for descriptors
    idgroup = itgroup
    dname = ""
    pathname = descr._v_pathname
    if pathname != '':
        inames = pathname.split('/')
        for iname in inames:
            if dname == '':
                dname = iname
            else:
                dname += '/' + iname
            try:
                idgroup = get_node(f'{itgroup._v_pathname}/{dname}')
            except NoSuchNodeError:
                idgroup = create_indexes_descr(idgroup, dname, iname, filters)

    # Create the atom
    assert dtype.shape == ()
    atom = Atom.from_dtype(np.dtype((dtype, (0,))))

    # Protection on tables larger than the expected rows (perhaps the
    # user forgot to pass this parameter to the Table constructor?)
    expectedrows = table._v_expectedrows
    if table.nrows > expectedrows:
        expectedrows = table.nrows

    # Create the index itself
    index = Index(
        idgroup, name, atom=atom,
        title="Index for %s column" % name,
        kind=kind,
        optlevel=optlevel,
        filters=filters,
        tmp_dir=tmp_dir,
        expectedrows=expectedrows,
        byteorder=table.byteorder,
        blocksizes=blocksizes)

    table._set_column_indexing(self.pathname, True)

    # Feed the index with values

    # Add rows to the index if necessary
    if table.nrows > 0:
        indexedrows = table._add_rows_to_index(
            self.pathname, 0, table.nrows, lastrow=True, update=False)
    else:
        indexedrows = 0
    index.dirty = False
    table._indexedrows = indexedrows
    table._unsaved_indexedrows = table.nrows - indexedrows

    # Optimize the index that has been already filled-up
    index.optimize(verbose=verbose)

    # We cannot do a flush here because when reindexing during a
    # flush, the indexes are created anew, and that creates a nested
    # call to flush().
    # table.flush()

    return indexedrows


class _ColIndexes(dict):
    """Provides a nice representation of column indexes."""

    def __repr__(self):
        """Gives a detailed Description column representation."""

        rep = [f'  "{k}": {v}' for k, v in self.items()]
        return '{\n  %s}' % (',\n  '.join(rep))


class Table(tableextension.Table, Leaf):
    """This class represents heterogeneous datasets in an HDF5 file.

    Tables are leaves (see the Leaf class in :ref:`LeafClassDescr`) whose
    data consists of a unidimensional sequence of *rows*, where each row
    contains one or more *fields*.  Fields have an associated unique *name*
    and *position*, with the first field having position 0.  All rows have
    the same fields, which are arranged in *columns*.

    Fields can have any type supported by the Col class (see
    :ref:`ColClassDescr`) and its descendants, which support
    multidimensional data.  Moreover, a field can be *nested* (to an
    arbitrary depth), meaning that it includes further fields inside.  A
    field named x inside a nested field a in a table can be accessed as the
    field a/x (its *path name*) from the table.

    The structure of a table is declared by its description, which is made
    available in the Table.description attribute (see :class:`Table`).

    This class provides new methods to read, write and search table data
    efficiently.  It also provides special Python methods to allow accessing
    the table as a normal sequence or array (with extended slicing
    supported).

    PyTables supports *in-kernel* searches working simultaneously on several
    columns using complex conditions.  These are faster than selections
    using Python expressions.  See the :meth:`Table.where` method for more
    information on in-kernel searches.

    Non-nested columns can be *indexed*.  Searching an indexed column can be
    several times faster than searching a non-indexed one.  Search methods
    automatically take advantage of indexing where available.

    When iterating a table, an object from the Row (see :ref:`RowClassDescr`)
    class is used.  This object allows reading and writing data one row at a
    time, as well as performing queries which are not supported by in-kernel
    syntax (at a much lower speed, of course).

    Objects of this class support access to individual columns via *natural
    naming* through the :attr:`Table.cols` accessor.  Nested columns are
    mapped to Cols instances, and non-nested ones to Column instances.
    See the Column class in :ref:`ColumnClassDescr` for examples of this
    feature.

    Parameters
    ----------
    parentnode
        The parent :class:`Group` object.

        .. versionchanged:: 3.0
           Renamed from *parentNode* to *parentnode*.

    name : str
        The name of this node in its parent group.
    description
        An IsDescription subclass or a dictionary where the keys are the
        field names, and the values the type definitions.  In addition, a
        pure NumPy dtype is accepted.  If None, the table metadata is read
        from disk, else, it's taken from previous parameters.
    title
        Sets a TITLE attribute on the HDF5 table entity.
    filters : Filters
        An instance of the Filters class that provides information about the
        desired I/O filters to be applied during the life of this object.
    expectedrows
        A user estimate about the number of rows that will be on table.  If
        not provided, the default value is ``EXPECTED_ROWS_TABLE`` (see
        ``tables/parameters.py``).  If you plan to save bigger tables, try
        providing a guess; this will optimize the HDF5 B-Tree creation and
        management process time and memory used.
    chunkshape
        The shape of the data chunk to be read or written as a single HDF5
        I/O operation.  The filters are applied to those chunks of data.
        Its rank for tables has to be 1.  If ``None``, a sensible value is
        calculated based on the `expectedrows` parameter (which is
        recommended).
    byteorder
        The byteorder of the data *on-disk*, specified as 'little' or
        'big'.  If this is not specified, the byteorder is that of the
        platform, unless you passed a recarray as the `description`, in
        which case the recarray byteorder will be chosen.
    track_times
        Whether time data associated with the leaf are recorded (object
        access time, raw data modification time, metadata change time,
        object birth time); default True.  Semantics of these times depend
        on their implementation in the HDF5 library: refer to documentation
        of the H5O_info_t data structure.  As of HDF5 1.8.15, only ctime
        (metadata change time) is implemented.

        .. versionadded:: 3.4.3

    Notes
    -----
    The instance variables below are provided in addition to those in
    Leaf (see :ref:`LeafClassDescr`).  Please note that there are several
    col* dictionaries to ease retrieving information about a column
    directly by its path name, avoiding the need to walk through
    Table.description or Table.cols.

    .. rubric:: Table attributes

    .. attribute:: coldescrs

        Maps the name of a column to its Col description (see
        :ref:`ColClassDescr`).

    .. attribute:: coldflts

        Maps the name of a column to its default value.

    .. attribute:: coldtypes

        Maps the name of a column to its NumPy data type.

    .. attribute:: colindexed

        Is the column whose name is used as a key indexed?

    .. attribute:: colinstances

        Maps the name of a column to its Column (see
        :ref:`ColumnClassDescr`) or Cols (see :ref:`ColsClassDescr`)
        instance.

    .. attribute:: colnames

        A list containing the names of *top-level* columns in the table.

    .. attribute:: colpathnames

        A list containing the pathnames of *bottom-level* columns in
        the table.

        These are the leaf columns obtained when walking the table
        description left-to-right, bottom-first.  Columns inside a
        nested column have slashes (/) separating name components in
        their pathname.

    .. attribute:: cols

        A Cols instance that provides *natural naming* access to
        non-nested (Column, see :ref:`ColumnClassDescr`) and nested
        (Cols, see :ref:`ColsClassDescr`) columns.

    .. attribute:: coltypes

        Maps the name of a column to its PyTables data type.

    .. attribute:: description

        A Description instance (see :ref:`DescriptionClassDescr`)
        reflecting the structure of the table.

    .. attribute:: extdim

        The index of the enlargeable dimension (always 0 for tables).

    .. attribute:: indexed

        Does this table have any indexed columns?

    .. attribute:: nrows

        The current number of rows in the table.
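
    Examples
    --------
    A minimal usage sketch (the file name, table name and columns below are
    hypothetical)::

        import tables as tb

        class Particle(tb.IsDescription):
            name = tb.StringCol(16)
            energy = tb.Float64Col()

        with tb.open_file('demo.h5', mode='w') as h5file:
            table = h5file.create_table('/', 'particles', Particle)
            row = table.row
            for i in range(10):
                row['name'] = ('p%d' % i).encode('ascii')
                row['energy'] = float(i)
                row.append()
            table.flush()
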
495 """

    # Class identifier.
    _c_classid = 'TABLE'

    @lazyattr
    def row(self):
        """The associated Row instance (see :ref:`RowClassDescr`)."""

        return tableextension.Row(self)

    @lazyattr
    def dtype(self):
        """The NumPy ``dtype`` that most closely matches this table."""

        return self.description._v_dtype

    @property
    def shape(self):
        """The shape of this table."""
        return (self.nrows,)

    @property
    def rowsize(self):
        """The size in bytes of each row in the table."""
        return self.description._v_dtype.itemsize

    @property
    def size_in_memory(self):
        """The size of this table's data in bytes when it is fully loaded
        into memory.  This may be used in combination with size_on_disk to
        calculate the compression ratio of the data."""
        return self.nrows * self.rowsize

    @lazyattr
    def _v_iobuf(self):
        """A buffer for doing I/O."""

        return self._get_container(self.nrowsinbuf)

    @lazyattr
    def _v_wdflts(self):
        """The defaults for writing in recarray format."""

        # First, do a check to see whether we need to set default values
        # different from 0 or not.
        for coldflt in self.coldflts.values():
            if isinstance(coldflt, np.ndarray) or coldflt:
                break
        else:
            # No default different from 0 found.  Returning None.
            return None
        wdflts = self._get_container(1)
        for colname, coldflt in self.coldflts.items():
            ra = get_nested_field(wdflts, colname)
            ra[:] = coldflt
        return wdflts

    @lazyattr
    def _colunaligned(self):
        """The pathnames of unaligned, *unidimensional* columns."""
        colunaligned, rarr = [], self._get_container(0)
        for colpathname in self.colpathnames:
            carr = get_nested_field(rarr, colpathname)
            if not carr.flags.aligned and carr.ndim == 1:
                colunaligned.append(colpathname)
        return frozenset(colunaligned)

    # **************** WARNING! ***********************
    # This function can be called during the destruction time of a table,
    # so measures have been taken so that it doesn't have to revive
    # another node (which can fool the LRU cache).  The solution devised
    # has been to add a cache for autoindex (Table._autoindex), populate
    # it in creation time of the cache (which is a safe period) and then
    # update the cache whenever it changes.
    # This solves the error when running test_indexes.py ManyNodesTestCase.
    # F. Alted 2007-04-20
    # **************************************************

    @property
    def autoindex(self):
        """Automatically keep column indexes up to date?

        Setting this value states whether existing indexes should be
        automatically updated after an append operation or recomputed
        after an index-invalidating operation (i.e. removal and
        modification of rows).  The default is true.

        This value takes effect whenever a column is altered.  If you
        don't have automatic indexing activated and you want to do an
        immediate update, use `Table.flush_rows_to_index()`; for an
        immediate reindexing of invalidated indexes, use
        `Table.reindex_dirty()`.

        This value is persistent.

        .. versionchanged:: 3.0
           The *autoIndex* property has been renamed into *autoindex*.
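
        Example (a sketch; ``table`` is assumed to be an already indexed
        table and ``more_rows`` a compatible sequence of rows)::

            table.autoindex = False      # defer index maintenance
            table.append(more_rows)      # appends no longer touch indexes
            table.flush_rows_to_index()  # bring the indexes up to date
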
592 """

        if self._autoindex is None:
            try:
                indexgroup = self._v_file._get_node(_index_pathname_of(self))
            except NoSuchNodeError:
                self._autoindex = default_auto_index  # update cache
                return self._autoindex
            else:
                self._autoindex = indexgroup.auto  # update cache
                return self._autoindex
        else:
            # The value is in cache, return it
            return self._autoindex

    @autoindex.setter
    def autoindex(self, auto):
        auto = bool(auto)
        try:
            indexgroup = self._v_file._get_node(_index_pathname_of(self))
        except NoSuchNodeError:
            indexgroup = create_indexes_table(self)
        indexgroup.auto = auto
        # Update the cache in table instance as well
        self._autoindex = auto

    @property
    def indexedcolpathnames(self):
        """List of pathnames of indexed columns in the table."""
        return [_colpname
                for _colpname in self.colpathnames
                if self.colindexed[_colpname]]

    @property
    def colindexes(self):
        """A dictionary with the indexes of the indexed columns."""
        return _ColIndexes((_colpname, self.cols._f_col(_colpname).index)
                           for _colpname in self.colpathnames
                           if self.colindexed[_colpname])

    @property
    def _dirtyindexes(self):
        """Whether some index in table is dirty."""
        return self._condition_cache._nailcount > 0

    def __init__(self, parentnode, name,
                 description=None, title="", filters=None,
                 expectedrows=None, chunkshape=None,
                 byteorder=None, _log=True, track_times=True):

        self._v_new = new = description is not None
        """Is this the first time the node has been created?"""
        self._v_new_title = title
        """New title for this node."""
        self._v_new_filters = filters
        """New filter properties for this node."""
        self.extdim = 0  # Tables only have one dimension currently
        """The index of the enlargeable dimension (always 0 for tables)."""
        self._v_recarray = None
        """A structured array to be stored in the table."""
        self._rabyteorder = None
        """The computed byteorder of the self._v_recarray."""
        if expectedrows is None:
            expectedrows = parentnode._v_file.params['EXPECTED_ROWS_TABLE']
        self._v_expectedrows = expectedrows
        """The expected number of rows to be stored in the table."""
        self.nrows = SizeType(0)
        """The current number of rows in the table."""
        self.description = None
        """A Description instance (see :ref:`DescriptionClassDescr`)
        reflecting the structure of the table."""
        self._time64colnames = []
        """The names of ``Time64`` columns."""
        self._strcolnames = []
        """The names of ``String`` columns."""
        self._colenums = {}
        """Maps the name of an enumerated column to its ``Enum`` instance."""
        self._v_chunkshape = None
        """Private storage for the `chunkshape` property of the leaf."""

        self.indexed = False
        """Does this table have any indexed columns?"""
        self._indexedrows = 0
        """Number of rows indexed on disk."""
        self._unsaved_indexedrows = 0
        """Number of rows indexed in memory but still not on disk."""
        self._listoldindexes = []
        """The list of columns with old indexes."""
        self._autoindex = None
        """Private variable that caches the value for autoindex."""

        self.colnames = []
        """A list containing the names of *top-level* columns in the
        table."""
        self.colpathnames = []
        """A list containing the pathnames of *bottom-level* columns in the
        table.

        These are the leaf columns obtained when walking the
        table description left-to-right, bottom-first.  Columns inside a
        nested column have slashes (/) separating name components in
        their pathname.
        """
        self.colinstances = {}
        """Maps the name of a column to its Column (see
        :ref:`ColumnClassDescr`) or Cols (see :ref:`ColsClassDescr`)
        instance."""
        self.coldescrs = {}
        """Maps the name of a column to its Col description (see
        :ref:`ColClassDescr`)."""
        self.coltypes = {}
        """Maps the name of a column to its PyTables data type."""
        self.coldtypes = {}
        """Maps the name of a column to its NumPy data type."""
        self.coldflts = {}
        """Maps the name of a column to its default value."""
        self.colindexed = {}
        """Is the column whose name is used as a key indexed?"""

        self._use_index = False
        """Whether an index can be used or not in a search.  Boolean."""
        self._where_condition = None
        """Condition function and argument list for selection of values."""
        self._seqcache_key = None
        """The key under which to save a query's results (list of row
        indexes) or None to not save."""
        max_slots = parentnode._v_file.params['COND_CACHE_SLOTS']
        self._condition_cache = CacheDict(max_slots)
        """Cache of already compiled conditions."""
        self._exprvars_cache = {}
        """Cache of variables participating in numexpr expressions."""
        self._enabled_indexing_in_queries = True
        """Is indexing enabled in queries?  *Use only for testing.*"""
        self._empty_array_cache = {}
        """Cache of empty arrays."""

        self._v_dtype = None
        """The NumPy datatype for this table."""
        self.cols = None
        """
        A Cols instance that provides *natural naming* access to non-nested
        (Column, see :ref:`ColumnClassDescr`) and nested (Cols, see
        :ref:`ColsClassDescr`) columns.
        """
        self._dirtycache = True
        """Whether the data caches are dirty or not.  Initially set to
        yes."""
        self._descflavor = None
        """Temporarily keeps the flavor of a description with data."""

        # Initialize this object in case it is a new Table

        # Try purely descriptive description objects.
        if new and isinstance(description, dict):
            # Dictionary case
            self.description = Description(
                description, ptparams=parentnode._v_file.params)
        elif new and (type(description) == type(IsDescription)
                      and issubclass(description, IsDescription)):
            # IsDescription subclass case
            descr = description()
            self.description = Description(
                descr.columns, ptparams=parentnode._v_file.params)
        elif new and isinstance(description, Description):
            # It is a Description instance already
            self.description = description

        # No description yet?
        if new and self.description is None:
            # Try NumPy dtype instances
            if isinstance(description, np.dtype):
                tup = descr_from_dtype(
                    description, ptparams=parentnode._v_file.params)
                self.description, self._rabyteorder = tup

        # No description yet?
        if new and self.description is None:
            # Try structured array description objects.
            try:
                self._descflavor = flavor = flavor_of(description)
            except TypeError:  # probably not an array
                pass
            else:
                if flavor == 'python':
                    nparray = np.rec.array(description)
                else:
                    nparray = array_as_internal(description, flavor)
                self.nrows = nrows = SizeType(nparray.size)
                # If `self._v_recarray` is set, it will be used as the
                # initial buffer.
                if nrows > 0:
                    self._v_recarray = nparray
                tup = descr_from_dtype(
                    nparray.dtype, ptparams=parentnode._v_file.params)
                self.description, self._rabyteorder = tup

        # No description yet?
        if new and self.description is None:
            raise TypeError(
                "the ``description`` argument is not of a supported type: "
                "``IsDescription`` subclass, ``Description`` instance, "
                "dictionary, or structured array")

        # Check the chunkshape parameter
        if new and chunkshape is not None:
            if isinstance(chunkshape, (int, np.integer)):
                chunkshape = (chunkshape,)
            try:
                chunkshape = tuple(chunkshape)
            except TypeError:
                raise TypeError(
                    "`chunkshape` parameter must be an integer or sequence "
                    "and you passed a %s" % type(chunkshape))
            if len(chunkshape) != 1:
                raise ValueError("`chunkshape` rank (length) must be 1: %r"
                                 % (chunkshape,))
            self._v_chunkshape = tuple(SizeType(s) for s in chunkshape)

        super().__init__(parentnode, name, new, filters, byteorder, _log,
                         track_times)

    def _g_post_init_hook(self):
        # We are putting here the index-related issues
        # as well as filling general info for table.
        # This is needed because we need the index objects created first.

        # First, get back the flavor of input data (if any) for
        # `Leaf._g_post_init_hook()`.
        self._flavor, self._descflavor = self._descflavor, None
        super()._g_post_init_hook()

        self.blosc2_support_write = (
            (self.byteorder == sys.byteorder) and
            (self.filters.complib is not None) and
            (self.filters.complib.startswith("blosc2")))
        # For reading, Windows does not support re-opening a file twice
        # in non-read-only mode (for good reason), so we cannot use the
        # blosc2 optimization
        self.blosc2_support_read = (
            self.blosc2_support_write and
            ((platform.system().lower() != 'windows') or
             (self._v_file.mode == 'r'))
        )

        # Create a cols accessor.
        self.cols = Cols(self, self.description)

        # Place the `Cols` and `Column` objects into `self.colinstances`.
        colinstances, cols = self.colinstances, self.cols
        for colpathname in self.description._v_pathnames:
            colinstances[colpathname] = cols._g_col(colpathname)

        if self._v_new:
            # Columns are never indexed on creation.
            self.colindexed = {cpn: False for cpn in self.colpathnames}
            return

        # The following code is only for opened tables.

        # Does the indexes group exist?
        indexesgrouppath = _index_pathname_of(self)
        igroup = indexesgrouppath in self._v_file
        oldindexes = False
        for colobj in self.description._f_walk(type="Col"):
            colname = colobj._v_pathname
            # Is this column indexed?
            if igroup:
                indexname = _index_pathname_of_column(self, colname)
                indexed = indexname in self._v_file
                self.colindexed[colname] = indexed
                if indexed:
                    column = self.cols._g_col(colname)
                    indexobj = column.index
                    if isinstance(indexobj, OldIndex):
                        indexed = False  # Not a valid index
                        oldindexes = True
                        self._listoldindexes.append(colname)
                    else:
                        # Tell the condition cache about columns with dirty
                        # indexes.
                        if indexobj.dirty:
                            self._condition_cache.nail()
            else:
                indexed = False
                self.colindexed[colname] = False
            if indexed:
                self.indexed = True

        if oldindexes:  # this should only appear under 2.x Pro
            warnings.warn(
                "table ``%s`` has column indexes with PyTables 1.x format. "
                "Unfortunately, this format is not supported in "
                "PyTables 2.x series.  Note that you can use the "
                "``ptrepack`` utility in order to recreate the indexes. "
                "The 1.x indexed columns found are: %s" %
                (self._v_pathname, self._listoldindexes),
                OldIndexWarning)

        # It does not matter to which column 'indexobj' belongs,
        # since their respective index objects share
        # the same number of elements.
        if self.indexed:
            self._indexedrows = indexobj.nelements
            self._unsaved_indexedrows = self.nrows - self._indexedrows
            # Put the autoindex value in a cache variable
            self._autoindex = self.autoindex

    def _calc_nrowsinbuf(self):
        """Calculate the number of rows that fit in a PyTables buffer."""

        params = self._v_file.params
        # Compute the nrowsinbuf
        rowsize = self.rowsize
        buffersize = params['IO_BUFFER_SIZE']
        if rowsize != 0:
            nrowsinbuf = buffersize // rowsize
            # The number of rows in buffer needs to be an exact multiple of
            # chunkshape[0] for queries using indexed columns.
            # Fixes #319 and probably #409 too.
            nrowsinbuf -= nrowsinbuf % self.chunkshape[0]
        else:
            nrowsinbuf = 1

        # tableextension.pyx performs an assertion
        # to make sure nrowsinbuf is greater than or
        # equal to the chunksize.
        # See gh-206 and gh-238
        if self.chunkshape is not None:
            if nrowsinbuf < self.chunkshape[0]:
                nrowsinbuf = self.chunkshape[0]

        # Safeguard against row sizes being extremely large
        if nrowsinbuf == 0:
            nrowsinbuf = 1
        # If rowsize is too large, issue a Performance warning
        maxrowsize = params['BUFFER_TIMES'] * buffersize
        if rowsize > maxrowsize:
            warnings.warn("""\
The Table ``%s`` is exceeding the maximum recommended rowsize (%d bytes);
be ready to see PyTables asking for *lots* of memory and possibly slow
I/O.  You may want to reduce the rowsize by trimming the value of
dimensions that are orthogonal (and preferably close) to the *main*
dimension of this leaf.  Alternatively, in case you have specified a
very small/large chunksize, you may want to increase/decrease it."""
                          % (self._v_pathname, maxrowsize),
                          PerformanceWarning)
        return nrowsinbuf
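
    # A worked sizing example for the method above (illustrative numbers,
    # not enforced defaults): with IO_BUFFER_SIZE = 1 MiB and rowsize = 64
    # bytes, nrowsinbuf starts at 1048576 // 64 = 16384 rows; with a
    # chunkshape of (1365,) it is then trimmed down to the exact multiple
    # 16380 (= 12 * 1365).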

    def _getemptyarray(self, dtype):
        # Acts as a cache for empty arrays
        key = dtype
        if key in self._empty_array_cache:
            return self._empty_array_cache[key]
        else:
            self._empty_array_cache[key] = arr = np.empty(shape=0, dtype=key)
            return arr

    def _get_container(self, shape):
        """Get the appropriate buffer for data depending on table
        nestedness."""

        # This is *much* faster than the numpy.rec.array counterpart
        return np.empty(shape=shape, dtype=self._v_dtype)

    def _get_type_col_names(self, type_):
        """Returns a list containing 'type_' column names."""

        return [colobj._v_pathname
                for colobj in self.description._f_walk('Col')
                if colobj.type == type_]

    def _get_enum_map(self):
        """Return mapping from enumerated column names to `Enum`
        instances."""

        enumMap = {}
        for colobj in self.description._f_walk('Col'):
            if colobj.kind == 'enum':
                enumMap[colobj._v_pathname] = colobj.enum
        return enumMap

    def _g_create(self):
        """Create a new table on disk."""

        # Warning against assigning too many columns...
        # F. Alted 2005-06-05
        maxColumns = self._v_file.params['MAX_COLUMNS']
        if (len(self.description._v_names) > maxColumns):
            warnings.warn(
                "table ``%s`` is exceeding the recommended "
                "maximum number of columns (%d); "
                "be ready to see PyTables asking for *lots* of memory "
                "and possibly slow I/O" % (self._v_pathname, maxColumns),
                PerformanceWarning)

        # 1. Create the HDF5 table (some parameters need to be computed).

        # Fix the byteorder of the recarray and update the number of
        # expected rows if necessary
        if self._v_recarray is not None:
            self._v_recarray = self._g_fix_byteorder_data(self._v_recarray,
                                                          self._rabyteorder)
            if len(self._v_recarray) > self._v_expectedrows:
                self._v_expectedrows = len(self._v_recarray)
        # Compute a sensible chunkshape
        if self._v_chunkshape is None:
            self._v_chunkshape = self._calc_chunkshape(
                self._v_expectedrows, self.rowsize, self.rowsize)
        # Correct the byteorder, if still needed
        if self.byteorder is None:
            self.byteorder = sys.byteorder

        # Cache some data which is already in the description.
        # This needs to happen before creation time in order
        # to be able to populate the self._v_wdflts
        self._cache_description_data()

        # After creating the table, ``self._v_objectid`` needs to be
        # set because it is needed for setting attributes afterwards.
        self._v_objectid = self._create_table(
            self._v_new_title, self.filters.complib or '', obversion)
        self._v_recarray = None  # not useful anymore
        self._rabyteorder = None  # not useful anymore

        # 2. Compute or get chunk shape and buffer size parameters.
        self.nrowsinbuf = self._calc_nrowsinbuf()

        # 3. Get field fill attributes from the table description and
        #    set them on disk.
        if self._v_file.params['PYTABLES_SYS_ATTRS']:
            set_attr = self._v_attrs._g__setattr
            for i, colobj in enumerate(self.description._f_walk(type="Col")):
                fieldname = "FIELD_%d_FILL" % i
                set_attr(fieldname, colobj.dflt)

        return self._v_objectid

    def _g_open(self):
        """Opens a table from disk and reads its metadata.

        Creates a user description on the fly to ease access to the
        actual data.

        """

        # 1. Open the HDF5 table and get some data from it.
        self._v_objectid, description, chunksize = self._get_info()
        self._v_expectedrows = self.nrows  # the actual number of rows

        # 2. Create an instance description to host the record fields.
        validate = not self._v_file._isPTFile  # only for non-PyTables files
        self.description = Description(description, validate=validate,
                                       ptparams=self._v_file.params)

        # 3. Compute or get chunk shape and buffer size parameters.
        if chunksize == 0:
            self._v_chunkshape = self._calc_chunkshape(
                self._v_expectedrows, self.rowsize, self.rowsize)
        else:
            self._v_chunkshape = (chunksize,)
        self.nrowsinbuf = self._calc_nrowsinbuf()

        # 4. If there are field fill attributes, get them from disk and
        #    set them in the table description.
        if self._v_file.params['PYTABLES_SYS_ATTRS']:
            if "FIELD_0_FILL" in self._v_attrs._f_list("sys"):
                i = 0
                get_attr = self._v_attrs.__getattr__
                for objcol in self.description._f_walk(type="Col"):
                    colname = objcol._v_pathname
                    # Get the default values for each column
                    fieldname = "FIELD_%s_FILL" % i
                    defval = get_attr(fieldname)
                    if defval is not None:
                        objcol.dflt = defval
                    else:
                        warnings.warn("could not load default value "
                                      "for the ``%s`` column of table "
                                      "``%s``; using ``%r`` instead"
                                      % (colname, self._v_pathname,
                                         objcol.dflt))
                        defval = objcol.dflt
                    i += 1

                # Set also the correct value in the desc._v_dflts dictionary
                for descr in self.description._f_walk(type="Description"):
                    for name in descr._v_names:
                        objcol = descr._v_colobjects[name]
                        if isinstance(objcol, Col):
                            descr._v_dflts[objcol._v_name] = objcol.dflt

        # 5. Cache some data which is already in the description.
        self._cache_description_data()

        return self._v_objectid

    def _cache_description_data(self):
        """Cache some data which is already in the description.

        Some information is extracted from `self.description` to build
        some useful (but redundant) structures:

        * `self.colnames`
        * `self.colpathnames`
        * `self.coldescrs`
        * `self.coltypes`
        * `self.coldtypes`
        * `self.coldflts`
        * `self._v_dtype`
        * `self._time64colnames`
        * `self._strcolnames`
        * `self._colenums`

        """

        self.colnames = list(self.description._v_names)
        self.colpathnames = [
            col._v_pathname for col in self.description._f_walk()
            if not hasattr(col, '_v_names')]  # bottom-level

        # Find ``time64`` column names.
        self._time64colnames = self._get_type_col_names('time64')
        # Find ``string`` column names.
        self._strcolnames = self._get_type_col_names('string')
        # Get a mapping of enumerated columns to their `Enum` instances.
        self._colenums = self._get_enum_map()

        # Get info about columns
        for colobj in self.description._f_walk(type="Col"):
            colname = colobj._v_pathname
            # Get the column types, types and defaults
            self.coldescrs[colname] = colobj
            self.coltypes[colname] = colobj.type
            self.coldtypes[colname] = colobj.dtype
            self.coldflts[colname] = colobj.dflt

        # Assign _v_dtype for this table
        self._v_dtype = self.description._v_dtype

    def _get_column_instance(self, colpathname):
        """Get the instance of the column with the given `colpathname`.

        If the column does not exist in the table, a `KeyError` is
        raised.

        """

        try:
            return functools.reduce(
                getattr, colpathname.split('/'), self.description)
        except AttributeError:
            raise KeyError("table ``%s`` does not have a column named "
                           "``%s``" % (self._v_pathname, colpathname))

    _check_column = _get_column_instance

    def _disable_indexing_in_queries(self):
        """Force queries not to use indexing.

        *Use only for testing.*

        """

        if not self._enabled_indexing_in_queries:
            return  # already disabled
        # The nail avoids setting/getting compiled conditions in/from
        # the cache where indexing is used.
        self._condition_cache.nail()
        self._enabled_indexing_in_queries = False

    def _enable_indexing_in_queries(self):
        """Allow queries to use indexing.

        *Use only for testing.*

        """

        if self._enabled_indexing_in_queries:
            return  # already enabled
        self._condition_cache.unnail()
        self._enabled_indexing_in_queries = True

    def _required_expr_vars(self, expression, uservars, depth=1):
        """Get the variables required by the `expression`.

        A new dictionary defining the variables used in the `expression`
        is returned.  Required variables are first looked up in the
        `uservars` mapping, then in the set of top-level columns of the
        table.  Unknown variables cause a `NameError` to be raised.

        When `uservars` is `None`, the local and global namespace where
        the API callable which uses this method is called is sought
        instead.  This mechanism will not work as expected if this
        method is not used *directly* from an API callable.  To disable
        this mechanism, just specify a mapping as `uservars`.

        Nested columns and columns from other tables are not allowed
        (`TypeError` and `ValueError` are raised, respectively).  Also,
        non-column variable values are converted to NumPy arrays.

        `depth` specifies the depth of the frame in order to reach local
        or global variables.

        """

        # Get the names of variables used in the expression.
        exprvarscache = self._exprvars_cache
        if expression not in exprvarscache:
            # Protection against growing the cache too much
            if len(exprvarscache) > 256:
                # Remove 10 (arbitrary) elements from the cache
                for k in list(exprvarscache)[:10]:
                    del exprvarscache[k]
            cexpr = compile(expression, '<string>', 'eval')
            exprvars = [var for var in cexpr.co_names
                        if var not in ['None', 'False', 'True']
                        and var not in ne.expressions.functions]
            exprvarscache[expression] = exprvars
        else:
            exprvars = exprvarscache[expression]

        # Get the local and global variable mappings of the user frame
        # if no mapping has been explicitly given for user variables.
        user_locals, user_globals = {}, {}
        if uservars is None:
            # We use specified depth to get the frame where the API
            # callable using this method is called.  For instance:
            #
            # * ``table._required_expr_vars()`` (depth 0) is called by
            # * ``table._where()`` (depth 1) is called by
            # * ``table.where()`` (depth 2) is called by
            # * user-space functions (depth 3)
            user_frame = sys._getframe(depth)
            user_locals = user_frame.f_locals
            user_globals = user_frame.f_globals

        colinstances = self.colinstances
        tblfile, tblpath = self._v_file, self._v_pathname
        # Look for the required variables first among the ones
        # explicitly provided by the user, then among implicit columns,
        # then among external variables (only if no explicit variables).
        reqvars = {}
        for var in exprvars:
            # Get the value.
            if uservars is not None and var in uservars:
                val = uservars[var]
            elif var in colinstances:
                val = colinstances[var]
            elif uservars is None and var in user_locals:
                val = user_locals[var]
            elif uservars is None and var in user_globals:
                val = user_globals[var]
            else:
                raise NameError("name ``%s`` is not defined" % var)

            # Check the value.
            if hasattr(val, 'pathname'):  # non-nested column
                if val.shape[1:] != ():
                    raise NotImplementedError(
                        "variable ``%s`` refers to "
                        "a multidimensional column, "
                        "not yet supported in conditions, sorry" % var)
                if (val._table_file is not tblfile or
                        val._table_path != tblpath):
                    raise ValueError("variable ``%s`` refers to a column "
                                     "which is not part of table ``%s``"
                                     % (var, tblpath))
                if val.dtype.str[1:] == 'u8':
                    raise NotImplementedError(
                        "variable ``%s`` refers to "
                        "a 64-bit unsigned integer column, "
                        "not yet supported in conditions, sorry; "
                        "please use regular Python selections" % var)
            elif hasattr(val, '_v_colpathnames'):  # nested column
                raise TypeError(
                    "variable ``%s`` refers to a nested column, "
                    "not allowed in conditions" % var)
            else:  # only non-column values are converted to arrays
                # XXX: not 100% sure about this
                if isinstance(val, str):
                    val = np.asarray(val.encode('ascii'))
                else:
                    val = np.asarray(val)
            reqvars[var] = val
        return reqvars

    def _get_condition_key(self, condition, condvars):
        """Get the condition cache key for `condition` with `condvars`.

        Currently, the key is a tuple of `condition`, column variable
        names, normal variable names, column paths and variable types
        (all are tuples).

        """

        # Variable names for column and normal variables.
        colnames, varnames = [], []
        # Column paths and types for each of the previous variables.
        colpaths, vartypes = [], []
        for (var, val) in condvars.items():
            if hasattr(val, 'pathname'):  # column
                colnames.append(var)
                colpaths.append(val.pathname)
            else:  # array
                try:
                    varnames.append(var)
                    vartypes.append(ne.necompiler.getType(val))  # expensive
                except ValueError:
                    # This is more clear than the error given by Numexpr.
                    raise TypeError("variable ``%s`` has data type ``%s``, "
                                    "not allowed in conditions"
                                    % (var, val.dtype.name))
        colnames, varnames = tuple(colnames), tuple(varnames)
        colpaths, vartypes = tuple(colpaths), tuple(vartypes)
        condkey = (condition, colnames, varnames, colpaths, vartypes)
        return condkey

    def _compile_condition(self, condition, condvars):
        """Compile the `condition` and extract usable index conditions.

        This method returns an instance of ``CompiledCondition``.  See
        the ``compile_condition()`` function in the ``conditions``
        module for more information about the compilation process.

        This method makes use of the condition cache when possible.

        """

        # Look up the condition in the condition cache.
        condcache = self._condition_cache
        condkey = self._get_condition_key(condition, condvars)
        compiled = condcache.get(condkey)
        if compiled:
            return compiled.with_replaced_vars(condvars)  # bingo!

        # Bad luck, the condition must be parsed and compiled.
        # Fortunately, the key provides some valuable information. ;)
        (condition, colnames, varnames, colpaths, vartypes) = condkey

        # Extract more information from referenced columns.

        # start with normal variables
        typemap = dict(list(zip(varnames, vartypes)))
        indexedcols = []
        for colname in colnames:
            col = condvars[colname]

            # Extract types from *all* the given variables.
            coltype = col.dtype.type
            typemap[colname] = _nxtype_from_nptype[coltype]

            # Get the set of columns with usable indexes.
            if (self._enabled_indexing_in_queries  # no in-kernel searches
                    and self.colindexed[col.pathname]
                    and not col.index.dirty):
                indexedcols.append(colname)

        indexedcols = frozenset(indexedcols)
        # Now let ``compile_condition()`` do the Numexpr-related job.
        compiled = compile_condition(condition, typemap, indexedcols)

        # Check that there actually are columns in the condition.
        if not set(compiled.parameters).intersection(set(colnames)):
            raise ValueError("there are no columns taking part "
                             "in condition ``%s``" % (condition,))

        # Store the compiled condition in the cache and return it.
        condcache[condkey] = compiled
        return compiled.with_replaced_vars(condvars)

    def will_query_use_indexing(self, condition, condvars=None):
        """Will a query for the condition use indexing?

        The meaning of the condition and *condvars* arguments is the same
        as in the :meth:`Table.where` method.  If the condition can use
        indexing, this method returns a frozenset with the path names of
        the columns whose index is usable.  Otherwise, it returns an
        empty frozenset.

        This method is mainly intended for testing.  Keep in mind that
        changing the set of indexed columns or their dirtiness may make
        this method return different values for the same arguments at
        different times.
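
        Example (a sketch with hypothetical column names)::

            # Returns e.g. frozenset({'col1'}) if only col1 has a usable
            # index for this condition.
            table.will_query_use_indexing('(col1 > 0) & (col2 < 10)')
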
1371 """

        # Compile the condition and extract usable index conditions.
        condvars = self._required_expr_vars(condition, condvars, depth=2)
        compiled = self._compile_condition(condition, condvars)
        # Return the columns in indexed expressions
        idxcols = [condvars[var].pathname
                   for var in compiled.index_variables]
        return frozenset(idxcols)

    def where(self, condition, condvars=None,
              start=None, stop=None, step=None):
        r"""Iterate over values fulfilling a condition.

        This method returns a Row iterator (see :ref:`RowClassDescr`)
        which only selects rows in the table that satisfy the given
        condition (an expression-like string).

        The condvars mapping may be used to define the variable names
        appearing in the condition.  condvars should consist of
        identifier-like strings pointing to Column (see
        :ref:`ColumnClassDescr`) instances *of this table*, or to other
        values (which will be converted to arrays).  A default set of
        condition variables is provided where each top-level, non-nested
        column with an identifier-like name appears.  Variables in
        condvars override the default ones.

        When condvars is not provided or None, the current local and
        global namespace is sought instead of condvars.  The previous
        mechanism is mostly intended for interactive usage.  To disable
        it, just specify a (maybe empty) mapping as condvars.

        If a range is supplied (by setting some of the start, stop or
        step parameters), only the rows in that range and fulfilling the
        condition are used.  The meaning of the start, stop and step
        parameters is the same as for Python slices.

        When possible, indexed columns participating in the condition
        will be used to speed up the search.  It is recommended that you
        place the indexed columns as far to the left of the condition as
        possible.  Anyway, this method always has better performance than
        regular Python selections on the table.

        You can mix this method with regular Python selections in order
        to support even more complex queries.  It is strongly recommended
        that you pass the most restrictive condition as the parameter to
        this method if you want to achieve maximum performance.

        .. warning::

            When in the middle of a table row iterator, you should not
            use methods that can change the number of rows in the table
            (like :meth:`Table.append` or :meth:`Table.remove_rows`) or
            unexpected errors will happen.

        Examples
        --------

        ::

            passvalues = [row['col3'] for row in
                          table.where('(col1 > 0) & (col2 <= 20)', step=5)
                          if your_function(row['col2'])]
            print("Values that pass the cuts:", passvalues)

        .. note::

            Special care should be taken when the query condition includes
            string literals.

            Let's assume that the table ``table`` has the following
            structure::

                class Record(IsDescription):
                    col1 = StringCol(4)  # 4-character string of bytes
                    col2 = IntCol()
                    col3 = FloatCol()

            The type of "col1" corresponds to strings of bytes.

            Any condition involving "col1" should be written using the
            appropriate type for string literals in order to avoid
            :exc:`TypeError`\ s.

            The code below will fail with a :exc:`TypeError`::

                condition = 'col1 == "AAAA"'
                for record in table.where(condition):  # TypeError in Python 3
                    # do something with "record"

            The reason is that in Python 3 "condition" implies a comparison
            between a string of bytes ("col1" contents) and a unicode
            literal ("AAAA").

            The correct way to write the condition is::

                condition = 'col1 == b"AAAA"'

        .. versionchanged:: 3.0
           The start, stop and step parameters now behave like in slice.

        """

        return self._where(condition, condvars, start, stop, step)

    def _where(self, condition, condvars, start=None, stop=None, step=None):
        """Low-level counterpart of `self.where()`."""

        if profile:
            tref = clock()
        if profile:
            show_stats("Entering table._where", tref)
        # Adjust the slice to be used.
        (start, stop, step) = self._process_range_read(start, stop, step)
        if start >= stop:  # empty range, reset conditions
            self._use_index = False
            self._where_condition = None
            return iter([])

        # Compile the condition and extract usable index conditions.
        condvars = self._required_expr_vars(condition, condvars, depth=3)
        compiled = self._compile_condition(condition, condvars)

        # Can we use indexes?
        if compiled.index_expressions:
            chunkmap = _table__where_indexed(
                self, compiled, condition, condvars, start, stop, step)
            if not isinstance(chunkmap, np.ndarray):
                # If it is not a NumPy array it should be an iterator
                # Reset conditions
                self._use_index = False
                self._where_condition = None
                # ...and return the iterator
                return chunkmap
        else:
            chunkmap = None  # default to an in-kernel query

        args = [condvars[param] for param in compiled.parameters]
        self._where_condition = (compiled.function, args, compiled.kwargs)
        row = tableextension.Row(self)
        if profile:
            show_stats("Exiting table._where", tref)
        return row._iter(start, stop, step, chunkmap=chunkmap)

    def read_where(self, condition, condvars=None, field=None,
                   start=None, stop=None, step=None):
        """Read table data fulfilling the given *condition*.

        This method is similar to :meth:`Table.read`; its common arguments
        and return values have the same meanings.  However, only the rows
        fulfilling the *condition* are included in the result.

        The meaning of the other arguments is the same as in the
        :meth:`Table.where` method.
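
        Example (a sketch with hypothetical column names)::

            energies = table.read_where('(name == b"helium") & (energy > 0)',
                                        field='energy')
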
1524 """

        self._g_check_open()
        coords = [p.nrow for p in
                  self._where(condition, condvars, start, stop, step)]
        self._where_condition = None  # reset the conditions
        if len(coords) > 1:
            cstart, cstop = coords[0], coords[-1] + 1
            if cstop - cstart == len(coords):
                # Chances for monotonically increasing row values.  Refine.
                inc_seq = np.alltrue(
                    np.arange(cstart, cstop) == np.array(coords))
                if inc_seq:
                    return self.read(cstart, cstop, field=field)
        return self.read_coordinates(coords, field)

    def append_where(self, dstTable, condition=None, condvars=None,
                     start=None, stop=None, step=None):
        """Append rows fulfilling the condition to the dstTable table.

        dstTable must be capable of taking the rows resulting from the
        query, i.e. it must have columns with the expected names and
        compatible types.  The meaning of the other arguments is the same
        as in the :meth:`Table.where` method.

        The number of rows appended to dstTable is returned as a result.

        .. versionchanged:: 3.0
           The *whereAppend* method has been renamed into *append_where*.
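
        Example (a sketch; ``dst`` is assumed to be a table with a
        compatible description)::

            nrows = table.append_where(dst, 'energy > 100')
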
1554 """

        self._g_check_open()

        # Check that the destination file is not in read-only mode.
        dstTable._v_file._check_writable()

        # Row objects do not support nested columns, so we must iterate
        # over the flat column paths.  When rows support nesting,
        # ``self.colnames`` can be directly iterated upon.
        colNames = [colName for colName in self.colpathnames]
        dstRow = dstTable.row
        nrows = 0
        if condition is not None:
            srcRows = self._where(condition, condvars, start, stop, step)
        else:
            srcRows = self.iterrows(start, stop, step)
        for srcRow in srcRows:
            for colName in colNames:
                dstRow[colName] = srcRow[colName]
            dstRow.append()
            nrows += 1
        dstTable.flush()
        return nrows

    def get_where_list(self, condition, condvars=None, sort=False,
                       start=None, stop=None, step=None):
        """Get the row coordinates fulfilling the given condition.

        The coordinates are returned as a list of the current flavor.
        sort means that you want to retrieve the coordinates ordered.
        The default is to not sort them.

        The meaning of the other arguments is the same as in the
        :meth:`Table.where` method.
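
        Example (a sketch with a hypothetical column name)::

            coords = table.get_where_list('energy > 100', sort=True)
            rows = table.read_coordinates(coords)
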
1590 """

        self._g_check_open()

        coords = [p.nrow for p in
                  self._where(condition, condvars, start, stop, step)]
        coords = np.array(coords, dtype=SizeType)
        # Reset the conditions
        self._where_condition = None
        if sort:
            coords = np.sort(coords)
        return internal_to_flavor(coords, self.flavor)
1603 def itersequence(self, sequence):
1604 """Iterate over a sequence of row coordinates."""
1606 if not hasattr(sequence, '__getitem__'):
1607 raise TypeError("Wrong 'sequence' parameter type. Only sequences "
1608 "are suported.")
1609 # start, stop and step are necessary for the new iterator for
1610 # coordinates, and perhaps it would be useful to add them as
1611 # parameters in the future (not now, because I've just removed
1612 # the `sort` argument for 2.1).
1613 #
1614 # *Important note*: Negative values for step are not supported
1615 # for the general case, but only for the itersorted() and
1616 # read_sorted() purposes! The self._process_range_read will raise
1617 an appropriate error.
1618 # F. Alted 2008-09-18
1619 # A.V. 20130513: _process_range_read --> _process_range
1620 (start, stop, step) = self._process_range(None, None, None)
1621 if (start > stop) or (len(sequence) == 0):
1622 return iter([])
1623 row = tableextension.Row(self)
1624 return row._iter(start, stop, step, coords=sequence)
1626 def _check_sortby_csi(self, sortby, checkCSI):
1627 if isinstance(sortby, Column):
1628 icol = sortby
1629 elif isinstance(sortby, str):
1630 icol = self.cols._f_col(sortby)
1631 else:
1632 raise TypeError(
1633 "`sortby` can only be a `Column` or string object, "
1634 "but you passed an object of type: %s" % type(sortby))
1635 if icol.is_indexed and icol.index.kind == "full":
1636 if checkCSI and not icol.index.is_csi:
1637 # The index exists, but it is not a CSI one.
1638 raise ValueError(
1639 "Field `%s` must have associated a CSI index "
1640 "in table `%s`, but the existing one is not. "
1641 % (sortby, self))
1642 return icol.index
1643 else:
1644 raise ValueError(
1645 "Field `%s` must have associated a 'full' index "
1646 "in table `%s`." % (sortby, self))
1648 def itersorted(self, sortby, checkCSI=False,
1649 start=None, stop=None, step=None):
1650 """Iterate table data following the order of the index of sortby
1651 column.
1653 The sortby column must have associated a full index. If you want to
1654 ensure a fully sorted order, the index must be a CSI one. You may want
1655 to use the checkCSI argument in order to explicitly check for the
1656 existence of a CSI index.
1658 The meaning of the start, stop and step arguments is the same as in
1659 :meth:`Table.read`.
1661 .. versionchanged:: 3.0
1662 If the *start* parameter is provided and *stop* is None then the
1663 table is iterated from *start* to the last line.
1664 In PyTables < 3.0 only one element was returned.
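Examples
--------
A minimal sketch, assuming a column named pressure that already has a
full index (e.g. created with table.cols.pressure.create_index(kind='full'))::
for row in table.itersorted('pressure'):
print(row['name'])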
1666 """
1668 index = self._check_sortby_csi(sortby, checkCSI)
1669 # Adjust the slice to be used.
1670 (start, stop, step) = self._process_range(start, stop, step,
1671 warn_negstep=False)
1672 if (start > stop and 0 < step) or (start < stop and 0 > step):
1673 # Fall-back action is to return an empty iterator
1674 return iter([])
1675 row = tableextension.Row(self)
1676 return row._iter(start, stop, step, coords=index)
1678 def read_sorted(self, sortby, checkCSI=False, field=None,
1679 start=None, stop=None, step=None):
1680 """Read table data following the order of the index of sortby column.
1682 The sortby column must have a full index associated with it. If you
1683 want to ensure a fully sorted order, the index must be a CSI one. You
1684 may want to use the checkCSI argument to explicitly check for the
1685 existence of a CSI index.
1687 If field is supplied only the named column will be selected. If the
1688 column is not nested, an *array* of the current flavor will be
1689 returned; if it is, a *structured array* will be used instead. If no
1690 field is specified, all the columns will be returned in a structured
1691 array of the current flavor.
1693 The meaning of the start, stop and step arguments is the same as in
1694 :meth:`Table.read`.
1696 .. versionchanged:: 3.0
1697 The start, stop and step parameters now behave like in slice.
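Examples
--------
A minimal sketch, assuming a column named pressure with a CSI index
(e.g. created with table.cols.pressure.create_csindex())::
data = table.read_sorted('pressure', checkCSI=True)
rdata = table.read_sorted('pressure', step=-1) # reverse order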
1699 """
1701 self._g_check_open()
1702 index = self._check_sortby_csi(sortby, checkCSI)
1703 coords = index[start:stop:step]
1704 return self.read_coordinates(coords, field)
1706 def iterrows(self, start=None, stop=None, step=None):
1707 """Iterate over the table using a Row instance.
1709 If a range is not supplied, *all the rows* in the table are iterated
1710 upon - you can also use the :meth:`Table.__iter__` special method for
1711 that purpose. If you want to iterate over a given *range of rows* in
1712 the table, you may use the start, stop and step parameters.
1714 .. warning::
1716 When in the middle of a table row iterator, you should not
1717 use methods that can change the number of rows in the table
1718 (like :meth:`Table.append` or :meth:`Table.remove_rows`) or
1719 unexpected errors will happen.
1721 See Also
1722 --------
1723 tableextension.Row : the table row iterator and field accessor
1725 Examples
1726 --------
1728 ::
1730 result = [ row['var2'] for row in table.iterrows(step=5)
1731 if row['var1'] <= 20 ]
1733 .. versionchanged:: 3.0
1734 If the *start* parameter is provided and *stop* is None then the
1735 table is iterated from *start* to the last line.
1736 In PyTables < 3.0 only one element was returned.
1738 """
1739 (start, stop, step) = self._process_range(start, stop, step,
1740 warn_negstep=False)
1741 if (start > stop and 0 < step) or (start < stop and 0 > step):
1742 # Fall-back action is to return an empty iterator
1743 return iter([])
1744 row = tableextension.Row(self)
1745 return row._iter(start, stop, step)
1747 def __iter__(self):
1748 """Iterate over the table using a Row instance.
1750 This is equivalent to calling :meth:`Table.iterrows` with default
1751 arguments, i.e. it iterates over *all the rows* in the table.
1753 See Also
1754 --------
1755 tableextension.Row : the table row iterator and field accessor
1757 Examples
1758 --------
1760 ::
1762 result = [ row['var2'] for row in table if row['var1'] <= 20 ]
1764 Which is equivalent to::
1766 result = [ row['var2'] for row in table.iterrows()
1767 if row['var1'] <= 20 ]
1769 """
1771 return self.iterrows()
1773 def _read(self, start, stop, step, field=None, out=None):
1774 """Read a range of rows and return an in-memory object."""
1776 select_field = None
1777 if field:
1778 if field not in self.coldtypes:
1779 if field in self.description._v_names:
1780 # Remember to select this field
1781 select_field = field
1782 field = None
1783 else:
1784 raise KeyError(("Field {} not found in table "
1785 "{}").format(field, self))
1786 else:
1787 # The column hangs directly from the top
1788 dtype_field = self.coldtypes[field]
1790 # Return an empty array if the selected range is empty
1791 if (start >= stop and 0 < step) or (start <= stop and 0 > step):
1792 if field is None:
1793 nra = self._get_container(0)
1794 return nra
1795 return np.empty(shape=0, dtype=dtype_field)
1797 nrows = len(range(start, stop, step))
1799 if out is None:
1800 # Compute the shape of the resulting column object
1801 if field:
1802 # Create a container for the results
1803 result = np.empty(shape=nrows, dtype=dtype_field)
1804 else:
1805 # Recarray case
1806 result = self._get_container(nrows)
1807 else:
1808 # there is no fast way to byteswap, since different columns may
1809 # have different byteorders
1810 if not out.dtype.isnative:
1811 raise ValueError("output array must be in system's byteorder "
1812 "or results will be incorrect")
1813 if field:
1814 bytes_required = dtype_field.itemsize * nrows
1815 else:
1816 bytes_required = self.rowsize * nrows
1817 if bytes_required != out.nbytes:
1818 raise ValueError(f'output array size invalid, got {out.nbytes}'
1819 f' bytes, need {bytes_required} bytes')
1820 if not out.flags['C_CONTIGUOUS']:
1821 raise ValueError('output array not C contiguous')
1822 result = out
1824 # Call the routine to fill-up the resulting array
1825 if step == 1 and not field:
1826 # This optimization works three times faster than
1827 # the row._fill_col method (up to 170 MB/s on a Pentium 4 @ 2 GHz)
1828 self._read_records(start, stop - start, result)
1829 # Warning!: _read_field_name should not be used until
1830 # H5TBread_fields_name in tableextension will be finished
1831 # F. Alted 2005/05/26
1832 # XXX Shall we implement this for PyTables 2.0??
1833 elif field and step > 15 and False: # deliberately disabled
1834 # For step>15, this seems to work always faster than row._fill_col.
1835 self._read_field_name(result, start, stop, step, field)
1836 else:
1837 self.row._fill_col(result, start, stop, step, field)
1839 if select_field:
1840 return result[select_field]
1841 else:
1842 return result
1844 def read(self, start=None, stop=None, step=None, field=None, out=None):
1845 """Get data in the table as a (record) array.
1847 The start, stop and step parameters can be used to select only
1848 a *range of rows* in the table. Their meanings are the same as
1849 in the built-in Python slices.
1851 If field is supplied only the named column will be selected.
1852 If the column is not nested, an *array* of the current flavor
1853 will be returned; if it is, a *structured array* will be used
1854 instead. If no field is specified, all the columns will be
1855 returned in a structured array of the current flavor.
1857 Columns under a nested column can be specified in the field
1858 parameter by using a slash character (/) as a separator (e.g.
1859 'position/x').
1861 The out parameter may be used to specify a NumPy array to
1862 receive the output data. Note that the array must have the
1863 same size as the data selected with the other parameters.
1864 Note that the array's datatype is not checked and no type
1865 casting is performed, so if it does not match the datatype on
1866 disk, the output will not be correct.
1868 When specifying a single nested column with the field parameter,
1869 and supplying an output buffer with the out parameter, the
1870 output buffer must contain all columns in the table.
1871 The data in all columns will be read into the output buffer.
1872 However, only the specified nested column will be returned from
1873 the method call.
1875 When data is read from disk in NumPy format, the output will be
1876 in the current system's byteorder, regardless of how it is
1877 stored on disk. If the out parameter is specified, the output
1878 array also must be in the current system's byteorder.
1880 .. versionchanged:: 3.0
1881 Added the *out* parameter. Also the start, stop and step
1882 parameters now behave like in slice.
1884 Examples
1885 --------
1887 Reading the entire table::
1889 t.read()
1891 Reading record n. 6::
1893 t.read(6, 7)
1895 Reading from record n. 6 to the end of the table::
1897 t.read(6)
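Reading into a preallocated buffer (a sketch; it assumes the table has
at least 10 rows and that its flavor is 'numpy')::
import numpy as np
buf = np.empty(10, dtype=t.dtype)
t.read(0, 10, out=buf)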
1899 """
1901 self._g_check_open()
1903 if field:
1904 self._check_column(field)
1906 if out is not None and self.flavor != 'numpy':
1907 msg = ("Optional 'out' argument may only be supplied if array "
1908 "flavor is 'numpy', currently is {}").format(self.flavor)
1909 raise TypeError(msg)
1911 start, stop, step = self._process_range(start, stop, step,
1912 warn_negstep=False)
1914 arr = self._read(start, stop, step, field, out)
1915 return internal_to_flavor(arr, self.flavor)
1917 def _read_coordinates(self, coords, field=None):
1918 """Private part of `read_coordinates()` with no flavor conversion."""
1920 coords = self._point_selection(coords)
1922 ncoords = len(coords)
1923 # Create a read buffer only if needed
1924 if field is None or ncoords > 0:
1925 # Doing a copy is faster when ncoords is small (<1000)
1926 if ncoords < min(1000, self.nrowsinbuf):
1927 result = self._v_iobuf[:ncoords].copy()
1928 else:
1929 result = self._get_container(ncoords)
1931 # Do the real read
1932 if ncoords > 0:
1933 # Turn coords into an array of coordinate indexes, if necessary
1934 if not (isinstance(coords, np.ndarray) and
1935 coords.dtype.type is _npsizetype and
1936 coords.flags.contiguous and
1937 coords.flags.aligned):
1938 # Get a contiguous and aligned coordinate array
1939 coords = np.array(coords, dtype=SizeType)
1940 self._read_elements(coords, result)
1942 # Do the final conversions, if needed
1943 if field:
1944 if ncoords > 0:
1945 result = get_nested_field(result, field)
1946 else:
1947 # Get an empty array from the cache
1948 result = self._getemptyarray(self.coldtypes[field])
1949 return result
1951 def read_coordinates(self, coords, field=None):
1952 """Get a set of rows given their indexes as a (record) array.
1954 This method works much like the :meth:`Table.read` method, but it uses
1955 a sequence (coords) of row indexes to select the wanted columns,
1956 instead of a column range.
1958 The selected rows are returned in an array or structured array of the
1959 current flavor.
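Examples
--------
A minimal sketch; the coordinates and the column name are illustrative
assumptions::
rows = table.read_coordinates([2, 5, 7])
pressures = table.read_coordinates([2, 5, 7], field='pressure')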
1961 """
1963 self._g_check_open()
1964 result = self._read_coordinates(coords, field)
1965 return internal_to_flavor(result, self.flavor)
1967 def get_enum(self, colname):
1968 """Get the enumerated type associated with the named column.
1970 If the column named colname (a string) exists and is of an enumerated
1971 type, the corresponding Enum instance (see :ref:`EnumClassDescr`) is
1972 returned. If it is not of an enumerated type, a TypeError is raised. If
1973 the column does not exist, a KeyError is raised.
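Examples
--------
A minimal sketch, assuming a column named status declared with an
EnumCol type::
enum = table.get_enum('status')
active = enum['active'] # concrete value for the (assumed) 'active' name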
1975 """
1977 self._check_column(colname)
1979 try:
1980 return self._colenums[colname]
1981 except KeyError:
1982 raise TypeError(
1983 "column ``%s`` of table ``%s`` is not of an enumerated type"
1984 % (colname, self._v_pathname))
1986 def col(self, name):
1987 """Get a column from the table.
1989 If a column called name exists in the table, it is read and returned as
1990 a NumPy object. If it does not exist, a KeyError is raised.
1992 Examples
1993 --------
1995 ::
1997 narray = table.col('var2')
1999 That statement is equivalent to::
2001 narray = table.read(field='var2')
2003 Here you can see how this method can be used as a shorthand for the
2004 :meth:`Table.read` method.
2006 """
2008 return self.read(field=name)
2010 def __getitem__(self, key):
2011 """Get a row or a range of rows from the table.
2013 If key argument is an integer, the corresponding table row is returned
2014 as a record of the current flavor. If key is a slice, the range of rows
2015 determined by it is returned as a structured array of the current
2016 flavor.
2018 In addition, NumPy-style point selections are supported. In
2019 particular, if key is a list of row coordinates, the set of rows
2020 determined by it is returned. Furthermore, if key is an array of
2021 boolean values, only the coordinates where key is True are returned.
2022 Note that for the latter to work, the key list must contain exactly
2023 as many entries as the table has rows.
2025 Examples
2026 --------
2028 ::
2030 record = table[4]
2031 recarray = table[4:1000:2]
2032 recarray = table[[4,1000]] # only retrieves rows 4 and 1000
2033 recarray = table[[True, False, ..., True]]
2035 Those statements are equivalent to::
2037 record = table.read(start=4)[0]
2038 recarray = table.read(start=4, stop=1000, step=2)
2039 recarray = table.read_coordinates([4,1000])
2040 recarray = table.read_coordinates([True, False, ..., True])
2042 Here, you can see how indexing can be used as a shorthand for the
2043 :meth:`Table.read` and :meth:`Table.read_coordinates` methods.
2045 """
2047 self._g_check_open()
2049 if is_idx(key):
2050 key = operator.index(key)
2052 # Index out of range protection
2053 if key >= self.nrows:
2054 raise IndexError("Index out of range")
2055 if key < 0:
2056 # To support negative values
2057 key += self.nrows
2058 (start, stop, step) = self._process_range(key, key + 1, 1)
2059 return self.read(start, stop, step)[0]
2060 elif isinstance(key, slice):
2061 (start, stop, step) = self._process_range(
2062 key.start, key.stop, key.step)
2063 return self.read(start, stop, step)
2064 # Try with a boolean or point selection
2065 elif type(key) in (list, tuple) or isinstance(key, np.ndarray):
2066 return self._read_coordinates(key, None)
2067 else:
2068 raise IndexError(f"Invalid index or slice: {key!r}")
2070 def __setitem__(self, key, value):
2071 """Set a row or a range of rows in the table.
2073 It takes different actions depending on the type of the *key*
2074 parameter: if it is an integer, the corresponding table row is
2075 set to *value* (a record or sequence capable of being converted
2076 to the table structure). If *key* is a slice, the row slice
2077 determined by it is set to *value* (a record array or sequence
2078 capable of being converted to the table structure).
2080 In addition, NumPy-style point selections are supported. In
2081 particular, if key is a list of row coordinates, the set of rows
2082 determined by it is set to value. Furthermore, if key is an array of
2083 boolean values, only the coordinates where key is True are set to
2084 values from value. Note that for the latter to work, the key list
2085 must contain exactly as many entries as the table has rows.
2087 Examples
2088 --------
2090 ::
2092 # Modify just one existing row
2093 table[2] = [456,'db2',1.2]
2095 # Modify two existing rows
2096 rows = numpy.rec.array([[457,'db1',1.2],[6,'de2',1.3]],
2097 formats='i4,a3,f8')
2098 table[1:5:2] = rows # modify a table slice
2099 table[[1,3]] = rows # only modifies rows 1 and 3
2100 table[[True,False,True]] = rows # only modifies rows 0 and 2
2102 Which is equivalent to::
2104 table.modify_rows(start=2, rows=[456,'db2',1.2])
2105 rows = numpy.rec.array([[457,'db1',1.2],[6,'de2',1.3]],
2106 formats='i4,a3,f8')
2107 table.modify_rows(start=1, stop=5, step=2, rows=rows)
2108 table.modify_coordinates([1,3], rows)
2109 table.modify_coordinates([True, False, True], rows)
2111 Here, you can see how indexing can be used as a shorthand for the
2112 :meth:`Table.modify_rows` and :meth:`Table.modify_coordinates`
2113 methods.
2115 """
2117 self._g_check_open()
2118 self._v_file._check_writable()
2120 if is_idx(key):
2121 key = operator.index(key)
2123 # Index out of range protection
2124 if key >= self.nrows:
2125 raise IndexError("Index out of range")
2126 if key < 0:
2127 # To support negative values
2128 key += self.nrows
2129 return self.modify_rows(key, key + 1, 1, [value])
2130 elif isinstance(key, slice):
2131 (start, stop, step) = self._process_range(
2132 key.start, key.stop, key.step)
2133 return self.modify_rows(start, stop, step, value)
2134 # Try with a boolean or point selection
2135 elif type(key) in (list, tuple) or isinstance(key, np.ndarray):
2136 return self.modify_coordinates(key, value)
2137 else:
2138 raise IndexError(f"Invalid index or slice: {key!r}")
2140 def _save_buffered_rows(self, wbufRA, lenrows):
2141 """Update the indexes after a flushing of rows."""
2143 self._open_append(wbufRA)
2144 self._append_records(lenrows)
2145 self._close_append()
2146 if self.indexed:
2147 self._unsaved_indexedrows += lenrows
2148 # The table caches for indexed queries are dirty now
2149 self._dirtycache = True
2150 if self.autoindex:
2151 # Flush the unindexed rows
2152 self.flush_rows_to_index(_lastrow=False)
2153 else:
2154 # All the columns are dirty now
2155 self._mark_columns_as_dirty(self.colpathnames)
2157 def append(self, rows):
2158 """Append a sequence of rows to the end of the table.
2160 The rows argument may be any object which can be converted to
2161 a structured array compliant with the table structure
2162 (otherwise, a ValueError is raised). This includes NumPy
2163 structured arrays, lists of tuples or array records, and a
2164 string or Python buffer.
2166 Examples
2167 --------
2169 ::
2171 import tables as tb
2173 class Particle(tb.IsDescription):
2174 name = tb.StringCol(16, pos=1) # 16-character String
2175 lati = tb.IntCol(pos=2) # integer
2176 longi = tb.IntCol(pos=3) # integer
2177 pressure = tb.Float32Col(pos=4) # float (single-precision)
2178 temperature = tb.FloatCol(pos=5) # double (double-precision)
2180 fileh = tb.open_file('test4.h5', mode='w')
2181 table = fileh.create_table(fileh.root, 'table', Particle,
2182 "A table")
2184 # Append several rows in only one call
2185 table.append([("Particle: 10", 10, 0, 10 * 10, 10**2),
2186 ("Particle: 11", 11, -1, 11 * 11, 11**2),
2187 ("Particle: 12", 12, -2, 12 * 12, 12**2)])
2188 fileh.close()
2190 """
2192 self._g_check_open()
2193 self._v_file._check_writable()
2195 if not self._chunked:
2196 raise HDF5ExtError(
2197 "You cannot append rows to a non-chunked table.", h5bt=False)
2199 if (hasattr(rows, "dtype") and
2200 not self.description._v_is_nested and
2201 rows.dtype == self.dtype):
2202 # Shortcut for compliant arrays
2203 # (for some reason, not valid for nested types)
2204 wbufRA = rows
2205 else:
2206 # Try to convert the object into a recarray compliant with table
2207 try:
2208 iflavor = flavor_of(rows)
2209 if iflavor != 'python':
2210 rows = array_as_internal(rows, iflavor)
2211 # Works for Python structures and always copies the original,
2212 # so the resulting object is safe for in-place conversion.
2213 wbufRA = np.rec.array(rows, dtype=self._v_dtype)
2214 except Exception as exc: # XXX
2215 raise ValueError("rows parameter cannot be converted into a "
2216 "recarray object compliant with table '%s'. "
2217 "The error was: <%s>" % (str(self), exc))
2218 lenrows = wbufRA.shape[0]
2219 # If the number of rows to append is zero, don't do anything else
2220 if lenrows > 0:
2221 # Save write buffer to disk
2222 self._save_buffered_rows(wbufRA, lenrows)
2224 def _conv_to_recarr(self, obj):
2225 """Try to convert the object into a recarray."""
2227 try:
2228 iflavor = flavor_of(obj)
2229 if iflavor != 'python':
2230 obj = array_as_internal(obj, iflavor)
2231 if hasattr(obj, "shape") and obj.shape == ():
2232 # To allow conversion of scalars (void type) into arrays.
2233 # See http://projects.scipy.org/scipy/numpy/ticket/315
2234 # for discussion on how to pass buffers to constructors
2235 # See also http://projects.scipy.org/scipy/numpy/ticket/348
2236 recarr = np.array([obj], dtype=self._v_dtype)
2237 else:
2238 # Works for Python structures and always copies the original,
2239 # so the resulting object is safe for in-place conversion.
2240 recarr = np.rec.array(obj, dtype=self._v_dtype)
2241 except Exception as exc: # XXX
2242 raise ValueError("Object cannot be converted into a recarray "
2243 "object compliant with table format '%s'. "
2244 "The error was: <%s>" %
2245 (self.description._v_nested_descr, exc))
2247 return recarr
2249 def modify_coordinates(self, coords, rows):
2250 """Modify a series of rows in positions specified in coords.
2252 The values in the selected rows will be modified with the data given in
2253 rows. This method returns the number of rows modified.
2255 The possible values for the rows argument are the same as in
2256 :meth:`Table.append`.
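Examples
--------
A minimal sketch, assuming the same three-column layout used in the
:meth:`Table.__setitem__` examples::
rows = numpy.rec.array([[457,'db1',1.2],[6,'de2',1.3]],
formats='i4,a3,f8')
table.modify_coordinates([1,3], rows)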
2258 """
2260 if rows is None: # Nothing to be done
2261 return SizeType(0)
2263 # Convert the coordinates to something expected by HDF5
2264 coords = self._point_selection(coords)
2266 lcoords = len(coords)
2267 if len(rows) < lcoords:
2268 raise ValueError("The value has not enough elements to fill-in "
2269 "the specified range")
2271 # Convert rows into a recarray
2272 recarr = self._conv_to_recarr(rows)
2274 if len(coords) > 0:
2275 # Do the actual update of rows
2276 self._update_elements(lcoords, coords, recarr)
2278 # Redo the index if needed
2279 self._reindex(self.colpathnames)
2281 return SizeType(lcoords)
2283 def modify_rows(self, start=None, stop=None, step=None, rows=None):
2284 """Modify a series of rows in the slice [start:stop:step].
2286 The values in the selected rows will be modified with the data given in
2287 rows. This method returns the number of rows modified. Should the
2288 modification exceed the length of the table, an IndexError is raised
2289 before changing data.
2291 The possible values for the rows argument are the same as in
2292 :meth:`Table.append`.
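Examples
--------
A minimal sketch, assuming the same three-column layout used in the
:meth:`Table.__setitem__` examples::
rows = numpy.rec.array([[457,'db1',1.2],[6,'de2',1.3]],
formats='i4,a3,f8')
table.modify_rows(start=1, stop=5, step=2, rows=rows)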
2294 """
2296 if step is None:
2297 step = 1
2298 if rows is None: # Nothing to be done
2299 return SizeType(0)
2300 if start is None:
2301 start = 0
2303 if start < 0:
2304 raise ValueError("'start' must have a positive value.")
2305 if step < 1:
2306 raise ValueError(
2307 "'step' must have a value greater or equal than 1.")
2308 if stop is None:
2309 # compute the stop value. start + len(rows)*step does not work
2310 stop = start + (len(rows) - 1) * step + 1
2312 (start, stop, step) = self._process_range(start, stop, step)
2313 if stop > self.nrows:
2314 raise IndexError("This modification will exceed the length of "
2315 "the table. Giving up.")
2316 # Compute the number of rows to read.
2317 nrows = len(range(start, stop, step))
2318 if len(rows) != nrows:
2319 raise ValueError("The value has different elements than the "
2320 "specified range")
2322 # Convert rows into a recarray
2323 recarr = self._conv_to_recarr(rows)
2325 lenrows = len(recarr)
2326 if start + lenrows > self.nrows:
2327 raise IndexError("This modification will exceed the length of the "
2328 "table. Giving up.")
2330 # Do the actual update
2331 self._update_records(start, stop, step, recarr)
2333 # Redo the index if needed
2334 self._reindex(self.colpathnames)
2336 return SizeType(lenrows)
2338 def modify_column(self, start=None, stop=None, step=None,
2339 column=None, colname=None):
2340 """Modify one single column in the row slice [start:stop:step].
2342 The colname argument specifies the name of the column in the
2343 table to be modified with the data given in column. This
2344 method returns the number of rows modified. Should the
2345 modification exceed the length of the table, an IndexError is
2346 raised before changing data.
2348 The *column* argument may be any object which can be converted
2349 to a (record) array compliant with the structure of the column
2350 to be modified (otherwise, a ValueError is raised). This
2351 includes NumPy (record) arrays, lists of scalars, tuples or
2352 array records, and a string or Python buffer.
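Examples
--------
A minimal sketch, assuming a float column named pressure::
table.modify_column(start=0, stop=3, colname='pressure',
column=[1.0, 2.0, 3.0])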
2354 """
2355 if step is None:
2356 step = 1
2357 if not isinstance(colname, str):
2358 raise TypeError("The 'colname' parameter must be a string.")
2359 self._v_file._check_writable()
2361 if column is None: # Nothing to be done
2362 return SizeType(0)
2363 if start is None:
2364 start = 0
2366 if start < 0:
2367 raise ValueError("'start' must have a positive value.")
2368 if step < 1:
2369 raise ValueError(
2370 "'step' must have a value greater or equal than 1.")
2371 # Get the column format to be modified:
2372 objcol = self._get_column_instance(colname)
2373 descr = [objcol._v_parent._v_nested_descr[objcol._v_pos]]
2374 # Try to convert the column object into a NumPy ndarray
2375 try:
2376 # If the column is a recarray (or kind of), convert into ndarray
2377 if hasattr(column, 'dtype') and column.dtype.kind == 'V':
2378 column = np.rec.array(column, dtype=descr).field(0)
2379 else:
2380 # Make sure the result is always a *copy* of the original,
2381 # so the resulting object is safe for in-place conversion.
2382 iflavor = flavor_of(column)
2383 column = array_as_internal(column, iflavor)
2384 except Exception as exc: # XXX
2385 raise ValueError("column parameter cannot be converted into a "
2386 "ndarray object compliant with specified column "
2387 "'%s'. The error was: <%s>" % (str(column), exc))
2389 # Get rid of singleton (length-1) dimensions
2390 column = column.squeeze()
2391 if column.shape == ():
2392 # Oops, stripped off too many dimensions
2393 column.shape = (1,)
2395 if stop is None:
2396 # compute the stop value. start + len(rows)*step does not work
2397 stop = start + (len(column) - 1) * step + 1
2398 (start, stop, step) = self._process_range(start, stop, step)
2399 if stop > self.nrows:
2400 raise IndexError("This modification will exceed the length of "
2401 "the table. Giving up.")
2402 # Compute the number of rows to read.
2403 nrows = len(range(start, stop, step))
2404 if len(column) < nrows:
2405 raise ValueError("The value has not enough elements to fill-in "
2406 "the specified range")
2407 # Now, read the original values:
2408 mod_recarr = self._read(start, stop, step)
2409 # Modify the appropriate column in the original recarray
2410 mod_col = get_nested_field(mod_recarr, colname)
2411 mod_col[:] = column
2412 # Save the modified rows in the table
2413 self._update_records(start, stop, step, mod_recarr)
2414 # Redo the index if needed
2415 self._reindex([colname])
2417 return SizeType(nrows)
2419 def modify_columns(self, start=None, stop=None, step=None,
2420 columns=None, names=None):
2421 """Modify a series of columns in the row slice [start:stop:step].
2423 The names argument specifies the names of the columns in the
2424 table to be modified with the data given in columns. This
2425 method returns the number of rows modified. Should the
2426 modification exceed the length of the table, an IndexError
2427 is raised before changing data.
2429 The columns argument may be any object which can be converted
2430 to a structured array compliant with the structure of the
2431 columns to be modified (otherwise, a ValueError is raised).
2432 This includes NumPy structured arrays, lists of tuples or array
2433 records, and a string or Python buffer.
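Examples
--------
A minimal sketch, assuming an int column var1 and a string column
var2::
table.modify_columns(start=0, columns=[[456, 457], ['db1', 'db2']],
names=['var1', 'var2'])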
2435 """
2436 if step is None:
2437 step = 1
2438 if type(names) not in (list, tuple):
2439 raise TypeError("The 'names' parameter must be a list of strings.")
2441 if columns is None: # Nothing to be done
2442 return SizeType(0)
2443 if start is None:
2444 start = 0
2445 if start < 0:
2446 raise ValueError("'start' must have a positive value.")
2447 if step < 1:
2448 raise ValueError("'step' must have a value greater or "
2449 "equal than 1.")
2450 descr = []
2451 for colname in names:
2452 objcol = self._get_column_instance(colname)
2453 descr.append(objcol._v_parent._v_nested_descr[objcol._v_pos])
2454 # descr.append(objcol._v_parent._v_dtype[objcol._v_pos])
2455 # Try to convert the columns object into a recarray
2456 try:
2457 # Make sure the result is always a *copy* of the original,
2458 # so the resulting object is safe for in-place conversion.
2459 iflavor = flavor_of(columns)
2460 if iflavor != 'python':
2461 columns = array_as_internal(columns, iflavor)
2462 recarray = np.rec.array(columns, dtype=descr)
2463 else:
2464 recarray = np.rec.fromarrays(columns, dtype=descr)
2465 except Exception as exc: # XXX
2466 raise ValueError("columns parameter cannot be converted into a "
2467 "recarray object compliant with table '%s'. "
2468 "The error was: <%s>" % (str(self), exc))
2470 if stop is None:
2471 # compute the stop value. start + len(rows)*step does not work
2472 stop = start + (len(recarray) - 1) * step + 1
2473 (start, stop, step) = self._process_range(start, stop, step)
2474 if stop > self.nrows:
2475 raise IndexError("This modification will exceed the length of "
2476 "the table. Giving up.")
2477 # Compute the number of rows to read.
2478 nrows = len(range(start, stop, step))
2479 if len(recarray) < nrows:
2480 raise ValueError("The value has not enough elements to fill-in "
2481 "the specified range")
2482 # Now, read the original values:
2483 mod_recarr = self._read(start, stop, step)
2484 # Modify the appropriate columns in the original recarray
2485 for i, name in enumerate(recarray.dtype.names):
2486 mod_col = get_nested_field(mod_recarr, names[i])
2487 mod_col[:] = recarray[name].squeeze()
2488 # Save the modified rows in the table
2489 self._update_records(start, stop, step, mod_recarr)
2490 # Redo the index if needed
2491 self._reindex(names)
2493 return SizeType(nrows)
2495 def flush_rows_to_index(self, _lastrow=True):
2496 """Add remaining rows in buffers to non-dirty indexes.
2498 This can be useful when you have chosen non-automatic indexing
2499 for the table (see the :attr:`Table.autoindex` property in
2500 :class:`Table`) and you want to update the indexes on it.
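Examples
--------
A minimal sketch of the manual-indexing workflow; new_rows stands for
an assumed sequence of table-compatible rows::
table.autoindex = False
table.append(new_rows)
table.flush_rows_to_index() # bring the indexes up to date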
2502 """
2504 rowsadded = 0
2505 if self.indexed:
2506 # Update the number of unsaved indexed rows
2507 start = self._indexedrows
2508 nrows = self._unsaved_indexedrows
2509 for (colname, colindexed) in self.colindexed.items():
2510 if colindexed:
2511 col = self.cols._g_col(colname)
2512 if nrows > 0 and not col.index.dirty:
2513 rowsadded = self._add_rows_to_index(
2514 colname, start, nrows, _lastrow, update=True)
2515 self._unsaved_indexedrows -= rowsadded
2516 self._indexedrows += rowsadded
2517 return rowsadded
2519 def _add_rows_to_index(self, colname, start, nrows, lastrow, update):
2520 """Add more elements to the existing index."""
2522 # This method really belongs to Column, but since it makes extensive
2523 # use of the table, it gets dangerous when closing the file, since the
2524 # column may be accessing a table which is being destroyed.
2525 index = self.cols._g_col(colname).index
2526 slicesize = index.slicesize
2527 # The next loop does not rely on xrange so that it can
2528 # deal with long ints (i.e. more than 32-bit integers)
2529 # This allows indexing columns with more than 2**31 rows
2530 # F. Alted 2005-05-09
2531 startLR = index.sorted.nrows * slicesize
2532 indexedrows = startLR - start
2533 stop = start + nrows - slicesize + 1
2534 while startLR < stop:
2535 index.append(
2536 [self._read(startLR, startLR + slicesize, 1, colname)],
2537 update=update)
2538 indexedrows += slicesize
2539 startLR += slicesize
2540 # index the remaining rows in last row
2541 if lastrow and startLR < self.nrows:
2542 index.append_last_row(
2543 [self._read(startLR, self.nrows, 1, colname)],
2544 update=update)
2545 indexedrows += self.nrows - startLR
2546 return indexedrows
2548 def remove_rows(self, start=None, stop=None, step=None):
2549 """Remove a range of rows in the table.
2551 If only start is supplied, that row and all following will be deleted.
2552 If a range is supplied, i.e. both the start and stop parameters are
2553 passed, all the rows in the range are removed.
2555 .. versionchanged:: 3.0
2556 The start, stop and step parameters now behave like in slice.
2558 .. seealso:: remove_row()
2560 Parameters
2561 ----------
2562 start : int
2563 Sets the starting row to be removed. It accepts negative values
2564 meaning that the count starts from the end. A value of 0 means the
2565 first row.
2566 stop : int
2567 Sets the last row to be removed to stop-1, i.e. the end point is
2568 omitted (in the Python range() tradition). Negative values are also
2569 accepted. If None all rows after start will be removed.
2570 step : int
2571 The step size between rows to remove.
2573 .. versionadded:: 3.0
2575 Examples
2576 --------
2578 Removing rows from 5 to 10 (excluded)::
2580 t.remove_rows(5, 10)
2582 Removing all rows starting from the 10th::
2584 t.remove_rows(10)
2586 Removing the 6th row::
2588 t.remove_rows(6, 7)
2590 .. note::
2592 Removing a single row can be done using the specific
2593 :meth:`remove_row` method.
2595 """
2597 (start, stop, step) = self._process_range(start, stop, step)
2598 nrows = self._remove_rows(start, stop, step)
2599 # remove_rows is an index-invalidating operation
2600 self._reindex(self.colpathnames)
2602 return SizeType(nrows)
2604 def remove_row(self, n):
2605 """Removes a row from the table.
2607 Parameters
2608 ----------
2609 n : int
2610 The index of the row to remove.
2613 .. versionadded:: 3.0
2615 Examples
2616 --------
2618 Remove row 15::
2620 table.remove_row(15)
2622 Which is equivalent to::
2624 table.remove_rows(15, 16)
2626 .. warning::
2628 This is not equivalent to::
2630 table.remove_rows(15)
2632 """
2634 self.remove_rows(start=n, stop=n + 1)
2636 def _g_update_dependent(self):
2637 super()._g_update_dependent()
2639 # Update the new path in columns
2640 self.cols._g_update_table_location(self)
2642 # Update the new path in the Row instance, if cached. Fixes #224.
2643 if 'row' in self.__dict__:
2644 self.__dict__['row'] = tableextension.Row(self)
2646 def _g_move(self, newparent, newname):
2647 """Move this node in the hierarchy.
2649 This overloads the Node._g_move() method.
2651 """
2653 itgpathname = _index_pathname_of(self)
2655 # First, move the table to the new location.
2656 super()._g_move(newparent, newname)
2658 # Then move the associated index group (if any).
2659 try:
2660 itgroup = self._v_file._get_node(itgpathname)
2661 except NoSuchNodeError:
2662 pass
2663 else:
2664 newigroup = self._v_parent
2665 newiname = _index_name_of(self)
2666 itgroup._g_move(newigroup, newiname)
2668 def _g_remove(self, recursive=False, force=False):
2669 # Remove the associated index group (if any).
2670 itgpathname = _index_pathname_of(self)
2671 try:
2672 itgroup = self._v_file._get_node(itgpathname)
2673 except NoSuchNodeError:
2674 pass
2675 else:
2676 itgroup._f_remove(recursive=True)
2677 self.indexed = False # there are no more indexes
2679 # Remove the leaf itself from the hierarchy.
2680 super()._g_remove(recursive, force)
2682 def _set_column_indexing(self, colpathname, indexed):
2683 """Mark the referred column as indexed or non-indexed."""
2685 colindexed = self.colindexed
2686 isindexed, wasindexed = bool(indexed), colindexed[colpathname]
2687 if isindexed == wasindexed:
2688 return # indexing state is unchanged
2690 # Changing the set of indexed columns invalidates the condition cache
2691 self._condition_cache.clear()
2692 colindexed[colpathname] = isindexed
2693 self.indexed = any(colindexed.values()) # logical OR over all columns
2695 def _mark_columns_as_dirty(self, colnames):
2696 """Mark column indexes in `colnames` as dirty."""
2698 assert len(colnames) > 0
2699 if self.indexed:
2700 colindexed, cols = self.colindexed, self.cols
2701 # Mark the proper indexes as dirty
2702 for colname in colnames:
2703 if colindexed[colname]:
2704 col = cols._g_col(colname)
2705 col.index.dirty = True
2707 def _reindex(self, colnames):
2708 """Re-index columns in `colnames` if automatic indexing is true."""
2710 if self.indexed:
2711 colindexed, cols = self.colindexed, self.cols
2712 colstoindex = []
2713 # Mark the proper indexes as dirty
2714 for colname in colnames:
2715 if colindexed[colname]:
2716 col = cols._g_col(colname)
2717 col.index.dirty = True
2718 colstoindex.append(colname)
2719 # Now, re-index the dirty ones
2720 if self.autoindex and colstoindex:
2721 self._do_reindex(dirty=True)
2722 # The table caches for indexed queries are dirty now
2723 self._dirtycache = True
2725 def _do_reindex(self, dirty):
2726 """Common code for `reindex()` and `reindex_dirty()`."""
2728 indexedrows = 0
2729 for (colname, colindexed) in self.colindexed.items():
2730 if colindexed:
2731 indexcol = self.cols._g_col(colname)
2732 indexedrows = indexcol._do_reindex(dirty)
2733 # Update counters in case some column has been updated
2734 if indexedrows > 0:
2735 self._indexedrows = indexedrows
2736 self._unsaved_indexedrows = self.nrows - indexedrows
2738 return SizeType(indexedrows)
2740 def reindex(self):
2741 """Recompute all the existing indexes in the table.
2743 This can be useful when you suspect that, for any reason, the
2744 index information for columns is no longer valid and want to
2745 rebuild the indexes on it.
2747 """
2749 self._do_reindex(dirty=False)
2751 def reindex_dirty(self):
2752 """Recompute the existing indexes in table, *if* they are dirty.
2754 This can be useful when you have set :attr:`Table.autoindex`
2755 (see :class:`Table`) to false for the table and you want to
2756 update the indexes after an index-invalidating operation
2757 (:meth:`Table.remove_rows`, for example).
2759 """
2761 self._do_reindex(dirty=True)
2763 def _g_copy_rows(self, object, start, stop, step, sortby, checkCSI):
2764 """Copy rows from self to object"""
2765 if sortby is None:
2766 self._g_copy_rows_optim(object, start, stop, step)
2767 return
2768 lenbuf = self.nrowsinbuf
2769 absstep = step
2770 if step < 0:
2771 absstep = -step
2772 start, stop = stop + 1, start + 1
2773 if sortby is not None:
2774 index = self._check_sortby_csi(sortby, checkCSI)
2775 for start2 in range(start, stop, absstep * lenbuf):
2776 stop2 = start2 + absstep * lenbuf
2777 if stop2 > stop:
2778 stop2 = stop
2779 # The next 'if' is not needed, but it doesn't hurt either
2780 if sortby is None:
2781 rows = self[start2:stop2:step]
2782 else:
2783 coords = index[start2:stop2:step]
2784 rows = self.read_coordinates(coords)
2785 # Save the records on disk
2786 object.append(rows)
2787 object.flush()
2789 def _g_copy_rows_optim(self, object, start, stop, step):
2790 """Copy rows from self to object (optimized version)"""
2792 nrowsinbuf = self.nrowsinbuf
2793 object._open_append(self._v_iobuf)
2794 nrowsdest = object.nrows
2795 for start2 in range(start, stop, step * nrowsinbuf):
2796 # Save the records on disk
2797 stop2 = start2 + step * nrowsinbuf
2798 if stop2 > stop:
2799 stop2 = stop
2800 # Optimized version (it saves some conversions)
2801 nrows = ((stop2 - start2 - 1) // step) + 1
2802 self.row._fill_col(self._v_iobuf, start2, stop2, step, None)
2803 # The output buffer is created anew,
2804 # so the operation is safe for in-place conversion.
2805 object._append_records(nrows)
2806 nrowsdest += nrows
2807 object._close_append()
2809 def _g_prop_indexes(self, other):
2810 """Generate index in `other` table for every indexed column here."""
2812 oldcols, newcols = self.colinstances, other.colinstances
2813 for colname in newcols:
2814 if isinstance(oldcols[colname], Column):
2815 oldcolindexed = oldcols[colname].is_indexed
2816 if oldcolindexed:
2817 oldcolindex = oldcols[colname].index
2818 newcol = newcols[colname]
2819 newcol.create_index(
2820 kind=oldcolindex.kind, optlevel=oldcolindex.optlevel,
2821 filters=oldcolindex.filters, tmp_dir=None)
2823 def _g_copy_with_stats(self, group, name, start, stop, step,
2824 title, filters, chunkshape, _log, **kwargs):
2825 """Private part of Leaf.copy() for each kind of leaf."""
2827 # Get the private args for the Table flavor of copy()
2828 sortby = kwargs.pop('sortby', None)
2829 propindexes = kwargs.pop('propindexes', False)
2830 checkCSI = kwargs.pop('checkCSI', False)
2831 # Compute the correct indices.
2832 (start, stop, step) = self._process_range_read(
2833 start, stop, step, warn_negstep=sortby is None)
2834 # And the number of final rows
2835 nrows = len(range(start, stop, step))
2836 # Create the new table and copy the selected data.
2837 newtable = Table(group, name, self.description, title=title,
2838 filters=filters, expectedrows=nrows,
2839 chunkshape=chunkshape,
2840 _log=_log)
2841 self._g_copy_rows(newtable, start, stop, step, sortby, checkCSI)
2842 nbytes = newtable.nrows * newtable.rowsize
2843 # Generate equivalent indexes in the new table, if required.
2844 if propindexes and self.indexed:
2845 self._g_prop_indexes(newtable)
2846 return (newtable, nbytes)
2848 # This overloading of copy is needed here in order to document
2849 # the additional keywords for the Table case.
2850 def copy(self, newparent=None, newname=None, overwrite=False,
2851 createparents=False, **kwargs):
2852 """Copy this table and return the new one.
2854 This method has the behavior and keywords described in
2855 :meth:`Leaf.copy`. Moreover, it recognises the following additional
2856 keyword arguments.
2858 Parameters
2859 ----------
2860 sortby
2861 If specified, and sortby corresponds to a column with an index,
2862 then the copy will be sorted by this index. If you want to ensure
2863 a fully sorted order, the index must be a CSI one. A reverse
2864 sorted copy can be achieved by specifying a negative value for the
2865 step keyword. If sortby is omitted or None, the original table
2866 order is used.
2867 checkCSI
2868 If true and a CSI index does not exist for the sortby column, an
2869 error will be raised. If false (the default), it does nothing.
2870 You can use this flag in order to explicitly check for the
2871 existence of a CSI index.
2872 propindexes
2873 If true, the existing indexes in the source table are propagated
2874 (created) to the new one. If false (the default), the indexes are
2875 not propagated.
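Examples
--------
A minimal sketch, assuming a column named pressure with a CSI index::
newtable = table.copy(newname='tablecopy', sortby='pressure',
checkCSI=True, propindexes=True)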
2877 """
2879 return super().copy(
2880 newparent, newname, overwrite, createparents, **kwargs)
2882 def flush(self):
2883 """Flush the table buffers."""
2885 if self._v_file._iswritable():
2886 # Flush rows that remain to be appended
2887 if 'row' in self.__dict__:
2888 self.row._flush_buffered_rows()
2889 if self.indexed and self.autoindex:
2890 # Flush any unindexed row
2891 rowsadded = self.flush_rows_to_index(_lastrow=True)
2892 assert rowsadded <= 0 or self._indexedrows == self.nrows, \
2893 ("internal error: the number of indexed rows (%d) "
2894 "and rows in the table (%d) is not equal; "
2895 "please report this to the authors."
2896 % (self._indexedrows, self.nrows))
2897 if self._dirtyindexes:
2898 # Finally, re-index any dirty column
2899 self.reindex_dirty()
2901 super().flush()
2903 def _g_pre_kill_hook(self):
2904 """Code to be called before killing the node."""
2906 # Flush the buffers before to clean-up them
2907 # self.flush()
2908 # It seems that flushing during the __del__ phase is a sure recipe for
2909 # bringing all kind of problems:
2910 # 1. Illegal Instruction
2911 # 2. Malloc(): trying to call free() twice
2912 # 3. Bus Error
2913 # 4. Segmentation fault
2914 # So, the best would be doing *nothing* at all in this __del__ phase.
2915 # As a consequence, the I/O will not be cleaned until a call to
2916 # Table.flush() would be done. This could lead to a potentially large
2917 # memory consumption.
2918 # NOTE: Users should make a call to Table.flush() whenever they
2919 # have finished working with their table.
2920 # I've added a Performance warning in order to compel the user to
2921 # call self.flush() before the table is being preempted.
2922 # F. Alted 2006-08-03
2923 if (('row' in self.__dict__ and self.row._get_unsaved_nrows() > 0) or
2924 (self.indexed and self.autoindex and
2925 (self._unsaved_indexedrows > 0 or self._dirtyindexes))):
2926 warnings.warn(("table ``%s`` is being preempted from alive nodes "
2927 "without its buffers being flushed or with some "
2928 "index being dirty. This may lead to very "
2929 "ineficient use of resources and even to fatal "
2930 "errors in certain situations. Please do a call "
2931 "to the .flush() or .reindex_dirty() methods on "
2932 "this table before start using other nodes.")
2933 % (self._v_pathname), PerformanceWarning)
2934 # Get rid of the IO buffers (if they have been created at all)
2935 mydict = self.__dict__
2936 if '_v_iobuf' in mydict:
2937 del mydict['_v_iobuf']
2938 if '_v_wdflts' in mydict:
2939 del mydict['_v_wdflts']
2941 def _f_close(self, flush=True):
2942 if not self._v_isopen:
2943 return # the node is already closed
2945 # .. note::
2946 #
2947 # As long as ``Table`` objects access their indices on closing,
2948 # ``File.close()`` will need to make *two separate passes*
2949 # to first close ``Table`` objects and then ``Index`` hierarchies.
2950 #
2952 # Flush right now so the row object does not get in the middle.
2953 if flush:
2954 self.flush()
2956 # Some warnings can be issued after calling `self._g_set_location()`
2957 # in `self.__init__()`. If warnings are turned into exceptions,
2958 # `self._g_post_init_hook` may not be called and `self.cols` not set.
2959 # One example of this is
2960 # ``test_create.createTestCase.test05_maxFieldsExceeded()``.
2961 cols = self.cols
2962 if cols is not None:
2963 cols._g_close()
2965 # Clean address cache
2966 self._clean_chunk_addrs()
2968 # Close myself as a leaf.
2969 super()._f_close(False)
2971 def __repr__(self):
2972 """This provides column metainfo in addition to standard __str__"""
2974 if self.indexed:
2975 format = """\
2976%s
2977 description := %r
2978 byteorder := %r
2979 chunkshape := %r
2980 autoindex := %r
2981 colindexes := %r"""
2982 return format % (str(self), self.description, self.byteorder,
2983 self.chunkshape, self.autoindex,
2984 _ColIndexes(self.colindexes))
2985 else:
2986 return """\
2987%s
2988 description := %r
2989 byteorder := %r
2990 chunkshape := %r""" % \
2991 (str(self), self.description, self.byteorder, self.chunkshape)
2994class Cols:
2995 """Container for columns in a table or nested column.
2997 This class is used as an *accessor* to the columns in a table or nested
2998 column. It supports the *natural naming* convention, so that you can
2999 access the different columns as attributes which lead to Column instances
3000 (for non-nested columns) or other Cols instances (for nested columns).
3002 For instance, if table.cols is a Cols instance with a column named col1
3003 under it, the latter can be accessed as table.cols.col1. If col1 is nested
3004 and contains a col2 column, this can be accessed as table.cols.col1.col2
3005 and so on. Because of natural naming, the names of members start with
3006 special prefixes, like in the Group class (see :ref:`GroupClassDescr`).
3008 Like the Column class (see :ref:`ColumnClassDescr`), Cols supports item
3009 access to read and write ranges of values in the table or nested column.
3012 .. rubric:: Cols attributes
3014 .. attribute:: _v_colnames
3016 A list of the names of the columns hanging directly
3017 from the associated table or nested column. The order of
3018 the names matches the order of their respective columns in
3019 the containing table.
3021 .. attribute:: _v_colpathnames
3023 A list of the pathnames of all the columns under the
3024 associated table or nested column (in preorder). If it does
3025 not contain nested columns, this is exactly the same as the
3026 :attr:`Cols._v_colnames` attribute.
3028 .. attribute:: _v_desc
3030 The associated Description instance (see
3031 :ref:`DescriptionClassDescr`).
3033 """
3035 @property
3036 def _v_table(self):
3037 """The parent Table instance (see :ref:`TableClassDescr`)."""
3038 return self._v__tableFile._get_node(self._v__tablePath)
3040 def __init__(self, table, desc):
3041 myDict = self.__dict__
3042 myDict['_v__tableFile'] = table._v_file
3043 myDict['_v__tablePath'] = table._v_pathname
3044 myDict['_v_desc'] = desc
3045 myDict['_v_colnames'] = desc._v_names
3046 myDict['_v_colpathnames'] = table.description._v_pathnames
3047 # Put the column in the local dictionary
3048 for name in desc._v_names:
3049 if name in desc._v_types:
3050 myDict[name] = Column(table, name, desc)
3051 else:
3052 myDict[name] = Cols(table, desc._v_colobjects[name])
3054 def _g_update_table_location(self, table):
3055 """Updates the location information about the associated `table`."""
3057 myDict = self.__dict__
3058 myDict['_v__tableFile'] = table._v_file
3059 myDict['_v__tablePath'] = table._v_pathname
3061 # Update the locations in individual columns.
3062 for colname in self._v_colnames:
3063 myDict[colname]._g_update_table_location(table)
3065 def __len__(self):
3066 """Get the number of top level columns in table."""
3068 return len(self._v_colnames)
3070 def _f_col(self, colname):
3071 """Get an accessor to the column colname.
3073 This method returns a Column instance (see :ref:`ColumnClassDescr`) if
3074 the requested column is not nested, and a Cols instance (see
3075 :ref:`ColsClassDescr`) if it is. You may use full column pathnames in
3076 colname.
3078 Calling cols._f_col('col1/col2') is equivalent to using cols.col1.col2.
3079 However, the first syntax is better suited to programmatic use. It is
3080 also better if you want to access columns with names that are not valid
3081 Python identifiers.
3083 """
3085 if not isinstance(colname, str):
3086 raise TypeError("Parameter can only be an string. You passed "
3087 "object: %s" % colname)
3088 if ((colname.find('/') > -1 and
3089 colname not in self._v_colpathnames) and
3090 colname not in self._v_colnames):
3091 raise KeyError(("Cols accessor ``%s.cols%s`` does not have a "
3092 "column named ``%s``")
3093 % (self._v__tablePath, self._v_desc._v_pathname,
3094 colname))
3096 return self._g_col(colname)
3098 def _g_col(self, colname):
3099 """Like `self._f_col()` but it does not check arguments."""
3101 # Get the Column or Description object
3102 inames = colname.split('/')
3103 cols = self
3104 for iname in inames:
3105 cols = cols.__dict__[iname]
3106 return cols
3108 def __getitem__(self, key):
3109 """Get a row or a range of rows from a table or nested column.
3111 If key argument is an integer, the corresponding nested type row is
3112 returned as a record of the current flavor. If key is a slice, the
3113 range of rows determined by it is returned as a structured array of the
3114 current flavor.
3116 Examples
3117 --------
3119 ::
3121 record = table.cols[4] # equivalent to table[4]
3122 recarray = table.cols.Info[4:1000:2]
3124 Those statements are equivalent to::
3126 nrecord = table.read(start=4)[0]
3127 nrecarray = table.read(start=4, stop=1000, step=2).field('Info')
3129 Here you can see how a mix of natural naming, indexing and slicing can
3130 be used as shorthands for the :meth:`Table.read` method.
3132 """
3133 table = self._v_table
3134 nrows = table.nrows
3135 if is_idx(key):
3136 key = operator.index(key)
3138 # Index out of range protection
3139 if key >= nrows:
3140 raise IndexError("Index out of range")
3141 if key < 0:
3142 # To support negative values
3143 key += nrows
3144 (start, stop, step) = table._process_range(key, key + 1, 1)
3145 colgroup = self._v_desc._v_pathname
3146 if colgroup == "": # The root group
3147 return table.read(start, stop, step)[0]
3148 else:
3149 crecord = table.read(start, stop, step)[0]
3150 return crecord[colgroup]
3151 elif isinstance(key, slice):
3152 (start, stop, step) = table._process_range(
3153 key.start, key.stop, key.step)
3154 colgroup = self._v_desc._v_pathname
3155 if colgroup == "": # The root group
3156 return table.read(start, stop, step)
3157 else:
3158 crecarray = table.read(start, stop, step)
3159 if hasattr(crecarray, "field"):
3160 return crecarray.field(colgroup) # RecArray case
3161 else:
3162 return get_nested_field(crecarray, colgroup) # numpy case
3163 else:
3164 raise TypeError(f"invalid index or slice: {key!r}")
3166 def __setitem__(self, key, value):
3167 """Set a row or a range of rows in a table or nested column.
3169 If key argument is an integer, the corresponding row is set to
3170 value. If key is a slice, the range of rows determined by it is set to
3171 value.
3173 Examples
3174 --------
3176 ::
3178 table.cols[4] = record
3179 table.cols.Info[4:1000:2] = recarray
3181 Those statements are equivalent to::
3183 table.modify_rows(4, rows=record)
3184 table.modify_column(4, 1000, 2, colname='Info', column=recarray)
3186 Here you can see how a mix of natural naming, indexing and slicing
3187 can be used as shorthands for the :meth:`Table.modify_rows` and
3188 :meth:`Table.modify_column` methods.
3190 """
3192 table = self._v_table
3193 nrows = table.nrows
3194 if is_idx(key):
3195 key = operator.index(key)
3197 # Index out of range protection
3198 if key >= nrows:
3199 raise IndexError("Index out of range")
3200 if key < 0:
3201 # To support negative values
3202 key += nrows
3203 (start, stop, step) = table._process_range(key, key + 1, 1)
3204 elif isinstance(key, slice):
3205 (start, stop, step) = table._process_range(
3206 key.start, key.stop, key.step)
3207 else:
3208 raise TypeError(f"invalid index or slice: {key!r}")
3210 # Actually modify the correct columns
3211 colgroup = self._v_desc._v_pathname
3212 if colgroup == "": # The root group
3213 table.modify_rows(start, stop, step, rows=value)
3214 else:
3215 table.modify_column(
3216 start, stop, step, colname=colgroup, column=value)
3218 def _g_close(self):
3219 # First, close the columns (i.e. any open indexes)
3220 for col in self._v_colnames:
3221 colobj = self._g_col(col)
3222 if isinstance(colobj, Column):
3223 colobj.close()
3224 # Delete the reference to column
3225 del self.__dict__[col]
3226 else:
3227 colobj._g_close()
3229 self.__dict__.clear()
3231 def __str__(self):
3232 """The string representation for this object."""
3234 # The pathname
3235 descpathname = self._v_desc._v_pathname
3236 if descpathname:
3237 descpathname = "." + descpathname
3238 return (f"{self._v__tablePath}.cols{descpathname} "
3239 f"({self.__class__.__name__}), "
3240 f"{len(self._v_colnames)} columns")
3242 def __repr__(self):
3243 """A detailed string representation for this object."""
3245 lines = [f'{self!s}']
3246 for name in self._v_colnames:
3247 # Get this class name
3248 classname = getattr(self, name).__class__.__name__
3249 # The type
3250 if name in self._v_desc._v_dtypes:
3251 tcol = self._v_desc._v_dtypes[name]
3252 # The shape for this column
3253 shape = (self._v_table.nrows,) + \
3254 self._v_desc._v_dtypes[name].shape
3255 else:
3256 tcol = "Description"
3257 # Description doesn't have a shape currently
3258 shape = ()
3259 lines.append(f" {name} ({classname}{shape}, {tcol})")
3260 return '\n'.join(lines) + '\n'
3263class Column:
3264 """Accessor for a non-nested column in a table.
3266 Each instance of this class is associated with one *non-nested* column of a
3267 table. These instances are mainly used to read and write data from the
3268 table columns using item access (like the Cols class - see
3269 :ref:`ColsClassDescr`), but there are a few other associated methods to
3270 deal with indexes.
3272 .. rubric:: Column attributes
3274 .. attribute:: descr
3276 The Description (see :ref:`DescriptionClassDescr`) instance of the
3277 parent table or nested column.
3279 .. attribute:: name
3281 The name of the associated column.
3283 .. attribute:: pathname
3285 The complete pathname of the associated column (the same as
3286 Column.name if the column is not inside a nested column).
3288 Parameters
3289 ----------
3290 table
3291 The parent table instance
3292 name
3293 The name of the column that is associated with this object
3294 descr
3295 The parent description object
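Examples
--------

A minimal usage sketch; the file, table and column names here are
hypothetical::

    import tables as tb

    with tb.open_file('example.h5') as h5f:
        table = h5f.root.readout
        col = table.cols.pressure     # a Column instance
        print(col.dtype, col.shape)   # NumPy dtype and (nrows, ...) shape
        first = col[0]                # read a single element
        block = col[10:20]            # read a range of rows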
3297 """
3299 @lazyattr
3300 def dtype(self):
3301 """The NumPy dtype that most closely matches this column."""
3303 return self.descr._v_dtypes[self.name].base # Get rid of shape info
3305 @lazyattr
3306 def type(self):
3307 """The PyTables type of the column (a string)."""
3309 return self.descr._v_types[self.name]
3311 @property
3312 def table(self):
3313 """The parent Table instance (see :ref:`TableClassDescr`)."""
3314 return self._table_file._get_node(self._table_path)
3316 @property
3317 def index(self):
3318 """The Index instance (see :ref:`IndexClassDescr`) associated with this
3319 column (None if the column is not indexed)."""
3320 indexPath = _index_pathname_of_column_(self._table_path, self.pathname)
3321 try:
3322 index = self._table_file._get_node(indexPath)
3323 except NodeError:
3324 index = None # The column is not indexed
3325 return index
3327 @lazyattr
3328 def _itemtype(self):
3329 return self.descr._v_dtypes[self.name]
3331 @property
3332 def shape(self):
3333 """The shape of this column."""
3334 return (self.table.nrows,) + self.descr._v_dtypes[self.name].shape
3336 @property
3337 def is_indexed(self):
3338 """True if the column is indexed, False otherwise."""
3339 return self.index is not None
3344 @property
3345 def maindim(self):
3346 """The dimension along which iterators work. Its value is 0 (i.e. the
3347 first dimension)."""
3348 return 0
3350 def __init__(self, table, name, descr):
3351 self._table_file = table._v_file
3352 self._table_path = table._v_pathname
3353 self.name = name
3354 """The name of the associated column."""
3355 self.pathname = descr._v_colobjects[name]._v_pathname
3356 """The complete pathname of the associated column (the same as
3357 Column.name if the column is not inside a nested column)."""
3358 self.descr = descr
3359 """The Description (see :ref:`DescriptionClassDescr`) instance of the
3360 parent table or nested column."""
3362 def _g_update_table_location(self, table):
3363 """Update the location information for the associated `table`."""
3365 self._table_file = table._v_file
3366 self._table_path = table._v_pathname
3368 def __len__(self):
3369 """Get the number of elements in the column.
3371 This is equal to the number of rows in the parent table.
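Examples
--------

For instance (the column name is illustrative)::

    assert len(table.cols.energy) == table.nrows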
3373 """
3375 return self.table.nrows
3377 def __getitem__(self, key):
3378 """Get a row or a range of rows from a column.
3380 If the key argument is an integer, the corresponding column element
3381 is returned as an object of the current flavor. If key is a slice, the
3382 range of elements determined by it is returned as an array of the
3383 current flavor.
3385 Examples
3386 --------
3388 ::
3390 print("Column handlers:")
3391 for name in table.colnames:
3392 print(table.cols._f_col(name))
3393 print("Select table.cols.name[1]-->", table.cols.name[1])
3394 print("Select table.cols.name[1:2]-->", table.cols.name[1:2])
3395 print("Select table.cols.name[:]-->", table.cols.name[:])
3396 print("Select table.cols._f_col('name')[:]-->",
3397 table.cols._f_col('name')[:])
3399 The output of this for an example table would be::
3401 Column handlers:
3402 /table.cols.name (Column(), string, idx=None)
3403 /table.cols.lati (Column(), int32, idx=None)
3404 /table.cols.longi (Column(), int32, idx=None)
3405 /table.cols.vector (Column(2,), int32, idx=None)
3406 /table.cols.matrix2D (Column(2, 2), float64, idx=None)
3407 Select table.cols.name[1]--> Particle: 11
3408 Select table.cols.name[1:2]--> ['Particle: 11']
3409 Select table.cols.name[:]--> ['Particle: 10'
3410 'Particle: 11' 'Particle: 12'
3411 'Particle: 13' 'Particle: 14']
3412 Select table.cols._f_col('name')[:]--> ['Particle: 10'
3413 'Particle: 11' 'Particle: 12'
3414 'Particle: 13' 'Particle: 14']
3416 See the :file:`examples/table2.py` file for a more complete example.
3418 """
3420 table = self.table
3422 # Generalized key support not there yet, but at least allow
3423 # for a tuple with a single element (the main dimension).
3424 # (key,) --> key
3425 if isinstance(key, tuple) and len(key) == 1:
3426 key = key[0]
3428 if is_idx(key):
3429 key = operator.index(key)
3431 # Index out of range protection
3432 if key >= table.nrows:
3433 raise IndexError("Index out of range")
3434 if key < 0:
3435 # To support negative values
3436 key += table.nrows
3437 (start, stop, step) = table._process_range(key, key + 1, 1)
3438 return table.read(start, stop, step, self.pathname)[0]
3439 elif isinstance(key, slice):
3440 (start, stop, step) = table._process_range(
3441 key.start, key.stop, key.step)
3442 return table.read(start, stop, step, self.pathname)
3443 else:
3444 raise TypeError(
3445 f"invalid index or slice: {key!r}")
3447 def __iter__(self):
3448 """Iterate through all items in the column.
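Rows are read internally in buffer-sized chunks, so iterating does
not load the whole column into memory at once.

Examples
--------

A brief sketch; the column name is illustrative::

    total = 0.0
    for value in table.cols.energy:
        total += value
"""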
3450 table = self.table
3451 itemsize = self.dtype.itemsize
3452 nrowsinbuf = table._v_file.params['IO_BUFFER_SIZE'] // itemsize
3453 buf = np.empty((nrowsinbuf, ), self._itemtype)
3454 max_row = len(self)
3455 for start_row in range(0, len(self), nrowsinbuf):
3456 end_row = min(start_row + nrowsinbuf, max_row)
3457 buf_slice = buf[0:end_row - start_row]
3458 table.read(start_row, end_row, 1, field=self.pathname,
3459 out=buf_slice)
3460 yield from buf_slice
3462 def __setitem__(self, key, value):
3463 """Set a row or a range of rows in a column.
3465 If the key argument is an integer, the corresponding element is set to
3466 value. If key is a slice, the range of elements determined by it is
3467 set to value.
3469 Examples
3470 --------
3472 ::
3474 # Modify row 1
3475 table.cols.col1[1] = -1
3477 # Modify rows 1 and 3
3478 table.cols.col1[1::2] = [2,3]
3480 Which is equivalent to::
3482 # Modify row 1
3483 table.modify_columns(start=1, columns=[[-1]], names=['col1'])
3485 # Modify rows 1 and 3
3486 columns = numpy.rec.fromarrays([[2,3]], formats='i4')
3487 table.modify_columns(start=1, step=2, columns=columns,
3488 names=['col1'])
3490 """
3492 table = self.table
3493 table._v_file._check_writable()
3495 # Generalized key support not there yet, but at least allow
3496 # for a tuple with a single element (the main dimension).
3497 # (key,) --> key
3498 if isinstance(key, tuple) and len(key) == 1:
3499 key = key[0]
3501 if is_idx(key):
3502 key = operator.index(key)
3504 # Index out of range protection
3505 if key >= table.nrows:
3506 raise IndexError("Index out of range")
3507 if key < 0:
3508 # To support negative values
3509 key += table.nrows
3510 return table.modify_column(key, key + 1, 1,
3511 [[value]], self.pathname)
3512 elif isinstance(key, slice):
3513 (start, stop, step) = table._process_range(
3514 key.start, key.stop, key.step)
3515 return table.modify_column(start, stop, step,
3516 value, self.pathname)
3517 else:
3518 raise TypeError(f"invalid index or slice: {key!r}")
3520 def create_index(self, optlevel=6, kind="medium", filters=None,
3521 tmp_dir=None, _blocksizes=None, _testmode=False,
3522 _verbose=False):
3523 """Create an index for this column.
3525 .. warning::
3527 In some situations it is useful to get a completely sorted
3528 index (CSI). For those cases, it is best to use the
3529 :meth:`Column.create_csindex` method instead.
3531 Parameters
3532 ----------
3533 optlevel : int
3534 The optimization level for building the index. Levels range
3535 from 0 (no optimization) to 9 (maximum optimization). Higher
3536 optimization levels mean a better chance of reducing the entropy
3537 of the index, at the price of more CPU, memory and I/O
3538 resources for creating the index.
3539 kind : str
3540 The kind of index to build. It can be 'ultralight',
3541 'light', 'medium' or 'full'. Lighter kinds ('ultralight'
3542 and 'light') mean that the index takes less space on disk, but
3543 queries run slower. Heavier kinds ('medium' and 'full') mean a
3544 better chance of reducing the entropy of the index (increasing
3545 query speed) at the price of more disk space as well as
3546 more CPU, memory and I/O resources for creating the index.
3548 Note that selecting a full kind with an optlevel of 9 (the maximum)
3549 guarantees the creation of an index with zero entropy, that is, a
3550 completely sorted index (CSI) - provided that the number of rows in
3551 the table does not exceed 2**48 (about 280 trillion rows). See the
3552 :meth:`Column.create_csindex` method for a more direct way to
3553 create a CSI index.
3554 filters : Filters
3555 Specify the Filters instance used to compress the index. If None,
3556 default index filters will be used (currently, zlib level 1 with
3557 shuffling).
3558 tmp_dir
3559 When kind is other than 'ultralight', a temporary file is created
3560 during the index build process. You can use the tmp_dir argument
3561 to specify the directory for this temporary file. The default is
3562 to create it in the same directory as the file containing the
3563 original table.
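Examples
--------

A minimal usage sketch; the table and column names are illustrative::

    col = table.cols.energy
    indexedrows = col.create_index(optlevel=6, kind='medium')
    assert col.is_indexed
    # Queries issued through Table.where() may now use the index:
    hot = [r['name'] for r in table.where('energy > 100')]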
3565 """
3567 kinds = ['ultralight', 'light', 'medium', 'full']
3568 if kind not in kinds:
3569 raise ValueError(f"kind must be one of {kinds}")
3570 if (not isinstance(optlevel, int) or
3571 (optlevel < 0 or optlevel > 9)):
3572 raise ValueError("Optimization level must be an integer in the "
3573 "range 0-9")
3574 if filters is None:
3575 filters = default_index_filters
3576 if tmp_dir is None:
3577 tmp_dir = str(Path(self._table_file.filename).parent)
3578 else:
3579 if not Path(tmp_dir).is_dir():
3580 raise ValueError(
3581 f"Temporary directory '{tmp_dir}' does not exist"
3582 )
3583 if (_blocksizes is not None and
3584 (not isinstance(_blocksizes, tuple) or len(_blocksizes) != 4)):
3585 raise ValueError("_blocksizes must be a tuple with exactly 4 "
3586 "elements")
3587 idxrows = _column__create_index(self, optlevel, kind, filters,
3588 tmp_dir, _blocksizes, _verbose)
3589 return SizeType(idxrows)
3591 def create_csindex(self, filters=None, tmp_dir=None,
3592 _blocksizes=None, _testmode=False, _verbose=False):
3593 """Create a completely sorted index (CSI) for this column.
3595 This method guarantees the creation of an index with zero entropy,
3596 that is, a completely sorted index (CSI) -- provided that the number
3597 of rows in the table does not exceed 2**48 (about 280 trillion
3598 rows). A CSI index is needed by some table methods (like
3599 :meth:`Table.itersorted` or :meth:`Table.read_sorted`) in order to
3600 ensure completely sorted results.
3602 For the meaning of the filters and tmp_dir arguments, see
3603 :meth:`Column.create_index`.
3605 Notes
3606 -----
3607 This method is equivalent to
3608 Column.create_index(optlevel=9, kind='full', ...).
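Examples
--------

A brief sketch; the column name is illustrative::

    table.cols.timestamp.create_csindex()
    # A CSI ensures completely sorted results from read_sorted():
    ordered = table.read_sorted('timestamp')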
3610 """
3612 return self.create_index(
3613 kind='full', optlevel=9, filters=filters, tmp_dir=tmp_dir,
3614 _blocksizes=_blocksizes, _testmode=_testmode, _verbose=_verbose)
3616 def _do_reindex(self, dirty):
3617 """Common code for the reindex() and reindex_dirty() methods."""
3619 index = self.index
3620 dodirty = True
3621 if dirty and (index is None or not index.dirty):
3622 dodirty = False
3623 if index is not None and dodirty:
3624 self._table_file._check_writable()
3625 # Get the old index parameters
3626 kind = index.kind
3627 optlevel = index.optlevel
3628 filters = index.filters
3629 # We *need* to tell the index that it is going to be undirty.
3630 # This is needed here so as to unnail() the condition cache.
3631 index.dirty = False
3632 # Delete the existing Index
3633 index._f_remove()
3634 # Create a new Index with the previous parameters
3635 return SizeType(self.create_index(
3636 kind=kind, optlevel=optlevel, filters=filters))
3637 else:
3638 return SizeType(0)  # Not indexed, or the index was already clean
3640 def reindex(self):
3641 """Recompute the index associated with this column.
3643 This can be useful when you suspect that, for any reason,
3644 the index information is no longer valid and you want to rebuild it.
3646 This method does nothing if the column is not indexed.
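Examples
--------

For instance, to rebuild a possibly stale index (the column name is
illustrative)::

    table.cols.energy.reindex()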
3648 """
3650 self._do_reindex(dirty=False)
3652 def reindex_dirty(self):
3653 """Recompute the associated index only if it is dirty.
3655 This can be useful when you have set :attr:`Table.autoindex` to false
3656 for the table and you want to update the column's index after an
3657 invalidating index operation (like :meth:`Table.remove_rows`).
3659 This method does nothing if the column is not indexed.
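Examples
--------

A sketch of the intended workflow; the column name is illustrative::

    table.autoindex = False            # defer automatic reindexing
    table.remove_rows(0, 10)           # leaves existing indexes dirty
    table.cols.energy.reindex_dirty()  # rebuild only if actually dirty
    table.autoindex = True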
3661 """
3663 self._do_reindex(dirty=True)
3665 def remove_index(self):
3666 """Remove the index associated with this column.
3668 This method does nothing if the column is not indexed. The removed
3669 index can be created again by calling the :meth:`Column.create_index`
3670 method.
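Examples
--------

A brief sketch; the column name is illustrative::

    col = table.cols.energy
    col.remove_index()
    assert not col.is_indexed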
3672 """
3674 self._table_file._check_writable()
3676 # Remove the index if it exists.
3677 if self.is_indexed:
3678 index = self.index
3679 index._f_remove()
3680 self.table._set_column_indexing(self.pathname, False)
3682 def close(self):
3683 """Close this column."""
3685 self.__dict__.clear()
3687 def __str__(self):
3688 """The string representation for this object."""
3690 return (f"{self._table_path}.cols.{self.pathname.replace('/', '.')} "
3691 f"({self.__class__.__name__}{self.shape}, "
3692 f"{self.descr._v_types[self.name]}, idx={self.index})")
3694 def __repr__(self):
3695 """A detailed string representation for this object."""
3697 return str(self)