1"""Here is defined the Index class."""
3import math
4import operator
5import os
6import sys
7import tempfile
8import warnings
10from pathlib import Path
11from time import perf_counter as clock
12from time import process_time as cpuclock
14import numpy as np
16from .idxutils import (calc_chunksize, calcoptlevels,
17 get_reduction_level, nextafter, inftype)
19from . import indexesextension
20from .node import NotLoggedMixin
21from .atom import UIntAtom, Atom
22from .earray import EArray
23from .carray import CArray
24from .leaf import Filters
25from .indexes import CacheArray, LastRowArray, IndexArray
26from .group import Group
27from .path import join_path
28from .exceptions import PerformanceWarning
29from .utils import is_idx, idx2long, lazyattr
30from .utilsextension import (nan_aware_gt, nan_aware_ge,
31 nan_aware_lt, nan_aware_le,
32 bisect_left, bisect_right)
33from .lrucacheextension import ObjectCache
35# default version for INDEX objects
36# obversion = "1.0" # Version of indexes in PyTables 1.x series
37# obversion = "2.0" # Version of indexes in PyTables Pro 2.0 series
38obversion = "2.1" # Version of indexes in PyTables Pro 2.1 and up series,
39# # including the join 2.3 Std + Pro version
41debug = False
42# debug = True # Uncomment this for printing sizes purposes
43profile = False
44# profile = True # Uncomment for profiling
45if profile:
46 from .utils import show_stats
48# The default method for sorting
49# defsort = "quicksort"
50# Changing to mergesort to fix #441
51defsort = "mergesort"
53# Default policy for automatically updating indexes after a table
54# append operation, or automatically reindexing after an
55# index-invalidating operation like removing or modifying table rows.
56default_auto_index = True
57# Keep in sync with ``Table.autoindex`` docstring.
59# Default filters used to compress indexes. This is quite fast and
60# compression is pretty good.
61# Remember to keep these defaults in sync with the docstrings and UG.
62default_index_filters = Filters(complevel=1, complib='zlib',
63 shuffle=True, fletcher32=False)
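
# An illustrative sketch (not part of the original module): these defaults
# can be overridden per index by passing a custom Filters instance through
# the public ``Column.create_index()`` API, e.g.:
#
#   from tables import Filters
#   table.cols.pressure.create_index(
#       filters=Filters(complevel=5, complib='blosc', shuffle=True))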

# Deprecated API
defaultAutoIndex = default_auto_index
defaultIndexFilters = default_index_filters

# The list of types for which an optimised search in cython and C has
# been implemented.  Always add here the name of a new optimised type.
opt_search_types = ("int8", "int16", "int32", "int64",
                    "uint8", "uint16", "uint32", "uint64",
                    "float32", "float64")

# The upper limit for uint32 ints
max32 = 2**32


def _table_column_pathname_of_index(indexpathname):
    names = indexpathname.split("/")
    for i, name in enumerate(names):
        if name.startswith('_i_'):
            break
    tablepathname = "/".join(names[:i]) + "/" + name[3:]
    colpathname = "/".join(names[i + 1:])
    return (tablepathname, colpathname)
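
# A minimal worked example with a hypothetical pathname (index groups live
# under a hidden '_i_<tablename>' group next to their table):
#
#   >>> _table_column_pathname_of_index('/group/_i_mytable/mycol')
#   ('/group/mytable', 'mycol')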


class Index(NotLoggedMixin, Group, indexesextension.Index):
    """Represents the index of a column in a table.

    This class is used to keep the indexing information for columns in a
    Table dataset (see :ref:`TableClassDescr`).  It is actually a descendant
    of the Group class (see :ref:`GroupClassDescr`), with some added
    functionality.  An Index is always associated with one and only one
    column in the table.

    .. note::

        This class is mainly intended for internal use, but some of its
        documented attributes and methods may be interesting for the
        programmer.

    Parameters
    ----------
    parentnode
        The parent :class:`Group` object.

        .. versionchanged:: 3.0
           Renamed from *parentNode* to *parentnode*.

    name : str
        The name of this node in its parent group.
    atom : Atom
        An Atom object representing the shape and type of the atomic objects
        to be saved.  Only scalar atoms are supported.
    title
        Sets a TITLE attribute of the Index entity.
    kind
        The desired kind for this index.  The 'full' kind specifies a
        complete track of the row position (64-bit), while the 'medium',
        'light' or 'ultralight' kinds only specify in which chunk the row is
        (using 32-bit, 16-bit and 8-bit respectively).
    optlevel
        The desired optimization level for this index.
    filters : Filters
        An instance of the Filters class that provides information about the
        desired I/O filters to be applied during the life of this object.
    tmp_dir
        The directory for the temporary files.
    expectedrows
        A user estimate of the number of row slices that will be added to
        the growable dimension in the IndexArray object.
    byteorder
        The byteorder of the index datasets *on-disk*.
    blocksizes
        The four main sizes of the compound blocks in index datasets (a low
        level parameter).

    """

    _c_classid = 'INDEX'

    @property
    def kind(self):
        """The kind of this index."""
        return {1: 'ultralight', 2: 'light',
                4: 'medium', 8: 'full'}[self.indsize]
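
    # Illustrative note (mirroring the mapping used in __init__ below): the
    # kind is just the inverse of the index itemsize, so an index created
    # with kind='full' stores 8-byte row references and reports
    # ``index.kind == 'full'``.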

    @property
    def filters(self):
        """Filter properties for this index - see Filters in
        :ref:`FiltersClassDescr`."""
        return self._v_filters

    @property
    def dirty(self):
        """Whether the index is dirty or not.

        Dirty indexes are out of sync with column data, so they exist but
        they are not usable.

        """

        # If there is no ``DIRTY`` attribute, index should be clean.
        return getattr(self._v_attrs, 'DIRTY', False)

    @dirty.setter
    def dirty(self, dirty):
        wasdirty, isdirty = self.dirty, bool(dirty)
        self._v_attrs.DIRTY = dirty
        # If an *actual* change in dirtiness happens,
        # notify the condition cache by setting or removing a nail.
        conditioncache = self.table._condition_cache
        if not wasdirty and isdirty:
            conditioncache.nail()
        if wasdirty and not isdirty:
            conditioncache.unnail()

    @property
    def column(self):
        """The Column (see :ref:`ColumnClassDescr`) instance for the indexed
        column."""

        tablepath, columnpath = _table_column_pathname_of_index(
            self._v_pathname)
        table = self._v_file._get_node(tablepath)
        column = table.cols._g_col(columnpath)
        return column

    @property
    def table(self):
        """Accessor for the `Table` object of this index."""
        tablepath, columnpath = _table_column_pathname_of_index(
            self._v_pathname)
        table = self._v_file._get_node(tablepath)
        return table

    @property
    def nblockssuperblock(self):
        """The number of blocks in a superblock."""
        return self.superblocksize // self.blocksize

    @property
    def nslicesblock(self):
        """The number of slices in a block."""
        return self.blocksize // self.slicesize

    @property
    def nchunkslice(self):
        """The number of chunks in a slice."""
        return self.slicesize // self.chunksize

    @property
    def nsuperblocks(self):
        """The total number of superblocks in index."""
        # Last row should not be considered as a superblock
        nelements = self.nelements - self.nelementsILR
        nblocks = nelements // self.superblocksize
        # Account for a possible partial superblock at the end
        if nelements % self.superblocksize > 0:
            nblocks += 1
        return nblocks

    @property
    def nblocks(self):
        """The total number of blocks in index."""
        # Last row should not be considered as a block
        nelements = self.nelements - self.nelementsILR
        nblocks = nelements // self.blocksize
        if nelements % self.blocksize > 0:
            nblocks += 1
        return nblocks

    @property
    def nslices(self):
        """The number of complete slices in index."""
        return self.nelements // self.slicesize

    @property
    def nchunks(self):
        """The number of complete chunks in index."""
        return self.nelements // self.chunksize

    @property
    def shape(self):
        """The shape of this index (in slices and elements)."""
        return (self.nrows, self.slicesize)

    @property
    def temp_required(self):
        """Whether a temporary file for indexes is required or not."""
        return (self.indsize > 1 and
                self.optlevel > 0 and
                self.table.nrows > self.slicesize)

    @property
    def want_complete_sort(self):
        """Whether we should try to build a completely sorted index or not."""
        return self.indsize == 8 and self.optlevel == 9

    @property
    def is_csi(self):
        """Whether the index is completely sorted or not.

        .. versionchanged:: 3.0
           The *is_CSI* property has been renamed into *is_csi*.

        """

        if self.nelements == 0:
            # An index with 0 indexed elements is not a CSI one (by definition)
            return False
        if self.indsize < 8:
            # An index that is not full cannot be completely sorted
            return False
        # Try with the 'is_csi' attribute
        if 'is_csi' in self._v_attrs:
            return self._v_attrs.is_csi
        # If not, then compute the overlaps manually
        # (the attribute 'is_csi' will be set there)
        self.compute_overlaps(self, None, False)
        return self.noverlaps == 0
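
    # Sketch of how this is typically exercised from the public API (names
    # taken from the user-level PyTables interface, shown for illustration
    # only):
    #
    #   col = h5file.root.mytable.cols.energy
    #   col.create_csindex()        # equivalent to kind='full', optlevel=9
    #   assert col.index.is_csi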

    @lazyattr
    def nrowsinchunk(self):
        """The number of rows that fit in a *table* chunk."""

        return self.table.chunkshape[0]

    @lazyattr
    def lbucket(self):
        """Return the length of a bucket for this index type."""

        # Avoid setting a too large lbucket size (mainly useful for tests)
        lbucket = min(self.nrowsinchunk, self.chunksize)
        if self.indsize == 1:
            # For ultra-light, we will never have to keep track of a
            # bucket outside of a slice.
            maxnb = 2**8
            if self.slicesize > maxnb * lbucket:
                lbucket = math.ceil(self.slicesize / maxnb)
        elif self.indsize == 2:
            # For light, we will never have to keep track of a
            # bucket outside of a block.
            maxnb = 2**16
            if self.blocksize > maxnb * lbucket:
                lbucket = math.ceil(self.blocksize / maxnb)
        else:
            # For medium and full indexes there should not be a need to
            # increase lbucket
            pass
        return lbucket
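
    # A worked example with illustrative numbers: for an ultra-light index
    # (indsize == 1, so at most maxnb == 256 distinct bucket values per
    # slice), a slicesize of 2**20 with an initial lbucket of 1024 gives
    # 256 * 1024 < 2**20, so lbucket is enlarged to
    # ceil(2**20 / 256) == 4096 rows per bucket.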

    def __init__(self, parentnode, name,
                 atom=None, title="",
                 kind=None,
                 optlevel=None,
                 filters=None,
                 tmp_dir=None,
                 expectedrows=0,
                 byteorder=None,
                 blocksizes=None,
                 new=True):

        self._v_version = None
        """The object version of this index."""
        self.optlevel = optlevel
        """The optimization level for this index."""
        self.tmp_dir = tmp_dir
        """The directory for the temporary files."""
        self.expectedrows = expectedrows
        """The expected number of items of index arrays."""
        if byteorder in ["little", "big"]:
            self.byteorder = byteorder
        else:
            self.byteorder = sys.byteorder
        """The byteorder of the index datasets."""
        if atom is not None:
            self.dtype = atom.dtype.base
            self.type = atom.type
            """The datatypes to be stored by the sorted index array."""
            # ############## Important note ###########################
            # The datatypes saved as index values are NumPy native
            # types, so we get rid of type metainfo like Time* or Enum*
            # that belongs to HDF5 types (actually, this metainfo is
            # not needed for sorting and looking-up purposes).
            # #########################################################
            indsize = {
                'ultralight': 1, 'light': 2, 'medium': 4, 'full': 8}[kind]
            assert indsize in (1, 2, 4, 8), "indsize should be 1, 2, 4 or 8!"
            self.indsize = indsize
            """The itemsize for the indices part of the index."""

        self.nrows = None
        """The total number of slices in the index."""
        self.nelements = None
        """The number of currently indexed rows for this column."""
        self.blocksizes = blocksizes
        """The four main sizes of the compound blocks (if specified)."""
        self.dirtycache = True
        """Dirty cache (for ranges, bounds & sorted) flag."""
        self.superblocksize = None
        """Size of the superblock for this index."""
        self.blocksize = None
        """Size of the block for this index."""
        self.slicesize = None
        """Size of the slice for this index."""
        self.chunksize = None
        """Size of the chunk for this index."""
        self.tmpfilename = None
        """Filename for temporary bounds."""
        self.opt_search_types = opt_search_types
        """The types for which an optimized search has been implemented."""
        self.noverlaps = -1
        """The number of overlaps in an index.  0 means a completely
        sorted index.  -1 means that this number is not computed yet."""
        self.tprof = 0
        """Time counter for benchmarking purposes."""

        from .file import open_file
        self._openFile = open_file
        """The `open_file()` function, to avoid a circular import."""

        super().__init__(parentnode, name, title, new, filters)

    def _g_post_init_hook(self):
        if self._v_new:
            # The version for newly created indexes
            self._v_version = obversion
        super()._g_post_init_hook()

        # Index arrays must only be created for new indexes
        if not self._v_new:
            idxversion = self._v_version
            # Set-up some variables from info on disk and return
            attrs = self._v_attrs
            # Coerce NumPy scalars to Python scalars in order
            # to avoid undesired upcasting operations.
            self.superblocksize = int(attrs.superblocksize)
            self.blocksize = int(attrs.blocksize)
            self.slicesize = int(attrs.slicesize)
            self.chunksize = int(attrs.chunksize)
            self.blocksizes = (self.superblocksize, self.blocksize,
                               self.slicesize, self.chunksize)
            self.optlevel = int(attrs.optlevel)
            sorted = self.sorted
            indices = self.indices
            self.dtype = sorted.atom.dtype
            self.type = sorted.atom.type
            self.indsize = indices.atom.itemsize
            # Some sanity checks for slicesize, chunksize and indsize
            assert self.slicesize == indices.shape[1], "Wrong slicesize"
            assert self.chunksize == indices._v_chunkshape[1], \
                "Wrong chunksize"
            assert self.indsize in (1, 2, 4, 8), "Wrong indices itemsize"
            if idxversion > "2.0":
                self.reduction = int(attrs.reduction)
                nelementsSLR = int(self.sortedLR.attrs.nelements)
                nelementsILR = int(self.indicesLR.attrs.nelements)
            else:
                self.reduction = 1
                nelementsILR = self.indicesLR[-1]
                nelementsSLR = nelementsILR
            self.nrows = sorted.nrows
            self.nelements = self.nrows * self.slicesize + nelementsILR
            self.nelementsSLR = nelementsSLR
            self.nelementsILR = nelementsILR
            if nelementsILR > 0:
                self.nrows += 1
            # Get the bounds as a cache (this has to remain here!)
            rchunksize = self.chunksize // self.reduction
            nboundsLR = (nelementsSLR - 1) // rchunksize
            if nboundsLR < 0:
                nboundsLR = 0  # correction for -1 bounds
            nboundsLR += 2  # bounds + begin + end
            # All bounds values (+begin + end) are at the end of sortedLR
            self.bebounds = self.sortedLR[
                nelementsSLR:nelementsSLR + nboundsLR]
            return

        # The index is new.  Initialize the values
        self.nrows = 0
        self.nelements = 0
        self.nelementsSLR = 0
        self.nelementsILR = 0

        # The atom
        atom = Atom.from_dtype(self.dtype)

        # The filters
        filters = self.filters

        # Compute the superblocksize, blocksize, slicesize and chunksize
        # values (in case these parameters haven't been passed to the
        # constructor)
        if self.blocksizes is None:
            self.blocksizes = calc_chunksize(
                self.expectedrows, self.optlevel, self.indsize, node=self)
        (self.superblocksize, self.blocksize,
         self.slicesize, self.chunksize) = self.blocksizes
        if debug:
            print("blocksizes:", self.blocksizes)
        # Compute the reduction level
        self.reduction = get_reduction_level(
            self.indsize, self.optlevel, self.slicesize, self.chunksize)
        rchunksize = self.chunksize // self.reduction
        rslicesize = self.slicesize // self.reduction

        # Save them on disk as attributes
        self._v_attrs.superblocksize = np.uint64(self.superblocksize)
        self._v_attrs.blocksize = np.uint64(self.blocksize)
        self._v_attrs.slicesize = np.uint32(self.slicesize)
        self._v_attrs.chunksize = np.uint32(self.chunksize)
        # Save the optlevel as well
        self._v_attrs.optlevel = self.optlevel
        # Save the reduction level
        self._v_attrs.reduction = self.reduction

        # Create the IndexArray for sorted values
        sorted = IndexArray(self, 'sorted', atom, "Sorted Values",
                            filters, self.byteorder)

        # Create the IndexArray for index values
        IndexArray(self, 'indices', UIntAtom(itemsize=self.indsize),
                   "Number of chunk in table", filters, self.byteorder)

        # Create the cache for range values (1st order cache)
        CacheArray(self, 'ranges', atom, (0, 2), "Range Values", filters,
                   self.expectedrows // self.slicesize,
                   byteorder=self.byteorder)
        # median ranges
        EArray(self, 'mranges', atom, (0,), "Median ranges", filters,
               byteorder=self.byteorder, _log=False)

        # Create the cache for boundary values (2nd order cache)
        nbounds_inslice = (rslicesize - 1) // rchunksize
        CacheArray(self, 'bounds', atom, (0, nbounds_inslice),
                   "Boundary Values", filters, self.nchunks,
                   (1, nbounds_inslice), byteorder=self.byteorder)

        # begin, end & median bounds (only for numerical types)
        EArray(self, 'abounds', atom, (0,), "Start bounds", filters,
               byteorder=self.byteorder, _log=False)
        EArray(self, 'zbounds', atom, (0,), "End bounds", filters,
               byteorder=self.byteorder, _log=False)
        EArray(self, 'mbounds', atom, (0,), "Median bounds", filters,
               byteorder=self.byteorder, _log=False)

        # Create the Array for last (sorted) row values + bounds
        shape = (rslicesize + 2 + nbounds_inslice,)
        sortedLR = LastRowArray(self, 'sortedLR', atom, shape,
                                "Last Row sorted values + bounds",
                                filters, (rchunksize,),
                                byteorder=self.byteorder)

        # Create the Array for the number of chunks in the last row
        shape = (self.slicesize,)  # enough for indexes and length
        indicesLR = LastRowArray(self, 'indicesLR',
                                 UIntAtom(itemsize=self.indsize),
                                 shape, "Last Row indices",
                                 filters, (self.chunksize,),
                                 byteorder=self.byteorder)

        # The number of elements in LR will be initialized here
        sortedLR.attrs.nelements = 0
        indicesLR.attrs.nelements = 0

        # All bounds values (+begin + end) are uninitialized at creation time
        self.bebounds = None

        # The starts and lengths initialization
        self.starts = np.empty(shape=self.nrows, dtype=np.int32)
        """Where the values fulfilling conditions start for every slice."""
        self.lengths = np.empty(shape=self.nrows, dtype=np.int32)
        """Lengths of the values fulfilling conditions for every slice."""

        # Finally, create a temporary file for indexes if needed
        if self.temp_required:
            self.create_temp()

    def initial_append(self, xarr, nrow, reduction):
        """Compute the initial indices arrays for data to be indexed."""

        if profile:
            tref = clock()
        if profile:
            show_stats("Entering initial_append", tref)
        arr = xarr.pop()
        indsize = self.indsize
        slicesize = self.slicesize
        nelementsILR = self.nelementsILR
        if profile:
            show_stats("Before creating idx", tref)
        if indsize == 8:
            idx = np.arange(0, len(arr), dtype="uint64") + nrow * slicesize
        elif indsize == 4:
            # For medium (32-bit) all the rows in tables should be
            # directly reachable.  But as len(arr) < 2**31, we can
            # choose uint32 for representing indices.  In this way, we
            # consume far less memory during the keysort process.  The
            # offset will be added in self.final_idx32() later on.
            #
            # This optimization also prevents the values in LR from
            # participating in the ``swap_chunks`` process, and this is
            # the main reason to not allow the medium indexes to create
            # completely sorted indexes.  However, I don't find this to
            # be a big limitation, as probably full indexes are much
            # more suitable for producing completely sorted indexes
            # because in this case the indices part is usable for
            # getting the reverse indices of the index, and I foresee
            # this to be a common requirement in many operations (for
            # example, in table sorts).
            #
            # F. Alted 2008-09-15
            idx = np.arange(0, len(arr), dtype="uint32")
        else:
            idx = np.empty(len(arr), "uint%d" % (indsize * 8))
            lbucket = self.lbucket
            # Fill the idx with the bucket indices
            offset = int(lbucket - ((nrow * (slicesize % lbucket)) % lbucket))
            idx[0:offset] = 0
            for i in range(offset, slicesize, lbucket):
                idx[i:i + lbucket] = (i + lbucket - 1) // lbucket
            if indsize == 2:
                # Add a second offset in this case
                # First normalize the number of rows
                offset2 = (nrow % self.nslicesblock) * slicesize // lbucket
                idx += offset2
        # Add the last row at the beginning of arr & idx (if needed)
        if (indsize == 8 and nelementsILR > 0):
            # It is possible that the values in LR are already sorted.
            # Fetch them and override existing values in arr and idx.
            assert len(arr) > nelementsILR
            self.read_slice_lr(self.sortedLR, arr[:nelementsILR])
            self.read_slice_lr(self.indicesLR, idx[:nelementsILR])
        # In-place sorting
        if profile:
            show_stats("Before keysort", tref)
        indexesextension.keysort(arr, idx)
        larr = arr[-1]
        if reduction > 1:
            # It's important to do a copy() here in order to ensure that
            # sorted._append() will receive a contiguous array.
            if profile:
                show_stats("Before reduction", tref)
            reduc = arr[::reduction].copy()
            if profile:
                show_stats("After reduction", tref)
            arr = reduc
            if profile:
                show_stats("After arr <-- reduc", tref)
        # A completely sorted index is no longer possible after an
        # append to an index that already has one slice.
        if nrow > 0:
            self._v_attrs.is_csi = False
        if profile:
            show_stats("Exiting initial_append", tref)
        return larr, arr, idx
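
    # A worked example of the bucket-filling branch above, with illustrative
    # numbers: for an ultra-light/light index with lbucket == 4,
    # slicesize == 12 and nrow == 0, the offset is 4, so idx becomes
    # [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]: each entry records the bucket
    # (chunk group) its row belongs to rather than the exact row number.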

    def final_idx32(self, idx, offset):
        """Perform final operations on 32-bit indices."""

        if profile:
            tref = clock()
        if profile:
            show_stats("Entering final_idx32", tref)
        # Do an upcast first in order to add the offset.
        idx = idx.astype('uint64')
        idx += offset
        # The next partition is valid up to table sizes of
        # 2**30 * 2**18 = 2**48 bytes, that is, 256 Tera-elements,
        # which should be a safe figure, at least for a while.
        idx //= self.lbucket
        # After the division, we can downsize the indexes to 'uint32'
        idx = idx.astype('uint32')
        if profile:
            show_stats("Exiting final_idx32", tref)
        return idx
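
    # Worked arithmetic with illustrative numbers: a local index value of 7
    # in slice 3 with slicesize == 1000 and lbucket == 10 becomes
    # (7 + 3 * 1000) // 10 == 300 after the upcast/offset/division steps,
    # which safely fits back into uint32.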

    def append(self, xarr, update=False):
        """Append the array to the index objects."""

        if profile:
            tref = clock()
        if profile:
            show_stats("Entering append", tref)
        if not update and self.temp_required:
            where = self.tmp
            # The reduction will take place *after* the optimization process
            reduction = 1
        else:
            where = self
            reduction = self.reduction
        sorted = where.sorted
        indices = where.indices
        ranges = where.ranges
        mranges = where.mranges
        bounds = where.bounds
        mbounds = where.mbounds
        abounds = where.abounds
        zbounds = where.zbounds
        sortedLR = where.sortedLR
        indicesLR = where.indicesLR
        nrows = sorted.nrows  # before sorted.append()
        larr, arr, idx = self.initial_append(xarr, nrows, reduction)
        # Save the sorted array
        sorted.append(arr.reshape(1, arr.size))
        cs = self.chunksize // reduction
        ncs = self.nchunkslice
        # Save ranges & bounds
        ranges.append([[arr[0], larr]])
        bounds.append([arr[cs::cs]])
        abounds.append(arr[0::cs])
        zbounds.append(arr[cs - 1::cs])
        # Compute the medians
        smedian = arr[cs // 2::cs]
        mbounds.append(smedian)
        mranges.append([smedian[ncs // 2]])
        if profile:
            show_stats("Before deleting arr & smedian", tref)
        del arr, smedian  # delete references
        if profile:
            show_stats("After deleting arr & smedian", tref)
        # Now that arr is gone, we can upcast the indices and add the offset
        if self.indsize == 4:
            idx = self.final_idx32(idx, nrows * self.slicesize)
        indices.append(idx.reshape(1, idx.size))
        if profile:
            show_stats("Before deleting idx", tref)
        del idx
        # Update counters after a successful append
        self.nrows = nrows + 1
        self.nelements = self.nrows * self.slicesize
        self.nelementsSLR = 0  # reset the counter of the last row index to 0
        self.nelementsILR = 0  # reset the counter of the last row index to 0
        # The number of elements will be saved as an attribute.
        # This is necessary so that the LR arrays can remember their values
        # after a possible node preemption/reload.
        sortedLR.attrs.nelements = self.nelementsSLR
        indicesLR.attrs.nelements = self.nelementsILR
        self.dirtycache = True  # the cache is dirty now
        if profile:
            show_stats("Exiting append", tref)

    def append_last_row(self, xarr, update=False):
        """Append the array to the last row index objects."""

        if profile:
            tref = clock()
        if profile:
            show_stats("Entering appendLR", tref)
        # compute the elements in the last row sorted & bounds array
        nrows = self.nslices
        if not update and self.temp_required:
            where = self.tmp
            # The reduction will take place *after* the optimization process
            reduction = 1
        else:
            where = self
            reduction = self.reduction
        indicesLR = where.indicesLR
        sortedLR = where.sortedLR
        larr, arr, idx = self.initial_append(xarr, nrows, reduction)
        nelementsSLR = len(arr)
        nelementsILR = len(idx)
        # Build the cache of bounds
        rchunksize = self.chunksize // reduction
        self.bebounds = np.concatenate((arr[::rchunksize], [larr]))
        # The number of elements will be saved as an attribute
        sortedLR.attrs.nelements = nelementsSLR
        indicesLR.attrs.nelements = nelementsILR
        # Save the number of elements, bounds and sorted values
        # at the end of the sorted array
        offset2 = len(self.bebounds)
        sortedLR[nelementsSLR:nelementsSLR + offset2] = self.bebounds
        sortedLR[:nelementsSLR] = arr
        del arr
        # Now that arr is gone, we can upcast the indices and add the offset
        if self.indsize == 4:
            idx = self.final_idx32(idx, nrows * self.slicesize)
        # Save the reverse index array
        indicesLR[:len(idx)] = idx
        del idx
        # Update counters after a successful append
        self.nrows = nrows + 1
        self.nelements = nrows * self.slicesize + nelementsILR
        self.nelementsILR = nelementsILR
        self.nelementsSLR = nelementsSLR
        self.dirtycache = True  # the cache is dirty now
        if profile:
            show_stats("Exiting appendLR", tref)

    def optimize(self, verbose=False):
        """Optimize an index so as to allow faster searches.

        verbose
            If True, messages about the progress of the
            optimization process are printed out.

        """

        if not self.temp_required:
            return

        if verbose:
            self.verbose = True
        else:
            self.verbose = debug

        # Initialize last_tover and last_nover
        self.last_tover = 0
        self.last_nover = 0

        # Compute the correct optimizations for current optim level
        opts = calcoptlevels(self.nblocks, self.optlevel, self.indsize)
        optmedian, optstarts, optstops, optfull = opts

        if debug:
            print("optvalues:", opts)

        self.create_temp2()
        # Start the optimization process
        while True:
            if optfull:
                for niter in range(optfull):
                    if self.swap('chunks', 'median'):
                        break
                    if self.nblocks > 1:
                        # Swap slices only in the case that we have
                        # several blocks
                        if self.swap('slices', 'median'):
                            break
                        if self.swap('chunks', 'median'):
                            break
                    if self.swap('chunks', 'start'):
                        break
                    if self.swap('chunks', 'stop'):
                        break
            else:
                if optmedian:
                    if self.swap('chunks', 'median'):
                        break
                if optstarts:
                    if self.swap('chunks', 'start'):
                        break
                if optstops:
                    if self.swap('chunks', 'stop'):
                        break
            break  # If we reach this, exit the loop

        # Check if we require a complete sort.  Important: this step
        # should be carried out *after* the optimization process has
        # been completed (this is to guarantee that the complete sort
        # does not take too much memory).
        if self.want_complete_sort:
            if self.noverlaps > 0:
                self.do_complete_sort()
            # Check that we have effectively achieved the complete sort
            if self.noverlaps > 0:
                warnings.warn(
                    "OPSI was not able to achieve a completely sorted index."
                    " Please report this to the authors.", UserWarning)

        # Close and delete the temporary optimization index file
        self.cleanup_temp()
        return

    def do_complete_sort(self):
        """Bring an already optimized index into a completely sorted state."""

        if self.verbose:
            t1 = clock()
            c1 = cpuclock()
        ss = self.slicesize
        tmp = self.tmp
        ranges = tmp.ranges[:]
        nslices = self.nslices

        nelementsLR = self.nelementsILR
        if nelementsLR > 0:
            # Add the ranges corresponding to the last row
            rangeslr = np.array([self.bebounds[0], self.bebounds[-1]])
            ranges = np.concatenate((ranges, [rangeslr]))
            nslices += 1

        sorted = tmp.sorted
        indices = tmp.indices
        sortedLR = tmp.sortedLR
        indicesLR = tmp.indicesLR
        sremain = np.array([], dtype=self.dtype)
        iremain = np.array([], dtype='u%d' % self.indsize)
        starts = np.zeros(shape=nslices, dtype=np.int_)
        for i in range(nslices):
            # Find the overlapping elements for slice i
            sover = np.array([], dtype=self.dtype)
            iover = np.array([], dtype='u%d' % self.indsize)
            prev_end = ranges[i, 1]
            for j in range(i + 1, nslices):
                stj = starts[j]
                if ((j < self.nslices and stj == ss) or
                        (j == self.nslices and stj == nelementsLR)):
                    # This slice has already been dealt with
                    continue
                if j < self.nslices:
                    assert stj < ss, \
                        "Two slices cannot overlap completely at this stage!"
                    next_beg = sorted[j, stj]
                else:
                    assert stj < nelementsLR, \
                        "Two slices cannot overlap completely at this stage!"
                    next_beg = sortedLR[stj]
                next_end = ranges[j, 1]
                if prev_end > next_end:
                    # Complete overlapping case
                    if j < self.nslices:
                        sover = np.concatenate((sover, sorted[j, stj:]))
                        iover = np.concatenate((iover, indices[j, stj:]))
                        starts[j] = ss
                    else:
                        n = nelementsLR
                        sover = np.concatenate((sover, sortedLR[stj:n]))
                        iover = np.concatenate((iover, indicesLR[stj:n]))
                        starts[j] = nelementsLR
                elif prev_end > next_beg:
                    idx = self.search_item_lt(tmp, prev_end, j, ranges[j], stj)
                    if j < self.nslices:
                        sover = np.concatenate((sover, sorted[j, stj:idx]))
                        iover = np.concatenate((iover, indices[j, stj:idx]))
                    else:
                        sover = np.concatenate((sover, sortedLR[stj:idx]))
                        iover = np.concatenate((iover, indicesLR[stj:idx]))
                    starts[j] = idx
            # Build the extended slices to sort out
            if i < self.nslices:
                ssorted = np.concatenate(
                    (sremain, sorted[i, starts[i]:], sover))
                sindices = np.concatenate(
                    (iremain, indices[i, starts[i]:], iover))
            else:
                ssorted = np.concatenate(
                    (sremain, sortedLR[starts[i]:nelementsLR], sover))
                sindices = np.concatenate(
                    (iremain, indicesLR[starts[i]:nelementsLR], iover))
            # Sort the extended slices
            indexesextension.keysort(ssorted, sindices)
            # Save the first elements of extended slices in the slice i
            if i < self.nslices:
                sorted[i] = ssorted[:ss]
                indices[i] = sindices[:ss]
                # Update caches for this slice
                self.update_caches(i, ssorted[:ss])
                # Save the remaining values in a separate array
                send = len(sover) + len(sremain)
                sremain = ssorted[ss:ss + send]
                iremain = sindices[ss:ss + send]
            else:
                # Still some elements remain for the last row
                n = len(ssorted)
                assert n == nelementsLR
                send = 0
                sortedLR[:n] = ssorted
                indicesLR[:n] = sindices
                # Update the caches for last row
                sortedlr = sortedLR[:nelementsLR]
                bebounds = np.concatenate(
                    (sortedlr[::self.chunksize], [sortedlr[-1]]))
                sortedLR[nelementsLR:nelementsLR + len(bebounds)] = bebounds
                self.bebounds = bebounds

        # Verify that we have dealt with all the remaining values
        assert send == 0

        # Compute the overlaps in order to verify that we have achieved
        # a complete sort.  This has to be executed always (and not only
        # in verbose mode!).
        self.compute_overlaps(self.tmp, "do_complete_sort()", self.verbose)
        if self.verbose:
            print(f"time: {clock() - t1:.4f}. clock: {cpuclock() - c1:.4f}")

    def swap(self, what, mode=None):
        """Swap chunks or slices using a certain bounds reference."""

        # Thresholds for avoiding continuing the optimization
        # thnover = 4 * self.slicesize  # minimum number of overlapping
        #                               # elements
        thnover = 40
        thmult = 0.1      # minimum ratio of multiplicity (a 10%)
        thtover = 0.01    # minimum overlapping index for slices (a 1%)

        if self.verbose:
            t1 = clock()
            c1 = cpuclock()
        if what == "chunks":
            self.swap_chunks(mode)
        elif what == "slices":
            self.swap_slices(mode)
        if mode:
            message = f"swap_{what}({mode})"
        else:
            message = f"swap_{what}"
        (nover, mult, tover) = self.compute_overlaps(
            self.tmp, message, self.verbose)
        rmult = len(mult.nonzero()[0]) / len(mult)
        if self.verbose:
            print(f"time: {clock() - t1:.4f}. clock: {cpuclock() - c1:.4f}")
        # Check that entropy is actually decreasing
        if what == "chunks" and self.last_tover > 0 and self.last_nover > 0:
            tover_var = (self.last_tover - tover) / self.last_tover
            nover_var = (self.last_nover - nover) / self.last_nover
            if tover_var < 0.05 and nover_var < 0.05:
                # Less than a 5% improvement is too little
                return True
        self.last_tover = tover
        self.last_nover = nover
        # Check whether some threshold has been met
        if nover < thnover:
            return True
        if rmult < thmult:
            return True
        # Additional check for the overlap ratio
        if 0 <= tover < thtover:
            return True
        return False

    def create_temp(self):
        """Create some temporary objects for slice sorting purposes."""

        # The index will be dirty during the index optimization process
        self.dirty = True
        # Build the name of the temporary file
        fd, self.tmpfilename = tempfile.mkstemp(
            ".tmp", "pytables-", self.tmp_dir)
        # Close the file descriptor so as to avoid leaks
        os.close(fd)
        # Create the proper PyTables file
        self.tmpfile = self._openFile(self.tmpfilename, "w")
        self.tmp = tmp = self.tmpfile.root
        cs = self.chunksize
        ss = self.slicesize
        filters = self.filters
        # temporary sorted & indices arrays
        shape = (0, ss)
        atom = Atom.from_dtype(self.dtype)
        EArray(tmp, 'sorted', atom, shape,
               "Temporary sorted", filters, chunkshape=(1, cs))
        EArray(tmp, 'indices', UIntAtom(itemsize=self.indsize), shape,
               "Temporary indices", filters, chunkshape=(1, cs))
        # temporary bounds
        nbounds_inslice = (ss - 1) // cs
        shape = (0, nbounds_inslice)
        EArray(tmp, 'bounds', atom, shape, "Temp chunk bounds",
               filters, chunkshape=(cs, nbounds_inslice))
        shape = (0,)
        EArray(tmp, 'abounds', atom, shape, "Temp start bounds",
               filters, chunkshape=(cs,))
        EArray(tmp, 'zbounds', atom, shape, "Temp end bounds",
               filters, chunkshape=(cs,))
        EArray(tmp, 'mbounds', atom, shape, "Median bounds",
               filters, chunkshape=(cs,))
        # temporary ranges
        EArray(tmp, 'ranges', atom, (0, 2),
               "Temporary range values", filters, chunkshape=(cs, 2))
        EArray(tmp, 'mranges', atom, (0,),
               "Median ranges", filters, chunkshape=(cs,))
        # temporary last row (sorted)
        shape = (ss + 2 + nbounds_inslice,)
        CArray(tmp, 'sortedLR', atom, shape,
               "Temp Last Row sorted values + bounds",
               filters, chunkshape=(cs,))
        # temporary last row (indices)
        shape = (ss,)
        CArray(tmp, 'indicesLR',
               UIntAtom(itemsize=self.indsize),
               shape, "Temp Last Row indices",
               filters, chunkshape=(cs,))

    def create_temp2(self):
        """Create some temporary objects for slice sorting purposes."""

        # The algorithms for doing the swap can be optimized so that
        # it would only be necessary to create temporaries for keeping
        # the contents of a single superblock.
        # F. Alted 2007-01-03
        cs = self.chunksize
        ss = self.slicesize
        filters = self.filters
        # temporary sorted & indices arrays
        shape = (self.nslices, ss)
        atom = Atom.from_dtype(self.dtype)
        tmp = self.tmp
        CArray(tmp, 'sorted2', atom, shape,
               "Temporary sorted 2", filters, chunkshape=(1, cs))
        CArray(tmp, 'indices2', UIntAtom(itemsize=self.indsize), shape,
               "Temporary indices 2", filters, chunkshape=(1, cs))
        # temporary bounds
        nbounds_inslice = (ss - 1) // cs
        shape = (self.nslices, nbounds_inslice)
        CArray(tmp, 'bounds2', atom, shape, "Temp chunk bounds 2",
               filters, chunkshape=(cs, nbounds_inslice))
        shape = (self.nchunks,)
        CArray(tmp, 'abounds2', atom, shape, "Temp start bounds 2",
               filters, chunkshape=(cs,))
        CArray(tmp, 'zbounds2', atom, shape, "Temp end bounds 2",
               filters, chunkshape=(cs,))
        CArray(tmp, 'mbounds2', atom, shape, "Median bounds 2",
               filters, chunkshape=(cs,))
        # temporary ranges
        CArray(tmp, 'ranges2', atom, (self.nslices, 2),
               "Temporary range values 2", filters, chunkshape=(cs, 2))
        CArray(tmp, 'mranges2', atom, (self.nslices,),
               "Median ranges 2", filters, chunkshape=(cs,))

    def cleanup_temp(self):
        """Copy the data and delete the temporaries for sorting purposes."""

        if self.verbose:
            print("Copying temporary data...")
        # tmp -> index
        reduction = self.reduction
        cs = self.chunksize // reduction
        ncs = self.nchunkslice
        tmp = self.tmp
        for i in range(self.nslices):
            # Copy sorted & indices slices
            sorted = tmp.sorted[i][::reduction].copy()
            self.sorted.append(sorted.reshape(1, sorted.size))
            # Compute ranges
            self.ranges.append([[sorted[0], sorted[-1]]])
            # Compute chunk bounds
            self.bounds.append([sorted[cs::cs]])
            # Compute start, stop & median bounds and ranges
            self.abounds.append(sorted[0::cs])
            self.zbounds.append(sorted[cs - 1::cs])
            smedian = sorted[cs // 2::cs]
            self.mbounds.append(smedian)
            self.mranges.append([smedian[ncs // 2]])
            del sorted, smedian  # delete references
            # Now that sorted is gone, we can copy the indices
            indices = tmp.indices[i]
            self.indices.append(indices.reshape(1, indices.size))

        # Now it is the turn of the last row (if needed)
        if self.nelementsSLR > 0:
            # First, the sorted values
            sortedLR = self.sortedLR
            indicesLR = self.indicesLR
            nelementsLR = self.nelementsILR
            sortedlr = tmp.sortedLR[:nelementsLR][::reduction].copy()
            nelementsSLR = len(sortedlr)
            sortedLR[:nelementsSLR] = sortedlr
            # Now, the bounds
            self.bebounds = np.concatenate((sortedlr[::cs], [sortedlr[-1]]))
            offset2 = len(self.bebounds)
            sortedLR[nelementsSLR:nelementsSLR + offset2] = self.bebounds
            # Finally, the indices
            indicesLR[:] = tmp.indicesLR[:]
            # Update the number of (reduced) sorted elements
            self.nelementsSLR = nelementsSLR
            # The number of elements will be saved as an attribute
            self.sortedLR.attrs.nelements = self.nelementsSLR
            self.indicesLR.attrs.nelements = self.nelementsILR

        if self.verbose:
            print("Deleting temporaries...")
        self.tmp = None
        self.tmpfile.close()
        Path(self.tmpfilename).unlink()
        self.tmpfilename = None

        # The optimization process has finished, and the index is ok now
        self.dirty = False
        # ...but the memory data cache is dirty now
        self.dirtycache = True

    def get_neworder(self, neworder, src_disk, tmp_disk,
                     lastrow, nslices, offset, dtype):
        """Get sorted & indices values in new order."""

        cs = self.chunksize
        ncs = ncs2 = self.nchunkslice
        self_nslices = self.nslices
        tmp = np.empty(shape=self.slicesize, dtype=dtype)
        for i in range(nslices):
            ns = offset + i
            if ns == self_nslices:
                # The number of complete chunks in the last row
                ncs2 = self.nelementsILR // cs
            # Get slices in new order
            for j in range(ncs2):
                idx = neworder[i * ncs + j]
                ins = idx // ncs
                inc = (idx - ins * ncs) * cs
                ins += offset
                nc = j * cs
                if ins == self_nslices:
                    tmp[nc:nc + cs] = lastrow[inc:inc + cs]
                else:
                    tmp[nc:nc + cs] = src_disk[ins, inc:inc + cs]
            if ns == self_nslices:
                # Copy the complete chunks back to the last row
                lastrow[:ncs2 * cs] = tmp[:ncs2 * cs]
                # The elements in the last chunk of the last row will
                # participate in the global reordering later on, during
                # the phase of sorting of *two* slices at a time
                # (including the last row slice, see
                # self.reorder_slices()).  The caches for last row will
                # be updated in self.reorder_slices() too.
                # F. Altet 2008-08-25
            else:
                tmp_disk[ns] = tmp

    def swap_chunks(self, mode="median"):
        """Swap & reorder the different chunks in a block."""

        boundsnames = {
            'start': 'abounds', 'stop': 'zbounds', 'median': 'mbounds'}
        tmp = self.tmp
        sorted = tmp.sorted
        indices = tmp.indices
        tmp_sorted = tmp.sorted2
        tmp_indices = tmp.indices2
        sortedLR = tmp.sortedLR
        indicesLR = tmp.indicesLR
        cs = self.chunksize
        ncs = self.nchunkslice
        nsb = self.nslicesblock
        ncb = ncs * nsb
        ncb2 = ncb
        boundsobj = tmp._f_get_child(boundsnames[mode])
        can_cross_bbounds = (self.indsize == 8 and self.nelementsILR > 0)
        for nblock in range(self.nblocks):
            # Protection for last block having fewer chunks than ncb
            remainingchunks = self.nchunks - nblock * ncb
            if remainingchunks < ncb:
                ncb2 = remainingchunks
            if ncb2 <= 1:
                # if zero or one chunks remain we are done
                break
            nslices = ncb2 // ncs
            bounds = boundsobj[nblock * ncb:nblock * ncb + ncb2]
            # Do this only if lastrow elements can cross block boundaries
            if (nblock == self.nblocks - 1 and  # last block
                    can_cross_bbounds):
                nslices += 1
                ul = self.nelementsILR // cs
                bounds = np.concatenate((bounds, self.bebounds[:ul]))
            sbounds_idx = bounds.argsort(kind=defsort)
            offset = int(nblock * nsb)
            # Swap sorted and indices following the new order
            self.get_neworder(sbounds_idx, sorted, tmp_sorted, sortedLR,
                              nslices, offset, self.dtype)
            self.get_neworder(sbounds_idx, indices, tmp_indices, indicesLR,
                              nslices, offset, 'u%d' % self.indsize)
        # Reorder completely the index at slice level
        self.reorder_slices(tmp=True)

    def read_slice(self, where, nslice, buffer, start=0):
        """Read a slice from the `where` dataset and put it in `buffer`."""

        # Create the buffers for specifying the coordinates
        self.startl = np.array([nslice, start], np.uint64)
        self.stopl = np.array([nslice + 1, start + buffer.size], np.uint64)
        self.stepl = np.ones(shape=2, dtype=np.uint64)
        where._g_read_slice(self.startl, self.stopl, self.stepl, buffer)

    def write_slice(self, where, nslice, buffer, start=0):
        """Write a `slice` to the `where` dataset with the `buffer` data."""

        self.startl = np.array([nslice, start], np.uint64)
        self.stopl = np.array([nslice + 1, start + buffer.size], np.uint64)
        self.stepl = np.ones(shape=2, dtype=np.uint64)
        countl = self.stopl - self.startl  # (1, self.slicesize)
        where._g_write_slice(self.startl, self.stepl, countl, buffer)

    # Read version for LastRow
    def read_slice_lr(self, where, buffer, start=0):
        """Read a slice from the `where` dataset and put it in `buffer`."""

        startl = np.array([start], dtype=np.uint64)
        stopl = np.array([start + buffer.size], dtype=np.uint64)
        stepl = np.array([1], dtype=np.uint64)
        where._g_read_slice(startl, stopl, stepl, buffer)

    # Write version for LastRow
    def write_sliceLR(self, where, buffer, start=0):
        """Write a slice to the `where` dataset with the `buffer` data."""

        startl = np.array([start], dtype=np.uint64)
        countl = np.array([start + buffer.size], dtype=np.uint64)
        stepl = np.array([1], dtype=np.uint64)
        where._g_write_slice(startl, stepl, countl, buffer)

    def reorder_slice(self, nslice, sorted, indices, ssorted, sindices,
                      tmp_sorted, tmp_indices):
        """Copy & reorder the slice in source to final destination."""

        ss = self.slicesize
        # Load the second part in buffers
        self.read_slice(tmp_sorted, nslice, ssorted[ss:])
        self.read_slice(tmp_indices, nslice, sindices[ss:])
        indexesextension.keysort(ssorted, sindices)
        # Write the first part of the buffers to the regular leaves
        self.write_slice(sorted, nslice - 1, ssorted[:ss])
        self.write_slice(indices, nslice - 1, sindices[:ss])
        # Update caches
        self.update_caches(nslice - 1, ssorted[:ss])
        # Shift the slice in the end to the beginning
        ssorted[:ss] = ssorted[ss:]
        sindices[:ss] = sindices[ss:]

    def update_caches(self, nslice, ssorted):
        """Update the caches for faster lookups."""

        cs = self.chunksize
        ncs = self.nchunkslice
        tmp = self.tmp
        # update first & second cache bounds (ranges & bounds)
        tmp.ranges[nslice] = ssorted[[0, -1]]
        tmp.bounds[nslice] = ssorted[cs::cs]
        # update start & stop bounds
        tmp.abounds[nslice * ncs:(nslice + 1) * ncs] = ssorted[0::cs]
        tmp.zbounds[nslice * ncs:(nslice + 1) * ncs] = ssorted[cs - 1::cs]
        # update median bounds
        smedian = ssorted[cs // 2::cs]
        tmp.mbounds[nslice * ncs:(nslice + 1) * ncs] = smedian
        tmp.mranges[nslice] = smedian[ncs // 2]
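
    # A worked example with illustrative numbers: for cs == 2, ncs == 3 and
    # ssorted == [10, 20, 30, 40, 50, 60], the caches become
    # ranges -> [10, 60], bounds -> [30, 50], abounds -> [10, 30, 50],
    # zbounds -> [20, 40, 60], mbounds -> [20, 40, 60], and the slice
    # median stored in mranges -> 40.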

    def reorder_slices(self, tmp):
        """Reorder completely the index at slice level.

        This method has to maintain the locality of elements within the
        ambit of ``blocks``, i.e. an element of a ``block`` cannot be
        sent to another ``block`` during this reordering.  This is
        *critical* for ``light`` indexes to be able to use this.

        This version of reorder_slices is optimized in that *two*
        complete slices are taken at a time (including the last row
        slice) so as to sort them.  Then, each new slice that is read is
        put at the end of this two-slice buffer, while the previous one
        is moved to the beginning of the buffer.  This is in order to
        better reduce the entropy of the regular part (i.e. all except
        the last row) of the index.

        A secondary effect of this is that it takes at least *twice*
        the memory of a previous version of reorder_slices() that only
        reordered on a slice-by-slice basis.  However, as this is more
        efficient than the old version, one can configure the slicesize
        to be smaller, so the memory consumption stays roughly similar.

        """

        use_temporaries = tmp  # keep the flag; ``tmp`` is rebound just below
        tmp = self.tmp
        sorted = tmp.sorted
        indices = tmp.indices
        if use_temporaries:
            tmp_sorted = tmp.sorted2
            tmp_indices = tmp.indices2
        else:
            tmp_sorted = tmp.sorted
            tmp_indices = tmp.indices
        cs = self.chunksize
        ss = self.slicesize
        nsb = self.blocksize // self.slicesize
        nslices = self.nslices
        nblocks = self.nblocks
        nelementsLR = self.nelementsILR
        # Create the buffer for reordering 2 slices at a time
        ssorted = np.empty(shape=ss * 2, dtype=self.dtype)
        sindices = np.empty(shape=ss * 2, dtype=np.dtype('u%d' % self.indsize))

        if self.indsize == 8:
            # Bootstrap the process for reordering
            # Read the first slice in buffers
            self.read_slice(tmp_sorted, 0, ssorted[:ss])
            self.read_slice(tmp_indices, 0, sindices[:ss])

            nslice = 0  # Just in case the loop below executes nothing
            # Loop over the remaining slices in block
            for nslice in range(1, sorted.nrows):
                self.reorder_slice(nslice, sorted, indices,
                                   ssorted, sindices,
                                   tmp_sorted, tmp_indices)

            # End the process (enrolling the lastrow if necessary)
            if nelementsLR > 0:
                sortedLR = self.tmp.sortedLR
                indicesLR = self.tmp.indicesLR
                # Shrink the ssorted and sindices arrays to the minimum
                ssorted2 = ssorted[:ss + nelementsLR]
                sortedlr = ssorted2[ss:]
                sindices2 = sindices[:ss + nelementsLR]
                indiceslr = sindices2[ss:]
                # Read the last row info in the second part of the buffer
                self.read_slice_lr(sortedLR, sortedlr)
                self.read_slice_lr(indicesLR, indiceslr)
                indexesextension.keysort(ssorted2, sindices2)
                # Write the second part of the buffers to the lastrow indices
                self.write_sliceLR(sortedLR, sortedlr)
                self.write_sliceLR(indicesLR, indiceslr)
                # Update the caches for last row
                bebounds = np.concatenate((sortedlr[::cs], [sortedlr[-1]]))
                sortedLR[nelementsLR:nelementsLR + len(bebounds)] = bebounds
                self.bebounds = bebounds
            # Write the first part of the buffers to the regular leaves
            self.write_slice(sorted, nslice, ssorted[:ss])
            self.write_slice(indices, nslice, sindices[:ss])
            # Update caches for this slice
            self.update_caches(nslice, ssorted[:ss])
        else:
            # Iterate over each block.  No data should cross block
            # boundaries to avoid addressing problems with short indices.
            for nb in range(nblocks):
                # Bootstrap the process for reordering
                # Read the first slice in buffers
                nrow = nb * nsb
                self.read_slice(tmp_sorted, nrow, ssorted[:ss])
                self.read_slice(tmp_indices, nrow, sindices[:ss])

                # Loop over the remaining slices in block
                lrb = nrow + nsb
                if lrb > nslices:
                    lrb = nslices
                nslice = nrow  # Just in case the loop below executes nothing
                for nslice in range(nrow + 1, lrb):
                    self.reorder_slice(nslice, sorted, indices,
                                       ssorted, sindices,
                                       tmp_sorted, tmp_indices)

                # Write the first part of the buffers to the regular leaves
                self.write_slice(sorted, nslice, ssorted[:ss])
                self.write_slice(indices, nslice, sindices[:ss])
                # Update caches for this slice
                self.update_caches(nslice, ssorted[:ss])

    def swap_slices(self, mode="median"):
        """Swap slices in a superblock."""

        tmp = self.tmp
        sorted = tmp.sorted
        indices = tmp.indices
        tmp_sorted = tmp.sorted2
        tmp_indices = tmp.indices2
        ncs = self.nchunkslice
        nss = self.superblocksize // self.slicesize
        nss2 = nss
        for sblock in range(self.nsuperblocks):
            # Protection for last superblock having fewer slices than nss
            remainingslices = self.nslices - sblock * nss
            if remainingslices < nss:
                nss2 = remainingslices
            if nss2 <= 1:
                break
            if mode == "start":
                ranges = tmp.ranges[sblock * nss:sblock * nss + nss2, 0]
            elif mode == "stop":
                ranges = tmp.ranges[sblock * nss:sblock * nss + nss2, 1]
            elif mode == "median":
                ranges = tmp.mranges[sblock * nss:sblock * nss + nss2]
            sranges_idx = ranges.argsort(kind=defsort)
            # Don't swap the superblock at all if one doesn't need to
            ndiff = (sranges_idx != np.arange(nss2)).sum() / 2
            if ndiff * 50 < nss2:
                # The number of slices to rearrange is less than 2.5%,
                # so skip the reordering of this superblock
                # (too expensive for such a little improvement)
                if self.verbose:
                    print("skipping reordering of superblock ->", sblock)
                continue
            ns = sblock * nss2
            # Swap sorted and indices slices following the new order
            for i in range(nss2):
                idx = sranges_idx[i]
                # Swap sorted & indices slices
                oi = ns + i
                oidx = ns + idx
                tmp_sorted[oi] = sorted[oidx]
                tmp_indices[oi] = indices[oidx]
                # Swap start, stop & median ranges
                tmp.ranges2[oi] = tmp.ranges[oidx]
                tmp.mranges2[oi] = tmp.mranges[oidx]
                # Swap chunk bounds
                tmp.bounds2[oi] = tmp.bounds[oidx]
                # Swap start, stop & median bounds
                j = oi * ncs
                jn = (oi + 1) * ncs
                xj = oidx * ncs
                xjn = (oidx + 1) * ncs
                tmp.abounds2[j:jn] = tmp.abounds[xj:xjn]
                tmp.zbounds2[j:jn] = tmp.zbounds[xj:xjn]
                tmp.mbounds2[j:jn] = tmp.mbounds[xj:xjn]
            # tmp -> originals
            for i in range(nss2):
                # Copy sorted & indices slices
                oi = ns + i
                sorted[oi] = tmp_sorted[oi]
                indices[oi] = tmp_indices[oi]
                # Copy start, stop & median ranges
                tmp.ranges[oi] = tmp.ranges2[oi]
                tmp.mranges[oi] = tmp.mranges2[oi]
                # Copy chunk bounds
                tmp.bounds[oi] = tmp.bounds2[oi]
                # Copy start, stop & median bounds
                j = oi * ncs
                jn = (oi + 1) * ncs
                tmp.abounds[j:jn] = tmp.abounds2[j:jn]
                tmp.zbounds[j:jn] = tmp.zbounds2[j:jn]
                tmp.mbounds[j:jn] = tmp.mbounds2[j:jn]

    def search_item_lt(self, where, item, nslice, limits, start=0):
        """Search a single item in a specific sorted slice."""

        # This method only works under the assumption that item
        # *is to be found* in the nslice.
        assert nan_aware_lt(limits[0], item) and nan_aware_le(item, limits[1])
        cs = self.chunksize
        ss = self.slicesize
        nelementsLR = self.nelementsILR
        bstart = start // cs

        # Find the chunk
        if nslice < self.nslices:
            nchunk = bisect_left(where.bounds[nslice], item, bstart)
        else:
            # We need to subtract 1 chunk here because bebounds
            # has a leading value
            nchunk = bisect_left(self.bebounds, item, bstart) - 1
        assert nchunk >= 0

        # Find the element in chunk
        pos = nchunk * cs
        if nslice < self.nslices:
            pos += bisect_left(where.sorted[nslice, pos:pos + cs], item)
            assert pos <= ss
        else:
            end = pos + cs
            if end > nelementsLR:
                end = nelementsLR
            pos += bisect_left(self.sortedLR[pos:end], item)
            assert pos <= nelementsLR
        assert pos > 0
        return pos
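
    # A worked example with illustrative numbers: take chunksize == 4 and a
    # regular slice holding the sorted values 1..16, so its chunk bounds
    # are [5, 9, 13].  Searching for item == 10 first picks chunk
    # bisect_left([5, 9, 13], 10) == 2, giving pos == 8, then refines
    # within sorted[8:12] == [9, 10, 11, 12] to return pos == 9 (the count
    # of elements strictly less than 10).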
1499 def compute_overlaps_finegrain(self, where, message, verbose):
1500 """Compute some statistics about overlaping of slices in index.
1502 It returns the following info:
1504 noverlaps : int
1505 The total number of elements that overlaps in index.
1506 multiplicity : array of int
1507 The number of times that a concrete slice overlaps with any other.
1508 toverlap : float
1509 An ovelap index: the sum of the values in segment slices that
1510 overlaps divided by the entire range of values. This index is only
1511 computed for numerical types.
1513 """
1515 ss = self.slicesize
1516 ranges = where.ranges[:]
1517 sorted = where.sorted
1518 sortedLR = where.sortedLR
1519 nslices = self.nslices
1520 nelementsLR = self.nelementsILR
1521 if nelementsLR > 0:
1522 # Add the ranges corresponding to the last row
1523 rangeslr = np.array([self.bebounds[0], self.bebounds[-1]])
1524 ranges = np.concatenate((ranges, [rangeslr]))
1525 nslices += 1
1526 soverlap = 0
1527 toverlap = -1
1528 multiplicity = np.zeros(shape=nslices, dtype="int_")
1529 overlaps = multiplicity.copy()
1530 starts = multiplicity.copy()
1531 for i in range(nslices):
1532 prev_end = ranges[i, 1]
1533 for j in range(i + 1, nslices):
1534 stj = starts[j]
1535 assert stj <= ss
1536 if stj == ss:
1537 # This slice has already been counted
1538 continue
1539 if j < self.nslices:
1540 next_beg = sorted[j, stj]
1541 else:
1542 next_beg = sortedLR[stj]
1543 next_end = ranges[j, 1]
1544 if prev_end > next_end:
1545 # Complete overlapping case
1546 multiplicity[j - i] += 1
1547 if j < self.nslices:
1548 overlaps[i] += ss - stj
1549 starts[j] = ss # a sentinel
1550 else:
1551 overlaps[i] += nelementsLR - stj
1552 starts[j] = nelementsLR # a sentinel
1553 elif prev_end > next_beg:
1554 multiplicity[j - i] += 1
1555 idx = self.search_item_lt(
1556 where, prev_end, j, ranges[j], stj)
1557 nelem = idx - stj
1558 overlaps[i] += nelem
1559 starts[j] = idx
1560 if self.type != "string":
1561 # Convert ranges into floats in order to allow
1562 # doing operations with them without overflows
1563 soverlap += float(ranges[i, 1]) - float(ranges[j, 0])
1565 # Return the overlap as the ratio between overlaps and entire range
1566 if self.type != "string":
1567 erange = float(ranges[-1, 1]) - float(ranges[0, 0])
1568 # Check that there is an effective range of values
1569 # Beware, erange can be negative in situations where
1570 # the values are suffering overflow. This can happen
1571 # specially on big signed integer values (on overflows,
1572 # the end value will become negative!).
1573 # Also, there is no way to compute overlap ratios for
1574 # non-numerical types. So, be careful and always check
1575 # that toverlap has a positive value (it must have been
1576 # initialized to -1. before) before using it.
1577 # F. Alted 2007-01-19
1578 if erange > 0:
1579 toverlap = soverlap / erange
1580 if verbose and message != "init":
1581 print("toverlap (%s):" % message, toverlap)
1582 print("multiplicity:\n", multiplicity, multiplicity.sum())
1583 print("overlaps:\n", overlaps, overlaps.sum())
1584 noverlaps = overlaps.sum()
1585 # For full indexes, set the 'is_csi' flag
1586 if self.indsize == 8 and self._v_file._iswritable():
1587 self._v_attrs.is_csi = (noverlaps == 0)
1588 # Save the number of overlaps for future references
1589 self.noverlaps = noverlaps
1590 return (noverlaps, multiplicity, toverlap)
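# NOTE (editor's addition, not in the original source): reading the
# triple returned above. For three slices whose value ranges are
# [0, 10], [5, 15] and [20, 30], only slices 0 and 1 overlap, so
# 'noverlaps' counts the elements of slice 1 that sort below 10,
# multiplicity[1] == 1 (one pair of slices at distance 1 overlaps), and
# 'toverlap' is the overlapped span over the full span of values.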
1592 def compute_overlaps(self, where, message, verbose):
1593 """Compute some statistics about overlaping of slices in index.
1595 It returns the following info:
1597 noverlaps : int
1598 The total number of slices that overlaps in index.
1599 multiplicity : array of int
1600 The number of times that a concrete slice overlaps with any other.
1601 toverlap : float
1602 An ovelap index: the sum of the values in segment slices that
1603 overlaps divided by the entire range of values. This index is only
1604 computed for numerical types.
1606 """
1608 ranges = where.ranges[:]
1609 nslices = self.nslices
1610 if self.nelementsILR > 0:
1611 # Add the ranges corresponding to the last row
1612 rangeslr = np.array([self.bebounds[0], self.bebounds[-1]])
1613 ranges = np.concatenate((ranges, [rangeslr]))
1614 nslices += 1
1615 noverlaps = 0
1616 soverlap = 0
1617 toverlap = -1
1618 multiplicity = np.zeros(shape=nslices, dtype="int_")
1619 for i in range(nslices):
1620 for j in range(i + 1, nslices):
1621 if ranges[i, 1] > ranges[j, 0]:
1622 noverlaps += 1
1623 multiplicity[j - i] += 1
1624 if self.type != "string":
1625 # Convert ranges into floats so we can operate
1626 # on them without overflow
1627 soverlap += float(ranges[i, 1]) - float(ranges[j, 0])
1629 # Return the overlap as the ratio between overlaps and entire range
1630 if self.type != "string":
1631 erange = float(ranges[-1, 1]) - float(ranges[0, 0])
1632 # Check that there is an effective range of values
1633 # Beware, erange can be negative when the values
1634 # overflow. This can happen especially with big
1635 # signed integer values (on overflow, the end value
1636 # becomes negative!).
1637 # Also, there is no way to compute overlap ratios for
1638 # non-numerical types. So, be careful and always check
1639 # that toverlap has a positive value (it was
1640 # initialized to -1 above) before using it.
1641 # F. Altet 2007-01-19
1642 if erange > 0:
1643 toverlap = soverlap / erange
1644 if verbose:
1645 print("overlaps (%s):" % message, noverlaps, toverlap)
1646 print(multiplicity)
1647 # For full indexes, set the 'is_csi' flag
1648 if self.indsize == 8 and self._v_file._iswritable():
1649 self._v_attrs.is_csi = (noverlaps == 0)
1650 # Save the number of overlaps for future references
1651 self.noverlaps = noverlaps
1652 return (noverlaps, multiplicity, toverlap)
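# NOTE (editor's addition, not in the original source): a minimal,
# self-contained toy of the coarse pairwise test used by
# compute_overlaps() above; each row of the hypothetical 'ranges'
# array holds one slice's (min, max).
#
#   import numpy as np
#   ranges = np.array([[0, 10], [5, 15], [20, 30]])
#   noverlaps, soverlap = 0, 0.0
#   for i in range(len(ranges)):
#       for j in range(i + 1, len(ranges)):
#           if ranges[i, 1] > ranges[j, 0]:  # slice i ends past slice j's start
#               noverlaps += 1
#               soverlap += float(ranges[i, 1]) - float(ranges[j, 0])
#   erange = float(ranges[-1, 1]) - float(ranges[0, 0])
#   toverlap = soverlap / erange if erange > 0 else -1
#   # -> noverlaps == 1, toverlap == 5 / 30 for this toy data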
1654 def read_sorted_indices(self, what, start, stop, step):
1655 """Return the sorted or indices values in the specified range."""
1656 (start, stop, step) = self._process_range(start, stop, step)
1657 if start >= stop:
1658 return np.empty(0, self.dtype)
1659 # Correction for negative values of step (reverse indices)
1660 if step < 0:
1661 tmp = start
1662 start = self.nelements - stop
1663 stop = self.nelements - tmp
1664 if what == "sorted":
1665 values = self.sorted
1666 valuesLR = self.sortedLR
1667 buffer_ = np.empty(stop - start, dtype=self.dtype)
1668 else:
1669 values = self.indices
1670 valuesLR = self.indicesLR
1671 buffer_ = np.empty(stop - start, dtype="u%d" % self.indsize)
1672 ss = self.slicesize
1673 nrow_start = start // ss
1674 istart = start % ss
1675 nrow_stop = stop // ss
1676 tlen = stop - start
1677 bstart = 0
1678 ilen = 0
1679 for nrow in range(nrow_start, nrow_stop + 1):
1680 blen = ss - istart
1681 if ilen + blen > tlen:
1682 blen = tlen - ilen
1683 if blen <= 0:
1684 break
1685 if nrow < self.nslices:
1686 self.read_slice(
1687 values, nrow, buffer_[bstart:bstart + blen], istart)
1688 else:
1689 self.read_slice_lr(
1690 valuesLR, buffer_[bstart:bstart + blen], istart)
1691 istart = 0
1692 bstart += blen
1693 ilen += blen
1694 return buffer_[::step]
1696 def read_sorted(self, start=None, stop=None, step=None):
1697 """Return the sorted values of index in the specified range.
1699 The meaning of the start, stop and step arguments is the same as in
1700 :meth:`Table.read_sorted`.
1702 """
1704 return self.read_sorted_indices('sorted', start, stop, step)
1706 def read_indices(self, start=None, stop=None, step=None):
1707 """Return the indices values of index in the specified range.
1709 The meaning of the start, stop and step arguments is the same as in
1710 :meth:`Table.read_sorted`.
1712 """
1714 return self.read_sorted_indices('indices', start, stop, step)
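# NOTE (editor's addition, not in the original source): a hedged usage
# sketch. It assumes a file 'data.h5' with a Table '/t' whose column
# 'x' was indexed via Column.create_index(); the file and column names
# are hypothetical.
#
#   import tables as tb
#   with tb.open_file('data.h5') as h5:
#       idx = h5.root.t.cols.x.index     # the Index instance
#       vals = idx.read_sorted(0, 10)    # 10 smallest values, in sorted order
#       rows = idx.read_indices(0, 10)   # table row numbers of those values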
1716 def _process_range(self, start, stop, step):
1717 """Get a range specifc for the index usage."""
1719 if start is not None and stop is None:
1720 # Special case for the behaviour of PyTables iterators
1721 stop = idx2long(start + 1)
1722 if start is None:
1723 start = 0
1724 else:
1725 start = idx2long(start)
1726 if stop is None:
1727 stop = idx2long(self.nelements)
1728 else:
1729 stop = idx2long(stop)
1730 if step is None:
1731 step = 1
1732 else:
1733 step = idx2long(step)
1734 return (start, stop, step)
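# NOTE (editor's addition, not in the original source): how
# _process_range() normalizes its arguments for an index with, say,
# nelements == 100:
#
#   idx._process_range(5, None, None)     # -> (5, 6, 1), single-element case
#   idx._process_range(None, None, None)  # -> (0, 100, 1)
#   idx._process_range(10, 50, 2)         # -> (10, 50, 2)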
1736 def __getitem__(self, key):
1737 """Return the indices values of index in the specified range.
1739 If key argument is an integer, the corresponding index is returned. If
1740 key is a slice, the range of indices determined by it is returned. A
1741 negative value of step in slice is supported, meaning that the results
1742 will be returned in reverse order.
1744 This method is equivalent to :meth:`Index.read_indices`.
1746 """
1748 if is_idx(key):
1749 key = operator.index(key)
1751 if key < 0:
1752 # To support negative values
1753 key += self.nelements
1754 return self.read_indices(key, key + 1, 1)[0]
1755 elif isinstance(key, slice):
1756 return self.read_indices(key.start, key.stop, key.step)
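# NOTE (editor's addition, not in the original source): usage sketch
# for __getitem__; 'idx' stands for an Index instance:
#
#   idx[0]      # row number of the smallest indexed value
#   idx[-1]     # row number of the largest indexed value
#   idx[2:10]   # row numbers of the 3rd through 10th smallest values
#   idx[::-1]   # all row numbers, from largest to smallest value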
1758 def __len__(self):
1759 return self.nelements
1761 def restorecache(self):
1762 """Clean the limits cache and resize starts and lengths arrays"""
1764 params = self._v_file.params
1765 # The sorted IndexArray is absolutely required to be in memory
1766 # at the same time as the Index instance, so create a strong
1767 # reference to it (self._sorted aliases self.sorted). We are not
1768 # introducing leaks because the strong reference will disappear
1769 # when this Index instance is closed.
1770 self._sorted = self.sorted
1774 self.sorted.boundscache = ObjectCache(params['BOUNDS_MAX_SLOTS'],
1775 params['BOUNDS_MAX_SIZE'],
1776 'non-opt types bounds')
1777 """A cache for the bounds (2nd hash) data. Only used for
1778 non-optimized types searches."""
1779 self.limboundscache = ObjectCache(params['LIMBOUNDS_MAX_SLOTS'],
1780 params['LIMBOUNDS_MAX_SIZE'],
1781 'bounding limits')
1782 """A cache for bounding limits."""
1783 self.sortedLRcache = ObjectCache(params['SORTEDLR_MAX_SLOTS'],
1784 params['SORTEDLR_MAX_SIZE'],
1785 'last row chunks')
1786 """A cache for the last row chunks. Only used for searches in
1787 the last row, and mainly useful for small indexes."""
1788 self.starts = np.empty(shape=self.nrows, dtype=np.int32)
1789 self.lengths = np.empty(shape=self.nrows, dtype=np.int32)
1790 self.sorted._init_sorted_slice(self)
1791 self.dirtycache = False
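# NOTE (editor's addition, not in the original source): the caches
# built above share the getslot/getitem/setitem protocol exercised by
# search() and search_last_row() below; a minimal sketch of that
# protocol (names hypothetical):
#
#   nslot = cache.getslot(key)           # -1 when the key is not cached
#   if nslot >= 0:
#       value = cache.getitem(nslot)     # cache hit
#   else:
#       value = expensive_lookup(key)    # cache miss: compute the value...
#       cache.setitem(key, value, size)  # ...and store it, with a size hint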
1793 def search(self, item):
1794 """Do a binary search in this index for an item."""
1796 if profile:
1797 tref = clock()
1798 if profile:
1799 show_stats("Entering search", tref)
1801 if self.dirtycache:
1802 self.restorecache()
1804 # An empty item, or a left limit larger than the right one,
1805 # means that the set of matching records is empty,
1806 # so we avoid further computation (including looking up the
1807 # limits cache).
1808 if not item or item[0] > item[1]:
1809 self.starts[:] = 0
1810 self.lengths[:] = 0
1811 return 0
1813 tlen = 0
1814 # Check whether the item tuple is in the limits cache or not
1815 nslot = self.limboundscache.getslot(item)
1816 if nslot >= 0:
1817 startlengths = self.limboundscache.getitem(nslot)
1818 # Reset the lengths array (not necessary for starts)
1819 self.lengths[:] = 0
1820 # Now, set the interesting rows
1821 for nrow2, start, length in startlengths:
1822 self.starts[nrow2] = start
1823 self.lengths[nrow2] = length
1824 tlen = tlen + length
1825 return tlen
1826 # The item is not in cache. Do the real lookup.
1827 sorted = self.sorted
1828 if self.nslices > 0:
1829 if self.type in self.opt_search_types:
1830 # The following are optimizations. However, they hide
1831 # the CPU consumption of the C functions from Python profiles.
1832 # You may want to deactivate them while profiling.
1833 if self.type == "int32":
1834 tlen = sorted._search_bin_na_i(*item)
1835 elif self.type == "int64":
1836 tlen = sorted._search_bin_na_ll(*item)
1837 elif self.type == "float16":
1838 tlen = sorted._search_bin_na_e(*item)
1839 elif self.type == "float32":
1840 tlen = sorted._search_bin_na_f(*item)
1841 elif self.type == "float64":
1842 tlen = sorted._search_bin_na_d(*item)
1843 elif self.type == "float96":
1844 tlen = sorted._search_bin_na_g(*item)
1845 elif self.type == "float128":
1846 tlen = sorted._search_bin_na_g(*item)
1847 elif self.type == "uint32":
1848 tlen = sorted._search_bin_na_ui(*item)
1849 elif self.type == "uint64":
1850 tlen = sorted._search_bin_na_ull(*item)
1851 elif self.type == "int8":
1852 tlen = sorted._search_bin_na_b(*item)
1853 elif self.type == "int16":
1854 tlen = sorted._search_bin_na_s(*item)
1855 elif self.type == "uint8":
1856 tlen = sorted._search_bin_na_ub(*item)
1857 elif self.type == "uint16":
1858 tlen = sorted._search_bin_na_us(*item)
1859 else:
1860 assert False, "This can't happen!"
1861 else:
1862 tlen = self.search_scalar(item, sorted)
1863 # Get possible remaining values in last row
1864 if self.nelementsSLR > 0:
1865 # Look for more indexes in the last row
1866 (start, stop) = self.search_last_row(item)
1867 self.starts[-1] = start
1868 self.lengths[-1] = stop - start
1869 tlen += stop - start
1871 if self.limboundscache.couldenablecache():
1872 # Get a startlengths tuple and save it in the cache.
1873 # This is quite slow, but it is a good way to compress
1874 # the bounds info. Moreover, .couldenablecache()
1875 # does a good job of avoiding this computation
1876 # when it is not necessary.
1877 startlengths = []
1878 for nrow, length in enumerate(self.lengths):
1879 if length > 0:
1880 startlengths.append((nrow, self.starts[nrow], length))
1881 # Compute the size of the recarray (approximately).
1882 # The +1 at the end is important to avoid 0 lengths
1883 # (remember, the object headers take some space).
1884 size = len(startlengths) * 8 * 2 + 1
1885 # Put this startlengths list in cache
1886 self.limboundscache.setitem(item, startlengths, size)
1888 if profile:
1889 show_stats("Exiting search", tref)
1890 return tlen
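# NOTE (editor's addition, not in the original source): hedged sketch
# of driving search(), an internal API. The item tuple is an
# *inclusive* (lower, upper) range, as built by get_lookup_range()
# further below:
#
#   tlen = idx.search((25, 75))  # total number of matching elements
#   # the per-slice hits now live in idx.starts[i] and idx.lengths[i]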
1892 # This is a scalar version of search. It works with strings as well.
1893 def search_scalar(self, item, sorted):
1894 """Do a binary search in this index for an item."""
1896 tlen = 0
1897 # Do the lookup for values fulfilling the conditions
1898 for i in range(self.nslices):
1899 (start, stop) = sorted._search_bin(i, item)
1900 self.starts[i] = start
1901 self.lengths[i] = stop - start
1902 tlen += stop - start
1903 return tlen
1905 def search_last_row(self, item):
1906 # Variable initialization
1907 item1, item2 = item
1908 bebounds = self.bebounds
1909 b0, b1 = bebounds[0], bebounds[-1]
1910 bounds = bebounds[1:-1]
1911 itemsize = self.dtype.itemsize
1912 sortedLRcache = self.sortedLRcache
1913 hi = self.nelementsSLR # maximum number of elements
1914 rchunksize = self.chunksize // self.reduction
1916 nchunk = -1
1917 # Lookup for item1
1918 if nan_aware_gt(item1, b0):
1919 if nan_aware_le(item1, b1):
1920 # Search the appropriate chunk in bounds cache
1921 nchunk = bisect_left(bounds, item1)
1922 # Lookup for this chunk in cache
1923 nslot = sortedLRcache.getslot(nchunk)
1924 if nslot >= 0:
1925 chunk = sortedLRcache.getitem(nslot)
1926 else:
1927 begin = rchunksize * nchunk
1928 end = rchunksize * (nchunk + 1)
1929 if end > hi:
1930 end = hi
1931 # Read the chunk from disk
1932 chunk = self.sortedLR._read_sorted_slice(
1933 self.sorted, begin, end)
1934 # Put it in cache. It's important to *copy*
1935 # the buffer, as it is reused in future reads!
1936 sortedLRcache.setitem(nchunk, chunk.copy(),
1937 (end - begin) * itemsize)
1938 start = bisect_left(chunk, item1)
1939 start += rchunksize * nchunk
1940 else:
1941 start = hi
1942 else:
1943 start = 0
1944 # Lookup for item2
1945 if nan_aware_ge(item2, b0):
1946 if nan_aware_lt(item2, b1):
1947 # Search the appropriate chunk in bounds cache
1948 nchunk2 = bisect_right(bounds, item2)
1949 if nchunk2 != nchunk:
1950 # Lookup for this chunk in cache
1951 nslot = sortedLRcache.getslot(nchunk2)
1952 if nslot >= 0:
1953 chunk = sortedLRcache.getitem(nslot)
1954 else:
1955 begin = rchunksize * nchunk2
1956 end = rchunksize * (nchunk2 + 1)
1957 if end > hi:
1958 end = hi
1959 # Read the chunk from disk
1960 chunk = self.sortedLR._read_sorted_slice(
1961 self.sorted, begin, end)
1962 # Put it in cache. It's important to *copy*
1963 # the buffer, as it is reused in future reads!
1964 # See bug #60 in xot.carabos.com
1965 sortedLRcache.setitem(nchunk2, chunk.copy(),
1966 (end - begin) * itemsize)
1967 stop = bisect_right(chunk, item2)
1968 stop += rchunksize * nchunk2
1969 else:
1970 stop = hi
1971 else:
1972 stop = 0
1973 return (start, stop)
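# NOTE (editor's addition, not in the original source): a
# self-contained toy of the two-level bisection used above: first find
# the right chunk via the chunk boundaries, then bisect inside that
# single chunk.
#
#   from bisect import bisect_left
#   chunks = [[1, 3, 5], [7, 9, 11], [13, 15, 17]]  # sorted, chunked data
#   bounds = [7, 13]      # first element of every chunk except the first
#   item = 9
#   nchunk = bisect_left(bounds, item)       # -> 1
#   pos = bisect_left(chunks[nchunk], item)  # -> 1, position inside chunk
#   start = nchunk * 3 + pos                 # -> 4, the global position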
1975 def get_chunkmap(self):
1976 """Compute a map with the interesting chunks in index."""
1978 if profile:
1979 tref = clock()
1980 if profile:
1981 show_stats("Entering get_chunkmap", tref)
1982 ss = self.slicesize
1983 nsb = self.nslicesblock
1984 nslices = self.nslices
1985 lbucket = self.lbucket
1986 indsize = self.indsize
1987 bucketsinblock = self.blocksize / lbucket
1988 nchunks = math.ceil(self.nelements / lbucket)
1989 chunkmap = np.zeros(shape=nchunks, dtype="bool")
1990 reduction = self.reduction
1991 starts = (self.starts - 1) * reduction + 1
1992 stops = (self.starts + self.lengths) * reduction
1993 starts[starts < 0] = 0 # All negative values set to zero
1994 indices = self.indices
1995 for nslice in range(self.nrows):
1996 start = starts[nslice]
1997 stop = stops[nslice]
1998 if stop > start:
1999 idx = np.empty(shape=stop - start, dtype='u%d' % indsize)
2000 if nslice < nslices:
2001 indices._read_index_slice(nslice, start, stop, idx)
2002 else:
2003 self.indicesLR._read_index_slice(start, stop, idx)
2004 if indsize == 8:
2005 idx //= lbucket
2006 elif indsize == 2:
2007 # The chunkmap size can never be larger than 'int_'
2008 idx = idx.astype("int_")
2009 offset = int((nslice // nsb) * bucketsinblock)
2010 idx += offset
2011 elif indsize == 1:
2012 # The chunkmap size can never be larger than 'int_'
2013 idx = idx.astype("int_")
2014 offset = (nslice * ss) // lbucket
2015 idx += offset
2016 chunkmap[idx] = True
2017 # The case lbucket < nrowsinchunk should only happen in tests
2018 nrowsinchunk = self.nrowsinchunk
2019 if lbucket != nrowsinchunk:
2020 # Map the 'coarse grain' chunkmap into the 'true' chunkmap
2021 nelements = self.nelements
2022 tnchunks = math.ceil(nelements / nrowsinchunk)
2023 tchunkmap = np.zeros(shape=tnchunks, dtype="bool")
2024 ratio = lbucket / nrowsinchunk
2025 idx = chunkmap.nonzero()[0]
2026 starts = (idx * ratio).astype('int_')
2027 stops = np.ceil((idx + 1) * ratio).astype('int_')
2028 for start, stop in zip(starts, stops):
2029 tchunkmap[start:stop] = True
2030 chunkmap = tchunkmap
2031 if profile:
2032 show_stats("Exiting get_chunkmap", tref)
2033 return chunkmap
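# NOTE (editor's addition, not in the original source): toy
# illustration of the bucket mapping above: each matching row number is
# divided by the bucket size, flagging the chunk that holds it.
#
#   import numpy as np
#   lbucket = 10                        # rows per bucket
#   hits = np.array([3, 12, 13, 47])    # matching row numbers
#   chunkmap = np.zeros(5, dtype=bool)  # 50 rows -> 5 buckets
#   chunkmap[hits // lbucket] = True    # -> [True, True, False, False, True]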
2035 def get_lookup_range(self, ops, limits):
2036 assert len(ops) in [1, 2]
2037 assert len(limits) in [1, 2]
2038 assert len(ops) == len(limits)
2040 column = self.column
2041 coldtype = column.dtype.base
2042 itemsize = coldtype.itemsize
2044 if len(limits) == 1:
2045 assert ops[0] in ['lt', 'le', 'eq', 'ge', 'gt']
2046 limit = limits[0]
2047 op = ops[0]
2048 if op == 'lt':
2049 range_ = (inftype(coldtype, itemsize, sign=-1),
2050 nextafter(limit, -1, coldtype, itemsize))
2051 elif op == 'le':
2052 range_ = (inftype(coldtype, itemsize, sign=-1),
2053 limit)
2054 elif op == 'gt':
2055 range_ = (nextafter(limit, +1, coldtype, itemsize),
2056 inftype(coldtype, itemsize, sign=+1))
2057 elif op == 'ge':
2058 range_ = (limit,
2059 inftype(coldtype, itemsize, sign=+1))
2060 elif op == 'eq':
2061 range_ = (limit, limit)
2063 elif len(limits) == 2:
2064 assert ops[0] in ('gt', 'ge') and ops[1] in ('lt', 'le')
2066 lower, upper = limits
2067 if lower > upper:
2068 # ``a <[=] x <[=] b`` is always false if ``a > b``.
2069 return ()
2071 if ops == ('gt', 'lt'): # lower < col < upper
2072 range_ = (nextafter(lower, +1, coldtype, itemsize),
2073 nextafter(upper, -1, coldtype, itemsize))
2074 elif ops == ('ge', 'lt'): # lower <= col < upper
2075 range_ = (lower, nextafter(upper, -1, coldtype, itemsize))
2076 elif ops == ('gt', 'le'): # lower < col <= upper
2077 range_ = (nextafter(lower, +1, coldtype, itemsize), upper)
2078 elif ops == ('ge', 'le'): # lower <= col <= upper
2079 range_ = (lower, upper)
2081 return range_
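# NOTE (editor's addition, not in the original source): hedged example
# of the open-interval handling above for a float64 column. Strict
# comparisons become inclusive ranges by stepping one representable
# value; numpy's nextafter stands in here for the module's own
# nextafter(), which also handles non-float column types.
#
#   import numpy as np
#   # 'col < 5.0' -> (-inf, nextafter(5.0, -inf)):
#   upper = np.nextafter(5.0, -np.inf)  # largest float64 strictly below 5.0
#   # 'col > 5.0' -> (nextafter(5.0, +inf), +inf) likewise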
2083 def _f_remove(self, recursive=False):
2084 """Remove this Index object."""
2086 # Index removal is always recursive,
2087 # no matter what `recursive` says.
2088 super()._f_remove(True)
2090 def __str__(self):
2091 """This provides a more compact representation than __repr__"""
2093 # The filters
2094 filters = []
2095 if self.filters.complevel:
2096 if self.filters.shuffle:
2097 filters.append('shuffle')
2098 if self.filters.bitshuffle:
2099 filters.append('bitshuffle')
2100 filters.append(f'{self.filters.complib}({self.filters.complevel})')
2101 return (f"Index({self.optlevel}, "
2102 f"{self.kind}{', '.join(filters)}).is_csi={self.is_csi}")
2104 def __repr__(self):
2105 """This provides more metainfo than standard __repr__"""
2107 cpathname = f"{self.table._v_pathname}.cols.{self.column.pathname}"
2108 retstr = f"""{self._v_pathname} (Index for column {cpathname})
2109 optlevel := {self.optlevel}
2110 kind := {self.kind}
2111 filters := {self.filters}
2112 is_csi := {self.is_csi}
2113 nelements := {self.nelements}
2114 chunksize := {self.chunksize}
2115 slicesize := {self.slicesize}
2116 blocksize := {self.blocksize}
2117 superblocksize := {self.superblocksize}
2118 dirty := {self.dirty}
2119 byteorder := {self.byteorder!r}
2120 sorted := {self.sorted}
2121 indices := {self.indices}
2122 ranges := {self.ranges}
2123 bounds := {self.bounds}
2124 sortedLR := {self.sortedLR}
2125 indicesLR := {self.indicesLR}"""
2126 return retstr
2129class IndexesDescG(NotLoggedMixin, Group):
2130 _c_classid = 'DINDEX'
2132 def _g_width_warning(self):
2133 warnings.warn(
2134 "the number of indexed columns on a single description group "
2135 "is exceeding the recommended maximum (%d); "
2136 "be ready to see PyTables asking for *lots* of memory "
2137 "and possibly slow I/O" % self._v_max_group_width,
2138 PerformanceWarning)
2141class IndexesTableG(NotLoggedMixin, Group):
2142 _c_classid = 'TINDEX'
2144 @property
2145 def auto(self):
2146 if 'AUTO_INDEX' not in self._v_attrs:
2147 return default_auto_index
2148 return self._v_attrs.AUTO_INDEX
2150 @auto.setter
2151 def auto(self, auto):
2152 self._v_attrs.AUTO_INDEX = bool(auto)
2154 @auto.deleter
2155 def auto(self):
2156 del self._v_attrs.AUTO_INDEX
2158 def _g_width_warning(self):
2159 warnings.warn(
2160 "the number of indexed columns on a single table "
2161 "is exceeding the recommended maximum (%d); "
2162 "be ready to see PyTables asking for *lots* of memory "
2163 "and possibly slow I/O" % self._v_max_group_width,
2164 PerformanceWarning)
2166 def _g_check_name(self, name):
2167 if not name.startswith('_i_'):
2168 raise ValueError(
2169 "names of index groups must start with ``_i_``: %s" % name)
2171 @property
2172 def table(self):
2173 """Accessor for the `Table` object of this `IndexesTableG`
2174 container."""
2175 names = self._v_pathname.split("/")
2176 tablename = names.pop()[3:] # "_i_" is at the beginning
2177 parentpathname = "/".join(names)
2178 tablepathname = join_path(parentpathname, tablename)
2179 table = self._v_file._get_node(tablepathname)
2180 return table
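# NOTE (editor's addition, not in the original source): the '_i_'
# naming convention unwound above, on a hypothetical pathname:
#
#   names = '/group/_i_mytable'.split('/')  # ['', 'group', '_i_mytable']
#   tablename = names.pop()[3:]             # 'mytable'
#   join_path('/'.join(names), tablename)   # -> '/group/mytable'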
2183class OldIndex(NotLoggedMixin, Group):
2184 """This is meant to hide indexes of PyTables 1.x files."""
2186 _c_classid = 'CINDEX'