1"""Here is defined the Index class."""
3import math
4import operator
5import os
6import sys
7import tempfile
8import warnings
10from pathlib import Path
11from time import perf_counter as clock
12from time import process_time as cpuclock
14import numpy as np
16from .idxutils import (calc_chunksize, calcoptlevels,
17 get_reduction_level, nextafter, inftype)
19from . import indexesextension
20from .node import NotLoggedMixin
21from .atom import UIntAtom, Atom
22from .earray import EArray
23from .carray import CArray
24from .leaf import Filters
25from .indexes import CacheArray, LastRowArray, IndexArray
26from .group import Group
27from .path import join_path
28from .exceptions import PerformanceWarning
29from .utils import is_idx, idx2long, lazyattr
30from .utilsextension import (nan_aware_gt, nan_aware_ge,
31 nan_aware_lt, nan_aware_le,
32 bisect_left, bisect_right)
33from .lrucacheextension import ObjectCache
35# default version for INDEX objects
36# obversion = "1.0" # Version of indexes in PyTables 1.x series
37# obversion = "2.0" # Version of indexes in PyTables Pro 2.0 series
38obversion = "2.1" # Version of indexes in PyTables Pro 2.1 and up series,
39# # including the join 2.3 Std + Pro version
41debug = False
42# debug = True # Uncomment this for printing sizes purposes
43profile = False
44# profile = True # Uncomment for profiling
45if profile:
46 from .utils import show_stats
48# The default method for sorting
49# defsort = "quicksort"
50# Changing to mergesort to fix #441
51defsort = "mergesort"
53# Default policy for automatically updating indexes after a table
54# append operation, or automatically reindexing after an
55# index-invalidating operation like removing or modifying table rows.
56default_auto_index = True
57# Keep in sync with ``Table.autoindex`` docstring.
59# Default filters used to compress indexes. This is quite fast and
60# compression is pretty good.
61# Remember to keep these defaults in sync with the docstrings and UG.
62default_index_filters = Filters(complevel=1, complib='zlib',
63 shuffle=True, fletcher32=False)
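
# An illustrative sketch (not part of the original module): these defaults
# can be overridden per index by passing a custom Filters instance through
# the public ``Column.create_index()`` API, e.g.:
#
#   from tables import Filters
#   table.cols.pressure.create_index(
#       filters=Filters(complevel=5, complib='blosc', shuffle=True))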

# Deprecated API
defaultAutoIndex = default_auto_index
defaultIndexFilters = default_index_filters

# The list of types for which an optimised search in cython and C has
# been implemented.  Always add here the name of a new optimised type.
opt_search_types = ("int8", "int16", "int32", "int64",
                    "uint8", "uint16", "uint32", "uint64",
                    "float32", "float64")

# The upper limit for uint32 ints
max32 = 2**32


def _table_column_pathname_of_index(indexpathname):
    names = indexpathname.split("/")
    for i, name in enumerate(names):
        if name.startswith('_i_'):
            break
    tablepathname = "/".join(names[:i]) + "/" + name[3:]
    colpathname = "/".join(names[i + 1:])
    return (tablepathname, colpathname)
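
# A minimal worked example with a hypothetical pathname (index groups live
# under a hidden '_i_<tablename>' group next to their table):
#
#   >>> _table_column_pathname_of_index('/group/_i_mytable/mycol')
#   ('/group/mytable', 'mycol')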


class Index(NotLoggedMixin, Group, indexesextension.Index):
    """Represents the index of a column in a table.

    This class is used to keep the indexing information for columns in a
    Table dataset (see :ref:`TableClassDescr`).  It is actually a descendant
    of the Group class (see :ref:`GroupClassDescr`), with some added
    functionality.  An Index is always associated with one and only one
    column in the table.

    .. note::

        This class is mainly intended for internal use, but some of its
        documented attributes and methods may be interesting for the
        programmer.

    Parameters
    ----------
    parentnode
        The parent :class:`Group` object.

        .. versionchanged:: 3.0
           Renamed from *parentNode* to *parentnode*.

    name : str
        The name of this node in its parent group.
    atom : Atom
        An Atom object representing the shape and type of the atomic objects
        to be saved.  Only scalar atoms are supported.
    title
        Sets a TITLE attribute of the Index entity.
    kind
        The desired kind for this index.  The 'full' kind specifies a
        complete track of the row position (64-bit), while the 'medium',
        'light' or 'ultralight' kinds only specify in which chunk the row is
        (using 32-bit, 16-bit and 8-bit respectively).
    optlevel
        The desired optimization level for this index.
    filters : Filters
        An instance of the Filters class that provides information about the
        desired I/O filters to be applied during the life of this object.
    tmp_dir
        The directory for the temporary files.
    expectedrows
        A user estimate of the number of row slices that will be added to
        the growable dimension in the IndexArray object.
    byteorder
        The byteorder of the index datasets *on-disk*.
    blocksizes
        The four main sizes of the compound blocks in index datasets (a low
        level parameter).

    """

    _c_classid = 'INDEX'

    @property
    def kind(self):
        """The kind of this index."""
        return {1: 'ultralight', 2: 'light',
                4: 'medium', 8: 'full'}[self.indsize]
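
    # Illustrative note (mirroring the mapping used in __init__ below): the
    # kind is just the inverse of the index itemsize, so an index created
    # with kind='full' stores 8-byte row references and reports
    # ``index.kind == 'full'``.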

    @property
    def filters(self):
        """Filter properties for this index - see Filters in
        :ref:`FiltersClassDescr`."""
        return self._v_filters

    @property
    def dirty(self):
        """Whether the index is dirty or not.

        Dirty indexes are out of sync with column data, so they exist but
        they are not usable.

        """

        # If there is no ``DIRTY`` attribute, index should be clean.
        return getattr(self._v_attrs, 'DIRTY', False)

    @dirty.setter
    def dirty(self, dirty):
        wasdirty, isdirty = self.dirty, bool(dirty)
        self._v_attrs.DIRTY = dirty
        # If an *actual* change in dirtiness happens,
        # notify the condition cache by setting or removing a nail.
        conditioncache = self.table._condition_cache
        if not wasdirty and isdirty:
            conditioncache.nail()
        if wasdirty and not isdirty:
            conditioncache.unnail()

    @property
    def column(self):
        """The Column (see :ref:`ColumnClassDescr`) instance for the indexed
        column."""

        tablepath, columnpath = _table_column_pathname_of_index(
            self._v_pathname)
        table = self._v_file._get_node(tablepath)
        column = table.cols._g_col(columnpath)
        return column

    @property
    def table(self):
        """Accessor for the `Table` object of this index."""
        tablepath, columnpath = _table_column_pathname_of_index(
            self._v_pathname)
        table = self._v_file._get_node(tablepath)
        return table

    @property
    def nblockssuperblock(self):
        """The number of blocks in a superblock."""
        return self.superblocksize // self.blocksize

    @property
    def nslicesblock(self):
        """The number of slices in a block."""
        return self.blocksize // self.slicesize

    @property
    def nchunkslice(self):
        """The number of chunks in a slice."""
        return self.slicesize // self.chunksize

    @property
    def nsuperblocks(self):
        """The total number of superblocks in index."""
        # Last row should not be considered as a superblock
        nelements = self.nelements - self.nelementsILR
        nblocks = nelements // self.superblocksize
        # Account for a possible partial superblock at the end
        if nelements % self.superblocksize > 0:
            nblocks += 1
        return nblocks

    @property
    def nblocks(self):
        """The total number of blocks in index."""
        # Last row should not be considered as a block
        nelements = self.nelements - self.nelementsILR
        nblocks = nelements // self.blocksize
        if nelements % self.blocksize > 0:
            nblocks += 1
        return nblocks

    @property
    def nslices(self):
        """The number of complete slices in index."""
        return self.nelements // self.slicesize

    @property
    def nchunks(self):
        """The number of complete chunks in index."""
        return self.nelements // self.chunksize

    @property
    def shape(self):
        """The shape of this index (in slices and elements)."""
        return (self.nrows, self.slicesize)

    @property
    def temp_required(self):
        """Whether a temporary file for indexes is required or not."""
        return (self.indsize > 1 and
                self.optlevel > 0 and
                self.table.nrows > self.slicesize)

    @property
    def want_complete_sort(self):
        """Whether we should try to build a completely sorted index or not."""
        return self.indsize == 8 and self.optlevel == 9

    @property
    def is_csi(self):
        """Whether the index is completely sorted or not.

        .. versionchanged:: 3.0
           The *is_CSI* property has been renamed into *is_csi*.

        """

        if self.nelements == 0:
            # An index with 0 indexed elements is not a CSI one (by definition)
            return False
        if self.indsize < 8:
            # An index that is not full cannot be completely sorted
            return False
        # Try with the 'is_csi' attribute
        if 'is_csi' in self._v_attrs:
            return self._v_attrs.is_csi
        # If not, then compute the overlaps manually
        # (the attribute 'is_csi' will be set there)
        self.compute_overlaps(self, None, False)
        return self.noverlaps == 0
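
    # Sketch of how this is typically exercised from the public API (names
    # taken from the user-level PyTables interface, shown for illustration
    # only):
    #
    #   col = h5file.root.mytable.cols.energy
    #   col.create_csindex()        # equivalent to kind='full', optlevel=9
    #   assert col.index.is_csi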

    @lazyattr
    def nrowsinchunk(self):
        """The number of rows that fit in a *table* chunk."""

        return self.table.chunkshape[0]

    @lazyattr
    def lbucket(self):
        """Return the length of a bucket for this index type."""

        # Avoid setting a too large lbucket size (mainly useful for tests)
        lbucket = min(self.nrowsinchunk, self.chunksize)
        if self.indsize == 1:
            # For ultra-light, we will never have to keep track of a
            # bucket outside of a slice.
            maxnb = 2**8
            if self.slicesize > maxnb * lbucket:
                lbucket = math.ceil(self.slicesize / maxnb)
        elif self.indsize == 2:
            # For light, we will never have to keep track of a
            # bucket outside of a block.
            maxnb = 2**16
            if self.blocksize > maxnb * lbucket:
                lbucket = math.ceil(self.blocksize / maxnb)
        else:
            # For medium and full indexes there should not be a need to
            # increase lbucket
            pass
        return lbucket
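
    # A worked example with illustrative numbers: for an ultra-light index
    # (indsize == 1, so at most maxnb == 256 distinct bucket values per
    # slice), a slicesize of 2**20 with an initial lbucket of 1024 gives
    # 256 * 1024 < 2**20, so lbucket is enlarged to
    # ceil(2**20 / 256) == 4096 rows per bucket.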

    def __init__(self, parentnode, name,
                 atom=None, title="",
                 kind=None,
                 optlevel=None,
                 filters=None,
                 tmp_dir=None,
                 expectedrows=0,
                 byteorder=None,
                 blocksizes=None,
                 new=True):

        self._v_version = None
        """The object version of this index."""
        self.optlevel = optlevel
        """The optimization level for this index."""
        self.tmp_dir = tmp_dir
        """The directory for the temporary files."""
        self.expectedrows = expectedrows
        """The expected number of items of index arrays."""
        if byteorder in ["little", "big"]:
            self.byteorder = byteorder
        else:
            self.byteorder = sys.byteorder
        """The byteorder of the index datasets."""
        if atom is not None:
            self.dtype = atom.dtype.base
            self.type = atom.type
            """The datatypes to be stored by the sorted index array."""
            # ############## Important note ###########################
            # The datatypes saved as index values are NumPy native
            # types, so we get rid of type metainfo like Time* or Enum*
            # that belongs to HDF5 types (actually, this metainfo is
            # not needed for sorting and looking-up purposes).
            # #########################################################
            indsize = {
                'ultralight': 1, 'light': 2, 'medium': 4, 'full': 8}[kind]
            assert indsize in (1, 2, 4, 8), "indsize should be 1, 2, 4 or 8!"
            self.indsize = indsize
            """The itemsize for the indices part of the index."""

        self.nrows = None
        """The total number of slices in the index."""
        self.nelements = None
        """The number of currently indexed rows for this column."""
        self.blocksizes = blocksizes
        """The four main sizes of the compound blocks (if specified)."""
        self.dirtycache = True
        """Dirty cache (for ranges, bounds & sorted) flag."""
        self.superblocksize = None
        """Size of the superblock for this index."""
        self.blocksize = None
        """Size of the block for this index."""
        self.slicesize = None
        """Size of the slice for this index."""
        self.chunksize = None
        """Size of the chunk for this index."""
        self.tmpfilename = None
        """Filename for temporary bounds."""
        self.opt_search_types = opt_search_types
        """The types for which an optimized search has been implemented."""
        self.noverlaps = -1
        """The number of overlaps in an index.  0 means a completely
        sorted index.  -1 means that this number is not computed yet."""
        self.tprof = 0
        """Time counter for benchmarking purposes."""

        from .file import open_file
        self._openFile = open_file
        """The `open_file()` function, to avoid a circular import."""

        super().__init__(parentnode, name, title, new, filters)

    def _g_post_init_hook(self):
        if self._v_new:
            # The version for newly created indexes
            self._v_version = obversion
        super()._g_post_init_hook()

        # Index arrays must only be created for new indexes
        if not self._v_new:
            idxversion = self._v_version
            # Set-up some variables from info on disk and return
            attrs = self._v_attrs
            # Coerce NumPy scalars to Python scalars in order
            # to avoid undesired upcasting operations.
            self.superblocksize = int(attrs.superblocksize)
            self.blocksize = int(attrs.blocksize)
            self.slicesize = int(attrs.slicesize)
            self.chunksize = int(attrs.chunksize)
            self.blocksizes = (self.superblocksize, self.blocksize,
                               self.slicesize, self.chunksize)
            self.optlevel = int(attrs.optlevel)
            sorted = self.sorted
            indices = self.indices
            self.dtype = sorted.atom.dtype
            self.type = sorted.atom.type
            self.indsize = indices.atom.itemsize
            # Some sanity checks for slicesize, chunksize and indsize
            assert self.slicesize == indices.shape[1], "Wrong slicesize"
            assert self.chunksize == indices._v_chunkshape[1], \
                "Wrong chunksize"
            assert self.indsize in (1, 2, 4, 8), "Wrong indices itemsize"
            if idxversion > "2.0":
                self.reduction = int(attrs.reduction)
                nelementsSLR = int(self.sortedLR.attrs.nelements)
                nelementsILR = int(self.indicesLR.attrs.nelements)
            else:
                self.reduction = 1
                nelementsILR = self.indicesLR[-1]
                nelementsSLR = nelementsILR
            self.nrows = sorted.nrows
            self.nelements = self.nrows * self.slicesize + nelementsILR
            self.nelementsSLR = nelementsSLR
            self.nelementsILR = nelementsILR
            if nelementsILR > 0:
                self.nrows += 1
            # Get the bounds as a cache (this has to remain here!)
            rchunksize = self.chunksize // self.reduction
            nboundsLR = (nelementsSLR - 1) // rchunksize
            if nboundsLR < 0:
                nboundsLR = 0  # correction for -1 bounds
            nboundsLR += 2  # bounds + begin + end
            # All bounds values (+begin + end) are at the end of sortedLR
            self.bebounds = self.sortedLR[
                nelementsSLR:nelementsSLR + nboundsLR]
            return

        # The index is new.  Initialize the values
        self.nrows = 0
        self.nelements = 0
        self.nelementsSLR = 0
        self.nelementsILR = 0

        # The atom
        atom = Atom.from_dtype(self.dtype)

        # The filters
        filters = self.filters

        # Compute the superblocksize, blocksize, slicesize and chunksize
        # values (in case these parameters haven't been passed to the
        # constructor)
        if self.blocksizes is None:
            self.blocksizes = calc_chunksize(
                self.expectedrows, self.optlevel, self.indsize, node=self)
        (self.superblocksize, self.blocksize,
         self.slicesize, self.chunksize) = self.blocksizes
        if debug:
            print("blocksizes:", self.blocksizes)
        # Compute the reduction level
        self.reduction = get_reduction_level(
            self.indsize, self.optlevel, self.slicesize, self.chunksize)
        rchunksize = self.chunksize // self.reduction
        rslicesize = self.slicesize // self.reduction

        # Save them on disk as attributes
        self._v_attrs.superblocksize = np.uint64(self.superblocksize)
        self._v_attrs.blocksize = np.uint64(self.blocksize)
        self._v_attrs.slicesize = np.uint32(self.slicesize)
        self._v_attrs.chunksize = np.uint32(self.chunksize)
        # Save the optlevel as well
        self._v_attrs.optlevel = self.optlevel
        # Save the reduction level
        self._v_attrs.reduction = self.reduction

        # Create the IndexArray for sorted values
        sorted = IndexArray(self, 'sorted', atom, "Sorted Values",
                            filters, self.byteorder)

        # Create the IndexArray for index values
        IndexArray(self, 'indices', UIntAtom(itemsize=self.indsize),
                   "Number of chunk in table", filters, self.byteorder)

        # Create the cache for range values (1st order cache)
        CacheArray(self, 'ranges', atom, (0, 2), "Range Values", filters,
                   self.expectedrows // self.slicesize,
                   byteorder=self.byteorder)
        # median ranges
        EArray(self, 'mranges', atom, (0,), "Median ranges", filters,
               byteorder=self.byteorder, _log=False)

        # Create the cache for boundary values (2nd order cache)
        nbounds_inslice = (rslicesize - 1) // rchunksize
        CacheArray(self, 'bounds', atom, (0, nbounds_inslice),
                   "Boundary Values", filters, self.nchunks,
                   (1, nbounds_inslice), byteorder=self.byteorder)

        # begin, end & median bounds (only for numerical types)
        EArray(self, 'abounds', atom, (0,), "Start bounds", filters,
               byteorder=self.byteorder, _log=False)
        EArray(self, 'zbounds', atom, (0,), "End bounds", filters,
               byteorder=self.byteorder, _log=False)
        EArray(self, 'mbounds', atom, (0,), "Median bounds", filters,
               byteorder=self.byteorder, _log=False)

        # Create the Array for last (sorted) row values + bounds
        shape = (rslicesize + 2 + nbounds_inslice,)
        sortedLR = LastRowArray(self, 'sortedLR', atom, shape,
                                "Last Row sorted values + bounds",
                                filters, (rchunksize,),
                                byteorder=self.byteorder)

        # Create the Array for the number of chunks in the last row
        shape = (self.slicesize,)  # enough for indexes and length
        indicesLR = LastRowArray(self, 'indicesLR',
                                 UIntAtom(itemsize=self.indsize),
                                 shape, "Last Row indices",
                                 filters, (self.chunksize,),
                                 byteorder=self.byteorder)

        # The number of elements in LR will be initialized here
        sortedLR.attrs.nelements = 0
        indicesLR.attrs.nelements = 0

        # All bounds values (+begin + end) are uninitialized at creation time
        self.bebounds = None

        # The starts and lengths initialization
        self.starts = np.empty(shape=self.nrows, dtype=np.int32)
        """Where the values fulfilling conditions start for every slice."""
        self.lengths = np.empty(shape=self.nrows, dtype=np.int32)
        """Lengths of the values fulfilling conditions for every slice."""

        # Finally, create a temporary file for indexes if needed
        if self.temp_required:
            self.create_temp()

    def initial_append(self, xarr, nrow, reduction):
        """Compute the initial indices arrays for data to be indexed."""

        if profile:
            tref = clock()
        if profile:
            show_stats("Entering initial_append", tref)
        arr = xarr.pop()
        indsize = self.indsize
        slicesize = self.slicesize
        nelementsILR = self.nelementsILR
        if profile:
            show_stats("Before creating idx", tref)
        if indsize == 8:
            idx = np.arange(0, len(arr), dtype="uint64") + nrow * slicesize
        elif indsize == 4:
            # For medium (32-bit) all the rows in tables should be
            # directly reachable.  But as len(arr) < 2**31, we can
            # choose uint32 for representing indices.  In this way, we
            # consume far less memory during the keysort process.  The
            # offset will be added in self.final_idx32() later on.
            #
            # This optimization also prevents the values in LR from
            # participating in the ``swap_chunks`` process, and this is
            # the main reason to not allow the medium indexes to create
            # completely sorted indexes.  However, I don't find this to
            # be a big limitation, as probably full indexes are much
            # more suitable for producing completely sorted indexes
            # because in this case the indices part is usable for
            # getting the reverse indices of the index, and I foresee
            # this to be a common requirement in many operations (for
            # example, in table sorts).
            #
            # F. Alted 2008-09-15
            idx = np.arange(0, len(arr), dtype="uint32")
        else:
            idx = np.empty(len(arr), "uint%d" % (indsize * 8))
            lbucket = self.lbucket
            # Fill the idx with the bucket indices
            offset = int(lbucket - ((nrow * (slicesize % lbucket)) % lbucket))
            idx[0:offset] = 0
            for i in range(offset, slicesize, lbucket):
                idx[i:i + lbucket] = (i + lbucket - 1) // lbucket
            if indsize == 2:
                # Add a second offset in this case
                # First normalize the number of rows
                offset2 = (nrow % self.nslicesblock) * slicesize // lbucket
                idx += offset2
        # Add the last row at the beginning of arr & idx (if needed)
        if (indsize == 8 and nelementsILR > 0):
            # It is possible that the values in LR are already sorted.
            # Fetch them and override existing values in arr and idx.
            assert len(arr) > nelementsILR
            self.read_slice_lr(self.sortedLR, arr[:nelementsILR])
            self.read_slice_lr(self.indicesLR, idx[:nelementsILR])
        # In-place sorting
        if profile:
            show_stats("Before keysort", tref)
        indexesextension.keysort(arr, idx)
        larr = arr[-1]
        if reduction > 1:
            # It's important to do a copy() here in order to ensure that
            # sorted._append() will receive a contiguous array.
            if profile:
                show_stats("Before reduction", tref)
            reduc = arr[::reduction].copy()
            if profile:
                show_stats("After reduction", tref)
            arr = reduc
            if profile:
                show_stats("After arr <-- reduc", tref)
        # A completely sorted index is no longer possible after an
        # append to an index that already has one slice.
        if nrow > 0:
            self._v_attrs.is_csi = False
        if profile:
            show_stats("Exiting initial_append", tref)
        return larr, arr, idx
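
    # A worked example of the bucket-filling branch above, with illustrative
    # numbers: for an ultra-light/light index with lbucket == 4,
    # slicesize == 12 and nrow == 0, the offset is 4, so idx becomes
    # [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]: each entry records the bucket
    # (chunk group) its row belongs to rather than the exact row number.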

    def final_idx32(self, idx, offset):
        """Perform final operations on 32-bit indices."""

        if profile:
            tref = clock()
        if profile:
            show_stats("Entering final_idx32", tref)
        # Do an upcast first in order to add the offset.
        idx = idx.astype('uint64')
        idx += offset
        # The next partition is valid up to table sizes of
        # 2**30 * 2**18 = 2**48 bytes, that is, 256 Tera-elements,
        # which should be a safe figure, at least for a while.
        idx //= self.lbucket
        # After the division, we can downsize the indexes to 'uint32'
        idx = idx.astype('uint32')
        if profile:
            show_stats("Exiting final_idx32", tref)
        return idx
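
    # Worked arithmetic with illustrative numbers: a local index value of 7
    # in slice 3 with slicesize == 1000 and lbucket == 10 becomes
    # (7 + 3 * 1000) // 10 == 300 after the upcast/offset/division steps,
    # which safely fits back into uint32.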

    def append(self, xarr, update=False):
        """Append the array to the index objects."""

        if profile:
            tref = clock()
        if profile:
            show_stats("Entering append", tref)
        if not update and self.temp_required:
            where = self.tmp
            # The reduction will take place *after* the optimization process
            reduction = 1
        else:
            where = self
            reduction = self.reduction
        sorted = where.sorted
        indices = where.indices
        ranges = where.ranges
        mranges = where.mranges
        bounds = where.bounds
        mbounds = where.mbounds
        abounds = where.abounds
        zbounds = where.zbounds
        sortedLR = where.sortedLR
        indicesLR = where.indicesLR
        nrows = sorted.nrows  # before sorted.append()
        larr, arr, idx = self.initial_append(xarr, nrows, reduction)
        # Save the sorted array
        sorted.append(arr.reshape(1, arr.size))
        cs = self.chunksize // reduction
        ncs = self.nchunkslice
        # Save ranges & bounds
        ranges.append([[arr[0], larr]])
        bounds.append([arr[cs::cs]])
        abounds.append(arr[0::cs])
        zbounds.append(arr[cs - 1::cs])
        # Compute the medians
        smedian = arr[cs // 2::cs]
        mbounds.append(smedian)
        mranges.append([smedian[ncs // 2]])
        if profile:
            show_stats("Before deleting arr & smedian", tref)
        del arr, smedian  # delete references
        if profile:
            show_stats("After deleting arr & smedian", tref)
        # Now that arr is gone, we can upcast the indices and add the offset
        if self.indsize == 4:
            idx = self.final_idx32(idx, nrows * self.slicesize)
        indices.append(idx.reshape(1, idx.size))
        if profile:
            show_stats("Before deleting idx", tref)
        del idx
        # Update counters after a successful append
        self.nrows = nrows + 1
        self.nelements = self.nrows * self.slicesize
        self.nelementsSLR = 0  # reset the counter of the last row index to 0
        self.nelementsILR = 0  # reset the counter of the last row index to 0
        # The number of elements will be saved as an attribute.
        # This is necessary so that the LR arrays can remember their values
        # after a possible node preemption/reload.
        sortedLR.attrs.nelements = self.nelementsSLR
        indicesLR.attrs.nelements = self.nelementsILR
        self.dirtycache = True  # the cache is dirty now
        if profile:
            show_stats("Exiting append", tref)

    def append_last_row(self, xarr, update=False):
        """Append the array to the last row index objects."""

        if profile:
            tref = clock()
        if profile:
            show_stats("Entering appendLR", tref)
        # compute the elements in the last row sorted & bounds array
        nrows = self.nslices
        if not update and self.temp_required:
            where = self.tmp
            # The reduction will take place *after* the optimization process
            reduction = 1
        else:
            where = self
            reduction = self.reduction
        indicesLR = where.indicesLR
        sortedLR = where.sortedLR
        larr, arr, idx = self.initial_append(xarr, nrows, reduction)
        nelementsSLR = len(arr)
        nelementsILR = len(idx)
        # Build the cache of bounds
        rchunksize = self.chunksize // reduction
        self.bebounds = np.concatenate((arr[::rchunksize], [larr]))
        # The number of elements will be saved as an attribute
        sortedLR.attrs.nelements = nelementsSLR
        indicesLR.attrs.nelements = nelementsILR
        # Save the number of elements, bounds and sorted values
        # at the end of the sorted array
        offset2 = len(self.bebounds)
        sortedLR[nelementsSLR:nelementsSLR + offset2] = self.bebounds
        sortedLR[:nelementsSLR] = arr
        del arr
        # Now that arr is gone, we can upcast the indices and add the offset
        if self.indsize == 4:
            idx = self.final_idx32(idx, nrows * self.slicesize)
        # Save the reverse index array
        indicesLR[:len(idx)] = idx
        del idx
        # Update counters after a successful append
        self.nrows = nrows + 1
        self.nelements = nrows * self.slicesize + nelementsILR
        self.nelementsILR = nelementsILR
        self.nelementsSLR = nelementsSLR
        self.dirtycache = True  # the cache is dirty now
        if profile:
            show_stats("Exiting appendLR", tref)

    def optimize(self, verbose=False):
        """Optimize an index so as to allow faster searches.

        verbose
            If True, messages about the progress of the
            optimization process are printed out.

        """

        if not self.temp_required:
            return

        if verbose:
            self.verbose = True
        else:
            self.verbose = debug

        # Initialize last_tover and last_nover
        self.last_tover = 0
        self.last_nover = 0

        # Compute the correct optimizations for current optim level
        opts = calcoptlevels(self.nblocks, self.optlevel, self.indsize)
        optmedian, optstarts, optstops, optfull = opts

        if debug:
            print("optvalues:", opts)

        self.create_temp2()
        # Start the optimization process
        while True:
            if optfull:
                for niter in range(optfull):
                    if self.swap('chunks', 'median'):
                        break
                    if self.nblocks > 1:
                        # Swap slices only in the case that we have
                        # several blocks
                        if self.swap('slices', 'median'):
                            break
                        if self.swap('chunks', 'median'):
                            break
                    if self.swap('chunks', 'start'):
                        break
                    if self.swap('chunks', 'stop'):
                        break
            else:
                if optmedian:
                    if self.swap('chunks', 'median'):
                        break
                if optstarts:
                    if self.swap('chunks', 'start'):
                        break
                if optstops:
                    if self.swap('chunks', 'stop'):
                        break
            break  # If we reach this, exit the loop

        # Check if we require a complete sort.  Important: this step
        # should be carried out *after* the optimization process has
        # been completed (this is to guarantee that the complete sort
        # does not take too much memory).
        if self.want_complete_sort:
            if self.noverlaps > 0:
                self.do_complete_sort()
            # Check that we have effectively achieved the complete sort
            if self.noverlaps > 0:
                warnings.warn(
                    "OPSI was not able to achieve a completely sorted index."
                    " Please report this to the authors.", UserWarning)

        # Close and delete the temporary optimization index file
        self.cleanup_temp()
        return

    def do_complete_sort(self):
        """Bring an already optimized index into a completely sorted state."""

        if self.verbose:
            t1 = clock()
            c1 = cpuclock()
        ss = self.slicesize
        tmp = self.tmp
        ranges = tmp.ranges[:]
        nslices = self.nslices

        nelementsLR = self.nelementsILR
        if nelementsLR > 0:
            # Add the ranges corresponding to the last row
            rangeslr = np.array([self.bebounds[0], self.bebounds[-1]])
            ranges = np.concatenate((ranges, [rangeslr]))
            nslices += 1

        sorted = tmp.sorted
        indices = tmp.indices
        sortedLR = tmp.sortedLR
        indicesLR = tmp.indicesLR
        sremain = np.array([], dtype=self.dtype)
        iremain = np.array([], dtype='u%d' % self.indsize)
        starts = np.zeros(shape=nslices, dtype=np.int_)
        for i in range(nslices):
            # Find the overlapping elements for slice i
            sover = np.array([], dtype=self.dtype)
            iover = np.array([], dtype='u%d' % self.indsize)
            prev_end = ranges[i, 1]
            for j in range(i + 1, nslices):
                stj = starts[j]
                if ((j < self.nslices and stj == ss) or
                        (j == self.nslices and stj == nelementsLR)):
                    # This slice has already been dealt with
                    continue
                if j < self.nslices:
                    assert stj < ss, \
                        "Two slices cannot overlap completely at this stage!"
                    next_beg = sorted[j, stj]
                else:
                    assert stj < nelementsLR, \
                        "Two slices cannot overlap completely at this stage!"
                    next_beg = sortedLR[stj]
                next_end = ranges[j, 1]
                if prev_end > next_end:
                    # Complete overlapping case
                    if j < self.nslices:
                        sover = np.concatenate((sover, sorted[j, stj:]))
                        iover = np.concatenate((iover, indices[j, stj:]))
                        starts[j] = ss
                    else:
                        n = nelementsLR
                        sover = np.concatenate((sover, sortedLR[stj:n]))
                        iover = np.concatenate((iover, indicesLR[stj:n]))
                        starts[j] = nelementsLR
                elif prev_end > next_beg:
                    idx = self.search_item_lt(tmp, prev_end, j, ranges[j], stj)
                    if j < self.nslices:
                        sover = np.concatenate((sover, sorted[j, stj:idx]))
                        iover = np.concatenate((iover, indices[j, stj:idx]))
                    else:
                        sover = np.concatenate((sover, sortedLR[stj:idx]))
                        iover = np.concatenate((iover, indicesLR[stj:idx]))
                    starts[j] = idx
            # Build the extended slices to sort out
            if i < self.nslices:
                ssorted = np.concatenate(
                    (sremain, sorted[i, starts[i]:], sover))
                sindices = np.concatenate(
                    (iremain, indices[i, starts[i]:], iover))
            else:
                ssorted = np.concatenate(
                    (sremain, sortedLR[starts[i]:nelementsLR], sover))
                sindices = np.concatenate(
                    (iremain, indicesLR[starts[i]:nelementsLR], iover))
            # Sort the extended slices
            indexesextension.keysort(ssorted, sindices)
            # Save the first elements of extended slices in the slice i
            if i < self.nslices:
                sorted[i] = ssorted[:ss]
                indices[i] = sindices[:ss]
                # Update caches for this slice
                self.update_caches(i, ssorted[:ss])
                # Save the remaining values in a separate array
                send = len(sover) + len(sremain)
                sremain = ssorted[ss:ss + send]
                iremain = sindices[ss:ss + send]
            else:
                # Still some elements remain for the last row
                n = len(ssorted)
                assert n == nelementsLR
                send = 0
                sortedLR[:n] = ssorted
                indicesLR[:n] = sindices
                # Update the caches for last row
                sortedlr = sortedLR[:nelementsLR]
                bebounds = np.concatenate(
                    (sortedlr[::self.chunksize], [sortedlr[-1]]))
                sortedLR[nelementsLR:nelementsLR + len(bebounds)] = bebounds
                self.bebounds = bebounds

        # Verify that we have dealt with all the remaining values
        assert send == 0

        # Compute the overlaps in order to verify that we have achieved
        # a complete sort.  This has to be executed always (and not only
        # in verbose mode!).
        self.compute_overlaps(self.tmp, "do_complete_sort()", self.verbose)
        if self.verbose:
            print(f"time: {clock() - t1:.4f}. clock: {cpuclock() - c1:.4f}")

    def swap(self, what, mode=None):
        """Swap chunks or slices using a certain bounds reference."""

        # Thresholds for avoiding continuing the optimization
        # thnover = 4 * self.slicesize  # minimum number of overlapping
        #                               # elements
        thnover = 40
        thmult = 0.1      # minimum ratio of multiplicity (a 10%)
        thtover = 0.01    # minimum overlapping index for slices (a 1%)

        if self.verbose:
            t1 = clock()
            c1 = cpuclock()
        if what == "chunks":
            self.swap_chunks(mode)
        elif what == "slices":
            self.swap_slices(mode)
        if mode:
            message = f"swap_{what}({mode})"
        else:
            message = f"swap_{what}"
        (nover, mult, tover) = self.compute_overlaps(
            self.tmp, message, self.verbose)
        rmult = len(mult.nonzero()[0]) / len(mult)
        if self.verbose:
            print(f"time: {clock() - t1:.4f}. clock: {cpuclock() - c1:.4f}")
        # Check that entropy is actually decreasing
        if what == "chunks" and self.last_tover > 0 and self.last_nover > 0:
            tover_var = (self.last_tover - tover) / self.last_tover
            nover_var = (self.last_nover - nover) / self.last_nover
            if tover_var < 0.05 and nover_var < 0.05:
                # Less than a 5% improvement is too little
                return True
        self.last_tover = tover
        self.last_nover = nover
        # Check whether some threshold has been met
        if nover < thnover:
            return True
        if rmult < thmult:
            return True
        # Additional check for the overlap ratio
        if 0 <= tover < thtover:
            return True
        return False

    def create_temp(self):
        """Create some temporary objects for slice sorting purposes."""

        # The index will be dirty during the index optimization process
        self.dirty = True
        # Build the name of the temporary file
        fd, self.tmpfilename = tempfile.mkstemp(
            ".tmp", "pytables-", self.tmp_dir)
        # Close the file descriptor so as to avoid leaks
        os.close(fd)
        # Create the proper PyTables file
        self.tmpfile = self._openFile(self.tmpfilename, "w")
        self.tmp = tmp = self.tmpfile.root
        cs = self.chunksize
        ss = self.slicesize
        filters = self.filters
        # temporary sorted & indices arrays
        shape = (0, ss)
        atom = Atom.from_dtype(self.dtype)
        EArray(tmp, 'sorted', atom, shape,
               "Temporary sorted", filters, chunkshape=(1, cs))
        EArray(tmp, 'indices', UIntAtom(itemsize=self.indsize), shape,
               "Temporary indices", filters, chunkshape=(1, cs))
        # temporary bounds
        nbounds_inslice = (ss - 1) // cs
        shape = (0, nbounds_inslice)
        EArray(tmp, 'bounds', atom, shape, "Temp chunk bounds",
               filters, chunkshape=(cs, nbounds_inslice))
        shape = (0,)
        EArray(tmp, 'abounds', atom, shape, "Temp start bounds",
               filters, chunkshape=(cs,))
        EArray(tmp, 'zbounds', atom, shape, "Temp end bounds",
               filters, chunkshape=(cs,))
        EArray(tmp, 'mbounds', atom, shape, "Median bounds",
               filters, chunkshape=(cs,))
        # temporary ranges
        EArray(tmp, 'ranges', atom, (0, 2),
               "Temporary range values", filters, chunkshape=(cs, 2))
        EArray(tmp, 'mranges', atom, (0,),
               "Median ranges", filters, chunkshape=(cs,))
        # temporary last row (sorted)
        shape = (ss + 2 + nbounds_inslice,)
        CArray(tmp, 'sortedLR', atom, shape,
               "Temp Last Row sorted values + bounds",
               filters, chunkshape=(cs,))
        # temporary last row (indices)
        shape = (ss,)
        CArray(tmp, 'indicesLR',
               UIntAtom(itemsize=self.indsize),
               shape, "Temp Last Row indices",
               filters, chunkshape=(cs,))

    def create_temp2(self):
        """Create some temporary objects for slice sorting purposes."""

        # The algorithms for doing the swap can be optimized so that
        # it would only be necessary to create temporaries for keeping
        # the contents of a single superblock.
        # F. Alted 2007-01-03
        cs = self.chunksize
        ss = self.slicesize
        filters = self.filters
        # temporary sorted & indices arrays
        shape = (self.nslices, ss)
        atom = Atom.from_dtype(self.dtype)
        tmp = self.tmp
        CArray(tmp, 'sorted2', atom, shape,
               "Temporary sorted 2", filters, chunkshape=(1, cs))
        CArray(tmp, 'indices2', UIntAtom(itemsize=self.indsize), shape,
               "Temporary indices 2", filters, chunkshape=(1, cs))
        # temporary bounds
        nbounds_inslice = (ss - 1) // cs
        shape = (self.nslices, nbounds_inslice)
        CArray(tmp, 'bounds2', atom, shape, "Temp chunk bounds 2",
               filters, chunkshape=(cs, nbounds_inslice))
        shape = (self.nchunks,)
        CArray(tmp, 'abounds2', atom, shape, "Temp start bounds 2",
               filters, chunkshape=(cs,))
        CArray(tmp, 'zbounds2', atom, shape, "Temp end bounds 2",
               filters, chunkshape=(cs,))
        CArray(tmp, 'mbounds2', atom, shape, "Median bounds 2",
               filters, chunkshape=(cs,))
        # temporary ranges
        CArray(tmp, 'ranges2', atom, (self.nslices, 2),
               "Temporary range values 2", filters, chunkshape=(cs, 2))
        CArray(tmp, 'mranges2', atom, (self.nslices,),
               "Median ranges 2", filters, chunkshape=(cs,))

    def cleanup_temp(self):
        """Copy the data and delete the temporaries for sorting purposes."""

        if self.verbose:
            print("Copying temporary data...")
        # tmp -> index
        reduction = self.reduction
        cs = self.chunksize // reduction
        ncs = self.nchunkslice
        tmp = self.tmp
        for i in range(self.nslices):
            # Copy sorted & indices slices
            sorted = tmp.sorted[i][::reduction].copy()
            self.sorted.append(sorted.reshape(1, sorted.size))
            # Compute ranges
            self.ranges.append([[sorted[0], sorted[-1]]])
            # Compute chunk bounds
            self.bounds.append([sorted[cs::cs]])
            # Compute start, stop & median bounds and ranges
            self.abounds.append(sorted[0::cs])
            self.zbounds.append(sorted[cs - 1::cs])
            smedian = sorted[cs // 2::cs]
            self.mbounds.append(smedian)
            self.mranges.append([smedian[ncs // 2]])
            del sorted, smedian  # delete references
            # Now that sorted is gone, we can copy the indices
            indices = tmp.indices[i]
            self.indices.append(indices.reshape(1, indices.size))

        # Now it is the turn of the last row (if needed)
        if self.nelementsSLR > 0:
            # First, the sorted values
            sortedLR = self.sortedLR
            indicesLR = self.indicesLR
            nelementsLR = self.nelementsILR
            sortedlr = tmp.sortedLR[:nelementsLR][::reduction].copy()
            nelementsSLR = len(sortedlr)
            sortedLR[:nelementsSLR] = sortedlr
            # Now, the bounds
            self.bebounds = np.concatenate((sortedlr[::cs], [sortedlr[-1]]))
            offset2 = len(self.bebounds)
            sortedLR[nelementsSLR:nelementsSLR + offset2] = self.bebounds
            # Finally, the indices
            indicesLR[:] = tmp.indicesLR[:]
            # Update the number of (reduced) sorted elements
            self.nelementsSLR = nelementsSLR
            # The number of elements will be saved as an attribute
            self.sortedLR.attrs.nelements = self.nelementsSLR
            self.indicesLR.attrs.nelements = self.nelementsILR

        if self.verbose:
            print("Deleting temporaries...")
        self.tmp = None
        self.tmpfile.close()
        Path(self.tmpfilename).unlink()
        self.tmpfilename = None

        # The optimization process has finished, and the index is ok now
        self.dirty = False
        # ...but the memory data cache is dirty now
        self.dirtycache = True

    def get_neworder(self, neworder, src_disk, tmp_disk,
                     lastrow, nslices, offset, dtype):
        """Get sorted & indices values in new order."""

        cs = self.chunksize
        ncs = ncs2 = self.nchunkslice
        self_nslices = self.nslices
        tmp = np.empty(shape=self.slicesize, dtype=dtype)
        for i in range(nslices):
            ns = offset + i
            if ns == self_nslices:
                # The number of complete chunks in the last row
                ncs2 = self.nelementsILR // cs
            # Get slices in new order
            for j in range(ncs2):
                idx = neworder[i * ncs + j]
                ins = idx // ncs
                inc = (idx - ins * ncs) * cs
                ins += offset
                nc = j * cs
                if ins == self_nslices:
                    tmp[nc:nc + cs] = lastrow[inc:inc + cs]
                else:
                    tmp[nc:nc + cs] = src_disk[ins, inc:inc + cs]
            if ns == self_nslices:
                # Copy the complete chunks back to the last row
                lastrow[:ncs2 * cs] = tmp[:ncs2 * cs]
                # The elements in the last chunk of the last row will
                # participate in the global reordering later on, during
                # the phase of sorting of *two* slices at a time
                # (including the last row slice, see
                # self.reorder_slices()).  The caches for last row will
                # be updated in self.reorder_slices() too.
                # F. Altet 2008-08-25
            else:
                tmp_disk[ns] = tmp

    def swap_chunks(self, mode="median"):
        """Swap & reorder the different chunks in a block."""

        boundsnames = {
            'start': 'abounds', 'stop': 'zbounds', 'median': 'mbounds'}
        tmp = self.tmp
        sorted = tmp.sorted
        indices = tmp.indices
        tmp_sorted = tmp.sorted2
        tmp_indices = tmp.indices2
        sortedLR = tmp.sortedLR
        indicesLR = tmp.indicesLR
        cs = self.chunksize
        ncs = self.nchunkslice
        nsb = self.nslicesblock
        ncb = ncs * nsb
        ncb2 = ncb
        boundsobj = tmp._f_get_child(boundsnames[mode])
        can_cross_bbounds = (self.indsize == 8 and self.nelementsILR > 0)
        for nblock in range(self.nblocks):
            # Protection for last block having fewer chunks than ncb
            remainingchunks = self.nchunks - nblock * ncb
            if remainingchunks < ncb:
                ncb2 = remainingchunks
            if ncb2 <= 1:
                # if zero or one chunks remain we are done
                break
            nslices = ncb2 // ncs
            bounds = boundsobj[nblock * ncb:nblock * ncb + ncb2]
            # Do this only if lastrow elements can cross block boundaries
            if (nblock == self.nblocks - 1 and  # last block
                    can_cross_bbounds):
                nslices += 1
                ul = self.nelementsILR // cs
                bounds = np.concatenate((bounds, self.bebounds[:ul]))
            sbounds_idx = bounds.argsort(kind=defsort)
            offset = int(nblock * nsb)
            # Swap sorted and indices following the new order
            self.get_neworder(sbounds_idx, sorted, tmp_sorted, sortedLR,
                              nslices, offset, self.dtype)
            self.get_neworder(sbounds_idx, indices, tmp_indices, indicesLR,
                              nslices, offset, 'u%d' % self.indsize)
        # Reorder completely the index at slice level
        self.reorder_slices(tmp=True)

    def read_slice(self, where, nslice, buffer, start=0):
        """Read a slice from the `where` dataset and put it in `buffer`."""

        # Create the buffers for specifying the coordinates
        self.startl = np.array([nslice, start], np.uint64)
        self.stopl = np.array([nslice + 1, start + buffer.size], np.uint64)
        self.stepl = np.ones(shape=2, dtype=np.uint64)
        where._g_read_slice(self.startl, self.stopl, self.stepl, buffer)

    def write_slice(self, where, nslice, buffer, start=0):
        """Write a `slice` to the `where` dataset with the `buffer` data."""

        self.startl = np.array([nslice, start], np.uint64)
        self.stopl = np.array([nslice + 1, start + buffer.size], np.uint64)
        self.stepl = np.ones(shape=2, dtype=np.uint64)
        countl = self.stopl - self.startl  # (1, self.slicesize)
        where._g_write_slice(self.startl, self.stepl, countl, buffer)

    # Read version for LastRow
    def read_slice_lr(self, where, buffer, start=0):
        """Read a slice from the `where` dataset and put it in `buffer`."""

        startl = np.array([start], dtype=np.uint64)
        stopl = np.array([start + buffer.size], dtype=np.uint64)
        stepl = np.array([1], dtype=np.uint64)
        where._g_read_slice(startl, stopl, stepl, buffer)

    # Write version for LastRow
    def write_sliceLR(self, where, buffer, start=0):
        """Write a slice to the `where` dataset with the `buffer` data."""

        startl = np.array([start], dtype=np.uint64)
        countl = np.array([start + buffer.size], dtype=np.uint64)
        stepl = np.array([1], dtype=np.uint64)
        where._g_write_slice(startl, stepl, countl, buffer)

    def reorder_slice(self, nslice, sorted, indices, ssorted, sindices,
                      tmp_sorted, tmp_indices):
        """Copy & reorder the slice in source to final destination."""

        ss = self.slicesize
        # Load the second part in buffers
        self.read_slice(tmp_sorted, nslice, ssorted[ss:])
        self.read_slice(tmp_indices, nslice, sindices[ss:])
        indexesextension.keysort(ssorted, sindices)
        # Write the first part of the buffers to the regular leaves
        self.write_slice(sorted, nslice - 1, ssorted[:ss])
        self.write_slice(indices, nslice - 1, sindices[:ss])
        # Update caches
        self.update_caches(nslice - 1, ssorted[:ss])
        # Shift the slice in the end to the beginning
        ssorted[:ss] = ssorted[ss:]
        sindices[:ss] = sindices[ss:]

    def update_caches(self, nslice, ssorted):
        """Update the caches for faster lookups."""

        cs = self.chunksize
        ncs = self.nchunkslice
        tmp = self.tmp
        # update first & second cache bounds (ranges & bounds)
        tmp.ranges[nslice] = ssorted[[0, -1]]
        tmp.bounds[nslice] = ssorted[cs::cs]
        # update start & stop bounds
        tmp.abounds[nslice * ncs:(nslice + 1) * ncs] = ssorted[0::cs]
        tmp.zbounds[nslice * ncs:(nslice + 1) * ncs] = ssorted[cs - 1::cs]
        # update median bounds
        smedian = ssorted[cs // 2::cs]
        tmp.mbounds[nslice * ncs:(nslice + 1) * ncs] = smedian
        tmp.mranges[nslice] = smedian[ncs // 2]
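
    # A worked example with illustrative numbers: for cs == 2, ncs == 3 and
    # ssorted == [10, 20, 30, 40, 50, 60], the caches become
    # ranges -> [10, 60], bounds -> [30, 50], abounds -> [10, 30, 50],
    # zbounds -> [20, 40, 60], mbounds -> [20, 40, 60], and the slice
    # median stored in mranges -> 40.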

    def reorder_slices(self, tmp):
        """Reorder completely the index at slice level.

        This method has to maintain the locality of elements within the
        ambit of ``blocks``, i.e. an element of a ``block`` cannot be
        sent to another ``block`` during this reordering.  This is
        *critical* for ``light`` indexes to be able to use this.

        This version of reorder_slices is optimized in that *two*
        complete slices are taken at a time (including the last row
        slice) so as to sort them.  Then, each new slice that is read is
        put at the end of this two-slice buffer, while the previous one
        is moved to the beginning of the buffer.  This is in order to
        better reduce the entropy of the regular part (i.e. all except
        the last row) of the index.

        A secondary effect of this is that it takes at least *twice*
        the memory of a previous version of reorder_slices() that only
        reordered on a slice-by-slice basis.  However, as this is more
        efficient than the old version, one can configure the slicesize
        to be smaller, so the memory consumption stays roughly similar.

        """

        use_temporaries = tmp  # keep the flag; ``tmp`` is rebound just below
        tmp = self.tmp
        sorted = tmp.sorted
        indices = tmp.indices
        if use_temporaries:
            tmp_sorted = tmp.sorted2
            tmp_indices = tmp.indices2
        else:
            tmp_sorted = tmp.sorted
            tmp_indices = tmp.indices
        cs = self.chunksize
        ss = self.slicesize
        nsb = self.blocksize // self.slicesize
        nslices = self.nslices
        nblocks = self.nblocks
        nelementsLR = self.nelementsILR
        # Create the buffer for reordering 2 slices at a time
        ssorted = np.empty(shape=ss * 2, dtype=self.dtype)
        sindices = np.empty(shape=ss * 2, dtype=np.dtype('u%d' % self.indsize))

        if self.indsize == 8:
            # Bootstrap the process for reordering
            # Read the first slice in buffers
            self.read_slice(tmp_sorted, 0, ssorted[:ss])
            self.read_slice(tmp_indices, 0, sindices[:ss])

            nslice = 0  # Just in case the loop below executes nothing
            # Loop over the remaining slices in block
            for nslice in range(1, sorted.nrows):
                self.reorder_slice(nslice, sorted, indices,
                                   ssorted, sindices,
                                   tmp_sorted, tmp_indices)

            # End the process (enrolling the lastrow if necessary)
            if nelementsLR > 0:
                sortedLR = self.tmp.sortedLR
                indicesLR = self.tmp.indicesLR
                # Shrink the ssorted and sindices arrays to the minimum
                ssorted2 = ssorted[:ss + nelementsLR]
                sortedlr = ssorted2[ss:]
                sindices2 = sindices[:ss + nelementsLR]
                indiceslr = sindices2[ss:]
                # Read the last row info in the second part of the buffer
                self.read_slice_lr(sortedLR, sortedlr)
                self.read_slice_lr(indicesLR, indiceslr)
                indexesextension.keysort(ssorted2, sindices2)
                # Write the second part of the buffers to the lastrow indices
                self.write_sliceLR(sortedLR, sortedlr)
                self.write_sliceLR(indicesLR, indiceslr)
                # Update the caches for last row
                bebounds = np.concatenate((sortedlr[::cs], [sortedlr[-1]]))
                sortedLR[nelementsLR:nelementsLR + len(bebounds)] = bebounds
                self.bebounds = bebounds
            # Write the first part of the buffers to the regular leaves
            self.write_slice(sorted, nslice, ssorted[:ss])
            self.write_slice(indices, nslice, sindices[:ss])
            # Update caches for this slice
            self.update_caches(nslice, ssorted[:ss])
        else:
            # Iterate over each block.  No data should cross block
            # boundaries to avoid addressing problems with short indices.
            for nb in range(nblocks):
                # Bootstrap the process for reordering
                # Read the first slice in buffers
                nrow = nb * nsb
                self.read_slice(tmp_sorted, nrow, ssorted[:ss])
                self.read_slice(tmp_indices, nrow, sindices[:ss])

                # Loop over the remaining slices in block
                lrb = nrow + nsb
                if lrb > nslices:
                    lrb = nslices
                nslice = nrow  # Just in case the loop below executes nothing
                for nslice in range(nrow + 1, lrb):
                    self.reorder_slice(nslice, sorted, indices,
                                       ssorted, sindices,
                                       tmp_sorted, tmp_indices)

                # Write the first part of the buffers to the regular leaves
                self.write_slice(sorted, nslice, ssorted[:ss])
                self.write_slice(indices, nslice, sindices[:ss])
                # Update caches for this slice
                self.update_caches(nslice, ssorted[:ss])

    def swap_slices(self, mode="median"):
        """Swap slices in a superblock."""

        tmp = self.tmp
        sorted = tmp.sorted
        indices = tmp.indices
        tmp_sorted = tmp.sorted2
        tmp_indices = tmp.indices2
        ncs = self.nchunkslice
        nss = self.superblocksize // self.slicesize
        nss2 = nss
        for sblock in range(self.nsuperblocks):
            # Protection for last superblock having fewer slices than nss
            remainingslices = self.nslices - sblock * nss
            if remainingslices < nss:
                nss2 = remainingslices
            if nss2 <= 1:
                break
            if mode == "start":
                ranges = tmp.ranges[sblock * nss:sblock * nss + nss2, 0]
            elif mode == "stop":
                ranges = tmp.ranges[sblock * nss:sblock * nss + nss2, 1]
            elif mode == "median":
                ranges = tmp.mranges[sblock * nss:sblock * nss + nss2]
            sranges_idx = ranges.argsort(kind=defsort)
            # Don't swap the superblock at all if one doesn't need to
            ndiff = (sranges_idx != np.arange(nss2)).sum() / 2
            if ndiff * 50 < nss2:
                # The number of slices to rearrange is less than 2.5%,
                # so skip the reordering of this superblock
                # (too expensive for such a little improvement)
                if self.verbose:
                    print("skipping reordering of superblock ->", sblock)
                continue
            ns = sblock * nss2
            # Swap sorted and indices slices following the new order
            for i in range(nss2):
                idx = sranges_idx[i]
                # Swap sorted & indices slices
                oi = ns + i
                oidx = ns + idx
                tmp_sorted[oi] = sorted[oidx]
                tmp_indices[oi] = indices[oidx]
                # Swap start, stop & median ranges
                tmp.ranges2[oi] = tmp.ranges[oidx]
                tmp.mranges2[oi] = tmp.mranges[oidx]
                # Swap chunk bounds
                tmp.bounds2[oi] = tmp.bounds[oidx]
                # Swap start, stop & median bounds
                j = oi * ncs
                jn = (oi + 1) * ncs
                xj = oidx * ncs
                xjn = (oidx + 1) * ncs
                tmp.abounds2[j:jn] = tmp.abounds[xj:xjn]
                tmp.zbounds2[j:jn] = tmp.zbounds[xj:xjn]
                tmp.mbounds2[j:jn] = tmp.mbounds[xj:xjn]
            # tmp -> originals
            for i in range(nss2):
                # Copy sorted & indices slices
                oi = ns + i
                sorted[oi] = tmp_sorted[oi]
                indices[oi] = tmp_indices[oi]
                # Copy start, stop & median ranges
                tmp.ranges[oi] = tmp.ranges2[oi]
                tmp.mranges[oi] = tmp.mranges2[oi]
                # Copy chunk bounds
                tmp.bounds[oi] = tmp.bounds2[oi]
                # Copy start, stop & median bounds
                j = oi * ncs
                jn = (oi + 1) * ncs
                tmp.abounds[j:jn] = tmp.abounds2[j:jn]
                tmp.zbounds[j:jn] = tmp.zbounds2[j:jn]
                tmp.mbounds[j:jn] = tmp.mbounds2[j:jn]

    def search_item_lt(self, where, item, nslice, limits, start=0):
        """Search a single item in a specific sorted slice."""

        # This method only works under the assumption that item
        # *is to be found* in the nslice.
        assert nan_aware_lt(limits[0], item) and nan_aware_le(item, limits[1])
        cs = self.chunksize
        ss = self.slicesize
        nelementsLR = self.nelementsILR
        bstart = start // cs

        # Find the chunk
        if nslice < self.nslices:
            nchunk = bisect_left(where.bounds[nslice], item, bstart)
        else:
            # We need to subtract 1 chunk here because bebounds
            # has a leading value
            nchunk = bisect_left(self.bebounds, item, bstart) - 1
        assert nchunk >= 0

        # Find the element in chunk
        pos = nchunk * cs
        if nslice < self.nslices:
            pos += bisect_left(where.sorted[nslice, pos:pos + cs], item)
            assert pos <= ss
        else:
            end = pos + cs
            if end > nelementsLR:
                end = nelementsLR
            pos += bisect_left(self.sortedLR[pos:end], item)
            assert pos <= nelementsLR
        assert pos > 0
        return pos
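
    # A worked example with illustrative numbers: take chunksize == 4 and a
    # regular slice holding the sorted values 1..16, so its chunk bounds
    # are [5, 9, 13].  Searching for item == 10 first picks chunk
    # bisect_left([5, 9, 13], 10) == 2, giving pos == 8, then refines
    # within sorted[8:12] == [9, 10, 11, 12] to return pos == 9 (the count
    # of elements strictly less than 10).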
1499 def compute_overlaps_finegrain(self, where, message, verbose):
1500 """Compute some statistics about overlaping of slices in index.
1502 It returns the following info:
1504 noverlaps : int
1505 The total number of elements that overlaps in index.
1506 multiplicity : array of int
1507 The number of times that a concrete slice overlaps with any other.
1508 toverlap : float
1509 An ovelap index: the sum of the values in segment slices that
1510 overlaps divided by the entire range of values. This index is only
1511 computed for numerical types.
1513 """
1515 ss = self.slicesize
1516 ranges = where.ranges[:]
1517 sorted = where.sorted
1518 sortedLR = where.sortedLR
1519 nslices = self.nslices
1520 nelementsLR = self.nelementsILR
1521 if nelementsLR > 0:
1522 # Add the ranges corresponding to the last row
1523 rangeslr = np.array([self.bebounds[0], self.bebounds[-1]])
1524 ranges = np.concatenate((ranges, [rangeslr]))
1525 nslices += 1
1526 soverlap = 0
1527 toverlap = -1
1528 multiplicity = np.zeros(shape=nslices, dtype="int_")
1529 overlaps = multiplicity.copy()
1530 starts = multiplicity.copy()
1531 for i in range(nslices):
1532 prev_end = ranges[i, 1]
1533 for j in range(i + 1, nslices):
1534 stj = starts[j]
1535 assert stj <= ss
1536 if stj == ss:
1537 # This slice has already been counted
1538 continue
1539 if j < self.nslices:
1540 next_beg = sorted[j, stj]
1541 else:
1542 next_beg = sortedLR[stj]
1543 next_end = ranges[j, 1]
1544 if prev_end > next_end:
1545 # Complete overlapping case
1546 multiplicity[j - i] += 1
1547 if j < self.nslices:
1548 overlaps[i] += ss - stj
1549 starts[j] = ss # a sentinel
1550 else:
1551 overlaps[i] += nelementsLR - stj
1552 starts[j] = nelementsLR # a sentinel
1553 elif prev_end > next_beg:
1554 multiplicity[j - i] += 1
1555 idx = self.search_item_lt(
1556 where, prev_end, j, ranges[j], stj)
1557 nelem = idx - stj
1558 overlaps[i] += nelem
1559 starts[j] = idx
1560 if self.type != "string":
1561 # Convert ranges into floats in order to allow
1562 # doing operations with them without overflows
1563 soverlap += float(ranges[i, 1]) - float(ranges[j, 0])
1565 # Return the overlap as the ratio between overlaps and entire range
1566 if self.type != "string":
1567 erange = float(ranges[-1, 1]) - float(ranges[0, 0])
1568 # Check that there is an effective range of values
1569 # Beware, erange can be negative in situations where
1570 # the values are suffering overflow. This can happen
1571 # specially on big signed integer values (on overflows,
1572 # the end value will become negative!).
1573 # Also, there is no way to compute overlap ratios for
1574 # non-numerical types. So, be careful and always check
1575 # that toverlap has a positive value (it must have been
1576 # initialized to -1. before) before using it.
1577 # F. Alted 2007-01-19
1578 if erange > 0:
1579 toverlap = soverlap / erange
1580 if verbose and message != "init":
1581 print("toverlap (%s):" % message, toverlap)
1582 print("multiplicity:\n", multiplicity, multiplicity.sum())
1583 print("overlaps:\n", overlaps, overlaps.sum())
1584 noverlaps = overlaps.sum()
1585 # For full indexes, set the 'is_csi' flag
1586 if self.indsize == 8 and self._v_file._iswritable():
1587 self._v_attrs.is_csi = (noverlaps == 0)
1588 # Save the number of overlaps for future references
1589 self.noverlaps = noverlaps
1590 return (noverlaps, multiplicity, toverlap)
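# NOTE (editor's addition, not in the original source): reading the
# triple returned above. For three slices whose value ranges are
# [0, 10], [5, 15] and [20, 30], only slices 0 and 1 overlap, so
# 'noverlaps' counts the elements of slice 1 that sort below 10,
# multiplicity[1] == 1 (one pair of slices at distance 1 overlaps), and
# 'toverlap' is the overlapped span over the full span of values.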
1592 def compute_overlaps(self, where, message, verbose):
1593 """Compute some statistics about overlaping of slices in index.
1595 It returns the following info:
1597 noverlaps : int
1598 The total number of slices that overlaps in index.
1599 multiplicity : array of int
1600 The number of times that a concrete slice overlaps with any other.
1601 toverlap : float
1602 An ovelap index: the sum of the values in segment slices that
1603 overlaps divided by the entire range of values. This index is only
1604 computed for numerical types.
1606 """
1608 ranges = where.ranges[:]
1609 nslices = self.nslices
1610 if self.nelementsILR > 0:
1611 # Add the ranges corresponding to the last row
1612 rangeslr = np.array([self.bebounds[0], self.bebounds[-1]])
1613 ranges = np.concatenate((ranges, [rangeslr]))
1614 nslices += 1
1615 noverlaps = 0
1616 soverlap = 0
1617 toverlap = -1
1618 multiplicity = np.zeros(shape=nslices, dtype="int_")
1619 for i in range(nslices):
1620 for j in range(i + 1, nslices):
1621 if ranges[i, 1] > ranges[j, 0]:
1622 noverlaps += 1
1623 multiplicity[j - i] += 1
1624 if self.type != "string":
1625 # Convert ranges into floats so we can operate
1626 # on them without overflow
1627 soverlap += float(ranges[i, 1]) - float(ranges[j, 0])
1629 # Return the overlap as the ratio between overlaps and entire range
1630 if self.type != "string":
1631 erange = float(ranges[-1, 1]) - float(ranges[0, 0])
1632 # Check that there is an effective range of values
1633 # Beware, erange can be negative when the values
1634 # overflow. This can happen especially with big
1635 # signed integer values (on overflow, the end value
1636 # becomes negative!).
1637 # Also, there is no way to compute overlap ratios for
1638 # non-numerical types. So, be careful and always check
1639 # that toverlap has a positive value (it was
1640 # initialized to -1 above) before using it.
1641 # F. Altet 2007-01-19
1642 if erange > 0:
1643 toverlap = soverlap / erange
1644 if verbose:
1645 print("overlaps (%s):" % message, noverlaps, toverlap)
1646 print(multiplicity)
1647 # For full indexes, set the 'is_csi' flag
1648 if self.indsize == 8 and self._v_file._iswritable():
1649 self._v_attrs.is_csi = (noverlaps == 0)
1650 # Save the number of overlaps for future references
1651 self.noverlaps = noverlaps
1652 return (noverlaps, multiplicity, toverlap)
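# NOTE (editor's addition, not in the original source): a minimal,
# self-contained toy of the coarse pairwise test used by
# compute_overlaps() above; each row of the hypothetical 'ranges'
# array holds one slice's (min, max).
#
#   import numpy as np
#   ranges = np.array([[0, 10], [5, 15], [20, 30]])
#   noverlaps, soverlap = 0, 0.0
#   for i in range(len(ranges)):
#       for j in range(i + 1, len(ranges)):
#           if ranges[i, 1] > ranges[j, 0]:  # slice i ends past slice j's start
#               noverlaps += 1
#               soverlap += float(ranges[i, 1]) - float(ranges[j, 0])
#   erange = float(ranges[-1, 1]) - float(ranges[0, 0])
#   toverlap = soverlap / erange if erange > 0 else -1
#   # -> noverlaps == 1, toverlap == 5 / 30 for this toy data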
1654 def read_sorted_indices(self, what, start, stop, step):
1655 """Return the sorted or indices values in the specified range."""
1656 (start, stop, step) = self._process_range(start, stop, step)
1657 if start >= stop:
1658 return np.empty(0, self.dtype)
1659 # Correction for negative values of step (reverse indices)
1660 if step < 0:
1661 tmp = start
1662 start = self.nelements - stop
1663 stop = self.nelements - tmp
1664 if what == "sorted":
1665 values = self.sorted
1666 valuesLR = self.sortedLR
1667 buffer_ = np.empty(stop - start, dtype=self.dtype)
1668 else:
1669 values = self.indices
1670 valuesLR = self.indicesLR
1671 buffer_ = np.empty(stop - start, dtype="u%d" % self.indsize)
1672 ss = self.slicesize
1673 nrow_start = start // ss
1674 istart = start % ss
1675 nrow_stop = stop // ss
1676 tlen = stop - start
1677 bstart = 0
1678 ilen = 0
1679 for nrow in range(nrow_start, nrow_stop + 1):
1680 blen = ss - istart
1681 if ilen + blen > tlen:
1682 blen = tlen - ilen
1683 if blen <= 0:
1684 break
1685 if nrow < self.nslices:
1686 self.read_slice(
1687 values, nrow, buffer_[bstart:bstart + blen], istart)
1688 else:
1689 self.read_slice_lr(
1690 valuesLR, buffer_[bstart:bstart + blen], istart)
1691 istart = 0
1692 bstart += blen
1693 ilen += blen
1694 return buffer_[::step]
1696 def read_sorted(self, start=None, stop=None, step=None):
1697 """Return the sorted values of index in the specified range.
1699 The meaning of the start, stop and step arguments is the same as in
1700 :meth:`Table.read_sorted`.
1702 """
1704 return self.read_sorted_indices('sorted', start, stop, step)
1706 def read_indices(self, start=None, stop=None, step=None):
1707 """Return the indices values of index in the specified range.
1709 The meaning of the start, stop and step arguments is the same as in
1710 :meth:`Table.read_sorted`.
1712 """
1714 return self.read_sorted_indices('indices', start, stop, step)
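# NOTE (editor's addition, not in the original source): a hedged usage
# sketch. It assumes a file 'data.h5' with a Table '/t' whose column
# 'x' was indexed via Column.create_index(); the file and column names
# are hypothetical.
#
#   import tables as tb
#   with tb.open_file('data.h5') as h5:
#       idx = h5.root.t.cols.x.index     # the Index instance
#       vals = idx.read_sorted(0, 10)    # 10 smallest values, in sorted order
#       rows = idx.read_indices(0, 10)   # table row numbers of those values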
1716 def _process_range(self, start, stop, step):
1717 """Get a range specifc for the index usage."""
1719 if start is not None and stop is None:
1720 # Special case for the behaviour of PyTables iterators
1721 stop = idx2long(start + 1)
1722 if start is None:
1723 start = 0
1724 else:
1725 start = idx2long(start)
1726 if stop is None:
1727 stop = idx2long(self.nelements)
1728 else:
1729 stop = idx2long(stop)
1730 if step is None:
1731 step = 1
1732 else:
1733 step = idx2long(step)
1734 return (start, stop, step)
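# NOTE (editor's addition, not in the original source): how
# _process_range() normalizes its arguments for an index with, say,
# nelements == 100:
#
#   idx._process_range(5, None, None)     # -> (5, 6, 1), single-element case
#   idx._process_range(None, None, None)  # -> (0, 100, 1)
#   idx._process_range(10, 50, 2)         # -> (10, 50, 2)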
1736 def __getitem__(self, key):
1737 """Return the indices values of index in the specified range.
1739 If key argument is an integer, the corresponding index is returned. If
1740 key is a slice, the range of indices determined by it is returned. A
1741 negative value of step in slice is supported, meaning that the results
1742 will be returned in reverse order.
1744 This method is equivalent to :meth:`Index.read_indices`.
1746 """
1748 if is_idx(key):
1749 key = operator.index(key)
1751 if key < 0:
1752 # To support negative values
1753 key += self.nelements
1754 return self.read_indices(key, key + 1, 1)[0]
1755 elif isinstance(key, slice):
1756 return self.read_indices(key.start, key.stop, key.step)
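# NOTE (editor's addition, not in the original source): usage sketch
# for __getitem__; 'idx' stands for an Index instance:
#
#   idx[0]      # row number of the smallest indexed value
#   idx[-1]     # row number of the largest indexed value
#   idx[2:10]   # row numbers of the 3rd through 10th smallest values
#   idx[::-1]   # all row numbers, from largest to smallest value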
1758 def __len__(self):
1759 return self.nelements
1761 def restorecache(self):
1762 """Clean the limits cache and resize starts and lengths arrays"""
1764 params = self._v_file.params
1765 # The sorted IndexArray is absolutely required to be in memory
1766 # at the same time as the Index instance, so create a strong
1767 # reference to it (self._sorted aliases self.sorted). We are not
1768 # introducing leaks because the strong reference will disappear
1769 # when this Index instance is closed.
1770 self._sorted = self.sorted
1774 self.sorted.boundscache = ObjectCache(params['BOUNDS_MAX_SLOTS'],
1775 params['BOUNDS_MAX_SIZE'],
1776 'non-opt types bounds')
1777 """A cache for the bounds (2nd hash) data. Only used for
1778 non-optimized types searches."""
1779 self.limboundscache = ObjectCache(params['LIMBOUNDS_MAX_SLOTS'],
1780 params['LIMBOUNDS_MAX_SIZE'],
1781 'bounding limits')
1782 """A cache for bounding limits."""
1783 self.sortedLRcache = ObjectCache(params['SORTEDLR_MAX_SLOTS'],
1784 params['SORTEDLR_MAX_SIZE'],
1785 'last row chunks')
1786 """A cache for the last row chunks. Only used for searches in
1787 the last row, and mainly useful for small indexes."""
1788 self.starts = np.empty(shape=self.nrows, dtype=np.int32)
1789 self.lengths = np.empty(shape=self.nrows, dtype=np.int32)
1790 self.sorted._init_sorted_slice(self)
1791 self.dirtycache = False
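# NOTE (editor's addition, not in the original source): the caches
# built above share the getslot/getitem/setitem protocol exercised by
# search() and search_last_row() below; a minimal sketch of that
# protocol (names hypothetical):
#
#   nslot = cache.getslot(key)           # -1 when the key is not cached
#   if nslot >= 0:
#       value = cache.getitem(nslot)     # cache hit
#   else:
#       value = expensive_lookup(key)    # cache miss: compute the value...
#       cache.setitem(key, value, size)  # ...and store it, with a size hint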
1793 def search(self, item):
1794 """Do a binary search in this index for an item."""
1796 if profile:
1797 tref = clock()
1798 if profile:
1799 show_stats("Entering search", tref)
1801 if self.dirtycache:
1802 self.restorecache()
1804 # An empty item, or a left limit larger than the right one,
1805 # means that the set of matching records is empty,
1806 # so we avoid further computation (including looking up the
1807 # limits cache).
1808 if not item or item[0] > item[1]:
1809 self.starts[:] = 0
1810 self.lengths[:] = 0
1811 return 0
1813 tlen = 0
1814 # Check whether the item tuple is in the limits cache or not
1815 nslot = self.limboundscache.getslot(item)
1816 if nslot >= 0:
1817 startlengths = self.limboundscache.getitem(nslot)
1818 # Reset the lengths array (not necessary for starts)
1819 self.lengths[:] = 0
1820 # Now, set the interesting rows
1821 for nrow2, start, length in startlengths:
1822 self.starts[nrow2] = start
1823 self.lengths[nrow2] = length
1824 tlen = tlen + length
1825 return tlen
1826 # The item is not in cache. Do the real lookup.
1827 sorted = self.sorted
1828 if self.nslices > 0:
1829 if self.type in self.opt_search_types:
1830 # The following are optimizations. However, they hide
1831 # the CPU consumption of the C functions from Python profiles.
1832 # You may want to deactivate them while profiling.
1833 if self.type == "int32":
1834 tlen = sorted._search_bin_na_i(*item)
1835 elif self.type == "int64":
1836 tlen = sorted._search_bin_na_ll(*item)
1837 elif self.type == "float16":
1838 tlen = sorted._search_bin_na_e(*item)
1839 elif self.type == "float32":
1840 tlen = sorted._search_bin_na_f(*item)
1841 elif self.type == "float64":
1842 tlen = sorted._search_bin_na_d(*item)
1843 elif self.type == "float96":
1844 tlen = sorted._search_bin_na_g(*item)
1845 elif self.type == "float128":
1846 tlen = sorted._search_bin_na_g(*item)
1847 elif self.type == "uint32":
1848 tlen = sorted._search_bin_na_ui(*item)
1849 elif self.type == "uint64":
1850 tlen = sorted._search_bin_na_ull(*item)
1851 elif self.type == "int8":
1852 tlen = sorted._search_bin_na_b(*item)
1853 elif self.type == "int16":
1854 tlen = sorted._search_bin_na_s(*item)
1855 elif self.type == "uint8":
1856 tlen = sorted._search_bin_na_ub(*item)
1857 elif self.type == "uint16":
1858 tlen = sorted._search_bin_na_us(*item)
1859 else:
1860 assert False, "This can't happen!"
1861 else:
1862 tlen = self.search_scalar(item, sorted)
1863 # Get possible remaining values in last row
1864 if self.nelementsSLR > 0:
1865 # Look for more indexes in the last row
1866 (start, stop) = self.search_last_row(item)
1867 self.starts[-1] = start
1868 self.lengths[-1] = stop - start
1869 tlen += stop - start
1871 if self.limboundscache.couldenablecache():
1872 # Get a startlengths tuple and save it in the cache.
1873 # This is quite slow, but it is a good way to compress
1874 # the bounds info. Moreover, .couldenablecache()
1875 # does a good job of avoiding this computation
1876 # when it is not necessary.
1877 startlengths = []
1878 for nrow, length in enumerate(self.lengths):
1879 if length > 0:
1880 startlengths.append((nrow, self.starts[nrow], length))
1881 # Compute the size of the recarray (approximately).
1882 # The +1 at the end is important to avoid 0 lengths
1883 # (remember, the object headers take some space).
1884 size = len(startlengths) * 8 * 2 + 1
1885 # Put this startlengths list in cache
1886 self.limboundscache.setitem(item, startlengths, size)
1888 if profile:
1889 show_stats("Exiting search", tref)
1890 return tlen
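# NOTE (editor's addition, not in the original source): hedged sketch
# of driving search(), an internal API. The item tuple is an
# *inclusive* (lower, upper) range, as built by get_lookup_range()
# further below:
#
#   tlen = idx.search((25, 75))  # total number of matching elements
#   # the per-slice hits now live in idx.starts[i] and idx.lengths[i]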
1892 # This is a scalar version of search. It works with strings as well.
1893 def search_scalar(self, item, sorted):
1894 """Do a binary search in this index for an item."""
1896 tlen = 0
1897 # Do the lookup for values fulfilling the conditions
1898 for i in range(self.nslices):
1899 (start, stop) = sorted._search_bin(i, item)
1900 self.starts[i] = start
1901 self.lengths[i] = stop - start
1902 tlen += stop - start
1903 return tlen
1905 def search_last_row(self, item):
1906 # Variable initialization
1907 item1, item2 = item
1908 bebounds = self.bebounds
1909 b0, b1 = bebounds[0], bebounds[-1]
1910 bounds = bebounds[1:-1]
1911 itemsize = self.dtype.itemsize
1912 sortedLRcache = self.sortedLRcache
1913 hi = self.nelementsSLR # maximum number of elements
1914 rchunksize = self.chunksize // self.reduction
1916 nchunk = -1
1917 # Lookup for item1
1918 if nan_aware_gt(item1, b0):
1919 if nan_aware_le(item1, b1):
1920 # Search the appropriate chunk in bounds cache
1921 nchunk = bisect_left(bounds, item1)
1922 # Lookup for this chunk in cache
1923 nslot = sortedLRcache.getslot(nchunk)
1924 if nslot >= 0:
1925 chunk = sortedLRcache.getitem(nslot)
1926 else:
1927 begin = rchunksize * nchunk
1928 end = rchunksize * (nchunk + 1)
1929 if end > hi:
1930 end = hi
1931 # Read the chunk from disk
1932 chunk = self.sortedLR._read_sorted_slice(
1933 self.sorted, begin, end)
1934 # Put it in cache. It's important to *copy*
1935 # the buffer, as it is reused in future reads!
1936 sortedLRcache.setitem(nchunk, chunk.copy(),
1937 (end - begin) * itemsize)
1938 start = bisect_left(chunk, item1)
1939 start += rchunksize * nchunk
1940 else:
1941 start = hi
1942 else:
1943 start = 0
1944 # Lookup for item2
1945 if nan_aware_ge(item2, b0):
1946 if nan_aware_lt(item2, b1):
1947 # Search the appropriate chunk in bounds cache
1948 nchunk2 = bisect_right(bounds, item2)
1949 if nchunk2 != nchunk:
1950 # Lookup for this chunk in cache
1951 nslot = sortedLRcache.getslot(nchunk2)
1952 if nslot >= 0:
1953 chunk = sortedLRcache.getitem(nslot)
1954 else:
1955 begin = rchunksize * nchunk2
1956 end = rchunksize * (nchunk2 + 1)
1957 if end > hi:
1958 end = hi
1959 # Read the chunk from disk
1960 chunk = self.sortedLR._read_sorted_slice(
1961 self.sorted, begin, end)
1962 # Put it in cache. It's important to *copy*
1963 # the buffer, as it is reused in future reads!
1964 # See bug #60 in xot.carabos.com
1965 sortedLRcache.setitem(nchunk2, chunk.copy(),
1966 (end - begin) * itemsize)
1967 stop = bisect_right(chunk, item2)
1968 stop += rchunksize * nchunk2
1969 else:
1970 stop = hi
1971 else:
1972 stop = 0
1973 return (start, stop)
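# NOTE (editor's addition, not in the original source): a
# self-contained toy of the two-level bisection used above: first find
# the right chunk via the chunk boundaries, then bisect inside that
# single chunk.
#
#   from bisect import bisect_left
#   chunks = [[1, 3, 5], [7, 9, 11], [13, 15, 17]]  # sorted, chunked data
#   bounds = [7, 13]      # first element of every chunk except the first
#   item = 9
#   nchunk = bisect_left(bounds, item)       # -> 1
#   pos = bisect_left(chunks[nchunk], item)  # -> 1, position inside chunk
#   start = nchunk * 3 + pos                 # -> 4, the global position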
1975 def get_chunkmap(self):
1976 """Compute a map with the interesting chunks in index."""
1978 if profile:
1979 tref = clock()
1980 if profile:
1981 show_stats("Entering get_chunkmap", tref)
1982 ss = self.slicesize
1983 nsb = self.nslicesblock
1984 nslices = self.nslices
1985 lbucket = self.lbucket
1986 indsize = self.indsize
1987 bucketsinblock = self.blocksize / lbucket
1988 nchunks = math.ceil(self.nelements / lbucket)
1989 chunkmap = np.zeros(shape=nchunks, dtype="bool")
1990 reduction = self.reduction
1991 starts = (self.starts - 1) * reduction + 1
1992 stops = (self.starts + self.lengths) * reduction
1993 starts[starts < 0] = 0 # All negative values set to zero
1994 indices = self.indices
1995 for nslice in range(self.nrows):
1996 start = starts[nslice]
1997 stop = stops[nslice]
1998 if stop > start:
1999 idx = np.empty(shape=stop - start, dtype='u%d' % indsize)
2000 if nslice < nslices:
2001 indices._read_index_slice(nslice, start, stop, idx)
2002 else:
2003 self.indicesLR._read_index_slice(start, stop, idx)
2004 if indsize == 8:
2005 idx //= lbucket
2006 elif indsize == 2:
2007 # The chunkmap size can never be larger than 'int_'
2008 idx = idx.astype("int_")
2009 offset = int((nslice // nsb) * bucketsinblock)
2010 idx += offset
2011 elif indsize == 1:
2012 # The chunkmap size can never be larger than 'int_'
2013 idx = idx.astype("int_")
2014 offset = (nslice * ss) // lbucket
2015 idx += offset
2016 chunkmap[idx] = True
2017 # The case lbucket < nrowsinchunk should only happen in tests
2018 nrowsinchunk = self.nrowsinchunk
2019 if lbucket != nrowsinchunk:
2020 # Map the 'coarse grain' chunkmap into the 'true' chunkmap
2021 nelements = self.nelements
2022 tnchunks = math.ceil(nelements / nrowsinchunk)
2023 tchunkmap = np.zeros(shape=tnchunks, dtype="bool")
2024 ratio = lbucket / nrowsinchunk
2025 idx = chunkmap.nonzero()[0]
2026 starts = (idx * ratio).astype('int_')
2027 stops = np.ceil((idx + 1) * ratio).astype('int_')
2028 for start, stop in zip(starts, stops):
2029 tchunkmap[start:stop] = True
2030 chunkmap = tchunkmap
2031 if profile:
2032 show_stats("Exiting get_chunkmap", tref)
2033 return chunkmap
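# NOTE (editor's addition, not in the original source): toy
# illustration of the bucket mapping above: each matching row number is
# divided by the bucket size, flagging the chunk that holds it.
#
#   import numpy as np
#   lbucket = 10                        # rows per bucket
#   hits = np.array([3, 12, 13, 47])    # matching row numbers
#   chunkmap = np.zeros(5, dtype=bool)  # 50 rows -> 5 buckets
#   chunkmap[hits // lbucket] = True    # -> [True, True, False, False, True]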
2035 def get_lookup_range(self, ops, limits):
2036 assert len(ops) in [1, 2]
2037 assert len(limits) in [1, 2]
2038 assert len(ops) == len(limits)
2040 column = self.column
2041 coldtype = column.dtype.base
2042 itemsize = coldtype.itemsize
2044 if len(limits) == 1:
2045 assert ops[0] in ['lt', 'le', 'eq', 'ge', 'gt']
2046 limit = limits[0]
2047 op = ops[0]
2048 if op == 'lt':
2049 range_ = (inftype(coldtype, itemsize, sign=-1),
2050 nextafter(limit, -1, coldtype, itemsize))
2051 elif op == 'le':
2052 range_ = (inftype(coldtype, itemsize, sign=-1),
2053 limit)
2054 elif op == 'gt':
2055 range_ = (nextafter(limit, +1, coldtype, itemsize),
2056 inftype(coldtype, itemsize, sign=+1))
2057 elif op == 'ge':
2058 range_ = (limit,
2059 inftype(coldtype, itemsize, sign=+1))
2060 elif op == 'eq':
2061 range_ = (limit, limit)
2063 elif len(limits) == 2:
2064 assert ops[0] in ('gt', 'ge') and ops[1] in ('lt', 'le')
2066 lower, upper = limits
2067 if lower > upper:
2068 # ``a <[=] x <[=] b`` is always false if ``a > b``.
2069 return ()
2071 if ops == ('gt', 'lt'): # lower < col < upper
2072 range_ = (nextafter(lower, +1, coldtype, itemsize),
2073 nextafter(upper, -1, coldtype, itemsize))
2074 elif ops == ('ge', 'lt'): # lower <= col < upper
2075 range_ = (lower, nextafter(upper, -1, coldtype, itemsize))
2076 elif ops == ('gt', 'le'): # lower < col <= upper
2077 range_ = (nextafter(lower, +1, coldtype, itemsize), upper)
2078 elif ops == ('ge', 'le'): # lower <= col <= upper
2079 range_ = (lower, upper)
2081 return range_
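# NOTE (editor's addition, not in the original source): hedged example
# of the open-interval handling above for a float64 column. Strict
# comparisons become inclusive ranges by stepping one representable
# value; numpy's nextafter stands in here for the module's own
# nextafter(), which also handles non-float column types.
#
#   import numpy as np
#   # 'col < 5.0' -> (-inf, nextafter(5.0, -inf)):
#   upper = np.nextafter(5.0, -np.inf)  # largest float64 strictly below 5.0
#   # 'col > 5.0' -> (nextafter(5.0, +inf), +inf) likewise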
2083 def _f_remove(self, recursive=False):
2084 """Remove this Index object."""
2086 # Index removal is always recursive,
2087 # no matter what `recursive` says.
2088 super()._f_remove(True)
2090 def __str__(self):
2091 """This provides a more compact representation than __repr__"""
2093 # The filters
2094 filters = []
2095 if self.filters.complevel:
2096 if self.filters.shuffle:
2097 filters.append('shuffle')
2098 if self.filters.bitshuffle:
2099 filters.append('bitshuffle')
2100 filters.append(f'{self.filters.complib}({self.filters.complevel})')
2101 return (f"Index({self.optlevel}, "
2102 f"{self.kind}{', '.join(filters)}).is_csi={self.is_csi}")
2104 def __repr__(self):
2105 """This provides more metainfo than standard __repr__"""
2107 cpathname = f"{self.table._v_pathname}.cols.{self.column.pathname}"
2108 retstr = f"""{self._v_pathname} (Index for column {cpathname})
2109 optlevel := {self.optlevel}
2110 kind := {self.kind}
2111 filters := {self.filters}
2112 is_csi := {self.is_csi}
2113 nelements := {self.nelements}
2114 chunksize := {self.chunksize}
2115 slicesize := {self.slicesize}
2116 blocksize := {self.blocksize}
2117 superblocksize := {self.superblocksize}
2118 dirty := {self.dirty}
2119 byteorder := {self.byteorder!r}
2120 sorted := {self.sorted}
2121 indices := {self.indices}
2122 ranges := {self.ranges}
2123 bounds := {self.bounds}
2124 sortedLR := {self.sortedLR}
2125 indicesLR := {self.indicesLR}"""
2126 return retstr
2129class IndexesDescG(NotLoggedMixin, Group):
2130 _c_classid = 'DINDEX'
2132 def _g_width_warning(self):
2133 warnings.warn(
2134 "the number of indexed columns on a single description group "
2135 "is exceeding the recommended maximum (%d); "
2136 "be ready to see PyTables asking for *lots* of memory "
2137 "and possibly slow I/O" % self._v_max_group_width,
2138 PerformanceWarning)
2141class IndexesTableG(NotLoggedMixin, Group):
2142 _c_classid = 'TINDEX'
2144 @property
2145 def auto(self):
2146 if 'AUTO_INDEX' not in self._v_attrs:
2147 return default_auto_index
2148 return self._v_attrs.AUTO_INDEX
2150 @auto.setter
2151 def auto(self, auto):
2152 self._v_attrs.AUTO_INDEX = bool(auto)
2154 @auto.deleter
2155 def auto(self):
2156 del self._v_attrs.AUTO_INDEX
2158 def _g_width_warning(self):
2159 warnings.warn(
2160 "the number of indexed columns on a single table "
2161 "is exceeding the recommended maximum (%d); "
2162 "be ready to see PyTables asking for *lots* of memory "
2163 "and possibly slow I/O" % self._v_max_group_width,
2164 PerformanceWarning)
2166 def _g_check_name(self, name):
2167 if not name.startswith('_i_'):
2168 raise ValueError(
2169 "names of index groups must start with ``_i_``: %s" % name)
2171 @property
2172 def table(self):
2173 """Accessor for the `Table` object of this `IndexesTableG`
2174 container."""
2175 names = self._v_pathname.split("/")
2176 tablename = names.pop()[3:] # "_i_" is at the beginning
2177 parentpathname = "/".join(names)
2178 tablepathname = join_path(parentpathname, tablename)
2179 table = self._v_file._get_node(tablepathname)
2180 return table
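# NOTE (editor's addition, not in the original source): the '_i_'
# naming convention unwound above, on a hypothetical pathname:
#
#   names = '/group/_i_mytable'.split('/')  # ['', 'group', '_i_mytable']
#   tablename = names.pop()[3:]             # 'mytable'
#   join_path('/'.join(names), tablename)   # -> '/group/mytable'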
2183class OldIndex(NotLoggedMixin, Group):
2184 """This is meant to hide indexes of PyTables 1.x files."""
2186 _c_classid = 'CINDEX'