Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tables/index.py: 10%

1268 statements  

coverage.py v7.2.5, created at 2023-05-10 06:15 +0000

"""Here is defined the Index class."""

import math
import operator
import os
import sys
import tempfile
import warnings

from pathlib import Path
from time import perf_counter as clock
from time import process_time as cpuclock

import numpy as np

from .idxutils import (calc_chunksize, calcoptlevels,
                       get_reduction_level, nextafter, inftype)

from . import indexesextension
from .node import NotLoggedMixin
from .atom import UIntAtom, Atom
from .earray import EArray
from .carray import CArray
from .leaf import Filters
from .indexes import CacheArray, LastRowArray, IndexArray
from .group import Group
from .path import join_path
from .exceptions import PerformanceWarning
from .utils import is_idx, idx2long, lazyattr
from .utilsextension import (nan_aware_gt, nan_aware_ge,
                             nan_aware_lt, nan_aware_le,
                             bisect_left, bisect_right)
from .lrucacheextension import ObjectCache

# default version for INDEX objects
# obversion = "1.0"    # Version of indexes in PyTables 1.x series
# obversion = "2.0"    # Version of indexes in PyTables Pro 2.0 series
obversion = "2.1"    # Version of indexes in PyTables Pro 2.1 and up series,
                     # including the join 2.3 Std + Pro version

debug = False
# debug = True  # Uncomment this for printing sizes purposes
profile = False
# profile = True  # Uncomment for profiling
if profile:
    from .utils import show_stats


# The default method for sorting
# defsort = "quicksort"
# Changing to mergesort to fix #441
defsort = "mergesort"

# Default policy for automatically updating indexes after a table
# append operation, or automatically reindexing after an
# index-invalidating operation like removing or modifying table rows.
default_auto_index = True
# Keep in sync with ``Table.autoindex`` docstring.

# Default filters used to compress indexes.  This is quite fast and
# compression is pretty good.
# Remember to keep these defaults in sync with the docstrings and UG.
default_index_filters = Filters(complevel=1, complib='zlib',
                                shuffle=True, fletcher32=False)

# Deprecated API
defaultAutoIndex = default_auto_index
defaultIndexFilters = default_index_filters

# The list of types for which an optimised search in Cython and C has
# been implemented.  Always add here the name of a new optimised type.
opt_search_types = ("int8", "int16", "int32", "int64",
                    "uint8", "uint16", "uint32", "uint64",
                    "float32", "float64")

# The upper limit for uint32 ints
max32 = 2**32


def _table_column_pathname_of_index(indexpathname):
    names = indexpathname.split("/")
    for i, name in enumerate(names):
        if name.startswith('_i_'):
            break
    tablepathname = "/".join(names[:i]) + "/" + name[3:]
    colpathname = "/".join(names[i + 1:])
    return (tablepathname, colpathname)
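# For instance (illustrative pathnames): an index pathname like
# "/group/_i_mytable/mycol" maps to ("/group/mytable", "mycol").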



class Index(NotLoggedMixin, Group, indexesextension.Index):
    """Represents the index of a column in a table.

    This class is used to keep the indexing information for columns in a
    Table dataset (see :ref:`TableClassDescr`).  It is actually a descendant
    of the Group class (see :ref:`GroupClassDescr`), with some added
    functionality.  An Index is always associated with one and only one
    column in the table.

    .. note::

        This class is mainly intended for internal use, but some of its
        documented attributes and methods may be interesting for the
        programmer.

    Parameters
    ----------
    parentnode
        The parent :class:`Group` object.

        .. versionchanged:: 3.0
           Renamed from *parentNode* to *parentnode*.

    name : str
        The name of this node in its parent group.
    atom : Atom
        An Atom object representing the shape and type of the atomic
        objects to be saved.  Only scalar atoms are supported.
    title
        Sets a TITLE attribute of the Index entity.
    kind
        The desired kind for this index.  The 'full' kind specifies a
        complete track of the row position (64-bit), while the 'medium',
        'light' or 'ultralight' kinds only specify in which chunk the row
        is (using 32-bit, 16-bit and 8-bit respectively).
    optlevel
        The desired optimization level for this index.
    filters : Filters
        An instance of the Filters class that provides information about
        the desired I/O filters to be applied during the life of this
        object.
    tmp_dir
        The directory for the temporary files.
    expectedrows
        Represents a user estimate about the number of row slices that
        will be added to the growable dimension in the IndexArray object.
    byteorder
        The byteorder of the index datasets *on-disk*.
    blocksizes
        The four main sizes of the compound blocks in index datasets (a
        low level parameter).

    """
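
    # Illustrative usage (not part of this module; the column name is
    # hypothetical): indexes are normally created through the public
    # ``Column`` API rather than by instantiating ``Index`` directly, e.g.:
    #
    #     table.cols.temperature.create_index(optlevel=6, kind='medium')
    #     table.cols.temperature.create_csindex()  # completely sorted index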


    _c_classid = 'INDEX'

    @property
    def kind(self):
        """The kind of this index."""
        return {1: 'ultralight', 2: 'light',
                4: 'medium', 8: 'full'}[self.indsize]

    @property
    def filters(self):
        """Filter properties for this index - see Filters in
        :ref:`FiltersClassDescr`."""
        return self._v_filters

    @property
    def dirty(self):
        """Whether the index is dirty or not.

        Dirty indexes are out of sync with column data, so they exist but
        they are not usable.

        """

        # If there is no ``DIRTY`` attribute, index should be clean.
        return getattr(self._v_attrs, 'DIRTY', False)

    @dirty.setter
    def dirty(self, dirty):
        wasdirty, isdirty = self.dirty, bool(dirty)
        self._v_attrs.DIRTY = dirty
        # If an *actual* change in dirtiness happens,
        # notify the condition cache by setting or removing a nail.
        conditioncache = self.table._condition_cache
        if not wasdirty and isdirty:
            conditioncache.nail()
        if wasdirty and not isdirty:
            conditioncache.unnail()

    @property
    def column(self):
        """The Column (see :ref:`ColumnClassDescr`) instance for the indexed
        column."""

        tablepath, columnpath = _table_column_pathname_of_index(
            self._v_pathname)
        table = self._v_file._get_node(tablepath)
        column = table.cols._g_col(columnpath)
        return column

    @property
    def table(self):
        """Accessor for the `Table` object of this index."""
        tablepath, columnpath = _table_column_pathname_of_index(
            self._v_pathname)
        table = self._v_file._get_node(tablepath)
        return table

    @property
    def nblockssuperblock(self):
        """The number of blocks in a superblock."""
        return self.superblocksize // self.blocksize

    @property
    def nslicesblock(self):
        """The number of slices in a block."""
        return self.blocksize // self.slicesize

    @property
    def nchunkslice(self):
        """The number of chunks in a slice."""
        return self.slicesize // self.chunksize


    @property
    def nsuperblocks(self):
        """The total number of superblocks in index."""
        # Last row should not be considered as a superblock
        nelements = self.nelements - self.nelementsILR
        nblocks = nelements // self.superblocksize
        # Check the remainder against the superblock size; checking it
        # against the block size would miss partially filled superblocks.
        if nelements % self.superblocksize > 0:
            nblocks += 1
        return nblocks


    @property
    def nblocks(self):
        """The total number of blocks in index."""
        # Last row should not be considered as a block
        nelements = self.nelements - self.nelementsILR
        nblocks = nelements // self.blocksize
        if nelements % self.blocksize > 0:
            nblocks += 1
        return nblocks

    @property
    def nslices(self):
        """The number of complete slices in index."""
        return self.nelements // self.slicesize

    @property
    def nchunks(self):
        """The number of complete chunks in index."""
        return self.nelements // self.chunksize

    @property
    def shape(self):
        """The shape of this index (in slices and elements)."""
        return (self.nrows, self.slicesize)

    @property
    def temp_required(self):
        """Whether a temporary file for indexes is required or not."""
        return (self.indsize > 1 and
                self.optlevel > 0 and
                self.table.nrows > self.slicesize)

    @property
    def want_complete_sort(self):
        """Whether we should try to build a completely sorted index or not."""
        return self.indsize == 8 and self.optlevel == 9

    @property
    def is_csi(self):
        """Whether the index is completely sorted or not.

        .. versionchanged:: 3.0
           The *is_CSI* property has been renamed into *is_csi*.

        """

        if self.nelements == 0:
            # An index with 0 indexed elements is not a CSI one (by definition)
            return False
        if self.indsize < 8:
            # An index that is not full cannot be completely sorted
            return False
        # Try with the 'is_csi' attribute
        if 'is_csi' in self._v_attrs:
            return self._v_attrs.is_csi
        # If not, then compute the overlaps manually
        # (the attribute 'is_csi' will be set there)
        self.compute_overlaps(self, None, False)
        return self.noverlaps == 0

    @lazyattr
    def nrowsinchunk(self):
        """The number of rows that fit in a *table* chunk."""

        return self.table.chunkshape[0]

    @lazyattr
    def lbucket(self):
        """Return the length of a bucket based on the index type."""

        # Avoid setting a too large lbucket size (mainly useful for tests)
        lbucket = min(self.nrowsinchunk, self.chunksize)
        if self.indsize == 1:
            # For ultra-light, we will never have to keep track of a
            # bucket outside of a slice.
            maxnb = 2**8
            if self.slicesize > maxnb * lbucket:
                lbucket = math.ceil(self.slicesize / maxnb)
        elif self.indsize == 2:
            # For light, we will never have to keep track of a
            # bucket outside of a block.
            maxnb = 2**16
            if self.blocksize > maxnb * lbucket:
                lbucket = math.ceil(self.blocksize / maxnb)
        else:
            # For medium and full indexes there should not be a need to
            # increase lbucket
            pass
        return lbucket
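
    # Illustrative figures (hypothetical sizes): for an ultralight index
    # (indsize == 1) with slicesize = 131072 and an initial lbucket of 256,
    # 131072 > 256 * 256, so lbucket grows to ceil(131072 / 256) = 512; a
    # slice then spans at most 256 buckets, so every bucket number still
    # fits in a uint8.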


    def __init__(self, parentnode, name,
                 atom=None, title="",
                 kind=None,
                 optlevel=None,
                 filters=None,
                 tmp_dir=None,
                 expectedrows=0,
                 byteorder=None,
                 blocksizes=None,
                 new=True):

        self._v_version = None
        """The object version of this index."""
        self.optlevel = optlevel
        """The optimization level for this index."""
        self.tmp_dir = tmp_dir
        """The directory for the temporary files."""
        self.expectedrows = expectedrows
        """The expected number of items of index arrays."""
        if byteorder in ["little", "big"]:
            self.byteorder = byteorder
        else:
            self.byteorder = sys.byteorder
        """The byteorder of the index datasets."""
        if atom is not None:
            self.dtype = atom.dtype.base
            self.type = atom.type
            """The datatypes to be stored by the sorted index array."""
            # ############## Important note ###########################
            # The datatypes saved as index values are NumPy native
            # types, so we get rid of type metainfo like Time* or Enum*
            # that belongs to HDF5 types (actually, this metainfo is
            # not needed for sorting and looking-up purposes).
            # #########################################################
            indsize = {
                'ultralight': 1, 'light': 2, 'medium': 4, 'full': 8}[kind]
            assert indsize in (1, 2, 4, 8), "indsize should be 1, 2, 4 or 8!"
            self.indsize = indsize
            """The itemsize for the indices part of the index."""

        self.nrows = None
        """The total number of slices in the index."""
        self.nelements = None
        """The number of currently indexed rows for this column."""
        self.blocksizes = blocksizes
        """The four main sizes of the compound blocks (if specified)."""
        self.dirtycache = True
        """Dirty cache (for ranges, bounds & sorted) flag."""
        self.superblocksize = None
        """Size of the superblock for this index."""
        self.blocksize = None
        """Size of the block for this index."""
        self.slicesize = None
        """Size of the slice for this index."""
        self.chunksize = None
        """Size of the chunk for this index."""
        self.tmpfilename = None
        """Filename for temporary bounds."""
        self.opt_search_types = opt_search_types
        """The types for which an optimized search has been implemented."""
        self.noverlaps = -1
        """The number of overlaps in an index.  0 means a completely
        sorted index.  -1 means that this number is not computed yet."""
        self.tprof = 0
        """Time counter for benchmarking purposes."""

        from .file import open_file
        self._openFile = open_file
        """The `open_file()` function, to avoid a circular import."""

        super().__init__(parentnode, name, title, new, filters)


    def _g_post_init_hook(self):
        if self._v_new:
            # The version for newly created indexes
            self._v_version = obversion
        super()._g_post_init_hook()

        # Index arrays must only be created for new indexes
        if not self._v_new:
            idxversion = self._v_version
            # Set-up some variables from info on disk and return
            attrs = self._v_attrs
            # Coerce NumPy scalars to Python scalars in order
            # to avoid undesired upcasting operations.
            self.superblocksize = int(attrs.superblocksize)
            self.blocksize = int(attrs.blocksize)
            self.slicesize = int(attrs.slicesize)
            self.chunksize = int(attrs.chunksize)
            self.blocksizes = (self.superblocksize, self.blocksize,
                               self.slicesize, self.chunksize)
            self.optlevel = int(attrs.optlevel)
            sorted = self.sorted
            indices = self.indices
            self.dtype = sorted.atom.dtype
            self.type = sorted.atom.type
            self.indsize = indices.atom.itemsize
            # Some sanity checks for slicesize, chunksize and indsize
            assert self.slicesize == indices.shape[1], "Wrong slicesize"
            assert self.chunksize == indices._v_chunkshape[1], \
                "Wrong chunksize"
            assert self.indsize in (1, 2, 4, 8), "Wrong indices itemsize"
            if idxversion > "2.0":
                self.reduction = int(attrs.reduction)
                nelementsSLR = int(self.sortedLR.attrs.nelements)
                nelementsILR = int(self.indicesLR.attrs.nelements)
            else:
                self.reduction = 1
                nelementsILR = self.indicesLR[-1]
                nelementsSLR = nelementsILR
            self.nrows = sorted.nrows
            self.nelements = self.nrows * self.slicesize + nelementsILR
            self.nelementsSLR = nelementsSLR
            self.nelementsILR = nelementsILR
            if nelementsILR > 0:
                self.nrows += 1
            # Get the bounds as a cache (this has to remain here!)
            rchunksize = self.chunksize // self.reduction
            nboundsLR = (nelementsSLR - 1) // rchunksize
            if nboundsLR < 0:
                nboundsLR = 0  # correction for -1 bounds
            nboundsLR += 2  # bounds + begin + end
            # All bounds values (+begin + end) are at the end of sortedLR
            self.bebounds = self.sortedLR[
                nelementsSLR:nelementsSLR + nboundsLR]
            return

        # The index is new.  Initialize the values
        self.nrows = 0
        self.nelements = 0
        self.nelementsSLR = 0
        self.nelementsILR = 0

        # The atom
        atom = Atom.from_dtype(self.dtype)

        # The filters
        filters = self.filters

        # Compute the superblocksize, blocksize, slicesize and chunksize
        # values (in case these parameters haven't been passed to the
        # constructor)
        if self.blocksizes is None:
            self.blocksizes = calc_chunksize(
                self.expectedrows, self.optlevel, self.indsize, node=self)
        (self.superblocksize, self.blocksize,
         self.slicesize, self.chunksize) = self.blocksizes
        if debug:
            print("blocksizes:", self.blocksizes)
        # Compute the reduction level
        self.reduction = get_reduction_level(
            self.indsize, self.optlevel, self.slicesize, self.chunksize)
        rchunksize = self.chunksize // self.reduction
        rslicesize = self.slicesize // self.reduction

        # Save them on disk as attributes
        self._v_attrs.superblocksize = np.uint64(self.superblocksize)
        self._v_attrs.blocksize = np.uint64(self.blocksize)
        self._v_attrs.slicesize = np.uint32(self.slicesize)
        self._v_attrs.chunksize = np.uint32(self.chunksize)
        # Save the optlevel as well
        self._v_attrs.optlevel = self.optlevel
        # Save the reduction level
        self._v_attrs.reduction = self.reduction

        # Create the IndexArray for sorted values
        sorted = IndexArray(self, 'sorted', atom, "Sorted Values",
                            filters, self.byteorder)

        # Create the IndexArray for index values
        IndexArray(self, 'indices', UIntAtom(itemsize=self.indsize),
                   "Number of chunk in table", filters, self.byteorder)

        # Create the cache for range values (1st order cache)
        CacheArray(self, 'ranges', atom, (0, 2), "Range Values", filters,
                   self.expectedrows // self.slicesize,
                   byteorder=self.byteorder)
        # median ranges
        EArray(self, 'mranges', atom, (0,), "Median ranges", filters,
               byteorder=self.byteorder, _log=False)

        # Create the cache for boundary values (2nd order cache)
        nbounds_inslice = (rslicesize - 1) // rchunksize
        CacheArray(self, 'bounds', atom, (0, nbounds_inslice),
                   "Boundary Values", filters, self.nchunks,
                   (1, nbounds_inslice), byteorder=self.byteorder)

        # begin, end & median bounds (only for numerical types)
        EArray(self, 'abounds', atom, (0,), "Start bounds", filters,
               byteorder=self.byteorder, _log=False)
        EArray(self, 'zbounds', atom, (0,), "End bounds", filters,
               byteorder=self.byteorder, _log=False)
        EArray(self, 'mbounds', atom, (0,), "Median bounds", filters,
               byteorder=self.byteorder, _log=False)

        # Create the Array for last (sorted) row values + bounds
        shape = (rslicesize + 2 + nbounds_inslice,)
        sortedLR = LastRowArray(self, 'sortedLR', atom, shape,
                                "Last Row sorted values + bounds",
                                filters, (rchunksize,),
                                byteorder=self.byteorder)

        # Create the Array for the number of chunk in last row
        shape = (self.slicesize,)  # enough for indexes and length
        indicesLR = LastRowArray(self, 'indicesLR',
                                 UIntAtom(itemsize=self.indsize),
                                 shape, "Last Row indices",
                                 filters, (self.chunksize,),
                                 byteorder=self.byteorder)

        # The number of elements in LR will be initialized here
        sortedLR.attrs.nelements = 0
        indicesLR.attrs.nelements = 0

        # All bounds values (+begin + end) are uninitialized at creation time
        self.bebounds = None

        # The starts and lengths initialization
        self.starts = np.empty(shape=self.nrows, dtype=np.int32)
        """Where the values fulfilling conditions start for every slice."""
        self.lengths = np.empty(shape=self.nrows, dtype=np.int32)
        """Lengths of the values fulfilling conditions for every slice."""

        # Finally, create a temporary file for indexes if needed
        if self.temp_required:
            self.create_temp()


    def initial_append(self, xarr, nrow, reduction):
        """Compute an initial indices array for data to be indexed."""

        if profile:
            tref = clock()
        if profile:
            show_stats("Entering initial_append", tref)
        arr = xarr.pop()
        indsize = self.indsize
        slicesize = self.slicesize
        nelementsILR = self.nelementsILR
        if profile:
            show_stats("Before creating idx", tref)
        if indsize == 8:
            idx = np.arange(0, len(arr), dtype="uint64") + nrow * slicesize
        elif indsize == 4:
            # For medium (32-bit) all the rows in tables should be
            # directly reachable.  But as len(arr) < 2**31, we can
            # choose uint32 for representing indices.  In this way, we
            # consume far less memory during the keysort process.  The
            # offset will be added in self.final_idx32() later on.
            #
            # This optimization also prevents the values in LR from
            # participating in the ``swap_chunks`` process, and this is
            # the main reason not to allow the medium indexes to create
            # completely sorted indexes.  However, I don't find this to
            # be a big limitation, as probably full indexes are much
            # more suitable for producing completely sorted indexes
            # because in this case the indices part is usable for
            # getting the reverse indices of the index, and I foresee
            # this to be a common requirement in many operations (for
            # example, in table sorts).
            #
            # F. Alted 2008-09-15
            idx = np.arange(0, len(arr), dtype="uint32")
        else:
            idx = np.empty(len(arr), "uint%d" % (indsize * 8))
            lbucket = self.lbucket
            # Fill the idx with the bucket indices
            offset = int(lbucket - ((nrow * (slicesize % lbucket)) % lbucket))
            idx[0:offset] = 0
            for i in range(offset, slicesize, lbucket):
                idx[i:i + lbucket] = (i + lbucket - 1) // lbucket
            if indsize == 2:
                # Add a second offset in this case
                # First normalize the number of rows
                offset2 = (nrow % self.nslicesblock) * slicesize // lbucket
                idx += offset2
        # Add the last row at the beginning of arr & idx (if needed)
        if (indsize == 8 and nelementsILR > 0):
            # It is possible that the values in LR are already sorted.
            # Fetch them and override existing values in arr and idx.
            assert len(arr) > nelementsILR
            self.read_slice_lr(self.sortedLR, arr[:nelementsILR])
            self.read_slice_lr(self.indicesLR, idx[:nelementsILR])
        # In-place sorting
        if profile:
            show_stats("Before keysort", tref)
        indexesextension.keysort(arr, idx)
        larr = arr[-1]
        if reduction > 1:
            # It's important to do a copy() here in order to ensure that
            # sorted._append() will receive a contiguous array.
            if profile:
                show_stats("Before reduction", tref)
            reduc = arr[::reduction].copy()
            if profile:
                show_stats("After reduction", tref)
            arr = reduc
            if profile:
                show_stats("After arr <-- reduc", tref)
        # A completely sorted index is no longer possible after an
        # append to an index that already has one slice.
        if nrow > 0:
            self._v_attrs.is_csi = False
        if profile:
            show_stats("Exiting initial_append", tref)
        return larr, arr, idx


    def final_idx32(self, idx, offset):
        """Perform final operations on 32-bit indices."""

        if profile:
            tref = clock()
        if profile:
            show_stats("Entering final_idx32", tref)
        # Do an upcast first in order to add the offset.
        idx = idx.astype('uint64')
        idx += offset
        # The next partition is valid up to table sizes of
        # 2**30 * 2**18 = 2**48 bytes, that is, 256 tera-elements,
        # which should be a safe figure, at least for a while.
        idx //= self.lbucket
        # After the division, we can downsize the indexes to 'uint32'
        idx = idx.astype('uint32')
        if profile:
            show_stats("Exiting final_idx32", tref)
        return idx
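
    # Illustrative figures (hypothetical): with lbucket = 512, an absolute
    # row coordinate of 3_000_000_000 (too big for uint32) maps to bucket
    # 3_000_000_000 // 512 = 5_859_375, which fits comfortably in a uint32.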


    def append(self, xarr, update=False):
        """Append the array to the index objects."""

        if profile:
            tref = clock()
        if profile:
            show_stats("Entering append", tref)
        if not update and self.temp_required:
            where = self.tmp
            # The reduction will take place *after* the optimization process
            reduction = 1
        else:
            where = self
            reduction = self.reduction
        sorted = where.sorted
        indices = where.indices
        ranges = where.ranges
        mranges = where.mranges
        bounds = where.bounds
        mbounds = where.mbounds
        abounds = where.abounds
        zbounds = where.zbounds
        sortedLR = where.sortedLR
        indicesLR = where.indicesLR
        nrows = sorted.nrows  # before sorted.append()
        larr, arr, idx = self.initial_append(xarr, nrows, reduction)
        # Save the sorted array
        sorted.append(arr.reshape(1, arr.size))
        cs = self.chunksize // reduction
        ncs = self.nchunkslice
        # Save ranges & bounds
        ranges.append([[arr[0], larr]])
        bounds.append([arr[cs::cs]])
        abounds.append(arr[0::cs])
        zbounds.append(arr[cs - 1::cs])
        # Compute the medians
        smedian = arr[cs // 2::cs]
        mbounds.append(smedian)
        mranges.append([smedian[ncs // 2]])
        if profile:
            show_stats("Before deleting arr & smedian", tref)
        del arr, smedian  # delete references
        if profile:
            show_stats("After deleting arr & smedian", tref)
        # Now that arr is gone, we can upcast the indices and add the offset
        if self.indsize == 4:
            idx = self.final_idx32(idx, nrows * self.slicesize)
        indices.append(idx.reshape(1, idx.size))
        if profile:
            show_stats("Before deleting idx", tref)
        del idx
        # Update counters after a successful append
        self.nrows = nrows + 1
        self.nelements = self.nrows * self.slicesize
        self.nelementsSLR = 0  # reset the counter of the last row index to 0
        self.nelementsILR = 0  # reset the counter of the last row index to 0
        # The number of elements will be saved as an attribute.
        # This is necessary so that the LR arrays can remember their values
        # after a possible node preemption/reload.
        sortedLR.attrs.nelements = self.nelementsSLR
        indicesLR.attrs.nelements = self.nelementsILR
        self.dirtycache = True  # the cache is dirty now
        if profile:
            show_stats("Exiting append", tref)


    def append_last_row(self, xarr, update=False):
        """Append the array to the last row index objects."""

        if profile:
            tref = clock()
        if profile:
            show_stats("Entering appendLR", tref)
        # compute the elements in the last row sorted & bounds array
        nrows = self.nslices
        if not update and self.temp_required:
            where = self.tmp
            # The reduction will take place *after* the optimization process
            reduction = 1
        else:
            where = self
            reduction = self.reduction
        indicesLR = where.indicesLR
        sortedLR = where.sortedLR
        larr, arr, idx = self.initial_append(xarr, nrows, reduction)
        nelementsSLR = len(arr)
        nelementsILR = len(idx)
        # Build the cache of bounds
        rchunksize = self.chunksize // reduction
        self.bebounds = np.concatenate((arr[::rchunksize], [larr]))
        # The number of elements will be saved as an attribute
        sortedLR.attrs.nelements = nelementsSLR
        indicesLR.attrs.nelements = nelementsILR
        # Save the number of elements, bounds and sorted values
        # at the end of the sorted array
        offset2 = len(self.bebounds)
        sortedLR[nelementsSLR:nelementsSLR + offset2] = self.bebounds
        sortedLR[:nelementsSLR] = arr
        del arr
        # Now that arr is gone, we can upcast the indices and add the offset
        if self.indsize == 4:
            idx = self.final_idx32(idx, nrows * self.slicesize)
        # Save the reverse index array
        indicesLR[:len(idx)] = idx
        del idx
        # Update counters after a successful append
        self.nrows = nrows + 1
        self.nelements = nrows * self.slicesize + nelementsILR
        self.nelementsILR = nelementsILR
        self.nelementsSLR = nelementsSLR
        self.dirtycache = True  # the cache is dirty now
        if profile:
            show_stats("Exiting appendLR", tref)


    def optimize(self, verbose=False):
        """Optimize an index so as to allow faster searches.

        verbose
            If True, messages about the progress of the
            optimization process are printed out.

        """

        if not self.temp_required:
            return

        if verbose:
            self.verbose = True
        else:
            self.verbose = debug

        # Initialize last_tover and last_nover
        self.last_tover = 0
        self.last_nover = 0

        # Compute the correct optimizations for the current optim level
        opts = calcoptlevels(self.nblocks, self.optlevel, self.indsize)
        optmedian, optstarts, optstops, optfull = opts

        if debug:
            print("optvalues:", opts)

        self.create_temp2()
        # Start the optimization process
        while True:
            if optfull:
                for niter in range(optfull):
                    if self.swap('chunks', 'median'):
                        break
                    if self.nblocks > 1:
                        # Swap slices only in the case that we have
                        # several blocks
                        if self.swap('slices', 'median'):
                            break
                        if self.swap('chunks', 'median'):
                            break
                    if self.swap('chunks', 'start'):
                        break
                    if self.swap('chunks', 'stop'):
                        break
            else:
                if optmedian:
                    if self.swap('chunks', 'median'):
                        break
                if optstarts:
                    if self.swap('chunks', 'start'):
                        break
                if optstops:
                    if self.swap('chunks', 'stop'):
                        break
            break  # If we reach this, exit the loop

        # Check if we require a complete sort.  Important: this step
        # should be carried out *after* the optimization process has
        # been completed (this is to guarantee that the complete sort
        # does not take too much memory).
        if self.want_complete_sort:
            if self.noverlaps > 0:
                self.do_complete_sort()
            # Check that we have effectively achieved the complete sort
            if self.noverlaps > 0:
                warnings.warn(
                    "OPSI was not able to achieve a completely sorted index."
                    " Please report this to the authors.", UserWarning)

        # Close and delete the temporary optimization index file
        self.cleanup_temp()
        return


    def do_complete_sort(self):
        """Bring an already optimized index into a completely sorted state."""

        if self.verbose:
            t1 = clock()
            c1 = cpuclock()
        ss = self.slicesize
        tmp = self.tmp
        ranges = tmp.ranges[:]
        nslices = self.nslices

        nelementsLR = self.nelementsILR
        if nelementsLR > 0:
            # Add the ranges corresponding to the last row
            rangeslr = np.array([self.bebounds[0], self.bebounds[-1]])
            ranges = np.concatenate((ranges, [rangeslr]))
            nslices += 1

        sorted = tmp.sorted
        indices = tmp.indices
        sortedLR = tmp.sortedLR
        indicesLR = tmp.indicesLR
        sremain = np.array([], dtype=self.dtype)
        iremain = np.array([], dtype='u%d' % self.indsize)
        starts = np.zeros(shape=nslices, dtype=np.int_)
        for i in range(nslices):
            # Find the overlapping elements for slice i
            sover = np.array([], dtype=self.dtype)
            iover = np.array([], dtype='u%d' % self.indsize)
            prev_end = ranges[i, 1]
            for j in range(i + 1, nslices):
                stj = starts[j]
                if ((j < self.nslices and stj == ss) or
                        (j == self.nslices and stj == nelementsLR)):
                    # This slice has already been dealt with
                    continue
                if j < self.nslices:
                    assert stj < ss, \
                        "Two slices cannot overlap completely at this stage!"
                    next_beg = sorted[j, stj]
                else:
                    assert stj < nelementsLR, \
                        "Two slices cannot overlap completely at this stage!"
                    next_beg = sortedLR[stj]
                next_end = ranges[j, 1]
                if prev_end > next_end:
                    # Complete overlapping case
                    if j < self.nslices:
                        sover = np.concatenate((sover, sorted[j, stj:]))
                        iover = np.concatenate((iover, indices[j, stj:]))
                        starts[j] = ss
                    else:
                        n = nelementsLR
                        sover = np.concatenate((sover, sortedLR[stj:n]))
                        iover = np.concatenate((iover, indicesLR[stj:n]))
                        starts[j] = nelementsLR
                elif prev_end > next_beg:
                    idx = self.search_item_lt(tmp, prev_end, j, ranges[j], stj)
                    if j < self.nslices:
                        sover = np.concatenate((sover, sorted[j, stj:idx]))
                        iover = np.concatenate((iover, indices[j, stj:idx]))
                    else:
                        sover = np.concatenate((sover, sortedLR[stj:idx]))
                        iover = np.concatenate((iover, indicesLR[stj:idx]))
                    starts[j] = idx
            # Build the extended slices to sort out
            if i < self.nslices:
                ssorted = np.concatenate(
                    (sremain, sorted[i, starts[i]:], sover))
                sindices = np.concatenate(
                    (iremain, indices[i, starts[i]:], iover))
            else:
                ssorted = np.concatenate(
                    (sremain, sortedLR[starts[i]:nelementsLR], sover))
                sindices = np.concatenate(
                    (iremain, indicesLR[starts[i]:nelementsLR], iover))
            # Sort the extended slices
            indexesextension.keysort(ssorted, sindices)
            # Save the first elements of extended slices in the slice i
            if i < self.nslices:
                sorted[i] = ssorted[:ss]
                indices[i] = sindices[:ss]
                # Update caches for this slice
                self.update_caches(i, ssorted[:ss])
                # Save the remaining values in a separate array
                send = len(sover) + len(sremain)
                sremain = ssorted[ss:ss + send]
                iremain = sindices[ss:ss + send]
            else:
                # Still some elements remain for the last row
                n = len(ssorted)
                assert n == nelementsLR
                send = 0
                sortedLR[:n] = ssorted
                indicesLR[:n] = sindices
                # Update the caches for the last row
                sortedlr = sortedLR[:nelementsLR]
                bebounds = np.concatenate(
                    (sortedlr[::self.chunksize], [sortedlr[-1]]))
                sortedLR[nelementsLR:nelementsLR + len(bebounds)] = bebounds
                self.bebounds = bebounds

        # Verify that we have dealt with all the remaining values
        assert send == 0

        # Compute the overlaps in order to verify that we have achieved
        # a complete sort.  This has to be executed always (and not only
        # in verbose mode!).
        self.compute_overlaps(self.tmp, "do_complete_sort()", self.verbose)
        if self.verbose:
            print(f"time: {clock() - t1:.4f}. clock: {cpuclock() - c1:.4f}")


    def swap(self, what, mode=None):
        """Swap chunks or slices using a certain bounds reference."""

        # Thresholds for avoiding continuing the optimization
        # thnover = 4 * self.slicesize  # minimum number of overlapping
        #                                 elements
        thnover = 40
        thmult = 0.1    # minimum ratio of multiplicity (10%)
        thtover = 0.01  # minimum overlapping index for slices (1%)

        if self.verbose:
            t1 = clock()
            c1 = cpuclock()
        if what == "chunks":
            self.swap_chunks(mode)
        elif what == "slices":
            self.swap_slices(mode)
        if mode:
            message = f"swap_{what}({mode})"
        else:
            message = f"swap_{what}"
        (nover, mult, tover) = self.compute_overlaps(
            self.tmp, message, self.verbose)
        rmult = len(mult.nonzero()[0]) / len(mult)
        if self.verbose:
            print(f"time: {clock() - t1:.4f}. clock: {cpuclock() - c1:.4f}")
        # Check that entropy is actually decreasing
        if what == "chunks" and self.last_tover > 0 and self.last_nover > 0:
            tover_var = (self.last_tover - tover) / self.last_tover
            nover_var = (self.last_nover - nover) / self.last_nover
            if tover_var < 0.05 and nover_var < 0.05:
                # Less than a 5% improvement is too little
                return True
        self.last_tover = tover
        self.last_nover = nover
        # Check if some threshold has been met
        if nover < thnover:
            return True
        if rmult < thmult:
            return True
        # Additional check for the overlap ratio
        if 0 <= tover < thtover:
            return True
        return False


    def create_temp(self):
        """Create some temporary objects for slice sorting purposes."""

        # The index will be dirty during the index optimization process
        self.dirty = True
        # Build the name of the temporary file
        fd, self.tmpfilename = tempfile.mkstemp(
            ".tmp", "pytables-", self.tmp_dir)
        # Close the file descriptor so as to avoid leaks
        os.close(fd)
        # Create the proper PyTables file
        self.tmpfile = self._openFile(self.tmpfilename, "w")
        self.tmp = tmp = self.tmpfile.root
        cs = self.chunksize
        ss = self.slicesize
        filters = self.filters
        # temporary sorted & indices arrays
        shape = (0, ss)
        atom = Atom.from_dtype(self.dtype)
        EArray(tmp, 'sorted', atom, shape,
               "Temporary sorted", filters, chunkshape=(1, cs))
        EArray(tmp, 'indices', UIntAtom(itemsize=self.indsize), shape,
               "Temporary indices", filters, chunkshape=(1, cs))
        # temporary bounds
        nbounds_inslice = (ss - 1) // cs
        shape = (0, nbounds_inslice)
        EArray(tmp, 'bounds', atom, shape, "Temp chunk bounds",
               filters, chunkshape=(cs, nbounds_inslice))
        shape = (0,)
        EArray(tmp, 'abounds', atom, shape, "Temp start bounds",
               filters, chunkshape=(cs,))
        EArray(tmp, 'zbounds', atom, shape, "Temp end bounds",
               filters, chunkshape=(cs,))
        EArray(tmp, 'mbounds', atom, shape, "Median bounds",
               filters, chunkshape=(cs,))
        # temporary ranges
        EArray(tmp, 'ranges', atom, (0, 2),
               "Temporary range values", filters, chunkshape=(cs, 2))
        EArray(tmp, 'mranges', atom, (0,),
               "Median ranges", filters, chunkshape=(cs,))
        # temporary last row (sorted)
        shape = (ss + 2 + nbounds_inslice,)
        CArray(tmp, 'sortedLR', atom, shape,
               "Temp Last Row sorted values + bounds",
               filters, chunkshape=(cs,))
        # temporary last row (indices)
        shape = (ss,)
        CArray(tmp, 'indicesLR',
               UIntAtom(itemsize=self.indsize),
               shape, "Temp Last Row indices",
               filters, chunkshape=(cs,))


    def create_temp2(self):
        """Create some secondary temporary objects for slice sorting
        purposes."""

        # The algorithms for doing the swap can be optimized so that it
        # should only be necessary to create temporaries for keeping just
        # the contents of a single superblock.
        # F. Alted 2007-01-03
        cs = self.chunksize
        ss = self.slicesize
        filters = self.filters
        # temporary sorted & indices arrays
        shape = (self.nslices, ss)
        atom = Atom.from_dtype(self.dtype)
        tmp = self.tmp
        CArray(tmp, 'sorted2', atom, shape,
               "Temporary sorted 2", filters, chunkshape=(1, cs))
        CArray(tmp, 'indices2', UIntAtom(itemsize=self.indsize), shape,
               "Temporary indices 2", filters, chunkshape=(1, cs))
        # temporary bounds
        nbounds_inslice = (ss - 1) // cs
        shape = (self.nslices, nbounds_inslice)
        CArray(tmp, 'bounds2', atom, shape, "Temp chunk bounds 2",
               filters, chunkshape=(cs, nbounds_inslice))
        shape = (self.nchunks,)
        CArray(tmp, 'abounds2', atom, shape, "Temp start bounds 2",
               filters, chunkshape=(cs,))
        CArray(tmp, 'zbounds2', atom, shape, "Temp end bounds 2",
               filters, chunkshape=(cs,))
        CArray(tmp, 'mbounds2', atom, shape, "Median bounds 2",
               filters, chunkshape=(cs,))
        # temporary ranges
        CArray(tmp, 'ranges2', atom, (self.nslices, 2),
               "Temporary range values 2", filters, chunkshape=(cs, 2))
        CArray(tmp, 'mranges2', atom, (self.nslices,),
               "Median ranges 2", filters, chunkshape=(cs,))


    def cleanup_temp(self):
        """Copy the data and delete the temporaries for sorting purposes."""

        if self.verbose:
            print("Copying temporary data...")
        # tmp -> index
        reduction = self.reduction
        cs = self.chunksize // reduction
        ncs = self.nchunkslice
        tmp = self.tmp
        for i in range(self.nslices):
            # Copy sorted & indices slices
            sorted = tmp.sorted[i][::reduction].copy()
            self.sorted.append(sorted.reshape(1, sorted.size))
            # Compute ranges
            self.ranges.append([[sorted[0], sorted[-1]]])
            # Compute chunk bounds
            self.bounds.append([sorted[cs::cs]])
            # Compute start, stop & median bounds and ranges
            self.abounds.append(sorted[0::cs])
            self.zbounds.append(sorted[cs - 1::cs])
            smedian = sorted[cs // 2::cs]
            self.mbounds.append(smedian)
            self.mranges.append([smedian[ncs // 2]])
            del sorted, smedian  # delete references
            # Now that sorted is gone, we can copy the indices
            indices = tmp.indices[i]
            self.indices.append(indices.reshape(1, indices.size))

        # Now it is the last row's turn (if needed)
        if self.nelementsSLR > 0:
            # First, the sorted values
            sortedLR = self.sortedLR
            indicesLR = self.indicesLR
            nelementsLR = self.nelementsILR
            sortedlr = tmp.sortedLR[:nelementsLR][::reduction].copy()
            nelementsSLR = len(sortedlr)
            sortedLR[:nelementsSLR] = sortedlr
            # Now, the bounds
            self.bebounds = np.concatenate((sortedlr[::cs], [sortedlr[-1]]))
            offset2 = len(self.bebounds)
            sortedLR[nelementsSLR:nelementsSLR + offset2] = self.bebounds
            # Finally, the indices
            indicesLR[:] = tmp.indicesLR[:]
            # Update the number of (reduced) sorted elements
            self.nelementsSLR = nelementsSLR
            # The number of elements will be saved as an attribute
            self.sortedLR.attrs.nelements = self.nelementsSLR
            self.indicesLR.attrs.nelements = self.nelementsILR

        if self.verbose:
            print("Deleting temporaries...")
        self.tmp = None
        self.tmpfile.close()
        Path(self.tmpfilename).unlink()
        self.tmpfilename = None

        # The optimization process has finished, and the index is OK now
        self.dirty = False
        # ...but the memory data cache is dirty now
        self.dirtycache = True


    def get_neworder(self, neworder, src_disk, tmp_disk,
                     lastrow, nslices, offset, dtype):
        """Get sorted & indices values in new order."""

        cs = self.chunksize
        ncs = ncs2 = self.nchunkslice
        self_nslices = self.nslices
        tmp = np.empty(shape=self.slicesize, dtype=dtype)
        for i in range(nslices):
            ns = offset + i
            if ns == self_nslices:
                # The number of complete chunks in the last row
                ncs2 = self.nelementsILR // cs
            # Get slices in new order
            for j in range(ncs2):
                idx = neworder[i * ncs + j]
                ins = idx // ncs
                inc = (idx - ins * ncs) * cs
                ins += offset
                nc = j * cs
                if ins == self_nslices:
                    tmp[nc:nc + cs] = lastrow[inc:inc + cs]
                else:
                    tmp[nc:nc + cs] = src_disk[ins, inc:inc + cs]
            if ns == self_nslices:
                # The number of complete chunks in the last row
                lastrow[:ncs2 * cs] = tmp[:ncs2 * cs]
                # The elements in the last chunk of the last row will
                # participate in the global reordering later on, during
                # the phase of sorting of *two* slices at a time
                # (including the last row slice, see
                # self.reorder_slices()).  The caches for the last row
                # will be updated in self.reorder_slices() too.
                # F. Altet 2008-08-25
            else:
                tmp_disk[ns] = tmp


    def swap_chunks(self, mode="median"):
        """Swap & reorder the different chunks in a block."""

        boundsnames = {
            'start': 'abounds', 'stop': 'zbounds', 'median': 'mbounds'}
        tmp = self.tmp
        sorted = tmp.sorted
        indices = tmp.indices
        tmp_sorted = tmp.sorted2
        tmp_indices = tmp.indices2
        sortedLR = tmp.sortedLR
        indicesLR = tmp.indicesLR
        cs = self.chunksize
        ncs = self.nchunkslice
        nsb = self.nslicesblock
        ncb = ncs * nsb
        ncb2 = ncb
        boundsobj = tmp._f_get_child(boundsnames[mode])
        can_cross_bbounds = (self.indsize == 8 and self.nelementsILR > 0)
        for nblock in range(self.nblocks):
            # Protection for the last block having fewer chunks than ncb
            remainingchunks = self.nchunks - nblock * ncb
            if remainingchunks < ncb:
                ncb2 = remainingchunks
            if ncb2 <= 1:
                # if only zero or one chunk remains we are done
                break
            nslices = ncb2 // ncs
            bounds = boundsobj[nblock * ncb:nblock * ncb + ncb2]
            # Do this only if lastrow elements can cross block boundaries
            if (nblock == self.nblocks - 1 and  # last block
                    can_cross_bbounds):
                nslices += 1
                ul = self.nelementsILR // cs
                bounds = np.concatenate((bounds, self.bebounds[:ul]))
            sbounds_idx = bounds.argsort(kind=defsort)
            offset = int(nblock * nsb)
            # Swap sorted and indices following the new order
            self.get_neworder(sbounds_idx, sorted, tmp_sorted, sortedLR,
                              nslices, offset, self.dtype)
            self.get_neworder(sbounds_idx, indices, tmp_indices, indicesLR,
                              nslices, offset, 'u%d' % self.indsize)
        # Reorder completely the index at slice level
        self.reorder_slices(tmp=True)


    def read_slice(self, where, nslice, buffer, start=0):
        """Read a slice from the `where` dataset and put it in `buffer`."""

        # Create the buffers for specifying the coordinates
        self.startl = np.array([nslice, start], np.uint64)
        self.stopl = np.array([nslice + 1, start + buffer.size], np.uint64)
        self.stepl = np.ones(shape=2, dtype=np.uint64)
        where._g_read_slice(self.startl, self.stopl, self.stepl, buffer)

    def write_slice(self, where, nslice, buffer, start=0):
        """Write a slice to the `where` dataset with the `buffer` data."""

        self.startl = np.array([nslice, start], np.uint64)
        self.stopl = np.array([nslice + 1, start + buffer.size], np.uint64)
        self.stepl = np.ones(shape=2, dtype=np.uint64)
        countl = self.stopl - self.startl  # (1, self.slicesize)
        where._g_write_slice(self.startl, self.stepl, countl, buffer)

    # Read version for LastRow
    def read_slice_lr(self, where, buffer, start=0):
        """Read a slice from the `where` dataset and put it in `buffer`."""

        startl = np.array([start], dtype=np.uint64)
        stopl = np.array([start + buffer.size], dtype=np.uint64)
        stepl = np.array([1], dtype=np.uint64)
        where._g_read_slice(startl, stopl, stepl, buffer)

    # Write version for LastRow
    def write_sliceLR(self, where, buffer, start=0):
        """Write a slice to the `where` dataset with the `buffer` data."""

        startl = np.array([start], dtype=np.uint64)
        countl = np.array([start + buffer.size], dtype=np.uint64)
        stepl = np.array([1], dtype=np.uint64)
        where._g_write_slice(startl, stepl, countl, buffer)


    def reorder_slice(self, nslice, sorted, indices, ssorted, sindices,
                      tmp_sorted, tmp_indices):
        """Copy & reorder the slice in source to final destination."""

        ss = self.slicesize
        # Load the second part in buffers
        self.read_slice(tmp_sorted, nslice, ssorted[ss:])
        self.read_slice(tmp_indices, nslice, sindices[ss:])
        indexesextension.keysort(ssorted, sindices)
        # Write the first part of the buffers to the regular leaves
        self.write_slice(sorted, nslice - 1, ssorted[:ss])
        self.write_slice(indices, nslice - 1, sindices[:ss])
        # Update caches
        self.update_caches(nslice - 1, ssorted[:ss])
        # Shift the slice at the end to the beginning
        ssorted[:ss] = ssorted[ss:]
        sindices[:ss] = sindices[ss:]


    def update_caches(self, nslice, ssorted):
        """Update the caches for faster lookups."""

        cs = self.chunksize
        ncs = self.nchunkslice
        tmp = self.tmp
        # update first & second cache bounds (ranges & bounds)
        tmp.ranges[nslice] = ssorted[[0, -1]]
        tmp.bounds[nslice] = ssorted[cs::cs]
        # update start & stop bounds
        tmp.abounds[nslice * ncs:(nslice + 1) * ncs] = ssorted[0::cs]
        tmp.zbounds[nslice * ncs:(nslice + 1) * ncs] = ssorted[cs - 1::cs]
        # update median bounds
        smedian = ssorted[cs // 2::cs]
        tmp.mbounds[nslice * ncs:(nslice + 1) * ncs] = smedian
        tmp.mranges[nslice] = smedian[ncs // 2]


    def reorder_slices(self, tmp):
        """Reorder completely the index at slice level.

        This method has to maintain the locality of elements in the
        ambit of ``blocks``, i.e. an element of a ``block`` cannot be
        sent to another ``block`` during this reordering.  This is
        *critical* for ``light`` indexes to be able to use this.

        This version of reorder_slices is optimized in that *two*
        complete slices are taken at a time (including the last row
        slice) so as to sort them.  Then, each new slice that is read is
        put at the end of this two-slice buffer, while the previous one
        is moved to the beginning of the buffer.  This is in order to
        better reduce the entropy of the regular part (i.e. all except
        the last row) of the index.

        A secondary effect of this is that it takes at least *twice*
        the memory of a previous version of reorder_slices() that only
        reordered on a slice-by-slice basis.  However, as this is more
        efficient than the old version, one can configure the slicesize
        to be smaller, so the memory consumption is barely similar.

        """
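
        # Sketch of the two-slice scheme (illustrative figures): with
        # ss = 4 and consecutive slices A = [1, 3, 5, 7] and
        # B = [2, 4, 6, 8], the buffer holds [1, 3, 5, 7, 2, 4, 6, 8];
        # keysort turns it into [1, 2, 3, 4, 5, 6, 7, 8], the first half
        # is written back as the new slice A, and the second half stays
        # in the buffer to be merged with the next slice read.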


        # The ``tmp`` argument selects the source arrays: when true, read
        # from the secondary (already swapped) temporaries.  Note that the
        # temporary file root is accessed through ``self.tmp`` so that the
        # flag is not shadowed.
        sorted = self.tmp.sorted
        indices = self.tmp.indices
        if tmp:
            tmp_sorted = self.tmp.sorted2
            tmp_indices = self.tmp.indices2
        else:
            tmp_sorted = self.tmp.sorted
            tmp_indices = self.tmp.indices
        cs = self.chunksize
        ss = self.slicesize
        nsb = self.blocksize // self.slicesize
        nslices = self.nslices
        nblocks = self.nblocks
        nelementsLR = self.nelementsILR
        # Create the buffer for reordering 2 slices at a time
        ssorted = np.empty(shape=ss * 2, dtype=self.dtype)
        sindices = np.empty(shape=ss * 2,
                            dtype=np.dtype('u%d' % self.indsize))

        if self.indsize == 8:
            # Bootstrap the process for reordering
            # Read the first slice in buffers
            self.read_slice(tmp_sorted, 0, ssorted[:ss])
            self.read_slice(tmp_indices, 0, sindices[:ss])

            nslice = 0  # Just in case the loop below executes nothing
            # Loop over the remaining slices
            for nslice in range(1, sorted.nrows):
                self.reorder_slice(nslice, sorted, indices,
                                   ssorted, sindices,
                                   tmp_sorted, tmp_indices)

            # End the process (enrolling the lastrow if necessary)
            if nelementsLR > 0:
                sortedLR = self.tmp.sortedLR
                indicesLR = self.tmp.indicesLR
                # Shrink the ssorted and sindices arrays to the minimum
                ssorted2 = ssorted[:ss + nelementsLR]
                sortedlr = ssorted2[ss:]
                sindices2 = sindices[:ss + nelementsLR]
                indiceslr = sindices2[ss:]
                # Read the last row info into the second part of the buffer
                self.read_slice_lr(sortedLR, sortedlr)
                self.read_slice_lr(indicesLR, indiceslr)
                indexesextension.keysort(ssorted2, sindices2)
                # Write the second part of the buffers to the lastrow indices
                self.write_sliceLR(sortedLR, sortedlr)
                self.write_sliceLR(indicesLR, indiceslr)
                # Update the caches for the last row
                bebounds = np.concatenate((sortedlr[::cs], [sortedlr[-1]]))
                sortedLR[nelementsLR:nelementsLR + len(bebounds)] = bebounds
                self.bebounds = bebounds
            # Write the first part of the buffers to the regular leaves
            self.write_slice(sorted, nslice, ssorted[:ss])
            self.write_slice(indices, nslice, sindices[:ss])
            # Update caches for this slice
            self.update_caches(nslice, ssorted[:ss])
        else:
            # Iterate over each block.  No data should cross block
            # boundaries to avoid addressing problems with short indices.
            for nb in range(nblocks):
                # Bootstrap the process for reordering
                # Read the first slice in buffers
                nrow = nb * nsb
                self.read_slice(tmp_sorted, nrow, ssorted[:ss])
                self.read_slice(tmp_indices, nrow, sindices[:ss])

                # Loop over the remaining slices in the block
                lrb = nrow + nsb
                if lrb > nslices:
                    lrb = nslices
                nslice = nrow  # Just in case the loop below executes nothing
                for nslice in range(nrow + 1, lrb):
                    self.reorder_slice(nslice, sorted, indices,
                                       ssorted, sindices,
                                       tmp_sorted, tmp_indices)

                # Write the first part of the buffers to the regular leaves
                self.write_slice(sorted, nslice, ssorted[:ss])
                self.write_slice(indices, nslice, sindices[:ss])
                # Update caches for this slice
                self.update_caches(nslice, ssorted[:ss])


    def swap_slices(self, mode="median"):
        """Swap slices in a superblock."""

        tmp = self.tmp
        sorted = tmp.sorted
        indices = tmp.indices
        tmp_sorted = tmp.sorted2
        tmp_indices = tmp.indices2
        ncs = self.nchunkslice
        nss = self.superblocksize // self.slicesize
        nss2 = nss
        for sblock in range(self.nsuperblocks):
            # Protection for the last superblock having fewer slices than nss
            remainingslices = self.nslices - sblock * nss
            if remainingslices < nss:
                nss2 = remainingslices
            if nss2 <= 1:
                break
            if mode == "start":
                ranges = tmp.ranges[sblock * nss:sblock * nss + nss2, 0]
            elif mode == "stop":
                ranges = tmp.ranges[sblock * nss:sblock * nss + nss2, 1]
            elif mode == "median":
                ranges = tmp.mranges[sblock * nss:sblock * nss + nss2]
            sranges_idx = ranges.argsort(kind=defsort)
            # Don't swap the superblock at all if one doesn't need to
            ndiff = (sranges_idx != np.arange(nss2)).sum() / 2
            if ndiff * 50 < nss2:
                # The number of slices to rearrange is less than 2.5%,
                # so skip the reordering of this superblock
                # (too expensive for such a little improvement)
                if self.verbose:
                    print("skipping reordering of superblock ->", sblock)
                continue
            ns = sblock * nss2
            # Swap sorted and indices slices following the new order
            for i in range(nss2):
                idx = sranges_idx[i]
                # Swap sorted & indices slices
                oi = ns + i
                oidx = ns + idx
                tmp_sorted[oi] = sorted[oidx]
                tmp_indices[oi] = indices[oidx]
                # Swap start, stop & median ranges
                tmp.ranges2[oi] = tmp.ranges[oidx]
                tmp.mranges2[oi] = tmp.mranges[oidx]
                # Swap chunk bounds
                tmp.bounds2[oi] = tmp.bounds[oidx]
                # Swap start, stop & median bounds
                j = oi * ncs
                jn = (oi + 1) * ncs
                xj = oidx * ncs
                xjn = (oidx + 1) * ncs
                tmp.abounds2[j:jn] = tmp.abounds[xj:xjn]
                tmp.zbounds2[j:jn] = tmp.zbounds[xj:xjn]
                tmp.mbounds2[j:jn] = tmp.mbounds[xj:xjn]
            # tmp -> originals
            for i in range(nss2):
                # Copy sorted & indices slices
                oi = ns + i
                sorted[oi] = tmp_sorted[oi]
                indices[oi] = tmp_indices[oi]
                # Copy start, stop & median ranges
                tmp.ranges[oi] = tmp.ranges2[oi]
                tmp.mranges[oi] = tmp.mranges2[oi]
                # Copy chunk bounds
                tmp.bounds[oi] = tmp.bounds2[oi]
                # Copy start, stop & median bounds
                j = oi * ncs
                jn = (oi + 1) * ncs
                tmp.abounds[j:jn] = tmp.abounds2[j:jn]
                tmp.zbounds[j:jn] = tmp.zbounds2[j:jn]
                tmp.mbounds[j:jn] = tmp.mbounds2[j:jn]

1465 def search_item_lt(self, where, item, nslice, limits, start=0): 

1466 """Search a single item in a specific sorted slice.""" 

1467 

1468 # This method will only works under the assumtion that item 

1469 # *is to be found* in the nslice. 

1470 assert nan_aware_lt(limits[0], item) and nan_aware_le(item, limits[1]) 

1471 cs = self.chunksize 

1472 ss = self.slicesize 

1473 nelementsLR = self.nelementsILR 

1474 bstart = start // cs 

1475 

1476 # Find the chunk 

1477 if nslice < self.nslices: 

1478 nchunk = bisect_left(where.bounds[nslice], item, bstart) 

1479 else: 

1480 # We need to subtract 1 chunk here because bebounds 

1481 # has a leading value 

1482 nchunk = bisect_left(self.bebounds, item, bstart) - 1 

1483 assert nchunk >= 0 

1484 

1485 # Find the element in chunk 

1486 pos = nchunk * cs 

1487 if nslice < self.nslices: 

1488 pos += bisect_left(where.sorted[nslice, pos:pos + cs], item) 

1489 assert pos <= ss 

1490 else: 

1491 end = pos + cs 

1492 if end > nelementsLR: 

1493 end = nelementsLR 

1494 pos += bisect_left(self.sortedLR[pos:end], item) 

1495 assert pos <= nelementsLR 

1496 assert pos > 0 

1497 return pos 

1498 
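# Illustrative sketch (hypothetical, not part of PyTables): the lookup above
# is a two-level bisection -- first over the per-chunk bounds to pick a
# chunk, then inside that chunk only. A pure-Python analogue using the
# standard ``bisect`` module instead of the NaN-aware C versions; like the
# method above, it assumes the item falls within the slice:
def sketch_two_level_search(bounds, chunks, item):
    """Locate ``item`` across equally sized sorted ``chunks``.

    ``bounds[i]`` is assumed to be the last element of ``chunks[i]``.
    """
    from bisect import bisect_left
    assert item <= bounds[-1], "item must fall inside the slice"
    nchunk = bisect_left(bounds, item)
    pos = bisect_left(chunks[nchunk], item)
    return nchunk * len(chunks[0]) + pos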

1499 def compute_overlaps_finegrain(self, where, message, verbose): 

1500 """Compute some statistics about overlaping of slices in index. 

1501 

1502 It returns the following info: 

1503 

1504 noverlaps : int 

1505 The total number of elements that overlap in the index.

1506 multiplicity : array of int 

1507 The number of times that a given slice overlaps with any other.

1508 toverlap : float 

1509 An overlap index: the sum of the values in segment slices that

1510 overlap divided by the entire range of values. This index is only

1511 computed for numerical types.

1512 

1513 """ 

1514 

1515 ss = self.slicesize 

1516 ranges = where.ranges[:] 

1517 sorted = where.sorted 

1518 sortedLR = where.sortedLR 

1519 nslices = self.nslices 

1520 nelementsLR = self.nelementsILR 

1521 if nelementsLR > 0: 

1522 # Add the ranges corresponding to the last row 

1523 rangeslr = np.array([self.bebounds[0], self.bebounds[-1]]) 

1524 ranges = np.concatenate((ranges, [rangeslr])) 

1525 nslices += 1 

1526 soverlap = 0 

1527 toverlap = -1 

1528 multiplicity = np.zeros(shape=nslices, dtype="int_") 

1529 overlaps = multiplicity.copy() 

1530 starts = multiplicity.copy() 

1531 for i in range(nslices): 

1532 prev_end = ranges[i, 1] 

1533 for j in range(i + 1, nslices): 

1534 stj = starts[j] 

1535 assert stj <= ss 

1536 if stj == ss: 

1537 # This slice has already been counted 

1538 continue 

1539 if j < self.nslices: 

1540 next_beg = sorted[j, stj] 

1541 else: 

1542 next_beg = sortedLR[stj] 

1543 next_end = ranges[j, 1] 

1544 if prev_end > next_end: 

1545 # Complete overlapping case 

1546 multiplicity[j - i] += 1 

1547 if j < self.nslices: 

1548 overlaps[i] += ss - stj 

1549 starts[j] = ss # a sentinel 

1550 else: 

1551 overlaps[i] += nelementsLR - stj 

1552 starts[j] = nelementsLR # a sentinel 

1553 elif prev_end > next_beg: 

1554 multiplicity[j - i] += 1 

1555 idx = self.search_item_lt( 

1556 where, prev_end, j, ranges[j], stj) 

1557 nelem = idx - stj 

1558 overlaps[i] += nelem 

1559 starts[j] = idx 

1560 if self.type != "string": 

1561 # Convert ranges into floats in order to allow 

1562 # doing operations with them without overflows 

1563 soverlap += float(ranges[i, 1]) - float(ranges[j, 0]) 

1564 

1565 # Return the overlap as the ratio between overlaps and entire range 

1566 if self.type != "string": 

1567 erange = float(ranges[-1, 1]) - float(ranges[0, 0]) 

1568 # Check that there is an effective range of values 

1569 # Beware, erange can be negative in situations where 

1570 # the values suffer overflow. This can happen

1571 # especially with big signed integer values (on overflow,

1572 # the end value becomes negative!).

1573 # Also, there is no way to compute overlap ratios for

1574 # non-numerical types. So, be careful and always check

1575 # that toverlap has a positive value (it must have been

1576 # initialized to -1 before) before using it.

1577 # F. Alted 2007-01-19 

1578 if erange > 0: 

1579 toverlap = soverlap / erange 

1580 if verbose and message != "init": 

1581 print("toverlap (%s):" % message, toverlap) 

1582 print("multiplicity:\n", multiplicity, multiplicity.sum()) 

1583 print("overlaps:\n", overlaps, overlaps.sum()) 

1584 noverlaps = overlaps.sum() 

1585 # For full indexes, set the 'is_csi' flag 

1586 if self.indsize == 8 and self._v_file._iswritable(): 

1587 self._v_attrs.is_csi = (noverlaps == 0) 

1588 # Save the number of overlaps for future references 

1589 self.noverlaps = noverlaps 

1590 return (noverlaps, multiplicity, toverlap) 

1591 
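# Illustrative sketch (hypothetical, not part of PyTables): at the element
# level, the number of entries of a sorted slice that fall strictly below
# the end of a previous slice can be counted with a single bisection, which
# is essentially what the ``search_item_lt`` call above does against the
# on-disk slices:
def sketch_count_overlapping(sorted_slice, prev_end, start=0):
    """Count elements in ``sorted_slice[start:]`` below ``prev_end``."""
    from bisect import bisect_left
    return bisect_left(sorted_slice, prev_end, start) - start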

1592 def compute_overlaps(self, where, message, verbose): 

1593 """Compute some statistics about overlaping of slices in index. 

1594 

1595 It returns the following info: 

1596 

1597 noverlaps : int 

1598 The total number of slices that overlap in the index.

1599 multiplicity : array of int 

1600 The number of times that a given slice overlaps with any other.

1601 toverlap : float 

1602 An overlap index: the sum of the values in segment slices that

1603 overlap divided by the entire range of values. This index is only

1604 computed for numerical types.

1605 

1606 """ 

1607 

1608 ranges = where.ranges[:] 

1609 nslices = self.nslices 

1610 if self.nelementsILR > 0: 

1611 # Add the ranges corresponding to the last row 

1612 rangeslr = np.array([self.bebounds[0], self.bebounds[-1]]) 

1613 ranges = np.concatenate((ranges, [rangeslr])) 

1614 nslices += 1 

1615 noverlaps = 0 

1616 soverlap = 0 

1617 toverlap = -1 

1618 multiplicity = np.zeros(shape=nslices, dtype="int_") 

1619 for i in range(nslices): 

1620 for j in range(i + 1, nslices): 

1621 if ranges[i, 1] > ranges[j, 0]: 

1622 noverlaps += 1 

1623 multiplicity[j - i] += 1 

1624 if self.type != "string": 

1625 # Convert ranges into floats in order to allow 

1626 # doing operations with them without overflows 

1627 soverlap += float(ranges[i, 1]) - float(ranges[j, 0]) 

1628 

1629 # Return the overlap as the ratio between overlaps and entire range 

1630 if self.type != "string": 

1631 erange = float(ranges[-1, 1]) - float(ranges[0, 0]) 

1632 # Check that there is an effective range of values 

1633 # Beware, erange can be negative in situations where 

1634 # the values suffer overflow. This can happen

1635 # especially with big signed integer values (on overflow,

1636 # the end value becomes negative!).

1637 # Also, there is no way to compute overlap ratios for

1638 # non-numerical types. So, be careful and always check

1639 # that toverlap has a positive value (it must have been

1640 # initialized to -1 before) before using it.

1641 # F. Altet 2007-01-19 

1642 if erange > 0: 

1643 toverlap = soverlap / erange 

1644 if verbose: 

1645 print("overlaps (%s):" % message, noverlaps, toverlap) 

1646 print(multiplicity) 

1647 # For full indexes, set the 'is_csi' flag 

1648 if self.indsize == 8 and self._v_file._iswritable(): 

1649 self._v_attrs.is_csi = (noverlaps == 0) 

1650 # Save the number of overlaps for future references 

1651 self.noverlaps = noverlaps 

1652 return (noverlaps, multiplicity, toverlap) 

1653 
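# Worked example (hypothetical numbers): two slices with value ranges
# [0, 10] and [8, 20] overlap over [8, 10], so with an entire range of
# 20 - 0 = 20 the overlap index is (10 - 8) / 20 = 0.1. A minimal sketch
# of the pairwise computation above:
def sketch_toverlap(ranges):
    """Overlap statistics for a list of (start, stop) range pairs."""
    noverlaps = 0
    soverlap = 0.0
    for i in range(len(ranges)):
        for j in range(i + 1, len(ranges)):
            if ranges[i][1] > ranges[j][0]:
                noverlaps += 1
                soverlap += float(ranges[i][1]) - float(ranges[j][0])
    erange = float(ranges[-1][1]) - float(ranges[0][0])
    return noverlaps, (soverlap / erange if erange > 0 else -1)

# sketch_toverlap([(0, 10), (8, 20)]) -> (1, 0.1)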

1654 def read_sorted_indices(self, what, start, stop, step): 

1655 """Return the sorted or indices values in the specified range.""" 

1656 (start, stop, step) = self._process_range(start, stop, step) 

1657 if start >= stop: 

1658 return np.empty(0, self.dtype) 

1659 # Correction for negative values of step (reverse indices) 

1660 if step < 0: 

1661 tmp = start 

1662 start = self.nelements - stop 

1663 stop = self.nelements - tmp 

1664 if what == "sorted": 

1665 values = self.sorted 

1666 valuesLR = self.sortedLR 

1667 buffer_ = np.empty(stop - start, dtype=self.dtype) 

1668 else: 

1669 values = self.indices 

1670 valuesLR = self.indicesLR 

1671 buffer_ = np.empty(stop - start, dtype="u%d" % self.indsize) 

1672 ss = self.slicesize 

1673 nrow_start = start // ss 

1674 istart = start % ss 

1675 nrow_stop = stop // ss 

1676 tlen = stop - start 

1677 bstart = 0 

1678 ilen = 0 

1679 for nrow in range(nrow_start, nrow_stop + 1): 

1680 blen = ss - istart 

1681 if ilen + blen > tlen: 

1682 blen = tlen - ilen 

1683 if blen <= 0: 

1684 break 

1685 if nrow < self.nslices: 

1686 self.read_slice( 

1687 values, nrow, buffer_[bstart:bstart + blen], istart) 

1688 else: 

1689 self.read_slice_lr( 

1690 valuesLR, buffer_[bstart:bstart + blen], istart) 

1691 istart = 0 

1692 bstart += blen 

1693 ilen += blen 

1694 return buffer_[::step] 

1695 
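# Illustrative sketch (hypothetical, not part of PyTables): for a negative
# step the method reads the mirrored range and lets the final
# ``buffer_[::step]`` reverse it. An in-memory analogue:
def sketch_reverse_read(values, start, stop, step):
    """Read ``values[start:stop]`` with ``step``, the way the method above does."""
    if step < 0:
        n = len(values)
        start, stop = n - stop, n - start
    return values[start:stop][::step]

# sketch_reverse_read(list(range(10)), 2, 5, -1) -> [7, 6, 5]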

1696 def read_sorted(self, start=None, stop=None, step=None): 

1697 """Return the sorted values of index in the specified range. 

1698 

1699 The meaning of the start, stop and step arguments is the same as in 

1700 :meth:`Table.read_sorted`. 

1701 

1702 """ 

1703 

1704 return self.read_sorted_indices('sorted', start, stop, step) 

1705 

1706 def read_indices(self, start=None, stop=None, step=None): 

1707 """Return the indices values of index in the specified range. 

1708 

1709 The meaning of the start, stop and step arguments is the same as in 

1710 :meth:`Table.read_sorted`. 

1711 

1712 """ 

1713 

1714 return self.read_sorted_indices('indices', start, stop, step) 

1715 

1716 def _process_range(self, start, stop, step): 

1717 """Get a range specifc for the index usage.""" 

1718 

1719 if start is not None and stop is None: 

1720 # Special case for the behaviour of PyTables iterators 

1721 stop = idx2long(start + 1) 

1722 if start is None: 

1723 start = 0 

1724 else: 

1725 start = idx2long(start) 

1726 if stop is None: 

1727 stop = idx2long(self.nelements) 

1728 else: 

1729 stop = idx2long(stop) 

1730 if step is None: 

1731 step = 1 

1732 else: 

1733 step = idx2long(step) 

1734 return (start, stop, step) 

1735 
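# Illustrative sketch (hypothetical, not part of PyTables): the defaulting
# rules above, restated over plain ints. Note the special single-element
# case when only ``start`` is given:
def sketch_process_range(nelements, start, stop, step):
    """Pure-Python analogue of the normalization performed above."""
    if start is not None and stop is None:
        stop = start + 1  # single-element case used by PyTables iterators
    start = 0 if start is None else int(start)
    stop = nelements if stop is None else int(stop)
    step = 1 if step is None else int(step)
    return (start, stop, step)

# sketch_process_range(100, 5, None, None) -> (5, 6, 1)
# sketch_process_range(100, None, None, 2) -> (0, 100, 2)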

1736 def __getitem__(self, key): 

1737 """Return the indices values of index in the specified range. 

1738 

1739 If the key argument is an integer, the corresponding index is returned. If

1740 key is a slice, the range of indices determined by it is returned. A 

1741 negative value of step in slice is supported, meaning that the results 

1742 will be returned in reverse order. 

1743 

1744 This method is equivalent to :meth:`Index.read_indices`. 

1745 

1746 """ 

1747 

1748 if is_idx(key): 

1749 key = operator.index(key) 

1750 

1751 if key < 0: 

1752 # To support negative values 

1753 key += self.nelements 

1754 return self.read_indices(key, key + 1, 1)[0] 

1755 elif isinstance(key, slice): 

1756 return self.read_indices(key.start, key.stop, key.step) 

1757 
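# Usage sketch (hypothetical ``idx`` object): indexing mirrors
# ``read_indices``, with negative integers counted from the end:
#
#     idx[-1]       # same as idx.read_indices(idx.nelements - 1,
#                   #                          idx.nelements, 1)[0]
#     idx[2:10:2]   # same as idx.read_indices(2, 10, 2)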

1758 def __len__(self): 

1759 return self.nelements 

1760 

1761 def restorecache(self): 

1762 """Clean the limits cache and resize starts and lengths arrays""" 

1763 

1764 params = self._v_file.params 

1765 # The sorted IndexArray is absolutely required to be in memory 

1766 # at the same time as the Index instance, so create a strong

1767 # reference to it. We are not introducing leaks because the

1768 # strong reference will disappear when this Index instance

1769 # is closed.

1770 self._sorted = self.sorted

1771 self._sorted.boundscache = ObjectCache(params['BOUNDS_MAX_SLOTS'],

1772 params['BOUNDS_MAX_SIZE'],

1773 'non-opt types bounds')

1777 """A cache for the bounds (2nd hash) data. Only used for 

1778 non-optimized types searches.""" 

1779 self.limboundscache = ObjectCache(params['LIMBOUNDS_MAX_SLOTS'], 

1780 params['LIMBOUNDS_MAX_SIZE'], 

1781 'bounding limits') 

1782 """A cache for bounding limits.""" 

1783 self.sortedLRcache = ObjectCache(params['SORTEDLR_MAX_SLOTS'], 

1784 params['SORTEDLR_MAX_SIZE'], 

1785 'last row chunks') 

1786 """A cache for the last row chunks. Only used for searches in 

1787 the last row, and mainly useful for small indexes.""" 

1788 self.starts = np.empty(shape=self.nrows, dtype=np.int32) 

1789 self.lengths = np.empty(shape=self.nrows, dtype=np.int32) 

1790 self.sorted._init_sorted_slice(self) 

1791 self.dirtycache = False 

1792 

1793 def search(self, item): 

1794 """Do a binary search in this index for an item.""" 

1795 

1796 if profile:

1797 tref = clock()

1798 show_stats("Entering search", tref)

1800 

1801 if self.dirtycache: 

1802 self.restorecache() 

1803 

1804 # An empty item, or a left limit larger than the right one,

1805 # means that no records can possibly match,

1806 # so we avoid further computation (including looking up the

1807 # limits cache).

1808 if not item or item[0] > item[1]: 

1809 self.starts[:] = 0 

1810 self.lengths[:] = 0 

1811 return 0 

1812 

1813 tlen = 0 

1814 # Check whether the item tuple is in the limits cache or not 

1815 nslot = self.limboundscache.getslot(item) 

1816 if nslot >= 0: 

1817 startlengths = self.limboundscache.getitem(nslot) 

1818 # Reset the lengths array (not necessary for starts) 

1819 self.lengths[:] = 0 

1820 # Now, set the interesting rows 

1821 for nrow2, start, length in startlengths: 

1822 self.starts[nrow2] = start 

1823 self.lengths[nrow2] = length 

1824 tlen = tlen + length 

1825 return tlen 

1826 # The item is not in cache. Do the real lookup. 

1827 sorted = self.sorted 

1828 if self.nslices > 0: 

1829 if self.type in self.opt_search_types: 

1830 # The following are optimizations. However, they hide the

1831 # CPU consumption of the underlying C functions from Python profiles.

1832 # You may want to deactivate them while profiling.

1833 if self.type == "int32": 

1834 tlen = sorted._search_bin_na_i(*item) 

1835 elif self.type == "int64": 

1836 tlen = sorted._search_bin_na_ll(*item) 

1837 elif self.type == "float16": 

1838 tlen = sorted._search_bin_na_e(*item) 

1839 elif self.type == "float32": 

1840 tlen = sorted._search_bin_na_f(*item) 

1841 elif self.type == "float64": 

1842 tlen = sorted._search_bin_na_d(*item) 

1843 elif self.type == "float96": 

1844 tlen = sorted._search_bin_na_g(*item) 

1845 elif self.type == "float128": 

1846 tlen = sorted._search_bin_na_g(*item) 

1847 elif self.type == "uint32": 

1848 tlen = sorted._search_bin_na_ui(*item) 

1849 elif self.type == "uint64": 

1850 tlen = sorted._search_bin_na_ull(*item) 

1851 elif self.type == "int8": 

1852 tlen = sorted._search_bin_na_b(*item) 

1853 elif self.type == "int16": 

1854 tlen = sorted._search_bin_na_s(*item) 

1855 elif self.type == "uint8": 

1856 tlen = sorted._search_bin_na_ub(*item) 

1857 elif self.type == "uint16": 

1858 tlen = sorted._search_bin_na_us(*item) 

1859 else: 

1860 assert False, "This can't happen!" 

1861 else: 

1862 tlen = self.search_scalar(item, sorted) 

1863 # Get possible remaining values in last row 

1864 if self.nelementsSLR > 0: 

1865 # Look for more indexes in the last row 

1866 (start, stop) = self.search_last_row(item) 

1867 self.starts[-1] = start 

1868 self.lengths[-1] = stop - start 

1869 tlen += stop - start 

1870 

1871 if self.limboundscache.couldenablecache(): 

1872 # Get a startlengths tuple and save it in cache. 

1873 # This is quite slow, but it is a good way to compress 

1874 # the bounds info. Moreover, the .couldenablecache() 

1875 # is doing a good work so as to avoid computing this 

1876 # when it is not necessary to do it. 

1877 startlengths = [] 

1878 for nrow, length in enumerate(self.lengths): 

1879 if length > 0: 

1880 startlengths.append((nrow, self.starts[nrow], length)) 

1881 # Compute the size of the recarray (approximately)

1882 # The +1 at the end is important to avoid 0 lengths 

1883 # (remember, the object headers take some space) 

1884 size = len(startlengths) * 8 * 2 + 1 

1885 # Put this startlengths list in cache 

1886 self.limboundscache.setitem(item, startlengths, size) 

1887 

1888 if profile: 

1889 show_stats("Exiting search", tref) 

1890 return tlen 

1891 
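# Illustrative sketch (hypothetical, not part of PyTables): the limits cache
# stores only the rows with non-zero lengths as (row, start, length)
# triplets; packing and unpacking look like this:
def sketch_pack_startlengths(starts, lengths):
    """Pack per-row (start, length) pairs, dropping empty rows."""
    return [(nrow, starts[nrow], length)
            for nrow, length in enumerate(lengths) if length > 0]

def sketch_unpack_startlengths(startlengths, nrows):
    """Rebuild dense starts/lengths arrays from packed triplets."""
    import numpy as np
    starts = np.zeros(nrows, dtype=np.int32)
    lengths = np.zeros(nrows, dtype=np.int32)
    for nrow, start, length in startlengths:
        starts[nrow] = start
        lengths[nrow] = length
    return starts, lengths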

1892 # This is a scalar version of search. It works with strings as well.

1893 def search_scalar(self, item, sorted): 

1894 """Do a binary search in this index for an item.""" 

1895 

1896 tlen = 0 

1897 # Do the lookup for values fulfilling the conditions

1898 for i in range(self.nslices): 

1899 (start, stop) = sorted._search_bin(i, item) 

1900 self.starts[i] = start 

1901 self.lengths[i] = stop - start 

1902 tlen += stop - start 

1903 return tlen 

1904 

1905 def search_last_row(self, item): 

1906 # Variable initialization 

1907 item1, item2 = item 

1908 bebounds = self.bebounds 

1909 b0, b1 = bebounds[0], bebounds[-1] 

1910 bounds = bebounds[1:-1] 

1911 itemsize = self.dtype.itemsize 

1912 sortedLRcache = self.sortedLRcache 

1913 hi = self.nelementsSLR # maximum number of elements 

1914 rchunksize = self.chunksize // self.reduction 

1915 

1916 nchunk = -1 

1917 # Lookup for item1 

1918 if nan_aware_gt(item1, b0): 

1919 if nan_aware_le(item1, b1): 

1920 # Search the appropriate chunk in bounds cache 

1921 nchunk = bisect_left(bounds, item1) 

1922 # Lookup for this chunk in cache 

1923 nslot = sortedLRcache.getslot(nchunk) 

1924 if nslot >= 0: 

1925 chunk = sortedLRcache.getitem(nslot) 

1926 else: 

1927 begin = rchunksize * nchunk 

1928 end = rchunksize * (nchunk + 1) 

1929 if end > hi: 

1930 end = hi 

1931 # Read the chunk from disk 

1932 chunk = self.sortedLR._read_sorted_slice( 

1933 self.sorted, begin, end) 

1934 # Put it in cache. It's important to *copy* 

1935 # the buffer, as it is reused in future reads! 

1936 sortedLRcache.setitem(nchunk, chunk.copy(), 

1937 (end - begin) * itemsize) 

1938 start = bisect_left(chunk, item1) 

1939 start += rchunksize * nchunk 

1940 else: 

1941 start = hi 

1942 else: 

1943 start = 0 

1944 # Lookup for item2 

1945 if nan_aware_ge(item2, b0): 

1946 if nan_aware_lt(item2, b1): 

1947 # Search the appropriate chunk in bounds cache 

1948 nchunk2 = bisect_right(bounds, item2) 

1949 if nchunk2 != nchunk: 

1950 # Lookup for this chunk in cache 

1951 nslot = sortedLRcache.getslot(nchunk2) 

1952 if nslot >= 0: 

1953 chunk = sortedLRcache.getitem(nslot) 

1954 else: 

1955 begin = rchunksize * nchunk2 

1956 end = rchunksize * (nchunk2 + 1) 

1957 if end > hi: 

1958 end = hi 

1959 # Read the chunk from disk 

1960 chunk = self.sortedLR._read_sorted_slice( 

1961 self.sorted, begin, end) 

1962 # Put it in cache. It's important to *copy* 

1963 # the buffer, as it is reused in future reads! 

1964 # See bug #60 in xot.carabos.com 

1965 sortedLRcache.setitem(nchunk2, chunk.copy(), 

1966 (end - begin) * itemsize) 

1967 stop = bisect_right(chunk, item2) 

1968 stop += rchunksize * nchunk2 

1969 else: 

1970 stop = hi 

1971 else: 

1972 stop = 0 

1973 return (start, stop) 

1974 
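# Illustrative sketch (hypothetical, not part of PyTables): ``bebounds``
# carries a leading begin value, the interior per-chunk bounds and a
# trailing end value, so the candidate chunk for an item is found by
# bisecting the interior slice, exactly as done above:
def sketch_locate_chunk(bebounds, item):
    """Return the chunk number that may contain ``item``."""
    from bisect import bisect_left
    assert bebounds[0] < item <= bebounds[-1], "item outside the slice"
    return bisect_left(bebounds[1:-1], item)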

1975 def get_chunkmap(self): 

1976 """Compute a map with the interesting chunks in index.""" 

1977 

1978 if profile:

1979 tref = clock()

1980 show_stats("Entering get_chunkmap", tref)

1982 ss = self.slicesize 

1983 nsb = self.nslicesblock 

1984 nslices = self.nslices 

1985 lbucket = self.lbucket 

1986 indsize = self.indsize 

1987 bucketsinblock = self.blocksize / lbucket 

1988 nchunks = math.ceil(self.nelements / lbucket) 

1989 chunkmap = np.zeros(shape=nchunks, dtype="bool") 

1990 reduction = self.reduction 

1991 starts = (self.starts - 1) * reduction + 1 

1992 stops = (self.starts + self.lengths) * reduction 

1993 starts[starts < 0] = 0 # All negative values set to zero 

1994 indices = self.indices 

1995 for nslice in range(self.nrows): 

1996 start = starts[nslice] 

1997 stop = stops[nslice] 

1998 if stop > start: 

1999 idx = np.empty(shape=stop - start, dtype='u%d' % indsize) 

2000 if nslice < nslices: 

2001 indices._read_index_slice(nslice, start, stop, idx) 

2002 else: 

2003 self.indicesLR._read_index_slice(start, stop, idx) 

2004 if indsize == 8: 

2005 idx //= lbucket 

2006 elif indsize == 2: 

2007 # The chunkmap size can never be larger than 'int_'

2008 idx = idx.astype("int_") 

2009 offset = int((nslice // nsb) * bucketsinblock) 

2010 idx += offset 

2011 elif indsize == 1: 

2012 # The chunkmap size can never be larger than 'int_'

2013 idx = idx.astype("int_") 

2014 offset = (nslice * ss) // lbucket 

2015 idx += offset 

2016 chunkmap[idx] = True 

2017 # The case lbucket < nrowsinchunk should only happen in tests 

2018 nrowsinchunk = self.nrowsinchunk 

2019 if lbucket != nrowsinchunk: 

2020 # Map the 'coarse grain' chunkmap into the 'true' chunkmap 

2021 nelements = self.nelements 

2022 tnchunks = math.ceil(nelements / nrowsinchunk) 

2023 tchunkmap = np.zeros(shape=tnchunks, dtype="bool") 

2024 ratio = lbucket / nrowsinchunk 

2025 idx = chunkmap.nonzero()[0] 

2026 starts = (idx * ratio).astype('int_') 

2027 stops = np.ceil((idx + 1) * ratio).astype('int_') 

2028 for start, stop in zip(starts, stops): 

2029 tchunkmap[start:stop] = True 

2030 chunkmap = tchunkmap 

2031 if profile: 

2032 show_stats("Exiting get_chunkmap", tref) 

2033 return chunkmap 

2034 
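# Illustrative sketch (hypothetical, not part of PyTables): when the bucket
# size differs from the real rows-per-chunk, the coarse chunkmap is
# stretched onto the true chunk grid, as in the last step above:
def sketch_rescale_chunkmap(chunkmap, ratio, tnchunks):
    """Map a coarse boolean chunkmap onto ``tnchunks`` true chunks."""
    import math
    import numpy as np
    tchunkmap = np.zeros(tnchunks, dtype=bool)
    for i in np.nonzero(chunkmap)[0]:
        tchunkmap[int(i * ratio):math.ceil((i + 1) * ratio)] = True
    return tchunkmap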

2035 def get_lookup_range(self, ops, limits): 

2036 assert len(ops) in [1, 2] 

2037 assert len(limits) in [1, 2] 

2038 assert len(ops) == len(limits) 

2039 

2040 column = self.column 

2041 coldtype = column.dtype.base 

2042 itemsize = coldtype.itemsize 

2043 

2044 if len(limits) == 1: 

2045 assert ops[0] in ['lt', 'le', 'eq', 'ge', 'gt'] 

2046 limit = limits[0] 

2047 op = ops[0] 

2048 if op == 'lt': 

2049 range_ = (inftype(coldtype, itemsize, sign=-1), 

2050 nextafter(limit, -1, coldtype, itemsize)) 

2051 elif op == 'le': 

2052 range_ = (inftype(coldtype, itemsize, sign=-1), 

2053 limit) 

2054 elif op == 'gt': 

2055 range_ = (nextafter(limit, +1, coldtype, itemsize), 

2056 inftype(coldtype, itemsize, sign=+1)) 

2057 elif op == 'ge': 

2058 range_ = (limit, 

2059 inftype(coldtype, itemsize, sign=+1)) 

2060 elif op == 'eq': 

2061 range_ = (limit, limit) 

2062 

2063 elif len(limits) == 2: 

2064 assert ops[0] in ('gt', 'ge') and ops[1] in ('lt', 'le') 

2065 

2066 lower, upper = limits 

2067 if lower > upper: 

2068 # ``a <[=] x <[=] b`` is always false if ``a > b``. 

2069 return () 

2070 

2071 if ops == ('gt', 'lt'): # lower < col < upper 

2072 range_ = (nextafter(lower, +1, coldtype, itemsize), 

2073 nextafter(upper, -1, coldtype, itemsize)) 

2074 elif ops == ('ge', 'lt'): # lower <= col < upper 

2075 range_ = (lower, nextafter(upper, -1, coldtype, itemsize)) 

2076 elif ops == ('gt', 'le'): # lower < col <= upper 

2077 range_ = (nextafter(lower, +1, coldtype, itemsize), upper) 

2078 elif ops == ('ge', 'le'): # lower <= col <= upper 

2079 range_ = (lower, upper) 

2080 

2081 return range_ 

2082 
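# Illustrative sketch (hypothetical, not part of PyTables): for floats the
# open/closed endpoint handling above can be mimicked with NumPy's
# ``nextafter``, e.g. turning ``col < x`` into the closed interval
# (-inf, nextafter(x, -inf)]:
def sketch_float_lookup_range(op, limit):
    """Closed-interval equivalent of a single float comparison."""
    import numpy as np
    if op == 'lt':
        return (-np.inf, np.nextafter(limit, -np.inf))
    elif op == 'le':
        return (-np.inf, limit)
    elif op == 'gt':
        return (np.nextafter(limit, np.inf), np.inf)
    elif op == 'ge':
        return (limit, np.inf)
    elif op == 'eq':
        return (limit, limit)
    raise ValueError("unsupported operator: %s" % op)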

2083 def _f_remove(self, recursive=False): 

2084 """Remove this Index object.""" 

2085 

2086 # Index removal is always recursive, 

2087 # no matter what `recursive` says. 

2088 super()._f_remove(True) 

2089 

2090 def __str__(self): 

2091 """This provides a more compact representation than __repr__""" 

2092 

2093 # The filters 

2094 filters = [] 

2095 if self.filters.complevel: 

2096 if self.filters.shuffle: 

2097 filters.append('shuffle') 

2098 if self.filters.bitshuffle: 

2099 filters.append('bitshuffle') 

2100 filters.append(f'{self.filters.complib}({self.filters.complevel})') 

2101 return (f"Index({self.optlevel}, " 

2102 f"{self.kind}{', '.join(filters)}).is_csi={self.is_csi}") 

2103 

2104 def __repr__(self): 

2105 """This provides more metainfo than standard __repr__""" 

2106 

2107 cpathname = f"{self.table._v_pathname}.cols.{self.column.pathname}" 

2108 retstr = f"""{self._v_pathname} (Index for column {cpathname}) 

2109 optlevel := {self.optlevel} 

2110 kind := {self.kind} 

2111 filters := {self.filters} 

2112 is_csi := {self.is_csi} 

2113 nelements := {self.nelements} 

2114 chunksize := {self.chunksize} 

2115 slicesize := {self.slicesize} 

2116 blocksize := {self.blocksize} 

2117 superblocksize := {self.superblocksize} 

2118 dirty := {self.dirty} 

2119 byteorder := {self.byteorder!r} 

2120 sorted := {self.sorted} 

2121 indices := {self.indices} 

2122 ranges := {self.ranges} 

2123 bounds := {self.bounds} 

2124 sortedLR := {self.sortedLR} 

2125 indicesLR := {self.indicesLR}""" 

2126 return retstr 

2127 

2128 

2129class IndexesDescG(NotLoggedMixin, Group): 

2130 _c_classid = 'DINDEX' 

2131 

2132 def _g_width_warning(self): 

2133 warnings.warn( 

2134 "the number of indexed columns on a single description group " 

2135 "is exceeding the recommended maximum (%d); " 

2136 "be ready to see PyTables asking for *lots* of memory " 

2137 "and possibly slow I/O" % self._v_max_group_width, 

2138 PerformanceWarning) 

2139 

2140 

2141class IndexesTableG(NotLoggedMixin, Group): 

2142 _c_classid = 'TINDEX' 

2143 

2144 @property 

2145 def auto(self): 

2146 if 'AUTO_INDEX' not in self._v_attrs: 

2147 return default_auto_index 

2148 return self._v_attrs.AUTO_INDEX 

2149 

2150 @auto.setter 

2151 def auto(self, auto): 

2152 self._v_attrs.AUTO_INDEX = bool(auto) 

2153 

2154 @auto.deleter 

2155 def auto(self): 

2156 del self._v_attrs.AUTO_INDEX 

2157 

2158 def _g_width_warning(self): 

2159 warnings.warn( 

2160 "the number of indexed columns on a single table " 

2161 "is exceeding the recommended maximum (%d); " 

2162 "be ready to see PyTables asking for *lots* of memory " 

2163 "and possibly slow I/O" % self._v_max_group_width, 

2164 PerformanceWarning) 

2165 

2166 def _g_check_name(self, name): 

2167 if not name.startswith('_i_'): 

2168 raise ValueError( 

2169 "names of index groups must start with ``_i_``: %s" % name) 

2170 

2171 @property 

2172 def table(self): 

2173 """Accessor for the `Table` object of this `IndexesTableG` 

2174 container.""" 

2175 names = self._v_pathname.split("/") 

2176 tablename = names.pop()[3:] # "_i_" is at the beginning 

2177 parentpathname = "/".join(names) 

2178 tablepathname = join_path(parentpathname, tablename) 

2179 table = self._v_file._get_node(tablepathname) 

2180 return table 

2181 
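# Worked example (hypothetical paths): for an index group located at
# "/group/_i_mytable", the property above resolves the table node at
# "/group/mytable":
#
#     names = "/group/_i_mytable".split("/")  # ['', 'group', '_i_mytable']
#     tablename = names.pop()[3:]             # 'mytable'
#     "/".join(names)                         # '/group'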

2182 

2183class OldIndex(NotLoggedMixin, Group): 

2184 """This is meant to hide indexes of PyTables 1.x files.""" 

2185 

2186 _c_classid = 'CINDEX'