Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tables/idxutils.py: 14%

232 statements  

« prev     ^ index     » next       coverage.py v7.2.5, created at 2023-05-10 06:15 +0000

1"""Utilities to be used mainly by the Index class.""" 

2 

3import math 

4import numpy as np 

5 

6 

7# Hints for chunk/slice/block/superblock computations: 

8# - The slicesize should not exceed 2**32 elements (because of 

9# implementation reasons). Such an extreme case would make the 

10# sorting algorithms to consume up to 64 GB of memory. 

11# - In general, one should favor a small chunksize ( < 128 KB) if one 

12# wants to reduce the latency for indexed queries. However, keep in 

13# mind that a very low value of chunksize for big datasets may hurt 

14# the performance by requering the HDF5 to use a lot of memory and CPU 

15# for its internal B-Tree. 

16 

def csformula(nrows):
    """Return the fitted chunksize (a float value) for nrows.

    The fit was derived from two reference points:
    ``2**12 = m * 2**(n + log10(10**6))`` and
    ``2**15 = m * 2**(n + log10(10**9))``, i.e. reasonable chunksizes
    for indexes with 10**6 and 10**9 elements respectively.
    """
    # Deliberately return a floating point number.
    return 64.0 * math.pow(2, math.log10(nrows))

27 

28 

def limit_er(expectedrows):
    """Protection against creating too small or too large chunks or slices."""
    # Clamp into the supported range [10**5, 10**12].
    return max(10**5, min(expectedrows, 10**12))

37 

38 

def computechunksize(expectedrows):
    """Get the optimum chunksize based on expectedrows."""
    clipped = limit_er(expectedrows)
    # Evaluate the fitting formula at the nearest lower power of ten.
    magnitude = int(math.log10(clipped))
    return int(csformula(10 ** magnitude))

46 

47 

def computeslicesize(expectedrows, memlevel):
    """Get the optimum slicesize based on expectedrows and memorylevel."""
    expectedrows = limit_er(expectedrows)
    # The optimum (float) chunksize for this number of rows...
    optimum_cs = csformula(expectedrows)
    # ...and the actual chunksize that will be used.
    actual_cs = computechunksize(expectedrows)
    # The optimal slicesize grows quadratically with the memory level.
    slicesize = int(optimum_cs * memlevel ** 2)
    # We *need* slicesize to be an exact multiple of the actual chunksize.
    slicesize -= slicesize % actual_cs
    # slicesize should at least be divisible by 4.
    slicesize *= 4
    # slicesize cannot be bigger than 2**31 - 1 elements because of
    # fundamental reasons (this limitation comes mainly from the way of
    # computing indices for indexes, but also because C keysort is not
    # implemented yet for the string type).  Besides, it cannot be larger
    # than 2**30, because of limitations of the optimized binary search
    # code (in idx-opt.c, the line ``mid = lo + (hi-lo)/2;`` will
    # overflow for values of ``lo`` and ``hi`` >= 2**30).  Finally, it
    # must be a multiple of 4, so 2**30 is definitely the upper limit.
    return min(slicesize, 2**30)

72 

73 

def computeblocksize(expectedrows, compoundsize, lowercompoundsize):
    """Calculate the optimum number of superblocks made from compounds blocks.

    This is useful for computing the sizes of both blocks and
    superblocks (using the PyTables terminology for blocks in indexes).

    """
    # Number of lower-level compound blocks needed, capped at 2**20 as a
    # protection against too large a number of compound blocks.
    nlowerblocks = min((expectedrows // lowercompoundsize) + 1, 2**20)
    size = int(lowercompoundsize * nlowerblocks)
    # We *need* the result to be an exact multiple of the compound block
    # size, so round *up* (a ceil must be performed here!).
    return ((size // compoundsize) + 1) * compoundsize

91 

92 

def calc_chunksize(expectedrows, optlevel=6, indsize=4, memlevel=4, node=None):
    """Calculate the HDF5 chunk size for index and sorted arrays.

    The logic to do that is based purely in experiments playing with
    different chunksizes and compression flag. It is obvious that using
    big chunks optimizes the I/O speed, but if they are too large, the
    uncompressor takes too much time. This might (should) be further
    optimized by doing more experiments.

    """
    chunksize = computechunksize(expectedrows)
    slicesize = computeslicesize(expectedrows, memlevel)

    # Avoid excessive slicesize in Indexes, see
    # https://github.com/PyTables/PyTables/issues/879
    if node is not None:
        params = node._v_file.params
        maxsize = params['BUFFER_TIMES'] * params['IO_BUFFER_SIZE']
        while slicesize * node.dtype.itemsize > maxsize:
            slicesize //= 2

    # Correct the slicesize and the chunksize based on optlevel, picking
    # the corrector that matches the index size.
    corrector = {
        1: ccs_ultralight,
        2: ccs_light,
        4: ccs_medium,
        8: ccs_full,
    }.get(indsize)
    if corrector is not None:
        chunksize, slicesize = corrector(optlevel, chunksize, slicesize)

    # Finally, compute blocksize and superblocksize.
    blocksize = computeblocksize(expectedrows, slicesize, chunksize)
    superblocksize = computeblocksize(expectedrows, blocksize, slicesize)
    # The sizes for the different block levels.
    return (superblocksize, blocksize, slicesize, chunksize)

129 

130 

def ccs_ultralight(optlevel, chunksize, slicesize):
    """Correct the slicesize and the chunksize based on optlevel."""
    if optlevel in (0, 1, 2):
        # Halve the slice, then scale it back up with the optlevel:
        # equivalent to (slicesize // 2) * (optlevel + 1).
        slicesize = (slicesize // 2) * (optlevel + 1)
    elif optlevel in (3, 4, 5, 6, 7, 8, 9):
        slicesize *= optlevel - 1
    return chunksize, slicesize

144 

145 

def ccs_light(optlevel, chunksize, slicesize):
    """Correct the slicesize and the chunksize based on optlevel."""
    if optlevel in (0, 1, 2):
        # Low optimization: favor a smaller slice.
        slicesize //= 2
    elif optlevel in (6, 7, 8):
        chunksize //= 2
    elif optlevel == 9:
        # Reducing the chunksize and enlarging the slicesize is the
        # best way to reduce the entropy with the current algorithm.
        chunksize //= 2
        slicesize *= 2
    # optlevel in (3, 4, 5) leaves both sizes untouched.
    return chunksize, slicesize

161 

162 

def ccs_medium(optlevel, chunksize, slicesize):
    """Correct the slicesize and the chunksize based on optlevel."""
    if optlevel in {0, 1, 2}:
        slicesize = slicesize // 2
    elif optlevel in {6, 7, 8}:
        chunksize = chunksize // 2
    elif optlevel == 9:
        # Shrinking chunks while enlarging slices is the best way to
        # reduce the entropy with the current algorithm.
        chunksize = chunksize // 2
        slicesize = slicesize * 2
    # Levels 3, 4 and 5 keep the defaults.
    return chunksize, slicesize

178 

179 

def ccs_full(optlevel, chunksize, slicesize):
    """Correct the slicesize and the chunksize based on optlevel."""
    if optlevel == 9:
        # Reducing the chunksize and enlarging the slicesize is the
        # best way to reduce the entropy with the current algorithm.
        return chunksize // 2, slicesize * 2
    if optlevel in (6, 7, 8):
        return chunksize // 2, slicesize
    if optlevel in (0, 1, 2):
        return chunksize, slicesize // 2
    # Levels 3-5 (and out-of-range levels) keep the sizes untouched.
    return chunksize, slicesize

195 

196 

197def calcoptlevels(nblocks, optlevel, indsize): 

198 """Compute the optimizations to be done. 

199 

200 The calculation is based on the number of blocks, optlevel and 

201 indexing mode. 

202 

203 """ 

204 

205 if indsize == 2: # light 

206 return col_light(nblocks, optlevel) 

207 elif indsize == 4: # medium 

208 return col_medium(nblocks, optlevel) 

209 elif indsize == 8: # full 

210 return col_full(nblocks, optlevel) 

211 

212 

def col_light(nblocks, optlevel):
    """Compute the optimizations to be done for light indexes."""
    # ``nblocks`` does not influence light indexes; only optlevel does.
    # Each optimization switches on at a successive optlevel threshold.
    optmedian = 0 < optlevel <= 9
    optstarts = 3 < optlevel <= 9
    optstops = 6 < optlevel <= 9
    # Light indexes never get a full optimization pass.
    return optmedian, optstarts, optstops, False

226 

227 

def col_medium(nblocks, optlevel):
    """Compute the optimizations to be done for medium indexes."""
    optmedian = optstarts = optstops = optfull = False

    # Medium case
    if nblocks <= 1:
        # A single block: prefer the cheaper partial optimizations,
        # switching to one full pass only at high optlevels.
        if 6 < optlevel <= 9:
            optfull = 1
        elif 3 < optlevel <= 6:
            optmedian = optstarts = True
        elif 0 < optlevel <= 3:
            optmedian = True
    else:
        # More than a block: only full optimization passes help; the
        # number of passes grows with the optlevel.
        for passes, (lo, hi) in ((1, (0, 3)), (2, (3, 6)), (3, (6, 9))):
            if lo < optlevel <= hi:
                optfull = passes
                break

    return optmedian, optstarts, optstops, optfull

250 

251 

def col_full(nblocks, optlevel):
    """Compute the optimizations to be done for full indexes."""
    optmedian, optstarts, optstops, optfull = False, False, False, False
    # With more than one block only full optimization passes are useful.
    multi = nblocks > 1

    if 0 < optlevel <= 3:
        if multi:
            optfull = 1
        else:
            optmedian = True
    elif 3 < optlevel <= 6:
        if multi:
            optfull = 2
        else:
            optmedian, optstarts = True, True
    elif 6 < optlevel <= 9:
        optfull = 3 if multi else 1

    return optmedian, optstarts, optstops, optfull

274 

275 

def get_reduction_level(indsize, optlevel, slicesize, chunksize):
    """Compute the reduction level based on indsize and optlevel."""
    # Reduction factors per optlevel (columns 0..9), keyed by index size.
    reduction_table = {
        1: [8, 8, 8, 8, 4, 4, 4, 2, 2, 1],  # 8-bit indices (ultralight)
        2: [4, 4, 4, 4, 2, 2, 2, 1, 1, 1],  # 16-bit indices (light)
        4: [2, 2, 2, 2, 1, 1, 1, 1, 1, 1],  # 32-bit indices (medium)
        8: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],  # 64-bit indices (full)
    }
    rlevel = reduction_table[indsize][optlevel]
    # Degenerate geometries (should only happen in tests): don't reduce.
    if rlevel >= slicesize or slicesize <= chunksize * rlevel:
        rlevel = 1
    if indsize == 8:
        # Ensure that, for full indexes, we never perform a reduction.
        # This is required because of implementation assumptions.
        assert rlevel == 1
    return rlevel

296 

297 

298# Python implementations of NextAfter and NextAfterF 

299# 

300# These implementations exist because the standard function 

301# nextafterf is not available on Microsoft platforms. 

302# 

303# These implementations are based on the IEEE representation of 

304# floats and doubles. 

305# Author: Shack Toms - shack@livedata.com 

306# 

307# Thanks to Shack Toms shack@livedata.com for NextAfter and NextAfterF 

308# implementations in Python. 2004-10-01 

309# epsilon = math.ldexp(1.0, -53) # smallest double such that 

310# # 0.5 + epsilon != 0.5 

311# epsilonF = math.ldexp(1.0, -24) # smallest float such that 0.5 + epsilonF 

312# != 0.5 

313# maxFloat = float(2**1024 - 2**971) # From the IEEE 754 standard 

314# maxFloatF = float(2**128 - 2**104) # From the IEEE 754 standard 

315# minFloat = math.ldexp(1.0, -1022) # min positive normalized double 

316# minFloatF = math.ldexp(1.0, -126) # min positive normalized float 

317# smallEpsilon = math.ldexp(1.0, -1074) # smallest increment for 

318# # doubles < minFloat 

319# smallEpsilonF = math.ldexp(1.0, -149) # smallest increment for 

320# # floats < minFloatF 

# "Infinity" values built with ldexp rather than float("inf"), which was
# historically unavailable on some Windows builds.  2**1023 * 2 overflows
# to the double +inf; 2**128 sits just above the float32 range.
infinity = math.ldexp(1.0, 1023) * 2
infinityf = math.ldexp(1.0, 128)
# Finf = float("inf")  # Infinite in the IEEE 754 standard (not avail in Win)

# A portable representation of NaN, kept for reference:
# if sys.byteorder == "little":
#     testNaN = struct.unpack("d", '\x01\x00\x00\x00\x00\x00\xf0\x7f')[0]
# elif sys.byteorder == "big":
#     testNaN = struct.unpack("d", '\x7f\xf0\x00\x00\x00\x00\x00\x01')[0]
# else:
#     raise ValueError("Byteorder '%s' not supported!" % sys.byteorder)
# This one seems better:
# testNaN = infinity - infinity

# Minimum and maximum representable values ("infinities") for each
# supported type, indexed as [negative limit, positive limit].
infinitymap = {
    'bool': [0, 1],
    'int8': [-2**7, 2**7 - 1],
    'uint8': [0, 2**8 - 1],
    'int16': [-2**15, 2**15 - 1],
    'uint16': [0, 2**16 - 1],
    'int32': [-2**31, 2**31 - 1],
    'uint32': [0, 2**32 - 1],
    'int64': [-2**63, 2**63 - 1],
    'uint64': [0, 2**64 - 1],
    'float32': [-infinityf, infinityf],
    'float64': [-infinity, infinity],
}

# Extended floating point types are platform/build dependent, so register
# them only when this NumPy installation provides them.
if hasattr(np, 'float16'):
    infinitymap['float16'] = [-np.float16(np.inf),
                              np.float16(np.inf)]
if hasattr(np, 'float96'):
    infinitymap['float96'] = [-np.float96(np.inf),
                              np.float96(np.inf)]
if hasattr(np, 'float128'):
    infinitymap['float128'] = [-np.float128(np.inf),
                              np.float128(np.inf)]

# deprecated API: old camel-case aliases kept for backwards compatibility
infinityMap = infinitymap
infinityF = infinityf

363 

364# Utility functions 

365 

366 

def inftype(dtype, itemsize, sign=+1):
    """Return a superior limit for maximum representable data type.

    Parameters
    ----------
    dtype : numpy.dtype
        The data type to get the limit for.
    itemsize : int
        The size in bytes; only used for byte-string ("S") types.
    sign : int
        +1 for the upper limit, -1 for the lower limit.

    Raises
    ------
    TypeError
        If the data type is not present in ``infinitymap``.
    """
    assert sign in [-1, +1]

    if dtype.kind == "S":
        # For byte strings the extremes are the all-\x00 / all-\xff strings.
        return (b"\x00" if sign < 0 else b"\xff") * itemsize
    try:
        return infinitymap[dtype.name][sign >= 0]
    except KeyError:
        # The KeyError is an implementation detail; hide it from tracebacks.
        raise TypeError("Type %s is not supported" % dtype.name) from None

381 

382 

def string_next_after(x, direction, itemsize):
    """Return the next representable neighbor of x in the appropriate
    direction.

    ``x`` is padded with ``\\x00`` bytes up to ``itemsize``.  The extreme
    values saturate: the successor of the all-``\\xff`` string and the
    predecessor of the all-``\\x00`` string are the strings themselves.
    """
    assert direction in [-1, +1]

    # Pad the string with \x00 chars until itemsize completion.
    padsize = itemsize - len(x)
    if padsize > 0:
        x += b"\x00" * padsize
    xb = bytearray(x)
    if direction > 0:
        # Increment with carry, starting at the least significant byte.
        for i in range(len(xb) - 1, -1, -1):
            if xb[i] < 0xff:
                xb[i] += 1
                return bytes(xb)
            xb[i] = 0x00
        # Every byte was \xff: already at the maximum value, so return it
        # unchanged.  (Bug fix: the previous code compared a *list* of
        # 1-byte strings against a bytes object, which never matched, so
        # the maximum wrapped around to the all-\x00 string.)
        return b"\xff" * len(xb)
    else:
        # Decrement with borrow, starting at the least significant byte.
        for i in range(len(xb) - 1, -1, -1):
            if xb[i] > 0x00:
                xb[i] -= 1
                return bytes(xb)
            xb[i] = 0xff
        # Every byte was \x00: already at the minimum value; same fix as
        # above for the saturation check.
        return b"\x00" * len(xb)

422 

423 

def int_type_next_after(x, direction, itemsize):
    """Return the next representable neighbor of x in the appropriate
    direction."""
    assert direction in [-1, +1]

    # x is guaranteed to be either an int or a float.
    if isinstance(x, int):
        # Integers step by exactly one unit.
        return x + direction
    if direction < 0:
        return int(np.nextafter(x, x - 1))
    # For floats moving upwards, land strictly above int(x).
    return int(np.nextafter(x, x + 1)) + 1

443 

444 

def bool_type_next_after(x, direction, itemsize):
    """Return the next representable neighbor of x in the appropriate
    direction."""
    assert direction in [-1, +1]

    # For booleans the neighbors are the two extremes themselves:
    # False when going down, True when going up (regardless of x).
    return direction > 0

456 

457 

def nextafter(x, direction, dtype, itemsize):
    """Return the next representable neighbor of x in the appropriate
    direction."""
    assert direction in [-1, 0, +1]
    assert dtype.kind == "S" or type(x) in (bool, float, int)

    # direction == 0 means "stay put".
    if direction == 0:
        return x

    # Dispatch on the dtype kind.
    if dtype.kind == "S":
        return string_next_after(x, direction, itemsize)
    if dtype.kind == "b":
        return bool_type_next_after(x, direction, itemsize)
    if dtype.kind in ("i", "u"):
        return int_type_next_after(x, direction, itemsize)
    if dtype.kind == "f":
        target = x - 1 if direction < 0 else x + 1
        return np.nextafter(x, target)

    raise TypeError("data type ``%s`` is not supported" % dtype)