Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tables/idxutils.py: 14%

232 statements  

« prev     ^ index     » next       coverage.py v7.2.5, created at 2023-05-10 06:15 +0000

1"""Utilities to be used mainly by the Index class.""" 

2 

3import math 

4import numpy as np 

5 

6 

7# Hints for chunk/slice/block/superblock computations: 

8# - The slicesize should not exceed 2**32 elements (because of 

9# implementation reasons). Such an extreme case would make the 

10# sorting algorithms to consume up to 64 GB of memory. 

11# - In general, one should favor a small chunksize ( < 128 KB) if one 

12# wants to reduce the latency for indexed queries. However, keep in 

13# mind that a very low value of chunksize for big datasets may hurt 

14# the performance by requering the HDF5 to use a lot of memory and CPU 

15# for its internal B-Tree. 

16 

def csformula(nrows):
    """Return the fitted chunksize (a float value) for nrows.

    The fit was derived from two reference points:
    ``2**12 = m * 2**(n + log10(10**6))`` and
    ``2**15 = m * 2**(n + log10(10**9))``, i.e. reasonable chunksizes
    for indexes with 10**6 and 10**9 elements respectively.
    """
    # Deliberately return a floating point number.
    return 64.0 * math.pow(2, math.log10(nrows))

27 

28 

def limit_er(expectedrows):
    """Protection against creating too small or too large chunks or slices."""
    # Clamp into the supported range [10**5, 10**12].
    return max(10**5, min(expectedrows, 10**12))

37 

38 

def computechunksize(expectedrows):
    """Get the optimum chunksize based on expectedrows."""
    clipped = limit_er(expectedrows)
    # Evaluate the fitting formula at the nearest lower power of ten.
    magnitude = int(math.log10(clipped))
    return int(csformula(10 ** magnitude))

46 

47 

def computeslicesize(expectedrows, memlevel):
    """Get the optimum slicesize based on expectedrows and memorylevel."""
    expectedrows = limit_er(expectedrows)
    # The optimum (float) chunksize for this number of rows...
    optimum_cs = csformula(expectedrows)
    # ...and the actual chunksize that will be used.
    actual_cs = computechunksize(expectedrows)
    # The optimal slicesize grows quadratically with the memory level.
    slicesize = int(optimum_cs * memlevel ** 2)
    # We *need* slicesize to be an exact multiple of the actual chunksize.
    slicesize -= slicesize % actual_cs
    # slicesize should at least be divisible by 4.
    slicesize *= 4
    # slicesize cannot be bigger than 2**31 - 1 elements because of
    # fundamental reasons (this limitation comes mainly from the way of
    # computing indices for indexes, but also because C keysort is not
    # implemented yet for the string type).  Besides, it cannot be larger
    # than 2**30, because of limitations of the optimized binary search
    # code (in idx-opt.c, the line ``mid = lo + (hi-lo)/2;`` will
    # overflow for values of ``lo`` and ``hi`` >= 2**30).  Finally, it
    # must be a multiple of 4, so 2**30 is definitely the upper limit.
    return min(slicesize, 2**30)

72 

73 

def computeblocksize(expectedrows, compoundsize, lowercompoundsize):
    """Calculate the optimum number of superblocks made from compounds blocks.

    This is useful for computing the sizes of both blocks and
    superblocks (using the PyTables terminology for blocks in indexes).

    """
    # Number of lower-level compound blocks needed, capped at 2**20 as a
    # protection against too large a number of compound blocks.
    nlowerblocks = min((expectedrows // lowercompoundsize) + 1, 2**20)
    size = int(lowercompoundsize * nlowerblocks)
    # We *need* the result to be an exact multiple of the compound block
    # size, so round *up* (a ceil must be performed here!).
    return ((size // compoundsize) + 1) * compoundsize

91 

92 

def calc_chunksize(expectedrows, optlevel=6, indsize=4, memlevel=4, node=None):
    """Calculate the HDF5 chunk size for index and sorted arrays.

    The logic to do that is based purely in experiments playing with
    different chunksizes and compression flag. It is obvious that using
    big chunks optimizes the I/O speed, but if they are too large, the
    uncompressor takes too much time. This might (should) be further
    optimized by doing more experiments.

    """
    chunksize = computechunksize(expectedrows)
    slicesize = computeslicesize(expectedrows, memlevel)

    # Avoid excessive slicesize in Indexes, see
    # https://github.com/PyTables/PyTables/issues/879
    if node is not None:
        params = node._v_file.params
        maxsize = params['BUFFER_TIMES'] * params['IO_BUFFER_SIZE']
        while slicesize * node.dtype.itemsize > maxsize:
            slicesize //= 2

    # Correct the slicesize and the chunksize based on optlevel, picking
    # the corrector that matches the index size.
    corrector = {
        1: ccs_ultralight,
        2: ccs_light,
        4: ccs_medium,
        8: ccs_full,
    }.get(indsize)
    if corrector is not None:
        chunksize, slicesize = corrector(optlevel, chunksize, slicesize)

    # Finally, compute blocksize and superblocksize.
    blocksize = computeblocksize(expectedrows, slicesize, chunksize)
    superblocksize = computeblocksize(expectedrows, blocksize, slicesize)
    # The sizes for the different block levels.
    return (superblocksize, blocksize, slicesize, chunksize)

129 

130 

def ccs_ultralight(optlevel, chunksize, slicesize):
    """Correct the slicesize and the chunksize based on optlevel."""
    if optlevel in (0, 1, 2):
        # Halve the slice, then scale it back up with the optlevel:
        # equivalent to (slicesize // 2) * (optlevel + 1).
        slicesize = (slicesize // 2) * (optlevel + 1)
    elif optlevel in (3, 4, 5, 6, 7, 8, 9):
        slicesize *= optlevel - 1
    return chunksize, slicesize

144 

145 

def ccs_light(optlevel, chunksize, slicesize):
    """Correct the slicesize and the chunksize based on optlevel."""
    if optlevel in (0, 1, 2):
        # Low optimization: favor a smaller slice.
        slicesize //= 2
    elif optlevel in (6, 7, 8):
        chunksize //= 2
    elif optlevel == 9:
        # Reducing the chunksize and enlarging the slicesize is the
        # best way to reduce the entropy with the current algorithm.
        chunksize //= 2
        slicesize *= 2
    # optlevel in (3, 4, 5) leaves both sizes untouched.
    return chunksize, slicesize

161 

162 

def ccs_medium(optlevel, chunksize, slicesize):
    """Correct the slicesize and the chunksize based on optlevel."""
    if optlevel in {0, 1, 2}:
        slicesize = slicesize // 2
    elif optlevel in {6, 7, 8}:
        chunksize = chunksize // 2
    elif optlevel == 9:
        # Shrinking chunks while enlarging slices is the best way to
        # reduce the entropy with the current algorithm.
        chunksize = chunksize // 2
        slicesize = slicesize * 2
    # Levels 3, 4 and 5 keep the defaults.
    return chunksize, slicesize

178 

179 

def ccs_full(optlevel, chunksize, slicesize):
    """Correct the slicesize and the chunksize based on optlevel."""
    if optlevel == 9:
        # Reducing the chunksize and enlarging the slicesize is the
        # best way to reduce the entropy with the current algorithm.
        return chunksize // 2, slicesize * 2
    if optlevel in (6, 7, 8):
        return chunksize // 2, slicesize
    if optlevel in (0, 1, 2):
        return chunksize, slicesize // 2
    # Levels 3-5 (and out-of-range levels) keep the sizes untouched.
    return chunksize, slicesize

195 

196 

197def calcoptlevels(nblocks, optlevel, indsize): 

198 """Compute the optimizations to be done. 

199 

200 The calculation is based on the number of blocks, optlevel and 

201 indexing mode. 

202 

203 """ 

204 

205 if indsize == 2: # light 

206 return col_light(nblocks, optlevel) 

207 elif indsize == 4: # medium 

208 return col_medium(nblocks, optlevel) 

209 elif indsize == 8: # full 

210 return col_full(nblocks, optlevel) 

211 

212 

def col_light(nblocks, optlevel):
    """Compute the optimizations to be done for light indexes."""
    # ``nblocks`` does not influence light indexes; only optlevel does.
    # Each optimization switches on at a successive optlevel threshold.
    optmedian = 0 < optlevel <= 9
    optstarts = 3 < optlevel <= 9
    optstops = 6 < optlevel <= 9
    # Light indexes never get a full optimization pass.
    return optmedian, optstarts, optstops, False

226 

227 

def col_medium(nblocks, optlevel):
    """Compute the optimizations to be done for medium indexes."""
    optmedian = optstarts = optstops = optfull = False

    # Medium case
    if nblocks <= 1:
        # A single block: prefer the cheaper partial optimizations,
        # switching to one full pass only at high optlevels.
        if 6 < optlevel <= 9:
            optfull = 1
        elif 3 < optlevel <= 6:
            optmedian = optstarts = True
        elif 0 < optlevel <= 3:
            optmedian = True
    else:
        # More than a block: only full optimization passes help; the
        # number of passes grows with the optlevel.
        for passes, (lo, hi) in ((1, (0, 3)), (2, (3, 6)), (3, (6, 9))):
            if lo < optlevel <= hi:
                optfull = passes
                break

    return optmedian, optstarts, optstops, optfull

250 

251 

def col_full(nblocks, optlevel):
    """Compute the optimizations to be done for full indexes."""
    optmedian, optstarts, optstops, optfull = False, False, False, False
    # With more than one block only full optimization passes are useful.
    multi = nblocks > 1

    if 0 < optlevel <= 3:
        if multi:
            optfull = 1
        else:
            optmedian = True
    elif 3 < optlevel <= 6:
        if multi:
            optfull = 2
        else:
            optmedian, optstarts = True, True
    elif 6 < optlevel <= 9:
        optfull = 3 if multi else 1

    return optmedian, optstarts, optstops, optfull

274 

275 

def get_reduction_level(indsize, optlevel, slicesize, chunksize):
    """Compute the reduction level based on indsize and optlevel."""
    # Reduction factors per optlevel (columns 0..9), keyed by index size.
    reduction_table = {
        1: [8, 8, 8, 8, 4, 4, 4, 2, 2, 1],  # 8-bit indices (ultralight)
        2: [4, 4, 4, 4, 2, 2, 2, 1, 1, 1],  # 16-bit indices (light)
        4: [2, 2, 2, 2, 1, 1, 1, 1, 1, 1],  # 32-bit indices (medium)
        8: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],  # 64-bit indices (full)
    }
    rlevel = reduction_table[indsize][optlevel]
    # Degenerate geometries (should only happen in tests): don't reduce.
    if rlevel >= slicesize or slicesize <= chunksize * rlevel:
        rlevel = 1
    if indsize == 8:
        # Ensure that, for full indexes, we never perform a reduction.
        # This is required because of implementation assumptions.
        assert rlevel == 1
    return rlevel

296 

297 

298# Python implementations of NextAfter and NextAfterF 

299# 

300# These implementations exist because the standard function 

301# nextafterf is not available on Microsoft platforms. 

302# 

303# These implementations are based on the IEEE representation of 

304# floats and doubles. 

305# Author: Shack Toms - shack@livedata.com 

306# 

307# Thanks to Shack Toms shack@livedata.com for NextAfter and NextAfterF 

308# implementations in Python. 2004-10-01 

309# epsilon = math.ldexp(1.0, -53) # smallest double such that 

310# # 0.5 + epsilon != 0.5 

311# epsilonF = math.ldexp(1.0, -24) # smallest float such that 0.5 + epsilonF 

312# != 0.5 

313# maxFloat = float(2**1024 - 2**971) # From the IEEE 754 standard 

314# maxFloatF = float(2**128 - 2**104) # From the IEEE 754 standard 

315# minFloat = math.ldexp(1.0, -1022) # min positive normalized double 

316# minFloatF = math.ldexp(1.0, -126) # min positive normalized float 

317# smallEpsilon = math.ldexp(1.0, -1074) # smallest increment for 

318# # doubles < minFloat 

319# smallEpsilonF = math.ldexp(1.0, -149) # smallest increment for 

320# # floats < minFloatF 

# "Infinity" values built with ldexp rather than float("inf"), which was
# historically unavailable on some Windows builds.  2**1023 * 2 overflows
# to the double +inf; 2**128 sits just above the float32 range.
infinity = math.ldexp(1.0, 1023) * 2
infinityf = math.ldexp(1.0, 128)
# Finf = float("inf")  # Infinite in the IEEE 754 standard (not avail in Win)

# A portable representation of NaN, kept for reference:
# if sys.byteorder == "little":
#     testNaN = struct.unpack("d", '\x01\x00\x00\x00\x00\x00\xf0\x7f')[0]
# elif sys.byteorder == "big":
#     testNaN = struct.unpack("d", '\x7f\xf0\x00\x00\x00\x00\x00\x01')[0]
# else:
#     raise ValueError("Byteorder '%s' not supported!" % sys.byteorder)
# This one seems better:
# testNaN = infinity - infinity

# Minimum and maximum representable values ("infinities") for each
# supported type, indexed as [negative limit, positive limit].
infinitymap = {
    'bool': [0, 1],
    'int8': [-2**7, 2**7 - 1],
    'uint8': [0, 2**8 - 1],
    'int16': [-2**15, 2**15 - 1],
    'uint16': [0, 2**16 - 1],
    'int32': [-2**31, 2**31 - 1],
    'uint32': [0, 2**32 - 1],
    'int64': [-2**63, 2**63 - 1],
    'uint64': [0, 2**64 - 1],
    'float32': [-infinityf, infinityf],
    'float64': [-infinity, infinity],
}

# Extended floating point types are platform/build dependent, so register
# them only when this NumPy installation provides them.
if hasattr(np, 'float16'):
    infinitymap['float16'] = [-np.float16(np.inf),
                              np.float16(np.inf)]
if hasattr(np, 'float96'):
    infinitymap['float96'] = [-np.float96(np.inf),
                              np.float96(np.inf)]
if hasattr(np, 'float128'):
    infinitymap['float128'] = [-np.float128(np.inf),
                              np.float128(np.inf)]

# deprecated API: old camel-case aliases kept for backwards compatibility
infinityMap = infinitymap
infinityF = infinityf

363 

364# Utility functions 

365 

366 

def inftype(dtype, itemsize, sign=+1):
    """Return a superior limit for maximum representable data type.

    Parameters
    ----------
    dtype : numpy.dtype
        The data type to get the limit for.
    itemsize : int
        The size in bytes; only used for byte-string ("S") types.
    sign : int
        +1 for the upper limit, -1 for the lower limit.

    Raises
    ------
    TypeError
        If the data type is not present in ``infinitymap``.
    """
    assert sign in [-1, +1]

    if dtype.kind == "S":
        # For byte strings the extremes are the all-\x00 / all-\xff strings.
        return (b"\x00" if sign < 0 else b"\xff") * itemsize
    try:
        return infinitymap[dtype.name][sign >= 0]
    except KeyError:
        # The KeyError is an implementation detail; hide it from tracebacks.
        raise TypeError("Type %s is not supported" % dtype.name) from None

381 

382 

def string_next_after(x, direction, itemsize):
    """Return the next representable neighbor of x in the appropriate
    direction.

    ``x`` is padded with ``\\x00`` bytes up to ``itemsize``.  The extreme
    values saturate: the successor of the all-``\\xff`` string and the
    predecessor of the all-``\\x00`` string are the strings themselves.
    """
    assert direction in [-1, +1]

    # Pad the string with \x00 chars until itemsize completion.
    padsize = itemsize - len(x)
    if padsize > 0:
        x += b"\x00" * padsize
    xb = bytearray(x)
    if direction > 0:
        # Increment with carry, starting at the least significant byte.
        for i in range(len(xb) - 1, -1, -1):
            if xb[i] < 0xff:
                xb[i] += 1
                return bytes(xb)
            xb[i] = 0x00
        # Every byte was \xff: already at the maximum value, so return it
        # unchanged.  (Bug fix: the previous code compared a *list* of
        # 1-byte strings against a bytes object, which never matched, so
        # the maximum wrapped around to the all-\x00 string.)
        return b"\xff" * len(xb)
    else:
        # Decrement with borrow, starting at the least significant byte.
        for i in range(len(xb) - 1, -1, -1):
            if xb[i] > 0x00:
                xb[i] -= 1
                return bytes(xb)
            xb[i] = 0xff
        # Every byte was \x00: already at the minimum value; same fix as
        # above for the saturation check.
        return b"\x00" * len(xb)

422 

423 

def int_type_next_after(x, direction, itemsize):
    """Return the next representable neighbor of x in the appropriate
    direction."""
    assert direction in [-1, +1]

    # x is guaranteed to be either an int or a float.
    if isinstance(x, int):
        # Integers step by exactly one unit.
        return x + direction
    if direction < 0:
        return int(np.nextafter(x, x - 1))
    # For floats moving upwards, land strictly above int(x).
    return int(np.nextafter(x, x + 1)) + 1

443 

444 

def bool_type_next_after(x, direction, itemsize):
    """Return the next representable neighbor of x in the appropriate
    direction."""
    assert direction in [-1, +1]

    # For booleans the neighbors are the two extremes themselves:
    # False when going down, True when going up (regardless of x).
    return direction > 0

456 

457 

def nextafter(x, direction, dtype, itemsize):
    """Return the next representable neighbor of x in the appropriate
    direction."""
    assert direction in [-1, 0, +1]
    assert dtype.kind == "S" or type(x) in (bool, float, int)

    # direction == 0 means "stay put".
    if direction == 0:
        return x

    # Dispatch on the dtype kind.
    if dtype.kind == "S":
        return string_next_after(x, direction, itemsize)
    if dtype.kind == "b":
        return bool_type_next_after(x, direction, itemsize)
    if dtype.kind in ("i", "u"):
        return int_type_next_after(x, direction, itemsize)
    if dtype.kind == "f":
        target = x - 1 if direction < 0 else x + 1
        return np.nextafter(x, target)

    raise TypeError("data type ``%s`` is not supported" % dtype)