Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tables/idxutils.py: 14%
232 statements
« prev ^ index » next coverage.py v7.2.5, created at 2023-05-10 06:15 +0000
« prev ^ index » next coverage.py v7.2.5, created at 2023-05-10 06:15 +0000
1"""Utilities to be used mainly by the Index class."""
3import math
4import numpy as np
7# Hints for chunk/slice/block/superblock computations:
8# - The slicesize should not exceed 2**32 elements (because of
9# implementation reasons). Such an extreme case would make the
10# sorting algorithms to consume up to 64 GB of memory.
11# - In general, one should favor a small chunksize ( < 128 KB) if one
12# wants to reduce the latency for indexed queries. However, keep in
13# mind that a very low value of chunksize for big datasets may hurt
14# the performance by requering the HDF5 to use a lot of memory and CPU
15# for its internal B-Tree.
def csformula(nrows):
    """Return the fitted chunksize (a float value) for nrows."""
    # Fitted through two reference points:
    #   10**6 rows -> 2**12 chunksize
    #   10**9 rows -> 2**15 chunksize
    # i.e. the chunksize doubles for every 10x growth in rows.
    # Deliberately returns a floating point number.
    return 2 ** math.log10(nrows) * 64
def limit_er(expectedrows):
    """Protection against creating too small or too large chunks or slices."""
    # Clamp into the supported range [10**5, 10**12].
    return max(10 ** 5, min(expectedrows, 10 ** 12))
def computechunksize(expectedrows):
    """Get the optimum chunksize based on expectedrows."""
    # Round expectedrows down to its decade (power of ten) and evaluate
    # the fitted formula there, truncating to an integer chunksize.
    decade = int(math.log10(limit_er(expectedrows)))
    return int(csformula(10 ** decade))
def computeslicesize(expectedrows, memlevel):
    """Get the optimum slicesize based on expectedrows and memorylevel."""
    expectedrows = limit_er(expectedrows)
    # The optimum (float) chunksize and the actual (int) one.
    optimal_cs = csformula(expectedrows)
    actual_cs = computechunksize(expectedrows)
    # Scale the optimum chunksize quadratically with the memory level...
    slicesize = int(optimal_cs * memlevel ** 2)
    # ...and make it an exact multiple of the actual chunksize
    # (floor to the nearest multiple).
    slicesize -= slicesize % actual_cs
    # slicesize should be at least divisible by 4
    slicesize *= 4
    # slicesize cannot exceed 2**31 - 1 elements for fundamental reasons
    # (the way indices are computed, and C keysort not being implemented
    # for strings).  Moreover, the optimized binary search in idx-opt.c
    # computes ``mid = lo + (hi-lo)/2``, which overflows for lo, hi >=
    # 2**30.  Since slicesize is a multiple of 4 anyway, 2**30 is the
    # definitive upper limit.
    return min(slicesize, 2 ** 30)
def computeblocksize(expectedrows, compoundsize, lowercompoundsize):
    """Calculate the optimum number of superblocks made from compounds blocks.

    This is useful for computing the sizes of both blocks and
    superblocks (using the PyTables terminology for blocks in indexes).

    """
    # Number of lower compound blocks needed, with a cap to protect
    # against an excessive block count.
    nlowerblocks = min(expectedrows // lowercompoundsize + 1, 2 ** 20)
    total = int(lowercompoundsize * nlowerblocks)
    # The result *must* be an exact multiple of the compound block size,
    # so round up (ceil) to the next multiple.
    return (total // compoundsize + 1) * compoundsize
def calc_chunksize(expectedrows, optlevel=6, indsize=4, memlevel=4, node=None):
    """Calculate the HDF5 chunk size for index and sorted arrays.

    The logic to do that is based purely in experiments playing with
    different chunksizes and compression flag. It is obvious that using
    big chunks optimizes the I/O speed, but if they are too large, the
    uncompressor takes too much time. This might (should) be further
    optimized by doing more experiments.

    """
    chunksize = computechunksize(expectedrows)
    slicesize = computeslicesize(expectedrows, memlevel)

    # Avoid excessive slicesize in Indexes,
    # see https://github.com/PyTables/PyTables/issues/879
    if node is not None:
        params = node._v_file.params
        maxsize = params['BUFFER_TIMES'] * params['IO_BUFFER_SIZE']
        while slicesize * node.dtype.itemsize > maxsize:
            slicesize //= 2

    # Correct the slicesize and the chunksize based on optlevel,
    # dispatching on the index "weight" implied by indsize.
    correctors = {
        1: ccs_ultralight,  # ultralight
        2: ccs_light,       # light
        4: ccs_medium,      # medium
        8: ccs_full,        # full
    }
    corrector = correctors.get(indsize)
    if corrector is not None:
        chunksize, slicesize = corrector(optlevel, chunksize, slicesize)

    # Finally, compute blocksize and superblocksize
    blocksize = computeblocksize(expectedrows, slicesize, chunksize)
    superblocksize = computeblocksize(expectedrows, blocksize, slicesize)
    # The sizes for the different block levels
    return (superblocksize, blocksize, slicesize, chunksize)
def ccs_ultralight(optlevel, chunksize, slicesize):
    """Correct the slicesize and the chunksize based on optlevel."""
    if optlevel in (0, 1, 2):
        # Halve the slicesize, then grow it proportionally to optlevel.
        slicesize //= 2
        slicesize += optlevel * slicesize
    elif optlevel in (3, 4, 5, 6, 7, 8, 9):
        # All higher levels scale the slicesize by (optlevel - 1).
        slicesize *= optlevel - 1
    return chunksize, slicesize
def ccs_light(optlevel, chunksize, slicesize):
    """Correct the slicesize and the chunksize based on optlevel."""
    # Levels 3-5 leave both sizes untouched.
    if optlevel in (0, 1, 2):
        slicesize //= 2
    elif optlevel in (6, 7, 8):
        chunksize //= 2
    elif optlevel == 9:
        # Reducing the chunksize and enlarging the slicesize is the
        # best way to reduce the entropy with the current algorithm.
        chunksize //= 2
        slicesize *= 2
    return chunksize, slicesize
def ccs_medium(optlevel, chunksize, slicesize):
    """Correct the slicesize and the chunksize based on optlevel."""
    # Mid-range levels (3, 4, 5) make no correction at all.
    if optlevel in (6, 7, 8, 9):
        chunksize //= 2
        if optlevel == 9:
            # Reducing the chunksize and enlarging the slicesize is the
            # best way to reduce the entropy with the current algorithm.
            slicesize *= 2
    elif optlevel in (0, 1, 2):
        slicesize //= 2
    return chunksize, slicesize
def ccs_full(optlevel, chunksize, slicesize):
    """Correct the slicesize and the chunksize based on optlevel."""
    low_levels = (0, 1, 2)
    high_levels = (6, 7, 8)
    # Levels 3-5 pass both sizes through unchanged.
    if optlevel in low_levels:
        slicesize = slicesize // 2
    elif optlevel in high_levels:
        chunksize = chunksize // 2
    elif optlevel == 9:
        # Reducing the chunksize and enlarging the slicesize is the
        # best way to reduce the entropy with the current algorithm.
        chunksize = chunksize // 2
        slicesize = slicesize * 2
    return chunksize, slicesize
def calcoptlevels(nblocks, optlevel, indsize):
    """Compute the optimizations to be done.

    The calculation is based on the number of blocks, optlevel and
    indexing mode.

    """
    dispatch = {
        2: col_light,   # light
        4: col_medium,  # medium
        8: col_full,    # full
    }
    func = dispatch.get(indsize)
    if func is not None:
        return func(nblocks, optlevel)
    # Any other indsize (e.g. ultralight) gets no optimizations (None),
    # matching the historical behavior of the if/elif chain.
def col_light(nblocks, optlevel):
    """Compute the optimizations to be done for light indexes."""
    # Each optimization kicks in cumulatively as optlevel grows;
    # optlevel 0 (or out of the 1..9 range) enables none of them.
    optmedian = 0 < optlevel <= 9
    optstarts = 3 < optlevel <= 9
    optstops = 6 < optlevel <= 9
    # Full optimization is never used for light indexes.
    return optmedian, optstarts, optstops, False
def col_medium(nblocks, optlevel):
    """Compute the optimizations to be done for medium indexes."""
    optmedian, optstarts, optstops, optfull = False, False, False, False
    if nblocks > 1:
        # More than one block: go straight for full optimization passes,
        # whose count grows with optlevel.
        if 6 < optlevel <= 9:
            optfull = 3
        elif 3 < optlevel <= 6:
            optfull = 2
        elif 0 < optlevel <= 3:
            optfull = 1
    else:
        # Single block: cheaper partial optimizations suffice at low
        # levels; only the highest levels trigger a full pass.
        if 6 < optlevel <= 9:
            optfull = 1
        elif 3 < optlevel <= 6:
            optmedian, optstarts = True, True
        elif 0 < optlevel <= 3:
            optmedian = True
    return optmedian, optstarts, optstops, optfull
def col_full(nblocks, optlevel):
    """Compute the optimizations to be done for full indexes."""
    optmedian = optstarts = optstops = optfull = False
    # Map optlevel to a bucket 1..3; 0 means "no optimization".
    if 0 < optlevel <= 3:
        bucket = 1
    elif 3 < optlevel <= 6:
        bucket = 2
    elif 6 < optlevel <= 9:
        bucket = 3
    else:
        bucket = 0
    if nblocks > 1:
        # Multi-block: the bucket directly gives the full-pass count.
        if bucket:
            optfull = bucket
    else:
        # Single block: partial optimizations for the lower buckets,
        # one full pass for the top one.
        if bucket == 1:
            optmedian = True
        elif bucket == 2:
            optmedian = optstarts = True
        elif bucket == 3:
            optfull = 1
    return optmedian, optstarts, optstops, optfull
def get_reduction_level(indsize, optlevel, slicesize, chunksize):
    """Compute the reduction level based on indsize and optlevel."""
    # Reduction per index flavor (keyed by indsize); columns are
    # optlevel 0..9.  Raises KeyError for an unsupported indsize,
    # just as the original index-map lookup did.
    reduction_table = {
        1: [8, 8, 8, 8, 4, 4, 4, 2, 2, 1],  # 8-bit indices (ultralight)
        2: [4, 4, 4, 4, 2, 2, 2, 1, 1, 1],  # 16-bit indices (light)
        4: [2, 2, 2, 2, 1, 1, 1, 1, 1, 1],  # 32-bit indices (medium)
        8: [1] * 10,                         # 64-bit indices (full)
    }
    rlevel = reduction_table[indsize][optlevel]
    # The next cases should only happen in tests
    if rlevel >= slicesize or slicesize <= chunksize * rlevel:
        rlevel = 1
    if indsize == 8:
        # Ensure that, for full indexes we will never perform a reduction.
        # This is required because of implementation assumptions.
        assert rlevel == 1
    return rlevel
298# Python implementations of NextAfter and NextAfterF
299#
300# These implementations exist because the standard function
301# nextafterf is not available on Microsoft platforms.
302#
303# These implementations are based on the IEEE representation of
304# floats and doubles.
305# Author: Shack Toms - shack@livedata.com
306#
307# Thanks to Shack Toms shack@livedata.com for NextAfter and NextAfterF
308# implementations in Python. 2004-10-01
309# epsilon = math.ldexp(1.0, -53) # smallest double such that
310# # 0.5 + epsilon != 0.5
311# epsilonF = math.ldexp(1.0, -24) # smallest float such that 0.5 + epsilonF
312# != 0.5
313# maxFloat = float(2**1024 - 2**971) # From the IEEE 754 standard
314# maxFloatF = float(2**128 - 2**104) # From the IEEE 754 standard
315# minFloat = math.ldexp(1.0, -1022) # min positive normalized double
316# minFloatF = math.ldexp(1.0, -126) # min positive normalized float
317# smallEpsilon = math.ldexp(1.0, -1074) # smallest increment for
318# # doubles < minFloat
319# smallEpsilonF = math.ldexp(1.0, -149) # smallest increment for
320# # floats < minFloatF
# 2 * 2**1023 overflows the double range, yielding IEEE-754 +inf.
infinity = math.ldexp(1.0, 1023) * 2
# 2**128 exceeds the float32 range, so it serves as "infinity" for
# single-precision values while still being a finite Python float.
infinityf = math.ldexp(1.0, 128)
# Finf = float("inf")  # Infinite in the IEEE 754 standard (not avail in Win)

# A portable representation of NaN
# if sys.byteorder == "little":
#     testNaN = struct.unpack("d", '\x01\x00\x00\x00\x00\x00\xf0\x7f')[0]
# elif sys.byteorder == "big":
#     testNaN = struct.unpack("d", '\x7f\xf0\x00\x00\x00\x00\x00\x01')[0]
# else:
#     raise ValueError("Byteorder '%s' not supported!" % sys.byteorder)
# This one seems better
# testNaN = infinity - infinity

# "infinity" for several types: maps a dtype name to its
# [most negative, most positive] representable limits.
infinitymap = {
    'bool': [0, 1],
    'int8': [-2**7, 2**7 - 1],
    'uint8': [0, 2**8 - 1],
    'int16': [-2**15, 2**15 - 1],
    'uint16': [0, 2**16 - 1],
    'int32': [-2**31, 2**31 - 1],
    'uint32': [0, 2**32 - 1],
    'int64': [-2**63, 2**63 - 1],
    'uint64': [0, 2**64 - 1],
    'float32': [-infinityf, infinityf],
    'float64': [-infinity, infinity],
}

# Extended float types are only present on some platforms/NumPy builds,
# so register them conditionally.
if hasattr(np, 'float16'):
    infinitymap['float16'] = [-np.float16(np.inf),
                              np.float16(np.inf)]
if hasattr(np, 'float96'):
    infinitymap['float96'] = [-np.float96(np.inf),
                              np.float96(np.inf)]
if hasattr(np, 'float128'):
    infinitymap['float128'] = [-np.float128(np.inf),
                               np.float128(np.inf)]

# deprecated API: old camelCase aliases kept for backward compatibility.
infinityMap = infinitymap
infinityF = infinityf
364# Utility functions
def inftype(dtype, itemsize, sign=+1):
    """Return a superior limit for maximum representable data type."""
    assert sign in [-1, +1]
    if dtype.kind == "S":
        # Byte strings: all-\x00 is the minimum, all-\xff the maximum.
        fill = b"\x00" if sign < 0 else b"\xff"
        return fill * itemsize
    try:
        limits = infinitymap[dtype.name]
    except KeyError:
        raise TypeError("Type %s is not supported" % dtype.name)
    # Index 0 holds the negative limit, index 1 the positive one.
    return limits[sign >= 0]
def string_next_after(x, direction, itemsize):
    """Return the next representable neighbor of x in the appropriate
    direction.

    `x` is a byte string, padded with \\x00 up to `itemsize`.  For
    direction > 0 the lexicographic successor is returned, for
    direction < 0 the predecessor; the extreme values (all-\\xff /
    all-\\x00) are returned unchanged instead of wrapping around.
    """
    assert direction in [-1, +1]
    # Pad the string with \x00 chars until itemsize completion
    padsize = itemsize - len(x)
    if padsize > 0:
        x += b"\x00" * padsize
    # Work on a list of single-byte strings, least significant byte first.
    xlist = [bytes([i]) for i in x]
    xlist.reverse()
    i = 0
    if direction > 0:
        # Compare against a *list* of byte chars: the old comparison with
        # a bytes object was always False, so the maximum value wrapped
        # around to all-\x00 instead of saturating.
        if xlist == [b"\xff"] * itemsize:
            # Maximum value, return this
            return b"".join(xlist)
        for xchar in xlist:
            if ord(xchar) < 0xff:
                # bytes([...]) instead of chr().encode('ascii'), which
                # raised UnicodeEncodeError for byte values >= 0x7f.
                xlist[i] = bytes([ord(xchar) + 1])
                break
            else:
                # Carry: this byte overflows to \x00, move to the next.
                xlist[i] = b"\x00"
                i += 1
    else:
        if xlist == [b"\x00"] * itemsize:
            # Minimum value, return this
            return b"".join(xlist)
        for xchar in xlist:
            if ord(xchar) > 0x00:
                xlist[i] = bytes([ord(xchar) - 1])
                break
            else:
                # Borrow: this byte underflows to \xff, move to the next.
                xlist[i] = b"\xff"
                i += 1
    xlist.reverse()
    return b"".join(xlist)
def int_type_next_after(x, direction, itemsize):
    """Return the next representable neighbor of x in the appropriate
    direction."""
    assert direction in [-1, +1]
    # x is guaranteed to be either an int or a float.
    if isinstance(x, int):
        # Integer neighbors are trivial: direction is +1 or -1.
        return x + direction
    if direction < 0:
        # Truncation already moves toward the predecessor.
        return int(np.nextafter(x, x - 1))
    # Moving up: nextafter lands just above x, so truncation alone would
    # stay at the same integer; add 1 to reach the successor.
    return int(np.nextafter(x, x + 1)) + 1
def bool_type_next_after(x, direction, itemsize):
    """Return the next representable neighbor of x in the appropriate
    direction."""
    assert direction in [-1, +1]
    # Booleans saturate: anything below is False, anything above is True,
    # regardless of x itself.
    return direction > 0
def nextafter(x, direction, dtype, itemsize):
    """Return the next representable neighbor of x in the appropriate
    direction."""
    assert direction in [-1, 0, +1]
    assert dtype.kind == "S" or type(x) in (bool, float, int)

    # Direction 0 means "stay put".
    if direction == 0:
        return x

    kind = dtype.kind
    if kind == "S":
        return string_next_after(x, direction, itemsize)
    if kind == "b":
        return bool_type_next_after(x, direction, itemsize)
    if kind in ("i", "u"):
        return int_type_next_after(x, direction, itemsize)
    if kind == "f":
        # Floats: delegate to IEEE-754 nextafter toward x-1 or x+1.
        target = x - 1 if direction < 0 else x + 1
        return np.nextafter(x, target)

    raise TypeError("data type ``%s`` is not supported" % dtype)