Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tables/expression.py: 8%
264 statements
« prev ^ index » next coverage.py v7.2.5, created at 2023-05-10 06:15 +0000
« prev ^ index » next coverage.py v7.2.5, created at 2023-05-10 06:15 +0000
1"""Here is defined the Expr class."""
3import sys
4import warnings
6import numexpr as ne
7import numpy as np
8import tables as tb
10from .exceptions import PerformanceWarning
11from .parameters import IO_BUFFER_SIZE, BUFFER_TIMES
class Expr:
    """A class for evaluating expressions with arbitrary array-like objects.

    Expr is a class for evaluating expressions containing array-like objects.
    With it, you can evaluate expressions (like "3 * a + 4 * b") that
    operate on arbitrary large arrays while optimizing the resources
    required to perform them (basically main memory and CPU cache memory).
    It is similar to the Numexpr package (see :ref:`[NUMEXPR] <NUMEXPR>`),
    but in addition to NumPy objects, it also accepts disk-based homogeneous
    arrays, like the Array, CArray, EArray and Column PyTables objects.

    .. warning::

        Expr class only offers a subset of the Numexpr features due to the
        complexity of implementing some of them when dealing with huge
        amounts of data.

    All the internal computations are performed via the Numexpr package,
    so all the broadcast and upcasting rules of Numexpr apply here too.
    These rules are very similar to the NumPy ones, but with some exceptions
    due to the particularities of having to deal with potentially very large
    disk-based arrays.  Be sure to read the documentation of the Expr
    constructor and methods as well as that of Numexpr, if you want to fully
    grasp these particularities.


    Parameters
    ----------
    expr : str
        This specifies the expression to be evaluated, such as "2 * a + 3 * b".
    uservars : dict
        This can be used to define the variable names appearing in *expr*.
        This mapping should consist of identifier-like strings pointing to any
        `Array`, `CArray`, `EArray`, `Column` or NumPy ndarray instances (or
        even others which will be tried to be converted to ndarrays).  When
        `uservars` is not provided or `None`, the local and global
        namespaces of the caller are searched instead.  When a mapping is
        given, every variable of the expression must be present in it: the
        caller's namespaces are not consulted in that case and a missing
        name raises `NameError`.
    kwargs : dict
        This is meant to pass additional parameters to the Numexpr kernel.
        This is basically the same as the kwargs argument in
        Numexpr.evaluate(), and is mainly meant for advanced use.

    Examples
    --------
    The following shows an example of using Expr::

        >>> f = tb.open_file('/tmp/test_expr.h5', 'w')
        >>> a = f.create_array('/', 'a', np.array([1,2,3]))
        >>> b = f.create_array('/', 'b', np.array([3,4,5]))
        >>> c = np.array([4,5,6])
        >>> expr = tb.Expr("2 * a + b * c")   # initialize the expression
        >>> expr.eval()                 # evaluate it
        array([14, 24, 36], dtype=int64)
        >>> sum(expr)                   # use as an iterator
        74

    where you can see that you can mix different containers in
    the expression (whenever shapes are consistent).

    You can also work with multidimensional arrays::

        >>> a2 = f.create_array('/', 'a2', np.array([[1,2],[3,4]]))
        >>> b2 = f.create_array('/', 'b2', np.array([[3,4],[5,6]]))
        >>> c2 = np.array([4,5])           # This will be broadcasted
        >>> expr = tb.Expr("2 * a2 + b2-c2")
        >>> expr.eval()
        array([[1, 3],
               [7, 9]], dtype=int64)
        >>> sum(expr)
        array([ 8, 12], dtype=int64)
        >>> f.close()

    .. rubric:: Expr attributes

    .. attribute:: append_mode

        The append mode for user-provided output containers.

    .. attribute:: maindim

        Common main dimension for inputs in expression.

    .. attribute:: names

        The names of variables in expression (list).

    .. attribute:: out

        The user-provided container (if any) for the expression outcome.

    .. attribute:: o_start

        The start range selection for the user-provided output.

    .. attribute:: o_stop

        The stop range selection for the user-provided output.

    .. attribute:: o_step

        The step range selection for the user-provided output.

    .. attribute:: shape

        Common shape for the arrays in expression.

    .. attribute:: values

        The values of variables in expression (list).

    """

    _exprvars_cache = {}
    """Cache of variables participating in expressions.

    .. versionadded:: 3.0

    """

    def __init__(self, expr, uservars=None, **kwargs):
        """Resolve the variables of `expr`, compile it and guess its shape.

        Raises `NameError` for unresolved variables, `TypeError` for
        variables of an unsupported kind and `NotImplementedError` for
        64-bit unsigned integer operands (see `_required_expr_vars`).
        """

        self.append_mode = False
        """The append mode for user-provided output containers."""
        self.maindim = 0
        """Common main dimension for inputs in expression."""
        self.names = []
        """The names of variables in expression (list)."""
        self.out = None
        """The user-provided container (if any) for the expression outcome."""
        self.o_start = None
        """The start range selection for the user-provided output."""
        self.o_stop = None
        """The stop range selection for the user-provided output."""
        self.o_step = None
        """The step range selection for the user-provided output."""
        self.shape = None
        """Common shape for the arrays in expression."""
        self.start = None
        """The start range selection for the input."""
        self.stop = None
        """The stop range selection for the input."""
        self.step = None
        """The step range selection for the input."""
        self.values = []
        """The values of variables in expression (list)."""

        self._compiled_expr = None
        """The compiled expression."""
        self._single_row_out = None
        """A sample of the output with just a single row."""

        # First, get the signature for the arrays in expression.
        # NOTE: this call must stay directly inside ``__init__``: the
        # default ``depth=2`` of the helper counts frames up to the caller
        # of the constructor.
        vars_ = self._required_expr_vars(expr, uservars)
        context = ne.necompiler.getContext(kwargs)
        self.names, _ = ne.necompiler.getExprNames(expr, context)

        # Raise a TypeError in case we have unsupported objects
        for var in vars_.values():
            if type(var) in (int, float, str):
                continue
            if not isinstance(var, (tb.Leaf, tb.Column)):
                if hasattr(var, "dtype"):
                    # Quacks like a NumPy object
                    continue
                # An f-string (rather than ``"%r" % var``) keeps the message
                # intact when `var` happens to be a tuple.
                raise TypeError(f"Unsupported variable type: {var!r}")
            objname = var.__class__.__name__
            if objname not in ("Array", "CArray", "EArray", "Column"):
                raise TypeError(f"Unsupported variable type: {var!r}")

        # NumPy arrays to be copied? (we don't need to worry about
        # PyTables objects, as the reads always return contiguous and
        # aligned objects, or at least I think so).
        for name, var in vars_.items():
            if isinstance(var, np.ndarray):
                # See numexpr.necompiler.evaluate for a rationale
                # of the code below
                if not var.flags.aligned and var.ndim != 1:
                    # Replace the variable by an (aligned) copy.  Only the
                    # value of an existing key changes, so this is safe
                    # while iterating over the dictionary.
                    vars_[name] = var.copy()

        # Get the variables and types
        values = self.values
        types_ = []
        for name in self.names:
            value = vars_[name]
            if hasattr(value, 'atom'):
                types_.append(value.atom)
            elif hasattr(value, 'dtype'):
                types_.append(value)
            else:
                # try to convert into a NumPy array
                value = np.array(value)
                types_.append(value)
            values.append(value)

        # Create a signature for the expression
        signature = [(name, ne.necompiler.getType(type_))
                     for (name, type_) in zip(self.names, types_)]

        # Compile the expression
        self._compiled_expr = ne.necompiler.NumExpr(expr, signature, **kwargs)

        # Guess the shape for the outcome and the maindim of inputs
        self.shape, self.maindim = self._guess_shape()

    # The next method is similar to their counterpart in `Table`, but
    # adapted to the `Expr` own requirements.
    def _required_expr_vars(self, expression, uservars, depth=2):
        """Get the variables required by the `expression`.

        A new dictionary mapping every variable name used in the
        `expression` to its value is returned.  When `uservars` is a
        mapping, the variables are looked up only there.  When `uservars`
        is `None`, they are looked up in the local and then the global
        namespace of the frame found `depth` levels above this method.
        Unknown variables cause a `NameError` to be raised.

        Variables with an ``uint64`` type and nested columns are not
        allowed (`NotImplementedError` and `TypeError` are raised,
        respectively).

        The names extracted from each expression string are memoized in
        the class-level `_exprvars_cache`.

        """

        # Get the names of variables used in the expression.
        exprvars_cache = self._exprvars_cache
        if expression not in exprvars_cache:
            # Protection against growing the cache too much
            if len(exprvars_cache) > 256:
                # Remove 10 (arbitrary) elements from the cache
                for k in list(exprvars_cache)[:10]:
                    del exprvars_cache[k]
            cexpr = compile(expression, '<string>', 'eval')
            exprvars = [var for var in cexpr.co_names
                        if var not in ['None', 'False', 'True']
                        and var not in ne.expressions.functions]
            exprvars_cache[expression] = exprvars
        else:
            exprvars = exprvars_cache[expression]

        # Namespaces to search, in priority order: either the explicit
        # mapping alone, or the locals and globals of the user frame.
        if uservars is None:
            user_frame = sys._getframe(depth)
            namespaces = (user_frame.f_locals, user_frame.f_globals)
        else:
            namespaces = (uservars,)

        reqvars = {}
        for var in exprvars:
            # Get the value from the first namespace defining the name.
            for namespace in namespaces:
                if var in namespace:
                    val = namespace[var]
                    break
            else:
                raise NameError("name ``%s`` is not defined" % var)

            # Check the value.
            if hasattr(val, 'dtype') and val.dtype.str[1:] == 'u8':
                raise NotImplementedError(
                    "variable ``%s`` refers to "
                    "a 64-bit unsigned integer object, that is "
                    "not yet supported in expressions, sorry; " % var)
            elif hasattr(val, '_v_colpathnames'):  # nested column
                # This branch is never reached because the compile step
                # above already raise a ``TypeError`` for nested
                # columns, but that could change in the future.  So it
                # is best to let this here.
                raise TypeError(
                    "variable ``%s`` refers to a nested column, "
                    "not allowed in expressions" % var)
            reqvars[var] = val
        return reqvars

    def set_inputs_range(self, start=None, stop=None, step=None):
        """Define a range for all inputs in expression.

        The computation will only take place for the range defined by
        the start, stop and step parameters in the main dimension of
        inputs (or the leading one, if the object lacks the concept of
        main dimension, like a NumPy container).  If no common main
        dimension exists for all inputs, the leading dimension will be
        used instead.

        """

        self.start = start
        self.stop = stop
        self.step = step

    def set_output(self, out, append_mode=False):
        """Set out as container for output as well as the append_mode.

        The out must be a container that is meant to keep the outcome of
        the expression.  It should be an homogeneous type container and
        can typically be an Array, CArray, EArray, Column or a NumPy ndarray.

        The append_mode specifies the way in which the output is filled.
        If true, the rows of the outcome are *appended* to the out container.
        Of course, for doing this it is necessary that out would have an
        append() method (like an EArray, for example).

        If append_mode is false, the output is set via the __setitem__()
        method (see the Expr.set_output_range() for info on how to select
        the rows to be updated).  If out is smaller than what is required
        by the expression, only the computations that are needed to fill
        up the container are carried out.  If it is larger, the excess
        elements are unaffected.

        A `ValueError` is raised, and the previously configured output is
        left untouched, when `out` lacks ``shape``/``__setitem__`` or when
        `append_mode` is requested for a container without ``append()``.

        """

        if not (hasattr(out, "shape") and hasattr(out, "__setitem__")):
            raise ValueError(
                "You need to pass a settable multidimensional container "
                "as output")
        # Validate everything before touching the instance so that a
        # rejected call does not leave `out` half-configured.
        if append_mode and not hasattr(out, "append"):
            raise ValueError(
                "For activating the ``append`` mode, you need a container "
                "with an `append()` method (like the `EArray`)")
        self.out = out
        self.append_mode = append_mode

    def set_output_range(self, start=None, stop=None, step=None):
        """Define a range for user-provided output object.

        The output object will only be modified in the range specified by the
        start, stop and step parameters in the main dimension of output (or the
        leading one, if the object does not have the concept of main dimension,
        like a NumPy container).

        An `IndexError` is raised if no output container has been set with
        `set_output()` yet.

        """

        if self.out is None:
            raise IndexError(
                "You need to pass an output object to `set_output()` first")
        self.o_start = start
        self.o_stop = stop
        self.o_step = step

    # Although the next code is similar to the method in `Leaf`, it
    # allows the use of pure NumPy objects.
    def _calc_nrowsinbuf(self, object_):
        """Calculate the number of rows that will fit in a buffer.

        `object_` needs ``shape`` and ``dtype`` attributes.  The result is
        an ``int`` that is always at least 1.  A `PerformanceWarning` is
        issued when a single row is larger than the recommended maximum.
        """

        # Compute the rowsize for the *leading* dimension
        shape_ = list(object_.shape)
        if shape_:
            shape_[0] = 1

        rowsize = int(np.prod(shape_)) * object_.dtype.itemsize

        # Compute the nrowsinbuf
        # Multiplying the I/O buffer size by 4 gives optimal results
        # in my benchmarks with `tables.Expr` (see ``bench/poly.py``)
        buffersize = IO_BUFFER_SIZE * 4
        # A zero row size (some zero-length dimension) would otherwise be
        # a division by zero.
        nrowsinbuf = buffersize // rowsize if rowsize else 0

        # Safeguard against row sizes being extremely large
        if nrowsinbuf == 0:
            nrowsinbuf = 1
            # If rowsize is too large, issue a Performance warning
            maxrowsize = BUFFER_TIMES * buffersize
            if rowsize > maxrowsize:
                warnings.warn("""\
The object ``%s`` is exceeding the maximum recommended rowsize (%d
bytes); be ready to see PyTables asking for *lots* of memory and
possibly slow I/O.  You may want to reduce the rowsize by trimming the
value of dimensions that are orthogonal (and preferably close) to the
*leading* dimension of this object."""
                              % (object_, maxrowsize),
                              PerformanceWarning)

        return nrowsinbuf

    def _guess_shape(self):
        """Guess the shape of the output of the expression.

        Returns a ``(shape, maindim)`` pair.  The compiled expression is
        evaluated on a single row of the inputs and the result is kept in
        ``self._single_row_out``.  When every input is zero-dimensional
        the pair is ``((), None)``.
        """

        # First, compute the maximum dimension of inputs and maindim
        # (if it exists)
        maxndim = 0
        maindims = []
        for val in self.values:
            # Track the largest number of dimensions among the inputs
            if len(val.shape) > maxndim:
                maxndim = len(val.shape)
            if hasattr(val, "maindim"):
                maindims.append(val.maindim)
        if maxndim == 0:
            self._single_row_out = self._compiled_expr(*self.values)
            return (), None
        if maindims and [maindims[0]] * len(maindims) == maindims:
            # If all maindims detected are the same, use this as maindim
            maindim = maindims[0]
        else:
            # If not, the main dimension will be the default one
            maindim = 0

        # The slices parameter for inputs: everything before the main
        # dimension, and index 0 along the main dimension.
        slices = (slice(None),) * maindim + (0,)

        # Now, collect the values in first row of arrays with maximum dims
        vals = []
        lens = []
        for val in self.values:
            shape = val.shape
            # Warning: don't use len(val) below or it will raise an
            # `Overflow` error on 32-bit platforms for large enough arrays.
            if shape != () and shape[maindim] == 0:
                vals.append(val[:])
                lens.append(0)
            elif len(shape) < maxndim:
                # Lower-dimensional inputs are passed whole (broadcast)
                vals.append(val)
            else:
                vals.append(val[slices])
                lens.append(shape[maindim])
        minlen = min(lens)
        self._single_row_out = out = self._compiled_expr(*vals)
        shape = list(out.shape)
        if minlen > 0:
            shape.insert(maindim, minlen)
        return shape, maindim

    def _get_info(self, shape, maindim, itermode=False):
        """Return various info needed for evaluating the computation loop.

        `shape` and `maindim` are the values produced by `_guess_shape()`.
        The passed `shape` is not modified.

        With ``itermode=True`` the returned tuple is ``(i_nrows, slice_pos,
        start, stop, step, nrowsinbuf)``.  Otherwise the output info
        ``(out, o_maindim, o_start, o_stop, o_step)`` is appended to it,
        and a `ValueError` is raised if a user-provided output container
        has a shape that does not match the expression.
        """

        # Work on a private copy: the callers pass ``self.shape`` and the
        # adjustments below must not permanently shrink that attribute.
        shape = list(shape)

        # Compute the shape of the resulting container having
        # in account new possible values of start, stop and step in
        # the inputs range
        if maindim is not None:
            (start, stop, step) = slice(
                self.start, self.stop, self.step).indices(shape[maindim])
            shape[maindim] = min(
                shape[maindim], len(range(start, stop, step)))
            i_nrows = shape[maindim]
        else:
            start, stop, step = 0, 0, None
            i_nrows = 0

        if not itermode:
            # Create a container for output if not defined yet
            o_maindim = 0    # Default maindim
            if self.out is None:
                out = np.empty(shape, dtype=self._single_row_out.dtype)
                # Get the trivial values for start, stop and step
                if maindim is not None:
                    (o_start, o_stop, o_step) = (0, shape[maindim], 1)
                else:
                    (o_start, o_stop, o_step) = (0, 0, 1)
            else:
                out = self.out
                # Out container already provided.  Do some sanity checks.
                if hasattr(out, "maindim"):
                    o_maindim = out.maindim

                # Refine the shape of the resulting container having in
                # account new possible values of start, stop and step in
                # the output range
                o_shape = list(out.shape)
                s = slice(self.o_start, self.o_stop, self.o_step)
                o_start, o_stop, o_step = s.indices(o_shape[o_maindim])
                o_shape[o_maindim] = min(o_shape[o_maindim],
                                         len(range(o_start, o_stop, o_step)))

                # Check that the shape of output is consistent with inputs
                tr_oshape = list(o_shape)   # this implies a copy
                olen_ = tr_oshape.pop(o_maindim)
                tr_shape = list(shape)      # do a copy
                if maindim is not None:
                    # NOTE(review): the *output* main dimension is used to
                    # index the *input* shape here and below; this looks
                    # intended only for maindim == o_maindim -- confirm.
                    len_ = tr_shape.pop(o_maindim)
                else:
                    len_ = 1
                if tr_oshape != tr_shape:
                    raise ValueError(
                        "Shape for out container does not match expression")
                # Force the input length to fit in `out`
                if not self.append_mode and olen_ < len_:
                    shape[o_maindim] = olen_
                    stop = start + olen_

        # Get the positions of inputs that should be sliced (the others
        # will be broadcasted)
        ndim = len(shape)
        slice_pos = [i for i, val in enumerate(self.values)
                     if len(val.shape) == ndim]

        # The size of the I/O buffer: the largest row count among the
        # inputs that are going to be sliced.
        nrowsinbuf = 1
        for i, val in enumerate(self.values):
            # Skip scalar values in variables
            if i in slice_pos:
                nrows = self._calc_nrowsinbuf(val)
                if nrows > nrowsinbuf:
                    nrowsinbuf = nrows

        if not itermode:
            return (i_nrows, slice_pos, start, stop, step, nrowsinbuf,
                    out, o_maindim, o_start, o_stop, o_step)
        else:
            # For itermode, we don't need the out info
            return (i_nrows, slice_pos, start, stop, step, nrowsinbuf)

    def _set_flavor_conversion(self, enabled):
        """Set ``_v_convert`` on every input that has a ``maindim``."""
        for val in self.values:
            if hasattr(val, 'maindim'):
                val._v_convert = enabled

    def _eval_chunk(self, key, slice_pos):
        """Evaluate the compiled expression on one slice of the inputs.

        Inputs whose position is in `slice_pos` are indexed with `key`;
        the others are passed untouched.
        """
        # A read of the non-sliced values is not apparently needed, as
        # PyTables leaves seem to work just fine inside Numexpr
        vals = [val[key] if i in slice_pos else val
                for i, val in enumerate(self.values)]
        return self._compiled_expr(*vals)

    def eval(self):
        """Evaluate the expression and return the outcome.

        Because of performance reasons, the computation order tries to go along
        the common main dimension of all inputs.  If no such common main
        dimension is found, the iteration will go along the leading dimension
        instead.

        For non-consistent shapes in inputs (i.e. shapes having a different
        number of dimensions), the regular NumPy broadcast rules apply.
        There is one exception to this rule though: when the dimensions
        orthogonal to the main dimension of the expression are consistent, but
        the main dimension itself differs among the inputs, then the shortest
        one is chosen for doing the computations.  This is so because trying to
        expand very large on-disk arrays could be too expensive or simply not
        possible.

        Also, the regular Numexpr casting rules (which are similar to those of
        NumPy, although you should check the Numexpr manual for the exceptions)
        are applied to determine the output type.

        Finally, if the set_output() method specifying a user container has
        already been called, the output is sent to this user-provided
        container.  If not, a fresh NumPy container is returned instead.

        .. warning::

            When dealing with large on-disk inputs, failing to specify an
            on-disk container may consume all your available memory.

        """

        maindim = self.maindim

        # Get different info we need for the main computation loop
        (i_nrows, slice_pos, start, stop, step, nrowsinbuf,
         out, o_maindim, o_start, o_stop, o_step) = \
            self._get_info(self.shape, maindim)

        if i_nrows == 0:
            # No elements to compute
            if start >= stop and self.start is not None:
                return out
            else:
                return self._single_row_out

        # Create a key that selects every element in inputs and output
        # (including the main dimension)
        i_slices = [slice(None)] * (maindim + 1)
        o_slices = [slice(None)] * (o_maindim + 1)

        # This is a hack to prevent doing unnecessary flavor conversions
        # while reading buffers.  The ``finally`` clause re-enables the
        # conversion even if the computation fails half-way.
        self._set_flavor_conversion(False)
        try:
            # Start the computation itself
            for start2 in range(start, stop, step * nrowsinbuf):
                stop2 = start2 + step * nrowsinbuf
                if stop2 > stop:
                    stop2 = stop
                # Set the proper slice for inputs
                i_slices[maindim] = slice(start2, stop2, step)
                # Do the actual computation for this slice
                rout = self._eval_chunk(tuple(i_slices), slice_pos)
                # Set the values into the out buffer
                if self.append_mode:
                    out.append(rout)
                else:
                    # Compute the slice to be filled in output
                    # NOTE(review): the row offset is not scaled by
                    # `o_step`, which looks wrong when o_step != 1 and
                    # more than one buffer is processed -- confirm.
                    start3 = o_start + (start2 - start) // step
                    stop3 = start3 + nrowsinbuf * o_step
                    if stop3 > o_stop:
                        stop3 = o_stop
                    o_slices[o_maindim] = slice(start3, stop3, o_step)
                    # Set the slice
                    out[tuple(o_slices)] = rout
        finally:
            # Activate the conversion again (default)
            self._set_flavor_conversion(True)

        return out

    def __iter__(self):
        """Iterate over the rows of the outcome of the expression.

        This iterator always returns rows as NumPy objects, so a possible out
        container specified in :meth:`Expr.set_output` method is ignored here.

        """

        maindim = self.maindim

        # Get different info we need for the main computation loop
        (i_nrows, slice_pos, start, stop, step, nrowsinbuf) = \
            self._get_info(self.shape, maindim, itermode=True)

        if i_nrows == 0:
            # No elements to compute
            return

        # Create a key that selects every element in inputs
        # (including the main dimension)
        i_slices = [slice(None)] * (maindim + 1)

        # This is a hack to prevent doing unnecessary flavor conversions
        # while reading buffers.  The ``finally`` clause also runs when the
        # generator is closed before being exhausted (e.g. the consumer
        # breaks out of its loop), so the conversion is always restored.
        self._set_flavor_conversion(False)
        try:
            # Start the computation itself
            for start2 in range(start, stop, step * nrowsinbuf):
                stop2 = start2 + step * nrowsinbuf
                if stop2 > stop:
                    stop2 = stop
                # Set the proper slice in the main dimension
                i_slices[maindim] = slice(start2, stop2, step)
                # Do the actual computation
                rout = self._eval_chunk(tuple(i_slices), slice_pos)
                # Return one row per call
                yield from rout
        finally:
            # Activate the conversion again (default)
            self._set_flavor_conversion(True)
if __name__ == "__main__":
    # Small demo: evaluate "a * b + c" over three on-disk arrays and send
    # the result to a fourth on-disk array.

    # A larger alternative is shape = (10000, 10000)
    shape = (10, 10_000)

    # The context manager closes the file even if something below raises.
    with tb.open_file("/tmp/expression.h5", "w") as f:

        # Create some arrays
        a = f.create_carray(f.root, 'a', atom=tb.Float32Atom(dflt=1),
                            shape=shape)
        b = f.create_carray(f.root, 'b', atom=tb.Float32Atom(dflt=2),
                            shape=shape)
        c = f.create_carray(f.root, 'c', atom=tb.Float32Atom(dflt=3),
                            shape=shape)
        out = f.create_carray(f.root, 'out', atom=tb.Float32Atom(dflt=3),
                              shape=shape)

        # `a`, `b` and `c` are picked up from this module's namespace
        expr = Expr("a * b + c")
        expr.set_output(out)
        d = expr.eval()

        print("returned-->", repr(d))