Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tables/expression.py: 8%

1"""Here is defined the Expr class."""

3import sys

4import warnings

6import numexpr as ne

7import numpy as np

8import tables as tb

10from .exceptions import PerformanceWarning

11from .parameters import IO_BUFFER_SIZE, BUFFER_TIMES

class Expr:
    """A class for evaluating expressions with arbitrary array-like objects.

    Expr is a class for evaluating expressions containing array-like objects.
    With it, you can evaluate expressions (like "3 * a + 4 * b") that
    operate on arbitrary large arrays while optimizing the resources
    required to perform them (basically main memory and CPU cache memory).
    It is similar to the Numexpr package (see :ref:`[NUMEXPR] <NUMEXPR>`),
    but in addition to NumPy objects, it also accepts disk-based homogeneous
    arrays, like the Array, CArray, EArray and Column PyTables objects.

    .. warning::

        Expr class only offers a subset of the Numexpr features due to the
        complexity of implementing some of them when dealing with huge
        amounts of data.

    All the internal computations are performed via the Numexpr package,
    so all the broadcast and upcasting rules of Numexpr apply here too.
    These rules are very similar to the NumPy ones, but with some exceptions
    due to the particularities of having to deal with potentially very large
    disk-based arrays.  Be sure to read the documentation of the Expr
    constructor and methods as well as that of Numexpr, if you want to fully
    grasp these particularities.

    Parameters
    ----------
    expr : str
        This specifies the expression to be evaluated, such as
        "2 * a + 3 * b".
    uservars : dict
        This can be used to define the variable names appearing in *expr*.
        This mapping should consist of identifier-like strings pointing to any
        `Array`, `CArray`, `EArray`, `Column` or NumPy ndarray instances (or
        even others which will be tried to be converted to ndarrays).  When
        `uservars` is not provided or `None`, the local and global
        namespaces of the caller are sought instead.  When a mapping is
        provided, every variable of the expression must be present in it:
        the namespaces are not consulted and a missing name raises a
        `NameError`.
    kwargs : dict
        This is meant to pass additional parameters to the Numexpr kernel.
        This is basically the same as the kwargs argument in
        Numexpr.evaluate(), and is mainly meant for advanced use.

    Examples
    --------
    The following shows an example of using Expr::

        >>> f = tb.open_file('/tmp/test_expr.h5', 'w')
        >>> a = f.create_array('/', 'a', np.array([1,2,3]))
        >>> b = f.create_array('/', 'b', np.array([3,4,5]))
        >>> c = np.array([4,5,6])
        >>> expr = tb.Expr("2 * a + b * c")   # initialize the expression
        >>> expr.eval()                 # evaluate it
        array([14, 24, 36], dtype=int64)
        >>> sum(expr)                   # use as an iterator
        74

    where you can see that you can mix different containers in
    the expression (whenever shapes are consistent).

    You can also work with multidimensional arrays::

        >>> a2 = f.create_array('/', 'a2', np.array([[1,2],[3,4]]))
        >>> b2 = f.create_array('/', 'b2', np.array([[3,4],[5,6]]))
        >>> c2 = np.array([4,5])           # This will be broadcasted
        >>> expr = tb.Expr("2 * a2 + b2-c2")
        >>> expr.eval()
        array([[1, 3],
               [7, 9]], dtype=int64)
        >>> sum(expr)
        array([ 8, 12], dtype=int64)
        >>> f.close()

    .. rubric:: Expr attributes

    .. attribute:: append_mode

        The append mode for user-provided output containers.

    .. attribute:: maindim

        Common main dimension for inputs in expression.

    .. attribute:: names

        The names of variables in expression (list).

    .. attribute:: out

        The user-provided container (if any) for the expression outcome.

    .. attribute:: o_start

        The start range selection for the user-provided output.

    .. attribute:: o_stop

        The stop range selection for the user-provided output.

    .. attribute:: o_step

        The step range selection for the user-provided output.

    .. attribute:: shape

        Common shape for the arrays in expression.

    .. attribute:: values

        The values of variables in expression (list).

    """

    _exprvars_cache = {}
    """Cache of variables participating in expressions.

    .. versionadded:: 3.0

    """

    def __init__(self, expr, uservars=None, **kwargs):

        self.append_mode = False
        """The append mode for user-provided output containers."""
        self.maindim = 0
        """Common main dimension for inputs in expression."""
        self.names = []
        """The names of variables in expression (list)."""
        self.out = None
        """The user-provided container (if any) for the expression outcome."""
        self.o_start = None
        """The start range selection for the user-provided output."""
        self.o_stop = None
        """The stop range selection for the user-provided output."""
        self.o_step = None
        """The step range selection for the user-provided output."""
        self.shape = None
        """Common shape for the arrays in expression."""
        self.start = None
        """The start range selection for the input."""
        self.stop = None
        """The stop range selection for the input."""
        self.step = None
        """The step range selection for the input."""
        self.values = []
        """The values of variables in expression (list)."""

        self._compiled_expr = None
        """The compiled expression."""
        self._single_row_out = None
        """A sample of the output with just a single row."""

        # First, get the signature for the arrays in expression.
        # NOTE: this call must stay directly in ``__init__``; the default
        # ``depth=2`` of `_required_expr_vars` counts stack frames up to
        # the code that instantiated the `Expr`.
        vars_ = self._required_expr_vars(expr, uservars)
        context = ne.necompiler.getContext(kwargs)
        self.names, _ = ne.necompiler.getExprNames(expr, context)

        # Raise a TypeError in case we have unsupported objects
        for var in vars_.values():
            # Exact type match on purpose (subclasses are not accepted here)
            if type(var) in (int, float, str):
                continue
            if not isinstance(var, (tb.Leaf, tb.Column)):
                if hasattr(var, "dtype"):
                    # Quacks like a NumPy object
                    continue
                raise TypeError("Unsupported variable type: %r" % var)
            objname = var.__class__.__name__
            if objname not in ("Array", "CArray", "EArray", "Column"):
                raise TypeError("Unsupported variable type: %r" % var)

        # NumPy arrays to be copied? (we don't need to worry about
        # PyTables objects, as the reads always return contiguous and
        # aligned objects, or at least I think so).
        # See numexpr.necompiler.evaluate for a rationale of this check.
        for name, var in vars_.items():
            if (isinstance(var, np.ndarray) and not var.flags.aligned
                    and var.ndim != 1):
                # Replace the unaligned array by an (aligned) copy
                vars_[name] = var.copy()

        # Get the variables and types
        values = self.values
        types_ = []
        for name in self.names:
            value = vars_[name]
            if hasattr(value, 'atom'):
                types_.append(value.atom)
            elif hasattr(value, 'dtype'):
                types_.append(value)
            else:
                # try to convert into a NumPy array
                value = np.array(value)
                types_.append(value)
            values.append(value)

        # Create a signature for the expression
        signature = [(name, ne.necompiler.getType(type_))
                     for (name, type_) in zip(self.names, types_)]

        # Compile the expression
        self._compiled_expr = ne.necompiler.NumExpr(expr, signature, **kwargs)

        # Guess the shape for the outcome and the maindim of inputs
        self.shape, self.maindim = self._guess_shape()

    # The next method is similar to their counterpart in `Table`, but
    # adapted to the `Expr` own requirements.
    def _required_expr_vars(self, expression, uservars, depth=2):
        """Get the variables required by the `expression`.

        A new dictionary defining the variables used in the `expression`
        is returned.  When `uservars` is a mapping, required variables are
        looked up only there.  When `uservars` is `None`, they are looked
        up in the local and then in the global namespace of the frame
        found `depth` levels up the call stack.  Unknown variables cause a
        `NameError` to be raised.

        Variables with an ``uint64`` type and nested columns are not
        allowed (`NotImplementedError` and `TypeError` are raised,
        respectively).

        `depth` specifies the depth of the frame in order to reach local
        or global variables.

        """

        # Get the names of variables used in the expression.
        exprvars_cache = self._exprvars_cache
        if expression not in exprvars_cache:
            # Protection against growing the cache too much
            if len(exprvars_cache) > 256:
                # Remove 10 (arbitrary) elements from the cache
                for k in list(exprvars_cache)[:10]:
                    del exprvars_cache[k]
            cexpr = compile(expression, '<string>', 'eval')
            exprvars = [var for var in cexpr.co_names
                        if var not in ['None', 'False', 'True']
                        and var not in ne.expressions.functions]
            exprvars_cache[expression] = exprvars
        else:
            exprvars = exprvars_cache[expression]

        # Namespaces to search, in priority order: either the explicit
        # user mapping alone, or the locals and globals of the user frame.
        if uservars is None:
            user_frame = sys._getframe(depth)
            namespaces = (user_frame.f_locals, user_frame.f_globals)
        else:
            namespaces = (uservars,)

        reqvars = {}
        for var in exprvars:
            # Get the value from the first namespace defining the name.
            for namespace in namespaces:
                if var in namespace:
                    val = namespace[var]
                    break
            else:
                raise NameError("name ``%s`` is not defined" % var)

            # Check the value.
            if hasattr(val, 'dtype') and val.dtype.str[1:] == 'u8':
                raise NotImplementedError(
                    "variable ``%s`` refers to "
                    "a 64-bit unsigned integer object, that is "
                    "not yet supported in expressions, sorry; " % var)
            elif hasattr(val, '_v_colpathnames'):  # nested column
                # Safeguard: objects exposing ``_v_colpathnames`` are
                # treated as nested columns and rejected explicitly.
                raise TypeError(
                    "variable ``%s`` refers to a nested column, "
                    "not allowed in expressions" % var)
            reqvars[var] = val
        return reqvars

    def set_inputs_range(self, start=None, stop=None, step=None):
        """Define a range for all inputs in expression.

        The computation will only take place for the range defined by
        the start, stop and step parameters in the main dimension of
        inputs (or the leading one, if the object lacks the concept of
        main dimension, like a NumPy container).  If not a common main
        dimension exists for all inputs, the leading dimension will be
        used instead.

        """

        self.start = start
        self.stop = stop
        self.step = step

    def set_output(self, out, append_mode=False):
        """Set out as container for output as well as the append_mode.

        The out must be a container that is meant to keep the outcome of
        the expression.  It should be an homogeneous type container and
        can typically be an Array, CArray, EArray, Column or a NumPy ndarray.

        The append_mode specifies the way of which the output is filled.
        If true, the rows of the outcome are *appended* to the out container.
        Of course, for doing this it is necessary that out would have an
        append() method (like an EArray, for example).

        If append_mode is false, the output is set via the __setitem__()
        method (see the Expr.set_output_range() for info on how to select
        the rows to be updated).  If out is smaller than what is required
        by the expression, only the computations that are needed to fill
        up the container are carried out.  If it is larger, the excess
        elements are unaffected.

        A `ValueError` is raised (and the current output settings are left
        untouched) when `out` lacks a ``shape`` attribute or a
        ``__setitem__()`` method, or when `append_mode` is requested for a
        container without an ``append()`` method.

        """

        if not (hasattr(out, "shape") and hasattr(out, "__setitem__")):
            raise ValueError(
                "You need to pass a settable multidimensional container "
                "as output")
        if append_mode and not hasattr(out, "append"):
            raise ValueError(
                "For activating the ``append`` mode, you need a container "
                "with an `append()` method (like the `EArray`)")
        # Only commit the new settings once both checks have passed
        self.out = out
        self.append_mode = append_mode

    def set_output_range(self, start=None, stop=None, step=None):
        """Define a range for user-provided output object.

        The output object will only be modified in the range specified by the
        start, stop and step parameters in the main dimension of output (or the
        leading one, if the object does not have the concept of main dimension,
        like a NumPy container).

        An `IndexError` is raised if no output container has been set.

        """

        if self.out is None:
            raise IndexError(
                "You need to pass an output object to `set_output()` first")
        self.o_start = start
        self.o_stop = stop
        self.o_step = step

    # Although the next code is similar to the method in `Leaf`, it
    # allows the use of pure NumPy objects.
    def _calc_nrowsinbuf(self, object_):
        """Calculate the number of rows that will fit in a buffer.

        `object_` only needs ``shape`` and ``dtype`` attributes.  The
        result is at least 1; a `PerformanceWarning` is issued when a
        single row exceeds ``BUFFER_TIMES`` times the buffer size.

        """

        # Compute the rowsize for the *leading* dimension
        shape_ = list(object_.shape)
        if shape_:
            shape_[0] = 1

        rowsize = np.prod(shape_) * object_.dtype.itemsize

        # Compute the nrowsinbuf
        # Multiplying the I/O buffer size by 4 gives optimal results
        # in my benchmarks with `tables.Expr` (see ``bench/poly.py``)
        buffersize = IO_BUFFER_SIZE * 4
        nrowsinbuf = buffersize // rowsize

        # Safeguard against row sizes being extremely large
        if nrowsinbuf == 0:
            nrowsinbuf = 1
            # If rowsize is too large, issue a Performance warning
            maxrowsize = BUFFER_TIMES * buffersize
            if rowsize > maxrowsize:
                warnings.warn(
                    "The object ``%s`` is exceeding the maximum "
                    "recommended rowsize (%d\n"
                    "bytes); be ready to see PyTables asking for *lots* "
                    "of memory and\n"
                    "possibly slow I/O. You may want to reduce the "
                    "rowsize by trimming the\n"
                    "value of dimensions that are orthogonal (and "
                    "preferably close) to the\n"
                    "*leading* dimension of this object."
                    % (object_, maxrowsize),
                    PerformanceWarning)

        return nrowsinbuf

    def _guess_shape(self):
        """Guess the shape of the output of the expression.

        Returns a ``(shape, maindim)`` pair.  As a side effect, the
        expression is evaluated on a single row of the inputs and the
        result is stored in ``self._single_row_out``.  When every input is
        a scalar, ``((), None)`` is returned.

        """

        # First, compute the maximum dimension of inputs and collect the
        # main dimension of those inputs that declare one
        maxndim = 0
        maindims = []
        for val in self.values:
            maxndim = max(maxndim, len(val.shape))
            if hasattr(val, "maindim"):
                maindims.append(val.maindim)
        if maxndim == 0:
            self._single_row_out = self._compiled_expr(*self.values)
            return (), None

        # If all maindims detected are the same, use this as maindim;
        # if not, the main dimension will be the default one (0)
        if maindims and all(md == maindims[0] for md in maindims):
            maindim = maindims[0]
        else:
            maindim = 0

        # The key selecting the first row along the main dimension
        slices = (slice(None),) * maindim + (0,)

        # Now, collect the values in first row of arrays with maximum dims
        vals = []
        lens = []
        for val in self.values:
            shape = val.shape
            # Warning: don't use len(val) below or it will raise an
            # `Overflow` error on 32-bit platforms for large enough arrays.
            if shape != () and shape[maindim] == 0:
                vals.append(val[:])
                lens.append(0)
            elif len(shape) < maxndim:
                # Lower-dimensional inputs are passed whole (broadcast)
                vals.append(val)
            else:
                vals.append(val[slices])
                lens.append(shape[maindim])
        minlen = min(lens)
        self._single_row_out = out = self._compiled_expr(*vals)
        shape = list(out.shape)
        if minlen > 0:
            shape.insert(maindim, minlen)
        return shape, maindim

    def _get_info(self, shape, maindim, itermode=False):
        """Return various info needed for evaluating the computation loop.

        The result is the tuple ``(i_nrows, slice_pos, start, stop, step,
        nrowsinbuf)``, extended with ``(out, o_maindim, o_start, o_stop,
        o_step)`` when `itermode` is false.  A `ValueError` is raised if
        the shape of a user-provided output does not match the expression.

        Note that `shape` is modified in place when it is a list.

        """

        # Compute the shape of the resulting container having
        # in account new possible values of start, stop and step in
        # the inputs range
        if maindim is not None:
            (start, stop, step) = slice(
                self.start, self.stop, self.step).indices(shape[maindim])
            shape[maindim] = min(
                shape[maindim], len(range(start, stop, step)))
            i_nrows = shape[maindim]
        else:
            start, stop, step = 0, 0, None
            i_nrows = 0

        if not itermode:
            # Create a container for output if not defined yet
            o_maindim = 0    # Default maindim
            if self.out is None:
                out = np.empty(shape, dtype=self._single_row_out.dtype)
                # Get the trivial values for start, stop and step
                if maindim is not None:
                    (o_start, o_stop, o_step) = (0, shape[maindim], 1)
                else:
                    (o_start, o_stop, o_step) = (0, 0, 1)
            else:
                out = self.out
                # Out container already provided.  Do some sanity checks.
                if hasattr(out, "maindim"):
                    o_maindim = out.maindim

                # Refine the shape of the resulting container having in
                # account new possible values of start, stop and step in
                # the output range
                o_shape = list(out.shape)
                s = slice(self.o_start, self.o_stop, self.o_step)
                o_start, o_stop, o_step = s.indices(o_shape[o_maindim])
                o_shape[o_maindim] = min(o_shape[o_maindim],
                                         len(range(o_start, o_stop, o_step)))

                # Check that the shape of output is consistent with inputs
                tr_oshape = list(o_shape)   # this implies a copy
                olen_ = tr_oshape.pop(o_maindim)
                tr_shape = list(shape)      # do a copy
                if maindim is not None:
                    len_ = tr_shape.pop(o_maindim)
                else:
                    len_ = 1
                if tr_oshape != tr_shape:
                    raise ValueError(
                        "Shape for out container does not match expression")
                # Force the input length to fit in `out`
                if not self.append_mode and olen_ < len_:
                    shape[o_maindim] = olen_
                    # `olen_` rows are wanted, and consecutive input rows
                    # are `step` positions apart
                    stop = start + olen_ * step

        # Get the positions of inputs that should be sliced (the others
        # will be broadcasted)
        ndim = len(shape)
        slice_pos = [i for i, val in enumerate(self.values)
                     if len(val.shape) == ndim]

        # The size of the I/O buffer: the largest row count among the
        # inputs that are going to be sliced
        nrowsinbuf = 1
        for i, val in enumerate(self.values):
            # Skip scalar values in variables
            if i in slice_pos:
                nrows = self._calc_nrowsinbuf(val)
                if nrows > nrowsinbuf:
                    nrowsinbuf = nrows

        if not itermode:
            return (i_nrows, slice_pos, start, stop, step, nrowsinbuf,
                    out, o_maindim, o_start, o_stop, o_step)
        else:
            # For itermode, we don't need the out info
            return (i_nrows, slice_pos, start, stop, step, nrowsinbuf)

    @staticmethod
    def _set_convert(values, enabled):
        """Set ``_v_convert`` on every input that has a ``maindim``."""

        for val in values:
            if hasattr(val, 'maindim'):
                val._v_convert = enabled

    def _compute_chunks(self, values, maindim, slice_pos,
                        start, stop, step, nrowsinbuf):
        """Yield ``(start2, rout)`` for each buffer-sized chunk of inputs.

        ``start2`` is the position of the first input row of the chunk and
        ``rout`` is the outcome of the compiled expression for the chunk.

        """

        # A key that selects every element in inputs (including the main
        # dimension)
        i_slices = [slice(None)] * (maindim + 1)
        for start2 in range(start, stop, step * nrowsinbuf):
            stop2 = min(start2 + step * nrowsinbuf, stop)
            # Set the proper slice in the main dimension
            i_slices[maindim] = slice(start2, stop2, step)
            key = tuple(i_slices)
            # Inputs not in `slice_pos` are passed whole.  A read of values
            # is not apparently needed, as PyTables leaves seems to work
            # just fine inside Numexpr
            vals = [val[key] if i in slice_pos else val
                    for i, val in enumerate(values)]
            yield start2, self._compiled_expr(*vals)

    def eval(self):
        """Evaluate the expression and return the outcome.

        Because of performance reasons, the computation order tries to go along
        the common main dimension of all inputs.  If not such a common main
        dimension is found, the iteration will go along the leading dimension
        instead.

        For non-consistent shapes in inputs (i.e. shapes having a different
        number of dimensions), the regular NumPy broadcast rules applies.
        There is one exception to this rule though: when the dimensions
        orthogonal to the main dimension of the expression are consistent, but
        the main dimension itself differs among the inputs, then the shortest
        one is chosen for doing the computations.  This is so because trying to
        expand very large on-disk arrays could be too expensive or simply not
        possible.

        Also, the regular Numexpr casting rules (which are similar to those of
        NumPy, although you should check the Numexpr manual for the exceptions)
        are applied to determine the output type.

        Finally, if the set_output() method specifying a user container has
        already been called, the output is sent to this user-provided
        container.  If not, a fresh NumPy container is returned instead.

        .. warning::

            When dealing with large on-disk inputs, failing to specify an
            on-disk container may consume all your available memory.

        """

        values, shape, maindim = self.values, self.shape, self.maindim

        # Get different info we need for the main computation loop
        (i_nrows, slice_pos, start, stop, step, nrowsinbuf,
         out, o_maindim, o_start, o_stop, o_step) = \
            self._get_info(shape, maindim)

        if i_nrows == 0:
            # No elements to compute
            if start >= stop and self.start is not None:
                return out
            else:
                return self._single_row_out

        # Create a key that selects every element in output
        # (including the main dimension)
        o_slices = [slice(None)] * (o_maindim + 1)

        try:
            # This is a hack to prevent doing unnecessary flavor
            # conversions while reading buffers
            self._set_convert(values, False)

            # Start the computation itself
            for start2, rout in self._compute_chunks(
                    values, maindim, slice_pos,
                    start, stop, step, nrowsinbuf):
                # Set the values into the out buffer
                if self.append_mode:
                    out.append(rout)
                    continue
                # Compute the slice to be filled in output: rows already
                # computed are `o_step` positions apart in the output
                rows_done = (start2 - start) // step
                start3 = o_start + rows_done * o_step
                stop3 = min(start3 + nrowsinbuf * o_step, o_stop)
                o_slices[o_maindim] = slice(start3, stop3, o_step)
                # Set the slice
                out[tuple(o_slices)] = rout
        finally:
            # Activate the conversion again (default), even on errors
            self._set_convert(values, True)

        return out

    def __iter__(self):
        """Iterate over the rows of the outcome of the expression.

        This iterator always returns rows as NumPy objects, so a possible out
        container specified in :meth:`Expr.set_output` method is ignored here.

        """

        values, shape, maindim = self.values, self.shape, self.maindim

        # Get different info we need for the main computation loop
        (i_nrows, slice_pos, start, stop, step, nrowsinbuf) = \
            self._get_info(shape, maindim, itermode=True)

        if i_nrows == 0:
            # No elements to compute
            return

        try:
            # This is a hack to prevent doing unnecessary flavor
            # conversions while reading buffers
            self._set_convert(values, False)

            # Start the computation itself
            for _, rout in self._compute_chunks(
                    values, maindim, slice_pos,
                    start, stop, step, nrowsinbuf):
                # Return one row per call
                yield from rout
        finally:
            # Activate the conversion again (default); this also runs when
            # the iterator is closed before being exhausted
            self._set_convert(values, True)

680

681

if __name__ == "__main__":

    # Small demo: evaluate "a * b + c" over on-disk arrays and store the
    # outcome in another on-disk array.  A larger shape, such as
    # (10000, 10000), can be used instead for benchmarking purposes.
    shape = (10, 10_000)

    # The context manager closes the file even if the evaluation fails
    with tb.open_file("/tmp/expression.h5", "w") as f:

        # Create some arrays
        a = f.create_carray(f.root, 'a', atom=tb.Float32Atom(dflt=1),
                            shape=shape)
        b = f.create_carray(f.root, 'b', atom=tb.Float32Atom(dflt=2),
                            shape=shape)
        c = f.create_carray(f.root, 'c', atom=tb.Float32Atom(dflt=3),
                            shape=shape)
        out = f.create_carray(f.root, 'out', atom=tb.Float32Atom(dflt=3),
                              shape=shape)

        # No `uservars` is given, so `a`, `b` and `c` are looked up in
        # this module's namespace
        expr = Expr("a * b + c")
        expr.set_output(out)
        d = expr.eval()

        print("returned-->", repr(d))