Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pyarrow/compute.py: 55%

1# Licensed to the Apache Software Foundation (ASF) under one

2# or more contributor license agreements. See the NOTICE file

3# distributed with this work for additional information

4# regarding copyright ownership. The ASF licenses this file

5# to you under the Apache License, Version 2.0 (the

6# "License"); you may not use this file except in compliance

7# with the License. You may obtain a copy of the License at

9# http://www.apache.org/licenses/LICENSE-2.0

10#

11# Unless required by applicable law or agreed to in writing,

12# software distributed under the License is distributed on an

13# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY

14# KIND, either express or implied. See the License for the

15# specific language governing permissions and limitations

16# under the License.

18from pyarrow._compute import ( # noqa

19 Function,

20 FunctionOptions,

21 FunctionRegistry,

22 HashAggregateFunction,

23 HashAggregateKernel,

24 Kernel,

25 ScalarAggregateFunction,

26 ScalarAggregateKernel,

27 ScalarFunction,

28 ScalarKernel,

29 VectorFunction,

30 VectorKernel,

31 # Option classes

32 ArraySortOptions,

33 AssumeTimezoneOptions,

34 CastOptions,

35 CountOptions,

36 CumulativeOptions,

37 CumulativeSumOptions,

38 DayOfWeekOptions,

39 DictionaryEncodeOptions,

40 RunEndEncodeOptions,

41 ElementWiseAggregateOptions,

42 ExtractRegexOptions,

43 FilterOptions,

44 IndexOptions,

45 JoinOptions,

46 ListSliceOptions,

47 ListFlattenOptions,

48 MakeStructOptions,

49 MapLookupOptions,

50 MatchSubstringOptions,

51 ModeOptions,

52 NullOptions,

53 PadOptions,

54 PairwiseOptions,

55 PartitionNthOptions,

56 QuantileOptions,

57 RandomOptions,

58 RankOptions,

59 ReplaceSliceOptions,

60 ReplaceSubstringOptions,

61 RoundBinaryOptions,

62 RoundOptions,

63 RoundTemporalOptions,

64 RoundToMultipleOptions,

65 ScalarAggregateOptions,

66 SelectKOptions,

67 SetLookupOptions,

68 SliceOptions,

69 SortOptions,

70 SplitOptions,

71 SplitPatternOptions,

72 StrftimeOptions,

73 StrptimeOptions,

74 StructFieldOptions,

75 TakeOptions,

76 TDigestOptions,

77 TrimOptions,

78 Utf8NormalizeOptions,

79 VarianceOptions,

80 WeekOptions,

81 # Functions

82 call_function,

83 function_registry,

84 get_function,

85 list_functions,

86 # Udf

87 call_tabular_function,

88 register_scalar_function,

89 register_tabular_function,

90 register_aggregate_function,

91 register_vector_function,

92 UdfContext,

93 # Expressions

94 Expression,

95)

97from collections import namedtuple

98import inspect

99from textwrap import dedent

100import warnings

101

102import pyarrow as pa

103from pyarrow import _compute_docstrings

104from pyarrow.vendored import docscrape

105

106

def _get_arg_names(func):
    """Return the ``arg_names`` recorded on the function's ``_doc``."""
    function_doc = func._doc
    return function_doc.arg_names

109

110

# Scraped documentation of an options class; ``params`` holds the entries
# of its "Parameters" section.
_OptionsClassDoc = namedtuple('_OptionsClassDoc', ['params'])

112

113

def _scrape_options_class_doc(options_class):
    """
    Parse the docstring of an options class.

    Returns an ``_OptionsClassDoc`` holding the "Parameters" section of the
    class docstring, or None when the class has no (or an empty) docstring.
    """
    raw_doc = options_class.__doc__
    if raw_doc:
        parsed = docscrape.NumpyDocString(raw_doc)
        return _OptionsClassDoc(parsed['Parameters'])
    return None

119

120

def _decorate_compute_function(wrapper, exposed_name, func, options_class):
    """
    Decorate a compute function wrapper with metadata and documentation.

    Parameters
    ----------
    wrapper : callable
        The Python wrapper to decorate. It is modified in place: its
        ``__arrow_compute_function__``, ``__name__``, ``__qualname__`` and
        ``__doc__`` attributes are set.
    exposed_name : str
        Name assigned to ``__name__`` and ``__qualname__``.
    func : object
        The compute function being wrapped. Its ``name``, ``arity``,
        ``kind`` and ``_doc`` attributes are read.
    options_class : type or None
        Options class whose constructor parameters are documented, if any.

    Returns
    -------
    wrapper : callable
        The same object that was passed in.
    """
    cpp_doc = func._doc

    wrapper.__arrow_compute_function__ = dict(
        name=func.name,
        arity=func.arity,
        options_class=cpp_doc.options_class,
        options_required=cpp_doc.options_required)
    wrapper.__name__ = exposed_name
    wrapper.__qualname__ = exposed_name

    doc_pieces = []

    # 1. One-line summary
    summary = cpp_doc.summary
    if not summary:
        # Variadic functions carry Ellipsis as their arity (see
        # _make_generic_wrapper), which cannot be ordered against an int.
        arity = func.arity
        plural = arity is Ellipsis or arity > 1
        arg_str = "arguments" if plural else "argument"
        summary = ("Call compute function {!r} with the given {}"
                   .format(func.name, arg_str))

    doc_pieces.append(f"{summary}.\n\n")

    # 2. Multi-line description
    description = cpp_doc.description
    if description:
        doc_pieces.append(f"{description}\n\n")

    doc_addition = _compute_docstrings.function_doc_additions.get(func.name)

    # 3. Parameter description
    doc_pieces.append(dedent("""\
        Parameters
        ----------
        """))

    # 3a. Compute function parameters
    arg_names = _get_arg_names(func)
    for arg_name in arg_names:
        if func.kind in ('vector', 'scalar_aggregate'):
            arg_type = 'Array-like'
        else:
            arg_type = 'Array-like or scalar-like'
        doc_pieces.append(f"{arg_name} : {arg_type}\n")
        # numpydoc requires description lines to be indented below the
        # "name : type" line.
        doc_pieces.append("    Argument to compute function.\n")

    # 3b. Compute function option values
    if options_class is not None:
        options_class_doc = _scrape_options_class_doc(options_class)
        if options_class_doc:
            for p in options_class_doc.params:
                doc_pieces.append(f"{p.name} : {p.type}\n")
                for s in p.desc:
                    doc_pieces.append(f"    {s}\n")
        else:
            # No docstring to scrape: fall back to the constructor signature
            # and emit a generic description for each parameter.
            warnings.warn(f"Options class {options_class.__name__} "
                          f"does not have a docstring", RuntimeWarning)
            options_sig = inspect.signature(options_class)
            for p in options_sig.parameters.values():
                doc_pieces.append(dedent("""\
                {0} : optional
                    Parameter for {1} constructor. Either `options`
                    or `{0}` can be passed, but not both at the same time.
                """.format(p.name, options_class.__name__)))
        doc_pieces.append(dedent(f"""\
            options : pyarrow.compute.{options_class.__name__}, optional
                Alternative way of passing options.
            """))

    doc_pieces.append(dedent("""\
        memory_pool : pyarrow.MemoryPool, optional
            If not passed, will allocate memory from the default memory pool.
        """))

    # 4. Custom addition (e.g. examples)
    if doc_addition is not None:
        doc_pieces.append("\n{}\n".format(dedent(doc_addition).strip("\n")))

    wrapper.__doc__ = "".join(doc_pieces)
    return wrapper

202

203

def _get_options_class(func):
    """
    Look up the options class named by the function's ``_doc``.

    Returns None when the function names no options class. When a class is
    named but is not present in this module's globals, a RuntimeWarning is
    issued and None is returned.
    """
    options_name = func._doc.options_class
    if not options_name:
        return None

    module_namespace = globals()
    if options_name in module_namespace:
        return module_namespace[options_name]

    warnings.warn("Python binding for {} not exposed"
                  .format(options_name), RuntimeWarning)
    return None

214

215

def _handle_options(name, options_class, options, args, kwargs):
    """
    Resolve the options object for a call to compute function ``name``.

    Options may be given either as extra positional/keyword arguments
    (forwarded to the ``options_class`` constructor) or through ``options``,
    which may be a dict of constructor keywords or an ``options_class``
    instance. Passing both forms, or an ``options`` value of another type,
    raises TypeError. Returns None when no options were given at all.
    """
    extra_given = bool(args or kwargs)

    if extra_given and options is not None:
        raise TypeError(
            "Function {!r} called with both an 'options' argument "
            "and additional arguments"
            .format(name))
    if extra_given:
        return options_class(*args, **kwargs)

    if options is None:
        return None
    if isinstance(options, dict):
        return options_class(**options)
    if isinstance(options, options_class):
        return options
    raise TypeError(
        "Function {!r} expected a {} parameter, got {}"
        .format(name, options_class, type(options)))

235

236

def _make_generic_wrapper(func_name, func, options_class, arity):
    """
    Build the Python callable that forwards to compute function ``func``.

    ``arity`` is the number of positional data arguments, or Ellipsis when
    the count is not fixed. Without an options class the wrapper accepts
    exactly ``arity`` positional arguments. With one, positional arguments
    beyond ``arity`` and all keyword arguments are turned into an options
    object by ``_handle_options``. If the first data argument is an
    ``Expression``, an expression call is returned instead of executing
    the function.
    """
    fixed_arity = arity is not Ellipsis

    if options_class is not None:
        def wrapper(*args, memory_pool=None, options=None, **kwargs):
            if fixed_arity:
                if len(args) < arity:
                    raise TypeError(
                        f"{func_name} takes {arity} positional argument(s), "
                        f"but {len(args)} were given"
                    )
                # Positionals past the data arguments are option values.
                args, option_args = args[:arity], args[arity:]
            else:
                option_args = ()
            options = _handle_options(func_name, options_class, options,
                                      option_args, kwargs)
            if args and isinstance(args[0], Expression):
                return Expression._call(func_name, list(args), options)
            return func.call(args, options, memory_pool)

        return wrapper

    def wrapper(*args, memory_pool=None):
        if fixed_arity and len(args) != arity:
            raise TypeError(
                f"{func_name} takes {arity} positional argument(s), "
                f"but {len(args)} were given"
            )
        if args and isinstance(args[0], Expression):
            return Expression._call(func_name, list(args))
        return func.call(args, None, memory_pool)

    return wrapper

266

267

def _make_signature(arg_names, var_arg_names, options_class):
    """
    Build the ``inspect.Signature`` advertised by a generated wrapper.

    The signature lists, in order: positional-only data arguments, any
    ``*args`` parameter, the constructor parameters of ``options_class``
    followed by a keyword-only ``options`` (both only when an options class
    is given), and finally a keyword-only ``memory_pool``.
    """
    Param = inspect.Parameter

    params = [Param(arg, Param.POSITIONAL_ONLY) for arg in arg_names]
    params.extend(Param(arg, Param.VAR_POSITIONAL) for arg in var_arg_names)

    if options_class is not None:
        accepted_kinds = (Param.POSITIONAL_OR_KEYWORD, Param.KEYWORD_ONLY)
        constructor_sig = inspect.signature(options_class)
        for option_param in constructor_sig.parameters.values():
            assert option_param.kind in accepted_kinds
            if var_arg_names:
                # Cannot have a positional argument after a *args
                option_param = option_param.replace(kind=Param.KEYWORD_ONLY)
            params.append(option_param)
        params.append(Param("options", Param.KEYWORD_ONLY, default=None))

    params.append(Param("memory_pool", Param.KEYWORD_ONLY, default=None))
    return inspect.Signature(params)

289

290

def _wrap_function(name, func):
    """
    Create the documented Python wrapper exposed as ``name`` for ``func``.

    A trailing argument name starting with ``*`` is treated as the variadic
    parameter and is split off from the positional argument names before
    the wrapper's signature is built.
    """
    options_class = _get_options_class(func)
    arg_names = _get_arg_names(func)

    var_arg_names = []
    if arg_names and arg_names[-1].startswith('*'):
        star_name = arg_names.pop()
        var_arg_names.append(star_name.lstrip('*'))

    wrapper = _make_generic_wrapper(
        name, func, options_class, arity=func.arity)
    signature = _make_signature(arg_names, var_arg_names, options_class)
    wrapper.__signature__ = signature
    return _decorate_compute_function(wrapper, name, func, options_class)

305

306

def _make_global_functions():
    """
    Make global functions wrapping each compute function.

    Every function in the registry, except hash aggregates and nullary
    scalar aggregates, is wrapped and stored in this module's globals under
    both its registry name and its Python-safe name.

    Note that some of the automatically-generated wrappers may be overridden
    by custom versions below.
    """
    module_namespace = globals()
    registry = function_registry()

    # Avoid clashes with Python keywords
    keyword_safe_names = {'and': 'and_', 'or': 'or_'}

    for cpp_name in registry.list_functions():
        func = registry.get_function(cpp_name)
        if func.kind == "hash_aggregate":
            # Hash aggregate functions are not callable,
            # so let's not expose them at module level.
            continue
        if func.kind == "scalar_aggregate" and func.arity == 0:
            # Nullary scalar aggregate functions are not callable
            # directly so let's not expose them at module level.
            continue

        name = keyword_safe_names.get(cpp_name, cpp_name)
        assert name not in module_namespace, name
        wrapped = _wrap_function(name, func)
        module_namespace[cpp_name] = wrapped
        module_namespace[name] = wrapped

334

335

# Populate the module namespace at import time. This runs before the
# hand-written functions below are defined, so those definitions replace
# any generated wrapper of the same name.
_make_global_functions()

337

338

def cast(arr, target_type=None, safe=None, options=None, memory_pool=None):
    """
    Cast array values to another data type. Can also be invoked as an array
    instance method.

    Parameters
    ----------
    arr : Array-like
    target_type : DataType or str
        Type to cast to
    safe : bool, default True
        Check for overflows or other unsafe conversions
    options : CastOptions, default None
        Fully specified cast options. Cannot be combined with
        `target_type` or `safe`.
    memory_pool : MemoryPool, optional
        memory pool to use for allocations during function execution.

    Returns
    -------
    casted : Array
        The cast result as a new Array

    Raises
    ------
    ValueError
        If `options` is passed together with `target_type` or `safe`.

    Examples
    --------
    >>> from datetime import datetime
    >>> import pyarrow as pa
    >>> arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)])
    >>> arr.type
    TimestampType(timestamp[us])

    You can use ``pyarrow.DataType`` objects to specify the target type:

    >>> cast(arr, pa.timestamp('ms'))
    <pyarrow.lib.TimestampArray object at ...>
    [
      2010-01-01 00:00:00.000,
      2015-01-01 00:00:00.000
    ]

    >>> cast(arr, pa.timestamp('ms')).type
    TimestampType(timestamp[ms])

    Alternatively, it is also supported to use the string aliases for these
    types:

    >>> arr.cast('timestamp[ms]')
    <pyarrow.lib.TimestampArray object at ...>
    [
      2010-01-01 00:00:00.000,
      2015-01-01 00:00:00.000
    ]
    >>> arr.cast('timestamp[ms]').type
    TimestampType(timestamp[ms])
    """
    if options is not None:
        if safe is not None or target_type is not None:
            raise ValueError("Must either pass values for 'target_type' and 'safe'"
                             " or pass a value for 'options'")
    else:
        target_type = pa.types.lib.ensure_type(target_type)
        # Only an explicit ``safe=False`` selects the unsafe cast.
        build_options = CastOptions.unsafe if safe is False else CastOptions.safe
        options = build_options(target_type)
    return call_function("cast", [arr], options, memory_pool)

406

407

def index(data, value, start=None, end=None, *, memory_pool=None):
    """
    Find the index of the first occurrence of a given value.

    Parameters
    ----------
    data : Array-like
    value : Scalar-like object
        The value to search for.
    start : int, optional
        Offset at which the search begins.
    end : int, optional
        Offset at which the search stops (exclusive).
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    index : int
        the index, or -1 if not found

    Examples
    --------
    >>> import pyarrow as pa
    >>> import pyarrow.compute as pc
    >>> arr = pa.array(["Lorem", "ipsum", "dolor", "sit", "Lorem", "ipsum"])
    >>> pc.index(arr, "ipsum")
    <pyarrow.Int64Scalar: 1>
    >>> pc.index(arr, "ipsum", start=2)
    <pyarrow.Int64Scalar: 5>
    >>> pc.index(arr, "amet")
    <pyarrow.Int64Scalar: -1>
    """
    # Restrict the search to the requested window.
    if start is not None:
        if end is None:
            data = data.slice(start)
        else:
            data = data.slice(start, end - start)
    elif end is not None:
        data = data.slice(0, end)

    # The searched value must be a scalar of the data's type.
    if not isinstance(value, pa.Scalar):
        value = pa.scalar(value, type=data.type)
    elif data.type != value.type:
        value = pa.scalar(value.as_py(), type=data.type)

    result = call_function('index', [data], IndexOptions(value=value),
                           memory_pool)
    if start is None:
        return result

    # The match position is relative to the slice; shift it back so it
    # refers to the original data. A negative result (not found) is kept.
    position = result.as_py()
    if position >= 0:
        result = pa.scalar(position + start, type=pa.int64())
    return result

456

457

def take(data, indices, *, boundscheck=True, memory_pool=None):
    """
    Select values (or records) from array- or table-like data given integer
    selection indices.

    The result will be of the same type(s) as the input, with elements taken
    from the input array (or record batch / table fields) at the given
    indices. If an index is null then the corresponding value in the output
    will be null.

    Parameters
    ----------
    data : Array, ChunkedArray, RecordBatch, or Table
    indices : Array, ChunkedArray
        Must be of integer type
    boundscheck : boolean, default True
        Whether to boundscheck the indices. If False and there is an out of
        bounds index, will likely cause the process to crash.
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    result : depends on inputs
        Selected values for the given indices

    Examples
    --------
    >>> import pyarrow as pa
    >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
    >>> indices = pa.array([0, None, 4, 3])
    >>> arr.take(indices)
    <pyarrow.lib.StringArray object at ...>
    [
      "a",
      null,
      "e",
      null
    ]
    """
    take_options = TakeOptions(boundscheck=boundscheck)
    selection_inputs = [data, indices]
    return call_function('take', selection_inputs, take_options, memory_pool)

500

501

def fill_null(values, fill_value):
    """Replace each null element in values with a corresponding
    element from fill_value.

    If fill_value is scalar-like, then every null element in values
    will be replaced with fill_value. If fill_value is array-like,
    then the i-th element in values will be replaced with the i-th
    element in fill_value.

    The fill_value's type must be the same as that of values, or it
    must be able to be implicitly casted to the array's type.

    This is an alias for :func:`coalesce`.

    Parameters
    ----------
    values : Array, ChunkedArray, or Scalar-like object
        Each null element is replaced with the corresponding value
        from fill_value.
    fill_value : Array, ChunkedArray, or Scalar-like object
        If not same type as values, will attempt to cast.

    Returns
    -------
    result : depends on inputs
        Values with all null elements replaced

    Examples
    --------
    >>> import pyarrow as pa
    >>> arr = pa.array([1, 2, None, 3], type=pa.int8())
    >>> fill_value = pa.scalar(5, type=pa.int8())
    >>> arr.fill_null(fill_value)
    <pyarrow.lib.Int8Array object at ...>
    [
      1,
      2,
      5,
      3
    ]
    >>> arr = pa.array([1, 2, None, 4, None])
    >>> arr.fill_null(pa.array([10, 20, 30, 40, 50]))
    <pyarrow.lib.Int64Array object at ...>
    [
      1,
      2,
      30,
      4,
      50
    ]
    """
    if not isinstance(fill_value, (pa.Array, pa.ChunkedArray, pa.Scalar)):
        # Plain Python value: build a scalar of the target type directly.
        fill_value = pa.scalar(fill_value, type=values.type)
    elif values.type != fill_value.type:
        if isinstance(fill_value, pa.Scalar):
            fill_value = pa.scalar(fill_value.as_py(), type=values.type)
        else:
            # Arrays and chunked arrays have no ``as_py``; cast them to the
            # type of ``values`` instead.
            fill_value = fill_value.cast(values.type)

    return call_function("coalesce", [values, fill_value])

559

560

def top_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
    """
    Select the indices of the top-k ordered elements from array- or table-like
    data.

    This is a specialization for :func:`select_k_unstable`. Output is not
    guaranteed to be stable.

    Parameters
    ----------
    values : Array, ChunkedArray, RecordBatch, or Table
        Data to sort and get top indices from.
    k : int
        The number of `k` elements to keep.
    sort_keys : List-like
        Column key names to order by when input is table-like data.
        The passed object is not modified.
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    result : Array
        Indices of the top-k ordered elements

    Examples
    --------
    >>> import pyarrow as pa
    >>> import pyarrow.compute as pc
    >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
    >>> pc.top_k_unstable(arr, k=3)
    <pyarrow.lib.UInt64Array object at ...>
    [
      5,
      4,
      2
    ]
    """
    if sort_keys is None:
        sort_keys = []
    if isinstance(values, (pa.Array, pa.ChunkedArray)):
        # Array input has no column names; a placeholder key carries the
        # sort order. Build a new list so the caller's object is untouched.
        sort_keys = list(sort_keys) + [("dummy", "descending")]
    else:
        sort_keys = [(key_name, "descending") for key_name in sort_keys]
    options = SelectKOptions(k, sort_keys)
    return call_function("select_k_unstable", [values], options, memory_pool)

606

607

def bottom_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
    """
    Select the indices of the bottom-k ordered elements from
    array- or table-like data.

    This is a specialization for :func:`select_k_unstable`. Output is not
    guaranteed to be stable.

    Parameters
    ----------
    values : Array, ChunkedArray, RecordBatch, or Table
        Data to sort and get bottom indices from.
    k : int
        The number of `k` elements to keep.
    sort_keys : List-like
        Column key names to order by when input is table-like data.
        The passed object is not modified.
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    result : Array of indices
        Indices of the bottom-k ordered elements

    Examples
    --------
    >>> import pyarrow as pa
    >>> import pyarrow.compute as pc
    >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
    >>> pc.bottom_k_unstable(arr, k=3)
    <pyarrow.lib.UInt64Array object at ...>
    [
      0,
      1,
      2
    ]
    """
    if sort_keys is None:
        sort_keys = []
    if isinstance(values, (pa.Array, pa.ChunkedArray)):
        # Array input has no column names; a placeholder key carries the
        # sort order. Build a new list so the caller's object is untouched.
        sort_keys = list(sort_keys) + [("dummy", "ascending")]
    else:
        sort_keys = [(key_name, "ascending") for key_name in sort_keys]
    options = SelectKOptions(k, sort_keys)
    return call_function("select_k_unstable", [values], options, memory_pool)

653

654

def random(n, *, initializer='system', options=None, memory_pool=None):
    """
    Generate numbers in the range [0, 1).

    Generated values are uniformly-distributed, double-precision
    in range [0, 1). Algorithm and seed can be changed via RandomOptions.

    Parameters
    ----------
    n : int
        Number of values to generate, must be greater than or equal to 0
    initializer : int or str
        How to initialize the underlying random generator.
        If an integer is given, it is used as a seed.
        If "system" is given, the random generator is initialized with
        a system-specific source of (hopefully true) randomness.
        Other values are invalid.
        Ignored when `options` is passed.
    options : pyarrow.compute.RandomOptions, optional
        Alternative way of passing options.
    memory_pool : pyarrow.MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.
    """
    # Only build options from ``initializer`` when the caller did not supply
    # an options object, so that a passed ``options`` is actually honored.
    if options is None:
        options = RandomOptions(initializer=initializer)
    return call_function("random", [], options, memory_pool, length=n)

679

680

def field(*name_or_index):
    """Reference a column of the dataset.

    Stores only the field's name. Type and other information is known only when
    the expression is bound to a dataset having an explicit schema.

    Nested references are allowed by passing multiple names or a tuple of
    names. For example ``('foo', 'bar')`` references the field named "bar"
    inside the field named "foo".

    Parameters
    ----------
    *name_or_index : string, multiple strings, tuple or int
        The name or index of the (possibly nested) field the expression
        references to.

    Returns
    -------
    field_expr : Expression
        Reference to the given field

    Raises
    ------
    TypeError
        If a single argument is given that is not a str, int or tuple.

    Examples
    --------
    >>> import pyarrow.compute as pc
    >>> pc.field("a")
    <pyarrow.compute.Expression a>
    >>> pc.field(1)
    <pyarrow.compute.Expression FieldPath(1)>
    >>> pc.field(("a", "b"))
    <pyarrow.compute.Expression FieldRef.Nested(FieldRef.Name(a) ...
    >>> pc.field("a", "b")
    <pyarrow.compute.Expression FieldRef.Nested(FieldRef.Name(a) ...
    """
    if len(name_or_index) != 1:
        # In case of multiple strings not supplied in a tuple
        return Expression._nested_field(name_or_index)

    (reference,) = name_or_index
    if isinstance(reference, (str, int)):
        return Expression._field(reference)
    if isinstance(reference, tuple):
        return Expression._nested_field(reference)
    raise TypeError(
        "field reference should be str, multiple str, tuple or "
        f"integer, got {type(reference)}"
    )

728

729

def scalar(value):
    """Expression representing a scalar value.

    Parameters
    ----------
    value : bool, int, float or string
        Python value of the scalar. Note that only a subset of types are
        currently supported.

    Returns
    -------
    scalar_expr : Expression
        An Expression representing the scalar value
    """
    scalar_expr = Expression._scalar(value)
    return scalar_expr