Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pyarrow/compute.py: 55%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# Licensed to the Apache Software Foundation (ASF) under one
2# or more contributor license agreements. See the NOTICE file
3# distributed with this work for additional information
4# regarding copyright ownership. The ASF licenses this file
5# to you under the Apache License, Version 2.0 (the
6# "License"); you may not use this file except in compliance
7# with the License. You may obtain a copy of the License at
8#
9# http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing,
12# software distributed under the License is distributed on an
13# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14# KIND, either express or implied. See the License for the
15# specific language governing permissions and limitations
16# under the License.
18from pyarrow._compute import ( # noqa
19 Function,
20 FunctionOptions,
21 FunctionRegistry,
22 HashAggregateFunction,
23 HashAggregateKernel,
24 Kernel,
25 ScalarAggregateFunction,
26 ScalarAggregateKernel,
27 ScalarFunction,
28 ScalarKernel,
29 VectorFunction,
30 VectorKernel,
31 # Option classes
32 ArraySortOptions,
33 AssumeTimezoneOptions,
34 CastOptions,
35 CountOptions,
36 CumulativeOptions,
37 CumulativeSumOptions,
38 DayOfWeekOptions,
39 DictionaryEncodeOptions,
40 RunEndEncodeOptions,
41 ElementWiseAggregateOptions,
42 ExtractRegexOptions,
43 FilterOptions,
44 IndexOptions,
45 JoinOptions,
46 ListSliceOptions,
47 ListFlattenOptions,
48 MakeStructOptions,
49 MapLookupOptions,
50 MatchSubstringOptions,
51 ModeOptions,
52 NullOptions,
53 PadOptions,
54 PairwiseOptions,
55 PartitionNthOptions,
56 QuantileOptions,
57 RandomOptions,
58 RankOptions,
59 ReplaceSliceOptions,
60 ReplaceSubstringOptions,
61 RoundBinaryOptions,
62 RoundOptions,
63 RoundTemporalOptions,
64 RoundToMultipleOptions,
65 ScalarAggregateOptions,
66 SelectKOptions,
67 SetLookupOptions,
68 SliceOptions,
69 SortOptions,
70 SplitOptions,
71 SplitPatternOptions,
72 StrftimeOptions,
73 StrptimeOptions,
74 StructFieldOptions,
75 TakeOptions,
76 TDigestOptions,
77 TrimOptions,
78 Utf8NormalizeOptions,
79 VarianceOptions,
80 WeekOptions,
81 # Functions
82 call_function,
83 function_registry,
84 get_function,
85 list_functions,
86 # Udf
87 call_tabular_function,
88 register_scalar_function,
89 register_tabular_function,
90 register_aggregate_function,
91 register_vector_function,
92 UdfContext,
93 # Expressions
94 Expression,
95)
97from collections import namedtuple
98import inspect
99from textwrap import dedent
100import warnings
102import pyarrow as pa
103from pyarrow import _compute_docstrings
104from pyarrow.vendored import docscrape
def _get_arg_names(func):
    # The argument names live on the function's documentation object.
    func_doc = func._doc
    return func_doc.arg_names
# Record holding the pieces scraped from an options class docstring.
_OptionsClassDoc = namedtuple('_OptionsClassDoc', ('params',))


def _scrape_options_class_doc(options_class):
    # Returns None when the class has no (or an empty) docstring; otherwise
    # the "Parameters" section of the parsed docstring, wrapped in an
    # _OptionsClassDoc.
    raw_doc = options_class.__doc__
    if raw_doc:
        parsed = docscrape.NumpyDocString(raw_doc)
        return _OptionsClassDoc(parsed['Parameters'])
    return None
def _decorate_compute_function(wrapper, exposed_name, func, options_class):
    """
    Decorate a compute function wrapper with metadata and documentation.

    Parameters
    ----------
    wrapper : callable
        The wrapper to decorate. It is modified in place: its
        ``__arrow_compute_function__``, ``__name__``, ``__qualname__`` and
        ``__doc__`` attributes are set.
    exposed_name : str
        Name under which the wrapper is exposed.
    func : object
        The compute function being wrapped. Its ``_doc``, ``name``,
        ``arity`` and ``kind`` attributes are read.
    options_class : type or None
        Options class of the function, or None if it takes no options.

    Returns
    -------
    wrapper : callable
        The same object that was passed in.
    """
    cpp_doc = func._doc

    wrapper.__arrow_compute_function__ = dict(
        name=func.name,
        arity=func.arity,
        options_class=cpp_doc.options_class,
        options_required=cpp_doc.options_required)
    wrapper.__name__ = exposed_name
    wrapper.__qualname__ = exposed_name

    doc_pieces = []

    # 1. One-line summary
    summary = cpp_doc.summary
    if not summary:
        # A variadic function reports its arity as Ellipsis, which cannot be
        # ordered against an int; treat it as taking several arguments.
        plural = func.arity is Ellipsis or func.arity > 1
        arg_str = "arguments" if plural else "argument"
        summary = ("Call compute function {!r} with the given {}"
                   .format(func.name, arg_str))

    doc_pieces.append(f"{summary}.\n\n")

    # 2. Multi-line description
    description = cpp_doc.description
    if description:
        doc_pieces.append(f"{description}\n\n")

    doc_addition = _compute_docstrings.function_doc_additions.get(func.name)

    # 3. Parameter description
    doc_pieces.append(dedent("""\
        Parameters
        ----------
        """))

    # 3a. Compute function parameters
    # The advertised argument type depends only on the function kind, so it
    # is computed once rather than per argument.
    if func.kind in ('vector', 'scalar_aggregate'):
        arg_type = 'Array-like'
    else:
        arg_type = 'Array-like or scalar-like'
    for arg_name in _get_arg_names(func):
        doc_pieces.append(f"{arg_name} : {arg_type}\n")
        doc_pieces.append("    Argument to compute function.\n")

    # 3b. Compute function option values
    if options_class is not None:
        options_class_doc = _scrape_options_class_doc(options_class)
        if options_class_doc:
            for p in options_class_doc.params:
                doc_pieces.append(f"{p.name} : {p.type}\n")
                for s in p.desc:
                    doc_pieces.append(f"    {s}\n")
        else:
            # No docstring to scrape: fall back to the constructor signature.
            warnings.warn(f"Options class {options_class.__name__} "
                          f"does not have a docstring", RuntimeWarning)
            options_sig = inspect.signature(options_class)
            for p in options_sig.parameters.values():
                doc_pieces.append(dedent("""\
                {0} : optional
                    Parameter for {1} constructor. Either `options`
                    or `{0}` can be passed, but not both at the same time.
                """.format(p.name, options_class.__name__)))
        doc_pieces.append(dedent(f"""\
            options : pyarrow.compute.{options_class.__name__}, optional
                Alternative way of passing options.
            """))

    doc_pieces.append(dedent("""\
        memory_pool : pyarrow.MemoryPool, optional
            If not passed, will allocate memory from the default memory pool.
        """))

    # 4. Custom addition (e.g. examples)
    if doc_addition is not None:
        doc_pieces.append("\n{}\n".format(dedent(doc_addition).strip("\n")))

    wrapper.__doc__ = "".join(doc_pieces)
    return wrapper
def _get_options_class(func):
    # Resolve the options class named in the function's documentation to the
    # object of that name in this module. Returns None when the function
    # names no options class, or (with a RuntimeWarning) when this module has
    # no binding of that name.
    class_name = func._doc.options_class
    if not class_name:
        return None
    namespace = globals()
    if class_name in namespace:
        return namespace[class_name]
    warnings.warn("Python binding for {} not exposed"
                  .format(class_name), RuntimeWarning)
    return None
def _handle_options(name, options_class, options, args, kwargs):
    # Normalize the ways of passing options into one options object (or None).
    #
    # - extra positional/keyword arguments build a new `options_class`
    #   instance; combining them with `options` is a TypeError;
    # - `options` may be a dict of constructor keywords or an existing
    #   `options_class` instance; anything else is a TypeError;
    # - with nothing passed at all, the result is None.
    has_option_arguments = bool(args or kwargs)

    if has_option_arguments and options is not None:
        raise TypeError(
            "Function {!r} called with both an 'options' argument "
            "and additional arguments"
            .format(name))
    if has_option_arguments:
        return options_class(*args, **kwargs)

    if options is None:
        return None
    if isinstance(options, dict):
        return options_class(**options)
    if isinstance(options, options_class):
        return options
    raise TypeError(
        "Function {!r} expected a {} parameter, got {}"
        .format(name, options_class, type(options)))
def _make_generic_wrapper(func_name, func, options_class, arity):
    # Build the Python callable that forwards to `func.call`, or to
    # `Expression._call` when the first positional argument is an Expression.
    # An arity of Ellipsis disables the positional argument count check.
    has_fixed_arity = arity is not Ellipsis

    def arity_error(given):
        return TypeError(
            f"{func_name} takes {arity} positional argument(s), "
            f"but {given} were given"
        )

    if options_class is None:
        # No options: the positional count must match the arity exactly.
        def wrapper(*args, memory_pool=None):
            if has_fixed_arity and len(args) != arity:
                raise arity_error(len(args))
            if args and isinstance(args[0], Expression):
                return Expression._call(func_name, list(args))
            return func.call(args, None, memory_pool)
        return wrapper

    # With options: positionals beyond the arity are options constructor
    # arguments.
    def wrapper(*args, memory_pool=None, options=None, **kwargs):
        option_args = ()
        if has_fixed_arity:
            if len(args) < arity:
                raise arity_error(len(args))
            args, option_args = args[:arity], args[arity:]
        options = _handle_options(func_name, options_class, options,
                                  option_args, kwargs)
        if args and isinstance(args[0], Expression):
            return Expression._call(func_name, list(args), options)
        return func.call(args, options, memory_pool)
    return wrapper
def _make_signature(arg_names, var_arg_names, options_class):
    # Assemble the inspect.Signature advertised by a generated wrapper:
    # positional-only arguments, an optional *args, the options class
    # constructor parameters followed by `options`, and finally `memory_pool`.
    Parameter = inspect.Parameter
    keyword_only = Parameter.KEYWORD_ONLY

    params = [Parameter(name, Parameter.POSITIONAL_ONLY)
              for name in arg_names]
    params.extend(Parameter(name, Parameter.VAR_POSITIONAL)
                  for name in var_arg_names)

    if options_class is not None:
        accepted_kinds = (Parameter.POSITIONAL_OR_KEYWORD, keyword_only)
        constructor_params = inspect.signature(options_class).parameters
        for option_param in constructor_params.values():
            assert option_param.kind in accepted_kinds
            if var_arg_names:
                # Cannot have a positional argument after a *args
                option_param = option_param.replace(kind=keyword_only)
            params.append(option_param)
        params.append(Parameter("options", keyword_only, default=None))

    params.append(Parameter("memory_pool", keyword_only, default=None))
    return inspect.Signature(params)
def _wrap_function(name, func):
    # Produce the fully decorated Python wrapper (callable, signature,
    # metadata and docstring) for one compute function.
    options_class = _get_options_class(func)
    arg_names = _get_arg_names(func)

    # A trailing argument name starting with '*' denotes a variadic argument;
    # it is removed from the positional names and stripped of the stars.
    var_arg_names = []
    if arg_names and arg_names[-1].startswith('*'):
        var_arg_names.append(arg_names.pop().lstrip('*'))

    wrapper = _make_generic_wrapper(
        name, func, options_class, arity=func.arity)
    wrapper.__signature__ = _make_signature(arg_names, var_arg_names,
                                            options_class)
    return _decorate_compute_function(wrapper, name, func, options_class)
def _make_global_functions():
    """
    Make global functions wrapping each compute function.

    Note that some of the automatically-generated wrappers may be overridden
    by custom versions below.
    """
    module_namespace = globals()
    registry = function_registry()

    # Avoid clashes with Python keywords
    rewrites = {'and': 'and_',
                'or': 'or_'}

    for cpp_name in registry.list_functions():
        func = registry.get_function(cpp_name)
        # Hash aggregate functions are not callable, and nullary scalar
        # aggregate functions are not callable directly, so neither is
        # exposed at module level.
        is_hash_aggregate = func.kind == "hash_aggregate"
        is_nullary_aggregate = (func.kind == "scalar_aggregate"
                                and func.arity == 0)
        if is_hash_aggregate or is_nullary_aggregate:
            continue
        name = rewrites.get(cpp_name, cpp_name)
        assert name not in module_namespace, name
        wrapped = _wrap_function(name, func)
        # Expose the wrapper under both the registry name and the rewritten
        # name.
        module_namespace[cpp_name] = wrapped
        module_namespace[name] = wrapped


_make_global_functions()
def cast(arr, target_type=None, safe=None, options=None, memory_pool=None):
    """
    Cast array values to another data type. Can also be invoked as an array
    instance method.

    Parameters
    ----------
    arr : Array-like
    target_type : DataType or str
        Type to cast to
    safe : bool, default True
        Check for overflows or other unsafe conversions
    options : CastOptions, default None
        Additional checks passed via CastOptions. Cannot be combined with
        `target_type` or `safe`.
    memory_pool : MemoryPool, optional
        memory pool to use for allocations during function execution.

    Examples
    --------
    >>> from datetime import datetime
    >>> import pyarrow as pa
    >>> arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)])
    >>> arr.type
    TimestampType(timestamp[us])

    You can use ``pyarrow.DataType`` objects to specify the target type:

    >>> cast(arr, pa.timestamp('ms'))
    <pyarrow.lib.TimestampArray object at ...>
    [
      2010-01-01 00:00:00.000,
      2015-01-01 00:00:00.000
    ]

    >>> cast(arr, pa.timestamp('ms')).type
    TimestampType(timestamp[ms])

    Alternatively, it is also supported to use the string aliases for these
    types:

    >>> arr.cast('timestamp[ms]')
    <pyarrow.lib.TimestampArray object at ...>
    [
      2010-01-01 00:00:00.000,
      2015-01-01 00:00:00.000
    ]
    >>> arr.cast('timestamp[ms]').type
    TimestampType(timestamp[ms])

    Returns
    -------
    casted : Array
        The cast result as a new Array
    """
    if options is not None:
        # Explicit options exclude the individual shortcut arguments.
        if safe is not None or target_type is not None:
            raise ValueError(
                "Must either pass values for 'target_type' and 'safe'"
                " or pass a value for 'options'")
    else:
        target_type = pa.types.lib.ensure_type(target_type)
        # Only an explicit `safe=False` selects the unsafe options.
        make_options = CastOptions.unsafe if safe is False else CastOptions.safe
        options = make_options(target_type)
    return call_function("cast", [arr], options, memory_pool)
def index(data, value, start=None, end=None, *, memory_pool=None):
    """
    Find the index of the first occurrence of a given value.

    Parameters
    ----------
    data : Array-like
    value : Scalar-like object
        The value to search for.
    start : int, optional
    end : int, optional
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    index : int
        the index, or -1 if not found

    Examples
    --------
    >>> import pyarrow as pa
    >>> import pyarrow.compute as pc
    >>> arr = pa.array(["Lorem", "ipsum", "dolor", "sit", "Lorem", "ipsum"])
    >>> pc.index(arr, "ipsum")
    <pyarrow.Int64Scalar: 1>
    >>> pc.index(arr, "ipsum", start=2)
    <pyarrow.Int64Scalar: 5>
    >>> pc.index(arr, "amet")
    <pyarrow.Int64Scalar: -1>
    """
    # Restrict the search to the requested window.
    if start is not None:
        if end is None:
            data = data.slice(start)
        else:
            data = data.slice(start, end - start)
    elif end is not None:
        data = data.slice(0, end)

    # The searched value must be a scalar of the same type as the data.
    if not isinstance(value, pa.Scalar):
        value = pa.scalar(value, type=data.type)
    elif data.type != value.type:
        value = pa.scalar(value.as_py(), type=data.type)

    result = call_function(
        'index', [data], IndexOptions(value=value), memory_pool)

    # A hit found in the sliced data is shifted back by `start`.
    if start is not None:
        position = result.as_py()
        if position >= 0:
            result = pa.scalar(position + start, type=pa.int64())
    return result
def take(data, indices, *, boundscheck=True, memory_pool=None):
    """
    Select values (or records) from array- or table-like data given integer
    selection indices.

    The result will be of the same type(s) as the input, with elements taken
    from the input array (or record batch / table fields) at the given
    indices. If an index is null then the corresponding value in the output
    will be null.

    Parameters
    ----------
    data : Array, ChunkedArray, RecordBatch, or Table
    indices : Array, ChunkedArray
        Must be of integer type
    boundscheck : boolean, default True
        Whether to boundscheck the indices. If False and there is an out of
        bounds index, will likely cause the process to crash.
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    result : depends on inputs
        Selected values for the given indices

    Examples
    --------
    >>> import pyarrow as pa
    >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
    >>> indices = pa.array([0, None, 4, 3])
    >>> arr.take(indices)
    <pyarrow.lib.StringArray object at ...>
    [
      "a",
      null,
      "e",
      null
    ]
    """
    take_options = TakeOptions(boundscheck=boundscheck)
    call_args = [data, indices]
    return call_function('take', call_args, take_options, memory_pool)
def fill_null(values, fill_value):
    """Replace each null element in values with a corresponding
    element from fill_value.

    If fill_value is scalar-like, then every null element in values
    will be replaced with fill_value. If fill_value is array-like,
    then the i-th element in values will be replaced with the i-th
    element in fill_value.

    The fill_value's type must be the same as that of values, or it
    must be able to be implicitly casted to the array's type.

    This is an alias for :func:`coalesce`.

    Parameters
    ----------
    values : Array, ChunkedArray, or Scalar-like object
        Each null element is replaced with the corresponding value
        from fill_value.
    fill_value : Array, ChunkedArray, or Scalar-like object
        If not same type as values, will attempt to cast.

    Returns
    -------
    result : depends on inputs
        Values with all null elements replaced

    Examples
    --------
    >>> import pyarrow as pa
    >>> arr = pa.array([1, 2, None, 3], type=pa.int8())
    >>> fill_value = pa.scalar(5, type=pa.int8())
    >>> arr.fill_null(fill_value)
    <pyarrow.lib.Int8Array object at ...>
    [
      1,
      2,
      5,
      3
    ]
    >>> arr = pa.array([1, 2, None, 4, None])
    >>> arr.fill_null(pa.array([10, 20, 30, 40, 50]))
    <pyarrow.lib.Int64Array object at ...>
    [
      1,
      2,
      30,
      4,
      50
    ]
    """
    if not isinstance(fill_value, (pa.Array, pa.ChunkedArray, pa.Scalar)):
        # Plain Python value: wrap it as a scalar of the type of `values`.
        fill_value = pa.scalar(fill_value, type=values.type)
    elif values.type != fill_value.type:
        if isinstance(fill_value, pa.Scalar):
            fill_value = pa.scalar(fill_value.as_py(), type=values.type)
        else:
            # Arrays and chunked arrays provide no as_py(); cast them to
            # the type of `values` instead.
            fill_value = fill_value.cast(values.type)

    return call_function("coalesce", [values, fill_value])
def top_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
    """
    Select the indices of the top-k ordered elements from array- or table-like
    data.

    This is a specialization for :func:`select_k_unstable`. Output is not
    guaranteed to be stable.

    Parameters
    ----------
    values : Array, ChunkedArray, RecordBatch, or Table
        Data to sort and get top indices from.
    k : int
        The number of `k` elements to keep.
    sort_keys : List-like
        Column key names to order by when input is table-like data.
        The passed object is not modified.
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    result : Array
        Indices of the top-k ordered elements

    Examples
    --------
    >>> import pyarrow as pa
    >>> import pyarrow.compute as pc
    >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
    >>> pc.top_k_unstable(arr, k=3)
    <pyarrow.lib.UInt64Array object at ...>
    [
      5,
      4,
      2
    ]
    """
    # Work on a private copy so the caller's `sort_keys` is never mutated.
    keys = [] if sort_keys is None else list(sort_keys)
    if isinstance(values, (pa.Array, pa.ChunkedArray)):
        # Array input: order by the placeholder key "dummy".
        keys.append(("dummy", "descending"))
    else:
        keys = [(key_name, "descending") for key_name in keys]
    options = SelectKOptions(k, keys)
    return call_function("select_k_unstable", [values], options, memory_pool)
def bottom_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
    """
    Select the indices of the bottom-k ordered elements from
    array- or table-like data.

    This is a specialization for :func:`select_k_unstable`. Output is not
    guaranteed to be stable.

    Parameters
    ----------
    values : Array, ChunkedArray, RecordBatch, or Table
        Data to sort and get bottom indices from.
    k : int
        The number of `k` elements to keep.
    sort_keys : List-like
        Column key names to order by when input is table-like data.
        The passed object is not modified.
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    result : Array of indices
        Indices of the bottom-k ordered elements

    Examples
    --------
    >>> import pyarrow as pa
    >>> import pyarrow.compute as pc
    >>> arr = pa.array(["a", "b", "c", None, "e", "f"])
    >>> pc.bottom_k_unstable(arr, k=3)
    <pyarrow.lib.UInt64Array object at ...>
    [
      0,
      1,
      2
    ]
    """
    # Work on a private copy so the caller's `sort_keys` is never mutated.
    keys = [] if sort_keys is None else list(sort_keys)
    if isinstance(values, (pa.Array, pa.ChunkedArray)):
        # Array input: order by the placeholder key "dummy".
        keys.append(("dummy", "ascending"))
    else:
        keys = [(key_name, "ascending") for key_name in keys]
    options = SelectKOptions(k, keys)
    return call_function("select_k_unstable", [values], options, memory_pool)
def random(n, *, initializer='system', options=None, memory_pool=None):
    """
    Generate numbers in the range [0, 1).

    Generated values are uniformly-distributed, double-precision
    in range [0, 1). Algorithm and seed can be changed via RandomOptions.

    Parameters
    ----------
    n : int
        Number of values to generate, must be greater than or equal to 0
    initializer : int or str
        How to initialize the underlying random generator.
        If an integer is given, it is used as a seed.
        If "system" is given, the random generator is initialized with
        a system-specific source of (hopefully true) randomness.
        Other values are invalid.
        Ignored when `options` is passed.
    options : pyarrow.compute.RandomOptions, optional
        Alternative way of passing options.
    memory_pool : pyarrow.MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.
    """
    # Honor explicitly passed options; only build them from `initializer`
    # when the caller did not provide any.
    if options is None:
        options = RandomOptions(initializer=initializer)
    return call_function("random", [], options, memory_pool, length=n)
def field(*name_or_index):
    """Reference a column of the dataset.

    Stores only the field's name. Type and other information is known only when
    the expression is bound to a dataset having an explicit scheme.

    Nested references are allowed by passing multiple names or a tuple of
    names. For example ``('foo', 'bar')`` references the field named "bar"
    inside the field named "foo".

    Parameters
    ----------
    *name_or_index : string, multiple strings, tuple or int
        The name or index of the (possibly nested) field the expression
        references to.

    Returns
    -------
    field_expr : Expression
        Reference to the given field

    Examples
    --------
    >>> import pyarrow.compute as pc
    >>> pc.field("a")
    <pyarrow.compute.Expression a>
    >>> pc.field(1)
    <pyarrow.compute.Expression FieldPath(1)>
    >>> pc.field(("a", "b"))
    <pyarrow.compute.Expression FieldRef.Nested(FieldRef.Name(a) ...
    >>> pc.field("a", "b")
    <pyarrow.compute.Expression FieldRef.Nested(FieldRef.Name(a) ...
    """
    # Anything other than exactly one argument is treated as a nested
    # reference built from all the arguments.
    if len(name_or_index) != 1:
        return Expression._nested_field(name_or_index)

    reference = name_or_index[0]
    if isinstance(reference, (str, int)):
        return Expression._field(reference)
    if isinstance(reference, tuple):
        return Expression._nested_field(reference)
    raise TypeError(
        "field reference should be str, multiple str, tuple or "
        f"integer, got {type(reference)}"
    )
def scalar(value):
    """Expression representing a scalar value.

    Parameters
    ----------
    value : bool, int, float or string
        Python value of the scalar. Note that only a subset of types are
        currently supported.

    Returns
    -------
    scalar_expr : Expression
        An Expression representing the scalar value
    """
    scalar_expr = Expression._scalar(value)
    return scalar_expr