1from __future__ import annotations
2
3import warnings
4
5import numpy as np
6import pyarrow
7
8from pandas.errors import PerformanceWarning
9from pandas.util._exceptions import find_stack_level
10
11
12def fallback_performancewarning(version: str | None = None) -> None:
13 """
14 Raise a PerformanceWarning for falling back to ExtensionArray's
15 non-pyarrow method
16 """
17 msg = "Falling back on a non-pyarrow code path which may decrease performance."
18 if version is not None:
19 msg += f" Upgrade to pyarrow >={version} to possibly suppress this warning."
20 warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level())
21
22
def pyarrow_array_to_numpy_and_mask(
    arr, dtype: np.dtype
) -> tuple[np.ndarray, np.ndarray]:
    """
    Build a numpy data array and a boolean validity mask from the buffers
    of a primitive pyarrow.Array.

    pyarrow.BooleanArray is currently not supported.

    Parameters
    ----------
    arr : pyarrow.Array
    dtype : numpy.dtype

    Returns
    -------
    (data, mask)
        Tuple of two numpy arrays: the raw data (with the specified dtype)
        and a boolean mask (validity mask, so False means missing)
    """
    dtype = np.dtype(dtype)

    is_all_null = pyarrow.types.is_null(arr.type)
    n_values = len(arr)

    if is_all_null:
        # Every entry is missing, so the data contents never matter and can
        # stay uninitialized; the validity mask is all False.
        return np.empty(n_values, dtype=dtype), np.zeros(n_values, dtype=bool)

    buffers = arr.buffers()

    # Arrow buffers may carry padding and the array may start at an offset
    # into them, so cut out exactly the relevant byte range before passing
    # it to numpy.
    # See also https://github.com/pandas-dev/pandas/issues/40896
    start = arr.offset * dtype.itemsize
    stop = start + n_values * dtype.itemsize
    data = np.frombuffer(buffers[1][start:stop], dtype=dtype)

    validity = buffers[0]
    if validity is None:
        # No validity bitmap present: all values are treated as valid.
        return data, np.ones(n_values, dtype=bool)

    # Reinterpret the validity bitmap as a BooleanArray (honouring the same
    # offset as the data) and convert that to a numpy bool array.
    validity_arr = pyarrow.BooleanArray.from_buffers(
        pyarrow.bool_(), n_values, [None, validity], offset=arr.offset
    )
    return data, np.asarray(validity_arr)