Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/arrays/arrow/_arrow_utils.py: 29%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

28 statements  

1from __future__ import annotations 

2 

3import warnings 

4 

5import numpy as np 

6import pyarrow 

7 

8from pandas.errors import PerformanceWarning 

9from pandas.util._exceptions import find_stack_level 

10 

11 

def fallback_performancewarning(version: str | None = None) -> None:
    """
    Emit a PerformanceWarning signalling use of a non-pyarrow code path.

    Parameters
    ----------
    version : str, optional
        pyarrow version named in the upgrade hint. When it is not None, a
        sentence suggesting an upgrade to ``pyarrow >=version`` is appended
        to the warning message.
    """
    pieces = [
        "Falling back on a non-pyarrow code path which may decrease performance."
    ]
    if version is not None:
        pieces.append(
            f" Upgrade to pyarrow >={version} to possibly suppress this warning."
        )
    # find_stack_level() is evaluated here, in this function's own frame, so the
    # computed stack level is relative to this call site.
    warnings.warn(
        "".join(pieces), PerformanceWarning, stacklevel=find_stack_level()
    )

21 

22 

def pyarrow_array_to_numpy_and_mask(
    arr, dtype: np.dtype
) -> tuple[np.ndarray, np.ndarray]:
    """
    Build a numpy data array and a boolean validity mask from the buffers
    of a primitive pyarrow.Array.

    At the moment pyarrow.BooleanArray is not supported.

    Parameters
    ----------
    arr : pyarrow.Array
    dtype : numpy.dtype
        Passed through ``np.dtype``; used to interpret the data buffer and to
        compute byte offsets from its ``itemsize``.

    Returns
    -------
    (data, mask)
        Tuple of two numpy arrays: the raw data (with the specified dtype)
        and a boolean validity mask, where False means missing.
    """
    dtype = np.dtype(dtype)
    n_values = len(arr)

    if pyarrow.types.is_null(arr.type):
        # Everything is null, so the data values are never meaningful and can
        # stay uninitialized; the mask marks every entry as missing.
        return np.empty(n_values, dtype=dtype), np.zeros(n_values, dtype=bool)

    buffers = arr.buffers()
    elem_offset = arr.offset

    # Arrow buffers may carry padding and the array may start at an offset,
    # so slice out exactly the bytes of this array before handing them to numpy.
    # See also https://github.com/pandas-dev/pandas/issues/40896
    itemsize = dtype.itemsize
    start = elem_offset * itemsize
    stop = start + n_values * itemsize
    data = np.frombuffer(buffers[1][start:stop], dtype=dtype)

    validity = buffers[0]
    if validity is None:
        # No validity bitmap present: every value is treated as valid.
        return data, np.ones(n_values, dtype=bool)

    # Reinterpret the validity bitmap as the data buffer of a BooleanArray so
    # that pyarrow unpacks the bits (honouring the element offset) for us.
    unpacked = pyarrow.BooleanArray.from_buffers(
        pyarrow.bool_(), n_values, [None, validity], offset=elem_offset
    )
    return data, np.asarray(unpacked)

66 return data, mask