Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/interchange/utils.py: 50%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

70 statements  

1""" 

2Utility functions and objects for implementing the interchange API. 

3""" 

4 

5from __future__ import annotations 

6 

7import typing 

8 

9import numpy as np 

10 

11from pandas._libs import lib 

12 

13from pandas.core.dtypes.dtypes import ( 

14 ArrowDtype, 

15 CategoricalDtype, 

16 DatetimeTZDtype, 

17) 

18 

19import pandas as pd 

20 

21if typing.TYPE_CHECKING: 

22 from pandas._typing import DtypeObj 

23 

24 

# Maps str(pyarrow.DataType) to the matching Arrow C data interface format
# string. Currently, no pyarrow API for this.
#
# Integer formats follow the Arrow convention that the lowercase letter is the
# signed type and the uppercase letter is its unsigned counterpart.
PYARROW_CTYPES = {
    "null": "n",
    "bool": "b",
    "uint8": "C",
    "uint16": "S",
    "uint32": "I",
    "uint64": "L",
    "int8": "c",
    # Signed 16-bit is lowercase "s"; uppercase "S" would describe uint16.
    "int16": "s",
    "int32": "i",
    "int64": "l",
    "halffloat": "e",  # float16
    "float": "f",  # float32
    "double": "g",  # float64
    "string": "u",
    "large_string": "U",
    "binary": "z",
    "time32[s]": "tts",
    "time32[ms]": "ttm",
    "time64[us]": "ttu",
    "time64[ns]": "ttn",
    "date32[day]": "tdD",
    "date64[ms]": "tdm",
    # Timezone-naive timestamps: the timezone part after ":" is left empty.
    "timestamp[s]": "tss:",
    "timestamp[ms]": "tsm:",
    "timestamp[us]": "tsu:",
    "timestamp[ns]": "tsn:",
    "duration[s]": "tDs",
    "duration[ms]": "tDm",
    "duration[us]": "tDu",
    "duration[ns]": "tDn",
}

59 

60 

class ArrowCTypes:
    """
    Namespace of Apache Arrow C type format strings.

    Every attribute is a plain ``str`` class constant. The format strings are
    specified by the Arrow C data interface:
    https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings

    ``TIMESTAMP`` and ``TIME`` are ``str.format`` templates rather than
    finished format strings; fill in their placeholders before use.
    """

    NULL = "n"
    BOOL = "b"

    # Integers, listed as (signed, unsigned) for each bit width.
    INT8, UINT8 = "c", "C"
    INT16, UINT16 = "s", "S"
    INT32, UINT32 = "i", "I"
    INT64, UINT64 = "l", "L"

    # Floating point.
    FLOAT16 = "e"
    FLOAT32 = "f"
    FLOAT64 = "g"

    # Strings.
    STRING = "u"  # utf-8
    LARGE_STRING = "U"  # utf-8

    # Dates.
    DATE32 = "tdD"
    DATE64 = "tdm"

    # Templates; the `resolution` placeholder takes one character:
    # - seconds -> 's'
    # - milliseconds -> 'm'
    # - microseconds -> 'u'
    # - nanoseconds -> 'n'
    TIMESTAMP = "ts{resolution}:{tz}"
    TIME = "tt{resolution}"

93 

94 

class Endianness:
    """
    Single-character markers for the byte order of a data type.

    Plain string class constants; this is a namespace, not an ``enum.Enum``.
    """

    LITTLE = "<"  # little-endian
    BIG = ">"  # big-endian
    NATIVE = "="  # native byte order
    NA = "|"  # presumably "byte order not applicable" -- confirm against callers

102 

103 

def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str:
    """
    Translate a pandas `dtype` into its Apache Arrow C format string.

    Parameters
    ----------
    dtype : DtypeObj
        NumPy dtype or pandas extension dtype to translate.

    Returns
    -------
    str
        Format string in Apache Arrow C notation of the given `dtype`.

    Raises
    ------
    NotImplementedError
        If `dtype` matches none of the cases handled below.
    """
    # Cases decided purely by the kind of dtype.
    if isinstance(dtype, CategoricalDtype):
        return ArrowCTypes.INT64
    if dtype == np.dtype("O"):
        return ArrowCTypes.STRING

    if isinstance(dtype, ArrowDtype):
        # Imported lazily so that pyarrow is only needed for Arrow-backed data.
        import pyarrow as pa

        arrow_type = dtype.pyarrow_dtype
        # Parametrised types cannot live in a static table; build them here.
        if pa.types.is_decimal(arrow_type):
            return f"d:{arrow_type.precision},{arrow_type.scale}"
        if pa.types.is_timestamp(arrow_type) and arrow_type.tz is not None:
            return f"ts{arrow_type.unit[0]}:{arrow_type.tz}"
        table_hit = PYARROW_CTYPES.get(str(arrow_type))
        if table_hit is not None:
            return table_hit
        # Unknown pyarrow type: fall through to the generic handling below.

    # Look for an ArrowCTypes constant named after the upper-cased dtype name.
    named_hit = getattr(ArrowCTypes, dtype.name.upper(), None)
    if named_hit is not None:
        return named_hit

    if lib.is_np_dtype(dtype, "M"):
        # Only the first character of the unit is used:
        # dtype.str -> '<M8[ns]' -> unit 'ns' -> 'n'
        unit = np.datetime_data(dtype)[0]
        return ArrowCTypes.TIMESTAMP.format(resolution=unit[0], tz="")
    if isinstance(dtype, DatetimeTZDtype):
        return ArrowCTypes.TIMESTAMP.format(resolution=dtype.unit[0], tz=dtype.tz)
    if isinstance(dtype, pd.BooleanDtype):
        return ArrowCTypes.BOOL

    raise NotImplementedError(
        f"Conversion of {dtype} to Arrow C format string is not implemented."
    )

153 

154 

def maybe_rechunk(series: pd.Series, *, allow_copy: bool) -> pd.Series | None:
    """
    Collapse a multi-chunk pyarrow-backed Series into a single chunk, if needed.

    Parameters
    ----------
    series : pd.Series
        Series to inspect.
    allow_copy : bool
        Whether combining the chunks into a new array is permitted.

    Returns
    -------
    pd.Series or None
        ``None`` if `series` does not have an ``ArrowDtype`` or its backing
        pyarrow array consists of exactly one chunk (nothing to do).
        Otherwise a new Series with the same dtype, name and index, built
        from the combined chunks.

    Raises
    ------
    RuntimeError
        If the chunks would have to be combined but `allow_copy` is ``False``.
    """
    if not isinstance(series.dtype, pd.ArrowDtype):
        return None

    backing = series.array._pa_array  # type: ignore[attr-defined]
    if len(backing.chunks) == 1:
        return None

    if allow_copy:
        return pd.Series(
            backing.combine_chunks(),
            dtype=series.dtype,
            name=series.name,
            index=series.index,
        )

    raise RuntimeError(
        "Found multi-chunk pyarrow array, but `allow_copy` is False. "
        "Please rechunk the array before calling this function, or set "
        "`allow_copy=True`."
    )