Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/interchange/utils.py: 50%

1"""

2Utility functions and objects for implementing the interchange API.

3"""

5from __future__ import annotations

7import typing

9import numpy as np

11from pandas._libs import lib

13from pandas.core.dtypes.dtypes import (

14 ArrowDtype,

15 CategoricalDtype,

16 DatetimeTZDtype,

17)

19import pandas as pd

21if typing.TYPE_CHECKING:

22 from pandas._typing import DtypeObj

25# Maps str(pyarrow.DataType) = C type format string

26# Currently, no pyarrow API for this

27PYARROW_CTYPES = {

28 "null": "n",

29 "bool": "b",

30 "uint8": "C",

31 "uint16": "S",

32 "uint32": "I",

33 "uint64": "L",

34 "int8": "c",

35 "int16": "S",

36 "int32": "i",

37 "int64": "l",

38 "halffloat": "e", # float16

39 "float": "f", # float32

40 "double": "g", # float64

41 "string": "u",

42 "large_string": "U",

43 "binary": "z",

44 "time32[s]": "tts",

45 "time32[ms]": "ttm",

46 "time64[us]": "ttu",

47 "time64[ns]": "ttn",

48 "date32[day]": "tdD",

49 "date64[ms]": "tdm",

50 "timestamp[s]": "tss:",

51 "timestamp[ms]": "tsm:",

52 "timestamp[us]": "tsu:",

53 "timestamp[ns]": "tsn:",

54 "duration[s]": "tDs",

55 "duration[ms]": "tDm",

56 "duration[us]": "tDu",

57 "duration[ns]": "tDn",

58}

class ArrowCTypes:
    """
    Namespace of Apache Arrow C data interface format strings.

    Each attribute is the format string of one Arrow data type, as listed in
    https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings

    This is a plain class holding string constants. ``dtype_to_arrow_c_fmt``
    in this module looks attributes up by upper-cased dtype name, so the
    attribute names are part of the contract.
    """

    NULL = "n"
    BOOL = "b"

    # Integers: lower-case letter is signed, upper-case is unsigned.
    INT8, UINT8 = "c", "C"
    INT16, UINT16 = "s", "S"
    INT32, UINT32 = "i", "I"
    INT64, UINT64 = "l", "L"

    # Floating point.
    FLOAT16 = "e"
    FLOAT32 = "f"
    FLOAT64 = "g"

    # Strings (both utf-8).
    STRING = "u"
    LARGE_STRING = "U"

    # Dates.
    DATE32 = "tdD"
    DATE64 = "tdm"

    # Templates to be filled in with ``str.format``.
    # ``resolution`` is a single character:
    #   - seconds      -> 's'
    #   - milliseconds -> 'm'
    #   - microseconds -> 'u'
    #   - nanoseconds  -> 'n'
    TIMESTAMP = "ts{resolution}:{tz}"
    TIME = "tt{resolution}"

class Endianness:
    """
    Single-character markers describing the byte order of a data type.

    This is a plain class holding string constants, not an ``enum.Enum``.
    """

    LITTLE, BIG = "<", ">"
    NATIVE = "="
    NA = "|"

102

103

104def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str:

105 """

106 Represent pandas `dtype` as a format string in Apache Arrow C notation.

107

108 Parameters

109 ----------

110 dtype : np.dtype

111 Datatype of pandas DataFrame to represent.

112

113 Returns

114 -------

115 str

116 Format string in Apache Arrow C notation of the given `dtype`.

117 """

118 if isinstance(dtype, CategoricalDtype):

119 return ArrowCTypes.INT64

120 elif dtype == np.dtype("O"):

121 return ArrowCTypes.STRING

122 elif isinstance(dtype, ArrowDtype):

123 import pyarrow as pa

124

125 pa_type = dtype.pyarrow_dtype

126 if pa.types.is_decimal(pa_type):

127 return f"d:{pa_type.precision},{pa_type.scale}"

128 elif pa.types.is_timestamp(pa_type) and pa_type.tz is not None:

129 return f"ts{pa_type.unit[0]}:{pa_type.tz}"

130 format_str = PYARROW_CTYPES.get(str(pa_type), None)

131 if format_str is not None:

132 return format_str

133

134 format_str = getattr(ArrowCTypes, dtype.name.upper(), None)

135 if format_str is not None:

136 return format_str

137

138 if lib.is_np_dtype(dtype, "M"):

139 # Selecting the first char of resolution string:

140 # dtype.str -> '<M8[ns]' -> 'n'

141 resolution = np.datetime_data(dtype)[0][0]

142 return ArrowCTypes.TIMESTAMP.format(resolution=resolution, tz="")

143

144 elif isinstance(dtype, DatetimeTZDtype):

145 return ArrowCTypes.TIMESTAMP.format(resolution=dtype.unit[0], tz=dtype.tz)

146

147 elif isinstance(dtype, pd.BooleanDtype):

148 return ArrowCTypes.BOOL

149

150 raise NotImplementedError(

151 f"Conversion of {dtype} to Arrow C format string is not implemented."

152 )

153

154

def maybe_rechunk(series: pd.Series, *, allow_copy: bool) -> pd.Series | None:
    """
    Collapse the pyarrow chunks backing `series` into one chunk when needed.

    Parameters
    ----------
    series : pd.Series
        Series to inspect.
    allow_copy : bool
        Whether combining chunks (which produces a new array) is permitted.

    Returns
    -------
    pd.Series or None
        ``None`` when nothing has to be done, i.e. `series` does not have an
        ``ArrowDtype`` or its pyarrow array consists of exactly one chunk.
        Otherwise a new Series with the same dtype, name and index, built
        from the combined chunks.

    Raises
    ------
    RuntimeError
        If rechunking is required but `allow_copy` is ``False``.
    """
    if not isinstance(series.dtype, pd.ArrowDtype):
        return None

    pa_chunked = series.array._pa_array  # type: ignore[attr-defined]
    already_single_chunk = len(pa_chunked.chunks) == 1
    if already_single_chunk:
        return None

    if allow_copy:
        combined = pa_chunked.combine_chunks()
        return pd.Series(
            combined, dtype=series.dtype, name=series.name, index=series.index
        )

    raise RuntimeError(
        "Found multi-chunk pyarrow array, but `allow_copy` is False. "
        "Please rechunk the array before calling this function, or set "
        "`allow_copy=True`."
    )

178 return pd.Series(arr, dtype=series.dtype, name=series.name, index=series.index)