1"""
2Utility functions and objects for implementing the interchange API.
3"""
4
5from __future__ import annotations
6
7import typing
8
9import numpy as np
10
11from pandas._libs import lib
12
13from pandas.core.dtypes.dtypes import (
14 ArrowDtype,
15 CategoricalDtype,
16 DatetimeTZDtype,
17)
18
19import pandas as pd
20
21if typing.TYPE_CHECKING:
22 from pandas._typing import DtypeObj
23
24
# Maps str(pyarrow.DataType) -> Arrow C data interface format string.
# Currently, there is no pyarrow API for this lookup.
# Integer codes are case-sensitive: lowercase is signed, uppercase is unsigned.
PYARROW_CTYPES = {
    "null": "n",
    "bool": "b",
    "uint8": "C",
    "uint16": "S",
    "uint32": "I",
    "uint64": "L",
    "int8": "c",
    "int16": "s",  # must be lowercase; "S" is uint16
    "int32": "i",
    "int64": "l",
    "halffloat": "e",  # float16
    "float": "f",  # float32
    "double": "g",  # float64
    "string": "u",
    "large_string": "U",
    "binary": "z",
    "time32[s]": "tts",
    "time32[ms]": "ttm",
    "time64[us]": "ttu",
    "time64[ns]": "ttn",
    "date32[day]": "tdD",
    "date64[ms]": "tdm",
    # Timezone-naive timestamps: the timezone part after ":" is left empty.
    "timestamp[s]": "tss:",
    "timestamp[ms]": "tsm:",
    "timestamp[us]": "tsu:",
    "timestamp[ns]": "tsn:",
    "duration[s]": "tDs",
    "duration[ms]": "tDm",
    "duration[us]": "tDu",
    "duration[ns]": "tDn",
}
59
60
class ArrowCTypes:
    """
    Namespace of Apache Arrow C data interface format strings.

    Most attributes are ready-to-use format strings. ``TIMESTAMP`` and
    ``TIME`` are ``str.format`` templates whose placeholders must be filled
    in before use.

    Format string reference:
    https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings
    """

    NULL, BOOL = "n", "b"

    # Integers: lowercase code is signed, uppercase code is unsigned.
    INT8, UINT8 = "c", "C"
    INT16, UINT16 = "s", "S"
    INT32, UINT32 = "i", "I"
    INT64, UINT64 = "l", "L"

    # Floating point, by increasing width.
    FLOAT16, FLOAT32, FLOAT64 = "e", "f", "g"

    # Both string flavours are utf-8.
    STRING, LARGE_STRING = "u", "U"

    DATE32, DATE64 = "tdD", "tdm"

    # Templates. ``resolution`` is a single character:
    #   's' -> seconds
    #   'm' -> milliseconds
    #   'u' -> microseconds
    #   'n' -> nanoseconds
    TIMESTAMP = "ts{resolution}:{tz}"
    TIME = "tt{resolution}"
93
94
class Endianness:
    """
    Single-character codes naming the byte-order of a data-type.

    The attributes are plain strings, so they compare equal to the raw
    characters they hold.
    """

    LITTLE, BIG = "<", ">"
    NATIVE, NA = "=", "|"
102
103
def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str:
    """
    Represent pandas `dtype` as a format string in Apache Arrow C notation.

    Parameters
    ----------
    dtype : np.dtype or ExtensionDtype
        Datatype of pandas DataFrame to represent.

    Returns
    -------
    str
        Format string in Apache Arrow C notation of the given `dtype`.

    Raises
    ------
    NotImplementedError
        If none of the conversions below applies to `dtype`.
    """
    if isinstance(dtype, CategoricalDtype):
        # Categoricals are always reported as int64.
        return ArrowCTypes.INT64
    elif dtype == np.dtype("O"):
        # Object dtype is reported as a utf-8 string.
        return ArrowCTypes.STRING
    elif isinstance(dtype, ArrowDtype):
        import pyarrow as pa

        pa_type = dtype.pyarrow_dtype
        if pa.types.is_decimal(pa_type):
            format_str = f"d:{pa_type.precision},{pa_type.scale}"
            # The two-field form denotes decimal128 only; any other width
            # must be spelled out as a third field ("d:19,10,256").
            if pa_type.bit_width != 128:
                format_str += f",{pa_type.bit_width}"
            return format_str
        elif pa.types.is_timestamp(pa_type) and pa_type.tz is not None:
            # Timezone-aware timestamps are not in the lookup table because
            # the timezone is part of the type's string form.
            return f"ts{pa_type.unit[0]}:{pa_type.tz}"
        format_str = PYARROW_CTYPES.get(str(pa_type), None)
        if format_str is not None:
            return format_str
        # Unknown pyarrow types fall through to the generic handling below.

    # Look the dtype up by upper-cased name among the ArrowCTypes constants.
    format_str = getattr(ArrowCTypes, dtype.name.upper(), None)
    if format_str is not None:
        return format_str

    if lib.is_np_dtype(dtype, "M"):
        # Selecting the first char of resolution string:
        # dtype.str -> '<M8[ns]' -> 'n'
        resolution = np.datetime_data(dtype)[0][0]
        return ArrowCTypes.TIMESTAMP.format(resolution=resolution, tz="")

    elif isinstance(dtype, DatetimeTZDtype):
        return ArrowCTypes.TIMESTAMP.format(resolution=dtype.unit[0], tz=dtype.tz)

    elif isinstance(dtype, pd.BooleanDtype):
        return ArrowCTypes.BOOL

    raise NotImplementedError(
        f"Conversion of {dtype} to Arrow C format string is not implemented."
    )
153
154
def maybe_rechunk(series: pd.Series, *, allow_copy: bool) -> pd.Series | None:
    """
    Collapse a multi-chunk pyarrow-backed Series into a single-chunk one.

    Parameters
    ----------
    series : pd.Series
        Series to inspect.
    allow_copy : bool
        Whether combining the chunks is permitted.

    Returns
    -------
    pd.Series or None
        ``None`` when `series` does not have an ``ArrowDtype`` or its backing
        array already consists of exactly one chunk, so nothing needs to be
        done. Otherwise a new Series with the same dtype, name and index,
        built from the combined chunks.

    Raises
    ------
    RuntimeError
        If rechunking is required but `allow_copy` is ``False``.
    """
    is_arrow_backed = isinstance(series.dtype, pd.ArrowDtype)
    if not is_arrow_backed:
        return None

    pa_chunked = series.array._pa_array  # type: ignore[attr-defined]
    n_chunks = len(pa_chunked.chunks)
    if n_chunks == 1:
        # Already a single chunk.
        return None

    if allow_copy:
        combined = pa_chunked.combine_chunks()
        return pd.Series(
            combined,
            dtype=series.dtype,
            name=series.name,
            index=series.index,
        )

    raise RuntimeError(
        "Found multi-chunk pyarrow array, but `allow_copy` is False. "
        "Please rechunk the array before calling this function, or set "
        "`allow_copy=True`."
    )