1"""
2Utility functions related to concat.
3"""
4from __future__ import annotations
5
6from typing import TYPE_CHECKING
7
8import numpy as np
9
10from pandas._typing import AxisInt
11
12from pandas.core.dtypes.astype import astype_array
13from pandas.core.dtypes.cast import (
14 common_dtype_categorical_compat,
15 find_common_type,
16 np_find_common_type,
17)
18from pandas.core.dtypes.common import is_dtype_equal
19from pandas.core.dtypes.dtypes import (
20 DatetimeTZDtype,
21 ExtensionDtype,
22)
23from pandas.core.dtypes.generic import (
24 ABCCategoricalIndex,
25 ABCExtensionArray,
26 ABCSeries,
27)
28
29if TYPE_CHECKING:
30 from pandas.core.arrays import Categorical
31
32
def concat_compat(to_concat, axis: AxisInt = 0, ea_compat_axis: bool = False):
    """
    Concatenate arrays, preserving the combined dtype where possible.

    Each input is expected to have a single 'normalized' dtype (for example,
    an object-dtype array is expected to be non-datetimelike).

    Parameters
    ----------
    to_concat : array of arrays
    axis : int, default 0
        Axis along which to concatenate.
    ea_compat_axis : bool, default False
        For ExtensionArray compat, behave as if axis == 1 when determining
        whether to drop empty arrays.

    Returns
    -------
    a single array, preserving the combined dtypes

    Notes
    -----
    When any input has an ExtensionDtype, ``axis`` is ignored and the
    concatenation is done along axis 0.
    """

    # filter empty arrays
    # 1-d dtypes always are included here
    def is_nonempty(x) -> bool:
        if x.ndim <= axis:
            return True
        return x.shape[axis] > 0

    # If all arrays are empty, there's nothing to convert, just short-cut to
    # the concatenation, #3121.
    #
    # Creating an empty array directly is tempting, but the winnings would be
    # marginal given that it would still require shape & dtype calculation and
    # np.concatenate which has them both implemented is compiled.
    non_empties = [x for x in to_concat if is_nonempty(x)]
    if non_empties and axis == 0 and not ea_compat_axis:
        # ea_compat_axis see GH#39574
        to_concat = non_empties

    dtypes = {obj.dtype for obj in to_concat}
    kinds = {obj.dtype.kind for obj in to_concat}

    # Datetime-like numpy/tz dtypes and 2D ExtensionArrays are delegated to
    # the datetime-specific concatenation.
    contains_datetime = any(
        isinstance(dtype, (np.dtype, DatetimeTZDtype)) and dtype.kind in ["m", "M"]
        for dtype in dtypes
    ) or any(isinstance(obj, ABCExtensionArray) and obj.ndim > 1 for obj in to_concat)
    if contains_datetime:
        return _concat_datetime(to_concat, axis=axis)

    if any(isinstance(x.dtype, ExtensionDtype) for x in to_concat):
        # we ignore axis here, as internally concatting with EAs is always
        # for axis=0
        if len(dtypes) != 1:
            # cast every input to one common dtype before concatenating
            target_dtype = find_common_type([x.dtype for x in to_concat])
            target_dtype = common_dtype_categorical_compat(to_concat, target_dtype)
            to_concat = [
                astype_array(arr, target_dtype, copy=False) for arr in to_concat
            ]

        if isinstance(to_concat[0], ABCExtensionArray):
            # TODO: what about EA-backed Index?
            return type(to_concat[0])._concat_same_type(to_concat)
        return np.concatenate(to_concat)

    if not non_empties and len(kinds) != 1:
        # we have all empties, but may need to coerce the result dtype to
        # object if we have non-numeric type operands (numpy would otherwise
        # cast this to float)
        numpy_can_coerce = kinds <= {"i", "u", "f"} or kinds <= {"b", "i", "u"}
        if not numpy_can_coerce:
            # coerce to object
            to_concat = [x.astype("object") for x in to_concat]
            kinds = {"o"}

    result = np.concatenate(to_concat, axis=axis)
    if "b" in kinds and result.dtype.kind in ["i", "u", "f"]:
        # GH#39817 cast to object instead of casting bools to numeric
        result = result.astype(object, copy=False)
    return result
122
123
def union_categoricals(
    to_union, sort_categories: bool = False, ignore_order: bool = False
) -> Categorical:
    """
    Combine list-like of Categorical-like, unioning categories.

    All categories must have the same dtype.

    Parameters
    ----------
    to_union : list-like
        Categorical, CategoricalIndex, or Series with dtype='category'.
    sort_categories : bool, default False
        If true, resulting categories will be lexsorted, otherwise
        they will be ordered as they appear in the data.
    ignore_order : bool, default False
        If true, the ordered attribute of the Categoricals will be ignored.
        Results in an unordered categorical.

    Returns
    -------
    Categorical

    Raises
    ------
    TypeError
        - any input is not a Categorical, CategoricalIndex, or Series
          holding categorical data
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
    ValueError
        Empty list of categoricals passed

    Notes
    -----
    To learn more about categories, see `link
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html#unioning>`__

    Examples
    --------
    If you want to combine categoricals that do not necessarily have
    the same categories, `union_categoricals` will combine a list-like
    of categoricals. The new categories will be the union of the
    categories being combined.

    >>> a = pd.Categorical(["b", "c"])
    >>> b = pd.Categorical(["a", "b"])
    >>> pd.api.types.union_categoricals([a, b])
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['b', 'c', 'a']

    By default, the resulting categories will be ordered as they appear
    in the `categories` of the data. If you want the categories to be
    lexsorted, use `sort_categories=True` argument.

    >>> pd.api.types.union_categoricals([a, b], sort_categories=True)
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['a', 'b', 'c']

    `union_categoricals` also works with the case of combining two
    categoricals of the same categories and order information (e.g. what
    you could also `append` for).

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "a"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b])
    ['a', 'b', 'a', 'b', 'a']
    Categories (2, object): ['a' < 'b']

    Raises `TypeError` because the categories are ordered and not identical.

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b])
    Traceback (most recent call last):
        ...
    TypeError: to union ordered Categoricals, all categories must be the same

    Ordered categoricals with different categories or orderings can be
    combined by using the `ignore_order=True` argument (new in version
    0.20.0).

    >>> a = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> b = pd.Categorical(["c", "b", "a"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b], ignore_order=True)
    ['a', 'b', 'c', 'c', 'b', 'a']
    Categories (3, object): ['a', 'b', 'c']

    `union_categoricals` also works with a `CategoricalIndex`, or `Series`
    containing categorical data, but note that the resulting array will
    always be a plain `Categorical`

    >>> a = pd.Series(["b", "c"], dtype='category')
    >>> b = pd.Series(["a", "b"], dtype='category')
    >>> pd.api.types.union_categoricals([a, b])
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['b', 'c', 'a']
    """
    from pandas import Categorical
    from pandas.core.arrays.categorical import recode_for_categories

    if len(to_union) == 0:
        raise ValueError("No Categoricals to union")

    def _maybe_unwrap(x):
        # Unwrap first, then validate: a Series may hold non-categorical
        # values, which must be rejected with the documented TypeError
        # instead of failing later on a missing ``.categories`` attribute.
        if isinstance(x, (ABCCategoricalIndex, ABCSeries)):
            x = x._values
        if isinstance(x, Categorical):
            return x
        raise TypeError("all components to combine must be Categorical")

    to_union = [_maybe_unwrap(x) for x in to_union]
    first = to_union[0]

    if not all(
        is_dtype_equal(other.categories.dtype, first.categories.dtype)
        for other in to_union[1:]
    ):
        raise TypeError("dtype of categories must be the same")

    ordered = False
    if all(first._categories_match_up_to_permutation(other) for other in to_union[1:]):
        # identical categories - fastpath
        categories = first.categories
        ordered = first.ordered

        # re-encode every input against the first input's categories so the
        # codes can be concatenated directly
        all_codes = [first._encode_with_my_categories(x)._codes for x in to_union]
        new_codes = np.concatenate(all_codes)

        if sort_categories and not ignore_order and ordered:
            raise TypeError("Cannot use sort_categories=True with ordered Categoricals")

        if sort_categories and not categories.is_monotonic_increasing:
            categories = categories.sort_values()
            # position of each original category within the sorted categories
            indexer = categories.get_indexer(first.categories)

            from pandas.core.algorithms import take_nd

            # remap the codes to the sorted categories; -1 is used as the
            # fill value
            new_codes = take_nd(indexer, new_codes, fill_value=-1)
    elif ignore_order or all(not c.ordered for c in to_union):
        # different categories - union and recode
        cats = first.categories.append([c.categories for c in to_union[1:]])
        categories = cats.unique()
        if sort_categories:
            categories = categories.sort_values()

        new_codes = [
            recode_for_categories(c.codes, c.categories, categories) for c in to_union
        ]
        new_codes = np.concatenate(new_codes)
    else:
        # ordered - to show a proper error message
        if all(c.ordered for c in to_union):
            msg = "to union ordered Categoricals, all categories must be the same"
            raise TypeError(msg)
        raise TypeError("Categorical.ordered must be the same")

    if ignore_order:
        ordered = False

    return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True)
287
288
289def _concatenate_2d(to_concat, axis: AxisInt):
290 # coerce to 2d if needed & concatenate
291 if axis == 1:
292 to_concat = [np.atleast_2d(x) for x in to_concat]
293 return np.concatenate(to_concat, axis=axis)
294
295
296def _concat_datetime(to_concat, axis: AxisInt = 0):
297 """
298 provide concatenation of an datetimelike array of arrays each of which is a
299 single M8[ns], datetime64[ns, tz] or m8[ns] dtype
300
301 Parameters
302 ----------
303 to_concat : array of arrays
304 axis : axis to provide concatenation
305
306 Returns
307 -------
308 a single array, preserving the combined dtypes
309 """
310 from pandas.core.construction import ensure_wrapped_if_datetimelike
311
312 to_concat = [ensure_wrapped_if_datetimelike(x) for x in to_concat]
313
314 single_dtype = len({x.dtype for x in to_concat}) == 1
315
316 # multiple types, need to coerce to object
317 if not single_dtype:
318 # ensure_wrapped_if_datetimelike ensures that astype(object) wraps
319 # in Timestamp/Timedelta
320 return _concatenate_2d([x.astype(object) for x in to_concat], axis=axis)
321
322 result = type(to_concat[0])._concat_same_type(to_concat, axis=axis)
323 return result