1"""
2Utility functions related to concat.
3"""
4from __future__ import annotations
5
6from typing import (
7 TYPE_CHECKING,
8 cast,
9)
10import warnings
11
12import numpy as np
13
14from pandas._libs import lib
15from pandas.util._exceptions import find_stack_level
16
17from pandas.core.dtypes.astype import astype_array
18from pandas.core.dtypes.cast import (
19 common_dtype_categorical_compat,
20 find_common_type,
21 np_find_common_type,
22)
23from pandas.core.dtypes.dtypes import CategoricalDtype
24from pandas.core.dtypes.generic import (
25 ABCCategoricalIndex,
26 ABCSeries,
27)
28
29if TYPE_CHECKING:
30 from collections.abc import Sequence
31
32 from pandas._typing import (
33 ArrayLike,
34 AxisInt,
35 DtypeObj,
36 )
37
38 from pandas.core.arrays import (
39 Categorical,
40 ExtensionArray,
41 )
42
43
44def _is_nonempty(x, axis) -> bool:
45 # filter empty arrays
46 # 1-d dtypes always are included here
47 if x.ndim <= axis:
48 return True
49 return x.shape[axis] > 0
50
51
def concat_compat(
    to_concat: Sequence[ArrayLike], axis: AxisInt = 0, ea_compat_axis: bool = False
) -> ArrayLike:
    """
    provide concatenation of an array of arrays each of which is a single
    'normalized' dtypes (in that for example, if it's object, then it is a
    non-datetimelike and provide a combined dtype for the resulting array that
    preserves the overall dtype if possible)

    Parameters
    ----------
    to_concat : sequence of arrays
    axis : axis to provide concatenation
    ea_compat_axis : bool, default False
        For ExtensionArray compat, behave as if axis == 1 when determining
        whether to drop empty arrays.

    Returns
    -------
    a single array, preserving the combined dtypes

    Notes
    -----
    Excluding empty entries when determining the result dtype is deprecated
    (GH#39122); a FutureWarning is emitted when the exclusion changes the
    result dtype.
    """
    # Fastpath: if every input already has the same dtype, no common-dtype
    # computation or casting is needed.
    if len(to_concat) and lib.dtypes_all_equal([obj.dtype for obj in to_concat]):
        # fastpath!
        obj = to_concat[0]
        if isinstance(obj, np.ndarray):
            to_concat_arrs = cast("Sequence[np.ndarray]", to_concat)
            return np.concatenate(to_concat_arrs, axis=axis)

        to_concat_eas = cast("Sequence[ExtensionArray]", to_concat)
        if ea_compat_axis:
            # We have 1D objects, that don't support axis keyword
            return obj._concat_same_type(to_concat_eas)
        elif axis == 0:
            return obj._concat_same_type(to_concat_eas)
        else:
            # e.g. DatetimeArray
            # NB: We are assuming here that ensure_wrapped_if_arraylike has
            # been called where relevant.
            return obj._concat_same_type(
                # error: Unexpected keyword argument "axis" for "_concat_same_type"
                # of "ExtensionArray"
                to_concat_eas,
                axis=axis,  # type: ignore[call-arg]
            )

    # If all arrays are empty, there's nothing to convert, just short-cut to
    # the concatenation, #3121.
    #
    # Creating an empty array directly is tempting, but the winnings would be
    # marginal given that it would still require shape & dtype calculation and
    # np.concatenate which has them both implemented is compiled.
    # Keep a reference to the full input so we can detect (below) whether
    # dropping empties changed the computed result dtype.
    orig = to_concat
    non_empties = [x for x in to_concat if _is_nonempty(x, axis)]
    if non_empties and axis == 0 and not ea_compat_axis:
        # ea_compat_axis see GH#39574
        to_concat = non_empties

    # Determine whether any ExtensionArrays are present, the dtype kinds
    # involved, and the common dtype (None means no casting needed).
    any_ea, kinds, target_dtype = _get_result_dtype(to_concat, non_empties)

    # Deprecation check: if empties were dropped above AND keeping them
    # would have produced a different result dtype, warn (GH#39122).
    if len(to_concat) < len(orig):
        _, _, alt_dtype = _get_result_dtype(orig, non_empties)
        if alt_dtype != target_dtype:
            # GH#39122
            warnings.warn(
                "The behavior of array concatenation with empty entries is "
                "deprecated. In a future version, this will no longer exclude "
                "empty items when determining the result dtype. "
                "To retain the old behavior, exclude the empty entries before "
                "the concat operation.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

    # Cast all pieces to the common dtype before concatenating.
    if target_dtype is not None:
        to_concat = [astype_array(arr, target_dtype, copy=False) for arr in to_concat]

    if not isinstance(to_concat[0], np.ndarray):
        # i.e. isinstance(to_concat[0], ExtensionArray)
        to_concat_eas = cast("Sequence[ExtensionArray]", to_concat)
        cls = type(to_concat[0])
        # GH#53640: eg. for datetime array, axis=1 but 0 is default
        # However, class method `_concat_same_type()` for some classes
        # may not support the `axis` keyword
        if ea_compat_axis or axis == 0:
            return cls._concat_same_type(to_concat_eas)
        else:
            return cls._concat_same_type(
                to_concat_eas,
                axis=axis,  # type: ignore[call-arg]
            )
    else:
        to_concat_arrs = cast("Sequence[np.ndarray]", to_concat)
        result = np.concatenate(to_concat_arrs, axis=axis)

        # Mixing bool with int/uint/float would let numpy cast bools to
        # numeric; preserve the values by casting to object instead.
        if not any_ea and "b" in kinds and result.dtype.kind in "iuf":
            # GH#39817 cast to object instead of casting bools to numeric
            result = result.astype(object, copy=False)
    return result
150
151
152def _get_result_dtype(
153 to_concat: Sequence[ArrayLike], non_empties: Sequence[ArrayLike]
154) -> tuple[bool, set[str], DtypeObj | None]:
155 target_dtype = None
156
157 dtypes = {obj.dtype for obj in to_concat}
158 kinds = {obj.dtype.kind for obj in to_concat}
159
160 any_ea = any(not isinstance(x, np.ndarray) for x in to_concat)
161 if any_ea:
162 # i.e. any ExtensionArrays
163
164 # we ignore axis here, as internally concatting with EAs is always
165 # for axis=0
166 if len(dtypes) != 1:
167 target_dtype = find_common_type([x.dtype for x in to_concat])
168 target_dtype = common_dtype_categorical_compat(to_concat, target_dtype)
169
170 elif not len(non_empties):
171 # we have all empties, but may need to coerce the result dtype to
172 # object if we have non-numeric type operands (numpy would otherwise
173 # cast this to float)
174 if len(kinds) != 1:
175 if not len(kinds - {"i", "u", "f"}) or not len(kinds - {"b", "i", "u"}):
176 # let numpy coerce
177 pass
178 else:
179 # coerce to object
180 target_dtype = np.dtype(object)
181 kinds = {"o"}
182 else:
183 # error: Argument 1 to "np_find_common_type" has incompatible type
184 # "*Set[Union[ExtensionDtype, Any]]"; expected "dtype[Any]"
185 target_dtype = np_find_common_type(*dtypes) # type: ignore[arg-type]
186
187 return any_ea, kinds, target_dtype
188
189
def union_categoricals(
    to_union, sort_categories: bool = False, ignore_order: bool = False
) -> Categorical:
    """
    Combine list-like of Categorical-like, unioning categories.

    All categories must have the same dtype.

    Parameters
    ----------
    to_union : list-like
        Categorical, CategoricalIndex, or Series with dtype='category'.
    sort_categories : bool, default False
        If true, resulting categories will be lexsorted, otherwise
        they will be ordered as they appear in the data.
    ignore_order : bool, default False
        If true, the ordered attribute of the Categoricals will be ignored.
        Results in an unordered categorical.

    Returns
    -------
    Categorical

    Raises
    ------
    TypeError
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
    ValueError
        Empty list of categoricals passed

    Notes
    -----
    To learn more about categories, see `link
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html#unioning>`__

    Examples
    --------
    If you want to combine categoricals that do not necessarily have
    the same categories, `union_categoricals` will combine a list-like
    of categoricals. The new categories will be the union of the
    categories being combined.

    >>> a = pd.Categorical(["b", "c"])
    >>> b = pd.Categorical(["a", "b"])
    >>> pd.api.types.union_categoricals([a, b])
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['b', 'c', 'a']

    By default, the resulting categories will be ordered as they appear
    in the `categories` of the data. If you want the categories to be
    lexsorted, use `sort_categories=True` argument.

    >>> pd.api.types.union_categoricals([a, b], sort_categories=True)
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['a', 'b', 'c']

    `union_categoricals` also works with the case of combining two
    categoricals of the same categories and order information (e.g. what
    you could also `append` for).

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "a"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b])
    ['a', 'b', 'a', 'b', 'a']
    Categories (2, object): ['a' < 'b']

    Raises `TypeError` because the categories are ordered and not identical.

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b])
    Traceback (most recent call last):
        ...
    TypeError: to union ordered Categoricals, all categories must be the same

    Ordered categoricals with different categories or orderings can be
    combined by using the `ignore_order=True` argument.

    >>> a = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> b = pd.Categorical(["c", "b", "a"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b], ignore_order=True)
    ['a', 'b', 'c', 'c', 'b', 'a']
    Categories (3, object): ['a', 'b', 'c']

    `union_categoricals` also works with a `CategoricalIndex`, or `Series`
    containing categorical data, but note that the resulting array will
    always be a plain `Categorical`

    >>> a = pd.Series(["b", "c"], dtype='category')
    >>> b = pd.Series(["a", "b"], dtype='category')
    >>> pd.api.types.union_categoricals([a, b])
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['b', 'c', 'a']
    """
    from pandas import Categorical
    from pandas.core.arrays.categorical import recode_for_categories

    if len(to_union) == 0:
        raise ValueError("No Categoricals to union")

    def _maybe_unwrap(x):
        # Extract the underlying Categorical from Index/Series wrappers;
        # reject anything that is not Categorical-backed.
        if isinstance(x, (ABCCategoricalIndex, ABCSeries)):
            return x._values
        elif isinstance(x, Categorical):
            return x
        else:
            raise TypeError("all components to combine must be Categorical")

    to_union = [_maybe_unwrap(x) for x in to_union]
    first = to_union[0]

    # All inputs must agree on the dtype of their categories (not the
    # categories themselves).
    if not lib.dtypes_all_equal([obj.categories.dtype for obj in to_union]):
        raise TypeError("dtype of categories must be the same")

    ordered = False
    if all(first._categories_match_up_to_permutation(other) for other in to_union[1:]):
        # identical categories - fastpath
        categories = first.categories
        ordered = first.ordered

        # Re-express every input's codes in terms of `first`'s categories,
        # then concatenate the code arrays directly.
        all_codes = [first._encode_with_my_categories(x)._codes for x in to_union]
        new_codes = np.concatenate(all_codes)

        if sort_categories and not ignore_order and ordered:
            raise TypeError("Cannot use sort_categories=True with ordered Categoricals")

        if sort_categories and not categories.is_monotonic_increasing:
            categories = categories.sort_values()
            # indexer maps each position in the old category order to its
            # position in the sorted order.
            indexer = categories.get_indexer(first.categories)

            from pandas.core.algorithms import take_nd

            # Remap codes into the sorted categories; fill_value=-1 keeps
            # NaN codes (-1) as -1.
            new_codes = take_nd(indexer, new_codes, fill_value=-1)
    elif ignore_order or all(not c.ordered for c in to_union):
        # different categories - union and recode
        cats = first.categories.append([c.categories for c in to_union[1:]])
        categories = cats.unique()
        if sort_categories:
            categories = categories.sort_values()

        # Recode each input's codes against the unioned categories before
        # concatenating.
        new_codes = [
            recode_for_categories(c.codes, c.categories, categories) for c in to_union
        ]
        new_codes = np.concatenate(new_codes)
    else:
        # ordered - to show a proper error message
        if all(c.ordered for c in to_union):
            msg = "to union ordered Categoricals, all categories must be the same"
            raise TypeError(msg)
        raise TypeError("Categorical.ordered must be the same")

    if ignore_order:
        ordered = False

    dtype = CategoricalDtype(categories=categories, ordered=ordered)
    return Categorical._simple_new(new_codes, dtype=dtype)