1"""
2Utility functions related to concat.
3"""
4from __future__ import annotations
5
6from typing import (
7 TYPE_CHECKING,
8 cast,
9)
10import warnings
11
12import numpy as np
13
14from pandas._libs import lib
15from pandas.util._exceptions import find_stack_level
16
17from pandas.core.dtypes.astype import astype_array
18from pandas.core.dtypes.cast import (
19 common_dtype_categorical_compat,
20 find_common_type,
21 np_find_common_type,
22)
23from pandas.core.dtypes.dtypes import CategoricalDtype
24from pandas.core.dtypes.generic import (
25 ABCCategoricalIndex,
26 ABCSeries,
27)
28
29if TYPE_CHECKING:
30 from collections.abc import Sequence
31
32 from pandas._typing import (
33 ArrayLike,
34 AxisInt,
35 DtypeObj,
36 )
37
38 from pandas.core.arrays import (
39 Categorical,
40 ExtensionArray,
41 )
42
43
44def _is_nonempty(x, axis) -> bool:
45 # filter empty arrays
46 # 1-d dtypes always are included here
47 if x.ndim <= axis:
48 return True
49 return x.shape[axis] > 0
50
51
def concat_compat(
    to_concat: Sequence[ArrayLike], axis: AxisInt = 0, ea_compat_axis: bool = False
) -> ArrayLike:
    """
    provide concatenation of an array of arrays each of which is a single
    'normalized' dtypes (in that for example, if it's object, then it is a
    non-datetimelike and provide a combined dtype for the resulting array that
    preserves the overall dtype if possible)

    Parameters
    ----------
    to_concat : sequence of arrays
    axis : axis to provide concatenation
    ea_compat_axis : bool, default False
        For ExtensionArray compat, behave as if axis == 1 when determining
        whether to drop empty arrays.

    Returns
    -------
    a single array, preserving the combined dtypes

    Notes
    -----
    Excluding empty entries when determining the result dtype is deprecated
    (GH#39122); a FutureWarning is emitted when the exclusion changes the
    result dtype.
    """
    # Fastpath: if every input already has the same dtype, no common-dtype
    # computation or casting is needed.
    if len(to_concat) and lib.dtypes_all_equal([obj.dtype for obj in to_concat]):
        # fastpath!
        obj = to_concat[0]
        if isinstance(obj, np.ndarray):
            to_concat_arrs = cast("Sequence[np.ndarray]", to_concat)
            return np.concatenate(to_concat_arrs, axis=axis)

        to_concat_eas = cast("Sequence[ExtensionArray]", to_concat)
        if ea_compat_axis:
            # We have 1D objects, that don't support axis keyword
            return obj._concat_same_type(to_concat_eas)
        elif axis == 0:
            return obj._concat_same_type(to_concat_eas)
        else:
            # e.g. DatetimeArray
            # NB: We are assuming here that ensure_wrapped_if_arraylike has
            # been called where relevant.
            return obj._concat_same_type(
                # error: Unexpected keyword argument "axis" for "_concat_same_type"
                # of "ExtensionArray"
                to_concat_eas,
                axis=axis,  # type: ignore[call-arg]
            )

    # If all arrays are empty, there's nothing to convert, just short-cut to
    # the concatenation, #3121.
    #
    # Creating an empty array directly is tempting, but the winnings would be
    # marginal given that it would still require shape & dtype calculation and
    # np.concatenate which has them both implemented is compiled.
    # Keep a reference to the full input so we can detect (below) whether
    # dropping empties changed the computed result dtype.
    orig = to_concat
    non_empties = [x for x in to_concat if _is_nonempty(x, axis)]
    if non_empties and axis == 0 and not ea_compat_axis:
        # ea_compat_axis see GH#39574
        to_concat = non_empties

    # Determine whether any ExtensionArrays are present, the dtype kinds
    # involved, and the common dtype (None means no casting needed).
    any_ea, kinds, target_dtype = _get_result_dtype(to_concat, non_empties)

    # Deprecation check: if empties were dropped above AND keeping them
    # would have produced a different result dtype, warn (GH#39122).
    if len(to_concat) < len(orig):
        _, _, alt_dtype = _get_result_dtype(orig, non_empties)
        if alt_dtype != target_dtype:
            # GH#39122
            warnings.warn(
                "The behavior of array concatenation with empty entries is "
                "deprecated. In a future version, this will no longer exclude "
                "empty items when determining the result dtype. "
                "To retain the old behavior, exclude the empty entries before "
                "the concat operation.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

    # Cast all pieces to the common dtype before concatenating.
    if target_dtype is not None:
        to_concat = [astype_array(arr, target_dtype, copy=False) for arr in to_concat]

    if not isinstance(to_concat[0], np.ndarray):
        # i.e. isinstance(to_concat[0], ExtensionArray)
        to_concat_eas = cast("Sequence[ExtensionArray]", to_concat)
        cls = type(to_concat[0])
        # GH#53640: eg. for datetime array, axis=1 but 0 is default
        # However, class method `_concat_same_type()` for some classes
        # may not support the `axis` keyword
        if ea_compat_axis or axis == 0:
            return cls._concat_same_type(to_concat_eas)
        else:
            return cls._concat_same_type(
                to_concat_eas,
                axis=axis,  # type: ignore[call-arg]
            )
    else:
        to_concat_arrs = cast("Sequence[np.ndarray]", to_concat)
        result = np.concatenate(to_concat_arrs, axis=axis)

        # Mixing bool with int/uint/float would let numpy cast bools to
        # numeric; preserve the values by casting to object instead.
        if not any_ea and "b" in kinds and result.dtype.kind in "iuf":
            # GH#39817 cast to object instead of casting bools to numeric
            result = result.astype(object, copy=False)
    return result
150
151
152def _get_result_dtype(
153 to_concat: Sequence[ArrayLike], non_empties: Sequence[ArrayLike]
154) -> tuple[bool, set[str], DtypeObj | None]:
155 target_dtype = None
156
157 dtypes = {obj.dtype for obj in to_concat}
158 kinds = {obj.dtype.kind for obj in to_concat}
159
160 any_ea = any(not isinstance(x, np.ndarray) for x in to_concat)
161 if any_ea:
162 # i.e. any ExtensionArrays
163
164 # we ignore axis here, as internally concatting with EAs is always
165 # for axis=0
166 if len(dtypes) != 1:
167 target_dtype = find_common_type([x.dtype for x in to_concat])
168 target_dtype = common_dtype_categorical_compat(to_concat, target_dtype)
169
170 elif not len(non_empties):
171 # we have all empties, but may need to coerce the result dtype to
172 # object if we have non-numeric type operands (numpy would otherwise
173 # cast this to float)
174 if len(kinds) != 1:
175 if not len(kinds - {"i", "u", "f"}) or not len(kinds - {"b", "i", "u"}):
176 # let numpy coerce
177 pass
178 else:
179 # coerce to object
180 target_dtype = np.dtype(object)
181 kinds = {"o"}
182 else:
183 # error: Argument 1 to "np_find_common_type" has incompatible type
184 # "*Set[Union[ExtensionDtype, Any]]"; expected "dtype[Any]"
185 target_dtype = np_find_common_type(*dtypes) # type: ignore[arg-type]
186
187 return any_ea, kinds, target_dtype
188
189
def union_categoricals(
    to_union, sort_categories: bool = False, ignore_order: bool = False
) -> Categorical:
    """
    Combine list-like of Categorical-like, unioning categories.

    All categories must have the same dtype.

    Parameters
    ----------
    to_union : list-like
        Categorical, CategoricalIndex, or Series with dtype='category'.
    sort_categories : bool, default False
        If true, resulting categories will be lexsorted, otherwise
        they will be ordered as they appear in the data.
    ignore_order : bool, default False
        If true, the ordered attribute of the Categoricals will be ignored.
        Results in an unordered categorical.

    Returns
    -------
    Categorical

    Raises
    ------
    TypeError
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
    ValueError
        Empty list of categoricals passed

    Notes
    -----
    To learn more about categories, see `link
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html#unioning>`__

    Examples
    --------
    If you want to combine categoricals that do not necessarily have
    the same categories, `union_categoricals` will combine a list-like
    of categoricals. The new categories will be the union of the
    categories being combined.

    >>> a = pd.Categorical(["b", "c"])
    >>> b = pd.Categorical(["a", "b"])
    >>> pd.api.types.union_categoricals([a, b])
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['b', 'c', 'a']

    By default, the resulting categories will be ordered as they appear
    in the `categories` of the data. If you want the categories to be
    lexsorted, use `sort_categories=True` argument.

    >>> pd.api.types.union_categoricals([a, b], sort_categories=True)
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['a', 'b', 'c']

    `union_categoricals` also works with the case of combining two
    categoricals of the same categories and order information (e.g. what
    you could also `append` for).

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "a"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b])
    ['a', 'b', 'a', 'b', 'a']
    Categories (2, object): ['a' < 'b']

    Raises `TypeError` because the categories are ordered and not identical.

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b])
    Traceback (most recent call last):
        ...
    TypeError: to union ordered Categoricals, all categories must be the same

    Ordered categoricals with different categories or orderings can be
    combined by using the `ignore_order=True` argument.

    >>> a = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> b = pd.Categorical(["c", "b", "a"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b], ignore_order=True)
    ['a', 'b', 'c', 'c', 'b', 'a']
    Categories (3, object): ['a', 'b', 'c']

    `union_categoricals` also works with a `CategoricalIndex`, or `Series`
    containing categorical data, but note that the resulting array will
    always be a plain `Categorical`

    >>> a = pd.Series(["b", "c"], dtype='category')
    >>> b = pd.Series(["a", "b"], dtype='category')
    >>> pd.api.types.union_categoricals([a, b])
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['b', 'c', 'a']
    """
    from pandas import Categorical
    from pandas.core.arrays.categorical import recode_for_categories

    if len(to_union) == 0:
        raise ValueError("No Categoricals to union")

    def _maybe_unwrap(x):
        # Extract the underlying Categorical from Index/Series wrappers;
        # reject anything that is not Categorical-backed.
        if isinstance(x, (ABCCategoricalIndex, ABCSeries)):
            return x._values
        elif isinstance(x, Categorical):
            return x
        else:
            raise TypeError("all components to combine must be Categorical")

    to_union = [_maybe_unwrap(x) for x in to_union]
    first = to_union[0]

    # All inputs must agree on the dtype of their categories (not the
    # categories themselves).
    if not lib.dtypes_all_equal([obj.categories.dtype for obj in to_union]):
        raise TypeError("dtype of categories must be the same")

    ordered = False
    if all(first._categories_match_up_to_permutation(other) for other in to_union[1:]):
        # identical categories - fastpath
        categories = first.categories
        ordered = first.ordered

        # Re-express every input's codes in terms of `first`'s categories,
        # then concatenate the code arrays directly.
        all_codes = [first._encode_with_my_categories(x)._codes for x in to_union]
        new_codes = np.concatenate(all_codes)

        if sort_categories and not ignore_order and ordered:
            raise TypeError("Cannot use sort_categories=True with ordered Categoricals")

        if sort_categories and not categories.is_monotonic_increasing:
            categories = categories.sort_values()
            # indexer maps each position in the old category order to its
            # position in the sorted order.
            indexer = categories.get_indexer(first.categories)

            from pandas.core.algorithms import take_nd

            # Remap codes into the sorted categories; fill_value=-1 keeps
            # NaN codes (-1) as -1.
            new_codes = take_nd(indexer, new_codes, fill_value=-1)
    elif ignore_order or all(not c.ordered for c in to_union):
        # different categories - union and recode
        cats = first.categories.append([c.categories for c in to_union[1:]])
        categories = cats.unique()
        if sort_categories:
            categories = categories.sort_values()

        # Recode each input's codes against the unioned categories before
        # concatenating.
        new_codes = [
            recode_for_categories(c.codes, c.categories, categories) for c in to_union
        ]
        new_codes = np.concatenate(new_codes)
    else:
        # ordered - to show a proper error message
        if all(c.ordered for c in to_union):
            msg = "to union ordered Categoricals, all categories must be the same"
            raise TypeError(msg)
        raise TypeError("Categorical.ordered must be the same")

    if ignore_order:
        ordered = False

    dtype = CategoricalDtype(categories=categories, ordered=ordered)
    return Categorical._simple_new(new_codes, dtype=dtype)