Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/dtypes/concat.py: 14%

1"""

2Utility functions related to concat.

3"""

4from __future__ import annotations

6from typing import TYPE_CHECKING

8import numpy as np

10from pandas._typing import AxisInt

12from pandas.core.dtypes.astype import astype_array

13from pandas.core.dtypes.cast import (

14 common_dtype_categorical_compat,

15 find_common_type,

16 np_find_common_type,

17)

18from pandas.core.dtypes.common import is_dtype_equal

19from pandas.core.dtypes.dtypes import (

20 DatetimeTZDtype,

21 ExtensionDtype,

22)

23from pandas.core.dtypes.generic import (

24 ABCCategoricalIndex,

25 ABCExtensionArray,

26 ABCSeries,

27)

29if TYPE_CHECKING:

30 from pandas.core.arrays import Categorical

def concat_compat(to_concat, axis: AxisInt = 0, ea_compat_axis: bool = False):
    """
    Concatenate a sequence of arrays, preserving a combined dtype if possible.

    Each array is expected to hold a single 'normalized' dtype (for example,
    if it is object dtype then it is non-datetimelike).

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation
    ea_compat_axis : bool, default False
        For ExtensionArray compat, behave as if axis == 1 when determining
        whether to drop empty arrays.

    Returns
    -------
    a single array, preserving the combined dtypes
    """

    # filter empty arrays
    # 1-d dtypes always are included here
    def is_nonempty(x) -> bool:
        if x.ndim <= axis:
            return True
        return x.shape[axis] > 0

    # If all arrays are empty, there's nothing to convert, just short-cut to
    # the concatenation, #3121.
    #
    # Creating an empty array directly is tempting, but the winnings would be
    # marginal given that it would still require shape & dtype calculation and
    # np.concatenate which has them both implemented is compiled.
    non_empties = [x for x in to_concat if is_nonempty(x)]
    if non_empties and axis == 0 and not ea_compat_axis:
        # ea_compat_axis see GH#39574
        to_concat = non_empties

    dtypes = {obj.dtype for obj in to_concat}
    kinds = {obj.dtype.kind for obj in to_concat}
    contains_datetime = any(
        isinstance(dtype, (np.dtype, DatetimeTZDtype)) and dtype.kind in ["m", "M"]
        for dtype in dtypes
    ) or any(isinstance(obj, ABCExtensionArray) and obj.ndim > 1 for obj in to_concat)

    if contains_datetime:
        return _concat_datetime(to_concat, axis=axis)

    all_empty = not non_empties
    # ``dtypes`` was built from the (possibly filtered) ``to_concat`` above,
    # so it can be reused instead of rebuilding the same set.
    single_dtype = len(dtypes) == 1
    any_ea = any(isinstance(dtype, ExtensionDtype) for dtype in dtypes)

    if any_ea:
        # we ignore axis here, as internally concatting with EAs is always
        # for axis=0
        if not single_dtype:
            target_dtype = find_common_type([x.dtype for x in to_concat])
            target_dtype = common_dtype_categorical_compat(to_concat, target_dtype)
            to_concat = [
                astype_array(arr, target_dtype, copy=False) for arr in to_concat
            ]

        if isinstance(to_concat[0], ABCExtensionArray):
            # TODO: what about EA-backed Index?
            cls = type(to_concat[0])
            return cls._concat_same_type(to_concat)
        else:
            return np.concatenate(to_concat)

    elif all_empty:
        # we have all empties, but may need to coerce the result dtype to
        # object if we have non-numeric type operands (numpy would otherwise
        # cast this to float)
        if len(kinds) != 1:
            if not len(kinds - {"i", "u", "f"}) or not len(kinds - {"b", "i", "u"}):
                # let numpy coerce
                pass
            else:
                # coerce to object
                to_concat = [x.astype("object") for x in to_concat]
                kinds = {"o"}

    # NOTE: non-empty, non-extension inputs are not pre-cast here;
    # np.concatenate applies its own dtype promotion.
    result = np.concatenate(to_concat, axis=axis)
    if "b" in kinds and result.dtype.kind in ["i", "u", "f"]:
        # GH#39817 cast to object instead of casting bools to numeric
        result = result.astype(object, copy=False)
    return result

def union_categoricals(
    to_union, sort_categories: bool = False, ignore_order: bool = False
) -> Categorical:
    """
    Combine list-like of Categorical-like, unioning categories.

    All categories must have the same dtype.

    Parameters
    ----------
    to_union : list-like
        Categorical, CategoricalIndex, or Series with dtype='category'.
    sort_categories : bool, default False
        If true, resulting categories will be lexsorted, otherwise
        they will be ordered as they appear in the data.
    ignore_order : bool, default False
        If true, the ordered attribute of the Categoricals will be ignored.
        Results in an unordered categorical.

    Returns
    -------
    Categorical

    Raises
    ------
    TypeError
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
    ValueError
        Empty list of categoricals passed

    Notes
    -----
    To learn more about categories, see `link
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html#unioning>`__

    Examples
    --------
    If you want to combine categoricals that do not necessarily have
    the same categories, `union_categoricals` will combine a list-like
    of categoricals. The new categories will be the union of the
    categories being combined.

    >>> a = pd.Categorical(["b", "c"])
    >>> b = pd.Categorical(["a", "b"])
    >>> pd.api.types.union_categoricals([a, b])
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['b', 'c', 'a']

    By default, the resulting categories will be ordered as they appear
    in the `categories` of the data. If you want the categories to be
    lexsorted, use `sort_categories=True` argument.

    >>> pd.api.types.union_categoricals([a, b], sort_categories=True)
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['a', 'b', 'c']

    `union_categoricals` also works with the case of combining two
    categoricals of the same categories and order information (e.g. what
    you could also `append` for).

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "a"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b])
    ['a', 'b', 'a', 'b', 'a']
    Categories (2, object): ['a' < 'b']

    Raises `TypeError` because the categories are ordered and not identical.

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b])
    Traceback (most recent call last):
        ...
    TypeError: to union ordered Categoricals, all categories must be the same

    New in version 0.20.0

    Ordered categoricals with different categories or orderings can be
    combined by using the `ignore_order=True` argument.

    >>> a = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> b = pd.Categorical(["c", "b", "a"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b], ignore_order=True)
    ['a', 'b', 'c', 'c', 'b', 'a']
    Categories (3, object): ['a', 'b', 'c']

    `union_categoricals` also works with a `CategoricalIndex`, or `Series`
    containing categorical data, but note that the resulting array will
    always be a plain `Categorical`

    >>> a = pd.Series(["b", "c"], dtype='category')
    >>> b = pd.Series(["a", "b"], dtype='category')
    >>> pd.api.types.union_categoricals([a, b])
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['b', 'c', 'a']
    """
    from pandas import Categorical
    from pandas.core.arrays.categorical import recode_for_categories

    if len(to_union) == 0:
        raise ValueError("No Categoricals to union")

    def _maybe_unwrap(x):
        # Series / CategoricalIndex are unwrapped via ``_values``;
        # anything else must already be a Categorical.
        if isinstance(x, (ABCCategoricalIndex, ABCSeries)):
            return x._values
        elif isinstance(x, Categorical):
            return x
        else:
            raise TypeError("all components to combine must be Categorical")

    to_union = [_maybe_unwrap(x) for x in to_union]
    first = to_union[0]

    if not all(
        is_dtype_equal(other.categories.dtype, first.categories.dtype)
        for other in to_union[1:]
    ):
        raise TypeError("dtype of categories must be the same")

    ordered = False
    if all(first._categories_match_up_to_permutation(other) for other in to_union[1:]):
        # identical categories - fastpath
        categories = first.categories
        ordered = first.ordered

        all_codes = [first._encode_with_my_categories(x)._codes for x in to_union]
        new_codes = np.concatenate(all_codes)

        if sort_categories and not ignore_order and ordered:
            raise TypeError("Cannot use sort_categories=True with ordered Categoricals")

        if sort_categories and not categories.is_monotonic_increasing:
            categories = categories.sort_values()
            # position of each of the first input's categories within the
            # sorted categories; used to remap the concatenated codes
            indexer = categories.get_indexer(first.categories)

            from pandas.core.algorithms import take_nd

            new_codes = take_nd(indexer, new_codes, fill_value=-1)
    elif ignore_order or all(not c.ordered for c in to_union):
        # different categories - union and recode
        cats = first.categories.append([c.categories for c in to_union[1:]])
        categories = cats.unique()
        if sort_categories:
            categories = categories.sort_values()

        new_codes = [
            recode_for_categories(c.codes, c.categories, categories) for c in to_union
        ]
        new_codes = np.concatenate(new_codes)
    else:
        # ordered - to show a proper error message
        if all(c.ordered for c in to_union):
            msg = "to union ordered Categoricals, all categories must be the same"
            raise TypeError(msg)
        raise TypeError("Categorical.ordered must be the same")

    if ignore_order:
        ordered = False

    return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True)

def _concatenate_2d(to_concat, axis: AxisInt):
    """
    Concatenate arrays along ``axis``, promoting inputs to 2D when axis == 1.

    Parameters
    ----------
    to_concat : sequence of arrays
    axis : axis to provide concatenation

    Returns
    -------
    np.ndarray
        The result of ``np.concatenate`` on the (possibly promoted) inputs.
    """
    # coerce to 2d if needed & concatenate
    if axis == 1:
        to_concat = [np.atleast_2d(x) for x in to_concat]
    return np.concatenate(to_concat, axis=axis)

def _concat_datetime(to_concat, axis: AxisInt = 0):
    """
    provide concatenation of a datetimelike array of arrays each of which is a
    single M8[ns], datetime64[ns, tz] or m8[ns] dtype

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation

    Returns
    -------
    a single array, preserving the combined dtypes
    """
    from pandas.core.construction import ensure_wrapped_if_datetimelike

    to_concat = [ensure_wrapped_if_datetimelike(x) for x in to_concat]

    single_dtype = len({x.dtype for x in to_concat}) == 1

    # multiple types, need to coerce to object
    if not single_dtype:
        # ensure_wrapped_if_datetimelike ensures that astype(object) wraps
        #  in Timestamp/Timedelta
        return _concatenate_2d([x.astype(object) for x in to_concat], axis=axis)

    # All inputs share one dtype: delegate to the first array's own class.
    result = type(to_concat[0])._concat_same_type(to_concat, axis=axis)
    return result