Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/dtypes/concat.py: 39%

112 statements  

1""" 

2Utility functions related to concat. 

3""" 

4from __future__ import annotations 

5 

6from typing import ( 

7 TYPE_CHECKING, 

8 cast, 

9) 

10import warnings 

11 

12import numpy as np 

13 

14from pandas._libs import lib 

15from pandas.util._exceptions import find_stack_level 

16 

17from pandas.core.dtypes.astype import astype_array 

18from pandas.core.dtypes.cast import ( 

19 common_dtype_categorical_compat, 

20 find_common_type, 

21 np_find_common_type, 

22) 

23from pandas.core.dtypes.dtypes import CategoricalDtype 

24from pandas.core.dtypes.generic import ( 

25 ABCCategoricalIndex, 

26 ABCSeries, 

27) 

28 

29if TYPE_CHECKING: 

30 from collections.abc import Sequence 

31 

32 from pandas._typing import ( 

33 ArrayLike, 

34 AxisInt, 

35 DtypeObj, 

36 ) 

37 

38 from pandas.core.arrays import ( 

39 Categorical, 

40 ExtensionArray, 

41 ) 

42 

43 

44def _is_nonempty(x, axis) -> bool: 

45 # filter empty arrays 

46 # 1-d dtypes always are included here 

47 if x.ndim <= axis: 

48 return True 

49 return x.shape[axis] > 0 

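# Editor's illustrative sketch (not part of the original module): _is_nonempty
# checks emptiness along ``axis`` only when the array actually has that axis;
# anything with ndim <= axis (e.g. a 1-D array checked as if axis == 1) is
# always kept. A minimal, hedged demonstration assuming plain NumPy inputs:
def _is_nonempty_sketch() -> None:
    assert not _is_nonempty(np.array([], dtype=float), axis=0)  # empty along axis 0
    assert _is_nonempty(np.array([1.0, 2.0]), axis=0)  # has elements along axis 0
    assert _is_nonempty(np.array([], dtype=float), axis=1)  # ndim <= axis -> kept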

def concat_compat(
    to_concat: Sequence[ArrayLike], axis: AxisInt = 0, ea_compat_axis: bool = False
) -> ArrayLike:
    """
    provide concatenation of an array of arrays each of which is a single
    'normalized' dtype (in that for example, if it's object, then it is a
    non-datetimelike) and provide a combined dtype for the resulting array
    that preserves the overall dtype if possible

    Parameters
    ----------
    to_concat : sequence of arrays
    axis : axis along which to concatenate
    ea_compat_axis : bool, default False
        For ExtensionArray compat, behave as if axis == 1 when determining
        whether to drop empty arrays.

    Returns
    -------
    a single array, preserving the combined dtypes
    """
    if len(to_concat) and lib.dtypes_all_equal([obj.dtype for obj in to_concat]):
        # fastpath!
        obj = to_concat[0]
        if isinstance(obj, np.ndarray):
            to_concat_arrs = cast("Sequence[np.ndarray]", to_concat)
            return np.concatenate(to_concat_arrs, axis=axis)

        to_concat_eas = cast("Sequence[ExtensionArray]", to_concat)
        if ea_compat_axis:
            # We have 1D objects, that don't support axis keyword
            return obj._concat_same_type(to_concat_eas)
        elif axis == 0:
            return obj._concat_same_type(to_concat_eas)
        else:
            # e.g. DatetimeArray
            # NB: We are assuming here that ensure_wrapped_if_arraylike has
            # been called where relevant.
            return obj._concat_same_type(
                # error: Unexpected keyword argument "axis" for "_concat_same_type"
                # of "ExtensionArray"
                to_concat_eas,
                axis=axis,  # type: ignore[call-arg]
            )

    # If all arrays are empty, there's nothing to convert, just short-cut to
    # the concatenation, #3121.
    #
    # Creating an empty array directly is tempting, but the winnings would be
    # marginal given that it would still require shape & dtype calculation and
    # np.concatenate which has them both implemented is compiled.
    orig = to_concat
    non_empties = [x for x in to_concat if _is_nonempty(x, axis)]
    if non_empties and axis == 0 and not ea_compat_axis:
        # ea_compat_axis see GH#39574
        to_concat = non_empties

    any_ea, kinds, target_dtype = _get_result_dtype(to_concat, non_empties)

    if len(to_concat) < len(orig):
        _, _, alt_dtype = _get_result_dtype(orig, non_empties)
        if alt_dtype != target_dtype:
            # GH#39122
            warnings.warn(
                "The behavior of array concatenation with empty entries is "
                "deprecated. In a future version, this will no longer exclude "
                "empty items when determining the result dtype. "
                "To retain the old behavior, exclude the empty entries before "
                "the concat operation.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

    if target_dtype is not None:
        to_concat = [astype_array(arr, target_dtype, copy=False) for arr in to_concat]

    if not isinstance(to_concat[0], np.ndarray):
        # i.e. isinstance(to_concat[0], ExtensionArray)
        to_concat_eas = cast("Sequence[ExtensionArray]", to_concat)
        cls = type(to_concat[0])
        # GH#53640: eg. for datetime array, axis=1 but 0 is default
        # However, class method `_concat_same_type()` for some classes
        # may not support the `axis` keyword
        if ea_compat_axis or axis == 0:
            return cls._concat_same_type(to_concat_eas)
        else:
            return cls._concat_same_type(
                to_concat_eas,
                axis=axis,  # type: ignore[call-arg]
            )
    else:
        to_concat_arrs = cast("Sequence[np.ndarray]", to_concat)
        result = np.concatenate(to_concat_arrs, axis=axis)

        if not any_ea and "b" in kinds and result.dtype.kind in "iuf":
            # GH#39817 cast to object instead of casting bools to numeric
            result = result.astype(object, copy=False)
    return result

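# Editor's illustrative sketch (not part of the original module). It exercises
# the ndarray paths of concat_compat above: the dtypes_all_equal fastpath, the
# common-dtype path driven by _get_result_dtype, and the GH#39817 rule that
# keeps bool/numeric mixtures as object. concat_compat is private pandas API,
# so treat this as a sketch of current behavior, not a stable contract.
def _concat_compat_sketch() -> None:
    ints = np.array([1, 2], dtype=np.int64)
    more_ints = np.array([3, 4], dtype=np.int64)
    floats = np.array([0.5], dtype=np.float64)
    bools = np.array([True, False])

    # identical dtypes: fastpath goes straight to np.concatenate
    assert concat_compat([ints, more_ints]).dtype == np.dtype("int64")

    # mixed int/float: np_find_common_type picks float64 before concatenating
    assert concat_compat([ints, floats]).dtype == np.dtype("float64")

    # bool mixed with ints: cast to object instead of silently becoming int
    assert concat_compat([ints, bools]).dtype == np.dtype(object)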

def _get_result_dtype(
    to_concat: Sequence[ArrayLike], non_empties: Sequence[ArrayLike]
) -> tuple[bool, set[str], DtypeObj | None]:
    target_dtype = None

    dtypes = {obj.dtype for obj in to_concat}
    kinds = {obj.dtype.kind for obj in to_concat}

    any_ea = any(not isinstance(x, np.ndarray) for x in to_concat)
    if any_ea:
        # i.e. any ExtensionArrays

        # we ignore axis here, as internally concatting with EAs is always
        # for axis=0
        if len(dtypes) != 1:
            target_dtype = find_common_type([x.dtype for x in to_concat])
            target_dtype = common_dtype_categorical_compat(to_concat, target_dtype)

    elif not len(non_empties):
        # we have all empties, but may need to coerce the result dtype to
        # object if we have non-numeric type operands (numpy would otherwise
        # cast this to float)
        if len(kinds) != 1:
            if not len(kinds - {"i", "u", "f"}) or not len(kinds - {"b", "i", "u"}):
                # let numpy coerce
                pass
            else:
                # coerce to object
                target_dtype = np.dtype(object)
                kinds = {"o"}
    else:
        # error: Argument 1 to "np_find_common_type" has incompatible type
        # "*Set[Union[ExtensionDtype, Any]]"; expected "dtype[Any]"
        target_dtype = np_find_common_type(*dtypes)  # type: ignore[arg-type]

    return any_ea, kinds, target_dtype

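# Editor's illustrative sketch (not part of the original module): the
# (any_ea, kinds, target_dtype) triple returned above is what concat_compat
# uses to decide on casting. A hedged sketch for plain ndarrays, assuming the
# helper keeps its current (private) signature:
def _get_result_dtype_sketch() -> None:
    ints = np.array([1, 2], dtype=np.int64)
    floats = np.array([0.5], dtype=np.float64)
    any_ea, kinds, target = _get_result_dtype([ints, floats], [ints, floats])
    assert not any_ea and kinds == {"i", "f"} and target == np.dtype("float64")

    # all-empty inputs with a non-numeric operand are forced to object so that
    # NumPy does not upcast the result to float
    empty_ints = np.array([], dtype=np.int64)
    empty_objs = np.array([], dtype=object)
    _, kinds, target = _get_result_dtype([empty_ints, empty_objs], [])
    assert kinds == {"o"} and target == np.dtype(object)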

def union_categoricals(
    to_union, sort_categories: bool = False, ignore_order: bool = False
) -> Categorical:
    """
    Combine list-like of Categorical-like, unioning categories.

    All categories must have the same dtype.

    Parameters
    ----------
    to_union : list-like
        Categorical, CategoricalIndex, or Series with dtype='category'.
    sort_categories : bool, default False
        If true, resulting categories will be lexsorted, otherwise
        they will be ordered as they appear in the data.
    ignore_order : bool, default False
        If true, the ordered attribute of the Categoricals will be ignored.
        Results in an unordered categorical.

    Returns
    -------
    Categorical

    Raises
    ------
    TypeError
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
    ValueError
        Empty list of categoricals passed

    Notes
    -----
    To learn more about categories, see `link
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html#unioning>`__

    Examples
    --------
    If you want to combine categoricals that do not necessarily have
    the same categories, `union_categoricals` will combine a list-like
    of categoricals. The new categories will be the union of the
    categories being combined.

    >>> a = pd.Categorical(["b", "c"])
    >>> b = pd.Categorical(["a", "b"])
    >>> pd.api.types.union_categoricals([a, b])
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['b', 'c', 'a']

    By default, the resulting categories will be ordered as they appear
    in the `categories` of the data. If you want the categories to be
    lexsorted, use the `sort_categories=True` argument.

    >>> pd.api.types.union_categoricals([a, b], sort_categories=True)
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['a', 'b', 'c']

    `union_categoricals` also works with the case of combining two
    categoricals of the same categories and order information (e.g. what
    you could also `append` for).

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "a"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b])
    ['a', 'b', 'a', 'b', 'a']
    Categories (2, object): ['a' < 'b']

    Raises `TypeError` because the categories are ordered and not identical.

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b])
    Traceback (most recent call last):
        ...
    TypeError: to union ordered Categoricals, all categories must be the same

    Ordered categoricals with different categories or orderings can be
    combined by using the `ignore_order=True` argument.

    >>> a = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> b = pd.Categorical(["c", "b", "a"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b], ignore_order=True)
    ['a', 'b', 'c', 'c', 'b', 'a']
    Categories (3, object): ['a', 'b', 'c']

    `union_categoricals` also works with a `CategoricalIndex`, or `Series`
    containing categorical data, but note that the resulting array will
    always be a plain `Categorical`

    >>> a = pd.Series(["b", "c"], dtype='category')
    >>> b = pd.Series(["a", "b"], dtype='category')
    >>> pd.api.types.union_categoricals([a, b])
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['b', 'c', 'a']
    """
    from pandas import Categorical
    from pandas.core.arrays.categorical import recode_for_categories

    if len(to_union) == 0:
        raise ValueError("No Categoricals to union")

    def _maybe_unwrap(x):
        if isinstance(x, (ABCCategoricalIndex, ABCSeries)):
            return x._values
        elif isinstance(x, Categorical):
            return x
        else:
            raise TypeError("all components to combine must be Categorical")

    to_union = [_maybe_unwrap(x) for x in to_union]
    first = to_union[0]

    if not lib.dtypes_all_equal([obj.categories.dtype for obj in to_union]):
        raise TypeError("dtype of categories must be the same")

    ordered = False
    if all(first._categories_match_up_to_permutation(other) for other in to_union[1:]):
        # identical categories - fastpath
        categories = first.categories
        ordered = first.ordered

        all_codes = [first._encode_with_my_categories(x)._codes for x in to_union]
        new_codes = np.concatenate(all_codes)

        if sort_categories and not ignore_order and ordered:
            raise TypeError("Cannot use sort_categories=True with ordered Categoricals")

        if sort_categories and not categories.is_monotonic_increasing:
            categories = categories.sort_values()
            indexer = categories.get_indexer(first.categories)

            from pandas.core.algorithms import take_nd

            new_codes = take_nd(indexer, new_codes, fill_value=-1)
    elif ignore_order or all(not c.ordered for c in to_union):
        # different categories - union and recode
        cats = first.categories.append([c.categories for c in to_union[1:]])
        categories = cats.unique()
        if sort_categories:
            categories = categories.sort_values()

        new_codes = [
            recode_for_categories(c.codes, c.categories, categories) for c in to_union
        ]
        new_codes = np.concatenate(new_codes)
    else:
        # ordered - to show a proper error message
        if all(c.ordered for c in to_union):
            msg = "to union ordered Categoricals, all categories must be the same"
            raise TypeError(msg)
        raise TypeError("Categorical.ordered must be the same")

    if ignore_order:
        ordered = False

    dtype = CategoricalDtype(categories=categories, ordered=ordered)
    return Categorical._simple_new(new_codes, dtype=dtype)
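

# Editor's follow-up sketch (not part of the original module): the docstring
# examples above show the union itself; this contrasts it with plain concat,
# where Series whose category sets differ fall back to object dtype. A hedged
# sketch assuming current pandas concat semantics:
def _union_categoricals_sketch() -> None:
    import pandas as pd

    a = pd.Series(["b", "c"], dtype="category")
    b = pd.Series(["a", "b"], dtype="category")

    combined = union_categoricals([a, b])
    assert isinstance(combined.dtype, CategoricalDtype)
    assert set(combined.categories) == {"a", "b", "c"}

    # pd.concat does not union differing categories; the result is object
    assert pd.concat([a, b], ignore_index=True).dtype == np.dtype(object)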