Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/dtypes/concat.py: 14%


1""" 

2Utility functions related to concat. 

3""" 

4from __future__ import annotations 

5 

6from typing import TYPE_CHECKING 

7 

8import numpy as np 

9 

10from pandas._typing import AxisInt 

11 

12from pandas.core.dtypes.astype import astype_array 

13from pandas.core.dtypes.cast import ( 

14 common_dtype_categorical_compat, 

15 find_common_type, 

16 np_find_common_type, 

17) 

18from pandas.core.dtypes.common import is_dtype_equal 

19from pandas.core.dtypes.dtypes import ( 

20 DatetimeTZDtype, 

21 ExtensionDtype, 

22) 

23from pandas.core.dtypes.generic import ( 

24 ABCCategoricalIndex, 

25 ABCExtensionArray, 

26 ABCSeries, 

27) 

28 

29if TYPE_CHECKING: 

30 from pandas.core.arrays import Categorical 

31 

32 

def concat_compat(to_concat, axis: AxisInt = 0, ea_compat_axis: bool = False):
    """
    Concatenate an array of arrays, each of which has a single 'normalized'
    dtype (meaning, for example, that if it is object dtype it is
    non-datetimelike), and provide a combined dtype for the resulting array
    that preserves the overall dtype where possible.

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation
    ea_compat_axis : bool, default False
        For ExtensionArray compat, behave as if axis == 1 when determining
        whether to drop empty arrays.

    Returns
    -------
    a single array, preserving the combined dtypes
    """

    # filter empty arrays
    # 1-d dtypes always are included here
    def is_nonempty(x) -> bool:
        if x.ndim <= axis:
            return True
        return x.shape[axis] > 0

    # If all arrays are empty, there's nothing to convert, just short-cut to
    # the concatenation, #3121.
    #
    # Creating an empty array directly is tempting, but the winnings would be
    # marginal given that it would still require shape & dtype calculation and
    # np.concatenate which has them both implemented is compiled.
    non_empties = [x for x in to_concat if is_nonempty(x)]
    if non_empties and axis == 0 and not ea_compat_axis:
        # ea_compat_axis see GH#39574
        to_concat = non_empties

    dtypes = {obj.dtype for obj in to_concat}
    kinds = {obj.dtype.kind for obj in to_concat}
    contains_datetime = any(
        isinstance(dtype, (np.dtype, DatetimeTZDtype)) and dtype.kind in ["m", "M"]
        for dtype in dtypes
    ) or any(isinstance(obj, ABCExtensionArray) and obj.ndim > 1 for obj in to_concat)

    all_empty = not len(non_empties)
    single_dtype = len({x.dtype for x in to_concat}) == 1
    any_ea = any(isinstance(x.dtype, ExtensionDtype) for x in to_concat)

    if contains_datetime:
        return _concat_datetime(to_concat, axis=axis)

    if any_ea:
        # we ignore axis here, as internally concatting with EAs is always
        # for axis=0
        if not single_dtype:
            target_dtype = find_common_type([x.dtype for x in to_concat])
            target_dtype = common_dtype_categorical_compat(to_concat, target_dtype)
            to_concat = [
                astype_array(arr, target_dtype, copy=False) for arr in to_concat
            ]

        if isinstance(to_concat[0], ABCExtensionArray):
            # TODO: what about EA-backed Index?
            cls = type(to_concat[0])
            return cls._concat_same_type(to_concat)
        else:
            return np.concatenate(to_concat)

    elif all_empty:
        # we have all empties, but may need to coerce the result dtype to
        # object if we have non-numeric type operands (numpy would otherwise
        # cast this to float)
        if len(kinds) != 1:
            if not len(kinds - {"i", "u", "f"}) or not len(kinds - {"b", "i", "u"}):
                # let numpy coerce
                pass
            else:
                # coerce to object
                to_concat = [x.astype("object") for x in to_concat]
                kinds = {"o"}
    else:
        target_dtype = np_find_common_type(*dtypes)

    result = np.concatenate(to_concat, axis=axis)
    if "b" in kinds and result.dtype.kind in ["i", "u", "f"]:
        # GH#39817 cast to object instead of casting bools to numeric
        result = result.astype(object, copy=False)
    return result

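# A minimal sketch of how the dtype handling above plays out, assuming plain
# numpy inputs; concat_compat is an internal helper, so this is illustrative
# only (doctest-style, integer dtypes spelled out to stay platform-neutral):
#
#   >>> import numpy as np
#   >>> from pandas.core.dtypes.concat import concat_compat
#
#   Empty arrays are dropped before the common dtype is decided, so the empty
#   float array does not force an upcast here:
#
#   >>> concat_compat([np.array([], dtype="float64"),
#   ...                np.array([1, 2], dtype="int64")]).dtype
#   dtype('int64')
#
#   Mixing bool with a numeric kind takes the GH#39817 branch and coerces the
#   result to object rather than letting numpy cast the bools to integers:
#
#   >>> concat_compat([np.array([True, False]),
#   ...                np.array([1, 2], dtype="int64")]).dtype
#   dtype('O')
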

def union_categoricals(
    to_union, sort_categories: bool = False, ignore_order: bool = False
) -> Categorical:
    """
    Combine list-like of Categorical-like, unioning categories.

    All categories must have the same dtype.

    Parameters
    ----------
    to_union : list-like
        Categorical, CategoricalIndex, or Series with dtype='category'.
    sort_categories : bool, default False
        If true, resulting categories will be lexsorted, otherwise
        they will be ordered as they appear in the data.
    ignore_order : bool, default False
        If true, the ordered attribute of the Categoricals will be ignored.
        Results in an unordered categorical.

    Returns
    -------
    Categorical

    Raises
    ------
    TypeError
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
    ValueError
        Empty list of categoricals passed

    Notes
    -----
    To learn more about categories, see `link
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html#unioning>`__

    Examples
    --------
    If you want to combine categoricals that do not necessarily have
    the same categories, `union_categoricals` will combine a list-like
    of categoricals. The new categories will be the union of the
    categories being combined.

    >>> a = pd.Categorical(["b", "c"])
    >>> b = pd.Categorical(["a", "b"])
    >>> pd.api.types.union_categoricals([a, b])
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['b', 'c', 'a']

    By default, the resulting categories will be ordered as they appear
    in the `categories` of the data. If you want the categories to be
    lexsorted, use the `sort_categories=True` argument.

    >>> pd.api.types.union_categoricals([a, b], sort_categories=True)
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['a', 'b', 'c']

    `union_categoricals` also works with the case of combining two
    categoricals of the same categories and order information (e.g. what
    you could also `append` for).

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "a"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b])
    ['a', 'b', 'a', 'b', 'a']
    Categories (2, object): ['a' < 'b']

    Raises `TypeError` because the categories are ordered and not identical.

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b])
    Traceback (most recent call last):
    ...
    TypeError: to union ordered Categoricals, all categories must be the same

    New in version 0.20.0

    Ordered categoricals with different categories or orderings can be
    combined by using the `ignore_order=True` argument.

    >>> a = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> b = pd.Categorical(["c", "b", "a"], ordered=True)
    >>> pd.api.types.union_categoricals([a, b], ignore_order=True)
    ['a', 'b', 'c', 'c', 'b', 'a']
    Categories (3, object): ['a', 'b', 'c']

    `union_categoricals` also works with a `CategoricalIndex`, or `Series`
    containing categorical data, but note that the resulting array will
    always be a plain `Categorical`

    >>> a = pd.Series(["b", "c"], dtype='category')
    >>> b = pd.Series(["a", "b"], dtype='category')
    >>> pd.api.types.union_categoricals([a, b])
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['b', 'c', 'a']
    """
    from pandas import Categorical
    from pandas.core.arrays.categorical import recode_for_categories

    if len(to_union) == 0:
        raise ValueError("No Categoricals to union")

    def _maybe_unwrap(x):
        if isinstance(x, (ABCCategoricalIndex, ABCSeries)):
            return x._values
        elif isinstance(x, Categorical):
            return x
        else:
            raise TypeError("all components to combine must be Categorical")

    to_union = [_maybe_unwrap(x) for x in to_union]
    first = to_union[0]

    if not all(
        is_dtype_equal(other.categories.dtype, first.categories.dtype)
        for other in to_union[1:]
    ):
        raise TypeError("dtype of categories must be the same")

    ordered = False
    if all(first._categories_match_up_to_permutation(other) for other in to_union[1:]):
        # identical categories - fastpath
        categories = first.categories
        ordered = first.ordered

        all_codes = [first._encode_with_my_categories(x)._codes for x in to_union]
        new_codes = np.concatenate(all_codes)

        if sort_categories and not ignore_order and ordered:
            raise TypeError("Cannot use sort_categories=True with ordered Categoricals")

        if sort_categories and not categories.is_monotonic_increasing:
            categories = categories.sort_values()
            indexer = categories.get_indexer(first.categories)

            from pandas.core.algorithms import take_nd

            new_codes = take_nd(indexer, new_codes, fill_value=-1)
    elif ignore_order or all(not c.ordered for c in to_union):
        # different categories - union and recode
        cats = first.categories.append([c.categories for c in to_union[1:]])
        categories = cats.unique()
        if sort_categories:
            categories = categories.sort_values()

        new_codes = [
            recode_for_categories(c.codes, c.categories, categories) for c in to_union
        ]
        new_codes = np.concatenate(new_codes)
    else:
        # ordered - to show a proper error message
        if all(c.ordered for c in to_union):
            msg = "to union ordered Categoricals, all categories must be the same"
            raise TypeError(msg)
        raise TypeError("Categorical.ordered must be the same")

    if ignore_order:
        ordered = False

    return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True)

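# A short sketch of the sort_categories / ordered interaction implemented in
# the fastpath above (not part of the upstream docstring): identical ordered
# categories are accepted, but requesting lexsorted categories on ordered
# inputs is rejected, matching the Raises section.
#
#   >>> import pandas as pd
#   >>> a = pd.Categorical(["b", "a"], ordered=True)
#   >>> b = pd.Categorical(["a", "b"], ordered=True)
#   >>> pd.api.types.union_categoricals([a, b], sort_categories=True)
#   Traceback (most recent call last):
#   ...
#   TypeError: Cannot use sort_categories=True with ordered Categoricals
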

def _concatenate_2d(to_concat, axis: AxisInt):
    # coerce to 2d if needed & concatenate
    if axis == 1:
        to_concat = [np.atleast_2d(x) for x in to_concat]
    return np.concatenate(to_concat, axis=axis)


def _concat_datetime(to_concat, axis: AxisInt = 0):
    """
    Concatenate a datetimelike array of arrays, each of which has a single
    M8[ns], datetime64[ns, tz] or m8[ns] dtype.

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation

    Returns
    -------
    a single array, preserving the combined dtypes
    """
    from pandas.core.construction import ensure_wrapped_if_datetimelike

    to_concat = [ensure_wrapped_if_datetimelike(x) for x in to_concat]

    single_dtype = len({x.dtype for x in to_concat}) == 1

    # multiple types, need to coerce to object
    if not single_dtype:
        # ensure_wrapped_if_datetimelike ensures that astype(object) wraps
        # in Timestamp/Timedelta
        return _concatenate_2d([x.astype(object) for x in to_concat], axis=axis)

    result = type(to_concat[0])._concat_same_type(to_concat, axis=axis)
    return result
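
# A minimal sketch of the mixed-dtype path above, calling the internal helper
# directly for illustration: datetime64 and timedelta64 inputs do not share a
# dtype, so each array is cast to object and the scalars come back wrapped as
# Timestamp / Timedelta.
#
#   >>> import numpy as np
#   >>> from pandas.core.dtypes.concat import _concat_datetime
#   >>> dti = np.array(["2020-01-01"], dtype="M8[ns]")
#   >>> tdi = np.array([86_400_000_000_000], dtype="m8[ns]")
#   >>> out = _concat_datetime([dti, tdi])
#   >>> out.dtype
#   dtype('O')
#   >>> type(out[0]).__name__, type(out[1]).__name__
#   ('Timestamp', 'Timedelta')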