Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/indexes/category.py: 38%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

120 statements  

1from __future__ import annotations 

2 

3from typing import ( 

4 TYPE_CHECKING, 

5 Any, 

6 Literal, 

7 cast, 

8) 

9 

10import numpy as np 

11 

12from pandas._libs import index as libindex 

13from pandas.util._decorators import ( 

14 cache_readonly, 

15 doc, 

16) 

17 

18from pandas.core.dtypes.common import is_scalar 

19from pandas.core.dtypes.concat import concat_compat 

20from pandas.core.dtypes.dtypes import CategoricalDtype 

21from pandas.core.dtypes.missing import ( 

22 is_valid_na_for_dtype, 

23 isna, 

24) 

25 

26from pandas.core.arrays.categorical import ( 

27 Categorical, 

28 contains, 

29) 

30from pandas.core.construction import extract_array 

31from pandas.core.indexes.base import ( 

32 Index, 

33 maybe_extract_name, 

34) 

35from pandas.core.indexes.extension import ( 

36 NDArrayBackedExtensionIndex, 

37 inherit_names, 

38) 

39 

40if TYPE_CHECKING: 

41 from collections.abc import Hashable 

42 

43 from pandas._typing import ( 

44 Dtype, 

45 DtypeObj, 

46 Self, 

47 npt, 

48 ) 

49 

50 

51@inherit_names( 

52 [ 

53 "argsort", 

54 "tolist", 

55 "codes", 

56 "categories", 

57 "ordered", 

58 "_reverse_indexer", 

59 "searchsorted", 

60 "min", 

61 "max", 

62 ], 

63 Categorical, 

64) 

65@inherit_names( 

66 [ 

67 "rename_categories", 

68 "reorder_categories", 

69 "add_categories", 

70 "remove_categories", 

71 "remove_unused_categories", 

72 "set_categories", 

73 "as_ordered", 

74 "as_unordered", 

75 ], 

76 Categorical, 

77 wrap=True, 

78) 

79class CategoricalIndex(NDArrayBackedExtensionIndex): 

80 """ 

81 Index based on an underlying :class:`Categorical`. 

82 

83 CategoricalIndex, like Categorical, can only take on a limited, 

84 and usually fixed, number of possible values (`categories`). Also, 

85 like Categorical, it might have an order, but numerical operations 

86 (additions, divisions, ...) are not possible. 

87 

88 Parameters 

89 ---------- 

90 data : array-like (1-dimensional) 

91 The values of the categorical. If `categories` are given, values not in 

92 `categories` will be replaced with NaN. 

93 categories : index-like, optional 

94 The categories for the categorical. Items need to be unique. 

95 If the categories are not given here (and also not in `dtype`), they 

96 will be inferred from the `data`. 

97 ordered : bool, optional 

98 Whether or not this categorical is treated as an ordered 

99 categorical. If not given here or in `dtype`, the resulting 

100 categorical will be unordered. 

101 dtype : CategoricalDtype or "category", optional 

102 If :class:`CategoricalDtype`, cannot be used together with 

103 `categories` or `ordered`. 

104 copy : bool, default False 

105 Make a copy of input ndarray. 

106 name : object, optional 

107 Name to be stored in the index. 

108 

109 Attributes 

110 ---------- 

111 codes 

112 categories 

113 ordered 

114 

115 Methods 

116 ------- 

117 rename_categories 

118 reorder_categories 

119 add_categories 

120 remove_categories 

121 remove_unused_categories 

122 set_categories 

123 as_ordered 

124 as_unordered 

125 map 

126 

127 Raises 

128 ------ 

129 ValueError 

130 If the categories do not validate. 

131 TypeError 

132 If an explicit ``ordered=True`` is given but no `categories` and the 

133 `values` are not sortable. 

134 

135 See Also 

136 -------- 

137 Index : The base pandas Index type. 

138 Categorical : A categorical array. 

139 CategoricalDtype : Type for categorical data. 

140 

141 Notes 

142 ----- 

143 See the `user guide 

144 <https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#categoricalindex>`__ 

145 for more. 

146 

147 Examples 

148 -------- 

149 >>> pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]) 

150 CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], 

151 categories=['a', 'b', 'c'], ordered=False, dtype='category') 

152 

153 ``CategoricalIndex`` can also be instantiated from a ``Categorical``: 

154 

155 >>> c = pd.Categorical(["a", "b", "c", "a", "b", "c"]) 

156 >>> pd.CategoricalIndex(c) 

157 CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], 

158 categories=['a', 'b', 'c'], ordered=False, dtype='category') 

159 

160 Ordered ``CategoricalIndex`` can have a min and max value. 

161 

162 >>> ci = pd.CategoricalIndex( 

163 ... ["a", "b", "c", "a", "b", "c"], ordered=True, categories=["c", "b", "a"] 

164 ... ) 

165 >>> ci 

166 CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], 

167 categories=['c', 'b', 'a'], ordered=True, dtype='category') 

168 >>> ci.min() 

169 'c' 

170 """ 

171 

172 _typ = "categoricalindex" 

173 _data_cls = Categorical 

174 

175 @property 

176 def _can_hold_strings(self): 

177 return self.categories._can_hold_strings 

178 

179 @cache_readonly 

180 def _should_fallback_to_positional(self) -> bool: 

181 return self.categories._should_fallback_to_positional 

182 

183 codes: np.ndarray 

184 categories: Index 

185 ordered: bool | None 

186 _data: Categorical 

187 _values: Categorical 

188 

189 @property 

190 def _engine_type(self) -> type[libindex.IndexEngine]: 

191 # self.codes can have dtype int8, int16, int32 or int64, so we need 

192 # to return the corresponding engine type (libindex.Int8Engine, etc.). 

193 return { 

194 np.int8: libindex.Int8Engine, 

195 np.int16: libindex.Int16Engine, 

196 np.int32: libindex.Int32Engine, 

197 np.int64: libindex.Int64Engine, 

198 }[self.codes.dtype.type] 

199 

200 # -------------------------------------------------------------------- 

201 # Constructors 

202 

203 def __new__( 

204 cls, 

205 data=None, 

206 categories=None, 

207 ordered=None, 

208 dtype: Dtype | None = None, 

209 copy: bool = False, 

210 name: Hashable | None = None, 

211 ) -> Self: 

212 name = maybe_extract_name(name, data, cls) 

213 

214 if is_scalar(data): 

215 # GH#38944 include None here, which pre-2.0 subbed in [] 

216 cls._raise_scalar_data_error(data) 

217 

218 data = Categorical( 

219 data, categories=categories, ordered=ordered, dtype=dtype, copy=copy 

220 ) 

221 

222 return cls._simple_new(data, name=name) 

223 

224 # -------------------------------------------------------------------- 

225 

226 def _is_dtype_compat(self, other: Index) -> Categorical: 

227 """ 

228 *this is an internal non-public method* 

229 

230 provide a comparison between the dtype of self and other (coercing if 

231 needed) 

232 

233 Parameters 

234 ---------- 

235 other : Index 

236 

237 Returns 

238 ------- 

239 Categorical 

240 

241 Raises 

242 ------ 

243 TypeError if the dtypes are not compatible 

244 """ 

245 if isinstance(other.dtype, CategoricalDtype): 

246 cat = extract_array(other) 

247 cat = cast(Categorical, cat) 

248 if not cat._categories_match_up_to_permutation(self._values): 

249 raise TypeError( 

250 "categories must match existing categories when appending" 

251 ) 

252 

253 elif other._is_multi: 

254 # preempt raising NotImplementedError in isna call 

255 raise TypeError("MultiIndex is not dtype-compatible with CategoricalIndex") 

256 else: 

257 values = other 

258 

259 cat = Categorical(other, dtype=self.dtype) 

260 other = CategoricalIndex(cat) 

261 if not other.isin(values).all(): 

262 raise TypeError( 

263 "cannot append a non-category item to a CategoricalIndex" 

264 ) 

265 cat = other._values 

266 

267 if not ((cat == values) | (isna(cat) & isna(values))).all(): 

268 # GH#37667 see test_equals_non_category 

269 raise TypeError( 

270 "categories must match existing categories when appending" 

271 ) 

272 

273 return cat 

274 

275 def equals(self, other: object) -> bool: 

276 """ 

277 Determine if two CategoricalIndex objects contain the same elements. 

278 

279 Returns 

280 ------- 

281 bool 

282 ``True`` if two :class:`pandas.CategoricalIndex` objects have equal 

283 elements, ``False`` otherwise. 

284 

285 Examples 

286 -------- 

287 >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c']) 

288 >>> ci2 = pd.CategoricalIndex(pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'])) 

289 >>> ci.equals(ci2) 

290 True 

291 

292 The order of elements matters. 

293 

294 >>> ci3 = pd.CategoricalIndex(['c', 'b', 'a', 'a', 'b', 'c']) 

295 >>> ci.equals(ci3) 

296 False 

297 

298 The orderedness also matters. 

299 

300 >>> ci4 = ci.as_ordered() 

301 >>> ci.equals(ci4) 

302 False 

303 

304 The categories matter, but the order of the categories matters only when 

305 ``ordered=True``. 

306 

307 >>> ci5 = ci.set_categories(['a', 'b', 'c', 'd']) 

308 >>> ci.equals(ci5) 

309 False 

310 

311 >>> ci6 = ci.set_categories(['b', 'c', 'a']) 

312 >>> ci.equals(ci6) 

313 True 

314 >>> ci_ordered = pd.CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], 

315 ... ordered=True) 

316 >>> ci2_ordered = ci_ordered.set_categories(['b', 'c', 'a']) 

317 >>> ci_ordered.equals(ci2_ordered) 

318 False 

319 """ 

320 if self.is_(other): 

321 return True 

322 

323 if not isinstance(other, Index): 

324 return False 

325 

326 try: 

327 other = self._is_dtype_compat(other) 

328 except (TypeError, ValueError): 

329 return False 

330 

331 return self._data.equals(other) 

332 

333 # -------------------------------------------------------------------- 

334 # Rendering Methods 

335 

336 @property 

337 def _formatter_func(self): 

338 return self.categories._formatter_func 

339 

340 def _format_attrs(self): 

341 """ 

342 Return a list of tuples of the (attr,formatted_value) 

343 """ 

344 attrs: list[tuple[str, str | int | bool | None]] 

345 

346 attrs = [ 

347 ( 

348 "categories", 

349 f"[{', '.join(self._data._repr_categories())}]", 

350 ), 

351 ("ordered", self.ordered), 

352 ] 

353 extra = super()._format_attrs() 

354 return attrs + extra 

355 

356 # -------------------------------------------------------------------- 

357 

358 @property 

359 def inferred_type(self) -> str: 

360 return "categorical" 

361 

362 @doc(Index.__contains__) 

363 def __contains__(self, key: Any) -> bool: 

364 # if key is a NaN, check if any NaN is in self. 

365 if is_valid_na_for_dtype(key, self.categories.dtype): 

366 return self.hasnans 

367 

368 return contains(self, key, container=self._engine) 

369 

370 def reindex( 

371 self, target, method=None, level=None, limit: int | None = None, tolerance=None 

372 ) -> tuple[Index, npt.NDArray[np.intp] | None]: 

373 """ 

374 Create index with target's values (move/add/delete values as necessary) 

375 

376 Returns 

377 ------- 

378 new_index : pd.Index 

379 Resulting index 

380 indexer : np.ndarray[np.intp] or None 

381 Indices of output values in original index 

382 

383 """ 

384 if method is not None: 

385 raise NotImplementedError( 

386 "argument method is not implemented for CategoricalIndex.reindex" 

387 ) 

388 if level is not None: 

389 raise NotImplementedError( 

390 "argument level is not implemented for CategoricalIndex.reindex" 

391 ) 

392 if limit is not None: 

393 raise NotImplementedError( 

394 "argument limit is not implemented for CategoricalIndex.reindex" 

395 ) 

396 return super().reindex(target) 

397 

398 # -------------------------------------------------------------------- 

399 # Indexing Methods 

400 

401 def _maybe_cast_indexer(self, key) -> int: 

402 # GH#41933: we have to do this instead of self._data._validate_scalar 

403 # because this will correctly get partial-indexing on Interval categories 

404 try: 

405 return self._data._unbox_scalar(key) 

406 except KeyError: 

407 if is_valid_na_for_dtype(key, self.categories.dtype): 

408 return -1 

409 raise 

410 

411 def _maybe_cast_listlike_indexer(self, values) -> CategoricalIndex: 

412 if isinstance(values, CategoricalIndex): 

413 values = values._data 

414 if isinstance(values, Categorical): 

415 # Indexing on codes is more efficient if categories are the same, 

416 # so we can apply some optimizations based on the degree of 

417 # dtype-matching. 

418 cat = self._data._encode_with_my_categories(values) 

419 codes = cat._codes 

420 else: 

421 codes = self.categories.get_indexer(values) 

422 codes = codes.astype(self.codes.dtype, copy=False) 

423 cat = self._data._from_backing_data(codes) 

424 return type(self)._simple_new(cat) 

425 

426 # -------------------------------------------------------------------- 

427 

428 def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: 

429 return self.categories._is_comparable_dtype(dtype) 

430 

431 def map(self, mapper, na_action: Literal["ignore"] | None = None): 

432 """ 

433 Map values using input an input mapping or function. 

434 

435 Maps the values (their categories, not the codes) of the index to new 

436 categories. If the mapping correspondence is one-to-one the result is a 

437 :class:`~pandas.CategoricalIndex` which has the same order property as 

438 the original, otherwise an :class:`~pandas.Index` is returned. 

439 

440 If a `dict` or :class:`~pandas.Series` is used any unmapped category is 

441 mapped to `NaN`. Note that if this happens an :class:`~pandas.Index` 

442 will be returned. 

443 

444 Parameters 

445 ---------- 

446 mapper : function, dict, or Series 

447 Mapping correspondence. 

448 

449 Returns 

450 ------- 

451 pandas.CategoricalIndex or pandas.Index 

452 Mapped index. 

453 

454 See Also 

455 -------- 

456 Index.map : Apply a mapping correspondence on an 

457 :class:`~pandas.Index`. 

458 Series.map : Apply a mapping correspondence on a 

459 :class:`~pandas.Series`. 

460 Series.apply : Apply more complex functions on a 

461 :class:`~pandas.Series`. 

462 

463 Examples 

464 -------- 

465 >>> idx = pd.CategoricalIndex(['a', 'b', 'c']) 

466 >>> idx 

467 CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'], 

468 ordered=False, dtype='category') 

469 >>> idx.map(lambda x: x.upper()) 

470 CategoricalIndex(['A', 'B', 'C'], categories=['A', 'B', 'C'], 

471 ordered=False, dtype='category') 

472 >>> idx.map({'a': 'first', 'b': 'second', 'c': 'third'}) 

473 CategoricalIndex(['first', 'second', 'third'], categories=['first', 

474 'second', 'third'], ordered=False, dtype='category') 

475 

476 If the mapping is one-to-one the ordering of the categories is 

477 preserved: 

478 

479 >>> idx = pd.CategoricalIndex(['a', 'b', 'c'], ordered=True) 

480 >>> idx 

481 CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'], 

482 ordered=True, dtype='category') 

483 >>> idx.map({'a': 3, 'b': 2, 'c': 1}) 

484 CategoricalIndex([3, 2, 1], categories=[3, 2, 1], ordered=True, 

485 dtype='category') 

486 

487 If the mapping is not one-to-one an :class:`~pandas.Index` is returned: 

488 

489 >>> idx.map({'a': 'first', 'b': 'second', 'c': 'first'}) 

490 Index(['first', 'second', 'first'], dtype='object') 

491 

492 If a `dict` is used, all unmapped categories are mapped to `NaN` and 

493 the result is an :class:`~pandas.Index`: 

494 

495 >>> idx.map({'a': 'first', 'b': 'second'}) 

496 Index(['first', 'second', nan], dtype='object') 

497 """ 

498 mapped = self._values.map(mapper, na_action=na_action) 

499 return Index(mapped, name=self.name) 

500 

501 def _concat(self, to_concat: list[Index], name: Hashable) -> Index: 

502 # if calling index is category, don't check dtype of others 

503 try: 

504 cat = Categorical._concat_same_type( 

505 [self._is_dtype_compat(c) for c in to_concat] 

506 ) 

507 except TypeError: 

508 # not all to_concat elements are among our categories (or NA) 

509 

510 res = concat_compat([x._values for x in to_concat]) 

511 return Index(res, name=name) 

512 else: 

513 return type(self)._simple_new(cat, name=name)