Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/indexes/category.py: 38%

1from __future__ import annotations

3from typing import (

4 TYPE_CHECKING,

5 Any,

6 Literal,

7 cast,

10import numpy as np

12from pandas._libs import index as libindex

13from pandas.util._decorators import (

14 cache_readonly,

15 doc,

16)

18from pandas.core.dtypes.common import is_scalar

19from pandas.core.dtypes.concat import concat_compat

20from pandas.core.dtypes.dtypes import CategoricalDtype

21from pandas.core.dtypes.missing import (

22 is_valid_na_for_dtype,

23 isna,

24)

26from pandas.core.arrays.categorical import (

27 Categorical,

28 contains,

29)

30from pandas.core.construction import extract_array

31from pandas.core.indexes.base import (

32 Index,

33 maybe_extract_name,

34)

35from pandas.core.indexes.extension import (

36 NDArrayBackedExtensionIndex,

37 inherit_names,

38)

40if TYPE_CHECKING:

41 from collections.abc import Hashable

43 from pandas._typing import (

44 Dtype,

45 DtypeObj,

46 Self,

47 npt,

48 )

51@inherit_names(

52 [

53 "argsort",

54 "tolist",

55 "codes",

56 "categories",

57 "ordered",

58 "_reverse_indexer",

59 "searchsorted",

60 "min",

61 "max",

62 ],

63 Categorical,

64)

65@inherit_names(

66 [

67 "rename_categories",

68 "reorder_categories",

69 "add_categories",

70 "remove_categories",

71 "remove_unused_categories",

72 "set_categories",

73 "as_ordered",

74 "as_unordered",

75 ],

76 Categorical,

77 wrap=True,

78)

79class CategoricalIndex(NDArrayBackedExtensionIndex):

80 """

81 Index based on an underlying :class:`Categorical`.

83 CategoricalIndex, like Categorical, can only take on a limited,

84 and usually fixed, number of possible values (`categories`). Also,

85 like Categorical, it might have an order, but numerical operations

86 (additions, divisions, ...) are not possible.

88 Parameters

89 ----------

90 data : array-like (1-dimensional)

91 The values of the categorical. If `categories` are given, values not in

92 `categories` will be replaced with NaN.

93 categories : index-like, optional

94 The categories for the categorical. Items need to be unique.

95 If the categories are not given here (and also not in `dtype`), they

96 will be inferred from the `data`.

97 ordered : bool, optional

98 Whether or not this categorical is treated as an ordered

99 categorical. If not given here or in `dtype`, the resulting

100 categorical will be unordered.

101 dtype : CategoricalDtype or "category", optional

102 If :class:`CategoricalDtype`, cannot be used together with

103 `categories` or `ordered`.

104 copy : bool, default False

105 Make a copy of input ndarray.

106 name : object, optional

107 Name to be stored in the index.

108

109 Attributes

110 ----------

111 codes

112 categories

113 ordered

114

115 Methods

116 -------

117 rename_categories

118 reorder_categories

119 add_categories

120 remove_categories

121 remove_unused_categories

122 set_categories

123 as_ordered

124 as_unordered

125 map

126

127 Raises

128 ------

129 ValueError

130 If the categories do not validate.

131 TypeError

132 If an explicit ``ordered=True`` is given but no `categories` and the

133 `values` are not sortable.

134

135 See Also

136 --------

137 Index : The base pandas Index type.

138 Categorical : A categorical array.

139 CategoricalDtype : Type for categorical data.

140

141 Notes

142 -----

143 See the `user guide

144 <https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#categoricalindex>`__

145 for more.

146

147 Examples

148 --------

149 >>> pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"])

150 CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'],

151 categories=['a', 'b', 'c'], ordered=False, dtype='category')

152

153 ``CategoricalIndex`` can also be instantiated from a ``Categorical``:

154

155 >>> c = pd.Categorical(["a", "b", "c", "a", "b", "c"])

156 >>> pd.CategoricalIndex(c)

157 CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'],

158 categories=['a', 'b', 'c'], ordered=False, dtype='category')

159

160 Ordered ``CategoricalIndex`` can have a min and max value.

161

162 >>> ci = pd.CategoricalIndex(

163 ... ["a", "b", "c", "a", "b", "c"], ordered=True, categories=["c", "b", "a"]

164 ... )

165 >>> ci

166 CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'],

167 categories=['c', 'b', 'a'], ordered=True, dtype='category')

168 >>> ci.min()

169 'c'

170 """

171

172 _typ = "categoricalindex"

173 _data_cls = Categorical

174

175 @property

176 def _can_hold_strings(self):

177 return self.categories._can_hold_strings

178

179 @cache_readonly

180 def _should_fallback_to_positional(self) -> bool:

181 return self.categories._should_fallback_to_positional

182

183 codes: np.ndarray

184 categories: Index

185 ordered: bool | None

186 _data: Categorical

187 _values: Categorical

188

189 @property

190 def _engine_type(self) -> type[libindex.IndexEngine]:

191 # self.codes can have dtype int8, int16, int32 or int64, so we need

192 # to return the corresponding engine type (libindex.Int8Engine, etc.).

193 return {

194 np.int8: libindex.Int8Engine,

195 np.int16: libindex.Int16Engine,

196 np.int32: libindex.Int32Engine,

197 np.int64: libindex.Int64Engine,

198 }[self.codes.dtype.type]

199

200 # --------------------------------------------------------------------

201 # Constructors

202

203 def __new__(

204 cls,

205 data=None,

206 categories=None,

207 ordered=None,

208 dtype: Dtype | None = None,

209 copy: bool = False,

210 name: Hashable | None = None,

211 ) -> Self:

212 name = maybe_extract_name(name, data, cls)

213

214 if is_scalar(data):

215 # GH#38944 include None here, which pre-2.0 subbed in []

216 cls._raise_scalar_data_error(data)

217

218 data = Categorical(

219 data, categories=categories, ordered=ordered, dtype=dtype, copy=copy

220 )

221

222 return cls._simple_new(data, name=name)

223

224 # --------------------------------------------------------------------

225

226 def _is_dtype_compat(self, other: Index) -> Categorical:

227 """

228 *this is an internal non-public method*

229

230 provide a comparison between the dtype of self and other (coercing if

231 needed)

232

233 Parameters

234 ----------

235 other : Index

236

237 Returns

238 -------

239 Categorical

240

241 Raises

242 ------

243 TypeError if the dtypes are not compatible

244 """

245 if isinstance(other.dtype, CategoricalDtype):

246 cat = extract_array(other)

247 cat = cast(Categorical, cat)

248 if not cat._categories_match_up_to_permutation(self._values):

249 raise TypeError(

250 "categories must match existing categories when appending"

251 )

252

253 elif other._is_multi:

254 # preempt raising NotImplementedError in isna call

255 raise TypeError("MultiIndex is not dtype-compatible with CategoricalIndex")

256 else:

257 values = other

258

259 cat = Categorical(other, dtype=self.dtype)

260 other = CategoricalIndex(cat)

261 if not other.isin(values).all():

262 raise TypeError(

263 "cannot append a non-category item to a CategoricalIndex"

264 )

265 cat = other._values

266

267 if not ((cat == values) | (isna(cat) & isna(values))).all():

268 # GH#37667 see test_equals_non_category

269 raise TypeError(

270 "categories must match existing categories when appending"

271 )

272

273 return cat

274

def equals(self, other: object) -> bool:
    """
    Determine if two CategoricalIndex objects contain the same elements.

    Parameters
    ----------
    other : object
        The object to compare against. Anything that is not an
        :class:`pandas.Index` compares as not equal.

    Returns
    -------
    bool
        ``True`` if two :class:`pandas.CategoricalIndex` objects have equal
        elements, ``False`` otherwise.

    Examples
    --------
    >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'])
    >>> ci2 = pd.CategoricalIndex(pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c']))
    >>> ci.equals(ci2)
    True

    The order of elements matters.

    >>> ci3 = pd.CategoricalIndex(['c', 'b', 'a', 'a', 'b', 'c'])
    >>> ci.equals(ci3)
    False

    The orderedness also matters.

    >>> ci4 = ci.as_ordered()
    >>> ci.equals(ci4)
    False

    The categories matter, but the order of the categories matters only when
    ``ordered=True``.

    >>> ci5 = ci.set_categories(['a', 'b', 'c', 'd'])
    >>> ci.equals(ci5)
    False

    >>> ci6 = ci.set_categories(['b', 'c', 'a'])
    >>> ci.equals(ci6)
    True
    >>> ci_ordered = pd.CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'],
    ...                                  ordered=True)
    >>> ci2_ordered = ci_ordered.set_categories(['b', 'c', 'a'])
    >>> ci_ordered.equals(ci2_ordered)
    False
    """
    # Identity shortcut first, then rule out anything that is not an Index.
    if self.is_(other):
        return True
    if not isinstance(other, Index):
        return False

    try:
        other_cat = self._is_dtype_compat(other)
    except (TypeError, ValueError):
        # An incompatible dtype means the two cannot be equal.
        return False
    else:
        return self._data.equals(other_cat)

332

333 # --------------------------------------------------------------------

334 # Rendering Methods

335

336 @property

337 def _formatter_func(self):

338 return self.categories._formatter_func

339

def _format_attrs(self):
    """
    Return a list of tuples of the (attr,formatted_value)

    The ``categories`` and ``ordered`` entries come first, followed by
    whatever the parent class reports.
    """
    categories_repr = ", ".join(self._data._repr_categories())

    own_attrs: list[tuple[str, str | int | bool | None]] = [
        ("categories", f"[{categories_repr}]"),
        ("ordered", self.ordered),
    ]
    return own_attrs + super()._format_attrs()

355

356 # --------------------------------------------------------------------

357

@property
def inferred_type(self) -> str:
    """
    Return the inferred type label, which is always ``"categorical"``.
    """
    return "categorical"

361

362 @doc(Index.__contains__)

363 def __contains__(self, key: Any) -> bool:

364 # if key is a NaN, check if any NaN is in self.

365 if is_valid_na_for_dtype(key, self.categories.dtype):

366 return self.hasnans

367

368 return contains(self, key, container=self._engine)

369

def reindex(
    self, target, method=None, level=None, limit: int | None = None, tolerance=None
) -> tuple[Index, npt.NDArray[np.intp] | None]:
    """
    Create index with target's values (move/add/delete values as necessary)

    Parameters
    ----------
    target : an iterable
        Passed unchanged to the parent class ``reindex``.
    method, level, limit : optional
        Not implemented for CategoricalIndex; each must be left as None.
    tolerance : optional
        Accepted in the signature but not forwarded to the parent class.

    Returns
    -------
    new_index : pd.Index
        Resulting index
    indexer : np.ndarray[np.intp] or None
        Indices of output values in original index

    Raises
    ------
    NotImplementedError
        If ``method``, ``level`` or ``limit`` is not None.
    """
    # Checked in signature order so the first offending argument is reported.
    unsupported = (
        (
            method,
            "argument method is not implemented for CategoricalIndex.reindex",
        ),
        (
            level,
            "argument level is not implemented for CategoricalIndex.reindex",
        ),
        (
            limit,
            "argument limit is not implemented for CategoricalIndex.reindex",
        ),
    )
    for value, message in unsupported:
        if value is not None:
            raise NotImplementedError(message)

    return super().reindex(target)

397

398 # --------------------------------------------------------------------

399 # Indexing Methods

400

401 def _maybe_cast_indexer(self, key) -> int:

402 # GH#41933: we have to do this instead of self._data._validate_scalar

403 # because this will correctly get partial-indexing on Interval categories

404 try:

405 return self._data._unbox_scalar(key)

406 except KeyError:

407 if is_valid_na_for_dtype(key, self.categories.dtype):

408 return -1

409 raise

410

411 def _maybe_cast_listlike_indexer(self, values) -> CategoricalIndex:

412 if isinstance(values, CategoricalIndex):

413 values = values._data

414 if isinstance(values, Categorical):

415 # Indexing on codes is more efficient if categories are the same,

416 # so we can apply some optimizations based on the degree of

417 # dtype-matching.

418 cat = self._data._encode_with_my_categories(values)

419 codes = cat._codes

420 else:

421 codes = self.categories.get_indexer(values)

422 codes = codes.astype(self.codes.dtype, copy=False)

423 cat = self._data._from_backing_data(codes)

424 return type(self)._simple_new(cat)

425

426 # --------------------------------------------------------------------

427

428 def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:

429 return self.categories._is_comparable_dtype(dtype)

430

431 def map(self, mapper, na_action: Literal["ignore"] | None = None):

432 """

433 Map values using input an input mapping or function.

434

435 Maps the values (their categories, not the codes) of the index to new

436 categories. If the mapping correspondence is one-to-one the result is a

437 :class:`~pandas.CategoricalIndex` which has the same order property as

438 the original, otherwise an :class:`~pandas.Index` is returned.

439

440 If a `dict` or :class:`~pandas.Series` is used any unmapped category is

441 mapped to `NaN`. Note that if this happens an :class:`~pandas.Index`

442 will be returned.

443

444 Parameters

445 ----------

446 mapper : function, dict, or Series

447 Mapping correspondence.

448

449 Returns

450 -------

451 pandas.CategoricalIndex or pandas.Index

452 Mapped index.

453

454 See Also

455 --------

456 Index.map : Apply a mapping correspondence on an

457 :class:`~pandas.Index`.

458 Series.map : Apply a mapping correspondence on a

459 :class:`~pandas.Series`.

460 Series.apply : Apply more complex functions on a

461 :class:`~pandas.Series`.

462

463 Examples

464 --------

465 >>> idx = pd.CategoricalIndex(['a', 'b', 'c'])

466 >>> idx

467 CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'],

468 ordered=False, dtype='category')

469 >>> idx.map(lambda x: x.upper())

470 CategoricalIndex(['A', 'B', 'C'], categories=['A', 'B', 'C'],

471 ordered=False, dtype='category')

472 >>> idx.map({'a': 'first', 'b': 'second', 'c': 'third'})

473 CategoricalIndex(['first', 'second', 'third'], categories=['first',

474 'second', 'third'], ordered=False, dtype='category')

475

476 If the mapping is one-to-one the ordering of the categories is

477 preserved:

478

479 >>> idx = pd.CategoricalIndex(['a', 'b', 'c'], ordered=True)

480 >>> idx

481 CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'],

482 ordered=True, dtype='category')

483 >>> idx.map({'a': 3, 'b': 2, 'c': 1})

484 CategoricalIndex([3, 2, 1], categories=[3, 2, 1], ordered=True,

485 dtype='category')

486

487 If the mapping is not one-to-one an :class:`~pandas.Index` is returned:

488

489 >>> idx.map({'a': 'first', 'b': 'second', 'c': 'first'})

490 Index(['first', 'second', 'first'], dtype='object')

491

492 If a `dict` is used, all unmapped categories are mapped to `NaN` and

493 the result is an :class:`~pandas.Index`:

494

495 >>> idx.map({'a': 'first', 'b': 'second'})

496 Index(['first', 'second', nan], dtype='object')

497 """

498 mapped = self._values.map(mapper, na_action=na_action)

499 return Index(mapped, name=self.name)

500

501 def _concat(self, to_concat: list[Index], name: Hashable) -> Index:

502 # if calling index is category, don't check dtype of others

503 try:

504 cat = Categorical._concat_same_type(

505 [self._is_dtype_compat(c) for c in to_concat]

506 )

507 except TypeError:

508 # not all to_concat elements are among our categories (or NA)

509

510 res = concat_compat([x._values for x in to_concat])

511 return Index(res, name=name)

512 else:

513 return type(self)._simple_new(cat, name=name)