from __future__ import annotations

from collections.abc import (
    Hashable,
    Sequence,
)
import itertools
from typing import (
    TYPE_CHECKING,
    Callable,
    Literal,
    cast,
)
import warnings

import numpy as np

from pandas._config import (
    using_copy_on_write,
    warn_copy_on_write,
)

from pandas._libs import (
    internals as libinternals,
    lib,
)
from pandas._libs.internals import (
    BlockPlacement,
    BlockValuesRefs,
)
from pandas._libs.tslibs import Timestamp
from pandas.errors import PerformanceWarning
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.cast import infer_dtype_from_scalar
from pandas.core.dtypes.common import (
    ensure_platform_int,
    is_1d_only_ea_dtype,
    is_list_like,
)
from pandas.core.dtypes.dtypes import (
    DatetimeTZDtype,
    ExtensionDtype,
)
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCSeries,
)
from pandas.core.dtypes.missing import (
    array_equals,
    isna,
)

import pandas.core.algorithms as algos
from pandas.core.arrays import (
    ArrowExtensionArray,
    ArrowStringArray,
    DatetimeArray,
)
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
from pandas.core.construction import (
    ensure_wrapped_if_datetimelike,
    extract_array,
)
from pandas.core.indexers import maybe_convert_indices
from pandas.core.indexes.api import (
    Index,
    ensure_index,
)
from pandas.core.internals.base import (
    DataManager,
    SingleDataManager,
    ensure_np_dtype,
    interleaved_dtype,
)
from pandas.core.internals.blocks import (
    COW_WARNING_GENERAL_MSG,
    COW_WARNING_SETITEM_MSG,
    Block,
    NumpyBlock,
    ensure_block_shape,
    extend_blocks,
    get_block_type,
    maybe_coerce_values,
    new_block,
    new_block_2d,
)
from pandas.core.internals.ops import (
    blockwise_all,
    operate_blockwise,
)

if TYPE_CHECKING:
    from pandas._typing import (
        ArrayLike,
        AxisInt,
        DtypeObj,
        QuantileInterpolation,
        Self,
        Shape,
        npt,
    )

    from pandas.api.extensions import ExtensionArray


class BaseBlockManager(DataManager):
109 """
110 Core internal data structure to implement DataFrame, Series, etc.
111
112 Manage a bunch of labeled 2D mixed-type ndarrays. Essentially it's a
113 lightweight blocked set of labeled data to be manipulated by the DataFrame
114 public API class
115
116 Attributes
117 ----------
118 shape
119 ndim
120 axes
121 values
122 items
123
124 Methods
125 -------
126 set_axis(axis, new_labels)
127 copy(deep=True)
128
129 get_dtypes
130
131 apply(func, axes, block_filter_fn)
132
133 get_bool_data
134 get_numeric_data
135
136 get_slice(slice_like, axis)
137 get(label)
138 iget(loc)
139
140 take(indexer, axis)
141 reindex_axis(new_labels, axis)
142 reindex_indexer(new_labels, indexer, axis)
143
144 delete(label)
145 insert(loc, label, value)
146 set(label, value)
147
148 Parameters
149 ----------
150 blocks: Sequence of Block
151 axes: Sequence of Index
152 verify_integrity: bool, default True
153
154 Notes
155 -----
156 This is *not* a public API class
157 """

    __slots__ = ()

    _blknos: npt.NDArray[np.intp]
    _blklocs: npt.NDArray[np.intp]
    blocks: tuple[Block, ...]
    axes: list[Index]

    @property
    def ndim(self) -> int:
        raise NotImplementedError

    _known_consolidated: bool
    _is_consolidated: bool

    def __init__(self, blocks, axes, verify_integrity: bool = True) -> None:
        raise NotImplementedError

    @classmethod
    def from_blocks(cls, blocks: list[Block], axes: list[Index]) -> Self:
        raise NotImplementedError

    @property
    def blknos(self) -> npt.NDArray[np.intp]:
        """
        Suppose we want to find the array corresponding to our i'th column.

        blknos[i] identifies the block from self.blocks that contains this column.

        blklocs[i] identifies the column of interest within
        self.blocks[self.blknos[i]]
        """
        if self._blknos is None:
            # Note: these can be altered by other BlockManager methods.
            self._rebuild_blknos_and_blklocs()

        return self._blknos

    @property
    def blklocs(self) -> npt.NDArray[np.intp]:
        """
        See blknos.__doc__
        """
        if self._blklocs is None:
            # Note: these can be altered by other BlockManager methods.
            self._rebuild_blknos_and_blklocs()

        return self._blklocs
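    # Illustrative sketch (internal layout; the concrete values are
    # hypothetical): for a frame with columns [a, b, c] backed by a float
    # block holding (a, c) and an int block holding (b,), we would have
    #     mgr.blknos  -> array([0, 1, 0])  # column i lives in blocks[blknos[i]]
    #     mgr.blklocs -> array([0, 0, 1])  # ... at row blklocs[i] of that block
    # so self.blocks[self.blknos[i]].values[self.blklocs[i]] is column i.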

    def make_empty(self, axes=None) -> Self:
        """return an empty BlockManager with the items axis of len 0"""
        if axes is None:
            axes = [Index([])] + self.axes[1:]

        # preserve dtype if possible
        if self.ndim == 1:
            assert isinstance(self, SingleBlockManager)  # for mypy
            blk = self.blocks[0]
            arr = blk.values[:0]
            bp = BlockPlacement(slice(0, 0))
            nb = blk.make_block_same_class(arr, placement=bp)
            blocks = [nb]
        else:
            blocks = []
        return type(self).from_blocks(blocks, axes)

    def __nonzero__(self) -> bool:
        return True

    # Python3 compat
    __bool__ = __nonzero__

    def _normalize_axis(self, axis: AxisInt) -> int:
        # switch axis to follow BlockManager logic
        if self.ndim == 2:
            axis = 1 if axis == 0 else 0
        return axis

    def set_axis(self, axis: AxisInt, new_labels: Index) -> None:
        # Caller is responsible for ensuring we have an Index object.
        self._validate_set_axis(axis, new_labels)
        self.axes[axis] = new_labels

    @property
    def is_single_block(self) -> bool:
        # Assumes we are 2D; overridden by SingleBlockManager
        return len(self.blocks) == 1

    @property
    def items(self) -> Index:
        return self.axes[0]

    def _has_no_reference(self, i: int) -> bool:
        """
        Check for column `i` if it has references.
        (whether it references another array or is itself being referenced)
        Returns True if the column has no references.
        """
        blkno = self.blknos[i]
        return self._has_no_reference_block(blkno)

    def _has_no_reference_block(self, blkno: int) -> bool:
        """
        Check for block `blkno` if it has references.
        (whether it references another array or is itself being referenced)
        Returns True if the block has no references.
        """
        return not self.blocks[blkno].refs.has_reference()

    def add_references(self, mgr: BaseBlockManager) -> None:
        """
        Adds the references from one manager to another. We assume that both
        managers have the same block structure.
        """
        if len(self.blocks) != len(mgr.blocks):
            # If block structure changes, then we made a copy
            return
        for i, blk in enumerate(self.blocks):
            blk.refs = mgr.blocks[i].refs
            blk.refs.add_reference(blk)

    def references_same_values(self, mgr: BaseBlockManager, blkno: int) -> bool:
        """
        Checks if two blocks from two different block managers reference the
        same underlying values.
        """
        blk = self.blocks[blkno]
        return any(blk is ref() for ref in mgr.blocks[blkno].refs.referenced_blocks)

    def get_dtypes(self) -> npt.NDArray[np.object_]:
        dtypes = np.array([blk.dtype for blk in self.blocks], dtype=object)
        return dtypes.take(self.blknos)

    @property
    def arrays(self) -> list[ArrayLike]:
        """
        Quick access to the backing arrays of the Blocks.

        Only for compatibility with ArrayManager for testing convenience.
        Not to be used in actual code, and return value is not the same as the
        ArrayManager method (list of 1D arrays vs iterator of 2D ndarrays / 1D EAs).

        Warning! The returned arrays don't handle Copy-on-Write, so this should
        be used with caution (only in read-mode).
        """
        return [blk.values for blk in self.blocks]

    def __repr__(self) -> str:
        output = type(self).__name__
        for i, ax in enumerate(self.axes):
            if i == 0:
                output += f"\nItems: {ax}"
            else:
                output += f"\nAxis {i}: {ax}"

        for block in self.blocks:
            output += f"\n{block}"
        return output

    def apply(
        self,
        f,
        align_keys: list[str] | None = None,
        **kwargs,
    ) -> Self:
        """
        Iterate over the blocks, collect and create a new BlockManager.

        Parameters
        ----------
        f : str or callable
            Name of the Block method to apply.
        align_keys : List[str] or None, default None
        **kwargs
            Keywords to pass to `f`

        Returns
        -------
        BlockManager
        """
        assert "filter" not in kwargs

        align_keys = align_keys or []
        result_blocks: list[Block] = []
        # fillna: Series/DataFrame is responsible for making sure value is aligned

        aligned_args = {k: kwargs[k] for k in align_keys}

        for b in self.blocks:
            if aligned_args:
                for k, obj in aligned_args.items():
                    if isinstance(obj, (ABCSeries, ABCDataFrame)):
                        # The caller is responsible for ensuring that
                        # obj.axes[-1].equals(self.items)
                        if obj.ndim == 1:
                            kwargs[k] = obj.iloc[b.mgr_locs.indexer]._values
                        else:
                            kwargs[k] = obj.iloc[:, b.mgr_locs.indexer]._values
                    else:
                        # otherwise we have an ndarray
                        kwargs[k] = obj[b.mgr_locs.indexer]

            if callable(f):
                applied = b.apply(f, **kwargs)
            else:
                applied = getattr(b, f)(**kwargs)
            result_blocks = extend_blocks(applied, result_blocks)

        out = type(self).from_blocks(result_blocks, self.axes)
        return out

    # Alias so we can share code with ArrayManager
    apply_with_block = apply
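    # Minimal usage sketch (internal API; for exposition only): `f` is either
    # the name of a Block method or a callable applied to each block's values,
    # e.g.
    #     new_mgr = mgr.apply("astype", dtype=object, copy=True)
    #     new_mgr = mgr.apply(lambda values: values)  # identity over blocks
    # Each call's result blocks are collected into the returned manager.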

    def setitem(self, indexer, value, warn: bool = True) -> Self:
        """
        Set values with indexer.

        For SingleBlockManager, this backs s[indexer] = value
        """
        if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim:
            raise ValueError(f"Cannot set values with ndim > {self.ndim}")

        if warn and warn_copy_on_write() and not self._has_no_reference(0):
            warnings.warn(
                COW_WARNING_GENERAL_MSG,
                FutureWarning,
                stacklevel=find_stack_level(),
            )

        elif using_copy_on_write() and not self._has_no_reference(0):
            # this method is only called if there is a single block -> hardcoded 0
            # Split blocks to only copy the columns we want to modify
            if self.ndim == 2 and isinstance(indexer, tuple):
                blk_loc = self.blklocs[indexer[1]]
                if is_list_like(blk_loc) and blk_loc.ndim == 2:
                    blk_loc = np.squeeze(blk_loc, axis=0)
                elif not is_list_like(blk_loc):
                    # Keep dimension and copy data later
                    blk_loc = [blk_loc]  # type: ignore[assignment]
                if len(blk_loc) == 0:
                    return self.copy(deep=False)

                values = self.blocks[0].values
                if values.ndim == 2:
                    values = values[blk_loc]
                    # "T" has no attribute "_iset_split_block"
                    self._iset_split_block(  # type: ignore[attr-defined]
                        0, blk_loc, values
                    )
                    # first block equals values
                    self.blocks[0].setitem((indexer[0], np.arange(len(blk_loc))), value)
                    return self
            # No need to split if we either set all columns or on a single block
            # manager
            self = self.copy()

        return self.apply("setitem", indexer=indexer, value=value)

    def diff(self, n: int) -> Self:
        # only reached with self.ndim == 2
        return self.apply("diff", n=n)

    def astype(self, dtype, copy: bool | None = False, errors: str = "raise") -> Self:
        if copy is None:
            if using_copy_on_write():
                copy = False
            else:
                copy = True
        elif using_copy_on_write():
            copy = False

        return self.apply(
            "astype",
            dtype=dtype,
            copy=copy,
            errors=errors,
            using_cow=using_copy_on_write(),
        )

    def convert(self, copy: bool | None) -> Self:
        if copy is None:
            if using_copy_on_write():
                copy = False
            else:
                copy = True
        elif using_copy_on_write():
            copy = False

        return self.apply("convert", copy=copy, using_cow=using_copy_on_write())

    def convert_dtypes(self, **kwargs):
        if using_copy_on_write():
            copy = False
        else:
            copy = True

        return self.apply(
            "convert_dtypes", copy=copy, using_cow=using_copy_on_write(), **kwargs
        )

    def get_values_for_csv(
        self, *, float_format, date_format, decimal, na_rep: str = "nan", quoting=None
    ) -> Self:
        """
        Convert values to native types (strings / python objects) that are used
        in formatting (repr / csv).
        """
        return self.apply(
            "get_values_for_csv",
            na_rep=na_rep,
            quoting=quoting,
            float_format=float_format,
            date_format=date_format,
            decimal=decimal,
        )

    @property
    def any_extension_types(self) -> bool:
        """Whether any of the blocks in this manager are extension blocks"""
        return any(block.is_extension for block in self.blocks)

    @property
    def is_view(self) -> bool:
        """return a boolean if we are a single block and are a view"""
        if len(self.blocks) == 1:
            return self.blocks[0].is_view

        # It is technically possible to figure out which blocks are views
        # e.g. [ b.values.base is not None for b in self.blocks ]
        # but then we have the case of possibly some blocks being a view
        # and some blocks not. setting in theory is possible on the non-view
        # blocks w/o causing a SettingWithCopy raise/warn. But this is a bit
        # complicated

        return False

    def _get_data_subset(self, predicate: Callable) -> Self:
        blocks = [blk for blk in self.blocks if predicate(blk.values)]
        return self._combine(blocks)

    def get_bool_data(self) -> Self:
        """
        Select blocks that are bool-dtype and columns from object-dtype blocks
        that are all-bool.
        """

        new_blocks = []

        for blk in self.blocks:
            if blk.dtype == bool:
                new_blocks.append(blk)

            elif blk.is_object:
                nbs = blk._split()
                new_blocks.extend(nb for nb in nbs if nb.is_bool)

        return self._combine(new_blocks)

    def get_numeric_data(self) -> Self:
        numeric_blocks = [blk for blk in self.blocks if blk.is_numeric]
        if len(numeric_blocks) == len(self.blocks):
            # Avoid somewhat expensive _combine
            return self
        return self._combine(numeric_blocks)

    def _combine(self, blocks: list[Block], index: Index | None = None) -> Self:
        """return a new manager with the blocks"""
        if len(blocks) == 0:
            if self.ndim == 2:
                # retain our own Index dtype
                if index is not None:
                    axes = [self.items[:0], index]
                else:
                    axes = [self.items[:0]] + self.axes[1:]
                return self.make_empty(axes)
            return self.make_empty()

        # FIXME: optimization potential
        indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks]))
        inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0])

        new_blocks: list[Block] = []
        for b in blocks:
            nb = b.copy(deep=False)
            nb.mgr_locs = BlockPlacement(inv_indexer[nb.mgr_locs.indexer])
            new_blocks.append(nb)

        axes = list(self.axes)
        if index is not None:
            axes[-1] = index
        axes[0] = self.items.take(indexer)

        return type(self).from_blocks(new_blocks, axes)

    @property
    def nblocks(self) -> int:
        return len(self.blocks)

    def copy(self, deep: bool | None | Literal["all"] = True) -> Self:
        """
        Make deep or shallow copy of BlockManager

        Parameters
        ----------
        deep : bool, string or None, default True
            If False or None, return a shallow copy (do not copy data)
            If 'all', copy data and a deep copy of the index

        Returns
        -------
        BlockManager
        """
        if deep is None:
            if using_copy_on_write():
                # use shallow copy
                deep = False
            else:
                # preserve deep copy for BlockManager with copy=None
                deep = True

        # this preserves the notion of view copying of axes
        if deep:
            # hit in e.g. tests.io.json.test_pandas

            def copy_func(ax):
                return ax.copy(deep=True) if deep == "all" else ax.view()

            new_axes = [copy_func(ax) for ax in self.axes]
        else:
            if using_copy_on_write():
                new_axes = [ax.view() for ax in self.axes]
            else:
                new_axes = list(self.axes)

        res = self.apply("copy", deep=deep)
        res.axes = new_axes

        if self.ndim > 1:
            # Avoid needing to re-compute these
            blknos = self._blknos
            if blknos is not None:
                res._blknos = blknos.copy()
                res._blklocs = self._blklocs.copy()

        if deep:
            res._consolidate_inplace()
        return res
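    # Sketch of the copy semantics (assuming a 2D manager `mgr`):
    #     mgr.copy(deep=True)    # copies the data; axes become views
    #     mgr.copy(deep="all")   # copies the data and deep-copies the axes
    #     mgr.copy(deep=False)   # new manager over the same underlying arrays
    # Under Copy-on-Write, copy(deep=None) degrades to a shallow copy because
    # the blocks' reference tracking defers the actual copy until mutation.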

    def consolidate(self) -> Self:
        """
        Join together blocks having same dtype

        Returns
        -------
        y : BlockManager
        """
        if self.is_consolidated():
            return self

        bm = type(self)(self.blocks, self.axes, verify_integrity=False)
        bm._is_consolidated = False
        bm._consolidate_inplace()
        return bm

    def reindex_indexer(
        self,
        new_axis: Index,
        indexer: npt.NDArray[np.intp] | None,
        axis: AxisInt,
        fill_value=None,
        allow_dups: bool = False,
        copy: bool | None = True,
        only_slice: bool = False,
        *,
        use_na_proxy: bool = False,
    ) -> Self:
        """
        Parameters
        ----------
        new_axis : Index
        indexer : ndarray[intp] or None
            pandas indexer with -1's only.
        axis : int
        fill_value : object, default None
        allow_dups : bool, default False
        copy : bool or None, default True
            If None, regard as False to get shallow copy.
        only_slice : bool, default False
            Whether to take views, not copies, along columns.
        use_na_proxy : bool, default False
            Whether to use a np.void ndarray for newly introduced columns.
        """
        if copy is None:
            if using_copy_on_write():
                # use shallow copy
                copy = False
            else:
                # preserve deep copy for BlockManager with copy=None
                copy = True

        if indexer is None:
            if new_axis is self.axes[axis] and not copy:
                return self

            result = self.copy(deep=copy)
            result.axes = list(self.axes)
            result.axes[axis] = new_axis
            return result

        # Should be intp, but in some cases we get int64 on 32bit builds
        assert isinstance(indexer, np.ndarray)

        # some axes don't allow reindexing with dups
        if not allow_dups:
            self.axes[axis]._validate_can_reindex(indexer)

        if axis >= self.ndim:
            raise IndexError("Requested axis not found in manager")

        if axis == 0:
            new_blocks = self._slice_take_blocks_ax0(
                indexer,
                fill_value=fill_value,
                only_slice=only_slice,
                use_na_proxy=use_na_proxy,
            )
        else:
            new_blocks = [
                blk.take_nd(
                    indexer,
                    axis=1,
                    fill_value=(
                        fill_value if fill_value is not None else blk.fill_value
                    ),
                )
                for blk in self.blocks
            ]

        new_axes = list(self.axes)
        new_axes[axis] = new_axis

        new_mgr = type(self).from_blocks(new_blocks, new_axes)
        if axis == 1:
            # We can avoid the need to rebuild these
            new_mgr._blknos = self.blknos.copy()
            new_mgr._blklocs = self.blklocs.copy()
        return new_mgr
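    # Illustrative call (internal API; values hypothetical): reindexing the
    # items of a manager whose columns are ["a", "b", "c"] onto ["a", "c", "new"]
    # would look like
    #     new_axis = Index(["a", "c", "new"])
    #     indexer = np.array([0, 2, -1], dtype=np.intp)  # -1 marks a new column
    #     mgr2 = mgr.reindex_indexer(new_axis, indexer, axis=0)
    # where the -1 entry becomes an all-NA column filled with fill_value.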

    def _slice_take_blocks_ax0(
        self,
        slice_or_indexer: slice | np.ndarray,
        fill_value=lib.no_default,
        only_slice: bool = False,
        *,
        use_na_proxy: bool = False,
        ref_inplace_op: bool = False,
    ) -> list[Block]:
        """
        Slice/take blocks along axis=0.

        Overloaded for SingleBlock

        Parameters
        ----------
        slice_or_indexer : slice or np.ndarray[int64]
        fill_value : scalar, default lib.no_default
        only_slice : bool, default False
            If True, we always return views on existing arrays, never copies.
            This is used when called from ops.blockwise.operate_blockwise.
        use_na_proxy : bool, default False
            Whether to use a np.void ndarray for newly introduced columns.
        ref_inplace_op : bool, default False
            Don't track refs if True because we operate inplace

        Returns
        -------
        new_blocks : list of Block
        """
        allow_fill = fill_value is not lib.no_default

        sl_type, slobj, sllen = _preprocess_slice_or_indexer(
            slice_or_indexer, self.shape[0], allow_fill=allow_fill
        )

        if self.is_single_block:
            blk = self.blocks[0]

            if sl_type == "slice":
                # GH#32959 EABlock would fail since we can't make 0-width
                # TODO(EA2D): special casing unnecessary with 2D EAs
                if sllen == 0:
                    return []
                bp = BlockPlacement(slice(0, sllen))
                return [blk.getitem_block_columns(slobj, new_mgr_locs=bp)]
            elif not allow_fill or self.ndim == 1:
                if allow_fill and fill_value is None:
                    fill_value = blk.fill_value

                if not allow_fill and only_slice:
                    # GH#33597 slice instead of take, so we get
                    # views instead of copies
                    blocks = [
                        blk.getitem_block_columns(
                            slice(ml, ml + 1),
                            new_mgr_locs=BlockPlacement(i),
                            ref_inplace_op=ref_inplace_op,
                        )
                        for i, ml in enumerate(slobj)
                    ]
                    return blocks
                else:
                    bp = BlockPlacement(slice(0, sllen))
                    return [
                        blk.take_nd(
                            slobj,
                            axis=0,
                            new_mgr_locs=bp,
                            fill_value=fill_value,
                        )
                    ]

        if sl_type == "slice":
            blknos = self.blknos[slobj]
            blklocs = self.blklocs[slobj]
        else:
            blknos = algos.take_nd(
                self.blknos, slobj, fill_value=-1, allow_fill=allow_fill
            )
            blklocs = algos.take_nd(
                self.blklocs, slobj, fill_value=-1, allow_fill=allow_fill
            )

        # When filling blknos, make sure blknos is updated before appending to
        # blocks list, that way new blkno is exactly len(blocks).
        blocks = []
        group = not only_slice
        for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=group):
            if blkno == -1:
                # If we've got here, fill_value was not lib.no_default

                blocks.append(
                    self._make_na_block(
                        placement=mgr_locs,
                        fill_value=fill_value,
                        use_na_proxy=use_na_proxy,
                    )
                )
            else:
                blk = self.blocks[blkno]

                # Otherwise, slicing along items axis is necessary.
                if not blk._can_consolidate and not blk._validate_ndim:
                    # i.e. we dont go through here for DatetimeTZBlock
                    # A non-consolidatable block, it's easy, because there's
                    # only one item and each mgr loc is a copy of that single
                    # item.
                    deep = not (only_slice or using_copy_on_write())
                    for mgr_loc in mgr_locs:
                        newblk = blk.copy(deep=deep)
                        newblk.mgr_locs = BlockPlacement(slice(mgr_loc, mgr_loc + 1))
                        blocks.append(newblk)

                else:
                    # GH#32779 to avoid the performance penalty of copying,
                    # we may try to only slice
                    taker = blklocs[mgr_locs.indexer]
                    max_len = max(len(mgr_locs), taker.max() + 1)
                    if only_slice or using_copy_on_write():
                        taker = lib.maybe_indices_to_slice(taker, max_len)

                    if isinstance(taker, slice):
                        nb = blk.getitem_block_columns(taker, new_mgr_locs=mgr_locs)
                        blocks.append(nb)
                    elif only_slice:
                        # GH#33597 slice instead of take, so we get
                        # views instead of copies
                        for i, ml in zip(taker, mgr_locs):
                            slc = slice(i, i + 1)
                            bp = BlockPlacement(ml)
                            nb = blk.getitem_block_columns(slc, new_mgr_locs=bp)
                            # We have np.shares_memory(nb.values, blk.values)
                            blocks.append(nb)
                    else:
                        nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs)
                        blocks.append(nb)

        return blocks

    def _make_na_block(
        self, placement: BlockPlacement, fill_value=None, use_na_proxy: bool = False
    ) -> Block:
        # Note: we only get here with self.ndim == 2

        if use_na_proxy:
            assert fill_value is None
            shape = (len(placement), self.shape[1])
            vals = np.empty(shape, dtype=np.void)
            nb = NumpyBlock(vals, placement, ndim=2)
            return nb

        if fill_value is None:
            fill_value = np.nan

        shape = (len(placement), self.shape[1])

        dtype, fill_value = infer_dtype_from_scalar(fill_value)
        block_values = make_na_array(dtype, shape, fill_value)
        return new_block_2d(block_values, placement=placement)

    def take(
        self,
        indexer: npt.NDArray[np.intp],
        axis: AxisInt = 1,
        verify: bool = True,
    ) -> Self:
        """
        Take items along any axis.

        Parameters
        ----------
        indexer : np.ndarray[np.intp]
        axis : int, default 1
        verify : bool, default True
            Check that all entries are between 0 and len(self) - 1, inclusive.
            Pass verify=False if this check has been done by the caller.

        Returns
        -------
        BlockManager
        """
        # Caller is responsible for ensuring indexer annotation is accurate

        n = self.shape[axis]
        indexer = maybe_convert_indices(indexer, n, verify=verify)

        new_labels = self.axes[axis].take(indexer)
        return self.reindex_indexer(
            new_axis=new_labels,
            indexer=indexer,
            axis=axis,
            allow_dups=True,
            copy=None,
        )


class BlockManager(libinternals.BlockManager, BaseBlockManager):
    """
    BaseBlockManager that holds 2D blocks.
    """

    ndim = 2

    # ----------------------------------------------------------------
    # Constructors

    def __init__(
        self,
        blocks: Sequence[Block],
        axes: Sequence[Index],
        verify_integrity: bool = True,
    ) -> None:
        if verify_integrity:
            # Assertion disabled for performance
            # assert all(isinstance(x, Index) for x in axes)

            for block in blocks:
                if self.ndim != block.ndim:
                    raise AssertionError(
                        f"Number of Block dimensions ({block.ndim}) must equal "
                        f"number of axes ({self.ndim})"
                    )
                # As of 2.0, the caller is responsible for ensuring that
                # DatetimeTZBlock with block.ndim == 2 has block.values.ndim == 2;
                # previously there was a special check for fastparquet compat.

            self._verify_integrity()

    def _verify_integrity(self) -> None:
        mgr_shape = self.shape
        tot_items = sum(len(x.mgr_locs) for x in self.blocks)
        for block in self.blocks:
            if block.shape[1:] != mgr_shape[1:]:
                raise_construction_error(tot_items, block.shape[1:], self.axes)
        if len(self.items) != tot_items:
            raise AssertionError(
                "Number of manager items must equal union of "
                f"block items\n# manager items: {len(self.items)}, # "
                f"tot_items: {tot_items}"
            )

    @classmethod
    def from_blocks(cls, blocks: list[Block], axes: list[Index]) -> Self:
        """
        Constructor for BlockManager and SingleBlockManager with same signature.
        """
        return cls(blocks, axes, verify_integrity=False)

    # ----------------------------------------------------------------
    # Indexing

    def fast_xs(self, loc: int) -> SingleBlockManager:
        """
        Return the array corresponding to `frame.iloc[loc]`.

        Parameters
        ----------
        loc : int

        Returns
        -------
        SingleBlockManager
        """
        if len(self.blocks) == 1:
            # TODO: this could be wrong if blk.mgr_locs is not slice(None)-like;
            # is this ruled out in the general case?
            result = self.blocks[0].iget((slice(None), loc))
            # in the case of a single block, the new block is a view
            bp = BlockPlacement(slice(0, len(result)))
            block = new_block(
                result,
                placement=bp,
                ndim=1,
                refs=self.blocks[0].refs,
            )
            return SingleBlockManager(block, self.axes[0])

        dtype = interleaved_dtype([blk.dtype for blk in self.blocks])

        n = len(self)

        if isinstance(dtype, ExtensionDtype):
            # TODO: use object dtype as workaround for non-performant
            # EA.__setitem__ methods. (primarily ArrowExtensionArray.__setitem__
            # when iteratively setting individual values)
            # https://github.com/pandas-dev/pandas/pull/54508#issuecomment-1675827918
            result = np.empty(n, dtype=object)
        else:
            result = np.empty(n, dtype=dtype)
            result = ensure_wrapped_if_datetimelike(result)

        for blk in self.blocks:
            # Such assignment may incorrectly coerce NaT to None
            # result[blk.mgr_locs] = blk._slice((slice(None), loc))
            for i, rl in enumerate(blk.mgr_locs):
                result[rl] = blk.iget((i, loc))

        if isinstance(dtype, ExtensionDtype):
            cls = dtype.construct_array_type()
            result = cls._from_sequence(result, dtype=dtype)

        bp = BlockPlacement(slice(0, len(result)))
        block = new_block(result, placement=bp, ndim=1)
        return SingleBlockManager(block, self.axes[0])
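    # Illustrative sketch (internal API; frame layout hypothetical):
    #     df = pd.DataFrame({"a": [1.0, 2.0], "b": [3, 4]})
    #     row = df._mgr.fast_xs(0)   # SingleBlockManager backing df.iloc[0]
    #     row.dtype                  # float64, the interleaved dtype
    # With a single block the row is a view on that block; with mixed dtypes
    # the values are interleaved column by column, so the row is a fresh copy.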

    def iget(self, i: int, track_ref: bool = True) -> SingleBlockManager:
        """
        Return the data as a SingleBlockManager.
        """
        block = self.blocks[self.blknos[i]]
        values = block.iget(self.blklocs[i])

        # shortcut for selecting a single dimension from a 2-dim BM
        bp = BlockPlacement(slice(0, len(values)))
        nb = type(block)(
            values, placement=bp, ndim=1, refs=block.refs if track_ref else None
        )
        return SingleBlockManager(nb, self.axes[1])

    def iget_values(self, i: int) -> ArrayLike:
        """
        Return the data for column i as the values (ndarray or ExtensionArray).

        Warning! The returned array is a view but doesn't handle Copy-on-Write,
        so this should be used with caution.
        """
        # TODO(CoW) making the arrays read-only might make this safer to use?
        block = self.blocks[self.blknos[i]]
        values = block.iget(self.blklocs[i])
        return values

    @property
    def column_arrays(self) -> list[np.ndarray]:
        """
        Used in the JSON C code to access column arrays.
        This optimizes compared to using `iget_values` by converting each
        block's values for JSON only once, instead of once per column.

        Warning! This doesn't handle Copy-on-Write, so should be used with
        caution (current use case of consuming this in the JSON code is fine).
        """
        # This is an optimized equivalent to
        # result = [self.iget_values(i) for i in range(len(self.items))]
        result: list[np.ndarray | None] = [None] * len(self.items)

        for blk in self.blocks:
            mgr_locs = blk._mgr_locs
            values = blk.array_values._values_for_json()
            if values.ndim == 1:
                # TODO(EA2D): special casing not needed with 2D EAs
                result[mgr_locs[0]] = values

            else:
                for i, loc in enumerate(mgr_locs):
                    result[loc] = values[i]

        # error: Incompatible return value type (got "List[None]",
        # expected "List[ndarray[Any, Any]]")
        return result  # type: ignore[return-value]

    def iset(
        self,
        loc: int | slice | np.ndarray,
        value: ArrayLike,
        inplace: bool = False,
        refs: BlockValuesRefs | None = None,
    ) -> None:
        """
        Set new item in-place. Does not consolidate. Adds new Block if not
        contained in the current set of items
        """

        # FIXME: refactor, clearly separate broadcasting & zip-like assignment
        # can prob also fix the various if tests for sparse/categorical
        if self._blklocs is None and self.ndim > 1:
            self._rebuild_blknos_and_blklocs()

        # Note: we exclude DTA/TDA here
        value_is_extension_type = is_1d_only_ea_dtype(value.dtype)
        if not value_is_extension_type:
            if value.ndim == 2:
                value = value.T
            else:
                value = ensure_block_shape(value, ndim=2)

            if value.shape[1:] != self.shape[1:]:
                raise AssertionError(
                    "Shape of new values must be compatible with manager shape"
                )

        if lib.is_integer(loc):
            # We have 6 tests where loc is _not_ an int.
            # In this case, get_blkno_placements will yield only one tuple,
            # containing (self._blknos[loc], BlockPlacement(slice(0, 1, 1)))

            # Check if we can use _iset_single fastpath
            loc = cast(int, loc)
            blkno = self.blknos[loc]
            blk = self.blocks[blkno]
            if len(blk._mgr_locs) == 1:  # TODO: fastest way to check this?
                return self._iset_single(
                    loc,
                    value,
                    inplace=inplace,
                    blkno=blkno,
                    blk=blk,
                    refs=refs,
                )

            # error: Incompatible types in assignment (expression has type
            # "List[Union[int, slice, ndarray]]", variable has type "Union[int,
            # slice, ndarray]")
            loc = [loc]  # type: ignore[assignment]

        # categorical/sparse/datetimetz
        if value_is_extension_type:

            def value_getitem(placement):
                return value

        else:

            def value_getitem(placement):
                return value[placement.indexer]

        # Accessing public blknos ensures the public versions are initialized
        blknos = self.blknos[loc]
        blklocs = self.blklocs[loc].copy()

        unfit_mgr_locs = []
        unfit_val_locs = []
        removed_blknos = []
        for blkno_l, val_locs in libinternals.get_blkno_placements(blknos, group=True):
            blk = self.blocks[blkno_l]
            blk_locs = blklocs[val_locs.indexer]
            if inplace and blk.should_store(value):
                # Updating inplace -> check if we need to do Copy-on-Write
                if using_copy_on_write() and not self._has_no_reference_block(blkno_l):
                    self._iset_split_block(
                        blkno_l, blk_locs, value_getitem(val_locs), refs=refs
                    )
                else:
                    blk.set_inplace(blk_locs, value_getitem(val_locs))
                    continue
            else:
                unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs])
                unfit_val_locs.append(val_locs)

                # If all block items are unfit, schedule the block for removal.
                if len(val_locs) == len(blk.mgr_locs):
                    removed_blknos.append(blkno_l)
                    continue
                else:
                    # Defer setting the new values to enable consolidation
                    self._iset_split_block(blkno_l, blk_locs, refs=refs)

        if len(removed_blknos):
            # Remove blocks & update blknos accordingly
            is_deleted = np.zeros(self.nblocks, dtype=np.bool_)
            is_deleted[removed_blknos] = True

            new_blknos = np.empty(self.nblocks, dtype=np.intp)
            new_blknos.fill(-1)
            new_blknos[~is_deleted] = np.arange(self.nblocks - len(removed_blknos))
            self._blknos = new_blknos[self._blknos]
            self.blocks = tuple(
                blk for i, blk in enumerate(self.blocks) if i not in set(removed_blknos)
            )

        if unfit_val_locs:
            unfit_idxr = np.concatenate(unfit_mgr_locs)
            unfit_count = len(unfit_idxr)

            new_blocks: list[Block] = []
            if value_is_extension_type:
                # This code (ab-)uses the fact that EA blocks contain only
                # one item.
                # TODO(EA2D): special casing unnecessary with 2D EAs
                new_blocks.extend(
                    new_block_2d(
                        values=value,
                        placement=BlockPlacement(slice(mgr_loc, mgr_loc + 1)),
                        refs=refs,
                    )
                    for mgr_loc in unfit_idxr
                )

                self._blknos[unfit_idxr] = np.arange(unfit_count) + len(self.blocks)
                self._blklocs[unfit_idxr] = 0

            else:
                # unfit_val_locs contains BlockPlacement objects
                unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:])

                new_blocks.append(
                    new_block_2d(
                        values=value_getitem(unfit_val_items),
                        placement=BlockPlacement(unfit_idxr),
                        refs=refs,
                    )
                )

                self._blknos[unfit_idxr] = len(self.blocks)
                self._blklocs[unfit_idxr] = np.arange(unfit_count)

            self.blocks += tuple(new_blocks)

            # Newly created block's dtype may already be present.
            self._known_consolidated = False

    def _iset_split_block(
        self,
        blkno_l: int,
        blk_locs: np.ndarray | list[int],
        value: ArrayLike | None = None,
        refs: BlockValuesRefs | None = None,
    ) -> None:
        """Removes columns from a block by splitting the block.

        Avoids copying the whole block through slicing and updates the manager
        after determining the new block structure. Optionally adds a new block,
        otherwise this has to be done by the caller.

        Parameters
        ----------
        blkno_l: The block number to operate on, relevant for updating the manager
        blk_locs: The locations of our block that should be deleted.
        value: The value to set as a replacement.
        refs: The reference tracking object of the value to set.
        """
        blk = self.blocks[blkno_l]

        if self._blklocs is None:
            self._rebuild_blknos_and_blklocs()

        nbs_tup = tuple(blk.delete(blk_locs))
        if value is not None:
            locs = blk.mgr_locs.as_array[blk_locs]
            first_nb = new_block_2d(value, BlockPlacement(locs), refs=refs)
        else:
            first_nb = nbs_tup[0]
            nbs_tup = tuple(nbs_tup[1:])

        nr_blocks = len(self.blocks)
        blocks_tup = (
            self.blocks[:blkno_l] + (first_nb,) + self.blocks[blkno_l + 1 :] + nbs_tup
        )
        self.blocks = blocks_tup

        if not nbs_tup and value is not None:
            # No need to update anything if split did not happen
            return

        self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb))

        for i, nb in enumerate(nbs_tup):
            self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb))
            self._blknos[nb.mgr_locs.indexer] = i + nr_blocks

    def _iset_single(
        self,
        loc: int,
        value: ArrayLike,
        inplace: bool,
        blkno: int,
        blk: Block,
        refs: BlockValuesRefs | None = None,
    ) -> None:
        """
        Fastpath for iset when we are only setting a single position and
        the Block currently in that position is itself single-column.

        In this case we can swap out the entire Block and blklocs and blknos
        are unaffected.
        """
        # Caller is responsible for verifying value.shape

        if inplace and blk.should_store(value):
            copy = False
            if using_copy_on_write() and not self._has_no_reference_block(blkno):
                # perform Copy-on-Write and clear the reference
                copy = True
            iloc = self.blklocs[loc]
            blk.set_inplace(slice(iloc, iloc + 1), value, copy=copy)
            return

        nb = new_block_2d(value, placement=blk._mgr_locs, refs=refs)
        old_blocks = self.blocks
        new_blocks = old_blocks[:blkno] + (nb,) + old_blocks[blkno + 1 :]
        self.blocks = new_blocks
        return

    def column_setitem(
        self, loc: int, idx: int | slice | np.ndarray, value, inplace_only: bool = False
    ) -> None:
        """
        Set values ("setitem") into a single column (not setting the full column).

        This is a method on the BlockManager level, to avoid creating an
        intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`)
        """
        needs_to_warn = False
        if warn_copy_on_write() and not self._has_no_reference(loc):
            if not isinstance(
                self.blocks[self.blknos[loc]].values,
                (ArrowExtensionArray, ArrowStringArray),
            ):
                # We might raise if we are in an expansion case, so defer
                # warning till we actually updated
                needs_to_warn = True

        elif using_copy_on_write() and not self._has_no_reference(loc):
            blkno = self.blknos[loc]
            # Split blocks to only copy the column we want to modify
            blk_loc = self.blklocs[loc]
            # Copy our values
            values = self.blocks[blkno].values
            if values.ndim == 1:
                values = values.copy()
            else:
                # Use [blk_loc] as indexer to keep ndim=2, this already results in a
                # copy
                values = values[[blk_loc]]
            self._iset_split_block(blkno, [blk_loc], values)

        # this manager is only created temporarily to mutate the values in place
        # so don't track references, otherwise the `setitem` would perform CoW again
        col_mgr = self.iget(loc, track_ref=False)
        if inplace_only:
            col_mgr.setitem_inplace(idx, value)
        else:
            new_mgr = col_mgr.setitem((idx,), value)
            self.iset(loc, new_mgr._block.values, inplace=True)

        if needs_to_warn:
            warnings.warn(
                COW_WARNING_GENERAL_MSG,
                FutureWarning,
                stacklevel=find_stack_level(),
            )

    def insert(self, loc: int, item: Hashable, value: ArrayLike, refs=None) -> None:
        """
        Insert item at selected position.

        Parameters
        ----------
        loc : int
        item : hashable
        value : np.ndarray or ExtensionArray
        refs : The reference tracking object of the value to set.
        """
        with warnings.catch_warnings():
            # TODO: re-issue this with setitem-specific message?
            warnings.filterwarnings(
                "ignore",
                "The behavior of Index.insert with object-dtype is deprecated",
                category=FutureWarning,
            )
            new_axis = self.items.insert(loc, item)

        if value.ndim == 2:
            value = value.T
            if len(value) > 1:
                raise ValueError(
                    f"Expected a 1D array, got an array with shape {value.T.shape}"
                )
        else:
            value = ensure_block_shape(value, ndim=self.ndim)

        bp = BlockPlacement(slice(loc, loc + 1))
        block = new_block_2d(values=value, placement=bp, refs=refs)

        if not len(self.blocks):
            # Fastpath
            self._blklocs = np.array([0], dtype=np.intp)
            self._blknos = np.array([0], dtype=np.intp)
        else:
            self._insert_update_mgr_locs(loc)
            self._insert_update_blklocs_and_blknos(loc)

        self.axes[0] = new_axis
        self.blocks += (block,)

        self._known_consolidated = False

        if sum(not block.is_extension for block in self.blocks) > 100:
            warnings.warn(
                "DataFrame is highly fragmented. This is usually the result "
                "of calling `frame.insert` many times, which has poor performance. "
                "Consider joining all columns at once using pd.concat(axis=1) "
                "instead. To get a de-fragmented frame, use `newframe = frame.copy()`",
                PerformanceWarning,
                stacklevel=find_stack_level(),
            )

    def _insert_update_mgr_locs(self, loc) -> None:
        """
        When inserting a new Block at location 'loc', we increment
        all of the mgr_locs of blocks above that by one.
        """
        for blkno, count in _fast_count_smallints(self.blknos[loc:]):
            # .620 this way, .326 of which is in increment_above
            blk = self.blocks[blkno]
            blk._mgr_locs = blk._mgr_locs.increment_above(loc)

    def _insert_update_blklocs_and_blknos(self, loc) -> None:
        """
        When inserting a new Block at location 'loc', we update our
        _blklocs and _blknos.
        """

        # Accessing public blklocs ensures the public versions are initialized
        if loc == self.blklocs.shape[0]:
            # np.append is a lot faster, let's use it if we can.
            self._blklocs = np.append(self._blklocs, 0)
            self._blknos = np.append(self._blknos, len(self.blocks))
        elif loc == 0:
            # np.append is a lot faster, let's use it if we can.
            self._blklocs = np.append(self._blklocs[::-1], 0)[::-1]
            self._blknos = np.append(self._blknos[::-1], len(self.blocks))[::-1]
        else:
            new_blklocs, new_blknos = libinternals.update_blklocs_and_blknos(
                self.blklocs, self.blknos, loc, len(self.blocks)
            )
            self._blklocs = new_blklocs
            self._blknos = new_blknos

    def idelete(self, indexer) -> BlockManager:
        """
        Delete selected locations, returning a new BlockManager.
        """
        is_deleted = np.zeros(self.shape[0], dtype=np.bool_)
        is_deleted[indexer] = True
        taker = (~is_deleted).nonzero()[0]

        nbs = self._slice_take_blocks_ax0(taker, only_slice=True, ref_inplace_op=True)
        new_columns = self.items[~is_deleted]
        axes = [new_columns, self.axes[1]]
        return type(self)(tuple(nbs), axes, verify_integrity=False)
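    # Sketch (internal API; shapes hypothetical): deleting column 1 of a
    # three-column manager keeps taker == [0, 2], and only_slice=True means
    # the surviving columns stay views on the original blocks:
    #     new_mgr = mgr.idelete(np.array([1]))
    # ref_inplace_op=True skips reference tracking since this backs in-place
    # deletion (e.g. `del df[col]`), where the old manager is discarded.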

    # ----------------------------------------------------------------
    # Block-wise Operation

    def grouped_reduce(self, func: Callable) -> Self:
        """
        Apply grouped reduction function blockwise, returning a new BlockManager.

        Parameters
        ----------
        func : grouped reduction function

        Returns
        -------
        BlockManager
        """
        result_blocks: list[Block] = []

        for blk in self.blocks:
            if blk.is_object:
                # split on object-dtype blocks bc some columns may raise
                # while others do not.
                for sb in blk._split():
                    applied = sb.apply(func)
                    result_blocks = extend_blocks(applied, result_blocks)
            else:
                applied = blk.apply(func)
                result_blocks = extend_blocks(applied, result_blocks)

        if len(result_blocks) == 0:
            nrows = 0
        else:
            nrows = result_blocks[0].values.shape[-1]
        index = Index(range(nrows))

        return type(self).from_blocks(result_blocks, [self.axes[0], index])

    def reduce(self, func: Callable) -> Self:
        """
        Apply reduction function blockwise, returning a single-row BlockManager.

        Parameters
        ----------
        func : reduction function

        Returns
        -------
        BlockManager
        """
        # If 2D, we assume that we're operating column-wise
        assert self.ndim == 2

        res_blocks: list[Block] = []
        for blk in self.blocks:
            nbs = blk.reduce(func)
            res_blocks.extend(nbs)

        index = Index([None])  # placeholder
        new_mgr = type(self).from_blocks(res_blocks, [self.items, index])
        return new_mgr

    def operate_blockwise(self, other: BlockManager, array_op) -> BlockManager:
        """
        Apply array_op blockwise with another (aligned) BlockManager.
        """
        return operate_blockwise(self, other, array_op)

    def _equal_values(self: BlockManager, other: BlockManager) -> bool:
        """
        Used in .equals defined in base class. Only check the column values
        assuming shape and indexes have already been checked.
        """
        return blockwise_all(self, other, array_equals)

    def quantile(
        self,
        *,
        qs: Index,  # with dtype float64
        interpolation: QuantileInterpolation = "linear",
    ) -> Self:
        """
        Iterate over blocks applying quantile reduction.
        This routine is intended for reduction type operations and
        will do inference on the generated blocks.

        Parameters
        ----------
        interpolation : type of interpolation, default 'linear'
        qs : list of the quantiles to be computed

        Returns
        -------
        BlockManager
        """
        # Series dispatches to DataFrame for quantile, which allows us to
        # simplify some of the code here and in the blocks
        assert self.ndim >= 2
        assert is_list_like(qs)  # caller is responsible for this

        new_axes = list(self.axes)
        new_axes[1] = Index(qs, dtype=np.float64)

        blocks = [
            blk.quantile(qs=qs, interpolation=interpolation) for blk in self.blocks
        ]

        return type(self)(blocks, new_axes)

    # ----------------------------------------------------------------

    def unstack(self, unstacker, fill_value) -> BlockManager:
        """
        Return a BlockManager with all blocks unstacked.

        Parameters
        ----------
        unstacker : reshape._Unstacker
        fill_value : Any
            fill_value for newly introduced missing values.

        Returns
        -------
        unstacked : BlockManager
        """
        new_columns = unstacker.get_new_columns(self.items)
        new_index = unstacker.new_index

        allow_fill = not unstacker.mask_all
        if allow_fill:
            # calculating the full mask once and passing it to Block._unstack is
            # faster than recalculating it in each repeated call
            new_mask2D = (~unstacker.mask).reshape(*unstacker.full_shape)
            needs_masking = new_mask2D.any(axis=0)
        else:
            needs_masking = np.zeros(unstacker.full_shape[1], dtype=bool)

        new_blocks: list[Block] = []
        columns_mask: list[np.ndarray] = []

        if len(self.items) == 0:
            factor = 1
        else:
            fac = len(new_columns) / len(self.items)
            assert fac == int(fac)
            factor = int(fac)

        for blk in self.blocks:
            mgr_locs = blk.mgr_locs
            new_placement = mgr_locs.tile_for_unstack(factor)

            blocks, mask = blk._unstack(
                unstacker,
                fill_value,
                new_placement=new_placement,
                needs_masking=needs_masking,
            )

            new_blocks.extend(blocks)
            columns_mask.extend(mask)

            # Block._unstack should ensure this holds,
            assert mask.sum() == sum(len(nb._mgr_locs) for nb in blocks)
            # In turn this ensures that in the BlockManager call below
            # we have len(new_columns) == sum(x.shape[0] for x in new_blocks)
            # which suffices to allow us to pass verify_integrity=False

        new_columns = new_columns[columns_mask]

        bm = BlockManager(new_blocks, [new_columns, new_index], verify_integrity=False)
        return bm

    def to_dict(self) -> dict[str, Self]:
        """
        Return a dict of str(dtype) -> BlockManager

        Returns
        -------
        values : a dict of dtype -> BlockManager
        """

        bd: dict[str, list[Block]] = {}
        for b in self.blocks:
            bd.setdefault(str(b.dtype), []).append(b)

        # TODO(EA2D): the combine will be unnecessary with 2D EAs
        return {dtype: self._combine(blocks) for dtype, blocks in bd.items()}
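    # Illustrative result (layout hypothetical): for a frame with two float64
    # columns and one int64 column,
    #     mgr.to_dict()
    #     -> {"float64": <manager with 2 items>, "int64": <manager with 1 item>}
    # with each sub-manager re-labeled via _combine so its items are contiguous.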

    def as_array(
        self,
        dtype: np.dtype | None = None,
        copy: bool = False,
        na_value: object = lib.no_default,
    ) -> np.ndarray:
        """
        Convert the blockmanager data into a numpy array.

        Parameters
        ----------
        dtype : np.dtype or None, default None
            Data type of the return array.
        copy : bool, default False
            If True then guarantee that a copy is returned. A value of
            False does not guarantee that the underlying data is not
            copied.
        na_value : object, default lib.no_default
            Value to be used as the missing value sentinel.

        Returns
        -------
        arr : ndarray
        """
        passed_nan = lib.is_float(na_value) and isna(na_value)

        if len(self.blocks) == 0:
            arr = np.empty(self.shape, dtype=float)
            return arr.transpose()

        if self.is_single_block:
            blk = self.blocks[0]

            if na_value is not lib.no_default:
                # We want to copy when na_value is provided to avoid
                # mutating the original object
                if lib.is_np_dtype(blk.dtype, "f") and passed_nan:
                    # We are already numpy-float and na_value=np.nan
                    pass
                else:
                    copy = True

            if blk.is_extension:
                # Avoid implicit conversion of extension blocks to object

                # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no
                # attribute "to_numpy"
                arr = blk.values.to_numpy(  # type: ignore[union-attr]
                    dtype=dtype,
                    na_value=na_value,
                    copy=copy,
                ).reshape(blk.shape)
            elif not copy:
                arr = np.asarray(blk.values, dtype=dtype)
            else:
                arr = np.array(blk.values, dtype=dtype, copy=copy)

            if using_copy_on_write() and not copy:
                arr = arr.view()
                arr.flags.writeable = False
        else:
            arr = self._interleave(dtype=dtype, na_value=na_value)
            # The underlying data was copied within _interleave, so no need
            # to further copy if copy=True or setting na_value

        if na_value is lib.no_default:
            pass
        elif arr.dtype.kind == "f" and passed_nan:
            pass
        else:
            arr[isna(arr)] = na_value

        return arr.transpose()
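    # Minimal sketch of the copy behavior (internal API; frame hypothetical):
    #     df = pd.DataFrame({"a": [1.0, 2.0]})   # single float block
    #     arr = df._mgr.as_array()                # may be a (read-only) view
    #     arr = df._mgr.as_array(copy=True)       # always a fresh ndarray
    #     arr = df._mgr.as_array(na_value=0.0)    # forces a copy before filling
    # Mixed-dtype managers always go through _interleave and therefore copy.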

    def _interleave(
        self,
        dtype: np.dtype | None = None,
        na_value: object = lib.no_default,
    ) -> np.ndarray:
        """
        Return ndarray from blocks with specified item order
        Items must be contained in the blocks
        """
        if not dtype:
            # Incompatible types in assignment (expression has type
            # "Optional[Union[dtype[Any], ExtensionDtype]]", variable has
            # type "Optional[dtype[Any]]")
            dtype = interleaved_dtype(  # type: ignore[assignment]
                [blk.dtype for blk in self.blocks]
            )

        # error: Argument 1 to "ensure_np_dtype" has incompatible type
        # "Optional[dtype[Any]]"; expected "Union[dtype[Any], ExtensionDtype]"
        dtype = ensure_np_dtype(dtype)  # type: ignore[arg-type]
        result = np.empty(self.shape, dtype=dtype)

        itemmask = np.zeros(self.shape[0])

        if dtype == np.dtype("object") and na_value is lib.no_default:
            # much more performant than using to_numpy below
            for blk in self.blocks:
                rl = blk.mgr_locs
                arr = blk.get_values(dtype)
                result[rl.indexer] = arr
                itemmask[rl.indexer] = 1
            return result

        for blk in self.blocks:
            rl = blk.mgr_locs
            if blk.is_extension:
                # Avoid implicit conversion of extension blocks to object

                # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no
                # attribute "to_numpy"
                arr = blk.values.to_numpy(  # type: ignore[union-attr]
                    dtype=dtype,
                    na_value=na_value,
                )
            else:
                arr = blk.get_values(dtype)
            result[rl.indexer] = arr
            itemmask[rl.indexer] = 1

        if not itemmask.all():
            raise AssertionError("Some items were not contained in blocks")

        return result

    # ----------------------------------------------------------------
    # Consolidation

    def is_consolidated(self) -> bool:
        """
        Return True if there is at most one block per dtype, i.e. the
        manager is consolidated.
        """
        if not self._known_consolidated:
            self._consolidate_check()
        return self._is_consolidated

    def _consolidate_check(self) -> None:
        if len(self.blocks) == 1:
            # fastpath
            self._is_consolidated = True
            self._known_consolidated = True
            return
        dtypes = [blk.dtype for blk in self.blocks if blk._can_consolidate]
        self._is_consolidated = len(dtypes) == len(set(dtypes))
        self._known_consolidated = True

    def _consolidate_inplace(self) -> None:
        # In general, _consolidate_inplace should only be called via
        # DataFrame._consolidate_inplace, otherwise we will fail to invalidate
        # the DataFrame's _item_cache. The exception is for newly-created
        # BlockManager objects not yet attached to a DataFrame.
        if not self.is_consolidated():
            self.blocks = _consolidate(self.blocks)
            self._is_consolidated = True
            self._known_consolidated = True
            self._rebuild_blknos_and_blklocs()
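    # Sketch of what consolidation does (internal; block counts hypothetical):
    # repeated inserts can leave several blocks of the same dtype, e.g.
    #     mgr.nblocks             # 3, two of them float64
    #     mgr._consolidate_inplace()
    #     mgr.nblocks             # 2, the float64 blocks merged into one
    # after which blknos/blklocs are rebuilt to match the new block layout.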

    # ----------------------------------------------------------------
    # Concatenation

    @classmethod
    def concat_horizontal(cls, mgrs: list[Self], axes: list[Index]) -> Self:
        """
        Concatenate uniformly-indexed BlockManagers horizontally.
        """
        offset = 0
        blocks: list[Block] = []
        for mgr in mgrs:
            for blk in mgr.blocks:
                # We need to do getitem_block here otherwise we would be altering
                # blk.mgr_locs in place, which would render it invalid. This is only
                # relevant in the copy=False case.
                nb = blk.slice_block_columns(slice(None))
                nb._mgr_locs = nb._mgr_locs.add(offset)
                blocks.append(nb)

            offset += len(mgr.items)

        new_mgr = cls(tuple(blocks), axes)
        return new_mgr

    @classmethod
    def concat_vertical(cls, mgrs: list[Self], axes: list[Index]) -> Self:
        """
        Concatenate uniformly-indexed BlockManagers vertically.
        """
        raise NotImplementedError("This logic lives (for now) in internals.concat")


class SingleBlockManager(BaseBlockManager, SingleDataManager):
    """manage a single block with a single axis (the Series case)"""

    @property
    def ndim(self) -> Literal[1]:
        return 1

    _is_consolidated = True
    _known_consolidated = True
    __slots__ = ()
    is_single_block = True

    def __init__(
        self,
        block: Block,
        axis: Index,
        verify_integrity: bool = False,
    ) -> None:
        # Assertions disabled for performance
        # assert isinstance(block, Block), type(block)
        # assert isinstance(axis, Index), type(axis)

        self.axes = [axis]
        self.blocks = (block,)

    @classmethod
    def from_blocks(
        cls,
        blocks: list[Block],
        axes: list[Index],
    ) -> Self:
        """
        Constructor for BlockManager and SingleBlockManager with same signature.
        """
        assert len(blocks) == 1
        assert len(axes) == 1
        return cls(blocks[0], axes[0], verify_integrity=False)

    @classmethod
    def from_array(
        cls, array: ArrayLike, index: Index, refs: BlockValuesRefs | None = None
    ) -> SingleBlockManager:
        """
        Constructor for the case where we have an array that is not yet a Block.
        """
        array = maybe_coerce_values(array)
        bp = BlockPlacement(slice(0, len(index)))
        block = new_block(array, placement=bp, ndim=1, refs=refs)
        return cls(block, index)
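    # Illustrative construction (internal API):
    #     arr = np.array([1, 2, 3])
    #     idx = pd.Index(["a", "b", "c"])
    #     smgr = SingleBlockManager.from_array(arr, idx)
    # which wraps the array in a 1D block placed over slice(0, 3); passing
    # `refs` ties the new block into an existing Copy-on-Write reference chain.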

    def to_2d_mgr(self, columns: Index) -> BlockManager:
        """
        Manager analogue of Series.to_frame
        """
        blk = self.blocks[0]
        arr = ensure_block_shape(blk.values, ndim=2)
        bp = BlockPlacement(0)
        new_blk = type(blk)(arr, placement=bp, ndim=2, refs=blk.refs)
        axes = [columns, self.axes[0]]
        return BlockManager([new_blk], axes=axes, verify_integrity=False)

    def _has_no_reference(self, i: int = 0) -> bool:
        """
        Check for column `i` if it has references.
        (whether it references another array or is itself being referenced)
        Returns True if the column has no references.
        """
        return not self.blocks[0].refs.has_reference()

    def __getstate__(self):
        block_values = [b.values for b in self.blocks]
        block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks]
        axes_array = list(self.axes)

        extra_state = {
            "0.14.1": {
                "axes": axes_array,
                "blocks": [
                    {"values": b.values, "mgr_locs": b.mgr_locs.indexer}
                    for b in self.blocks
                ],
            }
        }

        # First three elements of the state are to maintain forward
        # compatibility with 0.13.1.
        return axes_array, block_values, block_items, extra_state

    def __setstate__(self, state) -> None:
        def unpickle_block(values, mgr_locs, ndim: int) -> Block:
            # TODO(EA2D): ndim would be unnecessary with 2D EAs
            # older pickles may store e.g. DatetimeIndex instead of DatetimeArray
            values = extract_array(values, extract_numpy=True)
            if not isinstance(mgr_locs, BlockPlacement):
                mgr_locs = BlockPlacement(mgr_locs)

            values = maybe_coerce_values(values)
            return new_block(values, placement=mgr_locs, ndim=ndim)

        if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]:
            state = state[3]["0.14.1"]
            self.axes = [ensure_index(ax) for ax in state["axes"]]
            ndim = len(self.axes)
            self.blocks = tuple(
                unpickle_block(b["values"], b["mgr_locs"], ndim=ndim)
                for b in state["blocks"]
            )
        else:
            raise NotImplementedError("pre-0.14.1 pickles are no longer supported")

        self._post_setstate()
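
    # The supported pickle layout (written by __getstate__ above) is a
    # 4-tuple whose fourth element carries the "0.14.1" dict; schematically:
    #
    #   (axes, block_values, block_items,
    #    {"0.14.1": {"axes": [...],
    #                "blocks": [{"values": ..., "mgr_locs": ...}, ...]}})
    #
    # Only the "0.14.1" dict is consumed on load; anything older raises
    # NotImplementedError.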

    def _post_setstate(self) -> None:
        pass

    @cache_readonly
    def _block(self) -> Block:
        return self.blocks[0]

    @property
    def _blknos(self):
        """compat with BlockManager"""
        return None

    @property
    def _blklocs(self):
        """compat with BlockManager"""
        return None

    def get_rows_with_mask(self, indexer: npt.NDArray[np.bool_]) -> Self:
        # similar to get_slice, but not restricted to slice indexer
        blk = self._block
        if using_copy_on_write() and len(indexer) > 0 and indexer.all():
            return type(self)(blk.copy(deep=False), self.index)
        array = blk.values[indexer]

        if isinstance(indexer, np.ndarray) and indexer.dtype.kind == "b":
            # boolean indexing always gives a copy with numpy
            refs = None
        else:
            # TODO(CoW) in theory only need to track reference if new_array is a view
            refs = blk.refs

        bp = BlockPlacement(slice(0, len(array)))
        block = type(blk)(array, placement=bp, ndim=1, refs=refs)

        new_idx = self.index[indexer]
        return type(self)(block, new_idx)
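
    # Hedged sketch of the refs logic above: a boolean ndarray mask always
    # makes numpy copy, so no references need tracking; other indexers may
    # produce views, so the parent block's refs are propagated:
    #
    #   mask = np.array([True, False, True])
    #   sub = mgr.get_rows_with_mask(mask)   # refs=None -> independent copy
    #
    # (mgr is any SingleBlockManager; illustrative only.)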

    def get_slice(self, slobj: slice, axis: AxisInt = 0) -> SingleBlockManager:
        # Assertion disabled for performance
        # assert isinstance(slobj, slice), type(slobj)
        if axis >= self.ndim:
            raise IndexError("Requested axis not found in manager")

        blk = self._block
        array = blk.values[slobj]
        bp = BlockPlacement(slice(0, len(array)))
        # TODO this method is only used in groupby SeriesSplitter at the moment,
        # so passing refs is not yet covered by the tests
        block = type(blk)(array, placement=bp, ndim=1, refs=blk.refs)
        new_index = self.index._getitem_slice(slobj)
        return type(self)(block, new_index)

    @property
    def index(self) -> Index:
        return self.axes[0]

    @property
    def dtype(self) -> DtypeObj:
        return self._block.dtype

    def get_dtypes(self) -> npt.NDArray[np.object_]:
        return np.array([self._block.dtype], dtype=object)

    def external_values(self):
        """The array that Series.values returns"""
        return self._block.external_values()

    def internal_values(self):
        """The array that Series._values returns"""
        return self._block.values

    def array_values(self) -> ExtensionArray:
        """The array that Series.array returns"""
        return self._block.array_values

    def get_numeric_data(self) -> Self:
        if self._block.is_numeric:
            return self.copy(deep=False)
        return self.make_empty()

    @property
    def _can_hold_na(self) -> bool:
        return self._block._can_hold_na

    def setitem_inplace(self, indexer, value, warn: bool = True) -> None:
        """
        Set values with indexer.

        For Single[Block/Array]Manager, this backs s[indexer] = value

        This is an inplace version of `setitem()`, mutating the manager/values
        in place, not returning a new Manager (and Block), and thus never
        changing the dtype.
        """
        using_cow = using_copy_on_write()
        warn_cow = warn_copy_on_write()
        if (using_cow or warn_cow) and not self._has_no_reference(0):
            if using_cow:
                self.blocks = (self._block.copy(),)
                self._cache.clear()
            elif warn_cow and warn:
                warnings.warn(
                    COW_WARNING_SETITEM_MSG,
                    FutureWarning,
                    stacklevel=find_stack_level(),
                )

        super().setitem_inplace(indexer, value)
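
    # Hedged sketch of the CoW branch above: if the block's values are
    # shared with another object, Copy-on-Write copies the block before
    # mutating; in warning mode a FutureWarning is emitted instead:
    #
    #   ser = pd.Series([1, 2, 3])
    #   view = ser[:]                      # shares the block's values
    #   ser._mgr.setitem_inplace(0, 10)    # copies first under CoW
    #
    # (pd.Series is used for illustration; this is the path behind
    # ser[indexer] = value.)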

    def idelete(self, indexer) -> SingleBlockManager:
        """
        Delete single location from SingleBlockManager.

        Ensures that self.blocks doesn't become empty.
        """
        nb = self._block.delete(indexer)[0]
        self.blocks = (nb,)
        self.axes[0] = self.axes[0].delete(indexer)
        self._cache.clear()
        return self
    def fast_xs(self, loc):
        """
        Fast path for getting a cross-section; would return a view of the
        data. Not supported for SingleBlockManager.
        """
        raise NotImplementedError("Use series._values[loc] instead")

    def set_values(self, values: ArrayLike) -> None:
        """
        Set the values of the single block in place.

        Use at your own risk! This does not check if the passed values are
        valid for the current Block/SingleBlockManager (length, dtype, etc),
        and this does not properly keep track of references.
        """
        # NOTE(CoW) Currently this is only used for FrameColumnApply.series_generator
        # which handles CoW by setting the refs manually if necessary
        self.blocks[0].values = values
        self.blocks[0]._mgr_locs = BlockPlacement(slice(len(values)))

    def _equal_values(self, other: Self) -> bool:
        """
        Used in .equals defined in base class. Only check the column values
        assuming shape and indexes have already been checked.
        """
        # For SingleBlockManager (i.e. Series)
        if other.ndim != 1:
            return False
        left = self.blocks[0].values
        right = other.blocks[0].values
        return array_equals(left, right)


# --------------------------------------------------------------------
# Constructor Helpers


def create_block_manager_from_blocks(
    blocks: list[Block],
    axes: list[Index],
    consolidate: bool = True,
    verify_integrity: bool = True,
) -> BlockManager:
    # If verify_integrity=False, then caller is responsible for checking
    #  all(x.shape[-1] == len(axes[1]) for x in blocks)
    #  sum(x.shape[0] for x in blocks) == len(axes[0])
    #  set(x for blk in blocks for x in blk.mgr_locs) == set(range(len(axes[0])))
    #  all(blk.ndim == 2 for blk in blocks)
    # This allows us to safely pass verify_integrity=False

    try:
        mgr = BlockManager(blocks, axes, verify_integrity=verify_integrity)

    except ValueError as err:
        arrays = [blk.values for blk in blocks]
        tot_items = sum(arr.shape[0] for arr in arrays)
        raise_construction_error(tot_items, arrays[0].shape[1:], axes, err)

    if consolidate:
        mgr._consolidate_inplace()
    return mgr
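

# Hedged sketch of the invariants listed above, for a frame with 2 columns
# and 3 rows built from a single 2D block (illustrative only):
#
#   values = np.ones((2, 3))                    # (ncols, nrows) block layout
#   blk = new_block_2d(values, placement=BlockPlacement(slice(0, 2)))
#   mgr = create_block_manager_from_blocks(
#       [blk], [Index(["a", "b"]), Index(range(3))]
#   )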


def create_block_manager_from_column_arrays(
    arrays: list[ArrayLike],
    axes: list[Index],
    consolidate: bool,
    refs: list,
) -> BlockManager:
    # Assertions disabled for performance (caller is responsible for verifying)
    #  assert isinstance(axes, list)
    #  assert all(isinstance(x, Index) for x in axes)
    #  assert all(isinstance(x, (np.ndarray, ExtensionArray)) for x in arrays)
    #  assert all(type(x) is not NumpyExtensionArray for x in arrays)
    #  assert all(x.ndim == 1 for x in arrays)
    #  assert all(len(x) == len(axes[1]) for x in arrays)
    #  assert len(arrays) == len(axes[0])
    # These last three are sufficient to allow us to safely pass
    #  verify_integrity=False below.

    try:
        blocks = _form_blocks(arrays, consolidate, refs)
        mgr = BlockManager(blocks, axes, verify_integrity=False)
    except ValueError as e:
        raise_construction_error(len(arrays), arrays[0].shape, axes, e)
    if consolidate:
        mgr._consolidate_inplace()
    return mgr


def raise_construction_error(
    tot_items: int,
    block_shape: Shape,
    axes: list[Index],
    e: ValueError | None = None,
):
2154 """raise a helpful message about our construction"""
2155 passed = tuple(map(int, [tot_items] + list(block_shape)))
    # Reverse to row-major (nrows, ncols) order so the message matches the
    # shape as the user sees it
    if len(passed) <= 2:
        passed = passed[::-1]

    implied = tuple(len(ax) for ax in axes)
    # Same reordering for the shape implied by the axes
    if len(implied) <= 2:
        implied = implied[::-1]

    # If the passed and implied shapes agree, the original exception is the
    # more informative one, so re-raise it
    if passed == implied and e is not None:
        raise e
    if block_shape[0] == 0:
        raise ValueError("Empty data passed with indices specified.")
    raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")
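

# Hedged example of the resulting user-facing message (illustrative):
#
#   pd.DataFrame(np.ones((3, 2)), columns=["a", "b", "c"])
#   # ValueError: Shape of passed values is (3, 2), indices imply (3, 3)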


# -----------------------------------------------------------------------


def _grouping_func(tup: tuple[int, ArrayLike]) -> tuple[int, DtypeObj]:
    dtype = tup[1].dtype

    if is_1d_only_ea_dtype(dtype):
        # We know these won't be consolidated, so don't need to group these.
        # This avoids expensive comparisons of CategoricalDtype objects
        sep = id(dtype)
    else:
        sep = 0

    return sep, dtype


def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list[Block]:
    tuples = list(enumerate(arrays))

    if not consolidate:
        return _tuples_to_blocks_no_consolidate(tuples, refs)

    # when consolidating, we can ignore refs (either stacking always copies,
    # or the EA is already copied in the calling dict_to_mgr)

    # group by dtype
    grouper = itertools.groupby(tuples, _grouping_func)

    nbs: list[Block] = []
    for (_, dtype), tup_block in grouper:
        block_type = get_block_type(dtype)

        if isinstance(dtype, np.dtype):
            is_dtlike = dtype.kind in "mM"

            if issubclass(dtype.type, (str, bytes)):
                dtype = np.dtype(object)

            values, placement = _stack_arrays(list(tup_block), dtype)
            if is_dtlike:
                values = ensure_wrapped_if_datetimelike(values)
            blk = block_type(values, placement=BlockPlacement(placement), ndim=2)
            nbs.append(blk)

        elif is_1d_only_ea_dtype(dtype):
            dtype_blocks = [
                block_type(x[1], placement=BlockPlacement(x[0]), ndim=2)
                for x in tup_block
            ]
            nbs.extend(dtype_blocks)

        else:
            dtype_blocks = [
                block_type(
                    ensure_block_shape(x[1], 2), placement=BlockPlacement(x[0]), ndim=2
                )
                for x in tup_block
            ]
            nbs.extend(dtype_blocks)
    return nbs
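

# Hedged sketch of the grouping above: consecutive arrays sharing a
# consolidatable dtype are stacked into one 2D block, while 1D-only EA
# dtypes get one block per column (illustrative; refs are ignored when
# consolidating):
#
#   arrays = [np.array([1, 2]), np.array([3, 4]),      # adjacent int64
#             pd.array(["a", "b"], dtype="category")]  # 1D-only EA
#   blocks = _form_blocks(arrays, True, [None] * 3)
#   # -> one (2, 2) int64 block at columns [0, 1] + one categorical block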


def _tuples_to_blocks_no_consolidate(tuples, refs) -> list[Block]:
    # tuples produced within _form_blocks are of the form (placement, array)
    return [
        new_block_2d(
            ensure_block_shape(arr, ndim=2), placement=BlockPlacement(i), refs=ref
        )
        for ((i, arr), ref) in zip(tuples, refs)
    ]


def _stack_arrays(tuples, dtype: np.dtype):
    placement, arrays = zip(*tuples)

    first = arrays[0]
    shape = (len(arrays),) + first.shape

    stacked = np.empty(shape, dtype=dtype)
    for i, arr in enumerate(arrays):
        stacked[i] = arr

    return stacked, placement
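

# Worked example of the stacking above (illustrative): three length-4
# float64 columns become one (3, 4) array, one row per column:
#
#   tuples = [(0, a), (1, b), (2, c)]            # (placement, 1D array)
#   stacked, placement = _stack_arrays(tuples, np.dtype("float64"))
#   # stacked.shape == (3, 4); placement == (0, 1, 2)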


def _consolidate(blocks: tuple[Block, ...]) -> tuple[Block, ...]:
    """
    Merge blocks having the same dtype; non-consolidatable blocks are
    passed through unchanged.
    """
    # sort by _can_consolidate, dtype
    gkey = lambda x: x._consolidate_key
    grouper = itertools.groupby(sorted(blocks, key=gkey), gkey)

    new_blocks: list[Block] = []
    for (_can_consolidate, dtype), group_blocks in grouper:
        merged_blocks, _ = _merge_blocks(
            list(group_blocks), dtype=dtype, can_consolidate=_can_consolidate
        )
        new_blocks = extend_blocks(merged_blocks, new_blocks)
    return tuple(new_blocks)


def _merge_blocks(
    blocks: list[Block], dtype: DtypeObj, can_consolidate: bool
) -> tuple[list[Block], bool]:
    if len(blocks) == 1:
        return blocks, False

    if can_consolidate:
        # TODO: optimization potential in case all mgrs contain slices and
        # combination of those slices is a slice, too.
        new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks])

        new_values: ArrayLike

        if isinstance(blocks[0].dtype, np.dtype):
            # error: List comprehension has incompatible type List[Union[ndarray,
            # ExtensionArray]]; expected List[Union[complex, generic,
            # Sequence[Union[int, float, complex, str, bytes, generic]],
            # Sequence[Sequence[Any]], SupportsArray]]
            new_values = np.vstack([b.values for b in blocks])  # type: ignore[misc]
        else:
            bvals = [blk.values for blk in blocks]
            bvals2 = cast(Sequence[NDArrayBackedExtensionArray], bvals)
            new_values = bvals2[0]._concat_same_type(bvals2, axis=0)

        argsort = np.argsort(new_mgr_locs)
        new_values = new_values[argsort]
        new_mgr_locs = new_mgr_locs[argsort]

        bp = BlockPlacement(new_mgr_locs)
        return [new_block_2d(new_values, placement=bp)], True

    # can't consolidate --> no merge
    return blocks, False
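

# Hedged sketch of the merge above: two float64 blocks at columns [0, 2]
# and [1] are stacked and then re-sorted by column position:
#
#   new_mgr_locs = [0, 2, 1] -> argsort -> rows reordered so locs == [0, 1, 2]
#
# Blocks with can_consolidate=False (e.g. most extension blocks) are
# returned untouched.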


def _fast_count_smallints(arr: npt.NDArray[np.intp]):
    """
    Faster version of collections.Counter for arrays of small non-negative
    ints; yields (value, count) pairs.
    """
    counts = np.bincount(arr)
    nz = counts.nonzero()[0]
    # Note: list(zip(...)) outperforms list(np.c_[nz, counts[nz]]) here,
    # in one benchmark by a factor of 11
    return zip(nz, counts[nz])
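

# Worked example (illustrative): for arr = [0, 1, 1, 3], np.bincount gives
# [1, 2, 0, 1]; filtering out the zero count for the absent value 2 yields
# the pairs (0, 1), (1, 2), (3, 1).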


def _preprocess_slice_or_indexer(
    slice_or_indexer: slice | np.ndarray, length: int, allow_fill: bool
):
    if isinstance(slice_or_indexer, slice):
        return (
            "slice",
            slice_or_indexer,
            libinternals.slice_len(slice_or_indexer, length),
        )
    else:
        if (
            not isinstance(slice_or_indexer, np.ndarray)
            or slice_or_indexer.dtype.kind != "i"
        ):
            dtype = getattr(slice_or_indexer, "dtype", None)
            raise TypeError(type(slice_or_indexer), dtype)

        indexer = ensure_platform_int(slice_or_indexer)
        if not allow_fill:
            indexer = maybe_convert_indices(indexer, length)
        return "fancy", indexer, len(indexer)


def make_na_array(dtype: DtypeObj, shape: Shape, fill_value) -> ArrayLike:
    if isinstance(dtype, DatetimeTZDtype):
        # NB: exclude e.g. pyarrow[dt64tz] dtypes
        ts = Timestamp(fill_value).as_unit(dtype.unit)
        i8values = np.full(shape, ts._value)
        dt64values = i8values.view(f"M8[{dtype.unit}]")
        return DatetimeArray._simple_new(dt64values, dtype=dtype)

    elif is_1d_only_ea_dtype(dtype):
        dtype = cast(ExtensionDtype, dtype)
        cls = dtype.construct_array_type()

        missing_arr = cls._from_sequence([], dtype=dtype)
        ncols, nrows = shape
        assert ncols == 1, ncols
        empty_arr = -1 * np.ones((nrows,), dtype=np.intp)
        return missing_arr.take(empty_arr, allow_fill=True, fill_value=fill_value)
    elif isinstance(dtype, ExtensionDtype):
        # TODO: no tests get here, a handful would if we disabled
        # the dt64tz special-case above (which is faster)
        cls = dtype.construct_array_type()
        missing_arr = cls._empty(shape=shape, dtype=dtype)
        missing_arr[:] = fill_value
        return missing_arr
    else:
        # NB: we should never get here with dtype integer or bool;
        # if we did, the missing_arr.fill would cast to gibberish
        missing_arr = np.empty(shape, dtype=dtype)
        missing_arr.fill(fill_value)

        if dtype.kind in "mM":
            missing_arr = ensure_wrapped_if_datetimelike(missing_arr)
        return missing_arr
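

# Hedged examples of the dtype dispatch above (illustrative):
#
#   make_na_array(np.dtype("float64"), (2, 3), np.nan)
#   # -> (2, 3) ndarray filled with NaN
#   make_na_array(DatetimeTZDtype(tz="UTC"), (2, 3), pd.NaT)
#   # -> DatetimeArray of NaT with the dtype's unit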