from __future__ import annotations

import itertools
from typing import (
    Any,
    Callable,
    Hashable,
    Literal,
    Sequence,
    TypeVar,
    cast,
)
import warnings
import weakref

import numpy as np

from pandas._config import using_copy_on_write

from pandas._libs import (
    algos as libalgos,
    internals as libinternals,
    lib,
)
from pandas._libs.internals import (
    BlockPlacement,
    BlockValuesRefs,
)
from pandas._typing import (
    ArrayLike,
    AxisInt,
    DtypeObj,
    QuantileInterpolation,
    Shape,
    npt,
    type_t,
)
from pandas.errors import PerformanceWarning
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.cast import infer_dtype_from_scalar
from pandas.core.dtypes.common import (
    ensure_platform_int,
    is_1d_only_ea_dtype,
    is_dtype_equal,
    is_list_like,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCSeries,
)
from pandas.core.dtypes.missing import (
    array_equals,
    isna,
)

import pandas.core.algorithms as algos
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
from pandas.core.arrays.sparse import SparseDtype
import pandas.core.common as com
from pandas.core.construction import (
    ensure_wrapped_if_datetimelike,
    extract_array,
)
from pandas.core.indexers import maybe_convert_indices
from pandas.core.indexes.api import (
    Index,
    ensure_index,
)
from pandas.core.internals.base import (
    DataManager,
    SingleDataManager,
    interleaved_dtype,
)
from pandas.core.internals.blocks import (
    Block,
    NumpyBlock,
    ensure_block_shape,
    extend_blocks,
    get_block_type,
    new_block,
    new_block_2d,
)
from pandas.core.internals.ops import (
    blockwise_all,
    operate_blockwise,
)

T = TypeVar("T", bound="BaseBlockManager")


class BaseBlockManager(DataManager):
    """
    Core internal data structure to implement DataFrame, Series, etc.

    Manage a bunch of labeled 2D mixed-type ndarrays. Essentially it's a
    lightweight blocked set of labeled data to be manipulated by the DataFrame
    public API class.

    Attributes
    ----------
    shape
    ndim
    axes
    values
    items

    Methods
    -------
    set_axis(axis, new_labels)
    copy(deep=True)

    get_dtypes

    apply(func, axes, block_filter_fn)

    get_bool_data
    get_numeric_data

    get_slice(slice_like, axis)
    get(label)
    iget(loc)

    take(indexer, axis)
    reindex_axis(new_labels, axis)
    reindex_indexer(new_labels, indexer, axis)

    delete(label)
    insert(loc, label, value)
    set(label, value)

    Parameters
    ----------
    blocks : Sequence of Block
    axes : Sequence of Index
    verify_integrity : bool, default True

    Notes
    -----
    This is *not* a public API class
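
    Examples
    --------
    A minimal, hedged sketch via the private ``_mgr`` attribute (internal
    API, subject to change without notice):

    >>> import pandas as pd
    >>> df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
    >>> mgr = df._mgr
    >>> mgr.nblocks  # one int64 block and one float64 block
    2
    >>> mgr.shape  # (n_columns, n_rows): axes are transposed vs. the frame
    (2, 2)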
    """

    __slots__ = ()

    _blknos: npt.NDArray[np.intp]
    _blklocs: npt.NDArray[np.intp]
    blocks: tuple[Block, ...]
    axes: list[Index]

    @property
    def ndim(self) -> int:
        raise NotImplementedError

    _known_consolidated: bool
    _is_consolidated: bool

    def __init__(self, blocks, axes, verify_integrity: bool = True) -> None:
        raise NotImplementedError

    @classmethod
    def from_blocks(cls: type_t[T], blocks: list[Block], axes: list[Index]) -> T:
        raise NotImplementedError

    @property
    def blknos(self) -> npt.NDArray[np.intp]:
        """
        Suppose we want to find the array corresponding to our i'th column.

        blknos[i] identifies the block from self.blocks that contains this column.

        blklocs[i] identifies the column of interest within
        self.blocks[self.blknos[i]]
        """
        if self._blknos is None:
            # Note: these can be altered by other BlockManager methods.
            self._rebuild_blknos_and_blklocs()

        return self._blknos

    @property
    def blklocs(self) -> npt.NDArray[np.intp]:
        """
        See blknos.__doc__
        """
        if self._blklocs is None:
            # Note: these can be altered by other BlockManager methods.
            self._rebuild_blknos_and_blklocs()

        return self._blklocs
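
    # A hedged illustration of the mapping above: for a manager whose blocks
    # are [float64 holding columns (0, 2), int64 holding column 1], we would
    # expect
    #     blknos  -> array([0, 1, 0])   # which block holds column i
    #     blklocs -> array([0, 0, 1])   # where column i sits within that block
    # so column 2 lives at self.blocks[0].values[1].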

    def make_empty(self: T, axes=None) -> T:
        """return an empty BlockManager with the items axis of len 0"""
        if axes is None:
            axes = [Index([])] + self.axes[1:]

        # preserve dtype if possible
        if self.ndim == 1:
            assert isinstance(self, SingleBlockManager)  # for mypy
            blk = self.blocks[0]
            arr = blk.values[:0]
            bp = BlockPlacement(slice(0, 0))
            nb = blk.make_block_same_class(arr, placement=bp)
            blocks = [nb]
        else:
            blocks = []
        return type(self).from_blocks(blocks, axes)

    def __nonzero__(self) -> bool:
        return True

    # Python3 compat
    __bool__ = __nonzero__

    def _normalize_axis(self, axis: AxisInt) -> int:
        # switch axis to follow BlockManager logic
        if self.ndim == 2:
            axis = 1 if axis == 0 else 0
        return axis

    def set_axis(self, axis: AxisInt, new_labels: Index) -> None:
        # Caller is responsible for ensuring we have an Index object.
        self._validate_set_axis(axis, new_labels)
        self.axes[axis] = new_labels

    @property
    def is_single_block(self) -> bool:
        # Assumes we are 2D; overridden by SingleBlockManager
        return len(self.blocks) == 1

    @property
    def items(self) -> Index:
        return self.axes[0]

    def _has_no_reference(self, i: int) -> bool:
        """
        Check whether column `i` has references
        (i.e. whether it references another array or is itself referenced).
        Returns True if the column has no references.
        """
        blkno = self.blknos[i]
        return self._has_no_reference_block(blkno)

    def _has_no_reference_block(self, blkno: int) -> bool:
        """
        Check whether the block at position `blkno` has references
        (i.e. whether it references another array or is itself referenced).
        Returns True if the block has no references.
        """
        return not self.blocks[blkno].refs.has_reference()

    def add_references(self, mgr: BaseBlockManager) -> None:
        """
        Adds the references from one manager to another. We assume that both
        managers have the same block structure.
        """
        if len(self.blocks) != len(mgr.blocks):
            # If block structure changes, then we made a copy
            return
        for i, blk in enumerate(self.blocks):
            blk.refs = mgr.blocks[i].refs
            # Argument 1 to "add_reference" of "BlockValuesRefs" has incompatible type
            # "Block"; expected "SharedBlock"
            blk.refs.add_reference(blk)  # type: ignore[arg-type]

    def references_same_values(self, mgr: BaseBlockManager, blkno: int) -> bool:
        """
        Checks if two blocks from two different block managers reference the
        same underlying values.
        """
        ref = weakref.ref(self.blocks[blkno])
        return ref in mgr.blocks[blkno].refs.referenced_blocks
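
    # Reference tracking underpins Copy-on-Write. A hedged sketch of the
    # expected behaviour with copy_on_write mode enabled:
    #
    #     df = pd.DataFrame({"a": [1, 2]})
    #     view = df[:]                     # new frame sharing the block values
    #     df._mgr._has_no_reference(0)     # -> False while ``view`` is alive
    #
    # Once ``view`` is garbage-collected, the weakref stored in ``refs`` dies
    # and the column counts as unreferenced again.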

    def get_dtypes(self):
        dtypes = np.array([blk.dtype for blk in self.blocks])
        return dtypes.take(self.blknos)

    @property
    def arrays(self) -> list[ArrayLike]:
        """
        Quick access to the backing arrays of the Blocks.

        Only for compatibility with ArrayManager for testing convenience.
        Not to be used in actual code, and return value is not the same as the
        ArrayManager method (list of 1D arrays vs iterator of 2D ndarrays / 1D EAs).

        Warning! The returned arrays don't handle Copy-on-Write, so this should
        be used with caution (only in read-mode).
        """
        return [blk.values for blk in self.blocks]

    def __repr__(self) -> str:
        output = type(self).__name__
        for i, ax in enumerate(self.axes):
            if i == 0:
                output += f"\nItems: {ax}"
            else:
                output += f"\nAxis {i}: {ax}"

        for block in self.blocks:
            output += f"\n{block}"
        return output

    def apply(
        self: T,
        f,
        align_keys: list[str] | None = None,
        **kwargs,
    ) -> T:
        """
        Iterate over the blocks, collect and create a new BlockManager.

        Parameters
        ----------
        f : str or callable
            Name of the Block method to apply.
        align_keys : List[str] or None, default None
        **kwargs
            Keywords to pass to `f`

        Returns
        -------
        BlockManager
        """
        assert "filter" not in kwargs

        align_keys = align_keys or []
        result_blocks: list[Block] = []
        # fillna: Series/DataFrame is responsible for making sure value is aligned

        aligned_args = {k: kwargs[k] for k in align_keys}

        for b in self.blocks:
            if aligned_args:
                for k, obj in aligned_args.items():
                    if isinstance(obj, (ABCSeries, ABCDataFrame)):
                        # The caller is responsible for ensuring that
                        # obj.axes[-1].equals(self.items)
                        if obj.ndim == 1:
                            kwargs[k] = obj.iloc[b.mgr_locs.indexer]._values
                        else:
                            kwargs[k] = obj.iloc[:, b.mgr_locs.indexer]._values
                    else:
                        # otherwise we have an ndarray
                        kwargs[k] = obj[b.mgr_locs.indexer]

            if callable(f):
                applied = b.apply(f, **kwargs)
            else:
                applied = getattr(b, f)(**kwargs)
            result_blocks = extend_blocks(applied, result_blocks)

        out = type(self).from_blocks(result_blocks, self.axes)
        return out
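
    # Hedged usage sketch: mgr.apply("astype", dtype="float64", copy=True)
    # dispatches Block.astype on every block and stitches the per-block
    # results into a new manager sharing self.axes.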

    def where(self: T, other, cond, align: bool) -> T:
        if align:
            align_keys = ["other", "cond"]
        else:
            align_keys = ["cond"]
            other = extract_array(other, extract_numpy=True)

        return self.apply(
            "where",
            align_keys=align_keys,
            other=other,
            cond=cond,
            using_cow=using_copy_on_write(),
        )

    def round(self: T, decimals: int, using_cow: bool = False) -> T:
        return self.apply(
            "round",
            decimals=decimals,
            using_cow=using_cow,
        )

    def setitem(self: T, indexer, value) -> T:
        """
        Set values with indexer.

        For SingleBlockManager, this backs s[indexer] = value
        """
        if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim:
            raise ValueError(f"Cannot set values with ndim > {self.ndim}")

        if using_copy_on_write() and not self._has_no_reference(0):
            # if being referenced -> perform Copy-on-Write and clear the reference
            # this method is only called if there is a single block -> hardcoded 0
            self = self.copy()

        return self.apply("setitem", indexer=indexer, value=value)

    def putmask(self, mask, new, align: bool = True):
        if align:
            align_keys = ["new", "mask"]
        else:
            align_keys = ["mask"]
            new = extract_array(new, extract_numpy=True)

        return self.apply(
            "putmask",
            align_keys=align_keys,
            mask=mask,
            new=new,
            using_cow=using_copy_on_write(),
        )

    def diff(self: T, n: int, axis: AxisInt) -> T:
        # only reached with self.ndim == 2 and axis == 1
        axis = self._normalize_axis(axis)
        return self.apply("diff", n=n, axis=axis)

    def interpolate(self: T, inplace: bool, **kwargs) -> T:
        return self.apply(
            "interpolate", inplace=inplace, **kwargs, using_cow=using_copy_on_write()
        )

    def shift(self: T, periods: int, axis: AxisInt, fill_value) -> T:
        axis = self._normalize_axis(axis)
        if fill_value is lib.no_default:
            fill_value = None

        return self.apply("shift", periods=periods, axis=axis, fill_value=fill_value)

    def fillna(self: T, value, limit, inplace: bool, downcast) -> T:
        if limit is not None:
            # Do this validation even if we go through one of the no-op paths
            limit = libalgos.validate_limit(None, limit=limit)

        return self.apply(
            "fillna",
            value=value,
            limit=limit,
            inplace=inplace,
            downcast=downcast,
            using_cow=using_copy_on_write(),
        )

    def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> T:
        if copy is None:
            if using_copy_on_write():
                copy = False
            else:
                copy = True
        elif using_copy_on_write():
            copy = False

        return self.apply(
            "astype",
            dtype=dtype,
            copy=copy,
            errors=errors,
            using_cow=using_copy_on_write(),
        )

    def convert(self: T, copy: bool | None) -> T:
        if copy is None:
            if using_copy_on_write():
                copy = False
            else:
                copy = True
        elif using_copy_on_write():
            copy = False

        return self.apply("convert", copy=copy, using_cow=using_copy_on_write())

    def replace(self: T, to_replace, value, inplace: bool) -> T:
        inplace = validate_bool_kwarg(inplace, "inplace")
        # NDFrame.replace ensures that neither to_replace nor value is list-like
        assert not is_list_like(to_replace)
        assert not is_list_like(value)
        return self.apply(
            "replace",
            to_replace=to_replace,
            value=value,
            inplace=inplace,
            using_cow=using_copy_on_write(),
        )

    def replace_regex(self, **kwargs):
        return self.apply("_replace_regex", **kwargs, using_cow=using_copy_on_write())

    def replace_list(
        self: T,
        src_list: list[Any],
        dest_list: list[Any],
        inplace: bool = False,
        regex: bool = False,
    ) -> T:
        """do a list replace"""
        inplace = validate_bool_kwarg(inplace, "inplace")

        bm = self.apply(
            "replace_list",
            src_list=src_list,
            dest_list=dest_list,
            inplace=inplace,
            regex=regex,
            using_cow=using_copy_on_write(),
        )
        bm._consolidate_inplace()
        return bm

    def to_native_types(self: T, **kwargs) -> T:
        """
        Convert values to native types (strings / python objects) that are used
        in formatting (repr / csv).
        """
        return self.apply("to_native_types", **kwargs)

    @property
    def is_numeric_mixed_type(self) -> bool:
        return all(block.is_numeric for block in self.blocks)

    @property
    def any_extension_types(self) -> bool:
        """Whether any of the blocks in this manager are extension blocks"""
        return any(block.is_extension for block in self.blocks)

    @property
    def is_view(self) -> bool:
        """return a boolean if we are a single block and are a view"""
        if len(self.blocks) == 1:
            return self.blocks[0].is_view

        # It is technically possible to figure out which blocks are views
        # e.g. [ b.values.base is not None for b in self.blocks ]
        # but then we have the case of possibly some blocks being a view
        # and some blocks not. setting in theory is possible on the non-view
        # blocks w/o causing a SettingWithCopy raise/warn. But this is a bit
        # complicated

        return False

    def _get_data_subset(self: T, predicate: Callable) -> T:
        blocks = [blk for blk in self.blocks if predicate(blk.values)]
        return self._combine(blocks, copy=False)

    def get_bool_data(self: T, copy: bool = False) -> T:
        """
        Select blocks that are bool-dtype and columns from object-dtype blocks
        that are all-bool.

        Parameters
        ----------
        copy : bool, default False
            Whether to copy the blocks
        """

        new_blocks = []

        for blk in self.blocks:
            if blk.dtype == bool:
                new_blocks.append(blk)

            elif blk.is_object:
                nbs = blk._split()
                for nb in nbs:
                    if nb.is_bool:
                        new_blocks.append(nb)

        return self._combine(new_blocks, copy)
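
    # Sketch: given columns [bool, object holding all-bools, object holding
    # strings], the bool block is kept whole, the object block is split
    # column-wise via blk._split(), and only the all-bool column survives.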

    def get_numeric_data(self: T, copy: bool = False) -> T:
        """
        Parameters
        ----------
        copy : bool, default False
            Whether to copy the blocks
        """
        numeric_blocks = [blk for blk in self.blocks if blk.is_numeric]
        if len(numeric_blocks) == len(self.blocks):
            # Avoid somewhat expensive _combine
            if copy:
                return self.copy(deep=True)
            return self
        return self._combine(numeric_blocks, copy)

    def _combine(
        self: T, blocks: list[Block], copy: bool = True, index: Index | None = None
    ) -> T:
        """return a new manager with the blocks"""
        if len(blocks) == 0:
            if self.ndim == 2:
                # retain our own Index dtype
                if index is not None:
                    axes = [self.items[:0], index]
                else:
                    axes = [self.items[:0]] + self.axes[1:]
                return self.make_empty(axes)
            return self.make_empty()

        # FIXME: optimization potential
        indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks]))
        inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0])

        new_blocks: list[Block] = []
        # TODO(CoW) we could optimize here if we know that the passed blocks
        # are fully "owned" (eg created from an operation, not coming from
        # an existing manager)
        for b in blocks:
            nb = b.copy(deep=copy)
            nb.mgr_locs = BlockPlacement(inv_indexer[nb.mgr_locs.indexer])
            new_blocks.append(nb)

        axes = list(self.axes)
        if index is not None:
            axes[-1] = index
        axes[0] = self.items.take(indexer)

        return type(self).from_blocks(new_blocks, axes)

    @property
    def nblocks(self) -> int:
        return len(self.blocks)

    def copy(self: T, deep: bool | None | Literal["all"] = True) -> T:
        """
        Make deep or shallow copy of BlockManager

        Parameters
        ----------
        deep : bool, string or None, default True
            If False or None, return a shallow copy (do not copy data)
            If 'all', copy data and a deep copy of the index

        Returns
        -------
        BlockManager
        """
        if deep is None:
            if using_copy_on_write():
                # use shallow copy
                deep = False
            else:
                # preserve deep copy for BlockManager with copy=None
                deep = True

        # this preserves the notion of view copying of axes
        if deep:
            # hit in e.g. tests.io.json.test_pandas

            def copy_func(ax):
                return ax.copy(deep=True) if deep == "all" else ax.view()

            new_axes = [copy_func(ax) for ax in self.axes]
        else:
            new_axes = list(self.axes)

        res = self.apply("copy", deep=deep)
        res.axes = new_axes

        if self.ndim > 1:
            # Avoid needing to re-compute these
            blknos = self._blknos
            if blknos is not None:
                res._blknos = blknos.copy()
                res._blklocs = self._blklocs.copy()

        if deep:
            res._consolidate_inplace()
        return res

    def consolidate(self: T) -> T:
        """
        Join together blocks having same dtype

        Returns
        -------
        y : BlockManager
        """
        if self.is_consolidated():
            return self

        bm = type(self)(self.blocks, self.axes, verify_integrity=False)
        bm._is_consolidated = False
        bm._consolidate_inplace()
        return bm
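
    # Hedged sketch of (de)consolidation: adding columns one at a time
    # appends one block per assignment, and consolidation merges same-dtype
    # blocks back together:
    #
    #     df = pd.DataFrame({f"c{i}": range(3) for i in range(3)})
    #     df._mgr.nblocks        # -> 1 (constructed consolidated)
    #     df["c3"] = 0
    #     df._mgr.nblocks        # -> 2 until a consolidating op runs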

    def reindex_indexer(
        self: T,
        new_axis: Index,
        indexer: npt.NDArray[np.intp] | None,
        axis: AxisInt,
        fill_value=None,
        allow_dups: bool = False,
        copy: bool | None = True,
        only_slice: bool = False,
        *,
        use_na_proxy: bool = False,
    ) -> T:
        """
        Parameters
        ----------
        new_axis : Index
        indexer : ndarray[intp] or None
            pandas indexer with -1's only.
        axis : int
        fill_value : object, default None
        allow_dups : bool, default False
        copy : bool or None, default True
            If None, regard as False to get shallow copy.
        only_slice : bool, default False
            Whether to take views, not copies, along columns.
        use_na_proxy : bool, default False
            Whether to use a np.void ndarray for newly introduced columns.
        """
        if copy is None:
            if using_copy_on_write():
                # use shallow copy
                copy = False
            else:
                # preserve deep copy for BlockManager with copy=None
                copy = True

        if indexer is None:
            if new_axis is self.axes[axis] and not copy:
                return self

            result = self.copy(deep=copy)
            result.axes = list(self.axes)
            result.axes[axis] = new_axis
            return result

        # Should be intp, but in some cases we get int64 on 32bit builds
        assert isinstance(indexer, np.ndarray)

        # some axes don't allow reindexing with dups
        if not allow_dups:
            self.axes[axis]._validate_can_reindex(indexer)

        if axis >= self.ndim:
            raise IndexError("Requested axis not found in manager")

        if axis == 0:
            new_blocks = self._slice_take_blocks_ax0(
                indexer,
                fill_value=fill_value,
                only_slice=only_slice,
                use_na_proxy=use_na_proxy,
            )
        else:
            new_blocks = [
                blk.take_nd(
                    indexer,
                    axis=1,
                    fill_value=(
                        fill_value if fill_value is not None else blk.fill_value
                    ),
                )
                for blk in self.blocks
            ]

        new_axes = list(self.axes)
        new_axes[axis] = new_axis

        new_mgr = type(self).from_blocks(new_blocks, new_axes)
        if axis == 1:
            # We can avoid the need to rebuild these
            new_mgr._blknos = self.blknos.copy()
            new_mgr._blklocs = self.blklocs.copy()
        return new_mgr

    def _slice_take_blocks_ax0(
        self,
        slice_or_indexer: slice | np.ndarray,
        fill_value=lib.no_default,
        only_slice: bool = False,
        *,
        use_na_proxy: bool = False,
    ) -> list[Block]:
        """
        Slice/take blocks along axis=0.

        Overloaded for SingleBlock

        Parameters
        ----------
        slice_or_indexer : slice or np.ndarray[int64]
        fill_value : scalar, default lib.no_default
        only_slice : bool, default False
            If True, we always return views on existing arrays, never copies.
            This is used when called from ops.blockwise.operate_blockwise.
        use_na_proxy : bool, default False
            Whether to use a np.void ndarray for newly introduced columns.

        Returns
        -------
        new_blocks : list of Block
        """
        allow_fill = fill_value is not lib.no_default

        sl_type, slobj, sllen = _preprocess_slice_or_indexer(
            slice_or_indexer, self.shape[0], allow_fill=allow_fill
        )

        if self.is_single_block:
            blk = self.blocks[0]

            if sl_type == "slice":
                # GH#32959 EABlock would fail since we can't make 0-width
                # TODO(EA2D): special casing unnecessary with 2D EAs
                if sllen == 0:
                    return []
                bp = BlockPlacement(slice(0, sllen))
                return [blk.getitem_block_columns(slobj, new_mgr_locs=bp)]
            elif not allow_fill or self.ndim == 1:
                if allow_fill and fill_value is None:
                    fill_value = blk.fill_value

                if not allow_fill and only_slice:
                    # GH#33597 slice instead of take, so we get
                    # views instead of copies
                    blocks = [
                        blk.getitem_block_columns(
                            slice(ml, ml + 1), new_mgr_locs=BlockPlacement(i)
                        )
                        for i, ml in enumerate(slobj)
                    ]
                    return blocks
                else:
                    bp = BlockPlacement(slice(0, sllen))
                    return [
                        blk.take_nd(
                            slobj,
                            axis=0,
                            new_mgr_locs=bp,
                            fill_value=fill_value,
                        )
                    ]

        if sl_type == "slice":
            blknos = self.blknos[slobj]
            blklocs = self.blklocs[slobj]
        else:
            blknos = algos.take_nd(
                self.blknos, slobj, fill_value=-1, allow_fill=allow_fill
            )
            blklocs = algos.take_nd(
                self.blklocs, slobj, fill_value=-1, allow_fill=allow_fill
            )

        # When filling blknos, make sure blknos is updated before appending to
        # blocks list, that way new blkno is exactly len(blocks).
        blocks = []
        group = not only_slice
        for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=group):
            if blkno == -1:
                # If we've got here, fill_value was not lib.no_default

                blocks.append(
                    self._make_na_block(
                        placement=mgr_locs,
                        fill_value=fill_value,
                        use_na_proxy=use_na_proxy,
                    )
                )
            else:
                blk = self.blocks[blkno]

                # Otherwise, slicing along items axis is necessary.
                if not blk._can_consolidate and not blk._validate_ndim:
                    # i.e. we don't go through here for DatetimeTZBlock
                    # A non-consolidatable block, it's easy, because there's
                    # only one item and each mgr loc is a copy of that single
                    # item.
                    deep = not (only_slice or using_copy_on_write())
                    for mgr_loc in mgr_locs:
                        newblk = blk.copy(deep=deep)
                        newblk.mgr_locs = BlockPlacement(slice(mgr_loc, mgr_loc + 1))
                        blocks.append(newblk)

                else:
                    # GH#32779 to avoid the performance penalty of copying,
                    # we may try to only slice
                    taker = blklocs[mgr_locs.indexer]
                    max_len = max(len(mgr_locs), taker.max() + 1)
                    if only_slice or using_copy_on_write():
                        taker = lib.maybe_indices_to_slice(taker, max_len)

                    if isinstance(taker, slice):
                        nb = blk.getitem_block_columns(taker, new_mgr_locs=mgr_locs)
                        blocks.append(nb)
                    elif only_slice:
                        # GH#33597 slice instead of take, so we get
                        # views instead of copies
                        for i, ml in zip(taker, mgr_locs):
                            slc = slice(i, i + 1)
                            bp = BlockPlacement(ml)
                            nb = blk.getitem_block_columns(slc, new_mgr_locs=bp)
                            # We have np.shares_memory(nb.values, blk.values)
                            blocks.append(nb)
                    else:
                        nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs)
                        blocks.append(nb)

        return blocks

    def _make_na_block(
        self, placement: BlockPlacement, fill_value=None, use_na_proxy: bool = False
    ) -> Block:
        # Note: we only get here with self.ndim == 2

        if use_na_proxy:
            assert fill_value is None
            shape = (len(placement), self.shape[1])
            vals = np.empty(shape, dtype=np.void)
            nb = NumpyBlock(vals, placement, ndim=2)
            return nb

        if fill_value is None:
            fill_value = np.nan
        block_shape = list(self.shape)
        block_shape[0] = len(placement)

        dtype, fill_value = infer_dtype_from_scalar(fill_value)
        # error: Argument "dtype" to "empty" has incompatible type "Union[dtype,
        # ExtensionDtype]"; expected "Union[dtype, None, type, _SupportsDtype, str,
        # Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], List[Any], _DtypeDict,
        # Tuple[Any, Any]]"
        block_values = np.empty(block_shape, dtype=dtype)  # type: ignore[arg-type]
        block_values.fill(fill_value)
        return new_block_2d(block_values, placement=placement)

    def take(
        self: T,
        indexer,
        axis: AxisInt = 1,
        verify: bool = True,
        convert_indices: bool = True,
    ) -> T:
        """
        Take items along any axis.

        Parameters
        ----------
        indexer : np.ndarray or slice
        axis : int, default 1
        verify : bool, default True
            Check that all entries are between 0 and len(self) - 1, inclusive.
            Pass verify=False if this check has been done by the caller.
        convert_indices : bool, default True
            Whether to attempt to convert indices to positive values.

        Returns
        -------
        BlockManager
        """
        # We have 6 tests that get here with a slice
        indexer = (
            np.arange(indexer.start, indexer.stop, indexer.step, dtype=np.intp)
            if isinstance(indexer, slice)
            else np.asanyarray(indexer, dtype=np.intp)
        )

        n = self.shape[axis]
        if convert_indices:
            indexer = maybe_convert_indices(indexer, n, verify=verify)

        new_labels = self.axes[axis].take(indexer)
        return self.reindex_indexer(
            new_axis=new_labels,
            indexer=indexer,
            axis=axis,
            allow_dups=True,
            copy=None,
        )


class BlockManager(libinternals.BlockManager, BaseBlockManager):
    """
    BaseBlockManager that holds 2D blocks.
    """

    ndim = 2

    # ----------------------------------------------------------------
    # Constructors

    def __init__(
        self,
        blocks: Sequence[Block],
        axes: Sequence[Index],
        verify_integrity: bool = True,
    ) -> None:
        if verify_integrity:
            # Assertion disabled for performance
            # assert all(isinstance(x, Index) for x in axes)

            for block in blocks:
                if self.ndim != block.ndim:
                    raise AssertionError(
                        f"Number of Block dimensions ({block.ndim}) must equal "
                        f"number of axes ({self.ndim})"
                    )
                # As of 2.0, the caller is responsible for ensuring that
                # DatetimeTZBlock with block.ndim == 2 has block.values.ndim == 2;
                # previously there was a special check for fastparquet compat.

            self._verify_integrity()

    def _verify_integrity(self) -> None:
        mgr_shape = self.shape
        tot_items = sum(len(x.mgr_locs) for x in self.blocks)
        for block in self.blocks:
            if block.shape[1:] != mgr_shape[1:]:
                raise_construction_error(tot_items, block.shape[1:], self.axes)
        if len(self.items) != tot_items:
            raise AssertionError(
                "Number of manager items must equal union of "
                f"block items\n# manager items: {len(self.items)}, # "
                f"tot_items: {tot_items}"
            )

    @classmethod
    def from_blocks(cls, blocks: list[Block], axes: list[Index]) -> BlockManager:
        """
        Constructor for BlockManager and SingleBlockManager with same signature.
        """
        return cls(blocks, axes, verify_integrity=False)

    # ----------------------------------------------------------------
    # Indexing

    def fast_xs(self, loc: int) -> SingleBlockManager:
        """
        Return the array corresponding to `frame.iloc[loc]`.

        Parameters
        ----------
        loc : int

        Returns
        -------
        np.ndarray or ExtensionArray
        """
        if len(self.blocks) == 1:
            # TODO: this could be wrong if blk.mgr_locs is not slice(None)-like;
            #  is this ruled out in the general case?
            result = self.blocks[0].iget((slice(None), loc))
            # in the case of a single block, the new block is a view
            block = new_block(
                result,
                placement=slice(0, len(result)),
                ndim=1,
                refs=self.blocks[0].refs,
            )
            return SingleBlockManager(block, self.axes[0])

        dtype = interleaved_dtype([blk.dtype for blk in self.blocks])

        n = len(self)

        # GH#46406
        immutable_ea = isinstance(dtype, SparseDtype)

        if isinstance(dtype, ExtensionDtype) and not immutable_ea:
            cls = dtype.construct_array_type()
            result = cls._empty((n,), dtype=dtype)
        else:
            # error: Argument "dtype" to "empty" has incompatible type
            # "Union[Type[object], dtype[Any], ExtensionDtype, None]"; expected
            # "None"
            result = np.empty(
                n, dtype=object if immutable_ea else dtype  # type: ignore[arg-type]
            )
            result = ensure_wrapped_if_datetimelike(result)

        for blk in self.blocks:
            # Such assignment may incorrectly coerce NaT to None
            # result[blk.mgr_locs] = blk._slice((slice(None), loc))
            for i, rl in enumerate(blk.mgr_locs):
                result[rl] = blk.iget((i, loc))

        if immutable_ea:
            dtype = cast(ExtensionDtype, dtype)
            result = dtype.construct_array_type()._from_sequence(result, dtype=dtype)

        block = new_block(result, placement=slice(0, len(result)), ndim=1)
        return SingleBlockManager(block, self.axes[0])
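
    # Hedged sketch: for a mixed int/float frame, ``df._mgr.fast_xs(0)`` is
    # what backs ``df.iloc[0]``: it interleaves the row into the common dtype
    # (float64 in that case) and wraps it in a length-n_columns
    # SingleBlockManager.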

    def iget(self, i: int, track_ref: bool = True) -> SingleBlockManager:
        """
        Return the data as a SingleBlockManager.
        """
        block = self.blocks[self.blknos[i]]
        values = block.iget(self.blklocs[i])

        # shortcut for select a single-dim from a 2-dim BM
        bp = BlockPlacement(slice(0, len(values)))
        nb = type(block)(
            values, placement=bp, ndim=1, refs=block.refs if track_ref else None
        )
        return SingleBlockManager(nb, self.axes[1])

    def iget_values(self, i: int) -> ArrayLike:
        """
        Return the data for column i as the values (ndarray or ExtensionArray).

        Warning! The returned array is a view but doesn't handle Copy-on-Write,
        so this should be used with caution.
        """
        # TODO(CoW) making the arrays read-only might make this safer to use?
        block = self.blocks[self.blknos[i]]
        values = block.iget(self.blklocs[i])
        return values

    @property
    def column_arrays(self) -> list[np.ndarray]:
        """
        Used in the JSON C code to access column arrays.
        This optimizes compared to using `iget_values` by converting each
        block only once instead of once per column.

        Warning! This doesn't handle Copy-on-Write, so should be used with
        caution (current use case of consuming this in the JSON code is fine).
        """
        # This is an optimized equivalent to
        #  result = [self.iget_values(i) for i in range(len(self.items))]
        result: list[np.ndarray | None] = [None] * len(self.items)

        for blk in self.blocks:
            mgr_locs = blk._mgr_locs
            values = blk.values_for_json()
            if values.ndim == 1:
                # TODO(EA2D): special casing not needed with 2D EAs
                result[mgr_locs[0]] = values

            else:
                for i, loc in enumerate(mgr_locs):
                    result[loc] = values[i]

        # error: Incompatible return value type (got "List[None]",
        # expected "List[ndarray[Any, Any]]")
        return result  # type: ignore[return-value]

    def iset(
        self, loc: int | slice | np.ndarray, value: ArrayLike, inplace: bool = False
    ):
        """
        Set new item in-place. Does not consolidate. Adds new Block if not
        contained in the current set of items.
        """

        # FIXME: refactor, clearly separate broadcasting & zip-like assignment
        #  can prob also fix the various if tests for sparse/categorical
        if self._blklocs is None and self.ndim > 1:
            self._rebuild_blknos_and_blklocs()

        # Note: we exclude DTA/TDA here
        value_is_extension_type = is_1d_only_ea_dtype(value.dtype)
        if not value_is_extension_type:
            if value.ndim == 2:
                value = value.T
            else:
                value = ensure_block_shape(value, ndim=2)

            if value.shape[1:] != self.shape[1:]:
                raise AssertionError(
                    "Shape of new values must be compatible with manager shape"
                )

        if lib.is_integer(loc):
            # We have 6 tests where loc is _not_ an int.
            # In this case, get_blkno_placements will yield only one tuple,
            #  containing (self._blknos[loc], BlockPlacement(slice(0, 1, 1)))

            # Check if we can use _iset_single fastpath
            loc = cast(int, loc)
            blkno = self.blknos[loc]
            blk = self.blocks[blkno]
            if len(blk._mgr_locs) == 1:  # TODO: fastest way to check this?
                return self._iset_single(
                    loc,
                    value,
                    inplace=inplace,
                    blkno=blkno,
                    blk=blk,
                )

            # error: Incompatible types in assignment (expression has type
            # "List[Union[int, slice, ndarray]]", variable has type "Union[int,
            # slice, ndarray]")
            loc = [loc]  # type: ignore[assignment]

        # categorical/sparse/datetimetz
        if value_is_extension_type:

            def value_getitem(placement):
                return value

        else:

            def value_getitem(placement):
                return value[placement.indexer]

        # Accessing public blknos ensures the public versions are initialized
        blknos = self.blknos[loc]
        blklocs = self.blklocs[loc].copy()

        unfit_mgr_locs = []
        unfit_val_locs = []
        removed_blknos = []
        for blkno_l, val_locs in libinternals.get_blkno_placements(blknos, group=True):
            blk = self.blocks[blkno_l]
            blk_locs = blklocs[val_locs.indexer]
            if inplace and blk.should_store(value):
                # Updating inplace -> check if we need to do Copy-on-Write
                if using_copy_on_write() and not self._has_no_reference_block(blkno_l):
                    self._iset_split_block(blkno_l, blk_locs, value_getitem(val_locs))
                else:
                    blk.set_inplace(blk_locs, value_getitem(val_locs))
                    continue
            else:
                unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs])
                unfit_val_locs.append(val_locs)

                # If all block items are unfit, schedule the block for removal.
                if len(val_locs) == len(blk.mgr_locs):
                    removed_blknos.append(blkno_l)
                    continue
                else:
                    # Defer setting the new values to enable consolidation
                    self._iset_split_block(blkno_l, blk_locs)

        if len(removed_blknos):
            # Remove blocks & update blknos accordingly
            is_deleted = np.zeros(self.nblocks, dtype=np.bool_)
            is_deleted[removed_blknos] = True

            new_blknos = np.empty(self.nblocks, dtype=np.intp)
            new_blknos.fill(-1)
            new_blknos[~is_deleted] = np.arange(self.nblocks - len(removed_blknos))
            self._blknos = new_blknos[self._blknos]
            self.blocks = tuple(
                blk for i, blk in enumerate(self.blocks) if i not in set(removed_blknos)
            )

        if unfit_val_locs:
            unfit_idxr = np.concatenate(unfit_mgr_locs)
            unfit_count = len(unfit_idxr)

            new_blocks: list[Block] = []
            # TODO(CoW) is this always correct to assume that the new_blocks
            # are not referencing anything else?
            if value_is_extension_type:
                # This code (ab-)uses the fact that EA blocks contain only
                # one item.
                # TODO(EA2D): special casing unnecessary with 2D EAs
                new_blocks.extend(
                    new_block_2d(
                        values=value,
                        placement=BlockPlacement(slice(mgr_loc, mgr_loc + 1)),
                    )
                    for mgr_loc in unfit_idxr
                )

                self._blknos[unfit_idxr] = np.arange(unfit_count) + len(self.blocks)
                self._blklocs[unfit_idxr] = 0

            else:
                # unfit_val_locs contains BlockPlacement objects
                unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:])

                new_blocks.append(
                    new_block_2d(
                        values=value_getitem(unfit_val_items),
                        placement=BlockPlacement(unfit_idxr),
                    )
                )

                self._blknos[unfit_idxr] = len(self.blocks)
                self._blklocs[unfit_idxr] = np.arange(unfit_count)

            self.blocks += tuple(new_blocks)

            # Newly created block's dtype may already be present.
            self._known_consolidated = False

    def _iset_split_block(
        self,
        blkno_l: int,
        blk_locs: np.ndarray | list[int],
        value: ArrayLike | None = None,
    ) -> None:
        """Removes columns from a block by splitting the block.

        Avoids copying the whole block through slicing and updates the manager
        after determining the new block structure. Optionally adds a new block,
        otherwise this has to be done by the caller.

        Parameters
        ----------
        blkno_l: The block number to operate on, relevant for updating the manager
        blk_locs: The locations of our block that should be deleted.
        value: The value to set as a replacement.
        """
        blk = self.blocks[blkno_l]

        if self._blklocs is None:
            self._rebuild_blknos_and_blklocs()

        nbs_tup = tuple(blk.delete(blk_locs))
        if value is not None:
            locs = blk.mgr_locs.as_array[blk_locs]
            first_nb = new_block_2d(value, BlockPlacement(locs))
        else:
            first_nb = nbs_tup[0]
            nbs_tup = tuple(nbs_tup[1:])

        nr_blocks = len(self.blocks)
        blocks_tup = (
            self.blocks[:blkno_l] + (first_nb,) + self.blocks[blkno_l + 1 :] + nbs_tup
        )
        self.blocks = blocks_tup

        if not nbs_tup and value is not None:
            # No need to update anything if split did not happen
            return

        self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb))

        for i, nb in enumerate(nbs_tup):
            self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb))
            self._blknos[nb.mgr_locs.indexer] = i + nr_blocks

    def _iset_single(
        self, loc: int, value: ArrayLike, inplace: bool, blkno: int, blk: Block
    ) -> None:
        """
        Fastpath for iset when we are only setting a single position and
        the Block currently in that position is itself single-column.

        In this case we can swap out the entire Block and blklocs and blknos
        are unaffected.
        """
        # Caller is responsible for verifying value.shape

        if inplace and blk.should_store(value):
            copy = False
            if using_copy_on_write() and not self._has_no_reference_block(blkno):
                # perform Copy-on-Write and clear the reference
                copy = True
            iloc = self.blklocs[loc]
            blk.set_inplace(slice(iloc, iloc + 1), value, copy=copy)
            return

        nb = new_block_2d(value, placement=blk._mgr_locs)
        old_blocks = self.blocks
        new_blocks = old_blocks[:blkno] + (nb,) + old_blocks[blkno + 1 :]
        self.blocks = new_blocks
        return

    def column_setitem(
        self, loc: int, idx: int | slice | np.ndarray, value, inplace_only: bool = False
    ) -> None:
        """
        Set values ("setitem") into a single column (not setting the full column).

        This is a method on the BlockManager level, to avoid creating an
        intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`)
        """
        if using_copy_on_write() and not self._has_no_reference(loc):
            blkno = self.blknos[loc]
            # Split blocks to only copy the column we want to modify
            blk_loc = self.blklocs[loc]
            # Copy our values
            values = self.blocks[blkno].values
            if values.ndim == 1:
                values = values.copy()
            else:
                # Use [blk_loc] as indexer to keep ndim=2, this already results in a
                # copy
                values = values[[blk_loc]]
            self._iset_split_block(blkno, [blk_loc], values)

        # this manager is only created temporarily to mutate the values in place
        # so don't track references, otherwise the `setitem` would perform CoW again
        col_mgr = self.iget(loc, track_ref=False)
        if inplace_only:
            col_mgr.setitem_inplace(idx, value)
        else:
            new_mgr = col_mgr.setitem((idx,), value)
            self.iset(loc, new_mgr._block.values, inplace=True)

    def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None:
        """
        Insert item at selected position.

        Parameters
        ----------
        loc : int
        item : hashable
        value : np.ndarray or ExtensionArray
        """
        # insert to the axis; this could possibly raise a TypeError
        new_axis = self.items.insert(loc, item)

        if value.ndim == 2:
            value = value.T
            if len(value) > 1:
                raise ValueError(
                    f"Expected a 1D array, got an array with shape {value.T.shape}"
                )
        else:
            value = ensure_block_shape(value, ndim=self.ndim)

        bp = BlockPlacement(slice(loc, loc + 1))
        # TODO(CoW) do we always "own" the passed `value`?
        block = new_block_2d(values=value, placement=bp)

        if not len(self.blocks):
            # Fastpath
            self._blklocs = np.array([0], dtype=np.intp)
            self._blknos = np.array([0], dtype=np.intp)
        else:
            self._insert_update_mgr_locs(loc)
            self._insert_update_blklocs_and_blknos(loc)

        self.axes[0] = new_axis
        self.blocks += (block,)

        self._known_consolidated = False

        if sum(not block.is_extension for block in self.blocks) > 100:
            warnings.warn(
                "DataFrame is highly fragmented. This is usually the result "
                "of calling `frame.insert` many times, which has poor performance. "
                "Consider joining all columns at once using pd.concat(axis=1) "
                "instead. To get a de-fragmented frame, use `newframe = frame.copy()`",
                PerformanceWarning,
                stacklevel=find_stack_level(),
            )
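
    # Hedged reproduction of the fragmentation warning above:
    #
    #     df = pd.DataFrame(index=range(2))
    #     for i in range(101):
    #         df[f"c{i}"] = i     # each assignment appends one new block
    #
    # nblocks keeps growing until a consolidating operation runs, so the
    # PerformanceWarning fires once more than 100 non-extension blocks
    # accumulate.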

    def _insert_update_mgr_locs(self, loc) -> None:
        """
        When inserting a new Block at location 'loc', we increment
        all of the mgr_locs of blocks above that by one.
        """
        for blkno, count in _fast_count_smallints(self.blknos[loc:]):
            # .620 this way, .326 of which is in increment_above
            blk = self.blocks[blkno]
            blk._mgr_locs = blk._mgr_locs.increment_above(loc)

    def _insert_update_blklocs_and_blknos(self, loc) -> None:
        """
        When inserting a new Block at location 'loc', we update our
        _blklocs and _blknos.
        """

        # Accessing public blklocs ensures the public versions are initialized
        if loc == self.blklocs.shape[0]:
            # np.append is a lot faster, let's use it if we can.
            self._blklocs = np.append(self._blklocs, 0)
            self._blknos = np.append(self._blknos, len(self.blocks))
        elif loc == 0:
            # np.append is a lot faster, let's use it if we can.
            self._blklocs = np.append(self._blklocs[::-1], 0)[::-1]
            self._blknos = np.append(self._blknos[::-1], len(self.blocks))[::-1]
        else:
            new_blklocs, new_blknos = libinternals.update_blklocs_and_blknos(
                self.blklocs, self.blknos, loc, len(self.blocks)
            )
            self._blklocs = new_blklocs
            self._blknos = new_blknos

    def idelete(self, indexer) -> BlockManager:
        """
        Delete selected locations, returning a new BlockManager.
        """
        is_deleted = np.zeros(self.shape[0], dtype=np.bool_)
        is_deleted[indexer] = True
        taker = (~is_deleted).nonzero()[0]

        nbs = self._slice_take_blocks_ax0(taker, only_slice=True)
        new_columns = self.items[~is_deleted]
        axes = [new_columns, self.axes[1]]
        return type(self)(tuple(nbs), axes, verify_integrity=False)

    # ----------------------------------------------------------------
    # Block-wise Operation

    def grouped_reduce(self: T, func: Callable) -> T:
        """
        Apply grouped reduction function blockwise, returning a new BlockManager.

        Parameters
        ----------
        func : grouped reduction function

        Returns
        -------
        BlockManager
        """
        result_blocks: list[Block] = []

        for blk in self.blocks:
            if blk.is_object:
                # split on object-dtype blocks because some columns may raise
                #  while others do not.
                for sb in blk._split():
                    applied = sb.apply(func)
                    result_blocks = extend_blocks(applied, result_blocks)
            else:
                applied = blk.apply(func)
                result_blocks = extend_blocks(applied, result_blocks)

        if len(result_blocks) == 0:
            nrows = 0
        else:
            nrows = result_blocks[0].values.shape[-1]
        index = Index(range(nrows))

        return type(self).from_blocks(result_blocks, [self.axes[0], index])

    def reduce(self: T, func: Callable) -> T:
        """
        Apply reduction function blockwise, returning a single-row BlockManager.

        Parameters
        ----------
        func : reduction function

        Returns
        -------
        BlockManager
        """
        # If 2D, we assume that we're operating column-wise
        assert self.ndim == 2

        res_blocks: list[Block] = []
        for blk in self.blocks:
            nbs = blk.reduce(func)
            res_blocks.extend(nbs)

        index = Index([None])  # placeholder
        new_mgr = type(self).from_blocks(res_blocks, [self.items, index])
        return new_mgr

    def operate_blockwise(self, other: BlockManager, array_op) -> BlockManager:
        """
        Apply array_op blockwise with another (aligned) BlockManager.
        """
        return operate_blockwise(self, other, array_op)

    def _equal_values(self: BlockManager, other: BlockManager) -> bool:
        """
        Used in .equals defined in base class. Only check the column values
        assuming shape and indexes have already been checked.
        """
        return blockwise_all(self, other, array_equals)

    def quantile(
        self: T,
        *,
        qs: Index,  # with dtype float64
        axis: AxisInt = 0,
        interpolation: QuantileInterpolation = "linear",
    ) -> T:
        """
        Iterate over blocks applying quantile reduction.
        This routine is intended for reduction type operations and
        will do inference on the generated blocks.

        Parameters
        ----------
        qs : Index
            The quantiles to be computed (float64 dtype).
        axis : reduction axis, default 0
        interpolation : type of interpolation, default 'linear'

        Returns
        -------
        BlockManager
        """
        # Series dispatches to DataFrame for quantile, which allows us to
        #  simplify some of the code here and in the blocks
        assert self.ndim >= 2
        assert is_list_like(qs)  # caller is responsible for this
        assert axis == 1  # only ever called this way

        new_axes = list(self.axes)
        new_axes[1] = Index(qs, dtype=np.float64)

        blocks = [
            blk.quantile(axis=axis, qs=qs, interpolation=interpolation)
            for blk in self.blocks
        ]

        return type(self)(blocks, new_axes)

    # ----------------------------------------------------------------

    def unstack(self, unstacker, fill_value) -> BlockManager:
        """
        Return a BlockManager with all blocks unstacked.

        Parameters
        ----------
        unstacker : reshape._Unstacker
        fill_value : Any
            fill_value for newly introduced missing values.

        Returns
        -------
        unstacked : BlockManager
        """
        new_columns = unstacker.get_new_columns(self.items)
        new_index = unstacker.new_index

        allow_fill = not unstacker.mask_all
        if allow_fill:
            # calculating the full mask once and passing it to Block._unstack is
            #  faster than calculating it in each repeated call
            new_mask2D = (~unstacker.mask).reshape(*unstacker.full_shape)
            needs_masking = new_mask2D.any(axis=0)
        else:
            needs_masking = np.zeros(unstacker.full_shape[1], dtype=bool)

        new_blocks: list[Block] = []
        columns_mask: list[np.ndarray] = []

        if len(self.items) == 0:
            factor = 1
        else:
            fac = len(new_columns) / len(self.items)
            assert fac == int(fac)
            factor = int(fac)

        for blk in self.blocks:
            mgr_locs = blk.mgr_locs
            new_placement = mgr_locs.tile_for_unstack(factor)

            blocks, mask = blk._unstack(
                unstacker,
                fill_value,
                new_placement=new_placement,
                needs_masking=needs_masking,
            )

            new_blocks.extend(blocks)
            columns_mask.extend(mask)

            # Block._unstack should ensure this holds,
            assert mask.sum() == sum(len(nb._mgr_locs) for nb in blocks)
            # In turn this ensures that in the BlockManager call below
            #  we have len(new_columns) == sum(x.shape[0] for x in new_blocks)
            #  which suffices to allow us to pass verify_integrity=False

        new_columns = new_columns[columns_mask]

        bm = BlockManager(new_blocks, [new_columns, new_index], verify_integrity=False)
        return bm

    def to_dict(self, copy: bool = True):
        """
        Return a dict of str(dtype) -> BlockManager

        Parameters
        ----------
        copy : bool, default True

        Returns
        -------
        values : a dict of dtype -> BlockManager
        """

        bd: dict[str, list[Block]] = {}
        for b in self.blocks:
            bd.setdefault(str(b.dtype), []).append(b)

        # TODO(EA2D): the combine will be unnecessary with 2D EAs
        return {dtype: self._combine(blocks, copy=copy) for dtype, blocks in bd.items()}
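
    # Hedged sketch: a frame with one int and one float column round-trips as
    #     {"int64": <BlockManager, 1 block>, "float64": <BlockManager, 1 block>}
    # which DataFrame-level code can then reassemble per dtype.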

    def as_array(
        self,
        dtype: np.dtype | None = None,
        copy: bool = False,
        na_value: object = lib.no_default,
    ) -> np.ndarray:
        """
        Convert the blockmanager data into a numpy array.

        Parameters
        ----------
        dtype : np.dtype or None, default None
            Data type of the return array.
        copy : bool, default False
            If True then guarantee that a copy is returned. A value of
            False does not guarantee that the underlying data is not
            copied.
        na_value : object, default lib.no_default
            Value to be used as the missing value sentinel.

        Returns
        -------
        arr : ndarray
        """
        # TODO(CoW) handle case where resulting array is a view
        if len(self.blocks) == 0:
            arr = np.empty(self.shape, dtype=float)
            return arr.transpose()

        # We want to copy when na_value is provided to avoid
        # mutating the original object
        copy = copy or na_value is not lib.no_default

        if self.is_single_block:
            blk = self.blocks[0]
            if blk.is_extension:
                # Avoid implicit conversion of extension blocks to object

                # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no
                # attribute "to_numpy"
                arr = blk.values.to_numpy(  # type: ignore[union-attr]
                    dtype=dtype,
                    na_value=na_value,
                ).reshape(blk.shape)
            else:
                arr = np.asarray(blk.get_values())
                if dtype:
                    arr = arr.astype(dtype, copy=False)

            if copy:
                arr = arr.copy()
            elif using_copy_on_write():
                arr = arr.view()
                arr.flags.writeable = False
        else:
            arr = self._interleave(dtype=dtype, na_value=na_value)
            # The underlying data was copied within _interleave, so no need
            # to further copy if copy=True or setting na_value

        if na_value is not lib.no_default:
            arr[isna(arr)] = na_value

        return arr.transpose()
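
    # Hedged sketch: df._mgr.as_array() is what ultimately backs
    # ``df.to_numpy()``; mixed int/float blocks interleave to float64, while a
    # single-block manager may hand back a view (read-only under CoW) when
    # copy=False.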

    def _interleave(
        self,
        dtype: np.dtype | None = None,
        na_value: object = lib.no_default,
    ) -> np.ndarray:
        """
        Return ndarray from blocks with specified item order
        Items must be contained in the blocks
        """
        if not dtype:
            # Incompatible types in assignment (expression has type
            # "Optional[Union[dtype[Any], ExtensionDtype]]", variable has
            # type "Optional[dtype[Any]]")
            dtype = interleaved_dtype(  # type: ignore[assignment]
                [blk.dtype for blk in self.blocks]
            )

        # TODO: https://github.com/pandas-dev/pandas/issues/22791
        # Give EAs some input on what happens here. Sparse needs this.
        if isinstance(dtype, SparseDtype):
            dtype = dtype.subtype
            dtype = cast(np.dtype, dtype)
        elif isinstance(dtype, ExtensionDtype):
            dtype = np.dtype("object")
        elif is_dtype_equal(dtype, str):
            dtype = np.dtype("object")

        result = np.empty(self.shape, dtype=dtype)

        itemmask = np.zeros(self.shape[0])

        if dtype == np.dtype("object") and na_value is lib.no_default:
            # much more performant than using to_numpy below
            for blk in self.blocks:
                rl = blk.mgr_locs
                arr = blk.get_values(dtype)
                result[rl.indexer] = arr
                itemmask[rl.indexer] = 1
            return result

        for blk in self.blocks:
            rl = blk.mgr_locs
            if blk.is_extension:
                # Avoid implicit conversion of extension blocks to object

                # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no
                # attribute "to_numpy"
                arr = blk.values.to_numpy(  # type: ignore[union-attr]
                    dtype=dtype,
                    na_value=na_value,
                )
            else:
                arr = blk.get_values(dtype)
            result[rl.indexer] = arr
            itemmask[rl.indexer] = 1

        if not itemmask.all():
            raise AssertionError("Some items were not contained in blocks")

        return result

    # ----------------------------------------------------------------
    # Consolidation

    def is_consolidated(self) -> bool:
        """
        Return True if there is at most one block per dtype, i.e. blocks of
        the same dtype have already been merged.
        """
        if not self._known_consolidated:
            self._consolidate_check()
        return self._is_consolidated

    def _consolidate_check(self) -> None:
        if len(self.blocks) == 1:
            # fastpath
            self._is_consolidated = True
            self._known_consolidated = True
            return
        dtypes = [blk.dtype for blk in self.blocks if blk._can_consolidate]
        self._is_consolidated = len(dtypes) == len(set(dtypes))
        self._known_consolidated = True

    def _consolidate_inplace(self) -> None:
        # In general, _consolidate_inplace should only be called via
        #  DataFrame._consolidate_inplace, otherwise we will fail to invalidate
        #  the DataFrame's _item_cache. The exception is for newly-created
        #  BlockManager objects not yet attached to a DataFrame.
        if not self.is_consolidated():
            self.blocks = _consolidate(self.blocks)
            self._is_consolidated = True
            self._known_consolidated = True
            self._rebuild_blknos_and_blklocs()


class SingleBlockManager(BaseBlockManager, SingleDataManager):
    """Manage a single block; the 1-dimensional counterpart used by Series."""

    @property
    def ndim(self) -> Literal[1]:
        return 1

    _is_consolidated = True
    _known_consolidated = True
    __slots__ = ()
    is_single_block = True

    def __init__(
        self,
        block: Block,
        axis: Index,
        verify_integrity: bool = False,
    ) -> None:
        # Assertions disabled for performance
        # assert isinstance(block, Block), type(block)
        # assert isinstance(axis, Index), type(axis)

        self.axes = [axis]
        self.blocks = (block,)

    @classmethod
    def from_blocks(
        cls,
        blocks: list[Block],
        axes: list[Index],
    ) -> SingleBlockManager:
        """
        Constructor for BlockManager and SingleBlockManager with same signature.
        """
        assert len(blocks) == 1
        assert len(axes) == 1
        return cls(blocks[0], axes[0], verify_integrity=False)

    @classmethod
    def from_array(
        cls, array: ArrayLike, index: Index, refs: BlockValuesRefs | None = None
    ) -> SingleBlockManager:
        """
        Constructor for if we have an array that is not yet a Block.
        """
        block = new_block(array, placement=slice(0, len(index)), ndim=1, refs=refs)
        return cls(block, index)
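
    # Hedged sketch (internal API): Series construction ultimately arrives at
    # something like
    #     mgr = SingleBlockManager.from_array(np.array([1, 2, 3]), Index(range(3)))
    # and ``pd.Series([1, 2, 3])._mgr`` wraps the equivalent structure.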
1882
1883 def to_2d_mgr(self, columns: Index) -> BlockManager:
1884 """
1885 Manager analogue of Series.to_frame
1886 """
1887 blk = self.blocks[0]
1888 arr = ensure_block_shape(blk.values, ndim=2)
1889 bp = BlockPlacement(0)
1890 new_blk = type(blk)(arr, placement=bp, ndim=2, refs=blk.refs)
1891 axes = [columns, self.axes[0]]
1892 return BlockManager([new_blk], axes=axes, verify_integrity=False)

    def _has_no_reference(self, i: int = 0) -> bool:
        """
        Check whether column `i` has references
        (i.e. whether it references another array or is itself referenced).
        Returns True if the column has no references.
        """
        return not self.blocks[0].refs.has_reference()

    def __getstate__(self):
        block_values = [b.values for b in self.blocks]
        block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks]
        axes_array = list(self.axes)

        extra_state = {
            "0.14.1": {
                "axes": axes_array,
                "blocks": [
                    {"values": b.values, "mgr_locs": b.mgr_locs.indexer}
                    for b in self.blocks
                ],
            }
        }

        # First three elements of the state are to maintain forward
        # compatibility with 0.13.1.
        return axes_array, block_values, block_items, extra_state

    def __setstate__(self, state):
        def unpickle_block(values, mgr_locs, ndim: int) -> Block:
            # TODO(EA2D): ndim would be unnecessary with 2D EAs
            # older pickles may store e.g. DatetimeIndex instead of DatetimeArray
            values = extract_array(values, extract_numpy=True)
            return new_block(values, placement=mgr_locs, ndim=ndim)

        if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]:
            state = state[3]["0.14.1"]
            self.axes = [ensure_index(ax) for ax in state["axes"]]
            ndim = len(self.axes)
            self.blocks = tuple(
                unpickle_block(b["values"], b["mgr_locs"], ndim=ndim)
                for b in state["blocks"]
            )
        else:
            raise NotImplementedError("pre-0.14.1 pickles are no longer supported")

        self._post_setstate()

    def _post_setstate(self) -> None:
        pass
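
    # Pickle round-trip sketch (hypothetical session, not executed; assumes
    # `import pickle` and `import pandas as pd`): the versioned "0.14.1" dict
    # in the fourth state element is what __setstate__ consumes.
    #
    #   ser = pd.Series([1.0, 2.0])
    #   state = ser._mgr.__getstate__()
    #   "0.14.1" in state[3]                     # True
    #   ser2 = pickle.loads(pickle.dumps(ser))   # exercises the same path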

    @cache_readonly
    def _block(self) -> Block:
        return self.blocks[0]

    @property
    def _blknos(self):
        """compat with BlockManager"""
        return None

    @property
    def _blklocs(self):
        """compat with BlockManager"""
        return None

    def getitem_mgr(self, indexer: slice | np.ndarray) -> SingleBlockManager:
        # similar to get_slice, but not restricted to slice indexer
        blk = self._block
        if (
            using_copy_on_write()
            and isinstance(indexer, np.ndarray)
            and len(indexer) > 0
            and com.is_bool_indexer(indexer)
            and indexer.all()
        ):
            return type(self)(blk.copy(deep=False), self.index)
        array = blk._slice(indexer)
        if array.ndim > 1:
            # This will be caught by Series._get_values
            raise ValueError("dimension-expanding indexing not allowed")

        bp = BlockPlacement(slice(0, len(array)))
        # TODO(CoW) in theory only need to track reference if new_array is a view
        block = type(blk)(array, placement=bp, ndim=1, refs=blk.refs)

        new_idx = self.index[indexer]
        return type(self)(block, new_idx)

    def get_slice(self, slobj: slice, axis: AxisInt = 0) -> SingleBlockManager:
        # Assertion disabled for performance
        # assert isinstance(slobj, slice), type(slobj)
        if axis >= self.ndim:
            raise IndexError("Requested axis not found in manager")

        blk = self._block
        array = blk._slice(slobj)
        bp = BlockPlacement(slice(0, len(array)))
        # TODO this method is only used in groupby SeriesSplitter at the moment,
        # so passing refs is not yet covered by the tests
        block = type(blk)(array, placement=bp, ndim=1, refs=blk.refs)
        new_index = self.index._getitem_slice(slobj)
        return type(self)(block, new_index)
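
    # Slicing sketch (hypothetical session, not executed; assumes
    # `import pandas as pd`): both the block values and the index are
    # sliced, and refs are propagated to the new block.
    #
    #   ser = pd.Series([1, 2, 3, 4])
    #   sub = ser._mgr.get_slice(slice(1, 3))
    #   list(sub.index)            # [1, 2]
    #   len(sub._block.values)     # 2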

    @property
    def index(self) -> Index:
        return self.axes[0]

    @property
    def dtype(self) -> DtypeObj:
        return self._block.dtype

    def get_dtypes(self) -> np.ndarray:
        return np.array([self._block.dtype])

    def external_values(self):
        """The array that Series.values returns"""
        return self._block.external_values()

    def internal_values(self):
        """The array that Series._values returns"""
        return self._block.values

    def array_values(self):
        """The array that Series.array returns"""
        return self._block.array_values

    def get_numeric_data(self, copy: bool = False):
        if self._block.is_numeric:
            return self.copy(deep=copy)
        return self.make_empty()

    @property
    def _can_hold_na(self) -> bool:
        return self._block._can_hold_na

    def setitem_inplace(self, indexer, value) -> None:
        """
        Set values with indexer.

        For Single[Block/Array]Manager, this backs s[indexer] = value

        This is an inplace version of `setitem()`, mutating the manager/values
        in place, not returning a new Manager (and Block), and thus never changing
        the dtype.
        """
        if using_copy_on_write() and not self._has_no_reference(0):
            self.blocks = (self._block.copy(),)
            self._cache.clear()

        super().setitem_inplace(indexer, value)
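
    # Copy-on-write sketch (hypothetical session, not executed; assumes
    # `import pandas as pd` with copy_on_write mode enabled): a block shared
    # with another object is copied before being mutated.
    #
    #   ser = pd.Series([1, 2, 3])
    #   view = ser[:]                       # shares the block under CoW
    #   ser._mgr.setitem_inplace(0, 10)     # copies first; `view` is unchanged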

    def idelete(self, indexer) -> SingleBlockManager:
        """
        Delete single location from SingleBlockManager.

        Ensures that self.blocks doesn't become empty.
        """
        nb = self._block.delete(indexer)[0]
        self.blocks = (nb,)
        self.axes[0] = self.axes[0].delete(indexer)
        self._cache.clear()
        return self
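
    # Deletion sketch (hypothetical session, not executed; assumes
    # `import pandas as pd`): the block and the axis stay in sync.
    #
    #   ser = pd.Series([1, 2, 3])
    #   mgr = ser._mgr
    #   mgr.idelete(1)
    #   list(mgr.index)            # [0, 2]
    #   len(mgr._block.values)     # 2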

    def fast_xs(self, loc):
        """
        Fast path for getting a cross-section (returns a view of the data);
        not needed for SingleBlockManager.
        """
        raise NotImplementedError("Use series._values[loc] instead")

    def set_values(self, values: ArrayLike) -> None:
        """
        Set the values of the single block in place.

        Use at your own risk! This does not check if the passed values are
        valid for the current Block/SingleBlockManager (length, dtype, etc).
        """
        # TODO(CoW) do we need to handle copy on write here? Currently this is
        # only used for FrameColumnApply.series_generator (what if apply is
        # mutating inplace?)
        self.blocks[0].values = values
        self.blocks[0]._mgr_locs = BlockPlacement(slice(len(values)))

    def _equal_values(self: T, other: T) -> bool:
        """
        Used in .equals defined in base class. Only check the column values
        assuming shape and indexes have already been checked.
        """
        # For SingleBlockManager (i.e. Series)
        if other.ndim != 1:
            return False
        left = self.blocks[0].values
        right = other.blocks[0].values
        return array_equals(left, right)


# --------------------------------------------------------------------
# Constructor Helpers


def create_block_manager_from_blocks(
    blocks: list[Block],
    axes: list[Index],
    consolidate: bool = True,
    verify_integrity: bool = True,
) -> BlockManager:
    # If verify_integrity=False, then caller is responsible for checking:
    #  all(x.shape[-1] == len(axes[1]) for x in blocks)
    #  sum(x.shape[0] for x in blocks) == len(axes[0])
    #  set(x for blk in blocks for x in blk.mgr_locs) == set(range(len(axes[0])))
    #  all(blk.ndim == 2 for blk in blocks)
    # This allows us to safely pass verify_integrity=False

    try:
        mgr = BlockManager(blocks, axes, verify_integrity=verify_integrity)

    except ValueError as err:
        arrays = [blk.values for blk in blocks]
        tot_items = sum(arr.shape[0] for arr in arrays)
        raise_construction_error(tot_items, arrays[0].shape[1:], axes, err)

    if consolidate:
        mgr._consolidate_inplace()
    return mgr


def create_block_manager_from_column_arrays(
    arrays: list[ArrayLike],
    axes: list[Index],
    consolidate: bool,
    refs: list,
) -> BlockManager:
    # Assertions disabled for performance (caller is responsible for verifying)
    # assert isinstance(axes, list)
    # assert all(isinstance(x, Index) for x in axes)
    # assert all(isinstance(x, (np.ndarray, ExtensionArray)) for x in arrays)
    # assert all(type(x) is not PandasArray for x in arrays)
    # assert all(x.ndim == 1 for x in arrays)
    # assert all(len(x) == len(axes[1]) for x in arrays)
    # assert len(arrays) == len(axes[0])
    # These last three are sufficient to allow us to safely pass
    # verify_integrity=False below.

    try:
        blocks = _form_blocks(arrays, consolidate, refs)
        mgr = BlockManager(blocks, axes, verify_integrity=False)
    except ValueError as e:
        raise_construction_error(len(arrays), arrays[0].shape, axes, e)
    if consolidate:
        mgr._consolidate_inplace()
    return mgr
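
# Minimal usage sketch (hypothetical axes/arrays, not executed; assumes
# `import numpy as np` and that the caller guarantees the invariants listed
# in the comments above):
#
#   axes = [Index(["a", "b"]), Index(range(3))]
#   arrs = [np.arange(3.0), np.arange(3.0)]
#   mgr = create_block_manager_from_column_arrays(arrs, axes, True, [None, None])
#   mgr.nblocks   # 1: the two float64 columns consolidate into one block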


def raise_construction_error(
    tot_items: int,
    block_shape: Shape,
    axes: list[Index],
    e: ValueError | None = None,
):
    """Raise a helpful error message about a failed construction."""
    passed = tuple(map(int, [tot_items] + list(block_shape)))
    # Correcting the user-facing error message during DataFrame construction
    if len(passed) <= 2:
        passed = passed[::-1]

    implied = tuple(len(ax) for ax in axes)
    # Correcting the user-facing error message during DataFrame construction
    if len(implied) <= 2:
        implied = implied[::-1]

    # If the passed and implied shapes agree, the original error was not a
    # shape mismatch, so re-raise it unchanged.
    if passed == implied and e is not None:
        raise e
    if block_shape[0] == 0:
        raise ValueError("Empty data passed with indices specified.")
    raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")
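
# Sketch of the resulting message (hypothetical shapes, not executed: two
# 3-row columns passed, but the index implies 4 rows):
#
#   raise_construction_error(2, (3,), [Index(["a", "b"]), Index(range(4))])
#   # ValueError: Shape of passed values is (3, 2), indices imply (4, 2)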


# -----------------------------------------------------------------------


def _grouping_func(tup: tuple[int, ArrayLike]) -> tuple[int, bool, DtypeObj]:
    # compat for numpy<1.21, in which comparing a np.dtype with an ExtensionDtype
    # raises instead of returning False. Once earlier numpy versions are dropped,
    # the isinstance flag in the key can be dropped and we can group on the
    # (separator, dtype) pair alone.
    dtype = tup[1].dtype

    if is_1d_only_ea_dtype(dtype):
        # We know these won't be consolidated, so don't need to group these.
        # This avoids expensive comparisons of CategoricalDtype objects
        sep = id(dtype)
    else:
        sep = 0

    return sep, isinstance(dtype, np.dtype), dtype
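
# Sketch of the grouping key (hypothetical arrays, not executed; assumes
# `import numpy as np` and `import pandas as pd`): numpy dtypes share a
# separator of 0 so equal dtypes group together, while 1D-only EA dtypes get
# a unique id() separator and are never grouped.
#
#   _grouping_func((0, np.array([1, 2])))
#   # (0, True, dtype('int64'))
#   cat = pd.Categorical(["a", "b"])
#   _grouping_func((1, cat))
#   # (id(cat.dtype), False, CategoricalDtype(categories=['a', 'b'], ...))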


def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list[Block]:
    tuples = list(enumerate(arrays))

    if not consolidate:
        nbs = _tuples_to_blocks_no_consolidate(tuples, refs)
        return nbs

    # when consolidating, we can ignore refs (either stacking always copies,
    # or the EA is already copied in the calling dict_to_mgr)
    # TODO(CoW) check if this is also valid for rec_array_to_mgr

    # group by dtype
    grouper = itertools.groupby(tuples, _grouping_func)

    nbs = []
    for (_, _, dtype), tup_block in grouper:
        block_type = get_block_type(dtype)

        if isinstance(dtype, np.dtype):
            is_dtlike = dtype.kind in ["m", "M"]

            if issubclass(dtype.type, (str, bytes)):
                dtype = np.dtype(object)

            values, placement = _stack_arrays(list(tup_block), dtype)
            if is_dtlike:
                values = ensure_wrapped_if_datetimelike(values)
            blk = block_type(values, placement=BlockPlacement(placement), ndim=2)
            nbs.append(blk)

        elif is_1d_only_ea_dtype(dtype):
            dtype_blocks = [
                block_type(x[1], placement=BlockPlacement(x[0]), ndim=2)
                for x in tup_block
            ]
            nbs.extend(dtype_blocks)

        else:
            dtype_blocks = [
                block_type(
                    ensure_block_shape(x[1], 2), placement=BlockPlacement(x[0]), ndim=2
                )
                for x in tup_block
            ]
            nbs.extend(dtype_blocks)
    return nbs
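
# Sketch (hypothetical arrays, not executed; assumes `import numpy as np` and
# `import pandas as pd`): consecutive same-dtype numpy columns stack into one
# 2D block, while a 1D-only EA column becomes its own block.
#
#   arrs = [np.array([1, 2]), np.array([3, 4]), pd.Categorical(["x", "y"])]
#   blocks = _form_blocks(arrs, consolidate=True, refs=[None] * 3)
#   [blk.dtype for blk in blocks]
#   # [dtype('int64'), CategoricalDtype(categories=['x', 'y'], ...)]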


def _tuples_to_blocks_no_consolidate(tuples, refs) -> list[Block]:
    # tuples produced within _form_blocks are of the form (placement, array)
    return [
        new_block_2d(
            ensure_block_shape(arr, ndim=2), placement=BlockPlacement(i), refs=ref
        )
        for ((i, arr), ref) in zip(tuples, refs)
    ]


def _stack_arrays(tuples, dtype: np.dtype):
    placement, arrays = zip(*tuples)

    first = arrays[0]
    shape = (len(arrays),) + first.shape

    stacked = np.empty(shape, dtype=dtype)
    for i, arr in enumerate(arrays):
        stacked[i] = arr

    return stacked, placement
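
# Stacking sketch (hypothetical input, not executed; assumes
# `import numpy as np`): two aligned 1D arrays become one 2D array with one
# row per original column, returned alongside their placements.
#
#   vals, placement = _stack_arrays(
#       [(0, np.array([1.0, 2.0])), (1, np.array([3.0, 4.0]))], np.dtype("f8")
#   )
#   vals         # array([[1., 2.], [3., 4.]])
#   placement    # (0, 1)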


def _consolidate(blocks: tuple[Block, ...]) -> tuple[Block, ...]:
    """
    Merge blocks having the same dtype, excluding blocks that cannot consolidate.
    """
    # sort by _can_consolidate, dtype
    gkey = lambda x: x._consolidate_key
    grouper = itertools.groupby(sorted(blocks, key=gkey), gkey)

    new_blocks: list[Block] = []
    for (_can_consolidate, dtype), group_blocks in grouper:
        merged_blocks, _ = _merge_blocks(
            list(group_blocks), dtype=dtype, can_consolidate=_can_consolidate
        )
        new_blocks = extend_blocks(merged_blocks, new_blocks)
    return tuple(new_blocks)


def _merge_blocks(
    blocks: list[Block], dtype: DtypeObj, can_consolidate: bool
) -> tuple[list[Block], bool]:
    if len(blocks) == 1:
        return blocks, False

    if can_consolidate:
        # TODO: optimization potential in case all mgrs contain slices and
        # combination of those slices is a slice, too.
        new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks])

        new_values: ArrayLike

        if isinstance(blocks[0].dtype, np.dtype):
            # error: List comprehension has incompatible type List[Union[ndarray,
            # ExtensionArray]]; expected List[Union[complex, generic,
            # Sequence[Union[int, float, complex, str, bytes, generic]],
            # Sequence[Sequence[Any]], SupportsArray]]
            new_values = np.vstack([b.values for b in blocks])  # type: ignore[misc]
        else:
            bvals = [blk.values for blk in blocks]
            bvals2 = cast(Sequence[NDArrayBackedExtensionArray], bvals)
            new_values = bvals2[0]._concat_same_type(bvals2, axis=0)

        argsort = np.argsort(new_mgr_locs)
        new_values = new_values[argsort]
        new_mgr_locs = new_mgr_locs[argsort]

        bp = BlockPlacement(new_mgr_locs)
        return [new_block_2d(new_values, placement=bp)], True

    # can't consolidate --> no merge
    return blocks, False
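
# Merge sketch (hypothetical blocks, not executed; assumes
# `import numpy as np`): values are vstacked and then reordered so that the
# merged block's mgr_locs end up sorted.
#
#   b1 = new_block_2d(np.array([[1.0, 2.0]]), placement=BlockPlacement([2]))
#   b2 = new_block_2d(np.array([[3.0, 4.0]]), placement=BlockPlacement([0]))
#   (merged,), ok = _merge_blocks([b1, b2], np.dtype("f8"), can_consolidate=True)
#   merged.mgr_locs.as_array   # array([0, 2])
#   merged.values              # array([[3., 4.], [1., 2.]]) after the argsort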


def _fast_count_smallints(arr: npt.NDArray[np.intp]):
    """Faster version of set(arr) for sequences of small numbers; yields
    (value, count) pairs for the distinct values."""
    counts = np.bincount(arr)
    nz = counts.nonzero()[0]
    # Note: list(zip(...)) outperforms list(np.c_[nz, counts[nz]]) here,
    # in one benchmark by a factor of 11
    return zip(nz, counts[nz])
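
# Counting sketch (hypothetical input, not executed; assumes
# `import numpy as np`): zero-count values are skipped.
#
#   list(_fast_count_smallints(np.array([0, 2, 2, 5], dtype=np.intp)))
#   # [(0, 1), (2, 2), (5, 1)]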


def _preprocess_slice_or_indexer(
    slice_or_indexer: slice | np.ndarray, length: int, allow_fill: bool
):
    if isinstance(slice_or_indexer, slice):
        return (
            "slice",
            slice_or_indexer,
            libinternals.slice_len(slice_or_indexer, length),
        )
    else:
        if (
            not isinstance(slice_or_indexer, np.ndarray)
            or slice_or_indexer.dtype.kind != "i"
        ):
            dtype = getattr(slice_or_indexer, "dtype", None)
            raise TypeError(type(slice_or_indexer), dtype)

        indexer = ensure_platform_int(slice_or_indexer)
        if not allow_fill:
            indexer = maybe_convert_indices(indexer, length)
        return "fancy", indexer, len(indexer)
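
# Sketch of the two return shapes (hypothetical inputs, not executed; assumes
# `import numpy as np`):
#
#   _preprocess_slice_or_indexer(slice(0, 3), length=5, allow_fill=False)
#   # ("slice", slice(0, 3), 3)
#   _preprocess_slice_or_indexer(np.array([4, 0]), length=5, allow_fill=False)
#   # ("fancy", array([4, 0]), 2)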