from __future__ import annotations

from collections.abc import (
    Hashable,
    Sequence,
)
import itertools
from typing import (
    TYPE_CHECKING,
    Callable,
    Literal,
    cast,
)
import warnings

import numpy as np

from pandas._config import (
    using_copy_on_write,
    warn_copy_on_write,
)

from pandas._libs import (
    internals as libinternals,
    lib,
)
from pandas._libs.internals import (
    BlockPlacement,
    BlockValuesRefs,
)
from pandas._libs.tslibs import Timestamp
from pandas.errors import PerformanceWarning
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.cast import infer_dtype_from_scalar
from pandas.core.dtypes.common import (
    ensure_platform_int,
    is_1d_only_ea_dtype,
    is_list_like,
)
from pandas.core.dtypes.dtypes import (
    DatetimeTZDtype,
    ExtensionDtype,
)
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCSeries,
)
from pandas.core.dtypes.missing import (
    array_equals,
    isna,
)

import pandas.core.algorithms as algos
from pandas.core.arrays import (
    ArrowExtensionArray,
    ArrowStringArray,
    DatetimeArray,
)
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
from pandas.core.construction import (
    ensure_wrapped_if_datetimelike,
    extract_array,
)
from pandas.core.indexers import maybe_convert_indices
from pandas.core.indexes.api import (
    Index,
    ensure_index,
)
from pandas.core.internals.base import (
    DataManager,
    SingleDataManager,
    ensure_np_dtype,
    interleaved_dtype,
)
from pandas.core.internals.blocks import (
    COW_WARNING_GENERAL_MSG,
    COW_WARNING_SETITEM_MSG,
    Block,
    NumpyBlock,
    ensure_block_shape,
    extend_blocks,
    get_block_type,
    maybe_coerce_values,
    new_block,
    new_block_2d,
)
from pandas.core.internals.ops import (
    blockwise_all,
    operate_blockwise,
)

if TYPE_CHECKING:
    from pandas._typing import (
        ArrayLike,
        AxisInt,
        DtypeObj,
        QuantileInterpolation,
        Self,
        Shape,
        npt,
    )

    from pandas.api.extensions import ExtensionArray


class BaseBlockManager(DataManager):
109 """
110 Core internal data structure to implement DataFrame, Series, etc.
111
112 Manage a bunch of labeled 2D mixed-type ndarrays. Essentially it's a
113 lightweight blocked set of labeled data to be manipulated by the DataFrame
114 public API class
115
116 Attributes
117 ----------
118 shape
119 ndim
120 axes
121 values
122 items
123
124 Methods
125 -------
126 set_axis(axis, new_labels)
127 copy(deep=True)
128
129 get_dtypes
130
131 apply(func, axes, block_filter_fn)
132
133 get_bool_data
134 get_numeric_data
135
136 get_slice(slice_like, axis)
137 get(label)
138 iget(loc)
139
140 take(indexer, axis)
141 reindex_axis(new_labels, axis)
142 reindex_indexer(new_labels, indexer, axis)
143
144 delete(label)
145 insert(loc, label, value)
146 set(label, value)
147
148 Parameters
149 ----------
150 blocks: Sequence of Block
151 axes: Sequence of Index
152 verify_integrity: bool, default True
153
154 Notes
155 -----
156 This is *not* a public API class
157 """

    __slots__ = ()

    _blknos: npt.NDArray[np.intp]
    _blklocs: npt.NDArray[np.intp]
    blocks: tuple[Block, ...]
    axes: list[Index]

    @property
    def ndim(self) -> int:
        raise NotImplementedError

    _known_consolidated: bool
    _is_consolidated: bool

    def __init__(self, blocks, axes, verify_integrity: bool = True) -> None:
        raise NotImplementedError

    @classmethod
    def from_blocks(cls, blocks: list[Block], axes: list[Index]) -> Self:
        raise NotImplementedError

    @property
    def blknos(self) -> npt.NDArray[np.intp]:
        """
        Suppose we want to find the array corresponding to our i'th column.

        blknos[i] identifies the block from self.blocks that contains this column.

        blklocs[i] identifies the column of interest within
        self.blocks[self.blknos[i]]
        """
        if self._blknos is None:
            # Note: these can be altered by other BlockManager methods.
            self._rebuild_blknos_and_blklocs()

        return self._blknos

    @property
    def blklocs(self) -> npt.NDArray[np.intp]:
        """
        See blknos.__doc__
        """
        if self._blklocs is None:
            # Note: these can be altered by other BlockManager methods.
            self._rebuild_blknos_and_blklocs()

        return self._blklocs
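    # Illustrative sketch (internal layout; the concrete values are
    # hypothetical): for a frame with columns [a, b, c] backed by a float
    # block holding (a, c) and an int block holding (b,), we would have
    #     mgr.blknos  -> array([0, 1, 0])  # column i lives in blocks[blknos[i]]
    #     mgr.blklocs -> array([0, 0, 1])  # ... at row blklocs[i] of that block
    # so self.blocks[self.blknos[i]].values[self.blklocs[i]] is column i.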

    def make_empty(self, axes=None) -> Self:
        """return an empty BlockManager with the items axis of len 0"""
        if axes is None:
            axes = [Index([])] + self.axes[1:]

        # preserve dtype if possible
        if self.ndim == 1:
            assert isinstance(self, SingleBlockManager)  # for mypy
            blk = self.blocks[0]
            arr = blk.values[:0]
            bp = BlockPlacement(slice(0, 0))
            nb = blk.make_block_same_class(arr, placement=bp)
            blocks = [nb]
        else:
            blocks = []
        return type(self).from_blocks(blocks, axes)

    def __nonzero__(self) -> bool:
        return True

    # Python3 compat
    __bool__ = __nonzero__

    def _normalize_axis(self, axis: AxisInt) -> int:
        # switch axis to follow BlockManager logic
        if self.ndim == 2:
            axis = 1 if axis == 0 else 0
        return axis

    def set_axis(self, axis: AxisInt, new_labels: Index) -> None:
        # Caller is responsible for ensuring we have an Index object.
        self._validate_set_axis(axis, new_labels)
        self.axes[axis] = new_labels

    @property
    def is_single_block(self) -> bool:
        # Assumes we are 2D; overridden by SingleBlockManager
        return len(self.blocks) == 1

    @property
    def items(self) -> Index:
        return self.axes[0]

    def _has_no_reference(self, i: int) -> bool:
        """
        Check for column `i` if it has references.
        (whether it references another array or is itself being referenced)
        Returns True if the column has no references.
        """
        blkno = self.blknos[i]
        return self._has_no_reference_block(blkno)

    def _has_no_reference_block(self, blkno: int) -> bool:
        """
        Check for block `blkno` if it has references.
        (whether it references another array or is itself being referenced)
        Returns True if the block has no references.
        """
        return not self.blocks[blkno].refs.has_reference()

    def add_references(self, mgr: BaseBlockManager) -> None:
        """
        Adds the references from one manager to another. We assume that both
        managers have the same block structure.
        """
        if len(self.blocks) != len(mgr.blocks):
            # If block structure changes, then we made a copy
            return
        for i, blk in enumerate(self.blocks):
            blk.refs = mgr.blocks[i].refs
            blk.refs.add_reference(blk)

    def references_same_values(self, mgr: BaseBlockManager, blkno: int) -> bool:
        """
        Checks if two blocks from two different block managers reference the
        same underlying values.
        """
        blk = self.blocks[blkno]
        return any(blk is ref() for ref in mgr.blocks[blkno].refs.referenced_blocks)

    def get_dtypes(self) -> npt.NDArray[np.object_]:
        dtypes = np.array([blk.dtype for blk in self.blocks], dtype=object)
        return dtypes.take(self.blknos)

    @property
    def arrays(self) -> list[ArrayLike]:
        """
        Quick access to the backing arrays of the Blocks.

        Only for compatibility with ArrayManager for testing convenience.
        Not to be used in actual code, and return value is not the same as the
        ArrayManager method (list of 1D arrays vs iterator of 2D ndarrays / 1D EAs).

        Warning! The returned arrays don't handle Copy-on-Write, so this should
        be used with caution (only in read-mode).
        """
        return [blk.values for blk in self.blocks]

    def __repr__(self) -> str:
        output = type(self).__name__
        for i, ax in enumerate(self.axes):
            if i == 0:
                output += f"\nItems: {ax}"
            else:
                output += f"\nAxis {i}: {ax}"

        for block in self.blocks:
            output += f"\n{block}"
        return output

    def apply(
        self,
        f,
        align_keys: list[str] | None = None,
        **kwargs,
    ) -> Self:
        """
        Iterate over the blocks, collect and create a new BlockManager.

        Parameters
        ----------
        f : str or callable
            Name of the Block method to apply.
        align_keys : List[str] or None, default None
        **kwargs
            Keywords to pass to `f`

        Returns
        -------
        BlockManager
        """
        assert "filter" not in kwargs

        align_keys = align_keys or []
        result_blocks: list[Block] = []
        # fillna: Series/DataFrame is responsible for making sure value is aligned

        aligned_args = {k: kwargs[k] for k in align_keys}

        for b in self.blocks:
            if aligned_args:
                for k, obj in aligned_args.items():
                    if isinstance(obj, (ABCSeries, ABCDataFrame)):
                        # The caller is responsible for ensuring that
                        # obj.axes[-1].equals(self.items)
                        if obj.ndim == 1:
                            kwargs[k] = obj.iloc[b.mgr_locs.indexer]._values
                        else:
                            kwargs[k] = obj.iloc[:, b.mgr_locs.indexer]._values
                    else:
                        # otherwise we have an ndarray
                        kwargs[k] = obj[b.mgr_locs.indexer]

            if callable(f):
                applied = b.apply(f, **kwargs)
            else:
                applied = getattr(b, f)(**kwargs)
            result_blocks = extend_blocks(applied, result_blocks)

        out = type(self).from_blocks(result_blocks, self.axes)
        return out

    # Alias so we can share code with ArrayManager
    apply_with_block = apply
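    # Minimal usage sketch (internal API; for exposition only): `f` is either
    # the name of a Block method or a callable applied to each block's values,
    # e.g.
    #     new_mgr = mgr.apply("astype", dtype=object, copy=True)
    #     new_mgr = mgr.apply(lambda values: values)  # identity over blocks
    # Each call's result blocks are collected into the returned manager.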

    def setitem(self, indexer, value, warn: bool = True) -> Self:
        """
        Set values with indexer.

        For SingleBlockManager, this backs s[indexer] = value
        """
        if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim:
            raise ValueError(f"Cannot set values with ndim > {self.ndim}")

        if warn and warn_copy_on_write() and not self._has_no_reference(0):
            warnings.warn(
                COW_WARNING_GENERAL_MSG,
                FutureWarning,
                stacklevel=find_stack_level(),
            )

        elif using_copy_on_write() and not self._has_no_reference(0):
            # this method is only called if there is a single block -> hardcoded 0
            # Split blocks to only copy the columns we want to modify
            if self.ndim == 2 and isinstance(indexer, tuple):
                blk_loc = self.blklocs[indexer[1]]
                if is_list_like(blk_loc) and blk_loc.ndim == 2:
                    blk_loc = np.squeeze(blk_loc, axis=0)
                elif not is_list_like(blk_loc):
                    # Keep dimension and copy data later
                    blk_loc = [blk_loc]  # type: ignore[assignment]
                if len(blk_loc) == 0:
                    return self.copy(deep=False)

                values = self.blocks[0].values
                if values.ndim == 2:
                    values = values[blk_loc]
                    # "T" has no attribute "_iset_split_block"
                    self._iset_split_block(  # type: ignore[attr-defined]
                        0, blk_loc, values
                    )
                    # first block equals values
                    self.blocks[0].setitem((indexer[0], np.arange(len(blk_loc))), value)
                    return self
            # No need to split if we either set all columns or on a single block
            # manager
            self = self.copy()

        return self.apply("setitem", indexer=indexer, value=value)

    def diff(self, n: int) -> Self:
        # only reached with self.ndim == 2
        return self.apply("diff", n=n)

    def astype(self, dtype, copy: bool | None = False, errors: str = "raise") -> Self:
        if copy is None:
            if using_copy_on_write():
                copy = False
            else:
                copy = True
        elif using_copy_on_write():
            copy = False

        return self.apply(
            "astype",
            dtype=dtype,
            copy=copy,
            errors=errors,
            using_cow=using_copy_on_write(),
        )

    def convert(self, copy: bool | None) -> Self:
        if copy is None:
            if using_copy_on_write():
                copy = False
            else:
                copy = True
        elif using_copy_on_write():
            copy = False

        return self.apply("convert", copy=copy, using_cow=using_copy_on_write())

    def convert_dtypes(self, **kwargs):
        if using_copy_on_write():
            copy = False
        else:
            copy = True

        return self.apply(
            "convert_dtypes", copy=copy, using_cow=using_copy_on_write(), **kwargs
        )

    def get_values_for_csv(
        self, *, float_format, date_format, decimal, na_rep: str = "nan", quoting=None
    ) -> Self:
        """
        Convert values to native types (strings / python objects) that are used
        in formatting (repr / csv).
        """
        return self.apply(
            "get_values_for_csv",
            na_rep=na_rep,
            quoting=quoting,
            float_format=float_format,
            date_format=date_format,
            decimal=decimal,
        )

    @property
    def any_extension_types(self) -> bool:
        """Whether any of the blocks in this manager are extension blocks"""
        return any(block.is_extension for block in self.blocks)

    @property
    def is_view(self) -> bool:
        """return a boolean if we are a single block and are a view"""
        if len(self.blocks) == 1:
            return self.blocks[0].is_view

        # It is technically possible to figure out which blocks are views
        # e.g. [ b.values.base is not None for b in self.blocks ]
        # but then we have the case of possibly some blocks being a view
        # and some blocks not. setting in theory is possible on the non-view
        # blocks w/o causing a SettingWithCopy raise/warn. But this is a bit
        # complicated

        return False

    def _get_data_subset(self, predicate: Callable) -> Self:
        blocks = [blk for blk in self.blocks if predicate(blk.values)]
        return self._combine(blocks)

    def get_bool_data(self) -> Self:
        """
        Select blocks that are bool-dtype and columns from object-dtype blocks
        that are all-bool.
        """

        new_blocks = []

        for blk in self.blocks:
            if blk.dtype == bool:
                new_blocks.append(blk)

            elif blk.is_object:
                nbs = blk._split()
                new_blocks.extend(nb for nb in nbs if nb.is_bool)

        return self._combine(new_blocks)

    def get_numeric_data(self) -> Self:
        numeric_blocks = [blk for blk in self.blocks if blk.is_numeric]
        if len(numeric_blocks) == len(self.blocks):
            # Avoid somewhat expensive _combine
            return self
        return self._combine(numeric_blocks)

    def _combine(self, blocks: list[Block], index: Index | None = None) -> Self:
        """return a new manager with the blocks"""
        if len(blocks) == 0:
            if self.ndim == 2:
                # retain our own Index dtype
                if index is not None:
                    axes = [self.items[:0], index]
                else:
                    axes = [self.items[:0]] + self.axes[1:]
                return self.make_empty(axes)
            return self.make_empty()

        # FIXME: optimization potential
        indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks]))
        inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0])

        new_blocks: list[Block] = []
        for b in blocks:
            nb = b.copy(deep=False)
            nb.mgr_locs = BlockPlacement(inv_indexer[nb.mgr_locs.indexer])
            new_blocks.append(nb)

        axes = list(self.axes)
        if index is not None:
            axes[-1] = index
        axes[0] = self.items.take(indexer)

        return type(self).from_blocks(new_blocks, axes)

    @property
    def nblocks(self) -> int:
        return len(self.blocks)

    def copy(self, deep: bool | None | Literal["all"] = True) -> Self:
        """
        Make deep or shallow copy of BlockManager

        Parameters
        ----------
        deep : bool, string or None, default True
            If False or None, return a shallow copy (do not copy data)
            If 'all', copy data and a deep copy of the index

        Returns
        -------
        BlockManager
        """
        if deep is None:
            if using_copy_on_write():
                # use shallow copy
                deep = False
            else:
                # preserve deep copy for BlockManager with copy=None
                deep = True

        # this preserves the notion of view copying of axes
        if deep:
            # hit in e.g. tests.io.json.test_pandas

            def copy_func(ax):
                return ax.copy(deep=True) if deep == "all" else ax.view()

            new_axes = [copy_func(ax) for ax in self.axes]
        else:
            if using_copy_on_write():
                new_axes = [ax.view() for ax in self.axes]
            else:
                new_axes = list(self.axes)

        res = self.apply("copy", deep=deep)
        res.axes = new_axes

        if self.ndim > 1:
            # Avoid needing to re-compute these
            blknos = self._blknos
            if blknos is not None:
                res._blknos = blknos.copy()
                res._blklocs = self._blklocs.copy()

        if deep:
            res._consolidate_inplace()
        return res
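    # Sketch of the copy semantics (assuming a 2D manager `mgr`):
    #     mgr.copy(deep=True)    # copies the data; axes become views
    #     mgr.copy(deep="all")   # copies the data and deep-copies the axes
    #     mgr.copy(deep=False)   # new manager over the same underlying arrays
    # Under Copy-on-Write, copy(deep=None) degrades to a shallow copy because
    # the blocks' reference tracking defers the actual copy until mutation.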

    def consolidate(self) -> Self:
        """
        Join together blocks having same dtype

        Returns
        -------
        y : BlockManager
        """
        if self.is_consolidated():
            return self

        bm = type(self)(self.blocks, self.axes, verify_integrity=False)
        bm._is_consolidated = False
        bm._consolidate_inplace()
        return bm

    def reindex_indexer(
        self,
        new_axis: Index,
        indexer: npt.NDArray[np.intp] | None,
        axis: AxisInt,
        fill_value=None,
        allow_dups: bool = False,
        copy: bool | None = True,
        only_slice: bool = False,
        *,
        use_na_proxy: bool = False,
    ) -> Self:
        """
        Parameters
        ----------
        new_axis : Index
        indexer : ndarray[intp] or None
            pandas indexer with -1's only.
        axis : int
        fill_value : object, default None
        allow_dups : bool, default False
        copy : bool or None, default True
            If None, regard as False to get shallow copy.
        only_slice : bool, default False
            Whether to take views, not copies, along columns.
        use_na_proxy : bool, default False
            Whether to use a np.void ndarray for newly introduced columns.
        """
        if copy is None:
            if using_copy_on_write():
                # use shallow copy
                copy = False
            else:
                # preserve deep copy for BlockManager with copy=None
                copy = True

        if indexer is None:
            if new_axis is self.axes[axis] and not copy:
                return self

            result = self.copy(deep=copy)
            result.axes = list(self.axes)
            result.axes[axis] = new_axis
            return result

        # Should be intp, but in some cases we get int64 on 32bit builds
        assert isinstance(indexer, np.ndarray)

        # some axes don't allow reindexing with dups
        if not allow_dups:
            self.axes[axis]._validate_can_reindex(indexer)

        if axis >= self.ndim:
            raise IndexError("Requested axis not found in manager")

        if axis == 0:
            new_blocks = self._slice_take_blocks_ax0(
                indexer,
                fill_value=fill_value,
                only_slice=only_slice,
                use_na_proxy=use_na_proxy,
            )
        else:
            new_blocks = [
                blk.take_nd(
                    indexer,
                    axis=1,
                    fill_value=(
                        fill_value if fill_value is not None else blk.fill_value
                    ),
                )
                for blk in self.blocks
            ]

        new_axes = list(self.axes)
        new_axes[axis] = new_axis

        new_mgr = type(self).from_blocks(new_blocks, new_axes)
        if axis == 1:
            # We can avoid the need to rebuild these
            new_mgr._blknos = self.blknos.copy()
            new_mgr._blklocs = self.blklocs.copy()
        return new_mgr
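    # Illustrative call (internal API; values hypothetical): reindexing the
    # items of a manager whose columns are ["a", "b", "c"] onto ["a", "c", "new"]
    # would look like
    #     new_axis = Index(["a", "c", "new"])
    #     indexer = np.array([0, 2, -1], dtype=np.intp)  # -1 marks a new column
    #     mgr2 = mgr.reindex_indexer(new_axis, indexer, axis=0)
    # where the -1 entry becomes an all-NA column filled with fill_value.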

    def _slice_take_blocks_ax0(
        self,
        slice_or_indexer: slice | np.ndarray,
        fill_value=lib.no_default,
        only_slice: bool = False,
        *,
        use_na_proxy: bool = False,
        ref_inplace_op: bool = False,
    ) -> list[Block]:
        """
        Slice/take blocks along axis=0.

        Overloaded for SingleBlock

        Parameters
        ----------
        slice_or_indexer : slice or np.ndarray[int64]
        fill_value : scalar, default lib.no_default
        only_slice : bool, default False
            If True, we always return views on existing arrays, never copies.
            This is used when called from ops.blockwise.operate_blockwise.
        use_na_proxy : bool, default False
            Whether to use a np.void ndarray for newly introduced columns.
        ref_inplace_op : bool, default False
            Don't track refs if True because we operate inplace

        Returns
        -------
        new_blocks : list of Block
        """
        allow_fill = fill_value is not lib.no_default

        sl_type, slobj, sllen = _preprocess_slice_or_indexer(
            slice_or_indexer, self.shape[0], allow_fill=allow_fill
        )

        if self.is_single_block:
            blk = self.blocks[0]

            if sl_type == "slice":
                # GH#32959 EABlock would fail since we can't make 0-width
                # TODO(EA2D): special casing unnecessary with 2D EAs
                if sllen == 0:
                    return []
                bp = BlockPlacement(slice(0, sllen))
                return [blk.getitem_block_columns(slobj, new_mgr_locs=bp)]
            elif not allow_fill or self.ndim == 1:
                if allow_fill and fill_value is None:
                    fill_value = blk.fill_value

                if not allow_fill and only_slice:
                    # GH#33597 slice instead of take, so we get
                    # views instead of copies
                    blocks = [
                        blk.getitem_block_columns(
                            slice(ml, ml + 1),
                            new_mgr_locs=BlockPlacement(i),
                            ref_inplace_op=ref_inplace_op,
                        )
                        for i, ml in enumerate(slobj)
                    ]
                    return blocks
                else:
                    bp = BlockPlacement(slice(0, sllen))
                    return [
                        blk.take_nd(
                            slobj,
                            axis=0,
                            new_mgr_locs=bp,
                            fill_value=fill_value,
                        )
                    ]

        if sl_type == "slice":
            blknos = self.blknos[slobj]
            blklocs = self.blklocs[slobj]
        else:
            blknos = algos.take_nd(
                self.blknos, slobj, fill_value=-1, allow_fill=allow_fill
            )
            blklocs = algos.take_nd(
                self.blklocs, slobj, fill_value=-1, allow_fill=allow_fill
            )

        # When filling blknos, make sure blknos is updated before appending to
        # blocks list, that way new blkno is exactly len(blocks).
        blocks = []
        group = not only_slice
        for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=group):
            if blkno == -1:
                # If we've got here, fill_value was not lib.no_default

                blocks.append(
                    self._make_na_block(
                        placement=mgr_locs,
                        fill_value=fill_value,
                        use_na_proxy=use_na_proxy,
                    )
                )
            else:
                blk = self.blocks[blkno]

                # Otherwise, slicing along items axis is necessary.
                if not blk._can_consolidate and not blk._validate_ndim:
                    # i.e. we dont go through here for DatetimeTZBlock
                    # A non-consolidatable block, it's easy, because there's
                    # only one item and each mgr loc is a copy of that single
                    # item.
                    deep = not (only_slice or using_copy_on_write())
                    for mgr_loc in mgr_locs:
                        newblk = blk.copy(deep=deep)
                        newblk.mgr_locs = BlockPlacement(slice(mgr_loc, mgr_loc + 1))
                        blocks.append(newblk)

                else:
                    # GH#32779 to avoid the performance penalty of copying,
                    # we may try to only slice
                    taker = blklocs[mgr_locs.indexer]
                    max_len = max(len(mgr_locs), taker.max() + 1)
                    if only_slice or using_copy_on_write():
                        taker = lib.maybe_indices_to_slice(taker, max_len)

                    if isinstance(taker, slice):
                        nb = blk.getitem_block_columns(taker, new_mgr_locs=mgr_locs)
                        blocks.append(nb)
                    elif only_slice:
                        # GH#33597 slice instead of take, so we get
                        # views instead of copies
                        for i, ml in zip(taker, mgr_locs):
                            slc = slice(i, i + 1)
                            bp = BlockPlacement(ml)
                            nb = blk.getitem_block_columns(slc, new_mgr_locs=bp)
                            # We have np.shares_memory(nb.values, blk.values)
                            blocks.append(nb)
                    else:
                        nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs)
                        blocks.append(nb)

        return blocks

    def _make_na_block(
        self, placement: BlockPlacement, fill_value=None, use_na_proxy: bool = False
    ) -> Block:
        # Note: we only get here with self.ndim == 2

        if use_na_proxy:
            assert fill_value is None
            shape = (len(placement), self.shape[1])
            vals = np.empty(shape, dtype=np.void)
            nb = NumpyBlock(vals, placement, ndim=2)
            return nb

        if fill_value is None:
            fill_value = np.nan

        shape = (len(placement), self.shape[1])

        dtype, fill_value = infer_dtype_from_scalar(fill_value)
        block_values = make_na_array(dtype, shape, fill_value)
        return new_block_2d(block_values, placement=placement)

    def take(
        self,
        indexer: npt.NDArray[np.intp],
        axis: AxisInt = 1,
        verify: bool = True,
    ) -> Self:
        """
        Take items along any axis.

        Parameters
        ----------
        indexer : np.ndarray[np.intp]
        axis : int, default 1
        verify : bool, default True
            Check that all entries are between 0 and len(self) - 1, inclusive.
            Pass verify=False if this check has been done by the caller.

        Returns
        -------
        BlockManager
        """
        # Caller is responsible for ensuring indexer annotation is accurate

        n = self.shape[axis]
        indexer = maybe_convert_indices(indexer, n, verify=verify)

        new_labels = self.axes[axis].take(indexer)
        return self.reindex_indexer(
            new_axis=new_labels,
            indexer=indexer,
            axis=axis,
            allow_dups=True,
            copy=None,
        )


class BlockManager(libinternals.BlockManager, BaseBlockManager):
    """
    BaseBlockManager that holds 2D blocks.
    """

    ndim = 2

    # ----------------------------------------------------------------
    # Constructors

    def __init__(
        self,
        blocks: Sequence[Block],
        axes: Sequence[Index],
        verify_integrity: bool = True,
    ) -> None:
        if verify_integrity:
            # Assertion disabled for performance
            # assert all(isinstance(x, Index) for x in axes)

            for block in blocks:
                if self.ndim != block.ndim:
                    raise AssertionError(
                        f"Number of Block dimensions ({block.ndim}) must equal "
                        f"number of axes ({self.ndim})"
                    )
                # As of 2.0, the caller is responsible for ensuring that
                # DatetimeTZBlock with block.ndim == 2 has block.values.ndim == 2;
                # previously there was a special check for fastparquet compat.

            self._verify_integrity()

    def _verify_integrity(self) -> None:
        mgr_shape = self.shape
        tot_items = sum(len(x.mgr_locs) for x in self.blocks)
        for block in self.blocks:
            if block.shape[1:] != mgr_shape[1:]:
                raise_construction_error(tot_items, block.shape[1:], self.axes)
        if len(self.items) != tot_items:
            raise AssertionError(
                "Number of manager items must equal union of "
                f"block items\n# manager items: {len(self.items)}, # "
                f"tot_items: {tot_items}"
            )

    @classmethod
    def from_blocks(cls, blocks: list[Block], axes: list[Index]) -> Self:
        """
        Constructor for BlockManager and SingleBlockManager with same signature.
        """
        return cls(blocks, axes, verify_integrity=False)

    # ----------------------------------------------------------------
    # Indexing

    def fast_xs(self, loc: int) -> SingleBlockManager:
        """
        Return the array corresponding to `frame.iloc[loc]`.

        Parameters
        ----------
        loc : int

        Returns
        -------
        SingleBlockManager
        """
        if len(self.blocks) == 1:
            # TODO: this could be wrong if blk.mgr_locs is not slice(None)-like;
            # is this ruled out in the general case?
            result = self.blocks[0].iget((slice(None), loc))
            # in the case of a single block, the new block is a view
            bp = BlockPlacement(slice(0, len(result)))
            block = new_block(
                result,
                placement=bp,
                ndim=1,
                refs=self.blocks[0].refs,
            )
            return SingleBlockManager(block, self.axes[0])

        dtype = interleaved_dtype([blk.dtype for blk in self.blocks])

        n = len(self)

        if isinstance(dtype, ExtensionDtype):
            # TODO: use object dtype as workaround for non-performant
            # EA.__setitem__ methods. (primarily ArrowExtensionArray.__setitem__
            # when iteratively setting individual values)
            # https://github.com/pandas-dev/pandas/pull/54508#issuecomment-1675827918
            result = np.empty(n, dtype=object)
        else:
            result = np.empty(n, dtype=dtype)
            result = ensure_wrapped_if_datetimelike(result)

        for blk in self.blocks:
            # Such assignment may incorrectly coerce NaT to None
            # result[blk.mgr_locs] = blk._slice((slice(None), loc))
            for i, rl in enumerate(blk.mgr_locs):
                result[rl] = blk.iget((i, loc))

        if isinstance(dtype, ExtensionDtype):
            cls = dtype.construct_array_type()
            result = cls._from_sequence(result, dtype=dtype)

        bp = BlockPlacement(slice(0, len(result)))
        block = new_block(result, placement=bp, ndim=1)
        return SingleBlockManager(block, self.axes[0])
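    # Illustrative sketch (internal API; frame layout hypothetical):
    #     df = pd.DataFrame({"a": [1.0, 2.0], "b": [3, 4]})
    #     row = df._mgr.fast_xs(0)   # SingleBlockManager backing df.iloc[0]
    #     row.dtype                  # float64, the interleaved dtype
    # With a single block the row is a view on that block; with mixed dtypes
    # the values are interleaved column by column, so the row is a fresh copy.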

    def iget(self, i: int, track_ref: bool = True) -> SingleBlockManager:
        """
        Return the data as a SingleBlockManager.
        """
        block = self.blocks[self.blknos[i]]
        values = block.iget(self.blklocs[i])

        # shortcut for selecting a single dimension from a 2-dim BM
        bp = BlockPlacement(slice(0, len(values)))
        nb = type(block)(
            values, placement=bp, ndim=1, refs=block.refs if track_ref else None
        )
        return SingleBlockManager(nb, self.axes[1])

    def iget_values(self, i: int) -> ArrayLike:
        """
        Return the data for column i as the values (ndarray or ExtensionArray).

        Warning! The returned array is a view but doesn't handle Copy-on-Write,
        so this should be used with caution.
        """
        # TODO(CoW) making the arrays read-only might make this safer to use?
        block = self.blocks[self.blknos[i]]
        values = block.iget(self.blklocs[i])
        return values

    @property
    def column_arrays(self) -> list[np.ndarray]:
        """
        Used in the JSON C code to access column arrays.
        This optimizes compared to using `iget_values` by converting each
        block's values for JSON only once, instead of once per column.

        Warning! This doesn't handle Copy-on-Write, so should be used with
        caution (current use case of consuming this in the JSON code is fine).
        """
        # This is an optimized equivalent to
        # result = [self.iget_values(i) for i in range(len(self.items))]
        result: list[np.ndarray | None] = [None] * len(self.items)

        for blk in self.blocks:
            mgr_locs = blk._mgr_locs
            values = blk.array_values._values_for_json()
            if values.ndim == 1:
                # TODO(EA2D): special casing not needed with 2D EAs
                result[mgr_locs[0]] = values

            else:
                for i, loc in enumerate(mgr_locs):
                    result[loc] = values[i]

        # error: Incompatible return value type (got "List[None]",
        # expected "List[ndarray[Any, Any]]")
        return result  # type: ignore[return-value]

    def iset(
        self,
        loc: int | slice | np.ndarray,
        value: ArrayLike,
        inplace: bool = False,
        refs: BlockValuesRefs | None = None,
    ) -> None:
        """
        Set new item in-place. Does not consolidate. Adds new Block if not
        contained in the current set of items
        """

        # FIXME: refactor, clearly separate broadcasting & zip-like assignment
        # can prob also fix the various if tests for sparse/categorical
        if self._blklocs is None and self.ndim > 1:
            self._rebuild_blknos_and_blklocs()

        # Note: we exclude DTA/TDA here
        value_is_extension_type = is_1d_only_ea_dtype(value.dtype)
        if not value_is_extension_type:
            if value.ndim == 2:
                value = value.T
            else:
                value = ensure_block_shape(value, ndim=2)

            if value.shape[1:] != self.shape[1:]:
                raise AssertionError(
                    "Shape of new values must be compatible with manager shape"
                )

        if lib.is_integer(loc):
            # We have 6 tests where loc is _not_ an int.
            # In this case, get_blkno_placements will yield only one tuple,
            # containing (self._blknos[loc], BlockPlacement(slice(0, 1, 1)))

            # Check if we can use _iset_single fastpath
            loc = cast(int, loc)
            blkno = self.blknos[loc]
            blk = self.blocks[blkno]
            if len(blk._mgr_locs) == 1:  # TODO: fastest way to check this?
                return self._iset_single(
                    loc,
                    value,
                    inplace=inplace,
                    blkno=blkno,
                    blk=blk,
                    refs=refs,
                )

            # error: Incompatible types in assignment (expression has type
            # "List[Union[int, slice, ndarray]]", variable has type "Union[int,
            # slice, ndarray]")
            loc = [loc]  # type: ignore[assignment]

        # categorical/sparse/datetimetz
        if value_is_extension_type:

            def value_getitem(placement):
                return value

        else:

            def value_getitem(placement):
                return value[placement.indexer]

        # Accessing public blknos ensures the public versions are initialized
        blknos = self.blknos[loc]
        blklocs = self.blklocs[loc].copy()

        unfit_mgr_locs = []
        unfit_val_locs = []
        removed_blknos = []
        for blkno_l, val_locs in libinternals.get_blkno_placements(blknos, group=True):
            blk = self.blocks[blkno_l]
            blk_locs = blklocs[val_locs.indexer]
            if inplace and blk.should_store(value):
                # Updating inplace -> check if we need to do Copy-on-Write
                if using_copy_on_write() and not self._has_no_reference_block(blkno_l):
                    self._iset_split_block(
                        blkno_l, blk_locs, value_getitem(val_locs), refs=refs
                    )
                else:
                    blk.set_inplace(blk_locs, value_getitem(val_locs))
                    continue
            else:
                unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs])
                unfit_val_locs.append(val_locs)

                # If all block items are unfit, schedule the block for removal.
                if len(val_locs) == len(blk.mgr_locs):
                    removed_blknos.append(blkno_l)
                    continue
                else:
                    # Defer setting the new values to enable consolidation
                    self._iset_split_block(blkno_l, blk_locs, refs=refs)

        if len(removed_blknos):
            # Remove blocks & update blknos accordingly
            is_deleted = np.zeros(self.nblocks, dtype=np.bool_)
            is_deleted[removed_blknos] = True

            new_blknos = np.empty(self.nblocks, dtype=np.intp)
            new_blknos.fill(-1)
            new_blknos[~is_deleted] = np.arange(self.nblocks - len(removed_blknos))
            self._blknos = new_blknos[self._blknos]
            self.blocks = tuple(
                blk for i, blk in enumerate(self.blocks) if i not in set(removed_blknos)
            )

        if unfit_val_locs:
            unfit_idxr = np.concatenate(unfit_mgr_locs)
            unfit_count = len(unfit_idxr)

            new_blocks: list[Block] = []
            if value_is_extension_type:
                # This code (ab-)uses the fact that EA blocks contain only
                # one item.
                # TODO(EA2D): special casing unnecessary with 2D EAs
                new_blocks.extend(
                    new_block_2d(
                        values=value,
                        placement=BlockPlacement(slice(mgr_loc, mgr_loc + 1)),
                        refs=refs,
                    )
                    for mgr_loc in unfit_idxr
                )

                self._blknos[unfit_idxr] = np.arange(unfit_count) + len(self.blocks)
                self._blklocs[unfit_idxr] = 0

            else:
                # unfit_val_locs contains BlockPlacement objects
                unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:])

                new_blocks.append(
                    new_block_2d(
                        values=value_getitem(unfit_val_items),
                        placement=BlockPlacement(unfit_idxr),
                        refs=refs,
                    )
                )

                self._blknos[unfit_idxr] = len(self.blocks)
                self._blklocs[unfit_idxr] = np.arange(unfit_count)

            self.blocks += tuple(new_blocks)

            # Newly created block's dtype may already be present.
            self._known_consolidated = False

    def _iset_split_block(
        self,
        blkno_l: int,
        blk_locs: np.ndarray | list[int],
        value: ArrayLike | None = None,
        refs: BlockValuesRefs | None = None,
    ) -> None:
        """Removes columns from a block by splitting the block.

        Avoids copying the whole block through slicing and updates the manager
        after determining the new block structure. Optionally adds a new block,
        otherwise this has to be done by the caller.

        Parameters
        ----------
        blkno_l: The block number to operate on, relevant for updating the manager
        blk_locs: The locations of our block that should be deleted.
        value: The value to set as a replacement.
        refs: The reference tracking object of the value to set.
        """
        blk = self.blocks[blkno_l]

        if self._blklocs is None:
            self._rebuild_blknos_and_blklocs()

        nbs_tup = tuple(blk.delete(blk_locs))
        if value is not None:
            locs = blk.mgr_locs.as_array[blk_locs]
            first_nb = new_block_2d(value, BlockPlacement(locs), refs=refs)
        else:
            first_nb = nbs_tup[0]
            nbs_tup = tuple(nbs_tup[1:])

        nr_blocks = len(self.blocks)
        blocks_tup = (
            self.blocks[:blkno_l] + (first_nb,) + self.blocks[blkno_l + 1 :] + nbs_tup
        )
        self.blocks = blocks_tup

        if not nbs_tup and value is not None:
            # No need to update anything if split did not happen
            return

        self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb))

        for i, nb in enumerate(nbs_tup):
            self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb))
            self._blknos[nb.mgr_locs.indexer] = i + nr_blocks

    def _iset_single(
        self,
        loc: int,
        value: ArrayLike,
        inplace: bool,
        blkno: int,
        blk: Block,
        refs: BlockValuesRefs | None = None,
    ) -> None:
        """
        Fastpath for iset when we are only setting a single position and
        the Block currently in that position is itself single-column.

        In this case we can swap out the entire Block and blklocs and blknos
        are unaffected.
        """
        # Caller is responsible for verifying value.shape

        if inplace and blk.should_store(value):
            copy = False
            if using_copy_on_write() and not self._has_no_reference_block(blkno):
                # perform Copy-on-Write and clear the reference
                copy = True
            iloc = self.blklocs[loc]
            blk.set_inplace(slice(iloc, iloc + 1), value, copy=copy)
            return

        nb = new_block_2d(value, placement=blk._mgr_locs, refs=refs)
        old_blocks = self.blocks
        new_blocks = old_blocks[:blkno] + (nb,) + old_blocks[blkno + 1 :]
        self.blocks = new_blocks
        return

    def column_setitem(
        self, loc: int, idx: int | slice | np.ndarray, value, inplace_only: bool = False
    ) -> None:
        """
        Set values ("setitem") into a single column (not setting the full column).

        This is a method on the BlockManager level, to avoid creating an
        intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`)
        """
        needs_to_warn = False
        if warn_copy_on_write() and not self._has_no_reference(loc):
            if not isinstance(
                self.blocks[self.blknos[loc]].values,
                (ArrowExtensionArray, ArrowStringArray),
            ):
                # We might raise if we are in an expansion case, so defer
                # warning till we actually updated
                needs_to_warn = True

        elif using_copy_on_write() and not self._has_no_reference(loc):
            blkno = self.blknos[loc]
            # Split blocks to only copy the column we want to modify
            blk_loc = self.blklocs[loc]
            # Copy our values
            values = self.blocks[blkno].values
            if values.ndim == 1:
                values = values.copy()
            else:
                # Use [blk_loc] as indexer to keep ndim=2, this already results in a
                # copy
                values = values[[blk_loc]]
            self._iset_split_block(blkno, [blk_loc], values)

        # this manager is only created temporarily to mutate the values in place
        # so don't track references, otherwise the `setitem` would perform CoW again
        col_mgr = self.iget(loc, track_ref=False)
        if inplace_only:
            col_mgr.setitem_inplace(idx, value)
        else:
            new_mgr = col_mgr.setitem((idx,), value)
            self.iset(loc, new_mgr._block.values, inplace=True)

        if needs_to_warn:
            warnings.warn(
                COW_WARNING_GENERAL_MSG,
                FutureWarning,
                stacklevel=find_stack_level(),
            )

    def insert(self, loc: int, item: Hashable, value: ArrayLike, refs=None) -> None:
        """
        Insert item at selected position.

        Parameters
        ----------
        loc : int
        item : hashable
        value : np.ndarray or ExtensionArray
        refs : The reference tracking object of the value to set.
        """
        with warnings.catch_warnings():
            # TODO: re-issue this with setitem-specific message?
            warnings.filterwarnings(
                "ignore",
                "The behavior of Index.insert with object-dtype is deprecated",
                category=FutureWarning,
            )
            new_axis = self.items.insert(loc, item)

        if value.ndim == 2:
            value = value.T
            if len(value) > 1:
                raise ValueError(
                    f"Expected a 1D array, got an array with shape {value.T.shape}"
                )
        else:
            value = ensure_block_shape(value, ndim=self.ndim)

        bp = BlockPlacement(slice(loc, loc + 1))
        block = new_block_2d(values=value, placement=bp, refs=refs)

        if not len(self.blocks):
            # Fastpath
            self._blklocs = np.array([0], dtype=np.intp)
            self._blknos = np.array([0], dtype=np.intp)
        else:
            self._insert_update_mgr_locs(loc)
            self._insert_update_blklocs_and_blknos(loc)

        self.axes[0] = new_axis
        self.blocks += (block,)

        self._known_consolidated = False

        if sum(not block.is_extension for block in self.blocks) > 100:
            warnings.warn(
                "DataFrame is highly fragmented. This is usually the result "
                "of calling `frame.insert` many times, which has poor performance. "
                "Consider joining all columns at once using pd.concat(axis=1) "
                "instead. To get a de-fragmented frame, use `newframe = frame.copy()`",
                PerformanceWarning,
                stacklevel=find_stack_level(),
            )

    def _insert_update_mgr_locs(self, loc) -> None:
        """
        When inserting a new Block at location 'loc', we increment
        all of the mgr_locs of blocks above that by one.
        """
        for blkno, count in _fast_count_smallints(self.blknos[loc:]):
            # .620 this way, .326 of which is in increment_above
            blk = self.blocks[blkno]
            blk._mgr_locs = blk._mgr_locs.increment_above(loc)

    def _insert_update_blklocs_and_blknos(self, loc) -> None:
        """
        When inserting a new Block at location 'loc', we update our
        _blklocs and _blknos.
        """

        # Accessing public blklocs ensures the public versions are initialized
        if loc == self.blklocs.shape[0]:
            # np.append is a lot faster, let's use it if we can.
            self._blklocs = np.append(self._blklocs, 0)
            self._blknos = np.append(self._blknos, len(self.blocks))
        elif loc == 0:
            # np.append is a lot faster, let's use it if we can.
            self._blklocs = np.append(self._blklocs[::-1], 0)[::-1]
            self._blknos = np.append(self._blknos[::-1], len(self.blocks))[::-1]
        else:
            new_blklocs, new_blknos = libinternals.update_blklocs_and_blknos(
                self.blklocs, self.blknos, loc, len(self.blocks)
            )
            self._blklocs = new_blklocs
            self._blknos = new_blknos

    def idelete(self, indexer) -> BlockManager:
        """
        Delete selected locations, returning a new BlockManager.
        """
        is_deleted = np.zeros(self.shape[0], dtype=np.bool_)
        is_deleted[indexer] = True
        taker = (~is_deleted).nonzero()[0]

        nbs = self._slice_take_blocks_ax0(taker, only_slice=True, ref_inplace_op=True)
        new_columns = self.items[~is_deleted]
        axes = [new_columns, self.axes[1]]
        return type(self)(tuple(nbs), axes, verify_integrity=False)
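    # Sketch (internal API; shapes hypothetical): deleting column 1 of a
    # three-column manager keeps taker == [0, 2], and only_slice=True means
    # the surviving columns stay views on the original blocks:
    #     new_mgr = mgr.idelete(np.array([1]))
    # ref_inplace_op=True skips reference tracking since this backs in-place
    # deletion (e.g. `del df[col]`), where the old manager is discarded.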

    # ----------------------------------------------------------------
    # Block-wise Operation

    def grouped_reduce(self, func: Callable) -> Self:
        """
        Apply grouped reduction function blockwise, returning a new BlockManager.

        Parameters
        ----------
        func : grouped reduction function

        Returns
        -------
        BlockManager
        """
        result_blocks: list[Block] = []

        for blk in self.blocks:
            if blk.is_object:
                # split on object-dtype blocks bc some columns may raise
                # while others do not.
                for sb in blk._split():
                    applied = sb.apply(func)
                    result_blocks = extend_blocks(applied, result_blocks)
            else:
                applied = blk.apply(func)
                result_blocks = extend_blocks(applied, result_blocks)

        if len(result_blocks) == 0:
            nrows = 0
        else:
            nrows = result_blocks[0].values.shape[-1]
        index = Index(range(nrows))

        return type(self).from_blocks(result_blocks, [self.axes[0], index])

    def reduce(self, func: Callable) -> Self:
        """
        Apply reduction function blockwise, returning a single-row BlockManager.

        Parameters
        ----------
        func : reduction function

        Returns
        -------
        BlockManager
        """
        # If 2D, we assume that we're operating column-wise
        assert self.ndim == 2

        res_blocks: list[Block] = []
        for blk in self.blocks:
            nbs = blk.reduce(func)
            res_blocks.extend(nbs)

        index = Index([None])  # placeholder
        new_mgr = type(self).from_blocks(res_blocks, [self.items, index])
        return new_mgr

    def operate_blockwise(self, other: BlockManager, array_op) -> BlockManager:
        """
        Apply array_op blockwise with another (aligned) BlockManager.
        """
        return operate_blockwise(self, other, array_op)

    def _equal_values(self: BlockManager, other: BlockManager) -> bool:
        """
        Used in .equals defined in base class. Only check the column values
        assuming shape and indexes have already been checked.
        """
        return blockwise_all(self, other, array_equals)

    def quantile(
        self,
        *,
        qs: Index,  # with dtype float64
        interpolation: QuantileInterpolation = "linear",
    ) -> Self:
        """
        Iterate over blocks applying quantile reduction.
        This routine is intended for reduction type operations and
        will do inference on the generated blocks.

        Parameters
        ----------
        interpolation : type of interpolation, default 'linear'
        qs : list of the quantiles to be computed

        Returns
        -------
        BlockManager
        """
        # Series dispatches to DataFrame for quantile, which allows us to
        # simplify some of the code here and in the blocks
        assert self.ndim >= 2
        assert is_list_like(qs)  # caller is responsible for this

        new_axes = list(self.axes)
        new_axes[1] = Index(qs, dtype=np.float64)

        blocks = [
            blk.quantile(qs=qs, interpolation=interpolation) for blk in self.blocks
        ]

        return type(self)(blocks, new_axes)

    # ----------------------------------------------------------------

    def unstack(self, unstacker, fill_value) -> BlockManager:
        """
        Return a BlockManager with all blocks unstacked.

        Parameters
        ----------
        unstacker : reshape._Unstacker
        fill_value : Any
            fill_value for newly introduced missing values.

        Returns
        -------
        unstacked : BlockManager
        """
        new_columns = unstacker.get_new_columns(self.items)
        new_index = unstacker.new_index

        allow_fill = not unstacker.mask_all
        if allow_fill:
            # calculating the full mask once and passing it to Block._unstack is
            # faster than recalculating it in each repeated call
            new_mask2D = (~unstacker.mask).reshape(*unstacker.full_shape)
            needs_masking = new_mask2D.any(axis=0)
        else:
            needs_masking = np.zeros(unstacker.full_shape[1], dtype=bool)

        new_blocks: list[Block] = []
        columns_mask: list[np.ndarray] = []

        if len(self.items) == 0:
            factor = 1
        else:
            fac = len(new_columns) / len(self.items)
            assert fac == int(fac)
            factor = int(fac)

        for blk in self.blocks:
            mgr_locs = blk.mgr_locs
            new_placement = mgr_locs.tile_for_unstack(factor)

            blocks, mask = blk._unstack(
                unstacker,
                fill_value,
                new_placement=new_placement,
                needs_masking=needs_masking,
            )

            new_blocks.extend(blocks)
            columns_mask.extend(mask)

            # Block._unstack should ensure this holds,
            assert mask.sum() == sum(len(nb._mgr_locs) for nb in blocks)
            # In turn this ensures that in the BlockManager call below
            # we have len(new_columns) == sum(x.shape[0] for x in new_blocks)
            # which suffices to allow us to pass verify_integrity=False

        new_columns = new_columns[columns_mask]

        bm = BlockManager(new_blocks, [new_columns, new_index], verify_integrity=False)
        return bm

    def to_dict(self) -> dict[str, Self]:
        """
        Return a dict of str(dtype) -> BlockManager

        Returns
        -------
        values : a dict of dtype -> BlockManager
        """

        bd: dict[str, list[Block]] = {}
        for b in self.blocks:
            bd.setdefault(str(b.dtype), []).append(b)

        # TODO(EA2D): the combine will be unnecessary with 2D EAs
        return {dtype: self._combine(blocks) for dtype, blocks in bd.items()}
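    # Illustrative result (layout hypothetical): for a frame with two float64
    # columns and one int64 column,
    #     mgr.to_dict()
    #     -> {"float64": <manager with 2 items>, "int64": <manager with 1 item>}
    # with each sub-manager re-labeled via _combine so its items are contiguous.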

    def as_array(
        self,
        dtype: np.dtype | None = None,
        copy: bool = False,
        na_value: object = lib.no_default,
    ) -> np.ndarray:
        """
        Convert the blockmanager data into a numpy array.

        Parameters
        ----------
        dtype : np.dtype or None, default None
            Data type of the return array.
        copy : bool, default False
            If True then guarantee that a copy is returned. A value of
            False does not guarantee that the underlying data is not
            copied.
        na_value : object, default lib.no_default
            Value to be used as the missing value sentinel.

        Returns
        -------
        arr : ndarray
        """
        passed_nan = lib.is_float(na_value) and isna(na_value)

        if len(self.blocks) == 0:
            arr = np.empty(self.shape, dtype=float)
            return arr.transpose()

        if self.is_single_block:
            blk = self.blocks[0]

            if na_value is not lib.no_default:
                # We want to copy when na_value is provided to avoid
                # mutating the original object
                if lib.is_np_dtype(blk.dtype, "f") and passed_nan:
                    # We are already numpy-float and na_value=np.nan
                    pass
                else:
                    copy = True

            if blk.is_extension:
                # Avoid implicit conversion of extension blocks to object

                # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no
                # attribute "to_numpy"
                arr = blk.values.to_numpy(  # type: ignore[union-attr]
                    dtype=dtype,
                    na_value=na_value,
                    copy=copy,
                ).reshape(blk.shape)
            elif not copy:
                arr = np.asarray(blk.values, dtype=dtype)
            else:
                arr = np.array(blk.values, dtype=dtype, copy=copy)

            if using_copy_on_write() and not copy:
                arr = arr.view()
                arr.flags.writeable = False
        else:
            arr = self._interleave(dtype=dtype, na_value=na_value)
            # The underlying data was copied within _interleave, so no need
            # to further copy if copy=True or setting na_value

        if na_value is lib.no_default:
            pass
        elif arr.dtype.kind == "f" and passed_nan:
            pass
        else:
            arr[isna(arr)] = na_value

        return arr.transpose()
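    # Minimal sketch of the copy behavior (internal API; frame hypothetical):
    #     df = pd.DataFrame({"a": [1.0, 2.0]})   # single float block
    #     arr = df._mgr.as_array()                # may be a (read-only) view
    #     arr = df._mgr.as_array(copy=True)       # always a fresh ndarray
    #     arr = df._mgr.as_array(na_value=0.0)    # forces a copy before filling
    # Mixed-dtype managers always go through _interleave and therefore copy.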

    def _interleave(
        self,
        dtype: np.dtype | None = None,
        na_value: object = lib.no_default,
    ) -> np.ndarray:
        """
        Return ndarray from blocks with specified item order
        Items must be contained in the blocks
        """
        if not dtype:
            # Incompatible types in assignment (expression has type
            # "Optional[Union[dtype[Any], ExtensionDtype]]", variable has
            # type "Optional[dtype[Any]]")
            dtype = interleaved_dtype(  # type: ignore[assignment]
                [blk.dtype for blk in self.blocks]
            )

        # error: Argument 1 to "ensure_np_dtype" has incompatible type
        # "Optional[dtype[Any]]"; expected "Union[dtype[Any], ExtensionDtype]"
        dtype = ensure_np_dtype(dtype)  # type: ignore[arg-type]
        result = np.empty(self.shape, dtype=dtype)

        itemmask = np.zeros(self.shape[0])

        if dtype == np.dtype("object") and na_value is lib.no_default:
            # much more performant than using to_numpy below
            for blk in self.blocks:
                rl = blk.mgr_locs
                arr = blk.get_values(dtype)
                result[rl.indexer] = arr
                itemmask[rl.indexer] = 1
            return result

        for blk in self.blocks:
            rl = blk.mgr_locs
            if blk.is_extension:
                # Avoid implicit conversion of extension blocks to object

                # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no
                # attribute "to_numpy"
                arr = blk.values.to_numpy(  # type: ignore[union-attr]
                    dtype=dtype,
                    na_value=na_value,
                )
            else:
                arr = blk.get_values(dtype)
            result[rl.indexer] = arr
            itemmask[rl.indexer] = 1

        if not itemmask.all():
            raise AssertionError("Some items were not contained in blocks")

        return result

    # ----------------------------------------------------------------
    # Consolidation

    def is_consolidated(self) -> bool:
        """
        Return True if there is at most one block per dtype, i.e. the
        manager is consolidated.
        """
        if not self._known_consolidated:
            self._consolidate_check()
        return self._is_consolidated

    def _consolidate_check(self) -> None:
        if len(self.blocks) == 1:
            # fastpath
            self._is_consolidated = True
            self._known_consolidated = True
            return
        dtypes = [blk.dtype for blk in self.blocks if blk._can_consolidate]
        self._is_consolidated = len(dtypes) == len(set(dtypes))
        self._known_consolidated = True

    def _consolidate_inplace(self) -> None:
        # In general, _consolidate_inplace should only be called via
        # DataFrame._consolidate_inplace, otherwise we will fail to invalidate
        # the DataFrame's _item_cache. The exception is for newly-created
        # BlockManager objects not yet attached to a DataFrame.
        if not self.is_consolidated():
            self.blocks = _consolidate(self.blocks)
            self._is_consolidated = True
            self._known_consolidated = True
            self._rebuild_blknos_and_blklocs()
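    # Sketch of what consolidation does (internal; block counts hypothetical):
    # repeated inserts can leave several blocks of the same dtype, e.g.
    #     mgr.nblocks             # 3, two of them float64
    #     mgr._consolidate_inplace()
    #     mgr.nblocks             # 2, the float64 blocks merged into one
    # after which blknos/blklocs are rebuilt to match the new block layout.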

    # ----------------------------------------------------------------
    # Concatenation

    @classmethod
    def concat_horizontal(cls, mgrs: list[Self], axes: list[Index]) -> Self:
        """
        Concatenate uniformly-indexed BlockManagers horizontally.
        """
        offset = 0
        blocks: list[Block] = []
        for mgr in mgrs:
            for blk in mgr.blocks:
                # We need to do getitem_block here otherwise we would be altering
                # blk.mgr_locs in place, which would render it invalid. This is only
                # relevant in the copy=False case.
                nb = blk.slice_block_columns(slice(None))
                nb._mgr_locs = nb._mgr_locs.add(offset)
                blocks.append(nb)

            offset += len(mgr.items)

        new_mgr = cls(tuple(blocks), axes)
        return new_mgr

    @classmethod
    def concat_vertical(cls, mgrs: list[Self], axes: list[Index]) -> Self:
        """
        Concatenate uniformly-indexed BlockManagers vertically.
        """
        raise NotImplementedError("This logic lives (for now) in internals.concat")


class SingleBlockManager(BaseBlockManager, SingleDataManager):
    """manage a single block with a single axis (the Series case)"""

    @property
    def ndim(self) -> Literal[1]:
        return 1

    _is_consolidated = True
    _known_consolidated = True
    __slots__ = ()
    is_single_block = True

    def __init__(
        self,
        block: Block,
        axis: Index,
        verify_integrity: bool = False,
    ) -> None:
        # Assertions disabled for performance
        # assert isinstance(block, Block), type(block)
        # assert isinstance(axis, Index), type(axis)

        self.axes = [axis]
        self.blocks = (block,)

    @classmethod
    def from_blocks(
        cls,
        blocks: list[Block],
        axes: list[Index],
    ) -> Self:
        """
        Constructor for BlockManager and SingleBlockManager with same signature.
        """
        assert len(blocks) == 1
        assert len(axes) == 1
        return cls(blocks[0], axes[0], verify_integrity=False)

    @classmethod
    def from_array(
        cls, array: ArrayLike, index: Index, refs: BlockValuesRefs | None = None
    ) -> SingleBlockManager:
        """
        Constructor for the case where we have an array that is not yet a Block.
        """
        array = maybe_coerce_values(array)
        bp = BlockPlacement(slice(0, len(index)))
        block = new_block(array, placement=bp, ndim=1, refs=refs)
        return cls(block, index)
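    # Illustrative construction (internal API):
    #     arr = np.array([1, 2, 3])
    #     idx = pd.Index(["a", "b", "c"])
    #     smgr = SingleBlockManager.from_array(arr, idx)
    # which wraps the array in a 1D block placed over slice(0, 3); passing
    # `refs` ties the new block into an existing Copy-on-Write reference chain.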

    def to_2d_mgr(self, columns: Index) -> BlockManager:
        """
        Manager analogue of Series.to_frame
        """
        blk = self.blocks[0]
        arr = ensure_block_shape(blk.values, ndim=2)
        bp = BlockPlacement(0)
        new_blk = type(blk)(arr, placement=bp, ndim=2, refs=blk.refs)
        axes = [columns, self.axes[0]]
        return BlockManager([new_blk], axes=axes, verify_integrity=False)

    def _has_no_reference(self, i: int = 0) -> bool:
        """
        Check for column `i` if it has references.
        (whether it references another array or is itself being referenced)
        Returns True if the column has no references.
        """
        return not self.blocks[0].refs.has_reference()

    def __getstate__(self):
        block_values = [b.values for b in self.blocks]
        block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks]
        axes_array = list(self.axes)

        extra_state = {
            "0.14.1": {
                "axes": axes_array,
                "blocks": [
                    {"values": b.values, "mgr_locs": b.mgr_locs.indexer}
                    for b in self.blocks
                ],
            }
        }

        # First three elements of the state are to maintain forward
        # compatibility with 0.13.1.
        return axes_array, block_values, block_items, extra_state

    def __setstate__(self, state) -> None:
        def unpickle_block(values, mgr_locs, ndim: int) -> Block:
            # TODO(EA2D): ndim would be unnecessary with 2D EAs
            # older pickles may store e.g. DatetimeIndex instead of DatetimeArray
            values = extract_array(values, extract_numpy=True)
            if not isinstance(mgr_locs, BlockPlacement):
                mgr_locs = BlockPlacement(mgr_locs)

            values = maybe_coerce_values(values)
            return new_block(values, placement=mgr_locs, ndim=ndim)

        if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]:
            state = state[3]["0.14.1"]
            self.axes = [ensure_index(ax) for ax in state["axes"]]
            ndim = len(self.axes)
            self.blocks = tuple(
                unpickle_block(b["values"], b["mgr_locs"], ndim=ndim)
                for b in state["blocks"]
            )
        else:
            raise NotImplementedError("pre-0.14.1 pickles are no longer supported")

        self._post_setstate()
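
    # The supported pickle layout (written by __getstate__ above) is a
    # 4-tuple whose fourth element carries the "0.14.1" dict; schematically:
    #
    #   (axes, block_values, block_items,
    #    {"0.14.1": {"axes": [...],
    #                "blocks": [{"values": ..., "mgr_locs": ...}, ...]}})
    #
    # Only the "0.14.1" dict is consumed on load; anything older raises
    # NotImplementedError.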

    def _post_setstate(self) -> None:
        pass

    @cache_readonly
    def _block(self) -> Block:
        return self.blocks[0]

    @property
    def _blknos(self):
        """compat with BlockManager"""
        return None

    @property
    def _blklocs(self):
        """compat with BlockManager"""
        return None

    def get_rows_with_mask(self, indexer: npt.NDArray[np.bool_]) -> Self:
        # similar to get_slice, but not restricted to slice indexer
        blk = self._block
        if using_copy_on_write() and len(indexer) > 0 and indexer.all():
            return type(self)(blk.copy(deep=False), self.index)
        array = blk.values[indexer]

        if isinstance(indexer, np.ndarray) and indexer.dtype.kind == "b":
            # boolean indexing always gives a copy with numpy
            refs = None
        else:
            # TODO(CoW) in theory only need to track reference if new_array is a view
            refs = blk.refs

        bp = BlockPlacement(slice(0, len(array)))
        block = type(blk)(array, placement=bp, ndim=1, refs=refs)

        new_idx = self.index[indexer]
        return type(self)(block, new_idx)
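
    # Hedged sketch of the refs logic above: a boolean ndarray mask always
    # makes numpy copy, so no references need tracking; other indexers may
    # produce views, so the parent block's refs are propagated:
    #
    #   mask = np.array([True, False, True])
    #   sub = mgr.get_rows_with_mask(mask)   # refs=None -> independent copy
    #
    # (mgr is any SingleBlockManager; illustrative only.)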

    def get_slice(self, slobj: slice, axis: AxisInt = 0) -> SingleBlockManager:
        # Assertion disabled for performance
        # assert isinstance(slobj, slice), type(slobj)
        if axis >= self.ndim:
            raise IndexError("Requested axis not found in manager")

        blk = self._block
        array = blk.values[slobj]
        bp = BlockPlacement(slice(0, len(array)))
        # TODO this method is only used in groupby SeriesSplitter at the moment,
        # so passing refs is not yet covered by the tests
        block = type(blk)(array, placement=bp, ndim=1, refs=blk.refs)
        new_index = self.index._getitem_slice(slobj)
        return type(self)(block, new_index)

    @property
    def index(self) -> Index:
        return self.axes[0]

    @property
    def dtype(self) -> DtypeObj:
        return self._block.dtype

    def get_dtypes(self) -> npt.NDArray[np.object_]:
        return np.array([self._block.dtype], dtype=object)

    def external_values(self):
        """The array that Series.values returns"""
        return self._block.external_values()

    def internal_values(self):
        """The array that Series._values returns"""
        return self._block.values

    def array_values(self) -> ExtensionArray:
        """The array that Series.array returns"""
        return self._block.array_values

    def get_numeric_data(self) -> Self:
        if self._block.is_numeric:
            return self.copy(deep=False)
        return self.make_empty()

    @property
    def _can_hold_na(self) -> bool:
        return self._block._can_hold_na

    def setitem_inplace(self, indexer, value, warn: bool = True) -> None:
        """
        Set values with indexer.

        For Single[Block/Array]Manager, this backs s[indexer] = value

        This is an inplace version of `setitem()`, mutating the manager/values
        in place, not returning a new Manager (and Block), and thus never
        changing the dtype.
        """
        using_cow = using_copy_on_write()
        warn_cow = warn_copy_on_write()
        if (using_cow or warn_cow) and not self._has_no_reference(0):
            if using_cow:
                self.blocks = (self._block.copy(),)
                self._cache.clear()
            elif warn_cow and warn:
                warnings.warn(
                    COW_WARNING_SETITEM_MSG,
                    FutureWarning,
                    stacklevel=find_stack_level(),
                )

        super().setitem_inplace(indexer, value)
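
    # Hedged sketch of the CoW branch above: if the block's values are
    # shared with another object, Copy-on-Write copies the block before
    # mutating; in warning mode a FutureWarning is emitted instead:
    #
    #   ser = pd.Series([1, 2, 3])
    #   view = ser[:]                      # shares the block's values
    #   ser._mgr.setitem_inplace(0, 10)    # copies first under CoW
    #
    # (pd.Series is used for illustration; this is the path behind
    # ser[indexer] = value.)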

    def idelete(self, indexer) -> SingleBlockManager:
        """
        Delete single location from SingleBlockManager.

        Ensures that self.blocks doesn't become empty.
        """
        nb = self._block.delete(indexer)[0]
        self.blocks = (nb,)
        self.axes[0] = self.axes[0].delete(indexer)
        self._cache.clear()
        return self
    def fast_xs(self, loc):
        """
        Fast path for getting a cross-section; would return a view of the
        data. Not supported for SingleBlockManager.
        """
        raise NotImplementedError("Use series._values[loc] instead")

    def set_values(self, values: ArrayLike) -> None:
        """
        Set the values of the single block in place.

        Use at your own risk! This does not check if the passed values are
        valid for the current Block/SingleBlockManager (length, dtype, etc),
        and this does not properly keep track of references.
        """
        # NOTE(CoW) Currently this is only used for FrameColumnApply.series_generator
        # which handles CoW by setting the refs manually if necessary
        self.blocks[0].values = values
        self.blocks[0]._mgr_locs = BlockPlacement(slice(len(values)))

    def _equal_values(self, other: Self) -> bool:
        """
        Used in .equals defined in base class. Only check the column values
        assuming shape and indexes have already been checked.
        """
        # For SingleBlockManager (i.e. Series)
        if other.ndim != 1:
            return False
        left = self.blocks[0].values
        right = other.blocks[0].values
        return array_equals(left, right)


# --------------------------------------------------------------------
# Constructor Helpers


def create_block_manager_from_blocks(
    blocks: list[Block],
    axes: list[Index],
    consolidate: bool = True,
    verify_integrity: bool = True,
) -> BlockManager:
    # If verify_integrity=False, then caller is responsible for checking
    #  all(x.shape[-1] == len(axes[1]) for x in blocks)
    #  sum(x.shape[0] for x in blocks) == len(axes[0])
    #  set(x for blk in blocks for x in blk.mgr_locs) == set(range(len(axes[0])))
    #  all(blk.ndim == 2 for blk in blocks)
    # This allows us to safely pass verify_integrity=False

    try:
        mgr = BlockManager(blocks, axes, verify_integrity=verify_integrity)

    except ValueError as err:
        arrays = [blk.values for blk in blocks]
        tot_items = sum(arr.shape[0] for arr in arrays)
        raise_construction_error(tot_items, arrays[0].shape[1:], axes, err)

    if consolidate:
        mgr._consolidate_inplace()
    return mgr
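

# Hedged sketch of the invariants listed above, for a frame with 2 columns
# and 3 rows built from a single 2D block (illustrative only):
#
#   values = np.ones((2, 3))                    # (ncols, nrows) block layout
#   blk = new_block_2d(values, placement=BlockPlacement(slice(0, 2)))
#   mgr = create_block_manager_from_blocks(
#       [blk], [Index(["a", "b"]), Index(range(3))]
#   )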


def create_block_manager_from_column_arrays(
    arrays: list[ArrayLike],
    axes: list[Index],
    consolidate: bool,
    refs: list,
) -> BlockManager:
    # Assertions disabled for performance (caller is responsible for verifying)
    #  assert isinstance(axes, list)
    #  assert all(isinstance(x, Index) for x in axes)
    #  assert all(isinstance(x, (np.ndarray, ExtensionArray)) for x in arrays)
    #  assert all(type(x) is not NumpyExtensionArray for x in arrays)
    #  assert all(x.ndim == 1 for x in arrays)
    #  assert all(len(x) == len(axes[1]) for x in arrays)
    #  assert len(arrays) == len(axes[0])
    # These last three are sufficient to allow us to safely pass
    #  verify_integrity=False below.

    try:
        blocks = _form_blocks(arrays, consolidate, refs)
        mgr = BlockManager(blocks, axes, verify_integrity=False)
    except ValueError as e:
        raise_construction_error(len(arrays), arrays[0].shape, axes, e)
    if consolidate:
        mgr._consolidate_inplace()
    return mgr


def raise_construction_error(
    tot_items: int,
    block_shape: Shape,
    axes: list[Index],
    e: ValueError | None = None,
):
2154 """raise a helpful message about our construction"""
2155 passed = tuple(map(int, [tot_items] + list(block_shape)))
    # Reverse to row-major (nrows, ncols) order so the message matches the
    # shape as the user sees it
    if len(passed) <= 2:
        passed = passed[::-1]

    implied = tuple(len(ax) for ax in axes)
    # Same reordering for the shape implied by the axes
    if len(implied) <= 2:
        implied = implied[::-1]

    # If the passed and implied shapes agree, the original exception is the
    # more informative one, so re-raise it
    if passed == implied and e is not None:
        raise e
    if block_shape[0] == 0:
        raise ValueError("Empty data passed with indices specified.")
    raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")
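

# Hedged example of the resulting user-facing message (illustrative):
#
#   pd.DataFrame(np.ones((3, 2)), columns=["a", "b", "c"])
#   # ValueError: Shape of passed values is (3, 2), indices imply (3, 3)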


# -----------------------------------------------------------------------


def _grouping_func(tup: tuple[int, ArrayLike]) -> tuple[int, DtypeObj]:
    dtype = tup[1].dtype

    if is_1d_only_ea_dtype(dtype):
        # We know these won't be consolidated, so don't need to group these.
        # This avoids expensive comparisons of CategoricalDtype objects
        sep = id(dtype)
    else:
        sep = 0

    return sep, dtype


def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list[Block]:
    tuples = list(enumerate(arrays))

    if not consolidate:
        return _tuples_to_blocks_no_consolidate(tuples, refs)

    # when consolidating, we can ignore refs (either stacking always copies,
    # or the EA is already copied in the calling dict_to_mgr)

    # group by dtype
    grouper = itertools.groupby(tuples, _grouping_func)

    nbs: list[Block] = []
    for (_, dtype), tup_block in grouper:
        block_type = get_block_type(dtype)

        if isinstance(dtype, np.dtype):
            is_dtlike = dtype.kind in "mM"

            if issubclass(dtype.type, (str, bytes)):
                dtype = np.dtype(object)

            values, placement = _stack_arrays(list(tup_block), dtype)
            if is_dtlike:
                values = ensure_wrapped_if_datetimelike(values)
            blk = block_type(values, placement=BlockPlacement(placement), ndim=2)
            nbs.append(blk)

        elif is_1d_only_ea_dtype(dtype):
            dtype_blocks = [
                block_type(x[1], placement=BlockPlacement(x[0]), ndim=2)
                for x in tup_block
            ]
            nbs.extend(dtype_blocks)

        else:
            dtype_blocks = [
                block_type(
                    ensure_block_shape(x[1], 2), placement=BlockPlacement(x[0]), ndim=2
                )
                for x in tup_block
            ]
            nbs.extend(dtype_blocks)
    return nbs
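

# Hedged sketch of the grouping above: consecutive arrays sharing a
# consolidatable dtype are stacked into one 2D block, while 1D-only EA
# dtypes get one block per column (illustrative; refs are ignored when
# consolidating):
#
#   arrays = [np.array([1, 2]), np.array([3, 4]),      # adjacent int64
#             pd.array(["a", "b"], dtype="category")]  # 1D-only EA
#   blocks = _form_blocks(arrays, True, [None] * 3)
#   # -> one (2, 2) int64 block at columns [0, 1] + one categorical block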


def _tuples_to_blocks_no_consolidate(tuples, refs) -> list[Block]:
    # tuples produced within _form_blocks are of the form (placement, array)
    return [
        new_block_2d(
            ensure_block_shape(arr, ndim=2), placement=BlockPlacement(i), refs=ref
        )
        for ((i, arr), ref) in zip(tuples, refs)
    ]


def _stack_arrays(tuples, dtype: np.dtype):
    placement, arrays = zip(*tuples)

    first = arrays[0]
    shape = (len(arrays),) + first.shape

    stacked = np.empty(shape, dtype=dtype)
    for i, arr in enumerate(arrays):
        stacked[i] = arr

    return stacked, placement
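

# Worked example of the stacking above (illustrative): three length-4
# float64 columns become one (3, 4) array, one row per column:
#
#   tuples = [(0, a), (1, b), (2, c)]            # (placement, 1D array)
#   stacked, placement = _stack_arrays(tuples, np.dtype("float64"))
#   # stacked.shape == (3, 4); placement == (0, 1, 2)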


def _consolidate(blocks: tuple[Block, ...]) -> tuple[Block, ...]:
    """
    Merge blocks having the same dtype; non-consolidatable blocks are
    passed through unchanged.
    """
    # sort by _can_consolidate, dtype
    gkey = lambda x: x._consolidate_key
    grouper = itertools.groupby(sorted(blocks, key=gkey), gkey)

    new_blocks: list[Block] = []
    for (_can_consolidate, dtype), group_blocks in grouper:
        merged_blocks, _ = _merge_blocks(
            list(group_blocks), dtype=dtype, can_consolidate=_can_consolidate
        )
        new_blocks = extend_blocks(merged_blocks, new_blocks)
    return tuple(new_blocks)


def _merge_blocks(
    blocks: list[Block], dtype: DtypeObj, can_consolidate: bool
) -> tuple[list[Block], bool]:
    if len(blocks) == 1:
        return blocks, False

    if can_consolidate:
        # TODO: optimization potential in case all mgrs contain slices and
        # combination of those slices is a slice, too.
        new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks])

        new_values: ArrayLike

        if isinstance(blocks[0].dtype, np.dtype):
            # error: List comprehension has incompatible type List[Union[ndarray,
            # ExtensionArray]]; expected List[Union[complex, generic,
            # Sequence[Union[int, float, complex, str, bytes, generic]],
            # Sequence[Sequence[Any]], SupportsArray]]
            new_values = np.vstack([b.values for b in blocks])  # type: ignore[misc]
        else:
            bvals = [blk.values for blk in blocks]
            bvals2 = cast(Sequence[NDArrayBackedExtensionArray], bvals)
            new_values = bvals2[0]._concat_same_type(bvals2, axis=0)

        argsort = np.argsort(new_mgr_locs)
        new_values = new_values[argsort]
        new_mgr_locs = new_mgr_locs[argsort]

        bp = BlockPlacement(new_mgr_locs)
        return [new_block_2d(new_values, placement=bp)], True

    # can't consolidate --> no merge
    return blocks, False
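

# Hedged sketch of the merge above: two float64 blocks at columns [0, 2]
# and [1] are stacked and then re-sorted by column position:
#
#   new_mgr_locs = [0, 2, 1] -> argsort -> rows reordered so locs == [0, 1, 2]
#
# Blocks with can_consolidate=False (e.g. most extension blocks) are
# returned untouched.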


def _fast_count_smallints(arr: npt.NDArray[np.intp]):
    """
    Faster version of collections.Counter for arrays of small non-negative
    ints; yields (value, count) pairs.
    """
    counts = np.bincount(arr)
    nz = counts.nonzero()[0]
    # Note: list(zip(...)) outperforms list(np.c_[nz, counts[nz]]) here,
    # in one benchmark by a factor of 11
    return zip(nz, counts[nz])
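

# Worked example (illustrative): for arr = [0, 1, 1, 3], np.bincount gives
# [1, 2, 0, 1]; filtering out the zero count for the absent value 2 yields
# the pairs (0, 1), (1, 2), (3, 1).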


def _preprocess_slice_or_indexer(
    slice_or_indexer: slice | np.ndarray, length: int, allow_fill: bool
):
    if isinstance(slice_or_indexer, slice):
        return (
            "slice",
            slice_or_indexer,
            libinternals.slice_len(slice_or_indexer, length),
        )
    else:
        if (
            not isinstance(slice_or_indexer, np.ndarray)
            or slice_or_indexer.dtype.kind != "i"
        ):
            dtype = getattr(slice_or_indexer, "dtype", None)
            raise TypeError(type(slice_or_indexer), dtype)

        indexer = ensure_platform_int(slice_or_indexer)
        if not allow_fill:
            indexer = maybe_convert_indices(indexer, length)
        return "fancy", indexer, len(indexer)


def make_na_array(dtype: DtypeObj, shape: Shape, fill_value) -> ArrayLike:
    if isinstance(dtype, DatetimeTZDtype):
        # NB: exclude e.g. pyarrow[dt64tz] dtypes
        ts = Timestamp(fill_value).as_unit(dtype.unit)
        i8values = np.full(shape, ts._value)
        dt64values = i8values.view(f"M8[{dtype.unit}]")
        return DatetimeArray._simple_new(dt64values, dtype=dtype)

    elif is_1d_only_ea_dtype(dtype):
        dtype = cast(ExtensionDtype, dtype)
        cls = dtype.construct_array_type()

        missing_arr = cls._from_sequence([], dtype=dtype)
        ncols, nrows = shape
        assert ncols == 1, ncols
        empty_arr = -1 * np.ones((nrows,), dtype=np.intp)
        return missing_arr.take(empty_arr, allow_fill=True, fill_value=fill_value)
    elif isinstance(dtype, ExtensionDtype):
        # TODO: no tests get here, a handful would if we disabled
        # the dt64tz special-case above (which is faster)
        cls = dtype.construct_array_type()
        missing_arr = cls._empty(shape=shape, dtype=dtype)
        missing_arr[:] = fill_value
        return missing_arr
    else:
        # NB: we should never get here with dtype integer or bool;
        # if we did, the missing_arr.fill would cast to gibberish
        missing_arr = np.empty(shape, dtype=dtype)
        missing_arr.fill(fill_value)

        if dtype.kind in "mM":
            missing_arr = ensure_wrapped_if_datetimelike(missing_arr)
        return missing_arr
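

# Hedged examples of the dtype dispatch above (illustrative):
#
#   make_na_array(np.dtype("float64"), (2, 3), np.nan)
#   # -> (2, 3) ndarray filled with NaN
#   make_na_array(DatetimeTZDtype(tz="UTC"), (2, 3), pd.NaT)
#   # -> DatetimeArray of NaT with the dtype's unit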