from __future__ import annotations

import itertools
from typing import (
    Any,
    Callable,
    Hashable,
    Literal,
    Sequence,
    TypeVar,
    cast,
)
import warnings
import weakref

import numpy as np

from pandas._config import using_copy_on_write

from pandas._libs import (
    algos as libalgos,
    internals as libinternals,
    lib,
)
from pandas._libs.internals import (
    BlockPlacement,
    BlockValuesRefs,
)
from pandas._typing import (
    ArrayLike,
    AxisInt,
    DtypeObj,
    QuantileInterpolation,
    Shape,
    npt,
    type_t,
)
from pandas.errors import PerformanceWarning
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.cast import infer_dtype_from_scalar
from pandas.core.dtypes.common import (
    ensure_platform_int,
    is_1d_only_ea_dtype,
    is_dtype_equal,
    is_list_like,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCSeries,
)
from pandas.core.dtypes.missing import (
    array_equals,
    isna,
)

import pandas.core.algorithms as algos
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
from pandas.core.arrays.sparse import SparseDtype
import pandas.core.common as com
from pandas.core.construction import (
    ensure_wrapped_if_datetimelike,
    extract_array,
)
from pandas.core.indexers import maybe_convert_indices
from pandas.core.indexes.api import (
    Index,
    ensure_index,
)
from pandas.core.internals.base import (
    DataManager,
    SingleDataManager,
    interleaved_dtype,
)
from pandas.core.internals.blocks import (
    Block,
    NumpyBlock,
    ensure_block_shape,
    extend_blocks,
    get_block_type,
    new_block,
    new_block_2d,
)
from pandas.core.internals.ops import (
    blockwise_all,
    operate_blockwise,
)

T = TypeVar("T", bound="BaseBlockManager")


class BaseBlockManager(DataManager):
    """
    Core internal data structure to implement DataFrame, Series, etc.

    Manage a bunch of labeled 2D mixed-type ndarrays. Essentially it's a
    lightweight blocked set of labeled data to be manipulated by the DataFrame
    public API class.

    Attributes
    ----------
    shape
    ndim
    axes
    values
    items

    Methods
    -------
    set_axis(axis, new_labels)
    copy(deep=True)

    get_dtypes

    apply(func, axes, block_filter_fn)

    get_bool_data
    get_numeric_data

    get_slice(slice_like, axis)
    get(label)
    iget(loc)

    take(indexer, axis)
    reindex_axis(new_labels, axis)
    reindex_indexer(new_labels, indexer, axis)

    delete(label)
    insert(loc, label, value)
    set(label, value)

    Parameters
    ----------
    blocks : Sequence of Block
    axes : Sequence of Index
    verify_integrity : bool, default True

    Notes
    -----
    This is *not* a public API class
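
    Examples
    --------
    A minimal, hedged sketch via the private ``_mgr`` attribute (internal
    API, subject to change without notice):

    >>> import pandas as pd
    >>> df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
    >>> mgr = df._mgr
    >>> mgr.nblocks  # one int64 block and one float64 block
    2
    >>> mgr.shape  # (n_columns, n_rows): axes are transposed vs. the frame
    (2, 2)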
    """

    __slots__ = ()

    _blknos: npt.NDArray[np.intp]
    _blklocs: npt.NDArray[np.intp]
    blocks: tuple[Block, ...]
    axes: list[Index]

    @property
    def ndim(self) -> int:
        raise NotImplementedError

    _known_consolidated: bool
    _is_consolidated: bool

    def __init__(self, blocks, axes, verify_integrity: bool = True) -> None:
        raise NotImplementedError

    @classmethod
    def from_blocks(cls: type_t[T], blocks: list[Block], axes: list[Index]) -> T:
        raise NotImplementedError

    @property
    def blknos(self) -> npt.NDArray[np.intp]:
        """
        Suppose we want to find the array corresponding to our i'th column.

        blknos[i] identifies the block from self.blocks that contains this column.

        blklocs[i] identifies the column of interest within
        self.blocks[self.blknos[i]]
        """
        if self._blknos is None:
            # Note: these can be altered by other BlockManager methods.
            self._rebuild_blknos_and_blklocs()

        return self._blknos

    @property
    def blklocs(self) -> npt.NDArray[np.intp]:
        """
        See blknos.__doc__
        """
        if self._blklocs is None:
            # Note: these can be altered by other BlockManager methods.
            self._rebuild_blknos_and_blklocs()

        return self._blklocs
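
    # A hedged illustration of the mapping above: for a manager whose blocks
    # are [float64 holding columns (0, 2), int64 holding column 1], we would
    # expect
    #     blknos  -> array([0, 1, 0])   # which block holds column i
    #     blklocs -> array([0, 0, 1])   # where column i sits within that block
    # so column 2 lives at self.blocks[0].values[1].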

    def make_empty(self: T, axes=None) -> T:
        """return an empty BlockManager with the items axis of len 0"""
        if axes is None:
            axes = [Index([])] + self.axes[1:]

        # preserve dtype if possible
        if self.ndim == 1:
            assert isinstance(self, SingleBlockManager)  # for mypy
            blk = self.blocks[0]
            arr = blk.values[:0]
            bp = BlockPlacement(slice(0, 0))
            nb = blk.make_block_same_class(arr, placement=bp)
            blocks = [nb]
        else:
            blocks = []
        return type(self).from_blocks(blocks, axes)

    def __nonzero__(self) -> bool:
        return True

    # Python3 compat
    __bool__ = __nonzero__

    def _normalize_axis(self, axis: AxisInt) -> int:
        # switch axis to follow BlockManager logic
        if self.ndim == 2:
            axis = 1 if axis == 0 else 0
        return axis

    def set_axis(self, axis: AxisInt, new_labels: Index) -> None:
        # Caller is responsible for ensuring we have an Index object.
        self._validate_set_axis(axis, new_labels)
        self.axes[axis] = new_labels

    @property
    def is_single_block(self) -> bool:
        # Assumes we are 2D; overridden by SingleBlockManager
        return len(self.blocks) == 1

    @property
    def items(self) -> Index:
        return self.axes[0]

    def _has_no_reference(self, i: int) -> bool:
        """
        Check whether column `i` has references
        (i.e. whether it references another array or is itself referenced).
        Returns True if the column has no references.
        """
        blkno = self.blknos[i]
        return self._has_no_reference_block(blkno)

    def _has_no_reference_block(self, blkno: int) -> bool:
        """
        Check whether the block at position `blkno` has references
        (i.e. whether it references another array or is itself referenced).
        Returns True if the block has no references.
        """
        return not self.blocks[blkno].refs.has_reference()

    def add_references(self, mgr: BaseBlockManager) -> None:
        """
        Adds the references from one manager to another. We assume that both
        managers have the same block structure.
        """
        if len(self.blocks) != len(mgr.blocks):
            # If block structure changes, then we made a copy
            return
        for i, blk in enumerate(self.blocks):
            blk.refs = mgr.blocks[i].refs
            # Argument 1 to "add_reference" of "BlockValuesRefs" has incompatible type
            # "Block"; expected "SharedBlock"
            blk.refs.add_reference(blk)  # type: ignore[arg-type]

    def references_same_values(self, mgr: BaseBlockManager, blkno: int) -> bool:
        """
        Checks if two blocks from two different block managers reference the
        same underlying values.
        """
        ref = weakref.ref(self.blocks[blkno])
        return ref in mgr.blocks[blkno].refs.referenced_blocks
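
    # Reference tracking underpins Copy-on-Write. A hedged sketch of the
    # expected behaviour with copy_on_write mode enabled:
    #
    #     df = pd.DataFrame({"a": [1, 2]})
    #     view = df[:]                     # new frame sharing the block values
    #     df._mgr._has_no_reference(0)     # -> False while ``view`` is alive
    #
    # Once ``view`` is garbage-collected, the weakref stored in ``refs`` dies
    # and the column counts as unreferenced again.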

    def get_dtypes(self):
        dtypes = np.array([blk.dtype for blk in self.blocks])
        return dtypes.take(self.blknos)

    @property
    def arrays(self) -> list[ArrayLike]:
        """
        Quick access to the backing arrays of the Blocks.

        Only for compatibility with ArrayManager for testing convenience.
        Not to be used in actual code, and return value is not the same as the
        ArrayManager method (list of 1D arrays vs iterator of 2D ndarrays / 1D EAs).

        Warning! The returned arrays don't handle Copy-on-Write, so this should
        be used with caution (only in read-mode).
        """
        return [blk.values for blk in self.blocks]

    def __repr__(self) -> str:
        output = type(self).__name__
        for i, ax in enumerate(self.axes):
            if i == 0:
                output += f"\nItems: {ax}"
            else:
                output += f"\nAxis {i}: {ax}"

        for block in self.blocks:
            output += f"\n{block}"
        return output

    def apply(
        self: T,
        f,
        align_keys: list[str] | None = None,
        **kwargs,
    ) -> T:
        """
        Iterate over the blocks, collect and create a new BlockManager.

        Parameters
        ----------
        f : str or callable
            Name of the Block method to apply.
        align_keys : List[str] or None, default None
        **kwargs
            Keywords to pass to `f`

        Returns
        -------
        BlockManager
        """
        assert "filter" not in kwargs

        align_keys = align_keys or []
        result_blocks: list[Block] = []
        # fillna: Series/DataFrame is responsible for making sure value is aligned

        aligned_args = {k: kwargs[k] for k in align_keys}

        for b in self.blocks:
            if aligned_args:
                for k, obj in aligned_args.items():
                    if isinstance(obj, (ABCSeries, ABCDataFrame)):
                        # The caller is responsible for ensuring that
                        # obj.axes[-1].equals(self.items)
                        if obj.ndim == 1:
                            kwargs[k] = obj.iloc[b.mgr_locs.indexer]._values
                        else:
                            kwargs[k] = obj.iloc[:, b.mgr_locs.indexer]._values
                    else:
                        # otherwise we have an ndarray
                        kwargs[k] = obj[b.mgr_locs.indexer]

            if callable(f):
                applied = b.apply(f, **kwargs)
            else:
                applied = getattr(b, f)(**kwargs)
            result_blocks = extend_blocks(applied, result_blocks)

        out = type(self).from_blocks(result_blocks, self.axes)
        return out
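
    # Hedged usage sketch: mgr.apply("astype", dtype="float64", copy=True)
    # dispatches Block.astype on every block and stitches the per-block
    # results into a new manager sharing self.axes.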

    def where(self: T, other, cond, align: bool) -> T:
        if align:
            align_keys = ["other", "cond"]
        else:
            align_keys = ["cond"]
            other = extract_array(other, extract_numpy=True)

        return self.apply(
            "where",
            align_keys=align_keys,
            other=other,
            cond=cond,
            using_cow=using_copy_on_write(),
        )

    def round(self: T, decimals: int, using_cow: bool = False) -> T:
        return self.apply(
            "round",
            decimals=decimals,
            using_cow=using_cow,
        )

    def setitem(self: T, indexer, value) -> T:
        """
        Set values with indexer.

        For SingleBlockManager, this backs s[indexer] = value
        """
        if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim:
            raise ValueError(f"Cannot set values with ndim > {self.ndim}")

        if using_copy_on_write() and not self._has_no_reference(0):
            # if being referenced -> perform Copy-on-Write and clear the reference
            # this method is only called if there is a single block -> hardcoded 0
            self = self.copy()

        return self.apply("setitem", indexer=indexer, value=value)

    def putmask(self, mask, new, align: bool = True):
        if align:
            align_keys = ["new", "mask"]
        else:
            align_keys = ["mask"]
            new = extract_array(new, extract_numpy=True)

        return self.apply(
            "putmask",
            align_keys=align_keys,
            mask=mask,
            new=new,
            using_cow=using_copy_on_write(),
        )

    def diff(self: T, n: int, axis: AxisInt) -> T:
        # only reached with self.ndim == 2 and axis == 1
        axis = self._normalize_axis(axis)
        return self.apply("diff", n=n, axis=axis)

    def interpolate(self: T, inplace: bool, **kwargs) -> T:
        return self.apply(
            "interpolate", inplace=inplace, **kwargs, using_cow=using_copy_on_write()
        )

    def shift(self: T, periods: int, axis: AxisInt, fill_value) -> T:
        axis = self._normalize_axis(axis)
        if fill_value is lib.no_default:
            fill_value = None

        return self.apply("shift", periods=periods, axis=axis, fill_value=fill_value)

    def fillna(self: T, value, limit, inplace: bool, downcast) -> T:
        if limit is not None:
            # Do this validation even if we go through one of the no-op paths
            limit = libalgos.validate_limit(None, limit=limit)

        return self.apply(
            "fillna",
            value=value,
            limit=limit,
            inplace=inplace,
            downcast=downcast,
            using_cow=using_copy_on_write(),
        )

    def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> T:
        if copy is None:
            if using_copy_on_write():
                copy = False
            else:
                copy = True
        elif using_copy_on_write():
            copy = False

        return self.apply(
            "astype",
            dtype=dtype,
            copy=copy,
            errors=errors,
            using_cow=using_copy_on_write(),
        )

    def convert(self: T, copy: bool | None) -> T:
        if copy is None:
            if using_copy_on_write():
                copy = False
            else:
                copy = True
        elif using_copy_on_write():
            copy = False

        return self.apply("convert", copy=copy, using_cow=using_copy_on_write())

    def replace(self: T, to_replace, value, inplace: bool) -> T:
        inplace = validate_bool_kwarg(inplace, "inplace")
        # NDFrame.replace ensures that neither to_replace nor value is list-like
        assert not is_list_like(to_replace)
        assert not is_list_like(value)
        return self.apply(
            "replace",
            to_replace=to_replace,
            value=value,
            inplace=inplace,
            using_cow=using_copy_on_write(),
        )

    def replace_regex(self, **kwargs):
        return self.apply("_replace_regex", **kwargs, using_cow=using_copy_on_write())

    def replace_list(
        self: T,
        src_list: list[Any],
        dest_list: list[Any],
        inplace: bool = False,
        regex: bool = False,
    ) -> T:
        """do a list replace"""
        inplace = validate_bool_kwarg(inplace, "inplace")

        bm = self.apply(
            "replace_list",
            src_list=src_list,
            dest_list=dest_list,
            inplace=inplace,
            regex=regex,
            using_cow=using_copy_on_write(),
        )
        bm._consolidate_inplace()
        return bm

    def to_native_types(self: T, **kwargs) -> T:
        """
        Convert values to native types (strings / python objects) that are used
        in formatting (repr / csv).
        """
        return self.apply("to_native_types", **kwargs)

    @property
    def is_numeric_mixed_type(self) -> bool:
        return all(block.is_numeric for block in self.blocks)

    @property
    def any_extension_types(self) -> bool:
        """Whether any of the blocks in this manager are extension blocks"""
        return any(block.is_extension for block in self.blocks)

    @property
    def is_view(self) -> bool:
        """return a boolean if we are a single block and are a view"""
        if len(self.blocks) == 1:
            return self.blocks[0].is_view

        # It is technically possible to figure out which blocks are views
        # e.g. [ b.values.base is not None for b in self.blocks ]
        # but then we have the case of possibly some blocks being a view
        # and some blocks not. setting in theory is possible on the non-view
        # blocks w/o causing a SettingWithCopy raise/warn. But this is a bit
        # complicated

        return False

    def _get_data_subset(self: T, predicate: Callable) -> T:
        blocks = [blk for blk in self.blocks if predicate(blk.values)]
        return self._combine(blocks, copy=False)

    def get_bool_data(self: T, copy: bool = False) -> T:
        """
        Select blocks that are bool-dtype and columns from object-dtype blocks
        that are all-bool.

        Parameters
        ----------
        copy : bool, default False
            Whether to copy the blocks
        """

        new_blocks = []

        for blk in self.blocks:
            if blk.dtype == bool:
                new_blocks.append(blk)

            elif blk.is_object:
                nbs = blk._split()
                for nb in nbs:
                    if nb.is_bool:
                        new_blocks.append(nb)

        return self._combine(new_blocks, copy)
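
    # Sketch: given columns [bool, object holding all-bools, object holding
    # strings], the bool block is kept whole, the object block is split
    # column-wise via blk._split(), and only the all-bool column survives.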

    def get_numeric_data(self: T, copy: bool = False) -> T:
        """
        Parameters
        ----------
        copy : bool, default False
            Whether to copy the blocks
        """
        numeric_blocks = [blk for blk in self.blocks if blk.is_numeric]
        if len(numeric_blocks) == len(self.blocks):
            # Avoid somewhat expensive _combine
            if copy:
                return self.copy(deep=True)
            return self
        return self._combine(numeric_blocks, copy)

    def _combine(
        self: T, blocks: list[Block], copy: bool = True, index: Index | None = None
    ) -> T:
        """return a new manager with the blocks"""
        if len(blocks) == 0:
            if self.ndim == 2:
                # retain our own Index dtype
                if index is not None:
                    axes = [self.items[:0], index]
                else:
                    axes = [self.items[:0]] + self.axes[1:]
                return self.make_empty(axes)
            return self.make_empty()

        # FIXME: optimization potential
        indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks]))
        inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0])

        new_blocks: list[Block] = []
        # TODO(CoW) we could optimize here if we know that the passed blocks
        # are fully "owned" (eg created from an operation, not coming from
        # an existing manager)
        for b in blocks:
            nb = b.copy(deep=copy)
            nb.mgr_locs = BlockPlacement(inv_indexer[nb.mgr_locs.indexer])
            new_blocks.append(nb)

        axes = list(self.axes)
        if index is not None:
            axes[-1] = index
        axes[0] = self.items.take(indexer)

        return type(self).from_blocks(new_blocks, axes)

    @property
    def nblocks(self) -> int:
        return len(self.blocks)

    def copy(self: T, deep: bool | None | Literal["all"] = True) -> T:
        """
        Make deep or shallow copy of BlockManager

        Parameters
        ----------
        deep : bool, string or None, default True
            If False or None, return a shallow copy (do not copy data)
            If 'all', copy data and a deep copy of the index

        Returns
        -------
        BlockManager
        """
        if deep is None:
            if using_copy_on_write():
                # use shallow copy
                deep = False
            else:
                # preserve deep copy for BlockManager with copy=None
                deep = True

        # this preserves the notion of view copying of axes
        if deep:
            # hit in e.g. tests.io.json.test_pandas

            def copy_func(ax):
                return ax.copy(deep=True) if deep == "all" else ax.view()

            new_axes = [copy_func(ax) for ax in self.axes]
        else:
            new_axes = list(self.axes)

        res = self.apply("copy", deep=deep)
        res.axes = new_axes

        if self.ndim > 1:
            # Avoid needing to re-compute these
            blknos = self._blknos
            if blknos is not None:
                res._blknos = blknos.copy()
                res._blklocs = self._blklocs.copy()

        if deep:
            res._consolidate_inplace()
        return res

    def consolidate(self: T) -> T:
        """
        Join together blocks having same dtype

        Returns
        -------
        y : BlockManager
        """
        if self.is_consolidated():
            return self

        bm = type(self)(self.blocks, self.axes, verify_integrity=False)
        bm._is_consolidated = False
        bm._consolidate_inplace()
        return bm
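
    # Hedged sketch of (de)consolidation: adding columns one at a time
    # appends one block per assignment, and consolidation merges same-dtype
    # blocks back together:
    #
    #     df = pd.DataFrame({f"c{i}": range(3) for i in range(3)})
    #     df._mgr.nblocks        # -> 1 (constructed consolidated)
    #     df["c3"] = 0
    #     df._mgr.nblocks        # -> 2 until a consolidating op runs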

    def reindex_indexer(
        self: T,
        new_axis: Index,
        indexer: npt.NDArray[np.intp] | None,
        axis: AxisInt,
        fill_value=None,
        allow_dups: bool = False,
        copy: bool | None = True,
        only_slice: bool = False,
        *,
        use_na_proxy: bool = False,
    ) -> T:
        """
        Parameters
        ----------
        new_axis : Index
        indexer : ndarray[intp] or None
            pandas indexer with -1's only.
        axis : int
        fill_value : object, default None
        allow_dups : bool, default False
        copy : bool or None, default True
            If None, regard as False to get shallow copy.
        only_slice : bool, default False
            Whether to take views, not copies, along columns.
        use_na_proxy : bool, default False
            Whether to use a np.void ndarray for newly introduced columns.
        """
        if copy is None:
            if using_copy_on_write():
                # use shallow copy
                copy = False
            else:
                # preserve deep copy for BlockManager with copy=None
                copy = True

        if indexer is None:
            if new_axis is self.axes[axis] and not copy:
                return self

            result = self.copy(deep=copy)
            result.axes = list(self.axes)
            result.axes[axis] = new_axis
            return result

        # Should be intp, but in some cases we get int64 on 32bit builds
        assert isinstance(indexer, np.ndarray)

        # some axes don't allow reindexing with dups
        if not allow_dups:
            self.axes[axis]._validate_can_reindex(indexer)

        if axis >= self.ndim:
            raise IndexError("Requested axis not found in manager")

        if axis == 0:
            new_blocks = self._slice_take_blocks_ax0(
                indexer,
                fill_value=fill_value,
                only_slice=only_slice,
                use_na_proxy=use_na_proxy,
            )
        else:
            new_blocks = [
                blk.take_nd(
                    indexer,
                    axis=1,
                    fill_value=(
                        fill_value if fill_value is not None else blk.fill_value
                    ),
                )
                for blk in self.blocks
            ]

        new_axes = list(self.axes)
        new_axes[axis] = new_axis

        new_mgr = type(self).from_blocks(new_blocks, new_axes)
        if axis == 1:
            # We can avoid the need to rebuild these
            new_mgr._blknos = self.blknos.copy()
            new_mgr._blklocs = self.blklocs.copy()
        return new_mgr

    def _slice_take_blocks_ax0(
        self,
        slice_or_indexer: slice | np.ndarray,
        fill_value=lib.no_default,
        only_slice: bool = False,
        *,
        use_na_proxy: bool = False,
    ) -> list[Block]:
        """
        Slice/take blocks along axis=0.

        Overloaded for SingleBlock

        Parameters
        ----------
        slice_or_indexer : slice or np.ndarray[int64]
        fill_value : scalar, default lib.no_default
        only_slice : bool, default False
            If True, we always return views on existing arrays, never copies.
            This is used when called from ops.blockwise.operate_blockwise.
        use_na_proxy : bool, default False
            Whether to use a np.void ndarray for newly introduced columns.

        Returns
        -------
        new_blocks : list of Block
        """
        allow_fill = fill_value is not lib.no_default

        sl_type, slobj, sllen = _preprocess_slice_or_indexer(
            slice_or_indexer, self.shape[0], allow_fill=allow_fill
        )

        if self.is_single_block:
            blk = self.blocks[0]

            if sl_type == "slice":
                # GH#32959 EABlock would fail since we can't make 0-width
                # TODO(EA2D): special casing unnecessary with 2D EAs
                if sllen == 0:
                    return []
                bp = BlockPlacement(slice(0, sllen))
                return [blk.getitem_block_columns(slobj, new_mgr_locs=bp)]
            elif not allow_fill or self.ndim == 1:
                if allow_fill and fill_value is None:
                    fill_value = blk.fill_value

                if not allow_fill and only_slice:
                    # GH#33597 slice instead of take, so we get
                    # views instead of copies
                    blocks = [
                        blk.getitem_block_columns(
                            slice(ml, ml + 1), new_mgr_locs=BlockPlacement(i)
                        )
                        for i, ml in enumerate(slobj)
                    ]
                    return blocks
                else:
                    bp = BlockPlacement(slice(0, sllen))
                    return [
                        blk.take_nd(
                            slobj,
                            axis=0,
                            new_mgr_locs=bp,
                            fill_value=fill_value,
                        )
                    ]

        if sl_type == "slice":
            blknos = self.blknos[slobj]
            blklocs = self.blklocs[slobj]
        else:
            blknos = algos.take_nd(
                self.blknos, slobj, fill_value=-1, allow_fill=allow_fill
            )
            blklocs = algos.take_nd(
                self.blklocs, slobj, fill_value=-1, allow_fill=allow_fill
            )

        # When filling blknos, make sure blknos is updated before appending to
        # blocks list, that way new blkno is exactly len(blocks).
        blocks = []
        group = not only_slice
        for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=group):
            if blkno == -1:
                # If we've got here, fill_value was not lib.no_default

                blocks.append(
                    self._make_na_block(
                        placement=mgr_locs,
                        fill_value=fill_value,
                        use_na_proxy=use_na_proxy,
                    )
                )
            else:
                blk = self.blocks[blkno]

                # Otherwise, slicing along items axis is necessary.
                if not blk._can_consolidate and not blk._validate_ndim:
                    # i.e. we don't go through here for DatetimeTZBlock
                    # A non-consolidatable block, it's easy, because there's
                    # only one item and each mgr loc is a copy of that single
                    # item.
                    deep = not (only_slice or using_copy_on_write())
                    for mgr_loc in mgr_locs:
                        newblk = blk.copy(deep=deep)
                        newblk.mgr_locs = BlockPlacement(slice(mgr_loc, mgr_loc + 1))
                        blocks.append(newblk)

                else:
                    # GH#32779 to avoid the performance penalty of copying,
                    # we may try to only slice
                    taker = blklocs[mgr_locs.indexer]
                    max_len = max(len(mgr_locs), taker.max() + 1)
                    if only_slice or using_copy_on_write():
                        taker = lib.maybe_indices_to_slice(taker, max_len)

                    if isinstance(taker, slice):
                        nb = blk.getitem_block_columns(taker, new_mgr_locs=mgr_locs)
                        blocks.append(nb)
                    elif only_slice:
                        # GH#33597 slice instead of take, so we get
                        # views instead of copies
                        for i, ml in zip(taker, mgr_locs):
                            slc = slice(i, i + 1)
                            bp = BlockPlacement(ml)
                            nb = blk.getitem_block_columns(slc, new_mgr_locs=bp)
                            # We have np.shares_memory(nb.values, blk.values)
                            blocks.append(nb)
                    else:
                        nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs)
                        blocks.append(nb)

        return blocks

    def _make_na_block(
        self, placement: BlockPlacement, fill_value=None, use_na_proxy: bool = False
    ) -> Block:
        # Note: we only get here with self.ndim == 2

        if use_na_proxy:
            assert fill_value is None
            shape = (len(placement), self.shape[1])
            vals = np.empty(shape, dtype=np.void)
            nb = NumpyBlock(vals, placement, ndim=2)
            return nb

        if fill_value is None:
            fill_value = np.nan
        block_shape = list(self.shape)
        block_shape[0] = len(placement)

        dtype, fill_value = infer_dtype_from_scalar(fill_value)
        # error: Argument "dtype" to "empty" has incompatible type "Union[dtype,
        # ExtensionDtype]"; expected "Union[dtype, None, type, _SupportsDtype, str,
        # Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], List[Any], _DtypeDict,
        # Tuple[Any, Any]]"
        block_values = np.empty(block_shape, dtype=dtype)  # type: ignore[arg-type]
        block_values.fill(fill_value)
        return new_block_2d(block_values, placement=placement)

    def take(
        self: T,
        indexer,
        axis: AxisInt = 1,
        verify: bool = True,
        convert_indices: bool = True,
    ) -> T:
        """
        Take items along any axis.

        Parameters
        ----------
        indexer : np.ndarray or slice
        axis : int, default 1
        verify : bool, default True
            Check that all entries are between 0 and len(self) - 1, inclusive.
            Pass verify=False if this check has been done by the caller.
        convert_indices : bool, default True
            Whether to attempt to convert indices to positive values.

        Returns
        -------
        BlockManager
        """
        # We have 6 tests that get here with a slice
        indexer = (
            np.arange(indexer.start, indexer.stop, indexer.step, dtype=np.intp)
            if isinstance(indexer, slice)
            else np.asanyarray(indexer, dtype=np.intp)
        )

        n = self.shape[axis]
        if convert_indices:
            indexer = maybe_convert_indices(indexer, n, verify=verify)

        new_labels = self.axes[axis].take(indexer)
        return self.reindex_indexer(
            new_axis=new_labels,
            indexer=indexer,
            axis=axis,
            allow_dups=True,
            copy=None,
        )


class BlockManager(libinternals.BlockManager, BaseBlockManager):
    """
    BaseBlockManager that holds 2D blocks.
    """

    ndim = 2

    # ----------------------------------------------------------------
    # Constructors

    def __init__(
        self,
        blocks: Sequence[Block],
        axes: Sequence[Index],
        verify_integrity: bool = True,
    ) -> None:
        if verify_integrity:
            # Assertion disabled for performance
            # assert all(isinstance(x, Index) for x in axes)

            for block in blocks:
                if self.ndim != block.ndim:
                    raise AssertionError(
                        f"Number of Block dimensions ({block.ndim}) must equal "
                        f"number of axes ({self.ndim})"
                    )
                # As of 2.0, the caller is responsible for ensuring that
                # DatetimeTZBlock with block.ndim == 2 has block.values.ndim == 2;
                # previously there was a special check for fastparquet compat.

            self._verify_integrity()

    def _verify_integrity(self) -> None:
        mgr_shape = self.shape
        tot_items = sum(len(x.mgr_locs) for x in self.blocks)
        for block in self.blocks:
            if block.shape[1:] != mgr_shape[1:]:
                raise_construction_error(tot_items, block.shape[1:], self.axes)
        if len(self.items) != tot_items:
            raise AssertionError(
                "Number of manager items must equal union of "
                f"block items\n# manager items: {len(self.items)}, # "
                f"tot_items: {tot_items}"
            )

    @classmethod
    def from_blocks(cls, blocks: list[Block], axes: list[Index]) -> BlockManager:
        """
        Constructor for BlockManager and SingleBlockManager with same signature.
        """
        return cls(blocks, axes, verify_integrity=False)

    # ----------------------------------------------------------------
    # Indexing

    def fast_xs(self, loc: int) -> SingleBlockManager:
        """
        Return the array corresponding to `frame.iloc[loc]`.

        Parameters
        ----------
        loc : int

        Returns
        -------
        np.ndarray or ExtensionArray
        """
        if len(self.blocks) == 1:
            # TODO: this could be wrong if blk.mgr_locs is not slice(None)-like;
            #  is this ruled out in the general case?
            result = self.blocks[0].iget((slice(None), loc))
            # in the case of a single block, the new block is a view
            block = new_block(
                result,
                placement=slice(0, len(result)),
                ndim=1,
                refs=self.blocks[0].refs,
            )
            return SingleBlockManager(block, self.axes[0])

        dtype = interleaved_dtype([blk.dtype for blk in self.blocks])

        n = len(self)

        # GH#46406
        immutable_ea = isinstance(dtype, SparseDtype)

        if isinstance(dtype, ExtensionDtype) and not immutable_ea:
            cls = dtype.construct_array_type()
            result = cls._empty((n,), dtype=dtype)
        else:
            # error: Argument "dtype" to "empty" has incompatible type
            # "Union[Type[object], dtype[Any], ExtensionDtype, None]"; expected
            # "None"
            result = np.empty(
                n, dtype=object if immutable_ea else dtype  # type: ignore[arg-type]
            )
            result = ensure_wrapped_if_datetimelike(result)

        for blk in self.blocks:
            # Such assignment may incorrectly coerce NaT to None
            # result[blk.mgr_locs] = blk._slice((slice(None), loc))
            for i, rl in enumerate(blk.mgr_locs):
                result[rl] = blk.iget((i, loc))

        if immutable_ea:
            dtype = cast(ExtensionDtype, dtype)
            result = dtype.construct_array_type()._from_sequence(result, dtype=dtype)

        block = new_block(result, placement=slice(0, len(result)), ndim=1)
        return SingleBlockManager(block, self.axes[0])
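
    # Hedged sketch: for a mixed int/float frame, ``df._mgr.fast_xs(0)`` is
    # what backs ``df.iloc[0]``: it interleaves the row into the common dtype
    # (float64 in that case) and wraps it in a length-n_columns
    # SingleBlockManager.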

    def iget(self, i: int, track_ref: bool = True) -> SingleBlockManager:
        """
        Return the data as a SingleBlockManager.
        """
        block = self.blocks[self.blknos[i]]
        values = block.iget(self.blklocs[i])

        # shortcut for select a single-dim from a 2-dim BM
        bp = BlockPlacement(slice(0, len(values)))
        nb = type(block)(
            values, placement=bp, ndim=1, refs=block.refs if track_ref else None
        )
        return SingleBlockManager(nb, self.axes[1])

    def iget_values(self, i: int) -> ArrayLike:
        """
        Return the data for column i as the values (ndarray or ExtensionArray).

        Warning! The returned array is a view but doesn't handle Copy-on-Write,
        so this should be used with caution.
        """
        # TODO(CoW) making the arrays read-only might make this safer to use?
        block = self.blocks[self.blknos[i]]
        values = block.iget(self.blklocs[i])
        return values

    @property
    def column_arrays(self) -> list[np.ndarray]:
        """
        Used in the JSON C code to access column arrays.
        This optimizes compared to using `iget_values` by converting each
        block only once instead of once per column.

        Warning! This doesn't handle Copy-on-Write, so should be used with
        caution (current use case of consuming this in the JSON code is fine).
        """
        # This is an optimized equivalent to
        #  result = [self.iget_values(i) for i in range(len(self.items))]
        result: list[np.ndarray | None] = [None] * len(self.items)

        for blk in self.blocks:
            mgr_locs = blk._mgr_locs
            values = blk.values_for_json()
            if values.ndim == 1:
                # TODO(EA2D): special casing not needed with 2D EAs
                result[mgr_locs[0]] = values

            else:
                for i, loc in enumerate(mgr_locs):
                    result[loc] = values[i]

        # error: Incompatible return value type (got "List[None]",
        # expected "List[ndarray[Any, Any]]")
        return result  # type: ignore[return-value]

    def iset(
        self, loc: int | slice | np.ndarray, value: ArrayLike, inplace: bool = False
    ):
        """
        Set new item in-place. Does not consolidate. Adds new Block if not
        contained in the current set of items.
        """

        # FIXME: refactor, clearly separate broadcasting & zip-like assignment
        #  can prob also fix the various if tests for sparse/categorical
        if self._blklocs is None and self.ndim > 1:
            self._rebuild_blknos_and_blklocs()

        # Note: we exclude DTA/TDA here
        value_is_extension_type = is_1d_only_ea_dtype(value.dtype)
        if not value_is_extension_type:
            if value.ndim == 2:
                value = value.T
            else:
                value = ensure_block_shape(value, ndim=2)

            if value.shape[1:] != self.shape[1:]:
                raise AssertionError(
                    "Shape of new values must be compatible with manager shape"
                )

        if lib.is_integer(loc):
            # We have 6 tests where loc is _not_ an int.
            # In this case, get_blkno_placements will yield only one tuple,
            #  containing (self._blknos[loc], BlockPlacement(slice(0, 1, 1)))

            # Check if we can use _iset_single fastpath
            loc = cast(int, loc)
            blkno = self.blknos[loc]
            blk = self.blocks[blkno]
            if len(blk._mgr_locs) == 1:  # TODO: fastest way to check this?
                return self._iset_single(
                    loc,
                    value,
                    inplace=inplace,
                    blkno=blkno,
                    blk=blk,
                )

            # error: Incompatible types in assignment (expression has type
            # "List[Union[int, slice, ndarray]]", variable has type "Union[int,
            # slice, ndarray]")
            loc = [loc]  # type: ignore[assignment]

        # categorical/sparse/datetimetz
        if value_is_extension_type:

            def value_getitem(placement):
                return value

        else:

            def value_getitem(placement):
                return value[placement.indexer]

        # Accessing public blknos ensures the public versions are initialized
        blknos = self.blknos[loc]
        blklocs = self.blklocs[loc].copy()

        unfit_mgr_locs = []
        unfit_val_locs = []
        removed_blknos = []
        for blkno_l, val_locs in libinternals.get_blkno_placements(blknos, group=True):
            blk = self.blocks[blkno_l]
            blk_locs = blklocs[val_locs.indexer]
            if inplace and blk.should_store(value):
                # Updating inplace -> check if we need to do Copy-on-Write
                if using_copy_on_write() and not self._has_no_reference_block(blkno_l):
                    self._iset_split_block(blkno_l, blk_locs, value_getitem(val_locs))
                else:
                    blk.set_inplace(blk_locs, value_getitem(val_locs))
                    continue
            else:
                unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs])
                unfit_val_locs.append(val_locs)

                # If all block items are unfit, schedule the block for removal.
                if len(val_locs) == len(blk.mgr_locs):
                    removed_blknos.append(blkno_l)
                    continue
                else:
                    # Defer setting the new values to enable consolidation
                    self._iset_split_block(blkno_l, blk_locs)

        if len(removed_blknos):
            # Remove blocks & update blknos accordingly
            is_deleted = np.zeros(self.nblocks, dtype=np.bool_)
            is_deleted[removed_blknos] = True

            new_blknos = np.empty(self.nblocks, dtype=np.intp)
            new_blknos.fill(-1)
            new_blknos[~is_deleted] = np.arange(self.nblocks - len(removed_blknos))
            self._blknos = new_blknos[self._blknos]
            self.blocks = tuple(
                blk for i, blk in enumerate(self.blocks) if i not in set(removed_blknos)
            )

        if unfit_val_locs:
            unfit_idxr = np.concatenate(unfit_mgr_locs)
            unfit_count = len(unfit_idxr)

            new_blocks: list[Block] = []
            # TODO(CoW) is this always correct to assume that the new_blocks
            # are not referencing anything else?
            if value_is_extension_type:
                # This code (ab-)uses the fact that EA blocks contain only
                # one item.
                # TODO(EA2D): special casing unnecessary with 2D EAs
                new_blocks.extend(
                    new_block_2d(
                        values=value,
                        placement=BlockPlacement(slice(mgr_loc, mgr_loc + 1)),
                    )
                    for mgr_loc in unfit_idxr
                )

                self._blknos[unfit_idxr] = np.arange(unfit_count) + len(self.blocks)
                self._blklocs[unfit_idxr] = 0

            else:
                # unfit_val_locs contains BlockPlacement objects
                unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:])

                new_blocks.append(
                    new_block_2d(
                        values=value_getitem(unfit_val_items),
                        placement=BlockPlacement(unfit_idxr),
                    )
                )

                self._blknos[unfit_idxr] = len(self.blocks)
                self._blklocs[unfit_idxr] = np.arange(unfit_count)

            self.blocks += tuple(new_blocks)

            # Newly created block's dtype may already be present.
            self._known_consolidated = False

    def _iset_split_block(
        self,
        blkno_l: int,
        blk_locs: np.ndarray | list[int],
        value: ArrayLike | None = None,
    ) -> None:
        """Removes columns from a block by splitting the block.

        Avoids copying the whole block through slicing and updates the manager
        after determining the new block structure. Optionally adds a new block,
        otherwise this has to be done by the caller.

        Parameters
        ----------
        blkno_l: The block number to operate on, relevant for updating the manager
        blk_locs: The locations of our block that should be deleted.
        value: The value to set as a replacement.
        """
        blk = self.blocks[blkno_l]

        if self._blklocs is None:
            self._rebuild_blknos_and_blklocs()

        nbs_tup = tuple(blk.delete(blk_locs))
        if value is not None:
            locs = blk.mgr_locs.as_array[blk_locs]
            first_nb = new_block_2d(value, BlockPlacement(locs))
        else:
            first_nb = nbs_tup[0]
            nbs_tup = tuple(nbs_tup[1:])

        nr_blocks = len(self.blocks)
        blocks_tup = (
            self.blocks[:blkno_l] + (first_nb,) + self.blocks[blkno_l + 1 :] + nbs_tup
        )
        self.blocks = blocks_tup

        if not nbs_tup and value is not None:
            # No need to update anything if split did not happen
            return

        self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb))

        for i, nb in enumerate(nbs_tup):
            self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb))
            self._blknos[nb.mgr_locs.indexer] = i + nr_blocks

    def _iset_single(
        self, loc: int, value: ArrayLike, inplace: bool, blkno: int, blk: Block
    ) -> None:
        """
        Fastpath for iset when we are only setting a single position and
        the Block currently in that position is itself single-column.

        In this case we can swap out the entire Block and blklocs and blknos
        are unaffected.
        """
        # Caller is responsible for verifying value.shape

        if inplace and blk.should_store(value):
            copy = False
            if using_copy_on_write() and not self._has_no_reference_block(blkno):
                # perform Copy-on-Write and clear the reference
                copy = True
            iloc = self.blklocs[loc]
            blk.set_inplace(slice(iloc, iloc + 1), value, copy=copy)
            return

        nb = new_block_2d(value, placement=blk._mgr_locs)
        old_blocks = self.blocks
        new_blocks = old_blocks[:blkno] + (nb,) + old_blocks[blkno + 1 :]
        self.blocks = new_blocks
        return

    def column_setitem(
        self, loc: int, idx: int | slice | np.ndarray, value, inplace_only: bool = False
    ) -> None:
        """
        Set values ("setitem") into a single column (not setting the full column).

        This is a method on the BlockManager level, to avoid creating an
        intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`)
        """
        if using_copy_on_write() and not self._has_no_reference(loc):
            blkno = self.blknos[loc]
            # Split blocks to only copy the column we want to modify
            blk_loc = self.blklocs[loc]
            # Copy our values
            values = self.blocks[blkno].values
            if values.ndim == 1:
                values = values.copy()
            else:
                # Use [blk_loc] as indexer to keep ndim=2, this already results in a
                # copy
                values = values[[blk_loc]]
            self._iset_split_block(blkno, [blk_loc], values)

        # this manager is only created temporarily to mutate the values in place
        # so don't track references, otherwise the `setitem` would perform CoW again
        col_mgr = self.iget(loc, track_ref=False)
        if inplace_only:
            col_mgr.setitem_inplace(idx, value)
        else:
            new_mgr = col_mgr.setitem((idx,), value)
            self.iset(loc, new_mgr._block.values, inplace=True)

    def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None:
        """
        Insert item at selected position.

        Parameters
        ----------
        loc : int
        item : hashable
        value : np.ndarray or ExtensionArray
        """
        # insert to the axis; this could possibly raise a TypeError
        new_axis = self.items.insert(loc, item)

        if value.ndim == 2:
            value = value.T
            if len(value) > 1:
                raise ValueError(
                    f"Expected a 1D array, got an array with shape {value.T.shape}"
                )
        else:
            value = ensure_block_shape(value, ndim=self.ndim)

        bp = BlockPlacement(slice(loc, loc + 1))
        # TODO(CoW) do we always "own" the passed `value`?
        block = new_block_2d(values=value, placement=bp)

        if not len(self.blocks):
            # Fastpath
            self._blklocs = np.array([0], dtype=np.intp)
            self._blknos = np.array([0], dtype=np.intp)
        else:
            self._insert_update_mgr_locs(loc)
            self._insert_update_blklocs_and_blknos(loc)

        self.axes[0] = new_axis
        self.blocks += (block,)

        self._known_consolidated = False

        if sum(not block.is_extension for block in self.blocks) > 100:
            warnings.warn(
                "DataFrame is highly fragmented. This is usually the result "
                "of calling `frame.insert` many times, which has poor performance. "
                "Consider joining all columns at once using pd.concat(axis=1) "
                "instead. To get a de-fragmented frame, use `newframe = frame.copy()`",
                PerformanceWarning,
                stacklevel=find_stack_level(),
            )
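
    # Hedged reproduction of the fragmentation warning above:
    #
    #     df = pd.DataFrame(index=range(2))
    #     for i in range(101):
    #         df[f"c{i}"] = i     # each assignment appends one new block
    #
    # nblocks keeps growing until a consolidating operation runs, so the
    # PerformanceWarning fires once more than 100 non-extension blocks
    # accumulate.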

    def _insert_update_mgr_locs(self, loc) -> None:
        """
        When inserting a new Block at location 'loc', we increment
        all of the mgr_locs of blocks above that by one.
        """
        for blkno, count in _fast_count_smallints(self.blknos[loc:]):
            # .620 this way, .326 of which is in increment_above
            blk = self.blocks[blkno]
            blk._mgr_locs = blk._mgr_locs.increment_above(loc)

    def _insert_update_blklocs_and_blknos(self, loc) -> None:
        """
        When inserting a new Block at location 'loc', we update our
        _blklocs and _blknos.
        """

        # Accessing public blklocs ensures the public versions are initialized
        if loc == self.blklocs.shape[0]:
            # np.append is a lot faster, let's use it if we can.
            self._blklocs = np.append(self._blklocs, 0)
            self._blknos = np.append(self._blknos, len(self.blocks))
        elif loc == 0:
            # np.append is a lot faster, let's use it if we can.
            self._blklocs = np.append(self._blklocs[::-1], 0)[::-1]
            self._blknos = np.append(self._blknos[::-1], len(self.blocks))[::-1]
        else:
            new_blklocs, new_blknos = libinternals.update_blklocs_and_blknos(
                self.blklocs, self.blknos, loc, len(self.blocks)
            )
            self._blklocs = new_blklocs
            self._blknos = new_blknos

    def idelete(self, indexer) -> BlockManager:
        """
        Delete selected locations, returning a new BlockManager.
        """
        is_deleted = np.zeros(self.shape[0], dtype=np.bool_)
        is_deleted[indexer] = True
        taker = (~is_deleted).nonzero()[0]

        nbs = self._slice_take_blocks_ax0(taker, only_slice=True)
        new_columns = self.items[~is_deleted]
        axes = [new_columns, self.axes[1]]
        return type(self)(tuple(nbs), axes, verify_integrity=False)

    # ----------------------------------------------------------------
    # Block-wise Operation

    def grouped_reduce(self: T, func: Callable) -> T:
        """
        Apply grouped reduction function blockwise, returning a new BlockManager.

        Parameters
        ----------
        func : grouped reduction function

        Returns
        -------
        BlockManager
        """
        result_blocks: list[Block] = []

        for blk in self.blocks:
            if blk.is_object:
                # split on object-dtype blocks because some columns may raise
                #  while others do not.
                for sb in blk._split():
                    applied = sb.apply(func)
                    result_blocks = extend_blocks(applied, result_blocks)
            else:
                applied = blk.apply(func)
                result_blocks = extend_blocks(applied, result_blocks)

        if len(result_blocks) == 0:
            nrows = 0
        else:
            nrows = result_blocks[0].values.shape[-1]
        index = Index(range(nrows))

        return type(self).from_blocks(result_blocks, [self.axes[0], index])

    def reduce(self: T, func: Callable) -> T:
        """
        Apply reduction function blockwise, returning a single-row BlockManager.

        Parameters
        ----------
        func : reduction function

        Returns
        -------
        BlockManager
        """
        # If 2D, we assume that we're operating column-wise
        assert self.ndim == 2

        res_blocks: list[Block] = []
        for blk in self.blocks:
            nbs = blk.reduce(func)
            res_blocks.extend(nbs)

        index = Index([None])  # placeholder
        new_mgr = type(self).from_blocks(res_blocks, [self.items, index])
        return new_mgr

    def operate_blockwise(self, other: BlockManager, array_op) -> BlockManager:
        """
        Apply array_op blockwise with another (aligned) BlockManager.
        """
        return operate_blockwise(self, other, array_op)

    def _equal_values(self: BlockManager, other: BlockManager) -> bool:
        """
        Used in .equals defined in base class. Only check the column values
        assuming shape and indexes have already been checked.
        """
        return blockwise_all(self, other, array_equals)

    def quantile(
        self: T,
        *,
        qs: Index,  # with dtype float64
        axis: AxisInt = 0,
        interpolation: QuantileInterpolation = "linear",
    ) -> T:
        """
        Iterate over blocks applying quantile reduction.
        This routine is intended for reduction type operations and
        will do inference on the generated blocks.

        Parameters
        ----------
        qs : Index
            The quantiles to be computed (float64 dtype).
        axis : reduction axis, default 0
        interpolation : type of interpolation, default 'linear'

        Returns
        -------
        BlockManager
        """
        # Series dispatches to DataFrame for quantile, which allows us to
        #  simplify some of the code here and in the blocks
        assert self.ndim >= 2
        assert is_list_like(qs)  # caller is responsible for this
        assert axis == 1  # only ever called this way

        new_axes = list(self.axes)
        new_axes[1] = Index(qs, dtype=np.float64)

        blocks = [
            blk.quantile(axis=axis, qs=qs, interpolation=interpolation)
            for blk in self.blocks
        ]

        return type(self)(blocks, new_axes)

    # ----------------------------------------------------------------

    def unstack(self, unstacker, fill_value) -> BlockManager:
        """
        Return a BlockManager with all blocks unstacked.

        Parameters
        ----------
        unstacker : reshape._Unstacker
        fill_value : Any
            fill_value for newly introduced missing values.

        Returns
        -------
        unstacked : BlockManager
        """
        new_columns = unstacker.get_new_columns(self.items)
        new_index = unstacker.new_index

        allow_fill = not unstacker.mask_all
        if allow_fill:
            # calculating the full mask once and passing it to Block._unstack is
            #  faster than calculating it in each repeated call
            new_mask2D = (~unstacker.mask).reshape(*unstacker.full_shape)
            needs_masking = new_mask2D.any(axis=0)
        else:
            needs_masking = np.zeros(unstacker.full_shape[1], dtype=bool)

        new_blocks: list[Block] = []
        columns_mask: list[np.ndarray] = []

        if len(self.items) == 0:
            factor = 1
        else:
            fac = len(new_columns) / len(self.items)
            assert fac == int(fac)
            factor = int(fac)

        for blk in self.blocks:
            mgr_locs = blk.mgr_locs
            new_placement = mgr_locs.tile_for_unstack(factor)

            blocks, mask = blk._unstack(
                unstacker,
                fill_value,
                new_placement=new_placement,
                needs_masking=needs_masking,
            )

            new_blocks.extend(blocks)
            columns_mask.extend(mask)

            # Block._unstack should ensure this holds,
            assert mask.sum() == sum(len(nb._mgr_locs) for nb in blocks)
            # In turn this ensures that in the BlockManager call below
            #  we have len(new_columns) == sum(x.shape[0] for x in new_blocks)
            #  which suffices to allow us to pass verify_integrity=False

        new_columns = new_columns[columns_mask]

        bm = BlockManager(new_blocks, [new_columns, new_index], verify_integrity=False)
        return bm

    def to_dict(self, copy: bool = True):
        """
        Return a dict of str(dtype) -> BlockManager

        Parameters
        ----------
        copy : bool, default True

        Returns
        -------
        values : a dict of dtype -> BlockManager
        """

        bd: dict[str, list[Block]] = {}
        for b in self.blocks:
            bd.setdefault(str(b.dtype), []).append(b)

        # TODO(EA2D): the combine will be unnecessary with 2D EAs
        return {dtype: self._combine(blocks, copy=copy) for dtype, blocks in bd.items()}
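
    # Hedged sketch: a frame with one int and one float column round-trips as
    #     {"int64": <BlockManager, 1 block>, "float64": <BlockManager, 1 block>}
    # which DataFrame-level code can then reassemble per dtype.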

    def as_array(
        self,
        dtype: np.dtype | None = None,
        copy: bool = False,
        na_value: object = lib.no_default,
    ) -> np.ndarray:
        """
        Convert the blockmanager data into a numpy array.

        Parameters
        ----------
        dtype : np.dtype or None, default None
            Data type of the return array.
        copy : bool, default False
            If True then guarantee that a copy is returned. A value of
            False does not guarantee that the underlying data is not
            copied.
        na_value : object, default lib.no_default
            Value to be used as the missing value sentinel.

        Returns
        -------
        arr : ndarray
        """
        # TODO(CoW) handle case where resulting array is a view
        if len(self.blocks) == 0:
            arr = np.empty(self.shape, dtype=float)
            return arr.transpose()

        # We want to copy when na_value is provided to avoid
        # mutating the original object
        copy = copy or na_value is not lib.no_default

        if self.is_single_block:
            blk = self.blocks[0]
            if blk.is_extension:
                # Avoid implicit conversion of extension blocks to object

                # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no
                # attribute "to_numpy"
                arr = blk.values.to_numpy(  # type: ignore[union-attr]
                    dtype=dtype,
                    na_value=na_value,
                ).reshape(blk.shape)
            else:
                arr = np.asarray(blk.get_values())
                if dtype:
                    arr = arr.astype(dtype, copy=False)

            if copy:
                arr = arr.copy()
            elif using_copy_on_write():
                arr = arr.view()
                arr.flags.writeable = False
        else:
            arr = self._interleave(dtype=dtype, na_value=na_value)
            # The underlying data was copied within _interleave, so no need
            # to further copy if copy=True or setting na_value

        if na_value is not lib.no_default:
            arr[isna(arr)] = na_value

        return arr.transpose()
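
    # Hedged sketch: df._mgr.as_array() is what ultimately backs
    # ``df.to_numpy()``; mixed int/float blocks interleave to float64, while a
    # single-block manager may hand back a view (read-only under CoW) when
    # copy=False.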

    def _interleave(
        self,
        dtype: np.dtype | None = None,
        na_value: object = lib.no_default,
    ) -> np.ndarray:
        """
        Return ndarray from blocks with specified item order
        Items must be contained in the blocks
        """
        if not dtype:
            # Incompatible types in assignment (expression has type
            # "Optional[Union[dtype[Any], ExtensionDtype]]", variable has
            # type "Optional[dtype[Any]]")
            dtype = interleaved_dtype(  # type: ignore[assignment]
                [blk.dtype for blk in self.blocks]
            )

        # TODO: https://github.com/pandas-dev/pandas/issues/22791
        # Give EAs some input on what happens here. Sparse needs this.
        if isinstance(dtype, SparseDtype):
            dtype = dtype.subtype
            dtype = cast(np.dtype, dtype)
        elif isinstance(dtype, ExtensionDtype):
            dtype = np.dtype("object")
        elif is_dtype_equal(dtype, str):
            dtype = np.dtype("object")

        result = np.empty(self.shape, dtype=dtype)

        itemmask = np.zeros(self.shape[0])

        if dtype == np.dtype("object") and na_value is lib.no_default:
            # much more performant than using to_numpy below
            for blk in self.blocks:
                rl = blk.mgr_locs
                arr = blk.get_values(dtype)
                result[rl.indexer] = arr
                itemmask[rl.indexer] = 1
            return result

        for blk in self.blocks:
            rl = blk.mgr_locs
            if blk.is_extension:
                # Avoid implicit conversion of extension blocks to object

                # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no
                # attribute "to_numpy"
                arr = blk.values.to_numpy(  # type: ignore[union-attr]
                    dtype=dtype,
                    na_value=na_value,
                )
            else:
                arr = blk.get_values(dtype)
            result[rl.indexer] = arr
            itemmask[rl.indexer] = 1

        if not itemmask.all():
            raise AssertionError("Some items were not contained in blocks")

        return result

    # ----------------------------------------------------------------
    # Consolidation

    def is_consolidated(self) -> bool:
        """
        Return True if there is at most one block per dtype, i.e. blocks of
        the same dtype have already been merged.
        """
        if not self._known_consolidated:
            self._consolidate_check()
        return self._is_consolidated

    def _consolidate_check(self) -> None:
        if len(self.blocks) == 1:
            # fastpath
            self._is_consolidated = True
            self._known_consolidated = True
            return
        dtypes = [blk.dtype for blk in self.blocks if blk._can_consolidate]
        self._is_consolidated = len(dtypes) == len(set(dtypes))
        self._known_consolidated = True

    def _consolidate_inplace(self) -> None:
        # In general, _consolidate_inplace should only be called via
        #  DataFrame._consolidate_inplace, otherwise we will fail to invalidate
        #  the DataFrame's _item_cache. The exception is for newly-created
        #  BlockManager objects not yet attached to a DataFrame.
        if not self.is_consolidated():
            self.blocks = _consolidate(self.blocks)
            self._is_consolidated = True
            self._known_consolidated = True
            self._rebuild_blknos_and_blklocs()


class SingleBlockManager(BaseBlockManager, SingleDataManager):
    """Manage a single block; the 1-dimensional counterpart used by Series."""

    @property
    def ndim(self) -> Literal[1]:
        return 1

    _is_consolidated = True
    _known_consolidated = True
    __slots__ = ()
    is_single_block = True

    def __init__(
        self,
        block: Block,
        axis: Index,
        verify_integrity: bool = False,
    ) -> None:
        # Assertions disabled for performance
        # assert isinstance(block, Block), type(block)
        # assert isinstance(axis, Index), type(axis)

        self.axes = [axis]
        self.blocks = (block,)

    @classmethod
    def from_blocks(
        cls,
        blocks: list[Block],
        axes: list[Index],
    ) -> SingleBlockManager:
        """
        Constructor for BlockManager and SingleBlockManager with same signature.
        """
        assert len(blocks) == 1
        assert len(axes) == 1
        return cls(blocks[0], axes[0], verify_integrity=False)

    @classmethod
    def from_array(
        cls, array: ArrayLike, index: Index, refs: BlockValuesRefs | None = None
    ) -> SingleBlockManager:
        """
        Constructor for if we have an array that is not yet a Block.
        """
        block = new_block(array, placement=slice(0, len(index)), ndim=1, refs=refs)
        return cls(block, index)
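
    # Hedged sketch (internal API): Series construction ultimately arrives at
    # something like
    #     mgr = SingleBlockManager.from_array(np.array([1, 2, 3]), Index(range(3)))
    # and ``pd.Series([1, 2, 3])._mgr`` wraps the equivalent structure.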
1882
1883 def to_2d_mgr(self, columns: Index) -> BlockManager:
1884 """
1885 Manager analogue of Series.to_frame
1886 """
1887 blk = self.blocks[0]
1888 arr = ensure_block_shape(blk.values, ndim=2)
1889 bp = BlockPlacement(0)
1890 new_blk = type(blk)(arr, placement=bp, ndim=2, refs=blk.refs)
1891 axes = [columns, self.axes[0]]
1892 return BlockManager([new_blk], axes=axes, verify_integrity=False)

    def _has_no_reference(self, i: int = 0) -> bool:
        """
        Check whether column `i` has references
        (i.e. whether it references another array or is itself referenced).
        Returns True if the column has no references.
        """
        return not self.blocks[0].refs.has_reference()

    def __getstate__(self):
        block_values = [b.values for b in self.blocks]
        block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks]
        axes_array = list(self.axes)

        extra_state = {
            "0.14.1": {
                "axes": axes_array,
                "blocks": [
                    {"values": b.values, "mgr_locs": b.mgr_locs.indexer}
                    for b in self.blocks
                ],
            }
        }

        # First three elements of the state are to maintain forward
        # compatibility with 0.13.1.
        return axes_array, block_values, block_items, extra_state

    def __setstate__(self, state):
        def unpickle_block(values, mgr_locs, ndim: int) -> Block:
            # TODO(EA2D): ndim would be unnecessary with 2D EAs
            # older pickles may store e.g. DatetimeIndex instead of DatetimeArray
            values = extract_array(values, extract_numpy=True)
            return new_block(values, placement=mgr_locs, ndim=ndim)

        if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]:
            state = state[3]["0.14.1"]
            self.axes = [ensure_index(ax) for ax in state["axes"]]
            ndim = len(self.axes)
            self.blocks = tuple(
                unpickle_block(b["values"], b["mgr_locs"], ndim=ndim)
                for b in state["blocks"]
            )
        else:
            raise NotImplementedError("pre-0.14.1 pickles are no longer supported")

        self._post_setstate()

    def _post_setstate(self) -> None:
        pass
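
    # Pickle round-trip sketch (hypothetical session, not executed; assumes
    # `import pickle` and `import pandas as pd`): the versioned "0.14.1" dict
    # in the fourth state element is what __setstate__ consumes.
    #
    #   ser = pd.Series([1.0, 2.0])
    #   state = ser._mgr.__getstate__()
    #   "0.14.1" in state[3]                     # True
    #   ser2 = pickle.loads(pickle.dumps(ser))   # exercises the same path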

    @cache_readonly
    def _block(self) -> Block:
        return self.blocks[0]

    @property
    def _blknos(self):
        """compat with BlockManager"""
        return None

    @property
    def _blklocs(self):
        """compat with BlockManager"""
        return None

    def getitem_mgr(self, indexer: slice | np.ndarray) -> SingleBlockManager:
        # similar to get_slice, but not restricted to slice indexer
        blk = self._block
        if (
            using_copy_on_write()
            and isinstance(indexer, np.ndarray)
            and len(indexer) > 0
            and com.is_bool_indexer(indexer)
            and indexer.all()
        ):
            return type(self)(blk.copy(deep=False), self.index)
        array = blk._slice(indexer)
        if array.ndim > 1:
            # This will be caught by Series._get_values
            raise ValueError("dimension-expanding indexing not allowed")

        bp = BlockPlacement(slice(0, len(array)))
        # TODO(CoW) in theory only need to track reference if new_array is a view
        block = type(blk)(array, placement=bp, ndim=1, refs=blk.refs)

        new_idx = self.index[indexer]
        return type(self)(block, new_idx)

    def get_slice(self, slobj: slice, axis: AxisInt = 0) -> SingleBlockManager:
        # Assertion disabled for performance
        # assert isinstance(slobj, slice), type(slobj)
        if axis >= self.ndim:
            raise IndexError("Requested axis not found in manager")

        blk = self._block
        array = blk._slice(slobj)
        bp = BlockPlacement(slice(0, len(array)))
        # TODO this method is only used in groupby SeriesSplitter at the moment,
        # so passing refs is not yet covered by the tests
        block = type(blk)(array, placement=bp, ndim=1, refs=blk.refs)
        new_index = self.index._getitem_slice(slobj)
        return type(self)(block, new_index)
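
    # Slicing sketch (hypothetical session, not executed; assumes
    # `import pandas as pd`): both the block values and the index are
    # sliced, and refs are propagated to the new block.
    #
    #   ser = pd.Series([1, 2, 3, 4])
    #   sub = ser._mgr.get_slice(slice(1, 3))
    #   list(sub.index)            # [1, 2]
    #   len(sub._block.values)     # 2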

    @property
    def index(self) -> Index:
        return self.axes[0]

    @property
    def dtype(self) -> DtypeObj:
        return self._block.dtype

    def get_dtypes(self) -> np.ndarray:
        return np.array([self._block.dtype])

    def external_values(self):
        """The array that Series.values returns"""
        return self._block.external_values()

    def internal_values(self):
        """The array that Series._values returns"""
        return self._block.values

    def array_values(self):
        """The array that Series.array returns"""
        return self._block.array_values

    def get_numeric_data(self, copy: bool = False):
        if self._block.is_numeric:
            return self.copy(deep=copy)
        return self.make_empty()

    @property
    def _can_hold_na(self) -> bool:
        return self._block._can_hold_na

    def setitem_inplace(self, indexer, value) -> None:
        """
        Set values with indexer.

        For Single[Block/Array]Manager, this backs s[indexer] = value

        This is an inplace version of `setitem()`, mutating the manager/values
        in place, not returning a new Manager (and Block), and thus never changing
        the dtype.
        """
        if using_copy_on_write() and not self._has_no_reference(0):
            self.blocks = (self._block.copy(),)
            self._cache.clear()

        super().setitem_inplace(indexer, value)
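
    # Copy-on-write sketch (hypothetical session, not executed; assumes
    # `import pandas as pd` with copy_on_write mode enabled): a block shared
    # with another object is copied before being mutated.
    #
    #   ser = pd.Series([1, 2, 3])
    #   view = ser[:]                       # shares the block under CoW
    #   ser._mgr.setitem_inplace(0, 10)     # copies first; `view` is unchanged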

    def idelete(self, indexer) -> SingleBlockManager:
        """
        Delete single location from SingleBlockManager.

        Ensures that self.blocks doesn't become empty.
        """
        nb = self._block.delete(indexer)[0]
        self.blocks = (nb,)
        self.axes[0] = self.axes[0].delete(indexer)
        self._cache.clear()
        return self
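
    # Deletion sketch (hypothetical session, not executed; assumes
    # `import pandas as pd`): the block and the axis stay in sync.
    #
    #   ser = pd.Series([1, 2, 3])
    #   mgr = ser._mgr
    #   mgr.idelete(1)
    #   list(mgr.index)            # [0, 2]
    #   len(mgr._block.values)     # 2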

    def fast_xs(self, loc):
        """
        Fast path for getting a cross-section (returns a view of the data);
        not needed for SingleBlockManager.
        """
        raise NotImplementedError("Use series._values[loc] instead")

    def set_values(self, values: ArrayLike) -> None:
        """
        Set the values of the single block in place.

        Use at your own risk! This does not check if the passed values are
        valid for the current Block/SingleBlockManager (length, dtype, etc).
        """
        # TODO(CoW) do we need to handle copy on write here? Currently this is
        # only used for FrameColumnApply.series_generator (what if apply is
        # mutating inplace?)
        self.blocks[0].values = values
        self.blocks[0]._mgr_locs = BlockPlacement(slice(len(values)))

    def _equal_values(self: T, other: T) -> bool:
        """
        Used in .equals defined in base class. Only check the column values
        assuming shape and indexes have already been checked.
        """
        # For SingleBlockManager (i.e. Series)
        if other.ndim != 1:
            return False
        left = self.blocks[0].values
        right = other.blocks[0].values
        return array_equals(left, right)


# --------------------------------------------------------------------
# Constructor Helpers


def create_block_manager_from_blocks(
    blocks: list[Block],
    axes: list[Index],
    consolidate: bool = True,
    verify_integrity: bool = True,
) -> BlockManager:
    # If verify_integrity=False, then caller is responsible for checking:
    #  all(x.shape[-1] == len(axes[1]) for x in blocks)
    #  sum(x.shape[0] for x in blocks) == len(axes[0])
    #  set(x for blk in blocks for x in blk.mgr_locs) == set(range(len(axes[0])))
    #  all(blk.ndim == 2 for blk in blocks)
    # This allows us to safely pass verify_integrity=False

    try:
        mgr = BlockManager(blocks, axes, verify_integrity=verify_integrity)

    except ValueError as err:
        arrays = [blk.values for blk in blocks]
        tot_items = sum(arr.shape[0] for arr in arrays)
        raise_construction_error(tot_items, arrays[0].shape[1:], axes, err)

    if consolidate:
        mgr._consolidate_inplace()
    return mgr


def create_block_manager_from_column_arrays(
    arrays: list[ArrayLike],
    axes: list[Index],
    consolidate: bool,
    refs: list,
) -> BlockManager:
    # Assertions disabled for performance (caller is responsible for verifying)
    # assert isinstance(axes, list)
    # assert all(isinstance(x, Index) for x in axes)
    # assert all(isinstance(x, (np.ndarray, ExtensionArray)) for x in arrays)
    # assert all(type(x) is not PandasArray for x in arrays)
    # assert all(x.ndim == 1 for x in arrays)
    # assert all(len(x) == len(axes[1]) for x in arrays)
    # assert len(arrays) == len(axes[0])
    # These last three are sufficient to allow us to safely pass
    # verify_integrity=False below.

    try:
        blocks = _form_blocks(arrays, consolidate, refs)
        mgr = BlockManager(blocks, axes, verify_integrity=False)
    except ValueError as e:
        raise_construction_error(len(arrays), arrays[0].shape, axes, e)
    if consolidate:
        mgr._consolidate_inplace()
    return mgr
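
# Minimal usage sketch (hypothetical axes/arrays, not executed; assumes
# `import numpy as np` and that the caller guarantees the invariants listed
# in the comments above):
#
#   axes = [Index(["a", "b"]), Index(range(3))]
#   arrs = [np.arange(3.0), np.arange(3.0)]
#   mgr = create_block_manager_from_column_arrays(arrs, axes, True, [None, None])
#   mgr.nblocks   # 1: the two float64 columns consolidate into one block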


def raise_construction_error(
    tot_items: int,
    block_shape: Shape,
    axes: list[Index],
    e: ValueError | None = None,
):
    """Raise a helpful error message about a failed construction."""
    passed = tuple(map(int, [tot_items] + list(block_shape)))
    # Correcting the user-facing error message during DataFrame construction
    if len(passed) <= 2:
        passed = passed[::-1]

    implied = tuple(len(ax) for ax in axes)
    # Correcting the user-facing error message during DataFrame construction
    if len(implied) <= 2:
        implied = implied[::-1]

    # If the passed and implied shapes agree, the original error was not a
    # shape mismatch, so re-raise it unchanged.
    if passed == implied and e is not None:
        raise e
    if block_shape[0] == 0:
        raise ValueError("Empty data passed with indices specified.")
    raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")
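
# Sketch of the resulting message (hypothetical shapes, not executed: two
# 3-row columns passed, but the index implies 4 rows):
#
#   raise_construction_error(2, (3,), [Index(["a", "b"]), Index(range(4))])
#   # ValueError: Shape of passed values is (3, 2), indices imply (4, 2)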


# -----------------------------------------------------------------------


def _grouping_func(tup: tuple[int, ArrayLike]) -> tuple[int, bool, DtypeObj]:
    # compat for numpy<1.21, in which comparing a np.dtype with an ExtensionDtype
    # raises instead of returning False. Once earlier numpy versions are dropped,
    # the isinstance flag in the key can be dropped and we can group on the
    # (separator, dtype) pair alone.
    dtype = tup[1].dtype

    if is_1d_only_ea_dtype(dtype):
        # We know these won't be consolidated, so don't need to group these.
        # This avoids expensive comparisons of CategoricalDtype objects
        sep = id(dtype)
    else:
        sep = 0

    return sep, isinstance(dtype, np.dtype), dtype
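
# Sketch of the grouping key (hypothetical arrays, not executed; assumes
# `import numpy as np` and `import pandas as pd`): numpy dtypes share a
# separator of 0 so equal dtypes group together, while 1D-only EA dtypes get
# a unique id() separator and are never grouped.
#
#   _grouping_func((0, np.array([1, 2])))
#   # (0, True, dtype('int64'))
#   cat = pd.Categorical(["a", "b"])
#   _grouping_func((1, cat))
#   # (id(cat.dtype), False, CategoricalDtype(categories=['a', 'b'], ...))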


def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list[Block]:
    tuples = list(enumerate(arrays))

    if not consolidate:
        nbs = _tuples_to_blocks_no_consolidate(tuples, refs)
        return nbs

    # when consolidating, we can ignore refs (either stacking always copies,
    # or the EA is already copied in the calling dict_to_mgr)
    # TODO(CoW) check if this is also valid for rec_array_to_mgr

    # group by dtype
    grouper = itertools.groupby(tuples, _grouping_func)

    nbs = []
    for (_, _, dtype), tup_block in grouper:
        block_type = get_block_type(dtype)

        if isinstance(dtype, np.dtype):
            is_dtlike = dtype.kind in ["m", "M"]

            if issubclass(dtype.type, (str, bytes)):
                dtype = np.dtype(object)

            values, placement = _stack_arrays(list(tup_block), dtype)
            if is_dtlike:
                values = ensure_wrapped_if_datetimelike(values)
            blk = block_type(values, placement=BlockPlacement(placement), ndim=2)
            nbs.append(blk)

        elif is_1d_only_ea_dtype(dtype):
            dtype_blocks = [
                block_type(x[1], placement=BlockPlacement(x[0]), ndim=2)
                for x in tup_block
            ]
            nbs.extend(dtype_blocks)

        else:
            dtype_blocks = [
                block_type(
                    ensure_block_shape(x[1], 2), placement=BlockPlacement(x[0]), ndim=2
                )
                for x in tup_block
            ]
            nbs.extend(dtype_blocks)
    return nbs
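
# Sketch (hypothetical arrays, not executed; assumes `import numpy as np` and
# `import pandas as pd`): consecutive same-dtype numpy columns stack into one
# 2D block, while a 1D-only EA column becomes its own block.
#
#   arrs = [np.array([1, 2]), np.array([3, 4]), pd.Categorical(["x", "y"])]
#   blocks = _form_blocks(arrs, consolidate=True, refs=[None] * 3)
#   [blk.dtype for blk in blocks]
#   # [dtype('int64'), CategoricalDtype(categories=['x', 'y'], ...)]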


def _tuples_to_blocks_no_consolidate(tuples, refs) -> list[Block]:
    # tuples produced within _form_blocks are of the form (placement, array)
    return [
        new_block_2d(
            ensure_block_shape(arr, ndim=2), placement=BlockPlacement(i), refs=ref
        )
        for ((i, arr), ref) in zip(tuples, refs)
    ]


def _stack_arrays(tuples, dtype: np.dtype):
    placement, arrays = zip(*tuples)

    first = arrays[0]
    shape = (len(arrays),) + first.shape

    stacked = np.empty(shape, dtype=dtype)
    for i, arr in enumerate(arrays):
        stacked[i] = arr

    return stacked, placement
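
# Stacking sketch (hypothetical input, not executed; assumes
# `import numpy as np`): two aligned 1D arrays become one 2D array with one
# row per original column, returned alongside their placements.
#
#   vals, placement = _stack_arrays(
#       [(0, np.array([1.0, 2.0])), (1, np.array([3.0, 4.0]))], np.dtype("f8")
#   )
#   vals         # array([[1., 2.], [3., 4.]])
#   placement    # (0, 1)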


def _consolidate(blocks: tuple[Block, ...]) -> tuple[Block, ...]:
    """
    Merge blocks having the same dtype, excluding blocks that cannot consolidate.
    """
    # sort by _can_consolidate, dtype
    gkey = lambda x: x._consolidate_key
    grouper = itertools.groupby(sorted(blocks, key=gkey), gkey)

    new_blocks: list[Block] = []
    for (_can_consolidate, dtype), group_blocks in grouper:
        merged_blocks, _ = _merge_blocks(
            list(group_blocks), dtype=dtype, can_consolidate=_can_consolidate
        )
        new_blocks = extend_blocks(merged_blocks, new_blocks)
    return tuple(new_blocks)


def _merge_blocks(
    blocks: list[Block], dtype: DtypeObj, can_consolidate: bool
) -> tuple[list[Block], bool]:
    if len(blocks) == 1:
        return blocks, False

    if can_consolidate:
        # TODO: optimization potential in case all mgrs contain slices and
        # combination of those slices is a slice, too.
        new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks])

        new_values: ArrayLike

        if isinstance(blocks[0].dtype, np.dtype):
            # error: List comprehension has incompatible type List[Union[ndarray,
            # ExtensionArray]]; expected List[Union[complex, generic,
            # Sequence[Union[int, float, complex, str, bytes, generic]],
            # Sequence[Sequence[Any]], SupportsArray]]
            new_values = np.vstack([b.values for b in blocks])  # type: ignore[misc]
        else:
            bvals = [blk.values for blk in blocks]
            bvals2 = cast(Sequence[NDArrayBackedExtensionArray], bvals)
            new_values = bvals2[0]._concat_same_type(bvals2, axis=0)

        argsort = np.argsort(new_mgr_locs)
        new_values = new_values[argsort]
        new_mgr_locs = new_mgr_locs[argsort]

        bp = BlockPlacement(new_mgr_locs)
        return [new_block_2d(new_values, placement=bp)], True

    # can't consolidate --> no merge
    return blocks, False
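
# Merge sketch (hypothetical blocks, not executed; assumes
# `import numpy as np`): values are vstacked and then reordered so that the
# merged block's mgr_locs end up sorted.
#
#   b1 = new_block_2d(np.array([[1.0, 2.0]]), placement=BlockPlacement([2]))
#   b2 = new_block_2d(np.array([[3.0, 4.0]]), placement=BlockPlacement([0]))
#   (merged,), ok = _merge_blocks([b1, b2], np.dtype("f8"), can_consolidate=True)
#   merged.mgr_locs.as_array   # array([0, 2])
#   merged.values              # array([[3., 4.], [1., 2.]]) after the argsort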


def _fast_count_smallints(arr: npt.NDArray[np.intp]):
    """Faster version of set(arr) for sequences of small numbers; yields
    (value, count) pairs for the distinct values."""
    counts = np.bincount(arr)
    nz = counts.nonzero()[0]
    # Note: list(zip(...)) outperforms list(np.c_[nz, counts[nz]]) here,
    # in one benchmark by a factor of 11
    return zip(nz, counts[nz])
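
# Counting sketch (hypothetical input, not executed; assumes
# `import numpy as np`): zero-count values are skipped.
#
#   list(_fast_count_smallints(np.array([0, 2, 2, 5], dtype=np.intp)))
#   # [(0, 1), (2, 2), (5, 1)]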


def _preprocess_slice_or_indexer(
    slice_or_indexer: slice | np.ndarray, length: int, allow_fill: bool
):
    if isinstance(slice_or_indexer, slice):
        return (
            "slice",
            slice_or_indexer,
            libinternals.slice_len(slice_or_indexer, length),
        )
    else:
        if (
            not isinstance(slice_or_indexer, np.ndarray)
            or slice_or_indexer.dtype.kind != "i"
        ):
            dtype = getattr(slice_or_indexer, "dtype", None)
            raise TypeError(type(slice_or_indexer), dtype)

        indexer = ensure_platform_int(slice_or_indexer)
        if not allow_fill:
            indexer = maybe_convert_indices(indexer, length)
        return "fancy", indexer, len(indexer)
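
# Sketch of the two return shapes (hypothetical inputs, not executed; assumes
# `import numpy as np`):
#
#   _preprocess_slice_or_indexer(slice(0, 3), length=5, allow_fill=False)
#   # ("slice", slice(0, 3), 3)
#   _preprocess_slice_or_indexer(np.array([4, 0]), length=5, allow_fill=False)
#   # ("fancy", array([4, 0]), 2)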