1"""
2Experimental manager based on storing a collection of 1D arrays
3"""
4from __future__ import annotations
5
6import itertools
7from typing import (
8 TYPE_CHECKING,
9 Callable,
10 Literal,
11)
12
13import numpy as np
14
15from pandas._libs import (
16 NaT,
17 lib,
18)
19
20from pandas.core.dtypes.astype import (
21 astype_array,
22 astype_array_safe,
23)
24from pandas.core.dtypes.cast import (
25 ensure_dtype_can_hold_na,
26 find_common_type,
27 infer_dtype_from_scalar,
28 np_find_common_type,
29)
30from pandas.core.dtypes.common import (
31 ensure_platform_int,
32 is_datetime64_ns_dtype,
33 is_integer,
34 is_numeric_dtype,
35 is_object_dtype,
36 is_timedelta64_ns_dtype,
37)
38from pandas.core.dtypes.dtypes import ExtensionDtype
39from pandas.core.dtypes.generic import (
40 ABCDataFrame,
41 ABCSeries,
42)
43from pandas.core.dtypes.missing import (
44 array_equals,
45 isna,
46 na_value_for_dtype,
47)
48
49import pandas.core.algorithms as algos
50from pandas.core.array_algos.quantile import quantile_compat
51from pandas.core.array_algos.take import take_1d
52from pandas.core.arrays import (
53 DatetimeArray,
54 ExtensionArray,
55 NumpyExtensionArray,
56 TimedeltaArray,
57)
58from pandas.core.construction import (
59 ensure_wrapped_if_datetimelike,
60 extract_array,
61 sanitize_array,
62)
63from pandas.core.indexers import (
64 maybe_convert_indices,
65 validate_indices,
66)
67from pandas.core.indexes.api import (
68 Index,
69 ensure_index,
70)
71from pandas.core.indexes.base import get_values_for_csv
72from pandas.core.internals.base import (
73 DataManager,
74 SingleDataManager,
75 ensure_np_dtype,
76 interleaved_dtype,
77)
78from pandas.core.internals.blocks import (
79 BlockPlacement,
80 ensure_block_shape,
81 external_values,
82 extract_pandas_array,
83 maybe_coerce_values,
84 new_block,
85)
86from pandas.core.internals.managers import make_na_array
87
88if TYPE_CHECKING:
89 from collections.abc import Hashable
90
91 from pandas._typing import (
92 ArrayLike,
93 AxisInt,
94 DtypeObj,
95 QuantileInterpolation,
96 Self,
97 npt,
98 )
99
100
101class BaseArrayManager(DataManager):
102 """
103 Core internal data structure to implement DataFrame and Series.
104
105 Alternative to the BlockManager, storing a list of 1D arrays instead of
106 Blocks.
107
108 This is *not* a public API class
109
110 Parameters
111 ----------
112 arrays : Sequence of arrays
113 axes : Sequence of Index
114 verify_integrity : bool, default True
115
116 """
117
118 __slots__ = [
119 "_axes", # private attribute, because 'axes' has different order, see below
120 "arrays",
121 ]
122
123 arrays: list[np.ndarray | ExtensionArray]
124 _axes: list[Index]
125
    def __init__(
        self,
        arrays: list[np.ndarray | ExtensionArray],
        axes: list[Index],
        verify_integrity: bool = True,
    ) -> None:
        # Abstract: construction is implemented by the concrete subclasses
        # (ArrayManager, SingleArrayManager); the base class is never
        # instantiated directly.
        raise NotImplementedError
133
134 def make_empty(self, axes=None) -> Self:
135 """Return an empty ArrayManager with the items axis of len 0 (no columns)"""
136 if axes is None:
137 axes = [self.axes[1:], Index([])]
138
139 arrays: list[np.ndarray | ExtensionArray] = []
140 return type(self)(arrays, axes)
141
142 @property
143 def items(self) -> Index:
144 return self._axes[-1]
145
146 @property
147 # error: Signature of "axes" incompatible with supertype "DataManager"
148 def axes(self) -> list[Index]: # type: ignore[override]
149 # mypy doesn't work to override attribute with property
150 # see https://github.com/python/mypy/issues/4125
151 """Axes is BlockManager-compatible order (columns, rows)"""
152 return [self._axes[1], self._axes[0]]
153
154 @property
155 def shape_proper(self) -> tuple[int, ...]:
156 # this returns (n_rows, n_columns)
157 return tuple(len(ax) for ax in self._axes)
158
159 @staticmethod
160 def _normalize_axis(axis: AxisInt) -> int:
161 # switch axis
162 axis = 1 if axis == 0 else 0
163 return axis
164
165 def set_axis(self, axis: AxisInt, new_labels: Index) -> None:
166 # Caller is responsible for ensuring we have an Index object.
167 self._validate_set_axis(axis, new_labels)
168 axis = self._normalize_axis(axis)
169 self._axes[axis] = new_labels
170
171 def get_dtypes(self) -> npt.NDArray[np.object_]:
172 return np.array([arr.dtype for arr in self.arrays], dtype="object")
173
174 def add_references(self, mgr: BaseArrayManager) -> None:
175 """
176 Only implemented on the BlockManager level
177 """
178 return
179
180 def __getstate__(self):
181 return self.arrays, self._axes
182
183 def __setstate__(self, state) -> None:
184 self.arrays = state[0]
185 self._axes = state[1]
186
187 def __repr__(self) -> str:
188 output = type(self).__name__
189 output += f"\nIndex: {self._axes[0]}"
190 if self.ndim == 2:
191 output += f"\nColumns: {self._axes[1]}"
192 output += f"\n{len(self.arrays)} arrays:"
193 for arr in self.arrays:
194 output += f"\n{arr.dtype}"
195 return output
196
    def apply(
        self,
        f,
        align_keys: list[str] | None = None,
        **kwargs,
    ) -> Self:
        """
        Iterate over the arrays, collect and create a new ArrayManager.

        Parameters
        ----------
        f : str or callable
            Name of the Array method to apply.
        align_keys: List[str] or None, default None
            Names of kwargs whose values (Series/DataFrame/array-like) must be
            re-sliced per column before calling ``f``.
        **kwargs
            Keywords to pass to `f`

        Returns
        -------
        ArrayManager
        """
        assert "filter" not in kwargs

        align_keys = align_keys or []
        result_arrays: list[ArrayLike] = []
        # fillna: Series/DataFrame is responsible for making sure value is aligned

        aligned_args = {k: kwargs[k] for k in align_keys}

        if f == "apply":
            # "apply" is dispatched with the actual callable under "func"
            f = kwargs.pop("func")

        for i, arr in enumerate(self.arrays):
            if aligned_args:
                # replace each aligned kwarg with its slice for column i
                for k, obj in aligned_args.items():
                    if isinstance(obj, (ABCSeries, ABCDataFrame)):
                        # The caller is responsible for ensuring that
                        # obj.axes[-1].equals(self.items)
                        if obj.ndim == 1:
                            kwargs[k] = obj.iloc[i]
                        else:
                            kwargs[k] = obj.iloc[:, i]._values
                    else:
                        # otherwise we have an array-like
                        kwargs[k] = obj[i]

            if callable(f):
                applied = f(arr, **kwargs)
            else:
                # f is the name of a method on the array
                applied = getattr(arr, f)(**kwargs)

            result_arrays.append(applied)

        new_axes = self._axes
        return type(self)(result_arrays, new_axes)
252
    def apply_with_block(self, f, align_keys=None, **kwargs) -> Self:
        """
        Apply the Block method named ``f`` to each array.

        Each 1D array is temporarily wrapped in a Block so the existing Block
        implementations can be reused, then the result is unwrapped back to a
        1D array.
        """
        # switch axis to follow BlockManager logic
        swap_axis = True
        if f == "interpolate":
            swap_axis = False
        if swap_axis and "axis" in kwargs and self.ndim == 2:
            kwargs["axis"] = 1 if kwargs["axis"] == 0 else 0

        align_keys = align_keys or []
        aligned_args = {k: kwargs[k] for k in align_keys}

        result_arrays = []

        for i, arr in enumerate(self.arrays):
            if aligned_args:
                # replace each aligned kwarg with its slice for column i
                for k, obj in aligned_args.items():
                    if isinstance(obj, (ABCSeries, ABCDataFrame)):
                        # The caller is responsible for ensuring that
                        # obj.axes[-1].equals(self.items)
                        if obj.ndim == 1:
                            if self.ndim == 2:
                                kwargs[k] = obj.iloc[slice(i, i + 1)]._values
                            else:
                                kwargs[k] = obj.iloc[:]._values
                        else:
                            kwargs[k] = obj.iloc[:, [i]]._values
                    else:
                        # otherwise we have an ndarray
                        if obj.ndim == 2:
                            kwargs[k] = obj[[i]]

            if isinstance(arr.dtype, np.dtype) and not isinstance(arr, np.ndarray):
                # i.e. TimedeltaArray, DatetimeArray with tz=None. Need to
                # convert for the Block constructors.
                arr = np.asarray(arr)

            arr = maybe_coerce_values(arr)
            if self.ndim == 2:
                # wrap the single column as a 2D block of shape (1, n_rows)
                arr = ensure_block_shape(arr, 2)
                bp = BlockPlacement(slice(0, 1, 1))
                block = new_block(arr, placement=bp, ndim=2)
            else:
                bp = BlockPlacement(slice(0, len(self), 1))
                block = new_block(arr, placement=bp, ndim=1)

            applied = getattr(block, f)(**kwargs)
            if isinstance(applied, list):
                # some Block methods return a list of Blocks; only the first
                # is kept here (presumably always length-1 for a
                # single-column block — TODO confirm)
                applied = applied[0]
            arr = applied.values
            if self.ndim == 2 and arr.ndim == 2:
                # 2D for np.ndarray or DatetimeArray/TimedeltaArray
                assert len(arr) == 1
                # error: No overload variant of "__getitem__" of "ExtensionArray"
                # matches argument type "Tuple[int, slice]"
                arr = arr[0, :]  # type: ignore[call-overload]
            result_arrays.append(arr)

        return type(self)(result_arrays, self._axes)
311
312 def setitem(self, indexer, value, warn: bool = True) -> Self:
313 return self.apply_with_block("setitem", indexer=indexer, value=value)
314
315 def diff(self, n: int) -> Self:
316 assert self.ndim == 2 # caller ensures
317 return self.apply(algos.diff, n=n)
318
319 def astype(self, dtype, copy: bool | None = False, errors: str = "raise") -> Self:
320 if copy is None:
321 copy = True
322
323 return self.apply(astype_array_safe, dtype=dtype, copy=copy, errors=errors)
324
325 def convert(self, copy: bool | None) -> Self:
326 if copy is None:
327 copy = True
328
329 def _convert(arr):
330 if is_object_dtype(arr.dtype):
331 # extract NumpyExtensionArray for tests that patch
332 # NumpyExtensionArray._typ
333 arr = np.asarray(arr)
334 result = lib.maybe_convert_objects(
335 arr,
336 convert_non_numeric=True,
337 )
338 if result is arr and copy:
339 return arr.copy()
340 return result
341 else:
342 return arr.copy() if copy else arr
343
344 return self.apply(_convert)
345
346 def get_values_for_csv(
347 self, *, float_format, date_format, decimal, na_rep: str = "nan", quoting=None
348 ) -> Self:
349 return self.apply(
350 get_values_for_csv,
351 na_rep=na_rep,
352 quoting=quoting,
353 float_format=float_format,
354 date_format=date_format,
355 decimal=decimal,
356 )
357
358 @property
359 def any_extension_types(self) -> bool:
360 """Whether any of the blocks in this manager are extension blocks"""
361 return False # any(block.is_extension for block in self.blocks)
362
363 @property
364 def is_view(self) -> bool:
365 """return a boolean if we are a single block and are a view"""
366 # TODO what is this used for?
367 return False
368
369 @property
370 def is_single_block(self) -> bool:
371 return len(self.arrays) == 1
372
373 def _get_data_subset(self, predicate: Callable) -> Self:
374 indices = [i for i, arr in enumerate(self.arrays) if predicate(arr)]
375 arrays = [self.arrays[i] for i in indices]
376 # TODO copy?
377 # Note: using Index.take ensures we can retain e.g. DatetimeIndex.freq,
378 # see test_describe_datetime_columns
379 taker = np.array(indices, dtype="intp")
380 new_cols = self._axes[1].take(taker)
381 new_axes = [self._axes[0], new_cols]
382 return type(self)(arrays, new_axes, verify_integrity=False)
383
384 def get_bool_data(self, copy: bool = False) -> Self:
385 """
386 Select columns that are bool-dtype and object-dtype columns that are all-bool.
387
388 Parameters
389 ----------
390 copy : bool, default False
391 Whether to copy the blocks
392 """
393 return self._get_data_subset(lambda x: x.dtype == np.dtype(bool))
394
395 def get_numeric_data(self, copy: bool = False) -> Self:
396 """
397 Select columns that have a numeric dtype.
398
399 Parameters
400 ----------
401 copy : bool, default False
402 Whether to copy the blocks
403 """
404 return self._get_data_subset(
405 lambda arr: is_numeric_dtype(arr.dtype)
406 or getattr(arr.dtype, "_is_numeric", False)
407 )
408
409 def copy(self, deep: bool | Literal["all"] | None = True) -> Self:
410 """
411 Make deep or shallow copy of ArrayManager
412
413 Parameters
414 ----------
415 deep : bool or string, default True
416 If False, return shallow copy (do not copy data)
417 If 'all', copy data and a deep copy of the index
418
419 Returns
420 -------
421 BlockManager
422 """
423 if deep is None:
424 # ArrayManager does not yet support CoW, so deep=None always means
425 # deep=True for now
426 deep = True
427
428 # this preserves the notion of view copying of axes
429 if deep:
430 # hit in e.g. tests.io.json.test_pandas
431
432 def copy_func(ax):
433 return ax.copy(deep=True) if deep == "all" else ax.view()
434
435 new_axes = [copy_func(ax) for ax in self._axes]
436 else:
437 new_axes = list(self._axes)
438
439 if deep:
440 new_arrays = [arr.copy() for arr in self.arrays]
441 else:
442 new_arrays = list(self.arrays)
443 return type(self)(new_arrays, new_axes, verify_integrity=False)
444
445 def reindex_indexer(
446 self,
447 new_axis,
448 indexer,
449 axis: AxisInt,
450 fill_value=None,
451 allow_dups: bool = False,
452 copy: bool | None = True,
453 # ignored keywords
454 only_slice: bool = False,
455 # ArrayManager specific keywords
456 use_na_proxy: bool = False,
457 ) -> Self:
458 axis = self._normalize_axis(axis)
459 return self._reindex_indexer(
460 new_axis,
461 indexer,
462 axis,
463 fill_value,
464 allow_dups,
465 copy,
466 use_na_proxy,
467 )
468
    def _reindex_indexer(
        self,
        new_axis,
        indexer: npt.NDArray[np.intp] | None,
        axis: AxisInt,
        fill_value=None,
        allow_dups: bool = False,
        copy: bool | None = True,
        use_na_proxy: bool = False,
    ) -> Self:
        """
        Reindex one axis using a precomputed positional indexer.

        Parameters
        ----------
        new_axis : Index
        indexer : ndarray[intp] or None
        axis : int
            Already normalized to internal numbering (0=rows, 1=columns).
        fill_value : object, default None
        allow_dups : bool, default False
        copy : bool, default True
        use_na_proxy : bool, default False
            If True, use a NullArrayProxy placeholder for missing columns
            instead of materializing an all-NA array.

        pandas-indexer with -1's only.
        """
        if copy is None:
            # ArrayManager does not yet support CoW, so deep=None always means
            # deep=True for now
            copy = True

        if indexer is None:
            # no reordering needed; at most replace the axis labels
            if new_axis is self._axes[axis] and not copy:
                return self

            result = self.copy(deep=copy)
            result._axes = list(self._axes)
            result._axes[axis] = new_axis
            return result

        # some axes don't allow reindexing with dups
        if not allow_dups:
            self._axes[axis]._validate_can_reindex(indexer)

        if axis >= self.ndim:
            raise IndexError("Requested axis not found in manager")

        if axis == 1:
            # reindexing columns: pick arrays by position; -1 -> all-NA column
            new_arrays = []
            for i in indexer:
                if i == -1:
                    arr = self._make_na_array(
                        fill_value=fill_value, use_na_proxy=use_na_proxy
                    )
                else:
                    arr = self.arrays[i]
                    if copy:
                        arr = arr.copy()
                new_arrays.append(arr)

        else:
            # reindexing rows: take from every array; -1 positions get filled
            validate_indices(indexer, len(self._axes[0]))
            indexer = ensure_platform_int(indexer)
            mask = indexer == -1
            needs_masking = mask.any()
            new_arrays = [
                take_1d(
                    arr,
                    indexer,
                    allow_fill=needs_masking,
                    fill_value=fill_value,
                    mask=mask,
                    # if fill_value is not None else blk.fill_value
                )
                for arr in self.arrays
            ]

        new_axes = list(self._axes)
        new_axes[axis] = new_axis

        return type(self)(new_arrays, new_axes, verify_integrity=False)
547
548 def take(
549 self,
550 indexer: npt.NDArray[np.intp],
551 axis: AxisInt = 1,
552 verify: bool = True,
553 ) -> Self:
554 """
555 Take items along any axis.
556 """
557 assert isinstance(indexer, np.ndarray), type(indexer)
558 assert indexer.dtype == np.intp, indexer.dtype
559
560 axis = self._normalize_axis(axis)
561
562 if not indexer.ndim == 1:
563 raise ValueError("indexer should be 1-dimensional")
564
565 n = self.shape_proper[axis]
566 indexer = maybe_convert_indices(indexer, n, verify=verify)
567
568 new_labels = self._axes[axis].take(indexer)
569 return self._reindex_indexer(
570 new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True
571 )
572
573 def _make_na_array(self, fill_value=None, use_na_proxy: bool = False):
574 if use_na_proxy:
575 assert fill_value is None
576 return NullArrayProxy(self.shape_proper[0])
577
578 if fill_value is None:
579 fill_value = np.nan
580
581 dtype, fill_value = infer_dtype_from_scalar(fill_value)
582 array_values = make_na_array(dtype, self.shape_proper[:1], fill_value)
583 return array_values
584
585 def _equal_values(self, other) -> bool:
586 """
587 Used in .equals defined in base class. Only check the column values
588 assuming shape and indexes have already been checked.
589 """
590 for left, right in zip(self.arrays, other.arrays):
591 if not array_equals(left, right):
592 return False
593 return True
594
595 # TODO
596 # to_dict
597
598
599class ArrayManager(BaseArrayManager):
    @property
    def ndim(self) -> Literal[2]:
        # ArrayManager always backs a 2D DataFrame.
        return 2

    def __init__(
        self,
        arrays: list[np.ndarray | ExtensionArray],
        axes: list[Index],
        verify_integrity: bool = True,
    ) -> None:
        # Note: we are storing the axes in "_axes" in the (row, columns) order
        # which contrasts the order how it is stored in BlockManager
        self._axes = axes
        self.arrays = arrays

        if verify_integrity:
            # coerce axes to Index and arrays to properly-typed 1D values
            # before validating lengths/types
            self._axes = [ensure_index(ax) for ax in axes]
            arrays = [extract_pandas_array(x, None, 1)[0] for x in arrays]
            self.arrays = [maybe_coerce_values(arr) for arr in arrays]
            self._verify_integrity()
620
621 def _verify_integrity(self) -> None:
622 n_rows, n_columns = self.shape_proper
623 if not len(self.arrays) == n_columns:
624 raise ValueError(
625 "Number of passed arrays must equal the size of the column Index: "
626 f"{len(self.arrays)} arrays vs {n_columns} columns."
627 )
628 for arr in self.arrays:
629 if not len(arr) == n_rows:
630 raise ValueError(
631 "Passed arrays should have the same length as the rows Index: "
632 f"{len(arr)} vs {n_rows} rows"
633 )
634 if not isinstance(arr, (np.ndarray, ExtensionArray)):
635 raise ValueError(
636 "Passed arrays should be np.ndarray or ExtensionArray instances, "
637 f"got {type(arr)} instead"
638 )
639 if not arr.ndim == 1:
640 raise ValueError(
641 "Passed arrays should be 1-dimensional, got array with "
642 f"{arr.ndim} dimensions instead."
643 )
644
645 # --------------------------------------------------------------------
646 # Indexing
647
    def fast_xs(self, loc: int) -> SingleArrayManager:
        """
        Return the array corresponding to `frame.iloc[loc]`.

        Parameters
        ----------
        loc : int

        Returns
        -------
        np.ndarray or ExtensionArray
        """
        # find a single dtype that can hold all the column values for this row
        dtype = interleaved_dtype([arr.dtype for arr in self.arrays])

        values = [arr[loc] for arr in self.arrays]
        if isinstance(dtype, ExtensionDtype):
            result = dtype.construct_array_type()._from_sequence(values, dtype=dtype)
        # for datetime64/timedelta64, the np.ndarray constructor cannot handle pd.NaT
        elif is_datetime64_ns_dtype(dtype):
            result = DatetimeArray._from_sequence(values, dtype=dtype)._ndarray
        elif is_timedelta64_ns_dtype(dtype):
            result = TimedeltaArray._from_sequence(values, dtype=dtype)._ndarray
        else:
            result = np.array(values, dtype=dtype)
        # the row becomes a 1D manager indexed by the column labels
        return SingleArrayManager([result], [self._axes[1]])
673
674 def get_slice(self, slobj: slice, axis: AxisInt = 0) -> ArrayManager:
675 axis = self._normalize_axis(axis)
676
677 if axis == 0:
678 arrays = [arr[slobj] for arr in self.arrays]
679 elif axis == 1:
680 arrays = self.arrays[slobj]
681
682 new_axes = list(self._axes)
683 new_axes[axis] = new_axes[axis]._getitem_slice(slobj)
684
685 return type(self)(arrays, new_axes, verify_integrity=False)
686
687 def iget(self, i: int) -> SingleArrayManager:
688 """
689 Return the data as a SingleArrayManager.
690 """
691 values = self.arrays[i]
692 return SingleArrayManager([values], [self._axes[0]])
693
694 def iget_values(self, i: int) -> ArrayLike:
695 """
696 Return the data for column i as the values (ndarray or ExtensionArray).
697 """
698 return self.arrays[i]
699
700 @property
701 def column_arrays(self) -> list[ArrayLike]:
702 """
703 Used in the JSON C code to access column arrays.
704 """
705
706 return [np.asarray(arr) for arr in self.arrays]
707
    def iset(
        self,
        loc: int | slice | np.ndarray,
        value: ArrayLike,
        inplace: bool = False,
        refs=None,
    ) -> None:
        """
        Set new column(s).

        This changes the ArrayManager in-place, but replaces (an) existing
        column(s), not changing column values in-place).

        Parameters
        ----------
        loc : integer, slice or boolean mask
            Positional location (already bounds checked)
        value : np.ndarray or ExtensionArray
        inplace : bool, default False
            Whether overwrite existing array as opposed to replacing it.
        """
        # NOTE(review): ``inplace`` and ``refs`` are accepted for API
        # compatibility but are not used in this implementation.
        # single column -> single integer index
        if lib.is_integer(loc):
            # TODO can we avoid needing to unpack this here? That means converting
            # DataFrame into 1D array when loc is an integer
            if isinstance(value, np.ndarray) and value.ndim == 2:
                assert value.shape[1] == 1
                value = value[:, 0]

            # TODO we receive a datetime/timedelta64 ndarray from DataFrame._iset_item
            # but we should avoid that and pass directly the proper array
            value = maybe_coerce_values(value)

            assert isinstance(value, (np.ndarray, ExtensionArray))
            assert value.ndim == 1
            assert len(value) == len(self._axes[0])
            self.arrays[loc] = value
            return

        # multiple columns -> convert slice or array to integer indices
        elif isinstance(loc, slice):
            # expand the slice into the explicit range of column positions
            indices: range | np.ndarray = range(
                loc.start if loc.start is not None else 0,
                loc.stop if loc.stop is not None else self.shape_proper[1],
                loc.step if loc.step is not None else 1,
            )
        else:
            assert isinstance(loc, np.ndarray)
            assert loc.dtype == "bool"
            indices = np.nonzero(loc)[0]

        assert value.ndim == 2
        assert value.shape[0] == len(self._axes[0])

        # assign column value_idx of the 2D ``value`` to position mgr_idx
        for value_idx, mgr_idx in enumerate(indices):
            # error: No overload variant of "__getitem__" of "ExtensionArray" matches
            # argument type "Tuple[slice, int]"
            value_arr = value[:, value_idx]  # type: ignore[call-overload]
            self.arrays[mgr_idx] = value_arr
        return
768
769 def column_setitem(
770 self, loc: int, idx: int | slice | np.ndarray, value, inplace_only: bool = False
771 ) -> None:
772 """
773 Set values ("setitem") into a single column (not setting the full column).
774
775 This is a method on the ArrayManager level, to avoid creating an
776 intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`)
777 """
778 if not is_integer(loc):
779 raise TypeError("The column index should be an integer")
780 arr = self.arrays[loc]
781 mgr = SingleArrayManager([arr], [self._axes[0]])
782 if inplace_only:
783 mgr.setitem_inplace(idx, value)
784 else:
785 new_mgr = mgr.setitem((idx,), value)
786 # update existing ArrayManager in-place
787 self.arrays[loc] = new_mgr.arrays[0]
788
789 def insert(self, loc: int, item: Hashable, value: ArrayLike, refs=None) -> None:
790 """
791 Insert item at selected position.
792
793 Parameters
794 ----------
795 loc : int
796 item : hashable
797 value : np.ndarray or ExtensionArray
798 """
799 # insert to the axis; this could possibly raise a TypeError
800 new_axis = self.items.insert(loc, item)
801
802 value = extract_array(value, extract_numpy=True)
803 if value.ndim == 2:
804 if value.shape[0] == 1:
805 # error: No overload variant of "__getitem__" of "ExtensionArray"
806 # matches argument type "Tuple[int, slice]"
807 value = value[0, :] # type: ignore[call-overload]
808 else:
809 raise ValueError(
810 f"Expected a 1D array, got an array with shape {value.shape}"
811 )
812 value = maybe_coerce_values(value)
813
814 # TODO self.arrays can be empty
815 # assert len(value) == len(self.arrays[0])
816
817 # TODO is this copy needed?
818 arrays = self.arrays.copy()
819 arrays.insert(loc, value)
820
821 self.arrays = arrays
822 self._axes[1] = new_axis
823
824 def idelete(self, indexer) -> ArrayManager:
825 """
826 Delete selected locations in-place (new block and array, same BlockManager)
827 """
828 to_keep = np.ones(self.shape[0], dtype=np.bool_)
829 to_keep[indexer] = False
830
831 self.arrays = [self.arrays[i] for i in np.nonzero(to_keep)[0]]
832 self._axes = [self._axes[0], self._axes[1][to_keep]]
833 return self
834
835 # --------------------------------------------------------------------
836 # Array-wise Operation
837
838 def grouped_reduce(self, func: Callable) -> Self:
839 """
840 Apply grouped reduction function columnwise, returning a new ArrayManager.
841
842 Parameters
843 ----------
844 func : grouped reduction function
845
846 Returns
847 -------
848 ArrayManager
849 """
850 result_arrays: list[np.ndarray] = []
851 result_indices: list[int] = []
852
853 for i, arr in enumerate(self.arrays):
854 # grouped_reduce functions all expect 2D arrays
855 arr = ensure_block_shape(arr, ndim=2)
856 res = func(arr)
857 if res.ndim == 2:
858 # reverse of ensure_block_shape
859 assert res.shape[0] == 1
860 res = res[0]
861
862 result_arrays.append(res)
863 result_indices.append(i)
864
865 if len(result_arrays) == 0:
866 nrows = 0
867 else:
868 nrows = result_arrays[0].shape[0]
869 index = Index(range(nrows))
870
871 columns = self.items
872
873 # error: Argument 1 to "ArrayManager" has incompatible type "List[ndarray]";
874 # expected "List[Union[ndarray, ExtensionArray]]"
875 return type(self)(result_arrays, [index, columns]) # type: ignore[arg-type]
876
    def reduce(self, func: Callable) -> Self:
        """
        Apply reduction function column-wise, returning a single-row ArrayManager.

        Parameters
        ----------
        func : reduction function

        Returns
        -------
        ArrayManager
        """
        result_arrays: list[np.ndarray] = []
        for i, arr in enumerate(self.arrays):
            res = func(arr, axis=0)

            # TODO NaT doesn't preserve dtype, so we need to ensure to create
            # a timedelta result array if original was timedelta
            # what if datetime results in timedelta? (eg std)
            dtype = arr.dtype if res is NaT else None
            # wrap the scalar reduction result in a length-1 array
            result_arrays.append(
                sanitize_array([res], None, dtype=dtype)  # type: ignore[arg-type]
            )

        index = Index._simple_new(np.array([None], dtype=object))  # placeholder
        columns = self.items

        # error: Argument 1 to "ArrayManager" has incompatible type "List[ndarray]";
        # expected "List[Union[ndarray, ExtensionArray]]"
        new_mgr = type(self)(result_arrays, [index, columns])  # type: ignore[arg-type]
        return new_mgr
908
909 def operate_blockwise(self, other: ArrayManager, array_op) -> ArrayManager:
910 """
911 Apply array_op blockwise with another (aligned) BlockManager.
912 """
913 # TODO what if `other` is BlockManager ?
914 left_arrays = self.arrays
915 right_arrays = other.arrays
916 result_arrays = [
917 array_op(left, right) for left, right in zip(left_arrays, right_arrays)
918 ]
919 return type(self)(result_arrays, self._axes)
920
921 def quantile(
922 self,
923 *,
924 qs: Index, # with dtype float64
925 transposed: bool = False,
926 interpolation: QuantileInterpolation = "linear",
927 ) -> ArrayManager:
928 arrs = [ensure_block_shape(x, 2) for x in self.arrays]
929 new_arrs = [
930 quantile_compat(x, np.asarray(qs._values), interpolation) for x in arrs
931 ]
932 for i, arr in enumerate(new_arrs):
933 if arr.ndim == 2:
934 assert arr.shape[0] == 1, arr.shape
935 new_arrs[i] = arr[0]
936
937 axes = [qs, self._axes[1]]
938 return type(self)(new_arrs, axes)
939
940 # ----------------------------------------------------------------
941
    def unstack(self, unstacker, fill_value) -> ArrayManager:
        """
        Return a BlockManager with all blocks unstacked.

        Parameters
        ----------
        unstacker : reshape._Unstacker
        fill_value : Any
            fill_value for newly introduced missing values.

        Returns
        -------
        unstacked : BlockManager
        """
        indexer, _ = unstacker._indexer_and_to_sort
        if unstacker.mask.all():
            # no missing entries will be introduced -> plain take, no filling
            new_indexer = indexer
            allow_fill = False
            new_mask2D = None
            needs_masking = None
        else:
            # -1 marks positions that need to be filled with fill_value
            new_indexer = np.full(unstacker.mask.shape, -1)
            new_indexer[unstacker.mask] = indexer
            allow_fill = True
            # calculating the full mask once and passing it to take_1d is faster
            # than letting take_1d calculate it in each repeated call
            new_mask2D = (~unstacker.mask).reshape(*unstacker.full_shape)
            needs_masking = new_mask2D.any(axis=0)
        new_indexer2D = new_indexer.reshape(*unstacker.full_shape)
        new_indexer2D = ensure_platform_int(new_indexer2D)

        new_arrays = []
        for arr in self.arrays:
            # each source column fans out into full_shape[1] result columns
            for i in range(unstacker.full_shape[1]):
                if allow_fill:
                    # error: Value of type "Optional[Any]" is not indexable [index]
                    new_arr = take_1d(
                        arr,
                        new_indexer2D[:, i],
                        allow_fill=needs_masking[i],  # type: ignore[index]
                        fill_value=fill_value,
                        mask=new_mask2D[:, i],  # type: ignore[index]
                    )
                else:
                    new_arr = take_1d(arr, new_indexer2D[:, i], allow_fill=False)
                new_arrays.append(new_arr)

        new_index = unstacker.new_index
        new_columns = unstacker.get_new_columns(self._axes[1])
        new_axes = [new_index, new_columns]

        return type(self)(new_arrays, new_axes, verify_integrity=False)
994
    def as_array(
        self,
        dtype=None,
        copy: bool = False,
        na_value: object = lib.no_default,
    ) -> np.ndarray:
        """
        Convert the blockmanager data into an numpy array.

        Parameters
        ----------
        dtype : object, default None
            Data type of the return array.
        copy : bool, default False
            If True then guarantee that a copy is returned. A value of
            False does not guarantee that the underlying data is not
            copied.
        na_value : object, default lib.no_default
            Value to be used as the missing value sentinel.

        Returns
        -------
        arr : ndarray
        """
        if len(self.arrays) == 0:
            # empty manager: return an empty float array of the proper
            # (transposed) shape
            empty_arr = np.empty(self.shape, dtype=float)
            return empty_arr.transpose()

        # We want to copy when na_value is provided to avoid
        # mutating the original object
        copy = copy or na_value is not lib.no_default

        if not dtype:
            # find a single dtype that all columns can be cast to
            dtype = interleaved_dtype([arr.dtype for arr in self.arrays])

        dtype = ensure_np_dtype(dtype)

        result = np.empty(self.shape_proper, dtype=dtype)

        # fill the result column by column; astype honors the copy request
        for i, arr in enumerate(self.arrays):
            arr = arr.astype(dtype, copy=copy)
            result[:, i] = arr

        if na_value is not lib.no_default:
            result[isna(result)] = na_value

        return result
1042
1043 @classmethod
1044 def concat_horizontal(cls, mgrs: list[Self], axes: list[Index]) -> Self:
1045 """
1046 Concatenate uniformly-indexed ArrayManagers horizontally.
1047 """
1048 # concatting along the columns -> combine reindexed arrays in a single manager
1049 arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs]))
1050 new_mgr = cls(arrays, [axes[1], axes[0]], verify_integrity=False)
1051 return new_mgr
1052
1053 @classmethod
1054 def concat_vertical(cls, mgrs: list[Self], axes: list[Index]) -> Self:
1055 """
1056 Concatenate uniformly-indexed ArrayManagers vertically.
1057 """
1058 # concatting along the rows -> concat the reindexed arrays
1059 # TODO(ArrayManager) doesn't yet preserve the correct dtype
1060 arrays = [
1061 concat_arrays([mgrs[i].arrays[j] for i in range(len(mgrs))])
1062 for j in range(len(mgrs[0].arrays))
1063 ]
1064 new_mgr = cls(arrays, [axes[1], axes[0]], verify_integrity=False)
1065 return new_mgr
1066
1067
1068class SingleArrayManager(BaseArrayManager, SingleDataManager):
1069 __slots__ = [
1070 "_axes", # private attribute, because 'axes' has different order, see below
1071 "arrays",
1072 ]
1073
1074 arrays: list[np.ndarray | ExtensionArray]
1075 _axes: list[Index]
1076
    @property
    def ndim(self) -> Literal[1]:
        # SingleArrayManager always backs a 1D Series.
        return 1

    def __init__(
        self,
        arrays: list[np.ndarray | ExtensionArray],
        axes: list[Index],
        verify_integrity: bool = True,
    ) -> None:
        self._axes = axes
        self.arrays = arrays

        if verify_integrity:
            # exactly one axis and one array back a Series
            assert len(axes) == 1
            assert len(arrays) == 1
            self._axes = [ensure_index(ax) for ax in self._axes]
            arr = arrays[0]
            arr = maybe_coerce_values(arr)
            arr = extract_pandas_array(arr, None, 1)[0]
            self.arrays = [arr]
            self._verify_integrity()
1099
1100 def _verify_integrity(self) -> None:
1101 (n_rows,) = self.shape
1102 assert len(self.arrays) == 1
1103 arr = self.arrays[0]
1104 assert len(arr) == n_rows
1105 if not arr.ndim == 1:
1106 raise ValueError(
1107 "Passed array should be 1-dimensional, got array with "
1108 f"{arr.ndim} dimensions instead."
1109 )
1110
1111 @staticmethod
1112 def _normalize_axis(axis):
1113 return axis
1114
1115 def make_empty(self, axes=None) -> Self:
1116 """Return an empty ArrayManager with index/array of length 0"""
1117 if axes is None:
1118 axes = [Index([], dtype=object)]
1119 array: np.ndarray = np.array([], dtype=self.dtype)
1120 return type(self)([array], axes)
1121
1122 @classmethod
1123 def from_array(cls, array, index) -> SingleArrayManager:
1124 return cls([array], [index])
1125
1126 # error: Cannot override writeable attribute with read-only property
1127 @property
1128 def axes(self) -> list[Index]: # type: ignore[override]
1129 return self._axes
1130
1131 @property
1132 def index(self) -> Index:
1133 return self._axes[0]
1134
1135 @property
1136 def dtype(self):
1137 return self.array.dtype
1138
1139 def external_values(self):
1140 """The array that Series.values returns"""
1141 return external_values(self.array)
1142
1143 def internal_values(self):
1144 """The array that Series._values returns"""
1145 return self.array
1146
1147 def array_values(self):
1148 """The array that Series.array returns"""
1149 arr = self.array
1150 if isinstance(arr, np.ndarray):
1151 arr = NumpyExtensionArray(arr)
1152 return arr
1153
1154 @property
1155 def _can_hold_na(self) -> bool:
1156 if isinstance(self.array, np.ndarray):
1157 return self.array.dtype.kind not in "iub"
1158 else:
1159 # ExtensionArray
1160 return self.array._can_hold_na
1161
1162 @property
1163 def is_single_block(self) -> bool:
1164 return True
1165
1166 def fast_xs(self, loc: int) -> SingleArrayManager:
1167 raise NotImplementedError("Use series._values[loc] instead")
1168
1169 def get_slice(self, slobj: slice, axis: AxisInt = 0) -> SingleArrayManager:
1170 if axis >= self.ndim:
1171 raise IndexError("Requested axis not found in manager")
1172
1173 new_array = self.array[slobj]
1174 new_index = self.index._getitem_slice(slobj)
1175 return type(self)([new_array], [new_index], verify_integrity=False)
1176
1177 def get_rows_with_mask(self, indexer: npt.NDArray[np.bool_]) -> SingleArrayManager:
1178 new_array = self.array[indexer]
1179 new_index = self.index[indexer]
1180 return type(self)([new_array], [new_index])
1181
1182 # error: Signature of "apply" incompatible with supertype "BaseArrayManager"
1183 def apply(self, func, **kwargs) -> Self: # type: ignore[override]
1184 if callable(func):
1185 new_array = func(self.array, **kwargs)
1186 else:
1187 new_array = getattr(self.array, func)(**kwargs)
1188 return type(self)([new_array], self._axes)
1189
1190 def setitem(self, indexer, value, warn: bool = True) -> SingleArrayManager:
1191 """
1192 Set values with indexer.
1193
1194 For SingleArrayManager, this backs s[indexer] = value
1195
1196 See `setitem_inplace` for a version that works inplace and doesn't
1197 return a new Manager.
1198 """
1199 if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim:
1200 raise ValueError(f"Cannot set values with ndim > {self.ndim}")
1201 return self.apply_with_block("setitem", indexer=indexer, value=value)
1202
1203 def idelete(self, indexer) -> SingleArrayManager:
1204 """
1205 Delete selected locations in-place (new array, same ArrayManager)
1206 """
1207 to_keep = np.ones(self.shape[0], dtype=np.bool_)
1208 to_keep[indexer] = False
1209
1210 self.arrays = [self.arrays[0][to_keep]]
1211 self._axes = [self._axes[0][to_keep]]
1212 return self
1213
1214 def _get_data_subset(self, predicate: Callable) -> SingleArrayManager:
1215 # used in get_numeric_data / get_bool_data
1216 if predicate(self.array):
1217 return type(self)(self.arrays, self._axes, verify_integrity=False)
1218 else:
1219 return self.make_empty()
1220
1221 def set_values(self, values: ArrayLike) -> None:
1222 """
1223 Set (replace) the values of the SingleArrayManager in place.
1224
1225 Use at your own risk! This does not check if the passed values are
1226 valid for the current SingleArrayManager (length, dtype, etc).
1227 """
1228 self.arrays[0] = values
1229
1230 def to_2d_mgr(self, columns: Index) -> ArrayManager:
1231 """
1232 Manager analogue of Series.to_frame
1233 """
1234 arrays = [self.arrays[0]]
1235 axes = [self.axes[0], columns]
1236
1237 return ArrayManager(arrays, axes, verify_integrity=False)
1238
1239
class NullArrayProxy:
    """
    Placeholder for an all-NA array of known length but unknown dtype.

    The dtype is only decided at concatenation time (after the common dtype
    has been determined, a computation in which this proxy is ignored).
    Using this object spares internals/concat.py from working out the
    proper dtype and array type up front.
    """

    ndim = 1

    def __init__(self, n: int) -> None:
        # number of rows the eventual all-NA array will have
        self.n = n

    @property
    def shape(self) -> tuple[int]:
        return (self.n,)

    def to_array(self, dtype: DtypeObj) -> ArrayLike:
        """
        Materialize the all-NA array this proxy stands for.

        Parameters
        ----------
        dtype : the dtype for the resulting array

        Returns
        -------
        np.ndarray or ExtensionArray
        """
        if isinstance(dtype, ExtensionDtype):
            # build a length-0 EA, then "take" with -1 positions so every
            # slot is filled with the array type's NA value
            array_cls = dtype.construct_array_type()
            empty = array_cls._from_sequence([], dtype=dtype)
            fill_indexer = np.full(self.n, -1, dtype=np.intp)
            return empty.take(fill_indexer, allow_fill=True)

        # when introducing missing values, int becomes float, bool becomes object
        dtype = ensure_dtype_can_hold_na(dtype)
        fill_value = na_value_for_dtype(dtype)
        result = np.empty(self.n, dtype=dtype)
        result.fill(fill_value)
        # datetime-like numpy arrays get wrapped in their pandas array types
        return ensure_wrapped_if_datetimelike(result)
1285
1286
def concat_arrays(to_concat: list) -> ArrayLike:
    """
    Alternative for concat_compat but specialized for use in the ArrayManager.

    Differences: only deals with 1D arrays (no axis keyword), assumes
    ensure_wrapped_if_datetimelike and does not skip empty arrays to determine
    the dtype.
    In addition ensures that all NullArrayProxies get replaced with actual
    arrays.

    Parameters
    ----------
    to_concat : list of arrays

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    # ignore the all-NA proxies to determine the resulting dtype
    real_arrays = [arr for arr in to_concat if not isinstance(arr, NullArrayProxy)]

    unique_dtypes = {arr.dtype for arr in real_arrays}
    if len(unique_dtypes) == 1:
        # common fast path: everything already shares one dtype
        target_dtype = real_arrays[0].dtype
    elif all(lib.is_np_dtype(dtype, "iub") for dtype in unique_dtypes):
        # GH#42092: all numpy int/uint/bool -> numpy promotion rules
        target_dtype = np_find_common_type(*unique_dtypes)
    else:
        target_dtype = find_common_type([arr.dtype for arr in real_arrays])

    # materialize proxies and cast everything to the common dtype
    converted = []
    for arr in to_concat:
        if isinstance(arr, NullArrayProxy):
            converted.append(arr.to_array(target_dtype))
        else:
            converted.append(astype_array(arr, target_dtype, copy=False))

    first = converted[0]
    if isinstance(first, ExtensionArray):
        return type(first)._concat_same_type(converted)

    result = np.concatenate(converted)

    # TODO decide on exact behaviour (we shouldn't do this only for empty result)
    # see https://github.com/pandas-dev/pandas/issues/39817
    if len(result) == 0:
        # all empties -> check for bool to not coerce to float
        kinds = {obj.dtype.kind for obj in real_arrays}
        if len(kinds) != 1 and "b" in kinds:
            result = result.astype(object)
    return result