"""
Experimental manager based on storing a collection of 1D arrays
"""
from __future__ import annotations

from typing import (
    Any,
    Callable,
    Hashable,
    Literal,
    TypeVar,
)

import numpy as np

from pandas._libs import (
    NaT,
    algos as libalgos,
    lib,
)
from pandas._typing import (
    ArrayLike,
    AxisInt,
    DtypeObj,
    QuantileInterpolation,
    npt,
)
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.astype import astype_array_safe
from pandas.core.dtypes.cast import (
    ensure_dtype_can_hold_na,
    infer_dtype_from_scalar,
)
from pandas.core.dtypes.common import (
    ensure_platform_int,
    is_datetime64_ns_dtype,
    is_dtype_equal,
    is_extension_array_dtype,
    is_integer,
    is_numeric_dtype,
    is_object_dtype,
    is_timedelta64_ns_dtype,
)
from pandas.core.dtypes.dtypes import (
    ExtensionDtype,
    PandasDtype,
)
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCSeries,
)
from pandas.core.dtypes.missing import (
    array_equals,
    isna,
    na_value_for_dtype,
)

import pandas.core.algorithms as algos
from pandas.core.array_algos.quantile import quantile_compat
from pandas.core.array_algos.take import take_1d
from pandas.core.arrays import (
    DatetimeArray,
    ExtensionArray,
    PandasArray,
    TimedeltaArray,
)
from pandas.core.arrays.sparse import SparseDtype
from pandas.core.construction import (
    ensure_wrapped_if_datetimelike,
    extract_array,
    sanitize_array,
)
from pandas.core.indexers import (
    maybe_convert_indices,
    validate_indices,
)
from pandas.core.indexes.api import (
    Index,
    ensure_index,
)
from pandas.core.internals.base import (
    DataManager,
    SingleDataManager,
    interleaved_dtype,
)
from pandas.core.internals.blocks import (
    ensure_block_shape,
    external_values,
    extract_pandas_array,
    maybe_coerce_values,
    new_block,
    to_native_types,
)

T = TypeVar("T", bound="BaseArrayManager")


class BaseArrayManager(DataManager):
    """
    Core internal data structure to implement DataFrame and Series.

    Alternative to the BlockManager, storing a list of 1D arrays instead of
    Blocks.

    This is *not* a public API class

    Parameters
    ----------
    arrays : Sequence of arrays
    axes : Sequence of Index
    verify_integrity : bool, default True

    """

    __slots__ = [
        "_axes",  # private attribute, because 'axes' has different order, see below
        "arrays",
    ]

    arrays: list[np.ndarray | ExtensionArray]
    _axes: list[Index]

    def __init__(
        self,
        arrays: list[np.ndarray | ExtensionArray],
        axes: list[Index],
        verify_integrity: bool = True,
    ) -> None:
        raise NotImplementedError

    def make_empty(self: T, axes=None) -> T:
        """Return an empty ArrayManager with the items axis of len 0 (no columns)"""
        if axes is None:
            axes = [self.axes[1:], Index([])]

        arrays: list[np.ndarray | ExtensionArray] = []
        return type(self)(arrays, axes)

    @property
    def items(self) -> Index:
        return self._axes[-1]

    @property
    # error: Signature of "axes" incompatible with supertype "DataManager"
    def axes(self) -> list[Index]:  # type: ignore[override]
        # mypy doesn't work to override attribute with property
        # see https://github.com/python/mypy/issues/4125
        """Axes is BlockManager-compatible order (columns, rows)"""
        return [self._axes[1], self._axes[0]]

    @property
    def shape_proper(self) -> tuple[int, ...]:
        # this returns (n_rows, n_columns)
        return tuple(len(ax) for ax in self._axes)

    @staticmethod
    def _normalize_axis(axis: AxisInt) -> int:
        # switch axis
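        # e.g. BlockManager-style axis 0 (the columns) maps to ArrayManager
        # axis 1, matching the [rows, columns] order in which ``_axes`` is stored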
        axis = 1 if axis == 0 else 0
        return axis

    def set_axis(self, axis: AxisInt, new_labels: Index) -> None:
        # Caller is responsible for ensuring we have an Index object.
        self._validate_set_axis(axis, new_labels)
        axis = self._normalize_axis(axis)
        self._axes[axis] = new_labels

    def get_dtypes(self) -> np.ndarray:
        return np.array([arr.dtype for arr in self.arrays], dtype="object")

    def add_references(self, mgr: BaseArrayManager) -> None:
        """
        Only implemented on the BlockManager level
        """
        return

    def __getstate__(self):
        return self.arrays, self._axes

    def __setstate__(self, state) -> None:
        self.arrays = state[0]
        self._axes = state[1]

    def __repr__(self) -> str:
        output = type(self).__name__
        output += f"\nIndex: {self._axes[0]}"
        if self.ndim == 2:
            output += f"\nColumns: {self._axes[1]}"
        output += f"\n{len(self.arrays)} arrays:"
        for arr in self.arrays:
            output += f"\n{arr.dtype}"
        return output

    def apply(
        self: T,
        f,
        align_keys: list[str] | None = None,
        **kwargs,
    ) -> T:
        """
        Iterate over the arrays, collect and create a new ArrayManager.

        Parameters
        ----------
        f : str or callable
            Name of the Array method to apply.
        align_keys : List[str] or None, default None
        **kwargs
            Keywords to pass to `f`

        Returns
        -------
        ArrayManager
        """
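        # For example, ``self.apply(astype_array_safe, dtype=dtype)`` calls the
        # function on each stored 1D array, while ``self.apply("copy")`` calls
        # ``arr.copy()`` on each array; either way a new manager wrapping the
        # collected results is returned.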
        assert "filter" not in kwargs

        align_keys = align_keys or []
        result_arrays: list[np.ndarray] = []
        # fillna: Series/DataFrame is responsible for making sure value is aligned

        aligned_args = {k: kwargs[k] for k in align_keys}

        if f == "apply":
            f = kwargs.pop("func")

        for i, arr in enumerate(self.arrays):
            if aligned_args:
                for k, obj in aligned_args.items():
                    if isinstance(obj, (ABCSeries, ABCDataFrame)):
                        # The caller is responsible for ensuring that
                        # obj.axes[-1].equals(self.items)
                        if obj.ndim == 1:
                            kwargs[k] = obj.iloc[i]
                        else:
                            kwargs[k] = obj.iloc[:, i]._values
                    else:
                        # otherwise we have an array-like
                        kwargs[k] = obj[i]

            if callable(f):
                applied = f(arr, **kwargs)
            else:
                applied = getattr(arr, f)(**kwargs)

            # if not isinstance(applied, ExtensionArray):
            #     # TODO not all EA operations return new EAs (eg astype)
            #     applied = array(applied)
            result_arrays.append(applied)

        new_axes = self._axes

        # error: Argument 1 to "ArrayManager" has incompatible type "List[ndarray]";
        # expected "List[Union[ndarray, ExtensionArray]]"
        return type(self)(result_arrays, new_axes)  # type: ignore[arg-type]

    def apply_with_block(
        self: T, f, align_keys=None, swap_axis: bool = True, **kwargs
    ) -> T:
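        # Each 1D array is wrapped in a temporary Block (reshaped to 2D for a
        # 2D manager), the Block method ``f`` is called with the aligned kwargs,
        # and the result is unwrapped back into a 1D array for the new manager.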
        # switch axis to follow BlockManager logic
        if swap_axis and "axis" in kwargs and self.ndim == 2:
            kwargs["axis"] = 1 if kwargs["axis"] == 0 else 0

        align_keys = align_keys or []
        aligned_args = {k: kwargs[k] for k in align_keys}

        result_arrays = []

        for i, arr in enumerate(self.arrays):
            if aligned_args:
                for k, obj in aligned_args.items():
                    if isinstance(obj, (ABCSeries, ABCDataFrame)):
                        # The caller is responsible for ensuring that
                        # obj.axes[-1].equals(self.items)
                        if obj.ndim == 1:
                            if self.ndim == 2:
                                kwargs[k] = obj.iloc[slice(i, i + 1)]._values
                            else:
                                kwargs[k] = obj.iloc[:]._values
                        else:
                            kwargs[k] = obj.iloc[:, [i]]._values
                    else:
                        # otherwise we have an ndarray
                        if obj.ndim == 2:
                            kwargs[k] = obj[[i]]

            if isinstance(arr.dtype, np.dtype) and not isinstance(arr, np.ndarray):
                # i.e. TimedeltaArray, DatetimeArray with tz=None. Need to
                # convert for the Block constructors.
                arr = np.asarray(arr)

            if self.ndim == 2:
                arr = ensure_block_shape(arr, 2)
                block = new_block(arr, placement=slice(0, 1, 1), ndim=2)
            else:
                block = new_block(arr, placement=slice(0, len(self), 1), ndim=1)

            applied = getattr(block, f)(**kwargs)
            if isinstance(applied, list):
                applied = applied[0]
            arr = applied.values
            if self.ndim == 2 and arr.ndim == 2:
                # 2D for np.ndarray or DatetimeArray/TimedeltaArray
                assert len(arr) == 1
                # error: No overload variant of "__getitem__" of "ExtensionArray"
                # matches argument type "Tuple[int, slice]"
                arr = arr[0, :]  # type: ignore[call-overload]
            result_arrays.append(arr)

        return type(self)(result_arrays, self._axes)

    def where(self: T, other, cond, align: bool) -> T:
        if align:
            align_keys = ["other", "cond"]
        else:
            align_keys = ["cond"]
            other = extract_array(other, extract_numpy=True)

        return self.apply_with_block(
            "where",
            align_keys=align_keys,
            other=other,
            cond=cond,
        )

    def round(self: T, decimals: int, using_cow: bool = False) -> T:
        return self.apply_with_block("round", decimals=decimals, using_cow=using_cow)

    def setitem(self: T, indexer, value) -> T:
        return self.apply_with_block("setitem", indexer=indexer, value=value)

    def putmask(self: T, mask, new, align: bool = True) -> T:
        if align:
            align_keys = ["new", "mask"]
        else:
            align_keys = ["mask"]
            new = extract_array(new, extract_numpy=True)

        return self.apply_with_block(
            "putmask",
            align_keys=align_keys,
            mask=mask,
            new=new,
        )

    def diff(self: T, n: int, axis: AxisInt) -> T:
        assert self.ndim == 2 and axis == 0  # caller ensures
        return self.apply(algos.diff, n=n, axis=axis)

    def interpolate(self: T, **kwargs) -> T:
        return self.apply_with_block("interpolate", swap_axis=False, **kwargs)

    def shift(self: T, periods: int, axis: AxisInt, fill_value) -> T:
        if fill_value is lib.no_default:
            fill_value = None

        if axis == 1 and self.ndim == 2:
            # TODO column-wise shift
            raise NotImplementedError

        return self.apply_with_block(
            "shift", periods=periods, axis=axis, fill_value=fill_value
        )

    def fillna(self: T, value, limit, inplace: bool, downcast) -> T:
        if limit is not None:
            # Do this validation even if we go through one of the no-op paths
            limit = libalgos.validate_limit(None, limit=limit)

        return self.apply_with_block(
            "fillna", value=value, limit=limit, inplace=inplace, downcast=downcast
        )

    def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> T:
        if copy is None:
            copy = True

        return self.apply(astype_array_safe, dtype=dtype, copy=copy, errors=errors)

    def convert(self: T, copy: bool | None) -> T:
        if copy is None:
            copy = True

        def _convert(arr):
            if is_object_dtype(arr.dtype):
                # extract PandasArray for tests that patch PandasArray._typ
                arr = np.asarray(arr)
                result = lib.maybe_convert_objects(
                    arr,
                    convert_datetime=True,
                    convert_timedelta=True,
                    convert_period=True,
                    convert_interval=True,
                )
                if result is arr and copy:
                    return arr.copy()
                return result
            else:
                return arr.copy() if copy else arr

        return self.apply(_convert)

    def replace_regex(self: T, **kwargs) -> T:
        return self.apply_with_block("_replace_regex", **kwargs)

    def replace(self: T, to_replace, value, inplace: bool) -> T:
        inplace = validate_bool_kwarg(inplace, "inplace")
        assert np.ndim(value) == 0, value
        # TODO "replace" is right now implemented on the blocks, we should move
        # it to general array algos so it can be reused here
        return self.apply_with_block(
            "replace", value=value, to_replace=to_replace, inplace=inplace
        )

    def replace_list(
        self: T,
        src_list: list[Any],
        dest_list: list[Any],
        inplace: bool = False,
        regex: bool = False,
    ) -> T:
        """do a list replace"""
        inplace = validate_bool_kwarg(inplace, "inplace")

        return self.apply_with_block(
            "replace_list",
            src_list=src_list,
            dest_list=dest_list,
            inplace=inplace,
            regex=regex,
        )

    def to_native_types(self: T, **kwargs) -> T:
        return self.apply(to_native_types, **kwargs)

    @property
    def is_mixed_type(self) -> bool:
        return True

    @property
    def is_numeric_mixed_type(self) -> bool:
        return all(is_numeric_dtype(t) for t in self.get_dtypes())

    @property
    def any_extension_types(self) -> bool:
        """Whether any of the blocks in this manager are extension blocks"""
        return False  # any(block.is_extension for block in self.blocks)

    @property
    def is_view(self) -> bool:
        """return a boolean if we are a single block and are a view"""
        # TODO what is this used for?
        return False

    @property
    def is_single_block(self) -> bool:
        return len(self.arrays) == 1

    def _get_data_subset(self: T, predicate: Callable) -> T:
        indices = [i for i, arr in enumerate(self.arrays) if predicate(arr)]
        arrays = [self.arrays[i] for i in indices]
        # TODO copy?
        # Note: using Index.take ensures we can retain e.g. DatetimeIndex.freq,
        # see test_describe_datetime_columns
        taker = np.array(indices, dtype="intp")
        new_cols = self._axes[1].take(taker)
        new_axes = [self._axes[0], new_cols]
        return type(self)(arrays, new_axes, verify_integrity=False)

    def get_bool_data(self: T, copy: bool = False) -> T:
        """
        Select columns that are bool-dtype and object-dtype columns that are all-bool.

        Parameters
        ----------
        copy : bool, default False
            Whether to copy the blocks
        """
        return self._get_data_subset(lambda x: x.dtype == np.dtype(bool))

    def get_numeric_data(self: T, copy: bool = False) -> T:
        """
        Select columns that have a numeric dtype.

        Parameters
        ----------
        copy : bool, default False
            Whether to copy the blocks
        """
        return self._get_data_subset(
            lambda arr: is_numeric_dtype(arr.dtype)
            or getattr(arr.dtype, "_is_numeric", False)
        )

    def copy(self: T, deep: bool | Literal["all"] | None = True) -> T:
        """
        Make deep or shallow copy of ArrayManager

        Parameters
        ----------
        deep : bool or string, default True
            If False, return shallow copy (do not copy data)
            If 'all', copy data and a deep copy of the index

        Returns
        -------
        ArrayManager
        """
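        # deep=False shares the underlying arrays, deep=True copies them, and
        # deep="all" additionally deep-copies the axes instead of taking views.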
        if deep is None:
            # ArrayManager does not yet support CoW, so deep=None always means
            # deep=True for now
            deep = True

        # this preserves the notion of view copying of axes
        if deep:
            # hit in e.g. tests.io.json.test_pandas

            def copy_func(ax):
                return ax.copy(deep=True) if deep == "all" else ax.view()

            new_axes = [copy_func(ax) for ax in self._axes]
        else:
            new_axes = list(self._axes)

        if deep:
            new_arrays = [arr.copy() for arr in self.arrays]
        else:
            new_arrays = list(self.arrays)
        return type(self)(new_arrays, new_axes, verify_integrity=False)

    def reindex_indexer(
        self: T,
        new_axis,
        indexer,
        axis: AxisInt,
        fill_value=None,
        allow_dups: bool = False,
        copy: bool | None = True,
        # ignored keywords
        only_slice: bool = False,
        # ArrayManager specific keywords
        use_na_proxy: bool = False,
    ) -> T:
        axis = self._normalize_axis(axis)
        return self._reindex_indexer(
            new_axis,
            indexer,
            axis,
            fill_value,
            allow_dups,
            copy,
            use_na_proxy,
        )

    def _reindex_indexer(
        self: T,
        new_axis,
        indexer: npt.NDArray[np.intp] | None,
        axis: AxisInt,
        fill_value=None,
        allow_dups: bool = False,
        copy: bool | None = True,
        use_na_proxy: bool = False,
    ) -> T:
        """
        Parameters
        ----------
        new_axis : Index
        indexer : ndarray[intp] or None
            pandas-indexer with -1's only.
        axis : int
        fill_value : object, default None
        allow_dups : bool, default False
        copy : bool, default True
        """
        if copy is None:
            # ArrayManager does not yet support CoW, so deep=None always means
            # deep=True for now
            copy = True

        if indexer is None:
            if new_axis is self._axes[axis] and not copy:
                return self

            result = self.copy(deep=copy)
            result._axes = list(self._axes)
            result._axes[axis] = new_axis
            return result

        # some axes don't allow reindexing with dups
        if not allow_dups:
            self._axes[axis]._validate_can_reindex(indexer)

        if axis >= self.ndim:
            raise IndexError("Requested axis not found in manager")

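        # axis == 1 reindexes the columns: whole arrays are selected (or all-NA
        # arrays created for -1 entries); axis == 0 reindexes the rows by
        # applying take_1d to every stored array.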
        if axis == 1:
            new_arrays = []
            for i in indexer:
                if i == -1:
                    arr = self._make_na_array(
                        fill_value=fill_value, use_na_proxy=use_na_proxy
                    )
                else:
                    arr = self.arrays[i]
                    if copy:
                        arr = arr.copy()
                new_arrays.append(arr)

        else:
            validate_indices(indexer, len(self._axes[0]))
            indexer = ensure_platform_int(indexer)
            mask = indexer == -1
            needs_masking = mask.any()
            new_arrays = [
                take_1d(
                    arr,
                    indexer,
                    allow_fill=needs_masking,
                    fill_value=fill_value,
                    mask=mask,
                    # if fill_value is not None else blk.fill_value
                )
                for arr in self.arrays
            ]

        new_axes = list(self._axes)
        new_axes[axis] = new_axis

        return type(self)(new_arrays, new_axes, verify_integrity=False)

    def take(
        self: T,
        indexer,
        axis: AxisInt = 1,
        verify: bool = True,
        convert_indices: bool = True,
    ) -> T:
        """
        Take items along any axis.
        """
        axis = self._normalize_axis(axis)

        indexer = (
            np.arange(indexer.start, indexer.stop, indexer.step, dtype="int64")
            if isinstance(indexer, slice)
            else np.asanyarray(indexer, dtype="int64")
        )

        if not indexer.ndim == 1:
            raise ValueError("indexer should be 1-dimensional")

        n = self.shape_proper[axis]
        if convert_indices:
            indexer = maybe_convert_indices(indexer, n, verify=verify)

        new_labels = self._axes[axis].take(indexer)
        return self._reindex_indexer(
            new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True
        )

    def _make_na_array(self, fill_value=None, use_na_proxy: bool = False):
        if use_na_proxy:
            assert fill_value is None
            return NullArrayProxy(self.shape_proper[0])

        if fill_value is None:
            fill_value = np.nan

        dtype, fill_value = infer_dtype_from_scalar(fill_value)
        # error: Argument "dtype" to "empty" has incompatible type "Union[dtype[Any],
        # ExtensionDtype]"; expected "Union[dtype[Any], None, type, _SupportsDType, str,
        # Union[Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], List[Any],
        # _DTypeDict, Tuple[Any, Any]]]"
        values = np.empty(self.shape_proper[0], dtype=dtype)  # type: ignore[arg-type]
        values.fill(fill_value)
        return values

    def _equal_values(self, other) -> bool:
        """
        Used in .equals defined in base class. Only check the column values
        assuming shape and indexes have already been checked.
        """
        for left, right in zip(self.arrays, other.arrays):
            if not array_equals(left, right):
                return False
        return True

    # TODO
    # to_dict


class ArrayManager(BaseArrayManager):
    @property
    def ndim(self) -> Literal[2]:
        return 2

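    # Illustrative construction (editorial sketch): the arrays are the columns
    # and the axes are given in [row_index, column_index] order, e.g.
    #
    #     arrays = [np.array([1, 2, 3]), np.array([0.1, 0.2, 0.3])]
    #     mgr = ArrayManager(arrays, [Index(range(3)), Index(["a", "b"])])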
    def __init__(
        self,
        arrays: list[np.ndarray | ExtensionArray],
        axes: list[Index],
        verify_integrity: bool = True,
    ) -> None:
        # Note: we are storing the axes in "_axes" in the (row, columns) order,
        # which is the reverse of how BlockManager stores them
        self._axes = axes
        self.arrays = arrays

        if verify_integrity:
            self._axes = [ensure_index(ax) for ax in axes]
            arrays = [extract_pandas_array(x, None, 1)[0] for x in arrays]
            self.arrays = [maybe_coerce_values(arr) for arr in arrays]
            self._verify_integrity()

    def _verify_integrity(self) -> None:
        n_rows, n_columns = self.shape_proper
        if not len(self.arrays) == n_columns:
            raise ValueError(
                "Number of passed arrays must equal the size of the column Index: "
                f"{len(self.arrays)} arrays vs {n_columns} columns."
            )
        for arr in self.arrays:
            if not len(arr) == n_rows:
                raise ValueError(
                    "Passed arrays should have the same length as the rows Index: "
                    f"{len(arr)} vs {n_rows} rows"
                )
            if not isinstance(arr, (np.ndarray, ExtensionArray)):
                raise ValueError(
                    "Passed arrays should be np.ndarray or ExtensionArray instances, "
                    f"got {type(arr)} instead"
                )
            if not arr.ndim == 1:
                raise ValueError(
                    "Passed arrays should be 1-dimensional, got array with "
                    f"{arr.ndim} dimensions instead."
                )

    # --------------------------------------------------------------------
    # Indexing

    def fast_xs(self, loc: int) -> SingleArrayManager:
        """
        Return the array corresponding to `frame.iloc[loc]`.

        Parameters
        ----------
        loc : int

        Returns
        -------
        np.ndarray or ExtensionArray
        """
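        # The row is assembled by taking element ``loc`` from every column and
        # packing the values into a single 1D array of the interleaved (common)
        # dtype of all columns.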
        dtype = interleaved_dtype([arr.dtype for arr in self.arrays])

        values = [arr[loc] for arr in self.arrays]
        if isinstance(dtype, ExtensionDtype):
            result = dtype.construct_array_type()._from_sequence(values, dtype=dtype)
        # for datetime64/timedelta64, the np.ndarray constructor cannot handle pd.NaT
        elif is_datetime64_ns_dtype(dtype):
            result = DatetimeArray._from_sequence(values, dtype=dtype)._ndarray
        elif is_timedelta64_ns_dtype(dtype):
            result = TimedeltaArray._from_sequence(values, dtype=dtype)._ndarray
        else:
            result = np.array(values, dtype=dtype)
        return SingleArrayManager([result], [self._axes[1]])

    def get_slice(self, slobj: slice, axis: AxisInt = 0) -> ArrayManager:
        axis = self._normalize_axis(axis)

        if axis == 0:
            arrays = [arr[slobj] for arr in self.arrays]
        elif axis == 1:
            arrays = self.arrays[slobj]

        new_axes = list(self._axes)
        new_axes[axis] = new_axes[axis]._getitem_slice(slobj)

        return type(self)(arrays, new_axes, verify_integrity=False)

    def iget(self, i: int) -> SingleArrayManager:
        """
        Return the data as a SingleArrayManager.
        """
        values = self.arrays[i]
        return SingleArrayManager([values], [self._axes[0]])

    def iget_values(self, i: int) -> ArrayLike:
        """
        Return the data for column i as the values (ndarray or ExtensionArray).
        """
        return self.arrays[i]

    @property
    def column_arrays(self) -> list[ArrayLike]:
        """
        Used in the JSON C code to access column arrays.
        """

        return [np.asarray(arr) for arr in self.arrays]

    def iset(
        self, loc: int | slice | np.ndarray, value: ArrayLike, inplace: bool = False
    ) -> None:
        """
        Set new column(s).

        This changes the ArrayManager in-place, but replaces (an) existing
        column(s), not changing column values in-place.

        Parameters
        ----------
        loc : integer, slice or boolean mask
            Positional location (already bounds checked)
        value : np.ndarray or ExtensionArray
        inplace : bool, default False
            Whether to overwrite the existing array as opposed to replacing it.
        """
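        # ``loc`` may be a single integer (replace one column), a slice, or a
        # boolean mask selecting multiple columns; the latter two are converted
        # to integer positions and ``value`` is consumed column by column.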
        # single column -> single integer index
        if lib.is_integer(loc):
            # TODO can we avoid needing to unpack this here? That means converting
            # DataFrame into 1D array when loc is an integer
            if isinstance(value, np.ndarray) and value.ndim == 2:
                assert value.shape[1] == 1
                value = value[:, 0]

            # TODO we receive a datetime/timedelta64 ndarray from DataFrame._iset_item
            # but we should avoid that and pass directly the proper array
            value = maybe_coerce_values(value)

            assert isinstance(value, (np.ndarray, ExtensionArray))
            assert value.ndim == 1
            assert len(value) == len(self._axes[0])
            self.arrays[loc] = value
            return

        # multiple columns -> convert slice or array to integer indices
        elif isinstance(loc, slice):
            indices = range(
                loc.start if loc.start is not None else 0,
                loc.stop if loc.stop is not None else self.shape_proper[1],
                loc.step if loc.step is not None else 1,
            )
        else:
            assert isinstance(loc, np.ndarray)
            assert loc.dtype == "bool"
            # error: Incompatible types in assignment (expression has type "ndarray",
            # variable has type "range")
            indices = np.nonzero(loc)[0]  # type: ignore[assignment]

        assert value.ndim == 2
        assert value.shape[0] == len(self._axes[0])

        for value_idx, mgr_idx in enumerate(indices):
            # error: No overload variant of "__getitem__" of "ExtensionArray" matches
            # argument type "Tuple[slice, int]"
            value_arr = value[:, value_idx]  # type: ignore[call-overload]
            self.arrays[mgr_idx] = value_arr
        return

    def column_setitem(
        self, loc: int, idx: int | slice | np.ndarray, value, inplace_only: bool = False
    ) -> None:
        """
        Set values ("setitem") into a single column (not setting the full column).

        This is a method on the ArrayManager level, to avoid creating an
        intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`)
        """
        if not is_integer(loc):
            raise TypeError("The column index should be an integer")
        arr = self.arrays[loc]
        mgr = SingleArrayManager([arr], [self._axes[0]])
        if inplace_only:
            mgr.setitem_inplace(idx, value)
        else:
            new_mgr = mgr.setitem((idx,), value)
            # update existing ArrayManager in-place
            self.arrays[loc] = new_mgr.arrays[0]

    def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None:
        """
        Insert item at selected position.

        Parameters
        ----------
        loc : int
        item : hashable
        value : np.ndarray or ExtensionArray
        """
        # insert to the axis; this could possibly raise a TypeError
        new_axis = self.items.insert(loc, item)

        value = extract_array(value, extract_numpy=True)
        if value.ndim == 2:
            if value.shape[0] == 1:
                # error: No overload variant of "__getitem__" of "ExtensionArray"
                # matches argument type "Tuple[int, slice]"
                value = value[0, :]  # type: ignore[call-overload]
            else:
                raise ValueError(
                    f"Expected a 1D array, got an array with shape {value.shape}"
                )
        value = maybe_coerce_values(value)

        # TODO self.arrays can be empty
        # assert len(value) == len(self.arrays[0])

        # TODO is this copy needed?
        arrays = self.arrays.copy()
        arrays.insert(loc, value)

        self.arrays = arrays
        self._axes[1] = new_axis

    def idelete(self, indexer) -> ArrayManager:
        """
        Delete selected locations in-place (new arrays, same ArrayManager)
        """
        to_keep = np.ones(self.shape[0], dtype=np.bool_)
        to_keep[indexer] = False

        self.arrays = [self.arrays[i] for i in np.nonzero(to_keep)[0]]
        self._axes = [self._axes[0], self._axes[1][to_keep]]
        return self

    # --------------------------------------------------------------------
    # Array-wise Operation

    def grouped_reduce(self: T, func: Callable) -> T:
        """
        Apply grouped reduction function columnwise, returning a new ArrayManager.

        Parameters
        ----------
        func : grouped reduction function

        Returns
        -------
        ArrayManager
        """
        result_arrays: list[np.ndarray] = []
        result_indices: list[int] = []

        for i, arr in enumerate(self.arrays):
            # grouped_reduce functions all expect 2D arrays
            arr = ensure_block_shape(arr, ndim=2)
            res = func(arr)
            if res.ndim == 2:
                # reverse of ensure_block_shape
                assert res.shape[0] == 1
                res = res[0]

            result_arrays.append(res)
            result_indices.append(i)

        if len(result_arrays) == 0:
            nrows = 0
        else:
            nrows = result_arrays[0].shape[0]
        index = Index(range(nrows))

        columns = self.items

        # error: Argument 1 to "ArrayManager" has incompatible type "List[ndarray]";
        # expected "List[Union[ndarray, ExtensionArray]]"
        return type(self)(result_arrays, [index, columns])  # type: ignore[arg-type]

    def reduce(self: T, func: Callable) -> T:
        """
        Apply reduction function column-wise, returning a single-row ArrayManager.

        Parameters
        ----------
        func : reduction function

        Returns
        -------
        ArrayManager
        """
        result_arrays: list[np.ndarray] = []
        for i, arr in enumerate(self.arrays):
            res = func(arr, axis=0)

            # TODO NaT doesn't preserve dtype, so we need to ensure to create
            # a timedelta result array if original was timedelta
            # what if datetime results in timedelta? (eg std)
            dtype = arr.dtype if res is NaT else None
            result_arrays.append(
                sanitize_array([res], None, dtype=dtype)  # type: ignore[arg-type]
            )

        index = Index._simple_new(np.array([None], dtype=object))  # placeholder
        columns = self.items

        # error: Argument 1 to "ArrayManager" has incompatible type "List[ndarray]";
        # expected "List[Union[ndarray, ExtensionArray]]"
        new_mgr = type(self)(result_arrays, [index, columns])  # type: ignore[arg-type]
        return new_mgr

    def operate_blockwise(self, other: ArrayManager, array_op) -> ArrayManager:
        """
        Apply array_op blockwise with another (aligned) ArrayManager.
        """
        # TODO what if `other` is BlockManager ?
        left_arrays = self.arrays
        right_arrays = other.arrays
        result_arrays = [
            array_op(left, right) for left, right in zip(left_arrays, right_arrays)
        ]
        return type(self)(result_arrays, self._axes)

    def quantile(
        self,
        *,
        qs: Index,  # with dtype float64
        axis: AxisInt = 0,
        transposed: bool = False,
        interpolation: QuantileInterpolation = "linear",
    ) -> ArrayManager:
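        # Each column is reshaped to 2D, ``quantile_compat`` computes the
        # requested quantiles, and the result manager has ``qs`` as its row
        # index with the original columns preserved.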
        arrs = [ensure_block_shape(x, 2) for x in self.arrays]
        assert axis == 1
        new_arrs = [
            quantile_compat(x, np.asarray(qs._values), interpolation) for x in arrs
        ]
        for i, arr in enumerate(new_arrs):
            if arr.ndim == 2:
                assert arr.shape[0] == 1, arr.shape
                new_arrs[i] = arr[0]

        axes = [qs, self._axes[1]]
        return type(self)(new_arrs, axes)

    # ----------------------------------------------------------------

    def unstack(self, unstacker, fill_value) -> ArrayManager:
        """
        Return an ArrayManager with all arrays unstacked.

        Parameters
        ----------
        unstacker : reshape._Unstacker
        fill_value : Any
            fill_value for newly introduced missing values.

        Returns
        -------
        unstacked : ArrayManager
        """
        indexer, _ = unstacker._indexer_and_to_sort
        if unstacker.mask.all():
            new_indexer = indexer
            allow_fill = False
            new_mask2D = None
            needs_masking = None
        else:
            new_indexer = np.full(unstacker.mask.shape, -1)
            new_indexer[unstacker.mask] = indexer
            allow_fill = True
            # calculating the full mask once and passing it to take_1d is faster
            # than letting take_1d calculate it in each repeated call
            new_mask2D = (~unstacker.mask).reshape(*unstacker.full_shape)
            needs_masking = new_mask2D.any(axis=0)
        new_indexer2D = new_indexer.reshape(*unstacker.full_shape)
        new_indexer2D = ensure_platform_int(new_indexer2D)

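        # For every original column we emit ``full_shape[1]`` new columns (one
        # per unstacked level value), in the same order that
        # ``unstacker.get_new_columns`` produces the new column Index.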
        new_arrays = []
        for arr in self.arrays:
            for i in range(unstacker.full_shape[1]):
                if allow_fill:
                    # error: Value of type "Optional[Any]" is not indexable [index]
                    new_arr = take_1d(
                        arr,
                        new_indexer2D[:, i],
                        allow_fill=needs_masking[i],  # type: ignore[index]
                        fill_value=fill_value,
                        mask=new_mask2D[:, i],  # type: ignore[index]
                    )
                else:
                    new_arr = take_1d(arr, new_indexer2D[:, i], allow_fill=False)
                new_arrays.append(new_arr)

        new_index = unstacker.new_index
        new_columns = unstacker.get_new_columns(self._axes[1])
        new_axes = [new_index, new_columns]

        return type(self)(new_arrays, new_axes, verify_integrity=False)

    def as_array(
        self,
        dtype=None,
        copy: bool = False,
        na_value: object = lib.no_default,
    ) -> np.ndarray:
        """
        Convert the ArrayManager data into a numpy array.

        Parameters
        ----------
        dtype : object, default None
            Data type of the return array.
        copy : bool, default False
            If True then guarantee that a copy is returned. A value of
            False does not guarantee that the underlying data is not
            copied.
        na_value : object, default lib.no_default
            Value to be used as the missing value sentinel.

        Returns
        -------
        arr : ndarray
        """
        if len(self.arrays) == 0:
            empty_arr = np.empty(self.shape, dtype=float)
            return empty_arr.transpose()

        # We want to copy when na_value is provided to avoid
        # mutating the original object
        copy = copy or na_value is not lib.no_default

        if not dtype:
            dtype = interleaved_dtype([arr.dtype for arr in self.arrays])

        if isinstance(dtype, SparseDtype):
            dtype = dtype.subtype
        elif isinstance(dtype, PandasDtype):
            dtype = dtype.numpy_dtype
        elif is_extension_array_dtype(dtype):
            dtype = "object"
        elif is_dtype_equal(dtype, str):
            dtype = "object"

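        # The result is built column by column: each stored array is cast to
        # the target dtype and written into one column of the preallocated
        # (n_rows, n_columns) array.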
        result = np.empty(self.shape_proper, dtype=dtype)

        for i, arr in enumerate(self.arrays):
            arr = arr.astype(dtype, copy=copy)
            result[:, i] = arr

        if na_value is not lib.no_default:
            result[isna(result)] = na_value

        return result


class SingleArrayManager(BaseArrayManager, SingleDataManager):
    __slots__ = [
        "_axes",  # private attribute, because 'axes' has different order, see below
        "arrays",
    ]

    arrays: list[np.ndarray | ExtensionArray]
    _axes: list[Index]

    @property
    def ndim(self) -> Literal[1]:
        return 1

    def __init__(
        self,
        arrays: list[np.ndarray | ExtensionArray],
        axes: list[Index],
        verify_integrity: bool = True,
    ) -> None:
        self._axes = axes
        self.arrays = arrays

        if verify_integrity:
            assert len(axes) == 1
            assert len(arrays) == 1
            self._axes = [ensure_index(ax) for ax in self._axes]
            arr = arrays[0]
            arr = maybe_coerce_values(arr)
            arr = extract_pandas_array(arr, None, 1)[0]
            self.arrays = [arr]
            self._verify_integrity()

    def _verify_integrity(self) -> None:
        (n_rows,) = self.shape
        assert len(self.arrays) == 1
        arr = self.arrays[0]
        assert len(arr) == n_rows
        if not arr.ndim == 1:
            raise ValueError(
                "Passed array should be 1-dimensional, got array with "
                f"{arr.ndim} dimensions instead."
            )

    @staticmethod
    def _normalize_axis(axis):
        return axis

    def make_empty(self, axes=None) -> SingleArrayManager:
        """Return an empty ArrayManager with index/array of length 0"""
        if axes is None:
            axes = [Index([], dtype=object)]
        array: np.ndarray = np.array([], dtype=self.dtype)
        return type(self)([array], axes)

    @classmethod
    def from_array(cls, array, index) -> SingleArrayManager:
        return cls([array], [index])
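    # A SingleArrayManager always holds exactly one 1D array and one Index;
    # e.g. ``SingleArrayManager.from_array(np.array([1, 2, 3]), Index(range(3)))``
    # is the storage backing a length-3 Series.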

    @property
    def axes(self):
        return self._axes

    @property
    def index(self) -> Index:
        return self._axes[0]

    @property
    def dtype(self):
        return self.array.dtype

    def external_values(self):
        """The array that Series.values returns"""
        return external_values(self.array)

    def internal_values(self):
        """The array that Series._values returns"""
        return self.array

    def array_values(self):
        """The array that Series.array returns"""
        arr = self.array
        if isinstance(arr, np.ndarray):
            arr = PandasArray(arr)
        return arr

    @property
    def _can_hold_na(self) -> bool:
        if isinstance(self.array, np.ndarray):
            return self.array.dtype.kind not in ["b", "i", "u"]
        else:
            # ExtensionArray
            return self.array._can_hold_na

    @property
    def is_single_block(self) -> bool:
        return True

    def fast_xs(self, loc: int) -> SingleArrayManager:
        raise NotImplementedError("Use series._values[loc] instead")

    def get_slice(self, slobj: slice, axis: AxisInt = 0) -> SingleArrayManager:
        if axis >= self.ndim:
            raise IndexError("Requested axis not found in manager")

        new_array = self.array[slobj]
        new_index = self.index._getitem_slice(slobj)
        return type(self)([new_array], [new_index], verify_integrity=False)

    def getitem_mgr(self, indexer) -> SingleArrayManager:
        new_array = self.array[indexer]
        new_index = self.index[indexer]
        return type(self)([new_array], [new_index])

    def apply(self, func, **kwargs):
        if callable(func):
            new_array = func(self.array, **kwargs)
        else:
            new_array = getattr(self.array, func)(**kwargs)
        return type(self)([new_array], self._axes)

    def setitem(self, indexer, value) -> SingleArrayManager:
        """
        Set values with indexer.

        For SingleArrayManager, this backs s[indexer] = value

        See `setitem_inplace` for a version that works inplace and doesn't
        return a new Manager.
        """
        if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim:
            raise ValueError(f"Cannot set values with ndim > {self.ndim}")
        return self.apply_with_block("setitem", indexer=indexer, value=value)

    def idelete(self, indexer) -> SingleArrayManager:
        """
        Delete selected locations in-place (new array, same ArrayManager)
        """
        to_keep = np.ones(self.shape[0], dtype=np.bool_)
        to_keep[indexer] = False

        self.arrays = [self.arrays[0][to_keep]]
        self._axes = [self._axes[0][to_keep]]
        return self

    def _get_data_subset(self, predicate: Callable) -> SingleArrayManager:
        # used in get_numeric_data / get_bool_data
        if predicate(self.array):
            return type(self)(self.arrays, self._axes, verify_integrity=False)
        else:
            return self.make_empty()

    def set_values(self, values: ArrayLike) -> None:
        """
        Set (replace) the values of the SingleArrayManager in place.

        Use at your own risk! This does not check if the passed values are
        valid for the current SingleArrayManager (length, dtype, etc).
        """
        self.arrays[0] = values

    def to_2d_mgr(self, columns: Index) -> ArrayManager:
        """
        Manager analogue of Series.to_frame
        """
        arrays = [self.arrays[0]]
        axes = [self.axes[0], columns]

        return ArrayManager(arrays, axes, verify_integrity=False)


class NullArrayProxy:
    """
    Proxy object for an all-NA array.

    Only stores the length of the array, and not the dtype. The dtype
    will only be known when actually concatenating (after determining the
    common dtype, for which this proxy is ignored).
    Using this object avoids the need for internals/concat.py to determine
    the proper dtype and array type up front.
    """

    ndim = 1

    def __init__(self, n: int) -> None:
        self.n = n

    @property
    def shape(self) -> tuple[int]:
        return (self.n,)

    def to_array(self, dtype: DtypeObj) -> ArrayLike:
        """
        Helper function to create the actual all-NA array from the NullArrayProxy
        object.

        Parameters
        ----------
        dtype : DtypeObj
            The dtype for the resulting array.

        Returns
        -------
        np.ndarray or ExtensionArray
        """
        if isinstance(dtype, ExtensionDtype):
            empty = dtype.construct_array_type()._from_sequence([], dtype=dtype)
            indexer = -np.ones(self.n, dtype=np.intp)
            return empty.take(indexer, allow_fill=True)
        else:
            # when introducing missing values, int becomes float, bool becomes object
            dtype = ensure_dtype_can_hold_na(dtype)
            fill_value = na_value_for_dtype(dtype)
            arr = np.empty(self.n, dtype=dtype)
            arr.fill(fill_value)
            return ensure_wrapped_if_datetimelike(arr)