1"""
2Functions for preparing various inputs passed to the DataFrame or Series
3constructors before passing them to a BlockManager.
4"""
5from __future__ import annotations
6
7from collections import abc
8from typing import (
9 Any,
10 Hashable,
11 Sequence,
12)
13
14import numpy as np
15from numpy import ma
16
17from pandas._libs import lib
18from pandas._typing import (
19 ArrayLike,
20 DtypeObj,
21 Manager,
22 npt,
23)
24
25from pandas.core.dtypes.astype import astype_is_view
26from pandas.core.dtypes.cast import (
27 construct_1d_arraylike_from_scalar,
28 dict_compat,
29 maybe_cast_to_datetime,
30 maybe_convert_platform,
31 maybe_infer_to_datetimelike,
32)
33from pandas.core.dtypes.common import (
34 is_1d_only_ea_dtype,
35 is_bool_dtype,
36 is_datetime_or_timedelta_dtype,
37 is_dtype_equal,
38 is_extension_array_dtype,
39 is_float_dtype,
40 is_integer_dtype,
41 is_list_like,
42 is_named_tuple,
43 is_object_dtype,
44)
45from pandas.core.dtypes.dtypes import ExtensionDtype
46from pandas.core.dtypes.generic import (
47 ABCDataFrame,
48 ABCSeries,
49)
50
51from pandas.core import (
52 algorithms,
53 common as com,
54)
55from pandas.core.arrays import (
56 BooleanArray,
57 ExtensionArray,
58 FloatingArray,
59 IntegerArray,
60)
61from pandas.core.arrays.string_ import StringDtype
62from pandas.core.construction import (
63 ensure_wrapped_if_datetimelike,
64 extract_array,
65 range_to_ndarray,
66 sanitize_array,
67)
68from pandas.core.indexes.api import (
69 DatetimeIndex,
70 Index,
71 TimedeltaIndex,
72 default_index,
73 ensure_index,
74 get_objs_combined_axis,
75 union_indexes,
76)
77from pandas.core.internals.array_manager import (
78 ArrayManager,
79 SingleArrayManager,
80)
81from pandas.core.internals.blocks import (
82 BlockPlacement,
83 ensure_block_shape,
84 new_block_2d,
85)
86from pandas.core.internals.managers import (
87 BlockManager,
88 SingleBlockManager,
89 create_block_manager_from_blocks,
90 create_block_manager_from_column_arrays,
91)
92
93# ---------------------------------------------------------------------
94# BlockManager Interface
95
96
def arrays_to_mgr(
    arrays,
    columns: Index,
    index,
    *,
    dtype: DtypeObj | None = None,
    verify_integrity: bool = True,
    typ: str | None = None,
    consolidate: bool = True,
) -> Manager:
    """
    Segregate Series based on type and coerce into matrices.

    Needs to handle a lot of exceptional cases.

    Parameters
    ----------
    arrays : sequence
        One array-like per column; may contain Series/dicts when
        ``verify_integrity=True`` (they get homogenized below).
    columns : Index
        Column labels; must have the same length as ``arrays``.
    index : Index or None
        Row labels; inferred from ``arrays`` when None.
    dtype : DtypeObj, optional
        Single dtype to coerce every column to.
    verify_integrity : bool, default True
        When False (the DataFrame._from_arrays path) the arrays are assumed
        to be pre-validated and only minimally checked.
    typ : {"block", "array"}
        Which Manager implementation to build.
    consolidate : bool, default True
        Forwarded to the BlockManager construction.

    Raises
    ------
    ValueError
        On malformed arrays, length mismatch with columns, or invalid ``typ``.
    """
    if verify_integrity:
        # figure out the index, if necessary
        if index is None:
            index = _extract_index(arrays)
        else:
            index = ensure_index(index)

        # don't force copy because getting jammed in an ndarray anyway
        # refs holds per-column CoW references for arrays that came from Series
        arrays, refs = _homogenize(arrays, index, dtype)
        # _homogenize ensures
        # - all(len(x) == len(index) for x in arrays)
        # - all(x.ndim == 1 for x in arrays)
        # - all(isinstance(x, (np.ndarray, ExtensionArray)) for x in arrays)
        # - all(type(x) is not PandasArray for x in arrays)

    else:
        index = ensure_index(index)
        arrays = [extract_array(x, extract_numpy=True) for x in arrays]
        # with _from_arrays, the passed arrays should never be Series objects
        refs = [None] * len(arrays)

        # Reached via DataFrame._from_arrays; we do minimal validation here
        for arr in arrays:
            if (
                not isinstance(arr, (np.ndarray, ExtensionArray))
                or arr.ndim != 1
                or len(arr) != len(index)
            ):
                raise ValueError(
                    "Arrays must be 1-dimensional np.ndarray or ExtensionArray "
                    "with length matching len(index)"
                )

    columns = ensure_index(columns)
    if len(columns) != len(arrays):
        raise ValueError("len(arrays) must match len(columns)")

    # from BlockManager perspective
    axes = [columns, index]

    if typ == "block":
        return create_block_manager_from_column_arrays(
            arrays, axes, consolidate=consolidate, refs=refs
        )
    elif typ == "array":
        return ArrayManager(arrays, [index, columns])
    else:
        raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
160
161
def rec_array_to_mgr(
    data: np.recarray | np.ndarray,
    index,
    columns,
    dtype: DtypeObj | None,
    copy: bool,
    typ: str,
) -> Manager:
    """
    Build a Manager from a (possibly masked) record array.

    The record array is decomposed into one 1D array per field, the fields
    are reordered to match ``columns`` when given, and the result is fed
    through ``arrays_to_mgr``.
    """
    # strip any mask; we only want the plain records here
    plain = ma.getdata(data)

    index = default_index(len(plain)) if index is None else ensure_index(index)

    if columns is not None:
        columns = ensure_index(columns)

    # split the record array into one array per field
    arrays, arr_columns = to_arrays(plain, columns)

    # cheaply reindex the field arrays against the requested columns
    arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, len(index))
    if columns is None:
        columns = arr_columns

    mgr = arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ)

    return mgr.copy() if copy else mgr
195
196
def mgr_to_mgr(mgr, typ: str, copy: bool = True):
    """
    Convert a Manager to the requested flavor ("block" or "array").

    No conversion (and no copy) happens when ``mgr`` already has the
    requested type; otherwise no copy is guaranteed.  The ``copy`` keyword
    only controls whether the Block->ArrayManager direction copies the
    underlying 1D arrays.
    """
    result: Manager

    if typ == "block":
        if isinstance(mgr, BlockManager):
            return mgr
        if mgr.ndim == 2:
            result = arrays_to_mgr(mgr.arrays, mgr.axes[0], mgr.axes[1], typ="block")
        else:
            result = SingleBlockManager.from_array(mgr.arrays[0], mgr.index)
    elif typ == "array":
        if isinstance(mgr, ArrayManager):
            return mgr
        if mgr.ndim == 2:
            col_arrays = [mgr.iget_values(i) for i in range(len(mgr.axes[0]))]
            if copy:
                col_arrays = [arr.copy() for arr in col_arrays]
            result = ArrayManager(col_arrays, [mgr.axes[1], mgr.axes[0]])
        else:
            arr = mgr.internal_values()
            if copy:
                arr = arr.copy()
            result = SingleArrayManager([arr], [mgr.index])
    else:
        raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
    return result
232
233
234# ---------------------------------------------------------------------
235# DataFrame Constructor Interface
236
237
def ndarray_to_mgr(
    values, index, columns, dtype: DtypeObj | None, copy: bool, typ: str
) -> Manager:
    """
    Build a Manager from a single array-like object.

    Used in DataFrame.__init__.

    Parameters
    ----------
    values : ndarray, list, Series, Index, ExtensionArray
    index, columns : Index or None
        Axes; defaulted from the data's shape when None.
    dtype : DtypeObj or None
        Target dtype; values are sanitized to it when it differs.
    copy : bool
        Whether the caller requested a defensive copy of the input.
    typ : {"block", "array"}
        Which Manager implementation to build.
    """
    # used in DataFrame.__init__
    # input must be a ndarray, list, Series, Index, ExtensionArray

    if isinstance(values, ABCSeries):
        if columns is None:
            if values.name is not None:
                columns = Index([values.name])
        if index is None:
            index = values.index
        else:
            # align the Series to the requested index
            values = values.reindex(index)

        # zero len case (GH #2234)
        if not len(values) and columns is not None and len(columns):
            values = np.empty((0, 1), dtype=object)

    # if the array preparation does a copy -> avoid this for ArrayManager,
    # since the copy is done on conversion to 1D arrays
    copy_on_sanitize = False if typ == "array" else copy

    vdtype = getattr(values, "dtype", None)
    # refs holds the CoW references when the values come from a Series/Index
    refs = None
    if is_1d_only_ea_dtype(vdtype) or is_1d_only_ea_dtype(dtype):
        # GH#19157

        if isinstance(values, (np.ndarray, ExtensionArray)) and values.ndim > 1:
            # GH#12513 a EA dtype passed with a 2D array, split into
            # multiple EAs that view the values
            # error: No overload variant of "__getitem__" of "ExtensionArray"
            # matches argument type "Tuple[slice, int]"
            values = [
                values[:, n]  # type: ignore[call-overload]
                for n in range(values.shape[1])
            ]
        else:
            values = [values]

        if columns is None:
            columns = Index(range(len(values)))
        else:
            columns = ensure_index(columns)

        # 1D-only EAs are handled column-wise by the arrays path
        return arrays_to_mgr(values, columns, index, dtype=dtype, typ=typ)

    elif is_extension_array_dtype(vdtype):
        # i.e. Datetime64TZ, PeriodDtype; cases with is_1d_only_ea_dtype(vdtype)
        # are already caught above
        values = extract_array(values, extract_numpy=True)
        if copy:
            values = values.copy()
        if values.ndim == 1:
            values = values.reshape(-1, 1)

    elif isinstance(values, (ABCSeries, Index)):
        # only track references when no re-dtyping copy will be made
        if not copy_on_sanitize and (
            dtype is None or astype_is_view(values.dtype, dtype)
        ):
            refs = values._references

        if copy_on_sanitize:
            values = values._values.copy()
        else:
            values = values._values

        values = _ensure_2d(values)

    elif isinstance(values, (np.ndarray, ExtensionArray)):
        # drop subclass info
        # only copy here if a later astype would be a view anyway
        _copy = (
            copy_on_sanitize
            if (dtype is None or astype_is_view(values.dtype, dtype))
            else False
        )
        values = np.array(values, copy=_copy)
        values = _ensure_2d(values)

    else:
        # by definition an array here
        # the dtypes will be coerced to a single dtype
        values = _prep_ndarraylike(values, copy=copy_on_sanitize)

    if dtype is not None and not is_dtype_equal(values.dtype, dtype):
        # GH#40110 see similar check inside sanitize_array
        values = sanitize_array(
            values,
            None,
            dtype=dtype,
            copy=copy_on_sanitize,
            allow_2d=True,
        )

    # _prep_ndarraylike ensures that values.ndim == 2 at this point
    index, columns = _get_axes(
        values.shape[0], values.shape[1], index=index, columns=columns
    )

    _check_values_indices_shape_match(values, index, columns)

    if typ == "array":
        # ArrayManager path: split into per-column 1D arrays and return early
        if issubclass(values.dtype.type, str):
            values = np.array(values, dtype=object)

        if dtype is None and is_object_dtype(values.dtype):
            # infer datetimelikes column-by-column from object dtype
            arrays = [
                ensure_wrapped_if_datetimelike(
                    maybe_infer_to_datetimelike(values[:, i])
                )
                for i in range(values.shape[1])
            ]
        else:
            if is_datetime_or_timedelta_dtype(values.dtype):
                values = ensure_wrapped_if_datetimelike(values)
            arrays = [values[:, i] for i in range(values.shape[1])]

        if copy:
            arrays = [arr.copy() for arr in arrays]

        return ArrayManager(arrays, [index, columns], verify_integrity=False)

    # BlockManager stores data transposed (columns as rows)
    values = values.T

    # if we don't have a dtype specified, then try to convert objects
    # on the entire block; this is to convert if we have datetimelike's
    # embedded in an object type
    if dtype is None and is_object_dtype(values.dtype):
        obj_columns = list(values)
        maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns]
        # don't convert (and copy) the objects if no type inference occurs
        if any(x is not y for x, y in zip(obj_columns, maybe_datetime)):
            # at least one column was converted: one block per column
            dvals_list = [ensure_block_shape(dval, 2) for dval in maybe_datetime]
            block_values = [
                new_block_2d(dvals_list[n], placement=BlockPlacement(n))
                for n in range(len(dvals_list))
            ]
        else:
            bp = BlockPlacement(slice(len(columns)))
            nb = new_block_2d(values, placement=bp, refs=refs)
            block_values = [nb]
    else:
        # single block spanning all columns
        bp = BlockPlacement(slice(len(columns)))
        nb = new_block_2d(values, placement=bp, refs=refs)
        block_values = [nb]

    if len(columns) == 0:
        # TODO: check len(values) == 0?
        block_values = []

    return create_block_manager_from_blocks(
        block_values, [columns, index], verify_integrity=False
    )
391
392
393def _check_values_indices_shape_match(
394 values: np.ndarray, index: Index, columns: Index
395) -> None:
396 """
397 Check that the shape implied by our axes matches the actual shape of the
398 data.
399 """
400 if values.shape[1] != len(columns) or values.shape[0] != len(index):
401 # Could let this raise in Block constructor, but we get a more
402 # helpful exception message this way.
403 if values.shape[0] == 0:
404 raise ValueError("Empty data passed with indices specified.")
405
406 passed = values.shape
407 implied = (len(index), len(columns))
408 raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")
409
410
def dict_to_mgr(
    data: dict,
    index,
    columns,
    *,
    dtype: DtypeObj | None = None,
    typ: str = "block",
    copy: bool = True,
) -> Manager:
    """
    Segregate Series based on type and coerce into matrices.
    Needs to handle a lot of exceptional cases.

    Used in DataFrame.__init__

    Parameters
    ----------
    data : dict
        Mapping of column label -> column values (array-likes or scalars).
    index, columns : Index or None
        Requested axes; missing dict keys become all-NaN columns when
        ``columns`` is given.
    dtype : DtypeObj, optional
        Single dtype to coerce all columns to.
    typ : {"block", "array"}, default "block"
        Which Manager implementation to build.
    copy : bool, default True
        Whether the column data should be copied.
    """
    arrays: Sequence[Any] | Series

    if columns is not None:
        from pandas.core.series import Series

        # use an object-dtype Series keyed by `columns` so missing keys
        # surface as NaN entries
        arrays = Series(data, index=columns, dtype=object)
        missing = arrays.isna()
        if index is None:
            # GH10856
            # raise ValueError if only scalars in dict
            index = _extract_index(arrays[~missing])
        else:
            index = ensure_index(index)

        # no obvious "empty" int column
        if missing.any() and not is_integer_dtype(dtype):
            nan_dtype: DtypeObj

            if dtype is not None:
                # calling sanitize_array ensures we don't mix-and-match
                # NA dtypes
                midxs = missing.values.nonzero()[0]
                for i in midxs:
                    arr = sanitize_array(arrays.iat[i], index, dtype=dtype)
                    arrays.iat[i] = arr
            else:
                # GH#1783
                nan_dtype = np.dtype("object")
                val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype)
                nmissing = missing.sum()
                if copy:
                    # downstream copies anyway, so sharing `val` is fine here
                    rhs = [val] * nmissing
                else:
                    # GH#45369: without a downstream copy, each missing
                    # column needs its own array
                    rhs = [val.copy() for _ in range(nmissing)]
                arrays.loc[missing] = rhs

        arrays = list(arrays)
        columns = ensure_index(columns)

    else:
        # infer columns from the dict keys, preserving insertion order
        keys = list(data.keys())
        columns = Index(keys) if keys else default_index(0)
        arrays = [com.maybe_iterable_to_list(data[k]) for k in keys]
        arrays = [arr if not isinstance(arr, Index) else arr._data for arr in arrays]

    if copy:
        if typ == "block":
            # We only need to copy arrays that will not get consolidated, i.e.
            # only EA arrays
            arrays = [x.copy() if isinstance(x, ExtensionArray) else x for x in arrays]
        else:
            # dtype check to exclude e.g. range objects, scalars
            arrays = [x.copy() if hasattr(x, "dtype") else x for x in arrays]

    return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
482
483
def nested_data_to_arrays(
    data: Sequence,
    columns: Index | None,
    index: Index | None,
    dtype: DtypeObj | None,
) -> tuple[list[ArrayLike], Index, Index]:
    """
    Convert a sequence of rows into per-column arrays plus both axes.

    Callers are responsible for having checked ``treat_as_nested(data)``
    before getting here.
    """
    # namedtuple rows supply column names when none were passed
    if columns is None and is_named_tuple(data[0]):
        columns = ensure_index(data[0]._fields)

    arrays, columns = to_arrays(data, columns, dtype=dtype)
    columns = ensure_index(columns)

    if index is None:
        if isinstance(data[0], ABCSeries):
            # Series rows may carry names usable as the index
            index = _get_names_from_index(data)
        else:
            index = default_index(len(data))

    return arrays, columns, index
508
509
def treat_as_nested(data) -> bool:
    """
    Decide whether ``data`` should go through ``nested_data_to_arrays``.

    True when ``data`` is non-empty and its first element is a
    1-dimensional list-like, except when ``data`` itself is a 2D
    ExtensionArray (handled as a single array elsewhere).
    """
    if not len(data):
        return False
    first = data[0]
    if not is_list_like(first) or getattr(first, "ndim", 1) != 1:
        return False
    return not (isinstance(data, ExtensionArray) and data.ndim == 2)
520
521
522# ---------------------------------------------------------------------
523
524
def _prep_ndarraylike(values, copy: bool = True) -> np.ndarray:
    """
    Coerce a generic list-like into a 2D ndarray.

    ``values`` is specifically _not_ an ndarray, EA, Index, or Series, and
    we only get here with ``not treat_as_nested(values)``.

    NOTE: ``copy`` is part of the signature for interface compatibility but
    is not consulted in this function.
    """
    if len(values) == 0:
        # TODO: check for length-zero range, in which case return int64 dtype?
        # TODO: re-use anything in try_cast?
        return np.empty((0, 0), dtype=object)
    if isinstance(values, range):
        # a range becomes a single integer column
        return range_to_ndarray(values)[..., np.newaxis]

    def _coerce(obj):
        # scalars and DataFrames pass through untouched
        if not is_list_like(obj) or isinstance(obj, ABCDataFrame):
            return obj

        obj = extract_array(obj, extract_numpy=True)
        # We don't do maybe_infer_to_datetimelike here bc we will end up
        # doing it column-by-column in ndarray_to_mgr
        return maybe_convert_platform(obj)

    # This is the equivalent of np.asarray but with object conversion and
    # platform-dtype preservation; unlike np.asarray it does not coerce
    # e.g. [1, "a", True] to ["1", "a", "True"].
    first = values[0]
    if is_list_like(first) or (isinstance(first, np.ndarray) and first.ndim == 0):
        # 2-dim input (or rows of 0-d arrays, GH#21861): convert row-wise
        values = np.array([_coerce(v) for v in values])
    else:
        values = _coerce(values)

    return _ensure_2d(values)
561
562
563def _ensure_2d(values: np.ndarray) -> np.ndarray:
564 """
565 Reshape 1D values, raise on anything else other than 2D.
566 """
567 if values.ndim == 1:
568 values = values.reshape((values.shape[0], 1))
569 elif values.ndim != 2:
570 raise ValueError(f"Must pass 2-d input. shape={values.shape}")
571 return values
572
573
def _homogenize(
    data, index: Index, dtype: DtypeObj | None
) -> tuple[list[ArrayLike], list[Any]]:
    """
    Align and sanitize per-column data against ``index``.

    Returns
    -------
    list[ArrayLike]
        One 1D array per input column, each with ``len(index)`` elements.
    list[Any]
        Per-column CoW references (non-None only for Series inputs).
    """
    oindex = None
    homogenized = []
    # if the original array-like in `data` is a Series, keep track of this Series' refs
    refs: list[Any] = []

    for val in data:
        if isinstance(val, ABCSeries):
            if dtype is not None:
                val = val.astype(dtype, copy=False)
            if val.index is not index:
                # Forces alignment. No need to copy data since we
                # are putting it into an ndarray later
                val = val.reindex(index, copy=False)
            refs.append(val._references)
            val = val._values
        else:
            if isinstance(val, dict):
                # GH#41785 this _should_ be equivalent to (but faster than)
                # val = Series(val, index=index)._values
                if oindex is None:
                    # object view of the index, built lazily and reused
                    oindex = index.astype("O")

                if isinstance(index, (DatetimeIndex, TimedeltaIndex)):
                    # see test_constructor_dict_datetime64_index
                    val = dict_compat(val)
                else:
                    # see test_constructor_subclass_dict
                    val = dict(val)
                val = lib.fast_multiget(val, oindex._values, default=np.nan)

            val = sanitize_array(val, index, dtype=dtype, copy=False)
            com.require_length_match(val, index)
            refs.append(None)

        homogenized.append(val)

    return homogenized, refs
614
615
616def _extract_index(data) -> Index:
617 """
618 Try to infer an Index from the passed data, raise ValueError on failure.
619 """
620 index: Index
621 if len(data) == 0:
622 return default_index(0)
623
624 raw_lengths = []
625 indexes: list[list[Hashable] | Index] = []
626
627 have_raw_arrays = False
628 have_series = False
629 have_dicts = False
630
631 for val in data:
632 if isinstance(val, ABCSeries):
633 have_series = True
634 indexes.append(val.index)
635 elif isinstance(val, dict):
636 have_dicts = True
637 indexes.append(list(val.keys()))
638 elif is_list_like(val) and getattr(val, "ndim", 1) == 1:
639 have_raw_arrays = True
640 raw_lengths.append(len(val))
641 elif isinstance(val, np.ndarray) and val.ndim > 1:
642 raise ValueError("Per-column arrays must each be 1-dimensional")
643
644 if not indexes and not raw_lengths:
645 raise ValueError("If using all scalar values, you must pass an index")
646
647 if have_series:
648 index = union_indexes(indexes)
649 elif have_dicts:
650 index = union_indexes(indexes, sort=False)
651
652 if have_raw_arrays:
653 lengths = list(set(raw_lengths))
654 if len(lengths) > 1:
655 raise ValueError("All arrays must be of the same length")
656
657 if have_dicts:
658 raise ValueError(
659 "Mixing dicts with non-Series may lead to ambiguous ordering."
660 )
661
662 if have_series:
663 if lengths[0] != len(index):
664 msg = (
665 f"array length {lengths[0]} does not match index "
666 f"length {len(index)}"
667 )
668 raise ValueError(msg)
669 else:
670 index = default_index(lengths[0])
671
672 return ensure_index(index)
673
674
675def reorder_arrays(
676 arrays: list[ArrayLike], arr_columns: Index, columns: Index | None, length: int
677) -> tuple[list[ArrayLike], Index]:
678 """
679 Pre-emptively (cheaply) reindex arrays with new columns.
680 """
681 # reorder according to the columns
682 if columns is not None:
683 if not columns.equals(arr_columns):
684 # if they are equal, there is nothing to do
685 new_arrays: list[ArrayLike | None]
686 new_arrays = [None] * len(columns)
687 indexer = arr_columns.get_indexer(columns)
688 for i, k in enumerate(indexer):
689 if k == -1:
690 # by convention default is all-NaN object dtype
691 arr = np.empty(length, dtype=object)
692 arr.fill(np.nan)
693 else:
694 arr = arrays[k]
695 new_arrays[i] = arr
696
697 # Incompatible types in assignment (expression has type
698 # "List[Union[ExtensionArray, ndarray[Any, Any], None]]", variable
699 # has type "List[Union[ExtensionArray, ndarray[Any, Any]]]")
700 arrays = new_arrays # type: ignore[assignment]
701 arr_columns = columns
702
703 return arrays, arr_columns
704
705
706def _get_names_from_index(data) -> Index:
707 has_some_name = any(getattr(s, "name", None) is not None for s in data)
708 if not has_some_name:
709 return default_index(len(data))
710
711 index: list[Hashable] = list(range(len(data)))
712 count = 0
713 for i, s in enumerate(data):
714 n = getattr(s, "name", None)
715 if n is not None:
716 index[i] = n
717 else:
718 index[i] = f"Unnamed {count}"
719 count += 1
720
721 return Index(index)
722
723
724def _get_axes(
725 N: int, K: int, index: Index | None, columns: Index | None
726) -> tuple[Index, Index]:
727 # helper to create the axes as indexes
728 # return axes or defaults
729
730 if index is None:
731 index = default_index(N)
732 else:
733 index = ensure_index(index)
734
735 if columns is None:
736 columns = default_index(K)
737 else:
738 columns = ensure_index(columns)
739 return index, columns
740
741
def dataclasses_to_dicts(data):
    """
    Converts a list of dataclass instances to a list of dictionaries.

    Parameters
    ----------
    data : List[Type[dataclass]]

    Returns
    -------
    list_dict : List[dict]

    Examples
    --------
    >>> from dataclasses import dataclass
    >>> @dataclass
    ... class Point:
    ...     x: int
    ...     y: int

    >>> dataclasses_to_dicts([Point(1, 2), Point(2, 3)])
    [{'x': 1, 'y': 2}, {'x': 2, 'y': 3}]

    """
    from dataclasses import asdict

    return [asdict(item) for item in data]
769
770
771# ---------------------------------------------------------------------
772# Conversion of Inputs to Arrays
773
774
def to_arrays(
    data, columns: Index | None, dtype: DtypeObj | None = None
) -> tuple[list[ArrayLike], Index]:
    """
    Return list of arrays, columns.

    Returns
    -------
    list[ArrayLike]
        These will become columns in a DataFrame.
    Index
        This will become frame.columns.

    Notes
    -----
    Ensures that len(result_arrays) == len(result_index).
    """
    if isinstance(data, ABCDataFrame):
        # see test_from_records_with_index_data, test_from_records_bad_index_column
        if columns is not None:
            # keep only the requested columns, in the frame's own order
            arrays = [
                data._ixs(i, axis=1)._values
                for i, col in enumerate(data.columns)
                if col in columns
            ]
        else:
            columns = data.columns
            arrays = [data._ixs(i, axis=1)._values for i in range(len(columns))]

        return arrays, columns

    if not len(data):
        # empty input; structured ndarrays still contribute column names
        if isinstance(data, np.ndarray):
            if data.dtype.names is not None:
                # i.e. numpy structured array
                columns = ensure_index(data.dtype.names)
                arrays = [data[name] for name in columns]

                if len(data) == 0:
                    # GH#42456 the indexing above results in list of 2D ndarrays
                    # TODO: is that an issue with numpy?
                    for i, arr in enumerate(arrays):
                        if arr.ndim == 2:
                            arrays[i] = arr[:, 0]

                return arrays, columns
        return [], ensure_index([])

    elif isinstance(data, np.ndarray) and data.dtype.names is not None:
        # e.g. recarray
        columns = Index(list(data.dtype.names))
        arrays = [data[k] for k in columns]
        return arrays, columns

    # dispatch on the type of the first row
    if isinstance(data[0], (list, tuple)):
        arr = _list_to_arrays(data)
    elif isinstance(data[0], abc.Mapping):
        arr, columns = _list_of_dict_to_arrays(data, columns)
    elif isinstance(data[0], ABCSeries):
        arr, columns = _list_of_series_to_arrays(data, columns)
    else:
        # last ditch effort
        data = [tuple(x) for x in data]
        arr = _list_to_arrays(data)

    content, columns = _finalize_columns_and_data(arr, columns, dtype)
    return content, columns
842
843
844def _list_to_arrays(data: list[tuple | list]) -> np.ndarray:
845 # Returned np.ndarray has ndim = 2
846 # Note: we already check len(data) > 0 before getting hre
847 if isinstance(data[0], tuple):
848 content = lib.to_object_array_tuples(data)
849 else:
850 # list of lists
851 content = lib.to_object_array(data)
852 return content
853
854
855def _list_of_series_to_arrays(
856 data: list,
857 columns: Index | None,
858) -> tuple[np.ndarray, Index]:
859 # returned np.ndarray has ndim == 2
860
861 if columns is None:
862 # We know pass_data is non-empty because data[0] is a Series
863 pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))]
864 columns = get_objs_combined_axis(pass_data, sort=False)
865
866 indexer_cache: dict[int, np.ndarray] = {}
867
868 aligned_values = []
869 for s in data:
870 index = getattr(s, "index", None)
871 if index is None:
872 index = default_index(len(s))
873
874 if id(index) in indexer_cache:
875 indexer = indexer_cache[id(index)]
876 else:
877 indexer = indexer_cache[id(index)] = index.get_indexer(columns)
878
879 values = extract_array(s, extract_numpy=True)
880 aligned_values.append(algorithms.take_nd(values, indexer))
881
882 content = np.vstack(aligned_values)
883 return content, columns
884
885
886def _list_of_dict_to_arrays(
887 data: list[dict],
888 columns: Index | None,
889) -> tuple[np.ndarray, Index]:
890 """
891 Convert list of dicts to numpy arrays
892
893 if `columns` is not passed, column names are inferred from the records
894 - for OrderedDict and dicts, the column names match
895 the key insertion-order from the first record to the last.
896 - For other kinds of dict-likes, the keys are lexically sorted.
897
898 Parameters
899 ----------
900 data : iterable
901 collection of records (OrderedDict, dict)
902 columns: iterables or None
903
904 Returns
905 -------
906 content : np.ndarray[object, ndim=2]
907 columns : Index
908 """
909 if columns is None:
910 gen = (list(x.keys()) for x in data)
911 sort = not any(isinstance(d, dict) for d in data)
912 pre_cols = lib.fast_unique_multiple_list_gen(gen, sort=sort)
913 columns = ensure_index(pre_cols)
914
915 # assure that they are of the base dict class and not of derived
916 # classes
917 data = [d if type(d) is dict else dict(d) for d in data]
918
919 content = lib.dicts_to_array(data, list(columns))
920 return content, columns
921
922
def _finalize_columns_and_data(
    content: np.ndarray,  # ndim == 2
    columns: Index | None,
    dtype: DtypeObj | None,
) -> tuple[list[ArrayLike], Index]:
    """
    Split a 2D array into per-column 1D arrays, validate (or derive) the
    column labels, and soft-cast object-dtype columns where possible.
    """
    col_arrays = list(content.T)

    try:
        columns = _validate_or_indexify_columns(col_arrays, columns)
    except AssertionError as err:
        # GH#26429 do not raise user-facing AssertionError
        raise ValueError(err) from err

    if len(col_arrays) and col_arrays[0].dtype == np.object_:
        col_arrays = convert_object_array(col_arrays, dtype=dtype)

    return col_arrays, columns
943
944
945def _validate_or_indexify_columns(
946 content: list[np.ndarray], columns: Index | None
947) -> Index:
948 """
949 If columns is None, make numbers as column names; Otherwise, validate that
950 columns have valid length.
951
952 Parameters
953 ----------
954 content : list of np.ndarrays
955 columns : Index or None
956
957 Returns
958 -------
959 Index
960 If columns is None, assign positional column index value as columns.
961
962 Raises
963 ------
964 1. AssertionError when content is not composed of list of lists, and if
965 length of columns is not equal to length of content.
966 2. ValueError when content is list of lists, but length of each sub-list
967 is not equal
968 3. ValueError when content is list of lists, but length of sub-list is
969 not equal to length of content
970 """
971 if columns is None:
972 columns = default_index(len(content))
973 else:
974 # Add mask for data which is composed of list of lists
975 is_mi_list = isinstance(columns, list) and all(
976 isinstance(col, list) for col in columns
977 )
978
979 if not is_mi_list and len(columns) != len(content): # pragma: no cover
980 # caller's responsibility to check for this...
981 raise AssertionError(
982 f"{len(columns)} columns passed, passed data had "
983 f"{len(content)} columns"
984 )
985 if is_mi_list:
986 # check if nested list column, length of each sub-list should be equal
987 if len({len(col) for col in columns}) > 1:
988 raise ValueError(
989 "Length of columns passed for MultiIndex columns is different"
990 )
991
992 # if columns is not empty and length of sublist is not equal to content
993 if columns and len(columns[0]) != len(content):
994 raise ValueError(
995 f"{len(columns[0])} columns passed, passed data had "
996 f"{len(content)} columns"
997 )
998 return columns
999
1000
def convert_object_array(
    content: list[npt.NDArray[np.object_]],
    dtype: DtypeObj | None,
    dtype_backend: str = "numpy",
    coerce_float: bool = False,
) -> list[ArrayLike]:
    """
    Internal function to convert object array.

    Parameters
    ----------
    content: List[np.ndarray]
        One object-dtype 1D array per column.
    dtype: np.dtype or ExtensionDtype
        Target dtype, or None to soft-infer per column.
    dtype_backend: Controls if nullable/pyarrow dtypes are returned.
    coerce_float: Cast floats that are integers to int.

    Returns
    -------
    List[ArrayLike]
        One converted array per input column.
    """
    # provide soft conversion of object dtypes

    def convert(arr):
        # skip inference entirely when object dtype was explicitly requested
        if dtype != np.dtype("O"):
            arr = lib.maybe_convert_objects(
                arr,
                try_float=coerce_float,
                convert_to_nullable_dtype=dtype_backend != "numpy",
            )
            # Notes on cases that get here 2023-02-15
            # 1) we DO get here when arr is all Timestamps and dtype=None
            # 2) disabling this doesn't break the world, so this must be
            #    getting caught at a higher level
            # 3) passing convert_datetime to maybe_convert_objects get this right
            # 4) convert_timedelta?

            if dtype is None:
                if arr.dtype == np.dtype("O"):
                    # i.e. maybe_convert_objects didn't convert
                    arr = maybe_infer_to_datetimelike(arr)
                    if dtype_backend != "numpy" and arr.dtype == np.dtype("O"):
                        # still object: fall back to the nullable string type
                        arr = StringDtype().construct_array_type()._from_sequence(arr)
                elif dtype_backend != "numpy" and isinstance(arr, np.ndarray):
                    # wrap plain numpy results in their masked counterparts
                    if is_integer_dtype(arr.dtype):
                        arr = IntegerArray(arr, np.zeros(arr.shape, dtype=np.bool_))
                    elif is_bool_dtype(arr.dtype):
                        arr = BooleanArray(arr, np.zeros(arr.shape, dtype=np.bool_))
                    elif is_float_dtype(arr.dtype):
                        arr = FloatingArray(arr, np.isnan(arr))

            elif isinstance(dtype, ExtensionDtype):
                # TODO: test(s) that get here
                # TODO: try to de-duplicate this convert function with
                #  core.construction functions
                cls = dtype.construct_array_type()
                arr = cls._from_sequence(arr, dtype=dtype, copy=False)
            elif dtype.kind in ["m", "M"]:
                # This restriction is harmless bc these are the only cases
                #  where maybe_cast_to_datetime is not a no-op.
                # Here we know:
                #  1) dtype.kind in ["m", "M"] and
                #  2) arr is either object or numeric dtype
                arr = maybe_cast_to_datetime(arr, dtype)

        return arr

    arrays = [convert(arr) for arr in content]

    return arrays