1"""
2Functions for preparing various inputs passed to the DataFrame or Series
3constructors before passing them to a BlockManager.
4"""
5from __future__ import annotations
6
7from collections import abc
8from typing import (
9 TYPE_CHECKING,
10 Any,
11)
12
13import numpy as np
14from numpy import ma
15
16from pandas._config import using_pyarrow_string_dtype
17
18from pandas._libs import lib
19
20from pandas.core.dtypes.astype import astype_is_view
21from pandas.core.dtypes.cast import (
22 construct_1d_arraylike_from_scalar,
23 dict_compat,
24 maybe_cast_to_datetime,
25 maybe_convert_platform,
26 maybe_infer_to_datetimelike,
27)
28from pandas.core.dtypes.common import (
29 is_1d_only_ea_dtype,
30 is_integer_dtype,
31 is_list_like,
32 is_named_tuple,
33 is_object_dtype,
34)
35from pandas.core.dtypes.dtypes import ExtensionDtype
36from pandas.core.dtypes.generic import (
37 ABCDataFrame,
38 ABCSeries,
39)
40
41from pandas.core import (
42 algorithms,
43 common as com,
44)
45from pandas.core.arrays import ExtensionArray
46from pandas.core.arrays.string_ import StringDtype
47from pandas.core.construction import (
48 array as pd_array,
49 ensure_wrapped_if_datetimelike,
50 extract_array,
51 range_to_ndarray,
52 sanitize_array,
53)
54from pandas.core.indexes.api import (
55 DatetimeIndex,
56 Index,
57 TimedeltaIndex,
58 default_index,
59 ensure_index,
60 get_objs_combined_axis,
61 union_indexes,
62)
63from pandas.core.internals.array_manager import (
64 ArrayManager,
65 SingleArrayManager,
66)
67from pandas.core.internals.blocks import (
68 BlockPlacement,
69 ensure_block_shape,
70 new_block,
71 new_block_2d,
72)
73from pandas.core.internals.managers import (
74 BlockManager,
75 SingleBlockManager,
76 create_block_manager_from_blocks,
77 create_block_manager_from_column_arrays,
78)
79
80if TYPE_CHECKING:
81 from collections.abc import (
82 Hashable,
83 Sequence,
84 )
85
86 from pandas._typing import (
87 ArrayLike,
88 DtypeObj,
89 Manager,
90 npt,
91 )
92# ---------------------------------------------------------------------
93# BlockManager Interface
94
95
96def arrays_to_mgr(
97 arrays,
98 columns: Index,
99 index,
100 *,
101 dtype: DtypeObj | None = None,
102 verify_integrity: bool = True,
103 typ: str | None = None,
104 consolidate: bool = True,
105) -> Manager:
106 """
107 Segregate Series based on type and coerce into matrices.
108
109 Needs to handle a lot of exceptional cases.
110 """
111 if verify_integrity:
112 # figure out the index, if necessary
113 if index is None:
114 index = _extract_index(arrays)
115 else:
116 index = ensure_index(index)
117
118 # don't force copy because getting jammed in an ndarray anyway
119 arrays, refs = _homogenize(arrays, index, dtype)
120 # _homogenize ensures
121 # - all(len(x) == len(index) for x in arrays)
122 # - all(x.ndim == 1 for x in arrays)
123 # - all(isinstance(x, (np.ndarray, ExtensionArray)) for x in arrays)
124 # - all(type(x) is not NumpyExtensionArray for x in arrays)
125
126 else:
127 index = ensure_index(index)
128 arrays = [extract_array(x, extract_numpy=True) for x in arrays]
129 # with _from_arrays, the passed arrays should never be Series objects
130 refs = [None] * len(arrays)
131
132 # Reached via DataFrame._from_arrays; we do minimal validation here
133 for arr in arrays:
134 if (
135 not isinstance(arr, (np.ndarray, ExtensionArray))
136 or arr.ndim != 1
137 or len(arr) != len(index)
138 ):
139 raise ValueError(
140 "Arrays must be 1-dimensional np.ndarray or ExtensionArray "
141 "with length matching len(index)"
142 )
143
144 columns = ensure_index(columns)
145 if len(columns) != len(arrays):
146 raise ValueError("len(arrays) must match len(columns)")
147
148 # from BlockManager perspective
149 axes = [columns, index]
150
151 if typ == "block":
152 return create_block_manager_from_column_arrays(
153 arrays, axes, consolidate=consolidate, refs=refs
154 )
155 elif typ == "array":
156 return ArrayManager(arrays, [index, columns])
157 else:
158 raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
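

# Illustrative sketch (added for exposition, not part of the pandas source):
# arrays_to_mgr is the common funnel for the column-wise constructor paths.
# Assuming this module's namespace, a call looks roughly like:
#
#   arrs = [np.array([1, 2, 3]), np.array(["a", "b", "c"], dtype=object)]
#   mgr = arrays_to_mgr(arrs, Index(["x", "y"]), None, typ="block")
#   # index=None is inferred via _extract_index -> RangeIndex(3); the result
#   # is a BlockManager whose axes are [columns, index].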


def rec_array_to_mgr(
    data: np.rec.recarray | np.ndarray,
    index,
    columns,
    dtype: DtypeObj | None,
    copy: bool,
    typ: str,
) -> Manager:
    """
    Extract from a masked rec array and create the manager.
    """
    # essentially process a record array then fill it
    fdata = ma.getdata(data)
    if index is None:
        index = default_index(len(fdata))
    else:
        index = ensure_index(index)

    if columns is not None:
        columns = ensure_index(columns)
    arrays, arr_columns = to_arrays(fdata, columns)

    # create the manager

    arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, len(index))
    if columns is None:
        columns = arr_columns

    mgr = arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ)

    if copy:
        mgr = mgr.copy()
    return mgr


def mgr_to_mgr(mgr, typ: str, copy: bool = True) -> Manager:
    """
    Convert to specific type of Manager. Does not copy if the type is already
    correct. Does not guarantee a copy otherwise. `copy` keyword only controls
    whether conversion from Block->ArrayManager copies the 1D arrays.
    """
    new_mgr: Manager

    if typ == "block":
        if isinstance(mgr, BlockManager):
            new_mgr = mgr
        else:
            if mgr.ndim == 2:
                new_mgr = arrays_to_mgr(
                    mgr.arrays, mgr.axes[0], mgr.axes[1], typ="block"
                )
            else:
                new_mgr = SingleBlockManager.from_array(mgr.arrays[0], mgr.index)
    elif typ == "array":
        if isinstance(mgr, ArrayManager):
            new_mgr = mgr
        else:
            if mgr.ndim == 2:
                arrays = [mgr.iget_values(i) for i in range(len(mgr.axes[0]))]
                if copy:
                    arrays = [arr.copy() for arr in arrays]
                new_mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]])
            else:
                array = mgr.internal_values()
                if copy:
                    array = array.copy()
                new_mgr = SingleArrayManager([array], [mgr.index])
    else:
        raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
    return new_mgr
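

# Illustrative sketch (added for exposition, not part of the pandas source):
# converting the manager backing an existing DataFrame. ``df._mgr`` is
# internal API, so this only shows the call shape, assuming
# ``import pandas as pd``:
#
#   df = pd.DataFrame({"a": [1, 2]})
#   amgr = mgr_to_mgr(df._mgr, typ="array")  # BlockManager -> ArrayManager
#   bmgr = mgr_to_mgr(amgr, typ="block")     # back again; no-op if already block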


# ---------------------------------------------------------------------
# DataFrame Constructor Interface


def ndarray_to_mgr(
    values, index, columns, dtype: DtypeObj | None, copy: bool, typ: str
) -> Manager:
    # used in DataFrame.__init__
    # input must be a ndarray, list, Series, Index, ExtensionArray

    if isinstance(values, ABCSeries):
        if columns is None:
            if values.name is not None:
                columns = Index([values.name])
        if index is None:
            index = values.index
        else:
            values = values.reindex(index)

        # zero len case (GH #2234)
        if not len(values) and columns is not None and len(columns):
            values = np.empty((0, 1), dtype=object)

    # if the array preparation does a copy -> avoid this for ArrayManager,
    # since the copy is done on conversion to 1D arrays
    copy_on_sanitize = False if typ == "array" else copy

    vdtype = getattr(values, "dtype", None)
    refs = None
    if is_1d_only_ea_dtype(vdtype) or is_1d_only_ea_dtype(dtype):
        # GH#19157

        if isinstance(values, (np.ndarray, ExtensionArray)) and values.ndim > 1:
            # GH#12513 a EA dtype passed with a 2D array, split into
            # multiple EAs that view the values
            # error: No overload variant of "__getitem__" of "ExtensionArray"
            # matches argument type "Tuple[slice, int]"
            values = [
                values[:, n]  # type: ignore[call-overload]
                for n in range(values.shape[1])
            ]
        else:
            values = [values]

        if columns is None:
            columns = Index(range(len(values)))
        else:
            columns = ensure_index(columns)

        return arrays_to_mgr(values, columns, index, dtype=dtype, typ=typ)

    elif isinstance(vdtype, ExtensionDtype):
        # i.e. Datetime64TZ, PeriodDtype; cases with is_1d_only_ea_dtype(vdtype)
        # are already caught above
        values = extract_array(values, extract_numpy=True)
        if copy:
            values = values.copy()
        if values.ndim == 1:
            values = values.reshape(-1, 1)

    elif isinstance(values, (ABCSeries, Index)):
        if not copy_on_sanitize and (
            dtype is None or astype_is_view(values.dtype, dtype)
        ):
            refs = values._references

        if copy_on_sanitize:
            values = values._values.copy()
        else:
            values = values._values

        values = _ensure_2d(values)

    elif isinstance(values, (np.ndarray, ExtensionArray)):
        # drop subclass info
        _copy = (
            copy_on_sanitize
            if (dtype is None or astype_is_view(values.dtype, dtype))
            else False
        )
        values = np.array(values, copy=_copy)
        values = _ensure_2d(values)

    else:
        # by definition an array here
        # the dtypes will be coerced to a single dtype
        values = _prep_ndarraylike(values, copy=copy_on_sanitize)

    if dtype is not None and values.dtype != dtype:
        # GH#40110 see similar check inside sanitize_array
        values = sanitize_array(
            values,
            None,
            dtype=dtype,
            copy=copy_on_sanitize,
            allow_2d=True,
        )

    # _prep_ndarraylike ensures that values.ndim == 2 at this point
    index, columns = _get_axes(
        values.shape[0], values.shape[1], index=index, columns=columns
    )

    _check_values_indices_shape_match(values, index, columns)

    if typ == "array":
        if issubclass(values.dtype.type, str):
            values = np.array(values, dtype=object)

        if dtype is None and is_object_dtype(values.dtype):
            arrays = [
                ensure_wrapped_if_datetimelike(
                    maybe_infer_to_datetimelike(values[:, i])
                )
                for i in range(values.shape[1])
            ]
        else:
            if lib.is_np_dtype(values.dtype, "mM"):
                values = ensure_wrapped_if_datetimelike(values)
            arrays = [values[:, i] for i in range(values.shape[1])]

        if copy:
            arrays = [arr.copy() for arr in arrays]

        return ArrayManager(arrays, [index, columns], verify_integrity=False)

    values = values.T

    # if we don't have a dtype specified, then try to convert objects
    # on the entire block; this is to convert if we have datetimelike's
    # embedded in an object type
    if dtype is None and is_object_dtype(values.dtype):
        obj_columns = list(values)
        maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns]
        # don't convert (and copy) the objects if no type inference occurs
        if any(x is not y for x, y in zip(obj_columns, maybe_datetime)):
            dvals_list = [ensure_block_shape(dval, 2) for dval in maybe_datetime]
            block_values = [
                new_block_2d(dvals_list[n], placement=BlockPlacement(n))
                for n in range(len(dvals_list))
            ]
        else:
            bp = BlockPlacement(slice(len(columns)))
            nb = new_block_2d(values, placement=bp, refs=refs)
            block_values = [nb]
    elif dtype is None and values.dtype.kind == "U" and using_pyarrow_string_dtype():
        dtype = StringDtype(storage="pyarrow_numpy")

        obj_columns = list(values)
        block_values = [
            new_block(
                dtype.construct_array_type()._from_sequence(data, dtype=dtype),
                BlockPlacement(slice(i, i + 1)),
                ndim=2,
            )
            for i, data in enumerate(obj_columns)
        ]

    else:
        bp = BlockPlacement(slice(len(columns)))
        nb = new_block_2d(values, placement=bp, refs=refs)
        block_values = [nb]

    if len(columns) == 0:
        # TODO: check len(values) == 0?
        block_values = []

    return create_block_manager_from_blocks(
        block_values, [columns, index], verify_integrity=False
    )
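

# Illustrative sketch (added for exposition, not part of the pandas source):
# the 2D-ndarray constructor path lands here. Assuming ``import pandas as pd``
# and ``import numpy as np``:
#
#   arr = np.array([[1, 2], [3, 4]])
#   df = pd.DataFrame(arr, columns=["a", "b"])
#   # the homogeneous values are transposed and wrapped in a single 2D block;
#   # passing dtype="float64" instead would route through sanitize_array above.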


def _check_values_indices_shape_match(
    values: np.ndarray, index: Index, columns: Index
) -> None:
    """
    Check that the shape implied by our axes matches the actual shape of the
    data.
    """
    if values.shape[1] != len(columns) or values.shape[0] != len(index):
        # Could let this raise in Block constructor, but we get a more
        # helpful exception message this way.
        if values.shape[0] == 0 < len(index):
            raise ValueError("Empty data passed with indices specified.")

        passed = values.shape
        implied = (len(index), len(columns))
        raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")
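

# Illustrative examples (added for exposition, not part of the pandas source):
# the two errors above surface directly in the DataFrame constructor:
#
#   pd.DataFrame(np.empty((0, 2)), index=[1, 2])
#   # ValueError: Empty data passed with indices specified.
#
#   pd.DataFrame(np.ones((2, 2)), index=[1, 2, 3])
#   # ValueError: Shape of passed values is (2, 2), indices imply (3, 2)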


def dict_to_mgr(
    data: dict,
    index,
    columns,
    *,
    dtype: DtypeObj | None = None,
    typ: str = "block",
    copy: bool = True,
) -> Manager:
    """
    Segregate Series based on type and coerce into matrices.
    Needs to handle a lot of exceptional cases.

    Used in DataFrame.__init__
    """
    arrays: Sequence[Any] | Series

    if columns is not None:
        from pandas.core.series import Series

        arrays = Series(data, index=columns, dtype=object)
        missing = arrays.isna()
        if index is None:
            # GH10856
            # raise ValueError if only scalars in dict
            index = _extract_index(arrays[~missing])
        else:
            index = ensure_index(index)

        # no obvious "empty" int column
        if missing.any() and not is_integer_dtype(dtype):
            nan_dtype: DtypeObj

            if dtype is not None:
                # calling sanitize_array ensures we don't mix-and-match
                # NA dtypes
                midxs = missing.values.nonzero()[0]
                for i in midxs:
                    arr = sanitize_array(arrays.iat[i], index, dtype=dtype)
                    arrays.iat[i] = arr
            else:
                # GH#1783
                nan_dtype = np.dtype("object")
                val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype)
                nmissing = missing.sum()
                if copy:
                    rhs = [val] * nmissing
                else:
                    # GH#45369
                    rhs = [val.copy() for _ in range(nmissing)]
                arrays.loc[missing] = rhs

        arrays = list(arrays)
        columns = ensure_index(columns)

    else:
        keys = list(data.keys())
        columns = Index(keys) if keys else default_index(0)
        arrays = [com.maybe_iterable_to_list(data[k]) for k in keys]

    if copy:
        if typ == "block":
            # We only need to copy arrays that will not get consolidated, i.e.
            # only EA arrays
            arrays = [
                x.copy()
                if isinstance(x, ExtensionArray)
                else x.copy(deep=True)
                if (
                    isinstance(x, Index)
                    or isinstance(x, ABCSeries)
                    and is_1d_only_ea_dtype(x.dtype)
                )
                else x
                for x in arrays
            ]
        else:
            # dtype check to exclude e.g. range objects, scalars
            arrays = [x.copy() if hasattr(x, "dtype") else x for x in arrays]

    return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
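

# Illustrative sketch (added for exposition, not part of the pandas source):
# requesting a column that is missing from the dict exercises the GH#1783
# branch above and yields an all-NaN object column:
#
#   pd.DataFrame({"a": [1, 2]}, columns=["a", "b"])
#   # column "a" -> [1, 2]; column "b" -> [NaN, NaN] with object dtype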


def nested_data_to_arrays(
    data: Sequence,
    columns: Index | None,
    index: Index | None,
    dtype: DtypeObj | None,
) -> tuple[list[ArrayLike], Index, Index]:
    """
    Convert a single sequence of arrays to multiple arrays.
    """
    # By the time we get here we have already checked treat_as_nested(data)

    if is_named_tuple(data[0]) and columns is None:
        columns = ensure_index(data[0]._fields)

    arrays, columns = to_arrays(data, columns, dtype=dtype)
    columns = ensure_index(columns)

    if index is None:
        if isinstance(data[0], ABCSeries):
            index = _get_names_from_index(data)
        else:
            index = default_index(len(data))

    return arrays, columns, index


def treat_as_nested(data) -> bool:
    """
    Check if we should use nested_data_to_arrays.
    """
    return (
        len(data) > 0
        and is_list_like(data[0])
        and getattr(data[0], "ndim", 1) == 1
        and not (isinstance(data, ExtensionArray) and data.ndim == 2)
    )
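

# Illustrative examples (added for exposition, not part of the pandas source):
# what counts as "nested" for constructor dispatch:
#
#   treat_as_nested([[1, 2], [3, 4]])    # list of lists        -> True
#   treat_as_nested([1, 2, 3])           # flat list of scalars -> False
#   treat_as_nested([np.array([1, 2])])  # list of 1D arrays    -> True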


# ---------------------------------------------------------------------


def _prep_ndarraylike(values, copy: bool = True) -> np.ndarray:
    # values is specifically _not_ ndarray, EA, Index, or Series
    # We only get here with `not treat_as_nested(values)`

    if len(values) == 0:
        # TODO: check for length-zero range, in which case return int64 dtype?
        # TODO: reuse anything in try_cast?
        return np.empty((0, 0), dtype=object)
    elif isinstance(values, range):
        arr = range_to_ndarray(values)
        return arr[..., np.newaxis]

    def convert(v):
        if not is_list_like(v) or isinstance(v, ABCDataFrame):
            return v

        v = extract_array(v, extract_numpy=True)
        res = maybe_convert_platform(v)
        # We don't do maybe_infer_to_datetimelike here bc we will end up doing
        # it column-by-column in ndarray_to_mgr
        return res

    # we could have a 1-dim or 2-dim list here
    # this is equiv of np.asarray, but does object conversion
    # and platform dtype preservation
    # does not convert e.g. [1, "a", True] to ["1", "a", "True"] like
    # np.asarray would
    if is_list_like(values[0]):
        values = np.array([convert(v) for v in values])
    elif isinstance(values[0], np.ndarray) and values[0].ndim == 0:
        # GH#21861 see test_constructor_list_of_lists
        values = np.array([convert(v) for v in values])
    else:
        values = convert(values)

    return _ensure_2d(values)


def _ensure_2d(values: np.ndarray) -> np.ndarray:
    """
    Reshape 1D values, raise on anything other than 2D.
    """
    if values.ndim == 1:
        values = values.reshape((values.shape[0], 1))
    elif values.ndim != 2:
        raise ValueError(f"Must pass 2-d input. shape={values.shape}")
    return values
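

# Illustrative examples (added for exposition, not part of the pandas source):
#
#   _ensure_2d(np.array([1, 2, 3])).shape  # -> (3, 1)
#   _ensure_2d(np.ones((2, 2))).shape      # -> (2, 2), unchanged
#   _ensure_2d(np.ones((2, 2, 2)))         # -> ValueError: Must pass 2-d input...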


def _homogenize(
    data, index: Index, dtype: DtypeObj | None
) -> tuple[list[ArrayLike], list[Any]]:
    oindex = None
    homogenized = []
    # if the original array-like in `data` is a Series, keep track of this Series' refs
    refs: list[Any] = []

    for val in data:
        if isinstance(val, (ABCSeries, Index)):
            if dtype is not None:
                val = val.astype(dtype, copy=False)
            if isinstance(val, ABCSeries) and val.index is not index:
                # Forces alignment. No need to copy data since we
                # are putting it into an ndarray later
                val = val.reindex(index, copy=False)
            refs.append(val._references)
            val = val._values
        else:
            if isinstance(val, dict):
                # GH#41785 this _should_ be equivalent to (but faster than)
                # val = Series(val, index=index)._values
                if oindex is None:
                    oindex = index.astype("O")

                if isinstance(index, (DatetimeIndex, TimedeltaIndex)):
                    # see test_constructor_dict_datetime64_index
                    val = dict_compat(val)
                else:
                    # see test_constructor_subclass_dict
                    val = dict(val)
                val = lib.fast_multiget(val, oindex._values, default=np.nan)

            val = sanitize_array(val, index, dtype=dtype, copy=False)
            com.require_length_match(val, index)
            refs.append(None)

        homogenized.append(val)

    return homogenized, refs


def _extract_index(data) -> Index:
    """
    Try to infer an Index from the passed data, raise ValueError on failure.
    """
    index: Index
    if len(data) == 0:
        return default_index(0)

    raw_lengths = []
    indexes: list[list[Hashable] | Index] = []

    have_raw_arrays = False
    have_series = False
    have_dicts = False

    for val in data:
        if isinstance(val, ABCSeries):
            have_series = True
            indexes.append(val.index)
        elif isinstance(val, dict):
            have_dicts = True
            indexes.append(list(val.keys()))
        elif is_list_like(val) and getattr(val, "ndim", 1) == 1:
            have_raw_arrays = True
            raw_lengths.append(len(val))
        elif isinstance(val, np.ndarray) and val.ndim > 1:
            raise ValueError("Per-column arrays must each be 1-dimensional")

    if not indexes and not raw_lengths:
        raise ValueError("If using all scalar values, you must pass an index")

    if have_series:
        index = union_indexes(indexes)
    elif have_dicts:
        index = union_indexes(indexes, sort=False)

    if have_raw_arrays:
        lengths = list(set(raw_lengths))
        if len(lengths) > 1:
            raise ValueError("All arrays must be of the same length")

        if have_dicts:
            raise ValueError(
                "Mixing dicts with non-Series may lead to ambiguous ordering."
            )

        if have_series:
            if lengths[0] != len(index):
                msg = (
                    f"array length {lengths[0]} does not match index "
                    f"length {len(index)}"
                )
                raise ValueError(msg)
        else:
            index = default_index(lengths[0])

    return ensure_index(index)
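

# Illustrative examples (added for exposition, not part of the pandas source):
# the error cases enforced above, as seen from the DataFrame constructor:
#
#   pd.DataFrame({"a": 1, "b": 2})
#   # ValueError: If using all scalar values, you must pass an index
#
#   pd.DataFrame({"a": [1, 2], "b": [1, 2, 3]})
#   # ValueError: All arrays must be of the same length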


def reorder_arrays(
    arrays: list[ArrayLike], arr_columns: Index, columns: Index | None, length: int
) -> tuple[list[ArrayLike], Index]:
    """
    Pre-emptively (cheaply) reindex arrays with new columns.
    """
    # reorder according to the columns
    if columns is not None:
        if not columns.equals(arr_columns):
            # if they are equal, there is nothing to do
            new_arrays: list[ArrayLike] = []
            indexer = arr_columns.get_indexer(columns)
            for i, k in enumerate(indexer):
                if k == -1:
                    # by convention default is all-NaN object dtype
                    arr = np.empty(length, dtype=object)
                    arr.fill(np.nan)
                else:
                    arr = arrays[k]
                new_arrays.append(arr)

            arrays = new_arrays
            arr_columns = columns

    return arrays, arr_columns


def _get_names_from_index(data) -> Index:
    has_some_name = any(getattr(s, "name", None) is not None for s in data)
    if not has_some_name:
        return default_index(len(data))

    index: list[Hashable] = list(range(len(data)))
    count = 0
    for i, s in enumerate(data):
        n = getattr(s, "name", None)
        if n is not None:
            index[i] = n
        else:
            index[i] = f"Unnamed {count}"
            count += 1

    return Index(index)
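

# Illustrative sketch (added for exposition, not part of the pandas source):
# when building a frame from a list of Series, the Series names become the
# row index, with placeholders for unnamed entries:
#
#   data = [pd.Series([1, 2], name="x"), pd.Series([3, 4])]
#   _get_names_from_index(data)  # -> Index(["x", "Unnamed 0"])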


def _get_axes(
    N: int, K: int, index: Index | None, columns: Index | None
) -> tuple[Index, Index]:
    # helper to create the axes as indexes
    # return axes or defaults

    if index is None:
        index = default_index(N)
    else:
        index = ensure_index(index)

    if columns is None:
        columns = default_index(K)
    else:
        columns = ensure_index(columns)
    return index, columns


def dataclasses_to_dicts(data):
    """
    Converts a list of dataclass instances to a list of dictionaries.

    Parameters
    ----------
    data : List[Type[dataclass]]

    Returns
    -------
    list_dict : List[dict]

    Examples
    --------
    >>> from dataclasses import dataclass
    >>> @dataclass
    ... class Point:
    ...     x: int
    ...     y: int

    >>> dataclasses_to_dicts([Point(1, 2), Point(2, 3)])
    [{'x': 1, 'y': 2}, {'x': 2, 'y': 3}]

    """
    from dataclasses import asdict

    return list(map(asdict, data))


# ---------------------------------------------------------------------
# Conversion of Inputs to Arrays


def to_arrays(
    data, columns: Index | None, dtype: DtypeObj | None = None
) -> tuple[list[ArrayLike], Index]:
    """
    Return list of arrays, columns.

    Returns
    -------
    list[ArrayLike]
        These will become columns in a DataFrame.
    Index
        This will become frame.columns.

    Notes
    -----
    Ensures that len(result_arrays) == len(result_index).
    """

    if not len(data):
        if isinstance(data, np.ndarray):
            if data.dtype.names is not None:
                # i.e. numpy structured array
                columns = ensure_index(data.dtype.names)
                arrays = [data[name] for name in columns]

                if len(data) == 0:
                    # GH#42456 the indexing above results in list of 2D ndarrays
                    # TODO: is that an issue with numpy?
                    for i, arr in enumerate(arrays):
                        if arr.ndim == 2:
                            arrays[i] = arr[:, 0]

                return arrays, columns
        return [], ensure_index([])

    elif isinstance(data, np.ndarray) and data.dtype.names is not None:
        # e.g. recarray
        columns = Index(list(data.dtype.names))
        arrays = [data[k] for k in columns]
        return arrays, columns

    if isinstance(data[0], (list, tuple)):
        arr = _list_to_arrays(data)
    elif isinstance(data[0], abc.Mapping):
        arr, columns = _list_of_dict_to_arrays(data, columns)
    elif isinstance(data[0], ABCSeries):
        arr, columns = _list_of_series_to_arrays(data, columns)
    else:
        # last ditch effort
        data = [tuple(x) for x in data]
        arr = _list_to_arrays(data)

    content, columns = _finalize_columns_and_data(arr, columns, dtype)
    return content, columns


def _list_to_arrays(data: list[tuple | list]) -> np.ndarray:
    # Returned np.ndarray has ndim = 2
    # Note: we already check len(data) > 0 before getting here
    if isinstance(data[0], tuple):
        content = lib.to_object_array_tuples(data)
    else:
        # list of lists
        content = lib.to_object_array(data)
    return content


def _list_of_series_to_arrays(
    data: list,
    columns: Index | None,
) -> tuple[np.ndarray, Index]:
    # returned np.ndarray has ndim == 2

    if columns is None:
        # We know pass_data is non-empty because data[0] is a Series
        pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))]
        columns = get_objs_combined_axis(pass_data, sort=False)

    indexer_cache: dict[int, np.ndarray] = {}

    aligned_values = []
    for s in data:
        index = getattr(s, "index", None)
        if index is None:
            index = default_index(len(s))

        if id(index) in indexer_cache:
            indexer = indexer_cache[id(index)]
        else:
            indexer = indexer_cache[id(index)] = index.get_indexer(columns)

        values = extract_array(s, extract_numpy=True)
        aligned_values.append(algorithms.take_nd(values, indexer))

    content = np.vstack(aligned_values)
    return content, columns


def _list_of_dict_to_arrays(
    data: list[dict],
    columns: Index | None,
) -> tuple[np.ndarray, Index]:
    """
    Convert list of dicts to numpy arrays.

    If `columns` is not passed, column names are inferred from the records:
    - For OrderedDict and dicts, the column names match
      the key insertion-order from the first record to the last.
    - For other kinds of dict-likes, the keys are lexically sorted.

    Parameters
    ----------
    data : iterable
        collection of records (OrderedDict, dict)
    columns : Index or None

    Returns
    -------
    content : np.ndarray[object, ndim=2]
    columns : Index
    """
    if columns is None:
        gen = (list(x.keys()) for x in data)
        sort = not any(isinstance(d, dict) for d in data)
        pre_cols = lib.fast_unique_multiple_list_gen(gen, sort=sort)
        columns = ensure_index(pre_cols)

    # assure that they are of the base dict class and not of derived
    # classes
    data = [d if type(d) is dict else dict(d) for d in data]  # noqa: E721

    content = lib.dicts_to_array(data, list(columns))
    return content, columns
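

# Illustrative sketch (added for exposition, not part of the pandas source):
# with plain dicts, inferred columns preserve first-seen key order (sorting
# only applies to non-dict mappings):
#
#   recs = [{"b": 1, "a": 2}, {"a": 3, "c": 4}]
#   _, cols = _list_of_dict_to_arrays(recs, None)
#   # cols -> Index(["b", "a", "c"])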


def _finalize_columns_and_data(
    content: np.ndarray,  # ndim == 2
    columns: Index | None,
    dtype: DtypeObj | None,
) -> tuple[list[ArrayLike], Index]:
    """
    Ensure we have valid columns, cast object dtypes if possible.
    """
    contents = list(content.T)

    try:
        columns = _validate_or_indexify_columns(contents, columns)
    except AssertionError as err:
        # GH#26429 do not raise user-facing AssertionError
        raise ValueError(err) from err

    if len(contents) and contents[0].dtype == np.object_:
        contents = convert_object_array(contents, dtype=dtype)

    return contents, columns


def _validate_or_indexify_columns(
    content: list[np.ndarray], columns: Index | None
) -> Index:
    """
    If columns is None, assign positional integer column names; otherwise,
    validate that columns have the correct length.

    Parameters
    ----------
    content : list of np.ndarrays
    columns : Index or None

    Returns
    -------
    Index
        If columns is None, a default positional index is assigned as columns.

    Raises
    ------
    1. AssertionError when content is not composed of list of lists, and
       length of columns is not equal to length of content.
    2. ValueError when content is list of lists, but length of each sub-list
       is not equal.
    3. ValueError when content is list of lists, but length of sub-list is
       not equal to length of content.
    """
    if columns is None:
        columns = default_index(len(content))
    else:
        # Add mask for data which is composed of list of lists
        is_mi_list = isinstance(columns, list) and all(
            isinstance(col, list) for col in columns
        )

        if not is_mi_list and len(columns) != len(content):  # pragma: no cover
            # caller's responsibility to check for this...
            raise AssertionError(
                f"{len(columns)} columns passed, passed data had "
                f"{len(content)} columns"
            )
        if is_mi_list:
            # check if nested list column, length of each sub-list should be equal
            if len({len(col) for col in columns}) > 1:
                raise ValueError(
                    "Length of columns passed for MultiIndex columns is different"
                )

            # if columns is not empty and length of sublist is not equal to content
            if columns and len(columns[0]) != len(content):
                raise ValueError(
                    f"{len(columns[0])} columns passed, passed data had "
                    f"{len(content)} columns"
                )
    return columns
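

# Illustrative sketch (added for exposition, not part of the pandas source):
# the AssertionError above is translated into a user-facing ValueError by
# _finalize_columns_and_data (GH#26429):
#
#   pd.DataFrame([[1, 2]], columns=["a"])
#   # ValueError: 1 columns passed, passed data had 2 columns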


def convert_object_array(
    content: list[npt.NDArray[np.object_]],
    dtype: DtypeObj | None,
    dtype_backend: str = "numpy",
    coerce_float: bool = False,
) -> list[ArrayLike]:
    """
    Internal function to convert object array.

    Parameters
    ----------
    content : List[np.ndarray]
    dtype : np.dtype or ExtensionDtype
    dtype_backend : Controls if nullable/pyarrow dtypes are returned.
    coerce_float : Cast floats that are integers to int.

    Returns
    -------
    List[ArrayLike]
    """
    # provide soft conversion of object dtypes

    def convert(arr):
        if dtype != np.dtype("O"):
            arr = lib.maybe_convert_objects(
                arr,
                try_float=coerce_float,
                convert_to_nullable_dtype=dtype_backend != "numpy",
            )
            # Notes on cases that get here 2023-02-15
            # 1) we DO get here when arr is all Timestamps and dtype=None
            # 2) disabling this doesn't break the world, so this must be
            #    getting caught at a higher level
            # 3) passing convert_non_numeric to maybe_convert_objects gets
            #    this right
            # 4) convert_non_numeric?

        if dtype is None:
            if arr.dtype == np.dtype("O"):
                # i.e. maybe_convert_objects didn't convert
                arr = maybe_infer_to_datetimelike(arr)
                if dtype_backend != "numpy" and arr.dtype == np.dtype("O"):
                    new_dtype = StringDtype()
                    arr_cls = new_dtype.construct_array_type()
                    arr = arr_cls._from_sequence(arr, dtype=new_dtype)
            elif dtype_backend != "numpy" and isinstance(arr, np.ndarray):
                if arr.dtype.kind in "iufb":
                    arr = pd_array(arr, copy=False)

        elif isinstance(dtype, ExtensionDtype):
            # TODO: test(s) that get here
            # TODO: try to de-duplicate this convert function with
            # core.construction functions
            cls = dtype.construct_array_type()
            arr = cls._from_sequence(arr, dtype=dtype, copy=False)
        elif dtype.kind in "mM":
            # This restriction is harmless bc these are the only cases
            # where maybe_cast_to_datetime is not a no-op.
            # Here we know:
            # 1) dtype.kind in "mM" and
            # 2) arr is either object or numeric dtype
            arr = maybe_cast_to_datetime(arr, dtype)

        return arr

    arrays = [convert(arr) for arr in content]

    return arrays
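

# Illustrative sketch (added for exposition, not part of the pandas source):
# soft conversion of object-dtype columns when no explicit dtype is given:
#
#   arrs = [np.array([1, 2, 3], dtype=object)]
#   convert_object_array(arrs, dtype=None)
#   # -> [array([1, 2, 3])] with int64 dtype, via lib.maybe_convert_objects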