1"""
2Functions for preparing various inputs passed to the DataFrame or Series
3constructors before passing them to a BlockManager.
4"""
5from __future__ import annotations
6
7from collections import abc
8from typing import (
9 Any,
10 Hashable,
11 Sequence,
12)
13
14import numpy as np
15from numpy import ma
16
17from pandas._libs import lib
18from pandas._typing import (
19 ArrayLike,
20 DtypeObj,
21 Manager,
22 npt,
23)
24
25from pandas.core.dtypes.astype import astype_is_view
26from pandas.core.dtypes.cast import (
27 construct_1d_arraylike_from_scalar,
28 dict_compat,
29 maybe_cast_to_datetime,
30 maybe_convert_platform,
31 maybe_infer_to_datetimelike,
32)
33from pandas.core.dtypes.common import (
34 is_1d_only_ea_dtype,
35 is_bool_dtype,
36 is_datetime_or_timedelta_dtype,
37 is_dtype_equal,
38 is_extension_array_dtype,
39 is_float_dtype,
40 is_integer_dtype,
41 is_list_like,
42 is_named_tuple,
43 is_object_dtype,
44)
45from pandas.core.dtypes.dtypes import ExtensionDtype
46from pandas.core.dtypes.generic import (
47 ABCDataFrame,
48 ABCSeries,
49)
50
51from pandas.core import (
52 algorithms,
53 common as com,
54)
55from pandas.core.arrays import (
56 BooleanArray,
57 ExtensionArray,
58 FloatingArray,
59 IntegerArray,
60)
61from pandas.core.arrays.string_ import StringDtype
62from pandas.core.construction import (
63 ensure_wrapped_if_datetimelike,
64 extract_array,
65 range_to_ndarray,
66 sanitize_array,
67)
68from pandas.core.indexes.api import (
69 DatetimeIndex,
70 Index,
71 TimedeltaIndex,
72 default_index,
73 ensure_index,
74 get_objs_combined_axis,
75 union_indexes,
76)
77from pandas.core.internals.array_manager import (
78 ArrayManager,
79 SingleArrayManager,
80)
81from pandas.core.internals.blocks import (
82 BlockPlacement,
83 ensure_block_shape,
84 new_block_2d,
85)
86from pandas.core.internals.managers import (
87 BlockManager,
88 SingleBlockManager,
89 create_block_manager_from_blocks,
90 create_block_manager_from_column_arrays,
91)
92
93# ---------------------------------------------------------------------
94# BlockManager Interface
95
96
def arrays_to_mgr(
    arrays,
    columns: Index,
    index,
    *,
    dtype: DtypeObj | None = None,
    verify_integrity: bool = True,
    typ: str | None = None,
    consolidate: bool = True,
) -> Manager:
    """
    Segregate Series based on type and coerce into matrices.

    Needs to handle a lot of exceptional cases.

    Parameters
    ----------
    arrays : sequence
        One array-like per column; may contain Series/dicts when
        ``verify_integrity=True`` (they get homogenized below).
    columns : Index
        Column labels; must have the same length as ``arrays``.
    index : Index or None
        Row labels; inferred from ``arrays`` when None.
    dtype : DtypeObj, optional
        Single dtype to coerce every column to.
    verify_integrity : bool, default True
        When False (the DataFrame._from_arrays path) the arrays are assumed
        to be pre-validated and only minimally checked.
    typ : {"block", "array"}
        Which Manager implementation to build.
    consolidate : bool, default True
        Forwarded to the BlockManager construction.

    Raises
    ------
    ValueError
        On malformed arrays, length mismatch with columns, or invalid ``typ``.
    """
    if verify_integrity:
        # figure out the index, if necessary
        if index is None:
            index = _extract_index(arrays)
        else:
            index = ensure_index(index)

        # don't force copy because getting jammed in an ndarray anyway
        # refs holds per-column CoW references for arrays that came from Series
        arrays, refs = _homogenize(arrays, index, dtype)
        # _homogenize ensures
        # - all(len(x) == len(index) for x in arrays)
        # - all(x.ndim == 1 for x in arrays)
        # - all(isinstance(x, (np.ndarray, ExtensionArray)) for x in arrays)
        # - all(type(x) is not PandasArray for x in arrays)

    else:
        index = ensure_index(index)
        arrays = [extract_array(x, extract_numpy=True) for x in arrays]
        # with _from_arrays, the passed arrays should never be Series objects
        refs = [None] * len(arrays)

        # Reached via DataFrame._from_arrays; we do minimal validation here
        for arr in arrays:
            if (
                not isinstance(arr, (np.ndarray, ExtensionArray))
                or arr.ndim != 1
                or len(arr) != len(index)
            ):
                raise ValueError(
                    "Arrays must be 1-dimensional np.ndarray or ExtensionArray "
                    "with length matching len(index)"
                )

    columns = ensure_index(columns)
    if len(columns) != len(arrays):
        raise ValueError("len(arrays) must match len(columns)")

    # from BlockManager perspective
    axes = [columns, index]

    if typ == "block":
        return create_block_manager_from_column_arrays(
            arrays, axes, consolidate=consolidate, refs=refs
        )
    elif typ == "array":
        return ArrayManager(arrays, [index, columns])
    else:
        raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
160
161
def rec_array_to_mgr(
    data: np.recarray | np.ndarray,
    index,
    columns,
    dtype: DtypeObj | None,
    copy: bool,
    typ: str,
) -> Manager:
    """
    Build a Manager from a (possibly masked) record array.

    The record array is decomposed into one 1D array per field, the fields
    are reordered to match ``columns`` when given, and the result is fed
    through ``arrays_to_mgr``.
    """
    # strip any mask; we only want the plain records here
    plain = ma.getdata(data)

    index = default_index(len(plain)) if index is None else ensure_index(index)

    if columns is not None:
        columns = ensure_index(columns)

    # split the record array into one array per field
    arrays, arr_columns = to_arrays(plain, columns)

    # cheaply reindex the field arrays against the requested columns
    arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, len(index))
    if columns is None:
        columns = arr_columns

    mgr = arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ)

    return mgr.copy() if copy else mgr
195
196
def mgr_to_mgr(mgr, typ: str, copy: bool = True):
    """
    Convert a Manager to the requested flavor ("block" or "array").

    No conversion (and no copy) happens when ``mgr`` already has the
    requested type; otherwise no copy is guaranteed.  The ``copy`` keyword
    only controls whether the Block->ArrayManager direction copies the
    underlying 1D arrays.
    """
    result: Manager

    if typ == "block":
        if isinstance(mgr, BlockManager):
            return mgr
        if mgr.ndim == 2:
            result = arrays_to_mgr(mgr.arrays, mgr.axes[0], mgr.axes[1], typ="block")
        else:
            result = SingleBlockManager.from_array(mgr.arrays[0], mgr.index)
    elif typ == "array":
        if isinstance(mgr, ArrayManager):
            return mgr
        if mgr.ndim == 2:
            col_arrays = [mgr.iget_values(i) for i in range(len(mgr.axes[0]))]
            if copy:
                col_arrays = [arr.copy() for arr in col_arrays]
            result = ArrayManager(col_arrays, [mgr.axes[1], mgr.axes[0]])
        else:
            arr = mgr.internal_values()
            if copy:
                arr = arr.copy()
            result = SingleArrayManager([arr], [mgr.index])
    else:
        raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
    return result
232
233
234# ---------------------------------------------------------------------
235# DataFrame Constructor Interface
236
237
def ndarray_to_mgr(
    values, index, columns, dtype: DtypeObj | None, copy: bool, typ: str
) -> Manager:
    """
    Build a Manager from a single array-like object.

    Used in DataFrame.__init__.

    Parameters
    ----------
    values : ndarray, list, Series, Index, ExtensionArray
    index, columns : Index or None
        Axes; defaulted from the data's shape when None.
    dtype : DtypeObj or None
        Target dtype; values are sanitized to it when it differs.
    copy : bool
        Whether the caller requested a defensive copy of the input.
    typ : {"block", "array"}
        Which Manager implementation to build.
    """
    # used in DataFrame.__init__
    # input must be a ndarray, list, Series, Index, ExtensionArray

    if isinstance(values, ABCSeries):
        if columns is None:
            if values.name is not None:
                columns = Index([values.name])
        if index is None:
            index = values.index
        else:
            # align the Series to the requested index
            values = values.reindex(index)

        # zero len case (GH #2234)
        if not len(values) and columns is not None and len(columns):
            values = np.empty((0, 1), dtype=object)

    # if the array preparation does a copy -> avoid this for ArrayManager,
    # since the copy is done on conversion to 1D arrays
    copy_on_sanitize = False if typ == "array" else copy

    vdtype = getattr(values, "dtype", None)
    # refs holds the CoW references when the values come from a Series/Index
    refs = None
    if is_1d_only_ea_dtype(vdtype) or is_1d_only_ea_dtype(dtype):
        # GH#19157

        if isinstance(values, (np.ndarray, ExtensionArray)) and values.ndim > 1:
            # GH#12513 a EA dtype passed with a 2D array, split into
            # multiple EAs that view the values
            # error: No overload variant of "__getitem__" of "ExtensionArray"
            # matches argument type "Tuple[slice, int]"
            values = [
                values[:, n]  # type: ignore[call-overload]
                for n in range(values.shape[1])
            ]
        else:
            values = [values]

        if columns is None:
            columns = Index(range(len(values)))
        else:
            columns = ensure_index(columns)

        # 1D-only EAs are handled column-wise by the arrays path
        return arrays_to_mgr(values, columns, index, dtype=dtype, typ=typ)

    elif is_extension_array_dtype(vdtype):
        # i.e. Datetime64TZ, PeriodDtype; cases with is_1d_only_ea_dtype(vdtype)
        # are already caught above
        values = extract_array(values, extract_numpy=True)
        if copy:
            values = values.copy()
        if values.ndim == 1:
            values = values.reshape(-1, 1)

    elif isinstance(values, (ABCSeries, Index)):
        # only track references when no re-dtyping copy will be made
        if not copy_on_sanitize and (
            dtype is None or astype_is_view(values.dtype, dtype)
        ):
            refs = values._references

        if copy_on_sanitize:
            values = values._values.copy()
        else:
            values = values._values

        values = _ensure_2d(values)

    elif isinstance(values, (np.ndarray, ExtensionArray)):
        # drop subclass info
        # only copy here if a later astype would be a view anyway
        _copy = (
            copy_on_sanitize
            if (dtype is None or astype_is_view(values.dtype, dtype))
            else False
        )
        values = np.array(values, copy=_copy)
        values = _ensure_2d(values)

    else:
        # by definition an array here
        # the dtypes will be coerced to a single dtype
        values = _prep_ndarraylike(values, copy=copy_on_sanitize)

    if dtype is not None and not is_dtype_equal(values.dtype, dtype):
        # GH#40110 see similar check inside sanitize_array
        values = sanitize_array(
            values,
            None,
            dtype=dtype,
            copy=copy_on_sanitize,
            allow_2d=True,
        )

    # _prep_ndarraylike ensures that values.ndim == 2 at this point
    index, columns = _get_axes(
        values.shape[0], values.shape[1], index=index, columns=columns
    )

    _check_values_indices_shape_match(values, index, columns)

    if typ == "array":
        # ArrayManager path: split into per-column 1D arrays and return early
        if issubclass(values.dtype.type, str):
            values = np.array(values, dtype=object)

        if dtype is None and is_object_dtype(values.dtype):
            # infer datetimelikes column-by-column from object dtype
            arrays = [
                ensure_wrapped_if_datetimelike(
                    maybe_infer_to_datetimelike(values[:, i])
                )
                for i in range(values.shape[1])
            ]
        else:
            if is_datetime_or_timedelta_dtype(values.dtype):
                values = ensure_wrapped_if_datetimelike(values)
            arrays = [values[:, i] for i in range(values.shape[1])]

        if copy:
            arrays = [arr.copy() for arr in arrays]

        return ArrayManager(arrays, [index, columns], verify_integrity=False)

    # BlockManager stores data transposed (columns as rows)
    values = values.T

    # if we don't have a dtype specified, then try to convert objects
    # on the entire block; this is to convert if we have datetimelike's
    # embedded in an object type
    if dtype is None and is_object_dtype(values.dtype):
        obj_columns = list(values)
        maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns]
        # don't convert (and copy) the objects if no type inference occurs
        if any(x is not y for x, y in zip(obj_columns, maybe_datetime)):
            # at least one column was converted: one block per column
            dvals_list = [ensure_block_shape(dval, 2) for dval in maybe_datetime]
            block_values = [
                new_block_2d(dvals_list[n], placement=BlockPlacement(n))
                for n in range(len(dvals_list))
            ]
        else:
            bp = BlockPlacement(slice(len(columns)))
            nb = new_block_2d(values, placement=bp, refs=refs)
            block_values = [nb]
    else:
        # single block spanning all columns
        bp = BlockPlacement(slice(len(columns)))
        nb = new_block_2d(values, placement=bp, refs=refs)
        block_values = [nb]

    if len(columns) == 0:
        # TODO: check len(values) == 0?
        block_values = []

    return create_block_manager_from_blocks(
        block_values, [columns, index], verify_integrity=False
    )
391
392
393def _check_values_indices_shape_match(
394 values: np.ndarray, index: Index, columns: Index
395) -> None:
396 """
397 Check that the shape implied by our axes matches the actual shape of the
398 data.
399 """
400 if values.shape[1] != len(columns) or values.shape[0] != len(index):
401 # Could let this raise in Block constructor, but we get a more
402 # helpful exception message this way.
403 if values.shape[0] == 0:
404 raise ValueError("Empty data passed with indices specified.")
405
406 passed = values.shape
407 implied = (len(index), len(columns))
408 raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")
409
410
def dict_to_mgr(
    data: dict,
    index,
    columns,
    *,
    dtype: DtypeObj | None = None,
    typ: str = "block",
    copy: bool = True,
) -> Manager:
    """
    Segregate Series based on type and coerce into matrices.
    Needs to handle a lot of exceptional cases.

    Used in DataFrame.__init__

    Parameters
    ----------
    data : dict
        Mapping of column label -> column values (array-likes or scalars).
    index, columns : Index or None
        Requested axes; missing dict keys become all-NaN columns when
        ``columns`` is given.
    dtype : DtypeObj, optional
        Single dtype to coerce all columns to.
    typ : {"block", "array"}, default "block"
        Which Manager implementation to build.
    copy : bool, default True
        Whether the column data should be copied.
    """
    arrays: Sequence[Any] | Series

    if columns is not None:
        from pandas.core.series import Series

        # use an object-dtype Series keyed by `columns` so missing keys
        # surface as NaN entries
        arrays = Series(data, index=columns, dtype=object)
        missing = arrays.isna()
        if index is None:
            # GH10856
            # raise ValueError if only scalars in dict
            index = _extract_index(arrays[~missing])
        else:
            index = ensure_index(index)

        # no obvious "empty" int column
        if missing.any() and not is_integer_dtype(dtype):
            nan_dtype: DtypeObj

            if dtype is not None:
                # calling sanitize_array ensures we don't mix-and-match
                # NA dtypes
                midxs = missing.values.nonzero()[0]
                for i in midxs:
                    arr = sanitize_array(arrays.iat[i], index, dtype=dtype)
                    arrays.iat[i] = arr
            else:
                # GH#1783
                nan_dtype = np.dtype("object")
                val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype)
                nmissing = missing.sum()
                if copy:
                    # downstream copies anyway, so sharing `val` is fine here
                    rhs = [val] * nmissing
                else:
                    # GH#45369: without a downstream copy, each missing
                    # column needs its own array
                    rhs = [val.copy() for _ in range(nmissing)]
                arrays.loc[missing] = rhs

        arrays = list(arrays)
        columns = ensure_index(columns)

    else:
        # infer columns from the dict keys, preserving insertion order
        keys = list(data.keys())
        columns = Index(keys) if keys else default_index(0)
        arrays = [com.maybe_iterable_to_list(data[k]) for k in keys]
        arrays = [arr if not isinstance(arr, Index) else arr._data for arr in arrays]

    if copy:
        if typ == "block":
            # We only need to copy arrays that will not get consolidated, i.e.
            # only EA arrays
            arrays = [x.copy() if isinstance(x, ExtensionArray) else x for x in arrays]
        else:
            # dtype check to exclude e.g. range objects, scalars
            arrays = [x.copy() if hasattr(x, "dtype") else x for x in arrays]

    return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
482
483
def nested_data_to_arrays(
    data: Sequence,
    columns: Index | None,
    index: Index | None,
    dtype: DtypeObj | None,
) -> tuple[list[ArrayLike], Index, Index]:
    """
    Convert a sequence of rows into per-column arrays plus both axes.

    Callers are responsible for having checked ``treat_as_nested(data)``
    before getting here.
    """
    # namedtuple rows supply column names when none were passed
    if columns is None and is_named_tuple(data[0]):
        columns = ensure_index(data[0]._fields)

    arrays, columns = to_arrays(data, columns, dtype=dtype)
    columns = ensure_index(columns)

    if index is None:
        if isinstance(data[0], ABCSeries):
            # Series rows may carry names usable as the index
            index = _get_names_from_index(data)
        else:
            index = default_index(len(data))

    return arrays, columns, index
508
509
def treat_as_nested(data) -> bool:
    """
    Decide whether ``data`` should go through ``nested_data_to_arrays``.

    True when ``data`` is non-empty and its first element is a
    1-dimensional list-like, except when ``data`` itself is a 2D
    ExtensionArray (handled as a single array elsewhere).
    """
    if not len(data):
        return False
    first = data[0]
    if not is_list_like(first) or getattr(first, "ndim", 1) != 1:
        return False
    return not (isinstance(data, ExtensionArray) and data.ndim == 2)
520
521
522# ---------------------------------------------------------------------
523
524
def _prep_ndarraylike(values, copy: bool = True) -> np.ndarray:
    """
    Coerce a generic list-like into a 2D ndarray.

    ``values`` is specifically _not_ an ndarray, EA, Index, or Series, and
    we only get here with ``not treat_as_nested(values)``.

    NOTE: ``copy`` is part of the signature for interface compatibility but
    is not consulted in this function.
    """
    if len(values) == 0:
        # TODO: check for length-zero range, in which case return int64 dtype?
        # TODO: re-use anything in try_cast?
        return np.empty((0, 0), dtype=object)
    if isinstance(values, range):
        # a range becomes a single integer column
        return range_to_ndarray(values)[..., np.newaxis]

    def _coerce(obj):
        # scalars and DataFrames pass through untouched
        if not is_list_like(obj) or isinstance(obj, ABCDataFrame):
            return obj

        obj = extract_array(obj, extract_numpy=True)
        # We don't do maybe_infer_to_datetimelike here bc we will end up
        # doing it column-by-column in ndarray_to_mgr
        return maybe_convert_platform(obj)

    # This is the equivalent of np.asarray but with object conversion and
    # platform-dtype preservation; unlike np.asarray it does not coerce
    # e.g. [1, "a", True] to ["1", "a", "True"].
    first = values[0]
    if is_list_like(first) or (isinstance(first, np.ndarray) and first.ndim == 0):
        # 2-dim input (or rows of 0-d arrays, GH#21861): convert row-wise
        values = np.array([_coerce(v) for v in values])
    else:
        values = _coerce(values)

    return _ensure_2d(values)
561
562
563def _ensure_2d(values: np.ndarray) -> np.ndarray:
564 """
565 Reshape 1D values, raise on anything else other than 2D.
566 """
567 if values.ndim == 1:
568 values = values.reshape((values.shape[0], 1))
569 elif values.ndim != 2:
570 raise ValueError(f"Must pass 2-d input. shape={values.shape}")
571 return values
572
573
def _homogenize(
    data, index: Index, dtype: DtypeObj | None
) -> tuple[list[ArrayLike], list[Any]]:
    """
    Align and sanitize per-column data against ``index``.

    Returns
    -------
    list[ArrayLike]
        One 1D array per input column, each with ``len(index)`` elements.
    list[Any]
        Per-column CoW references (non-None only for Series inputs).
    """
    oindex = None
    homogenized = []
    # if the original array-like in `data` is a Series, keep track of this Series' refs
    refs: list[Any] = []

    for val in data:
        if isinstance(val, ABCSeries):
            if dtype is not None:
                val = val.astype(dtype, copy=False)
            if val.index is not index:
                # Forces alignment. No need to copy data since we
                # are putting it into an ndarray later
                val = val.reindex(index, copy=False)
            refs.append(val._references)
            val = val._values
        else:
            if isinstance(val, dict):
                # GH#41785 this _should_ be equivalent to (but faster than)
                # val = Series(val, index=index)._values
                if oindex is None:
                    # object view of the index, built lazily and reused
                    oindex = index.astype("O")

                if isinstance(index, (DatetimeIndex, TimedeltaIndex)):
                    # see test_constructor_dict_datetime64_index
                    val = dict_compat(val)
                else:
                    # see test_constructor_subclass_dict
                    val = dict(val)
                val = lib.fast_multiget(val, oindex._values, default=np.nan)

            val = sanitize_array(val, index, dtype=dtype, copy=False)
            com.require_length_match(val, index)
            refs.append(None)

        homogenized.append(val)

    return homogenized, refs
614
615
616def _extract_index(data) -> Index:
617 """
618 Try to infer an Index from the passed data, raise ValueError on failure.
619 """
620 index: Index
621 if len(data) == 0:
622 return default_index(0)
623
624 raw_lengths = []
625 indexes: list[list[Hashable] | Index] = []
626
627 have_raw_arrays = False
628 have_series = False
629 have_dicts = False
630
631 for val in data:
632 if isinstance(val, ABCSeries):
633 have_series = True
634 indexes.append(val.index)
635 elif isinstance(val, dict):
636 have_dicts = True
637 indexes.append(list(val.keys()))
638 elif is_list_like(val) and getattr(val, "ndim", 1) == 1:
639 have_raw_arrays = True
640 raw_lengths.append(len(val))
641 elif isinstance(val, np.ndarray) and val.ndim > 1:
642 raise ValueError("Per-column arrays must each be 1-dimensional")
643
644 if not indexes and not raw_lengths:
645 raise ValueError("If using all scalar values, you must pass an index")
646
647 if have_series:
648 index = union_indexes(indexes)
649 elif have_dicts:
650 index = union_indexes(indexes, sort=False)
651
652 if have_raw_arrays:
653 lengths = list(set(raw_lengths))
654 if len(lengths) > 1:
655 raise ValueError("All arrays must be of the same length")
656
657 if have_dicts:
658 raise ValueError(
659 "Mixing dicts with non-Series may lead to ambiguous ordering."
660 )
661
662 if have_series:
663 if lengths[0] != len(index):
664 msg = (
665 f"array length {lengths[0]} does not match index "
666 f"length {len(index)}"
667 )
668 raise ValueError(msg)
669 else:
670 index = default_index(lengths[0])
671
672 return ensure_index(index)
673
674
675def reorder_arrays(
676 arrays: list[ArrayLike], arr_columns: Index, columns: Index | None, length: int
677) -> tuple[list[ArrayLike], Index]:
678 """
679 Pre-emptively (cheaply) reindex arrays with new columns.
680 """
681 # reorder according to the columns
682 if columns is not None:
683 if not columns.equals(arr_columns):
684 # if they are equal, there is nothing to do
685 new_arrays: list[ArrayLike | None]
686 new_arrays = [None] * len(columns)
687 indexer = arr_columns.get_indexer(columns)
688 for i, k in enumerate(indexer):
689 if k == -1:
690 # by convention default is all-NaN object dtype
691 arr = np.empty(length, dtype=object)
692 arr.fill(np.nan)
693 else:
694 arr = arrays[k]
695 new_arrays[i] = arr
696
697 # Incompatible types in assignment (expression has type
698 # "List[Union[ExtensionArray, ndarray[Any, Any], None]]", variable
699 # has type "List[Union[ExtensionArray, ndarray[Any, Any]]]")
700 arrays = new_arrays # type: ignore[assignment]
701 arr_columns = columns
702
703 return arrays, arr_columns
704
705
706def _get_names_from_index(data) -> Index:
707 has_some_name = any(getattr(s, "name", None) is not None for s in data)
708 if not has_some_name:
709 return default_index(len(data))
710
711 index: list[Hashable] = list(range(len(data)))
712 count = 0
713 for i, s in enumerate(data):
714 n = getattr(s, "name", None)
715 if n is not None:
716 index[i] = n
717 else:
718 index[i] = f"Unnamed {count}"
719 count += 1
720
721 return Index(index)
722
723
724def _get_axes(
725 N: int, K: int, index: Index | None, columns: Index | None
726) -> tuple[Index, Index]:
727 # helper to create the axes as indexes
728 # return axes or defaults
729
730 if index is None:
731 index = default_index(N)
732 else:
733 index = ensure_index(index)
734
735 if columns is None:
736 columns = default_index(K)
737 else:
738 columns = ensure_index(columns)
739 return index, columns
740
741
def dataclasses_to_dicts(data):
    """
    Converts a list of dataclass instances to a list of dictionaries.

    Parameters
    ----------
    data : List[Type[dataclass]]

    Returns
    -------
    list_dict : List[dict]

    Examples
    --------
    >>> from dataclasses import dataclass
    >>> @dataclass
    ... class Point:
    ...     x: int
    ...     y: int

    >>> dataclasses_to_dicts([Point(1, 2), Point(2, 3)])
    [{'x': 1, 'y': 2}, {'x': 2, 'y': 3}]

    """
    from dataclasses import asdict

    return [asdict(item) for item in data]
769
770
771# ---------------------------------------------------------------------
772# Conversion of Inputs to Arrays
773
774
def to_arrays(
    data, columns: Index | None, dtype: DtypeObj | None = None
) -> tuple[list[ArrayLike], Index]:
    """
    Return list of arrays, columns.

    Returns
    -------
    list[ArrayLike]
        These will become columns in a DataFrame.
    Index
        This will become frame.columns.

    Notes
    -----
    Ensures that len(result_arrays) == len(result_index).
    """
    if isinstance(data, ABCDataFrame):
        # see test_from_records_with_index_data, test_from_records_bad_index_column
        if columns is not None:
            # keep only the requested columns, in the frame's own order
            arrays = [
                data._ixs(i, axis=1)._values
                for i, col in enumerate(data.columns)
                if col in columns
            ]
        else:
            columns = data.columns
            arrays = [data._ixs(i, axis=1)._values for i in range(len(columns))]

        return arrays, columns

    if not len(data):
        # empty input; structured ndarrays still contribute column names
        if isinstance(data, np.ndarray):
            if data.dtype.names is not None:
                # i.e. numpy structured array
                columns = ensure_index(data.dtype.names)
                arrays = [data[name] for name in columns]

                if len(data) == 0:
                    # GH#42456 the indexing above results in list of 2D ndarrays
                    # TODO: is that an issue with numpy?
                    for i, arr in enumerate(arrays):
                        if arr.ndim == 2:
                            arrays[i] = arr[:, 0]

                return arrays, columns
        return [], ensure_index([])

    elif isinstance(data, np.ndarray) and data.dtype.names is not None:
        # e.g. recarray
        columns = Index(list(data.dtype.names))
        arrays = [data[k] for k in columns]
        return arrays, columns

    # dispatch on the type of the first row
    if isinstance(data[0], (list, tuple)):
        arr = _list_to_arrays(data)
    elif isinstance(data[0], abc.Mapping):
        arr, columns = _list_of_dict_to_arrays(data, columns)
    elif isinstance(data[0], ABCSeries):
        arr, columns = _list_of_series_to_arrays(data, columns)
    else:
        # last ditch effort
        data = [tuple(x) for x in data]
        arr = _list_to_arrays(data)

    content, columns = _finalize_columns_and_data(arr, columns, dtype)
    return content, columns
842
843
844def _list_to_arrays(data: list[tuple | list]) -> np.ndarray:
845 # Returned np.ndarray has ndim = 2
846 # Note: we already check len(data) > 0 before getting hre
847 if isinstance(data[0], tuple):
848 content = lib.to_object_array_tuples(data)
849 else:
850 # list of lists
851 content = lib.to_object_array(data)
852 return content
853
854
855def _list_of_series_to_arrays(
856 data: list,
857 columns: Index | None,
858) -> tuple[np.ndarray, Index]:
859 # returned np.ndarray has ndim == 2
860
861 if columns is None:
862 # We know pass_data is non-empty because data[0] is a Series
863 pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))]
864 columns = get_objs_combined_axis(pass_data, sort=False)
865
866 indexer_cache: dict[int, np.ndarray] = {}
867
868 aligned_values = []
869 for s in data:
870 index = getattr(s, "index", None)
871 if index is None:
872 index = default_index(len(s))
873
874 if id(index) in indexer_cache:
875 indexer = indexer_cache[id(index)]
876 else:
877 indexer = indexer_cache[id(index)] = index.get_indexer(columns)
878
879 values = extract_array(s, extract_numpy=True)
880 aligned_values.append(algorithms.take_nd(values, indexer))
881
882 content = np.vstack(aligned_values)
883 return content, columns
884
885
886def _list_of_dict_to_arrays(
887 data: list[dict],
888 columns: Index | None,
889) -> tuple[np.ndarray, Index]:
890 """
891 Convert list of dicts to numpy arrays
892
893 if `columns` is not passed, column names are inferred from the records
894 - for OrderedDict and dicts, the column names match
895 the key insertion-order from the first record to the last.
896 - For other kinds of dict-likes, the keys are lexically sorted.
897
898 Parameters
899 ----------
900 data : iterable
901 collection of records (OrderedDict, dict)
902 columns: iterables or None
903
904 Returns
905 -------
906 content : np.ndarray[object, ndim=2]
907 columns : Index
908 """
909 if columns is None:
910 gen = (list(x.keys()) for x in data)
911 sort = not any(isinstance(d, dict) for d in data)
912 pre_cols = lib.fast_unique_multiple_list_gen(gen, sort=sort)
913 columns = ensure_index(pre_cols)
914
915 # assure that they are of the base dict class and not of derived
916 # classes
917 data = [d if type(d) is dict else dict(d) for d in data]
918
919 content = lib.dicts_to_array(data, list(columns))
920 return content, columns
921
922
def _finalize_columns_and_data(
    content: np.ndarray,  # ndim == 2
    columns: Index | None,
    dtype: DtypeObj | None,
) -> tuple[list[ArrayLike], Index]:
    """
    Split a 2D array into per-column 1D arrays, validate (or derive) the
    column labels, and soft-cast object-dtype columns where possible.
    """
    col_arrays = list(content.T)

    try:
        columns = _validate_or_indexify_columns(col_arrays, columns)
    except AssertionError as err:
        # GH#26429 do not raise user-facing AssertionError
        raise ValueError(err) from err

    if len(col_arrays) and col_arrays[0].dtype == np.object_:
        col_arrays = convert_object_array(col_arrays, dtype=dtype)

    return col_arrays, columns
943
944
945def _validate_or_indexify_columns(
946 content: list[np.ndarray], columns: Index | None
947) -> Index:
948 """
949 If columns is None, make numbers as column names; Otherwise, validate that
950 columns have valid length.
951
952 Parameters
953 ----------
954 content : list of np.ndarrays
955 columns : Index or None
956
957 Returns
958 -------
959 Index
960 If columns is None, assign positional column index value as columns.
961
962 Raises
963 ------
964 1. AssertionError when content is not composed of list of lists, and if
965 length of columns is not equal to length of content.
966 2. ValueError when content is list of lists, but length of each sub-list
967 is not equal
968 3. ValueError when content is list of lists, but length of sub-list is
969 not equal to length of content
970 """
971 if columns is None:
972 columns = default_index(len(content))
973 else:
974 # Add mask for data which is composed of list of lists
975 is_mi_list = isinstance(columns, list) and all(
976 isinstance(col, list) for col in columns
977 )
978
979 if not is_mi_list and len(columns) != len(content): # pragma: no cover
980 # caller's responsibility to check for this...
981 raise AssertionError(
982 f"{len(columns)} columns passed, passed data had "
983 f"{len(content)} columns"
984 )
985 if is_mi_list:
986 # check if nested list column, length of each sub-list should be equal
987 if len({len(col) for col in columns}) > 1:
988 raise ValueError(
989 "Length of columns passed for MultiIndex columns is different"
990 )
991
992 # if columns is not empty and length of sublist is not equal to content
993 if columns and len(columns[0]) != len(content):
994 raise ValueError(
995 f"{len(columns[0])} columns passed, passed data had "
996 f"{len(content)} columns"
997 )
998 return columns
999
1000
def convert_object_array(
    content: list[npt.NDArray[np.object_]],
    dtype: DtypeObj | None,
    dtype_backend: str = "numpy",
    coerce_float: bool = False,
) -> list[ArrayLike]:
    """
    Internal function to convert object array.

    Parameters
    ----------
    content: List[np.ndarray]
        One object-dtype 1D array per column.
    dtype: np.dtype or ExtensionDtype
        Target dtype, or None to soft-infer per column.
    dtype_backend: Controls if nullable/pyarrow dtypes are returned.
    coerce_float: Cast floats that are integers to int.

    Returns
    -------
    List[ArrayLike]
        One converted array per input column.
    """
    # provide soft conversion of object dtypes

    def convert(arr):
        # skip inference entirely when object dtype was explicitly requested
        if dtype != np.dtype("O"):
            arr = lib.maybe_convert_objects(
                arr,
                try_float=coerce_float,
                convert_to_nullable_dtype=dtype_backend != "numpy",
            )
            # Notes on cases that get here 2023-02-15
            # 1) we DO get here when arr is all Timestamps and dtype=None
            # 2) disabling this doesn't break the world, so this must be
            #    getting caught at a higher level
            # 3) passing convert_datetime to maybe_convert_objects get this right
            # 4) convert_timedelta?

            if dtype is None:
                if arr.dtype == np.dtype("O"):
                    # i.e. maybe_convert_objects didn't convert
                    arr = maybe_infer_to_datetimelike(arr)
                    if dtype_backend != "numpy" and arr.dtype == np.dtype("O"):
                        # still object: fall back to the nullable string type
                        arr = StringDtype().construct_array_type()._from_sequence(arr)
                elif dtype_backend != "numpy" and isinstance(arr, np.ndarray):
                    # wrap plain numpy results in their masked counterparts
                    if is_integer_dtype(arr.dtype):
                        arr = IntegerArray(arr, np.zeros(arr.shape, dtype=np.bool_))
                    elif is_bool_dtype(arr.dtype):
                        arr = BooleanArray(arr, np.zeros(arr.shape, dtype=np.bool_))
                    elif is_float_dtype(arr.dtype):
                        arr = FloatingArray(arr, np.isnan(arr))

            elif isinstance(dtype, ExtensionDtype):
                # TODO: test(s) that get here
                # TODO: try to de-duplicate this convert function with
                #  core.construction functions
                cls = dtype.construct_array_type()
                arr = cls._from_sequence(arr, dtype=dtype, copy=False)
            elif dtype.kind in ["m", "M"]:
                # This restriction is harmless bc these are the only cases
                #  where maybe_cast_to_datetime is not a no-op.
                # Here we know:
                #  1) dtype.kind in ["m", "M"] and
                #  2) arr is either object or numeric dtype
                arr = maybe_cast_to_datetime(arr, dtype)

        return arr

    arrays = [convert(arr) for arr in content]

    return arrays