1"""
2Functions for preparing various inputs passed to the DataFrame or Series
3constructors before passing them to a BlockManager.
4"""
5from __future__ import annotations
6
7from collections import abc
8from typing import (
9 TYPE_CHECKING,
10 Any,
11)
12
13import numpy as np
14from numpy import ma
15
16from pandas._config import using_pyarrow_string_dtype
17
18from pandas._libs import lib
19
20from pandas.core.dtypes.astype import astype_is_view
21from pandas.core.dtypes.cast import (
22 construct_1d_arraylike_from_scalar,
23 dict_compat,
24 maybe_cast_to_datetime,
25 maybe_convert_platform,
26 maybe_infer_to_datetimelike,
27)
28from pandas.core.dtypes.common import (
29 is_1d_only_ea_dtype,
30 is_integer_dtype,
31 is_list_like,
32 is_named_tuple,
33 is_object_dtype,
34)
35from pandas.core.dtypes.dtypes import ExtensionDtype
36from pandas.core.dtypes.generic import (
37 ABCDataFrame,
38 ABCSeries,
39)
40
41from pandas.core import (
42 algorithms,
43 common as com,
44)
45from pandas.core.arrays import ExtensionArray
46from pandas.core.arrays.string_ import StringDtype
47from pandas.core.construction import (
48 array as pd_array,
49 ensure_wrapped_if_datetimelike,
50 extract_array,
51 range_to_ndarray,
52 sanitize_array,
53)
54from pandas.core.indexes.api import (
55 DatetimeIndex,
56 Index,
57 TimedeltaIndex,
58 default_index,
59 ensure_index,
60 get_objs_combined_axis,
61 union_indexes,
62)
63from pandas.core.internals.array_manager import (
64 ArrayManager,
65 SingleArrayManager,
66)
67from pandas.core.internals.blocks import (
68 BlockPlacement,
69 ensure_block_shape,
70 new_block,
71 new_block_2d,
72)
73from pandas.core.internals.managers import (
74 BlockManager,
75 SingleBlockManager,
76 create_block_manager_from_blocks,
77 create_block_manager_from_column_arrays,
78)
79
80if TYPE_CHECKING:
81 from collections.abc import (
82 Hashable,
83 Sequence,
84 )
85
86 from pandas._typing import (
87 ArrayLike,
88 DtypeObj,
89 Manager,
90 npt,
91 )
92# ---------------------------------------------------------------------
93# BlockManager Interface
94
95
96def arrays_to_mgr(
97 arrays,
98 columns: Index,
99 index,
100 *,
101 dtype: DtypeObj | None = None,
102 verify_integrity: bool = True,
103 typ: str | None = None,
104 consolidate: bool = True,
105) -> Manager:
106 """
107 Segregate Series based on type and coerce into matrices.
108
109 Needs to handle a lot of exceptional cases.
110 """
111 if verify_integrity:
112 # figure out the index, if necessary
113 if index is None:
114 index = _extract_index(arrays)
115 else:
116 index = ensure_index(index)
117
118 # don't force copy because getting jammed in an ndarray anyway
119 arrays, refs = _homogenize(arrays, index, dtype)
120 # _homogenize ensures
121 # - all(len(x) == len(index) for x in arrays)
122 # - all(x.ndim == 1 for x in arrays)
123 # - all(isinstance(x, (np.ndarray, ExtensionArray)) for x in arrays)
124 # - all(type(x) is not NumpyExtensionArray for x in arrays)
125
126 else:
127 index = ensure_index(index)
128 arrays = [extract_array(x, extract_numpy=True) for x in arrays]
129 # with _from_arrays, the passed arrays should never be Series objects
130 refs = [None] * len(arrays)
131
132 # Reached via DataFrame._from_arrays; we do minimal validation here
133 for arr in arrays:
134 if (
135 not isinstance(arr, (np.ndarray, ExtensionArray))
136 or arr.ndim != 1
137 or len(arr) != len(index)
138 ):
139 raise ValueError(
140 "Arrays must be 1-dimensional np.ndarray or ExtensionArray "
141 "with length matching len(index)"
142 )
143
144 columns = ensure_index(columns)
145 if len(columns) != len(arrays):
146 raise ValueError("len(arrays) must match len(columns)")
147
148 # from BlockManager perspective
149 axes = [columns, index]
150
151 if typ == "block":
152 return create_block_manager_from_column_arrays(
153 arrays, axes, consolidate=consolidate, refs=refs
154 )
155 elif typ == "array":
156 return ArrayManager(arrays, [index, columns])
157 else:
158 raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
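

# Illustrative sketch (added for exposition, not part of the pandas source):
# arrays_to_mgr is the common funnel for the column-wise constructor paths.
# Assuming this module's namespace, a call looks roughly like:
#
#   arrs = [np.array([1, 2, 3]), np.array(["a", "b", "c"], dtype=object)]
#   mgr = arrays_to_mgr(arrs, Index(["x", "y"]), None, typ="block")
#   # index=None is inferred via _extract_index -> RangeIndex(3); the result
#   # is a BlockManager whose axes are [columns, index].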


def rec_array_to_mgr(
    data: np.rec.recarray | np.ndarray,
    index,
    columns,
    dtype: DtypeObj | None,
    copy: bool,
    typ: str,
) -> Manager:
    """
    Extract from a masked rec array and create the manager.
    """
    # essentially process a record array then fill it
    fdata = ma.getdata(data)
    if index is None:
        index = default_index(len(fdata))
    else:
        index = ensure_index(index)

    if columns is not None:
        columns = ensure_index(columns)
    arrays, arr_columns = to_arrays(fdata, columns)

    # create the manager

    arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, len(index))
    if columns is None:
        columns = arr_columns

    mgr = arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ)

    if copy:
        mgr = mgr.copy()
    return mgr


def mgr_to_mgr(mgr, typ: str, copy: bool = True) -> Manager:
    """
    Convert to specific type of Manager. Does not copy if the type is already
    correct. Does not guarantee a copy otherwise. `copy` keyword only controls
    whether conversion from Block->ArrayManager copies the 1D arrays.
    """
    new_mgr: Manager

    if typ == "block":
        if isinstance(mgr, BlockManager):
            new_mgr = mgr
        else:
            if mgr.ndim == 2:
                new_mgr = arrays_to_mgr(
                    mgr.arrays, mgr.axes[0], mgr.axes[1], typ="block"
                )
            else:
                new_mgr = SingleBlockManager.from_array(mgr.arrays[0], mgr.index)
    elif typ == "array":
        if isinstance(mgr, ArrayManager):
            new_mgr = mgr
        else:
            if mgr.ndim == 2:
                arrays = [mgr.iget_values(i) for i in range(len(mgr.axes[0]))]
                if copy:
                    arrays = [arr.copy() for arr in arrays]
                new_mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]])
            else:
                array = mgr.internal_values()
                if copy:
                    array = array.copy()
                new_mgr = SingleArrayManager([array], [mgr.index])
    else:
        raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
    return new_mgr
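

# Illustrative sketch (added for exposition, not part of the pandas source):
# converting the manager backing an existing DataFrame. ``df._mgr`` is
# internal API, so this only shows the call shape, assuming
# ``import pandas as pd``:
#
#   df = pd.DataFrame({"a": [1, 2]})
#   amgr = mgr_to_mgr(df._mgr, typ="array")  # BlockManager -> ArrayManager
#   bmgr = mgr_to_mgr(amgr, typ="block")     # back again; no-op if already block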


# ---------------------------------------------------------------------
# DataFrame Constructor Interface


def ndarray_to_mgr(
    values, index, columns, dtype: DtypeObj | None, copy: bool, typ: str
) -> Manager:
    # used in DataFrame.__init__
    # input must be a ndarray, list, Series, Index, ExtensionArray

    if isinstance(values, ABCSeries):
        if columns is None:
            if values.name is not None:
                columns = Index([values.name])
        if index is None:
            index = values.index
        else:
            values = values.reindex(index)

        # zero len case (GH #2234)
        if not len(values) and columns is not None and len(columns):
            values = np.empty((0, 1), dtype=object)

    # if the array preparation does a copy -> avoid this for ArrayManager,
    # since the copy is done on conversion to 1D arrays
    copy_on_sanitize = False if typ == "array" else copy

    vdtype = getattr(values, "dtype", None)
    refs = None
    if is_1d_only_ea_dtype(vdtype) or is_1d_only_ea_dtype(dtype):
        # GH#19157

        if isinstance(values, (np.ndarray, ExtensionArray)) and values.ndim > 1:
            # GH#12513 a EA dtype passed with a 2D array, split into
            # multiple EAs that view the values
            # error: No overload variant of "__getitem__" of "ExtensionArray"
            # matches argument type "Tuple[slice, int]"
            values = [
                values[:, n]  # type: ignore[call-overload]
                for n in range(values.shape[1])
            ]
        else:
            values = [values]

        if columns is None:
            columns = Index(range(len(values)))
        else:
            columns = ensure_index(columns)

        return arrays_to_mgr(values, columns, index, dtype=dtype, typ=typ)

    elif isinstance(vdtype, ExtensionDtype):
        # i.e. Datetime64TZ, PeriodDtype; cases with is_1d_only_ea_dtype(vdtype)
        # are already caught above
        values = extract_array(values, extract_numpy=True)
        if copy:
            values = values.copy()
        if values.ndim == 1:
            values = values.reshape(-1, 1)

    elif isinstance(values, (ABCSeries, Index)):
        if not copy_on_sanitize and (
            dtype is None or astype_is_view(values.dtype, dtype)
        ):
            refs = values._references

        if copy_on_sanitize:
            values = values._values.copy()
        else:
            values = values._values

        values = _ensure_2d(values)

    elif isinstance(values, (np.ndarray, ExtensionArray)):
        # drop subclass info
        _copy = (
            copy_on_sanitize
            if (dtype is None or astype_is_view(values.dtype, dtype))
            else False
        )
        values = np.array(values, copy=_copy)
        values = _ensure_2d(values)

    else:
        # by definition an array here
        # the dtypes will be coerced to a single dtype
        values = _prep_ndarraylike(values, copy=copy_on_sanitize)

    if dtype is not None and values.dtype != dtype:
        # GH#40110 see similar check inside sanitize_array
        values = sanitize_array(
            values,
            None,
            dtype=dtype,
            copy=copy_on_sanitize,
            allow_2d=True,
        )

    # _prep_ndarraylike ensures that values.ndim == 2 at this point
    index, columns = _get_axes(
        values.shape[0], values.shape[1], index=index, columns=columns
    )

    _check_values_indices_shape_match(values, index, columns)

    if typ == "array":
        if issubclass(values.dtype.type, str):
            values = np.array(values, dtype=object)

        if dtype is None and is_object_dtype(values.dtype):
            arrays = [
                ensure_wrapped_if_datetimelike(
                    maybe_infer_to_datetimelike(values[:, i])
                )
                for i in range(values.shape[1])
            ]
        else:
            if lib.is_np_dtype(values.dtype, "mM"):
                values = ensure_wrapped_if_datetimelike(values)
            arrays = [values[:, i] for i in range(values.shape[1])]

        if copy:
            arrays = [arr.copy() for arr in arrays]

        return ArrayManager(arrays, [index, columns], verify_integrity=False)

    values = values.T

    # if we don't have a dtype specified, then try to convert objects
    # on the entire block; this is to convert if we have datetimelike's
    # embedded in an object type
    if dtype is None and is_object_dtype(values.dtype):
        obj_columns = list(values)
        maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns]
        # don't convert (and copy) the objects if no type inference occurs
        if any(x is not y for x, y in zip(obj_columns, maybe_datetime)):
            dvals_list = [ensure_block_shape(dval, 2) for dval in maybe_datetime]
            block_values = [
                new_block_2d(dvals_list[n], placement=BlockPlacement(n))
                for n in range(len(dvals_list))
            ]
        else:
            bp = BlockPlacement(slice(len(columns)))
            nb = new_block_2d(values, placement=bp, refs=refs)
            block_values = [nb]
    elif dtype is None and values.dtype.kind == "U" and using_pyarrow_string_dtype():
        dtype = StringDtype(storage="pyarrow_numpy")

        obj_columns = list(values)
        block_values = [
            new_block(
                dtype.construct_array_type()._from_sequence(data, dtype=dtype),
                BlockPlacement(slice(i, i + 1)),
                ndim=2,
            )
            for i, data in enumerate(obj_columns)
        ]

    else:
        bp = BlockPlacement(slice(len(columns)))
        nb = new_block_2d(values, placement=bp, refs=refs)
        block_values = [nb]

    if len(columns) == 0:
        # TODO: check len(values) == 0?
        block_values = []

    return create_block_manager_from_blocks(
        block_values, [columns, index], verify_integrity=False
    )
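

# Illustrative sketch (added for exposition, not part of the pandas source):
# the 2D-ndarray constructor path lands here. Assuming ``import pandas as pd``
# and ``import numpy as np``:
#
#   arr = np.array([[1, 2], [3, 4]])
#   df = pd.DataFrame(arr, columns=["a", "b"])
#   # the homogeneous values are transposed and wrapped in a single 2D block;
#   # passing dtype="float64" instead would route through sanitize_array above.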


def _check_values_indices_shape_match(
    values: np.ndarray, index: Index, columns: Index
) -> None:
    """
    Check that the shape implied by our axes matches the actual shape of the
    data.
    """
    if values.shape[1] != len(columns) or values.shape[0] != len(index):
        # Could let this raise in Block constructor, but we get a more
        # helpful exception message this way.
        if values.shape[0] == 0 < len(index):
            raise ValueError("Empty data passed with indices specified.")

        passed = values.shape
        implied = (len(index), len(columns))
        raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")
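

# Illustrative examples (added for exposition, not part of the pandas source):
# the two errors above surface directly in the DataFrame constructor:
#
#   pd.DataFrame(np.empty((0, 2)), index=[1, 2])
#   # ValueError: Empty data passed with indices specified.
#
#   pd.DataFrame(np.ones((2, 2)), index=[1, 2, 3])
#   # ValueError: Shape of passed values is (2, 2), indices imply (3, 2)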


def dict_to_mgr(
    data: dict,
    index,
    columns,
    *,
    dtype: DtypeObj | None = None,
    typ: str = "block",
    copy: bool = True,
) -> Manager:
    """
    Segregate Series based on type and coerce into matrices.
    Needs to handle a lot of exceptional cases.

    Used in DataFrame.__init__
    """
    arrays: Sequence[Any] | Series

    if columns is not None:
        from pandas.core.series import Series

        arrays = Series(data, index=columns, dtype=object)
        missing = arrays.isna()
        if index is None:
            # GH10856
            # raise ValueError if only scalars in dict
            index = _extract_index(arrays[~missing])
        else:
            index = ensure_index(index)

        # no obvious "empty" int column
        if missing.any() and not is_integer_dtype(dtype):
            nan_dtype: DtypeObj

            if dtype is not None:
                # calling sanitize_array ensures we don't mix-and-match
                # NA dtypes
                midxs = missing.values.nonzero()[0]
                for i in midxs:
                    arr = sanitize_array(arrays.iat[i], index, dtype=dtype)
                    arrays.iat[i] = arr
            else:
                # GH#1783
                nan_dtype = np.dtype("object")
                val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype)
                nmissing = missing.sum()
                if copy:
                    rhs = [val] * nmissing
                else:
                    # GH#45369
                    rhs = [val.copy() for _ in range(nmissing)]
                arrays.loc[missing] = rhs

        arrays = list(arrays)
        columns = ensure_index(columns)

    else:
        keys = list(data.keys())
        columns = Index(keys) if keys else default_index(0)
        arrays = [com.maybe_iterable_to_list(data[k]) for k in keys]

    if copy:
        if typ == "block":
            # We only need to copy arrays that will not get consolidated, i.e.
            # only EA arrays
            arrays = [
                x.copy()
                if isinstance(x, ExtensionArray)
                else x.copy(deep=True)
                if (
                    isinstance(x, Index)
                    or isinstance(x, ABCSeries)
                    and is_1d_only_ea_dtype(x.dtype)
                )
                else x
                for x in arrays
            ]
        else:
            # dtype check to exclude e.g. range objects, scalars
            arrays = [x.copy() if hasattr(x, "dtype") else x for x in arrays]

    return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
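

# Illustrative sketch (added for exposition, not part of the pandas source):
# requesting a column that is missing from the dict exercises the GH#1783
# branch above and yields an all-NaN object column:
#
#   pd.DataFrame({"a": [1, 2]}, columns=["a", "b"])
#   # column "a" -> [1, 2]; column "b" -> [NaN, NaN] with object dtype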


def nested_data_to_arrays(
    data: Sequence,
    columns: Index | None,
    index: Index | None,
    dtype: DtypeObj | None,
) -> tuple[list[ArrayLike], Index, Index]:
    """
    Convert a single sequence of arrays to multiple arrays.
    """
    # By the time we get here we have already checked treat_as_nested(data)

    if is_named_tuple(data[0]) and columns is None:
        columns = ensure_index(data[0]._fields)

    arrays, columns = to_arrays(data, columns, dtype=dtype)
    columns = ensure_index(columns)

    if index is None:
        if isinstance(data[0], ABCSeries):
            index = _get_names_from_index(data)
        else:
            index = default_index(len(data))

    return arrays, columns, index


def treat_as_nested(data) -> bool:
    """
    Check if we should use nested_data_to_arrays.
    """
    return (
        len(data) > 0
        and is_list_like(data[0])
        and getattr(data[0], "ndim", 1) == 1
        and not (isinstance(data, ExtensionArray) and data.ndim == 2)
    )
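

# Illustrative examples (added for exposition, not part of the pandas source):
# what counts as "nested" for constructor dispatch:
#
#   treat_as_nested([[1, 2], [3, 4]])    # list of lists        -> True
#   treat_as_nested([1, 2, 3])           # flat list of scalars -> False
#   treat_as_nested([np.array([1, 2])])  # list of 1D arrays    -> True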


# ---------------------------------------------------------------------


def _prep_ndarraylike(values, copy: bool = True) -> np.ndarray:
    # values is specifically _not_ ndarray, EA, Index, or Series
    # We only get here with `not treat_as_nested(values)`

    if len(values) == 0:
        # TODO: check for length-zero range, in which case return int64 dtype?
        # TODO: reuse anything in try_cast?
        return np.empty((0, 0), dtype=object)
    elif isinstance(values, range):
        arr = range_to_ndarray(values)
        return arr[..., np.newaxis]

    def convert(v):
        if not is_list_like(v) or isinstance(v, ABCDataFrame):
            return v

        v = extract_array(v, extract_numpy=True)
        res = maybe_convert_platform(v)
        # We don't do maybe_infer_to_datetimelike here bc we will end up doing
        # it column-by-column in ndarray_to_mgr
        return res

    # we could have a 1-dim or 2-dim list here
    # this is equiv of np.asarray, but does object conversion
    # and platform dtype preservation
    # does not convert e.g. [1, "a", True] to ["1", "a", "True"] like
    # np.asarray would
    if is_list_like(values[0]):
        values = np.array([convert(v) for v in values])
    elif isinstance(values[0], np.ndarray) and values[0].ndim == 0:
        # GH#21861 see test_constructor_list_of_lists
        values = np.array([convert(v) for v in values])
    else:
        values = convert(values)

    return _ensure_2d(values)


def _ensure_2d(values: np.ndarray) -> np.ndarray:
    """
    Reshape 1D values, raise on anything other than 2D.
    """
    if values.ndim == 1:
        values = values.reshape((values.shape[0], 1))
    elif values.ndim != 2:
        raise ValueError(f"Must pass 2-d input. shape={values.shape}")
    return values
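

# Illustrative examples (added for exposition, not part of the pandas source):
#
#   _ensure_2d(np.array([1, 2, 3])).shape  # -> (3, 1)
#   _ensure_2d(np.ones((2, 2))).shape      # -> (2, 2), unchanged
#   _ensure_2d(np.ones((2, 2, 2)))         # -> ValueError: Must pass 2-d input...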


def _homogenize(
    data, index: Index, dtype: DtypeObj | None
) -> tuple[list[ArrayLike], list[Any]]:
    oindex = None
    homogenized = []
    # if the original array-like in `data` is a Series, keep track of this Series' refs
    refs: list[Any] = []

    for val in data:
        if isinstance(val, (ABCSeries, Index)):
            if dtype is not None:
                val = val.astype(dtype, copy=False)
            if isinstance(val, ABCSeries) and val.index is not index:
                # Forces alignment. No need to copy data since we
                # are putting it into an ndarray later
                val = val.reindex(index, copy=False)
            refs.append(val._references)
            val = val._values
        else:
            if isinstance(val, dict):
                # GH#41785 this _should_ be equivalent to (but faster than)
                # val = Series(val, index=index)._values
                if oindex is None:
                    oindex = index.astype("O")

                if isinstance(index, (DatetimeIndex, TimedeltaIndex)):
                    # see test_constructor_dict_datetime64_index
                    val = dict_compat(val)
                else:
                    # see test_constructor_subclass_dict
                    val = dict(val)
                val = lib.fast_multiget(val, oindex._values, default=np.nan)

            val = sanitize_array(val, index, dtype=dtype, copy=False)
            com.require_length_match(val, index)
            refs.append(None)

        homogenized.append(val)

    return homogenized, refs


def _extract_index(data) -> Index:
    """
    Try to infer an Index from the passed data, raise ValueError on failure.
    """
    index: Index
    if len(data) == 0:
        return default_index(0)

    raw_lengths = []
    indexes: list[list[Hashable] | Index] = []

    have_raw_arrays = False
    have_series = False
    have_dicts = False

    for val in data:
        if isinstance(val, ABCSeries):
            have_series = True
            indexes.append(val.index)
        elif isinstance(val, dict):
            have_dicts = True
            indexes.append(list(val.keys()))
        elif is_list_like(val) and getattr(val, "ndim", 1) == 1:
            have_raw_arrays = True
            raw_lengths.append(len(val))
        elif isinstance(val, np.ndarray) and val.ndim > 1:
            raise ValueError("Per-column arrays must each be 1-dimensional")

    if not indexes and not raw_lengths:
        raise ValueError("If using all scalar values, you must pass an index")

    if have_series:
        index = union_indexes(indexes)
    elif have_dicts:
        index = union_indexes(indexes, sort=False)

    if have_raw_arrays:
        lengths = list(set(raw_lengths))
        if len(lengths) > 1:
            raise ValueError("All arrays must be of the same length")

        if have_dicts:
            raise ValueError(
                "Mixing dicts with non-Series may lead to ambiguous ordering."
            )

        if have_series:
            if lengths[0] != len(index):
                msg = (
                    f"array length {lengths[0]} does not match index "
                    f"length {len(index)}"
                )
                raise ValueError(msg)
        else:
            index = default_index(lengths[0])

    return ensure_index(index)
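

# Illustrative examples (added for exposition, not part of the pandas source):
# the error cases enforced above, as seen from the DataFrame constructor:
#
#   pd.DataFrame({"a": 1, "b": 2})
#   # ValueError: If using all scalar values, you must pass an index
#
#   pd.DataFrame({"a": [1, 2], "b": [1, 2, 3]})
#   # ValueError: All arrays must be of the same length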


def reorder_arrays(
    arrays: list[ArrayLike], arr_columns: Index, columns: Index | None, length: int
) -> tuple[list[ArrayLike], Index]:
    """
    Pre-emptively (cheaply) reindex arrays with new columns.
    """
    # reorder according to the columns
    if columns is not None:
        if not columns.equals(arr_columns):
            # if they are equal, there is nothing to do
            new_arrays: list[ArrayLike] = []
            indexer = arr_columns.get_indexer(columns)
            for i, k in enumerate(indexer):
                if k == -1:
                    # by convention default is all-NaN object dtype
                    arr = np.empty(length, dtype=object)
                    arr.fill(np.nan)
                else:
                    arr = arrays[k]
                new_arrays.append(arr)

            arrays = new_arrays
            arr_columns = columns

    return arrays, arr_columns


def _get_names_from_index(data) -> Index:
    has_some_name = any(getattr(s, "name", None) is not None for s in data)
    if not has_some_name:
        return default_index(len(data))

    index: list[Hashable] = list(range(len(data)))
    count = 0
    for i, s in enumerate(data):
        n = getattr(s, "name", None)
        if n is not None:
            index[i] = n
        else:
            index[i] = f"Unnamed {count}"
            count += 1

    return Index(index)
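

# Illustrative sketch (added for exposition, not part of the pandas source):
# when building a frame from a list of Series, the Series names become the
# row index, with placeholders for unnamed entries:
#
#   data = [pd.Series([1, 2], name="x"), pd.Series([3, 4])]
#   _get_names_from_index(data)  # -> Index(["x", "Unnamed 0"])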


def _get_axes(
    N: int, K: int, index: Index | None, columns: Index | None
) -> tuple[Index, Index]:
    # helper to create the axes as indexes
    # return axes or defaults

    if index is None:
        index = default_index(N)
    else:
        index = ensure_index(index)

    if columns is None:
        columns = default_index(K)
    else:
        columns = ensure_index(columns)
    return index, columns


def dataclasses_to_dicts(data):
    """
    Converts a list of dataclass instances to a list of dictionaries.

    Parameters
    ----------
    data : List[Type[dataclass]]

    Returns
    -------
    list_dict : List[dict]

    Examples
    --------
    >>> from dataclasses import dataclass
    >>> @dataclass
    ... class Point:
    ...     x: int
    ...     y: int

    >>> dataclasses_to_dicts([Point(1, 2), Point(2, 3)])
    [{'x': 1, 'y': 2}, {'x': 2, 'y': 3}]

    """
    from dataclasses import asdict

    return list(map(asdict, data))


# ---------------------------------------------------------------------
# Conversion of Inputs to Arrays


def to_arrays(
    data, columns: Index | None, dtype: DtypeObj | None = None
) -> tuple[list[ArrayLike], Index]:
    """
    Return list of arrays, columns.

    Returns
    -------
    list[ArrayLike]
        These will become columns in a DataFrame.
    Index
        This will become frame.columns.

    Notes
    -----
    Ensures that len(result_arrays) == len(result_index).
    """

    if not len(data):
        if isinstance(data, np.ndarray):
            if data.dtype.names is not None:
                # i.e. numpy structured array
                columns = ensure_index(data.dtype.names)
                arrays = [data[name] for name in columns]

                if len(data) == 0:
                    # GH#42456 the indexing above results in list of 2D ndarrays
                    # TODO: is that an issue with numpy?
                    for i, arr in enumerate(arrays):
                        if arr.ndim == 2:
                            arrays[i] = arr[:, 0]

                return arrays, columns
        return [], ensure_index([])

    elif isinstance(data, np.ndarray) and data.dtype.names is not None:
        # e.g. recarray
        columns = Index(list(data.dtype.names))
        arrays = [data[k] for k in columns]
        return arrays, columns

    if isinstance(data[0], (list, tuple)):
        arr = _list_to_arrays(data)
    elif isinstance(data[0], abc.Mapping):
        arr, columns = _list_of_dict_to_arrays(data, columns)
    elif isinstance(data[0], ABCSeries):
        arr, columns = _list_of_series_to_arrays(data, columns)
    else:
        # last ditch effort
        data = [tuple(x) for x in data]
        arr = _list_to_arrays(data)

    content, columns = _finalize_columns_and_data(arr, columns, dtype)
    return content, columns


def _list_to_arrays(data: list[tuple | list]) -> np.ndarray:
    # Returned np.ndarray has ndim = 2
    # Note: we already check len(data) > 0 before getting here
    if isinstance(data[0], tuple):
        content = lib.to_object_array_tuples(data)
    else:
        # list of lists
        content = lib.to_object_array(data)
    return content


def _list_of_series_to_arrays(
    data: list,
    columns: Index | None,
) -> tuple[np.ndarray, Index]:
    # returned np.ndarray has ndim == 2

    if columns is None:
        # We know pass_data is non-empty because data[0] is a Series
        pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))]
        columns = get_objs_combined_axis(pass_data, sort=False)

    indexer_cache: dict[int, np.ndarray] = {}

    aligned_values = []
    for s in data:
        index = getattr(s, "index", None)
        if index is None:
            index = default_index(len(s))

        if id(index) in indexer_cache:
            indexer = indexer_cache[id(index)]
        else:
            indexer = indexer_cache[id(index)] = index.get_indexer(columns)

        values = extract_array(s, extract_numpy=True)
        aligned_values.append(algorithms.take_nd(values, indexer))

    content = np.vstack(aligned_values)
    return content, columns


def _list_of_dict_to_arrays(
    data: list[dict],
    columns: Index | None,
) -> tuple[np.ndarray, Index]:
    """
    Convert list of dicts to numpy arrays.

    If `columns` is not passed, column names are inferred from the records:
    - For OrderedDict and dicts, the column names match
      the key insertion-order from the first record to the last.
    - For other kinds of dict-likes, the keys are lexically sorted.

    Parameters
    ----------
    data : iterable
        collection of records (OrderedDict, dict)
    columns : Index or None

    Returns
    -------
    content : np.ndarray[object, ndim=2]
    columns : Index
    """
    if columns is None:
        gen = (list(x.keys()) for x in data)
        sort = not any(isinstance(d, dict) for d in data)
        pre_cols = lib.fast_unique_multiple_list_gen(gen, sort=sort)
        columns = ensure_index(pre_cols)

    # assure that they are of the base dict class and not of derived
    # classes
    data = [d if type(d) is dict else dict(d) for d in data]  # noqa: E721

    content = lib.dicts_to_array(data, list(columns))
    return content, columns
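

# Illustrative sketch (added for exposition, not part of the pandas source):
# with plain dicts, inferred columns preserve first-seen key order (sorting
# only applies to non-dict mappings):
#
#   recs = [{"b": 1, "a": 2}, {"a": 3, "c": 4}]
#   _, cols = _list_of_dict_to_arrays(recs, None)
#   # cols -> Index(["b", "a", "c"])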


def _finalize_columns_and_data(
    content: np.ndarray,  # ndim == 2
    columns: Index | None,
    dtype: DtypeObj | None,
) -> tuple[list[ArrayLike], Index]:
    """
    Ensure we have valid columns, cast object dtypes if possible.
    """
    contents = list(content.T)

    try:
        columns = _validate_or_indexify_columns(contents, columns)
    except AssertionError as err:
        # GH#26429 do not raise user-facing AssertionError
        raise ValueError(err) from err

    if len(contents) and contents[0].dtype == np.object_:
        contents = convert_object_array(contents, dtype=dtype)

    return contents, columns


def _validate_or_indexify_columns(
    content: list[np.ndarray], columns: Index | None
) -> Index:
    """
    If columns is None, assign positional integer column names; otherwise,
    validate that columns have the correct length.

    Parameters
    ----------
    content : list of np.ndarrays
    columns : Index or None

    Returns
    -------
    Index
        If columns is None, a default positional index is assigned as columns.

    Raises
    ------
    1. AssertionError when content is not composed of list of lists, and
       length of columns is not equal to length of content.
    2. ValueError when content is list of lists, but length of each sub-list
       is not equal.
    3. ValueError when content is list of lists, but length of sub-list is
       not equal to length of content.
    """
    if columns is None:
        columns = default_index(len(content))
    else:
        # Add mask for data which is composed of list of lists
        is_mi_list = isinstance(columns, list) and all(
            isinstance(col, list) for col in columns
        )

        if not is_mi_list and len(columns) != len(content):  # pragma: no cover
            # caller's responsibility to check for this...
            raise AssertionError(
                f"{len(columns)} columns passed, passed data had "
                f"{len(content)} columns"
            )
        if is_mi_list:
            # check if nested list column, length of each sub-list should be equal
            if len({len(col) for col in columns}) > 1:
                raise ValueError(
                    "Length of columns passed for MultiIndex columns is different"
                )

            # if columns is not empty and length of sublist is not equal to content
            if columns and len(columns[0]) != len(content):
                raise ValueError(
                    f"{len(columns[0])} columns passed, passed data had "
                    f"{len(content)} columns"
                )
    return columns
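

# Illustrative sketch (added for exposition, not part of the pandas source):
# the AssertionError above is translated into a user-facing ValueError by
# _finalize_columns_and_data (GH#26429):
#
#   pd.DataFrame([[1, 2]], columns=["a"])
#   # ValueError: 1 columns passed, passed data had 2 columns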


def convert_object_array(
    content: list[npt.NDArray[np.object_]],
    dtype: DtypeObj | None,
    dtype_backend: str = "numpy",
    coerce_float: bool = False,
) -> list[ArrayLike]:
    """
    Internal function to convert object array.

    Parameters
    ----------
    content : List[np.ndarray]
    dtype : np.dtype or ExtensionDtype
    dtype_backend : Controls if nullable/pyarrow dtypes are returned.
    coerce_float : Cast floats that are integers to int.

    Returns
    -------
    List[ArrayLike]
    """
    # provide soft conversion of object dtypes

    def convert(arr):
        if dtype != np.dtype("O"):
            arr = lib.maybe_convert_objects(
                arr,
                try_float=coerce_float,
                convert_to_nullable_dtype=dtype_backend != "numpy",
            )
            # Notes on cases that get here 2023-02-15
            # 1) we DO get here when arr is all Timestamps and dtype=None
            # 2) disabling this doesn't break the world, so this must be
            #    getting caught at a higher level
            # 3) passing convert_non_numeric to maybe_convert_objects gets
            #    this right
            # 4) convert_non_numeric?

        if dtype is None:
            if arr.dtype == np.dtype("O"):
                # i.e. maybe_convert_objects didn't convert
                arr = maybe_infer_to_datetimelike(arr)
                if dtype_backend != "numpy" and arr.dtype == np.dtype("O"):
                    new_dtype = StringDtype()
                    arr_cls = new_dtype.construct_array_type()
                    arr = arr_cls._from_sequence(arr, dtype=new_dtype)
            elif dtype_backend != "numpy" and isinstance(arr, np.ndarray):
                if arr.dtype.kind in "iufb":
                    arr = pd_array(arr, copy=False)

        elif isinstance(dtype, ExtensionDtype):
            # TODO: test(s) that get here
            # TODO: try to de-duplicate this convert function with
            # core.construction functions
            cls = dtype.construct_array_type()
            arr = cls._from_sequence(arr, dtype=dtype, copy=False)
        elif dtype.kind in "mM":
            # This restriction is harmless bc these are the only cases
            # where maybe_cast_to_datetime is not a no-op.
            # Here we know:
            # 1) dtype.kind in "mM" and
            # 2) arr is either object or numeric dtype
            arr = maybe_cast_to_datetime(arr, dtype)

        return arr

    arrays = [convert(arr) for arr in content]

    return arrays
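

# Illustrative sketch (added for exposition, not part of the pandas source):
# soft conversion of object-dtype columns when no explicit dtype is given:
#
#   arrs = [np.array([1, 2, 3], dtype=object)]
#   convert_object_array(arrs, dtype=None)
#   # -> [array([1, 2, 3])] with int64 dtype, via lib.maybe_convert_objects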