"""
Functions for preparing various inputs passed to the DataFrame or Series
constructors before passing them to a BlockManager.
"""
from __future__ import annotations

from collections import abc
from typing import (
    TYPE_CHECKING,
    Any,
)

import numpy as np
from numpy import ma

from pandas._config import using_pyarrow_string_dtype

from pandas._libs import lib

from pandas.core.dtypes.astype import astype_is_view
from pandas.core.dtypes.cast import (
    construct_1d_arraylike_from_scalar,
    dict_compat,
    maybe_cast_to_datetime,
    maybe_convert_platform,
    maybe_infer_to_datetimelike,
)
from pandas.core.dtypes.common import (
    is_1d_only_ea_dtype,
    is_integer_dtype,
    is_list_like,
    is_named_tuple,
    is_object_dtype,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCSeries,
)

from pandas.core import (
    algorithms,
    common as com,
)
from pandas.core.arrays import ExtensionArray
from pandas.core.arrays.string_ import StringDtype
from pandas.core.construction import (
    array as pd_array,
    ensure_wrapped_if_datetimelike,
    extract_array,
    range_to_ndarray,
    sanitize_array,
)
from pandas.core.indexes.api import (
    DatetimeIndex,
    Index,
    TimedeltaIndex,
    default_index,
    ensure_index,
    get_objs_combined_axis,
    union_indexes,
)
from pandas.core.internals.array_manager import (
    ArrayManager,
    SingleArrayManager,
)
from pandas.core.internals.blocks import (
    BlockPlacement,
    ensure_block_shape,
    new_block,
    new_block_2d,
)
from pandas.core.internals.managers import (
    BlockManager,
    SingleBlockManager,
    create_block_manager_from_blocks,
    create_block_manager_from_column_arrays,
)

if TYPE_CHECKING:
    from collections.abc import (
        Hashable,
        Sequence,
    )

    from pandas._typing import (
        ArrayLike,
        DtypeObj,
        Manager,
        npt,
    )

# ---------------------------------------------------------------------
# BlockManager Interface


def arrays_to_mgr(
    arrays,
    columns: Index,
    index,
    *,
    dtype: DtypeObj | None = None,
    verify_integrity: bool = True,
    typ: str | None = None,
    consolidate: bool = True,
) -> Manager:
    """
    Segregate Series based on type and coerce into matrices.

    Needs to handle a lot of exceptional cases.
    """
    if verify_integrity:
        # figure out the index, if necessary
        if index is None:
            index = _extract_index(arrays)
        else:
            index = ensure_index(index)

        # don't force a copy; the values get stuffed into an ndarray anyway
        arrays, refs = _homogenize(arrays, index, dtype)
        # _homogenize ensures
        #  - all(len(x) == len(index) for x in arrays)
        #  - all(x.ndim == 1 for x in arrays)
        #  - all(isinstance(x, (np.ndarray, ExtensionArray)) for x in arrays)
        #  - all(type(x) is not NumpyExtensionArray for x in arrays)

    else:
        index = ensure_index(index)
        arrays = [extract_array(x, extract_numpy=True) for x in arrays]
        # with _from_arrays, the passed arrays should never be Series objects
        refs = [None] * len(arrays)

        # Reached via DataFrame._from_arrays; we do minimal validation here
        for arr in arrays:
            if (
                not isinstance(arr, (np.ndarray, ExtensionArray))
                or arr.ndim != 1
                or len(arr) != len(index)
            ):
                raise ValueError(
                    "Arrays must be 1-dimensional np.ndarray or ExtensionArray "
                    "with length matching len(index)"
                )

    columns = ensure_index(columns)
    if len(columns) != len(arrays):
        raise ValueError("len(arrays) must match len(columns)")

    # from BlockManager perspective
    axes = [columns, index]

    if typ == "block":
        return create_block_manager_from_column_arrays(
            arrays, axes, consolidate=consolidate, refs=refs
        )
    elif typ == "array":
        return ArrayManager(arrays, [index, columns])
    else:
        raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
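# Illustrative sketch of calling arrays_to_mgr (above). Hedged: this is
# internal pandas API and may change between versions.
#
#   >>> import numpy as np
#   >>> from pandas import Index
#   >>> mgr = arrays_to_mgr(
#   ...     [np.array([1, 2]), np.array([3.0, 4.0])],
#   ...     Index(["a", "b"]),
#   ...     Index([0, 1]),
#   ...     typ="block",
#   ... )
#   >>> mgr.shape  # manager axes are [columns, index]
#   (2, 2)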

def rec_array_to_mgr(
    data: np.rec.recarray | np.ndarray,
    index,
    columns,
    dtype: DtypeObj | None,
    copy: bool,
    typ: str,
) -> Manager:
    """
    Extract from a masked rec array and create the manager.
    """
    # essentially process a record array then fill it
    fdata = ma.getdata(data)
    if index is None:
        index = default_index(len(fdata))
    else:
        index = ensure_index(index)

    if columns is not None:
        columns = ensure_index(columns)
    arrays, arr_columns = to_arrays(fdata, columns)

    # create the manager

    arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, len(index))
    if columns is None:
        columns = arr_columns

    mgr = arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ)

    if copy:
        mgr = mgr.copy()
    return mgr
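# Illustrative sketch of rec_array_to_mgr (above); hedged, internal API:
#
#   >>> import numpy as np
#   >>> rec = np.rec.fromarrays([[1, 2], ["x", "y"]], names=["a", "b"])
#   >>> mgr = rec_array_to_mgr(rec, None, None, None, copy=False, typ="block")
#   >>> mgr.shape
#   (2, 2)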

def mgr_to_mgr(mgr, typ: str, copy: bool = True) -> Manager:
    """
    Convert to a specific type of Manager. Does not copy if the type is
    already correct; otherwise a copy is not guaranteed either. The `copy`
    keyword only controls whether the Block->ArrayManager conversion copies
    the 1D arrays.
    """
    new_mgr: Manager

    if typ == "block":
        if isinstance(mgr, BlockManager):
            new_mgr = mgr
        else:
            if mgr.ndim == 2:
                new_mgr = arrays_to_mgr(
                    mgr.arrays, mgr.axes[0], mgr.axes[1], typ="block"
                )
            else:
                new_mgr = SingleBlockManager.from_array(mgr.arrays[0], mgr.index)
    elif typ == "array":
        if isinstance(mgr, ArrayManager):
            new_mgr = mgr
        else:
            if mgr.ndim == 2:
                arrays = [mgr.iget_values(i) for i in range(len(mgr.axes[0]))]
                if copy:
                    arrays = [arr.copy() for arr in arrays]
                new_mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]])
            else:
                array = mgr.internal_values()
                if copy:
                    array = array.copy()
                new_mgr = SingleArrayManager([array], [mgr.index])
    else:
        raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
    return new_mgr
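# Illustrative sketch of mgr_to_mgr (above); hedged, relies on the private
# DataFrame._mgr attribute:
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({"a": [1, 2]})
#   >>> type(mgr_to_mgr(df._mgr, typ="array")).__name__
#   'ArrayManager'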

# ---------------------------------------------------------------------
# DataFrame Constructor Interface


def ndarray_to_mgr(
    values, index, columns, dtype: DtypeObj | None, copy: bool, typ: str
) -> Manager:
    # used in DataFrame.__init__
    # input must be a ndarray, list, Series, Index, ExtensionArray

    if isinstance(values, ABCSeries):
        if columns is None:
            if values.name is not None:
                columns = Index([values.name])
        if index is None:
            index = values.index
        else:
            values = values.reindex(index)

        # zero len case (GH #2234)
        if not len(values) and columns is not None and len(columns):
            values = np.empty((0, 1), dtype=object)

    # if the array preparation does a copy -> avoid this for ArrayManager,
    # since the copy is done on conversion to 1D arrays
    copy_on_sanitize = False if typ == "array" else copy

    vdtype = getattr(values, "dtype", None)
    refs = None
    if is_1d_only_ea_dtype(vdtype) or is_1d_only_ea_dtype(dtype):
        # GH#19157

        if isinstance(values, (np.ndarray, ExtensionArray)) and values.ndim > 1:
            # GH#12513 an EA dtype passed with a 2D array; split into
            # multiple EAs that view the values
            # error: No overload variant of "__getitem__" of "ExtensionArray"
            # matches argument type "Tuple[slice, int]"
            values = [
                values[:, n]  # type: ignore[call-overload]
                for n in range(values.shape[1])
            ]
        else:
            values = [values]

        if columns is None:
            columns = Index(range(len(values)))
        else:
            columns = ensure_index(columns)

        return arrays_to_mgr(values, columns, index, dtype=dtype, typ=typ)

    elif isinstance(vdtype, ExtensionDtype):
        # i.e. Datetime64TZ, PeriodDtype; cases with is_1d_only_ea_dtype(vdtype)
        # are already caught above
        values = extract_array(values, extract_numpy=True)
        if copy:
            values = values.copy()
        if values.ndim == 1:
            values = values.reshape(-1, 1)

    elif isinstance(values, (ABCSeries, Index)):
        if not copy_on_sanitize and (
            dtype is None or astype_is_view(values.dtype, dtype)
        ):
            refs = values._references

        if copy_on_sanitize:
            values = values._values.copy()
        else:
            values = values._values

        values = _ensure_2d(values)

    elif isinstance(values, (np.ndarray, ExtensionArray)):
        # drop subclass info
        _copy = (
            copy_on_sanitize
            if (dtype is None or astype_is_view(values.dtype, dtype))
            else False
        )
        values = np.array(values, copy=_copy)
        values = _ensure_2d(values)

    else:
        # by definition an array here
        # the dtypes will be coerced to a single dtype
        values = _prep_ndarraylike(values, copy=copy_on_sanitize)

    if dtype is not None and values.dtype != dtype:
        # GH#40110 see similar check inside sanitize_array
        values = sanitize_array(
            values,
            None,
            dtype=dtype,
            copy=copy_on_sanitize,
            allow_2d=True,
        )

    # _prep_ndarraylike ensures that values.ndim == 2 at this point
    index, columns = _get_axes(
        values.shape[0], values.shape[1], index=index, columns=columns
    )

    _check_values_indices_shape_match(values, index, columns)

    if typ == "array":
        if issubclass(values.dtype.type, str):
            values = np.array(values, dtype=object)

        if dtype is None and is_object_dtype(values.dtype):
            arrays = [
                ensure_wrapped_if_datetimelike(
                    maybe_infer_to_datetimelike(values[:, i])
                )
                for i in range(values.shape[1])
            ]
        else:
            if lib.is_np_dtype(values.dtype, "mM"):
                values = ensure_wrapped_if_datetimelike(values)
            arrays = [values[:, i] for i in range(values.shape[1])]

        if copy:
            arrays = [arr.copy() for arr in arrays]

        return ArrayManager(arrays, [index, columns], verify_integrity=False)

    values = values.T

    # if we don't have a dtype specified, then try to convert objects
    # on the entire block; this is to convert if we have datetimelike
    # values embedded in an object type
    if dtype is None and is_object_dtype(values.dtype):
        obj_columns = list(values)
        maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns]
        # don't convert (and copy) the objects if no type inference occurs
        if any(x is not y for x, y in zip(obj_columns, maybe_datetime)):
            dvals_list = [ensure_block_shape(dval, 2) for dval in maybe_datetime]
            block_values = [
                new_block_2d(dvals_list[n], placement=BlockPlacement(n))
                for n in range(len(dvals_list))
            ]
        else:
            bp = BlockPlacement(slice(len(columns)))
            nb = new_block_2d(values, placement=bp, refs=refs)
            block_values = [nb]
    elif dtype is None and values.dtype.kind == "U" and using_pyarrow_string_dtype():
        dtype = StringDtype(storage="pyarrow_numpy")

        obj_columns = list(values)
        block_values = [
            new_block(
                dtype.construct_array_type()._from_sequence(data, dtype=dtype),
                BlockPlacement(slice(i, i + 1)),
                ndim=2,
            )
            for i, data in enumerate(obj_columns)
        ]

    else:
        bp = BlockPlacement(slice(len(columns)))
        nb = new_block_2d(values, placement=bp, refs=refs)
        block_values = [nb]

    if len(columns) == 0:
        # TODO: check len(values) == 0?
        block_values = []

    return create_block_manager_from_blocks(
        block_values, [columns, index], verify_integrity=False
    )
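# Illustrative sketch of ndarray_to_mgr (above); hedged, internal API:
#
#   >>> import numpy as np
#   >>> vals = np.arange(6).reshape(3, 2)
#   >>> mgr = ndarray_to_mgr(vals, None, None, None, copy=False, typ="block")
#   >>> mgr.shape  # (n_columns, n_rows)
#   (2, 3)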

def _check_values_indices_shape_match(
    values: np.ndarray, index: Index, columns: Index
) -> None:
    """
    Check that the shape implied by our axes matches the actual shape of the
    data.
    """
    if values.shape[1] != len(columns) or values.shape[0] != len(index):
        # Could let this raise in Block constructor, but we get a more
        # helpful exception message this way.
        if values.shape[0] == 0 < len(index):
            raise ValueError("Empty data passed with indices specified.")

        passed = values.shape
        implied = (len(index), len(columns))
        raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")
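# Illustrative behavior of _check_values_indices_shape_match (above); hedged:
#
#   >>> import numpy as np
#   >>> from pandas import Index
#   >>> _check_values_indices_shape_match(
#   ...     np.zeros((2, 1)), Index([0, 1]), Index(["a"])
#   ... )  # matching shape: returns None
#   >>> _check_values_indices_shape_match(
#   ...     np.zeros((2, 2)), Index([0, 1]), Index(["a"])
#   ... )  # mismatch raises
#   Traceback (most recent call last):
#       ...
#   ValueError: Shape of passed values is (2, 2), indices imply (2, 1)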

def dict_to_mgr(
    data: dict,
    index,
    columns,
    *,
    dtype: DtypeObj | None = None,
    typ: str = "block",
    copy: bool = True,
) -> Manager:
    """
    Segregate Series based on type and coerce into matrices.
    Needs to handle a lot of exceptional cases.

    Used in DataFrame.__init__
    """
    arrays: Sequence[Any] | Series

    if columns is not None:
        from pandas.core.series import Series

        arrays = Series(data, index=columns, dtype=object)
        missing = arrays.isna()
        if index is None:
            # GH10856
            # raise ValueError if only scalars in dict
            index = _extract_index(arrays[~missing])
        else:
            index = ensure_index(index)

        # no obvious "empty" int column
        if missing.any() and not is_integer_dtype(dtype):
            nan_dtype: DtypeObj

            if dtype is not None:
                # calling sanitize_array ensures we don't mix-and-match
                # NA dtypes
                midxs = missing.values.nonzero()[0]
                for i in midxs:
                    arr = sanitize_array(arrays.iat[i], index, dtype=dtype)
                    arrays.iat[i] = arr
            else:
                # GH#1783
                nan_dtype = np.dtype("object")
                val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype)
                nmissing = missing.sum()
                if copy:
                    rhs = [val] * nmissing
                else:
                    # GH#45369
                    rhs = [val.copy() for _ in range(nmissing)]
                arrays.loc[missing] = rhs

        arrays = list(arrays)
        columns = ensure_index(columns)

    else:
        keys = list(data.keys())
        columns = Index(keys) if keys else default_index(0)
        arrays = [com.maybe_iterable_to_list(data[k]) for k in keys]

    if copy:
        if typ == "block":
            # We only need to copy arrays that will not get consolidated, i.e.
            # only EA arrays
            arrays = [
                x.copy()
                if isinstance(x, ExtensionArray)
                else x.copy(deep=True)
                if (
                    isinstance(x, Index)
                    or isinstance(x, ABCSeries)
                    and is_1d_only_ea_dtype(x.dtype)
                )
                else x
                for x in arrays
            ]
        else:
            # dtype check to exclude e.g. range objects, scalars
            arrays = [x.copy() if hasattr(x, "dtype") else x for x in arrays]

    return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
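# Illustrative sketch of dict_to_mgr (above); hedged, internal API:
#
#   >>> mgr = dict_to_mgr({"a": [1, 2], "b": [3.0, 4.0]}, None, None)
#   >>> mgr.shape
#   (2, 2)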

def nested_data_to_arrays(
    data: Sequence,
    columns: Index | None,
    index: Index | None,
    dtype: DtypeObj | None,
) -> tuple[list[ArrayLike], Index, Index]:
    """
    Convert a single sequence of arrays to multiple arrays.
    """
    # By the time we get here we have already checked treat_as_nested(data)

    if is_named_tuple(data[0]) and columns is None:
        columns = ensure_index(data[0]._fields)

    arrays, columns = to_arrays(data, columns, dtype=dtype)
    columns = ensure_index(columns)

    if index is None:
        if isinstance(data[0], ABCSeries):
            index = _get_names_from_index(data)
        else:
            index = default_index(len(data))

    return arrays, columns, index
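# Illustrative sketch of nested_data_to_arrays (above); hedged:
#
#   >>> arrays, cols, idx = nested_data_to_arrays(
#   ...     [(1, "x"), (2, "y")], None, None, None
#   ... )
#   >>> cols
#   RangeIndex(start=0, stop=2, step=1)
#   >>> idx
#   RangeIndex(start=0, stop=2, step=1)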

def treat_as_nested(data) -> bool:
    """
    Check if we should use nested_data_to_arrays.
    """
    return (
        len(data) > 0
        and is_list_like(data[0])
        and getattr(data[0], "ndim", 1) == 1
        and not (isinstance(data, ExtensionArray) and data.ndim == 2)
    )
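# Illustrative behavior of treat_as_nested (above); hedged:
#
#   >>> treat_as_nested([[1, 2], [3, 4]])
#   True
#   >>> treat_as_nested([1, 2, 3])
#   False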

# ---------------------------------------------------------------------


def _prep_ndarraylike(values, copy: bool = True) -> np.ndarray:
    # values is specifically _not_ ndarray, EA, Index, or Series
    # We only get here with `not treat_as_nested(values)`

    if len(values) == 0:
        # TODO: check for length-zero range, in which case return int64 dtype?
        # TODO: reuse anything in try_cast?
        return np.empty((0, 0), dtype=object)
    elif isinstance(values, range):
        arr = range_to_ndarray(values)
        return arr[..., np.newaxis]

    def convert(v):
        if not is_list_like(v) or isinstance(v, ABCDataFrame):
            return v

        v = extract_array(v, extract_numpy=True)
        res = maybe_convert_platform(v)
        # We don't do maybe_infer_to_datetimelike here bc we will end up doing
        # it column-by-column in ndarray_to_mgr
        return res

    # we could have a 1-dim or 2-dim list here
    # this is equiv of np.asarray, but does object conversion
    # and platform dtype preservation
    # does not convert e.g. [1, "a", True] to ["1", "a", "True"] like
    # np.asarray would
    if is_list_like(values[0]):
        values = np.array([convert(v) for v in values])
    elif isinstance(values[0], np.ndarray) and values[0].ndim == 0:
        # GH#21861 see test_constructor_list_of_lists
        values = np.array([convert(v) for v in values])
    else:
        values = convert(values)

    return _ensure_2d(values)
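# Illustrative behavior of _prep_ndarraylike (above); hedged:
#
#   >>> _prep_ndarraylike(range(3), copy=False).shape
#   (3, 1)
#   >>> _prep_ndarraylike([[1, 2], [3, 4]], copy=False).shape
#   (2, 2)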

def _ensure_2d(values: np.ndarray) -> np.ndarray:
    """
    Reshape 1D values to 2D; raise on anything other than 1D or 2D input.
    """
    if values.ndim == 1:
        values = values.reshape((values.shape[0], 1))
    elif values.ndim != 2:
        raise ValueError(f"Must pass 2-d input. shape={values.shape}")
    return values
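# Illustrative behavior of _ensure_2d (above); hedged:
#
#   >>> import numpy as np
#   >>> _ensure_2d(np.array([1, 2, 3])).shape
#   (3, 1)
#   >>> _ensure_2d(np.zeros((2, 2, 2)))
#   Traceback (most recent call last):
#       ...
#   ValueError: Must pass 2-d input. shape=(2, 2, 2)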

def _homogenize(
    data, index: Index, dtype: DtypeObj | None
) -> tuple[list[ArrayLike], list[Any]]:
    oindex = None
    homogenized = []
    # if the original array-like in `data` is a Series, keep track of this Series' refs
    refs: list[Any] = []

    for val in data:
        if isinstance(val, (ABCSeries, Index)):
            if dtype is not None:
                val = val.astype(dtype, copy=False)
            if isinstance(val, ABCSeries) and val.index is not index:
                # Forces alignment. No need to copy data since we
                # are putting it into an ndarray later
                val = val.reindex(index, copy=False)
            refs.append(val._references)
            val = val._values
        else:
            if isinstance(val, dict):
                # GH#41785 this _should_ be equivalent to (but faster than)
                # val = Series(val, index=index)._values
                if oindex is None:
                    oindex = index.astype("O")

                if isinstance(index, (DatetimeIndex, TimedeltaIndex)):
                    # see test_constructor_dict_datetime64_index
                    val = dict_compat(val)
                else:
                    # see test_constructor_subclass_dict
                    val = dict(val)
                val = lib.fast_multiget(val, oindex._values, default=np.nan)

            val = sanitize_array(val, index, dtype=dtype, copy=False)
            com.require_length_match(val, index)
            refs.append(None)

        homogenized.append(val)

    return homogenized, refs
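# Illustrative sketch of _homogenize (above); hedged, internal API:
#
#   >>> from pandas import Index, Series
#   >>> idx = Index(["a", "b"])
#   >>> vals, refs = _homogenize([Series([1, 2], index=idx), {"a": 10}], idx, None)
#   >>> [v.shape for v in vals]  # both columns aligned to the index
#   [(2,), (2,)]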

def _extract_index(data) -> Index:
    """
    Try to infer an Index from the passed data; raise ValueError on failure.
    """
    index: Index
    if len(data) == 0:
        return default_index(0)

    raw_lengths = []
    indexes: list[list[Hashable] | Index] = []

    have_raw_arrays = False
    have_series = False
    have_dicts = False

    for val in data:
        if isinstance(val, ABCSeries):
            have_series = True
            indexes.append(val.index)
        elif isinstance(val, dict):
            have_dicts = True
            indexes.append(list(val.keys()))
        elif is_list_like(val) and getattr(val, "ndim", 1) == 1:
            have_raw_arrays = True
            raw_lengths.append(len(val))
        elif isinstance(val, np.ndarray) and val.ndim > 1:
            raise ValueError("Per-column arrays must each be 1-dimensional")

    if not indexes and not raw_lengths:
        raise ValueError("If using all scalar values, you must pass an index")

    if have_series:
        index = union_indexes(indexes)
    elif have_dicts:
        index = union_indexes(indexes, sort=False)

    if have_raw_arrays:
        lengths = list(set(raw_lengths))
        if len(lengths) > 1:
            raise ValueError("All arrays must be of the same length")

        if have_dicts:
            raise ValueError(
                "Mixing dicts with non-Series may lead to ambiguous ordering."
            )

        if have_series:
            if lengths[0] != len(index):
                msg = (
                    f"array length {lengths[0]} does not match index "
                    f"length {len(index)}"
                )
                raise ValueError(msg)
        else:
            index = default_index(lengths[0])

    return ensure_index(index)
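# Illustrative behavior of _extract_index (above); hedged:
#
#   >>> import numpy as np
#   >>> _extract_index([np.array([1, 2, 3]), np.array([4, 5, 6])])
#   RangeIndex(start=0, stop=3, step=1)
#   >>> _extract_index([1, 2])  # all scalars -> error
#   Traceback (most recent call last):
#       ...
#   ValueError: If using all scalar values, you must pass an index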

def reorder_arrays(
    arrays: list[ArrayLike], arr_columns: Index, columns: Index | None, length: int
) -> tuple[list[ArrayLike], Index]:
    """
    Pre-emptively (cheaply) reindex arrays with new columns.
    """
    # reorder according to the columns
    if columns is not None:
        if not columns.equals(arr_columns):
            # if they are equal, there is nothing to do
            new_arrays: list[ArrayLike] = []
            indexer = arr_columns.get_indexer(columns)
            for i, k in enumerate(indexer):
                if k == -1:
                    # by convention default is all-NaN object dtype
                    arr = np.empty(length, dtype=object)
                    arr.fill(np.nan)
                else:
                    arr = arrays[k]
                new_arrays.append(arr)

            arrays = new_arrays
            arr_columns = columns

    return arrays, arr_columns
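# Illustrative behavior of reorder_arrays (above); hedged:
#
#   >>> import numpy as np
#   >>> from pandas import Index
#   >>> arrs = [np.array([1, 2]), np.array([3, 4])]
#   >>> new, cols = reorder_arrays(arrs, Index(["a", "b"]), Index(["b", "a"]), 2)
#   >>> new[0]
#   array([3, 4])
#   >>> cols
#   Index(['b', 'a'], dtype='object')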

def _get_names_from_index(data) -> Index:
    has_some_name = any(getattr(s, "name", None) is not None for s in data)
    if not has_some_name:
        return default_index(len(data))

    index: list[Hashable] = list(range(len(data)))
    count = 0
    for i, s in enumerate(data):
        n = getattr(s, "name", None)
        if n is not None:
            index[i] = n
        else:
            index[i] = f"Unnamed {count}"
            count += 1

    return Index(index)
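# Illustrative behavior of _get_names_from_index (above); hedged:
#
#   >>> from pandas import Series
#   >>> _get_names_from_index(
#   ...     [Series(dtype=object, name="a"), Series(dtype=object)]
#   ... )
#   Index(['a', 'Unnamed 0'], dtype='object')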

def _get_axes(
    N: int, K: int, index: Index | None, columns: Index | None
) -> tuple[Index, Index]:
    # helper to create the axes as indexes
    # return axes or defaults

    if index is None:
        index = default_index(N)
    else:
        index = ensure_index(index)

    if columns is None:
        columns = default_index(K)
    else:
        columns = ensure_index(columns)
    return index, columns
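# Illustrative behavior of _get_axes (above); hedged:
#
#   >>> _get_axes(2, 3, index=None, columns=None)
#   (RangeIndex(start=0, stop=2, step=1), RangeIndex(start=0, stop=3, step=1))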

def dataclasses_to_dicts(data):
    """
    Converts a list of dataclass instances to a list of dictionaries.

    Parameters
    ----------
    data : List[Type[dataclass]]

    Returns
    -------
    list_dict : List[dict]

    Examples
    --------
    >>> from dataclasses import dataclass
    >>> @dataclass
    ... class Point:
    ...     x: int
    ...     y: int

    >>> dataclasses_to_dicts([Point(1, 2), Point(2, 3)])
    [{'x': 1, 'y': 2}, {'x': 2, 'y': 3}]

    """
    from dataclasses import asdict

    return list(map(asdict, data))

# ---------------------------------------------------------------------
# Conversion of Inputs to Arrays


def to_arrays(
    data, columns: Index | None, dtype: DtypeObj | None = None
) -> tuple[list[ArrayLike], Index]:
    """
    Return list of arrays, columns.

    Returns
    -------
    list[ArrayLike]
        These will become columns in a DataFrame.
    Index
        This will become frame.columns.

    Notes
    -----
    Ensures that len(result_arrays) == len(result_index).
    """

    if not len(data):
        if isinstance(data, np.ndarray):
            if data.dtype.names is not None:
                # i.e. numpy structured array
                columns = ensure_index(data.dtype.names)
                arrays = [data[name] for name in columns]

                if len(data) == 0:
                    # GH#42456 the indexing above results in list of 2D ndarrays
                    # TODO: is that an issue with numpy?
                    for i, arr in enumerate(arrays):
                        if arr.ndim == 2:
                            arrays[i] = arr[:, 0]

                return arrays, columns
        return [], ensure_index([])

    elif isinstance(data, np.ndarray) and data.dtype.names is not None:
        # e.g. recarray
        columns = Index(list(data.dtype.names))
        arrays = [data[k] for k in columns]
        return arrays, columns

    if isinstance(data[0], (list, tuple)):
        arr = _list_to_arrays(data)
    elif isinstance(data[0], abc.Mapping):
        arr, columns = _list_of_dict_to_arrays(data, columns)
    elif isinstance(data[0], ABCSeries):
        arr, columns = _list_of_series_to_arrays(data, columns)
    else:
        # last ditch effort
        data = [tuple(x) for x in data]
        arr = _list_to_arrays(data)

    content, columns = _finalize_columns_and_data(arr, columns, dtype)
    return content, columns
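# Illustrative sketch of to_arrays (above); hedged (int reprs may vary by
# platform):
#
#   >>> arrs, cols = to_arrays([(1, "x"), (2, "y")], None)
#   >>> cols
#   RangeIndex(start=0, stop=2, step=1)
#   >>> arrs[0]
#   array([1, 2])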

def _list_to_arrays(data: list[tuple | list]) -> np.ndarray:
    # Returned np.ndarray has ndim == 2
    # Note: we already check len(data) > 0 before getting here
    if isinstance(data[0], tuple):
        content = lib.to_object_array_tuples(data)
    else:
        # list of lists
        content = lib.to_object_array(data)
    return content
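# Illustrative behavior of _list_to_arrays (above); hedged:
#
#   >>> _list_to_arrays([(1, 2), (3, 4)]).shape
#   (2, 2)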

def _list_of_series_to_arrays(
    data: list,
    columns: Index | None,
) -> tuple[np.ndarray, Index]:
    # returned np.ndarray has ndim == 2

    if columns is None:
        # We know pass_data is non-empty because data[0] is a Series
        pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))]
        columns = get_objs_combined_axis(pass_data, sort=False)

    indexer_cache: dict[int, np.ndarray] = {}

    aligned_values = []
    for s in data:
        index = getattr(s, "index", None)
        if index is None:
            index = default_index(len(s))

        if id(index) in indexer_cache:
            indexer = indexer_cache[id(index)]
        else:
            indexer = indexer_cache[id(index)] = index.get_indexer(columns)

        values = extract_array(s, extract_numpy=True)
        aligned_values.append(algorithms.take_nd(values, indexer))

    content = np.vstack(aligned_values)
    return content, columns
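# Illustrative sketch of _list_of_series_to_arrays (above); hedged:
#
#   >>> from pandas import Series
#   >>> s1 = Series([1, 2], index=["a", "b"])
#   >>> s2 = Series([3, 4], index=["a", "b"])
#   >>> content, cols = _list_of_series_to_arrays([s1, s2], None)
#   >>> content.shape
#   (2, 2)
#   >>> cols
#   Index(['a', 'b'], dtype='object')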

def _list_of_dict_to_arrays(
    data: list[dict],
    columns: Index | None,
) -> tuple[np.ndarray, Index]:
    """
    Convert list of dicts to numpy arrays.

    If `columns` is not passed, column names are inferred from the records:
    - for OrderedDict and plain dicts, the column names match
      the key insertion order, from the first record to the last;
    - for other kinds of dict-likes, the keys are lexically sorted.

    Parameters
    ----------
    data : iterable
        collection of records (OrderedDict, dict)
    columns : iterable or None

    Returns
    -------
    content : np.ndarray[object, ndim=2]
    columns : Index
    """
    if columns is None:
        gen = (list(x.keys()) for x in data)
        sort = not any(isinstance(d, dict) for d in data)
        pre_cols = lib.fast_unique_multiple_list_gen(gen, sort=sort)
        columns = ensure_index(pre_cols)

    # ensure that they are of the base dict class and not of derived
    # classes
    data = [d if type(d) is dict else dict(d) for d in data]  # noqa: E721

    content = lib.dicts_to_array(data, list(columns))
    return content, columns
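# Illustrative behavior of _list_of_dict_to_arrays (above); hedged:
#
#   >>> content, cols = _list_of_dict_to_arrays([{"a": 1}, {"b": 2}], None)
#   >>> cols
#   Index(['a', 'b'], dtype='object')
#   >>> content.shape
#   (2, 2)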

def _finalize_columns_and_data(
    content: np.ndarray,  # ndim == 2
    columns: Index | None,
    dtype: DtypeObj | None,
) -> tuple[list[ArrayLike], Index]:
    """
    Ensure we have valid columns, cast object dtypes if possible.
    """
    contents = list(content.T)

    try:
        columns = _validate_or_indexify_columns(contents, columns)
    except AssertionError as err:
        # GH#26429 do not raise user-facing AssertionError
        raise ValueError(err) from err

    if len(contents) and contents[0].dtype == np.object_:
        contents = convert_object_array(contents, dtype=dtype)

    return contents, columns

def _validate_or_indexify_columns(
    content: list[np.ndarray], columns: Index | None
) -> Index:
    """
    If columns is None, use positional integers as column names; otherwise,
    validate that columns have the right length.

    Parameters
    ----------
    content : list of np.ndarrays
    columns : Index or None

    Returns
    -------
    Index
        If columns is None, a default index of positional column labels.

    Raises
    ------
    1. AssertionError when content is not composed of list of lists, and the
       length of columns is not equal to the length of content.
    2. ValueError when content is a list of lists, but the lengths of the
       sub-lists are not all equal.
    3. ValueError when content is a list of lists, but the length of each
       sub-list is not equal to the length of content.
    """
    if columns is None:
        columns = default_index(len(content))
    else:
        # Add mask for data which is composed of list of lists
        is_mi_list = isinstance(columns, list) and all(
            isinstance(col, list) for col in columns
        )

        if not is_mi_list and len(columns) != len(content):  # pragma: no cover
            # caller's responsibility to check for this...
            raise AssertionError(
                f"{len(columns)} columns passed, passed data had "
                f"{len(content)} columns"
            )
        if is_mi_list:
            # check if nested list column; the length of each sub-list should be equal
            if len({len(col) for col in columns}) > 1:
                raise ValueError(
                    "Length of columns passed for MultiIndex columns is different"
                )

            # if columns is not empty and length of sublist is not equal to content
            if columns and len(columns[0]) != len(content):
                raise ValueError(
                    f"{len(columns[0])} columns passed, passed data had "
                    f"{len(content)} columns"
                )
    return columns
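# Illustrative behavior of _validate_or_indexify_columns (above); hedged:
#
#   >>> import numpy as np
#   >>> _validate_or_indexify_columns([np.array([1]), np.array([2])], None)
#   RangeIndex(start=0, stop=2, step=1)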

def convert_object_array(
    content: list[npt.NDArray[np.object_]],
    dtype: DtypeObj | None,
    dtype_backend: str = "numpy",
    coerce_float: bool = False,
) -> list[ArrayLike]:
    """
    Internal function to convert object array.

    Parameters
    ----------
    content : List[np.ndarray]
    dtype : np.dtype or ExtensionDtype
    dtype_backend : str
        Controls if nullable/pyarrow dtypes are returned.
    coerce_float : bool
        Cast floats that are integers to int.

    Returns
    -------
    List[ArrayLike]
    """
    # provide soft conversion of object dtypes

    def convert(arr):
        if dtype != np.dtype("O"):
            arr = lib.maybe_convert_objects(
                arr,
                try_float=coerce_float,
                convert_to_nullable_dtype=dtype_backend != "numpy",
            )
            # Notes on cases that get here 2023-02-15
            # 1) we DO get here when arr is all Timestamps and dtype=None
            # 2) disabling this doesn't break the world, so this must be
            #    getting caught at a higher level
            # 3) passing convert_non_numeric to maybe_convert_objects gets this
            #    right
            # 4) convert_non_numeric?

        if dtype is None:
            if arr.dtype == np.dtype("O"):
                # i.e. maybe_convert_objects didn't convert
                arr = maybe_infer_to_datetimelike(arr)
                if dtype_backend != "numpy" and arr.dtype == np.dtype("O"):
                    new_dtype = StringDtype()
                    arr_cls = new_dtype.construct_array_type()
                    arr = arr_cls._from_sequence(arr, dtype=new_dtype)
            elif dtype_backend != "numpy" and isinstance(arr, np.ndarray):
                if arr.dtype.kind in "iufb":
                    arr = pd_array(arr, copy=False)

        elif isinstance(dtype, ExtensionDtype):
            # TODO: test(s) that get here
            # TODO: try to de-duplicate this convert function with
            #  core.construction functions
            cls = dtype.construct_array_type()
            arr = cls._from_sequence(arr, dtype=dtype, copy=False)
        elif dtype.kind in "mM":
            # This restriction is harmless bc these are the only cases
            #  where maybe_cast_to_datetime is not a no-op.
            # Here we know:
            #  1) dtype.kind in "mM" and
            #  2) arr is either object or numeric dtype
            arr = maybe_cast_to_datetime(arr, dtype)

        return arr

    arrays = [convert(arr) for arr in content]

    return arrays
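# Illustrative behavior of convert_object_array (above); hedged (int reprs
# may vary by platform):
#
#   >>> import numpy as np
#   >>> convert_object_array([np.array([1, 2], dtype=object)], dtype=None)
#   [array([1, 2])]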