Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/internals/construction.py: 10%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

428 statements  

1""" 

2Functions for preparing various inputs passed to the DataFrame or Series 

3constructors before passing them to a BlockManager. 

4""" 

5from __future__ import annotations 

6 

7from collections import abc 

8from typing import ( 

9 Any, 

10 Hashable, 

11 Sequence, 

12) 

13 

14import numpy as np 

15from numpy import ma 

16 

17from pandas._libs import lib 

18from pandas._typing import ( 

19 ArrayLike, 

20 DtypeObj, 

21 Manager, 

22 npt, 

23) 

24 

25from pandas.core.dtypes.astype import astype_is_view 

26from pandas.core.dtypes.cast import ( 

27 construct_1d_arraylike_from_scalar, 

28 dict_compat, 

29 maybe_cast_to_datetime, 

30 maybe_convert_platform, 

31 maybe_infer_to_datetimelike, 

32) 

33from pandas.core.dtypes.common import ( 

34 is_1d_only_ea_dtype, 

35 is_bool_dtype, 

36 is_datetime_or_timedelta_dtype, 

37 is_dtype_equal, 

38 is_extension_array_dtype, 

39 is_float_dtype, 

40 is_integer_dtype, 

41 is_list_like, 

42 is_named_tuple, 

43 is_object_dtype, 

44) 

45from pandas.core.dtypes.dtypes import ExtensionDtype 

46from pandas.core.dtypes.generic import ( 

47 ABCDataFrame, 

48 ABCSeries, 

49) 

50 

51from pandas.core import ( 

52 algorithms, 

53 common as com, 

54) 

55from pandas.core.arrays import ( 

56 BooleanArray, 

57 ExtensionArray, 

58 FloatingArray, 

59 IntegerArray, 

60) 

61from pandas.core.arrays.string_ import StringDtype 

62from pandas.core.construction import ( 

63 ensure_wrapped_if_datetimelike, 

64 extract_array, 

65 range_to_ndarray, 

66 sanitize_array, 

67) 

68from pandas.core.indexes.api import ( 

69 DatetimeIndex, 

70 Index, 

71 TimedeltaIndex, 

72 default_index, 

73 ensure_index, 

74 get_objs_combined_axis, 

75 union_indexes, 

76) 

77from pandas.core.internals.array_manager import ( 

78 ArrayManager, 

79 SingleArrayManager, 

80) 

81from pandas.core.internals.blocks import ( 

82 BlockPlacement, 

83 ensure_block_shape, 

84 new_block_2d, 

85) 

86from pandas.core.internals.managers import ( 

87 BlockManager, 

88 SingleBlockManager, 

89 create_block_manager_from_blocks, 

90 create_block_manager_from_column_arrays, 

91) 

92 

93# --------------------------------------------------------------------- 

94# BlockManager Interface 

95 

96 

def arrays_to_mgr(
    arrays,
    columns: Index,
    index,
    *,
    dtype: DtypeObj | None = None,
    verify_integrity: bool = True,
    typ: str | None = None,
    consolidate: bool = True,
) -> Manager:
    """
    Segregate Series based on type and coerce into matrices.

    Needs to handle a lot of exceptional cases.

    Parameters
    ----------
    arrays : sequence of array-likes, one entry per column.
    columns : Index
        Column labels; must match len(arrays).
    index : Index or None
        Row labels; if None (and verify_integrity=True) it is inferred
        from ``arrays`` via ``_extract_index``.
    dtype : DtypeObj, optional
        Target dtype applied column-by-column during homogenization.
    verify_integrity : bool, default True
        When False (the DataFrame._from_arrays fast path) only minimal
        shape/type validation is performed.
    typ : {"block", "array"}
        Which Manager implementation to build.
    consolidate : bool, default True
        Forwarded to the BlockManager construction (ignored for "array").

    Returns
    -------
    Manager
    """
    if verify_integrity:
        # figure out the index, if necessary
        if index is None:
            index = _extract_index(arrays)
        else:
            index = ensure_index(index)

        # don't force copy because getting jammed in an ndarray anyway
        arrays, refs = _homogenize(arrays, index, dtype)
        # _homogenize ensures
        #  - all(len(x) == len(index) for x in arrays)
        #  - all(x.ndim == 1 for x in arrays)
        #  - all(isinstance(x, (np.ndarray, ExtensionArray)) for x in arrays)
        #  - all(type(x) is not PandasArray for x in arrays)

    else:
        index = ensure_index(index)
        arrays = [extract_array(x, extract_numpy=True) for x in arrays]
        # with _from_arrays, the passed arrays should never be Series objects
        refs = [None] * len(arrays)

        # Reached via DataFrame._from_arrays; we do minimal validation here
        for arr in arrays:
            if (
                not isinstance(arr, (np.ndarray, ExtensionArray))
                or arr.ndim != 1
                or len(arr) != len(index)
            ):
                raise ValueError(
                    "Arrays must be 1-dimensional np.ndarray or ExtensionArray "
                    "with length matching len(index)"
                )

    columns = ensure_index(columns)
    if len(columns) != len(arrays):
        raise ValueError("len(arrays) must match len(columns)")

    # from BlockManager perspective
    axes = [columns, index]

    if typ == "block":
        return create_block_manager_from_column_arrays(
            arrays, axes, consolidate=consolidate, refs=refs
        )
    elif typ == "array":
        return ArrayManager(arrays, [index, columns])
    else:
        raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")

160 

161 

def rec_array_to_mgr(
    data: np.recarray | np.ndarray,
    index,
    columns,
    dtype: DtypeObj | None,
    copy: bool,
    typ: str,
) -> Manager:
    """
    Extract from a masked rec array and create the manager.
    """
    # strip any mask and work with the plain record ndarray
    plain_data = ma.getdata(data)
    index = default_index(len(plain_data)) if index is None else ensure_index(index)

    if columns is not None:
        columns = ensure_index(columns)
    arrays, arr_columns = to_arrays(plain_data, columns)

    # put the extracted arrays in the requested column order
    arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, len(index))
    if columns is None:
        columns = arr_columns

    mgr = arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ)
    return mgr.copy() if copy else mgr

195 

196 

def mgr_to_mgr(mgr, typ: str, copy: bool = True):
    """
    Convert to specific type of Manager.

    No conversion (and no copy) happens when ``mgr`` already has the requested
    type; otherwise a copy is not guaranteed.  The ``copy`` keyword only
    controls whether the 1D arrays are copied for Block->ArrayManager.
    """
    if typ == "block":
        if isinstance(mgr, BlockManager):
            return mgr
        if mgr.ndim == 2:
            return arrays_to_mgr(mgr.arrays, mgr.axes[0], mgr.axes[1], typ="block")
        return SingleBlockManager.from_array(mgr.arrays[0], mgr.index)

    if typ == "array":
        if isinstance(mgr, ArrayManager):
            return mgr
        if mgr.ndim == 2:
            # one 1D array per column, optionally copied
            column_arrays = [mgr.iget_values(i) for i in range(len(mgr.axes[0]))]
            if copy:
                column_arrays = [arr.copy() for arr in column_arrays]
            return ArrayManager(column_arrays, [mgr.axes[1], mgr.axes[0]])
        arr = mgr.internal_values()
        if copy:
            arr = arr.copy()
        return SingleArrayManager([arr], [mgr.index])

    raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")

232 

233 

234# --------------------------------------------------------------------- 

235# DataFrame Constructor Interface 

236 

237 

def ndarray_to_mgr(
    values, index, columns, dtype: DtypeObj | None, copy: bool, typ: str
) -> Manager:
    """
    Build a Manager from a single homogeneous values object.

    Dispatches on the type/dtype of ``values`` (1D-only EA dtype, other EA
    dtype, Series/Index, ndarray/EA, or generic list-like) before assembling
    either an ArrayManager or a BlockManager.
    """
    # used in DataFrame.__init__
    # input must be a ndarray, list, Series, Index, ExtensionArray

    if isinstance(values, ABCSeries):
        if columns is None:
            if values.name is not None:
                columns = Index([values.name])
        if index is None:
            index = values.index
        else:
            values = values.reindex(index)

        # zero len case (GH #2234)
        if not len(values) and columns is not None and len(columns):
            values = np.empty((0, 1), dtype=object)

    # if the array preparation does a copy -> avoid this for ArrayManager,
    # since the copy is done on conversion to 1D arrays
    copy_on_sanitize = False if typ == "array" else copy

    vdtype = getattr(values, "dtype", None)
    # refs tracks Copy-on-Write references for the Series/Index no-copy path
    refs = None
    if is_1d_only_ea_dtype(vdtype) or is_1d_only_ea_dtype(dtype):
        # GH#19157

        if isinstance(values, (np.ndarray, ExtensionArray)) and values.ndim > 1:
            # GH#12513 a EA dtype passed with a 2D array, split into
            #  multiple EAs that view the values
            # error: No overload variant of "__getitem__" of "ExtensionArray"
            # matches argument type "Tuple[slice, int]"
            values = [
                values[:, n]  # type: ignore[call-overload]
                for n in range(values.shape[1])
            ]
        else:
            values = [values]

        if columns is None:
            columns = Index(range(len(values)))
        else:
            columns = ensure_index(columns)

        # 1D-only EAs cannot live in a 2D block, so delegate to the
        # column-wise constructor
        return arrays_to_mgr(values, columns, index, dtype=dtype, typ=typ)

    elif is_extension_array_dtype(vdtype):
        # i.e. Datetime64TZ, PeriodDtype; cases with is_1d_only_ea_dtype(vdtype)
        #  are already caught above
        values = extract_array(values, extract_numpy=True)
        if copy:
            values = values.copy()
        if values.ndim == 1:
            values = values.reshape(-1, 1)

    elif isinstance(values, (ABCSeries, Index)):
        # only keep references when no copy is made and the dtype cast
        # (if any) would be a view
        if not copy_on_sanitize and (
            dtype is None or astype_is_view(values.dtype, dtype)
        ):
            refs = values._references

        if copy_on_sanitize:
            values = values._values.copy()
        else:
            values = values._values

        values = _ensure_2d(values)

    elif isinstance(values, (np.ndarray, ExtensionArray)):
        # drop subclass info
        _copy = (
            copy_on_sanitize
            if (dtype is None or astype_is_view(values.dtype, dtype))
            else False
        )
        values = np.array(values, copy=_copy)
        values = _ensure_2d(values)

    else:
        # by definition an array here
        # the dtypes will be coerced to a single dtype
        values = _prep_ndarraylike(values, copy=copy_on_sanitize)

    if dtype is not None and not is_dtype_equal(values.dtype, dtype):
        # GH#40110 see similar check inside sanitize_array
        values = sanitize_array(
            values,
            None,
            dtype=dtype,
            copy=copy_on_sanitize,
            allow_2d=True,
        )

    # _prep_ndarraylike ensures that values.ndim == 2 at this point
    index, columns = _get_axes(
        values.shape[0], values.shape[1], index=index, columns=columns
    )

    _check_values_indices_shape_match(values, index, columns)

    if typ == "array":
        if issubclass(values.dtype.type, str):
            values = np.array(values, dtype=object)

        if dtype is None and is_object_dtype(values.dtype):
            # infer e.g. datetimelikes column-by-column from object dtype
            arrays = [
                ensure_wrapped_if_datetimelike(
                    maybe_infer_to_datetimelike(values[:, i])
                )
                for i in range(values.shape[1])
            ]
        else:
            if is_datetime_or_timedelta_dtype(values.dtype):
                values = ensure_wrapped_if_datetimelike(values)
            arrays = [values[:, i] for i in range(values.shape[1])]

        if copy:
            arrays = [arr.copy() for arr in arrays]

        return ArrayManager(arrays, [index, columns], verify_integrity=False)

    # BlockManager stores data transposed: rows of `values` become columns
    values = values.T

    # if we don't have a dtype specified, then try to convert objects
    # on the entire block; this is to convert if we have datetimelike's
    # embedded in an object type
    if dtype is None and is_object_dtype(values.dtype):
        obj_columns = list(values)
        maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns]
        # don't convert (and copy) the objects if no type inference occurs
        if any(x is not y for x, y in zip(obj_columns, maybe_datetime)):
            # at least one column was converted -> one block per column
            dvals_list = [ensure_block_shape(dval, 2) for dval in maybe_datetime]
            block_values = [
                new_block_2d(dvals_list[n], placement=BlockPlacement(n))
                for n in range(len(dvals_list))
            ]
        else:
            bp = BlockPlacement(slice(len(columns)))
            nb = new_block_2d(values, placement=bp, refs=refs)
            block_values = [nb]
    else:
        # single homogeneous block covering all columns
        bp = BlockPlacement(slice(len(columns)))
        nb = new_block_2d(values, placement=bp, refs=refs)
        block_values = [nb]

    if len(columns) == 0:
        # TODO: check len(values) == 0?
        block_values = []

    return create_block_manager_from_blocks(
        block_values, [columns, index], verify_integrity=False
    )

391 

392 

393def _check_values_indices_shape_match( 

394 values: np.ndarray, index: Index, columns: Index 

395) -> None: 

396 """ 

397 Check that the shape implied by our axes matches the actual shape of the 

398 data. 

399 """ 

400 if values.shape[1] != len(columns) or values.shape[0] != len(index): 

401 # Could let this raise in Block constructor, but we get a more 

402 # helpful exception message this way. 

403 if values.shape[0] == 0: 

404 raise ValueError("Empty data passed with indices specified.") 

405 

406 passed = values.shape 

407 implied = (len(index), len(columns)) 

408 raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}") 

409 

410 

def dict_to_mgr(
    data: dict,
    index,
    columns,
    *,
    dtype: DtypeObj | None = None,
    typ: str = "block",
    copy: bool = True,
) -> Manager:
    """
    Segregate Series based on type and coerce into matrices.
    Needs to handle a lot of exceptional cases.

    Used in DataFrame.__init__

    Parameters
    ----------
    data : dict mapping column label -> column values.
    index, columns : axis labels or None.
    dtype : DtypeObj, optional
        Target dtype for all columns.
    typ : {"block", "array"}, default "block"
        Which Manager implementation to build.
    copy : bool, default True
        Whether column data is copied (EA arrays up-front; ndarrays are
        copied later via consolidation on the block path).

    Returns
    -------
    Manager
    """
    arrays: Sequence[Any] | Series

    if columns is not None:
        from pandas.core.series import Series

        # reindex `data` against `columns`: absent labels become NaN entries
        arrays = Series(data, index=columns, dtype=object)
        missing = arrays.isna()
        if index is None:
            # GH10856
            # raise ValueError if only scalars in dict
            index = _extract_index(arrays[~missing])
        else:
            index = ensure_index(index)

        # no obvious "empty" int column
        if missing.any() and not is_integer_dtype(dtype):
            nan_dtype: DtypeObj

            if dtype is not None:
                # calling sanitize_array ensures we don't mix-and-match
                #  NA dtypes
                midxs = missing.values.nonzero()[0]
                for i in midxs:
                    arr = sanitize_array(arrays.iat[i], index, dtype=dtype)
                    arrays.iat[i] = arr
            else:
                # GH#1783
                nan_dtype = np.dtype("object")
                val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype)
                nmissing = missing.sum()
                if copy:
                    # downstream copy makes sharing one array safe here
                    rhs = [val] * nmissing
                else:
                    # GH#45369
                    rhs = [val.copy() for _ in range(nmissing)]
                arrays.loc[missing] = rhs

        arrays = list(arrays)
        columns = ensure_index(columns)

    else:
        # infer columns from the dict keys, preserving insertion order
        keys = list(data.keys())
        columns = Index(keys) if keys else default_index(0)
        arrays = [com.maybe_iterable_to_list(data[k]) for k in keys]
        arrays = [arr if not isinstance(arr, Index) else arr._data for arr in arrays]

    if copy:
        if typ == "block":
            # We only need to copy arrays that will not get consolidated, i.e.
            #  only EA arrays
            arrays = [x.copy() if isinstance(x, ExtensionArray) else x for x in arrays]
        else:
            # dtype check to exclude e.g. range objects, scalars
            arrays = [x.copy() if hasattr(x, "dtype") else x for x in arrays]

    return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)

482 

483 

def nested_data_to_arrays(
    data: Sequence,
    columns: Index | None,
    index: Index | None,
    dtype: DtypeObj | None,
) -> tuple[list[ArrayLike], Index, Index]:
    """
    Convert a single sequence of arrays to multiple arrays.
    """
    # Caller has already established treat_as_nested(data)

    if is_named_tuple(data[0]) and columns is None:
        # namedtuple fields supply the column labels
        columns = ensure_index(data[0]._fields)

    arrays, columns = to_arrays(data, columns, dtype=dtype)
    columns = ensure_index(columns)

    if index is None:
        index = (
            _get_names_from_index(data)
            if isinstance(data[0], ABCSeries)
            else default_index(len(data))
        )

    return arrays, columns, index

508 

509 

def treat_as_nested(data) -> bool:
    """
    Check if we should use nested_data_to_arrays.
    """
    if not len(data):
        return False
    first = data[0]
    if not is_list_like(first) or getattr(first, "ndim", 1) != 1:
        return False
    # 2D ExtensionArrays are excluded from the nested-rows path
    return not (isinstance(data, ExtensionArray) and data.ndim == 2)

520 

521 

522# --------------------------------------------------------------------- 

523 

524 

def _prep_ndarraylike(values, copy: bool = True) -> np.ndarray:
    """
    Coerce a generic list-like to a 2D ndarray, preserving platform dtypes.

    Parameters
    ----------
    values : list-like (specifically NOT ndarray, EA, Index, or Series).
    copy : bool, default True
        Forwarded to the per-element conversion via maybe_convert_platform.

    Returns
    -------
    np.ndarray with ndim == 2.
    """
    # values is specifically _not_ ndarray, EA, Index, or Series
    # We only get here with `not treat_as_nested(values)`

    if len(values) == 0:
        # TODO: check for length-zero range, in which case return int64 dtype?
        # TODO: re-use anything in try_cast?
        return np.empty((0, 0), dtype=object)
    elif isinstance(values, range):
        arr = range_to_ndarray(values)
        # promote the 1D range result to a single-column 2D array
        return arr[..., np.newaxis]

    def convert(v):
        # Scalars and DataFrames pass through untouched.
        if not is_list_like(v) or isinstance(v, ABCDataFrame):
            return v

        v = extract_array(v, extract_numpy=True)
        res = maybe_convert_platform(v)
        # We don't do maybe_infer_to_datetimelike here bc we will end up doing
        #  it column-by-column in ndarray_to_mgr
        return res

    # we could have a 1-dim or 2-dim list here
    # this is equiv of np.asarray, but does object conversion
    # and platform dtype preservation
    # does not convert e.g. [1, "a", True] to ["1", "a", "True"] like
    #  np.asarray would
    if is_list_like(values[0]):
        values = np.array([convert(v) for v in values])
    elif isinstance(values[0], np.ndarray) and values[0].ndim == 0:
        # GH#21861 see test_constructor_list_of_lists
        values = np.array([convert(v) for v in values])
    else:
        values = convert(values)

    return _ensure_2d(values)

561 

562 

563def _ensure_2d(values: np.ndarray) -> np.ndarray: 

564 """ 

565 Reshape 1D values, raise on anything else other than 2D. 

566 """ 

567 if values.ndim == 1: 

568 values = values.reshape((values.shape[0], 1)) 

569 elif values.ndim != 2: 

570 raise ValueError(f"Must pass 2-d input. shape={values.shape}") 

571 return values 

572 

573 

def _homogenize(
    data, index: Index, dtype: DtypeObj | None
) -> tuple[list[ArrayLike], list[Any]]:
    """
    Align each column array-like to ``index`` and coerce toward ``dtype``.

    Returns
    -------
    list of 1D column arrays, each of length len(index)
    list of per-column reference trackers (non-None only for Series inputs,
    whose ``_references`` are propagated)
    """
    oindex = None
    homogenized = []
    # if the original array-like in `data` is a Series, keep track of this Series' refs
    refs: list[Any] = []

    for val in data:
        if isinstance(val, ABCSeries):
            if dtype is not None:
                val = val.astype(dtype, copy=False)
            if val.index is not index:
                # Forces alignment. No need to copy data since we
                # are putting it into an ndarray later
                val = val.reindex(index, copy=False)
            refs.append(val._references)
            val = val._values
        else:
            if isinstance(val, dict):
                # GH#41785 this _should_ be equivalent to (but faster than)
                #  val = Series(val, index=index)._values
                if oindex is None:
                    # object-dtype view of the index, built lazily and reused
                    oindex = index.astype("O")

                if isinstance(index, (DatetimeIndex, TimedeltaIndex)):
                    # see test_constructor_dict_datetime64_index
                    val = dict_compat(val)
                else:
                    # see test_constructor_subclass_dict
                    val = dict(val)
                val = lib.fast_multiget(val, oindex._values, default=np.nan)

            val = sanitize_array(val, index, dtype=dtype, copy=False)
            com.require_length_match(val, index)
            refs.append(None)

        homogenized.append(val)

    return homogenized, refs

614 

615 

def _extract_index(data) -> Index:
    """
    Try to infer an Index from the passed data, raise ValueError on failure.
    """
    index: Index
    if len(data) == 0:
        return default_index(0)

    array_lengths: list[int] = []
    candidate_indexes: list[list[Hashable] | Index] = []

    saw_arrays = saw_series = saw_dicts = False

    for item in data:
        if isinstance(item, ABCSeries):
            saw_series = True
            candidate_indexes.append(item.index)
        elif isinstance(item, dict):
            saw_dicts = True
            candidate_indexes.append(list(item.keys()))
        elif is_list_like(item) and getattr(item, "ndim", 1) == 1:
            saw_arrays = True
            array_lengths.append(len(item))
        elif isinstance(item, np.ndarray) and item.ndim > 1:
            raise ValueError("Per-column arrays must each be 1-dimensional")

    if not candidate_indexes and not array_lengths:
        raise ValueError("If using all scalar values, you must pass an index")

    if saw_series:
        index = union_indexes(candidate_indexes)
    elif saw_dicts:
        index = union_indexes(candidate_indexes, sort=False)

    if saw_arrays:
        unique_lengths = set(array_lengths)
        if len(unique_lengths) > 1:
            raise ValueError("All arrays must be of the same length")

        if saw_dicts:
            raise ValueError(
                "Mixing dicts with non-Series may lead to ambiguous ordering."
            )

        common_length = unique_lengths.pop()
        if saw_series:
            if common_length != len(index):
                msg = (
                    f"array length {common_length} does not match index "
                    f"length {len(index)}"
                )
                raise ValueError(msg)
        else:
            index = default_index(common_length)

    return ensure_index(index)

673 

674 

def reorder_arrays(
    arrays: list[ArrayLike], arr_columns: Index, columns: Index | None, length: int
) -> tuple[list[ArrayLike], Index]:
    """
    Pre-emptively (cheaply) reindex arrays with new columns.
    """
    # only reorder when a different column ordering was actually requested
    if columns is not None and not columns.equals(arr_columns):
        indexer = arr_columns.get_indexer(columns)
        reordered: list[ArrayLike | None] = [None] * len(columns)
        for pos, loc in enumerate(indexer):
            if loc == -1:
                # by convention missing columns become all-NaN object dtype
                filler = np.empty(length, dtype=object)
                filler.fill(np.nan)
                reordered[pos] = filler
            else:
                reordered[pos] = arrays[loc]

        # Incompatible types in assignment (expression has type
        # "List[Union[ExtensionArray, ndarray[Any, Any], None]]", variable
        # has type "List[Union[ExtensionArray, ndarray[Any, Any]]]")
        arrays = reordered  # type: ignore[assignment]
        arr_columns = columns

    return arrays, arr_columns

704 

705 

def _get_names_from_index(data) -> Index:
    # Build row labels from the Series names in `data`; unnamed entries
    # become "Unnamed {k}".  All-unnamed input gets a default RangeIndex.
    names = [getattr(s, "name", None) for s in data]
    if all(name is None for name in names):
        return default_index(len(data))

    labels: list[Hashable] = []
    unnamed_count = 0
    for name in names:
        if name is None:
            labels.append(f"Unnamed {unnamed_count}")
            unnamed_count += 1
        else:
            labels.append(name)

    return Index(labels)

722 

723 

def _get_axes(
    N: int, K: int, index: Index | None, columns: Index | None
) -> tuple[Index, Index]:
    """
    Build the (index, columns) pair for an N x K frame, substituting
    default indexes for axes that were not supplied.
    """
    index = default_index(N) if index is None else ensure_index(index)
    columns = default_index(K) if columns is None else ensure_index(columns)
    return index, columns

740 

741 

def dataclasses_to_dicts(data):
    """
    Converts a list of dataclass instances to a list of dictionaries.

    Parameters
    ----------
    data : List[Type[dataclass]]

    Returns
    -------
    list_dict : List[dict]

    Examples
    --------
    >>> from dataclasses import dataclass
    >>> @dataclass
    ... class Point:
    ...     x: int
    ...     y: int

    >>> dataclasses_to_dicts([Point(1, 2), Point(2, 3)])
    [{'x': 1, 'y': 2}, {'x': 2, 'y': 3}]

    """
    # local import keeps module import time down; asdict recurses into
    # nested dataclasses/containers
    from dataclasses import asdict

    return list(map(asdict, data))

769 

770 

771# --------------------------------------------------------------------- 

772# Conversion of Inputs to Arrays 

773 

774 

def to_arrays(
    data, columns: Index | None, dtype: DtypeObj | None = None
) -> tuple[list[ArrayLike], Index]:
    """
    Return list of arrays, columns.

    Returns
    -------
    list[ArrayLike]
        These will become columns in a DataFrame.
    Index
        This will become frame.columns.

    Notes
    -----
    Ensures that len(result_arrays) == len(result_index).
    """
    if isinstance(data, ABCDataFrame):
        # see test_from_records_with_index_data, test_from_records_bad_index_column
        if columns is not None:
            # keep only the requested columns, in the frame's own order
            arrays = [
                data._ixs(i, axis=1)._values
                for i, col in enumerate(data.columns)
                if col in columns
            ]
        else:
            columns = data.columns
            arrays = [data._ixs(i, axis=1)._values for i in range(len(columns))]

        return arrays, columns

    if not len(data):
        # empty input: structured arrays still contribute their field names
        if isinstance(data, np.ndarray):
            if data.dtype.names is not None:
                # i.e. numpy structured array
                columns = ensure_index(data.dtype.names)
                arrays = [data[name] for name in columns]

                if len(data) == 0:
                    # GH#42456 the indexing above results in list of 2D ndarrays
                    # TODO: is that an issue with numpy?
                    for i, arr in enumerate(arrays):
                        if arr.ndim == 2:
                            arrays[i] = arr[:, 0]

                return arrays, columns
        return [], ensure_index([])

    elif isinstance(data, np.ndarray) and data.dtype.names is not None:
        # e.g. recarray
        columns = Index(list(data.dtype.names))
        arrays = [data[k] for k in columns]
        return arrays, columns

    # dispatch on the type of the first record
    if isinstance(data[0], (list, tuple)):
        arr = _list_to_arrays(data)
    elif isinstance(data[0], abc.Mapping):
        arr, columns = _list_of_dict_to_arrays(data, columns)
    elif isinstance(data[0], ABCSeries):
        arr, columns = _list_of_series_to_arrays(data, columns)
    else:
        # last ditch effort
        data = [tuple(x) for x in data]
        arr = _list_to_arrays(data)

    content, columns = _finalize_columns_and_data(arr, columns, dtype)
    return content, columns

842 

843 

def _list_to_arrays(data: list[tuple | list]) -> np.ndarray:
    """
    Convert a list of row records (tuples or lists) to a 2D object ndarray.

    Note: caller guarantees len(data) > 0.
    """
    if isinstance(data[0], tuple):
        return lib.to_object_array_tuples(data)
    # list of lists
    return lib.to_object_array(data)

853 

854 

def _list_of_series_to_arrays(
    data: list,
    columns: Index | None,
) -> tuple[np.ndarray, Index]:
    # Returned ndarray is 2D: one row per input, values aligned on `columns`.

    if columns is None:
        # data[0] is a Series, so the filtered list is never empty
        pandas_objs = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))]
        columns = get_objs_combined_axis(pandas_objs, sort=False)

    # cache indexers by the identity of each index object
    cached_indexers: dict[int, np.ndarray] = {}

    aligned_rows = []
    for obj in data:
        index = getattr(obj, "index", None)
        if index is None:
            index = default_index(len(obj))

        key = id(index)
        if key not in cached_indexers:
            cached_indexers[key] = index.get_indexer(columns)
        indexer = cached_indexers[key]

        values = extract_array(obj, extract_numpy=True)
        aligned_rows.append(algorithms.take_nd(values, indexer))

    return np.vstack(aligned_rows), columns

884 

885 

def _list_of_dict_to_arrays(
    data: list[dict],
    columns: Index | None,
) -> tuple[np.ndarray, Index]:
    """
    Convert list of dicts to numpy arrays

    if `columns` is not passed, column names are inferred from the records
    - for OrderedDict and dicts, the column names match
      the key insertion-order from the first record to the last.
    - For other kinds of dict-likes, the keys are lexically sorted.

    Parameters
    ----------
    data : iterable
        collection of records (OrderedDict, dict)
    columns: iterables or None

    Returns
    -------
    content : np.ndarray[object, ndim=2]
    columns : Index
    """
    if columns is None:
        key_lists = (list(record.keys()) for record in data)
        # sort only when no record is a plain dict (dicts keep insertion order)
        should_sort = not any(isinstance(record, dict) for record in data)
        pre_cols = lib.fast_unique_multiple_list_gen(key_lists, sort=should_sort)
        columns = ensure_index(pre_cols)

    # normalize derived dict classes to the base dict class
    data = [record if type(record) is dict else dict(record) for record in data]

    return lib.dicts_to_array(data, list(columns)), columns

921 

922 

def _finalize_columns_and_data(
    content: np.ndarray,  # ndim == 2
    columns: Index | None,
    dtype: DtypeObj | None,
) -> tuple[list[ArrayLike], Index]:
    """
    Ensure we have valid columns, cast object dtypes if possible.
    """
    # split the 2D block into per-column 1D arrays
    column_arrays = list(content.T)

    try:
        columns = _validate_or_indexify_columns(column_arrays, columns)
    except AssertionError as err:
        # GH#26429 do not raise user-facing AssertionError
        raise ValueError(err) from err

    if column_arrays and column_arrays[0].dtype == np.object_:
        column_arrays = convert_object_array(column_arrays, dtype=dtype)

    return column_arrays, columns

943 

944 

945def _validate_or_indexify_columns( 

946 content: list[np.ndarray], columns: Index | None 

947) -> Index: 

948 """ 

949 If columns is None, make numbers as column names; Otherwise, validate that 

950 columns have valid length. 

951 

952 Parameters 

953 ---------- 

954 content : list of np.ndarrays 

955 columns : Index or None 

956 

957 Returns 

958 ------- 

959 Index 

960 If columns is None, assign positional column index value as columns. 

961 

962 Raises 

963 ------ 

964 1. AssertionError when content is not composed of list of lists, and if 

965 length of columns is not equal to length of content. 

966 2. ValueError when content is list of lists, but length of each sub-list 

967 is not equal 

968 3. ValueError when content is list of lists, but length of sub-list is 

969 not equal to length of content 

970 """ 

971 if columns is None: 

972 columns = default_index(len(content)) 

973 else: 

974 # Add mask for data which is composed of list of lists 

975 is_mi_list = isinstance(columns, list) and all( 

976 isinstance(col, list) for col in columns 

977 ) 

978 

979 if not is_mi_list and len(columns) != len(content): # pragma: no cover 

980 # caller's responsibility to check for this... 

981 raise AssertionError( 

982 f"{len(columns)} columns passed, passed data had " 

983 f"{len(content)} columns" 

984 ) 

985 if is_mi_list: 

986 # check if nested list column, length of each sub-list should be equal 

987 if len({len(col) for col in columns}) > 1: 

988 raise ValueError( 

989 "Length of columns passed for MultiIndex columns is different" 

990 ) 

991 

992 # if columns is not empty and length of sublist is not equal to content 

993 if columns and len(columns[0]) != len(content): 

994 raise ValueError( 

995 f"{len(columns[0])} columns passed, passed data had " 

996 f"{len(content)} columns" 

997 ) 

998 return columns 

999 

1000 

def convert_object_array(
    content: list[npt.NDArray[np.object_]],
    dtype: DtypeObj | None,
    dtype_backend: str = "numpy",
    coerce_float: bool = False,
) -> list[ArrayLike]:
    """
    Internal function to convert object array.

    Parameters
    ----------
    content: List[np.ndarray]
    dtype: np.dtype or ExtensionDtype
    dtype_backend: Controls if nullable/pyarrow dtypes are returned.
    coerce_float: Cast floats that are integers to int.

    Returns
    -------
    List[ArrayLike]
    """
    # provide soft conversion of object dtypes

    def convert(arr):
        # Convert one object-dtype column; skipped entirely when the
        # caller explicitly requested object dtype.
        if dtype != np.dtype("O"):
            arr = lib.maybe_convert_objects(
                arr,
                try_float=coerce_float,
                convert_to_nullable_dtype=dtype_backend != "numpy",
            )
            # Notes on cases that get here 2023-02-15
            # 1) we DO get here when arr is all Timestamps and dtype=None
            # 2) disabling this doesn't break the world, so this must be
            #    getting caught at a higher level
            # 3) passing convert_datetime to maybe_convert_objects get this right
            # 4) convert_timedelta?

            if dtype is None:
                if arr.dtype == np.dtype("O"):
                    # i.e. maybe_convert_objects didn't convert
                    arr = maybe_infer_to_datetimelike(arr)
                    if dtype_backend != "numpy" and arr.dtype == np.dtype("O"):
                        # nullable backend: remaining objects become strings
                        arr = StringDtype().construct_array_type()._from_sequence(arr)
                elif dtype_backend != "numpy" and isinstance(arr, np.ndarray):
                    # wrap plain numpy results in the masked (nullable) arrays
                    if is_integer_dtype(arr.dtype):
                        arr = IntegerArray(arr, np.zeros(arr.shape, dtype=np.bool_))
                    elif is_bool_dtype(arr.dtype):
                        arr = BooleanArray(arr, np.zeros(arr.shape, dtype=np.bool_))
                    elif is_float_dtype(arr.dtype):
                        arr = FloatingArray(arr, np.isnan(arr))

            elif isinstance(dtype, ExtensionDtype):
                # TODO: test(s) that get here
                # TODO: try to de-duplicate this convert function with
                #  core.construction functions
                cls = dtype.construct_array_type()
                arr = cls._from_sequence(arr, dtype=dtype, copy=False)
            elif dtype.kind in ["m", "M"]:
                # This restriction is harmless bc these are the only cases
                #  where maybe_cast_to_datetime is not a no-op.
                # Here we know:
                #  1) dtype.kind in ["m", "M"] and
                #  2) arr is either object or numeric dtype
                arr = maybe_cast_to_datetime(arr, dtype)

        return arr

    arrays = [convert(arr) for arr in content]

    return arrays

1069 return arrays