Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/internals/concat.py: 15%


247 statements  

from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    cast,
)
import warnings

import numpy as np

from pandas._libs import (
    NaT,
    algos as libalgos,
    internals as libinternals,
    lib,
)
from pandas._libs.missing import NA
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.cast import (
    ensure_dtype_can_hold_na,
    find_common_type,
)
from pandas.core.dtypes.common import (
    is_1d_only_ea_dtype,
    is_scalar,
    needs_i8_conversion,
)
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.dtypes import (
    ExtensionDtype,
    SparseDtype,
)
from pandas.core.dtypes.missing import (
    is_valid_na_for_dtype,
    isna,
    isna_all,
)

from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.internals.array_manager import ArrayManager
from pandas.core.internals.blocks import (
    ensure_block_shape,
    new_block_2d,
)
from pandas.core.internals.managers import (
    BlockManager,
    make_na_array,
)

if TYPE_CHECKING:
    from collections.abc import Sequence

    from pandas._typing import (
        ArrayLike,
        AxisInt,
        DtypeObj,
        Manager2D,
        Shape,
    )

    from pandas import Index
    from pandas.core.internals.blocks import (
        Block,
        BlockPlacement,
    )


def _concatenate_array_managers(
    mgrs: list[ArrayManager], axes: list[Index], concat_axis: AxisInt
) -> Manager2D:
    """
    Concatenate array managers into one.

    Parameters
    ----------
    mgrs : list of ArrayManager
    axes : list of Index
    concat_axis : int

    Returns
    -------
    ArrayManager
    """
    if concat_axis == 1:
        return mgrs[0].concat_vertical(mgrs, axes)
    else:
        # concatting along the columns -> combine reindexed arrays in a single manager
        assert concat_axis == 0
        return mgrs[0].concat_horizontal(mgrs, axes)


def concatenate_managers(
    mgrs_indexers, axes: list[Index], concat_axis: AxisInt, copy: bool
) -> Manager2D:
    """
    Concatenate block managers into one.

    Parameters
    ----------
    mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples
    axes : list of Index
    concat_axis : int
    copy : bool

    Returns
    -------
    BlockManager
    """

    needs_copy = copy and concat_axis == 0

    # TODO(ArrayManager) this assumes that all managers are of the same type
    if isinstance(mgrs_indexers[0][0], ArrayManager):
        mgrs = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers, needs_copy)
        # error: Argument 1 to "_concatenate_array_managers" has incompatible
        # type "List[BlockManager]"; expected "List[Union[ArrayManager,
        # SingleArrayManager, BlockManager, SingleBlockManager]]"
        return _concatenate_array_managers(
            mgrs, axes, concat_axis  # type: ignore[arg-type]
        )

    # Assertions disabled for performance
    # for tup in mgrs_indexers:
    #     # caller is responsible for ensuring this
    #     indexers = tup[1]
    #     assert concat_axis not in indexers

    if concat_axis == 0:
        mgrs = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers, needs_copy)
        return mgrs[0].concat_horizontal(mgrs, axes)

    if len(mgrs_indexers) > 0 and mgrs_indexers[0][0].nblocks > 0:
        first_dtype = mgrs_indexers[0][0].blocks[0].dtype
        if first_dtype in [np.float64, np.float32]:
            # TODO: support more dtypes here. This will be simpler once
            #  JoinUnit.is_na behavior is deprecated.
            if (
                all(_is_homogeneous_mgr(mgr, first_dtype) for mgr, _ in mgrs_indexers)
                and len(mgrs_indexers) > 1
            ):
                # Fastpath!
                # Length restriction is just to avoid having to worry about 'copy'
                shape = tuple(len(x) for x in axes)
                nb = _concat_homogeneous_fastpath(mgrs_indexers, shape, first_dtype)
                return BlockManager((nb,), axes)

    mgrs = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers, needs_copy)

    if len(mgrs) == 1:
        mgr = mgrs[0]
        out = mgr.copy(deep=False)
        out.axes = axes
        return out

    concat_plan = _get_combined_plan(mgrs)

    blocks = []
    values: ArrayLike

    for placement, join_units in concat_plan:
        unit = join_units[0]
        blk = unit.block

        if _is_uniform_join_units(join_units):
            vals = [ju.block.values for ju in join_units]

            if not blk.is_extension:
                # _is_uniform_join_units ensures a single dtype, so
                #  we can use np.concatenate, which is more performant
                #  than concat_compat
                # error: Argument 1 to "concatenate" has incompatible type
                # "List[Union[ndarray[Any, Any], ExtensionArray]]";
                # expected "Union[_SupportsArray[dtype[Any]],
                # _NestedSequence[_SupportsArray[dtype[Any]]]]"
                values = np.concatenate(vals, axis=1)  # type: ignore[arg-type]
            elif is_1d_only_ea_dtype(blk.dtype):
                # TODO(EA2D): special-casing not needed with 2D EAs
                values = concat_compat(vals, axis=0, ea_compat_axis=True)
                values = ensure_block_shape(values, ndim=2)
            else:
                values = concat_compat(vals, axis=1)

            values = ensure_wrapped_if_datetimelike(values)

            fastpath = blk.values.dtype == values.dtype
        else:
            values = _concatenate_join_units(join_units, copy=copy)
            fastpath = False

        if fastpath:
            b = blk.make_block_same_class(values, placement=placement)
        else:
            b = new_block_2d(values, placement=placement)

        blocks.append(b)

    return BlockManager(tuple(blocks), axes)

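# Illustrative usage (added for this listing, not part of the module): any
# row-wise ``pd.concat`` of BlockManager-backed DataFrames funnels into
# ``concatenate_managers``.  ``_mgr`` and ``nblocks`` are pandas internals,
# used here only for inspection.
#
#     >>> import pandas as pd
#     >>> df1 = pd.DataFrame({"a": [1.0], "b": [2.0]})
#     >>> df2 = pd.DataFrame({"a": [3.0], "b": [4.0]})
#     >>> out = pd.concat([df1, df2])
#     >>> out._mgr.nblocks  # uniform float64 inputs consolidate into one block
#     1
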

def _maybe_reindex_columns_na_proxy(
    axes: list[Index],
    mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]],
    needs_copy: bool,
) -> list[BlockManager]:
    """
    Reindex along columns so that all of the BlockManagers being concatenated
    have matching columns.

    Columns added in this reindexing have dtype=np.void, indicating they
    should be ignored when choosing a column's final dtype.
    """
    new_mgrs = []

    for mgr, indexers in mgrs_indexers:
        # For axis=0 (i.e. columns) we use_na_proxy and only_slice, so this
        #  is a cheap reindexing.
        for i, indexer in indexers.items():
            mgr = mgr.reindex_indexer(
                axes[i],
                indexer,
                axis=i,
                copy=False,
                only_slice=True,  # only relevant for i==0
                allow_dups=True,
                use_na_proxy=True,  # only relevant for i==0
            )
        if needs_copy and not indexers:
            mgr = mgr.copy()

        new_mgrs.append(mgr)
    return new_mgrs

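# Illustrative usage (added): mismatched columns are what trigger this
# reindexing.  The np.void proxy columns are ignored when deciding result
# dtypes, so "a" below stays float64 even though df2 contributes no "a"
# values (they are filled with NaN).
#
#     >>> import pandas as pd
#     >>> df1 = pd.DataFrame({"a": [1.0]})
#     >>> df2 = pd.DataFrame({"b": [2.0]})
#     >>> pd.concat([df1, df2]).dtypes.to_dict()
#     {'a': dtype('float64'), 'b': dtype('float64')}
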

def _is_homogeneous_mgr(mgr: BlockManager, first_dtype: DtypeObj) -> bool:
    """
    Check if this Manager can be treated as a single ndarray.
    """
    if mgr.nblocks != 1:
        return False
    blk = mgr.blocks[0]
    if not (blk.mgr_locs.is_slice_like and blk.mgr_locs.as_slice.step == 1):
        return False

    return blk.dtype == first_dtype

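# Hedged illustration (added): with the default BlockManager backend, a frame
# built from a single homogeneous ndarray satisfies the check above — one
# block, slice-like placement with step 1, matching dtype.
#
#     >>> import numpy as np
#     >>> import pandas as pd
#     >>> df = pd.DataFrame(np.zeros((3, 2)))
#     >>> df._mgr.nblocks
#     1
#     >>> df._mgr.blocks[0].dtype
#     dtype('float64')
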

def _concat_homogeneous_fastpath(
    mgrs_indexers, shape: Shape, first_dtype: np.dtype
) -> Block:
    """
    With single-Block managers with homogeneous dtypes (that can already hold nan),
    we avoid [...]
    """
    # assumes
    #  all(_is_homogeneous_mgr(mgr, first_dtype) for mgr, _ in mgrs_indexers)

    if all(not indexers for _, indexers in mgrs_indexers):
        # https://github.com/pandas-dev/pandas/pull/52685#issuecomment-1523287739
        arrs = [mgr.blocks[0].values.T for mgr, _ in mgrs_indexers]
        arr = np.concatenate(arrs).T
        bp = libinternals.BlockPlacement(slice(shape[0]))
        nb = new_block_2d(arr, bp)
        return nb

    arr = np.empty(shape, dtype=first_dtype)

    if first_dtype == np.float64:
        take_func = libalgos.take_2d_axis0_float64_float64
    else:
        take_func = libalgos.take_2d_axis0_float32_float32

    start = 0
    for mgr, indexers in mgrs_indexers:
        mgr_len = mgr.shape[1]
        end = start + mgr_len

        if 0 in indexers:
            take_func(
                mgr.blocks[0].values,
                indexers[0],
                arr[:, start:end],
            )
        else:
            # No reindexing necessary, we can copy values directly
            arr[:, start:end] = mgr.blocks[0].values

        start += mgr_len

    bp = libinternals.BlockPlacement(slice(shape[0]))
    nb = new_block_2d(arr, bp)
    return nb

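# Hedged sketch (added; array names are made up): when no reindexing is
# needed, the fastpath above amounts to transpose-concatenate-transpose in
# plain NumPy.  Block values are laid out as (n_columns, n_rows), so stacking
# the transposed chunks appends rows.
#
#     >>> import numpy as np
#     >>> a = np.arange(6.0).reshape(2, 3)       # 2 columns x 3 rows
#     >>> b = np.arange(6.0, 12.0).reshape(2, 3)
#     >>> np.concatenate([a.T, b.T]).T.shape     # 2 columns x 6 rows
#     (2, 6)
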

def _get_combined_plan(
    mgrs: list[BlockManager],
) -> list[tuple[BlockPlacement, list[JoinUnit]]]:
    plan = []

    max_len = mgrs[0].shape[0]

    blknos_list = [mgr.blknos for mgr in mgrs]
    pairs = libinternals.get_concat_blkno_indexers(blknos_list)
    for blknos, bp in pairs:
        # assert bp.is_slice_like
        # assert len(bp) > 0

        units_for_bp = []
        for k, mgr in enumerate(mgrs):
            blkno = blknos[k]

            nb = _get_block_for_concat_plan(mgr, bp, blkno, max_len=max_len)
            unit = JoinUnit(nb)
            units_for_bp.append(unit)

        plan.append((bp, units_for_bp))

    return plan

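# Hedged sketch (added; ``_runs`` is a hypothetical helper, not pandas API):
# conceptually, get_concat_blkno_indexers splits the column range into maximal
# runs over which every manager's block number stays constant, so each run can
# be handled by a single list of JoinUnits.
#
#     def _runs(blknos_list):
#         # yield (slice, per-manager blknos) for each maximal constant run
#         n = len(blknos_list[0])
#         out, start = [], 0
#         for i in range(1, n + 1):
#             if i == n or any(b[i] != b[i - 1] for b in blknos_list):
#                 out.append((slice(start, i), tuple(b[start] for b in blknos_list)))
#                 start = i
#         return out
#
#     >>> _runs([[0, 0, 1], [0, 1, 1]])
#     [(slice(0, 1, None), (0, 0)), (slice(1, 2, None), (0, 1)), (slice(2, 3, None), (1, 1))]
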

def _get_block_for_concat_plan(
    mgr: BlockManager, bp: BlockPlacement, blkno: int, *, max_len: int
) -> Block:
    blk = mgr.blocks[blkno]
    # Assertions disabled for performance:
    #  assert bp.is_slice_like
    #  assert blkno != -1
    #  assert (mgr.blknos[bp] == blkno).all()

    if len(bp) == len(blk.mgr_locs) and (
        blk.mgr_locs.is_slice_like and blk.mgr_locs.as_slice.step == 1
    ):
        nb = blk
    else:
        ax0_blk_indexer = mgr.blklocs[bp.indexer]

        slc = lib.maybe_indices_to_slice(ax0_blk_indexer, max_len)
        # TODO: in all extant test cases 2023-04-08 we have a slice here.
        #  Will this always be the case?
        if isinstance(slc, slice):
            nb = blk.slice_block_columns(slc)
        else:
            nb = blk.take_block_columns(slc)

    # assert nb.shape == (len(bp), mgr.shape[1])
    return nb

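# Hedged illustration (added): lib.maybe_indices_to_slice is what lets the
# code above keep a view instead of copying — in current pandas versions,
# positions with a constant step collapse back into a slice, while irregular
# positions come back as an ndarray.
#
#     >>> import numpy as np
#     >>> from pandas._libs import lib
#     >>> contiguous = np.array([0, 1, 2], dtype=np.intp)
#     >>> isinstance(lib.maybe_indices_to_slice(contiguous, 5), slice)
#     True
#     >>> irregular = np.array([0, 2, 3], dtype=np.intp)
#     >>> isinstance(lib.maybe_indices_to_slice(irregular, 5), slice)
#     False
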

class JoinUnit:
    def __init__(self, block: Block) -> None:
        self.block = block

    def __repr__(self) -> str:
        return f"{type(self).__name__}({repr(self.block)})"

    def _is_valid_na_for(self, dtype: DtypeObj) -> bool:
        """
        Check that we are all-NA of a type/dtype that is compatible with this dtype.
        Augments `self.is_na` with an additional check of the type of NA values.
        """
        if not self.is_na:
            return False

        blk = self.block
        if blk.dtype.kind == "V":
            return True

        if blk.dtype == object:
            values = blk.values
            return all(is_valid_na_for_dtype(x, dtype) for x in values.ravel(order="K"))

        na_value = blk.fill_value
        if na_value is NaT and blk.dtype != dtype:
            # e.g. we are dt64 and other is td64
            # fill_values match but we should not cast blk.values to dtype
            # TODO: this will need updating if we ever have non-nano dt64/td64
            return False

        if na_value is NA and needs_i8_conversion(dtype):
            # FIXME: kludge; test_append_empty_frame_with_timedelta64ns_nat
            # e.g. blk.dtype == "Int64" and dtype is td64, we don't want
            #  to consider these as matching
            return False

        # TODO: better to use can_hold_element?
        return is_valid_na_for_dtype(na_value, dtype)

    @cache_readonly
    def is_na(self) -> bool:
        blk = self.block
        if blk.dtype.kind == "V":
            return True

        if not blk._can_hold_na:
            return False

        values = blk.values
        if values.size == 0:
            # GH#39122 this case will return False once deprecation is enforced
            return True

        if isinstance(values.dtype, SparseDtype):
            return False

        if values.ndim == 1:
            # TODO(EA2D): no need for special case with 2D EAs
            val = values[0]
            if not is_scalar(val) or not isna(val):
                # ideally isna_all would do this short-circuiting
                return False
            return isna_all(values)
        else:
            val = values[0][0]
            if not is_scalar(val) or not isna(val):
                # ideally isna_all would do this short-circuiting
                return False
            return all(isna_all(row) for row in values)

    @cache_readonly
    def is_na_after_size_and_isna_all_deprecation(self) -> bool:
        """
        Will self.is_na be True after values.size == 0 deprecation and isna_all
        deprecation are enforced?
        """
        blk = self.block
        if blk.dtype.kind == "V":
            return True
        return False

    def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
        values: ArrayLike

        if upcasted_na is None and self.block.dtype.kind != "V":
            # No upcasting is necessary
            return self.block.values
        else:
            fill_value = upcasted_na

            if self._is_valid_na_for(empty_dtype):
                # note: always holds when self.block.dtype.kind == "V"
                blk_dtype = self.block.dtype

                if blk_dtype == np.dtype("object"):
                    # we want to avoid filling with np.nan if we are
                    # using None; we already know that we are all
                    # nulls
                    values = cast(np.ndarray, self.block.values)
                    if values.size and values[0, 0] is None:
                        fill_value = None

                return make_na_array(empty_dtype, self.block.shape, fill_value)

            return self.block.values

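# Illustrative usage (added; internal attributes used only for inspection):
# an all-NaN float64 column is "na" in the JoinUnit sense, so under the
# current (deprecated, GH#40893) behavior it is ignored when choosing the
# concatenated dtype.
#
#     >>> import numpy as np
#     >>> import pandas as pd
#     >>> df = pd.DataFrame({"a": [np.nan, np.nan]})
#     >>> JoinUnit(df._mgr.blocks[0]).is_na
#     True
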

def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike:
    """
    Concatenate values from several join units along axis=1.
    """
    empty_dtype, empty_dtype_future = _get_empty_dtype(join_units)

    has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units)
    upcasted_na = _dtype_to_na_value(empty_dtype, has_none_blocks)

    to_concat = [
        ju.get_reindexed_values(empty_dtype=empty_dtype, upcasted_na=upcasted_na)
        for ju in join_units
    ]

    if any(is_1d_only_ea_dtype(t.dtype) for t in to_concat):
        # TODO(EA2D): special case not needed if all EAs used HybridBlocks

        # error: No overload variant of "__getitem__" of "ExtensionArray" matches
        # argument type "Tuple[int, slice]"
        to_concat = [
            t
            if is_1d_only_ea_dtype(t.dtype)
            else t[0, :]  # type: ignore[call-overload]
            for t in to_concat
        ]
        concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True)
        concat_values = ensure_block_shape(concat_values, 2)

    else:
        concat_values = concat_compat(to_concat, axis=1)

    if empty_dtype != empty_dtype_future:
        if empty_dtype == concat_values.dtype:
            # GH#39122, GH#40893
            warnings.warn(
                "The behavior of DataFrame concatenation with empty or all-NA "
                "entries is deprecated. In a future version, this will no longer "
                "exclude empty or all-NA columns when determining the result dtypes. "
                "To retain the old behavior, exclude the relevant entries before "
                "the concat operation.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
    return concat_values

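# Hedged example (added): mixed int64/float64 units are not uniform, so they
# go through _concatenate_join_units, where _get_empty_dtype settles on
# float64 via find_common_type.
#
#     >>> import pandas as pd
#     >>> df1 = pd.DataFrame({"a": [1, 2]})   # int64
#     >>> df2 = pd.DataFrame({"a": [0.5]})    # float64
#     >>> pd.concat([df1, df2])["a"].dtype
#     dtype('float64')
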

def _dtype_to_na_value(dtype: DtypeObj, has_none_blocks: bool):
    """
    Find the NA value to go with this dtype.
    """
    if isinstance(dtype, ExtensionDtype):
        return dtype.na_value
    elif dtype.kind in "mM":
        return dtype.type("NaT")
    elif dtype.kind in "fc":
        return dtype.type("NaN")
    elif dtype.kind == "b":
        # different from missing.na_value_for_dtype
        return None
    elif dtype.kind in "iu":
        if not has_none_blocks:
            # different from missing.na_value_for_dtype
            return None
        return np.nan
    elif dtype.kind == "O":
        return np.nan
    raise NotImplementedError

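# Illustrative calls (added): the mapping above in action.  ``None`` signals
# that no fill value is needed for kinds that cannot hold NA without
# upcasting.
#
#     >>> import numpy as np
#     >>> _dtype_to_na_value(np.dtype("float64"), False)
#     nan
#     >>> _dtype_to_na_value(np.dtype("M8[ns]"), False)
#     numpy.datetime64('NaT')
#     >>> _dtype_to_na_value(np.dtype("int64"), False) is None
#     True
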

def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> tuple[DtypeObj, DtypeObj]:
    """
    Return the dtype to use when concatenating the specified units.

    Returns
    -------
    tuple[DtypeObj, DtypeObj]
        The dtype under the current behavior and the dtype under the future
        behavior, once the empty/all-NA deprecation is enforced.
    """
    if lib.dtypes_all_equal([ju.block.dtype for ju in join_units]):
        empty_dtype = join_units[0].block.dtype
        return empty_dtype, empty_dtype

    has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units)

    dtypes = [unit.block.dtype for unit in join_units if not unit.is_na]
    if not len(dtypes):
        dtypes = [
            unit.block.dtype for unit in join_units if unit.block.dtype.kind != "V"
        ]

    dtype = find_common_type(dtypes)
    if has_none_blocks:
        dtype = ensure_dtype_can_hold_na(dtype)

    dtype_future = dtype
    if len(dtypes) != len(join_units):
        dtypes_future = [
            unit.block.dtype
            for unit in join_units
            if not unit.is_na_after_size_and_isna_all_deprecation
        ]
        if not len(dtypes_future):
            dtypes_future = [
                unit.block.dtype for unit in join_units if unit.block.dtype.kind != "V"
            ]

        if len(dtypes) != len(dtypes_future):
            dtype_future = find_common_type(dtypes_future)
            if has_none_blocks:
                dtype_future = ensure_dtype_can_hold_na(dtype_future)

    return dtype, dtype_future

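# Hedged example (added): a column missing from one frame exercises both
# branches above — the np.void proxy sets has_none_blocks, and since
# datetime64 can already hold NaT, ensure_dtype_can_hold_na keeps it as-is
# (dtype repr shown for pandas 2.x nanosecond units).
#
#     >>> import pandas as pd
#     >>> df1 = pd.DataFrame({"a": pd.to_datetime(["2020-01-01"])})
#     >>> df2 = pd.DataFrame({"b": [1.0]})
#     >>> pd.concat([df1, df2])["a"].dtype
#     dtype('<M8[ns]')
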

def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool:
    """
    Check if the join units consist of blocks of uniform type that can
    be concatenated using Block.concat_same_type instead of the generic
    _concatenate_join_units (which uses `concat_compat`).
    """
    first = join_units[0].block
    if first.dtype.kind == "V":
        return False
    return (
        # exclude cases where a) ju.block is None or b) we have e.g. Int64+int64
        all(type(ju.block) is type(first) for ju in join_units)
        and
        # e.g. DatetimeLikeBlock can be dt64 or td64, but these are not uniform
        all(
            ju.block.dtype == first.dtype
            # GH#42092 we only want the dtype_equal check for non-numeric blocks
            #  (for now, may change but that would need a deprecation)
            or ju.block.dtype.kind in "iub"
            for ju in join_units
        )
        and
        # no blocks that would get missing values (can lead to type upcasts)
        # unless we're an extension dtype.
        all(not ju.is_na or ju.block.is_extension for ju in join_units)
    )
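
# Hedged examples (added): matching int64 units are uniform and take the
# np.concatenate fast path, while mixing Int64 (extension) with int64 is
# excluded above (different block types) and falls back to the generic path,
# where the common dtype resolves to the masked Int64.
#
#     >>> import pandas as pd
#     >>> df1 = pd.DataFrame({"a": [1, 2]})
#     >>> df2 = pd.DataFrame({"a": [3]})
#     >>> pd.concat([df1, df2])["a"].dtype
#     dtype('int64')
#     >>> df3 = pd.DataFrame({"a": pd.array([4], dtype="Int64")})
#     >>> pd.concat([df1, df3])["a"].dtype
#     Int64Dtype()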