1from __future__ import annotations
2
3from typing import (
4 TYPE_CHECKING,
5 cast,
6)
7import warnings
8
9import numpy as np
10
11from pandas._libs import (
12 NaT,
13 algos as libalgos,
14 internals as libinternals,
15 lib,
16)
17from pandas._libs.missing import NA
18from pandas.util._decorators import cache_readonly
19from pandas.util._exceptions import find_stack_level
20
21from pandas.core.dtypes.cast import (
22 ensure_dtype_can_hold_na,
23 find_common_type,
24)
25from pandas.core.dtypes.common import (
26 is_1d_only_ea_dtype,
27 is_scalar,
28 needs_i8_conversion,
29)
30from pandas.core.dtypes.concat import concat_compat
31from pandas.core.dtypes.dtypes import (
32 ExtensionDtype,
33 SparseDtype,
34)
35from pandas.core.dtypes.missing import (
36 is_valid_na_for_dtype,
37 isna,
38 isna_all,
39)
40
41from pandas.core.construction import ensure_wrapped_if_datetimelike
42from pandas.core.internals.array_manager import ArrayManager
43from pandas.core.internals.blocks import (
44 ensure_block_shape,
45 new_block_2d,
46)
47from pandas.core.internals.managers import (
48 BlockManager,
49 make_na_array,
50)
51
52if TYPE_CHECKING:
53 from collections.abc import Sequence
54
55 from pandas._typing import (
56 ArrayLike,
57 AxisInt,
58 DtypeObj,
59 Manager2D,
60 Shape,
61 )
62
63 from pandas import Index
64 from pandas.core.internals.blocks import (
65 Block,
66 BlockPlacement,
67 )
68
69
70def _concatenate_array_managers(
71 mgrs: list[ArrayManager], axes: list[Index], concat_axis: AxisInt
72) -> Manager2D:
73 """
74 Concatenate array managers into one.
75
76 Parameters
77 ----------
78 mgrs_indexers : list of (ArrayManager, {axis: indexer,...}) tuples
79 axes : list of Index
80 concat_axis : int
81
82 Returns
83 -------
84 ArrayManager
85 """
86 if concat_axis == 1:
87 return mgrs[0].concat_vertical(mgrs, axes)
88 else:
89 # concatting along the columns -> combine reindexed arrays in a single manager
90 assert concat_axis == 0
91 return mgrs[0].concat_horizontal(mgrs, axes)
92
93
def concatenate_managers(
    mgrs_indexers, axes: list[Index], concat_axis: AxisInt, copy: bool
) -> Manager2D:
    """
    Concatenate block managers into one.

    Parameters
    ----------
    mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples
        Managers to concatenate, each paired with the reindexing indexers
        still to be applied to it (never for ``concat_axis`` itself).
    axes : list of Index
        Axes of the resulting manager.
    concat_axis : int
        0 -> concatenate along the columns; any other value -> row-wise.
    copy : bool
        Whether a copy is required when no reindexing happens (only
        relevant for the ``concat_axis == 0`` path, see ``needs_copy``).

    Returns
    -------
    BlockManager
    """

    # needs_copy is False whenever concat_axis != 0.
    needs_copy = copy and concat_axis == 0

    # TODO(ArrayManager) this assumes that all managers are of the same type
    if isinstance(mgrs_indexers[0][0], ArrayManager):
        mgrs = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers, needs_copy)
        # error: Argument 1 to "_concatenate_array_managers" has incompatible
        # type "List[BlockManager]"; expected "List[Union[ArrayManager,
        # SingleArrayManager, BlockManager, SingleBlockManager]]"
        return _concatenate_array_managers(
            mgrs, axes, concat_axis  # type: ignore[arg-type]
        )

    # Assertions disabled for performance
    # for tup in mgrs_indexers:
    #     # caller is responsible for ensuring this
    #     indexers = tup[1]
    #     assert concat_axis not in indexers

    if concat_axis == 0:
        # Column-wise concat: reindex, then let the manager stitch columns.
        mgrs = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers, needs_copy)
        return mgrs[0].concat_horizontal(mgrs, axes)

    # Row-wise concat from here on.
    if len(mgrs_indexers) > 0 and mgrs_indexers[0][0].nblocks > 0:
        first_dtype = mgrs_indexers[0][0].blocks[0].dtype
        if first_dtype in [np.float64, np.float32]:
            # TODO: support more dtypes here. This will be simpler once
            # JoinUnit.is_na behavior is deprecated.
            if (
                all(_is_homogeneous_mgr(mgr, first_dtype) for mgr, _ in mgrs_indexers)
                and len(mgrs_indexers) > 1
            ):
                # Fastpath!
                # Length restriction is just to avoid having to worry about 'copy'
                shape = tuple(len(x) for x in axes)
                nb = _concat_homogeneous_fastpath(mgrs_indexers, shape, first_dtype)
                return BlockManager((nb,), axes)

    mgrs = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers, needs_copy)

    if len(mgrs) == 1:
        # Single input: reuse its blocks, only swap in the result axes.
        mgr = mgrs[0]
        out = mgr.copy(deep=False)
        out.axes = axes
        return out

    # One entry per resulting block placement, with one JoinUnit per manager.
    concat_plan = _get_combined_plan(mgrs)

    blocks = []
    values: ArrayLike

    for placement, join_units in concat_plan:
        unit = join_units[0]
        blk = unit.block

        if _is_uniform_join_units(join_units):
            vals = [ju.block.values for ju in join_units]

            if not blk.is_extension:
                # _is_uniform_join_units ensures a single dtype, so
                # we can use np.concatenate, which is more performant
                # than concat_compat
                # error: Argument 1 to "concatenate" has incompatible type
                # "List[Union[ndarray[Any, Any], ExtensionArray]]";
                # expected "Union[_SupportsArray[dtype[Any]],
                # _NestedSequence[_SupportsArray[dtype[Any]]]]"
                values = np.concatenate(vals, axis=1)  # type: ignore[arg-type]
            elif is_1d_only_ea_dtype(blk.dtype):
                # TODO(EA2D): special-casing not needed with 2D EAs
                values = concat_compat(vals, axis=0, ea_compat_axis=True)
                values = ensure_block_shape(values, ndim=2)
            else:
                values = concat_compat(vals, axis=1)

            values = ensure_wrapped_if_datetimelike(values)

            # Same dtype means we can keep the original block class.
            fastpath = blk.values.dtype == values.dtype
        else:
            values = _concatenate_join_units(join_units, copy=copy)
            fastpath = False

        if fastpath:
            b = blk.make_block_same_class(values, placement=placement)
        else:
            b = new_block_2d(values, placement=placement)

        blocks.append(b)

    return BlockManager(tuple(blocks), axes)
200
201
202def _maybe_reindex_columns_na_proxy(
203 axes: list[Index],
204 mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]],
205 needs_copy: bool,
206) -> list[BlockManager]:
207 """
208 Reindex along columns so that all of the BlockManagers being concatenated
209 have matching columns.
210
211 Columns added in this reindexing have dtype=np.void, indicating they
212 should be ignored when choosing a column's final dtype.
213 """
214 new_mgrs = []
215
216 for mgr, indexers in mgrs_indexers:
217 # For axis=0 (i.e. columns) we use_na_proxy and only_slice, so this
218 # is a cheap reindexing.
219 for i, indexer in indexers.items():
220 mgr = mgr.reindex_indexer(
221 axes[i],
222 indexers[i],
223 axis=i,
224 copy=False,
225 only_slice=True, # only relevant for i==0
226 allow_dups=True,
227 use_na_proxy=True, # only relevant for i==0
228 )
229 if needs_copy and not indexers:
230 mgr = mgr.copy()
231
232 new_mgrs.append(mgr)
233 return new_mgrs
234
235
236def _is_homogeneous_mgr(mgr: BlockManager, first_dtype: DtypeObj) -> bool:
237 """
238 Check if this Manager can be treated as a single ndarray.
239 """
240 if mgr.nblocks != 1:
241 return False
242 blk = mgr.blocks[0]
243 if not (blk.mgr_locs.is_slice_like and blk.mgr_locs.as_slice.step == 1):
244 return False
245
246 return blk.dtype == first_dtype
247
248
249def _concat_homogeneous_fastpath(
250 mgrs_indexers, shape: Shape, first_dtype: np.dtype
251) -> Block:
252 """
253 With single-Block managers with homogeneous dtypes (that can already hold nan),
254 we avoid [...]
255 """
256 # assumes
257 # all(_is_homogeneous_mgr(mgr, first_dtype) for mgr, _ in in mgrs_indexers)
258
259 if all(not indexers for _, indexers in mgrs_indexers):
260 # https://github.com/pandas-dev/pandas/pull/52685#issuecomment-1523287739
261 arrs = [mgr.blocks[0].values.T for mgr, _ in mgrs_indexers]
262 arr = np.concatenate(arrs).T
263 bp = libinternals.BlockPlacement(slice(shape[0]))
264 nb = new_block_2d(arr, bp)
265 return nb
266
267 arr = np.empty(shape, dtype=first_dtype)
268
269 if first_dtype == np.float64:
270 take_func = libalgos.take_2d_axis0_float64_float64
271 else:
272 take_func = libalgos.take_2d_axis0_float32_float32
273
274 start = 0
275 for mgr, indexers in mgrs_indexers:
276 mgr_len = mgr.shape[1]
277 end = start + mgr_len
278
279 if 0 in indexers:
280 take_func(
281 mgr.blocks[0].values,
282 indexers[0],
283 arr[:, start:end],
284 )
285 else:
286 # No reindexing necessary, we can copy values directly
287 arr[:, start:end] = mgr.blocks[0].values
288
289 start += mgr_len
290
291 bp = libinternals.BlockPlacement(slice(shape[0]))
292 nb = new_block_2d(arr, bp)
293 return nb
294
295
def _get_combined_plan(
    mgrs: list[BlockManager],
) -> list[tuple[BlockPlacement, list[JoinUnit]]]:
    """
    Build the concatenation plan: for each block placement in the result,
    collect one JoinUnit per input manager covering that placement.

    Parameters
    ----------
    mgrs : list of BlockManager
        Managers with matching columns (already reindexed by the caller).

    Returns
    -------
    list of (BlockPlacement, list of JoinUnit) tuples
    """
    plan: list[tuple[BlockPlacement, list[JoinUnit]]] = []

    max_len = mgrs[0].shape[0]

    blknos_list = [mgr.blknos for mgr in mgrs]
    pairs = libinternals.get_concat_blkno_indexers(blknos_list)
    # (was `enumerate(pairs)` with an unused index variable)
    for blknos, bp in pairs:
        # assert bp.is_slice_like
        # assert len(bp) > 0

        units_for_bp = []
        for k, mgr in enumerate(mgrs):
            blkno = blknos[k]

            nb = _get_block_for_concat_plan(mgr, bp, blkno, max_len=max_len)
            unit = JoinUnit(nb)
            units_for_bp.append(unit)

        plan.append((bp, units_for_bp))

    return plan
320
321
322def _get_block_for_concat_plan(
323 mgr: BlockManager, bp: BlockPlacement, blkno: int, *, max_len: int
324) -> Block:
325 blk = mgr.blocks[blkno]
326 # Assertions disabled for performance:
327 # assert bp.is_slice_like
328 # assert blkno != -1
329 # assert (mgr.blknos[bp] == blkno).all()
330
331 if len(bp) == len(blk.mgr_locs) and (
332 blk.mgr_locs.is_slice_like and blk.mgr_locs.as_slice.step == 1
333 ):
334 nb = blk
335 else:
336 ax0_blk_indexer = mgr.blklocs[bp.indexer]
337
338 slc = lib.maybe_indices_to_slice(ax0_blk_indexer, max_len)
339 # TODO: in all extant test cases 2023-04-08 we have a slice here.
340 # Will this always be the case?
341 if isinstance(slc, slice):
342 nb = blk.slice_block_columns(slc)
343 else:
344 nb = blk.take_block_columns(slc)
345
346 # assert nb.shape == (len(bp), mgr.shape[1])
347 return nb
348
349
class JoinUnit:
    """
    Wrapper around a single Block taking part in a concatenation, providing
    the NA/dtype introspection used to choose the result dtype and fill value.
    """

    def __init__(self, block: Block) -> None:
        self.block = block

    def __repr__(self) -> str:
        return f"{type(self).__name__}({repr(self.block)})"

    def _is_valid_na_for(self, dtype: DtypeObj) -> bool:
        """
        Check that we are all-NA of a type/dtype that is compatible with this dtype.
        Augments `self.is_na` with an additional check of the type of NA values.
        """
        if not self.is_na:
            return False

        blk = self.block
        if blk.dtype.kind == "V":
            # np.void proxy columns (added by reindexing) are compatible
            # with any target dtype.
            return True

        if blk.dtype == object:
            # Object blocks: every element must individually be a valid NA
            # for the target dtype.
            values = blk.values
            return all(is_valid_na_for_dtype(x, dtype) for x in values.ravel(order="K"))

        na_value = blk.fill_value
        if na_value is NaT and blk.dtype != dtype:
            # e.g. we are dt64 and other is td64
            # fill_values match but we should not cast blk.values to dtype
            # TODO: this will need updating if we ever have non-nano dt64/td64
            return False

        if na_value is NA and needs_i8_conversion(dtype):
            # FIXME: kludge; test_append_empty_frame_with_timedelta64ns_nat
            # e.g. blk.dtype == "Int64" and dtype is td64, we dont want
            # to consider these as matching
            return False

        # TODO: better to use can_hold_element?
        return is_valid_na_for_dtype(na_value, dtype)

    @cache_readonly
    def is_na(self) -> bool:
        # True when the block is a np.void proxy, empty, or entirely NA.
        blk = self.block
        if blk.dtype.kind == "V":
            return True

        if not blk._can_hold_na:
            # A dtype that can't hold NA can't be all-NA.
            return False

        values = blk.values
        if values.size == 0:
            # GH#39122 this case will return False once deprecation is enforced
            return True

        if isinstance(values.dtype, SparseDtype):
            return False

        if values.ndim == 1:
            # TODO(EA2D): no need for special case with 2D EAs
            val = values[0]
            if not is_scalar(val) or not isna(val):
                # ideally isna_all would do this short-circuiting
                return False
            return isna_all(values)
        else:
            val = values[0][0]
            if not is_scalar(val) or not isna(val):
                # ideally isna_all would do this short-circuiting
                return False
            return all(isna_all(row) for row in values)

    @cache_readonly
    def is_na_after_size_and_isna_all_deprecation(self) -> bool:
        """
        Will self.is_na be True after values.size == 0 deprecation and isna_all
        deprecation are enforced?
        """
        # Only np.void proxy blocks will remain "NA" once the deprecations
        # are enforced.
        blk = self.block
        if blk.dtype.kind == "V":
            return True
        return False

    def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
        """
        Return this unit's values, replaced by an all-NA array of
        ``empty_dtype`` when the unit is (compatibly) all-NA.
        """
        values: ArrayLike

        if upcasted_na is None and self.block.dtype.kind != "V":
            # No upcasting is necessary
            return self.block.values
        else:
            fill_value = upcasted_na

            if self._is_valid_na_for(empty_dtype):
                # note: always holds when self.block.dtype.kind == "V"
                blk_dtype = self.block.dtype

                if blk_dtype == np.dtype("object"):
                    # we want to avoid filling with np.nan if we are
                    # using None; we already know that we are all
                    # nulls
                    values = cast(np.ndarray, self.block.values)
                    if values.size and values[0, 0] is None:
                        fill_value = None

                return make_na_array(empty_dtype, self.block.shape, fill_value)

            return self.block.values
455
456
def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike:
    """
    Concatenate values from several join units along axis=1.

    Parameters
    ----------
    join_units : list of JoinUnit
    copy : bool
        NOTE(review): not used anywhere in this function body; kept for
        interface compatibility with callers.

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    empty_dtype, empty_dtype_future = _get_empty_dtype(join_units)

    has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units)
    upcasted_na = _dtype_to_na_value(empty_dtype, has_none_blocks)

    # Materialize each unit's values, filling all-NA units with the NA value
    # appropriate for the common dtype.
    to_concat = [
        ju.get_reindexed_values(empty_dtype=empty_dtype, upcasted_na=upcasted_na)
        for ju in join_units
    ]

    if any(is_1d_only_ea_dtype(t.dtype) for t in to_concat):
        # TODO(EA2D): special case not needed if all EAs used HybridBlocks

        # error: No overload variant of "__getitem__" of "ExtensionArray" matches
        # argument type "Tuple[int, slice]"
        to_concat = [
            t
            if is_1d_only_ea_dtype(t.dtype)
            else t[0, :]  # type: ignore[call-overload]
            for t in to_concat
        ]
        concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True)
        concat_values = ensure_block_shape(concat_values, 2)

    else:
        concat_values = concat_compat(to_concat, axis=1)

    if empty_dtype != empty_dtype_future:
        # The dtype determination will change once the empty/all-NA exclusion
        # deprecation is enforced; warn when the current result would differ.
        if empty_dtype == concat_values.dtype:
            # GH#39122, GH#40893
            warnings.warn(
                "The behavior of DataFrame concatenation with empty or all-NA "
                "entries is deprecated. In a future version, this will no longer "
                "exclude empty or all-NA columns when determining the result dtypes. "
                "To retain the old behavior, exclude the relevant entries before "
                "the concat operation.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
    return concat_values
501
502
503def _dtype_to_na_value(dtype: DtypeObj, has_none_blocks: bool):
504 """
505 Find the NA value to go with this dtype.
506 """
507 if isinstance(dtype, ExtensionDtype):
508 return dtype.na_value
509 elif dtype.kind in "mM":
510 return dtype.type("NaT")
511 elif dtype.kind in "fc":
512 return dtype.type("NaN")
513 elif dtype.kind == "b":
514 # different from missing.na_value_for_dtype
515 return None
516 elif dtype.kind in "iu":
517 if not has_none_blocks:
518 # different from missing.na_value_for_dtype
519 return None
520 return np.nan
521 elif dtype.kind == "O":
522 return np.nan
523 raise NotImplementedError
524
525
526def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> tuple[DtypeObj, DtypeObj]:
527 """
528 Return dtype and N/A values to use when concatenating specified units.
529
530 Returned N/A value may be None which means there was no casting involved.
531
532 Returns
533 -------
534 dtype
535 """
536 if lib.dtypes_all_equal([ju.block.dtype for ju in join_units]):
537 empty_dtype = join_units[0].block.dtype
538 return empty_dtype, empty_dtype
539
540 has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units)
541
542 dtypes = [unit.block.dtype for unit in join_units if not unit.is_na]
543 if not len(dtypes):
544 dtypes = [
545 unit.block.dtype for unit in join_units if unit.block.dtype.kind != "V"
546 ]
547
548 dtype = find_common_type(dtypes)
549 if has_none_blocks:
550 dtype = ensure_dtype_can_hold_na(dtype)
551
552 dtype_future = dtype
553 if len(dtypes) != len(join_units):
554 dtypes_future = [
555 unit.block.dtype
556 for unit in join_units
557 if not unit.is_na_after_size_and_isna_all_deprecation
558 ]
559 if not len(dtypes_future):
560 dtypes_future = [
561 unit.block.dtype for unit in join_units if unit.block.dtype.kind != "V"
562 ]
563
564 if len(dtypes) != len(dtypes_future):
565 dtype_future = find_common_type(dtypes_future)
566 if has_none_blocks:
567 dtype_future = ensure_dtype_can_hold_na(dtype_future)
568
569 return dtype, dtype_future
570
571
572def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool:
573 """
574 Check if the join units consist of blocks of uniform type that can
575 be concatenated using Block.concat_same_type instead of the generic
576 _concatenate_join_units (which uses `concat_compat`).
577
578 """
579 first = join_units[0].block
580 if first.dtype.kind == "V":
581 return False
582 return (
583 # exclude cases where a) ju.block is None or b) we have e.g. Int64+int64
584 all(type(ju.block) is type(first) for ju in join_units)
585 and
586 # e.g. DatetimeLikeBlock can be dt64 or td64, but these are not uniform
587 all(
588 ju.block.dtype == first.dtype
589 # GH#42092 we only want the dtype_equal check for non-numeric blocks
590 # (for now, may change but that would need a deprecation)
591 or ju.block.dtype.kind in "iub"
592 for ju in join_units
593 )
594 and
595 # no blocks that would get missing values (can lead to type upcasts)
596 # unless we're an extension dtype.
597 all(not ju.is_na or ju.block.is_extension for ju in join_units)
598 )