1from __future__ import annotations
2
3from typing import (
4 TYPE_CHECKING,
5 cast,
6)
7import warnings
8
9import numpy as np
10
11from pandas._libs import (
12 NaT,
13 algos as libalgos,
14 internals as libinternals,
15 lib,
16)
17from pandas._libs.missing import NA
18from pandas.util._decorators import cache_readonly
19from pandas.util._exceptions import find_stack_level
20
21from pandas.core.dtypes.cast import (
22 ensure_dtype_can_hold_na,
23 find_common_type,
24)
25from pandas.core.dtypes.common import (
26 is_1d_only_ea_dtype,
27 is_scalar,
28 needs_i8_conversion,
29)
30from pandas.core.dtypes.concat import concat_compat
31from pandas.core.dtypes.dtypes import (
32 ExtensionDtype,
33 SparseDtype,
34)
35from pandas.core.dtypes.missing import (
36 is_valid_na_for_dtype,
37 isna,
38 isna_all,
39)
40
41from pandas.core.construction import ensure_wrapped_if_datetimelike
42from pandas.core.internals.array_manager import ArrayManager
43from pandas.core.internals.blocks import (
44 ensure_block_shape,
45 new_block_2d,
46)
47from pandas.core.internals.managers import (
48 BlockManager,
49 make_na_array,
50)
51
52if TYPE_CHECKING:
53 from collections.abc import Sequence
54
55 from pandas._typing import (
56 ArrayLike,
57 AxisInt,
58 DtypeObj,
59 Manager2D,
60 Shape,
61 )
62
63 from pandas import Index
64 from pandas.core.internals.blocks import (
65 Block,
66 BlockPlacement,
67 )
68
69
70def _concatenate_array_managers(
71 mgrs: list[ArrayManager], axes: list[Index], concat_axis: AxisInt
72) -> Manager2D:
73 """
74 Concatenate array managers into one.
75
76 Parameters
77 ----------
78 mgrs_indexers : list of (ArrayManager, {axis: indexer,...}) tuples
79 axes : list of Index
80 concat_axis : int
81
82 Returns
83 -------
84 ArrayManager
85 """
86 if concat_axis == 1:
87 return mgrs[0].concat_vertical(mgrs, axes)
88 else:
89 # concatting along the columns -> combine reindexed arrays in a single manager
90 assert concat_axis == 0
91 return mgrs[0].concat_horizontal(mgrs, axes)
92
93
def concatenate_managers(
    mgrs_indexers, axes: list[Index], concat_axis: AxisInt, copy: bool
) -> Manager2D:
    """
    Concatenate block managers into one.

    Parameters
    ----------
    mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples
        Managers to concatenate, each paired with the reindexing indexers
        still to be applied to it (never for ``concat_axis`` itself).
    axes : list of Index
        Axes of the resulting manager.
    concat_axis : int
        0 -> concatenate along the columns; any other value -> row-wise.
    copy : bool
        Whether a copy is required when no reindexing happens (only
        relevant for the ``concat_axis == 0`` path, see ``needs_copy``).

    Returns
    -------
    BlockManager
    """

    # needs_copy is False whenever concat_axis != 0.
    needs_copy = copy and concat_axis == 0

    # TODO(ArrayManager) this assumes that all managers are of the same type
    if isinstance(mgrs_indexers[0][0], ArrayManager):
        mgrs = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers, needs_copy)
        # error: Argument 1 to "_concatenate_array_managers" has incompatible
        # type "List[BlockManager]"; expected "List[Union[ArrayManager,
        # SingleArrayManager, BlockManager, SingleBlockManager]]"
        return _concatenate_array_managers(
            mgrs, axes, concat_axis  # type: ignore[arg-type]
        )

    # Assertions disabled for performance
    # for tup in mgrs_indexers:
    #     # caller is responsible for ensuring this
    #     indexers = tup[1]
    #     assert concat_axis not in indexers

    if concat_axis == 0:
        # Column-wise concat: reindex, then let the manager stitch columns.
        mgrs = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers, needs_copy)
        return mgrs[0].concat_horizontal(mgrs, axes)

    # Row-wise concat from here on.
    if len(mgrs_indexers) > 0 and mgrs_indexers[0][0].nblocks > 0:
        first_dtype = mgrs_indexers[0][0].blocks[0].dtype
        if first_dtype in [np.float64, np.float32]:
            # TODO: support more dtypes here. This will be simpler once
            # JoinUnit.is_na behavior is deprecated.
            if (
                all(_is_homogeneous_mgr(mgr, first_dtype) for mgr, _ in mgrs_indexers)
                and len(mgrs_indexers) > 1
            ):
                # Fastpath!
                # Length restriction is just to avoid having to worry about 'copy'
                shape = tuple(len(x) for x in axes)
                nb = _concat_homogeneous_fastpath(mgrs_indexers, shape, first_dtype)
                return BlockManager((nb,), axes)

    mgrs = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers, needs_copy)

    if len(mgrs) == 1:
        # Single input: reuse its blocks, only swap in the result axes.
        mgr = mgrs[0]
        out = mgr.copy(deep=False)
        out.axes = axes
        return out

    # One entry per resulting block placement, with one JoinUnit per manager.
    concat_plan = _get_combined_plan(mgrs)

    blocks = []
    values: ArrayLike

    for placement, join_units in concat_plan:
        unit = join_units[0]
        blk = unit.block

        if _is_uniform_join_units(join_units):
            vals = [ju.block.values for ju in join_units]

            if not blk.is_extension:
                # _is_uniform_join_units ensures a single dtype, so
                # we can use np.concatenate, which is more performant
                # than concat_compat
                # error: Argument 1 to "concatenate" has incompatible type
                # "List[Union[ndarray[Any, Any], ExtensionArray]]";
                # expected "Union[_SupportsArray[dtype[Any]],
                # _NestedSequence[_SupportsArray[dtype[Any]]]]"
                values = np.concatenate(vals, axis=1)  # type: ignore[arg-type]
            elif is_1d_only_ea_dtype(blk.dtype):
                # TODO(EA2D): special-casing not needed with 2D EAs
                values = concat_compat(vals, axis=0, ea_compat_axis=True)
                values = ensure_block_shape(values, ndim=2)
            else:
                values = concat_compat(vals, axis=1)

            values = ensure_wrapped_if_datetimelike(values)

            # Same dtype means we can keep the original block class.
            fastpath = blk.values.dtype == values.dtype
        else:
            values = _concatenate_join_units(join_units, copy=copy)
            fastpath = False

        if fastpath:
            b = blk.make_block_same_class(values, placement=placement)
        else:
            b = new_block_2d(values, placement=placement)

        blocks.append(b)

    return BlockManager(tuple(blocks), axes)
200
201
202def _maybe_reindex_columns_na_proxy(
203 axes: list[Index],
204 mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]],
205 needs_copy: bool,
206) -> list[BlockManager]:
207 """
208 Reindex along columns so that all of the BlockManagers being concatenated
209 have matching columns.
210
211 Columns added in this reindexing have dtype=np.void, indicating they
212 should be ignored when choosing a column's final dtype.
213 """
214 new_mgrs = []
215
216 for mgr, indexers in mgrs_indexers:
217 # For axis=0 (i.e. columns) we use_na_proxy and only_slice, so this
218 # is a cheap reindexing.
219 for i, indexer in indexers.items():
220 mgr = mgr.reindex_indexer(
221 axes[i],
222 indexers[i],
223 axis=i,
224 copy=False,
225 only_slice=True, # only relevant for i==0
226 allow_dups=True,
227 use_na_proxy=True, # only relevant for i==0
228 )
229 if needs_copy and not indexers:
230 mgr = mgr.copy()
231
232 new_mgrs.append(mgr)
233 return new_mgrs
234
235
236def _is_homogeneous_mgr(mgr: BlockManager, first_dtype: DtypeObj) -> bool:
237 """
238 Check if this Manager can be treated as a single ndarray.
239 """
240 if mgr.nblocks != 1:
241 return False
242 blk = mgr.blocks[0]
243 if not (blk.mgr_locs.is_slice_like and blk.mgr_locs.as_slice.step == 1):
244 return False
245
246 return blk.dtype == first_dtype
247
248
249def _concat_homogeneous_fastpath(
250 mgrs_indexers, shape: Shape, first_dtype: np.dtype
251) -> Block:
252 """
253 With single-Block managers with homogeneous dtypes (that can already hold nan),
254 we avoid [...]
255 """
256 # assumes
257 # all(_is_homogeneous_mgr(mgr, first_dtype) for mgr, _ in in mgrs_indexers)
258
259 if all(not indexers for _, indexers in mgrs_indexers):
260 # https://github.com/pandas-dev/pandas/pull/52685#issuecomment-1523287739
261 arrs = [mgr.blocks[0].values.T for mgr, _ in mgrs_indexers]
262 arr = np.concatenate(arrs).T
263 bp = libinternals.BlockPlacement(slice(shape[0]))
264 nb = new_block_2d(arr, bp)
265 return nb
266
267 arr = np.empty(shape, dtype=first_dtype)
268
269 if first_dtype == np.float64:
270 take_func = libalgos.take_2d_axis0_float64_float64
271 else:
272 take_func = libalgos.take_2d_axis0_float32_float32
273
274 start = 0
275 for mgr, indexers in mgrs_indexers:
276 mgr_len = mgr.shape[1]
277 end = start + mgr_len
278
279 if 0 in indexers:
280 take_func(
281 mgr.blocks[0].values,
282 indexers[0],
283 arr[:, start:end],
284 )
285 else:
286 # No reindexing necessary, we can copy values directly
287 arr[:, start:end] = mgr.blocks[0].values
288
289 start += mgr_len
290
291 bp = libinternals.BlockPlacement(slice(shape[0]))
292 nb = new_block_2d(arr, bp)
293 return nb
294
295
def _get_combined_plan(
    mgrs: list[BlockManager],
) -> list[tuple[BlockPlacement, list[JoinUnit]]]:
    """
    Build the concatenation plan: for each block placement in the result,
    collect one JoinUnit per input manager covering that placement.

    Parameters
    ----------
    mgrs : list of BlockManager
        Managers with matching columns (already reindexed by the caller).

    Returns
    -------
    list of (BlockPlacement, list of JoinUnit) tuples
    """
    plan: list[tuple[BlockPlacement, list[JoinUnit]]] = []

    max_len = mgrs[0].shape[0]

    blknos_list = [mgr.blknos for mgr in mgrs]
    pairs = libinternals.get_concat_blkno_indexers(blknos_list)
    # (was `enumerate(pairs)` with an unused index variable)
    for blknos, bp in pairs:
        # assert bp.is_slice_like
        # assert len(bp) > 0

        units_for_bp = []
        for k, mgr in enumerate(mgrs):
            blkno = blknos[k]

            nb = _get_block_for_concat_plan(mgr, bp, blkno, max_len=max_len)
            unit = JoinUnit(nb)
            units_for_bp.append(unit)

        plan.append((bp, units_for_bp))

    return plan
320
321
322def _get_block_for_concat_plan(
323 mgr: BlockManager, bp: BlockPlacement, blkno: int, *, max_len: int
324) -> Block:
325 blk = mgr.blocks[blkno]
326 # Assertions disabled for performance:
327 # assert bp.is_slice_like
328 # assert blkno != -1
329 # assert (mgr.blknos[bp] == blkno).all()
330
331 if len(bp) == len(blk.mgr_locs) and (
332 blk.mgr_locs.is_slice_like and blk.mgr_locs.as_slice.step == 1
333 ):
334 nb = blk
335 else:
336 ax0_blk_indexer = mgr.blklocs[bp.indexer]
337
338 slc = lib.maybe_indices_to_slice(ax0_blk_indexer, max_len)
339 # TODO: in all extant test cases 2023-04-08 we have a slice here.
340 # Will this always be the case?
341 if isinstance(slc, slice):
342 nb = blk.slice_block_columns(slc)
343 else:
344 nb = blk.take_block_columns(slc)
345
346 # assert nb.shape == (len(bp), mgr.shape[1])
347 return nb
348
349
class JoinUnit:
    """
    Wrapper around a single Block taking part in a concatenation, providing
    the NA/dtype introspection used to choose the result dtype and fill value.
    """

    def __init__(self, block: Block) -> None:
        self.block = block

    def __repr__(self) -> str:
        return f"{type(self).__name__}({repr(self.block)})"

    def _is_valid_na_for(self, dtype: DtypeObj) -> bool:
        """
        Check that we are all-NA of a type/dtype that is compatible with this dtype.
        Augments `self.is_na` with an additional check of the type of NA values.
        """
        if not self.is_na:
            return False

        blk = self.block
        if blk.dtype.kind == "V":
            # np.void proxy columns (added by reindexing) are compatible
            # with any target dtype.
            return True

        if blk.dtype == object:
            # Object blocks: every element must individually be a valid NA
            # for the target dtype.
            values = blk.values
            return all(is_valid_na_for_dtype(x, dtype) for x in values.ravel(order="K"))

        na_value = blk.fill_value
        if na_value is NaT and blk.dtype != dtype:
            # e.g. we are dt64 and other is td64
            # fill_values match but we should not cast blk.values to dtype
            # TODO: this will need updating if we ever have non-nano dt64/td64
            return False

        if na_value is NA and needs_i8_conversion(dtype):
            # FIXME: kludge; test_append_empty_frame_with_timedelta64ns_nat
            # e.g. blk.dtype == "Int64" and dtype is td64, we dont want
            # to consider these as matching
            return False

        # TODO: better to use can_hold_element?
        return is_valid_na_for_dtype(na_value, dtype)

    @cache_readonly
    def is_na(self) -> bool:
        # True when the block is a np.void proxy, empty, or entirely NA.
        blk = self.block
        if blk.dtype.kind == "V":
            return True

        if not blk._can_hold_na:
            # A dtype that can't hold NA can't be all-NA.
            return False

        values = blk.values
        if values.size == 0:
            # GH#39122 this case will return False once deprecation is enforced
            return True

        if isinstance(values.dtype, SparseDtype):
            return False

        if values.ndim == 1:
            # TODO(EA2D): no need for special case with 2D EAs
            val = values[0]
            if not is_scalar(val) or not isna(val):
                # ideally isna_all would do this short-circuiting
                return False
            return isna_all(values)
        else:
            val = values[0][0]
            if not is_scalar(val) or not isna(val):
                # ideally isna_all would do this short-circuiting
                return False
            return all(isna_all(row) for row in values)

    @cache_readonly
    def is_na_after_size_and_isna_all_deprecation(self) -> bool:
        """
        Will self.is_na be True after values.size == 0 deprecation and isna_all
        deprecation are enforced?
        """
        # Only np.void proxy blocks will remain "NA" once the deprecations
        # are enforced.
        blk = self.block
        if blk.dtype.kind == "V":
            return True
        return False

    def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
        """
        Return this unit's values, replaced by an all-NA array of
        ``empty_dtype`` when the unit is (compatibly) all-NA.
        """
        values: ArrayLike

        if upcasted_na is None and self.block.dtype.kind != "V":
            # No upcasting is necessary
            return self.block.values
        else:
            fill_value = upcasted_na

            if self._is_valid_na_for(empty_dtype):
                # note: always holds when self.block.dtype.kind == "V"
                blk_dtype = self.block.dtype

                if blk_dtype == np.dtype("object"):
                    # we want to avoid filling with np.nan if we are
                    # using None; we already know that we are all
                    # nulls
                    values = cast(np.ndarray, self.block.values)
                    if values.size and values[0, 0] is None:
                        fill_value = None

                return make_na_array(empty_dtype, self.block.shape, fill_value)

            return self.block.values
455
456
def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike:
    """
    Concatenate values from several join units along axis=1.

    Parameters
    ----------
    join_units : list of JoinUnit
    copy : bool
        NOTE(review): not used anywhere in this function body; kept for
        interface compatibility with callers.

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    empty_dtype, empty_dtype_future = _get_empty_dtype(join_units)

    has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units)
    upcasted_na = _dtype_to_na_value(empty_dtype, has_none_blocks)

    # Materialize each unit's values, filling all-NA units with the NA value
    # appropriate for the common dtype.
    to_concat = [
        ju.get_reindexed_values(empty_dtype=empty_dtype, upcasted_na=upcasted_na)
        for ju in join_units
    ]

    if any(is_1d_only_ea_dtype(t.dtype) for t in to_concat):
        # TODO(EA2D): special case not needed if all EAs used HybridBlocks

        # error: No overload variant of "__getitem__" of "ExtensionArray" matches
        # argument type "Tuple[int, slice]"
        to_concat = [
            t
            if is_1d_only_ea_dtype(t.dtype)
            else t[0, :]  # type: ignore[call-overload]
            for t in to_concat
        ]
        concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True)
        concat_values = ensure_block_shape(concat_values, 2)

    else:
        concat_values = concat_compat(to_concat, axis=1)

    if empty_dtype != empty_dtype_future:
        # The dtype determination will change once the empty/all-NA exclusion
        # deprecation is enforced; warn when the current result would differ.
        if empty_dtype == concat_values.dtype:
            # GH#39122, GH#40893
            warnings.warn(
                "The behavior of DataFrame concatenation with empty or all-NA "
                "entries is deprecated. In a future version, this will no longer "
                "exclude empty or all-NA columns when determining the result dtypes. "
                "To retain the old behavior, exclude the relevant entries before "
                "the concat operation.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
    return concat_values
501
502
503def _dtype_to_na_value(dtype: DtypeObj, has_none_blocks: bool):
504 """
505 Find the NA value to go with this dtype.
506 """
507 if isinstance(dtype, ExtensionDtype):
508 return dtype.na_value
509 elif dtype.kind in "mM":
510 return dtype.type("NaT")
511 elif dtype.kind in "fc":
512 return dtype.type("NaN")
513 elif dtype.kind == "b":
514 # different from missing.na_value_for_dtype
515 return None
516 elif dtype.kind in "iu":
517 if not has_none_blocks:
518 # different from missing.na_value_for_dtype
519 return None
520 return np.nan
521 elif dtype.kind == "O":
522 return np.nan
523 raise NotImplementedError
524
525
526def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> tuple[DtypeObj, DtypeObj]:
527 """
528 Return dtype and N/A values to use when concatenating specified units.
529
530 Returned N/A value may be None which means there was no casting involved.
531
532 Returns
533 -------
534 dtype
535 """
536 if lib.dtypes_all_equal([ju.block.dtype for ju in join_units]):
537 empty_dtype = join_units[0].block.dtype
538 return empty_dtype, empty_dtype
539
540 has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units)
541
542 dtypes = [unit.block.dtype for unit in join_units if not unit.is_na]
543 if not len(dtypes):
544 dtypes = [
545 unit.block.dtype for unit in join_units if unit.block.dtype.kind != "V"
546 ]
547
548 dtype = find_common_type(dtypes)
549 if has_none_blocks:
550 dtype = ensure_dtype_can_hold_na(dtype)
551
552 dtype_future = dtype
553 if len(dtypes) != len(join_units):
554 dtypes_future = [
555 unit.block.dtype
556 for unit in join_units
557 if not unit.is_na_after_size_and_isna_all_deprecation
558 ]
559 if not len(dtypes_future):
560 dtypes_future = [
561 unit.block.dtype for unit in join_units if unit.block.dtype.kind != "V"
562 ]
563
564 if len(dtypes) != len(dtypes_future):
565 dtype_future = find_common_type(dtypes_future)
566 if has_none_blocks:
567 dtype_future = ensure_dtype_can_hold_na(dtype_future)
568
569 return dtype, dtype_future
570
571
572def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool:
573 """
574 Check if the join units consist of blocks of uniform type that can
575 be concatenated using Block.concat_same_type instead of the generic
576 _concatenate_join_units (which uses `concat_compat`).
577
578 """
579 first = join_units[0].block
580 if first.dtype.kind == "V":
581 return False
582 return (
583 # exclude cases where a) ju.block is None or b) we have e.g. Int64+int64
584 all(type(ju.block) is type(first) for ju in join_units)
585 and
586 # e.g. DatetimeLikeBlock can be dt64 or td64, but these are not uniform
587 all(
588 ju.block.dtype == first.dtype
589 # GH#42092 we only want the dtype_equal check for non-numeric blocks
590 # (for now, may change but that would need a deprecation)
591 or ju.block.dtype.kind in "iub"
592 for ju in join_units
593 )
594 and
595 # no blocks that would get missing values (can lead to type upcasts)
596 # unless we're an extension dtype.
597 all(not ju.is_na or ju.block.is_extension for ju in join_units)
598 )