1"""
2SQL-style merge routines
3"""
4from __future__ import annotations
5
6import copy as cp
7import datetime
8from functools import partial
9import string
10from typing import (
11 TYPE_CHECKING,
12 Hashable,
13 Literal,
14 Sequence,
15 cast,
16)
17import uuid
18import warnings
19
20import numpy as np
21
22from pandas._libs import (
23 Timedelta,
24 hashtable as libhashtable,
25 join as libjoin,
26 lib,
27)
28from pandas._libs.lib import is_range_indexer
29from pandas._typing import (
30 AnyArrayLike,
31 ArrayLike,
32 AxisInt,
33 DtypeObj,
34 IndexLabel,
35 JoinHow,
36 MergeHow,
37 Shape,
38 Suffixes,
39 npt,
40)
41from pandas.errors import MergeError
42from pandas.util._decorators import (
43 Appender,
44 Substitution,
45 cache_readonly,
46)
47from pandas.util._exceptions import find_stack_level
48
49from pandas.core.dtypes.base import ExtensionDtype
50from pandas.core.dtypes.cast import find_common_type
51from pandas.core.dtypes.common import (
52 ensure_float64,
53 ensure_int64,
54 ensure_object,
55 is_array_like,
56 is_bool,
57 is_bool_dtype,
58 is_categorical_dtype,
59 is_dtype_equal,
60 is_extension_array_dtype,
61 is_float_dtype,
62 is_integer,
63 is_integer_dtype,
64 is_list_like,
65 is_number,
66 is_numeric_dtype,
67 is_object_dtype,
68 needs_i8_conversion,
69)
70from pandas.core.dtypes.dtypes import DatetimeTZDtype
71from pandas.core.dtypes.generic import (
72 ABCDataFrame,
73 ABCSeries,
74)
75from pandas.core.dtypes.missing import (
76 isna,
77 na_value_for_dtype,
78)
79
80from pandas import (
81 ArrowDtype,
82 Categorical,
83 Index,
84 MultiIndex,
85 Series,
86)
87import pandas.core.algorithms as algos
88from pandas.core.arrays import (
89 ArrowExtensionArray,
90 BaseMaskedArray,
91 ExtensionArray,
92)
93from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
94import pandas.core.common as com
95from pandas.core.construction import (
96 ensure_wrapped_if_datetimelike,
97 extract_array,
98)
99from pandas.core.frame import _merge_doc
100from pandas.core.indexes.api import default_index
101from pandas.core.sorting import is_int64_overflow_possible
102
103if TYPE_CHECKING:
104 from pandas import DataFrame
105 from pandas.core import groupby
106 from pandas.core.arrays import DatetimeArray
107
108_factorizers = {
109 np.int64: libhashtable.Int64Factorizer,
110 np.longlong: libhashtable.Int64Factorizer,
111 np.int32: libhashtable.Int32Factorizer,
112 np.int16: libhashtable.Int16Factorizer,
113 np.int8: libhashtable.Int8Factorizer,
114 np.uint64: libhashtable.UInt64Factorizer,
115 np.uint32: libhashtable.UInt32Factorizer,
116 np.uint16: libhashtable.UInt16Factorizer,
117 np.uint8: libhashtable.UInt8Factorizer,
118 np.bool_: libhashtable.UInt8Factorizer,
119 np.float64: libhashtable.Float64Factorizer,
120 np.float32: libhashtable.Float32Factorizer,
121 np.complex64: libhashtable.Complex64Factorizer,
122 np.complex128: libhashtable.Complex128Factorizer,
123 np.object_: libhashtable.ObjectFactorizer,
124}
125
126# See https://github.com/pandas-dev/pandas/issues/52451
127if np.intc is not np.int32:
128 _factorizers[np.intc] = libhashtable.Int64Factorizer
129
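# Illustrative sketch of the factorizer contract (hypothetical values, not
# executed here): one instance assigns dense integer labels and remembers what
# it has already seen, so equal keys on both sides of a merge map to equal
# labels:
#   f = libhashtable.Int64Factorizer(3)
#   f.factorize(np.array([3, 1, 3], dtype=np.int64))  # -> [0, 1, 0]
#   f.factorize(np.array([1, 2], dtype=np.int64))     # -> [1, 2]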

@Substitution("\nleft : DataFrame or named Series")
@Appender(_merge_doc, indents=0)
def merge(
    left: DataFrame | Series,
    right: DataFrame | Series,
    how: MergeHow = "inner",
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_index: bool = False,
    right_index: bool = False,
    sort: bool = False,
    suffixes: Suffixes = ("_x", "_y"),
    copy: bool | None = None,
    indicator: str | bool = False,
    validate: str | None = None,
) -> DataFrame:
    op = _MergeOperation(
        left,
        right,
        how=how,
        on=on,
        left_on=left_on,
        right_on=right_on,
        left_index=left_index,
        right_index=right_index,
        sort=sort,
        suffixes=suffixes,
        indicator=indicator,
        validate=validate,
    )
    return op.get_result(copy=copy)
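
# A minimal usage sketch of the wrapper above (frames are hypothetical):
#   left = pd.DataFrame({"key": ["a", "b"], "lval": [1, 2]})
#   right = pd.DataFrame({"key": ["b", "c"], "rval": [3, 4]})
#   pd.merge(left, right, on="key", how="outer", indicator=True)
# returns one row per key in either frame, with a "_merge" column marking
# each row as left_only, right_only, or both.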


def _groupby_and_merge(by, left: DataFrame, right: DataFrame, merge_pieces):
    """
    groupby & merge; we are always performing a left-by type operation

    Parameters
    ----------
    by: field to group
    left: DataFrame
    right: DataFrame
    merge_pieces: function for merging
    """
    pieces = []
    if not isinstance(by, (list, tuple)):
        by = [by]

    lby = left.groupby(by, sort=False)
    rby: groupby.DataFrameGroupBy | None = None

    # if we can groupby the rhs
    # then we can get vastly better perf
    if all(item in right.columns for item in by):
        rby = right.groupby(by, sort=False)

    for key, lhs in lby.grouper.get_iterator(lby._selected_obj, axis=lby.axis):
        if rby is None:
            rhs = right
        else:
            try:
                rhs = right.take(rby.indices[key])
            except KeyError:
                # key doesn't exist in right
                lcols = lhs.columns.tolist()
                cols = lcols + [r for r in right.columns if r not in set(lcols)]
                merged = lhs.reindex(columns=cols)
                merged.index = range(len(merged))
                pieces.append(merged)
                continue

        merged = merge_pieces(lhs, rhs)

        # make sure join keys are in the merged
        # TODO, should merge_pieces do this?
        merged[by] = key

        pieces.append(merged)

    # preserve the original order
    # if we have a missing piece this can be reset
    from pandas.core.reshape.concat import concat

    result = concat(pieces, ignore_index=True)
    result = result.reindex(columns=pieces[0].columns, copy=False)
    return result, lby


def merge_ordered(
    left: DataFrame,
    right: DataFrame,
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_by=None,
    right_by=None,
    fill_method: str | None = None,
    suffixes: Suffixes = ("_x", "_y"),
    how: JoinHow = "outer",
) -> DataFrame:
    """
    Perform a merge for ordered data with optional filling/interpolation.

    Designed for ordered data like time series data. Optionally
    perform group-wise merge (see examples).

    Parameters
    ----------
    left : DataFrame or named Series
    right : DataFrame or named Series
    on : label or list
        Field names to join on. Must be found in both DataFrames.
    left_on : label or list, or array-like
        Field names to join on in left DataFrame. Can be a vector or list of
        vectors of the length of the DataFrame to use a particular vector as
        the join key instead of columns.
    right_on : label or list, or array-like
        Field names to join on in right DataFrame or vector/list of vectors per
        left_on docs.
    left_by : column name or list of column names
        Group left DataFrame by group columns and merge piece by piece with
        right DataFrame. Must be None if either left or right are a Series.
    right_by : column name or list of column names
        Group right DataFrame by group columns and merge piece by piece with
        left DataFrame. Must be None if either left or right are a Series.
    fill_method : {'ffill', None}, default None
        Interpolation method for data.
    suffixes : list-like, default is ("_x", "_y")
        A length-2 sequence where each element is optionally a string
        indicating the suffix to add to overlapping column names in
        `left` and `right` respectively. Pass a value of `None` instead
        of a string to indicate that the column name from `left` or
        `right` should be left as-is, with no suffix. At least one of the
        values must not be None.

    how : {'left', 'right', 'outer', 'inner'}, default 'outer'
        * left: use only keys from left frame (SQL: left outer join)
        * right: use only keys from right frame (SQL: right outer join)
        * outer: use union of keys from both frames (SQL: full outer join)
        * inner: use intersection of keys from both frames (SQL: inner join).

    Returns
    -------
    DataFrame
        The merged DataFrame output type will be the same as
        'left', if it is a subclass of DataFrame.

    See Also
    --------
    merge : Merge with a database-style join.
    merge_asof : Merge on nearest keys.

    Examples
    --------
    >>> df1 = pd.DataFrame(
    ...     {
    ...         "key": ["a", "c", "e", "a", "c", "e"],
    ...         "lvalue": [1, 2, 3, 1, 2, 3],
    ...         "group": ["a", "a", "a", "b", "b", "b"]
    ...     }
    ... )
    >>> df1
      key  lvalue group
    0   a       1     a
    1   c       2     a
    2   e       3     a
    3   a       1     b
    4   c       2     b
    5   e       3     b

    >>> df2 = pd.DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})
    >>> df2
      key  rvalue
    0   b       1
    1   c       2
    2   d       3

    >>> pd.merge_ordered(df1, df2, fill_method="ffill", left_by="group")
      key  lvalue group  rvalue
    0   a       1     a     NaN
    1   b       1     a     1.0
    2   c       2     a     2.0
    3   d       2     a     3.0
    4   e       3     a     3.0
    5   a       1     b     NaN
    6   b       1     b     1.0
    7   c       2     b     2.0
    8   d       2     b     3.0
    9   e       3     b     3.0
    """

    def _merger(x, y) -> DataFrame:
        # perform the ordered merge operation
        op = _OrderedMerge(
            x,
            y,
            on=on,
            left_on=left_on,
            right_on=right_on,
            suffixes=suffixes,
            fill_method=fill_method,
            how=how,
        )
        return op.get_result()

    if left_by is not None and right_by is not None:
        raise ValueError("Can only group either left or right frames")
    if left_by is not None:
        if isinstance(left_by, str):
            left_by = [left_by]
        check = set(left_by).difference(left.columns)
        if len(check) != 0:
            raise KeyError(f"{check} not found in left columns")
        result, _ = _groupby_and_merge(left_by, left, right, lambda x, y: _merger(x, y))
    elif right_by is not None:
        if isinstance(right_by, str):
            right_by = [right_by]
        check = set(right_by).difference(right.columns)
        if len(check) != 0:
            raise KeyError(f"{check} not found in right columns")
        result, _ = _groupby_and_merge(
            right_by, right, left, lambda x, y: _merger(y, x)
        )
    else:
        result = _merger(left, right)
    return result


def merge_asof(
    left: DataFrame | Series,
    right: DataFrame | Series,
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_index: bool = False,
    right_index: bool = False,
    by=None,
    left_by=None,
    right_by=None,
    suffixes: Suffixes = ("_x", "_y"),
    tolerance=None,
    allow_exact_matches: bool = True,
    direction: str = "backward",
) -> DataFrame:
    """
    Perform a merge by key distance.

    This is similar to a left-join except that we match on nearest
    key rather than equal keys. Both DataFrames must be sorted by the key.

    For each row in the left DataFrame:

      - A "backward" search selects the last row in the right DataFrame whose
        'on' key is less than or equal to the left's key.

      - A "forward" search selects the first row in the right DataFrame whose
        'on' key is greater than or equal to the left's key.

      - A "nearest" search selects the row in the right DataFrame whose 'on'
        key is closest in absolute distance to the left's key.

    The default is "backward" and matches the behavior of versions prior
    to 0.20.0, which had no direction parameter. The direction parameter
    was added in version 0.20.0 and introduces "forward" and "nearest".

    Optionally match on equivalent keys with 'by' before searching with 'on'.

    Parameters
    ----------
    left : DataFrame or named Series
    right : DataFrame or named Series
    on : label
        Field name to join on. Must be found in both DataFrames.
        The data MUST be ordered. Furthermore, this must be a numeric column,
        such as datetimelike, integer, or float. Either on or
        left_on/right_on must be given.
    left_on : label
        Field name to join on in left DataFrame.
    right_on : label
        Field name to join on in right DataFrame.
    left_index : bool
        Use the index of the left DataFrame as the join key.
    right_index : bool
        Use the index of the right DataFrame as the join key.
    by : column name or list of column names
        Match on these columns before performing merge operation.
    left_by : column name
        Field names to match on in the left DataFrame.
    right_by : column name
        Field names to match on in the right DataFrame.
    suffixes : 2-length sequence (tuple, list, ...)
        Suffix to apply to overlapping column names in the left and right
        side, respectively.
    tolerance : int or Timedelta, optional, default None
        Select asof tolerance within this range; must be compatible
        with the merge index.
    allow_exact_matches : bool, default True

        - If True, allow matching with the same 'on' value
          (i.e. less-than-or-equal-to / greater-than-or-equal-to)
        - If False, don't match the same 'on' value
          (i.e., strictly less-than / strictly greater-than).

    direction : 'backward' (default), 'forward', or 'nearest'
        Whether to search for prior, subsequent, or closest matches.

    Returns
    -------
    DataFrame

    See Also
    --------
    merge : Merge with a database-style join.
    merge_ordered : Merge with optional filling/interpolation.

    Examples
    --------
    >>> left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]})
    >>> left
        a left_val
    0   1        a
    1   5        b
    2  10        c

    >>> right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]})
    >>> right
       a  right_val
    0  1          1
    1  2          2
    2  3          3
    3  6          6
    4  7          7

    >>> pd.merge_asof(left, right, on="a")
        a left_val  right_val
    0   1        a          1
    1   5        b          3
    2  10        c          7

    >>> pd.merge_asof(left, right, on="a", allow_exact_matches=False)
        a left_val  right_val
    0   1        a        NaN
    1   5        b        3.0
    2  10        c        7.0

    >>> pd.merge_asof(left, right, on="a", direction="forward")
        a left_val  right_val
    0   1        a        1.0
    1   5        b        6.0
    2  10        c        NaN

    >>> pd.merge_asof(left, right, on="a", direction="nearest")
        a left_val  right_val
    0   1        a          1
    1   5        b          6
    2  10        c          7

    We can use indexed DataFrames as well.

    >>> left = pd.DataFrame({"left_val": ["a", "b", "c"]}, index=[1, 5, 10])
    >>> left
       left_val
    1         a
    5         b
    10        c

    >>> right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7]}, index=[1, 2, 3, 6, 7])
    >>> right
       right_val
    1          1
    2          2
    3          3
    6          6
    7          7

    >>> pd.merge_asof(left, right, left_index=True, right_index=True)
       left_val  right_val
    1         a          1
    5         b          3
    10        c          7

    Here is a real-world time-series example

    >>> quotes = pd.DataFrame(
    ...     {
    ...         "time": [
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.030"),
    ...             pd.Timestamp("2016-05-25 13:30:00.041"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.049"),
    ...             pd.Timestamp("2016-05-25 13:30:00.072"),
    ...             pd.Timestamp("2016-05-25 13:30:00.075")
    ...         ],
    ...         "ticker": [
    ...             "GOOG",
    ...             "MSFT",
    ...             "MSFT",
    ...             "MSFT",
    ...             "GOOG",
    ...             "AAPL",
    ...             "GOOG",
    ...             "MSFT"
    ...         ],
    ...         "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
    ...         "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]
    ...     }
    ... )
    >>> quotes
                         time ticker     bid     ask
    0 2016-05-25 13:30:00.023   GOOG  720.50  720.93
    1 2016-05-25 13:30:00.023   MSFT   51.95   51.96
    2 2016-05-25 13:30:00.030   MSFT   51.97   51.98
    3 2016-05-25 13:30:00.041   MSFT   51.99   52.00
    4 2016-05-25 13:30:00.048   GOOG  720.50  720.93
    5 2016-05-25 13:30:00.049   AAPL   97.99   98.01
    6 2016-05-25 13:30:00.072   GOOG  720.50  720.88
    7 2016-05-25 13:30:00.075   MSFT   52.01   52.03

    >>> trades = pd.DataFrame(
    ...     {
    ...         "time": [
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.038"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048")
    ...         ],
    ...         "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
    ...         "price": [51.95, 51.95, 720.77, 720.92, 98.0],
    ...         "quantity": [75, 155, 100, 100, 100]
    ...     }
    ... )
    >>> trades
                         time ticker   price  quantity
    0 2016-05-25 13:30:00.023   MSFT   51.95        75
    1 2016-05-25 13:30:00.038   MSFT   51.95       155
    2 2016-05-25 13:30:00.048   GOOG  720.77       100
    3 2016-05-25 13:30:00.048   GOOG  720.92       100
    4 2016-05-25 13:30:00.048   AAPL   98.00       100

    By default we are taking the asof of the quotes

    >>> pd.merge_asof(trades, quotes, on="time", by="ticker")
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155   51.97   51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 2ms between the quote time and the trade time

    >>> pd.merge_asof(
    ...     trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("2ms")
    ... )
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155     NaN     NaN
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 10ms between the quote time and the trade time
    and we exclude exact matches on time. However *prior* data will
    propagate forward

    >>> pd.merge_asof(
    ...     trades,
    ...     quotes,
    ...     on="time",
    ...     by="ticker",
    ...     tolerance=pd.Timedelta("10ms"),
    ...     allow_exact_matches=False
    ... )
                         time ticker   price  quantity    bid    ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75    NaN    NaN
    1 2016-05-25 13:30:00.038   MSFT   51.95       155  51.97  51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100    NaN    NaN
    3 2016-05-25 13:30:00.048   GOOG  720.92       100    NaN    NaN
    4 2016-05-25 13:30:00.048   AAPL   98.00       100    NaN    NaN
    """
    op = _AsOfMerge(
        left,
        right,
        on=on,
        left_on=left_on,
        right_on=right_on,
        left_index=left_index,
        right_index=right_index,
        by=by,
        left_by=left_by,
        right_by=right_by,
        suffixes=suffixes,
        how="asof",
        tolerance=tolerance,
        allow_exact_matches=allow_exact_matches,
        direction=direction,
    )
    return op.get_result()


# TODO: transformations??
# TODO: only copy DataFrames when modification necessary
class _MergeOperation:
    """
    Perform a database (SQL) merge operation between two DataFrame or Series
    objects using either columns as keys or their row indexes
    """

    _merge_type = "merge"
    how: MergeHow | Literal["asof"]
    on: IndexLabel | None
    # left_on/right_on may be None when passed, but in validate_specification
    # get replaced with non-None.
    left_on: Sequence[Hashable | AnyArrayLike]
    right_on: Sequence[Hashable | AnyArrayLike]
    left_index: bool
    right_index: bool
    axis: AxisInt
    bm_axis: AxisInt
    sort: bool
    suffixes: Suffixes
    copy: bool
    indicator: str | bool
    validate: str | None
    join_names: list[Hashable]
    right_join_keys: list[AnyArrayLike]
    left_join_keys: list[AnyArrayLike]

    def __init__(
        self,
        left: DataFrame | Series,
        right: DataFrame | Series,
        how: MergeHow | Literal["asof"] = "inner",
        on: IndexLabel | None = None,
        left_on: IndexLabel | None = None,
        right_on: IndexLabel | None = None,
        axis: AxisInt = 1,
        left_index: bool = False,
        right_index: bool = False,
        sort: bool = True,
        suffixes: Suffixes = ("_x", "_y"),
        indicator: str | bool = False,
        validate: str | None = None,
    ) -> None:
        _left = _validate_operand(left)
        _right = _validate_operand(right)
        self.left = self.orig_left = _left
        self.right = self.orig_right = _right
        self.how = how

        # bm_axis -> the axis on the BlockManager
        self.bm_axis = axis
        # axis --> the axis on the Series/DataFrame
        self.axis = 1 - axis if self.left.ndim == 2 else 0

        self.on = com.maybe_make_list(on)

        self.suffixes = suffixes
        self.sort = sort

        self.left_index = left_index
        self.right_index = right_index

        self.indicator = indicator

        if not is_bool(left_index):
            raise ValueError(
                f"left_index parameter must be of type bool, not {type(left_index)}"
            )
        if not is_bool(right_index):
            raise ValueError(
                f"right_index parameter must be of type bool, not {type(right_index)}"
            )

        # GH 40993: raise when merging between different levels; enforced in 2.0
        if _left.columns.nlevels != _right.columns.nlevels:
            msg = (
                "Not allowed to merge between different levels. "
                f"({_left.columns.nlevels} levels on the left, "
                f"{_right.columns.nlevels} on the right)"
            )
            raise MergeError(msg)

        self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on)

        cross_col = None
        if self.how == "cross":
            (
                self.left,
                self.right,
                self.how,
                cross_col,
            ) = self._create_cross_configuration(self.left, self.right)
            self.left_on = self.right_on = [cross_col]
        self._cross = cross_col

        # note this function has side effects
        (
            self.left_join_keys,
            self.right_join_keys,
            self.join_names,
        ) = self._get_merge_keys()

        # validate the merge keys dtypes. We may need to coerce
        # to avoid incompatible dtypes
        self._maybe_coerce_merge_keys()

        # If argument passed to validate,
        # check if columns specified as unique
        # are in fact unique.
        if validate is not None:
            self._validate(validate)

    def _reindex_and_concat(
        self,
        join_index: Index,
        left_indexer: npt.NDArray[np.intp] | None,
        right_indexer: npt.NDArray[np.intp] | None,
        copy: bool | None,
    ) -> DataFrame:
        """
        reindex along index and concat along columns.
        """
        # Take views so we do not alter the originals
        left = self.left[:]
        right = self.right[:]

        llabels, rlabels = _items_overlap_with_suffix(
            self.left._info_axis, self.right._info_axis, self.suffixes
        )

        if left_indexer is not None and not is_range_indexer(left_indexer, len(left)):
            # Pinning the index here (and in the right code just below) is not
            # necessary, but makes the `.take` more performant if we have e.g.
            # a MultiIndex for left.index.
            lmgr = left._mgr.reindex_indexer(
                join_index,
                left_indexer,
                axis=1,
                copy=False,
                only_slice=True,
                allow_dups=True,
                use_na_proxy=True,
            )
            left = left._constructor(lmgr)
            left.index = join_index

        if right_indexer is not None and not is_range_indexer(
            right_indexer, len(right)
        ):
            rmgr = right._mgr.reindex_indexer(
                join_index,
                right_indexer,
                axis=1,
                copy=False,
                only_slice=True,
                allow_dups=True,
                use_na_proxy=True,
            )
            right = right._constructor(rmgr)
            right.index = join_index

        from pandas import concat

        left.columns = llabels
        right.columns = rlabels
        result = concat([left, right], axis=1, copy=copy)
        return result

    def get_result(self, copy: bool | None = True) -> DataFrame:
        if self.indicator:
            self.left, self.right = self._indicator_pre_merge(self.left, self.right)

        join_index, left_indexer, right_indexer = self._get_join_info()

        result = self._reindex_and_concat(
            join_index, left_indexer, right_indexer, copy=copy
        )
        result = result.__finalize__(self, method=self._merge_type)

        if self.indicator:
            result = self._indicator_post_merge(result)

        self._maybe_add_join_keys(result, left_indexer, right_indexer)

        self._maybe_restore_index_levels(result)

        self._maybe_drop_cross_column(result, self._cross)

        return result.__finalize__(self, method="merge")

    def _maybe_drop_cross_column(
        self, result: DataFrame, cross_col: str | None
    ) -> None:
        if cross_col is not None:
            del result[cross_col]

    @cache_readonly
    def _indicator_name(self) -> str | None:
        if isinstance(self.indicator, str):
            return self.indicator
        elif isinstance(self.indicator, bool):
            return "_merge" if self.indicator else None
        else:
            raise ValueError(
                "indicator option can only accept boolean or string arguments"
            )

    def _indicator_pre_merge(
        self, left: DataFrame, right: DataFrame
    ) -> tuple[DataFrame, DataFrame]:
        columns = left.columns.union(right.columns)

        for i in ["_left_indicator", "_right_indicator"]:
            if i in columns:
                raise ValueError(
                    "Cannot use `indicator=True` option when "
                    f"data contains a column named {i}"
                )
        if self._indicator_name in columns:
            raise ValueError(
                "Cannot use name of an existing column for indicator column"
            )

        left = left.copy()
        right = right.copy()

        left["_left_indicator"] = 1
        left["_left_indicator"] = left["_left_indicator"].astype("int8")

        right["_right_indicator"] = 2
        right["_right_indicator"] = right["_right_indicator"].astype("int8")

        return left, right

    def _indicator_post_merge(self, result: DataFrame) -> DataFrame:
        result["_left_indicator"] = result["_left_indicator"].fillna(0)
        result["_right_indicator"] = result["_right_indicator"].fillna(0)

        result[self._indicator_name] = Categorical(
            (result["_left_indicator"] + result["_right_indicator"]),
            categories=[1, 2, 3],
        )
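        # The sums land exactly on the categories above: 1 + 0 -> left_only,
        # 0 + 2 -> right_only, 1 + 2 -> both, before being renamed below.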
        result[self._indicator_name] = result[
            self._indicator_name
        ].cat.rename_categories(["left_only", "right_only", "both"])

        result = result.drop(labels=["_left_indicator", "_right_indicator"], axis=1)
        return result

    def _maybe_restore_index_levels(self, result: DataFrame) -> None:
        """
        Restore index levels specified as `on` parameters

        Here we check for cases where `self.left_on` and `self.right_on` pairs
        each reference an index level in their respective DataFrames. The
        joined columns corresponding to these pairs are then restored to the
        index of `result`.

        **Note:** This method has side effects. It modifies `result` in-place

        Parameters
        ----------
        result: DataFrame
            merge result

        Returns
        -------
        None
        """
        names_to_restore = []
        for name, left_key, right_key in zip(
            self.join_names, self.left_on, self.right_on
        ):
            if (
                # Argument 1 to "_is_level_reference" of "NDFrame" has incompatible
                # type "Union[Hashable, ExtensionArray, Index, Series]"; expected
                # "Hashable"
                self.orig_left._is_level_reference(left_key)  # type: ignore[arg-type]
                # Argument 1 to "_is_level_reference" of "NDFrame" has incompatible
                # type "Union[Hashable, ExtensionArray, Index, Series]"; expected
                # "Hashable"
                and self.orig_right._is_level_reference(
                    right_key  # type: ignore[arg-type]
                )
                and left_key == right_key
                and name not in result.index.names
            ):
                names_to_restore.append(name)

        if names_to_restore:
            result.set_index(names_to_restore, inplace=True)

    def _maybe_add_join_keys(
        self,
        result: DataFrame,
        left_indexer: np.ndarray | None,
        right_indexer: np.ndarray | None,
    ) -> None:
        left_has_missing = None
        right_has_missing = None

        assert all(is_array_like(x) for x in self.left_join_keys)

        keys = zip(self.join_names, self.left_on, self.right_on)
        for i, (name, lname, rname) in enumerate(keys):
            if not _should_fill(lname, rname):
                continue

            take_left, take_right = None, None

            if name in result:
                if left_indexer is not None and right_indexer is not None:
                    if name in self.left:
                        if left_has_missing is None:
                            left_has_missing = (left_indexer == -1).any()

                        if left_has_missing:
                            take_right = self.right_join_keys[i]

                            if not is_dtype_equal(
                                result[name].dtype, self.left[name].dtype
                            ):
                                take_left = self.left[name]._values

                    elif name in self.right:
                        if right_has_missing is None:
                            right_has_missing = (right_indexer == -1).any()

                        if right_has_missing:
                            take_left = self.left_join_keys[i]

                            if not is_dtype_equal(
                                result[name].dtype, self.right[name].dtype
                            ):
                                take_right = self.right[name]._values

            elif left_indexer is not None:
                take_left = self.left_join_keys[i]
                take_right = self.right_join_keys[i]

            if take_left is not None or take_right is not None:
                if take_left is None:
                    lvals = result[name]._values
                else:
                    # TODO: can we pin down take_left's type earlier?
                    take_left = extract_array(take_left, extract_numpy=True)
                    lfill = na_value_for_dtype(take_left.dtype)
                    lvals = algos.take_nd(take_left, left_indexer, fill_value=lfill)

                if take_right is None:
                    rvals = result[name]._values
                else:
                    # TODO: can we pin down take_right's type earlier?
                    taker = extract_array(take_right, extract_numpy=True)
                    rfill = na_value_for_dtype(taker.dtype)
                    rvals = algos.take_nd(taker, right_indexer, fill_value=rfill)

                # if we have an all missing left_indexer
                # make sure to just use the right values or vice-versa
                mask_left = left_indexer == -1
                # error: Item "bool" of "Union[Any, bool]" has no attribute "all"
                if mask_left.all():  # type: ignore[union-attr]
                    key_col = Index(rvals)
                    result_dtype = rvals.dtype
                elif right_indexer is not None and (right_indexer == -1).all():
                    key_col = Index(lvals)
                    result_dtype = lvals.dtype
                else:
                    key_col = Index(lvals).where(~mask_left, rvals)
                    result_dtype = find_common_type([lvals.dtype, rvals.dtype])
                    if (
                        lvals.dtype.kind == "M"
                        and rvals.dtype.kind == "M"
                        and result_dtype.kind == "O"
                    ):
                        # TODO(non-nano) Workaround for common_type not dealing
                        # with different resolutions
                        result_dtype = key_col.dtype

                if result._is_label_reference(name):
                    result[name] = Series(
                        key_col, dtype=result_dtype, index=result.index
                    )
                elif result._is_level_reference(name):
                    if isinstance(result.index, MultiIndex):
                        key_col.name = name
                        idx_list = [
                            result.index.get_level_values(level_name)
                            if level_name != name
                            else key_col
                            for level_name in result.index.names
                        ]

                        result.set_index(idx_list, inplace=True)
                    else:
                        result.index = Index(key_col, name=name)
                else:
                    result.insert(i, name or f"key_{i}", key_col)

    def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
        """return the join indexers"""
        return get_join_indexers(
            self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how
        )

    def _get_join_info(
        self,
    ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
        # make mypy happy
        assert self.how != "cross"
        left_ax = self.left.axes[self.axis]
        right_ax = self.right.axes[self.axis]

        if self.left_index and self.right_index and self.how != "asof":
            join_index, left_indexer, right_indexer = left_ax.join(
                right_ax, how=self.how, return_indexers=True, sort=self.sort
            )

        elif self.right_index and self.how == "left":
            join_index, left_indexer, right_indexer = _left_join_on_index(
                left_ax, right_ax, self.left_join_keys, sort=self.sort
            )

        elif self.left_index and self.how == "right":
            join_index, right_indexer, left_indexer = _left_join_on_index(
                right_ax, left_ax, self.right_join_keys, sort=self.sort
            )
        else:
            (left_indexer, right_indexer) = self._get_join_indexers()

            if self.right_index:
                if len(self.left) > 0:
                    join_index = self._create_join_index(
                        self.left.index,
                        self.right.index,
                        left_indexer,
                        how="right",
                    )
                else:
                    join_index = self.right.index.take(right_indexer)
            elif self.left_index:
                if self.how == "asof":
                    # GH#33463 asof should always behave like a left merge
                    join_index = self._create_join_index(
                        self.left.index,
                        self.right.index,
                        left_indexer,
                        how="left",
                    )

                elif len(self.right) > 0:
                    join_index = self._create_join_index(
                        self.right.index,
                        self.left.index,
                        right_indexer,
                        how="left",
                    )
                else:
                    join_index = self.left.index.take(left_indexer)
            else:
                join_index = default_index(len(left_indexer))

        if len(join_index) == 0 and not isinstance(join_index, MultiIndex):
            join_index = default_index(0).set_names(join_index.name)
        return join_index, left_indexer, right_indexer

    def _create_join_index(
        self,
        index: Index,
        other_index: Index,
        indexer: npt.NDArray[np.intp],
        how: JoinHow = "left",
    ) -> Index:
        """
        Create a join index by rearranging one index to match another

        Parameters
        ----------
        index : Index
            index being rearranged
        other_index : Index
            used to supply values not found in index
        indexer : np.ndarray[np.intp]
            how to rearrange index
        how : str
            Replacement is only necessary if the indexer is based on
            other_index.

        Returns
        -------
        Index
        """
        if self.how in (how, "outer") and not isinstance(other_index, MultiIndex):
            # if final index requires values in other_index but not target
            # index, indexer may hold missing (-1) values, causing Index.take
            # to take the final value in target index. So, we set the last
            # element to be the desired fill value. We do not use allow_fill
            # and fill_value because it throws a ValueError on integer indices
            mask = indexer == -1
            if np.any(mask):
                fill_value = na_value_for_dtype(index.dtype, compat=False)
                index = index.append(Index([fill_value]))
        return index.take(indexer)
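
    # Sketch of the -1 handling above (hypothetical values): with
    # index = Index([10, 20]) and indexer = [1, -1], appending the fill value
    # lets take(-1) select it instead of wrapping around to the last element:
    #   Index([10, 20]).append(Index([np.nan])).take([1, -1]) -> [20.0, nan]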

    def _get_merge_keys(
        self,
    ) -> tuple[list[AnyArrayLike], list[AnyArrayLike], list[Hashable]]:
        """
        Note: has side effects (copy/delete key columns)

        Parameters
        ----------
        left
        right
        on

        Returns
        -------
        left_keys, right_keys, join_names
        """
        # left_keys, right_keys entries can actually be anything listlike
        # with a 'dtype' attr
        left_keys: list[AnyArrayLike] = []
        right_keys: list[AnyArrayLike] = []
        join_names: list[Hashable] = []
        right_drop: list[Hashable] = []
        left_drop: list[Hashable] = []

        left, right = self.left, self.right

        is_lkey = lambda x: is_array_like(x) and len(x) == len(left)
        is_rkey = lambda x: is_array_like(x) and len(x) == len(right)

        # Note that pd.merge_asof() has separate 'on' and 'by' parameters. A
        # user could, for example, request 'left_index' and 'left_by'. In a
        # regular pd.merge(), users cannot specify both 'left_index' and
        # 'left_on'. (Instead, users have a MultiIndex). That means the
        # self.left_on in this function is always empty in a pd.merge(), but
        # a pd.merge_asof(left_index=True, left_by=...) will result in a
        # self.left_on array with a None in the middle of it. This requires
        # a work-around as designated in the code below.
        # See _validate_left_right_on() for where this happens.

        # ugh, spaghetti re #733
        if _any(self.left_on) and _any(self.right_on):
            for lk, rk in zip(self.left_on, self.right_on):
                if is_lkey(lk):
                    lk = cast(AnyArrayLike, lk)
                    left_keys.append(lk)
                    if is_rkey(rk):
                        rk = cast(AnyArrayLike, rk)
                        right_keys.append(rk)
                        join_names.append(None)  # what to do?
                    else:
                        # Then we're either Hashable or a wrong-length arraylike,
                        # the latter of which will raise
                        rk = cast(Hashable, rk)
                        if rk is not None:
                            right_keys.append(right._get_label_or_level_values(rk))
                            join_names.append(rk)
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index)
                            join_names.append(right.index.name)
                else:
                    if not is_rkey(rk):
                        # Then we're either Hashable or a wrong-length arraylike,
                        # the latter of which will raise
                        rk = cast(Hashable, rk)
                        if rk is not None:
                            right_keys.append(right._get_label_or_level_values(rk))
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index)
                        if lk is not None and lk == rk:  # FIXME: what about other NAs?
                            # avoid key upcast in corner case (length-0)
                            lk = cast(Hashable, lk)
                            if len(left) > 0:
                                right_drop.append(rk)
                            else:
                                left_drop.append(lk)
                    else:
                        rk = cast(AnyArrayLike, rk)
                        right_keys.append(rk)
                    if lk is not None:
                        # Then we're either Hashable or a wrong-length arraylike,
                        # the latter of which will raise
                        lk = cast(Hashable, lk)
                        left_keys.append(left._get_label_or_level_values(lk))
                        join_names.append(lk)
                    else:
                        # work-around for merge_asof(left_index=True)
                        left_keys.append(left.index)
                        join_names.append(left.index.name)
        elif _any(self.left_on):
            for k in self.left_on:
                if is_lkey(k):
                    k = cast(AnyArrayLike, k)
                    left_keys.append(k)
                    join_names.append(None)
                else:
                    # Then we're either Hashable or a wrong-length arraylike,
                    # the latter of which will raise
                    k = cast(Hashable, k)
                    left_keys.append(left._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.right.index, MultiIndex):
                right_keys = [
                    lev._values.take(lev_codes)
                    for lev, lev_codes in zip(
                        self.right.index.levels, self.right.index.codes
                    )
                ]
            else:
                right_keys = [self.right.index._values]
        elif _any(self.right_on):
            for k in self.right_on:
                if is_rkey(k):
                    k = cast(AnyArrayLike, k)
                    right_keys.append(k)
                    join_names.append(None)
                else:
                    # Then we're either Hashable or a wrong-length arraylike,
                    # the latter of which will raise
                    k = cast(Hashable, k)
                    right_keys.append(right._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.left.index, MultiIndex):
                left_keys = [
                    lev._values.take(lev_codes)
                    for lev, lev_codes in zip(
                        self.left.index.levels, self.left.index.codes
                    )
                ]
            else:
                left_keys = [self.left.index._values]

        if left_drop:
            self.left = self.left._drop_labels_or_levels(left_drop)

        if right_drop:
            self.right = self.right._drop_labels_or_levels(right_drop)

        return left_keys, right_keys, join_names

    def _maybe_coerce_merge_keys(self) -> None:
        # we have valid merges but we may have to further
        # coerce these if they are originally incompatible types
        #
        # for example if these are categorical, but are not dtype_equal
        # or if we have object and integer dtypes

        for lk, rk, name in zip(
            self.left_join_keys, self.right_join_keys, self.join_names
        ):
            if (len(lk) and not len(rk)) or (not len(lk) and len(rk)):
                continue

            lk = extract_array(lk, extract_numpy=True)
            rk = extract_array(rk, extract_numpy=True)

            lk_is_cat = is_categorical_dtype(lk.dtype)
            rk_is_cat = is_categorical_dtype(rk.dtype)
            lk_is_object = is_object_dtype(lk.dtype)
            rk_is_object = is_object_dtype(rk.dtype)

            # if either left or right is a categorical
            # then they must match exactly in categories & ordered
            if lk_is_cat and rk_is_cat:
                lk = cast(Categorical, lk)
                rk = cast(Categorical, rk)
                if lk._categories_match_up_to_permutation(rk):
                    continue

            elif lk_is_cat or rk_is_cat:
                pass

            elif is_dtype_equal(lk.dtype, rk.dtype):
                continue

            msg = (
                f"You are trying to merge on {lk.dtype} and "
                f"{rk.dtype} columns. If you wish to proceed you should use pd.concat"
            )

            # if we are numeric, then allow differing
            # kinds to proceed, eg. int64 and int8, int and float
            # further if we are object, but we infer to
            # the same, then proceed
            if is_numeric_dtype(lk.dtype) and is_numeric_dtype(rk.dtype):
                if lk.dtype.kind == rk.dtype.kind:
                    continue

                # check whether ints and floats
                if is_integer_dtype(rk.dtype) and is_float_dtype(lk.dtype):
                    # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int
                    with np.errstate(invalid="ignore"):
                        # error: Argument 1 to "astype" of "ndarray" has incompatible
                        # type "Union[ExtensionDtype, Any, dtype[Any]]"; expected
                        # "Union[dtype[Any], Type[Any], _SupportsDType[dtype[Any]]]"
                        casted = lk.astype(rk.dtype)  # type: ignore[arg-type]

                    mask = ~np.isnan(lk)
                    match = lk == casted
                    if not match[mask].all():
                        warnings.warn(
                            "You are merging on int and float "
                            "columns where the float values "
                            "are not equal to their int representation.",
                            UserWarning,
                            stacklevel=find_stack_level(),
                        )
                    continue

                if is_float_dtype(rk.dtype) and is_integer_dtype(lk.dtype):
                    # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int
                    with np.errstate(invalid="ignore"):
                        # error: Argument 1 to "astype" of "ndarray" has incompatible
                        # type "Union[ExtensionDtype, Any, dtype[Any]]"; expected
                        # "Union[dtype[Any], Type[Any], _SupportsDType[dtype[Any]]]"
                        casted = rk.astype(lk.dtype)  # type: ignore[arg-type]

                    mask = ~np.isnan(rk)
                    match = rk == casted
                    if not match[mask].all():
                        warnings.warn(
                            "You are merging on int and float "
                            "columns where the float values "
                            "are not equal to their int representation.",
                            UserWarning,
                            stacklevel=find_stack_level(),
                        )
                    continue

                # let's infer and see if we are ok
                if lib.infer_dtype(lk, skipna=False) == lib.infer_dtype(
                    rk, skipna=False
                ):
                    continue

            # Check if we are trying to merge on obviously
            # incompatible dtypes GH 9780, GH 15800

            # bool values are coerced to object
            elif (lk_is_object and is_bool_dtype(rk.dtype)) or (
                is_bool_dtype(lk.dtype) and rk_is_object
            ):
                pass

            # object values are allowed to be merged
            elif (lk_is_object and is_numeric_dtype(rk.dtype)) or (
                is_numeric_dtype(lk.dtype) and rk_is_object
            ):
                inferred_left = lib.infer_dtype(lk, skipna=False)
                inferred_right = lib.infer_dtype(rk, skipna=False)
                bool_types = ["integer", "mixed-integer", "boolean", "empty"]
                string_types = ["string", "unicode", "mixed", "bytes", "empty"]

                # inferred bool
                if inferred_left in bool_types and inferred_right in bool_types:
                    pass

                # unless we are merging non-string-like with string-like
                elif (
                    inferred_left in string_types and inferred_right not in string_types
                ) or (
                    inferred_right in string_types and inferred_left not in string_types
                ):
                    raise ValueError(msg)

            # datetimelikes must match exactly
            elif needs_i8_conversion(lk.dtype) and not needs_i8_conversion(rk.dtype):
                raise ValueError(msg)
            elif not needs_i8_conversion(lk.dtype) and needs_i8_conversion(rk.dtype):
                raise ValueError(msg)
            elif isinstance(lk.dtype, DatetimeTZDtype) and not isinstance(
                rk.dtype, DatetimeTZDtype
            ):
                raise ValueError(msg)
            elif not isinstance(lk.dtype, DatetimeTZDtype) and isinstance(
                rk.dtype, DatetimeTZDtype
            ):
                raise ValueError(msg)
            elif (
                isinstance(lk.dtype, DatetimeTZDtype)
                and isinstance(rk.dtype, DatetimeTZDtype)
            ) or (lk.dtype.kind == "M" and rk.dtype.kind == "M"):
                # allows datetime with different resolutions
                continue

            elif lk_is_object and rk_is_object:
                continue

            # Houston, we have a problem!
            # let's coerce to object if the dtypes aren't
            # categorical, otherwise coerce to the category
            # dtype. If we coerced categories to object,
            # then we would lose type information on some
            # columns, and end up trying to merge
            # incompatible dtypes. See GH 16900.
            if name in self.left.columns:
                typ = cast(Categorical, lk).categories.dtype if lk_is_cat else object
                self.left = self.left.copy()
                self.left[name] = self.left[name].astype(typ)
            if name in self.right.columns:
                typ = cast(Categorical, rk).categories.dtype if rk_is_cat else object
                self.right = self.right.copy()
                self.right[name] = self.right[name].astype(typ)

    def _create_cross_configuration(
        self, left: DataFrame, right: DataFrame
    ) -> tuple[DataFrame, DataFrame, JoinHow, str]:
        """
        Creates the configuration to dispatch the cross operation to inner join,
        e.g. adding a join column and resetting parameters. Join column is added
        to a new object, no inplace modification

        Parameters
        ----------
        left : DataFrame
        right : DataFrame

        Returns
        -------
        a tuple (left, right, how, cross_col) representing the adjusted
        DataFrames with cross_col, the merge operation set to inner and the column
        to join over.
        """
        cross_col = f"_cross_{uuid.uuid4()}"
        how: JoinHow = "inner"
        return (
            left.assign(**{cross_col: 1}),
            right.assign(**{cross_col: 1}),
            how,
            cross_col,
        )
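
    # Conceptual sketch of the dispatch above: a cross join is an inner join
    # on a constant key, so an n-row and an m-row frame yield n * m rows,
    # roughly (with "_k" standing in for the uuid-based cross_col):
    #   left.assign(_k=1).merge(right.assign(_k=1), on="_k").drop(columns="_k")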

    def _validate_left_right_on(self, left_on, right_on):
        left_on = com.maybe_make_list(left_on)
        right_on = com.maybe_make_list(right_on)

        if self.how == "cross":
            if (
                self.left_index
                or self.right_index
                or right_on is not None
                or left_on is not None
                or self.on is not None
            ):
                raise MergeError(
                    "Can not pass on, right_on, left_on or set right_index=True or "
                    "left_index=True"
                )
        # Hm, any way to make this logic less complicated??
        elif self.on is None and left_on is None and right_on is None:
            if self.left_index and self.right_index:
                left_on, right_on = (), ()
            elif self.left_index:
                raise MergeError("Must pass right_on or right_index=True")
            elif self.right_index:
                raise MergeError("Must pass left_on or left_index=True")
            else:
                # use the common columns
                left_cols = self.left.columns
                right_cols = self.right.columns
                common_cols = left_cols.intersection(right_cols)
                if len(common_cols) == 0:
                    raise MergeError(
                        "No common columns to perform merge on. "
                        f"Merge options: left_on={left_on}, "
                        f"right_on={right_on}, "
                        f"left_index={self.left_index}, "
                        f"right_index={self.right_index}"
                    )
                if (
                    not left_cols.join(common_cols, how="inner").is_unique
                    or not right_cols.join(common_cols, how="inner").is_unique
                ):
                    raise MergeError(f"Data columns not unique: {repr(common_cols)}")
                left_on = right_on = common_cols
        elif self.on is not None:
            if left_on is not None or right_on is not None:
                raise MergeError(
                    'Can only pass argument "on" OR "left_on" '
                    'and "right_on", not a combination of both.'
                )
            if self.left_index or self.right_index:
                raise MergeError(
                    'Can only pass argument "on" OR "left_index" '
                    'and "right_index", not a combination of both.'
                )
            left_on = right_on = self.on
        elif left_on is not None:
            if self.left_index:
                raise MergeError(
                    'Can only pass argument "left_on" OR "left_index" not both.'
                )
            if not self.right_index and right_on is None:
                raise MergeError('Must pass "right_on" OR "right_index".')
            n = len(left_on)
            if self.right_index:
                if len(left_on) != self.right.index.nlevels:
                    raise ValueError(
                        "len(left_on) must equal the number "
                        'of levels in the index of "right"'
                    )
                right_on = [None] * n
        elif right_on is not None:
            if self.right_index:
                raise MergeError(
                    'Can only pass argument "right_on" OR "right_index" not both.'
                )
            if not self.left_index and left_on is None:
                raise MergeError('Must pass "left_on" OR "left_index".')
            n = len(right_on)
            if self.left_index:
                if len(right_on) != self.left.index.nlevels:
                    raise ValueError(
                        "len(right_on) must equal the number "
                        'of levels in the index of "left"'
                    )
                left_on = [None] * n
        if self.how != "cross" and len(right_on) != len(left_on):
            raise ValueError("len(right_on) must equal len(left_on)")

        return left_on, right_on
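
    # e.g. pd.merge(df1, df2) with no on/left_on/right_on arguments and no
    # index flags falls through to the common-columns branch above and behaves
    # like on=df1.columns.intersection(df2.columns).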

    def _validate(self, validate: str) -> None:
        # Check uniqueness of each
        if self.left_index:
            left_unique = self.orig_left.index.is_unique
        else:
            left_unique = MultiIndex.from_arrays(self.left_join_keys).is_unique

        if self.right_index:
            right_unique = self.orig_right.index.is_unique
        else:
            right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique

        # Check data integrity
        if validate in ["one_to_one", "1:1"]:
            if not left_unique and not right_unique:
                raise MergeError(
                    "Merge keys are not unique in either left "
                    "or right dataset; not a one-to-one merge"
                )
            if not left_unique:
                raise MergeError(
                    "Merge keys are not unique in left dataset; not a one-to-one merge"
                )
            if not right_unique:
                raise MergeError(
                    "Merge keys are not unique in right dataset; not a one-to-one merge"
                )

        elif validate in ["one_to_many", "1:m"]:
            if not left_unique:
                raise MergeError(
                    "Merge keys are not unique in left dataset; not a one-to-many merge"
                )

        elif validate in ["many_to_one", "m:1"]:
            if not right_unique:
                raise MergeError(
                    "Merge keys are not unique in right dataset; "
                    "not a many-to-one merge"
                )

        elif validate in ["many_to_many", "m:m"]:
            pass

        else:
            raise ValueError(
                f'"{validate}" is not a valid argument. '
                "Valid arguments are:\n"
                '- "1:1"\n'
                '- "1:m"\n'
                '- "m:1"\n'
                '- "m:m"\n'
                '- "one_to_one"\n'
                '- "one_to_many"\n'
                '- "many_to_one"\n'
                '- "many_to_many"'
            )
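
    # Usage sketch for the checks above (frames are hypothetical): if "key"
    # is duplicated in the left frame, pd.merge(df1, df2, on="key",
    # validate="one_to_many") raises MergeError, while "many_to_many" passes.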


def get_join_indexers(
    left_keys,
    right_keys,
    sort: bool = False,
    how: MergeHow | Literal["asof"] = "inner",
    **kwargs,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
    """

    Parameters
    ----------
    left_keys : ndarray, Index, Series
    right_keys : ndarray, Index, Series
    sort : bool, default False
    how : {'inner', 'outer', 'left', 'right'}, default 'inner'

    Returns
    -------
    np.ndarray[np.intp]
        Indexer into the left_keys.
    np.ndarray[np.intp]
        Indexer into the right_keys.
    """
    assert len(left_keys) == len(
        right_keys
    ), "left_keys and right_keys must be the same length"

    # fast-path for empty left/right
    left_n = len(left_keys[0])
    right_n = len(right_keys[0])
    if left_n == 0:
        if how in ["left", "inner", "cross"]:
            return _get_empty_indexer()
        elif not sort and how in ["right", "outer"]:
            return _get_no_sort_one_missing_indexer(right_n, True)
    elif right_n == 0:
        if how in ["right", "inner", "cross"]:
            return _get_empty_indexer()
        elif not sort and how in ["left", "outer"]:
            return _get_no_sort_one_missing_indexer(left_n, False)

    # get left & right join labels and num. of levels at each location
    mapped = (
        _factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how)
        for n in range(len(left_keys))
    )
    zipped = zip(*mapped)
    llab, rlab, shape = (list(x) for x in zipped)

    # get flat i8 keys from label lists
    lkey, rkey = _get_join_keys(llab, rlab, tuple(shape), sort)

    # factorize keys to a dense i8 space
    # `count` is the num. of unique keys
    # set(lkey) | set(rkey) == range(count)

    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how)
    # preserve left frame order if how == 'left' and sort == False
    kwargs = cp.copy(kwargs)
    if how in ("left", "right"):
        kwargs["sort"] = sort
    join_func = {
        "inner": libjoin.inner_join,
        "left": libjoin.left_outer_join,
        "right": lambda x, y, count, **kwargs: libjoin.left_outer_join(
            y, x, count, **kwargs
        )[::-1],
        "outer": libjoin.full_outer_join,
    }[how]

    # error: Cannot call function of unknown type
    return join_func(lkey, rkey, count, **kwargs)  # type: ignore[operator]
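
# Sketch of the indexers returned above, assuming single integer keys:
#   left_keys = [np.array([0, 1, 2])], right_keys = [np.array([1, 2, 3])]
#   how="inner" -> lidx = [1, 2], ridx = [0, 1] (the rows whose keys match);
#   how="left" also emits the pair (0, -1), -1 marking "no match on the right".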


def restore_dropped_levels_multijoin(
    left: MultiIndex,
    right: MultiIndex,
    dropped_level_names,
    join_index: Index,
    lindexer: npt.NDArray[np.intp],
    rindexer: npt.NDArray[np.intp],
) -> tuple[list[Index], npt.NDArray[np.intp], list[Hashable]]:
    """
    *this is an internal non-public method*

    Returns the levels, labels and names of a multi-index to multi-index join.
    Depending on the type of join, this method restores the appropriate
    dropped levels of the joined multi-index.
    The method relies on lindexer, rindexer which hold the index positions of
    left and right, where a join was feasible

    Parameters
    ----------
    left : MultiIndex
        left index
    right : MultiIndex
        right index
    dropped_level_names : str array
        list of non-common level names
    join_index : Index
        the index of the join between the
        common levels of left and right
    lindexer : np.ndarray[np.intp]
        left indexer
    rindexer : np.ndarray[np.intp]
        right indexer

    Returns
    -------
    levels : list of Index
        levels of combined multiindexes
    labels : np.ndarray[np.intp]
        labels of combined multiindexes
    names : List[Hashable]
        names of combined multiindex levels

    """

    def _convert_to_multiindex(index: Index) -> MultiIndex:
        if isinstance(index, MultiIndex):
            return index
        else:
            return MultiIndex.from_arrays([index._values], names=[index.name])

    # For multi-multi joins with one overlapping level,
    # the returned index is of type Index
    # Assure that join_index is of type MultiIndex
    # so that dropped levels can be appended
    join_index = _convert_to_multiindex(join_index)

    join_levels = join_index.levels
    join_codes = join_index.codes
    join_names = join_index.names

    # Iterate through the levels that must be restored
    for dropped_level_name in dropped_level_names:
        if dropped_level_name in left.names:
            idx = left
            indexer = lindexer
        else:
            idx = right
            indexer = rindexer

        # The index of the level name to be restored
        name_idx = idx.names.index(dropped_level_name)

        restore_levels = idx.levels[name_idx]
        # Inject -1 in the codes list where a join was not possible
        # IOW indexer[i]=-1
        codes = idx.codes[name_idx]
        if indexer is None:
            restore_codes = codes
        else:
            restore_codes = algos.take_nd(codes, indexer, fill_value=-1)

        # error: Cannot determine type of "__add__"
        join_levels = join_levels + [restore_levels]  # type: ignore[has-type]
        join_codes = join_codes + [restore_codes]
        join_names = join_names + [dropped_level_name]

    return join_levels, join_codes, join_names


class _OrderedMerge(_MergeOperation):
    _merge_type = "ordered_merge"

    def __init__(
        self,
        left: DataFrame | Series,
        right: DataFrame | Series,
        on: IndexLabel | None = None,
        left_on: IndexLabel | None = None,
        right_on: IndexLabel | None = None,
        left_index: bool = False,
        right_index: bool = False,
        axis: AxisInt = 1,
        suffixes: Suffixes = ("_x", "_y"),
        fill_method: str | None = None,
        how: JoinHow | Literal["asof"] = "outer",
    ) -> None:
        self.fill_method = fill_method
        _MergeOperation.__init__(
            self,
            left,
            right,
            on=on,
            left_on=left_on,
            left_index=left_index,
            right_index=right_index,
            right_on=right_on,
            axis=axis,
            how=how,
            suffixes=suffixes,
            sort=True,  # factorize sorts
        )

    def get_result(self, copy: bool | None = True) -> DataFrame:
        join_index, left_indexer, right_indexer = self._get_join_info()

        llabels, rlabels = _items_overlap_with_suffix(
            self.left._info_axis, self.right._info_axis, self.suffixes
        )

        left_join_indexer: np.ndarray | None
        right_join_indexer: np.ndarray | None

        if self.fill_method == "ffill":
            if left_indexer is None:
                raise TypeError("left_indexer cannot be None")
            left_indexer, right_indexer = cast(np.ndarray, left_indexer), cast(
                np.ndarray, right_indexer
            )
            left_join_indexer = libjoin.ffill_indexer(left_indexer)
            right_join_indexer = libjoin.ffill_indexer(right_indexer)
        else:
            left_join_indexer = left_indexer
            right_join_indexer = right_indexer

        result = self._reindex_and_concat(
            join_index, left_join_indexer, right_join_indexer, copy=copy
        )
        self._maybe_add_join_keys(result, left_indexer, right_indexer)

        return result
1843
1844
1845def _asof_by_function(direction: str):
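    # e.g. direction="backward" resolves to libjoin.asof_join_backward_on_X_by_Y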
1846 name = f"asof_join_{direction}_on_X_by_Y"
1847 return getattr(libjoin, name, None)
1848
1849
1850_type_casters = {
1851 "int64_t": ensure_int64,
1852 "double": ensure_float64,
1853 "object": ensure_object,
1854}
1855
1856
1857def _get_cython_type_upcast(dtype: DtypeObj) -> str:
    """Upcast a dtype to 'int64_t', 'double', or 'object'.
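
    Examples
    --------
    Illustrative of the three branches below:

    >>> pd.core.reshape.merge._get_cython_type_upcast(np.dtype(np.uint8))
    'int64_t'
    >>> pd.core.reshape.merge._get_cython_type_upcast(np.dtype(np.float32))
    'double'
    >>> pd.core.reshape.merge._get_cython_type_upcast(np.dtype(object))
    'object'
    """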
1859 if is_integer_dtype(dtype):
1860 return "int64_t"
1861 elif is_float_dtype(dtype):
1862 return "double"
1863 else:
1864 return "object"
1865
1866
1867class _AsOfMerge(_OrderedMerge):
1868 _merge_type = "asof_merge"
1869
1870 def __init__(
1871 self,
1872 left: DataFrame | Series,
1873 right: DataFrame | Series,
1874 on: IndexLabel | None = None,
1875 left_on: IndexLabel | None = None,
1876 right_on: IndexLabel | None = None,
1877 left_index: bool = False,
1878 right_index: bool = False,
1879 by=None,
1880 left_by=None,
1881 right_by=None,
1882 axis: AxisInt = 1,
1883 suffixes: Suffixes = ("_x", "_y"),
1884 copy: bool = True,
1885 fill_method: str | None = None,
1886 how: Literal["asof"] = "asof",
1887 tolerance=None,
1888 allow_exact_matches: bool = True,
1889 direction: str = "backward",
1890 ) -> None:
1891 self.by = by
1892 self.left_by = left_by
1893 self.right_by = right_by
1894 self.tolerance = tolerance
1895 self.allow_exact_matches = allow_exact_matches
1896 self.direction = direction
1897
1898 _OrderedMerge.__init__(
1899 self,
1900 left,
1901 right,
1902 on=on,
1903 left_on=left_on,
1904 right_on=right_on,
1905 left_index=left_index,
1906 right_index=right_index,
1907 axis=axis,
1908 how=how,
1909 suffixes=suffixes,
1910 fill_method=fill_method,
1911 )
1912
1913 def _validate_left_right_on(self, left_on, right_on):
1914 left_on, right_on = super()._validate_left_right_on(left_on, right_on)
1915
        # we only allow "on" to be a single item
1917 if len(left_on) != 1 and not self.left_index:
1918 raise MergeError("can only asof on a key for left")
1919
1920 if len(right_on) != 1 and not self.right_index:
1921 raise MergeError("can only asof on a key for right")
1922
1923 if self.left_index and isinstance(self.left.index, MultiIndex):
1924 raise MergeError("left can only have one index")
1925
1926 if self.right_index and isinstance(self.right.index, MultiIndex):
1927 raise MergeError("right can only have one index")
1928
1929 # set 'by' columns
1930 if self.by is not None:
1931 if self.left_by is not None or self.right_by is not None:
1932 raise MergeError("Can only pass by OR left_by and right_by")
1933 self.left_by = self.right_by = self.by
1934 if self.left_by is None and self.right_by is not None:
1935 raise MergeError("missing left_by")
1936 if self.left_by is not None and self.right_by is None:
1937 raise MergeError("missing right_by")
1938
1939 # GH#29130 Check that merge keys do not have dtype object
1940 if not self.left_index:
1941 left_on_0 = left_on[0]
1942 if is_array_like(left_on_0):
1943 lo_dtype = left_on_0.dtype
1944 else:
                lo_dtype = (
                    self.left._get_label_or_level_values(left_on_0).dtype
                    if left_on_0 in self.left.columns
                    else self.left.index.get_level_values(left_on_0).dtype
                )
1950 else:
1951 lo_dtype = self.left.index.dtype
1952
1953 if not self.right_index:
1954 right_on_0 = right_on[0]
1955 if is_array_like(right_on_0):
1956 ro_dtype = right_on_0.dtype
1957 else:
                ro_dtype = (
                    self.right._get_label_or_level_values(right_on_0).dtype
                    if right_on_0 in self.right.columns
                    else self.right.index.get_level_values(right_on_0).dtype
                )
1963 else:
1964 ro_dtype = self.right.index.dtype
1965
1966 if is_object_dtype(lo_dtype) or is_object_dtype(ro_dtype):
1967 raise MergeError(
1968 f"Incompatible merge dtype, {repr(ro_dtype)} and "
1969 f"{repr(lo_dtype)}, both sides must have numeric dtype"
1970 )
1971
1972 # add 'by' to our key-list so we can have it in the
1973 # output as a key
1974 if self.left_by is not None:
1975 if not is_list_like(self.left_by):
1976 self.left_by = [self.left_by]
1977 if not is_list_like(self.right_by):
1978 self.right_by = [self.right_by]
1979
1980 if len(self.left_by) != len(self.right_by):
1981 raise MergeError("left_by and right_by must be same length")
1982
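            # e.g. by="ticker", on="time" gives left_on == ["ticker", "time"];
            # the asof key always comes last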
1983 left_on = self.left_by + list(left_on)
1984 right_on = self.right_by + list(right_on)
1985
1986 # check 'direction' is valid
1987 if self.direction not in ["backward", "forward", "nearest"]:
1988 raise MergeError(f"direction invalid: {self.direction}")
1989
1990 return left_on, right_on
1991
1992 def _get_merge_keys(
1993 self,
1994 ) -> tuple[list[AnyArrayLike], list[AnyArrayLike], list[Hashable]]:
1995 # note this function has side effects
1996 (left_join_keys, right_join_keys, join_names) = super()._get_merge_keys()
1997
1998 # validate index types are the same
1999 for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)):
2000 if not is_dtype_equal(lk.dtype, rk.dtype):
2001 if is_categorical_dtype(lk.dtype) and is_categorical_dtype(rk.dtype):
2002 # The generic error message is confusing for categoricals.
2003 #
2004 # In this function, the join keys include both the original
2005 # ones of the merge_asof() call, and also the keys passed
2006 # to its by= argument. Unordered but equal categories
2007 # are not supported for the former, but will fail
2008 # later with a ValueError, so we don't *need* to check
2009 # for them here.
2010 msg = (
2011 f"incompatible merge keys [{i}] {repr(lk.dtype)} and "
2012 f"{repr(rk.dtype)}, both sides category, but not equal ones"
2013 )
2014 else:
2015 msg = (
2016 f"incompatible merge keys [{i}] {repr(lk.dtype)} and "
2017 f"{repr(rk.dtype)}, must be the same type"
2018 )
2019 raise MergeError(msg)
2020
2021 # validate tolerance; datetime.timedelta or Timedelta if we have a DTI
2022 if self.tolerance is not None:
2023 if self.left_index:
2024 # Actually more specifically an Index
2025 lt = cast(AnyArrayLike, self.left.index)
2026 else:
2027 lt = left_join_keys[-1]
2028
2029 msg = (
                f"incompatible tolerance {self.tolerance}, must be compatible "
2031 f"with type {repr(lt.dtype)}"
2032 )
2033
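            # e.g. a datetime64 key accepts tolerance=Timedelta("2ms"), an
            # integer key accepts tolerance=2, and a float key tolerance=0.5,
            # matching the branches below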
2034 if needs_i8_conversion(lt):
2035 if not isinstance(self.tolerance, datetime.timedelta):
2036 raise MergeError(msg)
2037 if self.tolerance < Timedelta(0):
2038 raise MergeError("tolerance must be positive")
2039
2040 elif is_integer_dtype(lt):
2041 if not is_integer(self.tolerance):
2042 raise MergeError(msg)
2043 if self.tolerance < 0:
2044 raise MergeError("tolerance must be positive")
2045
2046 elif is_float_dtype(lt):
2047 if not is_number(self.tolerance):
2048 raise MergeError(msg)
2049 if self.tolerance < 0:
2050 raise MergeError("tolerance must be positive")
2051
2052 else:
2053 raise MergeError("key must be integer, timestamp or float")
2054
2055 # validate allow_exact_matches
2056 if not is_bool(self.allow_exact_matches):
2057 msg = (
2058 "allow_exact_matches must be boolean, "
2059 f"passed {self.allow_exact_matches}"
2060 )
2061 raise MergeError(msg)
2062
2063 return left_join_keys, right_join_keys, join_names
2064
2065 def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
2066 """return the join indexers"""
2067
2068 def flip(xs) -> np.ndarray:
            """Unlike np.transpose, this returns a 1-D structured array of tuples"""
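            # A sketch: flip([np.array([1, 2]), np.array([3.0, 4.0])]) returns
            # a structured array [(1, 3.0), (2, 4.0)] with fields "a" and "b"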
2070
2071 def injection(obj):
2072 if not is_extension_array_dtype(obj):
2073 # ndarray
2074 return obj
2075 obj = extract_array(obj)
2076 if isinstance(obj, NDArrayBackedExtensionArray):
2077 # fastpath for e.g. dt64tz, categorical
2078 return obj._ndarray
2079 # FIXME: returning obj._values_for_argsort() here doesn't
2080 # break in any existing test cases, but i (@jbrockmendel)
2081 # am pretty sure it should!
2082 # e.g.
2083 # arr = pd.array([0, pd.NA, 255], dtype="UInt8")
2084 # will have values_for_argsort (before GH#45434)
2085 # np.array([0, 255, 255], dtype=np.uint8)
2086 # and the non-injectivity should make a difference somehow
2087 # shouldn't it?
2088 return np.asarray(obj)
2089
2090 xs = [injection(x) for x in xs]
2091 labels = list(string.ascii_lowercase[: len(xs)])
2092 dtypes = [x.dtype for x in xs]
2093 labeled_dtypes = list(zip(labels, dtypes))
2094 return np.array(list(zip(*xs)), labeled_dtypes)
2095
2096 # values to compare
2097 left_values = (
2098 self.left.index._values if self.left_index else self.left_join_keys[-1]
2099 )
2100 right_values = (
2101 self.right.index._values if self.right_index else self.right_join_keys[-1]
2102 )
2103 tolerance = self.tolerance
2104
2105 # we require sortedness and non-null values in the join keys
2106 if not Index(left_values).is_monotonic_increasing:
2107 side = "left"
2108 if isna(left_values).any():
2109 raise ValueError(f"Merge keys contain null values on {side} side")
2110 raise ValueError(f"{side} keys must be sorted")
2111
2112 if not Index(right_values).is_monotonic_increasing:
2113 side = "right"
2114 if isna(right_values).any():
2115 raise ValueError(f"Merge keys contain null values on {side} side")
2116 raise ValueError(f"{side} keys must be sorted")
2117
2118 # initial type conversion as needed
2119 if needs_i8_conversion(left_values):
2120 if tolerance is not None:
2121 tolerance = Timedelta(tolerance)
2122
2123 # TODO: we have no test cases with PeriodDtype here; probably
2124 # need to adjust tolerance for that case.
2125 if left_values.dtype.kind in ["m", "M"]:
2126 # Make sure the i8 representation for tolerance
2127 # matches that for left_values/right_values.
2128 lvs = ensure_wrapped_if_datetimelike(left_values)
2129 tolerance = tolerance.as_unit(lvs.unit)
2130
2131 tolerance = tolerance._value
2132
2133 # TODO: require left_values.dtype == right_values.dtype, or at least
2134 # comparable for e.g. dt64tz
2135 left_values = left_values.view("i8")
2136 right_values = right_values.view("i8")
2137
2138 # a "by" parameter requires special handling
2139 if self.left_by is not None:
2140 # remove 'on' parameter from values if one existed
2141 if self.left_index and self.right_index:
2142 left_by_values = self.left_join_keys
2143 right_by_values = self.right_join_keys
2144 else:
2145 left_by_values = self.left_join_keys[0:-1]
2146 right_by_values = self.right_join_keys[0:-1]
2147
2148 # get tuple representation of values if more than one
2149 if len(left_by_values) == 1:
2150 lbv = left_by_values[0]
2151 rbv = right_by_values[0]
2152 else:
2153 # We get here with non-ndarrays in test_merge_by_col_tz_aware
2154 # and test_merge_groupby_multiple_column_with_categorical_column
2155 lbv = flip(left_by_values)
2156 rbv = flip(right_by_values)
2157
2158 # upcast 'by' parameter because HashTable is limited
2159 by_type = _get_cython_type_upcast(lbv.dtype)
2160 by_type_caster = _type_casters[by_type]
2161 # error: Incompatible types in assignment (expression has type
2162 # "ndarray[Any, dtype[generic]]", variable has type
2163 # "List[Union[Union[ExtensionArray, ndarray[Any, Any]], Index, Series]]")
2164 left_by_values = by_type_caster(lbv) # type: ignore[assignment]
2165 # error: Incompatible types in assignment (expression has type
2166 # "ndarray[Any, dtype[generic]]", variable has type
2167 # "List[Union[Union[ExtensionArray, ndarray[Any, Any]], Index, Series]]")
2168 right_by_values = by_type_caster(rbv) # type: ignore[assignment]
2169
2170 # choose appropriate function by type
2171 func = _asof_by_function(self.direction)
2172 return func(
2173 left_values,
2174 right_values,
2175 left_by_values,
2176 right_by_values,
2177 self.allow_exact_matches,
2178 tolerance,
2179 )
2180 else:
2181 # choose appropriate function by type
2182 func = _asof_by_function(self.direction)
2183 # TODO(cython3):
2184 # Bug in beta1 preventing Cython from choosing
2185 # right specialization when one fused memview is None
2186 # Doesn't matter what type we choose
2187 # (nothing happens anyways since it is None)
2188 # GH 51640
2189 return func[f"{left_values.dtype}_t", object](
2190 left_values,
2191 right_values,
2192 None,
2193 None,
2194 self.allow_exact_matches,
2195 tolerance,
2196 False,
2197 )
2198
2199
2200def _get_multiindex_indexer(
2201 join_keys, index: MultiIndex, sort: bool
2202) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
    # per level: factorize the right index's level values and the left join
    # keys into a shared code space, and record the number of unique keys
2204 mapped = (
2205 _factorize_keys(index.levels[n], join_keys[n], sort=sort)
2206 for n in range(index.nlevels)
2207 )
2208 zipped = zip(*mapped)
2209 rcodes, lcodes, shape = (list(x) for x in zipped)
2210 if sort:
2211 rcodes = list(map(np.take, rcodes, index.codes))
2212 else:
2213 i8copy = lambda a: a.astype("i8", subok=False, copy=True)
2214 rcodes = list(map(i8copy, index.codes))
2215
2216 # fix right labels if there were any nulls
2217 for i, join_key in enumerate(join_keys):
2218 mask = index.codes[i] == -1
2219 if mask.any():
            # check if there were already any nulls at this location;
            # if there were, they are factorized to `shape[i] - 1`
2222 a = join_key[lcodes[i] == shape[i] - 1]
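            # `not a[0] != a[0]` means a[0] is not NaN, i.e. the highest code
            # maps to a real value, so reserve a brand-new code for the nulls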
2223 if a.size == 0 or not a[0] != a[0]:
2224 shape[i] += 1
2225
2226 rcodes[i][mask] = shape[i] - 1
2227
2228 # get flat i8 join keys
2229 lkey, rkey = _get_join_keys(lcodes, rcodes, tuple(shape), sort)
2230
2231 # factorize keys to a dense i8 space
2232 lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)
2233
2234 return libjoin.left_outer_join(lkey, rkey, count, sort=sort)
2235
2236
2237def _get_single_indexer(
2238 join_key, index: Index, sort: bool = False
2239) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
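    # Factorize the join key and the index values into a shared code space,
    # then left-outer-join on the resulting codes.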
2240 left_key, right_key, count = _factorize_keys(join_key, index._values, sort=sort)
2241
2242 return libjoin.left_outer_join(left_key, right_key, count, sort=sort)
2243
2244
2245def _get_empty_indexer() -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
2246 """Return empty join indexers."""
2247 return (
2248 np.array([], dtype=np.intp),
2249 np.array([], dtype=np.intp),
2250 )
2251
2252
2253def _get_no_sort_one_missing_indexer(
2254 n: int, left_missing: bool
2255) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
2256 """
2257 Return join indexers where all of one side is selected without sorting
2258 and none of the other side is selected.
2259
2260 Parameters
2261 ----------
2262 n : int
2263 Length of indexers to create.
2264 left_missing : bool
2265 If True, the left indexer will contain only -1's.
2266 If False, the right indexer will contain only -1's.
2267
2268 Returns
2269 -------
2270 np.ndarray[np.intp]
2271 Left indexer
2272 np.ndarray[np.intp]
2273 Right indexer
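
    Examples
    --------
    A minimal sketch of both cases:

    >>> pd.core.reshape.merge._get_no_sort_one_missing_indexer(3, True)
    (array([-1, -1, -1]), array([0, 1, 2]))
    >>> pd.core.reshape.merge._get_no_sort_one_missing_indexer(3, False)
    (array([0, 1, 2]), array([-1, -1, -1]))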
2274 """
2275 idx = np.arange(n, dtype=np.intp)
2276 idx_missing = np.full(shape=n, fill_value=-1, dtype=np.intp)
2277 if left_missing:
2278 return idx_missing, idx
2279 return idx, idx_missing
2280
2281
2282def _left_join_on_index(
2283 left_ax: Index, right_ax: Index, join_keys, sort: bool = False
2284) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp]]:
2285 if len(join_keys) > 1:
2286 if not (
2287 isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels
2288 ):
2289 raise AssertionError(
2290 "If more than one join key is given then "
2291 "'right_ax' must be a MultiIndex and the "
2292 "number of join keys must be the number of levels in right_ax"
2293 )
2294
2295 left_indexer, right_indexer = _get_multiindex_indexer(
2296 join_keys, right_ax, sort=sort
2297 )
2298 else:
2299 jkey = join_keys[0]
2300
2301 left_indexer, right_indexer = _get_single_indexer(jkey, right_ax, sort=sort)
2302
2303 if sort or len(left_ax) != len(left_indexer):
2304 # if asked to sort or there are 1-to-many matches
2305 join_index = left_ax.take(left_indexer)
2306 return join_index, left_indexer, right_indexer
2307
2308 # left frame preserves order & length of its index
2309 return left_ax, None, right_indexer
2310
2311
2312def _factorize_keys(
2313 lk: ArrayLike,
2314 rk: ArrayLike,
2315 sort: bool = True,
2316 how: MergeHow | Literal["asof"] = "inner",
2317) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
2318 """
2319 Encode left and right keys as enumerated types.
2320
2321 This is used to get the join indexers to be used when merging DataFrames.
2322
2323 Parameters
2324 ----------
2325 lk : array-like
2326 Left key.
2327 rk : array-like
2328 Right key.
    sort : bool, default True
2330 If True, the encoding is done such that the unique elements in the
2331 keys are sorted.
    how : {'left', 'right', 'outer', 'inner'}, default 'inner'
2333 Type of merge.
2334
2335 Returns
2336 -------
2337 np.ndarray[np.intp]
        Left (resp. right if called with `how='right'`) labels, as enumerated type.
    np.ndarray[np.intp]
        Right (resp. left if called with `how='right'`) labels, as enumerated type.
2341 int
2342 Number of unique elements in union of left and right labels.
2343
2344 See Also
2345 --------
2346 merge : Merge DataFrame or named Series objects
2347 with a database-style join.
2348 algorithms.factorize : Encode the object as an enumerated type
2349 or categorical variable.
2350
2351 Examples
2352 --------
2353 >>> lk = np.array(["a", "c", "b"])
2354 >>> rk = np.array(["a", "c"])
2355
2356 Here, the unique values are `'a', 'b', 'c'`. With the default
2357 `sort=True`, the encoding will be `{0: 'a', 1: 'b', 2: 'c'}`:
2358
2359 >>> pd.core.reshape.merge._factorize_keys(lk, rk)
2360 (array([0, 2, 1]), array([0, 2]), 3)
2361
    With `sort=False`, the encoding will correspond to the order
2363 in which the unique elements first appear: `{0: 'a', 1: 'c', 2: 'b'}`:
2364
2365 >>> pd.core.reshape.merge._factorize_keys(lk, rk, sort=False)
2366 (array([0, 1, 2]), array([0, 1]), 3)
2367 """
2368 # Some pre-processing for non-ndarray lk / rk
2369 lk = extract_array(lk, extract_numpy=True, extract_range=True)
2370 rk = extract_array(rk, extract_numpy=True, extract_range=True)
2371 # TODO: if either is a RangeIndex, we can likely factorize more efficiently?
2372
2373 if (
2374 isinstance(lk.dtype, DatetimeTZDtype) and isinstance(rk.dtype, DatetimeTZDtype)
2375 ) or (
2376 isinstance(lk.dtype, np.dtype)
2377 and lk.dtype.kind == "M"
2378 and isinstance(rk.dtype, np.dtype)
2379 and rk.dtype.kind == "M"
2380 ):
2381 # Extract the ndarray (UTC-localized) values
    # Note: we don't need the dtypes to match, as these can still be compared
2383 lk, rk = cast("DatetimeArray", lk)._ensure_matching_resos(rk)
2384 lk = cast("DatetimeArray", lk)._ndarray
2385 rk = cast("DatetimeArray", rk)._ndarray
2386
2387 elif (
2388 is_categorical_dtype(lk.dtype)
2389 and is_categorical_dtype(rk.dtype)
2390 and is_dtype_equal(lk.dtype, rk.dtype)
2391 ):
2392 assert isinstance(lk, Categorical)
2393 assert isinstance(rk, Categorical)
2394 # Cast rk to encoding so we can compare codes with lk
2395
2396 rk = lk._encode_with_my_categories(rk)
2397
2398 lk = ensure_int64(lk.codes)
2399 rk = ensure_int64(rk.codes)
2400
2401 elif isinstance(lk, ExtensionArray) and is_dtype_equal(lk.dtype, rk.dtype):
2402 if not isinstance(lk, BaseMaskedArray) and not (
2403 # exclude arrow dtypes that would get cast to object
2404 isinstance(lk.dtype, ArrowDtype)
2405 and is_numeric_dtype(lk.dtype.numpy_dtype)
2406 ):
2407 lk, _ = lk._values_for_factorize()
2408
2409 # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute
2410 # "_values_for_factorize"
2411 rk, _ = rk._values_for_factorize() # type: ignore[union-attr]
2412
2413 if needs_i8_conversion(lk.dtype) and lk.dtype == rk.dtype:
2414 # GH#23917 TODO: Needs tests for non-matching dtypes
2415 # GH#23917 TODO: needs tests for case where lk is integer-dtype
2416 # and rk is datetime-dtype
2417 lk = np.asarray(lk, dtype=np.int64)
2418 rk = np.asarray(rk, dtype=np.int64)
2419
2420 klass, lk, rk = _convert_arrays_and_get_rizer_klass(lk, rk)
2421
2422 rizer = klass(max(len(lk), len(rk)))
2423
2424 if isinstance(lk, BaseMaskedArray):
2425 assert isinstance(rk, BaseMaskedArray)
2426 llab = rizer.factorize(lk._data, mask=lk._mask)
2427 rlab = rizer.factorize(rk._data, mask=rk._mask)
2428 elif isinstance(lk, ArrowExtensionArray):
2429 assert isinstance(rk, ArrowExtensionArray)
2430 # we can only get here with numeric dtypes
2431 # TODO: Remove when we have a Factorizer for Arrow
2432 llab = rizer.factorize(
2433 lk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=lk.isna()
2434 )
2435 rlab = rizer.factorize(
2436 rk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=rk.isna()
2437 )
2438 else:
2439 # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type
2440 # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]],
2441 # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]"
2442 llab = rizer.factorize(lk) # type: ignore[arg-type]
2443 rlab = rizer.factorize(rk) # type: ignore[arg-type]
2444 assert llab.dtype == np.dtype(np.intp), llab.dtype
2445 assert rlab.dtype == np.dtype(np.intp), rlab.dtype
2446
2447 count = rizer.get_count()
2448
2449 if sort:
2450 uniques = rizer.uniques.to_array()
2451 llab, rlab = _sort_labels(uniques, llab, rlab)
2452
2453 # NA group
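    # Remap the NA codes (-1) on both sides to the same new code `count`, so
    # that missing keys match each other during the join; e.g. llab [0, -1]
    # and rlab [-1, 1] with count=2 become [0, 2] and [2, 1] with count=3.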
2454 lmask = llab == -1
2455 lany = lmask.any()
2456 rmask = rlab == -1
2457 rany = rmask.any()
2458
2459 if lany or rany:
2460 if lany:
2461 np.putmask(llab, lmask, count)
2462 if rany:
2463 np.putmask(rlab, rmask, count)
2464 count += 1
2465
2466 if how == "right":
2467 return rlab, llab, count
2468 return llab, rlab, count
2469
2470
2471def _convert_arrays_and_get_rizer_klass(
2472 lk: ArrayLike, rk: ArrayLike
2473) -> tuple[type[libhashtable.Factorizer], ArrayLike, ArrayLike]:
2474 klass: type[libhashtable.Factorizer]
2475 if is_numeric_dtype(lk.dtype):
2476 if not is_dtype_equal(lk, rk):
2477 dtype = find_common_type([lk.dtype, rk.dtype])
2478 if isinstance(dtype, ExtensionDtype):
2479 cls = dtype.construct_array_type()
2480 if not isinstance(lk, ExtensionArray):
2481 lk = cls._from_sequence(lk, dtype=dtype, copy=False)
2482 else:
2483 lk = lk.astype(dtype)
2484
2485 if not isinstance(rk, ExtensionArray):
2486 rk = cls._from_sequence(rk, dtype=dtype, copy=False)
2487 else:
2488 rk = rk.astype(dtype)
2489 else:
2490 lk = lk.astype(dtype)
2491 rk = rk.astype(dtype)
2492 if isinstance(lk, BaseMaskedArray):
2493 # Invalid index type "type" for "Dict[Type[object], Type[Factorizer]]";
2494 # expected type "Type[object]"
2495 klass = _factorizers[lk.dtype.type] # type: ignore[index]
2496 elif isinstance(lk.dtype, ArrowDtype):
2497 klass = _factorizers[lk.dtype.numpy_dtype.type]
2498 else:
2499 klass = _factorizers[lk.dtype.type]
2500
2501 else:
2502 klass = libhashtable.ObjectFactorizer
2503 lk = ensure_object(lk)
2504 rk = ensure_object(rk)
2505 return klass, lk, rk
2506
2507
2508def _sort_labels(
2509 uniques: np.ndarray, left: npt.NDArray[np.intp], right: npt.NDArray[np.intp]
2510) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
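    # Remap the codes in `left`/`right` so they index into `uniques` after
    # sorting; safe_sort keeps the -1 NA sentinel in place.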
2511 llength = len(left)
2512 labels = np.concatenate([left, right])
2513
2514 _, new_labels = algos.safe_sort(uniques, labels, use_na_sentinel=True)
2515 new_left, new_right = new_labels[:llength], new_labels[llength:]
2516
2517 return new_left, new_right
2518
2519
2520def _get_join_keys(
2521 llab: list[npt.NDArray[np.int64 | np.intp]],
2522 rlab: list[npt.NDArray[np.int64 | np.intp]],
2523 shape: Shape,
2524 sort: bool,
2525) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]:
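    # Collapse the per-level codes into one int64 key per row, in the spirit
    # of np.ravel_multi_index: with shape (3, 4), codes (i, j) map to
    # i * 4 + j. If combining every level could overflow int64, only the
    # first `nlev` levels are combined, the partial keys are densified by
    # factorizing, and the remaining levels are folded in recursively.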
2526 # how many levels can be done without overflow
2527 nlev = next(
2528 lev
2529 for lev in range(len(shape), 0, -1)
2530 if not is_int64_overflow_possible(shape[:lev])
2531 )
2532
2533 # get keys for the first `nlev` levels
2534 stride = np.prod(shape[1:nlev], dtype="i8")
2535 lkey = stride * llab[0].astype("i8", subok=False, copy=False)
2536 rkey = stride * rlab[0].astype("i8", subok=False, copy=False)
2537
2538 for i in range(1, nlev):
2539 with np.errstate(divide="ignore"):
2540 stride //= shape[i]
2541 lkey += llab[i] * stride
2542 rkey += rlab[i] * stride
2543
2544 if nlev == len(shape): # all done!
2545 return lkey, rkey
2546
2547 # densify current keys to avoid overflow
2548 lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)
2549
2550 llab = [lkey] + llab[nlev:]
2551 rlab = [rkey] + rlab[nlev:]
2552 shape = (count,) + shape[nlev:]
2553
2554 return _get_join_keys(llab, rlab, shape, sort)
2555
2556
2557def _should_fill(lname, rname) -> bool:
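    # e.g. ("key", "key") -> True and ("lkey", "rkey") -> False; non-str
    # names (such as MultiIndex tuples) always return True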
2558 if not isinstance(lname, str) or not isinstance(rname, str):
2559 return True
2560 return lname == rname
2561
2562
2563def _any(x) -> bool:
2564 return x is not None and com.any_not_none(*x)
2565
2566
2567def _validate_operand(obj: DataFrame | Series) -> DataFrame:
2568 if isinstance(obj, ABCDataFrame):
2569 return obj
2570 elif isinstance(obj, ABCSeries):
2571 if obj.name is None:
2572 raise ValueError("Cannot merge a Series without a name")
2573 return obj.to_frame()
2574 else:
2575 raise TypeError(
2576 f"Can only merge Series or DataFrame objects, a {type(obj)} was passed"
2577 )
2578
2579
2580def _items_overlap_with_suffix(
2581 left: Index, right: Index, suffixes: Suffixes
2582) -> tuple[Index, Index]:
2583 """
    Validate the type of ``suffixes`` and, if the two indices overlap, add
    suffixes to the overlapping entries.

    If the corresponding suffix is empty, the entry is simply converted to
    string.
2589
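    Examples
    --------
    Illustrative; only the overlapping entry "a" is suffixed:

    >>> pd.core.reshape.merge._items_overlap_with_suffix(
    ...     pd.Index(["a", "b"]), pd.Index(["a", "c"]), ("_x", "_y")
    ... )
    (Index(['a_x', 'b'], dtype='object'), Index(['a_y', 'c'], dtype='object'))
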
2590 """
2591 if not is_list_like(suffixes, allow_sets=False) or isinstance(suffixes, dict):
2592 raise TypeError(
            f"Passing 'suffixes' as a {type(suffixes)} is not supported. "
2594 "Provide 'suffixes' as a tuple instead."
2595 )
2596
2597 to_rename = left.intersection(right)
2598 if len(to_rename) == 0:
2599 return left, right
2600
2601 lsuffix, rsuffix = suffixes
2602
2603 if not lsuffix and not rsuffix:
2604 raise ValueError(f"columns overlap but no suffix specified: {to_rename}")
2605
2606 def renamer(x, suffix):
2607 """
2608 Rename the left and right indices.
2609
2610 If there is overlap, and suffix is not None, add
2611 suffix, otherwise, leave it as-is.
2612
2613 Parameters
2614 ----------
2615 x : original column name
2616 suffix : str or None
2617
2618 Returns
2619 -------
2620 x : renamed column name
2621 """
2622 if x in to_rename and suffix is not None:
2623 return f"{x}{suffix}"
2624 return x
2625
2626 lrenamer = partial(renamer, suffix=lsuffix)
2627 rrenamer = partial(renamer, suffix=rsuffix)
2628
2629 llabels = left._transform_index(lrenamer)
2630 rlabels = right._transform_index(rrenamer)
2631
2632 dups = []
2633 if not llabels.is_unique:
        # Only raise when duplicates are caused by the suffixes; columns that
        # were already duplicated in the input should not raise
2636 dups = llabels[(llabels.duplicated()) & (~left.duplicated())].tolist()
2637 if not rlabels.is_unique:
2638 dups.extend(rlabels[(rlabels.duplicated()) & (~right.duplicated())].tolist())
2639 if dups:
2640 raise MergeError(
2641 f"Passing 'suffixes' which cause duplicate columns {set(dups)} is "
2642 f"not allowed.",
2643 )
2644
2645 return llabels, rlabels