1"""
2SQL-style merge routines
3"""
4from __future__ import annotations
5
6from collections.abc import (
7 Hashable,
8 Sequence,
9)
10import datetime
11from functools import partial
12from typing import (
13 TYPE_CHECKING,
14 Literal,
15 cast,
16 final,
17)
18import uuid
19import warnings
20
21import numpy as np
22
23from pandas._libs import (
24 Timedelta,
25 hashtable as libhashtable,
26 join as libjoin,
27 lib,
28)
29from pandas._libs.lib import is_range_indexer
30from pandas._typing import (
31 AnyArrayLike,
32 ArrayLike,
33 IndexLabel,
34 JoinHow,
35 MergeHow,
36 Shape,
37 Suffixes,
38 npt,
39)
40from pandas.errors import MergeError
41from pandas.util._decorators import (
42 Appender,
43 Substitution,
44 cache_readonly,
45)
46from pandas.util._exceptions import find_stack_level
47
48from pandas.core.dtypes.base import ExtensionDtype
49from pandas.core.dtypes.cast import find_common_type
50from pandas.core.dtypes.common import (
51 ensure_int64,
52 ensure_object,
53 is_bool,
54 is_bool_dtype,
55 is_float_dtype,
56 is_integer,
57 is_integer_dtype,
58 is_list_like,
59 is_number,
60 is_numeric_dtype,
61 is_object_dtype,
62 is_string_dtype,
63 needs_i8_conversion,
64)
65from pandas.core.dtypes.dtypes import (
66 CategoricalDtype,
67 DatetimeTZDtype,
68)
69from pandas.core.dtypes.generic import (
70 ABCDataFrame,
71 ABCSeries,
72)
73from pandas.core.dtypes.missing import (
74 isna,
75 na_value_for_dtype,
76)
77
78from pandas import (
79 ArrowDtype,
80 Categorical,
81 Index,
82 MultiIndex,
83 Series,
84)
85import pandas.core.algorithms as algos
86from pandas.core.arrays import (
87 ArrowExtensionArray,
88 BaseMaskedArray,
89 ExtensionArray,
90)
91from pandas.core.arrays.string_ import StringDtype
92import pandas.core.common as com
93from pandas.core.construction import (
94 ensure_wrapped_if_datetimelike,
95 extract_array,
96)
97from pandas.core.frame import _merge_doc
98from pandas.core.indexes.api import default_index
99from pandas.core.sorting import (
100 get_group_index,
101 is_int64_overflow_possible,
102)
103
104if TYPE_CHECKING:
105 from pandas import DataFrame
106 from pandas.core import groupby
107 from pandas.core.arrays import DatetimeArray
108 from pandas.core.indexes.frozen import FrozenList
109
110_factorizers = {
111 np.int64: libhashtable.Int64Factorizer,
112 np.longlong: libhashtable.Int64Factorizer,
113 np.int32: libhashtable.Int32Factorizer,
114 np.int16: libhashtable.Int16Factorizer,
115 np.int8: libhashtable.Int8Factorizer,
116 np.uint64: libhashtable.UInt64Factorizer,
117 np.uint32: libhashtable.UInt32Factorizer,
118 np.uint16: libhashtable.UInt16Factorizer,
119 np.uint8: libhashtable.UInt8Factorizer,
120 np.bool_: libhashtable.UInt8Factorizer,
121 np.float64: libhashtable.Float64Factorizer,
122 np.float32: libhashtable.Float32Factorizer,
123 np.complex64: libhashtable.Complex64Factorizer,
124 np.complex128: libhashtable.Complex128Factorizer,
125 np.object_: libhashtable.ObjectFactorizer,
126}
127
128# See https://github.com/pandas-dev/pandas/issues/52451
129if np.intc is not np.int32:
130 _factorizers[np.intc] = libhashtable.Int64Factorizer
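
# Each factorizer maps raw key values to dense integer labels. Illustrative
# use (see _factorize_keys elsewhere in this module): a factorizer is
# constructed with a size hint and ``.factorize(values)`` then returns an
# intp label array for each side of the merge.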

_known = (np.ndarray, ExtensionArray, Index, ABCSeries)


@Substitution("\nleft : DataFrame or named Series")
@Appender(_merge_doc, indents=0)
def merge(
    left: DataFrame | Series,
    right: DataFrame | Series,
    how: MergeHow = "inner",
    on: IndexLabel | AnyArrayLike | None = None,
    left_on: IndexLabel | AnyArrayLike | None = None,
    right_on: IndexLabel | AnyArrayLike | None = None,
    left_index: bool = False,
    right_index: bool = False,
    sort: bool = False,
    suffixes: Suffixes = ("_x", "_y"),
    copy: bool | None = None,
    indicator: str | bool = False,
    validate: str | None = None,
) -> DataFrame:
    left_df = _validate_operand(left)
    right_df = _validate_operand(right)
    if how == "cross":
        return _cross_merge(
            left_df,
            right_df,
            on=on,
            left_on=left_on,
            right_on=right_on,
            left_index=left_index,
            right_index=right_index,
            sort=sort,
            suffixes=suffixes,
            indicator=indicator,
            validate=validate,
            copy=copy,
        )
    else:
        op = _MergeOperation(
            left_df,
            right_df,
            how=how,
            on=on,
            left_on=left_on,
            right_on=right_on,
            left_index=left_index,
            right_index=right_index,
            sort=sort,
            suffixes=suffixes,
            indicator=indicator,
            validate=validate,
        )
        return op.get_result(copy=copy)


def _cross_merge(
    left: DataFrame,
    right: DataFrame,
    on: IndexLabel | AnyArrayLike | None = None,
    left_on: IndexLabel | AnyArrayLike | None = None,
    right_on: IndexLabel | AnyArrayLike | None = None,
    left_index: bool = False,
    right_index: bool = False,
    sort: bool = False,
    suffixes: Suffixes = ("_x", "_y"),
    copy: bool | None = None,
    indicator: str | bool = False,
    validate: str | None = None,
) -> DataFrame:
    """
    See merge.__doc__ with how='cross'
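
    Examples
    --------
    A minimal illustration of the cross-merge semantics implemented here
    (hypothetical frames, for exposition only; every left row is paired
    with every right row via a temporary constant key column):

    >>> left = pd.DataFrame({"a": [1, 2]})
    >>> right = pd.DataFrame({"b": ["x", "y"]})
    >>> pd.merge(left, right, how="cross")
       a  b
    0  1  x
    1  1  y
    2  2  x
    3  2  y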
203 """
204
205 if (
206 left_index
207 or right_index
208 or right_on is not None
209 or left_on is not None
210 or on is not None
211 ):
212 raise MergeError(
213 "Can not pass on, right_on, left_on or set right_index=True or "
214 "left_index=True"
215 )
216
217 cross_col = f"_cross_{uuid.uuid4()}"
218 left = left.assign(**{cross_col: 1})
219 right = right.assign(**{cross_col: 1})
220
221 left_on = right_on = [cross_col]
222
223 res = merge(
224 left,
225 right,
226 how="inner",
227 on=on,
228 left_on=left_on,
229 right_on=right_on,
230 left_index=left_index,
231 right_index=right_index,
232 sort=sort,
233 suffixes=suffixes,
234 indicator=indicator,
235 validate=validate,
236 copy=copy,
237 )
238 del res[cross_col]
239 return res
240
241
242def _groupby_and_merge(
243 by, left: DataFrame | Series, right: DataFrame | Series, merge_pieces
244):
245 """
246 groupby & merge; we are always performing a left-by type operation
247
248 Parameters
249 ----------
250 by: field to group
251 left: DataFrame
252 right: DataFrame
253 merge_pieces: function for merging
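
    Notes
    -----
    Rough usage sketch (illustrative only; this helper is internal and the
    frames below are hypothetical):

    >>> left = pd.DataFrame({"g": ["a", "a", "b"], "k": [1, 2, 1]})
    >>> right = pd.DataFrame({"g": ["a"], "k": [1], "v": [100]})
    >>> merger = lambda lhs, rhs: lhs.merge(rhs, on=["g", "k"], how="left")
    >>> result, grouped = _groupby_and_merge("g", left, right, merger)  # doctest: +SKIP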
254 """
255 pieces = []
256 if not isinstance(by, (list, tuple)):
257 by = [by]
258
259 lby = left.groupby(by, sort=False)
260 rby: groupby.DataFrameGroupBy | groupby.SeriesGroupBy | None = None
261
262 # if we can groupby the rhs
263 # then we can get vastly better perf
264 if all(item in right.columns for item in by):
265 rby = right.groupby(by, sort=False)
266
267 for key, lhs in lby._grouper.get_iterator(lby._selected_obj, axis=lby.axis):
268 if rby is None:
269 rhs = right
270 else:
271 try:
272 rhs = right.take(rby.indices[key])
273 except KeyError:
                # key doesn't exist in right
                lcols = lhs.columns.tolist()
                cols = lcols + [r for r in right.columns if r not in set(lcols)]
                merged = lhs.reindex(columns=cols)
                merged.index = range(len(merged))
                pieces.append(merged)
                continue

        merged = merge_pieces(lhs, rhs)

        # make sure join keys are in the merged
        # TODO, should merge_pieces do this?
        merged[by] = key

        pieces.append(merged)

    # preserve the original order
    # if we have a missing piece this can be reset
    from pandas.core.reshape.concat import concat

    result = concat(pieces, ignore_index=True)
    result = result.reindex(columns=pieces[0].columns, copy=False)
    return result, lby


def merge_ordered(
    left: DataFrame | Series,
    right: DataFrame | Series,
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_by=None,
    right_by=None,
    fill_method: str | None = None,
    suffixes: Suffixes = ("_x", "_y"),
    how: JoinHow = "outer",
) -> DataFrame:
    """
    Perform a merge for ordered data with optional filling/interpolation.

    Designed for ordered data like time series data. Optionally
    perform group-wise merge (see examples).

    Parameters
    ----------
    left : DataFrame or named Series
    right : DataFrame or named Series
    on : label or list
        Field names to join on. Must be found in both DataFrames.
    left_on : label or list, or array-like
        Field names to join on in left DataFrame. Can be a vector or list of
        vectors of the length of the DataFrame to use a particular vector as
        the join key instead of columns.
    right_on : label or list, or array-like
        Field names to join on in right DataFrame or vector/list of vectors per
        left_on docs.
    left_by : column name or list of column names
        Group left DataFrame by group columns and merge piece by piece with
        right DataFrame. Must be None if either left or right are a Series.
    right_by : column name or list of column names
        Group right DataFrame by group columns and merge piece by piece with
        left DataFrame. Must be None if either left or right are a Series.
    fill_method : {'ffill', None}, default None
        Interpolation method for data.
    suffixes : list-like, default is ("_x", "_y")
        A length-2 sequence where each element is optionally a string
        indicating the suffix to add to overlapping column names in
        `left` and `right` respectively. Pass a value of `None` instead
        of a string to indicate that the column name from `left` or
        `right` should be left as-is, with no suffix. At least one of the
        values must not be None.

    how : {'left', 'right', 'outer', 'inner'}, default 'outer'
        * left: use only keys from left frame (SQL: left outer join)
        * right: use only keys from right frame (SQL: right outer join)
        * outer: use union of keys from both frames (SQL: full outer join)
        * inner: use intersection of keys from both frames (SQL: inner join).

    Returns
    -------
    DataFrame
        The merged DataFrame output type will be the same as
        'left', if it is a subclass of DataFrame.

    See Also
    --------
    merge : Merge with a database-style join.
    merge_asof : Merge on nearest keys.

    Examples
    --------
    >>> from pandas import merge_ordered
    >>> df1 = pd.DataFrame(
    ...     {
    ...         "key": ["a", "c", "e", "a", "c", "e"],
    ...         "lvalue": [1, 2, 3, 1, 2, 3],
    ...         "group": ["a", "a", "a", "b", "b", "b"]
    ...     }
    ... )
    >>> df1
      key  lvalue group
    0   a       1     a
    1   c       2     a
    2   e       3     a
    3   a       1     b
    4   c       2     b
    5   e       3     b

    >>> df2 = pd.DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})
    >>> df2
      key  rvalue
    0   b       1
    1   c       2
    2   d       3

    >>> merge_ordered(df1, df2, fill_method="ffill", left_by="group")
      key  lvalue group  rvalue
    0   a       1     a     NaN
    1   b       1     a     1.0
    2   c       2     a     2.0
    3   d       2     a     3.0
    4   e       3     a     3.0
    5   a       1     b     NaN
    6   b       1     b     1.0
    7   c       2     b     2.0
    8   d       2     b     3.0
    9   e       3     b     3.0
    """

    def _merger(x, y) -> DataFrame:
        # perform the ordered merge operation
        op = _OrderedMerge(
            x,
            y,
            on=on,
            left_on=left_on,
            right_on=right_on,
            suffixes=suffixes,
            fill_method=fill_method,
            how=how,
        )
        return op.get_result()

    if left_by is not None and right_by is not None:
        raise ValueError("Can only group either left or right frames")
    if left_by is not None:
        if isinstance(left_by, str):
            left_by = [left_by]
        check = set(left_by).difference(left.columns)
        if len(check) != 0:
            raise KeyError(f"{check} not found in left columns")
        result, _ = _groupby_and_merge(left_by, left, right, lambda x, y: _merger(x, y))
    elif right_by is not None:
        if isinstance(right_by, str):
            right_by = [right_by]
        check = set(right_by).difference(right.columns)
        if len(check) != 0:
            raise KeyError(f"{check} not found in right columns")
        result, _ = _groupby_and_merge(
            right_by, right, left, lambda x, y: _merger(y, x)
        )
    else:
        result = _merger(left, right)
    return result


def merge_asof(
    left: DataFrame | Series,
    right: DataFrame | Series,
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_index: bool = False,
    right_index: bool = False,
    by=None,
    left_by=None,
    right_by=None,
    suffixes: Suffixes = ("_x", "_y"),
    tolerance: int | Timedelta | None = None,
    allow_exact_matches: bool = True,
    direction: str = "backward",
) -> DataFrame:
    """
    Perform a merge by key distance.

    This is similar to a left-join except that we match on nearest
    key rather than equal keys. Both DataFrames must be sorted by the key.

    For each row in the left DataFrame:

    - A "backward" search selects the last row in the right DataFrame whose
      'on' key is less than or equal to the left's key.

    - A "forward" search selects the first row in the right DataFrame whose
      'on' key is greater than or equal to the left's key.

    - A "nearest" search selects the row in the right DataFrame whose 'on'
      key is closest in absolute distance to the left's key.

    Optionally match on equivalent keys with 'by' before searching with 'on'.

    Parameters
    ----------
    left : DataFrame or named Series
    right : DataFrame or named Series
    on : label
        Field name to join on. Must be found in both DataFrames.
        The data MUST be ordered. Furthermore this must be a numeric column,
        such as datetimelike, integer, or float. On or left_on/right_on
        must be given.
    left_on : label
        Field name to join on in left DataFrame.
    right_on : label
        Field name to join on in right DataFrame.
    left_index : bool
        Use the index of the left DataFrame as the join key.
    right_index : bool
        Use the index of the right DataFrame as the join key.
    by : column name or list of column names
        Match on these columns before performing merge operation.
    left_by : column name
        Field names to match on in the left DataFrame.
    right_by : column name
        Field names to match on in the right DataFrame.
    suffixes : 2-length sequence (tuple, list, ...)
        Suffix to apply to overlapping column names in the left and right
        side, respectively.
    tolerance : int or Timedelta, optional, default None
        Select asof tolerance within this range; must be compatible
        with the merge index.
    allow_exact_matches : bool, default True

        - If True, allow matching with the same 'on' value
          (i.e. less-than-or-equal-to / greater-than-or-equal-to)
        - If False, don't match the same 'on' value
          (i.e., strictly less-than / strictly greater-than).

    direction : 'backward' (default), 'forward', or 'nearest'
        Whether to search for prior, subsequent, or closest matches.

    Returns
    -------
    DataFrame

    See Also
    --------
    merge : Merge with a database-style join.
    merge_ordered : Merge with optional filling/interpolation.

    Examples
    --------
    >>> left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]})
    >>> left
        a left_val
    0   1        a
    1   5        b
    2  10        c

    >>> right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]})
    >>> right
       a  right_val
    0  1          1
    1  2          2
    2  3          3
    3  6          6
    4  7          7

    >>> pd.merge_asof(left, right, on="a")
        a left_val  right_val
    0   1        a          1
    1   5        b          3
    2  10        c          7

    >>> pd.merge_asof(left, right, on="a", allow_exact_matches=False)
        a left_val  right_val
    0   1        a        NaN
    1   5        b        3.0
    2  10        c        7.0

    >>> pd.merge_asof(left, right, on="a", direction="forward")
        a left_val  right_val
    0   1        a        1.0
    1   5        b        6.0
    2  10        c        NaN

    >>> pd.merge_asof(left, right, on="a", direction="nearest")
        a left_val  right_val
    0   1        a          1
    1   5        b          6
    2  10        c          7

    We can use indexed DataFrames as well.

    >>> left = pd.DataFrame({"left_val": ["a", "b", "c"]}, index=[1, 5, 10])
    >>> left
       left_val
    1         a
    5         b
    10        c

    >>> right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7]}, index=[1, 2, 3, 6, 7])
    >>> right
       right_val
    1          1
    2          2
    3          3
    6          6
    7          7

    >>> pd.merge_asof(left, right, left_index=True, right_index=True)
       left_val  right_val
    1         a          1
    5         b          3
    10        c          7

    Here is a real-world time-series example

    >>> quotes = pd.DataFrame(
    ...     {
    ...         "time": [
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.030"),
    ...             pd.Timestamp("2016-05-25 13:30:00.041"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.049"),
    ...             pd.Timestamp("2016-05-25 13:30:00.072"),
    ...             pd.Timestamp("2016-05-25 13:30:00.075")
    ...         ],
    ...         "ticker": [
    ...             "GOOG",
    ...             "MSFT",
    ...             "MSFT",
    ...             "MSFT",
    ...             "GOOG",
    ...             "AAPL",
    ...             "GOOG",
    ...             "MSFT"
    ...         ],
    ...         "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
    ...         "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]
    ...     }
    ... )
    >>> quotes
                         time ticker     bid     ask
    0 2016-05-25 13:30:00.023   GOOG  720.50  720.93
    1 2016-05-25 13:30:00.023   MSFT   51.95   51.96
    2 2016-05-25 13:30:00.030   MSFT   51.97   51.98
    3 2016-05-25 13:30:00.041   MSFT   51.99   52.00
    4 2016-05-25 13:30:00.048   GOOG  720.50  720.93
    5 2016-05-25 13:30:00.049   AAPL   97.99   98.01
    6 2016-05-25 13:30:00.072   GOOG  720.50  720.88
    7 2016-05-25 13:30:00.075   MSFT   52.01   52.03

    >>> trades = pd.DataFrame(
    ...     {
    ...         "time": [
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.038"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048")
    ...         ],
    ...         "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
    ...         "price": [51.95, 51.95, 720.77, 720.92, 98.0],
    ...         "quantity": [75, 155, 100, 100, 100]
    ...     }
    ... )
    >>> trades
                         time ticker   price  quantity
    0 2016-05-25 13:30:00.023   MSFT   51.95        75
    1 2016-05-25 13:30:00.038   MSFT   51.95       155
    2 2016-05-25 13:30:00.048   GOOG  720.77       100
    3 2016-05-25 13:30:00.048   GOOG  720.92       100
    4 2016-05-25 13:30:00.048   AAPL   98.00       100

    By default we are taking the asof of the quotes

    >>> pd.merge_asof(trades, quotes, on="time", by="ticker")
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155   51.97   51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 2ms between the quote time and the trade time

    >>> pd.merge_asof(
    ...     trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("2ms")
    ... )
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155     NaN     NaN
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 10ms between the quote time and the trade time
    and we exclude exact matches on time. However *prior* data will
    propagate forward

    >>> pd.merge_asof(
    ...     trades,
    ...     quotes,
    ...     on="time",
    ...     by="ticker",
    ...     tolerance=pd.Timedelta("10ms"),
    ...     allow_exact_matches=False
    ... )
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75     NaN     NaN
    1 2016-05-25 13:30:00.038   MSFT   51.95       155   51.97   51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100     NaN     NaN
    3 2016-05-25 13:30:00.048   GOOG  720.92       100     NaN     NaN
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN
    """
    op = _AsOfMerge(
        left,
        right,
        on=on,
        left_on=left_on,
        right_on=right_on,
        left_index=left_index,
        right_index=right_index,
        by=by,
        left_by=left_by,
        right_by=right_by,
        suffixes=suffixes,
        how="asof",
        tolerance=tolerance,
        allow_exact_matches=allow_exact_matches,
        direction=direction,
    )
    return op.get_result()


# TODO: transformations??
# TODO: only copy DataFrames when modification necessary
class _MergeOperation:
    """
    Perform a database (SQL) merge operation between two DataFrame or Series
    objects using either columns as keys or their row indexes
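
    Examples
    --------
    Internal class; a minimal sketch of how :func:`merge` drives it
    (hypothetical frames, output elided):

    >>> left = pd.DataFrame({"key": [1, 2], "x": [10, 20]})
    >>> right = pd.DataFrame({"key": [2, 3], "y": ["b", "c"]})
    >>> op = _MergeOperation(left, right, how="inner", on="key")
    >>> op.get_result()  # doctest: +SKIP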
717 """
718
719 _merge_type = "merge"
720 how: JoinHow | Literal["asof"]
721 on: IndexLabel | None
722 # left_on/right_on may be None when passed, but in validate_specification
723 # get replaced with non-None.
724 left_on: Sequence[Hashable | AnyArrayLike]
725 right_on: Sequence[Hashable | AnyArrayLike]
726 left_index: bool
727 right_index: bool
728 sort: bool
729 suffixes: Suffixes
730 copy: bool
731 indicator: str | bool
732 validate: str | None
733 join_names: list[Hashable]
734 right_join_keys: list[ArrayLike]
735 left_join_keys: list[ArrayLike]
736
737 def __init__(
738 self,
739 left: DataFrame | Series,
740 right: DataFrame | Series,
741 how: JoinHow | Literal["asof"] = "inner",
742 on: IndexLabel | AnyArrayLike | None = None,
743 left_on: IndexLabel | AnyArrayLike | None = None,
744 right_on: IndexLabel | AnyArrayLike | None = None,
745 left_index: bool = False,
746 right_index: bool = False,
747 sort: bool = True,
748 suffixes: Suffixes = ("_x", "_y"),
749 indicator: str | bool = False,
750 validate: str | None = None,
751 ) -> None:
752 _left = _validate_operand(left)
753 _right = _validate_operand(right)
754 self.left = self.orig_left = _left
755 self.right = self.orig_right = _right
756 self.how = how
757
758 self.on = com.maybe_make_list(on)
759
760 self.suffixes = suffixes
761 self.sort = sort or how == "outer"
762
763 self.left_index = left_index
764 self.right_index = right_index
765
766 self.indicator = indicator
767
768 if not is_bool(left_index):
769 raise ValueError(
770 f"left_index parameter must be of type bool, not {type(left_index)}"
771 )
772 if not is_bool(right_index):
773 raise ValueError(
774 f"right_index parameter must be of type bool, not {type(right_index)}"
775 )
776
777 # GH 40993: raise when merging between different levels; enforced in 2.0
778 if _left.columns.nlevels != _right.columns.nlevels:
779 msg = (
780 "Not allowed to merge between different levels. "
781 f"({_left.columns.nlevels} levels on the left, "
782 f"{_right.columns.nlevels} on the right)"
783 )
784 raise MergeError(msg)
785
786 self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on)
787
788 (
789 self.left_join_keys,
790 self.right_join_keys,
791 self.join_names,
792 left_drop,
793 right_drop,
794 ) = self._get_merge_keys()
795
796 if left_drop:
797 self.left = self.left._drop_labels_or_levels(left_drop)
798
799 if right_drop:
800 self.right = self.right._drop_labels_or_levels(right_drop)
801
802 self._maybe_require_matching_dtypes(self.left_join_keys, self.right_join_keys)
803 self._validate_tolerance(self.left_join_keys)
804
805 # validate the merge keys dtypes. We may need to coerce
806 # to avoid incompatible dtypes
807 self._maybe_coerce_merge_keys()
808
809 # If argument passed to validate,
810 # check if columns specified as unique
811 # are in fact unique.
812 if validate is not None:
813 self._validate_validate_kwd(validate)
814
815 def _maybe_require_matching_dtypes(
816 self, left_join_keys: list[ArrayLike], right_join_keys: list[ArrayLike]
817 ) -> None:
818 # Overridden by AsOfMerge
819 pass
820
821 def _validate_tolerance(self, left_join_keys: list[ArrayLike]) -> None:
822 # Overridden by AsOfMerge
823 pass
824
825 @final
826 def _reindex_and_concat(
827 self,
828 join_index: Index,
829 left_indexer: npt.NDArray[np.intp] | None,
830 right_indexer: npt.NDArray[np.intp] | None,
831 copy: bool | None,
832 ) -> DataFrame:
833 """
834 reindex along index and concat along columns.
835 """
836 # Take views so we do not alter the originals
837 left = self.left[:]
838 right = self.right[:]
839
840 llabels, rlabels = _items_overlap_with_suffix(
841 self.left._info_axis, self.right._info_axis, self.suffixes
842 )
843
844 if left_indexer is not None and not is_range_indexer(left_indexer, len(left)):
845 # Pinning the index here (and in the right code just below) is not
846 # necessary, but makes the `.take` more performant if we have e.g.
847 # a MultiIndex for left.index.
848 lmgr = left._mgr.reindex_indexer(
849 join_index,
850 left_indexer,
851 axis=1,
852 copy=False,
853 only_slice=True,
854 allow_dups=True,
855 use_na_proxy=True,
856 )
857 left = left._constructor_from_mgr(lmgr, axes=lmgr.axes)
858 left.index = join_index
859
860 if right_indexer is not None and not is_range_indexer(
861 right_indexer, len(right)
862 ):
863 rmgr = right._mgr.reindex_indexer(
864 join_index,
865 right_indexer,
866 axis=1,
867 copy=False,
868 only_slice=True,
869 allow_dups=True,
870 use_na_proxy=True,
871 )
872 right = right._constructor_from_mgr(rmgr, axes=rmgr.axes)
873 right.index = join_index
874
875 from pandas import concat
876
877 left.columns = llabels
878 right.columns = rlabels
879 result = concat([left, right], axis=1, copy=copy)
880 return result
881
882 def get_result(self, copy: bool | None = True) -> DataFrame:
883 if self.indicator:
884 self.left, self.right = self._indicator_pre_merge(self.left, self.right)
885
886 join_index, left_indexer, right_indexer = self._get_join_info()
887
888 result = self._reindex_and_concat(
889 join_index, left_indexer, right_indexer, copy=copy
890 )
891 result = result.__finalize__(self, method=self._merge_type)
892
893 if self.indicator:
894 result = self._indicator_post_merge(result)
895
896 self._maybe_add_join_keys(result, left_indexer, right_indexer)
897
898 self._maybe_restore_index_levels(result)
899
900 return result.__finalize__(self, method="merge")
901
902 @final
903 @cache_readonly
904 def _indicator_name(self) -> str | None:
905 if isinstance(self.indicator, str):
906 return self.indicator
907 elif isinstance(self.indicator, bool):
908 return "_merge" if self.indicator else None
909 else:
910 raise ValueError(
911 "indicator option can only accept boolean or string arguments"
912 )
913
914 @final
915 def _indicator_pre_merge(
916 self, left: DataFrame, right: DataFrame
917 ) -> tuple[DataFrame, DataFrame]:
918 columns = left.columns.union(right.columns)
919
920 for i in ["_left_indicator", "_right_indicator"]:
921 if i in columns:
922 raise ValueError(
923 "Cannot use `indicator=True` option when "
924 f"data contains a column named {i}"
925 )
926 if self._indicator_name in columns:
927 raise ValueError(
928 "Cannot use name of an existing column for indicator column"
929 )
930
931 left = left.copy()
932 right = right.copy()
933
934 left["_left_indicator"] = 1
935 left["_left_indicator"] = left["_left_indicator"].astype("int8")
936
937 right["_right_indicator"] = 2
938 right["_right_indicator"] = right["_right_indicator"].astype("int8")
939
940 return left, right
941
942 @final
943 def _indicator_post_merge(self, result: DataFrame) -> DataFrame:
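        # The two int8 markers sum to a row-provenance code: 1 means the row
        # came only from the left frame, 2 only from the right, 3 from both.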
944 result["_left_indicator"] = result["_left_indicator"].fillna(0)
945 result["_right_indicator"] = result["_right_indicator"].fillna(0)
946
947 result[self._indicator_name] = Categorical(
948 (result["_left_indicator"] + result["_right_indicator"]),
949 categories=[1, 2, 3],
950 )
951 result[self._indicator_name] = result[
952 self._indicator_name
953 ].cat.rename_categories(["left_only", "right_only", "both"])
954
955 result = result.drop(labels=["_left_indicator", "_right_indicator"], axis=1)
956 return result
957
958 @final
959 def _maybe_restore_index_levels(self, result: DataFrame) -> None:
960 """
961 Restore index levels specified as `on` parameters
962
963 Here we check for cases where `self.left_on` and `self.right_on` pairs
964 each reference an index level in their respective DataFrames. The
965 joined columns corresponding to these pairs are then restored to the
966 index of `result`.
967
968 **Note:** This method has side effects. It modifies `result` in-place
969
970 Parameters
971 ----------
972 result: DataFrame
973 merge result
974
975 Returns
976 -------
977 None
978 """
979 names_to_restore = []
980 for name, left_key, right_key in zip(
981 self.join_names, self.left_on, self.right_on
982 ):
983 if (
984 # Argument 1 to "_is_level_reference" of "NDFrame" has incompatible
985 # type "Union[Hashable, ExtensionArray, Index, Series]"; expected
986 # "Hashable"
987 self.orig_left._is_level_reference(left_key) # type: ignore[arg-type]
988 # Argument 1 to "_is_level_reference" of "NDFrame" has incompatible
989 # type "Union[Hashable, ExtensionArray, Index, Series]"; expected
990 # "Hashable"
991 and self.orig_right._is_level_reference(
992 right_key # type: ignore[arg-type]
993 )
994 and left_key == right_key
995 and name not in result.index.names
996 ):
997 names_to_restore.append(name)
998
999 if names_to_restore:
1000 result.set_index(names_to_restore, inplace=True)
1001
1002 @final
1003 def _maybe_add_join_keys(
1004 self,
1005 result: DataFrame,
1006 left_indexer: npt.NDArray[np.intp] | None,
1007 right_indexer: npt.NDArray[np.intp] | None,
1008 ) -> None:
1009 left_has_missing = None
1010 right_has_missing = None
1011
1012 assert all(isinstance(x, _known) for x in self.left_join_keys)
1013
1014 keys = zip(self.join_names, self.left_on, self.right_on)
1015 for i, (name, lname, rname) in enumerate(keys):
1016 if not _should_fill(lname, rname):
1017 continue
1018
1019 take_left, take_right = None, None
1020
1021 if name in result:
1022 if left_indexer is not None or right_indexer is not None:
1023 if name in self.left:
1024 if left_has_missing is None:
1025 left_has_missing = (
1026 False
1027 if left_indexer is None
1028 else (left_indexer == -1).any()
1029 )
1030
1031 if left_has_missing:
1032 take_right = self.right_join_keys[i]
1033
1034 if result[name].dtype != self.left[name].dtype:
1035 take_left = self.left[name]._values
1036
1037 elif name in self.right:
1038 if right_has_missing is None:
1039 right_has_missing = (
1040 False
1041 if right_indexer is None
1042 else (right_indexer == -1).any()
1043 )
1044
1045 if right_has_missing:
1046 take_left = self.left_join_keys[i]
1047
1048 if result[name].dtype != self.right[name].dtype:
1049 take_right = self.right[name]._values
1050
1051 else:
1052 take_left = self.left_join_keys[i]
1053 take_right = self.right_join_keys[i]
1054
1055 if take_left is not None or take_right is not None:
1056 if take_left is None:
1057 lvals = result[name]._values
1058 elif left_indexer is None:
1059 lvals = take_left
1060 else:
1061 # TODO: can we pin down take_left's type earlier?
1062 take_left = extract_array(take_left, extract_numpy=True)
1063 lfill = na_value_for_dtype(take_left.dtype)
1064 lvals = algos.take_nd(take_left, left_indexer, fill_value=lfill)
1065
1066 if take_right is None:
1067 rvals = result[name]._values
1068 elif right_indexer is None:
1069 rvals = take_right
1070 else:
1071 # TODO: can we pin down take_right's type earlier?
1072 taker = extract_array(take_right, extract_numpy=True)
1073 rfill = na_value_for_dtype(taker.dtype)
1074 rvals = algos.take_nd(taker, right_indexer, fill_value=rfill)
1075
1076 # if we have an all missing left_indexer
1077 # make sure to just use the right values or vice-versa
1078 if left_indexer is not None and (left_indexer == -1).all():
1079 key_col = Index(rvals)
1080 result_dtype = rvals.dtype
1081 elif right_indexer is not None and (right_indexer == -1).all():
1082 key_col = Index(lvals)
1083 result_dtype = lvals.dtype
1084 else:
1085 key_col = Index(lvals)
1086 if left_indexer is not None:
1087 mask_left = left_indexer == -1
1088 key_col = key_col.where(~mask_left, rvals)
1089 result_dtype = find_common_type([lvals.dtype, rvals.dtype])
1090 if (
1091 lvals.dtype.kind == "M"
1092 and rvals.dtype.kind == "M"
1093 and result_dtype.kind == "O"
1094 ):
1095 # TODO(non-nano) Workaround for common_type not dealing
1096 # with different resolutions
1097 result_dtype = key_col.dtype
1098
1099 if result._is_label_reference(name):
1100 result[name] = result._constructor_sliced(
1101 key_col, dtype=result_dtype, index=result.index
1102 )
1103 elif result._is_level_reference(name):
1104 if isinstance(result.index, MultiIndex):
1105 key_col.name = name
1106 idx_list = [
1107 result.index.get_level_values(level_name)
1108 if level_name != name
1109 else key_col
1110 for level_name in result.index.names
1111 ]
1112
1113 result.set_index(idx_list, inplace=True)
1114 else:
1115 result.index = Index(key_col, name=name)
1116 else:
1117 result.insert(i, name or f"key_{i}", key_col)
1118
1119 def _get_join_indexers(
1120 self,
1121 ) -> tuple[npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
1122 """return the join indexers"""
1123 # make mypy happy
1124 assert self.how != "asof"
1125 return get_join_indexers(
1126 self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how
1127 )
1128
1129 @final
1130 def _get_join_info(
1131 self,
1132 ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
1133 left_ax = self.left.index
1134 right_ax = self.right.index
1135
1136 if self.left_index and self.right_index and self.how != "asof":
1137 join_index, left_indexer, right_indexer = left_ax.join(
1138 right_ax, how=self.how, return_indexers=True, sort=self.sort
1139 )
1140
1141 elif self.right_index and self.how == "left":
1142 join_index, left_indexer, right_indexer = _left_join_on_index(
1143 left_ax, right_ax, self.left_join_keys, sort=self.sort
1144 )
1145
1146 elif self.left_index and self.how == "right":
1147 join_index, right_indexer, left_indexer = _left_join_on_index(
1148 right_ax, left_ax, self.right_join_keys, sort=self.sort
1149 )
1150 else:
1151 (left_indexer, right_indexer) = self._get_join_indexers()
1152
1153 if self.right_index:
1154 if len(self.left) > 0:
1155 join_index = self._create_join_index(
1156 left_ax,
1157 right_ax,
1158 left_indexer,
1159 how="right",
1160 )
1161 elif right_indexer is None:
1162 join_index = right_ax.copy()
1163 else:
1164 join_index = right_ax.take(right_indexer)
1165 elif self.left_index:
1166 if self.how == "asof":
1167 # GH#33463 asof should always behave like a left merge
1168 join_index = self._create_join_index(
1169 left_ax,
1170 right_ax,
1171 left_indexer,
1172 how="left",
1173 )
1174
1175 elif len(self.right) > 0:
1176 join_index = self._create_join_index(
1177 right_ax,
1178 left_ax,
1179 right_indexer,
1180 how="left",
1181 )
1182 elif left_indexer is None:
1183 join_index = left_ax.copy()
1184 else:
1185 join_index = left_ax.take(left_indexer)
1186 else:
1187 n = len(left_ax) if left_indexer is None else len(left_indexer)
1188 join_index = default_index(n)
1189
1190 return join_index, left_indexer, right_indexer
1191
1192 @final
1193 def _create_join_index(
1194 self,
1195 index: Index,
1196 other_index: Index,
1197 indexer: npt.NDArray[np.intp] | None,
1198 how: JoinHow = "left",
1199 ) -> Index:
1200 """
1201 Create a join index by rearranging one index to match another
1202
1203 Parameters
1204 ----------
1205 index : Index
1206 index being rearranged
1207 other_index : Index
1208 used to supply values not found in index
1209 indexer : np.ndarray[np.intp] or None
1210 how to rearrange index
1211 how : str
            Replacement is only necessary if the indexer is based on other_index.

        Returns
        -------
        Index
        """
        if self.how in (how, "outer") and not isinstance(other_index, MultiIndex):
            # if final index requires values in other_index but not target
            # index, indexer may hold missing (-1) values, causing Index.take
            # to take the final value in target index. So, we set the last
            # element to be the desired fill value. We do not use allow_fill
            # and fill_value because it throws a ValueError on integer indices
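            # e.g. index=[10, 20] with indexer=[0, -1]: after appending the NA
            # sentinel, take yields [10, NaN] (illustrative values).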
            mask = indexer == -1
            if np.any(mask):
                fill_value = na_value_for_dtype(index.dtype, compat=False)
                index = index.append(Index([fill_value]))
        if indexer is None:
            return index.copy()
        return index.take(indexer)

    @final
    def _get_merge_keys(
        self,
    ) -> tuple[
        list[ArrayLike],
        list[ArrayLike],
        list[Hashable],
        list[Hashable],
        list[Hashable],
    ]:
        """
        Returns
        -------
        left_keys, right_keys, join_names, left_drop, right_drop
        """
        left_keys: list[ArrayLike] = []
        right_keys: list[ArrayLike] = []
        join_names: list[Hashable] = []
        right_drop: list[Hashable] = []
        left_drop: list[Hashable] = []

        left, right = self.left, self.right

        is_lkey = lambda x: isinstance(x, _known) and len(x) == len(left)
        is_rkey = lambda x: isinstance(x, _known) and len(x) == len(right)

        # Note that pd.merge_asof() has separate 'on' and 'by' parameters. A
        # user could, for example, request 'left_index' and 'left_by'. In a
        # regular pd.merge(), users cannot specify both 'left_index' and
        # 'left_on'. (Instead, users have a MultiIndex). That means the
        # self.left_on in this function is always empty in a pd.merge(), but
        # a pd.merge_asof(left_index=True, left_by=...) will result in a
        # self.left_on array with a None in the middle of it. This requires
        # a work-around as designated in the code below.
        # See _validate_left_right_on() for where this happens.

        # ugh, spaghetti re #733
        if _any(self.left_on) and _any(self.right_on):
            for lk, rk in zip(self.left_on, self.right_on):
                lk = extract_array(lk, extract_numpy=True)
                rk = extract_array(rk, extract_numpy=True)
                if is_lkey(lk):
                    lk = cast(ArrayLike, lk)
                    left_keys.append(lk)
                    if is_rkey(rk):
                        rk = cast(ArrayLike, rk)
                        right_keys.append(rk)
                        join_names.append(None)  # what to do?
                    else:
                        # Then we're either Hashable or a wrong-length arraylike,
                        # the latter of which will raise
                        rk = cast(Hashable, rk)
                        if rk is not None:
                            right_keys.append(right._get_label_or_level_values(rk))
                            join_names.append(rk)
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index._values)
                            join_names.append(right.index.name)
                else:
                    if not is_rkey(rk):
                        # Then we're either Hashable or a wrong-length arraylike,
                        # the latter of which will raise
                        rk = cast(Hashable, rk)
                        if rk is not None:
                            right_keys.append(right._get_label_or_level_values(rk))
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index._values)
                        if lk is not None and lk == rk:  # FIXME: what about other NAs?
                            right_drop.append(rk)
                    else:
                        rk = cast(ArrayLike, rk)
                        right_keys.append(rk)
                    if lk is not None:
                        # Then we're either Hashable or a wrong-length arraylike,
                        # the latter of which will raise
                        lk = cast(Hashable, lk)
                        left_keys.append(left._get_label_or_level_values(lk))
                        join_names.append(lk)
                    else:
                        # work-around for merge_asof(left_index=True)
                        left_keys.append(left.index._values)
                        join_names.append(left.index.name)
        elif _any(self.left_on):
            for k in self.left_on:
                if is_lkey(k):
                    k = extract_array(k, extract_numpy=True)
                    k = cast(ArrayLike, k)
                    left_keys.append(k)
                    join_names.append(None)
                else:
                    # Then we're either Hashable or a wrong-length arraylike,
                    # the latter of which will raise
                    k = cast(Hashable, k)
                    left_keys.append(left._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.right.index, MultiIndex):
                right_keys = [
                    lev._values.take(lev_codes)
                    for lev, lev_codes in zip(
                        self.right.index.levels, self.right.index.codes
                    )
                ]
            else:
                right_keys = [self.right.index._values]
        elif _any(self.right_on):
            for k in self.right_on:
                k = extract_array(k, extract_numpy=True)
                if is_rkey(k):
                    k = cast(ArrayLike, k)
                    right_keys.append(k)
                    join_names.append(None)
                else:
                    # Then we're either Hashable or a wrong-length arraylike,
                    # the latter of which will raise
                    k = cast(Hashable, k)
                    right_keys.append(right._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.left.index, MultiIndex):
                left_keys = [
                    lev._values.take(lev_codes)
                    for lev, lev_codes in zip(
                        self.left.index.levels, self.left.index.codes
                    )
                ]
            else:
                left_keys = [self.left.index._values]

        return left_keys, right_keys, join_names, left_drop, right_drop

    @final
    def _maybe_coerce_merge_keys(self) -> None:
        # we have valid merges but we may have to further
        # coerce these if they are originally incompatible types
        #
        # for example if these are categorical, but are not dtype_equal
        # or if we have object and integer dtypes

        for lk, rk, name in zip(
            self.left_join_keys, self.right_join_keys, self.join_names
        ):
            if (len(lk) and not len(rk)) or (not len(lk) and len(rk)):
                continue

            lk = extract_array(lk, extract_numpy=True)
            rk = extract_array(rk, extract_numpy=True)

            lk_is_cat = isinstance(lk.dtype, CategoricalDtype)
            rk_is_cat = isinstance(rk.dtype, CategoricalDtype)
            lk_is_object_or_string = is_object_dtype(lk.dtype) or is_string_dtype(
                lk.dtype
            )
            rk_is_object_or_string = is_object_dtype(rk.dtype) or is_string_dtype(
                rk.dtype
            )

            # if either left or right is a categorical
            # then they must match exactly in categories & ordered
            if lk_is_cat and rk_is_cat:
                lk = cast(Categorical, lk)
                rk = cast(Categorical, rk)
                if lk._categories_match_up_to_permutation(rk):
                    continue

            elif lk_is_cat or rk_is_cat:
                pass

            elif lk.dtype == rk.dtype:
                continue

            msg = (
                f"You are trying to merge on {lk.dtype} and {rk.dtype} columns "
                f"for key '{name}'. If you wish to proceed you should use pd.concat"
            )

            # if we are numeric, then allow differing
            # kinds to proceed, e.g. int64 and int8, int and float;
            # further, if we are object but infer to
            # the same type, then proceed
            if is_numeric_dtype(lk.dtype) and is_numeric_dtype(rk.dtype):
                if lk.dtype.kind == rk.dtype.kind:
                    continue

                if isinstance(lk.dtype, ExtensionDtype) and not isinstance(
                    rk.dtype, ExtensionDtype
                ):
                    ct = find_common_type([lk.dtype, rk.dtype])
                    if isinstance(ct, ExtensionDtype):
                        com_cls = ct.construct_array_type()
                        rk = com_cls._from_sequence(rk, dtype=ct, copy=False)
                    else:
                        rk = rk.astype(ct)
                elif isinstance(rk.dtype, ExtensionDtype):
                    ct = find_common_type([lk.dtype, rk.dtype])
                    if isinstance(ct, ExtensionDtype):
                        com_cls = ct.construct_array_type()
                        lk = com_cls._from_sequence(lk, dtype=ct, copy=False)
                    else:
                        lk = lk.astype(ct)

                # check whether ints and floats
                if is_integer_dtype(rk.dtype) and is_float_dtype(lk.dtype):
                    # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int
                    with np.errstate(invalid="ignore"):
                        # error: Argument 1 to "astype" of "ndarray" has incompatible
                        # type "Union[ExtensionDtype, Any, dtype[Any]]"; expected
                        # "Union[dtype[Any], Type[Any], _SupportsDType[dtype[Any]]]"
                        casted = lk.astype(rk.dtype)  # type: ignore[arg-type]

                        mask = ~np.isnan(lk)
                        match = lk == casted
                        if not match[mask].all():
                            warnings.warn(
                                "You are merging on int and float "
                                "columns where the float values "
                                "are not equal to their int representation.",
                                UserWarning,
                                stacklevel=find_stack_level(),
                            )
                    continue

                if is_float_dtype(rk.dtype) and is_integer_dtype(lk.dtype):
                    # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int
                    with np.errstate(invalid="ignore"):
                        # error: Argument 1 to "astype" of "ndarray" has incompatible
                        # type "Union[ExtensionDtype, Any, dtype[Any]]"; expected
                        # "Union[dtype[Any], Type[Any], _SupportsDType[dtype[Any]]]"
                        casted = rk.astype(lk.dtype)  # type: ignore[arg-type]

                        mask = ~np.isnan(rk)
                        match = rk == casted
                        if not match[mask].all():
                            warnings.warn(
                                "You are merging on int and float "
                                "columns where the float values "
                                "are not equal to their int representation.",
                                UserWarning,
                                stacklevel=find_stack_level(),
                            )
                    continue

                # let's infer and see if we are ok
                if lib.infer_dtype(lk, skipna=False) == lib.infer_dtype(
                    rk, skipna=False
                ):
                    continue

            # Check if we are trying to merge on obviously
            # incompatible dtypes GH 9780, GH 15800

            # bool values are coerced to object
            elif (lk_is_object_or_string and is_bool_dtype(rk.dtype)) or (
                is_bool_dtype(lk.dtype) and rk_is_object_or_string
            ):
                pass

            # object values are allowed to be merged
            elif (lk_is_object_or_string and is_numeric_dtype(rk.dtype)) or (
                is_numeric_dtype(lk.dtype) and rk_is_object_or_string
            ):
                inferred_left = lib.infer_dtype(lk, skipna=False)
                inferred_right = lib.infer_dtype(rk, skipna=False)
                bool_types = ["integer", "mixed-integer", "boolean", "empty"]
                string_types = ["string", "unicode", "mixed", "bytes", "empty"]

                # inferred bool
                if inferred_left in bool_types and inferred_right in bool_types:
                    pass

                # unless we are merging non-string-like with string-like
                elif (
                    inferred_left in string_types and inferred_right not in string_types
                ) or (
                    inferred_right in string_types and inferred_left not in string_types
                ):
                    raise ValueError(msg)

            # datetimelikes must match exactly
            elif needs_i8_conversion(lk.dtype) and not needs_i8_conversion(rk.dtype):
                raise ValueError(msg)
            elif not needs_i8_conversion(lk.dtype) and needs_i8_conversion(rk.dtype):
                raise ValueError(msg)
            elif isinstance(lk.dtype, DatetimeTZDtype) and not isinstance(
                rk.dtype, DatetimeTZDtype
            ):
                raise ValueError(msg)
            elif not isinstance(lk.dtype, DatetimeTZDtype) and isinstance(
                rk.dtype, DatetimeTZDtype
            ):
                raise ValueError(msg)
            elif (
                isinstance(lk.dtype, DatetimeTZDtype)
                and isinstance(rk.dtype, DatetimeTZDtype)
            ) or (lk.dtype.kind == "M" and rk.dtype.kind == "M"):
                # allows datetime with different resolutions
                continue
            # datetime and timedelta not allowed
            elif lk.dtype.kind == "M" and rk.dtype.kind == "m":
                raise ValueError(msg)
            elif lk.dtype.kind == "m" and rk.dtype.kind == "M":
                raise ValueError(msg)

            elif is_object_dtype(lk.dtype) and is_object_dtype(rk.dtype):
                continue

            # Houston, we have a problem!
            # let's coerce to object if the dtypes aren't
            # categorical, otherwise coerce to the category
            # dtype. If we coerced categories to object,
            # then we would lose type information on some
            # columns, and end up trying to merge
            # incompatible dtypes. See GH 16900.
            if name in self.left.columns:
                typ = cast(Categorical, lk).categories.dtype if lk_is_cat else object
                self.left = self.left.copy()
                self.left[name] = self.left[name].astype(typ)
            if name in self.right.columns:
                typ = cast(Categorical, rk).categories.dtype if rk_is_cat else object
                self.right = self.right.copy()
                self.right[name] = self.right[name].astype(typ)

    def _validate_left_right_on(self, left_on, right_on):
        left_on = com.maybe_make_list(left_on)
        right_on = com.maybe_make_list(right_on)

        # Hm, any way to make this logic less complicated??
        if self.on is None and left_on is None and right_on is None:
            if self.left_index and self.right_index:
                left_on, right_on = (), ()
            elif self.left_index:
                raise MergeError("Must pass right_on or right_index=True")
            elif self.right_index:
                raise MergeError("Must pass left_on or left_index=True")
            else:
                # use the common columns
                left_cols = self.left.columns
                right_cols = self.right.columns
                common_cols = left_cols.intersection(right_cols)
                if len(common_cols) == 0:
                    raise MergeError(
                        "No common columns to perform merge on. "
                        f"Merge options: left_on={left_on}, "
                        f"right_on={right_on}, "
                        f"left_index={self.left_index}, "
                        f"right_index={self.right_index}"
                    )
                if (
                    not left_cols.join(common_cols, how="inner").is_unique
                    or not right_cols.join(common_cols, how="inner").is_unique
                ):
                    raise MergeError(f"Data columns not unique: {repr(common_cols)}")
                left_on = right_on = common_cols
        elif self.on is not None:
            if left_on is not None or right_on is not None:
                raise MergeError(
                    'Can only pass argument "on" OR "left_on" '
                    'and "right_on", not a combination of both.'
                )
            if self.left_index or self.right_index:
                raise MergeError(
                    'Can only pass argument "on" OR "left_index" '
                    'and "right_index", not a combination of both.'
                )
            left_on = right_on = self.on
        elif left_on is not None:
            if self.left_index:
                raise MergeError(
                    'Can only pass argument "left_on" OR "left_index" not both.'
                )
            if not self.right_index and right_on is None:
                raise MergeError('Must pass "right_on" OR "right_index".')
            n = len(left_on)
            if self.right_index:
                if len(left_on) != self.right.index.nlevels:
                    raise ValueError(
                        "len(left_on) must equal the number "
                        'of levels in the index of "right"'
                    )
                right_on = [None] * n
        elif right_on is not None:
            if self.right_index:
                raise MergeError(
                    'Can only pass argument "right_on" OR "right_index" not both.'
                )
            if not self.left_index and left_on is None:
                raise MergeError('Must pass "left_on" OR "left_index".')
            n = len(right_on)
            if self.left_index:
                if len(right_on) != self.left.index.nlevels:
                    raise ValueError(
                        "len(right_on) must equal the number "
                        'of levels in the index of "left"'
                    )
                left_on = [None] * n
        if len(right_on) != len(left_on):
            raise ValueError("len(right_on) must equal len(left_on)")

        return left_on, right_on

    @final
    def _validate_validate_kwd(self, validate: str) -> None:
        # Check uniqueness of each
        if self.left_index:
            left_unique = self.orig_left.index.is_unique
        else:
            left_unique = MultiIndex.from_arrays(self.left_join_keys).is_unique

        if self.right_index:
            right_unique = self.orig_right.index.is_unique
        else:
            right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique

        # Check data integrity
        if validate in ["one_to_one", "1:1"]:
            if not left_unique and not right_unique:
                raise MergeError(
                    "Merge keys are not unique in either left "
                    "or right dataset; not a one-to-one merge"
                )
            if not left_unique:
                raise MergeError(
                    "Merge keys are not unique in left dataset; not a one-to-one merge"
                )
            if not right_unique:
                raise MergeError(
                    "Merge keys are not unique in right dataset; not a one-to-one merge"
                )

        elif validate in ["one_to_many", "1:m"]:
            if not left_unique:
                raise MergeError(
                    "Merge keys are not unique in left dataset; not a one-to-many merge"
                )

        elif validate in ["many_to_one", "m:1"]:
            if not right_unique:
                raise MergeError(
                    "Merge keys are not unique in right dataset; "
                    "not a many-to-one merge"
                )

        elif validate in ["many_to_many", "m:m"]:
            pass

        else:
            raise ValueError(
                f'"{validate}" is not a valid argument. '
                "Valid arguments are:\n"
                '- "1:1"\n'
                '- "1:m"\n'
                '- "m:1"\n'
                '- "m:m"\n'
                '- "one_to_one"\n'
                '- "one_to_many"\n'
                '- "many_to_one"\n'
                '- "many_to_many"'
            )


def get_join_indexers(
    left_keys: list[ArrayLike],
    right_keys: list[ArrayLike],
    sort: bool = False,
    how: JoinHow = "inner",
) -> tuple[npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
1698 """
1699
    Parameters
    ----------
    left_keys : list[ndarray, ExtensionArray, Index, Series]
    right_keys : list[ndarray, ExtensionArray, Index, Series]
    sort : bool, default False
    how : {'inner', 'outer', 'left', 'right'}, default 'inner'

    Returns
    -------
    np.ndarray[np.intp] or None
        Indexer into the left_keys.
    np.ndarray[np.intp] or None
        Indexer into the right_keys.
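
    Examples
    --------
    Illustrative only (``None`` means "take all rows in order"):

    >>> left_keys = [np.array([0, 1, 2], dtype=np.int64)]
    >>> right_keys = [np.array([1, 2, 3], dtype=np.int64)]
    >>> get_join_indexers(left_keys, right_keys, how="inner")  # doctest: +SKIP
    (array([1, 2]), array([0, 1]))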
1713 """
1714 assert len(left_keys) == len(
1715 right_keys
1716 ), "left_keys and right_keys must be the same length"
1717
1718 # fast-path for empty left/right
1719 left_n = len(left_keys[0])
1720 right_n = len(right_keys[0])
1721 if left_n == 0:
1722 if how in ["left", "inner"]:
1723 return _get_empty_indexer()
1724 elif not sort and how in ["right", "outer"]:
1725 return _get_no_sort_one_missing_indexer(right_n, True)
1726 elif right_n == 0:
1727 if how in ["right", "inner"]:
1728 return _get_empty_indexer()
1729 elif not sort and how in ["left", "outer"]:
1730 return _get_no_sort_one_missing_indexer(left_n, False)
1731
1732 lkey: ArrayLike
1733 rkey: ArrayLike
1734 if len(left_keys) > 1:
1735 # get left & right join labels and num. of levels at each location
1736 mapped = (
1737 _factorize_keys(left_keys[n], right_keys[n], sort=sort)
1738 for n in range(len(left_keys))
1739 )
1740 zipped = zip(*mapped)
1741 llab, rlab, shape = (list(x) for x in zipped)
1742
1743 # get flat i8 keys from label lists
1744 lkey, rkey = _get_join_keys(llab, rlab, tuple(shape), sort)
1745 else:
1746 lkey = left_keys[0]
1747 rkey = right_keys[0]
1748
1749 left = Index(lkey)
1750 right = Index(rkey)
1751
1752 if (
1753 left.is_monotonic_increasing
1754 and right.is_monotonic_increasing
1755 and (left.is_unique or right.is_unique)
1756 ):
1757 _, lidx, ridx = left.join(right, how=how, return_indexers=True, sort=sort)
1758 else:
1759 lidx, ridx = get_join_indexers_non_unique(
1760 left._values, right._values, sort, how
1761 )
1762
1763 if lidx is not None and is_range_indexer(lidx, len(left)):
1764 lidx = None
1765 if ridx is not None and is_range_indexer(ridx, len(right)):
1766 ridx = None
1767 return lidx, ridx
1768
1769
1770def get_join_indexers_non_unique(
1771 left: ArrayLike,
1772 right: ArrayLike,
1773 sort: bool = False,
1774 how: JoinHow = "inner",
1775) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
1776 """
1777 Get join indexers for left and right.
1778
1779 Parameters
1780 ----------
1781 left : ArrayLike
1782 right : ArrayLike
1783 sort : bool, default False
1784 how : {'inner', 'outer', 'left', 'right'}, default 'inner'
1785
1786 Returns
1787 -------
1788 np.ndarray[np.intp]
1789 Indexer into left.
1790 np.ndarray[np.intp]
1791 Indexer into right.
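
    Examples
    --------
    Illustrative only (repeated keys fan out to all matching pairs):

    >>> left = np.array([1, 1, 2], dtype=np.int64)
    >>> right = np.array([1, 2, 2], dtype=np.int64)
    >>> get_join_indexers_non_unique(left, right, how="inner")  # doctest: +SKIP
    (array([0, 1, 2, 2]), array([0, 0, 1, 2]))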
1792 """
1793 lkey, rkey, count = _factorize_keys(left, right, sort=sort)
1794 if how == "left":
1795 lidx, ridx = libjoin.left_outer_join(lkey, rkey, count, sort=sort)
1796 elif how == "right":
1797 ridx, lidx = libjoin.left_outer_join(rkey, lkey, count, sort=sort)
1798 elif how == "inner":
1799 lidx, ridx = libjoin.inner_join(lkey, rkey, count, sort=sort)
1800 elif how == "outer":
1801 lidx, ridx = libjoin.full_outer_join(lkey, rkey, count)
1802 return lidx, ridx
1803
1804
1805def restore_dropped_levels_multijoin(
1806 left: MultiIndex,
1807 right: MultiIndex,
1808 dropped_level_names,
1809 join_index: Index,
1810 lindexer: npt.NDArray[np.intp],
1811 rindexer: npt.NDArray[np.intp],
1812) -> tuple[FrozenList, FrozenList, FrozenList]:
1813 """
1814 *this is an internal non-public method*
1815
1816 Returns the levels, labels and names of a multi-index to multi-index join.
1817 Depending on the type of join, this method restores the appropriate
1818 dropped levels of the joined multi-index.
1819 The method relies on lindexer, rindexer which hold the index positions of
1820 left and right, where a join was feasible
1821
1822 Parameters
1823 ----------
1824 left : MultiIndex
1825 left index
1826 right : MultiIndex
1827 right index
1828 dropped_level_names : str array
1829 list of non-common level names
1830 join_index : Index
1831 the index of the join between the
1832 common levels of left and right
1833 lindexer : np.ndarray[np.intp]
1834 left indexer
1835 rindexer : np.ndarray[np.intp]
1836 right indexer
1837
1838 Returns
1839 -------
1840 levels : list of Index
1841 levels of combined multiindexes
1842 labels : np.ndarray[np.intp]
1843 labels of combined multiindexes
1844 names : List[Hashable]
1845 names of combined multiindex levels
1846
1847 """
1848
1849 def _convert_to_multiindex(index: Index) -> MultiIndex:
1850 if isinstance(index, MultiIndex):
1851 return index
1852 else:
1853 return MultiIndex.from_arrays([index._values], names=[index.name])
1854
    # For multi-multi joins with one overlapping level,
    # the returned index is of type Index
    # Ensure that join_index is of type MultiIndex
    # so that dropped levels can be appended
1859 join_index = _convert_to_multiindex(join_index)
1860
1861 join_levels = join_index.levels
1862 join_codes = join_index.codes
1863 join_names = join_index.names
1864
1865 # Iterate through the levels that must be restored
1866 for dropped_level_name in dropped_level_names:
1867 if dropped_level_name in left.names:
1868 idx = left
1869 indexer = lindexer
1870 else:
1871 idx = right
1872 indexer = rindexer
1873
1874 # The index of the level name to be restored
1875 name_idx = idx.names.index(dropped_level_name)
1876
1877 restore_levels = idx.levels[name_idx]
1878 # Inject -1 in the codes list where a join was not possible
1879 # IOW indexer[i]=-1
1880 codes = idx.codes[name_idx]
1881 if indexer is None:
1882 restore_codes = codes
1883 else:
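            # indexer entries of -1 (no match on that side) take the
            # fill_value, e.g. codes=[0, 1], indexer=[0, -1, 1] -> [0, -1, 1]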
1884 restore_codes = algos.take_nd(codes, indexer, fill_value=-1)
1885
1886 # error: Cannot determine type of "__add__"
1887 join_levels = join_levels + [restore_levels] # type: ignore[has-type]
1888 join_codes = join_codes + [restore_codes] # type: ignore[has-type]
1889 join_names = join_names + [dropped_level_name]
1890
1891 return join_levels, join_codes, join_names
1892
1893
1894class _OrderedMerge(_MergeOperation):
1895 _merge_type = "ordered_merge"
1896
1897 def __init__(
1898 self,
1899 left: DataFrame | Series,
1900 right: DataFrame | Series,
1901 on: IndexLabel | None = None,
1902 left_on: IndexLabel | None = None,
1903 right_on: IndexLabel | None = None,
1904 left_index: bool = False,
1905 right_index: bool = False,
1906 suffixes: Suffixes = ("_x", "_y"),
1907 fill_method: str | None = None,
1908 how: JoinHow | Literal["asof"] = "outer",
1909 ) -> None:
1910 self.fill_method = fill_method
1911 _MergeOperation.__init__(
1912 self,
1913 left,
1914 right,
1915 on=on,
1916 left_on=left_on,
1917 left_index=left_index,
1918 right_index=right_index,
1919 right_on=right_on,
1920 how=how,
1921 suffixes=suffixes,
1922 sort=True, # factorize sorts
1923 )
1924
1925 def get_result(self, copy: bool | None = True) -> DataFrame:
1926 join_index, left_indexer, right_indexer = self._get_join_info()
1927
1928 left_join_indexer: npt.NDArray[np.intp] | None
1929 right_join_indexer: npt.NDArray[np.intp] | None
1930
1931 if self.fill_method == "ffill":
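            # ffill_indexer forward-fills -1 gaps with the last valid
            # position, e.g. [0, -1, 1] -> [0, 0, 1], so unmatched rows reuse
            # the previous match instead of producing NA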
1932 if left_indexer is None:
1933 left_join_indexer = None
1934 else:
1935 left_join_indexer = libjoin.ffill_indexer(left_indexer)
1936 if right_indexer is None:
1937 right_join_indexer = None
1938 else:
1939 right_join_indexer = libjoin.ffill_indexer(right_indexer)
1940 elif self.fill_method is None:
1941 left_join_indexer = left_indexer
1942 right_join_indexer = right_indexer
1943 else:
1944 raise ValueError("fill_method must be 'ffill' or None")
1945
1946 result = self._reindex_and_concat(
1947 join_index, left_join_indexer, right_join_indexer, copy=copy
1948 )
1949 self._maybe_add_join_keys(result, left_indexer, right_indexer)
1950
1951 return result
1952
1953
1954def _asof_by_function(direction: str):
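    # e.g. direction="backward" resolves to
    # libjoin.asof_join_backward_on_X_by_Y; None is returned if no such
    # function exists for the given direction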
1955 name = f"asof_join_{direction}_on_X_by_Y"
1956 return getattr(libjoin, name, None)
1957
1958
1959class _AsOfMerge(_OrderedMerge):
1960 _merge_type = "asof_merge"
1961
1962 def __init__(
1963 self,
1964 left: DataFrame | Series,
1965 right: DataFrame | Series,
1966 on: IndexLabel | None = None,
1967 left_on: IndexLabel | None = None,
1968 right_on: IndexLabel | None = None,
1969 left_index: bool = False,
1970 right_index: bool = False,
1971 by=None,
1972 left_by=None,
1973 right_by=None,
1974 suffixes: Suffixes = ("_x", "_y"),
1975 how: Literal["asof"] = "asof",
1976 tolerance=None,
1977 allow_exact_matches: bool = True,
1978 direction: str = "backward",
1979 ) -> None:
1980 self.by = by
1981 self.left_by = left_by
1982 self.right_by = right_by
1983 self.tolerance = tolerance
1984 self.allow_exact_matches = allow_exact_matches
1985 self.direction = direction
1986
1987 # check 'direction' is valid
1988 if self.direction not in ["backward", "forward", "nearest"]:
1989 raise MergeError(f"direction invalid: {self.direction}")
1990
1991 # validate allow_exact_matches
1992 if not is_bool(self.allow_exact_matches):
1993 msg = (
1994 "allow_exact_matches must be boolean, "
1995 f"passed {self.allow_exact_matches}"
1996 )
1997 raise MergeError(msg)
1998
1999 _OrderedMerge.__init__(
2000 self,
2001 left,
2002 right,
2003 on=on,
2004 left_on=left_on,
2005 right_on=right_on,
2006 left_index=left_index,
2007 right_index=right_index,
2008 how=how,
2009 suffixes=suffixes,
2010 fill_method=None,
2011 )
2012
2013 def _validate_left_right_on(self, left_on, right_on):
2014 left_on, right_on = super()._validate_left_right_on(left_on, right_on)
2015
        # we only allow 'on' to be a single item
2017 if len(left_on) != 1 and not self.left_index:
2018 raise MergeError("can only asof on a key for left")
2019
2020 if len(right_on) != 1 and not self.right_index:
2021 raise MergeError("can only asof on a key for right")
2022
2023 if self.left_index and isinstance(self.left.index, MultiIndex):
2024 raise MergeError("left can only have one index")
2025
2026 if self.right_index and isinstance(self.right.index, MultiIndex):
2027 raise MergeError("right can only have one index")
2028
2029 # set 'by' columns
2030 if self.by is not None:
2031 if self.left_by is not None or self.right_by is not None:
2032 raise MergeError("Can only pass by OR left_by and right_by")
2033 self.left_by = self.right_by = self.by
2034 if self.left_by is None and self.right_by is not None:
2035 raise MergeError("missing left_by")
2036 if self.left_by is not None and self.right_by is None:
2037 raise MergeError("missing right_by")
2038
2039 # GH#29130 Check that merge keys do not have dtype object
2040 if not self.left_index:
2041 left_on_0 = left_on[0]
2042 if isinstance(left_on_0, _known):
2043 lo_dtype = left_on_0.dtype
2044 else:
2045 lo_dtype = (
2046 self.left._get_label_or_level_values(left_on_0).dtype
2047 if left_on_0 in self.left.columns
                    else self.left.index.get_level_values(left_on_0).dtype
2049 )
2050 else:
2051 lo_dtype = self.left.index.dtype
2052
2053 if not self.right_index:
2054 right_on_0 = right_on[0]
2055 if isinstance(right_on_0, _known):
2056 ro_dtype = right_on_0.dtype
2057 else:
2058 ro_dtype = (
2059 self.right._get_label_or_level_values(right_on_0).dtype
2060 if right_on_0 in self.right.columns
                    else self.right.index.get_level_values(right_on_0).dtype
2062 )
2063 else:
2064 ro_dtype = self.right.index.dtype
2065
2066 if (
2067 is_object_dtype(lo_dtype)
2068 or is_object_dtype(ro_dtype)
2069 or is_string_dtype(lo_dtype)
2070 or is_string_dtype(ro_dtype)
2071 ):
2072 raise MergeError(
2073 f"Incompatible merge dtype, {repr(ro_dtype)} and "
2074 f"{repr(lo_dtype)}, both sides must have numeric dtype"
2075 )
2076
2077 # add 'by' to our key-list so we can have it in the
2078 # output as a key
2079 if self.left_by is not None:
2080 if not is_list_like(self.left_by):
2081 self.left_by = [self.left_by]
2082 if not is_list_like(self.right_by):
2083 self.right_by = [self.right_by]
2084
2085 if len(self.left_by) != len(self.right_by):
2086 raise MergeError("left_by and right_by must be the same length")
2087
2088 left_on = self.left_by + list(left_on)
2089 right_on = self.right_by + list(right_on)
2090
2091 return left_on, right_on
2092
2093 def _maybe_require_matching_dtypes(
2094 self, left_join_keys: list[ArrayLike], right_join_keys: list[ArrayLike]
2095 ) -> None:
2096 # TODO: why do we do this for AsOfMerge but not the others?
2097
2098 def _check_dtype_match(left: ArrayLike, right: ArrayLike, i: int):
2099 if left.dtype != right.dtype:
2100 if isinstance(left.dtype, CategoricalDtype) and isinstance(
2101 right.dtype, CategoricalDtype
2102 ):
2103 # The generic error message is confusing for categoricals.
2104 #
2105 # In this function, the join keys include both the original
2106 # ones of the merge_asof() call, and also the keys passed
2107 # to its by= argument. Unordered but equal categories
2108 # are not supported for the former, but will fail
2109 # later with a ValueError, so we don't *need* to check
2110 # for them here.
2111 msg = (
2112 f"incompatible merge keys [{i}] {repr(left.dtype)} and "
2113 f"{repr(right.dtype)}, both sides category, but not equal ones"
2114 )
2115 else:
2116 msg = (
2117 f"incompatible merge keys [{i}] {repr(left.dtype)} and "
2118 f"{repr(right.dtype)}, must be the same type"
2119 )
2120 raise MergeError(msg)
2121
2122 # validate index types are the same
2123 for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)):
2124 _check_dtype_match(lk, rk, i)
2125
2126 if self.left_index:
2127 lt = self.left.index._values
2128 else:
2129 lt = left_join_keys[-1]
2130
2131 if self.right_index:
2132 rt = self.right.index._values
2133 else:
2134 rt = right_join_keys[-1]
2135
2136 _check_dtype_match(lt, rt, 0)
2137
2138 def _validate_tolerance(self, left_join_keys: list[ArrayLike]) -> None:
2139 # validate tolerance; datetime.timedelta or Timedelta if we have a DTI
2140 if self.tolerance is not None:
2141 if self.left_index:
2142 lt = self.left.index._values
2143 else:
2144 lt = left_join_keys[-1]
2145
2146 msg = (
2147 f"incompatible tolerance {self.tolerance}, must be compat "
2148 f"with type {repr(lt.dtype)}"
2149 )
2150
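            # the tolerance must match the key dtype's family: a Timedelta
            # for datetime-like keys, an int for integer keys, any real
            # number for float keys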
2151 if needs_i8_conversion(lt.dtype) or (
2152 isinstance(lt, ArrowExtensionArray) and lt.dtype.kind in "mM"
2153 ):
2154 if not isinstance(self.tolerance, datetime.timedelta):
2155 raise MergeError(msg)
2156 if self.tolerance < Timedelta(0):
2157 raise MergeError("tolerance must be positive")
2158
2159 elif is_integer_dtype(lt.dtype):
2160 if not is_integer(self.tolerance):
2161 raise MergeError(msg)
2162 if self.tolerance < 0:
2163 raise MergeError("tolerance must be positive")
2164
2165 elif is_float_dtype(lt.dtype):
2166 if not is_number(self.tolerance):
2167 raise MergeError(msg)
2168 # error: Unsupported operand types for > ("int" and "Number")
2169 if self.tolerance < 0: # type: ignore[operator]
2170 raise MergeError("tolerance must be positive")
2171
2172 else:
2173 raise MergeError("key must be integer, timestamp or float")
2174
2175 def _convert_values_for_libjoin(
2176 self, values: AnyArrayLike, side: str
2177 ) -> np.ndarray:
2178 # we require sortedness and non-null values in the join keys
2179 if not Index(values).is_monotonic_increasing:
2180 if isna(values).any():
2181 raise ValueError(f"Merge keys contain null values on {side} side")
2182 raise ValueError(f"{side} keys must be sorted")
2183
2184 if isinstance(values, ArrowExtensionArray):
2185 values = values._maybe_convert_datelike_array()
2186
2187 if needs_i8_conversion(values.dtype):
2188 values = values.view("i8")
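            # e.g. datetime64[ns] values are now raw int64 nanoseconds,
            # which the libjoin asof routines compare directly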
2189
2190 elif isinstance(values, BaseMaskedArray):
2191 # we've verified above that no nulls exist
2192 values = values._data
2193 elif isinstance(values, ExtensionArray):
2194 values = values.to_numpy()
2195
2196 # error: Incompatible return value type (got "Union[ExtensionArray,
2197 # Any, ndarray[Any, Any], ndarray[Any, dtype[Any]], Index, Series]",
2198 # expected "ndarray[Any, Any]")
2199 return values # type: ignore[return-value]
2200
2201 def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
2202 """return the join indexers"""
2203
2204 # values to compare
2205 left_values = (
2206 self.left.index._values if self.left_index else self.left_join_keys[-1]
2207 )
2208 right_values = (
2209 self.right.index._values if self.right_index else self.right_join_keys[-1]
2210 )
2211
2212 # _maybe_require_matching_dtypes already checked for dtype matching
2213 assert left_values.dtype == right_values.dtype
2214
2215 tolerance = self.tolerance
2216 if tolerance is not None:
2217 # TODO: can we reuse a tolerance-conversion function from
2218 # e.g. TimedeltaIndex?
2219 if needs_i8_conversion(left_values.dtype) or (
2220 isinstance(left_values, ArrowExtensionArray)
2221 and left_values.dtype.kind in "mM"
2222 ):
2223 tolerance = Timedelta(tolerance)
2224 # TODO: we have no test cases with PeriodDtype here; probably
2225 # need to adjust tolerance for that case.
2226 if left_values.dtype.kind in "mM":
2227 # Make sure the i8 representation for tolerance
2228 # matches that for left_values/right_values.
2229 if isinstance(left_values, ArrowExtensionArray):
2230 unit = left_values.dtype.pyarrow_dtype.unit
2231 else:
2232 unit = ensure_wrapped_if_datetimelike(left_values).unit
2233 tolerance = tolerance.as_unit(unit)
2234
2235 tolerance = tolerance._value
2236
2237 # initial type conversion as needed
2238 left_values = self._convert_values_for_libjoin(left_values, "left")
2239 right_values = self._convert_values_for_libjoin(right_values, "right")
2240
2241 # a "by" parameter requires special handling
2242 if self.left_by is not None:
2243 # remove 'on' parameter from values if one existed
2244 if self.left_index and self.right_index:
2245 left_join_keys = self.left_join_keys
2246 right_join_keys = self.right_join_keys
2247 else:
2248 left_join_keys = self.left_join_keys[0:-1]
2249 right_join_keys = self.right_join_keys[0:-1]
2250
2251 mapped = [
2252 _factorize_keys(
2253 left_join_keys[n],
2254 right_join_keys[n],
2255 sort=False,
2256 )
2257 for n in range(len(left_join_keys))
2258 ]
2259
2260 if len(left_join_keys) == 1:
2261 left_by_values = mapped[0][0]
2262 right_by_values = mapped[0][1]
2263 else:
2264 arrs = [np.concatenate(m[:2]) for m in mapped]
2265 shape = tuple(m[2] for m in mapped)
2266 group_index = get_group_index(
2267 arrs, shape=shape, sort=False, xnull=False
2268 )
2269 left_len = len(left_join_keys[0])
2270 left_by_values = group_index[:left_len]
2271 right_by_values = group_index[left_len:]
2272
2273 left_by_values = ensure_int64(left_by_values)
2274 right_by_values = ensure_int64(right_by_values)
2275
2276 # choose appropriate function by type
2277 func = _asof_by_function(self.direction)
2278 return func(
2279 left_values,
2280 right_values,
2281 left_by_values,
2282 right_by_values,
2283 self.allow_exact_matches,
2284 tolerance,
2285 )
2286 else:
2287 # choose appropriate function by type
2288 func = _asof_by_function(self.direction)
2289 return func(
2290 left_values,
2291 right_values,
2292 None,
2293 None,
2294 self.allow_exact_matches,
2295 tolerance,
2296 False,
2297 )
2298
2299
2300def _get_multiindex_indexer(
2301 join_keys: list[ArrayLike], index: MultiIndex, sort: bool
2302) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
    # left & right join codes and the number of unique keys at each level
2304 mapped = (
2305 _factorize_keys(index.levels[n]._values, join_keys[n], sort=sort)
2306 for n in range(index.nlevels)
2307 )
2308 zipped = zip(*mapped)
2309 rcodes, lcodes, shape = (list(x) for x in zipped)
2310 if sort:
2311 rcodes = list(map(np.take, rcodes, index.codes))
2312 else:
2313 i8copy = lambda a: a.astype("i8", subok=False, copy=True)
2314 rcodes = list(map(i8copy, index.codes))
2315
2316 # fix right labels if there were any nulls
2317 for i, join_key in enumerate(join_keys):
2318 mask = index.codes[i] == -1
2319 if mask.any():
            # check whether there were already any nulls at this level;
            # if there were, they were factorized to `shape[i] - 1`
2322 a = join_key[lcodes[i] == shape[i] - 1]
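            # "not a[0] != a[0]" is True unless a[0] is NaN; if no null was
            # factorized at this level yet, grow shape to make room for one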
2323 if a.size == 0 or not a[0] != a[0]:
2324 shape[i] += 1
2325
2326 rcodes[i][mask] = shape[i] - 1
2327
2328 # get flat i8 join keys
2329 lkey, rkey = _get_join_keys(lcodes, rcodes, tuple(shape), sort)
2330 return lkey, rkey
2331
2332
2333def _get_empty_indexer() -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
2334 """Return empty join indexers."""
2335 return (
2336 np.array([], dtype=np.intp),
2337 np.array([], dtype=np.intp),
2338 )
2339
2340
2341def _get_no_sort_one_missing_indexer(
2342 n: int, left_missing: bool
2343) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
2344 """
    Return join indexers where one side is selected in full, without
    sorting, and the other side is entirely unmatched (all -1).
2347
2348 Parameters
2349 ----------
2350 n : int
2351 Length of indexers to create.
2352 left_missing : bool
2353 If True, the left indexer will contain only -1's.
2354 If False, the right indexer will contain only -1's.
2355
2356 Returns
2357 -------
2358 np.ndarray[np.intp]
2359 Left indexer
2360 np.ndarray[np.intp]
2361 Right indexer
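
    Examples
    --------
    >>> from pandas.core.reshape.merge import _get_no_sort_one_missing_indexer
    >>> _get_no_sort_one_missing_indexer(3, left_missing=True)
    (array([-1, -1, -1]), array([0, 1, 2]))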
2362 """
2363 idx = np.arange(n, dtype=np.intp)
2364 idx_missing = np.full(shape=n, fill_value=-1, dtype=np.intp)
2365 if left_missing:
2366 return idx_missing, idx
2367 return idx, idx_missing
2368
2369
2370def _left_join_on_index(
2371 left_ax: Index, right_ax: Index, join_keys: list[ArrayLike], sort: bool = False
2372) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp]]:
2373 if isinstance(right_ax, MultiIndex):
2374 lkey, rkey = _get_multiindex_indexer(join_keys, right_ax, sort=sort)
2375 else:
2376 # error: Incompatible types in assignment (expression has type
2377 # "Union[Union[ExtensionArray, ndarray[Any, Any]], Index, Series]",
2378 # variable has type "ndarray[Any, dtype[signedinteger[Any]]]")
2379 lkey = join_keys[0] # type: ignore[assignment]
2380 # error: Incompatible types in assignment (expression has type "Index",
2381 # variable has type "ndarray[Any, dtype[signedinteger[Any]]]")
2382 rkey = right_ax._values # type: ignore[assignment]
2383
2384 left_key, right_key, count = _factorize_keys(lkey, rkey, sort=sort)
2385 left_indexer, right_indexer = libjoin.left_outer_join(
2386 left_key, right_key, count, sort=sort
2387 )
2388
2389 if sort or len(left_ax) != len(left_indexer):
2390 # if asked to sort or there are 1-to-many matches
2391 join_index = left_ax.take(left_indexer)
2392 return join_index, left_indexer, right_indexer
2393
2394 # left frame preserves order & length of its index
2395 return left_ax, None, right_indexer
2396
2397
2398def _factorize_keys(
2399 lk: ArrayLike, rk: ArrayLike, sort: bool = True
2400) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
2401 """
2402 Encode left and right keys as enumerated types.
2403
2404 This is used to get the join indexers to be used when merging DataFrames.
2405
2406 Parameters
2407 ----------
2408 lk : ndarray, ExtensionArray
2409 Left key.
2410 rk : ndarray, ExtensionArray
2411 Right key.
    sort : bool, default True
2413 If True, the encoding is done such that the unique elements in the
2414 keys are sorted.
2415
2416 Returns
2417 -------
2418 np.ndarray[np.intp]
        Left labels, as enumerated type.
    np.ndarray[np.intp]
        Right labels, as enumerated type.
2422 int
2423 Number of unique elements in union of left and right labels.
2424
2425 See Also
2426 --------
2427 merge : Merge DataFrame or named Series objects
2428 with a database-style join.
2429 algorithms.factorize : Encode the object as an enumerated type
2430 or categorical variable.
2431
2432 Examples
2433 --------
2434 >>> lk = np.array(["a", "c", "b"])
2435 >>> rk = np.array(["a", "c"])
2436
2437 Here, the unique values are `'a', 'b', 'c'`. With the default
2438 `sort=True`, the encoding will be `{0: 'a', 1: 'b', 2: 'c'}`:
2439
2440 >>> pd.core.reshape.merge._factorize_keys(lk, rk)
2441 (array([0, 2, 1]), array([0, 2]), 3)
2442
    With `sort=False`, the encoding will correspond to the order
2444 in which the unique elements first appear: `{0: 'a', 1: 'c', 2: 'b'}`:
2445
2446 >>> pd.core.reshape.merge._factorize_keys(lk, rk, sort=False)
2447 (array([0, 1, 2]), array([0, 1]), 3)
2448 """
2449 # TODO: if either is a RangeIndex, we can likely factorize more efficiently?
2450
2451 if (
2452 isinstance(lk.dtype, DatetimeTZDtype) and isinstance(rk.dtype, DatetimeTZDtype)
2453 ) or (lib.is_np_dtype(lk.dtype, "M") and lib.is_np_dtype(rk.dtype, "M")):
2454 # Extract the ndarray (UTC-localized) values
        # Note: we don't need the dtypes to match, as these can still be compared
2456 lk, rk = cast("DatetimeArray", lk)._ensure_matching_resos(rk)
2457 lk = cast("DatetimeArray", lk)._ndarray
2458 rk = cast("DatetimeArray", rk)._ndarray
2459
2460 elif (
2461 isinstance(lk.dtype, CategoricalDtype)
2462 and isinstance(rk.dtype, CategoricalDtype)
2463 and lk.dtype == rk.dtype
2464 ):
2465 assert isinstance(lk, Categorical)
2466 assert isinstance(rk, Categorical)
2467 # Cast rk to encoding so we can compare codes with lk
2468
2469 rk = lk._encode_with_my_categories(rk)
2470
2471 lk = ensure_int64(lk.codes)
2472 rk = ensure_int64(rk.codes)
2473
2474 elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype:
2475 if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or (
2476 isinstance(lk.dtype, StringDtype)
2477 and lk.dtype.storage in ["pyarrow", "pyarrow_numpy"]
2478 ):
2479 import pyarrow as pa
2480 import pyarrow.compute as pc
2481
2482 len_lk = len(lk)
2483 lk = lk._pa_array # type: ignore[attr-defined]
2484 rk = rk._pa_array # type: ignore[union-attr]
2485 dc = (
2486 pa.chunked_array(lk.chunks + rk.chunks) # type: ignore[union-attr]
2487 .combine_chunks()
2488 .dictionary_encode()
2489 )
2490
2491 llab, rlab, count = (
2492 pc.fill_null(dc.indices[slice(len_lk)], -1)
2493 .to_numpy()
2494 .astype(np.intp, copy=False),
2495 pc.fill_null(dc.indices[slice(len_lk, None)], -1)
2496 .to_numpy()
2497 .astype(np.intp, copy=False),
2498 len(dc.dictionary),
2499 )
2500
2501 if sort:
2502 uniques = dc.dictionary.to_numpy(zero_copy_only=False)
2503 llab, rlab = _sort_labels(uniques, llab, rlab)
2504
2505 if dc.null_count > 0:
2506 lmask = llab == -1
2507 lany = lmask.any()
2508 rmask = rlab == -1
2509 rany = rmask.any()
2510 if lany:
2511 np.putmask(llab, lmask, count)
2512 if rany:
2513 np.putmask(rlab, rmask, count)
2514 count += 1
2515 return llab, rlab, count
2516
2517 if not isinstance(lk, BaseMaskedArray) and not (
2518 # exclude arrow dtypes that would get cast to object
2519 isinstance(lk.dtype, ArrowDtype)
2520 and (
2521 is_numeric_dtype(lk.dtype.numpy_dtype)
2522 or is_string_dtype(lk.dtype)
2523 and not sort
2524 )
2525 ):
2526 lk, _ = lk._values_for_factorize()
2527
2528 # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute
2529 # "_values_for_factorize"
2530 rk, _ = rk._values_for_factorize() # type: ignore[union-attr]
2531
2532 if needs_i8_conversion(lk.dtype) and lk.dtype == rk.dtype:
2533 # GH#23917 TODO: Needs tests for non-matching dtypes
2534 # GH#23917 TODO: needs tests for case where lk is integer-dtype
2535 # and rk is datetime-dtype
2536 lk = np.asarray(lk, dtype=np.int64)
2537 rk = np.asarray(rk, dtype=np.int64)
2538
2539 klass, lk, rk = _convert_arrays_and_get_rizer_klass(lk, rk)
2540
2541 rizer = klass(max(len(lk), len(rk)))
2542
2543 if isinstance(lk, BaseMaskedArray):
2544 assert isinstance(rk, BaseMaskedArray)
2545 llab = rizer.factorize(lk._data, mask=lk._mask)
2546 rlab = rizer.factorize(rk._data, mask=rk._mask)
2547 elif isinstance(lk, ArrowExtensionArray):
2548 assert isinstance(rk, ArrowExtensionArray)
2549 # we can only get here with numeric dtypes
2550 # TODO: Remove when we have a Factorizer for Arrow
2551 llab = rizer.factorize(
2552 lk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=lk.isna()
2553 )
2554 rlab = rizer.factorize(
2555 rk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=rk.isna()
2556 )
2557 else:
        # error: Argument 1 to "factorize" of "ObjectFactorizer" has
        # incompatible type "Union[ndarray[Any, dtype[signedinteger[_64Bit]]],
        # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]"
2561 llab = rizer.factorize(lk) # type: ignore[arg-type]
2562 rlab = rizer.factorize(rk) # type: ignore[arg-type]
2563 assert llab.dtype == np.dtype(np.intp), llab.dtype
2564 assert rlab.dtype == np.dtype(np.intp), rlab.dtype
2565
2566 count = rizer.get_count()
2567
2568 if sort:
2569 uniques = rizer.uniques.to_array()
2570 llab, rlab = _sort_labels(uniques, llab, rlab)
2571
2572 # NA group
2573 lmask = llab == -1
2574 lany = lmask.any()
2575 rmask = rlab == -1
2576 rany = rmask.any()
2577
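    # e.g. llab=[0, -1] becomes llab=[0, count]: -1 sentinels on either side
    # are remapped to one shared trailing group so NA keys match each other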
2578 if lany or rany:
2579 if lany:
2580 np.putmask(llab, lmask, count)
2581 if rany:
2582 np.putmask(rlab, rmask, count)
2583 count += 1
2584
2585 return llab, rlab, count
2586
2587
2588def _convert_arrays_and_get_rizer_klass(
2589 lk: ArrayLike, rk: ArrayLike
2590) -> tuple[type[libhashtable.Factorizer], ArrayLike, ArrayLike]:
2591 klass: type[libhashtable.Factorizer]
2592 if is_numeric_dtype(lk.dtype):
2593 if lk.dtype != rk.dtype:
2594 dtype = find_common_type([lk.dtype, rk.dtype])
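            # e.g. int64 vs. float64 keys promote to float64 so both sides
            # factorize through the same hashtable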
2595 if isinstance(dtype, ExtensionDtype):
2596 cls = dtype.construct_array_type()
2597 if not isinstance(lk, ExtensionArray):
2598 lk = cls._from_sequence(lk, dtype=dtype, copy=False)
2599 else:
2600 lk = lk.astype(dtype, copy=False)
2601
2602 if not isinstance(rk, ExtensionArray):
2603 rk = cls._from_sequence(rk, dtype=dtype, copy=False)
2604 else:
2605 rk = rk.astype(dtype, copy=False)
2606 else:
2607 lk = lk.astype(dtype, copy=False)
2608 rk = rk.astype(dtype, copy=False)
2609 if isinstance(lk, BaseMaskedArray):
            # error: Invalid index type "type" for "Dict[Type[object],
            # Type[Factorizer]]"; expected type "Type[object]"
2612 klass = _factorizers[lk.dtype.type] # type: ignore[index]
2613 elif isinstance(lk.dtype, ArrowDtype):
2614 klass = _factorizers[lk.dtype.numpy_dtype.type]
2615 else:
2616 klass = _factorizers[lk.dtype.type]
2617
2618 else:
2619 klass = libhashtable.ObjectFactorizer
2620 lk = ensure_object(lk)
2621 rk = ensure_object(rk)
2622 return klass, lk, rk
2623
2624
2625def _sort_labels(
2626 uniques: np.ndarray, left: npt.NDArray[np.intp], right: npt.NDArray[np.intp]
2627) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
2628 llength = len(left)
2629 labels = np.concatenate([left, right])
2630
2631 _, new_labels = algos.safe_sort(uniques, labels, use_na_sentinel=True)
2632 new_left, new_right = new_labels[:llength], new_labels[llength:]
2633
2634 return new_left, new_right
2635
2636
2637def _get_join_keys(
2638 llab: list[npt.NDArray[np.int64 | np.intp]],
2639 rlab: list[npt.NDArray[np.int64 | np.intp]],
2640 shape: Shape,
2641 sort: bool,
2642) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]:
2643 # how many levels can be done without overflow
2644 nlev = next(
2645 lev
2646 for lev in range(len(shape), 0, -1)
2647 if not is_int64_overflow_possible(shape[:lev])
2648 )
2649
2650 # get keys for the first `nlev` levels
2651 stride = np.prod(shape[1:nlev], dtype="i8")
2652 lkey = stride * llab[0].astype("i8", subok=False, copy=False)
2653 rkey = stride * rlab[0].astype("i8", subok=False, copy=False)
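    # mixed-radix packing: e.g. with shape=(3, 4) and nlev=2, stride starts
    # at 4, so per-level labels (2, 1) collapse into the single flat key
    # 2 * 4 + 1 == 9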
2654
2655 for i in range(1, nlev):
2656 with np.errstate(divide="ignore"):
2657 stride //= shape[i]
2658 lkey += llab[i] * stride
2659 rkey += rlab[i] * stride
2660
2661 if nlev == len(shape): # all done!
2662 return lkey, rkey
2663
2664 # densify current keys to avoid overflow
2665 lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)
2666
2667 llab = [lkey] + llab[nlev:]
2668 rlab = [rkey] + rlab[nlev:]
2669 shape = (count,) + shape[nlev:]
2670
2671 return _get_join_keys(llab, rlab, shape, sort)
2672
2673
2674def _should_fill(lname, rname) -> bool:
2675 if not isinstance(lname, str) or not isinstance(rname, str):
2676 return True
2677 return lname == rname
2678
2679
2680def _any(x) -> bool:
2681 return x is not None and com.any_not_none(*x)
2682
2683
2684def _validate_operand(obj: DataFrame | Series) -> DataFrame:
2685 if isinstance(obj, ABCDataFrame):
2686 return obj
2687 elif isinstance(obj, ABCSeries):
2688 if obj.name is None:
2689 raise ValueError("Cannot merge a Series without a name")
2690 return obj.to_frame()
2691 else:
2692 raise TypeError(
2693 f"Can only merge Series or DataFrame objects, a {type(obj)} was passed"
2694 )
2695
2696
2697def _items_overlap_with_suffix(
2698 left: Index, right: Index, suffixes: Suffixes
2699) -> tuple[Index, Index]:
2700 """
    Validate the type of ``suffixes``.

    If the two indices overlap, add suffixes to the overlapping entries.

    If the corresponding suffix is empty, the entry is simply converted to
    a string.
2706
2707 """
2708 if not is_list_like(suffixes, allow_sets=False) or isinstance(suffixes, dict):
2709 raise TypeError(
2710 f"Passing 'suffixes' as a {type(suffixes)}, is not supported. "
2711 "Provide 'suffixes' as a tuple instead."
2712 )
2713
2714 to_rename = left.intersection(right)
2715 if len(to_rename) == 0:
2716 return left, right
2717
2718 lsuffix, rsuffix = suffixes
2719
2720 if not lsuffix and not rsuffix:
2721 raise ValueError(f"columns overlap but no suffix specified: {to_rename}")
2722
2723 def renamer(x, suffix: str | None):
2724 """
        Rename a single label.

        If the label overlaps and the suffix is not None, append the
        suffix; otherwise, leave the label as-is.
2729
2730 Parameters
2731 ----------
2732 x : original column name
2733 suffix : str or None
2734
2735 Returns
2736 -------
2737 x : renamed column name
2738 """
2739 if x in to_rename and suffix is not None:
2740 return f"{x}{suffix}"
2741 return x
2742
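    # e.g. to_rename={"key"} with suffixes ("_x", "_y") renames the left
    # "key" to "key_x" and the right "key" to "key_y"; other labels pass
    # through unchanged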
2743 lrenamer = partial(renamer, suffix=lsuffix)
2744 rrenamer = partial(renamer, suffix=rsuffix)
2745
2746 llabels = left._transform_index(lrenamer)
2747 rlabels = right._transform_index(rrenamer)
2748
2749 dups = []
2750 if not llabels.is_unique:
2751 # Only warn when duplicates are caused because of suffixes, already duplicated
2752 # columns in origin should not warn
2753 dups = llabels[(llabels.duplicated()) & (~left.duplicated())].tolist()
2754 if not rlabels.is_unique:
2755 dups.extend(rlabels[(rlabels.duplicated()) & (~right.duplicated())].tolist())
2756 if dups:
2757 raise MergeError(
2758 f"Passing 'suffixes' which cause duplicate columns {set(dups)} is "
2759 f"not allowed.",
2760 )
2761
2762 return llabels, rlabels