1"""
2SQL-style merge routines
3"""
4from __future__ import annotations
5
6from collections.abc import (
7 Hashable,
8 Sequence,
9)
10import datetime
11from functools import partial
12from typing import (
13 TYPE_CHECKING,
14 Literal,
15 cast,
16 final,
17)
18import uuid
19import warnings
20
21import numpy as np
22
23from pandas._libs import (
24 Timedelta,
25 hashtable as libhashtable,
26 join as libjoin,
27 lib,
28)
29from pandas._libs.lib import is_range_indexer
30from pandas._typing import (
31 AnyArrayLike,
32 ArrayLike,
33 IndexLabel,
34 JoinHow,
35 MergeHow,
36 Shape,
37 Suffixes,
38 npt,
39)
40from pandas.errors import MergeError
41from pandas.util._decorators import (
42 Appender,
43 Substitution,
44 cache_readonly,
45)
46from pandas.util._exceptions import find_stack_level
47
48from pandas.core.dtypes.base import ExtensionDtype
49from pandas.core.dtypes.cast import find_common_type
50from pandas.core.dtypes.common import (
51 ensure_int64,
52 ensure_object,
53 is_bool,
54 is_bool_dtype,
55 is_float_dtype,
56 is_integer,
57 is_integer_dtype,
58 is_list_like,
59 is_number,
60 is_numeric_dtype,
61 is_object_dtype,
62 is_string_dtype,
63 needs_i8_conversion,
64)
65from pandas.core.dtypes.dtypes import (
66 CategoricalDtype,
67 DatetimeTZDtype,
68)
69from pandas.core.dtypes.generic import (
70 ABCDataFrame,
71 ABCSeries,
72)
73from pandas.core.dtypes.missing import (
74 isna,
75 na_value_for_dtype,
76)
77
78from pandas import (
79 ArrowDtype,
80 Categorical,
81 Index,
82 MultiIndex,
83 Series,
84)
85import pandas.core.algorithms as algos
86from pandas.core.arrays import (
87 ArrowExtensionArray,
88 BaseMaskedArray,
89 ExtensionArray,
90)
91from pandas.core.arrays.string_ import StringDtype
92import pandas.core.common as com
93from pandas.core.construction import (
94 ensure_wrapped_if_datetimelike,
95 extract_array,
96)
97from pandas.core.frame import _merge_doc
98from pandas.core.indexes.api import default_index
99from pandas.core.sorting import (
100 get_group_index,
101 is_int64_overflow_possible,
102)
103
104if TYPE_CHECKING:
105 from pandas import DataFrame
106 from pandas.core import groupby
107 from pandas.core.arrays import DatetimeArray
108 from pandas.core.indexes.frozen import FrozenList
109
110_factorizers = {
111 np.int64: libhashtable.Int64Factorizer,
112 np.longlong: libhashtable.Int64Factorizer,
113 np.int32: libhashtable.Int32Factorizer,
114 np.int16: libhashtable.Int16Factorizer,
115 np.int8: libhashtable.Int8Factorizer,
116 np.uint64: libhashtable.UInt64Factorizer,
117 np.uint32: libhashtable.UInt32Factorizer,
118 np.uint16: libhashtable.UInt16Factorizer,
119 np.uint8: libhashtable.UInt8Factorizer,
120 np.bool_: libhashtable.UInt8Factorizer,
121 np.float64: libhashtable.Float64Factorizer,
122 np.float32: libhashtable.Float32Factorizer,
123 np.complex64: libhashtable.Complex64Factorizer,
124 np.complex128: libhashtable.Complex128Factorizer,
125 np.object_: libhashtable.ObjectFactorizer,
126}
127
128# See https://github.com/pandas-dev/pandas/issues/52451
129if np.intc is not np.int32:
130 _factorizers[np.intc] = libhashtable.Int64Factorizer
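
# Each factorizer maps raw key values to dense integer labels. Illustrative
# use (see _factorize_keys elsewhere in this module): a factorizer is
# constructed with a size hint and ``.factorize(values)`` then returns an
# intp label array for each side of the merge.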

_known = (np.ndarray, ExtensionArray, Index, ABCSeries)


@Substitution("\nleft : DataFrame or named Series")
@Appender(_merge_doc, indents=0)
def merge(
    left: DataFrame | Series,
    right: DataFrame | Series,
    how: MergeHow = "inner",
    on: IndexLabel | AnyArrayLike | None = None,
    left_on: IndexLabel | AnyArrayLike | None = None,
    right_on: IndexLabel | AnyArrayLike | None = None,
    left_index: bool = False,
    right_index: bool = False,
    sort: bool = False,
    suffixes: Suffixes = ("_x", "_y"),
    copy: bool | None = None,
    indicator: str | bool = False,
    validate: str | None = None,
) -> DataFrame:
    left_df = _validate_operand(left)
    right_df = _validate_operand(right)
    if how == "cross":
        return _cross_merge(
            left_df,
            right_df,
            on=on,
            left_on=left_on,
            right_on=right_on,
            left_index=left_index,
            right_index=right_index,
            sort=sort,
            suffixes=suffixes,
            indicator=indicator,
            validate=validate,
            copy=copy,
        )
    else:
        op = _MergeOperation(
            left_df,
            right_df,
            how=how,
            on=on,
            left_on=left_on,
            right_on=right_on,
            left_index=left_index,
            right_index=right_index,
            sort=sort,
            suffixes=suffixes,
            indicator=indicator,
            validate=validate,
        )
        return op.get_result(copy=copy)


def _cross_merge(
    left: DataFrame,
    right: DataFrame,
    on: IndexLabel | AnyArrayLike | None = None,
    left_on: IndexLabel | AnyArrayLike | None = None,
    right_on: IndexLabel | AnyArrayLike | None = None,
    left_index: bool = False,
    right_index: bool = False,
    sort: bool = False,
    suffixes: Suffixes = ("_x", "_y"),
    copy: bool | None = None,
    indicator: str | bool = False,
    validate: str | None = None,
) -> DataFrame:
    """
    See merge.__doc__ with how='cross'
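
    Examples
    --------
    A minimal illustration of the cross-merge semantics implemented here
    (hypothetical frames, for exposition only; every left row is paired
    with every right row via a temporary constant key column):

    >>> left = pd.DataFrame({"a": [1, 2]})
    >>> right = pd.DataFrame({"b": ["x", "y"]})
    >>> pd.merge(left, right, how="cross")
       a  b
    0  1  x
    1  1  y
    2  2  x
    3  2  y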
203 """
204
205 if (
206 left_index
207 or right_index
208 or right_on is not None
209 or left_on is not None
210 or on is not None
211 ):
212 raise MergeError(
213 "Can not pass on, right_on, left_on or set right_index=True or "
214 "left_index=True"
215 )
216
217 cross_col = f"_cross_{uuid.uuid4()}"
218 left = left.assign(**{cross_col: 1})
219 right = right.assign(**{cross_col: 1})
220
221 left_on = right_on = [cross_col]
222
223 res = merge(
224 left,
225 right,
226 how="inner",
227 on=on,
228 left_on=left_on,
229 right_on=right_on,
230 left_index=left_index,
231 right_index=right_index,
232 sort=sort,
233 suffixes=suffixes,
234 indicator=indicator,
235 validate=validate,
236 copy=copy,
237 )
238 del res[cross_col]
239 return res
240
241
242def _groupby_and_merge(
243 by, left: DataFrame | Series, right: DataFrame | Series, merge_pieces
244):
245 """
246 groupby & merge; we are always performing a left-by type operation
247
248 Parameters
249 ----------
250 by: field to group
251 left: DataFrame
252 right: DataFrame
253 merge_pieces: function for merging
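
    Notes
    -----
    Rough usage sketch (illustrative only; this helper is internal and the
    frames below are hypothetical):

    >>> left = pd.DataFrame({"g": ["a", "a", "b"], "k": [1, 2, 1]})
    >>> right = pd.DataFrame({"g": ["a"], "k": [1], "v": [100]})
    >>> merger = lambda lhs, rhs: lhs.merge(rhs, on=["g", "k"], how="left")
    >>> result, grouped = _groupby_and_merge("g", left, right, merger)  # doctest: +SKIP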
254 """
255 pieces = []
256 if not isinstance(by, (list, tuple)):
257 by = [by]
258
259 lby = left.groupby(by, sort=False)
260 rby: groupby.DataFrameGroupBy | groupby.SeriesGroupBy | None = None
261
262 # if we can groupby the rhs
263 # then we can get vastly better perf
264 if all(item in right.columns for item in by):
265 rby = right.groupby(by, sort=False)
266
267 for key, lhs in lby._grouper.get_iterator(lby._selected_obj, axis=lby.axis):
268 if rby is None:
269 rhs = right
270 else:
271 try:
272 rhs = right.take(rby.indices[key])
273 except KeyError:
                # key doesn't exist in right
                lcols = lhs.columns.tolist()
                cols = lcols + [r for r in right.columns if r not in set(lcols)]
                merged = lhs.reindex(columns=cols)
                merged.index = range(len(merged))
                pieces.append(merged)
                continue

        merged = merge_pieces(lhs, rhs)

        # make sure join keys are in the merged
        # TODO, should merge_pieces do this?
        merged[by] = key

        pieces.append(merged)

    # preserve the original order
    # if we have a missing piece this can be reset
    from pandas.core.reshape.concat import concat

    result = concat(pieces, ignore_index=True)
    result = result.reindex(columns=pieces[0].columns, copy=False)
    return result, lby


def merge_ordered(
    left: DataFrame | Series,
    right: DataFrame | Series,
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_by=None,
    right_by=None,
    fill_method: str | None = None,
    suffixes: Suffixes = ("_x", "_y"),
    how: JoinHow = "outer",
) -> DataFrame:
    """
    Perform a merge for ordered data with optional filling/interpolation.

    Designed for ordered data like time series data. Optionally
    perform group-wise merge (see examples).

    Parameters
    ----------
    left : DataFrame or named Series
    right : DataFrame or named Series
    on : label or list
        Field names to join on. Must be found in both DataFrames.
    left_on : label or list, or array-like
        Field names to join on in left DataFrame. Can be a vector or list of
        vectors of the length of the DataFrame to use a particular vector as
        the join key instead of columns.
    right_on : label or list, or array-like
        Field names to join on in right DataFrame or vector/list of vectors per
        left_on docs.
    left_by : column name or list of column names
        Group left DataFrame by group columns and merge piece by piece with
        right DataFrame. Must be None if either left or right are a Series.
    right_by : column name or list of column names
        Group right DataFrame by group columns and merge piece by piece with
        left DataFrame. Must be None if either left or right are a Series.
    fill_method : {'ffill', None}, default None
        Interpolation method for data.
    suffixes : list-like, default is ("_x", "_y")
        A length-2 sequence where each element is optionally a string
        indicating the suffix to add to overlapping column names in
        `left` and `right` respectively. Pass a value of `None` instead
        of a string to indicate that the column name from `left` or
        `right` should be left as-is, with no suffix. At least one of the
        values must not be None.

    how : {'left', 'right', 'outer', 'inner'}, default 'outer'
        * left: use only keys from left frame (SQL: left outer join)
        * right: use only keys from right frame (SQL: right outer join)
        * outer: use union of keys from both frames (SQL: full outer join)
        * inner: use intersection of keys from both frames (SQL: inner join).

    Returns
    -------
    DataFrame
        The merged DataFrame output type will be the same as
        'left', if it is a subclass of DataFrame.

    See Also
    --------
    merge : Merge with a database-style join.
    merge_asof : Merge on nearest keys.

    Examples
    --------
    >>> from pandas import merge_ordered
    >>> df1 = pd.DataFrame(
    ...     {
    ...         "key": ["a", "c", "e", "a", "c", "e"],
    ...         "lvalue": [1, 2, 3, 1, 2, 3],
    ...         "group": ["a", "a", "a", "b", "b", "b"]
    ...     }
    ... )
    >>> df1
      key  lvalue group
    0   a       1     a
    1   c       2     a
    2   e       3     a
    3   a       1     b
    4   c       2     b
    5   e       3     b

    >>> df2 = pd.DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})
    >>> df2
      key  rvalue
    0   b       1
    1   c       2
    2   d       3

    >>> merge_ordered(df1, df2, fill_method="ffill", left_by="group")
      key  lvalue group  rvalue
    0   a       1     a     NaN
    1   b       1     a     1.0
    2   c       2     a     2.0
    3   d       2     a     3.0
    4   e       3     a     3.0
    5   a       1     b     NaN
    6   b       1     b     1.0
    7   c       2     b     2.0
    8   d       2     b     3.0
    9   e       3     b     3.0
    """

    def _merger(x, y) -> DataFrame:
        # perform the ordered merge operation
        op = _OrderedMerge(
            x,
            y,
            on=on,
            left_on=left_on,
            right_on=right_on,
            suffixes=suffixes,
            fill_method=fill_method,
            how=how,
        )
        return op.get_result()

    if left_by is not None and right_by is not None:
        raise ValueError("Can only group either left or right frames")
    if left_by is not None:
        if isinstance(left_by, str):
            left_by = [left_by]
        check = set(left_by).difference(left.columns)
        if len(check) != 0:
            raise KeyError(f"{check} not found in left columns")
        result, _ = _groupby_and_merge(left_by, left, right, lambda x, y: _merger(x, y))
    elif right_by is not None:
        if isinstance(right_by, str):
            right_by = [right_by]
        check = set(right_by).difference(right.columns)
        if len(check) != 0:
            raise KeyError(f"{check} not found in right columns")
        result, _ = _groupby_and_merge(
            right_by, right, left, lambda x, y: _merger(y, x)
        )
    else:
        result = _merger(left, right)
    return result


def merge_asof(
    left: DataFrame | Series,
    right: DataFrame | Series,
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_index: bool = False,
    right_index: bool = False,
    by=None,
    left_by=None,
    right_by=None,
    suffixes: Suffixes = ("_x", "_y"),
    tolerance: int | Timedelta | None = None,
    allow_exact_matches: bool = True,
    direction: str = "backward",
) -> DataFrame:
    """
    Perform a merge by key distance.

    This is similar to a left-join except that we match on nearest
    key rather than equal keys. Both DataFrames must be sorted by the key.

    For each row in the left DataFrame:

    - A "backward" search selects the last row in the right DataFrame whose
      'on' key is less than or equal to the left's key.

    - A "forward" search selects the first row in the right DataFrame whose
      'on' key is greater than or equal to the left's key.

    - A "nearest" search selects the row in the right DataFrame whose 'on'
      key is closest in absolute distance to the left's key.

    Optionally match on equivalent keys with 'by' before searching with 'on'.

    Parameters
    ----------
    left : DataFrame or named Series
    right : DataFrame or named Series
    on : label
        Field name to join on. Must be found in both DataFrames.
        The data MUST be ordered. Furthermore this must be a numeric column,
        such as datetimelike, integer, or float. On or left_on/right_on
        must be given.
    left_on : label
        Field name to join on in left DataFrame.
    right_on : label
        Field name to join on in right DataFrame.
    left_index : bool
        Use the index of the left DataFrame as the join key.
    right_index : bool
        Use the index of the right DataFrame as the join key.
    by : column name or list of column names
        Match on these columns before performing merge operation.
    left_by : column name
        Field names to match on in the left DataFrame.
    right_by : column name
        Field names to match on in the right DataFrame.
    suffixes : 2-length sequence (tuple, list, ...)
        Suffix to apply to overlapping column names in the left and right
        side, respectively.
    tolerance : int or Timedelta, optional, default None
        Select asof tolerance within this range; must be compatible
        with the merge index.
    allow_exact_matches : bool, default True

        - If True, allow matching with the same 'on' value
          (i.e. less-than-or-equal-to / greater-than-or-equal-to)
        - If False, don't match the same 'on' value
          (i.e., strictly less-than / strictly greater-than).

    direction : 'backward' (default), 'forward', or 'nearest'
        Whether to search for prior, subsequent, or closest matches.

    Returns
    -------
    DataFrame

    See Also
    --------
    merge : Merge with a database-style join.
    merge_ordered : Merge with optional filling/interpolation.

    Examples
    --------
    >>> left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]})
    >>> left
        a left_val
    0   1        a
    1   5        b
    2  10        c

    >>> right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]})
    >>> right
       a  right_val
    0  1          1
    1  2          2
    2  3          3
    3  6          6
    4  7          7

    >>> pd.merge_asof(left, right, on="a")
        a left_val  right_val
    0   1        a          1
    1   5        b          3
    2  10        c          7

    >>> pd.merge_asof(left, right, on="a", allow_exact_matches=False)
        a left_val  right_val
    0   1        a        NaN
    1   5        b        3.0
    2  10        c        7.0

    >>> pd.merge_asof(left, right, on="a", direction="forward")
        a left_val  right_val
    0   1        a        1.0
    1   5        b        6.0
    2  10        c        NaN

    >>> pd.merge_asof(left, right, on="a", direction="nearest")
        a left_val  right_val
    0   1        a          1
    1   5        b          6
    2  10        c          7

    We can use indexed DataFrames as well.

    >>> left = pd.DataFrame({"left_val": ["a", "b", "c"]}, index=[1, 5, 10])
    >>> left
       left_val
    1         a
    5         b
    10        c

    >>> right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7]}, index=[1, 2, 3, 6, 7])
    >>> right
       right_val
    1          1
    2          2
    3          3
    6          6
    7          7

    >>> pd.merge_asof(left, right, left_index=True, right_index=True)
       left_val  right_val
    1         a          1
    5         b          3
    10        c          7

    Here is a real-world time-series example

    >>> quotes = pd.DataFrame(
    ...     {
    ...         "time": [
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.030"),
    ...             pd.Timestamp("2016-05-25 13:30:00.041"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.049"),
    ...             pd.Timestamp("2016-05-25 13:30:00.072"),
    ...             pd.Timestamp("2016-05-25 13:30:00.075")
    ...         ],
    ...         "ticker": [
    ...             "GOOG",
    ...             "MSFT",
    ...             "MSFT",
    ...             "MSFT",
    ...             "GOOG",
    ...             "AAPL",
    ...             "GOOG",
    ...             "MSFT"
    ...         ],
    ...         "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
    ...         "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]
    ...     }
    ... )
    >>> quotes
                         time ticker     bid     ask
    0 2016-05-25 13:30:00.023   GOOG  720.50  720.93
    1 2016-05-25 13:30:00.023   MSFT   51.95   51.96
    2 2016-05-25 13:30:00.030   MSFT   51.97   51.98
    3 2016-05-25 13:30:00.041   MSFT   51.99   52.00
    4 2016-05-25 13:30:00.048   GOOG  720.50  720.93
    5 2016-05-25 13:30:00.049   AAPL   97.99   98.01
    6 2016-05-25 13:30:00.072   GOOG  720.50  720.88
    7 2016-05-25 13:30:00.075   MSFT   52.01   52.03

    >>> trades = pd.DataFrame(
    ...     {
    ...         "time": [
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.038"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048")
    ...         ],
    ...         "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
    ...         "price": [51.95, 51.95, 720.77, 720.92, 98.0],
    ...         "quantity": [75, 155, 100, 100, 100]
    ...     }
    ... )
    >>> trades
                         time ticker   price  quantity
    0 2016-05-25 13:30:00.023   MSFT   51.95        75
    1 2016-05-25 13:30:00.038   MSFT   51.95       155
    2 2016-05-25 13:30:00.048   GOOG  720.77       100
    3 2016-05-25 13:30:00.048   GOOG  720.92       100
    4 2016-05-25 13:30:00.048   AAPL   98.00       100

    By default we are taking the asof of the quotes

    >>> pd.merge_asof(trades, quotes, on="time", by="ticker")
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155   51.97   51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 2ms between the quote time and the trade time

    >>> pd.merge_asof(
    ...     trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("2ms")
    ... )
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155     NaN     NaN
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 10ms between the quote time and the trade time
    and we exclude exact matches on time. However *prior* data will
    propagate forward

    >>> pd.merge_asof(
    ...     trades,
    ...     quotes,
    ...     on="time",
    ...     by="ticker",
    ...     tolerance=pd.Timedelta("10ms"),
    ...     allow_exact_matches=False
    ... )
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75     NaN     NaN
    1 2016-05-25 13:30:00.038   MSFT   51.95       155   51.97   51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100     NaN     NaN
    3 2016-05-25 13:30:00.048   GOOG  720.92       100     NaN     NaN
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN
    """
    op = _AsOfMerge(
        left,
        right,
        on=on,
        left_on=left_on,
        right_on=right_on,
        left_index=left_index,
        right_index=right_index,
        by=by,
        left_by=left_by,
        right_by=right_by,
        suffixes=suffixes,
        how="asof",
        tolerance=tolerance,
        allow_exact_matches=allow_exact_matches,
        direction=direction,
    )
    return op.get_result()


# TODO: transformations??
# TODO: only copy DataFrames when modification necessary
class _MergeOperation:
    """
    Perform a database (SQL) merge operation between two DataFrame or Series
    objects using either columns as keys or their row indexes
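
    Examples
    --------
    Internal class; a minimal sketch of how :func:`merge` drives it
    (hypothetical frames, output elided):

    >>> left = pd.DataFrame({"key": [1, 2], "x": [10, 20]})
    >>> right = pd.DataFrame({"key": [2, 3], "y": ["b", "c"]})
    >>> op = _MergeOperation(left, right, how="inner", on="key")
    >>> op.get_result()  # doctest: +SKIP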
717 """
718
719 _merge_type = "merge"
720 how: JoinHow | Literal["asof"]
721 on: IndexLabel | None
722 # left_on/right_on may be None when passed, but in validate_specification
723 # get replaced with non-None.
724 left_on: Sequence[Hashable | AnyArrayLike]
725 right_on: Sequence[Hashable | AnyArrayLike]
726 left_index: bool
727 right_index: bool
728 sort: bool
729 suffixes: Suffixes
730 copy: bool
731 indicator: str | bool
732 validate: str | None
733 join_names: list[Hashable]
734 right_join_keys: list[ArrayLike]
735 left_join_keys: list[ArrayLike]
736
737 def __init__(
738 self,
739 left: DataFrame | Series,
740 right: DataFrame | Series,
741 how: JoinHow | Literal["asof"] = "inner",
742 on: IndexLabel | AnyArrayLike | None = None,
743 left_on: IndexLabel | AnyArrayLike | None = None,
744 right_on: IndexLabel | AnyArrayLike | None = None,
745 left_index: bool = False,
746 right_index: bool = False,
747 sort: bool = True,
748 suffixes: Suffixes = ("_x", "_y"),
749 indicator: str | bool = False,
750 validate: str | None = None,
751 ) -> None:
752 _left = _validate_operand(left)
753 _right = _validate_operand(right)
754 self.left = self.orig_left = _left
755 self.right = self.orig_right = _right
756 self.how = how
757
758 self.on = com.maybe_make_list(on)
759
760 self.suffixes = suffixes
761 self.sort = sort or how == "outer"
762
763 self.left_index = left_index
764 self.right_index = right_index
765
766 self.indicator = indicator
767
768 if not is_bool(left_index):
769 raise ValueError(
770 f"left_index parameter must be of type bool, not {type(left_index)}"
771 )
772 if not is_bool(right_index):
773 raise ValueError(
774 f"right_index parameter must be of type bool, not {type(right_index)}"
775 )
776
777 # GH 40993: raise when merging between different levels; enforced in 2.0
778 if _left.columns.nlevels != _right.columns.nlevels:
779 msg = (
780 "Not allowed to merge between different levels. "
781 f"({_left.columns.nlevels} levels on the left, "
782 f"{_right.columns.nlevels} on the right)"
783 )
784 raise MergeError(msg)
785
786 self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on)
787
788 (
789 self.left_join_keys,
790 self.right_join_keys,
791 self.join_names,
792 left_drop,
793 right_drop,
794 ) = self._get_merge_keys()
795
796 if left_drop:
797 self.left = self.left._drop_labels_or_levels(left_drop)
798
799 if right_drop:
800 self.right = self.right._drop_labels_or_levels(right_drop)
801
802 self._maybe_require_matching_dtypes(self.left_join_keys, self.right_join_keys)
803 self._validate_tolerance(self.left_join_keys)
804
805 # validate the merge keys dtypes. We may need to coerce
806 # to avoid incompatible dtypes
807 self._maybe_coerce_merge_keys()
808
809 # If argument passed to validate,
810 # check if columns specified as unique
811 # are in fact unique.
812 if validate is not None:
813 self._validate_validate_kwd(validate)
814
815 def _maybe_require_matching_dtypes(
816 self, left_join_keys: list[ArrayLike], right_join_keys: list[ArrayLike]
817 ) -> None:
818 # Overridden by AsOfMerge
819 pass
820
821 def _validate_tolerance(self, left_join_keys: list[ArrayLike]) -> None:
822 # Overridden by AsOfMerge
823 pass
824
825 @final
826 def _reindex_and_concat(
827 self,
828 join_index: Index,
829 left_indexer: npt.NDArray[np.intp] | None,
830 right_indexer: npt.NDArray[np.intp] | None,
831 copy: bool | None,
832 ) -> DataFrame:
833 """
834 reindex along index and concat along columns.
835 """
836 # Take views so we do not alter the originals
837 left = self.left[:]
838 right = self.right[:]
839
840 llabels, rlabels = _items_overlap_with_suffix(
841 self.left._info_axis, self.right._info_axis, self.suffixes
842 )
843
844 if left_indexer is not None and not is_range_indexer(left_indexer, len(left)):
845 # Pinning the index here (and in the right code just below) is not
846 # necessary, but makes the `.take` more performant if we have e.g.
847 # a MultiIndex for left.index.
848 lmgr = left._mgr.reindex_indexer(
849 join_index,
850 left_indexer,
851 axis=1,
852 copy=False,
853 only_slice=True,
854 allow_dups=True,
855 use_na_proxy=True,
856 )
857 left = left._constructor_from_mgr(lmgr, axes=lmgr.axes)
858 left.index = join_index
859
860 if right_indexer is not None and not is_range_indexer(
861 right_indexer, len(right)
862 ):
863 rmgr = right._mgr.reindex_indexer(
864 join_index,
865 right_indexer,
866 axis=1,
867 copy=False,
868 only_slice=True,
869 allow_dups=True,
870 use_na_proxy=True,
871 )
872 right = right._constructor_from_mgr(rmgr, axes=rmgr.axes)
873 right.index = join_index
874
875 from pandas import concat
876
877 left.columns = llabels
878 right.columns = rlabels
879 result = concat([left, right], axis=1, copy=copy)
880 return result
881
882 def get_result(self, copy: bool | None = True) -> DataFrame:
883 if self.indicator:
884 self.left, self.right = self._indicator_pre_merge(self.left, self.right)
885
886 join_index, left_indexer, right_indexer = self._get_join_info()
887
888 result = self._reindex_and_concat(
889 join_index, left_indexer, right_indexer, copy=copy
890 )
891 result = result.__finalize__(self, method=self._merge_type)
892
893 if self.indicator:
894 result = self._indicator_post_merge(result)
895
896 self._maybe_add_join_keys(result, left_indexer, right_indexer)
897
898 self._maybe_restore_index_levels(result)
899
900 return result.__finalize__(self, method="merge")
901
902 @final
903 @cache_readonly
904 def _indicator_name(self) -> str | None:
905 if isinstance(self.indicator, str):
906 return self.indicator
907 elif isinstance(self.indicator, bool):
908 return "_merge" if self.indicator else None
909 else:
910 raise ValueError(
911 "indicator option can only accept boolean or string arguments"
912 )
913
914 @final
915 def _indicator_pre_merge(
916 self, left: DataFrame, right: DataFrame
917 ) -> tuple[DataFrame, DataFrame]:
918 columns = left.columns.union(right.columns)
919
920 for i in ["_left_indicator", "_right_indicator"]:
921 if i in columns:
922 raise ValueError(
923 "Cannot use `indicator=True` option when "
924 f"data contains a column named {i}"
925 )
926 if self._indicator_name in columns:
927 raise ValueError(
928 "Cannot use name of an existing column for indicator column"
929 )
930
931 left = left.copy()
932 right = right.copy()
933
934 left["_left_indicator"] = 1
935 left["_left_indicator"] = left["_left_indicator"].astype("int8")
936
937 right["_right_indicator"] = 2
938 right["_right_indicator"] = right["_right_indicator"].astype("int8")
939
940 return left, right
941
942 @final
943 def _indicator_post_merge(self, result: DataFrame) -> DataFrame:
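        # The two int8 markers sum to a row-provenance code: 1 means the row
        # came only from the left frame, 2 only from the right, 3 from both.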
944 result["_left_indicator"] = result["_left_indicator"].fillna(0)
945 result["_right_indicator"] = result["_right_indicator"].fillna(0)
946
947 result[self._indicator_name] = Categorical(
948 (result["_left_indicator"] + result["_right_indicator"]),
949 categories=[1, 2, 3],
950 )
951 result[self._indicator_name] = result[
952 self._indicator_name
953 ].cat.rename_categories(["left_only", "right_only", "both"])
954
955 result = result.drop(labels=["_left_indicator", "_right_indicator"], axis=1)
956 return result
957
958 @final
959 def _maybe_restore_index_levels(self, result: DataFrame) -> None:
960 """
961 Restore index levels specified as `on` parameters
962
963 Here we check for cases where `self.left_on` and `self.right_on` pairs
964 each reference an index level in their respective DataFrames. The
965 joined columns corresponding to these pairs are then restored to the
966 index of `result`.
967
968 **Note:** This method has side effects. It modifies `result` in-place
969
970 Parameters
971 ----------
972 result: DataFrame
973 merge result
974
975 Returns
976 -------
977 None
978 """
979 names_to_restore = []
980 for name, left_key, right_key in zip(
981 self.join_names, self.left_on, self.right_on
982 ):
983 if (
984 # Argument 1 to "_is_level_reference" of "NDFrame" has incompatible
985 # type "Union[Hashable, ExtensionArray, Index, Series]"; expected
986 # "Hashable"
987 self.orig_left._is_level_reference(left_key) # type: ignore[arg-type]
988 # Argument 1 to "_is_level_reference" of "NDFrame" has incompatible
989 # type "Union[Hashable, ExtensionArray, Index, Series]"; expected
990 # "Hashable"
991 and self.orig_right._is_level_reference(
992 right_key # type: ignore[arg-type]
993 )
994 and left_key == right_key
995 and name not in result.index.names
996 ):
997 names_to_restore.append(name)
998
999 if names_to_restore:
1000 result.set_index(names_to_restore, inplace=True)
1001
1002 @final
1003 def _maybe_add_join_keys(
1004 self,
1005 result: DataFrame,
1006 left_indexer: npt.NDArray[np.intp] | None,
1007 right_indexer: npt.NDArray[np.intp] | None,
1008 ) -> None:
1009 left_has_missing = None
1010 right_has_missing = None
1011
1012 assert all(isinstance(x, _known) for x in self.left_join_keys)
1013
1014 keys = zip(self.join_names, self.left_on, self.right_on)
1015 for i, (name, lname, rname) in enumerate(keys):
1016 if not _should_fill(lname, rname):
1017 continue
1018
1019 take_left, take_right = None, None
1020
1021 if name in result:
1022 if left_indexer is not None or right_indexer is not None:
1023 if name in self.left:
1024 if left_has_missing is None:
1025 left_has_missing = (
1026 False
1027 if left_indexer is None
1028 else (left_indexer == -1).any()
1029 )
1030
1031 if left_has_missing:
1032 take_right = self.right_join_keys[i]
1033
1034 if result[name].dtype != self.left[name].dtype:
1035 take_left = self.left[name]._values
1036
1037 elif name in self.right:
1038 if right_has_missing is None:
1039 right_has_missing = (
1040 False
1041 if right_indexer is None
1042 else (right_indexer == -1).any()
1043 )
1044
1045 if right_has_missing:
1046 take_left = self.left_join_keys[i]
1047
1048 if result[name].dtype != self.right[name].dtype:
1049 take_right = self.right[name]._values
1050
1051 else:
1052 take_left = self.left_join_keys[i]
1053 take_right = self.right_join_keys[i]
1054
1055 if take_left is not None or take_right is not None:
1056 if take_left is None:
1057 lvals = result[name]._values
1058 elif left_indexer is None:
1059 lvals = take_left
1060 else:
1061 # TODO: can we pin down take_left's type earlier?
1062 take_left = extract_array(take_left, extract_numpy=True)
1063 lfill = na_value_for_dtype(take_left.dtype)
1064 lvals = algos.take_nd(take_left, left_indexer, fill_value=lfill)
1065
1066 if take_right is None:
1067 rvals = result[name]._values
1068 elif right_indexer is None:
1069 rvals = take_right
1070 else:
1071 # TODO: can we pin down take_right's type earlier?
1072 taker = extract_array(take_right, extract_numpy=True)
1073 rfill = na_value_for_dtype(taker.dtype)
1074 rvals = algos.take_nd(taker, right_indexer, fill_value=rfill)
1075
1076 # if we have an all missing left_indexer
1077 # make sure to just use the right values or vice-versa
1078 if left_indexer is not None and (left_indexer == -1).all():
1079 key_col = Index(rvals)
1080 result_dtype = rvals.dtype
1081 elif right_indexer is not None and (right_indexer == -1).all():
1082 key_col = Index(lvals)
1083 result_dtype = lvals.dtype
1084 else:
1085 key_col = Index(lvals)
1086 if left_indexer is not None:
1087 mask_left = left_indexer == -1
1088 key_col = key_col.where(~mask_left, rvals)
1089 result_dtype = find_common_type([lvals.dtype, rvals.dtype])
1090 if (
1091 lvals.dtype.kind == "M"
1092 and rvals.dtype.kind == "M"
1093 and result_dtype.kind == "O"
1094 ):
1095 # TODO(non-nano) Workaround for common_type not dealing
1096 # with different resolutions
1097 result_dtype = key_col.dtype
1098
1099 if result._is_label_reference(name):
1100 result[name] = result._constructor_sliced(
1101 key_col, dtype=result_dtype, index=result.index
1102 )
1103 elif result._is_level_reference(name):
1104 if isinstance(result.index, MultiIndex):
1105 key_col.name = name
1106 idx_list = [
1107 result.index.get_level_values(level_name)
1108 if level_name != name
1109 else key_col
1110 for level_name in result.index.names
1111 ]
1112
1113 result.set_index(idx_list, inplace=True)
1114 else:
1115 result.index = Index(key_col, name=name)
1116 else:
1117 result.insert(i, name or f"key_{i}", key_col)
1118
1119 def _get_join_indexers(
1120 self,
1121 ) -> tuple[npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
1122 """return the join indexers"""
1123 # make mypy happy
1124 assert self.how != "asof"
1125 return get_join_indexers(
1126 self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how
1127 )
1128
1129 @final
1130 def _get_join_info(
1131 self,
1132 ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
1133 left_ax = self.left.index
1134 right_ax = self.right.index
1135
1136 if self.left_index and self.right_index and self.how != "asof":
1137 join_index, left_indexer, right_indexer = left_ax.join(
1138 right_ax, how=self.how, return_indexers=True, sort=self.sort
1139 )
1140
1141 elif self.right_index and self.how == "left":
1142 join_index, left_indexer, right_indexer = _left_join_on_index(
1143 left_ax, right_ax, self.left_join_keys, sort=self.sort
1144 )
1145
1146 elif self.left_index and self.how == "right":
1147 join_index, right_indexer, left_indexer = _left_join_on_index(
1148 right_ax, left_ax, self.right_join_keys, sort=self.sort
1149 )
1150 else:
1151 (left_indexer, right_indexer) = self._get_join_indexers()
1152
1153 if self.right_index:
1154 if len(self.left) > 0:
1155 join_index = self._create_join_index(
1156 left_ax,
1157 right_ax,
1158 left_indexer,
1159 how="right",
1160 )
1161 elif right_indexer is None:
1162 join_index = right_ax.copy()
1163 else:
1164 join_index = right_ax.take(right_indexer)
1165 elif self.left_index:
1166 if self.how == "asof":
1167 # GH#33463 asof should always behave like a left merge
1168 join_index = self._create_join_index(
1169 left_ax,
1170 right_ax,
1171 left_indexer,
1172 how="left",
1173 )
1174
1175 elif len(self.right) > 0:
1176 join_index = self._create_join_index(
1177 right_ax,
1178 left_ax,
1179 right_indexer,
1180 how="left",
1181 )
1182 elif left_indexer is None:
1183 join_index = left_ax.copy()
1184 else:
1185 join_index = left_ax.take(left_indexer)
1186 else:
1187 n = len(left_ax) if left_indexer is None else len(left_indexer)
1188 join_index = default_index(n)
1189
1190 return join_index, left_indexer, right_indexer
1191
1192 @final
1193 def _create_join_index(
1194 self,
1195 index: Index,
1196 other_index: Index,
1197 indexer: npt.NDArray[np.intp] | None,
1198 how: JoinHow = "left",
1199 ) -> Index:
1200 """
1201 Create a join index by rearranging one index to match another
1202
1203 Parameters
1204 ----------
1205 index : Index
1206 index being rearranged
1207 other_index : Index
1208 used to supply values not found in index
1209 indexer : np.ndarray[np.intp] or None
1210 how to rearrange index
1211 how : str
            Replacement is only necessary if the indexer is based on other_index.

        Returns
        -------
        Index
        """
        if self.how in (how, "outer") and not isinstance(other_index, MultiIndex):
            # if final index requires values in other_index but not target
            # index, indexer may hold missing (-1) values, causing Index.take
            # to take the final value in target index. So, we set the last
            # element to be the desired fill value. We do not use allow_fill
            # and fill_value because it throws a ValueError on integer indices
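            # e.g. index=[10, 20] with indexer=[0, -1]: after appending the NA
            # sentinel, take yields [10, NaN] (illustrative values).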
            mask = indexer == -1
            if np.any(mask):
                fill_value = na_value_for_dtype(index.dtype, compat=False)
                index = index.append(Index([fill_value]))
        if indexer is None:
            return index.copy()
        return index.take(indexer)

    @final
    def _get_merge_keys(
        self,
    ) -> tuple[
        list[ArrayLike],
        list[ArrayLike],
        list[Hashable],
        list[Hashable],
        list[Hashable],
    ]:
        """
        Returns
        -------
        left_keys, right_keys, join_names, left_drop, right_drop
        """
        left_keys: list[ArrayLike] = []
        right_keys: list[ArrayLike] = []
        join_names: list[Hashable] = []
        right_drop: list[Hashable] = []
        left_drop: list[Hashable] = []

        left, right = self.left, self.right

        is_lkey = lambda x: isinstance(x, _known) and len(x) == len(left)
        is_rkey = lambda x: isinstance(x, _known) and len(x) == len(right)

        # Note that pd.merge_asof() has separate 'on' and 'by' parameters. A
        # user could, for example, request 'left_index' and 'left_by'. In a
        # regular pd.merge(), users cannot specify both 'left_index' and
        # 'left_on'. (Instead, users have a MultiIndex). That means the
        # self.left_on in this function is always empty in a pd.merge(), but
        # a pd.merge_asof(left_index=True, left_by=...) will result in a
        # self.left_on array with a None in the middle of it. This requires
        # a work-around as designated in the code below.
        # See _validate_left_right_on() for where this happens.

        # ugh, spaghetti re #733
        if _any(self.left_on) and _any(self.right_on):
            for lk, rk in zip(self.left_on, self.right_on):
                lk = extract_array(lk, extract_numpy=True)
                rk = extract_array(rk, extract_numpy=True)
                if is_lkey(lk):
                    lk = cast(ArrayLike, lk)
                    left_keys.append(lk)
                    if is_rkey(rk):
                        rk = cast(ArrayLike, rk)
                        right_keys.append(rk)
                        join_names.append(None)  # what to do?
                    else:
                        # Then we're either Hashable or a wrong-length arraylike,
                        # the latter of which will raise
                        rk = cast(Hashable, rk)
                        if rk is not None:
                            right_keys.append(right._get_label_or_level_values(rk))
                            join_names.append(rk)
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index._values)
                            join_names.append(right.index.name)
                else:
                    if not is_rkey(rk):
                        # Then we're either Hashable or a wrong-length arraylike,
                        # the latter of which will raise
                        rk = cast(Hashable, rk)
                        if rk is not None:
                            right_keys.append(right._get_label_or_level_values(rk))
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index._values)
                        if lk is not None and lk == rk:  # FIXME: what about other NAs?
                            right_drop.append(rk)
                    else:
                        rk = cast(ArrayLike, rk)
                        right_keys.append(rk)
                    if lk is not None:
                        # Then we're either Hashable or a wrong-length arraylike,
                        # the latter of which will raise
                        lk = cast(Hashable, lk)
                        left_keys.append(left._get_label_or_level_values(lk))
                        join_names.append(lk)
                    else:
                        # work-around for merge_asof(left_index=True)
                        left_keys.append(left.index._values)
                        join_names.append(left.index.name)
        elif _any(self.left_on):
            for k in self.left_on:
                if is_lkey(k):
                    k = extract_array(k, extract_numpy=True)
                    k = cast(ArrayLike, k)
                    left_keys.append(k)
                    join_names.append(None)
                else:
                    # Then we're either Hashable or a wrong-length arraylike,
                    # the latter of which will raise
                    k = cast(Hashable, k)
                    left_keys.append(left._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.right.index, MultiIndex):
                right_keys = [
                    lev._values.take(lev_codes)
                    for lev, lev_codes in zip(
                        self.right.index.levels, self.right.index.codes
                    )
                ]
            else:
                right_keys = [self.right.index._values]
        elif _any(self.right_on):
            for k in self.right_on:
                k = extract_array(k, extract_numpy=True)
                if is_rkey(k):
                    k = cast(ArrayLike, k)
                    right_keys.append(k)
                    join_names.append(None)
                else:
                    # Then we're either Hashable or a wrong-length arraylike,
                    # the latter of which will raise
                    k = cast(Hashable, k)
                    right_keys.append(right._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.left.index, MultiIndex):
                left_keys = [
                    lev._values.take(lev_codes)
                    for lev, lev_codes in zip(
                        self.left.index.levels, self.left.index.codes
                    )
                ]
            else:
                left_keys = [self.left.index._values]

        return left_keys, right_keys, join_names, left_drop, right_drop

    @final
    def _maybe_coerce_merge_keys(self) -> None:
        # we have valid merges but we may have to further
        # coerce these if they are originally incompatible types
        #
        # for example if these are categorical, but are not dtype_equal
        # or if we have object and integer dtypes

        for lk, rk, name in zip(
            self.left_join_keys, self.right_join_keys, self.join_names
        ):
            if (len(lk) and not len(rk)) or (not len(lk) and len(rk)):
                continue

            lk = extract_array(lk, extract_numpy=True)
            rk = extract_array(rk, extract_numpy=True)

            lk_is_cat = isinstance(lk.dtype, CategoricalDtype)
            rk_is_cat = isinstance(rk.dtype, CategoricalDtype)
            lk_is_object_or_string = is_object_dtype(lk.dtype) or is_string_dtype(
                lk.dtype
            )
            rk_is_object_or_string = is_object_dtype(rk.dtype) or is_string_dtype(
                rk.dtype
            )

            # if either left or right is a categorical
            # then they must match exactly in categories & ordered
            if lk_is_cat and rk_is_cat:
                lk = cast(Categorical, lk)
                rk = cast(Categorical, rk)
                if lk._categories_match_up_to_permutation(rk):
                    continue

            elif lk_is_cat or rk_is_cat:
                pass

            elif lk.dtype == rk.dtype:
                continue

            msg = (
                f"You are trying to merge on {lk.dtype} and {rk.dtype} columns "
                f"for key '{name}'. If you wish to proceed you should use pd.concat"
            )

            # if we are numeric, then allow differing
            # kinds to proceed, e.g. int64 and int8, int and float;
            # further, if we are object but infer to
            # the same type, then proceed
            if is_numeric_dtype(lk.dtype) and is_numeric_dtype(rk.dtype):
                if lk.dtype.kind == rk.dtype.kind:
                    continue

                if isinstance(lk.dtype, ExtensionDtype) and not isinstance(
                    rk.dtype, ExtensionDtype
                ):
                    ct = find_common_type([lk.dtype, rk.dtype])
                    if isinstance(ct, ExtensionDtype):
                        com_cls = ct.construct_array_type()
                        rk = com_cls._from_sequence(rk, dtype=ct, copy=False)
                    else:
                        rk = rk.astype(ct)
                elif isinstance(rk.dtype, ExtensionDtype):
                    ct = find_common_type([lk.dtype, rk.dtype])
                    if isinstance(ct, ExtensionDtype):
                        com_cls = ct.construct_array_type()
                        lk = com_cls._from_sequence(lk, dtype=ct, copy=False)
                    else:
                        lk = lk.astype(ct)

                # check whether ints and floats
                if is_integer_dtype(rk.dtype) and is_float_dtype(lk.dtype):
                    # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int
                    with np.errstate(invalid="ignore"):
                        # error: Argument 1 to "astype" of "ndarray" has incompatible
                        # type "Union[ExtensionDtype, Any, dtype[Any]]"; expected
                        # "Union[dtype[Any], Type[Any], _SupportsDType[dtype[Any]]]"
                        casted = lk.astype(rk.dtype)  # type: ignore[arg-type]

                        mask = ~np.isnan(lk)
                        match = lk == casted
                        if not match[mask].all():
                            warnings.warn(
                                "You are merging on int and float "
                                "columns where the float values "
                                "are not equal to their int representation.",
                                UserWarning,
                                stacklevel=find_stack_level(),
                            )
                    continue

                if is_float_dtype(rk.dtype) and is_integer_dtype(lk.dtype):
                    # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int
                    with np.errstate(invalid="ignore"):
                        # error: Argument 1 to "astype" of "ndarray" has incompatible
                        # type "Union[ExtensionDtype, Any, dtype[Any]]"; expected
                        # "Union[dtype[Any], Type[Any], _SupportsDType[dtype[Any]]]"
                        casted = rk.astype(lk.dtype)  # type: ignore[arg-type]

                        mask = ~np.isnan(rk)
                        match = rk == casted
                        if not match[mask].all():
                            warnings.warn(
                                "You are merging on int and float "
                                "columns where the float values "
                                "are not equal to their int representation.",
                                UserWarning,
                                stacklevel=find_stack_level(),
                            )
                    continue

                # let's infer and see if we are ok
                if lib.infer_dtype(lk, skipna=False) == lib.infer_dtype(
                    rk, skipna=False
                ):
                    continue

            # Check if we are trying to merge on obviously
            # incompatible dtypes GH 9780, GH 15800

            # bool values are coerced to object
            elif (lk_is_object_or_string and is_bool_dtype(rk.dtype)) or (
                is_bool_dtype(lk.dtype) and rk_is_object_or_string
            ):
                pass

            # object values are allowed to be merged
            elif (lk_is_object_or_string and is_numeric_dtype(rk.dtype)) or (
                is_numeric_dtype(lk.dtype) and rk_is_object_or_string
            ):
                inferred_left = lib.infer_dtype(lk, skipna=False)
                inferred_right = lib.infer_dtype(rk, skipna=False)
                bool_types = ["integer", "mixed-integer", "boolean", "empty"]
                string_types = ["string", "unicode", "mixed", "bytes", "empty"]

                # inferred bool
                if inferred_left in bool_types and inferred_right in bool_types:
                    pass

                # unless we are merging non-string-like with string-like
                elif (
                    inferred_left in string_types and inferred_right not in string_types
                ) or (
                    inferred_right in string_types and inferred_left not in string_types
                ):
                    raise ValueError(msg)

            # datetimelikes must match exactly
            elif needs_i8_conversion(lk.dtype) and not needs_i8_conversion(rk.dtype):
                raise ValueError(msg)
            elif not needs_i8_conversion(lk.dtype) and needs_i8_conversion(rk.dtype):
                raise ValueError(msg)
            elif isinstance(lk.dtype, DatetimeTZDtype) and not isinstance(
                rk.dtype, DatetimeTZDtype
            ):
                raise ValueError(msg)
            elif not isinstance(lk.dtype, DatetimeTZDtype) and isinstance(
                rk.dtype, DatetimeTZDtype
            ):
                raise ValueError(msg)
            elif (
                isinstance(lk.dtype, DatetimeTZDtype)
                and isinstance(rk.dtype, DatetimeTZDtype)
            ) or (lk.dtype.kind == "M" and rk.dtype.kind == "M"):
                # allows datetime with different resolutions
                continue
            # datetime and timedelta not allowed
            elif lk.dtype.kind == "M" and rk.dtype.kind == "m":
                raise ValueError(msg)
            elif lk.dtype.kind == "m" and rk.dtype.kind == "M":
                raise ValueError(msg)

            elif is_object_dtype(lk.dtype) and is_object_dtype(rk.dtype):
                continue

            # Houston, we have a problem!
            # let's coerce to object if the dtypes aren't
            # categorical, otherwise coerce to the category
            # dtype. If we coerced categories to object,
            # then we would lose type information on some
            # columns, and end up trying to merge
            # incompatible dtypes. See GH 16900.
            if name in self.left.columns:
                typ = cast(Categorical, lk).categories.dtype if lk_is_cat else object
                self.left = self.left.copy()
                self.left[name] = self.left[name].astype(typ)
            if name in self.right.columns:
                typ = cast(Categorical, rk).categories.dtype if rk_is_cat else object
                self.right = self.right.copy()
                self.right[name] = self.right[name].astype(typ)

    def _validate_left_right_on(self, left_on, right_on):
        left_on = com.maybe_make_list(left_on)
        right_on = com.maybe_make_list(right_on)

        # Hm, any way to make this logic less complicated??
        if self.on is None and left_on is None and right_on is None:
            if self.left_index and self.right_index:
                left_on, right_on = (), ()
            elif self.left_index:
                raise MergeError("Must pass right_on or right_index=True")
            elif self.right_index:
                raise MergeError("Must pass left_on or left_index=True")
            else:
                # use the common columns
                left_cols = self.left.columns
                right_cols = self.right.columns
                common_cols = left_cols.intersection(right_cols)
                if len(common_cols) == 0:
                    raise MergeError(
                        "No common columns to perform merge on. "
                        f"Merge options: left_on={left_on}, "
                        f"right_on={right_on}, "
                        f"left_index={self.left_index}, "
                        f"right_index={self.right_index}"
                    )
                if (
                    not left_cols.join(common_cols, how="inner").is_unique
                    or not right_cols.join(common_cols, how="inner").is_unique
                ):
                    raise MergeError(f"Data columns not unique: {repr(common_cols)}")
                left_on = right_on = common_cols
        elif self.on is not None:
            if left_on is not None or right_on is not None:
                raise MergeError(
                    'Can only pass argument "on" OR "left_on" '
                    'and "right_on", not a combination of both.'
                )
            if self.left_index or self.right_index:
                raise MergeError(
                    'Can only pass argument "on" OR "left_index" '
                    'and "right_index", not a combination of both.'
                )
            left_on = right_on = self.on
        elif left_on is not None:
            if self.left_index:
                raise MergeError(
                    'Can only pass argument "left_on" OR "left_index" not both.'
                )
            if not self.right_index and right_on is None:
                raise MergeError('Must pass "right_on" OR "right_index".')
            n = len(left_on)
            if self.right_index:
                if len(left_on) != self.right.index.nlevels:
                    raise ValueError(
                        "len(left_on) must equal the number "
                        'of levels in the index of "right"'
                    )
                right_on = [None] * n
        elif right_on is not None:
            if self.right_index:
                raise MergeError(
                    'Can only pass argument "right_on" OR "right_index" not both.'
                )
            if not self.left_index and left_on is None:
                raise MergeError('Must pass "left_on" OR "left_index".')
            n = len(right_on)
            if self.left_index:
                if len(right_on) != self.left.index.nlevels:
                    raise ValueError(
                        "len(right_on) must equal the number "
                        'of levels in the index of "left"'
                    )
                left_on = [None] * n
        if len(right_on) != len(left_on):
            raise ValueError("len(right_on) must equal len(left_on)")

        return left_on, right_on

    @final
    def _validate_validate_kwd(self, validate: str) -> None:
        # Check uniqueness of each
        if self.left_index:
            left_unique = self.orig_left.index.is_unique
        else:
            left_unique = MultiIndex.from_arrays(self.left_join_keys).is_unique

        if self.right_index:
            right_unique = self.orig_right.index.is_unique
        else:
            right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique

        # Check data integrity
        if validate in ["one_to_one", "1:1"]:
            if not left_unique and not right_unique:
                raise MergeError(
                    "Merge keys are not unique in either left "
                    "or right dataset; not a one-to-one merge"
                )
            if not left_unique:
                raise MergeError(
                    "Merge keys are not unique in left dataset; not a one-to-one merge"
                )
            if not right_unique:
                raise MergeError(
                    "Merge keys are not unique in right dataset; not a one-to-one merge"
                )

        elif validate in ["one_to_many", "1:m"]:
            if not left_unique:
                raise MergeError(
                    "Merge keys are not unique in left dataset; not a one-to-many merge"
                )

        elif validate in ["many_to_one", "m:1"]:
            if not right_unique:
                raise MergeError(
                    "Merge keys are not unique in right dataset; "
                    "not a many-to-one merge"
                )

        elif validate in ["many_to_many", "m:m"]:
            pass

        else:
            raise ValueError(
                f'"{validate}" is not a valid argument. '
                "Valid arguments are:\n"
                '- "1:1"\n'
                '- "1:m"\n'
                '- "m:1"\n'
                '- "m:m"\n'
                '- "one_to_one"\n'
                '- "one_to_many"\n'
                '- "many_to_one"\n'
                '- "many_to_many"'
            )


def get_join_indexers(
    left_keys: list[ArrayLike],
    right_keys: list[ArrayLike],
    sort: bool = False,
    how: JoinHow = "inner",
) -> tuple[npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
1698 """
1699
    Parameters
    ----------
    left_keys : list[ndarray, ExtensionArray, Index, Series]
    right_keys : list[ndarray, ExtensionArray, Index, Series]
    sort : bool, default False
    how : {'inner', 'outer', 'left', 'right'}, default 'inner'

    Returns
    -------
    np.ndarray[np.intp] or None
        Indexer into the left_keys.
    np.ndarray[np.intp] or None
        Indexer into the right_keys.
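
    Examples
    --------
    Illustrative only (``None`` means "take all rows in order"):

    >>> left_keys = [np.array([0, 1, 2], dtype=np.int64)]
    >>> right_keys = [np.array([1, 2, 3], dtype=np.int64)]
    >>> get_join_indexers(left_keys, right_keys, how="inner")  # doctest: +SKIP
    (array([1, 2]), array([0, 1]))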
1713 """
1714 assert len(left_keys) == len(
1715 right_keys
1716 ), "left_keys and right_keys must be the same length"
1717
1718 # fast-path for empty left/right
1719 left_n = len(left_keys[0])
1720 right_n = len(right_keys[0])
1721 if left_n == 0:
1722 if how in ["left", "inner"]:
1723 return _get_empty_indexer()
1724 elif not sort and how in ["right", "outer"]:
1725 return _get_no_sort_one_missing_indexer(right_n, True)
1726 elif right_n == 0:
1727 if how in ["right", "inner"]:
1728 return _get_empty_indexer()
1729 elif not sort and how in ["left", "outer"]:
1730 return _get_no_sort_one_missing_indexer(left_n, False)
1731
1732 lkey: ArrayLike
1733 rkey: ArrayLike
1734 if len(left_keys) > 1:
1735 # get left & right join labels and num. of levels at each location
1736 mapped = (
1737 _factorize_keys(left_keys[n], right_keys[n], sort=sort)
1738 for n in range(len(left_keys))
1739 )
1740 zipped = zip(*mapped)
1741 llab, rlab, shape = (list(x) for x in zipped)
1742
1743 # get flat i8 keys from label lists
1744 lkey, rkey = _get_join_keys(llab, rlab, tuple(shape), sort)
1745 else:
1746 lkey = left_keys[0]
1747 rkey = right_keys[0]
1748
1749 left = Index(lkey)
1750 right = Index(rkey)
1751
1752 if (
1753 left.is_monotonic_increasing
1754 and right.is_monotonic_increasing
1755 and (left.is_unique or right.is_unique)
1756 ):
1757 _, lidx, ridx = left.join(right, how=how, return_indexers=True, sort=sort)
1758 else:
1759 lidx, ridx = get_join_indexers_non_unique(
1760 left._values, right._values, sort, how
1761 )
1762
1763 if lidx is not None and is_range_indexer(lidx, len(left)):
1764 lidx = None
1765 if ridx is not None and is_range_indexer(ridx, len(right)):
1766 ridx = None
1767 return lidx, ridx
1768
1769
1770def get_join_indexers_non_unique(
1771 left: ArrayLike,
1772 right: ArrayLike,
1773 sort: bool = False,
1774 how: JoinHow = "inner",
1775) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
1776 """
1777 Get join indexers for left and right.
1778
1779 Parameters
1780 ----------
1781 left : ArrayLike
1782 right : ArrayLike
1783 sort : bool, default False
1784 how : {'inner', 'outer', 'left', 'right'}, default 'inner'
1785
1786 Returns
1787 -------
1788 np.ndarray[np.intp]
1789 Indexer into left.
1790 np.ndarray[np.intp]
1791 Indexer into right.
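
    Examples
    --------
    Illustrative only (repeated keys fan out to all matching pairs):

    >>> left = np.array([1, 1, 2], dtype=np.int64)
    >>> right = np.array([1, 2, 2], dtype=np.int64)
    >>> get_join_indexers_non_unique(left, right, how="inner")  # doctest: +SKIP
    (array([0, 1, 2, 2]), array([0, 0, 1, 2]))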
1792 """
1793 lkey, rkey, count = _factorize_keys(left, right, sort=sort)
1794 if how == "left":
1795 lidx, ridx = libjoin.left_outer_join(lkey, rkey, count, sort=sort)
1796 elif how == "right":
1797 ridx, lidx = libjoin.left_outer_join(rkey, lkey, count, sort=sort)
1798 elif how == "inner":
1799 lidx, ridx = libjoin.inner_join(lkey, rkey, count, sort=sort)
1800 elif how == "outer":
1801 lidx, ridx = libjoin.full_outer_join(lkey, rkey, count)
1802 return lidx, ridx
1803
1804
1805def restore_dropped_levels_multijoin(
1806 left: MultiIndex,
1807 right: MultiIndex,
1808 dropped_level_names,
1809 join_index: Index,
1810 lindexer: npt.NDArray[np.intp],
1811 rindexer: npt.NDArray[np.intp],
1812) -> tuple[FrozenList, FrozenList, FrozenList]:
1813 """
1814 *this is an internal non-public method*
1815
1816 Returns the levels, labels and names of a multi-index to multi-index join.
1817 Depending on the type of join, this method restores the appropriate
1818 dropped levels of the joined multi-index.
1819 The method relies on lindexer, rindexer which hold the index positions of
1820 left and right, where a join was feasible
1821
1822 Parameters
1823 ----------
1824 left : MultiIndex
1825 left index
1826 right : MultiIndex
1827 right index
1828 dropped_level_names : str array
1829 list of non-common level names
1830 join_index : Index
1831 the index of the join between the
1832 common levels of left and right
1833 lindexer : np.ndarray[np.intp]
1834 left indexer
1835 rindexer : np.ndarray[np.intp]
1836 right indexer
1837
1838 Returns
1839 -------
1840 levels : list of Index
1841 levels of combined multiindexes
1842 labels : np.ndarray[np.intp]
1843 labels of combined multiindexes
1844 names : List[Hashable]
1845 names of combined multiindex levels
1846
1847 """
1848
1849 def _convert_to_multiindex(index: Index) -> MultiIndex:
1850 if isinstance(index, MultiIndex):
1851 return index
1852 else:
1853 return MultiIndex.from_arrays([index._values], names=[index.name])
1854
    # For multi-multi joins with one overlapping level,
    # the returned index is of type Index
    # Ensure that join_index is of type MultiIndex
    # so that dropped levels can be appended
1859 join_index = _convert_to_multiindex(join_index)
1860
1861 join_levels = join_index.levels
1862 join_codes = join_index.codes
1863 join_names = join_index.names
1864
1865 # Iterate through the levels that must be restored
1866 for dropped_level_name in dropped_level_names:
1867 if dropped_level_name in left.names:
1868 idx = left
1869 indexer = lindexer
1870 else:
1871 idx = right
1872 indexer = rindexer
1873
1874 # The index of the level name to be restored
1875 name_idx = idx.names.index(dropped_level_name)
1876
1877 restore_levels = idx.levels[name_idx]
1878 # Inject -1 in the codes list where a join was not possible
1879 # IOW indexer[i]=-1
1880 codes = idx.codes[name_idx]
1881 if indexer is None:
1882 restore_codes = codes
1883 else:
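            # indexer entries of -1 (no match on that side) take the
            # fill_value, e.g. codes=[0, 1], indexer=[0, -1, 1] -> [0, -1, 1]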
1884 restore_codes = algos.take_nd(codes, indexer, fill_value=-1)
1885
1886 # error: Cannot determine type of "__add__"
1887 join_levels = join_levels + [restore_levels] # type: ignore[has-type]
1888 join_codes = join_codes + [restore_codes] # type: ignore[has-type]
1889 join_names = join_names + [dropped_level_name]
1890
1891 return join_levels, join_codes, join_names
1892
1893
1894class _OrderedMerge(_MergeOperation):
1895 _merge_type = "ordered_merge"
1896
1897 def __init__(
1898 self,
1899 left: DataFrame | Series,
1900 right: DataFrame | Series,
1901 on: IndexLabel | None = None,
1902 left_on: IndexLabel | None = None,
1903 right_on: IndexLabel | None = None,
1904 left_index: bool = False,
1905 right_index: bool = False,
1906 suffixes: Suffixes = ("_x", "_y"),
1907 fill_method: str | None = None,
1908 how: JoinHow | Literal["asof"] = "outer",
1909 ) -> None:
1910 self.fill_method = fill_method
1911 _MergeOperation.__init__(
1912 self,
1913 left,
1914 right,
1915 on=on,
1916 left_on=left_on,
1917 left_index=left_index,
1918 right_index=right_index,
1919 right_on=right_on,
1920 how=how,
1921 suffixes=suffixes,
1922 sort=True, # factorize sorts
1923 )
1924
1925 def get_result(self, copy: bool | None = True) -> DataFrame:
1926 join_index, left_indexer, right_indexer = self._get_join_info()
1927
1928 left_join_indexer: npt.NDArray[np.intp] | None
1929 right_join_indexer: npt.NDArray[np.intp] | None
1930
1931 if self.fill_method == "ffill":
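            # ffill_indexer forward-fills -1 gaps with the last valid
            # position, e.g. [0, -1, 1] -> [0, 0, 1], so unmatched rows reuse
            # the previous match instead of producing NA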
1932 if left_indexer is None:
1933 left_join_indexer = None
1934 else:
1935 left_join_indexer = libjoin.ffill_indexer(left_indexer)
1936 if right_indexer is None:
1937 right_join_indexer = None
1938 else:
1939 right_join_indexer = libjoin.ffill_indexer(right_indexer)
1940 elif self.fill_method is None:
1941 left_join_indexer = left_indexer
1942 right_join_indexer = right_indexer
1943 else:
1944 raise ValueError("fill_method must be 'ffill' or None")
1945
1946 result = self._reindex_and_concat(
1947 join_index, left_join_indexer, right_join_indexer, copy=copy
1948 )
1949 self._maybe_add_join_keys(result, left_indexer, right_indexer)
1950
1951 return result
1952
1953
1954def _asof_by_function(direction: str):
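    # e.g. direction="backward" resolves to
    # libjoin.asof_join_backward_on_X_by_Y; None is returned if no such
    # function exists for the given direction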
1955 name = f"asof_join_{direction}_on_X_by_Y"
1956 return getattr(libjoin, name, None)
1957
1958
1959class _AsOfMerge(_OrderedMerge):
1960 _merge_type = "asof_merge"
1961
1962 def __init__(
1963 self,
1964 left: DataFrame | Series,
1965 right: DataFrame | Series,
1966 on: IndexLabel | None = None,
1967 left_on: IndexLabel | None = None,
1968 right_on: IndexLabel | None = None,
1969 left_index: bool = False,
1970 right_index: bool = False,
1971 by=None,
1972 left_by=None,
1973 right_by=None,
1974 suffixes: Suffixes = ("_x", "_y"),
1975 how: Literal["asof"] = "asof",
1976 tolerance=None,
1977 allow_exact_matches: bool = True,
1978 direction: str = "backward",
1979 ) -> None:
1980 self.by = by
1981 self.left_by = left_by
1982 self.right_by = right_by
1983 self.tolerance = tolerance
1984 self.allow_exact_matches = allow_exact_matches
1985 self.direction = direction
1986
1987 # check 'direction' is valid
1988 if self.direction not in ["backward", "forward", "nearest"]:
1989 raise MergeError(f"direction invalid: {self.direction}")
1990
1991 # validate allow_exact_matches
1992 if not is_bool(self.allow_exact_matches):
1993 msg = (
1994 "allow_exact_matches must be boolean, "
1995 f"passed {self.allow_exact_matches}"
1996 )
1997 raise MergeError(msg)
1998
1999 _OrderedMerge.__init__(
2000 self,
2001 left,
2002 right,
2003 on=on,
2004 left_on=left_on,
2005 right_on=right_on,
2006 left_index=left_index,
2007 right_index=right_index,
2008 how=how,
2009 suffixes=suffixes,
2010 fill_method=None,
2011 )
2012
2013 def _validate_left_right_on(self, left_on, right_on):
2014 left_on, right_on = super()._validate_left_right_on(left_on, right_on)
2015
        # we only allow 'on' to be a single item
2017 if len(left_on) != 1 and not self.left_index:
2018 raise MergeError("can only asof on a key for left")
2019
2020 if len(right_on) != 1 and not self.right_index:
2021 raise MergeError("can only asof on a key for right")
2022
2023 if self.left_index and isinstance(self.left.index, MultiIndex):
2024 raise MergeError("left can only have one index")
2025
2026 if self.right_index and isinstance(self.right.index, MultiIndex):
2027 raise MergeError("right can only have one index")
2028
2029 # set 'by' columns
2030 if self.by is not None:
2031 if self.left_by is not None or self.right_by is not None:
2032 raise MergeError("Can only pass by OR left_by and right_by")
2033 self.left_by = self.right_by = self.by
2034 if self.left_by is None and self.right_by is not None:
2035 raise MergeError("missing left_by")
2036 if self.left_by is not None and self.right_by is None:
2037 raise MergeError("missing right_by")
2038
2039 # GH#29130 Check that merge keys do not have dtype object
2040 if not self.left_index:
2041 left_on_0 = left_on[0]
2042 if isinstance(left_on_0, _known):
2043 lo_dtype = left_on_0.dtype
2044 else:
2045 lo_dtype = (
2046 self.left._get_label_or_level_values(left_on_0).dtype
2047 if left_on_0 in self.left.columns
                    else self.left.index.get_level_values(left_on_0).dtype
2049 )
2050 else:
2051 lo_dtype = self.left.index.dtype
2052
2053 if not self.right_index:
2054 right_on_0 = right_on[0]
2055 if isinstance(right_on_0, _known):
2056 ro_dtype = right_on_0.dtype
2057 else:
2058 ro_dtype = (
2059 self.right._get_label_or_level_values(right_on_0).dtype
2060 if right_on_0 in self.right.columns
                    else self.right.index.get_level_values(right_on_0).dtype
2062 )
2063 else:
2064 ro_dtype = self.right.index.dtype
2065
2066 if (
2067 is_object_dtype(lo_dtype)
2068 or is_object_dtype(ro_dtype)
2069 or is_string_dtype(lo_dtype)
2070 or is_string_dtype(ro_dtype)
2071 ):
2072 raise MergeError(
2073 f"Incompatible merge dtype, {repr(ro_dtype)} and "
2074 f"{repr(lo_dtype)}, both sides must have numeric dtype"
2075 )
2076
2077 # add 'by' to our key-list so we can have it in the
2078 # output as a key
2079 if self.left_by is not None:
2080 if not is_list_like(self.left_by):
2081 self.left_by = [self.left_by]
2082 if not is_list_like(self.right_by):
2083 self.right_by = [self.right_by]
2084
2085 if len(self.left_by) != len(self.right_by):
2086 raise MergeError("left_by and right_by must be the same length")
2087
2088 left_on = self.left_by + list(left_on)
2089 right_on = self.right_by + list(right_on)
2090
2091 return left_on, right_on
2092
2093 def _maybe_require_matching_dtypes(
2094 self, left_join_keys: list[ArrayLike], right_join_keys: list[ArrayLike]
2095 ) -> None:
2096 # TODO: why do we do this for AsOfMerge but not the others?
2097
2098 def _check_dtype_match(left: ArrayLike, right: ArrayLike, i: int):
2099 if left.dtype != right.dtype:
2100 if isinstance(left.dtype, CategoricalDtype) and isinstance(
2101 right.dtype, CategoricalDtype
2102 ):
2103 # The generic error message is confusing for categoricals.
2104 #
2105 # In this function, the join keys include both the original
2106 # ones of the merge_asof() call, and also the keys passed
2107 # to its by= argument. Unordered but equal categories
2108 # are not supported for the former, but will fail
2109 # later with a ValueError, so we don't *need* to check
2110 # for them here.
2111 msg = (
2112 f"incompatible merge keys [{i}] {repr(left.dtype)} and "
2113 f"{repr(right.dtype)}, both sides category, but not equal ones"
2114 )
2115 else:
2116 msg = (
2117 f"incompatible merge keys [{i}] {repr(left.dtype)} and "
2118 f"{repr(right.dtype)}, must be the same type"
2119 )
2120 raise MergeError(msg)
2121
2122 # validate index types are the same
2123 for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)):
2124 _check_dtype_match(lk, rk, i)
2125
2126 if self.left_index:
2127 lt = self.left.index._values
2128 else:
2129 lt = left_join_keys[-1]
2130
2131 if self.right_index:
2132 rt = self.right.index._values
2133 else:
2134 rt = right_join_keys[-1]
2135
2136 _check_dtype_match(lt, rt, 0)
2137
2138 def _validate_tolerance(self, left_join_keys: list[ArrayLike]) -> None:
2139 # validate tolerance; datetime.timedelta or Timedelta if we have a DTI
2140 if self.tolerance is not None:
2141 if self.left_index:
2142 lt = self.left.index._values
2143 else:
2144 lt = left_join_keys[-1]
2145
2146 msg = (
2147 f"incompatible tolerance {self.tolerance}, must be compat "
2148 f"with type {repr(lt.dtype)}"
2149 )
2150
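            # the tolerance must match the key dtype's family: a Timedelta
            # for datetime-like keys, an int for integer keys, any real
            # number for float keys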
2151 if needs_i8_conversion(lt.dtype) or (
2152 isinstance(lt, ArrowExtensionArray) and lt.dtype.kind in "mM"
2153 ):
2154 if not isinstance(self.tolerance, datetime.timedelta):
2155 raise MergeError(msg)
2156 if self.tolerance < Timedelta(0):
2157 raise MergeError("tolerance must be positive")
2158
2159 elif is_integer_dtype(lt.dtype):
2160 if not is_integer(self.tolerance):
2161 raise MergeError(msg)
2162 if self.tolerance < 0:
2163 raise MergeError("tolerance must be positive")
2164
2165 elif is_float_dtype(lt.dtype):
2166 if not is_number(self.tolerance):
2167 raise MergeError(msg)
2168 # error: Unsupported operand types for > ("int" and "Number")
2169 if self.tolerance < 0: # type: ignore[operator]
2170 raise MergeError("tolerance must be positive")
2171
2172 else:
2173 raise MergeError("key must be integer, timestamp or float")
2174
2175 def _convert_values_for_libjoin(
2176 self, values: AnyArrayLike, side: str
2177 ) -> np.ndarray:
2178 # we require sortedness and non-null values in the join keys
2179 if not Index(values).is_monotonic_increasing:
2180 if isna(values).any():
2181 raise ValueError(f"Merge keys contain null values on {side} side")
2182 raise ValueError(f"{side} keys must be sorted")
2183
2184 if isinstance(values, ArrowExtensionArray):
2185 values = values._maybe_convert_datelike_array()
2186
2187 if needs_i8_conversion(values.dtype):
2188 values = values.view("i8")
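            # e.g. datetime64[ns] values are now raw int64 nanoseconds,
            # which the libjoin asof routines compare directly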
2189
2190 elif isinstance(values, BaseMaskedArray):
2191 # we've verified above that no nulls exist
2192 values = values._data
2193 elif isinstance(values, ExtensionArray):
2194 values = values.to_numpy()
2195
2196 # error: Incompatible return value type (got "Union[ExtensionArray,
2197 # Any, ndarray[Any, Any], ndarray[Any, dtype[Any]], Index, Series]",
2198 # expected "ndarray[Any, Any]")
2199 return values # type: ignore[return-value]
2200
2201 def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
2202 """return the join indexers"""
2203
2204 # values to compare
2205 left_values = (
2206 self.left.index._values if self.left_index else self.left_join_keys[-1]
2207 )
2208 right_values = (
2209 self.right.index._values if self.right_index else self.right_join_keys[-1]
2210 )
2211
2212 # _maybe_require_matching_dtypes already checked for dtype matching
2213 assert left_values.dtype == right_values.dtype
2214
2215 tolerance = self.tolerance
2216 if tolerance is not None:
2217 # TODO: can we reuse a tolerance-conversion function from
2218 # e.g. TimedeltaIndex?
2219 if needs_i8_conversion(left_values.dtype) or (
2220 isinstance(left_values, ArrowExtensionArray)
2221 and left_values.dtype.kind in "mM"
2222 ):
2223 tolerance = Timedelta(tolerance)
2224 # TODO: we have no test cases with PeriodDtype here; probably
2225 # need to adjust tolerance for that case.
2226 if left_values.dtype.kind in "mM":
2227 # Make sure the i8 representation for tolerance
2228 # matches that for left_values/right_values.
2229 if isinstance(left_values, ArrowExtensionArray):
2230 unit = left_values.dtype.pyarrow_dtype.unit
2231 else:
2232 unit = ensure_wrapped_if_datetimelike(left_values).unit
2233 tolerance = tolerance.as_unit(unit)
2234
2235 tolerance = tolerance._value
2236
2237 # initial type conversion as needed
2238 left_values = self._convert_values_for_libjoin(left_values, "left")
2239 right_values = self._convert_values_for_libjoin(right_values, "right")
2240
2241 # a "by" parameter requires special handling
2242 if self.left_by is not None:
2243 # remove 'on' parameter from values if one existed
2244 if self.left_index and self.right_index:
2245 left_join_keys = self.left_join_keys
2246 right_join_keys = self.right_join_keys
2247 else:
2248 left_join_keys = self.left_join_keys[0:-1]
2249 right_join_keys = self.right_join_keys[0:-1]
2250
2251 mapped = [
2252 _factorize_keys(
2253 left_join_keys[n],
2254 right_join_keys[n],
2255 sort=False,
2256 )
2257 for n in range(len(left_join_keys))
2258 ]
2259
2260 if len(left_join_keys) == 1:
2261 left_by_values = mapped[0][0]
2262 right_by_values = mapped[0][1]
2263 else:
2264 arrs = [np.concatenate(m[:2]) for m in mapped]
2265 shape = tuple(m[2] for m in mapped)
2266 group_index = get_group_index(
2267 arrs, shape=shape, sort=False, xnull=False
2268 )
2269 left_len = len(left_join_keys[0])
2270 left_by_values = group_index[:left_len]
2271 right_by_values = group_index[left_len:]
2272
2273 left_by_values = ensure_int64(left_by_values)
2274 right_by_values = ensure_int64(right_by_values)
2275
2276 # choose appropriate function by type
2277 func = _asof_by_function(self.direction)
2278 return func(
2279 left_values,
2280 right_values,
2281 left_by_values,
2282 right_by_values,
2283 self.allow_exact_matches,
2284 tolerance,
2285 )
2286 else:
2287 # choose appropriate function by type
2288 func = _asof_by_function(self.direction)
2289 return func(
2290 left_values,
2291 right_values,
2292 None,
2293 None,
2294 self.allow_exact_matches,
2295 tolerance,
2296 False,
2297 )
2298
2299
2300def _get_multiindex_indexer(
2301 join_keys: list[ArrayLike], index: MultiIndex, sort: bool
2302) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
    # left & right join codes and the number of unique keys at each level
2304 mapped = (
2305 _factorize_keys(index.levels[n]._values, join_keys[n], sort=sort)
2306 for n in range(index.nlevels)
2307 )
2308 zipped = zip(*mapped)
2309 rcodes, lcodes, shape = (list(x) for x in zipped)
2310 if sort:
2311 rcodes = list(map(np.take, rcodes, index.codes))
2312 else:
2313 i8copy = lambda a: a.astype("i8", subok=False, copy=True)
2314 rcodes = list(map(i8copy, index.codes))
2315
2316 # fix right labels if there were any nulls
2317 for i, join_key in enumerate(join_keys):
2318 mask = index.codes[i] == -1
2319 if mask.any():
            # check whether there were already any nulls at this level;
            # if there were, they were factorized to `shape[i] - 1`
2322 a = join_key[lcodes[i] == shape[i] - 1]
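            # "not a[0] != a[0]" is True unless a[0] is NaN; if no null was
            # factorized at this level yet, grow shape to make room for one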
2323 if a.size == 0 or not a[0] != a[0]:
2324 shape[i] += 1
2325
2326 rcodes[i][mask] = shape[i] - 1
2327
2328 # get flat i8 join keys
2329 lkey, rkey = _get_join_keys(lcodes, rcodes, tuple(shape), sort)
2330 return lkey, rkey
2331
2332
2333def _get_empty_indexer() -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
2334 """Return empty join indexers."""
2335 return (
2336 np.array([], dtype=np.intp),
2337 np.array([], dtype=np.intp),
2338 )
2339
2340
2341def _get_no_sort_one_missing_indexer(
2342 n: int, left_missing: bool
2343) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
2344 """
    Return join indexers where one side is selected in full, without
    sorting, and the other side is entirely unmatched (all -1).
2347
2348 Parameters
2349 ----------
2350 n : int
2351 Length of indexers to create.
2352 left_missing : bool
2353 If True, the left indexer will contain only -1's.
2354 If False, the right indexer will contain only -1's.
2355
2356 Returns
2357 -------
2358 np.ndarray[np.intp]
2359 Left indexer
2360 np.ndarray[np.intp]
2361 Right indexer
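
    Examples
    --------
    >>> from pandas.core.reshape.merge import _get_no_sort_one_missing_indexer
    >>> _get_no_sort_one_missing_indexer(3, left_missing=True)
    (array([-1, -1, -1]), array([0, 1, 2]))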
2362 """
2363 idx = np.arange(n, dtype=np.intp)
2364 idx_missing = np.full(shape=n, fill_value=-1, dtype=np.intp)
2365 if left_missing:
2366 return idx_missing, idx
2367 return idx, idx_missing
2368
2369
2370def _left_join_on_index(
2371 left_ax: Index, right_ax: Index, join_keys: list[ArrayLike], sort: bool = False
2372) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp]]:
2373 if isinstance(right_ax, MultiIndex):
2374 lkey, rkey = _get_multiindex_indexer(join_keys, right_ax, sort=sort)
2375 else:
2376 # error: Incompatible types in assignment (expression has type
2377 # "Union[Union[ExtensionArray, ndarray[Any, Any]], Index, Series]",
2378 # variable has type "ndarray[Any, dtype[signedinteger[Any]]]")
2379 lkey = join_keys[0] # type: ignore[assignment]
2380 # error: Incompatible types in assignment (expression has type "Index",
2381 # variable has type "ndarray[Any, dtype[signedinteger[Any]]]")
2382 rkey = right_ax._values # type: ignore[assignment]
2383
2384 left_key, right_key, count = _factorize_keys(lkey, rkey, sort=sort)
2385 left_indexer, right_indexer = libjoin.left_outer_join(
2386 left_key, right_key, count, sort=sort
2387 )
2388
2389 if sort or len(left_ax) != len(left_indexer):
2390 # if asked to sort or there are 1-to-many matches
2391 join_index = left_ax.take(left_indexer)
2392 return join_index, left_indexer, right_indexer
2393
2394 # left frame preserves order & length of its index
2395 return left_ax, None, right_indexer
2396
2397
2398def _factorize_keys(
2399 lk: ArrayLike, rk: ArrayLike, sort: bool = True
2400) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
2401 """
2402 Encode left and right keys as enumerated types.
2403
2404 This is used to get the join indexers to be used when merging DataFrames.
2405
2406 Parameters
2407 ----------
2408 lk : ndarray, ExtensionArray
2409 Left key.
2410 rk : ndarray, ExtensionArray
2411 Right key.
    sort : bool, default True
2413 If True, the encoding is done such that the unique elements in the
2414 keys are sorted.
2415
2416 Returns
2417 -------
2418 np.ndarray[np.intp]
        Left labels, as enumerated type.
    np.ndarray[np.intp]
        Right labels, as enumerated type.
2422 int
2423 Number of unique elements in union of left and right labels.
2424
2425 See Also
2426 --------
2427 merge : Merge DataFrame or named Series objects
2428 with a database-style join.
2429 algorithms.factorize : Encode the object as an enumerated type
2430 or categorical variable.
2431
2432 Examples
2433 --------
2434 >>> lk = np.array(["a", "c", "b"])
2435 >>> rk = np.array(["a", "c"])
2436
2437 Here, the unique values are `'a', 'b', 'c'`. With the default
2438 `sort=True`, the encoding will be `{0: 'a', 1: 'b', 2: 'c'}`:
2439
2440 >>> pd.core.reshape.merge._factorize_keys(lk, rk)
2441 (array([0, 2, 1]), array([0, 2]), 3)
2442
    With `sort=False`, the encoding will correspond to the order
2444 in which the unique elements first appear: `{0: 'a', 1: 'c', 2: 'b'}`:
2445
2446 >>> pd.core.reshape.merge._factorize_keys(lk, rk, sort=False)
2447 (array([0, 1, 2]), array([0, 1]), 3)
2448 """
2449 # TODO: if either is a RangeIndex, we can likely factorize more efficiently?
2450
2451 if (
2452 isinstance(lk.dtype, DatetimeTZDtype) and isinstance(rk.dtype, DatetimeTZDtype)
2453 ) or (lib.is_np_dtype(lk.dtype, "M") and lib.is_np_dtype(rk.dtype, "M")):
2454 # Extract the ndarray (UTC-localized) values
        # Note: we don't need the dtypes to match, as these can still be compared
2456 lk, rk = cast("DatetimeArray", lk)._ensure_matching_resos(rk)
2457 lk = cast("DatetimeArray", lk)._ndarray
2458 rk = cast("DatetimeArray", rk)._ndarray
2459
2460 elif (
2461 isinstance(lk.dtype, CategoricalDtype)
2462 and isinstance(rk.dtype, CategoricalDtype)
2463 and lk.dtype == rk.dtype
2464 ):
2465 assert isinstance(lk, Categorical)
2466 assert isinstance(rk, Categorical)
2467 # Cast rk to encoding so we can compare codes with lk
2468
2469 rk = lk._encode_with_my_categories(rk)
2470
2471 lk = ensure_int64(lk.codes)
2472 rk = ensure_int64(rk.codes)
2473
2474 elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype:
2475 if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or (
2476 isinstance(lk.dtype, StringDtype)
2477 and lk.dtype.storage in ["pyarrow", "pyarrow_numpy"]
2478 ):
2479 import pyarrow as pa
2480 import pyarrow.compute as pc
2481
2482 len_lk = len(lk)
2483 lk = lk._pa_array # type: ignore[attr-defined]
2484 rk = rk._pa_array # type: ignore[union-attr]
2485 dc = (
2486 pa.chunked_array(lk.chunks + rk.chunks) # type: ignore[union-attr]
2487 .combine_chunks()
2488 .dictionary_encode()
2489 )
2490
2491 llab, rlab, count = (
2492 pc.fill_null(dc.indices[slice(len_lk)], -1)
2493 .to_numpy()
2494 .astype(np.intp, copy=False),
2495 pc.fill_null(dc.indices[slice(len_lk, None)], -1)
2496 .to_numpy()
2497 .astype(np.intp, copy=False),
2498 len(dc.dictionary),
2499 )
2500
2501 if sort:
2502 uniques = dc.dictionary.to_numpy(zero_copy_only=False)
2503 llab, rlab = _sort_labels(uniques, llab, rlab)
2504
2505 if dc.null_count > 0:
2506 lmask = llab == -1
2507 lany = lmask.any()
2508 rmask = rlab == -1
2509 rany = rmask.any()
2510 if lany:
2511 np.putmask(llab, lmask, count)
2512 if rany:
2513 np.putmask(rlab, rmask, count)
2514 count += 1
2515 return llab, rlab, count
2516
2517 if not isinstance(lk, BaseMaskedArray) and not (
2518 # exclude arrow dtypes that would get cast to object
2519 isinstance(lk.dtype, ArrowDtype)
2520 and (
2521 is_numeric_dtype(lk.dtype.numpy_dtype)
2522 or is_string_dtype(lk.dtype)
2523 and not sort
2524 )
2525 ):
2526 lk, _ = lk._values_for_factorize()
2527
2528 # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute
2529 # "_values_for_factorize"
2530 rk, _ = rk._values_for_factorize() # type: ignore[union-attr]
2531
2532 if needs_i8_conversion(lk.dtype) and lk.dtype == rk.dtype:
2533 # GH#23917 TODO: Needs tests for non-matching dtypes
2534 # GH#23917 TODO: needs tests for case where lk is integer-dtype
2535 # and rk is datetime-dtype
2536 lk = np.asarray(lk, dtype=np.int64)
2537 rk = np.asarray(rk, dtype=np.int64)
2538
2539 klass, lk, rk = _convert_arrays_and_get_rizer_klass(lk, rk)
2540
2541 rizer = klass(max(len(lk), len(rk)))
2542
2543 if isinstance(lk, BaseMaskedArray):
2544 assert isinstance(rk, BaseMaskedArray)
2545 llab = rizer.factorize(lk._data, mask=lk._mask)
2546 rlab = rizer.factorize(rk._data, mask=rk._mask)
2547 elif isinstance(lk, ArrowExtensionArray):
2548 assert isinstance(rk, ArrowExtensionArray)
2549 # we can only get here with numeric dtypes
2550 # TODO: Remove when we have a Factorizer for Arrow
2551 llab = rizer.factorize(
2552 lk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=lk.isna()
2553 )
2554 rlab = rizer.factorize(
2555 rk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=rk.isna()
2556 )
2557 else:
        # error: Argument 1 to "factorize" of "ObjectFactorizer" has
        # incompatible type "Union[ndarray[Any, dtype[signedinteger[_64Bit]]],
        # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]"
2561 llab = rizer.factorize(lk) # type: ignore[arg-type]
2562 rlab = rizer.factorize(rk) # type: ignore[arg-type]
2563 assert llab.dtype == np.dtype(np.intp), llab.dtype
2564 assert rlab.dtype == np.dtype(np.intp), rlab.dtype
2565
2566 count = rizer.get_count()
2567
2568 if sort:
2569 uniques = rizer.uniques.to_array()
2570 llab, rlab = _sort_labels(uniques, llab, rlab)
2571
2572 # NA group
2573 lmask = llab == -1
2574 lany = lmask.any()
2575 rmask = rlab == -1
2576 rany = rmask.any()
2577
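    # e.g. llab=[0, -1] becomes llab=[0, count]: -1 sentinels on either side
    # are remapped to one shared trailing group so NA keys match each other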
2578 if lany or rany:
2579 if lany:
2580 np.putmask(llab, lmask, count)
2581 if rany:
2582 np.putmask(rlab, rmask, count)
2583 count += 1
2584
2585 return llab, rlab, count
2586
2587
2588def _convert_arrays_and_get_rizer_klass(
2589 lk: ArrayLike, rk: ArrayLike
2590) -> tuple[type[libhashtable.Factorizer], ArrayLike, ArrayLike]:
2591 klass: type[libhashtable.Factorizer]
2592 if is_numeric_dtype(lk.dtype):
2593 if lk.dtype != rk.dtype:
2594 dtype = find_common_type([lk.dtype, rk.dtype])
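            # e.g. int64 vs. float64 keys promote to float64 so both sides
            # factorize through the same hashtable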
2595 if isinstance(dtype, ExtensionDtype):
2596 cls = dtype.construct_array_type()
2597 if not isinstance(lk, ExtensionArray):
2598 lk = cls._from_sequence(lk, dtype=dtype, copy=False)
2599 else:
2600 lk = lk.astype(dtype, copy=False)
2601
2602 if not isinstance(rk, ExtensionArray):
2603 rk = cls._from_sequence(rk, dtype=dtype, copy=False)
2604 else:
2605 rk = rk.astype(dtype, copy=False)
2606 else:
2607 lk = lk.astype(dtype, copy=False)
2608 rk = rk.astype(dtype, copy=False)
2609 if isinstance(lk, BaseMaskedArray):
            # error: Invalid index type "type" for "Dict[Type[object],
            # Type[Factorizer]]"; expected type "Type[object]"
2612 klass = _factorizers[lk.dtype.type] # type: ignore[index]
2613 elif isinstance(lk.dtype, ArrowDtype):
2614 klass = _factorizers[lk.dtype.numpy_dtype.type]
2615 else:
2616 klass = _factorizers[lk.dtype.type]
2617
2618 else:
2619 klass = libhashtable.ObjectFactorizer
2620 lk = ensure_object(lk)
2621 rk = ensure_object(rk)
2622 return klass, lk, rk
2623
2624
2625def _sort_labels(
2626 uniques: np.ndarray, left: npt.NDArray[np.intp], right: npt.NDArray[np.intp]
2627) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
2628 llength = len(left)
2629 labels = np.concatenate([left, right])
2630
2631 _, new_labels = algos.safe_sort(uniques, labels, use_na_sentinel=True)
2632 new_left, new_right = new_labels[:llength], new_labels[llength:]
2633
2634 return new_left, new_right
2635
2636
2637def _get_join_keys(
2638 llab: list[npt.NDArray[np.int64 | np.intp]],
2639 rlab: list[npt.NDArray[np.int64 | np.intp]],
2640 shape: Shape,
2641 sort: bool,
2642) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]:
2643 # how many levels can be done without overflow
2644 nlev = next(
2645 lev
2646 for lev in range(len(shape), 0, -1)
2647 if not is_int64_overflow_possible(shape[:lev])
2648 )
2649
2650 # get keys for the first `nlev` levels
2651 stride = np.prod(shape[1:nlev], dtype="i8")
2652 lkey = stride * llab[0].astype("i8", subok=False, copy=False)
2653 rkey = stride * rlab[0].astype("i8", subok=False, copy=False)
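    # mixed-radix packing: e.g. with shape=(3, 4) and nlev=2, stride starts
    # at 4, so per-level labels (2, 1) collapse into the single flat key
    # 2 * 4 + 1 == 9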
2654
2655 for i in range(1, nlev):
2656 with np.errstate(divide="ignore"):
2657 stride //= shape[i]
2658 lkey += llab[i] * stride
2659 rkey += rlab[i] * stride
2660
2661 if nlev == len(shape): # all done!
2662 return lkey, rkey
2663
2664 # densify current keys to avoid overflow
2665 lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)
2666
2667 llab = [lkey] + llab[nlev:]
2668 rlab = [rkey] + rlab[nlev:]
2669 shape = (count,) + shape[nlev:]
2670
2671 return _get_join_keys(llab, rlab, shape, sort)
2672
2673
2674def _should_fill(lname, rname) -> bool:
2675 if not isinstance(lname, str) or not isinstance(rname, str):
2676 return True
2677 return lname == rname
2678
2679
2680def _any(x) -> bool:
2681 return x is not None and com.any_not_none(*x)
2682
2683
2684def _validate_operand(obj: DataFrame | Series) -> DataFrame:
2685 if isinstance(obj, ABCDataFrame):
2686 return obj
2687 elif isinstance(obj, ABCSeries):
2688 if obj.name is None:
2689 raise ValueError("Cannot merge a Series without a name")
2690 return obj.to_frame()
2691 else:
2692 raise TypeError(
2693 f"Can only merge Series or DataFrame objects, a {type(obj)} was passed"
2694 )
2695
2696
2697def _items_overlap_with_suffix(
2698 left: Index, right: Index, suffixes: Suffixes
2699) -> tuple[Index, Index]:
2700 """
    Validate the type of ``suffixes``.

    If the two indices overlap, add suffixes to the overlapping entries.

    If the corresponding suffix is empty, the entry is simply converted to
    a string.
2706
2707 """
2708 if not is_list_like(suffixes, allow_sets=False) or isinstance(suffixes, dict):
2709 raise TypeError(
2710 f"Passing 'suffixes' as a {type(suffixes)}, is not supported. "
2711 "Provide 'suffixes' as a tuple instead."
2712 )
2713
2714 to_rename = left.intersection(right)
2715 if len(to_rename) == 0:
2716 return left, right
2717
2718 lsuffix, rsuffix = suffixes
2719
2720 if not lsuffix and not rsuffix:
2721 raise ValueError(f"columns overlap but no suffix specified: {to_rename}")
2722
2723 def renamer(x, suffix: str | None):
2724 """
        Rename a single label.

        If the label overlaps and the suffix is not None, append the
        suffix; otherwise, leave the label as-is.
2729
2730 Parameters
2731 ----------
2732 x : original column name
2733 suffix : str or None
2734
2735 Returns
2736 -------
2737 x : renamed column name
2738 """
2739 if x in to_rename and suffix is not None:
2740 return f"{x}{suffix}"
2741 return x
2742
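    # e.g. to_rename={"key"} with suffixes ("_x", "_y") renames the left
    # "key" to "key_x" and the right "key" to "key_y"; other labels pass
    # through unchanged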
2743 lrenamer = partial(renamer, suffix=lsuffix)
2744 rrenamer = partial(renamer, suffix=rsuffix)
2745
2746 llabels = left._transform_index(lrenamer)
2747 rlabels = right._transform_index(rrenamer)
2748
2749 dups = []
2750 if not llabels.is_unique:
2751 # Only warn when duplicates are caused because of suffixes, already duplicated
2752 # columns in origin should not warn
2753 dups = llabels[(llabels.duplicated()) & (~left.duplicated())].tolist()
2754 if not rlabels.is_unique:
2755 dups.extend(rlabels[(rlabels.duplicated()) & (~right.duplicated())].tolist())
2756 if dups:
2757 raise MergeError(
2758 f"Passing 'suffixes' which cause duplicate columns {set(dups)} is "
2759 f"not allowed.",
2760 )
2761
2762 return llabels, rlabels