1"""
2SQL-style merge routines
3"""
4from __future__ import annotations
5
6import copy as cp
7import datetime
8from functools import partial
9import string
10from typing import (
11 TYPE_CHECKING,
12 Hashable,
13 Literal,
14 Sequence,
15 cast,
16)
17import uuid
18import warnings
19
20import numpy as np
21
22from pandas._libs import (
23 Timedelta,
24 hashtable as libhashtable,
25 join as libjoin,
26 lib,
27)
28from pandas._libs.lib import is_range_indexer
29from pandas._typing import (
30 AnyArrayLike,
31 ArrayLike,
32 AxisInt,
33 DtypeObj,
34 IndexLabel,
35 JoinHow,
36 MergeHow,
37 Shape,
38 Suffixes,
39 npt,
40)
41from pandas.errors import MergeError
42from pandas.util._decorators import (
43 Appender,
44 Substitution,
45 cache_readonly,
46)
47from pandas.util._exceptions import find_stack_level
48
49from pandas.core.dtypes.base import ExtensionDtype
50from pandas.core.dtypes.cast import find_common_type
51from pandas.core.dtypes.common import (
52 ensure_float64,
53 ensure_int64,
54 ensure_object,
55 is_array_like,
56 is_bool,
57 is_bool_dtype,
58 is_categorical_dtype,
59 is_dtype_equal,
60 is_extension_array_dtype,
61 is_float_dtype,
62 is_integer,
63 is_integer_dtype,
64 is_list_like,
65 is_number,
66 is_numeric_dtype,
67 is_object_dtype,
68 needs_i8_conversion,
69)
70from pandas.core.dtypes.dtypes import DatetimeTZDtype
71from pandas.core.dtypes.generic import (
72 ABCDataFrame,
73 ABCSeries,
74)
75from pandas.core.dtypes.missing import (
76 isna,
77 na_value_for_dtype,
78)
79
80from pandas import (
81 ArrowDtype,
82 Categorical,
83 Index,
84 MultiIndex,
85 Series,
86)
87import pandas.core.algorithms as algos
88from pandas.core.arrays import (
89 ArrowExtensionArray,
90 BaseMaskedArray,
91 ExtensionArray,
92)
93from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
94import pandas.core.common as com
95from pandas.core.construction import (
96 ensure_wrapped_if_datetimelike,
97 extract_array,
98)
99from pandas.core.frame import _merge_doc
100from pandas.core.indexes.api import default_index
101from pandas.core.sorting import is_int64_overflow_possible
102
103if TYPE_CHECKING:
104 from pandas import DataFrame
105 from pandas.core import groupby
106 from pandas.core.arrays import DatetimeArray
107
108_factorizers = {
109 np.int64: libhashtable.Int64Factorizer,
110 np.longlong: libhashtable.Int64Factorizer,
111 np.int32: libhashtable.Int32Factorizer,
112 np.int16: libhashtable.Int16Factorizer,
113 np.int8: libhashtable.Int8Factorizer,
114 np.uint64: libhashtable.UInt64Factorizer,
115 np.uint32: libhashtable.UInt32Factorizer,
116 np.uint16: libhashtable.UInt16Factorizer,
117 np.uint8: libhashtable.UInt8Factorizer,
118 np.bool_: libhashtable.UInt8Factorizer,
119 np.float64: libhashtable.Float64Factorizer,
120 np.float32: libhashtable.Float32Factorizer,
121 np.complex64: libhashtable.Complex64Factorizer,
122 np.complex128: libhashtable.Complex128Factorizer,
123 np.object_: libhashtable.ObjectFactorizer,
124}
125
126# See https://github.com/pandas-dev/pandas/issues/52451
127if np.intc is not np.int32:
128 _factorizers[np.intc] = libhashtable.Int64Factorizer
129
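# Illustrative sketch of the factorizer contract (hypothetical values, not
# executed here): one instance assigns dense integer labels and remembers what
# it has already seen, so equal keys on both sides of a merge map to equal
# labels:
#   f = libhashtable.Int64Factorizer(3)
#   f.factorize(np.array([3, 1, 3], dtype=np.int64))  # -> [0, 1, 0]
#   f.factorize(np.array([1, 2], dtype=np.int64))     # -> [1, 2]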

@Substitution("\nleft : DataFrame or named Series")
@Appender(_merge_doc, indents=0)
def merge(
    left: DataFrame | Series,
    right: DataFrame | Series,
    how: MergeHow = "inner",
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_index: bool = False,
    right_index: bool = False,
    sort: bool = False,
    suffixes: Suffixes = ("_x", "_y"),
    copy: bool | None = None,
    indicator: str | bool = False,
    validate: str | None = None,
) -> DataFrame:
    op = _MergeOperation(
        left,
        right,
        how=how,
        on=on,
        left_on=left_on,
        right_on=right_on,
        left_index=left_index,
        right_index=right_index,
        sort=sort,
        suffixes=suffixes,
        indicator=indicator,
        validate=validate,
    )
    return op.get_result(copy=copy)
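
# A minimal usage sketch of the wrapper above (frames are hypothetical):
#   left = pd.DataFrame({"key": ["a", "b"], "lval": [1, 2]})
#   right = pd.DataFrame({"key": ["b", "c"], "rval": [3, 4]})
#   pd.merge(left, right, on="key", how="outer", indicator=True)
# returns one row per key in either frame, with a "_merge" column marking
# each row as left_only, right_only, or both.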


def _groupby_and_merge(by, left: DataFrame, right: DataFrame, merge_pieces):
    """
    groupby & merge; we are always performing a left-by type operation

    Parameters
    ----------
    by: field to group
    left: DataFrame
    right: DataFrame
    merge_pieces: function for merging
    """
    pieces = []
    if not isinstance(by, (list, tuple)):
        by = [by]

    lby = left.groupby(by, sort=False)
    rby: groupby.DataFrameGroupBy | None = None

    # if we can groupby the rhs
    # then we can get vastly better perf
    if all(item in right.columns for item in by):
        rby = right.groupby(by, sort=False)

    for key, lhs in lby.grouper.get_iterator(lby._selected_obj, axis=lby.axis):
        if rby is None:
            rhs = right
        else:
            try:
                rhs = right.take(rby.indices[key])
            except KeyError:
                # key doesn't exist in right
                lcols = lhs.columns.tolist()
                cols = lcols + [r for r in right.columns if r not in set(lcols)]
                merged = lhs.reindex(columns=cols)
                merged.index = range(len(merged))
                pieces.append(merged)
                continue

        merged = merge_pieces(lhs, rhs)

        # make sure join keys are in the merged
        # TODO, should merge_pieces do this?
        merged[by] = key

        pieces.append(merged)

    # preserve the original order
    # if we have a missing piece this can be reset
    from pandas.core.reshape.concat import concat

    result = concat(pieces, ignore_index=True)
    result = result.reindex(columns=pieces[0].columns, copy=False)
    return result, lby


def merge_ordered(
    left: DataFrame,
    right: DataFrame,
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_by=None,
    right_by=None,
    fill_method: str | None = None,
    suffixes: Suffixes = ("_x", "_y"),
    how: JoinHow = "outer",
) -> DataFrame:
    """
    Perform a merge for ordered data with optional filling/interpolation.

    Designed for ordered data like time series data. Optionally
    perform group-wise merge (see examples).

    Parameters
    ----------
    left : DataFrame or named Series
    right : DataFrame or named Series
    on : label or list
        Field names to join on. Must be found in both DataFrames.
    left_on : label or list, or array-like
        Field names to join on in left DataFrame. Can be a vector or list of
        vectors of the length of the DataFrame to use a particular vector as
        the join key instead of columns.
    right_on : label or list, or array-like
        Field names to join on in right DataFrame or vector/list of vectors per
        left_on docs.
    left_by : column name or list of column names
        Group left DataFrame by group columns and merge piece by piece with
        right DataFrame. Must be None if either left or right are a Series.
    right_by : column name or list of column names
        Group right DataFrame by group columns and merge piece by piece with
        left DataFrame. Must be None if either left or right are a Series.
    fill_method : {'ffill', None}, default None
        Interpolation method for data.
    suffixes : list-like, default is ("_x", "_y")
        A length-2 sequence where each element is optionally a string
        indicating the suffix to add to overlapping column names in
        `left` and `right` respectively. Pass a value of `None` instead
        of a string to indicate that the column name from `left` or
        `right` should be left as-is, with no suffix. At least one of the
        values must not be None.

    how : {'left', 'right', 'outer', 'inner'}, default 'outer'
        * left: use only keys from left frame (SQL: left outer join)
        * right: use only keys from right frame (SQL: right outer join)
        * outer: use union of keys from both frames (SQL: full outer join)
        * inner: use intersection of keys from both frames (SQL: inner join).

    Returns
    -------
    DataFrame
        The merged DataFrame output type will be the same as
        'left', if it is a subclass of DataFrame.

    See Also
    --------
    merge : Merge with a database-style join.
    merge_asof : Merge on nearest keys.

    Examples
    --------
    >>> df1 = pd.DataFrame(
    ...     {
    ...         "key": ["a", "c", "e", "a", "c", "e"],
    ...         "lvalue": [1, 2, 3, 1, 2, 3],
    ...         "group": ["a", "a", "a", "b", "b", "b"]
    ...     }
    ... )
    >>> df1
      key  lvalue group
    0   a       1     a
    1   c       2     a
    2   e       3     a
    3   a       1     b
    4   c       2     b
    5   e       3     b

    >>> df2 = pd.DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})
    >>> df2
      key  rvalue
    0   b       1
    1   c       2
    2   d       3

    >>> pd.merge_ordered(df1, df2, fill_method="ffill", left_by="group")
      key  lvalue group  rvalue
    0   a       1     a     NaN
    1   b       1     a     1.0
    2   c       2     a     2.0
    3   d       2     a     3.0
    4   e       3     a     3.0
    5   a       1     b     NaN
    6   b       1     b     1.0
    7   c       2     b     2.0
    8   d       2     b     3.0
    9   e       3     b     3.0
    """

    def _merger(x, y) -> DataFrame:
        # perform the ordered merge operation
        op = _OrderedMerge(
            x,
            y,
            on=on,
            left_on=left_on,
            right_on=right_on,
            suffixes=suffixes,
            fill_method=fill_method,
            how=how,
        )
        return op.get_result()

    if left_by is not None and right_by is not None:
        raise ValueError("Can only group either left or right frames")
    if left_by is not None:
        if isinstance(left_by, str):
            left_by = [left_by]
        check = set(left_by).difference(left.columns)
        if len(check) != 0:
            raise KeyError(f"{check} not found in left columns")
        result, _ = _groupby_and_merge(left_by, left, right, lambda x, y: _merger(x, y))
    elif right_by is not None:
        if isinstance(right_by, str):
            right_by = [right_by]
        check = set(right_by).difference(right.columns)
        if len(check) != 0:
            raise KeyError(f"{check} not found in right columns")
        result, _ = _groupby_and_merge(
            right_by, right, left, lambda x, y: _merger(y, x)
        )
    else:
        result = _merger(left, right)
    return result


def merge_asof(
    left: DataFrame | Series,
    right: DataFrame | Series,
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_index: bool = False,
    right_index: bool = False,
    by=None,
    left_by=None,
    right_by=None,
    suffixes: Suffixes = ("_x", "_y"),
    tolerance=None,
    allow_exact_matches: bool = True,
    direction: str = "backward",
) -> DataFrame:
    """
    Perform a merge by key distance.

    This is similar to a left-join except that we match on nearest
    key rather than equal keys. Both DataFrames must be sorted by the key.

    For each row in the left DataFrame:

      - A "backward" search selects the last row in the right DataFrame whose
        'on' key is less than or equal to the left's key.

      - A "forward" search selects the first row in the right DataFrame whose
        'on' key is greater than or equal to the left's key.

      - A "nearest" search selects the row in the right DataFrame whose 'on'
        key is closest in absolute distance to the left's key.

    The default is "backward" and matches the behavior of versions prior
    to 0.20.0, which had no direction parameter. The direction parameter
    was added in version 0.20.0 and introduces "forward" and "nearest".

    Optionally match on equivalent keys with 'by' before searching with 'on'.

    Parameters
    ----------
    left : DataFrame or named Series
    right : DataFrame or named Series
    on : label
        Field name to join on. Must be found in both DataFrames.
        The data MUST be ordered. Furthermore, this must be a numeric column,
        such as datetimelike, integer, or float. Either on or
        left_on/right_on must be given.
    left_on : label
        Field name to join on in left DataFrame.
    right_on : label
        Field name to join on in right DataFrame.
    left_index : bool
        Use the index of the left DataFrame as the join key.
    right_index : bool
        Use the index of the right DataFrame as the join key.
    by : column name or list of column names
        Match on these columns before performing merge operation.
    left_by : column name
        Field names to match on in the left DataFrame.
    right_by : column name
        Field names to match on in the right DataFrame.
    suffixes : 2-length sequence (tuple, list, ...)
        Suffix to apply to overlapping column names in the left and right
        side, respectively.
    tolerance : int or Timedelta, optional, default None
        Select asof tolerance within this range; must be compatible
        with the merge index.
    allow_exact_matches : bool, default True

        - If True, allow matching with the same 'on' value
          (i.e. less-than-or-equal-to / greater-than-or-equal-to)
        - If False, don't match the same 'on' value
          (i.e., strictly less-than / strictly greater-than).

    direction : 'backward' (default), 'forward', or 'nearest'
        Whether to search for prior, subsequent, or closest matches.

    Returns
    -------
    DataFrame

    See Also
    --------
    merge : Merge with a database-style join.
    merge_ordered : Merge with optional filling/interpolation.

    Examples
    --------
    >>> left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]})
    >>> left
        a left_val
    0   1        a
    1   5        b
    2  10        c

    >>> right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]})
    >>> right
       a  right_val
    0  1          1
    1  2          2
    2  3          3
    3  6          6
    4  7          7

    >>> pd.merge_asof(left, right, on="a")
        a left_val  right_val
    0   1        a          1
    1   5        b          3
    2  10        c          7

    >>> pd.merge_asof(left, right, on="a", allow_exact_matches=False)
        a left_val  right_val
    0   1        a        NaN
    1   5        b        3.0
    2  10        c        7.0

    >>> pd.merge_asof(left, right, on="a", direction="forward")
        a left_val  right_val
    0   1        a        1.0
    1   5        b        6.0
    2  10        c        NaN

    >>> pd.merge_asof(left, right, on="a", direction="nearest")
        a left_val  right_val
    0   1        a          1
    1   5        b          6
    2  10        c          7

    We can use indexed DataFrames as well.

    >>> left = pd.DataFrame({"left_val": ["a", "b", "c"]}, index=[1, 5, 10])
    >>> left
       left_val
    1         a
    5         b
    10        c

    >>> right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7]}, index=[1, 2, 3, 6, 7])
    >>> right
       right_val
    1          1
    2          2
    3          3
    6          6
    7          7

    >>> pd.merge_asof(left, right, left_index=True, right_index=True)
       left_val  right_val
    1         a          1
    5         b          3
    10        c          7

    Here is a real-world time-series example

    >>> quotes = pd.DataFrame(
    ...     {
    ...         "time": [
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.030"),
    ...             pd.Timestamp("2016-05-25 13:30:00.041"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.049"),
    ...             pd.Timestamp("2016-05-25 13:30:00.072"),
    ...             pd.Timestamp("2016-05-25 13:30:00.075")
    ...         ],
    ...         "ticker": [
    ...             "GOOG",
    ...             "MSFT",
    ...             "MSFT",
    ...             "MSFT",
    ...             "GOOG",
    ...             "AAPL",
    ...             "GOOG",
    ...             "MSFT"
    ...         ],
    ...         "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
    ...         "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]
    ...     }
    ... )
    >>> quotes
                         time ticker     bid     ask
    0 2016-05-25 13:30:00.023   GOOG  720.50  720.93
    1 2016-05-25 13:30:00.023   MSFT   51.95   51.96
    2 2016-05-25 13:30:00.030   MSFT   51.97   51.98
    3 2016-05-25 13:30:00.041   MSFT   51.99   52.00
    4 2016-05-25 13:30:00.048   GOOG  720.50  720.93
    5 2016-05-25 13:30:00.049   AAPL   97.99   98.01
    6 2016-05-25 13:30:00.072   GOOG  720.50  720.88
    7 2016-05-25 13:30:00.075   MSFT   52.01   52.03

    >>> trades = pd.DataFrame(
    ...     {
    ...         "time": [
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.038"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048")
    ...         ],
    ...         "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
    ...         "price": [51.95, 51.95, 720.77, 720.92, 98.0],
    ...         "quantity": [75, 155, 100, 100, 100]
    ...     }
    ... )
    >>> trades
                         time ticker   price  quantity
    0 2016-05-25 13:30:00.023   MSFT   51.95        75
    1 2016-05-25 13:30:00.038   MSFT   51.95       155
    2 2016-05-25 13:30:00.048   GOOG  720.77       100
    3 2016-05-25 13:30:00.048   GOOG  720.92       100
    4 2016-05-25 13:30:00.048   AAPL   98.00       100

    By default we are taking the asof of the quotes

    >>> pd.merge_asof(trades, quotes, on="time", by="ticker")
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155   51.97   51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 2ms between the quote time and the trade time

    >>> pd.merge_asof(
    ...     trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("2ms")
    ... )
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155     NaN     NaN
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 10ms between the quote time and the trade time
    and we exclude exact matches on time. However *prior* data will
    propagate forward

    >>> pd.merge_asof(
    ...     trades,
    ...     quotes,
    ...     on="time",
    ...     by="ticker",
    ...     tolerance=pd.Timedelta("10ms"),
    ...     allow_exact_matches=False
    ... )
                         time ticker   price  quantity    bid    ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75    NaN    NaN
    1 2016-05-25 13:30:00.038   MSFT   51.95       155  51.97  51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100    NaN    NaN
    3 2016-05-25 13:30:00.048   GOOG  720.92       100    NaN    NaN
    4 2016-05-25 13:30:00.048   AAPL   98.00       100    NaN    NaN
    """
    op = _AsOfMerge(
        left,
        right,
        on=on,
        left_on=left_on,
        right_on=right_on,
        left_index=left_index,
        right_index=right_index,
        by=by,
        left_by=left_by,
        right_by=right_by,
        suffixes=suffixes,
        how="asof",
        tolerance=tolerance,
        allow_exact_matches=allow_exact_matches,
        direction=direction,
    )
    return op.get_result()


# TODO: transformations??
# TODO: only copy DataFrames when modification necessary
class _MergeOperation:
    """
    Perform a database (SQL) merge operation between two DataFrame or Series
    objects using either columns as keys or their row indexes
    """

    _merge_type = "merge"
    how: MergeHow | Literal["asof"]
    on: IndexLabel | None
    # left_on/right_on may be None when passed, but in validate_specification
    # get replaced with non-None.
    left_on: Sequence[Hashable | AnyArrayLike]
    right_on: Sequence[Hashable | AnyArrayLike]
    left_index: bool
    right_index: bool
    axis: AxisInt
    bm_axis: AxisInt
    sort: bool
    suffixes: Suffixes
    copy: bool
    indicator: str | bool
    validate: str | None
    join_names: list[Hashable]
    right_join_keys: list[AnyArrayLike]
    left_join_keys: list[AnyArrayLike]

    def __init__(
        self,
        left: DataFrame | Series,
        right: DataFrame | Series,
        how: MergeHow | Literal["asof"] = "inner",
        on: IndexLabel | None = None,
        left_on: IndexLabel | None = None,
        right_on: IndexLabel | None = None,
        axis: AxisInt = 1,
        left_index: bool = False,
        right_index: bool = False,
        sort: bool = True,
        suffixes: Suffixes = ("_x", "_y"),
        indicator: str | bool = False,
        validate: str | None = None,
    ) -> None:
        _left = _validate_operand(left)
        _right = _validate_operand(right)
        self.left = self.orig_left = _left
        self.right = self.orig_right = _right
        self.how = how

        # bm_axis -> the axis on the BlockManager
        self.bm_axis = axis
        # axis --> the axis on the Series/DataFrame
        self.axis = 1 - axis if self.left.ndim == 2 else 0

        self.on = com.maybe_make_list(on)

        self.suffixes = suffixes
        self.sort = sort

        self.left_index = left_index
        self.right_index = right_index

        self.indicator = indicator

        if not is_bool(left_index):
            raise ValueError(
                f"left_index parameter must be of type bool, not {type(left_index)}"
            )
        if not is_bool(right_index):
            raise ValueError(
                f"right_index parameter must be of type bool, not {type(right_index)}"
            )

        # GH 40993: raise when merging between different levels; enforced in 2.0
        if _left.columns.nlevels != _right.columns.nlevels:
            msg = (
                "Not allowed to merge between different levels. "
                f"({_left.columns.nlevels} levels on the left, "
                f"{_right.columns.nlevels} on the right)"
            )
            raise MergeError(msg)

        self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on)

        cross_col = None
        if self.how == "cross":
            (
                self.left,
                self.right,
                self.how,
                cross_col,
            ) = self._create_cross_configuration(self.left, self.right)
            self.left_on = self.right_on = [cross_col]
        self._cross = cross_col

        # note this function has side effects
        (
            self.left_join_keys,
            self.right_join_keys,
            self.join_names,
        ) = self._get_merge_keys()

        # validate the merge keys dtypes. We may need to coerce
        # to avoid incompatible dtypes
        self._maybe_coerce_merge_keys()

        # If argument passed to validate,
        # check if columns specified as unique
        # are in fact unique.
        if validate is not None:
            self._validate(validate)

    def _reindex_and_concat(
        self,
        join_index: Index,
        left_indexer: npt.NDArray[np.intp] | None,
        right_indexer: npt.NDArray[np.intp] | None,
        copy: bool | None,
    ) -> DataFrame:
        """
        reindex along index and concat along columns.
        """
        # Take views so we do not alter the originals
        left = self.left[:]
        right = self.right[:]

        llabels, rlabels = _items_overlap_with_suffix(
            self.left._info_axis, self.right._info_axis, self.suffixes
        )

        if left_indexer is not None and not is_range_indexer(left_indexer, len(left)):
            # Pinning the index here (and in the right code just below) is not
            # necessary, but makes the `.take` more performant if we have e.g.
            # a MultiIndex for left.index.
            lmgr = left._mgr.reindex_indexer(
                join_index,
                left_indexer,
                axis=1,
                copy=False,
                only_slice=True,
                allow_dups=True,
                use_na_proxy=True,
            )
            left = left._constructor(lmgr)
            left.index = join_index

        if right_indexer is not None and not is_range_indexer(
            right_indexer, len(right)
        ):
            rmgr = right._mgr.reindex_indexer(
                join_index,
                right_indexer,
                axis=1,
                copy=False,
                only_slice=True,
                allow_dups=True,
                use_na_proxy=True,
            )
            right = right._constructor(rmgr)
            right.index = join_index

        from pandas import concat

        left.columns = llabels
        right.columns = rlabels
        result = concat([left, right], axis=1, copy=copy)
        return result

    def get_result(self, copy: bool | None = True) -> DataFrame:
        if self.indicator:
            self.left, self.right = self._indicator_pre_merge(self.left, self.right)

        join_index, left_indexer, right_indexer = self._get_join_info()

        result = self._reindex_and_concat(
            join_index, left_indexer, right_indexer, copy=copy
        )
        result = result.__finalize__(self, method=self._merge_type)

        if self.indicator:
            result = self._indicator_post_merge(result)

        self._maybe_add_join_keys(result, left_indexer, right_indexer)

        self._maybe_restore_index_levels(result)

        self._maybe_drop_cross_column(result, self._cross)

        return result.__finalize__(self, method="merge")

    def _maybe_drop_cross_column(
        self, result: DataFrame, cross_col: str | None
    ) -> None:
        if cross_col is not None:
            del result[cross_col]

    @cache_readonly
    def _indicator_name(self) -> str | None:
        if isinstance(self.indicator, str):
            return self.indicator
        elif isinstance(self.indicator, bool):
            return "_merge" if self.indicator else None
        else:
            raise ValueError(
                "indicator option can only accept boolean or string arguments"
            )

    def _indicator_pre_merge(
        self, left: DataFrame, right: DataFrame
    ) -> tuple[DataFrame, DataFrame]:
        columns = left.columns.union(right.columns)

        for i in ["_left_indicator", "_right_indicator"]:
            if i in columns:
                raise ValueError(
                    "Cannot use `indicator=True` option when "
                    f"data contains a column named {i}"
                )
        if self._indicator_name in columns:
            raise ValueError(
                "Cannot use name of an existing column for indicator column"
            )

        left = left.copy()
        right = right.copy()

        left["_left_indicator"] = 1
        left["_left_indicator"] = left["_left_indicator"].astype("int8")

        right["_right_indicator"] = 2
        right["_right_indicator"] = right["_right_indicator"].astype("int8")

        return left, right

    def _indicator_post_merge(self, result: DataFrame) -> DataFrame:
        result["_left_indicator"] = result["_left_indicator"].fillna(0)
        result["_right_indicator"] = result["_right_indicator"].fillna(0)

        result[self._indicator_name] = Categorical(
            (result["_left_indicator"] + result["_right_indicator"]),
            categories=[1, 2, 3],
        )
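        # The sums land exactly on the categories above: 1 + 0 -> left_only,
        # 0 + 2 -> right_only, 1 + 2 -> both, before being renamed below.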
        result[self._indicator_name] = result[
            self._indicator_name
        ].cat.rename_categories(["left_only", "right_only", "both"])

        result = result.drop(labels=["_left_indicator", "_right_indicator"], axis=1)
        return result

    def _maybe_restore_index_levels(self, result: DataFrame) -> None:
        """
        Restore index levels specified as `on` parameters

        Here we check for cases where `self.left_on` and `self.right_on` pairs
        each reference an index level in their respective DataFrames. The
        joined columns corresponding to these pairs are then restored to the
        index of `result`.

        **Note:** This method has side effects. It modifies `result` in-place

        Parameters
        ----------
        result: DataFrame
            merge result

        Returns
        -------
        None
        """
        names_to_restore = []
        for name, left_key, right_key in zip(
            self.join_names, self.left_on, self.right_on
        ):
            if (
                # Argument 1 to "_is_level_reference" of "NDFrame" has incompatible
                # type "Union[Hashable, ExtensionArray, Index, Series]"; expected
                # "Hashable"
                self.orig_left._is_level_reference(left_key)  # type: ignore[arg-type]
                # Argument 1 to "_is_level_reference" of "NDFrame" has incompatible
                # type "Union[Hashable, ExtensionArray, Index, Series]"; expected
                # "Hashable"
                and self.orig_right._is_level_reference(
                    right_key  # type: ignore[arg-type]
                )
                and left_key == right_key
                and name not in result.index.names
            ):
                names_to_restore.append(name)

        if names_to_restore:
            result.set_index(names_to_restore, inplace=True)

    def _maybe_add_join_keys(
        self,
        result: DataFrame,
        left_indexer: np.ndarray | None,
        right_indexer: np.ndarray | None,
    ) -> None:
        left_has_missing = None
        right_has_missing = None

        assert all(is_array_like(x) for x in self.left_join_keys)

        keys = zip(self.join_names, self.left_on, self.right_on)
        for i, (name, lname, rname) in enumerate(keys):
            if not _should_fill(lname, rname):
                continue

            take_left, take_right = None, None

            if name in result:
                if left_indexer is not None and right_indexer is not None:
                    if name in self.left:
                        if left_has_missing is None:
                            left_has_missing = (left_indexer == -1).any()

                        if left_has_missing:
                            take_right = self.right_join_keys[i]

                            if not is_dtype_equal(
                                result[name].dtype, self.left[name].dtype
                            ):
                                take_left = self.left[name]._values

                    elif name in self.right:
                        if right_has_missing is None:
                            right_has_missing = (right_indexer == -1).any()

                        if right_has_missing:
                            take_left = self.left_join_keys[i]

                            if not is_dtype_equal(
                                result[name].dtype, self.right[name].dtype
                            ):
                                take_right = self.right[name]._values

            elif left_indexer is not None:
                take_left = self.left_join_keys[i]
                take_right = self.right_join_keys[i]

            if take_left is not None or take_right is not None:
                if take_left is None:
                    lvals = result[name]._values
                else:
                    # TODO: can we pin down take_left's type earlier?
                    take_left = extract_array(take_left, extract_numpy=True)
                    lfill = na_value_for_dtype(take_left.dtype)
                    lvals = algos.take_nd(take_left, left_indexer, fill_value=lfill)

                if take_right is None:
                    rvals = result[name]._values
                else:
                    # TODO: can we pin down take_right's type earlier?
                    taker = extract_array(take_right, extract_numpy=True)
                    rfill = na_value_for_dtype(taker.dtype)
                    rvals = algos.take_nd(taker, right_indexer, fill_value=rfill)

                # if we have an all missing left_indexer
                # make sure to just use the right values or vice-versa
                mask_left = left_indexer == -1
                # error: Item "bool" of "Union[Any, bool]" has no attribute "all"
                if mask_left.all():  # type: ignore[union-attr]
                    key_col = Index(rvals)
                    result_dtype = rvals.dtype
                elif right_indexer is not None and (right_indexer == -1).all():
                    key_col = Index(lvals)
                    result_dtype = lvals.dtype
                else:
                    key_col = Index(lvals).where(~mask_left, rvals)
                    result_dtype = find_common_type([lvals.dtype, rvals.dtype])
                    if (
                        lvals.dtype.kind == "M"
                        and rvals.dtype.kind == "M"
                        and result_dtype.kind == "O"
                    ):
                        # TODO(non-nano) Workaround for common_type not dealing
                        # with different resolutions
                        result_dtype = key_col.dtype

                if result._is_label_reference(name):
                    result[name] = Series(
                        key_col, dtype=result_dtype, index=result.index
                    )
                elif result._is_level_reference(name):
                    if isinstance(result.index, MultiIndex):
                        key_col.name = name
                        idx_list = [
                            result.index.get_level_values(level_name)
                            if level_name != name
                            else key_col
                            for level_name in result.index.names
                        ]

                        result.set_index(idx_list, inplace=True)
                    else:
                        result.index = Index(key_col, name=name)
                else:
                    result.insert(i, name or f"key_{i}", key_col)

    def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
        """return the join indexers"""
        return get_join_indexers(
            self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how
        )

    def _get_join_info(
        self,
    ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
        # make mypy happy
        assert self.how != "cross"
        left_ax = self.left.axes[self.axis]
        right_ax = self.right.axes[self.axis]

        if self.left_index and self.right_index and self.how != "asof":
            join_index, left_indexer, right_indexer = left_ax.join(
                right_ax, how=self.how, return_indexers=True, sort=self.sort
            )

        elif self.right_index and self.how == "left":
            join_index, left_indexer, right_indexer = _left_join_on_index(
                left_ax, right_ax, self.left_join_keys, sort=self.sort
            )

        elif self.left_index and self.how == "right":
            join_index, right_indexer, left_indexer = _left_join_on_index(
                right_ax, left_ax, self.right_join_keys, sort=self.sort
            )
        else:
            (left_indexer, right_indexer) = self._get_join_indexers()

            if self.right_index:
                if len(self.left) > 0:
                    join_index = self._create_join_index(
                        self.left.index,
                        self.right.index,
                        left_indexer,
                        how="right",
                    )
                else:
                    join_index = self.right.index.take(right_indexer)
            elif self.left_index:
                if self.how == "asof":
                    # GH#33463 asof should always behave like a left merge
                    join_index = self._create_join_index(
                        self.left.index,
                        self.right.index,
                        left_indexer,
                        how="left",
                    )

                elif len(self.right) > 0:
                    join_index = self._create_join_index(
                        self.right.index,
                        self.left.index,
                        right_indexer,
                        how="left",
                    )
                else:
                    join_index = self.left.index.take(left_indexer)
            else:
                join_index = default_index(len(left_indexer))

        if len(join_index) == 0 and not isinstance(join_index, MultiIndex):
            join_index = default_index(0).set_names(join_index.name)
        return join_index, left_indexer, right_indexer

    def _create_join_index(
        self,
        index: Index,
        other_index: Index,
        indexer: npt.NDArray[np.intp],
        how: JoinHow = "left",
    ) -> Index:
        """
        Create a join index by rearranging one index to match another

        Parameters
        ----------
        index : Index
            index being rearranged
        other_index : Index
            used to supply values not found in index
        indexer : np.ndarray[np.intp]
            how to rearrange index
        how : str
            Replacement is only necessary if the indexer is based on
            other_index.

        Returns
        -------
        Index
        """
        if self.how in (how, "outer") and not isinstance(other_index, MultiIndex):
            # if final index requires values in other_index but not target
            # index, indexer may hold missing (-1) values, causing Index.take
            # to take the final value in target index. So, we set the last
            # element to be the desired fill value. We do not use allow_fill
            # and fill_value because it throws a ValueError on integer indices
            mask = indexer == -1
            if np.any(mask):
                fill_value = na_value_for_dtype(index.dtype, compat=False)
                index = index.append(Index([fill_value]))
        return index.take(indexer)
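
    # Sketch of the -1 handling above (hypothetical values): with
    # index = Index([10, 20]) and indexer = [1, -1], appending the fill value
    # lets take(-1) select it instead of wrapping around to the last element:
    #   Index([10, 20]).append(Index([np.nan])).take([1, -1]) -> [20.0, nan]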

    def _get_merge_keys(
        self,
    ) -> tuple[list[AnyArrayLike], list[AnyArrayLike], list[Hashable]]:
        """
        Note: has side effects (copy/delete key columns)

        Parameters
        ----------
        left
        right
        on

        Returns
        -------
        left_keys, right_keys, join_names
        """
        # left_keys, right_keys entries can actually be anything listlike
        # with a 'dtype' attr
        left_keys: list[AnyArrayLike] = []
        right_keys: list[AnyArrayLike] = []
        join_names: list[Hashable] = []
        right_drop: list[Hashable] = []
        left_drop: list[Hashable] = []

        left, right = self.left, self.right

        is_lkey = lambda x: is_array_like(x) and len(x) == len(left)
        is_rkey = lambda x: is_array_like(x) and len(x) == len(right)

        # Note that pd.merge_asof() has separate 'on' and 'by' parameters. A
        # user could, for example, request 'left_index' and 'left_by'. In a
        # regular pd.merge(), users cannot specify both 'left_index' and
        # 'left_on'. (Instead, users have a MultiIndex). That means the
        # self.left_on in this function is always empty in a pd.merge(), but
        # a pd.merge_asof(left_index=True, left_by=...) will result in a
        # self.left_on array with a None in the middle of it. This requires
        # a work-around as designated in the code below.
        # See _validate_left_right_on() for where this happens.

        # ugh, spaghetti re #733
        if _any(self.left_on) and _any(self.right_on):
            for lk, rk in zip(self.left_on, self.right_on):
                if is_lkey(lk):
                    lk = cast(AnyArrayLike, lk)
                    left_keys.append(lk)
                    if is_rkey(rk):
                        rk = cast(AnyArrayLike, rk)
                        right_keys.append(rk)
                        join_names.append(None)  # what to do?
                    else:
                        # Then we're either Hashable or a wrong-length arraylike,
                        # the latter of which will raise
                        rk = cast(Hashable, rk)
                        if rk is not None:
                            right_keys.append(right._get_label_or_level_values(rk))
                            join_names.append(rk)
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index)
                            join_names.append(right.index.name)
                else:
                    if not is_rkey(rk):
                        # Then we're either Hashable or a wrong-length arraylike,
                        # the latter of which will raise
                        rk = cast(Hashable, rk)
                        if rk is not None:
                            right_keys.append(right._get_label_or_level_values(rk))
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index)
                        if lk is not None and lk == rk:  # FIXME: what about other NAs?
                            # avoid key upcast in corner case (length-0)
                            lk = cast(Hashable, lk)
                            if len(left) > 0:
                                right_drop.append(rk)
                            else:
                                left_drop.append(lk)
                    else:
                        rk = cast(AnyArrayLike, rk)
                        right_keys.append(rk)
                    if lk is not None:
                        # Then we're either Hashable or a wrong-length arraylike,
                        # the latter of which will raise
                        lk = cast(Hashable, lk)
                        left_keys.append(left._get_label_or_level_values(lk))
                        join_names.append(lk)
                    else:
                        # work-around for merge_asof(left_index=True)
                        left_keys.append(left.index)
                        join_names.append(left.index.name)
        elif _any(self.left_on):
            for k in self.left_on:
                if is_lkey(k):
                    k = cast(AnyArrayLike, k)
                    left_keys.append(k)
                    join_names.append(None)
                else:
                    # Then we're either Hashable or a wrong-length arraylike,
                    # the latter of which will raise
                    k = cast(Hashable, k)
                    left_keys.append(left._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.right.index, MultiIndex):
                right_keys = [
                    lev._values.take(lev_codes)
                    for lev, lev_codes in zip(
                        self.right.index.levels, self.right.index.codes
                    )
                ]
            else:
                right_keys = [self.right.index._values]
        elif _any(self.right_on):
            for k in self.right_on:
                if is_rkey(k):
                    k = cast(AnyArrayLike, k)
                    right_keys.append(k)
                    join_names.append(None)
                else:
                    # Then we're either Hashable or a wrong-length arraylike,
                    # the latter of which will raise
                    k = cast(Hashable, k)
                    right_keys.append(right._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.left.index, MultiIndex):
                left_keys = [
                    lev._values.take(lev_codes)
                    for lev, lev_codes in zip(
                        self.left.index.levels, self.left.index.codes
                    )
                ]
            else:
                left_keys = [self.left.index._values]

        if left_drop:
            self.left = self.left._drop_labels_or_levels(left_drop)

        if right_drop:
            self.right = self.right._drop_labels_or_levels(right_drop)

        return left_keys, right_keys, join_names

    def _maybe_coerce_merge_keys(self) -> None:
        # we have valid merges but we may have to further
        # coerce these if they are originally incompatible types
        #
        # for example if these are categorical, but are not dtype_equal
        # or if we have object and integer dtypes

        for lk, rk, name in zip(
            self.left_join_keys, self.right_join_keys, self.join_names
        ):
            if (len(lk) and not len(rk)) or (not len(lk) and len(rk)):
                continue

            lk = extract_array(lk, extract_numpy=True)
            rk = extract_array(rk, extract_numpy=True)

            lk_is_cat = is_categorical_dtype(lk.dtype)
            rk_is_cat = is_categorical_dtype(rk.dtype)
            lk_is_object = is_object_dtype(lk.dtype)
            rk_is_object = is_object_dtype(rk.dtype)

            # if either left or right is a categorical
            # then they must match exactly in categories & ordered
            if lk_is_cat and rk_is_cat:
                lk = cast(Categorical, lk)
                rk = cast(Categorical, rk)
                if lk._categories_match_up_to_permutation(rk):
                    continue

            elif lk_is_cat or rk_is_cat:
                pass

            elif is_dtype_equal(lk.dtype, rk.dtype):
                continue

            msg = (
                f"You are trying to merge on {lk.dtype} and "
                f"{rk.dtype} columns. If you wish to proceed you should use pd.concat"
            )

            # if we are numeric, then allow differing
            # kinds to proceed, eg. int64 and int8, int and float
            # further if we are object, but we infer to
            # the same, then proceed
            if is_numeric_dtype(lk.dtype) and is_numeric_dtype(rk.dtype):
                if lk.dtype.kind == rk.dtype.kind:
                    continue

                # check whether ints and floats
                if is_integer_dtype(rk.dtype) and is_float_dtype(lk.dtype):
                    # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int
                    with np.errstate(invalid="ignore"):
                        # error: Argument 1 to "astype" of "ndarray" has incompatible
                        # type "Union[ExtensionDtype, Any, dtype[Any]]"; expected
                        # "Union[dtype[Any], Type[Any], _SupportsDType[dtype[Any]]]"
                        casted = lk.astype(rk.dtype)  # type: ignore[arg-type]

                    mask = ~np.isnan(lk)
                    match = lk == casted
                    if not match[mask].all():
                        warnings.warn(
                            "You are merging on int and float "
                            "columns where the float values "
                            "are not equal to their int representation.",
                            UserWarning,
                            stacklevel=find_stack_level(),
                        )
                    continue

                if is_float_dtype(rk.dtype) and is_integer_dtype(lk.dtype):
                    # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int
                    with np.errstate(invalid="ignore"):
                        # error: Argument 1 to "astype" of "ndarray" has incompatible
                        # type "Union[ExtensionDtype, Any, dtype[Any]]"; expected
                        # "Union[dtype[Any], Type[Any], _SupportsDType[dtype[Any]]]"
                        casted = rk.astype(lk.dtype)  # type: ignore[arg-type]

                    mask = ~np.isnan(rk)
                    match = rk == casted
                    if not match[mask].all():
                        warnings.warn(
                            "You are merging on int and float "
                            "columns where the float values "
                            "are not equal to their int representation.",
                            UserWarning,
                            stacklevel=find_stack_level(),
                        )
                    continue

                # let's infer and see if we are ok
                if lib.infer_dtype(lk, skipna=False) == lib.infer_dtype(
                    rk, skipna=False
                ):
                    continue

            # Check if we are trying to merge on obviously
            # incompatible dtypes GH 9780, GH 15800

            # bool values are coerced to object
            elif (lk_is_object and is_bool_dtype(rk.dtype)) or (
                is_bool_dtype(lk.dtype) and rk_is_object
            ):
                pass

            # object values are allowed to be merged
            elif (lk_is_object and is_numeric_dtype(rk.dtype)) or (
                is_numeric_dtype(lk.dtype) and rk_is_object
            ):
                inferred_left = lib.infer_dtype(lk, skipna=False)
                inferred_right = lib.infer_dtype(rk, skipna=False)
                bool_types = ["integer", "mixed-integer", "boolean", "empty"]
                string_types = ["string", "unicode", "mixed", "bytes", "empty"]

                # inferred bool
                if inferred_left in bool_types and inferred_right in bool_types:
                    pass

                # unless we are merging non-string-like with string-like
                elif (
                    inferred_left in string_types and inferred_right not in string_types
                ) or (
                    inferred_right in string_types and inferred_left not in string_types
                ):
                    raise ValueError(msg)

            # datetimelikes must match exactly
            elif needs_i8_conversion(lk.dtype) and not needs_i8_conversion(rk.dtype):
                raise ValueError(msg)
            elif not needs_i8_conversion(lk.dtype) and needs_i8_conversion(rk.dtype):
                raise ValueError(msg)
            elif isinstance(lk.dtype, DatetimeTZDtype) and not isinstance(
                rk.dtype, DatetimeTZDtype
            ):
                raise ValueError(msg)
            elif not isinstance(lk.dtype, DatetimeTZDtype) and isinstance(
                rk.dtype, DatetimeTZDtype
            ):
                raise ValueError(msg)
            elif (
                isinstance(lk.dtype, DatetimeTZDtype)
                and isinstance(rk.dtype, DatetimeTZDtype)
            ) or (lk.dtype.kind == "M" and rk.dtype.kind == "M"):
                # allows datetime with different resolutions
                continue

            elif lk_is_object and rk_is_object:
                continue

            # Houston, we have a problem!
            # let's coerce to object if the dtypes aren't
            # categorical, otherwise coerce to the category
            # dtype. If we coerced categories to object,
            # then we would lose type information on some
            # columns, and end up trying to merge
            # incompatible dtypes. See GH 16900.
            if name in self.left.columns:
                typ = cast(Categorical, lk).categories.dtype if lk_is_cat else object
                self.left = self.left.copy()
                self.left[name] = self.left[name].astype(typ)
            if name in self.right.columns:
                typ = cast(Categorical, rk).categories.dtype if rk_is_cat else object
                self.right = self.right.copy()
                self.right[name] = self.right[name].astype(typ)

    def _create_cross_configuration(
        self, left: DataFrame, right: DataFrame
    ) -> tuple[DataFrame, DataFrame, JoinHow, str]:
        """
        Creates the configuration to dispatch the cross operation to inner join,
        e.g. adding a join column and resetting parameters. Join column is added
        to a new object, no inplace modification

        Parameters
        ----------
        left : DataFrame
        right : DataFrame

        Returns
        -------
        a tuple (left, right, how, cross_col) representing the adjusted
        DataFrames with cross_col, the merge operation set to inner and the column
        to join over.
        """
        cross_col = f"_cross_{uuid.uuid4()}"
        how: JoinHow = "inner"
        return (
            left.assign(**{cross_col: 1}),
            right.assign(**{cross_col: 1}),
            how,
            cross_col,
        )
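
    # Conceptual sketch of the dispatch above: a cross join is an inner join
    # on a constant key, so an n-row and an m-row frame yield n * m rows,
    # roughly (with "_k" standing in for the uuid-based cross_col):
    #   left.assign(_k=1).merge(right.assign(_k=1), on="_k").drop(columns="_k")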

    def _validate_left_right_on(self, left_on, right_on):
        left_on = com.maybe_make_list(left_on)
        right_on = com.maybe_make_list(right_on)

        if self.how == "cross":
            if (
                self.left_index
                or self.right_index
                or right_on is not None
                or left_on is not None
                or self.on is not None
            ):
                raise MergeError(
                    "Can not pass on, right_on, left_on or set right_index=True or "
                    "left_index=True"
                )
        # Hm, any way to make this logic less complicated??
        elif self.on is None and left_on is None and right_on is None:
            if self.left_index and self.right_index:
                left_on, right_on = (), ()
            elif self.left_index:
                raise MergeError("Must pass right_on or right_index=True")
            elif self.right_index:
                raise MergeError("Must pass left_on or left_index=True")
            else:
                # use the common columns
                left_cols = self.left.columns
                right_cols = self.right.columns
                common_cols = left_cols.intersection(right_cols)
                if len(common_cols) == 0:
                    raise MergeError(
                        "No common columns to perform merge on. "
                        f"Merge options: left_on={left_on}, "
                        f"right_on={right_on}, "
                        f"left_index={self.left_index}, "
                        f"right_index={self.right_index}"
                    )
                if (
                    not left_cols.join(common_cols, how="inner").is_unique
                    or not right_cols.join(common_cols, how="inner").is_unique
                ):
                    raise MergeError(f"Data columns not unique: {repr(common_cols)}")
                left_on = right_on = common_cols
        elif self.on is not None:
            if left_on is not None or right_on is not None:
                raise MergeError(
                    'Can only pass argument "on" OR "left_on" '
                    'and "right_on", not a combination of both.'
                )
            if self.left_index or self.right_index:
                raise MergeError(
                    'Can only pass argument "on" OR "left_index" '
                    'and "right_index", not a combination of both.'
                )
            left_on = right_on = self.on
        elif left_on is not None:
            if self.left_index:
                raise MergeError(
                    'Can only pass argument "left_on" OR "left_index" not both.'
                )
            if not self.right_index and right_on is None:
                raise MergeError('Must pass "right_on" OR "right_index".')
            n = len(left_on)
            if self.right_index:
                if len(left_on) != self.right.index.nlevels:
                    raise ValueError(
                        "len(left_on) must equal the number "
                        'of levels in the index of "right"'
                    )
                right_on = [None] * n
        elif right_on is not None:
            if self.right_index:
                raise MergeError(
                    'Can only pass argument "right_on" OR "right_index" not both.'
                )
            if not self.left_index and left_on is None:
                raise MergeError('Must pass "left_on" OR "left_index".')
            n = len(right_on)
            if self.left_index:
                if len(right_on) != self.left.index.nlevels:
                    raise ValueError(
                        "len(right_on) must equal the number "
                        'of levels in the index of "left"'
                    )
                left_on = [None] * n
        if self.how != "cross" and len(right_on) != len(left_on):
            raise ValueError("len(right_on) must equal len(left_on)")

        return left_on, right_on
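
    # e.g. pd.merge(df1, df2) with no on/left_on/right_on arguments and no
    # index flags falls through to the common-columns branch above and behaves
    # like on=df1.columns.intersection(df2.columns).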

    def _validate(self, validate: str) -> None:
        # Check uniqueness of each
        if self.left_index:
            left_unique = self.orig_left.index.is_unique
        else:
            left_unique = MultiIndex.from_arrays(self.left_join_keys).is_unique

        if self.right_index:
            right_unique = self.orig_right.index.is_unique
        else:
            right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique

        # Check data integrity
        if validate in ["one_to_one", "1:1"]:
            if not left_unique and not right_unique:
                raise MergeError(
                    "Merge keys are not unique in either left "
                    "or right dataset; not a one-to-one merge"
                )
            if not left_unique:
                raise MergeError(
                    "Merge keys are not unique in left dataset; not a one-to-one merge"
                )
            if not right_unique:
                raise MergeError(
                    "Merge keys are not unique in right dataset; not a one-to-one merge"
                )

        elif validate in ["one_to_many", "1:m"]:
            if not left_unique:
                raise MergeError(
                    "Merge keys are not unique in left dataset; not a one-to-many merge"
                )

        elif validate in ["many_to_one", "m:1"]:
            if not right_unique:
                raise MergeError(
                    "Merge keys are not unique in right dataset; "
                    "not a many-to-one merge"
                )

        elif validate in ["many_to_many", "m:m"]:
            pass

        else:
            raise ValueError(
                f'"{validate}" is not a valid argument. '
                "Valid arguments are:\n"
                '- "1:1"\n'
                '- "1:m"\n'
                '- "m:1"\n'
                '- "m:m"\n'
                '- "one_to_one"\n'
                '- "one_to_many"\n'
                '- "many_to_one"\n'
                '- "many_to_many"'
            )
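
    # Usage sketch for the checks above (frames are hypothetical): if "key"
    # is duplicated in the left frame, pd.merge(df1, df2, on="key",
    # validate="one_to_many") raises MergeError, while "many_to_many" passes.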


def get_join_indexers(
    left_keys,
    right_keys,
    sort: bool = False,
    how: MergeHow | Literal["asof"] = "inner",
    **kwargs,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
    """

    Parameters
    ----------
    left_keys : ndarray, Index, Series
    right_keys : ndarray, Index, Series
    sort : bool, default False
    how : {'inner', 'outer', 'left', 'right'}, default 'inner'

    Returns
    -------
    np.ndarray[np.intp]
        Indexer into the left_keys.
    np.ndarray[np.intp]
        Indexer into the right_keys.
    """
    assert len(left_keys) == len(
        right_keys
    ), "left_keys and right_keys must be the same length"

    # fast-path for empty left/right
    left_n = len(left_keys[0])
    right_n = len(right_keys[0])
    if left_n == 0:
        if how in ["left", "inner", "cross"]:
            return _get_empty_indexer()
        elif not sort and how in ["right", "outer"]:
            return _get_no_sort_one_missing_indexer(right_n, True)
    elif right_n == 0:
        if how in ["right", "inner", "cross"]:
            return _get_empty_indexer()
        elif not sort and how in ["left", "outer"]:
            return _get_no_sort_one_missing_indexer(left_n, False)

    # get left & right join labels and num. of levels at each location
    mapped = (
        _factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how)
        for n in range(len(left_keys))
    )
    zipped = zip(*mapped)
    llab, rlab, shape = (list(x) for x in zipped)

    # get flat i8 keys from label lists
    lkey, rkey = _get_join_keys(llab, rlab, tuple(shape), sort)

    # factorize keys to a dense i8 space
    # `count` is the num. of unique keys
    # set(lkey) | set(rkey) == range(count)

    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how)
    # preserve left frame order if how == 'left' and sort == False
    kwargs = cp.copy(kwargs)
    if how in ("left", "right"):
        kwargs["sort"] = sort
    join_func = {
        "inner": libjoin.inner_join,
        "left": libjoin.left_outer_join,
        "right": lambda x, y, count, **kwargs: libjoin.left_outer_join(
            y, x, count, **kwargs
        )[::-1],
        "outer": libjoin.full_outer_join,
    }[how]

    # error: Cannot call function of unknown type
    return join_func(lkey, rkey, count, **kwargs)  # type: ignore[operator]
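
# Sketch of the indexers returned above, assuming single integer keys:
#   left_keys = [np.array([0, 1, 2])], right_keys = [np.array([1, 2, 3])]
#   how="inner" -> lidx = [1, 2], ridx = [0, 1] (the rows whose keys match);
#   how="left" also emits the pair (0, -1), -1 marking "no match on the right".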


def restore_dropped_levels_multijoin(
    left: MultiIndex,
    right: MultiIndex,
    dropped_level_names,
    join_index: Index,
    lindexer: npt.NDArray[np.intp],
    rindexer: npt.NDArray[np.intp],
) -> tuple[list[Index], npt.NDArray[np.intp], list[Hashable]]:
    """
    *this is an internal non-public method*

    Returns the levels, labels and names of a multi-index to multi-index join.
    Depending on the type of join, this method restores the appropriate
    dropped levels of the joined multi-index.
    The method relies on lindexer, rindexer which hold the index positions of
    left and right, where a join was feasible

    Parameters
    ----------
    left : MultiIndex
        left index
    right : MultiIndex
        right index
    dropped_level_names : str array
        list of non-common level names
    join_index : Index
        the index of the join between the
        common levels of left and right
    lindexer : np.ndarray[np.intp]
        left indexer
    rindexer : np.ndarray[np.intp]
        right indexer

    Returns
    -------
    levels : list of Index
        levels of combined multiindexes
    labels : np.ndarray[np.intp]
        labels of combined multiindexes
    names : List[Hashable]
        names of combined multiindex levels

    """

    def _convert_to_multiindex(index: Index) -> MultiIndex:
        if isinstance(index, MultiIndex):
            return index
        else:
            return MultiIndex.from_arrays([index._values], names=[index.name])

    # For multi-multi joins with one overlapping level,
    # the returned index is of type Index
    # Assure that join_index is of type MultiIndex
    # so that dropped levels can be appended
    join_index = _convert_to_multiindex(join_index)

    join_levels = join_index.levels
    join_codes = join_index.codes
    join_names = join_index.names

    # Iterate through the levels that must be restored
    for dropped_level_name in dropped_level_names:
        if dropped_level_name in left.names:
            idx = left
            indexer = lindexer
        else:
            idx = right
            indexer = rindexer

        # The index of the level name to be restored
        name_idx = idx.names.index(dropped_level_name)

        restore_levels = idx.levels[name_idx]
        # Inject -1 in the codes list where a join was not possible
        # IOW indexer[i]=-1
        codes = idx.codes[name_idx]
        if indexer is None:
            restore_codes = codes
        else:
            restore_codes = algos.take_nd(codes, indexer, fill_value=-1)

        # error: Cannot determine type of "__add__"
        join_levels = join_levels + [restore_levels]  # type: ignore[has-type]
        join_codes = join_codes + [restore_codes]
        join_names = join_names + [dropped_level_name]

    return join_levels, join_codes, join_names


class _OrderedMerge(_MergeOperation):
    _merge_type = "ordered_merge"

    def __init__(
        self,
        left: DataFrame | Series,
        right: DataFrame | Series,
        on: IndexLabel | None = None,
        left_on: IndexLabel | None = None,
        right_on: IndexLabel | None = None,
        left_index: bool = False,
        right_index: bool = False,
        axis: AxisInt = 1,
        suffixes: Suffixes = ("_x", "_y"),
        fill_method: str | None = None,
        how: JoinHow | Literal["asof"] = "outer",
    ) -> None:
        self.fill_method = fill_method
        _MergeOperation.__init__(
            self,
            left,
            right,
            on=on,
            left_on=left_on,
            left_index=left_index,
            right_index=right_index,
            right_on=right_on,
            axis=axis,
            how=how,
            suffixes=suffixes,
            sort=True,  # factorize sorts
        )

    def get_result(self, copy: bool | None = True) -> DataFrame:
        join_index, left_indexer, right_indexer = self._get_join_info()

        llabels, rlabels = _items_overlap_with_suffix(
            self.left._info_axis, self.right._info_axis, self.suffixes
        )

        left_join_indexer: np.ndarray | None
        right_join_indexer: np.ndarray | None

        if self.fill_method == "ffill":
            if left_indexer is None:
                raise TypeError("left_indexer cannot be None")
            left_indexer, right_indexer = cast(np.ndarray, left_indexer), cast(
                np.ndarray, right_indexer
            )
            left_join_indexer = libjoin.ffill_indexer(left_indexer)
            right_join_indexer = libjoin.ffill_indexer(right_indexer)
        else:
            left_join_indexer = left_indexer
            right_join_indexer = right_indexer

        result = self._reindex_and_concat(
            join_index, left_join_indexer, right_join_indexer, copy=copy
        )
        self._maybe_add_join_keys(result, left_indexer, right_indexer)

        return result
1843
1844
1845def _asof_by_function(direction: str):
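    # e.g. direction="backward" resolves to libjoin.asof_join_backward_on_X_by_Y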
1846 name = f"asof_join_{direction}_on_X_by_Y"
1847 return getattr(libjoin, name, None)
1848
1849
1850_type_casters = {
1851 "int64_t": ensure_int64,
1852 "double": ensure_float64,
1853 "object": ensure_object,
1854}
1855
1856
1857def _get_cython_type_upcast(dtype: DtypeObj) -> str:
    """Upcast a dtype to 'int64_t', 'double', or 'object'.
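
    Examples
    --------
    Illustrative of the three branches below:

    >>> pd.core.reshape.merge._get_cython_type_upcast(np.dtype(np.uint8))
    'int64_t'
    >>> pd.core.reshape.merge._get_cython_type_upcast(np.dtype(np.float32))
    'double'
    >>> pd.core.reshape.merge._get_cython_type_upcast(np.dtype(object))
    'object'
    """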
1859 if is_integer_dtype(dtype):
1860 return "int64_t"
1861 elif is_float_dtype(dtype):
1862 return "double"
1863 else:
1864 return "object"
1865
1866
1867class _AsOfMerge(_OrderedMerge):
1868 _merge_type = "asof_merge"
1869
1870 def __init__(
1871 self,
1872 left: DataFrame | Series,
1873 right: DataFrame | Series,
1874 on: IndexLabel | None = None,
1875 left_on: IndexLabel | None = None,
1876 right_on: IndexLabel | None = None,
1877 left_index: bool = False,
1878 right_index: bool = False,
1879 by=None,
1880 left_by=None,
1881 right_by=None,
1882 axis: AxisInt = 1,
1883 suffixes: Suffixes = ("_x", "_y"),
1884 copy: bool = True,
1885 fill_method: str | None = None,
1886 how: Literal["asof"] = "asof",
1887 tolerance=None,
1888 allow_exact_matches: bool = True,
1889 direction: str = "backward",
1890 ) -> None:
1891 self.by = by
1892 self.left_by = left_by
1893 self.right_by = right_by
1894 self.tolerance = tolerance
1895 self.allow_exact_matches = allow_exact_matches
1896 self.direction = direction
1897
1898 _OrderedMerge.__init__(
1899 self,
1900 left,
1901 right,
1902 on=on,
1903 left_on=left_on,
1904 right_on=right_on,
1905 left_index=left_index,
1906 right_index=right_index,
1907 axis=axis,
1908 how=how,
1909 suffixes=suffixes,
1910 fill_method=fill_method,
1911 )
1912
1913 def _validate_left_right_on(self, left_on, right_on):
1914 left_on, right_on = super()._validate_left_right_on(left_on, right_on)
1915
        # we only allow "on" to be a single item
1917 if len(left_on) != 1 and not self.left_index:
1918 raise MergeError("can only asof on a key for left")
1919
1920 if len(right_on) != 1 and not self.right_index:
1921 raise MergeError("can only asof on a key for right")
1922
1923 if self.left_index and isinstance(self.left.index, MultiIndex):
1924 raise MergeError("left can only have one index")
1925
1926 if self.right_index and isinstance(self.right.index, MultiIndex):
1927 raise MergeError("right can only have one index")
1928
1929 # set 'by' columns
1930 if self.by is not None:
1931 if self.left_by is not None or self.right_by is not None:
1932 raise MergeError("Can only pass by OR left_by and right_by")
1933 self.left_by = self.right_by = self.by
1934 if self.left_by is None and self.right_by is not None:
1935 raise MergeError("missing left_by")
1936 if self.left_by is not None and self.right_by is None:
1937 raise MergeError("missing right_by")
1938
1939 # GH#29130 Check that merge keys do not have dtype object
1940 if not self.left_index:
1941 left_on_0 = left_on[0]
1942 if is_array_like(left_on_0):
1943 lo_dtype = left_on_0.dtype
1944 else:
                lo_dtype = (
                    self.left._get_label_or_level_values(left_on_0).dtype
                    if left_on_0 in self.left.columns
                    else self.left.index.get_level_values(left_on_0).dtype
                )
1950 else:
1951 lo_dtype = self.left.index.dtype
1952
1953 if not self.right_index:
1954 right_on_0 = right_on[0]
1955 if is_array_like(right_on_0):
1956 ro_dtype = right_on_0.dtype
1957 else:
                ro_dtype = (
                    self.right._get_label_or_level_values(right_on_0).dtype
                    if right_on_0 in self.right.columns
                    else self.right.index.get_level_values(right_on_0).dtype
                )
1963 else:
1964 ro_dtype = self.right.index.dtype
1965
1966 if is_object_dtype(lo_dtype) or is_object_dtype(ro_dtype):
1967 raise MergeError(
1968 f"Incompatible merge dtype, {repr(ro_dtype)} and "
1969 f"{repr(lo_dtype)}, both sides must have numeric dtype"
1970 )
1971
1972 # add 'by' to our key-list so we can have it in the
1973 # output as a key
1974 if self.left_by is not None:
1975 if not is_list_like(self.left_by):
1976 self.left_by = [self.left_by]
1977 if not is_list_like(self.right_by):
1978 self.right_by = [self.right_by]
1979
1980 if len(self.left_by) != len(self.right_by):
1981 raise MergeError("left_by and right_by must be same length")
1982
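            # e.g. by="ticker", on="time" gives left_on == ["ticker", "time"];
            # the asof key always comes last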
1983 left_on = self.left_by + list(left_on)
1984 right_on = self.right_by + list(right_on)
1985
1986 # check 'direction' is valid
1987 if self.direction not in ["backward", "forward", "nearest"]:
1988 raise MergeError(f"direction invalid: {self.direction}")
1989
1990 return left_on, right_on
1991
1992 def _get_merge_keys(
1993 self,
1994 ) -> tuple[list[AnyArrayLike], list[AnyArrayLike], list[Hashable]]:
1995 # note this function has side effects
1996 (left_join_keys, right_join_keys, join_names) = super()._get_merge_keys()
1997
1998 # validate index types are the same
1999 for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)):
2000 if not is_dtype_equal(lk.dtype, rk.dtype):
2001 if is_categorical_dtype(lk.dtype) and is_categorical_dtype(rk.dtype):
2002 # The generic error message is confusing for categoricals.
2003 #
2004 # In this function, the join keys include both the original
2005 # ones of the merge_asof() call, and also the keys passed
2006 # to its by= argument. Unordered but equal categories
2007 # are not supported for the former, but will fail
2008 # later with a ValueError, so we don't *need* to check
2009 # for them here.
2010 msg = (
2011 f"incompatible merge keys [{i}] {repr(lk.dtype)} and "
2012 f"{repr(rk.dtype)}, both sides category, but not equal ones"
2013 )
2014 else:
2015 msg = (
2016 f"incompatible merge keys [{i}] {repr(lk.dtype)} and "
2017 f"{repr(rk.dtype)}, must be the same type"
2018 )
2019 raise MergeError(msg)
2020
2021 # validate tolerance; datetime.timedelta or Timedelta if we have a DTI
2022 if self.tolerance is not None:
2023 if self.left_index:
2024 # Actually more specifically an Index
2025 lt = cast(AnyArrayLike, self.left.index)
2026 else:
2027 lt = left_join_keys[-1]
2028
2029 msg = (
                f"incompatible tolerance {self.tolerance}, must be compatible "
2031 f"with type {repr(lt.dtype)}"
2032 )
2033
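            # e.g. a datetime64 key accepts tolerance=Timedelta("2ms"), an
            # integer key accepts tolerance=2, and a float key tolerance=0.5,
            # matching the branches below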
2034 if needs_i8_conversion(lt):
2035 if not isinstance(self.tolerance, datetime.timedelta):
2036 raise MergeError(msg)
2037 if self.tolerance < Timedelta(0):
2038 raise MergeError("tolerance must be positive")
2039
2040 elif is_integer_dtype(lt):
2041 if not is_integer(self.tolerance):
2042 raise MergeError(msg)
2043 if self.tolerance < 0:
2044 raise MergeError("tolerance must be positive")
2045
2046 elif is_float_dtype(lt):
2047 if not is_number(self.tolerance):
2048 raise MergeError(msg)
2049 if self.tolerance < 0:
2050 raise MergeError("tolerance must be positive")
2051
2052 else:
2053 raise MergeError("key must be integer, timestamp or float")
2054
2055 # validate allow_exact_matches
2056 if not is_bool(self.allow_exact_matches):
2057 msg = (
2058 "allow_exact_matches must be boolean, "
2059 f"passed {self.allow_exact_matches}"
2060 )
2061 raise MergeError(msg)
2062
2063 return left_join_keys, right_join_keys, join_names
2064
2065 def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
2066 """return the join indexers"""
2067
2068 def flip(xs) -> np.ndarray:
            """Unlike np.transpose, this returns a 1-D structured array of tuples"""
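            # A sketch: flip([np.array([1, 2]), np.array([3.0, 4.0])]) returns
            # a structured array [(1, 3.0), (2, 4.0)] with fields "a" and "b"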
2070
2071 def injection(obj):
2072 if not is_extension_array_dtype(obj):
2073 # ndarray
2074 return obj
2075 obj = extract_array(obj)
2076 if isinstance(obj, NDArrayBackedExtensionArray):
2077 # fastpath for e.g. dt64tz, categorical
2078 return obj._ndarray
2079 # FIXME: returning obj._values_for_argsort() here doesn't
2080 # break in any existing test cases, but i (@jbrockmendel)
2081 # am pretty sure it should!
2082 # e.g.
2083 # arr = pd.array([0, pd.NA, 255], dtype="UInt8")
2084 # will have values_for_argsort (before GH#45434)
2085 # np.array([0, 255, 255], dtype=np.uint8)
2086 # and the non-injectivity should make a difference somehow
2087 # shouldn't it?
2088 return np.asarray(obj)
2089
2090 xs = [injection(x) for x in xs]
2091 labels = list(string.ascii_lowercase[: len(xs)])
2092 dtypes = [x.dtype for x in xs]
2093 labeled_dtypes = list(zip(labels, dtypes))
2094 return np.array(list(zip(*xs)), labeled_dtypes)
2095
2096 # values to compare
2097 left_values = (
2098 self.left.index._values if self.left_index else self.left_join_keys[-1]
2099 )
2100 right_values = (
2101 self.right.index._values if self.right_index else self.right_join_keys[-1]
2102 )
2103 tolerance = self.tolerance
2104
2105 # we require sortedness and non-null values in the join keys
2106 if not Index(left_values).is_monotonic_increasing:
2107 side = "left"
2108 if isna(left_values).any():
2109 raise ValueError(f"Merge keys contain null values on {side} side")
2110 raise ValueError(f"{side} keys must be sorted")
2111
2112 if not Index(right_values).is_monotonic_increasing:
2113 side = "right"
2114 if isna(right_values).any():
2115 raise ValueError(f"Merge keys contain null values on {side} side")
2116 raise ValueError(f"{side} keys must be sorted")
2117
2118 # initial type conversion as needed
2119 if needs_i8_conversion(left_values):
2120 if tolerance is not None:
2121 tolerance = Timedelta(tolerance)
2122
2123 # TODO: we have no test cases with PeriodDtype here; probably
2124 # need to adjust tolerance for that case.
2125 if left_values.dtype.kind in ["m", "M"]:
2126 # Make sure the i8 representation for tolerance
2127 # matches that for left_values/right_values.
2128 lvs = ensure_wrapped_if_datetimelike(left_values)
2129 tolerance = tolerance.as_unit(lvs.unit)
2130
2131 tolerance = tolerance._value
2132
2133 # TODO: require left_values.dtype == right_values.dtype, or at least
2134 # comparable for e.g. dt64tz
2135 left_values = left_values.view("i8")
2136 right_values = right_values.view("i8")
2137
2138 # a "by" parameter requires special handling
2139 if self.left_by is not None:
2140 # remove 'on' parameter from values if one existed
2141 if self.left_index and self.right_index:
2142 left_by_values = self.left_join_keys
2143 right_by_values = self.right_join_keys
2144 else:
2145 left_by_values = self.left_join_keys[0:-1]
2146 right_by_values = self.right_join_keys[0:-1]
2147
2148 # get tuple representation of values if more than one
2149 if len(left_by_values) == 1:
2150 lbv = left_by_values[0]
2151 rbv = right_by_values[0]
2152 else:
2153 # We get here with non-ndarrays in test_merge_by_col_tz_aware
2154 # and test_merge_groupby_multiple_column_with_categorical_column
2155 lbv = flip(left_by_values)
2156 rbv = flip(right_by_values)
2157
2158 # upcast 'by' parameter because HashTable is limited
2159 by_type = _get_cython_type_upcast(lbv.dtype)
2160 by_type_caster = _type_casters[by_type]
2161 # error: Incompatible types in assignment (expression has type
2162 # "ndarray[Any, dtype[generic]]", variable has type
2163 # "List[Union[Union[ExtensionArray, ndarray[Any, Any]], Index, Series]]")
2164 left_by_values = by_type_caster(lbv) # type: ignore[assignment]
2165 # error: Incompatible types in assignment (expression has type
2166 # "ndarray[Any, dtype[generic]]", variable has type
2167 # "List[Union[Union[ExtensionArray, ndarray[Any, Any]], Index, Series]]")
2168 right_by_values = by_type_caster(rbv) # type: ignore[assignment]
2169
2170 # choose appropriate function by type
2171 func = _asof_by_function(self.direction)
2172 return func(
2173 left_values,
2174 right_values,
2175 left_by_values,
2176 right_by_values,
2177 self.allow_exact_matches,
2178 tolerance,
2179 )
2180 else:
2181 # choose appropriate function by type
2182 func = _asof_by_function(self.direction)
2183 # TODO(cython3):
2184 # Bug in beta1 preventing Cython from choosing
2185 # right specialization when one fused memview is None
2186 # Doesn't matter what type we choose
2187 # (nothing happens anyways since it is None)
2188 # GH 51640
2189 return func[f"{left_values.dtype}_t", object](
2190 left_values,
2191 right_values,
2192 None,
2193 None,
2194 self.allow_exact_matches,
2195 tolerance,
2196 False,
2197 )
2198
2199
2200def _get_multiindex_indexer(
2201 join_keys, index: MultiIndex, sort: bool
2202) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
    # per level: factorize the right index's level values and the left join
    # keys into a shared code space, and record the number of unique keys
2204 mapped = (
2205 _factorize_keys(index.levels[n], join_keys[n], sort=sort)
2206 for n in range(index.nlevels)
2207 )
2208 zipped = zip(*mapped)
2209 rcodes, lcodes, shape = (list(x) for x in zipped)
2210 if sort:
2211 rcodes = list(map(np.take, rcodes, index.codes))
2212 else:
2213 i8copy = lambda a: a.astype("i8", subok=False, copy=True)
2214 rcodes = list(map(i8copy, index.codes))
2215
2216 # fix right labels if there were any nulls
2217 for i, join_key in enumerate(join_keys):
2218 mask = index.codes[i] == -1
2219 if mask.any():
            # check if there were already any nulls at this location;
            # if there were, they are factorized to `shape[i] - 1`
2222 a = join_key[lcodes[i] == shape[i] - 1]
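            # `not a[0] != a[0]` means a[0] is not NaN, i.e. the highest code
            # maps to a real value, so reserve a brand-new code for the nulls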
2223 if a.size == 0 or not a[0] != a[0]:
2224 shape[i] += 1
2225
2226 rcodes[i][mask] = shape[i] - 1
2227
2228 # get flat i8 join keys
2229 lkey, rkey = _get_join_keys(lcodes, rcodes, tuple(shape), sort)
2230
2231 # factorize keys to a dense i8 space
2232 lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)
2233
2234 return libjoin.left_outer_join(lkey, rkey, count, sort=sort)
2235
2236
2237def _get_single_indexer(
2238 join_key, index: Index, sort: bool = False
2239) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
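    # Factorize the join key and the index values into a shared code space,
    # then left-outer-join on the resulting codes.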
2240 left_key, right_key, count = _factorize_keys(join_key, index._values, sort=sort)
2241
2242 return libjoin.left_outer_join(left_key, right_key, count, sort=sort)
2243
2244
2245def _get_empty_indexer() -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
2246 """Return empty join indexers."""
2247 return (
2248 np.array([], dtype=np.intp),
2249 np.array([], dtype=np.intp),
2250 )
2251
2252
2253def _get_no_sort_one_missing_indexer(
2254 n: int, left_missing: bool
2255) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
2256 """
2257 Return join indexers where all of one side is selected without sorting
2258 and none of the other side is selected.
2259
2260 Parameters
2261 ----------
2262 n : int
2263 Length of indexers to create.
2264 left_missing : bool
2265 If True, the left indexer will contain only -1's.
2266 If False, the right indexer will contain only -1's.
2267
2268 Returns
2269 -------
2270 np.ndarray[np.intp]
2271 Left indexer
2272 np.ndarray[np.intp]
2273 Right indexer
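
    Examples
    --------
    A minimal sketch of both cases:

    >>> pd.core.reshape.merge._get_no_sort_one_missing_indexer(3, True)
    (array([-1, -1, -1]), array([0, 1, 2]))
    >>> pd.core.reshape.merge._get_no_sort_one_missing_indexer(3, False)
    (array([0, 1, 2]), array([-1, -1, -1]))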
2274 """
2275 idx = np.arange(n, dtype=np.intp)
2276 idx_missing = np.full(shape=n, fill_value=-1, dtype=np.intp)
2277 if left_missing:
2278 return idx_missing, idx
2279 return idx, idx_missing
2280
2281
2282def _left_join_on_index(
2283 left_ax: Index, right_ax: Index, join_keys, sort: bool = False
2284) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp]]:
2285 if len(join_keys) > 1:
2286 if not (
2287 isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels
2288 ):
2289 raise AssertionError(
2290 "If more than one join key is given then "
2291 "'right_ax' must be a MultiIndex and the "
2292 "number of join keys must be the number of levels in right_ax"
2293 )
2294
2295 left_indexer, right_indexer = _get_multiindex_indexer(
2296 join_keys, right_ax, sort=sort
2297 )
2298 else:
2299 jkey = join_keys[0]
2300
2301 left_indexer, right_indexer = _get_single_indexer(jkey, right_ax, sort=sort)
2302
2303 if sort or len(left_ax) != len(left_indexer):
2304 # if asked to sort or there are 1-to-many matches
2305 join_index = left_ax.take(left_indexer)
2306 return join_index, left_indexer, right_indexer
2307
2308 # left frame preserves order & length of its index
2309 return left_ax, None, right_indexer
2310
2311
2312def _factorize_keys(
2313 lk: ArrayLike,
2314 rk: ArrayLike,
2315 sort: bool = True,
2316 how: MergeHow | Literal["asof"] = "inner",
2317) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
2318 """
2319 Encode left and right keys as enumerated types.
2320
2321 This is used to get the join indexers to be used when merging DataFrames.
2322
2323 Parameters
2324 ----------
2325 lk : array-like
2326 Left key.
2327 rk : array-like
2328 Right key.
    sort : bool, default True
2330 If True, the encoding is done such that the unique elements in the
2331 keys are sorted.
    how : {'left', 'right', 'outer', 'inner'}, default 'inner'
2333 Type of merge.
2334
2335 Returns
2336 -------
2337 np.ndarray[np.intp]
        Left (resp. right if called with `how='right'`) labels, as enumerated type.
    np.ndarray[np.intp]
        Right (resp. left if called with `how='right'`) labels, as enumerated type.
2341 int
2342 Number of unique elements in union of left and right labels.
2343
2344 See Also
2345 --------
2346 merge : Merge DataFrame or named Series objects
2347 with a database-style join.
2348 algorithms.factorize : Encode the object as an enumerated type
2349 or categorical variable.
2350
2351 Examples
2352 --------
2353 >>> lk = np.array(["a", "c", "b"])
2354 >>> rk = np.array(["a", "c"])
2355
2356 Here, the unique values are `'a', 'b', 'c'`. With the default
2357 `sort=True`, the encoding will be `{0: 'a', 1: 'b', 2: 'c'}`:
2358
2359 >>> pd.core.reshape.merge._factorize_keys(lk, rk)
2360 (array([0, 2, 1]), array([0, 2]), 3)
2361
    With `sort=False`, the encoding will correspond to the order
2363 in which the unique elements first appear: `{0: 'a', 1: 'c', 2: 'b'}`:
2364
2365 >>> pd.core.reshape.merge._factorize_keys(lk, rk, sort=False)
2366 (array([0, 1, 2]), array([0, 1]), 3)
2367 """
2368 # Some pre-processing for non-ndarray lk / rk
2369 lk = extract_array(lk, extract_numpy=True, extract_range=True)
2370 rk = extract_array(rk, extract_numpy=True, extract_range=True)
2371 # TODO: if either is a RangeIndex, we can likely factorize more efficiently?
2372
2373 if (
2374 isinstance(lk.dtype, DatetimeTZDtype) and isinstance(rk.dtype, DatetimeTZDtype)
2375 ) or (
2376 isinstance(lk.dtype, np.dtype)
2377 and lk.dtype.kind == "M"
2378 and isinstance(rk.dtype, np.dtype)
2379 and rk.dtype.kind == "M"
2380 ):
2381 # Extract the ndarray (UTC-localized) values
    # Note: we don't need the dtypes to match, as these can still be compared
2383 lk, rk = cast("DatetimeArray", lk)._ensure_matching_resos(rk)
2384 lk = cast("DatetimeArray", lk)._ndarray
2385 rk = cast("DatetimeArray", rk)._ndarray
2386
2387 elif (
2388 is_categorical_dtype(lk.dtype)
2389 and is_categorical_dtype(rk.dtype)
2390 and is_dtype_equal(lk.dtype, rk.dtype)
2391 ):
2392 assert isinstance(lk, Categorical)
2393 assert isinstance(rk, Categorical)
2394 # Cast rk to encoding so we can compare codes with lk
2395
2396 rk = lk._encode_with_my_categories(rk)
2397
2398 lk = ensure_int64(lk.codes)
2399 rk = ensure_int64(rk.codes)
2400
2401 elif isinstance(lk, ExtensionArray) and is_dtype_equal(lk.dtype, rk.dtype):
2402 if not isinstance(lk, BaseMaskedArray) and not (
2403 # exclude arrow dtypes that would get cast to object
2404 isinstance(lk.dtype, ArrowDtype)
2405 and is_numeric_dtype(lk.dtype.numpy_dtype)
2406 ):
2407 lk, _ = lk._values_for_factorize()
2408
2409 # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute
2410 # "_values_for_factorize"
2411 rk, _ = rk._values_for_factorize() # type: ignore[union-attr]
2412
2413 if needs_i8_conversion(lk.dtype) and lk.dtype == rk.dtype:
2414 # GH#23917 TODO: Needs tests for non-matching dtypes
2415 # GH#23917 TODO: needs tests for case where lk is integer-dtype
2416 # and rk is datetime-dtype
2417 lk = np.asarray(lk, dtype=np.int64)
2418 rk = np.asarray(rk, dtype=np.int64)
2419
2420 klass, lk, rk = _convert_arrays_and_get_rizer_klass(lk, rk)
2421
2422 rizer = klass(max(len(lk), len(rk)))
2423
2424 if isinstance(lk, BaseMaskedArray):
2425 assert isinstance(rk, BaseMaskedArray)
2426 llab = rizer.factorize(lk._data, mask=lk._mask)
2427 rlab = rizer.factorize(rk._data, mask=rk._mask)
2428 elif isinstance(lk, ArrowExtensionArray):
2429 assert isinstance(rk, ArrowExtensionArray)
2430 # we can only get here with numeric dtypes
2431 # TODO: Remove when we have a Factorizer for Arrow
2432 llab = rizer.factorize(
2433 lk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=lk.isna()
2434 )
2435 rlab = rizer.factorize(
2436 rk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=rk.isna()
2437 )
2438 else:
2439 # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type
2440 # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]],
2441 # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]"
2442 llab = rizer.factorize(lk) # type: ignore[arg-type]
2443 rlab = rizer.factorize(rk) # type: ignore[arg-type]
2444 assert llab.dtype == np.dtype(np.intp), llab.dtype
2445 assert rlab.dtype == np.dtype(np.intp), rlab.dtype
2446
2447 count = rizer.get_count()
2448
2449 if sort:
2450 uniques = rizer.uniques.to_array()
2451 llab, rlab = _sort_labels(uniques, llab, rlab)
2452
2453 # NA group
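    # Remap the NA codes (-1) on both sides to the same new code `count`, so
    # that missing keys match each other during the join; e.g. llab [0, -1]
    # and rlab [-1, 1] with count=2 become [0, 2] and [2, 1] with count=3.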
2454 lmask = llab == -1
2455 lany = lmask.any()
2456 rmask = rlab == -1
2457 rany = rmask.any()
2458
2459 if lany or rany:
2460 if lany:
2461 np.putmask(llab, lmask, count)
2462 if rany:
2463 np.putmask(rlab, rmask, count)
2464 count += 1
2465
2466 if how == "right":
2467 return rlab, llab, count
2468 return llab, rlab, count
2469
2470
2471def _convert_arrays_and_get_rizer_klass(
2472 lk: ArrayLike, rk: ArrayLike
2473) -> tuple[type[libhashtable.Factorizer], ArrayLike, ArrayLike]:
2474 klass: type[libhashtable.Factorizer]
2475 if is_numeric_dtype(lk.dtype):
2476 if not is_dtype_equal(lk, rk):
2477 dtype = find_common_type([lk.dtype, rk.dtype])
2478 if isinstance(dtype, ExtensionDtype):
2479 cls = dtype.construct_array_type()
2480 if not isinstance(lk, ExtensionArray):
2481 lk = cls._from_sequence(lk, dtype=dtype, copy=False)
2482 else:
2483 lk = lk.astype(dtype)
2484
2485 if not isinstance(rk, ExtensionArray):
2486 rk = cls._from_sequence(rk, dtype=dtype, copy=False)
2487 else:
2488 rk = rk.astype(dtype)
2489 else:
2490 lk = lk.astype(dtype)
2491 rk = rk.astype(dtype)
2492 if isinstance(lk, BaseMaskedArray):
2493 # Invalid index type "type" for "Dict[Type[object], Type[Factorizer]]";
2494 # expected type "Type[object]"
2495 klass = _factorizers[lk.dtype.type] # type: ignore[index]
2496 elif isinstance(lk.dtype, ArrowDtype):
2497 klass = _factorizers[lk.dtype.numpy_dtype.type]
2498 else:
2499 klass = _factorizers[lk.dtype.type]
2500
2501 else:
2502 klass = libhashtable.ObjectFactorizer
2503 lk = ensure_object(lk)
2504 rk = ensure_object(rk)
2505 return klass, lk, rk
2506
2507
2508def _sort_labels(
2509 uniques: np.ndarray, left: npt.NDArray[np.intp], right: npt.NDArray[np.intp]
2510) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
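    # Remap the codes in `left`/`right` so they index into `uniques` after
    # sorting; safe_sort keeps the -1 NA sentinel in place.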
2511 llength = len(left)
2512 labels = np.concatenate([left, right])
2513
2514 _, new_labels = algos.safe_sort(uniques, labels, use_na_sentinel=True)
2515 new_left, new_right = new_labels[:llength], new_labels[llength:]
2516
2517 return new_left, new_right
2518
2519
2520def _get_join_keys(
2521 llab: list[npt.NDArray[np.int64 | np.intp]],
2522 rlab: list[npt.NDArray[np.int64 | np.intp]],
2523 shape: Shape,
2524 sort: bool,
2525) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]:
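    # Collapse the per-level codes into one int64 key per row, in the spirit
    # of np.ravel_multi_index: with shape (3, 4), codes (i, j) map to
    # i * 4 + j. If combining every level could overflow int64, only the
    # first `nlev` levels are combined, the partial keys are densified by
    # factorizing, and the remaining levels are folded in recursively.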
2526 # how many levels can be done without overflow
2527 nlev = next(
2528 lev
2529 for lev in range(len(shape), 0, -1)
2530 if not is_int64_overflow_possible(shape[:lev])
2531 )
2532
2533 # get keys for the first `nlev` levels
2534 stride = np.prod(shape[1:nlev], dtype="i8")
2535 lkey = stride * llab[0].astype("i8", subok=False, copy=False)
2536 rkey = stride * rlab[0].astype("i8", subok=False, copy=False)
2537
2538 for i in range(1, nlev):
2539 with np.errstate(divide="ignore"):
2540 stride //= shape[i]
2541 lkey += llab[i] * stride
2542 rkey += rlab[i] * stride
2543
2544 if nlev == len(shape): # all done!
2545 return lkey, rkey
2546
2547 # densify current keys to avoid overflow
2548 lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)
2549
2550 llab = [lkey] + llab[nlev:]
2551 rlab = [rkey] + rlab[nlev:]
2552 shape = (count,) + shape[nlev:]
2553
2554 return _get_join_keys(llab, rlab, shape, sort)
2555
2556
2557def _should_fill(lname, rname) -> bool:
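    # e.g. ("key", "key") -> True and ("lkey", "rkey") -> False; non-str
    # names (such as MultiIndex tuples) always return True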
2558 if not isinstance(lname, str) or not isinstance(rname, str):
2559 return True
2560 return lname == rname
2561
2562
2563def _any(x) -> bool:
2564 return x is not None and com.any_not_none(*x)
2565
2566
2567def _validate_operand(obj: DataFrame | Series) -> DataFrame:
2568 if isinstance(obj, ABCDataFrame):
2569 return obj
2570 elif isinstance(obj, ABCSeries):
2571 if obj.name is None:
2572 raise ValueError("Cannot merge a Series without a name")
2573 return obj.to_frame()
2574 else:
2575 raise TypeError(
2576 f"Can only merge Series or DataFrame objects, a {type(obj)} was passed"
2577 )
2578
2579
2580def _items_overlap_with_suffix(
2581 left: Index, right: Index, suffixes: Suffixes
2582) -> tuple[Index, Index]:
2583 """
    Validate the type of ``suffixes`` and, if the two indices overlap, add
    suffixes to the overlapping entries.

    If the corresponding suffix is empty, the entry is simply converted to
    string.
2589
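    Examples
    --------
    Illustrative; only the overlapping entry "a" is suffixed:

    >>> pd.core.reshape.merge._items_overlap_with_suffix(
    ...     pd.Index(["a", "b"]), pd.Index(["a", "c"]), ("_x", "_y")
    ... )
    (Index(['a_x', 'b'], dtype='object'), Index(['a_y', 'c'], dtype='object'))
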
2590 """
2591 if not is_list_like(suffixes, allow_sets=False) or isinstance(suffixes, dict):
2592 raise TypeError(
            f"Passing 'suffixes' as a {type(suffixes)} is not supported. "
2594 "Provide 'suffixes' as a tuple instead."
2595 )
2596
2597 to_rename = left.intersection(right)
2598 if len(to_rename) == 0:
2599 return left, right
2600
2601 lsuffix, rsuffix = suffixes
2602
2603 if not lsuffix and not rsuffix:
2604 raise ValueError(f"columns overlap but no suffix specified: {to_rename}")
2605
2606 def renamer(x, suffix):
2607 """
2608 Rename the left and right indices.
2609
2610 If there is overlap, and suffix is not None, add
2611 suffix, otherwise, leave it as-is.
2612
2613 Parameters
2614 ----------
2615 x : original column name
2616 suffix : str or None
2617
2618 Returns
2619 -------
2620 x : renamed column name
2621 """
2622 if x in to_rename and suffix is not None:
2623 return f"{x}{suffix}"
2624 return x
2625
2626 lrenamer = partial(renamer, suffix=lsuffix)
2627 rrenamer = partial(renamer, suffix=rsuffix)
2628
2629 llabels = left._transform_index(lrenamer)
2630 rlabels = right._transform_index(rrenamer)
2631
2632 dups = []
2633 if not llabels.is_unique:
        # Only raise when duplicates are caused by the suffixes; columns that
        # were already duplicated in the input should not raise
2636 dups = llabels[(llabels.duplicated()) & (~left.duplicated())].tolist()
2637 if not rlabels.is_unique:
2638 dups.extend(rlabels[(rlabels.duplicated()) & (~right.duplicated())].tolist())
2639 if dups:
2640 raise MergeError(
2641 f"Passing 'suffixes' which cause duplicate columns {set(dups)} is "
2642 f"not allowed.",
2643 )
2644
2645 return llabels, rlabels