Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/reshape/merge.py: 11%

919 statements  

1""" 

2SQL-style merge routines 

3""" 

4from __future__ import annotations 

5 

6import copy as cp 

7import datetime 

8from functools import partial 

9import string 

10from typing import ( 

11 TYPE_CHECKING, 

12 Hashable, 

13 Literal, 

14 Sequence, 

15 cast, 

16) 

17import uuid 

18import warnings 

19 

20import numpy as np 

21 

22from pandas._libs import ( 

23 Timedelta, 

24 hashtable as libhashtable, 

25 join as libjoin, 

26 lib, 

27) 

28from pandas._libs.lib import is_range_indexer 

29from pandas._typing import ( 

30 AnyArrayLike, 

31 ArrayLike, 

32 AxisInt, 

33 DtypeObj, 

34 IndexLabel, 

35 JoinHow, 

36 MergeHow, 

37 Shape, 

38 Suffixes, 

39 npt, 

40) 

41from pandas.errors import MergeError 

42from pandas.util._decorators import ( 

43 Appender, 

44 Substitution, 

45 cache_readonly, 

46) 

47from pandas.util._exceptions import find_stack_level 

48 

49from pandas.core.dtypes.base import ExtensionDtype 

50from pandas.core.dtypes.cast import find_common_type 

51from pandas.core.dtypes.common import ( 

52 ensure_float64, 

53 ensure_int64, 

54 ensure_object, 

55 is_array_like, 

56 is_bool, 

57 is_bool_dtype, 

58 is_categorical_dtype, 

59 is_dtype_equal, 

60 is_extension_array_dtype, 

61 is_float_dtype, 

62 is_integer, 

63 is_integer_dtype, 

64 is_list_like, 

65 is_number, 

66 is_numeric_dtype, 

67 is_object_dtype, 

68 needs_i8_conversion, 

69) 

70from pandas.core.dtypes.dtypes import DatetimeTZDtype 

71from pandas.core.dtypes.generic import ( 

72 ABCDataFrame, 

73 ABCSeries, 

74) 

75from pandas.core.dtypes.missing import ( 

76 isna, 

77 na_value_for_dtype, 

78) 

79 

80from pandas import ( 

81 ArrowDtype, 

82 Categorical, 

83 Index, 

84 MultiIndex, 

85 Series, 

86) 

87import pandas.core.algorithms as algos 

88from pandas.core.arrays import ( 

89 ArrowExtensionArray, 

90 BaseMaskedArray, 

91 ExtensionArray, 

92) 

93from pandas.core.arrays._mixins import NDArrayBackedExtensionArray 

94import pandas.core.common as com 

95from pandas.core.construction import ( 

96 ensure_wrapped_if_datetimelike, 

97 extract_array, 

98) 

99from pandas.core.frame import _merge_doc 

100from pandas.core.indexes.api import default_index 

101from pandas.core.sorting import is_int64_overflow_possible 

102 

103if TYPE_CHECKING: 

104 from pandas import DataFrame 

105 from pandas.core import groupby 

106 from pandas.core.arrays import DatetimeArray 

107 

108_factorizers = { 

109 np.int64: libhashtable.Int64Factorizer, 

110 np.longlong: libhashtable.Int64Factorizer, 

111 np.int32: libhashtable.Int32Factorizer, 

112 np.int16: libhashtable.Int16Factorizer, 

113 np.int8: libhashtable.Int8Factorizer, 

114 np.uint64: libhashtable.UInt64Factorizer, 

115 np.uint32: libhashtable.UInt32Factorizer, 

116 np.uint16: libhashtable.UInt16Factorizer, 

117 np.uint8: libhashtable.UInt8Factorizer, 

118 np.bool_: libhashtable.UInt8Factorizer, 

119 np.float64: libhashtable.Float64Factorizer, 

120 np.float32: libhashtable.Float32Factorizer, 

121 np.complex64: libhashtable.Complex64Factorizer, 

122 np.complex128: libhashtable.Complex128Factorizer, 

123 np.object_: libhashtable.ObjectFactorizer, 

124} 

125 

126# See https://github.com/pandas-dev/pandas/issues/52451 

127if np.intc is not np.int32: 

128 _factorizers[np.intc] = libhashtable.Int64Factorizer 
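

# A minimal pure-Python sketch (illustrative only, not library code) of the
# service the factorizers above provide: both key arrays are encoded against
# one shared table, so equal values on either side receive the same integer
# label. The helper name is hypothetical.
def _toy_shared_factorize(left_vals, right_vals):
    table: dict = {}

    def _encode(vals):
        return [table.setdefault(v, len(table)) for v in vals]

    # Returns the two label lists plus the number of unique keys, e.g.
    # _toy_shared_factorize(["a", "b"], ["b", "c"]) -> ([0, 1], [1, 2], 3)
    return _encode(left_vals), _encode(right_vals), len(table)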


@Substitution("\nleft : DataFrame or named Series")
@Appender(_merge_doc, indents=0)
def merge(
    left: DataFrame | Series,
    right: DataFrame | Series,
    how: MergeHow = "inner",
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_index: bool = False,
    right_index: bool = False,
    sort: bool = False,
    suffixes: Suffixes = ("_x", "_y"),
    copy: bool | None = None,
    indicator: str | bool = False,
    validate: str | None = None,
) -> DataFrame:
    op = _MergeOperation(
        left,
        right,
        how=how,
        on=on,
        left_on=left_on,
        right_on=right_on,
        left_index=left_index,
        right_index=right_index,
        sort=sort,
        suffixes=suffixes,
        indicator=indicator,
        validate=validate,
    )
    return op.get_result(copy=copy)
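

# Illustrative usage (hypothetical frames, shown as comments so the module's
# behavior is unchanged): a plain inner join on a shared key column.
#
#   >>> df1 = pd.DataFrame({"key": ["a", "b", "c"], "x": [1, 2, 3]})
#   >>> df2 = pd.DataFrame({"key": ["b", "c", "d"], "y": [4, 5, 6]})
#   >>> merge(df1, df2, on="key", how="inner")
#     key  x  y
#   0   b  2  4
#   1   c  3  5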


def _groupby_and_merge(by, left: DataFrame, right: DataFrame, merge_pieces):
    """
    groupby & merge; we are always performing a left-by type operation

    Parameters
    ----------
    by: field to group
    left: DataFrame
    right: DataFrame
    merge_pieces: function for merging
    """
    pieces = []
    if not isinstance(by, (list, tuple)):
        by = [by]

    lby = left.groupby(by, sort=False)
    rby: groupby.DataFrameGroupBy | None = None

    # if we can groupby the rhs
    # then we can get vastly better perf
    if all(item in right.columns for item in by):
        rby = right.groupby(by, sort=False)

    for key, lhs in lby.grouper.get_iterator(lby._selected_obj, axis=lby.axis):
        if rby is None:
            rhs = right
        else:
            try:
                rhs = right.take(rby.indices[key])
            except KeyError:
                # key doesn't exist in right
                lcols = lhs.columns.tolist()
                cols = lcols + [r for r in right.columns if r not in set(lcols)]
                merged = lhs.reindex(columns=cols)
                merged.index = range(len(merged))
                pieces.append(merged)
                continue

        merged = merge_pieces(lhs, rhs)

        # make sure join keys are in the merged
        # TODO, should merge_pieces do this?
        merged[by] = key

        pieces.append(merged)

    # preserve the original order
    # if we have a missing piece this can be reset
    from pandas.core.reshape.concat import concat

    result = concat(pieces, ignore_index=True)
    result = result.reindex(columns=pieces[0].columns, copy=False)
    return result, lby
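

def _toy_groupby_and_merge(left: DataFrame, right: DataFrame, by, merge_pieces):
    # Hypothetical simplification (not the library routine) of the strategy
    # above: merge each left-side group against the matching right-side rows
    # and stack the pieces. Unlike _groupby_and_merge, this sketch assumes
    # every left group key also exists on the right.
    from pandas.core.reshape.concat import concat

    rby = right.groupby(by, sort=False)
    pieces = [
        merge_pieces(lhs, right.take(rby.indices[key]))
        for key, lhs in left.groupby(by, sort=False)
    ]
    return concat(pieces, ignore_index=True)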


def merge_ordered(
    left: DataFrame,
    right: DataFrame,
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_by=None,
    right_by=None,
    fill_method: str | None = None,
    suffixes: Suffixes = ("_x", "_y"),
    how: JoinHow = "outer",
) -> DataFrame:
    """
    Perform a merge for ordered data with optional filling/interpolation.

    Designed for ordered data like time series data. Optionally
    perform group-wise merge (see examples).

    Parameters
    ----------
    left : DataFrame or named Series
    right : DataFrame or named Series
    on : label or list
        Field names to join on. Must be found in both DataFrames.
    left_on : label or list, or array-like
        Field names to join on in left DataFrame. Can be a vector or list of
        vectors of the length of the DataFrame to use a particular vector as
        the join key instead of columns.
    right_on : label or list, or array-like
        Field names to join on in right DataFrame or vector/list of vectors per
        left_on docs.
    left_by : column name or list of column names
        Group left DataFrame by group columns and merge piece by piece with
        right DataFrame. Must be None if either left or right are a Series.
    right_by : column name or list of column names
        Group right DataFrame by group columns and merge piece by piece with
        left DataFrame. Must be None if either left or right are a Series.
    fill_method : {'ffill', None}, default None
        Interpolation method for data.
    suffixes : list-like, default is ("_x", "_y")
        A length-2 sequence where each element is optionally a string
        indicating the suffix to add to overlapping column names in
        `left` and `right` respectively. Pass a value of `None` instead
        of a string to indicate that the column name from `left` or
        `right` should be left as-is, with no suffix. At least one of the
        values must not be None.

    how : {'left', 'right', 'outer', 'inner'}, default 'outer'
        * left: use only keys from left frame (SQL: left outer join)
        * right: use only keys from right frame (SQL: right outer join)
        * outer: use union of keys from both frames (SQL: full outer join)
        * inner: use intersection of keys from both frames (SQL: inner join).

    Returns
    -------
    DataFrame
        The merged DataFrame output type will be the same as
        'left', if it is a subclass of DataFrame.

    See Also
    --------
    merge : Merge with a database-style join.
    merge_asof : Merge on nearest keys.

    Examples
    --------
    >>> from pandas import merge_ordered
    >>> df1 = pd.DataFrame(
    ...     {
    ...         "key": ["a", "c", "e", "a", "c", "e"],
    ...         "lvalue": [1, 2, 3, 1, 2, 3],
    ...         "group": ["a", "a", "a", "b", "b", "b"]
    ...     }
    ... )
    >>> df1
      key  lvalue group
    0   a       1     a
    1   c       2     a
    2   e       3     a
    3   a       1     b
    4   c       2     b
    5   e       3     b

    >>> df2 = pd.DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})
    >>> df2
      key  rvalue
    0   b       1
    1   c       2
    2   d       3

    >>> merge_ordered(df1, df2, fill_method="ffill", left_by="group")
      key  lvalue group  rvalue
    0   a       1     a     NaN
    1   b       1     a     1.0
    2   c       2     a     2.0
    3   d       2     a     3.0
    4   e       3     a     3.0
    5   a       1     b     NaN
    6   b       1     b     1.0
    7   c       2     b     2.0
    8   d       2     b     3.0
    9   e       3     b     3.0
    """

    def _merger(x, y) -> DataFrame:
        # perform the ordered merge operation
        op = _OrderedMerge(
            x,
            y,
            on=on,
            left_on=left_on,
            right_on=right_on,
            suffixes=suffixes,
            fill_method=fill_method,
            how=how,
        )
        return op.get_result()

    if left_by is not None and right_by is not None:
        raise ValueError("Can only group either left or right frames")
    if left_by is not None:
        if isinstance(left_by, str):
            left_by = [left_by]
        check = set(left_by).difference(left.columns)
        if len(check) != 0:
            raise KeyError(f"{check} not found in left columns")
        result, _ = _groupby_and_merge(left_by, left, right, lambda x, y: _merger(x, y))
    elif right_by is not None:
        if isinstance(right_by, str):
            right_by = [right_by]
        check = set(right_by).difference(right.columns)
        if len(check) != 0:
            raise KeyError(f"{check} not found in right columns")
        result, _ = _groupby_and_merge(
            right_by, right, left, lambda x, y: _merger(y, x)
        )
    else:
        result = _merger(left, right)
    return result


def merge_asof(
    left: DataFrame | Series,
    right: DataFrame | Series,
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_index: bool = False,
    right_index: bool = False,
    by=None,
    left_by=None,
    right_by=None,
    suffixes: Suffixes = ("_x", "_y"),
    tolerance=None,
    allow_exact_matches: bool = True,
    direction: str = "backward",
) -> DataFrame:
    """
    Perform a merge by key distance.

    This is similar to a left-join except that we match on nearest
    key rather than equal keys. Both DataFrames must be sorted by the key.

    For each row in the left DataFrame:

      - A "backward" search selects the last row in the right DataFrame whose
        'on' key is less than or equal to the left's key.

      - A "forward" search selects the first row in the right DataFrame whose
        'on' key is greater than or equal to the left's key.

      - A "nearest" search selects the row in the right DataFrame whose 'on'
        key is closest in absolute distance to the left's key.

    The default is "backward" and is the only behavior available in versions
    below 0.20.0. The direction parameter was added in version 0.20.0 and
    introduces "forward" and "nearest".

    Optionally match on equivalent keys with 'by' before searching with 'on'.

    Parameters
    ----------
    left : DataFrame or named Series
    right : DataFrame or named Series
    on : label
        Field name to join on. Must be found in both DataFrames.
        The data MUST be ordered. Furthermore this must be a numeric column,
        such as datetimelike, integer, or float. On or left_on/right_on
        must be given.
    left_on : label
        Field name to join on in left DataFrame.
    right_on : label
        Field name to join on in right DataFrame.
    left_index : bool
        Use the index of the left DataFrame as the join key.
    right_index : bool
        Use the index of the right DataFrame as the join key.
    by : column name or list of column names
        Match on these columns before performing merge operation.
    left_by : column name
        Field names to match on in the left DataFrame.
    right_by : column name
        Field names to match on in the right DataFrame.
    suffixes : 2-length sequence (tuple, list, ...)
        Suffix to apply to overlapping column names in the left and right
        side, respectively.
    tolerance : int or Timedelta, optional, default None
        Select asof tolerance within this range; must be compatible
        with the merge index.
    allow_exact_matches : bool, default True

        - If True, allow matching with the same 'on' value
          (i.e. less-than-or-equal-to / greater-than-or-equal-to)
        - If False, don't match the same 'on' value
          (i.e., strictly less-than / strictly greater-than).

    direction : 'backward' (default), 'forward', or 'nearest'
        Whether to search for prior, subsequent, or closest matches.

    Returns
    -------
    DataFrame

    See Also
    --------
    merge : Merge with a database-style join.
    merge_ordered : Merge with optional filling/interpolation.

    Examples
    --------
    >>> left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]})
    >>> left
        a left_val
    0   1        a
    1   5        b
    2  10        c

    >>> right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]})
    >>> right
       a  right_val
    0  1          1
    1  2          2
    2  3          3
    3  6          6
    4  7          7

    >>> pd.merge_asof(left, right, on="a")
        a left_val  right_val
    0   1        a          1
    1   5        b          3
    2  10        c          7

    >>> pd.merge_asof(left, right, on="a", allow_exact_matches=False)
        a left_val  right_val
    0   1        a        NaN
    1   5        b        3.0
    2  10        c        7.0

    >>> pd.merge_asof(left, right, on="a", direction="forward")
        a left_val  right_val
    0   1        a        1.0
    1   5        b        6.0
    2  10        c        NaN

    >>> pd.merge_asof(left, right, on="a", direction="nearest")
        a left_val  right_val
    0   1        a          1
    1   5        b          6
    2  10        c          7

    We can use indexed DataFrames as well.

    >>> left = pd.DataFrame({"left_val": ["a", "b", "c"]}, index=[1, 5, 10])
    >>> left
       left_val
    1         a
    5         b
    10        c

    >>> right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7]}, index=[1, 2, 3, 6, 7])
    >>> right
       right_val
    1          1
    2          2
    3          3
    6          6
    7          7

    >>> pd.merge_asof(left, right, left_index=True, right_index=True)
       left_val  right_val
    1         a          1
    5         b          3
    10        c          7

    Here is a real-world time-series example

    >>> quotes = pd.DataFrame(
    ...     {
    ...         "time": [
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.030"),
    ...             pd.Timestamp("2016-05-25 13:30:00.041"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.049"),
    ...             pd.Timestamp("2016-05-25 13:30:00.072"),
    ...             pd.Timestamp("2016-05-25 13:30:00.075")
    ...         ],
    ...         "ticker": [
    ...             "GOOG",
    ...             "MSFT",
    ...             "MSFT",
    ...             "MSFT",
    ...             "GOOG",
    ...             "AAPL",
    ...             "GOOG",
    ...             "MSFT"
    ...         ],
    ...         "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
    ...         "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]
    ...     }
    ... )
    >>> quotes
                         time ticker     bid     ask
    0 2016-05-25 13:30:00.023   GOOG  720.50  720.93
    1 2016-05-25 13:30:00.023   MSFT   51.95   51.96
    2 2016-05-25 13:30:00.030   MSFT   51.97   51.98
    3 2016-05-25 13:30:00.041   MSFT   51.99   52.00
    4 2016-05-25 13:30:00.048   GOOG  720.50  720.93
    5 2016-05-25 13:30:00.049   AAPL   97.99   98.01
    6 2016-05-25 13:30:00.072   GOOG  720.50  720.88
    7 2016-05-25 13:30:00.075   MSFT   52.01   52.03

    >>> trades = pd.DataFrame(
    ...     {
    ...         "time": [
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.038"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048")
    ...         ],
    ...         "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
    ...         "price": [51.95, 51.95, 720.77, 720.92, 98.0],
    ...         "quantity": [75, 155, 100, 100, 100]
    ...     }
    ... )
    >>> trades
                         time ticker   price  quantity
    0 2016-05-25 13:30:00.023   MSFT   51.95        75
    1 2016-05-25 13:30:00.038   MSFT   51.95       155
    2 2016-05-25 13:30:00.048   GOOG  720.77       100
    3 2016-05-25 13:30:00.048   GOOG  720.92       100
    4 2016-05-25 13:30:00.048   AAPL   98.00       100

    By default we are taking the asof of the quotes

    >>> pd.merge_asof(trades, quotes, on="time", by="ticker")
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155   51.97   51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 2ms between the quote time and the trade time

    >>> pd.merge_asof(
    ...     trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("2ms")
    ... )
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155     NaN     NaN
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 10ms between the quote time and the trade time
    and we exclude exact matches on time. However *prior* data will
    propagate forward

    >>> pd.merge_asof(
    ...     trades,
    ...     quotes,
    ...     on="time",
    ...     by="ticker",
    ...     tolerance=pd.Timedelta("10ms"),
    ...     allow_exact_matches=False
    ... )
                         time ticker   price  quantity    bid    ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75    NaN    NaN
    1 2016-05-25 13:30:00.038   MSFT   51.95       155  51.97  51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100    NaN    NaN
    3 2016-05-25 13:30:00.048   GOOG  720.92       100    NaN    NaN
    4 2016-05-25 13:30:00.048   AAPL   98.00       100    NaN    NaN
    """
    op = _AsOfMerge(
        left,
        right,
        on=on,
        left_on=left_on,
        right_on=right_on,
        left_index=left_index,
        right_index=right_index,
        by=by,
        left_by=left_by,
        right_by=right_by,
        suffixes=suffixes,
        how="asof",
        tolerance=tolerance,
        allow_exact_matches=allow_exact_matches,
        direction=direction,
    )
    return op.get_result()


# TODO: transformations??
# TODO: only copy DataFrames when modification necessary
class _MergeOperation:
    """
    Perform a database (SQL) merge operation between two DataFrame or Series
    objects using either columns as keys or their row indexes
    """

    _merge_type = "merge"
    how: MergeHow | Literal["asof"]
    on: IndexLabel | None
    # left_on/right_on may be None when passed, but in validate_specification
    # get replaced with non-None.
    left_on: Sequence[Hashable | AnyArrayLike]
    right_on: Sequence[Hashable | AnyArrayLike]
    left_index: bool
    right_index: bool
    axis: AxisInt
    bm_axis: AxisInt
    sort: bool
    suffixes: Suffixes
    copy: bool
    indicator: str | bool
    validate: str | None
    join_names: list[Hashable]
    right_join_keys: list[AnyArrayLike]
    left_join_keys: list[AnyArrayLike]

    def __init__(
        self,
        left: DataFrame | Series,
        right: DataFrame | Series,
        how: MergeHow | Literal["asof"] = "inner",
        on: IndexLabel | None = None,
        left_on: IndexLabel | None = None,
        right_on: IndexLabel | None = None,
        axis: AxisInt = 1,
        left_index: bool = False,
        right_index: bool = False,
        sort: bool = True,
        suffixes: Suffixes = ("_x", "_y"),
        indicator: str | bool = False,
        validate: str | None = None,
    ) -> None:
        _left = _validate_operand(left)
        _right = _validate_operand(right)
        self.left = self.orig_left = _left
        self.right = self.orig_right = _right
        self.how = how

        # bm_axis -> the axis on the BlockManager
        self.bm_axis = axis
        # axis --> the axis on the Series/DataFrame
        self.axis = 1 - axis if self.left.ndim == 2 else 0

        self.on = com.maybe_make_list(on)

        self.suffixes = suffixes
        self.sort = sort

        self.left_index = left_index
        self.right_index = right_index

        self.indicator = indicator

        if not is_bool(left_index):
            raise ValueError(
                f"left_index parameter must be of type bool, not {type(left_index)}"
            )
        if not is_bool(right_index):
            raise ValueError(
                f"right_index parameter must be of type bool, not {type(right_index)}"
            )

        # GH 40993: raise when merging between different levels; enforced in 2.0
        if _left.columns.nlevels != _right.columns.nlevels:
            msg = (
                "Not allowed to merge between different levels. "
                f"({_left.columns.nlevels} levels on the left, "
                f"{_right.columns.nlevels} on the right)"
            )
            raise MergeError(msg)

        self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on)

        cross_col = None
        if self.how == "cross":
            (
                self.left,
                self.right,
                self.how,
                cross_col,
            ) = self._create_cross_configuration(self.left, self.right)
            self.left_on = self.right_on = [cross_col]
        self._cross = cross_col

        # note this function has side effects
        (
            self.left_join_keys,
            self.right_join_keys,
            self.join_names,
        ) = self._get_merge_keys()

        # validate the merge keys dtypes. We may need to coerce
        # to avoid incompatible dtypes
        self._maybe_coerce_merge_keys()

        # If argument passed to validate,
        # check if columns specified as unique
        # are in fact unique.
        if validate is not None:
            self._validate(validate)

    def _reindex_and_concat(
        self,
        join_index: Index,
        left_indexer: npt.NDArray[np.intp] | None,
        right_indexer: npt.NDArray[np.intp] | None,
        copy: bool | None,
    ) -> DataFrame:
        """
        reindex along index and concat along columns.
        """
        # Take views so we do not alter the originals
        left = self.left[:]
        right = self.right[:]

        llabels, rlabels = _items_overlap_with_suffix(
            self.left._info_axis, self.right._info_axis, self.suffixes
        )

        if left_indexer is not None and not is_range_indexer(left_indexer, len(left)):
            # Pinning the index here (and in the right code just below) is not
            # necessary, but makes the `.take` more performant if we have e.g.
            # a MultiIndex for left.index.
            lmgr = left._mgr.reindex_indexer(
                join_index,
                left_indexer,
                axis=1,
                copy=False,
                only_slice=True,
                allow_dups=True,
                use_na_proxy=True,
            )
            left = left._constructor(lmgr)
            left.index = join_index

        if right_indexer is not None and not is_range_indexer(
            right_indexer, len(right)
        ):
            rmgr = right._mgr.reindex_indexer(
                join_index,
                right_indexer,
                axis=1,
                copy=False,
                only_slice=True,
                allow_dups=True,
                use_na_proxy=True,
            )
            right = right._constructor(rmgr)
            right.index = join_index

        from pandas import concat

        left.columns = llabels
        right.columns = rlabels
        result = concat([left, right], axis=1, copy=copy)
        return result

    def get_result(self, copy: bool | None = True) -> DataFrame:
        if self.indicator:
            self.left, self.right = self._indicator_pre_merge(self.left, self.right)

        join_index, left_indexer, right_indexer = self._get_join_info()

        result = self._reindex_and_concat(
            join_index, left_indexer, right_indexer, copy=copy
        )
        result = result.__finalize__(self, method=self._merge_type)

        if self.indicator:
            result = self._indicator_post_merge(result)

        self._maybe_add_join_keys(result, left_indexer, right_indexer)

        self._maybe_restore_index_levels(result)

        self._maybe_drop_cross_column(result, self._cross)

        return result.__finalize__(self, method="merge")

    def _maybe_drop_cross_column(
        self, result: DataFrame, cross_col: str | None
    ) -> None:
        if cross_col is not None:
            del result[cross_col]

    @cache_readonly
    def _indicator_name(self) -> str | None:
        if isinstance(self.indicator, str):
            return self.indicator
        elif isinstance(self.indicator, bool):
            return "_merge" if self.indicator else None
        else:
            raise ValueError(
                "indicator option can only accept boolean or string arguments"
            )

    def _indicator_pre_merge(
        self, left: DataFrame, right: DataFrame
    ) -> tuple[DataFrame, DataFrame]:
        columns = left.columns.union(right.columns)

        for i in ["_left_indicator", "_right_indicator"]:
            if i in columns:
                raise ValueError(
                    "Cannot use `indicator=True` option when "
                    f"data contains a column named {i}"
                )
        if self._indicator_name in columns:
            raise ValueError(
                "Cannot use name of an existing column for indicator column"
            )

        left = left.copy()
        right = right.copy()

        left["_left_indicator"] = 1
        left["_left_indicator"] = left["_left_indicator"].astype("int8")

        right["_right_indicator"] = 2
        right["_right_indicator"] = right["_right_indicator"].astype("int8")

        return left, right

    def _indicator_post_merge(self, result: DataFrame) -> DataFrame:
        result["_left_indicator"] = result["_left_indicator"].fillna(0)
        result["_right_indicator"] = result["_right_indicator"].fillna(0)

        result[self._indicator_name] = Categorical(
            (result["_left_indicator"] + result["_right_indicator"]),
            categories=[1, 2, 3],
        )
        result[self._indicator_name] = result[
            self._indicator_name
        ].cat.rename_categories(["left_only", "right_only", "both"])

        result = result.drop(labels=["_left_indicator", "_right_indicator"], axis=1)
        return result
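
    # The indicator bookkeeping above reduces to simple arithmetic
    # (illustrative values): a left-only row sums to 1 + 0 = 1, a right-only
    # row to 0 + 2 = 2, and a matched row to 1 + 2 = 3, which the Categorical
    # then renames to "left_only", "right_only" and "both".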

    def _maybe_restore_index_levels(self, result: DataFrame) -> None:
        """
        Restore index levels specified as `on` parameters

        Here we check for cases where `self.left_on` and `self.right_on` pairs
        each reference an index level in their respective DataFrames. The
        joined columns corresponding to these pairs are then restored to the
        index of `result`.

        **Note:** This method has side effects. It modifies `result` in-place

        Parameters
        ----------
        result: DataFrame
            merge result

        Returns
        -------
        None
        """
        names_to_restore = []
        for name, left_key, right_key in zip(
            self.join_names, self.left_on, self.right_on
        ):
            if (
                # Argument 1 to "_is_level_reference" of "NDFrame" has incompatible
                # type "Union[Hashable, ExtensionArray, Index, Series]"; expected
                # "Hashable"
                self.orig_left._is_level_reference(left_key)  # type: ignore[arg-type]
                # Argument 1 to "_is_level_reference" of "NDFrame" has incompatible
                # type "Union[Hashable, ExtensionArray, Index, Series]"; expected
                # "Hashable"
                and self.orig_right._is_level_reference(
                    right_key  # type: ignore[arg-type]
                )
                and left_key == right_key
                and name not in result.index.names
            ):
                names_to_restore.append(name)

        if names_to_restore:
            result.set_index(names_to_restore, inplace=True)

    def _maybe_add_join_keys(
        self,
        result: DataFrame,
        left_indexer: np.ndarray | None,
        right_indexer: np.ndarray | None,
    ) -> None:
        left_has_missing = None
        right_has_missing = None

        assert all(is_array_like(x) for x in self.left_join_keys)

        keys = zip(self.join_names, self.left_on, self.right_on)
        for i, (name, lname, rname) in enumerate(keys):
            if not _should_fill(lname, rname):
                continue

            take_left, take_right = None, None

            if name in result:
                if left_indexer is not None and right_indexer is not None:
                    if name in self.left:
                        if left_has_missing is None:
                            left_has_missing = (left_indexer == -1).any()

                        if left_has_missing:
                            take_right = self.right_join_keys[i]

                            if not is_dtype_equal(
                                result[name].dtype, self.left[name].dtype
                            ):
                                take_left = self.left[name]._values

                    elif name in self.right:
                        if right_has_missing is None:
                            right_has_missing = (right_indexer == -1).any()

                        if right_has_missing:
                            take_left = self.left_join_keys[i]

                            if not is_dtype_equal(
                                result[name].dtype, self.right[name].dtype
                            ):
                                take_right = self.right[name]._values

            elif left_indexer is not None:
                take_left = self.left_join_keys[i]
                take_right = self.right_join_keys[i]

            if take_left is not None or take_right is not None:
                if take_left is None:
                    lvals = result[name]._values
                else:
                    # TODO: can we pin down take_left's type earlier?
                    take_left = extract_array(take_left, extract_numpy=True)
                    lfill = na_value_for_dtype(take_left.dtype)
                    lvals = algos.take_nd(take_left, left_indexer, fill_value=lfill)

                if take_right is None:
                    rvals = result[name]._values
                else:
                    # TODO: can we pin down take_right's type earlier?
                    taker = extract_array(take_right, extract_numpy=True)
                    rfill = na_value_for_dtype(taker.dtype)
                    rvals = algos.take_nd(taker, right_indexer, fill_value=rfill)

                # if we have an all missing left_indexer
                # make sure to just use the right values or vice-versa
                mask_left = left_indexer == -1
                # error: Item "bool" of "Union[Any, bool]" has no attribute "all"
                if mask_left.all():  # type: ignore[union-attr]
                    key_col = Index(rvals)
                    result_dtype = rvals.dtype
                elif right_indexer is not None and (right_indexer == -1).all():
                    key_col = Index(lvals)
                    result_dtype = lvals.dtype
                else:
                    key_col = Index(lvals).where(~mask_left, rvals)
                    result_dtype = find_common_type([lvals.dtype, rvals.dtype])
                    if (
                        lvals.dtype.kind == "M"
                        and rvals.dtype.kind == "M"
                        and result_dtype.kind == "O"
                    ):
                        # TODO(non-nano) Workaround for common_type not dealing
                        # with different resolutions
                        result_dtype = key_col.dtype

                if result._is_label_reference(name):
                    result[name] = Series(
                        key_col, dtype=result_dtype, index=result.index
                    )
                elif result._is_level_reference(name):
                    if isinstance(result.index, MultiIndex):
                        key_col.name = name
                        idx_list = [
                            result.index.get_level_values(level_name)
                            if level_name != name
                            else key_col
                            for level_name in result.index.names
                        ]

                        result.set_index(idx_list, inplace=True)
                    else:
                        result.index = Index(key_col, name=name)
                else:
                    result.insert(i, name or f"key_{i}", key_col)

    def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
        """return the join indexers"""
        return get_join_indexers(
            self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how
        )

    def _get_join_info(
        self,
    ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
        # make mypy happy
        assert self.how != "cross"
        left_ax = self.left.axes[self.axis]
        right_ax = self.right.axes[self.axis]

        if self.left_index and self.right_index and self.how != "asof":
            join_index, left_indexer, right_indexer = left_ax.join(
                right_ax, how=self.how, return_indexers=True, sort=self.sort
            )

        elif self.right_index and self.how == "left":
            join_index, left_indexer, right_indexer = _left_join_on_index(
                left_ax, right_ax, self.left_join_keys, sort=self.sort
            )

        elif self.left_index and self.how == "right":
            join_index, right_indexer, left_indexer = _left_join_on_index(
                right_ax, left_ax, self.right_join_keys, sort=self.sort
            )
        else:
            (left_indexer, right_indexer) = self._get_join_indexers()

            if self.right_index:
                if len(self.left) > 0:
                    join_index = self._create_join_index(
                        self.left.index,
                        self.right.index,
                        left_indexer,
                        how="right",
                    )
                else:
                    join_index = self.right.index.take(right_indexer)
            elif self.left_index:
                if self.how == "asof":
                    # GH#33463 asof should always behave like a left merge
                    join_index = self._create_join_index(
                        self.left.index,
                        self.right.index,
                        left_indexer,
                        how="left",
                    )

                elif len(self.right) > 0:
                    join_index = self._create_join_index(
                        self.right.index,
                        self.left.index,
                        right_indexer,
                        how="left",
                    )
                else:
                    join_index = self.left.index.take(left_indexer)
            else:
                join_index = default_index(len(left_indexer))

        if len(join_index) == 0 and not isinstance(join_index, MultiIndex):
            join_index = default_index(0).set_names(join_index.name)
        return join_index, left_indexer, right_indexer

    def _create_join_index(
        self,
        index: Index,
        other_index: Index,
        indexer: npt.NDArray[np.intp],
        how: JoinHow = "left",
    ) -> Index:
        """
        Create a join index by rearranging one index to match another

        Parameters
        ----------
        index : Index being rearranged
        other_index : Index used to supply values not found in index
        indexer : np.ndarray[np.intp] how to rearrange index
        how : str
            Replacement is only necessary if indexer based on other_index.

        Returns
        -------
        Index
        """
        if self.how in (how, "outer") and not isinstance(other_index, MultiIndex):
            # if final index requires values in other_index but not target
            # index, indexer may hold missing (-1) values, causing Index.take
            # to take the final value in target index. So, we set the last
            # element to be the desired fill value. We do not use allow_fill
            # and fill_value because it throws a ValueError on integer indices
            mask = indexer == -1
            if np.any(mask):
                fill_value = na_value_for_dtype(index.dtype, compat=False)
                index = index.append(Index([fill_value]))
        return index.take(indexer)
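
    # Illustrative sketch of the -1 trick above (hypothetical values, shown
    # as comments): for Index([10, 20]) and indexer [0, -1], appending the
    # fill value makes position -1 select it instead of the last real row:
    #
    #   Index([10, 20]).append(Index([np.nan])).take([0, -1])
    #   -> Index([10.0, nan], dtype='float64')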

    def _get_merge_keys(
        self,
    ) -> tuple[list[AnyArrayLike], list[AnyArrayLike], list[Hashable]]:
        """
        Note: has side effects (copy/delete key columns)

        Parameters
        ----------
        left
        right
        on

        Returns
        -------
        left_keys, right_keys, join_names
        """
        # left_keys, right_keys entries can actually be anything listlike
        # with a 'dtype' attr
        left_keys: list[AnyArrayLike] = []
        right_keys: list[AnyArrayLike] = []
        join_names: list[Hashable] = []
        right_drop: list[Hashable] = []
        left_drop: list[Hashable] = []

        left, right = self.left, self.right

        is_lkey = lambda x: is_array_like(x) and len(x) == len(left)
        is_rkey = lambda x: is_array_like(x) and len(x) == len(right)

        # Note that pd.merge_asof() has separate 'on' and 'by' parameters. A
        # user could, for example, request 'left_index' and 'left_by'. In a
        # regular pd.merge(), users cannot specify both 'left_index' and
        # 'left_on'. (Instead, users have a MultiIndex). That means the
        # self.left_on in this function is always empty in a pd.merge(), but
        # a pd.merge_asof(left_index=True, left_by=...) will result in a
        # self.left_on array with a None in the middle of it. This requires
        # a work-around as designated in the code below.
        # See _validate_left_right_on() for where this happens.

        # ugh, spaghetti re #733
        if _any(self.left_on) and _any(self.right_on):
            for lk, rk in zip(self.left_on, self.right_on):
                if is_lkey(lk):
                    lk = cast(AnyArrayLike, lk)
                    left_keys.append(lk)
                    if is_rkey(rk):
                        rk = cast(AnyArrayLike, rk)
                        right_keys.append(rk)
                        join_names.append(None)  # what to do?
                    else:
                        # Then we're either Hashable or a wrong-length arraylike,
                        # the latter of which will raise
                        rk = cast(Hashable, rk)
                        if rk is not None:
                            right_keys.append(right._get_label_or_level_values(rk))
                            join_names.append(rk)
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index)
                            join_names.append(right.index.name)
                else:
                    if not is_rkey(rk):
                        # Then we're either Hashable or a wrong-length arraylike,
                        # the latter of which will raise
                        rk = cast(Hashable, rk)
                        if rk is not None:
                            right_keys.append(right._get_label_or_level_values(rk))
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index)
                        if lk is not None and lk == rk:  # FIXME: what about other NAs?
                            # avoid key upcast in corner case (length-0)
                            lk = cast(Hashable, lk)
                            if len(left) > 0:
                                right_drop.append(rk)
                            else:
                                left_drop.append(lk)
                    else:
                        rk = cast(AnyArrayLike, rk)
                        right_keys.append(rk)
                    if lk is not None:
                        # Then we're either Hashable or a wrong-length arraylike,
                        # the latter of which will raise
                        lk = cast(Hashable, lk)
                        left_keys.append(left._get_label_or_level_values(lk))
                        join_names.append(lk)
                    else:
                        # work-around for merge_asof(left_index=True)
                        left_keys.append(left.index)
                        join_names.append(left.index.name)
        elif _any(self.left_on):
            for k in self.left_on:
                if is_lkey(k):
                    k = cast(AnyArrayLike, k)
                    left_keys.append(k)
                    join_names.append(None)
                else:
                    # Then we're either Hashable or a wrong-length arraylike,
                    # the latter of which will raise
                    k = cast(Hashable, k)
                    left_keys.append(left._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.right.index, MultiIndex):
                right_keys = [
                    lev._values.take(lev_codes)
                    for lev, lev_codes in zip(
                        self.right.index.levels, self.right.index.codes
                    )
                ]
            else:
                right_keys = [self.right.index._values]
        elif _any(self.right_on):
            for k in self.right_on:
                if is_rkey(k):
                    k = cast(AnyArrayLike, k)
                    right_keys.append(k)
                    join_names.append(None)
                else:
                    # Then we're either Hashable or a wrong-length arraylike,
                    # the latter of which will raise
                    k = cast(Hashable, k)
                    right_keys.append(right._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.left.index, MultiIndex):
                left_keys = [
                    lev._values.take(lev_codes)
                    for lev, lev_codes in zip(
                        self.left.index.levels, self.left.index.codes
                    )
                ]
            else:
                left_keys = [self.left.index._values]

        if left_drop:
            self.left = self.left._drop_labels_or_levels(left_drop)

        if right_drop:
            self.right = self.right._drop_labels_or_levels(right_drop)

        return left_keys, right_keys, join_names

    def _maybe_coerce_merge_keys(self) -> None:
        # we have valid merges but we may have to further
        # coerce these if they are originally incompatible types
        #
        # for example if these are categorical, but are not dtype_equal
        # or if we have object and integer dtypes

        for lk, rk, name in zip(
            self.left_join_keys, self.right_join_keys, self.join_names
        ):
            if (len(lk) and not len(rk)) or (not len(lk) and len(rk)):
                continue

            lk = extract_array(lk, extract_numpy=True)
            rk = extract_array(rk, extract_numpy=True)

            lk_is_cat = is_categorical_dtype(lk.dtype)
            rk_is_cat = is_categorical_dtype(rk.dtype)
            lk_is_object = is_object_dtype(lk.dtype)
            rk_is_object = is_object_dtype(rk.dtype)

            # if either left or right is a categorical
            # then they must match exactly in categories & ordered
            if lk_is_cat and rk_is_cat:
                lk = cast(Categorical, lk)
                rk = cast(Categorical, rk)
                if lk._categories_match_up_to_permutation(rk):
                    continue

            elif lk_is_cat or rk_is_cat:
                pass

            elif is_dtype_equal(lk.dtype, rk.dtype):
                continue

            msg = (
                f"You are trying to merge on {lk.dtype} and "
                f"{rk.dtype} columns. If you wish to proceed you should use pd.concat"
            )

            # if we are numeric, then allow differing
            # kinds to proceed, eg. int64 and int8, int and float
            # further if we are object, but we infer to
            # the same, then proceed
            if is_numeric_dtype(lk.dtype) and is_numeric_dtype(rk.dtype):
                if lk.dtype.kind == rk.dtype.kind:
                    continue

                # check whether ints and floats
                if is_integer_dtype(rk.dtype) and is_float_dtype(lk.dtype):
                    # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int
                    with np.errstate(invalid="ignore"):
                        # error: Argument 1 to "astype" of "ndarray" has incompatible
                        # type "Union[ExtensionDtype, Any, dtype[Any]]"; expected
                        # "Union[dtype[Any], Type[Any], _SupportsDType[dtype[Any]]]"
                        casted = lk.astype(rk.dtype)  # type: ignore[arg-type]

                        mask = ~np.isnan(lk)
                        match = lk == casted
                        if not match[mask].all():
                            warnings.warn(
                                "You are merging on int and float "
                                "columns where the float values "
                                "are not equal to their int representation.",
                                UserWarning,
                                stacklevel=find_stack_level(),
                            )
                    continue

                if is_float_dtype(rk.dtype) and is_integer_dtype(lk.dtype):
                    # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int
                    with np.errstate(invalid="ignore"):
                        # error: Argument 1 to "astype" of "ndarray" has incompatible
                        # type "Union[ExtensionDtype, Any, dtype[Any]]"; expected
                        # "Union[dtype[Any], Type[Any], _SupportsDType[dtype[Any]]]"
                        casted = rk.astype(lk.dtype)  # type: ignore[arg-type]

                        mask = ~np.isnan(rk)
                        match = rk == casted
                        if not match[mask].all():
                            warnings.warn(
                                "You are merging on int and float "
                                "columns where the float values "
                                "are not equal to their int representation.",
                                UserWarning,
                                stacklevel=find_stack_level(),
                            )
                    continue

                # let's infer and see if we are ok
                if lib.infer_dtype(lk, skipna=False) == lib.infer_dtype(
                    rk, skipna=False
                ):
                    continue

            # Check if we are trying to merge on obviously
            # incompatible dtypes GH 9780, GH 15800

            # bool values are coerced to object
            elif (lk_is_object and is_bool_dtype(rk.dtype)) or (
                is_bool_dtype(lk.dtype) and rk_is_object
            ):
                pass

            # object values are allowed to be merged
            elif (lk_is_object and is_numeric_dtype(rk.dtype)) or (
                is_numeric_dtype(lk.dtype) and rk_is_object
            ):
                inferred_left = lib.infer_dtype(lk, skipna=False)
                inferred_right = lib.infer_dtype(rk, skipna=False)
                bool_types = ["integer", "mixed-integer", "boolean", "empty"]
                string_types = ["string", "unicode", "mixed", "bytes", "empty"]

                # inferred bool
                if inferred_left in bool_types and inferred_right in bool_types:
                    pass

                # unless we are merging non-string-like with string-like
                elif (
                    inferred_left in string_types and inferred_right not in string_types
                ) or (
                    inferred_right in string_types and inferred_left not in string_types
                ):
                    raise ValueError(msg)

            # datetimelikes must match exactly
            elif needs_i8_conversion(lk.dtype) and not needs_i8_conversion(rk.dtype):
                raise ValueError(msg)
            elif not needs_i8_conversion(lk.dtype) and needs_i8_conversion(rk.dtype):
                raise ValueError(msg)
            elif isinstance(lk.dtype, DatetimeTZDtype) and not isinstance(
                rk.dtype, DatetimeTZDtype
            ):
                raise ValueError(msg)
            elif not isinstance(lk.dtype, DatetimeTZDtype) and isinstance(
                rk.dtype, DatetimeTZDtype
            ):
                raise ValueError(msg)
            elif (
                isinstance(lk.dtype, DatetimeTZDtype)
                and isinstance(rk.dtype, DatetimeTZDtype)
            ) or (lk.dtype.kind == "M" and rk.dtype.kind == "M"):
                # allows datetime with different resolutions
                continue

            elif lk_is_object and rk_is_object:
                continue

            # Houston, we have a problem!
            # let's coerce to object if the dtypes aren't
            # categorical, otherwise coerce to the category
            # dtype. If we coerced categories to object,
            # then we would lose type information on some
            # columns, and end up trying to merge
            # incompatible dtypes. See GH 16900.
            if name in self.left.columns:
                typ = cast(Categorical, lk).categories.dtype if lk_is_cat else object
                self.left = self.left.copy()
                self.left[name] = self.left[name].astype(typ)
            if name in self.right.columns:
                typ = cast(Categorical, rk).categories.dtype if rk_is_cat else object
                self.right = self.right.copy()
                self.right[name] = self.right[name].astype(typ)

    def _create_cross_configuration(
        self, left: DataFrame, right: DataFrame
    ) -> tuple[DataFrame, DataFrame, JoinHow, str]:
        """
        Creates the configuration to dispatch the cross operation to inner join,
        e.g. adding a join column and resetting parameters. Join column is added
        to a new object, no inplace modification

        Parameters
        ----------
        left : DataFrame
        right : DataFrame

        Returns
        -------
        a tuple (left, right, how, cross_col) representing the adjusted
        DataFrames with cross_col, the merge operation set to inner and the column
        to join over.
        """
        cross_col = f"_cross_{uuid.uuid4()}"
        how: JoinHow = "inner"
        return (
            left.assign(**{cross_col: 1}),
            right.assign(**{cross_col: 1}),
            how,
            cross_col,
        )
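
    # Conceptually (hypothetical frames, comments only) the reduction above
    # dispatches a cross join as an inner join on a constant key:
    #
    #   >>> left = pd.DataFrame({"x": [1, 2]})
    #   >>> right = pd.DataFrame({"y": ["a", "b"]})
    #   >>> left.assign(_k=1).merge(right.assign(_k=1), on="_k").drop(columns="_k")
    #      x  y
    #   0  1  a
    #   1  1  b
    #   2  2  a
    #   3  2  b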

    def _validate_left_right_on(self, left_on, right_on):
        left_on = com.maybe_make_list(left_on)
        right_on = com.maybe_make_list(right_on)

        if self.how == "cross":
            if (
                self.left_index
                or self.right_index
                or right_on is not None
                or left_on is not None
                or self.on is not None
            ):
                raise MergeError(
                    "Can not pass on, right_on, left_on or set right_index=True or "
                    "left_index=True"
                )
        # Hm, any way to make this logic less complicated??
        elif self.on is None and left_on is None and right_on is None:
            if self.left_index and self.right_index:
                left_on, right_on = (), ()
            elif self.left_index:
                raise MergeError("Must pass right_on or right_index=True")
            elif self.right_index:
                raise MergeError("Must pass left_on or left_index=True")
            else:
                # use the common columns
                left_cols = self.left.columns
                right_cols = self.right.columns
                common_cols = left_cols.intersection(right_cols)
                if len(common_cols) == 0:
                    raise MergeError(
                        "No common columns to perform merge on. "
                        f"Merge options: left_on={left_on}, "
                        f"right_on={right_on}, "
                        f"left_index={self.left_index}, "
                        f"right_index={self.right_index}"
                    )
                if (
                    not left_cols.join(common_cols, how="inner").is_unique
                    or not right_cols.join(common_cols, how="inner").is_unique
                ):
                    raise MergeError(f"Data columns not unique: {repr(common_cols)}")
                left_on = right_on = common_cols
        elif self.on is not None:
            if left_on is not None or right_on is not None:
                raise MergeError(
                    'Can only pass argument "on" OR "left_on" '
                    'and "right_on", not a combination of both.'
                )
            if self.left_index or self.right_index:
                raise MergeError(
                    'Can only pass argument "on" OR "left_index" '
                    'and "right_index", not a combination of both.'
                )
            left_on = right_on = self.on
        elif left_on is not None:
            if self.left_index:
                raise MergeError(
                    'Can only pass argument "left_on" OR "left_index" not both.'
                )
            if not self.right_index and right_on is None:
                raise MergeError('Must pass "right_on" OR "right_index".')
            n = len(left_on)
            if self.right_index:
                if len(left_on) != self.right.index.nlevels:
                    raise ValueError(
                        "len(left_on) must equal the number "
                        'of levels in the index of "right"'
                    )
                right_on = [None] * n
        elif right_on is not None:
            if self.right_index:
                raise MergeError(
                    'Can only pass argument "right_on" OR "right_index" not both.'
                )
            if not self.left_index and left_on is None:
                raise MergeError('Must pass "left_on" OR "left_index".')
            n = len(right_on)
            if self.left_index:
                if len(right_on) != self.left.index.nlevels:
                    raise ValueError(
                        "len(right_on) must equal the number "
                        'of levels in the index of "left"'
                    )
                left_on = [None] * n
        if self.how != "cross" and len(right_on) != len(left_on):
            raise ValueError("len(right_on) must equal len(left_on)")

        return left_on, right_on

    def _validate(self, validate: str) -> None:
        # Check uniqueness of each
        if self.left_index:
            left_unique = self.orig_left.index.is_unique
        else:
            left_unique = MultiIndex.from_arrays(self.left_join_keys).is_unique

        if self.right_index:
            right_unique = self.orig_right.index.is_unique
        else:
            right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique

        # Check data integrity
        if validate in ["one_to_one", "1:1"]:
            if not left_unique and not right_unique:
                raise MergeError(
                    "Merge keys are not unique in either left "
                    "or right dataset; not a one-to-one merge"
                )
            if not left_unique:
                raise MergeError(
                    "Merge keys are not unique in left dataset; not a one-to-one merge"
                )
            if not right_unique:
                raise MergeError(
                    "Merge keys are not unique in right dataset; not a one-to-one merge"
                )

        elif validate in ["one_to_many", "1:m"]:
            if not left_unique:
                raise MergeError(
                    "Merge keys are not unique in left dataset; not a one-to-many merge"
                )

        elif validate in ["many_to_one", "m:1"]:
            if not right_unique:
                raise MergeError(
                    "Merge keys are not unique in right dataset; "
                    "not a many-to-one merge"
                )

        elif validate in ["many_to_many", "m:m"]:
            pass

        else:
            raise ValueError(
                f'"{validate}" is not a valid argument. '
                "Valid arguments are:\n"
                '- "1:1"\n'
                '- "1:m"\n'
                '- "m:1"\n'
                '- "m:m"\n'
                '- "one_to_one"\n'
                '- "one_to_many"\n'
                '- "many_to_one"\n'
                '- "many_to_many"'
            )
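

# Illustrative behavior of the validate hook above (hypothetical frames,
# comments only): a "1:1" check fails as soon as either side repeats a key.
#
#   >>> pd.merge(
#   ...     pd.DataFrame({"k": [1, 1]}), pd.DataFrame({"k": [1]}),
#   ...     on="k", validate="one_to_one",
#   ... )
#   MergeError: Merge keys are not unique in left dataset; not a one-to-one merge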


def get_join_indexers(
    left_keys,
    right_keys,
    sort: bool = False,
    how: MergeHow | Literal["asof"] = "inner",
    **kwargs,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
    """

    Parameters
    ----------
    left_keys : ndarray, Index, Series
    right_keys : ndarray, Index, Series
    sort : bool, default False
    how : {'inner', 'outer', 'left', 'right'}, default 'inner'

    Returns
    -------
    np.ndarray[np.intp]
        Indexer into the left_keys.
    np.ndarray[np.intp]
        Indexer into the right_keys.
    """
    assert len(left_keys) == len(
        right_keys
    ), "left_keys and right_keys must be the same length"

    # fast-path for empty left/right
    left_n = len(left_keys[0])
    right_n = len(right_keys[0])
    if left_n == 0:
        if how in ["left", "inner", "cross"]:
            return _get_empty_indexer()
        elif not sort and how in ["right", "outer"]:
            return _get_no_sort_one_missing_indexer(right_n, True)
    elif right_n == 0:
        if how in ["right", "inner", "cross"]:
            return _get_empty_indexer()
        elif not sort and how in ["left", "outer"]:
            return _get_no_sort_one_missing_indexer(left_n, False)

    # get left & right join labels and num. of levels at each location
    mapped = (
        _factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how)
        for n in range(len(left_keys))
    )
    zipped = zip(*mapped)
    llab, rlab, shape = (list(x) for x in zipped)

    # get flat i8 keys from label lists
    lkey, rkey = _get_join_keys(llab, rlab, tuple(shape), sort)

    # factorize keys to a dense i8 space
    # `count` is the num. of unique keys
    # set(lkey) | set(rkey) == range(count)

    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how)
    # preserve left frame order if how == 'left' and sort == False
    kwargs = cp.copy(kwargs)
    if how in ("left", "right"):
        kwargs["sort"] = sort
    join_func = {
        "inner": libjoin.inner_join,
        "left": libjoin.left_outer_join,
        "right": lambda x, y, count, **kwargs: libjoin.left_outer_join(
            y, x, count, **kwargs
        )[::-1],
        "outer": libjoin.full_outer_join,
    }[how]

    # error: Cannot call function of unknown type
    return join_func(lkey, rkey, count, **kwargs)  # type: ignore[operator]
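

def _toy_inner_indexers(left_key: list, right_key: list):
    # Pure-Python sketch (illustrative, not the hashtable-based algorithm
    # above) of what the returned indexers mean: row i of the joined result
    # is built from left row litems[i] and right row ritems[i], e.g.
    # _toy_inner_indexers(["a", "b", "b"], ["b", "c"])
    # -> (array([1, 2]), array([0, 0]))
    litems, ritems = [], []
    for i, lv in enumerate(left_key):
        for j, rv in enumerate(right_key):
            if lv == rv:
                litems.append(i)
                ritems.append(j)
    return np.array(litems, dtype=np.intp), np.array(ritems, dtype=np.intp)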

1691 

1692 

1693def restore_dropped_levels_multijoin( 

1694 left: MultiIndex, 

1695 right: MultiIndex, 

1696 dropped_level_names, 

1697 join_index: Index, 

1698 lindexer: npt.NDArray[np.intp], 

1699 rindexer: npt.NDArray[np.intp], 

1700) -> tuple[list[Index], npt.NDArray[np.intp], list[Hashable]]: 

1701 """ 

1702 *this is an internal non-public method* 

1703 

1704 Returns the levels, labels and names of a multi-index to multi-index join. 

1705 Depending on the type of join, this method restores the appropriate 

1706 dropped levels of the joined multi-index. 

1707 The method relies on lindexer and rindexer, which hold the index positions 

1708 of left and right where a join was feasible. 

1709 

1710 Parameters 

1711 ---------- 

1712 left : MultiIndex 

1713 left index 

1714 right : MultiIndex 

1715 right index 

1716 dropped_level_names : list of str 

1717 list of non-common level names 

1718 join_index : Index 

1719 the index of the join between the 

1720 common levels of left and right 

1721 lindexer : np.ndarray[np.intp] 

1722 left indexer 

1723 rindexer : np.ndarray[np.intp] 

1724 right indexer 

1725 

1726 Returns 

1727 ------- 

1728 levels : list of Index 

1729 levels of combined multiindexes 

1730 labels : np.ndarray[np.intp] 

1731 labels of combined multiindexes 

1732 names : list of Hashable 

1733 names of combined multiindex levels 

1734 

1735 """ 

1736 

1737 def _convert_to_multiindex(index: Index) -> MultiIndex: 

1738 if isinstance(index, MultiIndex): 

1739 return index 

1740 else: 

1741 return MultiIndex.from_arrays([index._values], names=[index.name]) 

1742 

1743 # For multi-multi joins with one overlapping level, 

1744 # the returned index is of type Index 

1745 # Ensure that join_index is of type MultiIndex 

1746 # so that dropped levels can be appended 

1747 join_index = _convert_to_multiindex(join_index) 

1748 

1749 join_levels = join_index.levels 

1750 join_codes = join_index.codes 

1751 join_names = join_index.names 

1752 

1753 # Iterate through the levels that must be restored 

1754 for dropped_level_name in dropped_level_names: 

1755 if dropped_level_name in left.names: 

1756 idx = left 

1757 indexer = lindexer 

1758 else: 

1759 idx = right 

1760 indexer = rindexer 

1761 

1762 # The index of the level name to be restored 

1763 name_idx = idx.names.index(dropped_level_name) 

1764 

1765 restore_levels = idx.levels[name_idx] 

1766 # Inject -1 in the codes list where a join was not possible 

1767 # IOW indexer[i]=-1 

1768 codes = idx.codes[name_idx] 

1769 if indexer is None: 

1770 restore_codes = codes 

1771 else: 

1772 restore_codes = algos.take_nd(codes, indexer, fill_value=-1) 

1773 

1774 # error: Cannot determine type of "__add__" 

1775 join_levels = join_levels + [restore_levels] # type: ignore[has-type] 

1776 join_codes = join_codes + [restore_codes] 

1777 join_names = join_names + [dropped_level_name] 

1778 

1779 return join_levels, join_codes, join_names 

1780 
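# Sketch of the take_nd step above: with codes = np.array([0, 1, 2]) and
# indexer = np.array([2, -1, 0]),
#   algos.take_nd(codes, indexer, fill_value=-1) -> array([ 2, -1,  0])
# i.e. positions where the join found no match (indexer == -1) receive
# the -1 sentinel code.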

1781 

1782class _OrderedMerge(_MergeOperation): 

1783 _merge_type = "ordered_merge" 

1784 

1785 def __init__( 

1786 self, 

1787 left: DataFrame | Series, 

1788 right: DataFrame | Series, 

1789 on: IndexLabel | None = None, 

1790 left_on: IndexLabel | None = None, 

1791 right_on: IndexLabel | None = None, 

1792 left_index: bool = False, 

1793 right_index: bool = False, 

1794 axis: AxisInt = 1, 

1795 suffixes: Suffixes = ("_x", "_y"), 

1796 fill_method: str | None = None, 

1797 how: JoinHow | Literal["asof"] = "outer", 

1798 ) -> None: 

1799 self.fill_method = fill_method 

1800 _MergeOperation.__init__( 

1801 self, 

1802 left, 

1803 right, 

1804 on=on, 

1805 left_on=left_on, 

1806 left_index=left_index, 

1807 right_index=right_index, 

1808 right_on=right_on, 

1809 axis=axis, 

1810 how=how, 

1811 suffixes=suffixes, 

1812 sort=True, # factorize sorts 

1813 ) 

1814 

1815 def get_result(self, copy: bool | None = True) -> DataFrame: 

1816 join_index, left_indexer, right_indexer = self._get_join_info() 

1817 

1818 llabels, rlabels = _items_overlap_with_suffix( 

1819 self.left._info_axis, self.right._info_axis, self.suffixes 

1820 ) 

1821 

1822 left_join_indexer: np.ndarray | None 

1823 right_join_indexer: np.ndarray | None 

1824 

1825 if self.fill_method == "ffill": 

1826 if left_indexer is None: 

1827 raise TypeError("left_indexer cannot be None") 

1828 left_indexer, right_indexer = cast(np.ndarray, left_indexer), cast( 

1829 np.ndarray, right_indexer 

1830 ) 

1831 left_join_indexer = libjoin.ffill_indexer(left_indexer) 

1832 right_join_indexer = libjoin.ffill_indexer(right_indexer) 

1833 else: 

1834 left_join_indexer = left_indexer 

1835 right_join_indexer = right_indexer 

1836 

1837 result = self._reindex_and_concat( 

1838 join_index, left_join_indexer, right_join_indexer, copy=copy 

1839 ) 

1840 self._maybe_add_join_keys(result, left_indexer, right_indexer) 

1841 

1842 return result 

1843 

1844 

1845def _asof_by_function(direction: str): 

1846 name = f"asof_join_{direction}_on_X_by_Y" 

1847 return getattr(libjoin, name, None) 

1848 
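# e.g. _asof_by_function("backward") resolves to
# libjoin.asof_join_backward_on_X_by_Y (or None if the attribute is absent).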

1849 

1850_type_casters = { 

1851 "int64_t": ensure_int64, 

1852 "double": ensure_float64, 

1853 "object": ensure_object, 

1854} 

1855 

1856 

1857def _get_cython_type_upcast(dtype: DtypeObj) -> str: 

1858 """Upcast a dtype to 'int64_t', 'double', or 'object'""" 

1859 if is_integer_dtype(dtype): 

1860 return "int64_t" 

1861 elif is_float_dtype(dtype): 

1862 return "double" 

1863 else: 

1864 return "object" 

1865 
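# e.g. _get_cython_type_upcast(np.dtype("int8")) -> "int64_t" and
# _get_cython_type_upcast(np.dtype("float32")) -> "double"; any other
# dtype (strings, datetimes, ...) falls back to "object".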

1866 

1867class _AsOfMerge(_OrderedMerge): 

1868 _merge_type = "asof_merge" 

1869 

1870 def __init__( 

1871 self, 

1872 left: DataFrame | Series, 

1873 right: DataFrame | Series, 

1874 on: IndexLabel | None = None, 

1875 left_on: IndexLabel | None = None, 

1876 right_on: IndexLabel | None = None, 

1877 left_index: bool = False, 

1878 right_index: bool = False, 

1879 by=None, 

1880 left_by=None, 

1881 right_by=None, 

1882 axis: AxisInt = 1, 

1883 suffixes: Suffixes = ("_x", "_y"), 

1884 copy: bool = True, 

1885 fill_method: str | None = None, 

1886 how: Literal["asof"] = "asof", 

1887 tolerance=None, 

1888 allow_exact_matches: bool = True, 

1889 direction: str = "backward", 

1890 ) -> None: 

1891 self.by = by 

1892 self.left_by = left_by 

1893 self.right_by = right_by 

1894 self.tolerance = tolerance 

1895 self.allow_exact_matches = allow_exact_matches 

1896 self.direction = direction 

1897 

1898 _OrderedMerge.__init__( 

1899 self, 

1900 left, 

1901 right, 

1902 on=on, 

1903 left_on=left_on, 

1904 right_on=right_on, 

1905 left_index=left_index, 

1906 right_index=right_index, 

1907 axis=axis, 

1908 how=how, 

1909 suffixes=suffixes, 

1910 fill_method=fill_method, 

1911 ) 

1912 

1913 def _validate_left_right_on(self, left_on, right_on): 

1914 left_on, right_on = super()._validate_left_right_on(left_on, right_on) 

1915 

1916 # we only allow "on" to be a single key on each side 

1917 if len(left_on) != 1 and not self.left_index: 

1918 raise MergeError("can only asof on a key for left") 

1919 

1920 if len(right_on) != 1 and not self.right_index: 

1921 raise MergeError("can only asof on a key for right") 

1922 

1923 if self.left_index and isinstance(self.left.index, MultiIndex): 

1924 raise MergeError("left can only have one index") 

1925 

1926 if self.right_index and isinstance(self.right.index, MultiIndex): 

1927 raise MergeError("right can only have one index") 

1928 

1929 # set 'by' columns 

1930 if self.by is not None: 

1931 if self.left_by is not None or self.right_by is not None: 

1932 raise MergeError("Can only pass by OR left_by and right_by") 

1933 self.left_by = self.right_by = self.by 

1934 if self.left_by is None and self.right_by is not None: 

1935 raise MergeError("missing left_by") 

1936 if self.left_by is not None and self.right_by is None: 

1937 raise MergeError("missing right_by") 

1938 

1939 # GH#29130 Check that merge keys do not have dtype object 

1940 if not self.left_index: 

1941 left_on_0 = left_on[0] 

1942 if is_array_like(left_on_0): 

1943 lo_dtype = left_on_0.dtype 

1944 else: 

1945 lo_dtype = ( 

1946 self.left._get_label_or_level_values(left_on_0).dtype 

1947 if left_on_0 in self.left.columns 

1948 else self.left.index.get_level_values(left_on_0) 

1949 ) 

1950 else: 

1951 lo_dtype = self.left.index.dtype 

1952 

1953 if not self.right_index: 

1954 right_on_0 = right_on[0] 

1955 if is_array_like(right_on_0): 

1956 ro_dtype = right_on_0.dtype 

1957 else: 

1958 ro_dtype = ( 

1959 self.right._get_label_or_level_values(right_on_0).dtype 

1960 if right_on_0 in self.right.columns 

1961 else self.right.index.get_level_values(right_on_0) 

1962 ) 

1963 else: 

1964 ro_dtype = self.right.index.dtype 

1965 

1966 if is_object_dtype(lo_dtype) or is_object_dtype(ro_dtype): 

1967 raise MergeError( 

1968 f"Incompatible merge dtype, {repr(ro_dtype)} and " 

1969 f"{repr(lo_dtype)}, both sides must have numeric dtype" 

1970 ) 

1971 

1972 # add 'by' to our key-list so we can have it in the 

1973 # output as a key 

1974 if self.left_by is not None: 

1975 if not is_list_like(self.left_by): 

1976 self.left_by = [self.left_by] 

1977 if not is_list_like(self.right_by): 

1978 self.right_by = [self.right_by] 

1979 

1980 if len(self.left_by) != len(self.right_by): 

1981 raise MergeError("left_by and right_by must be same length") 

1982 

1983 left_on = self.left_by + list(left_on) 

1984 right_on = self.right_by + list(right_on) 

1985 

1986 # check 'direction' is valid 

1987 if self.direction not in ["backward", "forward", "nearest"]: 

1988 raise MergeError(f"direction invalid: {self.direction}") 

1989 

1990 return left_on, right_on 

1991 

1992 def _get_merge_keys( 

1993 self, 

1994 ) -> tuple[list[AnyArrayLike], list[AnyArrayLike], list[Hashable]]: 

1995 # note this function has side effects 

1996 (left_join_keys, right_join_keys, join_names) = super()._get_merge_keys() 

1997 

1998 # validate index types are the same 

1999 for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)): 

2000 if not is_dtype_equal(lk.dtype, rk.dtype): 

2001 if is_categorical_dtype(lk.dtype) and is_categorical_dtype(rk.dtype): 

2002 # The generic error message is confusing for categoricals. 

2003 # 

2004 # In this function, the join keys include both the original 

2005 # ones of the merge_asof() call, and also the keys passed 

2006 # to its by= argument. Unordered but equal categories 

2007 # are not supported for the former, but will fail 

2008 # later with a ValueError, so we don't *need* to check 

2009 # for them here. 

2010 msg = ( 

2011 f"incompatible merge keys [{i}] {repr(lk.dtype)} and " 

2012 f"{repr(rk.dtype)}, both sides category, but not equal ones" 

2013 ) 

2014 else: 

2015 msg = ( 

2016 f"incompatible merge keys [{i}] {repr(lk.dtype)} and " 

2017 f"{repr(rk.dtype)}, must be the same type" 

2018 ) 

2019 raise MergeError(msg) 

2020 

2021 # validate tolerance; datetime.timedelta or Timedelta if we have a DTI 

2022 if self.tolerance is not None: 

2023 if self.left_index: 

2024 # Actually more specifically an Index 

2025 lt = cast(AnyArrayLike, self.left.index) 

2026 else: 

2027 lt = left_join_keys[-1] 

2028 

2029 msg = ( 

2030 f"incompatible tolerance {self.tolerance}, must be compat " 

2031 f"with type {repr(lt.dtype)}" 

2032 ) 

2033 

2034 if needs_i8_conversion(lt): 

2035 if not isinstance(self.tolerance, datetime.timedelta): 

2036 raise MergeError(msg) 

2037 if self.tolerance < Timedelta(0): 

2038 raise MergeError("tolerance must be positive") 

2039 

2040 elif is_integer_dtype(lt): 

2041 if not is_integer(self.tolerance): 

2042 raise MergeError(msg) 

2043 if self.tolerance < 0: 

2044 raise MergeError("tolerance must be positive") 

2045 

2046 elif is_float_dtype(lt): 

2047 if not is_number(self.tolerance): 

2048 raise MergeError(msg) 

2049 if self.tolerance < 0: 

2050 raise MergeError("tolerance must be positive") 

2051 

2052 else: 

2053 raise MergeError("key must be integer, timestamp or float") 

2054 

2055 # validate allow_exact_matches 

2056 if not is_bool(self.allow_exact_matches): 

2057 msg = ( 

2058 "allow_exact_matches must be boolean, " 

2059 f"passed {self.allow_exact_matches}" 

2060 ) 

2061 raise MergeError(msg) 

2062 

2063 return left_join_keys, right_join_keys, join_names 

2064 
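# A minimal sketch of how the tolerance checks above surface through the
# public API (pd.merge_asof):
#   pd.merge_asof(left, right, on="time", tolerance=pd.Timedelta("2ms"))
# a negative tolerance, or one whose type does not match the key dtype
# (e.g. an integer tolerance with a datetime64 key), raises MergeError.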

2065 def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: 

2066 """return the join indexers""" 

2067 

2068 def flip(xs) -> np.ndarray: 

2069 """unlike np.transpose, this returns an array of tuples""" 

2070 

2071 def injection(obj): 

2072 if not is_extension_array_dtype(obj): 

2073 # ndarray 

2074 return obj 

2075 obj = extract_array(obj) 

2076 if isinstance(obj, NDArrayBackedExtensionArray): 

2077 # fastpath for e.g. dt64tz, categorical 

2078 return obj._ndarray 

2079 # FIXME: returning obj._values_for_argsort() here doesn't 

2080 # break in any existing test cases, but i (@jbrockmendel) 

2081 # am pretty sure it should! 

2082 # e.g. 

2083 # arr = pd.array([0, pd.NA, 255], dtype="UInt8") 

2084 # will have values_for_argsort (before GH#45434) 

2085 # np.array([0, 255, 255], dtype=np.uint8) 

2086 # and the non-injectivity should make a difference somehow 

2087 # shouldn't it? 

2088 return np.asarray(obj) 

2089 

2090 xs = [injection(x) for x in xs] 

2091 labels = list(string.ascii_lowercase[: len(xs)]) 

2092 dtypes = [x.dtype for x in xs] 

2093 labeled_dtypes = list(zip(labels, dtypes)) 

2094 return np.array(list(zip(*xs)), labeled_dtypes) 

2095 
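# e.g. flip([np.array([1, 2]), np.array(["x", "y"])]) gives a structured
# array with fields "a" and "b" holding the rows (1, "x") and (2, "y"),
# so multi-column "by" keys can be treated as single hashable values.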

2096 # values to compare 

2097 left_values = ( 

2098 self.left.index._values if self.left_index else self.left_join_keys[-1] 

2099 ) 

2100 right_values = ( 

2101 self.right.index._values if self.right_index else self.right_join_keys[-1] 

2102 ) 

2103 tolerance = self.tolerance 

2104 

2105 # we require sortedness and non-null values in the join keys 

2106 if not Index(left_values).is_monotonic_increasing: 

2107 side = "left" 

2108 if isna(left_values).any(): 

2109 raise ValueError(f"Merge keys contain null values on {side} side") 

2110 raise ValueError(f"{side} keys must be sorted") 

2111 

2112 if not Index(right_values).is_monotonic_increasing: 

2113 side = "right" 

2114 if isna(right_values).any(): 

2115 raise ValueError(f"Merge keys contain null values on {side} side") 

2116 raise ValueError(f"{side} keys must be sorted") 

2117 

2118 # initial type conversion as needed 

2119 if needs_i8_conversion(left_values): 

2120 if tolerance is not None: 

2121 tolerance = Timedelta(tolerance) 

2122 

2123 # TODO: we have no test cases with PeriodDtype here; probably 

2124 # need to adjust tolerance for that case. 

2125 if left_values.dtype.kind in ["m", "M"]: 

2126 # Make sure the i8 representation for tolerance 

2127 # matches that for left_values/right_values. 

2128 lvs = ensure_wrapped_if_datetimelike(left_values) 

2129 tolerance = tolerance.as_unit(lvs.unit) 

2130 

2131 tolerance = tolerance._value 

2132 

2133 # TODO: require left_values.dtype == right_values.dtype, or at least 

2134 # comparable for e.g. dt64tz 

2135 left_values = left_values.view("i8") 

2136 right_values = right_values.view("i8") 

2137 

2138 # a "by" parameter requires special handling 

2139 if self.left_by is not None: 

2140 # remove 'on' parameter from values if one existed 

2141 if self.left_index and self.right_index: 

2142 left_by_values = self.left_join_keys 

2143 right_by_values = self.right_join_keys 

2144 else: 

2145 left_by_values = self.left_join_keys[0:-1] 

2146 right_by_values = self.right_join_keys[0:-1] 

2147 

2148 # get tuple representation of values if more than one 

2149 if len(left_by_values) == 1: 

2150 lbv = left_by_values[0] 

2151 rbv = right_by_values[0] 

2152 else: 

2153 # We get here with non-ndarrays in test_merge_by_col_tz_aware 

2154 # and test_merge_groupby_multiple_column_with_categorical_column 

2155 lbv = flip(left_by_values) 

2156 rbv = flip(right_by_values) 

2157 

2158 # upcast 'by' parameter because HashTable is limited 

2159 by_type = _get_cython_type_upcast(lbv.dtype) 

2160 by_type_caster = _type_casters[by_type] 

2161 # error: Incompatible types in assignment (expression has type 

2162 # "ndarray[Any, dtype[generic]]", variable has type 

2163 # "List[Union[Union[ExtensionArray, ndarray[Any, Any]], Index, Series]]") 

2164 left_by_values = by_type_caster(lbv) # type: ignore[assignment] 

2165 # error: Incompatible types in assignment (expression has type 

2166 # "ndarray[Any, dtype[generic]]", variable has type 

2167 # "List[Union[Union[ExtensionArray, ndarray[Any, Any]], Index, Series]]") 

2168 right_by_values = by_type_caster(rbv) # type: ignore[assignment] 

2169 

2170 # choose appropriate function by type 

2171 func = _asof_by_function(self.direction) 

2172 return func( 

2173 left_values, 

2174 right_values, 

2175 left_by_values, 

2176 right_by_values, 

2177 self.allow_exact_matches, 

2178 tolerance, 

2179 ) 

2180 else: 

2181 # choose appropriate function by type 

2182 func = _asof_by_function(self.direction) 

2183 # TODO(cython3): 

2184 # Bug in beta1 preventing Cython from choosing 

2185 # right specialization when one fused memview is None 

2186 # Doesn't matter what type we choose 

2187 # (nothing happens anyways since it is None) 

2188 # GH 51640 

2189 return func[f"{left_values.dtype}_t", object]( 

2190 left_values, 

2191 right_values, 

2192 None, 

2193 None, 

2194 self.allow_exact_matches, 

2195 tolerance, 

2196 False, 

2197 ) 

2198 

2199 

2200def _get_multiindex_indexer( 

2201 join_keys, index: MultiIndex, sort: bool 

2202) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: 

2203 # left & right join labels and num. of levels at each location 

2204 mapped = ( 

2205 _factorize_keys(index.levels[n], join_keys[n], sort=sort) 

2206 for n in range(index.nlevels) 

2207 ) 

2208 zipped = zip(*mapped) 

2209 rcodes, lcodes, shape = (list(x) for x in zipped) 

2210 if sort: 

2211 rcodes = list(map(np.take, rcodes, index.codes)) 

2212 else: 

2213 i8copy = lambda a: a.astype("i8", subok=False, copy=True) 

2214 rcodes = list(map(i8copy, index.codes)) 

2215 

2216 # fix right labels if there were any nulls 

2217 for i, join_key in enumerate(join_keys): 

2218 mask = index.codes[i] == -1 

2219 if mask.any(): 

2220 # check if there were already any nulls at this location; 

2221 # if there were, they are factorized to `shape[i] - 1` 

2222 a = join_key[lcodes[i] == shape[i] - 1] 
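# note: `not a[0] != a[0]` is True when a[0] is not NaN, i.e. the last
# code is not already a null group, so a new slot is added for nulls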

2223 if a.size == 0 or not a[0] != a[0]: 

2224 shape[i] += 1 

2225 

2226 rcodes[i][mask] = shape[i] - 1 

2227 

2228 # get flat i8 join keys 

2229 lkey, rkey = _get_join_keys(lcodes, rcodes, tuple(shape), sort) 

2230 

2231 # factorize keys to a dense i8 space 

2232 lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) 

2233 

2234 return libjoin.left_outer_join(lkey, rkey, count, sort=sort) 

2235 

2236 

2237def _get_single_indexer( 

2238 join_key, index: Index, sort: bool = False 

2239) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: 

2240 left_key, right_key, count = _factorize_keys(join_key, index._values, sort=sort) 

2241 

2242 return libjoin.left_outer_join(left_key, right_key, count, sort=sort) 

2243 

2244 

2245def _get_empty_indexer() -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: 

2246 """Return empty join indexers.""" 

2247 return ( 

2248 np.array([], dtype=np.intp), 

2249 np.array([], dtype=np.intp), 

2250 ) 

2251 

2252 

2253def _get_no_sort_one_missing_indexer( 

2254 n: int, left_missing: bool 

2255) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: 

2256 """ 

2257 Return join indexers where all of one side is selected without sorting 

2258 and none of the other side is selected. 

2259 

2260 Parameters 

2261 ---------- 

2262 n : int 

2263 Length of indexers to create. 

2264 left_missing : bool 

2265 If True, the left indexer will contain only -1's. 

2266 If False, the right indexer will contain only -1's. 

2267 

2268 Returns 

2269 ------- 

2270 np.ndarray[np.intp] 

2271 Left indexer 

2272 np.ndarray[np.intp] 

2273 Right indexer 

2274 """ 

2275 idx = np.arange(n, dtype=np.intp) 

2276 idx_missing = np.full(shape=n, fill_value=-1, dtype=np.intp) 

2277 if left_missing: 

2278 return idx_missing, idx 

2279 return idx, idx_missing 

2280 
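# e.g. _get_no_sort_one_missing_indexer(3, left_missing=True) returns
#   (array([-1, -1, -1]), array([0, 1, 2]))
# i.e. every right row is kept in order and no left row matches.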

2281 

2282def _left_join_on_index( 

2283 left_ax: Index, right_ax: Index, join_keys, sort: bool = False 

2284) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp]]: 

2285 if len(join_keys) > 1: 

2286 if not ( 

2287 isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels 

2288 ): 

2289 raise AssertionError( 

2290 "If more than one join key is given then " 

2291 "'right_ax' must be a MultiIndex and the " 

2292 "number of join keys must be the number of levels in right_ax" 

2293 ) 

2294 

2295 left_indexer, right_indexer = _get_multiindex_indexer( 

2296 join_keys, right_ax, sort=sort 

2297 ) 

2298 else: 

2299 jkey = join_keys[0] 

2300 

2301 left_indexer, right_indexer = _get_single_indexer(jkey, right_ax, sort=sort) 

2302 

2303 if sort or len(left_ax) != len(left_indexer): 

2304 # if asked to sort or there are 1-to-many matches 

2305 join_index = left_ax.take(left_indexer) 

2306 return join_index, left_indexer, right_indexer 

2307 

2308 # left frame preserves order & length of its index 

2309 return left_ax, None, right_indexer 

2310 

2311 

2312def _factorize_keys( 

2313 lk: ArrayLike, 

2314 rk: ArrayLike, 

2315 sort: bool = True, 

2316 how: MergeHow | Literal["asof"] = "inner", 

2317) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: 

2318 """ 

2319 Encode left and right keys as enumerated types. 

2320 

2321 This is used to get the join indexers to be used when merging DataFrames. 

2322 

2323 Parameters 

2324 ---------- 

2325 lk : array-like 

2326 Left key. 

2327 rk : array-like 

2328 Right key. 

2329 sort : bool, default True 

2330 If True, the encoding is done such that the unique elements in the 

2331 keys are sorted. 

2332 how : {'left', 'right', 'outer', 'inner'}, default 'inner' 

2333 Type of merge. 

2334 

2335 Returns 

2336 ------- 

2337 np.ndarray[np.intp] 

2338 Left (resp. right if called with `how='right'`) labels, as enumerated type. 

2339 np.ndarray[np.intp] 

2340 Right (resp. left if called with `how='right'`) labels, as enumerated type. 

2341 int 

2342 Number of unique elements in union of left and right labels. 

2343 

2344 See Also 

2345 -------- 

2346 merge : Merge DataFrame or named Series objects 

2347 with a database-style join. 

2348 algorithms.factorize : Encode the object as an enumerated type 

2349 or categorical variable. 

2350 

2351 Examples 

2352 -------- 

2353 >>> lk = np.array(["a", "c", "b"]) 

2354 >>> rk = np.array(["a", "c"]) 

2355 

2356 Here, the unique values are `'a', 'b', 'c'`. With the default 

2357 `sort=True`, the encoding will be `{0: 'a', 1: 'b', 2: 'c'}`: 

2358 

2359 >>> pd.core.reshape.merge._factorize_keys(lk, rk) 

2360 (array([0, 2, 1]), array([0, 2]), 3) 

2361 

2362 With `sort=False`, the encoding will correspond to the order 

2363 in which the unique elements first appear: `{0: 'a', 1: 'c', 2: 'b'}`: 

2364 

2365 >>> pd.core.reshape.merge._factorize_keys(lk, rk, sort=False) 

2366 (array([0, 1, 2]), array([0, 1]), 3) 

2367 """ 

2368 # Some pre-processing for non-ndarray lk / rk 

2369 lk = extract_array(lk, extract_numpy=True, extract_range=True) 

2370 rk = extract_array(rk, extract_numpy=True, extract_range=True) 

2371 # TODO: if either is a RangeIndex, we can likely factorize more efficiently? 

2372 

2373 if ( 

2374 isinstance(lk.dtype, DatetimeTZDtype) and isinstance(rk.dtype, DatetimeTZDtype) 

2375 ) or ( 

2376 isinstance(lk.dtype, np.dtype) 

2377 and lk.dtype.kind == "M" 

2378 and isinstance(rk.dtype, np.dtype) 

2379 and rk.dtype.kind == "M" 

2380 ): 

2381 # Extract the ndarray (UTC-localized) values 

2382 # Note: we don't need the dtypes to match, as these can still be compared 

2383 lk, rk = cast("DatetimeArray", lk)._ensure_matching_resos(rk) 

2384 lk = cast("DatetimeArray", lk)._ndarray 

2385 rk = cast("DatetimeArray", rk)._ndarray 

2386 

2387 elif ( 

2388 is_categorical_dtype(lk.dtype) 

2389 and is_categorical_dtype(rk.dtype) 

2390 and is_dtype_equal(lk.dtype, rk.dtype) 

2391 ): 

2392 assert isinstance(lk, Categorical) 

2393 assert isinstance(rk, Categorical) 

2394 # Cast rk to encoding so we can compare codes with lk 

2395 

2396 rk = lk._encode_with_my_categories(rk) 

2397 

2398 lk = ensure_int64(lk.codes) 

2399 rk = ensure_int64(rk.codes) 

2400 

2401 elif isinstance(lk, ExtensionArray) and is_dtype_equal(lk.dtype, rk.dtype): 

2402 if not isinstance(lk, BaseMaskedArray) and not ( 

2403 # exclude arrow dtypes that would get cast to object 

2404 isinstance(lk.dtype, ArrowDtype) 

2405 and is_numeric_dtype(lk.dtype.numpy_dtype) 

2406 ): 

2407 lk, _ = lk._values_for_factorize() 

2408 

2409 # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute 

2410 # "_values_for_factorize" 

2411 rk, _ = rk._values_for_factorize() # type: ignore[union-attr] 

2412 

2413 if needs_i8_conversion(lk.dtype) and lk.dtype == rk.dtype: 

2414 # GH#23917 TODO: Needs tests for non-matching dtypes 

2415 # GH#23917 TODO: needs tests for case where lk is integer-dtype 

2416 # and rk is datetime-dtype 

2417 lk = np.asarray(lk, dtype=np.int64) 

2418 rk = np.asarray(rk, dtype=np.int64) 

2419 

2420 klass, lk, rk = _convert_arrays_and_get_rizer_klass(lk, rk) 

2421 

2422 rizer = klass(max(len(lk), len(rk))) 

2423 

2424 if isinstance(lk, BaseMaskedArray): 

2425 assert isinstance(rk, BaseMaskedArray) 

2426 llab = rizer.factorize(lk._data, mask=lk._mask) 

2427 rlab = rizer.factorize(rk._data, mask=rk._mask) 

2428 elif isinstance(lk, ArrowExtensionArray): 

2429 assert isinstance(rk, ArrowExtensionArray) 

2430 # we can only get here with numeric dtypes 

2431 # TODO: Remove when we have a Factorizer for Arrow 

2432 llab = rizer.factorize( 

2433 lk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=lk.isna() 

2434 ) 

2435 rlab = rizer.factorize( 

2436 rk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=rk.isna() 

2437 ) 

2438 else: 

2439 # error: Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type 

2440 # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]], 

2441 # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]" 

2442 llab = rizer.factorize(lk) # type: ignore[arg-type] 

2443 rlab = rizer.factorize(rk) # type: ignore[arg-type] 

2444 assert llab.dtype == np.dtype(np.intp), llab.dtype 

2445 assert rlab.dtype == np.dtype(np.intp), rlab.dtype 

2446 

2447 count = rizer.get_count() 

2448 

2449 if sort: 

2450 uniques = rizer.uniques.to_array() 

2451 llab, rlab = _sort_labels(uniques, llab, rlab) 

2452 

2453 # NA group 

2454 lmask = llab == -1 

2455 lany = lmask.any() 

2456 rmask = rlab == -1 

2457 rany = rmask.any() 

2458 

2459 if lany or rany: 

2460 if lany: 

2461 np.putmask(llab, lmask, count) 

2462 if rany: 

2463 np.putmask(rlab, rmask, count) 

2464 count += 1 

2465 

2466 if how == "right": 

2467 return rlab, llab, count 

2468 return llab, rlab, count 

2469 

2470 

2471def _convert_arrays_and_get_rizer_klass( 

2472 lk: ArrayLike, rk: ArrayLike 

2473) -> tuple[type[libhashtable.Factorizer], ArrayLike, ArrayLike]: 

2474 klass: type[libhashtable.Factorizer] 

2475 if is_numeric_dtype(lk.dtype): 

2476 if not is_dtype_equal(lk, rk): 

2477 dtype = find_common_type([lk.dtype, rk.dtype]) 

2478 if isinstance(dtype, ExtensionDtype): 

2479 cls = dtype.construct_array_type() 

2480 if not isinstance(lk, ExtensionArray): 

2481 lk = cls._from_sequence(lk, dtype=dtype, copy=False) 

2482 else: 

2483 lk = lk.astype(dtype) 

2484 

2485 if not isinstance(rk, ExtensionArray): 

2486 rk = cls._from_sequence(rk, dtype=dtype, copy=False) 

2487 else: 

2488 rk = rk.astype(dtype) 

2489 else: 

2490 lk = lk.astype(dtype) 

2491 rk = rk.astype(dtype) 

2492 if isinstance(lk, BaseMaskedArray): 

2493 # Invalid index type "type" for "Dict[Type[object], Type[Factorizer]]"; 

2494 # expected type "Type[object]" 

2495 klass = _factorizers[lk.dtype.type] # type: ignore[index] 

2496 elif isinstance(lk.dtype, ArrowDtype): 

2497 klass = _factorizers[lk.dtype.numpy_dtype.type] 

2498 else: 

2499 klass = _factorizers[lk.dtype.type] 

2500 

2501 else: 

2502 klass = libhashtable.ObjectFactorizer 

2503 lk = ensure_object(lk) 

2504 rk = ensure_object(rk) 

2505 return klass, lk, rk 

2506 

2507 

2508def _sort_labels( 

2509 uniques: np.ndarray, left: npt.NDArray[np.intp], right: npt.NDArray[np.intp] 

2510) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: 

2511 llength = len(left) 

2512 labels = np.concatenate([left, right]) 

2513 

2514 _, new_labels = algos.safe_sort(uniques, labels, use_na_sentinel=True) 

2515 new_left, new_right = new_labels[:llength], new_labels[llength:] 

2516 

2517 return new_left, new_right 

2518 
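# Sketch: with uniques = np.array(["b", "a"]), left = [0, 1] and
# right = [1], safe_sort orders the uniques as ["a", "b"] and remaps the
# codes, so new_left is roughly array([1, 0]) and new_right array([0]).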

2519 

2520def _get_join_keys( 

2521 llab: list[npt.NDArray[np.int64 | np.intp]], 

2522 rlab: list[npt.NDArray[np.int64 | np.intp]], 

2523 shape: Shape, 

2524 sort: bool, 

2525) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]: 

2526 # how many levels can be done without overflow 

2527 nlev = next( 

2528 lev 

2529 for lev in range(len(shape), 0, -1) 

2530 if not is_int64_overflow_possible(shape[:lev]) 

2531 ) 

2532 

2533 # get keys for the first `nlev` levels 

2534 stride = np.prod(shape[1:nlev], dtype="i8") 

2535 lkey = stride * llab[0].astype("i8", subok=False, copy=False) 

2536 rkey = stride * rlab[0].astype("i8", subok=False, copy=False) 

2537 

2538 for i in range(1, nlev): 

2539 with np.errstate(divide="ignore"): 

2540 stride //= shape[i] 

2541 lkey += llab[i] * stride 

2542 rkey += rlab[i] * stride 

2543 

2544 if nlev == len(shape): # all done! 

2545 return lkey, rkey 

2546 

2547 # densify current keys to avoid overflow 

2548 lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) 

2549 

2550 llab = [lkey] + llab[nlev:] 

2551 rlab = [rkey] + rlab[nlev:] 

2552 shape = (count,) + shape[nlev:] 

2553 

2554 return _get_join_keys(llab, rlab, shape, sort) 

2555 
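# Worked sketch of the stride encoding: for shape == (2, 3) and
#   llab == [np.array([1, 0]), np.array([2, 1])]
# the first-level stride is prod(shape[1:]) == 3, so each flat key is
#   lab0 * 3 + lab1  ->  lkey == array([5, 1])
# one unique int64 per distinct combination of level codes.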

2556 

2557def _should_fill(lname, rname) -> bool: 

2558 if not isinstance(lname, str) or not isinstance(rname, str): 

2559 return True 

2560 return lname == rname 

2561 
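# e.g. _should_fill("a", "a") -> True, _should_fill("a", "b") -> False;
# non-string names (e.g. None or ints) always return True.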

2562 

2563def _any(x) -> bool: 

2564 return x is not None and com.any_not_none(*x) 

2565 

2566 

2567def _validate_operand(obj: DataFrame | Series) -> DataFrame: 

2568 if isinstance(obj, ABCDataFrame): 

2569 return obj 

2570 elif isinstance(obj, ABCSeries): 

2571 if obj.name is None: 

2572 raise ValueError("Cannot merge a Series without a name") 

2573 return obj.to_frame() 

2574 else: 

2575 raise TypeError( 

2576 f"Can only merge Series or DataFrame objects, a {type(obj)} was passed" 

2577 ) 

2578 
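# e.g. _validate_operand(Series([1], name="a")) returns the one-column
# DataFrame Series([1], name="a").to_frame(); an unnamed Series raises
# ValueError, and any other type raises TypeError.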

2579 

2580def _items_overlap_with_suffix( 

2581 left: Index, right: Index, suffixes: Suffixes 

2582) -> tuple[Index, Index]: 

2583 """ 

2584 Suffixes type validation. 

2585 

2586 If two indices overlap, add suffixes to overlapping entries. 

2587 

2588 If the corresponding suffix is empty, the entry is simply converted to string. 

2589 

2590 """ 

2591 if not is_list_like(suffixes, allow_sets=False) or isinstance(suffixes, dict): 

2592 raise TypeError( 

2593 f"Passing 'suffixes' as a {type(suffixes)}, is not supported. " 

2594 "Provide 'suffixes' as a tuple instead." 

2595 ) 

2596 

2597 to_rename = left.intersection(right) 

2598 if len(to_rename) == 0: 

2599 return left, right 

2600 

2601 lsuffix, rsuffix = suffixes 

2602 

2603 if not lsuffix and not rsuffix: 

2604 raise ValueError(f"columns overlap but no suffix specified: {to_rename}") 

2605 

2606 def renamer(x, suffix): 

2607 """ 

2608 Rename the left and right indices. 

2609 

2610 If there is overlap, and suffix is not None, add 

2611 suffix, otherwise, leave it as-is. 

2612 

2613 Parameters 

2614 ---------- 

2615 x : original column name 

2616 suffix : str or None 

2617 

2618 Returns 

2619 ------- 

2620 x : renamed column name 

2621 """ 

2622 if x in to_rename and suffix is not None: 

2623 return f"{x}{suffix}" 

2624 return x 

2625 

2626 lrenamer = partial(renamer, suffix=lsuffix) 

2627 rrenamer = partial(renamer, suffix=rsuffix) 

2628 

2629 llabels = left._transform_index(lrenamer) 

2630 rlabels = right._transform_index(rrenamer) 

2631 

2632 dups = [] 

2633 if not llabels.is_unique: 

2634 # Only raise when the duplicates are caused by the suffixes; columns that 

2635 # were already duplicated in the input should not raise 

2636 dups = llabels[(llabels.duplicated()) & (~left.duplicated())].tolist() 

2637 if not rlabels.is_unique: 

2638 dups.extend(rlabels[(rlabels.duplicated()) & (~right.duplicated())].tolist()) 

2639 if dups: 

2640 raise MergeError( 

2641 f"Passing 'suffixes' which cause duplicate columns {set(dups)} is " 

2642 f"not allowed.", 

2643 ) 

2644 

2645 return llabels, rlabels
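# e.g. with left = Index(["key", "v"]), right = Index(["key", "w"]) and
# suffixes == ("_x", "_y"), only the overlapping "key" is renamed, giving
# llabels = Index(["key_x", "v"]) and rlabels = Index(["key_y", "w"]).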