Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/reshape/merge.py: 11%

1""" 

2SQL-style merge routines 

3""" 

4from __future__ import annotations 

5 

6from collections.abc import ( 

7 Hashable, 

8 Sequence, 

9) 

10import datetime 

11from functools import partial 

12from typing import ( 

13 TYPE_CHECKING, 

14 Literal, 

15 cast, 

16 final, 

17) 

18import uuid 

19import warnings 

20 

21import numpy as np 

22 

23from pandas._libs import ( 

24 Timedelta, 

25 hashtable as libhashtable, 

26 join as libjoin, 

27 lib, 

28) 

29from pandas._libs.lib import is_range_indexer 

30from pandas._typing import ( 

31 AnyArrayLike, 

32 ArrayLike, 

33 IndexLabel, 

34 JoinHow, 

35 MergeHow, 

36 Shape, 

37 Suffixes, 

38 npt, 

39) 

40from pandas.errors import MergeError 

41from pandas.util._decorators import ( 

42 Appender, 

43 Substitution, 

44 cache_readonly, 

45) 

46from pandas.util._exceptions import find_stack_level 

47 

48from pandas.core.dtypes.base import ExtensionDtype 

49from pandas.core.dtypes.cast import find_common_type 

50from pandas.core.dtypes.common import ( 

51 ensure_int64, 

52 ensure_object, 

53 is_bool, 

54 is_bool_dtype, 

55 is_float_dtype, 

56 is_integer, 

57 is_integer_dtype, 

58 is_list_like, 

59 is_number, 

60 is_numeric_dtype, 

61 is_object_dtype, 

62 is_string_dtype, 

63 needs_i8_conversion, 

64) 

65from pandas.core.dtypes.dtypes import ( 

66 CategoricalDtype, 

67 DatetimeTZDtype, 

68) 

69from pandas.core.dtypes.generic import ( 

70 ABCDataFrame, 

71 ABCSeries, 

72) 

73from pandas.core.dtypes.missing import ( 

74 isna, 

75 na_value_for_dtype, 

76) 

77 

78from pandas import ( 

79 ArrowDtype, 

80 Categorical, 

81 Index, 

82 MultiIndex, 

83 Series, 

84) 

85import pandas.core.algorithms as algos 

86from pandas.core.arrays import ( 

87 ArrowExtensionArray, 

88 BaseMaskedArray, 

89 ExtensionArray, 

90) 

91from pandas.core.arrays.string_ import StringDtype 

92import pandas.core.common as com 

93from pandas.core.construction import ( 

94 ensure_wrapped_if_datetimelike, 

95 extract_array, 

96) 

97from pandas.core.frame import _merge_doc 

98from pandas.core.indexes.api import default_index 

99from pandas.core.sorting import ( 

100 get_group_index, 

101 is_int64_overflow_possible, 

102) 

103 

104if TYPE_CHECKING: 

105 from pandas import DataFrame 

106 from pandas.core import groupby 

107 from pandas.core.arrays import DatetimeArray 

108 from pandas.core.indexes.frozen import FrozenList 

109 

_factorizers = {
    np.int64: libhashtable.Int64Factorizer,
    np.longlong: libhashtable.Int64Factorizer,
    np.int32: libhashtable.Int32Factorizer,
    np.int16: libhashtable.Int16Factorizer,
    np.int8: libhashtable.Int8Factorizer,
    np.uint64: libhashtable.UInt64Factorizer,
    np.uint32: libhashtable.UInt32Factorizer,
    np.uint16: libhashtable.UInt16Factorizer,
    np.uint8: libhashtable.UInt8Factorizer,
    np.bool_: libhashtable.UInt8Factorizer,
    np.float64: libhashtable.Float64Factorizer,
    np.float32: libhashtable.Float32Factorizer,
    np.complex64: libhashtable.Complex64Factorizer,
    np.complex128: libhashtable.Complex128Factorizer,
    np.object_: libhashtable.ObjectFactorizer,
}

# See https://github.com/pandas-dev/pandas/issues/52451
if np.intc is not np.int32:
    _factorizers[np.intc] = libhashtable.Int64Factorizer
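
# Roughly how the table above is consumed (a hedged note): _factorize_keys(),
# later in this module, selects a factorizer by the NumPy dtype of the join
# keys, so e.g. two int64 key arrays are label-encoded through
# Int64Factorizer before the libjoin routines see them; note that np.bool_
# keys reuse the uint8 factorizer.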

_known = (np.ndarray, ExtensionArray, Index, ABCSeries)


@Substitution("\nleft : DataFrame or named Series")
@Appender(_merge_doc, indents=0)
def merge(
    left: DataFrame | Series,
    right: DataFrame | Series,
    how: MergeHow = "inner",
    on: IndexLabel | AnyArrayLike | None = None,
    left_on: IndexLabel | AnyArrayLike | None = None,
    right_on: IndexLabel | AnyArrayLike | None = None,
    left_index: bool = False,
    right_index: bool = False,
    sort: bool = False,
    suffixes: Suffixes = ("_x", "_y"),
    copy: bool | None = None,
    indicator: str | bool = False,
    validate: str | None = None,
) -> DataFrame:
    left_df = _validate_operand(left)
    right_df = _validate_operand(right)
    if how == "cross":
        return _cross_merge(
            left_df,
            right_df,
            on=on,
            left_on=left_on,
            right_on=right_on,
            left_index=left_index,
            right_index=right_index,
            sort=sort,
            suffixes=suffixes,
            indicator=indicator,
            validate=validate,
            copy=copy,
        )
    else:
        op = _MergeOperation(
            left_df,
            right_df,
            how=how,
            on=on,
            left_on=left_on,
            right_on=right_on,
            left_index=left_index,
            right_index=right_index,
            sort=sort,
            suffixes=suffixes,
            indicator=indicator,
            validate=validate,
        )
        return op.get_result(copy=copy)
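
# Illustrative sketch of merge() above: an inner merge keeps only keys
# present on both sides.
#
# >>> df1 = pd.DataFrame({"key": ["a", "b"], "x": [1, 2]})
# >>> df2 = pd.DataFrame({"key": ["b", "c"], "y": [3, 4]})
# >>> pd.merge(df1, df2, on="key", how="inner")
#   key  x  y
# 0   b  2  3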

def _cross_merge(
    left: DataFrame,
    right: DataFrame,
    on: IndexLabel | AnyArrayLike | None = None,
    left_on: IndexLabel | AnyArrayLike | None = None,
    right_on: IndexLabel | AnyArrayLike | None = None,
    left_index: bool = False,
    right_index: bool = False,
    sort: bool = False,
    suffixes: Suffixes = ("_x", "_y"),
    copy: bool | None = None,
    indicator: str | bool = False,
    validate: str | None = None,
) -> DataFrame:
    """
    See merge.__doc__ with how='cross'
    """

    if (
        left_index
        or right_index
        or right_on is not None
        or left_on is not None
        or on is not None
    ):
        raise MergeError(
            "Can not pass on, right_on, left_on or set right_index=True or "
            "left_index=True"
        )

    cross_col = f"_cross_{uuid.uuid4()}"
    left = left.assign(**{cross_col: 1})
    right = right.assign(**{cross_col: 1})

    left_on = right_on = [cross_col]

    res = merge(
        left,
        right,
        how="inner",
        on=on,
        left_on=left_on,
        right_on=right_on,
        left_index=left_index,
        right_index=right_index,
        sort=sort,
        suffixes=suffixes,
        indicator=indicator,
        validate=validate,
        copy=copy,
    )
    del res[cross_col]
    return res
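
# Sketch of what the helper above produces: how="cross" yields the row-wise
# Cartesian product, so a 2-row left and a 2-row right give 2 * 2 = 4 rows.
#
# >>> pd.merge(pd.DataFrame({"a": [1, 2]}), pd.DataFrame({"b": ["x", "y"]}),
# ...          how="cross")
#    a  b
# 0  1  x
# 1  1  y
# 2  2  x
# 3  2  y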

def _groupby_and_merge(
    by, left: DataFrame | Series, right: DataFrame | Series, merge_pieces
):
    """
    groupby & merge; we are always performing a left-by type operation

    Parameters
    ----------
    by: field to group
    left: DataFrame
    right: DataFrame
    merge_pieces: function for merging
    """
    pieces = []
    if not isinstance(by, (list, tuple)):
        by = [by]

    lby = left.groupby(by, sort=False)
    rby: groupby.DataFrameGroupBy | groupby.SeriesGroupBy | None = None

    # if we can groupby the rhs
    # then we can get vastly better perf
    if all(item in right.columns for item in by):
        rby = right.groupby(by, sort=False)

    for key, lhs in lby._grouper.get_iterator(lby._selected_obj, axis=lby.axis):
        if rby is None:
            rhs = right
        else:
            try:
                rhs = right.take(rby.indices[key])
            except KeyError:
                # key doesn't exist in right
                lcols = lhs.columns.tolist()
                cols = lcols + [r for r in right.columns if r not in set(lcols)]
                merged = lhs.reindex(columns=cols)
                merged.index = range(len(merged))
                pieces.append(merged)
                continue

        merged = merge_pieces(lhs, rhs)

        # make sure join keys are in the merged
        # TODO, should merge_pieces do this?
        merged[by] = key

        pieces.append(merged)

    # preserve the original order
    # if we have a missing piece this can be reset
    from pandas.core.reshape.concat import concat

    result = concat(pieces, ignore_index=True)
    result = result.reindex(columns=pieces[0].columns, copy=False)
    return result, lby
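
# Rough sketch of the flow above: with by="g", each left group is merged
# against only the right rows sharing that "g" value (found via rby.indices),
# so merge_pieces never sees cross-group candidates; left groups with no
# right counterpart are reindexed to the union of columns and contribute
# all-NaN right-hand values.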

def merge_ordered(
    left: DataFrame | Series,
    right: DataFrame | Series,
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_by=None,
    right_by=None,
    fill_method: str | None = None,
    suffixes: Suffixes = ("_x", "_y"),
    how: JoinHow = "outer",
) -> DataFrame:
    """
    Perform a merge for ordered data with optional filling/interpolation.

    Designed for ordered data like time series data. Optionally
    perform group-wise merge (see examples).

    Parameters
    ----------
    left : DataFrame or named Series
    right : DataFrame or named Series
    on : label or list
        Field names to join on. Must be found in both DataFrames.
    left_on : label or list, or array-like
        Field names to join on in left DataFrame. Can be a vector or list of
        vectors of the length of the DataFrame to use a particular vector as
        the join key instead of columns.
    right_on : label or list, or array-like
        Field names to join on in right DataFrame or vector/list of vectors per
        left_on docs.
    left_by : column name or list of column names
        Group left DataFrame by group columns and merge piece by piece with
        right DataFrame. Must be None if either left or right is a Series.
    right_by : column name or list of column names
        Group right DataFrame by group columns and merge piece by piece with
        left DataFrame. Must be None if either left or right is a Series.
    fill_method : {'ffill', None}, default None
        Interpolation method for data.
    suffixes : list-like, default is ("_x", "_y")
        A length-2 sequence where each element is optionally a string
        indicating the suffix to add to overlapping column names in
        `left` and `right` respectively. Pass a value of `None` instead
        of a string to indicate that the column name from `left` or
        `right` should be left as-is, with no suffix. At least one of the
        values must not be None.
    how : {'left', 'right', 'outer', 'inner'}, default 'outer'
        * left: use only keys from left frame (SQL: left outer join)
        * right: use only keys from right frame (SQL: right outer join)
        * outer: use union of keys from both frames (SQL: full outer join)
        * inner: use intersection of keys from both frames (SQL: inner join).

    Returns
    -------
    DataFrame
        The merged DataFrame output type will be the same as
        'left', if it is a subclass of DataFrame.

    See Also
    --------
    merge : Merge with a database-style join.
    merge_asof : Merge on nearest keys.

    Examples
    --------
    >>> from pandas import merge_ordered
    >>> df1 = pd.DataFrame(
    ...     {
    ...         "key": ["a", "c", "e", "a", "c", "e"],
    ...         "lvalue": [1, 2, 3, 1, 2, 3],
    ...         "group": ["a", "a", "a", "b", "b", "b"]
    ...     }
    ... )
    >>> df1
      key  lvalue group
    0   a       1     a
    1   c       2     a
    2   e       3     a
    3   a       1     b
    4   c       2     b
    5   e       3     b

    >>> df2 = pd.DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})
    >>> df2
      key  rvalue
    0   b       1
    1   c       2
    2   d       3

    >>> merge_ordered(df1, df2, fill_method="ffill", left_by="group")
      key  lvalue group  rvalue
    0   a       1     a     NaN
    1   b       1     a     1.0
    2   c       2     a     2.0
    3   d       2     a     3.0
    4   e       3     a     3.0
    5   a       1     b     NaN
    6   b       1     b     1.0
    7   c       2     b     2.0
    8   d       2     b     3.0
    9   e       3     b     3.0
    """

    def _merger(x, y) -> DataFrame:
        # perform the ordered merge operation
        op = _OrderedMerge(
            x,
            y,
            on=on,
            left_on=left_on,
            right_on=right_on,
            suffixes=suffixes,
            fill_method=fill_method,
            how=how,
        )
        return op.get_result()

    if left_by is not None and right_by is not None:
        raise ValueError("Can only group either left or right frames")
    if left_by is not None:
        if isinstance(left_by, str):
            left_by = [left_by]
        check = set(left_by).difference(left.columns)
        if len(check) != 0:
            raise KeyError(f"{check} not found in left columns")
        result, _ = _groupby_and_merge(left_by, left, right, lambda x, y: _merger(x, y))
    elif right_by is not None:
        if isinstance(right_by, str):
            right_by = [right_by]
        check = set(right_by).difference(right.columns)
        if len(check) != 0:
            raise KeyError(f"{check} not found in right columns")
        result, _ = _groupby_and_merge(
            right_by, right, left, lambda x, y: _merger(y, x)
        )
    else:
        result = _merger(left, right)
    return result

def merge_asof(
    left: DataFrame | Series,
    right: DataFrame | Series,
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_index: bool = False,
    right_index: bool = False,
    by=None,
    left_by=None,
    right_by=None,
    suffixes: Suffixes = ("_x", "_y"),
    tolerance: int | Timedelta | None = None,
    allow_exact_matches: bool = True,
    direction: str = "backward",
) -> DataFrame:
    """
    Perform a merge by key distance.

    This is similar to a left-join except that we match on nearest
    key rather than equal keys. Both DataFrames must be sorted by the key.

    For each row in the left DataFrame:

      - A "backward" search selects the last row in the right DataFrame whose
        'on' key is less than or equal to the left's key.

      - A "forward" search selects the first row in the right DataFrame whose
        'on' key is greater than or equal to the left's key.

      - A "nearest" search selects the row in the right DataFrame whose 'on'
        key is closest in absolute distance to the left's key.

    Optionally match on equivalent keys with 'by' before searching with 'on'.

    Parameters
    ----------
    left : DataFrame or named Series
    right : DataFrame or named Series
    on : label
        Field name to join on. Must be found in both DataFrames.
        The data MUST be ordered. Furthermore this must be a numeric column,
        such as datetimelike, integer, or float. On or left_on/right_on
        must be given.
    left_on : label
        Field name to join on in left DataFrame.
    right_on : label
        Field name to join on in right DataFrame.
    left_index : bool
        Use the index of the left DataFrame as the join key.
    right_index : bool
        Use the index of the right DataFrame as the join key.
    by : column name or list of column names
        Match on these columns before performing merge operation.
    left_by : column name
        Field names to match on in the left DataFrame.
    right_by : column name
        Field names to match on in the right DataFrame.
    suffixes : 2-length sequence (tuple, list, ...)
        Suffix to apply to overlapping column names in the left and right
        side, respectively.
    tolerance : int or Timedelta, optional, default None
        Select asof tolerance within this range; must be compatible
        with the merge index.
    allow_exact_matches : bool, default True

        - If True, allow matching with the same 'on' value
          (i.e. less-than-or-equal-to / greater-than-or-equal-to)
        - If False, don't match the same 'on' value
          (i.e., strictly less-than / strictly greater-than).

    direction : 'backward' (default), 'forward', or 'nearest'
        Whether to search for prior, subsequent, or closest matches.

    Returns
    -------
    DataFrame

    See Also
    --------
    merge : Merge with a database-style join.
    merge_ordered : Merge with optional filling/interpolation.

    Examples
    --------
    >>> left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]})
    >>> left
        a left_val
    0   1        a
    1   5        b
    2  10        c

    >>> right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]})
    >>> right
       a  right_val
    0  1          1
    1  2          2
    2  3          3
    3  6          6
    4  7          7

    >>> pd.merge_asof(left, right, on="a")
        a left_val  right_val
    0   1        a          1
    1   5        b          3
    2  10        c          7

    >>> pd.merge_asof(left, right, on="a", allow_exact_matches=False)
        a left_val  right_val
    0   1        a        NaN
    1   5        b        3.0
    2  10        c        7.0

    >>> pd.merge_asof(left, right, on="a", direction="forward")
        a left_val  right_val
    0   1        a        1.0
    1   5        b        6.0
    2  10        c        NaN

    >>> pd.merge_asof(left, right, on="a", direction="nearest")
        a left_val  right_val
    0   1        a          1
    1   5        b          6
    2  10        c          7

    We can use indexed DataFrames as well.

    >>> left = pd.DataFrame({"left_val": ["a", "b", "c"]}, index=[1, 5, 10])
    >>> left
       left_val
    1         a
    5         b
    10        c

    >>> right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7]}, index=[1, 2, 3, 6, 7])
    >>> right
       right_val
    1          1
    2          2
    3          3
    6          6
    7          7

    >>> pd.merge_asof(left, right, left_index=True, right_index=True)
       left_val  right_val
    1         a          1
    5         b          3
    10        c          7

    Here is a real-world time-series example

    >>> quotes = pd.DataFrame(
    ...     {
    ...         "time": [
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.030"),
    ...             pd.Timestamp("2016-05-25 13:30:00.041"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.049"),
    ...             pd.Timestamp("2016-05-25 13:30:00.072"),
    ...             pd.Timestamp("2016-05-25 13:30:00.075")
    ...         ],
    ...         "ticker": [
    ...             "GOOG",
    ...             "MSFT",
    ...             "MSFT",
    ...             "MSFT",
    ...             "GOOG",
    ...             "AAPL",
    ...             "GOOG",
    ...             "MSFT"
    ...         ],
    ...         "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
    ...         "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]
    ...     }
    ... )
    >>> quotes
                         time ticker     bid     ask
    0 2016-05-25 13:30:00.023   GOOG  720.50  720.93
    1 2016-05-25 13:30:00.023   MSFT   51.95   51.96
    2 2016-05-25 13:30:00.030   MSFT   51.97   51.98
    3 2016-05-25 13:30:00.041   MSFT   51.99   52.00
    4 2016-05-25 13:30:00.048   GOOG  720.50  720.93
    5 2016-05-25 13:30:00.049   AAPL   97.99   98.01
    6 2016-05-25 13:30:00.072   GOOG  720.50  720.88
    7 2016-05-25 13:30:00.075   MSFT   52.01   52.03

    >>> trades = pd.DataFrame(
    ...     {
    ...         "time": [
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.038"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048")
    ...         ],
    ...         "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
    ...         "price": [51.95, 51.95, 720.77, 720.92, 98.0],
    ...         "quantity": [75, 155, 100, 100, 100]
    ...     }
    ... )
    >>> trades
                         time ticker   price  quantity
    0 2016-05-25 13:30:00.023   MSFT   51.95        75
    1 2016-05-25 13:30:00.038   MSFT   51.95       155
    2 2016-05-25 13:30:00.048   GOOG  720.77       100
    3 2016-05-25 13:30:00.048   GOOG  720.92       100
    4 2016-05-25 13:30:00.048   AAPL   98.00       100

    By default we are taking the asof of the quotes

    >>> pd.merge_asof(trades, quotes, on="time", by="ticker")
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155   51.97   51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 2ms between the quote time and the trade time

    >>> pd.merge_asof(
    ...     trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("2ms")
    ... )
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155     NaN     NaN
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 10ms between the quote time and the trade time
    and we exclude exact matches on time. However *prior* data will
    propagate forward

    >>> pd.merge_asof(
    ...     trades,
    ...     quotes,
    ...     on="time",
    ...     by="ticker",
    ...     tolerance=pd.Timedelta("10ms"),
    ...     allow_exact_matches=False
    ... )
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75     NaN     NaN
    1 2016-05-25 13:30:00.038   MSFT   51.95       155   51.97   51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100     NaN     NaN
    3 2016-05-25 13:30:00.048   GOOG  720.92       100     NaN     NaN
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN
    """
    op = _AsOfMerge(
        left,
        right,
        on=on,
        left_on=left_on,
        right_on=right_on,
        left_index=left_index,
        right_index=right_index,
        by=by,
        left_by=left_by,
        right_by=right_by,
        suffixes=suffixes,
        how="asof",
        tolerance=tolerance,
        allow_exact_matches=allow_exact_matches,
        direction=direction,
    )
    return op.get_result()

# TODO: transformations??
# TODO: only copy DataFrames when modification necessary
class _MergeOperation:
    """
    Perform a database (SQL) merge operation between two DataFrame or Series
    objects using either columns as keys or their row indexes
    """

    _merge_type = "merge"
    how: JoinHow | Literal["asof"]
    on: IndexLabel | None
    # left_on/right_on may be None when passed, but in validate_specification
    # get replaced with non-None.
    left_on: Sequence[Hashable | AnyArrayLike]
    right_on: Sequence[Hashable | AnyArrayLike]
    left_index: bool
    right_index: bool
    sort: bool
    suffixes: Suffixes
    copy: bool
    indicator: str | bool
    validate: str | None
    join_names: list[Hashable]
    right_join_keys: list[ArrayLike]
    left_join_keys: list[ArrayLike]

    def __init__(
        self,
        left: DataFrame | Series,
        right: DataFrame | Series,
        how: JoinHow | Literal["asof"] = "inner",
        on: IndexLabel | AnyArrayLike | None = None,
        left_on: IndexLabel | AnyArrayLike | None = None,
        right_on: IndexLabel | AnyArrayLike | None = None,
        left_index: bool = False,
        right_index: bool = False,
        sort: bool = True,
        suffixes: Suffixes = ("_x", "_y"),
        indicator: str | bool = False,
        validate: str | None = None,
    ) -> None:
        _left = _validate_operand(left)
        _right = _validate_operand(right)
        self.left = self.orig_left = _left
        self.right = self.orig_right = _right
        self.how = how

        self.on = com.maybe_make_list(on)

        self.suffixes = suffixes
        self.sort = sort or how == "outer"

        self.left_index = left_index
        self.right_index = right_index

        self.indicator = indicator

        if not is_bool(left_index):
            raise ValueError(
                f"left_index parameter must be of type bool, not {type(left_index)}"
            )
        if not is_bool(right_index):
            raise ValueError(
                f"right_index parameter must be of type bool, not {type(right_index)}"
            )

        # GH 40993: raise when merging between different levels; enforced in 2.0
        if _left.columns.nlevels != _right.columns.nlevels:
            msg = (
                "Not allowed to merge between different levels. "
                f"({_left.columns.nlevels} levels on the left, "
                f"{_right.columns.nlevels} on the right)"
            )
            raise MergeError(msg)

        self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on)

        (
            self.left_join_keys,
            self.right_join_keys,
            self.join_names,
            left_drop,
            right_drop,
        ) = self._get_merge_keys()

        if left_drop:
            self.left = self.left._drop_labels_or_levels(left_drop)

        if right_drop:
            self.right = self.right._drop_labels_or_levels(right_drop)

        self._maybe_require_matching_dtypes(self.left_join_keys, self.right_join_keys)
        self._validate_tolerance(self.left_join_keys)

        # validate the merge keys dtypes. We may need to coerce
        # to avoid incompatible dtypes
        self._maybe_coerce_merge_keys()

        # If argument passed to validate,
        # check if columns specified as unique
        # are in fact unique.
        if validate is not None:
            self._validate_validate_kwd(validate)

    def _maybe_require_matching_dtypes(
        self, left_join_keys: list[ArrayLike], right_join_keys: list[ArrayLike]
    ) -> None:
        # Overridden by AsOfMerge
        pass

    def _validate_tolerance(self, left_join_keys: list[ArrayLike]) -> None:
        # Overridden by AsOfMerge
        pass

    @final
    def _reindex_and_concat(
        self,
        join_index: Index,
        left_indexer: npt.NDArray[np.intp] | None,
        right_indexer: npt.NDArray[np.intp] | None,
        copy: bool | None,
    ) -> DataFrame:
        """
        reindex along index and concat along columns.
        """
        # Take views so we do not alter the originals
        left = self.left[:]
        right = self.right[:]

        llabels, rlabels = _items_overlap_with_suffix(
            self.left._info_axis, self.right._info_axis, self.suffixes
        )

        if left_indexer is not None and not is_range_indexer(left_indexer, len(left)):
            # Pinning the index here (and in the right code just below) is not
            # necessary, but makes the `.take` more performant if we have e.g.
            # a MultiIndex for left.index.
            lmgr = left._mgr.reindex_indexer(
                join_index,
                left_indexer,
                axis=1,
                copy=False,
                only_slice=True,
                allow_dups=True,
                use_na_proxy=True,
            )
            left = left._constructor_from_mgr(lmgr, axes=lmgr.axes)
            left.index = join_index

        if right_indexer is not None and not is_range_indexer(
            right_indexer, len(right)
        ):
            rmgr = right._mgr.reindex_indexer(
                join_index,
                right_indexer,
                axis=1,
                copy=False,
                only_slice=True,
                allow_dups=True,
                use_na_proxy=True,
            )
            right = right._constructor_from_mgr(rmgr, axes=rmgr.axes)
            right.index = join_index

        from pandas import concat

        left.columns = llabels
        right.columns = rlabels
        result = concat([left, right], axis=1, copy=copy)
        return result
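
    # A hedged reading of the manager calls above: allow_dups=True permits
    # the repeated row positions a many-to-many join produces,
    # use_na_proxy=True lets -1 positions materialize as all-NA without
    # copying the operand, and only_slice=True keeps untaken columns as
    # views.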

    def get_result(self, copy: bool | None = True) -> DataFrame:
        if self.indicator:
            self.left, self.right = self._indicator_pre_merge(self.left, self.right)

        join_index, left_indexer, right_indexer = self._get_join_info()

        result = self._reindex_and_concat(
            join_index, left_indexer, right_indexer, copy=copy
        )
        result = result.__finalize__(self, method=self._merge_type)

        if self.indicator:
            result = self._indicator_post_merge(result)

        self._maybe_add_join_keys(result, left_indexer, right_indexer)

        self._maybe_restore_index_levels(result)

        return result.__finalize__(self, method="merge")

    @final
    @cache_readonly
    def _indicator_name(self) -> str | None:
        if isinstance(self.indicator, str):
            return self.indicator
        elif isinstance(self.indicator, bool):
            return "_merge" if self.indicator else None
        else:
            raise ValueError(
                "indicator option can only accept boolean or string arguments"
            )

    @final
    def _indicator_pre_merge(
        self, left: DataFrame, right: DataFrame
    ) -> tuple[DataFrame, DataFrame]:
        columns = left.columns.union(right.columns)

        for i in ["_left_indicator", "_right_indicator"]:
            if i in columns:
                raise ValueError(
                    "Cannot use `indicator=True` option when "
                    f"data contains a column named {i}"
                )
        if self._indicator_name in columns:
            raise ValueError(
                "Cannot use name of an existing column for indicator column"
            )

        left = left.copy()
        right = right.copy()

        left["_left_indicator"] = 1
        left["_left_indicator"] = left["_left_indicator"].astype("int8")

        right["_right_indicator"] = 2
        right["_right_indicator"] = right["_right_indicator"].astype("int8")

        return left, right

    @final
    def _indicator_post_merge(self, result: DataFrame) -> DataFrame:
        result["_left_indicator"] = result["_left_indicator"].fillna(0)
        result["_right_indicator"] = result["_right_indicator"].fillna(0)

        result[self._indicator_name] = Categorical(
            (result["_left_indicator"] + result["_right_indicator"]),
            categories=[1, 2, 3],
        )
        result[self._indicator_name] = result[
            self._indicator_name
        ].cat.rename_categories(["left_only", "right_only", "both"])

        result = result.drop(labels=["_left_indicator", "_right_indicator"], axis=1)
        return result
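
    # How the indicator arithmetic above plays out: left-only rows sum to
    # 1 + 0 = 1, right-only rows to 0 + 2 = 2, and matched rows to 1 + 2 = 3,
    # mapping one-to-one onto ["left_only", "right_only", "both"].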

    @final
    def _maybe_restore_index_levels(self, result: DataFrame) -> None:
        """
        Restore index levels specified as `on` parameters

        Here we check for cases where `self.left_on` and `self.right_on` pairs
        each reference an index level in their respective DataFrames. The
        joined columns corresponding to these pairs are then restored to the
        index of `result`.

        **Note:** This method has side effects. It modifies `result` in-place

        Parameters
        ----------
        result: DataFrame
            merge result

        Returns
        -------
        None
        """
        names_to_restore = []
        for name, left_key, right_key in zip(
            self.join_names, self.left_on, self.right_on
        ):
            if (
                # Argument 1 to "_is_level_reference" of "NDFrame" has incompatible
                # type "Union[Hashable, ExtensionArray, Index, Series]"; expected
                # "Hashable"
                self.orig_left._is_level_reference(left_key)  # type: ignore[arg-type]
                # Argument 1 to "_is_level_reference" of "NDFrame" has incompatible
                # type "Union[Hashable, ExtensionArray, Index, Series]"; expected
                # "Hashable"
                and self.orig_right._is_level_reference(
                    right_key  # type: ignore[arg-type]
                )
                and left_key == right_key
                and name not in result.index.names
            ):
                names_to_restore.append(name)

        if names_to_restore:
            result.set_index(names_to_restore, inplace=True)

    @final
    def _maybe_add_join_keys(
        self,
        result: DataFrame,
        left_indexer: npt.NDArray[np.intp] | None,
        right_indexer: npt.NDArray[np.intp] | None,
    ) -> None:
        left_has_missing = None
        right_has_missing = None

        assert all(isinstance(x, _known) for x in self.left_join_keys)

        keys = zip(self.join_names, self.left_on, self.right_on)
        for i, (name, lname, rname) in enumerate(keys):
            if not _should_fill(lname, rname):
                continue

            take_left, take_right = None, None

            if name in result:
                if left_indexer is not None or right_indexer is not None:
                    if name in self.left:
                        if left_has_missing is None:
                            left_has_missing = (
                                False
                                if left_indexer is None
                                else (left_indexer == -1).any()
                            )

                        if left_has_missing:
                            take_right = self.right_join_keys[i]

                            if result[name].dtype != self.left[name].dtype:
                                take_left = self.left[name]._values

                    elif name in self.right:
                        if right_has_missing is None:
                            right_has_missing = (
                                False
                                if right_indexer is None
                                else (right_indexer == -1).any()
                            )

                        if right_has_missing:
                            take_left = self.left_join_keys[i]

                            if result[name].dtype != self.right[name].dtype:
                                take_right = self.right[name]._values

            else:
                take_left = self.left_join_keys[i]
                take_right = self.right_join_keys[i]

            if take_left is not None or take_right is not None:
                if take_left is None:
                    lvals = result[name]._values
                elif left_indexer is None:
                    lvals = take_left
                else:
                    # TODO: can we pin down take_left's type earlier?
                    take_left = extract_array(take_left, extract_numpy=True)
                    lfill = na_value_for_dtype(take_left.dtype)
                    lvals = algos.take_nd(take_left, left_indexer, fill_value=lfill)

                if take_right is None:
                    rvals = result[name]._values
                elif right_indexer is None:
                    rvals = take_right
                else:
                    # TODO: can we pin down take_right's type earlier?
                    taker = extract_array(take_right, extract_numpy=True)
                    rfill = na_value_for_dtype(taker.dtype)
                    rvals = algos.take_nd(taker, right_indexer, fill_value=rfill)

                # if we have an all missing left_indexer
                # make sure to just use the right values or vice-versa
                if left_indexer is not None and (left_indexer == -1).all():
                    key_col = Index(rvals)
                    result_dtype = rvals.dtype
                elif right_indexer is not None and (right_indexer == -1).all():
                    key_col = Index(lvals)
                    result_dtype = lvals.dtype
                else:
                    key_col = Index(lvals)
                    if left_indexer is not None:
                        mask_left = left_indexer == -1
                        key_col = key_col.where(~mask_left, rvals)
                    result_dtype = find_common_type([lvals.dtype, rvals.dtype])
                    if (
                        lvals.dtype.kind == "M"
                        and rvals.dtype.kind == "M"
                        and result_dtype.kind == "O"
                    ):
                        # TODO(non-nano) Workaround for common_type not dealing
                        # with different resolutions
                        result_dtype = key_col.dtype

                if result._is_label_reference(name):
                    result[name] = result._constructor_sliced(
                        key_col, dtype=result_dtype, index=result.index
                    )
                elif result._is_level_reference(name):
                    if isinstance(result.index, MultiIndex):
                        key_col.name = name
                        idx_list = [
                            result.index.get_level_values(level_name)
                            if level_name != name
                            else key_col
                            for level_name in result.index.names
                        ]

                        result.set_index(idx_list, inplace=True)
                    else:
                        result.index = Index(key_col, name=name)
                else:
                    result.insert(i, name or f"key_{i}", key_col)
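
    # Worked example of the stitching above (a sketch): for an outer merge of
    # left keys [1, 2] with right keys [2, 3], left_indexer = [0, 1, -1] and
    # right_indexer = [-1, 0, 1], so key_col.where(~mask_left, rvals) fills
    # the key for the unmatched right row (3) from rvals.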

    def _get_join_indexers(
        self,
    ) -> tuple[npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
        """return the join indexers"""
        # make mypy happy
        assert self.how != "asof"
        return get_join_indexers(
            self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how
        )

    @final
    def _get_join_info(
        self,
    ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
        left_ax = self.left.index
        right_ax = self.right.index

        if self.left_index and self.right_index and self.how != "asof":
            join_index, left_indexer, right_indexer = left_ax.join(
                right_ax, how=self.how, return_indexers=True, sort=self.sort
            )

        elif self.right_index and self.how == "left":
            join_index, left_indexer, right_indexer = _left_join_on_index(
                left_ax, right_ax, self.left_join_keys, sort=self.sort
            )

        elif self.left_index and self.how == "right":
            join_index, right_indexer, left_indexer = _left_join_on_index(
                right_ax, left_ax, self.right_join_keys, sort=self.sort
            )
        else:
            (left_indexer, right_indexer) = self._get_join_indexers()

            if self.right_index:
                if len(self.left) > 0:
                    join_index = self._create_join_index(
                        left_ax,
                        right_ax,
                        left_indexer,
                        how="right",
                    )
                elif right_indexer is None:
                    join_index = right_ax.copy()
                else:
                    join_index = right_ax.take(right_indexer)
            elif self.left_index:
                if self.how == "asof":
                    # GH#33463 asof should always behave like a left merge
                    join_index = self._create_join_index(
                        left_ax,
                        right_ax,
                        left_indexer,
                        how="left",
                    )

                elif len(self.right) > 0:
                    join_index = self._create_join_index(
                        right_ax,
                        left_ax,
                        right_indexer,
                        how="left",
                    )
                elif left_indexer is None:
                    join_index = left_ax.copy()
                else:
                    join_index = left_ax.take(left_indexer)
            else:
                n = len(left_ax) if left_indexer is None else len(left_indexer)
                join_index = default_index(n)

        return join_index, left_indexer, right_indexer

    @final
    def _create_join_index(
        self,
        index: Index,
        other_index: Index,
        indexer: npt.NDArray[np.intp] | None,
        how: JoinHow = "left",
    ) -> Index:
        """
        Create a join index by rearranging one index to match another

        Parameters
        ----------
        index : Index
            index being rearranged
        other_index : Index
            used to supply values not found in index
        indexer : np.ndarray[np.intp] or None
            how to rearrange index
        how : str
            Replacement is only necessary if indexer based on other_index.

        Returns
        -------
        Index
        """
        if self.how in (how, "outer") and not isinstance(other_index, MultiIndex):
            # if final index requires values in other_index but not target
            # index, indexer may hold missing (-1) values, causing Index.take
            # to take the final value in target index. So, we set the last
            # element to be the desired fill value. We do not use allow_fill
            # and fill_value because it throws a ValueError on integer indices
            mask = indexer == -1
            if np.any(mask):
                fill_value = na_value_for_dtype(index.dtype, compat=False)
                index = index.append(Index([fill_value]))
        if indexer is None:
            return index.copy()
        return index.take(indexer)
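
    # Example of the -1 workaround above (a sketch): with index = [10, 20]
    # and indexer = [1, -1], appending the fill value first makes
    # take([1, -1]) yield [20, NaN] instead of wrapping around to 20, the
    # last real element.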

    @final
    def _get_merge_keys(
        self,
    ) -> tuple[
        list[ArrayLike],
        list[ArrayLike],
        list[Hashable],
        list[Hashable],
        list[Hashable],
    ]:
        """
        Returns
        -------
        left_keys, right_keys, join_names, left_drop, right_drop
        """
        left_keys: list[ArrayLike] = []
        right_keys: list[ArrayLike] = []
        join_names: list[Hashable] = []
        right_drop: list[Hashable] = []
        left_drop: list[Hashable] = []

        left, right = self.left, self.right

        is_lkey = lambda x: isinstance(x, _known) and len(x) == len(left)
        is_rkey = lambda x: isinstance(x, _known) and len(x) == len(right)

        # Note that pd.merge_asof() has separate 'on' and 'by' parameters. A
        # user could, for example, request 'left_index' and 'left_by'. In a
        # regular pd.merge(), users cannot specify both 'left_index' and
        # 'left_on'. (Instead, users have a MultiIndex). That means the
        # self.left_on in this function is always empty in a pd.merge(), but
        # a pd.merge_asof(left_index=True, left_by=...) will result in a
        # self.left_on array with a None in the middle of it. This requires
        # a work-around as designated in the code below.
        # See _validate_left_right_on() for where this happens.

        # ugh, spaghetti re #733
        if _any(self.left_on) and _any(self.right_on):
            for lk, rk in zip(self.left_on, self.right_on):
                lk = extract_array(lk, extract_numpy=True)
                rk = extract_array(rk, extract_numpy=True)
                if is_lkey(lk):
                    lk = cast(ArrayLike, lk)
                    left_keys.append(lk)
                    if is_rkey(rk):
                        rk = cast(ArrayLike, rk)
                        right_keys.append(rk)
                        join_names.append(None)  # what to do?
                    else:
                        # Then we're either Hashable or a wrong-length arraylike,
                        # the latter of which will raise
                        rk = cast(Hashable, rk)
                        if rk is not None:
                            right_keys.append(right._get_label_or_level_values(rk))
                            join_names.append(rk)
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index._values)
                            join_names.append(right.index.name)
                else:
                    if not is_rkey(rk):
                        # Then we're either Hashable or a wrong-length arraylike,
                        # the latter of which will raise
                        rk = cast(Hashable, rk)
                        if rk is not None:
                            right_keys.append(right._get_label_or_level_values(rk))
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index._values)
                        if lk is not None and lk == rk:  # FIXME: what about other NAs?
                            right_drop.append(rk)
                    else:
                        rk = cast(ArrayLike, rk)
                        right_keys.append(rk)
                    if lk is not None:
                        # Then we're either Hashable or a wrong-length arraylike,
                        # the latter of which will raise
                        lk = cast(Hashable, lk)
                        left_keys.append(left._get_label_or_level_values(lk))
                        join_names.append(lk)
                    else:
                        # work-around for merge_asof(left_index=True)
                        left_keys.append(left.index._values)
                        join_names.append(left.index.name)
        elif _any(self.left_on):
            for k in self.left_on:
                if is_lkey(k):
                    k = extract_array(k, extract_numpy=True)
                    k = cast(ArrayLike, k)
                    left_keys.append(k)
                    join_names.append(None)
                else:
                    # Then we're either Hashable or a wrong-length arraylike,
                    # the latter of which will raise
                    k = cast(Hashable, k)
                    left_keys.append(left._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.right.index, MultiIndex):
                right_keys = [
                    lev._values.take(lev_codes)
                    for lev, lev_codes in zip(
                        self.right.index.levels, self.right.index.codes
                    )
                ]
            else:
                right_keys = [self.right.index._values]
        elif _any(self.right_on):
            for k in self.right_on:
                k = extract_array(k, extract_numpy=True)
                if is_rkey(k):
                    k = cast(ArrayLike, k)
                    right_keys.append(k)
                    join_names.append(None)
                else:
                    # Then we're either Hashable or a wrong-length arraylike,
                    # the latter of which will raise
                    k = cast(Hashable, k)
                    right_keys.append(right._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.left.index, MultiIndex):
                left_keys = [
                    lev._values.take(lev_codes)
                    for lev, lev_codes in zip(
                        self.left.index.levels, self.left.index.codes
                    )
                ]
            else:
                left_keys = [self.left.index._values]

        return left_keys, right_keys, join_names, left_drop, right_drop

    @final
    def _maybe_coerce_merge_keys(self) -> None:
        # we have valid merges but we may have to further
        # coerce these if they are originally incompatible types
        #
        # for example if these are categorical, but are not dtype_equal
        # or if we have object and integer dtypes

        for lk, rk, name in zip(
            self.left_join_keys, self.right_join_keys, self.join_names
        ):
            if (len(lk) and not len(rk)) or (not len(lk) and len(rk)):
                continue

            lk = extract_array(lk, extract_numpy=True)
            rk = extract_array(rk, extract_numpy=True)

            lk_is_cat = isinstance(lk.dtype, CategoricalDtype)
            rk_is_cat = isinstance(rk.dtype, CategoricalDtype)
            lk_is_object_or_string = is_object_dtype(lk.dtype) or is_string_dtype(
                lk.dtype
            )
            rk_is_object_or_string = is_object_dtype(rk.dtype) or is_string_dtype(
                rk.dtype
            )

            # if either left or right is a categorical
            # then they must match exactly in categories & ordered
            if lk_is_cat and rk_is_cat:
                lk = cast(Categorical, lk)
                rk = cast(Categorical, rk)
                if lk._categories_match_up_to_permutation(rk):
                    continue

            elif lk_is_cat or rk_is_cat:
                pass

            elif lk.dtype == rk.dtype:
                continue

            msg = (
                f"You are trying to merge on {lk.dtype} and {rk.dtype} columns "
                f"for key '{name}'. If you wish to proceed you should use pd.concat"
            )

            # if we are numeric, then allow differing
            # kinds to proceed, eg. int64 and int8, int and float
            # further if we are object, but we infer to
            # the same, then proceed
            if is_numeric_dtype(lk.dtype) and is_numeric_dtype(rk.dtype):
                if lk.dtype.kind == rk.dtype.kind:
                    continue

                if isinstance(lk.dtype, ExtensionDtype) and not isinstance(
                    rk.dtype, ExtensionDtype
                ):
                    ct = find_common_type([lk.dtype, rk.dtype])
                    if isinstance(ct, ExtensionDtype):
                        com_cls = ct.construct_array_type()
                        rk = com_cls._from_sequence(rk, dtype=ct, copy=False)
                    else:
                        rk = rk.astype(ct)
                elif isinstance(rk.dtype, ExtensionDtype):
                    ct = find_common_type([lk.dtype, rk.dtype])
                    if isinstance(ct, ExtensionDtype):
                        com_cls = ct.construct_array_type()
                        lk = com_cls._from_sequence(lk, dtype=ct, copy=False)
                    else:
                        lk = lk.astype(ct)

                # check whether ints and floats
                if is_integer_dtype(rk.dtype) and is_float_dtype(lk.dtype):
                    # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int
                    with np.errstate(invalid="ignore"):
                        # error: Argument 1 to "astype" of "ndarray" has incompatible
                        # type "Union[ExtensionDtype, Any, dtype[Any]]"; expected
                        # "Union[dtype[Any], Type[Any], _SupportsDType[dtype[Any]]]"
                        casted = lk.astype(rk.dtype)  # type: ignore[arg-type]

                    mask = ~np.isnan(lk)
                    match = lk == casted
                    if not match[mask].all():
                        warnings.warn(
                            "You are merging on int and float "
                            "columns where the float values "
                            "are not equal to their int representation.",
                            UserWarning,
                            stacklevel=find_stack_level(),
                        )
                    continue

                if is_float_dtype(rk.dtype) and is_integer_dtype(lk.dtype):
                    # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int
                    with np.errstate(invalid="ignore"):
                        # error: Argument 1 to "astype" of "ndarray" has incompatible
                        # type "Union[ExtensionDtype, Any, dtype[Any]]"; expected
                        # "Union[dtype[Any], Type[Any], _SupportsDType[dtype[Any]]]"
                        casted = rk.astype(lk.dtype)  # type: ignore[arg-type]

                    mask = ~np.isnan(rk)
                    match = rk == casted
                    if not match[mask].all():
                        warnings.warn(
                            "You are merging on int and float "
                            "columns where the float values "
                            "are not equal to their int representation.",
                            UserWarning,
                            stacklevel=find_stack_level(),
                        )
                    continue

                # let's infer and see if we are ok
                if lib.infer_dtype(lk, skipna=False) == lib.infer_dtype(
                    rk, skipna=False
                ):
                    continue

            # Check if we are trying to merge on obviously
            # incompatible dtypes GH 9780, GH 15800

            # bool values are coerced to object
            elif (lk_is_object_or_string and is_bool_dtype(rk.dtype)) or (
                is_bool_dtype(lk.dtype) and rk_is_object_or_string
            ):
                pass

            # object values are allowed to be merged
            elif (lk_is_object_or_string and is_numeric_dtype(rk.dtype)) or (
                is_numeric_dtype(lk.dtype) and rk_is_object_or_string
            ):
                inferred_left = lib.infer_dtype(lk, skipna=False)
                inferred_right = lib.infer_dtype(rk, skipna=False)
                bool_types = ["integer", "mixed-integer", "boolean", "empty"]
                string_types = ["string", "unicode", "mixed", "bytes", "empty"]

                # inferred bool
                if inferred_left in bool_types and inferred_right in bool_types:
                    pass

                # unless we are merging non-string-like with string-like
                elif (
                    inferred_left in string_types and inferred_right not in string_types
                ) or (
                    inferred_right in string_types and inferred_left not in string_types
                ):
                    raise ValueError(msg)

            # datetimelikes must match exactly
            elif needs_i8_conversion(lk.dtype) and not needs_i8_conversion(rk.dtype):
                raise ValueError(msg)
            elif not needs_i8_conversion(lk.dtype) and needs_i8_conversion(rk.dtype):
                raise ValueError(msg)
            elif isinstance(lk.dtype, DatetimeTZDtype) and not isinstance(
                rk.dtype, DatetimeTZDtype
            ):
                raise ValueError(msg)
            elif not isinstance(lk.dtype, DatetimeTZDtype) and isinstance(
                rk.dtype, DatetimeTZDtype
            ):
                raise ValueError(msg)
            elif (
                isinstance(lk.dtype, DatetimeTZDtype)
                and isinstance(rk.dtype, DatetimeTZDtype)
            ) or (lk.dtype.kind == "M" and rk.dtype.kind == "M"):
                # allows datetime with different resolutions
                continue
            # datetime and timedelta not allowed
            elif lk.dtype.kind == "M" and rk.dtype.kind == "m":
                raise ValueError(msg)
            elif lk.dtype.kind == "m" and rk.dtype.kind == "M":
                raise ValueError(msg)

            elif is_object_dtype(lk.dtype) and is_object_dtype(rk.dtype):
                continue

            # Houston, we have a problem!
            # let's coerce to object if the dtypes aren't
            # categorical, otherwise coerce to the category
            # dtype. If we coerced categories to object,
            # then we would lose type information on some
            # columns, and end up trying to merge
            # incompatible dtypes. See GH 16900.
            if name in self.left.columns:
                typ = cast(Categorical, lk).categories.dtype if lk_is_cat else object
                self.left = self.left.copy()
                self.left[name] = self.left[name].astype(typ)
            if name in self.right.columns:
                typ = cast(Categorical, rk).categories.dtype if rk_is_cat else object
                self.right = self.right.copy()
                self.right[name] = self.right[name].astype(typ)
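
    # Concrete instance of the int/float warning above (a sketch): merging an
    # int64 key [1, 2, 3] with a float64 key [1.0, 2.5] casts 2.5 -> 2 with
    # information loss, so the UserWarning fires; [1.0, 2.0] would round-trip
    # exactly and merge silently.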

    def _validate_left_right_on(self, left_on, right_on):
        left_on = com.maybe_make_list(left_on)
        right_on = com.maybe_make_list(right_on)

        # Hm, any way to make this logic less complicated??
        if self.on is None and left_on is None and right_on is None:
            if self.left_index and self.right_index:
                left_on, right_on = (), ()
            elif self.left_index:
                raise MergeError("Must pass right_on or right_index=True")
            elif self.right_index:
                raise MergeError("Must pass left_on or left_index=True")
            else:
                # use the common columns
                left_cols = self.left.columns
                right_cols = self.right.columns
                common_cols = left_cols.intersection(right_cols)
                if len(common_cols) == 0:
                    raise MergeError(
                        "No common columns to perform merge on. "
                        f"Merge options: left_on={left_on}, "
                        f"right_on={right_on}, "
                        f"left_index={self.left_index}, "
                        f"right_index={self.right_index}"
                    )
                if (
                    not left_cols.join(common_cols, how="inner").is_unique
                    or not right_cols.join(common_cols, how="inner").is_unique
                ):
                    raise MergeError(f"Data columns not unique: {repr(common_cols)}")
                left_on = right_on = common_cols
        elif self.on is not None:
            if left_on is not None or right_on is not None:
                raise MergeError(
                    'Can only pass argument "on" OR "left_on" '
                    'and "right_on", not a combination of both.'
                )
            if self.left_index or self.right_index:
                raise MergeError(
                    'Can only pass argument "on" OR "left_index" '
                    'and "right_index", not a combination of both.'
                )
            left_on = right_on = self.on
        elif left_on is not None:
            if self.left_index:
                raise MergeError(
                    'Can only pass argument "left_on" OR "left_index" not both.'
                )
            if not self.right_index and right_on is None:
                raise MergeError('Must pass "right_on" OR "right_index".')
            n = len(left_on)
            if self.right_index:
                if len(left_on) != self.right.index.nlevels:
                    raise ValueError(
                        "len(left_on) must equal the number "
                        'of levels in the index of "right"'
                    )
                right_on = [None] * n
        elif right_on is not None:
            if self.right_index:
                raise MergeError(
                    'Can only pass argument "right_on" OR "right_index" not both.'
                )
            if not self.left_index and left_on is None:
                raise MergeError('Must pass "left_on" OR "left_index".')
            n = len(right_on)
            if self.left_index:
                if len(right_on) != self.left.index.nlevels:
                    raise ValueError(
                        "len(right_on) must equal the number "
                        'of levels in the index of "left"'
                    )
                left_on = [None] * n
        if len(right_on) != len(left_on):
            raise ValueError("len(right_on) must equal len(left_on)")

        return left_on, right_on

    @final
    def _validate_validate_kwd(self, validate: str) -> None:
        # Check uniqueness of each
        if self.left_index:
            left_unique = self.orig_left.index.is_unique
        else:
            left_unique = MultiIndex.from_arrays(self.left_join_keys).is_unique

        if self.right_index:
            right_unique = self.orig_right.index.is_unique
        else:
            right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique

        # Check data integrity
        if validate in ["one_to_one", "1:1"]:
            if not left_unique and not right_unique:
                raise MergeError(
                    "Merge keys are not unique in either left "
                    "or right dataset; not a one-to-one merge"
                )
            if not left_unique:
                raise MergeError(
                    "Merge keys are not unique in left dataset; not a one-to-one merge"
                )
            if not right_unique:
                raise MergeError(
                    "Merge keys are not unique in right dataset; not a one-to-one merge"
                )

        elif validate in ["one_to_many", "1:m"]:
            if not left_unique:
                raise MergeError(
                    "Merge keys are not unique in left dataset; not a one-to-many merge"
                )

        elif validate in ["many_to_one", "m:1"]:
            if not right_unique:
                raise MergeError(
                    "Merge keys are not unique in right dataset; "
                    "not a many-to-one merge"
                )

        elif validate in ["many_to_many", "m:m"]:
            pass

        else:
            raise ValueError(
                f'"{validate}" is not a valid argument. '
                "Valid arguments are:\n"
                '- "1:1"\n'
                '- "1:m"\n'
                '- "m:1"\n'
                '- "m:m"\n'
                '- "one_to_one"\n'
                '- "one_to_many"\n'
                '- "many_to_one"\n'
                '- "many_to_many"'
            )
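
    # Hedged usage sketch for the check above:
    # pd.merge(df1, df2, on="key", validate="1:1") raises MergeError as soon
    # as "key" is duplicated on either side, instead of silently fanning out
    # rows.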

1690 

1691 

1692def get_join_indexers( 

1693 left_keys: list[ArrayLike], 

1694 right_keys: list[ArrayLike], 

1695 sort: bool = False, 

1696 how: JoinHow = "inner", 

1697) -> tuple[npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: 

1698 """ 

1699 

1700 Parameters 

1701 ---------- 

1702 left_keys : list[ndarray, ExtensionArray, Index, Series] 

1703 right_keys : list[ndarray, ExtensionArray, Index, Series] 

1704 sort : bool, default False 

1705 how : {'inner', 'outer', 'left', 'right'}, default 'inner' 

1706 

1707 Returns 

1708 ------- 

1709 np.ndarray[np.intp] or None 

1710 Indexer into the left_keys. 

1711 np.ndarray[np.intp] or None 

1712 Indexer into the right_keys. 

1713 """ 

1714 assert len(left_keys) == len( 

1715 right_keys 

1716 ), "left_keys and right_keys must be the same length" 

1717 

1718 # fast-path for empty left/right 

1719 left_n = len(left_keys[0]) 

1720 right_n = len(right_keys[0]) 

1721 if left_n == 0: 

1722 if how in ["left", "inner"]: 

1723 return _get_empty_indexer() 

1724 elif not sort and how in ["right", "outer"]: 

1725 return _get_no_sort_one_missing_indexer(right_n, True) 

1726 elif right_n == 0: 

1727 if how in ["right", "inner"]: 

1728 return _get_empty_indexer() 

1729 elif not sort and how in ["left", "outer"]: 

1730 return _get_no_sort_one_missing_indexer(left_n, False) 

1731 

1732 lkey: ArrayLike 

1733 rkey: ArrayLike 

1734 if len(left_keys) > 1: 

1735 # get left & right join labels and num. of levels at each location 

1736 mapped = ( 

1737 _factorize_keys(left_keys[n], right_keys[n], sort=sort) 

1738 for n in range(len(left_keys)) 

1739 ) 

1740 zipped = zip(*mapped) 

1741 llab, rlab, shape = (list(x) for x in zipped) 

1742 

1743 # get flat i8 keys from label lists 

1744 lkey, rkey = _get_join_keys(llab, rlab, tuple(shape), sort) 

1745 else: 

1746 lkey = left_keys[0] 

1747 rkey = right_keys[0] 

1748 

1749 left = Index(lkey) 

1750 right = Index(rkey) 

1751 

1752 if ( 

1753 left.is_monotonic_increasing 

1754 and right.is_monotonic_increasing 

1755 and (left.is_unique or right.is_unique) 

1756 ): 

1757 _, lidx, ridx = left.join(right, how=how, return_indexers=True, sort=sort) 

1758 else: 

1759 lidx, ridx = get_join_indexers_non_unique( 

1760 left._values, right._values, sort, how 

1761 ) 

1762 

1763 if lidx is not None and is_range_indexer(lidx, len(left)): 

1764 lidx = None 

1765 if ridx is not None and is_range_indexer(ridx, len(right)): 

1766 ridx = None 

1767 return lidx, ridx 
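# Illustrative sketch of the contract above (internal API, so subject to change):
#
#     import numpy as np
#
#     lidx, ridx = get_join_indexers(
#         [np.array(["a", "b", "b"], dtype=object)],
#         [np.array(["b", "c"], dtype=object)],
#         how="inner",
#     )
#     # pairs left positions [1, 2] with right positions [0, 0]; a returned
#     # None means that side can be taken as-is (an identity indexer)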

1768 

1769 

1770def get_join_indexers_non_unique( 

1771 left: ArrayLike, 

1772 right: ArrayLike, 

1773 sort: bool = False, 

1774 how: JoinHow = "inner", 

1775) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: 

1776 """ 

1777 Get join indexers for left and right. 

1778 

1779 Parameters 

1780 ---------- 

1781 left : ArrayLike 

1782 right : ArrayLike 

1783 sort : bool, default False 

1784 how : {'inner', 'outer', 'left', 'right'}, default 'inner' 

1785 

1786 Returns 

1787 ------- 

1788 np.ndarray[np.intp] 

1789 Indexer into left. 

1790 np.ndarray[np.intp] 

1791 Indexer into right. 

1792 """ 

1793 lkey, rkey, count = _factorize_keys(left, right, sort=sort) 

1794 if how == "left": 

1795 lidx, ridx = libjoin.left_outer_join(lkey, rkey, count, sort=sort) 

1796 elif how == "right": 

1797 ridx, lidx = libjoin.left_outer_join(rkey, lkey, count, sort=sort) 
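# note: the "right" case reuses left_outer_join with the keys swapped,
# then swaps the returned indexers back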

1798 elif how == "inner": 

1799 lidx, ridx = libjoin.inner_join(lkey, rkey, count, sort=sort) 

1800 elif how == "outer": 

1801 lidx, ridx = libjoin.full_outer_join(lkey, rkey, count) 

1802 return lidx, ridx 

1803 

1804 

1805def restore_dropped_levels_multijoin( 

1806 left: MultiIndex, 

1807 right: MultiIndex, 

1808 dropped_level_names, 

1809 join_index: Index, 

1810 lindexer: npt.NDArray[np.intp], 

1811 rindexer: npt.NDArray[np.intp], 

1812) -> tuple[FrozenList, FrozenList, FrozenList]: 

1813 """ 

1814 *this is an internal non-public method* 

1815 

1816 Returns the levels, labels and names of a multi-index to multi-index join. 

1817 Depending on the type of join, this method restores the appropriate 

1818 dropped levels of the joined multi-index. 

1819 The method relies on lindexer and rindexer, which hold the index positions of 

1820 left and right where a join was feasible. 

1821 

1822 Parameters 

1823 ---------- 

1824 left : MultiIndex 

1825 left index 

1826 right : MultiIndex 

1827 right index 

1828 dropped_level_names : list of str 

1829 list of non-common level names 

1830 join_index : Index 

1831 the index of the join between the 

1832 common levels of left and right 

1833 lindexer : np.ndarray[np.intp] 

1834 left indexer 

1835 rindexer : np.ndarray[np.intp] 

1836 right indexer 

1837 

1838 Returns 

1839 ------- 

1840 levels : list of Index 

1841 levels of combined multiindexes 

1842 labels : np.ndarray[np.intp] 

1843 labels of combined multiindexes 

1844 names : List[Hashable] 

1845 names of combined multiindex levels 

1846 

1847 """ 

1848 

1849 def _convert_to_multiindex(index: Index) -> MultiIndex: 

1850 if isinstance(index, MultiIndex): 

1851 return index 

1852 else: 

1853 return MultiIndex.from_arrays([index._values], names=[index.name]) 

1854 

1855 # For multi-multi joins with one overlapping level, 

1856 # the returned index is of type Index 

1857 # Ensure that join_index is of type MultiIndex 

1858 # so that dropped levels can be appended 

1859 join_index = _convert_to_multiindex(join_index) 

1860 

1861 join_levels = join_index.levels 

1862 join_codes = join_index.codes 

1863 join_names = join_index.names 

1864 

1865 # Iterate through the levels that must be restored 

1866 for dropped_level_name in dropped_level_names: 

1867 if dropped_level_name in left.names: 

1868 idx = left 

1869 indexer = lindexer 

1870 else: 

1871 idx = right 

1872 indexer = rindexer 

1873 

1874 # The index of the level name to be restored 

1875 name_idx = idx.names.index(dropped_level_name) 

1876 

1877 restore_levels = idx.levels[name_idx] 

1878 # Inject -1 in the codes list where a join was not possible 

1879 # i.e. where indexer[i] == -1 

1880 codes = idx.codes[name_idx] 

1881 if indexer is None: 

1882 restore_codes = codes 

1883 else: 

1884 restore_codes = algos.take_nd(codes, indexer, fill_value=-1) 

1885 

1886 # error: Cannot determine type of "__add__" 

1887 join_levels = join_levels + [restore_levels] # type: ignore[has-type] 

1888 join_codes = join_codes + [restore_codes] # type: ignore[has-type] 

1889 join_names = join_names + [dropped_level_name] 

1890 

1891 return join_levels, join_codes, join_names 

1892 

1893 

1894class _OrderedMerge(_MergeOperation): 

1895 _merge_type = "ordered_merge" 

1896 

1897 def __init__( 

1898 self, 

1899 left: DataFrame | Series, 

1900 right: DataFrame | Series, 

1901 on: IndexLabel | None = None, 

1902 left_on: IndexLabel | None = None, 

1903 right_on: IndexLabel | None = None, 

1904 left_index: bool = False, 

1905 right_index: bool = False, 

1906 suffixes: Suffixes = ("_x", "_y"), 

1907 fill_method: str | None = None, 

1908 how: JoinHow | Literal["asof"] = "outer", 

1909 ) -> None: 

1910 self.fill_method = fill_method 

1911 _MergeOperation.__init__( 

1912 self, 

1913 left, 

1914 right, 

1915 on=on, 

1916 left_on=left_on, 

1917 left_index=left_index, 

1918 right_index=right_index, 

1919 right_on=right_on, 

1920 how=how, 

1921 suffixes=suffixes, 

1922 sort=True, # factorize sorts 

1923 ) 

1924 

1925 def get_result(self, copy: bool | None = True) -> DataFrame: 

1926 join_index, left_indexer, right_indexer = self._get_join_info() 

1927 

1928 left_join_indexer: npt.NDArray[np.intp] | None 

1929 right_join_indexer: npt.NDArray[np.intp] | None 

1930 

1931 if self.fill_method == "ffill": 

1932 if left_indexer is None: 

1933 left_join_indexer = None 

1934 else: 

1935 left_join_indexer = libjoin.ffill_indexer(left_indexer) 

1936 if right_indexer is None: 

1937 right_join_indexer = None 

1938 else: 

1939 right_join_indexer = libjoin.ffill_indexer(right_indexer) 

1940 elif self.fill_method is None: 

1941 left_join_indexer = left_indexer 

1942 right_join_indexer = right_indexer 

1943 else: 

1944 raise ValueError("fill_method must be 'ffill' or None") 

1945 

1946 result = self._reindex_and_concat( 

1947 join_index, left_join_indexer, right_join_indexer, copy=copy 

1948 ) 

1949 self._maybe_add_join_keys(result, left_indexer, right_indexer) 

1950 

1951 return result 
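# A hedged usage sketch via the public wrapper (pd.merge_ordered); the frames
# are made up for illustration:
#
#     import pandas as pd
#
#     a = pd.DataFrame({"t": [1, 3, 5], "x": [1.0, 2.0, 3.0]})
#     b = pd.DataFrame({"t": [2, 3, 6], "y": [10.0, 20.0, 30.0]})
#     pd.merge_ordered(a, b, on="t", fill_method="ffill")
#     # gaps in the ordered outer join are forward-filled by ffill-ing the
#     # row indexers (libjoin.ffill_indexer), not the values themselves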

1952 

1953 

1954def _asof_by_function(direction: str): 

1955 name = f"asof_join_{direction}_on_X_by_Y" 

1956 return getattr(libjoin, name, None) 
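# e.g. _asof_by_function("backward") resolves to
# libjoin.asof_join_backward_on_X_by_Y; an unknown direction yields None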

1957 

1958 

1959class _AsOfMerge(_OrderedMerge): 

1960 _merge_type = "asof_merge" 

1961 

1962 def __init__( 

1963 self, 

1964 left: DataFrame | Series, 

1965 right: DataFrame | Series, 

1966 on: IndexLabel | None = None, 

1967 left_on: IndexLabel | None = None, 

1968 right_on: IndexLabel | None = None, 

1969 left_index: bool = False, 

1970 right_index: bool = False, 

1971 by=None, 

1972 left_by=None, 

1973 right_by=None, 

1974 suffixes: Suffixes = ("_x", "_y"), 

1975 how: Literal["asof"] = "asof", 

1976 tolerance=None, 

1977 allow_exact_matches: bool = True, 

1978 direction: str = "backward", 

1979 ) -> None: 

1980 self.by = by 

1981 self.left_by = left_by 

1982 self.right_by = right_by 

1983 self.tolerance = tolerance 

1984 self.allow_exact_matches = allow_exact_matches 

1985 self.direction = direction 

1986 

1987 # check 'direction' is valid 

1988 if self.direction not in ["backward", "forward", "nearest"]: 

1989 raise MergeError(f"direction invalid: {self.direction}") 

1990 

1991 # validate allow_exact_matches 

1992 if not is_bool(self.allow_exact_matches): 

1993 msg = ( 

1994 "allow_exact_matches must be boolean, " 

1995 f"passed {self.allow_exact_matches}" 

1996 ) 

1997 raise MergeError(msg) 

1998 

1999 _OrderedMerge.__init__( 

2000 self, 

2001 left, 

2002 right, 

2003 on=on, 

2004 left_on=left_on, 

2005 right_on=right_on, 

2006 left_index=left_index, 

2007 right_index=right_index, 

2008 how=how, 

2009 suffixes=suffixes, 

2010 fill_method=None, 

2011 ) 
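# A hedged example of the public entry point (pd.merge_asof) exercising the
# arguments validated above; the frames and times are made up:
#
#     import pandas as pd
#
#     trades = pd.DataFrame({"time": pd.to_datetime(["10:00:00.038"]),
#                            "price": [51.95]})
#     quotes = pd.DataFrame({"time": pd.to_datetime(["10:00:00.023",
#                                                    "10:00:00.041"]),
#                            "bid": [51.97, 51.99]})
#     pd.merge_asof(trades, quotes, on="time",
#                   direction="backward",            # backward/forward/nearest
#                   allow_exact_matches=True,        # must be a bool
#                   tolerance=pd.Timedelta("20ms"))  # see _validate_tolerance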

2012 

2013 def _validate_left_right_on(self, left_on, right_on): 

2014 left_on, right_on = super()._validate_left_right_on(left_on, right_on) 

2015 

2016 # we only allow a single key for 'on' (per side) 

2017 if len(left_on) != 1 and not self.left_index: 

2018 raise MergeError("can only asof on a key for left") 

2019 

2020 if len(right_on) != 1 and not self.right_index: 

2021 raise MergeError("can only asof on a key for right") 

2022 

2023 if self.left_index and isinstance(self.left.index, MultiIndex): 

2024 raise MergeError("left can only have one index") 

2025 

2026 if self.right_index and isinstance(self.right.index, MultiIndex): 

2027 raise MergeError("right can only have one index") 

2028 

2029 # set 'by' columns 

2030 if self.by is not None: 

2031 if self.left_by is not None or self.right_by is not None: 

2032 raise MergeError("Can only pass by OR left_by and right_by") 

2033 self.left_by = self.right_by = self.by 

2034 if self.left_by is None and self.right_by is not None: 

2035 raise MergeError("missing left_by") 

2036 if self.left_by is not None and self.right_by is None: 

2037 raise MergeError("missing right_by") 

2038 

2039 # GH#29130 Check that merge keys do not have dtype object 

2040 if not self.left_index: 

2041 left_on_0 = left_on[0] 

2042 if isinstance(left_on_0, _known): 

2043 lo_dtype = left_on_0.dtype 

2044 else: 

2045 lo_dtype = ( 

2046 self.left._get_label_or_level_values(left_on_0).dtype 

2047 if left_on_0 in self.left.columns 

2048 else self.left.index.get_level_values(left_on_0) 

2049 ) 

2050 else: 

2051 lo_dtype = self.left.index.dtype 

2052 

2053 if not self.right_index: 

2054 right_on_0 = right_on[0] 

2055 if isinstance(right_on_0, _known): 

2056 ro_dtype = right_on_0.dtype 

2057 else: 

2058 ro_dtype = ( 

2059 self.right._get_label_or_level_values(right_on_0).dtype 

2060 if right_on_0 in self.right.columns 

2061 else self.right.index.get_level_values(right_on_0) 

2062 ) 

2063 else: 

2064 ro_dtype = self.right.index.dtype 

2065 

2066 if ( 

2067 is_object_dtype(lo_dtype) 

2068 or is_object_dtype(ro_dtype) 

2069 or is_string_dtype(lo_dtype) 

2070 or is_string_dtype(ro_dtype) 

2071 ): 

2072 raise MergeError( 

2073 f"Incompatible merge dtype, {repr(ro_dtype)} and " 

2074 f"{repr(lo_dtype)}, both sides must have numeric dtype" 

2075 ) 

2076 

2077 # add 'by' to our key-list so we can have it in the 

2078 # output as a key 

2079 if self.left_by is not None: 

2080 if not is_list_like(self.left_by): 

2081 self.left_by = [self.left_by] 

2082 if not is_list_like(self.right_by): 

2083 self.right_by = [self.right_by] 

2084 

2085 if len(self.left_by) != len(self.right_by): 

2086 raise MergeError("left_by and right_by must be the same length") 

2087 

2088 left_on = self.left_by + list(left_on) 

2089 right_on = self.right_by + list(right_on) 

2090 

2091 return left_on, right_on 

2092 

2093 def _maybe_require_matching_dtypes( 

2094 self, left_join_keys: list[ArrayLike], right_join_keys: list[ArrayLike] 

2095 ) -> None: 

2096 # TODO: why do we do this for AsOfMerge but not the others? 

2097 

2098 def _check_dtype_match(left: ArrayLike, right: ArrayLike, i: int): 

2099 if left.dtype != right.dtype: 

2100 if isinstance(left.dtype, CategoricalDtype) and isinstance( 

2101 right.dtype, CategoricalDtype 

2102 ): 

2103 # The generic error message is confusing for categoricals. 

2104 # 

2105 # In this function, the join keys include both the original 

2106 # ones of the merge_asof() call, and also the keys passed 

2107 # to its by= argument. Unordered but equal categories 

2108 # are not supported for the former, but will fail 

2109 # later with a ValueError, so we don't *need* to check 

2110 # for them here. 

2111 msg = ( 

2112 f"incompatible merge keys [{i}] {repr(left.dtype)} and " 

2113 f"{repr(right.dtype)}, both sides category, but not equal ones" 

2114 ) 

2115 else: 

2116 msg = ( 

2117 f"incompatible merge keys [{i}] {repr(left.dtype)} and " 

2118 f"{repr(right.dtype)}, must be the same type" 

2119 ) 

2120 raise MergeError(msg) 

2121 

2122 # validate index types are the same 

2123 for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)): 

2124 _check_dtype_match(lk, rk, i) 

2125 

2126 if self.left_index: 

2127 lt = self.left.index._values 

2128 else: 

2129 lt = left_join_keys[-1] 

2130 

2131 if self.right_index: 

2132 rt = self.right.index._values 

2133 else: 

2134 rt = right_join_keys[-1] 

2135 

2136 _check_dtype_match(lt, rt, 0) 

2137 

2138 def _validate_tolerance(self, left_join_keys: list[ArrayLike]) -> None: 

2139 # validate tolerance; datetime.timedelta or Timedelta if we have a DTI 

2140 if self.tolerance is not None: 

2141 if self.left_index: 

2142 lt = self.left.index._values 

2143 else: 

2144 lt = left_join_keys[-1] 

2145 

2146 msg = ( 

2147 f"incompatible tolerance {self.tolerance}, must be compat " 

2148 f"with type {repr(lt.dtype)}" 

2149 ) 

2150 

2151 if needs_i8_conversion(lt.dtype) or ( 

2152 isinstance(lt, ArrowExtensionArray) and lt.dtype.kind in "mM" 

2153 ): 

2154 if not isinstance(self.tolerance, datetime.timedelta): 

2155 raise MergeError(msg) 

2156 if self.tolerance < Timedelta(0): 

2157 raise MergeError("tolerance must be positive") 

2158 

2159 elif is_integer_dtype(lt.dtype): 

2160 if not is_integer(self.tolerance): 

2161 raise MergeError(msg) 

2162 if self.tolerance < 0: 

2163 raise MergeError("tolerance must be positive") 

2164 

2165 elif is_float_dtype(lt.dtype): 

2166 if not is_number(self.tolerance): 

2167 raise MergeError(msg) 

2168 # error: Unsupported operand types for > ("int" and "Number") 

2169 if self.tolerance < 0: # type: ignore[operator] 

2170 raise MergeError("tolerance must be positive") 

2171 

2172 else: 

2173 raise MergeError("key must be integer, timestamp or float") 
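# in short: integer "on" keys require an integer tolerance (e.g. tolerance=2),
# float keys any number, and datetime-like keys a datetime.timedelta/Timedelta
# (e.g. tolerance=pd.Timedelta("2ms")); it may not be negative in any case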

2174 

2175 def _convert_values_for_libjoin( 

2176 self, values: AnyArrayLike, side: str 

2177 ) -> np.ndarray: 

2178 # we require sortedness and non-null values in the join keys 

2179 if not Index(values).is_monotonic_increasing: 

2180 if isna(values).any(): 

2181 raise ValueError(f"Merge keys contain null values on {side} side") 

2182 raise ValueError(f"{side} keys must be sorted") 

2183 

2184 if isinstance(values, ArrowExtensionArray): 

2185 values = values._maybe_convert_datelike_array() 

2186 

2187 if needs_i8_conversion(values.dtype): 

2188 values = values.view("i8") 

2189 

2190 elif isinstance(values, BaseMaskedArray): 

2191 # we've verified above that no nulls exist 

2192 values = values._data 

2193 elif isinstance(values, ExtensionArray): 

2194 values = values.to_numpy() 

2195 

2196 # error: Incompatible return value type (got "Union[ExtensionArray, 

2197 # Any, ndarray[Any, Any], ndarray[Any, dtype[Any]], Index, Series]", 

2198 # expected "ndarray[Any, Any]") 

2199 return values # type: ignore[return-value] 

2200 

2201 def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: 

2202 """return the join indexers""" 

2203 

2204 # values to compare 

2205 left_values = ( 

2206 self.left.index._values if self.left_index else self.left_join_keys[-1] 

2207 ) 

2208 right_values = ( 

2209 self.right.index._values if self.right_index else self.right_join_keys[-1] 

2210 ) 

2211 

2212 # _maybe_require_matching_dtypes already checked for dtype matching 

2213 assert left_values.dtype == right_values.dtype 

2214 

2215 tolerance = self.tolerance 

2216 if tolerance is not None: 

2217 # TODO: can we reuse a tolerance-conversion function from 

2218 # e.g. TimedeltaIndex? 

2219 if needs_i8_conversion(left_values.dtype) or ( 

2220 isinstance(left_values, ArrowExtensionArray) 

2221 and left_values.dtype.kind in "mM" 

2222 ): 

2223 tolerance = Timedelta(tolerance) 

2224 # TODO: we have no test cases with PeriodDtype here; probably 

2225 # need to adjust tolerance for that case. 

2226 if left_values.dtype.kind in "mM": 

2227 # Make sure the i8 representation for tolerance 

2228 # matches that for left_values/right_values. 

2229 if isinstance(left_values, ArrowExtensionArray): 

2230 unit = left_values.dtype.pyarrow_dtype.unit 

2231 else: 

2232 unit = ensure_wrapped_if_datetimelike(left_values).unit 

2233 tolerance = tolerance.as_unit(unit) 

2234 

2235 tolerance = tolerance._value 

2236 

2237 # initial type conversion as needed 

2238 left_values = self._convert_values_for_libjoin(left_values, "left") 

2239 right_values = self._convert_values_for_libjoin(right_values, "right") 

2240 

2241 # a "by" parameter requires special handling 

2242 if self.left_by is not None: 

2243 # drop the trailing 'on' key from the join keys, if one is present 

2244 if self.left_index and self.right_index: 

2245 left_join_keys = self.left_join_keys 

2246 right_join_keys = self.right_join_keys 

2247 else: 

2248 left_join_keys = self.left_join_keys[0:-1] 

2249 right_join_keys = self.right_join_keys[0:-1] 

2250 

2251 mapped = [ 

2252 _factorize_keys( 

2253 left_join_keys[n], 

2254 right_join_keys[n], 

2255 sort=False, 

2256 ) 

2257 for n in range(len(left_join_keys)) 

2258 ] 

2259 

2260 if len(left_join_keys) == 1: 

2261 left_by_values = mapped[0][0] 

2262 right_by_values = mapped[0][1] 

2263 else: 

2264 arrs = [np.concatenate(m[:2]) for m in mapped] 

2265 shape = tuple(m[2] for m in mapped) 

2266 group_index = get_group_index( 

2267 arrs, shape=shape, sort=False, xnull=False 

2268 ) 

2269 left_len = len(left_join_keys[0]) 

2270 left_by_values = group_index[:left_len] 

2271 right_by_values = group_index[left_len:] 

2272 

2273 left_by_values = ensure_int64(left_by_values) 

2274 right_by_values = ensure_int64(right_by_values) 

2275 

2276 # choose appropriate function by type 

2277 func = _asof_by_function(self.direction) 

2278 return func( 

2279 left_values, 

2280 right_values, 

2281 left_by_values, 

2282 right_by_values, 

2283 self.allow_exact_matches, 

2284 tolerance, 

2285 ) 

2286 else: 

2287 # choose appropriate function by type 

2288 func = _asof_by_function(self.direction) 

2289 return func( 

2290 left_values, 

2291 right_values, 

2292 None, 

2293 None, 

2294 self.allow_exact_matches, 

2295 tolerance, 

2296 False, 

2297 ) 

2298 

2299 

2300def _get_multiindex_indexer( 

2301 join_keys: list[ArrayLike], index: MultiIndex, sort: bool 

2302) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: 

2303 # left & right join labels and num. of levels at each location 

2304 mapped = ( 

2305 _factorize_keys(index.levels[n]._values, join_keys[n], sort=sort) 

2306 for n in range(index.nlevels) 

2307 ) 

2308 zipped = zip(*mapped) 

2309 rcodes, lcodes, shape = (list(x) for x in zipped) 

2310 if sort: 

2311 rcodes = list(map(np.take, rcodes, index.codes)) 

2312 else: 

2313 i8copy = lambda a: a.astype("i8", subok=False, copy=True) 

2314 rcodes = list(map(i8copy, index.codes)) 

2315 

2316 # fix right labels if there were any nulls 

2317 for i, join_key in enumerate(join_keys): 

2318 mask = index.codes[i] == -1 

2319 if mask.any(): 

2320 # check whether there were already nulls at this location; 

2321 # if there were, they are factorized to `shape[i] - 1` 

2322 a = join_key[lcodes[i] == shape[i] - 1] 
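# ("not a[0] != a[0]" below is an inline NaN check: true when a[0] is not NaN)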

2323 if a.size == 0 or not a[0] != a[0]: 

2324 shape[i] += 1 

2325 

2326 rcodes[i][mask] = shape[i] - 1 

2327 

2328 # get flat i8 join keys 

2329 lkey, rkey = _get_join_keys(lcodes, rcodes, tuple(shape), sort) 

2330 return lkey, rkey 

2331 

2332 

2333def _get_empty_indexer() -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: 

2334 """Return empty join indexers.""" 

2335 return ( 

2336 np.array([], dtype=np.intp), 

2337 np.array([], dtype=np.intp), 

2338 ) 

2339 

2340 

2341def _get_no_sort_one_missing_indexer( 

2342 n: int, left_missing: bool 

2343) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: 

2344 """ 

2345 Return join indexers where all of one side is selected without sorting 

2346 and none of the other side is selected. 

2347 

2348 Parameters 

2349 ---------- 

2350 n : int 

2351 Length of indexers to create. 

2352 left_missing : bool 

2353 If True, the left indexer will contain only -1's. 

2354 If False, the right indexer will contain only -1's. 

2355 

2356 Returns 

2357 ------- 

2358 np.ndarray[np.intp] 

2359 Left indexer 

2360 np.ndarray[np.intp] 

2361 Right indexer 

2362 """ 

2363 idx = np.arange(n, dtype=np.intp) 

2364 idx_missing = np.full(shape=n, fill_value=-1, dtype=np.intp) 

2365 if left_missing: 

2366 return idx_missing, idx 

2367 return idx, idx_missing 
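# e.g. _get_no_sort_one_missing_indexer(3, left_missing=True) returns
# (array([-1, -1, -1]), array([0, 1, 2])): keep every right row, each
# paired with a missing (-1) left row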

2368 

2369 

2370def _left_join_on_index( 

2371 left_ax: Index, right_ax: Index, join_keys: list[ArrayLike], sort: bool = False 

2372) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp]]: 

2373 if isinstance(right_ax, MultiIndex): 

2374 lkey, rkey = _get_multiindex_indexer(join_keys, right_ax, sort=sort) 

2375 else: 

2376 # error: Incompatible types in assignment (expression has type 

2377 # "Union[Union[ExtensionArray, ndarray[Any, Any]], Index, Series]", 

2378 # variable has type "ndarray[Any, dtype[signedinteger[Any]]]") 

2379 lkey = join_keys[0] # type: ignore[assignment] 

2380 # error: Incompatible types in assignment (expression has type "Index", 

2381 # variable has type "ndarray[Any, dtype[signedinteger[Any]]]") 

2382 rkey = right_ax._values # type: ignore[assignment] 

2383 

2384 left_key, right_key, count = _factorize_keys(lkey, rkey, sort=sort) 

2385 left_indexer, right_indexer = libjoin.left_outer_join( 

2386 left_key, right_key, count, sort=sort 

2387 ) 

2388 

2389 if sort or len(left_ax) != len(left_indexer): 

2390 # if asked to sort or there are 1-to-many matches 

2391 join_index = left_ax.take(left_indexer) 

2392 return join_index, left_indexer, right_indexer 

2393 

2394 # left frame preserves order & length of its index 

2395 return left_ax, None, right_indexer 

2396 

2397 

2398def _factorize_keys( 

2399 lk: ArrayLike, rk: ArrayLike, sort: bool = True 

2400) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: 

2401 """ 

2402 Encode left and right keys as enumerated types. 

2403 

2404 This is used to get the join indexers to be used when merging DataFrames. 

2405 

2406 Parameters 

2407 ---------- 

2408 lk : ndarray, ExtensionArray 

2409 Left key. 

2410 rk : ndarray, ExtensionArray 

2411 Right key. 

2412 sort : bool, default True 

2413 If True, the encoding is done such that the unique elements in the 

2414 keys are sorted. 

2415 

2416 Returns 

2417 ------- 

2418 np.ndarray[np.intp] 

2419 Left labels, as enumerated type. 

2420 np.ndarray[np.intp] 

2421 Right labels, as enumerated type. 

2422 int 

2423 Number of unique elements in union of left and right labels. 

2424 

2425 See Also 

2426 -------- 

2427 merge : Merge DataFrame or named Series objects 

2428 with a database-style join. 

2429 algorithms.factorize : Encode the object as an enumerated type 

2430 or categorical variable. 

2431 

2432 Examples 

2433 -------- 

2434 >>> lk = np.array(["a", "c", "b"]) 

2435 >>> rk = np.array(["a", "c"]) 

2436 

2437 Here, the unique values are `'a', 'b', 'c'`. With the default 

2438 `sort=True`, the encoding will be `{0: 'a', 1: 'b', 2: 'c'}`: 

2439 

2440 >>> pd.core.reshape.merge._factorize_keys(lk, rk) 

2441 (array([0, 2, 1]), array([0, 2]), 3) 

2442 

2443 With `sort=False`, the encoding will correspond to the order 

2444 in which the unique elements first appear: `{0: 'a', 1: 'c', 2: 'b'}`: 

2445 

2446 >>> pd.core.reshape.merge._factorize_keys(lk, rk, sort=False) 

2447 (array([0, 1, 2]), array([0, 1]), 3) 

2448 """ 

2449 # TODO: if either is a RangeIndex, we can likely factorize more efficiently? 

2450 

2451 if ( 

2452 isinstance(lk.dtype, DatetimeTZDtype) and isinstance(rk.dtype, DatetimeTZDtype) 

2453 ) or (lib.is_np_dtype(lk.dtype, "M") and lib.is_np_dtype(rk.dtype, "M")): 

2454 # Extract the ndarray (UTC-localized) values 

2455 # Note: we don't need the dtypes to match, as these can still be compared 

2456 lk, rk = cast("DatetimeArray", lk)._ensure_matching_resos(rk) 

2457 lk = cast("DatetimeArray", lk)._ndarray 

2458 rk = cast("DatetimeArray", rk)._ndarray 

2459 

2460 elif ( 

2461 isinstance(lk.dtype, CategoricalDtype) 

2462 and isinstance(rk.dtype, CategoricalDtype) 

2463 and lk.dtype == rk.dtype 

2464 ): 

2465 assert isinstance(lk, Categorical) 

2466 assert isinstance(rk, Categorical) 

2467 # Cast rk to encoding so we can compare codes with lk 

2468 

2469 rk = lk._encode_with_my_categories(rk) 

2470 

2471 lk = ensure_int64(lk.codes) 

2472 rk = ensure_int64(rk.codes) 

2473 

2474 elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype: 

2475 if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or ( 

2476 isinstance(lk.dtype, StringDtype) 

2477 and lk.dtype.storage in ["pyarrow", "pyarrow_numpy"] 

2478 ): 

2479 import pyarrow as pa 

2480 import pyarrow.compute as pc 

2481 

2482 len_lk = len(lk) 

2483 lk = lk._pa_array # type: ignore[attr-defined] 

2484 rk = rk._pa_array # type: ignore[union-attr] 

2485 dc = ( 

2486 pa.chunked_array(lk.chunks + rk.chunks) # type: ignore[union-attr] 

2487 .combine_chunks() 

2488 .dictionary_encode() 

2489 ) 

2490 

2491 llab, rlab, count = ( 

2492 pc.fill_null(dc.indices[slice(len_lk)], -1) 

2493 .to_numpy() 

2494 .astype(np.intp, copy=False), 

2495 pc.fill_null(dc.indices[slice(len_lk, None)], -1) 

2496 .to_numpy() 

2497 .astype(np.intp, copy=False), 

2498 len(dc.dictionary), 

2499 ) 

2500 

2501 if sort: 

2502 uniques = dc.dictionary.to_numpy(zero_copy_only=False) 

2503 llab, rlab = _sort_labels(uniques, llab, rlab) 

2504 

2505 if dc.null_count > 0: 

2506 lmask = llab == -1 

2507 lany = lmask.any() 

2508 rmask = rlab == -1 

2509 rany = rmask.any() 

2510 if lany: 

2511 np.putmask(llab, lmask, count) 

2512 if rany: 

2513 np.putmask(rlab, rmask, count) 

2514 count += 1 

2515 return llab, rlab, count 

2516 

2517 if not isinstance(lk, BaseMaskedArray) and not ( 

2518 # exclude arrow dtypes that would get cast to object 

2519 isinstance(lk.dtype, ArrowDtype) 

2520 and ( 

2521 is_numeric_dtype(lk.dtype.numpy_dtype) 

2522 or is_string_dtype(lk.dtype) 

2523 and not sort 

2524 ) 

2525 ): 

2526 lk, _ = lk._values_for_factorize() 

2527 

2528 # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute 

2529 # "_values_for_factorize" 

2530 rk, _ = rk._values_for_factorize() # type: ignore[union-attr] 

2531 

2532 if needs_i8_conversion(lk.dtype) and lk.dtype == rk.dtype: 

2533 # GH#23917 TODO: Needs tests for non-matching dtypes 

2534 # GH#23917 TODO: needs tests for case where lk is integer-dtype 

2535 # and rk is datetime-dtype 

2536 lk = np.asarray(lk, dtype=np.int64) 

2537 rk = np.asarray(rk, dtype=np.int64) 

2538 

2539 klass, lk, rk = _convert_arrays_and_get_rizer_klass(lk, rk) 

2540 

2541 rizer = klass(max(len(lk), len(rk))) 

2542 

2543 if isinstance(lk, BaseMaskedArray): 

2544 assert isinstance(rk, BaseMaskedArray) 

2545 llab = rizer.factorize(lk._data, mask=lk._mask) 

2546 rlab = rizer.factorize(rk._data, mask=rk._mask) 

2547 elif isinstance(lk, ArrowExtensionArray): 

2548 assert isinstance(rk, ArrowExtensionArray) 

2549 # we can only get here with numeric dtypes 

2550 # TODO: Remove when we have a Factorizer for Arrow 

2551 llab = rizer.factorize( 

2552 lk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=lk.isna() 

2553 ) 

2554 rlab = rizer.factorize( 

2555 rk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=rk.isna() 

2556 ) 

2557 else: 

2558 # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type 

2559 # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]], 

2560 # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]" 

2561 llab = rizer.factorize(lk) # type: ignore[arg-type] 

2562 rlab = rizer.factorize(rk) # type: ignore[arg-type] 

2563 assert llab.dtype == np.dtype(np.intp), llab.dtype 

2564 assert rlab.dtype == np.dtype(np.intp), rlab.dtype 

2565 

2566 count = rizer.get_count() 

2567 

2568 if sort: 

2569 uniques = rizer.uniques.to_array() 

2570 llab, rlab = _sort_labels(uniques, llab, rlab) 

2571 

2572 # NA group: missing keys factorize to -1; remap them to their own group id (count) so NA can match NA 

2573 lmask = llab == -1 

2574 lany = lmask.any() 

2575 rmask = rlab == -1 

2576 rany = rmask.any() 

2577 

2578 if lany or rany: 

2579 if lany: 

2580 np.putmask(llab, lmask, count) 

2581 if rany: 

2582 np.putmask(rlab, rmask, count) 

2583 count += 1 

2584 

2585 return llab, rlab, count 

2586 

2587 

2588def _convert_arrays_and_get_rizer_klass( 

2589 lk: ArrayLike, rk: ArrayLike 

2590) -> tuple[type[libhashtable.Factorizer], ArrayLike, ArrayLike]: 

2591 klass: type[libhashtable.Factorizer] 

2592 if is_numeric_dtype(lk.dtype): 

2593 if lk.dtype != rk.dtype: 

2594 dtype = find_common_type([lk.dtype, rk.dtype]) 

2595 if isinstance(dtype, ExtensionDtype): 

2596 cls = dtype.construct_array_type() 

2597 if not isinstance(lk, ExtensionArray): 

2598 lk = cls._from_sequence(lk, dtype=dtype, copy=False) 

2599 else: 

2600 lk = lk.astype(dtype, copy=False) 

2601 

2602 if not isinstance(rk, ExtensionArray): 

2603 rk = cls._from_sequence(rk, dtype=dtype, copy=False) 

2604 else: 

2605 rk = rk.astype(dtype, copy=False) 

2606 else: 

2607 lk = lk.astype(dtype, copy=False) 

2608 rk = rk.astype(dtype, copy=False) 

2609 if isinstance(lk, BaseMaskedArray): 

2610 # Invalid index type "type" for "Dict[Type[object], Type[Factorizer]]"; 

2611 # expected type "Type[object]" 

2612 klass = _factorizers[lk.dtype.type] # type: ignore[index] 

2613 elif isinstance(lk.dtype, ArrowDtype): 

2614 klass = _factorizers[lk.dtype.numpy_dtype.type] 

2615 else: 

2616 klass = _factorizers[lk.dtype.type] 

2617 

2618 else: 

2619 klass = libhashtable.ObjectFactorizer 

2620 lk = ensure_object(lk) 

2621 rk = ensure_object(rk) 

2622 return klass, lk, rk 

2623 

2624 

2625def _sort_labels( 

2626 uniques: np.ndarray, left: npt.NDArray[np.intp], right: npt.NDArray[np.intp] 

2627) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: 

2628 llength = len(left) 

2629 labels = np.concatenate([left, right]) 

2630 

2631 _, new_labels = algos.safe_sort(uniques, labels, use_na_sentinel=True) 

2632 new_left, new_right = new_labels[:llength], new_labels[llength:] 

2633 

2634 return new_left, new_right 

2635 

2636 

2637def _get_join_keys( 

2638 llab: list[npt.NDArray[np.int64 | np.intp]], 

2639 rlab: list[npt.NDArray[np.int64 | np.intp]], 

2640 shape: Shape, 

2641 sort: bool, 

2642) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]: 

2643 # how many levels can be done without overflow 

2644 nlev = next( 

2645 lev 

2646 for lev in range(len(shape), 0, -1) 

2647 if not is_int64_overflow_possible(shape[:lev]) 

2648 ) 

2649 

2650 # get keys for the first `nlev` levels 

2651 stride = np.prod(shape[1:nlev], dtype="i8") 

2652 lkey = stride * llab[0].astype("i8", subok=False, copy=False) 

2653 rkey = stride * rlab[0].astype("i8", subok=False, copy=False) 

2654 

2655 for i in range(1, nlev): 

2656 with np.errstate(divide="ignore"): 

2657 stride //= shape[i] 

2658 lkey += llab[i] * stride 

2659 rkey += rlab[i] * stride 

2660 

2661 if nlev == len(shape): # all done! 

2662 return lkey, rkey 

2663 

2664 # densify current keys to avoid overflow 

2665 lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) 

2666 

2667 llab = [lkey] + llab[nlev:] 

2668 rlab = [rkey] + rlab[nlev:] 

2669 shape = (count,) + shape[nlev:] 

2670 

2671 return _get_join_keys(llab, rlab, shape, sort) 
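# Worked example: for shape=(3, 4) the level-0 stride is 4, so the label
# pair (2, 1) packs to 2 * 4 + 1 = 9, one distinct int64 per key tuple.
# When the product of `shape` could overflow int64, only the first `nlev`
# levels are packed, the partial keys are densified via _factorize_keys,
# and the remaining levels are folded in by the recursive call above.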

2672 

2673 

2674def _should_fill(lname, rname) -> bool: 

2675 if not isinstance(lname, str) or not isinstance(rname, str): 

2676 return True 

2677 return lname == rname 

2678 

2679 

2680def _any(x) -> bool: 

2681 return x is not None and com.any_not_none(*x) 

2682 

2683 

2684def _validate_operand(obj: DataFrame | Series) -> DataFrame: 

2685 if isinstance(obj, ABCDataFrame): 

2686 return obj 

2687 elif isinstance(obj, ABCSeries): 

2688 if obj.name is None: 

2689 raise ValueError("Cannot merge a Series without a name") 

2690 return obj.to_frame() 

2691 else: 

2692 raise TypeError( 

2693 f"Can only merge Series or DataFrame objects, a {type(obj)} was passed" 

2694 ) 
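# e.g. pd.Series([1], name="a") is accepted and promoted to a one-column
# DataFrame, while an unnamed Series raises the ValueError above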

2695 

2696 

2697def _items_overlap_with_suffix( 

2698 left: Index, right: Index, suffixes: Suffixes 

2699) -> tuple[Index, Index]: 

2700 """ 

2701 Suffixes type validation. 

2702 

2703 If two indices overlap, add suffixes to overlapping entries. 

2704 

2705 If the corresponding suffix is empty, the entry is simply converted to string. 

2706 

2707 """ 

2708 if not is_list_like(suffixes, allow_sets=False) or isinstance(suffixes, dict): 

2709 raise TypeError( 

2710 f"Passing 'suffixes' as a {type(suffixes)}, is not supported. " 

2711 "Provide 'suffixes' as a tuple instead." 

2712 ) 

2713 

2714 to_rename = left.intersection(right) 

2715 if len(to_rename) == 0: 

2716 return left, right 

2717 

2718 lsuffix, rsuffix = suffixes 

2719 

2720 if not lsuffix and not rsuffix: 

2721 raise ValueError(f"columns overlap but no suffix specified: {to_rename}") 

2722 

2723 def renamer(x, suffix: str | None): 

2724 """ 

2725 Rename the left and right indices. 

2726 

2727 If there is overlap, and suffix is not None, add 

2728 suffix, otherwise, leave it as-is. 

2729 

2730 Parameters 

2731 ---------- 

2732 x : original column name 

2733 suffix : str or None 

2734 

2735 Returns 

2736 ------- 

2737 x : renamed column name 

2738 """ 

2739 if x in to_rename and suffix is not None: 

2740 return f"{x}{suffix}" 

2741 return x 

2742 

2743 lrenamer = partial(renamer, suffix=lsuffix) 

2744 rrenamer = partial(renamer, suffix=rsuffix) 

2745 

2746 llabels = left._transform_index(lrenamer) 

2747 rlabels = right._transform_index(rrenamer) 

2748 

2749 dups = [] 

2750 if not llabels.is_unique: 

2751 # Only raise when duplicates are caused by the suffixes; columns already 

2752 # duplicated in the input should not raise 

2753 dups = llabels[(llabels.duplicated()) & (~left.duplicated())].tolist() 

2754 if not rlabels.is_unique: 

2755 dups.extend(rlabels[(rlabels.duplicated()) & (~right.duplicated())].tolist()) 

2756 if dups: 

2757 raise MergeError( 

2758 f"Passing 'suffixes' which cause duplicate columns {set(dups)} is " 

2759 f"not allowed.", 

2760 ) 

2761 

2762 return llabels, rlabels
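# A short usage sketch (public pd.merge API; the frames are hypothetical):
#
#     import pandas as pd
#
#     left = pd.DataFrame({"key": [1], "v": [1]})
#     right = pd.DataFrame({"key": [1], "v": [2]})
#     pd.merge(left, right, on="key", suffixes=("_l", "_r")).columns.tolist()
#     # ['key', 'v_l', 'v_r']
#     pd.merge(left, right, on="key", suffixes=(None, None))
#     # ValueError: columns overlap but no suffix specified: Index(['v'], dtype='object')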