
1""" Functions measuring similarity using graph edit distance. 

2 

3The graph edit distance is the minimum number of node and edge changes
4needed to make two graphs isomorphic.

5 

6The default algorithm/implementation is sub-optimal for some graphs. 

7The problem of finding the exact Graph Edit Distance (GED) is NP-hard 

8so it is often slow. If the simple interface `graph_edit_distance` 

9takes too long for your graph, try `optimize_graph_edit_distance` 

10and/or `optimize_edit_paths`. 

11 

12At the same time, I encourage capable people to investigate 

13alternative GED algorithms, in order to improve the choices available. 

14""" 

15 

16import math 

17import time 

18import warnings 

19from dataclasses import dataclass 

20from itertools import product 

21 

22import networkx as nx 

23 

24__all__ = [ 

25 "graph_edit_distance", 

26 "optimal_edit_paths", 

27 "optimize_graph_edit_distance", 

28 "optimize_edit_paths", 

29 "simrank_similarity", 

30 "panther_similarity", 

31 "generate_random_paths", 

32] 

33 

34 

35def debug_print(*args, **kwargs): 

36 print(*args, **kwargs) 

37 

38 

39@nx._dispatch( 

40 graphs={"G1": 0, "G2": 1}, preserve_edge_attrs=True, preserve_node_attrs=True 

41) 

42def graph_edit_distance( 

43 G1, 

44 G2, 

45 node_match=None, 

46 edge_match=None, 

47 node_subst_cost=None, 

48 node_del_cost=None, 

49 node_ins_cost=None, 

50 edge_subst_cost=None, 

51 edge_del_cost=None, 

52 edge_ins_cost=None, 

53 roots=None, 

54 upper_bound=None, 

55 timeout=None, 

56): 

57 """Returns GED (graph edit distance) between graphs G1 and G2. 

58 

59 Graph edit distance is a graph similarity measure analogous to
60 Levenshtein distance for strings. It is defined as the minimum cost
61 of an edit path (a sequence of node and edge edit operations)
62 transforming graph G1 into a graph isomorphic to G2.

63 

64 Parameters 

65 ---------- 

66 G1, G2: graphs 

67 The two graphs G1 and G2 must be of the same type. 

68 

69 node_match : callable 

70 A function that returns True if node n1 in G1 and n2 in G2 

71 should be considered equal during matching. 

72 

73 The function will be called like 

74 

75 node_match(G1.nodes[n1], G2.nodes[n2]). 

76 

77 That is, the function will receive the node attribute 

78 dictionaries for n1 and n2 as inputs. 

79 

80 Ignored if node_subst_cost is specified. If neither 

81 node_match nor node_subst_cost are specified then node 

82 attributes are not considered. 

83 

84 edge_match : callable 

85 A function that returns True if the edge attribute dictionaries 

86 for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should 

87 be considered equal during matching. 

88 

89 The function will be called like 

90 

91 edge_match(G1[u1][v1], G2[u2][v2]). 

92 

93 That is, the function will receive the edge attribute 

94 dictionaries of the edges under consideration. 

95 

96 Ignored if edge_subst_cost is specified. If neither 

97 edge_match nor edge_subst_cost are specified then edge 

98 attributes are not considered. 

99 

100 node_subst_cost, node_del_cost, node_ins_cost : callable 

101 Functions that return the costs of node substitution, node 

102 deletion, and node insertion, respectively. 

103 

104 The functions will be called like 

105 

106 node_subst_cost(G1.nodes[n1], G2.nodes[n2]), 

107 node_del_cost(G1.nodes[n1]), 

108 node_ins_cost(G2.nodes[n2]). 

109 

110 That is, the functions will receive the node attribute 

111 dictionaries as inputs. The functions are expected to return 

112 positive numeric values. 

113 

114 Function node_subst_cost overrides node_match if specified. 

115 If neither node_match nor node_subst_cost are specified then 

116 default node substitution cost of 0 is used (node attributes 

117 are not considered during matching). 

118 

119 If node_del_cost is not specified then default node deletion 

120 cost of 1 is used. If node_ins_cost is not specified then 

121 default node insertion cost of 1 is used. 

122 

123 edge_subst_cost, edge_del_cost, edge_ins_cost : callable 

124 Functions that return the costs of edge substitution, edge 

125 deletion, and edge insertion, respectively. 

126 

127 The functions will be called like 

128 

129 edge_subst_cost(G1[u1][v1], G2[u2][v2]), 

130 edge_del_cost(G1[u1][v1]), 

131 edge_ins_cost(G2[u2][v2]). 

132 

133 That is, the functions will receive the edge attribute 

134 dictionaries as inputs. The functions are expected to return 

135 positive numeric values. 

136 

137 Function edge_subst_cost overrides edge_match if specified. 

138 If neither edge_match nor edge_subst_cost are specified then 

139 default edge substitution cost of 0 is used (edge attributes 

140 are not considered during matching). 

141 

142 If edge_del_cost is not specified then default edge deletion 

143 cost of 1 is used. If edge_ins_cost is not specified then 

144 default edge insertion cost of 1 is used. 

145 

146 roots : 2-tuple 

147 Tuple where first element is a node in G1 and the second 

148 is a node in G2. 

149 These nodes are forced to be matched in the comparison to 

150 allow comparison between rooted graphs. 

151 

152 upper_bound : numeric 

153 Maximum edit distance to consider. Return None if no edit 

154 distance less than or equal to upper_bound exists.

155 

156 timeout : numeric 

157 Maximum number of seconds to execute. 

158 After timeout is met, the current best GED is returned. 

159 

160 Examples 

161 -------- 

162 >>> G1 = nx.cycle_graph(6) 

163 >>> G2 = nx.wheel_graph(7) 

164 >>> nx.graph_edit_distance(G1, G2) 

165 7.0 

166 

167 >>> G1 = nx.star_graph(5) 

168 >>> G2 = nx.star_graph(5) 

169 >>> nx.graph_edit_distance(G1, G2, roots=(0, 0)) 

170 0.0 

171 >>> nx.graph_edit_distance(G1, G2, roots=(1, 0)) 

172 8.0 
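
The edit costs can be customised through the cost callbacks; a minimal
sketch using an attribute-based node substitution cost (the ``"color"``
attribute here is purely illustrative):

>>> G1 = nx.path_graph(3)
>>> G2 = nx.path_graph(3)
>>> nx.set_node_attributes(G1, {0: "red", 1: "blue", 2: "red"}, "color")
>>> nx.set_node_attributes(G2, {0: "red", 1: "red", 2: "red"}, "color")
>>> cost = nx.graph_edit_distance(
...     G1, G2, node_subst_cost=lambda a, b: 0 if a["color"] == b["color"] else 1
... )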

173 

174 See Also 

175 -------- 

176 optimal_edit_paths, optimize_graph_edit_distance, 

177 

178 is_isomorphic: test for graph edit distance of 0 

179 

180 References 

181 ---------- 

182 .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick 

183 Martineau. An Exact Graph Edit Distance Algorithm for Solving 

184 Pattern Recognition Problems. 4th International Conference on 

185 Pattern Recognition Applications and Methods 2015, Jan 2015, 

186 Lisbon, Portugal. 2015, 

187 <10.5220/0005209202710278>. <hal-01168816> 

188 https://hal.archives-ouvertes.fr/hal-01168816 

189 

190 """ 

191 bestcost = None 

192 for _, _, cost in optimize_edit_paths( 

193 G1, 

194 G2, 

195 node_match, 

196 edge_match, 

197 node_subst_cost, 

198 node_del_cost, 

199 node_ins_cost, 

200 edge_subst_cost, 

201 edge_del_cost, 

202 edge_ins_cost, 

203 upper_bound, 

204 True, 

205 roots, 

206 timeout, 

207 ): 

208 # assert bestcost is None or cost < bestcost 

209 bestcost = cost 

210 return bestcost 

211 

212 

213@nx._dispatch(graphs={"G1": 0, "G2": 1}) 

214def optimal_edit_paths( 

215 G1, 

216 G2, 

217 node_match=None, 

218 edge_match=None, 

219 node_subst_cost=None, 

220 node_del_cost=None, 

221 node_ins_cost=None, 

222 edge_subst_cost=None, 

223 edge_del_cost=None, 

224 edge_ins_cost=None, 

225 upper_bound=None, 

226): 

227 """Returns all minimum-cost edit paths transforming G1 to G2. 

228 

229 A graph edit path is a sequence of node and edge edit operations
230 transforming graph G1 into a graph isomorphic to G2. Edit operations
231 include substitutions, deletions, and insertions.

232 

233 Parameters 

234 ---------- 

235 G1, G2: graphs 

236 The two graphs G1 and G2 must be of the same type. 

237 

238 node_match : callable 

239 A function that returns True if node n1 in G1 and n2 in G2 

240 should be considered equal during matching. 

241 

242 The function will be called like 

243 

244 node_match(G1.nodes[n1], G2.nodes[n2]). 

245 

246 That is, the function will receive the node attribute 

247 dictionaries for n1 and n2 as inputs. 

248 

249 Ignored if node_subst_cost is specified. If neither 

250 node_match nor node_subst_cost are specified then node 

251 attributes are not considered. 

252 

253 edge_match : callable 

254 A function that returns True if the edge attribute dictionaries 

255 for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should 

256 be considered equal during matching. 

257 

258 The function will be called like 

259 

260 edge_match(G1[u1][v1], G2[u2][v2]). 

261 

262 That is, the function will receive the edge attribute 

263 dictionaries of the edges under consideration. 

264 

265 Ignored if edge_subst_cost is specified. If neither 

266 edge_match nor edge_subst_cost are specified then edge 

267 attributes are not considered. 

268 

269 node_subst_cost, node_del_cost, node_ins_cost : callable 

270 Functions that return the costs of node substitution, node 

271 deletion, and node insertion, respectively. 

272 

273 The functions will be called like 

274 

275 node_subst_cost(G1.nodes[n1], G2.nodes[n2]), 

276 node_del_cost(G1.nodes[n1]), 

277 node_ins_cost(G2.nodes[n2]). 

278 

279 That is, the functions will receive the node attribute 

280 dictionaries as inputs. The functions are expected to return 

281 positive numeric values. 

282 

283 Function node_subst_cost overrides node_match if specified. 

284 If neither node_match nor node_subst_cost are specified then 

285 default node substitution cost of 0 is used (node attributes 

286 are not considered during matching). 

287 

288 If node_del_cost is not specified then default node deletion 

289 cost of 1 is used. If node_ins_cost is not specified then 

290 default node insertion cost of 1 is used. 

291 

292 edge_subst_cost, edge_del_cost, edge_ins_cost : callable 

293 Functions that return the costs of edge substitution, edge 

294 deletion, and edge insertion, respectively. 

295 

296 The functions will be called like 

297 

298 edge_subst_cost(G1[u1][v1], G2[u2][v2]), 

299 edge_del_cost(G1[u1][v1]), 

300 edge_ins_cost(G2[u2][v2]). 

301 

302 That is, the functions will receive the edge attribute 

303 dictionaries as inputs. The functions are expected to return 

304 positive numeric values. 

305 

306 Function edge_subst_cost overrides edge_match if specified. 

307 If neither edge_match nor edge_subst_cost are specified then 

308 default edge substitution cost of 0 is used (edge attributes 

309 are not considered during matching). 

310 

311 If edge_del_cost is not specified then default edge deletion 

312 cost of 1 is used. If edge_ins_cost is not specified then 

313 default edge insertion cost of 1 is used. 

314 

315 upper_bound : numeric 

316 Maximum edit distance to consider. 

317 

318 Returns 

319 ------- 

320 edit_paths : list of tuples (node_edit_path, edge_edit_path) 

321 node_edit_path : list of tuples (u, v) 

322 edge_edit_path : list of tuples ((u1, v1), (u2, v2)) 

323 

324 cost : numeric 

325 Optimal edit path cost (graph edit distance). 

326 

327 Examples 

328 -------- 

329 >>> G1 = nx.cycle_graph(4) 

330 >>> G2 = nx.wheel_graph(5) 

331 >>> paths, cost = nx.optimal_edit_paths(G1, G2) 

332 >>> len(paths) 

333 40 

334 >>> cost 

335 5.0 
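
Each element of ``paths`` pairs a node edit path with an edge edit path;
a minimal sketch of unpacking the first one (which optimal path is
listed first is not specified):

>>> node_path, edge_path = paths[0]
>>> len(node_path) >= max(len(G1), len(G2))
True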

336 

337 See Also 

338 -------- 

339 graph_edit_distance, optimize_edit_paths 

340 

341 References 

342 ---------- 

343 .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick 

344 Martineau. An Exact Graph Edit Distance Algorithm for Solving 

345 Pattern Recognition Problems. 4th International Conference on 

346 Pattern Recognition Applications and Methods 2015, Jan 2015, 

347 Lisbon, Portugal. 2015, 

348 <10.5220/0005209202710278>. <hal-01168816> 

349 https://hal.archives-ouvertes.fr/hal-01168816 

350 

351 """ 

352 paths = [] 

353 bestcost = None 

354 for vertex_path, edge_path, cost in optimize_edit_paths( 

355 G1, 

356 G2, 

357 node_match, 

358 edge_match, 

359 node_subst_cost, 

360 node_del_cost, 

361 node_ins_cost, 

362 edge_subst_cost, 

363 edge_del_cost, 

364 edge_ins_cost, 

365 upper_bound, 

366 False, 

367 ): 

368 # assert bestcost is None or cost <= bestcost 

369 if bestcost is not None and cost < bestcost: 

370 paths = [] 

371 paths.append((vertex_path, edge_path)) 

372 bestcost = cost 

373 return paths, bestcost 

374 

375 

376@nx._dispatch(graphs={"G1": 0, "G2": 1}) 

377def optimize_graph_edit_distance( 

378 G1, 

379 G2, 

380 node_match=None, 

381 edge_match=None, 

382 node_subst_cost=None, 

383 node_del_cost=None, 

384 node_ins_cost=None, 

385 edge_subst_cost=None, 

386 edge_del_cost=None, 

387 edge_ins_cost=None, 

388 upper_bound=None, 

389): 

390 """Returns consecutive approximations of GED (graph edit distance) 

391 between graphs G1 and G2. 

392 

393 Graph edit distance is a graph similarity measure analogous to
394 Levenshtein distance for strings. It is defined as the minimum cost
395 of an edit path (a sequence of node and edge edit operations)
396 transforming graph G1 into a graph isomorphic to G2.

397 

398 Parameters 

399 ---------- 

400 G1, G2: graphs 

401 The two graphs G1 and G2 must be of the same type. 

402 

403 node_match : callable 

404 A function that returns True if node n1 in G1 and n2 in G2 

405 should be considered equal during matching. 

406 

407 The function will be called like 

408 

409 node_match(G1.nodes[n1], G2.nodes[n2]). 

410 

411 That is, the function will receive the node attribute 

412 dictionaries for n1 and n2 as inputs. 

413 

414 Ignored if node_subst_cost is specified. If neither 

415 node_match nor node_subst_cost are specified then node 

416 attributes are not considered. 

417 

418 edge_match : callable 

419 A function that returns True if the edge attribute dictionaries 

420 for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should 

421 be considered equal during matching. 

422 

423 The function will be called like 

424 

425 edge_match(G1[u1][v1], G2[u2][v2]). 

426 

427 That is, the function will receive the edge attribute 

428 dictionaries of the edges under consideration. 

429 

430 Ignored if edge_subst_cost is specified. If neither 

431 edge_match nor edge_subst_cost are specified then edge 

432 attributes are not considered. 

433 

434 node_subst_cost, node_del_cost, node_ins_cost : callable 

435 Functions that return the costs of node substitution, node 

436 deletion, and node insertion, respectively. 

437 

438 The functions will be called like 

439 

440 node_subst_cost(G1.nodes[n1], G2.nodes[n2]), 

441 node_del_cost(G1.nodes[n1]), 

442 node_ins_cost(G2.nodes[n2]). 

443 

444 That is, the functions will receive the node attribute 

445 dictionaries as inputs. The functions are expected to return 

446 positive numeric values. 

447 

448 Function node_subst_cost overrides node_match if specified. 

449 If neither node_match nor node_subst_cost are specified then 

450 default node substitution cost of 0 is used (node attributes 

451 are not considered during matching). 

452 

453 If node_del_cost is not specified then default node deletion 

454 cost of 1 is used. If node_ins_cost is not specified then 

455 default node insertion cost of 1 is used. 

456 

457 edge_subst_cost, edge_del_cost, edge_ins_cost : callable 

458 Functions that return the costs of edge substitution, edge 

459 deletion, and edge insertion, respectively. 

460 

461 The functions will be called like 

462 

463 edge_subst_cost(G1[u1][v1], G2[u2][v2]), 

464 edge_del_cost(G1[u1][v1]), 

465 edge_ins_cost(G2[u2][v2]). 

466 

467 That is, the functions will receive the edge attribute 

468 dictionaries as inputs. The functions are expected to return 

469 positive numeric values. 

470 

471 Function edge_subst_cost overrides edge_match if specified. 

472 If neither edge_match nor edge_subst_cost are specified then 

473 default edge substitution cost of 0 is used (edge attributes 

474 are not considered during matching). 

475 

476 If edge_del_cost is not specified then default edge deletion 

477 cost of 1 is used. If edge_ins_cost is not specified then 

478 default edge insertion cost of 1 is used. 

479 

480 upper_bound : numeric 

481 Maximum edit distance to consider. 

482 

483 Returns 

484 ------- 

485 Generator of consecutive approximations of graph edit distance. 

486 

487 Examples 

488 -------- 

489 >>> G1 = nx.cycle_graph(6) 

490 >>> G2 = nx.wheel_graph(7) 

491 >>> for v in nx.optimize_graph_edit_distance(G1, G2): 

492 ... minv = v 

493 >>> minv 

494 7.0 
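
Approximations are yielded as soon as they are found, so the first item
already gives a quick upper estimate that later items can only improve
on; a minimal sketch reusing ``G1``, ``G2`` and ``minv`` from above:

>>> first_estimate = next(nx.optimize_graph_edit_distance(G1, G2))
>>> bool(first_estimate >= minv)
True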

495 

496 See Also 

497 -------- 

498 graph_edit_distance, optimize_edit_paths 

499 

500 References 

501 ---------- 

502 .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick 

503 Martineau. An Exact Graph Edit Distance Algorithm for Solving 

504 Pattern Recognition Problems. 4th International Conference on 

505 Pattern Recognition Applications and Methods 2015, Jan 2015, 

506 Lisbon, Portugal. 2015, 

507 <10.5220/0005209202710278>. <hal-01168816> 

508 https://hal.archives-ouvertes.fr/hal-01168816 

509 """ 

510 for _, _, cost in optimize_edit_paths( 

511 G1, 

512 G2, 

513 node_match, 

514 edge_match, 

515 node_subst_cost, 

516 node_del_cost, 

517 node_ins_cost, 

518 edge_subst_cost, 

519 edge_del_cost, 

520 edge_ins_cost, 

521 upper_bound, 

522 True, 

523 ): 

524 yield cost 

525 

526 

527@nx._dispatch( 

528 graphs={"G1": 0, "G2": 1}, preserve_edge_attrs=True, preserve_node_attrs=True 

529) 

530def optimize_edit_paths( 

531 G1, 

532 G2, 

533 node_match=None, 

534 edge_match=None, 

535 node_subst_cost=None, 

536 node_del_cost=None, 

537 node_ins_cost=None, 

538 edge_subst_cost=None, 

539 edge_del_cost=None, 

540 edge_ins_cost=None, 

541 upper_bound=None, 

542 strictly_decreasing=True, 

543 roots=None, 

544 timeout=None, 

545): 

546 """GED (graph edit distance) calculation: advanced interface. 

547 

548 A graph edit path is a sequence of node and edge edit operations
549 transforming graph G1 into a graph isomorphic to G2. Edit operations
550 include substitutions, deletions, and insertions.
551
552 Graph edit distance is defined as the minimum cost of an edit path.

553 

554 Parameters 

555 ---------- 

556 G1, G2: graphs 

557 The two graphs G1 and G2 must be of the same type. 

558 

559 node_match : callable 

560 A function that returns True if node n1 in G1 and n2 in G2 

561 should be considered equal during matching. 

562 

563 The function will be called like 

564 

565 node_match(G1.nodes[n1], G2.nodes[n2]). 

566 

567 That is, the function will receive the node attribute 

568 dictionaries for n1 and n2 as inputs. 

569 

570 Ignored if node_subst_cost is specified. If neither 

571 node_match nor node_subst_cost are specified then node 

572 attributes are not considered. 

573 

574 edge_match : callable 

575 A function that returns True if the edge attribute dictionaries 

576 for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should 

577 be considered equal during matching. 

578 

579 The function will be called like 

580 

581 edge_match(G1[u1][v1], G2[u2][v2]). 

582 

583 That is, the function will receive the edge attribute 

584 dictionaries of the edges under consideration. 

585 

586 Ignored if edge_subst_cost is specified. If neither 

587 edge_match nor edge_subst_cost are specified then edge 

588 attributes are not considered. 

589 

590 node_subst_cost, node_del_cost, node_ins_cost : callable 

591 Functions that return the costs of node substitution, node 

592 deletion, and node insertion, respectively. 

593 

594 The functions will be called like 

595 

596 node_subst_cost(G1.nodes[n1], G2.nodes[n2]), 

597 node_del_cost(G1.nodes[n1]), 

598 node_ins_cost(G2.nodes[n2]). 

599 

600 That is, the functions will receive the node attribute 

601 dictionaries as inputs. The functions are expected to return 

602 positive numeric values. 

603 

604 Function node_subst_cost overrides node_match if specified. 

605 If neither node_match nor node_subst_cost are specified then 

606 default node substitution cost of 0 is used (node attributes 

607 are not considered during matching). 

608 

609 If node_del_cost is not specified then default node deletion 

610 cost of 1 is used. If node_ins_cost is not specified then 

611 default node insertion cost of 1 is used. 

612 

613 edge_subst_cost, edge_del_cost, edge_ins_cost : callable 

614 Functions that return the costs of edge substitution, edge 

615 deletion, and edge insertion, respectively. 

616 

617 The functions will be called like 

618 

619 edge_subst_cost(G1[u1][v1], G2[u2][v2]), 

620 edge_del_cost(G1[u1][v1]), 

621 edge_ins_cost(G2[u2][v2]). 

622 

623 That is, the functions will receive the edge attribute 

624 dictionaries as inputs. The functions are expected to return 

625 positive numeric values. 

626 

627 Function edge_subst_cost overrides edge_match if specified. 

628 If neither edge_match nor edge_subst_cost are specified then 

629 default edge substitution cost of 0 is used (edge attributes 

630 are not considered during matching). 

631 

632 If edge_del_cost is not specified then default edge deletion 

633 cost of 1 is used. If edge_ins_cost is not specified then 

634 default edge insertion cost of 1 is used. 

635 

636 upper_bound : numeric 

637 Maximum edit distance to consider. 

638 

639 strictly_decreasing : bool 

640 If True, return consecutive approximations of strictly 

641 decreasing cost. Otherwise, return all edit paths of cost 

642 less than or equal to the previous minimum cost. 

643 

644 roots : 2-tuple 

645 Tuple where first element is a node in G1 and the second 

646 is a node in G2. 

647 These nodes are forced to be matched in the comparison to 

648 allow comparison between rooted graphs. 

649 

650 timeout : numeric 

651 Maximum number of seconds to execute. 

652 After timeout is met, the current best GED is returned. 

653 

654 Returns 

655 ------- 

656 Generator of tuples (node_edit_path, edge_edit_path, cost) 

657 node_edit_path : list of tuples (u, v) 

658 edge_edit_path : list of tuples ((u1, v1), (u2, v2)) 

659 cost : numeric 

660 
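Examples
--------
A minimal sketch of consuming the generator; each yielded tuple refines
the previous one, so with the default ``strictly_decreasing=True`` the
last tuple holds an optimal edit path (graphs as in `optimal_edit_paths`):

>>> G1 = nx.cycle_graph(4)
>>> G2 = nx.wheel_graph(5)
>>> for node_path, edge_path, cost in nx.optimize_edit_paths(G1, G2):
...     best = (node_path, edge_path, cost)
>>> float(best[2])
5.0
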

661 See Also 

662 -------- 

663 graph_edit_distance, optimize_graph_edit_distance, optimal_edit_paths 

664 

665 References 

666 ---------- 

667 .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick 

668 Martineau. An Exact Graph Edit Distance Algorithm for Solving 

669 Pattern Recognition Problems. 4th International Conference on 

670 Pattern Recognition Applications and Methods 2015, Jan 2015, 

671 Lisbon, Portugal. 2015, 

672 <10.5220/0005209202710278>. <hal-01168816> 

673 https://hal.archives-ouvertes.fr/hal-01168816 

674 

675 """ 

676 # TODO: support DiGraph 

677 

678 import numpy as np 

679 import scipy as sp 

680 

681 @dataclass 

682 class CostMatrix: 

683 C: ... 

684 lsa_row_ind: ... 

685 lsa_col_ind: ... 

686 ls: ... 

687 

688 def make_CostMatrix(C, m, n): 

689 # assert(C.shape == (m + n, m + n)) 

690 lsa_row_ind, lsa_col_ind = sp.optimize.linear_sum_assignment(C) 

691 

692 # Fixup dummy assignments: 

693 # each substitution i<->j should have dummy assignment m+j<->n+i 

694 # NOTE: fast reduce of Cv relies on it 

695 # assert len(lsa_row_ind) == len(lsa_col_ind) 

696 indexes = zip(range(len(lsa_row_ind)), lsa_row_ind, lsa_col_ind) 

697 subst_ind = [k for k, i, j in indexes if i < m and j < n] 

698 indexes = zip(range(len(lsa_row_ind)), lsa_row_ind, lsa_col_ind) 

699 dummy_ind = [k for k, i, j in indexes if i >= m and j >= n] 

700 # assert len(subst_ind) == len(dummy_ind) 

701 lsa_row_ind[dummy_ind] = lsa_col_ind[subst_ind] + m 

702 lsa_col_ind[dummy_ind] = lsa_row_ind[subst_ind] + n 

703 

704 return CostMatrix( 

705 C, lsa_row_ind, lsa_col_ind, C[lsa_row_ind, lsa_col_ind].sum() 

706 ) 

707 

708 def extract_C(C, i, j, m, n): 

709 # assert(C.shape == (m + n, m + n)) 

710 row_ind = [k in i or k - m in j for k in range(m + n)] 

711 col_ind = [k in j or k - n in i for k in range(m + n)] 

712 return C[row_ind, :][:, col_ind] 

713 

714 def reduce_C(C, i, j, m, n): 

715 # assert(C.shape == (m + n, m + n)) 

716 row_ind = [k not in i and k - m not in j for k in range(m + n)] 

717 col_ind = [k not in j and k - n not in i for k in range(m + n)] 

718 return C[row_ind, :][:, col_ind] 

719 

720 def reduce_ind(ind, i): 

721 # assert set(ind) == set(range(len(ind))) 

722 rind = ind[[k not in i for k in ind]] 

723 for k in set(i): 

724 rind[rind >= k] -= 1 

725 return rind 

726 

727 def match_edges(u, v, pending_g, pending_h, Ce, matched_uv=None): 

728 """ 

729 Parameters: 

730 u, v: matched vertices, u=None or v=None for 

731 deletion/insertion 

732 pending_g, pending_h: lists of edges not yet mapped 

733 Ce: CostMatrix of pending edge mappings 

734 matched_uv: partial vertex edit path 

735 list of tuples (u, v) of previously matched vertex 

736 mappings u<->v, u=None or v=None for 

737 deletion/insertion 

738 

739 Returns: 

740 list of (i, j): indices of edge mappings g<->h 

741 localCe: local CostMatrix of edge mappings 

742 (basically submatrix of Ce at cross of rows i, cols j) 

743 """ 

744 M = len(pending_g) 

745 N = len(pending_h) 

746 # assert Ce.C.shape == (M + N, M + N) 

747 

748 # only attempt to match edges after one node match has been made 

749 # this will stop self-edges on the first node being automatically deleted 

750 # even when a substitution is the better option 

751 if matched_uv is None or len(matched_uv) == 0: 

752 g_ind = [] 

753 h_ind = [] 

754 else: 

755 g_ind = [ 

756 i 

757 for i in range(M) 

758 if pending_g[i][:2] == (u, u) 

759 or any( 

760 pending_g[i][:2] in ((p, u), (u, p), (p, p)) for p, q in matched_uv 

761 ) 

762 ] 

763 h_ind = [ 

764 j 

765 for j in range(N) 

766 if pending_h[j][:2] == (v, v) 

767 or any( 

768 pending_h[j][:2] in ((q, v), (v, q), (q, q)) for p, q in matched_uv 

769 ) 

770 ] 

771 

772 m = len(g_ind) 

773 n = len(h_ind) 

774 

775 if m or n: 

776 C = extract_C(Ce.C, g_ind, h_ind, M, N) 

777 # assert C.shape == (m + n, m + n) 

778 

779 # Forbid structurally invalid matches 

780 # NOTE: inf remembered from Ce construction 

781 for k, i in enumerate(g_ind): 

782 g = pending_g[i][:2] 

783 for l, j in enumerate(h_ind): 

784 h = pending_h[j][:2] 

785 if nx.is_directed(G1) or nx.is_directed(G2): 

786 if any( 

787 g == (p, u) and h == (q, v) or g == (u, p) and h == (v, q) 

788 for p, q in matched_uv 

789 ): 

790 continue 

791 else: 

792 if any( 

793 g in ((p, u), (u, p)) and h in ((q, v), (v, q)) 

794 for p, q in matched_uv 

795 ): 

796 continue 

797 if g == (u, u) or any(g == (p, p) for p, q in matched_uv): 

798 continue 

799 if h == (v, v) or any(h == (q, q) for p, q in matched_uv): 

800 continue 

801 C[k, l] = inf 

802 

803 localCe = make_CostMatrix(C, m, n) 

804 ij = [ 

805 ( 

806 g_ind[k] if k < m else M + h_ind[l], 

807 h_ind[l] if l < n else N + g_ind[k], 

808 ) 

809 for k, l in zip(localCe.lsa_row_ind, localCe.lsa_col_ind) 

810 if k < m or l < n 

811 ] 

812 

813 else: 

814 ij = [] 

815 localCe = CostMatrix(np.empty((0, 0)), [], [], 0) 

816 

817 return ij, localCe 

818 

819 def reduce_Ce(Ce, ij, m, n): 

820 if len(ij): 

821 i, j = zip(*ij) 

822 m_i = m - sum(1 for t in i if t < m) 

823 n_j = n - sum(1 for t in j if t < n) 

824 return make_CostMatrix(reduce_C(Ce.C, i, j, m, n), m_i, n_j) 

825 return Ce 

826 

827 def get_edit_ops( 

828 matched_uv, pending_u, pending_v, Cv, pending_g, pending_h, Ce, matched_cost 

829 ): 

830 """ 

831 Parameters: 

832 matched_uv: partial vertex edit path 

833 list of tuples (u, v) of vertex mappings u<->v, 

834 u=None or v=None for deletion/insertion 

835 pending_u, pending_v: lists of vertices not yet mapped 

836 Cv: CostMatrix of pending vertex mappings 

837 pending_g, pending_h: lists of edges not yet mapped 

838 Ce: CostMatrix of pending edge mappings 

839 matched_cost: cost of partial edit path 

840 

841 Returns: 

842 sequence of 

843 (i, j): indices of vertex mapping u<->v 

844 Cv_ij: reduced CostMatrix of pending vertex mappings 

845 (basically Cv with row i, col j removed) 

846 list of (x, y): indices of edge mappings g<->h 

847 Ce_xy: reduced CostMatrix of pending edge mappings 

848 (basically Ce with rows x, cols y removed) 

849 cost: total cost of edit operation 

850 NOTE: most promising ops first 

851 """ 

852 m = len(pending_u) 

853 n = len(pending_v) 

854 # assert Cv.C.shape == (m + n, m + n) 

855 

856 # 1) a vertex mapping from optimal linear sum assignment 

857 i, j = min( 

858 (k, l) for k, l in zip(Cv.lsa_row_ind, Cv.lsa_col_ind) if k < m or l < n 

859 ) 

860 xy, localCe = match_edges( 

861 pending_u[i] if i < m else None, 

862 pending_v[j] if j < n else None, 

863 pending_g, 

864 pending_h, 

865 Ce, 

866 matched_uv, 

867 ) 

868 Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h)) 

869 # assert Ce.ls <= localCe.ls + Ce_xy.ls 

870 if prune(matched_cost + Cv.ls + localCe.ls + Ce_xy.ls): 

871 pass 

872 else: 

873 # get reduced Cv efficiently 

874 Cv_ij = CostMatrix( 

875 reduce_C(Cv.C, (i,), (j,), m, n), 

876 reduce_ind(Cv.lsa_row_ind, (i, m + j)), 

877 reduce_ind(Cv.lsa_col_ind, (j, n + i)), 

878 Cv.ls - Cv.C[i, j], 

879 ) 

880 yield (i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls 

881 

882 # 2) other candidates, sorted by lower-bound cost estimate 

883 other = [] 

884 fixed_i, fixed_j = i, j 

885 if m <= n: 

886 candidates = ( 

887 (t, fixed_j) 

888 for t in range(m + n) 

889 if t != fixed_i and (t < m or t == m + fixed_j) 

890 ) 

891 else: 

892 candidates = ( 

893 (fixed_i, t) 

894 for t in range(m + n) 

895 if t != fixed_j and (t < n or t == n + fixed_i) 

896 ) 

897 for i, j in candidates: 

898 if prune(matched_cost + Cv.C[i, j] + Ce.ls): 

899 continue 

900 Cv_ij = make_CostMatrix( 

901 reduce_C(Cv.C, (i,), (j,), m, n), 

902 m - 1 if i < m else m, 

903 n - 1 if j < n else n, 

904 ) 

905 # assert Cv.ls <= Cv.C[i, j] + Cv_ij.ls 

906 if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + Ce.ls): 

907 continue 

908 xy, localCe = match_edges( 

909 pending_u[i] if i < m else None, 

910 pending_v[j] if j < n else None, 

911 pending_g, 

912 pending_h, 

913 Ce, 

914 matched_uv, 

915 ) 

916 if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + localCe.ls): 

917 continue 

918 Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h)) 

919 # assert Ce.ls <= localCe.ls + Ce_xy.ls 

920 if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + localCe.ls + Ce_xy.ls): 

921 continue 

922 other.append(((i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls)) 

923 

924 yield from sorted(other, key=lambda t: t[4] + t[1].ls + t[3].ls) 

925 

926 def get_edit_paths( 

927 matched_uv, 

928 pending_u, 

929 pending_v, 

930 Cv, 

931 matched_gh, 

932 pending_g, 

933 pending_h, 

934 Ce, 

935 matched_cost, 

936 ): 

937 """ 

938 Parameters: 

939 matched_uv: partial vertex edit path 

940 list of tuples (u, v) of vertex mappings u<->v, 

941 u=None or v=None for deletion/insertion 

942 pending_u, pending_v: lists of vertices not yet mapped 

943 Cv: CostMatrix of pending vertex mappings 

944 matched_gh: partial edge edit path 

945 list of tuples (g, h) of edge mappings g<->h, 

946 g=None or h=None for deletion/insertion 

947 pending_g, pending_h: lists of edges not yet mapped 

948 Ce: CostMatrix of pending edge mappings 

949 matched_cost: cost of partial edit path 

950 

951 Returns: 

952 sequence of (vertex_path, edge_path, cost) 

953 vertex_path: complete vertex edit path 

954 list of tuples (u, v) of vertex mappings u<->v, 

955 u=None or v=None for deletion/insertion 

956 edge_path: complete edge edit path 

957 list of tuples (g, h) of edge mappings g<->h, 

958 g=None or h=None for deletion/insertion 

959 cost: total cost of edit path 

960 NOTE: path costs are non-increasing 

961 """ 

962 # debug_print('matched-uv:', matched_uv) 

963 # debug_print('matched-gh:', matched_gh) 

964 # debug_print('matched-cost:', matched_cost) 

965 # debug_print('pending-u:', pending_u) 

966 # debug_print('pending-v:', pending_v) 

967 # debug_print(Cv.C) 

968 # assert list(sorted(G1.nodes)) == list(sorted(list(u for u, v in matched_uv if u is not None) + pending_u)) 

969 # assert list(sorted(G2.nodes)) == list(sorted(list(v for u, v in matched_uv if v is not None) + pending_v)) 

970 # debug_print('pending-g:', pending_g) 

971 # debug_print('pending-h:', pending_h) 

972 # debug_print(Ce.C) 

973 # assert list(sorted(G1.edges)) == list(sorted(list(g for g, h in matched_gh if g is not None) + pending_g)) 

974 # assert list(sorted(G2.edges)) == list(sorted(list(h for g, h in matched_gh if h is not None) + pending_h)) 

975 # debug_print() 

976 

977 if prune(matched_cost + Cv.ls + Ce.ls): 

978 return 

979 

980 if not max(len(pending_u), len(pending_v)): 

981 # assert not len(pending_g) 

982 # assert not len(pending_h) 

983 # path completed! 

984 # assert matched_cost <= maxcost_value 

985 nonlocal maxcost_value 

986 maxcost_value = min(maxcost_value, matched_cost) 

987 yield matched_uv, matched_gh, matched_cost 

988 

989 else: 

990 edit_ops = get_edit_ops( 

991 matched_uv, 

992 pending_u, 

993 pending_v, 

994 Cv, 

995 pending_g, 

996 pending_h, 

997 Ce, 

998 matched_cost, 

999 ) 

1000 for ij, Cv_ij, xy, Ce_xy, edit_cost in edit_ops: 

1001 i, j = ij 

1002 # assert Cv.C[i, j] + sum(Ce.C[t] for t in xy) == edit_cost 

1003 if prune(matched_cost + edit_cost + Cv_ij.ls + Ce_xy.ls): 

1004 continue 

1005 

1006 # dive deeper 

1007 u = pending_u.pop(i) if i < len(pending_u) else None 

1008 v = pending_v.pop(j) if j < len(pending_v) else None 

1009 matched_uv.append((u, v)) 

1010 for x, y in xy: 

1011 len_g = len(pending_g) 

1012 len_h = len(pending_h) 

1013 matched_gh.append( 

1014 ( 

1015 pending_g[x] if x < len_g else None, 

1016 pending_h[y] if y < len_h else None, 

1017 ) 

1018 ) 

1019 sortedx = sorted(x for x, y in xy) 

1020 sortedy = sorted(y for x, y in xy) 

1021 G = [ 

1022 (pending_g.pop(x) if x < len(pending_g) else None) 

1023 for x in reversed(sortedx) 

1024 ] 

1025 H = [ 

1026 (pending_h.pop(y) if y < len(pending_h) else None) 

1027 for y in reversed(sortedy) 

1028 ] 

1029 

1030 yield from get_edit_paths( 

1031 matched_uv, 

1032 pending_u, 

1033 pending_v, 

1034 Cv_ij, 

1035 matched_gh, 

1036 pending_g, 

1037 pending_h, 

1038 Ce_xy, 

1039 matched_cost + edit_cost, 

1040 ) 

1041 

1042 # backtrack 

1043 if u is not None: 

1044 pending_u.insert(i, u) 

1045 if v is not None: 

1046 pending_v.insert(j, v) 

1047 matched_uv.pop() 

1048 for x, g in zip(sortedx, reversed(G)): 

1049 if g is not None: 

1050 pending_g.insert(x, g) 

1051 for y, h in zip(sortedy, reversed(H)): 

1052 if h is not None: 

1053 pending_h.insert(y, h) 

1054 for _ in xy: 

1055 matched_gh.pop() 

1056 

1057 # Initialization 

1058 

1059 pending_u = list(G1.nodes) 

1060 pending_v = list(G2.nodes) 

1061 

1062 initial_cost = 0 

1063 if roots: 

1064 root_u, root_v = roots 

1065 if root_u not in pending_u or root_v not in pending_v: 

1066 raise nx.NodeNotFound("Root node not in graph.") 

1067 

1068 # remove roots from pending 

1069 pending_u.remove(root_u) 

1070 pending_v.remove(root_v) 

1071 

1072 # cost matrix of vertex mappings 
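# Block layout of the (m + n) x (m + n) vertex cost matrix C built below:
#   C[0:m, 0:n]      substitution costs u -> v
#   C[0:m, n:n+m]    deletion costs on the diagonal, `inf` off-diagonal
#   C[m:m+n, 0:n]    insertion costs on the diagonal, `inf` off-diagonal
#   C[m:m+n, n:n+m]  zeros (dummy <-> dummy assignments)
# so a single linear sum assignment over C selects a complete set of
# vertex edit operations.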

1073 m = len(pending_u) 

1074 n = len(pending_v) 

1075 C = np.zeros((m + n, m + n)) 

1076 if node_subst_cost: 

1077 C[0:m, 0:n] = np.array( 

1078 [ 

1079 node_subst_cost(G1.nodes[u], G2.nodes[v]) 

1080 for u in pending_u 

1081 for v in pending_v 

1082 ] 

1083 ).reshape(m, n) 

1084 if roots: 

1085 initial_cost = node_subst_cost(G1.nodes[root_u], G2.nodes[root_v]) 

1086 elif node_match: 

1087 C[0:m, 0:n] = np.array( 

1088 [ 

1089 1 - int(node_match(G1.nodes[u], G2.nodes[v])) 

1090 for u in pending_u 

1091 for v in pending_v 

1092 ] 

1093 ).reshape(m, n) 

1094 if roots: 

1095 initial_cost = 1 - node_match(G1.nodes[root_u], G2.nodes[root_v]) 

1096 else: 

1097 # all zeroes 

1098 pass 

1099 # assert not min(m, n) or C[0:m, 0:n].min() >= 0 

1100 if node_del_cost: 

1101 del_costs = [node_del_cost(G1.nodes[u]) for u in pending_u] 

1102 else: 

1103 del_costs = [1] * len(pending_u) 

1104 # assert not m or min(del_costs) >= 0 

1105 if node_ins_cost: 

1106 ins_costs = [node_ins_cost(G2.nodes[v]) for v in pending_v] 

1107 else: 

1108 ins_costs = [1] * len(pending_v) 

1109 # assert not n or min(ins_costs) >= 0 

1110 inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1 

1111 C[0:m, n : n + m] = np.array( 

1112 [del_costs[i] if i == j else inf for i in range(m) for j in range(m)] 

1113 ).reshape(m, m) 

1114 C[m : m + n, 0:n] = np.array( 

1115 [ins_costs[i] if i == j else inf for i in range(n) for j in range(n)] 

1116 ).reshape(n, n) 

1117 Cv = make_CostMatrix(C, m, n) 

1118 # debug_print(f"Cv: {m} x {n}") 

1119 # debug_print(Cv.C) 

1120 

1121 pending_g = list(G1.edges) 

1122 pending_h = list(G2.edges) 

1123 

1124 # cost matrix of edge mappings 
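# The edge cost matrix below uses the same (m + n) x (m + n) block layout
# as the vertex cost matrix above, with pending edges in place of vertices.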

1125 m = len(pending_g) 

1126 n = len(pending_h) 

1127 C = np.zeros((m + n, m + n)) 

1128 if edge_subst_cost: 

1129 C[0:m, 0:n] = np.array( 

1130 [ 

1131 edge_subst_cost(G1.edges[g], G2.edges[h]) 

1132 for g in pending_g 

1133 for h in pending_h 

1134 ] 

1135 ).reshape(m, n) 

1136 elif edge_match: 

1137 C[0:m, 0:n] = np.array( 

1138 [ 

1139 1 - int(edge_match(G1.edges[g], G2.edges[h])) 

1140 for g in pending_g 

1141 for h in pending_h 

1142 ] 

1143 ).reshape(m, n) 

1144 else: 

1145 # all zeroes 

1146 pass 

1147 # assert not min(m, n) or C[0:m, 0:n].min() >= 0 

1148 if edge_del_cost: 

1149 del_costs = [edge_del_cost(G1.edges[g]) for g in pending_g] 

1150 else: 

1151 del_costs = [1] * len(pending_g) 

1152 # assert not m or min(del_costs) >= 0 

1153 if edge_ins_cost: 

1154 ins_costs = [edge_ins_cost(G2.edges[h]) for h in pending_h] 

1155 else: 

1156 ins_costs = [1] * len(pending_h) 

1157 # assert not n or min(ins_costs) >= 0 

1158 inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1 

1159 C[0:m, n : n + m] = np.array( 

1160 [del_costs[i] if i == j else inf for i in range(m) for j in range(m)] 

1161 ).reshape(m, m) 

1162 C[m : m + n, 0:n] = np.array( 

1163 [ins_costs[i] if i == j else inf for i in range(n) for j in range(n)] 

1164 ).reshape(n, n) 

1165 Ce = make_CostMatrix(C, m, n) 

1166 # debug_print(f'Ce: {m} x {n}') 

1167 # debug_print(Ce.C) 

1168 # debug_print() 

1169 

1170 maxcost_value = Cv.C.sum() + Ce.C.sum() + 1 

1171 

1172 if timeout is not None: 

1173 if timeout <= 0: 

1174 raise nx.NetworkXError("Timeout value must be greater than 0") 

1175 start = time.perf_counter() 

1176 

1177 def prune(cost): 

1178 if timeout is not None: 

1179 if time.perf_counter() - start > timeout: 

1180 return True 

1181 if upper_bound is not None: 

1182 if cost > upper_bound: 

1183 return True 

1184 if cost > maxcost_value: 

1185 return True 

1186 if strictly_decreasing and cost >= maxcost_value: 

1187 return True 

1188 return False 

1189 

1190 # Now go! 

1191 

1192 done_uv = [] if roots is None else [roots] 

1193 

1194 for vertex_path, edge_path, cost in get_edit_paths( 

1195 done_uv, pending_u, pending_v, Cv, [], pending_g, pending_h, Ce, initial_cost 

1196 ): 

1197 # assert sorted(G1.nodes) == sorted(u for u, v in vertex_path if u is not None) 

1198 # assert sorted(G2.nodes) == sorted(v for u, v in vertex_path if v is not None) 

1199 # assert sorted(G1.edges) == sorted(g for g, h in edge_path if g is not None) 

1200 # assert sorted(G2.edges) == sorted(h for g, h in edge_path if h is not None) 

1201 # print(vertex_path, edge_path, cost, file = sys.stderr) 

1202 # assert cost == maxcost_value 

1203 yield list(vertex_path), list(edge_path), cost 

1204 

1205 

1206@nx._dispatch 

1207def simrank_similarity( 

1208 G, 

1209 source=None, 

1210 target=None, 

1211 importance_factor=0.9, 

1212 max_iterations=1000, 

1213 tolerance=1e-4, 

1214): 

1215 """Returns the SimRank similarity of nodes in the graph ``G``. 

1216 

1217 SimRank is a similarity metric that says "two objects are considered 

1218 to be similar if they are referenced by similar objects." [1]_. 

1219 

1220 The pseudo-code definition from the paper is:: 

1221 

1222 def simrank(G, u, v): 

1223 in_neighbors_u = G.predecessors(u) 

1224 in_neighbors_v = G.predecessors(v) 

1225 scale = C / (len(in_neighbors_u) * len(in_neighbors_v)) 

1226 return scale * sum(simrank(G, w, x) 

1227 for w, x in product(in_neighbors_u, 

1228 in_neighbors_v)) 

1229 

1230 where ``G`` is the graph, ``u`` is the source, ``v`` is the target, 

1231 and ``C`` is a float decay or importance factor between 0 and 1. 

1232 

1233 The SimRank algorithm for determining node similarity is defined in 

1234 [2]_. 

1235 

1236 Parameters 

1237 ---------- 

1238 G : NetworkX graph 

1239 A NetworkX graph 

1240 

1241 source : node 

1242 If this is specified, the returned dictionary maps each node 

1243 ``v`` in the graph to the similarity between ``source`` and 

1244 ``v``. 

1245 

1246 target : node 

1247 If both ``source`` and ``target`` are specified, the similarity 

1248 value between ``source`` and ``target`` is returned. If 

1249 ``target`` is specified but ``source`` is not, this argument is 

1250 ignored. 

1251 

1252 importance_factor : float 

1253 The relative importance of indirect neighbors with respect to 

1254 direct neighbors. 

1255 

1256 max_iterations : integer 

1257 Maximum number of iterations. 

1258 

1259 tolerance : float 

1260 Error tolerance used to check convergence. When an iteration of 

1261 the algorithm finds that no similarity value changes more than 

1262 this amount, the algorithm halts. 

1263 

1264 Returns 

1265 ------- 

1266 similarity : dictionary or float 

1267 If ``source`` and ``target`` are both ``None``, this returns a 

1268 dictionary of dictionaries, where ``similarity[u][v]`` gives the
1269 similarity of the pair of nodes ``(u, v)``.

1270 

1271 If ``source`` is not ``None`` but ``target`` is, this returns a 

1272 dictionary mapping node to the similarity of ``source`` and that 

1273 node. 

1274 

1275 If neither ``source`` nor ``target`` is ``None``, this returns 

1276 the similarity value for the given pair of nodes. 

1277 

1278 Examples 

1279 -------- 

1280 >>> G = nx.cycle_graph(2) 

1281 >>> nx.simrank_similarity(G) 

1282 {0: {0: 1.0, 1: 0.0}, 1: {0: 0.0, 1: 1.0}} 

1283 >>> nx.simrank_similarity(G, source=0) 

1284 {0: 1.0, 1: 0.0} 

1285 >>> nx.simrank_similarity(G, source=0, target=0) 

1286 1.0 

1287 

1288 The result of this function can be converted to a numpy array 

1289 representing the SimRank matrix by using the node order of the 

1290 graph to determine which row and column represent each node. 

1291 Other orderings of the nodes are also possible.

1292 

1293 >>> import numpy as np 

1294 >>> sim = nx.simrank_similarity(G) 

1295 >>> np.array([[sim[u][v] for v in G] for u in G]) 

1296 array([[1., 0.], 

1297 [0., 1.]]) 

1298 >>> sim_1d = nx.simrank_similarity(G, source=0) 

1299 >>> np.array([sim_1d[v] for v in G])

1300 array([1., 0.]) 
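
For a directed graph the similarity is driven by shared predecessors, in
line with the definition above; a minimal sketch (the value itself is
not shown):

>>> D = nx.DiGraph([(0, 2), (1, 2)])
>>> sim_uv = nx.simrank_similarity(D, source=0, target=1)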

1301 

1302 References 

1303 ---------- 

1304 .. [1] https://en.wikipedia.org/wiki/SimRank 

1305 .. [2] G. Jeh and J. Widom. 

1306 "SimRank: a measure of structural-context similarity", 

1307 In KDD'02: Proceedings of the Eighth ACM SIGKDD 

1308 International Conference on Knowledge Discovery and Data Mining, 

1309 pp. 538--543. ACM Press, 2002. 

1310 """ 

1311 import numpy as np 

1312 

1313 nodelist = list(G) 

1314 s_indx = None if source is None else nodelist.index(source) 

1315 t_indx = None if target is None else nodelist.index(target) 

1316 

1317 x = _simrank_similarity_numpy( 

1318 G, s_indx, t_indx, importance_factor, max_iterations, tolerance 

1319 ) 

1320 

1321 if isinstance(x, np.ndarray): 

1322 if x.ndim == 1: 

1323 return dict(zip(G, x)) 

1324 # else x.ndim == 2 

1325 return {u: dict(zip(G, row)) for u, row in zip(G, x)} 

1326 return x 

1327 

1328 

1329def _simrank_similarity_python( 

1330 G, 

1331 source=None, 

1332 target=None, 

1333 importance_factor=0.9, 

1334 max_iterations=1000, 

1335 tolerance=1e-4, 

1336): 

1337 """Returns the SimRank similarity of nodes in the graph ``G``. 

1338 

1339 This pure Python version is provided for pedagogical purposes. 

1340 

1341 Examples 

1342 -------- 

1343 >>> G = nx.cycle_graph(2) 

1344 >>> nx.similarity._simrank_similarity_python(G) 

1345 {0: {0: 1, 1: 0.0}, 1: {0: 0.0, 1: 1}} 

1346 >>> nx.similarity._simrank_similarity_python(G, source=0) 

1347 {0: 1, 1: 0.0} 

1348 >>> nx.similarity._simrank_similarity_python(G, source=0, target=0) 

1349 1 

1350 """ 

1351 # build up our similarity adjacency dictionary output 

1352 newsim = {u: {v: 1 if u == v else 0 for v in G} for u in G} 

1353 

1354 # These functions compute the update to the similarity value of the nodes 

1355 # `u` and `v` with respect to the previous similarity values. 

1356 def avg_sim(s): 

1357 return sum(newsim[w][x] for (w, x) in s) / len(s) if s else 0.0 

1358 

1359 Gadj = G.pred if G.is_directed() else G.adj 

1360 

1361 def sim(u, v): 

1362 return importance_factor * avg_sim(list(product(Gadj[u], Gadj[v]))) 

1363 

1364 for its in range(max_iterations): 

1365 oldsim = newsim 

1366 newsim = {u: {v: sim(u, v) if u is not v else 1 for v in G} for u in G} 

1367 is_close = all( 

1368 all( 

1369 abs(newsim[u][v] - old) <= tolerance * (1 + abs(old)) 

1370 for v, old in nbrs.items() 

1371 ) 

1372 for u, nbrs in oldsim.items() 

1373 ) 

1374 if is_close: 

1375 break 

1376 

1377 if its + 1 == max_iterations: 

1378 raise nx.ExceededMaxIterations( 

1379 f"simrank did not converge after {max_iterations} iterations." 

1380 ) 

1381 

1382 if source is not None and target is not None: 

1383 return newsim[source][target] 

1384 if source is not None: 

1385 return newsim[source] 

1386 return newsim 

1387 

1388 

1389def _simrank_similarity_numpy( 

1390 G, 

1391 source=None, 

1392 target=None, 

1393 importance_factor=0.9, 

1394 max_iterations=1000, 

1395 tolerance=1e-4, 

1396): 

1397 """Calculate SimRank of nodes in ``G`` using matrices with ``numpy``. 

1398 

1399 The SimRank algorithm for determining node similarity is defined in 

1400 [1]_. 

1401 

1402 Parameters 

1403 ---------- 

1404 G : NetworkX graph 

1405 A NetworkX graph 

1406 

1407 source : node 

1408 If this is specified, the returned dictionary maps each node 

1409 ``v`` in the graph to the similarity between ``source`` and 

1410 ``v``. 

1411 

1412 target : node 

1413 If both ``source`` and ``target`` are specified, the similarity 

1414 value between ``source`` and ``target`` is returned. If 

1415 ``target`` is specified but ``source`` is not, this argument is 

1416 ignored. 

1417 

1418 importance_factor : float 

1419 The relative importance of indirect neighbors with respect to 

1420 direct neighbors. 

1421 

1422 max_iterations : integer 

1423 Maximum number of iterations. 

1424 

1425 tolerance : float 

1426 Error tolerance used to check convergence. When an iteration of 

1427 the algorithm finds that no similarity value changes more than 

1428 this amount, the algorithm halts. 

1429 

1430 Returns 

1431 ------- 

1432 similarity : numpy array or float 

1433 If ``source`` and ``target`` are both ``None``, this returns a 

1434 2D array containing SimRank scores of the nodes. 

1435 

1436 If ``source`` is not ``None`` but ``target`` is, this returns a
1437 1D array containing the SimRank scores between ``source`` and each
1438 node in the graph.

1439 

1440 If neither ``source`` nor ``target`` is ``None``, this returns 

1441 the similarity value for the given pair of nodes. 

1442 

1443 Examples 

1444 -------- 

1445 >>> G = nx.cycle_graph(2) 

1446 >>> nx.similarity._simrank_similarity_numpy(G) 

1447 array([[1., 0.], 

1448 [0., 1.]]) 

1449 >>> nx.similarity._simrank_similarity_numpy(G, source=0) 

1450 array([1., 0.]) 

1451 >>> nx.similarity._simrank_similarity_numpy(G, source=0, target=0) 

1452 1.0 

1453 

1454 References 

1455 ---------- 

1456 .. [1] G. Jeh and J. Widom. 

1457 "SimRank: a measure of structural-context similarity", 

1458 In KDD'02: Proceedings of the Eighth ACM SIGKDD 

1459 International Conference on Knowledge Discovery and Data Mining, 

1460 pp. 538--543. ACM Press, 2002. 

1461 """ 

1462 # This algorithm follows roughly 

1463 # 

1464 # S = max{C * (A.T * S * A), I} 

1465 # 

1466 # where C is the importance factor, A is the column normalized 

1467 # adjacency matrix, and I is the identity matrix. 

1468 import numpy as np 

1469 

1470 adjacency_matrix = nx.to_numpy_array(G) 

1471 

1472 # column-normalize the ``adjacency_matrix`` 

1473 s = np.array(adjacency_matrix.sum(axis=0)) 

1474 s[s == 0] = 1 

1475 adjacency_matrix /= s # adjacency_matrix.sum(axis=0) 

1476 

1477 newsim = np.eye(len(G), dtype=np.float64) 

1478 for its in range(max_iterations): 

1479 prevsim = newsim.copy() 

1480 newsim = importance_factor * ((adjacency_matrix.T @ prevsim) @ adjacency_matrix) 

1481 np.fill_diagonal(newsim, 1.0) 

1482 

1483 if np.allclose(prevsim, newsim, atol=tolerance): 

1484 break 

1485 

1486 if its + 1 == max_iterations: 

1487 raise nx.ExceededMaxIterations( 

1488 f"simrank did not converge after {max_iterations} iterations." 

1489 ) 

1490 

1491 if source is not None and target is not None: 

1492 return newsim[source, target] 

1493 if source is not None: 

1494 return newsim[source] 

1495 return newsim 

1496 

1497 

1498@nx._dispatch(edge_attrs="weight") 

1499def panther_similarity( 

1500 G, source, k=5, path_length=5, c=0.5, delta=0.1, eps=None, weight="weight" 

1501): 

1502 r"""Returns the Panther similarity of nodes in the graph `G` to node ``v``. 

1503 

1504 Panther is a similarity metric that says "two objects are considered 

1505 to be similar if they frequently appear on the same paths." [1]_. 

1506 

1507 Parameters 

1508 ---------- 

1509 G : NetworkX graph 

1510 A NetworkX graph 

1511 source : node 

1512 Source node for which to find the top `k` similar other nodes 

1513 k : int (default = 5) 

1514 The number of most similar nodes to return 

1515 path_length : int (default = 5) 

1516 How long the randomly generated paths should be (``T`` in [1]_) 

1517 c : float (default = 0.5) 

1518 A universal positive constant used to scale the number 

1519 of sample random paths to generate. 

1520 delta : float (default = 0.1) 

1521 The probability that the similarity estimate $S$ is not an
1522 epsilon-approximation of the exact values; i.e., with probability at
1523 least $1 - \delta$ every estimate is within ``eps`` of the exact value (see [1]_).

1524 eps : float or None (default = None) 

1525 The error bound. Per [1]_, a good value is ``sqrt(1/|E|)``. Therefore, 

1526 if no value is provided, the recommended computed value will be used. 

1527 weight : string or None, optional (default="weight") 

1528 The name of an edge attribute that holds the numerical value 

1529 used as a weight. If None then each edge has weight 1. 

1530 

1531 Returns 

1532 ------- 

1533 similarity : dictionary 

1534 Dictionary of nodes to similarity scores (as floats). Note: 

1535 the self-similarity (i.e., ``source``) will not be included in

1536 the returned dictionary. 

1537 

1538 Examples 

1539 -------- 

1540 >>> G = nx.star_graph(10) 

1541 >>> sim = nx.panther_similarity(G, 0) 
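
The number of results and the sampled path length can be tuned through
``k`` and ``path_length``; a small sketch (scores are not shown because
the paths are sampled at random):

>>> top3 = nx.panther_similarity(G, 0, k=3, path_length=4)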

1542 

1543 References 

1544 ---------- 

1545 .. [1] Zhang, J., Tang, J., Ma, C., Tong, H., Jing, Y., & Li, J. 

1546 Panther: Fast top-k similarity search on large networks. 

1547 In Proceedings of the ACM SIGKDD International Conference 

1548 on Knowledge Discovery and Data Mining (Vol. 2015-August, pp. 1445–1454). 

1549 Association for Computing Machinery. https://doi.org/10.1145/2783258.2783267. 

1550 """ 

1551 import numpy as np 

1552 

1553 num_nodes = G.number_of_nodes() 

1554 if num_nodes < k: 

1555 warnings.warn( 

1556 f"Number of nodes is {num_nodes}, but requested k is {k}. " 

1557 "Setting k to number of nodes." 

1558 ) 

1559 k = num_nodes 

1560 # According to [1], they empirically determined 

1561 # a good value for ``eps`` to be sqrt( 1 / |E| ) 

1562 if eps is None: 

1563 eps = np.sqrt(1.0 / G.number_of_edges()) 

1564 

1565 inv_node_map = {name: index for index, name in enumerate(G.nodes)} 

1566 node_map = np.array(G) 

1567 

1568 # Calculate the sample size ``R`` for how many paths 

1569 # to randomly generate 

1570 t_choose_2 = math.comb(path_length, 2) 

1571 sample_size = int((c / eps**2) * (np.log2(t_choose_2) + 1 + np.log(1 / delta))) 

1572 index_map = {} 

1573 _ = list( 

1574 generate_random_paths( 

1575 G, sample_size, path_length=path_length, index_map=index_map, weight=weight 

1576 ) 

1577 ) 

1578 S = np.zeros(num_nodes) 

1579 

1580 inv_sample_size = 1 / sample_size 

1581 

1582 source_paths = set(index_map[source]) 

1583 

1584 # Calculate the path similarities 

1585 # between ``source`` (v) and ``node`` (v_j) 

1586 # using our inverted index mapping of 

1587 # vertices to paths 

1588 for node, paths in index_map.items(): 

1589 # Only consider paths where both 

1590 # ``node`` and ``source`` are present 

1591 common_paths = source_paths.intersection(paths) 

1592 S[inv_node_map[node]] = len(common_paths) * inv_sample_size 

1593 

1594 # Retrieve top ``k`` similar 

1595 # Note: the below performed anywhere from 4-10x faster 

1596 # (depending on input sizes) vs the equivalent ``np.argsort(S)[::-1]`` 

1597 top_k_unsorted = np.argpartition(S, -k)[-k:] 

1598 top_k_sorted = top_k_unsorted[np.argsort(S[top_k_unsorted])][::-1] 

1599 

1600 # Add back the similarity scores 

1601 top_k_sorted_names = (node_map[n] for n in top_k_sorted) 

1602 top_k_with_val = dict(zip(top_k_sorted_names, S[top_k_sorted])) 

1603 

1604 # Remove the self-similarity 

1605 top_k_with_val.pop(source, None) 

1606 return top_k_with_val 

1607 

1608 

1609@nx._dispatch(edge_attrs="weight") 

1610def generate_random_paths( 

1611 G, sample_size, path_length=5, index_map=None, weight="weight" 

1612): 

1613 """Randomly generate `sample_size` paths of length `path_length`. 

1614 

1615 Parameters 

1616 ---------- 

1617 G : NetworkX graph 

1618 A NetworkX graph 

1619 sample_size : integer 

1620 The number of paths to generate. This is ``R`` in [1]_. 

1621 path_length : integer (default = 5) 

1622 The number of edges in each randomly generated path.

1623 This is ``T`` in [1]_. According to the paper, ``T >= 5`` is 

1624 recommended. 

1625 index_map : dictionary, optional 

1626 If provided, this will be populated with the inverted 

1627 index of nodes mapped to the set of generated random path 

1628 indices within ``paths``. 

1629 weight : string or None, optional (default="weight") 

1630 The name of an edge attribute that holds the numerical value 

1631 used as a weight. If None then each edge has weight 1. 

1632 

1633 Returns 

1634 ------- 

1635 paths : generator of lists 

1636 Generator of `sample_size` paths, each a list of `path_length + 1` nodes.

1637 

1638 Examples 

1639 -------- 

1640 Note that the return value is a generator of paths, which can be collected into a list:

1641 

1642 >>> G = nx.star_graph(3) 

1643 >>> random_path = list(nx.generate_random_paths(G, 2))

1644 

1645 By passing a dictionary into `index_map`, it will build an 

1646 inverted index mapping of nodes to the paths in which that node is present: 

1647 

1648 >>> G = nx.star_graph(3) 

1649 >>> index_map = {} 

1650 >>> random_path = list(nx.generate_random_paths(G, 3, index_map=index_map))

1651 >>> paths_containing_node_0 = [random_path[path_idx] for path_idx in index_map.get(0, [])] 
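
The same inverted index can be used to count how many sampled paths hit
a given node; a minimal sketch (the count is random, so it is not shown):

>>> num_paths_with_node_1 = len(index_map.get(1, []))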

1652 

1653 References 

1654 ---------- 

1655 .. [1] Zhang, J., Tang, J., Ma, C., Tong, H., Jing, Y., & Li, J. 

1656 Panther: Fast top-k similarity search on large networks. 

1657 In Proceedings of the ACM SIGKDD International Conference 

1658 on Knowledge Discovery and Data Mining (Vol. 2015-August, pp. 1445–1454). 

1659 Association for Computing Machinery. https://doi.org/10.1145/2783258.2783267. 

1660 """ 

1661 import numpy as np 

1662 

1663 # Calculate transition probabilities between 

1664 # every pair of vertices according to Eq. (3) 

1665 adj_mat = nx.to_numpy_array(G, weight=weight) 

1666 inv_row_sums = np.reciprocal(adj_mat.sum(axis=1)).reshape(-1, 1) 

1667 transition_probabilities = adj_mat * inv_row_sums 

1668 

1669 node_map = np.array(G) 

1670 num_nodes = G.number_of_nodes() 

1671 

1672 for path_index in range(sample_size): 

1673 # Sample current vertex v = v_i uniformly at random 

1674 node_index = np.random.randint(0, high=num_nodes) 

1675 node = node_map[node_index] 

1676 

1677 # Add v into p_r and add p_r into the path set 

1678 # of v, i.e., P_v 

1679 path = [node] 

1680 

1681 # Build the inverted index (P_v) of vertices to paths 

1682 if index_map is not None: 

1683 if node in index_map: 

1684 index_map[node].add(path_index) 

1685 else: 

1686 index_map[node] = {path_index} 

1687 

1688 starting_index = node_index 

1689 for _ in range(path_length): 

1690 # Randomly sample a neighbor (v_j) according 

1691 # to transition probabilities from ``node`` (v) to its neighbors 

1692 neighbor_index = np.random.choice( 

1693 num_nodes, p=transition_probabilities[starting_index] 

1694 ) 

1695 

1696 # Set current vertex (v = v_j) 

1697 starting_index = neighbor_index 

1698 

1699 # Add v into p_r 

1700 neighbor_node = node_map[neighbor_index] 

1701 path.append(neighbor_node) 

1702 

1703 # Add p_r into P_v 

1704 if index_map is not None: 

1705 if neighbor_node in index_map: 

1706 index_map[neighbor_node].add(path_index) 

1707 else: 

1708 index_map[neighbor_node] = {path_index} 

1709 

1710 yield path