Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/networkx/algorithms/similarity.py: 8%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

409 statements  

1"""Functions measuring similarity using graph edit distance. 

2 

3The graph edit distance is the number of edge/node changes needed 

4to make two graphs isomorphic. 

5 

6The default algorithm/implementation is sub-optimal for some graphs. 

7The problem of finding the exact Graph Edit Distance (GED) is NP-hard 

8so it is often slow. If the simple interface `graph_edit_distance` 

9takes too long for your graph, try `optimize_graph_edit_distance` 

10and/or `optimize_edit_paths`. 

11 

12At the same time, I encourage capable people to investigate 

13alternative GED algorithms, in order to improve the choices available. 

14""" 

15 

16import math 

17import time 

18from dataclasses import dataclass 

19from itertools import product 

20 

21import networkx as nx 

22from networkx.utils import np_random_state 

23 

24__all__ = [ 

25 "graph_edit_distance", 

26 "optimal_edit_paths", 

27 "optimize_graph_edit_distance", 

28 "optimize_edit_paths", 

29 "simrank_similarity", 

30 "panther_similarity", 

31 "panther_vector_similarity", 

32 "generate_random_paths", 

33] 

34 

35 

@nx._dispatchable(
    graphs={"G1": 0, "G2": 1}, preserve_edge_attrs=True, preserve_node_attrs=True
)
def graph_edit_distance(
    G1,
    G2,
    node_match=None,
    edge_match=None,
    node_subst_cost=None,
    node_del_cost=None,
    node_ins_cost=None,
    edge_subst_cost=None,
    edge_del_cost=None,
    edge_ins_cost=None,
    roots=None,
    upper_bound=None,
    timeout=None,
):
    """Return the GED (graph edit distance) between graphs G1 and G2.

    Graph edit distance is a graph similarity measure analogous to
    Levenshtein distance for strings: the minimum cost of an edit path
    (a sequence of node and edge substitutions, deletions, and
    insertions) that transforms G1 into a graph isomorphic to G2.

    Parameters
    ----------
    G1, G2 : graphs
        The two graphs G1 and G2 must be of the same type.

    node_match : callable
        ``node_match(G1.nodes[n1], G2.nodes[n2])`` returns True if the
        two node attribute dictionaries should be considered equal.
        Ignored if node_subst_cost is specified. If neither is given,
        node attributes are not considered.

    edge_match : callable
        ``edge_match(G1[u1][v1], G2[u2][v2])`` returns True if the two
        edge attribute dictionaries should be considered equal.
        Ignored if edge_subst_cost is specified. If neither is given,
        edge attributes are not considered.

    node_subst_cost, node_del_cost, node_ins_cost : callable
        Functions returning positive numeric costs of node
        substitution, deletion, and insertion, called with the node
        attribute dictionaries, e.g.
        ``node_subst_cost(G1.nodes[n1], G2.nodes[n2])``.
        node_subst_cost overrides node_match; if neither is given the
        substitution cost defaults to 0. Deletion and insertion costs
        each default to 1 when unspecified.

    edge_subst_cost, edge_del_cost, edge_ins_cost : callable
        Functions returning positive numeric costs of edge
        substitution, deletion, and insertion, called with the edge
        attribute dictionaries, e.g.
        ``edge_subst_cost(G1[u1][v1], G2[u2][v2])``.
        edge_subst_cost overrides edge_match; if neither is given the
        substitution cost defaults to 0. Deletion and insertion costs
        each default to 1 when unspecified.

    roots : 2-tuple
        Tuple ``(r1, r2)`` with r1 a node of G1 and r2 a node of G2.
        These nodes are forced to be matched, allowing comparison of
        rooted graphs.

    upper_bound : numeric
        Maximum edit distance to consider. Return None if no edit
        distance under or equal to upper_bound exists.

    timeout : numeric
        Maximum number of seconds to execute. After timeout is met,
        the current best GED is returned.

    Examples
    --------
    >>> G1 = nx.cycle_graph(6)
    >>> G2 = nx.wheel_graph(7)
    >>> nx.graph_edit_distance(G1, G2)
    7.0

    >>> G1 = nx.star_graph(5)
    >>> G2 = nx.star_graph(5)
    >>> nx.graph_edit_distance(G1, G2, roots=(0, 0))
    0.0
    >>> nx.graph_edit_distance(G1, G2, roots=(1, 0))
    8.0

    See Also
    --------
    optimal_edit_paths, optimize_graph_edit_distance,

    is_isomorphic: test for graph edit distance of 0

    References
    ----------
    .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
       Martineau. An Exact Graph Edit Distance Algorithm for Solving
       Pattern Recognition Problems. 4th International Conference on
       Pattern Recognition Applications and Methods 2015, Jan 2015,
       Lisbon, Portugal. 2015,
       <10.5220/0005209202710278>. <hal-01168816>
       https://hal.archives-ouvertes.fr/hal-01168816

    """
    # With strictly_decreasing=True each yielded cost improves on the
    # previous one, so the last value seen is the exact GED (or the best
    # found before upper_bound/timeout cut the search short).
    best = None
    approximations = optimize_edit_paths(
        G1,
        G2,
        node_match=node_match,
        edge_match=edge_match,
        node_subst_cost=node_subst_cost,
        node_del_cost=node_del_cost,
        node_ins_cost=node_ins_cost,
        edge_subst_cost=edge_subst_cost,
        edge_del_cost=edge_del_cost,
        edge_ins_cost=edge_ins_cost,
        upper_bound=upper_bound,
        strictly_decreasing=True,
        roots=roots,
        timeout=timeout,
    )
    for _vertex_path, _edge_path, cost in approximations:
        best = cost
    return best

208 

209 

@nx._dispatchable(graphs={"G1": 0, "G2": 1})
def optimal_edit_paths(
    G1,
    G2,
    node_match=None,
    edge_match=None,
    node_subst_cost=None,
    node_del_cost=None,
    node_ins_cost=None,
    edge_subst_cost=None,
    edge_del_cost=None,
    edge_ins_cost=None,
    upper_bound=None,
):
    """Return all minimum-cost edit paths transforming G1 to G2.

    A graph edit path is a sequence of node and edge edit operations
    (substitutions, deletions, and insertions) transforming graph G1 to
    a graph isomorphic to G2.

    Parameters
    ----------
    G1, G2 : graphs
        The two graphs G1 and G2 must be of the same type.

    node_match : callable
        ``node_match(G1.nodes[n1], G2.nodes[n2])`` returns True if the
        two node attribute dictionaries should be considered equal.
        Ignored if node_subst_cost is specified. If neither is given,
        node attributes are not considered.

    edge_match : callable
        ``edge_match(G1[u1][v1], G2[u2][v2])`` returns True if the two
        edge attribute dictionaries should be considered equal.
        Ignored if edge_subst_cost is specified. If neither is given,
        edge attributes are not considered.

    node_subst_cost, node_del_cost, node_ins_cost : callable
        Functions returning positive numeric costs of node
        substitution, deletion, and insertion, called with the node
        attribute dictionaries, e.g.
        ``node_subst_cost(G1.nodes[n1], G2.nodes[n2])``.
        node_subst_cost overrides node_match; if neither is given the
        substitution cost defaults to 0. Deletion and insertion costs
        each default to 1 when unspecified.

    edge_subst_cost, edge_del_cost, edge_ins_cost : callable
        Functions returning positive numeric costs of edge
        substitution, deletion, and insertion, called with the edge
        attribute dictionaries, e.g.
        ``edge_subst_cost(G1[u1][v1], G2[u2][v2])``.
        edge_subst_cost overrides edge_match; if neither is given the
        substitution cost defaults to 0. Deletion and insertion costs
        each default to 1 when unspecified.

    upper_bound : numeric
        Maximum edit distance to consider.

    Returns
    -------
    edit_paths : list of tuples (node_edit_path, edge_edit_path)
        - node_edit_path : list of tuples ``(u, v)`` indicating node
          transformations between `G1` and `G2`. ``u`` is `None` for
          insertion, ``v`` is `None` for deletion.
        - edge_edit_path : list of tuples ``((u1, v1), (u2, v2))``
          indicating edge transformations between `G1` and `G2`.
          ``(None, (u2,v2))`` for insertion and ``((u1,v1), None)``
          for deletion.

    cost : numeric
        Optimal edit path cost (graph edit distance). When the cost
        is zero, it indicates that `G1` and `G2` are isomorphic.

    Examples
    --------
    >>> G1 = nx.cycle_graph(4)
    >>> G2 = nx.wheel_graph(5)
    >>> paths, cost = nx.optimal_edit_paths(G1, G2)
    >>> len(paths)
    40
    >>> cost
    5.0

    Notes
    -----
    To transform `G1` into a graph isomorphic to `G2`, apply the node
    and edge edits in the returned ``edit_paths``.
    In the case of isomorphic graphs, the cost is zero, and the paths
    represent different isomorphic mappings (isomorphisms). That is,
    the edits involve renaming nodes and edges to match the structure
    of `G2`.

    See Also
    --------
    graph_edit_distance, optimize_edit_paths

    References
    ----------
    .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
       Martineau. An Exact Graph Edit Distance Algorithm for Solving
       Pattern Recognition Problems. 4th International Conference on
       Pattern Recognition Applications and Methods 2015, Jan 2015,
       Lisbon, Portugal. 2015,
       <10.5220/0005209202710278>. <hal-01168816>
       https://hal.archives-ouvertes.fr/hal-01168816

    """
    # With strictly_decreasing=False the generator yields every path
    # whose cost ties or improves the running minimum; whenever a
    # strictly cheaper path appears, previously collected (now
    # suboptimal) paths are discarded.
    best_paths = []
    best_cost = None
    path_gen = optimize_edit_paths(
        G1,
        G2,
        node_match=node_match,
        edge_match=edge_match,
        node_subst_cost=node_subst_cost,
        node_del_cost=node_del_cost,
        node_ins_cost=node_ins_cost,
        edge_subst_cost=edge_subst_cost,
        edge_del_cost=edge_del_cost,
        edge_ins_cost=edge_ins_cost,
        upper_bound=upper_bound,
        strictly_decreasing=False,
    )
    for node_path, edge_path, cost in path_gen:
        if best_cost is not None and cost < best_cost:
            best_paths = []
        best_paths.append((node_path, edge_path))
        best_cost = cost
    return best_paths, best_cost

384 

385 

@nx._dispatchable(graphs={"G1": 0, "G2": 1})
def optimize_graph_edit_distance(
    G1,
    G2,
    node_match=None,
    edge_match=None,
    node_subst_cost=None,
    node_del_cost=None,
    node_ins_cost=None,
    edge_subst_cost=None,
    edge_del_cost=None,
    edge_ins_cost=None,
    upper_bound=None,
):
    """Yield consecutive approximations of GED (graph edit distance)
    between graphs G1 and G2.

    Graph edit distance is a graph similarity measure analogous to
    Levenshtein distance for strings: the minimum cost of an edit path
    (a sequence of node and edge edit operations) transforming G1 into
    a graph isomorphic to G2.

    Parameters
    ----------
    G1, G2 : graphs
        The two graphs G1 and G2 must be of the same type.

    node_match : callable
        ``node_match(G1.nodes[n1], G2.nodes[n2])`` returns True if the
        two node attribute dictionaries should be considered equal.
        Ignored if node_subst_cost is specified. If neither is given,
        node attributes are not considered.

    edge_match : callable
        ``edge_match(G1[u1][v1], G2[u2][v2])`` returns True if the two
        edge attribute dictionaries should be considered equal.
        Ignored if edge_subst_cost is specified. If neither is given,
        edge attributes are not considered.

    node_subst_cost, node_del_cost, node_ins_cost : callable
        Functions returning positive numeric costs of node
        substitution, deletion, and insertion, called with the node
        attribute dictionaries, e.g.
        ``node_subst_cost(G1.nodes[n1], G2.nodes[n2])``.
        node_subst_cost overrides node_match; if neither is given the
        substitution cost defaults to 0. Deletion and insertion costs
        each default to 1 when unspecified.

    edge_subst_cost, edge_del_cost, edge_ins_cost : callable
        Functions returning positive numeric costs of edge
        substitution, deletion, and insertion, called with the edge
        attribute dictionaries, e.g.
        ``edge_subst_cost(G1[u1][v1], G2[u2][v2])``.
        edge_subst_cost overrides edge_match; if neither is given the
        substitution cost defaults to 0. Deletion and insertion costs
        each default to 1 when unspecified.

    upper_bound : numeric
        Maximum edit distance to consider.

    Returns
    -------
    Generator of consecutive approximations of graph edit distance.

    Examples
    --------
    >>> G1 = nx.cycle_graph(6)
    >>> G2 = nx.wheel_graph(7)
    >>> for v in nx.optimize_graph_edit_distance(G1, G2):
    ...     minv = v
    >>> minv
    7.0

    See Also
    --------
    graph_edit_distance, optimize_edit_paths

    References
    ----------
    .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
       Martineau. An Exact Graph Edit Distance Algorithm for Solving
       Pattern Recognition Problems. 4th International Conference on
       Pattern Recognition Applications and Methods 2015, Jan 2015,
       Lisbon, Portugal. 2015,
       <10.5220/0005209202710278>. <hal-01168816>
       https://hal.archives-ouvertes.fr/hal-01168816
    """
    # Relay only the cost component of each strictly-improving edit
    # path found by the advanced interface.
    yield from (
        cost
        for _node_path, _edge_path, cost in optimize_edit_paths(
            G1,
            G2,
            node_match=node_match,
            edge_match=edge_match,
            node_subst_cost=node_subst_cost,
            node_del_cost=node_del_cost,
            node_ins_cost=node_ins_cost,
            edge_subst_cost=edge_subst_cost,
            edge_del_cost=edge_del_cost,
            edge_ins_cost=edge_ins_cost,
            upper_bound=upper_bound,
            strictly_decreasing=True,
        )
    )

535 

536 

537@nx._dispatchable( 

538 graphs={"G1": 0, "G2": 1}, preserve_edge_attrs=True, preserve_node_attrs=True 

539) 

540def optimize_edit_paths( 

541 G1, 

542 G2, 

543 node_match=None, 

544 edge_match=None, 

545 node_subst_cost=None, 

546 node_del_cost=None, 

547 node_ins_cost=None, 

548 edge_subst_cost=None, 

549 edge_del_cost=None, 

550 edge_ins_cost=None, 

551 upper_bound=None, 

552 strictly_decreasing=True, 

553 roots=None, 

554 timeout=None, 

555): 

556 """GED (graph edit distance) calculation: advanced interface. 

557 

558 Graph edit path is a sequence of node and edge edit operations 

559 transforming graph G1 to graph isomorphic to G2. Edit operations 

560 include substitutions, deletions, and insertions. 

561 

562 Graph edit distance is defined as minimum cost of edit path. 

563 

564 Parameters 

565 ---------- 

566 G1, G2: graphs 

567 The two graphs G1 and G2 must be of the same type. 

568 

569 node_match : callable 

570 A function that returns True if node n1 in G1 and n2 in G2 

571 should be considered equal during matching. 

572 

573 The function will be called like 

574 

575 node_match(G1.nodes[n1], G2.nodes[n2]). 

576 

577 That is, the function will receive the node attribute 

578 dictionaries for n1 and n2 as inputs. 

579 

580 Ignored if node_subst_cost is specified. If neither 

581 node_match nor node_subst_cost are specified then node 

582 attributes are not considered. 

583 

584 edge_match : callable 

585 A function that returns True if the edge attribute dictionaries 

586 for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should 

587 be considered equal during matching. 

588 

589 The function will be called like 

590 

591 edge_match(G1[u1][v1], G2[u2][v2]). 

592 

593 That is, the function will receive the edge attribute 

594 dictionaries of the edges under consideration. 

595 

596 Ignored if edge_subst_cost is specified. If neither 

597 edge_match nor edge_subst_cost are specified then edge 

598 attributes are not considered. 

599 

600 node_subst_cost, node_del_cost, node_ins_cost : callable 

601 Functions that return the costs of node substitution, node 

602 deletion, and node insertion, respectively. 

603 

604 The functions will be called like 

605 

606 node_subst_cost(G1.nodes[n1], G2.nodes[n2]), 

607 node_del_cost(G1.nodes[n1]), 

608 node_ins_cost(G2.nodes[n2]). 

609 

610 That is, the functions will receive the node attribute 

611 dictionaries as inputs. The functions are expected to return 

612 positive numeric values. 

613 

614 Function node_subst_cost overrides node_match if specified. 

615 If neither node_match nor node_subst_cost are specified then 

616 default node substitution cost of 0 is used (node attributes 

617 are not considered during matching). 

618 

619 If node_del_cost is not specified then default node deletion 

620 cost of 1 is used. If node_ins_cost is not specified then 

621 default node insertion cost of 1 is used. 

622 

623 edge_subst_cost, edge_del_cost, edge_ins_cost : callable 

624 Functions that return the costs of edge substitution, edge 

625 deletion, and edge insertion, respectively. 

626 

627 The functions will be called like 

628 

629 edge_subst_cost(G1[u1][v1], G2[u2][v2]), 

630 edge_del_cost(G1[u1][v1]), 

631 edge_ins_cost(G2[u2][v2]). 

632 

633 That is, the functions will receive the edge attribute 

634 dictionaries as inputs. The functions are expected to return 

635 positive numeric values. 

636 

637 Function edge_subst_cost overrides edge_match if specified. 

638 If neither edge_match nor edge_subst_cost are specified then 

639 default edge substitution cost of 0 is used (edge attributes 

640 are not considered during matching). 

641 

642 If edge_del_cost is not specified then default edge deletion 

643 cost of 1 is used. If edge_ins_cost is not specified then 

644 default edge insertion cost of 1 is used. 

645 

646 upper_bound : numeric 

647 Maximum edit distance to consider. 

648 

649 strictly_decreasing : bool 

650 If True, return consecutive approximations of strictly 

651 decreasing cost. Otherwise, return all edit paths of cost 

652 less than or equal to the previous minimum cost. 

653 

654 roots : 2-tuple 

655 Tuple where first element is a node in G1 and the second 

656 is a node in G2. 

657 These nodes are forced to be matched in the comparison to 

658 allow comparison between rooted graphs. 

659 

660 timeout : numeric 

661 Maximum number of seconds to execute. 

662 After timeout is met, the current best GED is returned. 

663 

664 Returns 

665 ------- 

666 Generator of tuples (node_edit_path, edge_edit_path, cost) 

667 node_edit_path : list of tuples (u, v) 

668 edge_edit_path : list of tuples ((u1, v1), (u2, v2)) 

669 cost : numeric 

670 

671 See Also 

672 -------- 

673 graph_edit_distance, optimize_graph_edit_distance, optimal_edit_paths 

674 

675 References 

676 ---------- 

677 .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick 

678 Martineau. An Exact Graph Edit Distance Algorithm for Solving 

679 Pattern Recognition Problems. 4th International Conference on 

680 Pattern Recognition Applications and Methods 2015, Jan 2015, 

681 Lisbon, Portugal. 2015, 

682 <10.5220/0005209202710278>. <hal-01168816> 

683 https://hal.archives-ouvertes.fr/hal-01168816 

684 

685 """ 

686 # TODO: support DiGraph 

687 

688 import numpy as np 

689 import scipy as sp 

690 

691 @dataclass 

692 class CostMatrix: 

693 C: ... 

694 lsa_row_ind: ... 

695 lsa_col_ind: ... 

696 ls: ... 

697 

698 def make_CostMatrix(C, m, n): 

699 # assert(C.shape == (m + n, m + n)) 

700 lsa_row_ind, lsa_col_ind = sp.optimize.linear_sum_assignment(C) 

701 

702 # Fixup dummy assignments: 

703 # each substitution i<->j should have dummy assignment m+j<->n+i 

704 # NOTE: fast reduce of Cv relies on it 

705 # Create masks for substitution and dummy indices 

706 is_subst = (lsa_row_ind < m) & (lsa_col_ind < n) 

707 is_dummy = (lsa_row_ind >= m) & (lsa_col_ind >= n) 

708 

709 # Map dummy assignments to the correct indices 

710 lsa_row_ind[is_dummy] = lsa_col_ind[is_subst] + m 

711 lsa_col_ind[is_dummy] = lsa_row_ind[is_subst] + n 

712 

713 return CostMatrix( 

714 C, lsa_row_ind, lsa_col_ind, C[lsa_row_ind, lsa_col_ind].sum() 

715 ) 

716 

717 def extract_C(C, i, j, m, n): 

718 # assert(C.shape == (m + n, m + n)) 

719 row_ind = [k in i or k - m in j for k in range(m + n)] 

720 col_ind = [k in j or k - n in i for k in range(m + n)] 

721 return C[row_ind, :][:, col_ind] 

722 

723 def reduce_C(C, i, j, m, n): 

724 # assert(C.shape == (m + n, m + n)) 

725 row_ind = [k not in i and k - m not in j for k in range(m + n)] 

726 col_ind = [k not in j and k - n not in i for k in range(m + n)] 

727 return C[row_ind, :][:, col_ind] 

728 

729 def reduce_ind(ind, i): 

730 # assert set(ind) == set(range(len(ind))) 

731 rind = ind[[k not in i for k in ind]] 

732 for k in set(i): 

733 rind[rind >= k] -= 1 

734 return rind 

735 

736 def match_edges(u, v, pending_g, pending_h, Ce, matched_uv=None): 

737 """ 

738 Parameters: 

739 u, v: matched vertices, u=None or v=None for 

740 deletion/insertion 

741 pending_g, pending_h: lists of edges not yet mapped 

742 Ce: CostMatrix of pending edge mappings 

743 matched_uv: partial vertex edit path 

744 list of tuples (u, v) of previously matched vertex 

745 mappings u<->v, u=None or v=None for 

746 deletion/insertion 

747 

748 Returns: 

749 list of (i, j): indices of edge mappings g<->h 

750 localCe: local CostMatrix of edge mappings 

751 (basically submatrix of Ce at cross of rows i, cols j) 

752 """ 

753 M = len(pending_g) 

754 N = len(pending_h) 

755 # assert Ce.C.shape == (M + N, M + N) 

756 

757 # only attempt to match edges after one node match has been made 

758 # this will stop self-edges on the first node being automatically deleted 

759 # even when a substitution is the better option 

760 

761 substitution_possible = M and N 

762 at_least_one_node_match = matched_uv is None or len(matched_uv) == 0 

763 if at_least_one_node_match and substitution_possible: 

764 g_ind = [] 

765 h_ind = [] 

766 else: 

767 g_ind = [ 

768 i 

769 for i in range(M) 

770 if pending_g[i][:2] == (u, u) 

771 or any( 

772 pending_g[i][:2] in ((p, u), (u, p), (p, p)) for p, q in matched_uv 

773 ) 

774 ] 

775 h_ind = [ 

776 j 

777 for j in range(N) 

778 if pending_h[j][:2] == (v, v) 

779 or any( 

780 pending_h[j][:2] in ((q, v), (v, q), (q, q)) for p, q in matched_uv 

781 ) 

782 ] 

783 

784 m = len(g_ind) 

785 n = len(h_ind) 

786 

787 if m or n: 

788 C = extract_C(Ce.C, g_ind, h_ind, M, N) 

789 # assert C.shape == (m + n, m + n) 

790 

791 # Forbid structurally invalid matches 

792 # NOTE: inf remembered from Ce construction 

793 for k, i in enumerate(g_ind): 

794 g = pending_g[i][:2] 

795 for l, j in enumerate(h_ind): 

796 h = pending_h[j][:2] 

797 if nx.is_directed(G1) or nx.is_directed(G2): 

798 if any( 

799 g == (p, u) and h == (q, v) or g == (u, p) and h == (v, q) 

800 for p, q in matched_uv 

801 ): 

802 continue 

803 else: 

804 if any( 

805 g in ((p, u), (u, p)) and h in ((q, v), (v, q)) 

806 for p, q in matched_uv 

807 ): 

808 continue 

809 if g == (u, u) or any(g == (p, p) for p, q in matched_uv): 

810 continue 

811 if h == (v, v) or any(h == (q, q) for p, q in matched_uv): 

812 continue 

813 C[k, l] = inf 

814 

815 localCe = make_CostMatrix(C, m, n) 

816 ij = [ 

817 ( 

818 g_ind[k] if k < m else M + h_ind[l], 

819 h_ind[l] if l < n else N + g_ind[k], 

820 ) 

821 for k, l in zip(localCe.lsa_row_ind, localCe.lsa_col_ind) 

822 if k < m or l < n 

823 ] 

824 

825 else: 

826 ij = [] 

827 localCe = CostMatrix(np.empty((0, 0)), [], [], 0) 

828 

829 return ij, localCe 

830 

831 def reduce_Ce(Ce, ij, m, n): 

832 if len(ij): 

833 i, j = zip(*ij) 

834 m_i = m - sum(1 for t in i if t < m) 

835 n_j = n - sum(1 for t in j if t < n) 

836 return make_CostMatrix(reduce_C(Ce.C, i, j, m, n), m_i, n_j) 

837 return Ce 

838 

839 def get_edit_ops( 

840 matched_uv, pending_u, pending_v, Cv, pending_g, pending_h, Ce, matched_cost 

841 ): 

842 """ 

843 Parameters: 

844 matched_uv: partial vertex edit path 

845 list of tuples (u, v) of vertex mappings u<->v, 

846 u=None or v=None for deletion/insertion 

847 pending_u, pending_v: lists of vertices not yet mapped 

848 Cv: CostMatrix of pending vertex mappings 

849 pending_g, pending_h: lists of edges not yet mapped 

850 Ce: CostMatrix of pending edge mappings 

851 matched_cost: cost of partial edit path 

852 

853 Returns: 

854 sequence of 

855 (i, j): indices of vertex mapping u<->v 

856 Cv_ij: reduced CostMatrix of pending vertex mappings 

857 (basically Cv with row i, col j removed) 

858 list of (x, y): indices of edge mappings g<->h 

859 Ce_xy: reduced CostMatrix of pending edge mappings 

860 (basically Ce with rows x, cols y removed) 

861 cost: total cost of edit operation 

862 NOTE: most promising ops first 

863 """ 

864 m = len(pending_u) 

865 n = len(pending_v) 

866 # assert Cv.C.shape == (m + n, m + n) 

867 

868 # 1) a vertex mapping from optimal linear sum assignment 

869 i, j = min( 

870 (k, l) for k, l in zip(Cv.lsa_row_ind, Cv.lsa_col_ind) if k < m or l < n 

871 ) 

872 xy, localCe = match_edges( 

873 pending_u[i] if i < m else None, 

874 pending_v[j] if j < n else None, 

875 pending_g, 

876 pending_h, 

877 Ce, 

878 matched_uv, 

879 ) 

880 Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h)) 

881 # assert Ce.ls <= localCe.ls + Ce_xy.ls 

882 if prune(matched_cost + Cv.ls + localCe.ls + Ce_xy.ls): 

883 pass 

884 else: 

885 # get reduced Cv efficiently 

886 Cv_ij = CostMatrix( 

887 reduce_C(Cv.C, (i,), (j,), m, n), 

888 reduce_ind(Cv.lsa_row_ind, (i, m + j)), 

889 reduce_ind(Cv.lsa_col_ind, (j, n + i)), 

890 Cv.ls - Cv.C[i, j], 

891 ) 

892 yield (i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls 

893 

894 # 2) other candidates, sorted by lower-bound cost estimate 

895 other = [] 

896 fixed_i, fixed_j = i, j 

897 if m <= n: 

898 candidates = ( 

899 (t, fixed_j) 

900 for t in range(m + n) 

901 if t != fixed_i and (t < m or t == m + fixed_j) 

902 ) 

903 else: 

904 candidates = ( 

905 (fixed_i, t) 

906 for t in range(m + n) 

907 if t != fixed_j and (t < n or t == n + fixed_i) 

908 ) 

909 for i, j in candidates: 

910 if prune(matched_cost + Cv.C[i, j] + Ce.ls): 

911 continue 

912 Cv_ij = make_CostMatrix( 

913 reduce_C(Cv.C, (i,), (j,), m, n), 

914 m - 1 if i < m else m, 

915 n - 1 if j < n else n, 

916 ) 

917 # assert Cv.ls <= Cv.C[i, j] + Cv_ij.ls 

918 if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + Ce.ls): 

919 continue 

920 xy, localCe = match_edges( 

921 pending_u[i] if i < m else None, 

922 pending_v[j] if j < n else None, 

923 pending_g, 

924 pending_h, 

925 Ce, 

926 matched_uv, 

927 ) 

928 if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + localCe.ls): 

929 continue 

930 Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h)) 

931 # assert Ce.ls <= localCe.ls + Ce_xy.ls 

932 if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + localCe.ls + Ce_xy.ls): 

933 continue 

934 other.append(((i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls)) 

935 

936 yield from sorted(other, key=lambda t: t[4] + t[1].ls + t[3].ls) 

937 

    def get_edit_paths(
        matched_uv,
        pending_u,
        pending_v,
        Cv,
        matched_gh,
        pending_g,
        pending_h,
        Ce,
        matched_cost,
    ):
        """Recursively extend a partial edit path to all complete edit paths.

        Parameters:
            matched_uv: partial vertex edit path
                list of tuples (u, v) of vertex mappings u<->v,
                u=None or v=None for deletion/insertion
            pending_u, pending_v: lists of vertices not yet mapped
            Cv: CostMatrix of pending vertex mappings
            matched_gh: partial edge edit path
                list of tuples (g, h) of edge mappings g<->h,
                g=None or h=None for deletion/insertion
            pending_g, pending_h: lists of edges not yet mapped
            Ce: CostMatrix of pending edge mappings
            matched_cost: cost of partial edit path

        Returns:
            sequence of (vertex_path, edge_path, cost)
                vertex_path: complete vertex edit path
                    list of tuples (u, v) of vertex mappings u<->v,
                    u=None or v=None for deletion/insertion
                edge_path: complete edge edit path
                    list of tuples (g, h) of edge mappings g<->h,
                    g=None or h=None for deletion/insertion
                cost: total cost of edit path
            NOTE: path costs are non-increasing

        NOTE: the matched_*/pending_* arguments are mutated in place while
        diving and restored on backtrack; yielded paths alias these lists,
        so the caller must copy them before the next iteration.
        """
        # Branch-and-bound cutoff: Cv.ls + Ce.ls is a lower bound on the
        # cost of completing this partial path.
        if prune(matched_cost + Cv.ls + Ce.ls):
            return

        if not max(len(pending_u), len(pending_v)):
            # assert not len(pending_g)
            # assert not len(pending_h)
            # path completed!
            # assert matched_cost <= maxcost_value
            nonlocal maxcost_value
            maxcost_value = min(maxcost_value, matched_cost)
            yield matched_uv, matched_gh, matched_cost

        else:
            edit_ops = get_edit_ops(
                matched_uv,
                pending_u,
                pending_v,
                Cv,
                pending_g,
                pending_h,
                Ce,
                matched_cost,
            )
            for ij, Cv_ij, xy, Ce_xy, edit_cost in edit_ops:
                i, j = ij
                # assert Cv.C[i, j] + sum(Ce.C[t] for t in xy) == edit_cost
                if prune(matched_cost + edit_cost + Cv_ij.ls + Ce_xy.ls):
                    continue

                # dive deeper
                # Indices >= len(pending_*) encode deletion/insertion,
                # hence the None placeholders.
                u = pending_u.pop(i) if i < len(pending_u) else None
                v = pending_v.pop(j) if j < len(pending_v) else None
                matched_uv.append((u, v))
                for x, y in xy:
                    len_g = len(pending_g)
                    len_h = len(pending_h)
                    matched_gh.append(
                        (
                            pending_g[x] if x < len_g else None,
                            pending_h[y] if y < len_h else None,
                        )
                    )
                # Pop matched edges in descending index order so earlier
                # pops do not shift the positions of later ones.
                sortedx = sorted(x for x, y in xy)
                sortedy = sorted(y for x, y in xy)
                G = [
                    (pending_g.pop(x) if x < len(pending_g) else None)
                    for x in reversed(sortedx)
                ]
                H = [
                    (pending_h.pop(y) if y < len(pending_h) else None)
                    for y in reversed(sortedy)
                ]

                yield from get_edit_paths(
                    matched_uv,
                    pending_u,
                    pending_v,
                    Cv_ij,
                    matched_gh,
                    pending_g,
                    pending_h,
                    Ce_xy,
                    matched_cost + edit_cost,
                )

                # backtrack
                if u is not None:
                    pending_u.insert(i, u)
                if v is not None:
                    pending_v.insert(j, v)
                matched_uv.pop()
                # Re-insert popped edges in ascending index order (the
                # reverse of removal) to restore original list positions.
                for x, g in zip(sortedx, reversed(G)):
                    if g is not None:
                        pending_g.insert(x, g)
                for y, h in zip(sortedy, reversed(H)):
                    if h is not None:
                        pending_h.insert(y, h)
                for _ in xy:
                    matched_gh.pop()

1053 

1054 # Initialization 

1055 

1056 pending_u = list(G1.nodes) 

1057 pending_v = list(G2.nodes) 

1058 

1059 initial_cost = 0 

1060 if roots: 

1061 root_u, root_v = roots 

1062 if root_u not in pending_u or root_v not in pending_v: 

1063 raise nx.NodeNotFound("Root node not in graph.") 

1064 

1065 # remove roots from pending 

1066 pending_u.remove(root_u) 

1067 pending_v.remove(root_v) 

1068 

1069 # cost matrix of vertex mappings 

1070 m = len(pending_u) 

1071 n = len(pending_v) 

1072 C = np.zeros((m + n, m + n)) 

1073 if node_subst_cost: 

1074 C[0:m, 0:n] = np.array( 

1075 [ 

1076 node_subst_cost(G1.nodes[u], G2.nodes[v]) 

1077 for u in pending_u 

1078 for v in pending_v 

1079 ] 

1080 ).reshape(m, n) 

1081 if roots: 

1082 initial_cost = node_subst_cost(G1.nodes[root_u], G2.nodes[root_v]) 

1083 elif node_match: 

1084 C[0:m, 0:n] = np.array( 

1085 [ 

1086 1 - int(node_match(G1.nodes[u], G2.nodes[v])) 

1087 for u in pending_u 

1088 for v in pending_v 

1089 ] 

1090 ).reshape(m, n) 

1091 if roots: 

1092 initial_cost = 1 - node_match(G1.nodes[root_u], G2.nodes[root_v]) 

1093 else: 

1094 # all zeroes 

1095 pass 

1096 # assert not min(m, n) or C[0:m, 0:n].min() >= 0 

1097 if node_del_cost: 

1098 del_costs = [node_del_cost(G1.nodes[u]) for u in pending_u] 

1099 else: 

1100 del_costs = [1] * len(pending_u) 

1101 # assert not m or min(del_costs) >= 0 

1102 if node_ins_cost: 

1103 ins_costs = [node_ins_cost(G2.nodes[v]) for v in pending_v] 

1104 else: 

1105 ins_costs = [1] * len(pending_v) 

1106 # assert not n or min(ins_costs) >= 0 

1107 inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1 

1108 C[0:m, n : n + m] = np.array( 

1109 [del_costs[i] if i == j else inf for i in range(m) for j in range(m)] 

1110 ).reshape(m, m) 

1111 C[m : m + n, 0:n] = np.array( 

1112 [ins_costs[i] if i == j else inf for i in range(n) for j in range(n)] 

1113 ).reshape(n, n) 

1114 Cv = make_CostMatrix(C, m, n) 

1115 

1116 pending_g = list(G1.edges) 

1117 pending_h = list(G2.edges) 

1118 

1119 # cost matrix of edge mappings 

1120 m = len(pending_g) 

1121 n = len(pending_h) 

1122 C = np.zeros((m + n, m + n)) 

1123 if edge_subst_cost: 

1124 C[0:m, 0:n] = np.array( 

1125 [ 

1126 edge_subst_cost(G1.edges[g], G2.edges[h]) 

1127 for g in pending_g 

1128 for h in pending_h 

1129 ] 

1130 ).reshape(m, n) 

1131 elif edge_match: 

1132 C[0:m, 0:n] = np.array( 

1133 [ 

1134 1 - int(edge_match(G1.edges[g], G2.edges[h])) 

1135 for g in pending_g 

1136 for h in pending_h 

1137 ] 

1138 ).reshape(m, n) 

1139 else: 

1140 # all zeroes 

1141 pass 

1142 # assert not min(m, n) or C[0:m, 0:n].min() >= 0 

1143 if edge_del_cost: 

1144 del_costs = [edge_del_cost(G1.edges[g]) for g in pending_g] 

1145 else: 

1146 del_costs = [1] * len(pending_g) 

1147 # assert not m or min(del_costs) >= 0 

1148 if edge_ins_cost: 

1149 ins_costs = [edge_ins_cost(G2.edges[h]) for h in pending_h] 

1150 else: 

1151 ins_costs = [1] * len(pending_h) 

1152 # assert not n or min(ins_costs) >= 0 

1153 inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1 

1154 C[0:m, n : n + m] = np.array( 

1155 [del_costs[i] if i == j else inf for i in range(m) for j in range(m)] 

1156 ).reshape(m, m) 

1157 C[m : m + n, 0:n] = np.array( 

1158 [ins_costs[i] if i == j else inf for i in range(n) for j in range(n)] 

1159 ).reshape(n, n) 

1160 Ce = make_CostMatrix(C, m, n) 

1161 

1162 maxcost_value = Cv.C.sum() + Ce.C.sum() + 1 

1163 

1164 if timeout is not None: 

1165 if timeout <= 0: 

1166 raise nx.NetworkXError("Timeout value must be greater than 0") 

1167 start = time.perf_counter() 

1168 

    def prune(cost):
        """Return True if a partial path with lower-bound `cost` should be cut.

        Prunes on: elapsed `timeout`, the caller-supplied `upper_bound`,
        the cost of the best complete path found so far (`maxcost_value`),
        and — when only strictly improving paths are wanted — ties with
        that best cost.
        """
        if timeout is not None:
            # Once the deadline passes, prune every branch so the whole
            # search unwinds quickly.
            if time.perf_counter() - start > timeout:
                return True
        if upper_bound is not None:
            if cost > upper_bound:
                return True
        if cost > maxcost_value:
            return True
        if strictly_decreasing and cost >= maxcost_value:
            return True
        return False

1181 

1182 # Now go! 

1183 

1184 done_uv = [] if roots is None else [roots] 

1185 

1186 for vertex_path, edge_path, cost in get_edit_paths( 

1187 done_uv, pending_u, pending_v, Cv, [], pending_g, pending_h, Ce, initial_cost 

1188 ): 

1189 # assert sorted(G1.nodes) == sorted(u for u, v in vertex_path if u is not None) 

1190 # assert sorted(G2.nodes) == sorted(v for u, v in vertex_path if v is not None) 

1191 # assert sorted(G1.edges) == sorted(g for g, h in edge_path if g is not None) 

1192 # assert sorted(G2.edges) == sorted(h for g, h in edge_path if h is not None) 

1193 # print(vertex_path, edge_path, cost, file = sys.stderr) 

1194 # assert cost == maxcost_value 

1195 yield list(vertex_path), list(edge_path), float(cost) 

1196 

1197 

@nx._dispatchable
def simrank_similarity(
    G,
    source=None,
    target=None,
    importance_factor=0.9,
    max_iterations=1000,
    tolerance=1e-4,
):
    """Returns the SimRank similarity of nodes in the graph ``G``.

    SimRank is a similarity metric that says "two objects are considered
    to be similar if they are referenced by similar objects." [1]_.

    The pseudo-code definition from the paper is::

        def simrank(G, u, v):
            in_neighbors_u = G.predecessors(u)
            in_neighbors_v = G.predecessors(v)
            scale = C / (len(in_neighbors_u) * len(in_neighbors_v))
            return scale * sum(
                simrank(G, w, x) for w, x in product(in_neighbors_u, in_neighbors_v)
            )

    where ``G`` is the graph, ``u`` is the source, ``v`` is the target,
    and ``C`` is a float decay or importance factor between 0 and 1.

    The SimRank algorithm for determining node similarity is defined in
    [2]_.

    Parameters
    ----------
    G : NetworkX graph
        A NetworkX graph

    source : node
        If this is specified, the returned dictionary maps each node
        ``v`` in the graph to the similarity between ``source`` and
        ``v``.

    target : node
        If both ``source`` and ``target`` are specified, the similarity
        value between ``source`` and ``target`` is returned. If
        ``target`` is specified but ``source`` is not, this argument is
        ignored.

    importance_factor : float
        The relative importance of indirect neighbors with respect to
        direct neighbors.

    max_iterations : integer
        Maximum number of iterations.

    tolerance : float
        Error tolerance used to check convergence. When an iteration of
        the algorithm finds that no similarity value changes more than
        this amount, the algorithm halts.

    Returns
    -------
    similarity : dictionary or float
        If ``source`` and ``target`` are both ``None``, this returns a
        dictionary of dictionaries, where keys are node pairs and value
        are similarity of the pair of nodes.

        If ``source`` is not ``None`` but ``target`` is, this returns a
        dictionary mapping node to the similarity of ``source`` and that
        node.

        If neither ``source`` nor ``target`` is ``None``, this returns
        the similarity value for the given pair of nodes.

    Raises
    ------
    ExceededMaxIterations
        If the algorithm does not converge within ``max_iterations``.

    NodeNotFound
        If either ``source`` or ``target`` is not in `G`.

    Examples
    --------
    >>> G = nx.cycle_graph(2)
    >>> nx.simrank_similarity(G)
    {0: {0: 1.0, 1: 0.0}, 1: {0: 0.0, 1: 1.0}}
    >>> nx.simrank_similarity(G, source=0)
    {0: 1.0, 1: 0.0}
    >>> nx.simrank_similarity(G, source=0, target=0)
    1.0

    The result of this function can be converted to a numpy array
    representing the SimRank matrix by using the node order of the
    graph to determine which row and column represent each node.
    Other ordering of nodes is also possible.

    >>> import numpy as np
    >>> sim = nx.simrank_similarity(G)
    >>> np.array([[sim[u][v] for v in G] for u in G])
    array([[1., 0.],
           [0., 1.]])
    >>> sim_1d = nx.simrank_similarity(G, source=0)
    >>> np.array([sim_1d[v] for v in G])
    array([1., 0.])

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/SimRank
    .. [2] G. Jeh and J. Widom.
           "SimRank: a measure of structural-context similarity",
           In KDD'02: Proceedings of the Eighth ACM SIGKDD
           International Conference on Knowledge Discovery and Data Mining,
           pp. 538--543. ACM Press, 2002.
    """
    import numpy as np

    # Translate the (hashable) node identifiers into the positional
    # indices the numpy backend works with.
    nodelist = list(G)
    if source is not None:
        if source not in nodelist:
            raise nx.NodeNotFound(f"Source node {source} not in G")
        s_indx = nodelist.index(source)
    else:
        s_indx = None

    if target is not None:
        if target not in nodelist:
            raise nx.NodeNotFound(f"Target node {target} not in G")
        t_indx = nodelist.index(target)
    else:
        t_indx = None

    x = _simrank_similarity_numpy(
        G, s_indx, t_indx, importance_factor, max_iterations, tolerance
    )

    # The numpy helper returns a 2D array (all pairs), a 1D array (single
    # source), or a scalar (source and target given); convert back to the
    # documented dict-of-dicts / dict / float forms.
    if isinstance(x, np.ndarray):
        if x.ndim == 1:
            return dict(zip(G, x.tolist()))
        # else x.ndim == 2
        return {u: dict(zip(G, row)) for u, row in zip(G, x.tolist())}
    return float(x)

1340 

1341 

1342def _simrank_similarity_python( 

1343 G, 

1344 source=None, 

1345 target=None, 

1346 importance_factor=0.9, 

1347 max_iterations=1000, 

1348 tolerance=1e-4, 

1349): 

1350 """Returns the SimRank similarity of nodes in the graph ``G``. 

1351 

1352 This pure Python version is provided for pedagogical purposes. 

1353 

1354 Examples 

1355 -------- 

1356 >>> G = nx.cycle_graph(2) 

1357 >>> nx.similarity._simrank_similarity_python(G) 

1358 {0: {0: 1, 1: 0.0}, 1: {0: 0.0, 1: 1}} 

1359 >>> nx.similarity._simrank_similarity_python(G, source=0) 

1360 {0: 1, 1: 0.0} 

1361 >>> nx.similarity._simrank_similarity_python(G, source=0, target=0) 

1362 1 

1363 """ 

1364 # build up our similarity adjacency dictionary output 

1365 newsim = {u: {v: 1 if u == v else 0 for v in G} for u in G} 

1366 

1367 # These functions compute the update to the similarity value of the nodes 

1368 # `u` and `v` with respect to the previous similarity values. 

1369 def avg_sim(s): 

1370 return sum(newsim[w][x] for (w, x) in s) / len(s) if s else 0.0 

1371 

1372 Gadj = G.pred if G.is_directed() else G.adj 

1373 

1374 def sim(u, v): 

1375 return importance_factor * avg_sim(list(product(Gadj[u], Gadj[v]))) 

1376 

1377 for its in range(max_iterations): 

1378 oldsim = newsim 

1379 newsim = {u: {v: sim(u, v) if u != v else 1 for v in G} for u in G} 

1380 is_close = all( 

1381 all( 

1382 abs(newsim[u][v] - old) <= tolerance * (1 + abs(old)) 

1383 for v, old in nbrs.items() 

1384 ) 

1385 for u, nbrs in oldsim.items() 

1386 ) 

1387 if is_close: 

1388 break 

1389 

1390 if its + 1 == max_iterations: 

1391 raise nx.ExceededMaxIterations( 

1392 f"simrank did not converge after {max_iterations} iterations." 

1393 ) 

1394 

1395 if source is not None and target is not None: 

1396 return newsim[source][target] 

1397 if source is not None: 

1398 return newsim[source] 

1399 return newsim 

1400 

1401 

def _simrank_similarity_numpy(
    G,
    source=None,
    target=None,
    importance_factor=0.9,
    max_iterations=1000,
    tolerance=1e-4,
):
    """Calculate SimRank of nodes in ``G`` using matrices with ``numpy``.

    The SimRank algorithm for determining node similarity is defined in
    [1]_.

    Parameters
    ----------
    G : NetworkX graph
        A NetworkX graph

    source : node
        If this is specified, the returned dictionary maps each node
        ``v`` in the graph to the similarity between ``source`` and
        ``v``.

    target : node
        If both ``source`` and ``target`` are specified, the similarity
        value between ``source`` and ``target`` is returned. If
        ``target`` is specified but ``source`` is not, this argument is
        ignored.

    importance_factor : float
        The relative importance of indirect neighbors with respect to
        direct neighbors.

    max_iterations : integer
        Maximum number of iterations.

    tolerance : float
        Error tolerance used to check convergence. When an iteration of
        the algorithm finds that no similarity value changes more than
        this amount, the algorithm halts.

    Returns
    -------
    similarity : numpy array or float
        If ``source`` and ``target`` are both ``None``, this returns a
        2D array containing SimRank scores of the nodes.

        If ``source`` is not ``None`` but ``target`` is, this returns an
        1D array containing SimRank scores of ``source`` and that
        node.

        If neither ``source`` nor ``target`` is ``None``, this returns
        the similarity value for the given pair of nodes.

    Examples
    --------
    >>> G = nx.cycle_graph(2)
    >>> nx.similarity._simrank_similarity_numpy(G)
    array([[1., 0.],
           [0., 1.]])
    >>> nx.similarity._simrank_similarity_numpy(G, source=0)
    array([1., 0.])
    >>> nx.similarity._simrank_similarity_numpy(G, source=0, target=0)
    1.0

    References
    ----------
    .. [1] G. Jeh and J. Widom.
           "SimRank: a measure of structural-context similarity",
           In KDD'02: Proceedings of the Eighth ACM SIGKDD
           International Conference on Knowledge Discovery and Data Mining,
           pp. 538--543. ACM Press, 2002.
    """
    # Iterates the matrix form of SimRank:
    #
    #   S = max{C * (A.T @ S @ A), I}
    #
    # where C is the importance factor, A the column-normalized
    # adjacency matrix, and I the identity matrix.
    import numpy as np

    A = nx.to_numpy_array(G)

    # Column-normalize A; columns summing to zero get a divisor of 1 so
    # the division is a no-op instead of dividing by zero.
    col_sums = np.array(A.sum(axis=0))
    col_sums[col_sums == 0] = 1
    A /= col_sums

    sim = np.eye(len(G), dtype=np.float64)
    for iteration in range(max_iterations):
        previous = sim.copy()
        sim = importance_factor * (A.T @ previous @ A)
        # Self-similarity is pinned to 1 after every update.
        np.fill_diagonal(sim, 1.0)
        if np.allclose(previous, sim, atol=tolerance):
            break

    if iteration + 1 == max_iterations:
        raise nx.ExceededMaxIterations(
            f"simrank did not converge after {max_iterations} iterations."
        )

    if source is not None and target is not None:
        return float(sim[source, target])
    if source is not None:
        return sim[source]
    return sim

1509 

1510 

@np_random_state("seed")
def _prepare_panther_paths(
    G,
    source,
    path_length=5,
    c=0.5,
    delta=0.1,
    eps=None,
    weight="weight",
    remove_isolates=True,
    k=None,
    seed=None,
):
    """Common preparation code for Panther similarity algorithms.

    Validates the source node, samples random paths in ``G``, and builds
    the inverted node-to-paths index shared by `panther_similarity` and
    `panther_vector_similarity`.

    Parameters
    ----------
    G : NetworkX graph
        A NetworkX graph
    source : node
        Source node for similarity calculation
    path_length : int
        How long the randomly generated paths should be
    c : float
        A universal constant that controls the number of random paths to generate
    delta : float
        The probability parameter for similarity approximation
    eps : float or None
        The error bound for similarity approximation
    weight : string or None
        The name of an edge attribute that holds the numerical value used as a weight
    remove_isolates : bool
        Whether to remove isolated nodes from graph processing
    k : int or None
        The number of most similar nodes to return. If provided, validates that
        ``k`` is not greater than the number of nodes in the graph.
    seed : integer, random_state, or None (default)
        Indicator of random number generation state.
        See :ref:`Randomness<randomness>`.

    Returns
    -------
    tuple
        A tuple containing the prepared data:
        - G: The graph (possibly with isolates removed)
        - inv_node_map: Dictionary mapping node names to indices
        - index_map: Populated index map of paths
        - inv_sample_size: Inverse of sample size (for fast calculation)
        - eps: Error bound for similarity approximation
    """
    import numpy as np

    if source not in G:
        raise nx.NodeNotFound(f"Source node {source} not in G")

    isolates = set(nx.isolates(G))

    # An isolated source can never appear on a random path, so similarity
    # to it is undefined.
    if source in isolates:
        raise nx.NetworkXUnfeasible(
            f"Panther similarity is not defined for the isolated source node {source}."
        )

    if remove_isolates:
        G = G.subgraph(node for node in G if node not in isolates).copy()

    # According to [1], they empirically determined
    # a good value for ``eps`` to be sqrt( 1 / |E| )
    if eps is None:
        eps = np.sqrt(1.0 / G.number_of_edges())

    num_nodes = G.number_of_nodes()

    # Check if k is provided and validate it against the number of nodes.
    # NOTE(review): this early check only fires when isolates are kept
    # (the panther_vector_similarity call path); panther_similarity
    # re-validates k itself after isolate removal.
    if k is not None and not remove_isolates:  # For panther_vector_similarity
        if num_nodes < k:
            raise nx.NetworkXUnfeasible(
                f"The number of requested nodes {k} is greater than the number of nodes {num_nodes}."
            )

    inv_node_map = {name: index for index, name in enumerate(G)}

    # Calculate the sample size ``R`` for how many paths
    # to randomly generate
    t_choose_2 = math.comb(path_length, 2)
    sample_size = int((c / eps**2) * (np.log2(t_choose_2) + 1 + np.log(1 / delta)))
    index_map = {}

    # Check for isolated nodes before generating random paths
    # If there are still isolated nodes in the graph after filtering,
    # they will cause issues with path generation
    remaining_isolates = set(nx.isolates(G))
    if remaining_isolates:
        raise nx.NetworkXUnfeasible(
            f"Cannot generate random paths with isolated nodes present: {remaining_isolates}"
        )

    # Generate the random paths and populate the index_map; the generator
    # is exhausted purely for its side effect on index_map.
    for _ in generate_random_paths(
        G,
        sample_size,
        path_length=path_length,
        index_map=index_map,
        weight=weight,
        seed=seed,
    ):
        # NOTE: index_map is modified in-place by `generate_random_paths`
        pass

    return (
        G,  # The graph with isolated nodes removed
        inv_node_map,
        index_map,
        1 / sample_size,
        eps,
    )

1626 

1627 

@np_random_state("seed")
@nx._dispatchable(edge_attrs="weight")
def panther_similarity(
    G,
    source,
    k=5,
    path_length=5,
    c=0.5,
    delta=0.1,
    eps=None,
    weight="weight",
    seed=None,
):
    r"""Returns the Panther similarity of nodes in the graph `G` to node ``v``.

    Panther is a similarity metric that says "two objects are considered
    to be similar if they frequently appear on the same paths." [1]_.

    Parameters
    ----------
    G : NetworkX graph
        A NetworkX graph
    source : node
        Source node for which to find the top `k` similar other nodes
    k : int (default = 5)
        The number of most similar nodes to return.
    path_length : int (default = 5)
        How long the randomly generated paths should be (``T`` in [1]_)
    c : float (default = 0.5)
        A universal constant that controls the number of random paths to generate.
        Higher values increase the number of sample paths and potentially improve
        accuracy at the cost of more computation. Defaults to 0.5 as recommended
        in [1]_.
    delta : float (default = 0.1)
        The probability that the similarity $S$ is not an epsilon-approximation to (R, phi),
        where $R$ is the number of random paths and $\phi$ is the probability
        that an element sampled from a set $A \subseteq D$, where $D$ is the domain.
    eps : float or None (default = None)
        The error bound for similarity approximation. This controls the accuracy
        of the sampled paths in representing the true similarity. Smaller values
        yield more accurate results but require more sample paths. If `None`, a
        value of ``sqrt(1/|E|)`` is used, which the authors found empirically
        effective.
    weight : string or None, optional (default="weight")
        The name of an edge attribute that holds the numerical value
        used as a weight. If None then each edge has weight 1.
    seed : integer, random_state, or None (default)
        Indicator of random number generation state.
        See :ref:`Randomness<randomness>`.

    Returns
    -------
    similarity : dictionary
        Dictionary of nodes to similarity scores (as floats). Note:
        the self-similarity (i.e., ``v``) will not be included in
        the returned dictionary. So, for ``k = 5``, a dictionary of
        top 4 nodes and their similarity scores will be returned.

    Raises
    ------
    NetworkXUnfeasible
        If `source` is an isolated node.

    NodeNotFound
        If `source` is not in `G`.

    Notes
    -----
    The isolated nodes in `G` are ignored.

    Examples
    --------
    >>> G = nx.star_graph(10)
    >>> sim = nx.panther_similarity(G, 0)

    References
    ----------
    .. [1] Zhang, J., Tang, J., Ma, C., Tong, H., Jing, Y., & Li, J.
           Panther: Fast top-k similarity search on large networks.
           In Proceedings of the ACM SIGKDD International Conference
           on Knowledge Discovery and Data Mining (Vol. 2015-August, pp. 1445–1454).
           Association for Computing Machinery. https://doi.org/10.1145/2783258.2783267.
    """
    import numpy as np

    # Delegate path sampling and inverted-index construction to the
    # shared helper (which also strips isolated nodes).
    G, node_to_idx, index_map, inv_sample_size, eps = _prepare_panther_paths(
        G,
        source,
        path_length=path_length,
        c=c,
        delta=delta,
        eps=eps,
        weight=weight,
        k=k,
        seed=seed,
    )

    num_nodes = G.number_of_nodes()
    nodes = list(G.nodes)

    # Re-validate k against the node count of the isolate-free graph.
    if num_nodes < k:
        raise nx.NetworkXUnfeasible(
            f"The number of requested nodes {k} is greater than the number of nodes {num_nodes}."
        )

    # Similarity of ``source`` (v) to each node (v_j) is the fraction of
    # sampled paths on which both appear, computed from the inverted
    # vertex-to-paths index.
    scores = np.zeros(num_nodes)
    paths_through_source = set(index_map[source])
    for node, node_paths in index_map.items():
        shared_paths = paths_through_source.intersection(node_paths)
        scores[node_to_idx[node]] = len(shared_paths) * inv_sample_size

    # Take the ``k+1`` largest scores (the source itself is among them)
    # via argpartition, which is substantially faster than fully sorting
    # all scores with ``np.argsort(scores)[::-1]``.
    take = min(k + 1, num_nodes)
    candidate_idx = np.argpartition(scores, -take)[-take:]
    ordered_idx = candidate_idx[np.argsort(scores[candidate_idx])][::-1]

    # Pair nodes with their scores; ``tolist`` yields native Python
    # floats for dispatch compatibility.
    top_k_with_val = dict(
        zip((nodes[i] for i in ordered_idx), scores[ordered_idx].tolist())
    )

    # Self-similarity is not reported.
    top_k_with_val.pop(source, None)
    return top_k_with_val

1764 

1765 

1766@np_random_state("seed") 

1767@nx._dispatchable(edge_attrs="weight") 

1768def panther_vector_similarity( 

1769 G, 

1770 source, 

1771 *, 

1772 D=10, 

1773 k=5, 

1774 path_length=5, 

1775 c=0.5, 

1776 delta=0.1, 

1777 eps=None, 

1778 weight="weight", 

1779 seed=None, 

1780): 

1781 r"""Returns the Panther vector similarity (Panther++) of nodes in `G`. 

1782 

1783 Computes similarity between nodes based on the "Panther++" algorithm [1]_, which extends 

1784 the basic Panther algorithm by using feature vectors to better capture structural 

1785 similarity. 

1786 

1787 While basic Panther similarity measures how often two nodes appear on the same paths, 

1788 Panther vector similarity (Panther++) creates a ``D``-dimensional feature vector for each 

1789 node using its top similarity scores with other nodes, then computes similarity based 

1790 on the Euclidean distance between these feature vectors. This approach better captures 

1791 structural similarity and addresses the bias towards close neighbors present in 

1792 the original Panther algorithm. 

1793 

1794 This approach is preferred when: 

1795 

1796 1. You need better structural similarity than basic path co-occurrence 

1797 2. You want to overcome the close-neighbor bias of standard Panther 

1798 3. You're working with large graphs where k-d tree indexing would be beneficial 

1799 4. Graph edit distance-like similarity is more appropriate than path co-occurrence 

1800 

1801 Parameters 

1802 ---------- 

1803 G : NetworkX graph 

1804 A NetworkX graph 

1805 source : node 

1806 Source node for which to find the top ``k`` similar other nodes 

1807 D : int 

1808 The number of similarity scores to use (in descending order) 

1809 for each feature vector. Defaults to 10. Note that the original paper 

1810 used D=50 [1]_, but KDTree is optimized for lower dimensions. 

1811 k : int 

1812 The number of most similar nodes to return 

1813 path_length : int 

1814 How long the randomly generated paths should be (``T`` in [1]_) 

1815 c : float 

1816 A universal constant that controls the number of random paths to generate. 

1817 Higher values increase the number of sample paths and potentially improve 

1818 accuracy at the cost of more computation. Defaults to 0.5 as recommended 

1819 in [1]_. 

1820 delta : float 

1821 The probability that ``S`` is not an epsilon-approximation to (R, phi) 

1822 eps : float 

1823 The error bound for similarity approximation. This controls the accuracy 

1824 of the sampled paths in representing the true similarity. Smaller values 

1825 yield more accurate results but require more sample paths. If None, a 

1826 value of ``sqrt(1/|E|)`` is used, which the authors found empirically 

1827 effective. 

1828 weight : string or None, optional (default="weight") 

1829 The name of an edge attribute that holds the numerical value 

1830 used as a weight. If `None` then each edge has weight 1. 

1831 seed : integer, random_state, or None (default) 

1832 Indicator of random number generation state. 

1833 See :ref:`Randomness<randomness>`. 

1834 

1835 Returns 

1836 ------- 

1837 similarity : dict 

1838 Dict of nodes to similarity scores (as floats). 

1839 Note: the self-similarity (i.e., `node`) is not included in the dict. 

1840 

1841 Examples 

1842 -------- 

1843 >>> G = nx.star_graph(100) 

1844 

1845 The "hub" node is distinct from the "spoke" nodes 

1846 

1847 >>> from pprint import pprint 

1848 >>> pprint(nx.panther_vector_similarity(G, source=0, seed=42)) 

1849 {35: 0.10402634656233918, 

1850 61: 0.10434063328712018, 

1851 65: 0.10401247833456054, 

1852 85: 0.10506718868571752, 

1853 88: 0.10402634656233918} 

1854 

1855 But "spoke" nodes are similar to one another 

1856 

1857 >>> result = nx.panther_vector_similarity(G, source=1, seed=42) 

1858 >>> len(result) 

1859 5 

1860 >>> all(similarity == 1.0 for similarity in result.values()) 

1861 True 

1862 

1863 Notes 

1864 ----- 

1865 Results may be nondeterministic when feature vectors have the same distances, 

1866 as the KDTree's internal tie-breaking behavior can vary between runs. 

1867 Using the same ``seed`` parameter ensures reproducible results. 

1868 

1869 References 

1870 ---------- 

1871 .. [1] Zhang, J., Tang, J., Ma, C., Tong, H., Jing, Y., & Li, J. 

1872 Panther: Fast top-k similarity search on large networks. 

1873 In Proceedings of the ACM SIGKDD International Conference 

1874 on Knowledge Discovery and Data Mining (Vol. 2015-August, pp. 1445–1454). 

1875 Association for Computing Machinery. https://doi.org/10.1145/2783258.2783267. 

1876 """ 

1877 import numpy as np 

1878 import scipy as sp 

1879 

1880 # Use helper method to prepare common data structures but keep isolates in the graph 

1881 G, inv_node_map, index_map, inv_sample_size, eps = _prepare_panther_paths( 

1882 G, 

1883 source, 

1884 path_length=path_length, 

1885 c=c, 

1886 delta=delta, 

1887 eps=eps, 

1888 weight=weight, 

1889 remove_isolates=False, 

1890 k=k, 

1891 seed=seed, 

1892 ) 

1893 num_nodes = G.number_of_nodes() 

1894 node_list = list(G.nodes) 

1895 

1896 # Ensure D doesn't exceed the number of nodes 

1897 if num_nodes < D: 

1898 raise nx.NetworkXUnfeasible( 

1899 f"The number of requested similarity scores {D} is greater than the number of nodes {num_nodes}." 

1900 ) 

1901 

1902 similarities = np.zeros((num_nodes, num_nodes)) 

1903 theta = np.zeros((num_nodes, D)) 

1904 index_map_sets = {node: set(paths) for node, paths in index_map.items()} 

1905 

1906 # Calculate the path similarities for each node 

1907 for vi_idx, vi in enumerate(G.nodes): 

1908 vi_paths = index_map_sets[vi] 

1909 

1910 for node, node_paths in index_map_sets.items(): 

1911 # Calculate similarity score 

1912 common_path_count = len(vi_paths.intersection(node_paths)) 

1913 similarities[vi_idx, inv_node_map[node]] = ( 

1914 common_path_count * inv_sample_size 

1915 ) 

1916 

1917 # Build up the feature vector using the largest D similarity scores 

1918 theta[vi_idx] = np.sort(np.partition(similarities[vi_idx], -D)[-D:])[::-1] 

1919 

1920 # Insert the feature vectors into a k-d tree 

1921 # for fast retrieval 

1922 kdtree = sp.spatial.KDTree(theta) 

1923 

1924 # Retrieve top ``k+1`` similar vertices (i.e., vectors) 

1925 # (based on their Euclidean distance) 

1926 # Note that it's k+1 because the source node will be included and later removed 

1927 query_k = min(k + 1, num_nodes) 

1928 neighbor_distances, nearest_neighbors = kdtree.query( 

1929 theta[inv_node_map[source]], k=query_k 

1930 ) 

1931 

1932 # Ensure results are always arrays (KDTree returns scalars when k=1) 

1933 neighbor_distances = np.atleast_1d(neighbor_distances) 

1934 nearest_neighbors = np.atleast_1d(nearest_neighbors) 

1935 

1936 # The paper defines the similarity S(v_i, v_j) as 

1937 # 1 / || Theta(v_i) - Theta(v_j) || 

1938 # Calculate reciprocals and normalize to [0, 1] range 

1939 

1940 # Handle the case where distances are very small or zero (common in small graphs) 

1941 # Use the passed in eps parameter instead of defining a new epsilon 

1942 neighbor_distances = np.maximum(neighbor_distances, eps) 

1943 similarities = 1 / neighbor_distances 

1944 

1945 # Always normalize to ensure values are between 0 and 1 

1946 if len(similarities) > 0 and (max_sim := np.max(similarities)) > 0: 

1947 similarities /= max_sim 

1948 

1949 # Add back the similarity scores (i.e., distances) 

1950 # Convert numpy scalars to native Python types for dispatch compatibility 

1951 top_k_with_val = dict( 

1952 zip((node_list[n] for n in nearest_neighbors), similarities.tolist()) 

1953 ) 

1954 

1955 # Remove the self-similarity 

1956 top_k_with_val.pop(source, None) 

1957 

1958 # Ensure we return exactly k results (sorted by similarity) 

1959 if len(top_k_with_val) > k: 

1960 sorted_items = sorted(top_k_with_val.items(), key=lambda x: x[1], reverse=True) 

1961 top_k_with_val = dict(sorted_items[:k]) 

1962 

1963 return top_k_with_val 

1964 

1965 

@np_random_state("seed")
@nx._dispatchable(edge_attrs="weight")
def generate_random_paths(
    G,
    sample_size,
    path_length=5,
    index_map=None,
    weight="weight",
    seed=None,
    *,
    source=None,
):
    """Randomly generate `sample_size` paths of length `path_length`.

    Parameters
    ----------
    G : NetworkX graph
        A NetworkX graph
    sample_size : integer
        The number of paths to generate. This is ``R`` in [1]_.
    path_length : integer (default = 5)
        The maximum size of the path to randomly generate.
        This is ``T`` in [1]_. According to the paper, ``T >= 5`` is
        recommended.
    index_map : dictionary, optional
        If provided, this will be populated with the inverted
        index of nodes mapped to the set of generated random path
        indices within ``paths``.
    weight : string or None, optional (default="weight")
        The name of an edge attribute that holds the numerical value
        used as a weight. If None then each edge has weight 1.
    seed : integer, random_state, or None (default)
        Indicator of random number generation state.
        See :ref:`Randomness<randomness>`.
    source : node, optional
        Node to use as the starting point for all generated paths.
        If None then starting nodes are selected at random with uniform probability.

    Returns
    -------
    paths : generator of lists
        Generator of `sample_size` paths each with length `path_length`.

    Raises
    ------
    NodeNotFound
        If `source` is given but is not a node of `G`. Because this is a
        generator, the error is raised on first iteration, not at call time.

    Examples
    --------
    The generator yields `sample_size` number of paths of length `path_length`
    drawn from `G`:

    >>> G = nx.complete_graph(5)
    >>> next(nx.generate_random_paths(G, sample_size=1, path_length=3, seed=42))
    [3, 4, 2, 3]
    >>> list(nx.generate_random_paths(G, sample_size=3, path_length=4, seed=42))
    [[3, 4, 2, 3, 0], [2, 0, 2, 1, 0], [2, 0, 4, 3, 0]]

    By passing a dictionary into `index_map`, it will build an
    inverted index mapping of nodes to the paths in which that node is present:

    >>> G = nx.wheel_graph(10)
    >>> index_map = {}
    >>> random_paths = list(
    ...     nx.generate_random_paths(G, sample_size=3, index_map=index_map, seed=2771)
    ... )
    >>> random_paths
    [[3, 2, 1, 9, 8, 7], [4, 0, 5, 6, 7, 8], [3, 0, 5, 0, 9, 8]]
    >>> paths_containing_node_0 = [
    ...     random_paths[path_idx] for path_idx in index_map.get(0, [])
    ... ]
    >>> paths_containing_node_0
    [[4, 0, 5, 6, 7, 8], [3, 0, 5, 0, 9, 8]]

    References
    ----------
    .. [1] Zhang, J., Tang, J., Ma, C., Tong, H., Jing, Y., & Li, J.
       Panther: Fast top-k similarity search on large networks.
       In Proceedings of the ACM SIGKDD International Conference
       on Knowledge Discovery and Data Mining (Vol. 2015-August, pp. 1445–1454).
       Association for Computing Machinery. https://doi.org/10.1145/2783258.2783267.
    """
    import numpy as np

    # ``seed`` may be a legacy RandomState or a modern Generator; pick the
    # matching uniform-integer sampler for the random starting node.
    randint_fn = (
        seed.integers if isinstance(seed, np.random.Generator) else seed.randint
    )

    # Calculate transition probabilities between
    # every pair of vertices according to Eq. (3)
    adj_mat = nx.to_numpy_array(G, weight=weight)
    row_sums = adj_mat.sum(axis=1).reshape(-1, 1)
    # NOTE(review): rows that sum to zero (isolated nodes) are NOT handled
    # here — np.reciprocal yields inf for them, giving nan probabilities, and
    # a walk that reaches such a node will fail inside ``seed.choice``.
    inv_row_sums = np.reciprocal(row_sums)
    transition_probabilities = adj_mat * inv_row_sums

    node_map = list(G)
    num_nodes = G.number_of_nodes()

    # Validate and resolve the fixed starting node once, outside the sampling
    # loop — the original re-ran the O(n) ``list.index`` lookup per path.
    # Raising here still surfaces on first ``next()`` since this is a generator.
    if source is not None:
        if source not in node_map:
            raise nx.NodeNotFound(f"Initial node {source} not in G")
        source_index = node_map.index(source)

    for path_index in range(sample_size):
        if source is None:
            # Sample current vertex v = v_i uniformly at random
            node_index = randint_fn(num_nodes)
            node = node_map[node_index]
        else:
            node = source
            node_index = source_index

        # Add v into p_r and add p_r into the path set of v, i.e., P_v
        path = [node]

        # Build the inverted index (P_v) of vertices to paths
        if index_map is not None:
            index_map.setdefault(node, set()).add(path_index)

        starting_index = node_index
        for _ in range(path_length):
            # Randomly sample a neighbor (v_j) according
            # to transition probabilities from ``node`` (v) to its neighbors
            nbr_index = seed.choice(
                num_nodes, p=transition_probabilities[starting_index]
            )

            # Set current vertex (v = v_j)
            starting_index = nbr_index

            # Add v into p_r
            nbr_node = node_map[nbr_index]
            path.append(nbr_node)

            # Add p_r into P_v
            if index_map is not None:
                index_map.setdefault(nbr_node, set()).add(path_index)

        yield path