1"""Functions measuring similarity using graph edit distance.
2
3The graph edit distance is the number of edge/node changes needed
4to make two graphs isomorphic.
5
6The default algorithm/implementation is sub-optimal for some graphs.
7The problem of finding the exact Graph Edit Distance (GED) is NP-hard
8so it is often slow. If the simple interface `graph_edit_distance`
9takes too long for your graph, try `optimize_graph_edit_distance`
10and/or `optimize_edit_paths`.
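
For example, the first value yielded by the approximation generator is
already a cheap upper bound on the exact distance (a minimal sketch):

>>> import networkx as nx
>>> G1, G2 = nx.path_graph(4), nx.cycle_graph(4)
>>> upper_bound = next(nx.optimize_graph_edit_distance(G1, G2))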

At the same time, I encourage capable people to investigate
alternative GED algorithms, in order to improve the choices available.
"""

import math
import time
from dataclasses import dataclass
from itertools import product

import networkx as nx
from networkx.utils import np_random_state

__all__ = [
    "graph_edit_distance",
    "optimal_edit_paths",
    "optimize_graph_edit_distance",
    "optimize_edit_paths",
    "simrank_similarity",
    "panther_similarity",
    "panther_vector_similarity",
    "generate_random_paths",
]


@nx._dispatchable(
    graphs={"G1": 0, "G2": 1}, preserve_edge_attrs=True, preserve_node_attrs=True
)
def graph_edit_distance(
    G1,
    G2,
    node_match=None,
    edge_match=None,
    node_subst_cost=None,
    node_del_cost=None,
    node_ins_cost=None,
    edge_subst_cost=None,
    edge_del_cost=None,
    edge_ins_cost=None,
    roots=None,
    upper_bound=None,
    timeout=None,
):
54 """Returns GED (graph edit distance) between graphs G1 and G2.
55
56 Graph edit distance is a graph similarity measure analogous to
57 Levenshtein distance for strings. It is defined as minimum cost
58 of edit path (sequence of node and edge edit operations)
59 transforming graph G1 to graph isomorphic to G2.
60
    Parameters
    ----------
    G1, G2: graphs
        The two graphs G1 and G2 must be of the same type.

    node_match : callable
        A function that returns True if node n1 in G1 and n2 in G2
        should be considered equal during matching.

        The function will be called like

           node_match(G1.nodes[n1], G2.nodes[n2]).

        That is, the function will receive the node attribute
        dictionaries for n1 and n2 as inputs.

        Ignored if node_subst_cost is specified. If neither
        node_match nor node_subst_cost are specified then node
        attributes are not considered.

    edge_match : callable
        A function that returns True if the edge attribute dictionaries
        for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
        be considered equal during matching.

        The function will be called like

           edge_match(G1[u1][v1], G2[u2][v2]).

        That is, the function will receive the edge attribute
        dictionaries of the edges under consideration.

        Ignored if edge_subst_cost is specified. If neither
        edge_match nor edge_subst_cost are specified then edge
        attributes are not considered.

    node_subst_cost, node_del_cost, node_ins_cost : callable
        Functions that return the costs of node substitution, node
        deletion, and node insertion, respectively.

        The functions will be called like

           node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
           node_del_cost(G1.nodes[n1]),
           node_ins_cost(G2.nodes[n2]).

        That is, the functions will receive the node attribute
        dictionaries as inputs. The functions are expected to return
        positive numeric values.

        Function node_subst_cost overrides node_match if specified.
        If neither node_match nor node_subst_cost are specified then
        default node substitution cost of 0 is used (node attributes
        are not considered during matching).

        If node_del_cost is not specified then default node deletion
        cost of 1 is used. If node_ins_cost is not specified then
        default node insertion cost of 1 is used.

    edge_subst_cost, edge_del_cost, edge_ins_cost : callable
        Functions that return the costs of edge substitution, edge
        deletion, and edge insertion, respectively.

        The functions will be called like

           edge_subst_cost(G1[u1][v1], G2[u2][v2]),
           edge_del_cost(G1[u1][v1]),
           edge_ins_cost(G2[u2][v2]).

        That is, the functions will receive the edge attribute
        dictionaries as inputs. The functions are expected to return
        positive numeric values.

        Function edge_subst_cost overrides edge_match if specified.
        If neither edge_match nor edge_subst_cost are specified then
        default edge substitution cost of 0 is used (edge attributes
        are not considered during matching).

        If edge_del_cost is not specified then default edge deletion
        cost of 1 is used. If edge_ins_cost is not specified then
        default edge insertion cost of 1 is used.

    roots : 2-tuple
        Tuple where the first element is a node in G1 and the second
        is a node in G2.
        These nodes are forced to be matched in the comparison to
        allow comparison between rooted graphs.

    upper_bound : numeric
        Maximum edit distance to consider. Returns None if no edit
        distance less than or equal to upper_bound exists.

    timeout : numeric
        Maximum number of seconds to execute.
        After the timeout is met, the current best GED is returned.

    Examples
    --------
    >>> G1 = nx.cycle_graph(6)
    >>> G2 = nx.wheel_graph(7)
    >>> nx.graph_edit_distance(G1, G2)
    7.0

    >>> G1 = nx.star_graph(5)
    >>> G2 = nx.star_graph(5)
    >>> nx.graph_edit_distance(G1, G2, roots=(0, 0))
    0.0
    >>> nx.graph_edit_distance(G1, G2, roots=(1, 0))
    8.0
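
    Cost callables can encode attribute-aware matching. A minimal sketch
    (the ``"color"`` node attribute here is an illustrative assumption,
    not something this function requires):

    >>> def node_subst_cost(n1_attrs, n2_attrs):
    ...     # free substitution when the hypothetical colors agree
    ...     return 0 if n1_attrs.get("color") == n2_attrs.get("color") else 1
    >>> d = nx.graph_edit_distance(G1, G2, node_subst_cost=node_subst_cost)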

    See Also
    --------
    optimal_edit_paths, optimize_graph_edit_distance, optimize_edit_paths

    is_isomorphic: test for graph edit distance of 0

    References
    ----------
    .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
       Martineau. An Exact Graph Edit Distance Algorithm for Solving
       Pattern Recognition Problems. 4th International Conference on
       Pattern Recognition Applications and Methods 2015, Jan 2015,
       Lisbon, Portugal. 2015,
       <10.5220/0005209202710278>. <hal-01168816>
       https://hal.archives-ouvertes.fr/hal-01168816

    """
    bestcost = None
    for _, _, cost in optimize_edit_paths(
        G1,
        G2,
        node_match,
        edge_match,
        node_subst_cost,
        node_del_cost,
        node_ins_cost,
        edge_subst_cost,
        edge_del_cost,
        edge_ins_cost,
        upper_bound,
        True,
        roots,
        timeout,
    ):
        # assert bestcost is None or cost < bestcost
        bestcost = cost
    return bestcost


@nx._dispatchable(graphs={"G1": 0, "G2": 1})
def optimal_edit_paths(
    G1,
    G2,
    node_match=None,
    edge_match=None,
    node_subst_cost=None,
    node_del_cost=None,
    node_ins_cost=None,
    edge_subst_cost=None,
    edge_del_cost=None,
    edge_ins_cost=None,
    upper_bound=None,
):
224 """Returns all minimum-cost edit paths transforming G1 to G2.
225
226 Graph edit path is a sequence of node and edge edit operations
227 transforming graph G1 to graph isomorphic to G2. Edit operations
228 include substitutions, deletions, and insertions.

    Parameters
    ----------
    G1, G2: graphs
        The two graphs G1 and G2 must be of the same type.

    node_match : callable
        A function that returns True if node n1 in G1 and n2 in G2
        should be considered equal during matching.

        The function will be called like

           node_match(G1.nodes[n1], G2.nodes[n2]).

        That is, the function will receive the node attribute
        dictionaries for n1 and n2 as inputs.

        Ignored if node_subst_cost is specified. If neither
        node_match nor node_subst_cost are specified then node
        attributes are not considered.

    edge_match : callable
        A function that returns True if the edge attribute dictionaries
        for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
        be considered equal during matching.

        The function will be called like

           edge_match(G1[u1][v1], G2[u2][v2]).

        That is, the function will receive the edge attribute
        dictionaries of the edges under consideration.

        Ignored if edge_subst_cost is specified. If neither
        edge_match nor edge_subst_cost are specified then edge
        attributes are not considered.

    node_subst_cost, node_del_cost, node_ins_cost : callable
        Functions that return the costs of node substitution, node
        deletion, and node insertion, respectively.

        The functions will be called like

           node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
           node_del_cost(G1.nodes[n1]),
           node_ins_cost(G2.nodes[n2]).

        That is, the functions will receive the node attribute
        dictionaries as inputs. The functions are expected to return
        positive numeric values.

        Function node_subst_cost overrides node_match if specified.
        If neither node_match nor node_subst_cost are specified then
        default node substitution cost of 0 is used (node attributes
        are not considered during matching).

        If node_del_cost is not specified then default node deletion
        cost of 1 is used. If node_ins_cost is not specified then
        default node insertion cost of 1 is used.

    edge_subst_cost, edge_del_cost, edge_ins_cost : callable
        Functions that return the costs of edge substitution, edge
        deletion, and edge insertion, respectively.

        The functions will be called like

           edge_subst_cost(G1[u1][v1], G2[u2][v2]),
           edge_del_cost(G1[u1][v1]),
           edge_ins_cost(G2[u2][v2]).

        That is, the functions will receive the edge attribute
        dictionaries as inputs. The functions are expected to return
        positive numeric values.

        Function edge_subst_cost overrides edge_match if specified.
        If neither edge_match nor edge_subst_cost are specified then
        default edge substitution cost of 0 is used (edge attributes
        are not considered during matching).

        If edge_del_cost is not specified then default edge deletion
        cost of 1 is used. If edge_ins_cost is not specified then
        default edge insertion cost of 1 is used.

    upper_bound : numeric
        Maximum edit distance to consider.

    Returns
    -------
    edit_paths : list of tuples (node_edit_path, edge_edit_path)
        - node_edit_path : list of tuples ``(u, v)`` indicating node transformations
          between `G1` and `G2`. ``u`` is `None` for insertion, ``v`` is `None`
          for deletion.
        - edge_edit_path : list of tuples ``((u1, v1), (u2, v2))`` indicating edge
          transformations between `G1` and `G2`. ``(None, (u2,v2))`` for insertion
          and ``((u1,v1), None)`` for deletion.

    cost : numeric
        Optimal edit path cost (graph edit distance). When the cost
        is zero, it indicates that `G1` and `G2` are isomorphic.

    Examples
    --------
    >>> G1 = nx.cycle_graph(4)
    >>> G2 = nx.wheel_graph(5)
    >>> paths, cost = nx.optimal_edit_paths(G1, G2)
    >>> len(paths)
    40
    >>> cost
    5.0
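
    Each path pairs a node edit list with an edge edit list; by
    construction every entry maps an element of `G1`, an element of
    `G2`, or both:

    >>> node_path, edge_path = paths[0]
    >>> all(u is not None or v is not None for u, v in node_path)
    True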

    Notes
    -----
    To transform `G1` into a graph isomorphic to `G2`, apply the node
    and edge edits in the returned ``edit_paths``.
    In the case of isomorphic graphs, the cost is zero, and the paths
    represent different isomorphic mappings (isomorphisms). That is, the
    edits involve renaming nodes and edges to match the structure of `G2`.

    See Also
    --------
    graph_edit_distance, optimize_edit_paths

    References
    ----------
    .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
       Martineau. An Exact Graph Edit Distance Algorithm for Solving
       Pattern Recognition Problems. 4th International Conference on
       Pattern Recognition Applications and Methods 2015, Jan 2015,
       Lisbon, Portugal. 2015,
       <10.5220/0005209202710278>. <hal-01168816>
       https://hal.archives-ouvertes.fr/hal-01168816

    """
    paths = []
    bestcost = None
    for vertex_path, edge_path, cost in optimize_edit_paths(
        G1,
        G2,
        node_match,
        edge_match,
        node_subst_cost,
        node_del_cost,
        node_ins_cost,
        edge_subst_cost,
        edge_del_cost,
        edge_ins_cost,
        upper_bound,
        False,
    ):
        # assert bestcost is None or cost <= bestcost
        if bestcost is not None and cost < bestcost:
            paths = []
        paths.append((vertex_path, edge_path))
        bestcost = cost
    return paths, bestcost


@nx._dispatchable(graphs={"G1": 0, "G2": 1})
def optimize_graph_edit_distance(
    G1,
    G2,
    node_match=None,
    edge_match=None,
    node_subst_cost=None,
    node_del_cost=None,
    node_ins_cost=None,
    edge_subst_cost=None,
    edge_del_cost=None,
    edge_ins_cost=None,
    upper_bound=None,
):
400 """Returns consecutive approximations of GED (graph edit distance)
401 between graphs G1 and G2.
402
403 Graph edit distance is a graph similarity measure analogous to
404 Levenshtein distance for strings. It is defined as minimum cost
405 of edit path (sequence of node and edge edit operations)
406 transforming graph G1 to graph isomorphic to G2.

    Parameters
    ----------
    G1, G2: graphs
        The two graphs G1 and G2 must be of the same type.

    node_match : callable
        A function that returns True if node n1 in G1 and n2 in G2
        should be considered equal during matching.

        The function will be called like

           node_match(G1.nodes[n1], G2.nodes[n2]).

        That is, the function will receive the node attribute
        dictionaries for n1 and n2 as inputs.

        Ignored if node_subst_cost is specified. If neither
        node_match nor node_subst_cost are specified then node
        attributes are not considered.

    edge_match : callable
        A function that returns True if the edge attribute dictionaries
        for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
        be considered equal during matching.

        The function will be called like

           edge_match(G1[u1][v1], G2[u2][v2]).

        That is, the function will receive the edge attribute
        dictionaries of the edges under consideration.

        Ignored if edge_subst_cost is specified. If neither
        edge_match nor edge_subst_cost are specified then edge
        attributes are not considered.

    node_subst_cost, node_del_cost, node_ins_cost : callable
        Functions that return the costs of node substitution, node
        deletion, and node insertion, respectively.

        The functions will be called like

           node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
           node_del_cost(G1.nodes[n1]),
           node_ins_cost(G2.nodes[n2]).

        That is, the functions will receive the node attribute
        dictionaries as inputs. The functions are expected to return
        positive numeric values.

        Function node_subst_cost overrides node_match if specified.
        If neither node_match nor node_subst_cost are specified then
        default node substitution cost of 0 is used (node attributes
        are not considered during matching).

        If node_del_cost is not specified then default node deletion
        cost of 1 is used. If node_ins_cost is not specified then
        default node insertion cost of 1 is used.

    edge_subst_cost, edge_del_cost, edge_ins_cost : callable
        Functions that return the costs of edge substitution, edge
        deletion, and edge insertion, respectively.

        The functions will be called like

           edge_subst_cost(G1[u1][v1], G2[u2][v2]),
           edge_del_cost(G1[u1][v1]),
           edge_ins_cost(G2[u2][v2]).

        That is, the functions will receive the edge attribute
        dictionaries as inputs. The functions are expected to return
        positive numeric values.

        Function edge_subst_cost overrides edge_match if specified.
        If neither edge_match nor edge_subst_cost are specified then
        default edge substitution cost of 0 is used (edge attributes
        are not considered during matching).

        If edge_del_cost is not specified then default edge deletion
        cost of 1 is used. If edge_ins_cost is not specified then
        default edge insertion cost of 1 is used.

    upper_bound : numeric
        Maximum edit distance to consider.

    Returns
    -------
    Generator of consecutive approximations of graph edit distance.

    Examples
    --------
    >>> G1 = nx.cycle_graph(6)
    >>> G2 = nx.wheel_graph(7)
    >>> for v in nx.optimize_graph_edit_distance(G1, G2):
    ...     minv = v
    >>> minv
    7.0
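
    The first approximation alone is a cheap upper bound on the exact
    distance (a minimal sketch; later values can only improve on it):

    >>> bound = next(nx.optimize_graph_edit_distance(G1, G2))
    >>> bound >= minv
    True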

    See Also
    --------
    graph_edit_distance, optimize_edit_paths

    References
    ----------
    .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
       Martineau. An Exact Graph Edit Distance Algorithm for Solving
       Pattern Recognition Problems. 4th International Conference on
       Pattern Recognition Applications and Methods 2015, Jan 2015,
       Lisbon, Portugal. 2015,
       <10.5220/0005209202710278>. <hal-01168816>
       https://hal.archives-ouvertes.fr/hal-01168816
    """
    for _, _, cost in optimize_edit_paths(
        G1,
        G2,
        node_match,
        edge_match,
        node_subst_cost,
        node_del_cost,
        node_ins_cost,
        edge_subst_cost,
        edge_del_cost,
        edge_ins_cost,
        upper_bound,
        True,
    ):
        yield cost


@nx._dispatchable(
    graphs={"G1": 0, "G2": 1}, preserve_edge_attrs=True, preserve_node_attrs=True
)
def optimize_edit_paths(
    G1,
    G2,
    node_match=None,
    edge_match=None,
    node_subst_cost=None,
    node_del_cost=None,
    node_ins_cost=None,
    edge_subst_cost=None,
    edge_del_cost=None,
    edge_ins_cost=None,
    upper_bound=None,
    strictly_decreasing=True,
    roots=None,
    timeout=None,
):
556 """GED (graph edit distance) calculation: advanced interface.
557
558 Graph edit path is a sequence of node and edge edit operations
559 transforming graph G1 to graph isomorphic to G2. Edit operations
560 include substitutions, deletions, and insertions.
561
562 Graph edit distance is defined as minimum cost of edit path.
563
    Parameters
    ----------
    G1, G2: graphs
        The two graphs G1 and G2 must be of the same type.

    node_match : callable
        A function that returns True if node n1 in G1 and n2 in G2
        should be considered equal during matching.

        The function will be called like

           node_match(G1.nodes[n1], G2.nodes[n2]).

        That is, the function will receive the node attribute
        dictionaries for n1 and n2 as inputs.

        Ignored if node_subst_cost is specified. If neither
        node_match nor node_subst_cost are specified then node
        attributes are not considered.

    edge_match : callable
        A function that returns True if the edge attribute dictionaries
        for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
        be considered equal during matching.

        The function will be called like

           edge_match(G1[u1][v1], G2[u2][v2]).

        That is, the function will receive the edge attribute
        dictionaries of the edges under consideration.

        Ignored if edge_subst_cost is specified. If neither
        edge_match nor edge_subst_cost are specified then edge
        attributes are not considered.

    node_subst_cost, node_del_cost, node_ins_cost : callable
        Functions that return the costs of node substitution, node
        deletion, and node insertion, respectively.

        The functions will be called like

           node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
           node_del_cost(G1.nodes[n1]),
           node_ins_cost(G2.nodes[n2]).

        That is, the functions will receive the node attribute
        dictionaries as inputs. The functions are expected to return
        positive numeric values.

        Function node_subst_cost overrides node_match if specified.
        If neither node_match nor node_subst_cost are specified then
        default node substitution cost of 0 is used (node attributes
        are not considered during matching).

        If node_del_cost is not specified then default node deletion
        cost of 1 is used. If node_ins_cost is not specified then
        default node insertion cost of 1 is used.

    edge_subst_cost, edge_del_cost, edge_ins_cost : callable
        Functions that return the costs of edge substitution, edge
        deletion, and edge insertion, respectively.

        The functions will be called like

           edge_subst_cost(G1[u1][v1], G2[u2][v2]),
           edge_del_cost(G1[u1][v1]),
           edge_ins_cost(G2[u2][v2]).

        That is, the functions will receive the edge attribute
        dictionaries as inputs. The functions are expected to return
        positive numeric values.

        Function edge_subst_cost overrides edge_match if specified.
        If neither edge_match nor edge_subst_cost are specified then
        default edge substitution cost of 0 is used (edge attributes
        are not considered during matching).

        If edge_del_cost is not specified then default edge deletion
        cost of 1 is used. If edge_ins_cost is not specified then
        default edge insertion cost of 1 is used.

    upper_bound : numeric
        Maximum edit distance to consider.

    strictly_decreasing : bool
        If True, return consecutive approximations of strictly
        decreasing cost. Otherwise, return all edit paths of cost
        less than or equal to the previous minimum cost.

    roots : 2-tuple
        Tuple where the first element is a node in G1 and the second
        is a node in G2.
        These nodes are forced to be matched in the comparison to
        allow comparison between rooted graphs.

    timeout : numeric
        Maximum number of seconds to execute.
        After the timeout is met, the current best GED is returned.

    Returns
    -------
    Generator of tuples (node_edit_path, edge_edit_path, cost)
        node_edit_path : list of tuples (u, v)
        edge_edit_path : list of tuples ((u1, v1), (u2, v2))
        cost : numeric
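
    Examples
    --------
    Iterating consumes successively better edit paths; with the default
    ``strictly_decreasing=True`` the last tuple yielded is optimal
    (a minimal sketch):

    >>> G1, G2 = nx.cycle_graph(4), nx.path_graph(4)
    >>> for node_path, edge_path, cost in nx.optimize_edit_paths(G1, G2):
    ...     best = node_path, edge_path, cost
    >>> best[2]
    1.0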

    See Also
    --------
    graph_edit_distance, optimize_graph_edit_distance, optimal_edit_paths

    References
    ----------
    .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
       Martineau. An Exact Graph Edit Distance Algorithm for Solving
       Pattern Recognition Problems. 4th International Conference on
       Pattern Recognition Applications and Methods 2015, Jan 2015,
       Lisbon, Portugal. 2015,
       <10.5220/0005209202710278>. <hal-01168816>
       https://hal.archives-ouvertes.fr/hal-01168816

    """
    # TODO: support DiGraph

    import numpy as np
    import scipy as sp

    @dataclass
    class CostMatrix:
        C: ...
        lsa_row_ind: ...
        lsa_col_ind: ...
        ls: ...

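    # A CostMatrix bundles a square cost matrix ``C`` with its optimal
    # linear sum assignment (``lsa_row_ind``/``lsa_col_ind``) and that
    # assignment's total cost ``ls``, which acts as an admissible lower
    # bound when the search below prunes partial edit paths.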
    def make_CostMatrix(C, m, n):
        # assert(C.shape == (m + n, m + n))
        lsa_row_ind, lsa_col_ind = sp.optimize.linear_sum_assignment(C)

        # Fixup dummy assignments:
        # each substitution i<->j should have dummy assignment m+j<->n+i
        # NOTE: fast reduce of Cv relies on it
        # Create masks for substitution and dummy indices
        is_subst = (lsa_row_ind < m) & (lsa_col_ind < n)
        is_dummy = (lsa_row_ind >= m) & (lsa_col_ind >= n)

        # Map dummy assignments to the correct indices
        lsa_row_ind[is_dummy] = lsa_col_ind[is_subst] + m
        lsa_col_ind[is_dummy] = lsa_row_ind[is_subst] + n

        return CostMatrix(
            C, lsa_row_ind, lsa_col_ind, C[lsa_row_ind, lsa_col_ind].sum()
        )

    def extract_C(C, i, j, m, n):
        # assert(C.shape == (m + n, m + n))
        row_ind = [k in i or k - m in j for k in range(m + n)]
        col_ind = [k in j or k - n in i for k in range(m + n)]
        return C[row_ind, :][:, col_ind]

    def reduce_C(C, i, j, m, n):
        # assert(C.shape == (m + n, m + n))
        row_ind = [k not in i and k - m not in j for k in range(m + n)]
        col_ind = [k not in j and k - n not in i for k in range(m + n)]
        return C[row_ind, :][:, col_ind]

    def reduce_ind(ind, i):
        # assert set(ind) == set(range(len(ind)))
        rind = ind[[k not in i for k in ind]]
        for k in set(i):
            rind[rind >= k] -= 1
        return rind

    def match_edges(u, v, pending_g, pending_h, Ce, matched_uv=None):
        """
        Parameters:
            u, v: matched vertices, u=None or v=None for
                deletion/insertion
            pending_g, pending_h: lists of edges not yet mapped
            Ce: CostMatrix of pending edge mappings
            matched_uv: partial vertex edit path
                list of tuples (u, v) of previously matched vertex
                mappings u<->v, u=None or v=None for
                deletion/insertion

        Returns:
            list of (i, j): indices of edge mappings g<->h
            localCe: local CostMatrix of edge mappings
                (basically submatrix of Ce at cross of rows i, cols j)
        """
        M = len(pending_g)
        N = len(pending_h)
        # assert Ce.C.shape == (M + N, M + N)

        # only attempt to match edges after one node match has been made
        # this will stop self-edges on the first node being automatically deleted
        # even when a substitution is the better option

        substitution_possible = M and N
        no_node_match_yet = matched_uv is None or len(matched_uv) == 0
        if no_node_match_yet and substitution_possible:
            g_ind = []
            h_ind = []
        else:
            g_ind = [
                i
                for i in range(M)
                if pending_g[i][:2] == (u, u)
                or any(
                    pending_g[i][:2] in ((p, u), (u, p), (p, p)) for p, q in matched_uv
                )
            ]
            h_ind = [
                j
                for j in range(N)
                if pending_h[j][:2] == (v, v)
                or any(
                    pending_h[j][:2] in ((q, v), (v, q), (q, q)) for p, q in matched_uv
                )
            ]

        m = len(g_ind)
        n = len(h_ind)

        if m or n:
            C = extract_C(Ce.C, g_ind, h_ind, M, N)
            # assert C.shape == (m + n, m + n)

            # Forbid structurally invalid matches
            # NOTE: inf remembered from Ce construction
            for k, i in enumerate(g_ind):
                g = pending_g[i][:2]
                for l, j in enumerate(h_ind):
                    h = pending_h[j][:2]
                    if nx.is_directed(G1) or nx.is_directed(G2):
                        if any(
                            g == (p, u) and h == (q, v) or g == (u, p) and h == (v, q)
                            for p, q in matched_uv
                        ):
                            continue
                    else:
                        if any(
                            g in ((p, u), (u, p)) and h in ((q, v), (v, q))
                            for p, q in matched_uv
                        ):
                            continue
                    if g == (u, u) or any(g == (p, p) for p, q in matched_uv):
                        continue
                    if h == (v, v) or any(h == (q, q) for p, q in matched_uv):
                        continue
                    C[k, l] = inf

            localCe = make_CostMatrix(C, m, n)
            ij = [
                (
                    g_ind[k] if k < m else M + h_ind[l],
                    h_ind[l] if l < n else N + g_ind[k],
                )
                for k, l in zip(localCe.lsa_row_ind, localCe.lsa_col_ind)
                if k < m or l < n
            ]

        else:
            ij = []
            localCe = CostMatrix(np.empty((0, 0)), [], [], 0)

        return ij, localCe

    def reduce_Ce(Ce, ij, m, n):
        if len(ij):
            i, j = zip(*ij)
            m_i = m - sum(1 for t in i if t < m)
            n_j = n - sum(1 for t in j if t < n)
            return make_CostMatrix(reduce_C(Ce.C, i, j, m, n), m_i, n_j)
        return Ce

    def get_edit_ops(
        matched_uv, pending_u, pending_v, Cv, pending_g, pending_h, Ce, matched_cost
    ):
        """
        Parameters:
            matched_uv: partial vertex edit path
                list of tuples (u, v) of vertex mappings u<->v,
                u=None or v=None for deletion/insertion
            pending_u, pending_v: lists of vertices not yet mapped
            Cv: CostMatrix of pending vertex mappings
            pending_g, pending_h: lists of edges not yet mapped
            Ce: CostMatrix of pending edge mappings
            matched_cost: cost of partial edit path

        Returns:
            sequence of
                (i, j): indices of vertex mapping u<->v
                Cv_ij: reduced CostMatrix of pending vertex mappings
                    (basically Cv with row i, col j removed)
                list of (x, y): indices of edge mappings g<->h
                Ce_xy: reduced CostMatrix of pending edge mappings
                    (basically Ce with rows x, cols y removed)
                cost: total cost of edit operation
            NOTE: most promising ops first
        """
        m = len(pending_u)
        n = len(pending_v)
        # assert Cv.C.shape == (m + n, m + n)

        # 1) a vertex mapping from optimal linear sum assignment
        i, j = min(
            (k, l) for k, l in zip(Cv.lsa_row_ind, Cv.lsa_col_ind) if k < m or l < n
        )
        xy, localCe = match_edges(
            pending_u[i] if i < m else None,
            pending_v[j] if j < n else None,
            pending_g,
            pending_h,
            Ce,
            matched_uv,
        )
        Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h))
        # assert Ce.ls <= localCe.ls + Ce_xy.ls
        if prune(matched_cost + Cv.ls + localCe.ls + Ce_xy.ls):
            pass
        else:
            # get reduced Cv efficiently
            Cv_ij = CostMatrix(
                reduce_C(Cv.C, (i,), (j,), m, n),
                reduce_ind(Cv.lsa_row_ind, (i, m + j)),
                reduce_ind(Cv.lsa_col_ind, (j, n + i)),
                Cv.ls - Cv.C[i, j],
            )
            yield (i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls

        # 2) other candidates, sorted by lower-bound cost estimate
        other = []
        fixed_i, fixed_j = i, j
        if m <= n:
            candidates = (
                (t, fixed_j)
                for t in range(m + n)
                if t != fixed_i and (t < m or t == m + fixed_j)
            )
        else:
            candidates = (
                (fixed_i, t)
                for t in range(m + n)
                if t != fixed_j and (t < n or t == n + fixed_i)
            )
        for i, j in candidates:
            if prune(matched_cost + Cv.C[i, j] + Ce.ls):
                continue
            Cv_ij = make_CostMatrix(
                reduce_C(Cv.C, (i,), (j,), m, n),
                m - 1 if i < m else m,
                n - 1 if j < n else n,
            )
            # assert Cv.ls <= Cv.C[i, j] + Cv_ij.ls
            if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + Ce.ls):
                continue
            xy, localCe = match_edges(
                pending_u[i] if i < m else None,
                pending_v[j] if j < n else None,
                pending_g,
                pending_h,
                Ce,
                matched_uv,
            )
            if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + localCe.ls):
                continue
            Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h))
            # assert Ce.ls <= localCe.ls + Ce_xy.ls
            if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + localCe.ls + Ce_xy.ls):
                continue
            other.append(((i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls))

        yield from sorted(other, key=lambda t: t[4] + t[1].ls + t[3].ls)

    def get_edit_paths(
        matched_uv,
        pending_u,
        pending_v,
        Cv,
        matched_gh,
        pending_g,
        pending_h,
        Ce,
        matched_cost,
    ):
        """
        Parameters:
            matched_uv: partial vertex edit path
                list of tuples (u, v) of vertex mappings u<->v,
                u=None or v=None for deletion/insertion
            pending_u, pending_v: lists of vertices not yet mapped
            Cv: CostMatrix of pending vertex mappings
            matched_gh: partial edge edit path
                list of tuples (g, h) of edge mappings g<->h,
                g=None or h=None for deletion/insertion
            pending_g, pending_h: lists of edges not yet mapped
            Ce: CostMatrix of pending edge mappings
            matched_cost: cost of partial edit path

        Returns:
            sequence of (vertex_path, edge_path, cost)
                vertex_path: complete vertex edit path
                    list of tuples (u, v) of vertex mappings u<->v,
                    u=None or v=None for deletion/insertion
                edge_path: complete edge edit path
                    list of tuples (g, h) of edge mappings g<->h,
                    g=None or h=None for deletion/insertion
                cost: total cost of edit path
            NOTE: path costs are non-increasing
        """
        if prune(matched_cost + Cv.ls + Ce.ls):
            return

        if not max(len(pending_u), len(pending_v)):
            # assert not len(pending_g)
            # assert not len(pending_h)
            # path completed!
            # assert matched_cost <= maxcost_value
            nonlocal maxcost_value
            maxcost_value = min(maxcost_value, matched_cost)
            yield matched_uv, matched_gh, matched_cost

        else:
            edit_ops = get_edit_ops(
                matched_uv,
                pending_u,
                pending_v,
                Cv,
                pending_g,
                pending_h,
                Ce,
                matched_cost,
            )
            for ij, Cv_ij, xy, Ce_xy, edit_cost in edit_ops:
                i, j = ij
                # assert Cv.C[i, j] + sum(Ce.C[t] for t in xy) == edit_cost
                if prune(matched_cost + edit_cost + Cv_ij.ls + Ce_xy.ls):
                    continue

                # dive deeper
                u = pending_u.pop(i) if i < len(pending_u) else None
                v = pending_v.pop(j) if j < len(pending_v) else None
                matched_uv.append((u, v))
                for x, y in xy:
                    len_g = len(pending_g)
                    len_h = len(pending_h)
                    matched_gh.append(
                        (
                            pending_g[x] if x < len_g else None,
                            pending_h[y] if y < len_h else None,
                        )
                    )
                sortedx = sorted(x for x, y in xy)
                sortedy = sorted(y for x, y in xy)
                G = [
                    (pending_g.pop(x) if x < len(pending_g) else None)
                    for x in reversed(sortedx)
                ]
                H = [
                    (pending_h.pop(y) if y < len(pending_h) else None)
                    for y in reversed(sortedy)
                ]

                yield from get_edit_paths(
                    matched_uv,
                    pending_u,
                    pending_v,
                    Cv_ij,
                    matched_gh,
                    pending_g,
                    pending_h,
                    Ce_xy,
                    matched_cost + edit_cost,
                )

                # backtrack
                if u is not None:
                    pending_u.insert(i, u)
                if v is not None:
                    pending_v.insert(j, v)
                matched_uv.pop()
                for x, g in zip(sortedx, reversed(G)):
                    if g is not None:
                        pending_g.insert(x, g)
                for y, h in zip(sortedy, reversed(H)):
                    if h is not None:
                        pending_h.insert(y, h)
                for _ in xy:
                    matched_gh.pop()

    # Initialization

    pending_u = list(G1.nodes)
    pending_v = list(G2.nodes)

    initial_cost = 0
    if roots:
        root_u, root_v = roots
        if root_u not in pending_u or root_v not in pending_v:
            raise nx.NodeNotFound("Root node not in graph.")

        # remove roots from pending
        pending_u.remove(root_u)
        pending_v.remove(root_v)

    # cost matrix of vertex mappings
    m = len(pending_u)
    n = len(pending_v)
    C = np.zeros((m + n, m + n))
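    # Block layout of the (m + n) x (m + n) cost matrix (the same layout
    # is reused for the edge cost matrix Ce further below):
    #   C[0:m, 0:n]      substitution costs
    #   C[0:m, n:n+m]    deletion costs on the diagonal, inf elsewhere
    #   C[m:m+n, 0:n]    insertion costs on the diagonal, inf elsewhere
    #   C[m:m+n, n:n+m]  zeros, i.e. dummy <-> dummy assignments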
    if node_subst_cost:
        C[0:m, 0:n] = np.array(
            [
                node_subst_cost(G1.nodes[u], G2.nodes[v])
                for u in pending_u
                for v in pending_v
            ]
        ).reshape(m, n)
        if roots:
            initial_cost = node_subst_cost(G1.nodes[root_u], G2.nodes[root_v])
    elif node_match:
        C[0:m, 0:n] = np.array(
            [
                1 - int(node_match(G1.nodes[u], G2.nodes[v]))
                for u in pending_u
                for v in pending_v
            ]
        ).reshape(m, n)
        if roots:
            initial_cost = 1 - node_match(G1.nodes[root_u], G2.nodes[root_v])
    else:
        # all zeroes
        pass
    # assert not min(m, n) or C[0:m, 0:n].min() >= 0
    if node_del_cost:
        del_costs = [node_del_cost(G1.nodes[u]) for u in pending_u]
    else:
        del_costs = [1] * len(pending_u)
    # assert not m or min(del_costs) >= 0
    if node_ins_cost:
        ins_costs = [node_ins_cost(G2.nodes[v]) for v in pending_v]
    else:
        ins_costs = [1] * len(pending_v)
    # assert not n or min(ins_costs) >= 0
    inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1
    C[0:m, n : n + m] = np.array(
        [del_costs[i] if i == j else inf for i in range(m) for j in range(m)]
    ).reshape(m, m)
    C[m : m + n, 0:n] = np.array(
        [ins_costs[i] if i == j else inf for i in range(n) for j in range(n)]
    ).reshape(n, n)
    Cv = make_CostMatrix(C, m, n)

    pending_g = list(G1.edges)
    pending_h = list(G2.edges)

    # cost matrix of edge mappings
    m = len(pending_g)
    n = len(pending_h)
    C = np.zeros((m + n, m + n))
    if edge_subst_cost:
        C[0:m, 0:n] = np.array(
            [
                edge_subst_cost(G1.edges[g], G2.edges[h])
                for g in pending_g
                for h in pending_h
            ]
        ).reshape(m, n)
    elif edge_match:
        C[0:m, 0:n] = np.array(
            [
                1 - int(edge_match(G1.edges[g], G2.edges[h]))
                for g in pending_g
                for h in pending_h
            ]
        ).reshape(m, n)
    else:
        # all zeroes
        pass
    # assert not min(m, n) or C[0:m, 0:n].min() >= 0
    if edge_del_cost:
        del_costs = [edge_del_cost(G1.edges[g]) for g in pending_g]
    else:
        del_costs = [1] * len(pending_g)
    # assert not m or min(del_costs) >= 0
    if edge_ins_cost:
        ins_costs = [edge_ins_cost(G2.edges[h]) for h in pending_h]
    else:
        ins_costs = [1] * len(pending_h)
    # assert not n or min(ins_costs) >= 0
    inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1
    C[0:m, n : n + m] = np.array(
        [del_costs[i] if i == j else inf for i in range(m) for j in range(m)]
    ).reshape(m, m)
    C[m : m + n, 0:n] = np.array(
        [ins_costs[i] if i == j else inf for i in range(n) for j in range(n)]
    ).reshape(n, n)
    Ce = make_CostMatrix(C, m, n)

    maxcost_value = Cv.C.sum() + Ce.C.sum() + 1

    if timeout is not None:
        if timeout <= 0:
            raise nx.NetworkXError("Timeout value must be greater than 0")
        start = time.perf_counter()

    def prune(cost):
        if timeout is not None:
            if time.perf_counter() - start > timeout:
                return True
        if upper_bound is not None:
            if cost > upper_bound:
                return True
        if cost > maxcost_value:
            return True
        if strictly_decreasing and cost >= maxcost_value:
            return True
        return False

    # Now go!

    done_uv = [] if roots is None else [roots]

    for vertex_path, edge_path, cost in get_edit_paths(
        done_uv, pending_u, pending_v, Cv, [], pending_g, pending_h, Ce, initial_cost
    ):
        # assert sorted(G1.nodes) == sorted(u for u, v in vertex_path if u is not None)
        # assert sorted(G2.nodes) == sorted(v for u, v in vertex_path if v is not None)
        # assert sorted(G1.edges) == sorted(g for g, h in edge_path if g is not None)
        # assert sorted(G2.edges) == sorted(h for g, h in edge_path if h is not None)
        # print(vertex_path, edge_path, cost, file=sys.stderr)
        # assert cost == maxcost_value
        yield list(vertex_path), list(edge_path), float(cost)


@nx._dispatchable
def simrank_similarity(
    G,
    source=None,
    target=None,
    importance_factor=0.9,
    max_iterations=1000,
    tolerance=1e-4,
):
1207 """Returns the SimRank similarity of nodes in the graph ``G``.
1208
1209 SimRank is a similarity metric that says "two objects are considered
1210 to be similar if they are referenced by similar objects." [1]_.
1211
1212 The pseudo-code definition from the paper is::
1213
1214 def simrank(G, u, v):
1215 in_neighbors_u = G.predecessors(u)
1216 in_neighbors_v = G.predecessors(v)
1217 scale = C / (len(in_neighbors_u) * len(in_neighbors_v))
1218 return scale * sum(
1219 simrank(G, w, x) for w, x in product(in_neighbors_u, in_neighbors_v)
1220 )
1221
1222 where ``G`` is the graph, ``u`` is the source, ``v`` is the target,
1223 and ``C`` is a float decay or importance factor between 0 and 1.
1224
1225 The SimRank algorithm for determining node similarity is defined in
1226 [2]_.

    Parameters
    ----------
    G : NetworkX graph
        A NetworkX graph

    source : node
        If this is specified, the returned dictionary maps each node
        ``v`` in the graph to the similarity between ``source`` and
        ``v``.

    target : node
        If both ``source`` and ``target`` are specified, the similarity
        value between ``source`` and ``target`` is returned. If
        ``target`` is specified but ``source`` is not, this argument is
        ignored.

    importance_factor : float
        The relative importance of indirect neighbors with respect to
        direct neighbors.

    max_iterations : integer
        Maximum number of iterations.

    tolerance : float
        Error tolerance used to check convergence. When an iteration of
        the algorithm finds that no similarity value changes more than
        this amount, the algorithm halts.

    Returns
    -------
    similarity : dictionary or float
        If ``source`` and ``target`` are both ``None``, this returns a
        dictionary of dictionaries, where keys are node pairs and values
        are the similarity of the pair of nodes.

        If ``source`` is not ``None`` but ``target`` is, this returns a
        dictionary mapping each node to the similarity of ``source`` and
        that node.

        If neither ``source`` nor ``target`` is ``None``, this returns
        the similarity value for the given pair of nodes.

    Raises
    ------
    ExceededMaxIterations
        If the algorithm does not converge within ``max_iterations``.

    NodeNotFound
        If either ``source`` or ``target`` is not in `G`.

    Examples
    --------
    >>> G = nx.cycle_graph(2)
    >>> nx.simrank_similarity(G)
    {0: {0: 1.0, 1: 0.0}, 1: {0: 0.0, 1: 1.0}}
    >>> nx.simrank_similarity(G, source=0)
    {0: 1.0, 1: 0.0}
    >>> nx.simrank_similarity(G, source=0, target=0)
    1.0

    The result of this function can be converted to a numpy array
    representing the SimRank matrix by using the node order of the
    graph to determine which row and column represent each node.
    Other orderings of nodes are also possible.

    >>> import numpy as np
    >>> sim = nx.simrank_similarity(G)
    >>> np.array([[sim[u][v] for v in G] for u in G])
    array([[1., 0.],
           [0., 1.]])
    >>> sim_1d = nx.simrank_similarity(G, source=0)
    >>> np.array([sim_1d[v] for v in G])
    array([1., 0.])

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/SimRank
    .. [2] G. Jeh and J. Widom.
       "SimRank: a measure of structural-context similarity",
       In KDD'02: Proceedings of the Eighth ACM SIGKDD
       International Conference on Knowledge Discovery and Data Mining,
       pp. 538--543. ACM Press, 2002.
    """
    import numpy as np

    nodelist = list(G)
    if source is not None:
        if source not in nodelist:
            raise nx.NodeNotFound(f"Source node {source} not in G")
        else:
            s_indx = nodelist.index(source)
    else:
        s_indx = None

    if target is not None:
        if target not in nodelist:
            raise nx.NodeNotFound(f"Target node {target} not in G")
        else:
            t_indx = nodelist.index(target)
    else:
        t_indx = None

    x = _simrank_similarity_numpy(
        G, s_indx, t_indx, importance_factor, max_iterations, tolerance
    )

    if isinstance(x, np.ndarray):
        if x.ndim == 1:
            return dict(zip(G, x.tolist()))
        # else x.ndim == 2
        return {u: dict(zip(G, row)) for u, row in zip(G, x.tolist())}
    return float(x)


def _simrank_similarity_python(
    G,
    source=None,
    target=None,
    importance_factor=0.9,
    max_iterations=1000,
    tolerance=1e-4,
):
    """Returns the SimRank similarity of nodes in the graph ``G``.

    This pure Python version is provided for pedagogical purposes.

    Examples
    --------
    >>> G = nx.cycle_graph(2)
    >>> nx.similarity._simrank_similarity_python(G)
    {0: {0: 1, 1: 0.0}, 1: {0: 0.0, 1: 1}}
    >>> nx.similarity._simrank_similarity_python(G, source=0)
    {0: 1, 1: 0.0}
    >>> nx.similarity._simrank_similarity_python(G, source=0, target=0)
    1
    """
    # build up our similarity adjacency dictionary output
    newsim = {u: {v: 1 if u == v else 0 for v in G} for u in G}

    # These functions compute the update to the similarity value of the nodes
    # `u` and `v` with respect to the previous similarity values.
    def avg_sim(s):
        return sum(newsim[w][x] for (w, x) in s) / len(s) if s else 0.0

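    # SimRank is defined over in-neighbors, so directed graphs use the
    # predecessor view; for undirected graphs the adjacency itself plays
    # that role.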
    Gadj = G.pred if G.is_directed() else G.adj

    def sim(u, v):
        return importance_factor * avg_sim(list(product(Gadj[u], Gadj[v])))

    for its in range(max_iterations):
        oldsim = newsim
        newsim = {u: {v: sim(u, v) if u != v else 1 for v in G} for u in G}
        is_close = all(
            all(
                abs(newsim[u][v] - old) <= tolerance * (1 + abs(old))
                for v, old in nbrs.items()
            )
            for u, nbrs in oldsim.items()
        )
        if is_close:
            break

    if its + 1 == max_iterations:
        raise nx.ExceededMaxIterations(
            f"simrank did not converge after {max_iterations} iterations."
        )

    if source is not None and target is not None:
        return newsim[source][target]
    if source is not None:
        return newsim[source]
    return newsim


def _simrank_similarity_numpy(
    G,
    source=None,
    target=None,
    importance_factor=0.9,
    max_iterations=1000,
    tolerance=1e-4,
):
1410 """Calculate SimRank of nodes in ``G`` using matrices with ``numpy``.
1411
1412 The SimRank algorithm for determining node similarity is defined in
1413 [1]_.
1414
1415 Parameters
1416 ----------
1417 G : NetworkX graph
1418 A NetworkX graph
1419
1420 source : node
1421 If this is specified, the returned dictionary maps each node
1422 ``v`` in the graph to the similarity between ``source`` and
1423 ``v``.
1424
1425 target : node
1426 If both ``source`` and ``target`` are specified, the similarity
1427 value between ``source`` and ``target`` is returned. If
1428 ``target`` is specified but ``source`` is not, this argument is
1429 ignored.
1430
1431 importance_factor : float
1432 The relative importance of indirect neighbors with respect to
1433 direct neighbors.
1434
1435 max_iterations : integer
1436 Maximum number of iterations.
1437
1438 tolerance : float
1439 Error tolerance used to check convergence. When an iteration of
1440 the algorithm finds that no similarity value changes more than
1441 this amount, the algorithm halts.
1442
1443 Returns
1444 -------
1445 similarity : numpy array or float
1446 If ``source`` and ``target`` are both ``None``, this returns a
1447 2D array containing SimRank scores of the nodes.
1448
1449 If ``source`` is not ``None`` but ``target`` is, this returns an
1450 1D array containing SimRank scores of ``source`` and that
1451 node.
1452
1453 If neither ``source`` nor ``target`` is ``None``, this returns
1454 the similarity value for the given pair of nodes.
1455
1456 Examples
1457 --------
1458 >>> G = nx.cycle_graph(2)
1459 >>> nx.similarity._simrank_similarity_numpy(G)
1460 array([[1., 0.],
1461 [0., 1.]])
1462 >>> nx.similarity._simrank_similarity_numpy(G, source=0)
1463 array([1., 0.])
1464 >>> nx.similarity._simrank_similarity_numpy(G, source=0, target=0)
1465 1.0
1466
1467 References
1468 ----------
1469 .. [1] G. Jeh and J. Widom.
1470 "SimRank: a measure of structural-context similarity",
1471 In KDD'02: Proceedings of the Eighth ACM SIGKDD
1472 International Conference on Knowledge Discovery and Data Mining,
1473 pp. 538--543. ACM Press, 2002.
1474 """
    # This algorithm follows roughly
    #
    #     S = max{C * (A.T * S * A), I}
    #
    # where C is the importance factor, A is the column normalized
    # adjacency matrix, and I is the identity matrix.
    import numpy as np

    adjacency_matrix = nx.to_numpy_array(G)

    # column-normalize the ``adjacency_matrix``
    s = np.array(adjacency_matrix.sum(axis=0))
    s[s == 0] = 1
    adjacency_matrix /= s  # adjacency_matrix.sum(axis=0)

    newsim = np.eye(len(G), dtype=np.float64)
    for its in range(max_iterations):
        prevsim = newsim.copy()
        newsim = importance_factor * ((adjacency_matrix.T @ prevsim) @ adjacency_matrix)
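        # pin the self-similarity at 1.0 on every sweep (s(v, v) = 1 by definition)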
        np.fill_diagonal(newsim, 1.0)

        if np.allclose(prevsim, newsim, atol=tolerance):
            break

    if its + 1 == max_iterations:
        raise nx.ExceededMaxIterations(
            f"simrank did not converge after {max_iterations} iterations."
        )

    if source is not None and target is not None:
        return float(newsim[source, target])
    if source is not None:
        return newsim[source]
    return newsim


@np_random_state("seed")
def _prepare_panther_paths(
    G,
    source,
    path_length=5,
    c=0.5,
    delta=0.1,
    eps=None,
    weight="weight",
    remove_isolates=True,
    k=None,
    seed=None,
):
1524 """Common preparation code for Panther similarity algorithms.
1525
1526 Parameters
1527 ----------
1528 G : NetworkX graph
1529 A NetworkX graph
1530 source : node
1531 Source node for similarity calculation
1532 path_length : int
1533 How long the randomly generated paths should be
1534 c : float
1535 A universal constant that controls the number of random paths to generate
1536 delta : float
1537 The probability parameter for similarity approximation
1538 eps : float or None
1539 The error bound for similarity approximation
1540 weight : string or None
1541 The name of an edge attribute that holds the numerical value used as a weight
1542 remove_isolates : bool
1543 Whether to remove isolated nodes from graph processing
1544 k : int or None
1545 The number of most similar nodes to return. If provided, validates that
1546 ``k`` is not greater than the number of nodes in the graph.
1547 seed : integer, random_state, or None (default)
1548 Indicator of random number generation state.
1549 See :ref:`Randomness<randomness>`.
1550
1551 Returns
1552 -------
1553 PantherPaths
1554 A tuple containing the prepared data:
1555 - G: The graph (possibly with isolates removed)
1556 - inv_node_map: Dictionary mapping node names to indices
1557 - index_map: Populated index map of paths
1558 - inv_sample_size: Inverse of sample size (for fast calculation)
1559 - eps: Error bound for similarity approximation
1560 """
    import numpy as np

    if source not in G:
        raise nx.NodeNotFound(f"Source node {source} not in G")

    isolates = set(nx.isolates(G))

    if source in isolates:
        raise nx.NetworkXUnfeasible(
            f"Panther similarity is not defined for the isolated source node {source}."
        )

    if remove_isolates:
        G = G.subgraph(node for node in G if node not in isolates).copy()

    # According to [1], they empirically determined
    # a good value for ``eps`` to be sqrt( 1 / |E| )
    if eps is None:
        eps = np.sqrt(1.0 / G.number_of_edges())

    num_nodes = G.number_of_nodes()

    # Check if k is provided and validate it against the number of nodes
    if k is not None and not remove_isolates:  # For panther_vector_similarity
        if num_nodes < k:
            raise nx.NetworkXUnfeasible(
                f"The number of requested nodes {k} is greater than the number of nodes {num_nodes}."
            )

    inv_node_map = {name: index for index, name in enumerate(G)}

    # Calculate the sample size ``R`` for how many paths
    # to randomly generate
    t_choose_2 = math.comb(path_length, 2)
    sample_size = int((c / eps**2) * (np.log2(t_choose_2) + 1 + np.log(1 / delta)))
    index_map = {}

    # Check for isolated nodes before generating random paths.
    # If there are still isolated nodes in the graph after filtering,
    # they will cause issues with path generation.
    remaining_isolates = set(nx.isolates(G))
    if remaining_isolates:
        raise nx.NetworkXUnfeasible(
            f"Cannot generate random paths with isolated nodes present: {remaining_isolates}"
        )

    # Generate the random paths and populate the index_map
    for _ in generate_random_paths(
        G,
        sample_size,
        path_length=path_length,
        index_map=index_map,
        weight=weight,
        seed=seed,
    ):
        # NOTE: index_map is modified in-place by `generate_random_paths`
        pass

    return (
        G,  # The graph with isolated nodes removed
        inv_node_map,
        index_map,
        1 / sample_size,
        eps,
    )


@np_random_state("seed")
@nx._dispatchable(edge_attrs="weight")
def panther_similarity(
    G,
    source,
    k=5,
    path_length=5,
    c=0.5,
    delta=0.1,
    eps=None,
    weight="weight",
    seed=None,
):
1641 r"""Returns the Panther similarity of nodes in the graph `G` to node ``v``.
1642
1643 Panther is a similarity metric that says "two objects are considered
1644 to be similar if they frequently appear on the same paths." [1]_.
1645
    Parameters
    ----------
    G : NetworkX graph
        A NetworkX graph
    source : node
        Source node for which to find the top `k` similar other nodes
    k : int (default = 5)
        The number of most similar nodes to return.
    path_length : int (default = 5)
        How long the randomly generated paths should be (``T`` in [1]_)
    c : float (default = 0.5)
        A universal constant that controls the number of random paths to generate.
        Higher values increase the number of sample paths and potentially improve
        accuracy at the cost of more computation. Defaults to 0.5 as recommended
        in [1]_.
    delta : float (default = 0.1)
        The probability that the estimated similarity $S$ fails to be an
        epsilon-approximation of the true similarity, taken over the $R$
        sampled random paths. Smaller values give stronger guarantees but
        require more sample paths.
    eps : float or None (default = None)
        The error bound for similarity approximation. This controls the accuracy
        of the sampled paths in representing the true similarity. Smaller values
        yield more accurate results but require more sample paths. If `None`, a
        value of ``sqrt(1/|E|)`` is used, which the authors found empirically
        effective.
    weight : string or None, optional (default="weight")
        The name of an edge attribute that holds the numerical value
        used as a weight. If None then each edge has weight 1.
    seed : integer, random_state, or None (default)
        Indicator of random number generation state.
        See :ref:`Randomness<randomness>`.

    Returns
    -------
    similarity : dictionary
        Dictionary of nodes to similarity scores (as floats). Note:
        the self-similarity (i.e., ``source``) will not be included in
        the returned dictionary, so for ``k = 5`` the dictionary holds
        the top 5 most similar nodes other than ``source``.

    Raises
    ------
    NetworkXUnfeasible
        If `source` is an isolated node.

    NodeNotFound
        If `source` is not in `G`.

    Notes
    -----
    The isolated nodes in `G` are ignored.

    Examples
    --------
    >>> G = nx.star_graph(10)
    >>> sim = nx.panther_similarity(G, 0)
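
    The self-similarity is dropped, so with the default ``k`` the result
    holds five other nodes (the sampled paths are random, but the size of
    the result is stable):

    >>> len(sim)
    5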

    References
    ----------
    .. [1] Zhang, J., Tang, J., Ma, C., Tong, H., Jing, Y., & Li, J.
       Panther: Fast top-k similarity search on large networks.
       In Proceedings of the ACM SIGKDD International Conference
       on Knowledge Discovery and Data Mining (Vol. 2015-August, pp. 1445–1454).
       Association for Computing Machinery. https://doi.org/10.1145/2783258.2783267.
1710 """
1711 import numpy as np
1712
1713 # Use helper method to prepare common data structures
1714 G, inv_node_map, index_map, inv_sample_size, eps = _prepare_panther_paths(
1715 G,
1716 source,
1717 path_length=path_length,
1718 c=c,
1719 delta=delta,
1720 eps=eps,
1721 weight=weight,
1722 k=k,
1723 seed=seed,
1724 )
1725
1726 num_nodes = G.number_of_nodes()
1727 node_list = list(G.nodes)
1728
1729 # Check number of nodes after any modifications by _prepare_panther_paths
1730 if num_nodes < k:
1731 raise nx.NetworkXUnfeasible(
1732 f"The number of requested nodes {k} is greater than the number of nodes {num_nodes}."
1733 )
1734
1735 S = np.zeros(num_nodes)
1736 source_paths = set(index_map[source])
1737
1738 # Calculate the path similarities
1739 # between ``source`` (v) and ``node`` (v_j)
1740 # using our inverted index mapping of
1741 # vertices to paths
1742 for node, paths in index_map.items():
1743 # Only consider paths where both
1744 # ``node`` and ``source`` are present
1745 common_paths = source_paths.intersection(paths)
1746 S[inv_node_map[node]] = len(common_paths) * inv_sample_size
1747
1748 # Retrieve top ``k+1`` similar to account for removing self-similarity
1749 # Note: the below performed anywhere from 4-10x faster
1750 # (depending on input sizes) vs the equivalent ``np.argsort(S)[::-1]``
1751 partition_k = min(k + 1, num_nodes)
1752 top_k_unsorted = np.argpartition(S, -partition_k)[-partition_k:]
1753 top_k_sorted = top_k_unsorted[np.argsort(S[top_k_unsorted])][::-1]
1754
1755 # Add back the similarity scores
1756 # Convert numpy scalars to native Python types for dispatch compatibility
1757 top_k_with_val = dict(
1758 zip((node_list[i] for i in top_k_sorted), S[top_k_sorted].tolist())
1759 )
1760
1761 # Remove the self-similarity
1762 top_k_with_val.pop(source, None)
1763 return top_k_with_val


@np_random_state("seed")
@nx._dispatchable(edge_attrs="weight")
def panther_vector_similarity(
    G,
    source,
    *,
    D=10,
    k=5,
    path_length=5,
    c=0.5,
    delta=0.1,
    eps=None,
    weight="weight",
    seed=None,
):
1781 r"""Returns the Panther vector similarity (Panther++) of nodes in `G`.
1782
1783 Computes similarity between nodes based on the "Panther++" algorithm [1]_, which extends
1784 the basic Panther algorithm by using feature vectors to better capture structural
1785 similarity.
1786
1787 While basic Panther similarity measures how often two nodes appear on the same paths,
1788 Panther vector similarity (Panther++) creates a ``D``-dimensional feature vector for each
1789 node using its top similarity scores with other nodes, then computes similarity based
1790 on the Euclidean distance between these feature vectors. This approach better captures
1791 structural similarity and addresses the bias towards close neighbors present in
1792 the original Panther algorithm.
1793
1794 This approach is preferred when:
1795
1796 1. You need better structural similarity than basic path co-occurrence
1797 2. You want to overcome the close-neighbor bias of standard Panther
1798 3. You're working with large graphs where k-d tree indexing would be beneficial
1799 4. Graph edit distance-like similarity is more appropriate than path co-occurrence
1800
1801 Parameters
1802 ----------
1803 G : NetworkX graph
1804 A NetworkX graph
1805 source : node
        Source node for which to find the ``k`` most similar nodes
1807 D : int
1808 The number of similarity scores to use (in descending order)
1809 for each feature vector. Defaults to 10. Note that the original paper
1810 used D=50 [1]_, but KDTree is optimized for lower dimensions.
1811 k : int
1812 The number of most similar nodes to return
1813 path_length : int
1814 How long the randomly generated paths should be (``T`` in [1]_)
1815 c : float
1816 A universal constant that controls the number of random paths to generate.
1817 Higher values increase the number of sample paths and potentially improve
1818 accuracy at the cost of more computation. Defaults to 0.5 as recommended
1819 in [1]_.
1820 delta : float
1821 The probability that ``S`` is not an epsilon-approximation to (R, phi)
1822 eps : float
1823 The error bound for similarity approximation. This controls the accuracy
1824 of the sampled paths in representing the true similarity. Smaller values
1825 yield more accurate results but require more sample paths. If None, a
1826 value of ``sqrt(1/|E|)`` is used, which the authors found empirically
1827 effective.
1828 weight : string or None, optional (default="weight")
1829 The name of an edge attribute that holds the numerical value
1830 used as a weight. If `None` then each edge has weight 1.
1831 seed : integer, random_state, or None (default)
1832 Indicator of random number generation state.
1833 See :ref:`Randomness<randomness>`.
1834
1835 Returns
1836 -------
1837 similarity : dict
1838 Dict of nodes to similarity scores (as floats).
        Note: the self-similarity (i.e., `source`) is not included in the dict.
1840
1841 Examples
1842 --------
1843 >>> G = nx.star_graph(100)
1844
1845 The "hub" node is distinct from the "spoke" nodes
1846
1847 >>> from pprint import pprint
1848 >>> pprint(nx.panther_vector_similarity(G, source=0, seed=42))
1849 {35: 0.10402634656233918,
1850 61: 0.10434063328712018,
1851 65: 0.10401247833456054,
1852 85: 0.10506718868571752,
1853 88: 0.10402634656233918}
1854
1855 But "spoke" nodes are similar to one another
1856
1857 >>> result = nx.panther_vector_similarity(G, source=1, seed=42)
1858 >>> len(result)
1859 5
1860 >>> all(similarity == 1.0 for similarity in result.values())
1861 True
1862
1863 Notes
1864 -----
    When several feature vectors are equidistant from the source's vector,
    which of the tied nodes are returned depends on the KDTree's internal
    tie-breaking and may vary between environments.
    Using the same ``seed`` parameter ensures reproducible results.
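
    In the notation of [1]_, each node ``v`` is assigned a feature vector
    ``Theta(v)`` holding its ``D`` largest path-similarity scores in
    descending order, and the vector similarity of two nodes is
    ``1 / ||Theta(v_i) - Theta(v_j)||``. This implementation additionally
    normalizes the returned scores to the range ``[0, 1]``.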
1868
1869 References
1870 ----------
1871 .. [1] Zhang, J., Tang, J., Ma, C., Tong, H., Jing, Y., & Li, J.
1872 Panther: Fast top-k similarity search on large networks.
1873 In Proceedings of the ACM SIGKDD International Conference
1874 on Knowledge Discovery and Data Mining (Vol. 2015-August, pp. 1445–1454).
1875 Association for Computing Machinery. https://doi.org/10.1145/2783258.2783267.
1876 """
1877 import numpy as np
1878 import scipy as sp
1879
    # Use the helper function to prepare common data structures,
    # but keep isolates in the graph
1881 G, inv_node_map, index_map, inv_sample_size, eps = _prepare_panther_paths(
1882 G,
1883 source,
1884 path_length=path_length,
1885 c=c,
1886 delta=delta,
1887 eps=eps,
1888 weight=weight,
1889 remove_isolates=False,
1890 k=k,
1891 seed=seed,
1892 )
1893 num_nodes = G.number_of_nodes()
1894 node_list = list(G.nodes)
1895
1896 # Ensure D doesn't exceed the number of nodes
1897 if num_nodes < D:
1898 raise nx.NetworkXUnfeasible(
1899 f"The number of requested similarity scores {D} is greater than the number of nodes {num_nodes}."
1900 )
1901
1902 similarities = np.zeros((num_nodes, num_nodes))
1903 theta = np.zeros((num_nodes, D))
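    # theta[i] will hold the ``D`` largest similarity scores of node i in
    # descending order -- this is the feature vector Theta(v_i) from the paper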
1904 index_map_sets = {node: set(paths) for node, paths in index_map.items()}
1905
1906 # Calculate the path similarities for each node
1907 for vi_idx, vi in enumerate(G.nodes):
1908 vi_paths = index_map_sets[vi]
1909
1910 for node, node_paths in index_map_sets.items():
1911 # Calculate similarity score
1912 common_path_count = len(vi_paths.intersection(node_paths))
1913 similarities[vi_idx, inv_node_map[node]] = (
1914 common_path_count * inv_sample_size
1915 )
1916
1917 # Build up the feature vector using the largest D similarity scores
1918 theta[vi_idx] = np.sort(np.partition(similarities[vi_idx], -D)[-D:])[::-1]
1919
1920 # Insert the feature vectors into a k-d tree
1921 # for fast retrieval
1922 kdtree = sp.spatial.KDTree(theta)
1923
1924 # Retrieve top ``k+1`` similar vertices (i.e., vectors)
1925 # (based on their Euclidean distance)
1926 # Note that it's k+1 because the source node will be included and later removed
1927 query_k = min(k + 1, num_nodes)
1928 neighbor_distances, nearest_neighbors = kdtree.query(
1929 theta[inv_node_map[source]], k=query_k
1930 )
1931
1932 # Ensure results are always arrays (KDTree returns scalars when k=1)
1933 neighbor_distances = np.atleast_1d(neighbor_distances)
1934 nearest_neighbors = np.atleast_1d(nearest_neighbors)
1935
1936 # The paper defines the similarity S(v_i, v_j) as
1937 # 1 / || Theta(v_i) - Theta(v_j) ||
1938 # Calculate reciprocals and normalize to [0, 1] range
1939
1940 # Handle the case where distances are very small or zero (common in small graphs)
1941 # Use the passed in eps parameter instead of defining a new epsilon
1942 neighbor_distances = np.maximum(neighbor_distances, eps)
    # Use a new name here rather than reusing ``similarities``
    # (the full (num_nodes, num_nodes) matrix above)
    sim_scores = 1 / neighbor_distances

    # Always normalize to ensure values are between 0 and 1
    if len(sim_scores) > 0 and (max_sim := np.max(sim_scores)) > 0:
        sim_scores /= max_sim

    # Map the nearest neighbors back to node labels with their scores
    # Convert numpy scalars to native Python types for dispatch compatibility
    top_k_with_val = dict(
        zip((node_list[n] for n in nearest_neighbors), sim_scores.tolist())
    )
1954
1955 # Remove the self-similarity
1956 top_k_with_val.pop(source, None)
1957
1958 # Ensure we return exactly k results (sorted by similarity)
1959 if len(top_k_with_val) > k:
1960 sorted_items = sorted(top_k_with_val.items(), key=lambda x: x[1], reverse=True)
1961 top_k_with_val = dict(sorted_items[:k])
1962
1963 return top_k_with_val
1964
1965
1966@np_random_state("seed")
1967@nx._dispatchable(edge_attrs="weight")
1968def generate_random_paths(
1969 G,
1970 sample_size,
1971 path_length=5,
1972 index_map=None,
1973 weight="weight",
1974 seed=None,
1975 *,
1976 source=None,
1977):
1978 """Randomly generate `sample_size` paths of length `path_length`.
1979
1980 Parameters
1981 ----------
1982 G : NetworkX graph
1983 A NetworkX graph
1984 sample_size : integer
1985 The number of paths to generate. This is ``R`` in [1]_.
1986 path_length : integer (default = 5)
        The number of edges in each randomly generated path.
1988 This is ``T`` in [1]_. According to the paper, ``T >= 5`` is
1989 recommended.
1990 index_map : dictionary, optional
1991 If provided, this will be populated with the inverted
1992 index of nodes mapped to the set of generated random path
1993 indices within ``paths``.
1994 weight : string or None, optional (default="weight")
1995 The name of an edge attribute that holds the numerical value
1996 used as a weight. If None then each edge has weight 1.
1997 seed : integer, random_state, or None (default)
1998 Indicator of random number generation state.
1999 See :ref:`Randomness<randomness>`.
2000 source : node, optional
2001 Node to use as the starting point for all generated paths.
2002 If None then starting nodes are selected at random with uniform probability.
2003
2004 Returns
2005 -------
2006 paths : generator of lists
        Generator of `sample_size` paths, each a list of ``path_length + 1``
        nodes (a random walk traversing `path_length` edges).
2008
2009 Examples
2010 --------
    The generator yields `sample_size` random paths of length `path_length`
2012 drawn from `G`:
2013
2014 >>> G = nx.complete_graph(5)
2015 >>> next(nx.generate_random_paths(G, sample_size=1, path_length=3, seed=42))
2016 [3, 4, 2, 3]
2017 >>> list(nx.generate_random_paths(G, sample_size=3, path_length=4, seed=42))
2018 [[3, 4, 2, 3, 0], [2, 0, 2, 1, 0], [2, 0, 4, 3, 0]]
2019
    If a dictionary is passed as `index_map`, it will be populated with an
    inverted index mapping each node to the indices of the paths that
    contain it:
2022
2023 >>> G = nx.wheel_graph(10)
2024 >>> index_map = {}
2025 >>> random_paths = list(
2026 ... nx.generate_random_paths(G, sample_size=3, index_map=index_map, seed=2771)
2027 ... )
2028 >>> random_paths
2029 [[3, 2, 1, 9, 8, 7], [4, 0, 5, 6, 7, 8], [3, 0, 5, 0, 9, 8]]
2030 >>> paths_containing_node_0 = [
2031 ... random_paths[path_idx] for path_idx in index_map.get(0, [])
2032 ... ]
2033 >>> paths_containing_node_0
2034 [[4, 0, 5, 6, 7, 8], [3, 0, 5, 0, 9, 8]]
2035
2036 References
2037 ----------
2038 .. [1] Zhang, J., Tang, J., Ma, C., Tong, H., Jing, Y., & Li, J.
2039 Panther: Fast top-k similarity search on large networks.
2040 In Proceedings of the ACM SIGKDD International Conference
2041 on Knowledge Discovery and Data Mining (Vol. 2015-August, pp. 1445–1454).
2042 Association for Computing Machinery. https://doi.org/10.1145/2783258.2783267.
2043 """
2044 import numpy as np
2045
2046 randint_fn = (
2047 seed.integers if isinstance(seed, np.random.Generator) else seed.randint
2048 )
2049
2050 # Calculate transition probabilities between
2051 # every pair of vertices according to Eq. (3)
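    # i.e., the probability of stepping from v_i to v_j is the weight
    # w(v_i, v_j) divided by the weighted degree of v_i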
2052 adj_mat = nx.to_numpy_array(G, weight=weight)
2053
    # Note: rows for isolated nodes sum to zero, which would otherwise
    # trigger divide-by-zero/invalid-value warnings; such rows are only
    # used if a walk starts at an isolated node (they are unreachable
    # otherwise)
    row_sums = adj_mat.sum(axis=1).reshape(-1, 1)
    with np.errstate(divide="ignore", invalid="ignore"):
        transition_probabilities = adj_mat * np.reciprocal(row_sums)
2058
2059 node_map = list(G)
2060 num_nodes = G.number_of_nodes()
2061
    # Validate and locate ``source`` once, outside the sampling loop
    if source is not None:
        if source not in node_map:
            raise nx.NodeNotFound(f"Initial node {source} not in G")
        source_index = node_map.index(source)

    for path_index in range(sample_size):
        if source is None:
            # Sample current vertex v = v_i uniformly at random
            node_index = randint_fn(num_nodes)
            node = node_map[node_index]
        else:
            node = source
            node_index = source_index
2073
2074 # Add v into p_r and add p_r into the path set
2075 # of v, i.e., P_v
2076 path = [node]
2077
2078 # Build the inverted index (P_v) of vertices to paths
2079 if index_map is not None:
2080 if node in index_map:
2081 index_map[node].add(path_index)
2082 else:
2083 index_map[node] = {path_index}
2084
2085 starting_index = node_index
2086 for _ in range(path_length):
2087 # Randomly sample a neighbor (v_j) according
2088 # to transition probabilities from ``node`` (v) to its neighbors
2089 nbr_index = seed.choice(
2090 num_nodes, p=transition_probabilities[starting_index]
2091 )
2092
2093 # Set current vertex (v = v_j)
2094 starting_index = nbr_index
2095
2096 # Add v into p_r
2097 nbr_node = node_map[nbr_index]
2098 path.append(nbr_node)
2099
2100 # Add p_r into P_v
2101 if index_map is not None:
2102 if nbr_node in index_map:
2103 index_map[nbr_node].add(path_index)
2104 else:
2105 index_map[nbr_node] = {path_index}
2106
2107 yield path