1"""Functions measuring similarity using graph edit distance.
2
3The graph edit distance is the number of edge/node changes needed
4to make two graphs isomorphic.
5
6The default algorithm/implementation is sub-optimal for some graphs.
7The problem of finding the exact Graph Edit Distance (GED) is NP-hard
8so it is often slow. If the simple interface `graph_edit_distance`
9takes too long for your graph, try `optimize_graph_edit_distance`
10and/or `optimize_edit_paths`.
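
For example, the first value yielded by the approximation generator is
already a cheap upper bound on the exact distance (a minimal sketch):

>>> import networkx as nx
>>> G1, G2 = nx.path_graph(4), nx.cycle_graph(4)
>>> upper_bound = next(nx.optimize_graph_edit_distance(G1, G2))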

At the same time, I encourage capable people to investigate
alternative GED algorithms, in order to improve the choices available.
"""

import math
import time
from dataclasses import dataclass
from itertools import product

import networkx as nx
from networkx.utils import np_random_state

__all__ = [
    "graph_edit_distance",
    "optimal_edit_paths",
    "optimize_graph_edit_distance",
    "optimize_edit_paths",
    "simrank_similarity",
    "panther_similarity",
    "panther_vector_similarity",
    "generate_random_paths",
]


@nx._dispatchable(
    graphs={"G1": 0, "G2": 1}, preserve_edge_attrs=True, preserve_node_attrs=True
)
def graph_edit_distance(
    G1,
    G2,
    node_match=None,
    edge_match=None,
    node_subst_cost=None,
    node_del_cost=None,
    node_ins_cost=None,
    edge_subst_cost=None,
    edge_del_cost=None,
    edge_ins_cost=None,
    roots=None,
    upper_bound=None,
    timeout=None,
):
54 """Returns GED (graph edit distance) between graphs G1 and G2.
55
56 Graph edit distance is a graph similarity measure analogous to
57 Levenshtein distance for strings. It is defined as minimum cost
58 of edit path (sequence of node and edge edit operations)
59 transforming graph G1 to graph isomorphic to G2.
60
    Parameters
    ----------
    G1, G2: graphs
        The two graphs G1 and G2 must be of the same type.

    node_match : callable
        A function that returns True if node n1 in G1 and n2 in G2
        should be considered equal during matching.

        The function will be called like

           node_match(G1.nodes[n1], G2.nodes[n2]).

        That is, the function will receive the node attribute
        dictionaries for n1 and n2 as inputs.

        Ignored if node_subst_cost is specified. If neither
        node_match nor node_subst_cost are specified then node
        attributes are not considered.

    edge_match : callable
        A function that returns True if the edge attribute dictionaries
        for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
        be considered equal during matching.

        The function will be called like

           edge_match(G1[u1][v1], G2[u2][v2]).

        That is, the function will receive the edge attribute
        dictionaries of the edges under consideration.

        Ignored if edge_subst_cost is specified. If neither
        edge_match nor edge_subst_cost are specified then edge
        attributes are not considered.

    node_subst_cost, node_del_cost, node_ins_cost : callable
        Functions that return the costs of node substitution, node
        deletion, and node insertion, respectively.

        The functions will be called like

           node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
           node_del_cost(G1.nodes[n1]),
           node_ins_cost(G2.nodes[n2]).

        That is, the functions will receive the node attribute
        dictionaries as inputs. The functions are expected to return
        positive numeric values.

        Function node_subst_cost overrides node_match if specified.
        If neither node_match nor node_subst_cost are specified then
        default node substitution cost of 0 is used (node attributes
        are not considered during matching).

        If node_del_cost is not specified then default node deletion
        cost of 1 is used. If node_ins_cost is not specified then
        default node insertion cost of 1 is used.

    edge_subst_cost, edge_del_cost, edge_ins_cost : callable
        Functions that return the costs of edge substitution, edge
        deletion, and edge insertion, respectively.

        The functions will be called like

           edge_subst_cost(G1[u1][v1], G2[u2][v2]),
           edge_del_cost(G1[u1][v1]),
           edge_ins_cost(G2[u2][v2]).

        That is, the functions will receive the edge attribute
        dictionaries as inputs. The functions are expected to return
        positive numeric values.

        Function edge_subst_cost overrides edge_match if specified.
        If neither edge_match nor edge_subst_cost are specified then
        default edge substitution cost of 0 is used (edge attributes
        are not considered during matching).

        If edge_del_cost is not specified then default edge deletion
        cost of 1 is used. If edge_ins_cost is not specified then
        default edge insertion cost of 1 is used.

    roots : 2-tuple
        Tuple where the first element is a node in G1 and the second
        is a node in G2.
        These nodes are forced to be matched in the comparison to
        allow comparison between rooted graphs.

    upper_bound : numeric
        Maximum edit distance to consider. Returns None if no edit
        distance less than or equal to upper_bound exists.

    timeout : numeric
        Maximum number of seconds to execute.
        After the timeout is met, the current best GED is returned.

    Examples
    --------
    >>> G1 = nx.cycle_graph(6)
    >>> G2 = nx.wheel_graph(7)
    >>> nx.graph_edit_distance(G1, G2)
    7.0

    >>> G1 = nx.star_graph(5)
    >>> G2 = nx.star_graph(5)
    >>> nx.graph_edit_distance(G1, G2, roots=(0, 0))
    0.0
    >>> nx.graph_edit_distance(G1, G2, roots=(1, 0))
    8.0
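
    Cost callables can encode attribute-aware matching. A minimal sketch
    (the ``"color"`` node attribute here is an illustrative assumption,
    not something this function requires):

    >>> def node_subst_cost(n1_attrs, n2_attrs):
    ...     # free substitution when the hypothetical colors agree
    ...     return 0 if n1_attrs.get("color") == n2_attrs.get("color") else 1
    >>> d = nx.graph_edit_distance(G1, G2, node_subst_cost=node_subst_cost)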

    See Also
    --------
    optimal_edit_paths, optimize_graph_edit_distance, optimize_edit_paths

    is_isomorphic: test for graph edit distance of 0

    References
    ----------
    .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
       Martineau. An Exact Graph Edit Distance Algorithm for Solving
       Pattern Recognition Problems. 4th International Conference on
       Pattern Recognition Applications and Methods 2015, Jan 2015,
       Lisbon, Portugal. 2015,
       <10.5220/0005209202710278>. <hal-01168816>
       https://hal.archives-ouvertes.fr/hal-01168816

    """
    bestcost = None
    for _, _, cost in optimize_edit_paths(
        G1,
        G2,
        node_match,
        edge_match,
        node_subst_cost,
        node_del_cost,
        node_ins_cost,
        edge_subst_cost,
        edge_del_cost,
        edge_ins_cost,
        upper_bound,
        True,
        roots,
        timeout,
    ):
        # assert bestcost is None or cost < bestcost
        bestcost = cost
    return bestcost


@nx._dispatchable(graphs={"G1": 0, "G2": 1})
def optimal_edit_paths(
    G1,
    G2,
    node_match=None,
    edge_match=None,
    node_subst_cost=None,
    node_del_cost=None,
    node_ins_cost=None,
    edge_subst_cost=None,
    edge_del_cost=None,
    edge_ins_cost=None,
    upper_bound=None,
):
224 """Returns all minimum-cost edit paths transforming G1 to G2.
225
226 Graph edit path is a sequence of node and edge edit operations
227 transforming graph G1 to graph isomorphic to G2. Edit operations
228 include substitutions, deletions, and insertions.

    Parameters
    ----------
    G1, G2: graphs
        The two graphs G1 and G2 must be of the same type.

    node_match : callable
        A function that returns True if node n1 in G1 and n2 in G2
        should be considered equal during matching.

        The function will be called like

           node_match(G1.nodes[n1], G2.nodes[n2]).

        That is, the function will receive the node attribute
        dictionaries for n1 and n2 as inputs.

        Ignored if node_subst_cost is specified. If neither
        node_match nor node_subst_cost are specified then node
        attributes are not considered.

    edge_match : callable
        A function that returns True if the edge attribute dictionaries
        for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
        be considered equal during matching.

        The function will be called like

           edge_match(G1[u1][v1], G2[u2][v2]).

        That is, the function will receive the edge attribute
        dictionaries of the edges under consideration.

        Ignored if edge_subst_cost is specified. If neither
        edge_match nor edge_subst_cost are specified then edge
        attributes are not considered.

    node_subst_cost, node_del_cost, node_ins_cost : callable
        Functions that return the costs of node substitution, node
        deletion, and node insertion, respectively.

        The functions will be called like

           node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
           node_del_cost(G1.nodes[n1]),
           node_ins_cost(G2.nodes[n2]).

        That is, the functions will receive the node attribute
        dictionaries as inputs. The functions are expected to return
        positive numeric values.

        Function node_subst_cost overrides node_match if specified.
        If neither node_match nor node_subst_cost are specified then
        default node substitution cost of 0 is used (node attributes
        are not considered during matching).

        If node_del_cost is not specified then default node deletion
        cost of 1 is used. If node_ins_cost is not specified then
        default node insertion cost of 1 is used.

    edge_subst_cost, edge_del_cost, edge_ins_cost : callable
        Functions that return the costs of edge substitution, edge
        deletion, and edge insertion, respectively.

        The functions will be called like

           edge_subst_cost(G1[u1][v1], G2[u2][v2]),
           edge_del_cost(G1[u1][v1]),
           edge_ins_cost(G2[u2][v2]).

        That is, the functions will receive the edge attribute
        dictionaries as inputs. The functions are expected to return
        positive numeric values.

        Function edge_subst_cost overrides edge_match if specified.
        If neither edge_match nor edge_subst_cost are specified then
        default edge substitution cost of 0 is used (edge attributes
        are not considered during matching).

        If edge_del_cost is not specified then default edge deletion
        cost of 1 is used. If edge_ins_cost is not specified then
        default edge insertion cost of 1 is used.

    upper_bound : numeric
        Maximum edit distance to consider.

    Returns
    -------
    edit_paths : list of tuples (node_edit_path, edge_edit_path)
        - node_edit_path : list of tuples ``(u, v)`` indicating node transformations
          between `G1` and `G2`. ``u`` is `None` for insertion, ``v`` is `None`
          for deletion.
        - edge_edit_path : list of tuples ``((u1, v1), (u2, v2))`` indicating edge
          transformations between `G1` and `G2`. ``(None, (u2,v2))`` for insertion
          and ``((u1,v1), None)`` for deletion.

    cost : numeric
        Optimal edit path cost (graph edit distance). When the cost
        is zero, it indicates that `G1` and `G2` are isomorphic.

    Examples
    --------
    >>> G1 = nx.cycle_graph(4)
    >>> G2 = nx.wheel_graph(5)
    >>> paths, cost = nx.optimal_edit_paths(G1, G2)
    >>> len(paths)
    40
    >>> cost
    5.0
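
    Each path pairs a node edit list with an edge edit list; by
    construction every entry maps an element of `G1`, an element of
    `G2`, or both:

    >>> node_path, edge_path = paths[0]
    >>> all(u is not None or v is not None for u, v in node_path)
    True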

    Notes
    -----
    To transform `G1` into a graph isomorphic to `G2`, apply the node
    and edge edits in the returned ``edit_paths``.
    In the case of isomorphic graphs, the cost is zero, and the paths
    represent different isomorphic mappings (isomorphisms). That is, the
    edits involve renaming nodes and edges to match the structure of `G2`.

    See Also
    --------
    graph_edit_distance, optimize_edit_paths

    References
    ----------
    .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
       Martineau. An Exact Graph Edit Distance Algorithm for Solving
       Pattern Recognition Problems. 4th International Conference on
       Pattern Recognition Applications and Methods 2015, Jan 2015,
       Lisbon, Portugal. 2015,
       <10.5220/0005209202710278>. <hal-01168816>
       https://hal.archives-ouvertes.fr/hal-01168816

    """
    paths = []
    bestcost = None
    for vertex_path, edge_path, cost in optimize_edit_paths(
        G1,
        G2,
        node_match,
        edge_match,
        node_subst_cost,
        node_del_cost,
        node_ins_cost,
        edge_subst_cost,
        edge_del_cost,
        edge_ins_cost,
        upper_bound,
        False,
    ):
        # assert bestcost is None or cost <= bestcost
        if bestcost is not None and cost < bestcost:
            paths = []
        paths.append((vertex_path, edge_path))
        bestcost = cost
    return paths, bestcost


@nx._dispatchable(graphs={"G1": 0, "G2": 1})
def optimize_graph_edit_distance(
    G1,
    G2,
    node_match=None,
    edge_match=None,
    node_subst_cost=None,
    node_del_cost=None,
    node_ins_cost=None,
    edge_subst_cost=None,
    edge_del_cost=None,
    edge_ins_cost=None,
    upper_bound=None,
):
400 """Returns consecutive approximations of GED (graph edit distance)
401 between graphs G1 and G2.
402
403 Graph edit distance is a graph similarity measure analogous to
404 Levenshtein distance for strings. It is defined as minimum cost
405 of edit path (sequence of node and edge edit operations)
406 transforming graph G1 to graph isomorphic to G2.

    Parameters
    ----------
    G1, G2: graphs
        The two graphs G1 and G2 must be of the same type.

    node_match : callable
        A function that returns True if node n1 in G1 and n2 in G2
        should be considered equal during matching.

        The function will be called like

           node_match(G1.nodes[n1], G2.nodes[n2]).

        That is, the function will receive the node attribute
        dictionaries for n1 and n2 as inputs.

        Ignored if node_subst_cost is specified. If neither
        node_match nor node_subst_cost are specified then node
        attributes are not considered.

    edge_match : callable
        A function that returns True if the edge attribute dictionaries
        for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
        be considered equal during matching.

        The function will be called like

           edge_match(G1[u1][v1], G2[u2][v2]).

        That is, the function will receive the edge attribute
        dictionaries of the edges under consideration.

        Ignored if edge_subst_cost is specified. If neither
        edge_match nor edge_subst_cost are specified then edge
        attributes are not considered.

    node_subst_cost, node_del_cost, node_ins_cost : callable
        Functions that return the costs of node substitution, node
        deletion, and node insertion, respectively.

        The functions will be called like

           node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
           node_del_cost(G1.nodes[n1]),
           node_ins_cost(G2.nodes[n2]).

        That is, the functions will receive the node attribute
        dictionaries as inputs. The functions are expected to return
        positive numeric values.

        Function node_subst_cost overrides node_match if specified.
        If neither node_match nor node_subst_cost are specified then
        default node substitution cost of 0 is used (node attributes
        are not considered during matching).

        If node_del_cost is not specified then default node deletion
        cost of 1 is used. If node_ins_cost is not specified then
        default node insertion cost of 1 is used.

    edge_subst_cost, edge_del_cost, edge_ins_cost : callable
        Functions that return the costs of edge substitution, edge
        deletion, and edge insertion, respectively.

        The functions will be called like

           edge_subst_cost(G1[u1][v1], G2[u2][v2]),
           edge_del_cost(G1[u1][v1]),
           edge_ins_cost(G2[u2][v2]).

        That is, the functions will receive the edge attribute
        dictionaries as inputs. The functions are expected to return
        positive numeric values.

        Function edge_subst_cost overrides edge_match if specified.
        If neither edge_match nor edge_subst_cost are specified then
        default edge substitution cost of 0 is used (edge attributes
        are not considered during matching).

        If edge_del_cost is not specified then default edge deletion
        cost of 1 is used. If edge_ins_cost is not specified then
        default edge insertion cost of 1 is used.

    upper_bound : numeric
        Maximum edit distance to consider.

    Returns
    -------
    Generator of consecutive approximations of graph edit distance.

    Examples
    --------
    >>> G1 = nx.cycle_graph(6)
    >>> G2 = nx.wheel_graph(7)
    >>> for v in nx.optimize_graph_edit_distance(G1, G2):
    ...     minv = v
    >>> minv
    7.0
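
    The first approximation alone is a cheap upper bound on the exact
    distance (a minimal sketch; later values can only improve on it):

    >>> bound = next(nx.optimize_graph_edit_distance(G1, G2))
    >>> bound >= minv
    True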

    See Also
    --------
    graph_edit_distance, optimize_edit_paths

    References
    ----------
    .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
       Martineau. An Exact Graph Edit Distance Algorithm for Solving
       Pattern Recognition Problems. 4th International Conference on
       Pattern Recognition Applications and Methods 2015, Jan 2015,
       Lisbon, Portugal. 2015,
       <10.5220/0005209202710278>. <hal-01168816>
       https://hal.archives-ouvertes.fr/hal-01168816
    """
    for _, _, cost in optimize_edit_paths(
        G1,
        G2,
        node_match,
        edge_match,
        node_subst_cost,
        node_del_cost,
        node_ins_cost,
        edge_subst_cost,
        edge_del_cost,
        edge_ins_cost,
        upper_bound,
        True,
    ):
        yield cost


@nx._dispatchable(
    graphs={"G1": 0, "G2": 1}, preserve_edge_attrs=True, preserve_node_attrs=True
)
def optimize_edit_paths(
    G1,
    G2,
    node_match=None,
    edge_match=None,
    node_subst_cost=None,
    node_del_cost=None,
    node_ins_cost=None,
    edge_subst_cost=None,
    edge_del_cost=None,
    edge_ins_cost=None,
    upper_bound=None,
    strictly_decreasing=True,
    roots=None,
    timeout=None,
):
556 """GED (graph edit distance) calculation: advanced interface.
557
558 Graph edit path is a sequence of node and edge edit operations
559 transforming graph G1 to graph isomorphic to G2. Edit operations
560 include substitutions, deletions, and insertions.
561
562 Graph edit distance is defined as minimum cost of edit path.
563
    Parameters
    ----------
    G1, G2: graphs
        The two graphs G1 and G2 must be of the same type.

    node_match : callable
        A function that returns True if node n1 in G1 and n2 in G2
        should be considered equal during matching.

        The function will be called like

           node_match(G1.nodes[n1], G2.nodes[n2]).

        That is, the function will receive the node attribute
        dictionaries for n1 and n2 as inputs.

        Ignored if node_subst_cost is specified. If neither
        node_match nor node_subst_cost are specified then node
        attributes are not considered.

    edge_match : callable
        A function that returns True if the edge attribute dictionaries
        for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
        be considered equal during matching.

        The function will be called like

           edge_match(G1[u1][v1], G2[u2][v2]).

        That is, the function will receive the edge attribute
        dictionaries of the edges under consideration.

        Ignored if edge_subst_cost is specified. If neither
        edge_match nor edge_subst_cost are specified then edge
        attributes are not considered.

    node_subst_cost, node_del_cost, node_ins_cost : callable
        Functions that return the costs of node substitution, node
        deletion, and node insertion, respectively.

        The functions will be called like

           node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
           node_del_cost(G1.nodes[n1]),
           node_ins_cost(G2.nodes[n2]).

        That is, the functions will receive the node attribute
        dictionaries as inputs. The functions are expected to return
        positive numeric values.

        Function node_subst_cost overrides node_match if specified.
        If neither node_match nor node_subst_cost are specified then
        default node substitution cost of 0 is used (node attributes
        are not considered during matching).

        If node_del_cost is not specified then default node deletion
        cost of 1 is used. If node_ins_cost is not specified then
        default node insertion cost of 1 is used.

    edge_subst_cost, edge_del_cost, edge_ins_cost : callable
        Functions that return the costs of edge substitution, edge
        deletion, and edge insertion, respectively.

        The functions will be called like

           edge_subst_cost(G1[u1][v1], G2[u2][v2]),
           edge_del_cost(G1[u1][v1]),
           edge_ins_cost(G2[u2][v2]).

        That is, the functions will receive the edge attribute
        dictionaries as inputs. The functions are expected to return
        positive numeric values.

        Function edge_subst_cost overrides edge_match if specified.
        If neither edge_match nor edge_subst_cost are specified then
        default edge substitution cost of 0 is used (edge attributes
        are not considered during matching).

        If edge_del_cost is not specified then default edge deletion
        cost of 1 is used. If edge_ins_cost is not specified then
        default edge insertion cost of 1 is used.

    upper_bound : numeric
        Maximum edit distance to consider.

    strictly_decreasing : bool
        If True, return consecutive approximations of strictly
        decreasing cost. Otherwise, return all edit paths of cost
        less than or equal to the previous minimum cost.

    roots : 2-tuple
        Tuple where the first element is a node in G1 and the second
        is a node in G2.
        These nodes are forced to be matched in the comparison to
        allow comparison between rooted graphs.

    timeout : numeric
        Maximum number of seconds to execute.
        After the timeout is met, the current best GED is returned.

    Returns
    -------
    Generator of tuples (node_edit_path, edge_edit_path, cost)
        node_edit_path : list of tuples (u, v)
        edge_edit_path : list of tuples ((u1, v1), (u2, v2))
        cost : numeric
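
    Examples
    --------
    Iterating consumes successively better edit paths; with the default
    ``strictly_decreasing=True`` the last tuple yielded is optimal
    (a minimal sketch):

    >>> G1, G2 = nx.cycle_graph(4), nx.path_graph(4)
    >>> for node_path, edge_path, cost in nx.optimize_edit_paths(G1, G2):
    ...     best = node_path, edge_path, cost
    >>> best[2]
    1.0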

    See Also
    --------
    graph_edit_distance, optimize_graph_edit_distance, optimal_edit_paths

    References
    ----------
    .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
       Martineau. An Exact Graph Edit Distance Algorithm for Solving
       Pattern Recognition Problems. 4th International Conference on
       Pattern Recognition Applications and Methods 2015, Jan 2015,
       Lisbon, Portugal. 2015,
       <10.5220/0005209202710278>. <hal-01168816>
       https://hal.archives-ouvertes.fr/hal-01168816

    """
    # TODO: support DiGraph

    import numpy as np
    import scipy as sp

    @dataclass
    class CostMatrix:
        C: ...
        lsa_row_ind: ...
        lsa_col_ind: ...
        ls: ...

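    # A CostMatrix bundles a square cost matrix ``C`` with its optimal
    # linear sum assignment (``lsa_row_ind``/``lsa_col_ind``) and that
    # assignment's total cost ``ls``, which acts as an admissible lower
    # bound when the search below prunes partial edit paths.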
    def make_CostMatrix(C, m, n):
        # assert(C.shape == (m + n, m + n))
        lsa_row_ind, lsa_col_ind = sp.optimize.linear_sum_assignment(C)

        # Fixup dummy assignments:
        # each substitution i<->j should have dummy assignment m+j<->n+i
        # NOTE: fast reduce of Cv relies on it
        # Create masks for substitution and dummy indices
        is_subst = (lsa_row_ind < m) & (lsa_col_ind < n)
        is_dummy = (lsa_row_ind >= m) & (lsa_col_ind >= n)

        # Map dummy assignments to the correct indices
        lsa_row_ind[is_dummy] = lsa_col_ind[is_subst] + m
        lsa_col_ind[is_dummy] = lsa_row_ind[is_subst] + n

        return CostMatrix(
            C, lsa_row_ind, lsa_col_ind, C[lsa_row_ind, lsa_col_ind].sum()
        )

    def extract_C(C, i, j, m, n):
        # assert(C.shape == (m + n, m + n))
        row_ind = [k in i or k - m in j for k in range(m + n)]
        col_ind = [k in j or k - n in i for k in range(m + n)]
        return C[row_ind, :][:, col_ind]

    def reduce_C(C, i, j, m, n):
        # assert(C.shape == (m + n, m + n))
        row_ind = [k not in i and k - m not in j for k in range(m + n)]
        col_ind = [k not in j and k - n not in i for k in range(m + n)]
        return C[row_ind, :][:, col_ind]

    def reduce_ind(ind, i):
        # assert set(ind) == set(range(len(ind)))
        rind = ind[[k not in i for k in ind]]
        for k in set(i):
            rind[rind >= k] -= 1
        return rind

    def match_edges(u, v, pending_g, pending_h, Ce, matched_uv=None):
        """
        Parameters:
            u, v: matched vertices, u=None or v=None for
                deletion/insertion
            pending_g, pending_h: lists of edges not yet mapped
            Ce: CostMatrix of pending edge mappings
            matched_uv: partial vertex edit path
                list of tuples (u, v) of previously matched vertex
                mappings u<->v, u=None or v=None for
                deletion/insertion

        Returns:
            list of (i, j): indices of edge mappings g<->h
            localCe: local CostMatrix of edge mappings
                (basically submatrix of Ce at cross of rows i, cols j)
        """
        M = len(pending_g)
        N = len(pending_h)
        # assert Ce.C.shape == (M + N, M + N)

        # only attempt to match edges after one node match has been made
        # this will stop self-edges on the first node being automatically deleted
        # even when a substitution is the better option

        substitution_possible = M and N
        no_node_match_yet = matched_uv is None or len(matched_uv) == 0
        if no_node_match_yet and substitution_possible:
            g_ind = []
            h_ind = []
        else:
            g_ind = [
                i
                for i in range(M)
                if pending_g[i][:2] == (u, u)
                or any(
                    pending_g[i][:2] in ((p, u), (u, p), (p, p)) for p, q in matched_uv
                )
            ]
            h_ind = [
                j
                for j in range(N)
                if pending_h[j][:2] == (v, v)
                or any(
                    pending_h[j][:2] in ((q, v), (v, q), (q, q)) for p, q in matched_uv
                )
            ]

        m = len(g_ind)
        n = len(h_ind)

        if m or n:
            C = extract_C(Ce.C, g_ind, h_ind, M, N)
            # assert C.shape == (m + n, m + n)

            # Forbid structurally invalid matches
            # NOTE: inf remembered from Ce construction
            for k, i in enumerate(g_ind):
                g = pending_g[i][:2]
                for l, j in enumerate(h_ind):
                    h = pending_h[j][:2]
                    if nx.is_directed(G1) or nx.is_directed(G2):
                        if any(
                            g == (p, u) and h == (q, v) or g == (u, p) and h == (v, q)
                            for p, q in matched_uv
                        ):
                            continue
                    else:
                        if any(
                            g in ((p, u), (u, p)) and h in ((q, v), (v, q))
                            for p, q in matched_uv
                        ):
                            continue
                    if g == (u, u) or any(g == (p, p) for p, q in matched_uv):
                        continue
                    if h == (v, v) or any(h == (q, q) for p, q in matched_uv):
                        continue
                    C[k, l] = inf

            localCe = make_CostMatrix(C, m, n)
            ij = [
                (
                    g_ind[k] if k < m else M + h_ind[l],
                    h_ind[l] if l < n else N + g_ind[k],
                )
                for k, l in zip(localCe.lsa_row_ind, localCe.lsa_col_ind)
                if k < m or l < n
            ]

        else:
            ij = []
            localCe = CostMatrix(np.empty((0, 0)), [], [], 0)

        return ij, localCe

    def reduce_Ce(Ce, ij, m, n):
        if len(ij):
            i, j = zip(*ij)
            m_i = m - sum(1 for t in i if t < m)
            n_j = n - sum(1 for t in j if t < n)
            return make_CostMatrix(reduce_C(Ce.C, i, j, m, n), m_i, n_j)
        return Ce

    def get_edit_ops(
        matched_uv, pending_u, pending_v, Cv, pending_g, pending_h, Ce, matched_cost
    ):
        """
        Parameters:
            matched_uv: partial vertex edit path
                list of tuples (u, v) of vertex mappings u<->v,
                u=None or v=None for deletion/insertion
            pending_u, pending_v: lists of vertices not yet mapped
            Cv: CostMatrix of pending vertex mappings
            pending_g, pending_h: lists of edges not yet mapped
            Ce: CostMatrix of pending edge mappings
            matched_cost: cost of partial edit path

        Returns:
            sequence of
                (i, j): indices of vertex mapping u<->v
                Cv_ij: reduced CostMatrix of pending vertex mappings
                    (basically Cv with row i, col j removed)
                list of (x, y): indices of edge mappings g<->h
                Ce_xy: reduced CostMatrix of pending edge mappings
                    (basically Ce with rows x, cols y removed)
                cost: total cost of edit operation
            NOTE: most promising ops first
        """
        m = len(pending_u)
        n = len(pending_v)
        # assert Cv.C.shape == (m + n, m + n)

        # 1) a vertex mapping from optimal linear sum assignment
        i, j = min(
            (k, l) for k, l in zip(Cv.lsa_row_ind, Cv.lsa_col_ind) if k < m or l < n
        )
        xy, localCe = match_edges(
            pending_u[i] if i < m else None,
            pending_v[j] if j < n else None,
            pending_g,
            pending_h,
            Ce,
            matched_uv,
        )
        Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h))
        # assert Ce.ls <= localCe.ls + Ce_xy.ls
        if prune(matched_cost + Cv.ls + localCe.ls + Ce_xy.ls):
            pass
        else:
            # get reduced Cv efficiently
            Cv_ij = CostMatrix(
                reduce_C(Cv.C, (i,), (j,), m, n),
                reduce_ind(Cv.lsa_row_ind, (i, m + j)),
                reduce_ind(Cv.lsa_col_ind, (j, n + i)),
                Cv.ls - Cv.C[i, j],
            )
            yield (i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls

        # 2) other candidates, sorted by lower-bound cost estimate
        other = []
        fixed_i, fixed_j = i, j
        if m <= n:
            candidates = (
                (t, fixed_j)
                for t in range(m + n)
                if t != fixed_i and (t < m or t == m + fixed_j)
            )
        else:
            candidates = (
                (fixed_i, t)
                for t in range(m + n)
                if t != fixed_j and (t < n or t == n + fixed_i)
            )
        for i, j in candidates:
            if prune(matched_cost + Cv.C[i, j] + Ce.ls):
                continue
            Cv_ij = make_CostMatrix(
                reduce_C(Cv.C, (i,), (j,), m, n),
                m - 1 if i < m else m,
                n - 1 if j < n else n,
            )
            # assert Cv.ls <= Cv.C[i, j] + Cv_ij.ls
            if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + Ce.ls):
                continue
            xy, localCe = match_edges(
                pending_u[i] if i < m else None,
                pending_v[j] if j < n else None,
                pending_g,
                pending_h,
                Ce,
                matched_uv,
            )
            if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + localCe.ls):
                continue
            Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h))
            # assert Ce.ls <= localCe.ls + Ce_xy.ls
            if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + localCe.ls + Ce_xy.ls):
                continue
            other.append(((i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls))

        yield from sorted(other, key=lambda t: t[4] + t[1].ls + t[3].ls)

    def get_edit_paths(
        matched_uv,
        pending_u,
        pending_v,
        Cv,
        matched_gh,
        pending_g,
        pending_h,
        Ce,
        matched_cost,
    ):
        """
        Parameters:
            matched_uv: partial vertex edit path
                list of tuples (u, v) of vertex mappings u<->v,
                u=None or v=None for deletion/insertion
            pending_u, pending_v: lists of vertices not yet mapped
            Cv: CostMatrix of pending vertex mappings
            matched_gh: partial edge edit path
                list of tuples (g, h) of edge mappings g<->h,
                g=None or h=None for deletion/insertion
            pending_g, pending_h: lists of edges not yet mapped
            Ce: CostMatrix of pending edge mappings
            matched_cost: cost of partial edit path

        Returns:
            sequence of (vertex_path, edge_path, cost)
                vertex_path: complete vertex edit path
                    list of tuples (u, v) of vertex mappings u<->v,
                    u=None or v=None for deletion/insertion
                edge_path: complete edge edit path
                    list of tuples (g, h) of edge mappings g<->h,
                    g=None or h=None for deletion/insertion
                cost: total cost of edit path
            NOTE: path costs are non-increasing
        """
        if prune(matched_cost + Cv.ls + Ce.ls):
            return

        if not max(len(pending_u), len(pending_v)):
            # assert not len(pending_g)
            # assert not len(pending_h)
            # path completed!
            # assert matched_cost <= maxcost_value
            nonlocal maxcost_value
            maxcost_value = min(maxcost_value, matched_cost)
            yield matched_uv, matched_gh, matched_cost

        else:
            edit_ops = get_edit_ops(
                matched_uv,
                pending_u,
                pending_v,
                Cv,
                pending_g,
                pending_h,
                Ce,
                matched_cost,
            )
            for ij, Cv_ij, xy, Ce_xy, edit_cost in edit_ops:
                i, j = ij
                # assert Cv.C[i, j] + sum(Ce.C[t] for t in xy) == edit_cost
                if prune(matched_cost + edit_cost + Cv_ij.ls + Ce_xy.ls):
                    continue

                # dive deeper
                u = pending_u.pop(i) if i < len(pending_u) else None
                v = pending_v.pop(j) if j < len(pending_v) else None
                matched_uv.append((u, v))
                for x, y in xy:
                    len_g = len(pending_g)
                    len_h = len(pending_h)
                    matched_gh.append(
                        (
                            pending_g[x] if x < len_g else None,
                            pending_h[y] if y < len_h else None,
                        )
                    )
                sortedx = sorted(x for x, y in xy)
                sortedy = sorted(y for x, y in xy)
                G = [
                    (pending_g.pop(x) if x < len(pending_g) else None)
                    for x in reversed(sortedx)
                ]
                H = [
                    (pending_h.pop(y) if y < len(pending_h) else None)
                    for y in reversed(sortedy)
                ]

                yield from get_edit_paths(
                    matched_uv,
                    pending_u,
                    pending_v,
                    Cv_ij,
                    matched_gh,
                    pending_g,
                    pending_h,
                    Ce_xy,
                    matched_cost + edit_cost,
                )

                # backtrack
                if u is not None:
                    pending_u.insert(i, u)
                if v is not None:
                    pending_v.insert(j, v)
                matched_uv.pop()
                for x, g in zip(sortedx, reversed(G)):
                    if g is not None:
                        pending_g.insert(x, g)
                for y, h in zip(sortedy, reversed(H)):
                    if h is not None:
                        pending_h.insert(y, h)
                for _ in xy:
                    matched_gh.pop()

    # Initialization

    pending_u = list(G1.nodes)
    pending_v = list(G2.nodes)

    initial_cost = 0
    if roots:
        root_u, root_v = roots
        if root_u not in pending_u or root_v not in pending_v:
            raise nx.NodeNotFound("Root node not in graph.")

        # remove roots from pending
        pending_u.remove(root_u)
        pending_v.remove(root_v)

    # cost matrix of vertex mappings
    m = len(pending_u)
    n = len(pending_v)
    C = np.zeros((m + n, m + n))
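    # Block layout of the (m + n) x (m + n) cost matrix (the same layout
    # is reused for the edge cost matrix Ce further below):
    #   C[0:m, 0:n]      substitution costs
    #   C[0:m, n:n+m]    deletion costs on the diagonal, inf elsewhere
    #   C[m:m+n, 0:n]    insertion costs on the diagonal, inf elsewhere
    #   C[m:m+n, n:n+m]  zeros, i.e. dummy <-> dummy assignments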
    if node_subst_cost:
        C[0:m, 0:n] = np.array(
            [
                node_subst_cost(G1.nodes[u], G2.nodes[v])
                for u in pending_u
                for v in pending_v
            ]
        ).reshape(m, n)
        if roots:
            initial_cost = node_subst_cost(G1.nodes[root_u], G2.nodes[root_v])
    elif node_match:
        C[0:m, 0:n] = np.array(
            [
                1 - int(node_match(G1.nodes[u], G2.nodes[v]))
                for u in pending_u
                for v in pending_v
            ]
        ).reshape(m, n)
        if roots:
            initial_cost = 1 - node_match(G1.nodes[root_u], G2.nodes[root_v])
    else:
        # all zeroes
        pass
    # assert not min(m, n) or C[0:m, 0:n].min() >= 0
    if node_del_cost:
        del_costs = [node_del_cost(G1.nodes[u]) for u in pending_u]
    else:
        del_costs = [1] * len(pending_u)
    # assert not m or min(del_costs) >= 0
    if node_ins_cost:
        ins_costs = [node_ins_cost(G2.nodes[v]) for v in pending_v]
    else:
        ins_costs = [1] * len(pending_v)
    # assert not n or min(ins_costs) >= 0
    inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1
    C[0:m, n : n + m] = np.array(
        [del_costs[i] if i == j else inf for i in range(m) for j in range(m)]
    ).reshape(m, m)
    C[m : m + n, 0:n] = np.array(
        [ins_costs[i] if i == j else inf for i in range(n) for j in range(n)]
    ).reshape(n, n)
    Cv = make_CostMatrix(C, m, n)

    pending_g = list(G1.edges)
    pending_h = list(G2.edges)

    # cost matrix of edge mappings
    m = len(pending_g)
    n = len(pending_h)
    C = np.zeros((m + n, m + n))
    if edge_subst_cost:
        C[0:m, 0:n] = np.array(
            [
                edge_subst_cost(G1.edges[g], G2.edges[h])
                for g in pending_g
                for h in pending_h
            ]
        ).reshape(m, n)
    elif edge_match:
        C[0:m, 0:n] = np.array(
            [
                1 - int(edge_match(G1.edges[g], G2.edges[h]))
                for g in pending_g
                for h in pending_h
            ]
        ).reshape(m, n)
    else:
        # all zeroes
        pass
    # assert not min(m, n) or C[0:m, 0:n].min() >= 0
    if edge_del_cost:
        del_costs = [edge_del_cost(G1.edges[g]) for g in pending_g]
    else:
        del_costs = [1] * len(pending_g)
    # assert not m or min(del_costs) >= 0
    if edge_ins_cost:
        ins_costs = [edge_ins_cost(G2.edges[h]) for h in pending_h]
    else:
        ins_costs = [1] * len(pending_h)
    # assert not n or min(ins_costs) >= 0
    inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1
    C[0:m, n : n + m] = np.array(
        [del_costs[i] if i == j else inf for i in range(m) for j in range(m)]
    ).reshape(m, m)
    C[m : m + n, 0:n] = np.array(
        [ins_costs[i] if i == j else inf for i in range(n) for j in range(n)]
    ).reshape(n, n)
    Ce = make_CostMatrix(C, m, n)

    maxcost_value = Cv.C.sum() + Ce.C.sum() + 1

    if timeout is not None:
        if timeout <= 0:
            raise nx.NetworkXError("Timeout value must be greater than 0")
        start = time.perf_counter()

    def prune(cost):
        if timeout is not None:
            if time.perf_counter() - start > timeout:
                return True
        if upper_bound is not None:
            if cost > upper_bound:
                return True
        if cost > maxcost_value:
            return True
        if strictly_decreasing and cost >= maxcost_value:
            return True
        return False

    # Now go!

    done_uv = [] if roots is None else [roots]

    for vertex_path, edge_path, cost in get_edit_paths(
        done_uv, pending_u, pending_v, Cv, [], pending_g, pending_h, Ce, initial_cost
    ):
        # assert sorted(G1.nodes) == sorted(u for u, v in vertex_path if u is not None)
        # assert sorted(G2.nodes) == sorted(v for u, v in vertex_path if v is not None)
        # assert sorted(G1.edges) == sorted(g for g, h in edge_path if g is not None)
        # assert sorted(G2.edges) == sorted(h for g, h in edge_path if h is not None)
        # print(vertex_path, edge_path, cost, file=sys.stderr)
        # assert cost == maxcost_value
        yield list(vertex_path), list(edge_path), float(cost)


@nx._dispatchable
def simrank_similarity(
    G,
    source=None,
    target=None,
    importance_factor=0.9,
    max_iterations=1000,
    tolerance=1e-4,
):
1207 """Returns the SimRank similarity of nodes in the graph ``G``.
1208
1209 SimRank is a similarity metric that says "two objects are considered
1210 to be similar if they are referenced by similar objects." [1]_.
1211
1212 The pseudo-code definition from the paper is::
1213
1214 def simrank(G, u, v):
1215 in_neighbors_u = G.predecessors(u)
1216 in_neighbors_v = G.predecessors(v)
1217 scale = C / (len(in_neighbors_u) * len(in_neighbors_v))
1218 return scale * sum(
1219 simrank(G, w, x) for w, x in product(in_neighbors_u, in_neighbors_v)
1220 )
1221
1222 where ``G`` is the graph, ``u`` is the source, ``v`` is the target,
1223 and ``C`` is a float decay or importance factor between 0 and 1.
1224
1225 The SimRank algorithm for determining node similarity is defined in
1226 [2]_.

    Parameters
    ----------
    G : NetworkX graph
        A NetworkX graph

    source : node
        If this is specified, the returned dictionary maps each node
        ``v`` in the graph to the similarity between ``source`` and
        ``v``.

    target : node
        If both ``source`` and ``target`` are specified, the similarity
        value between ``source`` and ``target`` is returned. If
        ``target`` is specified but ``source`` is not, this argument is
        ignored.

    importance_factor : float
        The relative importance of indirect neighbors with respect to
        direct neighbors.

    max_iterations : integer
        Maximum number of iterations.

    tolerance : float
        Error tolerance used to check convergence. When an iteration of
        the algorithm finds that no similarity value changes more than
        this amount, the algorithm halts.

    Returns
    -------
    similarity : dictionary or float
        If ``source`` and ``target`` are both ``None``, this returns a
        dictionary of dictionaries, where keys are node pairs and values
        are the similarity of the pair of nodes.

        If ``source`` is not ``None`` but ``target`` is, this returns a
        dictionary mapping each node to the similarity of ``source`` and
        that node.

        If neither ``source`` nor ``target`` is ``None``, this returns
        the similarity value for the given pair of nodes.

    Raises
    ------
    ExceededMaxIterations
        If the algorithm does not converge within ``max_iterations``.

    NodeNotFound
        If either ``source`` or ``target`` is not in `G`.

    Examples
    --------
    >>> G = nx.cycle_graph(2)
    >>> nx.simrank_similarity(G)
    {0: {0: 1.0, 1: 0.0}, 1: {0: 0.0, 1: 1.0}}
    >>> nx.simrank_similarity(G, source=0)
    {0: 1.0, 1: 0.0}
    >>> nx.simrank_similarity(G, source=0, target=0)
    1.0

    The result of this function can be converted to a numpy array
    representing the SimRank matrix by using the node order of the
    graph to determine which row and column represent each node.
    Other orderings of nodes are also possible.

    >>> import numpy as np
    >>> sim = nx.simrank_similarity(G)
    >>> np.array([[sim[u][v] for v in G] for u in G])
    array([[1., 0.],
           [0., 1.]])
    >>> sim_1d = nx.simrank_similarity(G, source=0)
    >>> np.array([sim_1d[v] for v in G])
    array([1., 0.])

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/SimRank
    .. [2] G. Jeh and J. Widom.
       "SimRank: a measure of structural-context similarity",
       In KDD'02: Proceedings of the Eighth ACM SIGKDD
       International Conference on Knowledge Discovery and Data Mining,
       pp. 538--543. ACM Press, 2002.
    """
    import numpy as np

    nodelist = list(G)
    if source is not None:
        if source not in nodelist:
            raise nx.NodeNotFound(f"Source node {source} not in G")
        else:
            s_indx = nodelist.index(source)
    else:
        s_indx = None

    if target is not None:
        if target not in nodelist:
            raise nx.NodeNotFound(f"Target node {target} not in G")
        else:
            t_indx = nodelist.index(target)
    else:
        t_indx = None

    x = _simrank_similarity_numpy(
        G, s_indx, t_indx, importance_factor, max_iterations, tolerance
    )

    if isinstance(x, np.ndarray):
        if x.ndim == 1:
            return dict(zip(G, x.tolist()))
        # else x.ndim == 2
        return {u: dict(zip(G, row)) for u, row in zip(G, x.tolist())}
    return float(x)


def _simrank_similarity_python(
    G,
    source=None,
    target=None,
    importance_factor=0.9,
    max_iterations=1000,
    tolerance=1e-4,
):
    """Returns the SimRank similarity of nodes in the graph ``G``.

    This pure Python version is provided for pedagogical purposes.

    Examples
    --------
    >>> G = nx.cycle_graph(2)
    >>> nx.similarity._simrank_similarity_python(G)
    {0: {0: 1, 1: 0.0}, 1: {0: 0.0, 1: 1}}
    >>> nx.similarity._simrank_similarity_python(G, source=0)
    {0: 1, 1: 0.0}
    >>> nx.similarity._simrank_similarity_python(G, source=0, target=0)
    1
    """
    # build up our similarity adjacency dictionary output
    newsim = {u: {v: 1 if u == v else 0 for v in G} for u in G}

    # These functions compute the update to the similarity value of the nodes
    # `u` and `v` with respect to the previous similarity values.
    def avg_sim(s):
        return sum(newsim[w][x] for (w, x) in s) / len(s) if s else 0.0

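    # SimRank is defined over in-neighbors, so directed graphs use the
    # predecessor view; for undirected graphs the adjacency itself plays
    # that role.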
    Gadj = G.pred if G.is_directed() else G.adj

    def sim(u, v):
        return importance_factor * avg_sim(list(product(Gadj[u], Gadj[v])))

    for its in range(max_iterations):
        oldsim = newsim
        newsim = {u: {v: sim(u, v) if u != v else 1 for v in G} for u in G}
        is_close = all(
            all(
                abs(newsim[u][v] - old) <= tolerance * (1 + abs(old))
                for v, old in nbrs.items()
            )
            for u, nbrs in oldsim.items()
        )
        if is_close:
            break

    if its + 1 == max_iterations:
        raise nx.ExceededMaxIterations(
            f"simrank did not converge after {max_iterations} iterations."
        )

    if source is not None and target is not None:
        return newsim[source][target]
    if source is not None:
        return newsim[source]
    return newsim


def _simrank_similarity_numpy(
    G,
    source=None,
    target=None,
    importance_factor=0.9,
    max_iterations=1000,
    tolerance=1e-4,
):
1410 """Calculate SimRank of nodes in ``G`` using matrices with ``numpy``.
1411
1412 The SimRank algorithm for determining node similarity is defined in
1413 [1]_.
1414
1415 Parameters
1416 ----------
1417 G : NetworkX graph
1418 A NetworkX graph
1419
1420 source : node
1421 If this is specified, the returned dictionary maps each node
1422 ``v`` in the graph to the similarity between ``source`` and
1423 ``v``.
1424
1425 target : node
1426 If both ``source`` and ``target`` are specified, the similarity
1427 value between ``source`` and ``target`` is returned. If
1428 ``target`` is specified but ``source`` is not, this argument is
1429 ignored.
1430
1431 importance_factor : float
1432 The relative importance of indirect neighbors with respect to
1433 direct neighbors.
1434
1435 max_iterations : integer
1436 Maximum number of iterations.
1437
1438 tolerance : float
1439 Error tolerance used to check convergence. When an iteration of
1440 the algorithm finds that no similarity value changes more than
1441 this amount, the algorithm halts.
1442
1443 Returns
1444 -------
1445 similarity : numpy array or float
1446 If ``source`` and ``target`` are both ``None``, this returns a
1447 2D array containing SimRank scores of the nodes.
1448
1449 If ``source`` is not ``None`` but ``target`` is, this returns an
1450 1D array containing SimRank scores of ``source`` and that
1451 node.
1452
1453 If neither ``source`` nor ``target`` is ``None``, this returns
1454 the similarity value for the given pair of nodes.
1455
1456 Examples
1457 --------
1458 >>> G = nx.cycle_graph(2)
1459 >>> nx.similarity._simrank_similarity_numpy(G)
1460 array([[1., 0.],
1461 [0., 1.]])
1462 >>> nx.similarity._simrank_similarity_numpy(G, source=0)
1463 array([1., 0.])
1464 >>> nx.similarity._simrank_similarity_numpy(G, source=0, target=0)
1465 1.0
1466
1467 References
1468 ----------
1469 .. [1] G. Jeh and J. Widom.
1470 "SimRank: a measure of structural-context similarity",
1471 In KDD'02: Proceedings of the Eighth ACM SIGKDD
1472 International Conference on Knowledge Discovery and Data Mining,
1473 pp. 538--543. ACM Press, 2002.
1474 """
    # This algorithm follows roughly
    #
    #     S = max{C * (A.T * S * A), I}
    #
    # where C is the importance factor, A is the column normalized
    # adjacency matrix, and I is the identity matrix.
    import numpy as np

    adjacency_matrix = nx.to_numpy_array(G)

    # column-normalize the ``adjacency_matrix``
    s = np.array(adjacency_matrix.sum(axis=0))
    s[s == 0] = 1
    adjacency_matrix /= s  # adjacency_matrix.sum(axis=0)

    newsim = np.eye(len(G), dtype=np.float64)
    for its in range(max_iterations):
        prevsim = newsim.copy()
        newsim = importance_factor * ((adjacency_matrix.T @ prevsim) @ adjacency_matrix)
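        # pin the self-similarity at 1.0 on every sweep (s(v, v) = 1 by definition)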
        np.fill_diagonal(newsim, 1.0)

        if np.allclose(prevsim, newsim, atol=tolerance):
            break

    if its + 1 == max_iterations:
        raise nx.ExceededMaxIterations(
            f"simrank did not converge after {max_iterations} iterations."
        )

    if source is not None and target is not None:
        return float(newsim[source, target])
    if source is not None:
        return newsim[source]
    return newsim


@np_random_state("seed")
def _prepare_panther_paths(
    G,
    source,
    path_length=5,
    c=0.5,
    delta=0.1,
    eps=None,
    weight="weight",
    remove_isolates=True,
    k=None,
    seed=None,
):
1524 """Common preparation code for Panther similarity algorithms.
1525
1526 Parameters
1527 ----------
1528 G : NetworkX graph
1529 A NetworkX graph
1530 source : node
1531 Source node for similarity calculation
1532 path_length : int
1533 How long the randomly generated paths should be
1534 c : float
1535 A universal constant that controls the number of random paths to generate
1536 delta : float
1537 The probability parameter for similarity approximation
1538 eps : float or None
1539 The error bound for similarity approximation
1540 weight : string or None
1541 The name of an edge attribute that holds the numerical value used as a weight
1542 remove_isolates : bool
1543 Whether to remove isolated nodes from graph processing
1544 k : int or None
1545 The number of most similar nodes to return. If provided, validates that
1546 ``k`` is not greater than the number of nodes in the graph.
1547 seed : integer, random_state, or None (default)
1548 Indicator of random number generation state.
1549 See :ref:`Randomness<randomness>`.
1550
1551 Returns
1552 -------
1553 PantherPaths
1554 A tuple containing the prepared data:
1555 - G: The graph (possibly with isolates removed)
1556 - inv_node_map: Dictionary mapping node names to indices
1557 - index_map: Populated index map of paths
1558 - inv_sample_size: Inverse of sample size (for fast calculation)
1559 - eps: Error bound for similarity approximation
1560 """
    import numpy as np

    if source not in G:
        raise nx.NodeNotFound(f"Source node {source} not in G")

    isolates = set(nx.isolates(G))

    if source in isolates:
        raise nx.NetworkXUnfeasible(
            f"Panther similarity is not defined for the isolated source node {source}."
        )

    if remove_isolates:
        G = G.subgraph(node for node in G if node not in isolates).copy()

    # According to [1], they empirically determined
    # a good value for ``eps`` to be sqrt( 1 / |E| )
    if eps is None:
        eps = np.sqrt(1.0 / G.number_of_edges())

    num_nodes = G.number_of_nodes()

    # Check if k is provided and validate it against the number of nodes
    if k is not None and not remove_isolates:  # For panther_vector_similarity
        if num_nodes < k:
            raise nx.NetworkXUnfeasible(
                f"The number of requested nodes {k} is greater than the number of nodes {num_nodes}."
            )

    inv_node_map = {name: index for index, name in enumerate(G)}

    # Calculate the sample size ``R`` for how many paths
    # to randomly generate
    t_choose_2 = math.comb(path_length, 2)
    sample_size = int((c / eps**2) * (np.log2(t_choose_2) + 1 + np.log(1 / delta)))
    index_map = {}

    # Check for isolated nodes before generating random paths.
    # If there are still isolated nodes in the graph after filtering,
    # they will cause issues with path generation.
    remaining_isolates = set(nx.isolates(G))
    if remaining_isolates:
        raise nx.NetworkXUnfeasible(
            f"Cannot generate random paths with isolated nodes present: {remaining_isolates}"
        )

    # Generate the random paths and populate the index_map
    for _ in generate_random_paths(
        G,
        sample_size,
        path_length=path_length,
        index_map=index_map,
        weight=weight,
        seed=seed,
    ):
        # NOTE: index_map is modified in-place by `generate_random_paths`
        pass

    return (
        G,  # The graph with isolated nodes removed
        inv_node_map,
        index_map,
        1 / sample_size,
        eps,
    )


@np_random_state("seed")
@nx._dispatchable(edge_attrs="weight")
def panther_similarity(
    G,
    source,
    k=5,
    path_length=5,
    c=0.5,
    delta=0.1,
    eps=None,
    weight="weight",
    seed=None,
):
1641 r"""Returns the Panther similarity of nodes in the graph `G` to node ``v``.
1642
1643 Panther is a similarity metric that says "two objects are considered
1644 to be similar if they frequently appear on the same paths." [1]_.
1645
    Parameters
    ----------
    G : NetworkX graph
        A NetworkX graph
    source : node
        Source node for which to find the top `k` similar other nodes
    k : int (default = 5)
        The number of most similar nodes to return.
    path_length : int (default = 5)
        How long the randomly generated paths should be (``T`` in [1]_)
    c : float (default = 0.5)
        A universal constant that controls the number of random paths to generate.
        Higher values increase the number of sample paths and potentially improve
        accuracy at the cost of more computation. Defaults to 0.5 as recommended
        in [1]_.
    delta : float (default = 0.1)
        The probability that the estimated similarity $S$ fails to be an
        epsilon-approximation of the true similarity, taken over the $R$
        sampled random paths. Smaller values give stronger guarantees but
        require more sample paths.
    eps : float or None (default = None)
        The error bound for similarity approximation. This controls the accuracy
        of the sampled paths in representing the true similarity. Smaller values
        yield more accurate results but require more sample paths. If `None`, a
        value of ``sqrt(1/|E|)`` is used, which the authors found empirically
        effective.
    weight : string or None, optional (default="weight")
        The name of an edge attribute that holds the numerical value
        used as a weight. If None then each edge has weight 1.
    seed : integer, random_state, or None (default)
        Indicator of random number generation state.
        See :ref:`Randomness<randomness>`.

    Returns
    -------
    similarity : dictionary
        Dictionary of nodes to similarity scores (as floats). Note:
        the self-similarity (i.e., ``source``) will not be included in
        the returned dictionary, so for ``k = 5`` the dictionary holds
        the top 5 most similar nodes other than ``source``.

    Raises
    ------
    NetworkXUnfeasible
        If `source` is an isolated node.

    NodeNotFound
        If `source` is not in `G`.

    Notes
    -----
    The isolated nodes in `G` are ignored.

    Examples
    --------
    >>> G = nx.star_graph(10)
    >>> sim = nx.panther_similarity(G, 0)
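
    The self-similarity is dropped, so with the default ``k`` the result
    holds five other nodes (the sampled paths are random, but the size of
    the result is stable):

    >>> len(sim)
    5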

    References
    ----------
    .. [1] Zhang, J., Tang, J., Ma, C., Tong, H., Jing, Y., & Li, J.
       Panther: Fast top-k similarity search on large networks.
       In Proceedings of the ACM SIGKDD International Conference
       on Knowledge Discovery and Data Mining (Vol. 2015-August, pp. 1445–1454).
       Association for Computing Machinery. https://doi.org/10.1145/2783258.2783267.
1710 """
1711 import numpy as np
1712
1713 # Use helper method to prepare common data structures
1714 G, inv_node_map, index_map, inv_sample_size, eps = _prepare_panther_paths(
1715 G,
1716 source,
1717 path_length=path_length,
1718 c=c,
1719 delta=delta,
1720 eps=eps,
1721 weight=weight,
1722 k=k,
1723 seed=seed,
1724 )
1725
1726 num_nodes = G.number_of_nodes()
1727 node_list = list(G.nodes)
1728
1729 # Check number of nodes after any modifications by _prepare_panther_paths
1730 if num_nodes < k:
1731 raise nx.NetworkXUnfeasible(
1732 f"The number of requested nodes {k} is greater than the number of nodes {num_nodes}."
1733 )
1734
1735 S = np.zeros(num_nodes)
1736 source_paths = set(index_map[source])
1737
1738 # Calculate the path similarities
1739 # between ``source`` (v) and ``node`` (v_j)
1740 # using our inverted index mapping of
1741 # vertices to paths
1742 for node, paths in index_map.items():
1743 # Only consider paths where both
1744 # ``node`` and ``source`` are present
1745 common_paths = source_paths.intersection(paths)
1746 S[inv_node_map[node]] = len(common_paths) * inv_sample_size
1747
1748 # Retrieve top ``k+1`` similar to account for removing self-similarity
1749 # Note: the below performed anywhere from 4-10x faster
1750 # (depending on input sizes) vs the equivalent ``np.argsort(S)[::-1]``
1751 partition_k = min(k + 1, num_nodes)
1752 top_k_unsorted = np.argpartition(S, -partition_k)[-partition_k:]
1753 top_k_sorted = top_k_unsorted[np.argsort(S[top_k_unsorted])][::-1]
1754
1755 # Add back the similarity scores
1756 # Convert numpy scalars to native Python types for dispatch compatibility
1757 top_k_with_val = dict(
1758 zip((node_list[i] for i in top_k_sorted), S[top_k_sorted].tolist())
1759 )
1760
1761 # Remove the self-similarity
1762 top_k_with_val.pop(source, None)
1763 return top_k_with_val


@np_random_state("seed")
@nx._dispatchable(edge_attrs="weight")
def panther_vector_similarity(
    G,
    source,
    *,
    D=10,
    k=5,
    path_length=5,
    c=0.5,
    delta=0.1,
    eps=None,
    weight="weight",
    seed=None,
):
1781 r"""Returns the Panther vector similarity (Panther++) of nodes in `G`.
1782
1783 Computes similarity between nodes based on the "Panther++" algorithm [1]_, which extends
1784 the basic Panther algorithm by using feature vectors to better capture structural
1785 similarity.
1786
1787 While basic Panther similarity measures how often two nodes appear on the same paths,
1788 Panther vector similarity (Panther++) creates a ``D``-dimensional feature vector for each
1789 node using its top similarity scores with other nodes, then computes similarity based
1790 on the Euclidean distance between these feature vectors. This approach better captures
1791 structural similarity and addresses the bias towards close neighbors present in
1792 the original Panther algorithm.
1793
1794 This approach is preferred when:
1795
1796 1. You need better structural similarity than basic path co-occurrence
1797 2. You want to overcome the close-neighbor bias of standard Panther
1798 3. You're working with large graphs where k-d tree indexing would be beneficial
1799 4. Graph edit distance-like similarity is more appropriate than path co-occurrence
1800
1801 Parameters
1802 ----------
1803 G : NetworkX graph
1804 A NetworkX graph
1805 source : node
        Source node for which to find the ``k`` most similar nodes
1807 D : int
1808 The number of similarity scores to use (in descending order)
1809 for each feature vector. Defaults to 10. Note that the original paper
1810 used D=50 [1]_, but KDTree is optimized for lower dimensions.
1811 k : int
1812 The number of most similar nodes to return
1813 path_length : int
1814 How long the randomly generated paths should be (``T`` in [1]_)
1815 c : float
1816 A universal constant that controls the number of random paths to generate.
1817 Higher values increase the number of sample paths and potentially improve
1818 accuracy at the cost of more computation. Defaults to 0.5 as recommended
1819 in [1]_.
1820 delta : float
1821 The probability that ``S`` is not an epsilon-approximation to (R, phi)
1822 eps : float
1823 The error bound for similarity approximation. This controls the accuracy
1824 of the sampled paths in representing the true similarity. Smaller values
1825 yield more accurate results but require more sample paths. If None, a
1826 value of ``sqrt(1/|E|)`` is used, which the authors found empirically
1827 effective.
1828 weight : string or None, optional (default="weight")
1829 The name of an edge attribute that holds the numerical value
1830 used as a weight. If `None` then each edge has weight 1.
1831 seed : integer, random_state, or None (default)
1832 Indicator of random number generation state.
1833 See :ref:`Randomness<randomness>`.
1834
1835 Returns
1836 -------
1837 similarity : dict
1838 Dict of nodes to similarity scores (as floats).
        Note: the self-similarity (i.e., `source`) is not included in the dict.
1840
1841 Examples
1842 --------
1843 >>> G = nx.star_graph(100)
1844
1845 The "hub" node is distinct from the "spoke" nodes
1846
1847 >>> from pprint import pprint
1848 >>> pprint(nx.panther_vector_similarity(G, source=0, seed=42))
1849 {35: 0.10402634656233918,
1850 61: 0.10434063328712018,
1851 65: 0.10401247833456054,
1852 85: 0.10506718868571752,
1853 88: 0.10402634656233918}
1854
1855 But "spoke" nodes are similar to one another
1856
1857 >>> result = nx.panther_vector_similarity(G, source=1, seed=42)
1858 >>> len(result)
1859 5
1860 >>> all(similarity == 1.0 for similarity in result.values())
1861 True
1862
1863 Notes
1864 -----
    When several feature vectors are equidistant from the source's vector,
    which of the tied nodes are returned depends on the KDTree's internal
    tie-breaking and may vary between environments.
    Using the same ``seed`` parameter ensures reproducible results.
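
    In the notation of [1]_, each node ``v`` is assigned a feature vector
    ``Theta(v)`` holding its ``D`` largest path-similarity scores in
    descending order, and the vector similarity of two nodes is
    ``1 / ||Theta(v_i) - Theta(v_j)||``. This implementation additionally
    normalizes the returned scores to the range ``[0, 1]``.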
1868
1869 References
1870 ----------
1871 .. [1] Zhang, J., Tang, J., Ma, C., Tong, H., Jing, Y., & Li, J.
1872 Panther: Fast top-k similarity search on large networks.
1873 In Proceedings of the ACM SIGKDD International Conference
1874 on Knowledge Discovery and Data Mining (Vol. 2015-August, pp. 1445–1454).
1875 Association for Computing Machinery. https://doi.org/10.1145/2783258.2783267.
1876 """
1877 import numpy as np
1878 import scipy as sp
1879
    # Use the helper function to prepare common data structures,
    # but keep isolates in the graph
1881 G, inv_node_map, index_map, inv_sample_size, eps = _prepare_panther_paths(
1882 G,
1883 source,
1884 path_length=path_length,
1885 c=c,
1886 delta=delta,
1887 eps=eps,
1888 weight=weight,
1889 remove_isolates=False,
1890 k=k,
1891 seed=seed,
1892 )
1893 num_nodes = G.number_of_nodes()
1894 node_list = list(G.nodes)
1895
1896 # Ensure D doesn't exceed the number of nodes
1897 if num_nodes < D:
1898 raise nx.NetworkXUnfeasible(
1899 f"The number of requested similarity scores {D} is greater than the number of nodes {num_nodes}."
1900 )
1901
1902 similarities = np.zeros((num_nodes, num_nodes))
1903 theta = np.zeros((num_nodes, D))
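    # theta[i] will hold the ``D`` largest similarity scores of node i in
    # descending order -- this is the feature vector Theta(v_i) from the paper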
1904 index_map_sets = {node: set(paths) for node, paths in index_map.items()}
1905
1906 # Calculate the path similarities for each node
1907 for vi_idx, vi in enumerate(G.nodes):
1908 vi_paths = index_map_sets[vi]
1909
1910 for node, node_paths in index_map_sets.items():
1911 # Calculate similarity score
1912 common_path_count = len(vi_paths.intersection(node_paths))
1913 similarities[vi_idx, inv_node_map[node]] = (
1914 common_path_count * inv_sample_size
1915 )
1916
1917 # Build up the feature vector using the largest D similarity scores
1918 theta[vi_idx] = np.sort(np.partition(similarities[vi_idx], -D)[-D:])[::-1]
1919
1920 # Insert the feature vectors into a k-d tree
1921 # for fast retrieval
1922 kdtree = sp.spatial.KDTree(theta)
1923
1924 # Retrieve top ``k+1`` similar vertices (i.e., vectors)
1925 # (based on their Euclidean distance)
1926 # Note that it's k+1 because the source node will be included and later removed
1927 query_k = min(k + 1, num_nodes)
1928 neighbor_distances, nearest_neighbors = kdtree.query(
1929 theta[inv_node_map[source]], k=query_k
1930 )
1931
1932 # Ensure results are always arrays (KDTree returns scalars when k=1)
1933 neighbor_distances = np.atleast_1d(neighbor_distances)
1934 nearest_neighbors = np.atleast_1d(nearest_neighbors)
1935
1936 # The paper defines the similarity S(v_i, v_j) as
1937 # 1 / || Theta(v_i) - Theta(v_j) ||
1938 # Calculate reciprocals and normalize to [0, 1] range
1939
1940 # Handle the case where distances are very small or zero (common in small graphs)
1941 # Use the passed in eps parameter instead of defining a new epsilon
1942 neighbor_distances = np.maximum(neighbor_distances, eps)
    # Use a new name here rather than reusing ``similarities``
    # (the full (num_nodes, num_nodes) matrix above)
    sim_scores = 1 / neighbor_distances

    # Always normalize to ensure values are between 0 and 1
    if len(sim_scores) > 0 and (max_sim := np.max(sim_scores)) > 0:
        sim_scores /= max_sim

    # Map the nearest neighbors back to node labels with their scores
    # Convert numpy scalars to native Python types for dispatch compatibility
    top_k_with_val = dict(
        zip((node_list[n] for n in nearest_neighbors), sim_scores.tolist())
    )
1954
1955 # Remove the self-similarity
1956 top_k_with_val.pop(source, None)
1957
1958 # Ensure we return exactly k results (sorted by similarity)
1959 if len(top_k_with_val) > k:
1960 sorted_items = sorted(top_k_with_val.items(), key=lambda x: x[1], reverse=True)
1961 top_k_with_val = dict(sorted_items[:k])
1962
1963 return top_k_with_val
1964
1965
1966@np_random_state("seed")
1967@nx._dispatchable(edge_attrs="weight")
1968def generate_random_paths(
1969 G,
1970 sample_size,
1971 path_length=5,
1972 index_map=None,
1973 weight="weight",
1974 seed=None,
1975 *,
1976 source=None,
1977):
1978 """Randomly generate `sample_size` paths of length `path_length`.
1979
1980 Parameters
1981 ----------
1982 G : NetworkX graph
1983 A NetworkX graph
1984 sample_size : integer
1985 The number of paths to generate. This is ``R`` in [1]_.
1986 path_length : integer (default = 5)
        The number of edges in each randomly generated path.
1988 This is ``T`` in [1]_. According to the paper, ``T >= 5`` is
1989 recommended.
1990 index_map : dictionary, optional
1991 If provided, this will be populated with the inverted
1992 index of nodes mapped to the set of generated random path
1993 indices within ``paths``.
1994 weight : string or None, optional (default="weight")
1995 The name of an edge attribute that holds the numerical value
1996 used as a weight. If None then each edge has weight 1.
1997 seed : integer, random_state, or None (default)
1998 Indicator of random number generation state.
1999 See :ref:`Randomness<randomness>`.
2000 source : node, optional
2001 Node to use as the starting point for all generated paths.
2002 If None then starting nodes are selected at random with uniform probability.
2003
2004 Returns
2005 -------
2006 paths : generator of lists
        Generator of `sample_size` paths, each a list of ``path_length + 1``
        nodes (a random walk traversing `path_length` edges).
2008
2009 Examples
2010 --------
    The generator yields `sample_size` random paths of length `path_length`
2012 drawn from `G`:
2013
2014 >>> G = nx.complete_graph(5)
2015 >>> next(nx.generate_random_paths(G, sample_size=1, path_length=3, seed=42))
2016 [3, 4, 2, 3]
2017 >>> list(nx.generate_random_paths(G, sample_size=3, path_length=4, seed=42))
2018 [[3, 4, 2, 3, 0], [2, 0, 2, 1, 0], [2, 0, 4, 3, 0]]
2019
    If a dictionary is passed as `index_map`, it will be populated with an
    inverted index mapping each node to the indices of the paths that
    contain it:
2022
2023 >>> G = nx.wheel_graph(10)
2024 >>> index_map = {}
2025 >>> random_paths = list(
2026 ... nx.generate_random_paths(G, sample_size=3, index_map=index_map, seed=2771)
2027 ... )
2028 >>> random_paths
2029 [[3, 2, 1, 9, 8, 7], [4, 0, 5, 6, 7, 8], [3, 0, 5, 0, 9, 8]]
2030 >>> paths_containing_node_0 = [
2031 ... random_paths[path_idx] for path_idx in index_map.get(0, [])
2032 ... ]
2033 >>> paths_containing_node_0
2034 [[4, 0, 5, 6, 7, 8], [3, 0, 5, 0, 9, 8]]
2035
2036 References
2037 ----------
2038 .. [1] Zhang, J., Tang, J., Ma, C., Tong, H., Jing, Y., & Li, J.
2039 Panther: Fast top-k similarity search on large networks.
2040 In Proceedings of the ACM SIGKDD International Conference
2041 on Knowledge Discovery and Data Mining (Vol. 2015-August, pp. 1445–1454).
2042 Association for Computing Machinery. https://doi.org/10.1145/2783258.2783267.
2043 """
2044 import numpy as np
2045
2046 randint_fn = (
2047 seed.integers if isinstance(seed, np.random.Generator) else seed.randint
2048 )
2049
2050 # Calculate transition probabilities between
2051 # every pair of vertices according to Eq. (3)
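    # i.e., the probability of stepping from v_i to v_j is the weight
    # w(v_i, v_j) divided by the weighted degree of v_i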
2052 adj_mat = nx.to_numpy_array(G, weight=weight)
2053
    # Note: rows for isolated nodes sum to zero, which would otherwise
    # trigger divide-by-zero/invalid-value warnings; such rows are only
    # used if a walk starts at an isolated node (they are unreachable
    # otherwise)
    row_sums = adj_mat.sum(axis=1).reshape(-1, 1)
    with np.errstate(divide="ignore", invalid="ignore"):
        transition_probabilities = adj_mat * np.reciprocal(row_sums)
2058
2059 node_map = list(G)
2060 num_nodes = G.number_of_nodes()
2061
    # Validate and locate ``source`` once, outside the sampling loop
    if source is not None:
        if source not in node_map:
            raise nx.NodeNotFound(f"Initial node {source} not in G")
        source_index = node_map.index(source)

    for path_index in range(sample_size):
        if source is None:
            # Sample current vertex v = v_i uniformly at random
            node_index = randint_fn(num_nodes)
            node = node_map[node_index]
        else:
            node = source
            node_index = source_index
2073
2074 # Add v into p_r and add p_r into the path set
2075 # of v, i.e., P_v
2076 path = [node]
2077
2078 # Build the inverted index (P_v) of vertices to paths
2079 if index_map is not None:
2080 if node in index_map:
2081 index_map[node].add(path_index)
2082 else:
2083 index_map[node] = {path_index}
2084
2085 starting_index = node_index
2086 for _ in range(path_length):
2087 # Randomly sample a neighbor (v_j) according
2088 # to transition probabilities from ``node`` (v) to its neighbors
2089 nbr_index = seed.choice(
2090 num_nodes, p=transition_probabilities[starting_index]
2091 )
2092
2093 # Set current vertex (v = v_j)
2094 starting_index = nbr_index
2095
2096 # Add v into p_r
2097 nbr_node = node_map[nbr_index]
2098 path.append(nbr_node)
2099
2100 # Add p_r into P_v
2101 if index_map is not None:
2102 if nbr_node in index_map:
2103 index_map[nbr_node].add(path_index)
2104 else:
2105 index_map[nbr_node] = {path_index}
2106
2107 yield path