Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/networkx/algorithms/similarity.py: 7%
339 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-20 07:00 +0000
"""Functions measuring similarity using graph edit distance.

The graph edit distance is the number of edge/node changes needed
to make two graphs isomorphic.

The default algorithm/implementation is sub-optimal for some graphs.
The problem of finding the exact Graph Edit Distance (GED) is NP-hard
so it is often slow. If the simple interface `graph_edit_distance`
takes too long for your graph, try `optimize_graph_edit_distance`
and/or `optimize_edit_paths`.

At the same time, I encourage capable people to investigate
alternative GED algorithms, in order to improve the choices available.
"""
16import math
17import time
18import warnings
19from dataclasses import dataclass
20from itertools import product
22import networkx as nx
24__all__ = [
25 "graph_edit_distance",
26 "optimal_edit_paths",
27 "optimize_graph_edit_distance",
28 "optimize_edit_paths",
29 "simrank_similarity",
30 "panther_similarity",
31 "generate_random_paths",
32]
def debug_print(*args, **kwargs):
    """Development aid: forward all arguments to the built-in ``print``."""
    emit = print
    emit(*args, **kwargs)
@nx._dispatch(
    graphs={"G1": 0, "G2": 1}, preserve_edge_attrs=True, preserve_node_attrs=True
)
def graph_edit_distance(
    G1,
    G2,
    node_match=None,
    edge_match=None,
    node_subst_cost=None,
    node_del_cost=None,
    node_ins_cost=None,
    edge_subst_cost=None,
    edge_del_cost=None,
    edge_ins_cost=None,
    roots=None,
    upper_bound=None,
    timeout=None,
):
    """Returns GED (graph edit distance) between graphs G1 and G2.

    Graph edit distance is a graph similarity measure analogous to
    Levenshtein distance for strings.  It is defined as the minimum cost
    of an edit path (sequence of node and edge edit operations)
    transforming graph G1 to a graph isomorphic to G2.

    Parameters
    ----------
    G1, G2: graphs
        The two graphs G1 and G2 must be of the same type.

    node_match : callable
        Called like ``node_match(G1.nodes[n1], G2.nodes[n2])``, i.e. with
        the node attribute dictionaries as inputs; returns True if node n1
        in G1 and n2 in G2 should be considered equal during matching.
        Ignored if node_subst_cost is specified.  If neither node_match
        nor node_subst_cost are specified then node attributes are not
        considered.

    edge_match : callable
        Called like ``edge_match(G1[u1][v1], G2[u2][v2])``, i.e. with the
        edge attribute dictionaries as inputs; returns True if the edges
        should be considered equal during matching.  Ignored if
        edge_subst_cost is specified.  If neither edge_match nor
        edge_subst_cost are specified then edge attributes are not
        considered.

    node_subst_cost, node_del_cost, node_ins_cost : callable
        Functions that return positive numeric costs of node
        substitution, deletion, and insertion, respectively; called with
        the node attribute dictionaries as inputs.  node_subst_cost
        overrides node_match if specified.  Defaults: substitution cost 0
        (node attributes not considered), deletion cost 1, insertion
        cost 1.

    edge_subst_cost, edge_del_cost, edge_ins_cost : callable
        Functions that return positive numeric costs of edge
        substitution, deletion, and insertion, respectively; called with
        the edge attribute dictionaries as inputs.  edge_subst_cost
        overrides edge_match if specified.  Defaults: substitution cost 0
        (edge attributes not considered), deletion cost 1, insertion
        cost 1.

    roots : 2-tuple
        Tuple where the first element is a node in G1 and the second is a
        node in G2.  These nodes are forced to be matched in the
        comparison to allow comparison between rooted graphs.

    upper_bound : numeric
        Maximum edit distance to consider.  Return None if no edit
        distance under or equal to upper_bound exists.

    timeout : numeric
        Maximum number of seconds to execute.
        After timeout is met, the current best GED is returned.

    Examples
    --------
    >>> G1 = nx.cycle_graph(6)
    >>> G2 = nx.wheel_graph(7)
    >>> nx.graph_edit_distance(G1, G2)
    7.0

    >>> G1 = nx.star_graph(5)
    >>> G2 = nx.star_graph(5)
    >>> nx.graph_edit_distance(G1, G2, roots=(0, 0))
    0.0
    >>> nx.graph_edit_distance(G1, G2, roots=(1, 0))
    8.0

    See Also
    --------
    optimal_edit_paths, optimize_graph_edit_distance,

    is_isomorphic: test for graph edit distance of 0

    References
    ----------
    .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
       Martineau. An Exact Graph Edit Distance Algorithm for Solving
       Pattern Recognition Problems. 4th International Conference on
       Pattern Recognition Applications and Methods 2015, Jan 2015,
       Lisbon, Portugal. 2015,
       <10.5220/0005209202710278>. <hal-01168816>
       https://hal.archives-ouvertes.fr/hal-01168816

    """
    # With strictly_decreasing=True the generator yields ever-cheaper edit
    # paths; the cost of the last one yielded is the graph edit distance.
    best_cost = None
    approximations = optimize_edit_paths(
        G1,
        G2,
        node_match,
        edge_match,
        node_subst_cost,
        node_del_cost,
        node_ins_cost,
        edge_subst_cost,
        edge_del_cost,
        edge_ins_cost,
        upper_bound,
        True,
        roots,
        timeout,
    )
    for _vertex_path, _edge_path, cost in approximations:
        best_cost = cost
    return best_cost
@nx._dispatch(graphs={"G1": 0, "G2": 1})
def optimal_edit_paths(
    G1,
    G2,
    node_match=None,
    edge_match=None,
    node_subst_cost=None,
    node_del_cost=None,
    node_ins_cost=None,
    edge_subst_cost=None,
    edge_del_cost=None,
    edge_ins_cost=None,
    upper_bound=None,
):
    """Returns all minimum-cost edit paths transforming G1 to G2.

    A graph edit path is a sequence of node and edge edit operations
    transforming graph G1 to a graph isomorphic to G2.  Edit operations
    include substitutions, deletions, and insertions.

    Parameters
    ----------
    G1, G2: graphs
        The two graphs G1 and G2 must be of the same type.

    node_match : callable
        Called like ``node_match(G1.nodes[n1], G2.nodes[n2])``, i.e. with
        the node attribute dictionaries as inputs; returns True if node n1
        in G1 and n2 in G2 should be considered equal during matching.
        Ignored if node_subst_cost is specified.  If neither node_match
        nor node_subst_cost are specified then node attributes are not
        considered.

    edge_match : callable
        Called like ``edge_match(G1[u1][v1], G2[u2][v2])``, i.e. with the
        edge attribute dictionaries as inputs; returns True if the edges
        should be considered equal during matching.  Ignored if
        edge_subst_cost is specified.  If neither edge_match nor
        edge_subst_cost are specified then edge attributes are not
        considered.

    node_subst_cost, node_del_cost, node_ins_cost : callable
        Functions that return positive numeric costs of node
        substitution, deletion, and insertion, respectively; called with
        the node attribute dictionaries as inputs.  node_subst_cost
        overrides node_match if specified.  Defaults: substitution cost 0
        (node attributes not considered), deletion cost 1, insertion
        cost 1.

    edge_subst_cost, edge_del_cost, edge_ins_cost : callable
        Functions that return positive numeric costs of edge
        substitution, deletion, and insertion, respectively; called with
        the edge attribute dictionaries as inputs.  edge_subst_cost
        overrides edge_match if specified.  Defaults: substitution cost 0
        (edge attributes not considered), deletion cost 1, insertion
        cost 1.

    upper_bound : numeric
        Maximum edit distance to consider.

    Returns
    -------
    edit_paths : list of tuples (node_edit_path, edge_edit_path)
        node_edit_path : list of tuples (u, v)
        edge_edit_path : list of tuples ((u1, v1), (u2, v2))

    cost : numeric
        Optimal edit path cost (graph edit distance).

    Examples
    --------
    >>> G1 = nx.cycle_graph(4)
    >>> G2 = nx.wheel_graph(5)
    >>> paths, cost = nx.optimal_edit_paths(G1, G2)
    >>> len(paths)
    40
    >>> cost
    5.0

    See Also
    --------
    graph_edit_distance, optimize_edit_paths

    References
    ----------
    .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
       Martineau. An Exact Graph Edit Distance Algorithm for Solving
       Pattern Recognition Problems. 4th International Conference on
       Pattern Recognition Applications and Methods 2015, Jan 2015,
       Lisbon, Portugal. 2015,
       <10.5220/0005209202710278>. <hal-01168816>
       https://hal.archives-ouvertes.fr/hal-01168816

    """
    # With strictly_decreasing=False the generator yields every edit path
    # whose cost is <= the best cost seen so far, so paths of equal
    # minimum cost accumulate; a strictly cheaper path resets the list.
    best_paths = []
    best_cost = None
    for vertex_path, edge_path, cost in optimize_edit_paths(
        G1,
        G2,
        node_match,
        edge_match,
        node_subst_cost,
        node_del_cost,
        node_ins_cost,
        edge_subst_cost,
        edge_del_cost,
        edge_ins_cost,
        upper_bound,
        False,
    ):
        if best_cost is not None and cost < best_cost:
            best_paths = []
        best_paths.append((vertex_path, edge_path))
        best_cost = cost
    return best_paths, best_cost
@nx._dispatch(graphs={"G1": 0, "G2": 1})
def optimize_graph_edit_distance(
    G1,
    G2,
    node_match=None,
    edge_match=None,
    node_subst_cost=None,
    node_del_cost=None,
    node_ins_cost=None,
    edge_subst_cost=None,
    edge_del_cost=None,
    edge_ins_cost=None,
    upper_bound=None,
):
    """Returns consecutive approximations of GED (graph edit distance)
    between graphs G1 and G2.

    Graph edit distance is a graph similarity measure analogous to
    Levenshtein distance for strings.  It is defined as the minimum cost
    of an edit path (sequence of node and edge edit operations)
    transforming graph G1 to a graph isomorphic to G2.

    Parameters
    ----------
    G1, G2: graphs
        The two graphs G1 and G2 must be of the same type.

    node_match : callable
        Called like ``node_match(G1.nodes[n1], G2.nodes[n2])``, i.e. with
        the node attribute dictionaries as inputs; returns True if node n1
        in G1 and n2 in G2 should be considered equal during matching.
        Ignored if node_subst_cost is specified.  If neither node_match
        nor node_subst_cost are specified then node attributes are not
        considered.

    edge_match : callable
        Called like ``edge_match(G1[u1][v1], G2[u2][v2])``, i.e. with the
        edge attribute dictionaries as inputs; returns True if the edges
        should be considered equal during matching.  Ignored if
        edge_subst_cost is specified.  If neither edge_match nor
        edge_subst_cost are specified then edge attributes are not
        considered.

    node_subst_cost, node_del_cost, node_ins_cost : callable
        Functions that return positive numeric costs of node
        substitution, deletion, and insertion, respectively; called with
        the node attribute dictionaries as inputs.  node_subst_cost
        overrides node_match if specified.  Defaults: substitution cost 0
        (node attributes not considered), deletion cost 1, insertion
        cost 1.

    edge_subst_cost, edge_del_cost, edge_ins_cost : callable
        Functions that return positive numeric costs of edge
        substitution, deletion, and insertion, respectively; called with
        the edge attribute dictionaries as inputs.  edge_subst_cost
        overrides edge_match if specified.  Defaults: substitution cost 0
        (edge attributes not considered), deletion cost 1, insertion
        cost 1.

    upper_bound : numeric
        Maximum edit distance to consider.

    Returns
    -------
    Generator of consecutive approximations of graph edit distance.

    Examples
    --------
    >>> G1 = nx.cycle_graph(6)
    >>> G2 = nx.wheel_graph(7)
    >>> for v in nx.optimize_graph_edit_distance(G1, G2):
    ...     minv = v
    >>> minv
    7.0

    See Also
    --------
    graph_edit_distance, optimize_edit_paths

    References
    ----------
    .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
       Martineau. An Exact Graph Edit Distance Algorithm for Solving
       Pattern Recognition Problems. 4th International Conference on
       Pattern Recognition Applications and Methods 2015, Jan 2015,
       Lisbon, Portugal. 2015,
       <10.5220/0005209202710278>. <hal-01168816>
       https://hal.archives-ouvertes.fr/hal-01168816
    """
    # Each yielded cost is strictly lower than the previous one; the last
    # value produced is the exact graph edit distance.
    approximations = optimize_edit_paths(
        G1,
        G2,
        node_match,
        edge_match,
        node_subst_cost,
        node_del_cost,
        node_ins_cost,
        edge_subst_cost,
        edge_del_cost,
        edge_ins_cost,
        upper_bound,
        True,
    )
    for _vertex_path, _edge_path, cost in approximations:
        yield cost
527@nx._dispatch(
528 graphs={"G1": 0, "G2": 1}, preserve_edge_attrs=True, preserve_node_attrs=True
529)
530def optimize_edit_paths(
531 G1,
532 G2,
533 node_match=None,
534 edge_match=None,
535 node_subst_cost=None,
536 node_del_cost=None,
537 node_ins_cost=None,
538 edge_subst_cost=None,
539 edge_del_cost=None,
540 edge_ins_cost=None,
541 upper_bound=None,
542 strictly_decreasing=True,
543 roots=None,
544 timeout=None,
545):
546 """GED (graph edit distance) calculation: advanced interface.
548 Graph edit path is a sequence of node and edge edit operations
549 transforming graph G1 to graph isomorphic to G2. Edit operations
550 include substitutions, deletions, and insertions.
552 Graph edit distance is defined as minimum cost of edit path.
554 Parameters
555 ----------
556 G1, G2: graphs
557 The two graphs G1 and G2 must be of the same type.
559 node_match : callable
560 A function that returns True if node n1 in G1 and n2 in G2
561 should be considered equal during matching.
563 The function will be called like
565 node_match(G1.nodes[n1], G2.nodes[n2]).
567 That is, the function will receive the node attribute
568 dictionaries for n1 and n2 as inputs.
570 Ignored if node_subst_cost is specified. If neither
571 node_match nor node_subst_cost are specified then node
572 attributes are not considered.
574 edge_match : callable
575 A function that returns True if the edge attribute dictionaries
576 for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
577 be considered equal during matching.
579 The function will be called like
581 edge_match(G1[u1][v1], G2[u2][v2]).
583 That is, the function will receive the edge attribute
584 dictionaries of the edges under consideration.
586 Ignored if edge_subst_cost is specified. If neither
587 edge_match nor edge_subst_cost are specified then edge
588 attributes are not considered.
590 node_subst_cost, node_del_cost, node_ins_cost : callable
591 Functions that return the costs of node substitution, node
592 deletion, and node insertion, respectively.
594 The functions will be called like
596 node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
597 node_del_cost(G1.nodes[n1]),
598 node_ins_cost(G2.nodes[n2]).
600 That is, the functions will receive the node attribute
601 dictionaries as inputs. The functions are expected to return
602 positive numeric values.
604 Function node_subst_cost overrides node_match if specified.
605 If neither node_match nor node_subst_cost are specified then
606 default node substitution cost of 0 is used (node attributes
607 are not considered during matching).
609 If node_del_cost is not specified then default node deletion
610 cost of 1 is used. If node_ins_cost is not specified then
611 default node insertion cost of 1 is used.
613 edge_subst_cost, edge_del_cost, edge_ins_cost : callable
614 Functions that return the costs of edge substitution, edge
615 deletion, and edge insertion, respectively.
617 The functions will be called like
619 edge_subst_cost(G1[u1][v1], G2[u2][v2]),
620 edge_del_cost(G1[u1][v1]),
621 edge_ins_cost(G2[u2][v2]).
623 That is, the functions will receive the edge attribute
624 dictionaries as inputs. The functions are expected to return
625 positive numeric values.
627 Function edge_subst_cost overrides edge_match if specified.
628 If neither edge_match nor edge_subst_cost are specified then
629 default edge substitution cost of 0 is used (edge attributes
630 are not considered during matching).
632 If edge_del_cost is not specified then default edge deletion
633 cost of 1 is used. If edge_ins_cost is not specified then
634 default edge insertion cost of 1 is used.
636 upper_bound : numeric
637 Maximum edit distance to consider.
639 strictly_decreasing : bool
640 If True, return consecutive approximations of strictly
641 decreasing cost. Otherwise, return all edit paths of cost
642 less than or equal to the previous minimum cost.
644 roots : 2-tuple
645 Tuple where first element is a node in G1 and the second
646 is a node in G2.
647 These nodes are forced to be matched in the comparison to
648 allow comparison between rooted graphs.
650 timeout : numeric
651 Maximum number of seconds to execute.
652 After timeout is met, the current best GED is returned.
654 Returns
655 -------
656 Generator of tuples (node_edit_path, edge_edit_path, cost)
657 node_edit_path : list of tuples (u, v)
658 edge_edit_path : list of tuples ((u1, v1), (u2, v2))
659 cost : numeric
661 See Also
662 --------
663 graph_edit_distance, optimize_graph_edit_distance, optimal_edit_paths
665 References
666 ----------
667 .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
668 Martineau. An Exact Graph Edit Distance Algorithm for Solving
669 Pattern Recognition Problems. 4th International Conference on
670 Pattern Recognition Applications and Methods 2015, Jan 2015,
671 Lisbon, Portugal. 2015,
672 <10.5220/0005209202710278>. <hal-01168816>
673 https://hal.archives-ouvertes.fr/hal-01168816
675 """
676 # TODO: support DiGraph
678 import numpy as np
679 import scipy as sp
681 @dataclass
682 class CostMatrix:
683 C: ...
684 lsa_row_ind: ...
685 lsa_col_ind: ...
686 ls: ...
688 def make_CostMatrix(C, m, n):
689 # assert(C.shape == (m + n, m + n))
690 lsa_row_ind, lsa_col_ind = sp.optimize.linear_sum_assignment(C)
692 # Fixup dummy assignments:
693 # each substitution i<->j should have dummy assignment m+j<->n+i
694 # NOTE: fast reduce of Cv relies on it
695 # assert len(lsa_row_ind) == len(lsa_col_ind)
696 indexes = zip(range(len(lsa_row_ind)), lsa_row_ind, lsa_col_ind)
697 subst_ind = [k for k, i, j in indexes if i < m and j < n]
698 indexes = zip(range(len(lsa_row_ind)), lsa_row_ind, lsa_col_ind)
699 dummy_ind = [k for k, i, j in indexes if i >= m and j >= n]
700 # assert len(subst_ind) == len(dummy_ind)
701 lsa_row_ind[dummy_ind] = lsa_col_ind[subst_ind] + m
702 lsa_col_ind[dummy_ind] = lsa_row_ind[subst_ind] + n
704 return CostMatrix(
705 C, lsa_row_ind, lsa_col_ind, C[lsa_row_ind, lsa_col_ind].sum()
706 )
708 def extract_C(C, i, j, m, n):
709 # assert(C.shape == (m + n, m + n))
710 row_ind = [k in i or k - m in j for k in range(m + n)]
711 col_ind = [k in j or k - n in i for k in range(m + n)]
712 return C[row_ind, :][:, col_ind]
714 def reduce_C(C, i, j, m, n):
715 # assert(C.shape == (m + n, m + n))
716 row_ind = [k not in i and k - m not in j for k in range(m + n)]
717 col_ind = [k not in j and k - n not in i for k in range(m + n)]
718 return C[row_ind, :][:, col_ind]
720 def reduce_ind(ind, i):
721 # assert set(ind) == set(range(len(ind)))
722 rind = ind[[k not in i for k in ind]]
723 for k in set(i):
724 rind[rind >= k] -= 1
725 return rind
727 def match_edges(u, v, pending_g, pending_h, Ce, matched_uv=None):
728 """
729 Parameters:
730 u, v: matched vertices, u=None or v=None for
731 deletion/insertion
732 pending_g, pending_h: lists of edges not yet mapped
733 Ce: CostMatrix of pending edge mappings
734 matched_uv: partial vertex edit path
735 list of tuples (u, v) of previously matched vertex
736 mappings u<->v, u=None or v=None for
737 deletion/insertion
739 Returns:
740 list of (i, j): indices of edge mappings g<->h
741 localCe: local CostMatrix of edge mappings
742 (basically submatrix of Ce at cross of rows i, cols j)
743 """
744 M = len(pending_g)
745 N = len(pending_h)
746 # assert Ce.C.shape == (M + N, M + N)
748 # only attempt to match edges after one node match has been made
749 # this will stop self-edges on the first node being automatically deleted
750 # even when a substitution is the better option
751 if matched_uv is None or len(matched_uv) == 0:
752 g_ind = []
753 h_ind = []
754 else:
755 g_ind = [
756 i
757 for i in range(M)
758 if pending_g[i][:2] == (u, u)
759 or any(
760 pending_g[i][:2] in ((p, u), (u, p), (p, p)) for p, q in matched_uv
761 )
762 ]
763 h_ind = [
764 j
765 for j in range(N)
766 if pending_h[j][:2] == (v, v)
767 or any(
768 pending_h[j][:2] in ((q, v), (v, q), (q, q)) for p, q in matched_uv
769 )
770 ]
772 m = len(g_ind)
773 n = len(h_ind)
775 if m or n:
776 C = extract_C(Ce.C, g_ind, h_ind, M, N)
777 # assert C.shape == (m + n, m + n)
779 # Forbid structurally invalid matches
780 # NOTE: inf remembered from Ce construction
781 for k, i in enumerate(g_ind):
782 g = pending_g[i][:2]
783 for l, j in enumerate(h_ind):
784 h = pending_h[j][:2]
785 if nx.is_directed(G1) or nx.is_directed(G2):
786 if any(
787 g == (p, u) and h == (q, v) or g == (u, p) and h == (v, q)
788 for p, q in matched_uv
789 ):
790 continue
791 else:
792 if any(
793 g in ((p, u), (u, p)) and h in ((q, v), (v, q))
794 for p, q in matched_uv
795 ):
796 continue
797 if g == (u, u) or any(g == (p, p) for p, q in matched_uv):
798 continue
799 if h == (v, v) or any(h == (q, q) for p, q in matched_uv):
800 continue
801 C[k, l] = inf
803 localCe = make_CostMatrix(C, m, n)
804 ij = [
805 (
806 g_ind[k] if k < m else M + h_ind[l],
807 h_ind[l] if l < n else N + g_ind[k],
808 )
809 for k, l in zip(localCe.lsa_row_ind, localCe.lsa_col_ind)
810 if k < m or l < n
811 ]
813 else:
814 ij = []
815 localCe = CostMatrix(np.empty((0, 0)), [], [], 0)
817 return ij, localCe
819 def reduce_Ce(Ce, ij, m, n):
820 if len(ij):
821 i, j = zip(*ij)
822 m_i = m - sum(1 for t in i if t < m)
823 n_j = n - sum(1 for t in j if t < n)
824 return make_CostMatrix(reduce_C(Ce.C, i, j, m, n), m_i, n_j)
825 return Ce
827 def get_edit_ops(
828 matched_uv, pending_u, pending_v, Cv, pending_g, pending_h, Ce, matched_cost
829 ):
830 """
831 Parameters:
832 matched_uv: partial vertex edit path
833 list of tuples (u, v) of vertex mappings u<->v,
834 u=None or v=None for deletion/insertion
835 pending_u, pending_v: lists of vertices not yet mapped
836 Cv: CostMatrix of pending vertex mappings
837 pending_g, pending_h: lists of edges not yet mapped
838 Ce: CostMatrix of pending edge mappings
839 matched_cost: cost of partial edit path
841 Returns:
842 sequence of
843 (i, j): indices of vertex mapping u<->v
844 Cv_ij: reduced CostMatrix of pending vertex mappings
845 (basically Cv with row i, col j removed)
846 list of (x, y): indices of edge mappings g<->h
847 Ce_xy: reduced CostMatrix of pending edge mappings
848 (basically Ce with rows x, cols y removed)
849 cost: total cost of edit operation
850 NOTE: most promising ops first
851 """
852 m = len(pending_u)
853 n = len(pending_v)
854 # assert Cv.C.shape == (m + n, m + n)
856 # 1) a vertex mapping from optimal linear sum assignment
857 i, j = min(
858 (k, l) for k, l in zip(Cv.lsa_row_ind, Cv.lsa_col_ind) if k < m or l < n
859 )
860 xy, localCe = match_edges(
861 pending_u[i] if i < m else None,
862 pending_v[j] if j < n else None,
863 pending_g,
864 pending_h,
865 Ce,
866 matched_uv,
867 )
868 Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h))
869 # assert Ce.ls <= localCe.ls + Ce_xy.ls
870 if prune(matched_cost + Cv.ls + localCe.ls + Ce_xy.ls):
871 pass
872 else:
873 # get reduced Cv efficiently
874 Cv_ij = CostMatrix(
875 reduce_C(Cv.C, (i,), (j,), m, n),
876 reduce_ind(Cv.lsa_row_ind, (i, m + j)),
877 reduce_ind(Cv.lsa_col_ind, (j, n + i)),
878 Cv.ls - Cv.C[i, j],
879 )
880 yield (i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls
882 # 2) other candidates, sorted by lower-bound cost estimate
883 other = []
884 fixed_i, fixed_j = i, j
885 if m <= n:
886 candidates = (
887 (t, fixed_j)
888 for t in range(m + n)
889 if t != fixed_i and (t < m or t == m + fixed_j)
890 )
891 else:
892 candidates = (
893 (fixed_i, t)
894 for t in range(m + n)
895 if t != fixed_j and (t < n or t == n + fixed_i)
896 )
897 for i, j in candidates:
898 if prune(matched_cost + Cv.C[i, j] + Ce.ls):
899 continue
900 Cv_ij = make_CostMatrix(
901 reduce_C(Cv.C, (i,), (j,), m, n),
902 m - 1 if i < m else m,
903 n - 1 if j < n else n,
904 )
905 # assert Cv.ls <= Cv.C[i, j] + Cv_ij.ls
906 if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + Ce.ls):
907 continue
908 xy, localCe = match_edges(
909 pending_u[i] if i < m else None,
910 pending_v[j] if j < n else None,
911 pending_g,
912 pending_h,
913 Ce,
914 matched_uv,
915 )
916 if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + localCe.ls):
917 continue
918 Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h))
919 # assert Ce.ls <= localCe.ls + Ce_xy.ls
920 if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + localCe.ls + Ce_xy.ls):
921 continue
922 other.append(((i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls))
924 yield from sorted(other, key=lambda t: t[4] + t[1].ls + t[3].ls)
926 def get_edit_paths(
927 matched_uv,
928 pending_u,
929 pending_v,
930 Cv,
931 matched_gh,
932 pending_g,
933 pending_h,
934 Ce,
935 matched_cost,
936 ):
937 """
938 Parameters:
939 matched_uv: partial vertex edit path
940 list of tuples (u, v) of vertex mappings u<->v,
941 u=None or v=None for deletion/insertion
942 pending_u, pending_v: lists of vertices not yet mapped
943 Cv: CostMatrix of pending vertex mappings
944 matched_gh: partial edge edit path
945 list of tuples (g, h) of edge mappings g<->h,
946 g=None or h=None for deletion/insertion
947 pending_g, pending_h: lists of edges not yet mapped
948 Ce: CostMatrix of pending edge mappings
949 matched_cost: cost of partial edit path
951 Returns:
952 sequence of (vertex_path, edge_path, cost)
953 vertex_path: complete vertex edit path
954 list of tuples (u, v) of vertex mappings u<->v,
955 u=None or v=None for deletion/insertion
956 edge_path: complete edge edit path
957 list of tuples (g, h) of edge mappings g<->h,
958 g=None or h=None for deletion/insertion
959 cost: total cost of edit path
960 NOTE: path costs are non-increasing
961 """
962 # debug_print('matched-uv:', matched_uv)
963 # debug_print('matched-gh:', matched_gh)
964 # debug_print('matched-cost:', matched_cost)
965 # debug_print('pending-u:', pending_u)
966 # debug_print('pending-v:', pending_v)
967 # debug_print(Cv.C)
968 # assert list(sorted(G1.nodes)) == list(sorted(list(u for u, v in matched_uv if u is not None) + pending_u))
969 # assert list(sorted(G2.nodes)) == list(sorted(list(v for u, v in matched_uv if v is not None) + pending_v))
970 # debug_print('pending-g:', pending_g)
971 # debug_print('pending-h:', pending_h)
972 # debug_print(Ce.C)
973 # assert list(sorted(G1.edges)) == list(sorted(list(g for g, h in matched_gh if g is not None) + pending_g))
974 # assert list(sorted(G2.edges)) == list(sorted(list(h for g, h in matched_gh if h is not None) + pending_h))
975 # debug_print()
977 if prune(matched_cost + Cv.ls + Ce.ls):
978 return
980 if not max(len(pending_u), len(pending_v)):
981 # assert not len(pending_g)
982 # assert not len(pending_h)
983 # path completed!
984 # assert matched_cost <= maxcost_value
985 nonlocal maxcost_value
986 maxcost_value = min(maxcost_value, matched_cost)
987 yield matched_uv, matched_gh, matched_cost
989 else:
990 edit_ops = get_edit_ops(
991 matched_uv,
992 pending_u,
993 pending_v,
994 Cv,
995 pending_g,
996 pending_h,
997 Ce,
998 matched_cost,
999 )
1000 for ij, Cv_ij, xy, Ce_xy, edit_cost in edit_ops:
1001 i, j = ij
1002 # assert Cv.C[i, j] + sum(Ce.C[t] for t in xy) == edit_cost
1003 if prune(matched_cost + edit_cost + Cv_ij.ls + Ce_xy.ls):
1004 continue
1006 # dive deeper
1007 u = pending_u.pop(i) if i < len(pending_u) else None
1008 v = pending_v.pop(j) if j < len(pending_v) else None
1009 matched_uv.append((u, v))
1010 for x, y in xy:
1011 len_g = len(pending_g)
1012 len_h = len(pending_h)
1013 matched_gh.append(
1014 (
1015 pending_g[x] if x < len_g else None,
1016 pending_h[y] if y < len_h else None,
1017 )
1018 )
1019 sortedx = sorted(x for x, y in xy)
1020 sortedy = sorted(y for x, y in xy)
1021 G = [
1022 (pending_g.pop(x) if x < len(pending_g) else None)
1023 for x in reversed(sortedx)
1024 ]
1025 H = [
1026 (pending_h.pop(y) if y < len(pending_h) else None)
1027 for y in reversed(sortedy)
1028 ]
1030 yield from get_edit_paths(
1031 matched_uv,
1032 pending_u,
1033 pending_v,
1034 Cv_ij,
1035 matched_gh,
1036 pending_g,
1037 pending_h,
1038 Ce_xy,
1039 matched_cost + edit_cost,
1040 )
1042 # backtrack
1043 if u is not None:
1044 pending_u.insert(i, u)
1045 if v is not None:
1046 pending_v.insert(j, v)
1047 matched_uv.pop()
1048 for x, g in zip(sortedx, reversed(G)):
1049 if g is not None:
1050 pending_g.insert(x, g)
1051 for y, h in zip(sortedy, reversed(H)):
1052 if h is not None:
1053 pending_h.insert(y, h)
1054 for _ in xy:
1055 matched_gh.pop()
1057 # Initialization
1059 pending_u = list(G1.nodes)
1060 pending_v = list(G2.nodes)
1062 initial_cost = 0
1063 if roots:
1064 root_u, root_v = roots
1065 if root_u not in pending_u or root_v not in pending_v:
1066 raise nx.NodeNotFound("Root node not in graph.")
1068 # remove roots from pending
1069 pending_u.remove(root_u)
1070 pending_v.remove(root_v)
1072 # cost matrix of vertex mappings
1073 m = len(pending_u)
1074 n = len(pending_v)
1075 C = np.zeros((m + n, m + n))
1076 if node_subst_cost:
1077 C[0:m, 0:n] = np.array(
1078 [
1079 node_subst_cost(G1.nodes[u], G2.nodes[v])
1080 for u in pending_u
1081 for v in pending_v
1082 ]
1083 ).reshape(m, n)
1084 if roots:
1085 initial_cost = node_subst_cost(G1.nodes[root_u], G2.nodes[root_v])
1086 elif node_match:
1087 C[0:m, 0:n] = np.array(
1088 [
1089 1 - int(node_match(G1.nodes[u], G2.nodes[v]))
1090 for u in pending_u
1091 for v in pending_v
1092 ]
1093 ).reshape(m, n)
1094 if roots:
1095 initial_cost = 1 - node_match(G1.nodes[root_u], G2.nodes[root_v])
1096 else:
1097 # all zeroes
1098 pass
1099 # assert not min(m, n) or C[0:m, 0:n].min() >= 0
1100 if node_del_cost:
1101 del_costs = [node_del_cost(G1.nodes[u]) for u in pending_u]
1102 else:
1103 del_costs = [1] * len(pending_u)
1104 # assert not m or min(del_costs) >= 0
1105 if node_ins_cost:
1106 ins_costs = [node_ins_cost(G2.nodes[v]) for v in pending_v]
1107 else:
1108 ins_costs = [1] * len(pending_v)
1109 # assert not n or min(ins_costs) >= 0
1110 inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1
1111 C[0:m, n : n + m] = np.array(
1112 [del_costs[i] if i == j else inf for i in range(m) for j in range(m)]
1113 ).reshape(m, m)
1114 C[m : m + n, 0:n] = np.array(
1115 [ins_costs[i] if i == j else inf for i in range(n) for j in range(n)]
1116 ).reshape(n, n)
1117 Cv = make_CostMatrix(C, m, n)
1118 # debug_print(f"Cv: {m} x {n}")
1119 # debug_print(Cv.C)
1121 pending_g = list(G1.edges)
1122 pending_h = list(G2.edges)
1124 # cost matrix of edge mappings
1125 m = len(pending_g)
1126 n = len(pending_h)
1127 C = np.zeros((m + n, m + n))
1128 if edge_subst_cost:
1129 C[0:m, 0:n] = np.array(
1130 [
1131 edge_subst_cost(G1.edges[g], G2.edges[h])
1132 for g in pending_g
1133 for h in pending_h
1134 ]
1135 ).reshape(m, n)
1136 elif edge_match:
1137 C[0:m, 0:n] = np.array(
1138 [
1139 1 - int(edge_match(G1.edges[g], G2.edges[h]))
1140 for g in pending_g
1141 for h in pending_h
1142 ]
1143 ).reshape(m, n)
1144 else:
1145 # all zeroes
1146 pass
1147 # assert not min(m, n) or C[0:m, 0:n].min() >= 0
1148 if edge_del_cost:
1149 del_costs = [edge_del_cost(G1.edges[g]) for g in pending_g]
1150 else:
1151 del_costs = [1] * len(pending_g)
1152 # assert not m or min(del_costs) >= 0
1153 if edge_ins_cost:
1154 ins_costs = [edge_ins_cost(G2.edges[h]) for h in pending_h]
1155 else:
1156 ins_costs = [1] * len(pending_h)
1157 # assert not n or min(ins_costs) >= 0
1158 inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1
1159 C[0:m, n : n + m] = np.array(
1160 [del_costs[i] if i == j else inf for i in range(m) for j in range(m)]
1161 ).reshape(m, m)
1162 C[m : m + n, 0:n] = np.array(
1163 [ins_costs[i] if i == j else inf for i in range(n) for j in range(n)]
1164 ).reshape(n, n)
1165 Ce = make_CostMatrix(C, m, n)
1166 # debug_print(f'Ce: {m} x {n}')
1167 # debug_print(Ce.C)
1168 # debug_print()
1170 maxcost_value = Cv.C.sum() + Ce.C.sum() + 1
1172 if timeout is not None:
1173 if timeout <= 0:
1174 raise nx.NetworkXError("Timeout value must be greater than 0")
1175 start = time.perf_counter()
1177 def prune(cost):
1178 if timeout is not None:
1179 if time.perf_counter() - start > timeout:
1180 return True
1181 if upper_bound is not None:
1182 if cost > upper_bound:
1183 return True
1184 if cost > maxcost_value:
1185 return True
1186 if strictly_decreasing and cost >= maxcost_value:
1187 return True
1188 return False
1190 # Now go!
1192 done_uv = [] if roots is None else [roots]
1194 for vertex_path, edge_path, cost in get_edit_paths(
1195 done_uv, pending_u, pending_v, Cv, [], pending_g, pending_h, Ce, initial_cost
1196 ):
1197 # assert sorted(G1.nodes) == sorted(u for u, v in vertex_path if u is not None)
1198 # assert sorted(G2.nodes) == sorted(v for u, v in vertex_path if v is not None)
1199 # assert sorted(G1.edges) == sorted(g for g, h in edge_path if g is not None)
1200 # assert sorted(G2.edges) == sorted(h for g, h in edge_path if h is not None)
1201 # print(vertex_path, edge_path, cost, file = sys.stderr)
1202 # assert cost == maxcost_value
1203 yield list(vertex_path), list(edge_path), cost
@nx._dispatch
def simrank_similarity(
    G,
    source=None,
    target=None,
    importance_factor=0.9,
    max_iterations=1000,
    tolerance=1e-4,
):
    """Returns the SimRank similarity of nodes in the graph ``G``.

    SimRank is a similarity metric that says "two objects are considered
    to be similar if they are referenced by similar objects." [1]_.

    The pseudo-code definition from the paper is::

        def simrank(G, u, v):
            in_neighbors_u = G.predecessors(u)
            in_neighbors_v = G.predecessors(v)
            scale = C / (len(in_neighbors_u) * len(in_neighbors_v))
            return scale * sum(
                simrank(G, w, x) for w, x in product(in_neighbors_u, in_neighbors_v)
            )

    where ``G`` is the graph, ``u`` is the source, ``v`` is the target,
    and ``C`` is a float decay or importance factor between 0 and 1.

    The SimRank algorithm for determining node similarity is defined in
    [2]_.

    Parameters
    ----------
    G : NetworkX graph
        A NetworkX graph

    source : node
        If this is specified, the returned dictionary maps each node
        ``v`` in the graph to the similarity between ``source`` and
        ``v``.

    target : node
        If both ``source`` and ``target`` are specified, the similarity
        value between ``source`` and ``target`` is returned. If
        ``target`` is specified but ``source`` is not, this argument is
        ignored.

    importance_factor : float
        The relative importance of indirect neighbors with respect to
        direct neighbors.

    max_iterations : integer
        Maximum number of iterations.

    tolerance : float
        Error tolerance used to check convergence. When an iteration of
        the algorithm finds that no similarity value changes more than
        this amount, the algorithm halts.

    Returns
    -------
    similarity : dictionary or float
        If ``source`` and ``target`` are both ``None``, this returns a
        dictionary of dictionaries, where keys are node pairs and value
        are similarity of the pair of nodes.

        If ``source`` is not ``None`` but ``target`` is, this returns a
        dictionary mapping node to the similarity of ``source`` and that
        node.

        If neither ``source`` nor ``target`` is ``None``, this returns
        the similarity value for the given pair of nodes.

    Examples
    --------
    >>> G = nx.cycle_graph(2)
    >>> nx.simrank_similarity(G)
    {0: {0: 1.0, 1: 0.0}, 1: {0: 0.0, 1: 1.0}}
    >>> nx.simrank_similarity(G, source=0)
    {0: 1.0, 1: 0.0}
    >>> nx.simrank_similarity(G, source=0, target=0)
    1.0

    The result of this function can be converted to a numpy array
    representing the SimRank matrix by using the node order of the
    graph to determine which row and column represent each node.
    Other ordering of nodes is also possible.

    >>> import numpy as np
    >>> sim = nx.simrank_similarity(G)
    >>> np.array([[sim[u][v] for v in G] for u in G])
    array([[1., 0.],
           [0., 1.]])
    >>> sim_1d = nx.simrank_similarity(G, source=0)
    >>> np.array([sim_1d[v] for v in G])
    array([1., 0.])

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/SimRank
    .. [2] G. Jeh and J. Widom.
           "SimRank: a measure of structural-context similarity",
           In KDD'02: Proceedings of the Eighth ACM SIGKDD
           International Conference on Knowledge Discovery and Data Mining,
           pp. 538--543. ACM Press, 2002.
    """
    import numpy as np

    # Translate the (possibly non-integer) node labels into positional
    # indices expected by the numpy backend.
    nodes = list(G)
    src_idx = None if source is None else nodes.index(source)
    tgt_idx = None if target is None else nodes.index(target)

    result = _simrank_similarity_numpy(
        G, src_idx, tgt_idx, importance_factor, max_iterations, tolerance
    )

    # A scalar result (both source and target given) is returned as-is;
    # array results are re-keyed by the graph's nodes.
    if not isinstance(result, np.ndarray):
        return result
    if result.ndim == 1:
        return dict(zip(G, result))
    return {u: dict(zip(G, row)) for u, row in zip(G, result)}
def _simrank_similarity_python(
    G,
    source=None,
    target=None,
    importance_factor=0.9,
    max_iterations=1000,
    tolerance=1e-4,
):
    """Returns the SimRank similarity of nodes in the graph ``G``.

    This pure Python version is provided for pedagogical purposes.

    Examples
    --------
    >>> G = nx.cycle_graph(2)
    >>> nx.similarity._simrank_similarity_python(G)
    {0: {0: 1, 1: 0.0}, 1: {0: 0.0, 1: 1}}
    >>> nx.similarity._simrank_similarity_python(G, source=0)
    {0: 1, 1: 0.0}
    >>> nx.similarity._simrank_similarity_python(G, source=0, target=0)
    1
    """
    # build up our similarity adjacency dictionary output
    newsim = {u: {v: 1 if u == v else 0 for v in G} for u in G}

    # These functions compute the update to the similarity value of the nodes
    # `u` and `v` with respect to the previous similarity values.
    def avg_sim(s):
        return sum(newsim[w][x] for (w, x) in s) / len(s) if s else 0.0

    # SimRank is defined over in-neighbors; for undirected graphs the
    # adjacency mapping plays that role.
    Gadj = G.pred if G.is_directed() else G.adj

    def sim(u, v):
        return importance_factor * avg_sim(list(product(Gadj[u], Gadj[v])))

    for _ in range(max_iterations):
        oldsim = newsim
        # ``u is not v`` is safe: both comprehensions iterate the same node
        # objects of ``G``, so equal nodes are identical objects.
        newsim = {u: {v: sim(u, v) if u is not v else 1 for v in G} for u in G}
        is_close = all(
            all(
                abs(newsim[u][v] - old) <= tolerance * (1 + abs(old))
                for v, old in nbrs.items()
            )
            for u, nbrs in oldsim.items()
        )
        if is_close:
            break
    else:
        # Bug fix: the previous ``if its + 1 == max_iterations`` check raised
        # even when the final allowed iteration actually converged (``break``
        # taken on the last pass), and crashed with a NameError when
        # ``max_iterations <= 0``.  The ``for``/``else`` clause runs only when
        # the loop exhausts without breaking, i.e. without convergence.
        raise nx.ExceededMaxIterations(
            f"simrank did not converge after {max_iterations} iterations."
        )

    if source is not None and target is not None:
        return newsim[source][target]
    if source is not None:
        return newsim[source]
    return newsim
def _simrank_similarity_numpy(
    G,
    source=None,
    target=None,
    importance_factor=0.9,
    max_iterations=1000,
    tolerance=1e-4,
):
    """Calculate SimRank of nodes in ``G`` using matrices with ``numpy``.

    The SimRank algorithm for determining node similarity is defined in
    [1]_.

    Parameters
    ----------
    G : NetworkX graph
        A NetworkX graph

    source : node
        If this is specified, the returned dictionary maps each node
        ``v`` in the graph to the similarity between ``source`` and
        ``v``.

    target : node
        If both ``source`` and ``target`` are specified, the similarity
        value between ``source`` and ``target`` is returned. If
        ``target`` is specified but ``source`` is not, this argument is
        ignored.

    importance_factor : float
        The relative importance of indirect neighbors with respect to
        direct neighbors.

    max_iterations : integer
        Maximum number of iterations.

    tolerance : float
        Error tolerance used to check convergence. When an iteration of
        the algorithm finds that no similarity value changes more than
        this amount, the algorithm halts.

    Returns
    -------
    similarity : numpy array or float
        If ``source`` and ``target`` are both ``None``, this returns a
        2D array containing SimRank scores of the nodes.

        If ``source`` is not ``None`` but ``target`` is, this returns an
        1D array containing SimRank scores of ``source`` and that
        node.

        If neither ``source`` nor ``target`` is ``None``, this returns
        the similarity value for the given pair of nodes.

    Examples
    --------
    >>> G = nx.cycle_graph(2)
    >>> nx.similarity._simrank_similarity_numpy(G)
    array([[1., 0.],
           [0., 1.]])
    >>> nx.similarity._simrank_similarity_numpy(G, source=0)
    array([1., 0.])
    >>> nx.similarity._simrank_similarity_numpy(G, source=0, target=0)
    1.0

    References
    ----------
    .. [1] G. Jeh and J. Widom.
           "SimRank: a measure of structural-context similarity",
           In KDD'02: Proceedings of the Eighth ACM SIGKDD
           International Conference on Knowledge Discovery and Data Mining,
           pp. 538--543. ACM Press, 2002.
    """
    # This algorithm follows roughly
    #
    #   S = max{C * (A.T * S * A), I}
    #
    # where C is the importance factor, A is the column normalized
    # adjacency matrix, and I is the identity matrix.
    import numpy as np

    adjacency_matrix = nx.to_numpy_array(G)

    # column-normalize the ``adjacency_matrix``
    s = np.array(adjacency_matrix.sum(axis=0))
    s[s == 0] = 1  # avoid division by zero for columns of isolated nodes
    adjacency_matrix /= s  # adjacency_matrix.sum(axis=0)

    newsim = np.eye(len(G), dtype=np.float64)
    for _ in range(max_iterations):
        prevsim = newsim.copy()
        newsim = importance_factor * ((adjacency_matrix.T @ prevsim) @ adjacency_matrix)
        # Self-similarity is 1 by definition.
        np.fill_diagonal(newsim, 1.0)

        if np.allclose(prevsim, newsim, atol=tolerance):
            break
    else:
        # Bug fix: the previous ``if its + 1 == max_iterations`` check raised
        # even when the final allowed iteration converged, and crashed with a
        # NameError when ``max_iterations <= 0``.  The ``for``/``else`` clause
        # runs only when the loop exhausts without converging.
        raise nx.ExceededMaxIterations(
            f"simrank did not converge after {max_iterations} iterations."
        )

    if source is not None and target is not None:
        return newsim[source, target]
    if source is not None:
        return newsim[source]
    return newsim
@nx._dispatch(edge_attrs="weight")
def panther_similarity(
    G, source, k=5, path_length=5, c=0.5, delta=0.1, eps=None, weight="weight"
):
    r"""Returns the Panther similarity of nodes in the graph `G` to node ``v``.

    Panther is a similarity metric that says "two objects are considered
    to be similar if they frequently appear on the same paths." [1]_.

    Parameters
    ----------
    G : NetworkX graph
        A NetworkX graph
    source : node
        Source node for which to find the top `k` similar other nodes
    k : int (default = 5)
        The number of most similar nodes to return
    path_length : int (default = 5)
        How long the randomly generated paths should be (``T`` in [1]_)
    c : float (default = 0.5)
        A universal positive constant used to scale the number
        of sample random paths to generate.
    delta : float (default = 0.1)
        The probability that the similarity $S$ is not an epsilon-approximation to (R, phi),
        where $R$ is the number of random paths and $\phi$ is the probability
        that an element sampled from a set $A \subseteq D$, where $D$ is the domain.
    eps : float or None (default = None)
        The error bound. Per [1]_, a good value is ``sqrt(1/|E|)``. Therefore,
        if no value is provided, the recommended computed value will be used.
    weight : string or None, optional (default="weight")
        The name of an edge attribute that holds the numerical value
        used as a weight. If None then each edge has weight 1.

    Returns
    -------
    similarity : dictionary
        Dictionary of nodes to similarity scores (as floats). Note:
        the self-similarity (i.e., ``v``) will not be included in
        the returned dictionary, so it may contain up to ``k - 1`` entries.

    Examples
    --------
    >>> G = nx.star_graph(10)
    >>> sim = nx.panther_similarity(G, 0)

    References
    ----------
    .. [1] Zhang, J., Tang, J., Ma, C., Tong, H., Jing, Y., & Li, J.
           Panther: Fast top-k similarity search on large networks.
           In Proceedings of the ACM SIGKDD International Conference
           on Knowledge Discovery and Data Mining (Vol. 2015-August, pp. 1445–1454).
           Association for Computing Machinery. https://doi.org/10.1145/2783258.2783267.
    """
    import numpy as np

    num_nodes = G.number_of_nodes()
    # Clamp ``k`` so the argpartition below stays within bounds.
    if num_nodes < k:
        warnings.warn(
            f"Number of nodes is {num_nodes}, but requested k is {k}. "
            "Setting k to number of nodes."
        )
        k = num_nodes
    # According to [1], they empirically determined
    # a good value for ``eps`` to be sqrt( 1 / |E| )
    # NOTE(review): a graph with no edges raises ZeroDivisionError here —
    # consider validating ``G.number_of_edges() > 0`` with a clearer error.
    if eps is None:
        eps = np.sqrt(1.0 / G.number_of_edges())

    # Map node labels to positional indices (and back via ``node_map``).
    inv_node_map = {name: index for index, name in enumerate(G.nodes)}
    node_map = np.array(G)

    # Calculate the sample size ``R`` for how many paths
    # to randomly generate
    t_choose_2 = math.comb(path_length, 2)
    sample_size = int((c / eps**2) * (np.log2(t_choose_2) + 1 + np.log(1 / delta)))
    index_map = {}
    # ``generate_random_paths`` is lazy; consuming it fully is what
    # populates ``index_map`` (node -> set of path indices).
    _ = list(
        generate_random_paths(
            G, sample_size, path_length=path_length, index_map=index_map, weight=weight
        )
    )
    # Per-node similarity scores, indexed positionally.
    S = np.zeros(num_nodes)

    inv_sample_size = 1 / sample_size

    # NOTE(review): raises KeyError if ``source`` never appeared on any
    # sampled path (e.g., an isolated source) — verify against callers.
    source_paths = set(index_map[source])

    # Calculate the path similarities
    # between ``source`` (v) and ``node`` (v_j)
    # using our inverted index mapping of
    # vertices to paths
    for node, paths in index_map.items():
        # Only consider paths where both
        # ``node`` and ``source`` are present
        common_paths = source_paths.intersection(paths)
        S[inv_node_map[node]] = len(common_paths) * inv_sample_size

    # Retrieve top ``k`` similar
    # Note: the below performed anywhere from 4-10x faster
    # (depending on input sizes) vs the equivalent ``np.argsort(S)[::-1]``
    top_k_unsorted = np.argpartition(S, -k)[-k:]
    top_k_sorted = top_k_unsorted[np.argsort(S[top_k_unsorted])][::-1]

    # Add back the similarity scores
    top_k_sorted_names = (node_map[n] for n in top_k_sorted)
    top_k_with_val = dict(zip(top_k_sorted_names, S[top_k_sorted]))

    # Remove the self-similarity
    top_k_with_val.pop(source, None)
    return top_k_with_val
@nx._dispatch(edge_attrs="weight")
def generate_random_paths(
    G, sample_size, path_length=5, index_map=None, weight="weight"
):
    """Randomly generate `sample_size` paths of length `path_length`.

    Parameters
    ----------
    G : NetworkX graph
        A NetworkX graph
    sample_size : integer
        The number of paths to generate. This is ``R`` in [1]_.
    path_length : integer (default = 5)
        The maximum size of the path to randomly generate.
        This is ``T`` in [1]_. According to the paper, ``T >= 5`` is
        recommended.
    index_map : dictionary, optional
        If provided, this will be populated with the inverted
        index of nodes mapped to the set of generated random path
        indices within ``paths``. Note that since this function is a
        generator, the index is only filled in as paths are consumed.
    weight : string or None, optional (default="weight")
        The name of an edge attribute that holds the numerical value
        used as a weight. If None then each edge has weight 1.

    Returns
    -------
    paths : generator of lists
        Generator of `sample_size` paths each with length `path_length`.

    Examples
    --------
    Note that the return value is a generator of paths, so it must be
    consumed (here with ``list``) to materialize them:

    >>> G = nx.star_graph(3)
    >>> random_paths = list(nx.generate_random_paths(G, 2))

    By passing a dictionary into `index_map`, it will build an
    inverted index mapping of nodes to the paths in which that node is present:

    >>> G = nx.star_graph(3)
    >>> index_map = {}
    >>> random_paths = list(nx.generate_random_paths(G, 3, index_map=index_map))
    >>> paths_containing_node_0 = [random_paths[path_idx] for path_idx in index_map.get(0, [])]

    References
    ----------
    .. [1] Zhang, J., Tang, J., Ma, C., Tong, H., Jing, Y., & Li, J.
           Panther: Fast top-k similarity search on large networks.
           In Proceedings of the ACM SIGKDD International Conference
           on Knowledge Discovery and Data Mining (Vol. 2015-August, pp. 1445–1454).
           Association for Computing Machinery. https://doi.org/10.1145/2783258.2783267.
    """
    import numpy as np

    # Calculate transition probabilities between
    # every pair of vertices according to Eq. (3)
    # NOTE(review): rows for isolated nodes sum to 0, so ``np.reciprocal``
    # yields inf here and ``np.random.choice`` below would reject the
    # resulting probability vector — confirm callers pass graphs without
    # isolated nodes.
    adj_mat = nx.to_numpy_array(G, weight=weight)
    inv_row_sums = np.reciprocal(adj_mat.sum(axis=1)).reshape(-1, 1)
    transition_probabilities = adj_mat * inv_row_sums

    # Positional index -> node label lookup.
    node_map = np.array(G)
    num_nodes = G.number_of_nodes()

    # Uses the legacy global numpy RNG; reproducibility requires seeding
    # ``np.random`` externally.
    for path_index in range(sample_size):
        # Sample current vertex v = v_i uniformly at random
        node_index = np.random.randint(0, high=num_nodes)
        node = node_map[node_index]

        # Add v into p_r and add p_r into the path set
        # of v, i.e., P_v
        path = [node]

        # Build the inverted index (P_v) of vertices to paths
        if index_map is not None:
            if node in index_map:
                index_map[node].add(path_index)
            else:
                index_map[node] = {path_index}

        starting_index = node_index
        for _ in range(path_length):
            # Randomly sample a neighbor (v_j) according
            # to transition probabilities from ``node`` (v) to its neighbors
            neighbor_index = np.random.choice(
                num_nodes, p=transition_probabilities[starting_index]
            )

            # Set current vertex (v = v_j)
            starting_index = neighbor_index

            # Add v into p_r
            neighbor_node = node_map[neighbor_index]
            path.append(neighbor_node)

            # Add p_r into P_v
            if index_map is not None:
                if neighbor_node in index_map:
                    index_map[neighbor_node].add(path_index)
                else:
                    index_map[neighbor_node] = {path_index}

        yield path