1"""
2Functions for hashing graphs to strings.
3Isomorphic graphs should be assigned identical hashes.
4For now, only Weisfeiler-Lehman hashing is implemented.
5"""
6
7import warnings
8from collections import Counter, defaultdict
9from hashlib import blake2b
10
11import networkx as nx
12
13__all__ = ["weisfeiler_lehman_graph_hash", "weisfeiler_lehman_subgraph_hashes"]
14
15
16def _hash_label(label, digest_size):
17 return blake2b(label.encode("ascii"), digest_size=digest_size).hexdigest()
18
19
20def _init_node_labels(G, edge_attr, node_attr):
21 if node_attr:
22 return {u: str(dd[node_attr]) for u, dd in G.nodes(data=True)}
23 elif edge_attr:
24 return {u: "" for u in G}
25 else:
26 warnings.warn(
27 "The hashes produced for graphs without node or edge attributes "
28 "changed in v3.5 due to a bugfix (see documentation).",
29 UserWarning,
30 stacklevel=2,
31 )
32 if nx.is_directed(G):
33 return {u: str(G.in_degree(u)) + "_" + str(G.out_degree(u)) for u in G}
34 else:
35 return {u: str(deg) for u, deg in G.degree()}
36
37
38def _neighborhood_aggregate_undirected(G, node, node_labels, edge_attr=None):
39 """
40 Compute new labels for given node in an undirected graph by aggregating
41 the labels of each node's neighbors.
42 """
43 label_list = []
44 for nbr in G.neighbors(node):
45 prefix = "" if edge_attr is None else str(G[node][nbr][edge_attr])
46 label_list.append(prefix + node_labels[nbr])
47 return node_labels[node] + "".join(sorted(label_list))
48
49
50def _neighborhood_aggregate_directed(G, node, node_labels, edge_attr=None):
51 """
52 Compute new labels for given node in a directed graph by aggregating
53 the labels of each node's neighbors.
54 """
55 successor_labels = []
56 for nbr in G.successors(node):
57 prefix = "s_" + "" if edge_attr is None else str(G[node][nbr][edge_attr])
58 successor_labels.append(prefix + node_labels[nbr])
59
60 predecessor_labels = []
61 for nbr in G.predecessors(node):
62 prefix = "p_" + "" if edge_attr is None else str(G[nbr][node][edge_attr])
63 predecessor_labels.append(prefix + node_labels[nbr])
64 return (
65 node_labels[node]
66 + "".join(sorted(successor_labels))
67 + "".join(sorted(predecessor_labels))
68 )
69
70
71@nx.utils.not_implemented_for("multigraph")
72@nx._dispatchable(edge_attrs={"edge_attr": None}, node_attrs="node_attr")
73def weisfeiler_lehman_graph_hash(
74 G, edge_attr=None, node_attr=None, iterations=3, digest_size=16
75):
76 """Return Weisfeiler Lehman (WL) graph hash.
77
78 .. Warning:: Hash values for directed graphs and graphs without edge or
79 node attributes have changed in v3.5. In previous versions,
80 directed graphs did not distinguish in- and outgoing edges. Also,
81 graphs without attributes set initial states such that effectively
82 one extra iteration of WL occurred than indicated by `iterations`.
83 For undirected graphs without node or edge labels, the old
84 hashes can be obtained by increasing the iteration count by one.
85 For more details, see `issue #7806
86 <https://github.com/networkx/networkx/issues/7806>`_.
87
88 The function iteratively aggregates and hashes neighborhoods of each node.
89 After each node's neighbors are hashed to obtain updated node labels,
90 a hashed histogram of resulting labels is returned as the final hash.
91
92 Hashes are identical for isomorphic graphs and strong guarantees that
93 non-isomorphic graphs will get different hashes. See [1]_ for details.
94
95 If no node or edge attributes are provided, the degree of each node
96 is used as its initial label.
97 Otherwise, node and/or edge labels are used to compute the hash.
98
99 Parameters
100 ----------
101 G : graph
102 The graph to be hashed.
103 Can have node and/or edge attributes. Can also have no attributes.
104 edge_attr : string, optional (default=None)
105 The key in edge attribute dictionary to be used for hashing.
106 If None, edge labels are ignored.
107 node_attr: string, optional (default=None)
108 The key in node attribute dictionary to be used for hashing.
109 If None, and no edge_attr given, use the degrees of the nodes as labels.
110 iterations: int, optional (default=3)
111 Number of neighbor aggregations to perform.
112 Should be larger for larger graphs.
113 digest_size: int, optional (default=16)
114 Size (in bytes) of blake2b hash digest to use for hashing node labels.
115
116 Returns
117 -------
118 h : string
119 Hexadecimal string corresponding to hash of `G` (length ``2 * digest_size``).
120
121 Raises
122 ------
123 ValueError
124 If `iterations` is not a positve number.
125
126 Examples
127 --------
128 Two graphs with edge attributes that are isomorphic, except for
129 differences in the edge labels.
130
131 >>> G1 = nx.Graph()
132 >>> G1.add_edges_from(
133 ... [
134 ... (1, 2, {"label": "A"}),
135 ... (2, 3, {"label": "A"}),
136 ... (3, 1, {"label": "A"}),
137 ... (1, 4, {"label": "B"}),
138 ... ]
139 ... )
140 >>> G2 = nx.Graph()
141 >>> G2.add_edges_from(
142 ... [
143 ... (5, 6, {"label": "B"}),
144 ... (6, 7, {"label": "A"}),
145 ... (7, 5, {"label": "A"}),
146 ... (7, 8, {"label": "A"}),
147 ... ]
148 ... )
149
150 Omitting the `edge_attr` option, results in identical hashes.
151
152 >>> nx.weisfeiler_lehman_graph_hash(G1)
153 'c045439172215f49e0bef8c3d26c6b61'
154 >>> nx.weisfeiler_lehman_graph_hash(G2)
155 'c045439172215f49e0bef8c3d26c6b61'
156
157 With edge labels, the graphs are no longer assigned
158 the same hash digest.
159
160 >>> nx.weisfeiler_lehman_graph_hash(G1, edge_attr="label")
161 'c653d85538bcf041d88c011f4f905f10'
162 >>> nx.weisfeiler_lehman_graph_hash(G2, edge_attr="label")
163 '3dcd84af1ca855d0eff3c978d88e7ec7'
164
165 Notes
166 -----
167 To return the WL hashes of each subgraph of a graph, use
168 `weisfeiler_lehman_subgraph_hashes`
169
170 Similarity between hashes does not imply similarity between graphs.
171
172 References
173 ----------
174 .. [1] Shervashidze, Nino, Pascal Schweitzer, Erik Jan Van Leeuwen,
175 Kurt Mehlhorn, and Karsten M. Borgwardt. Weisfeiler Lehman
176 Graph Kernels. Journal of Machine Learning Research. 2011.
177 http://www.jmlr.org/papers/volume12/shervashidze11a/shervashidze11a.pdf
178
179 See also
180 --------
181 weisfeiler_lehman_subgraph_hashes
182 """
183
184 if G.is_directed():
185 _neighborhood_aggregate = _neighborhood_aggregate_directed
186 warnings.warn(
187 "The hashes produced for directed graphs changed in version v3.5"
188 " due to a bugfix to track in and out edges separately (see documentation).",
189 UserWarning,
190 stacklevel=2,
191 )
192 else:
193 _neighborhood_aggregate = _neighborhood_aggregate_undirected
194
195 def weisfeiler_lehman_step(G, labels, edge_attr=None):
196 """
197 Apply neighborhood aggregation to each node
198 in the graph.
199 Computes a dictionary with labels for each node.
200 """
201 new_labels = {}
202 for node in G.nodes():
203 label = _neighborhood_aggregate(G, node, labels, edge_attr=edge_attr)
204 new_labels[node] = _hash_label(label, digest_size)
205 return new_labels
206
207 if iterations <= 0:
208 raise ValueError("The WL algorithm requires that `iterations` be positive")
209
210 # set initial node labels
211 node_labels = _init_node_labels(G, edge_attr, node_attr)
212
213 # If the graph has no attributes, initial labels are the nodes' degrees.
214 # This is equivalent to doing the first iterations of WL.
215 if not edge_attr and not node_attr:
216 iterations -= 1
217
218 subgraph_hash_counts = []
219 for _ in range(iterations):
220 node_labels = weisfeiler_lehman_step(G, node_labels, edge_attr=edge_attr)
221 counter = Counter(node_labels.values())
222 # sort the counter, extend total counts
223 subgraph_hash_counts.extend(sorted(counter.items(), key=lambda x: x[0]))
224
225 # hash the final counter
226 return _hash_label(str(tuple(subgraph_hash_counts)), digest_size)
227
228
229@nx.utils.not_implemented_for("multigraph")
230@nx._dispatchable(edge_attrs={"edge_attr": None}, node_attrs="node_attr")
231def weisfeiler_lehman_subgraph_hashes(
232 G,
233 edge_attr=None,
234 node_attr=None,
235 iterations=3,
236 digest_size=16,
237 include_initial_labels=False,
238):
239 """
240 Return a dictionary of subgraph hashes by node.
241
242 .. Warning:: Hash values for directed graphs have changed in version
243 v3.5. In previous versions, directed graphs did not distinguish in-
244 and outgoing edges.
245 Graphs without attributes previously performed an extra iteration of
246 WL at initialisation, which was not visible in the output of this
247 function. This hash value is now included in the returned dictionary,
248 shifting the other calculated hashes one position to the right. To
249 obtain the same last subgraph hash, increase the number of iterations
250 by one.
251 For more details, see `issue #7806
252 <https://github.com/networkx/networkx/issues/7806>`_.
253
254 Dictionary keys are nodes in `G`, and values are a list of hashes.
255 Each hash corresponds to a subgraph rooted at a given node u in `G`.
256 Lists of subgraph hashes are sorted in increasing order of depth from
257 their root node, with the hash at index i corresponding to a subgraph
258 of nodes at most i-hops (i edges) distance from u. Thus, each list will contain
259 `iterations` elements - a hash for a subgraph at each depth. If
260 `include_initial_labels` is set to `True`, each list will additionally
261 have contain a hash of the initial node label (or equivalently a
262 subgraph of depth 0) prepended, totalling ``iterations + 1`` elements.
263
264 The function iteratively aggregates and hashes neighborhoods of each node.
265 This is achieved for each step by replacing for each node its label from
266 the previous iteration with its hashed 1-hop neighborhood aggregate.
267 The new node label is then appended to a list of node labels for each
268 node.
269
270 To aggregate neighborhoods for a node $u$ at each step, all labels of
271 nodes adjacent to $u$ are concatenated. If the `edge_attr` parameter is set,
272 labels for each neighboring node are prefixed with the value of this attribute
273 along the connecting edge from this neighbor to node $u$. The resulting string
274 is then hashed to compress this information into a fixed digest size.
275
276 Thus, at the i-th iteration, nodes within i hops influence any given
277 hashed node label. We can therefore say that at depth $i$ for node $u$
278 we have a hash for a subgraph induced by the i-hop neighborhood of $u$.
279
280 The output can be used to create general Weisfeiler-Lehman graph kernels,
281 or generate features for graphs or nodes - for example to generate 'words' in
282 a graph as seen in the 'graph2vec' algorithm.
283 See [1]_ & [2]_ respectively for details.
284
285 Hashes are identical for isomorphic subgraphs and there exist strong
286 guarantees that non-isomorphic graphs will get different hashes.
287 See [1]_ for details.
288
289 If no node or edge attributes are provided, the degree of each node
290 is used as its initial label.
291 Otherwise, node and/or edge labels are used to compute the hash.
292
293 Parameters
294 ----------
295 G : graph
296 The graph to be hashed.
297 Can have node and/or edge attributes. Can also have no attributes.
298 edge_attr : string, optional (default=None)
299 The key in edge attribute dictionary to be used for hashing.
300 If None, edge labels are ignored.
301 node_attr : string, optional (default=None)
302 The key in node attribute dictionary to be used for hashing.
303 If None, and no edge_attr given, use the degrees of the nodes as labels.
304 If None, and edge_attr is given, each node starts with an identical label.
305 iterations : int, optional (default=3)
306 Number of neighbor aggregations to perform.
307 Should be larger for larger graphs.
308 digest_size : int, optional (default=16)
309 Size (in bytes) of blake2b hash digest to use for hashing node labels.
310 The default size is 16 bytes.
311 include_initial_labels : bool, optional (default=False)
312 If True, include the hashed initial node label as the first subgraph
313 hash for each node.
314
315 Returns
316 -------
317 node_subgraph_hashes : dict
318 A dictionary with each key given by a node in G, and each value given
319 by the subgraph hashes in order of depth from the key node.
320 Hashes are hexadecimal strings (hence ``2 * digest_size`` long).
321
322
323 Raises
324 ------
325 ValueError
326 If `iterations` is not a positve number.
327
328 Examples
329 --------
330 Finding similar nodes in different graphs:
331
332 >>> G1 = nx.Graph()
333 >>> G1.add_edges_from([(1, 2), (2, 3), (2, 4), (3, 5), (4, 6), (5, 7), (6, 7)])
334 >>> G2 = nx.Graph()
335 >>> G2.add_edges_from([(1, 3), (2, 3), (1, 6), (1, 5), (4, 6)])
336 >>> g1_hashes = nx.weisfeiler_lehman_subgraph_hashes(
337 ... G1, iterations=4, digest_size=8
338 ... )
339 >>> g2_hashes = nx.weisfeiler_lehman_subgraph_hashes(
340 ... G2, iterations=4, digest_size=8
341 ... )
342
343 Even though G1 and G2 are not isomorphic (they have different numbers of edges),
344 the hash sequence of depth 3 for node 1 in G1 and node 5 in G2 are similar:
345
346 >>> g1_hashes[1]
347 ['f6fc42039fba3776', 'a93b64973cfc8897', 'db1b43ae35a1878f', '57872a7d2059c1c0']
348 >>> g2_hashes[5]
349 ['f6fc42039fba3776', 'a93b64973cfc8897', 'db1b43ae35a1878f', '1716d2a4012fa4bc']
350
351 The first 3 WL subgraph hashes match. From this we can conclude that it's very
352 likely the neighborhood of 3 hops around these nodes are isomorphic.
353
354 However the 4-hop neighborhoods of ``G1`` and ``G2`` are not isomorphic since the
355 4th hashes in the lists above are not equal.
356
357 These nodes may be candidates to be classified together since their local topology
358 is similar.
359
360 Notes
361 -----
362 To hash the full graph when subgraph hashes are not needed, use
363 `weisfeiler_lehman_graph_hash` for efficiency.
364
365 Similarity between hashes does not imply similarity between graphs.
366
367 References
368 ----------
369 .. [1] Shervashidze, Nino, Pascal Schweitzer, Erik Jan Van Leeuwen,
370 Kurt Mehlhorn, and Karsten M. Borgwardt. Weisfeiler Lehman
371 Graph Kernels. Journal of Machine Learning Research. 2011.
372 http://www.jmlr.org/papers/volume12/shervashidze11a/shervashidze11a.pdf
373 .. [2] Annamalai Narayanan, Mahinthan Chandramohan, Rajasekar Venkatesan,
374 Lihui Chen, Yang Liu and Shantanu Jaiswa. graph2vec: Learning
375 Distributed Representations of Graphs. arXiv. 2017
376 https://arxiv.org/pdf/1707.05005.pdf
377
378 See also
379 --------
380 weisfeiler_lehman_graph_hash
381 """
382
383 if G.is_directed():
384 _neighborhood_aggregate = _neighborhood_aggregate_directed
385 warnings.warn(
386 "The hashes produced for directed graphs changed in v3.5"
387 " due to a bugfix (see documentation).",
388 UserWarning,
389 stacklevel=2,
390 )
391 else:
392 _neighborhood_aggregate = _neighborhood_aggregate_undirected
393
394 def weisfeiler_lehman_step(G, labels, node_subgraph_hashes, edge_attr=None):
395 """
396 Apply neighborhood aggregation to each node
397 in the graph.
398 Computes a dictionary with labels for each node.
399 Appends the new hashed label to the dictionary of subgraph hashes
400 originating from and indexed by each node in G
401 """
402 new_labels = {}
403 for node in G.nodes():
404 label = _neighborhood_aggregate(G, node, labels, edge_attr=edge_attr)
405 hashed_label = _hash_label(label, digest_size)
406 new_labels[node] = hashed_label
407 node_subgraph_hashes[node].append(hashed_label)
408 return new_labels
409
410 if iterations <= 0:
411 raise ValueError("The WL algorithm requires that `iterations` be positive")
412
413 node_labels = _init_node_labels(G, edge_attr, node_attr)
414
415 if include_initial_labels:
416 node_subgraph_hashes = {
417 k: [_hash_label(v, digest_size)] for k, v in node_labels.items()
418 }
419 else:
420 node_subgraph_hashes = defaultdict(list)
421
422 # If the graph has no attributes, initial labels are the nodes' degrees.
423 # This is equivalent to doing the first iterations of WL.
424 if not edge_attr and not node_attr:
425 iterations -= 1
426 for node in G.nodes():
427 hashed_label = _hash_label(node_labels[node], digest_size)
428 node_subgraph_hashes[node].append(hashed_label)
429
430 for _ in range(iterations):
431 node_labels = weisfeiler_lehman_step(
432 G, node_labels, node_subgraph_hashes, edge_attr
433 )
434
435 return dict(node_subgraph_hashes)