Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/networkx/algorithms/graph_hashing.py: 19%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

86 statements  

1""" 

2Functions for hashing graphs to strings. 

3Isomorphic graphs should be assigned identical hashes. 

4For now, only Weisfeiler-Lehman hashing is implemented. 

5""" 

6 

7import warnings 

8from collections import Counter, defaultdict 

9from hashlib import blake2b 

10 

11import networkx as nx 

12 

13__all__ = ["weisfeiler_lehman_graph_hash", "weisfeiler_lehman_subgraph_hashes"] 

14 

15 

16def _hash_label(label, digest_size): 

17 return blake2b(label.encode("ascii"), digest_size=digest_size).hexdigest() 

18 

19 

20def _init_node_labels(G, edge_attr, node_attr): 

21 if node_attr: 

22 return {u: str(dd[node_attr]) for u, dd in G.nodes(data=True)} 

23 elif edge_attr: 

24 return {u: "" for u in G} 

25 else: 

26 warnings.warn( 

27 "The hashes produced for graphs without node or edge attributes " 

28 "changed in v3.5 due to a bugfix (see documentation).", 

29 UserWarning, 

30 stacklevel=2, 

31 ) 

32 if nx.is_directed(G): 

33 return {u: str(G.in_degree(u)) + "_" + str(G.out_degree(u)) for u in G} 

34 else: 

35 return {u: str(deg) for u, deg in G.degree()} 

36 

37 

38def _neighborhood_aggregate_undirected(G, node, node_labels, edge_attr=None): 

39 """ 

40 Compute new labels for given node in an undirected graph by aggregating 

41 the labels of each node's neighbors. 

42 """ 

43 label_list = [] 

44 for nbr in G.neighbors(node): 

45 prefix = "" if edge_attr is None else str(G[node][nbr][edge_attr]) 

46 label_list.append(prefix + node_labels[nbr]) 

47 return node_labels[node] + "".join(sorted(label_list)) 

48 

49 

50def _neighborhood_aggregate_directed(G, node, node_labels, edge_attr=None): 

51 """ 

52 Compute new labels for given node in a directed graph by aggregating 

53 the labels of each node's neighbors. 

54 """ 

55 successor_labels = [] 

56 for nbr in G.successors(node): 

57 prefix = "s_" + "" if edge_attr is None else str(G[node][nbr][edge_attr]) 

58 successor_labels.append(prefix + node_labels[nbr]) 

59 

60 predecessor_labels = [] 

61 for nbr in G.predecessors(node): 

62 prefix = "p_" + "" if edge_attr is None else str(G[nbr][node][edge_attr]) 

63 predecessor_labels.append(prefix + node_labels[nbr]) 

64 return ( 

65 node_labels[node] 

66 + "".join(sorted(successor_labels)) 

67 + "".join(sorted(predecessor_labels)) 

68 ) 

69 

70 

71@nx.utils.not_implemented_for("multigraph") 

72@nx._dispatchable(edge_attrs={"edge_attr": None}, node_attrs="node_attr") 

73def weisfeiler_lehman_graph_hash( 

74 G, edge_attr=None, node_attr=None, iterations=3, digest_size=16 

75): 

76 """Return Weisfeiler Lehman (WL) graph hash. 

77 

78 .. Warning:: Hash values for directed graphs and graphs without edge or 

79 node attributes have changed in v3.5. In previous versions, 

80 directed graphs did not distinguish in- and outgoing edges. Also, 

81 graphs without attributes set initial states such that effectively 

82 one extra iteration of WL occurred than indicated by `iterations`. 

83 For undirected graphs without node or edge labels, the old 

84 hashes can be obtained by increasing the iteration count by one. 

85 For more details, see `issue #7806 

86 <https://github.com/networkx/networkx/issues/7806>`_. 

87 

88 The function iteratively aggregates and hashes neighborhoods of each node. 

89 After each node's neighbors are hashed to obtain updated node labels, 

90 a hashed histogram of resulting labels is returned as the final hash. 

91 

92 Hashes are identical for isomorphic graphs and strong guarantees that 

93 non-isomorphic graphs will get different hashes. See [1]_ for details. 

94 

95 If no node or edge attributes are provided, the degree of each node 

96 is used as its initial label. 

97 Otherwise, node and/or edge labels are used to compute the hash. 

98 

99 Parameters 

100 ---------- 

101 G : graph 

102 The graph to be hashed. 

103 Can have node and/or edge attributes. Can also have no attributes. 

104 edge_attr : string, optional (default=None) 

105 The key in edge attribute dictionary to be used for hashing. 

106 If None, edge labels are ignored. 

107 node_attr: string, optional (default=None) 

108 The key in node attribute dictionary to be used for hashing. 

109 If None, and no edge_attr given, use the degrees of the nodes as labels. 

110 iterations: int, optional (default=3) 

111 Number of neighbor aggregations to perform. 

112 Should be larger for larger graphs. 

113 digest_size: int, optional (default=16) 

114 Size (in bytes) of blake2b hash digest to use for hashing node labels. 

115 

116 Returns 

117 ------- 

118 h : string 

119 Hexadecimal string corresponding to hash of `G` (length ``2 * digest_size``). 

120 

121 Raises 

122 ------ 

123 ValueError 

124 If `iterations` is not a positve number. 

125 

126 Examples 

127 -------- 

128 Two graphs with edge attributes that are isomorphic, except for 

129 differences in the edge labels. 

130 

131 >>> G1 = nx.Graph() 

132 >>> G1.add_edges_from( 

133 ... [ 

134 ... (1, 2, {"label": "A"}), 

135 ... (2, 3, {"label": "A"}), 

136 ... (3, 1, {"label": "A"}), 

137 ... (1, 4, {"label": "B"}), 

138 ... ] 

139 ... ) 

140 >>> G2 = nx.Graph() 

141 >>> G2.add_edges_from( 

142 ... [ 

143 ... (5, 6, {"label": "B"}), 

144 ... (6, 7, {"label": "A"}), 

145 ... (7, 5, {"label": "A"}), 

146 ... (7, 8, {"label": "A"}), 

147 ... ] 

148 ... ) 

149 

150 Omitting the `edge_attr` option, results in identical hashes. 

151 

152 >>> nx.weisfeiler_lehman_graph_hash(G1) 

153 'c045439172215f49e0bef8c3d26c6b61' 

154 >>> nx.weisfeiler_lehman_graph_hash(G2) 

155 'c045439172215f49e0bef8c3d26c6b61' 

156 

157 With edge labels, the graphs are no longer assigned 

158 the same hash digest. 

159 

160 >>> nx.weisfeiler_lehman_graph_hash(G1, edge_attr="label") 

161 'c653d85538bcf041d88c011f4f905f10' 

162 >>> nx.weisfeiler_lehman_graph_hash(G2, edge_attr="label") 

163 '3dcd84af1ca855d0eff3c978d88e7ec7' 

164 

165 Notes 

166 ----- 

167 To return the WL hashes of each subgraph of a graph, use 

168 `weisfeiler_lehman_subgraph_hashes` 

169 

170 Similarity between hashes does not imply similarity between graphs. 

171 

172 References 

173 ---------- 

174 .. [1] Shervashidze, Nino, Pascal Schweitzer, Erik Jan Van Leeuwen, 

175 Kurt Mehlhorn, and Karsten M. Borgwardt. Weisfeiler Lehman 

176 Graph Kernels. Journal of Machine Learning Research. 2011. 

177 http://www.jmlr.org/papers/volume12/shervashidze11a/shervashidze11a.pdf 

178 

179 See also 

180 -------- 

181 weisfeiler_lehman_subgraph_hashes 

182 """ 

183 

184 if G.is_directed(): 

185 _neighborhood_aggregate = _neighborhood_aggregate_directed 

186 warnings.warn( 

187 "The hashes produced for directed graphs changed in version v3.5" 

188 " due to a bugfix to track in and out edges separately (see documentation).", 

189 UserWarning, 

190 stacklevel=2, 

191 ) 

192 else: 

193 _neighborhood_aggregate = _neighborhood_aggregate_undirected 

194 

195 def weisfeiler_lehman_step(G, labels, edge_attr=None): 

196 """ 

197 Apply neighborhood aggregation to each node 

198 in the graph. 

199 Computes a dictionary with labels for each node. 

200 """ 

201 new_labels = {} 

202 for node in G.nodes(): 

203 label = _neighborhood_aggregate(G, node, labels, edge_attr=edge_attr) 

204 new_labels[node] = _hash_label(label, digest_size) 

205 return new_labels 

206 

207 if iterations <= 0: 

208 raise ValueError("The WL algorithm requires that `iterations` be positive") 

209 

210 # set initial node labels 

211 node_labels = _init_node_labels(G, edge_attr, node_attr) 

212 

213 # If the graph has no attributes, initial labels are the nodes' degrees. 

214 # This is equivalent to doing the first iterations of WL. 

215 if not edge_attr and not node_attr: 

216 iterations -= 1 

217 

218 subgraph_hash_counts = [] 

219 for _ in range(iterations): 

220 node_labels = weisfeiler_lehman_step(G, node_labels, edge_attr=edge_attr) 

221 counter = Counter(node_labels.values()) 

222 # sort the counter, extend total counts 

223 subgraph_hash_counts.extend(sorted(counter.items(), key=lambda x: x[0])) 

224 

225 # hash the final counter 

226 return _hash_label(str(tuple(subgraph_hash_counts)), digest_size) 

227 

228 

229@nx.utils.not_implemented_for("multigraph") 

230@nx._dispatchable(edge_attrs={"edge_attr": None}, node_attrs="node_attr") 

231def weisfeiler_lehman_subgraph_hashes( 

232 G, 

233 edge_attr=None, 

234 node_attr=None, 

235 iterations=3, 

236 digest_size=16, 

237 include_initial_labels=False, 

238): 

239 """ 

240 Return a dictionary of subgraph hashes by node. 

241 

242 .. Warning:: Hash values for directed graphs have changed in version 

243 v3.5. In previous versions, directed graphs did not distinguish in- 

244 and outgoing edges. 

245 Graphs without attributes previously performed an extra iteration of 

246 WL at initialisation, which was not visible in the output of this 

247 function. This hash value is now included in the returned dictionary, 

248 shifting the other calculated hashes one position to the right. To 

249 obtain the same last subgraph hash, increase the number of iterations 

250 by one. 

251 For more details, see `issue #7806 

252 <https://github.com/networkx/networkx/issues/7806>`_. 

253 

254 Dictionary keys are nodes in `G`, and values are a list of hashes. 

255 Each hash corresponds to a subgraph rooted at a given node u in `G`. 

256 Lists of subgraph hashes are sorted in increasing order of depth from 

257 their root node, with the hash at index i corresponding to a subgraph 

258 of nodes at most i-hops (i edges) distance from u. Thus, each list will contain 

259 `iterations` elements - a hash for a subgraph at each depth. If 

260 `include_initial_labels` is set to `True`, each list will additionally 

261 have contain a hash of the initial node label (or equivalently a 

262 subgraph of depth 0) prepended, totalling ``iterations + 1`` elements. 

263 

264 The function iteratively aggregates and hashes neighborhoods of each node. 

265 This is achieved for each step by replacing for each node its label from 

266 the previous iteration with its hashed 1-hop neighborhood aggregate. 

267 The new node label is then appended to a list of node labels for each 

268 node. 

269 

270 To aggregate neighborhoods for a node $u$ at each step, all labels of 

271 nodes adjacent to $u$ are concatenated. If the `edge_attr` parameter is set, 

272 labels for each neighboring node are prefixed with the value of this attribute 

273 along the connecting edge from this neighbor to node $u$. The resulting string 

274 is then hashed to compress this information into a fixed digest size. 

275 

276 Thus, at the i-th iteration, nodes within i hops influence any given 

277 hashed node label. We can therefore say that at depth $i$ for node $u$ 

278 we have a hash for a subgraph induced by the i-hop neighborhood of $u$. 

279 

280 The output can be used to create general Weisfeiler-Lehman graph kernels, 

281 or generate features for graphs or nodes - for example to generate 'words' in 

282 a graph as seen in the 'graph2vec' algorithm. 

283 See [1]_ & [2]_ respectively for details. 

284 

285 Hashes are identical for isomorphic subgraphs and there exist strong 

286 guarantees that non-isomorphic graphs will get different hashes. 

287 See [1]_ for details. 

288 

289 If no node or edge attributes are provided, the degree of each node 

290 is used as its initial label. 

291 Otherwise, node and/or edge labels are used to compute the hash. 

292 

293 Parameters 

294 ---------- 

295 G : graph 

296 The graph to be hashed. 

297 Can have node and/or edge attributes. Can also have no attributes. 

298 edge_attr : string, optional (default=None) 

299 The key in edge attribute dictionary to be used for hashing. 

300 If None, edge labels are ignored. 

301 node_attr : string, optional (default=None) 

302 The key in node attribute dictionary to be used for hashing. 

303 If None, and no edge_attr given, use the degrees of the nodes as labels. 

304 If None, and edge_attr is given, each node starts with an identical label. 

305 iterations : int, optional (default=3) 

306 Number of neighbor aggregations to perform. 

307 Should be larger for larger graphs. 

308 digest_size : int, optional (default=16) 

309 Size (in bytes) of blake2b hash digest to use for hashing node labels. 

310 The default size is 16 bytes. 

311 include_initial_labels : bool, optional (default=False) 

312 If True, include the hashed initial node label as the first subgraph 

313 hash for each node. 

314 

315 Returns 

316 ------- 

317 node_subgraph_hashes : dict 

318 A dictionary with each key given by a node in G, and each value given 

319 by the subgraph hashes in order of depth from the key node. 

320 Hashes are hexadecimal strings (hence ``2 * digest_size`` long). 

321 

322 

323 Raises 

324 ------ 

325 ValueError 

326 If `iterations` is not a positve number. 

327 

328 Examples 

329 -------- 

330 Finding similar nodes in different graphs: 

331 

332 >>> G1 = nx.Graph() 

333 >>> G1.add_edges_from([(1, 2), (2, 3), (2, 4), (3, 5), (4, 6), (5, 7), (6, 7)]) 

334 >>> G2 = nx.Graph() 

335 >>> G2.add_edges_from([(1, 3), (2, 3), (1, 6), (1, 5), (4, 6)]) 

336 >>> g1_hashes = nx.weisfeiler_lehman_subgraph_hashes( 

337 ... G1, iterations=4, digest_size=8 

338 ... ) 

339 >>> g2_hashes = nx.weisfeiler_lehman_subgraph_hashes( 

340 ... G2, iterations=4, digest_size=8 

341 ... ) 

342 

343 Even though G1 and G2 are not isomorphic (they have different numbers of edges), 

344 the hash sequence of depth 3 for node 1 in G1 and node 5 in G2 are similar: 

345 

346 >>> g1_hashes[1] 

347 ['f6fc42039fba3776', 'a93b64973cfc8897', 'db1b43ae35a1878f', '57872a7d2059c1c0'] 

348 >>> g2_hashes[5] 

349 ['f6fc42039fba3776', 'a93b64973cfc8897', 'db1b43ae35a1878f', '1716d2a4012fa4bc'] 

350 

351 The first 3 WL subgraph hashes match. From this we can conclude that it's very 

352 likely the neighborhood of 3 hops around these nodes are isomorphic. 

353 

354 However the 4-hop neighborhoods of ``G1`` and ``G2`` are not isomorphic since the 

355 4th hashes in the lists above are not equal. 

356 

357 These nodes may be candidates to be classified together since their local topology 

358 is similar. 

359 

360 Notes 

361 ----- 

362 To hash the full graph when subgraph hashes are not needed, use 

363 `weisfeiler_lehman_graph_hash` for efficiency. 

364 

365 Similarity between hashes does not imply similarity between graphs. 

366 

367 References 

368 ---------- 

369 .. [1] Shervashidze, Nino, Pascal Schweitzer, Erik Jan Van Leeuwen, 

370 Kurt Mehlhorn, and Karsten M. Borgwardt. Weisfeiler Lehman 

371 Graph Kernels. Journal of Machine Learning Research. 2011. 

372 http://www.jmlr.org/papers/volume12/shervashidze11a/shervashidze11a.pdf 

373 .. [2] Annamalai Narayanan, Mahinthan Chandramohan, Rajasekar Venkatesan, 

374 Lihui Chen, Yang Liu and Shantanu Jaiswa. graph2vec: Learning 

375 Distributed Representations of Graphs. arXiv. 2017 

376 https://arxiv.org/pdf/1707.05005.pdf 

377 

378 See also 

379 -------- 

380 weisfeiler_lehman_graph_hash 

381 """ 

382 

383 if G.is_directed(): 

384 _neighborhood_aggregate = _neighborhood_aggregate_directed 

385 warnings.warn( 

386 "The hashes produced for directed graphs changed in v3.5" 

387 " due to a bugfix (see documentation).", 

388 UserWarning, 

389 stacklevel=2, 

390 ) 

391 else: 

392 _neighborhood_aggregate = _neighborhood_aggregate_undirected 

393 

394 def weisfeiler_lehman_step(G, labels, node_subgraph_hashes, edge_attr=None): 

395 """ 

396 Apply neighborhood aggregation to each node 

397 in the graph. 

398 Computes a dictionary with labels for each node. 

399 Appends the new hashed label to the dictionary of subgraph hashes 

400 originating from and indexed by each node in G 

401 """ 

402 new_labels = {} 

403 for node in G.nodes(): 

404 label = _neighborhood_aggregate(G, node, labels, edge_attr=edge_attr) 

405 hashed_label = _hash_label(label, digest_size) 

406 new_labels[node] = hashed_label 

407 node_subgraph_hashes[node].append(hashed_label) 

408 return new_labels 

409 

410 if iterations <= 0: 

411 raise ValueError("The WL algorithm requires that `iterations` be positive") 

412 

413 node_labels = _init_node_labels(G, edge_attr, node_attr) 

414 

415 if include_initial_labels: 

416 node_subgraph_hashes = { 

417 k: [_hash_label(v, digest_size)] for k, v in node_labels.items() 

418 } 

419 else: 

420 node_subgraph_hashes = defaultdict(list) 

421 

422 # If the graph has no attributes, initial labels are the nodes' degrees. 

423 # This is equivalent to doing the first iterations of WL. 

424 if not edge_attr and not node_attr: 

425 iterations -= 1 

426 for node in G.nodes(): 

427 hashed_label = _hash_label(node_labels[node], digest_size) 

428 node_subgraph_hashes[node].append(hashed_label) 

429 

430 for _ in range(iterations): 

431 node_labels = weisfeiler_lehman_step( 

432 G, node_labels, node_subgraph_hashes, edge_attr 

433 ) 

434 

435 return dict(node_subgraph_hashes)