# Coverage report: /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/networkx/algorithms/graph_hashing.py
# 23% of 48 statements covered (coverage.py v7.3.2, created at 2023-10-20 07:00 +0000)

1""" 

2Functions for hashing graphs to strings. 

3Isomorphic graphs should be assigned identical hashes. 

4For now, only Weisfeiler-Lehman hashing is implemented. 

5""" 

6 

7from collections import Counter, defaultdict 

8from hashlib import blake2b 

9 

10import networkx as nx 

11 

12__all__ = ["weisfeiler_lehman_graph_hash", "weisfeiler_lehman_subgraph_hashes"] 

13 

14 

def _hash_label(label, digest_size):
    return blake2b(label.encode("ascii"), digest_size=digest_size).hexdigest()


def _init_node_labels(G, edge_attr, node_attr):
    if node_attr:
        return {u: str(dd[node_attr]) for u, dd in G.nodes(data=True)}
    elif edge_attr:
        return {u: "" for u in G}
    else:
        return {u: str(deg) for u, deg in G.degree()}


def _neighborhood_aggregate(G, node, node_labels, edge_attr=None):
    """
    Compute a new label for the given node by aggregating
    the labels of the node's neighbors.
    """
    label_list = []
    for nbr in G.neighbors(node):
        prefix = "" if edge_attr is None else str(G[node][nbr][edge_attr])
        label_list.append(prefix + node_labels[nbr])
    return node_labels[node] + "".join(sorted(label_list))

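
# Editorial sketch (not part of networkx): the hypothetical helper below shows
# how the two private helpers above combine on a small path graph.
def _demo_neighborhood_aggregate():  # pragma: no cover - example only
    G = nx.path_graph(3)  # edges 0-1 and 1-2
    labels = _init_node_labels(G, None, None)
    # With no attributes, degrees become the initial labels.
    assert labels == {0: "1", 1: "2", 2: "1"}
    # Node 1's own label "2" is followed by its sorted neighbour labels "1", "1".
    assert _neighborhood_aggregate(G, 1, labels) == "211"
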

@nx._dispatch(edge_attrs={"edge_attr": None}, node_attrs="node_attr")
def weisfeiler_lehman_graph_hash(
    G, edge_attr=None, node_attr=None, iterations=3, digest_size=16
):
    """Return Weisfeiler Lehman (WL) graph hash.

    The function iteratively aggregates and hashes neighbourhoods of each node.
    After each node's neighbors are hashed to obtain updated node labels,
    a hashed histogram of the resulting labels is returned as the final hash.

    Hashes are identical for isomorphic graphs and there exist strong guarantees
    that non-isomorphic graphs will get different hashes. See [1]_ for details.

    If no node or edge attributes are provided, the degree of each node
    is used as its initial label.
    Otherwise, node and/or edge labels are used to compute the hash.


    Parameters
    ----------
    G: graph
        The graph to be hashed.
        Can have node and/or edge attributes. Can also have no attributes.
    edge_attr: string, default=None
        The key in edge attribute dictionary to be used for hashing.
        If None, edge labels are ignored.
    node_attr: string, default=None
        The key in node attribute dictionary to be used for hashing.
        If None, and no edge_attr given, use the degrees of the nodes as labels.
    iterations: int, default=3
        Number of neighbor aggregations to perform.
        Should be larger for larger graphs.
    digest_size: int, default=16
        Size (in bytes) of blake2b hash digest to use for hashing node labels.


    Returns
    -------
    h : string
        Hexadecimal string corresponding to hash of the input graph.

    Examples
    --------
    Two graphs with edge attributes that are isomorphic, except for
    differences in the edge labels.

    >>> G1 = nx.Graph()
    >>> G1.add_edges_from(
    ...     [
    ...         (1, 2, {"label": "A"}),
    ...         (2, 3, {"label": "A"}),
    ...         (3, 1, {"label": "A"}),
    ...         (1, 4, {"label": "B"}),
    ...     ]
    ... )
    >>> G2 = nx.Graph()
    >>> G2.add_edges_from(
    ...     [
    ...         (5, 6, {"label": "B"}),
    ...         (6, 7, {"label": "A"}),
    ...         (7, 5, {"label": "A"}),
    ...         (7, 8, {"label": "A"}),
    ...     ]
    ... )


    Omitting the `edge_attr` option results in identical hashes.

    >>> nx.weisfeiler_lehman_graph_hash(G1)
    '7bc4dde9a09d0b94c5097b219891d81a'
    >>> nx.weisfeiler_lehman_graph_hash(G2)
    '7bc4dde9a09d0b94c5097b219891d81a'

    With edge labels, the graphs are no longer assigned
    the same hash digest.

    >>> nx.weisfeiler_lehman_graph_hash(G1, edge_attr="label")
    'c653d85538bcf041d88c011f4f905f10'
    >>> nx.weisfeiler_lehman_graph_hash(G2, edge_attr="label")
    '3dcd84af1ca855d0eff3c978d88e7ec7'
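
    As a further illustration (``G1_relabelled`` and ``h1`` are names introduced
    here, not part of the original example), relabelling ``G1`` gives an
    isomorphic graph and therefore the same digest:

    >>> G1_relabelled = nx.relabel_nodes(G1, {1: "a", 2: "b", 3: "c", 4: "d"})
    >>> h1 = nx.weisfeiler_lehman_graph_hash(G1)
    >>> h1 == nx.weisfeiler_lehman_graph_hash(G1_relabelled)
    True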


    Notes
    -----
    To return the WL hashes of each subgraph of a graph, use
    `weisfeiler_lehman_subgraph_hashes`.

    Similarity between hashes does not imply similarity between graphs.

    References
    ----------
    .. [1] Shervashidze, Nino, Pascal Schweitzer, Erik Jan Van Leeuwen,
       Kurt Mehlhorn, and Karsten M. Borgwardt. Weisfeiler Lehman
       Graph Kernels. Journal of Machine Learning Research. 2011.
       http://www.jmlr.org/papers/volume12/shervashidze11a/shervashidze11a.pdf

    See also
    --------
    weisfeiler_lehman_subgraph_hashes
    """


    def weisfeiler_lehman_step(G, labels, edge_attr=None):
        """
        Apply neighborhood aggregation to each node
        in the graph.
        Computes a dictionary with labels for each node.
        """
        new_labels = {}
        for node in G.nodes():
            label = _neighborhood_aggregate(G, node, labels, edge_attr=edge_attr)
            new_labels[node] = _hash_label(label, digest_size)
        return new_labels

    # set initial node labels
    node_labels = _init_node_labels(G, edge_attr, node_attr)

    subgraph_hash_counts = []
    for _ in range(iterations):
        node_labels = weisfeiler_lehman_step(G, node_labels, edge_attr=edge_attr)
        counter = Counter(node_labels.values())
        # sort the counter, extend total counts
        subgraph_hash_counts.extend(sorted(counter.items(), key=lambda x: x[0]))

    # hash the final counter
    return _hash_label(str(tuple(subgraph_hash_counts)), digest_size)


@nx._dispatch(edge_attrs={"edge_attr": None}, node_attrs="node_attr")
def weisfeiler_lehman_subgraph_hashes(
    G, edge_attr=None, node_attr=None, iterations=3, digest_size=16
):
    """
    Return a dictionary of subgraph hashes by node.

    Dictionary keys are nodes in `G`, and values are a list of hashes.
    Each hash corresponds to a subgraph rooted at a given node u in `G`.
    Lists of subgraph hashes are sorted in increasing order of depth from
    their root node, with the hash at index i corresponding to a subgraph
    of nodes at most i + 1 edges distant from u. Thus, each list contains
    ``iterations`` elements - one hash per aggregation step; the hash of the
    initial node label (the subgraph of depth 0) is not included.

    The function iteratively aggregates and hashes neighbourhoods of each node.
    This is achieved for each step by replacing for each node its label from
    the previous iteration with its hashed 1-hop neighborhood aggregate.
    The new node label is then appended to a list of node labels for each
    node.

    To aggregate neighborhoods at each step for a node $n$, all labels of
    nodes adjacent to $n$ are concatenated. If the `edge_attr` parameter is set,
    labels for each neighboring node are prefixed with the value of this attribute
    along the connecting edge from this neighbor to node $n$. The resulting string
    is then hashed to compress this information into a fixed digest size.

    Thus, at the $i$-th iteration, nodes within $i$ hops influence any given
    hashed node label. We can therefore say that at depth $i$ for node $n$
    we have a hash for a subgraph induced by the $2i$-hop neighborhood of $n$.

    The output can be used to create general Weisfeiler-Lehman graph kernels,
    or generate features for graphs or nodes - for example to generate 'words' in
    a graph as seen in the 'graph2vec' algorithm.
    See [1]_ & [2]_ respectively for details.

    Hashes are identical for isomorphic subgraphs and there exist strong
    guarantees that non-isomorphic graphs will get different hashes.
    See [1]_ for details.

    If no node or edge attributes are provided, the degree of each node
    is used as its initial label.
    Otherwise, node and/or edge labels are used to compute the hash.


    Parameters
    ----------
    G: graph
        The graph to be hashed.
        Can have node and/or edge attributes. Can also have no attributes.
    edge_attr: string, default=None
        The key in edge attribute dictionary to be used for hashing.
        If None, edge labels are ignored.
    node_attr: string, default=None
        The key in node attribute dictionary to be used for hashing.
        If None, and no edge_attr given, use the degrees of the nodes as labels.
    iterations: int, default=3
        Number of neighbor aggregations to perform.
        Should be larger for larger graphs.
    digest_size: int, default=16
        Size (in bytes) of blake2b hash digest to use for hashing node labels.
        The default size is 16 bytes.


    Returns
    -------
    node_subgraph_hashes : dict
        A dictionary with each key given by a node in G, and each value given
        by the subgraph hashes in order of depth from the key node.

    Examples
    --------
    Finding similar nodes in different graphs:

    >>> G1 = nx.Graph()
    >>> G1.add_edges_from([
    ...     (1, 2), (2, 3), (2, 4), (3, 5), (4, 6), (5, 7), (6, 7)
    ... ])
    >>> G2 = nx.Graph()
    >>> G2.add_edges_from([
    ...     (1, 3), (2, 3), (1, 6), (1, 5), (4, 6)
    ... ])
    >>> g1_hashes = nx.weisfeiler_lehman_subgraph_hashes(G1, iterations=3, digest_size=8)
    >>> g2_hashes = nx.weisfeiler_lehman_subgraph_hashes(G2, iterations=3, digest_size=8)


    Even though G1 and G2 are not isomorphic (they have different numbers of edges),
    the depth-3 hash sequences for node 1 in G1 and node 5 in G2 are similar:

    >>> g1_hashes[1]
    ['a93b64973cfc8897', 'db1b43ae35a1878f', '57872a7d2059c1c0']
    >>> g2_hashes[5]
    ['a93b64973cfc8897', 'db1b43ae35a1878f', '1716d2a4012fa4bc']
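
    Each list holds one hash per aggregation step - here three, since
    ``iterations=3`` was requested above (a check added here for illustration):

    >>> len(g1_hashes[1])
    3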


    The first 2 WL subgraph hashes match. From this we can conclude that it is
    very likely that the 4-hop neighborhoods around these nodes are isomorphic:
    each iteration aggregates 1-hop neighbourhoods, meaning hashes at depth $n$
    are influenced by every node within $2n$ hops.

    However, the 6-hop neighborhoods are no longer isomorphic, since the third
    hashes do not match.

    These nodes may be candidates to be classified together since their local
    topology is similar.


    Notes
    -----
    To hash the full graph when subgraph hashes are not needed, use
    `weisfeiler_lehman_graph_hash` for efficiency.

    Similarity between hashes does not imply similarity between graphs.

    References
    ----------
    .. [1] Shervashidze, Nino, Pascal Schweitzer, Erik Jan Van Leeuwen,
       Kurt Mehlhorn, and Karsten M. Borgwardt. Weisfeiler Lehman
       Graph Kernels. Journal of Machine Learning Research. 2011.
       http://www.jmlr.org/papers/volume12/shervashidze11a/shervashidze11a.pdf
    .. [2] Annamalai Narayanan, Mahinthan Chandramohan, Rajasekar Venkatesan,
       Lihui Chen, Yang Liu and Shantanu Jaiswal. graph2vec: Learning
       Distributed Representations of Graphs. arXiv. 2017.
       https://arxiv.org/pdf/1707.05005.pdf

    See also
    --------
    weisfeiler_lehman_graph_hash
    """


    def weisfeiler_lehman_step(G, labels, node_subgraph_hashes, edge_attr=None):
        """
        Apply neighborhood aggregation to each node
        in the graph.
        Computes a dictionary with labels for each node.
        Appends the new hashed label to the dictionary of subgraph hashes
        originating from and indexed by each node in G.
        """
        new_labels = {}
        for node in G.nodes():
            label = _neighborhood_aggregate(G, node, labels, edge_attr=edge_attr)
            hashed_label = _hash_label(label, digest_size)
            new_labels[node] = hashed_label
            node_subgraph_hashes[node].append(hashed_label)
        return new_labels

    node_labels = _init_node_labels(G, edge_attr, node_attr)

    node_subgraph_hashes = defaultdict(list)
    for _ in range(iterations):
        node_labels = weisfeiler_lehman_step(
            G, node_labels, node_subgraph_hashes, edge_attr
        )

    return dict(node_subgraph_hashes)
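

# Editorial sketch (not part of networkx): the two public functions are related -
# the graph hash is a blake2b digest of the sorted per-iteration histograms of
# the per-node subgraph hashes. The hypothetical helper below rebuilds the
# result of ``weisfeiler_lehman_graph_hash`` from the output of
# ``weisfeiler_lehman_subgraph_hashes`` to make that relationship concrete.
def _demo_rebuild_graph_hash_from_subgraph_hashes():  # pragma: no cover - example only
    iterations, digest_size = 3, 16
    G = nx.path_graph(4)
    per_node = weisfeiler_lehman_subgraph_hashes(
        G, iterations=iterations, digest_size=digest_size
    )
    hash_counts = []
    for i in range(iterations):
        # Histogram of all node labels produced at iteration i, sorted by label.
        counter = Counter(hashes[i] for hashes in per_node.values())
        hash_counts.extend(sorted(counter.items(), key=lambda x: x[0]))
    rebuilt = _hash_label(str(tuple(hash_counts)), digest_size)
    assert rebuilt == weisfeiler_lehman_graph_hash(
        G, iterations=iterations, digest_size=digest_size
    )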