1"""Functions for detecting communities based on modularity."""
2
3import random
4from collections import defaultdict
5from copy import deepcopy
6
7import networkx as nx
8from networkx.algorithms.community.quality import modularity
9from networkx.utils.mapped_queue import MappedQueue
10
11__all__ = [
12 "greedy_modularity_communities",
13 "naive_greedy_modularity_communities",
14]
15
16
def _greedy_modularity_communities_generator(G, weight=None, resolution=1):
    r"""Yield community partitions of G and the modularity change at each step.

    This function performs Clauset-Newman-Moore greedy modularity maximization [2]_
    At each step of the process it yields the change in modularity that will occur in
    the next step followed by yielding the new community partition after that step.

    Greedy modularity maximization begins with each node in its own community
    and repeatedly joins the pair of communities that lead to the largest
    modularity until one community contains all nodes (the partition has one set).

    This function maximizes the generalized modularity, where `resolution`
    is the resolution parameter, often expressed as $\gamma$.
    See :func:`~networkx.algorithms.community.quality.modularity`.

    Parameters
    ----------
    G : NetworkX graph

    weight : string or None, optional (default=None)
        The name of an edge attribute that holds the numerical value used
        as a weight.  If None, then each edge has weight 1.
        The degree is the sum of the edge weights adjacent to the node.

    resolution : float (default=1)
        If resolution is less than 1, modularity favors larger communities.
        Greater than 1 favors smaller communities.

    Yields
    ------
    Alternating yield statements produce the following two objects:

    communities: dict_values
        A dict_values of frozensets of nodes, one for each community.
        This represents a partition of the nodes of the graph into communities.
        The first yield is the partition with each node in its own community.

    dq: float
        The change in modularity when merging the next two communities
        that leads to the largest modularity.

    See Also
    --------
    modularity

    References
    ----------
    .. [1] Newman, M. E. J. "Networks: An Introduction", page 224
       Oxford University Press 2011.
    .. [2] Clauset, A., Newman, M. E., & Moore, C.
       "Finding community structure in very large networks."
       Physical Review E 70(6), 2004.
    .. [3] Reichardt and Bornholdt "Statistical Mechanics of Community
       Detection" Phys. Rev. E74, 2006.
    .. [4] Newman, M. E. J."Analysis of weighted networks"
       Physical Review E 70(5 Pt 2):056131, 2004.
    """
    directed = G.is_directed()
    # NOTE(review): N is assigned but never read below; kept as-is.
    N = G.number_of_nodes()

    # Count edges (or the sum of edge-weights for weighted graphs)
    m = G.size(weight)
    # Normalization constant 1/m. Raises ZeroDivisionError when G has no
    # edges; callers are expected to guard against edgeless graphs.
    q0 = 1 / m

    # Calculate degrees (notation from the papers)
    # a : the fraction of (weighted) out-degree for each node
    # b : the fraction of (weighted) in-degree for each node
    if directed:
        a = {node: deg_out * q0 for node, deg_out in G.out_degree(weight=weight)}
        b = {node: deg_in * q0 for node, deg_in in G.in_degree(weight=weight)}
    else:
        # Undirected: each edge contributes to two node degrees, hence the 0.5
        # so that a (== b) sums to 1 over all nodes.
        a = b = {node: deg * q0 * 0.5 for node, deg in G.degree(weight=weight)}

    # this preliminary step collects the edge weights for each node pair
    # It handles multigraph and digraph and works fine for graph.
    # Self-loops are skipped: merging a community with itself is meaningless.
    dq_dict = defaultdict(lambda: defaultdict(float))
    for u, v, wt in G.edges(data=weight, default=1):
        if u == v:
            continue
        dq_dict[u][v] += wt
        dq_dict[v][u] += wt

    # now scale and subtract the expected edge-weights term
    # dq_dict[u][v] is the modularity gain of merging communities u and v.
    for u, nbrdict in dq_dict.items():
        for v, wt in nbrdict.items():
            dq_dict[u][v] = q0 * wt - resolution * (a[u] * b[v] + b[u] * a[v])

    # Use -dq to get a max_heap instead of a min_heap
    # dq_heap holds a heap for each node's neighbors
    dq_heap = {u: MappedQueue({(u, v): -dq for v, dq in dq_dict[u].items()}) for u in G}
    # H -> all_dq_heap holds a heap with the best items for each node
    # Invariant: H contains exactly each non-empty row's current best entry.
    H = MappedQueue([dq_heap[n].heap[0] for n in G if len(dq_heap[n]) > 0])

    # Initialize single-node communities
    communities = {n: frozenset([n]) for n in G}
    yield communities.values()

    # Merge the two communities that lead to the largest modularity
    # Each iteration merges community u into community v, where (u, v) gives
    # the globally best (largest) dq, then repairs both heap structures.
    while len(H) > 1:
        # Find best merge
        # Remove from heap of row maxes
        # Ties will be broken by choosing the pair with lowest min community id
        try:
            negdq, u, v = H.pop()
        except IndexError:
            break
        dq = -negdq
        yield dq
        # Remove best merge from row u heap
        dq_heap[u].pop()
        # Push new row max onto H
        if len(dq_heap[u]) > 0:
            H.push(dq_heap[u].heap[0])
        # If this element was also at the root of row v, we need to remove the
        # duplicate entry from H
        if dq_heap[v].heap[0] == (v, u):
            H.remove((v, u))
            # Remove best merge from row v heap
            dq_heap[v].remove((v, u))
            # Push new row max onto H
            if len(dq_heap[v]) > 0:
                H.push(dq_heap[v].heap[0])
        else:
            # Duplicate wasn't in H, just remove from row v heap
            dq_heap[v].remove((v, u))

        # Perform merge: u's members join v; row/column u are purged below.
        communities[v] = frozenset(communities[u] | communities[v])
        del communities[u]

        # Get neighbor communities connected to the merged communities
        u_nbrs = set(dq_dict[u])
        v_nbrs = set(dq_dict[v])
        all_nbrs = (u_nbrs | v_nbrs) - {u, v}
        both_nbrs = u_nbrs & v_nbrs
        # Update dq for merge of u into v
        for w in all_nbrs:
            # Calculate new dq value
            if w in both_nbrs:
                dq_vw = dq_dict[v][w] + dq_dict[u][w]
            elif w in v_nbrs:
                dq_vw = dq_dict[v][w] - resolution * (a[u] * b[w] + a[w] * b[u])
            else:  # w in u_nbrs
                dq_vw = dq_dict[u][w] - resolution * (a[v] * b[w] + a[w] * b[v])
            # Update rows v and w
            for row, col in [(v, w), (w, v)]:
                dq_heap_row = dq_heap[row]
                # Update dict for v,w only (u is removed below)
                dq_dict[row][col] = dq_vw
                # Save old max of per-row heap
                if len(dq_heap_row) > 0:
                    d_oldmax = dq_heap_row.heap[0]
                else:
                    d_oldmax = None
                # Add/update heaps
                d = (row, col)
                d_negdq = -dq_vw
                # Save old value for finding heap index
                # (v, w)/(w, v) already exist in the heaps iff w was a
                # neighbor of v before the merge.
                if w in v_nbrs:
                    # Update existing element in per-row heap
                    dq_heap_row.update(d, d, priority=d_negdq)
                else:
                    # We're creating a new nonzero element, add to heap
                    dq_heap_row.push(d, priority=d_negdq)
                # Update heap of row maxes if necessary
                if d_oldmax is None:
                    # No entries previously in this row, push new max
                    H.push(d, priority=d_negdq)
                else:
                    # We've updated an entry in this row, has the max changed?
                    row_max = dq_heap_row.heap[0]
                    if d_oldmax != row_max or d_oldmax.priority != row_max.priority:
                        H.update(d_oldmax, row_max)

        # Remove row/col u from dq_dict matrix
        for w in dq_dict[u]:
            # Remove from dict
            dq_old = dq_dict[w][u]
            del dq_dict[w][u]
            # Remove from heaps if we haven't already
            if w != v:
                # Remove both row and column
                for row, col in [(w, u), (u, w)]:
                    dq_heap_row = dq_heap[row]
                    # Check if replaced dq is row max
                    d_old = (row, col)
                    if dq_heap_row.heap[0] == d_old:
                        # Update per-row heap and heap of row maxes
                        dq_heap_row.remove(d_old)
                        H.remove(d_old)
                        # Update row max
                        if len(dq_heap_row) > 0:
                            H.push(dq_heap_row.heap[0])
                    else:
                        # Only update per-row heap
                        dq_heap_row.remove(d_old)

        del dq_dict[u]
        # Mark row u as deleted, but keep placeholder
        dq_heap[u] = MappedQueue()
        # Merge u into v and update a
        a[v] += a[u]
        a[u] = 0
        if directed:
            b[v] += b[u]
            b[u] = 0

        yield communities.values()
225
226
@nx._dispatchable(edge_attrs="weight")
def greedy_modularity_communities(
    G,
    weight=None,
    resolution=1,
    cutoff=1,
    best_n=None,
):
    r"""Find communities in G using greedy modularity maximization.

    This function uses Clauset-Newman-Moore greedy modularity maximization [2]_
    to find the community partition with the largest modularity.

    Greedy modularity maximization begins with each node in its own community
    and repeatedly joins the pair of communities that lead to the largest
    modularity until no further increase in modularity is possible (a maximum).
    Two keyword arguments adjust the stopping condition. `cutoff` is a lower
    limit on the number of communities so you can stop the process before
    reaching a maximum (used to save computation time). `best_n` is an upper
    limit on the number of communities so you can make the process continue
    until at most n communities remain even if the maximum modularity occurs
    for more. To obtain exactly n communities, set both `cutoff` and `best_n` to n.

    This function maximizes the generalized modularity, where `resolution`
    is the resolution parameter, often expressed as $\gamma$.
    See :func:`~networkx.algorithms.community.quality.modularity`.

    Parameters
    ----------
    G : NetworkX graph

    weight : string or None, optional (default=None)
        The name of an edge attribute that holds the numerical value used
        as a weight.  If None, then each edge has weight 1.
        The degree is the sum of the edge weights adjacent to the node.

    resolution : float, optional (default=1)
        If resolution is less than 1, modularity favors larger communities.
        Greater than 1 favors smaller communities.

    cutoff : int, optional (default=1)
        A minimum number of communities below which the merging process stops.
        The process stops at this number of communities even if modularity
        is not maximized. The goal is to let the user stop the process early.
        The process stops before the cutoff if it finds a maximum of modularity.

    best_n : int or None, optional (default=None)
        A maximum number of communities above which the merging process will
        not stop. This forces community merging to continue after modularity
        starts to decrease until `best_n` communities remain.
        If ``None``, don't force it to continue beyond a maximum.

    Raises
    ------
    ValueError : If the `cutoff` or `best_n` value is not in the range
        ``[1, G.number_of_nodes()]``, or if `best_n` < `cutoff`.

    Returns
    -------
    communities: list
        A list of frozensets of nodes, one for each community.
        Sorted by length with largest communities first.

    Examples
    --------
    >>> G = nx.karate_club_graph()
    >>> c = nx.community.greedy_modularity_communities(G)
    >>> sorted(c[0])
    [8, 14, 15, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]

    See Also
    --------
    modularity

    References
    ----------
    .. [1] Newman, M. E. J. "Networks: An Introduction", page 224
       Oxford University Press 2011.
    .. [2] Clauset, A., Newman, M. E., & Moore, C.
       "Finding community structure in very large networks."
       Physical Review E 70(6), 2004.
    .. [3] Reichardt and Bornholdt "Statistical Mechanics of Community
       Detection" Phys. Rev. E74, 2006.
    .. [4] Newman, M. E. J."Analysis of weighted networks"
       Physical Review E 70(5 Pt 2):056131, 2004.
    """
    if G.number_of_nodes() == 0:
        # Null graph: the only partition is the empty one.
        return []

    # Validate arguments BEFORE the no-edge early return below, so that
    # out-of-range cutoff/best_n raise ValueError as documented even when
    # the graph has no edges.
    if (cutoff < 1) or (cutoff > G.number_of_nodes()):
        raise ValueError(f"cutoff must be between 1 and {len(G)}. Got {cutoff}.")
    if best_n is not None:
        if (best_n < 1) or (best_n > G.number_of_nodes()):
            raise ValueError(f"best_n must be between 1 and {len(G)}. Got {best_n}.")
        if best_n < cutoff:
            raise ValueError(f"Must have best_n >= cutoff. Got {best_n} < {cutoff}")
        if best_n == 1:
            return [set(G)]
    else:
        best_n = G.number_of_nodes()

    if not G.size():
        # No edges: the generator below would divide by zero (q0 = 1/m),
        # so return the singleton partition directly.
        # NOTE(review): best_n > 1 is not enforced here (matches upstream).
        return [{n} for n in G]

    # retrieve generator object to construct output
    community_gen = _greedy_modularity_communities_generator(
        G, weight=weight, resolution=resolution
    )

    # construct the first best community
    communities = next(community_gen)

    # continue merging communities until one of the breaking criteria is satisfied
    while len(communities) > cutoff:
        try:
            dq = next(community_gen)
        # StopIteration occurs when communities are the connected components
        except StopIteration:
            communities = sorted(communities, key=len, reverse=True)
            # if best_n requires more merging, merge big sets for highest modularity
            while len(communities) > best_n:
                comm1, comm2, *rest = communities
                # Communities are disjoint, so union merges the two largest.
                communities = [comm1 | comm2]
                communities.extend(rest)
            return communities

        # keep going unless max_mod is reached or best_n says to merge more
        if dq < 0 and len(communities) <= best_n:
            break
        communities = next(community_gen)

    return sorted(communities, key=len, reverse=True)
356
357
@nx.utils.not_implemented_for("directed")
@nx.utils.not_implemented_for("multigraph")
@nx._dispatchable(edge_attrs="weight")
def naive_greedy_modularity_communities(G, resolution=1, weight=None):
    r"""Find communities in G using greedy modularity maximization.

    This is an O(n^4) reference implementation, far slower than
    :func:`greedy_modularity_communities` but easy to follow.

    Starting from the singleton partition (every node alone), it repeatedly
    merges the pair of communities whose union gives the largest modularity,
    stopping as soon as no merge increases modularity.

    The generalized modularity with resolution parameter $\gamma$ is
    maximized; see
    :func:`~networkx.algorithms.community.quality.modularity`.

    Parameters
    ----------
    G : NetworkX graph
        Graph must be simple and undirected.

    resolution : float (default=1)
        If resolution is less than 1, modularity favors larger communities.
        Greater than 1 favors smaller communities.

    weight : string or None, optional (default=None)
        The name of an edge attribute that holds the numerical value used
        as a weight.  If None, then each edge has weight 1.
        The degree is the sum of the edge weights adjacent to the node.

    Returns
    -------
    list
        A list of sets of nodes, one for each community.
        Sorted by length with largest communities first.

    Examples
    --------
    >>> G = nx.karate_club_graph()
    >>> c = nx.community.naive_greedy_modularity_communities(G)
    >>> sorted(c[0])
    [8, 14, 15, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]

    See Also
    --------
    greedy_modularity_communities
    modularity
    """
    # Singleton partition: one frozenset per node. Merged-away entries are
    # replaced by empty frozensets so indices stay stable.
    communities = [frozenset([node]) for node in G.nodes()]
    # History of accepted merges as (i, j, dq) tuples.
    merges = []

    prev_q = None
    best_q = modularity(G, communities, resolution=resolution, weight=weight)
    # Keep merging while the last accepted merge improved modularity.
    while prev_q is None or best_q > prev_q:
        prev_q = best_q
        best_pair = None
        n_comms = len(communities)
        for i in range(n_comms):
            first = communities[i]
            if not first:
                # Already merged away; nothing to pair with.
                continue
            for j in range(i + 1, n_comms):
                second = communities[j]
                if not second:
                    continue
                # Score the partition with communities i and j merged.
                trial = list(communities)
                trial[j] = first | second
                trial[i] = frozenset()
                trial_q = modularity(G, trial, resolution=resolution, weight=weight)
                if trial_q > best_q:
                    # Strictly better: new incumbent.
                    best_q = trial_q
                    best_pair = (i, j, best_q - prev_q)
                elif (
                    trial_q == best_q
                    and best_pair
                    and min(i, j) < min(best_pair[0], best_pair[1])
                ):
                    # Tie with the incumbent: prefer the lowest min index.
                    best_q = trial_q
                    best_pair = (i, j, best_q - prev_q)
        if best_pair is not None:
            # Commit the best merge found this round.
            merges.append(best_pair)
            i, j, _dq = best_pair
            communities[j] = communities[i] | communities[j]
            communities[i] = frozenset()
    # Drop placeholder empties; largest communities first.
    return sorted((c for c in communities if c), key=len, reverse=True)