1"""
2Label propagation community detection algorithms.
3"""
4
5from collections import Counter, defaultdict, deque
6
7import networkx as nx
8from networkx.utils import groups, not_implemented_for, py_random_state
9
10__all__ = [
11 "label_propagation_communities",
12 "asyn_lpa_communities",
13 "fast_label_propagation_communities",
14]
15
16
17@py_random_state("seed")
18@nx._dispatchable(edge_attrs="weight")
19def fast_label_propagation_communities(G, *, weight=None, seed=None):
20 """Returns communities in `G` as detected by fast label propagation.
21
22 The fast label propagation algorithm is described in [1]_. The algorithm is
23 probabilistic and the found communities may vary in different executions.
24
25 The algorithm operates as follows. First, the community label of each node is
26 set to a unique label. The algorithm then repeatedly updates the labels of
27 the nodes to the most frequent label in their neighborhood. In case of ties,
28 a random label is chosen from the most frequent labels.
29
30 The algorithm maintains a queue of nodes that still need to be processed.
31 Initially, all nodes are added to the queue in a random order. Then the nodes
32 are removed from the queue one by one and processed. If a node updates its label,
33 all its neighbors that have a different label are added to the queue (if not
34 already in the queue). The algorithm stops when the queue is empty.
35
36 Parameters
37 ----------
38 G : Graph, DiGraph, MultiGraph, or MultiDiGraph
39 Any NetworkX graph.
40
41 weight : string, or None (default)
42 The edge attribute representing a non-negative weight of an edge. If None,
43 each edge is assumed to have weight one. The weight of an edge is used in
44 determining the frequency with which a label appears among the neighbors of
45 a node (edge with weight `w` is equivalent to `w` unweighted edges).
46
47 seed : integer, random_state, or None (default)
48 Indicator of random number generation state. See :ref:`Randomness<randomness>`.
49
50 Returns
51 -------
52 communities : iterable
53 Iterable of communities given as sets of nodes.
54
55 Notes
56 -----
57 Edge directions are ignored for directed graphs.
58 Edge weights must be non-negative numbers.
59
60 References
61 ----------
62 .. [1] Vincent A. Traag & Lovro Šubelj. "Large network community detection by
63 fast label propagation." Scientific Reports 13 (2023): 2701.
64 https://doi.org/10.1038/s41598-023-29610-z
65 """
66
67 # Queue of nodes to be processed.
68 nodes_queue = deque(G)
69 seed.shuffle(nodes_queue)
70
71 # Set of nodes in the queue.
72 nodes_set = set(G)
73
74 # Assign unique label to each node.
75 comms = {node: i for i, node in enumerate(G)}
76
77 while nodes_queue:
78 # Remove next node from the queue to process.
79 node = nodes_queue.popleft()
80 nodes_set.remove(node)
81
82 # Isolated nodes retain their initial label.
83 if G.degree(node) > 0:
84 # Compute frequency of labels in node's neighborhood.
85 label_freqs = _fast_label_count(G, comms, node, weight)
86 max_freq = max(label_freqs.values())
87
88 # Always sample new label from most frequent labels.
89 comm = seed.choice(
90 [comm for comm in label_freqs if label_freqs[comm] == max_freq]
91 )
92
93 if comms[node] != comm:
94 comms[node] = comm
95
96 # Add neighbors that have different label to the queue.
97 for nbr in nx.all_neighbors(G, node):
98 if comms[nbr] != comm and nbr not in nodes_set:
99 nodes_queue.append(nbr)
100 nodes_set.add(nbr)
101
102 yield from groups(comms).values()
103
104
105def _fast_label_count(G, comms, node, weight=None):
106 """Computes the frequency of labels in the neighborhood of a node.
107
108 Returns a dictionary keyed by label to the frequency of that label.
109 """
110
111 if weight is None:
112 # Unweighted (un)directed simple graph.
113 if not G.is_multigraph():
114 label_freqs = Counter(map(comms.get, nx.all_neighbors(G, node)))
115
116 # Unweighted (un)directed multigraph.
117 else:
118 label_freqs = defaultdict(int)
119 for nbr in G[node]:
120 label_freqs[comms[nbr]] += len(G[node][nbr])
121
122 if G.is_directed():
123 for nbr in G.pred[node]:
124 label_freqs[comms[nbr]] += len(G.pred[node][nbr])
125
126 else:
127 # Weighted undirected simple/multigraph.
128 label_freqs = defaultdict(float)
129 for _, nbr, w in G.edges(node, data=weight, default=1):
130 label_freqs[comms[nbr]] += w
131
132 # Weighted directed simple/multigraph.
133 if G.is_directed():
134 for nbr, _, w in G.in_edges(node, data=weight, default=1):
135 label_freqs[comms[nbr]] += w
136
137 return label_freqs
138
139
140@py_random_state(2)
141@nx._dispatchable(edge_attrs="weight")
142def asyn_lpa_communities(G, weight=None, seed=None):
143 """Returns communities in `G` as detected by asynchronous label
144 propagation.
145
146 The asynchronous label propagation algorithm is described in
147 [1]_. The algorithm is probabilistic and the found communities may
148 vary on different executions.
149
150 The algorithm proceeds as follows. After initializing each node with
151 a unique label, the algorithm repeatedly sets the label of a node to
152 be the label that appears most frequently among that nodes
153 neighbors. The algorithm halts when each node has the label that
154 appears most frequently among its neighbors. The algorithm is
155 asynchronous because each node is updated without waiting for
156 updates on the remaining nodes.
157
158 This generalized version of the algorithm in [1]_ accepts edge
159 weights.
160
161 Parameters
162 ----------
163 G : Graph
164
165 weight : string
166 The edge attribute representing the weight of an edge.
167 If None, each edge is assumed to have weight one. In this
168 algorithm, the weight of an edge is used in determining the
169 frequency with which a label appears among the neighbors of a
170 node: a higher weight means the label appears more often.
171
172 seed : integer, random_state, or None (default)
173 Indicator of random number generation state.
174 See :ref:`Randomness<randomness>`.
175
176 Returns
177 -------
178 communities : iterable
179 Iterable of communities given as sets of nodes.
180
181 Notes
182 -----
183 Edge weight attributes must be numerical.
184
185 References
186 ----------
187 .. [1] Raghavan, Usha Nandini, Réka Albert, and Soundar Kumara. "Near
188 linear time algorithm to detect community structures in large-scale
189 networks." Physical Review E 76.3 (2007): 036106.
190 """
191
192 labels = {n: i for i, n in enumerate(G)}
193 cont = True
194
195 while cont:
196 cont = False
197 nodes = list(G)
198 seed.shuffle(nodes)
199
200 for node in nodes:
201 if not G[node]:
202 continue
203
204 # Get label frequencies among adjacent nodes.
205 # Depending on the order they are processed in,
206 # some nodes will be in iteration t and others in t-1,
207 # making the algorithm asynchronous.
208 if weight is None:
209 # initialising a Counter from an iterator of labels is
210 # faster for getting unweighted label frequencies
211 label_freq = Counter(map(labels.get, G[node]))
212 else:
213 # updating a defaultdict is substantially faster
214 # for getting weighted label frequencies
215 label_freq = defaultdict(float)
216 for _, v, wt in G.edges(node, data=weight, default=1):
217 label_freq[labels[v]] += wt
218
219 # Get the labels that appear with maximum frequency.
220 max_freq = max(label_freq.values())
221 best_labels = [
222 label for label, freq in label_freq.items() if freq == max_freq
223 ]
224
225 # If the node does not have one of the maximum frequency labels,
226 # randomly choose one of them and update the node's label.
227 # Continue the iteration as long as at least one node
228 # doesn't have a maximum frequency label.
229 if labels[node] not in best_labels:
230 labels[node] = seed.choice(best_labels)
231 cont = True
232
233 yield from groups(labels).values()
234
235
236@not_implemented_for("directed")
237@nx._dispatchable
238def label_propagation_communities(G):
239 """Generates community sets determined by label propagation
240
241 Finds communities in `G` using a semi-synchronous label propagation
242 method [1]_. This method combines the advantages of both the synchronous
243 and asynchronous models. Not implemented for directed graphs.
244
245 Parameters
246 ----------
247 G : graph
248 An undirected NetworkX graph.
249
250 Returns
251 -------
252 communities : iterable
253 A dict_values object that contains a set of nodes for each community.
254
255 Raises
256 ------
257 NetworkXNotImplemented
258 If the graph is directed
259
260 References
261 ----------
262 .. [1] Cordasco, G., & Gargano, L. (2010, December). Community detection
263 via semi-synchronous label propagation algorithms. In Business
264 Applications of Social Network Analysis (BASNA), 2010 IEEE International
265 Workshop on (pp. 1-8). IEEE.
266 """
267 coloring = _color_network(G)
268 # Create a unique label for each node in the graph
269 labeling = {v: k for k, v in enumerate(G)}
270 while not _labeling_complete(labeling, G):
271 # Update the labels of every node with the same color.
272 for color, nodes in coloring.items():
273 for n in nodes:
274 _update_label(n, labeling, G)
275
276 clusters = defaultdict(set)
277 for node, label in labeling.items():
278 clusters[label].add(node)
279 return clusters.values()
280
281
282def _color_network(G):
283 """Colors the network so that neighboring nodes all have distinct colors.
284
285 Returns a dict keyed by color to a set of nodes with that color.
286 """
287 coloring = {} # color => set(node)
288 colors = nx.coloring.greedy_color(G)
289 for node, color in colors.items():
290 if color in coloring:
291 coloring[color].add(node)
292 else:
293 coloring[color] = {node}
294 return coloring
295
296
297def _labeling_complete(labeling, G):
298 """Determines whether or not LPA is done.
299
300 Label propagation is complete when all nodes have a label that is
301 in the set of highest frequency labels amongst its neighbors.
302
303 Nodes with no neighbors are considered complete.
304 """
305 return all(
306 labeling[v] in _most_frequent_labels(v, labeling, G) for v in G if len(G[v]) > 0
307 )
308
309
310def _most_frequent_labels(node, labeling, G):
311 """Returns a set of all labels with maximum frequency in `labeling`.
312
313 Input `labeling` should be a dict keyed by node to labels.
314 """
315 if not G[node]:
316 # Nodes with no neighbors are themselves a community and are labeled
317 # accordingly, hence the immediate if statement.
318 return {labeling[node]}
319
320 # Compute the frequencies of all neighbors of node
321 freqs = Counter(labeling[q] for q in G[node])
322 max_freq = max(freqs.values())
323 return {label for label, freq in freqs.items() if freq == max_freq}
324
325
326def _update_label(node, labeling, G):
327 """Updates the label of a node using the Prec-Max tie breaking algorithm
328
329 The algorithm is explained in: 'Community Detection via Semi-Synchronous
330 Label Propagation Algorithms' Cordasco and Gargano, 2011
331 """
332 high_labels = _most_frequent_labels(node, labeling, G)
333 if len(high_labels) == 1:
334 labeling[node] = high_labels.pop()
335 elif len(high_labels) > 1:
336 # Prec-Max
337 if labeling[node] not in high_labels:
338 labeling[node] = max(high_labels)