/src/igraph/src/community/label_propagation.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | IGraph library. |
3 | | Copyright (C) 2007-2022 The igraph development team <igraph@igraph.org> |
4 | | |
5 | | This program is free software; you can redistribute it and/or modify |
6 | | it under the terms of the GNU General Public License as published by |
7 | | the Free Software Foundation; either version 2 of the License, or |
8 | | (at your option) any later version. |
9 | | |
10 | | This program is distributed in the hope that it will be useful, |
11 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | | GNU General Public License for more details. |
14 | | |
15 | | You should have received a copy of the GNU General Public License |
16 | | along with this program. If not, see <https://www.gnu.org/licenses/>. |
17 | | */ |
18 | | |
19 | | #include "igraph_community.h" |
20 | | |
21 | | #include "igraph_adjlist.h" |
22 | | #include "igraph_dqueue.h" |
23 | | #include "igraph_interface.h" |
24 | | #include "igraph_memory.h" |
25 | | #include "igraph_random.h" |
26 | | |
27 | | #include "core/interruption.h" |
28 | | |
29 | | static igraph_error_t community_label_propagation( |
30 | | const igraph_t *graph, |
31 | | igraph_vector_int_t *membership, |
32 | | igraph_neimode_t mode, |
33 | | const igraph_vector_t *weights, |
34 | | const igraph_vector_bool_t *fixed, |
35 | 6.63k | igraph_bool_t retention) { |
36 | | |
37 | 6.63k | const igraph_integer_t no_of_nodes = igraph_vcount(graph); |
38 | 6.63k | igraph_integer_t no_of_not_fixed_nodes = 0; |
39 | 6.63k | igraph_integer_t i, j, k; |
40 | 6.63k | igraph_adjlist_t al; |
41 | 6.63k | igraph_inclist_t il; |
42 | 6.63k | igraph_bool_t running, control_iteration; |
43 | | |
44 | 6.63k | igraph_vector_t label_weights; |
45 | 6.63k | igraph_vector_int_t dominant_labels, nonzero_labels, node_order; |
46 | 6.63k | igraph_neimode_t reverse_mode; |
47 | 6.63k | int iter = 0; /* interruption counter */ |
48 | | |
49 | 6.63k | reverse_mode = IGRAPH_REVERSE_MODE(mode); |
50 | | |
51 | | /* Create an adjacency/incidence list representation for efficiency. |
52 | | * For the unweighted case, the adjacency list is enough. For the |
53 | | * weighted case, we need the incidence list */ |
54 | 6.63k | if (weights) { |
55 | 6.63k | IGRAPH_CHECK(igraph_inclist_init(graph, &il, reverse_mode, IGRAPH_LOOPS_ONCE)); |
56 | 6.63k | IGRAPH_FINALLY(igraph_inclist_destroy, &il); |
57 | 6.63k | } else { |
58 | 0 | IGRAPH_CHECK(igraph_adjlist_init(graph, &al, reverse_mode, IGRAPH_LOOPS_ONCE, IGRAPH_MULTIPLE)); |
59 | 0 | IGRAPH_FINALLY(igraph_adjlist_destroy, &al); |
60 | 0 | } |
61 | | |
62 | | /* Create storage space for counting distinct labels and dominant ones */ |
63 | 6.63k | IGRAPH_VECTOR_INIT_FINALLY(&label_weights, no_of_nodes); |
64 | 6.63k | IGRAPH_VECTOR_INT_INIT_FINALLY(&dominant_labels, 0); |
65 | 6.63k | IGRAPH_VECTOR_INT_INIT_FINALLY(&nonzero_labels, 0); |
66 | 6.63k | IGRAPH_CHECK(igraph_vector_int_reserve(&dominant_labels, 2)); |
67 | | |
68 | | /* Initialize node ordering vector with only the not fixed nodes */ |
69 | 6.63k | if (fixed) { |
70 | 0 | IGRAPH_VECTOR_INT_INIT_FINALLY(&node_order, no_of_nodes); |
71 | 0 | for (i = 0; i < no_of_nodes; i++) { |
72 | 0 | if (!VECTOR(*fixed)[i]) { |
73 | 0 | VECTOR(node_order)[no_of_not_fixed_nodes] = i; |
74 | 0 | no_of_not_fixed_nodes++; |
75 | 0 | } |
76 | 0 | } |
77 | 0 | IGRAPH_CHECK(igraph_vector_int_resize(&node_order, no_of_not_fixed_nodes)); |
78 | 6.63k | } else { |
79 | 6.63k | IGRAPH_CHECK(igraph_vector_int_init_range(&node_order, 0, no_of_nodes)); |
80 | 6.63k | IGRAPH_FINALLY(igraph_vector_int_destroy, &node_order); |
81 | 6.63k | no_of_not_fixed_nodes = no_of_nodes; |
82 | 6.63k | } |
83 | | |
84 | | /* There are two modes of operation in this implementation: retention or |
85 | | * dominance. When using retention, we prefer to keep the current label of a node. |
86 | | * Only if the current label is not among the dominant labels will we |
87 | | * update the label. If a label changes, we will continue to iterate |
88 | | * over all nodes. |
89 | | * |
90 | | * When not using retention we check for dominance after each iteration. This |
91 | | * is implemented as two alternating types of iterations, one for changing |
92 | | * labels and the other one for checking the end condition - every vertex in the |
93 | | * graph has a label to which the maximum number of its neighbors belongs. If |
94 | | * control_iteration is true, we are just checking the end condition and not |
95 | | * relabeling nodes. |
96 | | */ |
97 | | |
98 | 6.63k | control_iteration = true; |
99 | 6.63k | running = true; |
100 | 29.3k | while (running) { |
101 | 22.7k | igraph_integer_t v1, num_neis; |
102 | 22.7k | igraph_real_t max_count; |
103 | 22.7k | igraph_vector_int_t *neis; |
104 | 22.7k | igraph_vector_int_t *ineis; |
105 | 22.7k | igraph_bool_t was_zero; |
106 | | |
107 | 22.7k | IGRAPH_ALLOW_INTERRUPTION_LIMITED(iter, 1 << 8); |
108 | | |
109 | 22.7k | if (retention) { |
110 | | /* We stop in this iteration by default, unless a label changes */ |
111 | 8.55k | running = false; |
112 | | /* Shuffle the node ordering vector */ |
113 | 8.55k | igraph_vector_int_shuffle(&node_order); |
114 | 14.1k | } else { |
115 | 14.1k | if (control_iteration) { |
116 | | /* If we are in the control iteration, we expect in the beginning of |
117 | | the iteration that all vertices meet the end condition, so 'running' is false. |
118 | | If some of them does not, 'running' is set to true later in the code. */ |
119 | 8.75k | running = false; |
120 | 8.75k | } else { |
121 | | /* Shuffle the node ordering vector if we are in the label updating iteration */ |
122 | 5.43k | igraph_vector_int_shuffle(&node_order); |
123 | 5.43k | } |
124 | 14.1k | } |
125 | | |
126 | | /* In the prescribed order, loop over the vertices and reassign labels */ |
127 | 1.10M | for (i = 0; i < no_of_not_fixed_nodes; i++) { |
128 | 1.08M | v1 = VECTOR(node_order)[i]; |
129 | | |
130 | | /* Count the weights corresponding to different labels */ |
131 | 1.08M | igraph_vector_int_clear(&dominant_labels); |
132 | 1.08M | igraph_vector_int_clear(&nonzero_labels); |
133 | 1.08M | max_count = 0.0; |
134 | 1.08M | if (weights) { |
135 | | |
136 | 1.08M | ineis = igraph_inclist_get(&il, v1); |
137 | 1.08M | num_neis = igraph_vector_int_size(ineis); |
138 | | |
139 | 2.64M | for (j = 0; j < num_neis; j++) { |
140 | 1.55M | k = VECTOR(*membership)[IGRAPH_OTHER(graph, VECTOR(*ineis)[j], v1)]; |
141 | 1.55M | if (k < 0) { |
142 | 0 | continue; /* skip if it has no label yet */ |
143 | 0 | } |
144 | 1.55M | was_zero = (VECTOR(label_weights)[k] == 0); |
145 | 1.55M | VECTOR(label_weights)[k] += VECTOR(*weights)[VECTOR(*ineis)[j]]; |
146 | | |
147 | 1.55M | if (was_zero && VECTOR(label_weights)[k] != 0) { |
148 | | /* weights just became nonzero */ |
149 | 812k | IGRAPH_CHECK(igraph_vector_int_push_back(&nonzero_labels, k)); |
150 | 812k | } |
151 | | |
152 | 1.55M | if (max_count < VECTOR(label_weights)[k]) { |
153 | 1.20M | max_count = VECTOR(label_weights)[k]; |
154 | 1.20M | IGRAPH_CHECK(igraph_vector_int_resize(&dominant_labels, 1)); |
155 | 1.20M | VECTOR(dominant_labels)[0] = k; |
156 | 1.20M | } else if (max_count == VECTOR(label_weights)[k]) { |
157 | 16.0k | IGRAPH_CHECK(igraph_vector_int_push_back(&dominant_labels, k)); |
158 | 16.0k | } |
159 | 1.55M | } |
160 | 1.08M | } else { |
161 | |
|
162 | 0 | neis = igraph_adjlist_get(&al, v1); |
163 | 0 | num_neis = igraph_vector_int_size(neis); |
164 | |
|
165 | 0 | for (j = 0; j < num_neis; j++) { |
166 | |
|
167 | 0 | k = VECTOR(*membership)[VECTOR(*neis)[j]]; |
168 | 0 | if (k < 0) { |
169 | 0 | continue; /* skip if it has no label yet */ |
170 | 0 | } |
171 | 0 | VECTOR(label_weights)[k]++; |
172 | |
|
173 | 0 | if (VECTOR(label_weights)[k] == 1) { |
174 | | /* weights just became nonzero */ |
175 | 0 | IGRAPH_CHECK(igraph_vector_int_push_back(&nonzero_labels, k)); |
176 | 0 | } |
177 | | |
178 | 0 | if (max_count < VECTOR(label_weights)[k]) { |
179 | 0 | max_count = VECTOR(label_weights)[k]; |
180 | 0 | IGRAPH_CHECK(igraph_vector_int_resize(&dominant_labels, 1)); |
181 | 0 | VECTOR(dominant_labels)[0] = k; |
182 | 0 | } else if (max_count == VECTOR(label_weights)[k]) { |
183 | 0 | IGRAPH_CHECK(igraph_vector_int_push_back(&dominant_labels, k)); |
184 | 0 | } |
185 | 0 | } |
186 | 0 | } |
187 | | |
188 | 1.08M | if (igraph_vector_int_size(&dominant_labels) > 0) { |
189 | 438k | if (retention) { |
190 | | /* If we are using retention, we first check if the current label |
191 | | is among the maximum label. */ |
192 | 160k | j = (long)VECTOR(*membership)[v1]; |
193 | 160k | if (j < 0 || /* Doesn't have a label yet */ |
194 | 160k | VECTOR(label_weights)[j] == 0 || /* Label not present in neighbors */ |
195 | 160k | VECTOR(label_weights)[j] < max_count /* Label not dominant */) { |
196 | | /* Select randomly from the dominant labels */ |
197 | 46.1k | k = RNG_INTEGER(0, igraph_vector_int_size(&dominant_labels) - 1); |
198 | 46.1k | k = VECTOR(dominant_labels)[(long int)k]; |
199 | | /* If label changes, we will continue running */ |
200 | 46.1k | if (k != j) { |
201 | 46.1k | running = true; |
202 | 46.1k | } |
203 | | /* Actually change label */ |
204 | 46.1k | VECTOR(*membership)[v1] = k; |
205 | 46.1k | } |
206 | 278k | } else { |
207 | | /* We are not using retention, so check if we should do a control iteration |
208 | | or an update iteration. */ |
209 | 278k | if (control_iteration) { |
210 | | /* Check if the _current_ label of the node is also dominant */ |
211 | 164k | k = VECTOR(*membership)[v1]; |
212 | 164k | if (k < 0 || /* No label assigned yet or */ |
213 | 164k | VECTOR(label_weights)[k] < max_count /* Label is not maximum */ |
214 | 164k | ) { |
215 | | /* Nope, we need at least one more iteration */ |
216 | 56.1k | running = true; |
217 | 56.1k | } |
218 | 164k | } else { |
219 | | /* Select randomly from the dominant labels */ |
220 | 113k | k = RNG_INTEGER(0, igraph_vector_int_size(&dominant_labels) - 1); |
221 | 113k | VECTOR(*membership)[v1] = VECTOR(dominant_labels)[k]; |
222 | 113k | } |
223 | 278k | } |
224 | 438k | } |
225 | | |
226 | | /* Clear the nonzero elements in label_weights */ |
227 | 1.08M | num_neis = igraph_vector_int_size(&nonzero_labels); |
228 | 1.89M | for (j = 0; j < num_neis; j++) { |
229 | 812k | VECTOR(label_weights)[VECTOR(nonzero_labels)[j]] = 0; |
230 | 812k | } |
231 | 1.08M | } |
232 | | |
233 | | /* Alternating between control iterations and label updating iterations */ |
234 | 22.7k | if (!retention) { |
235 | 14.1k | control_iteration = !control_iteration; |
236 | 14.1k | } |
237 | 22.7k | } |
238 | | |
239 | 6.63k | if (weights) { |
240 | 6.63k | igraph_inclist_destroy(&il); |
241 | 6.63k | } else { |
242 | 0 | igraph_adjlist_destroy(&al); |
243 | 0 | } |
244 | 6.63k | IGRAPH_FINALLY_CLEAN(1); |
245 | | |
246 | 6.63k | igraph_vector_int_destroy(&node_order); |
247 | 6.63k | igraph_vector_int_destroy(&nonzero_labels); |
248 | 6.63k | igraph_vector_int_destroy(&dominant_labels); |
249 | 6.63k | igraph_vector_destroy(&label_weights); |
250 | 6.63k | IGRAPH_FINALLY_CLEAN(4); |
251 | | |
252 | 6.63k | return IGRAPH_SUCCESS; |
253 | 6.63k | } |
254 | | |
255 | | static igraph_error_t community_fast_label_propagation( |
256 | | const igraph_t *graph, |
257 | | igraph_vector_int_t *membership, |
258 | | igraph_neimode_t mode, |
259 | | const igraph_vector_t *weights, |
260 | 3.31k | const igraph_vector_bool_t *fixed) { |
261 | | |
262 | 3.31k | const igraph_integer_t no_of_nodes = igraph_vcount(graph); |
263 | 3.31k | igraph_integer_t no_of_not_fixed_nodes = 0; |
264 | 3.31k | igraph_integer_t i, j, k; |
265 | 3.31k | igraph_inclist_t il; |
266 | 3.31k | igraph_adjlist_t al; |
267 | | |
268 | 3.31k | igraph_vector_t label_weights; |
269 | 3.31k | igraph_vector_int_t dominant_labels, nonzero_labels, node_order; |
270 | 3.31k | igraph_dqueue_int_t queue; |
271 | 3.31k | igraph_vector_bool_t in_queue; |
272 | 3.31k | igraph_neimode_t reverse_mode; |
273 | 3.31k | int iter = 0; /* interruption counter */ |
274 | | |
275 | 3.31k | reverse_mode = IGRAPH_REVERSE_MODE(mode); |
276 | | |
277 | 3.31k | if (weights) { |
278 | 3.31k | IGRAPH_CHECK(igraph_inclist_init(graph, &il, reverse_mode, IGRAPH_LOOPS_ONCE)); |
279 | 3.31k | IGRAPH_FINALLY(igraph_inclist_destroy, &il); |
280 | 3.31k | } else { |
281 | 0 | IGRAPH_CHECK(igraph_adjlist_init(graph, &al, reverse_mode, IGRAPH_LOOPS_ONCE, IGRAPH_MULTIPLE)); |
282 | 0 | IGRAPH_FINALLY(igraph_adjlist_destroy, &al); |
283 | 0 | } |
284 | | |
285 | | /* Create storage space for counting distinct labels and dominant ones */ |
286 | 3.31k | IGRAPH_VECTOR_INIT_FINALLY(&label_weights, no_of_nodes); |
287 | 3.31k | IGRAPH_VECTOR_INT_INIT_FINALLY(&dominant_labels, 0); |
288 | 3.31k | IGRAPH_VECTOR_INT_INIT_FINALLY(&nonzero_labels, 0); |
289 | 3.31k | IGRAPH_CHECK(igraph_vector_int_reserve(&dominant_labels, 2)); |
290 | | |
291 | | /* Initialize node ordering vector with only the not fixed nodes */ |
292 | 3.31k | IGRAPH_DQUEUE_INT_INIT_FINALLY(&queue, no_of_nodes); |
293 | 3.31k | IGRAPH_VECTOR_BOOL_INIT_FINALLY(&in_queue, no_of_nodes); |
294 | | |
295 | | /* Initialize node ordering vector with only the not fixed nodes */ |
296 | 3.31k | if (fixed) { |
297 | 0 | IGRAPH_VECTOR_INT_INIT_FINALLY(&node_order, no_of_nodes); |
298 | 0 | for (i = 0; i < no_of_nodes; i++) { |
299 | 0 | if (!VECTOR(*fixed)[i]) { |
300 | 0 | VECTOR(node_order)[no_of_not_fixed_nodes] = i; |
301 | 0 | no_of_not_fixed_nodes++; |
302 | 0 | } |
303 | 0 | } |
304 | 0 | IGRAPH_CHECK(igraph_vector_int_resize(&node_order, no_of_not_fixed_nodes)); |
305 | 3.31k | } else { |
306 | 3.31k | IGRAPH_CHECK(igraph_vector_int_init_range(&node_order, 0, no_of_nodes)); |
307 | 3.31k | IGRAPH_FINALLY(igraph_vector_int_destroy, &node_order); |
308 | 3.31k | no_of_not_fixed_nodes = no_of_nodes; |
309 | 3.31k | } |
310 | | |
311 | 152k | for (i = 0; i < no_of_not_fixed_nodes; i++) { |
312 | 149k | IGRAPH_CHECK(igraph_dqueue_int_push(&queue, VECTOR(node_order)[i])); |
313 | 149k | VECTOR(in_queue)[VECTOR(node_order)[i]] = true; |
314 | 149k | } |
315 | 3.31k | igraph_vector_int_destroy(&node_order); |
316 | 3.31k | IGRAPH_FINALLY_CLEAN(1); |
317 | | |
318 | 167k | while (!igraph_dqueue_int_empty(&queue)) { |
319 | 164k | igraph_integer_t v1, v2, e = -1, num_neis; |
320 | 164k | igraph_real_t max_count; |
321 | 164k | igraph_vector_int_t *neis; |
322 | 164k | igraph_bool_t was_zero; |
323 | | |
324 | 164k | IGRAPH_ALLOW_INTERRUPTION_LIMITED(iter, 1 << 8); |
325 | | |
326 | 164k | v1 = igraph_dqueue_int_pop(&queue); |
327 | 164k | VECTOR(in_queue)[v1] = false; |
328 | | |
329 | | /* Count the weights corresponding to different labels */ |
330 | 164k | igraph_vector_int_clear(&dominant_labels); |
331 | 164k | igraph_vector_int_clear(&nonzero_labels); |
332 | 164k | max_count = 0.0; |
333 | 164k | if (weights) { |
334 | 164k | neis = igraph_inclist_get(&il, v1); |
335 | 164k | } else { |
336 | 0 | neis = igraph_adjlist_get(&al, v1); |
337 | 0 | } |
338 | | |
339 | 164k | num_neis = igraph_vector_int_size(neis); |
340 | 431k | for (j = 0; j < num_neis; j++) { |
341 | 267k | if (weights) { |
342 | 267k | e = VECTOR(*neis)[j]; |
343 | 267k | v2 = IGRAPH_OTHER(graph, e, v1); |
344 | 267k | } else { |
345 | 0 | v2 = VECTOR(*neis)[j]; |
346 | 0 | } |
347 | 267k | k = VECTOR(*membership)[v2]; |
348 | 267k | if (k < 0) { |
349 | 0 | continue; /* skip if it has no label yet */ |
350 | 0 | } |
351 | 267k | was_zero = (VECTOR(label_weights)[k] == 0); |
352 | 267k | VECTOR(label_weights)[k] += (weights ? VECTOR(*weights)[e] : 1); |
353 | | |
354 | 267k | if (was_zero && VECTOR(label_weights)[k] >= 0) { |
355 | | /* counter just became non-negative */ |
356 | 152k | IGRAPH_CHECK(igraph_vector_int_push_back(&nonzero_labels, k)); |
357 | 152k | } |
358 | | |
359 | 267k | if (max_count < VECTOR(label_weights)[k]) { |
360 | 182k | max_count = VECTOR(label_weights)[k]; |
361 | 182k | IGRAPH_CHECK(igraph_vector_int_resize(&dominant_labels, 1)); |
362 | 182k | VECTOR(dominant_labels)[0] = k; |
363 | 182k | } else if (max_count == VECTOR(label_weights)[k]) { |
364 | 3.57k | IGRAPH_CHECK(igraph_vector_int_push_back(&dominant_labels, k)); |
365 | 3.57k | } |
366 | 267k | } |
367 | | |
368 | 164k | if (igraph_vector_int_size(&dominant_labels) > 0) { |
369 | 65.5k | igraph_integer_t current_label = VECTOR(*membership)[v1]; |
370 | | |
371 | | /* Select randomly from the dominant labels */ |
372 | 65.5k | k = RNG_INTEGER(0, igraph_vector_int_size(&dominant_labels) - 1); |
373 | 65.5k | igraph_integer_t new_label = VECTOR(dominant_labels)[k]; /* a dominant label */ |
374 | | |
375 | | /* Check if the _current_ label of the node is not the same */ |
376 | 65.5k | if (new_label != current_label) { |
377 | | /* We still need to consider its neighbors not in the new community */ |
378 | 181k | for (j = 0; j < num_neis; j++) { |
379 | 135k | if (weights) { |
380 | 135k | e = VECTOR(*neis)[j]; |
381 | 135k | v2 = IGRAPH_OTHER(graph, e, v1); |
382 | 135k | } else { |
383 | 0 | v2 = VECTOR(*neis)[j]; |
384 | 0 | } |
385 | 135k | if (!VECTOR(in_queue)[v2]) { |
386 | 42.3k | igraph_integer_t neigh_label = VECTOR(*membership)[v2]; /* neighbor community */ |
387 | 42.3k | if (neigh_label != new_label && /* not in new community */ |
388 | 42.3k | (fixed == NULL || !VECTOR(*fixed)[v2]) ) { /* not fixed */ |
389 | 14.8k | IGRAPH_CHECK(igraph_dqueue_int_push(&queue, v2)); |
390 | 14.8k | VECTOR(in_queue)[v2] = true; |
391 | 14.8k | } |
392 | 42.3k | } |
393 | 135k | } |
394 | 46.1k | } |
395 | 65.5k | VECTOR(*membership)[v1] = new_label; |
396 | 65.5k | } |
397 | | |
398 | | /* Clear the nonzero elements in label_weights */ |
399 | 164k | num_neis = igraph_vector_int_size(&nonzero_labels); |
400 | 316k | for (j = 0; j < num_neis; j++) { |
401 | 152k | VECTOR(label_weights)[VECTOR(nonzero_labels)[j]] = 0; |
402 | 152k | } |
403 | 164k | } |
404 | | |
405 | 3.31k | if (weights) { |
406 | 3.31k | igraph_inclist_destroy(&il); |
407 | 3.31k | } else { |
408 | 0 | igraph_adjlist_destroy(&al); |
409 | 0 | } |
410 | 3.31k | IGRAPH_FINALLY_CLEAN(1); |
411 | | |
412 | 3.31k | igraph_vector_bool_destroy(&in_queue); |
413 | 3.31k | igraph_dqueue_int_destroy(&queue); |
414 | 3.31k | igraph_vector_destroy(&label_weights); |
415 | 3.31k | igraph_vector_int_destroy(&dominant_labels); |
416 | 3.31k | igraph_vector_int_destroy(&nonzero_labels); |
417 | 3.31k | IGRAPH_FINALLY_CLEAN(5); |
418 | | |
419 | 3.31k | return IGRAPH_SUCCESS; |
420 | 3.31k | } |
421 | | |
422 | | /** |
423 | | * \ingroup communities |
424 | | * \function igraph_community_label_propagation |
425 | | * \brief Community detection based on label propagation. |
426 | | * |
427 | | * This function implements the label propagation-based community detection |
428 | | * algorithm described by Raghavan, Albert and Kumara (2007). This version extends |
429 | | * the original method by the ability to take edge weights into consideration |
430 | | * and also by allowing some labels to be fixed. In addition, it implements |
431 | | * the fast label propagation alternative introduced by Traag and Šubelj (2023). |
432 | | * |
433 | | * </para><para> |
434 | | * The algorithm works by iterating over nodes and updating the label of a node |
435 | | * based on the labels of its neighbors. The labels that are most frequent among |
436 | | * the neighbors are said to be dominant labels. The label of a node is always |
437 | | * updated to a dominant label. The algorithm guarantees that the label for each |
438 | | * is dominant when it terminates. |
439 | | * |
440 | | * </para><para> |
441 | | * There are several variants implemented, which work slightly differently with |
442 | | * the dominance of labels. Nodes with a dominant label might no longer have a |
443 | | * dominant label if one of their neighbors change label. In \c |
444 | | * IGRAPH_LPA_DOMINANCE an additional iteration over all nodes is made after |
445 | | * updating all labels to double check whether all nodes indeed have a dominant |
446 | | * label. When updating the label of a node, labels are always sampled from |
447 | | * among all dominant labels. The algorithm stops when all nodes have dominant |
448 | | * labels. In \c IGRAPH_LPA_RETENTION instead labels are only updated when they |
449 | | * are not dominant. That is, they retain their current label whenever the |
450 | | * current label is already dominant. The algorithm does not make an additional |
451 | | * iteration to check for dominance. Instead, it simply keeps track whether a |
452 | | * label has been updated, and terminates if no updates have been made. In \c |
453 | | * IGRAPH_LPA_FAST labels are sampled from among all dominant labels, similar to |
454 | | * \c IGRAPH_LPA_DOMINANCE. Instead of iterating over all nodes, it keeps track |
455 | | * of a queue of nodes that should be considered. Nodes are popped from the |
456 | | * queue when they are considered for update. When the label of a node is |
457 | | * updated, the node's neighbors are added to the queue again (if they weren't |
458 | | * already in the queue). The algorithm terminates when the queue is empty. All |
459 | | * variants guarantee that the labels for all nodes are dominant. |
460 | | * |
461 | | * </para><para> |
462 | | * Weights are taken into account as follows: when the new label of node |
463 | | * \c i is determined, the algorithm iterates over all edges incident on |
464 | | * node \c i and calculate the total weight of edges leading to other |
465 | | * nodes with label 0, 1, 2, ..., \c k - 1 (where \c k is the number of possible |
466 | | * labels). The new label of node \c i will then be the label whose edges |
467 | | * (among the ones incident on node \c i) have the highest total weight. |
468 | | * |
469 | | * </para><para> |
470 | | * For directed graphs, it is important to know that labels can circulate |
471 | | * freely only within the strongly connected components of the graph and |
472 | | * may propagate in only one direction (or not at all) \em between strongly |
473 | | * connected components. You should treat directed edges as directed only |
474 | | * if you are aware of the consequences. |
475 | | * |
476 | | * </para><para> |
477 | | * References: |
478 | | * |
479 | | * </para><para> |
480 | | * Raghavan, U.N. and Albert, R. and Kumara, S.: Near linear time algorithm to |
481 | | * detect community structures in large-scale networks. Phys Rev E 76, 036106 |
482 | | * (2007). https://doi.org/10.1103/PhysRevE.76.036106 |
483 | | * |
484 | | * </para><para> |
485 | | * Šubelj, L.: Label propagation for clustering. Chapter in "Advances in |
486 | | * Network Clustering and Blockmodeling" edited by P. Doreian, V. Batagelj |
487 | | * & A. Ferligoj (Wiley, New York, 2018). |
488 | | * https://doi.org/10.1002/9781119483298.ch5 |
489 | | * https://arxiv.org/abs/1709.05634 |
490 | | * |
491 | | * </para><para> |
492 | | * Traag, V. A., and Šubelj, L.: Large network community detection by fast |
493 | | * label propagation. Scientific Reports, 13:1, (2023). |
494 | | * https://doi.org/10.1038/s41598-023-29610-z |
495 | | * https://arxiv.org/abs/2209.13338 |
496 | | * |
497 | | * \param graph The input graph. Note that the algorithm was originally |
498 | | * defined for undirected graphs. You are advised to set \p mode to |
499 | | * \c IGRAPH_ALL if you pass a directed graph here to treat it as |
500 | | * undirected. |
501 | | * \param membership The membership vector, the result is returned here. |
502 | | * For each vertex it gives the ID of its community (label). |
503 | | * \param mode Whether to consider edge directions for the label propagation, |
504 | | * and if so, which direction the labels should propagate. Ignored for |
505 | | * undirected graphs. \c IGRAPH_ALL means to ignore edge directions (even |
506 | | * in directed graphs). \c IGRAPH_OUT means to propagate labels along the |
507 | | * natural direction of the edges. \c IGRAPH_IN means to propagate labels |
508 | | * \em backwards (i.e. from head to tail). It is advised to set this to |
509 | | * \c IGRAPH_ALL unless you are specifically interested in the effect of |
510 | | * edge directions. |
511 | | * \param weights The weight vector, it should contain a positive |
512 | | * weight for all the edges. |
513 | | * \param initial The initial state. If \c NULL, every vertex will have |
514 | | * a different label at the beginning. Otherwise it must be a vector |
515 | | * with an entry for each vertex. Non-negative values denote different |
516 | | * labels, negative entries denote vertices without labels. Unlabeled |
517 | | * vertices which are not reachable from any labeled ones will remain |
518 | | * unlabeled at the end of the label propagation process, and will be |
519 | | * labeled in an additional step to avoid returning negative values in |
520 | | * \p membership. In undirected graphs, this happens when entire connected |
521 | | * components are unlabeled. Then, each unlabeled component will receive |
522 | | * its own separate label. In directed graphs, the outcome of the |
523 | | * additional labeling should be considered undefined and may change |
524 | | * in the future; please do not rely on it. |
525 | | * \param fixed Boolean vector denoting which labels are fixed. Of course |
526 | | * this makes sense only if you provided an initial state, otherwise |
527 | | * this element will be ignored. Note that vertices without labels |
528 | | * cannot be fixed. The fixed status will be ignored for these with a |
529 | | * warning. Also note that label numbers by themselves have no meaning, |
530 | | * and igraph may renumber labels. However, co-membership constraints |
531 | | * will be respected: two vertices can be fixed to be in the same or in |
532 | | * different communities. |
533 | | * \param lpa_variant Which variant of label propagation algorithm to run. |
534 | | * One of |
535 | | * \c IGRAPH_LPA_DOMINANCE Check for dominance of all nodes after each |
536 | | * iteration. |
537 | | * \c IGRAPH_LPA_RETENTION Keep current label if among dominant labels, |
538 | | * only check if labels changed. |
539 | | * \c IGRAPH_LPA_FAST Sample from dominant labels, only check |
540 | | * neighbors. |
541 | | * \return Error code. |
542 | | * |
543 | | * Time complexity: O(m+n) |
544 | | * |
545 | | * \example examples/simple/igraph_community_label_propagation.c |
546 | | */ |
547 | | igraph_error_t igraph_community_label_propagation(const igraph_t *graph, |
548 | | igraph_vector_int_t *membership, |
549 | | igraph_neimode_t mode, |
550 | | const igraph_vector_t *weights, |
551 | | const igraph_vector_int_t *initial, |
552 | | const igraph_vector_bool_t *fixed, |
553 | 9.95k | igraph_lpa_variant_t lpa_variant) { |
554 | | |
555 | 9.95k | const igraph_integer_t no_of_nodes = igraph_vcount(graph); |
556 | 9.95k | const igraph_integer_t no_of_edges = igraph_ecount(graph); |
557 | 9.95k | igraph_integer_t no_of_not_fixed_nodes = no_of_nodes; |
558 | 9.95k | igraph_integer_t i, j, k; |
559 | 9.95k | igraph_bool_t unlabelled_left; |
560 | | |
561 | | /* We make a copy of 'fixed' as a pointer into 'fixed_copy' after casting |
562 | | * away the constness, and promise ourselves that we will make a proper |
563 | | * copy of 'fixed' into 'fixed_copy' as soon as we start mutating it */ |
564 | 9.95k | igraph_vector_bool_t *fixed_copy = (igraph_vector_bool_t *) fixed; |
565 | | |
566 | | /* Unlabelled nodes are represented with -1. */ |
567 | 9.95k | #define IS_UNLABELLED(x) (VECTOR(*membership)[x] < 0) |
568 | | |
569 | | /* Do some initial checks */ |
570 | 9.95k | if (fixed && igraph_vector_bool_size(fixed) != no_of_nodes) { |
571 | 0 | IGRAPH_ERROR("Fixed labeling vector length must agree with number of nodes.", IGRAPH_EINVAL); |
572 | 0 | } |
573 | 9.95k | if (weights) { |
574 | 9.95k | if (igraph_vector_size(weights) != no_of_edges) { |
575 | 0 | IGRAPH_ERROR("Length of weight vector must agree with number of edges.", IGRAPH_EINVAL); |
576 | 0 | } |
577 | 9.95k | if (no_of_edges > 0) { |
578 | 9.84k | igraph_real_t minweight = igraph_vector_min(weights); |
579 | 9.84k | if (minweight < 0) { |
580 | 0 | IGRAPH_ERROR("Weights must not be negative.", IGRAPH_EINVAL); |
581 | 0 | } |
582 | 9.84k | if (isnan(minweight)) { |
583 | 0 | IGRAPH_ERROR("Weights must not be NaN.", IGRAPH_EINVAL); |
584 | 0 | } |
585 | 9.84k | } |
586 | 9.95k | } |
587 | 9.95k | if (fixed && !initial) { |
588 | 0 | IGRAPH_WARNING("Ignoring fixed vertices as no initial labeling given."); |
589 | 0 | } |
590 | | |
591 | 9.95k | IGRAPH_CHECK(igraph_vector_int_resize(membership, no_of_nodes)); |
592 | | |
593 | 9.95k | if (initial) { |
594 | 0 | if (igraph_vector_int_size(initial) != no_of_nodes) { |
595 | 0 | IGRAPH_ERROR("Initial labeling vector length must agree with number of nodes.", IGRAPH_EINVAL); |
596 | 0 | } |
597 | | /* Check if the labels used are valid, initialize membership vector */ |
598 | 0 | for (i = 0; i < no_of_nodes; i++) { |
599 | 0 | if (VECTOR(*initial)[i] < 0) { |
600 | 0 | VECTOR(*membership)[i] = -1; |
601 | 0 | } else { |
602 | 0 | VECTOR(*membership)[i] = VECTOR(*initial)[i]; |
603 | 0 | } |
604 | 0 | } |
605 | 0 | if (fixed) { |
606 | 0 | for (i = 0; i < no_of_nodes; i++) { |
607 | 0 | if (VECTOR(*fixed)[i]) { |
608 | 0 | if (IS_UNLABELLED(i)) { |
609 | 0 | IGRAPH_WARNING("Fixed nodes cannot be unlabeled, ignoring them."); |
610 | | |
611 | | /* We cannot modify 'fixed' because it is const, so we make a copy and |
612 | | * modify 'fixed_copy' instead */ |
613 | 0 | if (fixed_copy == fixed) { |
614 | 0 | fixed_copy = IGRAPH_CALLOC(1, igraph_vector_bool_t); |
615 | 0 | IGRAPH_CHECK_OOM(fixed_copy, "Insufficient memory for label propagation."); |
616 | 0 | IGRAPH_FINALLY(igraph_free, fixed_copy); |
617 | 0 | IGRAPH_CHECK(igraph_vector_bool_init_copy(fixed_copy, fixed)); |
618 | 0 | IGRAPH_FINALLY(igraph_vector_bool_destroy, fixed_copy); |
619 | 0 | } |
620 | | |
621 | 0 | VECTOR(*fixed_copy)[i] = false; |
622 | 0 | } else { |
623 | 0 | no_of_not_fixed_nodes--; |
624 | 0 | } |
625 | 0 | } |
626 | 0 | } |
627 | 0 | } |
628 | | |
629 | 0 | i = igraph_vector_int_max(membership); |
630 | 0 | if (i > no_of_nodes) { |
631 | 0 | IGRAPH_ERROR("Elements of the initial labeling vector must be between 0 and |V|-1.", IGRAPH_EINVAL); |
632 | 0 | } |
633 | 9.95k | } else { |
634 | 457k | for (i = 0; i < no_of_nodes; i++) { |
635 | 447k | VECTOR(*membership)[i] = i; |
636 | 447k | } |
637 | 9.95k | } |
638 | | |
639 | | /* From this point onwards we use 'fixed_copy' instead of 'fixed' */ |
640 | 9.95k | switch (lpa_variant) { |
641 | 3.31k | case IGRAPH_LPA_FAST: |
642 | 3.31k | IGRAPH_CHECK(community_fast_label_propagation(graph, membership, mode, weights, fixed_copy)); |
643 | 3.31k | break; |
644 | | |
645 | 3.31k | case IGRAPH_LPA_RETENTION: |
646 | 3.31k | IGRAPH_CHECK(community_label_propagation(graph, membership, mode, weights, fixed_copy, /* retention */ true )); |
647 | 3.31k | break; |
648 | | |
649 | 3.31k | case IGRAPH_LPA_DOMINANCE: |
650 | 3.31k | IGRAPH_CHECK(community_label_propagation(graph, membership, mode, weights, fixed_copy, /* retention */ false)); |
651 | 3.31k | break; |
652 | | |
653 | 3.31k | default: |
654 | 0 | IGRAPH_ERROR("Invalid igraph_lpa_variant_t.", IGRAPH_EINVAL); |
655 | 9.95k | } |
656 | | |
657 | | /* Permute labels in increasing order */ |
658 | 9.95k | igraph_vector_int_t relabel_label; |
659 | 9.95k | IGRAPH_CHECK(igraph_vector_int_init(&relabel_label, no_of_nodes)); |
660 | 9.95k | igraph_vector_int_fill(&relabel_label, -1); |
661 | 9.95k | IGRAPH_FINALLY(igraph_vector_int_destroy, &relabel_label); |
662 | | |
663 | 9.95k | j = 0; |
664 | 9.95k | unlabelled_left = false; |
665 | 457k | for (i = 0; i < no_of_nodes; i++) { |
666 | 447k | k = VECTOR(*membership)[i]; |
667 | 447k | if (k >= 0) { |
668 | 447k | if (VECTOR(relabel_label)[k] == -1) { |
669 | | /* We have seen this label for the first time */ |
670 | 328k | VECTOR(relabel_label)[k] = j; |
671 | 328k | k = j; |
672 | 328k | j++; |
673 | 328k | } else { |
674 | 119k | k = VECTOR(relabel_label)[k]; |
675 | 119k | } |
676 | 447k | } else { |
677 | | /* This is an unlabeled vertex */ |
678 | 0 | unlabelled_left = true; |
679 | 0 | } |
680 | 447k | VECTOR(*membership)[i] = k; |
681 | 447k | } |
682 | | |
683 | | /* If any nodes are left unlabelled, we assign the remaining labels to them, |
684 | | * as well as to all unlabelled nodes reachable from them. |
685 | | * |
686 | | * Note that only those nodes could remain unlabelled which were unreachable |
687 | | * from any labelled ones. Thus, in the undirected case, fully unlabelled |
688 | | * connected components remain unlabelled. Here we label each such component |
689 | | * with the same label. |
690 | | */ |
691 | 9.95k | if (unlabelled_left) { |
692 | 0 | igraph_dqueue_int_t q; |
693 | 0 | igraph_vector_int_t neis; |
694 | |
|
695 | 0 | igraph_vector_int_t node_order; |
696 | | /* Initialize node ordering vector with only the not fixed nodes */ |
697 | 0 | if (fixed) { |
698 | 0 | no_of_not_fixed_nodes = 0; |
699 | 0 | IGRAPH_VECTOR_INT_INIT_FINALLY(&node_order, no_of_nodes); |
700 | 0 | for (i = 0; i < no_of_nodes; i++) { |
701 | 0 | if (!VECTOR(*fixed)[i]) { |
702 | 0 | VECTOR(node_order)[no_of_not_fixed_nodes] = i; |
703 | 0 | no_of_not_fixed_nodes++; |
704 | 0 | } |
705 | 0 | } |
706 | 0 | IGRAPH_CHECK(igraph_vector_int_resize(&node_order, no_of_not_fixed_nodes)); |
707 | 0 | } else { |
708 | 0 | IGRAPH_CHECK(igraph_vector_int_init_range(&node_order, 0, no_of_nodes)); |
709 | 0 | IGRAPH_FINALLY(igraph_vector_int_destroy, &node_order); |
710 | 0 | no_of_not_fixed_nodes = no_of_nodes; |
711 | 0 | } |
712 | | /* Shuffle the node ordering vector */ |
713 | 0 | igraph_vector_int_shuffle(&node_order); |
714 | |
|
715 | 0 | IGRAPH_VECTOR_INT_INIT_FINALLY(&neis, 0); |
716 | | |
717 | 0 | IGRAPH_CHECK(igraph_dqueue_int_init(&q, 0)); |
718 | 0 | IGRAPH_FINALLY(igraph_dqueue_int_destroy, &q); |
719 | |
|
720 | 0 | for (i=0; i < no_of_not_fixed_nodes; ++i) { |
721 | 0 | igraph_integer_t v = VECTOR(node_order)[i]; |
722 | | |
723 | | /* Is this node unlabelled? */ |
724 | 0 | if (IS_UNLABELLED(v)) { |
725 | | /* If yes, we label it, and do a BFS to apply the same label |
726 | | * to all other unlabelled nodes reachable from it */ |
727 | 0 | IGRAPH_CHECK(igraph_dqueue_int_push(&q, v)); |
728 | 0 | VECTOR(*membership)[v] = j; |
729 | 0 | while (!igraph_dqueue_int_empty(&q)) { |
730 | 0 | igraph_integer_t ni, num_neis; |
731 | 0 | igraph_integer_t actnode = igraph_dqueue_int_pop(&q); |
732 | |
|
733 | 0 | IGRAPH_CHECK(igraph_neighbors(graph, &neis, actnode, mode, IGRAPH_LOOPS, IGRAPH_MULTIPLE)); |
734 | 0 | num_neis = igraph_vector_int_size(&neis); |
735 | |
|
736 | 0 | for (ni = 0; ni < num_neis; ++ni) { |
737 | 0 | igraph_integer_t neighbor = VECTOR(neis)[ni]; |
738 | 0 | if (IS_UNLABELLED(neighbor)) { |
739 | 0 | VECTOR(*membership)[neighbor] = j; |
740 | 0 | IGRAPH_CHECK(igraph_dqueue_int_push(&q, neighbor)); |
741 | 0 | } |
742 | 0 | } |
743 | 0 | } |
744 | 0 | j++; |
745 | 0 | } |
746 | 0 | } |
747 | | |
748 | 0 | igraph_vector_int_destroy(&neis); |
749 | 0 | igraph_dqueue_int_destroy(&q); |
750 | 0 | igraph_vector_int_destroy(&node_order); |
751 | 0 | IGRAPH_FINALLY_CLEAN(3); |
752 | 0 | } |
753 | | |
754 | 9.95k | igraph_vector_int_destroy(&relabel_label); |
755 | 9.95k | IGRAPH_FINALLY_CLEAN(1); |
756 | | |
757 | 9.95k | if (fixed != fixed_copy) { |
758 | 0 | igraph_vector_bool_destroy(fixed_copy); |
759 | 0 | IGRAPH_FREE(fixed_copy); |
760 | 0 | IGRAPH_FINALLY_CLEAN(2); |
761 | 0 | } |
762 | | |
763 | 9.95k | return IGRAPH_SUCCESS; |
764 | 9.95k | } |