/src/igraph/src/community/community_misc.c
Line | Count | Source |
1 | | /* |
2 | | igraph library. |
3 | | Copyright (C) 2007-2024 The igraph development team <igraph@igraph.org> |
4 | | |
5 | | This program is free software; you can redistribute it and/or modify |
6 | | it under the terms of the GNU General Public License as published by |
7 | | the Free Software Foundation; either version 2 of the License, or |
8 | | (at your option) any later version. |
9 | | |
10 | | This program is distributed in the hope that it will be useful, |
11 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | | GNU General Public License for more details. |
14 | | |
15 | | You should have received a copy of the GNU General Public License |
16 | | along with this program. If not, see <https://www.gnu.org/licenses/>. |
17 | | */ |
18 | | |
19 | | #include "igraph_community.h" |
20 | | #include "igraph_memory.h" |
21 | | #include "igraph_sparsemat.h" |
22 | | |
23 | | #include "community/community_internal.h" |
24 | | |
25 | | #include <string.h> |
26 | | #include <math.h> |
27 | | |
28 | | /** |
29 | | * \section about_community |
30 | | * |
31 | | * <para> |
32 | | * Community detection is concerned with clustering the vertices of networks |
33 | | * into tightly connected subgraphs called "communities". The following |
34 | | * references provide a good introduction to the topic of community detection: |
35 | | * </para> |
36 | | * |
37 | | * <para> |
38 | | * S. Fortunato: |
39 | | * "Community Detection in Graphs". |
40 | | * Physics Reports 486, no. 3–5 (2010): 75–174. |
41 | | * https://doi.org/10.1016/j.physrep.2009.11.002. |
42 | | * </para> |
43 | | * |
44 | | * <para> |
45 | | * S. Fortunato and D. Hric: |
46 | | * "Community Detection in Networks: A User Guide". |
47 | | * Physics Reports 659 (2016): 1–44. |
48 | | * https://doi.org/10.1016/j.physrep.2016.09.002. |
49 | | * </para> |
50 | | */ |
51 | | |
52 | | /** |
53 | | * \function igraph_community_to_membership |
54 | | * \brief Cut a dendrogram after a given number of merges. |
55 | | * |
56 | | * This function creates a membership vector from a dendrogram whose leaves |
57 | | * are individual vertices by cutting it at the specified level. It produces |
58 | | * a membership vector that contains for each vertex its cluster ID, numbered |
59 | | * from zero. This is the same membership vector format that is produced by |
60 | | * \ref igraph_connected_components(), as well as all community detection |
61 | | * functions in igraph. |
62 | | * |
63 | | * </para><para> |
64 | | * It takes as input the number of vertices \p n, and a \p merges matrix |
65 | | * encoding the dendrogram, in the format produced by hierarchical clustering |
66 | | * functions such as \ref igraph_community_edge_betweenness(), |
67 | | * \ref igraph_community_walktrap() or \ref igraph_community_fastgreedy(). |
68 | | * The matrix must have two columns and up to <code>n - 1</code> rows. |
69 | | * Each row represents merging two dendrogram nodes into their parent node. |
70 | | * The leaf nodes of the dendrogram are indexed from 0 to <code>n - 1</code> |
71 | | * and are identical to the vertices of the graph that is being partitioned |
72 | | * into communities. Row \c i contains the children of dendrogram node |
73 | | * with index <code>n + i</code>. |
74 | | * |
75 | | * </para><para> |
76 | | * This function performs \p steps merge operations as prescribed by |
77 | | * the \p merges matrix and returns the resulting partitioning into |
78 | | * <code>n - steps</code> communities. |
79 | | * |
80 | | * </para><para> |
81 | | * If \p merges is not a complete dendrogram, it is possible to |
82 | | * take \p steps steps if \p steps is not bigger than the number |
83 | | * lines in \p merges. |
84 | | * |
85 | | * \param merges The two-column matrix containing the merge operations. |
86 | | * \param nodes The number of leaf nodes in the dendrogram. |
87 | | * \param steps Integer constant, the number of steps to take. |
88 | | * \param membership Pointer to an initialized vector, the membership |
89 | | * results will be stored here, if not \c NULL. The vector will be |
90 | | * resized as needed. |
91 | | * \param csize Pointer to an initialized vector, or \c NULL. If not \c NULL |
92 | | * then the sizes of the components will be stored here, the vector |
93 | | * will be resized as needed. |
94 | | * \return Error code. |
95 | | * |
96 | | * \sa \ref igraph_community_walktrap(), \ref |
97 | | * igraph_community_edge_betweenness(), \ref |
98 | | * igraph_community_fastgreedy() for community structure detection |
99 | | * algorithms producing merge matrices in this format; |
100 | | * \ref igraph_le_community_to_membership() to perform merges |
101 | | * starting from a given cluster assignment. |
102 | | * |
103 | | * Time complexity: O(|V|), the number of vertices in the graph. |
104 | | */ |
105 | | igraph_error_t igraph_community_to_membership(const igraph_matrix_int_t *merges, |
106 | | igraph_int_t nodes, |
107 | | igraph_int_t steps, |
108 | | igraph_vector_int_t *membership, |
109 | 6.59k | igraph_vector_int_t *csize) { |
110 | | |
111 | 6.59k | const igraph_int_t no_of_nodes = nodes; |
112 | 6.59k | const igraph_int_t components = no_of_nodes - steps; |
113 | 6.59k | igraph_int_t found = 0; |
114 | 6.59k | igraph_vector_int_t tmp; |
115 | 6.59k | igraph_vector_bool_t already_merged; |
116 | 6.59k | igraph_vector_int_t own_membership; |
117 | 6.59k | igraph_bool_t using_own_membership = false; |
118 | | |
119 | 6.59k | if (steps > igraph_matrix_int_nrow(merges)) { |
120 | 0 | IGRAPH_ERRORF("Number of steps is greater than number of rows in merges matrix: found %" |
121 | 0 | IGRAPH_PRId " steps, %" IGRAPH_PRId " rows.", IGRAPH_EINVAL, steps, igraph_matrix_int_nrow(merges)); |
122 | 0 | } |
123 | | |
124 | 6.59k | if (igraph_matrix_int_ncol(merges) != 2) { |
125 | 0 | IGRAPH_ERRORF("The merges matrix should have two columns, but has %" IGRAPH_PRId ".", |
126 | 0 | IGRAPH_EINVAL, igraph_matrix_int_ncol(merges)); |
127 | 0 | } |
128 | 6.59k | if (steps < 0) { |
129 | 0 | IGRAPH_ERRORF("Number of steps should be non-negative, found %" IGRAPH_PRId ".", IGRAPH_EINVAL, steps); |
130 | 0 | } |
131 | | |
132 | 6.59k | if (csize != NULL && membership == NULL) { |
133 | | /* we need a membership vector to calculate 'csize' but the user did |
134 | | * not provide one; let's allocate one ourselves */ |
135 | 0 | IGRAPH_VECTOR_INT_INIT_FINALLY(&own_membership, no_of_nodes); |
136 | 0 | using_own_membership = true; |
137 | 0 | membership = &own_membership; |
138 | 0 | } |
139 | | |
140 | 6.59k | if (membership) { |
141 | 6.59k | IGRAPH_CHECK(igraph_vector_int_resize(membership, no_of_nodes)); |
142 | 6.59k | igraph_vector_int_null(membership); |
143 | 6.59k | } |
144 | 6.59k | if (csize) { |
145 | 0 | IGRAPH_CHECK(igraph_vector_int_resize(csize, components)); |
146 | 0 | igraph_vector_int_null(csize); |
147 | 0 | } |
148 | | |
149 | 6.59k | IGRAPH_VECTOR_BOOL_INIT_FINALLY(&already_merged, steps + no_of_nodes); |
150 | 6.59k | IGRAPH_VECTOR_INT_INIT_FINALLY(&tmp, steps); |
151 | | |
152 | 75.8k | for (igraph_int_t i = steps - 1; i >= 0; i--) { |
153 | 69.2k | const igraph_int_t c1 = MATRIX(*merges, i, 0); |
154 | 69.2k | const igraph_int_t c2 = MATRIX(*merges, i, 1); |
155 | | |
156 | 69.2k | if (VECTOR(already_merged)[c1] == 0) { |
157 | 69.2k | VECTOR(already_merged)[c1] = true; |
158 | 69.2k | } else { |
159 | 0 | IGRAPH_ERRORF("Merges matrix contains multiple merges of cluster %" IGRAPH_PRId ".", IGRAPH_EINVAL, c1); |
160 | 0 | } |
161 | 69.2k | if (VECTOR(already_merged)[c2] == 0) { |
162 | 69.2k | VECTOR(already_merged)[c2] = true; |
163 | 69.2k | } else { |
164 | 0 | IGRAPH_ERRORF("Merges matrix contains multiple merges of cluster %" IGRAPH_PRId ".", IGRAPH_EINVAL, c2); |
165 | 0 | } |
166 | | |
167 | | /* new component? */ |
168 | 69.2k | if (VECTOR(tmp)[i] == 0) { |
169 | 21.4k | found++; |
170 | 21.4k | VECTOR(tmp)[i] = found; |
171 | 21.4k | } |
172 | | |
173 | 69.2k | if (c1 < no_of_nodes) { |
174 | 51.0k | const igraph_int_t cid = VECTOR(tmp)[i] - 1; |
175 | 51.0k | if (membership) { |
176 | 51.0k | VECTOR(*membership)[c1] = cid + 1; |
177 | 51.0k | } |
178 | 51.0k | if (csize) { |
179 | 0 | VECTOR(*csize)[cid] += 1; |
180 | 0 | } |
181 | 51.0k | } else { |
182 | 18.2k | VECTOR(tmp)[c1 - no_of_nodes] = VECTOR(tmp)[i]; |
183 | 18.2k | } |
184 | | |
185 | 69.2k | if (c2 < no_of_nodes) { |
186 | 39.6k | const igraph_int_t cid = VECTOR(tmp)[i] - 1; |
187 | 39.6k | if (membership) { |
188 | 39.6k | VECTOR(*membership)[c2] = cid + 1; |
189 | 39.6k | } |
190 | 39.6k | if (csize) { |
191 | 0 | VECTOR(*csize)[cid] += 1; |
192 | 0 | } |
193 | 39.6k | } else { |
194 | 29.6k | VECTOR(tmp)[c2 - no_of_nodes] = VECTOR(tmp)[i]; |
195 | 29.6k | } |
196 | | |
197 | 69.2k | } |
198 | | |
199 | 6.59k | if (membership || csize) { |
200 | | /* It can never happen that csize != NULL and membership == NULL; we have |
201 | | * handled that case above. Thus we skip if (membership) checks, which |
202 | | * would confuse static analysers in this context. */ |
203 | 302k | for (igraph_int_t i = 0; i < no_of_nodes; i++) { |
204 | 296k | const igraph_int_t c = VECTOR(*membership)[i]; |
205 | 296k | if (c != 0) { |
206 | 90.6k | VECTOR(*membership)[i] = c - 1; |
207 | 205k | } else { |
208 | 205k | if (csize) { |
209 | 0 | VECTOR(*csize)[found] += 1; |
210 | 0 | } |
211 | 205k | VECTOR(*membership)[i] = found; |
212 | 205k | found++; |
213 | 205k | } |
214 | 296k | } |
215 | 6.59k | } |
216 | | |
217 | 6.59k | igraph_vector_int_destroy(&tmp); |
218 | 6.59k | igraph_vector_bool_destroy(&already_merged); |
219 | 6.59k | IGRAPH_FINALLY_CLEAN(2); |
220 | | |
221 | 6.59k | if (using_own_membership) { |
222 | 0 | igraph_vector_int_destroy(&own_membership); |
223 | 0 | IGRAPH_FINALLY_CLEAN(1); |
224 | 0 | } |
225 | | |
226 | 6.59k | return IGRAPH_SUCCESS; |
227 | 6.59k | } |
228 | | |
229 | | |
230 | | /** |
231 | | * This slower implementation of igraph_reindex_membership() is used when |
232 | | * cluster indices are not within the range 0..vcount-1. |
233 | | * |
234 | | * \param membership Vector encoding the membership of each vertex. |
235 | | * Elements may be arbitrary integers, however, its length must not be zero. |
236 | | * \param new_to_old If not \c NULL, a mapping from new to old cluster |
237 | | * indices will be stored here. |
238 | | * \param nb_clusters If not \c NULL, the number of clusters will be stored here. |
239 | | * |
240 | | * Time complexity: O(n log(n)) where n is the membership vector length. |
241 | | */ |
242 | | igraph_error_t igraph_i_reindex_membership_large( |
243 | | igraph_vector_int_t *membership, |
244 | | igraph_vector_int_t *new_to_old, |
245 | 41.0k | igraph_int_t *nb_clusters) { |
246 | | |
247 | 41.0k | const igraph_int_t vcount = igraph_vector_int_size(membership); |
248 | 41.0k | igraph_vector_int_t permutation; |
249 | | |
250 | | /* The vcount == 0 case requires special handling because the main |
251 | | * algorithm of this function cannot handle it. */ |
252 | 41.0k | if (vcount == 0) { |
253 | 0 | if (new_to_old) { |
254 | 0 | igraph_vector_int_clear(new_to_old); |
255 | 0 | } |
256 | 0 | if (nb_clusters) { |
257 | 0 | *nb_clusters = 0; |
258 | 0 | } |
259 | 0 | return IGRAPH_SUCCESS; |
260 | 0 | } |
261 | | |
262 | 41.0k | IGRAPH_VECTOR_INT_INIT_FINALLY(&permutation, vcount); |
263 | | |
264 | 41.0k | IGRAPH_CHECK(igraph_vector_int_sort_ind(membership, &permutation, IGRAPH_ASCENDING)); |
265 | | |
266 | 41.0k | if (new_to_old) { |
267 | 0 | igraph_vector_int_clear(new_to_old); |
268 | 0 | } |
269 | | |
270 | 41.0k | igraph_int_t j = VECTOR(permutation)[0]; |
271 | 41.0k | igraph_int_t c_old = VECTOR(*membership)[j]; |
272 | 41.0k | igraph_int_t c_old_prev = c_old; |
273 | 41.0k | igraph_int_t c_new = 0; |
274 | 41.0k | if (new_to_old) { |
275 | 0 | IGRAPH_CHECK(igraph_vector_int_push_back(new_to_old, c_old)); |
276 | 0 | } |
277 | 41.0k | VECTOR(*membership)[j] = c_new; |
278 | | |
279 | 2.31M | for (igraph_int_t i=1; i < vcount; i++) { |
280 | 2.27M | j = VECTOR(permutation)[i]; |
281 | 2.27M | c_old = VECTOR(*membership)[j]; |
282 | 2.27M | if (c_old != c_old_prev) { |
283 | 1.66M | if (new_to_old) { |
284 | 0 | IGRAPH_CHECK(igraph_vector_int_push_back(new_to_old, c_old)); |
285 | 0 | } |
286 | 1.66M | c_new++; |
287 | 1.66M | } |
288 | 2.27M | c_old_prev = c_old; |
289 | 2.27M | VECTOR(*membership)[j] = c_new; |
290 | 2.27M | } |
291 | | |
292 | 41.0k | if (nb_clusters) { |
293 | 41.0k | *nb_clusters = c_new+1; |
294 | 41.0k | } |
295 | | |
296 | 41.0k | igraph_vector_int_destroy(&permutation); |
297 | 41.0k | IGRAPH_FINALLY_CLEAN(1); |
298 | | |
299 | 41.0k | return IGRAPH_SUCCESS; |
300 | 41.0k | } |
301 | | |
302 | | /** |
303 | | * \function igraph_reindex_membership |
304 | | * \brief Makes the IDs in a membership vector contiguous. |
305 | | * |
306 | | * This function reindexes component IDs in a membership vector in a way that |
307 | | * the new IDs start from zero and go up to <code>C-1</code>, where \c C is the |
308 | | * number of unique component IDs in the original vector. |
309 | | * |
310 | | * \param membership Numeric vector which gives the type of each |
311 | | * vertex, i.e. the component to which it belongs. |
312 | | * The vector will be altered in-place. |
313 | | * \param new_to_old Pointer to a vector which will contain the |
314 | | * old component ID for each new one, or \c NULL, |
315 | | * in which case it is not returned. The vector |
316 | | * will be resized as needed. |
317 | | * \param nb_clusters Pointer to an integer for the number of |
318 | | * distinct clusters. If not \c NULL, this will be |
319 | | * updated to reflect the number of distinct |
320 | | * clusters found in membership. |
321 | | * \return Error code. |
322 | | * |
323 | | * Time complexity: Let n be the length of the membership vector. |
324 | | * O(n) if cluster indices are within 0..n-1, and O(n log(n)) otherwise. |
325 | | */ |
326 | | igraph_error_t igraph_reindex_membership( |
327 | | igraph_vector_int_t *membership, |
328 | | igraph_vector_int_t *new_to_old, |
329 | 79.5k | igraph_int_t *nb_clusters) { |
330 | | |
331 | 79.5k | const igraph_int_t vcount = igraph_vector_int_size(membership); |
332 | 79.5k | igraph_vector_int_t new_cluster; |
333 | 79.5k | igraph_int_t i_nb_clusters; |
334 | | |
335 | | /* First, we assume cluster indices in the range 0..vcount-1, |
336 | | * and attempt to use a performant implementation. If indices |
337 | | * outside of this range are found, we will fall back to a |
338 | | * slower alternative implementation. */ |
339 | 79.5k | IGRAPH_VECTOR_INT_INIT_FINALLY(&new_cluster, vcount); |
340 | | |
341 | 79.5k | if (new_to_old) { |
342 | 0 | igraph_vector_int_clear(new_to_old); |
343 | 0 | } |
344 | | |
345 | | /* Clean clusters. We will store the new cluster + 1 so that membership == 0 |
346 | | * indicates that no cluster was assigned yet. */ |
347 | 79.5k | i_nb_clusters = 1; |
348 | 3.55M | for (igraph_int_t i = 0; i < vcount; i++) { |
349 | 3.47M | igraph_int_t c = VECTOR(*membership)[i]; |
350 | | |
351 | 3.47M | if (c < 0 || c >= vcount) { |
352 | | /* Found cluster indices out of the 0..vcount-1 range. |
353 | | * Use alternative implementation that supports these. */ |
354 | 0 | igraph_vector_int_destroy(&new_cluster); |
355 | 0 | IGRAPH_FINALLY_CLEAN(1); |
356 | 0 | return igraph_i_reindex_membership_large(membership, new_to_old, nb_clusters); |
357 | 0 | } |
358 | | |
359 | 3.47M | if (VECTOR(new_cluster)[c] == 0) { |
360 | 3.24M | VECTOR(new_cluster)[c] = i_nb_clusters; |
361 | 3.24M | i_nb_clusters += 1; |
362 | 3.24M | if (new_to_old) { |
363 | 0 | IGRAPH_CHECK(igraph_vector_int_push_back(new_to_old, c)); |
364 | 0 | } |
365 | 3.24M | } |
366 | 3.47M | } |
367 | | |
368 | | /* Assign new membership */ |
369 | 3.55M | for (igraph_int_t i = 0; i < vcount; i++) { |
370 | 3.47M | igraph_int_t c = VECTOR(*membership)[i]; |
371 | 3.47M | VECTOR(*membership)[i] = VECTOR(new_cluster)[c] - 1; |
372 | 3.47M | } |
373 | | |
374 | 79.5k | if (nb_clusters) { |
375 | | /* We used the cluster + 1, so correct */ |
376 | 31.9k | *nb_clusters = i_nb_clusters - 1; |
377 | 31.9k | } |
378 | | |
379 | 79.5k | igraph_vector_int_destroy(&new_cluster); |
380 | 79.5k | IGRAPH_FINALLY_CLEAN(1); |
381 | | |
382 | 79.5k | return IGRAPH_SUCCESS; |
383 | 79.5k | } |
384 | | |
385 | | static igraph_error_t igraph_i_compare_communities_vi(const igraph_vector_int_t *v1, |
386 | | const igraph_vector_int_t *v2, igraph_real_t* result); |
387 | | static igraph_error_t igraph_i_compare_communities_nmi(const igraph_vector_int_t *v1, |
388 | | const igraph_vector_int_t *v2, igraph_real_t* result); |
389 | | static igraph_error_t igraph_i_compare_communities_rand(const igraph_vector_int_t *v1, |
390 | | const igraph_vector_int_t *v2, igraph_real_t* result, igraph_bool_t adjust); |
391 | | static igraph_error_t igraph_i_split_join_distance(const igraph_vector_int_t *v1, |
392 | | const igraph_vector_int_t *v2, igraph_int_t* distance12, |
393 | | igraph_int_t* distance21); |
394 | | |
395 | | /** |
396 | | * \ingroup communities |
397 | | * \function igraph_compare_communities |
398 | | * \brief Compares community structures using various metrics. |
399 | | * |
400 | | * This function assesses the distance between two community structures |
401 | | * using the variation of information (VI) metric of Meila (2003), the |
402 | | * normalized mutual information (NMI) of Danon et al (2005), the |
403 | | * split-join distance of van Dongen (2000), the Rand index of Rand (1971) |
404 | | * or the adjusted Rand index of Hubert and Arabie (1985). |
405 | | * |
406 | | * </para><para> |
407 | | * Some of these measures are defined based on the entropy of a discrete |
408 | | * random variable associated with a given clustering \c C of vertices. |
409 | | * Let \c p_i be the probability that a randomly picked vertex would be part |
410 | | * of cluster \c i. Then the entropy of the clustering is |
411 | | * |
412 | | * </para><para> |
413 | | * <code>H(C) = - sum_i p_i log p_i</code> |
414 | | * |
415 | | * </para><para> |
416 | | * Similarly, we can define the joint entropy of two clusterings \c C_1 and \c C_2 |
417 | | * based on the probability \c p_ij that a random vertex is part of cluster \c i |
418 | | * in the first clustering and cluster \c j in the second one: |
419 | | * |
420 | | * </para><para> |
421 | | * <code>H(C_1, C_2) = - sum_ii p_ij log p_ij</code> |
422 | | * |
423 | | * </para><para> |
424 | | * The mutual information of \c C_1 and \c C_2 is then |
425 | | * <code>MI(C_1, C_2) = H(C_1) + H(C_2) - H(C_1, C_2) >= 0 </code>. |
426 | | * A large mutual information indicates a high overlap between the two clusterings. |
427 | | * The normalized mutual information, as computed by igraph, is |
428 | | * |
429 | | * </para><para> |
430 | | * <code>NMI(C_1, C_2) = 2 MI(C_1, C_2) / (H(C_1) + H(C_2))</code>. |
431 | | * |
432 | | * </para><para> |
433 | | * It takes its value from the interval (0, 1], with 1 achieved when the two clusterings |
434 | | * coincide. |
435 | | * |
436 | | * </para><para> |
437 | | * The variation of information is defined as |
438 | | * <code>VI(C_1, C_2) = [H(C_1) - MI(C_1, C_2)] + [H(C_2) - MI(C_1, C_2)]</code>. |
439 | | * Lower values of the variation of information indicate a smaller difference between |
440 | | * the two clusterings, with <code>VI = 0</code> achieved precisely when they coincide. |
441 | | * igraph uses natural units for the variation of information, i.e. it uses the |
442 | | * natural logarithm when computing entropies. |
443 | | * |
444 | | * </para><para> |
445 | | * The Rand index is defined as the probability that the two clusterings agree |
446 | | * about the cluster memberships of a randomly chosen vertex \em pair. All vertex |
447 | | * pairs are considered, and the two clusterings are considered to be in agreement |
448 | | * about the memberships of a vertex pair if either the two vertices are in the |
449 | | * same cluster in both clusterings, or they are in different clusters in both |
450 | | * clusterings. The Rand index is then the number of vertex pairs in agreement, |
451 | | * divided by the total number of vertex pairs. A Rand index of zero means that |
452 | | * the two clusterings disagree about the membership of all vertex pairs, while |
453 | | * 1 means that the two clusterings are identical. |
454 | | * |
455 | | * </para><para> |
456 | | * The adjusted Rand index is similar to the Rand index, but it takes into |
457 | | * account that agreement between the two clusterings may also occur by chance |
458 | | * even if the two clusterings are chosen completely randomly. The adjusted |
459 | | * Rand index therefore subtracts the expected fraction of agreements from the |
460 | | * value of the Rand index, and divides the result by one minus the expected |
461 | | * fraction of agreements. The maximum value of the adjusted Rand index is |
462 | | * still 1 (similarly to the Rand index), indicating maximum agreement, but |
463 | | * the value may be less than zero if there is \em less agreement between the |
464 | | * two clusterings than what would be expected by chance. |
465 | | * |
466 | | * </para><para> |
467 | | * For an explanation of the split-join distance, see \ref igraph_split_join_distance(). |
468 | | * |
469 | | * </para><para> |
470 | | * References: |
471 | | * |
472 | | * </para><para> |
473 | | * Meilă M: Comparing clusterings by the variation of information. |
474 | | * In: Schölkopf B, Warmuth MK (eds.). Learning Theory and Kernel Machines: |
475 | | * 16th Annual Conference on Computational Learning Theory and 7th Kernel |
476 | | * Workshop, COLT/Kernel 2003, Washington, DC, USA. Lecture Notes in Computer |
477 | | * Science, vol. 2777, Springer, 2003. ISBN: 978-3-540-40720-1. |
478 | | * https://doi.org/10.1007/978-3-540-45167-9_14 |
479 | | * |
480 | | * </para><para> |
481 | | * Danon L, Diaz-Guilera A, Duch J, Arenas A: Comparing community structure |
482 | | * identification. J Stat Mech P09008, 2005. |
483 | | * https://doi.org/10.1088/1742-5468/2005/09/P09008 |
484 | | * |
485 | | * </para><para> |
486 | | * van Dongen S: Performance criteria for graph clustering and Markov cluster |
487 | | * experiments. Technical Report INS-R0012, National Research Institute for |
488 | | * Mathematics and Computer Science in the Netherlands, Amsterdam, May 2000. |
489 | | * https://ir.cwi.nl/pub/4461 |
490 | | * |
491 | | * </para><para> |
492 | | * Rand WM: Objective criteria for the evaluation of clustering methods. |
493 | | * J Am Stat Assoc 66(336):846-850, 1971. |
494 | | * https://doi.org/10.2307/2284239 |
495 | | * |
496 | | * </para><para> |
497 | | * Hubert L and Arabie P: Comparing partitions. Journal of Classification |
498 | | * 2:193-218, 1985. |
499 | | * https://doi.org/10.1007/BF01908075 |
500 | | * |
501 | | * \param comm1 the membership vector of the first community structure |
502 | | * \param comm2 the membership vector of the second community structure |
503 | | * \param result the result is stored here. |
504 | | * \param method the comparison method to use. \c IGRAPH_COMMCMP_VI |
505 | | * selects the variation of information (VI) metric of |
506 | | * Meila (2003), \c IGRAPH_COMMCMP_NMI selects the |
507 | | * normalized mutual information measure proposed by |
508 | | * Danon et al (2005), \c IGRAPH_COMMCMP_SPLIT_JOIN |
509 | | * selects the split-join distance of van Dongen (2000), |
510 | | * \c IGRAPH_COMMCMP_RAND selects the unadjusted Rand |
511 | | * index (1971) and \c IGRAPH_COMMCMP_ADJUSTED_RAND |
512 | | * selects the adjusted Rand index. |
513 | | * |
514 | | * \return Error code. |
515 | | * |
516 | | * \sa \ref igraph_split_join_distance(). |
517 | | * |
518 | | * Time complexity: O(n log(n)). |
519 | | */ |
520 | | igraph_error_t igraph_compare_communities(const igraph_vector_int_t *comm1, |
521 | | const igraph_vector_int_t *comm2, igraph_real_t* result, |
522 | 16.4k | igraph_community_comparison_t method) { |
523 | 16.4k | igraph_vector_int_t c1, c2; |
524 | | |
525 | 16.4k | if (igraph_vector_int_size(comm1) != igraph_vector_int_size(comm2)) { |
526 | 0 | IGRAPH_ERROR("community membership vectors have different lengths", IGRAPH_EINVAL); |
527 | 0 | } |
528 | | |
529 | | /* Copy and reindex membership vectors to make sure they are continuous */ |
530 | 16.4k | IGRAPH_CHECK(igraph_vector_int_init_copy(&c1, comm1)); |
531 | 16.4k | IGRAPH_FINALLY(igraph_vector_int_destroy, &c1); |
532 | | |
533 | 16.4k | IGRAPH_CHECK(igraph_vector_int_init_copy(&c2, comm2)); |
534 | 16.4k | IGRAPH_FINALLY(igraph_vector_int_destroy, &c2); |
535 | | |
536 | 16.4k | IGRAPH_CHECK(igraph_reindex_membership(&c1, NULL, NULL)); |
537 | 16.4k | IGRAPH_CHECK(igraph_reindex_membership(&c2, NULL, NULL)); |
538 | | |
539 | 16.4k | switch (method) { |
540 | 3.29k | case IGRAPH_COMMCMP_VI: |
541 | 3.29k | IGRAPH_CHECK(igraph_i_compare_communities_vi(&c1, &c2, result)); |
542 | 3.29k | break; |
543 | | |
544 | 3.29k | case IGRAPH_COMMCMP_NMI: |
545 | 3.29k | IGRAPH_CHECK(igraph_i_compare_communities_nmi(&c1, &c2, result)); |
546 | 3.29k | break; |
547 | | |
548 | 3.29k | case IGRAPH_COMMCMP_SPLIT_JOIN: { |
549 | 3.29k | igraph_int_t d12, d21; |
550 | 3.29k | IGRAPH_CHECK(igraph_i_split_join_distance(&c1, &c2, &d12, &d21)); |
551 | 3.29k | *result = d12 + d21; |
552 | 3.29k | } |
553 | 0 | break; |
554 | | |
555 | 3.28k | case IGRAPH_COMMCMP_RAND: |
556 | 6.56k | case IGRAPH_COMMCMP_ADJUSTED_RAND: |
557 | 6.56k | IGRAPH_CHECK(igraph_i_compare_communities_rand(&c1, &c2, result, |
558 | 6.56k | method == IGRAPH_COMMCMP_ADJUSTED_RAND)); |
559 | 6.56k | break; |
560 | | |
561 | 6.56k | default: |
562 | 0 | IGRAPH_ERROR("unknown community comparison method", IGRAPH_EINVAL); |
563 | 16.4k | } |
564 | | |
565 | | /* Clean up everything */ |
566 | 16.4k | igraph_vector_int_destroy(&c1); |
567 | 16.4k | igraph_vector_int_destroy(&c2); |
568 | 16.4k | IGRAPH_FINALLY_CLEAN(2); |
569 | | |
570 | 16.4k | return IGRAPH_SUCCESS; |
571 | 16.4k | } |
572 | | |
573 | | /** |
574 | | * \ingroup communities |
575 | | * \function igraph_split_join_distance |
576 | | * \brief Calculates the split-join distance of two community structures. |
577 | | * |
578 | | * The split-join distance between partitions A and B is the sum of the |
579 | | * projection distance of A from B and the projection distance of B from |
580 | | * A. The projection distance is an asymmetric measure and it is defined |
581 | | * as follows: |
582 | | * |
583 | | * </para><para> |
584 | | * First, each set in partition A is evaluated against all sets in partition |
585 | | * B. For each set in partition A, the best matching set in partition B is |
586 | | * found and the overlap size is calculated. (Matching is quantified by the |
587 | | * size of the overlap between the two sets). Then, the maximal overlap sizes |
588 | | * for each set in A are summed together and subtracted from the number of |
589 | | * elements in A. |
590 | | * |
591 | | * </para><para> |
592 | | * The split-join distance will be returned in two arguments, \c distance12 |
593 | | * will contain the projection distance of the first partition from the |
594 | | * second, while \c distance21 will be the projection distance of the second |
595 | | * partition from the first. This makes it easier to detect whether a |
596 | | * partition is a subpartition of the other, since in this case, the |
597 | | * corresponding distance will be zero. |
598 | | * |
599 | | * </para><para> |
600 | | * Reference: |
601 | | * |
602 | | * </para><para> |
603 | | * van Dongen S: Performance criteria for graph clustering and Markov cluster |
604 | | * experiments. Technical Report INS-R0012, National Research Institute for |
605 | | * Mathematics and Computer Science in the Netherlands, Amsterdam, May 2000. |
606 | | * |
607 | | * \param comm1 the membership vector of the first community structure |
608 | | * \param comm2 the membership vector of the second community structure |
609 | | * \param distance12 pointer to an \c igraph_int_t, the projection distance |
610 | | * of the first community structure from the second one will be |
611 | | * returned here. |
612 | | * \param distance21 pointer to an \c igraph_int_t, the projection distance |
613 | | * of the second community structure from the first one will be |
614 | | * returned here. |
615 | | * \return Error code. |
616 | | * |
617 | | * \sa \ref igraph_compare_communities() with the \c IGRAPH_COMMCMP_SPLIT_JOIN |
618 | | * method if you are not interested in the individual distances but only the sum |
619 | | * of them. |
620 | | * |
621 | | * Time complexity: O(n log(n)). |
622 | | */ |
623 | | igraph_error_t igraph_split_join_distance(const igraph_vector_int_t *comm1, |
624 | | const igraph_vector_int_t *comm2, igraph_int_t *distance12, |
625 | 0 | igraph_int_t *distance21) { |
626 | 0 | igraph_vector_int_t c1, c2; |
627 | |
|
628 | 0 | if (igraph_vector_int_size(comm1) != igraph_vector_int_size(comm2)) { |
629 | 0 | IGRAPH_ERRORF("Community membership vectors have different lengths: %" IGRAPH_PRId " and %" IGRAPH_PRId ".", |
630 | 0 | IGRAPH_EINVAL, igraph_vector_int_size(comm1), igraph_vector_int_size(comm2)); |
631 | 0 | } |
632 | | |
633 | | /* Copy and reindex membership vectors to make sure they are continuous */ |
634 | 0 | IGRAPH_CHECK(igraph_vector_int_init_copy(&c1, comm1)); |
635 | 0 | IGRAPH_FINALLY(igraph_vector_int_destroy, &c1); |
636 | |
|
637 | 0 | IGRAPH_CHECK(igraph_vector_int_init_copy(&c2, comm2)); |
638 | 0 | IGRAPH_FINALLY(igraph_vector_int_destroy, &c2); |
639 | |
|
640 | 0 | IGRAPH_CHECK(igraph_reindex_membership(&c1, NULL, NULL)); |
641 | 0 | IGRAPH_CHECK(igraph_reindex_membership(&c2, NULL, NULL)); |
642 | | |
643 | 0 | IGRAPH_CHECK(igraph_i_split_join_distance(&c1, &c2, distance12, distance21)); |
644 | | |
645 | | /* Clean up everything */ |
646 | 0 | igraph_vector_int_destroy(&c1); |
647 | 0 | igraph_vector_int_destroy(&c2); |
648 | 0 | IGRAPH_FINALLY_CLEAN(2); |
649 | |
|
650 | 0 | return IGRAPH_SUCCESS; |
651 | 0 | } |
652 | | |
653 | | /** |
654 | | * Calculates the entropy and the mutual information for two reindexed community |
655 | | * membership vectors v1 and v2. This is needed by both Meila's and Danon's |
656 | | * community comparison measure. |
657 | | */ |
658 | | static igraph_error_t igraph_i_entropy_and_mutual_information(const igraph_vector_int_t* v1, |
659 | 6.59k | const igraph_vector_int_t* v2, double* h1, double* h2, double* mut_inf) { |
660 | 6.59k | igraph_int_t i, n; |
661 | 6.59k | igraph_int_t k1; |
662 | 6.59k | igraph_int_t k2; |
663 | 6.59k | igraph_real_t *p1, *p2; |
664 | 6.59k | igraph_sparsemat_t m; |
665 | 6.59k | igraph_sparsemat_t mu; /* uncompressed */ |
666 | 6.59k | igraph_sparsemat_iterator_t mit; |
667 | | |
668 | 6.59k | n = igraph_vector_int_size(v1); |
669 | 6.59k | if (n == 0) { |
670 | 2 | *h1 = 0; |
671 | 2 | *h2 = 0; |
672 | 2 | *mut_inf = 0; |
673 | 2 | return IGRAPH_SUCCESS; |
674 | 2 | } |
675 | 6.59k | k1 = igraph_vector_int_max(v1) + 1; |
676 | 6.59k | k2 = igraph_vector_int_max(v2) + 1; |
677 | 6.59k | p1 = IGRAPH_CALLOC(k1, igraph_real_t); |
678 | 6.59k | if (p1 == 0) { |
679 | 0 | IGRAPH_ERROR("Insufficient memory for computing community entropy.", IGRAPH_ENOMEM); |
680 | 0 | } |
681 | 6.59k | IGRAPH_FINALLY(igraph_free, p1); |
682 | 6.59k | p2 = IGRAPH_CALLOC(k2, igraph_real_t); |
683 | 6.59k | if (p2 == 0) { |
684 | 0 | IGRAPH_ERROR("Insufficient memory for computing community entropy.", IGRAPH_ENOMEM); |
685 | 0 | } |
686 | 6.59k | IGRAPH_FINALLY(igraph_free, p2); |
687 | | |
688 | | /* Calculate the entropy of v1 */ |
689 | 6.59k | *h1 = 0.0; |
690 | 302k | for (i = 0; i < n; i++) { |
691 | 296k | p1[VECTOR(*v1)[i]]++; |
692 | 296k | } |
693 | 300k | for (i = 0; i < k1; i++) { |
694 | 294k | p1[i] /= n; |
695 | 294k | *h1 -= p1[i] * log(p1[i]); |
696 | 294k | } |
697 | | |
698 | | /* Calculate the entropy of v2 */ |
699 | 6.59k | *h2 = 0.0; |
700 | 302k | for (i = 0; i < n; i++) { |
701 | 296k | p2[VECTOR(*v2)[i]]++; |
702 | 296k | } |
703 | 250k | for (i = 0; i < k2; i++) { |
704 | 244k | p2[i] /= n; |
705 | 244k | *h2 -= p2[i] * log(p2[i]); |
706 | 244k | } |
707 | | |
708 | | /* We will only need the logs of p1 and p2 from now on */ |
709 | 300k | for (i = 0; i < k1; i++) { |
710 | 294k | p1[i] = log(p1[i]); |
711 | 294k | } |
712 | 250k | for (i = 0; i < k2; i++) { |
713 | 244k | p2[i] = log(p2[i]); |
714 | 244k | } |
715 | | |
716 | | /* Calculate the mutual information of v1 and v2 */ |
717 | 6.59k | *mut_inf = 0.0; |
718 | 6.59k | IGRAPH_CHECK(igraph_sparsemat_init(&mu, k1, k2, n)); |
719 | 6.59k | IGRAPH_FINALLY(igraph_sparsemat_destroy, &mu); |
720 | 302k | for (i = 0; i < n; i++) { |
721 | 296k | IGRAPH_CHECK(igraph_sparsemat_entry( |
722 | 296k | &mu, VECTOR(*v1)[i], |
723 | 296k | VECTOR(*v2)[i], 1 |
724 | 296k | )); |
725 | 296k | } |
726 | | |
727 | 6.59k | IGRAPH_CHECK(igraph_sparsemat_compress(&mu, &m)); |
728 | 6.59k | IGRAPH_FINALLY(igraph_sparsemat_destroy, &m); |
729 | 6.59k | IGRAPH_CHECK(igraph_sparsemat_dupl(&m)); |
730 | | |
731 | 6.59k | IGRAPH_CHECK(igraph_sparsemat_iterator_init(&mit, &m)); |
732 | 300k | while (!igraph_sparsemat_iterator_end(&mit)) { |
733 | 294k | double p = igraph_sparsemat_iterator_get(&mit)/ n; |
734 | 294k | *mut_inf += p * (log(p) - p1[igraph_sparsemat_iterator_row(&mit)] - p2[igraph_sparsemat_iterator_col(&mit)]); |
735 | 294k | igraph_sparsemat_iterator_next(&mit); |
736 | 294k | } |
737 | 6.59k | igraph_sparsemat_destroy(&m); |
738 | 6.59k | igraph_sparsemat_destroy(&mu); |
739 | 6.59k | IGRAPH_FREE(p1); IGRAPH_FREE(p2); |
740 | | |
741 | 6.59k | IGRAPH_FINALLY_CLEAN(4); |
742 | | |
743 | 6.59k | return IGRAPH_SUCCESS; |
744 | 6.59k | } |
745 | | |
746 | | /** |
747 | | * Implementation of the normalized mutual information (NMI) measure of |
748 | | * Danon et al. This function assumes that the community membership |
749 | | * vectors have already been normalized using igraph_reindex_membership(). |
750 | | * |
751 | | * </para><para> |
752 | | * Reference: Danon L, Diaz-Guilera A, Duch J, Arenas A: Comparing community |
753 | | * structure identification. J Stat Mech P09008, 2005. |
754 | | * |
755 | | * </para><para> |
756 | | * Time complexity: O(n log(n)) |
757 | | */ |
758 | | static igraph_error_t igraph_i_compare_communities_nmi(const igraph_vector_int_t *v1, const igraph_vector_int_t *v2, |
759 | 3.29k | igraph_real_t* result) { |
760 | 3.29k | double h1, h2, mut_inf; |
761 | | |
762 | 3.29k | IGRAPH_CHECK(igraph_i_entropy_and_mutual_information(v1, v2, &h1, &h2, &mut_inf)); |
763 | | |
764 | 3.29k | if (h1 == 0 && h2 == 0) { |
765 | 31 | *result = 1; |
766 | 3.26k | } else { |
767 | 3.26k | *result = 2 * mut_inf / (h1 + h2); |
768 | 3.26k | } |
769 | | |
770 | 3.29k | return IGRAPH_SUCCESS; |
771 | 3.29k | } |
772 | | |
773 | | /** |
774 | | * Implementation of the variation of information metric (VI) of |
775 | | * Meila et al. This function assumes that the community membership |
776 | | * vectors have already been normalized using igraph_reindex_membership(). |
777 | | * |
778 | | * </para><para> |
779 | | * Reference: Meila M: Comparing clusterings by the variation of information. |
780 | | * In: Schölkopf B, Warmuth MK (eds.). Learning Theory and Kernel Machines: |
781 | | * 16th Annual Conference on Computational Learning Theory and 7th Kernel |
782 | | * Workshop, COLT/Kernel 2003, Washington, DC, USA. Lecture Notes in Computer |
783 | | * Science, vol. 2777, Springer, 2003. ISBN: 978-3-540-40720-1. |
784 | | * |
785 | | * </para><para> |
786 | | * Time complexity: O(n log(n)) |
787 | | */ |
788 | | static igraph_error_t igraph_i_compare_communities_vi(const igraph_vector_int_t *v1, const igraph_vector_int_t *v2, |
789 | 3.29k | igraph_real_t* result) { |
790 | 3.29k | double h1, h2, mut_inf; |
791 | | |
792 | 3.29k | IGRAPH_CHECK(igraph_i_entropy_and_mutual_information(v1, v2, &h1, &h2, &mut_inf)); |
793 | 3.29k | *result = h1 + h2 - 2 * mut_inf; |
794 | | |
795 | 3.29k | return IGRAPH_SUCCESS; |
796 | 3.29k | } |
797 | | |
798 | | /** |
799 | | * \brief Calculates the confusion matrix for two clusterings. |
800 | | * |
801 | | * </para><para> |
802 | | * This function assumes that the community membership vectors have already |
803 | | * been normalized using igraph_reindex_membership(). |
804 | | * |
805 | | * </para><para> |
806 | | * Time complexity: O(n log(max(k1, k2))), where n is the number of vertices, k1 |
807 | | * and k2 are the number of clusters in each of the clusterings. |
808 | | */ |
809 | | static igraph_error_t igraph_i_confusion_matrix(const igraph_vector_int_t *v1, const igraph_vector_int_t *v2, |
810 | 9.86k | igraph_sparsemat_t *m) { |
811 | 9.86k | igraph_int_t k1, k2, i, n; |
812 | | |
813 | 9.86k | n = igraph_vector_int_size(v1); |
814 | 9.86k | if (n == 0) { |
815 | 0 | IGRAPH_CHECK(igraph_sparsemat_resize(m, 0, 0, 0)); |
816 | 0 | return IGRAPH_SUCCESS; |
817 | 0 | } |
818 | | |
819 | 9.86k | k1 = igraph_vector_int_max(v1) + 1; |
820 | 9.86k | k2 = igraph_vector_int_max(v2) + 1; |
821 | 9.86k | IGRAPH_CHECK(igraph_sparsemat_resize(m, k1, k2, n)); |
822 | 454k | for (i = 0; i < n; i++) { |
823 | 444k | IGRAPH_CHECK(igraph_sparsemat_entry( |
824 | 444k | m, VECTOR(*v1)[i], VECTOR(*v2)[i], 1 |
825 | 444k | )); |
826 | 444k | } |
827 | | |
828 | 9.86k | return IGRAPH_SUCCESS; |
829 | 9.86k | } |
830 | | |
831 | | /** |
832 | | * Implementation of the split-join distance of van Dongen. |
833 | | * |
834 | | * </para><para> |
835 | | * This function assumes that the community membership vectors have already |
836 | | * been normalized using igraph_reindex_membership(). |
837 | | * |
838 | | * </para><para> |
839 | | * Reference: van Dongen S: Performance criteria for graph clustering and Markov |
840 | | * cluster experiments. Technical Report INS-R0012, National Research Institute |
841 | | * for Mathematics and Computer Science in the Netherlands, Amsterdam, May 2000. |
842 | | * |
843 | | * </para><para> |
844 | | * Time complexity: O(n log(max(k1, k2))), where n is the number of vertices, k1 |
845 | | * and k2 are the number of clusters in each of the clusterings. |
846 | | */ |
847 | | static igraph_error_t igraph_i_split_join_distance(const igraph_vector_int_t *v1, const igraph_vector_int_t *v2, |
848 | 3.29k | igraph_int_t* distance12, igraph_int_t* distance21) { |
849 | 3.29k | igraph_int_t n = igraph_vector_int_size(v1); |
850 | 3.29k | igraph_vector_t rowmax, colmax; |
851 | 3.29k | igraph_sparsemat_t m; |
852 | 3.29k | igraph_sparsemat_t mu; /* uncompressed */ |
853 | 3.29k | igraph_sparsemat_iterator_t mit; |
854 | | |
855 | 3.29k | if (n == 0) { |
856 | 1 | *distance12 = 0; |
857 | 1 | *distance21 = 0; |
858 | 1 | return IGRAPH_SUCCESS; |
859 | 1 | } |
860 | | /* Calculate the confusion matrix */ |
861 | 3.29k | IGRAPH_CHECK(igraph_sparsemat_init(&mu, 1, 1, 0)); |
862 | 3.29k | IGRAPH_FINALLY(igraph_sparsemat_destroy, &mu); |
863 | 3.29k | IGRAPH_CHECK(igraph_i_confusion_matrix(v1, v2, &mu)); |
864 | | |
865 | | /* Initialize vectors that will store the row/columnwise maxima */ |
866 | 3.29k | IGRAPH_VECTOR_INIT_FINALLY(&rowmax, igraph_sparsemat_nrow(&mu)); |
867 | 3.29k | IGRAPH_VECTOR_INIT_FINALLY(&colmax, igraph_sparsemat_ncol(&mu)); |
868 | | |
869 | | /* Find the row/columnwise maxima */ |
870 | 3.29k | igraph_sparsemat_compress(&mu, &m); |
871 | 3.29k | IGRAPH_FINALLY(igraph_sparsemat_destroy, &m); |
872 | 3.29k | IGRAPH_CHECK(igraph_sparsemat_dupl(&m)); |
873 | 3.29k | IGRAPH_CHECK(igraph_sparsemat_iterator_init(&mit, &m)); |
874 | 150k | while (!igraph_sparsemat_iterator_end(&mit)) { |
875 | 147k | igraph_real_t value = igraph_sparsemat_iterator_get(&mit); |
876 | 147k | igraph_int_t row = igraph_sparsemat_iterator_row(&mit); |
877 | 147k | igraph_int_t col = igraph_sparsemat_iterator_col(&mit); |
878 | 147k | if (value > VECTOR(rowmax)[row]) { |
879 | 147k | VECTOR(rowmax)[row] = value; |
880 | 147k | } |
881 | 147k | if (value > VECTOR(colmax)[col]) { |
882 | 122k | VECTOR(colmax)[col] = value; |
883 | 122k | } |
884 | 147k | igraph_sparsemat_iterator_next(&mit); |
885 | 147k | } |
886 | | |
887 | | /* Calculate the distances */ |
888 | 3.29k | *distance12 = (igraph_int_t) (n - igraph_vector_sum(&rowmax)); |
889 | 3.29k | *distance21 = (igraph_int_t) (n - igraph_vector_sum(&colmax)); |
890 | | |
891 | 3.29k | igraph_vector_destroy(&rowmax); |
892 | 3.29k | igraph_vector_destroy(&colmax); |
893 | 3.29k | igraph_sparsemat_destroy(&m); |
894 | 3.29k | igraph_sparsemat_destroy(&mu); |
895 | 3.29k | IGRAPH_FINALLY_CLEAN(4); |
896 | | |
897 | 3.29k | return IGRAPH_SUCCESS; |
898 | 3.29k | } |
899 | | |
900 | | /** |
901 | | * Implementation of the adjusted and unadjusted Rand indices. |
902 | | * |
903 | | * </para><para> |
904 | | * This function assumes that the community membership vectors have already |
905 | | * been normalized using igraph_reindex_membership(). |
906 | | * |
907 | | * </para><para> |
908 | | * References: |
909 | | * |
910 | | * </para><para> |
911 | | * Rand WM: Objective criteria for the evaluation of clustering methods. J Am |
912 | | * Stat Assoc 66(336):846-850, 1971. |
913 | | * |
914 | | * </para><para> |
915 | | * Hubert L and Arabie P: Comparing partitions. Journal of Classification |
916 | | * 2:193-218, 1985. |
917 | | * |
918 | | * </para><para> |
919 | | * Time complexity: O(n log(max(k1, k2))), where n is the number of vertices, k1 |
920 | | * and k2 are the number of clusters in each of the clusterings. |
921 | | */ |
922 | | static igraph_error_t igraph_i_compare_communities_rand( |
923 | | const igraph_vector_int_t *v1, const igraph_vector_int_t *v2, |
924 | 6.56k | igraph_real_t *result, igraph_bool_t adjust) { |
925 | 6.56k | igraph_sparsemat_t m; |
926 | 6.56k | igraph_sparsemat_t mu; /* uncompressed */ |
927 | 6.56k | igraph_sparsemat_iterator_t mit; |
928 | 6.56k | igraph_vector_t rowsums, colsums; |
929 | 6.56k | igraph_int_t i, nrow, ncol; |
930 | 6.56k | igraph_real_t rand, n; |
931 | 6.56k | igraph_real_t frac_pairs_in_1, frac_pairs_in_2; |
932 | | |
933 | 6.56k | if (igraph_vector_int_size(v1) <= 1) { |
934 | 0 | IGRAPH_ERRORF("Rand indices not defined for only zero or one vertices. " |
935 | 0 | "Found membership vector of size %" IGRAPH_PRId ".", IGRAPH_EINVAL, igraph_vector_int_size(v1)); |
936 | 0 | } |
937 | | |
938 | | /* Calculate the confusion matrix */ |
939 | 6.56k | IGRAPH_CHECK(igraph_sparsemat_init(&mu, 1, 1, 0)); |
940 | 6.56k | IGRAPH_FINALLY(igraph_sparsemat_destroy, &mu); |
941 | 6.56k | IGRAPH_CHECK(igraph_i_confusion_matrix(v1, v2, &mu)); |
942 | | |
943 | | /* The unadjusted Rand index is defined as (a+d) / (a+b+c+d), where: |
944 | | * |
945 | | * - a is the number of pairs in the same cluster both in v1 and v2. This |
946 | | * equals the sum of n(i,j) choose 2 for all i and j. |
947 | | * |
948 | | * - b is the number of pairs in the same cluster in v1 and in different |
949 | | * clusters in v2. This is sum n(i,*) choose 2 for all i minus a. |
950 | | * n(i,*) is the number of elements in cluster i in v1. |
951 | | * |
952 | | * - c is the number of pairs in the same cluster in v2 and in different |
953 | | * clusters in v1. This is sum n(*,j) choose 2 for all j minus a. |
954 | | * n(*,j) is the number of elements in cluster j in v2. |
955 | | * |
956 | | * - d is (n choose 2) - a - b - c. |
957 | | * |
958 | | * Therefore, a+d = (n choose 2) - b - c |
959 | | * = (n choose 2) - sum (n(i,*) choose 2) |
960 | | * - sum (n(*,j) choose 2) |
961 | | * + 2 * sum (n(i,j) choose 2). |
962 | | * |
963 | | * Since a+b+c+d = (n choose 2) and this goes in the denominator, we can |
964 | | * just as well start dividing each term in a+d by (n choose 2), which |
965 | | * yields: |
966 | | * |
967 | | * 1 - sum( n(i,*)/n * (n(i,*)-1)/(n-1) ) |
968 | | * - sum( n(*,i)/n * (n(*,i)-1)/(n-1) ) |
969 | | * + sum( n(i,j)/n * (n(i,j)-1)/(n-1) ) * 2 |
970 | | */ |
971 | | |
972 | | /* Calculate row and column sums */ |
973 | 6.56k | nrow = igraph_sparsemat_nrow(&mu); |
974 | 6.56k | ncol = igraph_sparsemat_ncol(&mu); |
975 | 6.56k | n = igraph_vector_int_size(v1); |
976 | 6.56k | IGRAPH_VECTOR_INIT_FINALLY(&rowsums, nrow); |
977 | 6.56k | IGRAPH_VECTOR_INIT_FINALLY(&colsums, ncol); |
978 | 6.56k | IGRAPH_CHECK(igraph_sparsemat_rowsums(&mu, &rowsums)); |
979 | 6.56k | IGRAPH_CHECK(igraph_sparsemat_colsums(&mu, &colsums)); |
980 | | |
981 | | /* Start calculating the unadjusted Rand index */ |
982 | 6.56k | rand = 0.0; |
983 | 6.56k | igraph_sparsemat_compress(&mu, &m); |
984 | 6.56k | IGRAPH_FINALLY(igraph_sparsemat_destroy, &m); |
985 | 6.56k | IGRAPH_CHECK(igraph_sparsemat_dupl(&m)); |
986 | | |
987 | 6.56k | IGRAPH_CHECK(igraph_sparsemat_iterator_init(&mit, &m)); |
988 | 300k | while (!igraph_sparsemat_iterator_end(&mit)) { |
989 | 294k | igraph_real_t value = igraph_sparsemat_iterator_get(&mit); |
990 | 294k | rand += (value / n) * (value - 1) / (n - 1); |
991 | 294k | igraph_sparsemat_iterator_next(&mit); |
992 | 294k | } |
993 | | |
994 | 6.56k | frac_pairs_in_1 = frac_pairs_in_2 = 0.0; |
995 | 300k | for (i = 0; i < nrow; i++) { |
996 | 294k | frac_pairs_in_1 += (VECTOR(rowsums)[i] / n) * (VECTOR(rowsums)[i] - 1) / (n - 1); |
997 | 294k | } |
998 | 250k | for (i = 0; i < ncol; i++) { |
999 | 244k | frac_pairs_in_2 += (VECTOR(colsums)[i] / n) * (VECTOR(colsums)[i] - 1) / (n - 1); |
1000 | 244k | } |
1001 | | |
1002 | 6.56k | rand = 1.0 + 2 * rand - frac_pairs_in_1 - frac_pairs_in_2; |
1003 | | |
1004 | 6.56k | if (adjust) { |
1005 | 3.28k | double expected = frac_pairs_in_1 * frac_pairs_in_2 + |
1006 | 3.28k | (1 - frac_pairs_in_1) * (1 - frac_pairs_in_2); |
1007 | 3.28k | rand = (rand - expected) / (1 - expected); |
1008 | 3.28k | } |
1009 | | |
1010 | 6.56k | igraph_vector_destroy(&rowsums); |
1011 | 6.56k | igraph_vector_destroy(&colsums); |
1012 | 6.56k | igraph_sparsemat_destroy(&m); |
1013 | 6.56k | igraph_sparsemat_destroy(&mu); |
1014 | 6.56k | IGRAPH_FINALLY_CLEAN(4); |
1015 | | |
1016 | 6.56k | *result = rand; |
1017 | | |
1018 | 6.56k | return IGRAPH_SUCCESS; |
1019 | 6.56k | } |