Coverage Report

Created: 2026-01-10 06:12

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/igraph/src/community/community_misc.c
Line
Count
Source
1
/*
2
   igraph library.
3
   Copyright (C) 2007-2024  The igraph development team <igraph@igraph.org>
4
5
   This program is free software; you can redistribute it and/or modify
6
   it under the terms of the GNU General Public License as published by
7
   the Free Software Foundation; either version 2 of the License, or
8
   (at your option) any later version.
9
10
   This program is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU General Public License for more details.
14
15
   You should have received a copy of the GNU General Public License
16
   along with this program.  If not, see <https://www.gnu.org/licenses/>.
17
*/
18
19
#include "igraph_community.h"
20
#include "igraph_memory.h"
21
#include "igraph_sparsemat.h"
22
23
#include "community/community_internal.h"
24
25
#include <string.h>
26
#include <math.h>
27
28
/**
29
 * \section about_community
30
 *
31
 * <para>
32
 * Community detection is concerned with clustering the vertices of networks
33
 * into tightly connected subgraphs called "communities". The following
34
 * references provide a good introduction to the topic of community detection:
35
 * </para>
36
 *
37
 * <para>
38
 * S. Fortunato:
39
 * "Community Detection in Graphs".
40
 * Physics Reports 486, no. 3–5 (2010): 75–174.
41
 * https://doi.org/10.1016/j.physrep.2009.11.002.
42
 * </para>
43
 *
44
 * <para>
45
 * S. Fortunato and D. Hric:
46
 * "Community Detection in Networks: A User Guide".
47
 * Physics Reports 659 (2016): 1–44.
48
 * https://doi.org/10.1016/j.physrep.2016.09.002.
49
 * </para>
50
 */
51
52
/**
53
 * \function igraph_community_to_membership
54
 * \brief Cut a dendrogram after a given number of merges.
55
 *
56
 * This function creates a membership vector from a dendrogram whose leaves
57
 * are individual vertices by cutting it at the specified level. It produces
58
 * a membership vector that contains for each vertex its cluster ID, numbered
59
 * from zero. This is the same membership vector format that is produced by
60
 * \ref igraph_connected_components(), as well as all community detection
61
 * functions in igraph.
62
 *
63
 * </para><para>
64
 * It takes as input the number of vertices \p n, and a \p merges matrix
65
 * encoding the dendrogram, in the format produced by hierarchical clustering
66
 * functions such as \ref igraph_community_edge_betweenness(),
67
 * \ref igraph_community_walktrap() or \ref igraph_community_fastgreedy().
68
 * The matrix must have two columns and up to <code>n - 1</code> rows.
69
 * Each row represents merging two dendrogram nodes into their parent node.
70
 * The leaf nodes of the dendrogram are indexed from 0 to <code>n - 1</code>
71
 * and are identical to the vertices of the graph that is being partitioned
72
 * into communities. Row \c i contains the children of dendrogram node
73
 * with index <code>n + i</code>.
74
 *
75
 * </para><para>
76
 * This function performs \p steps merge operations as prescribed by
77
 * the \p merges matrix and returns the resulting partitioning into
78
 * <code>n - steps</code> communities.
79
 *
80
 * </para><para>
81
 * If \p merges is not a complete dendrogram, it is possible to
82
 * take \p steps steps if \p steps is not bigger than the number
83
 * lines in \p merges.
84
 *
85
 * \param merges The two-column matrix containing the merge operations.
86
 * \param nodes The number of leaf nodes in the dendrogram.
87
 * \param steps Integer constant, the number of steps to take.
88
 * \param membership Pointer to an initialized vector, the membership
89
 *    results will be stored here, if not \c NULL. The vector will be
90
 *    resized as needed.
91
 * \param csize Pointer to an initialized vector, or \c NULL. If not \c NULL
92
 *    then the sizes of the components will be stored here, the vector
93
 *    will be resized as needed.
94
 * \return Error code.
95
 *
96
 * \sa \ref igraph_community_walktrap(), \ref
97
 * igraph_community_edge_betweenness(), \ref
98
 * igraph_community_fastgreedy() for community structure detection
99
 * algorithms producing merge matrices in this format;
100
 * \ref igraph_le_community_to_membership() to perform merges
101
 * starting from a given cluster assignment.
102
 *
103
 * Time complexity: O(|V|), the number of vertices in the graph.
104
 */
105
igraph_error_t igraph_community_to_membership(const igraph_matrix_int_t *merges,
106
                                   igraph_int_t nodes,
107
                                   igraph_int_t steps,
108
                                   igraph_vector_int_t *membership,
109
6.59k
                                   igraph_vector_int_t *csize) {
110
111
6.59k
    const igraph_int_t no_of_nodes = nodes;
112
6.59k
    const igraph_int_t components = no_of_nodes - steps;
113
6.59k
    igraph_int_t found = 0;
114
6.59k
    igraph_vector_int_t tmp;
115
6.59k
    igraph_vector_bool_t already_merged;
116
6.59k
    igraph_vector_int_t own_membership;
117
6.59k
    igraph_bool_t using_own_membership = false;
118
119
6.59k
    if (steps > igraph_matrix_int_nrow(merges)) {
120
0
        IGRAPH_ERRORF("Number of steps is greater than number of rows in merges matrix: found %"
121
0
                      IGRAPH_PRId " steps, %" IGRAPH_PRId " rows.", IGRAPH_EINVAL, steps, igraph_matrix_int_nrow(merges));
122
0
    }
123
124
6.59k
    if (igraph_matrix_int_ncol(merges) != 2) {
125
0
        IGRAPH_ERRORF("The merges matrix should have two columns, but has %" IGRAPH_PRId ".",
126
0
                      IGRAPH_EINVAL, igraph_matrix_int_ncol(merges));
127
0
    }
128
6.59k
    if (steps < 0) {
129
0
        IGRAPH_ERRORF("Number of steps should be non-negative, found %" IGRAPH_PRId ".", IGRAPH_EINVAL, steps);
130
0
    }
131
132
6.59k
    if (csize != NULL && membership == NULL) {
133
        /* we need a membership vector to calculate 'csize' but the user did
134
         * not provide one; let's allocate one ourselves */
135
0
        IGRAPH_VECTOR_INT_INIT_FINALLY(&own_membership, no_of_nodes);
136
0
        using_own_membership = true;
137
0
        membership = &own_membership;
138
0
    }
139
140
6.59k
    if (membership) {
141
6.59k
        IGRAPH_CHECK(igraph_vector_int_resize(membership, no_of_nodes));
142
6.59k
        igraph_vector_int_null(membership);
143
6.59k
    }
144
6.59k
    if (csize) {
145
0
        IGRAPH_CHECK(igraph_vector_int_resize(csize, components));
146
0
        igraph_vector_int_null(csize);
147
0
    }
148
149
6.59k
    IGRAPH_VECTOR_BOOL_INIT_FINALLY(&already_merged, steps + no_of_nodes);
150
6.59k
    IGRAPH_VECTOR_INT_INIT_FINALLY(&tmp, steps);
151
152
75.8k
    for (igraph_int_t i = steps - 1; i >= 0; i--) {
153
69.2k
        const igraph_int_t c1 = MATRIX(*merges, i, 0);
154
69.2k
        const igraph_int_t c2 = MATRIX(*merges, i, 1);
155
156
69.2k
        if (VECTOR(already_merged)[c1] == 0) {
157
69.2k
            VECTOR(already_merged)[c1] = true;
158
69.2k
        } else {
159
0
            IGRAPH_ERRORF("Merges matrix contains multiple merges of cluster %" IGRAPH_PRId ".", IGRAPH_EINVAL, c1);
160
0
        }
161
69.2k
        if (VECTOR(already_merged)[c2] == 0) {
162
69.2k
            VECTOR(already_merged)[c2] = true;
163
69.2k
        } else {
164
0
            IGRAPH_ERRORF("Merges matrix contains multiple merges of cluster %" IGRAPH_PRId ".", IGRAPH_EINVAL, c2);
165
0
        }
166
167
        /* new component? */
168
69.2k
        if (VECTOR(tmp)[i] == 0) {
169
21.4k
            found++;
170
21.4k
            VECTOR(tmp)[i] = found;
171
21.4k
        }
172
173
69.2k
        if (c1 < no_of_nodes) {
174
51.0k
            const igraph_int_t cid = VECTOR(tmp)[i] - 1;
175
51.0k
            if (membership) {
176
51.0k
                VECTOR(*membership)[c1] = cid + 1;
177
51.0k
            }
178
51.0k
            if (csize) {
179
0
                VECTOR(*csize)[cid] += 1;
180
0
            }
181
51.0k
        } else {
182
18.2k
            VECTOR(tmp)[c1 - no_of_nodes] = VECTOR(tmp)[i];
183
18.2k
        }
184
185
69.2k
        if (c2 < no_of_nodes) {
186
39.6k
            const igraph_int_t cid = VECTOR(tmp)[i] - 1;
187
39.6k
            if (membership) {
188
39.6k
                VECTOR(*membership)[c2] = cid + 1;
189
39.6k
            }
190
39.6k
            if (csize) {
191
0
                VECTOR(*csize)[cid] += 1;
192
0
            }
193
39.6k
        } else {
194
29.6k
            VECTOR(tmp)[c2 - no_of_nodes] = VECTOR(tmp)[i];
195
29.6k
        }
196
197
69.2k
    }
198
199
6.59k
    if (membership || csize) {
200
        /* It can never happen that csize != NULL and membership == NULL; we have
201
         * handled that case above. Thus we skip if (membership) checks, which
202
         * would confuse static analysers in this context. */
203
302k
        for (igraph_int_t i = 0; i < no_of_nodes; i++) {
204
296k
            const igraph_int_t c = VECTOR(*membership)[i];
205
296k
            if (c != 0) {
206
90.6k
                VECTOR(*membership)[i] = c - 1;
207
205k
            } else {
208
205k
                if (csize) {
209
0
                    VECTOR(*csize)[found] += 1;
210
0
                }
211
205k
                VECTOR(*membership)[i] = found;
212
205k
                found++;
213
205k
            }
214
296k
        }
215
6.59k
    }
216
217
6.59k
    igraph_vector_int_destroy(&tmp);
218
6.59k
    igraph_vector_bool_destroy(&already_merged);
219
6.59k
    IGRAPH_FINALLY_CLEAN(2);
220
221
6.59k
    if (using_own_membership) {
222
0
        igraph_vector_int_destroy(&own_membership);
223
0
        IGRAPH_FINALLY_CLEAN(1);
224
0
    }
225
226
6.59k
    return IGRAPH_SUCCESS;
227
6.59k
}
228
229
230
/**
231
 * This slower implementation of igraph_reindex_membership() is used when
232
 * cluster indices are not within the range 0..vcount-1.
233
 *
234
 * \param membership Vector encoding the membership of each vertex.
235
 *    Elements may be arbitrary integers, however, its length must not be zero.
236
 * \param new_to_old If not \c NULL, a mapping from new to old cluster
237
 *    indices will be stored here.
238
 * \param nb_clusters If not \c NULL, the number of clusters will be stored here.
239
 *
240
 * Time complexity: O(n log(n)) where n is the membership vector length.
241
 */
242
igraph_error_t igraph_i_reindex_membership_large(
243
        igraph_vector_int_t *membership,
244
        igraph_vector_int_t *new_to_old,
245
41.0k
        igraph_int_t *nb_clusters) {
246
247
41.0k
    const igraph_int_t vcount = igraph_vector_int_size(membership);
248
41.0k
    igraph_vector_int_t permutation;
249
250
    /* The vcount == 0 case requires special handling because the main
251
     * algorithm of this function cannot handle it. */
252
41.0k
    if (vcount == 0) {
253
0
        if (new_to_old) {
254
0
            igraph_vector_int_clear(new_to_old);
255
0
        }
256
0
        if (nb_clusters) {
257
0
            *nb_clusters = 0;
258
0
        }
259
0
        return IGRAPH_SUCCESS;
260
0
    }
261
262
41.0k
    IGRAPH_VECTOR_INT_INIT_FINALLY(&permutation, vcount);
263
264
41.0k
    IGRAPH_CHECK(igraph_vector_int_sort_ind(membership, &permutation, IGRAPH_ASCENDING));
265
266
41.0k
    if (new_to_old) {
267
0
        igraph_vector_int_clear(new_to_old);
268
0
    }
269
270
41.0k
    igraph_int_t j = VECTOR(permutation)[0];
271
41.0k
    igraph_int_t c_old = VECTOR(*membership)[j];
272
41.0k
    igraph_int_t c_old_prev = c_old;
273
41.0k
    igraph_int_t c_new = 0;
274
41.0k
    if (new_to_old) {
275
0
        IGRAPH_CHECK(igraph_vector_int_push_back(new_to_old, c_old));
276
0
    }
277
41.0k
    VECTOR(*membership)[j] = c_new;
278
279
2.31M
    for (igraph_int_t i=1; i < vcount; i++) {
280
2.27M
        j = VECTOR(permutation)[i];
281
2.27M
        c_old = VECTOR(*membership)[j];
282
2.27M
        if (c_old != c_old_prev) {
283
1.66M
            if (new_to_old) {
284
0
                IGRAPH_CHECK(igraph_vector_int_push_back(new_to_old, c_old));
285
0
            }
286
1.66M
            c_new++;
287
1.66M
        }
288
2.27M
        c_old_prev = c_old;
289
2.27M
        VECTOR(*membership)[j] = c_new;
290
2.27M
    }
291
292
41.0k
    if (nb_clusters) {
293
41.0k
        *nb_clusters = c_new+1;
294
41.0k
    }
295
296
41.0k
    igraph_vector_int_destroy(&permutation);
297
41.0k
    IGRAPH_FINALLY_CLEAN(1);
298
299
41.0k
    return IGRAPH_SUCCESS;
300
41.0k
}
301
302
/**
303
 * \function igraph_reindex_membership
304
 * \brief Makes the IDs in a membership vector contiguous.
305
 *
306
 * This function reindexes component IDs in a membership vector in a way that
307
 * the new IDs start from zero and go up to <code>C-1</code>, where \c C is the
308
 * number of unique component IDs in the original vector.
309
 *
310
 * \param  membership  Numeric vector which gives the type of each
311
 *                     vertex, i.e. the component to which it belongs.
312
 *                     The vector will be altered in-place.
313
 * \param  new_to_old  Pointer to a vector which will contain the
314
 *                     old component ID for each new one, or \c NULL,
315
 *                     in which case it is not returned. The vector
316
 *                     will be resized as needed.
317
 * \param  nb_clusters Pointer to an integer for the number of
318
 *                     distinct clusters. If not \c NULL, this will be
319
 *                     updated to reflect the number of distinct
320
 *                     clusters found in membership.
321
 * \return Error code.
322
 *
323
 * Time complexity: Let n be the length of the membership vector.
324
 * O(n) if cluster indices are within 0..n-1, and O(n log(n)) otherwise.
325
 */
326
igraph_error_t igraph_reindex_membership(
327
        igraph_vector_int_t *membership,
328
        igraph_vector_int_t *new_to_old,
329
79.5k
        igraph_int_t *nb_clusters) {
330
331
79.5k
    const igraph_int_t vcount = igraph_vector_int_size(membership);
332
79.5k
    igraph_vector_int_t new_cluster;
333
79.5k
    igraph_int_t i_nb_clusters;
334
335
    /* First, we assume cluster indices in the range 0..vcount-1,
336
     * and attempt to use a performant implementation. If indices
337
     * outside of this range are found, we will fall back to a
338
     * slower alternative implementation. */
339
79.5k
    IGRAPH_VECTOR_INT_INIT_FINALLY(&new_cluster, vcount);
340
341
79.5k
    if (new_to_old) {
342
0
        igraph_vector_int_clear(new_to_old);
343
0
    }
344
345
    /* Clean clusters. We will store the new cluster + 1 so that membership == 0
346
     * indicates that no cluster was assigned yet. */
347
79.5k
    i_nb_clusters = 1;
348
3.55M
    for (igraph_int_t i = 0; i < vcount; i++) {
349
3.47M
        igraph_int_t c = VECTOR(*membership)[i];
350
351
3.47M
        if (c < 0 || c >= vcount) {
352
            /* Found cluster indices out of the 0..vcount-1 range.
353
             * Use alternative implementation that supports these. */
354
0
            igraph_vector_int_destroy(&new_cluster);
355
0
            IGRAPH_FINALLY_CLEAN(1);
356
0
            return igraph_i_reindex_membership_large(membership, new_to_old, nb_clusters);
357
0
        }
358
359
3.47M
        if (VECTOR(new_cluster)[c] == 0) {
360
3.24M
            VECTOR(new_cluster)[c] = i_nb_clusters;
361
3.24M
            i_nb_clusters += 1;
362
3.24M
            if (new_to_old) {
363
0
                IGRAPH_CHECK(igraph_vector_int_push_back(new_to_old, c));
364
0
            }
365
3.24M
        }
366
3.47M
    }
367
368
    /* Assign new membership */
369
3.55M
    for (igraph_int_t i = 0; i < vcount; i++) {
370
3.47M
        igraph_int_t c = VECTOR(*membership)[i];
371
3.47M
        VECTOR(*membership)[i] = VECTOR(new_cluster)[c] - 1;
372
3.47M
    }
373
374
79.5k
    if (nb_clusters) {
375
        /* We used the cluster + 1, so correct */
376
31.9k
        *nb_clusters = i_nb_clusters - 1;
377
31.9k
    }
378
379
79.5k
    igraph_vector_int_destroy(&new_cluster);
380
79.5k
    IGRAPH_FINALLY_CLEAN(1);
381
382
79.5k
    return IGRAPH_SUCCESS;
383
79.5k
}
384
385
static igraph_error_t igraph_i_compare_communities_vi(const igraph_vector_int_t *v1,
386
                                           const igraph_vector_int_t *v2, igraph_real_t* result);
387
static igraph_error_t igraph_i_compare_communities_nmi(const igraph_vector_int_t *v1,
388
                                            const igraph_vector_int_t *v2, igraph_real_t* result);
389
static igraph_error_t igraph_i_compare_communities_rand(const igraph_vector_int_t *v1,
390
                                             const igraph_vector_int_t *v2, igraph_real_t* result, igraph_bool_t adjust);
391
static igraph_error_t igraph_i_split_join_distance(const igraph_vector_int_t *v1,
392
                                        const igraph_vector_int_t *v2, igraph_int_t* distance12,
393
                                        igraph_int_t* distance21);
394
395
/**
396
 * \ingroup communities
397
 * \function igraph_compare_communities
398
 * \brief Compares community structures using various metrics.
399
 *
400
 * This function assesses the distance between two community structures
401
 * using the variation of information (VI) metric of Meila (2003), the
402
 * normalized mutual information (NMI) of Danon et al (2005), the
403
 * split-join distance of van Dongen (2000), the Rand index of Rand (1971)
404
 * or the adjusted Rand index of Hubert and Arabie (1985).
405
 *
406
 * </para><para>
407
 * Some of these measures are defined based on the entropy of a discrete
408
 * random variable associated with a given clustering \c C of vertices.
409
 * Let \c p_i be the probability that a randomly picked vertex would be part
410
 * of cluster \c i. Then the entropy of the clustering is
411
 *
412
 * </para><para>
413
 * <code>H(C) = - sum_i p_i log p_i</code>
414
 *
415
 * </para><para>
416
 * Similarly, we can define the joint entropy of two clusterings \c C_1 and \c C_2
417
 * based on the probability \c p_ij that a random vertex is part of cluster \c i
418
 * in the first clustering and cluster \c j in the second one:
419
 *
420
 * </para><para>
421
 * <code>H(C_1, C_2) = - sum_ii p_ij log p_ij</code>
422
 *
423
 * </para><para>
424
 * The mutual information of \c C_1 and \c C_2 is then
425
 * <code>MI(C_1, C_2) = H(C_1) + H(C_2) - H(C_1, C_2) >= 0 </code>.
426
 * A large mutual information indicates a high overlap between the two clusterings.
427
 * The normalized mutual information, as computed by igraph, is
428
 *
429
 * </para><para>
430
 * <code>NMI(C_1, C_2) = 2 MI(C_1, C_2) / (H(C_1) + H(C_2))</code>.
431
 *
432
 * </para><para>
433
 * It takes its value from the interval (0, 1], with 1 achieved when the two clusterings
434
 * coincide.
435
 *
436
 * </para><para>
437
 * The variation of information is defined as
438
 * <code>VI(C_1, C_2) = [H(C_1) - MI(C_1, C_2)] + [H(C_2) - MI(C_1, C_2)]</code>.
439
 * Lower values of the variation of information indicate a smaller difference between
440
 * the two clusterings, with <code>VI = 0</code> achieved precisely when they coincide.
441
 * igraph uses natural units for the variation of information, i.e. it uses the
442
 * natural logarithm when computing entropies.
443
 *
444
 * </para><para>
445
 * The Rand index is defined as the probability that the two clusterings agree
446
 * about the cluster memberships of a randomly chosen vertex \em pair. All vertex
447
 * pairs are considered, and the two clusterings are considered to be in agreement
448
 * about the memberships of a vertex pair if either the two vertices are in the
449
 * same cluster in both clusterings, or they are in different clusters in both
450
 * clusterings. The Rand index is then the number of vertex pairs in agreement,
451
 * divided by the total number of vertex pairs. A Rand index of zero means that
452
 * the two clusterings disagree about the membership of all vertex pairs, while
453
 * 1 means that the two clusterings are identical.
454
 *
455
 * </para><para>
456
 * The adjusted Rand index is similar to the Rand index, but it takes into
457
 * account that agreement between the two clusterings may also occur by chance
458
 * even if the two clusterings are chosen completely randomly. The adjusted
459
 * Rand index therefore subtracts the expected fraction of agreements from the
460
 * value of the Rand index, and divides the result by one minus the expected
461
 * fraction of agreements. The maximum value of the adjusted Rand index is
462
 * still 1 (similarly to the Rand index), indicating maximum agreement, but
463
 * the value may be less than zero if there is \em less agreement between the
464
 * two clusterings than what would be expected by chance.
465
 *
466
 * </para><para>
467
 * For an explanation of the split-join distance, see \ref igraph_split_join_distance().
468
 *
469
 * </para><para>
470
 * References:
471
 *
472
 * </para><para>
473
 * Meilă M: Comparing clusterings by the variation of information.
474
 * In: Schölkopf B, Warmuth MK (eds.). Learning Theory and Kernel Machines:
475
 * 16th Annual Conference on Computational Learning Theory and 7th Kernel
476
 * Workshop, COLT/Kernel 2003, Washington, DC, USA. Lecture Notes in Computer
477
 * Science, vol. 2777, Springer, 2003. ISBN: 978-3-540-40720-1.
478
 * https://doi.org/10.1007/978-3-540-45167-9_14
479
 *
480
 * </para><para>
481
 * Danon L, Diaz-Guilera A, Duch J, Arenas A: Comparing community structure
482
 * identification. J Stat Mech P09008, 2005.
483
 * https://doi.org/10.1088/1742-5468/2005/09/P09008
484
 *
485
 * </para><para>
486
 * van Dongen S: Performance criteria for graph clustering and Markov cluster
487
 * experiments. Technical Report INS-R0012, National Research Institute for
488
 * Mathematics and Computer Science in the Netherlands, Amsterdam, May 2000.
489
 * https://ir.cwi.nl/pub/4461
490
 *
491
 * </para><para>
492
 * Rand WM: Objective criteria for the evaluation of clustering methods.
493
 * J Am Stat Assoc 66(336):846-850, 1971.
494
 * https://doi.org/10.2307/2284239
495
 *
496
 * </para><para>
497
 * Hubert L and Arabie P: Comparing partitions. Journal of Classification
498
 * 2:193-218, 1985.
499
 * https://doi.org/10.1007/BF01908075
500
 *
501
 * \param  comm1   the membership vector of the first community structure
502
 * \param  comm2   the membership vector of the second community structure
503
 * \param  result  the result is stored here.
504
 * \param  method  the comparison method to use. \c IGRAPH_COMMCMP_VI
505
 *                 selects the variation of information (VI) metric of
506
 *                 Meila (2003), \c IGRAPH_COMMCMP_NMI selects the
507
 *                 normalized mutual information measure proposed by
508
 *                 Danon et al (2005), \c IGRAPH_COMMCMP_SPLIT_JOIN
509
 *                 selects the split-join distance of van Dongen (2000),
510
 *                 \c IGRAPH_COMMCMP_RAND selects the unadjusted Rand
511
 *                 index (1971) and \c IGRAPH_COMMCMP_ADJUSTED_RAND
512
 *                 selects the adjusted Rand index.
513
 *
514
 * \return  Error code.
515
 *
516
 * \sa \ref igraph_split_join_distance().
517
 *
518
 * Time complexity: O(n log(n)).
519
 */
520
igraph_error_t igraph_compare_communities(const igraph_vector_int_t *comm1,
521
                               const igraph_vector_int_t *comm2, igraph_real_t* result,
522
16.4k
                               igraph_community_comparison_t method) {
523
16.4k
    igraph_vector_int_t c1, c2;
524
525
16.4k
    if (igraph_vector_int_size(comm1) != igraph_vector_int_size(comm2)) {
526
0
        IGRAPH_ERROR("community membership vectors have different lengths", IGRAPH_EINVAL);
527
0
    }
528
529
    /* Copy and reindex membership vectors to make sure they are continuous */
530
16.4k
    IGRAPH_CHECK(igraph_vector_int_init_copy(&c1, comm1));
531
16.4k
    IGRAPH_FINALLY(igraph_vector_int_destroy, &c1);
532
533
16.4k
    IGRAPH_CHECK(igraph_vector_int_init_copy(&c2, comm2));
534
16.4k
    IGRAPH_FINALLY(igraph_vector_int_destroy, &c2);
535
536
16.4k
    IGRAPH_CHECK(igraph_reindex_membership(&c1, NULL, NULL));
537
16.4k
    IGRAPH_CHECK(igraph_reindex_membership(&c2, NULL, NULL));
538
539
16.4k
    switch (method) {
540
3.29k
    case IGRAPH_COMMCMP_VI:
541
3.29k
        IGRAPH_CHECK(igraph_i_compare_communities_vi(&c1, &c2, result));
542
3.29k
        break;
543
544
3.29k
    case IGRAPH_COMMCMP_NMI:
545
3.29k
        IGRAPH_CHECK(igraph_i_compare_communities_nmi(&c1, &c2, result));
546
3.29k
        break;
547
548
3.29k
    case IGRAPH_COMMCMP_SPLIT_JOIN: {
549
3.29k
        igraph_int_t d12, d21;
550
3.29k
        IGRAPH_CHECK(igraph_i_split_join_distance(&c1, &c2, &d12, &d21));
551
3.29k
        *result = d12 + d21;
552
3.29k
    }
553
0
    break;
554
555
3.28k
    case IGRAPH_COMMCMP_RAND:
556
6.56k
    case IGRAPH_COMMCMP_ADJUSTED_RAND:
557
6.56k
        IGRAPH_CHECK(igraph_i_compare_communities_rand(&c1, &c2, result,
558
6.56k
                     method == IGRAPH_COMMCMP_ADJUSTED_RAND));
559
6.56k
        break;
560
561
6.56k
    default:
562
0
        IGRAPH_ERROR("unknown community comparison method", IGRAPH_EINVAL);
563
16.4k
    }
564
565
    /* Clean up everything */
566
16.4k
    igraph_vector_int_destroy(&c1);
567
16.4k
    igraph_vector_int_destroy(&c2);
568
16.4k
    IGRAPH_FINALLY_CLEAN(2);
569
570
16.4k
    return IGRAPH_SUCCESS;
571
16.4k
}
572
573
/**
574
 * \ingroup communities
575
 * \function igraph_split_join_distance
576
 * \brief Calculates the split-join distance of two community structures.
577
 *
578
 * The split-join distance between partitions A and B is the sum of the
579
 * projection distance of A from B and the projection distance of B from
580
 * A. The projection distance is an asymmetric measure and it is defined
581
 * as follows:
582
 *
583
 * </para><para>
584
 * First, each set in partition A is evaluated against all sets in partition
585
 * B. For each set in partition A, the best matching set in partition B is
586
 * found and the overlap size is calculated. (Matching is quantified by the
587
 * size of the overlap between the two sets). Then, the maximal overlap sizes
588
 * for each set in A are summed together and subtracted from the number of
589
 * elements in A.
590
 *
591
 * </para><para>
592
 * The split-join distance will be returned in two arguments, \c distance12
593
 * will contain the projection distance of the first partition from the
594
 * second, while \c distance21 will be the projection distance of the second
595
 * partition from the first. This makes it easier to detect whether a
596
 * partition is a subpartition of the other, since in this case, the
597
 * corresponding distance will be zero.
598
 *
599
 * </para><para>
600
 * Reference:
601
 *
602
 * </para><para>
603
 * van Dongen S: Performance criteria for graph clustering and Markov cluster
604
 * experiments. Technical Report INS-R0012, National Research Institute for
605
 * Mathematics and Computer Science in the Netherlands, Amsterdam, May 2000.
606
 *
607
 * \param  comm1       the membership vector of the first community structure
608
 * \param  comm2       the membership vector of the second community structure
609
 * \param  distance12  pointer to an \c igraph_int_t, the projection distance
610
 *                     of the first community structure from the second one will be
611
 *                     returned here.
612
 * \param  distance21  pointer to an \c igraph_int_t, the projection distance
613
 *                     of the second community structure from the first one will be
614
 *                     returned here.
615
 * \return  Error code.
616
 *
617
 * \sa \ref igraph_compare_communities() with the \c IGRAPH_COMMCMP_SPLIT_JOIN
618
 * method if you are not interested in the individual distances but only the sum
619
 * of them.
620
 *
621
 * Time complexity: O(n log(n)).
622
 */
623
igraph_error_t igraph_split_join_distance(const igraph_vector_int_t *comm1,
624
                               const igraph_vector_int_t *comm2, igraph_int_t *distance12,
625
0
                               igraph_int_t *distance21) {
626
0
    igraph_vector_int_t c1, c2;
627
628
0
    if (igraph_vector_int_size(comm1) != igraph_vector_int_size(comm2)) {
629
0
        IGRAPH_ERRORF("Community membership vectors have different lengths: %" IGRAPH_PRId " and %" IGRAPH_PRId ".",
630
0
                      IGRAPH_EINVAL, igraph_vector_int_size(comm1), igraph_vector_int_size(comm2));
631
0
    }
632
633
    /* Copy and reindex membership vectors to make sure they are continuous */
634
0
    IGRAPH_CHECK(igraph_vector_int_init_copy(&c1, comm1));
635
0
    IGRAPH_FINALLY(igraph_vector_int_destroy, &c1);
636
637
0
    IGRAPH_CHECK(igraph_vector_int_init_copy(&c2, comm2));
638
0
    IGRAPH_FINALLY(igraph_vector_int_destroy, &c2);
639
640
0
    IGRAPH_CHECK(igraph_reindex_membership(&c1, NULL, NULL));
641
0
    IGRAPH_CHECK(igraph_reindex_membership(&c2, NULL, NULL));
642
643
0
    IGRAPH_CHECK(igraph_i_split_join_distance(&c1, &c2, distance12, distance21));
644
645
    /* Clean up everything */
646
0
    igraph_vector_int_destroy(&c1);
647
0
    igraph_vector_int_destroy(&c2);
648
0
    IGRAPH_FINALLY_CLEAN(2);
649
650
0
    return IGRAPH_SUCCESS;
651
0
}
652
653
/**
654
 * Calculates the entropy and the mutual information for two reindexed community
655
 * membership vectors v1 and v2. This is needed by both Meila's and Danon's
656
 * community comparison measure.
657
 */
658
static igraph_error_t igraph_i_entropy_and_mutual_information(const igraph_vector_int_t* v1,
659
6.59k
        const igraph_vector_int_t* v2, double* h1, double* h2, double* mut_inf) {
660
6.59k
    igraph_int_t i, n;
661
6.59k
    igraph_int_t k1;
662
6.59k
    igraph_int_t k2;
663
6.59k
    igraph_real_t *p1, *p2;
664
6.59k
    igraph_sparsemat_t m;
665
6.59k
    igraph_sparsemat_t mu; /* uncompressed */
666
6.59k
    igraph_sparsemat_iterator_t mit;
667
668
6.59k
    n = igraph_vector_int_size(v1);
669
6.59k
    if (n == 0) {
670
2
        *h1 = 0;
671
2
        *h2 = 0;
672
2
        *mut_inf = 0;
673
2
        return IGRAPH_SUCCESS;
674
2
    }
675
6.59k
    k1 = igraph_vector_int_max(v1) + 1;
676
6.59k
    k2 = igraph_vector_int_max(v2) + 1;
677
6.59k
    p1 = IGRAPH_CALLOC(k1, igraph_real_t);
678
6.59k
    if (p1 == 0) {
679
0
        IGRAPH_ERROR("Insufficient memory for computing community entropy.", IGRAPH_ENOMEM);
680
0
    }
681
6.59k
    IGRAPH_FINALLY(igraph_free, p1);
682
6.59k
    p2 = IGRAPH_CALLOC(k2, igraph_real_t);
683
6.59k
    if (p2 == 0) {
684
0
        IGRAPH_ERROR("Insufficient memory for computing community entropy.", IGRAPH_ENOMEM);
685
0
    }
686
6.59k
    IGRAPH_FINALLY(igraph_free, p2);
687
688
    /* Calculate the entropy of v1 */
689
6.59k
    *h1 = 0.0;
690
302k
    for (i = 0; i < n; i++) {
691
296k
        p1[VECTOR(*v1)[i]]++;
692
296k
    }
693
300k
    for (i = 0; i < k1; i++) {
694
294k
        p1[i] /= n;
695
294k
        *h1 -= p1[i] * log(p1[i]);
696
294k
    }
697
698
    /* Calculate the entropy of v2 */
699
6.59k
    *h2 = 0.0;
700
302k
    for (i = 0; i < n; i++) {
701
296k
        p2[VECTOR(*v2)[i]]++;
702
296k
    }
703
250k
    for (i = 0; i < k2; i++) {
704
244k
        p2[i] /= n;
705
244k
        *h2 -= p2[i] * log(p2[i]);
706
244k
    }
707
708
    /* We will only need the logs of p1 and p2 from now on */
709
300k
    for (i = 0; i < k1; i++) {
710
294k
        p1[i] = log(p1[i]);
711
294k
    }
712
250k
    for (i = 0; i < k2; i++) {
713
244k
        p2[i] = log(p2[i]);
714
244k
    }
715
716
    /* Calculate the mutual information of v1 and v2 */
717
6.59k
    *mut_inf = 0.0;
718
6.59k
    IGRAPH_CHECK(igraph_sparsemat_init(&mu, k1, k2, n));
719
6.59k
    IGRAPH_FINALLY(igraph_sparsemat_destroy, &mu);
720
302k
    for (i = 0; i < n; i++) {
721
296k
        IGRAPH_CHECK(igraph_sparsemat_entry(
722
296k
            &mu, VECTOR(*v1)[i],
723
296k
            VECTOR(*v2)[i], 1
724
296k
        ));
725
296k
    }
726
727
6.59k
    IGRAPH_CHECK(igraph_sparsemat_compress(&mu, &m));
728
6.59k
    IGRAPH_FINALLY(igraph_sparsemat_destroy, &m);
729
6.59k
    IGRAPH_CHECK(igraph_sparsemat_dupl(&m));
730
731
6.59k
    IGRAPH_CHECK(igraph_sparsemat_iterator_init(&mit, &m));
732
300k
    while (!igraph_sparsemat_iterator_end(&mit)) {
733
294k
        double p = igraph_sparsemat_iterator_get(&mit)/ n;
734
294k
        *mut_inf += p * (log(p) - p1[igraph_sparsemat_iterator_row(&mit)] - p2[igraph_sparsemat_iterator_col(&mit)]);
735
294k
        igraph_sparsemat_iterator_next(&mit);
736
294k
    }
737
6.59k
    igraph_sparsemat_destroy(&m);
738
6.59k
    igraph_sparsemat_destroy(&mu);
739
6.59k
    IGRAPH_FREE(p1); IGRAPH_FREE(p2);
740
741
6.59k
    IGRAPH_FINALLY_CLEAN(4);
742
743
6.59k
    return IGRAPH_SUCCESS;
744
6.59k
}
745
746
/**
747
 * Implementation of the normalized mutual information (NMI) measure of
748
 * Danon et al. This function assumes that the community membership
749
 * vectors have already been normalized using igraph_reindex_membership().
750
 *
751
 * </para><para>
752
 * Reference: Danon L, Diaz-Guilera A, Duch J, Arenas A: Comparing community
753
 * structure identification. J Stat Mech P09008, 2005.
754
 *
755
 * </para><para>
756
 * Time complexity: O(n log(n))
757
 */
758
static igraph_error_t igraph_i_compare_communities_nmi(const igraph_vector_int_t *v1, const igraph_vector_int_t *v2,
759
3.29k
                                     igraph_real_t* result) {
760
3.29k
    double h1, h2, mut_inf;
761
762
3.29k
    IGRAPH_CHECK(igraph_i_entropy_and_mutual_information(v1, v2, &h1, &h2, &mut_inf));
763
764
3.29k
    if (h1 == 0 && h2 == 0) {
765
31
        *result = 1;
766
3.26k
    } else {
767
3.26k
        *result = 2 * mut_inf / (h1 + h2);
768
3.26k
    }
769
770
3.29k
    return IGRAPH_SUCCESS;
771
3.29k
}
772
773
/**
774
 * Implementation of the variation of information metric (VI) of
775
 * Meila et al. This function assumes that the community membership
776
 * vectors have already been normalized using igraph_reindex_membership().
777
 *
778
 * </para><para>
779
 * Reference: Meila M: Comparing clusterings by the variation of information.
780
 * In: Schölkopf B, Warmuth MK (eds.). Learning Theory and Kernel Machines:
781
 * 16th Annual Conference on Computational Learning Theory and 7th Kernel
782
 * Workshop, COLT/Kernel 2003, Washington, DC, USA. Lecture Notes in Computer
783
 * Science, vol. 2777, Springer, 2003. ISBN: 978-3-540-40720-1.
784
 *
785
 * </para><para>
786
 * Time complexity: O(n log(n))
787
 */
788
static igraph_error_t igraph_i_compare_communities_vi(const igraph_vector_int_t *v1, const igraph_vector_int_t *v2,
789
3.29k
                                    igraph_real_t* result) {
790
3.29k
    double h1, h2, mut_inf;
791
792
3.29k
    IGRAPH_CHECK(igraph_i_entropy_and_mutual_information(v1, v2, &h1, &h2, &mut_inf));
793
3.29k
    *result = h1 + h2 - 2 * mut_inf;
794
795
3.29k
    return IGRAPH_SUCCESS;
796
3.29k
}
797
798
/**
799
 * \brief Calculates the confusion matrix for two clusterings.
800
 *
801
 * </para><para>
802
 * This function assumes that the community membership vectors have already
803
 * been normalized using igraph_reindex_membership().
804
 *
805
 * </para><para>
806
 * Time complexity: O(n log(max(k1, k2))), where n is the number of vertices, k1
807
 * and k2 are the number of clusters in each of the clusterings.
808
 */
809
static igraph_error_t igraph_i_confusion_matrix(const igraph_vector_int_t *v1, const igraph_vector_int_t *v2,
810
9.86k
                              igraph_sparsemat_t *m) {
811
9.86k
    igraph_int_t k1, k2, i, n;
812
813
9.86k
    n = igraph_vector_int_size(v1);
814
9.86k
    if (n == 0) {
815
0
        IGRAPH_CHECK(igraph_sparsemat_resize(m, 0, 0, 0));
816
0
        return IGRAPH_SUCCESS;
817
0
    }
818
819
9.86k
    k1 = igraph_vector_int_max(v1) + 1;
820
9.86k
    k2 = igraph_vector_int_max(v2) + 1;
821
9.86k
    IGRAPH_CHECK(igraph_sparsemat_resize(m, k1, k2, n));
822
454k
    for (i = 0; i < n; i++) {
823
444k
        IGRAPH_CHECK(igraph_sparsemat_entry(
824
444k
            m, VECTOR(*v1)[i], VECTOR(*v2)[i], 1
825
444k
        ));
826
444k
    }
827
828
9.86k
    return IGRAPH_SUCCESS;
829
9.86k
}
830
831
/**
832
 * Implementation of the split-join distance of van Dongen.
833
 *
834
 * </para><para>
835
 * This function assumes that the community membership vectors have already
836
 * been normalized using igraph_reindex_membership().
837
 *
838
 * </para><para>
839
 * Reference: van Dongen S: Performance criteria for graph clustering and Markov
840
 * cluster experiments. Technical Report INS-R0012, National Research Institute
841
 * for Mathematics and Computer Science in the Netherlands, Amsterdam, May 2000.
842
 *
843
 * </para><para>
844
 * Time complexity: O(n log(max(k1, k2))), where n is the number of vertices, k1
845
 * and k2 are the number of clusters in each of the clusterings.
846
 */
847
static igraph_error_t igraph_i_split_join_distance(const igraph_vector_int_t *v1, const igraph_vector_int_t *v2,
848
3.29k
                                 igraph_int_t* distance12, igraph_int_t* distance21) {
849
3.29k
    igraph_int_t n = igraph_vector_int_size(v1);
850
3.29k
    igraph_vector_t rowmax, colmax;
851
3.29k
    igraph_sparsemat_t m;
852
3.29k
    igraph_sparsemat_t mu; /* uncompressed */
853
3.29k
    igraph_sparsemat_iterator_t mit;
854
855
3.29k
    if (n == 0) {
856
1
        *distance12 = 0;
857
1
        *distance21 = 0;
858
1
        return IGRAPH_SUCCESS;
859
1
    }
860
    /* Calculate the confusion matrix */
861
3.29k
    IGRAPH_CHECK(igraph_sparsemat_init(&mu, 1, 1, 0));
862
3.29k
    IGRAPH_FINALLY(igraph_sparsemat_destroy, &mu);
863
3.29k
    IGRAPH_CHECK(igraph_i_confusion_matrix(v1, v2, &mu));
864
865
    /* Initialize vectors that will store the row/columnwise maxima */
866
3.29k
    IGRAPH_VECTOR_INIT_FINALLY(&rowmax, igraph_sparsemat_nrow(&mu));
867
3.29k
    IGRAPH_VECTOR_INIT_FINALLY(&colmax, igraph_sparsemat_ncol(&mu));
868
869
    /* Find the row/columnwise maxima */
870
3.29k
    igraph_sparsemat_compress(&mu, &m);
871
3.29k
    IGRAPH_FINALLY(igraph_sparsemat_destroy, &m);
872
3.29k
    IGRAPH_CHECK(igraph_sparsemat_dupl(&m));
873
3.29k
    IGRAPH_CHECK(igraph_sparsemat_iterator_init(&mit, &m));
874
150k
    while (!igraph_sparsemat_iterator_end(&mit)) {
875
147k
        igraph_real_t value = igraph_sparsemat_iterator_get(&mit);
876
147k
        igraph_int_t row = igraph_sparsemat_iterator_row(&mit);
877
147k
        igraph_int_t col = igraph_sparsemat_iterator_col(&mit);
878
147k
        if (value > VECTOR(rowmax)[row]) {
879
147k
            VECTOR(rowmax)[row] = value;
880
147k
        }
881
147k
        if (value > VECTOR(colmax)[col]) {
882
122k
            VECTOR(colmax)[col] = value;
883
122k
        }
884
147k
        igraph_sparsemat_iterator_next(&mit);
885
147k
    }
886
887
    /* Calculate the distances */
888
3.29k
    *distance12 = (igraph_int_t) (n - igraph_vector_sum(&rowmax));
889
3.29k
    *distance21 = (igraph_int_t) (n - igraph_vector_sum(&colmax));
890
891
3.29k
    igraph_vector_destroy(&rowmax);
892
3.29k
    igraph_vector_destroy(&colmax);
893
3.29k
    igraph_sparsemat_destroy(&m);
894
3.29k
    igraph_sparsemat_destroy(&mu);
895
3.29k
    IGRAPH_FINALLY_CLEAN(4);
896
897
3.29k
    return IGRAPH_SUCCESS;
898
3.29k
}
899
900
/**
901
 * Implementation of the adjusted and unadjusted Rand indices.
902
 *
903
 * </para><para>
904
 * This function assumes that the community membership vectors have already
905
 * been normalized using igraph_reindex_membership().
906
 *
907
 * </para><para>
908
 * References:
909
 *
910
 * </para><para>
911
 * Rand WM: Objective criteria for the evaluation of clustering methods. J Am
912
 * Stat Assoc 66(336):846-850, 1971.
913
 *
914
 * </para><para>
915
 * Hubert L and Arabie P: Comparing partitions. Journal of Classification
916
 * 2:193-218, 1985.
917
 *
918
 * </para><para>
919
 * Time complexity: O(n log(max(k1, k2))), where n is the number of vertices, k1
920
 * and k2 are the number of clusters in each of the clusterings.
921
 */
922
static igraph_error_t igraph_i_compare_communities_rand(
923
        const igraph_vector_int_t *v1, const igraph_vector_int_t *v2,
924
6.56k
        igraph_real_t *result, igraph_bool_t adjust) {
925
6.56k
    igraph_sparsemat_t m;
926
6.56k
    igraph_sparsemat_t mu; /* uncompressed */
927
6.56k
    igraph_sparsemat_iterator_t mit;
928
6.56k
    igraph_vector_t rowsums, colsums;
929
6.56k
    igraph_int_t i, nrow, ncol;
930
6.56k
    igraph_real_t rand, n;
931
6.56k
    igraph_real_t frac_pairs_in_1, frac_pairs_in_2;
932
933
6.56k
    if (igraph_vector_int_size(v1) <= 1) {
934
0
        IGRAPH_ERRORF("Rand indices not defined for only zero or one vertices. "
935
0
        "Found membership vector of size %" IGRAPH_PRId ".", IGRAPH_EINVAL, igraph_vector_int_size(v1));
936
0
    }
937
938
    /* Calculate the confusion matrix */
939
6.56k
    IGRAPH_CHECK(igraph_sparsemat_init(&mu, 1, 1, 0));
940
6.56k
    IGRAPH_FINALLY(igraph_sparsemat_destroy, &mu);
941
6.56k
    IGRAPH_CHECK(igraph_i_confusion_matrix(v1, v2, &mu));
942
943
    /* The unadjusted Rand index is defined as (a+d) / (a+b+c+d), where:
944
     *
945
     * - a is the number of pairs in the same cluster both in v1 and v2. This
946
     *   equals the sum of n(i,j) choose 2 for all i and j.
947
     *
948
     * - b is the number of pairs in the same cluster in v1 and in different
949
     *   clusters in v2. This is sum n(i,*) choose 2 for all i minus a.
950
     *   n(i,*) is the number of elements in cluster i in v1.
951
     *
952
     * - c is the number of pairs in the same cluster in v2 and in different
953
     *   clusters in v1. This is sum n(*,j) choose 2 for all j minus a.
954
     *   n(*,j) is the number of elements in cluster j in v2.
955
     *
956
     * - d is (n choose 2) - a - b - c.
957
     *
958
     * Therefore, a+d = (n choose 2) - b - c
959
     *                = (n choose 2) - sum (n(i,*) choose 2)
960
     *                               - sum (n(*,j) choose 2)
961
     *                               + 2 * sum (n(i,j) choose 2).
962
     *
963
     * Since a+b+c+d = (n choose 2) and this goes in the denominator, we can
964
     * just as well start dividing each term in a+d by (n choose 2), which
965
     * yields:
966
     *
967
     * 1 - sum( n(i,*)/n * (n(i,*)-1)/(n-1) )
968
     *   - sum( n(*,i)/n * (n(*,i)-1)/(n-1) )
969
     *   + sum( n(i,j)/n * (n(i,j)-1)/(n-1) ) * 2
970
     */
971
972
    /* Calculate row and column sums */
973
6.56k
    nrow = igraph_sparsemat_nrow(&mu);
974
6.56k
    ncol = igraph_sparsemat_ncol(&mu);
975
6.56k
    n = igraph_vector_int_size(v1);
976
6.56k
    IGRAPH_VECTOR_INIT_FINALLY(&rowsums, nrow);
977
6.56k
    IGRAPH_VECTOR_INIT_FINALLY(&colsums, ncol);
978
6.56k
    IGRAPH_CHECK(igraph_sparsemat_rowsums(&mu, &rowsums));
979
6.56k
    IGRAPH_CHECK(igraph_sparsemat_colsums(&mu, &colsums));
980
981
    /* Start calculating the unadjusted Rand index */
982
6.56k
    rand = 0.0;
983
6.56k
    igraph_sparsemat_compress(&mu, &m);
984
6.56k
    IGRAPH_FINALLY(igraph_sparsemat_destroy, &m);
985
6.56k
    IGRAPH_CHECK(igraph_sparsemat_dupl(&m));
986
987
6.56k
    IGRAPH_CHECK(igraph_sparsemat_iterator_init(&mit, &m));
988
300k
    while (!igraph_sparsemat_iterator_end(&mit)) {
989
294k
        igraph_real_t value = igraph_sparsemat_iterator_get(&mit);
990
294k
        rand += (value / n) * (value - 1) / (n - 1);
991
294k
        igraph_sparsemat_iterator_next(&mit);
992
294k
    }
993
994
6.56k
    frac_pairs_in_1 = frac_pairs_in_2 = 0.0;
995
300k
    for (i = 0; i < nrow; i++) {
996
294k
        frac_pairs_in_1 += (VECTOR(rowsums)[i] / n) * (VECTOR(rowsums)[i] - 1) / (n - 1);
997
294k
    }
998
250k
    for (i = 0; i < ncol; i++) {
999
244k
        frac_pairs_in_2 += (VECTOR(colsums)[i] / n) * (VECTOR(colsums)[i] - 1) / (n - 1);
1000
244k
    }
1001
1002
6.56k
    rand = 1.0 + 2 * rand - frac_pairs_in_1 - frac_pairs_in_2;
1003
1004
6.56k
    if (adjust) {
1005
3.28k
        double expected = frac_pairs_in_1 * frac_pairs_in_2 +
1006
3.28k
                          (1 - frac_pairs_in_1) * (1 - frac_pairs_in_2);
1007
3.28k
        rand = (rand - expected) / (1 - expected);
1008
3.28k
    }
1009
1010
6.56k
    igraph_vector_destroy(&rowsums);
1011
6.56k
    igraph_vector_destroy(&colsums);
1012
6.56k
    igraph_sparsemat_destroy(&m);
1013
6.56k
    igraph_sparsemat_destroy(&mu);
1014
6.56k
    IGRAPH_FINALLY_CLEAN(4);
1015
1016
6.56k
    *result = rand;
1017
1018
6.56k
    return IGRAPH_SUCCESS;
1019
6.56k
}