/src/igraph/src/centrality/pagerank.c
Line | Count | Source |
1 | | /* |
2 | | igraph library. |
3 | | Copyright (C) 2007-2021 The igraph development team <igraph@igraph.org> |
4 | | |
5 | | This program is free software; you can redistribute it and/or modify |
6 | | it under the terms of the GNU General Public License as published by |
7 | | the Free Software Foundation; either version 2 of the License, or |
8 | | (at your option) any later version. |
9 | | |
10 | | This program is distributed in the hope that it will be useful, |
11 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | | GNU General Public License for more details. |
14 | | |
15 | | You should have received a copy of the GNU General Public License |
16 | | along with this program. If not, see <https://www.gnu.org/licenses/>. |
17 | | */ |
18 | | |
19 | | #include "igraph_centrality.h" |
20 | | |
21 | | #include "igraph_adjlist.h" |
22 | | #include "igraph_interface.h" |
23 | | #include "igraph_random.h" |
24 | | #include "igraph_structural.h" |
25 | | |
26 | | #include "centrality/prpack_internal.h" |
27 | | |
28 | | #include <limits.h> |
29 | | |
30 | | static igraph_error_t igraph_i_personalized_pagerank_arpack(const igraph_t *graph, |
31 | | igraph_vector_t *vector, |
32 | | igraph_real_t *value, const igraph_vs_t vids, |
33 | | igraph_bool_t directed, igraph_real_t damping, |
34 | | const igraph_vector_t *reset, |
35 | | const igraph_vector_t *weights, |
36 | | igraph_arpack_options_t *options); |
37 | | |
38 | | typedef struct { |
39 | | const igraph_t *graph; |
40 | | igraph_adjlist_t *adjlist; |
41 | | igraph_real_t damping; |
42 | | igraph_vector_t *outdegree; |
43 | | igraph_vector_t *tmp; |
44 | | igraph_vector_t *reset; |
45 | | } pagerank_data_t; |
46 | | |
47 | | typedef struct { |
48 | | const igraph_t *graph; |
49 | | igraph_inclist_t *inclist; |
50 | | const igraph_vector_t *weights; |
51 | | igraph_real_t damping; |
52 | | igraph_vector_t *outdegree; |
53 | | igraph_vector_t *tmp; |
54 | | igraph_vector_t *reset; |
55 | | } pagerank_data_weighted_t; |
56 | | |
57 | | /* The two pagerank_operator functions below update the probabilities of a random walker |
58 | | * being in each of the vertices after one step of the walk. */ |
59 | | |
60 | | static igraph_error_t pagerank_operator_unweighted(igraph_real_t *to, const igraph_real_t *from, |
61 | 0 | int n, void *extra) { |
62 | |
|
63 | 0 | pagerank_data_t *data = extra; |
64 | 0 | igraph_adjlist_t *adjlist = data->adjlist; |
65 | 0 | igraph_vector_t *outdegree = data->outdegree; |
66 | 0 | igraph_vector_t *tmp = data->tmp; |
67 | 0 | igraph_vector_t *reset = data->reset; |
68 | 0 | igraph_vector_int_t *neis; |
69 | 0 | igraph_int_t i, j, nlen; |
70 | 0 | igraph_real_t sumfrom = 0.0; |
71 | 0 | igraph_real_t fact = 1 - data->damping; |
72 | | |
73 | | /* Calculate p(x) / outdegree(x) in advance for all the vertices. |
74 | | * Note that we may divide by zero here; this is intentional since |
75 | | * we won't use those values and we save a comparison this way. |
76 | | * At the same time, we calculate the global probability of a |
77 | | * random jump in `sumfrom`. For vertices with no outgoing edges, |
78 | | * we will surely jump from there if we are there, hence those |
79 | | * vertices contribute p(x) to the teleportation probability. |
80 | | * For vertices with some outgoing edges, we jump from there with |
81 | | * probability `fact` if we are there, hence they contribute |
82 | | * p(x)*fact */ |
83 | 0 | for (i = 0; i < n; i++) { |
84 | 0 | sumfrom += VECTOR(*outdegree)[i] != 0 ? from[i] * fact : from[i]; |
85 | 0 | VECTOR(*tmp)[i] = from[i] / VECTOR(*outdegree)[i]; |
86 | 0 | } |
87 | | |
88 | | /* Here we calculate the part of the `to` vector that results from |
89 | | * moving along links (and not from teleportation) */ |
90 | 0 | for (i = 0; i < n; i++) { |
91 | 0 | neis = igraph_adjlist_get(adjlist, i); |
92 | 0 | nlen = igraph_vector_int_size(neis); |
93 | 0 | to[i] = 0.0; |
94 | 0 | for (j = 0; j < nlen; j++) { |
95 | 0 | igraph_int_t nei = VECTOR(*neis)[j]; |
96 | 0 | to[i] += VECTOR(*tmp)[nei]; |
97 | 0 | } |
98 | 0 | to[i] *= data->damping; |
99 | 0 | } |
100 | | |
101 | | /* Now we add the contribution from random jumps. `reset` is a vector |
102 | | * that defines the probability of ending up in vertex i after a jump. |
103 | | * `sumfrom` is the global probability of jumping as mentioned above. */ |
104 | | /* printf("sumfrom = %.6f\n", (float)sumfrom); */ |
105 | |
|
106 | 0 | if (reset) { |
107 | | /* Running personalized PageRank */ |
108 | 0 | for (i = 0; i < n; i++) { |
109 | 0 | to[i] += sumfrom * VECTOR(*reset)[i]; |
110 | 0 | } |
111 | 0 | } else { |
112 | | /* Traditional PageRank with uniform reset vector */ |
113 | 0 | sumfrom /= n; |
114 | 0 | for (i = 0; i < n; i++) { |
115 | 0 | to[i] += sumfrom; |
116 | 0 | } |
117 | 0 | } |
118 | |
|
119 | 0 | return IGRAPH_SUCCESS; |
120 | 0 | } |
121 | | |
122 | | static igraph_error_t pagerank_operator_weighted(igraph_real_t *to, const igraph_real_t *from, |
123 | 0 | int n, void *extra) { |
124 | |
|
125 | 0 | pagerank_data_weighted_t *data = extra; |
126 | 0 | const igraph_t *graph = data->graph; |
127 | 0 | igraph_inclist_t *inclist = data->inclist; |
128 | 0 | const igraph_vector_t *weights = data->weights; |
129 | 0 | igraph_vector_t *outdegree = data->outdegree; |
130 | 0 | igraph_vector_t *tmp = data->tmp; |
131 | 0 | igraph_vector_t *reset = data->reset; |
132 | 0 | igraph_int_t i, j, nlen; |
133 | 0 | igraph_real_t sumfrom = 0.0; |
134 | 0 | igraph_vector_int_t *neis; |
135 | 0 | igraph_real_t fact = 1 - data->damping; |
136 | | |
137 | | /* |
138 | | printf("PageRank weighted: multiplying vector: "); |
139 | | for (i=0; i<n; i++) { printf(" %.4f", from[i]); } |
140 | | printf("\n"); |
141 | | */ |
142 | |
|
143 | 0 | for (i = 0; i < n; i++) { |
144 | 0 | if (VECTOR(*outdegree)[i] > 0) { |
145 | 0 | sumfrom += from[i] * fact; |
146 | 0 | VECTOR(*tmp)[i] = from[i] / VECTOR(*outdegree)[i]; |
147 | 0 | } else { |
148 | 0 | sumfrom += from[i]; |
149 | | /* The following value is used only when all outgoing edges have |
150 | | * weight zero (as opposed to there being no outgoing edges at all). |
151 | | * We set it to zero to avoid a 0.0*inf situation when computing |
152 | | * to[i] below. */ |
153 | 0 | VECTOR(*tmp)[i] = 0; |
154 | 0 | } |
155 | 0 | } |
156 | |
|
157 | 0 | for (i = 0; i < n; i++) { |
158 | 0 | neis = igraph_inclist_get(inclist, i); |
159 | 0 | nlen = igraph_vector_int_size(neis); |
160 | 0 | to[i] = 0.0; |
161 | 0 | for (j = 0; j < nlen; j++) { |
162 | 0 | igraph_int_t edge = VECTOR(*neis)[j]; |
163 | 0 | igraph_int_t nei = IGRAPH_OTHER(graph, edge, i); |
164 | 0 | to[i] += VECTOR(*weights)[edge] * VECTOR(*tmp)[nei]; |
165 | 0 | } |
166 | 0 | to[i] *= data->damping; |
167 | 0 | } |
168 | | |
169 | | /* printf("sumfrom = %.6f\n", (float)sumfrom); */ |
170 | |
|
171 | 0 | if (reset) { |
172 | | /* Running personalized PageRank */ |
173 | 0 | for (i = 0; i < n; i++) { |
174 | 0 | to[i] += sumfrom * VECTOR(*reset)[i]; |
175 | 0 | } |
176 | 0 | } else { |
177 | | /* Traditional PageRank with uniform reset vector */ |
178 | 0 | sumfrom /= n; |
179 | 0 | for (i = 0; i < n; i++) { |
180 | 0 | to[i] += sumfrom; |
181 | 0 | } |
182 | 0 | } |
183 | | |
184 | | /* |
185 | | printf("PageRank weighted: multiplied vector: "); |
186 | | for (i=0; i<n; i++) { printf(" %.4f", to[i]); } |
187 | | printf("\n"); |
188 | | */ |
189 | |
|
190 | 0 | return IGRAPH_SUCCESS; |
191 | 0 | } |
192 | | |
193 | | /** |
194 | | * \function igraph_pagerank |
195 | | * \brief Calculates the Google PageRank for the specified vertices. |
196 | | * |
197 | | * The PageRank centrality of a vertex is the fraction of time a |
198 | | * random walker traversing the graph would spend on that vertex. |
199 | | * The walker follows the out-edges with probabilities proportional |
200 | | * to their weights. Additionally, in each step, it restarts the walk |
201 | | * from a random vertex with probability <code>1 - damping</code>. |
202 | | * If the random walker gets stuck in a sink vertex, it will also restart |
203 | | * from a random vertex. |
204 | | * |
205 | | * </para><para> |
206 | | * The PageRank centrality is mainly useful for directed graphs. In undirected |
207 | | * graphs it converges to trivial values proportional to degrees as the damping |
208 | | * factor approaches 1. |
209 | | * |
210 | | * </para><para> |
211 | | * Starting from version 0.9, igraph has two PageRank implementations, |
212 | | * and the user can choose between them. The first implementation is |
213 | | * \c IGRAPH_PAGERANK_ALGO_ARPACK, which phrases the PageRank calculation |
214 | | * as an eigenvalue problem, which is then solved using the ARPACK library. |
215 | | * This was the default before igraph version 0.7. The second and recommended |
216 | | * implementation is \c IGRAPH_PAGERANK_ALGO_PRPACK. This is using the |
217 | | * PRPACK package, see https://github.com/dgleich/prpack. PRPACK uses an |
218 | | * algebraic method, i.e. solves a linear system to obtain the PageRank |
219 | | * scores. |
220 | | * |
221 | | * </para><para> |
222 | | * Note that the PageRank of a given vertex depends on the PageRank |
223 | | * of all other vertices, so even if you want to calculate the PageRank for |
224 | | * only some of the vertices, all of them must be calculated. Requesting |
225 | | * the PageRank for only some of the vertices does not result in any |
226 | | * performance increase at all. |
227 | | * |
228 | | * </para><para> |
229 | | * References: |
230 | | * |
231 | | * </para><para> |
232 | | * Sergey Brin and Larry Page: The Anatomy of a Large-Scale Hypertextual |
233 | | * Web Search Engine. Proceedings of the 7th World-Wide Web Conference, |
234 | | * Brisbane, Australia, April 1998. |
235 | | * https://doi.org/10.1016/S0169-7552(98)00110-X |
236 | | * |
237 | | * \param graph The graph object. |
238 | | * \param weights Optional edge weights. May be a \c NULL pointer, |
239 | | * meaning unweighted edges, or a vector of non-negative values |
240 | | * of the same length as the number of edges. |
241 | | * \param vector Pointer to an initialized vector, the result is |
242 | | * stored here. It is resized as needed. |
243 | | * \param value Pointer to a real variable. When using \c IGRAPH_PAGERANK_ALGO_ARPACK, |
244 | | * the eigenvalue corresponding to the PageRank vector is stored here. It is |
245 | | * expected to be exactly one. Checking this value can be used to diagnose cases |
246 | | * when ARPACK failed to converge to the leading eigenvector. |
247 | | * When using \c IGRAPH_PAGERANK_ALGO_PRPACK, this is always set to 1.0. |
248 | | * \param damping The damping factor ("d" in the original paper). |
249 | | * Must be a probability in the range [0, 1]. A commonly used value is 0.85. |
250 | | * \param directed Boolean, whether to consider the directedness of |
251 | | * the edges. This is ignored for undirected graphs. |
252 | | * \param vids The vertex IDs for which the PageRank is returned. This parameter |
253 | | * is only for convenience. Computing PageRank for fewer than all vertices will |
254 | | * not speed up the calculation. |
255 | | * \param algo The PageRank implementation to use. Possible values: |
256 | | * \c IGRAPH_PAGERANK_ALGO_ARPACK, \c IGRAPH_PAGERANK_ALGO_PRPACK. |
257 | | * \param options Options for the ARPACK method. See \ref igraph_arpack_options_t |
258 | | * for details. Supply \c NULL here to use the defaults. Note that the function |
259 | | * overwrites the <code>n</code> (number of vertices), <code>nev</code> (1), |
260 | | * <code>ncv</code> (3) and <code>which</code> (LM) parameters and it always |
261 | | * starts the calculation from a non-random vector calculated based on the |
262 | | * degree of the vertices. |
263 | | * \return Error code: |
264 | | * \c IGRAPH_ENOMEM, not enough memory for temporary data. |
265 | | * \c IGRAPH_EINVVID, invalid vertex ID in \p vids. |
266 | | * |
267 | | * Time complexity: depends on the input graph, usually it is O(|E|), |
268 | | * the number of edges. |
269 | | * |
270 | | * \sa \ref igraph_personalized_pagerank() and \ref igraph_personalized_pagerank_vs() |
271 | | * for the personalized PageRank measure. See \ref igraph_arpack_rssolve() and |
272 | | * \ref igraph_arpack_rnsolve() for the underlying machinery used by |
273 | | * \c IGRAPH_PAGERANK_ALGO_ARPACK. |
274 | | * |
275 | | * \example examples/simple/igraph_pagerank.c |
276 | | */ |
277 | | igraph_error_t igraph_pagerank( |
278 | | const igraph_t *graph, const igraph_vector_t *weights, |
279 | | igraph_vector_t *vector, igraph_real_t *value, |
280 | | igraph_real_t damping, igraph_bool_t directed, |
281 | | igraph_vs_t vids, |
282 | | igraph_pagerank_algo_t algo, |
283 | 2.01k | igraph_arpack_options_t *options) { |
284 | 2.01k | return igraph_personalized_pagerank( |
285 | 2.01k | graph, weights, |
286 | 2.01k | vector, value, |
287 | 2.01k | NULL, |
288 | 2.01k | damping, directed, |
289 | 2.01k | vids, |
290 | 2.01k | algo, |
291 | 2.01k | options); |
292 | 2.01k | } |
293 | | |
294 | | /** |
295 | | * \function igraph_personalized_pagerank_vs |
296 | | * \brief Calculates the personalized Google PageRank for the specified vertices. |
297 | | * |
298 | | * The personalized PageRank is similar to the original PageRank measure, but |
299 | | * when the random walk is restarted, a new starting vertex is chosen according to |
300 | | * a specified distribution. |
301 | | * This distribution is used both when restarting randomly with probability |
302 | | * <code>1 - damping</code>, and when the walker is forced to restart due to being |
303 | | * stuck in a sink vertex (a vertex with no outgoing edges). |
304 | | * |
305 | | * </para><para> |
306 | | * This simplified interface takes a vertex sequence and resets the random walk to |
307 | | * one of the vertices in the specified vertex sequence, chosen uniformly. A typical |
308 | | * application of personalized PageRank is when the random walk is reset to the same |
309 | | * vertex every time: this can easily be achieved using \ref igraph_vss_1() which |
310 | | * generates a vertex sequence containing only a single vertex. |
311 | | * |
312 | | * </para><para> |
313 | | * Note that the personalized PageRank of a given vertex depends on the |
314 | | * personalized PageRank of all other vertices, so even if you want to calculate |
315 | | * the personalized PageRank for only some of the vertices, all of them must be |
316 | | * calculated. Requesting the personalized PageRank for only some of the vertices |
317 | | * does not result in any performance increase at all. |
318 | | * |
319 | | * \param graph The graph object. |
320 | | * \param weights Optional edge weights, it is either a null pointer, |
321 | | * then the edges are not weighted, or a vector of the same length |
322 | | * as the number of edges. |
323 | | * \param vector Pointer to an initialized vector, the result is |
324 | | * stored here. It is resized as needed. |
325 | | * \param value Pointer to a real variable. When using \c IGRAPH_PAGERANK_ALGO_ARPACK, |
326 | | * the eigenvalue corresponding to the PageRank vector is stored here. It is |
327 | | * expected to be exactly one. Checking this value can be used to diagnose cases |
328 | | * when ARPACK failed to converge to the leading eigenvector. |
329 | | * When using \c IGRAPH_PAGERANK_ALGO_PRPACK, this is always set to 1.0. |
330 | | * \param reset_vids IDs of the vertices used when resetting the random walk. |
331 | | * The walk will be restarted from a vertex in this set, chosen uniformly at |
332 | | * random. Duplicate vertices are allowed. |
333 | | * \param damping The damping factor ("d" in the original paper). |
334 | | * Must be a probability in the range [0, 1]. A commonly used value is 0.85. |
335 | | * \param directed Boolean, whether to consider the directedness of |
336 | | * the edges. This is ignored for undirected graphs. |
337 | | * \param vids The vertex IDs for which the PageRank is returned. This parameter |
338 | | * is only for convenience. Computing PageRank for fewer than all vertices will |
339 | | * not speed up the calculation. |
340 | | * \param algo The PageRank implementation to use. Possible values: |
341 | | * \c IGRAPH_PAGERANK_ALGO_ARPACK, \c IGRAPH_PAGERANK_ALGO_PRPACK. |
342 | | * \param options Options for the ARPACK method. See \ref igraph_arpack_options_t |
343 | | * for details. Supply \c NULL here to use the defaults. Note that the function |
344 | | * overwrites the <code>n</code> (number of vertices), <code>nev</code> (1), |
345 | | * <code>ncv</code> (3) and <code>which</code> (LM) parameters and it always |
346 | | * starts the calculation from a non-random vector calculated based on the |
347 | | * degree of the vertices. |
348 | | * \return Error code: |
349 | | * \c IGRAPH_ENOMEM, not enough memory for |
350 | | * temporary data. |
351 | | * \c IGRAPH_EINVVID, invalid vertex ID in |
352 | | * \p vids or an empty reset vertex sequence in |
353 | | * \p vids_reset. |
354 | | * |
355 | | * Time complexity: depends on the input graph, usually it is O(|E|), |
356 | | * the number of edges. |
357 | | * |
358 | | * \sa \ref igraph_pagerank() for the non-personalized implementation. |
359 | | */ |
360 | | igraph_error_t igraph_personalized_pagerank_vs( |
361 | | const igraph_t *graph, const igraph_vector_t *weights, |
362 | | igraph_vector_t *vector, igraph_real_t *value, |
363 | | igraph_vs_t reset_vids, |
364 | | igraph_real_t damping, igraph_bool_t directed, |
365 | | igraph_vs_t vids, |
366 | | igraph_pagerank_algo_t algo, |
367 | 0 | igraph_arpack_options_t *options) { |
368 | |
|
369 | 0 | igraph_vector_t reset; |
370 | 0 | igraph_vit_t vit; |
371 | |
|
372 | 0 | IGRAPH_VECTOR_INIT_FINALLY(&reset, igraph_vcount(graph)); |
373 | 0 | IGRAPH_CHECK(igraph_vit_create(graph, reset_vids, &vit)); |
374 | 0 | IGRAPH_FINALLY(igraph_vit_destroy, &vit); |
375 | |
|
376 | 0 | while (!IGRAPH_VIT_END(vit)) { |
377 | | /* Increment by 1 instead of setting to 1 to handle duplicates. */ |
378 | 0 | VECTOR(reset)[IGRAPH_VIT_GET(vit)] += 1.0; |
379 | 0 | IGRAPH_VIT_NEXT(vit); |
380 | 0 | } |
381 | 0 | igraph_vit_destroy(&vit); |
382 | 0 | IGRAPH_FINALLY_CLEAN(1); |
383 | |
|
384 | 0 | IGRAPH_CHECK(igraph_personalized_pagerank( |
385 | 0 | graph, weights, vector, |
386 | 0 | value, &reset, |
387 | 0 | damping, directed, vids, algo, |
388 | 0 | options)); |
389 | | |
390 | 0 | igraph_vector_destroy(&reset); |
391 | 0 | IGRAPH_FINALLY_CLEAN(1); |
392 | |
|
393 | 0 | return IGRAPH_SUCCESS; |
394 | 0 | } |
395 | | |
396 | | /** |
397 | | * \function igraph_personalized_pagerank |
398 | | * \brief Calculates the personalized Google PageRank for the specified vertices. |
399 | | * |
400 | | * The personalized PageRank is similar to the original PageRank measure, but |
401 | | * when the random walk is restarted, a new starting vertex is chosen non-uniformly, |
402 | | * according to the distribution specified in \p reset |
403 | | * (instead of the uniform distribution in the original PageRank measure). |
404 | | * The \p reset distribution is used both when restarting randomly with probability |
405 | | * <code>1 - damping</code>, and when the walker is forced to restart due to being |
406 | | * stuck in a sink vertex (a vertex with no outgoing edges). |
407 | | * |
408 | | * </para><para> |
409 | | * Note that the personalized PageRank of a given vertex depends on the |
410 | | * personalized PageRank of all other vertices, so even if you want to calculate |
411 | | * the personalized PageRank for only some of the vertices, all of them must be |
412 | | * calculated. Requesting the personalized PageRank for only some of the vertices |
413 | | * does not result in any performance increase at all. |
414 | | * |
415 | | * \param graph The graph object. |
416 | | * \param weights Optional edge weights. May be a \c NULL pointer, |
417 | | * meaning unweighted edges, or a vector of non-negative values |
418 | | * of the same length as the number of edges. |
419 | | * \param vector Pointer to an initialized vector, the result is |
420 | | * stored here. It is resized as needed. |
421 | | * \param value Pointer to a real variable. When using \c IGRAPH_PAGERANK_ALGO_ARPACK, |
422 | | * the eigenvalue corresponding to the PageRank vector is stored here. It is |
423 | | * expected to be exactly one. Checking this value can be used to diagnose cases |
424 | | * when ARPACK failed to converge to the leading eigenvector. |
425 | | * When using \c IGRAPH_PAGERANK_ALGO_PRPACK, this is always set to 1.0. |
426 | | * \param reset The probability distribution over the vertices used when |
427 | | * resetting the random walk. It is either a \c NULL pointer (denoting |
428 | | * a uniform choice that results in the original PageRank measure) |
429 | | * or a vector of the same length as the number of vertices. |
430 | | * \param damping The damping factor ("d" in the original paper). |
431 | | * Must be a probability in the range [0, 1]. A commonly used value is 0.85. |
432 | | * \param directed Boolean, whether to consider the directedness of |
433 | | * the edges. This is ignored for undirected graphs. |
434 | | * \param vids The vertex IDs for which the PageRank is returned. This parameter |
435 | | * is only for convenience. Computing PageRank for fewer than all vertices will |
436 | | * not speed up the calculation. |
437 | | * \param algo The PageRank implementation to use. Possible values: |
438 | | * \c IGRAPH_PAGERANK_ALGO_ARPACK, \c IGRAPH_PAGERANK_ALGO_PRPACK. |
439 | | * \param options Options for the ARPACK method. See \ref igraph_arpack_options_t |
440 | | * for details. Supply \c NULL here to use the defaults. Note that the function |
441 | | * overwrites the <code>n</code> (number of vertices), <code>nev</code> (1), |
442 | | * <code>ncv</code> (3) and <code>which</code> (LM) parameters and it always |
443 | | * starts the calculation from a non-random vector calculated based on the |
444 | | * degree of the vertices. |
445 | | * \return Error code: |
446 | | * \c IGRAPH_ENOMEM, not enough memory for |
447 | | * temporary data. |
448 | | * \c IGRAPH_EINVVID, invalid vertex ID in |
449 | | * \p vids or an invalid reset vector in \p reset. |
450 | | * |
451 | | * Time complexity: depends on the input graph, usually it is O(|E|), |
452 | | * the number of edges. |
453 | | * |
454 | | * \sa \ref igraph_pagerank() for the non-personalized implementation, |
455 | | * \ref igraph_personalized_pagerank_vs() for a personalized implementation |
456 | | * with resetting to specific vertices. |
457 | | */ |
458 | | igraph_error_t igraph_personalized_pagerank( |
459 | | const igraph_t *graph, const igraph_vector_t *weights, |
460 | | igraph_vector_t *vector, igraph_real_t *value, |
461 | | const igraph_vector_t *reset, |
462 | | igraph_real_t damping, igraph_bool_t directed, |
463 | | igraph_vs_t vids, |
464 | | igraph_pagerank_algo_t algo, |
465 | 2.01k | igraph_arpack_options_t *options) { |
466 | | |
467 | 2.01k | if (damping < 0.0 || damping > 1.0) { |
468 | 0 | IGRAPH_ERROR("The PageRank damping factor must be in the range [0,1].", IGRAPH_EINVAL); |
469 | 0 | } |
470 | | |
471 | 2.01k | if (algo == IGRAPH_PAGERANK_ALGO_ARPACK) { |
472 | 0 | return igraph_i_personalized_pagerank_arpack(graph, vector, value, vids, |
473 | 0 | directed, damping, reset, |
474 | 0 | weights, options ? options : igraph_arpack_options_get_default() |
475 | 0 | ); |
476 | 2.01k | } else if (algo == IGRAPH_PAGERANK_ALGO_PRPACK) { |
477 | 2.01k | return igraph_i_personalized_pagerank_prpack(graph, vector, value, vids, |
478 | 2.01k | directed, damping, reset, |
479 | 2.01k | weights); |
480 | 2.01k | } |
481 | | |
482 | 0 | IGRAPH_ERROR("Unknown PageRank algorithm", IGRAPH_EINVAL); |
483 | 0 | } |
484 | | |
485 | | /* |
486 | | * ARPACK-based implementation of \c igraph_personalized_pagerank. |
487 | | * |
488 | | * See \c igraph_personalized_pagerank for the documentation of the parameters. |
489 | | */ |
490 | | static igraph_error_t igraph_i_personalized_pagerank_arpack(const igraph_t *graph, igraph_vector_t *vector, |
491 | | igraph_real_t *value, const igraph_vs_t vids, |
492 | | igraph_bool_t directed, igraph_real_t damping, |
493 | | const igraph_vector_t *reset, |
494 | | const igraph_vector_t *weights, |
495 | 0 | igraph_arpack_options_t *options) { |
496 | 0 | igraph_matrix_t values; |
497 | 0 | igraph_matrix_t vectors; |
498 | 0 | igraph_neimode_t dirmode; |
499 | 0 | igraph_vector_t outdegree; |
500 | 0 | igraph_vector_t indegree; |
501 | 0 | igraph_vector_t tmp; |
502 | 0 | igraph_vector_t normalized_reset; |
503 | |
|
504 | 0 | igraph_int_t i; |
505 | 0 | igraph_int_t no_of_nodes = igraph_vcount(graph); |
506 | 0 | igraph_int_t no_of_edges = igraph_ecount(graph); |
507 | |
|
508 | 0 | igraph_real_t reset_sum; /* used only when reset != NULL */ |
509 | |
|
510 | 0 | if (no_of_nodes > INT_MAX) { |
511 | 0 | IGRAPH_ERROR("Graph has too many vertices for ARPACK.", IGRAPH_EOVERFLOW); |
512 | 0 | } |
513 | | |
514 | 0 | if (weights && igraph_vector_size(weights) != no_of_edges) { |
515 | 0 | IGRAPH_ERROR("Invalid length of weights vector when calculating PageRank scores.", IGRAPH_EINVAL); |
516 | 0 | } |
517 | | |
518 | 0 | if (reset && igraph_vector_size(reset) != no_of_nodes) { |
519 | 0 | IGRAPH_ERROR("Invalid length of reset vector when calculating personalized PageRank scores.", IGRAPH_EINVAL); |
520 | 0 | } |
521 | | |
522 | 0 | if (reset) { |
523 | 0 | reset_sum = igraph_vector_sum(reset); |
524 | 0 | if (no_of_nodes > 0 && reset_sum == 0) { |
525 | 0 | IGRAPH_ERROR("The sum of the elements in the reset vector must not be zero.", IGRAPH_EINVAL); |
526 | 0 | } |
527 | | |
528 | 0 | igraph_real_t reset_min = igraph_vector_min(reset); |
529 | 0 | if (reset_min < 0) { |
530 | 0 | IGRAPH_ERROR("The reset vector must not contain negative elements.", IGRAPH_EINVAL); |
531 | 0 | } |
532 | 0 | if (!isfinite(reset_sum)) { |
533 | 0 | IGRAPH_ERROR("The reset vector must not contain infinite or NaN values.", IGRAPH_EINVAL); |
534 | 0 | } |
535 | 0 | } |
536 | | |
537 | 0 | if (no_of_edges == 0) { |
538 | | /* Special case: graph with no edges. Result is the same as the personalization vector. */ |
539 | 0 | if (value) { |
540 | 0 | *value = 1.0; |
541 | 0 | } |
542 | 0 | if (vector) { |
543 | 0 | if (reset && no_of_nodes > 0) { |
544 | 0 | IGRAPH_CHECK(igraph_vector_update(vector, reset)); |
545 | 0 | igraph_vector_scale(vector, 1.0 / reset_sum); |
546 | 0 | } else { |
547 | 0 | IGRAPH_CHECK(igraph_vector_resize(vector, no_of_nodes)); |
548 | 0 | igraph_vector_fill(vector, 1.0 / no_of_nodes); |
549 | 0 | } |
550 | 0 | } |
551 | 0 | return IGRAPH_SUCCESS; |
552 | 0 | } |
553 | | |
554 | 0 | options->n = (int) no_of_nodes; |
555 | 0 | options->nev = 1; |
556 | 0 | options->ncv = 0; /* 0 means "automatic" in igraph_arpack_rnsolve */ |
557 | 0 | options->which[0] = 'L'; options->which[1] = 'R'; |
558 | 0 | options->start = 1; /* no random start vector */ |
559 | |
|
560 | 0 | directed = directed && igraph_is_directed(graph); |
561 | |
|
562 | 0 | if (weights) { |
563 | 0 | igraph_real_t min, max; |
564 | | |
565 | | /* Safe to call minmax, ecount == 0 case was caught earlier */ |
566 | 0 | igraph_vector_minmax(weights, &min, &max); |
567 | 0 | if (min < 0) { |
568 | 0 | IGRAPH_ERROR("Edge weights must not be negative.", IGRAPH_EINVAL); |
569 | 0 | } |
570 | 0 | if (isnan(min)) { |
571 | 0 | IGRAPH_ERROR("Weight vector must not contain NaN values.", IGRAPH_EINVAL); |
572 | 0 | } |
573 | 0 | if (min == 0 && max == 0) { |
574 | | /* Special case: all weights are zeros. Result is the same as the personalization vector. */ |
575 | 0 | if (value) { |
576 | 0 | *value = 1.0; |
577 | 0 | } |
578 | 0 | if (vector) { |
579 | 0 | IGRAPH_CHECK(igraph_vector_resize(vector, no_of_nodes)); |
580 | 0 | if (reset) { |
581 | 0 | for (i=0; i < no_of_nodes; ++i) { |
582 | 0 | VECTOR(*vector)[i] = VECTOR(*reset)[i]; |
583 | 0 | } |
584 | 0 | igraph_vector_scale(vector, 1.0 / igraph_vector_sum(vector)); |
585 | 0 | } else { |
586 | 0 | igraph_vector_fill(vector, 1.0 / no_of_nodes); |
587 | 0 | } |
588 | 0 | } |
589 | 0 | return IGRAPH_SUCCESS; |
590 | 0 | } |
591 | 0 | } |
592 | | |
593 | 0 | IGRAPH_MATRIX_INIT_FINALLY(&values, 0, 0); |
594 | 0 | IGRAPH_MATRIX_INIT_FINALLY(&vectors, options->n, 1); |
595 | | |
596 | 0 | if (directed) { |
597 | 0 | dirmode = IGRAPH_IN; |
598 | 0 | } else { |
599 | 0 | dirmode = IGRAPH_ALL; |
600 | 0 | } |
601 | |
|
602 | 0 | IGRAPH_VECTOR_INIT_FINALLY(&indegree, options->n); |
603 | 0 | IGRAPH_VECTOR_INIT_FINALLY(&outdegree, options->n); |
604 | 0 | IGRAPH_VECTOR_INIT_FINALLY(&tmp, options->n); |
605 | | |
606 | 0 | if (reset) { |
607 | | /* Normalize reset vector so the sum is 1 */ |
608 | 0 | IGRAPH_CHECK(igraph_vector_init_copy(&normalized_reset, reset)); |
609 | 0 | IGRAPH_FINALLY(igraph_vector_destroy, &normalized_reset); |
610 | |
|
611 | 0 | igraph_vector_scale(&normalized_reset, 1.0 / reset_sum); |
612 | 0 | } |
613 | | |
614 | 0 | IGRAPH_CHECK(igraph_strength(graph, &outdegree, igraph_vss_all(), |
615 | 0 | directed ? IGRAPH_OUT : IGRAPH_ALL, IGRAPH_LOOPS, weights)); |
616 | 0 | IGRAPH_CHECK(igraph_strength(graph, &indegree, igraph_vss_all(), |
617 | 0 | directed ? IGRAPH_IN : IGRAPH_ALL, IGRAPH_LOOPS, weights)); |
618 | | |
619 | | /* Set up an appropriate starting vector. We start from the (possibly weighted) |
620 | | * in-degrees plus some small random noise to avoid convergence problems. */ |
621 | 0 | for (i = 0; i < no_of_nodes; i++) { |
622 | 0 | if (VECTOR(indegree)[i] > 0) { |
623 | | /* Note: Keep random perturbation non-negative. */ |
624 | 0 | MATRIX(vectors, i, 0) = VECTOR(indegree)[i] + RNG_UNIF(0, 1e-4); |
625 | 0 | } else { |
626 | 0 | MATRIX(vectors, i, 0) = 1; |
627 | 0 | } |
628 | 0 | } |
629 | |
|
630 | 0 | if (!weights) { |
631 | |
|
632 | 0 | igraph_adjlist_t adjlist; |
633 | 0 | pagerank_data_t data; |
634 | |
|
635 | 0 | data.graph = graph; |
636 | 0 | data.adjlist = &adjlist; |
637 | 0 | data.damping = damping; |
638 | 0 | data.outdegree = &outdegree; |
639 | 0 | data.tmp = &tmp; |
640 | 0 | data.reset = reset ? &normalized_reset : NULL; |
641 | |
|
642 | 0 | IGRAPH_CHECK(igraph_adjlist_init(graph, &adjlist, dirmode, IGRAPH_LOOPS, IGRAPH_MULTIPLE)); |
643 | 0 | IGRAPH_FINALLY(igraph_adjlist_destroy, &adjlist); |
644 | |
|
645 | 0 | IGRAPH_CHECK(igraph_arpack_rnsolve(pagerank_operator_unweighted, |
646 | 0 | &data, options, NULL, &values, &vectors)); |
647 | | |
648 | 0 | igraph_adjlist_destroy(&adjlist); |
649 | 0 | IGRAPH_FINALLY_CLEAN(1); |
650 | |
|
651 | 0 | } else { |
652 | |
|
653 | 0 | igraph_inclist_t inclist; |
654 | 0 | pagerank_data_weighted_t data; |
655 | |
|
656 | 0 | data.graph = graph; |
657 | 0 | data.inclist = &inclist; |
658 | 0 | data.weights = weights; |
659 | 0 | data.damping = damping; |
660 | 0 | data.outdegree = &outdegree; |
661 | 0 | data.tmp = &tmp; |
662 | 0 | data.reset = reset ? &normalized_reset : NULL; |
663 | |
|
664 | 0 | IGRAPH_CHECK(igraph_inclist_init(graph, &inclist, dirmode, IGRAPH_LOOPS)); |
665 | 0 | IGRAPH_FINALLY(igraph_inclist_destroy, &inclist); |
666 | |
|
667 | 0 | IGRAPH_CHECK(igraph_arpack_rnsolve(pagerank_operator_weighted, |
668 | 0 | &data, options, NULL, &values, &vectors)); |
669 | | |
670 | 0 | igraph_inclist_destroy(&inclist); |
671 | 0 | IGRAPH_FINALLY_CLEAN(1); |
672 | 0 | } |
673 | | |
674 | 0 | if (reset) { |
675 | 0 | igraph_vector_destroy(&normalized_reset); |
676 | 0 | IGRAPH_FINALLY_CLEAN(1); |
677 | 0 | } |
678 | |
|
679 | 0 | igraph_vector_destroy(&tmp); |
680 | 0 | igraph_vector_destroy(&outdegree); |
681 | 0 | igraph_vector_destroy(&indegree); |
682 | 0 | IGRAPH_FINALLY_CLEAN(3); |
683 | |
|
684 | 0 | if (value) { |
685 | 0 | *value = MATRIX(values, 0, 0); |
686 | 0 | } |
687 | |
|
688 | 0 | if (vector) { |
689 | 0 | igraph_vit_t vit; |
690 | 0 | igraph_int_t nodes_to_calc; |
691 | 0 | igraph_real_t sum = 0; |
692 | |
|
693 | 0 | for (i = 0; i < no_of_nodes; i++) { |
694 | 0 | sum += MATRIX(vectors, i, 0); |
695 | 0 | } |
696 | |
|
697 | 0 | IGRAPH_CHECK(igraph_vit_create(graph, vids, &vit)); |
698 | 0 | IGRAPH_FINALLY(igraph_vit_destroy, &vit); |
699 | 0 | nodes_to_calc = IGRAPH_VIT_SIZE(vit); |
700 | |
|
701 | 0 | IGRAPH_CHECK(igraph_vector_resize(vector, nodes_to_calc)); |
702 | 0 | for (IGRAPH_VIT_RESET(vit), i = 0; !IGRAPH_VIT_END(vit); |
703 | 0 | IGRAPH_VIT_NEXT(vit), i++) { |
704 | 0 | VECTOR(*vector)[i] = MATRIX(vectors, IGRAPH_VIT_GET(vit), 0); |
705 | 0 | VECTOR(*vector)[i] /= sum; |
706 | 0 | } |
707 | |
|
708 | 0 | igraph_vit_destroy(&vit); |
709 | 0 | IGRAPH_FINALLY_CLEAN(1); |
710 | 0 | } |
711 | | |
712 | 0 | if (options->info) { |
713 | 0 | IGRAPH_WARNING("Non-zero return code from ARPACK routine!"); |
714 | 0 | } |
715 | |
|
716 | 0 | igraph_matrix_destroy(&vectors); |
717 | 0 | igraph_matrix_destroy(&values); |
718 | 0 | IGRAPH_FINALLY_CLEAN(2); |
719 | |
|
720 | 0 | return IGRAPH_SUCCESS; |
721 | 0 | } |