"""PageRank analysis of graph structure."""
from warnings import warn

import networkx as nx

__all__ = ["pagerank", "google_matrix"]


@nx._dispatch(edge_attrs="weight")
def pagerank(
    G,
    alpha=0.85,
    personalization=None,
    max_iter=100,
    tol=1.0e-6,
    nstart=None,
    weight="weight",
    dangling=None,
):
    """Returns the PageRank of the nodes in the graph.

    PageRank computes a ranking of the nodes in the graph G based on
    the structure of the incoming links. It was originally designed as
    an algorithm to rank web pages.

    Parameters
    ----------
    G : graph
      A NetworkX graph. Undirected graphs will be converted to a directed
      graph with two directed edges for each undirected edge.

    alpha : float, optional
      Damping parameter for PageRank, default=0.85.

    personalization : dict, optional
      The "personalization vector", given as a dictionary keyed by a subset
      of graph nodes with the personalization value for each of those nodes.
      At least one personalization value must be non-zero; nodes missing
      from the dictionary are assigned a personalization value of zero.
      If not specified, a uniform distribution over all nodes is used.

    max_iter : integer, optional
      Maximum number of iterations in power method eigenvalue solver.

    tol : float, optional
      Error tolerance used to check convergence in power method solver.
      The iteration stops once the total change per iteration (l1 norm)
      falls below ``len(G) * tol``.

    nstart : dictionary, optional
      Starting value of PageRank iteration for each node.

    weight : key, optional
      Edge data key to use as weight. If None, weights are set to 1.

    dangling : dict, optional
      The outedges to be assigned to any "dangling" nodes, i.e., nodes without
      any outedges. The dict key is the node the outedge points to and the dict
      value is the weight of that outedge. By default, dangling nodes are given
      outedges according to the personalization vector (uniform if not
      specified). This must be selected to result in an irreducible transition
      matrix (see notes under google_matrix). It is common for the dangling
      dict to be the same as the personalization dict.

    Returns
    -------
    pagerank : dictionary
      Dictionary of nodes with PageRank as value.

    Examples
    --------
    >>> G = nx.DiGraph(nx.path_graph(4))
    >>> pr = nx.pagerank(G, alpha=0.9)
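
    A ranking biased toward particular nodes (the node labels here are purely
    illustrative) can be obtained through the ``personalization`` argument:

    >>> pr_personalized = nx.pagerank(G, alpha=0.9, personalization={0: 1, 3: 1})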

    Notes
    -----
    The eigenvector calculation is done by the power iteration method
    and has no guarantee of convergence. The iteration will stop after
    an error tolerance of ``len(G) * tol`` has been reached. If the
    number of iterations exceeds `max_iter`, a
    :exc:`networkx.exception.PowerIterationFailedConvergence` exception
    is raised.
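
    Informally, each iteration redistributes the current rank values along the
    stochastically normalized out-edges, re-inserts the rank held by dangling
    nodes according to `dangling`, and mixes in the ``(1 - alpha)`` teleportation
    term from the personalization vector; the private helpers below spell out
    the exact update.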

    The PageRank algorithm was designed for directed graphs but this
    algorithm does not check if the input graph is directed and will
    execute on undirected graphs by converting each undirected edge to
    two directed edges.

    See Also
    --------
    google_matrix

    Raises
    ------
    PowerIterationFailedConvergence
        If the algorithm fails to converge to the specified tolerance
        within the specified number of iterations of the power iteration
        method.

    References
    ----------
    .. [1] A. Langville and C. Meyer,
       "A survey of eigenvector methods of web information retrieval."
       http://citeseer.ist.psu.edu/713792.html
    .. [2] Page, Lawrence; Brin, Sergey; Motwani, Rajeev and Winograd, Terry,
       The PageRank citation ranking: Bringing order to the Web. 1999
       http://dbpubs.stanford.edu:8090/pub/showDoc.Fulltext?lang=en&doc=1999-66&format=pdf

    """
    return _pagerank_scipy(
        G, alpha, personalization, max_iter, tol, nstart, weight, dangling
    )


def _pagerank_python(
    G,
    alpha=0.85,
    personalization=None,
    max_iter=100,
    tol=1.0e-6,
    nstart=None,
    weight="weight",
    dangling=None,
):
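    """Pure-Python (dict-based) power-iteration implementation of PageRank.

    Mirrors the parameters and semantics of `pagerank` without requiring
    NumPy or SciPy.
    """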

    if len(G) == 0:
        return {}

    D = G.to_directed()

    # Create a copy in (right) stochastic form
    W = nx.stochastic_graph(D, weight=weight)
    N = W.number_of_nodes()

    # Choose fixed starting vector if not given
    if nstart is None:
        x = dict.fromkeys(W, 1.0 / N)
    else:
        # Normalized nstart vector
        s = sum(nstart.values())
        x = {k: v / s for k, v in nstart.items()}

    if personalization is None:
        # Assign uniform personalization vector if not given
        p = dict.fromkeys(W, 1.0 / N)
    else:
        s = sum(personalization.values())
        p = {k: v / s for k, v in personalization.items()}

    if dangling is None:
        # Use personalization vector if dangling vector not specified
        dangling_weights = p
    else:
        s = sum(dangling.values())
        dangling_weights = {k: v / s for k, v in dangling.items()}
    dangling_nodes = [n for n in W if W.out_degree(n, weight=weight) == 0.0]

    # power iteration: make up to max_iter iterations
    for _ in range(max_iter):
        xlast = x
        x = dict.fromkeys(xlast.keys(), 0)
        danglesum = alpha * sum(xlast[n] for n in dangling_nodes)
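        # danglesum is the total rank currently sitting on dangling nodes; the
        # loop below redistributes it according to dangling_weights and also
        # adds the (1 - alpha) teleportation share of p to every node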

        for n in x:
            # this matrix multiply looks odd because it is
            # doing a left multiply x^T=xlast^T*W
            for _, nbr, wt in W.edges(n, data=weight):
                x[nbr] += alpha * xlast[n] * wt
            x[n] += danglesum * dangling_weights.get(n, 0) + (1.0 - alpha) * p.get(n, 0)
        # check convergence, l1 norm
        err = sum(abs(x[n] - xlast[n]) for n in x)
        if err < N * tol:
            return x
    raise nx.PowerIterationFailedConvergence(max_iter)


@nx._dispatch(edge_attrs="weight")
def google_matrix(
    G, alpha=0.85, personalization=None, nodelist=None, weight="weight", dangling=None
):
    """Returns the Google matrix of the graph.

    Parameters
    ----------
    G : graph
      A NetworkX graph. Undirected graphs will be converted to a directed
      graph with two directed edges for each undirected edge.

    alpha : float
      The damping factor.

    personalization : dict, optional
      The "personalization vector", given as a dictionary keyed by a subset
      of graph nodes with the personalization value for each of those nodes.
      At least one personalization value must be non-zero; nodes missing
      from the dictionary are assigned a personalization value of zero.
      If not specified, a uniform distribution over all nodes is used.

    nodelist : list, optional
      The rows and columns are ordered according to the nodes in nodelist.
      If nodelist is None, then the ordering is produced by G.nodes().

    weight : key, optional
      Edge data key to use as weight. If None, weights are set to 1.

    dangling : dict, optional
      The outedges to be assigned to any "dangling" nodes, i.e., nodes without
      any outedges. The dict key is the node the outedge points to and the dict
      value is the weight of that outedge. By default, dangling nodes are given
      outedges according to the personalization vector (uniform if not
      specified). This must be selected to result in an irreducible transition
      matrix (see notes below). It is common for the dangling dict to be the
      same as the personalization dict.

    Returns
    -------
    A : 2D NumPy ndarray
      Google matrix of the graph

    Notes
    -----
    The array returned represents the transition matrix that describes the
    Markov chain used in PageRank. For PageRank to converge to a unique
    solution (i.e., a unique stationary distribution in a Markov chain), the
    transition matrix must be irreducible. In other words, it must be that
    there exists a path between every pair of nodes in the graph, or else there
    is the potential of "rank sinks."

    This implementation works with Multi(Di)Graphs. For multigraphs the
    weight between two nodes is set to be the sum of all edge weights
    between those nodes.

    See Also
    --------
    pagerank
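
    Examples
    --------
    The returned array is dense and row-stochastic; the small graph below is
    purely illustrative.

    >>> G = nx.DiGraph(nx.path_graph(4))
    >>> M = nx.google_matrix(G, alpha=0.9)
    >>> M.shape
    (4, 4)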

    """
    import numpy as np

    if nodelist is None:
        nodelist = list(G)

    A = nx.to_numpy_array(G, nodelist=nodelist, weight=weight)
    N = len(G)
    if N == 0:
        return A

    # Personalization vector
    if personalization is None:
        p = np.repeat(1.0 / N, N)
    else:
        p = np.array([personalization.get(n, 0) for n in nodelist], dtype=float)
        if p.sum() == 0:
            raise ZeroDivisionError
        p /= p.sum()

    # Dangling nodes
    if dangling is None:
        dangling_weights = p
    else:
        # Convert the dangling dictionary into an array in nodelist order
        dangling_weights = np.array([dangling.get(n, 0) for n in nodelist], dtype=float)
        dangling_weights /= dangling_weights.sum()
    dangling_nodes = np.where(A.sum(axis=1) == 0)[0]

    # Assign dangling_weights to any dangling nodes (nodes with no out links)
    A[dangling_nodes] = dangling_weights

    A /= A.sum(axis=1)[:, np.newaxis]  # Normalize rows to sum to 1
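    # the Google matrix mixes the normalized link matrix A with the
    # teleportation distribution p: every row is alpha * A[i] + (1 - alpha) * p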

    return alpha * A + (1 - alpha) * p


def _pagerank_numpy(
    G, alpha=0.85, personalization=None, weight="weight", dangling=None
):
    """Returns the PageRank of the nodes in the graph.

    PageRank computes a ranking of the nodes in the graph G based on
    the structure of the incoming links. It was originally designed as
    an algorithm to rank web pages.

    Parameters
    ----------
    G : graph
      A NetworkX graph. Undirected graphs will be converted to a directed
      graph with two directed edges for each undirected edge.

    alpha : float, optional
      Damping parameter for PageRank, default=0.85.

    personalization : dict, optional
      The "personalization vector", given as a dictionary keyed by a subset
      of graph nodes with the personalization value for each of those nodes.
      At least one personalization value must be non-zero; nodes missing
      from the dictionary are assigned a personalization value of zero.
      If not specified, a uniform distribution over all nodes is used.

    weight : key, optional
      Edge data key to use as weight. If None, weights are set to 1.

    dangling : dict, optional
      The outedges to be assigned to any "dangling" nodes, i.e., nodes without
      any outedges. The dict key is the node the outedge points to and the dict
      value is the weight of that outedge. By default, dangling nodes are given
      outedges according to the personalization vector (uniform if not
      specified). This must be selected to result in an irreducible transition
      matrix (see notes under google_matrix). It is common for the dangling
      dict to be the same as the personalization dict.

    Returns
    -------
    pagerank : dictionary
      Dictionary of nodes with PageRank as value.

    Examples
    --------
    >>> from networkx.algorithms.link_analysis.pagerank_alg import _pagerank_numpy
    >>> G = nx.DiGraph(nx.path_graph(4))
    >>> pr = _pagerank_numpy(G, alpha=0.9)

    Notes
    -----
    The eigenvector calculation uses NumPy's interface to the LAPACK
    eigenvalue solvers. This will be the fastest and most accurate
    for small graphs.

    This implementation works with Multi(Di)Graphs. For multigraphs the
    weight between two nodes is set to be the sum of all edge weights
    between those nodes.

    See Also
    --------
    pagerank, google_matrix

    References
    ----------
    .. [1] A. Langville and C. Meyer,
       "A survey of eigenvector methods of web information retrieval."
       http://citeseer.ist.psu.edu/713792.html
    .. [2] Page, Lawrence; Brin, Sergey; Motwani, Rajeev and Winograd, Terry,
       The PageRank citation ranking: Bringing order to the Web. 1999
       http://dbpubs.stanford.edu:8090/pub/showDoc.Fulltext?lang=en&doc=1999-66&format=pdf
    """
    import numpy as np

    if len(G) == 0:
        return {}
    M = google_matrix(
        G, alpha, personalization=personalization, weight=weight, dangling=dangling
    )
    # use numpy LAPACK solver
    eigenvalues, eigenvectors = np.linalg.eig(M.T)
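    # M is row-stochastic, so the dominant eigenvalue of M.T is 1; the matching
    # eigenvector, rescaled to sum to 1, is the stationary PageRank distribution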

    ind = np.argmax(eigenvalues)
    # eigenvector of largest eigenvalue is at ind, normalized
    largest = np.array(eigenvectors[:, ind]).flatten().real
    norm = largest.sum()
    return dict(zip(G, map(float, largest / norm)))


def _pagerank_scipy(
    G,
    alpha=0.85,
    personalization=None,
    max_iter=100,
    tol=1.0e-6,
    nstart=None,
    weight="weight",
    dangling=None,
):
    """Returns the PageRank of the nodes in the graph.

    PageRank computes a ranking of the nodes in the graph G based on
    the structure of the incoming links. It was originally designed as
    an algorithm to rank web pages.

    Parameters
    ----------
    G : graph
      A NetworkX graph. Undirected graphs will be converted to a directed
      graph with two directed edges for each undirected edge.

    alpha : float, optional
      Damping parameter for PageRank, default=0.85.

    personalization : dict, optional
      The "personalization vector", given as a dictionary keyed by a subset
      of graph nodes with the personalization value for each of those nodes.
      At least one personalization value must be non-zero; nodes missing
      from the dictionary are assigned a personalization value of zero.
      If not specified, a uniform distribution over all nodes is used.

    max_iter : integer, optional
      Maximum number of iterations in power method eigenvalue solver.

    tol : float, optional
      Error tolerance used to check convergence in power method solver.
      The iteration stops once the total change per iteration (l1 norm)
      falls below ``len(G) * tol``.

    nstart : dictionary, optional
      Starting value of PageRank iteration for each node.

    weight : key, optional
      Edge data key to use as weight. If None, weights are set to 1.

    dangling : dict, optional
      The outedges to be assigned to any "dangling" nodes, i.e., nodes without
      any outedges. The dict key is the node the outedge points to and the dict
      value is the weight of that outedge. By default, dangling nodes are given
      outedges according to the personalization vector (uniform if not
      specified). This must be selected to result in an irreducible transition
      matrix (see notes under google_matrix). It is common for the dangling
      dict to be the same as the personalization dict.

    Returns
    -------
    pagerank : dictionary
      Dictionary of nodes with PageRank as value.

    Examples
    --------
    >>> from networkx.algorithms.link_analysis.pagerank_alg import _pagerank_scipy
    >>> G = nx.DiGraph(nx.path_graph(4))
    >>> pr = _pagerank_scipy(G, alpha=0.9)

    Notes
    -----
    The eigenvector calculation uses power iteration with a SciPy
    sparse matrix representation.

    This implementation works with Multi(Di)Graphs. For multigraphs the
    weight between two nodes is set to be the sum of all edge weights
    between those nodes.

    See Also
    --------
    pagerank

    Raises
    ------
    PowerIterationFailedConvergence
        If the algorithm fails to converge to the specified tolerance
        within the specified number of iterations of the power iteration
        method.

    References
    ----------
    .. [1] A. Langville and C. Meyer,
       "A survey of eigenvector methods of web information retrieval."
       http://citeseer.ist.psu.edu/713792.html
    .. [2] Page, Lawrence; Brin, Sergey; Motwani, Rajeev and Winograd, Terry,
       The PageRank citation ranking: Bringing order to the Web. 1999
       http://dbpubs.stanford.edu:8090/pub/showDoc.Fulltext?lang=en&doc=1999-66&format=pdf
    """
    import numpy as np
    import scipy as sp

    N = len(G)
    if N == 0:
        return {}

    nodelist = list(G)
    A = nx.to_scipy_sparse_array(G, nodelist=nodelist, weight=weight, dtype=float)
    S = A.sum(axis=1)
    S[S != 0] = 1.0 / S[S != 0]
    # TODO: csr_array
    Q = sp.sparse.csr_array(sp.sparse.spdiags(S.T, 0, *A.shape))
    A = Q @ A
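    # A is now the row-stochastic transition matrix; rows of dangling nodes
    # stay all-zero here and are handled through dangling_weights below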

    # initial vector
    if nstart is None:
        x = np.repeat(1.0 / N, N)
    else:
        x = np.array([nstart.get(n, 0) for n in nodelist], dtype=float)
        x /= x.sum()

    # Personalization vector
    if personalization is None:
        p = np.repeat(1.0 / N, N)
    else:
        p = np.array([personalization.get(n, 0) for n in nodelist], dtype=float)
        if p.sum() == 0:
            raise ZeroDivisionError
        p /= p.sum()
    # Dangling nodes
    if dangling is None:
        dangling_weights = p
    else:
        # Convert the dangling dictionary into an array in nodelist order
        dangling_weights = np.array([dangling.get(n, 0) for n in nodelist], dtype=float)
        dangling_weights /= dangling_weights.sum()
    is_dangling = np.where(S == 0)[0]

    # power iteration: make up to max_iter iterations
    for _ in range(max_iter):
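        # one full PageRank step: push rank along out-edges (x @ A), re-insert
        # the rank sitting on dangling nodes via dangling_weights, then add the
        # (1 - alpha) teleportation term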

        xlast = x
        x = alpha * (x @ A + sum(x[is_dangling]) * dangling_weights) + (1 - alpha) * p
        # check convergence, l1 norm
        err = np.absolute(x - xlast).sum()
        if err < N * tol:
            return dict(zip(nodelist, map(float, x)))
    raise nx.PowerIterationFailedConvergence(max_iter)