Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/networkx/algorithms/similarity.py: 7%
339 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-10-20 07:00 +0000
"""Functions measuring similarity using graph edit distance.

The graph edit distance is the number of edge/node changes needed
to make two graphs isomorphic.

The default algorithm/implementation is sub-optimal for some graphs.
The problem of finding the exact Graph Edit Distance (GED) is NP-hard
so it is often slow. If the simple interface `graph_edit_distance`
takes too long for your graph, try `optimize_graph_edit_distance`
and/or `optimize_edit_paths`.

At the same time, I encourage capable people to investigate
alternative GED algorithms, in order to improve the choices available.
"""
16import math
17import time
18import warnings
19from dataclasses import dataclass
20from itertools import product
22import networkx as nx
24__all__ = [
25 "graph_edit_distance",
26 "optimal_edit_paths",
27 "optimize_graph_edit_distance",
28 "optimize_edit_paths",
29 "simrank_similarity",
30 "panther_similarity",
31 "generate_random_paths",
32]
def debug_print(*args, **kwargs):
    """Development aid: forward all arguments to the built-in ``print``."""
    emit = print
    emit(*args, **kwargs)
@nx._dispatch(
    graphs={"G1": 0, "G2": 1}, preserve_edge_attrs=True, preserve_node_attrs=True
)
def graph_edit_distance(
    G1,
    G2,
    node_match=None,
    edge_match=None,
    node_subst_cost=None,
    node_del_cost=None,
    node_ins_cost=None,
    edge_subst_cost=None,
    edge_del_cost=None,
    edge_ins_cost=None,
    roots=None,
    upper_bound=None,
    timeout=None,
):
    """Returns GED (graph edit distance) between graphs G1 and G2.

    Graph edit distance is a graph similarity measure analogous to
    Levenshtein distance for strings.  It is defined as the minimum cost
    of an edit path (sequence of node and edge edit operations)
    transforming graph G1 to a graph isomorphic to G2.

    Parameters
    ----------
    G1, G2: graphs
        The two graphs G1 and G2 must be of the same type.

    node_match : callable
        Called like ``node_match(G1.nodes[n1], G2.nodes[n2])``, i.e. with
        the node attribute dictionaries as inputs; returns True if node n1
        in G1 and n2 in G2 should be considered equal during matching.
        Ignored if node_subst_cost is specified.  If neither node_match
        nor node_subst_cost are specified then node attributes are not
        considered.

    edge_match : callable
        Called like ``edge_match(G1[u1][v1], G2[u2][v2])``, i.e. with the
        edge attribute dictionaries as inputs; returns True if the edges
        should be considered equal during matching.  Ignored if
        edge_subst_cost is specified.  If neither edge_match nor
        edge_subst_cost are specified then edge attributes are not
        considered.

    node_subst_cost, node_del_cost, node_ins_cost : callable
        Functions that return positive numeric costs of node
        substitution, deletion, and insertion, respectively; called with
        the node attribute dictionaries as inputs.  node_subst_cost
        overrides node_match if specified.  Defaults: substitution cost 0
        (node attributes not considered), deletion cost 1, insertion
        cost 1.

    edge_subst_cost, edge_del_cost, edge_ins_cost : callable
        Functions that return positive numeric costs of edge
        substitution, deletion, and insertion, respectively; called with
        the edge attribute dictionaries as inputs.  edge_subst_cost
        overrides edge_match if specified.  Defaults: substitution cost 0
        (edge attributes not considered), deletion cost 1, insertion
        cost 1.

    roots : 2-tuple
        Tuple where the first element is a node in G1 and the second is a
        node in G2.  These nodes are forced to be matched in the
        comparison to allow comparison between rooted graphs.

    upper_bound : numeric
        Maximum edit distance to consider.  Return None if no edit
        distance under or equal to upper_bound exists.

    timeout : numeric
        Maximum number of seconds to execute.
        After timeout is met, the current best GED is returned.

    Examples
    --------
    >>> G1 = nx.cycle_graph(6)
    >>> G2 = nx.wheel_graph(7)
    >>> nx.graph_edit_distance(G1, G2)
    7.0

    >>> G1 = nx.star_graph(5)
    >>> G2 = nx.star_graph(5)
    >>> nx.graph_edit_distance(G1, G2, roots=(0, 0))
    0.0
    >>> nx.graph_edit_distance(G1, G2, roots=(1, 0))
    8.0

    See Also
    --------
    optimal_edit_paths, optimize_graph_edit_distance,

    is_isomorphic: test for graph edit distance of 0

    References
    ----------
    .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
       Martineau. An Exact Graph Edit Distance Algorithm for Solving
       Pattern Recognition Problems. 4th International Conference on
       Pattern Recognition Applications and Methods 2015, Jan 2015,
       Lisbon, Portugal. 2015,
       <10.5220/0005209202710278>. <hal-01168816>
       https://hal.archives-ouvertes.fr/hal-01168816

    """
    # With strictly_decreasing=True the generator yields ever-cheaper edit
    # paths; the cost of the last one yielded is the graph edit distance.
    best_cost = None
    approximations = optimize_edit_paths(
        G1,
        G2,
        node_match,
        edge_match,
        node_subst_cost,
        node_del_cost,
        node_ins_cost,
        edge_subst_cost,
        edge_del_cost,
        edge_ins_cost,
        upper_bound,
        True,
        roots,
        timeout,
    )
    for _vertex_path, _edge_path, cost in approximations:
        best_cost = cost
    return best_cost
@nx._dispatch(graphs={"G1": 0, "G2": 1})
def optimal_edit_paths(
    G1,
    G2,
    node_match=None,
    edge_match=None,
    node_subst_cost=None,
    node_del_cost=None,
    node_ins_cost=None,
    edge_subst_cost=None,
    edge_del_cost=None,
    edge_ins_cost=None,
    upper_bound=None,
):
    """Returns all minimum-cost edit paths transforming G1 to G2.

    A graph edit path is a sequence of node and edge edit operations
    transforming graph G1 to a graph isomorphic to G2.  Edit operations
    include substitutions, deletions, and insertions.

    Parameters
    ----------
    G1, G2: graphs
        The two graphs G1 and G2 must be of the same type.

    node_match : callable
        Called like ``node_match(G1.nodes[n1], G2.nodes[n2])``, i.e. with
        the node attribute dictionaries as inputs; returns True if node n1
        in G1 and n2 in G2 should be considered equal during matching.
        Ignored if node_subst_cost is specified.  If neither node_match
        nor node_subst_cost are specified then node attributes are not
        considered.

    edge_match : callable
        Called like ``edge_match(G1[u1][v1], G2[u2][v2])``, i.e. with the
        edge attribute dictionaries as inputs; returns True if the edges
        should be considered equal during matching.  Ignored if
        edge_subst_cost is specified.  If neither edge_match nor
        edge_subst_cost are specified then edge attributes are not
        considered.

    node_subst_cost, node_del_cost, node_ins_cost : callable
        Functions that return positive numeric costs of node
        substitution, deletion, and insertion, respectively; called with
        the node attribute dictionaries as inputs.  node_subst_cost
        overrides node_match if specified.  Defaults: substitution cost 0
        (node attributes not considered), deletion cost 1, insertion
        cost 1.

    edge_subst_cost, edge_del_cost, edge_ins_cost : callable
        Functions that return positive numeric costs of edge
        substitution, deletion, and insertion, respectively; called with
        the edge attribute dictionaries as inputs.  edge_subst_cost
        overrides edge_match if specified.  Defaults: substitution cost 0
        (edge attributes not considered), deletion cost 1, insertion
        cost 1.

    upper_bound : numeric
        Maximum edit distance to consider.

    Returns
    -------
    edit_paths : list of tuples (node_edit_path, edge_edit_path)
        node_edit_path : list of tuples (u, v)
        edge_edit_path : list of tuples ((u1, v1), (u2, v2))

    cost : numeric
        Optimal edit path cost (graph edit distance).

    Examples
    --------
    >>> G1 = nx.cycle_graph(4)
    >>> G2 = nx.wheel_graph(5)
    >>> paths, cost = nx.optimal_edit_paths(G1, G2)
    >>> len(paths)
    40
    >>> cost
    5.0

    See Also
    --------
    graph_edit_distance, optimize_edit_paths

    References
    ----------
    .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
       Martineau. An Exact Graph Edit Distance Algorithm for Solving
       Pattern Recognition Problems. 4th International Conference on
       Pattern Recognition Applications and Methods 2015, Jan 2015,
       Lisbon, Portugal. 2015,
       <10.5220/0005209202710278>. <hal-01168816>
       https://hal.archives-ouvertes.fr/hal-01168816

    """
    # With strictly_decreasing=False the generator yields every edit path
    # whose cost is <= the best cost seen so far, so paths of equal
    # minimum cost accumulate; a strictly cheaper path resets the list.
    best_paths = []
    best_cost = None
    for vertex_path, edge_path, cost in optimize_edit_paths(
        G1,
        G2,
        node_match,
        edge_match,
        node_subst_cost,
        node_del_cost,
        node_ins_cost,
        edge_subst_cost,
        edge_del_cost,
        edge_ins_cost,
        upper_bound,
        False,
    ):
        if best_cost is not None and cost < best_cost:
            best_paths = []
        best_paths.append((vertex_path, edge_path))
        best_cost = cost
    return best_paths, best_cost
@nx._dispatch(graphs={"G1": 0, "G2": 1})
def optimize_graph_edit_distance(
    G1,
    G2,
    node_match=None,
    edge_match=None,
    node_subst_cost=None,
    node_del_cost=None,
    node_ins_cost=None,
    edge_subst_cost=None,
    edge_del_cost=None,
    edge_ins_cost=None,
    upper_bound=None,
):
    """Returns consecutive approximations of GED (graph edit distance)
    between graphs G1 and G2.

    Graph edit distance is a graph similarity measure analogous to
    Levenshtein distance for strings.  It is defined as the minimum cost
    of an edit path (sequence of node and edge edit operations)
    transforming graph G1 to a graph isomorphic to G2.

    Parameters
    ----------
    G1, G2: graphs
        The two graphs G1 and G2 must be of the same type.

    node_match : callable
        Called like ``node_match(G1.nodes[n1], G2.nodes[n2])``, i.e. with
        the node attribute dictionaries as inputs; returns True if node n1
        in G1 and n2 in G2 should be considered equal during matching.
        Ignored if node_subst_cost is specified.  If neither node_match
        nor node_subst_cost are specified then node attributes are not
        considered.

    edge_match : callable
        Called like ``edge_match(G1[u1][v1], G2[u2][v2])``, i.e. with the
        edge attribute dictionaries as inputs; returns True if the edges
        should be considered equal during matching.  Ignored if
        edge_subst_cost is specified.  If neither edge_match nor
        edge_subst_cost are specified then edge attributes are not
        considered.

    node_subst_cost, node_del_cost, node_ins_cost : callable
        Functions that return positive numeric costs of node
        substitution, deletion, and insertion, respectively; called with
        the node attribute dictionaries as inputs.  node_subst_cost
        overrides node_match if specified.  Defaults: substitution cost 0
        (node attributes not considered), deletion cost 1, insertion
        cost 1.

    edge_subst_cost, edge_del_cost, edge_ins_cost : callable
        Functions that return positive numeric costs of edge
        substitution, deletion, and insertion, respectively; called with
        the edge attribute dictionaries as inputs.  edge_subst_cost
        overrides edge_match if specified.  Defaults: substitution cost 0
        (edge attributes not considered), deletion cost 1, insertion
        cost 1.

    upper_bound : numeric
        Maximum edit distance to consider.

    Returns
    -------
    Generator of consecutive approximations of graph edit distance.

    Examples
    --------
    >>> G1 = nx.cycle_graph(6)
    >>> G2 = nx.wheel_graph(7)
    >>> for v in nx.optimize_graph_edit_distance(G1, G2):
    ...     minv = v
    >>> minv
    7.0

    See Also
    --------
    graph_edit_distance, optimize_edit_paths

    References
    ----------
    .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
       Martineau. An Exact Graph Edit Distance Algorithm for Solving
       Pattern Recognition Problems. 4th International Conference on
       Pattern Recognition Applications and Methods 2015, Jan 2015,
       Lisbon, Portugal. 2015,
       <10.5220/0005209202710278>. <hal-01168816>
       https://hal.archives-ouvertes.fr/hal-01168816
    """
    # Each yielded cost is strictly lower than the previous one; the last
    # value produced is the exact graph edit distance.
    approximations = optimize_edit_paths(
        G1,
        G2,
        node_match,
        edge_match,
        node_subst_cost,
        node_del_cost,
        node_ins_cost,
        edge_subst_cost,
        edge_del_cost,
        edge_ins_cost,
        upper_bound,
        True,
    )
    for _vertex_path, _edge_path, cost in approximations:
        yield cost
527@nx._dispatch(
528 graphs={"G1": 0, "G2": 1}, preserve_edge_attrs=True, preserve_node_attrs=True
529)
530def optimize_edit_paths(
531 G1,
532 G2,
533 node_match=None,
534 edge_match=None,
535 node_subst_cost=None,
536 node_del_cost=None,
537 node_ins_cost=None,
538 edge_subst_cost=None,
539 edge_del_cost=None,
540 edge_ins_cost=None,
541 upper_bound=None,
542 strictly_decreasing=True,
543 roots=None,
544 timeout=None,
545):
546 """GED (graph edit distance) calculation: advanced interface.
548 Graph edit path is a sequence of node and edge edit operations
549 transforming graph G1 to graph isomorphic to G2. Edit operations
550 include substitutions, deletions, and insertions.
552 Graph edit distance is defined as minimum cost of edit path.
554 Parameters
555 ----------
556 G1, G2: graphs
557 The two graphs G1 and G2 must be of the same type.
559 node_match : callable
560 A function that returns True if node n1 in G1 and n2 in G2
561 should be considered equal during matching.
563 The function will be called like
565 node_match(G1.nodes[n1], G2.nodes[n2]).
567 That is, the function will receive the node attribute
568 dictionaries for n1 and n2 as inputs.
570 Ignored if node_subst_cost is specified. If neither
571 node_match nor node_subst_cost are specified then node
572 attributes are not considered.
574 edge_match : callable
575 A function that returns True if the edge attribute dictionaries
576 for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
577 be considered equal during matching.
579 The function will be called like
581 edge_match(G1[u1][v1], G2[u2][v2]).
583 That is, the function will receive the edge attribute
584 dictionaries of the edges under consideration.
586 Ignored if edge_subst_cost is specified. If neither
587 edge_match nor edge_subst_cost are specified then edge
588 attributes are not considered.
590 node_subst_cost, node_del_cost, node_ins_cost : callable
591 Functions that return the costs of node substitution, node
592 deletion, and node insertion, respectively.
594 The functions will be called like
596 node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
597 node_del_cost(G1.nodes[n1]),
598 node_ins_cost(G2.nodes[n2]).
600 That is, the functions will receive the node attribute
601 dictionaries as inputs. The functions are expected to return
602 positive numeric values.
604 Function node_subst_cost overrides node_match if specified.
605 If neither node_match nor node_subst_cost are specified then
606 default node substitution cost of 0 is used (node attributes
607 are not considered during matching).
609 If node_del_cost is not specified then default node deletion
610 cost of 1 is used. If node_ins_cost is not specified then
611 default node insertion cost of 1 is used.
613 edge_subst_cost, edge_del_cost, edge_ins_cost : callable
614 Functions that return the costs of edge substitution, edge
615 deletion, and edge insertion, respectively.
617 The functions will be called like
619 edge_subst_cost(G1[u1][v1], G2[u2][v2]),
620 edge_del_cost(G1[u1][v1]),
621 edge_ins_cost(G2[u2][v2]).
623 That is, the functions will receive the edge attribute
624 dictionaries as inputs. The functions are expected to return
625 positive numeric values.
627 Function edge_subst_cost overrides edge_match if specified.
628 If neither edge_match nor edge_subst_cost are specified then
629 default edge substitution cost of 0 is used (edge attributes
630 are not considered during matching).
632 If edge_del_cost is not specified then default edge deletion
633 cost of 1 is used. If edge_ins_cost is not specified then
634 default edge insertion cost of 1 is used.
636 upper_bound : numeric
637 Maximum edit distance to consider.
639 strictly_decreasing : bool
640 If True, return consecutive approximations of strictly
641 decreasing cost. Otherwise, return all edit paths of cost
642 less than or equal to the previous minimum cost.
644 roots : 2-tuple
645 Tuple where first element is a node in G1 and the second
646 is a node in G2.
647 These nodes are forced to be matched in the comparison to
648 allow comparison between rooted graphs.
650 timeout : numeric
651 Maximum number of seconds to execute.
652 After timeout is met, the current best GED is returned.
654 Returns
655 -------
656 Generator of tuples (node_edit_path, edge_edit_path, cost)
657 node_edit_path : list of tuples (u, v)
658 edge_edit_path : list of tuples ((u1, v1), (u2, v2))
659 cost : numeric
661 See Also
662 --------
663 graph_edit_distance, optimize_graph_edit_distance, optimal_edit_paths
665 References
666 ----------
667 .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
668 Martineau. An Exact Graph Edit Distance Algorithm for Solving
669 Pattern Recognition Problems. 4th International Conference on
670 Pattern Recognition Applications and Methods 2015, Jan 2015,
671 Lisbon, Portugal. 2015,
672 <10.5220/0005209202710278>. <hal-01168816>
673 https://hal.archives-ouvertes.fr/hal-01168816
675 """
676 # TODO: support DiGraph
678 import numpy as np
679 import scipy as sp
681 @dataclass
682 class CostMatrix:
683 C: ...
684 lsa_row_ind: ...
685 lsa_col_ind: ...
686 ls: ...
688 def make_CostMatrix(C, m, n):
689 # assert(C.shape == (m + n, m + n))
690 lsa_row_ind, lsa_col_ind = sp.optimize.linear_sum_assignment(C)
692 # Fixup dummy assignments:
693 # each substitution i<->j should have dummy assignment m+j<->n+i
694 # NOTE: fast reduce of Cv relies on it
695 # assert len(lsa_row_ind) == len(lsa_col_ind)
696 indexes = zip(range(len(lsa_row_ind)), lsa_row_ind, lsa_col_ind)
697 subst_ind = [k for k, i, j in indexes if i < m and j < n]
698 indexes = zip(range(len(lsa_row_ind)), lsa_row_ind, lsa_col_ind)
699 dummy_ind = [k for k, i, j in indexes if i >= m and j >= n]
700 # assert len(subst_ind) == len(dummy_ind)
701 lsa_row_ind[dummy_ind] = lsa_col_ind[subst_ind] + m
702 lsa_col_ind[dummy_ind] = lsa_row_ind[subst_ind] + n
704 return CostMatrix(
705 C, lsa_row_ind, lsa_col_ind, C[lsa_row_ind, lsa_col_ind].sum()
706 )
708 def extract_C(C, i, j, m, n):
709 # assert(C.shape == (m + n, m + n))
710 row_ind = [k in i or k - m in j for k in range(m + n)]
711 col_ind = [k in j or k - n in i for k in range(m + n)]
712 return C[row_ind, :][:, col_ind]
714 def reduce_C(C, i, j, m, n):
715 # assert(C.shape == (m + n, m + n))
716 row_ind = [k not in i and k - m not in j for k in range(m + n)]
717 col_ind = [k not in j and k - n not in i for k in range(m + n)]
718 return C[row_ind, :][:, col_ind]
720 def reduce_ind(ind, i):
721 # assert set(ind) == set(range(len(ind)))
722 rind = ind[[k not in i for k in ind]]
723 for k in set(i):
724 rind[rind >= k] -= 1
725 return rind
727 def match_edges(u, v, pending_g, pending_h, Ce, matched_uv=None):
728 """
729 Parameters:
730 u, v: matched vertices, u=None or v=None for
731 deletion/insertion
732 pending_g, pending_h: lists of edges not yet mapped
733 Ce: CostMatrix of pending edge mappings
734 matched_uv: partial vertex edit path
735 list of tuples (u, v) of previously matched vertex
736 mappings u<->v, u=None or v=None for
737 deletion/insertion
739 Returns:
740 list of (i, j): indices of edge mappings g<->h
741 localCe: local CostMatrix of edge mappings
742 (basically submatrix of Ce at cross of rows i, cols j)
743 """
744 M = len(pending_g)
745 N = len(pending_h)
746 # assert Ce.C.shape == (M + N, M + N)
748 # only attempt to match edges after one node match has been made
749 # this will stop self-edges on the first node being automatically deleted
750 # even when a substitution is the better option
751 if matched_uv is None or len(matched_uv) == 0:
752 g_ind = []
753 h_ind = []
754 else:
755 g_ind = [
756 i
757 for i in range(M)
758 if pending_g[i][:2] == (u, u)
759 or any(
760 pending_g[i][:2] in ((p, u), (u, p), (p, p)) for p, q in matched_uv
761 )
762 ]
763 h_ind = [
764 j
765 for j in range(N)
766 if pending_h[j][:2] == (v, v)
767 or any(
768 pending_h[j][:2] in ((q, v), (v, q), (q, q)) for p, q in matched_uv
769 )
770 ]
772 m = len(g_ind)
773 n = len(h_ind)
775 if m or n:
776 C = extract_C(Ce.C, g_ind, h_ind, M, N)
777 # assert C.shape == (m + n, m + n)
779 # Forbid structurally invalid matches
780 # NOTE: inf remembered from Ce construction
781 for k, i in enumerate(g_ind):
782 g = pending_g[i][:2]
783 for l, j in enumerate(h_ind):
784 h = pending_h[j][:2]
785 if nx.is_directed(G1) or nx.is_directed(G2):
786 if any(
787 g == (p, u) and h == (q, v) or g == (u, p) and h == (v, q)
788 for p, q in matched_uv
789 ):
790 continue
791 else:
792 if any(
793 g in ((p, u), (u, p)) and h in ((q, v), (v, q))
794 for p, q in matched_uv
795 ):
796 continue
797 if g == (u, u) or any(g == (p, p) for p, q in matched_uv):
798 continue
799 if h == (v, v) or any(h == (q, q) for p, q in matched_uv):
800 continue
801 C[k, l] = inf
803 localCe = make_CostMatrix(C, m, n)
804 ij = [
805 (
806 g_ind[k] if k < m else M + h_ind[l],
807 h_ind[l] if l < n else N + g_ind[k],
808 )
809 for k, l in zip(localCe.lsa_row_ind, localCe.lsa_col_ind)
810 if k < m or l < n
811 ]
813 else:
814 ij = []
815 localCe = CostMatrix(np.empty((0, 0)), [], [], 0)
817 return ij, localCe
819 def reduce_Ce(Ce, ij, m, n):
820 if len(ij):
821 i, j = zip(*ij)
822 m_i = m - sum(1 for t in i if t < m)
823 n_j = n - sum(1 for t in j if t < n)
824 return make_CostMatrix(reduce_C(Ce.C, i, j, m, n), m_i, n_j)
825 return Ce
827 def get_edit_ops(
828 matched_uv, pending_u, pending_v, Cv, pending_g, pending_h, Ce, matched_cost
829 ):
830 """
831 Parameters:
832 matched_uv: partial vertex edit path
833 list of tuples (u, v) of vertex mappings u<->v,
834 u=None or v=None for deletion/insertion
835 pending_u, pending_v: lists of vertices not yet mapped
836 Cv: CostMatrix of pending vertex mappings
837 pending_g, pending_h: lists of edges not yet mapped
838 Ce: CostMatrix of pending edge mappings
839 matched_cost: cost of partial edit path
841 Returns:
842 sequence of
843 (i, j): indices of vertex mapping u<->v
844 Cv_ij: reduced CostMatrix of pending vertex mappings
845 (basically Cv with row i, col j removed)
846 list of (x, y): indices of edge mappings g<->h
847 Ce_xy: reduced CostMatrix of pending edge mappings
848 (basically Ce with rows x, cols y removed)
849 cost: total cost of edit operation
850 NOTE: most promising ops first
851 """
852 m = len(pending_u)
853 n = len(pending_v)
854 # assert Cv.C.shape == (m + n, m + n)
856 # 1) a vertex mapping from optimal linear sum assignment
857 i, j = min(
858 (k, l) for k, l in zip(Cv.lsa_row_ind, Cv.lsa_col_ind) if k < m or l < n
859 )
860 xy, localCe = match_edges(
861 pending_u[i] if i < m else None,
862 pending_v[j] if j < n else None,
863 pending_g,
864 pending_h,
865 Ce,
866 matched_uv,
867 )
868 Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h))
869 # assert Ce.ls <= localCe.ls + Ce_xy.ls
870 if prune(matched_cost + Cv.ls + localCe.ls + Ce_xy.ls):
871 pass
872 else:
873 # get reduced Cv efficiently
874 Cv_ij = CostMatrix(
875 reduce_C(Cv.C, (i,), (j,), m, n),
876 reduce_ind(Cv.lsa_row_ind, (i, m + j)),
877 reduce_ind(Cv.lsa_col_ind, (j, n + i)),
878 Cv.ls - Cv.C[i, j],
879 )
880 yield (i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls
882 # 2) other candidates, sorted by lower-bound cost estimate
883 other = []
884 fixed_i, fixed_j = i, j
885 if m <= n:
886 candidates = (
887 (t, fixed_j)
888 for t in range(m + n)
889 if t != fixed_i and (t < m or t == m + fixed_j)
890 )
891 else:
892 candidates = (
893 (fixed_i, t)
894 for t in range(m + n)
895 if t != fixed_j and (t < n or t == n + fixed_i)
896 )
897 for i, j in candidates:
898 if prune(matched_cost + Cv.C[i, j] + Ce.ls):
899 continue
900 Cv_ij = make_CostMatrix(
901 reduce_C(Cv.C, (i,), (j,), m, n),
902 m - 1 if i < m else m,
903 n - 1 if j < n else n,
904 )
905 # assert Cv.ls <= Cv.C[i, j] + Cv_ij.ls
906 if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + Ce.ls):
907 continue
908 xy, localCe = match_edges(
909 pending_u[i] if i < m else None,
910 pending_v[j] if j < n else None,
911 pending_g,
912 pending_h,
913 Ce,
914 matched_uv,
915 )
916 if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + localCe.ls):
917 continue
918 Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h))
919 # assert Ce.ls <= localCe.ls + Ce_xy.ls
920 if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + localCe.ls + Ce_xy.ls):
921 continue
922 other.append(((i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls))
924 yield from sorted(other, key=lambda t: t[4] + t[1].ls + t[3].ls)
926 def get_edit_paths(
927 matched_uv,
928 pending_u,
929 pending_v,
930 Cv,
931 matched_gh,
932 pending_g,
933 pending_h,
934 Ce,
935 matched_cost,
936 ):
937 """
938 Parameters:
939 matched_uv: partial vertex edit path
940 list of tuples (u, v) of vertex mappings u<->v,
941 u=None or v=None for deletion/insertion
942 pending_u, pending_v: lists of vertices not yet mapped
943 Cv: CostMatrix of pending vertex mappings
944 matched_gh: partial edge edit path
945 list of tuples (g, h) of edge mappings g<->h,
946 g=None or h=None for deletion/insertion
947 pending_g, pending_h: lists of edges not yet mapped
948 Ce: CostMatrix of pending edge mappings
949 matched_cost: cost of partial edit path
951 Returns:
952 sequence of (vertex_path, edge_path, cost)
953 vertex_path: complete vertex edit path
954 list of tuples (u, v) of vertex mappings u<->v,
955 u=None or v=None for deletion/insertion
956 edge_path: complete edge edit path
957 list of tuples (g, h) of edge mappings g<->h,
958 g=None or h=None for deletion/insertion
959 cost: total cost of edit path
960 NOTE: path costs are non-increasing
961 """
962 # debug_print('matched-uv:', matched_uv)
963 # debug_print('matched-gh:', matched_gh)
964 # debug_print('matched-cost:', matched_cost)
965 # debug_print('pending-u:', pending_u)
966 # debug_print('pending-v:', pending_v)
967 # debug_print(Cv.C)
968 # assert list(sorted(G1.nodes)) == list(sorted(list(u for u, v in matched_uv if u is not None) + pending_u))
969 # assert list(sorted(G2.nodes)) == list(sorted(list(v for u, v in matched_uv if v is not None) + pending_v))
970 # debug_print('pending-g:', pending_g)
971 # debug_print('pending-h:', pending_h)
972 # debug_print(Ce.C)
973 # assert list(sorted(G1.edges)) == list(sorted(list(g for g, h in matched_gh if g is not None) + pending_g))
974 # assert list(sorted(G2.edges)) == list(sorted(list(h for g, h in matched_gh if h is not None) + pending_h))
975 # debug_print()
977 if prune(matched_cost + Cv.ls + Ce.ls):
978 return
980 if not max(len(pending_u), len(pending_v)):
981 # assert not len(pending_g)
982 # assert not len(pending_h)
983 # path completed!
984 # assert matched_cost <= maxcost_value
985 nonlocal maxcost_value
986 maxcost_value = min(maxcost_value, matched_cost)
987 yield matched_uv, matched_gh, matched_cost
989 else:
990 edit_ops = get_edit_ops(
991 matched_uv,
992 pending_u,
993 pending_v,
994 Cv,
995 pending_g,
996 pending_h,
997 Ce,
998 matched_cost,
999 )
1000 for ij, Cv_ij, xy, Ce_xy, edit_cost in edit_ops:
1001 i, j = ij
1002 # assert Cv.C[i, j] + sum(Ce.C[t] for t in xy) == edit_cost
1003 if prune(matched_cost + edit_cost + Cv_ij.ls + Ce_xy.ls):
1004 continue
1006 # dive deeper
1007 u = pending_u.pop(i) if i < len(pending_u) else None
1008 v = pending_v.pop(j) if j < len(pending_v) else None
1009 matched_uv.append((u, v))
1010 for x, y in xy:
1011 len_g = len(pending_g)
1012 len_h = len(pending_h)
1013 matched_gh.append(
1014 (
1015 pending_g[x] if x < len_g else None,
1016 pending_h[y] if y < len_h else None,
1017 )
1018 )
1019 sortedx = sorted(x for x, y in xy)
1020 sortedy = sorted(y for x, y in xy)
1021 G = [
1022 (pending_g.pop(x) if x < len(pending_g) else None)
1023 for x in reversed(sortedx)
1024 ]
1025 H = [
1026 (pending_h.pop(y) if y < len(pending_h) else None)
1027 for y in reversed(sortedy)
1028 ]
1030 yield from get_edit_paths(
1031 matched_uv,
1032 pending_u,
1033 pending_v,
1034 Cv_ij,
1035 matched_gh,
1036 pending_g,
1037 pending_h,
1038 Ce_xy,
1039 matched_cost + edit_cost,
1040 )
1042 # backtrack
1043 if u is not None:
1044 pending_u.insert(i, u)
1045 if v is not None:
1046 pending_v.insert(j, v)
1047 matched_uv.pop()
1048 for x, g in zip(sortedx, reversed(G)):
1049 if g is not None:
1050 pending_g.insert(x, g)
1051 for y, h in zip(sortedy, reversed(H)):
1052 if h is not None:
1053 pending_h.insert(y, h)
1054 for _ in xy:
1055 matched_gh.pop()
1057 # Initialization
1059 pending_u = list(G1.nodes)
1060 pending_v = list(G2.nodes)
1062 initial_cost = 0
1063 if roots:
1064 root_u, root_v = roots
1065 if root_u not in pending_u or root_v not in pending_v:
1066 raise nx.NodeNotFound("Root node not in graph.")
1068 # remove roots from pending
1069 pending_u.remove(root_u)
1070 pending_v.remove(root_v)
1072 # cost matrix of vertex mappings
1073 m = len(pending_u)
1074 n = len(pending_v)
1075 C = np.zeros((m + n, m + n))
1076 if node_subst_cost:
1077 C[0:m, 0:n] = np.array(
1078 [
1079 node_subst_cost(G1.nodes[u], G2.nodes[v])
1080 for u in pending_u
1081 for v in pending_v
1082 ]
1083 ).reshape(m, n)
1084 if roots:
1085 initial_cost = node_subst_cost(G1.nodes[root_u], G2.nodes[root_v])
1086 elif node_match:
1087 C[0:m, 0:n] = np.array(
1088 [
1089 1 - int(node_match(G1.nodes[u], G2.nodes[v]))
1090 for u in pending_u
1091 for v in pending_v
1092 ]
1093 ).reshape(m, n)
1094 if roots:
1095 initial_cost = 1 - node_match(G1.nodes[root_u], G2.nodes[root_v])
1096 else:
1097 # all zeroes
1098 pass
1099 # assert not min(m, n) or C[0:m, 0:n].min() >= 0
1100 if node_del_cost:
1101 del_costs = [node_del_cost(G1.nodes[u]) for u in pending_u]
1102 else:
1103 del_costs = [1] * len(pending_u)
1104 # assert not m or min(del_costs) >= 0
1105 if node_ins_cost:
1106 ins_costs = [node_ins_cost(G2.nodes[v]) for v in pending_v]
1107 else:
1108 ins_costs = [1] * len(pending_v)
1109 # assert not n or min(ins_costs) >= 0
1110 inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1
1111 C[0:m, n : n + m] = np.array(
1112 [del_costs[i] if i == j else inf for i in range(m) for j in range(m)]
1113 ).reshape(m, m)
1114 C[m : m + n, 0:n] = np.array(
1115 [ins_costs[i] if i == j else inf for i in range(n) for j in range(n)]
1116 ).reshape(n, n)
1117 Cv = make_CostMatrix(C, m, n)
1118 # debug_print(f"Cv: {m} x {n}")
1119 # debug_print(Cv.C)
1121 pending_g = list(G1.edges)
1122 pending_h = list(G2.edges)
1124 # cost matrix of edge mappings
1125 m = len(pending_g)
1126 n = len(pending_h)
1127 C = np.zeros((m + n, m + n))
1128 if edge_subst_cost:
1129 C[0:m, 0:n] = np.array(
1130 [
1131 edge_subst_cost(G1.edges[g], G2.edges[h])
1132 for g in pending_g
1133 for h in pending_h
1134 ]
1135 ).reshape(m, n)
1136 elif edge_match:
1137 C[0:m, 0:n] = np.array(
1138 [
1139 1 - int(edge_match(G1.edges[g], G2.edges[h]))
1140 for g in pending_g
1141 for h in pending_h
1142 ]
1143 ).reshape(m, n)
1144 else:
1145 # all zeroes
1146 pass
1147 # assert not min(m, n) or C[0:m, 0:n].min() >= 0
1148 if edge_del_cost:
1149 del_costs = [edge_del_cost(G1.edges[g]) for g in pending_g]
1150 else:
1151 del_costs = [1] * len(pending_g)
1152 # assert not m or min(del_costs) >= 0
1153 if edge_ins_cost:
1154 ins_costs = [edge_ins_cost(G2.edges[h]) for h in pending_h]
1155 else:
1156 ins_costs = [1] * len(pending_h)
1157 # assert not n or min(ins_costs) >= 0
1158 inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1
1159 C[0:m, n : n + m] = np.array(
1160 [del_costs[i] if i == j else inf for i in range(m) for j in range(m)]
1161 ).reshape(m, m)
1162 C[m : m + n, 0:n] = np.array(
1163 [ins_costs[i] if i == j else inf for i in range(n) for j in range(n)]
1164 ).reshape(n, n)
1165 Ce = make_CostMatrix(C, m, n)
1166 # debug_print(f'Ce: {m} x {n}')
1167 # debug_print(Ce.C)
1168 # debug_print()
1170 maxcost_value = Cv.C.sum() + Ce.C.sum() + 1
1172 if timeout is not None:
1173 if timeout <= 0:
1174 raise nx.NetworkXError("Timeout value must be greater than 0")
1175 start = time.perf_counter()
1177 def prune(cost):
1178 if timeout is not None:
1179 if time.perf_counter() - start > timeout:
1180 return True
1181 if upper_bound is not None:
1182 if cost > upper_bound:
1183 return True
1184 if cost > maxcost_value:
1185 return True
1186 if strictly_decreasing and cost >= maxcost_value:
1187 return True
1188 return False
1190 # Now go!
1192 done_uv = [] if roots is None else [roots]
1194 for vertex_path, edge_path, cost in get_edit_paths(
1195 done_uv, pending_u, pending_v, Cv, [], pending_g, pending_h, Ce, initial_cost
1196 ):
1197 # assert sorted(G1.nodes) == sorted(u for u, v in vertex_path if u is not None)
1198 # assert sorted(G2.nodes) == sorted(v for u, v in vertex_path if v is not None)
1199 # assert sorted(G1.edges) == sorted(g for g, h in edge_path if g is not None)
1200 # assert sorted(G2.edges) == sorted(h for g, h in edge_path if h is not None)
1201 # print(vertex_path, edge_path, cost, file = sys.stderr)
1202 # assert cost == maxcost_value
1203 yield list(vertex_path), list(edge_path), cost
@nx._dispatch
def simrank_similarity(
    G,
    source=None,
    target=None,
    importance_factor=0.9,
    max_iterations=1000,
    tolerance=1e-4,
):
    """Returns the SimRank similarity of nodes in the graph ``G``.

    SimRank is a similarity metric that says "two objects are considered
    to be similar if they are referenced by similar objects." [1]_.

    The pseudo-code definition from the paper is::

        def simrank(G, u, v):
            in_neighbors_u = G.predecessors(u)
            in_neighbors_v = G.predecessors(v)
            scale = C / (len(in_neighbors_u) * len(in_neighbors_v))
            return scale * sum(
                simrank(G, w, x) for w, x in product(in_neighbors_u, in_neighbors_v)
            )

    where ``G`` is the graph, ``u`` is the source, ``v`` is the target,
    and ``C`` is a float decay or importance factor between 0 and 1.

    The SimRank algorithm for determining node similarity is defined in
    [2]_.

    Parameters
    ----------
    G : NetworkX graph
        A NetworkX graph

    source : node
        If this is specified, the returned dictionary maps each node
        ``v`` in the graph to the similarity between ``source`` and
        ``v``.

    target : node
        If both ``source`` and ``target`` are specified, the similarity
        value between ``source`` and ``target`` is returned. If
        ``target`` is specified but ``source`` is not, this argument is
        ignored.

    importance_factor : float
        The relative importance of indirect neighbors with respect to
        direct neighbors.

    max_iterations : integer
        Maximum number of iterations.

    tolerance : float
        Error tolerance used to check convergence. When an iteration of
        the algorithm finds that no similarity value changes more than
        this amount, the algorithm halts.

    Returns
    -------
    similarity : dictionary or float
        If ``source`` and ``target`` are both ``None``, this returns a
        dictionary of dictionaries, where keys are node pairs and value
        are similarity of the pair of nodes.

        If ``source`` is not ``None`` but ``target`` is, this returns a
        dictionary mapping node to the similarity of ``source`` and that
        node.

        If neither ``source`` nor ``target`` is ``None``, this returns
        the similarity value for the given pair of nodes.

    Examples
    --------
    >>> G = nx.cycle_graph(2)
    >>> nx.simrank_similarity(G)
    {0: {0: 1.0, 1: 0.0}, 1: {0: 0.0, 1: 1.0}}
    >>> nx.simrank_similarity(G, source=0)
    {0: 1.0, 1: 0.0}
    >>> nx.simrank_similarity(G, source=0, target=0)
    1.0

    The result of this function can be converted to a numpy array
    representing the SimRank matrix by using the node order of the
    graph to determine which row and column represent each node.
    Other ordering of nodes is also possible.

    >>> import numpy as np
    >>> sim = nx.simrank_similarity(G)
    >>> np.array([[sim[u][v] for v in G] for u in G])
    array([[1., 0.],
           [0., 1.]])
    >>> sim_1d = nx.simrank_similarity(G, source=0)
    >>> np.array([sim_1d[v] for v in G])
    array([1., 0.])

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/SimRank
    .. [2] G. Jeh and J. Widom.
           "SimRank: a measure of structural-context similarity",
           In KDD'02: Proceedings of the Eighth ACM SIGKDD
           International Conference on Knowledge Discovery and Data Mining,
           pp. 538--543. ACM Press, 2002.
    """
    import numpy as np

    # Translate the (possibly non-integer) node labels into positional
    # indices expected by the numpy backend.
    nodes = list(G)
    src_idx = None if source is None else nodes.index(source)
    tgt_idx = None if target is None else nodes.index(target)

    result = _simrank_similarity_numpy(
        G, src_idx, tgt_idx, importance_factor, max_iterations, tolerance
    )

    # A scalar result (both source and target given) is returned as-is;
    # array results are re-keyed by the graph's nodes.
    if not isinstance(result, np.ndarray):
        return result
    if result.ndim == 1:
        return dict(zip(G, result))
    return {u: dict(zip(G, row)) for u, row in zip(G, result)}
def _simrank_similarity_python(
    G,
    source=None,
    target=None,
    importance_factor=0.9,
    max_iterations=1000,
    tolerance=1e-4,
):
    """Returns the SimRank similarity of nodes in the graph ``G``.

    This pure Python version is provided for pedagogical purposes.

    Examples
    --------
    >>> G = nx.cycle_graph(2)
    >>> nx.similarity._simrank_similarity_python(G)
    {0: {0: 1, 1: 0.0}, 1: {0: 0.0, 1: 1}}
    >>> nx.similarity._simrank_similarity_python(G, source=0)
    {0: 1, 1: 0.0}
    >>> nx.similarity._simrank_similarity_python(G, source=0, target=0)
    1
    """
    # build up our similarity adjacency dictionary output
    newsim = {u: {v: 1 if u == v else 0 for v in G} for u in G}

    # These functions compute the update to the similarity value of the nodes
    # `u` and `v` with respect to the previous similarity values.
    def avg_sim(s):
        return sum(newsim[w][x] for (w, x) in s) / len(s) if s else 0.0

    # SimRank is defined over in-neighbors; for undirected graphs the
    # adjacency mapping plays that role.
    Gadj = G.pred if G.is_directed() else G.adj

    def sim(u, v):
        return importance_factor * avg_sim(list(product(Gadj[u], Gadj[v])))

    for _ in range(max_iterations):
        oldsim = newsim
        # ``u is not v`` is safe: both comprehensions iterate the same node
        # objects of ``G``, so equal nodes are identical objects.
        newsim = {u: {v: sim(u, v) if u is not v else 1 for v in G} for u in G}
        is_close = all(
            all(
                abs(newsim[u][v] - old) <= tolerance * (1 + abs(old))
                for v, old in nbrs.items()
            )
            for u, nbrs in oldsim.items()
        )
        if is_close:
            break
    else:
        # Bug fix: the previous ``if its + 1 == max_iterations`` check raised
        # even when the final allowed iteration actually converged (``break``
        # taken on the last pass), and crashed with a NameError when
        # ``max_iterations <= 0``.  The ``for``/``else`` clause runs only when
        # the loop exhausts without breaking, i.e. without convergence.
        raise nx.ExceededMaxIterations(
            f"simrank did not converge after {max_iterations} iterations."
        )

    if source is not None and target is not None:
        return newsim[source][target]
    if source is not None:
        return newsim[source]
    return newsim
def _simrank_similarity_numpy(
    G,
    source=None,
    target=None,
    importance_factor=0.9,
    max_iterations=1000,
    tolerance=1e-4,
):
    """Calculate SimRank of nodes in ``G`` using matrices with ``numpy``.

    The SimRank algorithm for determining node similarity is defined in
    [1]_.

    Parameters
    ----------
    G : NetworkX graph
        A NetworkX graph

    source : node
        If this is specified, the returned dictionary maps each node
        ``v`` in the graph to the similarity between ``source`` and
        ``v``.

    target : node
        If both ``source`` and ``target`` are specified, the similarity
        value between ``source`` and ``target`` is returned. If
        ``target`` is specified but ``source`` is not, this argument is
        ignored.

    importance_factor : float
        The relative importance of indirect neighbors with respect to
        direct neighbors.

    max_iterations : integer
        Maximum number of iterations.

    tolerance : float
        Error tolerance used to check convergence. When an iteration of
        the algorithm finds that no similarity value changes more than
        this amount, the algorithm halts.

    Returns
    -------
    similarity : numpy array or float
        If ``source`` and ``target`` are both ``None``, this returns a
        2D array containing SimRank scores of the nodes.

        If ``source`` is not ``None`` but ``target`` is, this returns an
        1D array containing SimRank scores of ``source`` and that
        node.

        If neither ``source`` nor ``target`` is ``None``, this returns
        the similarity value for the given pair of nodes.

    Examples
    --------
    >>> G = nx.cycle_graph(2)
    >>> nx.similarity._simrank_similarity_numpy(G)
    array([[1., 0.],
           [0., 1.]])
    >>> nx.similarity._simrank_similarity_numpy(G, source=0)
    array([1., 0.])
    >>> nx.similarity._simrank_similarity_numpy(G, source=0, target=0)
    1.0

    References
    ----------
    .. [1] G. Jeh and J. Widom.
           "SimRank: a measure of structural-context similarity",
           In KDD'02: Proceedings of the Eighth ACM SIGKDD
           International Conference on Knowledge Discovery and Data Mining,
           pp. 538--543. ACM Press, 2002.
    """
    # This algorithm follows roughly
    #
    #   S = max{C * (A.T * S * A), I}
    #
    # where C is the importance factor, A is the column normalized
    # adjacency matrix, and I is the identity matrix.
    import numpy as np

    adjacency_matrix = nx.to_numpy_array(G)

    # column-normalize the ``adjacency_matrix``
    s = np.array(adjacency_matrix.sum(axis=0))
    s[s == 0] = 1  # avoid division by zero for columns of isolated nodes
    adjacency_matrix /= s  # adjacency_matrix.sum(axis=0)

    newsim = np.eye(len(G), dtype=np.float64)
    for _ in range(max_iterations):
        prevsim = newsim.copy()
        newsim = importance_factor * ((adjacency_matrix.T @ prevsim) @ adjacency_matrix)
        # Self-similarity is 1 by definition.
        np.fill_diagonal(newsim, 1.0)

        if np.allclose(prevsim, newsim, atol=tolerance):
            break
    else:
        # Bug fix: the previous ``if its + 1 == max_iterations`` check raised
        # even when the final allowed iteration converged, and crashed with a
        # NameError when ``max_iterations <= 0``.  The ``for``/``else`` clause
        # runs only when the loop exhausts without converging.
        raise nx.ExceededMaxIterations(
            f"simrank did not converge after {max_iterations} iterations."
        )

    if source is not None and target is not None:
        return newsim[source, target]
    if source is not None:
        return newsim[source]
    return newsim
@nx._dispatch(edge_attrs="weight")
def panther_similarity(
    G, source, k=5, path_length=5, c=0.5, delta=0.1, eps=None, weight="weight"
):
    r"""Returns the Panther similarity of nodes in the graph `G` to node ``v``.

    Panther is a similarity metric that says "two objects are considered
    to be similar if they frequently appear on the same paths." [1]_.

    Parameters
    ----------
    G : NetworkX graph
        A NetworkX graph
    source : node
        Source node for which to find the top `k` similar other nodes
    k : int (default = 5)
        The number of most similar nodes to return
    path_length : int (default = 5)
        How long the randomly generated paths should be (``T`` in [1]_)
    c : float (default = 0.5)
        A universal positive constant used to scale the number
        of sample random paths to generate.
    delta : float (default = 0.1)
        The probability that the similarity $S$ is not an epsilon-approximation to (R, phi),
        where $R$ is the number of random paths and $\phi$ is the probability
        that an element sampled from a set $A \subseteq D$, where $D$ is the domain.
    eps : float or None (default = None)
        The error bound. Per [1]_, a good value is ``sqrt(1/|E|)``. Therefore,
        if no value is provided, the recommended computed value will be used.
    weight : string or None, optional (default="weight")
        The name of an edge attribute that holds the numerical value
        used as a weight. If None then each edge has weight 1.

    Returns
    -------
    similarity : dictionary
        Dictionary of nodes to similarity scores (as floats). Note:
        the self-similarity (i.e., ``v``) will not be included in
        the returned dictionary, so it may contain up to ``k - 1`` entries.

    Examples
    --------
    >>> G = nx.star_graph(10)
    >>> sim = nx.panther_similarity(G, 0)

    References
    ----------
    .. [1] Zhang, J., Tang, J., Ma, C., Tong, H., Jing, Y., & Li, J.
           Panther: Fast top-k similarity search on large networks.
           In Proceedings of the ACM SIGKDD International Conference
           on Knowledge Discovery and Data Mining (Vol. 2015-August, pp. 1445–1454).
           Association for Computing Machinery. https://doi.org/10.1145/2783258.2783267.
    """
    import numpy as np

    num_nodes = G.number_of_nodes()
    # Clamp ``k`` so the argpartition below stays within bounds.
    if num_nodes < k:
        warnings.warn(
            f"Number of nodes is {num_nodes}, but requested k is {k}. "
            "Setting k to number of nodes."
        )
        k = num_nodes
    # According to [1], they empirically determined
    # a good value for ``eps`` to be sqrt( 1 / |E| )
    # NOTE(review): a graph with no edges raises ZeroDivisionError here —
    # consider validating ``G.number_of_edges() > 0`` with a clearer error.
    if eps is None:
        eps = np.sqrt(1.0 / G.number_of_edges())

    # Map node labels to positional indices (and back via ``node_map``).
    inv_node_map = {name: index for index, name in enumerate(G.nodes)}
    node_map = np.array(G)

    # Calculate the sample size ``R`` for how many paths
    # to randomly generate
    t_choose_2 = math.comb(path_length, 2)
    sample_size = int((c / eps**2) * (np.log2(t_choose_2) + 1 + np.log(1 / delta)))
    index_map = {}
    # ``generate_random_paths`` is lazy; consuming it fully is what
    # populates ``index_map`` (node -> set of path indices).
    _ = list(
        generate_random_paths(
            G, sample_size, path_length=path_length, index_map=index_map, weight=weight
        )
    )
    # Per-node similarity scores, indexed positionally.
    S = np.zeros(num_nodes)

    inv_sample_size = 1 / sample_size

    # NOTE(review): raises KeyError if ``source`` never appeared on any
    # sampled path (e.g., an isolated source) — verify against callers.
    source_paths = set(index_map[source])

    # Calculate the path similarities
    # between ``source`` (v) and ``node`` (v_j)
    # using our inverted index mapping of
    # vertices to paths
    for node, paths in index_map.items():
        # Only consider paths where both
        # ``node`` and ``source`` are present
        common_paths = source_paths.intersection(paths)
        S[inv_node_map[node]] = len(common_paths) * inv_sample_size

    # Retrieve top ``k`` similar
    # Note: the below performed anywhere from 4-10x faster
    # (depending on input sizes) vs the equivalent ``np.argsort(S)[::-1]``
    top_k_unsorted = np.argpartition(S, -k)[-k:]
    top_k_sorted = top_k_unsorted[np.argsort(S[top_k_unsorted])][::-1]

    # Add back the similarity scores
    top_k_sorted_names = (node_map[n] for n in top_k_sorted)
    top_k_with_val = dict(zip(top_k_sorted_names, S[top_k_sorted]))

    # Remove the self-similarity
    top_k_with_val.pop(source, None)
    return top_k_with_val
@nx._dispatch(edge_attrs="weight")
def generate_random_paths(
    G, sample_size, path_length=5, index_map=None, weight="weight"
):
    """Randomly generate `sample_size` paths of length `path_length`.

    Parameters
    ----------
    G : NetworkX graph
        A NetworkX graph
    sample_size : integer
        The number of paths to generate. This is ``R`` in [1]_.
    path_length : integer (default = 5)
        The maximum size of the path to randomly generate.
        This is ``T`` in [1]_. According to the paper, ``T >= 5`` is
        recommended.
    index_map : dictionary, optional
        If provided, this will be populated with the inverted
        index of nodes mapped to the set of generated random path
        indices within ``paths``. Note that since this function is a
        generator, the index is only filled in as paths are consumed.
    weight : string or None, optional (default="weight")
        The name of an edge attribute that holds the numerical value
        used as a weight. If None then each edge has weight 1.

    Returns
    -------
    paths : generator of lists
        Generator of `sample_size` paths each with length `path_length`.

    Examples
    --------
    Note that the return value is a generator of paths, so it must be
    consumed (here with ``list``) to materialize them:

    >>> G = nx.star_graph(3)
    >>> random_paths = list(nx.generate_random_paths(G, 2))

    By passing a dictionary into `index_map`, it will build an
    inverted index mapping of nodes to the paths in which that node is present:

    >>> G = nx.star_graph(3)
    >>> index_map = {}
    >>> random_paths = list(nx.generate_random_paths(G, 3, index_map=index_map))
    >>> paths_containing_node_0 = [random_paths[path_idx] for path_idx in index_map.get(0, [])]

    References
    ----------
    .. [1] Zhang, J., Tang, J., Ma, C., Tong, H., Jing, Y., & Li, J.
           Panther: Fast top-k similarity search on large networks.
           In Proceedings of the ACM SIGKDD International Conference
           on Knowledge Discovery and Data Mining (Vol. 2015-August, pp. 1445–1454).
           Association for Computing Machinery. https://doi.org/10.1145/2783258.2783267.
    """
    import numpy as np

    # Calculate transition probabilities between
    # every pair of vertices according to Eq. (3)
    # NOTE(review): rows for isolated nodes sum to 0, so ``np.reciprocal``
    # yields inf here and ``np.random.choice`` below would reject the
    # resulting probability vector — confirm callers pass graphs without
    # isolated nodes.
    adj_mat = nx.to_numpy_array(G, weight=weight)
    inv_row_sums = np.reciprocal(adj_mat.sum(axis=1)).reshape(-1, 1)
    transition_probabilities = adj_mat * inv_row_sums

    # Positional index -> node label lookup.
    node_map = np.array(G)
    num_nodes = G.number_of_nodes()

    # Uses the legacy global numpy RNG; reproducibility requires seeding
    # ``np.random`` externally.
    for path_index in range(sample_size):
        # Sample current vertex v = v_i uniformly at random
        node_index = np.random.randint(0, high=num_nodes)
        node = node_map[node_index]

        # Add v into p_r and add p_r into the path set
        # of v, i.e., P_v
        path = [node]

        # Build the inverted index (P_v) of vertices to paths
        if index_map is not None:
            if node in index_map:
                index_map[node].add(path_index)
            else:
                index_map[node] = {path_index}

        starting_index = node_index
        for _ in range(path_length):
            # Randomly sample a neighbor (v_j) according
            # to transition probabilities from ``node`` (v) to its neighbors
            neighbor_index = np.random.choice(
                num_nodes, p=transition_probabilities[starting_index]
            )

            # Set current vertex (v = v_j)
            starting_index = neighbor_index

            # Add v into p_r
            neighbor_node = node_map[neighbor_index]
            path.append(neighbor_node)

            # Add p_r into P_v
            if index_map is not None:
                if neighbor_node in index_map:
                    index_map[neighbor_node].add(path_index)
                else:
                    index_map[neighbor_node] = {path_index}

        yield path