// DiffImpl.java
package graphql.schema.diffing;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
import com.google.common.collect.Multisets;
import graphql.Internal;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
import static graphql.Assert.assertFalse;
import static graphql.Assert.assertTrue;
import static graphql.schema.diffing.EditorialCostForMapping.baseEditorialCostForMapping;
import static graphql.schema.diffing.EditorialCostForMapping.editorialCostForMapping;
/**
* This is an algorithm calculating the optimal edit to change the source graph into the target graph.
* <p>
* It is based on the following two papers (both papers are from the same authors. The first one is newer, but the older one is more detailed in some aspects)
* <p>
* Accelerating Graph Similarity Search via Efficient GED Computation (https://lijunchang.github.io/pdf/2022-ged-tkde.pdf)
* <p>
* Efficient Graph Edit Distance Computation and Verification via Anchor-aware Lower Bound Estimation (https://arxiv.org/abs/1709.06810)
* <p>
* The algorithm is a modified version of "AStar-BMao".
* It is adapted to directed graphs as a GraphQL schema is most naturally represented as directed graph (vs the undirected graphs used in the papers).
*/
@Internal
public class DiffImpl {
// calculates which source vertex may be mapped to which target vertex (and parent restrictions)
private final PossibleMappingsCalculator possibleMappingsCalculator;
// the graph we are editing away from
private final SchemaGraph completeSourceGraph;
// the graph we are editing towards
private final SchemaGraph completeTargetGraph;
// precomputed set of possible source -> target vertex mappings
private final PossibleMappingsCalculator.PossibleMappings possibleMappings;
// cooperative cancellation/timeout check invoked inside the hot loops
private final SchemaDiffingRunningCheck runningCheck;
/**
 * One entry of the A* priority queue: a partial mapping of the first {@code level}
 * source vertices plus the data needed to lazily generate its sibling mappings.
 */
private static class MappingEntry {
// siblings of this entry (same parent mapping, same level), produced by the Hungarian
// algorithm and polled lazily one at a time.
// NOTE(review): the search appears single threaded; LinkedBlockingQueue seems used only
// for its API, a plain Deque would do - confirm before changing
public LinkedBlockingQueue<MappingEntry> mappingEntriesSiblings = new LinkedBlockingQueue<>();
// Hungarian algorithm result: assignments[i] is the index into availableTargetVertices
// assigned to the i-th still-unmapped source vertex
public int[] assignments;
/**
 * These are the available vertices, relative to the parent mapping.
 * Meaning the last mapped element is NOT contained in it.
 */
public List<Vertex> availableTargetVertices;
// the partial mapping this entry represents
Mapping partialMapping;
int level; // = partialMapping.size
// lower bound on the total edit cost of any full mapping extending partialMapping
double lowerBoundCost;
public MappingEntry(Mapping partialMapping, int level, double lowerBoundCost) {
this.partialMapping = partialMapping;
this.level = level;
this.lowerBoundCost = lowerBoundCost;
}
}
/**
* An optimal edit from one graph to another.
* The mapping maps all vertices from source to target, but
* not all mappings represent an actual change. This is why there is a separate list
* of the actual changes.
*/
public static class OptimalEdit {
private final SchemaGraph completeSourceGraph;
private final SchemaGraph completeTargetGraph;
// full mapping from all source to all target vertices; null until a first full mapping is found
public Mapping mapping;
// graph edit distance of the mapping; Integer.MAX_VALUE acts as the "nothing found yet" upper bound
public int ged = Integer.MAX_VALUE;
/**
 * Creates an empty result; {@code ged} stays at {@code Integer.MAX_VALUE} until a mapping is recorded.
 */
public OptimalEdit(
SchemaGraph completeSourceGraph,
SchemaGraph completeTargetGraph) {
this.completeSourceGraph = completeSourceGraph;
this.completeTargetGraph = completeTargetGraph;
}
/**
 * Creates a result with a known full mapping and its graph edit distance.
 */
public OptimalEdit(
SchemaGraph completeSourceGraph,
SchemaGraph completeTargetGraph,
Mapping mapping,
int ged) {
this.completeSourceGraph = completeSourceGraph;
this.completeTargetGraph = completeTargetGraph;
this.mapping = mapping;
this.ged = ged;
}
/**
 * Derives the concrete list of edit operations implied by the mapping.
 * The recomputed editorial cost must equal the ged found by the search.
 */
public List<EditOperation> getListOfEditOperations() {
ArrayList<EditOperation> listOfEditOperations = new ArrayList<>();
// baseEditorialCostForMapping fills listOfEditOperations as a side effect while recomputing the cost
assertTrue(baseEditorialCostForMapping(mapping, completeSourceGraph, completeTargetGraph, listOfEditOperations) == ged);
return listOfEditOperations;
}
}
/**
 * Creates a new diff computation between the given source and target graph.
 *
 * @param possibleMappingsCalculator calculates restrictions on which vertices may be mapped to each other
 * @param completeSourceGraph        the graph to edit away from
 * @param completeTargetGraph        the graph to edit towards
 * @param possibleMappings           the precomputed possible source-to-target vertex mappings
 * @param runningCheck               cooperative cancellation check invoked inside the hot loops
 * @throws NullPointerException if any argument is null
 */
public DiffImpl(PossibleMappingsCalculator possibleMappingsCalculator, SchemaGraph completeSourceGraph, SchemaGraph completeTargetGraph, PossibleMappingsCalculator.PossibleMappings possibleMappings, SchemaDiffingRunningCheck runningCheck) {
    // fail fast on missing dependencies instead of NPE-ing deep inside the search loop
    this.possibleMappingsCalculator = Objects.requireNonNull(possibleMappingsCalculator, "possibleMappingsCalculator");
    this.completeSourceGraph = Objects.requireNonNull(completeSourceGraph, "completeSourceGraph");
    this.completeTargetGraph = Objects.requireNonNull(completeTargetGraph, "completeTargetGraph");
    this.possibleMappings = Objects.requireNonNull(possibleMappings, "possibleMappings");
    this.runningCheck = Objects.requireNonNull(runningCheck, "runningCheck");
}
/**
 * Runs the A*-based search for an optimal edit, starting from the given fixed partial mapping.
 * <p>
 * The priority queue orders partial mappings by their lower bound cost; an entry whose lower
 * bound is not better than the best full mapping found so far can be pruned, and once the head
 * of the queue is not better, the search is complete.
 *
 * @param startMapping       a fixed partial mapping that every considered mapping must extend
 * @param allSources         all source vertices; the first {@code startMapping.size()} are already mapped
 * @param allTargets         all target vertices
 * @param algoIterationCount incremented once per main loop iteration (diagnostics)
 * @return the optimal edit found; {@code ged} stays {@code Integer.MAX_VALUE} if no full mapping exists
 * @throws InterruptedException if the thread is interrupted while retrieving a sibling mapping
 */
OptimalEdit diffImpl(Mapping startMapping, List<Vertex> allSources, List<Vertex> allTargets, AtomicInteger algoIterationCount) throws InterruptedException {
    int graphSize = allSources.size();
    // editorial cost of the fixed start mapping; every extension pays at least this much
    int fixedEditorialCost = baseEditorialCostForMapping(startMapping, completeSourceGraph, completeTargetGraph);
    int level = startMapping.size();
    // target vertices not used by the start mapping are still available for mapping
    List<Vertex> allNonFixedTargets = new ArrayList<>(allTargets);
    startMapping.forEachTarget(allNonFixedTargets::remove);
    MappingEntry firstMappingEntry = new MappingEntry(startMapping, level, fixedEditorialCost);
    firstMappingEntry.availableTargetVertices = allNonFixedTargets;
    OptimalEdit optimalEdit = new OptimalEdit(completeSourceGraph, completeTargetGraph);
    PriorityQueue<MappingEntry> queue = new PriorityQueue<>((mappingEntry1, mappingEntry2) -> {
        int compareResult = Double.compare(mappingEntry1.lowerBoundCost, mappingEntry2.lowerBoundCost);
        // we prefer higher levels for equal lower bound costs
        if (compareResult == 0) {
            return Integer.compare(mappingEntry2.level, mappingEntry1.level);
        } else {
            return compareResult;
        }
    });
    queue.add(firstMappingEntry);
    while (!queue.isEmpty()) {
        MappingEntry mappingEntry = queue.poll();
        algoIterationCount.incrementAndGet();
        if (mappingEntry.lowerBoundCost >= optimalEdit.ged) {
            // once the lowest lowerBoundCost is not lower than the optimal edit, we are done
            break;
        }
        // lazily materialize the next sibling of this entry (siblings are generated on demand)
        if (mappingEntry.level > 0 && !mappingEntry.mappingEntriesSiblings.isEmpty()) {
            addSiblingToQueue(
                    fixedEditorialCost,
                    mappingEntry.level,
                    queue,
                    optimalEdit,
                    allSources,
                    allTargets,
                    mappingEntry);
        }
        // extend the partial mapping by one more source vertex unless it is already complete
        if (mappingEntry.level < graphSize) {
            addChildToQueue(
                    fixedEditorialCost,
                    mappingEntry,
                    queue,
                    optimalEdit,
                    allSources,
                    allTargets
            );
        }
        runningCheck.check();
    }
    return optimalEdit;
}
// this calculates all children for the provided parentEntry, but only the first is directly added to the queue;
// the rest are stored in the new entry's siblings queue and materialized lazily via addSiblingToQueue
private void addChildToQueue(int fixedEditorialCost,
MappingEntry parentEntry,
PriorityQueue<MappingEntry> queue,
OptimalEdit optimalEdit,
List<Vertex> allSources,
List<Vertex> allTargets
) {
Mapping parentPartialMapping = parentEntry.partialMapping;
int parentLevel = parentEntry.level;
int level = parentLevel + 1;
assertTrue(parentLevel == parentPartialMapping.size());
// the available target vertices are the parent queue entry ones
// minus the additional mapped element in parentPartialMapping
ArrayList<Vertex> availableTargetVertices = new ArrayList<>(parentEntry.availableTargetVertices);
availableTargetVertices.remove(parentPartialMapping.getTarget(parentLevel - 1));
assertTrue(availableTargetVertices.size() + parentPartialMapping.size() == allTargets.size());
// the next source vertex to be mapped
Vertex v_i = allSources.get(parentLevel);
// the cost matrix is for the non mapped vertices
int costMatrixSize = allSources.size() - parentLevel;
// costMatrix gets modified by the hungarian algorithm ... therefore we create two of them
double[][] costMatrixForHungarianAlgo = new double[costMatrixSize][costMatrixSize];
double[][] costMatrix = new double[costMatrixSize][costMatrixSize];
// caches the insert/delete cost per vertex; it does not depend on the concrete counterpart
Map<Vertex, Double> isolatedVerticesCache = new LinkedHashMap<>();
Map<Vertex, Vertex> nonFixedParentRestrictions = possibleMappingsCalculator.getNonFixedParentRestrictions(completeSourceGraph, completeTargetGraph, parentPartialMapping);
// row (i - parentLevel) holds the lower bound cost of mapping source v onto each available target u
for (int i = parentLevel; i < allSources.size(); i++) {
Vertex v = allSources.get(i);
int j = 0;
for (Vertex u : availableTargetVertices) {
double cost = calcLowerBoundMappingCost(v, u, parentPartialMapping, isolatedVerticesCache, nonFixedParentRestrictions);
costMatrixForHungarianAlgo[i - parentLevel][j] = cost;
costMatrix[i - parentLevel][j] = cost;
j++;
}
runningCheck.check();
}
// optimal (minimal cost) assignment of the remaining source vertices to the available targets
HungarianAlgorithm hungarianAlgorithm = new HungarianAlgorithm(costMatrixForHungarianAlgo);
int[] assignments = hungarianAlgorithm.execute();
int editorialCostForMapping = editorialCostForMapping(fixedEditorialCost, parentPartialMapping, completeSourceGraph, completeTargetGraph);
double costMatrixSum = getCostMatrixSum(costMatrix, assignments);
// cost so far plus the optimal-assignment lower bound = lower bound for all extensions of this mapping
double lowerBoundForPartialMapping = editorialCostForMapping + costMatrixSum;
Mapping newMapping = parentPartialMapping.extendMapping(v_i, availableTargetVertices.get(assignments[0]));
// Integer.MAX_VALUE marks a forbidden mapping: if even the optimal assignment contains one
// and no full mapping was ever found, the possible mappings must be inconsistent
if (costMatrixSum >= Integer.MAX_VALUE && optimalEdit.mapping == null) {
throw new RuntimeException("bug: could not find any allowed mapping");
}
// prune: no extension of this mapping can beat the best edit found so far
if (lowerBoundForPartialMapping >= optimalEdit.ged) {
return;
}
MappingEntry newMappingEntry = new MappingEntry(newMapping, level, lowerBoundForPartialMapping);
LinkedBlockingQueue<MappingEntry> siblings = new LinkedBlockingQueue<>();
newMappingEntry.mappingEntriesSiblings = siblings;
newMappingEntry.assignments = assignments;
newMappingEntry.availableTargetVertices = availableTargetVertices;
queue.add(newMappingEntry);
// the assignment also induces a full mapping: use it to (possibly) improve the upper bound
// BEFORE generating the siblings, so they are pruned against the freshest optimum
expandMappingAndUpdateOptimalMapping(fixedEditorialCost,
level,
optimalEdit,
allSources,
parentPartialMapping.copy(),
assignments,
availableTargetVertices,
lowerBoundForPartialMapping);
calculateRestOfChildren(
availableTargetVertices,
hungarianAlgorithm,
costMatrix,
editorialCostForMapping,
parentPartialMapping,
v_i,
optimalEdit.ged,
level,
siblings
);
}
/**
 * Records a strictly better full mapping as the new optimal edit.
 */
private void updateOptimalEdit(OptimalEdit optimalEdit, int newGed, Mapping mapping) {
// callers must only report improvements
assertTrue(newGed < optimalEdit.ged);
optimalEdit.ged = newGed;
optimalEdit.mapping = mapping;
}
// generate all children mappings and save in MappingEntry.sibling
// NOTE(review): the early `break`s assume HungarianAlgorithm.nextChild() yields assignments
// in non-decreasing cost order - confirm against its implementation
private void calculateRestOfChildren(List<Vertex> availableTargetVertices,
HungarianAlgorithm hungarianAlgorithm,
double[][] costMatrixCopy,
double editorialCostForMapping,
Mapping partialMapping,
Vertex v_i,
int upperBound,
int level,
LinkedBlockingQueue<MappingEntry> siblings
) {
// starting from 1 as we already generated the first one
for (int child = 1; child < availableTargetVertices.size(); child++) {
int[] assignments = hungarianAlgorithm.nextChild();
// Integer.MAX_VALUE in the first row means this child would map v_i onto a forbidden target
if (hungarianAlgorithm.costMatrix[0][assignments[0]] == Integer.MAX_VALUE) {
break;
}
double costMatrixSumSibling = getCostMatrixSum(costMatrixCopy, assignments);
double lowerBoundForPartialMappingSibling = editorialCostForMapping + costMatrixSumSibling;
Mapping newMappingSibling = partialMapping.extendMapping(v_i, availableTargetVertices.get(assignments[0]));
// prune this and all following siblings
if (lowerBoundForPartialMappingSibling >= upperBound) {
break;
}
MappingEntry sibling = new MappingEntry(newMappingSibling, level, lowerBoundForPartialMappingSibling);
sibling.mappingEntriesSiblings = siblings;
sibling.assignments = assignments;
sibling.availableTargetVertices = availableTargetVertices;
siblings.add(sibling);
runningCheck.check();
}
}
// this retrieves the next sibling from MappingEntry.sibling and adds it to the queue if the lowerBound is less than the current upperBound
private void addSiblingToQueue(
int fixedEditorialCost,
int level,
PriorityQueue<MappingEntry> queue,
OptimalEdit optimalEdit,
List<Vertex> allSources,
List<Vertex> allTargets,
MappingEntry mappingEntry) throws InterruptedException {
assertFalse(mappingEntry.mappingEntriesSiblings.isEmpty());
// take() cannot block here because the queue was just asserted non-empty
// NOTE(review): assumes no concurrent consumer of the siblings queue - confirm
MappingEntry sibling = mappingEntry.mappingEntriesSiblings.take();
if (sibling.lowerBoundCost < optimalEdit.ged) {
queue.add(sibling);
// we need to start here from the parent mapping, this is why we remove the last element
Mapping toExpand = sibling.partialMapping.copyMappingWithLastElementRemoved();
// the sibling's assignment also induces a full mapping; use it to try to improve the upper bound
expandMappingAndUpdateOptimalMapping(fixedEditorialCost,
level,
optimalEdit,
allSources,
toExpand,
sibling.assignments,
sibling.availableTargetVertices,
sibling.lowerBoundCost);
}
}
/**
 * Completes the given partial mapping by applying the Hungarian assignment to every
 * remaining source vertex and, if the resulting full mapping is cheaper than the best
 * edit found so far, records it as the new optimum.
 */
private void expandMappingAndUpdateOptimalMapping(int fixedEditorialCost,
                                                  int level,
                                                  OptimalEdit optimalEdit,
                                                  List<Vertex> allSources,
                                                  Mapping toExpand,
                                                  int[] assignments,
                                                  List<Vertex> availableTargetVertices,
                                                  double lowerBoundCost) {
    // map each remaining source vertex to the target chosen by the assignment
    int sourceIndex = level - 1;
    for (int targetIndex : assignments) {
        toExpand.add(allSources.get(sourceIndex), availableTargetVertices.get(targetIndex));
        sourceIndex++;
    }
    assertTrue(toExpand.size() == this.completeSourceGraph.size());
    int costForFullMapping = editorialCostForMapping(fixedEditorialCost, toExpand, completeSourceGraph, completeTargetGraph);
    // a valid lower bound can never exceed the true cost of the completed mapping
    assertTrue(lowerBoundCost <= costForFullMapping);
    if (costForFullMapping < optimalEdit.ged) {
        updateOptimalEdit(optimalEdit, costForFullMapping, toExpand);
    }
}
/**
 * Sums the cost matrix entries selected by the given assignment:
 * row i contributes the value of its assigned column assignments[i].
 */
private double getCostMatrixSum(double[][] costMatrix, int[] assignments) {
    double total = 0;
    int row = 0;
    for (int assignedColumn : assignments) {
        total += costMatrix[row][assignedColumn];
        row++;
    }
    return total;
}
/*
 * Note: a partial mapping induces a subgraph; the editorial cost is calculated only
 * with respect to this subgraph.
 */
/**
* lower bound mapping cost between for v -> u in respect to a partial mapping.
* It basically tells the minimal costs we can expect for all mappings that come from extending
* the partial mapping with v -> u.
* <p>
* This is basically the formula (5) from page 6 of https://lijunchang.github.io/pdf/2022-ged-tkde.pdf.
* <p>
*
* The main difference is that the formula works with undirected graphs, but we have a directed graph,
* hence there is no 1/2 factor and for comparing the labels of anchored vertices to v/u we need to
* take both directions into account.
* <p>
*
* The other optimization is that a schema graph will never have a lot of adjacent edges compared to
* the overall vertex count, therefore the algorithm for the anchored vertices cost iterates
* over the adjacent edges of v/u instead of over all the mapped vertices.
* <p>
*
* Additionally, there is a shortcut for isolated vertices, representing deletion/insertion which is also cached.
* <p>
* Some naming: an anchored vertex is a vertex that is mapped via the partial mapping.
* An inner edge is an edge between two vertices that are both not anchored (mapped).
* The vertices v and u are by definition not mapped.
*/
private double calcLowerBoundMappingCost(Vertex v,
Vertex u,
Mapping partialMapping,
Map<Vertex, Double> isolatedVerticesCache,
Map<Vertex, Vertex> nonFixedParentRestrictions) {
// if v is restricted to a specific parent, u must be the single child of exactly that parent
if (nonFixedParentRestrictions.containsKey(v) || partialMapping.hasParentRestriction(v)) {
if (!u.isIsolated()) { // Always allow mapping to isolated nodes
Vertex uParentRestriction = nonFixedParentRestrictions.get(v);
if (uParentRestriction == null) {
uParentRestriction = partialMapping.getParentRestriction(v);
}
Collection<Edge> parentEdges = completeTargetGraph.getAdjacentEdgesInverseNonCopy(u);
if (parentEdges.size() != 1) {
return Integer.MAX_VALUE;
}
Vertex uParent = parentEdges.iterator().next().getFrom();
if (uParent != uParentRestriction) {
return Integer.MAX_VALUE;
}
}
}
// Integer.MAX_VALUE means: this mapping is forbidden
if (!possibleMappings.mappingPossible(v, u)) {
return Integer.MAX_VALUE;
}
// mapping onto an isolated target vertex is the deletion shortcut (result cached per v)
if (u.isOfType(SchemaGraph.ISOLATED)) {
if (isolatedVerticesCache.containsKey(v)) {
return isolatedVerticesCache.get(v);
}
double result = calcLowerBoundMappingCostForIsolated(v, partialMapping, true);
isolatedVerticesCache.put(v, result);
return result;
}
// mapping an isolated source vertex is the insertion shortcut (result cached per u)
if (v.isOfType(SchemaGraph.ISOLATED)) {
if (isolatedVerticesCache.containsKey(u)) {
return isolatedVerticesCache.get(u);
}
double result = calcLowerBoundMappingCostForIsolated(u, partialMapping, false);
isolatedVerticesCache.put(u, result);
return result;
}
// cost 1 if v itself must be changed to become u (different type or properties)
boolean equalNodes = v.getType().equals(u.getType()) && v.getProperties().equals(u.getProperties());
// cost contribution of edges between v/u and anchored (already mapped) vertices
int anchoredVerticesCost = 0;
// labels of inner edges (edges between two unmapped vertices) adjacent to v resp. u
Multiset<String> multisetInnerEdgeLabelsV = HashMultiset.create();
Multiset<String> multisetInnerEdgeLabelsU = HashMultiset.create();
Collection<Edge> adjacentEdgesV = completeSourceGraph.getAdjacentEdgesNonCopy(v);
Collection<Edge> adjacentEdgesU = completeTargetGraph.getAdjacentEdgesNonCopy(u);
Collection<Edge> adjacentEdgesInverseV = completeSourceGraph.getAdjacentEdgesInverseNonCopy(v);
Collection<Edge> adjacentEdgesInverseU = completeTargetGraph.getAdjacentEdgesInverseNonCopy(u);
// target edges already matched against an edge of v; the loops over u's edges skip them
Set<Edge> matchedTargetEdges = new LinkedHashSet<>();
Set<Edge> matchedTargetEdgesInverse = new LinkedHashSet<>();
// outgoing edges of v: either inner edges or edges into the anchored subgraph
for (Edge edgeV : adjacentEdgesV) {
Vertex targetTo = partialMapping.getTarget(edgeV.getTo());
if (targetTo == null) {
// meaning it is an inner edge(not part of the subgraph induced by the partial mapping)
multisetInnerEdgeLabelsV.add(edgeV.getLabel());
continue;
}
/* question is if the edge from v is mapped onto an edge from u
(also edge are not mapped directly, but the vertices are)
and if the adjacent edge is mapped onto an adjacent edge,
we need to check the labels of the edges
*/
Edge matchedTargetEdge = completeTargetGraph.getEdge(u, targetTo);
if (matchedTargetEdge != null) {
matchedTargetEdges.add(matchedTargetEdge);
if (!Objects.equals(edgeV.getLabel(), matchedTargetEdge.getLabel())) {
// the edge exists on both sides but with different labels => one edge change
anchoredVerticesCost++;
}
} else {
// no matching adjacent edge from u found means there is no
// edge from edgeV.getTo() to mapped(edgeV.getTo())
// and we need to increase the costs
anchoredVerticesCost++;
}
}
// incoming edges of v from anchored vertices (directed graph: both directions matter)
for (Edge edgeV : adjacentEdgesInverseV) {
Vertex targetFrom = partialMapping.getTarget(edgeV.getFrom());
// we are only interested in edges from anchored vertices
if (targetFrom == null) {
continue;
}
Edge matachedTargetEdge = completeTargetGraph.getEdge(targetFrom, u);
if (matachedTargetEdge != null) {
matchedTargetEdgesInverse.add(matachedTargetEdge);
if (!Objects.equals(edgeV.getLabel(), matachedTargetEdge.getLabel())) {
anchoredVerticesCost++;
}
} else {
anchoredVerticesCost++;
}
}
// outgoing edges of u into the anchored subgraph that had no counterpart at v
for (Edge edgeU : adjacentEdgesU) {
// test if this is an inner edge (meaning it not part of the subgraph induced by the partial mapping)
// we know that u is not part of the mapped vertices, therefore we only need to test the "to" vertex
if (!partialMapping.containsTarget(edgeU.getTo())) {
multisetInnerEdgeLabelsU.add(edgeU.getLabel());
continue;
}
if (matchedTargetEdges.contains(edgeU)) {
continue;
}
anchoredVerticesCost++;
}
// incoming edges of u from anchored vertices that had no counterpart at v
for (Edge edgeU : adjacentEdgesInverseU) {
// we are only interested in edges from anchored vertices
if (!partialMapping.containsTarget(edgeU.getFrom()) || matchedTargetEdgesInverse.contains(edgeU)) {
continue;
}
anchoredVerticesCost++;
}
// multiset edit distance for inner edge labels: the intersection can be kept as-is,
// the remainder of the larger side must be changed or inserted/deleted
Multiset<String> intersectionInnerEdgeLabels = Multisets.intersection(multisetInnerEdgeLabelsV, multisetInnerEdgeLabelsU);
int multiSetEditDistanceForInnerEdges = Math.max(multisetInnerEdgeLabelsV.size(), multisetInnerEdgeLabelsU.size()) - intersectionInnerEdgeLabels.size();
// vertex change + inner edge lower bound + anchored edge costs
int result = (equalNodes ? 0 : 1) + multiSetEditDistanceForInnerEdges + anchoredVerticesCost;
return result;
}
/**
 * Simplified lower bound calculation when one side of the mapping is an isolated vertex,
 * i.e. the mapping represents a pure deletion (isolated target) or insertion (isolated source).
 * Cost: 1 for the vertex itself, plus every outgoing edge, plus every incoming edge
 * whose other endpoint is already anchored by the partial mapping.
 */
private double calcLowerBoundMappingCostForIsolated(Vertex vertex,
                                                    Mapping partialMapping,
                                                    boolean sourceOrTarget
) {
    // the non-isolated vertex lives in the source graph for deletions, in the target graph for insertions
    SchemaGraph schemaGraph = sourceOrTarget ? completeSourceGraph : completeTargetGraph;
    // every outgoing adjacent edge is inserted/deleted alongside the vertex
    Collection<Edge> outgoingEdges = schemaGraph.getAdjacentEdgesNonCopy(vertex);
    // for incoming edges only the anchored (already mapped) ones are counted
    int anchoredIncomingCount = 0;
    for (Edge incomingEdge : schemaGraph.getAdjacentEdgesInverseNonCopy(vertex)) {
        if (partialMapping.contains(incomingEdge.getFrom(), sourceOrTarget)) {
            anchoredIncomingCount++;
        }
    }
    return 1 + outgoingEdges.size() + anchoredIncomingCount;
}
}