GraphComparisons.java
/*******************************************************************************
* Copyright (c) 2021 Eclipse RDF4J contributors.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Distribution License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
package org.eclipse.rdf4j.model.util;
import static org.eclipse.rdf4j.model.util.Values.bnode;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import org.eclipse.rdf4j.model.BNode;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Literal;
import org.eclipse.rdf4j.model.Model;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.impl.DynamicModelFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.MultimapBuilder;
import com.google.common.collect.Multimaps;
import com.google.common.hash.HashCode;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
/**
* Functions for canonicalizing RDF models and computing isomorphism.
*
* @author Jeen Broekstra
* @implNote The algorithms used in this class are based on the iso-canonical algorithm as described in: Hogan, A.
* (2017). Canonical forms for isomorphic and equivalent RDF graphs: algorithms for leaning and labelling
* blank nodes. ACM Transactions on the Web (TWEB), 11(4), 1-62.
*/
class GraphComparisons {
private static final Logger logger = LoggerFactory.getLogger(GraphComparisons.class);
private static final HashFunction hashFunction = Hashing.sha256();
private static final HashCode initialHashCode = hashFunction.hashString("", StandardCharsets.UTF_8);
private static final HashCode outgoing = hashFunction.hashString("+", StandardCharsets.UTF_8);
private static final HashCode incoming = hashFunction.hashString("-", StandardCharsets.UTF_8);
private static final HashCode distinguisher = hashFunction.hashString("@", StandardCharsets.UTF_8);
/**
* Compares two RDF models, and returns <var>true</var> if they consist of isomorphic graphs and the isomorphic
* graph identifiers map 1:1 to each other. RDF graphs are isomorphic graphs if statements from one graphs can be
* mapped 1:1 on to statements in the other graphs. In this mapping, blank nodes are not considered mapped when
* having an identical internal id, but are mapped from one graph to the other by looking at the statements in which
* the blank nodes occur. A Model can consist of more than one graph (denoted by context identifiers). Two models
* are considered isomorphic if for each of the graphs in one model, an isomorphic graph exists in the other model,
* and the context identifiers of these graphs are identical.
*
* @implNote The algorithm used by this comparison is a depth-first search for an iso-canonical blank node mapping
* for each model, and using that as a basis for comparison. The algorithm is described in detail in:
* Hogan, A. (2017). Canonical forms for isomorphic and equivalent RDF graphs: algorithms for leaning and
* labelling blank nodes. ACM Transactions on the Web (TWEB), 11(4), 1-62.
* @see <a href="http://www.w3.org/TR/rdf11-concepts/#graph-isomorphism">RDF Concepts & Abstract Syntax, section
* 3.6 (Graph Comparison)</a>
* @see <a href="http://aidanhogan.com/docs/rdf-canonicalisation.pdf">Hogan, A. (2017). Canonical forms for
* isomorphic and equivalent RDF graphs: algorithms for leaning and labelling blank nodes. ACM Transactions on
* the Web (TWEB), 11(4), 1-62. Technical Paper (PDF )</a>
*/
public static boolean isomorphic(Model model1, Model model2) {
if (model1 == model2) {
return true;
}
if (model1.size() != model2.size()) {
return false;
}
if (model1.contexts().size() != model2.contexts().size()) {
return false;
}
if (model1.contexts().size() > 1) {
// model contains more than one context (including the null context). We compare per individual context.
for (Resource context : model1.contexts()) {
Model contextInModel1 = model1.filter(null, null, null, context);
if (context != null && context.isBNode()) {
// context identifier is a blank node. We to find blank node identifiers in the other model that map
// iso-canonically.
Map<BNode, HashCode> mapping1 = getIsoCanonicalMapping(model1);
Multimap<HashCode, BNode> partitionMapping2 = partitionMapping(getIsoCanonicalMapping(model2));
Collection<BNode> contextCandidates = partitionMapping2.get(mapping1.get(context));
if (contextCandidates.isEmpty()) {
return false;
}
boolean foundIsomorphicBlankNodeContext = false;
for (BNode context2 : contextCandidates) {
Model contextInModel2 = model2.filter(null, null, null, context2);
if (contextInModel1.size() != contextInModel2.size()) {
continue;
}
if (isomorphicSingleContext(contextInModel1, contextInModel2)) {
foundIsomorphicBlankNodeContext = true;
break;
}
}
if (!foundIsomorphicBlankNodeContext) {
return false;
}
} else {
// context identifier is an iri. Simple per-context check will suffice.
Model contextInModel2 = model2.filter(null, null, null, context);
if (contextInModel1.size() != contextInModel2.size()) {
return false;
}
final Model canonicalizedContext1 = isoCanonicalize(contextInModel1);
final Model canonicalizedContext2 = isoCanonicalize(contextInModel2);
if (!canonicalizedContext1.equals(canonicalizedContext2)) {
return false;
}
}
}
return true;
} else {
// only one context (the null context), so we're dealing with one graph only.
return isomorphicSingleContext(model1, model2);
}
}
private static boolean isomorphicSingleContext(Model model1, Model model2) {
final Map<BNode, HashCode> mapping1 = getIsoCanonicalMapping(model1);
if (mapping1.isEmpty()) {
// no blank nodes in model1 - simple collection equality will do
return model1.equals(model2);
}
final Map<BNode, HashCode> mapping2 = getIsoCanonicalMapping(model2);
if (mappingsIncompatible(mapping1, mapping2)) {
return false;
}
// Compatible blank node mapping found. We need to check that statements not involving blank nodes are equal in
// both models.
Optional<Statement> missingInModel2 = model1.stream()
.filter(st -> !(st.getSubject().isBNode() || st.getObject().isBNode()
|| st.getContext() instanceof BNode))
.filter(st -> !model2.contains(st))
.findAny();
// Because we have previously already checked that the models are the same size, we don't have to check both
// ways to establish model equality.
return missingInModel2.isEmpty();
}
private static boolean mappingsIncompatible(Map<BNode, HashCode> mapping1, Map<BNode, HashCode> mapping2) {
if (mapping1.size() != mapping2.size()) {
return true;
}
Set<HashCode> values1 = new HashSet<>(mapping1.values());
Set<HashCode> values2 = new HashSet<>(mapping2.values());
if (!(values1.equals(values2))) {
return true;
}
return false;
}
protected static Model isoCanonicalize(Model m) {
return labelModel(m, getIsoCanonicalMapping(m));
}
protected static Map<BNode, HashCode> getIsoCanonicalMapping(Model m) {
Partitioning partitioning = hashBNodes(m);
if (partitioning.isFine()) {
return partitioning.getCurrentNodeMapping();
}
return distinguish(m, partitioning, null, new ArrayList<>(), new ArrayList<>());
}
protected static Set<BNode> getBlankNodes(Model m) {
final Set<BNode> blankNodes = new HashSet<>();
m.forEach(st -> {
if (st.getSubject().isBNode()) {
blankNodes.add((BNode) st.getSubject());
}
if (st.getObject().isBNode()) {
blankNodes.add((BNode) st.getObject());
}
if (st.getContext() != null && st.getContext().isBNode()) {
blankNodes.add((BNode) st.getContext());
}
});
return blankNodes;
}
private static Map<BNode, HashCode> distinguish(Model m, Partitioning partitioning,
Map<BNode, HashCode> lowestFound, List<BNode> parentFixpoints,
List<Map<BNode, HashCode>> finePartitionMappings) {
for (BNode node : partitioning.getLowestNonTrivialPartition()) {
List<BNode> fixpoints = new ArrayList<>(parentFixpoints);
fixpoints.add(node);
Partitioning clonedPartitioning = new Partitioning(partitioning.getCurrentNodeMapping(),
partitioning.getStaticValueMapping());
clonedPartitioning.setCurrentHashCode(node,
hashTuple(clonedPartitioning.getCurrentHashCode(node), distinguisher));
clonedPartitioning = hashBNodes(m, clonedPartitioning);
if (clonedPartitioning.isFine()) {
finePartitionMappings.add(clonedPartitioning.getCurrentNodeMapping());
if (lowestFound == null
|| clonedPartitioning.getMappingSize().compareTo(partitioning.getMappingSize()) < 0) {
lowestFound = clonedPartitioning.getCurrentNodeMapping();
}
} else {
Map<BNode, BNode> compatibleAutomorphism = findCompatibleAutomorphism(fixpoints, finePartitionMappings);
if (compatibleAutomorphism != null) {
// prune
continue;
}
lowestFound = distinguish(m, clonedPartitioning, lowestFound, fixpoints, finePartitionMappings);
}
}
return lowestFound;
}
protected static Map<BNode, BNode> findCompatibleAutomorphism(List<BNode> fixpoints,
List<Map<BNode, HashCode>> partitionMappings) {
// check if two mappings with identical hash codes exist
for (Map<BNode, HashCode> mapping : partitionMappings) {
Map<BNode, HashCode> compatibleMapping = null;
for (Map<BNode, HashCode> om : partitionMappings) {
if (om.equals(mapping)) {
continue;
}
if (om.values().containsAll(mapping.values())) {
compatibleMapping = om;
break;
}
}
if (compatibleMapping != null) {
Map<HashCode, BNode> invertedMapping = mapping.entrySet()
.stream()
.collect(Collectors.toMap(Entry::getValue, Entry::getKey));
Map<BNode, BNode> automorphism = new HashMap<>();
for (Entry<BNode, HashCode> entry : compatibleMapping.entrySet()) {
automorphism.put(entry.getKey(), invertedMapping.get(entry.getValue()));
}
// check if fixpoints all map, if so we have a compatible automorphism
for (BNode fixpoint : fixpoints) {
if (!automorphism.get(fixpoint).equals(fixpoint)) {
break;
}
return automorphism;
}
}
}
return null;
}
protected static List<Collection<BNode>> partitions(Multimap<HashCode, BNode> partitionMapping) {
List<Collection<BNode>> partition = new ArrayList<>();
for (Entry<HashCode, Collection<BNode>> entry : partitionMapping.asMap().entrySet()) {
partition.add(entry.getValue());
}
return partition;
}
protected static Multimap<HashCode, BNode> partitionMapping(Map<BNode, HashCode> blankNodeMapping) {
return Multimaps.invertFrom(Multimaps.forMap(blankNodeMapping),
MultimapBuilder.hashKeys(blankNodeMapping.keySet().size()).arrayListValues().build());
}
private static Model labelModel(Model original, Map<BNode, HashCode> hash) {
Model result = new DynamicModelFactory().createEmptyModel();
for (Statement st : original) {
if (st.getSubject().isBNode() || st.getObject().isBNode()
|| (st.getContext() != null && st.getContext().isBNode())) {
Resource subject = st.getSubject().isBNode()
? createCanonicalBNode((BNode) st.getSubject(), hash)
: st.getSubject();
IRI predicate = st.getPredicate();
Value object = st.getObject().isBNode()
? createCanonicalBNode((BNode) st.getObject(), hash)
: st.getObject();
Resource context = (st.getContext() != null && st.getContext().isBNode())
? createCanonicalBNode((BNode) st.getContext(), hash)
: st.getContext();
result.add(subject, predicate, object, context);
} else {
result.add(st);
}
}
return result;
}
protected static Partitioning hashBNodes(Model m) {
return hashBNodes(m, null);
}
private static Partitioning hashBNodes(Model m, Partitioning partitioning) {
if (partitioning == null) {
final Set<BNode> blankNodes = getBlankNodes(m);
partitioning = new Partitioning(blankNodes);
}
if (!partitioning.getNodes().isEmpty()) {
do {
partitioning.nextIteration();
for (BNode b : partitioning.getNodes()) {
for (Statement st : m.getStatements(b, null, null)) {
HashCode c = hashTuple(
partitioning.getPreviousHashCode(st.getObject()),
partitioning.getPreviousHashCode(st.getPredicate()),
outgoing);
partitioning.setCurrentHashCode(b,
hashBag(c, partitioning.getCurrentHashCode(b)));
}
for (Statement st : m.getStatements(null, null, b)) {
HashCode c = hashTuple(
partitioning.getPreviousHashCode(st.getSubject()),
partitioning.getPreviousHashCode(st.getPredicate()),
incoming);
partitioning.setCurrentHashCode(b,
hashBag(c, partitioning.getCurrentHashCode(b)));
}
}
} while (!partitioning.isFullyDistinguished());
}
return partitioning;
}
protected static HashCode hashTuple(HashCode... hashCodes) {
return Hashing.combineOrdered(Arrays.asList(hashCodes));
}
protected static HashCode hashBag(HashCode... hashCodes) {
return Hashing.combineUnordered(Arrays.asList(hashCodes));
}
private static BNode createCanonicalBNode(BNode node, Map<BNode, HashCode> mapping) {
return bnode("iso-" + mapping.get(node).toString());
}
/**
* Encapsulates the current partitioning state of the algorithm, keeping track of previous and current node:hashcode
* mappings as well as static value mappings.
*/
static class Partitioning {
private final Map<Value, HashCode> staticValueMapping;
private Map<BNode, HashCode> previousNodeMapping;
private Map<BNode, HashCode> currentNodeMapping;
private Multimap<HashCode, BNode> currentHashCodeMapping;
private final int nodeCount;
public Partitioning(Set<BNode> blankNodes) {
this.staticValueMapping = new HashMap<>();
this.nodeCount = blankNodes.size();
this.currentNodeMapping = new HashMap<>(nodeCount);
blankNodes.forEach(node -> currentNodeMapping.put(node, initialHashCode));
}
public Partitioning(Map<BNode, HashCode> nodeMapping, Map<Value, HashCode> staticValueMapping) {
this.staticValueMapping = staticValueMapping;
this.nodeCount = nodeMapping.keySet().size();
this.currentNodeMapping = new HashMap<>(nodeMapping);
}
public Map<Value, HashCode> getStaticValueMapping() {
return staticValueMapping;
}
public HashCode getCurrentHashCode(Value value) {
if (value.isBNode()) {
return currentNodeMapping.get((BNode) value);
}
if (value.isLiteral()) {
return getStaticLiteralHashCode((Literal) value);
}
return staticValueMapping.computeIfAbsent(value,
v -> hashFunction.hashString(v.stringValue(), StandardCharsets.UTF_8));
}
public Set<BNode> getNodes() {
return currentNodeMapping.keySet();
}
public HashCode getPreviousHashCode(Value value) {
if (value.isBNode()) {
return previousNodeMapping.get((BNode) value);
}
if (value.isLiteral()) {
return getStaticLiteralHashCode((Literal) value);
}
return staticValueMapping.computeIfAbsent(value,
v -> hashFunction.hashString(v.stringValue(), StandardCharsets.UTF_8));
}
public void setCurrentHashCode(BNode bnode, HashCode hashCode) {
currentNodeMapping.put(bnode, hashCode);
}
public Map<BNode, HashCode> getCurrentNodeMapping() {
return Collections.unmodifiableMap(currentNodeMapping);
}
public void nextIteration() {
previousNodeMapping = currentNodeMapping;
currentNodeMapping = new HashMap<>(currentNodeMapping);
currentHashCodeMapping = null;
}
/**
* A partitioning is fine if every hashcode maps to exactly one blank node.
*
* @return true if the partitioning is fine, false otherwise.
*/
public boolean isFine() {
return getCurrentHashCodeMapping().asMap().values().stream().allMatch(member -> member.size() == 1);
}
public boolean isFullyDistinguished() {
if (isFine()) { // no two terms share a hash
return true;
}
return currentUnchanged();
}
public Collection<BNode> getLowestNonTrivialPartition() {
final List<Collection<BNode>> sortedPartitions = new ArrayList<>(
getCurrentHashCodeMapping().asMap().values());
Collections.sort(sortedPartitions, new Comparator<Collection<BNode>>() {
public int compare(Collection<BNode> a, Collection<BNode> b) {
int result = a.size() - b.size();
if (result == 0) {
// break tie by comparing value hash
HashCode hashOfA = currentNodeMapping.get(a.iterator().next());
HashCode hashOfB = currentNodeMapping.get(b.iterator().next());
BigInteger difference = new BigInteger(1, hashOfA.asBytes())
.subtract(new BigInteger(1, hashOfB.asBytes()));
result = difference.compareTo(BigInteger.ZERO);
}
return result;
}
});
Collection<BNode> lowestNonTrivialPartition = sortedPartitions.stream()
.filter(part -> part.size() > 1)
.findFirst()
.orElseThrow(RuntimeException::new);
return lowestNonTrivialPartition;
}
/**
* Return a mapping size to determine a canonical lowest mapping.
*/
public BigInteger getMappingSize() {
BigInteger size = currentNodeMapping.values()
.stream()
.map(h -> new BigInteger(1, h.asBytes()))
.reduce(BigInteger.ZERO, (v1, v2) -> v1.add(v2));
return size;
}
private HashCode getStaticLiteralHashCode(Literal value) {
// GH-2834: we need to include language and datatype when computing a unique hash code for literals
return staticValueMapping.computeIfAbsent(value,
v -> {
Literal l = (Literal) v;
List<HashCode> hashSequence = new ArrayList<>(3);
hashSequence.add(hashFunction.hashString(l.getLabel(), StandardCharsets.UTF_8));
// Per BCP47, language tags are case-insensitive. Use normalized form to ensure consistency if
// possible, otherwise just use lower-case.
l.getLanguage()
.map(lang -> hashFunction.hashString(
Literals.isValidLanguageTag(lang) ? Literals.normalizeLanguageTag(lang)
: lang.toLowerCase(),
StandardCharsets.UTF_8))
.ifPresent(h -> hashSequence.add(h));
hashSequence
.add(hashFunction.hashString(l.getDatatype().stringValue(), StandardCharsets.UTF_8));
return Hashing.combineOrdered(hashSequence);
}
);
}
private Multimap<HashCode, BNode> getCurrentHashCodeMapping() {
if (currentHashCodeMapping == null) {
currentHashCodeMapping = Multimaps.invertFrom(Multimaps.forMap(currentNodeMapping),
HashMultimap.create());
}
return currentHashCodeMapping;
}
/**
* Verify if the current node mapping is unchanged compared to the previous node mapping.
* <p>
* it is unchanged if: all bnodes that have the same hashcode in current also shared the same hashcode in
* previous, and all bnodes that have different ones in current also have different ones in previous.
*
* @return true if unchanged, false otherwise
*/
private boolean currentUnchanged() {
final Multimap<HashCode, BNode> previous = Multimaps.invertFrom(Multimaps.forMap(previousNodeMapping),
HashMultimap.create());
for (Collection<BNode> currentSharedHashNodes : getCurrentHashCodeMapping().asMap().values()) {
// pick a BNode, doesn't matter which: they all share the same hashcode
BNode node = currentSharedHashNodes.iterator().next();
HashCode previousHashCode = previousNodeMapping.get(node);
if (!previous.get(previousHashCode).equals(currentSharedHashNodes)) {
return false;
}
}
return true;
}
}
}