LuceneIndexTest.java
/*******************************************************************************
* Copyright (c) 2015 Eclipse RDF4J contributors, Aduna, and others.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Distribution License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/org/documents/edl-v10.php.
*
* SPDX-License-Identifier: BSD-3-Clause
*******************************************************************************/
package org.eclipse.rdf4j.sail.lucene.impl;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.function.Function;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TotalHitCountCollector;
import org.apache.lucene.store.RAMDirectory;
import org.eclipse.rdf4j.common.concurrent.locks.Properties;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Literal;
import org.eclipse.rdf4j.model.Model;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.ValueFactory;
import org.eclipse.rdf4j.model.base.CoreDatatype;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.model.impl.TreeModel;
import org.eclipse.rdf4j.model.vocabulary.GEO;
import org.eclipse.rdf4j.model.vocabulary.GEOF;
import org.eclipse.rdf4j.query.BindingSet;
import org.eclipse.rdf4j.query.TupleQuery;
import org.eclipse.rdf4j.query.TupleQueryResult;
import org.eclipse.rdf4j.repository.sail.SailRepository;
import org.eclipse.rdf4j.repository.sail.SailRepositoryConnection;
import org.eclipse.rdf4j.repository.util.Repositories;
import org.eclipse.rdf4j.sail.evaluation.TupleFunctionEvaluationMode;
import org.eclipse.rdf4j.sail.lucene.LuceneSail;
import org.eclipse.rdf4j.sail.lucene.SearchFields;
import org.eclipse.rdf4j.sail.memory.MemoryStore;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
public class LuceneIndexTest {
// factory used to build every RDF value of the fixtures below
private static final ValueFactory vf = SimpleValueFactory.getInstance();
// named graphs (contexts) used by the context-aware statements
public static final IRI CONTEXT_1 = vf.createIRI("urn:context1");
public static final IRI CONTEXT_2 = vf.createIRI("urn:context2");
public static final IRI CONTEXT_3 = vf.createIRI("urn:context3");
// create some objects that we will use throughout this test
IRI subject = vf.createIRI("urn:subj");
IRI subject2 = vf.createIRI("urn:subj2");
IRI predicate1 = vf.createIRI("urn:pred1");
IRI predicate2 = vf.createIRI("urn:pred2");
Literal object1 = vf.createLiteral("object1");
Literal object2 = vf.createLiteral("object2");
Literal object3 = vf.createLiteral("cats");
Literal object4 = vf.createLiteral("dogs");
Literal object5 = vf.createLiteral("chicken");
// statementXY: the Y-th statement about subject X (subject = 1, subject2 = 2), without a context
Statement statement11 = vf.createStatement(subject, predicate1, object1);
Statement statement12 = vf.createStatement(subject, predicate2, object2);
Statement statement21 = vf.createStatement(subject2, predicate1, object3);
Statement statement22 = vf.createStatement(subject2, predicate2, object4);
Statement statement23 = vf.createStatement(subject2, predicate2, object5);
// statementContextXYZ: the same triple as statementXY, placed in context Z
Statement statementContext111 = vf.createStatement(subject, predicate1, object1, CONTEXT_1);
Statement statementContext121 = vf.createStatement(subject, predicate2, object2, CONTEXT_1);
Statement statementContext211 = vf.createStatement(subject2, predicate1, object3, CONTEXT_1);
Statement statementContext222 = vf.createStatement(subject2, predicate2, object4, CONTEXT_2);
Statement statementContext232 = vf.createStatement(subject2, predicate2, object5, CONTEXT_2);
// the index under test with its backing directory and analyzer; all three are re-created by setUp()
RAMDirectory directory;
StandardAnalyzer analyzer;
LuceneIndex index;
/**
 * Builds a fresh directory, analyzer and index before every test so that tests do not see each other's documents.
 *
 * @throws Exception if the index cannot be created
 */
@BeforeEach
public void setUp() throws Exception {
    this.directory = new RAMDirectory();
    this.analyzer = new StandardAnalyzer();
    // the index writes into the directory created above
    this.index = new LuceneIndex(this.directory, this.analyzer);
}
/**
 * Shuts the index down after every test and switches lock tracking off again.
 *
 * @throws Exception if shutting down the index fails
 */
@AfterEach
public void tearDown() throws Exception {
    try {
        index.shutDown();
    } finally {
        // reset the global flag even when the shutdown fails, so tests that enabled lock tracking
        // cannot leak that setting into later tests
        Properties.setLockTrackingEnabled(false);
    }
}
/**
 * Walks one subject's document through its life cycle: add a statement, add a second statement for the same
 * subject, remove the first, remove the second. After every commit a new reader is opened (a reader does not see
 * later updates) and the document count and stored field values are checked.
 *
 * @throws IOException    if the index or a reader over its directory fails
 * @throws ParseException if a literal label cannot be parsed as a query
 */
@Test
public void testAddStatement() throws IOException, ParseException {
    Term term = new Term(SearchFields.URI_FIELD_NAME, subject.toString());

    // add a statement to an index
    index.begin();
    index.addStatement(statement11);
    index.commit();

    // check that it arrived properly; try-with-resources closes the reader even when an assertion fails
    try (DirectoryReader reader = DirectoryReader.open(directory)) {
        assertEquals(1, reader.numDocs());

        PostingsEnum docs = termDocs(reader, term);
        assertTrue(next(docs));

        Document document = reader.document(docs.docID());
        assertEquals(subject.toString(), document.get(SearchFields.URI_FIELD_NAME));
        assertEquals(object1.getLabel(), document.get(predicate1.toString()));

        assertFalse(next(docs));
    }

    // add another statement
    index.begin();
    index.addStatement(statement12);
    index.commit();

    // See if everything remains consistent. We must create a new IndexReader
    // in order to be able to see the updates
    try (DirectoryReader reader = DirectoryReader.open(directory)) {
        assertEquals(1, reader.numDocs()); // #docs should *not* have increased

        PostingsEnum docs = termDocs(reader, term);
        assertTrue(next(docs));

        Document document = reader.document(docs.docID());
        assertEquals(subject.toString(), document.get(SearchFields.URI_FIELD_NAME));
        assertEquals(object1.getLabel(), document.get(predicate1.toString()));
        assertEquals(object2.getLabel(), document.get(predicate2.toString()));

        assertFalse(next(docs));

        // see if we can query for these literals
        IndexSearcher searcher = new IndexSearcher(reader);
        QueryParser parser = new QueryParser(SearchFields.TEXT_FIELD_NAME, analyzer);

        Query query = parser.parse(object1.getLabel());
        TotalHitCountCollector results = new TotalHitCountCollector();
        searcher.search(query, results);
        assertEquals(1, results.getTotalHits());

        query = parser.parse(object2.getLabel());
        results = new TotalHitCountCollector();
        searcher.search(query, results);
        assertEquals(1, results.getTotalHits());
    }

    // remove the first statement
    index.begin();
    index.removeStatement(statement11);
    index.commit();

    // check that that statement is actually removed and that the other still
    // exists
    try (DirectoryReader reader = DirectoryReader.open(directory)) {
        assertEquals(1, reader.numDocs());

        PostingsEnum docs = termDocs(reader, term);
        assertTrue(next(docs));

        Document document = reader.document(docs.docID());
        assertEquals(subject.toString(), document.get(SearchFields.URI_FIELD_NAME));
        assertNull(document.get(predicate1.toString()));
        assertEquals(object2.getLabel(), document.get(predicate2.toString()));

        assertFalse(next(docs));
    }

    // remove the other statement
    index.begin();
    index.removeStatement(statement12);
    index.commit();

    // check that there are no documents left (i.e. the last Document was
    // removed completely, rather than its remaining triple removed)
    try (DirectoryReader reader = DirectoryReader.open(directory)) {
        assertEquals(0, reader.numDocs());
    }
}
/**
 * Looks up the postings of a single term across the whole reader.
 * <p>
 * NB: this is a convenient but very slow way of getting termDocs. It is sufficient for testing purposes.
 *
 * @param reader the reader to look the term up in
 * @param term   the field/value pair to look up
 * @return the postings as returned by {@code MultiTerms.getTermPostingsEnum}; NOTE(review): Lucene documents that
 *         this may be {@code null} when the field or term does not exist - confirm for the Lucene version in use
 * @throws IOException if reading from the index fails
 */
private static PostingsEnum termDocs(IndexReader reader, Term term) throws IOException {
    final String fieldName = term.field();
    return MultiTerms.getTermPostingsEnum(reader, fieldName, term.bytes());
}
/**
 * Advances the given postings to the next document.
 *
 * @param docs the postings to advance; may be {@code null}, which is what Lucene's
 *             {@code MultiTerms.getTermPostingsEnum} is documented to return for an unknown field or term
 * @return {@code true} if another document is available, {@code false} if the postings are exhausted or
 *         {@code docs} is {@code null}
 * @throws IOException if reading from the index fails
 */
private static boolean next(PostingsEnum docs) throws IOException {
    // a missing term has no documents: report "no more" so the calling assertion fails cleanly
    // instead of the helper throwing a NullPointerException
    return docs != null && docs.nextDoc() != PostingsEnum.NO_MORE_DOCS;
}
/**
 * Adds four statements about two subjects in a single batch, then replaces one statement of the second subject by
 * another in a second batch, checking the per-subject documents after each step.
 *
 * @throws Exception if the index cannot be updated or read
 */
@Test
public void testAddMultiple() throws Exception {
    // first batch: two statements per subject, nothing to remove
    HashSet<Statement> toAdd = new HashSet<>();
    HashSet<Statement> toRemove = new HashSet<>();
    toAdd.add(statement11);
    toAdd.add(statement12);
    toAdd.add(statement21);
    toAdd.add(statement22);

    index.begin();
    index.addRemoveStatements(toAdd, toRemove);
    index.commit();

    // check that it arrived properly: one document per subject
    try (DirectoryReader dirReader = DirectoryReader.open(directory)) {
        assertEquals(2, dirReader.numDocs());
    }

    // check the documents
    Document doc = index.getDocuments(subject).iterator().next();
    assertEquals(subject.toString(), doc.get(SearchFields.URI_FIELD_NAME));
    assertStatement(statement11, doc);
    assertStatement(statement12, doc);

    doc = index.getDocuments(subject2).iterator().next();
    assertEquals(subject2.toString(), doc.get(SearchFields.URI_FIELD_NAME));
    assertStatement(statement21, doc);
    assertStatement(statement22, doc);

    // check if the text field stores all added string values
    Set<String> expectedTexts = new HashSet<>();
    expectedTexts.add("cats");
    expectedTexts.add("dogs");
    // FIXME
    // assertTexts(texts, document);

    // second batch: add/remove one
    toAdd.clear();
    toRemove.clear();
    toAdd.add(statement23);
    toRemove.add(statement22);

    index.begin();
    index.addRemoveStatements(toAdd, toRemove);
    index.commit();

    // check doc 2
    doc = index.getDocuments(subject2).iterator().next();
    assertEquals(subject2.toString(), doc.get(SearchFields.URI_FIELD_NAME));
    assertStatement(statement21, doc);
    assertStatement(statement23, doc);
    assertNoStatement(statement22, doc);

    // check if the text field stores all added and no deleted string values
    expectedTexts.remove("dogs");
    expectedTexts.add("chicken");
    // FIXME
    // assertTexts(texts, document);

    // TODO: check deletion of the rest
}
/**
 * Adds statements in two contexts through a repository, clears {@code CONTEXT_1} and checks that only the
 * statements of that context disappear from the index.
 * <p>
 * Contexts can only be tested in combination with a sail, as the triples have to be retrieved from the sail.
 *
 * @throws Exception if the repository or the index reports an error
 */
@Test
public void testContexts() throws Exception {
    // the base store is created before lock tracking gets enabled
    MemoryStore baseStore = new MemoryStore();
    Properties.setLockTrackingEnabled(true);

    // LuceneSail on top of the base store, writing into the index under test
    LuceneSail luceneSail = new LuceneSail();
    luceneSail.setBaseSail(baseStore);
    luceneSail.setLuceneIndex(index);

    Statement[] inContext1 = { statementContext111, statementContext121, statementContext211 };
    Statement[] inContext2 = { statementContext222, statementContext232 };

    // create a Repository wrapping the LuceneSail
    SailRepository repo = new SailRepository(luceneSail);
    try (SailRepositoryConnection conn = repo.getConnection()) {
        // add every statement to its own context in a single transaction
        conn.begin();
        for (Statement st : inContext1) {
            conn.add(st, st.getContext());
        }
        for (Statement st : inContext2) {
            conn.add(st, st.getContext());
        }
        conn.commit();

        // check if they are there
        for (Statement st : inContext1) {
            assertStatement(st);
        }
        for (Statement st : inContext2) {
            assertStatement(st);
        }

        // delete context 1
        conn.begin();
        conn.clear(CONTEXT_1);
        conn.commit();

        // context 1 is gone, context 2 is untouched
        for (Statement st : inContext1) {
            assertNoStatement(st);
        }
        for (Statement st : inContext2) {
            assertStatement(st);
        }
    } finally {
        // close repo
        repo.shutDown();
    }
}
/**
 * Adds statements in two contexts through a repository, clears {@code CONTEXT_2} and checks that only the
 * statements of that context disappear from the index.
 * <p>
 * Contexts can only be tested in combination with a sail, as the triples have to be retrieved from the sail.
 *
 * @throws Exception if the repository or the index reports an error
 */
@Test
public void testContextsRemoveContext2() throws Exception {
    // the base store is created before lock tracking gets enabled
    MemoryStore baseStore = new MemoryStore();
    Properties.setLockTrackingEnabled(true);

    // LuceneSail on top of the base store, writing into the index under test
    LuceneSail luceneSail = new LuceneSail();
    luceneSail.setBaseSail(baseStore);
    luceneSail.setLuceneIndex(index);

    Statement[] inContext1 = { statementContext111, statementContext121, statementContext211 };
    Statement[] inContext2 = { statementContext222, statementContext232 };

    // create a Repository wrapping the LuceneSail
    SailRepository repo = new SailRepository(luceneSail);
    try (SailRepositoryConnection conn = repo.getConnection()) {
        // add every statement to its own context in a single transaction
        conn.begin();
        for (Statement st : inContext1) {
            conn.add(st, st.getContext());
        }
        for (Statement st : inContext2) {
            conn.add(st, st.getContext());
        }
        conn.commit();

        // check if they are there
        for (Statement st : inContext1) {
            assertStatement(st);
        }
        for (Statement st : inContext2) {
            assertStatement(st);
        }

        // delete context 2
        conn.begin();
        conn.clear(CONTEXT_2);
        conn.commit();

        // context 1 is untouched, context 2 is gone
        for (Statement st : inContext1) {
            assertStatement(st);
        }
        for (Statement st : inContext2) {
            assertNoStatement(st);
        }
    } finally {
        // close repo
        repo.shutDown();
    }
}
/**
 * Checks which literals the index accepts: the test expects plain and {@code xsd:string} literals to be accepted
 * and an {@code xsd:float} literal to be rejected.
 */
@Test
public void testRejectedDatatypes() {
    IRI xsdString = vf.createIRI("http://www.w3.org/2001/XMLSchema#string");
    IRI xsdFloat = vf.createIRI("http://www.w3.org/2001/XMLSchema#float");

    Literal plainText = vf.createLiteral("hi there");
    Literal typedText = vf.createLiteral("hi there, too", xsdString);
    Literal plainNumber = vf.createLiteral("1.0");
    Literal typedNumber = vf.createLiteral("1.0", xsdFloat);

    assertTrue(index.accept(plainText), "Is the first literal accepted?");
    assertTrue(index.accept(typedText), "Is the second literal accepted?");
    assertTrue(index.accept(plainNumber), "Is the third literal accepted?");
    assertFalse(index.accept(typedNumber), "Is the fourth literal accepted?");
}
/**
 * Checks that an index initialized from properties instantiates the analyzer classes named by
 * {@code LuceneSail.ANALYZER_CLASS_KEY} and {@code LuceneSail.QUERY_ANALYZER_CLASS_KEY}.
 *
 * @throws Exception if the index cannot be initialized or shut down
 */
@Test
public void testInstantiatesCustomQueryAnalyzer() throws Exception {
    java.util.Properties props = new java.util.Properties();
    props.setProperty(LuceneSail.QUERY_ANALYZER_CLASS_KEY, EnglishAnalyzer.class.getName());
    props.setProperty(LuceneSail.ANALYZER_CLASS_KEY, EnglishAnalyzer.class.getName());
    props.setProperty(LuceneSail.LUCENE_RAMDIR_KEY, "true");

    // a separate index, deliberately not named like the fixture field it would otherwise shadow
    LuceneIndex customIndex = new LuceneIndex();
    customIndex.initialize(props);
    try {
        assertTrue(customIndex.getAnalyzer() instanceof EnglishAnalyzer);
        assertTrue(customIndex.getQueryAnalyzer() instanceof EnglishAnalyzer);
    } finally {
        // tearDown only shuts down the fixture index, so this one has to be released here
        customIndex.shutDown();
    }
}
/**
 * Fails unless the index holds a document for the statement's subject and context that contains the statement.
 *
 * @param statement the statement expected to be indexed
 * @throws Exception if the document lookup fails
 */
private void assertStatement(Statement statement) throws Exception {
    Resource subj = statement.getSubject();
    Document found = index.getDocument(subj, statement.getContext());
    if (found != null) {
        assertStatement(statement, found);
        return;
    }
    fail("Missing document " + subj);
}
/**
 * Fails if the index holds a document for the statement's subject and context that still contains the statement.
 * A missing document counts as success.
 *
 * @param statement the statement expected to be absent from the index
 * @throws Exception if the document lookup fails
 */
private void assertNoStatement(Statement statement) throws Exception {
    Document found = index.getDocument(statement.getSubject(), statement.getContext());
    if (found != null) {
        assertNoStatement(statement, found);
    }
}
/**
 * Fails unless one of the document's fields for the statement's predicate stores the label of the statement's
 * object. The object is cast to {@link Literal}.
 *
 * @param statement the statement to look for
 * @param document  the document expected to contain the statement
 */
private void assertStatement(Statement statement, Document document) {
    String propertyField = SearchFields.getPropertyField(statement.getPredicate());
    IndexableField[] candidates = document.getFields(propertyField);
    assertNotNull(candidates, "field " + statement.getPredicate() + " not found in document " + document);

    for (int i = 0; i < candidates.length; i++) {
        String expectedLabel = ((Literal) statement.getObject()).getLabel();
        if (expectedLabel.equals(candidates[i].stringValue())) {
            // found a field holding the object's label
            return;
        }
    }
    fail("Statement not found in document " + statement);
}
/**
 * Fails if any of the document's fields for the statement's predicate stores the label of the statement's object.
 * The object is cast to {@link Literal}.
 *
 * @param statement the statement that must not be present
 * @param document  the document to inspect
 */
private void assertNoStatement(Statement statement, Document document) {
    String propertyField = SearchFields.getPropertyField(statement.getPredicate());
    IndexableField[] candidates = document.getFields(propertyField);
    if (candidates != null) {
        for (int i = 0; i < candidates.length; i++) {
            String unexpectedLabel = ((Literal) statement.getObject()).getLabel();
            if (unexpectedLabel.equals(candidates[i].stringValue())) {
                fail("Statement should not be found in document " + statement);
            }
        }
    }
}
/*
* private void assertTexts(Set<String> texts, Document document) { Set<String> toFind = new HashSet<String>(texts);
* Set<String> found = new HashSet<String>(); for(Field field : document.getFields(LuceneIndex.TEXT_FIELD_NAME)) {
* // is the field value expected and not yet been found? if(toFind.remove(field.stringValue())) { // add it to the
* found set // (it was already remove from the toFind list in the if clause) found.add(field.stringValue()); } else
* { assertEquals( "Was the text value '" + field.stringValue() + "' expected to exist?", false, true); } }
* for(String notFound : toFind) { assertEquals("Was the expected text value '" + notFound + "' found?", true,
* false); } }
*/
/**
 * Runs three GeoSPARQL queries (one {@code geof:distance} filter and two {@code geof:ehContains} filters) against
 * a plain {@link MemoryStore} repository and against a {@link LuceneSail} repository that uses the index under
 * test, and checks that both return the expected subjects.
 */
@Test
public void geoSparqlQueryTest() {
    final String prefix = "http://www.example.org/#";
    final String prefixes = "PREFIX ex: <" + prefix + ">\n"
            + "PREFIX geof: <" + GEOF.NAMESPACE + ">\n"
            + "PREFIX geo: <" + CoreDatatype.GEO.NAMESPACE + ">\n"
            + "PREFIX uom: <http://www.opengis.net/def/uom/OGC/1.0/>\n";

    // four subjects, each with one WKT point
    Model data = new TreeModel();
    IRI cp = vf.createIRI(prefix + "cp");
    IRI bm = vf.createIRI(prefix + "bm");
    IRI nkv = vf.createIRI(prefix + "nkv");
    data.add(cp, GEO.AS_WKT, vf.createLiteral("Point(4.38436 45.44917)", CoreDatatype.GEO.WKT_LITERAL));
    data.add(bm, GEO.AS_WKT, vf.createLiteral("Point(4.38311 45.45423)", CoreDatatype.GEO.WKT_LITERAL));
    data.add(nkv, GEO.AS_WKT, vf.createLiteral("Point(4.87306 45.77903)", CoreDatatype.GEO.WKT_LITERAL));
    data.add(vf.createIRI(prefix + "arp"), GEO.AS_WKT,
            vf.createLiteral("Point(2.89271 42.69848)", CoreDatatype.GEO.WKT_LITERAL));

    String polyVill = "POLYGON((4.864712 45.784405, 4.883165 45.787756, 4.889946 45.785781, 4.904881 45.767403, 4.900761 45.765487, 4.872093 45.770995, 4.86454 45.770457, 4.858789 45.770277, 4.859905 45.784644, 4.864712 45.784405))";
    String polySain = "POLYGON((4.380627 45.463983, 4.400539 45.462177, 4.428349 45.436286, 4.399509 45.411346, 4.374447 45.426528, 4.370499 45.450618, 4.380627 45.463983))";

    // m1: reference repository without Lucene
    SailRepository m1 = new SailRepository(new MemoryStore());

    // m2: LuceneSail using the index under test, with the WKT predicate registered as a geo field
    LuceneSail lc = new LuceneSail();
    lc.setBaseSail(new MemoryStore());
    lc.setParameter(LuceneSail.WKT_FIELDS, GEO.AS_WKT.toString());
    lc.setLuceneIndex(index);
    lc.setEvaluationMode(TupleFunctionEvaluationMode.NATIVE);
    SailRepository m2 = new SailRepository(lc);

    try {
        // add test data
        Repositories.consume(m1, conn -> conn.add(data));
        Repositories.consume(m2, conn -> conn.add(data));
        lc.reindex();

        // collects the bindings of ?v of a result into a set
        Function<TupleQueryResult, Set<Value>> toval = (res) -> {
            Set<Value> list = new HashSet<>();
            while (res.hasNext()) {
                BindingSet next = res.next();
                list.add(next.getValue("v"));
            }
            return list;
        };

        // test queries
        String q0 = prefixes
                + "SELECT * {\n"
                + " ?v geo:asWKT ?loc .\n"
                + " FILTER(geof:distance(\"Point(4.386914 45.440637)\"^^geo:wktLiteral, ?loc, uom:metre) < 10000) \n"
                + "}\n";
        Set<Value> q0ex = Set.of(bm, cp);

        String q1 = prefixes
                + "SELECT * {\n"
                + " ?v geo:asWKT ?loc .\n"
                + " FILTER(geof:ehContains(\"" + polySain + "\"^^geo:wktLiteral, ?loc)) \n"
                + "}\n";
        Set<Value> q1ex = Set.of(bm, cp);

        String q2 = prefixes
                + "SELECT * {\n"
                + " ?v geo:asWKT ?loc .\n"
                + " FILTER(geof:ehContains(\"" + polyVill + "\"^^geo:wktLiteral, ?loc)) \n"
                + "}\n";
        Set<Value> q2ex = Set.of(nkv);

        // the reference repository must return the expected subjects ...
        Set<Value> nlcq0 = Repositories.tupleQuery(m1, q0, toval);
        Set<Value> nlcq1 = Repositories.tupleQuery(m1, q1, toval);
        Set<Value> nlcq2 = Repositories.tupleQuery(m1, q2, toval);

        assertEquals(q0ex, nlcq0);
        assertEquals(q1ex, nlcq1);
        assertEquals(q2ex, nlcq2);

        // ... and the Lucene-backed repository must agree with it
        assertEquals(nlcq0, Repositories.tupleQuery(m2, q0, toval));
        assertEquals(nlcq1, Repositories.tupleQuery(m2, q1, toval));
        assertEquals(nlcq2, Repositories.tupleQuery(m2, q2, toval));
    } finally {
        // release both repositories even when an assertion fails, as the context tests do
        try {
            m2.shutDown();
        } finally {
            m1.shutDown();
        }
    }
}
}