PDFIncrementalUpdatesTest.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.pdf;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.List;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.io.RandomAccessReadBuffer;
import org.junit.jupiter.api.Test;
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.PDF;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.updates.StartXRefOffset;
import org.apache.tika.parser.pdf.updates.StartXRefScanner;
public class PDFIncrementalUpdatesTest extends TikaTest {
/*
Test files with incremental updates?
testPDF_Version.4.x.pdf 1
testPDFTwoTextBoxes.pdf 1
testPDF_incrementalUpdates.pdf 2
testOptionalHyphen.pdf 1
testPageNumber.pdf 1
testPDF_twoAuthors.pdf 1
testPDF_XFA_govdocs1_258578.pdf 1
testJournalParser.pdf 1
testPDF_bookmarks.pdf 1
testPDFVarious.pdf 1
*/
/*
Many thanks to Tyler Thorsted for sharing "testPDF_incrementalUpdates.pdf"
*/
@Test
public void testIncrementalUpdateInfoExtracted() throws Exception {
PDFParserConfig pdfParserConfig = new PDFParserConfig();
pdfParserConfig.setExtractIncrementalUpdateInfo(true);
ParseContext parseContext = new ParseContext();
parseContext.set(PDFParserConfig.class, pdfParserConfig);
List<Metadata> metadataList = getRecursiveMetadata(
"testPDF_incrementalUpdates.pdf",
parseContext);
assertEquals(2, metadataList.get(0).getInt(TikaCoreProperties.VERSION_COUNT));
assertEquals(2, metadataList.get(0).getInt(PDF.PDF_INCREMENTAL_UPDATE_COUNT));
long[] expected = new long[]{16242, 41226, 64872};
long[] eofs = metadataList.get(0).getLongValues(PDF.EOF_OFFSETS);
assertEquals(3, eofs.length);
assertArrayEquals(expected, eofs);
}
@Test
public void testTooLongLong() throws Exception {
String s = "blah blah startxref 01234567890123456789\n%%EOF blah";
assertEquals(0, getOffsets(s).size());
}
@Test
public void testMissingEOF() throws Exception {
String s = "blah blah startxref 123456\nblah";
assertEquals(1, getOffsets(s).size());
}
@Test
public void testBrokenEOF() throws Exception {
String s = "blah blah startxref 123456\n%%EO\nstartxref 234567\n%%EOF\nblah";
assertEquals(2, getOffsets(s).size());
}
@Test
public void testNoSpace1() throws Exception {
String s = "blah blah startxref123456\n%%EOF\nblah";
assertEquals(1, getOffsets(s).size());
}
@Test
public void testNoSpace2() throws Exception {
String s = "blah blah startxref 123456%%EOF\nblah";
assertEquals(1, getOffsets(s).size());
}
@Test
public void testNoStartXref() throws Exception {
String s = "blah blah startxref not a startxre";
assertEquals(0, getOffsets(s).size());
}
@Test
public void testLongAtEOF() throws Exception {
//we should not count longs at EOF because
//they might be truncated?
String s = "blah blah startxref 100";
assertEquals(0, getOffsets(s).size());
}
@Test
public void testCommentInsteadOfEOF() throws Exception {
String s = "blah blah startxref 123456\n%%regular comment\n%%EOF";
assertEquals(1, getOffsets(s).size());
}
@Test
public void testStartxStartXref() throws Exception {
//make sure that we are rewinding last character
String s = "blah blah startxstartxref 123456\n%%EOFblah";
assertEquals(1, getOffsets(s).size());
}
private List<StartXRefOffset> getOffsets(String s) throws IOException {
try (RandomAccessRead randomAccessRead =
new RandomAccessReadBuffer(s.getBytes(StandardCharsets.US_ASCII))) {
StartXRefScanner scanner = new StartXRefScanner(randomAccessRead);
return scanner.scan();
}
}
@Test
public void testIncrementalUpdateParsing() throws Exception {
PDFParserConfig pdfParserConfig = new PDFParserConfig();
pdfParserConfig.setParseIncrementalUpdates(true);
ParseContext parseContext = new ParseContext();
parseContext.set(PDFParserConfig.class, pdfParserConfig);
List<Metadata> metadataList = getRecursiveMetadata(
"testPDF_incrementalUpdates.pdf",
parseContext);
assertEquals(3, metadataList.size());
assertEquals(2, metadataList.get(0).getInt(PDF.PDF_INCREMENTAL_UPDATE_COUNT));
assertEquals(2, metadataList.get(0).getInt(TikaCoreProperties.VERSION_COUNT));
long[] expected = new long[]{16242, 41226, 64872};
long[] eofs = metadataList.get(0).getLongValues(PDF.EOF_OFFSETS);
assertEquals(3, eofs.length);
assertArrayEquals(expected, eofs);
assertContains("Testing Incremental", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
assertNotContained("Testing Incremental",
metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
assertContains("Testing Incremental", metadataList.get(2).get(TikaCoreProperties.TIKA_CONTENT));
assertNull(metadataList.get(0).get(PDF.INCREMENTAL_UPDATE_NUMBER));
assertNull(metadataList.get(0).get(PDF.INCREMENTAL_UPDATE_NUMBER));
assertEquals(0, metadataList.get(1).getInt(PDF.INCREMENTAL_UPDATE_NUMBER));
assertEquals(1, metadataList.get(2).getInt(PDF.INCREMENTAL_UPDATE_NUMBER));
assertEquals("/version-number-0",
metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
assertEquals("/version-number-1",
metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
assertNull(metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
assertNull(metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY));
assertEquals(TikaCoreProperties.EmbeddedResourceType.VERSION.toString(),
metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
assertEquals(TikaCoreProperties.EmbeddedResourceType.VERSION.toString(),
metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
}
//TODO: embed the incremental updates PDF inside another doc and confirm it works
}