PDFMergerUtilityTest.java
/*
* Copyright 2014 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.multipdf;
import java.awt.image.BufferedImage;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.io.RandomAccessStreamCache.StreamCacheCreateFunction;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.COSObjectable;
import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
import org.apache.pdfbox.pdmodel.common.PDNumberTreeNode;
import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDMarkedContentReference;
import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDParentTreeValue;
import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureElement;
import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureNode;
import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionGoTo;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationPopup;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationText;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationWidget;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDNamedDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageFitDestination;
import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
import org.apache.pdfbox.pdmodel.interactive.form.PDField;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFMarkedContentExtractor;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertSame;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.parallel.Execution;
import org.junit.jupiter.api.parallel.ExecutionMode;
/**
* Test suite for PDFMergerUtility.
*
* @author Maruan Sahyoun (PDF files)
* @author Tilman Hausherr (code)
*/
@Execution(ExecutionMode.CONCURRENT)
class PDFMergerUtilityTest
{
private static final String SRCDIR = "src/test/resources/input/merge/";
private static final String TARGETTESTDIR = "target/test-output/merge/";
private static final File TARGETPDFDIR = new File("target/pdfs");
private static final int DPI = 96;
@BeforeAll
static void setUp()
{
new File(TARGETTESTDIR).mkdirs();
}
/**
* Tests whether the merge of two PDF files with identically named but
* different global resources works. The two PDF files have two fonts each
* named /TT1 and /TT0 that are Arial and Courier and vice versa in the
* second file. Revisions before 1613017 fail this test because global
* resources were merged which made trouble when resources of the same kind
* had the same name.
*
* @throws IOException if something goes wrong.
*/
@Test
void testPDFMergerUtility() throws IOException
{
checkMergeIdentical("PDFBox.GlobalResourceMergeTest.Doc01.decoded.pdf",
"PDFBox.GlobalResourceMergeTest.Doc02.decoded.pdf",
"GlobalResourceMergeTestResult1.pdf",
IOUtils.createMemoryOnlyStreamCache());
// once again, with scratch file
checkMergeIdentical("PDFBox.GlobalResourceMergeTest.Doc01.decoded.pdf",
"PDFBox.GlobalResourceMergeTest.Doc02.decoded.pdf",
"GlobalResourceMergeTestResult2.pdf",
IOUtils.createTempFileOnlyStreamCache());
}
// see PDFBOX-2893
@Test
void testPDFMergerUtility2() throws IOException
{
checkMergeIdentical("PDFBox.GlobalResourceMergeTest.Doc01.pdf",
"PDFBox.GlobalResourceMergeTest.Doc02.pdf",
"GlobalResourceMergeTestResult3.pdf",
IOUtils.createMemoryOnlyStreamCache());
// once again, with scratch file
checkMergeIdentical("PDFBox.GlobalResourceMergeTest.Doc01.pdf",
"PDFBox.GlobalResourceMergeTest.Doc02.pdf",
"GlobalResourceMergeTestResult4.pdf",
IOUtils.createTempFileOnlyStreamCache());
}
/**
* Tests whether the merge of two PDF files with JPEG and CCITT works. A few revisions before
* 1704911 this test failed because the clone utility attempted to decode and re-encode the
* streams, see PDFBOX-2893 on 23.9.2015.
*
* @throws IOException if something goes wrong.
*/
@Test
void testJpegCcitt() throws IOException
{
checkMergeIdentical("jpegrgb.pdf",
"multitiff.pdf",
"JpegMultiMergeTestResult.pdf",
IOUtils.createMemoryOnlyStreamCache());
// once again, with scratch file
checkMergeIdentical("jpegrgb.pdf",
"multitiff.pdf",
"JpegMultiMergeTestResult.pdf",
IOUtils.createTempFileOnlyStreamCache());
}
/**
* PDFBOX-3972: Test that OpenAction page destination isn't lost after merge.
*
* @throws IOException
*/
@Test
void testPDFMergerOpenAction() throws IOException
{
try (PDDocument doc1 = new PDDocument())
{
doc1.addPage(new PDPage());
doc1.addPage(new PDPage());
doc1.addPage(new PDPage());
doc1.save(new File(TARGETTESTDIR,"MergerOpenActionTest1.pdf"));
}
PDPageDestination dest;
try (PDDocument doc2 = new PDDocument())
{
doc2.addPage(new PDPage());
doc2.addPage(new PDPage());
doc2.addPage(new PDPage());
dest = new PDPageFitDestination();
dest.setPage(doc2.getPage(1));
doc2.getDocumentCatalog().setOpenAction(dest);
doc2.save(new File(TARGETTESTDIR,"MergerOpenActionTest2.pdf"));
}
PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
pdfMergerUtility.addSource(new File(TARGETTESTDIR, "MergerOpenActionTest1.pdf"));
pdfMergerUtility.addSource(new File(TARGETTESTDIR, "MergerOpenActionTest2.pdf"));
pdfMergerUtility.setDestinationFileName(TARGETTESTDIR + "MergerOpenActionTestResult.pdf");
pdfMergerUtility.mergeDocuments(IOUtils.createMemoryOnlyStreamCache());
try (PDDocument mergedDoc = Loader
.loadPDF(new File(TARGETTESTDIR, "MergerOpenActionTestResult.pdf")))
{
PDDocumentCatalog documentCatalog = mergedDoc.getDocumentCatalog();
dest = (PDPageDestination) documentCatalog.getOpenAction();
assertEquals(4, documentCatalog.getPages().indexOf(dest.getPage()));
}
}
/**
* PDFBOX-3999: check that page entries in the structure tree only reference pages from the page
* tree, i.e. that no orphan pages exist.
*
* @throws IOException
*/
@Test
void testStructureTreeMerge() throws IOException
{
PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
PDDocument src = Loader
.loadPDF(new File(TARGETPDFDIR, "PDFBOX-3999-GeneralForbearance.pdf"));
ElementCounter elementCounter = new ElementCounter();
elementCounter.walk(src.getDocumentCatalog().getStructureTreeRoot().getK());
int singleCnt = elementCounter.cnt;
int singleSetSize = elementCounter.set.size();
assertEquals(134, singleCnt);
assertEquals(134, singleSetSize);
PDDocument dst = Loader
.loadPDF(new File(TARGETPDFDIR, "PDFBOX-3999-GeneralForbearance.pdf"));
pdfMergerUtility.appendDocument(dst, src);
src.close();
dst.save(new File(TARGETTESTDIR, "PDFBOX-3999-GeneralForbearance-merged.pdf"));
dst.close();
PDDocument doc = Loader
.loadPDF(new File(TARGETTESTDIR, "PDFBOX-3999-GeneralForbearance-merged.pdf"));
// Assume that the merged tree has double element count
elementCounter = new ElementCounter();
elementCounter.walk(doc.getDocumentCatalog().getStructureTreeRoot().getK());
assertEquals(singleCnt * 2, elementCounter.cnt);
assertEquals(singleSetSize * 2, elementCounter.set.size());
checkForPageOrphans(doc);
doc.close();
}
/**
* PDFBOX-3999: check that no streams are kept from the source document by the destination
* document, despite orphan annotations remaining in the structure tree.
*
* @throws IOException
*/
@Test
void testStructureTreeMerge2() throws IOException
{
PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
PDDocument doc = Loader
.loadPDF(new File(TARGETPDFDIR, "PDFBOX-3999-GeneralForbearance.pdf"));
doc.getDocumentCatalog().getAcroForm().flatten();
doc.save(new File(TARGETTESTDIR, "PDFBOX-3999-GeneralForbearance-flattened.pdf"));
ElementCounter elementCounter = new ElementCounter();
elementCounter.walk(doc.getDocumentCatalog().getStructureTreeRoot().getK());
int singleCnt = elementCounter.cnt;
int singleSetSize = elementCounter.set.size();
assertEquals(134, singleCnt);
assertEquals(134, singleSetSize);
doc.close();
PDDocument src = Loader
.loadPDF(new File(TARGETTESTDIR, "PDFBOX-3999-GeneralForbearance-flattened.pdf"));
PDDocument dst = Loader
.loadPDF(new File(TARGETTESTDIR, "PDFBOX-3999-GeneralForbearance-flattened.pdf"));
pdfMergerUtility.appendDocument(dst, src);
// before solving PDFBOX-3999, the close() below brought
// IOException: COSStream has been closed and cannot be read.
src.close();
dst.save(new File(TARGETTESTDIR, "PDFBOX-3999-GeneralForbearance-flattened-merged.pdf"));
dst.close();
doc = Loader.loadPDF(
new File(TARGETTESTDIR, "PDFBOX-3999-GeneralForbearance-flattened-merged.pdf"));
checkForPageOrphans(doc);
// Assume that the merged tree has double element count
elementCounter = new ElementCounter();
elementCounter.walk(doc.getDocumentCatalog().getStructureTreeRoot().getK());
assertEquals(singleCnt * 2, elementCounter.cnt);
assertEquals(singleSetSize * 2, elementCounter.set.size());
doc.close();
}
/**
* PDFBOX-4408: Check that /StructParents values from pages and /StructParent values from
* annotations are found in the /ParentTree.
*
* @throws IOException
*/
@Test
void testStructureTreeMerge3() throws IOException
{
PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
PDDocument src = Loader.loadPDF(new File(TARGETPDFDIR, "PDFBOX-4408.pdf"));
ElementCounter elementCounter = new ElementCounter();
elementCounter.walk(src.getDocumentCatalog().getStructureTreeRoot().getK());
int singleCnt = elementCounter.cnt;
int singleSetSize = elementCounter.set.size();
assertEquals(25, singleCnt);
assertEquals(25, singleSetSize);
PDDocument dst = Loader.loadPDF(new File(TARGETPDFDIR, "PDFBOX-4408.pdf"));
pdfMergerUtility.appendDocument(dst, src);
src.close();
dst.save(new File(TARGETTESTDIR, "PDFBOX-4408-merged.pdf"));
dst.close();
dst = Loader.loadPDF(new File(TARGETTESTDIR, "PDFBOX-4408-merged.pdf"));
// Assume that the merged tree has double element count
elementCounter = new ElementCounter();
elementCounter.walk(dst.getDocumentCatalog().getStructureTreeRoot().getK());
assertEquals(singleCnt * 2, elementCounter.cnt);
assertEquals(singleSetSize * 2, elementCounter.set.size());
checkWithNumberTree(dst);
checkForPageOrphans(dst);
dst.close();
checkStructTreeRootCount(new File(TARGETTESTDIR, "PDFBOX-4408-merged.pdf"));
}
/**
* PDFBOX-4417: Same as the previous tests, but this one failed when the previous tests
* succeeded because of more bugs with cloning.
*
* @throws IOException
*/
@Test
void testStructureTreeMerge4() throws IOException
{
PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
PDDocument src = Loader.loadPDF(new File(SRCDIR, "PDFBOX-4417-001031.pdf"));
ElementCounter elementCounter = new ElementCounter();
elementCounter.walk(src.getDocumentCatalog().getStructureTreeRoot().getK());
int singleCnt = elementCounter.cnt;
int singleSetSize = elementCounter.set.size();
assertEquals(104, singleCnt);
assertEquals(104, singleSetSize);
PDDocument dst = Loader.loadPDF(new File(SRCDIR, "PDFBOX-4417-001031.pdf"));
pdfMergerUtility.appendDocument(dst, src);
src.close();
dst.save(new File(TARGETTESTDIR, "PDFBOX-4417-001031-merged.pdf"));
dst.close();
dst = Loader.loadPDF(new File(TARGETTESTDIR, "PDFBOX-4417-001031-merged.pdf"));
// Assume that the merged tree has double element count
elementCounter = new ElementCounter();
elementCounter.walk(dst.getDocumentCatalog().getStructureTreeRoot().getK());
assertEquals(singleCnt * 2, elementCounter.cnt);
assertEquals(singleSetSize * 2, elementCounter.set.size());
checkWithNumberTree(dst);
checkForPageOrphans(dst);
dst.close();
checkStructTreeRootCount(new File(TARGETTESTDIR, "PDFBOX-4417-001031-merged.pdf"));
}
/**
* PDFBOX-4417: Same as the previous tests, but this one failed when the previous tests
* succeeded because the /K tree started with two dictionaries and not with an array.
*
* @throws IOException
*/
@Test
void testStructureTreeMerge5() throws IOException
{
PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
PDDocument src = Loader.loadPDF(new File(SRCDIR, "PDFBOX-4417-054080.pdf"));
ElementCounter elementCounter = new ElementCounter();
elementCounter.walk(src.getDocumentCatalog().getStructureTreeRoot().getK());
int singleCnt = elementCounter.cnt;
int singleSetSize = elementCounter.set.size();
PDDocument dst = Loader.loadPDF(new File(SRCDIR, "PDFBOX-4417-054080.pdf"));
pdfMergerUtility.appendDocument(dst, src);
src.close();
dst.save(new File(TARGETTESTDIR, "PDFBOX-4417-054080-merged.pdf"));
dst.close();
dst = Loader.loadPDF(new File(TARGETTESTDIR, "PDFBOX-4417-054080-merged.pdf"));
checkWithNumberTree(dst);
checkForPageOrphans(dst);
// Assume that the merged tree has double element count
elementCounter = new ElementCounter();
elementCounter.walk(dst.getDocumentCatalog().getStructureTreeRoot().getK());
assertEquals(singleCnt * 2, elementCounter.cnt);
assertEquals(singleSetSize * 2, elementCounter.set.size());
dst.close();
checkStructTreeRootCount(new File(TARGETTESTDIR, "PDFBOX-4417-054080-merged.pdf"));
}
/**
* PDFBOX-4418: test merging PDFs where ParentTree have a hierarchy.
*
* @throws IOException
*/
@Test
void testStructureTreeMerge6() throws IOException
{
PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
PDDocument src = Loader.loadPDF(new File(TARGETPDFDIR, "PDFBOX-4418-000671.pdf"));
PDStructureTreeRoot structureTreeRoot = src.getDocumentCatalog().getStructureTreeRoot();
PDNumberTreeNode parentTree = structureTreeRoot.getParentTree();
Map<Integer, COSObjectable> numberTreeAsMap = PDFMergerUtility.getNumberTreeAsMap(parentTree);
assertEquals(381, numberTreeAsMap.size());
assertEquals(743, Collections.max(numberTreeAsMap.keySet()) + 1);
assertEquals(0, (int) Collections.min(numberTreeAsMap.keySet()));
assertEquals(743, structureTreeRoot.getParentTreeNextKey());
PDDocument dst = Loader.loadPDF(new File(TARGETPDFDIR, "PDFBOX-4418-000314.pdf"));
structureTreeRoot = dst.getDocumentCatalog().getStructureTreeRoot();
parentTree = structureTreeRoot.getParentTree();
numberTreeAsMap = PDFMergerUtility.getNumberTreeAsMap(parentTree);
assertEquals(7, numberTreeAsMap.size());
assertEquals(328, Collections.max(numberTreeAsMap.keySet()) + 1);
assertEquals(321, (int) Collections.min(numberTreeAsMap.keySet()));
// ParentTreeNextKey should be 321 but PDF has a higher value
assertEquals(408, structureTreeRoot.getParentTreeNextKey());
pdfMergerUtility.appendDocument(dst, src);
src.close();
dst.save(new File(TARGETTESTDIR, "PDFBOX-4418-merged.pdf"));
dst.close();
dst = Loader.loadPDF(new File(TARGETTESTDIR, "PDFBOX-4418-merged.pdf"));
checkWithNumberTree(dst);
checkForPageOrphans(dst);
structureTreeRoot = dst.getDocumentCatalog().getStructureTreeRoot();
parentTree = structureTreeRoot.getParentTree();
numberTreeAsMap = PDFMergerUtility.getNumberTreeAsMap(parentTree);
assertEquals(381+7, numberTreeAsMap.size());
assertEquals(408+743, Collections.max(numberTreeAsMap.keySet()) + 1);
assertEquals(321, (int) Collections.min(numberTreeAsMap.keySet()));
assertEquals(408+743, structureTreeRoot.getParentTreeNextKey());
dst.close();
checkStructTreeRootCount(new File(TARGETTESTDIR, "PDFBOX-4418-merged.pdf"));
}
/**
* PDFBOX-4423: test merging a PDF where a widget has no StructParent.
*
* @throws IOException
*/
@Test
void testStructureTreeMerge7() throws IOException
{
PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
PDDocument src = Loader.loadPDF(new File(TARGETPDFDIR, "PDFBOX-4423-000746.pdf"));
PDStructureTreeRoot structureTreeRoot = src.getDocumentCatalog().getStructureTreeRoot();
PDNumberTreeNode parentTree = structureTreeRoot.getParentTree();
Map<Integer, COSObjectable> numberTreeAsMap = PDFMergerUtility.getNumberTreeAsMap(parentTree);
assertEquals(33, numberTreeAsMap.size());
assertEquals(64, Collections.max(numberTreeAsMap.keySet()) + 1);
assertEquals(31, (int) Collections.min(numberTreeAsMap.keySet()));
assertEquals(126, structureTreeRoot.getParentTreeNextKey());
PDDocument dst = new PDDocument();
pdfMergerUtility.appendDocument(dst, src);
src.close();
dst.save(new File(TARGETTESTDIR, "PDFBOX-4423-merged.pdf"));
dst.close();
dst = Loader.loadPDF(new File(TARGETTESTDIR, "PDFBOX-4423-merged.pdf"));
checkWithNumberTree(dst);
checkForPageOrphans(dst);
structureTreeRoot = dst.getDocumentCatalog().getStructureTreeRoot();
parentTree = structureTreeRoot.getParentTree();
numberTreeAsMap = PDFMergerUtility.getNumberTreeAsMap(parentTree);
assertEquals(33, numberTreeAsMap.size());
assertEquals(64, Collections.max(numberTreeAsMap.keySet()) + 1);
assertEquals(31, (int) Collections.min(numberTreeAsMap.keySet()));
assertEquals(64, structureTreeRoot.getParentTreeNextKey());
dst.close();
checkStructTreeRootCount(new File(TARGETTESTDIR, "PDFBOX-4423-merged.pdf"));
}
/**
* PDFBOX-4009: Test that ParentTreeNextKey is recalculated correctly.
*/
@Test
void testMissingParentTreeNextKey() throws IOException
{
PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
PDDocument src = Loader.loadPDF(new File(TARGETPDFDIR, "PDFBOX-4418-000314.pdf"));
PDDocument dst = Loader.loadPDF(new File(TARGETPDFDIR, "PDFBOX-4418-000314.pdf"));
// existing numbers are 321..327; ParentTreeNextKey is 408.
// After deletion, it is recalculated in the merge 328.
// That value is added to all numbers of the destination,
// so the new numbers should be 321+328..327+328, i.e. 649..655,
// and this ParentTreeNextKey is 656 at the end.
dst.getDocumentCatalog().getStructureTreeRoot().getCOSObject().removeItem(COSName.PARENT_TREE_NEXT_KEY);
pdfMergerUtility.appendDocument(dst, src);
src.close();
dst.save(new File(TARGETTESTDIR, "PDFBOX-4418-000314-merged.pdf"));
dst.close();
dst = Loader.loadPDF(new File(TARGETTESTDIR, "PDFBOX-4418-000314-merged.pdf"));
assertEquals(656, dst.getDocumentCatalog().getStructureTreeRoot().getParentTreeNextKey());
dst.close();
}
/**
* PDFBOX-4416: Test merging of /IDTree
* <br>
* PDFBOX-4009: test merging to empty destination
*
* @throws IOException
*/
@Test
void testStructureTreeMergeIDTree() throws IOException
{
PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
PDDocument src = Loader.loadPDF(new File(SRCDIR, "PDFBOX-4417-001031.pdf"));
PDDocument dst = Loader.loadPDF(new File(SRCDIR, "PDFBOX-4417-054080.pdf"));
PDNameTreeNode<PDStructureElement> srcIDTree = src.getDocumentCatalog().getStructureTreeRoot().getIDTree();
Map<String, PDStructureElement> srcIDTreeMap = PDFMergerUtility.getIDTreeAsMap(srcIDTree);
PDNameTreeNode<PDStructureElement> dstIDTree = dst.getDocumentCatalog().getStructureTreeRoot().getIDTree();
Map<String, PDStructureElement> dstIDTreeMap = PDFMergerUtility.getIDTreeAsMap(dstIDTree);
int expectedTotal = srcIDTreeMap.size() + dstIDTreeMap.size();
assertEquals(192, expectedTotal);
// PDFBOX-4009, test that empty dest doc still merges structure tree
// (empty dest doc is used in command line app)
PDDocument emptyDest = new PDDocument();
pdfMergerUtility.appendDocument(emptyDest, src);
src.close();
src = emptyDest;
assertEquals(4, src.getDocumentCatalog().getStructureTreeRoot().getParentTreeNextKey());
pdfMergerUtility.appendDocument(dst, src);
src.close();
dst.save(new File(TARGETTESTDIR, "PDFBOX-4416-IDTree-merged.pdf"));
dst.close();
dst = Loader.loadPDF(new File(TARGETTESTDIR, "PDFBOX-4416-IDTree-merged.pdf"));
checkWithNumberTree(dst);
checkForPageOrphans(dst);
dstIDTree = dst.getDocumentCatalog().getStructureTreeRoot().getIDTree();
dstIDTreeMap = PDFMergerUtility.getIDTreeAsMap(dstIDTree);
assertEquals(expectedTotal, dstIDTreeMap.size());
dst.close();
checkStructTreeRootCount(new File(TARGETTESTDIR, "PDFBOX-4416-IDTree-merged.pdf"));
}
/**
* PDFBOX-4429: merge into destination that has /StructParent(s) entries in the destination file
* but no structure tree.
*
* @throws IOException
*/
@Test
void testMergeBogusStructParents1() throws IOException
{
PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
try (PDDocument src = Loader.loadPDF(new File(TARGETPDFDIR, "PDFBOX-4408.pdf"));
PDDocument dst = Loader.loadPDF(new File(TARGETPDFDIR, "PDFBOX-4408.pdf")))
{
dst.getDocumentCatalog().setStructureTreeRoot(null);
dst.getPage(0).setStructParents(9999);
dst.getPage(0).getAnnotations().get(0).setStructParent(9998);
pdfMergerUtility.appendDocument(dst, src);
checkWithNumberTree(dst);
checkForPageOrphans(dst);
}
}
/**
* PDFBOX-4429: merge into destination that has /StructParent(s) entries in the source file but
* no structure tree.
*
* @throws IOException
*/
@Test
void testMergeBogusStructParents2() throws IOException
{
PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
try (PDDocument src = Loader.loadPDF(new File(TARGETPDFDIR, "PDFBOX-4408.pdf"));
PDDocument dst = Loader.loadPDF(new File(TARGETPDFDIR, "PDFBOX-4408.pdf")))
{
src.getDocumentCatalog().setStructureTreeRoot(null);
src.getPage(0).setStructParents(9999);
src.getPage(0).getAnnotations().get(0).setStructParent(9998);
pdfMergerUtility.appendDocument(dst, src);
checkWithNumberTree(dst);
checkForPageOrphans(dst);
}
}
/**
* Test of the parent tree. Didn't work before PDFBOX-4003 because of incompatible class for
* PDNumberTreeNode.
*
* @throws IOException
*/
@Test
void testParentTree() throws IOException
{
try (PDDocument doc = Loader
.loadPDF(new File(TARGETPDFDIR, "PDFBOX-3999-GeneralForbearance.pdf")))
{
PDStructureTreeRoot structureTreeRoot = doc.getDocumentCatalog().getStructureTreeRoot();
PDNumberTreeNode parentTree = structureTreeRoot.getParentTree();
parentTree.getValue(0);
Map<Integer, COSObjectable> numberTreeAsMap = PDFMergerUtility.getNumberTreeAsMap(parentTree);
assertEquals(31, numberTreeAsMap.size());
assertEquals(31, Collections.max(numberTreeAsMap.keySet()) + 1);
assertEquals(0, (int) Collections.min(numberTreeAsMap.keySet()));
assertEquals(31, structureTreeRoot.getParentTreeNextKey());
}
}
// PDFBOX-4417: check for multiple /StructTreeRoot entries that was due to
// incorrect merging of /K entries
private void checkStructTreeRootCount(File file) throws IOException
{
try (PDDocument pdf = Loader.loadPDF(file))
{
List<COSObject> structTreeRootObjects = pdf.getDocument().getObjectsByType(COSName.STRUCT_TREE_ROOT);
assertEquals(1, structTreeRootObjects.size(), file.getPath() + " " + structTreeRootObjects);
}
}
/**
* PDFBOX-4408: Check that /StructParents values from pages and /StructParent values from
* annotations are found in the /ParentTree.
* <p>
* Expanded in 2025 to check that all MCIDs of a page content stream have an entry in the
* ParentTree.
*
* @param document
*/
void checkWithNumberTree(PDDocument document) throws IOException
{
PDDocumentCatalog documentCatalog = document.getDocumentCatalog();
PDNumberTreeNode parentTree = documentCatalog.getStructureTreeRoot().getParentTree();
Map<Integer, COSObjectable> numberTreeAsMap = PDFMergerUtility.getNumberTreeAsMap(parentTree);
Set<Integer> keySet = numberTreeAsMap.keySet();
PDAcroForm acroForm = documentCatalog.getAcroForm();
if (acroForm != null)
{
for (PDField field : acroForm.getFieldTree())
{
for (PDAnnotationWidget widget : field.getWidgets())
{
if (widget.getStructParent() >= 0)
{
assertTrue(keySet.contains(widget.getStructParent()),
"field '" + field.getFullyQualifiedName() + "' /StructParent "
+ widget.getStructParent() + " missing in /ParentTree");
}
}
}
}
PDPageTree pageTree = document.getPages();
for (PDPage page : pageTree)
{
int pageNum = pageTree.indexOf(page) + 1;
if (page.getStructParents() >= 0)
{
assertTrue(keySet.contains(page.getStructParents()), "/StructParents " + page.getStructParents() + " from page " +
pageNum + " not found in /ParentTree");
PDParentTreeValue obj = (PDParentTreeValue) numberTreeAsMap.get(page.getStructParents());
assertTrue(obj.getCOSObject() instanceof COSArray, "Expected array in page " + pageNum + ", got " + obj.getClass());
COSArray array = (COSArray) obj.getCOSObject();
PDFMarkedContentExtractor markedContentExtractor = new PDFMarkedContentExtractor();
markedContentExtractor.processPage(page);
List<PDMarkedContent> markedContents = markedContentExtractor.getMarkedContents();
TreeSet<Integer> set = new TreeSet<>();
for (PDMarkedContent pdMarkedContent : markedContents)
{
COSDictionary pdmcProperties = pdMarkedContent.getProperties();
if (pdmcProperties == null)
{
continue;
}
int mcid = pdMarkedContent.getMCID();
if (mcid >= 0)
{
// "For a page object (...), the value shall be an array of references
// to the parent elements of those marked-content sequences."
// this means that the /Pg entry doesn't have to match the page
COSDictionary dict = (COSDictionary) array.getObject(mcid);
assertNotNull(dict);
set.add(mcid);
PDStructureElement structureElemen = (PDStructureElement) PDStructureNode.create(dict);
List<Object> kids = structureElemen.getKids();
boolean found = false;
for (Object kid : kids)
{
if (kid instanceof Integer && ((Integer) kid) == mcid)
{
found = true;
break;
}
if (kid instanceof PDMarkedContentReference)
{
PDMarkedContentReference mcr = (PDMarkedContentReference) kid;
if (mcid == mcr.getMCID())
{
found = true;
if (mcr.getPage() != null)
{
assertEquals(page, mcr.getPage());
}
else
{
assertEquals(page, structureElemen.getPage());
}
break;
}
}
}
assertTrue(found, "page: " + pageNum + ", mcid: " + mcid + " not found");
}
}
// actual count may be larger if last element is null, e.g. PDFBOX-4408
assertTrue(set.last() <= array.size() - 1);
}
for (PDAnnotation ann : page.getAnnotations())
{
if (ann.getStructParent() >= 0)
{
assertTrue(keySet.contains(ann.getStructParent()),
"/StructParent " + ann.getStructParent() + " missing in /ParentTree");
}
}
}
// might also test image and form dictionaries...
}
/**
* PDFBOX-4383: Test that file can be deleted after merge.
*
* @throws IOException
*/
@Test
void testFileDeletion() throws IOException
{
File outFile = new File(TARGETTESTDIR, "PDFBOX-4383-result.pdf");
File inFile1 = new File(TARGETTESTDIR, "PDFBOX-4383-src1.pdf");
File inFile2 = new File(TARGETTESTDIR, "PDFBOX-4383-src2.pdf");
createSimpleFile(inFile1);
createSimpleFile(inFile2);
try (OutputStream out = new FileOutputStream(outFile))
{
PDFMergerUtility merger = new PDFMergerUtility();
merger.setDestinationStream(out);
merger.addSource(inFile1);
merger.addSource(inFile2);
merger.mergeDocuments(IOUtils.createMemoryOnlyStreamCache());
}
Files.delete(inFile1.toPath());
Files.delete(inFile2.toPath());
Files.delete(outFile.toPath());
}
/**
* Check that there is a top level Document and Parts below in a merge of 2 documents.
*
* @throws IOException
*/
@Test
void testPDFBox5198_2() throws IOException
{
PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
pdfMergerUtility.addSource(new File(SRCDIR, "PDFA3A.pdf"));
pdfMergerUtility.addSource(new File(SRCDIR, "PDFA3A.pdf"));
pdfMergerUtility.setDestinationFileName(TARGETTESTDIR + "PDFA3A-merged2.pdf");
pdfMergerUtility.mergeDocuments(IOUtils.createMemoryOnlyStreamCache());
checkParts(new File(TARGETTESTDIR + "PDFA3A-merged2.pdf"));
}
/**
* Check that there is a top level Document and Parts below in a merge of 3 documents.
*
* @throws IOException
*/
@Test
void testPDFBox5198_3() throws IOException
{
PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
pdfMergerUtility.addSource(new File(SRCDIR, "PDFA3A.pdf"));
pdfMergerUtility.addSource(new File(SRCDIR, "PDFA3A.pdf"));
pdfMergerUtility.addSource(new File(SRCDIR, "PDFA3A.pdf"));
pdfMergerUtility.setDestinationFileName(TARGETTESTDIR + "PDFA3A-merged3.pdf");
pdfMergerUtility.mergeDocuments(IOUtils.createMemoryOnlyStreamCache());
checkParts(new File(TARGETTESTDIR + "PDFA3A-merged3.pdf"));
}
/**
* Check that there is a top level Document and Parts below.
* @param file
* @throws IOException
*/
private void checkParts(File file) throws IOException
{
try (PDDocument doc = Loader.loadPDF(file))
{
PDStructureTreeRoot structureTreeRoot = doc.getDocumentCatalog().getStructureTreeRoot();
COSDictionary topDict = (COSDictionary) structureTreeRoot.getK();
assertEquals(COSName.DOCUMENT, topDict.getItem(COSName.S));
assertEquals(structureTreeRoot.getCOSObject(), topDict.getCOSDictionary(COSName.P));
COSArray kArray = topDict.getCOSArray(COSName.K);
assertEquals(doc.getNumberOfPages(), kArray.size());
for (int i = 0; i < kArray.size(); ++i)
{
COSDictionary dict = (COSDictionary) kArray.getObject(i);
assertEquals(COSName.PART, dict.getItem(COSName.S));
assertEquals(topDict, dict.getCOSDictionary(COSName.P));
}
}
}
private void checkForPageOrphans(PDDocument doc) throws IOException
{
// check for orphan pages in the StructTreeRoot/K, StructTreeRoot/ParentTree and
// StructTreeRoot/IDTree trees.
PDPageTree pageTree = doc.getPages();
PDStructureTreeRoot structureTreeRoot = doc.getDocumentCatalog().getStructureTreeRoot();
checkElement(pageTree, structureTreeRoot.getParentTree().getCOSObject());
checkElement(pageTree, structureTreeRoot.getK());
checkForIDTreeOrphans(pageTree, structureTreeRoot);
}
private void checkForIDTreeOrphans(PDPageTree pageTree, PDStructureTreeRoot structureTreeRoot)
throws IOException
{
PDNameTreeNode<PDStructureElement> idTree = structureTreeRoot.getIDTree();
if (idTree == null)
{
return;
}
Map<String, PDStructureElement> map = PDFMergerUtility.getIDTreeAsMap(idTree);
for (PDStructureElement element : map.values())
{
if (element.getPage() != null)
{
checkForPage(pageTree, element);
}
if (!element.getKids().isEmpty())
{
checkElement(pageTree, element.getCOSObject().getDictionaryObject(COSName.K));
}
}
}
private void createSimpleFile(File file) throws IOException
{
try (PDDocument doc = new PDDocument())
{
doc.addPage(new PDPage());
doc.save(file);
}
}
private class ElementCounter
{
int cnt = 0;
final Set<COSBase> set = new HashSet<>();
void walk(COSBase base)
{
if (base instanceof COSArray)
{
for (COSBase base2 : (COSArray) base)
{
if (base2 instanceof COSObject)
{
base2 = ((COSObject) base2).getObject();
}
walk(base2);
}
}
else if (base instanceof COSDictionary)
{
COSDictionary kdict = (COSDictionary) base;
if (kdict.containsKey(COSName.PG))
{
++cnt;
set.add(kdict);
}
if (kdict.containsKey(COSName.K))
{
walk(kdict.getDictionaryObject(COSName.K));
}
}
}
}
// Each element can be an array, a dictionary or a number.
// See PDF specification Table 37 - Entries in a number tree node dictionary
// See PDF specification Table 322 - Entries in the structure tree root
// See PDF specification Table 323 - Entries in a structure element dictionary
// See PDF specification Table 325 ��� Entries in an object reference dictionary
// example of file with /Kids: 000153.pdf 000208.pdf 000314.pdf 000359.pdf 000671.pdf
// from digitalcorpora site
private void checkElement(PDPageTree pageTree, COSBase base) throws IOException
{
if (base instanceof COSArray)
{
for (COSBase base2 : (COSArray) base)
{
if (base2 instanceof COSObject)
{
base2 = ((COSObject) base2).getObject();
}
checkElement(pageTree, base2);
}
}
else if (base instanceof COSDictionary)
{
COSDictionary kdict = (COSDictionary) base;
if (kdict.containsKey(COSName.PG))
{
PDStructureElement structureElement = new PDStructureElement(kdict);
checkForPage(pageTree, structureElement);
}
if (kdict.containsKey(COSName.K))
{
checkElement(pageTree, kdict.getDictionaryObject(COSName.K));
// Check that the /P entry points to the correct object
PDStructureNode node = PDStructureNode.create(kdict);
for (Object obj : node.getKids())
{
if (obj instanceof PDStructureElement)
{
PDStructureNode parent = ((PDStructureElement) obj).getParent();
assertSame(parent.getCOSObject(), kdict);
}
}
return;
}
// if we're in a number tree, check /Nums and /Kids
if (kdict.containsKey(COSName.KIDS))
{
checkElement(pageTree, kdict.getDictionaryObject(COSName.KIDS));
}
else if (kdict.containsKey(COSName.NUMS))
{
checkElement(pageTree, kdict.getDictionaryObject(COSName.NUMS));
}
// if we're an object reference dictionary (/OBJR), check the obj
if (kdict.containsKey(COSName.OBJ))
{
COSDictionary obj = (COSDictionary) kdict.getDictionaryObject(COSName.OBJ);
COSBase type = obj.getDictionaryObject(COSName.TYPE);
COSBase subtype = obj.getDictionaryObject(COSName.SUBTYPE);
if (COSName.ANNOT.equals(type) || COSName.LINK.equals(subtype))
{
PDAnnotation annotation = PDAnnotation.createAnnotation(obj);
PDPage page = annotation.getPage();
if (annotation instanceof PDAnnotationLink)
{
// PDFBOX-5928: check whether the destination of a link annotation is an orphan
PDAnnotationLink link = (PDAnnotationLink) annotation;
PDDestination destination = link.getDestination();
if (destination == null)
{
PDAction action = link.getAction();
if (action instanceof PDActionGoTo)
{
PDActionGoTo goToAction = (PDActionGoTo) action;
destination = goToAction.getDestination();
}
}
if (destination instanceof PDPageDestination)
{
PDPageDestination pageDestination = (PDPageDestination) destination;
PDPage destPage = pageDestination.getPage();
if (destPage != null)
{
assertNotEquals(-1, pageTree.indexOf(destPage),
"Annotation destination page is not in the page tree: " + destPage);
}
}
}
if (page != null)
{
if (pageTree.indexOf(page) == -1)
{
COSBase item = kdict.getItem(COSName.OBJ);
if (item instanceof COSObject)
{
assertNotEquals(-1, pageTree.indexOf(page),
"Annotation page is not in the page tree: " + item);
}
else
{
// don't display because of stack overflow
assertNotEquals(-1, pageTree.indexOf(page),
"Annotation page is not in the page tree");
}
}
}
}
else
{
//TODO needs to be investigated. Specification mentions
// "such as an XObject or an annotation"
fail("Other type: " + type + ", obj: " + obj);
}
}
}
}
// checks that the result file of a merge has the same rendering as the two source files
private void checkMergeIdentical(String filename1, String filename2, String mergeFilename,
StreamCacheCreateFunction streamCache)
throws IOException
{
int src1PageCount;
BufferedImage[] src1ImageTab;
try (PDDocument srcDoc1 = Loader.loadPDF(new File(SRCDIR, filename1), (String) null))
{
src1PageCount = srcDoc1.getNumberOfPages();
PDFRenderer src1PdfRenderer = new PDFRenderer(srcDoc1);
src1ImageTab = new BufferedImage[src1PageCount];
for (int page = 0; page < src1PageCount; ++page)
{
src1ImageTab[page] = src1PdfRenderer.renderImageWithDPI(page, DPI);
}
}
int src2PageCount;
BufferedImage[] src2ImageTab;
try (PDDocument srcDoc2 = Loader.loadPDF(new File(SRCDIR, filename2), (String) null))
{
src2PageCount = srcDoc2.getNumberOfPages();
PDFRenderer src2PdfRenderer = new PDFRenderer(srcDoc2);
src2ImageTab = new BufferedImage[src2PageCount];
for (int page = 0; page < src2PageCount; ++page)
{
src2ImageTab[page] = src2PdfRenderer.renderImageWithDPI(page, DPI);
}
}
PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
pdfMergerUtility.addSource(new File(SRCDIR, filename1));
pdfMergerUtility.addSource(new File(SRCDIR, filename2));
pdfMergerUtility.setDestinationFileName(TARGETTESTDIR + mergeFilename);
pdfMergerUtility.mergeDocuments(streamCache);
try (PDDocument mergedDoc = Loader.loadPDF(new File(TARGETTESTDIR, mergeFilename),
(String) null))
{
PDFRenderer mergePdfRenderer = new PDFRenderer(mergedDoc);
int mergePageCount = mergedDoc.getNumberOfPages();
assertEquals(src1PageCount + src2PageCount, mergePageCount);
for (int page = 0; page < src1PageCount; ++page)
{
BufferedImage bim = mergePdfRenderer.renderImageWithDPI(page, DPI);
checkImagesIdentical(bim, src1ImageTab[page]);
}
for (int page = 0; page < src2PageCount; ++page)
{
int mergePage = page + src1PageCount;
BufferedImage bim = mergePdfRenderer.renderImageWithDPI(mergePage, DPI);
checkImagesIdentical(bim, src2ImageTab[page]);
}
}
}
private void checkImagesIdentical(BufferedImage bim1, BufferedImage bim2)
{
assertEquals(bim1.getHeight(), bim2.getHeight());
assertEquals(bim1.getWidth(), bim2.getWidth());
int w = bim1.getWidth();
int h = bim1.getHeight();
for (int i = 0; i < w; ++i)
{
for (int j = 0; j < h; ++j)
{
assertEquals(bim1.getRGB(i, j), bim2.getRGB(i, j));
}
}
}
private void checkForPage(PDPageTree pageTree, PDStructureElement structureElement)
{
PDPage page = structureElement.getPage();
if (page != null)
{
assertNotEquals(-1, pageTree.indexOf(page), "Page is not in the page tree");
}
}
@Test
void testSplitWithStructureTree() throws IOException
{
try (PDDocument doc = Loader.loadPDF(new File(SRCDIR, "PDFBOX-4417-001031.pdf")))
{
Splitter splitter = new Splitter();
splitter.setStartPage(1);
splitter.setEndPage(2);
splitter.setSplitAtPage(2);
List<PDDocument> splitResult = splitter.split(doc);
assertEquals(1, splitResult.size());
try (PDDocument dstDoc = splitResult.get(0))
{
assertEquals(2, dstDoc.getNumberOfPages());
checkForPageOrphans(dstDoc);
// these tests just verify the status quo. Changes should be checked visually with
// a PDF viewer that can display structural information.
PDStructureTreeRoot structureTreeRoot = dstDoc.getDocumentCatalog().getStructureTreeRoot();
assertEquals(126, PDFMergerUtility.getIDTreeAsMap(structureTreeRoot.getIDTree()).size());
assertEquals(2, PDFMergerUtility.getNumberTreeAsMap(structureTreeRoot.getParentTree()).size());
assertEquals(6, structureTreeRoot.getRoleMap().size());
}
}
}
@Test
void testSplitWithStructureTreeAndDestinations() throws IOException
{
try (PDDocument doc = Loader.loadPDF(new File(SRCDIR,"PDFBOX-5762-722238.pdf")))
{
Splitter splitter = new Splitter();
splitter.setStartPage(1);
splitter.setEndPage(2);
splitter.setSplitAtPage(2);
List<PDDocument> splitResult = splitter.split(doc);
assertEquals(1, splitResult.size());
try (PDDocument dstDoc = splitResult.get(0))
{
assertEquals(2, dstDoc.getNumberOfPages());
checkForPageOrphans(dstDoc);
// these tests just verify the status quo. Changes should be checked visually with
// a PDF viewer that can display structural information.
PDStructureTreeRoot structureTreeRoot = dstDoc.getDocumentCatalog().getStructureTreeRoot();
assertEquals(7, PDFMergerUtility.getNumberTreeAsMap(structureTreeRoot.getParentTree()).size());
assertEquals(4, structureTreeRoot.getRoleMap().size());
// check that destinations are fixed (only the two first point to the split doc)
List<PDAnnotation> annotations = dstDoc.getPage(0).getAnnotations();
assertEquals(5, annotations.size());
PDAnnotationLink link1 = (PDAnnotationLink) annotations.get(0);
PDAnnotationLink link2 = (PDAnnotationLink) annotations.get(1);
PDAnnotationLink link3 = (PDAnnotationLink) annotations.get(2);
PDAnnotationLink link4 = (PDAnnotationLink) annotations.get(3);
PDAnnotationLink link5 = (PDAnnotationLink) annotations.get(4);
PDPageDestination pd1 =
(PDPageDestination) ((PDActionGoTo) link1.getAction()).getDestination();
PDPageDestination pd2 =
(PDPageDestination) ((PDActionGoTo) link2.getAction()).getDestination();
PDPageDestination pd3 =
(PDPageDestination) ((PDActionGoTo) link3.getAction()).getDestination();
PDPageDestination pd4 =
(PDPageDestination) ((PDActionGoTo) link4.getAction()).getDestination();
PDPageDestination pd5 =
(PDPageDestination) ((PDActionGoTo) link5.getAction()).getDestination();
PDPageTree pageTree = dstDoc.getPages();
assertEquals(0, pageTree.indexOf(pd1.getPage()));
assertEquals(1, pageTree.indexOf(pd2.getPage()));
assertNull(pd3.getPage());
assertNull(pd4.getPage());
assertNull(pd5.getPage());
}
}
}
/**
* PDFBOX-5929: Check that orphan annotations are removed from the structure tree if annotations
* were removed from the pages (don't do that!).
*
* @throws IOException
*/
@Test
void testSplitWithStructureTreeAndDestinationsAndRemovedAnnotations() throws IOException
{
try (PDDocument doc = Loader.loadPDF(new File(SRCDIR,"PDFBOX-5762-722238.pdf")))
{
Splitter splitter = new Splitter();
for (PDPage page : doc.getPages())
{
page.setAnnotations(Collections.emptyList());
}
splitter.setStartPage(1);
splitter.setEndPage(2);
splitter.setSplitAtPage(2);
List<PDDocument> splitResult = splitter.split(doc);
assertEquals(1, splitResult.size());
try (PDDocument dstDoc = splitResult.get(0))
{
assertEquals(2, dstDoc.getNumberOfPages());
checkForPageOrphans(dstDoc);
}
}
}
/**
* Check for the bug that happened in PDFBOX-5792, where a destination was outside a target
* document and hit an NPE in the next call of Splitter.fixDestinations().
*
* @throws IOException
*/
@Test
void testSinglePageSplit() throws IOException
{
try (PDDocument doc = Loader.loadPDF(new File(SRCDIR, "PDFBOX-5792-240045.pdf")))
{
Splitter splitter = new Splitter();
splitter.setSplitAtPage(1);
List<PDDocument> splitResult = splitter.split(doc);
assertEquals(6, splitResult.size());
for (PDDocument dstDoc : splitResult)
{
assertEquals(1, dstDoc.getNumberOfPages());
checkForPageOrphans(dstDoc);
for (PDAnnotation ann : dstDoc.getPage(0).getAnnotations())
{
PDAnnotationLink link = (PDAnnotationLink) ann;
PDActionGoTo action = (PDActionGoTo) link.getAction();
PDPageDestination destination = (PDPageDestination) action.getDestination();
assertNull(destination.getPage());
}
}
PDStructureTreeRoot structureTreeRoot1 = splitResult.get(0).getDocumentCatalog().getStructureTreeRoot();
assertEquals(6, PDFMergerUtility.getNumberTreeAsMap(structureTreeRoot1.getParentTree()).size());
assertEquals(3, structureTreeRoot1.getRoleMap().size());
PDStructureTreeRoot structureTreeRoot2 = splitResult.get(1).getDocumentCatalog().getStructureTreeRoot();
assertEquals(6, PDFMergerUtility.getNumberTreeAsMap(structureTreeRoot2.getParentTree()).size());
assertEquals(3, structureTreeRoot2.getRoleMap().size());
PDStructureTreeRoot structureTreeRoot3 = splitResult.get(2).getDocumentCatalog().getStructureTreeRoot();
assertEquals(6, PDFMergerUtility.getNumberTreeAsMap(structureTreeRoot3.getParentTree()).size());
assertEquals(4, structureTreeRoot3.getRoleMap().size());
PDStructureTreeRoot structureTreeRoot4 = splitResult.get(3).getDocumentCatalog().getStructureTreeRoot();
assertEquals(5, PDFMergerUtility.getNumberTreeAsMap(structureTreeRoot4.getParentTree()).size());
assertEquals(4, structureTreeRoot4.getRoleMap().size());
PDStructureTreeRoot structureTreeRoot5 = splitResult.get(4).getDocumentCatalog().getStructureTreeRoot();
assertEquals(1, PDFMergerUtility.getNumberTreeAsMap(structureTreeRoot5.getParentTree()).size());
assertEquals(6, structureTreeRoot5.getRoleMap().size());
PDStructureTreeRoot structureTreeRoot6 = splitResult.get(5).getDocumentCatalog().getStructureTreeRoot();
assertEquals(1, PDFMergerUtility.getNumberTreeAsMap(structureTreeRoot6.getParentTree()).size());
assertEquals(7, structureTreeRoot6.getRoleMap().size());
for (PDDocument dstDoc : splitResult)
{
dstDoc.close();
}
}
}
@Test
void testSplitWithPopupAnnotations() throws IOException
{
try (PDDocument doc = Loader.loadPDF(new File(SRCDIR, "PDFBOX-5809-509329.pdf")))
{
Splitter splitter = new Splitter();
splitter.setStartPage(3);
splitter.setEndPage(3);
splitter.setSplitAtPage(1);
List<PDDocument> splitResult = splitter.split(doc);
assertEquals(1, splitResult.size());
List<PDAnnotation> annotations;
PDAnnotationText annotationText3;
PDAnnotationPopup annotationPopup4;
try (PDDocument dstDoc = splitResult.get(0))
{
checkForPageOrphans(dstDoc);
assertEquals(1, dstDoc.getNumberOfPages());
annotations = dstDoc.getPage(0).getAnnotations();
assertEquals(5, annotations.size());
annotationText3 = (PDAnnotationText) annotations.get(3);
annotationPopup4 = (PDAnnotationPopup) annotations.get(4);
assertEquals(annotationText3.getPopup(), annotationPopup4);
assertEquals(annotationPopup4.getParent(), annotationText3);
assertEquals(annotationText3.getPage(), dstDoc.getPage(0));
}
// Check that source document is ok
annotations = doc.getPage(2).getAnnotations();
assertEquals(5, annotations.size());
annotationText3 = (PDAnnotationText) annotations.get(3);
annotationPopup4 = (PDAnnotationPopup) annotations.get(4);
assertEquals(annotationText3.getPopup(), annotationPopup4);
assertEquals(annotationPopup4.getParent(), annotationText3);
assertEquals(annotationText3.getPage(), doc.getPage(2));
}
}
@Test
void testSplitWithBrokenDestination() throws IOException
{
try (PDDocument doc = Loader.loadPDF(new File(SRCDIR, "PDFBOX-5811-362972.pdf")))
{
Splitter splitter = new Splitter();
splitter.setStartPage(2);
splitter.setEndPage(2);
List<PDDocument> splitResult = splitter.split(doc);
assertEquals(1, splitResult.size());
List<PDAnnotation> annotations;
try (PDDocument dstDoc = splitResult.get(0))
{
checkForPageOrphans(dstDoc);
assertEquals(1, dstDoc.getNumberOfPages());
annotations = dstDoc.getPage(0).getAnnotations();
assertEquals(1, annotations.size());
PDAnnotationLink link = (PDAnnotationLink) annotations.get(0);
assertNull(link.getDestination());
}
// Check source document
annotations = doc.getPage(1).getAnnotations();
assertEquals(1, annotations.size());
PDAnnotationLink link = (PDAnnotationLink) annotations.get(0);
assertThrows(IOException.class, () -> link.getDestination());
}
}
@Test
void testSplitWithNamedDestinations() throws IOException
{
try (PDDocument doc = Loader.loadPDF(new File(SRCDIR, "PDFBOX-5840-410609.pdf")))
{
Splitter splitter = new Splitter();
splitter.setSplitAtPage(6);
List<PDDocument> splitResult = splitter.split(doc);
assertEquals(1, splitResult.size());
List<PDAnnotation> annotations;
try (PDDocument dstDoc = splitResult.get(0))
{
checkForPageOrphans(dstDoc);
assertEquals(6, dstDoc.getNumberOfPages());
annotations = dstDoc.getPage(0).getAnnotations();
assertEquals(5, annotations.size());
PDAnnotationLink link1 = (PDAnnotationLink) annotations.get(0);
PDAnnotationLink link2 = (PDAnnotationLink) annotations.get(1);
PDAnnotationLink link3 = (PDAnnotationLink) annotations.get(2);
PDAnnotationLink link4 = (PDAnnotationLink) annotations.get(3);
PDAnnotationLink link5 = (PDAnnotationLink) annotations.get(4);
PDPageDestination pd1 =
(PDPageDestination) ((PDActionGoTo) link1.getAction()).getDestination();
PDPageDestination pd2 =
(PDPageDestination) ((PDActionGoTo) link2.getAction()).getDestination();
PDPageDestination pd3 =
(PDPageDestination) ((PDActionGoTo) link3.getAction()).getDestination();
PDPageDestination pd4 =
(PDPageDestination) ((PDActionGoTo) link4.getAction()).getDestination();
PDPageDestination pd5 =
(PDPageDestination) ((PDActionGoTo) link5.getAction()).getDestination();
PDPageTree pageTree = dstDoc.getPages();
assertEquals(0, pageTree.indexOf(pd1.getPage()));
assertEquals(1, pageTree.indexOf(pd2.getPage()));
assertEquals(3, pageTree.indexOf(pd3.getPage()));
assertEquals(3, pageTree.indexOf(pd4.getPage()));
assertEquals(5, pageTree.indexOf(pd5.getPage()));
assertNotNull(dstDoc.getDocumentCatalog().getMetadata());
ByteArrayOutputStream baos = new ByteArrayOutputStream();
dstDoc.save(baos);
PDDocument reloadedDoc = Loader.loadPDF(baos.toByteArray());
assertNotNull(reloadedDoc.getDocumentCatalog().getMetadata());
reloadedDoc.close();
}
// Check that source document is unchanged
annotations = doc.getPage(0).getAnnotations();
assertEquals(5, annotations.size());
PDAnnotationLink link = (PDAnnotationLink) annotations.get(0);
assertTrue(((PDActionGoTo) link.getAction()).getDestination() instanceof PDNamedDestination);
}
}
/**
* PDFBOX-5939: merge a file with an outline that has itself as a parent without producing a
* stack overflow.
*
* @throws IOException
*/
@Test
void testOutlinesSelfParent() throws IOException
{
PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
pdfMergerUtility.addSource(new File(TARGETPDFDIR, "PDFBOX-5939-google-docs-1.pdf"));
pdfMergerUtility.addSource(new File(TARGETPDFDIR, "PDFBOX-5939-google-docs-1.pdf"));
pdfMergerUtility.setDestinationFileName(TARGETTESTDIR + "PDFBOX-5939-google-docs-result.pdf");
pdfMergerUtility.mergeDocuments(IOUtils.createMemoryOnlyStreamCache());
try (PDDocument mergedDoc = Loader
.loadPDF(new File(TARGETTESTDIR, "PDFBOX-5939-google-docs-result.pdf")))
{
assertEquals(2, mergedDoc.getNumberOfPages());
}
}
/**
* PDFBOX-515 / PDFBOX-5950: test merging of two files where one file has a stream deep down in
* the info dictionary (Info/ImPDF/Images/Kids/[0]). This test will pass only if the source file
* isn't closed prematurely, or if deep cloning is applied.
*
* @throws IOException
*/
@Test
void testPDFBox515() throws IOException
{
PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
pdfMergerUtility.addSource(new File(TARGETPDFDIR, "ComSquare1.pdf"));
pdfMergerUtility.addSource(new File(TARGETPDFDIR, "Ghostscript1.pdf"));
pdfMergerUtility.setDestinationFileName(TARGETTESTDIR + "PDFBOX-515-result.pdf");
pdfMergerUtility.mergeDocuments(IOUtils.createMemoryOnlyStreamCache());
try (PDDocument mergedDoc = Loader.loadPDF(new File(TARGETTESTDIR, "PDFBOX-515-result.pdf")))
{
assertEquals(2, mergedDoc.getNumberOfPages());
COSDictionary imageDict = (COSDictionary) mergedDoc.getDocumentInformation().getCOSObject().
getCOSDictionary(COSName.getPDFName("ImPDF")).
getCOSDictionary(COSName.getPDFName("Images")).
getCOSArray(COSName.KIDS).getObject(0);
PDImageXObject imageXObject = (PDImageXObject) PDImageXObject.createXObject(imageDict, new PDResources());
BufferedImage bim = imageXObject.getImage();
assertEquals(909, bim.getWidth());
assertEquals(233, bim.getHeight());
}
}
}