Splitter.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.multipdf;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.LogManager;

import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.io.RandomAccessStreamCache.StreamCacheCreateFunction;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.PDStructureElementNameTreeNode;
import org.apache.pdfbox.pdmodel.common.COSObjectable;
import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
import org.apache.pdfbox.pdmodel.common.PDNumberTreeNode;
import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDParentTreeValue;
import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureElement;
import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionFactory;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionGoTo;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationPopup;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationWidget;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAppearanceStream;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDNamedDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageDestination;

/**
 * Split a document into several other documents.
 *
 * @author Mario Ivankovits
 * @author Ben Litchfield
 * @author Tilman Hausherr
 */
public class Splitter
{
    // Logger of this class.
    private static final Logger LOG = LogManager.getLogger(Splitter.class);

    // The document that is being split; assigned at the start of split().
    private PDDocument sourceDocument;
    // The destination document that currently receives the imported pages.
    private PDDocument currentDestinationDocument;

    // Number of pages per destination document, see setSplitAtPage(). Default: one page each.
    private int splitLength = 1;
    // 1-based first page to process; the default means "no lower limit".
    private int startPage = Integer.MIN_VALUE;
    // 1-based last page to process; the default means "no upper limit".
    private int endPage = Integer.MAX_VALUE;
    // The documents created by the current split() call, in creation order.
    private List<PDDocument> destinationDocuments;
    // Maps the dictionary of a source page to the dictionary of its imported page.
    private Map<COSDictionary, COSDictionary> pageDictMap;
    // Maps a source structure element dictionary to its clone; rebuilt in cloneStructureTree().
    private Map<COSDictionary, COSDictionary> structDictMap;
    // Maps an original annotation dictionary to its shallow clone, see processAnnotations().
    private Map<COSDictionary, COSDictionary> annotDictMap;
    // Cloned page destinations that must be adjusted later, mapped to the imported page
    // passed to processAnnotations() when the destination was recorded.
    private Map<PDPageDestination,PDPage> destToFixMap;
    // /ID strings of the structure elements kept while cloning the /K tree.
    private Set<String> idSet;
    // /S names (structure types) of the structure elements kept while cloning the /K tree.
    private Set<COSName> roleSet;

    // 0-based index of the source page that is currently looked at.
    private int currentPageNumber;

    // Optional factory handed to the PDDocument constructor; null selects the no-arg constructor.
    private StreamCacheCreateFunction streamCacheCreateFunction = null;

    /**
     * Provides the function that is used to create the stream cache of new destination documents.
     *
     * @return the currently configured stream cache create function, or null if none has been set.
     */
    public StreamCacheCreateFunction getStreamCacheCreateFunction()
    {
        return this.streamCacheCreateFunction;
    }

    /**
     * Set the function that is used to create the stream cache of new destination documents.
     * When it is null (the default), createNewDocument() falls back to the no-argument
     * PDDocument constructor.
     *
     * @param streamCacheCreateFunction the function to be used to create an instance of stream
     * cache, or null to use the default.
     */
    public void setStreamCacheCreateFunction(StreamCacheCreateFunction streamCacheCreateFunction)
    {
        this.streamCacheCreateFunction = streamCacheCreateFunction;
    }

    /**
     * This will take a document and split into several other documents.
     * <p>
     * All per-run state is reset at the beginning, so the same instance can be used for several
     * split calls. After all pages are processed, the structure tree is cloned and the link
     * destinations are adjusted for every destination document.
     *
     * @param document The document to split.
     *
     * @return A list of all the split documents. These should all be saved before closing any
     * documents, including the source document. Any further operations should be made after
     * reloading them, to avoid problems due to resource sharing. For the same reason, they should
     * not be saved with encryption.
     *
     * @throws IOException If there is an IOError
     */
    public List<PDDocument> split(PDDocument document) throws IOException
    {
        // reset the currentPageNumber for a case if the split method will be used several times
        currentPageNumber = 0;
        // Also forget the last destination document of a previous run. Otherwise a splitAtPage()
        // override that doesn't split at the first processed page would import the pages of this
        // run into a document that belongs to the previous result list.
        currentDestinationDocument = null;
        destinationDocuments = new ArrayList<>();
        sourceDocument = document;
        pageDictMap = new HashMap<>();
        destToFixMap = new HashMap<>();
        annotDictMap = new HashMap<>();
        idSet = new HashSet<>();
        roleSet = new HashSet<>();

        processPages();

        // these steps need the complete page -> imported page mapping, so they run last
        for (PDDocument destinationDocument : destinationDocuments)
        {
            cloneStructureTree(destinationDocument);
            fixDestinations(destinationDocument);
        }

        return destinationDocuments;
    }
    
    /**
     * Adjusts the remembered page destinations for one destination document. Only entries whose
     * remembered page is part of this document are touched: their target is replaced by the
     * imported counterpart if that page is in the same document, otherwise the target page is
     * removed. This must be called after all pages (and their annotations) are processed.
     *
     * @param destinationDocument the document whose link destinations are to be adjusted.
     */
    private void fixDestinations(PDDocument destinationDocument)
    {
        PDPageTree dstPages = destinationDocument.getPages();
        for (Map.Entry<PDPageDestination,PDPage> fixEntry : destToFixMap.entrySet())
        {
            // is the page that was remembered for this destination inside this document?
            if (dstPages.indexOf(fixEntry.getValue()) >= 0)
            {
                PDPageDestination linkDestination = fixEntry.getKey();
                COSDictionary oldTargetDict = linkDestination.getPage().getCOSObject();
                PDPage newTarget = new PDPage(pageDictMap.get(oldTargetDict));
                // is the target page inside this document as well?
                boolean targetInside = dstPages.indexOf(newTarget) >= 0;
                linkDestination.setPage(targetInside ? newTarget : null);
            }
        }
    }

    /**
     * Clone the structure tree from the source document to the given destination document.
     * Does nothing if the source has no structure tree. Otherwise the /K tree is cloned for the
     * pages of the destination, the /ParentTree entries referenced by these pages, their
     * annotations and their XObjects are transferred, and the role map and ID tree are reduced
     * to what the cloned elements use.
     *
     * @param destinationDocument the document that receives the cloned structure tree.
     * @throws IOException if the source number tree, ID tree or a resource can't be read.
     */
    private void cloneStructureTree(PDDocument destinationDocument) throws IOException
    {
        PDStructureTreeRoot srcStructureTreeRoot = sourceDocument.getDocumentCatalog().getStructureTreeRoot();
        if (srcStructureTreeRoot == null)
        {
            return;
        }
        // The bookkeeping is per destination document. The ID and role sets must be emptied
        // together with the element map: otherwise entries collected for an earlier document
        // leak into the role map of this one, and the ID tree could reference a clone that was
        // registered in the element map but then dropped from the /K tree.
        structDictMap = new HashMap<>();
        idSet.clear();
        roleSet.clear();
        PDStructureTreeRoot dstStructureTreeRoot = new PDStructureTreeRoot();
        PDPageTree dstPageTree = destinationDocument.getPages();

        // clone /K, also fills dictMap
        COSBase k1 = srcStructureTreeRoot.getK();
        COSBase k2 = new KCloner(dstPageTree).createClone(k1, dstStructureTreeRoot.getCOSObject(), null);
        dstStructureTreeRoot.setK(k2);

        // transfer ParentTree using the map because the dictionaries are all found in the /K structure.
        PDNumberTreeNode srcParentTree = srcStructureTreeRoot.getParentTree();
        Map<Integer, COSObjectable> srcNumberTreeAsMap = PDFMergerUtility.getNumberTreeAsMap(srcParentTree);
        Map<Integer, COSObjectable> dstNumberTreeAsMap = new LinkedHashMap<>();
        for (int p = 0; p < dstPageTree.getCount(); ++p)
        {
            PDPage page = dstPageTree.get(p);
            // -1 means that there is no /StructParents resp. /StructParent entry
            int sp1 = page.getStructParents();
            if (sp1 != -1)
            {
                cloneTreeElement(srcNumberTreeAsMap, dstNumberTreeAsMap, sp1);
            }
            for (PDAnnotation ann : page.getAnnotations())
            {
                int sp2 = ann.getStructParent();
                if (sp2 != -1)
                {
                    cloneTreeElement(srcNumberTreeAsMap, dstNumberTreeAsMap, sp2);
                }
                // XObjects of the normal appearance may have parent tree keys too
                PDAppearanceStream normalAppearanceStream = ann.getNormalAppearanceStream();
                if (normalAppearanceStream != null)
                {
                    processResources(normalAppearanceStream.getResources(), srcNumberTreeAsMap, dstNumberTreeAsMap, new HashSet<>());
                }
            }
            processResources(page.getResources(), srcNumberTreeAsMap, dstNumberTreeAsMap, new HashSet<>());
        }
        PDNumberTreeNode dstNumberTreeNode = new PDNumberTreeNode(PDParentTreeValue.class);
        dstNumberTreeNode.setNumbers(dstNumberTreeAsMap);
        dstStructureTreeRoot.setParentTree(dstNumberTreeNode);

        // the keys are kept as they are, so the next free key of the source is still valid
        dstStructureTreeRoot.setParentTreeNextKey(srcStructureTreeRoot.getParentTreeNextKey());
        dstStructureTreeRoot.setClassMap(srcStructureTreeRoot.getClassMap());
        cloneRoleMap(srcStructureTreeRoot, dstStructureTreeRoot);
        cloneIDTree(srcStructureTreeRoot, dstStructureTreeRoot);

        destinationDocument.getDocumentCatalog().setStructureTreeRoot(dstStructureTreeRoot);
    }

    // Builds the ID tree of the destination structure tree: an entry of the source ID tree is
    // kept if its ID is in idSet and its structure element has a clone in structDictMap.
    // Nothing is set if the source has no ID tree.
    private void cloneIDTree(PDStructureTreeRoot srcStructTree, PDStructureTreeRoot destStructTree)
            throws IOException
    {
        PDNameTreeNode<PDStructureElement> sourceTree = srcStructTree.getIDTree();
        if (sourceTree != null)
        {
            Map<String, PDStructureElement> sourceNames = PDFMergerUtility.getIDTreeAsMap(sourceTree);
            Map<String, PDStructureElement> keptNames = new HashMap<>();
            for (Map.Entry<String, PDStructureElement> idEntry : sourceNames.entrySet())
            {
                String id = idEntry.getKey();
                if (idSet.contains(id))
                {
                    COSDictionary clonedDict = structDictMap.get(idEntry.getValue().getCOSObject());
                    if (clonedDict != null)
                    {
                        keptNames.put(id, new PDStructureElement(clonedDict));
                    }
                }
            }
            PDNameTreeNode<PDStructureElement> clonedTree = new PDStructureElementNameTreeNode();
            clonedTree.setNames(keptNames);
            destStructTree.setIDTree(clonedTree);
            // See comment at the end of PDFMergerUtility.mergeIDTree()
        }
    }

    // Copies the role map entries whose key is in roleSet to the destination structure tree.
    // Done on the COS level; the original note says this is needed because getRoleMap() and
    // setRoleMap() have different map types.
    private void cloneRoleMap(PDStructureTreeRoot srcStructTree, PDStructureTreeRoot destStructTree)
    {
        COSDictionary sourceRoleMap = srcStructTree.getCOSObject().getCOSDictionary(COSName.ROLE_MAP);
        if (sourceRoleMap != null)
        {
            COSDictionary filteredRoleMap = new COSDictionary();
            for (Map.Entry<COSName, COSBase> role : sourceRoleMap.entrySet())
            {
                COSName roleName = role.getKey();
                if (!roleSet.contains(roleName))
                {
                    // not collected in roleSet
                    continue;
                }
                filteredRoleMap.setItem(roleName, role.getValue());
            }
            destStructTree.getCOSObject().setItem(COSName.ROLE_MAP, filteredRoleMap);
        }
    }

    // Transfers the parent tree entry with key sp from the source map to the destination map,
    // replacing every structure element by its clone from structDictMap. Nothing is added if
    // the key is missing in the source or if no clone can be determined.
    private void cloneTreeElement(
            Map<Integer, COSObjectable> srcNumberTreeAsMap, 
            Map<Integer, COSObjectable> dstNumberTreeAsMap, 
            int sp)
    {
        COSObjectable sourceValue = srcNumberTreeAsMap.get(sp); // this is a PDParentTreeValue class
        if (sourceValue == null)
        {
            return;
        }
        COSBase sourceBase = sourceValue.getCOSObject();
        if (sourceBase instanceof COSArray)
        {
            // array of structure elements: build a new array with the clones at the same positions
            COSArray sourceArray = (COSArray) sourceBase;
            COSArray clonedArray = new COSArray();
            int length = sourceArray.size();
            for (int index = 0; index < length; ++index)
            {
                COSBase sourceElement = sourceArray.getObject(index);
                clonedArray.add(structDictMap.get(sourceElement)); // may be null
            }
            dstNumberTreeAsMap.put(sp, clonedArray);
            return;
        }
        if (sourceBase instanceof COSDictionary)
        {
            // single structure element: look up its clone
            COSDictionary clonedDict = structDictMap.get(sourceBase);
            if (clonedDict != null)
            {
                dstNumberTreeAsMap.put(sp, clonedDict);
            }
            else
            {
                // 164421.pdf, structure tree is weird.
                // also 250052.pdf, 250198.pdf, 257012.pdf, 271459.pdf (multiple), 
                // 670045.pdf (multiple)
                // In 71459.pdf annotations on page 1 have StructParent numbers
                // that point to structure elements in the /ParentTree that point to
                // a different page.
                LOG.warn("ParentTree index {} dictionary not found in /K", sp);
            }
            return;
        }
        String typeName = sourceBase == null ? "(null)" : sourceBase.getClass().getSimpleName();
        LOG.warn("tree element neither dictionary nor array, but " + typeName);
    }

    /**
     * Class to help clone the /K tree. It clones structure elements and fills the structure
     * elements map. Pages are replaced with the help of the page map. Elements with pages that
     * don't belong to the destination are removed from the clone. The IDs and structure types
     * of the kept elements are collected in the ID set and the role set of the enclosing class.
     */
    private class KCloner
    {
        // page tree of the destination document; decides whether a page belongs to the clone
        private final PDPageTree dstPageTree;

        KCloner(PDPageTree dstPageTree)
        {
            this.dstPageTree = dstPageTree;
        }

        /**
         * Creates a clone of the source.
         *
         * @param src source dictionary or array. Anything else (including null) is returned as
         * it is.
         * @param dstParent for the /P entry; parameter needed because arrays don't keep a parent.
         * @param currentPageDict used to remember whether we have a page parent somewhere or not.
         * Starts with null.
         * @return a clone, or null if source is null or if there is no clone because it belongs to a
         * different page or to no page.
         */
        COSBase createClone(COSBase src, COSBase dstParent, COSDictionary currentPageDict)
        {
            if (src instanceof COSArray)
            {
                return createArrayClone(src, dstParent, currentPageDict);
            }
            else if (src instanceof COSDictionary)
            {
                return createDictionaryClone(src, dstParent, currentPageDict);
            }
            else
            {
                return src;
            }
        }

        /**
         * Clones an array of kids. Indirect objects are resolved before cloning; kids without a
         * clone are left out.
         *
         * @return the cloned array, or null if no kid is left.
         */
        private COSBase createArrayClone(COSBase src, COSBase dstParent, COSDictionary currentPageDict)
        {
            COSArray dst = new COSArray();
            for (COSBase base2 : (COSArray) src)
            {
                COSBase rc;
                if (base2 instanceof COSObject)
                {
                    rc = createClone(((COSObject) base2).getObject(), dstParent, currentPageDict);
                }
                else
                {
                    rc = createClone(base2, dstParent, currentPageDict);
                }
                // if this is null then they don't belong to the destination document
                if (rc != null)
                {
                    dst.add(rc);
                }
            }
            return dst.isEmpty() ? null : dst;
        }

        /**
         * Clones a dictionary of the /K tree (structure element, OBJR or similar) together with
         * its kids.
         *
         * @return the clone (an already existing one is reused), or null if the dictionary
         * belongs to a page outside of the destination, has lost all its kids, or is considered
         * an orphan.
         */
        private COSBase createDictionaryClone(COSBase src, COSBase dstParent, COSDictionary currentPageDict)
        {
            COSDictionary srcDict = (COSDictionary) src;
            COSDictionary dstDict = structDictMap.get(srcDict);
            if (dstDict != null)
            {
                // already cloned
                return dstDict;
            }
            COSDictionary dstPageDict = null;
            if (srcDict.containsKey(COSName.PG))
            {
                // the element is bound to a page: keep it only if that page was imported
                // and is part of this destination document
                COSDictionary srcPageDict = srcDict.getCOSDictionary(COSName.PG);
                if (srcPageDict == null)
                {
                    return null;
                }
                dstPageDict = pageDictMap.get(srcPageDict);
                if (dstPageDict == null)
                {
                    return null;
                }
                PDPage dstPage = new PDPage(dstPageDict);
                if (dstPageTree.indexOf(dstPage) == -1)
                {
                    return null;
                }
            }

            // Create and fill clone. It is put into the map before the kids are cloned, so
            // that a second visit of the same dictionary returns this clone.
            dstDict = new COSDictionary();
            structDictMap.put(srcDict, dstDict);
            for (Map.Entry<COSName,COSBase> entry : srcDict.entrySet())
            {
                // /K, /Pg and /P are set further below
                COSName key = entry.getKey();
                if (!COSName.K.equals(key) &&
                    !COSName.PG.equals(key) &&
                    !COSName.P.equals(key))
                {
                    dstDict.setItem(key, entry.getValue());
                }
            }

            // special handling for OBJR items ("object reference dictionary")
            // see e.g. file 488300.pdf and Root/StructTreeRoot/K/K/[2]/K/[1]/K/[0]/Obj
            COSName type = srcDict.getCOSName(COSName.TYPE);
            if (COSName.OBJR.equals(type))
            {
                COSDictionary srcObj = srcDict.getCOSDictionary(COSName.OBJ);
                COSDictionary dstObj = annotDictMap.get(srcObj);
                if (dstObj != null)
                {
                    // replace annotation with clone
                    dstDict.setItem(COSName.OBJ, dstObj);
                }
                else
                {
                    removePossibleOrphanAnnotation(srcObj, srcDict, currentPageDict, dstDict);
                }
            }
            else
            {
                // /P not needed for OBJR items
                dstDict.setItem(COSName.P, dstParent);
            }

            dstDict.setItem(COSName.PG, dstPageDict);
            COSBase kid = srcDict.getDictionaryObject(COSName.K);
            
            // stack overflow here with 207658.pdf, too complex
            COSBase cloneKid = createClone(kid, dstDict, dstPageDict != null ? dstPageDict : currentPageDict);
            if (cloneKid == null && kid != null)
            {
                return null; // kids array wasn't empty, but is empty now => ignore
            }
            
            // removes orphan nodes, example:
            // Root/StructTreeRoot/K/[7]/K/[3]/K/[5]/K/[2] in 271459.pdf
            // decide about keeping source dictionaries with no /K and no /PG
            if (dstPageDict == null && cloneKid == null && currentPageDict == null)
            {
                // if no parent page and no page here and no kids, assume this is an orphan
                return null;
            }
            dstDict.setItem(COSName.K, cloneKid);
            // remember ID and structure type of the kept element
            String id = dstDict.getString(COSName.ID);
            if (id != null)
            {
                idSet.add(id);
            }
            COSName s = dstDict.getCOSName(COSName.S);
            if (s != null)
            {
                roleSet.add(s);
            }
            return dstDict;
        }

        /**
         * Removes the /Obj entry from the clone of an OBJR dictionary if it references an
         * annotation that is not in the /Annots array of its page.
         *
         * @param srcObj the /Obj dictionary of the source OBJR, may be null if the entry is
         * missing or not a dictionary; nothing is done in that case.
         * @param srcDict the source OBJR dictionary.
         * @param currentPageDict page dictionary passed down from a parent, used if the OBJR
         * itself has no /Pg entry; may be null.
         * @param dstDict the clone of the OBJR dictionary.
         */
        private void removePossibleOrphanAnnotation(COSDictionary srcObj, COSDictionary srcDict,
                COSDictionary currentPageDict, COSDictionary dstDict)
        {
            if (srcObj == null)
            {
                // malformed OBJR without a usable /Obj dictionary: nothing to check, and
                // dereferencing it below would throw a NullPointerException
                return;
            }
            // PDFBOX-5929: Check whether this is an "orphan" annotation that isn't in the page
            COSBase objType = srcObj.getDictionaryObject(COSName.TYPE);
            COSBase objSubtype = srcObj.getDictionaryObject(COSName.SUBTYPE);
            if (COSName.ANNOT.equals(objType) || COSName.LINK.equals(objSubtype))
            {
                COSDictionary srcPageDict = srcDict.getCOSDictionary(COSName.PG);
                if (srcPageDict == null)
                {
                    // /Pg entry is not always on this level
                    srcPageDict = currentPageDict;
                }
                if (srcPageDict != null)
                {
                    COSArray annotationArray = srcPageDict.getCOSArray(COSName.ANNOTS);
                    if (annotationArray == null || annotationArray.indexOfObject(srcObj) == -1)
                    {
                        // Ideally the entire OBJR entry should be removed.
                        // Removing the OBJ entry is done to avoid potential page orphans
                        // from the annotation destination.
                        LOG.warn("An annotation OBJ that isn't in the page has been removed from the structure tree");
                        dstDict.removeItem(COSName.OBJ);
                    }
                }
            }
        }
    }

    // Walks through the XObjects of the resources (recursively for form XObjects), looks for
    // /StructParent and /StructParents keys and transfers the matching parent tree entries to
    // the destination map. The visited set holds the resource dictionaries already handled.
    private void processResources(PDResources res, 
            Map<Integer, COSObjectable> srcNumberTreeAsMap, 
            Map<Integer, COSObjectable> dstNumberTreeAsMap,
            Set<COSDictionary> visited) throws IOException
    {
        // the add fails for resources seen before;
        // this avoids endless recursion, e.g. with 002874.pdf
        if (res == null || !visited.add(res.getCOSObject()))
        {
            return;
        }

        for (COSName xObjectName : res.getXObjectNames())
        {
            PDXObject candidate = res.getXObject(xObjectName);
            int parentTreeKey;
            if (candidate instanceof PDFormXObject)
            {
                PDFormXObject form = (PDFormXObject) candidate;
                parentTreeKey = form.getStructParents();
                processResources(form.getResources(), srcNumberTreeAsMap, dstNumberTreeAsMap, visited);
            }
            else if (candidate instanceof PDImageXObject)
            {
                parentTreeKey = ((PDImageXObject) candidate).getStructParent();
            }
            else
            {
                parentTreeKey = -1;
            }
            if (parentTreeKey != -1)
            {
                cloneTreeElement(srcNumberTreeAsMap, dstNumberTreeAsMap, parentTreeKey);
            }
        }
    }

    /**
     * Sets the number of pages that each destination document gets. The default is 1, i.e.
     * every page becomes a document of its own. With a value of 2, a source document of
     * 5 pages is split into 3 documents: two with 2 pages each and a last one with the
     * remaining page.
     *
     * @param split The number of pages each split document should contain.
     * @throws IllegalArgumentException if the number of pages is smaller than one.
     */
    public void setSplitAtPage(int split)
    {
        if (split < 1)
        {
            throw new IllegalArgumentException("Number of pages is smaller than one");
        }
        this.splitLength = split;
    }

    /**
     * Sets the first page to be processed; pages before it are skipped.
     *
     * @param start the 1-based start page
     * @throws IllegalArgumentException if the start page is smaller than one.
     */
    public void setStartPage(int start)
    {
        if (start < 1)
        {
            throw new IllegalArgumentException("Start page is smaller than one");
        }
        this.startPage = start;
    }

    /**
     * Sets the last page to be processed; pages after it are skipped.
     *
     * @param end the 1-based end page
     * @throws IllegalArgumentException if the end page is smaller than one.
     */
    public void setEndPage(int end)
    {
        if (end < 1)
        {
            throw new IllegalArgumentException("End page is smaller than one");
        }
        this.endPage = end;
    }

    /**
     * Iterates over the pages of the source document and hands every page within the
     * configured range (start page to end page, both 1-based and inclusive) to
     * processPage(). Pages before the start page are only counted; the iteration stops at the
     * first page beyond the end page.
     *
     * @throws IOException If an IO error occurs.
     */
    private void processPages() throws IOException
    {
        for (PDPage page : sourceDocument.getPages())
        {
            // currentPageNumber is 0-based while the range limits are 1-based; compare
            // everything in 1-based terms so that the loop ends right after the end page
            int pageNumber = currentPageNumber + 1;
            if (pageNumber > endPage)
            {
                break;
            }
            if (pageNumber >= startPage)
            {
                // currentPageNumber still is the index of this page while it is processed
                processPage(page);
            }
            currentPageNumber++;
        }
    }

    /**
     * Starts a new destination document if splitAtPage() asks for a split at the current page
     * or if there is no destination document yet. The new document becomes the current one
     * and is appended to the list of results.
     *
     * @throws IOException If there is an error creating the new document.
     */
    private void createNewDocumentIfNecessary() throws IOException
    {
        // always ask splitAtPage() first, it may be overridden
        boolean splitHere = splitAtPage(currentPageNumber);
        if (!splitHere && currentDestinationDocument != null)
        {
            return;
        }
        PDDocument freshDocument = createNewDocument();
        currentDestinationDocument = freshDocument;
        destinationDocuments.add(freshDocument);
    }

    /**
     * Check if it is necessary to create a new document at the given page.
     * By default a split occurs whenever the number of pages since the first processed page
     * (the start page, or page 1 if none is set) is a multiple of the split length, so with
     * the default split length of 1 a split occurs at every page. If you wanted to split
     * based on some complex logic then you could override this method. For example:
     * <code>
     * protected boolean splitAtPage(int pageNumber)
     * {
     *     // will split at pages with prime numbers only
     *     return isPrime(pageNumber);
     * }
     * </code>
     * @param pageNumber the 0-based page number to be checked as splitting page
     * 
     * @return true If a new document should be created.
     */
    protected boolean splitAtPage(int pageNumber)
    {
        int firstPage = Math.max(1, startPage);
        int pagesSinceFirst = pageNumber + 1 - firstPage;
        return pagesSinceFirst % splitLength == 0;
    }

    /**
     * Create a new document to write the split contents to. The new document gets the version
     * of the source document, a copy of the simple entries of its document information
     * dictionary, and its viewer preferences, language, mark info and metadata.
     *
     * @return the newly created PDDocument. 
     * @throws IOException If there is an problem creating the new document.
     */
    protected PDDocument createNewDocument() throws IOException
    {
        PDDocument document;
        if (streamCacheCreateFunction == null)
        {
            document = new PDDocument();
        }
        else
        {
            document = new PDDocument(streamCacheCreateFunction);
        }
        document.getDocument().setVersion(getSourceDocument().getVersion());

        PDDocumentInformation srcInfo = getSourceDocument().getDocumentInformation();
        if (srcInfo != null)
        {
            // PDFBOX-5317: Image Capture Plus files where /Root and /Info share the same dictionary
            // Only copy simple elements to avoid huge files
            COSDictionary srcInfoDict = srcInfo.getCOSObject();
            COSDictionary dstInfoDict = new COSDictionary();
            for (COSName infoKey : srcInfoDict.keySet())
            {
                COSBase infoValue = srcInfoDict.getDictionaryObject(infoKey);
                if (infoValue instanceof COSDictionary)
                {
                    LOG.warn("Nested entry for key '{}' skipped in document information dictionary",
                            infoKey.getName());
                    boolean sharedDictionary = sourceDocument.getDocumentCatalog().getCOSObject() ==
                            sourceDocument.getDocumentInformation().getCOSObject();
                    if (sharedDictionary)
                    {
                        LOG.warn("/Root and /Info share the same dictionary");
                    }
                }
                else if (!COSName.TYPE.equals(infoKey))
                {
                    // there is no /Type in the document information dictionary, so it isn't copied
                    dstInfoDict.setItem(infoKey, infoValue);
                }
            }
            document.setDocumentInformation(new PDDocumentInformation(dstInfoDict));
        }

        // take over some document level settings from the source catalog
        PDDocumentCatalog dstCatalog = document.getDocumentCatalog();
        PDDocumentCatalog srcCatalog = getSourceDocument().getDocumentCatalog();
        dstCatalog.setViewerPreferences(srcCatalog.getViewerPreferences());
        dstCatalog.setLanguage(srcCatalog.getLanguage());
        dstCatalog.setMarkInfo(srcCatalog.getMarkInfo());
        dstCatalog.setMetadata(srcCatalog.getMetadata());
        return document;
    }

    /**
     * Processes one page of the source document: creates a new destination document if
     * needed, imports the page into the current destination document, takes over resources
     * that are not set in the page dictionary itself, removes a /B (beads) entry, handles the
     * annotations and finally remembers which imported page belongs to the source page.
     *
     * @param page The page that is about to get processed.
     *
     * @throws IOException If there is an error creating the new document.
     */
    protected void processPage(PDPage page) throws IOException
    {
        createNewDocumentIfNecessary();

        PDPage importedPage = getDestinationDocument().importPage(page);
        COSDictionary sourcePageDict = page.getCOSObject();
        COSDictionary importedPageDict = importedPage.getCOSObject();

        // resources that are available to the page but not set in its own dictionary
        PDResources sourceResources = page.getResources();
        if (sourceResources != null && !sourcePageDict.containsKey(COSName.RESOURCES))
        {
            importedPage.setResources(sourceResources);
            LOG.info("Resources imported in Splitter"); // follow-up to warning in importPage
        }

        if (importedPageDict.containsKey(COSName.B))
        {
            importedPageDict.removeItem(COSName.B);
            LOG.warn("/B entry (beads) removed by splitter");
        }

        // remove page links to avoid copying not needed resources 
        processAnnotations(importedPage);

        pageDictMap.put(sourcePageDict, importedPageDict);
    }

    /**
     * Clone all annotations because of changes possibly made, and because the structure tree is
     * cloned.
     *
     * @param imported
     * @throws IOException 
     */
    private void processAnnotations(PDPage imported) throws IOException
    {
        // Overview:
        // 1. first loop: shallow-clone every annotation of the page, record the mapping
        //    "original dictionary -> cloned dictionary" in annotDictMap, and prepare link
        //    destinations for a later fix-up (destToFixMap);
        // 2. second loop: re-wire markup <-> popup references so that clones reference clones;
        // 3. finally replace the annotation list of the page with the clones.
        List<PDAnnotation> annotations = imported.getAnnotations();
        if (annotations.isEmpty())
        {
            return;
        }
        List<PDAnnotation> clonedAnnotations = new ArrayList<>(annotations.size());
        for (PDAnnotation annotation : annotations)
        {
            // create a shallow clone
            COSDictionary clonedDict = new COSDictionary(annotation.getCOSObject());
            PDAnnotation annotationClone = PDAnnotation.createAnnotation(clonedDict);
            annotDictMap.put(annotation.getCOSObject(), clonedDict);
            clonedAnnotations.add(annotationClone);
            if (annotationClone instanceof PDAnnotationLink)
            {
                PDAnnotationLink link = (PDAnnotationLink) annotationClone;

                // a link either has a direct destination or (if that is missing) an action
                PDDestination srcDestination = null;
                try
                {
                    srcDestination = link.getDestination();
                }
                catch (IOException ex)
                {
                    LOG.warn("Incorrect destination in link annotation on page " +
                            (currentPageNumber + 1) + " is removed", ex);
                    link.setDestination(null);
                }

                // action stays null when the destination was found directly on the link; this
                // is used below to decide where the cloned destination must be assigned
                PDAction action = null;
                if (srcDestination == null)
                {
                    action = link.getAction();
                    if (action instanceof PDActionGoTo)
                    {
                        PDActionGoTo goToAction = (PDActionGoTo) action;
                        try
                        {
                            srcDestination = goToAction.getDestination();
                        }
                        catch (IOException ex)
                        {
                            LOG.warn("GoToAction with incorrect destination in link annotation on page " +
                                    (currentPageNumber + 1) + " is removed", ex);
                            link.setAction(null);
                        }
                    }
                }
                if (srcDestination instanceof PDNamedDestination)
                {
                    srcDestination = sourceDocument.getDocumentCatalog().
                            findNamedDestinationPage((PDNamedDestination) srcDestination);
                    // we do not use the named destination anymore because names get modified, e.g.
                    // 0xAD becomes 0, see file 410609.pdf where the name no longer matches with the
                    // entry in the new name tree; plus the original solution was 40 additional loc
                }
                if (srcDestination instanceof PDPageDestination)
                {
                    // preserve links to pages within the split result:
                    // not fully possible here because we don't have the full target document yet.
                    // However we're cloning as needed and remember what to do later.
                    PDPageDestination srcPageDestination = (PDPageDestination) srcDestination;
                    PDPage destinationPage = srcPageDestination.getPage();
                    if (destinationPage != null)
                    {
                        // clone destination
                        COSArray clonedDestinationArray =
                                new COSArray(srcPageDestination.getCOSObject().toList());
                        PDPageDestination dstDestination =
                                (PDPageDestination) PDDestination.create(clonedDestinationArray);

                        // remember the destination to adjust / remove page later; the value
                        // must be the page the link points to, not the page holding the link
                        destToFixMap.put(dstDestination, destinationPage);

                        if (action != null)
                        {
                            // if action is not null, then the destination came from an action,
                            // thus clone action as well, then assign destination clone, then action
                            COSDictionary clonedActionDict = new COSDictionary(action.getCOSObject());
                            PDActionGoTo dstAction =
                                    (PDActionGoTo) PDActionFactory.createAction(clonedActionDict);
                            dstAction.setDestination(dstDestination);
                            link.setAction(dstAction);
                        }
                        else
                        {
                            // just assign destination clone
                            link.setDestination(dstDestination);
                        }
                    }
                }
            }
            if (annotationClone instanceof PDAnnotationWidget &&
                annotationClone.getCOSObject().containsKey(COSName.PARENT))
            {
                // remove non-terminal field /Parent reference, because this may lead to orphan pages
                annotationClone.getCOSObject().removeItem(COSName.PARENT);
            }
            if (annotation.getPage() != null)
            {
                // only annotations that had a page reference get one; it must be the new page
                annotationClone.setPage(imported);
            }
        }
        // Second loop for markup and popup annotations, which reference annotations themselves
        for (PDAnnotation annotation : clonedAnnotations)
        {
            if (annotation instanceof PDAnnotationMarkup)
            {
                PDAnnotationPopup annotationPopup = ((PDAnnotationMarkup) annotation).getPopup();
                if (annotationPopup == null)
                {
                    continue;
                }
                // look up the clone made in the first loop (if the popup was in the list)
                COSDictionary clonedPopupDict = annotDictMap.get(annotationPopup.getCOSObject());
                if (clonedPopupDict != null)
                {
                    annotation.getCOSObject().setItem(COSName.POPUP, clonedPopupDict);
                }
                else
                {
                    // orphan popup (not in annotation list); clone it and fix references
                    clonedPopupDict = new COSDictionary(annotationPopup.getCOSObject());
                    annotDictMap.put(annotationPopup.getCOSObject(), clonedPopupDict);
                    PDAnnotationPopup annotationPopupClone =
                            (PDAnnotationPopup) PDAnnotation.createAnnotation(clonedPopupDict);
                    annotationPopupClone.setParent((PDAnnotationMarkup) annotation);
                    ((PDAnnotationMarkup) annotation).setPopup(annotationPopupClone);
                    if (annotationPopupClone.getPage() != null)
                    {
                        annotationPopupClone.setPage(imported);
                    }
                }
            }
            if (annotation instanceof PDAnnotationPopup)
            {
                PDAnnotationMarkup annotationMarkup = ((PDAnnotationPopup) annotation).getParent();
                if (annotationMarkup == null)
                {
                    continue;
                }
                // look up the clone made in the first loop (if the markup was in the list)
                COSDictionary clonedMarkupDict = annotDictMap.get(annotationMarkup.getCOSObject());
                if (clonedMarkupDict != null)
                {
                    annotation.getCOSObject().setItem(COSName.PARENT, clonedMarkupDict);
                }
                else
                {
                    // orphan markup (not in annotation list); clone it and fix references
                    clonedMarkupDict = new COSDictionary(annotationMarkup.getCOSObject());
                    annotDictMap.put(annotationMarkup.getCOSObject(), clonedMarkupDict);
                    PDAnnotationMarkup annotationMarkupClone =
                            (PDAnnotationMarkup) PDAnnotation.createAnnotation(clonedMarkupDict);
                    annotationMarkupClone.setPopup((PDAnnotationPopup) annotation);
                    ((PDAnnotationPopup) annotation).setParent(annotationMarkupClone);
                    if (annotationMarkupClone.getPage() != null)
                    {
                        annotationMarkupClone.setPage(imported);
                    }
                }
            }
        }
        imported.setAnnotations(clonedAnnotations);
    }

    /**
     * Returns the source PDF document, i.e. the document that is being split.
     * <p>
     * The accessor is {@code protected final}: subclasses may read the source document but
     * cannot override this method.
     * 
     * @return the pdf to be split
     */
    protected final PDDocument getSourceDocument()
    {
        return sourceDocument;
    }

    /**
     * Returns the current destination PDF document, i.e. the split result that is currently
     * referenced by this splitter.
     * <p>
     * The accessor is {@code protected final}: subclasses may read the destination document but
     * cannot override this method.
     * 
     * @return current destination pdf
     */
    protected final PDDocument getDestinationDocument()
    {
        return currentDestinationDocument;
    }
}