ParentTreeHandler.java

/*
    This file is part of the iText (R) project.
    Copyright (c) 1998-2025 Apryse Group NV
    Authors: Apryse Software.

    This program is offered under a commercial and under the AGPL license.
    For commercial licensing, contact us at https://itextpdf.com/sales.  For AGPL licensing, see below.

    AGPL licensing:
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */
package com.itextpdf.kernel.pdf.tagging;

import com.itextpdf.io.logs.IoLogMessageConstant;
import com.itextpdf.kernel.exceptions.KernelExceptionMessageConstant;
import com.itextpdf.kernel.exceptions.PdfException;
import com.itextpdf.kernel.logs.KernelLogMessageConstant;
import com.itextpdf.kernel.pdf.PdfArray;
import com.itextpdf.kernel.pdf.PdfDictionary;
import com.itextpdf.kernel.pdf.PdfIndirectReference;
import com.itextpdf.kernel.pdf.PdfName;
import com.itextpdf.kernel.pdf.PdfNull;
import com.itextpdf.kernel.pdf.PdfNumTree;
import com.itextpdf.kernel.pdf.PdfNumber;
import com.itextpdf.kernel.pdf.PdfObject;
import com.itextpdf.kernel.pdf.PdfPage;
import com.itextpdf.kernel.pdf.PdfReader;
import com.itextpdf.kernel.pdf.PdfStream;
import com.itextpdf.kernel.validation.context.TagStructElementValidationContext;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Set;
import java.util.TreeMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
 * Internal helper class which is used to effectively build parent tree and also find marked content references:
 * for specified page, by MCID or by struct parent index.
 */
class ParentTreeHandler {
    private static final Logger LOGGER = LoggerFactory.getLogger(ParentTreeHandler.class);


    private PdfStructTreeRoot structTreeRoot;

    /**
     * Represents parentTree in structTreeRoot. It contains only those entries that belong to the already flushed pages.
     */
    private PdfNumTree parentTree;

    private Map<PdfIndirectReference, PageMcrsContainer> pageToPageMcrs;

    private Map<PdfIndirectReference, Integer> pageToStructParentsInd;

    private Map<PdfIndirectReference, Integer> xObjectToStructParentsInd;

    private int maxStructParentIndex = -1;

    /**
     * Init ParentTreeHandler. On init the parent tree is read and stored in this instance.
     */
    ParentTreeHandler(PdfStructTreeRoot structTreeRoot) {
        this.structTreeRoot = structTreeRoot;
        parentTree = new PdfNumTree(structTreeRoot.getDocument().getCatalog(), PdfName.ParentTree);
        xObjectToStructParentsInd = new HashMap<>();
        registerAllMcrs();
        pageToStructParentsInd = new HashMap<>();
    }

    /**
     * Gets a list of all marked content references on the page.
     */
    public PageMcrsContainer getPageMarkedContentReferences(PdfPage page) {
        return pageToPageMcrs.get(page.getPdfObject().getIndirectReference());
    }

    // Mind that this method searches among items contained in page's content stream  only
    public PdfMcr findMcrByMcid(PdfDictionary pageDict, int mcid) {
        PageMcrsContainer pageMcrs = pageToPageMcrs.get(pageDict.getIndirectReference());
        return pageMcrs != null ? pageMcrs.getPageContentStreamsMcrs().get(mcid) : null;
    }

    public PdfObjRef findObjRefByStructParentIndex(PdfDictionary pageDict, int structParentIndex) {
        PageMcrsContainer pageMcrs = pageToPageMcrs.get(pageDict.getIndirectReference());
        return pageMcrs != null ? (PdfObjRef) pageMcrs.getObjRefs().get(structParentIndex) : null;
    }

    public int getNextMcidForPage(PdfPage page) {
        PageMcrsContainer pageMcrs = getPageMarkedContentReferences(page);
        if (pageMcrs == null || pageMcrs.getPageContentStreamsMcrs().size() == 0) {
            return 0;
        } else {
            return (int) pageMcrs.getPageContentStreamsMcrs().lastEntry().getKey() + 1;
        }
    }

    /**
     * Creates and flushes parent tree entry for the page.
     * Effectively this means that new content mustn't be added to the page.
     *
     * @param page {@link PdfPage} for which to create parent tree entry. Typically this page is flushed after this
     *             call.
     */
    public void createParentTreeEntryForPage(PdfPage page) {
        PageMcrsContainer mcrs = getPageMarkedContentReferences(page);
        if (mcrs == null) {
            return;
        }
        pageToPageMcrs.remove(page.getPdfObject().getIndirectReference());

        if (updateStructParentTreeEntries(page, mcrs)) {
            structTreeRoot.setModified();
        }
    }

    public void savePageStructParentIndexIfNeeded(PdfPage page) {
        PdfIndirectReference indRef = page.getPdfObject().getIndirectReference();
        if (page.isFlushed() || pageToPageMcrs.get(indRef) == null) {
            return;
        }
        boolean hasNonObjRefMcr = pageToPageMcrs.get(indRef).getPageContentStreamsMcrs().size() > 0 ||
                pageToPageMcrs.get(indRef).getPageResourceXObjects().size() > 0;

        if (hasNonObjRefMcr) {
            pageToStructParentsInd.put(indRef, (Integer) getOrCreatePageStructParentIndex(page));
        }
    }

    public PdfDictionary buildParentTree() {
        return (PdfDictionary) parentTree.buildTree().makeIndirect(structTreeRoot.getDocument());
    }

    public void registerMcr(PdfMcr mcr) {
        registerMcr(mcr, false);
    }

    private void registerMcr(PdfMcr mcr, boolean registeringOnInit) {
        PdfIndirectReference mcrPageIndRef = mcr.getPageIndirectReference();
        if (mcrPageIndRef == null || (!(mcr instanceof PdfObjRef) && mcr.getMcid() < 0)) {
            LOGGER.error(IoLogMessageConstant.ENCOUNTERED_INVALID_MCR);
            return;
        }
        PageMcrsContainer pageMcrs = pageToPageMcrs.get(mcrPageIndRef);
        if (pageMcrs == null) {
            pageMcrs = new PageMcrsContainer();
            pageToPageMcrs.put(mcrPageIndRef, pageMcrs);
        }

        PdfObject stm;
        if ((stm = getStm(mcr)) != null) {
            PdfIndirectReference stmIndRef;
            PdfStream xObjectStream;
            if (stm instanceof PdfIndirectReference) {
                stmIndRef = (PdfIndirectReference) stm;
                xObjectStream = (PdfStream) stmIndRef.getRefersTo();
            } else {
                if (stm.getIndirectReference() == null) {
                    stm.makeIndirect(structTreeRoot.getDocument());
                }
                stmIndRef = stm.getIndirectReference();
                xObjectStream = (PdfStream) stm;
            }

            Integer structParent = xObjectStream.getAsInt(PdfName.StructParents);
            if (structParent != null) {
                xObjectToStructParentsInd.put(stmIndRef, structParent);
                if (registeringOnInit) {
                    xObjectStream.release();
                }
            } else if (isModificationAllowed()) {
                maxStructParentIndex++;
                xObjectToStructParentsInd.put(stmIndRef, maxStructParentIndex);
                xObjectStream.put(PdfName.StructParents, new PdfNumber(maxStructParentIndex));
                structTreeRoot.getPdfObject().put(PdfName.ParentTreeNextKey, new PdfNumber(maxStructParentIndex + 1));
                LOGGER.warn(KernelLogMessageConstant.XOBJECT_STRUCT_PARENT_INDEX_MISSED_AND_RECREATED);
            } else {
                throw new PdfException(KernelExceptionMessageConstant.XOBJECT_STRUCT_PARENT_INDEX_MISSED);
            }
            pageMcrs.putXObjectMcr(stmIndRef, mcr);
        } else if (mcr instanceof PdfObjRef) {
            PdfObject mcrObj = ((PdfDictionary) mcr.getPdfObject()).get(PdfName.Obj);
            if (!(mcrObj instanceof PdfDictionary)) {
                throw new PdfException(KernelExceptionMessageConstant.INVALID_OBJECT_REFERENCE_TYPE);
            }

            PdfDictionary obj = (PdfDictionary) mcrObj;
            if (obj.isFlushed()) {
                throw new PdfException(
                        KernelExceptionMessageConstant.WHEN_ADDING_OBJECT_REFERENCE_TO_THE_TAG_TREE_IT_MUST_BE_CONNECTED_TO_NOT_FLUSHED_OBJECT);
            }
            PdfNumber n = obj.getAsNumber(PdfName.StructParent);
            if (n != null) {
                pageMcrs.putObjectReferenceMcr(n.intValue(), mcr);
            } else if (isModificationAllowed()) {
                maxStructParentIndex++;
                pageMcrs.putObjectReferenceMcr(maxStructParentIndex, mcr);
                obj.put(PdfName.StructParent, new PdfNumber(maxStructParentIndex));
                structTreeRoot.getPdfObject().put(PdfName.ParentTreeNextKey, new PdfNumber(maxStructParentIndex + 1));
                LOGGER.warn(KernelLogMessageConstant.STRUCT_PARENT_INDEX_MISSED_AND_RECREATED);
            } else {
                throw new PdfException(KernelExceptionMessageConstant.STRUCT_PARENT_INDEX_NOT_FOUND_IN_TAGGED_OBJECT);
            }
        } else {
            pageMcrs.putPageContentStreamMcr(mcr.getMcid(), mcr);
        }

        if (!registeringOnInit) {
            structTreeRoot.setModified();
        }
    }

    public void unregisterMcr(PdfMcr mcrToUnregister) {
        PdfDictionary pageDict = mcrToUnregister.getPageObject();
        if (pageDict == null) {
            // invalid mcr, ignore

            return;
        }
        if (pageDict.isFlushed()) {
            throw new PdfException(
                    KernelExceptionMessageConstant.CANNOT_REMOVE_MARKED_CONTENT_REFERENCE_BECAUSE_ITS_PAGE_WAS_ALREADY_FLUSHED);
        }
        PageMcrsContainer pageMcrs = pageToPageMcrs.get(pageDict.getIndirectReference());
        if (pageMcrs != null) {
            PdfObject stm;
            if ((stm = getStm(mcrToUnregister)) != null) {
                PdfIndirectReference xObjectReference =
                        stm instanceof PdfIndirectReference ? (PdfIndirectReference) stm : stm.getIndirectReference();
                pageMcrs.getPageResourceXObjects().get(xObjectReference).remove(mcrToUnregister.getMcid());
                if (pageMcrs.getPageResourceXObjects().get(xObjectReference).isEmpty()) {
                    pageMcrs.getPageResourceXObjects().remove(xObjectReference);
                    xObjectToStructParentsInd.remove(xObjectReference);
                }
                structTreeRoot.setModified();
            } else if (mcrToUnregister instanceof PdfObjRef) {
                for (Map.Entry<Integer, PdfMcr> entry : pageMcrs.getObjRefs().entrySet()) {
                    if (entry.getValue().getPdfObject() == mcrToUnregister.getPdfObject()) {
                        pageMcrs.getObjRefs().remove(entry.getKey());
                        structTreeRoot.setModified();
                        break;
                    }
                }
            } else {
                pageMcrs.getPageContentStreamsMcrs().remove(mcrToUnregister.getMcid());
                structTreeRoot.setModified();
            }
        }
    }

    private boolean isModificationAllowed() {
        PdfReader reader = this.structTreeRoot.getDocument().getReader();
        if (reader != null){
            return PdfReader.StrictnessLevel.CONSERVATIVE.isStricter(reader.getStrictnessLevel());
        } else {
            return true;
        }
    }

    private void registerAllMcrs() {
        pageToPageMcrs = new HashMap<>();
        // we create new number tree and not using parentTree, because we want parentTree to be empty
        Map<Integer, PdfObject> parentTreeEntries = new PdfNumTree(structTreeRoot.getDocument().getCatalog(),
                PdfName.ParentTree).getNumbers();
        Set<PdfDictionary> mcrParents = new LinkedHashSet<>();
        for (Map.Entry<Integer, PdfObject> entry : parentTreeEntries.entrySet()) {
            if (entry.getKey() > maxStructParentIndex) {
                maxStructParentIndex = (int) entry.getKey();
            }

            PdfObject entryValue = entry.getValue();
            if (entryValue.isDictionary()) {
                mcrParents.add((PdfDictionary) entryValue);
            } else if (entryValue.isArray()) {
                PdfArray parentsArray = (PdfArray) entryValue;
                for (int i = 0; i < parentsArray.size(); ++i) {
                    PdfDictionary parent = parentsArray.getAsDictionary(i);
                    if (parent != null) {
                        mcrParents.add(parent);
                    }
                }
            }
        }
        structTreeRoot.getPdfObject().put(PdfName.ParentTreeNextKey, new PdfNumber(maxStructParentIndex + 1));

        for (PdfObject mcrParent : mcrParents) {
            PdfStructElem mcrParentStructElem = new PdfStructElem((PdfDictionary) mcrParent);
            for (IStructureNode kid : mcrParentStructElem.getKids()) {
                if (kid instanceof PdfMcr) {
                    registerMcr((PdfMcr) kid, true);
                }
            }
        }
    }

    private boolean updateStructParentTreeEntries(PdfPage page, PageMcrsContainer mcrs) {
        boolean res = false;

        for (Map.Entry<Integer, PdfMcr> entry : mcrs.getObjRefs().entrySet()) {
            PdfMcr mcr = entry.getValue();
            PdfDictionary parentObj = ((PdfStructElem) mcr.getParent()).getPdfObject();
            if (!parentObj.isIndirect()) {
                continue;
            }
            int structParent = entry.getKey();
            parentTree.addEntry(structParent, parentObj);
            res = true;
        }

        int pageStructParentIndex;
        for (Map.Entry<PdfIndirectReference, TreeMap<Integer, PdfMcr>> entry : mcrs.getPageResourceXObjects()
                .entrySet()) {
            PdfIndirectReference xObjectRef = entry.getKey();
            if (xObjectToStructParentsInd.containsKey(xObjectRef)) {
                pageStructParentIndex = (int) xObjectToStructParentsInd.remove(xObjectRef);
                if (updateStructParentTreeForContentStreamEntries(entry.getValue(), pageStructParentIndex)) {
                    res = true;
                }
            }
        }
        if (page.isFlushed()) {
            PdfIndirectReference pageRef = page.getPdfObject().getIndirectReference();
            if (!pageToStructParentsInd.containsKey(pageRef)) {
                return res;
            }
            pageStructParentIndex = (int) pageToStructParentsInd.remove(pageRef);
        } else {
            pageStructParentIndex = getOrCreatePageStructParentIndex(page);
        }
        if (updateStructParentTreeForContentStreamEntries(mcrs.getPageContentStreamsMcrs(), pageStructParentIndex)) {
            res = true;
        }

        return res;
    }

    private boolean updateStructParentTreeForContentStreamEntries(Map<Integer, PdfMcr> mcrsOfContentStream,
            int pageStructParentIndex) {
        // element indices in parentsOfMcrs shall be the same as mcid of one of their kids.
        // See "Finding Structure Elements from Content Items" in pdf spec.
        PdfArray parentsOfMcrs = new PdfArray();
        int currentMcid = 0;
        for (Map.Entry<Integer, PdfMcr> entry : mcrsOfContentStream.entrySet()) {
            PdfMcr mcr = entry.getValue();
            PdfDictionary parentObj = ((PdfStructElem) mcr.getParent()).getPdfObject();
            if (!parentObj.isIndirect()) {
                continue;
            }
            // if for some reason some mcrs were not registered or don't exist, we ensure that the rest
            // of the parent objects were placed at correct index
            while (currentMcid++ < mcr.getMcid()) {
                parentsOfMcrs.add(PdfNull.PDF_NULL);
            }
            parentsOfMcrs.add(parentObj);
        }

        if (!parentsOfMcrs.isEmpty()) {
            parentsOfMcrs.makeIndirect(structTreeRoot.getDocument());
            parentTree.addEntry(pageStructParentIndex, parentsOfMcrs);
            structTreeRoot.getDocument().checkIsoConformance(new TagStructElementValidationContext(parentsOfMcrs));
            parentsOfMcrs.flush();
            return true;
        }
        return false;
    }

    private int getOrCreatePageStructParentIndex(PdfPage page) {
        int structParentIndex = page.getStructParentIndex();
        if (structParentIndex < 0) {
            structParentIndex = page.getDocument().getNextStructParentIndex();
            page.getPdfObject().put(PdfName.StructParents, new PdfNumber(structParentIndex));
        }
        return structParentIndex;
    }

    private static PdfObject getStm(PdfMcr mcr) {
        /*
         * Presence of Stm guarantees that the mcr belongs to XObject, absence of Stm guarantees that the mcr belongs to page content stream.
         * See 14.7.4.2 Marked-Content Sequences as Content Items, Table 324 ��� Entries in a marked-content reference dictionary.
         */
        if (mcr instanceof PdfMcrDictionary) {
            return ((PdfDictionary) mcr.getPdfObject()).get(PdfName.Stm, false);
        }
        return null;
    }

    static class PageMcrsContainer {


        Map<Integer, PdfMcr> objRefs;
        NavigableMap<Integer, PdfMcr> pageContentStreams;
        /*
         * Keys of this map are indirect references to XObjects contained in page's resources,
         * values are the mcrs contained in the corresponding XObject streams, stored as mappings "MCID-number to PdfMcr".
         */
        Map<PdfIndirectReference, TreeMap<Integer, PdfMcr>> pageResourceXObjects;

        PageMcrsContainer() {
            objRefs = new LinkedHashMap<Integer, PdfMcr>();
            pageContentStreams = new TreeMap<Integer, PdfMcr>();
            pageResourceXObjects = new LinkedHashMap<PdfIndirectReference, TreeMap<Integer, PdfMcr>>();
        }

        void putObjectReferenceMcr(int structParentIndex, PdfMcr mcr) {
            objRefs.put(structParentIndex, mcr);
        }

        void putPageContentStreamMcr(int mcid, PdfMcr mcr) {
            pageContentStreams.put(mcid, mcr);
        }

        void putXObjectMcr(PdfIndirectReference xObjectIndRef, PdfMcr mcr) {
            TreeMap<Integer, PdfMcr> xObjectMcrs = pageResourceXObjects.get(xObjectIndRef);
            if (xObjectMcrs == null) {
                xObjectMcrs = new TreeMap<Integer, PdfMcr>();
                pageResourceXObjects.put(xObjectIndRef, xObjectMcrs);
            }
            pageResourceXObjects.get(xObjectIndRef).put(mcr.getMcid(), mcr);
        }

        NavigableMap<Integer, PdfMcr> getPageContentStreamsMcrs() {
            return pageContentStreams;
        }

        Map<Integer, PdfMcr> getObjRefs() {
            return objRefs;
        }

        Map<PdfIndirectReference, TreeMap<Integer, PdfMcr>> getPageResourceXObjects() {
            return pageResourceXObjects;
        }

        Collection<PdfMcr> getAllMcrsAsCollection() {
            Collection<PdfMcr> collection = new ArrayList<PdfMcr>();
            collection.addAll(objRefs.values());
            collection.addAll(pageContentStreams.values());
            for (Map.Entry<PdfIndirectReference, TreeMap<Integer, PdfMcr>> entry : pageResourceXObjects.entrySet()) {
                collection.addAll(entry.getValue().values());
            }
            return collection;
        }
    }

}