PdfStructTreeRoot.java

/*
    This file is part of the iText (R) project.
    Copyright (c) 1998-2025 Apryse Group NV
    Authors: Apryse Software.

    This program is offered under a commercial and under the AGPL license.
    For commercial licensing, contact us at https://itextpdf.com/sales.  For AGPL licensing, see below.

    AGPL licensing:
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */
package com.itextpdf.kernel.pdf.tagging;

import com.itextpdf.commons.utils.MessageFormatUtil;
import com.itextpdf.io.logs.IoLogMessageConstant;
import com.itextpdf.kernel.exceptions.KernelExceptionMessageConstant;
import com.itextpdf.kernel.exceptions.PdfException;
import com.itextpdf.kernel.pdf.PdfArray;
import com.itextpdf.kernel.pdf.PdfDictionary;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfName;
import com.itextpdf.kernel.pdf.PdfNumber;
import com.itextpdf.kernel.pdf.PdfObject;
import com.itextpdf.kernel.pdf.PdfObjectWrapper;
import com.itextpdf.kernel.pdf.PdfPage;
import com.itextpdf.kernel.pdf.PdfVersion;
import com.itextpdf.kernel.pdf.VersionConforming;
import com.itextpdf.kernel.pdf.filespec.PdfFileSpec;

import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import com.itextpdf.kernel.pdf.tagutils.TagTreeIterator;
import com.itextpdf.kernel.pdf.tagutils.TagTreeIteratorFlusher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Represents a wrapper-class for structure tree root dictionary. See ISO-32000-1 "14.7.2 Structure hierarchy".
 */
public class PdfStructTreeRoot extends PdfObjectWrapper<PdfDictionary> implements IStructureNode {

    private static final Logger LOGGER = LoggerFactory.getLogger(PdfStructTreeRoot.class);

    /**
     * Cache of {@link PdfName} instances created for roles that are not present in {@code PdfName.staticNames}.
     */
    private static final ConcurrentHashMap<String, PdfName> staticRoleNames = new ConcurrentHashMap<>();

    private final PdfDocument document;
    private final ParentTreeHandler parentTreeHandler;
    // Lazily initialised by getIdTree(); stays null until the ID tree is first requested.
    private PdfStructIdTree idTree;

    /**
     * Creates a new structure tree root instance, this initializes empty logical structure in the document.
     * This class also handles global state of parent tree, so it's not expected to create multiple instances
     * of this class. Instead, use {@link PdfDocument#getStructTreeRoot()}.
     *
     * @param document a document to which new instance of struct tree root will be bound
     */
    public PdfStructTreeRoot(PdfDocument document) {
        this((PdfDictionary) new PdfDictionary().makeIndirect(document), document);
        getPdfObject().put(PdfName.Type, PdfName.StructTreeRoot);
    }

    /**
     * Creates wrapper instance for already existing logical structure tree root in the document.
     * This class also handles global state of parent tree, so it's not expected to create multiple instances
     * of this class. Instead, use {@link PdfDocument#getStructTreeRoot()}.
     *
     * @param structTreeRootDict a dictionary that defines document structure tree root
     * @param document a document, which contains given structure tree root dictionary; if {@code null},
     *                 the document is taken from the indirect reference of {@code structTreeRootDict}
     */
    public PdfStructTreeRoot(PdfDictionary structTreeRootDict, PdfDocument document) {
        super(structTreeRootDict);
        PdfDocument boundDocument = document;
        if (boundDocument == null) {
            ensureObjectIsAddedToDocument(structTreeRootDict);
            boundDocument = structTreeRootDict.getIndirectReference().getDocument();
        }
        this.document = boundDocument;
        setForbidRelease();
        parentTreeHandler = new ParentTreeHandler(this);

        // Always init role map dictionary in order to avoid inconsistency, because
        // iText often initializes it during role mapping resolution anyway.
        // In future, better way might be to not write it to the document needlessly
        // and avoid possible redundant modifications in append mode.
        getRoleMap();
    }

    /**
     * Converts a role given as a string into a {@link PdfName}.
     * <p>
     * The name is looked up in {@code PdfName.staticNames} first, then in the internal role cache;
     * if neither contains it, a new {@link PdfName} is created and cached.
     *
     * @param role the role to convert
     * @return the {@link PdfName} corresponding to the given role
     */
    public static PdfName convertRoleToPdfName(String role) {
        PdfName name = PdfName.staticNames.get(role);
        if (name != null) {
            return name;
        }
        name = staticRoleNames.get(role);
        if (name != null) {
            return name;
        }
        PdfName created = new PdfName(role);
        // putIfAbsent keeps the first instance registered for a role, so threads racing
        // on the same role all receive the same cached object.
        PdfName alreadyCached = staticRoleNames.putIfAbsent(role, created);
        return alreadyCached != null ? alreadyCached : created;
    }

    /**
     * Adds the structure element as the last kid of the structure tree root.
     *
     * @param structElem the structure element to add
     * @return the passed {@code structElem}
     */
    public PdfStructElem addKid(PdfStructElem structElem) {
        return addKid(-1, structElem);
    }

    /**
     * Adds the structure element to the kids of the structure tree root at the given position.
     *
     * @param index      position in the kids array at which to insert; {@code -1} appends to the end
     * @param structElem the structure element to add
     * @return the passed {@code structElem}
     */
    public PdfStructElem addKid(int index, PdfStructElem structElem) {
        addKidObject(index, structElem.getPdfObject());
        return structElem;
    }

    /**
     * The structure tree root is the top of the hierarchy and therefore has no parent.
     *
     * @return always {@code null}
     */
    @Override
    public IStructureNode getParent() {
        return null;
    }

    /**
     * Gets list of the direct kids of StructTreeRoot.
     * If certain kid is flushed, there will be a {@code null} in the list on its place.
     * Kids that are not structure element dictionaries are not included in the list.
     *
     * @return list of the direct kids of StructTreeRoot.
     */
    @Override
    public List<IStructureNode> getKids() {
        PdfObject k = getPdfObject().get(PdfName.K);
        List<IStructureNode> kids = new ArrayList<>();

        if (k != null) {
            if (k.isArray()) {
                PdfArray a = (PdfArray) k;
                for (int i = 0; i < a.size(); i++) {
                    ifKidIsStructElementAddToList(a.get(i), kids);
                }
            } else {
                // A single kid may be stored directly as the /K value instead of inside an array.
                ifKidIsStructElementAddToList(k, kids);
            }
        }
        return kids;
    }

    /**
     * Gets the /K entry of the structure tree root as an array.
     * <p>
     * If the entry is absent or is not an array, a new array is created, stored under /K and the
     * structure tree root is marked as modified; a previous non-array value becomes the only
     * element of the new array.
     *
     * @return the {@link PdfArray} of kids, never {@code null}
     */
    public PdfArray getKidsObject() {
        PdfArray k = null;
        PdfObject kObj = getPdfObject().get(PdfName.K);
        if (kObj != null && kObj.isArray()) {
            k = (PdfArray) kObj;
        }
        if (k == null) {
            k = new PdfArray();
            getPdfObject().put(PdfName.K, k);
            setModified();
            if (kObj != null) {
                k.add(kObj);
            }
        }
        return k;
    }

    /**
     * Adds a role mapping to the role map of the structure tree root.
     * If the role was already mapped to a name, the previous mapping is overwritten and a warning is logged.
     *
     * @param fromRole the role to be mapped
     * @param toRole   the role that {@code fromRole} is mapped to
     */
    public void addRoleMapping(String fromRole, String toRole) {
        PdfDictionary roleMap = getRoleMap();
        PdfObject prevVal = roleMap.put(convertRoleToPdfName(fromRole), convertRoleToPdfName(toRole));
        if (prevVal instanceof PdfName) {
            LOGGER.warn(MessageFormat.format(IoLogMessageConstant.MAPPING_IN_STRUCT_ROOT_OVERWRITTEN, fromRole, prevVal,
                    toRole));
        }

        // The change lives in the role map itself when it is an indirect object,
        // otherwise in the structure tree root dictionary that embeds it.
        if (roleMap.isIndirect()) {
            roleMap.setModified();
        } else {
            setModified();
        }
    }

    /**
     * Gets the role map dictionary of the structure tree root.
     * If there is none, an empty dictionary is created, stored under /RoleMap and the
     * structure tree root is marked as modified.
     *
     * @return the role map dictionary, never {@code null}
     */
    public PdfDictionary getRoleMap() {
        PdfDictionary roleMap = getPdfObject().getAsDictionary(PdfName.RoleMap);
        if (roleMap == null) {
            roleMap = new PdfDictionary();
            getPdfObject().put(PdfName.RoleMap, roleMap);
            setModified();
        }
        return roleMap;
    }

    /**
     * Gets namespaces used within the document. Essentially this method returns value of {@link #getNamespacesObject()}
     * wrapped in the {@link PdfNamespace} and {@link List} classes. Therefore limitations of the referred method are
     * applied to this method too. Unlike the referred method, this one does not create the /Namespaces entry
     * when it is absent.
     *
     * @return a {@link List} of {@link PdfNamespace}s used within the document.
     */
    public List<PdfNamespace> getNamespaces() {
        PdfArray namespacesArray = getPdfObject().getAsArray(PdfName.Namespaces);
        if (namespacesArray == null) {
            return Collections.<PdfNamespace>emptyList();
        } else {
            List<PdfNamespace> namespacesList = new ArrayList<>(namespacesArray.size());
            for (int i = 0; i < namespacesArray.size(); ++i) {
                namespacesList.add(new PdfNamespace(namespacesArray.getAsDictionary(i)));
            }
            return namespacesList;
        }
    }

    /**
     * Adds a {@link PdfNamespace} to the list of the namespaces used within the document.
     * <p>
     * This value has meaning only for the PDF documents of version <b>2.0 and higher</b>.
     *
     * @param namespace a {@link PdfNamespace} to be added.
     */
    public void addNamespace(PdfNamespace namespace) {
        getNamespacesObject().add(namespace.getPdfObject());
        setModified();
    }

    /**
     * An array of namespaces used within the document. This value, however, is not automatically updated while
     * the document is processed. It identifies only the namespaces that were in the document at the moment of its
     * opening.
     * <p>
     * If the entry is absent, an empty array is created and stored under /Namespaces.
     *
     * @return {@link PdfArray} of namespaces used within the document.
     */
    public PdfArray getNamespacesObject() {
        PdfArray namespacesArray = getPdfObject().getAsArray(PdfName.Namespaces);
        if (namespacesArray == null) {
            namespacesArray = new PdfArray();
            VersionConforming.validatePdfVersionForDictEntry(getDocument(), PdfVersion.PDF_2_0, PdfName.Namespaces, PdfName.StructTreeRoot);
            getPdfObject().put(PdfName.Namespaces, namespacesArray);
            setModified();
        }
        return namespacesArray;
    }

    /**
     * A {@link List} containing one or more {@link PdfFileSpec} objects, where each specified file
     * is a pronunciation lexicon, which is an XML file conforming to the Pronunciation Lexicon Specification (PLS) Version 1.0.
     * These pronunciation lexicons may be used as pronunciation hints when the document's content is presented via
     * text-to-speech. Where two or more pronunciation lexicons apply to the same text, the first match - as defined by
     * the order of entries in the array and the order of entries inside the pronunciation lexicon file - should be used.
     * <p>
     * See ISO 32000-2 14.9.6, "Pronunciation hints".
     *
     * @return A {@link List} containing one or more {@link PdfFileSpec}; an empty list if the
     * /PronunciationLexicon entry is absent.
     */
    public List<PdfFileSpec> getPronunciationLexiconsList() {
        PdfArray pronunciationLexicons = getPdfObject().getAsArray(PdfName.PronunciationLexicon);
        if (pronunciationLexicons == null) {
            return Collections.<PdfFileSpec>emptyList();
        } else {
            List<PdfFileSpec> lexiconsList = new ArrayList<>(pronunciationLexicons.size());
            for (int i = 0; i < pronunciationLexicons.size(); ++i) {
                lexiconsList.add(PdfFileSpec.wrapFileSpecObject(pronunciationLexicons.get(i)));
            }
            return lexiconsList;
        }
    }

    /**
     * Adds a single  {@link PdfFileSpec} object, which specifies XML file conforming to PLS.
     * For more info see {@link #getPronunciationLexiconsList()}.
     * <p>
     * This value has meaning only for the PDF documents of version <b>2.0 and higher</b>.
     *
     * @param pronunciationLexiconFileSpec a {@link PdfFileSpec} object, which specifies XML file conforming to PLS.
     */
    public void addPronunciationLexicon(PdfFileSpec pronunciationLexiconFileSpec) {
        PdfArray pronunciationLexicons = getPdfObject().getAsArray(PdfName.PronunciationLexicon);
        if (pronunciationLexicons == null) {
            pronunciationLexicons = new PdfArray();
            VersionConforming.validatePdfVersionForDictEntry(getDocument(), PdfVersion.PDF_2_0, PdfName.PronunciationLexicon, PdfName.StructTreeRoot);
            getPdfObject().put(PdfName.PronunciationLexicon, pronunciationLexicons);
        }
        pronunciationLexicons.add(pronunciationLexiconFileSpec.getPdfObject());
        setModified();
    }

    /**
     * Creates and flushes parent tree entry for the page.
     * Effectively this means that new content mustn't be added to the page.
     *
     * @param page {@link PdfPage} for which to create parent tree entry. Typically this page is flushed after this call.
     */
    public void createParentTreeEntryForPage(PdfPage page) {
        getParentTreeHandler().createParentTreeEntryForPage(page);
    }

    /**
     * Delegates to the parent tree handler of this structure tree root, which saves
     * the struct parent index of the page if it is needed.
     *
     * @param page {@link PdfPage} whose struct parent index is to be saved
     */
    public void savePageStructParentIndexIfNeeded(PdfPage page) {
        getParentTreeHandler().savePageStructParentIndexIfNeeded(page);
    }

    /**
     * Gets an unmodifiable collection of marked content references on page.
     *
     * NOTE: Do not remove tags when iterating over returned collection, this could
     * lead to the ConcurrentModificationException, because returned collection is backed by the internal list of the
     * actual page tags.
     *
     * @param page {@link PdfPage} to obtain unmodifiable collection of marked content references
     * @return the unmodifiable collection of marked content references on page, if no Mcrs defined returns null
     */
    public Collection<PdfMcr> getPageMarkedContentReferences(PdfPage page) {
        ParentTreeHandler.PageMcrsContainer pageMcrs = getParentTreeHandler().getPageMarkedContentReferences(page);
        return pageMcrs != null ? Collections.unmodifiableCollection(pageMcrs.getAllMcrsAsCollection()) : null;
    }

    /**
     * Looks up a marked content reference by its MCID via the parent tree handler.
     *
     * @param pageDict the dictionary of the page to search on
     * @param mcid     the marked content identifier to look for
     * @return the result of the parent tree handler lookup
     */
    public PdfMcr findMcrByMcid(PdfDictionary pageDict, int mcid) {
        return getParentTreeHandler().findMcrByMcid(pageDict, mcid);
    }

    /**
     * Looks up an object reference by its struct parent index via the parent tree handler.
     *
     * @param pageDict          the dictionary of the page to search on
     * @param structParentIndex the struct parent index to look for
     * @return the result of the parent tree handler lookup
     */
    public PdfObjRef findObjRefByStructParentIndex(PdfDictionary pageDict, int structParentIndex) {
        return getParentTreeHandler().findObjRefByStructParentIndex(pageDict, structParentIndex);
    }

    /**
     * The structure tree root has no role.
     *
     * @return always {@code null}
     */
    @Override
    public PdfName getRole() {
        return null;
    }

    /**
     * Flushes the structure tree root.
     * <p>
     * Before the dictionary itself is flushed, parent tree entries are created for every page of the
     * document, the /ParentTree and /ParentTreeNextKey entries are written, the /IDTree entry is written
     * if the ID tree has been requested and is modified, and, unless the document is in append mode,
     * all kids are flushed.
     */
    @Override
    public void flush() {
        for (int i = 0; i < getDocument().getNumberOfPages(); ++i) {
            createParentTreeEntryForPage(getDocument().getPage(i + 1));
        }
        getPdfObject().put(PdfName.ParentTree, getParentTreeHandler().buildParentTree());
        getPdfObject().put(PdfName.ParentTreeNextKey, new PdfNumber((int) getDocument().getNextStructParentIndex()));
        if (this.idTree != null && this.idTree.isModified()) {
            getPdfObject().put(PdfName.IDTree, this.idTree.buildTree().makeIndirect(getDocument()));
        }
        if (!getDocument().isAppendMode()) {
            PdfStructTreeRoot.flushAllKids(this);
        }
        super.flush();
    }

    /**
     * Copies structure to a {@code destDocument}.
     *
     * NOTE: Works only for {@link PdfStructTreeRoot} that is read from the document opened in reading mode,
     * otherwise an exception is thrown.
     *
     * @param destDocument document to copy structure to. Shall not be current document.
     * @param page2page    association between original page and copied page.
     */
    public void copyTo(PdfDocument destDocument, Map<PdfPage, PdfPage> page2page) {
        StructureTreeCopier.copyTo(destDocument, page2page, getDocument());
    }

    /**
     * Copies structure to a {@code destDocument} and insert it in a specified position in the document.
     *
     * NOTE: Works only for {@link PdfStructTreeRoot} that is read from the document opened in reading mode,
     * otherwise an exception is thrown.
     *
     * @param destDocument     document to copy structure to.
     * @param insertBeforePage indicates where the structure to be inserted.
     * @param page2page        association between original page and copied page.
     */
    public void copyTo(PdfDocument destDocument, int insertBeforePage, Map<PdfPage, PdfPage> page2page) {
        StructureTreeCopier.copyTo(destDocument, insertBeforePage, page2page, getDocument());
    }

    /**
     * Moves structure associated with specified page and insert it in a specified position in the document.
     * <p>
     * NOTE: Works only for document with not flushed pages.
     *
     * @param fromPage page which tag structure will be moved
     * @param insertBeforePage indicates before tags of which page tag structure will be moved to
     * @throws PdfException if any page of the document is already flushed
     */
    public void move(PdfPage fromPage, int insertBeforePage) {
        for (int i = 1; i <= getDocument().getNumberOfPages(); ++i) {
            if (getDocument().getPage(i).isFlushed()) {
                throw new PdfException(MessageFormatUtil.format(
                        KernelExceptionMessageConstant.CANNOT_MOVE_PAGES_IN_PARTLY_FLUSHED_DOCUMENT, i));
            }
        }
        StructureTreeCopier.move(getDocument(), fromPage, insertBeforePage);
    }

    /**
     * Gets the value of the /ParentTreeNextKey entry of the structure tree root dictionary.
     *
     * @return the /ParentTreeNextKey value as an int
     */
    public int getParentTreeNextKey() {
        // /ParentTreeNextKey entry is always inited on ParentTreeHandler initialization
        return getPdfObject().getAsNumber(PdfName.ParentTreeNextKey).intValue();
    }

    /**
     * Gets the next MCID for the page, as provided by the parent tree handler.
     *
     * @param page the {@link PdfPage} for which the next MCID is requested
     * @return the next MCID for the page
     */
    public int getNextMcidForPage(PdfPage page) {
        return getParentTreeHandler().getNextMcidForPage(page);
    }

    /**
     * Gets the document this structure tree root is bound to.
     *
     * @return the {@link PdfDocument} of this structure tree root
     */
    public PdfDocument getDocument() {
        return document;
    }

    /**
     * Adds file associated with structure tree root and identifies the relationship between them.
     * <p>
     * Associated files may be used in Pdf/A-3 and Pdf 2.0 documents.
     * The method adds file to array value of the AF key in the structure tree root dictionary.
     * If description is provided, it also will add file description to catalog Names tree.
     * <p>
     * For associated files their associated file specification dictionaries shall include the AFRelationship key;
     * if the key is missing, an error is logged and the file is added anyway.
     *
     * @param description the file description
     * @param fs          file specification dictionary of associated file
     */
    public void addAssociatedFile(String description, PdfFileSpec fs) {
        if (null == ((PdfDictionary) fs.getPdfObject()).get(PdfName.AFRelationship)) {
            LOGGER.error(IoLogMessageConstant.ASSOCIATED_FILE_SPEC_SHALL_INCLUDE_AFRELATIONSHIP);
        }
        if (null != description) {
            getDocument().getCatalog().getNameTree(PdfName.EmbeddedFiles).addEntry(description, fs.getPdfObject());
        }
        PdfArray afArray = getAssociatedFiles(true);
        afArray.add(fs.getPdfObject());

        // Mark the object that actually holds the change as modified, mirroring addRoleMapping():
        // the array itself when it is indirect, the structure tree root dictionary otherwise.
        if (afArray.isIndirect()) {
            afArray.setModified();
        } else {
            setModified();
        }
    }

    /**
     * <p>
     * Adds file associated with structure tree root and identifies the relationship between them.
     * <p>
     * Associated files may be used in Pdf/A-3 and Pdf 2.0 documents.
     * The method adds file to array value of the AF key in the structure tree root dictionary.
     * <p>
     * For associated files their associated file specification dictionaries shall include the AFRelationship key
     *
     * @param fs file specification dictionary of associated file
     */
    public void addAssociatedFile(PdfFileSpec fs) {
        addAssociatedFile(null, fs);
    }

    /**
     * Returns files associated with structure tree root.
     *
     * @param create defines whether AF arrays will be created if it doesn't exist
     * @return associated files array, or {@code null} if it doesn't exist and {@code create} is {@code false}
     */
    public PdfArray getAssociatedFiles(boolean create) {
        PdfArray afArray = getPdfObject().getAsArray(PdfName.AF);
        if (afArray == null && create) {
            afArray = new PdfArray();
            getPdfObject().put(PdfName.AF, afArray);
            // A new entry was written into the dictionary, so record the modification
            // like the other lazily created entries of this class do.
            setModified();
        }
        return afArray;
    }

    /**
     * Returns the document's structure element ID tree wrapped in a {@link PdfStructIdTree}
     * object. If no such tree exists, it is initialized. The initialization happens lazily,
     * and does not trigger any PDF object changes unless populated.
     *
     * @return the {@link PdfStructIdTree} of the document
     */
    public PdfStructIdTree getIdTree() {
        if (this.idTree == null) {
            // Attempt to parse the ID tree in the document if there is one
            PdfDictionary idTreeDict = this.getPdfObject().getAsDictionary(PdfName.IDTree);
            if (idTreeDict == null) {
                // No tree found -> initialise one
                // Don't call setModified() here, registering the first ID will
                // take care of that for us.
                // The ID tree will be registered at flush time.
                this.idTree = new PdfStructIdTree(document);
            } else {
                this.idTree = PdfStructIdTree.readFromDictionary(document, idTreeDict);
            }
        }
        return this.idTree;
    }

    /**
     * Gets the parent tree handler created for this structure tree root.
     *
     * @return the {@link ParentTreeHandler} of this structure tree root
     */
    ParentTreeHandler getParentTreeHandler() {
        return parentTreeHandler;
    }

    /**
     * Adds the dictionary to the kids array of the structure tree root and marks the root as modified.
     * If the dictionary is a structure element, its /P entry is set to the structure tree root dictionary.
     *
     * @param index      position in the kids array at which to insert; {@code -1} appends to the end
     * @param structElem the dictionary to add as a kid
     * @throws PdfException if {@code structElem} is a structure element and the structure tree root
     *                      dictionary has no indirect reference
     */
    void addKidObject(int index, PdfDictionary structElem) {
        boolean kidIsStructElem = PdfStructElem.isStructElem(structElem);
        // Validate before touching the kids array so that a rejected call leaves it unchanged.
        if (kidIsStructElem && getPdfObject().getIndirectReference() == null) {
            throw new PdfException(
                    KernelExceptionMessageConstant.STRUCTURE_ELEMENT_DICTIONARY_SHALL_BE_AN_INDIRECT_OBJECT_IN_ORDER_TO_HAVE_CHILDREN);
        }
        if (index == -1) {
            getKidsObject().add(structElem);
        } else {
            getKidsObject().add(index, structElem);
        }
        if (kidIsStructElem) {
            structElem.put(PdfName.P, getPdfObject());
        }
        setModified();
    }

    @Override
    protected boolean isWrappedObjectMustBeIndirect() {
        return true;
    }

    /**
     * Traverses the tag tree starting at the given root in post-order with a {@link TagTreeIteratorFlusher}
     * handler attached.
     *
     * @param elem the structure tree root whose kids are to be flushed
     */
    private static void flushAllKids(PdfStructTreeRoot elem) {
        TagTreeIterator iterator = new TagTreeIterator(elem, TagTreeIterator.TreeTraversalOrder.POST_ORDER);
        iterator.addHandler(new TagTreeIteratorFlusher());
        iterator.traverse();
    }

    /**
     * Appends a representation of the kid to the list: {@code null} for a flushed kid, a
     * {@link PdfStructElem} wrapper for a structure element dictionary, and nothing for anything else.
     *
     * @param kid  the kid object taken from the /K entry
     * @param kids the list to append to
     */
    private void ifKidIsStructElementAddToList(PdfObject kid, List<IStructureNode> kids) {
        if (kid.isFlushed()) {
            kids.add(null);
        } else if (kid.isDictionary() && PdfStructElem.isStructElem((PdfDictionary) kid)) {
            kids.add(new PdfStructElem((PdfDictionary) kid));
        }
    }
}