// Pdf20Checker.java

/*
    This file is part of the iText (R) project.
    Copyright (c) 1998-2025 Apryse Group NV
    Authors: Apryse Software.

    This program is offered under a commercial and under the AGPL license.
    For commercial licensing, contact us at https://itextpdf.com/sales.  For AGPL licensing, see below.

    AGPL licensing:
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */
package com.itextpdf.kernel.validation;

import com.itextpdf.commons.utils.MessageFormatUtil;
import com.itextpdf.kernel.exceptions.KernelExceptionMessageConstant;
import com.itextpdf.kernel.exceptions.Pdf20ConformanceException;
import com.itextpdf.kernel.exceptions.PdfException;
import com.itextpdf.kernel.pdf.PdfCatalog;
import com.itextpdf.kernel.pdf.PdfDictionary;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfName;
import com.itextpdf.kernel.pdf.PdfObject;
import com.itextpdf.kernel.pdf.PdfStream;
import com.itextpdf.kernel.pdf.PdfString;
import com.itextpdf.kernel.pdf.tagging.IStructureNode;
import com.itextpdf.kernel.pdf.tagging.PdfNamespace;
import com.itextpdf.kernel.pdf.tagging.PdfStructElem;
import com.itextpdf.kernel.pdf.tagging.PdfStructTreeRoot;
import com.itextpdf.kernel.pdf.tagging.StandardNamespaces;
import com.itextpdf.kernel.pdf.tagutils.IRoleMappingResolver;
import com.itextpdf.kernel.pdf.tagutils.ITagTreeIteratorHandler;
import com.itextpdf.kernel.pdf.tagutils.PdfAllowedTagRelations;
import com.itextpdf.kernel.pdf.tagutils.TagStructureContext;
import com.itextpdf.kernel.pdf.tagutils.TagTreeIterator;
import com.itextpdf.kernel.utils.checkers.PdfCheckersUtil;
import com.itextpdf.kernel.validation.context.PdfDocumentValidationContext;
import com.itextpdf.kernel.xmp.XMPException;
import com.itextpdf.kernel.xmp.XMPMeta;

import java.util.function.Function;

/**
 * Validation checker that runs the checks defined in the PDF 2.0 standard. The standard that is followed is
 * the series of ISO 32000 specifications, starting from ISO 32000-2:2020.
 *
 * <p>
 * Only validation contexts of the {@code PDF_DOCUMENT} type are handled: for those the document catalog
 * ({@code Lang} and {@code Metadata} entries) and, for tagged documents, the structure tree are checked.
 * Detected violations are reported by throwing {@link Pdf20ConformanceException}.
 */
public class Pdf20Checker implements IValidationChecker {

    /** Factory handed to shared checker utilities so that they report failures as PDF 2.0 conformance errors. */
    private static final Function<String, PdfException> EXCEPTION_SUPPLIER = msg -> new Pdf20ConformanceException(msg);

    /** Shared table of allowed parent-child relations between structure roles. */
    private static final PdfAllowedTagRelations ALLOWED_TAG_RELATIONS = new PdfAllowedTagRelations();

    /** Tag structure context of the document passed to the constructor, or {@code null} if it is not tagged. */
    private final TagStructureContext tagStructureContext;

    /**
     * Creates new {@link Pdf20Checker} instance to validate PDF document against PDF 2.0 standard.
     *
     * @param pdfDocument {@link PdfDocument} to check
     */
    public Pdf20Checker(PdfDocument pdfDocument) {
        this.tagStructureContext = pdfDocument.isTagged() ? pdfDocument.getTagStructureContext() : null;
    }

    /**
     * Validates the passed context against the PDF 2.0 standard.
     *
     * <p>
     * Contexts of any type other than {@code PDF_DOCUMENT} are ignored.
     *
     * @param validationContext the context to validate
     */
    @Override
    public void validate(IValidationContext validationContext) {
        switch (validationContext.getType()) {
            case PDF_DOCUMENT:
                final PdfDocument pdfDocument =
                        ((PdfDocumentValidationContext) validationContext).getPdfDocument();
                checkCatalog(pdfDocument.getCatalog());
                checkStructureTreeRoot(pdfDocument.getStructTreeRoot());
                break;
            default:
                // Other context types are not covered by this checker.
                break;
        }
    }

    /**
     * Reports whether the object may be flushed; this checker never requires an object to be kept.
     *
     * @param object the object about to be flushed
     *
     * @return always {@code true}
     */
    @Override
    public boolean isPdfObjectReadyToFlush(PdfObject object) {
        return true;
    }

    /**
     * Checks that natural language is declared using the methods described in ISO 32000-2:2020, 14.9.2.
     *
     * <p>
     * The check is only performed when the {@code Lang} entry is a non-empty string.
     *
     * @param catalog {@link PdfCatalog} document catalog dictionary
     */
    void checkLang(PdfCatalog catalog) {
        PdfDictionary catalogDict = catalog.getPdfObject();
        PdfObject lang = catalogDict.get(PdfName.Lang);
        if (lang instanceof PdfString && !((PdfString) lang).getValue().isEmpty()) {
            PdfCheckersUtil.validateLang(catalogDict, EXCEPTION_SUPPLIER);
        }
    }

    /**
     * Checks that the value of the {@code Metadata} key from the {@code Catalog} dictionary of a conforming file
     * is a metadata stream as defined in ISO 32000-2:2020.
     *
     * <p>
     * Nothing is checked when the catalog has no {@code Metadata} entry.
     *
     * @param catalog {@link PdfCatalog} document catalog dictionary
     *
     * @throws Pdf20ConformanceException if the XMP metadata is missing or cannot be parsed, if the
     *                                   {@code Metadata} entry is not a stream, or if the stream does not have
     *                                   {@code Metadata} type and {@code XML} subtype
     */
    void checkMetadata(PdfCatalog catalog) {
        PdfDictionary catalogDict = catalog.getPdfObject();
        if (!catalogDict.containsKey(PdfName.Metadata)) {
            return;
        }
        try {
            XMPMeta metadata = catalog.getDocument().getXmpMetadata();
            if (metadata == null) {
                throw new Pdf20ConformanceException(
                        KernelExceptionMessageConstant.INVALID_METADATA_VALUE);
            }

            PdfStream pdfStream = catalogDict.getAsStream(PdfName.Metadata);
            if (pdfStream == null) {
                // The key is present but its value is not a stream: report a conformance error
                // instead of failing with a NullPointerException below.
                throw new Pdf20ConformanceException(
                        KernelExceptionMessageConstant.INVALID_METADATA_VALUE);
            }
            PdfName type = pdfStream.getAsName(PdfName.Type);
            PdfName subtype = pdfStream.getAsName(PdfName.Subtype);
            if (!PdfName.Metadata.equals(type) || !PdfName.XML.equals(subtype)) {
                throw new Pdf20ConformanceException(
                        KernelExceptionMessageConstant.METADATA_STREAM_REQUIRES_METADATA_TYPE_AND_XML_SUBTYPE);
            }
        } catch (XMPException e) {
            throw new Pdf20ConformanceException(
                    KernelExceptionMessageConstant.INVALID_METADATA_VALUE, e);
        }
    }

    /**
     * Validates document structure tree root dictionary against PDF 2.0 standard.
     *
     * <p>
     * Checks that all structure elements belong to, or are role mapped to (such role mapping may be transitive
     * through other namespaces), at least one of the following namespaces specified in ISO 32000-2:2020, 14.8.6:
     * <ul>
     * <li>the PDF 1.7 namespace;
     * <li>the PDF 2.0 namespace;
     * <li>the MathML namespace.
     * </ul>
     * A structure element with no explicit namespace may be present. Such a structure element shall have, after
     * any role mapping, a structure type matching one of the unique PDF 1.7 element types (the default standard
     * structure namespace in ISO 32000-2 is defined as the PDF 1.7 namespace).
     *
     * <p>
     * Additionally, the parent-child relations between the resolved roles are checked.
     * Nothing is checked when the document passed to the constructor is not tagged or when
     * {@code structTreeRoot} is {@code null}.
     *
     * @param structTreeRoot {@link PdfStructTreeRoot} to validate
     */
    void checkStructureTreeRoot(PdfStructTreeRoot structTreeRoot) {
        if (tagStructureContext == null || structTreeRoot == null) {
            return;
        }
        TagTreeIterator tagTreeIterator = new TagTreeIterator(structTreeRoot);
        tagTreeIterator.addHandler(new StructureTreeRootHandler(tagStructureContext));
        tagTreeIterator.addHandler(new ParentChildRelationshipHandler(tagStructureContext));
        tagTreeIterator.traverse();
    }

    /**
     * Validates document catalog dictionary against PDF 2.0 standard.
     *
     * <p>
     * For now, only {@code Metadata} and {@code Lang} are checked.
     *
     * @param catalog {@link PdfCatalog} document catalog dictionary to check
     */
    private void checkCatalog(PdfCatalog catalog) {
        checkLang(catalog);
        checkMetadata(catalog);
    }

    /**
     * Handler class that checks whether every parent-child pair of the structure tree is an allowed relation
     * between the resolved roles.
     */
    static final class ParentChildRelationshipHandler implements ITagTreeIteratorHandler {
        private final TagStructureContext tagStructureContext;

        /**
         * Creates new {@link ParentChildRelationshipHandler} instance.
         *
         * @param context {@link TagStructureContext} used to resolve role mappings
         */
        public ParentChildRelationshipHandler(TagStructureContext context) {
            this.tagStructureContext = context;
        }

        /**
         * Throws the conformance exception describing a forbidden parent-child relation.
         *
         * @param parentRole role of the parent to be reported
         * @param childRole role of the child to be reported
         */
        private static void throwInvalidRelationshipException(String parentRole, String childRole) {
            throw new Pdf20ConformanceException(MessageFormatUtil.format(
                    KernelExceptionMessageConstant.PARENT_CHILD_ROLE_RELATION_IS_NOT_ALLOWED,
                    parentRole, childRole));
        }

        /**
         * Resolves the role of the element through role mapping.
         *
         * @param elem structure element whose role is resolved
         *
         * @return the resolved role, or {@code null} if the role cannot be resolved or is resolved into
         * the MathML namespace (such elements are excluded from the relationship check)
         */
        private String resolveRole(PdfStructElem elem) {
            final IRoleMappingResolver resolver = tagStructureContext
                    .resolveMappingToStandardOrDomainSpecificRole(elem.getRole().getValue(), elem.getNamespace());
            if (resolver == null) {
                return null;
            }
            final PdfNamespace resolvedNamespace = resolver.getNamespace();
            if (resolvedNamespace != null
                    && StandardNamespaces.MATH_ML.equals(resolvedNamespace.getNamespaceName())) {
                return null;
            }
            return resolver.getRole();
        }

        /**
         * Accepts every non-null node.
         *
         * @param node the node to test
         *
         * @return {@code true} if the node is not {@code null}
         */
        @Override
        public boolean accept(IStructureNode node) {
            return node != null;
        }

        /**
         * Checks the relations between the passed node and each of its direct kids.
         *
         * <p>
         * Only structure elements and the structure tree root are processed as parents. Kids that are structure
         * elements are checked by their resolved role, any other non-null kid is treated as content.
         *
         * @param elem the node whose kids are checked
         *
         * @throws Pdf20ConformanceException if a kid is not allowed inside the parent
         */
        @Override
        public void processElement(IStructureNode elem) {
            if (!(elem instanceof PdfStructElem) && !(elem instanceof PdfStructTreeRoot)) {
                return;
            }
            String parentRole = elem instanceof PdfStructElem ?
                    resolveRole((PdfStructElem) elem) : PdfName.StructTreeRoot.getValue();
            if (parentRole == null) {
                return;
            }
            for (IStructureNode kid : elem.getKids()) {
                // NOTE(review): a null kid presumably stands for an already flushed kid (see the getKids()
                // contract); it cannot be inspected and must not be mistaken for content.
                if (kid == null || kid instanceof PdfStructTreeRoot) {
                    continue;
                }
                if (kid instanceof PdfStructElem) {
                    final String childRole = resolveRole((PdfStructElem) kid);
                    if (childRole == null) {
                        continue;
                    }
                    if (!ALLOWED_TAG_RELATIONS.isRelationAllowed(parentRole, childRole)) {
                        // The role declared on the kid (not the resolved one) is reported.
                        throwInvalidRelationshipException(parentRole, kid.getRole().getValue());
                    }
                } else if (!ALLOWED_TAG_RELATIONS.isContentAllowedInRole(parentRole)) {
                    throwInvalidRelationshipException(parentRole, PdfAllowedTagRelations.ACTUAL_CONTENT);
                }
            }
        }
    }


    /**
     * Handler class that checks structure nodes while traversing the document structure tree.
     */
    private static final class StructureTreeRootHandler implements ITagTreeIteratorHandler {
        private final TagStructureContext tagStructureContext;

        /**
         * Creates new {@link StructureTreeRootHandler} instance.
         *
         * @param tagStructureContext {@link TagStructureContext} of the current tagged document
         */
        public StructureTreeRootHandler(TagStructureContext tagStructureContext) {
            this.tagStructureContext = tagStructureContext;
        }

        /**
         * Accepts every non-null node.
         *
         * @param node the node to test
         *
         * @return {@code true} if the node is not {@code null}
         */
        @Override
        public boolean accept(IStructureNode node) {
            return node != null;
        }

        /**
         * Checks that the role of the structure element passes the role mapping check of the tag structure
         * context. Nodes that are not structure elements are ignored.
         *
         * @param elem the node to check
         *
         * @throws Pdf20ConformanceException if the role check fails
         */
        @Override
        public void processElement(IStructureNode elem) {
            if (!(elem instanceof PdfStructElem)) {
                return;
            }
            PdfStructElem structElem = (PdfStructElem) elem;
            String role = structElem.getRole().getValue();
            PdfNamespace namespace = structElem.getNamespace();
            if (tagStructureContext.checkIfRoleShallBeMappedToStandardRole(role, namespace)) {
                return;
            }
            // The message template depends on whether the element declares an explicit namespace.
            final boolean hasNamespace = namespace != null;
            final String messageTemplate = hasNamespace ?
                    KernelExceptionMessageConstant.ROLE_IN_NAMESPACE_IS_NOT_MAPPED_TO_ANY_STANDARD_ROLE :
                    KernelExceptionMessageConstant.ROLE_IS_NOT_MAPPED_TO_ANY_STANDARD_ROLE;
            final String namespaceName = hasNamespace ? namespace.getNamespaceName() : null;
            throw new Pdf20ConformanceException(MessageFormatUtil.format(messageTemplate, role, namespaceName));
        }
    }
}