JempboxExtractor.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.xmp;

import static java.nio.charset.StandardCharsets.UTF_8;

import java.io.IOException;
import java.io.InputStream;
import java.util.Calendar;
import java.util.List;
import java.util.StringJoiner;

import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
import org.apache.jempbox.xmp.ResourceEvent;
import org.apache.jempbox.xmp.ResourceRef;
import org.apache.jempbox.xmp.XMPMetadata;
import org.apache.jempbox.xmp.XMPSchemaDublinCore;
import org.apache.jempbox.xmp.XMPSchemaMediaManagement;
import org.w3c.dom.Document;
import org.xml.sax.SAXException;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.XMPMM;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.utils.DateUtils;
import org.apache.tika.utils.XMLReaderUtils;

/**
 * Pulls an XMP packet out of a stream and copies selected Dublin Core and
 * XMP Media Management (XMPMM) values into a Tika {@link Metadata} object.
 * <p>
 * Extraction is best effort: problems while reading individual schemas or
 * properties are swallowed so that whatever can be read is still reported.
 */
public class JempboxExtractor {

    //TODO: change signature to require parsecontext from parse
    private static final ParseContext EMPTY_PARSE_CONTEXT = new ParseContext();
    // The XMP spec says it must be unicode, but for most file formats it specifies
    // "must be encoded in UTF-8"
    private static final String DEFAULT_XMP_CHARSET = UTF_8.name();
    private static volatile int MAX_EVENT_HISTORY_IN_XMPMM = 1024;
    private final XMPPacketScanner scanner = new XMPPacketScanner();
    private final Metadata metadata;

    /**
     * @param metadata Tika metadata object that {@link #parse(InputStream)} writes to
     */
    public JempboxExtractor(Metadata metadata) {
        this.metadata = metadata;
    }

    /**
     * Tries to extract Dublin Core schema from XMP.  If XMPMetadata is null
     * or if the DC schema is null, this will return without throwing an exception.
     * <p>
     * Title, description and creator overwrite any existing value (multiple
     * creators are joined into one string); subjects are appended one by one.
     *
     * @param xmpMetadata XMPMetadata to process
     * @param metadata    Tika's metadata to write to
     */
    public static void extractDublinCore(XMPMetadata xmpMetadata, Metadata metadata) {
        if (xmpMetadata == null) {
            return;
        }
        XMPSchemaDublinCore dc = null;
        try {
            dc = xmpMetadata.getDublinCoreSchema();
        } catch (IOException e) {
            //swallow; an unreadable schema is handled like a missing one below
        }
        if (dc == null) {
            return;
        }
        // Read each property once instead of once for the check and once for the write.
        String title = dc.getTitle();
        if (title != null) {
            metadata.set(TikaCoreProperties.TITLE, title);
        }
        String description = dc.getDescription();
        if (description != null) {
            metadata.set(TikaCoreProperties.DESCRIPTION, description);
        }
        List<String> creators = dc.getCreators();
        if (creators != null && !creators.isEmpty()) {
            metadata.set(TikaCoreProperties.CREATOR, joinCreators(creators));
        }
        List<String> subjects = dc.getSubjects();
        if (subjects != null && !subjects.isEmpty()) {
            for (String keyword : subjects) {
                metadata.add(TikaCoreProperties.SUBJECT, keyword);
            }
            // TODO should we set SUBJECT too?
            // All tested photo managers set the same in Iptc.Application2.Keywords
            // and Xmp.dc.subject
        }
    }

    /**
     * Joins creator names into a single string separated by {@code ", "}.
     *
     * @param creators creator names; may be null or empty
     * @return the empty string for a null or empty list, the sole element
     * unchanged for a one-element list, otherwise the joined names
     */
    protected static String joinCreators(List<String> creators) {
        if (creators == null || creators.isEmpty()) {
            return "";
        }
        if (creators.size() == 1) {
            // returned as-is, so a single null entry stays null rather than becoming "null"
            return creators.get(0);
        }
        StringJoiner stringJoiner = new StringJoiner(", ");
        for (String s : creators) {
            stringJoiner.add(s);
        }
        return stringJoiner.toString();
    }

    /**
     * Extracts Media Management metadata from XMP: the document and instance
     * ids, the ids of the DerivedFrom resource, and the event history (capped
     * at {@link #getMaxXMPMMHistory()} events).
     * <p>
     * Silently swallows exceptions.
     *
     * @param xmp      XMP metadata to read from; if null, nothing is done
     * @param metadata Tika's metadata to write to
     */
    public static void extractXMPMM(XMPMetadata xmp, Metadata metadata) {
        if (xmp == null) {
            return;
        }
        XMPSchemaMediaManagement mmSchema;
        try {
            mmSchema = xmp.getMediaManagementSchema();
        } catch (IOException e) {
            //swallow
            return;
        }
        if (mmSchema == null) {
            return;
        }
        addMetadata(metadata, XMPMM.DOCUMENTID, mmSchema.getDocumentID());
        // not currently supported by JempBox...
        // but might be in 1.8.18 if ever released, see PDFBOX-6116
        // until then use workaround (won't work if non standard prefix is used)
        String instanceId = mmSchema.getTextProperty("xmpMM:InstanceID");
        // Skip the write when the property is absent rather than handing a null
        // to Metadata.set; the other ids here are null-guarded the same way.
        if (instanceId != null) {
            metadata.set(XMPMM.INSTANCEID, instanceId);
        }

        extractDerivedFrom(mmSchema.getDerivedFrom(), metadata);
        extractHistory(mmSchema.getHistory(), metadata);
    }

    /**
     * Copies the document id and instance id of the DerivedFrom resource, if any.
     */
    private static void extractDerivedFrom(ResourceRef derivedFrom, Metadata metadata) {
        if (derivedFrom == null) {
            return;
        }
        // Separate try blocks: a failure on one id must not prevent reading the other.
        try {
            addMetadata(metadata, XMPMM.DERIVED_FROM_DOCUMENTID, derivedFrom.getDocumentID());
        } catch (NullPointerException e) {
            //swallow
            // NPE fixed in PDFBOX-5984; NPE catch can be removed if Jempbox 1.8.18 is released
        }

        try {
            addMetadata(metadata, XMPMM.DERIVED_FROM_INSTANCEID, derivedFrom.getInstanceID());
        } catch (NullPointerException e) {
            //swallow
            // NPE fixed in PDFBOX-5984; NPE catch can be removed if Jempbox 1.8.18 is released
        }

        //TODO: not yet supported by XMPBox...extract OriginalDocumentID
        //in DerivedFrom section
    }

    /**
     * Adds history events until the list is exhausted or the configured
     * maximum number of events has been added.
     */
    private static void extractHistory(List<ResourceEvent> history, Metadata metadata) {
        if (history == null) {
            return;
        }
        int eventsAdded = 0;
        for (ResourceEvent event : history) {
            if (eventsAdded >= MAX_EVENT_HISTORY_IN_XMPMM) {
                break;
            }
            // only events that were actually written count against the limit
            if (addHistoryEvent(event, metadata)) {
                eventsAdded++;
            }
        }
    }

    /**
     * Writes one history event as four parallel values.
     * <p>
     * Every field is read in its own try block so that one unreadable field
     * (e.g. a malformed date) does not discard the fields read after it.
     *
     * @return true if the event was added; false if it has no usable instance id
     */
    private static boolean addHistoryEvent(ResourceEvent event, Metadata metadata) {
        // NPE fixed in PDFBOX-5984; the NPE catches below can be removed
        // if Jempbox 1.8.18 is released
        String instanceId = null;
        try {
            instanceId = event.getInstanceID();
        } catch (NullPointerException e) {
            //swallow; handled as a missing instance id
        }
        if (instanceId == null || instanceId.isBlank()) {
            // events without an instance id are not recorded at all
            return false;
        }

        String action = null;
        try {
            action = event.getAction();
        } catch (NullPointerException e) {
            //swallow; recorded as an empty action
        }

        Calendar when = null;
        try {
            when = event.getWhen();
        } catch (NullPointerException | IOException e) {
            //swallow; getWhen can throw IOException; recorded as an empty date
        }

        String softwareAgent = null;
        try {
            softwareAgent = event.getSoftwareAgent();
        } catch (NullPointerException e) {
            //swallow; recorded as an empty software agent
        }

        //for absent data elements, pass in empty strings so
        //that parallel arrays will have matching offsets
        //for absent data
        metadata.add(XMPMM.HISTORY_EVENT_INSTANCEID, instanceId);
        metadata.add(XMPMM.HISTORY_ACTION, (action == null) ? "" : action);
        metadata.add(XMPMM.HISTORY_WHEN, (when == null) ? "" : DateUtils.formatDate(when));
        metadata.add(XMPMM.HISTORY_SOFTWARE_AGENT, (softwareAgent == null) ? "" : softwareAgent);
        return true;
    }

    /**
     * Adds a non-null value. For a property that does not permit multiple
     * values, the value is only added when none is present yet, so an
     * existing value is never replaced.
     */
    private static void addMetadata(Metadata m, Property p, String value) {
        if (value == null) {
            return;
        }
        if (p.isMultiValuePermitted() || m.get(p) == null) {
            m.add(p, value);
        }
    }

    /**
     * @return maximum number of events to extract from the XMPMM history.
     */
    public static int getMaxXMPMMHistory() {
        return MAX_EVENT_HISTORY_IN_XMPMM;
    }

    /**
     * Maximum number of events to extract from the
     * event history in the XMP Media Management (XMPMM) section.
     * The extractor will silently stop adding events after it
     * has reached this threshold.
     * <p>
     * The default is 1024. The setting is static and therefore shared by all
     * extractions; a value of zero or less means no history events are extracted.
     *
     * @param maxEvents maximum number of history events to extract
     */
    public static void setMaxXMPMMHistory(int maxEvents) {
        MAX_EVENT_HISTORY_IN_XMPMM = maxEvents;
    }

    /**
     * Scans the stream for an XMP packet and, if one is found, extracts its
     * Dublin Core and Media Management data into the metadata object given
     * to the constructor. Returns without changes if no packet is found.
     *
     * @param file stream to scan for an XMP packet
     * @throws IOException   if the packet scan fails with an I/O error
     * @throws TikaException declared for callers; not caught in this method
     */
    public void parse(InputStream file) throws IOException, TikaException {
        UnsynchronizedByteArrayOutputStream xmpraw = UnsynchronizedByteArrayOutputStream.builder().get();
        if (!scanner.parse(file, xmpraw)) {
            return;
        }

        XMPMetadata xmp = null;
        try (InputStream decoded = xmpraw.toInputStream()) {
            Document dom = XMLReaderUtils.buildDOM(decoded, EMPTY_PARSE_CONTEXT);
            if (dom != null) {
                xmp = new XMPMetadata(dom);
            }
        } catch (IOException | SAXException e) {
            //swallow: an unparseable packet leaves xmp null, and both
            //extract methods below return immediately for null input
        }
        extractDublinCore(xmp, metadata);
        extractXMPMM(xmp, metadata);
    }
}