XMPMetadataExtractor.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.xmp;
import java.io.IOException;
import java.io.InputStream;
import java.util.Calendar;
import java.util.List;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.xmpbox.XMPMetadata;
import org.apache.xmpbox.schema.DublinCoreSchema;
import org.apache.xmpbox.schema.XMPBasicSchema;
import org.apache.xmpbox.schema.XMPMediaManagementSchema;
import org.apache.xmpbox.type.AbstractField;
import org.apache.xmpbox.type.ArrayProperty;
import org.apache.xmpbox.type.BadFieldValueException;
import org.apache.xmpbox.type.ResourceEventType;
import org.apache.xmpbox.type.ResourceRefType;
import org.apache.xmpbox.xml.DomXmpParser;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.XMP;
import org.apache.tika.metadata.XMPMM;
import org.apache.tika.utils.DateUtils;
/**
* XMP Metadata Extractor based on Apache XmpBox.
*/
public class XMPMetadataExtractor {
private static volatile int MAX_EVENT_HISTORY_IN_XMPMM = 1024;
/**
* Parse the XMP Packets.
*
* @param stream the stream to parser.
* @param metadata the metadata collection to update
* @throws IOException on any IO error.
* @throws TikaException on any Tika error.
*/
public static void parse(InputStream stream, Metadata metadata) throws IOException, TikaException {
XMPMetadata xmp;
try {
DomXmpParser xmpParser = new DomXmpParser();
xmpParser.setStrictParsing(false);
xmp = xmpParser.parse(CloseShieldInputStream.wrap(stream));
} catch (Throwable ex) {
//swallow
return;
}
extractDublinCoreSchema(xmp, metadata);
extractXMPBasicSchema(xmp, metadata);
extractXMPMM(xmp, metadata);
}
/**
* Extracts Dublin Core.
*
* Silently swallows exceptions.
* @param xmp the XMP Metadata object.
* @param metadata the metadata map
* @throws IOException
*/
public static void extractDublinCoreSchema(XMPMetadata xmp, Metadata metadata) throws IOException {
if (xmp == null) {
return;
}
DublinCoreSchema schemaDublinCore = xmp.getDublinCoreSchema();
if (schemaDublinCore != null) {
try {
addMetadata(metadata, DublinCore.TITLE, schemaDublinCore.getTitle());
addMetadata(metadata, DublinCore.FORMAT, schemaDublinCore.getFormat());
addMetadata(metadata, DublinCore.DESCRIPTION, schemaDublinCore.getDescription());
addMetadata(metadata, DublinCore.CREATOR, schemaDublinCore.getCreators());
addMetadata(metadata, DublinCore.SUBJECT, schemaDublinCore.getSubjects());
}
catch (BadFieldValueException ex) {
throw new IOException(ex);
}
}
}
/**
* Extracts basic schema metadata from XMP.
*
* Silently swallows exceptions.
* @param xmp the XMP Metadata object.
* @param metadata the metadata map
* @throws IOException
*/
public static void extractXMPBasicSchema(XMPMetadata xmp, Metadata metadata) throws IOException {
if (xmp == null) {
return;
}
XMPBasicSchema schemaBasic = xmp.getXMPBasicSchema();
if (schemaBasic != null) {
addMetadata(metadata, XMP.CREATOR_TOOL, schemaBasic.getCreatorTool());
addMetadata(metadata, XMP.CREATE_DATE, schemaBasic.getCreateDate());
addMetadata(metadata, XMP.MODIFY_DATE, schemaBasic.getModifyDate());
addMetadata(metadata, XMP.METADATA_DATE, schemaBasic.getModifyDate());
addMetadata(metadata, XMP.RATING, schemaBasic.getRating());
}
}
/**
* @return maximum number of events to extract from the XMPMM history.
*/
public static int getMaxXMPMMHistory() {
return MAX_EVENT_HISTORY_IN_XMPMM;
}
/**
* Maximum number of events to extract from the
* event history in the XMP Media Management (XMPMM) section.
* The extractor will silently stop adding events after it
* has reached this threshold.
* <p>
* The default is 1024.
* @param maxEvents
*/
public static void setMaxXMPMMHistory(int maxEvents) {
MAX_EVENT_HISTORY_IN_XMPMM = maxEvents;
}
/**
* Extracts Media Management metadata from XMP.
* <p>
* Silently swallows exceptions.
*
* @param xmp
* @param metadata
*/
public static void extractXMPMM(XMPMetadata xmp, Metadata metadata) {
if (xmp == null) {
return;
}
XMPMediaManagementSchema mmSchema = xmp.getXMPMediaManagementSchema();
if (mmSchema != null) {
addMetadata(metadata, XMPMM.DOCUMENTID, mmSchema.getDocumentID());
metadata.set(XMPMM.INSTANCEID, mmSchema.getInstanceID());
metadata.set(XMPMM.ORIGINAL_DOCUMENTID, mmSchema.getOriginalDocumentID());
//ResourceRefType derivedFrom = mmSchema.getDerivedFromProperty(); //TODO after XMPBox 3.0.7
ResourceRefType derivedFrom = mmSchema.getResourceRefProperty();
if (derivedFrom != null) {
addMetadata(metadata, XMPMM.DERIVED_FROM_DOCUMENTID, derivedFrom.getDocumentID());
addMetadata(metadata, XMPMM.DERIVED_FROM_INSTANCEID, derivedFrom.getInstanceID());
}
ArrayProperty historyProperty = mmSchema.getHistoryProperty();
if (historyProperty != null) {
int eventsAdded = 0;
for (AbstractField af : historyProperty.getAllProperties()) {
if (eventsAdded >= MAX_EVENT_HISTORY_IN_XMPMM) {
break;
}
if (!(af instanceof ResourceEventType))
{
continue;
}
ResourceEventType stevt = (ResourceEventType) af;
String instanceId = stevt.getInstanceID();
String action = stevt.getAction();
Calendar when = stevt.getWhen();
String softwareAgent = stevt.getSoftwareAgent();
if (instanceId != null && !instanceId.isBlank())
{
// for absent data elements, pass in empty strings so
// that parallel arrays will have matching offsets for absent data
action = action == null ? "" : action;
String dateString = when == null ? "" : DateUtils.formatDate(when);
softwareAgent = softwareAgent == null ? "" : softwareAgent;
metadata.add(XMPMM.HISTORY_EVENT_INSTANCEID, instanceId);
metadata.add(XMPMM.HISTORY_ACTION, action);
metadata.add(XMPMM.HISTORY_WHEN, dateString);
metadata.add(XMPMM.HISTORY_SOFTWARE_AGENT, softwareAgent);
eventsAdded++;
}
}
}
}
}
/**
* Add list to the metadata map.
*
* @param metadata the metadata map to update.
* @param property the property to add.
* @param values the values to add.
*/
private static void addMetadata(Metadata metadata, Property property, List<String> values) {
if (values != null) {
for (String value : values) {
addMetadata(metadata, property, value);
}
}
}
/**
* Add value to the metadata map.
*
* @param metadata the metadata map to update.
* @param property the property to add.
* @param value the value to add.
*/
private static void addMetadata(Metadata metadata, Property property, String value) {
if (value != null) {
if (property.isMultiValuePermitted()) {
metadata.add(property, value);
} else {
metadata.set(property, value);
}
}
}
/**
* Add value to the metadata map.
*
* @param metadata the metadata map to update.
* @param property the property to add.
* @param value the value to add.
*/
private static void addMetadata(Metadata metadata, Property property, Integer value) {
if (value != null) {
if (property.isMultiValuePermitted()) {
metadata.add(property, value);
} else {
metadata.set(property, value);
}
}
}
/**
* Add value to the metadata map.
*
* @param metadata the metadata map to update.
* @param property the property to add.
* @param value the value to add.
*/
private static void addMetadata(Metadata metadata, Property property, Calendar value) {
if (value != null) {
metadata.set(property, value);
}
}
}