SAXBasedMetadataExtractor.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.ooxml;
import java.io.InputStream;
import java.math.BigDecimal;
import java.util.Date;
import java.util.Optional;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageProperties;
import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.PagedText;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.SummaryExtractor;
import org.apache.tika.utils.XMLReaderUtils;
/**
* SAX-based metadata extractor for OOXML documents that reads document properties
* directly from the OPC package without needing POIXMLProperties or ooxml-lite schemas.
* <p>
* Core properties are read from {@link PackagePropertiesPart} (OPC level).
* Extended properties (app.xml) and custom properties (custom.xml) are parsed with SAX.
*/
class SAXBasedMetadataExtractor extends MetadataExtractor {
private static final String EXTENDED_PROPERTIES_REL =
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties";
private static final String CUSTOM_PROPERTIES_REL =
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/custom-properties";
private final OPCPackage opcPackage;
private final ParseContext parseContext;
SAXBasedMetadataExtractor(OPCPackage opcPackage, ParseContext parseContext) {
super(null);
this.opcPackage = opcPackage;
this.parseContext = parseContext;
}
@Override
public void extract(Metadata metadata) throws TikaException {
extractCoreProperties(metadata);
extractExtendedProperties(metadata);
extractCustomProperties(metadata);
}
private void extractCoreProperties(Metadata metadata) {
try {
PackageProperties props = opcPackage.getPackageProperties();
if (props == null) {
return;
}
setProperty(metadata, OfficeOpenXMLCore.CATEGORY, props.getCategoryProperty());
setProperty(metadata, OfficeOpenXMLCore.CONTENT_STATUS,
props.getContentStatusProperty());
setProperty(metadata, TikaCoreProperties.CREATED, props.getCreatedProperty());
addMultiProperty(metadata, TikaCoreProperties.CREATOR, props.getCreatorProperty());
setProperty(metadata, TikaCoreProperties.DESCRIPTION,
props.getDescriptionProperty());
setProperty(metadata, TikaCoreProperties.IDENTIFIER, props.getIdentifierProperty());
addProperty(metadata, DublinCore.SUBJECT, props.getSubjectProperty());
addProperty(metadata, Office.KEYWORDS, props.getKeywordsProperty());
setProperty(metadata, TikaCoreProperties.LANGUAGE, props.getLanguageProperty());
setProperty(metadata, TikaCoreProperties.MODIFIER,
props.getLastModifiedByProperty());
setProperty(metadata, TikaCoreProperties.PRINT_DATE,
props.getLastPrintedProperty());
setProperty(metadata, TikaCoreProperties.MODIFIED, props.getModifiedProperty());
setProperty(metadata, OfficeOpenXMLCore.REVISION, props.getRevisionProperty());
setProperty(metadata, TikaCoreProperties.TITLE, props.getTitleProperty());
setProperty(metadata, OfficeOpenXMLCore.VERSION, props.getVersionProperty());
} catch (Exception e) {
//swallow
}
}
private void extractExtendedProperties(Metadata metadata) {
try {
PackagePart extPart = getRelatedPart(EXTENDED_PROPERTIES_REL);
if (extPart == null) {
return;
}
ExtendedPropertiesHandler handler = new ExtendedPropertiesHandler();
try (InputStream is = extPart.getInputStream()) {
XMLReaderUtils.parseSAX(is, handler, parseContext);
}
handler.applyTo(metadata);
} catch (Exception e) {
//swallow
}
}
private void extractCustomProperties(Metadata metadata) {
try {
PackagePart custPart = getRelatedPart(CUSTOM_PROPERTIES_REL);
if (custPart == null) {
return;
}
CustomPropertiesHandler handler = new CustomPropertiesHandler();
try (InputStream is = custPart.getInputStream()) {
XMLReaderUtils.parseSAX(is, handler, parseContext);
}
handler.applyTo(metadata);
} catch (Exception e) {
//swallow
}
}
private PackagePart getRelatedPart(String relationshipType) {
try {
PackageRelationshipCollection rels =
opcPackage.getRelationshipsByType(relationshipType);
if (rels == null || rels.size() == 0) {
return null;
}
PackageRelationship rel = rels.getRelationship(0);
if (rel == null) {
return null;
}
return opcPackage.getPart(rel);
} catch (Exception e) {
return null;
}
}
private <T> void setProperty(Metadata metadata, Property property,
Optional<T> optionalValue) {
if (!optionalValue.isPresent()) {
return;
}
T value = optionalValue.get();
if (value instanceof Date) {
metadata.set(property, (Date) value);
} else if (value instanceof String) {
metadata.set(property, (String) value);
} else if (value instanceof Integer) {
metadata.set(property, (Integer) value);
} else if (value instanceof Double) {
metadata.set(property, (Double) value);
}
}
private <T> void addProperty(Metadata metadata, Property property,
Optional<T> optionalValue) {
if (!optionalValue.isPresent()) {
return;
}
T value = optionalValue.get();
if (value instanceof String) {
metadata.add(property, (String) value);
}
}
private void addMultiProperty(Metadata metadata, Property property,
Optional<String> value) {
if (!value.isPresent()) {
return;
}
SummaryExtractor.addMulti(metadata, property, value.get());
}
/**
* SAX handler for docProps/app.xml (extended properties).
*/
private static class ExtendedPropertiesHandler extends DefaultHandler {
private String application;
private String appVersion;
private String company;
private String manager;
private String notes;
private String presentationFormat;
private String template;
private int totalTime;
private int docSecurity;
private int pages;
private int slides;
private int paragraphs;
private int lines;
private int words;
private int characters;
private int charactersWithSpaces;
private String currentElement;
private final StringBuilder textBuffer = new StringBuilder();
@Override
public void startElement(String uri, String localName, String qName, Attributes atts) {
currentElement = localName;
textBuffer.setLength(0);
}
@Override
public void characters(char[] ch, int start, int length) {
textBuffer.append(ch, start, length);
}
@Override
public void endElement(String uri, String localName, String qName) {
if (!localName.equals(currentElement)) {
return;
}
String val = textBuffer.toString().trim();
if (val.isEmpty()) {
currentElement = null;
return;
}
switch (localName) {
case "Application":
application = val;
break;
case "AppVersion":
appVersion = val;
break;
case "Company":
company = val;
break;
case "Manager":
manager = val;
break;
case "Notes":
notes = val;
break;
case "PresentationFormat":
presentationFormat = val;
break;
case "Template":
template = val;
break;
case "TotalTime":
totalTime = safeParseInt(val);
break;
case "DocSecurity":
docSecurity = safeParseInt(val);
break;
case "Pages":
pages = safeParseInt(val);
break;
case "Slides":
slides = safeParseInt(val);
break;
case "Paragraphs":
paragraphs = safeParseInt(val);
break;
case "Lines":
lines = safeParseInt(val);
break;
case "Words":
words = safeParseInt(val);
break;
case "Characters":
characters = safeParseInt(val);
break;
case "CharactersWithSpaces":
charactersWithSpaces = safeParseInt(val);
break;
default:
break;
}
currentElement = null;
}
private int safeParseInt(String val) {
try {
// Handle unsigned int overflow (TIKA-2055)
long l = Long.parseLong(val);
if (l > Integer.MAX_VALUE || l < 0) {
return 0;
}
return (int) l;
} catch (NumberFormatException e) {
return 0;
}
}
void applyTo(Metadata metadata) {
setIfNotNull(metadata, OfficeOpenXMLExtended.APPLICATION, application);
setIfNotNull(metadata, OfficeOpenXMLExtended.APP_VERSION, appVersion);
setIfNotNull(metadata, TikaCoreProperties.PUBLISHER, company);
setIfNotNull(metadata, OfficeOpenXMLExtended.COMPANY, company);
if (manager != null) {
SummaryExtractor.addMulti(metadata, OfficeOpenXMLExtended.MANAGER, manager);
}
setIfNotNull(metadata, OfficeOpenXMLExtended.NOTES, notes);
setIfNotNull(metadata, OfficeOpenXMLExtended.PRESENTATION_FORMAT, presentationFormat);
setIfNotNull(metadata, OfficeOpenXMLExtended.TEMPLATE, template);
setIfPositive(metadata, OfficeOpenXMLExtended.TOTAL_TIME, totalTime);
setIfPositive(metadata, OfficeOpenXMLExtended.DOC_SECURITY, docSecurity);
metadata.set(OfficeOpenXMLExtended.DOC_SECURITY_STRING,
getDocSecurityString(docSecurity));
if (pages > 0) {
metadata.set(PagedText.N_PAGES, pages);
} else if (slides > 0) {
metadata.set(PagedText.N_PAGES, slides);
}
setIfPositive(metadata, Office.PAGE_COUNT, pages);
setIfPositive(metadata, Office.SLIDE_COUNT, slides);
setIfPositive(metadata, Office.PARAGRAPH_COUNT, paragraphs);
setIfPositive(metadata, Office.LINE_COUNT, lines);
setIfPositive(metadata, Office.WORD_COUNT, words);
setIfPositive(metadata, Office.CHARACTER_COUNT, characters);
setIfPositive(metadata, Office.CHARACTER_COUNT_WITH_SPACES, charactersWithSpaces);
}
private void setIfNotNull(Metadata metadata, Property property, String value) {
if (value != null) {
metadata.set(property, value);
}
}
private void setIfPositive(Metadata metadata, Property property, int value) {
if (value > 0) {
metadata.set(property, value);
}
}
private String getDocSecurityString(int flag) {
switch (flag) {
case 0:
return OfficeOpenXMLExtended.SECURITY_NONE;
case 1:
return OfficeOpenXMLExtended.SECURITY_PASSWORD_PROTECTED;
case 2:
return OfficeOpenXMLExtended.SECURITY_READ_ONLY_RECOMMENDED;
case 4:
return OfficeOpenXMLExtended.SECURITY_READ_ONLY_ENFORCED;
case 8:
return OfficeOpenXMLExtended.SECURITY_LOCKED_FOR_ANNOTATIONS;
default:
return OfficeOpenXMLExtended.SECURITY_UNKNOWN;
}
}
}
/**
* SAX handler for docProps/custom.xml (custom properties).
*/
private static class CustomPropertiesHandler extends DefaultHandler {
private static final String VT_NS =
"http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes";
private final Metadata customMetadata = new Metadata();
private String currentPropertyName;
private String currentValueType;
private final StringBuilder textBuffer = new StringBuilder();
@Override
public void startElement(String uri, String localName, String qName, Attributes atts) {
if ("property".equals(localName)) {
currentPropertyName = atts.getValue("name");
currentValueType = null;
} else if (VT_NS.equals(uri) && currentPropertyName != null) {
currentValueType = localName;
textBuffer.setLength(0);
}
}
@Override
public void characters(char[] ch, int start, int length) {
textBuffer.append(ch, start, length);
}
@Override
public void endElement(String uri, String localName, String qName) {
if (VT_NS.equals(uri) && currentValueType != null &&
localName.equals(currentValueType) && currentPropertyName != null) {
String val = textBuffer.toString().trim();
String propName = "custom:" + currentPropertyName;
switch (currentValueType) {
case "lpwstr":
case "lpstr":
case "bstr":
customMetadata.set(propName, val);
break;
case "filetime":
case "date":
Property tikaProp = Property.externalDate(propName);
customMetadata.set(tikaProp, val);
break;
case "bool":
customMetadata.set(propName, val);
break;
case "i1":
case "i2":
case "i4":
case "int":
case "ui1":
case "ui2":
customMetadata.set(propName, val);
break;
case "i8":
case "ui4":
case "ui8":
case "uint":
customMetadata.set(propName, val);
break;
case "r4":
case "r8":
customMetadata.set(propName, val);
break;
case "decimal":
try {
BigDecimal d = new BigDecimal(val);
customMetadata.set(propName, d.toPlainString());
} catch (NumberFormatException e) {
//swallow
}
break;
default:
break;
}
currentValueType = null;
} else if ("property".equals(localName)) {
currentPropertyName = null;
}
}
void applyTo(Metadata metadata) {
for (String name : customMetadata.names()) {
for (String value : customMetadata.getValues(name)) {
metadata.add(name, value);
}
}
}
}
}