// ExtendedMetadataExtractor.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.msg;
import static java.nio.charset.StandardCharsets.UTF_8;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.time.temporal.ChronoUnit;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.poi.hpsf.ClassID;
import org.apache.poi.hsmf.MAPIMessage;
import org.apache.poi.hsmf.datatypes.ByteChunk;
import org.apache.poi.hsmf.datatypes.Chunk;
import org.apache.poi.hsmf.datatypes.MAPIProperty;
import org.apache.poi.hsmf.datatypes.PropertyValue;
import org.apache.poi.hsmf.datatypes.Types;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.tika.metadata.MAPI;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.utils.StringUtils;
/**
 * Extracts MAPI properties as defined in props_table.txt, which was generated from MS-OXPROPS,
 * and adds them to a {@link Metadata} object under keys starting with
 * {@link MAPI#PREFIX_MAPI_PROPERTY}.
 * <p>
 * For now, this ignores binary and unknown property types.
 * <p>
 * The property table is read once, in a static initializer, into two lookup maps: one for
 * rows that carry a short id and one for rows that only carry a long id.
 */
public class ExtendedMetadataExtractor {

    /** Classpath location of the property table parsed by {@link #loadProperties()}. */
    private static final String PROPS_TABLE_RESOURCE =
            "/org/apache/tika/parser/microsoft/msg/props_table.txt";

    /** Number of pipe-delimited columns every non-comment row of the table must have. */
    private static final int EXPECTED_COLUMNS = 11;

    static final Logger LOGGER = LoggerFactory.getLogger(ExtendedMetadataExtractor.class);

    /** Properties keyed by the id parsed from the table's short-id column. */
    static final Map<Integer, List<TikaMapiProperty>> TIKA_MAPI_PROPERTIES = new ConcurrentHashMap<>();

    /** Properties keyed by the id parsed from the table's long-id column (rows without a short id). */
    static final Map<Integer, List<TikaMapiProperty>> TIKA_MAPI_LONG_PROPERTIES = new ConcurrentHashMap<>();

    static {
        loadProperties();
    }

    /**
     * Maps the raw properties of {@code msg} onto known MAPI property names and adds their
     * values to {@code metadata}.
     * <p>
     * Returns without touching {@code metadata} when the message has no name-id chunks, no
     * main chunks, no raw properties, or when none of the name-id {@link ByteChunk}s holds
     * any bytes.
     *
     * @param msg      the message to read properties from
     * @param metadata the metadata object to add the extracted values to
     */
    public static void extract(MAPIMessage msg, Metadata metadata) {
        if (msg.getNameIdChunks() == null) {
            return;
        }
        if (msg.getMainChunks() == null || msg.getMainChunks().getRawProperties() == null) {
            return;
        }

        // prep our custom nameIdChunk handler
        TikaNameIdChunks tikaNameIdChunks = new TikaNameIdChunks();

        // short-circuit for files that have an empty nameIdChunk
        long totalBytes = 0;
        for (Chunk chunk : msg.getNameIdChunks().getAll()) {
            if (chunk == null) {
                continue;
            }
            tikaNameIdChunks.record(chunk);
            if (chunk instanceof ByteChunk) {
                byte[] value = ((ByteChunk) chunk).getValue();
                if (value != null) {
                    totalBytes += value.length;
                }
            }
        }
        if (totalBytes == 0) {
            return;
        }

        try {
            tikaNameIdChunks.chunksComplete();
        } catch (IllegalStateException e) {
            // best effort: keep going with whatever was recorded
            LOGGER.warn("bad namechunks stream", e);
        }

        for (Map.Entry<MAPIProperty, PropertyValue> entry : msg.getMainChunks().getRawProperties().entrySet()) {
            // The MAPIProperties from POI are the literal storage ids for this particular file.
            // Those storage ids must be mapped via the name-id chunks onto a known id.
            PropertyValue value = entry.getValue();
            if (value == null) {
                continue;
            }
            List<MAPITag> mapiTags = tikaNameIdChunks.getTags(entry.getKey().id);
            if (mapiTags == null) {
                // getTags is defined outside this file; tolerate a null result
                continue;
            }
            MAPITagPair pair = null;
            for (MAPITag mapiTag : mapiTags) {
                // the long-id table takes precedence; fall back to the short-id table
                List<TikaMapiProperty> candidates = TIKA_MAPI_LONG_PROPERTIES.get(mapiTag.tagId);
                if (candidates == null) {
                    candidates = TIKA_MAPI_PROPERTIES.get(mapiTag.tagId);
                }
                pair = findMatch(mapiTag, candidates, value);
                if (pair != null) {
                    break;
                }
            }
            updateMetadata(pair, value, metadata);
        }
    }

    /**
     * Finds the first candidate property whose class id equals the tag's class id and whose
     * declared types include the actual type of {@code propertyValue}.
     *
     * @return the matching pair, or {@code null} if any argument is {@code null} or nothing matches
     */
    private static MAPITagPair findMatch(MAPITag mapiTag, List<TikaMapiProperty> tikaMapiProperties, PropertyValue propertyValue) {
        if (mapiTag == null || tikaMapiProperties == null || propertyValue == null) {
            return null;
        }
        Types.MAPIType actualType = propertyValue.getActualType();
        for (TikaMapiProperty candidate : tikaMapiProperties) {
            if (!mapiTag.classID.equals(candidate.classID)) {
                continue;
            }
            if (candidate.types == null || candidate.types.isEmpty()) {
                continue;
            }
            for (Types.MAPIType type : candidate.types) {
                if (actualType.equals(type)) {
                    return new MAPITagPair(mapiTag, candidate);
                }
            }
        }
        return null;
    }

    /**
     * Adds the value of a matched property to {@code metadata}.
     * <p>
     * Time values are written as an ISO-8601 instant truncated to seconds, booleans as
     * {@code "true"}/{@code "false"}, and everything else via {@link PropertyValue#toString()}
     * when that string is not blank. Excluded types (see {@link #includeType(PropertyValue)})
     * and unmatched properties ({@code pair == null}) are ignored.
     */
    private static void updateMetadata(MAPITagPair pair, PropertyValue propertyValue, Metadata metadata) {
        if (pair == null || propertyValue == null) {
            return;
        }
        if (!includeType(propertyValue)) {
            return;
        }
        String key = MAPI.PREFIX_MAPI_PROPERTY + pair.tikaMapiProperty.name;
        Types.MAPIType type = propertyValue.getActualType();
        if (type == Types.TIME || type == Types.MV_TIME || type == Types.APP_TIME || type == Types.MV_APP_TIME) {
            // NOTE(review): presumably not every time-typed value is decoded to a Calendar by
            // POI -- confirm. Anything else is skipped instead of failing the cast.
            Object value = propertyValue.getValue();
            if (value instanceof Calendar) {
                String calendarString = ((Calendar) value)
                        .toInstant()
                        .truncatedTo(ChronoUnit.SECONDS)
                        .toString();
                metadata.add(key, calendarString);
            }
        } else if (type == Types.BOOLEAN) {
            // null and non-Boolean values are skipped
            Object value = propertyValue.getValue();
            if (value instanceof Boolean) {
                metadata.add(key, value.toString());
            }
        } else {
            String text = propertyValue.toString();
            if (!StringUtils.isBlank(text)) {
                metadata.add(key, text);
            }
        }
    }

    /**
     * @return {@code false} for binary, unknown, unspecified, directory and pointer types;
     *         {@code true} for everything else
     */
    private static boolean includeType(PropertyValue propertyValue) {
        Types.MAPIType mapiType = propertyValue.getActualType();
        if (mapiType == Types.BINARY || mapiType == Types.UNKNOWN || mapiType == Types.UNSPECIFIED
                || mapiType == Types.DIRECTORY || mapiType.isPointer()) {
            return false;
        }
        return true;
    }

    /** One row of the property table: a property name with its class id and permitted types. */
    private static class TikaMapiProperty {
        final String name;
        final ClassID classID; // can be null
        final List<Types.MAPIType> types;
        final String refShort;

        TikaMapiProperty(String name, ClassID classID, List<Types.MAPIType> types, String refShort) {
            this.name = name;
            this.classID = classID;
            this.types = types;
            this.refShort = refShort;
        }
    }

    /**
     * Reads the property table from the classpath and fills {@link #TIKA_MAPI_PROPERTIES}
     * and {@link #TIKA_MAPI_LONG_PROPERTIES}. Blank lines and lines starting with {@code #}
     * are skipped.
     *
     * @throws IllegalStateException    if the table resource is missing or cannot be read
     * @throws IllegalArgumentException if a row does not have the expected shape
     */
    private static void loadProperties() {
        Map<String, ClassID> knownClassIds = new HashMap<>();
        for (TikaNameIdChunks.PredefinedPropertySet set : TikaNameIdChunks.PredefinedPropertySet.values()) {
            knownClassIds.put(set.getClassID().toUUIDString(), set.getClassID());
        }
        for (TikaNameIdChunks.PropertySetType setType : TikaNameIdChunks.PropertySetType.values()) {
            knownClassIds.put(setType.getClassID().toUUIDString(), setType.getClassID());
        }

        try (BufferedReader reader = openPropsTable()) {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                if (line.isBlank() || line.startsWith("#")) {
                    continue;
                }
                registerProperty(line, knownClassIds);
            }
        } catch (IOException e) {
            // keep the cause so a read failure can be diagnosed
            throw new IllegalStateException("failed to read " + PROPS_TABLE_RESOURCE, e);
        }
    }

    /**
     * Opens the property table as a UTF-8 reader.
     *
     * @throws IllegalStateException if the resource is not on the classpath
     */
    private static BufferedReader openPropsTable() {
        // getResourceAsStream returns null, rather than throwing, when the resource is missing
        var stream = ExtendedMetadataExtractor.class.getResourceAsStream(PROPS_TABLE_RESOURCE);
        if (stream == null) {
            throw new IllegalStateException("can't find props_table.txt?!");
        }
        return new BufferedReader(new InputStreamReader(stream, UTF_8));
    }

    /**
     * Parses one pipe-delimited table row and stores it in the short-id or long-id map.
     * Columns used: 1 = name, 3 = class id, 5 = short id, 6 = long id, 7 = data types
     * (separated by {@code ;}), 10 = reference.
     */
    private static void registerProperty(String line, Map<String, ClassID> knownClassIds) {
        String[] cols = line.split("\\|");
        if (cols.length != EXPECTED_COLUMNS) {
            throw new IllegalArgumentException("column count must == 11: " + line);
        }
        String name = cols[1].trim();
        ClassID classID = parseClassId(cols[3], knownClassIds);
        List<Types.MAPIType> types = parseDataTypes(cols[7].split(";"));
        String ref = cols[10];
        String shortId = cols[5];
        String longId = cols[6];

        if (!StringUtils.isBlank(shortId)) {
            // remove leading "0x"
            int id = Integer.parseInt(shortId.substring(2), 16);
            TIKA_MAPI_PROPERTIES
                    .computeIfAbsent(id, k -> new ArrayList<>())
                    .add(new TikaMapiProperty(name, classID, types, ref));
        } else if (!StringUtils.isBlank(longId)) {
            // remove leading "0x"
            long id = Long.parseLong(longId.substring(2), 16);
            if (id > Integer.MAX_VALUE) {
                throw new IllegalArgumentException("id must actually be within int range");
            }
            TIKA_MAPI_LONG_PROPERTIES
                    .computeIfAbsent((int) id, k -> new ArrayList<>())
                    .add(new TikaMapiProperty(name, classID, types, ref));
        }
        // else: some properties don't have an id
    }

    /**
     * Parses the class-id column: everything after the first space, with curly braces
     * removed, is treated as the id string.
     *
     * @return the matching known {@link ClassID}, a newly created (and cached) one if the
     *         id is not known yet, or {@code null} if the column is blank or has no space
     */
    private static ClassID parseClassId(String s, Map<String, ClassID> knownClassIDs) {
        if (StringUtils.isBlank(s)) {
            return null;
        }
        int space = s.indexOf(' ');
        if (space < 0) {
            return null;
        }
        String id = s
                .substring(space)
                .replace("{", "")
                .replace("}", "")
                .trim();
        ClassID known = knownClassIDs.get(id);
        if (known != null) {
            return known;
        }
        LOGGER.warn("Add '{}' to list of known property set IDs", id);
        ClassID classID = new ClassID(id);
        knownClassIDs.put(classID.toUUIDString(), classID);
        return classID;
    }

    /** A name-id tag paired with the table property it was matched against. */
    private static class MAPITagPair {
        final MAPITag mapiTag;
        final TikaMapiProperty tikaMapiProperty;

        public MAPITagPair(MAPITag mapiTag, TikaMapiProperty tikaMapiProperty) {
            this.mapiTag = mapiTag;
            this.tikaMapiProperty = tikaMapiProperty;
        }
    }

    /**
     * Parses each entry of {@code arr} with {@link #parseDataType(String)}, dropping blank entries.
     *
     * @return an immutable list of the parsed types; empty if there are none
     */
    private static List<Types.MAPIType> parseDataTypes(String[] arr) {
        List<Types.MAPIType> types = new ArrayList<>(arr.length);
        for (String s : arr) {
            Types.MAPIType type = parseDataType(s);
            if (type != null) {
                types.add(type);
            }
        }
        return List.copyOf(types);
    }

    /**
     * Parses a single data-type entry of the form {@code "<name>, <hex id>"}, where the hex
     * id may carry a leading {@code 0x}.
     *
     * @return the POI type for the id, a custom type if POI does not know the id, or
     *         {@code null} if {@code s} is blank
     * @throws IllegalArgumentException if the entry does not split into exactly two parts
     */
    private static Types.MAPIType parseDataType(String s) {
        if (StringUtils.isBlank(s)) {
            return null;
        }
        String[] parts = s.split(", ");
        if (parts.length != 2) {
            throw new IllegalArgumentException("expected two parts: " + s);
        }
        String num = parts[1];
        if (num.startsWith("0x")) {
            num = num.substring(2);
        }
        int id = Integer.parseInt(num, 16);
        Types.MAPIType type = Types.getById(id);
        if (type == null) {
            //TODO:
            /*
                PtypRestriction, 0x00FD
                PtypRuleAction, 0x00FE
                PtypServerId, 0x00FB
             */
            return Types.createCustom(id);
        }
        return type;
    }
}