// TikaNameIdChunks.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.msg;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.function.Consumer;
import org.apache.commons.codec.digest.PureJavaCrc32;
import org.apache.poi.hpsf.ClassID;
import org.apache.poi.hsmf.datatypes.ByteChunk;
import org.apache.poi.hsmf.datatypes.Chunk;
import org.apache.poi.hsmf.datatypes.ChunkGroup;
import org.apache.poi.hsmf.datatypes.MAPIProperty;
import org.apache.poi.hsmf.datatypes.Types;
import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.LittleEndianByteArrayInputStream;
import org.apache.poi.util.StringUtil;
/**
 * Collection of convenience chunks for the NameID part of an outlook file
 * <p>
 * This is a temporary copy+paste+modify from Apache POI
 * <p>
 * The parser hands every chunk of the NameID storage to {@link #record(Chunk)} and then
 * calls {@link #chunksComplete()}, at which point the entry stream is walked once and a
 * map from property tag to {@link MAPITag} is built. That map is exposed through
 * {@link #getTags(int)}; individual lookups by GUID and name/id are available through
 * {@link #getPropertyTag(ClassID, String, long)}.
 */
public final class TikaNameIdChunks implements ChunkGroup {
    public static final String NAME = "__nameid_version1.0";

    /** Size in bytes of one record in the entry stream and in the lookup streams. */
    private static final int ENTRY_SIZE = 8;
    /** Size in bytes of one GUID in the GUID stream. */
    private static final int GUID_SIZE = 0x10;
    /** GUID indexes below this value are predefined; from this value on they refer to the GUID stream. */
    private static final int FIRST_GUID_STREAM_INDEX = 3;
    /** Value added to a property index to obtain the property tag. */
    private static final long PROPERTY_TAG_BASE = 0x8000L;
    /** Value added to the hashed bucket number to obtain the id of a lookup stream. */
    private static final long STREAM_ID_BASE = 0x1000L;
    /** Modulus used when hashing a property into a lookup stream. */
    private static final int STREAM_ID_MODULUS = 0x1F;

    /**
     * Property set GUIDs that are referenced from this class by name.
     */
    public enum PropertySetType {
        PS_MAPI("00020328-0000-0000-C000-000000000046"),
        PS_PUBLIC_STRINGS("00020329-0000-0000-C000-000000000046"),
        PS_INTERNET_HEADERS("00020386-0000-0000-C000-000000000046");

        private final ClassID classID;

        PropertySetType(String uuid) {
            classID = new ClassID(uuid);
        }

        public ClassID getClassID() {
            return classID;
        }
    }

    /**
     * Further well-known property set GUIDs, offered as constants for callers of
     * {@link #getPropertyTag(ClassID, String, long)}.
     */
    public enum PredefinedPropertySet {
        PSETID_COMMON("00062008-0000-0000-C000-000000000046"),
        PSETID_ADDRESS("00062004-0000-0000-C000-000000000046"),
        PSETID_APPOINTMENT("00062002-0000-0000-C000-000000000046"),
        PSETID_MEETING("6ED8DA90-450B-101B-98DA-00AA003F1305"),
        PSETID_LOG("0006200A-0000-0000-C000-000000000046"),
        PSETID_MESSAGING("41F28F13-83F4-4114-A584-EEDB5A6B0BFF"),
        PSETID_NOTE("0006200E-0000-0000-C000-000000000046"),
        PSETID_POST_RSS("00062041-0000-0000-C000-000000000046"),
        PSETID_TASK("00062003-0000-0000-C000-000000000046"),
        PSETID_UNIFIED_MESSAGING("4442858E-A9E3-4E80-B900-317A210CC15B"),
        PSETID_AIR_SYNC("71035549-0739-4DCB-9163-00F0580DBBDF"),
        PSETID_SHARING("00062040-0000-0000-C000-000000000046"),
        PSETID_XML_EXTRACTED_ENTITIES("23239608-685D-4732-9C55-4C95CB4E8E33"),
        PSETID_ATTACHMENT("96357F7F-59E1-47D0-99A7-46515C183B54"), //add this to POI
        PSETID_CALENDAR_ASSISTANT("11000E07-B51B-40D6-AF21-CAA85EDAB1D0");

        private final ClassID classID;

        PredefinedPropertySet(String uuid) {
            classID = new ClassID(uuid);
        }

        public ClassID getClassID() {
            return classID;
        }
    }

    /** Binary chunk with id 2; read as a sequence of 16-byte GUIDs. */
    private ByteChunk guidStream;
    /** Binary chunk with id 3; read as a sequence of 8-byte property entries. */
    private ByteChunk entryStream;
    /** Binary chunk with id 4; read as length-prefixed UTF-16LE property names. */
    private ByteChunk stringStream;

    /**
     * Holds all the chunks that were found, keyed by id. Not clear if we need a list
     * or if we can rely on a unique id
     */
    private final Map<Integer, List<Chunk>> chunksById = new HashMap<>();
    /** Named property descriptions keyed by property tag; filled by {@link #chunksComplete()}. */
    private final Map<Integer, List<MAPITag>> mapiTagMap = new HashMap<>();

    /**
     * Returns every chunk recorded so far.
     *
     * @return a newly allocated array holding all recorded chunks; never null
     */
    public Chunk[] getAll() {
        List<Chunk> chunks = new ArrayList<>();
        for (List<Chunk> c : chunksById.values()) {
            chunks.addAll(c);
        }
        return chunks.toArray(new Chunk[0]);
    }

    /**
     * Same as {@link #getAll()}.
     */
    @Override
    public Chunk[] getChunks() {
        return getAll();
    }

    /**
     * Called by the parser whenever a chunk is found.
     * <p>
     * Binary chunks with id 2, 3 and 4 are additionally remembered as the GUID, entry and
     * string stream. Every chunk, whatever its type, is stored under its chunk id.
     *
     * @param chunk the chunk that was found
     */
    @Override
    public void record(Chunk chunk) {
        // the instanceof test keeps an unexpected Chunk implementation from causing a ClassCastException
        if (chunk.getType() == Types.BINARY && chunk instanceof ByteChunk) {
            switch (chunk.getChunkId()) {
                case 2:
                    guidStream = (ByteChunk) chunk;
                    break;
                case 3:
                    entryStream = (ByteChunk) chunk;
                    break;
                case 4:
                    stringStream = (ByteChunk) chunk;
                    break;
                default:
                    break;
            }
        }
        List<Chunk> chunkList = chunksById.computeIfAbsent(chunk.getChunkId(), k -> new ArrayList<>());
        chunkList.add(chunk);
    }

    /**
     * Used to flag that all the chunks of the NameID have now been located.
     */
    @Override
    public void chunksComplete() {
        loadTags();
    }

    /**
     * Returns the named properties that were mapped to the given property tag.
     *
     * @param storageId the property tag to look up
     * @return the matching tags, or an empty list if there are none; does not return null
     */
    public List<MAPITag> getTags(int storageId) {
        List<MAPITag> tags = mapiTagMap.get(storageId);
        if (tags == null) {
            return Collections.emptyList();
        }
        return tags;
    }

    /**
     * Walks the entry stream once and registers a {@link MAPITag} for every entry whose
     * property tag could be resolved. Does nothing unless the GUID, entry and string
     * streams have all been recorded and the entry stream has a value.
     */
    private void loadTags() {
        final byte[] entryStreamBytes = (entryStream == null) ? null : entryStream.getValue();
        if (guidStream == null || stringStream == null || entryStreamBytes == null) {
            return;
        }
        LittleEndianByteArrayInputStream leis = new LittleEndianByteArrayInputStream(entryStreamBytes);
        // trailing bytes that do not form a complete record are ignored
        for (int i = 0; i < entryStreamBytes.length / ENTRY_SIZE; i++) {
            // for numerical properties this is the property id, for string properties
            // it is the offset of the name in the string stream
            final long nameOffset = leis.readUInt();
            int guidIndex = leis.readUShort();
            // lowest bit: 0 = numerical named property, 1 = string named property
            final int propertyKind = guidIndex & 0x01;
            guidIndex = guidIndex >>> 1;
            final int propertyIndex = leis.readUShort();
            // fetch property GUID; may be null if it cannot be resolved
            ClassID guid = getPropertyGUID(guidIndex);
            // fetch property name / stream ID
            final String[] propertyName = {null};
            final long[] propertyNameCRC32 = {-1L};
            long streamID = getStreamID(propertyKind, nameOffset, guid, guidIndex,
                    n -> propertyName[0] = n, c -> propertyNameCRC32[0] = c);
            long tag;
            if (propertyKind == 1 && propertyNameCRC32[0] < 0) {
                // the name could not be read from the string stream, so there is no CRC to
                // match against a lookup stream; fall back to the index from the entry stream
                tag = PROPERTY_TAG_BASE + propertyIndex;
            } else {
                // find property index in matching stream entry
                tag = getPropertyTag(streamID, nameOffset, propertyNameCRC32[0]);
            }
            if (tag > 0 && tag < Integer.MAX_VALUE) {
                List<MAPITag> tagList = mapiTagMap.computeIfAbsent((int) tag, k -> new ArrayList<>());
                tagList.add(new MAPITag((int) nameOffset, propertyName[0], guid));
            }
        }
    }

    /**
     * Get property tag id by property set GUID and string name or numerical name from named properties mapping
     *
     * @param guid Property set GUID in registry format without brackets.
     *             May be one of the PS_* or PSETID_* constants
     * @param name Property name in case of string named property
     * @param id   Property id in case of numerical named property
     * @return Property tag which can be matched with {@link MAPIProperty#id}
     * or 0 if the property could not be found.
     */
    public long getPropertyTag(ClassID guid, String name, long id) {
        final byte[] entryStreamBytes = (entryStream == null) ? null : entryStream.getValue();
        if (guidStream == null || stringStream == null || guid == null || entryStreamBytes == null) {
            return 0;
        }
        LittleEndianByteArrayInputStream leis = new LittleEndianByteArrayInputStream(entryStreamBytes);
        for (int i = 0; i < entryStreamBytes.length / ENTRY_SIZE; i++) {
            final long nameOffset = leis.readUInt();
            int guidIndex = leis.readUShort();
            final int propertyKind = guidIndex & 0x01;
            guidIndex = guidIndex >>> 1;
            final int propertyIndex = leis.readUShort();
            // fetch and match property GUID
            if (!guid.equals(getPropertyGUID(guidIndex))) {
                continue;
            }
            // fetch property name / stream ID
            final String[] propertyName = {null};
            final long[] propertyNameCRC32 = {-1L};
            long streamID = getStreamID(propertyKind, nameOffset, guid, guidIndex,
                    n -> propertyName[0] = n, c -> propertyNameCRC32[0] = c);
            if (!matchesProperty(propertyKind, nameOffset, name, propertyName[0], id)) {
                continue;
            }
            // find property index in matching stream entry
            if (propertyKind == 1 && propertyNameCRC32[0] < 0) {
                // skip stream entry matching and return tag from property index from entry stream
                // this code should not be reached
                return PROPERTY_TAG_BASE + propertyIndex;
            }
            return getPropertyTag(streamID, nameOffset, propertyNameCRC32[0]);
        }
        return 0;
    }

    /**
     * Searches the lookup stream(s) recorded under the given id for a record that matches
     * the property.
     *
     * @param streamID          chunk id of the lookup stream, as computed by getStreamID
     * @param nameOffset        first field of the entry stream record; compared against
     *                          lookup records of numerical kind
     * @param propertyNameCRC32 CRC32 of the property name; compared against lookup records
     *                          of string kind
     * @return the property tag of the first matching record, or 0 if there is none
     */
    private long getPropertyTag(long streamID, long nameOffset, long propertyNameCRC32) {
        List<Chunk> chunks = chunksById.get((int) streamID);
        if (chunks == null) {
            return 0;
        }
        for (Chunk chunk : chunks) {
            if (chunk == null || chunk.getType() != Types.BINARY || chunk.getChunkId() != streamID
                    || !(chunk instanceof ByteChunk)) {
                continue;
            }
            byte[] matchChunkBytes = ((ByteChunk) chunk).getValue();
            if (matchChunkBytes == null) {
                continue;
            }
            LittleEndianByteArrayInputStream leis = new LittleEndianByteArrayInputStream(matchChunkBytes);
            for (int m = 0; m < matchChunkBytes.length / ENTRY_SIZE; m++) {
                long nameCRC = leis.readUInt();
                int matchGuidIndex = leis.readUShort();
                int matchPropertyIndex = leis.readUShort();
                int matchPropertyKind = matchGuidIndex & 0x01;
                if (nameCRC == (matchPropertyKind == 0 ? nameOffset : propertyNameCRC32)) {
                    return PROPERTY_TAG_BASE + matchPropertyIndex;
                }
            }
        }
        return 0;
    }

    /**
     * Resolves a GUID index from an entry stream record.
     *
     * @param guidIndex GUID index with the property kind bit already shifted out
     * @return the property set GUID, or null if the index is 0 or negative, the GUID
     * stream has no value, or the index points past the end of the GUID stream
     */
    private ClassID getPropertyGUID(int guidIndex) {
        if (guidIndex == 1) {
            // predefined GUID
            return PropertySetType.PS_MAPI.classID;
        } else if (guidIndex == 2) {
            // predefined GUID
            return PropertySetType.PS_PUBLIC_STRINGS.classID;
        } else if (guidIndex >= FIRST_GUID_STREAM_INDEX) {
            // GUID from guid stream
            byte[] guidStreamBytes = guidStream.getValue();
            if (guidStreamBytes == null) {
                return null;
            }
            int guidIndexOffset = (guidIndex - FIRST_GUID_STREAM_INDEX) * GUID_SIZE;
            if (guidStreamBytes.length >= guidIndexOffset + GUID_SIZE) {
                return new ClassID(guidStreamBytes, guidIndexOffset);
            }
        }
        return null;
    }

    /**
     * Checks whether an entry stream record is the property the caller asked for.
     * The property set GUID is expected to have been matched already.
     *
     * @param propertyKind 0 for a numerical named property, 1 for a string named property
     * @param nameOffset   first field of the entry stream record
     * @param name         requested property name, may be null
     * @param propertyName name read from the string stream, may be null
     * @param id           requested property id; negative values never match
     * @return true if the record matches by id (numerical) or by name (string)
     */
    private static boolean matchesProperty(int propertyKind, long nameOffset, String name,
                                           String propertyName, long id) {
        return
                // match property by id
                (propertyKind == 0 && id >= 0 && id == nameOffset) ||
                        // match property by name
                        (propertyKind == 1 && name != null && name.equals(propertyName));
    }

    /**
     * Computes the chunk id of the lookup stream a property hashes into and, for string
     * named properties, reads the property name from the string stream.
     * <p>
     * The two setters are only invoked for string named properties whose name lies
     * completely inside the string stream. If the name cannot be read, the returned id is
     * derived from a CRC of -1 and is not a usable stream id; callers detect this case
     * through the CRC setter not having been invoked.
     *
     * @param propertyKind            0 for a numerical named property, 1 for a string named property
     * @param nameOffset              unsigned 32-bit first field of the entry stream record:
     *                                the property id (numerical) or the offset of the name in
     *                                the string stream (string)
     * @param guid                    resolved property set GUID, may be null
     * @param guidIndex               GUID index with the property kind bit already shifted out
     * @param propertyNameSetter      receives the property name
     * @param propertyNameCRC32Setter receives the CRC32 of the property name
     * @return the chunk id of the lookup stream
     */
    private long getStreamID(int propertyKind, long nameOffset, ClassID guid, int guidIndex,
                             Consumer<String> propertyNameSetter, Consumer<Long> propertyNameCRC32Setter) {
        if (propertyKind == 0) {
            // numerical named property; nameOffset is kept as an unsigned value so that
            // ids >= 0x80000000 do not yield a negative remainder
            return STREAM_ID_BASE + (nameOffset ^ (guidIndex << 1)) % STREAM_ID_MODULUS;
        }
        // string named property
        byte[] stringBytes = stringStream.getValue();
        long propertyNameCRC32 = -1;
        // the 4-byte length prefix must lie completely inside the string stream
        if (stringBytes != null && nameOffset + 4 <= stringBytes.length) {
            int lengthOffset = (int) nameOffset;
            long nameLength = LittleEndian.getUInt(stringBytes, lengthOffset);
            int nameStart = lengthOffset + 4;
            if (nameLength <= stringBytes.length - nameStart) {
                String propertyName = new String(stringBytes, nameStart, (int) nameLength, StringUtil.UTF16LE);
                if (PropertySetType.PS_INTERNET_HEADERS.classID.equals(guid)) {
                    // names in this property set are hashed in lower case
                    byte[] n = propertyName
                            .toLowerCase(Locale.ROOT)
                            .getBytes(StringUtil.UTF16LE);
                    propertyNameCRC32 = calculateCRC32(n, 0, n.length);
                } else {
                    propertyNameCRC32 = calculateCRC32(stringBytes, nameStart, (int) nameLength);
                }
                propertyNameSetter.accept(propertyName);
                propertyNameCRC32Setter.accept(propertyNameCRC32);
            }
        }
        return STREAM_ID_BASE + (propertyNameCRC32 ^ ((guidIndex << 1) | 1)) % STREAM_ID_MODULUS;
    }

    /**
     * Calculates the CRC32 of the given bytes (conforms to RFC 1510, SSH-1).
     * The CRC32 calculation is similar to the standard one as demonstrated in RFC 1952,
     * but with the inversion (before and after the calculation) omitted.
     * <ul>
     * <li>poly: 0x04C11DB7</li>
     * <li>init: 0x00000000</li>
     * <li>xor: 0x00000000</li>
     * <li>revin: true</li>
     * <li>revout: true</li>
     * <li>check: 0x2DFD2D88 (CRC32 of "123456789")</li>
     * </ul>
     *
     * @param buf the byte array to calculate CRC32 on
     * @param off the offset within buf at which the CRC32 calculation will start
     * @param len the number of bytes on which to calculate the CRC32
     * @return the CRC32 value (unsigned 32-bit integer stored in a long).
     * @see <a href="http://www.zorc.breitbandkatze.de/crc.html">CRC parameter check</a>
     */
    private static long calculateCRC32(byte[] buf, int off, int len) {
        PureJavaCrc32 crc = new PureJavaCrc32();
        // set initial crc value to 0: feeding four 0xFF bytes cancels the standard
        // all-ones start value of the checksum
        crc.update(new byte[]{-1, -1, -1, -1}, 0, 4);
        crc.update(buf, off, len);
        // undo the final inversion applied by getValue() and keep the low 32 bits
        return ~crc.getValue() & 0xFFFFFFFFL;
    }
}