// LibPstParser.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.libpst;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.config.ConfigDeserializer;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.JsonConfig;
import org.apache.tika.config.TikaComponent;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.FileProcessResult;
import org.apache.tika.utils.ProcessUtils;
import org.apache.tika.utils.StringUtils;
/**
 * This is an optional PST parser that relies on the user installing
 * the GPL-3 libpst/readpst commandline tool and configuring
 * Tika to call this library via tika-config.xml.
 * <p>
 * Each parse runs {@code readpst} against the PST file, has it write its output
 * into a temporary directory, walks that directory with an {@link EmailVisitor},
 * and finally removes the temporary files.
 */
@TikaComponent(spi = false)
public class LibPstParser implements Parser, Initializable {

    public static final MediaType MS_OUTLOOK_PST_MIMETYPE = MediaType.application("vnd.ms-outlook-pst");

    private static final Set<MediaType> SUPPORTED = Set.of(MS_OUTLOOK_PST_MIMETYPE);

    private static final Logger LOGGER = LoggerFactory.getLogger(LibPstParser.class);

    // Limits handed to ProcessUtils.execute for the readpst process' stdout/stderr.
    private static final int MAX_STDOUT = 100000;
    private static final int MAX_STDERR = 10000;

    private static final String READ_PST_COMMAND = "readpst";

    private LibPstParserConfig defaultConfig = new LibPstParserConfig();

    /**
     * Creates a parser that uses a default {@link LibPstParserConfig}.
     */
    public LibPstParser() {
    }

    /**
     * Creates a parser that uses the given configuration as its default.
     *
     * @param config configuration to use when the {@link ParseContext} does not supply one
     */
    public LibPstParser(LibPstParserConfig config) {
        this.defaultConfig = config;
    }

    /**
     * Creates a parser whose default configuration is deserialized from JSON.
     *
     * @param jsonConfig the JSON configuration to build a {@link LibPstParserConfig} from
     */
    public LibPstParser(JsonConfig jsonConfig) {
        defaultConfig = ConfigDeserializer.buildConfig(jsonConfig, LibPstParserConfig.class);
    }

    @Override
    public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
        return SUPPORTED;
    }

    /**
     * Parses a PST file by delegating to the external {@code readpst} tool.
     * The stream is resolved to a file path via {@link TikaInputStream#getPath()}
     * because the tool is given a path on its command line.
     */
    @Override
    public void parse(TikaInputStream tis, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext)
            throws IOException, SAXException, TikaException {
        _parse(tis.getPath(), contentHandler, metadata, parseContext);
    }

    /**
     * Runs readpst on {@code pst}, processes whatever it wrote to a temporary
     * directory, and then cleans up the temporary directory and debug file.
     *
     * @throws TikaException if readpst times out or exits with a non-zero value
     */
    private void _parse(Path pst, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext)
            throws TikaException, IOException, SAXException {
        // A config in the ParseContext overrides this parser's default config.
        LibPstParserConfig activeConfig = parseContext.get(LibPstParserConfig.class, defaultConfig);
        Path outDir = Files.createTempDirectory("libpst-");
        Path debugFile = null;
        try {
            // Created inside the try so that a failure here still cleans up outDir.
            if (activeConfig.isDebug()) {
                debugFile = Files.createTempFile("tika-libpst-debug", ".txt");
            }
            ProcessBuilder pb = getProcessBuilder(pst, activeConfig, outDir, debugFile);
            XHTMLContentHandler xhtml = new XHTMLContentHandler(contentHandler, metadata, parseContext);
            FileProcessResult fileProcessResult =
                    ProcessUtils.execute(pb, activeConfig.getTimeoutSeconds() * 1000L, MAX_STDOUT, MAX_STDERR);
            xhtml.startDocument();
            // NOTE(review): output is processed before the timeout/exit-value checks,
            // so partial results are emitted even when readpst failed; this looks
            // deliberate (salvage what was extracted) -- confirm.
            processContents(outDir, activeConfig, xhtml, metadata, parseContext);
            if (fileProcessResult.isTimeout()) {
                throw new TikaException("Timeout exception: " + fileProcessResult.getProcessTimeMillis());
            }
            if (fileProcessResult.getExitValue() != 0) {
                LOGGER.warn("libpst bad exit value {}: {}", fileProcessResult.getExitValue(),
                        fileProcessResult.getStderr());
                throw new TikaException("Bad exit value: " + fileProcessResult.getExitValue());
            }
            xhtml.endDocument();
        } finally {
            // Cleanup is best effort: failures are logged, never rethrown.
            try {
                FileUtils.deleteDirectory(outDir.toFile());
            } catch (IOException e) {
                LOGGER.warn("Couldn't delete temporary directory: {}", outDir.toAbsolutePath(), e);
            }
            try {
                if (debugFile != null) {
                    Files.delete(debugFile);
                }
            } catch (IOException e) {
                LOGGER.warn("Couldn't delete debug file?!", e);
            }
        }
    }

    /**
     * Walks the readpst output directory and hands every entry to an {@link EmailVisitor}.
     */
    private void processContents(Path outDir, LibPstParserConfig config, XHTMLContentHandler xhtml, Metadata metadata,
                                 ParseContext parseContext) throws IOException {
        Files.walkFileTree(outDir,
                new EmailVisitor(outDir, config.isProcessEmailAsMsg(), xhtml, metadata, parseContext));
    }

    /**
     * Builds the readpst command line for a single PST file.
     *
     * @param pst       the PST file to read
     * @param config    the configuration controlling which flags are passed
     * @param outDir    directory passed to readpst with {@code -o}
     * @param debugFile file passed to readpst with {@code -d}; only used when debug is enabled
     * @return a process builder ready to be executed
     */
    private ProcessBuilder getProcessBuilder(Path pst, LibPstParserConfig config, Path outDir, Path debugFile)
            throws TikaConfigException {
        List<String> commands = new ArrayList<>();
        commands.add(getFullReadPstCommand());
        if (config.isDebug()) {
            commands.add("-d");
            commands.add(ProcessUtils.escapeCommandLine(debugFile.toAbsolutePath().toString()));
        }
        if (config.isIncludeDeleted()) {
            commands.add("-D");
        }
        if (config.isProcessEmailAsMsg()) {
            commands.add("-m");
        } else {
            //include .eml and include extensions
            commands.add("-e");
        }
        commands.add("-o");
        commands.add(ProcessUtils.escapeCommandLine(outDir.toAbsolutePath().toString()));
        commands.add(ProcessUtils.escapeCommandLine(pst.toAbsolutePath().toString()));
        LOGGER.debug("command arguments: {}", commands);
        return new ProcessBuilder(commands);
    }

    /**
     * Validates the configured readpst location and verifies that the tool can be run.
     *
     * @throws TikaConfigException if the configured path contains a NUL character, the
     *                             executable cannot be found at the configured path, or
     *                             the version check fails
     */
    @Override
    public void initialize() throws TikaConfigException {
        String readPstPath = defaultConfig.getReadPstPath();
        // The path is optional, so guard against it being unset before inspecting it.
        if (readPstPath != null && readPstPath.contains("\u0000")) {
            throw new TikaConfigException("path can't include null values");
        }
        String fullReadPstCommand = getFullReadPstCommand();
        // Only verify the file exists when an explicit directory was configured;
        // otherwise the bare command name is used as-is.
        if (!StringUtils.isBlank(readPstPath) && !Files.isRegularFile(Paths.get(fullReadPstCommand))) {
            throw new TikaConfigException("I regret I can't find the readpst executable: " + fullReadPstCommand);
        }
        try {
            check();
        } catch (IOException e) {
            LOGGER.error("Couldn't get version of libpst", e);
            throw new TikaConfigException("Unable to check version of readpst. Is it installed?!", e);
        }
    }

    /**
     * Runs {@code readpst -V} to confirm the tool is available.
     *
     * @throws TikaConfigException if the call times out or returns a non-zero exit value
     * @throws IOException         if the process cannot be executed
     */
    private void check() throws TikaConfigException, IOException {
        String fullReadPstCommand = getFullReadPstCommand();
        ProcessBuilder pb = new ProcessBuilder(ProcessUtils.escapeCommandLine(fullReadPstCommand), "-V");
        FileProcessResult result = ProcessUtils.execute(pb, 30000, 10000, 10000);
        // Test for the timeout first (as _parse does); otherwise a timed-out run that
        // also reports a non-zero exit value would be misreported as a bad exit value.
        if (result.isTimeout()) {
            throw new TikaConfigException("timeout trying to get version from readpst?!");
        }
        if (result.getExitValue() != 0) {
            throw new TikaConfigException(
                    "bad exit value for LibPstParser. It must be installed and on the path"
                            + " if this parser is configured. Exit value: " + result.getExitValue());
        }
    }

    /**
     * Same availability check as {@link #initialize()} performs, but without throwing.
     *
     * @return {@code true} if {@code readpst -V} ran successfully, {@code false} otherwise
     */
    public boolean checkQuietly() {
        try {
            check();
        } catch (TikaConfigException | IOException e) {
            return false;
        }
        return true;
    }

    /**
     * Resolves the command used to invoke readpst.
     *
     * @return the bare command name when no path is configured; otherwise the configured
     *         directory joined with the command name, adding a {@code /} separator only
     *         when the directory does not already end with {@code /} or {@code \}
     */
    private String getFullReadPstCommand() throws TikaConfigException {
        String readPstPath = defaultConfig.getReadPstPath();
        if (StringUtils.isBlank(readPstPath)) {
            return READ_PST_COMMAND;
        }
        if (!readPstPath.endsWith("/") && !readPstPath.endsWith("\\")) {
            return readPstPath + "/" + READ_PST_COMMAND;
        }
        return readPstPath + READ_PST_COMMAND;
    }

    /**
     * @return the configuration used when the {@link ParseContext} does not supply one
     */
    public LibPstParserConfig getDefaultConfig() {
        return defaultConfig;
    }
}