LibPstParser.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft.libpst;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;

import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import org.apache.tika.config.ConfigDeserializer;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.JsonConfig;
import org.apache.tika.config.TikaComponent;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.FileProcessResult;
import org.apache.tika.utils.ProcessUtils;
import org.apache.tika.utils.StringUtils;

/**
 * This is an optional PST parser that relies on the user installing
 * the GPL-3 libpst/readpst commandline tool and configuring
 * Tika to call this library via tika-config.xml
 * <p>
 * For each parse, {@code readpst} is run against the PST file with a fresh
 * temporary directory as its output target; the files it writes there are then
 * walked with an {@link EmailVisitor}. The temporary directory (and the optional
 * debug file) are removed when the parse finishes, whether or not it succeeded.
 */
@TikaComponent(spi = false)
public class LibPstParser implements Parser, Initializable {

    public static final MediaType MS_OUTLOOK_PST_MIMETYPE = MediaType.application("vnd.ms-outlook-pst");

    private static final Set<MediaType> SUPPORTED = Set.of(MS_OUTLOOK_PST_MIMETYPE);

    private static final Logger LOGGER = LoggerFactory.getLogger(LibPstParser.class);

    // Limits handed to ProcessUtils.execute for the readpst run
    // (presumably the maximum amount of stdout/stderr that is captured -- confirm against ProcessUtils).
    private static final int MAX_STDOUT = 100000;
    private static final int MAX_STDERR = 10000;

    // Name of the executable; used as-is (resolved via PATH) when no directory is configured.
    private static final String READ_PST_COMMAND = "readpst";

    private LibPstParserConfig defaultConfig = new LibPstParserConfig();

    /**
     * Creates a parser with a default {@link LibPstParserConfig}.
     */
    public LibPstParser() {
    }

    /**
     * Creates a parser that uses the given config as its default.
     *
     * @param config default configuration for this parser
     */
    public LibPstParser(LibPstParserConfig config) {
        this.defaultConfig = config;
    }

    /**
     * Creates a parser whose default config is deserialized from JSON.
     *
     * @param jsonConfig JSON configuration to bind to a {@link LibPstParserConfig}
     */
    public LibPstParser(JsonConfig jsonConfig) {
        defaultConfig = ConfigDeserializer.buildConfig(jsonConfig, LibPstParserConfig.class);
    }

    @Override
    public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
        return SUPPORTED;
    }

    /**
     * Parses a PST file by handing its on-disk path to {@code readpst}.
     *
     * @throws TikaException if {@code readpst} times out or exits with a non-zero value
     */
    @Override
    public void parse(TikaInputStream tis, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException {
        // readpst needs a real file, so work from the stream's backing path.
        _parse(tis.getPath(), contentHandler, metadata, parseContext);
    }

    /**
     * Runs {@code readpst} on the given file, processes whatever it wrote to the
     * temporary output directory, and then reports a timeout or bad exit value.
     *
     * @param pst path of the PST file to extract
     * @throws TikaException if the process timed out or returned a non-zero exit value
     */
    private void _parse(Path pst, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) throws TikaException, IOException, SAXException {
        // A config placed in the ParseContext overrides the parser's default config.
        LibPstParserConfig activeConfig = parseContext.get(LibPstParserConfig.class, defaultConfig);
        Path outDir = Files.createTempDirectory("libpst-");
        Path debugFile = null;
        try {
            // Created inside the try so that a failure here still cleans up outDir.
            if (activeConfig.isDebug()) {
                debugFile = Files.createTempFile("tika-libpst-debug", ".txt");
            }
            ProcessBuilder pb = getProcessBuilder(pst, activeConfig, outDir, debugFile);
            XHTMLContentHandler xhtml = new XHTMLContentHandler(contentHandler, metadata, parseContext);
            FileProcessResult fileProcessResult = ProcessUtils.execute(pb, activeConfig.getTimeoutSeconds() * 1000L, MAX_STDOUT, MAX_STDERR);
            xhtml.startDocument();
            // The output is processed before the result is inspected, so anything
            // readpst managed to extract is still handled when the run failed.
            processContents(outDir, activeConfig, xhtml, metadata, parseContext);
            if (fileProcessResult.isTimeout()) {
                throw new TikaException("Timeout exception: " + fileProcessResult.getProcessTimeMillis());
            }
            if (fileProcessResult.getExitValue() != 0) {
                LOGGER.warn("libpst bad exit value {}: {}", fileProcessResult.getExitValue(), fileProcessResult.getStderr());
                throw new TikaException("Bad exit value: " + fileProcessResult.getExitValue());
            }
            xhtml.endDocument();
        } finally {
            // Cleanup is best effort: failures are logged, never propagated.
            try {
                FileUtils.deleteDirectory(outDir.toFile());
            } catch (IOException e) {
                LOGGER.warn("Couldn't delete temporary directory: {}", outDir.toAbsolutePath(), e);
            }
            try {
                if (debugFile != null) {
                    Files.delete(debugFile);
                }
            } catch (IOException e) {
                LOGGER.warn("Couldn't delete debug file?!", e);
            }
        }
    }

    /**
     * Walks the readpst output directory with an {@link EmailVisitor}.
     */
    private void processContents(Path outDir, LibPstParserConfig config, XHTMLContentHandler xhtml, Metadata metadata, ParseContext parseContext) throws IOException {
        Files.walkFileTree(outDir, new EmailVisitor(outDir, config.isProcessEmailAsMsg(), xhtml, metadata, parseContext));
    }

    /**
     * Builds the {@code readpst} command line for a single parse.
     *
     * @param pst       PST file to read
     * @param config    config that selects the command-line switches
     * @param outDir    directory passed to {@code -o}
     * @param debugFile file passed to {@code -d}; only read when {@code config.isDebug()} is true
     * @return a process builder for the assembled command
     */
    private ProcessBuilder getProcessBuilder(Path pst, LibPstParserConfig config, Path outDir, Path debugFile)
            throws TikaConfigException {
        List<String> commands = new ArrayList<>();
        // The executable location always comes from the parser's default config,
        // not from the per-parse config.
        commands.add(getFullReadPstCommand());
        if (config.isDebug()) {
            commands.add("-d");
            commands.add(ProcessUtils.escapeCommandLine(debugFile
                    .toAbsolutePath()
                    .toString()));
        }
        if (config.isIncludeDeleted()) {
            commands.add("-D");
        }
        if (config.isProcessEmailAsMsg()) {
            commands.add("-m");
        } else {
            //include .eml and include extensions
            commands.add("-e");
        }
        commands.add("-o");
        commands.add(ProcessUtils.escapeCommandLine(outDir
                .toAbsolutePath()
                .toString()));

        commands.add(ProcessUtils.escapeCommandLine(pst
                .toAbsolutePath()
                .toString()));
        LOGGER.debug("command arguments: {}", commands);
        return new ProcessBuilder(commands);
    }

    /**
     * Validates the configured readpst location and verifies that the tool can be run.
     *
     * @throws TikaConfigException if the configured path contains a NUL character, the
     *                             executable is not a regular file at the configured
     *                             location, or the version check fails
     */
    @Override
    public void initialize() throws TikaConfigException {
        String readPstPath = defaultConfig.getReadPstPath();
        // A null path means "not configured" (see the blank checks below), so guard before dereferencing.
        if (readPstPath != null && readPstPath.contains("\u0000")) {
            throw new TikaConfigException("path can't include null values");
        }
        String fullReadPstCommand = getFullReadPstCommand();
        // Only check the file system when a directory was configured; otherwise PATH lookup applies.
        if (! StringUtils.isBlank(readPstPath) && ! Files.isRegularFile(Paths.get(fullReadPstCommand))) {
            throw new TikaConfigException("I regret I can't find the readpst executable: " + fullReadPstCommand);
        }
        try {
            check();
        } catch (IOException e) {
            LOGGER.error("Couldn't get version of libpst", e);
            throw new TikaConfigException("Unable to check version of readpst. Is it installed?!", e);
        }
    }

    //throws exception if readpst is not available
    private void check() throws TikaConfigException, IOException {
        String fullReadPstCommand = getFullReadPstCommand();

        ProcessBuilder pb = new ProcessBuilder(ProcessUtils.escapeCommandLine(fullReadPstCommand), "-V");
        FileProcessResult result = ProcessUtils.execute(pb, 30000, 10000, 10000);
        // Test for a timeout first so that a timed-out process is reported as a
        // timeout and not as a bad exit value.
        if (result.isTimeout()) {
            throw new TikaConfigException("timeout trying to get version from readpst?!");
        }
        if (result.getExitValue() != 0) {
            throw new TikaConfigException(
                    "bad exit value for LibPstParser. It must be installed and on the path" + " if this parser is configured. Exit value: " + result.getExitValue());
        }
    }

    /**
     * Runs the same availability check as {@link #initialize()} performs, without throwing.
     *
     * @return {@code true} if {@code readpst -V} ran successfully, {@code false} otherwise
     */
    public boolean checkQuietly() {
        try {
            check();
        } catch (TikaConfigException | IOException e) {
            return false;
        }
        return true;
    }

    /**
     * Resolves the command used to launch readpst.
     *
     * @return the bare command name when no directory is configured, otherwise the
     *         configured directory joined with the command name
     */
    private String getFullReadPstCommand() throws TikaConfigException {
        String readPstPath = defaultConfig.getReadPstPath();
        if (StringUtils.isBlank(readPstPath)) {
            return READ_PST_COMMAND;
        }
        // Add a separator only when the configured directory has no trailing one.
        if (! readPstPath.endsWith("/") && ! readPstPath.endsWith("\\")) {
            return readPstPath + "/" + READ_PST_COMMAND;
        }
        return readPstPath + READ_PST_COMMAND;
    }

    /**
     * @return the config this parser uses when none is supplied in the {@link ParseContext}
     */
    public LibPstParserConfig getDefaultConfig() {
        return defaultConfig;
    }
}