UnrarParser.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.pkg;

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.Collections;
import java.util.Set;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import org.apache.tika.config.TikaComponent;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.TikaTimeoutException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.FileProcessResult;
import org.apache.tika.utils.ProcessUtils;

/**
 * Parser for Rar files.  This relies on 'unrar' being installed
 * and on the path.  This is not the default rar parser and must
 * be selected via the tika-config.xml.
 * <p>
 * The incoming stream is copied to a temporary archive file, the external
 * 'unrar' process extracts it into a scratch directory, and every regular
 * file found there is handed to the {@link EmbeddedDocumentExtractor}.
 * The whole temporary tree is removed before {@code parse} returns.
 */
@TikaComponent(spi = false)
public class UnrarParser implements Parser {
    private static final long serialVersionUID = 6157727985054451501L;

    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.singleton(MediaType.application("x-rar-compressed"));

    //subdirectory of the per-parse temp root that unrar extracts into; keeping the
    //input archive outside of it guarantees the archive is never parsed as its own child
    private static final String EXTRACT_DIR_NAME = "extracted";

    //third and fourth arguments to ProcessUtils.execute; presumably the caps on
    //captured stdout/stderr -- confirm against ProcessUtils
    private static final int MAX_STDOUT_BUFFER = 10000;
    private static final int MAX_STDERR_BUFFER = 1000;

    //maximum number of stderr characters copied into an exception message
    private static final int MAX_ERR_MSG_LENGTH = 100;

    private long timeoutMillis = 60000;

    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Extracts the rar with the external 'unrar' process and parses each
     * extracted file as an embedded document.
     *
     * @param tis      stream containing the rar archive
     * @param handler  receives the XHTML SAX events
     * @param metadata metadata of the container document
     * @param context  parse context; used to look up the embedded document extractor
     * @throws IOException   if the temp files cannot be written, listed or cleaned up,
     *                       or if running the process fails
     * @throws SAXException  if the handler fails
     * @throws TikaException if unrar exits with a non-zero status; an
     *                       {@link EncryptedDocumentException} if stderr reports an
     *                       encrypted file. A {@link TikaTimeoutException} is thrown
     *                       if the process times out.
     */
    @Override
    public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata,
                      ParseContext context) throws IOException, SAXException, TikaException {

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context);
        xhtml.startDocument();

        EmbeddedDocumentExtractor extractor =
                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);

        Path tmpRoot = Files.createTempDirectory("tika-unrar-");
        try {
            Path archive = Files.createTempFile(tmpRoot, "input", ".rar");
            try (OutputStream os = Files.newOutputStream(archive, StandardOpenOption.WRITE)) {
                IOUtils.copy(tis, os);
            }
            Path extractDir = Files.createDirectory(tmpRoot.resolve(EXTRACT_DIR_NAME));
            FileProcessResult result = unrar(extractDir, archive);
            //free the disk space early; the archive lives outside extractDir, so a
            //failure here cannot cause it to be parsed as an embedded file
            deleteBestEffort(archive);
            checkResult(result);
            //TODO: process stdout to extract status for each file:
            //e.g. Extracting  test-documents/testEXCEL.xls                              OK
            processDirectory(extractDir, extractDir, xhtml, extractor, context);
        } finally {
            FileUtils.deleteDirectory(tmpRoot.toFile());
        }
        xhtml.endDocument();
    }

    /**
     * Deletes the file if possible.  Failures are deliberately ignored because the
     * enclosing temp directory is removed recursively at the end of the parse.
     */
    private static void deleteBestEffort(Path file) {
        try {
            Files.deleteIfExists(file);
        } catch (IOException ignored) {
            //the recursive cleanup of the temp root will retry
        }
    }

    /**
     * Translates a timeout or a non-zero exit status of the unrar process
     * into the matching exception; returns normally on exit status 0.
     */
    private static void checkResult(FileProcessResult result) throws TikaException {
        if (result.isTimeout()) {
            throw new TikaTimeoutException("timed out unrarring");
        }
        if (result.getExitValue() == 0) {
            return;
        }
        String stderr = result.getStderr();
        if (stderr.contains("error in the encrypted file")) {
            throw new EncryptedDocumentException();
        }
        String msg = stderr.length() > MAX_ERR_MSG_LENGTH ?
                stderr.substring(0, MAX_ERR_MSG_LENGTH) : stderr;
        throw new TikaException("Unrecoverable problem with rar file, exitValue=" +
                result.getExitValue() + " : " + msg);
    }

    /**
     * Recursively walks {@code path} and parses every regular file below it.
     * Symbolic links are skipped: the archive is untrusted, and following a link
     * could read files outside of the extraction directory or loop forever.
     *
     * @param baseDir root of the extraction; embedded names are made relative to it
     * @param path    directory to walk
     * @throws IOException if a directory cannot be listed or a file cannot be read
     */
    private void processDirectory(Path baseDir, Path path,
                               XHTMLContentHandler xhtml,
                               EmbeddedDocumentExtractor extractor, ParseContext context)
            throws IOException, SAXException {
        File[] children = path.toFile().listFiles();
        if (children == null) {
            //listFiles() signals an I/O error (or a non-directory) with null
            throw new IOException("Unable to list extracted directory: " +
                    baseDir.relativize(path));
        }
        for (File child : children) {
            Path childPath = child.toPath();
            if (Files.isSymbolicLink(childPath)) {
                continue;
            }
            if (child.isDirectory()) {
                processDirectory(baseDir, childPath, xhtml, extractor, context);
            } else {
                processFile(baseDir, childPath, xhtml, extractor, context);
            }
        }
    }

    /**
     * Parses one extracted file as an embedded document.  The resource name is the
     * bare file name; the original resource name is the path relative to {@code base}.
     */
    private void processFile(Path base, Path embeddedFile,
                             XHTMLContentHandler xhtml, EmbeddedDocumentExtractor extractor,
                             ParseContext context)
            throws IOException, SAXException {
        String relPath = base.relativize(embeddedFile).toString();
        Metadata metadata = Metadata.newInstance(context);
        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, FilenameUtils.getName(relPath));
        metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, relPath);
        if (!extractor.shouldParseEmbedded(metadata)) {
            return;
        }
        try (TikaInputStream tis = TikaInputStream.get(embeddedFile)) {
            extractor.parseEmbedded(tis, xhtml, metadata, context, true);
        }
    }

    /**
     * Runs 'unrar x' on {@code tmp} with {@code cwd} as the process working directory.
     * No destination argument is passed, so this relies on unrar extracting into
     * its working directory.
     *
     * @param cwd working directory for the process
     * @param tmp the rar archive to extract
     * @return the captured result of the process
     */
    private FileProcessResult unrar(Path cwd, Path tmp) throws IOException {
        //we could use the -l option to check for potentially bad file names
        //e.g. path traversals
        ProcessBuilder pb = new ProcessBuilder();
        pb.directory(cwd.toFile());
        pb.command(
                "unrar",
                "x",  //extract with paths...hope that unrar protects against path traversals
                "-kb", // keep broken files
                "-p-", // we don't support passwords yet -- don't hang waiting for password on stdin
                ProcessUtils.escapeCommandLine(tmp.toAbsolutePath().toString())

        );
        return ProcessUtils.execute(pb, timeoutMillis, MAX_STDOUT_BUFFER, MAX_STDERR_BUFFER);
    }
}