FileCommandDetector.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.detect;

import java.io.IOException;
import java.nio.file.Path;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.tika.config.TikaComponent;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.ExternalProcess;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.utils.FileProcessResult;
import org.apache.tika.utils.ProcessUtils;
import org.apache.tika.utils.StringUtils;

/**
 * This runs the linux 'file' command against a file.  If
 * this is called on a TikaInputStream, it will use the underlying Path
 * or spool the full file to disk and then run file against that.
 * <p>
 * If this is run against any other type of InputStream, it will spool
 * up to {@link #maxBytes} to disk and then run the detector.
 * <p>
 * As with all detectors, mark must be supported.
 * <p>
 * If you want to use file's mime type in the parse, e.g.
 * to select the parser in AutoDetectParser, set {@link FileCommandDetector#setUseMime(boolean)}
 * to true.  The default behavior is to store the value as {@link FileCommandDetector#FILE_MIME}
 * but rely on other detectors for the "active" mime used by Tika.
 */
@TikaComponent(spi = false)
public class FileCommandDetector implements Detector {

    //TODO: file has some diff mimes names for some very common mimes
    //should we map file mimes to Tika mimes, e.g. text/xml -> application/xml??

    public static Property FILE_MIME = Property.externalText("file:mime");
    private static final Logger LOGGER = LoggerFactory.getLogger(FileCommandDetector.class);
    private static final long DEFAULT_TIMEOUT_MS = 6000;
    private static final String DEFAULT_FILE_COMMAND_PATH = "file";
    private static boolean HAS_WARNED = false;
    private Boolean hasFileCommand = null;
    private String fileCommandPath = DEFAULT_FILE_COMMAND_PATH;
    private int maxBytes = 1_000_000;
    private long timeoutMs = DEFAULT_TIMEOUT_MS;

    private boolean useMime = false;

    public static boolean checkHasFile() {
        return checkHasFile(DEFAULT_FILE_COMMAND_PATH);
    }


    public static boolean checkHasFile(String fileCommandPath) {
        String[] commandline = new String[]{fileCommandPath, "-v"};
        return ExternalParser.check(commandline);
    }

    /**
     * @param tis      document input stream, or <code>null</code>
     * @param metadata input metadata for the document
     * @param parseContext the parse context
     * @return mime as identified by the file command or application/octet-stream otherwise
     * @throws IOException
     */
    @Override
    public MediaType detect(TikaInputStream tis, Metadata metadata, ParseContext parseContext) throws IOException {
        if (hasFileCommand == null) {
            hasFileCommand = checkHasFile(this.fileCommandPath);
        }
        if (!hasFileCommand) {
            if (!HAS_WARNED) {
                LOGGER.warn("'file' command isn't working: '" + fileCommandPath + "'");
                HAS_WARNED = true;
            }
            return MediaType.OCTET_STREAM;
        }
        if (tis == null) {
            return MediaType.OCTET_STREAM;
        }
        //spool the full file to disk, if there is no underlying file
        return detectOnPath(tis.getPath(), metadata);
    }

    private MediaType detectOnPath(Path path, Metadata metadata) throws IOException {

        String[] args =
                new String[]{ProcessUtils.escapeCommandLine(fileCommandPath), "-b", "--mime-type",
                        ProcessUtils.escapeCommandLine(path.toAbsolutePath().toString())};
        ProcessBuilder builder = new ProcessBuilder(args);
        FileProcessResult result = ProcessUtils.execute(builder, timeoutMs, 10000, 10000);
        if (result.isTimeout()) {
            metadata.set(ExternalProcess.IS_TIMEOUT, true);
            return MediaType.OCTET_STREAM;
        }
        if (result.getExitValue() != 0) {
            metadata.set(ExternalProcess.EXIT_VALUE, result.getExitValue());
            return MediaType.OCTET_STREAM;
        }
        String mimeString = result.getStdout();
        if (StringUtils.isBlank(mimeString)) {
            return MediaType.OCTET_STREAM;
        }
        metadata.set(FILE_MIME, mimeString);
        if (useMime) {
            MediaType mt = MediaType.parse(mimeString);
            if (mt == null) {
                return MediaType.OCTET_STREAM;
            } else {
                return mt;
            }
        }
        return MediaType.OCTET_STREAM;
    }

    public void setFilePath(String fileCommandPath) {
        //this opens up a potential command vulnerability.
        //Don't ever let an untrusted user set this.
        this.fileCommandPath = fileCommandPath;
        checkHasFile(this.fileCommandPath);
    }

    public void setUseMime(boolean useMime) {
        this.useMime = useMime;
    }

    public boolean isUseMime() {
        return useMime;
    }
    /**
     * If this is not called on a TikaInputStream, this detector
     * will spool up to this many bytes to a file to be detected
     * by the 'file' command.
     *
     * @param maxBytes
     */
    public void setMaxBytes(int maxBytes) {
        this.maxBytes = maxBytes;
    }

    public void setTimeoutMs(long timeoutMs) {
        this.timeoutMs = timeoutMs;
    }

}