TikaCLI.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.cli;

import static java.nio.charset.StandardCharsets.UTF_8;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.lang.reflect.Field;
import java.net.URI;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeSet;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;

import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.logging.log4j.Level;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import org.apache.tika.Tika;
import org.apache.tika.async.cli.TikaAsyncCLI;
import org.apache.tika.config.EmbeddedLimits;
import org.apache.tika.config.TimeoutLimits;
import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.detect.CompositeDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.digest.DigestDef;
import org.apache.tika.digest.DigesterFactory;
import org.apache.tika.exception.TikaException;
import org.apache.tika.gui.TikaGUI;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.language.detect.LanguageHandler;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.NetworkParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.digestutils.CommonsDigesterFactory;
import org.apache.tika.pipes.api.ParseMode;
import org.apache.tika.pipes.fork.PipesForkParser;
import org.apache.tika.pipes.fork.PipesForkParserConfig;
import org.apache.tika.pipes.fork.PipesForkResult;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.sax.ExpandedTitleContentHandler;
import org.apache.tika.sax.RecursiveParserWrapperHandler;
import org.apache.tika.sax.ToMarkdownContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;
import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler;
import org.apache.tika.serialization.JsonMetadata;
import org.apache.tika.serialization.JsonMetadataList;
import org.apache.tika.utils.StringUtils;
import org.apache.tika.utils.XMLReaderUtils;
import org.apache.tika.xmp.XMPMetadata;

/**
 * Simple command line interface for Apache Tika.
 */
public class TikaCLI {
    /** Logger for this class. */
    private static final Logger LOG = LoggerFactory.getLogger(TikaCLI.class);

    // 20 * 1024 * 1024 bytes. No use of this constant is visible in this part of the file;
    // presumably a stream mark/reset limit -- TODO confirm against the rest of the class.
    private final int MAX_MARK = 20 * 1024 * 1024;//20MB

    /**
     * Output type whose content handler is a bare {@link DefaultHandler};
     * nothing is written to the supplied output stream by the handler.
     */
    private final OutputType NO_OUTPUT = new OutputType() {
        @Override
        protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) {
            return new DefaultHandler();
        }
    };
    /** Parse context shared by detection and parsing calls; created in the constructor. */
    private ParseContext context;
    /** Detector used by the {@code DETECT} output type; assigned outside the visible part of the file. */
    private Detector detector;
    /** Parser wrapped for recursive JSON output; assigned outside the visible part of the file. */
    private Parser parser;
    /** Loader used to obtain the metadata filters applied to recursive JSON output. */
    private TikaLoader tikaLoader;
    /** Value of the {@code --config=} option, or <code>null</code> if the option was not given. */
    private String configFilePath;
    /** Set by {@code -J} / {@code --jsonRecursive}. */
    private boolean recursiveJSON = false;
    /** Set by {@code -c<uri>} / {@code --client=<uri>}. */
    private URI networkURI = null;
    /**
     * Output character encoding set by {@code -e<name>} / {@code --encoding=<name>},
     * or <code>null</code> if none was requested. When <code>null</code>, writers created
     * by {@code getOutputWriter} use UTF-8 and {@code getTransformerHandler} leaves the
     * transformer's encoding property unset.
     */
    private String encoding = null;
    /** Selected by {@code -t} / {@code --text}: writes through a {@link BodyContentHandler}. */
    private final OutputType TEXT = new OutputType() {
        @Override
        protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception {
            return new BodyContentHandler(getOutputWriter(output, encoding));
        }
    };
    /** Selected by {@code -T} / {@code --text-main}: writes through a {@link BoilerpipeContentHandler}. */
    private final OutputType TEXT_MAIN = new OutputType() {
        @Override
        protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception {
            return new BoilerpipeContentHandler(getOutputWriter(output, encoding));
        }
    };
    /** Selected by {@code -A} / {@code --text-all}: writes through a {@link WriteOutContentHandler}. */
    private final OutputType TEXT_ALL = new OutputType() {
        @Override
        protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception {
            return new WriteOutContentHandler(getOutputWriter(output, encoding));
        }
    };
    /**
     * Selected by {@code -m} / {@code --metadata}. The handler class is declared
     * outside the visible part of this file; it receives the metadata object and a writer.
     */
    private final OutputType METADATA = new OutputType() {
        @Override
        protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception {
            final PrintWriter writer = new PrintWriter(getOutputWriter(output, encoding));
            return new NoDocumentMetHandler(metadata, writer);
        }
    };
    /** Selected by {@code -j} / {@code --json}; same wiring as {@code METADATA} with a JSON handler. */
    private final OutputType JSON = new OutputType() {
        @Override
        protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception {
            final PrintWriter writer = new PrintWriter(getOutputWriter(output, encoding));
            return new NoDocumentJSONMetHandler(metadata, writer);
        }
    };
    /** Selected by {@code -y} / {@code --xmp}; same wiring as {@code METADATA} with an XMP handler. */
    private final OutputType XMP = new OutputType() {
        @Override
        protected ContentHandler getContentHandler(OutputStream output, final Metadata metadata) throws Exception {
            final PrintWriter writer = new PrintWriter(getOutputWriter(output, encoding));
            return new NoDocumentXMPMetaHandler(metadata, writer);
        }
    };
    /**
     * Selected by {@code -l} / {@code --language}: a {@link LanguageHandler} that prints
     * the detected language and flushes the writer when the document ends.
     */
    private final OutputType LANGUAGE = new OutputType() {
        @Override
        protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception {
            final PrintWriter writer = new PrintWriter(getOutputWriter(output, encoding));
            return new LanguageHandler() {
                public void endDocument() {
                    // Emit the result only once the whole document has been seen.
                    writer.println(getLanguage().getLanguage());
                    writer.flush();
                }
            };
        }
    };
    /**
     * Selected by {@code -d} / {@code --detect}. Overrides {@code process} itself, so no
     * content handler is created: the detector's result is printed and the writer flushed.
     */
    private final OutputType DETECT = new OutputType() {
        @Override
        public void process(TikaInputStream tis, OutputStream output, Metadata metadata) throws Exception {
            PrintWriter writer = new PrintWriter(getOutputWriter(output, encoding));
            writer.println(detector
                    .detect(tis, metadata, context)
                    .toString());
            writer.flush();
        }
    };
    /**
     * Password for opening encrypted documents, or <code>null</code>.
     * Initialised from the <code>TIKA_PASSWORD</code> environment variable and
     * overridden by {@code -p<password>} / {@code --password=<password>}.
     */
    private String password = System.getenv("TIKA_PASSWORD");
    /** Set by {@code --digest=<algorithm>}; <code>null</code> when no digest was requested. */
    private DigesterFactory digesterFactory = null;
    /**
     * Maximum depth for embedded document extraction, or -1 for unlimited.
     */
    private int maxEmbeddedDepth = EmbeddedLimits.UNLIMITED;
    /**
     * Maximum count of embedded documents to extract, or -1 for unlimited.
     */
    private int maxEmbeddedCount = EmbeddedLimits.UNLIMITED;
    /**
     * Starts out <code>true</code> and is cleared by every argument that performs an
     * action (listing, help, processing a document, ...). If it is still <code>true</code>
     * after all arguments were handled, {@code main} processes standard input.
     */
    private boolean pipeMode = true;
    /** Set by {@code -r} / {@code --pretty-print}; controls indentation of XML/HTML and JSON output. */
    private boolean prettyPrint;
    /**
     * Fork mode: run parsing in a forked JVM process for isolation.
     */
    private boolean forkMode = false;
    /**
     * Fork mode timeout in milliseconds.
     */
    private long forkTimeout = 60000;
    /**
     * Fork mode JVM arguments.
     */
    private List<String> forkJvmArgs = null;
    /**
     * Fork mode plugins directory.
     */
    private String forkPluginsDir = null;
    /**
     * Selected by {@code --md}: a {@link ToMarkdownContentHandler} wrapped in a
     * {@link BodyContentHandler}.
     */
    private final OutputType MARKDOWN = new OutputType() {
        @Override
        protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception {
            return new BodyContentHandler(new ToMarkdownContentHandler(getOutputWriter(output, encoding)));
        }
    };
    /** Selected by {@code -x} / {@code --xml}: serialises SAX events with the "xml" output method. */
    private final OutputType XML = new OutputType() {
        @Override
        protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception {
            return getTransformerHandler(output, "xml", encoding, prettyPrint);
        }
    };
    // Currently selected output type; XML is the default. This declaration has to stay
    // below XML because the initializer refers to that field.
    private OutputType type = XML;
    /**
     * Selected by {@code -h} / {@code --html}: serialises SAX events with the "html"
     * output method, wrapped in an {@link ExpandedTitleContentHandler}.
     */
    private final OutputType HTML = new OutputType() {
        @Override
        protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception {
            return new ExpandedTitleContentHandler(getTransformerHandler(output, "html", encoding, prettyPrint));
        }
    };

    /**
     * Creates a CLI instance with a freshly created {@link ParseContext}.
     */
    public TikaCLI() {
        this.context = new ParseContext();
    }

    /**
     * Command-line entry point.
     * <p>
     * Help requests and async/batch invocations are dispatched first. Otherwise every
     * argument is handed to {@link #process(String)} in order, followed by standard input
     * if no argument cleared pipe mode. Without any arguments, standard input is processed
     * when data is waiting there; if not, the GUI is started.
     *
     * @param args raw command-line arguments
     * @throws Exception if option handling or document processing fails
     */
    public static void main(String[] args) throws Exception {
        TikaCLI cli = new TikaCLI();

        if (cli.testForHelp(args)) {
            cli.usage();
            return;
        }
        if (cli.testForAsync(args)) {
            async(args);
            return;
        }

        if (args.length == 0) {
            // No arguments at all: give standard input up to 0.1s to show
            // some data. If it does, behave as a pipe; if not, open the GUI.
            if (System.in.available() == 0) {
                Thread.sleep(100);
            }
            boolean stdinHasData = System.in.available() > 0;
            cli.process(stdinHasData ? "-" : "--gui");
            return;
        }

        for (String argument : args) {
            cli.process(argument);
        }
        if (cli.pipeMode) {
            cli.process("-");
        }
    }

    /**
     * Hands an async/batch invocation over to {@link TikaAsyncCLI}.
     * <p>
     * The arguments are first rewritten by {@code AsyncHelper.translateArgs}. Every kind
     * of async invocation (explicit {@code -c} config, {@code -Z}/{@code -z}/{@code --extract},
     * a single {@code .json} config file, or batch mode with two directories) is then served
     * by the same call, so no inspection of the translated arguments is needed here.
     *
     * @param args raw command-line arguments
     * @throws Exception if argument translation or the async run fails
     */
    private static void async(String[] args) throws Exception {
        // For batch mode (two directories) TikaAsyncCLI is expected to create its own
        // config (plugin-roots, fetcher, emitter, pipes-iterator) -- see TikaAsyncCLI.
        TikaAsyncCLI.main(AsyncHelper.translateArgs(args));
    }

    /**
     * Returns an output writer with the given encoding.
     *
     * @param output   output stream the writer encodes onto
     * @param encoding name of the output encoding,
     *                 or <code>null</code> to use UTF-8
     * @return output writer
     * @throws UnsupportedEncodingException if the given encoding is not supported
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-277">TIKA-277</a>
     */
    private static Writer getOutputWriter(OutputStream output, String encoding) throws UnsupportedEncodingException {
        if (encoding == null) {
            // No explicit request: always UTF-8, independent of the platform default.
            return new OutputStreamWriter(output, UTF_8);
        }
        return new OutputStreamWriter(output, encoding);
    }

    /**
     * Returns a transformer handler that serializes incoming SAX events
     * to XHTML or HTML (depending on the given method) and writes the result
     * to the given output stream.
     *
     * @param output      output stream that receives the serialized document
     * @param method      "xml" or "html"
     * @param encoding    output encoding, or <code>null</code> to leave the
     *                    transformer's encoding property unset
     * @param prettyPrint whether the transformer should indent its output
     * @return transformer handler writing to <code>output</code>
     * @throws TransformerConfigurationException if the transformer can not be created
     * @throws TikaException                     if the transformer factory can not be obtained
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-277">TIKA-277</a>
     */
    private static TransformerHandler getTransformerHandler(OutputStream output, String method, String encoding, boolean prettyPrint)
            throws TransformerConfigurationException, TikaException {
        TransformerHandler handler = XMLReaderUtils.getSAXTransformerFactory().newTransformerHandler();
        String indent = prettyPrint ? "yes" : "no";

        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, method);
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, indent);
        if (encoding != null) {
            handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, encoding);
        }

        handler.setResult(new StreamResult(output));
        return handler;
    }

    /**
     * Decides whether the command line should be handled by the async/batch CLI.
     * <p>
     * Returns <code>true</code> when any of the following holds:
     * <ul>
     *   <li>the only argument ends with <code>.json</code>;</li>
     *   <li>there are exactly two arguments and the first is an existing directory;</li>
     *   <li>neither of the last two arguments starts with <code>-</code>, the second to
     *       last is an existing directory and the last is a directory or does not exist;</li>
     *   <li>one of the async options is present (<code>-a</code>, <code>-i</code>,
     *       <code>-o</code>, <code>-Z</code>, <code>-z</code>, <code>--async</code>,
     *       <code>--input*</code>, <code>--output*</code>, <code>--extract</code>,
     *       <code>--extract-dir*</code>, <code>--fileList</code>).</li>
     * </ul>
     * Arguments that cannot be converted to a file-system path are never treated as
     * directories.
     *
     * @param args raw command-line arguments
     * @return <code>true</code> if the invocation should be routed to async mode
     */
    private boolean testForAsync(String[] args) {

        // Single .json file is a config file for async mode
        if (args.length == 1 && args[0].endsWith(".json")) {
            return true;
        }

        if (args.length == 2) {
            try {
                if (Files.isDirectory(Paths.get(args[0]))) {
                    return true;
                }
            } catch (java.nio.file.InvalidPathException ignored) {
                // The argument is not a valid path on this platform (for example a URL
                // containing characters that are illegal in file names), so it cannot
                // be an input directory. Keep checking the remaining rules.
            }
        }

        // Check if last two args are directories (batch mode with options)
        if (args.length >= 2) {
            String lastArg = args[args.length - 1];
            String secondLastArg = args[args.length - 2];
            // Make sure neither looks like an option value
            if (!lastArg.startsWith("-") && !secondLastArg.startsWith("-")) {
                try {
                    if (Files.isDirectory(Paths.get(secondLastArg)) &&
                        (Files.isDirectory(Paths.get(lastArg)) || !Files.exists(Paths.get(lastArg)))) {
                        return true;
                    }
                } catch (Exception ignored) {
                    // Invalid path, not batch mode
                }
            }
        }

        for (String arg : args) {
            if (arg.equals("-a") || arg.equals("--async")) {
                return true;
            }
            if (arg.equals("-i") || arg.startsWith("--input")) {
                return true;
            }
            if (arg.equals("-o") || arg.startsWith("--output")) {
                return true;
            }
            if (arg.equals("-Z") || arg.equals("-z") || arg.equals("--extract") || arg.startsWith("--extract-dir")) {
                return true;
            }
            if (arg.equals("--fileList")) {
                return true;
            }
        }
        return false;
    }

    /**
     * Handles a single command-line argument.
     * <p>
     * Recognised options either perform an action immediately (help, version, GUI,
     * the various listings, config conversion) or record a setting on this instance
     * (output type, encoding, password, fork settings, embedded limits, ...). Any
     * argument that is not recognised as an option is treated as a document to
     * process: <code>-</code> means standard input, an existing file is read from
     * disk, and everything else is parsed as a URL. Settings therefore only affect
     * documents that appear after them on the command line.
     * <p>
     * The order of the branches matters because several options are matched by
     * prefix (<code>-e</code>, <code>-p</code>, <code>-c</code>).
     *
     * @param arg one command-line argument
     * @throws Exception if the option value is invalid or processing the document fails
     */
    public void process(String arg) throws Exception {
        if (arg.equals("-?") || arg.equals("--help")) {
            pipeMode = false;
            usage();
        } else if (arg.equals("-V") || arg.equals("--version")) {
            pipeMode = false;
            version();
        } else if (arg.equals("-v") || arg.equals("--verbose")) {
            org.apache.logging.log4j.core.config.Configurator.setRootLevel(Level.DEBUG);
        } else if (arg.equals("-g") || arg.equals("--gui")) {
            pipeMode = false;
            // Only a --config given earlier on the command line is forwarded to the GUI.
            if (configFilePath != null) {
                TikaGUI.main(new String[]{configFilePath});
            } else {
                TikaGUI.main(new String[0]);
            }
        } else if (arg.equals("--list-parser") || arg.equals("--list-parsers")) {
            pipeMode = false;
            displayParsers(false, false);
        } else if (arg.equals("--list-detector") || arg.equals("--list-detectors")) {
            pipeMode = false;
            displayDetectors();
        } else if (arg.equals("--list-parser-detail") || arg.equals("--list-parser-details")) {
            pipeMode = false;
            displayParsers(true, false);
        } else if (arg.equals("--list-parser-detail-apt") || arg.equals("--list-parser-details-apt")) {
            pipeMode = false;
            displayParsers(true, true);
        } else if (arg.equals("--list-met-models")) {
            pipeMode = false;
            displayMetModels();
        } else if (arg.equals("--list-supported-types")) {
            pipeMode = false;
            displaySupportedTypes();
        } else if (arg.startsWith("--compare-file-magic=")) {
            pipeMode = false;
            compareFileMagic(arg.substring("--compare-file-magic=".length()));
        //TODO -- rework with json serialization
        /*} else if (arg.equals("--dump-minimal-config")) {
            pipeMode = false;
            dumpConfig(TikaConfigSerializer.Mode.MINIMAL);
        } else if (arg.equals("--dump-current-config")) {
            pipeMode = false;
            dumpConfig(TikaConfigSerializer.Mode.CURRENT);
        } else if (arg.equals("--dump-static-config")) {
            pipeMode = false;
            dumpConfig(TikaConfigSerializer.Mode.STATIC);
        } else if (arg.equals("--dump-static-full-config")) {
            pipeMode = false;
            dumpConfig(TikaConfigSerializer.Mode.STATIC_FULL);*/
        } else if (arg.startsWith("--convert-config-xml-to-json=")) {
            pipeMode = false;
            convertConfigXmlToJson(arg.substring("--convert-config-xml-to-json=".length()));
        } else if (arg.equals("--container-aware") || arg.equals("--container-aware-detector")) {
            // ignore, as container-aware detectors are now always used
        } else if (arg.startsWith("--config=")) {
            configFilePath = arg.substring("--config=".length());
        } else if (arg.startsWith("--digest=")) {
            // Algorithm names are matched case-insensitively against DigestDef.Algorithm;
            // an unknown name makes valueOf throw IllegalArgumentException.
            String algorithmName = arg.substring("--digest=".length()).toUpperCase(Locale.ROOT);
            DigestDef.Algorithm algorithm = DigestDef.Algorithm.valueOf(algorithmName);
            CommonsDigesterFactory factory = new CommonsDigesterFactory();
            factory.setDigests(Collections.singletonList(new DigestDef(algorithm)));
            digesterFactory = factory;
        } else if (arg.startsWith("-e")) {
            // Short form: the value follows the flag directly (e.g. -eUTF-8).
            // NOTE(review): a bare "-e" stores an empty encoding name.
            encoding = arg.substring("-e".length());
        } else if (arg.startsWith("--encoding=")) {
            encoding = arg.substring("--encoding=".length());
        } else if (arg.startsWith("-p") && !arg.equals("-p")) {
            // -p<password>; a bare -p is left for the unsupported --port branch below.
            password = arg.substring("-p".length());
        } else if (arg.startsWith("--password=")) {
            password = arg.substring("--password=".length());
        } else if (arg.equals("-j") || arg.equals("--json")) {
            type = JSON;
        } else if (arg.equals("-J") || arg.equals("--jsonRecursive")) {
            recursiveJSON = true;
        } else if (arg.equals("-y") || arg.equals("--xmp")) {
            type = XMP;
        } else if (arg.equals("-x") || arg.equals("--xml")) {
            type = XML;
        } else if (arg.equals("-h") || arg.equals("--html")) {
            type = HTML;
        } else if (arg.equals("--md")) {
            type = MARKDOWN;
        } else if (arg.equals("-t") || arg.equals("--text")) {
            type = TEXT;
        } else if (arg.equals("-T") || arg.equals("--text-main")) {
            type = TEXT_MAIN;
        } else if (arg.equals("-A") || arg.equals("--text-all")) {
            type = TEXT_ALL;
        } else if (arg.equals("-m") || arg.equals("--metadata")) {
            type = METADATA;
        } else if (arg.equals("-l") || arg.equals("--language")) {
            type = LANGUAGE;
        } else if (arg.equals("-d") || arg.equals("--detect")) {
            type = DETECT;
        } else if (arg.equals("-f") || arg.equals("--fork")) {
            forkMode = true;
        } else if (arg.startsWith("--fork-timeout=")) {
            forkTimeout = Long.parseLong(arg.substring("--fork-timeout=".length()));
        } else if (arg.startsWith("--fork-jvm-args=")) {
            // Comma-separated list of JVM arguments for the forked process.
            forkJvmArgs = Arrays.asList(arg.substring("--fork-jvm-args=".length()).split(","));
        } else if (arg.startsWith("--fork-plugins-dir=")) {
            forkPluginsDir = arg.substring("--fork-plugins-dir=".length());
        } else if (arg.startsWith("--maxEmbeddedDepth=")) {
            maxEmbeddedDepth = Integer.parseInt(arg.substring("--maxEmbeddedDepth=".length()));
        } else if (arg.startsWith("--maxEmbeddedCount=")) {
            maxEmbeddedCount = Integer.parseInt(arg.substring("--maxEmbeddedCount=".length()));
        } else if (arg.equals("-r") || arg.equals("--pretty-print")) {
            prettyPrint = true;
        } else if (arg.equals("-p") || arg.equals("--port") || arg.equals("-s") || arg.equals("--server")) {
            throw new IllegalArgumentException("As of Tika 2.0, the server option is no longer supported in tika-app.\n" + "See https://wiki.apache.org/tika/TikaJAXRS for usage.");
        } else if (arg.startsWith("-c")) {
            // Short form of --client=: the URI follows the flag directly.
            networkURI = new URI(arg.substring("-c".length()));
        } else if (arg.startsWith("--client=")) {
            networkURI = new URI(arg.substring("--client=".length()));
        } else {
            // Not an option: treat the argument as a document to process.
            pipeMode = false;
            // configure() is declared outside the visible part of this file; presumably it
            // applies the settings collected so far (parser, detector, context) -- TODO confirm.
            configure();

            if (arg.equals("-")) {
                // The close shield keeps System.in open when the TikaInputStream is closed.
                // NOTE(review): recursiveJSON is not consulted on this path unless fork mode is on.
                try (TikaInputStream tis = TikaInputStream.get(CloseShieldInputStream.wrap(System.in))) {
                    if (forkMode) {
                        processWithFork(tis, Metadata.newInstance(context), System.out);
                    } else {
                        type.process(tis, System.out, Metadata.newInstance(context));
                    }
                }
            } else {
                // An existing regular file wins; anything else must be a well-formed URL.
                URL url;
                File file = new File(arg);
                if (file.isFile()) {
                    url = file
                            .toURI()
                            .toURL();
                } else {
                    url = new URL(arg);
                }
                if (forkMode) {
                    Metadata metadata = Metadata.newInstance(context);
                    try (TikaInputStream tis = TikaInputStream.get(url, metadata)) {
                        processWithFork(tis, metadata, System.out);
                    }
                } else if (recursiveJSON) {
                    handleRecursiveJson(url, System.out);
                } else {
                    Metadata metadata = Metadata.newInstance(context);
                    try (TikaInputStream tis = TikaInputStream.get(url, metadata)) {
                        type.process(tis, System.out, metadata);
                    } finally {
                        System.out.flush();
                    }
                }
            }
        }
    }

    //TODO -- rework with json serialization
    /*private void dumpConfig(TikaConfigSerializer.Mode mode) throws Exception {
        configure();
        TikaLoader localConfig = (tikaLoader == null) ? TikaLoader.loadDefault() : tikaLoader;
        //TODO -- implement mode
        System.out.println(localConfig.getConfig().toString());
    }*/

    /**
     * Converts a legacy XML configuration file to JSON.
     * <p>
     * Malformed arguments and a missing input file are reported on standard error
     * and the method returns without converting. A failing conversion is reported
     * on standard error and the exception is rethrown.
     *
     * @param paths input and output path separated by a single comma,
     *              i.e. <code>&lt;input.xml&gt;,&lt;output.json&gt;</code>
     * @throws Exception if the conversion itself fails
     */
    private void convertConfigXmlToJson(String paths) throws Exception {
        String[] inputAndOutput = paths.split(",");
        if (inputAndOutput.length != 2) {
            System.err.println("Error: --convert-config-xml-to-json requires input and output paths separated by comma");
            System.err.println("Usage: --convert-config-xml-to-json=<input.xml>,<output.json>");
            return;
        }

        Path source = Paths.get(inputAndOutput[0].trim());
        Path target = Paths.get(inputAndOutput[1].trim());

        boolean sourcePresent = Files.exists(source);
        if (!sourcePresent) {
            System.err.println("Error: Input XML file not found: " + source);
            return;
        }

        try {
            XmlToJsonConfigConverter.convert(source, target);
            System.out.println("Successfully converted XML config to JSON:");
            System.out.println("  Input:  " + source.toAbsolutePath());
            System.out.println("  Output: " + target.toAbsolutePath());
        } catch (Exception failure) {
            // Tell the user what went wrong, then let the caller see the failure as well.
            System.err.println("Error converting config: " + failure.getMessage());
            throw failure;
        }
    }

    /**
     * Parses the document at the given URL with a {@link RecursiveParserWrapper} and
     * writes the resulting metadata list (container plus embedded documents) as JSON.
     * <p>
     * The metadata filters obtained from the Tika loader are applied before
     * serialization. The given output stream is flushed but never closed, so the
     * caller (which passes {@link System#out}) can keep writing to it for any
     * further documents on the command line.
     *
     * @param url    location of the document to parse
     * @param output stream that receives the JSON; left open
     * @throws IOException   if reading the document or writing the JSON fails
     * @throws SAXException  if the parser reports a SAX error
     * @throws TikaException if parsing or metadata filtering fails
     */
    private void handleRecursiveJson(URL url, OutputStream output) throws IOException, SAXException, TikaException {
        Metadata metadata = Metadata.newInstance(context);
        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser);
        RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(getContentHandlerFactory(type));
        try (TikaInputStream tis = TikaInputStream.get(url, metadata)) {
            wrapper.parse(tis, handler, metadata, context);
        }
        JsonMetadataList.setPrettyPrinting(prettyPrint);

        // Closing the writer must not close the caller's stream: turn close() into flush().
        OutputStream sink = new java.io.FilterOutputStream(output) {
            @Override
            public void write(byte[] b, int off, int len) throws IOException {
                // Bulk write; the inherited implementation writes byte by byte.
                out.write(b, off, len);
            }

            @Override
            public void close() throws IOException {
                flush();
            }
        };
        try (Writer writer = getOutputWriter(sink, encoding)) {
            List<Metadata> metadataList = handler.getMetadataList();
            tikaLoader.loadMetadataFilters().filter(metadataList);
            JsonMetadataList.toJson(metadataList, writer);
        }
    }

    /**
     * Process a file using forked JVM process for isolation.
     * This provides protection against parser crashes, OOM, and other issues.
     * <p>
     * The fork configuration is derived from the options collected on this instance
     * (output type, recursive JSON flag, fork timeout, JVM arguments, plugins
     * directory and embedded limits). What gets written depends on the mode:
     * <ul>
     *   <li>recursive JSON: the whole metadata list as JSON;</li>
     *   <li>JSON or metadata output: the first metadata object only;</li>
     *   <li>everything else: the extracted content of the first metadata object, if any.</li>
     * </ul>
     * If the forked process crashed, the status is logged and printed on standard
     * error and nothing is written. The given output stream is flushed but never
     * closed, so the caller (which passes {@link System#out}) can keep writing to it
     * for any further documents on the command line.
     *
     * @param tis      stream of the document to parse
     * @param metadata metadata passed along with the document
     * @param output   stream that receives the result; left open
     * @throws Exception if starting the fork, parsing, or writing the result fails
     */
    private void processWithFork(TikaInputStream tis, Metadata metadata, OutputStream output) throws Exception {
        PipesForkParserConfig config = new PipesForkParserConfig();

        // Set handler type based on output type
        config.setContentHandlerFactory(getContentHandlerFactory(type));

        // Set parse mode based on recursiveJSON flag
        if (recursiveJSON) {
            config.setParseMode(ParseMode.RMETA);
        } else {
            config.setParseMode(ParseMode.CONCATENATE);
        }

        // Set timeout (the meaning of the two TimeoutLimits arguments is defined by that class)
        config.setTimeoutLimits(new TimeoutLimits(
                TimeoutLimits.DEFAULT_TOTAL_TASK_TIMEOUT_MILLIS, forkTimeout));

        // Set JVM args if provided
        if (forkJvmArgs != null && !forkJvmArgs.isEmpty()) {
            config.setJvmArgs(forkJvmArgs);
        }

        // Set plugins directory if provided
        if (forkPluginsDir != null) {
            config.setPluginsDir(Paths.get(forkPluginsDir));
        }

        // Set embedded limits if configured
        if (maxEmbeddedDepth != EmbeddedLimits.UNLIMITED || maxEmbeddedCount != EmbeddedLimits.UNLIMITED) {
            EmbeddedLimits limits = new EmbeddedLimits();
            if (maxEmbeddedDepth != EmbeddedLimits.UNLIMITED) {
                limits.setMaxDepth(maxEmbeddedDepth);
            }
            if (maxEmbeddedCount != EmbeddedLimits.UNLIMITED) {
                limits.setMaxCount(maxEmbeddedCount);
            }
            config.setEmbeddedLimits(limits);
        }

        // Closing a writer must not close the caller's stream: turn close() into flush().
        final OutputStream sink = new java.io.FilterOutputStream(output) {
            @Override
            public void write(byte[] b, int off, int len) throws IOException {
                // Bulk write; the inherited implementation writes byte by byte.
                out.write(b, off, len);
            }

            @Override
            public void close() throws IOException {
                flush();
            }
        };

        try (PipesForkParser forkParser = new PipesForkParser(config)) {
            PipesForkResult result = forkParser.parse(tis, metadata);

            if (result.isProcessCrash()) {
                LOG.error("Fork process crashed: {}", result.getStatus());
                System.err.println("Fork process crashed: " + result.getStatus());
                return;
            }

            List<Metadata> metadataList = result.getMetadataList();

            // Output based on type
            if (recursiveJSON) {
                // Output as JSON metadata list
                JsonMetadataList.setPrettyPrinting(prettyPrint);
                try (Writer writer = getOutputWriter(sink, encoding)) {
                    JsonMetadataList.toJson(metadataList, writer);
                }
            } else if (type == JSON || type == METADATA) {
                // Output metadata (first item only for single-file mode)
                if (!metadataList.isEmpty()) {
                    Metadata m = metadataList.get(0);
                    if (type == JSON) {
                        JsonMetadata.setPrettyPrinting(prettyPrint);
                        try (Writer writer = getOutputWriter(sink, encoding)) {
                            JsonMetadata.toJson(m, writer);
                        }
                    } else {
                        try (PrintWriter writer = new PrintWriter(getOutputWriter(sink, encoding))) {
                            // One "name: value" line per value, names in sorted order.
                            String[] names = m.names();
                            Arrays.sort(names);
                            for (String name : names) {
                                for (String value : m.getValues(name)) {
                                    writer.println(name + ": " + value);
                                }
                            }
                            writer.flush();
                        }
                    }
                }
            } else {
                // Output content (text, xml, html)
                if (!metadataList.isEmpty()) {
                    String content = metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT);
                    if (content != null) {
                        try (Writer writer = getOutputWriter(sink, encoding)) {
                            writer.write(content);
                            writer.flush();
                        }
                    }
                }
            }
        }
    }

    /**
     * Maps a CLI output type to the content handler factory used for recursive
     * and forked parsing.
     * <p>
     * HTML, XML, TEXT and TEXT_MAIN map to the HTML, XML, TEXT and BODY handler
     * types respectively; METADATA and every other output type map to IGNORE.
     *
     * @param type selected output type
     * @return factory producing handlers of the matching type
     */
    private ContentHandlerFactory getContentHandlerFactory(OutputType type) {
        final BasicContentHandlerFactory.HANDLER_TYPE chosen;
        if (type.equals(HTML)) {
            chosen = BasicContentHandlerFactory.HANDLER_TYPE.HTML;
        } else if (type.equals(XML)) {
            chosen = BasicContentHandlerFactory.HANDLER_TYPE.XML;
        } else if (type.equals(TEXT)) {
            chosen = BasicContentHandlerFactory.HANDLER_TYPE.TEXT;
        } else if (type.equals(TEXT_MAIN)) {
            chosen = BasicContentHandlerFactory.HANDLER_TYPE.BODY;
        } else {
            // METADATA needs no content, and no other output type has a dedicated handler here.
            chosen = BasicContentHandlerFactory.HANDLER_TYPE.IGNORE;
        }
        // Second argument is passed through unchanged as -1 (presumably "no write limit" -- confirm).
        final int writeLimit = -1;
        return new BasicContentHandlerFactory(chosen, writeLimit);
    }

    /**
     * Prints the command line usage message (options, description, GUI mode
     * and batch mode help) to standard output.
     */
    private void usage() {
        PrintStream out = System.out;
        out.println("usage: java -jar tika-app.jar [option...] [file...]");
        out.println();
        out.println("Options:");
        out.println("    -?  or --help          Print this usage message");
        out.println("    -v  or --verbose       Print debug level messages");
        out.println("    -V  or --version       Print the Apache Tika version number");
        out.println();
        out.println("    -g  or --gui           Start the Apache Tika GUI");
        out.println();
        out.println("    --config=<tika-config.xml>");
        out.println("        TikaConfig file. Must be specified before -g, -s, -f or the dump-x-config !");
        // TODO: TIKA-XXXX - Re-enable config dump options once JSON serialization is complete
        // These options are not yet implemented in 4.x due to the migration from XML to JSON config
        // out.println("    --dump-minimal-config  Print minimal TikaConfig");
        // out.println("    --dump-current-config  Print current TikaConfig");
        // out.println("    --dump-static-config   Print static config");
        // out.println("    --dump-static-full-config  Print static explicit config");
        out.println("    --convert-config-xml-to-json=<input.xml>,<output.json>");
        out.println("        Convert legacy XML config to JSON format (parsers section only)");
        out.println();
        out.println("    -x  or --xml           Output XHTML content (default)");
        out.println("    -h  or --html          Output HTML content");
        out.println("    -t  or --text          Output plain text content (body)");
        out.println("    --md                   Output Markdown content (body)");
        out.println("    -T  or --text-main     Output plain text content (main content only via boilerpipe handler)");
        out.println("    -A  or --text-all      Output all text content");
        out.println("    -m  or --metadata      Output only metadata");
        out.println("    -j  or --json          Output metadata in JSON");
        out.println("    -y  or --xmp           Output metadata in XMP");
        out.println("    -J  or --jsonRecursive Output metadata and content from all");
        out.println("                           embedded files (choose content type");
        out.println("                           with -x, -h, -t or -m; default is -x)");
        out.println("    -a  or --async         Run Tika in async mode; must specify details in a tikaConfig file");
        out.println("    -l  or --language      Output only language");
        out.println("    -d  or --detect        Detect document type");
        out.println("           --digest=X      Include digest X (md2, md5, sha1,");
        out.println("                               sha256, sha384, sha512)");
        out.println("    -eX or --encoding=X    Use output encoding X");
        out.println("    -pX or --password=X    Use document password X");
        out.println("    -z  or --extract       Extract all attachments into current directory");
        out.println("    --extract-dir=<dir>    Specify target directory for -z");
        out.println("    --maxEmbeddedDepth=X   Maximum depth for embedded document extraction");
        out.println("    --maxEmbeddedCount=X   Maximum number of embedded documents to extract");
        out.println("    -r  or --pretty-print  For JSON, XML and XHTML outputs, adds newlines and");
        out.println("                           whitespace, for better readability");
        out.println();
        out.println("Fork Mode (process isolation):");
        out.println("    -f  or --fork          Run parsing in a forked JVM process for isolation");
        out.println("                           Protects against parser crashes, OOM, and timeouts");
        out.println("    --fork-timeout=<ms>    Parse timeout in milliseconds (default: 60000)");
        out.println("    --fork-jvm-args=<args> JVM args for forked process (comma-separated)");
        out.println("                           e.g., --fork-jvm-args=-Xmx512m,-Dsome.prop=value");
        out.println("    --fork-plugins-dir=<dir> Directory containing plugin zips");
        out.println();
        out.println("    --list-parsers");
        out.println("         List the available document parsers");
        out.println("    --list-parser-details");
        out.println("         List the available document parsers and their supported mime types");
        out.println("    --list-parser-details-apt");
        out.println("         List the available document parsers and their supported mime types in apt format.");
        out.println("    --list-detectors");
        out.println("         List the available document detectors");
        out.println("    --list-met-models");
        out.println("         List the available metadata models, and their supported keys");
        out.println("    --list-supported-types");
        out.println("         List all known media types and related information");
        out.println();
        out.println("    --compare-file-magic=<dir>");
        out.println("         Compares Tika's known media types to the File(1) tool's magic directory");
        // Blank line keeps the last option visually separate from the description section
        out.println();
        out.println("Description:");
        out.println("    Apache Tika will parse the file(s) specified on the");
        out.println("    command line and output the extracted text content");
        out.println("    or metadata to standard output.");
        out.println();
        out.println("    Instead of a file name you can also specify the URL");
        out.println("    of a document to be parsed.");
        out.println();
        out.println("    If no file name or URL is specified (or the special");
        out.println("    name \"-\" is used), then the standard input stream");
        out.println("    is parsed. If no arguments were given and no input");
        out.println("    data is available, the GUI is started instead.");
        out.println();
        out.println("- GUI mode");
        out.println();
        out.println("    Use the \"--gui\" (or \"-g\") option to start the");
        out.println("    Apache Tika GUI. You can drag and drop files from");
        out.println("    a normal file explorer to the GUI window to extract");
        out.println("    text content and metadata from the files.");
        out.println();
        out.println("- Batch mode");
        out.println();
        out.println("    Simplest method.");
        out.println("    Specify two directories as args with no other args:");
        out.println("         java -jar tika-app.jar <inputDirectory> <outputDirectory>");
        out.println();
        out.println("Batch/Pipes Options:");
        out.println("    -i                         Input directory");
        out.println("    -o                         Output directory");
        out.println("    -n                         Number of forked processes");
        out.println("    -X                         -Xmx in the forked processes");
        out.println("    -T                         Timeout in milliseconds");
        out.println("    --fileList                  File list (one path per line, relative to -i or absolute)");
        out.println("    --handler                  Handler type: t=text, h=html, x=xml, m=markdown, b=body, i=ignore");
        out.println("    -Z                         Recursively unpack all the attachments, too");
        out.println("    --unpack-format=<format>   Output format: REGULAR (default) or FRICTIONLESS");
        out.println("    --unpack-mode=<mode>       Output mode: ZIPPED (default) or DIRECTORY");
        out.println("    --unpack-include-metadata  Include metadata.json in Frictionless output");
        out.println();
        out.println();
    }

    /**
     * Prints the value returned by {@code Tika.getString()} to standard output.
     */
    private void version() {
        PrintStream out = System.out;
        out.println(Tika.getString());
    }

    /**
     * Checks whether the help flag was given anywhere on the command line.
     *
     * @param args the raw command line arguments
     * @return {@code true} if any argument is exactly "-?" or "--help"
     */
    private boolean testForHelp(String[] args) {
        return Arrays
                .stream(args)
                .anyMatch(arg -> arg.equals("-?") || arg.equals("--help"));
    }

    /**
     * Decides whether the command line requests batch mode.
     * <p>
     * Batch mode is selected when either
     * <ul>
     *   <li>exactly two arguments are given, neither starts with "-", the first
     *       is an existing directory and the second is not a regular file, or</li>
     *   <li>any argument is "-inputDir", "--inputDir" or "-i".</li>
     * </ul>
     *
     * @param args the raw command line arguments
     * @return {@code true} if batch mode should be used
     */
    private boolean testForBatch(String[] args) {
        if (args.length == 2 && !args[0].startsWith("-") && !args[1].startsWith("-")) {
            try {
                Path inputCand = Paths.get(args[0]);
                Path outputCand = Paths.get(args[1]);
                if (Files.isDirectory(inputCand) && !Files.isRegularFile(outputCand)) {
                    return true;
                }
            } catch (java.nio.file.InvalidPathException ignored) {
                // An argument that cannot be converted to a Path (for example a URL on
                // platforms where ':' is illegal in paths) is not a directory pair;
                // fall through to the explicit batch flags instead of failing.
            }
        }

        for (String s : args) {
            if (s.equals("-inputDir") || s.equals("--inputDir") || s.equals("-i")) {
                return true;
            }
        }
        return false;
    }

    /**
     * Loads the Tika configuration and initialises the loader, parser, detector
     * and parse context used by this CLI instance.
     * <p>
     * When {@code configFilePath} is set, the loader is created from that file.
     * Otherwise the bundled {@code /tika-config-default-single-file.json}
     * resource is copied to a temporary file, loaded from there, and the
     * temporary file is deleted again.
     *
     * @throws TikaException declared for the loader calls
     * @throws IOException   if the bundled default configuration resource is
     *                       missing or a configuration file cannot be read or copied
     * @throws SAXException  declared for the loader calls
     */
    private void configure() throws TikaException, IOException, SAXException {
        if (configFilePath != null) {
            tikaLoader = TikaLoader.load(Paths.get(configFilePath));
        } else {
            String warn = "As a convenience, TikaCLI has turned on several non-default features\n" +
                    "as specified in tika-app/src/main/resources/tika-config-default-single-file.json.\n" +
                    "See: TIKA-2374, TIKA-4017, TIKA-4354 and TIKA-4472.\n" +
                    "This is not the default behavior in Tika generally or in tika-server.";
            LOG.info(warn);
            Path tempConfig = Files.createTempFile("tika-config-", ".json");
            try {
                try (InputStream is = getClass().getResourceAsStream("/tika-config-default-single-file.json")) {
                    // getResourceAsStream returns null when the resource is absent; fail
                    // with a clear message instead of a NullPointerException in Files.copy
                    if (is == null) {
                        throw new IOException("Default config resource not found on classpath: " +
                                "/tika-config-default-single-file.json");
                    }
                    Files.copy(is, tempConfig, StandardCopyOption.REPLACE_EXISTING);
                }
                tikaLoader = TikaLoader.load(tempConfig);
            } finally {
                Files.deleteIfExists(tempConfig);
            }
        }
        if (networkURI != null) {
            parser = new NetworkParser(networkURI);
        } else {
            parser = tikaLoader.loadAutoDetectParser();
        }

        // Load configs from tika-config.json and merge into existing context
        // (preserves EmbeddedDocumentExtractor and other items set before configure())
        ParseContext loadedContext = tikaLoader.loadParseContext();
        context.copyFrom(loadedContext);

        // Override DigesterFactory in ParseContext if configured via --digest= command line
        if (digesterFactory != null) {
            context.set(DigesterFactory.class, digesterFactory);
        }
        // Set EmbeddedLimits if any limits were specified via command line
        if (maxEmbeddedDepth != EmbeddedLimits.UNLIMITED || maxEmbeddedCount != EmbeddedLimits.UNLIMITED) {
            EmbeddedLimits limits = new EmbeddedLimits();
            if (maxEmbeddedDepth != EmbeddedLimits.UNLIMITED) {
                limits.setMaxDepth(maxEmbeddedDepth);
            }
            if (maxEmbeddedCount != EmbeddedLimits.UNLIMITED) {
                limits.setMaxCount(maxEmbeddedCount);
            }
            context.set(EmbeddedLimits.class, limits);
        }
        detector = tikaLoader.loadDetectors();
        // Registered after the merge so that these entries win over the loaded context
        context.set(Parser.class, parser);
        context.set(PasswordProvider.class, new SimplePasswordProvider(password));
    }

    /**
     * Prints the metadata model interfaces implemented by {@code Metadata},
     * sorted by class name, each followed by its public field names sorted
     * alphabetically. Interfaces whose simple name contains "Tika" are skipped.
     */
    private void displayMetModels() {
        Class<?>[] models = Metadata.class.getInterfaces();
        Arrays.sort(models, (left, right) -> left.getName().compareTo(right.getName()));

        for (Class<?> model : models) {
            String simpleName = model.getSimpleName();
            // we don't care about internal Tika met classes
            // if we do, then we can take this guard out
            if (simpleName.contains("Tika")) {
                continue;
            }
            System.out.println(simpleName);
            Field[] keys = model.getFields();
            Arrays.sort(keys, (left, right) -> left.getName().compareTo(right.getName()));
            for (Field key : keys) {
                System.out.println(" " + key.getName());
            }
        }
    }

    /*
     * Configures Tika, then prints the loaded parser and, optionally,
     * its supported mime types. For a composite parser the sub parsers
     * (and their mime types) are listed as well.
     */
    private void displayParsers(boolean includeMimeTypes, boolean aptListFormat) throws TikaException, IOException, SAXException {
        configure();
        // the top-level parser is printed with a three-space indent
        final int rootIndent = 3;
        displayParser(parser, includeMimeTypes, aptListFormat, rootIndent);
    }

    /*
     * Prints one parser (unwrapping a ParserDecorator first) and recurses
     * into the sub parsers of a composite parser.
     *
     * p                - parser to print
     * includeMimeTypes - also list the supported media types of non-composite parsers
     * apt              - emit apt list markup instead of plain text
     * i                - number of spaces to indent this entry
     */
    private void displayParser(Parser p, boolean includeMimeTypes, boolean apt, int i) {
        Parser target = p;
        String wrapperNote = null;
        if (target instanceof ParserDecorator) {
            ParserDecorator decorator = (ParserDecorator) target;
            wrapperNote = " (Wrapped by " + decorator.getDecorationName() + ")";
            target = decorator.getWrappedParser();
        }

        final boolean composite = target instanceof CompositeParser;
        final String bullet = apt ? "* " : "";
        String label = target
                .getClass()
                .getName();

        if (apt) {
            // apt link syntax: package prefix followed by a link to the API page
            int afterLastDot = label.lastIndexOf(".") + 1;
            String packagePrefix = label.substring(0, afterLastDot);
            String simpleName = label.substring(afterLastDot);
            label = packagePrefix + "{{{./api/" + label.replace(".", "/") + "}" + simpleName + "}}";
        } else if (wrapperNote != null) {
            label += wrapperNote;
        }

        // Don't display Composite parsers in the apt output.
        if (!apt || !composite) {
            System.out.println(indent(i) + bullet + label + (composite ? " (Composite Parser):" : ""));
            if (apt) {
                System.out.println();
            }
            if (includeMimeTypes && !composite) {
                for (MediaType supportedType : target.getSupportedTypes(context)) {
                    System.out.println(indent(i + 3) + bullet + supportedType);
                    if (apt) {
                        System.out.println();
                    }
                }
            }
        }

        if (!composite) {
            return;
        }
        // Don't indent for Composites in apt.
        final int childIndent = apt ? i : i + 3;
        Map<MediaType, Parser> parsersByType = ((CompositeParser) target).getParsers();
        for (Parser child : sortParsers(invertMediaTypeMap(parsersByType))) {
            displayParser(child, includeMimeTypes, apt, childIndent);
        }
    }

    /*
     * Configures Tika, then prints the loaded detector.
     * If the detector is a composite detector, its
     * sub detectors are listed below it.
     */
    private void displayDetectors() throws TikaException, IOException, SAXException {
        configure();
        // the top-level detector is printed without indentation
        final int rootIndent = 0;
        displayDetector(detector, rootIndent);
    }

    /*
     * Prints the class name of one detector, indented by i spaces. A composite
     * detector is marked as such and followed by its sub detectors, each
     * indented two further spaces.
     */
    private void displayDetector(Detector d, int i) {
        String className = d
                .getClass()
                .getName();
        if (!(d instanceof CompositeDetector)) {
            System.out.println(indent(i) + className);
            return;
        }
        System.out.println(indent(i) + className + " (Composite Detector):");
        for (Detector child : ((CompositeDetector) d).getDetectors()) {
            displayDetector(child, i + 2);
        }
    }

    /**
     * Returns a string of spaces used to indent listing output.
     * <p>
     * The string is built on demand, so deeply nested composite parsers or
     * detectors are no longer capped at the length of a fixed padding literal.
     *
     * @param indent number of spaces requested; negative values are treated as 0
     * @return a string consisting of exactly that many space characters
     */
    private String indent(int indent) {
        char[] spaces = new char[Math.max(0, indent)];
        Arrays.fill(spaces, ' ');
        return new String(spaces);
    }

    /*
     * Returns the keys of the given map as an array ordered by the
     * fully qualified class name of each parser.
     */
    private Parser[] sortParsers(Map<Parser, Set<MediaType>> parsers) {
        Parser[] ordered = parsers
                .keySet()
                .toArray(new Parser[0]);
        Arrays.sort(ordered, Comparator.comparing((Parser candidate) -> candidate
                .getClass()
                .getName()));
        return ordered;
    }

    /*
     * Turns a "media type -> parser" map into a "parser -> media types" map,
     * grouping every media type under the parser that handles it.
     */
    private Map<Parser, Set<MediaType>> invertMediaTypeMap(Map<MediaType, Parser> supported) {
        Map<Parser, Set<MediaType>> typesByParser = new HashMap<>();
        for (Entry<MediaType, Parser> mapping : supported.entrySet()) {
            typesByParser
                    .computeIfAbsent(mapping.getValue(), unused -> new HashSet<>())
                    .add(mapping.getKey());
        }
        return typesByParser;
    }

    /**
     * Prints every media type known to a freshly created {@code AutoDetectParser},
     * followed by its aliases, its supertype (if any) and the class name of the
     * parser registered for it (if any).
     */
    private void displaySupportedTypes() {
        AutoDetectParser autoDetect = new AutoDetectParser();
        MediaTypeRegistry registry = autoDetect.getMediaTypeRegistry();
        Map<MediaType, Parser> parsersByType = autoDetect.getParsers();

        for (MediaType type : registry.getTypes()) {
            System.out.println(type);
            for (MediaType alias : registry.getAliases(type)) {
                System.out.println("  alias:     " + alias);
            }

            MediaType supertype = registry.getSupertype(type);
            if (supertype != null) {
                System.out.println("  supertype: " + supertype);
            }

            Parser handler = parsersByType.get(type);
            if (handler == null) {
                continue;
            }
            if (handler instanceof CompositeParser) {
                // report the concrete sub parser rather than the composite wrapper
                CompositeParser composite = (CompositeParser) handler;
                handler = composite
                        .getParsers()
                        .get(type);
            }
            String parserName = handler
                    .getClass()
                    .getName();
            System.out.println("  parser:    " + parserName);
        }
    }

    /**
     * Compares our mime types registry with the File(1) tool's
     * directory of (uncompiled) Magic entries.
     * (Well, those with mimetypes anyway)
     * <p>
     * Prints how many types Tika and File know about, which File mime types
     * are unknown to Tika, and which ones Tika knows but has no magic for
     * (neither on the type itself, nor on a child type, nor on an ancestor).
     *
     * @param magicDir Path to the magic directory
     * @throws IllegalArgumentException if the directory does not contain the
     *                                  "elf", "mime" and "vorbis" entries
     * @throws IOException              if the directory cannot be listed or a
     *                                  magic file cannot be read
     */
    private void compareFileMagic(String magicDir) throws Exception {
        Set<String> tikaLacking = new TreeSet<>();
        Set<String> tikaNoMagic = new TreeSet<>();

        // Plausibility check
        File dir = new File(magicDir);
        boolean plausible = new File(dir, "elf").exists() && new File(dir, "mime").exists() && new File(dir, "vorbis").exists();
        if (!plausible) {
            throw new IllegalArgumentException(magicDir + " doesn't seem to hold uncompressed file magic entries");
        }

        // listFiles() returns null when the directory cannot be read
        File[] magicFiles = dir.listFiles();
        if (magicFiles == null) {
            throw new IOException("Unable to list the contents of " + magicDir);
        }

        // Find all the mimetypes in the directory
        Set<String> fileMimes = new HashSet<>();
        for (File mf : magicFiles) {
            if (!mf.isFile()) {
                continue;
            }
            try (BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(mf), UTF_8))) {
                String line;
                while ((line = r.readLine()) != null) {
                    String mime = null;
                    if (line.startsWith("!:mime")) {
                        mime = line
                                .substring("!:mime".length())
                                .trim();
                    } else if (line.startsWith("#!:mime")) {
                        mime = line
                                .substring("#!:mime".length())
                                .trim();
                    }
                    // Skip directives that carry no type instead of failing on them
                    if (mime != null && !mime.isEmpty()) {
                        fileMimes.add(mime);
                    }
                }
            }
        }

        // See how those compare to the Tika ones
        TikaLoader loader = TikaLoader.loadDefault();
        MimeTypes mimeTypes = TikaLoader.getMimeTypes();
        MediaTypeRegistry registry = loader.getMediaTypeRegistry();
        for (String mime : fileMimes) {
            try {
                final MimeType type = mimeTypes.getRegisteredMimeType(mime);

                if (type == null) {
                    // Tika doesn't know about this one
                    tikaLacking.add(mime);
                } else {
                    // Tika knows about this one!

                    // Does Tika have magic for it?
                    boolean hasMagic = type.hasMagic();

                    // How about the children?
                    if (!hasMagic) {
                        for (MediaType child : registry.getChildTypes(type.getType())) {
                            MimeType childType = mimeTypes.getRegisteredMimeType(child.toString());
                            if (childType != null && childType.hasMagic()) {
                                hasMagic = true;
                                break;
                            }
                        }
                    }

                    // How about the parents?
                    MimeType ancestor = type;
                    while (ancestor != null && !hasMagic) {
                        if (ancestor.hasMagic()) {
                            // Has magic, fine
                            hasMagic = true;
                        } else {
                            // Check the parent next. The lookup must start from the
                            // current ancestor (not the original type), otherwise the
                            // walk never gets past the first parent and cannot end.
                            MediaType parent = registry.getSupertype(ancestor.getType());
                            boolean topLevel = parent == null || MediaType.APPLICATION_XML.equals(parent) ||
                                    MediaType.TEXT_PLAIN.equals(parent) || MediaType.OCTET_STREAM.equals(parent);
                            if (topLevel) {
                                // Stop checking parents if we hit a top level type
                                ancestor = null;
                            } else {
                                ancestor = mimeTypes.getRegisteredMimeType(parent.toString());
                            }
                        }
                    }
                    if (!hasMagic) {
                        tikaNoMagic.add(mime);
                    }
                }
            } catch (MimeTypeException e) {
                // Broken entry in the file magic directory
                // Silently skip
            }
        }

        // Check how many tika knows about
        int tikaTypes = 0;
        int tikaAliases = 0;
        for (MediaType type : registry.getTypes()) {
            tikaTypes++;
            tikaAliases += registry
                    .getAliases(type)
                    .size();
        }

        // Report
        System.out.println("Tika knows about " + tikaTypes + " unique mime types");
        System.out.println("Tika knows about " + (tikaTypes + tikaAliases) + " mime types including aliases");
        System.out.println("The File Magic directory knows about " + fileMimes.size() + " unique mime types");
        System.out.println();
        System.out.println("The following mime types are known to File but not Tika:");
        for (String mime : tikaLacking) {
            System.out.println("  " + mime);
        }
        System.out.println();
        System.out.println("The following mime types from File have no Tika magic (but their children might):");
        for (String mime : tikaNoMagic) {
            System.out.println("  " + mime);
        }
    }

    /**
     * SAX handler that overrides only {@code endDocument()}: at that point it
     * writes every metadata value as a "name: value" line, with the names in
     * sorted order, and remembers that the output has been produced.
     */
    private static class NoDocumentMetHandler extends DefaultHandler {

        protected final Metadata metadata;

        protected PrintWriter writer;

        /** Set once {@link #endDocument()} has written the metadata. */
        private boolean metOutput = false;

        public NoDocumentMetHandler(Metadata metadata, PrintWriter writer) {
            this.metadata = metadata;
            this.writer = writer;
        }

        @Override
        public void endDocument() {
            String[] sortedNames = metadata.names();
            Arrays.sort(sortedNames);
            outputMetadata(sortedNames);
            writer.flush();
            metOutput = true;
        }

        /**
         * Writes one "name: value" line per value of each given metadata name,
         * in the order the names are supplied.
         */
        public void outputMetadata(String[] names) {
            for (int idx = 0; idx < names.length; idx++) {
                String key = names[idx];
                for (String value : metadata.getValues(key)) {
                    writer.println(key + ": " + value);
                }
            }
        }

        /** @return whether {@link #endDocument()} has already written the metadata */
        public boolean metOutput() {
            return metOutput;
        }

    }

    /**
     * Outputs the Tika metadata as XMP using the Tika XMP module.
     * Only {@code endDocument()} is overridden: it converts the metadata to an
     * {@code XMPMetadata} instance and writes its string form to the writer.
     */
    private static class NoDocumentXMPMetaHandler extends DefaultHandler {
        protected final Metadata metadata;

        protected PrintWriter writer;

        public NoDocumentXMPMetaHandler(Metadata metadata, PrintWriter writer) {
            this.metadata = metadata;
            this.writer = writer;
        }

        /**
         * @throws SAXException wrapping any {@code TikaException} raised during the conversion
         */
        @Override
        public void endDocument() throws SAXException {
            try {
                String serialized = new XMPMetadata(metadata).toString();
                writer.write(serialized);
                writer.flush();
            } catch (TikaException cause) {
                throw new SAXException(cause);
            }
        }
    }

    /**
     * Password provider that hands out the single password it was created
     * with, regardless of the metadata it is asked about.
     */
    private static class SimplePasswordProvider implements PasswordProvider, Serializable {
        /** The password returned for every document. */
        private final String password;

        public SimplePasswordProvider(String documentPassword) {
            password = documentPassword;
        }

        /**
         * @param metadata ignored
         * @return the password given at construction time
         */
        @Override
        public String getPassword(Metadata metadata) {
            return this.password;
        }
    }

    /**
     * Base class for the supported output modes. A mode supplies its content
     * handler through {@link #getContentHandler(OutputStream, Metadata)};
     * {@link #process(TikaInputStream, OutputStream, Metadata)} then runs the
     * enclosing CLI's parser with that handler.
     */
    private class OutputType {
        public void process(TikaInputStream tis, OutputStream output, Metadata metadata) throws Exception {
            final Parser activeParser = parser;
            final ContentHandler contentHandler = getContentHandler(output, metadata);
            activeParser.parse(tis, contentHandler, metadata, context);

            if (!(contentHandler instanceof NoDocumentMetHandler)) {
                return;
            }
            // fix for TIKA-596: if a parser doesn't generate
            // XHTML output, the lack of an output document prevents
            // metadata from being output: this fixes that
            NoDocumentMetHandler metadataHandler = (NoDocumentMetHandler) contentHandler;
            if (!metadataHandler.metOutput()) {
                metadataHandler.endDocument();
            }
        }

        /**
         * Creates the handler that receives the parse events. This base
         * implementation always throws {@code UnsupportedOperationException}.
         */
        protected ContentHandler getContentHandler(OutputStream output, Metadata metadata) throws Exception {
            throw new UnsupportedOperationException();
        }

    }


    /**
     * SAX handler that overrides only {@code endDocument()}: at that point it
     * writes the metadata as JSON to the writer, applying the enclosing CLI's
     * {@code prettyPrint} setting.
     */
    private class NoDocumentJSONMetHandler extends DefaultHandler {

        protected final Metadata metadata;

        protected PrintWriter writer;

        public NoDocumentJSONMetHandler(Metadata metadata, PrintWriter writer) {
            this.metadata = metadata;
            this.writer = writer;
        }

        /**
         * @throws SAXException wrapping any {@code IOException} raised while writing the JSON
         */
        @Override
        public void endDocument() throws SAXException {
            try {
                JsonMetadata.setPrettyPrinting(prettyPrint);
                JsonMetadata.toJson(this.metadata, this.writer);
                this.writer.flush();
            } catch (IOException cause) {
                throw new SAXException(cause);
            }
        }
    }
}