PluginsWriter.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.async.cli;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Iterator;
import java.util.Map;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;

import org.apache.tika.config.loader.TikaObjectMapperFactory;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.utils.StringUtils;

/**
 * Builds the JSON configuration file for an async run.
 * <p>
 * The configuration starts from the bundled {@code /config-template.json} classpath
 * resource, is filled in with the paths and options held by a {@link SimpleAsyncConfig},
 * optionally has a user-supplied config file merged over it, and finally has the
 * command-line overrides (numClients, -Xmx, parse mode, emitter file extension,
 * timeout) applied on top of the merged result.
 */
public class PluginsWriter {

    private static final String TEMPLATE_RESOURCE = "/config-template.json";
    private static final String BASE_PATH = "basePath";
    private static final String DEFAULT_EXTENSION = "txt";
    private static final int DEFAULT_NUM_CLIENTS = 2;

    private final SimpleAsyncConfig simpleAsyncConfig;
    // Stored by the constructor; not read by any method in this class.
    private final Path pluginsPath;

    /**
     * @param simpleAsyncConfig command-line settings to translate into the config file
     * @param pluginsConfig     path stored as-is in this writer
     */
    public PluginsWriter(SimpleAsyncConfig simpleAsyncConfig, Path pluginsConfig) {
        this.simpleAsyncConfig = simpleAsyncConfig;
        this.pluginsPath = pluginsConfig;
    }

    /**
     * Generates the configuration and writes it, pretty-printed, to {@code output}.
     *
     * @param output file to write the generated JSON to
     * @throws IllegalArgumentException if the configured input is a regular file
     *                                  whose absolute path has no parent directory
     * @throws IOException              if the template or user config cannot be read or is
     *                                  not a JSON object, if the output cannot be written,
     *                                  or wrapping any other failure raised while building
     *                                  the configuration
     */
    void write(Path output) throws IOException {
        Path baseInput = StringUtils.isBlank(simpleAsyncConfig.getInputDir())
                ? Paths.get(".").toAbsolutePath()
                : Paths.get(simpleAsyncConfig.getInputDir());
        Path baseOutput = StringUtils.isBlank(simpleAsyncConfig.getOutputDir())
                ? null
                : Paths.get(simpleAsyncConfig.getOutputDir());
        // A single input file is fetched relative to the directory that contains it.
        if (Files.isRegularFile(baseInput)) {
            baseInput = baseInput.toAbsolutePath().getParent();
            if (baseInput == null) {
                throw new IllegalArgumentException("File must be at least one directory below root");
            }
        }
        try {
            ObjectMapper objectMapper = TikaObjectMapperFactory.getMapper();
            ObjectNode root = loadTemplate(objectMapper);

            applyBasePaths(root, baseInput, baseOutput);
            root.put("plugin-roots", resolvePluginRoots());

            // If the user provided a -c config, merge their settings first.
            // This brings in parsers, parse-context, metadata-filters, and
            // optionally pipes config (e.g. forkedJvmArgs with log4j settings).
            if (!StringUtils.isBlank(simpleAsyncConfig.getTikaConfig())) {
                Path userConfigPath = Paths.get(simpleAsyncConfig.getTikaConfig());
                JsonNode userRoot = objectMapper.readTree(userConfigPath.toFile());
                if (!(userRoot instanceof ObjectNode)) {
                    throw new IOException(
                            "Tika config must contain a JSON object: " + userConfigPath);
                }
                mergeUserConfig(root, (ObjectNode) userRoot);
            }

            // Now apply CLI overrides on top of whatever pipes config exists.
            // This lets the user have forkedJvmArgs in their config (e.g. log4j)
            // while still controlling numClients and Xmx from the command line.
            applyPipesOverrides(root, objectMapper);

            // For content-only mode, change the emitter file extension based on handler
            // type. The emitter is looked up in the merged root: a user config may have
            // replaced the template's "emitters" node, and writing to the template's
            // node would then never reach the output.
            if (simpleAsyncConfig.isContentOnly()) {
                String ext = getFileExtensionForHandlerType(simpleAsyncConfig.getHandlerType());
                ObjectNode fsEmitter = objectAt(root, "emitters", "fse", "file-system-emitter");
                if (fsEmitter != null) {
                    fsEmitter.put("fileExtension", ext);
                }
            }

            // Write timeout limits to parse-context if configured on CLI
            if (simpleAsyncConfig.getTimeoutMs() != null) {
                ObjectNode parseContext = objectOrNew(root, "parse-context", objectMapper);
                ObjectNode timeoutNode = objectMapper.createObjectNode();
                timeoutNode.put("progressTimeoutMillis", simpleAsyncConfig.getTimeoutMs());
                parseContext.set("timeout-limits", timeoutNode);
                root.set("parse-context", parseContext);
            }

            objectMapper.writerWithDefaultPrettyPrinter().writeValue(output.toFile(), root);
        } catch (IOException e) {
            // Already the declared type; rethrow as-is instead of double-wrapping.
            throw e;
        } catch (Exception e) {
            throw new IOException(e);
        }
    }

    /**
     * Reads the bundled template and returns its root object.
     *
     * @throws IOException if the resource is missing, unreadable, or not a JSON object
     */
    private ObjectNode loadTemplate(ObjectMapper objectMapper) throws IOException {
        try (var template = getClass().getResourceAsStream(TEMPLATE_RESOURCE)) {
            if (template == null) {
                throw new IOException("Missing classpath resource: " + TEMPLATE_RESOURCE);
            }
            JsonNode parsed = objectMapper.readTree(template);
            if (!(parsed instanceof ObjectNode)) {
                throw new IOException(TEMPLATE_RESOURCE + " must contain a JSON object");
            }
            return (ObjectNode) parsed;
        }
    }

    /**
     * Sets {@code basePath} on the template's file-system fetcher, emitter and
     * pipes iterator. Entries missing from the template are skipped; the emitter is
     * only touched when an output directory was configured.
     */
    private static void applyBasePaths(ObjectNode root, Path baseInput, Path baseOutput) {
        String inputBase = baseInput.toAbsolutePath().toString();

        ObjectNode fsFetcher = objectAt(root, "fetchers", "fsf", "file-system-fetcher");
        if (fsFetcher != null) {
            fsFetcher.put(BASE_PATH, inputBase);
        }

        if (baseOutput != null) {
            ObjectNode fsEmitter = objectAt(root, "emitters", "fse", "file-system-emitter");
            if (fsEmitter != null) {
                fsEmitter.put(BASE_PATH, baseOutput.toAbsolutePath().toString());
            }
        }

        ObjectNode fsIterator = objectAt(root, "pipes-iterator", "file-system-pipes-iterator");
        if (fsIterator != null) {
            fsIterator.put(BASE_PATH, inputBase);
        }
    }

    /**
     * Returns the value for {@code plugin-roots}: the configured plugins dir (made
     * absolute when it is an existing directory, otherwise passed through verbatim),
     * or the CLI's default when none was configured.
     */
    private String resolvePluginRoots() {
        if (StringUtils.isBlank(simpleAsyncConfig.getPluginsDir())) {
            return TikaAsyncCLI.resolveDefaultPluginsDir();
        }
        String configured = simpleAsyncConfig.getPluginsDir();
        Path plugins = Paths.get(configured);
        return Files.isDirectory(plugins) ? plugins.toAbsolutePath().toString() : configured;
    }

    /**
     * Applies numClients, -Xmx and parse mode from the command line to the
     * {@code pipes} node of {@code root}, creating that node if it does not exist.
     */
    private void applyPipesOverrides(ObjectNode root, ObjectMapper objectMapper)
            throws IOException {
        ObjectNode pipesNode = objectOrNew(root, "pipes", objectMapper);

        if (simpleAsyncConfig.getNumClients() != null) {
            pipesNode.put("numClients", simpleAsyncConfig.getNumClients());
        } else if (!pipesNode.has("numClients")) {
            // Neither the CLI nor the merged config specified a value.
            pipesNode.put("numClients", DEFAULT_NUM_CLIENTS);
        }

        if (simpleAsyncConfig.getXmx() != null) {
            String xmx = simpleAsyncConfig.getXmx();
            // A bare size such as "2g" is turned into a JVM flag; anything that
            // already starts with '-' is passed through unchanged.
            if (!xmx.startsWith("-")) {
                xmx = "-Xmx" + xmx;
            }
            // Replace or add -Xmx in forkedJvmArgs, preserving other args
            mergeXmxIntoJvmArgs(pipesNode, xmx, objectMapper);
        }

        if (simpleAsyncConfig.isContentOnly()) {
            pipesNode.put("parseMode", "CONTENT_ONLY");
        } else if (simpleAsyncConfig.isConcatenate()) {
            pipesNode.put("parseMode", "CONCATENATE");
        }

        root.set("pipes", pipesNode);
    }

    /**
     * Returns the object stored under {@code field} in {@code root}, or a new,
     * not-yet-attached object node when the field is absent.
     *
     * @throws IOException if the field exists but is not a JSON object
     */
    private static ObjectNode objectOrNew(ObjectNode root, String field,
                                          ObjectMapper objectMapper) throws IOException {
        JsonNode existing = root.get(field);
        if (existing == null) {
            return objectMapper.createObjectNode();
        }
        if (!(existing instanceof ObjectNode)) {
            throw new IOException("'" + field + "' must be a JSON object");
        }
        return (ObjectNode) existing;
    }

    /**
     * Walks {@code path} down from {@code start} and returns the node found there,
     * or {@code null} if any step is missing or the final node is not a JSON object.
     */
    private static ObjectNode objectAt(JsonNode start, String... path) {
        JsonNode current = start;
        for (String name : path) {
            if (current == null) {
                return null;
            }
            current = current.get(name);
        }
        return current instanceof ObjectNode ? (ObjectNode) current : null;
    }

    /**
     * Merges user config fields into the auto-generated root.
     * The merge is shallow: each top-level user field replaces the
     * auto-generated template value of the same name as a whole.
     */
    private static void mergeUserConfig(ObjectNode root, ObjectNode userConfig) {
        Iterator<Map.Entry<String, JsonNode>> fields = userConfig.fields();
        while (fields.hasNext()) {
            Map.Entry<String, JsonNode> entry = fields.next();
            root.set(entry.getKey(), entry.getValue());
        }
    }

    /**
     * Merges an -Xmx arg into the forkedJvmArgs array, replacing any existing -Xmx
     * and preserving all other args (e.g. -Dlog4j2.configurationFile=...).
     * The new arg is appended last; a non-array {@code forkedJvmArgs} is discarded.
     */
    private static void mergeXmxIntoJvmArgs(ObjectNode pipesNode, String xmx,
                                              ObjectMapper objectMapper) {
        var mergedArgs = objectMapper.createArrayNode();

        // Preserve existing args, skipping any old -Xmx
        JsonNode existingArgs = pipesNode.get("forkedJvmArgs");
        if (existingArgs != null && existingArgs.isArray()) {
            for (JsonNode arg : existingArgs) {
                String val = arg.asText();
                if (!val.startsWith("-Xmx")) {
                    mergedArgs.add(val);
                }
            }
        }
        mergedArgs.add(xmx);
        pipesNode.set("forkedJvmArgs", mergedArgs);
    }

    /**
     * Maps a handler type to the file extension used for emitted content.
     * MARKDOWN, HTML and XML get their own extension; every other type, and
     * {@code null}, falls back to {@code txt}.
     */
    private static String getFileExtensionForHandlerType(
            BasicContentHandlerFactory.HANDLER_TYPE handlerType) {
        if (handlerType == null) {
            // Switching on a null enum would throw NullPointerException.
            return DEFAULT_EXTENSION;
        }
        return switch (handlerType) {
            case MARKDOWN -> "md";
            case HTML -> "html";
            case XML -> "xml";
            default -> DEFAULT_EXTENSION;
        };
    }
}