TikaPluginManager.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.plugins;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;

import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.pf4j.DefaultExtensionFinder;
import org.pf4j.DefaultPluginManager;
import org.pf4j.ExtensionFinder;
import org.pf4j.RuntimeMode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.tika.config.loader.TikaJsonConfig;
import org.apache.tika.exception.TikaConfigException;

/**
 * PF4J-based plugin manager for Tika pipes components.
 * <p>
 * This manager loads plugins from configured plugin root directories and
 * makes their extensions available for discovery.
 * <p>
 * The PF4J runtime mode is derived from the {@code tika.plugin.dev.mode} system
 * property or, if that is not set, from the {@code TIKA_PLUGIN_DEV_MODE}
 * environment variable. The result is written to the {@code pf4j.mode} system
 * property before the PF4J super constructor runs.
 */
public class TikaPluginManager extends DefaultPluginManager {

    private static final Logger LOG = LoggerFactory.getLogger(TikaPluginManager.class);

    private static final String DEV_MODE_PROPERTY = "tika.plugin.dev.mode";
    private static final String DEV_MODE_ENV = "TIKA_PLUGIN_DEV_MODE";

    /** System property that PF4J consults to determine its runtime mode. */
    private static final String PF4J_MODE_PROPERTY = "pf4j.mode";

    /** Key in the Tika JSON config that holds a single plugin root or a list of them. */
    private static final String PLUGIN_ROOTS_KEY = "plugin-roots";

    /** File name suffix that identifies a packaged plugin archive. */
    private static final String ZIP_SUFFIX = ".zip";

    private static final TypeReference<List<Path>> PATH_LIST_TYPE =
            new TypeReference<List<Path>>() {};

    //we're only using this to convert a single path or a list of paths to a list
    //we don't need all the functionality of the polymorphic objectmapper in tika-serialization
    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
            .configure(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY, true);

    /**
     * Pre-extracts plugin zip files without loading them.
     * <p>
     * Call this method early in parent processes (e.g., AsyncProcessor, PipesParser)
     * before spawning child processes. This ensures plugins are extracted once in
     * the parent, so child processes don't race to extract the same plugins.
     * <p>
     * This method is synchronized to prevent concurrent extraction through this
     * entry point within the same JVM. Cross-process safety is delegated to
     * {@link ThreadSafeUnzipper}.
     * <p>
     * If plugin-roots is not specified in the config (or is an explicit JSON
     * {@code null}), this method does nothing.
     *
     * @param tikaJsonConfig the configuration containing plugin-roots
     * @throws IOException if extraction fails
     */
    public static synchronized void preExtractPlugins(TikaJsonConfig tikaJsonConfig)
            throws IOException {
        JsonNode pluginRoots = tikaJsonConfig.getRootNode().get(PLUGIN_ROOTS_KEY);
        if (isAbsent(pluginRoots)) {
            // No plugins configured - nothing to extract
            return;
        }
        for (Path pluginRoot : toPaths(pluginRoots)) {
            extractPluginsInDirectory(pluginRoot);
        }
    }

    /**
     * Extracts every plugin zip directly inside {@code root}; silently skips
     * paths that are not directories.
     */
    private static void extractPluginsInDirectory(Path root) throws IOException {
        if (!Files.isDirectory(root)) {
            return;
        }
        long start = System.currentTimeMillis();
        unzipArchivesIn(root);
        LOG.debug("took {} ms to pre-extract plugins in {}",
                System.currentTimeMillis() - start, root);
    }

    /**
     * Hands every entry in {@code dir} whose name ends with {@code .zip} to
     * {@link ThreadSafeUnzipper#unzipPlugin}. Subdirectories are not descended into.
     *
     * @param dir an existing directory
     * @throws IOException if unzipping one of the archives fails
     */
    private static void unzipArchivesIn(Path dir) throws IOException {
        File[] entries = dir.toFile().listFiles();
        if (entries == null) {
            // File.listFiles() returns null on I/O error or if dir is no longer a
            // directory; there is nothing we can extract, but make the failure visible.
            LOG.warn("could not list files in plugin root {}", dir);
            return;
        }
        for (File entry : entries) {
            if (entry.getName().endsWith(ZIP_SUFFIX)) {
                ThreadSafeUnzipper.unzipPlugin(entry.toPath());
            }
        }
    }

    /**
     * Returns true if the config value is missing or is an explicit JSON null.
     * An explicit null must be caught here because converting it to a list
     * yields {@code null} rather than an empty list.
     */
    private static boolean isAbsent(JsonNode node) {
        return node == null || node.isNull();
    }

    /** Converts a JSON string or array of strings into a list of paths. */
    private static List<Path> toPaths(JsonNode pluginRoots) {
        return OBJECT_MAPPER.convertValue(pluginRoots, PATH_LIST_TYPE);
    }

    /**
     * Loads plugin manager from a pre-parsed TikaJsonConfig.
     * This is the preferred method when sharing configuration across
     * core Tika and pipes components.
     *
     * @param tikaJsonConfig the pre-parsed JSON configuration
     * @return the plugin manager
     * @throws TikaConfigException if plugin-roots is missing, null or empty
     * @throws IOException if plugin initialization fails
     */
    public static TikaPluginManager load(TikaJsonConfig tikaJsonConfig)
            throws TikaConfigException, IOException {

        // Configure pf4j runtime mode before creating the manager
        configurePf4jRuntimeMode();

        JsonNode pluginRoots = tikaJsonConfig.getRootNode().get(PLUGIN_ROOTS_KEY);
        if (isAbsent(pluginRoots)) {
            throw new TikaConfigException("plugin-roots must be specified");
        }
        List<Path> roots = toPaths(pluginRoots);
        if (roots == null || roots.isEmpty()) {
            throw new TikaConfigException("plugin-roots must not be empty");
        }
        // The mode has already been configured above, so don't do it a second time.
        return new TikaPluginManager(roots, false);
    }

    /**
     * Loads plugin manager from a comma-separated string of paths.
     * Entries are trimmed and blank entries are ignored.
     *
     * @param pathsString comma-separated list of plugin root directories
     * @return the plugin manager
     * @throws TikaConfigException if the string is null, blank or contains no paths
     * @throws IOException if reading or plugin initialization fails
     */
    public static TikaPluginManager loadFromPaths(String pathsString)
            throws TikaConfigException, IOException {
        if (pathsString == null || pathsString.trim().isEmpty()) {
            throw new TikaConfigException("plugin-roots must not be empty");
        }

        configurePf4jRuntimeMode();

        List<Path> roots = new java.util.ArrayList<>();
        for (String candidate : pathsString.split(",")) {
            String trimmed = candidate.trim();
            if (!trimmed.isEmpty()) {
                roots.add(java.nio.file.Paths.get(trimmed));
            }
        }

        // e.g. ", ,": not blank as a whole, but contains no actual path
        if (roots.isEmpty()) {
            throw new TikaConfigException("plugin-roots must not be empty");
        }

        // The mode has already been configured above, so don't do it a second time.
        return new TikaPluginManager(roots, false);
    }

    /**
     * Loads plugin manager from a configuration file.
     *
     * @param configPath the path to the JSON configuration file
     * @return the plugin manager
     * @throws TikaConfigException if configuration is invalid
     * @throws IOException if reading or plugin initialization fails
     */
    public static TikaPluginManager load(Path configPath) throws TikaConfigException, IOException {
        return load(TikaJsonConfig.load(configPath));
    }

    /**
     * Creates a plugin manager for the given plugin roots, configuring the PF4J
     * runtime mode first. In deployment mode, plugin zips found directly inside
     * each root are unzipped as part of construction.
     *
     * @param pluginRoots the plugin root directories
     * @throws IOException if unzipping a plugin fails
     */
    public TikaPluginManager(List<Path> pluginRoots) throws IOException {
        this(pluginRoots, true);
    }

    /**
     * Internal constructor that allows skipping runtime mode configuration.
     * Used by tests and factory methods that have already configured the mode.
     *
     * @param pluginRoots the plugin root directories
     * @param configureMode whether to set the pf4j mode system property before
     *                      invoking the super constructor
     * @throws IOException if unzipping a plugin fails
     */
    private TikaPluginManager(List<Path> pluginRoots, boolean configureMode) throws IOException {
        super(configureMode ? configurePf4jRuntimeModeAndGetRoots(pluginRoots) : pluginRoots);

        if (getRuntimeMode() == RuntimeMode.DEVELOPMENT) {
            LOG.info("TikaPluginManager running in DEVELOPMENT mode");
        }

        init();
    }

    /**
     * Helper method to configure PF4J runtime mode and return the plugin roots.
     * This allows mode configuration before super() is called.
     */
    private static List<Path> configurePf4jRuntimeModeAndGetRoots(List<Path> pluginRoots) {
        configurePf4jRuntimeMode();
        return pluginRoots;
    }

    /**
     * Set pf4j's runtime mode system property based on Tika's dev mode setting.
     * This must be called before creating TikaPluginManager instance.
     * <p>
     * The property is always written (deployment mode is set explicitly rather
     * than left unset) to ensure a clean state.
     */
    private static void configurePf4jRuntimeMode() {
        RuntimeMode mode = isDevelopmentMode() ? RuntimeMode.DEVELOPMENT : RuntimeMode.DEPLOYMENT;
        System.setProperty(PF4J_MODE_PROPERTY, mode.toString());
    }

    /**
     * Determines whether dev mode was requested. The system property takes
     * precedence over the environment variable; if neither is set the answer
     * is {@code false}.
     */
    private static boolean isDevelopmentMode() {
        String requested = System.getProperty(DEV_MODE_PROPERTY);
        if (requested == null) {
            requested = System.getenv(DEV_MODE_ENV);
        }
        // Boolean.parseBoolean(null) is false, which covers the "neither is set" case
        return Boolean.parseBoolean(requested);
    }

    /**
     * Override to disable classpath scanning for extensions.
     * By default, PF4J's DefaultExtensionFinder scans both plugins AND the classpath:
     * - LegacyExtensionFinder scans for extensions.idx files (causes errors for unpackaged JARs)
     * - ServiceProviderExtensionFinder scans META-INF/services (finds Lombok and other libs)
     *
     * We only want to discover extensions from the configured plugin directories,
     * not from the application classpath. The DefaultExtensionFinder without any
     * additional finders will only scan the loaded plugins.
     */
    @Override
    protected ExtensionFinder createExtensionFinder() {
        // Return a DefaultExtensionFinder without any classpath-scanning finders.
        // This will only discover extensions within the loaded plugin JARs.
        return new DefaultExtensionFinder(this);
    }

    /**
     * Override to prevent scanning subdirectories in development mode.
     * In development mode, the default DevelopmentPluginRepository scans for subdirectories,
     * but we want each path in plugin-roots to be treated as a complete plugin directory.
     */
    @Override
    protected org.pf4j.PluginRepository createPluginRepository() {
        if (getRuntimeMode() != RuntimeMode.DEVELOPMENT) {
            return super.createPluginRepository();
        }
        // In development mode, return a repository that treats each path as a plugin
        return new org.pf4j.BasePluginRepository(getPluginsRoots()) {
            @Override
            public List<Path> getPluginPaths() {
                // Don't scan subdirectories - each configured path IS a plugin
                return new java.util.ArrayList<>(pluginsRoots);
            }
        };
    }

    /**
     * Override to use PropertiesPluginDescriptorFinder in development mode.
     * In development mode, plugins are in target/classes with plugin.properties,
     * not packaged JARs with META-INF/MANIFEST.MF.
     */
    @Override
    protected org.pf4j.PluginDescriptorFinder createPluginDescriptorFinder() {
        if (getRuntimeMode() != RuntimeMode.DEVELOPMENT) {
            return super.createPluginDescriptorFinder();
        }
        return new org.pf4j.PropertiesPluginDescriptorFinder();
    }

    /**
     * In deployment mode, unzips the plugin archives in every configured root.
     * Nothing is extracted in development mode.
     */
    private void init() throws IOException {
        if (getRuntimeMode() != RuntimeMode.DEPLOYMENT) {
            LOG.debug("Skipping ZIP extraction in DEVELOPMENT mode");
            return;
        }
        for (Path root : pluginsRoots) {
            unzip(root);
        }
    }

    /**
     * Unzips the plugin archives directly inside {@code root}; silently skips
     * paths that are not directories.
     */
    private void unzip(Path root) throws IOException {
        long start = System.currentTimeMillis();
        if (!Files.isDirectory(root)) {
            return;
        }
        unzipArchivesIn(root);
        LOG.debug("took {} ms to unzip/check for unzipped plugins", System.currentTimeMillis() - start);
    }
}