ParserDecorator.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser;

import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.parser.multiple.AbstractMultipleParser.MetadataPolicy;
import org.apache.tika.parser.multiple.FallbackParser;

/**
 * Decorator base class for the {@link Parser} interface.
 * <p>This class simply delegates all parsing calls to an underlying decorated
 * parser instance. Subclasses can provide extra decoration by overriding the
 * parse method.
 * <p>To decorate several different parsers at the same time, wrap them in
 * a {@link CompositeParser} instance first.
 */
public class ParserDecorator implements Parser {

    /**
     * Serial version UID
     */
    private static final long serialVersionUID = -3861669115439125268L;

    /**
     * The parser that every call on this decorator is forwarded to.
     */
    private final Parser parser;

    /**
     * Builds a decorator around the supplied parser.
     *
     * @param parser the parser instance to be decorated
     */
    public ParserDecorator(Parser parser) {
        this.parser = parser;
    }

    /**
     * Tells whether a media type filter carries no entries at all.
     *
     * @param types the filter set to inspect, may be <code>null</code>
     * @return <code>true</code> if the set is <code>null</code> or empty
     */
    private static boolean isNullOrEmpty(Set<MediaType> types) {
        return types == null || types.isEmpty();
    }

    /**
     * Wraps the given parser in a {@link MimeFilteringDecorator} that uses
     * the given media types as its include list and has no exclude list.
     * <p>If the include list is <code>null</code> or empty, the returned
     * decorator reports the supported types of the wrapped parser instead.
     *
     * @param parser the parser to be decorated
     * @param types  supported media types
     * @return the decorated parser
     */
    public static Parser withTypes(Parser parser, Set<MediaType> types) {
        return new MimeFilteringDecorator(parser, types, null);
    }

    /**
     * Wraps the given parser in a {@link MimeFilteringDecorator} that uses
     * the given media types as its exclude list and has no include list,
     * so the excluded types are removed from whatever the wrapped parser
     * reports as supported.
     *
     * @param parser       the parser to be decorated
     * @param excludeTypes excluded/ignored media types
     * @return the decorated parser
     */
    public static Parser withoutTypes(Parser parser, Set<MediaType> excludeTypes) {
        return new MimeFilteringDecorator(parser, null, excludeTypes);
    }

    /**
     * Applies mime type filtering to the given parser, but only when there
     * is something to filter. Both an include and an exclude list can be
     * given, which allows round-trip serialization of the filters.
     *
     * @param parser       the parser to be decorated
     * @param includeTypes types to include (if non-empty, only these types are supported)
     * @param excludeTypes types to exclude (removed from supported types)
     * @return a {@link MimeFilteringDecorator} around the parser, or the parser
     *         itself when both lists are <code>null</code> or empty
     */
    public static Parser withMimeFilters(Parser parser, Set<MediaType> includeTypes,
                                         Set<MediaType> excludeTypes) {
        boolean nothingToFilter = isNullOrEmpty(includeTypes) && isNullOrEmpty(excludeTypes);
        if (nothingToFilter) {
            return parser;
        }
        return new MimeFilteringDecorator(parser, includeTypes, excludeTypes);
    }

    /**
     * Combines the given parsers into a single virtual parser, where they
     * are tried in preference order until one works without error.
     *
     * @param parsers the parsers to try, in preference order
     * @param types   media types the combined parser should claim to support;
     *                when <code>null</code> or empty no type filter is applied
     * @return the combined parser, wrapped via
     *         {@link #withTypes(Parser, Set)} if types were given
     * @deprecated This has been replaced by {@link FallbackParser}
     */
    @Deprecated
    public static final Parser withFallbacks(final Collection<? extends Parser> parsers,
                                             final Set<MediaType> types) {
        // Simply hand over to FallbackParser until callers have migrated.
        // KEEP_ALL matches the old behaviour, which was to preserve all metadata.
        MediaTypeRegistry defaultRegistry = MediaTypeRegistry.getDefaultRegistry();
        Parser fallback = new FallbackParser(defaultRegistry, MetadataPolicy.KEEP_ALL, parsers);

        return isNullOrEmpty(types) ? fallback : withTypes(fallback, types);
    }

    /**
     * A {@link ParserDecorator} that narrows the set of supported mime types.
     * <p>The include and exclude sets are kept (as unmodifiable copies) so
     * that they can be read back for round-trip serialization. When an
     * include set is present the answer does not depend on the wrapped
     * parser, so it is computed once and cached.
     */
    public static class MimeFilteringDecorator extends ParserDecorator {
        private static final long serialVersionUID = 1L;

        private final Set<MediaType> includeTypes;
        private final Set<MediaType> excludeTypes;
        private volatile Set<MediaType> cachedTypes;

        /**
         * Creates a filtering decorator. Both filter sets are copied, so
         * later changes to the passed-in sets have no effect on this instance.
         *
         * @param parser       the parser to be decorated
         * @param includeTypes types to include, <code>null</code> is treated as empty
         * @param excludeTypes types to exclude, <code>null</code> is treated as empty
         */
        public MimeFilteringDecorator(Parser parser, Set<MediaType> includeTypes,
                                      Set<MediaType> excludeTypes) {
            super(parser);
            this.includeTypes = unmodifiableCopy(includeTypes);
            this.excludeTypes = unmodifiableCopy(excludeTypes);
        }

        /**
         * Takes an unmodifiable snapshot of a filter set.
         *
         * @param source the set to copy, may be <code>null</code>
         * @return an unmodifiable copy, or the empty set for <code>null</code>
         */
        private static Set<MediaType> unmodifiableCopy(Set<MediaType> source) {
            if (source == null) {
                return Collections.emptySet();
            }
            return Collections.unmodifiableSet(new HashSet<>(source));
        }

        /**
         * Returns the include set minus the exclude set when an include set
         * was given, otherwise the wrapped parser's supported types minus
         * the exclude set.
         */
        @Override
        public Set<MediaType> getSupportedTypes(ParseContext context) {
            if (includeTypes.isEmpty()) {
                // No include filter: ask the wrapped parser each time, since
                // its answer may vary by context, and strip the excluded types
                Set<MediaType> remaining = new HashSet<>(super.getSupportedTypes(context));
                remaining.removeAll(excludeTypes);
                return remaining;
            }

            // Include filter present: the result only depends on the two
            // immutable filter sets, so compute it once and reuse it
            Set<MediaType> cached = cachedTypes;
            if (cached == null) {
                Set<MediaType> allowed = new HashSet<>(includeTypes);
                allowed.removeAll(excludeTypes);
                cached = Collections.unmodifiableSet(allowed);
                cachedTypes = cached;
            }
            return cached;
        }

        @Override
        public String getDecorationName() {
            return "Mime Filtering";
        }

        /**
         * @return the included mime types (unmodifiable), or empty set if no include filter
         */
        public Set<MediaType> getIncludeTypes() {
            return includeTypes;
        }

        /**
         * @return the excluded mime types (unmodifiable), or empty set if no exclude filter
         */
        public Set<MediaType> getExcludeTypes() {
            return excludeTypes;
        }
    }

    /**
     * Forwards the call to the decorated parser. Subclasses that want to
     * add decoration should override this method and call
     * <code>super.getSupportedTypes()</code> to reach the decorated parser.
     */
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return parser.getSupportedTypes(context);
    }

    /**
     * Forwards the call to the decorated parser. Subclasses that want to
     * add decoration should override this method and call
     * <code>super.parse()</code> to reach the decorated parser.
     */
    public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata,
                      ParseContext context) throws IOException, SAXException, TikaException {
        parser.parse(tis, handler, metadata, context);
    }

    /**
     * @return A name/description of the decoration, or null if none available.
     *         This base implementation always returns null.
     */
    public String getDecorationName() {
        return null;
    }

    /**
     * Gives access to the parser this decorator was created around.
     *
     * @return the parser wrapped by this ParserDecorator
     */
    public Parser getWrappedParser() {
        return this.parser;
    }
}