AbstractMultipleParser.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.multiple;

import static org.apache.tika.utils.ParserUtils.cloneMetadata;
import static org.apache.tika.utils.ParserUtils.recordParserDetails;
import static org.apache.tika.utils.ParserUtils.recordParserFailure;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.utils.ParserUtils;

/**
 * Abstract base class for parser wrappers which may / will
 * process a given stream multiple times, merging the results
 * of the various parsers used.
 * End users should normally use {@link FallbackParser} or
 * {@link SupplementingParser} along with a Strategy.
 * Note that unless you give a {@link ContentHandlerFactory},
 * you'll get content from every parser tried mushed together!
 *
 * @since Apache Tika 1.18
 */
public abstract class AbstractMultipleParser implements Parser {
    protected static final String METADATA_POLICY_CONFIG_KEY = "metadataPolicy";
    /**
     * Serial version UID.
     */
    private static final long serialVersionUID = 5383668090329836559L;
    /**
     * How we should handle metadata clashes
     */
    private final MetadataPolicy policy;
    /**
     * List of the multiple parsers to try.
     */
    private final Collection<? extends Parser> parsers;
    /**
     * Computed list of Mime Types to offer, which is all
     * those in common between the parsers.
     * For explicit mimetypes only, use a {@link ParserDecorator}
     */
    private final Set<MediaType> offeredTypes;
    /**
     * Media type registry.
     */
    private MediaTypeRegistry registry;

    public AbstractMultipleParser(MediaTypeRegistry registry, MetadataPolicy policy,
                                  Parser... parsers) {
        this(registry, policy, Arrays.asList(parsers));
    }

    public AbstractMultipleParser(MediaTypeRegistry registry, MetadataPolicy policy,
                                  Collection<? extends Parser> parsers) {
        this.policy = policy;
        this.parsers = parsers;
        this.registry = registry;

        // TODO Only offer those in common to several/all parser
        // TODO Some sort of specialisation / subtype support
        this.offeredTypes = new HashSet<>();
        for (Parser parser : parsers) {
            offeredTypes.addAll(parser.getSupportedTypes(new ParseContext()));
        }
    }

    protected static Metadata mergeMetadata(Metadata newMetadata, Metadata lastMetadata,
                                            MetadataPolicy policy) {
        if (policy == MetadataPolicy.DISCARD_ALL) {
            return newMetadata;
        }

        for (String n : lastMetadata.names()) {
            // If this is one of the metadata keys we're setting ourselves
            //  for tracking/errors, then always keep the latest one!
            if (n.equals(TikaCoreProperties.TIKA_PARSED_BY.getName())) {
                continue;
            }
            if (n.equals(ParserUtils.EMBEDDED_PARSER.getName())) {
                continue;
            }
            if (n.equals(TikaCoreProperties.EMBEDDED_EXCEPTION.getName())) {
                continue;
            }

            // Merge as per policy
            String[] newVals = newMetadata.getValues(n);
            String[] oldVals = lastMetadata.getValues(n);
            if (newVals == null || newVals.length == 0) {
                // Metadata only in previous run, keep old values
                for (String val : oldVals) {
                    newMetadata.add(n, val);
                }
            } else if (Arrays.deepEquals(oldVals, newVals)) {
                // Metadata is the same, nothing to do
                continue;
            } else {
                switch (policy) {
                    case FIRST_WINS:
                        // Use the earlier value(s) in place of this/these one/s
                        newMetadata.remove(n);
                        for (String val : oldVals) {
                            newMetadata.add(n, val);
                        }
                        continue;
                    case LAST_WINS:
                        // Most recent (last) parser has already won
                        continue;
                    case KEEP_ALL:
                        // Start with old list, then add any new unique values
                        List<String> vals = new ArrayList<>(Arrays.asList(oldVals));
                        newMetadata.remove(n);
                        for (String oldVal : oldVals) {
                            newMetadata.add(n, oldVal);
                        }
                        for (String newVal : newVals) {
                            if (!vals.contains(newVal)) {
                                newMetadata.add(n, newVal);
                                vals.add(newVal);
                            }
                        }

                        continue;
                }
            }
        }
        return newMetadata;
    }

    /**
     * Returns the media type registry used to infer type relationships.
     *
     * @return media type registry
     */
    public MediaTypeRegistry getMediaTypeRegistry() {
        return registry;
    }

    /**
     * Sets the media type registry used to infer type relationships.
     *
     * @param registry media type registry
     */
    public void setMediaTypeRegistry(MediaTypeRegistry registry) {
        this.registry = registry;
    }

    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return offeredTypes;
    }

    public MetadataPolicy getMetadataPolicy() {
        return policy;
    }

    public List<Parser> getAllParsers() {
        return Collections.unmodifiableList(new ArrayList<>(parsers));
    }

    /**
     * Used to allow implementations to prepare or change things
     * before parsing occurs
     */
    protected void parserPrepare(Parser parser, Metadata metadata, ParseContext context) {
    }

    /**
     * Used to notify implementations that a Parser has Finished
     * or Failed, and to allow them to decide to continue or
     * abort further parsing
     */
    protected abstract boolean parserCompleted(Parser parser, Metadata metadata,
                                               ContentHandler handler, ParseContext context,
                                               Exception exception);

    /**
     * Processes the given Stream through one or more parsers,
     * resetting things between parsers as requested by policy.
     * The actual processing is delegated to one or more {@link Parser}s.
     * <p>
     * Note that you'll get text from every parser this way, to have
     * control of which content is from which parser you need to
     * call the method with a {@link ContentHandlerFactory} instead.
     */
    @Override
    public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata,
                      ParseContext context) throws IOException, SAXException, TikaException {
        parse(tis, handler, null, metadata, context);
    }

    /**
     * Processes the given Stream through one or more parsers,
     * resetting things between parsers as requested by policy.
     * The actual processing is delegated to one or more {@link Parser}s.
     * You will get one ContentHandler fetched for each Parser used.
     * TODO Do we need to return all the ContentHandler instances we created?
     *
     * @deprecated The {@link ContentHandlerFactory} override is still experimental
     * and the method signature is subject to change before Tika 2.0
     */
    @Deprecated
    public void parse(TikaInputStream tis, ContentHandlerFactory handlers, Metadata metadata,
                      ParseContext context) throws IOException, SAXException, TikaException {
        parse(tis, null, handlers, metadata, context);
    }

    private void parse(TikaInputStream tis, ContentHandler handler,
                       ContentHandlerFactory handlerFactory, Metadata originalMetadata,
                       ParseContext context) throws IOException, SAXException, TikaException {
        // Enable rewind capability since we rewind between multiple parsers
        tis.enableRewind();

        // Track the metadata between parsers, so we can apply our policy
        Metadata lastMetadata = cloneMetadata(originalMetadata);
        Metadata metadata = lastMetadata;

        // Start tracking resources, so we can clean up when done
        TemporaryResources tmp = new TemporaryResources();
        try {
            for (Parser p : parsers) {
                // Get a new handler for this parser, if we can
                // If not, the user will get text from every parser
                //  mushed together onto the one solitary handler...
                if (handlerFactory != null) {
                    handler = handlerFactory.createHandler();
                }

                // Record that we used this parser
                recordParserDetails(p, originalMetadata);

                // Prepare an near-empty Metadata, will merge after
                metadata = cloneMetadata(originalMetadata);

                // Notify the implementation of what we're about to do
                parserPrepare(p, metadata, context);

                // Process if possible
                Exception failure = null;
                try {
                    p.parse(tis, handler, metadata, context);
                } catch (Exception e) {
                    // Record the failure such that it can't get lost / overwritten
                    recordParserFailure(p, e, originalMetadata);
                    recordParserFailure(p, e, metadata);
                    failure = e;
                }

                // Notify the implementation how it went
                boolean tryNext = parserCompleted(p, metadata, handler, context, failure);

                // Handle metadata merging / clashes
                metadata = mergeMetadata(metadata, lastMetadata, policy);

                // Abort if requested, with the exception if there was one
                if (!tryNext) {
                    if (failure != null) {
                        if (failure instanceof IOException) {
                            throw (IOException) failure;
                        }
                        if (failure instanceof SAXException) {
                            throw (SAXException) failure;
                        }
                        if (failure instanceof TikaException) {
                            throw (TikaException) failure;
                        }
                        throw new TikaException("Unexpected RuntimeException from " + p, failure);
                    }
                    // Abort processing, don't try any more parsers
                    break;
                }

                // Prepare for the next parser, if present
                lastMetadata = cloneMetadata(metadata);
                tis.rewind();
            }
        } finally {
            tmp.dispose();
        }

        // Finally, copy the latest metadata back onto their supplied object
        for (String n : metadata.names()) {
            originalMetadata.remove(n);
            for (String val : metadata.getValues(n)) {
                originalMetadata.add(n, val);
            }
        }
    }

    /**
     * The various strategies for handling metadata emitted by
     * multiple parsers.
     * Note that not all will be supported by all subclasses.
     */
    public enum MetadataPolicy {
        /**
         * Before moving onto another parser, throw away
         * all previously seen metadata
         */
        DISCARD_ALL,
        /**
         * The first parser to output a given key wins,
         * merge in non-clashing other keys
         */
        FIRST_WINS,
        /**
         * The last parser to output a given key wins,
         * overriding previous parser values for a
         * clashing key.
         */
        LAST_WINS,
        /**
         * Where multiple parsers output a given key,
         * store all their different (unique) values
         */
        KEEP_ALL
    }
}