RecursiveParserWrapper.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser;

import java.io.IOException;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;

import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import org.apache.tika.exception.CorruptedFileException;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.exception.ZeroByteFileException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.extractor.ParentContentHandler;
import org.apache.tika.io.FilenameUtils;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.sax.RecursiveParserWrapperHandler;
import org.apache.tika.sax.SecureContentHandler;
import org.apache.tika.sax.WriteLimiter;
import org.apache.tika.utils.ExceptionUtils;
import org.apache.tika.utils.ParserUtils;

/**
 * This is a helper class that wraps a parser in a recursive handler.
 * It takes care of setting the embedded parser in the ParseContext
 * and handling the embedded path calculations.
 * <p>
 * After parsing a document, call getMetadata() to retrieve a list of
 * Metadata objects, one for each embedded resource.  The first item
 * in the list will contain the Metadata for the outer container file.
 * <p>
 * Content can also be extracted and stored in the {@link TikaCoreProperties#TIKA_CONTENT} field
 * of a Metadata object.  Select the type of content to be stored
 * at initialization.
 * <p>
 * If a WriteLimitReachedException is encountered, the wrapper will stop
 * processing the current resource, and it will not process
 * any of the child resources for the given resource.  However, it will try to
 * parse as much as it can.  If a WLRE is reached in the parent document,
 * no child resources will be parsed.
 * <p>
 * The implementation is based on Jukka's RecursiveMetadataParser
 * and Nick's additions. See:
 * <a href="http://wiki.apache.org/tika/RecursiveMetadata#Jukka.27s_RecursiveMetadata_Parser">RecursiveMetadataParser</a>.
 * <p>
 * Note that this wrapper holds all data in memory and is not appropriate
 * for files with content too large to be held in memory.
 * <p>
 * The unit tests for this class are in the tika-parsers module.
 * </p>
 */
public class RecursiveParserWrapper extends ParserDecorator {

    /**
     * Generated serial version
     */
    private static final long serialVersionUID = 9086536568120690938L;


    private final boolean catchEmbeddedExceptions;

    private final boolean inlineContent = false;
    /**
     * Initialize the wrapper with {@link #catchEmbeddedExceptions} set
     * to <code>true</code> as default.
     *
     * @param wrappedParser parser to use for the container documents and the embedded documents
     */
    public RecursiveParserWrapper(Parser wrappedParser) {
        this(wrappedParser, true);
    }

    /**
     * @param wrappedParser           parser to wrap
     * @param catchEmbeddedExceptions whether or not to catch+record embedded exceptions.
     *                                If set to <code>false</code>, embedded exceptions will be
     *                                thrown and the rest of the file will not be parsed. The
     *                                following will not be ignored:
     *                                {@link CorruptedFileException}, {@link RuntimeException}
     */
    public RecursiveParserWrapper(Parser wrappedParser, boolean catchEmbeddedExceptions) {
        super(wrappedParser);
        this.catchEmbeddedExceptions = catchEmbeddedExceptions;
    }


    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return getWrappedParser().getSupportedTypes(context);
    }


    /**
     * @param stream
     * @param recursiveParserWrapperHandler -- handler must implement
     * {@link RecursiveParserWrapperHandler}
     * @param metadata
     * @param context
     * @throws IOException
     * @throws SAXException
     * @throws TikaException
     * @throws IllegalStateException if the handler is not a {@link RecursiveParserWrapperHandler}
     */
    @Override
    public void parse(TikaInputStream tis, ContentHandler recursiveParserWrapperHandler,
                      Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        //this tracks the state of the parent parser, per call to #parse
        ParserState parserState;
        if (recursiveParserWrapperHandler instanceof AbstractRecursiveParserWrapperHandler) {
            parserState = new ParserState(
                    (AbstractRecursiveParserWrapperHandler) recursiveParserWrapperHandler);
        } else {
            throw new IllegalStateException(
                    "ContentHandler must implement RecursiveParserWrapperHandler");
        }
        EmbeddedParserDecorator decorator =
                new EmbeddedParserDecorator(getWrappedParser(), "/", "/", parserState);
        context.set(Parser.class, decorator);
        ContentHandler localHandler =
                parserState.recursiveParserWrapperHandler.createHandler();
        long started = System.currentTimeMillis();
        parserState.recursiveParserWrapperHandler.startDocument();
        int writeLimit = -1;
        boolean throwOnWriteLimitReached = true;

        if (recursiveParserWrapperHandler instanceof AbstractRecursiveParserWrapperHandler) {
            ContentHandlerFactory factory =
                    ((AbstractRecursiveParserWrapperHandler)recursiveParserWrapperHandler).getContentHandlerFactory();
            if (factory instanceof WriteLimiter) {
                writeLimit = ((WriteLimiter)factory).getWriteLimit();
                throwOnWriteLimitReached = ((WriteLimiter)factory).isThrowOnWriteLimitReached();
            }
        }
        try {
            RecursivelySecureContentHandler secureContentHandler =
                    new RecursivelySecureContentHandler(localHandler, tis, new SecureHandlerCounter(writeLimit),
                            throwOnWriteLimitReached, context);
            context.set(RecursivelySecureContentHandler.class, secureContentHandler);
            getWrappedParser().parse(tis, secureContentHandler, metadata, context);
        } catch (Throwable e) {
            if (e instanceof EncryptedDocumentException) {
                metadata.set(TikaCoreProperties.IS_ENCRYPTED, "true");
            }
            if (WriteLimitReachedException.isWriteLimitReached(e)) {
                metadata.set(TikaCoreProperties.WRITE_LIMIT_REACHED, "true");
            } else {
                String stackTrace = ExceptionUtils.getFilteredStackTrace(e);
                metadata.add(TikaCoreProperties.CONTAINER_EXCEPTION, stackTrace);
                throw e;
            }
        } finally {
            long elapsedMillis = System.currentTimeMillis() - started;
            metadata.set(TikaCoreProperties.PARSE_TIME_MILLIS, Long.toString(elapsedMillis));
            parserState.recursiveParserWrapperHandler.endDocument(localHandler, metadata);
            parserState.recursiveParserWrapperHandler.endDocument();
            context.set(RecursivelySecureContentHandler.class, null);
        }
    }

    public static String getResourceName(Metadata metadata, AtomicInteger counter) {
        String objectName = "";
        if (metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY) != null) {
            objectName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
        } else if (metadata.get(TikaCoreProperties.INTERNAL_PATH) != null) {
            objectName = FilenameUtils.getName(metadata.get(TikaCoreProperties.INTERNAL_PATH));
        } else if (metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID) != null) {
            objectName = metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID);
        } else if (metadata.get(TikaCoreProperties.VERSION_NUMBER) != null) {
            objectName = "version-number-" + metadata.get(TikaCoreProperties.VERSION_NUMBER);
        } else {
            objectName = EmbeddedDocumentUtil.generateResourceName(
                    EmbeddedDocumentUtil.EmbeddedResourcePrefix.EMBEDDED,
                    counter.incrementAndGet(),
                    metadata.get(Metadata.CONTENT_TYPE));
            metadata.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true);
        }
        //make sure that there isn't any path info in the objectName
        //some parsers can return paths, not just file names
        objectName = FilenameUtils.getName(objectName);
        return objectName;
    }


    private class EmbeddedParserDecorator extends StatefulParser {

        private static final long serialVersionUID = 207648200464263337L;
        private final ParserState parserState;
        private String location = null;

        private String embeddedIdPath = null;


        private EmbeddedParserDecorator(Parser parser, String location,
                                        String embeddedIdPath, ParserState parseState) {
            super(parser);
            this.location = location;
            if (!this.location.endsWith("/")) {
                this.location += "/";
            }
            this.embeddedIdPath = embeddedIdPath;
            this.parserState = parseState;
        }

        @Override
        public void parse(TikaInputStream tis, ContentHandler ignore, Metadata metadata,
                          ParseContext context) throws IOException, SAXException, TikaException {

            // Work out what this thing is
            String objectName = getResourceName(metadata, parserState.unknownCount);
            String objectLocation = this.location + objectName;

            metadata.add(TikaCoreProperties.EMBEDDED_RESOURCE_PATH, objectLocation);

            String idPath =
                    this.embeddedIdPath.equals("/") ?
                            this.embeddedIdPath + ++parserState.embeddedCount :
                            this.embeddedIdPath + "/" + ++parserState.embeddedCount;
            metadata.add(TikaCoreProperties.EMBEDDED_ID_PATH, idPath);
            metadata.set(TikaCoreProperties.EMBEDDED_ID, parserState.embeddedCount);
            //get a fresh handler
            ContentHandler localHandler =
                    parserState.recursiveParserWrapperHandler.createHandler();
            parserState.recursiveParserWrapperHandler.startEmbeddedDocument(localHandler, metadata);

            Parser preContextParser = context.get(Parser.class);
            context.set(Parser.class,
                    new EmbeddedParserDecorator(getWrappedParser(), objectLocation,
                            idPath, parserState));
            long started = System.currentTimeMillis();
            //store the handler that was used before this parse
            //so that you can return it back to its state at the end of this parse
            RecursivelySecureContentHandler preParseHandler = context.get(RecursivelySecureContentHandler.class);

            ParentContentHandler preParseParentHandler = context.get(ParentContentHandler.class);
            context.set(ParentContentHandler.class, new ParentContentHandler(preParseHandler));
            ContentHandler secureContentHandler =
                    new RecursivelySecureContentHandler(localHandler, tis, preParseHandler.handlerCounter,
                    preParseHandler.throwOnWriteLimitReached, context);

            try {
                tis.setCloseShield();
                super.parse(tis, secureContentHandler, metadata, context);
            } catch (SAXException e) {
                if (WriteLimitReachedException.isWriteLimitReached(e)) {
                    metadata.add(TikaCoreProperties.WRITE_LIMIT_REACHED, "true");
                    throw e;
                } else {
                    if (catchEmbeddedExceptions) {
                        ParserUtils.recordParserFailure(this, e, metadata);
                    } else {
                        throw e;
                    }
                }
            } catch (CorruptedFileException e) {
                throw e;
            } catch (TikaException e) {
                if (e instanceof EncryptedDocumentException) {
                    metadata.set(TikaCoreProperties.IS_ENCRYPTED, true);
                }
                if (context.get(ZeroByteFileException.IgnoreZeroByteFileException.class) != null &&
                        e instanceof ZeroByteFileException) {
                    //do nothing
                } else if (catchEmbeddedExceptions) {
                    ParserUtils.recordParserFailure(this, e, metadata);
                } else {
                    throw e;
                }
            } finally {
                tis.removeCloseShield();
                context.set(Parser.class, preContextParser);
                context.set(RecursivelySecureContentHandler.class, preParseHandler);
                context.set(ParentContentHandler.class, preParseParentHandler);
                long elapsedMillis = System.currentTimeMillis() - started;
                metadata.set(TikaCoreProperties.PARSE_TIME_MILLIS, Long.toString(elapsedMillis));
                parserState.recursiveParserWrapperHandler
                        .endEmbeddedDocument(localHandler, metadata);
            }
        }
    }

    /**
     * This tracks the state of the parse of a single document.
     * In future versions, this will allow the RecursiveParserWrapper to be thread safe.
     */
    private static class ParserState {
        private final AbstractRecursiveParserWrapperHandler recursiveParserWrapperHandler;
        private AtomicInteger unknownCount = new AtomicInteger(0);
        private int embeddedCount = 0;//this is effectively 1-indexed
        private ParserState(AbstractRecursiveParserWrapperHandler handler) {
            this.recursiveParserWrapperHandler = handler;
        }
    }

    static class SecureHandlerCounter {
        private final int totalWriteLimit;
        private boolean writeLimitReached = false;
        //total chars written to all handlers
        private int totalChars = 0;

        private SecureHandlerCounter(int totalWriteLimit) {
            this.totalWriteLimit = totalWriteLimit;
        }
        /**
         * Given the requested length, how many characters are actually available
         * @param length
         * @return
         */
        int getAvailable(int length) {
            return Math.min(totalWriteLimit - totalChars, length);
        }
        void addChars(int numChars) {
            totalChars += numChars;
        }

    }

    //
    static class RecursivelySecureContentHandler extends SecureContentHandler {
        private static AtomicInteger COUNTER = new AtomicInteger();
        private final ContentHandler handler;
        private final SecureHandlerCounter handlerCounter;

        private final boolean throwOnWriteLimitReached;

        private final ParseContext parseContext;

        private final int id = COUNTER.getAndIncrement();

        public RecursivelySecureContentHandler(ContentHandler handler, TikaInputStream stream,
                                               SecureHandlerCounter handlerCounter,
                                               boolean throwOnWriteLimitReached, ParseContext parseContext) {
            super(handler, stream);
            this.handler = handler;
            this.handlerCounter = handlerCounter;
            this.throwOnWriteLimitReached = throwOnWriteLimitReached;
            this.parseContext = parseContext;
        }

        /**
         * Bypass the SecureContentHandler...
         * <p>
         * This handler only looks at zip bomb via zip expansion.
         * Users should be protected within entries against nested
         * nested xml entities.  We don't want to carry
         * those stats _across_ entries.
         *
         * @param uri
         * @param localName
         * @param name
         * @param atts
         * @throws SAXException
         */
        @Override
        public void startElement(String uri, String localName, String name, Attributes atts)
                throws SAXException {
            this.handler.startElement(uri, localName, name, atts);
        }

        @Override
        public void endElement(String uri, String localName, String name) throws SAXException {
            this.handler.endElement(uri, localName, name);
        }

        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            if (handlerCounter.writeLimitReached) {
                return;
            }

            if (handlerCounter.totalWriteLimit < 0) {
                super.characters(ch, start, length);
                return;
            }
            int availableLength = handlerCounter.getAvailable(length);
            super.characters(ch, start, availableLength);
            handlerCounter.addChars(availableLength);
            if (availableLength < length) {
                handleWriteLimitReached();
            }
        }

        @Override
        public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
            if (handlerCounter.writeLimitReached) {
                return;
            }

            if (handlerCounter.totalWriteLimit < 0) {
                super.ignorableWhitespace(ch, start, length);
                return;
            }
            int availableLength = handlerCounter.getAvailable(length);
            super.ignorableWhitespace(ch, start, availableLength);
            handlerCounter.addChars(availableLength);
            if (availableLength < length) {
                handleWriteLimitReached();
            }
        }

        private void handleWriteLimitReached() throws WriteLimitReachedException {
            handlerCounter.writeLimitReached = true;

            if (throwOnWriteLimitReached) {
                throw new WriteLimitReachedException(handlerCounter.totalWriteLimit);
            } else {
                ParseRecord parseRecord = parseContext.get(ParseRecord.class);
                if (parseRecord != null) {
                    parseRecord.setWriteLimitReached(true);
                }
            }
        }
    }
}