// AbstractRecursiveParserWrapperHandler.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.sax;
import java.io.Serializable;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
/**
 * This is a special handler to be used only with the
 * {@link org.apache.tika.parser.RecursiveParserWrapper}.
 * It allows for finer-grained processing of embedded documents than in the legacy handlers.
 * Subclasses can choose how to process individual embedded documents.
 * <p>
 * The handler keeps a counter of the current embedded depth. The counter is advanced by
 * {@link #startEmbeddedDocument(ContentHandler, Metadata)} and reduced by
 * {@link #decrementEmbeddedDepth()}; subclasses that override the start/end callbacks
 * must keep those calls paired or the recorded depth will drift.
 */
public abstract class AbstractRecursiveParserWrapperHandler extends DefaultHandler
        implements Serializable {

    /**
     * Internal boolean property keyed by {@link TikaCoreProperties#TIKA_META_EXCEPTION_PREFIX}
     * followed by {@code embedded_resource_limit_reached}. It is only declared here; this
     * class never sets it (presumably the wrapper or a subclass does -- confirm at call sites).
     */
    public static final Property EMBEDDED_RESOURCE_LIMIT_REACHED = Property.internalBoolean(
            TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_resource_limit_reached");

    /**
     * Internal boolean property keyed by {@link TikaCoreProperties#TIKA_META_EXCEPTION_PREFIX}
     * followed by {@code embedded_depth_limit_reached}. It is only declared here; this
     * class never sets it (presumably the wrapper or a subclass does -- confirm at call sites).
     */
    public static final Property EMBEDDED_DEPTH_LIMIT_REACHED = Property.internalBoolean(
            TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_depth_limit_reached");

    /**
     * Exclusive upper bound on the embedded depth: a document whose depth would be equal to
     * or greater than this value is rejected, so the deepest accepted depth is
     * {@code MAX_DEPTH - 1}.
     */
    private static final int MAX_DEPTH = 100;

    /** Factory that {@link #createHandler()} delegates to. */
    private final ContentHandlerFactory contentHandlerFactory;

    /** Depth of the embedded document currently being processed; 0 means the main document. */
    private int embeddedDepth = 0;

    /**
     * Creates a handler backed by the given factory.
     *
     * @param contentHandlerFactory factory used by {@link #createHandler()}; it is stored
     *                              as-is and is not validated here
     */
    public AbstractRecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory) {
        this.contentHandlerFactory = contentHandlerFactory;
    }

    /**
     * Asks the configured {@link ContentHandlerFactory} for a handler.
     *
     * @return whatever {@code contentHandlerFactory.createHandler()} returns
     */
    public ContentHandler createHandler() {
        return contentHandlerFactory.createHandler();
    }

    /**
     * This is called before parsing each embedded document. Override this
     * for custom behavior. Make sure to call this in your custom classes
     * because this tracks the embedded depth.
     * <p>
     * The depth limit is checked <em>before</em> the internal counter is changed, and the
     * counter is committed only after the depth has been written to {@code metadata}. A call
     * that fails therefore leaves the tracked depth exactly as it was.
     *
     * @param contentHandler local handler to be used on this embedded document
     *                       (not used by this base implementation)
     * @param metadata       embedded document's metadata; receives
     *                       {@link TikaCoreProperties#EMBEDDED_DEPTH}
     * @throws SAXException if entering this document would bring the embedded depth to
     *                      {@code MAX_DEPTH} or beyond
     */
    public void startEmbeddedDocument(ContentHandler contentHandler, Metadata metadata)
            throws SAXException {
        int nextDepth = embeddedDepth + 1;
        if (nextDepth >= MAX_DEPTH) {
            // Reject without mutating state: the document is never "started", so there is
            // nothing for a later decrement to undo.
            // NOTE(review): assumes callers do not invoke endEmbeddedDocument for a start
            // that threw -- confirm against RecursiveParserWrapper.
            throw new SAXException("Max embedded depth reached: " + nextDepth);
        }
        metadata.set(TikaCoreProperties.EMBEDDED_DEPTH, nextDepth);
        embeddedDepth = nextDepth;
    }

    /**
     * This is called after parsing each embedded document. Override this
     * for custom behavior. This is currently a no-op aside from tracking embedded depth.
     * <p>
     * When overriding, make sure to call {@link #decrementEmbeddedDepth()}
     *
     * @param contentHandler content handler that was used on this embedded document
     *                       (not used by this base implementation)
     * @param metadata       metadata for this embedded document
     *                       (not used by this base implementation)
     * @throws SAXException never thrown by this base implementation; declared so that
     *                      overriding subclasses may throw it
     */
    public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata)
            throws SAXException {
        decrementEmbeddedDepth();
    }

    /**
     * This is called by {@link #endEmbeddedDocument(ContentHandler, Metadata)}. Users
     * overriding {@link #endEmbeddedDocument(ContentHandler, Metadata)} need to call this
     * unless they are triggering it via
     * <code>super.endEmbeddedDocument(contentHandler, metadata);</code>
     * <p>
     * The counter is decremented unconditionally; no lower bound is enforced.
     */
    protected void decrementEmbeddedDepth() {
        embeddedDepth--;
    }

    /**
     * This is called after the full parse has completed. Override this
     * for custom behavior. Make sure to call this as <code>super.endDocument(...)</code>
     * in subclasses.
     * <p>
     * The base implementation records an embedded depth of 0 on the supplied metadata. It
     * does not touch the internal depth counter.
     *
     * @param contentHandler content handler that was used on the main document
     *                       (not used by this base implementation)
     * @param metadata       metadata that was gathered for the main document; receives
     *                       {@link TikaCoreProperties#EMBEDDED_DEPTH} = 0
     * @throws SAXException never thrown by this base implementation; declared so that
     *                      overriding subclasses may throw it
     */
    public void endDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException {
        metadata.set(TikaCoreProperties.EMBEDDED_DEPTH, 0);
    }

    /**
     * Returns the factory supplied at construction time.
     *
     * @return the {@link ContentHandlerFactory} this handler was created with
     */
    public ContentHandlerFactory getContentHandlerFactory() {
        return contentHandlerFactory;
    }
}