BasicContentHandlerFactory.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.sax;


import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.Locale;

import org.xml.sax.ContentHandler;
import org.xml.sax.helpers.DefaultHandler;

import org.apache.tika.config.OutputLimits;
import org.apache.tika.config.TikaComponent;
import org.apache.tika.parser.ParseContext;

/**
 * Basic factory for creating common types of ContentHandlers.
 * <p>
 * Implements {@link StreamingContentHandlerFactory} to support both in-memory
 * content extraction and streaming output to an OutputStream.
 */
@TikaComponent(defaultFor = ContentHandlerFactory.class)
public class BasicContentHandlerFactory implements StreamingContentHandlerFactory, WriteLimiter {

    private HANDLER_TYPE type = HANDLER_TYPE.MARKDOWN;
    private int writeLimit = -1;
    private boolean throwOnWriteLimitReached = true;
    private transient ParseContext parseContext;

    /**
     * No-arg constructor for bean-style configuration (e.g., Jackson deserialization).
     * Creates a factory with TEXT handler type, unlimited write, and throwOnWriteLimitReached=true.
     */
    public BasicContentHandlerFactory() {
    }

    /**
     * Create a BasicContentHandlerFactory with {@link #throwOnWriteLimitReached} is true
     * @param type       basic type of handler
     * @param writeLimit max number of characters to store; if < 0,
     *                   the handler will store all characters
     */
    public BasicContentHandlerFactory(HANDLER_TYPE type, int writeLimit) {
        this(type, writeLimit, true, null);
    }

    /**
     *
     * @param type basic type of handler
     * @param writeLimit maximum number of characters to store
     * @param throwOnWriteLimitReached whether or not to throw a
     *          {@link org.apache.tika.exception.WriteLimitReachedException}
     *                                 when the write limit has been reached
     * @param parseContext to store the writelimitreached warning if
     *                 throwOnWriteLimitReached is set to <code>false</code>
     */
    public BasicContentHandlerFactory(HANDLER_TYPE type, int writeLimit,
                                      boolean throwOnWriteLimitReached, ParseContext parseContext) {
        this.type = type;
        this.writeLimit = writeLimit;
        this.throwOnWriteLimitReached = throwOnWriteLimitReached;
        this.parseContext = parseContext;
        if (throwOnWriteLimitReached == false && parseContext == null) {
            throw new IllegalArgumentException("parse context must not be null if " +
                    "throwOnWriteLimitReached is false");
        }
    }

    /**
     * Creates a new BasicContentHandlerFactory configured from OutputLimits in the ParseContext.
     * <p>
     * If OutputLimits is present in the context, the factory will be configured with those
     * limits (writeLimit, throwOnWriteLimit). Otherwise, default values are used.
     *
     * @param type the handler type
     * @param context the ParseContext (required if throwOnWriteLimit is false)
     * @return a configured BasicContentHandlerFactory
     */
    public static BasicContentHandlerFactory newInstance(HANDLER_TYPE type, ParseContext context) {
        OutputLimits limits = OutputLimits.get(context);
        return new BasicContentHandlerFactory(type, limits.getWriteLimit(),
                limits.isThrowOnWriteLimit(), context);
    }

    /**
     * Tries to parse string into handler type.  Returns default if string is null or
     * parse fails.
     * <p/>
     * Options: xml, html, text, body, ignore (no content), markdown/md
     *
     * @param handlerTypeName string to parse
     * @param defaultType     type to return if parse fails
     * @return handler type
     */
    public static HANDLER_TYPE parseHandlerType(String handlerTypeName, HANDLER_TYPE defaultType) {
        if (handlerTypeName == null) {
            return defaultType;
        }

        String lcHandlerTypeName = handlerTypeName.toLowerCase(Locale.ROOT);
        switch (lcHandlerTypeName) {
            case "xml":
                return HANDLER_TYPE.XML;
            case "text":
                return HANDLER_TYPE.TEXT;
            case "txt":
                return HANDLER_TYPE.TEXT;
            case "html":
                return HANDLER_TYPE.HTML;
            case "body":
                return HANDLER_TYPE.BODY;
            case "ignore":
                return HANDLER_TYPE.IGNORE;
            case "markdown":
            case "md":
                return HANDLER_TYPE.MARKDOWN;
            default:
                return defaultType;
        }
    }

    @Override
    public ContentHandler createHandler() {

        if (type == HANDLER_TYPE.BODY) {
            return new BodyContentHandler(
                    new WriteOutContentHandler(new ToTextContentHandler(), writeLimit,
                    throwOnWriteLimitReached, parseContext));
        } else if (type == HANDLER_TYPE.IGNORE) {
            return new DefaultHandler();
        }
        ContentHandler formatHandler = getFormatHandler();
        if (writeLimit < 0) {
            return formatHandler;
        }
        return new WriteOutContentHandler(formatHandler, writeLimit, throwOnWriteLimitReached,
                parseContext);
    }

    private ContentHandler getFormatHandler() {
        switch (type) {
            case TEXT:
                return new ToTextContentHandler();
            case HTML:
                return new ToHTMLContentHandler();
            case XML:
                return new ToXMLContentHandler();
            case MARKDOWN:
                return new ToMarkdownContentHandler();
            default:
                return new ToTextContentHandler();
        }
    }

    @Override
    public ContentHandler createHandler(OutputStream os, Charset charset) {

        if (type == HANDLER_TYPE.IGNORE) {
            return new DefaultHandler();
        }
        try {
            if (writeLimit > -1) {
                switch (type) {
                    case BODY:
                        return new WriteOutContentHandler(
                                new BodyContentHandler(new OutputStreamWriter(os, charset)),
                                writeLimit);
                    case TEXT:
                        return new WriteOutContentHandler(
                                new ToTextContentHandler(os, charset.name()), writeLimit);
                    case HTML:
                        return new WriteOutContentHandler(
                                new ToHTMLContentHandler(os, charset.name()), writeLimit);
                    case XML:
                        return new WriteOutContentHandler(
                                new ToXMLContentHandler(os, charset.name()), writeLimit);
                    case MARKDOWN:
                        return new WriteOutContentHandler(
                                new ToMarkdownContentHandler(os, charset.name()), writeLimit);
                    default:
                        return new WriteOutContentHandler(
                                new ToTextContentHandler(os, charset.name()), writeLimit);
                }
            } else {
                switch (type) {
                    case BODY:
                        return new BodyContentHandler(new OutputStreamWriter(os, charset));
                    case TEXT:
                        return new ToTextContentHandler(os, charset.name());
                    case HTML:
                        return new ToHTMLContentHandler(os, charset.name());
                    case XML:
                        return new ToXMLContentHandler(os, charset.name());
                    case MARKDOWN:
                        return new ToMarkdownContentHandler(os, charset.name());
                    default:
                        return new ToTextContentHandler(os, charset.name());

                }
            }
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException("couldn't find charset for name: " + charset);
        }
    }

    /**
     * @return handler type used by this factory
     */
    public HANDLER_TYPE getType() {
        return type;
    }

    @Override
    public String handlerTypeName() {
        return type.name();
    }

    /**
     * Sets the handler type.
     * @param type the handler type
     */
    public void setType(HANDLER_TYPE type) {
        this.type = type;
    }

    /**
     * Common handler types for content.
     */
    public enum HANDLER_TYPE {
        BODY, IGNORE, //don't store content
        TEXT, HTML, XML, MARKDOWN
    }

    public int getWriteLimit() {
        return writeLimit;
    }

    /**
     * Sets the write limit.
     * @param writeLimit max characters to extract; -1 for unlimited
     */
    public void setWriteLimit(int writeLimit) {
        this.writeLimit = writeLimit;
    }

    @Override
    public boolean isThrowOnWriteLimitReached() {
        return throwOnWriteLimitReached;
    }

    /**
     * Sets whether to throw an exception when write limit is reached.
     * @param throwOnWriteLimitReached true to throw, false to silently stop
     */
    public void setThrowOnWriteLimitReached(boolean throwOnWriteLimitReached) {
        this.throwOnWriteLimitReached = throwOnWriteLimitReached;
    }

    /**
     * Sets the parse context for storing warnings when throwOnWriteLimitReached is false.
     * @param parseContext the parse context
     */
    public void setParseContext(ParseContext parseContext) {
        this.parseContext = parseContext;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        BasicContentHandlerFactory that = (BasicContentHandlerFactory) o;
        return writeLimit == that.writeLimit &&
                throwOnWriteLimitReached == that.throwOnWriteLimitReached &&
                type == that.type;
    }

    @Override
    public int hashCode() {
        int result = type != null ? type.hashCode() : 0;
        result = 31 * result + writeLimit;
        result = 31 * result + (throwOnWriteLimitReached ? 1 : 0);
        return result;
    }
}