// SafeContentHandler.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.sax;

import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

/**
 * Content handler decorator that makes sure that the character events
 * ({@link #characters(char[], int, int)} or
 * {@link #ignorableWhitespace(char[], int, int)}) passed to the decorated
 * content handler contain only valid XML characters. All invalid characters
 * are replaced with the Unicode replacement character U+FFFD (though a
 * subclass may change this by overriding the {@link #writeReplacement(Output)}  method).
 * <p>
 * The XML standard defines the following Unicode character ranges as
 * valid XML characters:
 * <pre>
 * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
 * </pre>
 * <p>
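 * Note that the input is examined one Unicode code point at a time: a
 * well-formed UTF-16 surrogate pair is treated as a single supplementary
 * character, while an unpaired surrogate falls in the invalid range
 * #xD800-#xDFFF and is replaced by default. A surrogate pair that is split
 * across two separate character events is therefore seen as two unpaired
 * surrogates.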
 */
public class SafeContentHandler extends ContentHandlerDecorator {

    /**
     * Replacement for invalid characters: the Unicode replacement
     * character U+FFFD.
     */
    private static final char[] REPLACEMENT = new char[]{'\ufffd'};
    /**
     * Output through the {@link ContentHandler#characters(char[], int, int)}
     * method of the decorated content handler.
     */
    private final Output charactersOutput = SafeContentHandler.super::characters;
    /**
     * Output through the
     * {@link ContentHandler#ignorableWhitespace(char[], int, int)}
     * method of the decorated content handler.
     */
    private final Output ignorableWhitespaceOutput = SafeContentHandler.super::ignorableWhitespace;

    /**
     * Creates a handler that filters invalid XML characters before passing
     * events on to the given content handler.
     *
     * @param handler the content handler to decorate
     */
    public SafeContentHandler(ContentHandler handler) {
        super(handler);
    }

    /**
     * Filters and outputs the contents of the given input buffer. Any
     * invalid characters in the input buffer are handled by sending a
     * replacement (by default the Unicode replacement character U+FFFD,
     * see {@link #writeReplacement(Output)}) to the given output. Any
     * sequences of valid characters are passed as-is to the given output.
     * <p>
     * The buffer is scanned one Unicode code point at a time, so a
     * surrogate pair that lies completely within the given range is
     * checked as a single character.
     *
     * @param ch     input buffer
     * @param start  start offset within the buffer
     * @param length number of characters to read from the buffer
     * @param output output channel
     * @throws SAXException if the filtered characters could not be written out
     */
    private void filter(char[] ch, int start, int length, Output output) throws SAXException {
        int end = start + length;

        // "start" always marks the beginning of the current run of valid
        // characters that has not been written out yet.
        int i = start;
        while (i < end) {
            int c = Character.codePointAt(ch, i, end);
            int j = i + Character.charCount(c);

            if (isInvalid(c)) {
                // Output any preceding valid characters
                if (i > start) {
                    output.write(ch, start, i - start);
                }

                // Output the replacement for this invalid character
                writeReplacement(output);

                // Continue with the rest of the array
                start = j;
            }

            i = j;
        }

        // Output any remaining valid characters. This call is made
        // unconditionally, so a zero-length event is still forwarded when
        // nothing remains (or when the input itself was empty).
        output.write(ch, start, end - start);
    }

    /**
     * Checks if the given string contains any invalid XML characters.
     * A <code>null</code> string contains no characters and is therefore
     * never reported as invalid.
     *
     * @param value string to be checked, possibly <code>null</code>
     * @return <code>true</code> if the string contains invalid XML characters,
     * <code>false</code> otherwise
     */
    private boolean isInvalid(String value) {
        if (value == null) {
            // AttributesImpl accepts null values; there is nothing to filter
            return false;
        }

        int length = value.length();
        int i = 0;
        while (i < length) {
            // Read a whole code point so that surrogate pairs are checked once
            int c = value.codePointAt(i);
            if (isInvalid(c)) {
                return true;
            }
            i += Character.charCount(c);
        }

        return false;
    }

    /**
     * Checks whether the given Unicode character is an invalid XML character
     * and should be replaced for output. Subclasses can override this method
     * to use an alternative definition of which characters should be replaced
     * in the XML output. The default definition from the XML specification is:
     * <pre>
     * Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
     * </pre>
     *
     * @param ch character (Unicode code point)
     * @return <code>true</code> if the character should be replaced,
     * <code>false</code> otherwise
     */
    protected boolean isInvalid(int ch) {
        if (ch < 0x20) {
            // Control characters: only tab, line feed and carriage return are valid
            return ch != 0x09 && ch != 0x0A && ch != 0x0D;
        } else if (ch < 0xE000) {
            // Everything up to #xD7FF is valid, the surrogate range is not
            return ch > 0xD7FF;
        } else if (ch < 0x10000) {
            // #xFFFE and #xFFFF are not valid
            return ch > 0xFFFD;
        } else {
            // Supplementary characters are valid up to #x10FFFF
            return ch > 0x10FFFF;
        }
    }

    /**
     * Outputs the replacement for an invalid character. Subclasses can
     * override this method to use a custom replacement. The default
     * implementation writes the Unicode replacement character U+FFFD.
     *
     * @param output where the replacement is written to
     * @throws SAXException if the replacement could not be written
     */
    protected void writeReplacement(Output output) throws SAXException {
        output.write(REPLACEMENT, 0, REPLACEMENT.length);
    }

    /**
     * Forwards the start of an element after replacing any invalid XML
     * characters found in its attribute values. The given attributes are
     * passed on unchanged when no value needs filtering; otherwise a
     * filtered copy is passed on instead.
     *
     * @param uri       namespace URI of the element
     * @param localName local name of the element
     * @param name      qualified name of the element
     * @param atts      attributes of the element
     * @throws SAXException if the event could not be processed
     */
    @Override
    public void startElement(String uri, String localName, String name, Attributes atts)
            throws SAXException {
        // TODO: enable this, but some parsers currently
        // trip it
        //assert verifyStartElement(name);

        // Copy the attributes lazily: only once the first value with an
        // invalid character is found, so the common case allocates nothing.
        AttributesImpl filtered = null;
        for (int i = 0; i < atts.getLength(); i++) {
            String value = atts.getValue(i);
            if (isInvalid(value)) {
                if (filtered == null) {
                    filtered = new AttributesImpl(atts);
                }
                // Filter the attribute value into a string buffer
                Output buffer = new StringOutput();
                filter(value.toCharArray(), 0, value.length(), buffer);
                filtered.setValue(i, buffer.toString());
            }
        }
        super.startElement(uri, localName, name, filtered != null ? filtered : atts);
    }

    /**
     * Forwards the end of an element unchanged.
     *
     * @param uri       namespace URI of the element
     * @param localName local name of the element
     * @param name      qualified name of the element
     * @throws SAXException if the event could not be processed
     */
    @Override
    public void endElement(String uri, String localName, String name) throws SAXException {
        // TODO: enable this, but some parsers currently
        // trip it
        //assert verifyEndElement(name);
        super.endElement(uri, localName, name);
    }


    /*
    private final List<String> elements = new ArrayList<String>();

    // Called only from assert
    private boolean verifyStartElement(String name) {
        // TODO: we could strengthen this to do full
        // XHTML validation, eg you shouldn't start p inside
        // another p (but ODF parser, at least, seems to
        // violate this):
        //if (name.equals("p")) {
        //assert elements.size() == 0 || !elements.get(elements.size()-1).equals("p");
        //}
        elements.add(name);
        return true;
    }

    // Called only from assert
    private boolean verifyEndElement(String name) {
        assert elements.size() > 0: "end tag=" + name + " with no startElement";
        final String currentElement = elements.get(elements.size()-1);
        assert currentElement.equals(name): "mismatched elements open=" +
        currentElement + " close=" + name;
        elements.remove(elements.size()-1);
        return true;
    }

    // Called only from assert
    private boolean verifyEndDocument() {
        assert elements.size() == 0;
        return true;
    }
    */

    //------------------------------------------------------< ContentHandler >

    /**
     * Forwards the end of the document unchanged.
     *
     * @throws SAXException if the event could not be processed
     */
    @Override
    public void endDocument() throws SAXException {
        // TODO: enable this, but some parsers currently
        // trip it
        //assert verifyEndDocument();
        super.endDocument();
    }

    /**
     * Filters the given character data and forwards the result as
     * character events.
     *
     * @param ch     input buffer
     * @param start  start offset within the buffer
     * @param length number of characters to read from the buffer
     * @throws SAXException if the filtered characters could not be written out
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        filter(ch, start, length, charactersOutput);
    }

    /**
     * Filters the given ignorable whitespace and forwards the result as
     * ignorable whitespace events.
     *
     * @param ch     input buffer
     * @param start  start offset within the buffer
     * @param length number of characters to read from the buffer
     * @throws SAXException if the filtered characters could not be written out
     */
    @Override
    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
        filter(ch, start, length, ignorableWhitespaceOutput);
    }

    /**
     * Internal interface that allows both character and
     * ignorable whitespace content to be filtered the same way.
     */
    @FunctionalInterface
    protected interface Output {
        /**
         * Writes the given range of characters.
         *
         * @param ch     buffer holding the characters
         * @param start  start offset within the buffer
         * @param length number of characters to write
         * @throws SAXException if the characters could not be written
         */
        void write(char[] ch, int start, int length) throws SAXException;
    }

    /**
     * Output that collects everything written to it in memory; the
     * collected text is available through {@link #toString()}.
     */
    private static class StringOutput implements Output {

        private final StringBuilder builder = new StringBuilder();

        @Override
        public void write(char[] ch, int start, int length) {
            builder.append(ch, start, length);
        }

        @Override
        public String toString() {
            return builder.toString();
        }

    }

}