// BasicContentHandlerFactoryTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.sax;

import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Set;

import org.junit.jupiter.api.Test;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.DefaultHandler;

import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;

/**
 * Test cases for the {@link org.apache.tika.sax.BasicContentHandlerFactory} class.
 * <p>
 * Each test drives a {@link MockParser} through handlers created by the factory
 * for one {@link BasicContentHandlerFactory.HANDLER_TYPE}, both with and without
 * a write limit, and both with the in-memory and the {@code OutputStream}
 * flavours of {@code createHandler}.
 */
public class BasicContentHandlerFactoryTest {

    //default max char len (at least in WriteOutContentHandler is 100k)
    private static final int OVER_DEFAULT = 120000;

    //copied from TikaTest in tika-parsers package

    /**
     * Fails if {@code haystack} contains {@code needle}.
     *
     * @param needle   substring that must be absent
     * @param haystack text to search
     */
    public static void assertNotContains(String needle, String haystack) {
        assertFalse(haystack.contains(needle), needle + " found in:\n" + haystack);
    }

    /**
     * Fails if {@code hayStack}, decoded as UTF-8, contains {@code needle}.
     *
     * @param needle   substring that must be absent
     * @param hayStack UTF-8 encoded bytes to search
     * @throws UnsupportedEncodingException declared for compatibility with existing callers
     */
    public static void assertNotContains(String needle, byte[] hayStack)
            throws UnsupportedEncodingException {
        assertNotContains(needle, new String(hayStack, UTF_8));
    }

    /**
     * Fails unless {@code haystack} contains {@code needle}.
     *
     * @param needle   substring that must be present
     * @param haystack text to search
     */
    public static void assertContains(String needle, String haystack) {
        assertTrue(haystack.contains(needle), needle + " not found in:\n" + haystack);
    }

    /**
     * Fails unless {@code hayStack}, decoded as UTF-8, contains {@code needle}.
     *
     * @param needle   substring that must be present
     * @param hayStack UTF-8 encoded bytes to search
     * @throws UnsupportedEncodingException declared for compatibility with existing callers
     */
    public static void assertContains(String needle, byte[] hayStack)
            throws UnsupportedEncodingException {
        assertContains(needle, new String(hayStack, UTF_8));
    }

    @Test
    public void testIgnore() throws Exception {
        Parser p = new MockParser(OVER_DEFAULT);
        ContentHandler handler =
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1)
                        .createHandler();
        assertTrue(handler instanceof DefaultHandler);
        p.parse(null, handler, null, null);
        //unfortunately, the DefaultHandler does not return "",
        assertContains("org.xml.sax.helpers.DefaultHandler", handler.toString());

        //tests that no write limit exception is thrown
        p = new MockParser(100);
        handler = new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, 5)
                .createHandler();
        assertTrue(handler instanceof DefaultHandler);
        p.parse(null, handler, null, null);
        assertContains("org.xml.sax.helpers.DefaultHandler", handler.toString());
    }

    @Test
    public void testText() throws Exception {
        Parser p = new MockParser(OVER_DEFAULT);
        BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.HANDLER_TYPE.TEXT;
        ContentHandler handler = new BasicContentHandlerFactory(type, -1).createHandler();

        assertTrue(handler instanceof ToTextContentHandler);
        p.parse(null, handler, null, null);
        String extracted = handler.toString();
        assertContains("This is the title", extracted);
        assertContains("aaaaaaaaaa", extracted);
        assertNotContains("<body", extracted);
        assertNotContains("<html", extracted);
        assertTrue(extracted.length() > 110000);

        //now test write limit: only the first 5 chars of the title should get through
        p = new MockParser(10);
        handler = new BasicContentHandlerFactory(type, 5).createHandler();
        assertTrue(handler instanceof WriteOutContentHandler);
        assertWriteLimitReached(p, handler);
        extracted = handler.toString();
        assertContains("This ", extracted);
        assertNotContains("aaaa", extracted);

        //now test outputstream call
        p = new MockParser(OVER_DEFAULT);
        ByteArrayOutputStream os = new ByteArrayOutputStream();
        handler = new BasicContentHandlerFactory(type, -1).createHandler(os, UTF_8);
        assertTrue(handler instanceof ToTextContentHandler);
        p.parse(null, handler, null, null);
        assertContains("This is the title", os.toByteArray());
        assertContains("aaaaaaaaaa", os.toByteArray());
        assertTrue(os.toByteArray().length > 110000);
        assertNotContains("<body", os.toByteArray());
        assertNotContains("<html", os.toByteArray());

        p = new MockParser(10);
        os = new ByteArrayOutputStream();
        handler = new BasicContentHandlerFactory(type, 5).createHandler(os, UTF_8);
        assertTrue(handler instanceof WriteOutContentHandler);
        assertWriteLimitReached(p, handler);
        //When writing to an OutputStream and a write limit is reached,
        //currently, nothing is written.
        assertEquals(0, os.toByteArray().length);
    }

    @Test
    public void testHTML() throws Exception {
        Parser p = new MockParser(OVER_DEFAULT);
        BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.HANDLER_TYPE.HTML;
        ContentHandler handler = new BasicContentHandlerFactory(type, -1).createHandler();

        assertTrue(handler instanceof ToHTMLContentHandler);
        p.parse(null, handler, null, null);
        String extracted = handler.toString();
        assertContains("<head><title>This is the title", extracted);
        assertContains("aaaaaaaaaa", extracted);
        assertTrue(extracted.length() > 110000);

        //now test write limit: only the first 5 chars of the title should get through
        p = new MockParser(10);
        handler = new BasicContentHandlerFactory(type, 5).createHandler();
        assertTrue(handler instanceof WriteOutContentHandler);
        assertWriteLimitReached(p, handler);
        extracted = handler.toString();
        assertContains("This ", extracted);
        assertNotContains("aaaa", extracted);

        //now test outputstream call
        p = new MockParser(OVER_DEFAULT);
        ByteArrayOutputStream os = new ByteArrayOutputStream();
        handler = new BasicContentHandlerFactory(type, -1).createHandler(os, UTF_8);
        assertTrue(handler instanceof ToHTMLContentHandler);
        p.parse(null, handler, null, null);
        assertContains("This is the title", os.toByteArray());
        assertContains("aaaaaaaaaa", os.toByteArray());
        assertContains("<body", os.toByteArray());
        assertContains("<html", os.toByteArray());
        assertTrue(os.toByteArray().length > 110000);

        p = new MockParser(10);
        os = new ByteArrayOutputStream();
        handler = new BasicContentHandlerFactory(type, 5).createHandler(os, UTF_8);
        assertTrue(handler instanceof WriteOutContentHandler);
        assertWriteLimitReached(p, handler);
        //same as for text: nothing is written to the stream when the limit is hit
        assertEquals(0, os.toByteArray().length);
    }

    @Test
    public void testXML() throws Exception {
        Parser p = new MockParser(OVER_DEFAULT);
        //must be XML here; using HTML would silently re-run testHTML because
        //the instanceof ToXMLContentHandler check below also accepts subclasses
        BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.HANDLER_TYPE.XML;
        ContentHandler handler = new BasicContentHandlerFactory(type, -1).createHandler();

        assertTrue(handler instanceof ToXMLContentHandler);
        p.parse(null, handler, null, null);
        String extracted = handler.toString();
        assertContains("<head><title>This is the title", extracted);
        assertContains("aaaaaaaaaa", extracted);
        assertTrue(extracted.length() > 110000);

        //now test write limit: only the first 5 chars of the title should get through
        p = new MockParser(10);
        handler = new BasicContentHandlerFactory(type, 5).createHandler();
        assertTrue(handler instanceof WriteOutContentHandler);
        assertWriteLimitReached(p, handler);
        extracted = handler.toString();
        assertContains("This ", extracted);
        assertNotContains("aaaa", extracted);

        //now test outputstream call
        p = new MockParser(OVER_DEFAULT);
        ByteArrayOutputStream os = new ByteArrayOutputStream();
        handler = new BasicContentHandlerFactory(type, -1).createHandler(os, UTF_8);
        assertTrue(handler instanceof ToXMLContentHandler);
        p.parse(null, handler, null, null);

        assertContains("This is the title", os.toByteArray());
        assertContains("aaaaaaaaaa", os.toByteArray());
        assertContains("<body", os.toByteArray());
        assertContains("<html", os.toByteArray());
        assertTrue(os.toByteArray().length > 110000);

        p = new MockParser(10);
        os = new ByteArrayOutputStream();
        handler = new BasicContentHandlerFactory(type, 5).createHandler(os, UTF_8);
        assertTrue(handler instanceof WriteOutContentHandler);
        assertWriteLimitReached(p, handler);
        //same as for text: nothing is written to the stream when the limit is hit
        assertEquals(0, os.toByteArray().length);
    }

    @Test
    public void testBody() throws Exception {
        Parser p = new MockParser(OVER_DEFAULT);
        BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.HANDLER_TYPE.BODY;
        ContentHandler handler = new BasicContentHandlerFactory(type, -1).createHandler();

        assertTrue(handler instanceof BodyContentHandler);

        p.parse(null, handler, null, null);
        String extracted = handler.toString();
        assertNotContains("title", extracted);
        assertContains("aaaaaaaaaa", extracted);
        assertTrue(extracted.length() > 110000);

        //now test write limit: the title is not body content, so the
        //5 chars that get through are expected to come from the body
        p = new MockParser(10);
        handler = new BasicContentHandlerFactory(type, 5).createHandler();
        assertTrue(handler instanceof BodyContentHandler);
        assertWriteLimitReached(p, handler);
        extracted = handler.toString();
        assertNotContains("This ", extracted);
        assertContains("aaaa", extracted);

        //now test outputstream call
        p = new MockParser(OVER_DEFAULT);
        ByteArrayOutputStream os = new ByteArrayOutputStream();
        handler = new BasicContentHandlerFactory(type, -1).createHandler(os, UTF_8);
        assertTrue(handler instanceof BodyContentHandler);
        p.parse(null, handler, null, null);
        assertNotContains("title", os.toByteArray());
        assertContains("aaaaaaaaaa", os.toByteArray());
        assertNotContains("<body", os.toByteArray());
        assertNotContains("<html", os.toByteArray());
        assertTrue(os.toByteArray().length > 110000);

        p = new MockParser(10);
        os = new ByteArrayOutputStream();
        handler = new BasicContentHandlerFactory(type, 5).createHandler(os, UTF_8);
        assertTrue(handler instanceof WriteOutContentHandler);
        assertWriteLimitReached(p, handler);
        assertEquals(0, os.toByteArray().length);
    }

    /**
     * Runs {@code p} against {@code handler} and asserts that parsing is aborted
     * by a write-limit-reached {@link SAXException}.
     * <p>
     * Any other {@link SAXException} is rethrown unchanged so that unrelated
     * failures are not reported as a missing write limit.
     *
     * @param p       parser to run; it is invoked with null stream, metadata and context
     * @param handler handler expected to enforce a write limit
     * @throws Exception if parsing fails for a reason other than the write limit
     */
    private void assertWriteLimitReached(Parser p, ContentHandler handler) throws Exception {
        boolean wlr = false;
        try {
            p.parse(null, handler, null, null);
        } catch (SAXException e) {
            if (!WriteLimitReachedException.isWriteLimitReached(e)) {
                throw e;
            }
            wlr = true;
        }
        assertTrue(wlr, "WriteLimitReached");
    }

    /**
     * Simple mock parser that emits an XHTML document consisting of a title
     * ("This is the title") and a body of {@code charsToWrite} 'a' characters.
     * The stream, metadata and parse context arguments are ignored.
     */
    private static class MockParser implements Parser {
        private static final String XHTML = "http://www.w3.org/1999/xhtml";
        private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
        private static final char[] TITLE = "This is the title".toCharArray();

        //number of 'a' characters written into the body
        private final int charsToWrite;

        public MockParser(int charsToWrite) {
            this.charsToWrite = charsToWrite;
        }

        @Override
        public Set<MediaType> getSupportedTypes(ParseContext context) {
            //not used by these tests
            return null;
        }

        @Override
        public void parse(TikaInputStream stream, ContentHandler handler, Metadata metadata,
                          ParseContext context) throws IOException, SAXException, TikaException {
            handler.startDocument();
            handler.startPrefixMapping("", XHTML);
            handler.startElement(XHTML, "html", "html", EMPTY_ATTRIBUTES);
            handler.startElement(XHTML, "head", "head", EMPTY_ATTRIBUTES);
            handler.startElement(XHTML, "title", "title", EMPTY_ATTRIBUTES);
            handler.characters(TITLE, 0, TITLE.length);
            handler.endElement(XHTML, "title", "title");

            handler.endElement(XHTML, "head", "head");
            handler.startElement(XHTML, "body", "body", EMPTY_ATTRIBUTES);
            char[] body = new char[charsToWrite];
            for (int i = 0; i < charsToWrite; i++) {
                body[i] = 'a';
            }
            //the whole body is delivered in a single characters() call
            handler.characters(body, 0, body.length);
            handler.endElement(XHTML, "body", "body");
            handler.endElement(XHTML, "html", "html");
            handler.endDocument();
        }
    }
}