// BoilerpipeHandlerTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.sax;

import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.StringWriter;
import java.io.Writer;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;

import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.xml.sax.ContentHandler;

import org.apache.tika.TikaTest;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.JSoupParser;
import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler;
import org.apache.tika.utils.XMLReaderUtils;

/**
 * Tests for {@link BoilerpipeContentHandler} driven by {@link JSoupParser} over the HTML
 * fixtures under {@code /test-documents}.
 */
public class BoilerpipeHandlerTest extends TikaTest {
    /**
     * Test case for TIKA-420
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-420">TIKA-420</a>
     */
    @Test
    public void testBoilerplateRemoval() throws Exception {
        BodyContentHandler handler = new BodyContentHandler();
        parseResource("/test-documents/boilerplate.html", new BoilerpipeContentHandler(handler));

        String content = handler.toString();
        assertTrue(content.startsWith("This is the real meat"));
        assertTrue(content.endsWith("This is the end of the text.\n"));
        assertFalse(content.contains("boilerplate"));
        assertFalse(content.contains("footer"));
    }

    /**
     * Test case for TIKA-564. Support returning markup from BoilerpipeContentHandler.
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-564">TIKA-564</a>
     */
    @Disabled("not clear why this doesn't work with jsoup")
    @Test
    public void testBoilerplateWithMarkup() throws Exception {
        StringWriter sw = new StringWriter();
        BoilerpipeContentHandler bpch = new BoilerpipeContentHandler(makeHtmlTransformer(sw));
        bpch.setIncludeMarkup(true);

        parseResource("/test-documents/boilerplate.html", bpch);

        String content = sw.toString();
        assertTrue(content.contains("<body><table><tr><td><table><tr><td>"),
                "Has empty table elements");
        assertTrue(content.contains("<a shape=\"rect\" href=\"Main.php\"/>"), "Has empty a element");
        assertTrue(content.contains("<p>This is the real meat"), "Has real content");
        assertTrue(content.endsWith("</p></body></html>"), "Ends with appropriate HTML");
        assertFalse(content.contains("boilerplate"));
        assertFalse(content.contains("footer"));
    }

    /**
     * Test case for TIKA-961
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-961">TIKA-961</a>
     */
    @Test
    public void testBoilerplateWhitespace() throws Exception {
        BodyContentHandler handler = new BodyContentHandler();
        BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(handler);
        bpHandler.setIncludeMarkup(true);

        parseResource("/test-documents/boilerplate-whitespace.html", bpHandler);

        String content = handler.toString();

        // Should not contain item_aitem_b
        assertFalse(content.contains("item_aitem_b"));

        // Should contain the two list items with a newline in between.
        assertContains("item_a\nitem_b", content);

        // Should contain 有什么需要我帮你的 (can i help you) without whitespace.
        // The expected text is spelled with Unicode escapes so that the assertion does not
        // depend on the encoding this source file is stored or compiled with.
        assertContains("\u6709\u4ec0\u4e48\u9700\u8981\u6211\u5e2e\u4f60\u7684", content);
    }

    /**
     * Test case for TIKA-2683
     *
     * @see <a href="https://issues.apache.org/jira/projects/TIKA/issues/TIKA-2683">TIKA-2683</a>
     */
    @Test
    public void testBoilerplateMissingWhitespace() throws Exception {
        BodyContentHandler handler = new BodyContentHandler();
        BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(handler);
        bpHandler.setIncludeMarkup(true);

        parseResource("/test-documents/testBoilerplateMissingSpace.html", bpHandler);

        String content = handler.toString();

        // Should contain space between these two words as mentioned in HTML
        assertContains("family Psychrolutidae", content);

        // Shouldn't add new-line chars around brackets; this is not how the HTML looks
        assertContains("(Psychrolutes marcidus)", content);
    }

    /**
     * Parses a classpath test document with a fresh {@link JSoupParser}, sending the SAX
     * events to the given handler. The input stream is closed when parsing finishes,
     * whether it succeeds or throws.
     *
     * @param path    classpath location of the HTML document to parse
     * @param handler receiver of the SAX events emitted by the parser
     * @throws Exception if the resource cannot be read or parsing fails
     */
    private void parseResource(String path, ContentHandler handler) throws Exception {
        try (TikaInputStream stream = TikaInputStream.get(getResourceAsStream(path))) {
            new JSoupParser().parse(stream, handler, new Metadata(), new ParseContext());
        }
    }

    /**
     * Create ContentHandler that transforms SAX events into textual HTML output,
     * and writes it out to {@code writer} - typically this is a StringWriter.
     *
     * @param writer Where to write resulting HTML text.
     * @return ContentHandler suitable for passing to parse() methods.
     * @throws Exception if the transformer handler cannot be created
     */
    private ContentHandler makeHtmlTransformer(Writer writer) throws Exception {
        SAXTransformerFactory factory = XMLReaderUtils.getSAXTransformerFactory();
        TransformerHandler handler = factory.newTransformerHandler();
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
        handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "utf-8");
        handler.setResult(new StreamResult(writer));
        return handler;
    }
}