XHTMLContentHandlerTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.sax;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.util.ArrayList;
import java.util.List;

import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;

/**
 * Unit tests for the {@link XHTMLContentHandler} class.
 * <p>
 * The whitespace and control-character tests push SAX events through a shared
 * {@link XHTMLContentHandler} into a {@link BodyContentHandler} and inspect the
 * collected text. The attribute and {@link SAXOutputConfig} tests serialize
 * through a {@link ToHTMLContentHandler} and inspect the produced markup.
 */
public class XHTMLContentHandlerTest {

    /** Sink that collects the text written by {@link #xhtml}; recreated per test. */
    private ContentHandler output;

    /** Handler under test, wired to {@link #output}; recreated per test. */
    private XHTMLContentHandler xhtml;

    /**
     * Splits the given text on runs of whitespace and returns only the
     * non-empty tokens. A plain {@code split("\\s+")} yields an empty first
     * token when the text starts with whitespace (and a single empty token for
     * an empty string); those are dropped here.
     *
     * @param string some mix of whitespace and real words
     * @return the real (non-empty) words, in order of appearance
     */
    private static String[] getRealWords(String string) {
        String[] possibleWords = string.split("\\s+");
        List<String> words = new ArrayList<>(possibleWords.length);
        for (String word : possibleWords) {
            if (!word.isEmpty()) {
                words.add(word);
            }
        }

        return words.toArray(new String[0]);
    }

    /**
     * Serializes a document whose only explicitly started element is
     * {@code elementName}, carrying {@code itemscope} and {@code itemtype}
     * attributes, and returns the markup written to a
     * {@link ToHTMLContentHandler}.
     *
     * @param elementName local and qualified name of the element to start and end
     * @return the serialized output
     * @throws SAXException if the handler chain reports an error
     */
    private static String serializeWithMicrodataOn(String elementName) throws SAXException {
        ToHTMLContentHandler toHTMLContentHandler = new ToHTMLContentHandler();
        XHTMLContentHandler xhtmlContentHandler =
                new XHTMLContentHandler(toHTMLContentHandler, new Metadata());
        AttributesImpl attributes = new AttributesImpl();

        attributes.addAttribute(XHTMLContentHandler.XHTML, "itemscope", "itemscope", "", "");
        attributes.addAttribute(XHTMLContentHandler.XHTML, "itemtype", "itemtype", "",
                "http://schema.org/Event");

        xhtmlContentHandler.startDocument();
        xhtmlContentHandler.startElement(XHTMLContentHandler.XHTML, elementName, elementName,
                attributes);
        xhtmlContentHandler.endElement(elementName);
        xhtmlContentHandler.endDocument();

        return toHTMLContentHandler.toString();
    }

    /**
     * Creates the metadata fixture shared by the {@link SAXOutputConfig}
     * tests: a title of {@code "Test Title"} and an {@code author} entry of
     * {@code "Test Author"}.
     *
     * @return a fresh metadata instance
     */
    private static Metadata newTitleAndAuthorMetadata() {
        Metadata metadata = new Metadata();
        metadata.set(TikaCoreProperties.TITLE, "Test Title");
        metadata.set("author", "Test Author");
        return metadata;
    }

    /**
     * Wraps the given output configuration in a new {@link ParseContext}.
     *
     * @param config configuration to register under {@link SAXOutputConfig}
     * @return a parse context holding {@code config}
     */
    private static ParseContext contextWith(SAXOutputConfig config) {
        ParseContext context = new ParseContext();
        context.set(SAXOutputConfig.class, config);
        return context;
    }

    /**
     * Writes a document consisting of a single {@code <p>content</p>} through
     * the given handler and returns what the sink collected.
     *
     * @param handler handler under test, already wired to {@code sink}
     * @param sink    downstream handler whose {@code toString()} is the result
     * @return the sink's string representation after the document has ended
     * @throws SAXException if the handler chain reports an error
     */
    private static String writeParagraph(XHTMLContentHandler handler, ContentHandler sink)
            throws SAXException {
        handler.startDocument();
        handler.element("p", "content");
        handler.endDocument();
        return sink.toString();
    }

    @BeforeEach
    public void setUp() {
        output = new BodyContentHandler();
        xhtml = new XHTMLContentHandler(output, new Metadata());
    }

    /**
     * Writes a {@code menu} with a single {@code li} holding {@code itemText}
     * through the shared handler and asserts that the collected text consists
     * of exactly one word equal to {@code expected}.
     *
     * @param expected the single word expected in the text output
     * @param itemText the character content of the list item
     * @throws SAXException if the handler chain reports an error
     */
    private void assertSingleMenuItemText(String expected, String itemText)
            throws SAXException {
        xhtml.startDocument();
        xhtml.startElement("menu");
        xhtml.element("li", itemText);
        xhtml.endElement("menu");
        xhtml.endDocument();

        String[] words = getRealWords(output.toString());
        assertEquals(1, words.length);
        assertEquals(expected, words[0]);
    }

    /**
     * Test that content in block elements are properly separated in text
     * output.
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-188">TIKA-188</a>
     */
    @Test
    public void testExtraWhitespace() throws SAXException {
        xhtml.startDocument();

        xhtml.element("p", "foo");
        xhtml.startElement("p");
        xhtml.characters("b");
        xhtml.element("b", "a"); // inlines should not cause extra whitespace
        xhtml.characters("r");
        xhtml.endElement("p");

        xhtml.startElement("table");
        xhtml.startElement("tr");
        xhtml.element("th", "x");
        xhtml.element("th", "y");
        xhtml.endElement("tr");
        xhtml.startElement("tr");
        xhtml.element("td", "a");
        xhtml.element("td", "b");
        xhtml.endElement("tr");
        xhtml.endElement("table");
        xhtml.endDocument();

        // Use the shared helper so empty tokens never skew the word count.
        String[] words = getRealWords(output.toString());
        assertEquals(6, words.length);
        assertEquals("foo", words[0]);
        assertEquals("bar", words[1]);
        assertEquals("x", words[2]);
        assertEquals("y", words[3]);
        assertEquals("a", words[4]);
        assertEquals("b", words[5]);
    }

    /**
     * Test that content in option elements are properly separated in text
     * output.
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-394">TIKA-394</a>
     */
    @Test
    public void testWhitespaceWithOptions() throws Exception {
        xhtml.startDocument();
        xhtml.startElement("form");
        xhtml.startElement("select");
        xhtml.element("option", "opt1");
        xhtml.element("option", "opt2");
        xhtml.endElement("select");
        xhtml.endElement("form");
        xhtml.endDocument();

        // Use the shared helper so empty tokens never skew the word count.
        String[] words = getRealWords(output.toString());
        assertEquals(2, words.length);
        assertEquals("opt1", words[0]);
        assertEquals("opt2", words[1]);
    }

    /**
     * Test that list items inside a {@code menu} element are separated in
     * text output.
     */
    @Test
    public void testWhitespaceWithMenus() throws Exception {
        xhtml.startDocument();
        xhtml.startElement("menu");
        xhtml.element("li", "one");
        xhtml.element("li", "two");
        xhtml.endElement("menu");
        xhtml.endDocument();

        String[] words = getRealWords(output.toString());
        assertEquals(2, words.length);
        assertEquals("one", words[0]);
        assertEquals("two", words[1]);
    }

    /**
     * Test that an attribute passed on an explicit {@code body} start element
     * shows up in the serialized output.
     */
    @Test
    public void testAttributesOnBody() throws Exception {
        assertTrue(serializeWithMicrodataOn("body").contains("itemscope"));
    }

    /**
     * Test that an attribute passed on an explicit {@code html} start element
     * shows up in the serialized output.
     */
    @Test
    public void testAttributesOnHtml() throws Exception {
        assertTrue(serializeWithMicrodataOn("html").contains("itemscope"));
    }

    /**
     * Test that U+007F in character content comes out as U+FFFD.
     */
    @Test
    public void testInvalidControlCharacter0x7F() throws Exception {
        assertSingleMenuItemText("a\ufffdz", "a\u007Fz");
    }

    /**
     * Test that U+009F in character content comes out as U+FFFD.
     */
    @Test
    public void testInvalidControlCharacter0x9F() throws Exception {
        assertSingleMenuItemText("a\ufffdz", "a\u009Fz");
    }

    /**
     * Test that U+0093 in character content comes out as U+FFFD.
     */
    @Test
    public void testInvalidControlCharacter0x93() throws Exception {
        assertSingleMenuItemText("a\ufffdz", "a\u0093z");
    }

    /**
     * Test that, without any {@link ParseContext}, both the title and the
     * metadata entries are written to the output.
     */
    @Test
    public void testDefaultConfigIncludesMetadataAndTitle() throws Exception {
        Metadata metadata = newTitleAndAuthorMetadata();

        ToHTMLContentHandler toHTMLContentHandler = new ToHTMLContentHandler();
        XHTMLContentHandler xhtmlHandler = new XHTMLContentHandler(toHTMLContentHandler, metadata);

        String result = writeParagraph(xhtmlHandler, toHTMLContentHandler);
        assertTrue(result.contains("<title>Test Title</title>"), "Should contain title");
        assertTrue(result.contains("<meta name=\"author\" content=\"Test Author\""),
                "Should contain metadata");
    }

    /**
     * Test that {@code setWriteMetadataToHead(false)} suppresses the
     * {@code meta} entries while the title is still written.
     */
    @Test
    public void testConfigSkipMetadataInHead() throws Exception {
        Metadata metadata = newTitleAndAuthorMetadata();

        SAXOutputConfig config = new SAXOutputConfig();
        config.setWriteMetadataToHead(false);

        ToHTMLContentHandler toHTMLContentHandler = new ToHTMLContentHandler();
        XHTMLContentHandler xhtmlHandler =
                new XHTMLContentHandler(toHTMLContentHandler, metadata, contextWith(config));

        String result = writeParagraph(xhtmlHandler, toHTMLContentHandler);
        assertTrue(result.contains("<title>Test Title</title>"), "Should still contain title");
        assertFalse(result.contains("<meta name=\"author\""),
                "Should NOT contain metadata");
    }

    /**
     * Test that {@code setIncludeTitle(false)} suppresses the title while the
     * {@code meta} entries are still written.
     */
    @Test
    public void testConfigSkipTitle() throws Exception {
        Metadata metadata = newTitleAndAuthorMetadata();

        SAXOutputConfig config = new SAXOutputConfig();
        config.setIncludeTitle(false);

        ToHTMLContentHandler toHTMLContentHandler = new ToHTMLContentHandler();
        XHTMLContentHandler xhtmlHandler =
                new XHTMLContentHandler(toHTMLContentHandler, metadata, contextWith(config));

        String result = writeParagraph(xhtmlHandler, toHTMLContentHandler);
        assertFalse(result.contains("<title>"), "Should NOT contain title");
        assertTrue(result.contains("<meta name=\"author\" content=\"Test Author\""),
                "Should still contain metadata");
    }

    /**
     * Test that disabling both the title and the metadata still leaves a
     * {@code head} element and the body content in the output.
     */
    @Test
    public void testConfigSkipBothMetadataAndTitle() throws Exception {
        Metadata metadata = newTitleAndAuthorMetadata();

        SAXOutputConfig config = new SAXOutputConfig();
        config.setWriteMetadataToHead(false);
        config.setIncludeTitle(false);

        ToHTMLContentHandler toHTMLContentHandler = new ToHTMLContentHandler();
        XHTMLContentHandler xhtmlHandler =
                new XHTMLContentHandler(toHTMLContentHandler, metadata, contextWith(config));

        String result = writeParagraph(xhtmlHandler, toHTMLContentHandler);
        assertFalse(result.contains("<title>"), "Should NOT contain title");
        assertFalse(result.contains("<meta name=\"author\""),
                "Should NOT contain metadata");
        assertTrue(result.contains("<head>") && result.contains("</head>"),
                "Should still have head element");
        assertTrue(result.contains("content"), "Should have body content");
    }

    /**
     * Test that passing a {@code null} {@link ParseContext} behaves like the
     * default configuration: title and metadata are both written.
     */
    @Test
    public void testNullParseContextUsesDefaults() throws Exception {
        Metadata metadata = newTitleAndAuthorMetadata();

        ToHTMLContentHandler toHTMLContentHandler = new ToHTMLContentHandler();
        XHTMLContentHandler xhtmlHandler =
                new XHTMLContentHandler(toHTMLContentHandler, metadata, null);

        String result = writeParagraph(xhtmlHandler, toHTMLContentHandler);
        assertTrue(result.contains("<title>Test Title</title>"), "Should contain title");
        assertTrue(result.contains("<meta name=\"author\" content=\"Test Author\""),
                "Should contain metadata");
    }

}