EpubParserTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.epub;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;

import org.apache.tika.TikaTest;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Epub;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.Parser;

/**
 * Tests for {@link EpubParser}: extracted metadata, XHTML content, ordering of the
 * extracted content, handling of truncated input and multi-valued metadata.
 *
 * <p>All checks use JUnit assertions rather than the Java {@code assert} keyword so that
 * they are evaluated regardless of whether the JVM was started with {@code -ea}.
 */
public class EpubParserTest extends TikaTest {

    /**
     * Parses {@code testEPUB.epub} to XML and checks the basic metadata, the presence of
     * expected text, the absence of style/script content, and that the output contains
     * exactly one {@code html}, {@code head} and {@code body} element.
     */
    @Test
    public void testXMLParser() throws Exception {

        XMLResult xmlResult = getXML("testEPUB.epub");
        assertEquals("2.0", xmlResult.metadata.get(Epub.VERSION));
        assertEquals("application/epub+zip", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("en", xmlResult.metadata.get(TikaCoreProperties.LANGUAGE));
        assertEquals("This is an ePub test publication for Tika.",
                xmlResult.metadata.get(TikaCoreProperties.DESCRIPTION));
        assertEquals("Apache", xmlResult.metadata.get(TikaCoreProperties.PUBLISHER));

        String content = xmlResult.xml;
        assertContains("Plus a simple div", content);
        assertContains("First item", content);
        assertContains("The previous headings were <strong>subchapters</strong>", content);
        assertContains("Table data", content);
        assertContains("This is the text for chapter Two", content);

        //make sure style/script elements aren't extracted
        assertNotContained("nothing to see here", content);
        assertNotContained("nor here", content);
        assertNotContained("font-style", content);

        //make sure that there is only one of each
        assertContainsCount("<html", content, 1);
        assertContainsCount("<head", content, 1);
        assertContainsCount("<body", content, 1);
    }

    /**
     * Checks that {@code testEPUB.epub} yields two metadata objects (the container and one
     * JPEG attachment) and that, in the container's content, the table of contents comes
     * before chapter 1, which comes before chapter 2.
     */
    @Test
    public void testEpubOrder() throws Exception {
        List<Metadata> metadataList = getRecursiveMetadata("testEPUB.epub");
        assertEquals("2.0", metadataList.get(0).get(Epub.VERSION));

        //test attachments
        assertEquals(2, metadataList.size());
        assertEquals("image/jpeg", metadataList.get(1).get(Metadata.CONTENT_TYPE));
        String xml = metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT);
        int tocIndex = xml.indexOf("h3 class=\"toc_heading\">Table of Contents<");
        int ch1 = xml.indexOf("<h1>Chapter 1");
        int ch2 = xml.indexOf("<h1>Chapter 2");
        //all three markers must be present before their relative order means anything
        assertTrue(tocIndex > -1 && ch1 > -1 && ch2 > -1,
                "table of contents, chapter 1 and chapter 2 should all be present");
        assertTrue(tocIndex < ch1, "table of contents should precede chapter 1");
        assertTrue(tocIndex < ch2, "table of contents should precede chapter 2");
        assertTrue(ch1 < ch2, "chapter 1 should precede chapter 2");
        //remove streaming test
    }


    /**
     * Parses a copy of {@code testEPUB.epub} truncated via {@code truncate(..., 10000)}
     * and checks the relative position of the chapter headings in the extracted content.
     */
    @Test
    public void testTruncated() throws Exception {
        Parser p = new EpubParser();
        List<Metadata> metadataList;
        try (TikaInputStream tis = truncate("testEPUB.epub", 10000)) {
            metadataList = getRecursiveMetadata(tis, p, true);
        }
        String xml = metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT);
        int ch1 = xml.indexOf("<h1>Chapter 1");
        int ch2 = xml.indexOf("<h1>Chapter 2");
        //NOTE(review): this also holds when chapter 1 is missing (-1) but chapter 2 is
        //found; the original check had the same semantics -- confirm whether the test
        //should additionally require ch1 > -1
        assertTrue(ch1 < ch2, "chapter 1 should precede chapter 2");
    }

    /**
     * Checks that an epub whose content documents use XML extensions is parsed and that
     * its text is extracted (TIKA-2310).
     */
    @Test
    public void testContentsWXMLExtensions() throws Exception {
        //TIKA-2310
        List<Metadata> metadataList = getRecursiveMetadata("testEPUB_xml_ext.epub");
        assertEquals(1, metadataList.size());
        assertEquals("2.0", metadataList.get(0).get(Epub.VERSION));
        assertContains("It was a bright cold day in April",
                metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
    }

    /**
     * Checks that the pre-paginated rendition layout is reported in the metadata.
     * Disabled because the sample file is not in the repository.
     */
    @Test
    @Disabled("add files to repo?")
    public void testPrePaginated() throws Exception {
        //this file has pre-paginated on an itemRef in a spine
        //https://github.com/IDPF/epub3-samples/releases/download/20170606/cole-voyage-of-life.epub

        //this file has pre-paginated in header metadata
        //https://github.com/IDPF/epub3-samples/releases/download/20170606/cole-voyage-of-life-tol.epub

        List<Metadata> metadataList = getRecursiveMetadata("cole-voyage-of-life.epub");
        assertEquals("pre-paginated", metadataList.get(0).get(Epub.RENDITION_LAYOUT));
    }

    /**
     * Checks that every value of a repeated metadata element is retained (TIKA-4466).
     * Most properties are compared as sets, i.e. ignoring order; the rights values are
     * checked positionally.
     */
    @Test
    public void testMultipleMetadataValues() throws Exception {
        //TIKA-4466
        List<Metadata> metadataList = getRecursiveMetadata("testEPUB_multi-metadata-vals.epub");
        Set<String> publishers = Set.of("Standard Ebooks", "Guternberg");
        Set<String> titles = Set.of("The Inheritors", "An Extravagant Story", "The Inheritors: An Extravagant Story");
        Set<String> contributors = Set.of("The League of Moveable Type", "zikasak", "William Holyoake", "Clare Boothby",
                "Graeme Mackreth", "Distributed Proofreaders", "Szymon Szott", "David Reimer");
        Set<String> creators = Set.of("Joseph Conrad", "Ford Madox Ford");
        Set<String> languages = Set.of("en-GB", "en-US");
        Set<String> descriptions = Set.of("A young writer dabbling in journalism meets a strange, otherworldly woman with long-term political goals.",
                "additional description");
        Set<String> sources = Set.of("https://www.gutenberg.org/ebooks/14888", "https://archive.org/details/inheritorsanext01fordgoog/");
        Set<String> identifiers = Set.of("https://standardebooks.org/ebooks/joseph-conrad_ford-madox-ford/the-inheritors",
                "isbn:0571225470");
        Set<String> subjects = Set.of("Science fiction");

        Metadata m = metadataList.get(0);
        assertEquals(publishers, set(m, TikaCoreProperties.PUBLISHER));
        assertEquals(titles, set(m, TikaCoreProperties.TITLE));
        assertEquals(contributors, set(m, TikaCoreProperties.CONTRIBUTOR));
        assertEquals(creators, set(m, TikaCoreProperties.CREATOR));
        assertEquals(languages, set(m, TikaCoreProperties.LANGUAGE));
        assertEquals(descriptions, set(m, TikaCoreProperties.DESCRIPTION));
        assertEquals(sources, set(m, TikaCoreProperties.SOURCE));
        assertEquals(identifiers, set(m, TikaCoreProperties.IDENTIFIER));
        assertEquals(subjects, set(m, TikaCoreProperties.SUBJECT));

        assertEquals(2, m.getValues(TikaCoreProperties.RIGHTS).length);
        assertTrue(m.get(TikaCoreProperties.RIGHTS).startsWith("The source text and artwork"));
        assertEquals("test rights", m.getValues(TikaCoreProperties.RIGHTS)[1]);
    }

    /**
     * Collects all values of {@code property} in {@code m} into a set so they can be
     * compared without regard to order.
     *
     * @param m the metadata to read from
     * @param property the property whose values are collected
     * @return a new set holding the values returned by {@code m.getValues(property)}
     */
    private static Set<String> set(Metadata m, Property property) {
        return new HashSet<>(Arrays.asList(m.getValues(property)));
    }
}