ODFParserTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.odf;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import org.junit.jupiter.api.Test;
import org.xml.sax.ContentHandler;
import org.xml.sax.helpers.DefaultHandler;

import org.apache.tika.TikaTest;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.utils.XMLReaderUtils;

public class ODFParserTest extends TikaTest {
    /**
     * For now, allow us to run some tests against both
     * the old and the new parser
     */
    private Parser[] getParsers() {
        return new Parser[]{new OpenDocumentParser()};
    }


    @Test
    public void testOO3() throws Exception {
        for (Parser parser : getParsers()) {
            try (TikaInputStream tis = getResourceAsStream("/test-documents/testODFwithOOo3.odt")) {
                Metadata metadata = new Metadata();
                ContentHandler handler = new BodyContentHandler();
                parser.parse(tis, handler, metadata, new ParseContext());

                assertEquals("application/vnd.oasis.opendocument.text",
                        metadata.get(Metadata.CONTENT_TYPE));

                String content = handler.toString();
                assertContains("Tika is part of the Lucene project.", content);
                assertContains("Solr", content);
                assertContains("one embedded", content);
                assertContains("Rectangle Title", content);
                assertContains("a blue background and dark border", content);
            }
        }
    }

    @Test
    public void testOO2() throws Exception {
        for (Parser parser : getParsers()) {
            try (TikaInputStream tis = getResourceAsStream("/test-documents/testOpenOffice2.odt")) {
                Metadata metadata = new Metadata();
                ContentHandler handler = new BodyContentHandler();
                parser.parse(tis, handler, metadata, new ParseContext());

                assertEquals("application/vnd.oasis.opendocument.text",
                        metadata.get(Metadata.CONTENT_TYPE));
                assertEquals("en-US", metadata.get(TikaCoreProperties.LANGUAGE));
                assertEquals("PT1M7S", metadata.get(OfficeOpenXMLExtended.TOTAL_TIME));
                assertEquals("NeoOffice/2.2$Unix OpenOffice.org_project/680m18$Build-9161",
                        metadata.get("generator"));

                // Check date metadata, both old-style and new-style
                assertEquals("2007-09-14T11:07:10", metadata.get(TikaCoreProperties.MODIFIED));
                assertEquals("2007-09-14T11:06:08", metadata.get(TikaCoreProperties.CREATED));

                // Check the document statistics
                assertEquals("1", metadata.get(Office.PAGE_COUNT));
                assertEquals("1", metadata.get(Office.PARAGRAPH_COUNT));
                assertEquals("14", metadata.get(Office.WORD_COUNT));
                assertEquals("78", metadata.get(Office.CHARACTER_COUNT));
                assertEquals("0", metadata.get(Office.TABLE_COUNT));
                assertEquals("0", metadata.get(Office.OBJECT_COUNT));
                assertEquals("0", metadata.get(Office.IMAGE_COUNT));

                // Custom metadata tags present but without values
                assertEquals(null, metadata.get("custom:Info 1"));
                assertEquals(null, metadata.get("custom:Info 2"));
                assertEquals(null, metadata.get("custom:Info 3"));
                assertEquals(null, metadata.get("custom:Info 4"));

                assertEquals("1.0", metadata.get(OpenDocumentMetaParser.ODF_VERSION_KEY));

                String content = handler.toString();
                assertTrue(content.contains("This is a sample Open Office document," +
                        " written in NeoOffice 2.2.1 for the Mac."));
            }
        }
    }

    /**
     * Similar to {@link #testOO2()}, but using a different
     * OO2 file with different metadata in it
     */
    @Test
    public void testOO2Metadata() throws Exception {
        try (TikaInputStream tis = getResourceAsStream("/test-documents/testOpenOffice2.odf")) {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            new OpenDocumentParser().parse(tis, handler, metadata, new ParseContext());

            assertEquals("application/vnd.oasis.opendocument.formula",
                    metadata.get(Metadata.CONTENT_TYPE));
            assertEquals(null, metadata.get(TikaCoreProperties.MODIFIED));
            assertEquals("2006-01-27T11:55:22", metadata.get(TikaCoreProperties.CREATED));
            assertEquals("The quick brown fox jumps over the lazy dog",
                    metadata.get(TikaCoreProperties.TITLE));
            assertEquals("Gym class featuring a brown fox and lazy dog",
                    metadata.get(OfficeOpenXMLCore.SUBJECT));
            assertContains("Gym class featuring a brown fox and lazy dog",
                    Arrays.asList(metadata.getValues(TikaCoreProperties.SUBJECT)));
            assertEquals("PT0S", metadata.get(OfficeOpenXMLExtended.TOTAL_TIME));
            assertEquals("1", metadata.get("editing-cycles"));
            assertEquals("OpenOffice.org/2.2$Win32 OpenOffice.org_project/680m14$Build-9134",
                    metadata.get("generator"));
            assertEquals("Pangram, fox, dog", metadata.get(TikaCoreProperties.SUBJECT));

            // User defined metadata
            assertEquals("Text 1", metadata.get("custom:Info 1"));
            assertEquals("2", metadata.get("custom:Info 2"));
            assertEquals("false", metadata.get("custom:Info 3"));
            assertEquals("true", metadata.get("custom:Info 4"));

            // No statistics present
            assertEquals(null, metadata.get(Office.PAGE_COUNT));
            assertEquals(null, metadata.get(Office.PARAGRAPH_COUNT));
            assertEquals(null, metadata.get(Office.WORD_COUNT));
            assertEquals(null, metadata.get(Office.CHARACTER_COUNT));
            assertEquals(null, metadata.get(Office.TABLE_COUNT));
            assertEquals(null, metadata.get(Office.OBJECT_COUNT));
            assertEquals(null, metadata.get(Office.IMAGE_COUNT));
            assertEquals(null, metadata.get("nbTab"));
            assertEquals(null, metadata.get("nbObject"));
            assertEquals(null, metadata.get("nbImg"));
            assertEquals(null, metadata.get("nbPage"));
            assertEquals(null, metadata.get("nbPara"));
            assertEquals(null, metadata.get("nbWord"));
            assertEquals(null, metadata.get("nbCharacter"));
            assertEquals("1.0", metadata.get(OpenDocumentMetaParser.ODF_VERSION_KEY));


            // Note - contents of maths files not currently supported
            String content = handler.toString().trim();
            assertEquals("", content.trim());
        }
    }

    /**
     * Similar to {@link #testOO2()} )}, but using an OO3 file
     */
    @Test
    public void testOO3Metadata() throws Exception {
        try (TikaInputStream tis = getResourceAsStream("/test-documents/testODFwithOOo3.odt")) {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            new OpenDocumentParser().parse(tis, handler, metadata, new ParseContext());

            assertEquals("application/vnd.oasis.opendocument.text",
                    metadata.get(Metadata.CONTENT_TYPE));
            assertEquals("2009-10-05T21:22:38", metadata.get(TikaCoreProperties.MODIFIED));
            assertEquals("2009-10-05T19:04:01", metadata.get(TikaCoreProperties.CREATED));
            assertEquals("2009-10-05T19:04:01", metadata.get(TikaCoreProperties.CREATED));
            assertEquals("Apache Tika", metadata.get(TikaCoreProperties.TITLE));
            assertEquals("Test document", metadata.get(OfficeOpenXMLCore.SUBJECT));
            assertContains("Test document",
                    Arrays.asList(metadata.getValues(TikaCoreProperties.SUBJECT)));
            assertEquals("A rather complex document", metadata.get(TikaCoreProperties.DESCRIPTION));
            assertEquals("Bart Hanssens", metadata.get(TikaCoreProperties.CREATOR));
            assertEquals("2", metadata.get("editing-cycles"));
            assertEquals("PT02H03M24S", metadata.get(OfficeOpenXMLExtended.TOTAL_TIME));
            assertEquals("OpenOffice.org/3.1$Unix OpenOffice.org_project/310m19$Build-9420",
                    metadata.get("generator"));
            assertEquals("Apache, Lucene, Tika", metadata.get(TikaCoreProperties.SUBJECT));

            // User defined metadata
            assertEquals("Bart Hanssens", metadata.get("custom:Editor"));
            assertEquals(null, metadata.get("custom:Info 2"));
            assertEquals(null, metadata.get("custom:Info 3"));
            assertEquals(null, metadata.get("custom:Info 4"));

            // Check the document statistics
            assertEquals("2", metadata.get(Office.PAGE_COUNT));
            assertEquals("13", metadata.get(Office.PARAGRAPH_COUNT));
            assertEquals("54", metadata.get(Office.WORD_COUNT));
            assertEquals("351", metadata.get(Office.CHARACTER_COUNT));
            assertEquals("0", metadata.get(Office.TABLE_COUNT));
            assertEquals("2", metadata.get(Office.OBJECT_COUNT));
            assertEquals("0", metadata.get(Office.IMAGE_COUNT));
            assertEquals("1.1", metadata.get(OpenDocumentMetaParser.ODF_VERSION_KEY));

            String content = handler.toString();
            assertTrue(content.contains("Apache Tika Tika is part of the Lucene project."));
        }
    }

    @Test
    public void testODPMasterFooter() throws Exception {
        try (TikaInputStream tis = getResourceAsStream("/test-documents/testMasterFooter.odp")) {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            AUTO_DETECT_PARSER.parse(tis, handler, metadata, new ParseContext());

            String content = handler.toString();
            assertContains("Master footer is here", content);
        }
    }

    @Test
    public void testODTFooter() throws Exception {
        try (TikaInputStream tis = getResourceAsStream("/test-documents/testFooter.odt")) {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            AUTO_DETECT_PARSER.parse(tis, handler, metadata, new ParseContext());

            String content = handler.toString();
            assertContains("Here is some text...", content);
            assertContains("Here is some text on page 2", content);
            assertContains("Here is footer text", content);
        }
    }

    @Test
    public void testODSFooter() throws Exception {
        try (TikaInputStream tis = getResourceAsStream("/test-documents/testFooter.ods")) {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            AUTO_DETECT_PARSER.parse(tis, handler, metadata, new ParseContext());

            String content = handler.toString();
            assertContains("Here is a footer in the center area", content);
        }
    }

    @Test
    public void testFromFile() throws Exception {
        try (TikaInputStream tis = TikaInputStream
                .get(getResourceAsUrl("/test-documents/testODFwithOOo3.odt"))) {
            assertEquals(true, tis.hasFile());
            OpenDocumentParser parser = new OpenDocumentParser();
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            parser.parse(tis, handler, metadata, new ParseContext());

            assertEquals("application/vnd.oasis.opendocument.text",
                    metadata.get(Metadata.CONTENT_TYPE));
            assertEquals("1.1", metadata.get(OpenDocumentMetaParser.ODF_VERSION_KEY));

            String content = handler.toString();
            assertContains("Tika is part of the Lucene project.", content);
        }
    }

    @Test
    public void testNPEFromFile() throws Exception {
        OpenDocumentParser parser = new OpenDocumentParser();
        try (TikaInputStream tis = TikaInputStream
                .get(getResourceAsUrl("/test-documents/testNPEOpenDocument.odt"))) {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            parser.parse(tis, handler, metadata, new ParseContext());

            assertEquals("application/vnd.oasis.opendocument.text",
                    metadata.get(Metadata.CONTENT_TYPE));

            String content = handler.toString();
            assertContains("primero hay que generar un par de claves", content);
        }
    }

    // TIKA-1063: Test basic style support.
    @Test
    public void testODTStyles() throws Exception {
        String xml = getXML("testStyles.odt").xml;
        assertContains("This <i>is</i> <b>just</b> a <u>test</u>", xml);
        assertContains("<p>And <b>another <i>test</i> is</b> here.</p>", xml);
        assertContains("<ol>\t<li><p>One</p>", xml);
        assertContains("</ol>", xml);
        assertContains("<ul>\t<li><p>First</p>", xml);
        assertContains("</ul>", xml);
    }

    //TIKA-1600: Test that null pointer doesn't break parsing.
    @Test
    public void testNullStylesInODTFooter() throws Exception {
        Parser parser = new OpenDocumentParser();
        try (TikaInputStream tis = getResourceAsStream("/test-documents/testODT-TIKA-6000.odt")) {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            parser.parse(tis, handler, metadata, getNonRecursingParseContext());

            assertEquals("application/vnd.oasis.opendocument.text",
                    metadata.get(Metadata.CONTENT_TYPE));

            String content = handler.toString();

            assertContains("Utilisation de ce document", content);
            assertContains("Copyright and License", content);
            assertContains("Changer la langue", content);
            assertContains("La page d���accueil permet de faire une recherche simple", content);
        }
    }

    @Test  //TIKA-1916
    public void testMissingMeta() throws Exception {
        String xml = getXML("testODTNoMeta.odt").xml;
        assertContains("Test text", xml);
    }

    @Test //TIKA-2242
    public void testParagraphLevelFontStyles() throws Exception {
        String xml = getXML("testODTStyles2.odt", getNonRecursingParseContext()).xml;
        //test text span font-style properties
        assertContains("<p><b>name</b>, advocaat", xml);
        //test paragraph's font-style properties
        assertContains("<p><b>Publicatie Onbekwaamverklaring", xml);
    }

    @Test //TIKA-2242
    public void testAnnotationsAndPDepthGt1() throws Exception {
        //not allowed in html: <p> <annotation> <p> this is an annotation </p> </annotation> </p>
        String xml = getXML("testODTStyles3.odt").xml;
        assertContains(
                "<p><b>WOUTERS Rolf</b><p class=\"annotation\"> Beschermde persoon is " +
                        "overleden </p>",
                xml);
    }

    @Test
    public void testEmbedded() throws Exception {
        List<Metadata> metadataList = getRecursiveMetadata("testODTEmbedded.odt");
        assertEquals(3, metadataList.size());
    }

    @Test
    public void testEmbeddedImageAndLink() throws Exception {
        String xml = getXML("testODTEmbeddedImageLink.odt").xml;
        assertContains("<a href=\"https://tika.apache.org/\">" +
                "<img src=\"embedded:Pictures/10000201000001240000006457F5B1D1243E0671.png\" />" +
                "<span>Visit Tika</span></a>", xml);
    }

    @Test
    public void testInvalidFromStream() throws Exception {
        try (TikaInputStream tis = TikaInputStream.get(getResourceAsUrl("/test-documents/testODTnotaZipFile.odt"))) {
            OpenDocumentParser parser = new OpenDocumentParser();
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            assertThrows(IOException.class, () -> {
                parser.parse(tis, handler, metadata, new ParseContext());
            });
        }
    }

    @Test
    public void testInvalidFromFile() throws Exception {
        try (TikaInputStream tis = TikaInputStream
                .get(getResourceAsUrl("/test-documents/testODTnotaZipFile.odt"))) {
            OpenDocumentParser parser = new OpenDocumentParser();
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            assertThrows(IOException.class, () -> {
                parser.parse(tis, handler, metadata, new ParseContext());
            });
        }
    }

    @Test
    public void testEncryptedODTFile() throws Exception {
        //the password to this file is "tika"
        Path p =
                Paths.get(
                        ODFParserTest.class.getResource(
                                "/test-documents/testODTEncrypted.odt").toURI());
        assertThrows(EncryptedDocumentException.class, () -> {
            getRecursiveMetadata(p, false);
        });

        assertThrows(EncryptedDocumentException.class, () -> {
            try (TikaInputStream tis = TikaInputStream.get(p)) {
                getRecursiveMetadata(tis, false);
            }
        });

        List<Metadata> metadataList = getRecursiveMetadata(p, true);
        assertEquals("true", metadataList.get(0).get(TikaCoreProperties.IS_ENCRYPTED));
    }

    //this, of course, should throw an EncryptedDocumentException
    //but the file can't be read by Java's ZipInputStream or
    //by commons compress, unless you enable descriptors.
    //https://issues.apache.org/jira/browse/ODFTOOLKIT-402
    @Test
    public void testEncryptedODTStream() throws Exception {
        try (TikaInputStream tis = TikaInputStream.get(
                ODFParserTest.class.getResourceAsStream("/test-documents/testODTEncrypted.odt"))) {
            assertThrows(TikaException.class, () -> {
                getRecursiveMetadata(tis, false);
            });
        }
    }

    private ParseContext getNonRecursingParseContext() {
        ParseContext parseContext = new ParseContext();
        parseContext.set(Parser.class, new EmptyParser());
        return parseContext;
    }

    @Test
    public void testMultiThreaded() throws Exception {
        int numThreads = 10;
        ExecutorService executorService = Executors.newFixedThreadPool(numThreads);
        ExecutorCompletionService<Integer> executorCompletionService =
                new ExecutorCompletionService<>(executorService);

        for (int i = 0; i < numThreads; i++) {
            executorCompletionService.submit(() -> {
                for (int i1 = 0; i1 < 10; i1++) {
                    List<Metadata> metadataList = getRecursiveMetadata("testODTEmbedded.odt");
                    assertEquals(3, metadataList.size());
                    assertEquals("THUMBNAIL",
                            metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
                }
                return 1;
            });
        }

        try {
            int finished = 0;
            while (finished < numThreads) {
                Future<Integer> future = executorCompletionService.take();
                future.get();
                finished++;
            }
        } finally {
            executorService.shutdownNow();
        }
    }

    @Test
    public void testODTXHTMLIsParseable() throws Exception {
        //for all OpenDocument files, make sure that the
        //output from the parse is parseable xhtml
        int filesTested = 0;
        for (Path p : getAllTestFiles()) {
            String fileName = p.getFileName().toString();
            if (fileName.endsWith(".odt") || fileName.endsWith("odp") || fileName.endsWith("odf") ||
                    fileName.endsWith(".ods")) {

                XMLResult xmlResult = null;
                try (TikaInputStream tis = TikaInputStream.get(p)) {
                    xmlResult = getXML(tis, AUTO_DETECT_PARSER, new Metadata());
                } catch (Exception e) {
                    continue;
                }
                try {
                    //just make sure this doesn't throw any exceptions
                    XMLReaderUtils.parseSAX(TikaInputStream.get(xmlResult.xml.getBytes(StandardCharsets.UTF_8)),
                            new DefaultHandler(), new ParseContext());
                    filesTested++;
                } catch (Exception e) {
                    fail(p.getFileName().toString(), e);
                }
            }
        }
        assertTrue(filesTested > 10);
    }

    @Test
    public void testVersions() throws Exception {
        //test at least that all files from
        // https://github.com/openpreserve/format-corpus/tree/master/office-examples/LibreOffice7-ODF-1.3
        //pass as 1.3.  Note that we don't currently parse base files, so skip that one.
        for (String name : new String[]{
                //"LibreOfficeBase_odb_1.3.odb",
                "LibreOfficeCalc_ods_1.3.ods",
                "LibreOfficeDraw_odg_1.3.odg",
                "LibreOfficeImpress_odp_1.3.odp",
                "LibreOfficeWriter_odt_1.3.odt",
        }) {
            List<Metadata> metadataList = getRecursiveMetadata("/versions/" + name);
            Metadata metadata = metadataList.get(0);
            assertEquals("1.3", metadata.get(OpenDocumentMetaParser.ODF_VERSION_KEY), "failed on " + name);
        }
    }
}