OutlookParserTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.StringWriter;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;

import org.junit.jupiter.api.Test;
import org.xml.sax.ContentHandler;

import org.apache.tika.TikaTest;
import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.MAPI;
import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.RTFMetadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.utils.XMLReaderUtils;

/**
 * Test case for parsing Outlook files.
 * <p>
 * Each test parses one of the {@code .msg} fixtures under {@code /test-documents/}
 * and asserts on the resulting metadata and/or extracted content.
 */
public class OutlookParserTest extends TikaTest {

    /**
     * Parses {@code test-outlook.msg} and checks the core metadata, the decoded raw
     * {@code From} header, the recipient fields and the creation date. Also verifies
     * that header values are not written into the body by default, but that the
     * content starts with the title once
     * {@link OfficeParserConfig#setWriteSelectHeadersInBody(boolean)} is enabled.
     */
    @Test
    public void testOutlookParsing() throws Exception {

        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        try (TikaInputStream tis = getResourceAsStream("/test-documents/test-outlook.msg")) {
            AUTO_DETECT_PARSER.parse(tis, handler, metadata, new ParseContext());
        }
        assertEquals("application/vnd.ms-outlook", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("Microsoft Outlook Express 6", metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Nouvel utilisateur de Outlook Express",
                metadata.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
        assertEquals("L'\u00C9quipe Microsoft Outlook Express",
                metadata.get(TikaCoreProperties.CREATOR));

        //ensure that "raw" header is correctly decoded
        assertEquals("L'\u00C9quipe Microsoft Outlook Express <msoe@microsoft.com>",
                metadata.get(Metadata.MESSAGE_RAW_HEADER_PREFIX + "From"));

        assertEquals("Nouvel utilisateur de Outlook Express",
                metadata.get(Message.MESSAGE_TO_EMAIL));

        assertEquals("", metadata.get(Message.MESSAGE_TO_NAME));

        assertEquals("Nouvel utilisateur de Outlook Express",
                metadata.get(Message.MESSAGE_TO_DISPLAY_NAME));

        // Stored as Thu, 5 Apr 2007 09:26:06 -0700
        assertEquals("2007-04-05T16:26:06Z", metadata.get(TikaCoreProperties.CREATED));

        // by default, none of the header values asserted above may appear in the body
        String content = handler.toString();
        assertNotContained("Microsoft Outlook Express 6", content);
        assertNotContained("L'\u00C9quipe Microsoft Outlook Express", content);
        assertNotContained("Nouvel utilisateur de Outlook Express", content);


        //now try with inlining select headers
        ParseContext parseContext = new ParseContext();
        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
        officeParserConfig.setWriteSelectHeadersInBody(true);
        parseContext.set(OfficeParserConfig.class, officeParserConfig);
        content = getText("test-outlook.msg", new Metadata(), parseContext);
        assertTrue(content.startsWith("Microsoft Outlook Express 6"));
    }

    /**
     * Test case for TIKA-197
     * <p>
     * Parses {@code testMSG.msg}, checks that the text "From" does not occur in the
     * body, and verifies raw headers plus the MAPI/Message sender and recipient fields.
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-197">TIKA-197</a>
     */
    @Test
    public void testMultipleCopies() throws Exception {
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        try (TikaInputStream tis = getResourceAsStream("/test-documents/testMSG.msg")) {
            AUTO_DETECT_PARSER.parse(tis, handler, metadata, new ParseContext());
        }

        assertEquals("application/vnd.ms-outlook", metadata.get(Metadata.CONTENT_TYPE));

        // the body must not contain the text "From" anywhere
        String content = handler.toString();
        Pattern pattern = Pattern.compile("From");
        Matcher matcher = pattern.matcher(content);
        assertFalse(matcher.find());

        //test that last header is added
        assertContains("29 Jan 2009 19:17:10.0163 (UTC) FILETIME=[2ED25E30:01C98246]",
                Arrays.asList(metadata.getValues("Message:Raw-Header:X-OriginalArrivalTime")));
        //confirm next line is added correctly
        assertContains("from athena.apache.org (HELO athena.apache.org) (140.211.11.136)\n" +
                        "    by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 29 Jan 2009 11:17:08 " +
                        "-0800",
                Arrays.asList(metadata.getValues("Message:Raw-Header:Received")));
        assertEquals("EX", metadata.get(MAPI.SENT_BY_SERVER_TYPE));
        assertEquals("NOTE", metadata.get(MAPI.MESSAGE_CLASS));
        assertEquals("Jukka Zitting", metadata.get(Message.MESSAGE_FROM_NAME));
        assertEquals("jukka.zitting@gmail.com", metadata.get(Message.MESSAGE_FROM_EMAIL));
        assertEquals("Jukka Zitting", metadata.get(MAPI.FROM_REPRESENTING_NAME));
        assertEquals("jukka.zitting@gmail.com", metadata.get(MAPI.FROM_REPRESENTING_EMAIL));

        //to-name is empty, make sure that we get an empty string.
        assertEquals("tika-dev@lucene.apache.org", metadata.get(Message.MESSAGE_TO_EMAIL));
        assertEquals("tika-dev@lucene.apache.org", metadata.get(Message.MESSAGE_TO_DISPLAY_NAME));
        assertEquals("", metadata.get(Message.MESSAGE_TO_NAME));

    }

    /**
     * Test case for TIKA-395, to ensure parser works for new Outlook formats.
     * <p>
     * Parses {@code test-outlook2003.msg} and checks title/subject/description,
     * a few body phrases and the recipient fields.
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-395">TIKA-395</a>
     */
    @Test
    public void testOutlookNew() throws Exception {
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();

        try (TikaInputStream tis = getResourceAsStream("/test-documents/test-outlook2003.msg")) {
            AUTO_DETECT_PARSER.parse(tis, handler, metadata, new ParseContext());
        }
        assertEquals("application/vnd.ms-outlook", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("Welcome to Microsoft Office Outlook 2003",
                metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Welcome to Microsoft Office Outlook 2003",
                metadata.get(TikaCoreProperties.SUBJECT));
        assertEquals("Welcome to Microsoft Office Outlook 2003",
                metadata.get(TikaCoreProperties.DESCRIPTION));

        String content = handler.toString();
        assertContains("Outlook 2003", content);
        assertContains("Streamlined Mail Experience", content);
        assertContains("Navigation Pane", content);

        //make sure these are parallel
        assertEquals("", metadata.get(Message.MESSAGE_TO_EMAIL));
        assertEquals("New Outlook User", metadata.get(Message.MESSAGE_TO_NAME));
        assertEquals("New Outlook User", metadata.get(Message.MESSAGE_TO_DISPLAY_NAME));

    }

    /**
     * Parses {@code testMSG_chinese.msg} to XML and checks the HTML-derived body,
     * that only one {@code <body>} element is emitted, that the Chinese text is
     * preserved in both metadata and content, and a set of MAPI properties.
     */
    @Test
    public void testOutlookHTMLVersion() throws Exception {
        Metadata metadata = new Metadata();

        // Check the HTML version
        String content = parseToIndentedXml("/test-documents/testMSG_chinese.msg", metadata);

        // As the HTML version should have been processed, ensure
        //  we got some of the links
        assertNotContained("<dd>tests.chang@fengttt.com</dd>", content);
        assertContains("<p>Alfresco MSG format testing", content);
        assertContains("<li>1", content);
        assertContains("<li>2", content);

        // Make sure we don't have nested html docs
        assertEquals(2, content.split("<body>").length);
        assertEquals(2, content.split("<\\/body>").length);

        // Make sure that the Chinese actually came through
        assertContains("\u5F35\u6BD3\u502B", metadata.get(TikaCoreProperties.CREATOR));
        assertContains("\u9673\u60E0\u73CD", content);

        assertEquals("tests.chang@fengttt.com", metadata.get(Message.MESSAGE_TO_EMAIL));

        // The name is written with Unicode escapes (same three characters as the creator
        // check above) so the expected value does not depend on the source file encoding.
        assertEquals("Tests Chang@FT (\u5F35\u6BD3\u502B)",
                metadata.get(MAPI.FROM_REPRESENTING_NAME));
        assertEquals("/O=FT GROUP/OU=FT/CN=RECIPIENTS/CN=LYDIACHANG",
                metadata.get(MAPI.FROM_REPRESENTING_EMAIL));

        assertEquals("c=TW;a= ;p=FT GROUP;l=FTM02-110329085248Z-89735\u0000",
                metadata.get(MAPI.SUBMISSION_ID));
        assertEquals("<EBB9951D34EA4B41B70AB946CF3FB6EC1A297D98@ftm02.FT.FTG.COM>",
                metadata.get(MAPI.INTERNET_MESSAGE_ID));
        assertTrue(metadata.get(MAPI.SUBMISSION_ACCEPTED_AT_TIME).startsWith("2011-03-29"));
        assertTrue(metadata.get("mapi:client-submit-time").startsWith("2011-03-29"));
        assertTrue(metadata.get("mapi:message-delivery-time").startsWith("2011-03-29"));
        assertTrue(metadata.get("mapi:last-modification-time").startsWith("2011-03-29"));
        assertTrue(metadata.get("mapi:creation-time").startsWith("2011-03-29"));
    }

    /**
     * Parses {@code testMSG_forwarded.msg} to XML and checks that only one
     * {@code <body>} element is emitted, along with the conversation index,
     * references / in-reply-to ids and the encapsulated-HTML flag.
     */
    @Test
    public void testOutlookForwarded() throws Exception {
        Metadata metadata = new Metadata();

        // Check the HTML version
        String content = parseToIndentedXml("/test-documents/testMSG_forwarded.msg", metadata);

        // Make sure we don't have nested docs
        assertEquals(2, content.split("<body>").length);
        assertEquals(2, content.split("<\\/body>").length);
        assertEquals("01ccb5408a75b6cf3ad7837949b698499034202313ef000002a160", metadata.get(MAPI.CONVERSATION_INDEX));
        assertEquals("<C8508767C15DBF40A21693142739EA8D564D18FDA1@EXVMBX018-1.exch018.msoutlookonline.net>",
                metadata.get(MAPI.INTERNET_REFERENCES));
        assertEquals("<C8508767C15DBF40A21693142739EA8D564D18FDA1@EXVMBX018-1.exch018.msoutlookonline.net>",
                metadata.get(MAPI.IN_REPLY_TO_ID));

        assertEquals("true", metadata.get(RTFMetadata.CONTAINS_ENCAPSULATED_HTML));
    }

    /**
     * Checks the embedded resource paths reported for the attachments of
     * {@code testMSG_att_msg.msg} and the encapsulated-HTML flag on the container.
     */
    @Test
    public void testEmbeddedPath() throws Exception {
        List<Metadata> metadataList = getRecursiveMetadata("testMSG_att_msg.msg");
        assertEquals("/Test Attachment.msg", metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
        assertEquals("/smbprn.00009008.KdcPjl.pdf", metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
        assertEquals("true", metadataList.get(0).get(RTFMetadata.CONTAINS_ENCAPSULATED_HTML));
    }

    /**
     * Checks the HTML extracted for {@code test-outlook2003.msg}, first through the
     * recursive metadata helper and then as serialized XML: the recipient is not
     * emitted as a {@code <dd>} element, formatting and links survive, only one
     * {@code <body>} element is emitted and the encapsulated-HTML flag is set.
     */
    @Test
    public void testOutlookHTMLfromRTF() throws Exception {

        //test default behavior
        List<Metadata> metadataList = getRecursiveMetadata("test-outlook2003.msg");
        assertNotContained("<dd>New Outlook User</dd>", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));

        //now parse the same file to XML with a default ParseContext
        Metadata metadata = new Metadata();

        // Check the HTML version; whitespace runs are collapsed so the
        // assertions below do not depend on the serializer's indentation
        String content = parseToIndentedXml("/test-documents/test-outlook2003.msg", metadata)
                .replaceAll("[\\r\\n\\t]+", " ").replaceAll(" +", " ");

        // As the HTML version should have been processed, ensure
        //  we got some of the links
        assertNotContained("<dd>New Outlook User</dd>", content);
        assertContains("designed <i>to help you", content);
        assertContains(
                "<p> <a href=\"http://r.office.microsoft.com/r/rlidOutlookWelcomeMail10?clid=1033\">Cached Exchange Mode</a>",
                content);

        // Link - check text around it, and the link itself
        assertContains("sign up for a free subscription", content);
        assertContains("Office Newsletter", content);
        assertContains("newsletter will be sent to you", content);
        assertContains("http://r.office.microsoft.com/r/rlidNewsletterSignUp?clid=1033", content);

        // Make sure we don't have nested html docs
        assertEquals(2, content.split("<body>").length);
        assertEquals(2, content.split("<\\/body>").length);

        assertEquals("true", metadata.get(RTFMetadata.CONTAINS_ENCAPSULATED_HTML));
    }

    /**
     * Verifies the reported MAPI message class for one fixture per message class
     * ({@code testMSG_<class>.msg}) plus {@code test-outlook2003.msg}, which is
     * expected to be a {@code NOTE}.
     */
    @Test
    public void testMAPIMessageClasses() throws Exception {

        for (String messageClass : new String[]{"Appointment", "Contact", "Post", "StickyNote",
                "Task"}) {
            testMsgClass(messageClass, getXML("testMSG_" + messageClass + ".msg").metadata);
        }

        testMsgClass("NOTE", getXML("test-outlook2003.msg").metadata);
    }

    /**
     * Asserts that the {@link MAPI#MESSAGE_CLASS} value in {@code metadata} equals
     * {@code expected}, ignoring case and any underscores in the actual value.
     *
     * @param expected the expected message class name, without underscores
     * @param metadata the metadata produced by parsing a message
     */
    private void testMsgClass(String expected, Metadata metadata) {
        String actual = metadata.get(MAPI.MESSAGE_CLASS);
        // Guard against a missing value so the test fails with the message below
        // instead of a NullPointerException.
        assertTrue(actual != null && expected.equalsIgnoreCase(actual.replaceAll("_", "")),
                expected + ", but got: " + actual);
    }

    /**
     * Checks the extended {@code mapi:property:*} values (dates, sequence, recurrence)
     * and the encapsulated-HTML flag extracted from {@code testMSG_Appointment.msg}.
     */
    @Test
    public void testAppointmentExtendedMetadata() throws Exception {
        List<Metadata> metadataList = getRecursiveMetadata("testMSG_Appointment.msg");
        Metadata m = metadataList.get(0);
        assertTrue(m.get("mapi:property:PidLidAppointmentEndWhole").contains("2017-02-28T19"));
        assertTrue(m.get("mapi:property:PidLidAppointmentStartWhole").contains("2017-02-28T18"));
        assertTrue(m.get("mapi:property:PidLidClipStart").contains("2017-02-28T18"));
        assertTrue(m.get("mapi:property:PidLidClipEnd").contains("2017-02-28T19"));
        assertTrue(m.get("mapi:property:PidLidCommonStart").contains("2017-02-28T18"));
        assertTrue(m.get("mapi:property:PidLidCommonEnd").contains("2017-02-28T19"));
        assertTrue(m.get("mapi:property:PidLidReminderSignalTime").contains("4501-01-01T00"));
        assertTrue(m.get("mapi:property:PidLidReminderTime").contains("2017-02-28T18"));
        assertTrue(m.get("mapi:property:PidLidValidFlagStringProof").contains("2017-02-28T18:42"));
        assertEquals("0", m.get("mapi:property:PidLidAppointmentSequence"));
        assertEquals("false", m.get("mapi:property:PidLidRecurring"));
        assertEquals("true", m.get(RTFMetadata.CONTAINS_ENCAPSULATED_HTML));

    }

    /**
     * Checks the extended {@code mapi:property:*} values extracted from
     * {@code testMSG_Task.msg}.
     */
    @Test
    public void testTaskExtendedMetadata() throws Exception {
        List<Metadata> metadataList = getRecursiveMetadata("testMSG_Task.msg");
        Metadata m = metadataList.get(0);
        assertTrue(m.get("mapi:property:PidLidToDoOrdinalDate").contains("2017-02-28T18:44"));
        assertTrue(m.get("mapi:property:PidLidValidFlagStringProof").contains("2017-02-28T18:44"));
        assertEquals("0", m.get("mapi:property:PidLidTaskActualEffort"));
        assertEquals("false", m.get("mapi:property:PidLidTeamTask"));
    }

    /**
     * Checks the {@code PidLidValidFlagStringProof} value extracted from
     * {@code testMSG_Contact.msg}.
     */
    @Test
    public void testContactExtendedMetadata() throws Exception {
        List<Metadata> metadataList = getRecursiveMetadata("testMSG_Contact.msg");
        Metadata m = metadataList.get(0);
        assertEquals("2017-02-28T18:41:37Z", m.get("mapi:property:PidLidValidFlagStringProof"));
    }


    /**
     * Checks the {@code PidLidValidFlagStringProof} value extracted from
     * {@code testMSG_Post.msg}.
     */
    @Test
    public void testPostExtendedMetadata() throws Exception {
        List<Metadata> metadataList = getRecursiveMetadata("testMSG_Post.msg");
        Metadata m = metadataList.get(0);
        assertEquals("2017-02-28T18:47:11Z", m.get("mapi:property:PidLidValidFlagStringProof"));
    }


    /**
     * Verifies that {@code testMSG.msg} yields a single document with the body text
     * by default, and that a parser loaded from
     * {@code tika-config-extract-all-alternatives-msg.json} yields three documents:
     * the message itself without the body text, followed by an RTF and a plain-text
     * document that each contain it.
     */
    @Test
    public void testHandlingAllAlternativesBodies() throws Exception {
        //test that default only has one body
        List<Metadata> metadataList = getRecursiveMetadata("testMSG.msg");
        assertEquals(1, metadataList.size());
        assertContains("breaking your application",
                metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
        assertEquals("application/vnd.ms-outlook", metadataList.get(0).get(Metadata.CONTENT_TYPE));

        //now try extracting all bodies
        //they should each appear as standalone attachments
        //with no content in the body of the msg level
        Parser p = TikaLoader.load(
                getConfigPath(OutlookParserTest.class, "tika-config-extract-all-alternatives-msg.json"))
                .loadAutoDetectParser();

        metadataList = getRecursiveMetadata("testMSG.msg", p);
        assertEquals(3, metadataList.size());

        assertNotContained("breaking your application",
                metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
        assertEquals("application/vnd.ms-outlook",
                metadataList.get(0).get(Metadata.CONTENT_TYPE));

        assertContains("breaking your application",
                metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
        assertEquals("application/rtf", metadataList.get(1).get(Metadata.CONTENT_TYPE));

        assertContains("breaking your application",
                metadataList.get(2).get(TikaCoreProperties.TIKA_CONTENT));
        assertTrue(metadataList.get(2).get(Metadata.CONTENT_TYPE).startsWith("text/plain"));

    }

    /**
     * Verifies that the tab and newline between two adjacent phrases in
     * {@code test-outlook.msg} are kept when the content is extracted with the
     * {@code BODY} handler type.
     */
    @Test
    public void testNewlinesInRTFBody() throws Exception {
        List<Metadata> metadataList = getRecursiveMetadata("test-outlook.msg", AUTO_DETECT_PARSER,
                BasicContentHandlerFactory.HANDLER_TYPE.BODY);
        assertContains("annuaires\t \n" + " Synchronisation", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
    }

    /**
     * Parses a test resource with {@code AUTO_DETECT_PARSER} and a fresh
     * {@link ParseContext}, serializing the SAX output as indented XML.
     *
     * @param resourcePath path of the resource as passed to {@code getResourceAsStream}
     * @param metadata     metadata object handed to the parser; populated during the parse
     * @return the serialized XML produced by the parse
     * @throws Exception if the transformer cannot be created or parsing fails
     */
    private String parseToIndentedXml(String resourcePath, Metadata metadata) throws Exception {
        StringWriter sw = new StringWriter();
        SAXTransformerFactory factory = XMLReaderUtils.getSAXTransformerFactory();
        TransformerHandler handler = factory.newTransformerHandler();
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        handler.setResult(new StreamResult(sw));

        try (TikaInputStream tis = getResourceAsStream(resourcePath)) {
            AUTO_DETECT_PARSER.parse(tis, handler, metadata, new ParseContext());
        }
        return sw.toString();
    }

}