OutlookExtractor.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft;

import static java.nio.charset.StandardCharsets.UTF_8;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.codec.binary.Hex;
import org.apache.james.mime4j.codec.DecodeMonitor;
import org.apache.james.mime4j.codec.DecoderUtil;
import org.apache.poi.hmef.attribute.MAPIRtfAttribute;
import org.apache.poi.hsmf.MAPIMessage;
import org.apache.poi.hsmf.datatypes.AttachmentChunks;
import org.apache.poi.hsmf.datatypes.ByteChunk;
import org.apache.poi.hsmf.datatypes.Chunk;
import org.apache.poi.hsmf.datatypes.Chunks;
import org.apache.poi.hsmf.datatypes.MAPIProperty;
import org.apache.poi.hsmf.datatypes.MessageSubmissionChunk;
import org.apache.poi.hsmf.datatypes.PropertyValue;
import org.apache.poi.hsmf.datatypes.RecipientChunks;
import org.apache.poi.hsmf.datatypes.StringChunk;
import org.apache.poi.hsmf.datatypes.Types;
import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.util.CodePageUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;

import org.apache.tika.detect.EncodingResult;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.MAPI;
import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.RTFMetadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.html.HtmlEncodingDetector;
import org.apache.tika.parser.html.JSoupParser;
import org.apache.tika.parser.mailcommons.MailDateParser;
import org.apache.tika.parser.microsoft.msg.ExtendedMetadataExtractor;
import org.apache.tika.parser.microsoft.rtf.RTFParser;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.StringUtils;


/**
 * Outlook Message Parser.
 */
public class OutlookExtractor extends AbstractPOIFSExtractor {
    static Logger LOGGER = LoggerFactory.getLogger(OutlookExtractor.class);
    public enum BODY_TYPES_PROCESSED {
        HTML, RTF, TEXT;
    }

    private static final Metadata EMPTY_METADATA = new Metadata();
    private static final MAPIProperty[] LITERAL_TIME_MAPI_PROPERTIES = new MAPIProperty[] {
            MAPIProperty.CLIENT_SUBMIT_TIME,
            MAPIProperty.CREATION_TIME,
            MAPIProperty.DEFERRED_DELIVERY_TIME,
            MAPIProperty.DELIVER_TIME,
            //EXPAND BEGIN and EXPAND END?
            MAPIProperty.EXPIRY_TIME,
            MAPIProperty.LAST_MODIFICATION_TIME,
            MAPIProperty.LATEST_DELIVERY_TIME,
            MAPIProperty.MESSAGE_DELIVERY_TIME,
            MAPIProperty.MESSAGE_DOWNLOAD_TIME,
            MAPIProperty.ORIGINAL_DELIVERY_TIME,
            MAPIProperty.ORIGINAL_SUBMIT_TIME,
            MAPIProperty.PROVIDER_SUBMIT_TIME,
            MAPIProperty.RECEIPT_TIME,
            MAPIProperty.REPLY_TIME,
            MAPIProperty.REPORT_TIME

    };

    private static final Map<MAPIProperty, Property> LITERAL_TIME_PROPERTIES = new HashMap<>();

    private static final Map<String, String> MESSAGE_CLASSES = new LinkedHashMap<>();

    private static final Pattern IMG_TAG_PATTERN = Pattern.compile("<img ([^>]{0,1000})>");
    private static final Pattern SRC_ATTR_PATTERN = Pattern.compile("src=\"cid:([^\"]{0,1000})\"");
    private static final Pattern TEXT_CID_PATTERN = Pattern.compile("\\[cid:([^]]{0,1000})]");

    static {
        for (MAPIProperty property : LITERAL_TIME_MAPI_PROPERTIES) {
            String name = property.mapiProperty.toLowerCase(Locale.ROOT);
            name = name.substring(3);
            name = name.replace('_', '-');
            name = MAPI.PREFIX_MAPI_META + name;
            Property tikaProp = Property.internalDate(name);
            LITERAL_TIME_PROPERTIES.put(property, tikaProp);
        }
        loadMessageClasses();
    }

    private static void loadMessageClasses() {
        String fName = "/org/apache/tika/parser/microsoft/msg/mapi_message_classes.properties";
        try (BufferedReader r = new BufferedReader(
                new InputStreamReader(
                        OutlookExtractor.class.getResourceAsStream(fName), UTF_8))) {
            String line = r.readLine();
            while (line != null) {
                if (line.isBlank() || line.startsWith("#")) {
                    line = r.readLine();
                    continue;
                }
                String[] cols = line.split("\\s+");
                String lcKey = cols[0].toLowerCase(Locale.ROOT);
                String value = cols[1];
                if (MESSAGE_CLASSES.containsKey(lcKey)) {
                    throw new IllegalArgumentException("Can't have duplicate keys: " + lcKey);
                }
                MESSAGE_CLASSES.put(lcKey, value);
                line = r.readLine();
            }
        } catch (IOException e) {
            throw new IllegalStateException("can't find mapi_message_classes.properties?!");
        }

    }

    //this according to the spec; in practice, it is probably more likely
    //that a "split field" fails to start with a space character than
    //that a real header contains anything but [-_A-Za-z0-9].
    //e.g.
    //header: this header goes onto the next line
    //<mailto:xyz@cnn.com...
    private static Pattern HEADER_KEY_PAT =
            Pattern.compile("\\A([\\x21-\\x39\\x3B-\\x7E]+):(.*?)\\Z");

    private final MAPIMessage msg;
    private final ParseContext parseContext;
    private final boolean extractAllAlternatives;
    HtmlEncodingDetector detector = new HtmlEncodingDetector();


    public OutlookExtractor(DirectoryNode root, Metadata metadata, ParseContext context) throws TikaException {
        super(context, metadata);
        this.parseContext = context;
        this.extractAllAlternatives =
                context.get(OfficeParserConfig.class).isExtractAllAlternativesFromMSG();
        try {
            this.msg = new MAPIMessage(root);
        } catch (IOException e) {
            throw new TikaException("Failed to parse Outlook message", e);
        }
    }

    //need to add empty string to ensure that parallel arrays are parallel
    //even if one value is null.
    public static void addEvenIfNull(Property property, String value, Metadata metadata) {
        if (value == null) {
            value = "";
        }
        metadata.add(property, value);
    }

    private static void setFirstChunk(List<Chunk> chunks, Property property, Metadata metadata) {
        if (chunks == null || chunks.isEmpty() || chunks.get(0) == null) {
            return;
        }
        metadata.set(property, chunks.get(0).toString());
    }

    public static String getNormalizedMessageClass(String messageClass) {
        if (messageClass == null || messageClass.isBlank()) {
            return "UNSPECIFIED";
        }
        String lc = messageClass.toLowerCase(Locale.ROOT);
        if (MESSAGE_CLASSES.containsKey(lc)) {
            return MESSAGE_CLASSES.get(lc);
        }
        return "UNKNOWN";
    }

    public void parse(XHTMLContentHandler xhtml) throws TikaException, SAXException, IOException {
        try {
            _parse(xhtml);
        } catch (ChunkNotFoundException e) {
            throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
        } /*finally {
            //You'd think you'd want to call msg.close().
            //Don't do that.  That closes down the file system.
            //If an msg has multiple msg attachments, some of them
            //can reside in the same file system.  After the first
            //child is read, the fs is closed, and the other children
            //get a java.nio.channels.ClosedChannelException
        }*/
    }

    private void _parse(XHTMLContentHandler xhtml) throws TikaException, SAXException, IOException, ChunkNotFoundException {
        msg.setReturnNullOnMissingChunk(true);

        // If the message contains strings that aren't stored
        //  as Unicode, try to sort out an encoding for them
        if (msg.has7BitEncodingStrings()) {
            guess7BitEncoding(msg);
        }

        // Start with the metadata
        Map<String, String[]> headers = normalizeHeaders(msg.getHeaders());

        handleFromTo(headers, parentMetadata);
        handleMessageInfo(msg, headers, parentMetadata);
        ExtendedMetadataExtractor.extract(msg, parentMetadata);

        try {
            for (String recipientAddress : msg.getRecipientEmailAddressList()) {
                if (recipientAddress != null) {
                    parentMetadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
                }
            }
        } catch (ChunkNotFoundException e) {
            //you'd think we wouldn't need this. we do.
        }

        for (Map.Entry<String, String[]> e : headers.entrySet()) {
            String headerKey = e.getKey();
            for (String headerValue : e.getValue()) {
                parentMetadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX + headerKey, headerValue);
            }
        }

        handleGeneralDates(msg, headers, parentMetadata);
        writeSelectHeadersInBody(parentMetadata, msg, xhtml);

        // Get the message body. Preference order is: html, rtf, text
        Chunk htmlChunk = null;
        Chunk rtfChunk = null;
        Chunk textChunk = null;
        for (Chunk chunk : msg.getMainChunks().getChunks()) {
            if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
                htmlChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
                rtfChunk = chunk;
            }
            if (chunk.getChunkId() == MAPIProperty.BODY.id) {
                textChunk = chunk;
            }
        }

        Set<String> contentIdNames = new HashSet<>();
        handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml, contentIdNames);
        // Process the attachments
        for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
            Metadata attachMetadata = Metadata.newInstance(context);
            updateAttachmentMetadata(attachment, attachMetadata, contentIdNames);
            String filename = null;
            if (!StringUtils.isBlank(attachMetadata.get(MAPI.ATTACH_LONG_FILE_NAME))) {
                filename = attachMetadata.get(MAPI.ATTACH_LONG_FILE_NAME);
            } else if (!StringUtils.isBlank(attachMetadata.get(MAPI.ATTACH_DISPLAY_NAME))) {
                filename = attachMetadata.get(MAPI.ATTACH_DISPLAY_NAME);
            } else if (!StringUtils.isBlank(attachMetadata.get(MAPI.ATTACH_FILE_NAME))) {
                filename = attachMetadata.get(MAPI.ATTACH_FILE_NAME);
            }
            //this is allowed to be null;
            String mimeType = attachMetadata.get(MAPI.ATTACH_MIME);
            if (attachment.getAttachData() != null) {
                handleEmbeddedResource(TikaInputStream.get(attachment
                        .getAttachData()
                        .getValue()), attachMetadata, filename, null, null, mimeType, xhtml, true);
            }
            if (attachment.getAttachmentDirectory() != null) {
                handleEmbeddedOfficeDoc(attachment
                        .getAttachmentDirectory()
                        .getDirectory(), attachMetadata, filename, xhtml, true);
            }
        }

    }

    private void updateAttachmentMetadata(AttachmentChunks attachment, Metadata metadata,
                                          Set<String> contentIdNames) {
        StringChunk contentIdChunk = attachment.getAttachContentId();
        if (contentIdChunk != null) {
            String contentId = contentIdChunk.getValue();
            if (! StringUtils.isBlank(contentId)) {
                contentId = contentId.trim();
                if (contentIdNames.contains(contentId)) {
                    metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE_KEY,
                            TikaCoreProperties.EmbeddedResourceType.INLINE.name());
                }
                metadata.set(MAPI.ATTACH_CONTENT_ID, contentId);
            }
        }
        addStringChunkToMetadata(MAPI.ATTACH_LONG_PATH_NAME, attachment.getAttachLongPathName(), metadata);
        addStringChunkToMetadata(MAPI.ATTACH_LONG_FILE_NAME, attachment.getAttachLongFileName(), metadata);
        addStringChunkToMetadata(MAPI.ATTACH_FILE_NAME, attachment.getAttachFileName(), metadata);
        addStringChunkToMetadata(MAPI.ATTACH_CONTENT_LOCATION, attachment.getAttachContentLocation(), metadata);
        addStringChunkToMetadata(MAPI.ATTACH_DISPLAY_NAME, attachment.getAttachDisplayName(), metadata);
        addStringChunkToMetadata(MAPI.ATTACH_EXTENSION, attachment.getAttachExtension(), metadata);
        addStringChunkToMetadata(MAPI.ATTACH_MIME, attachment.getAttachMimeTag(), metadata);
        addStringChunkToMetadata(MAPI.ATTACH_LANGUAGE, attachment.getAttachLanguage(), metadata);
    }

    private void addStringChunkToMetadata(Property property, StringChunk stringChunk, Metadata metadata) {
        if (stringChunk == null) {
            return;
        }
        String v = stringChunk.getValue();
        if (StringUtils.isBlank(v)) {
            return;
        }
        metadata.set(property, v);
    }

    private void handleMessageInfo(MAPIMessage msg, Map<String, String[]> headers, Metadata metadata) throws ChunkNotFoundException {
        //this is the literal subject including "re: "
        metadata.set(TikaCoreProperties.TITLE, msg.getSubject());
        //this is the original topic for the thread without the "re: "
        String topic = msg.getConversationTopic();
        metadata.set(TikaCoreProperties.SUBJECT, topic);
        metadata.set(TikaCoreProperties.DESCRIPTION, topic);
        metadata.set(MAPI.CONVERSATION_TOPIC, topic);
        Chunks mainChunks = msg.getMainChunks();
        if (mainChunks == null) {
            return;
        }
        if (mainChunks.getMessageId() != null) {
            metadata.set(MAPI.INTERNET_MESSAGE_ID, mainChunks
                    .getMessageId()
                    .getValue());
        }

        String mc = msg.getStringFromChunk(mainChunks.getMessageClass());
        if (mc != null) {
            metadata.set(MAPI.MESSAGE_CLASS_RAW, mc);
        }
        metadata.set(MAPI.MESSAGE_CLASS, getNormalizedMessageClass(mc));
        List<Chunk> conversationIndex = mainChunks
                .getAll()
                .get(MAPIProperty.CONVERSATION_INDEX);
        if (conversationIndex != null && !conversationIndex.isEmpty()) {
            Chunk chunk = conversationIndex.get(0);
            if (chunk instanceof ByteChunk) {
                byte[] bytes = ((ByteChunk) chunk).getValue();
                String hex = Hex.encodeHexString(bytes);
                metadata.set(MAPI.CONVERSATION_INDEX, hex);
            }
        }

        List<Chunk> internetReferences = mainChunks
                .getAll()
                .get(MAPIProperty.INTERNET_REFERENCES);
        if (internetReferences != null) {
            for (Chunk ref : internetReferences) {
                if (ref instanceof StringChunk) {
                    metadata.add(MAPI.INTERNET_REFERENCES, ((StringChunk) ref).getValue());
                }
            }
        }
        List<Chunk> inReplyToIds = mainChunks
                .getAll()
                .get(MAPIProperty.IN_REPLY_TO_ID);
        if (inReplyToIds != null && !inReplyToIds.isEmpty()) {
            metadata.add(MAPI.IN_REPLY_TO_ID, inReplyToIds
                    .get(0)
                    .toString());
        }

        for (Map.Entry<MAPIProperty, Property> e : LITERAL_TIME_PROPERTIES.entrySet()) {
            List<PropertyValue> timeProp = mainChunks
                    .getProperties()
                    .get(e.getKey());
            if (timeProp != null && !timeProp.isEmpty()) {
                Calendar cal = ((PropertyValue.TimePropertyValue) timeProp.get(0)).getValue();
                metadata.set(e.getValue(), cal);
            }
        }

        MessageSubmissionChunk messageSubmissionChunk = mainChunks.getSubmissionChunk();
        if (messageSubmissionChunk != null) {
            String submissionId = messageSubmissionChunk.getSubmissionId();
            metadata.set(MAPI.SUBMISSION_ID, submissionId);
            metadata.set(MAPI.SUBMISSION_ACCEPTED_AT_TIME, messageSubmissionChunk.getAcceptedAtTime());
        }
    }


    private void handleGeneralDates(MAPIMessage msg, Map<String, String[]> headers, Metadata metadata) throws ChunkNotFoundException {
        // Date - try two ways to find it
        // First try via the proper chunk
        if (msg.getMessageDate() != null) {
            metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
            metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
        } else {
            if (headers != null && headers.size() > 0) {
                for (Map.Entry<String, String[]> header : headers.entrySet()) {
                    String headerKey = header.getKey();
                    if (headerKey.toLowerCase(Locale.ROOT).startsWith("date:")) {
                        String date = headerKey.substring(headerKey.indexOf(':') + 1).trim();

                        // See if we can parse it as a normal mail date
                        try {
                            Date d = MailDateParser.parseDateLenient(date);
                            metadata.set(TikaCoreProperties.CREATED, d);
                            metadata.set(TikaCoreProperties.MODIFIED, d);
                        } catch (SecurityException e) {
                            throw e;
                        } catch (Exception e) {
                            // Store it as-is, and hope for the best...
                            metadata.set(TikaCoreProperties.CREATED, date);
                            metadata.set(TikaCoreProperties.MODIFIED, date);
                        }
                        break;
                    }
                }
            }
        }
        //try to overwrite the modified property if the actual LAST_MODIFICATION_TIME property exists.
        List<PropertyValue> timeProp = msg.getMainChunks().getProperties().get(MAPIProperty.LAST_MODIFICATION_TIME);
        if (timeProp != null && ! timeProp.isEmpty()) {
            Calendar cal = ((PropertyValue.TimePropertyValue)timeProp.get(0)).getValue();
            metadata.set(TikaCoreProperties.MODIFIED, cal);
        }

    }

    private void handleBodyChunks(Chunk htmlChunk, Chunk rtfChunk, Chunk textChunk,
                                  XHTMLContentHandler xhtml, Set<String> contentIdNames)
            throws SAXException, IOException, TikaException {

        if (extractAllAlternatives) {
            extractAllAlternatives(htmlChunk, rtfChunk, textChunk, xhtml, contentIdNames);
            return;
        }
        _handleBestBodyChunk(htmlChunk, rtfChunk, textChunk, xhtml, contentIdNames);

    }
    private void _handleBestBodyChunk(Chunk htmlChunk, Chunk rtfChunk, Chunk textChunk,
                                      XHTMLContentHandler xhtml, Set<String> contentIdNames)
            throws SAXException, IOException, TikaException {
        //try html, then rtf, then text
        if (htmlChunk != null) {
            byte[] data = null;
            if (htmlChunk instanceof ByteChunk) {
                data = ((ByteChunk) htmlChunk).getValue();
            } else if (htmlChunk instanceof StringChunk) {
                data = ((StringChunk) htmlChunk).getRawValue();
            }
            if (data != null) {
                Parser htmlParser = EmbeddedDocumentUtil
                        .tryToFindExistingLeafParser(JSoupParser.class, parseContext);
                if (htmlParser == null) {
                    htmlParser = new JSoupParser();
                }
                Metadata htmlMetadata = Metadata.newInstance(context);
                try (TikaInputStream tis = TikaInputStream.get(data)) {
                    htmlParser.parse(tis, new EmbeddedContentHandler(new BodyContentHandler(xhtml)), htmlMetadata, parseContext);
                }
                extractContentIdNamesFromHtml(data, htmlMetadata, contentIdNames);
                parentMetadata.add(MAPI.BODY_TYPES_PROCESSED, BODY_TYPES_PROCESSED.HTML.name());
                return;
            }
        }
        if (rtfChunk != null) {
            ByteChunk chunk = (ByteChunk) rtfChunk;
            //avoid buffer underflow TIKA-2530
            //TODO -- would be good to find an example triggering file and
            //figure out if this is a bug in POI or a genuine 0 length chunk
            if (chunk.getValue() != null && chunk.getValue().length > 0) {
                MAPIRtfAttribute rtf =
                        new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(),
                                chunk.getValue());
                RTFParser rtfParser = (RTFParser) EmbeddedDocumentUtil
                        .tryToFindExistingLeafParser(RTFParser.class, parseContext);
                if (rtfParser == null) {
                    rtfParser = new RTFParser();
                }
                Metadata rtfMetadata = Metadata.newInstance(context);
                try (TikaInputStream tis = TikaInputStream.get(rtf.getData())) {
                    rtfParser.parseInline(tis, xhtml, rtfMetadata, parseContext);
                }
                extractContentIdNamesFromRtf(rtf.getData(), rtfMetadata, contentIdNames);
                parentMetadata.add(MAPI.BODY_TYPES_PROCESSED, BODY_TYPES_PROCESSED.RTF.name());
                parentMetadata.set(RTFMetadata.CONTAINS_ENCAPSULATED_HTML,
                        rtfMetadata.get(RTFMetadata.CONTAINS_ENCAPSULATED_HTML));
                return;
            }
        }
        if (textChunk != null) {
            String s = ((StringChunk) textChunk).getValue();
            xhtml.element("p", s);
            extractContentIdNamesFromText(s, contentIdNames);
            parentMetadata.add(MAPI.BODY_TYPES_PROCESSED, BODY_TYPES_PROCESSED.TEXT.name());
        }

    }

    private void extractContentIdNamesFromRtf(byte[] data, Metadata metadata, Set<String> contentIdNames) {
        //for now, hope that there's encapsulated html
        //TODO: check for encapsulated html. If it doesn't exist, handle RTF specifically
        extractContentIdNamesFromHtml(data, metadata, contentIdNames);
    }

    private void extractContentIdNamesFromHtml(byte[] data, Metadata metadata, Set<String> contentIdNames) {
        String html = new String(data, UTF_8);
        Matcher imageMatcher = IMG_TAG_PATTERN.matcher(html);
        Matcher cidSrcMatcher = SRC_ATTR_PATTERN.matcher("");
        while (imageMatcher.find()) {
            String imgElementContents = imageMatcher.group(1);
            cidSrcMatcher.reset(imgElementContents);
            while (cidSrcMatcher.find()) {
                String cid = cidSrcMatcher.group(1);
                cid = cid.trim();
                contentIdNames.add(cid);
            }
        }
    }

    private void extractContentIdNamesFromText(String s, Set<String> contentIdNames) {
        Matcher m = TEXT_CID_PATTERN.matcher(s);
        while (m.find()) {
            contentIdNames.add(m.group(1));
        }
    }

    private void extractAllAlternatives(Chunk htmlChunk, Chunk rtfChunk, Chunk textChunk,
                                        XHTMLContentHandler xhtml, Set<String> contentIdNames)
            throws TikaException, SAXException, IOException {
        if (htmlChunk != null) {
            byte[] data = getValue(htmlChunk);
            if (data != null) {
                handleEmbeddedResource(TikaInputStream.get(data), "html-body", null,
                        MediaType.TEXT_HTML.toString(), xhtml, true);
                extractContentIdNamesFromHtml(data, Metadata.newInstance(context), contentIdNames);
                parentMetadata.add(MAPI.BODY_TYPES_PROCESSED, BODY_TYPES_PROCESSED.HTML.name());
            }
        }
        if (rtfChunk != null) {
            ByteChunk chunk = (ByteChunk) rtfChunk;
            MAPIRtfAttribute rtf =
                    new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(),
                            chunk.getValue());

            byte[] data = rtf.getData();
            if (data != null) {
                Metadata rtfMetadata = Metadata.newInstance(context);
                handleEmbeddedResource(TikaInputStream.get(data), rtfMetadata,
                        "rtf-body", null, null,
                        "application/rtf", xhtml, true);
                extractContentIdNamesFromRtf(data, rtfMetadata, contentIdNames);
                //copy this info into the parent...what else should we copy?
                parentMetadata.add(MAPI.BODY_TYPES_PROCESSED, BODY_TYPES_PROCESSED.RTF.name());
                parentMetadata.set(RTFMetadata.CONTAINS_ENCAPSULATED_HTML,
                        rtfMetadata.get(RTFMetadata.CONTAINS_ENCAPSULATED_HTML));

            }
        }
        if (textChunk != null) {
            byte[] data = getValue(textChunk);
            if (data != null) {
                Metadata chunkMetadata = Metadata.newInstance(context);
                chunkMetadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
                        MediaType.TEXT_PLAIN.toString());
                handleEmbeddedResource(TikaInputStream.get(data), chunkMetadata, null, "text-body",
                        null, MediaType.TEXT_PLAIN.toString(), xhtml, true);
                if (textChunk instanceof StringChunk) {
                    extractContentIdNamesFromText(((StringChunk) textChunk).getValue(), contentIdNames);
                }
                parentMetadata.add(MAPI.BODY_TYPES_PROCESSED, BODY_TYPES_PROCESSED.TEXT.name());
            }
        }

    }

    //can return null!
    private byte[] getValue(Chunk chunk) {
        byte[] data = null;
        if (chunk instanceof ByteChunk) {
            data = ((ByteChunk) chunk).getValue();
        } else if (chunk instanceof StringChunk) {
            data = ((StringChunk) chunk).getRawValue();
        }
        return data;
    }

    private void handleFromTo(Map<String, String[]> headers, Metadata metadata)
            throws ChunkNotFoundException {
        String from = msg.getDisplayFrom();
        metadata.set(TikaCoreProperties.CREATOR, from);
        metadata.set(Metadata.MESSAGE_FROM, from);
        metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo());
        metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC());
        metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC());


        Chunks chunks = msg.getMainChunks();
        StringChunk sentByServerType = chunks.getSentByServerType();
        if (sentByServerType != null) {
            metadata.set(MAPI.SENT_BY_SERVER_TYPE, sentByServerType.getValue());
        }

        Map<MAPIProperty, List<Chunk>> mainChunks = msg.getMainChunks().getAll();

        List<Chunk> senderAddresType = mainChunks.get(MAPIProperty.SENDER_ADDRTYPE);
        String senderAddressTypeString = "";
        if (senderAddresType != null && senderAddresType.size() > 0) {
            senderAddressTypeString = senderAddresType.get(0).toString();
        }

        //sometimes in SMTP .msg files there is an email in the sender name field.

        setFirstChunk(mainChunks.get(MAPIProperty.SENDER_NAME), Message.MESSAGE_FROM_NAME, metadata);
        setFirstChunk(mainChunks.get(MAPIProperty.SENT_REPRESENTING_NAME), MAPI.FROM_REPRESENTING_NAME, metadata);

        setFirstChunk(mainChunks.get(MAPIProperty.SENDER_EMAIL_ADDRESS), Message.MESSAGE_FROM_EMAIL, metadata);
        setFirstChunk(mainChunks.get(MAPIProperty.SENT_REPRESENTING_EMAIL_ADDRESS), MAPI.FROM_REPRESENTING_EMAIL, metadata);

        for (Recipient recipient : buildRecipients()) {
            switch (recipient.recipientType) {
                case TO:
                    addEvenIfNull(Message.MESSAGE_TO_NAME, recipient.name, metadata);
                    addEvenIfNull(Message.MESSAGE_TO_DISPLAY_NAME, recipient.displayName, metadata);
                    addEvenIfNull(Message.MESSAGE_TO_EMAIL, recipient.emailAddress, metadata);
                    break;
                case CC:
                    addEvenIfNull(Message.MESSAGE_CC_NAME, recipient.name, metadata);
                    addEvenIfNull(Message.MESSAGE_CC_DISPLAY_NAME, recipient.displayName, metadata);
                    addEvenIfNull(Message.MESSAGE_CC_EMAIL, recipient.emailAddress, metadata);
                    break;
                case BCC:
                    addEvenIfNull(Message.MESSAGE_BCC_NAME, recipient.name, metadata);
                    addEvenIfNull(Message.MESSAGE_BCC_DISPLAY_NAME, recipient.displayName, metadata);
                    addEvenIfNull(Message.MESSAGE_BCC_EMAIL, recipient.emailAddress, metadata);
                    break;
                default:
                    //log unknown or undefined?
                    break;
            }
        }
    }

    //As of 3.15, POI currently returns header[] by splitting on /\r?\n/
    //this rebuilds headers that are broken up over several lines
    //this also decodes encoded headers.
    private Map<String, String[]> normalizeHeaders(String[] rows) {
        Map<String, String[]> ret = new LinkedHashMap<>();
        if (rows == null) {
            return ret;
        }
        StringBuilder sb = new StringBuilder();
        Map<String, List<String>> headers = new LinkedHashMap();
        Matcher headerKeyMatcher = HEADER_KEY_PAT.matcher("");
        String lastKey = null;
        int consec = 0;
        for (String row : rows) {
            headerKeyMatcher.reset(row);
            if (headerKeyMatcher.find()) {
                if (lastKey != null) {
                    List<String> vals = headers.get(lastKey);
                    vals = (vals == null) ? new ArrayList<>() : vals;
                    vals.add(decodeHeader(sb.toString()));
                    headers.put(lastKey, vals);
                }
                //reset sb
                sb.setLength(0);
                lastKey = headerKeyMatcher.group(1).trim();
                sb.append(headerKeyMatcher.group(2).trim());
                consec = 0;
            } else {
                if (consec > 0) {
                    sb.append("\n");
                }
                sb.append(row);
            }
            consec++;
        }

        //make sure to add the last value
        if (sb.length() > 0 && lastKey != null) {
            List<String> vals = headers.get(lastKey);
            vals = (vals == null) ? new ArrayList<>() : vals;
            vals.add(decodeHeader(sb.toString()));
            headers.put(lastKey, vals);
        }

        //convert to array
        for (Map.Entry<String, List<String>> e : headers.entrySet()) {
            ret.put(e.getKey(), e.getValue().toArray(new String[0]));
        }
        return ret;

    }

    private String decodeHeader(String header) {
        return DecoderUtil.decodeEncodedWords(header, DecodeMonitor.SILENT);
    }

    private void header(XHTMLContentHandler xhtml, String key, String value) throws SAXException {
        if (value != null && value.length() > 0) {
            xhtml.element("dt", key);
            xhtml.element("dd", value);
        }
    }

    /**
     * Tries to identify the correct encoding for 7-bit (non-unicode)
     * strings in the file.
     * <p>Many messages store their strings as unicode, which is
     * nice and easy. Some use one-byte encodings for their
     * strings, but don't always store the encoding anywhere
     * helpful in the file.</p>
     * <p>This method checks for codepage properties, and failing that
     * looks at the headers for the message, and uses these to
     * guess the correct encoding for your file.</p>
     * <p>Bug #49441 has more on why this is needed</p>
     * <p>This is taken verbatim from POI (TIKA-1238)
     * as a temporary workaround to prevent unsupported encoding exceptions</p>
     */
    private void guess7BitEncoding(MAPIMessage msg) {
        Chunks mainChunks = msg.getMainChunks();
        //null check
        if (mainChunks == null) {
            return;
        }

        Map<MAPIProperty, List<PropertyValue>> props = mainChunks.getProperties();
        if (props != null) {
            // First choice is a codepage property
            for (MAPIProperty prop : new MAPIProperty[]{MAPIProperty.MESSAGE_CODEPAGE, MAPIProperty.INTERNET_CPID}) {
                List<PropertyValue> val = props.get(prop);
                if (val != null && val.size() > 0) {
                    int codepage = ((PropertyValue.LongPropertyValue) val.get(0)).getValue();
                    String encoding = null;
                    try {
                        encoding = CodePageUtil.codepageToEncoding(codepage, true);
                    } catch (UnsupportedEncodingException e) {
                        //swallow
                    }
                    if (tryToSet7BitEncoding(msg, encoding)) {
                        return;
                    }
                }
            }
        }

        // Second choice is a charset on a content type header
        try {
            String[] headers = msg.getHeaders();
            if (headers != null && headers.length > 0) {
                // Look for a content type with a charset
                Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE);

                for (String header : headers) {
                    if (header.startsWith("Content-Type")) {
                        Matcher m = p.matcher(header);
                        if (m.matches()) {
                            // Found it! Tell all the string chunks
                            String charset = m.group(1);
                            if (tryToSet7BitEncoding(msg, charset)) {
                                return;
                            }
                        }
                    }
                }
            }
        } catch (ChunkNotFoundException e) {
            //swallow
        }

        // Nothing suitable in the headers, try HTML
        // TODO: do we need to replicate this in Tika? If we wind up
        // parsing the html version of the email, this is duplicative??
        // Or do we need to reset the header strings based on the html
        // meta header if there is no other information?
        try {
            String html = msg.getHtmlBody();
            if (html != null && html.length() > 0) {
                Charset charset = null;
                try (TikaInputStream tis = TikaInputStream.get(html.getBytes(UTF_8))) {
                    List<EncodingResult> encResults =
                            detector.detect(tis, EMPTY_METADATA, context);
                    charset = encResults.isEmpty() ? null : encResults.get(0).getCharset();
                } catch (IOException e) {
                    //swallow
                }
                if (charset != null && tryToSet7BitEncoding(msg, charset.name())) {
                    return;
                }
            }
        } catch (ChunkNotFoundException e) {
            //swallow
        }

        //absolute last resort, try charset detector
        StringChunk text = mainChunks.getTextBodyChunk();
        if (text != null) {
            CharsetDetector detector = new CharsetDetector();
            detector.setText(text.getRawValue());
            CharsetMatch match = detector.detect();
            if (match != null && match.getConfidence() > 35 &&
                    tryToSet7BitEncoding(msg, match.getName())) {
                return;
            }
        }
    }

    private boolean tryToSet7BitEncoding(MAPIMessage msg, String charsetName) {
        if (charsetName == null) {
            return false;
        }

        if (charsetName.equalsIgnoreCase("utf-8")) {
            return false;
        }
        try {
            if (Charset.isSupported(charsetName)) {
                msg.set7BitEncoding(charsetName);
                return true;
            }
        } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
            //swallow
        }
        return false;
    }

    private void writeSelectHeadersInBody(Metadata metadata, MAPIMessage msg, XHTMLContentHandler xhtml)
            throws SAXException, ChunkNotFoundException {
        if (! officeParserConfig.isWriteSelectHeadersInBody()) {
            return;
        }
        String subject = metadata.get(TikaCoreProperties.TITLE);
        subject = (subject == null) ? "" : subject;
        xhtml.element("h1", subject);

        // Output the from and to details in text, as you
        //  often want them in text form for searching
        xhtml.startElement("dl");
        String from = metadata.get(Message.MESSAGE_FROM);
        if (from != null) {
            header(xhtml, "From", from);
        }
        header(xhtml, "To", msg.getDisplayTo());
        header(xhtml, "Cc", msg.getDisplayCC());
        header(xhtml, "Bcc", msg.getDisplayBCC());
        try {
            header(xhtml, "Recipients", msg.getRecipientEmailAddress());
        } catch (ChunkNotFoundException e) {
            //swallow
        }
        xhtml.endElement("dl");
    }

    private List<Recipient> buildRecipients() {
        RecipientChunks[] recipientChunks = msg.getRecipientDetailsChunks();
        if (recipientChunks == null) {
            return Collections.EMPTY_LIST;
        }
        List<Recipient> recipients = new LinkedList<>();

        for (RecipientChunks chunks : recipientChunks) {
            Recipient r = new Recipient();
            r.displayName = (chunks.getRecipientDisplayNameChunk() != null) ?
                    chunks.getRecipientDisplayNameChunk().toString() : null;
            r.name = (chunks.getRecipientNameChunk() != null) ?
                    chunks.getRecipientNameChunk().toString() :
                    null;
            r.emailAddress = chunks.getRecipientEmailAddress();
            List<PropertyValue> vals = chunks.getProperties().get(MAPIProperty.RECIPIENT_TYPE);

            RECIPIENT_TYPE recipientType = RECIPIENT_TYPE.UNSPECIFIED;
            if (vals != null && vals.size() > 0) {
                Object val = vals.get(0).getValue();
                if (val instanceof Integer) {
                    recipientType = RECIPIENT_TYPE.getTypeFromVal((int) val);
                }
            }
            r.recipientType = recipientType;

            vals = chunks.getProperties().get(MAPIProperty.ADDRTYPE);
            if (vals != null && vals.size() > 0) {
                String val = vals.get(0).toString();
                if (val != null) {
                    val = val.toLowerCase(Locale.US);
                    //need to find example of this for testing
                    if (val.equals("ex")) {
                        r.addressType = ADDRESS_TYPE.EX;
                    } else if (val.equals("smtp")) {
                        r.addressType = ADDRESS_TYPE.SMTP;
                    }
                }
            }
            recipients.add(r);
        }
        return recipients;
    }

    public enum RECIPIENT_TYPE {
        TO(1), CC(2), BCC(3), UNRECOGNIZED(-1), UNSPECIFIED(-1);

        private final int val;

        RECIPIENT_TYPE(int val) {
            this.val = val;
        }

        public static RECIPIENT_TYPE getTypeFromVal(int val) {
            //mild hackery, clean up
            if (val > 0 && val < 4) {
                return RECIPIENT_TYPE.values()[val - 1];
            }
            return UNRECOGNIZED;
        }
    }


    private enum ADDRESS_TYPE {
        EX, SMTP
    }

    private static class Recipient {
        String name;
        String displayName;
        RECIPIENT_TYPE recipientType;
        String emailAddress;
        ADDRESS_TYPE addressType;
    }
}