MailContentHandler.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.mail;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Stack;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
import org.apache.james.mime4j.MimeException;
import org.apache.james.mime4j.codec.DecodeMonitor;
import org.apache.james.mime4j.codec.DecoderUtil;
import org.apache.james.mime4j.dom.address.Address;
import org.apache.james.mime4j.dom.address.AddressList;
import org.apache.james.mime4j.dom.address.Mailbox;
import org.apache.james.mime4j.dom.address.MailboxList;
import org.apache.james.mime4j.dom.field.AddressListField;
import org.apache.james.mime4j.dom.field.MailboxListField;
import org.apache.james.mime4j.dom.field.ParsedField;
import org.apache.james.mime4j.dom.field.UnstructuredField;
import org.apache.james.mime4j.field.LenientFieldParser;
import org.apache.james.mime4j.message.MaximalBodyDescriptor;
import org.apache.james.mime4j.parser.ContentHandler;
import org.apache.james.mime4j.stream.BodyDescriptor;
import org.apache.james.mime4j.stream.Field;
import org.xml.sax.SAXException;

import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.csv.TextAndCSVParser;
import org.apache.tika.parser.html.JSoupParser;
import org.apache.tika.parser.mailcommons.MailDateParser;
import org.apache.tika.parser.mailcommons.MailUtil;
import org.apache.tika.parser.txt.TXTParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.StringUtils;

/**
 * Bridge between mime4j's content handler and the generic SAX content handler
 * used by Tika. See
 * http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/ContentHandler.html
 */
class MailContentHandler implements ContentHandler {

    //TODO -- specific handling for other multipart subtypes?  mixed, parallel, digest
    private static final String MULTIPART_ALTERNATIVE = "multipart/alternative";

    // SAX sink that receives the XHTML events written for the message
    private final XHTMLContentHandler handler;
    // metadata of the top-level message; filled from the headers seen in field()
    private final Metadata metadata;
    private final ParseContext parseContext;
    // when true, multipart/alternative parts are not buffered: every body is
    // handed to the embedded document extractor as it arrives
    private final boolean extractAllAlternatives;
    private final EmbeddedDocumentExtractor extractor;
    private final Detector detector;
    // when true, runtime exceptions raised while handling a header are rethrown by field()
    private final boolean strictParsing;
    //this is used to buffer a multipart body that
    //keeps track of multipart/alternative and its children;
    //it is empty whenever no multipart/alternative is being buffered
    private final Stack<Part> alternativePartBuffer = new Stack<>();
    // descriptors of the multiparts that are currently open, innermost on top
    private final Stack<BodyDescriptor> parts = new Stack<>();
    /**
     * Creates a handler that writes to the given XHTML handler and metadata.
     *
     * @param xhtml                  handler that receives the generated XHTML events
     * @param detector               detector used when a body part declares no content type
     * @param metadata               metadata of the whole message
     * @param context                parse context; also the source of the
     *                               {@link EmbeddedDocumentExtractor} used for parts/attachments
     * @param strictParsing          whether runtime exceptions raised while handling a header
     *                               are rethrown
     * @param extractAllAlternatives whether buffering of multipart/alternative parts is skipped
     */
    MailContentHandler(XHTMLContentHandler xhtml, Detector detector, Metadata metadata,
                       ParseContext context, boolean strictParsing,
                       boolean extractAllAlternatives) {
        this.handler = xhtml;
        this.detector = detector;
        this.metadata = metadata;
        this.parseContext = context;
        this.strictParsing = strictParsing;
        this.extractAllAlternatives = extractAllAlternatives;

        // The extractor that handles/processes the parts/attachments is
        // looked up through the parse context.
        this.extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
    }

    /**
     * Handles the content of one body.
     * <p>
     * A fresh metadata object is created for the body. Depending on the current state the
     * content is then either buffered as a child of the multipart/alternative being collected,
     * parsed inline (text at the first level of embedding), or passed to the embedded document
     * extractor.
     *
     * @param body descriptor of the body
     * @param is   stream with the body content
     * @throws MimeException if handling the content fails
     * @throws IOException   if the content cannot be read
     */
    @Override
    public void body(BodyDescriptor body, InputStream is) throws MimeException, IOException {
        // A separate metadata object carries the mime type of the
        // sub part so that the main metadata is left untouched.
        final Metadata partMetadata = Metadata.newInstance(parseContext);
        partMetadata.set(Metadata.CONTENT_TYPE, body.getMimeType());
        partMetadata.set(Metadata.CONTENT_ENCODING, body.getCharset());

        // TIKA-2455: flag the containing type.
        if (!parts.isEmpty()) {
            final BodyDescriptor container = parts.peek();
            partMetadata.set(Message.MULTIPART_SUBTYPE, container.getSubType());
            partMetadata.set(Message.MULTIPART_BOUNDARY, container.getBoundary());
        }
        if (body instanceof MaximalBodyDescriptor) {
            handleMaximalBodyDescriptor((MaximalBodyDescriptor) body, partMetadata);
        }

        // inside a multipart/alternative (or one of its children)?
        final boolean bufferingAlternative =
                !extractAllAlternatives && !alternativePartBuffer.isEmpty();
        // at the first level of embedding?
        final boolean firstLevel = !extractAllAlternatives && parts.size() < 2;

        if (!bufferingAlternative && !firstLevel) {
            // handle as any other embedded content, straight from the stream
            try (TikaInputStream tis = TikaInputStream.get(is)) {
                handleEmbedded(tis, partMetadata);
            }
            return;
        }

        // both remaining cases need the complete content in memory
        final UnsynchronizedByteArrayOutputStream buffer =
                UnsynchronizedByteArrayOutputStream.builder().get();
        IOUtils.copy(is, buffer);
        final byte[] content = buffer.toByteArray();

        if (bufferingAlternative) {
            // attach the body to the part that was opened most recently
            alternativePartBuffer.peek().children.add(new BodyContents(partMetadata, content));
        } else if (detectInlineTextOrHtml(partMetadata, content)) {
            // text goes into the body of the email
            handleInlineBodyPart(new BodyContents(partMetadata, content));
        } else {
            // everything else is treated as a regular attachment
            try (TikaInputStream tis = TikaInputStream.get(content)) {
                handleEmbedded(tis, partMetadata);
            }
        }
    }

    /**
     * Copies attachment related information from a part's descriptor into the part's metadata:
     * the resource name, the embedded resource type, creation/modification dates and a
     * reconstructed Content-Disposition value.
     * <p>
     * The resource name is taken from the {@code name} parameter of the Content-Type field and
     * is overridden by the Content-Disposition file name when that is not blank. The name is
     * recorded even when the part has no Content-Disposition type, so that a part declared as
     * {@code Content-Type: application/pdf; name=blah.pdf} keeps its name.
     *
     * @param body  descriptor of the part
     * @param submd metadata of the part; updated in place
     */
    private void handleMaximalBodyDescriptor(MaximalBodyDescriptor body, Metadata submd) {
        //the embedded file name can be in the content disposition field
        //or a parameter on the content type field as in:
        // Content-Type: application/pdf; name=blah.pdf
        //Or it can be in both
        //not sure we need this defensive null check?
        Map<String, String> contentTypeParameters = body.getContentTypeParameters();
        if (contentTypeParameters != null) {
            String contentTypeName = contentTypeParameters.get("name");
            if (!StringUtils.isBlank(contentTypeName)) {
                submd.set(TikaCoreProperties.RESOURCE_NAME_KEY, contentTypeName);
            }
        }
        String contentDispositionFileName = body.getContentDispositionFilename();
        if (!StringUtils.isBlank(contentDispositionFileName)) {
            //prefer the content disposition file name over the "name" param in the content-type
            submd.set(TikaCoreProperties.RESOURCE_NAME_KEY, contentDispositionFileName);
        }

        String contentDispositionType = body.getContentDispositionType();
        if (contentDispositionType == null || contentDispositionType.isEmpty()) {
            //no disposition type: nothing further to record
            return;
        }

        if ("attachment".equalsIgnoreCase(contentDispositionType)) {
            submd.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                    TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
        } else if ("inline".equalsIgnoreCase(contentDispositionType)) {
            submd.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                    TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
        }

        //rebuild the field value as: type; key="value"; key="value"
        StringBuilder contentDisposition = new StringBuilder(contentDispositionType);
        Map<String, String> contentDispositionParameters =
                body.getContentDispositionParameters();
        for (Entry<String, String> param : contentDispositionParameters.entrySet()) {
            contentDisposition.append("; ").append(param.getKey()).append("=\"")
                    .append(param.getValue()).append('"');
            if ("creation-date".equalsIgnoreCase(param.getKey())) {
                tryToAddDate(param.getValue(), TikaCoreProperties.CREATED, submd);
            } else if ("modification-date".equalsIgnoreCase(param.getKey())) {
                tryToAddDate(param.getValue(), TikaCoreProperties.MODIFIED, submd);
            }
            //do anything with "size"?
        }
        submd.set(Metadata.CONTENT_DISPOSITION, contentDisposition.toString());
    }

    /**
     * Parses {@code value} leniently as a date and stores it under {@code property}.
     * The metadata is left untouched when the parser returns {@code null}.
     *
     * @param value    textual date
     * @param property property to set
     * @param metadata metadata to update
     */
    private void tryToAddDate(String value, Property property, Metadata metadata) {
        final Date parsed = MailDateParser.parseDateLenient(value);
        if (parsed == null) {
            return;
        }
        metadata.set(property, parsed);
    }

    /**
     * Decides whether a body part should be treated as inline text.
     * <p>
     * A part flagged as ATTACHMENT is never inline. Otherwise the declared content type decides
     * (it has to start with {@code "text"}). Only when no content type is declared is the
     * detector run on the bytes; its result is stored as the parser override so that detection
     * happens only once.
     *
     * @param submd metadata of the part; may receive the content type parser override
     * @param bytes content of the part
     * @return {@code true} if the part is text, {@code false} otherwise (including when
     * detection fails with an {@link IOException})
     */
    private boolean detectInlineTextOrHtml(Metadata submd, byte[] bytes) {
        final String attachmentMarker =
                TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString();
        if (attachmentMarker.equals(submd.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE))) {
            return false;
        }

        final String declaredType = submd.get(Metadata.CONTENT_TYPE);
        if (declaredType != null) {
            return declaredType.startsWith("text");
        }

        try (TikaInputStream tis = TikaInputStream.get(bytes)) {
            final MediaType detected = detector.detect(tis, submd, parseContext);
            if (detected == null) {
                return false;
            }
            //detect only once
            final String detectedType = detected.toString();
            submd.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, detectedType);
            return detectedType.startsWith("text");
        } catch (IOException e) {
            //swallow
            return false;
        }
    }

    /**
     * Hands a part to the embedded document extractor.
     * <p>
     * A part that carries no embedded resource type yet is marked as ATTACHMENT first. The
     * extractor is only invoked if it reports that the part should be parsed.
     *
     * @param tis      content of the part
     * @param metadata metadata of the part
     * @throws MimeException wrapping any {@link SAXException} raised by the extractor
     * @throws IOException   if reading the content fails
     */
    private void handleEmbedded(TikaInputStream tis, Metadata metadata)
            throws MimeException, IOException {

        final String resourceType = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
        if (resourceType == null) {
            metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                    TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
        }

        try {
            if (!extractor.shouldParseEmbedded(metadata)) {
                return;
            }
            extractor.parseEmbedded(tis, handler, metadata, parseContext, false);
        } catch (SAXException e) {
            throw new MimeException(e);
        }
    }

    /**
     * Closes the {@code p} and {@code div} elements of a body part, unless a
     * multipart/alternative is currently being buffered.
     *
     * @throws MimeException wrapping any {@link SAXException} raised by the handler
     */
    @Override
    public void endBodyPart() throws MimeException {
        //only write </p></div> when we are not
        //buffering for a multipart/alternative
        if (alternativePartBuffer.isEmpty()) {
            try {
                handler.endElement("p");
                handler.endElement("div");
            } catch (SAXException e) {
                throw new MimeException(e);
            }
        }
    }

    /**
     * No-op: this handler takes no action for the {@code endHeader} callback.
     */
    @Override
    public void endHeader() throws MimeException {
    }

    /**
     * No-op: this handler takes no action for the {@code startMessage} callback.
     */
    @Override
    public void startMessage() throws MimeException {
    }

    /**
     * No-op: this handler takes no action for the {@code endMessage} callback.
     */
    @Override
    public void endMessage() throws MimeException {
    }

    /**
     * Closes the innermost open multipart.
     * <p>
     * While a multipart/alternative is being buffered, the most recently opened buffered part
     * is removed; if that part was the root of the buffer, its best parts are written out.
     *
     * @throws MimeException if writing the best parts fails, or wrapping an {@link IOException}
     *                       raised while doing so
     */
    @Override
    public void endMultipart() throws MimeException {

        if (!alternativePartBuffer.isEmpty()) {
            final Part finished = alternativePartBuffer.pop();
            if (alternativePartBuffer.isEmpty()) {
                // that was the root of the buffered multipart/alternative
                try {
                    handleBestParts(finished);
                } catch (IOException e) {
                    throw new MimeException(e);
                }
            }
        }
        //parts should have something at this point;
        //if it doesn't, there's a problem with the file
        //e.g. more endMultiPart than startMultipart
        //we're currently silently swallowing this
        if (!parts.isEmpty()) {
            parts.pop();
        }
    }

    /**
     * No-op: the supplied stream is not read.
     */
    @Override
    public void epilogue(InputStream is) throws MimeException, IOException {
    }

    /**
     * Header for the whole message or its parts.
     * <p>
     * Only headers seen while no multipart is open are processed; they are copied into the
     * metadata of the whole message. From, Subject, To, Cc, Bcc, Content-Type and Date get
     * dedicated handling, every other header is stored under the raw header prefix.
     * <p>
     * Runtime exceptions raised while processing a header are rethrown only when strict
     * parsing is enabled; otherwise the header is skipped.
     *
     * @param field the header field
     * @throws MimeException if the field cannot be processed
     * @see org.apache.james.mime4j.stream.Field
     */
    @Override
    public void field(Field field) throws MimeException {
        // if we're in a part, skip.
        // We want to gather only the metadata for the whole msg.
        if (parts.size() > 0) {
            return;
        }

        try {
            String fieldname = field.getName();

            ParsedField parsedField =
                    LenientFieldParser.getParser().parse(field, DecodeMonitor.SILENT);
            if (fieldname.equalsIgnoreCase("From")) {
                MailboxListField fromField = (MailboxListField) parsedField;
                MailboxList mailboxList = fromField.getMailboxList();
                if (fromField.isValidField() && mailboxList != null) {
                    for (Address address : mailboxList) {
                        String from = getDisplayString(address);
                        MailUtil.setPersonAndEmail(from, Message.MESSAGE_FROM_NAME,
                                Message.MESSAGE_FROM_EMAIL, metadata);
                        metadata.add(Metadata.MESSAGE_FROM, from);
                        metadata.add(TikaCoreProperties.CREATOR, from);
                    }
                } else {
                    // unparseable From: fall back to the raw text after the field name
                    String from = stripOutFieldPrefix(field, "From:");
                    MailUtil.setPersonAndEmail(from, Message.MESSAGE_FROM_NAME,
                            Message.MESSAGE_FROM_EMAIL, metadata);

                    if (from.startsWith("<")) {
                        from = from.substring(1);
                    }
                    if (from.endsWith(">")) {
                        from = from.substring(0, from.length() - 1);
                    }
                    metadata.add(Metadata.MESSAGE_FROM, from);
                    metadata.add(TikaCoreProperties.CREATOR, from);
                }
            } else if (fieldname.equalsIgnoreCase("Subject")) {
                String txt = ((UnstructuredField) parsedField).getValue();
                metadata.set(TikaCoreProperties.TITLE, txt);
                metadata.set(TikaCoreProperties.SUBJECT, txt);
                metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, txt + ".eml");
            } else if (fieldname.equalsIgnoreCase("To")) {
                processAddressList(parsedField, "To:", Metadata.MESSAGE_TO);
            } else if (fieldname.equalsIgnoreCase("CC")) {
                processAddressList(parsedField, "Cc:", Metadata.MESSAGE_CC);
            } else if (fieldname.equalsIgnoreCase("BCC")) {
                processAddressList(parsedField, "Bcc:", Metadata.MESSAGE_BCC);
            } else if (fieldname.equalsIgnoreCase("Content-Type")) {
                // MediaType.parse returns null for a value it cannot parse; such a
                // value is kept as a raw header instead of failing on a null dereference
                final MediaType contentType = MediaType.parse(parsedField.getBody());

                if (contentType != null && contentType.getType().equalsIgnoreCase("multipart")) {
                    metadata.set(Message.MULTIPART_SUBTYPE, contentType.getSubtype());
                    metadata.set(Message.MULTIPART_BOUNDARY,
                            contentType.getParameters().get("boundary"));
                } else {
                    metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX + parsedField.getName(),
                            field.getBody());
                }
            } else if (fieldname.equalsIgnoreCase("Date")) {
                String dateBody = parsedField.getBody();
                try {
                    Date date = MailDateParser.parseDateLenient(dateBody);
                    // skip an unparseable date rather than passing null on to the metadata
                    if (date != null) {
                        metadata.set(TikaCoreProperties.CREATED, date);
                    }
                } catch (SecurityException e) {
                    throw e;
                } catch (Exception e) {
                    //swallow
                }
            } else {
                metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX + parsedField.getName(),
                        field.getBody());
            }
        } catch (RuntimeException me) {
            if (strictParsing) {
                throw me;
            }
        }
    }

    /**
     * Adds every address of an address list header (To, Cc, Bcc) to the message metadata.
     * <p>
     * When the field is valid and yields an address list, each address is added in its display
     * form. Otherwise the raw text after the field name is split on commas and each trimmed
     * piece is added.
     *
     * @param field           the parsed field; has to be an {@link AddressListField}
     * @param addressListType field name including the colon, e.g. {@code "To:"}; used to strip
     *                        the prefix from the raw field in the fallback case
     * @param metadataField   name of the metadata key that receives the addresses
     * @throws MimeException declared for callers; not raised by this implementation
     */
    private void processAddressList(ParsedField field, String addressListType, String metadataField)
            throws MimeException {
        AddressListField toField = (AddressListField) field;
        AddressList addressList = toField.getAddressList();
        // same guard as the From header: a null list is handled by the fallback
        if (toField.isValidField() && addressList != null) {
            for (Address address : addressList) {
                metadata.add(metadataField, getDisplayString(address));
            }
        } else {
            String to = stripOutFieldPrefix(field, addressListType);
            for (String eachTo : to.split(",")) {
                metadata.add(metadataField, eachTo.trim());
            }
        }
    }

    /**
     * Builds the display form of an address.
     * <p>
     * A mailbox with a non-empty name becomes {@code name <address>} with encoded words in the
     * name decoded; a mailbox without a name becomes its bare address. Any other kind of
     * address is rendered through {@code toString()}.
     *
     * @param address the address to render
     * @return the display string
     */
    private String getDisplayString(Address address) {
        if (!(address instanceof Mailbox)) {
            return address.toString();
        }
        final Mailbox mailbox = (Mailbox) address;
        final String rawName = mailbox.getName();
        if (rawName == null || rawName.isEmpty()) {
            return mailbox.getAddress();
        }
        final String decodedName = DecoderUtil.decodeEncodedWords(rawName, DecodeMonitor.SILENT);
        return decodedName + " <" + mailbox.getAddress() + ">";
    }

    /**
     * No-op: the supplied stream is not read.
     */
    @Override
    public void preamble(InputStream is) throws MimeException, IOException {
    }

    /**
     * No-op: the supplied stream is not read.
     */
    @Override
    public void raw(InputStream is) throws MimeException, IOException {
    }

    /**
     * Opens a {@code div} with class {@code email-entry} and a {@code p} element for a body
     * part, unless a multipart/alternative is currently being buffered.
     *
     * @throws MimeException wrapping any {@link SAXException} raised by the handler
     */
    @Override
    public void startBodyPart() throws MimeException {
        //only write <div><p> when we are not
        //buffering for a multipart/alternative
        if (alternativePartBuffer.isEmpty()) {
            try {
                handler.startElement("div", "class", "email-entry");
                handler.startElement("p");
            } catch (SAXException e) {
                throw new MimeException(e);
            }
        }
    }

    /**
     * No-op: this handler takes no action for the {@code startHeader} callback.
     */
    @Override
    public void startHeader() throws MimeException {
        // nothing to do

    }

    /**
     * Records the start of a multipart.
     * <p>
     * The descriptor is always pushed onto the stack of open multiparts. Unless all
     * alternatives are extracted, a multipart/alternative starts a new buffer, and any multipart
     * opened while a buffer is active is added to that buffer as a child of the part opened
     * before it.
     *
     * @param descr descriptor of the multipart
     */
    @Override
    public void startMultipart(BodyDescriptor descr) throws MimeException {
        parts.push(descr);
        if (extractAllAlternatives) {
            return;
        }

        final boolean buffering = !alternativePartBuffer.isEmpty();
        if (!buffering && !MULTIPART_ALTERNATIVE.equalsIgnoreCase(descr.getMimeType())) {
            // neither inside nor starting a multipart/alternative
            return;
        }

        final Part opened = new Part(descr);
        if (buffering) {
            // hook the new part into its parent; the stack only ever holds
            // non-null parts, so the top element can be used directly
            alternativePartBuffer.peek().children.add(opened);
        }
        //add the part to the stack
        alternativePartBuffer.push(opened);
    }

    /**
     * Returns the raw text of a field with the leading field name and the spaces that follow
     * it removed.
     * <p>
     * The prefix is skipped by length only: the first {@code fieldname.length()} characters
     * are dropped without comparing them. If the raw text ends with the prefix, or only spaces
     * follow it, an empty string is returned.
     *
     * @param field     the field whose raw text is read
     * @param fieldname the prefix to skip, including the colon, e.g. {@code "From:"}
     * @return the remainder of the raw field; never {@code null}
     */
    private String stripOutFieldPrefix(Field field, String fieldname) {
        String temp = field.getRaw().toString();
        // never start beyond the end of the raw text
        int loc = Math.min(fieldname.length(), temp.length());
        // stop at the end of the text instead of reading past it
        while (loc < temp.length() && temp.charAt(loc) == ' ') {
            loc++;
        }
        return temp.substring(loc);
    }

    /**
     * Writes out the buffered content below {@code part}.
     * <p>
     * Body contents are parsed inline. For a multipart/alternative only the child with the
     * highest score is followed (the first one wins a tie); for any other multipart every child
     * is followed in order.
     *
     * @param part root of the buffered tree, may be {@code null}
     * @throws MimeException if handling a body fails
     * @throws IOException   if reading a body fails
     */
    private void handleBestParts(Part part) throws MimeException, IOException {
        if (part == null) {
            return;
        }
        if (part instanceof BodyContents) {
            handleInlineBodyPart((BodyContents) part);
            return;
        }

        if (!MULTIPART_ALTERNATIVE.equalsIgnoreCase(part.bodyDescriptor.getMimeType())) {
            // not an alternative: every child is wanted
            for (Part child : part.children) {
                handleBestParts(child);
            }
            return;
        }

        // alternative: keep the first child with the highest score
        Part winner = null;
        int winningScore = -1;
        for (Part candidate : part.children) {
            final int candidateScore = score(candidate);
            if (candidateScore > winningScore) {
                winningScore = candidateScore;
                winner = candidate;
            }
        }
        handleBestParts(winner);
    }

    /**
     * Parses a text body part straight into the message's XHTML output.
     * <p>
     * For {@code text/html} an existing {@link JSoupParser} is looked up, for {@code text/plain}
     * an existing {@link TXTParser} and, failing that, a {@link TextAndCSVParser}. If no parser
     * is found the part is handled like any other embedded document.
     *
     * @param part the buffered body part
     * @throws MimeException wrapping any {@link SAXException} or {@link TikaException} raised
     *                       while parsing
     * @throws IOException   if reading the content fails
     */
    private void handleInlineBodyPart(BodyContents part) throws MimeException, IOException {
        final String declaredType = part.metadata.get(Metadata.CONTENT_TYPE);
        Parser inlineParser = null;
        boolean forcePlainText = false;
        if (MediaType.TEXT_HTML.toString().equalsIgnoreCase(declaredType)) {
            inlineParser = EmbeddedDocumentUtil
                    .tryToFindExistingLeafParser(JSoupParser.class, parseContext);
        } else if (MediaType.TEXT_PLAIN.toString().equalsIgnoreCase(declaredType)) {
            inlineParser = EmbeddedDocumentUtil
                    .tryToFindExistingLeafParser(TXTParser.class, parseContext);
            if (inlineParser == null) {
                inlineParser = EmbeddedDocumentUtil
                        .tryToFindExistingLeafParser(TextAndCSVParser.class, parseContext);
                forcePlainText = true;
            }
        }

        if (inlineParser == null) {
            //back off and treat it as an embedded chunk
            try (TikaInputStream tis = TikaInputStream.get(part.bytes)) {
                handleEmbedded(tis, part.metadata);
            }
            return;
        }

        //parse inline, with metadata that is separate from the part's own
        final Metadata inlineMetadata = Metadata.newInstance(parseContext);
        if (forcePlainText) {
            inlineMetadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
                    MediaType.TEXT_PLAIN.toString());
        }
        try (TikaInputStream tis = TikaInputStream.get(part.bytes)) {
            inlineParser.parse(tis,
                    new EmbeddedContentHandler(new BodyContentHandler(handler)),
                    inlineMetadata, parseContext);
        } catch (SAXException | TikaException e) {
            throw new MimeException(e);
        }
    }

    /**
     * Ranks a part for selection within a multipart/alternative; higher is better.
     *
     * @param part the candidate, may be {@code null}
     * @return 0 for {@code null} or a body without a content type, 1 for text/plain, 2 for
     * application/rtf, 3 for text/html, and 4 for everything else (bodies of any other type
     * as well as parts that are not body contents)
     */
    private int score(Part part) {
        if (part == null) {
            return 0;
        }
        if (!(part instanceof BodyContents)) {
            return 4;
        }
        final String declaredType = ((BodyContents) part).metadata.get(Metadata.CONTENT_TYPE);
        if (declaredType == null) {
            return 0;
        }
        if (MediaType.TEXT_PLAIN.toString().equalsIgnoreCase(declaredType)) {
            return 1;
        }
        //TODO -- is this the right definition in rfc822 for rich text?!
        if ("application/rtf".equalsIgnoreCase(declaredType)) {
            return 2;
        }
        if (MediaType.TEXT_HTML.toString().equalsIgnoreCase(declaredType)) {
            return 3;
        }
        return 4;
    }

    /**
     * Node of the tree that is built while a multipart/alternative is buffered: a body
     * descriptor together with the parts collected below it.
     */
    private static class Part {
        private final BodyDescriptor bodyDescriptor;
        private final List<Part> children = new ArrayList<>();

        public Part(BodyDescriptor bodyDescriptor) {
            this.bodyDescriptor = bodyDescriptor;
        }

        @Override
        public String toString() {
            final StringBuilder text = new StringBuilder("Part{");
            text.append("bodyDescriptor=").append(bodyDescriptor);
            text.append(", children=").append(children);
            return text.append('}').toString();
        }
    }

    /**
     * Leaf of the buffered tree: the bytes of one body together with its metadata.
     * It is created without a body descriptor.
     */
    private static class BodyContents extends Part {
        private final Metadata metadata;
        private final byte[] bytes;

        private BodyContents(Metadata metadata, byte[] bytes) {
            // a leaf has no descriptor of its own
            super(null);
            this.bytes = bytes;
            this.metadata = metadata;
        }
    }
}