EMFParser.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft;

import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.Set;
import java.util.function.Supplier;

import org.apache.poi.hemf.record.emf.HemfComment;
import org.apache.poi.hemf.record.emf.HemfRecord;
import org.apache.poi.hemf.record.emf.HemfRecordType;
import org.apache.poi.hemf.record.emf.HemfText;
import org.apache.poi.hemf.usermodel.HemfPicture;
import org.apache.poi.util.RecordFormatException;
import org.apache.poi.util.StringUtil;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import org.apache.tika.config.TikaComponent;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;

/**
 * Extracts files embedded in EMF and offers a
 * very rough capability to extract text if there
 * is text stored in the EMF.
 * <p>
 * To improve text extraction, we'd have to implement
 * quite a bit more at the POI level. We'd want to track changes
 * in font and use that information for identifying character sets,
 * inserting spaces and new lines.
 * <p>
 * We're also relying on storage order for text order, which isn't great.
 * We'd have to do something like what PDFBox or XPS do to sort the
 * runs and then put the cow back together from the hamburger...lol...
 */
@TikaComponent
public class EMFParser implements Parser {

    /**
     * Set to {@code true} on the parsed document's metadata when an icon string
     * (see {@link #EMF_ICON_STRING}) was found.
     */
    public static final Property EMF_ICON_ONLY = Property.internalBoolean("emf:iconOnly");

    /**
     * The string read from the generic comment record that immediately follows the
     * first "IconOnly" comment record.
     */
    public static final Property EMF_ICON_STRING = Property.internalText("emf:iconString");

    /** Marker text of the generic comment record that precedes the icon string. */
    private static final String ICON_ONLY = "IconOnly";

    private static final MediaType MEDIA_TYPE = MediaType.image("emf");
    private static final MediaType WMF_MEDIA_TYPE = MediaType.image("wmf");

    private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MEDIA_TYPE);

    /**
     * Horizontal gap between two text runs above which a space is inserted.
     * TODO: derive this from the font or frame/bounds information.
     */
    private static final double FUDGE_FACTOR_X = 10.0d;

    /** Difference in minY above which two text runs are treated as separate paragraphs. */
    private static final double SAME_LINE_TOLERANCE = 0.0001d;

    /** Magnitude below which a rectangle coordinate or dimension is treated as zero. */
    private static final double ZERO_TOLERANCE = 0.000000001d;

    /**
     * Hands one raw embedded payload to the embedded document extractor.
     * <p>
     * This is best-effort: an {@link IOException} raised while reading or closing the
     * payload is dropped so that the caller can carry on with the next payload.
     *
     * @param data                      raw bytes of the embedded payload; {@code null} is skipped
     * @param embeddedDocumentExtractor extractor that decides on and performs the parse
     * @param handler                   handler of the containing document
     * @param context                   the current parse context
     */
    private static void handleEmbedded(byte[] data,
                                       EmbeddedDocumentExtractor embeddedDocumentExtractor,
                                       ContentHandler handler, ParseContext context)
            throws TikaException, SAXException {
        if (data == null) {
            return;
        }
        try (TikaInputStream tis = TikaInputStream.get(data)) {
            Metadata embeddedMetadata = Metadata.newInstance(context);
            if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
                embeddedDocumentExtractor.parseEmbedded(tis, new EmbeddedContentHandler(handler),
                        embeddedMetadata, context, true);
            }
        } catch (IOException ignored) {
            // deliberately swallowed: keep going with whatever else is in the EMF
        }
    }

    /**
     * Emits the buffered text as a single {@code <p>} element and clears the buffer.
     * Nothing is emitted when the buffer is empty.
     */
    private static void flushParagraph(StringBuilder buffer, XHTMLContentHandler xhtml)
            throws SAXException {
        if (buffer.length() == 0) {
            return;
        }
        xhtml.startElement("p");
        xhtml.characters(buffer.toString());
        xhtml.endElement("p");
        buffer.setLength(0);
    }

    /**
     * Returns the embedded document extractor cached on the parse state, looking it up
     * from the parse context on first use.
     */
    private static EmbeddedDocumentExtractor getExtractor(ParseState parseState,
                                                          ParseContext context) {
        if (parseState.extractor == null) {
            parseState.extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
        }
        return parseState.extractor;
    }

    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Iterates over the EMF records in storage order, passing embedded content found in
     * comment records to the embedded document extractor and writing the text of
     * extTextOutW/extTextOutA records as paragraphs.
     *
     * @throws TikaException if POI throws a {@link RecordFormatException} or any other
     *                       {@link RuntimeException} is raised while processing the records
     */
    @Override
    public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata,
                      ParseContext context) throws IOException, SAXException, TikaException {

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context);
        xhtml.startDocument();
        try {
            HemfPicture ex = new HemfPicture(tis);
            ParseState parseState = new ParseState();
            StringBuilder buffer = new StringBuilder();
            //iterate through the records.  if you hit IconOnly in a comment
            //and it is the first IconOnly, grab the string in the next comment record
            //and that'll be the full name of the file.

            //NOTE that we're just scraping the text out in storage order. The proper way to do this
            //is to sort the text records by x,y like we do for PDFs and xps
            for (HemfRecord record : ex) {
                parseState.isIconOnly = false;
                HemfRecordType type = record.getEmfRecordType();
                if (type == HemfRecordType.comment) {
                    handleCommentData(((HemfComment.EmfComment) record).getCommentData(),
                            parseState, xhtml, context);
                } else if (type == HemfRecordType.extTextOutW) {
                    handleExtTextOut((HemfText.EmfExtTextOutW) record, parseState, buffer, xhtml,
                            FUDGE_FACTOR_X, StandardCharsets.UTF_16LE);
                } else if (type == HemfRecordType.extTextOutA) {
                    //do something better than assigning utf8.
                    handleExtTextOut((HemfText.EmfExtTextOutA) record, parseState, buffer, xhtml,
                            FUDGE_FACTOR_X, StandardCharsets.UTF_8);
                }
                //only the record directly after the IconOnly marker may supply the icon string
                parseState.lastWasIconOnly = parseState.isIconOnly;
            }
            if (parseState.iconOnlyString != null) {
                metadata.set(EMF_ICON_ONLY, true);
                metadata.set(EMF_ICON_STRING, parseState.iconOnlyString);
            }
            //write out whatever text is still pending
            flushParagraph(buffer, xhtml);

        } catch (RecordFormatException e) {
            //POI's hemf parser can throw these for "parse exceptions"
            throw new TikaException(e.getMessage(), e);
        } catch (RuntimeException e) {
            //convert any other runtime exception to a TikaException
            throw new TikaException(e.getMessage(), e);
        }
        xhtml.endDocument();
    }

    /**
     * Appends the text of one text record to the buffer. If the record's minY differs
     * from that of the previous text record, the buffered text is first written out as
     * a paragraph; otherwise a space is inserted when the horizontal gap to the previous
     * record exceeds {@code fudgeFactorX}.
     *
     * @param record       the text record (EmfExtTextOutW records are passed here as well)
     * @param parseState   state carried across records; its last rectangle is updated
     * @param buffer       text accumulated for the current paragraph
     * @param xhtml        handler that receives completed paragraphs
     * @param fudgeFactorX horizontal gap above which a space is inserted
     * @param charset      charset passed to the record to decode its text
     */
    private void handleExtTextOut(HemfText.EmfExtTextOutA record, ParseState parseState,
                                  StringBuilder buffer, XHTMLContentHandler xhtml, double fudgeFactorX,
                                  Charset charset) throws IOException, SAXException {
        Rectangle2D currRectangle = getCurrentRectangle(record);
        Rectangle2D lastRectangle = parseState.lastRectangle;
        //null means "no text record seen yet"; a coordinate sentinel would collide
        //with records that legitimately sit at negative coordinates
        if (lastRectangle != null) {
            if (deltaGreaterThan(lastRectangle.getMinY(), currRectangle.getMinY(),
                    SAME_LINE_TOLERANCE)) {
                flushParagraph(buffer, xhtml);
            } else if (deltaGreaterThan(currRectangle.getMinX(), lastRectangle.getMaxX(),
                    fudgeFactorX)) {
                buffer.append(' ');
            }
        }
        //do something better than this
        buffer.append(record.getText(charset));
        parseState.lastRectangle = currRectangle;
    }

    /**
     * @return {@code true} if {@code a} and {@code b} differ by more than {@code delta}
     */
    private boolean deltaGreaterThan(double a, double b, double delta) {
        return Math.abs(a - b) > delta;
    }

    /**
     * @return {@code true} if x, y, width and height of the rectangle are all within
     * {@link #ZERO_TOLERANCE} of zero
     */
    private boolean isEffectivelyZero(Rectangle2D rectangle) {
        return !(deltaGreaterThan(rectangle.getX(), 0.0d, ZERO_TOLERANCE) ||
                deltaGreaterThan(rectangle.getY(), 0.0d, ZERO_TOLERANCE) ||
                deltaGreaterThan(rectangle.getWidth(), 0.0d, ZERO_TOLERANCE) ||
                deltaGreaterThan(rectangle.getHeight(), 0.0d, ZERO_TOLERANCE));
    }

    /**
     * Gets the current rectangle out of the text record.
     * Via TIKA-4432, if the rectangle is 0,0,0,0 then back off to the
     * "boundsIgnored" generic property, if that exists and is a rectangle.
     *
     * @return the record's bounds, or the "boundsIgnored" rectangle when the bounds are
     * all zero and that property supplies a {@link Rectangle2D}
     */
    private Rectangle2D getCurrentRectangle(HemfText.EmfExtTextOutA extTextOutA) {
        //TODO: maybe use modifyWorldTransform and calculate font width etc...
        Rectangle2D bounds = extTextOutA.getBounds();
        if (!isEffectivelyZero(bounds)) {
            return bounds;
        }
        Supplier<?> boundsIgnored = extTextOutA.getGenericProperties().get("boundsIgnored");
        if (boundsIgnored == null) {
            return bounds;
        }
        Object maybeBounds = boundsIgnored.get();
        //instanceof is false for null, so a missing value falls back to bounds as well
        return maybeBounds instanceof Rectangle2D ? (Rectangle2D) maybeBounds : bounds;
    }

    /**
     * Dispatches on the kind of comment data: multi-format and WMF comments are handed
     * to the embedded document extractor; generic comments are read as a string and
     * checked for the "IconOnly" marker and the string that follows it.
     */
    private void handleCommentData(
            HemfComment.EmfCommentData commentData, ParseState parseState,
            XHTMLContentHandler xhtml, ParseContext context)
            throws IOException, TikaException, SAXException {

        if (commentData instanceof HemfComment.EmfCommentDataMultiformats) {
            handleMultiFormats((HemfComment.EmfCommentDataMultiformats) commentData,
                    xhtml, getExtractor(parseState, context), context);
        } else if (commentData instanceof HemfComment.EmfCommentDataWMF) {
            handleWMF(((HemfComment.EmfCommentDataWMF) commentData).getWMFData(), xhtml,
                    getExtractor(parseState, context), context);
        } else if (commentData instanceof HemfComment.EmfCommentDataGeneric) {
            String val = tryToReadAsString(
                    ((HemfComment.EmfCommentDataGeneric) commentData).getPrivateData());
            if (ICON_ONLY.equals(val) && !parseState.hitIconOnly) {
                //first IconOnly marker: flag it so the next record can supply the string
                parseState.hitIconOnly = true;
                parseState.isIconOnly = true;
            } else if (parseState.lastWasIconOnly && parseState.iconOnlyString == null) {
                parseState.iconOnlyString = val;
            }
        }
    }

    /**
     * Tries to decode the bytes as a null terminated UTF-16LE string.
     *
     * @param bytes raw comment data; may be {@code null}
     * @return the decoded string, or {@code null} if there are fewer than two bytes or
     * decoding failed
     */
    private String tryToReadAsString(byte[] bytes) {
        if (bytes == null || bytes.length < 2) {
            return null;
        }
        //act like this is a null terminated unicode le
        int stringLen = (bytes.length - 2) / 2;
        try {
            return StringUtil.getFromUnicodeLE0Terminated(bytes, 0, stringLen);
        } catch (SecurityException e) {
            throw e;
        } catch (Exception ignored) {
            //didn't work out...oh, well
            return null;
        }
    }

    /**
     * Hands embedded WMF data to the embedded document extractor, with the content type
     * of the embedded metadata preset to image/wmf.
     *
     * @param bytes raw WMF data; {@code null} is skipped
     */
    private void handleWMF(byte[] bytes, ContentHandler contentHandler,
                           EmbeddedDocumentExtractor embeddedDocumentExtractor,
                           ParseContext context)
            throws IOException, SAXException, TikaException {
        if (bytes == null) {
            return;
        }
        Metadata embeddedMetadata = Metadata.newInstance(context);
        embeddedMetadata.set(Metadata.CONTENT_TYPE, WMF_MEDIA_TYPE.toString());
        if (!embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
            return;
        }
        try (TikaInputStream tis = TikaInputStream.get(bytes)) {
            embeddedDocumentExtractor.parseEmbedded(tis,
                    new EmbeddedContentHandler(contentHandler), embeddedMetadata, context, true);
        }
    }

    /**
     * Hands the raw data of every format in a multi-format comment to the embedded
     * document extractor.
     */
    private void handleMultiFormats(HemfComment.EmfCommentDataMultiformats commentData,
                                    ContentHandler handler,
                                    EmbeddedDocumentExtractor embeddedDocumentExtractor,
                                    ParseContext context)
            throws IOException, TikaException, SAXException {

        for (HemfComment.EmfCommentDataFormat dataFormat : commentData.getFormats()) {
            //is this right?!
            handleEmbedded(dataFormat.getRawData(), embeddedDocumentExtractor, handler, context);
        }
    }

    /** Mutable state carried across the records of a single parse. */
    private static class ParseState {
        /** Rectangle of the previous text record; {@code null} until one has been seen. */
        Rectangle2D lastRectangle = null;
        /** Whether an "IconOnly" comment has been seen at any point so far. */
        boolean hitIconOnly = false;
        /** Whether the previous record was the first "IconOnly" comment. */
        boolean lastWasIconOnly = false;
        /** Whether the record currently being handled is the first "IconOnly" comment. */
        boolean isIconOnly = false;
        /** String read from the comment following the "IconOnly" comment, if any. */
        String iconOnlyString = null;

        /** Lazily looked up embedded document extractor. */
        EmbeddedDocumentExtractor extractor;
    }
}