// DWGParser.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.dwg;

import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.Set;

import org.apache.commons.io.IOUtils;
import org.apache.poi.util.StringUtil;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import org.apache.tika.config.JsonConfig;
import org.apache.tika.config.TikaComponent;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.EndianUtils;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;

/**
 * DWG (CAD Drawing) parser. This is a very basic parser, which just
 * looks for bits of the headers.
 * Note that we use Apache POI for various parts of the processing, as
 * lots of the low level string/int/short concepts are the same.
 * <p>
 * The version tags handled by the built-in header reader are AC1015,
 * AC1018, AC1021, AC1024, AC1027 and AC1032; any other tag is rejected
 * with a {@link TikaException}. If the {@link DWGParserConfig} found in the
 * {@link ParseContext} names a dwgread executable, parsing is delegated to
 * {@link DWGReadParser} instead.
 */
@TikaComponent
public class DWGParser extends AbstractDWGParser {

    /**
     * Creates a parser using the superclass' no-argument constructor.
     */
    public DWGParser() {
        super();
    }

    /**
     * Creates a parser, handing the given configuration to the superclass.
     *
     * @param config the DWG parser configuration
     */
    public DWGParser(DWGParserConfig config) {
        super(config);
    }

    /**
     * Creates a parser, handing the given JSON configuration to the superclass.
     *
     * @param jsonConfig the JSON configuration
     */
    public DWGParser(JsonConfig jsonConfig) {
        super(jsonConfig);
    }

    /**
     * Prefix put in front of the name of every custom DWG property that is
     * added to the metadata.
     * NOTE(review): this is a mutable public static field; it is left
     * non-final here so that existing code which assigns it keeps compiling.
     */
    public static String DWG_CUSTOM_META_PREFIX = "dwg-custom:";
    /**
     * Serial version UID
     */
    private static final long serialVersionUID = -7744232583079169119L;
    /**
     * The order of the fields in the header. A {@code null} entry marks a
     * slot whose value is still read (and written to the XHTML output) but
     * is not mapped onto any metadata property.
     */
    private static final Property[] HEADER_PROPERTIES_ENTRIES = { TikaCoreProperties.TITLE,
            TikaCoreProperties.DESCRIPTION, TikaCoreProperties.CREATOR, TikaCoreProperties.SUBJECT,
            TikaCoreProperties.COMMENTS, TikaCoreProperties.MODIFIER, null, // Unknown?
            TikaCoreProperties.RELATION, // Hyperlink
    };
    /**
     * For the 2000 file, they're indexed. A {@code null} entry marks an
     * index that has no metadata property assigned to it.
     */
    private static final Property[] HEADER_2000_PROPERTIES_ENTRIES = { null, TikaCoreProperties.RELATION, // 0x01
            TikaCoreProperties.TITLE, // 0x02
            TikaCoreProperties.DESCRIPTION, // 0x03
            TikaCoreProperties.CREATOR, // 0x04
            null, TikaCoreProperties.COMMENTS, // 0x06
            TikaCoreProperties.SUBJECT, // 0x07
            TikaCoreProperties.MODIFIER, // 0x08
    };
    /**
     * Text of the marker that is searched for in 2000 (AC1015) files.
     */
    private static final String HEADER_2000_PROPERTIES_MARKER_STR = "DWGPROPS COOKIE";
    /**
     * Byte form of {@link #HEADER_2000_PROPERTIES_MARKER_STR}, filled in by
     * the static initialiser below.
     */
    private static final byte[] HEADER_2000_PROPERTIES_MARKER =
            new byte[HEADER_2000_PROPERTIES_MARKER_STR.length()];
    /**
     * Upper bound on the number of property records read from a 2000 file.
     */
    private static final int MAX_2000_PROPERTIES = 30;
    /**
     * Property index in a 2000 file which we think marks the end of the properties.
     */
    private static final int END_OF_2000_PROPERTIES_IDX = 90;
    /**
     * Property index in a 2000 file whose value is a "name=value" custom property.
     */
    private static final int CUSTOM_2000_PROPERTY_IDX = 0x012c;
    /**
     * Value type byte that marks a normal string value in a 2000 file.
     */
    private static final int STRING_2000_VALUE_TYPE = 0x1e;
    /**
     * Largest property section offset we trust: the header should never be
     * more than 10mb into the file.
     */
    private static final long MAX_PROPERTY_SECTION_OFFSET = 0xa00000L;
    /**
     * How far to skip after the last standard property, before we find any custom
     * properties that might be there.
     */
    private static final int CUSTOM_PROPERTIES_SKIP = 20;
    /**
     * The value of padding bytes other than 0 in some DWG files.
     */
    private static final int[] CUSTOM_PROPERTIES_ALT_PADDING_VALUES = new int[] { 0x2, 0, 0, 0 };
    /**
     * The single media type reported by this parser.
     */
    private static final MediaType TYPE = MediaType.image("vnd.dwg");

    static {
        StringUtil.putCompressedUnicode(HEADER_2000_PROPERTIES_MARKER_STR, HEADER_2000_PROPERTIES_MARKER, 0);
    }

    /**
     * Returns the media types this parser handles.
     *
     * @param context the parse context (not consulted)
     * @return a singleton set holding the DWG media type
     */
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return Collections.singleton(TYPE);
    }

    /**
     * Parses a DWG drawing. When the configuration names a dwgread
     * executable the work is handed to {@link DWGReadParser}; otherwise the
     * 128 byte file header is read, the version tag in its first 6 bytes is
     * inspected, and the summary properties are extracted into the metadata
     * and written out as XHTML paragraphs.
     *
     * @param tis      the stream to read the drawing from
     * @param handler  receives the XHTML SAX events
     * @param metadata receives the content type and the extracted properties
     * @param context  the parse context, from which the {@link DWGParserConfig} is looked up
     * @throws IOException   if the stream cannot be read or ends prematurely
     * @throws TikaException if the version tag is not one of the supported ones
     * @throws SAXException  if the content handler fails
     */
    public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata, ParseContext context)
            throws IOException, TikaException, SAXException {

        // NOTE(review): configure() is inherited; presumably it makes sure a
        // DWGParserConfig is present in the context - confirm in AbstractDWGParser
        configure(context);
        DWGParserConfig dwgc = context.get(DWGParserConfig.class);

        // Only delegate when an executable really is configured. A missing
        // config or unset executable falls through to the built-in reader
        // rather than failing with a NullPointerException.
        if (dwgc != null && dwgc.getDwgReadExecutable() != null
                && !dwgc.getDwgReadExecutable().isEmpty()) {
            DWGReadParser dwr = new DWGReadParser();
            dwr.parse(tis, handler, metadata, context);
            return;
        }

        // First up, which version of the format are we handling?
        byte[] header = new byte[128];
        IOUtils.readFully(tis, header);
        String version = new String(header, 0, 6, "US-ASCII");

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context);
        xhtml.startDocument();

        switch (version) {
            case "AC1015":
                metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
                if (skipTo2000PropertyInfoSection(tis, header)) {
                    get2000Props(tis, metadata, xhtml);
                }
                break;
            case "AC1018":
                metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
                if (skipToPropertyInfoSection(tis, header)) {
                    get2004Props(tis, metadata, xhtml);
                }
                break;
            case "AC1021":
            case "AC1024":
            case "AC1027":
            case "AC1032":
                metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
                if (skipToPropertyInfoSection(tis, header)) {
                    get2007and2010Props(tis, metadata, xhtml);
                }
                break;
            default:
                throw new TikaException("Unsupported AutoCAD drawing version: " + version);
        }

        xhtml.endDocument();
    }

    /**
     * Reads the standard and custom properties of an AC1018 file.
     * Stored as US-ASCII
     */
    private void get2004Props(InputStream tis, Metadata metadata, XHTMLContentHandler xhtml)
            throws IOException, TikaException, SAXException {
        // Standard properties
        for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) {
            String headerValue = read2004String(tis);
            handleHeader(i, headerValue, metadata, xhtml);
        }

        // Custom properties
        int customCount = skipToCustomProperties(tis);
        for (int i = 0; i < customCount; i++) {
            String propName = read2004String(tis);
            String propValue = read2004String(tis);
            addCustomProperty(metadata, propName, propValue);
        }
    }

    /**
     * Reads one string: a little endian unsigned short holding the length
     * in bytes, followed by that many single-byte characters.
     *
     * @return the decoded string without a trailing null, or the empty
     *         string when the stored length is zero
     */
    private String read2004String(InputStream tis) throws IOException, TikaException {
        int stringLen = EndianUtils.readUShortLE(tis);
        if (stringLen == 0) {
            // Nothing stored; looking at the "last byte" below would index -1
            return "";
        }

        byte[] stringData = new byte[stringLen];
        IOUtils.readFully(tis, stringData);

        // Often but not always null terminated
        if (stringData[stringLen - 1] == 0) {
            stringLen--;
        }
        return StringUtil.getFromCompressedUnicode(stringData, 0, stringLen);
    }

    /**
     * Reads the standard and custom properties of the AC1021, AC1024,
     * AC1027 and AC1032 files.
     * Stored as UCS2, so 16 bit "unicode"
     */
    private void get2007and2010Props(InputStream tis, Metadata metadata, XHTMLContentHandler xhtml)
            throws IOException, TikaException, SAXException {
        // Standard properties
        for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) {
            String headerValue = read2007and2010String(tis);
            handleHeader(i, headerValue, metadata, xhtml);
        }

        // Custom properties
        int customCount = skipToCustomProperties(tis);
        for (int i = 0; i < customCount; i++) {
            String propName = read2007and2010String(tis);
            String propValue = read2007and2010String(tis);
            addCustomProperty(metadata, propName, propValue);
        }
    }

    /**
     * Reads one string: a little endian unsigned short holding the length
     * in 16 bit characters, followed by twice that many bytes.
     *
     * @return the decoded string without a trailing null, or the empty
     *         string when the stored length is zero
     */
    private String read2007and2010String(InputStream tis) throws IOException, TikaException {
        int stringLen = EndianUtils.readUShortLE(tis);

        byte[] stringData = new byte[stringLen * 2];
        IOUtils.readFully(tis, stringData);
        String value = StringUtil.getFromUnicodeLE(stringData);

        // Some strings are null terminated. The emptiness check matters:
        // charAt(-1) on a zero length string would throw.
        if (!value.isEmpty() && value.charAt(value.length() - 1) == 0) {
            value = value.substring(0, value.length() - 1);
        }

        return value;
    }

    /**
     * Records one custom property under {@link #DWG_CUSTOM_META_PREFIX},
     * provided that neither its name nor its value is empty.
     */
    private void addCustomProperty(Metadata metadata, String propName, String propValue) {
        if (!propName.isEmpty() && !propValue.isEmpty()) {
            metadata.add(DWG_CUSTOM_META_PREFIX + propName, propValue);
        }
    }

    /**
     * Reads the property records of an AC1015 file. Each record is a
     * little endian unsigned short index, an unsigned short length and a
     * value type byte, followed by the value bytes. Reading stops at the
     * end marker index or after {@link #MAX_2000_PROPERTIES} records.
     */
    private void get2000Props(InputStream tis, Metadata metadata, XHTMLContentHandler xhtml)
            throws IOException, TikaException, SAXException {
        for (int propCount = 0; propCount < MAX_2000_PROPERTIES; propCount++) {
            int propIdx = EndianUtils.readUShortLE(tis);
            int length = EndianUtils.readUShortLE(tis);
            int valueType = tis.read();

            if (propIdx == 0x28) {
                // This one seems not to follow the pattern
                length = 0x19;
            } else if (propIdx == END_OF_2000_PROPERTIES_IDX) {
                // We think this means the end of properties
                break;
            }

            byte[] value = new byte[length];
            IOUtils.readFully(tis, value);
            if (valueType != STRING_2000_VALUE_TYPE) {
                // No idea what this holds, the bytes have been consumed so move on
                continue;
            }

            // Normal string, good
            String val = StringUtil.getFromCompressedUnicode(value, 0, length);

            // Is it one we can look up by index?
            if (propIdx < HEADER_2000_PROPERTIES_ENTRIES.length) {
                // Some indexes have no property assigned. As in handleHeader,
                // keep the text but don't pass a null property to the metadata.
                Property headerProp = HEADER_2000_PROPERTIES_ENTRIES[propIdx];
                if (headerProp != null) {
                    metadata.add(headerProp, val);
                }
                xhtml.element("p", val);
            } else if (propIdx == CUSTOM_2000_PROPERTY_IDX) {
                // Custom property, stored as name=value
                int splitAt = val.indexOf('=');
                if (splitAt > -1) {
                    String propName = val.substring(0, splitAt);
                    String propVal = val.substring(splitAt + 1);
                    metadata.add(DWG_CUSTOM_META_PREFIX + propName, propVal);
                }
            }
        }
    }

    /**
     * Stores one standard header value. Null and empty values are ignored;
     * everything else is written as an XHTML paragraph, and is also set on
     * the metadata when the slot has a property assigned.
     *
     * @param headerNumber index into {@link #HEADER_PROPERTIES_ENTRIES}
     * @param value        the value read for that slot
     */
    private void handleHeader(int headerNumber, String value, Metadata metadata, XHTMLContentHandler xhtml)
            throws SAXException {
        if (value == null || value.isEmpty()) {
            return;
        }

        Property headerProp = HEADER_PROPERTIES_ENTRIES[headerNumber];
        if (headerProp != null) {
            metadata.set(headerProp, value);
        }

        xhtml.element("p", value);
    }

    /**
     * Grab the offset, then skip there
     *
     * @param tis    the stream, positioned directly after the header
     * @param header the header bytes that have already been read
     * @return true if the stream now sits at the property section, false
     *         if the stored offset is missing or not plausible
     */
    private boolean skipToPropertyInfoSection(InputStream tis, byte[] header) throws IOException, TikaException {
        // The offset is stored in the header from 0x20 onwards
        long offsetToSection = EndianUtils.getLongLE(header, 0x20);

        // Bounds check the offset. Some files seem to use a different format,
        // and the offset isn't available at 0x20. Until we can work out how
        // to find the offset in those files, skip them if detected
        if (offsetToSection <= 0 || offsetToSection > MAX_PROPERTY_SECTION_OFFSET) {
            // Zero, negative, or more than 10mb into the file, something is wrong
            return false;
        }

        // Work out how far to skip, and bounds check. The header has already
        // been consumed, so an offset that points inside it cannot be reached
        // and a negative skip would be rejected with an unchecked exception.
        long toSkip = offsetToSection - header.length;
        if (toSkip < 0) {
            return false;
        }
        IOUtils.skipFully(tis, toSkip);

        return true;
    }

    /**
     * Scans forward for the 2000 properties marker.
     * We think it can be anywhere...
     *
     * @param tis    the stream to scan
     * @param header the header bytes that have already been read (unused)
     * @return true if the marker was found, leaving the stream directly
     *         after it; false if the end of the stream was hit first
     */
    private boolean skipTo2000PropertyInfoSection(InputStream tis, byte[] header) throws IOException {
        int matched = 0;
        int val;
        while ((val = tis.read()) != -1) {
            if (val == HEADER_2000_PROPERTIES_MARKER[matched]) {
                matched++;
                if (matched == HEADER_2000_PROPERTIES_MARKER.length) {
                    // Bingo, found it
                    return true;
                }
            } else {
                // The byte that broke a partial match may itself be the start
                // of the real marker (eg "DDWGPROPS COOKIE"), so test it again
                // rather than throwing it away. Restarting at 0 or 1 is enough
                // because the first marker character occurs nowhere else in the
                // marker text.
                matched = (val == HEADER_2000_PROPERTIES_MARKER[0]) ? 1 : 0;
            }
        }
        return false;
    }

    /**
     * Moves from the end of the standard properties to the first custom
     * property.
     *
     * @return the number of custom properties that follow, or 0 if the
     *         expected padding is absent or the stored count is not plausible
     */
    private int skipToCustomProperties(InputStream tis) throws IOException, TikaException {
        // There should be 4 zero bytes or CUSTOM_PROPERTIES_ALT_PADDING_VALUES next
        byte[] padding = new byte[4];
        IOUtils.readFully(tis, padding);

        boolean zeroPadding = true;
        boolean altPadding = true;
        for (int i = 0; i < padding.length; i++) {
            zeroPadding &= padding[i] == 0;
            altPadding &= padding[i] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[i];
        }
        if (!zeroPadding && !altPadding) {
            // No padding. That probably means no custom props
            return 0;
        }

        // Looks hopeful, skip on
        IOUtils.skipFully(tis, CUSTOM_PROPERTIES_SKIP);

        // We should now have the count
        int count = EndianUtils.readUShortLE(tis);

        // Plausibility check it
        if (count > 0 && count < 0x7f) {
            // Looks plausible
            return count;
        }
        // No properties / count is too high to trust
        return 0;
    }

}