DWGReadParser.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.dwg;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.time.Instant;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.UUID;
import java.util.function.Consumer;
import java.util.regex.Pattern;

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.core.json.JsonReadFeature;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.Strings;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import org.apache.tika.config.TikaComponent;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.ExceptionUtils;
import org.apache.tika.utils.FileProcessResult;
import org.apache.tika.utils.ProcessUtils;





/**
 * DWG (CAD Drawing) parser backed by the external dwgread executable. It extends the
 * original DWGParser and is used when DwgRead is set in the parser configuration.
 * The DWG reader can be found here:
 * <p>
 * <a href="https://github.com/LibreDWG/libredwg">https://github.com/LibreDWG/libredwg</a>
 * <p>
 * DWGRead outputs json, which we then loop through, extracting the text elements.
 * The required configuration is dwgReadExecutable. The other settings which can be
 * overwritten are:
 * <p>
 * boolean : cleanDwgReadOutput - whether to clean the json output
 * <p>
 * int : cleanDwgReadOutputBatchSize - clean output batch size to process
 * <p>
 * long : dwgReadTimeout - timeout in milliseconds before killing the dwgread process
 * <p>
 * String : cleanDwgReadRegexToReplace - characters to replace in the json
 * <p>
 * String : cleanDwgReadReplaceWith - replacement characters
 */
@TikaComponent(spi = false)
public class DWGReadParser extends AbstractDWGParser {
    private static final Logger LOG = LoggerFactory.getLogger(DWGReadParser.class);
    /** Serialization version identifier of this parser class. */
    private static final long serialVersionUID = 7983127145030096837L;
    /** The single media type returned by {@link #getSupportedTypes(ParseContext)}. */
    private static final MediaType TYPE = MediaType.image("vnd.dwg");

    /**
     * Returns the media types handled by this parser.
     *
     * @param context the parse context; not consulted by this implementation
     * @return an immutable set containing only {@code TYPE}
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return Collections.singleton(TYPE);
    }

    /**
     * Converts the DWG stream to JSON with the configured dwgread executable, writes the
     * text found in that JSON to the XHTML output and copies header and summary fields
     * into {@code metadata}.
     * <p>
     * The stream is copied to a temporary file, dwgread is run against it, the JSON output
     * is optionally cleaned line by line, and the result is read with a lenient streaming
     * Jackson parser. All temporary files are deleted before this method returns, whether
     * it succeeds or throws.
     *
     * @param tis      the DWG content to parse
     * @param handler  receives the XHTML events for the extracted text
     * @param metadata receives the values read from the FILEHEADER and SummaryInfo sections
     * @param context  parse context; the {@code DWGParserConfig} is read from it after
     *                 {@code configure(context)} has run
     * @throws IOException   if a temporary file cannot be written or read, or reading the
     *                       JSON fails
     * @throws SAXException  if the XHTML document cannot be started or ended
     * @throws TikaException if dwgread times out or exits with a non-zero code, or the JSON
     *                       output cannot be opened
     */
    @Override
    public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {

        configure(context);
        DWGParserConfig dwgc = context.get(DWGParserConfig.class);
        final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context);
        xhtml.startDocument();

        // Declared outside the try so that a failure while creating a later temp file
        // still lets the finally block remove the ones that already exist.
        File tmpFileOut = null;
        File tmpFileOutCleaned = null;
        File tmpFileIn = null;
        try {
            // create unique files so we avoid overwriting out files if multithreaded
            UUID uuid = UUID.randomUUID();
            tmpFileOut = Files.createTempFile(uuid + "dwgreadout", ".json").toFile();
            tmpFileOutCleaned = Files.createTempFile(uuid + "dwgreadoutclean", ".json").toFile();
            tmpFileIn = Files.createTempFile(uuid + "dwgreadin", ".dwg").toFile();

            FileUtils.copyInputStreamToFile(tis, tmpFileIn);
            runDwgRead(dwgc, tmpFileIn, tmpFileOut);

            File jsonFile = tmpFileOut;
            if (dwgc.isCleanDwgReadOutput()) {
                try {
                    cleanJsonOutput(dwgc, tmpFileOut, tmpFileOutCleaned);
                } finally {
                    // the input and the raw output are no longer needed once cleaning ran
                    FileUtils.deleteQuietly(tmpFileIn);
                    FileUtils.deleteQuietly(tmpFileOut);
                }
                jsonFile = tmpFileOutCleaned;
            } else {
                LOG.debug(
                        "Json wasn't cleaned, "
                        + "if json parsing fails consider reviewing dwgread json output to check it's valid");
            }

            extractContent(jsonFile, xhtml, metadata);
        } finally {
            // make sure we delete all temp files
            FileUtils.deleteQuietly(tmpFileOut);
            FileUtils.deleteQuietly(tmpFileIn);
            FileUtils.deleteQuietly(tmpFileOutCleaned);
        }

        xhtml.endDocument();
    }

    /**
     * Runs dwgread on {@code dwgFile}, asking it to write JSON to {@code jsonFile}.
     *
     * @throws IOException   if the canonical paths cannot be resolved or the process
     *                       execution reports an I/O failure
     * @throws TikaException if the process timed out or exited with a non-zero code
     */
    private static void runDwgRead(DWGParserConfig dwgc, File dwgFile, File jsonFile)
            throws IOException, TikaException {
        List<String> command = Arrays.asList(dwgc.getDwgReadExecutable(), "-O", "JSON", "-o",
                jsonFile.getCanonicalPath(), dwgFile.getCanonicalPath());
        ProcessBuilder pb = new ProcessBuilder().command(command);
        LOG.info("About to call DWGRead: {}", command);
        FileProcessResult fpr = ProcessUtils.execute(pb, dwgc.getDwgReadTimeout(), 10000, 10000);
        LOG.info("DWGRead Exit code is: {}", fpr.getExitValue());
        if (fpr.getExitValue() == 0) {
            return;
        }
        if (fpr.isTimeout()) {
            throw new TikaException(
                    "DWGRead Failed - Timeout setting exceeded current setting of " + dwgc.getDwgReadTimeout());
        }
        throw new TikaException(
                "DWGRead Failed - Exit Code is:" + fpr.getExitValue() + " Exe error is: " + fpr.getStderr());
    }

    /**
     * Copies {@code rawJson} to {@code cleanedJson} line by line, applying the configured
     * regex replacement followed by the two fixed clean-up replacements.
     * <p>
     * dwgread sometimes creates strings with invalid utf-8 sequences or invalid json
     * (nan instead of NaN); the replacements are meant to neutralise those.
     *
     * @throws IOException if either file cannot be read or written
     */
    private static void cleanJsonOutput(DWGParserConfig dwgc, File rawJson, File cleanedJson)
            throws IOException {
        String configuredReplacement = dwgc.getCleanDwgReadReplaceWith();
        LOG.debug("Cleaning Json Output - Replace: {} with: {}",
                dwgc.getCleanDwgReadRegexToReplace(), configuredReplacement);

        // Compile once per file rather than once per line (String.replaceAll recompiles
        // its regex on every call).
        Pattern configuredPattern = Pattern.compile(dwgc.getCleanDwgReadRegexToReplace());
        Pattern nanPattern = Pattern.compile("\\bnan\\b");
        Pattern dotCommaPattern = Pattern.compile("\\.,");

        try (BufferedReader br = new BufferedReader(
                     new InputStreamReader(Files.newInputStream(rawJson.toPath()), StandardCharsets.UTF_8));
             BufferedWriter out = new BufferedWriter(
                     new OutputStreamWriter(new FileOutputStream(cleanedJson, true), StandardCharsets.UTF_8),
                     32768)) {
            String sCurrentLine;
            while ((sCurrentLine = br.readLine()) != null) {
                sCurrentLine = configuredPattern.matcher(sCurrentLine).replaceAll(configuredReplacement);
                sCurrentLine = nanPattern.matcher(sCurrentLine).replaceAll(" 0,");
                sCurrentLine = dotCommaPattern.matcher(sCurrentLine).replaceAll(" \\. ,");
                out.write(sCurrentLine);
                out.write('\n');
            }
        }
    }

    /**
     * Streams through the dwgread JSON and dispatches the top-level sections. Only OBJECTS,
     * FILEHEADER and SummaryInfo are supported; these are the only ones present in the
     * sample files or tested with. Every other structured section is skipped.
     *
     * @throws IOException   if reading the JSON fails
     * @throws TikaException if the JSON file cannot be opened
     */
    private void extractContent(File jsonFile, XHTMLContentHandler xhtml, Metadata metadata)
            throws IOException, TikaException {
        // we can't guarantee the json output is correct so we try to ignore as many
        // errors as we can
        JsonFactory jfactory = JsonFactory.builder()
                .enable(JsonReadFeature.ALLOW_MISSING_VALUES,
                        JsonReadFeature.ALLOW_UNESCAPED_CONTROL_CHARS,
                        JsonReadFeature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER,
                        JsonReadFeature.ALLOW_UNQUOTED_FIELD_NAMES,
                        JsonReadFeature.ALLOW_TRAILING_COMMA,
                        JsonReadFeature.ALLOW_NON_NUMERIC_NUMBERS,
                        JsonReadFeature.ALLOW_LEADING_ZEROS_FOR_NUMBERS)
                .build();

        // try-with-resources: the parser holds the file open, so it must be closed on
        // every path before the caller deletes the temp file.
        try (JsonParser jParser = openJsonParser(jfactory, jsonFile)) {
            DWGReadFormatRemover dwgReadFormatRemover = new DWGReadFormatRemover();
            Consumer<String> textWriter = (nextTextValue) -> {
                try {
                    xhtml.characters(dwgReadFormatRemover.cleanupDwgString(nextTextValue));
                    xhtml.newline();
                } catch (SAXException e) {
                    // best effort: keep going with the remaining text, but record the cause
                    LOG.error("Could not write next text value {} to xhtml stream", nextTextValue, e);
                }
            };

            jParser.nextToken(); // step onto the root token
            JsonToken nextToken;
            // nextToken() returns null once the input is exhausted; without the null check
            // an empty or prematurely consumed document would loop forever.
            while ((nextToken = jParser.nextToken()) != null && nextToken != JsonToken.END_OBJECT) {
                if (nextToken != JsonToken.FIELD_NAME) {
                    continue;
                }
                String nextFieldName = jParser.currentName();
                nextToken = jParser.nextToken();
                if (nextToken == null || !nextToken.isStructStart()) {
                    continue;
                }
                if ("OBJECTS".equals(nextFieldName)) {
                    JsonToken elementToken;
                    while ((elementToken = jParser.nextToken()) != null
                            && elementToken != JsonToken.END_ARRAY) {
                        parseDwgObject(jParser, textWriter);
                    }
                } else if ("FILEHEADER".equals(nextFieldName)) {
                    parseHeader(jParser, metadata);
                } else if ("SummaryInfo".equals(nextFieldName)) {
                    parseSummaryInfo(jParser, metadata);
                } else {
                    jParser.skipChildren();
                }
            }
        }
    }

    /**
     * Opens a streaming parser on {@code jsonFile}, converting failures to
     * {@link TikaException} while keeping the original exception as the cause.
     */
    private static JsonParser openJsonParser(JsonFactory jfactory, File jsonFile) throws TikaException {
        try {
            return jfactory.createParser(jsonFile);
        } catch (JsonParseException e1) {
            throw new TikaException("Failed to parse Json: " + ExceptionUtils.getStackTrace(e1), e1);
        } catch (IOException e1) {
            throw new TikaException("Failed to read json file: " + ExceptionUtils.getStackTrace(e1), e1);
        }
    }

    /**
     * Consumes tokens up to the next {@code END_OBJECT} and hands every non-blank scalar
     * value of a field named {@code text} or {@code text_value} to {@code textConsumer}.
     * Nested arrays and objects are skipped without being inspected.
     *
     * @param jsonParser   parser to read from
     * @param textConsumer receives each non-blank text value, in document order
     * @throws IOException if reading from the parser fails
     */
    private void parseDwgObject(JsonParser jsonParser, Consumer<String> textConsumer) throws IOException {
        for (JsonToken token = jsonParser.nextToken(); token != JsonToken.END_OBJECT;
                token = jsonParser.nextToken()) {
            if (token != JsonToken.FIELD_NAME) {
                continue;
            }
            String fieldName = jsonParser.currentName();
            JsonToken valueToken = jsonParser.nextToken();
            if (valueToken.isStructStart()) {
                jsonParser.skipChildren();
                continue;
            }
            boolean carriesText = "text".equals(fieldName) || "text_value".equals(fieldName);
            if (!valueToken.isScalarValue() || !carriesText) {
                continue;
            }
            String textVal = jsonParser.getText();
            if (StringUtils.isNotBlank(textVal)) {
                textConsumer.accept(textVal);
            }
        }
    }

    /**
     * Consumes tokens up to the next {@code END_OBJECT} and stores every scalar field in
     * {@code metadata} under the field's own name. Nested arrays and objects are skipped.
     *
     * @param jsonParser parser to read from
     * @param metadata   receives one entry per scalar field
     * @throws IOException if reading from the parser fails
     */
    private void parseHeader(JsonParser jsonParser, Metadata metadata) throws IOException {
        for (JsonToken token = jsonParser.nextToken(); token != JsonToken.END_OBJECT;
                token = jsonParser.nextToken()) {
            if (token != JsonToken.FIELD_NAME) {
                continue;
            }
            String fieldName = jsonParser.currentName();
            JsonToken valueToken = jsonParser.nextToken();
            if (valueToken.isStructStart()) {
                jsonParser.skipChildren();
                continue;
            }
            if (valueToken.isScalarValue()) {
                metadata.set(fieldName, jsonParser.getText());
            }
        }
    }

    /**
     * Consumes tokens up to the next {@code END_OBJECT} and copies the summary fields into
     * {@code metadata}.
     * <ul>
     * <li>{@code TDCREATE} / {@code TDUPDATE} arrays are converted to an instant and stored
     * as {@link TikaCoreProperties#CREATED} / {@link TikaCoreProperties#MODIFIED}; a value
     * that is not an array with at least two entries is ignored.</li>
     * <li>{@code TITLE} and {@code LASTSAVEDBY} are stored as
     * {@link TikaCoreProperties#TITLE} and {@link TikaCoreProperties#MODIFIER}.</li>
     * <li>Any other non-blank scalar is stored under its own name unless that name starts
     * with "unknown" (case-insensitive).</li>
     * <li>All other nested arrays and objects are skipped.</li>
     * </ul>
     *
     * @param jsonParser parser to read from
     * @param metadata   receives the extracted values
     * @throws IOException if reading from the parser fails
     */
    private void parseSummaryInfo(JsonParser jsonParser, Metadata metadata) throws IOException {
        JsonToken nextToken;
        // null means the input is exhausted; stop instead of spinning on it
        while ((nextToken = jsonParser.nextToken()) != null && nextToken != JsonToken.END_OBJECT) {
            if (nextToken != JsonToken.FIELD_NAME) {
                continue;
            }
            String nextFieldName = jsonParser.currentName();
            nextToken = jsonParser.nextToken();
            if (nextToken.isStructStart()) {
                boolean isCreated = "TDCREATE".equals(nextFieldName);
                if (isCreated || "TDUPDATE".equals(nextFieldName)) {
                    Instant instant = readJulianTimestamp(jsonParser, nextToken);
                    if (instant == null) {
                        continue;
                    }
                    if (isCreated) {
                        metadata.set(TikaCoreProperties.CREATED, instant.toString());
                    } else {
                        metadata.set(TikaCoreProperties.MODIFIED, instant.toString());
                    }
                } else {
                    jsonParser.skipChildren();
                }
            } else if (nextToken.isScalarValue()) {
                String textVal = jsonParser.getText();
                if (StringUtils.isNotBlank(textVal)) {
                    LOG.debug("Summary Info - {} = {}", nextFieldName, textVal);
                    if ("TITLE".equals(nextFieldName)) {
                        metadata.set(TikaCoreProperties.TITLE, textVal);
                    } else if ("LASTSAVEDBY".equals(nextFieldName)) {
                        metadata.set(TikaCoreProperties.MODIFIER, textVal);
                    } else if (!Strings.CI.startsWith(nextFieldName, "unknown")) {
                        metadata.set(nextFieldName, textVal);
                    }
                }
            }
        }
    }

    /**
     * Reads a timestamp represented as an array of the form
     * {@code [julianDate, millisecondOfDay]}. The parser is always left on the token that
     * closes the value, so the caller's loop stays aligned even when the value is empty,
     * too short, too long or not an array at all.
     *
     * @param jsonParser  parser positioned on the start token of the value
     * @param startToken  the start token the parser is positioned on
     * @return the instant, or {@code null} if the value is not an array holding at least
     *         two scalar entries
     * @throws IOException if reading from the parser fails
     */
    private static Instant readJulianTimestamp(JsonParser jsonParser, JsonToken startToken)
            throws IOException {
        if (startToken != JsonToken.START_ARRAY) {
            jsonParser.skipChildren();
            return null;
        }
        long[] parts = new long[2];
        int count = 0;
        JsonToken token;
        while ((token = jsonParser.nextToken()) != null && token != JsonToken.END_ARRAY) {
            if (token.isStructStart()) {
                jsonParser.skipChildren();
            } else if (token.isScalarValue() && count < parts.length) {
                parts[count++] = jsonParser.getValueAsLong();
            }
        }
        if (count < parts.length) {
            return null;
        }
        return JulianDateUtil.toInstant(parts[0], parts[1]);
    }

}