DWGReadParser.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.dwg;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.time.Instant;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.UUID;
import java.util.function.Consumer;
import java.util.regex.Pattern;

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.core.json.JsonReadFeature;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.Strings;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import org.apache.tika.config.TikaComponent;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.ExceptionUtils;
import org.apache.tika.utils.FileProcessResult;
import org.apache.tika.utils.ProcessUtils;
/**
* DWGReadParser (CAD Drawing) parser. This extends the original DWGParser if in
* the parser configuration DwgRead is set. DWG reader can be found here:
* <p>
* <a href="https://github.com/LibreDWG/libredwg">https://github.com/LibreDWG/libredwg</a>
* <p>
* DWGRead outputs JSON, which we then loop through to extract the text elements.
* The required configuration is dwgReadExecutable. The other settings which can be
* overwritten are:
* <p>
* boolean : cleanDwgReadOutput - whether to clean the json output
* <p>
* int : cleanDwgReadOutputBatchSize - clean output batch size to process
* <p>
* long : dwgReadTimeout - timeout in milliseconds before killing the dwgread process
* <p>
* String : cleanDwgReadRegexToReplace - characters to replace in the json
* <p>
* String : cleanDwgReadReplaceWith - replacement for the characters matched by cleanDwgReadRegexToReplace
*/
@TikaComponent(spi = false)
public class DWGReadParser extends AbstractDWGParser {

    private static final Logger LOG = LoggerFactory.getLogger(DWGReadParser.class);

    private static final long serialVersionUID = 7983127145030096837L;

    private static final MediaType TYPE = MediaType.image("vnd.dwg");

    /** Matches a bare {@code nan} token in the dwgread output; compiled once instead of per line. */
    private static final Pattern BARE_NAN = Pattern.compile("\\bnan\\b");

    /** Matches a dot immediately followed by a comma in the dwgread output. */
    private static final Pattern DOT_BEFORE_COMMA = Pattern.compile("\\.,");

    /**
     * Lenient JSON factory: we can't guarantee the dwgread output is valid JSON, so
     * we try to ignore as many errors as we can.
     */
    private static final JsonFactory JSON_FACTORY = JsonFactory.builder()
            .enable(JsonReadFeature.ALLOW_MISSING_VALUES,
                    JsonReadFeature.ALLOW_UNESCAPED_CONTROL_CHARS,
                    JsonReadFeature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER,
                    JsonReadFeature.ALLOW_UNQUOTED_FIELD_NAMES,
                    JsonReadFeature.ALLOW_TRAILING_COMMA,
                    JsonReadFeature.ALLOW_NON_NUMERIC_NUMBERS,
                    JsonReadFeature.ALLOW_LEADING_ZEROS_FOR_NUMBERS)
            .build();

    /**
     * Returns the single media type handled by this parser ({@code image/vnd.dwg}).
     *
     * @param context the parse context (not consulted)
     * @return a singleton set containing the DWG media type
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return Collections.singleton(TYPE);
    }

    /**
     * Copies the stream to a temporary file, runs the configured dwgread executable on it
     * to produce JSON, optionally cleans that JSON, and then streams through the JSON
     * writing text values to the XHTML handler and header/summary values to the metadata.
     * All temporary files are deleted before the method returns, whether it succeeds or not.
     *
     * @param tis      the DWG content; it is copied to a temporary file for dwgread
     * @param handler  receives the extracted text as XHTML
     * @param metadata receives the FILEHEADER and SummaryInfo values
     * @param context  parse context carrying the {@link DWGParserConfig}
     * @throws IOException   if a temporary file cannot be written or the JSON cannot be read
     * @throws SAXException  if the XHTML document cannot be started or ended
     * @throws TikaException if dwgread times out, exits with a non-zero code, or its output
     *                       cannot be opened as JSON
     */
    @Override
    public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        configure(context);
        DWGParserConfig dwgc = context.get(DWGParserConfig.class);
        final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context);
        xhtml.startDocument();

        // Declared outside the try and created inside it so that a failure while creating
        // a later temp file still cleans up the ones that were already created.
        File tmpFileIn = null;
        File tmpFileOut = null;
        File tmpFileOutCleaned = null;
        try {
            // create unique files so we avoid overwriting our files if multithreaded
            UUID uuid = UUID.randomUUID();
            tmpFileOut = Files.createTempFile(uuid + "dwgreadout", ".json").toFile();
            tmpFileOutCleaned = Files.createTempFile(uuid + "dwgreadoutclean", ".json").toFile();
            tmpFileIn = Files.createTempFile(uuid + "dwgreadin", ".dwg").toFile();

            FileUtils.copyInputStreamToFile(tis, tmpFileIn);

            List<String> command = Arrays.asList(dwgc.getDwgReadExecutable(), "-O", "JSON", "-o",
                    tmpFileOut.getCanonicalPath(), tmpFileIn.getCanonicalPath());
            ProcessBuilder pb = new ProcessBuilder().command(command);
            LOG.info("About to call DWGRead: {}", command);
            FileProcessResult fpr = ProcessUtils.execute(pb, dwgc.getDwgReadTimeout(), 10000, 10000);
            LOG.info("DWGRead Exit code is: {}", fpr.getExitValue());

            if (fpr.getExitValue() != 0) {
                if (fpr.isTimeout()) {
                    throw new TikaException(
                            "DWGRead Failed - Timeout setting exceeded current setting of "
                                    + dwgc.getDwgReadTimeout() );
                }
                throw new TikaException(
                        "DWGRead Failed - Exit Code is:" + fpr.getExitValue()
                                + " Exe error is: " + fpr.getStderr() );
            }

            File jsonFile = tmpFileOut;
            if (dwgc.isCleanDwgReadOutput()) {
                LOG.debug("Cleaning Json Output - Replace: {} with: {}",
                        dwgc.getCleanDwgReadRegexToReplace(), dwgc.getCleanDwgReadReplaceWith());
                try {
                    cleanDwgReadOutput(dwgc, tmpFileOut, tmpFileOutCleaned);
                } finally {
                    // the DWG input and the raw JSON are no longer needed once cleaning was attempted
                    FileUtils.deleteQuietly(tmpFileIn);
                    FileUtils.deleteQuietly(tmpFileOut);
                }
                jsonFile = tmpFileOutCleaned;
            } else {
                LOG.debug(
                        "Json wasn't cleaned, "
                                + "if json parsing fails consider reviewing dwgread json output to check it's valid");
            }

            // try-with-resources so the parser (and its file handle) is released even when
            // reading fails part-way through; otherwise the temp file may not be deletable.
            try (JsonParser jParser = openJsonParser(jsonFile)) {
                extractContent(jParser, xhtml, metadata);
            }
        } finally {
            // make sure we delete all temp files (deleteQuietly tolerates null and missing files)
            FileUtils.deleteQuietly(tmpFileOut);
            FileUtils.deleteQuietly(tmpFileIn);
            FileUtils.deleteQuietly(tmpFileOutCleaned);
        }
        xhtml.endDocument();
    }

    /**
     * Rewrites the raw dwgread JSON line by line into {@code cleanedJson}, applying in order:
     * the configured regex/replacement, bare {@code nan} replaced by {@code " 0,"}, and
     * {@code ".,"} replaced by {@code " . ,"}. Each output line is terminated with
     * {@code '\n'}. Both files are read/written as UTF-8.
     *
     * @param dwgc        supplies the configured regex and its replacement
     * @param rawJson     the file written by dwgread
     * @param cleanedJson the file the cleaned lines are appended to
     * @throws IOException if either file cannot be read or written
     */
    private static void cleanDwgReadOutput(DWGParserConfig dwgc, File rawJson, File cleanedJson)
            throws IOException {
        // dwgread sometimes creates strings with invalid utf-8 sequences or invalid
        // json (nan instead of NaN); the patterns are compiled once rather than per line.
        Pattern configuredPattern = Pattern.compile(dwgc.getCleanDwgReadRegexToReplace());
        String configuredReplacement = dwgc.getCleanDwgReadReplaceWith();
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(Files.newInputStream(rawJson.toPath()), StandardCharsets.UTF_8));
             BufferedWriter out = new BufferedWriter(
                     new OutputStreamWriter(new FileOutputStream(cleanedJson, true), StandardCharsets.UTF_8),
                     32768)) {
            String line;
            while ((line = br.readLine()) != null) {
                String cleaned = configuredPattern.matcher(line).replaceAll(configuredReplacement);
                cleaned = BARE_NAN.matcher(cleaned).replaceAll(" 0,");
                cleaned = DOT_BEFORE_COMMA.matcher(cleaned).replaceAll(" \\. ,");
                out.write(cleaned);
                out.write('\n');
            }
        }
    }

    /**
     * Opens a lenient streaming JSON parser on the given file. Kept separate from the
     * reading loop so that only failures to open the file are converted to
     * {@link TikaException}.
     *
     * @param jsonFile the (possibly cleaned) dwgread output
     * @return a parser positioned before the first token
     * @throws TikaException if the parser cannot be created
     */
    private static JsonParser openJsonParser(File jsonFile) throws TikaException {
        try {
            return JSON_FACTORY.createParser(jsonFile);
        } catch (JsonParseException e1) {
            throw new TikaException("Failed to parse Json: " + ExceptionUtils.getStackTrace(e1), e1);
        } catch (IOException e1) {
            throw new TikaException("Failed to read json file: " + ExceptionUtils.getStackTrace(e1), e1);
        }
    }

    /**
     * Reads the JSON token stream and dispatches the root-level sections. We only support
     * OBJECTS, FILEHEADER and SummaryInfo; these are the only ones we have in either sample
     * files or have been tested with. Every other structured value is skipped.
     *
     * @param jParser  parser positioned before the root token
     * @param xhtml    receives each extracted text value followed by a newline
     * @param metadata receives header and summary values
     * @throws IOException if the JSON cannot be read
     */
    private void extractContent(JsonParser jParser, XHTMLContentHandler xhtml, Metadata metadata)
            throws IOException {
        DWGReadFormatRemover dwgReadFormatRemover = new DWGReadFormatRemover();
        Consumer<String> textSink = nextTextValue -> {
            try {
                xhtml.characters(dwgReadFormatRemover.cleanupDwgString(nextTextValue));
                xhtml.newline();
            } catch (SAXException e) {
                // best effort: keep going, but keep the cause in the log
                LOG.error("Could not write next text value {} to xhtml stream", nextTextValue, e);
            }
        };

        jParser.nextToken(); // consume the root token (expected to be the start of the root object)
        JsonToken token;
        // nextToken() returns null once the input is exhausted; without the null check an
        // empty or non-object document would loop forever.
        while ((token = jParser.nextToken()) != null && token != JsonToken.END_OBJECT) {
            if (token != JsonToken.FIELD_NAME) {
                continue;
            }
            String fieldName = jParser.currentName();
            JsonToken value = jParser.nextToken();
            if (value == null || !value.isStructStart()) {
                continue;
            }
            if ("OBJECTS".equals(fieldName) && value == JsonToken.START_ARRAY) {
                parseDwgObjects(jParser, textSink);
            } else if ("FILEHEADER".equals(fieldName) && value == JsonToken.START_OBJECT) {
                parseHeader(jParser, metadata);
            } else if ("SummaryInfo".equals(fieldName) && value == JsonToken.START_OBJECT) {
                parseSummaryInfo(jParser, metadata);
            } else {
                // unsupported section, or a supported one with an unexpected shape
                jParser.skipChildren();
            }
        }
    }

    /**
     * Walks the OBJECTS array, handing every object element to
     * {@link #parseDwgObject(JsonParser, Consumer)}. Nested arrays are skipped and scalar
     * elements are ignored so the token stream stays aligned.
     *
     * @param jsonParser   parser positioned on the array's START_ARRAY token
     * @param textConsumer receives each non-blank text value
     * @throws IOException if the JSON cannot be read
     */
    private void parseDwgObjects(JsonParser jsonParser, Consumer<String> textConsumer) throws IOException {
        JsonToken token;
        while ((token = jsonParser.nextToken()) != null && token != JsonToken.END_ARRAY) {
            if (token == JsonToken.START_OBJECT) {
                parseDwgObject(jsonParser, textConsumer);
            } else if (token.isStructStart()) {
                jsonParser.skipChildren();
            }
        }
    }

    /**
     * Reads one object up to its END_OBJECT, passing the non-blank scalar values of the
     * {@code text} and {@code text_value} fields to the consumer. Nested structures are skipped.
     *
     * @param jsonParser   parser positioned on the object's START_OBJECT token
     * @param textConsumer receives each non-blank text value
     * @throws IOException if the JSON cannot be read
     */
    private void parseDwgObject(JsonParser jsonParser, Consumer<String> textConsumer) throws IOException {
        JsonToken token;
        while ((token = jsonParser.nextToken()) != null && token != JsonToken.END_OBJECT) {
            if (token != JsonToken.FIELD_NAME) {
                continue;
            }
            String fieldName = jsonParser.currentName();
            JsonToken value = jsonParser.nextToken();
            if (value == null) {
                return;
            }
            if (value.isStructStart()) {
                jsonParser.skipChildren();
            } else if (value.isScalarValue()
                    && ("text".equals(fieldName) || "text_value".equals(fieldName))) {
                String textVal = jsonParser.getText();
                if (StringUtils.isNotBlank(textVal)) {
                    textConsumer.accept(textVal);
                }
            }
        }
    }

    /**
     * Reads the FILEHEADER object, storing every scalar field in the metadata under its
     * JSON field name. Nested structures are skipped.
     *
     * @param jsonParser parser positioned on the object's START_OBJECT token
     * @param metadata   receives the header values
     * @throws IOException if the JSON cannot be read
     */
    private void parseHeader(JsonParser jsonParser, Metadata metadata) throws IOException {
        JsonToken token;
        while ((token = jsonParser.nextToken()) != null && token != JsonToken.END_OBJECT) {
            if (token != JsonToken.FIELD_NAME) {
                continue;
            }
            String fieldName = jsonParser.currentName();
            JsonToken value = jsonParser.nextToken();
            if (value == null) {
                return;
            }
            if (value.isStructStart()) {
                jsonParser.skipChildren();
            } else if (value.isScalarValue()) {
                metadata.set(fieldName, jsonParser.getText());
            }
        }
    }

    /**
     * Reads the SummaryInfo object. TDCREATE and TDUPDATE arrays are converted to the
     * created/modified dates; TITLE and LASTSAVEDBY are mapped to the title and modifier
     * properties; any other non-blank scalar whose name does not start with "unknown"
     * (case-insensitive) is stored under its JSON field name.
     *
     * @param jsonParser parser positioned on the object's START_OBJECT token
     * @param metadata   receives the summary values
     * @throws IOException if the JSON cannot be read
     */
    private void parseSummaryInfo(JsonParser jsonParser, Metadata metadata) throws IOException {
        JsonToken token;
        while ((token = jsonParser.nextToken()) != null && token != JsonToken.END_OBJECT) {
            if (token != JsonToken.FIELD_NAME) {
                continue;
            }
            String fieldName = jsonParser.currentName();
            JsonToken value = jsonParser.nextToken();
            if (value == null) {
                return;
            }
            if (value.isStructStart()) {
                boolean isCreated = "TDCREATE".equals(fieldName);
                if ((isCreated || "TDUPDATE".equals(fieldName)) && value == JsonToken.START_ARRAY) {
                    Instant instant = readJulianTimestamp(jsonParser);
                    if (instant != null) {
                        metadata.set(isCreated ? TikaCoreProperties.CREATED : TikaCoreProperties.MODIFIED,
                                instant.toString());
                    }
                } else {
                    jsonParser.skipChildren();
                }
            } else if (value.isScalarValue()) {
                String textVal = jsonParser.getText();
                if (StringUtils.isNotBlank(textVal)) {
                    LOG.debug("Summary Info - {} = {}", fieldName, textVal);
                    if ("TITLE".equals(fieldName)) {
                        metadata.set(TikaCoreProperties.TITLE, textVal);
                    } else if ("LASTSAVEDBY".equals(fieldName)) {
                        metadata.set(TikaCoreProperties.MODIFIER, textVal);
                    } else if (!Strings.CI.startsWith(fieldName, "unknown")) {
                        metadata.set(fieldName, textVal);
                    }
                }
            }
        }
    }

    /**
     * Reads a timestamp array of the form {@code [julianDate, millisecondOfDay]} and always
     * consumes tokens up to the matching END_ARRAY, so that arrays with missing or extra
     * elements cannot shift the caller's position in the token stream.
     *
     * @param jsonParser parser positioned on the array's START_ARRAY token
     * @return the converted instant, or {@code null} if the array held fewer than two
     *         scalar values
     * @throws IOException if the JSON cannot be read
     */
    private static Instant readJulianTimestamp(JsonParser jsonParser) throws IOException {
        long[] parts = new long[2];
        int count = 0;
        JsonToken token;
        while ((token = jsonParser.nextToken()) != null && token != JsonToken.END_ARRAY) {
            if (token.isStructStart()) {
                jsonParser.skipChildren();
            } else if (count < parts.length) {
                parts[count++] = jsonParser.getValueAsLong();
            }
        }
        if (count < parts.length) {
            LOG.debug("Ignoring incomplete timestamp array with {} value(s)", count);
            return null;
        }
        return JulianDateUtil.toInstant(parts[0], parts[1]);
    }
}