RTFEmbeddedHandler.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.rtf.jflex;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.io.FilenameUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.RTFMetadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.EmbeddedContentHandler;
/**
* Handles embedded objects and pictures within the JFlex-based RTF token stream.
*
* <p>Uses streaming parsers ({@link RTFObjDataStreamParser} and
* {@link RTFPictStreamParser}) so that large embedded objects are written
* to temp files rather than buffered entirely in memory.</p>
*/
public class RTFEmbeddedHandler {
private final ContentHandler handler;
private final ParseContext context;
private final EmbeddedDocumentUtil embeddedDocumentUtil;
private final long maxBytes;
private boolean inObject;
private boolean isPictBitmap;
private int hi = -1;
private int thumbCount;
private final AtomicInteger unknownFilenameCount = new AtomicInteger();
private String sn = "";
private String sv = "";
private final StringBuilder metadataBuffer = new StringBuilder();
private Metadata metadata;
// Streaming parsers -- one active at a time
private RTFObjDataStreamParser objParser;
private RTFPictStreamParser pictParser;
public RTFEmbeddedHandler(ContentHandler handler, ParseContext context,
int maxBytesInKb) {
this.handler = handler;
this.context = context;
this.embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
this.maxBytes = maxBytesInKb > 0 ? (long) maxBytesInKb * 1024 : -1;
this.metadata = Metadata.newInstance(context);
}
/**
* Process a token for embedded object/pict handling.
* Call this AFTER {@link RTFState#processToken(RTFToken)} has run.
*/
public void processToken(RTFToken tok, RTFState rtfState, RTFGroupState closingGroup)
throws IOException, SAXException, TikaException {
RTFGroupState group = rtfState.getCurrentGroup();
switch (tok.getType()) {
case GROUP_CLOSE:
if (closingGroup.objdata) {
handleCompletedObjData();
} else if (closingGroup.pictDepth == 1) {
handleCompletedPict();
} else if (closingGroup.sn) {
sn = metadataBuffer.toString();
} else if (closingGroup.sv) {
sv = metadataBuffer.toString();
} else if (closingGroup.sp) {
metadata.add(sn, sv);
}
if (closingGroup.object) {
inObject = false;
}
break;
case CONTROL_WORD:
switch (tok.getName()) {
case "object":
inObject = true;
break;
case "objdata":
metadata = Metadata.newInstance(context);
objParser = new RTFObjDataStreamParser(maxBytes);
break;
case "pict":
metadata = Metadata.newInstance(context);
pictParser = new RTFPictStreamParser(maxBytes);
break;
case "sn":
metadataBuffer.setLength(0);
metadataBuffer.append(RTFMetadata.RTF_PICT_META_PREFIX);
break;
case "sv":
metadataBuffer.setLength(0);
break;
case "wbitmap":
isPictBitmap = true;
break;
}
break;
case TEXT:
if (group.objdata || group.pictDepth == 1) {
writeHexChar(tok.getChar());
} else if (group.sn || group.sv) {
metadataBuffer.append(tok.getChar());
}
break;
case HEX_ESCAPE:
if (group.sn || group.sv) {
metadataBuffer.append((char) tok.getHexValue());
}
break;
default:
break;
}
}
private void handleCompletedObjData() throws IOException, SAXException, TikaException {
try (TikaInputStream tis = objParser.onComplete(metadata, unknownFilenameCount)) {
if (tis != null) {
extractObj(tis, metadata);
}
} catch (IOException e) {
EmbeddedDocumentUtil.recordException(e, metadata);
} finally {
objParser.close();
objParser = null;
reset();
}
}
private void handleCompletedPict() throws IOException, SAXException, TikaException {
try {
String filePath =
metadata.get(RTFMetadata.RTF_PICT_META_PREFIX + "wzDescription");
if (filePath != null && !filePath.isEmpty()) {
metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, filePath);
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
FilenameUtils.getName(filePath));
metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, filePath);
}
metadata.set(RTFMetadata.THUMBNAIL, Boolean.toString(inObject));
if (isPictBitmap) {
metadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
"image/x-rtf-raw-bitmap");
}
try (TikaInputStream tis = pictParser.onComplete(metadata)) {
if (tis != null) {
extractObj(tis, metadata);
}
}
} catch (IOException e) {
EmbeddedDocumentUtil.recordException(e, metadata);
} finally {
pictParser = null;
reset();
}
}
private void writeHexChar(int b) throws IOException, TikaException {
if (isHexChar(b)) {
if (hi == -1) {
hi = 16 * hexValue(b);
} else {
int decoded = hi + hexValue(b);
hi = -1;
if (objParser != null) {
objParser.onByte(decoded);
} else if (pictParser != null) {
pictParser.onByte(decoded);
}
}
}
}
private void extractObj(TikaInputStream tis, Metadata meta)
throws SAXException, IOException, TikaException {
meta.set(Metadata.CONTENT_LENGTH, Long.toString(tis.getLength()));
if (embeddedDocumentUtil.shouldParseEmbedded(meta)) {
if (meta.get(TikaCoreProperties.RESOURCE_NAME_KEY) == null) {
String extension = embeddedDocumentUtil.getExtension(tis, meta);
if (inObject && pictParser != null) {
meta.set(TikaCoreProperties.RESOURCE_NAME_KEY,
EmbeddedDocumentUtil.EmbeddedResourcePrefix.THUMBNAIL.getPrefix()
+ "-" + thumbCount++ + extension);
meta.set(RTFMetadata.THUMBNAIL, "true");
} else {
meta.set(TikaCoreProperties.RESOURCE_NAME_KEY,
EmbeddedDocumentUtil.EmbeddedResourcePrefix.EMBEDDED.getPrefix()
+ "-" + unknownFilenameCount.getAndIncrement()
+ extension);
}
meta.set(TikaCoreProperties.RESOURCE_NAME_EXTENSION_INFERRED, true);
}
try {
embeddedDocumentUtil.parseEmbedded(
tis, new EmbeddedContentHandler(handler), meta, true);
} catch (IOException e) {
EmbeddedDocumentUtil.recordEmbeddedStreamException(e, meta);
}
}
}
private void reset() {
metadata = Metadata.newInstance(context);
hi = -1;
sn = "";
sv = "";
metadataBuffer.setLength(0);
isPictBitmap = false;
}
private static boolean isHexChar(int ch) {
return (ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F');
}
private static int hexValue(int ch) {
if (ch >= '0' && ch <= '9') {
return ch - '0';
} else if (ch >= 'a' && ch <= 'z') {
return 10 + (ch - 'a');
} else {
return 10 + (ch - 'A');
}
}
}