AbstractPDF2XHTML.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.pdf;
import static org.apache.tika.parser.pdf.OcrConfig.Strategy.AUTO;
import static org.apache.tika.parser.pdf.OcrConfig.Strategy.NO_OCR;
import static org.apache.tika.parser.pdf.OcrConfig.Strategy.OCR_AND_TEXT_EXTRACTION;
import static org.apache.tika.parser.pdf.OcrConfig.Strategy.OCR_ONLY;
import java.awt.image.BufferedImage;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.ListIterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import javax.xml.stream.XMLStreamException;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDJavascriptNameTreeNode;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.common.COSObjectable;
import org.apache.pdfbox.pdmodel.common.PDDestinationOrAction;
import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
import org.apache.pdfbox.pdmodel.common.filespecification.PDFileSpecification;
import org.apache.pdfbox.pdmodel.common.filespecification.PDSimpleFileSpecification;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionImportData;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionJavaScript;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionLaunch;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionRemoteGoTo;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI;
import org.apache.pdfbox.pdmodel.interactive.action.PDAnnotationAdditionalActions;
import org.apache.pdfbox.pdmodel.interactive.action.PDDocumentCatalogAdditionalActions;
import org.apache.pdfbox.pdmodel.interactive.action.PDFormFieldAdditionalActions;
import org.apache.pdfbox.pdmodel.interactive.action.PDPageAdditionalActions;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationWidget;
import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode;
import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
import org.apache.pdfbox.pdmodel.interactive.form.PDField;
import org.apache.pdfbox.pdmodel.interactive.form.PDNonTerminalField;
import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField;
import org.apache.pdfbox.pdmodel.interactive.form.PDXFAResource;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.tools.imageio.ImageIOUtil;
import org.apache.pdfbox.util.Matrix;
import org.apache.pdfbox.util.Vector;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Font;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.PDF;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.TikaPagedText;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.pdf.updates.IncrementalUpdateRecord;
import org.apache.tika.parser.pdf.updates.IsIncrementalUpdate;
import org.apache.tika.parser.pdf.updates.StartXRefOffset;
import org.apache.tika.renderer.CompositeRenderer;
import org.apache.tika.renderer.PageBasedRenderResults;
import org.apache.tika.renderer.PageRangeRequest;
import org.apache.tika.renderer.RenderResult;
import org.apache.tika.renderer.Renderer;
import org.apache.tika.renderer.RenderingTracker;
import org.apache.tika.renderer.pdf.pdfbox.NoTextPDFRenderer;
import org.apache.tika.renderer.pdf.pdfbox.PDDocumentRenderer;
import org.apache.tika.renderer.pdf.pdfbox.PDFRenderingState;
import org.apache.tika.renderer.pdf.pdfbox.TextOnlyPDFRenderer;
import org.apache.tika.renderer.pdf.pdfbox.VectorGraphicsOnlyPDFRenderer;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.ExceptionUtils;
import org.apache.tika.utils.StringUtils;
class AbstractPDF2XHTML extends PDFTextStripper {
/** XMP location recorded (via {@code PDF.XMP_LOCATION}) for XMP found on the document catalog. */
public static final String XMP_DOCUMENT_CATALOG_LOCATION = "documentCatalog";
/** Prefix of the XMP location recorded for page-level XMP; the 1-based page number is appended. */
public static final String XMP_PAGE_LOCATION_PREFIX = "page ";
/**
* Maximum recursive depth to prevent cycles/recursion bombs.
* This applies to AcroForm processing and processing
* the embedded document tree.
*/
private final static int MAX_RECURSION_DEPTH = 100;
// NOTE(review): not referenced in the visible part of this file; presumably caps the
// number of outline/bookmark items that are processed -- confirm against the usage.
private final static int MAX_BOOKMARK_ITEMS = 10000;
//This is used for both types and subtypes.
//These can be unbounded. We need to limit the number we store.
private final static int MAX_ANNOTATION_TYPES = 100;
// Annotation subtype value that marks a 3D annotation.
private static final String THREE_D = "3D";
// Key of the script stream looked up inside an annotation's 3DD dictionary.
private static final COSName ON_INSTANTIATE = COSName.getPDFName("OnInstantiate");
// Placeholder recorded when an annotation has no name or no subtype.
private static final String NULL_STRING = "null";
// Content types assigned to the synthetic XFA and XMP embedded documents.
private static final MediaType XFA_MEDIA_TYPE = MediaType.application("vnd.adobe.xdp+xml");
private static final MediaType XMP_MEDIA_TYPE = MediaType.application("rdf+xml");
// IOExceptions that handleCatchableIOE recorded instead of rethrowing.
final List<IOException> exceptions = new ArrayList<>();
final PDDocument pdDocument;
final XHTMLContentHandler xhtml;
final ParseContext context;
final Metadata metadata;
final EmbeddedDocumentExtractor embeddedDocumentExtractor;
final PDFParserConfig config;
// null when the configured OCR strategy is NO_OCR (see the constructor)
final Parser ocrParser;
final Renderer renderer;
/**
* Format used for signature dates
* TODO Make this thread-safe
*/
private final SimpleDateFormat dateFormat =
new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT);
// Font names collected in endPage when font-name extraction is enabled.
private final Set<String> fontNames = new TreeSet<>();
// Distinct annotation names/subtypes seen so far; each set is capped at MAX_ANNOTATION_TYPES.
private final Set<String> annotationTypes = new TreeSet<>();
private final Set<String> annotationSubtypes = new TreeSet<>();
// NOTE(review): not written in the visible part of this file; presumably filled by the
// action-handling code further down -- confirm.
private final Set<String> triggers = new TreeSet<>();
private final Set<String> actionTypes = new TreeSet<>();
//these are files that we extract as part of Annotations
//We don't want to extract them twice when we go through the
//full DOM looking for /Type = /EmbeddedFile
private final Set<COSBase> extractedFiles = new HashSet<>();
//zero-based pageIndex
int pageIndex = 0;
//private in PDFTextStripper...must have own copy because we override processpages
// The two per-page counters are reported and then reset at the end of endPage.
int unmappedUnicodeCharsPerPage = 0;
int totalCharsPerPage = 0;
int totalUnmappedUnicodeCharacters;
int totalCharacters;
//contains at least one font that is not embedded
boolean containsNonEmbeddedFont = false;
//contains at least one broken font
boolean containsDamagedFont = false;
// Incremented once for every annotation that processPageAnnotation treats as 3D.
int num3DAnnotations = 0;
/**
 * Captures the collaborators needed while walking the document and wraps the
 * caller's handler in an {@link XHTMLContentHandler}.
 * <p>
 * No OCR parser is looked up when the configured OCR strategy is {@code NO_OCR};
 * {@code ocrParser} is {@code null} in that case.
 *
 * @param pdDocument the open PDF document to process
 * @param handler    receiver of the generated SAX events
 * @param context    parse context used to look up the embedded-document extractor and parser
 * @param metadata   metadata of the document being parsed
 * @param config     PDF parser configuration
 * @param renderer   renderer kept for page rendering; may be {@code null}
 * @throws IOException declared by the constructor (no statement in this body throws it directly)
 */
AbstractPDF2XHTML(PDDocument pdDocument, ContentHandler handler, ParseContext context,
                  Metadata metadata, PDFParserConfig config, Renderer renderer) throws IOException {
    this.context = context;
    this.metadata = metadata;
    this.config = config;
    this.renderer = renderer;
    this.pdDocument = pdDocument;
    this.xhtml = new XHTMLContentHandler(handler, metadata, context);
    this.embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
    this.ocrParser = (config.getOcrStrategy() == NO_OCR)
            ? null
            : EmbeddedDocumentUtil.getStatelessParser(context);
}
/**
 * Appends a {@code CDATA} attribute with an empty namespace URI, using {@code name}
 * as both the local and the qualified name. Nothing is added when either
 * {@code name} or {@code value} is {@code null}.
 *
 * @param name       attribute name, may be {@code null}
 * @param value      attribute value, may be {@code null}
 * @param attributes attribute list that receives the new entry
 */
private static void addNonNullAttribute(String name, String value, AttributesImpl attributes) {
    boolean bothPresent = name != null && value != null;
    if (bothPresent) {
        attributes.addAttribute("", name, name, "CDATA", value);
    }
}
/**
 * Returns the URI action attached to an annotation, if it has one.
 * <p>
 * Adapted from PDFBox's PrintURLs example: reflection is used so that every
 * annotation class that itself declares a no-arg {@code getAction()} returning
 * {@link PDAction} is covered without listing the classes explicitly.
 *
 * @param annot the annotation to inspect
 * @return the annotation's {@link PDActionURI}, or {@code null} when the annotation
 * class declares no suitable {@code getAction()}, the call fails, or the action is
 * not a URI action
 */
private static PDActionURI getActionURI(PDAnnotation annot) {
    final Object action;
    try {
        Method getter = annot.getClass().getDeclaredMethod("getAction");
        if (!getter.getReturnType().equals(PDAction.class)) {
            return null;
        }
        action = getter.invoke(annot);
    } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException ignored) {
        //best effort: treat any reflective failure as "no URI action"
        return null;
    }
    return (action instanceof PDActionURI) ? (PDActionURI) action : null;
}
/**
 * Opens the {@code <div class="page">} element for a page and then starts the
 * first paragraph.
 *
 * @param page the page that is about to be processed
 * @throws IOException if the SAX handler rejects the start element (the
 * {@link SAXException} is kept as the cause) or if starting the paragraph fails
 */
@Override
protected void startPage(PDPage page) throws IOException {
    try {
        xhtml.startElement("div", "class", "page");
    } catch (SAXException saxFailure) {
        throw new IOException("Unable to start a page", saxFailure);
    }
    writeParagraphStart();
}
/**
 * Hands the document's XMP packets (document catalog first, then each page) and its
 * XFA form data to the embedded-document extractor as metadata-type embedded resources.
 * <p>
 * Nothing is extracted unless a {@link Parser} is registered in {@code context} and
 * reports at least one supported type; XMP and XFA are each processed only when
 * that parser supports the respective media type.
 *
 * @param pdfDocument    document to read XMP and XFA from
 * @param parentMetadata metadata that receives a record of any stream-level IOException
 * @param context        parse context used to look up the embedded parser
 * @throws IOException  if parsing an embedded resource fails and the failure is not
 *                      absorbed by {@code handleCatchableIOE}
 * @throws SAXException if the SAX handler fails while an embedded resource is parsed
 */
private void extractXMPXFA(PDDocument pdfDocument, Metadata parentMetadata,
                           ParseContext context) throws IOException, SAXException {
    //emptySet() keeps the assignment type-safe (EMPTY_SET is a raw type)
    Set<MediaType> supportedTypes = Collections.emptySet();
    Parser embeddedParser = context.get(Parser.class);
    if (embeddedParser != null) {
        supportedTypes = embeddedParser.getSupportedTypes(context);
    }
    if (supportedTypes == null || supportedTypes.isEmpty()) {
        return;
    }
    PDDocumentCatalog catalog = pdfDocument.getDocumentCatalog();
    if (supportedTypes.contains(XMP_MEDIA_TYPE)) {
        //try the main metadata
        if (catalog.getMetadata() != null) {
            try (TikaInputStream tis = TikaInputStream.get(
                    catalog.getMetadata().exportXMPMetadata())) {
                extractXMPAsEmbeddedFile(tis, XMP_DOCUMENT_CATALOG_LOCATION);
            } catch (IOException e) {
                EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
            }
        }
        //now iterate through the pages; page numbers in the location are 1-based
        int pageNumber = 1;
        for (PDPage page : pdfDocument.getPages()) {
            if (page.getMetadata() != null) {
                try (TikaInputStream tis =
                             TikaInputStream.get(page.getMetadata().exportXMPMetadata())) {
                    extractXMPAsEmbeddedFile(tis, XMP_PAGE_LOCATION_PREFIX + pageNumber);
                } catch (IOException e) {
                    EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
                }
            }
            pageNumber++;
        }
    }
    //now try the xfa
    PDAcroForm acroForm = catalog.getAcroForm(null);
    PDXFAResource xfa = (acroForm == null) ? null : acroForm.getXFA();
    if (xfa == null) {
        return;
    }
    Metadata xfaMetadata = Metadata.newInstance(context);
    xfaMetadata.set(Metadata.CONTENT_TYPE, XFA_MEDIA_TYPE.toString());
    xfaMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
            TikaCoreProperties.EmbeddedResourceType.METADATA.toString());
    if (embeddedDocumentExtractor.shouldParseEmbedded(xfaMetadata) &&
            supportedTypes.contains(XFA_MEDIA_TYPE)) {
        byte[] bytes = null;
        try {
            bytes = xfa.getBytes();
        } catch (IOException e) {
            //an unreadable XFA stream is recorded on the parent, not fatal
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
        }
        if (bytes != null) {
            try (TikaInputStream tis = TikaInputStream.get(bytes)) {
                parseMetadata(tis, xfaMetadata);
            }
        }
    }
}
/**
 * Offers one XMP packet to the embedded-document extractor, tagged as a
 * metadata-type embedded resource with the given location.
 *
 * @param tis      stream over the XMP packet; a {@code null} stream is ignored
 * @param location where the packet was found; stored under {@code PDF.XMP_LOCATION}
 * @throws IOException  if parsing fails and the failure is not absorbed by
 *                      {@code handleCatchableIOE}
 * @throws SAXException if the SAX handler fails during parsing
 */
private void extractXMPAsEmbeddedFile(TikaInputStream tis, String location)
        throws IOException, SAXException {
    if (tis != null) {
        Metadata packetMetadata = Metadata.newInstance(context);
        packetMetadata.set(Metadata.CONTENT_TYPE, XMP_MEDIA_TYPE.toString());
        packetMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                TikaCoreProperties.EmbeddedResourceType.METADATA.toString());
        packetMetadata.set(PDF.XMP_LOCATION, location);
        boolean wanted = embeddedDocumentExtractor.shouldParseEmbedded(packetMetadata);
        if (wanted) {
            parseMetadata(tis, packetMetadata);
        }
    }
}
/**
 * Passes a metadata stream (XMP or XFA) to the embedded-document extractor,
 * writing into the current XHTML handler with the {@code true} flag set on
 * {@code parseEmbedded}.
 *
 * @param tis              stream to parse
 * @param embeddedMetadata metadata describing the embedded resource
 * @throws IOException  if the extractor throws an IOException that
 *                      {@code handleCatchableIOE} decides to rethrow
 * @throws SAXException if the SAX handler fails during parsing
 */
private void parseMetadata(TikaInputStream tis, Metadata embeddedMetadata)
        throws IOException, SAXException {
    EmbeddedContentHandler embeddedHandler = new EmbeddedContentHandler(xhtml);
    try {
        embeddedDocumentExtractor.parseEmbedded(tis, embeddedHandler, embeddedMetadata,
                context, true);
    } catch (IOException ioFailure) {
        handleCatchableIOE(ioFailure);
    }
}
/**
 * Extracts every attached file reachable from the document.
 * <p>
 * Pass one processes all objects of type {@code /Filespec} anywhere in the file
 * (TIKA-4012; see 14.13.10 of the PDF 2.0 spec -- associated files can hang off
 * many places, even streams). This finds everything at the price of losing the
 * context of what each file was attached to. Pass two walks the embedded-files
 * name tree for the rare entries that pass one cannot see (a direct-object
 * filespec, or a file attached straight to the tree), skipping anything whose
 * underlying object was already handled.
 *
 * @param document the document to scan
 * @throws IOException   on read failures, or when the name tree is nested too deeply
 * @throws SAXException  if writing to the XHTML handler fails
 * @throws TikaException if processing an attachment fails
 */
private void extractEmbeddedDocuments(PDDocument document)
        throws IOException, SAXException, TikaException {
    List<COSObject> fileSpecObjects = document.getDocument().getObjectsByType(COSName.FILESPEC);
    Set<COSBase> handled = new HashSet<>();
    for (COSObject fileSpecObject : fileSpecObjects) {
        COSBase target = fileSpecObject.getObject();
        processDoc("", "", createFileSpecification(target), new AttributesImpl());
        handled.add(target);
    }
    PDDocumentCatalog catalog = document.getDocumentCatalog();
    if (catalog == null || catalog.getNames() == null ||
            catalog.getNames().getEmbeddedFiles() == null) {
        return;
    }
    //a list rather than a name-keyed map: colliding names must not hide attachments
    List<NameSpecTuple> treeEntries = new ArrayList<>();
    extractFilesfromEFTree(catalog.getNames().getEmbeddedFiles(), treeEntries, 0);
    //de-duplicates against pass one and against the same file listed under several names
    for (NameSpecTuple entry : treeEntries) {
        COSBase specObject = entry.getSpec().getCOSObject();
        if (!handled.contains(specObject)) {
            processDoc(entry.getName(), "", entry.getSpec(), new AttributesImpl());
            handled.add(specObject);
        }
    }
}
/**
 * Processes a file specification and then remembers its COS object in
 * {@code extractedFiles}, so that later calls to {@code processDoc} skip it.
 *
 * @param name           display name to fall back on for the attachment
 * @param annotationType annotation type recorded on the attachment's metadata
 * @param spec           file specification; {@code null} is ignored
 * @param attributes     attributes for the emitted {@code div}
 */
private void processDocOnAction(String name, String annotationType, PDFileSpecification spec,
                                AttributesImpl attributes)
        throws TikaException, SAXException, IOException {
    if (spec != null) {
        processDoc(name, annotationType, spec, attributes);
        extractedFiles.add(spec.getCOSObject());
    }
}
/**
 * Handles one file specification unless it is {@code null} or its COS object is
 * already in {@code extractedFiles}.
 * <p>
 * A complex specification has its embedded files extracted; {@code source="attachment"}
 * is added to {@code attributes} first when no {@code source} attribute is present.
 * A simple specification only produces an empty
 * {@code <div class="linked" id="<file>">} marker.
 *
 * @param name           display name to fall back on for the attachment
 * @param annotationType annotation type recorded on the attachment's metadata
 * @param spec           file specification, may be {@code null}
 * @param attributes     attributes for the emitted {@code div}; modified by this method
 */
private void processDoc(String name, String annotationType, PDFileSpecification spec,
                        AttributesImpl attributes)
        throws TikaException, SAXException, IOException {
    if (spec == null || extractedFiles.contains(spec.getCOSObject())) {
        return;
    }
    if (spec instanceof PDComplexFileSpecification) {
        boolean sourceMissing = attributes.getIndex("source") < 0;
        if (sourceMissing) {
            attributes.addAttribute("", "source", "source", "CDATA", "attachment");
        }
        extractMultiOSPDEmbeddedFiles(name, annotationType, (PDComplexFileSpecification) spec,
                attributes);
    } else if (spec instanceof PDSimpleFileSpecification) {
        attributes.addAttribute("", "class", "class", "CDATA", "linked");
        attributes.addAttribute("", "id", "id", "CDATA", spec.getFile());
        xhtml.startElement("div", attributes);
        xhtml.endElement("div");
    }
}
/**
 * Extracts each of the four embedded-file variants a complex file specification
 * can carry -- generic, Mac, DOS and Unix -- in that order. All variants are
 * attempted, not just the first one that is present.
 *
 * @param displayName    name to fall back on when a variant has no file name
 * @param annotationType annotation type recorded on the attachment's metadata
 * @param spec           complex file specification; {@code null} is ignored
 * @param attributes     attributes for the emitted {@code div} elements
 */
private void extractMultiOSPDEmbeddedFiles(String displayName, String annotationType,
                                           PDComplexFileSpecification spec,
                                           AttributesImpl attributes)
        throws IOException, SAXException, TikaException {
    if (spec != null) {
        String genericName = spec.getFile();
        PDEmbeddedFile genericFile = spec.getEmbeddedFile();
        extractPDEmbeddedFile(displayName, annotationType, spec, genericName, genericFile,
                attributes);

        String macName = spec.getFileMac();
        PDEmbeddedFile macFile = spec.getEmbeddedFileMac();
        extractPDEmbeddedFile(displayName, annotationType, spec, macName, macFile, attributes);

        String dosName = spec.getFileDos();
        PDEmbeddedFile dosFile = spec.getEmbeddedFileDos();
        extractPDEmbeddedFile(displayName, annotationType, spec, dosName, dosFile, attributes);

        String unixName = spec.getFileUnix();
        PDEmbeddedFile unixFile = spec.getEmbeddedFileUnix();
        extractPDEmbeddedFile(displayName, annotationType, spec, unixName, unixFile,
                attributes);
        //Not handled yet: /Thumb (thumbnail image) and the Adobe-specific /CI
        //(collection item), which can carry /adobe:DisplayName and a summary
    }
}
/**
 * Extracts a single embedded file: builds its metadata, emits an empty
 * {@code <div class="embedded" id="<fileName>">} marker and passes the file's
 * stream to the embedded-document extractor.
 * <p>
 * The file name falls back first to the specification's unicode file name and then
 * to {@code displayName} when it is {@code null} or blank after trimming.
 * <p>
 * The caller's {@code attributes} are copied, not modified: this method is invoked
 * several times with the same instance (once per OS-specific variant) and appending
 * {@code class}/{@code id} in place would put duplicate attributes on later elements.
 *
 * @param displayName    last-resort name for the attachment
 * @param annotationType annotation type to record; skipped when blank
 * @param spec           specification that owns the embedded file
 * @param fileName       preferred file name, may be {@code null} or blank
 * @param pdEmbeddedFile the embedded file; {@code null} is skipped silently
 * @param attributes     base attributes for the emitted {@code div}; left unchanged
 * @throws SAXException if writing to the XHTML handler or parsing the attachment fails
 * @throws IOException  if parsing the attachment fails; a failure to open the
 *                      stream is recorded on the parent metadata instead
 */
private void extractPDEmbeddedFile(String displayName, String annotationType,
                                   PDComplexFileSpecification spec, String fileName,
                                   PDEmbeddedFile pdEmbeddedFile, AttributesImpl attributes)
        throws SAXException, IOException {
    if (pdEmbeddedFile == null) {
        //skip silently
        return;
    }
    fileName =
            (fileName == null || "".equals(fileName.trim())) ? spec.getFileUnicode() : fileName;
    fileName = (fileName == null || "".equals(fileName.trim())) ? displayName : fileName;

    // TODO: other metadata?
    Metadata embeddedMetadata = Metadata.newInstance(context);
    embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileName);
    //if the stream is missing a size, -1 is returned
    long sz = pdEmbeddedFile.getSize();
    if (sz > -1) {
        embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(sz));
    }
    embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
            TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
    embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName);
    if (!StringUtils.isBlank(annotationType)) {
        embeddedMetadata.set(PDF.EMBEDDED_FILE_ANNOTATION_TYPE, annotationType);
    }
    if (!StringUtils.isBlank(pdEmbeddedFile.getSubtype())) {
        embeddedMetadata.set(PDF.EMBEDDED_FILE_SUBTYPE, pdEmbeddedFile.getSubtype());
    }
    if (!StringUtils.isBlank(spec.getFileDescription())) {
        embeddedMetadata.set(PDF.EMBEDDED_FILE_DESCRIPTION, spec.getFileDescription());
    }
    //the relationship is read as a name first and as a string if that yields nothing
    String afRelationship = spec.getCOSObject().getNameAsString(PDFParser.AF_RELATIONSHIP);
    if (StringUtils.isBlank(afRelationship)) {
        afRelationship = spec.getCOSObject().getString(PDFParser.AF_RELATIONSHIP);
    }
    if (!StringUtils.isBlank(afRelationship)) {
        embeddedMetadata.set(PDF.ASSOCIATED_FILE_RELATIONSHIP, afRelationship);
    }
    if (!embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
        return;
    }
    TikaInputStream tis;
    try {
        tis = TikaInputStream.get(pdEmbeddedFile.createInputStream());
    } catch (IOException e) {
        //store this exception in the parent's metadata
        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
        return;
    }
    //everything after the stream is opened sits inside the try so that a failure
    //while writing the marker element cannot leak the stream
    try {
        AttributesImpl divAttributes = new AttributesImpl(attributes);
        divAttributes.addAttribute("", "class", "class", "CDATA", "embedded");
        divAttributes.addAttribute("", "id", "id", "CDATA", fileName);
        xhtml.startElement("div", divAttributes);
        xhtml.endElement("div");
        embeddedDocumentExtractor.parseEmbedded(tis, new EmbeddedContentHandler(xhtml),
                embeddedMetadata, context, false);
    } finally {
        IOUtils.closeQuietly(tis);
    }
}
/**
 * Decides whether an IOException raised part-way through parsing is fatal.
 * <p>
 * A write-limit exception is always rethrown, after flagging
 * {@code WRITE_LIMIT_REACHED} in the metadata. Any other exception is rethrown
 * unless the config asks for intermediate IOExceptions to be caught; in that
 * case its message is added to the metadata as a warning and the exception is
 * appended to {@code exceptions}.
 *
 * @param e the exception to evaluate
 * @throws IOException {@code e} itself, when it must not be swallowed
 */
void handleCatchableIOE(IOException e) throws IOException {
    if (WriteLimitReachedException.isWriteLimitReached(e)) {
        metadata.set(TikaCoreProperties.WRITE_LIMIT_REACHED, "true");
        throw e;
    }
    if (!config.isCatchIntermediateIOExceptions()) {
        throw e;
    }
    String detail = e.getMessage();
    String warning = (detail != null) ? detail : "IOException, no message";
    metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, warning);
    exceptions.add(e);
}
/**
 * Renders the current page to an image and runs the OCR parser over it, writing
 * the recognised text into the XHTML output.
 * <p>
 * The page is counted on the {@link OCRPageCounter} from the context (if any)
 * before the page limit is checked, so the counter reflects how often OCR was
 * requested rather than how often it actually ran.
 *
 * @param pdPage      the page to OCR
 * @param ocrStrategy strategy in effect for this call; {@code NO_OCR} returns immediately
 * @throws IOException   if rendering or OCR fails and the failure is not absorbed by
 *                       {@code handleCatchableIOE}, or if writing OCR output raises a
 *                       {@link SAXException} (kept as the cause)
 * @throws TikaException if the strategy requires OCR but the OCR parser does not
 *                       support the configured OCR image type
 * @throws SAXException  declared for callers; SAX failures from the OCR parse itself
 *                       are rethrown as {@link IOException}
 */
void doOCROnCurrentPage(PDPage pdPage, OcrConfig.Strategy ocrStrategy)
        throws IOException, TikaException, SAXException {
    if (ocrStrategy.equals(NO_OCR)) {
        //I don't think this is reachable?
        return;
    }
    //count the number of times that OCR would have been called
    OCRPageCounter c = context.get(OCRPageCounter.class);
    if (c != null) {
        c.increment();
    }
    // Enforce maxPagesToOcr limit (only possible when a counter is available)
    int maxPagesToOcr = config.getOcrMaxPagesToOcr();
    if (maxPagesToOcr > 0 && c != null && c.getCount() > maxPagesToOcr) {
        return;
    }
    MediaType ocrImageMediaType =
            MediaType.image("ocr-" + config.getOcrImageFormat().getFormatName());
    if (!ocrParser.getSupportedTypes(context).contains(ocrImageMediaType)) {
        if (ocrStrategy == OCR_ONLY || ocrStrategy == OCR_AND_TEXT_EXTRACTION) {
            throw new TikaException(
                    "I regret that I couldn't find an OCR parser to handle " +
                            ocrImageMediaType + ". " +
                            "Please set the OCR_STRATEGY to NO_OCR or configure your " +
                            "OCR parser correctly");
        } else if (ocrStrategy == AUTO) {
            //silently skip if there's no parser to run ocr
            return;
        }
    }
    try (TemporaryResources tmp = new TemporaryResources()) {
        try (RenderResult renderResult = renderCurrentPage(pdPage, context, tmp)) {
            Metadata renderMetadata = renderResult.getMetadata();
            try (TikaInputStream tis = renderResult.getInputStream()) {
                //force the OCR parser's media type onto the rendered image
                renderMetadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
                        ocrImageMediaType.toString());
                ocrParser.parse(tis, new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
                        renderMetadata, context);
            }
            // Propagate enrichment metadata added by the OCR parser (e.g. tika:chunks
            // from image embedding parsers) back to the parent document so it isn't
            // silently discarded when the renderMetadata goes out of scope.
            // An existing value on the parent is never overwritten.
            String renderChunks = renderMetadata.get(TikaCoreProperties.TIKA_CHUNKS);
            if (renderChunks != null && metadata.get(TikaCoreProperties.TIKA_CHUNKS) == null) {
                metadata.set(TikaCoreProperties.TIKA_CHUNKS, renderChunks);
            }
        }
    } catch (IOException e) {
        handleCatchableIOE(e);
    } catch (SAXException e) {
        throw new IOException("error writing OCR content from PDF", e);
    }
}
/**
 * Produces the rendered image of the current page for OCR.
 * <p>
 * Resolution order:
 * <ol>
 * <li>no {@link PDFRenderingState} in the context: render directly with PDFBox via
 * {@code noContextRenderCurrentPage} and return that result;</li>
 * <li>the rendering state already holds exactly one result for this page: reuse it;</li>
 * <li>a PDF-capable renderer is configured and the rendering strategy is
 * {@code ALL}: delegate to that renderer for just this page;</li>
 * <li>otherwise: render directly with PDFBox.</li>
 * </ol>
 *
 * @param pdPage       the page to render
 * @param parseContext context holding the optional rendering state
 * @param tmpResources owner of any temporary file created for the image
 * @return the render result for the current page
 * @throws IOException   if the configured renderer fails
 * @throws TikaException if the configured renderer fails
 */
private RenderResult renderCurrentPage(PDPage pdPage, ParseContext parseContext,
                                       TemporaryResources tmpResources)
        throws IOException, TikaException {
    PDFRenderingState renderingState = parseContext.get(PDFRenderingState.class);
    if (renderingState == null) {
        //nothing to reuse and no original stream to hand to a renderer: render directly.
        //The result must be returned here; falling through would dereference the
        //null rendering state below.
        return noContextRenderCurrentPage(getCurrentPageMetadata(pdPage), parseContext,
                tmpResources);
    }
    //if the full document has already been rendered, then reuse that file
    //TODO: we need to prevent this if only a portion of the page or portions
    //of the page have been rendered.
    //TODO: we should also figure out how to not reuse the rendering if
    //the user wants to render twice (say, full color to display to users, but
    //grayscale for (notionally?) better OCR).
    PageBasedRenderResults results = (PageBasedRenderResults) renderingState.getRenderResults();
    if (results != null) {
        List<RenderResult> pageResults = results.getPage(getCurrentPageNo());
        if (pageResults.size() == 1) {
            return pageResults.get(0);
        }
    }
    Metadata pageMetadata = getCurrentPageMetadata(pdPage);
    Renderer thisRenderer = getPDFRenderer(renderer);
    //if there's a configured renderer and if the rendering strategy is "all"
    if (thisRenderer != null &&
            config.getOcrRenderingStrategy() == OcrConfig.RenderingStrategy.ALL) {
        PageRangeRequest pageRangeRequest =
                new PageRangeRequest(getCurrentPageNo(), getCurrentPageNo());
        if (thisRenderer instanceof PDDocumentRenderer) {
            //do not do autocloseable. We need to leave the pdDocument open!
            TikaInputStream tis = TikaInputStream.get(new byte[0]);
            tis.setOpenContainer(pdDocument);
            return thisRenderer.render(tis, pageMetadata, parseContext, pageRangeRequest)
                    .getResults().get(0);
        } else {
            PDFRenderingState state = context.get(PDFRenderingState.class);
            if (state == null) {
                throw new IllegalArgumentException("RenderingState must not be null");
            }
            return thisRenderer.render(state.getTikaInputStream(), pageMetadata, parseContext,
                    pageRangeRequest).getResults().get(0);
        }
    } else {
        return noContextRenderCurrentPage(pageMetadata, parseContext, tmpResources);
    }
}
/**
 * Picks the renderer that can handle PDFs out of the supplied renderer.
 *
 * @param renderer configured renderer, may be {@code null}
 * @return the PDF leaf renderer of a {@link CompositeRenderer} (whatever
 * {@code getLeafRenderer} returns for the PDF media type); the renderer itself
 * when it lists the PDF media type among its supported types; otherwise {@code null}
 */
private Renderer getPDFRenderer(Renderer renderer) {
    if (renderer instanceof CompositeRenderer) {
        CompositeRenderer composite = (CompositeRenderer) renderer;
        return composite.getLeafRenderer(PDFParser.MEDIA_TYPE);
    }
    boolean handlesPdf = renderer != null &&
            renderer.getSupportedTypes(context).contains(PDFParser.MEDIA_TYPE);
    return handlesPdf ? renderer : null;
}
/**
 * Creates the metadata that accompanies a rendering of the current page: the PDF
 * media type, the current page number and the page's rotation.
 *
 * @param pdPage the page whose rotation is recorded
 * @return a new metadata object for the current page
 */
private Metadata getCurrentPageMetadata(PDPage pdPage) {
    float rotation = (float) pdPage.getRotation();
    Metadata current = Metadata.newInstance(context);
    current.set(TikaCoreProperties.TYPE, PDFParser.MEDIA_TYPE.toString());
    current.set(TikaPagedText.PAGE_NUMBER, getCurrentPageNo());
    current.set(TikaPagedText.PAGE_ROTATION, rotation);
    return current;
}
/**
* Renders the page at {@code pageIndex} directly with PDFBox and writes the image to a
* temporary file.
* <p>
* The PDFBox renderer is chosen from the configured OCR rendering strategy; DPI, image
* type, image format and quality all come from the config. Failures do not propagate
* (except {@link SecurityException}): they are recorded in the parent metadata and a
* result with status {@code EXCEPTION} and a {@code null} payload is returned.
*
* @param pageMetadata metadata attached to the returned result
* @param parseContext context that holds (or receives) the {@link RenderingTracker}
* @param tmpResources creates the temporary file that receives the image
* @return a {@code SUCCESS} result pointing at the temp file, or an {@code EXCEPTION}
* result when the page is too large or rendering/writing failed
* @throws IOException declared; IOExceptions raised inside the try block are caught
* @throws TikaException declared; not thrown by any statement visible in this method
*/
private RenderResult noContextRenderCurrentPage(Metadata pageMetadata,
ParseContext parseContext,
TemporaryResources tmpResources)
throws IOException, TikaException {
// Shadows the Renderer field of the same name; this is the PDFBox renderer.
// The switch has no default branch, so the variable stays null for any strategy not
// listed; the resulting NullPointerException is caught by the RuntimeException handler.
PDFRenderer renderer = null;
switch (config.getOcrRenderingStrategy()) {
case NO_TEXT:
renderer = new NoTextPDFRenderer(pdDocument);
break;
case TEXT_ONLY:
renderer = new TextOnlyPDFRenderer(pdDocument);
break;
case VECTOR_GRAPHICS_ONLY:
renderer = new VectorGraphicsOnlyPDFRenderer(pdDocument);
break;
case ALL:
renderer = new PDFRenderer(pdDocument);
break;
}
int dpi = config.getOcrDPI();
Path tmpFile = null;
// Reuse the tracker from the context, registering a new one if none is there yet.
RenderingTracker renderingTracker = parseContext.get(RenderingTracker.class);
if (renderingTracker == null) {
renderingTracker = new RenderingTracker();
parseContext.set(RenderingTracker.class, renderingTracker);
}
// The id is allocated before rendering, so failed renderings consume an id too.
int id = renderingTracker.getNextId();
try {
// Check estimated pixel dimensions before rendering to
// prevent OOM on pathologically large pages
long maxPixels = config.getOcrMaxImagePixels();
if (maxPixels > 0) {
PDPage currentPage = pdDocument.getPage(pageIndex);
PDRectangle mediaBox = currentPage.getMediaBox();
// presumably the media box is in points (1/72 inch), hence the division by 72
long estWidth = (long) Math.ceil(mediaBox.getWidth() / 72.0 * dpi);
long estHeight = (long) Math.ceil(mediaBox.getHeight() / 72.0 * dpi);
long estPixels = estWidth * estHeight;
if (estPixels > maxPixels) {
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM,
"Skipping OCR for page " + (pageIndex + 1)
+ ": estimated " + estPixels
+ " pixels exceeds maxImagePixels="
+ maxPixels);
return new RenderResult(RenderResult.STATUS.EXCEPTION,
id, null, pageMetadata);
}
}
BufferedImage image =
renderer.renderImageWithDPI(pageIndex, dpi, config.getOcrImageType().getPdfBoxImageType());
//TODO -- get suffix based on OcrImageType
tmpFile = tmpResources.createTempFile();
try (OutputStream os = Files.newOutputStream(tmpFile)) {
//TODO: get output format from TesseractConfig
ImageIOUtil.writeImage(image, config.getOcrImageFormat().getFormatName(), os, dpi,
config.getOcrImageQuality());
}
} catch (SecurityException e) {
//throw SecurityExceptions immediately
// (this clause must precede the RuntimeException handler below to take effect)
throw e;
} catch (IOException | RuntimeException e) {
//image rendering can throw a variety of runtime exceptions, not just
// IOExceptions...
//need to have a wide catch
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM,
ExceptionUtils.getStackTrace(e));
return new RenderResult(RenderResult.STATUS.EXCEPTION, id, null, pageMetadata);
}
return new RenderResult(RenderResult.STATUS.SUCCESS, id, tmpFile, pageMetadata);
}
/**
 * Finishes a page: records the page's character statistics, processes its
 * annotations, runs OCR when the strategy calls for it, handles the page's
 * open/close actions, closes the page {@code div} and finally collects font names.
 * <p>
 * With the {@code AUTO} strategy OCR runs when the page has no more characters than
 * the configured threshold, or when it has more but too many of them are unmapped.
 * The unmapped limit is read as a fraction of the page's characters when it is
 * below 1 and as an absolute count otherwise.
 * <p>
 * The per-page counters are reset whether or not processing succeeded.
 *
 * @param page the page that has just been processed
 * @throws IOException if a SAX or Tika failure occurs (wrapped, with the cause kept),
 *                     if {@code handleCatchableIOE} rethrows an IOException, or if
 *                     loading a font for font-name extraction fails
 */
@Override
protected void endPage(PDPage page) throws IOException {
    metadata.add(PDF.CHARACTERS_PER_PAGE, totalCharsPerPage);
    metadata.add(PDF.UNMAPPED_UNICODE_CHARS_PER_PAGE, unmappedUnicodeCharsPerPage);
    try {
        for (PDAnnotation annotation : page.getAnnotations()) {
            processPageAnnotation(annotation);
        }
        if (config.getOcrStrategy() == OCR_AND_TEXT_EXTRACTION) {
            doOCROnCurrentPage(page, OCR_AND_TEXT_EXTRACTION);
        } else if (config.getOcrStrategy() == AUTO) {
            boolean unmappedExceedsLimit = false;
            if (totalCharsPerPage > config.getOcrStrategyAuto().getTotalCharsPerPage()) {
                // There are enough characters to not have to do OCR.
                // Check number of unmapped characters
                final float percentUnmapped =
                        (float) unmappedUnicodeCharsPerPage / totalCharsPerPage;
                final float unmappedCharacterLimit =
                        config.getOcrStrategyAuto().getUnmappedUnicodeCharsPerPage();
                unmappedExceedsLimit = (unmappedCharacterLimit < 1) ?
                        percentUnmapped > unmappedCharacterLimit :
                        unmappedUnicodeCharsPerPage > unmappedCharacterLimit;
            }
            if (totalCharsPerPage <= config.getOcrStrategyAuto().getTotalCharsPerPage() ||
                    unmappedExceedsLimit) {
                doOCROnCurrentPage(page, AUTO);
            }
        }
        PDPageAdditionalActions pageActions = page.getActions();
        if (pageActions != null) {
            handleDestinationOrAction(pageActions.getC(), ActionTrigger.PAGE_CLOSE);
            handleDestinationOrAction(pageActions.getO(), ActionTrigger.PAGE_OPEN);
        }
        xhtml.endElement("div");
    } catch (SAXException | TikaException e) {
        throw new IOException("Unable to end a page", e);
    } catch (IOException e) {
        handleCatchableIOE(e);
    } finally {
        totalCharsPerPage = 0;
        unmappedUnicodeCharsPerPage = 0;
    }
    //a page without a resources dictionary yields null resources; there are no
    //fonts to collect in that case
    if (config.isExtractFontNames() && page.getResources() != null) {
        for (COSName n : page.getResources().getFontNames()) {
            PDFont font = page.getResources().getFont(n);
            if (font != null && font.getFontDescriptor() != null) {
                String fontName = font.getFontDescriptor().getFontName();
                if (fontName != null) {
                    fontNames.add(fontName);
                }
            }
        }
    }
}
/**
* Processes one page annotation in three steps: records its name and subtype,
* extracts whatever the annotation carries (attached files, widget content, 3D
* scripts and actions), and -- only when annotation text extraction is enabled --
* writes its link and markup text to the XHTML output.
*
* @param annotation the annotation to process
* @throws TikaException if extracting an attachment fails
* @throws IOException if reading from the PDF fails
* @throws SAXException if writing to the XHTML handler fails
*/
private void processPageAnnotation(PDAnnotation annotation) throws TikaException, IOException, SAXException {
// Step 1: bookkeeping. Each set stops growing once it holds MAX_ANNOTATION_TYPES
// entries; a missing name/subtype is recorded as NULL_STRING.
String annotationName = annotation.getAnnotationName();
if (annotationTypes.size() < MAX_ANNOTATION_TYPES) {
if (annotationName != null) {
annotationTypes.add(annotationName);
} else {
annotationTypes.add(NULL_STRING);
}
}
String annotationSubtype = annotation.getSubtype();
if (annotationSubtypes.size() < MAX_ANNOTATION_TYPES) {
if (annotationSubtype != null) {
annotationSubtypes.add(annotationSubtype);
} else {
annotationSubtypes.add(NULL_STRING);
}
}
// Step 2: extraction, depending on the kind of annotation.
if (annotation instanceof PDAnnotationFileAttachment) {
PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
String subtype = "annotationFileAttachment";
AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute("", "source", "source", "CDATA", subtype);
processDocOnAction("", subtype, fann.getFile(), attributes);
} else if (annotation instanceof PDAnnotationWidget) {
handleWidget((PDAnnotationWidget) annotation);
} else {
// NOTE(review): because this is an if/else-if, an annotation with no subtype is
// never treated as 3D even if it has a /3DD entry -- confirm this is intended.
if (annotationSubtype == null) {
annotationSubtype = "unknown";
} else if (annotationSubtype.equals(THREE_D) ||
annotation.getCOSObject().containsKey(COSName.THREE_DD)) {
//To make this stricter, we could get the 3DD stream object and see if the
//subtype is U3D or PRC or model/ (prefix for model mime type)
extractOnInstantiate(annotation);
COSDictionary additionalActions = annotation.getCOSObject().getCOSDictionary(COSName.AA);
if (additionalActions != null) {
handlePDAnnotationAdditionalActions(new PDAnnotationAdditionalActions(additionalActions));
}
metadata.set(PDF.HAS_3D, true);
num3DAnnotations++;
}
// Any file specification found inside the annotation's dictionary is extracted,
// labelled with the annotation subtype ("unknown" when there was none).
for (COSDictionary fileSpec : findFileSpecs(annotation.getCOSObject())) {
AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute("", "source", "source", "CDATA", annotationSubtype);
processDocOnAction("", annotationSubtype, createFileSpecification(fileSpec),
attributes);
}
}
// Step 3: text output; skipped entirely when annotation text is not wanted.
if (! config.isExtractAnnotationText()) {
return;
}
// TODO: remove once PDFBOX-1143 is fixed:
PDActionURI uri = getActionURI(annotation);
if (uri != null) {
String link = uri.getURI();
// A non-blank link is written as <div class="annotation"><a href=link>link</a></div>.
if (link != null && !link.isBlank()) {
xhtml.startElement("div", "class", "annotation");
xhtml.startElement("a", "href", link);
xhtml.characters(link);
xhtml.endElement("a");
xhtml.endElement("div");
}
}
if (annotation instanceof PDAnnotationMarkup) {
PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation;
String title = annotationMarkup.getTitlePopup();
String subject = annotationMarkup.getSubject();
String contents = annotationMarkup.getContents();
// TODO: maybe also annotationMarkup.getRichContents()?
// The wrapper div is written only if at least one of the three values is present;
// each present value gets its own child div.
if (title != null || subject != null || contents != null) {
xhtml.startElement("div", "class", "annotation");
if (title != null) {
xhtml.startElement("div", "class", "annotationTitle");
xhtml.characters(title);
xhtml.endElement("div");
}
if (subject != null) {
xhtml.startElement("div", "class", "annotationSubject");
xhtml.characters(subject);
xhtml.endElement("div");
}
if (contents != null) {
xhtml.startElement("div", "class", "annotationContents");
xhtml.characters(contents);
xhtml.endElement("div");
}
xhtml.endElement("div");
}
}
}
/**
 * Handles the on-instantiate stream of a 3D annotation, if one is present.
 * <p>
 * Looks up the {@code COSName.THREE_DD} dictionary of the annotation and, inside
 * it, the {@code ON_INSTANTIATE} stream. When both exist, the stream is offered
 * to the embedded document extractor with JavaScript metadata and an empty marker
 * {@code div} (class "javascript", type "3dd_on_instantiate") is written.
 *
 * @param annotation annotation whose COS dictionary is inspected
 * @throws IOException  if the stream cannot be opened or embedded parsing fails
 * @throws SAXException if writing to the XHTML handler fails
 */
private void extractOnInstantiate(PDAnnotation annotation) throws IOException, SAXException {
    COSDictionary dict3d = annotation.getCOSObject().getCOSDictionary(COSName.THREE_DD);
    COSStream script = (dict3d == null) ? null : dict3d.getCOSStream(ON_INSTANTIATE);
    if (script == null) {
        // either there is no 3D dictionary or it carries no on-instantiate stream
        return;
    }

    Metadata scriptMetadata = getJavascriptMetadata("3DD_ON_INSTANTIATE", null, null);
    if (embeddedDocumentExtractor.shouldParseEmbedded(scriptMetadata)) {
        try (TikaInputStream tis = TikaInputStream.get(script.createInputStream())) {
            embeddedDocumentExtractor.parseEmbedded(tis, xhtml, scriptMetadata, context, true);
        }
    }

    // the marker element is written whether or not the stream was parsed
    AttributesImpl markerAttrs = new AttributesImpl();
    addNonNullAttribute("class", "javascript", markerAttrs);
    addNonNullAttribute("type", "3dd_on_instantiate", markerAttrs);
    xhtml.startElement("div", markerAttrs);
    xhtml.endElement("div");
}
/**
 * Collects the dictionaries of type {@code COSName.FILESPEC} found under the
 * given dictionary, delegating the search to {@code PDFDOMUtil.findType} with
 * {@code MAX_RECURSION_DEPTH} as the depth limit.
 *
 * @param cosDict dictionary to search
 * @return the file specification dictionaries reported by {@code PDFDOMUtil.findType}
 */
private List<COSDictionary> findFileSpecs(COSDictionary cosDict) {
    Set<COSName> wantedTypes = new HashSet<>(Collections.singleton(COSName.FILESPEC));
    return PDFDOMUtil.findType(cosDict, wantedTypes, MAX_RECURSION_DEPTH);
}
/**
 * Walks an embedded-files name tree and appends every (name, file specification)
 * pair it finds to {@code embeddedFileNames}.
 * <p>
 * A failure to read the names of a single node is ignored so that the kids of
 * that node are still visited.
 *
 * @param efTree            node to read names and kids from
 * @param embeddedFileNames accumulator that receives one tuple per named entry
 * @param depth             current recursion depth
 * @throws IOException if {@code depth} exceeds {@code MAX_RECURSION_DEPTH}
 */
private void extractFilesfromEFTree(PDNameTreeNode efTree,
                                    List<NameSpecTuple> embeddedFileNames, int depth)
        throws IOException {
    if (depth > MAX_RECURSION_DEPTH) {
        throw new IOException("Hit max recursion depth");
    }

    Map<String, PDComplexFileSpecification> namedSpecs;
    try {
        namedSpecs = efTree.getNames();
    } catch (IOException ignored) {
        // best effort: skip this node's names but keep descending
        namedSpecs = null;
    }
    if (namedSpecs != null) {
        namedSpecs.forEach(
                (name, spec) -> embeddedFileNames.add(new NameSpecTuple(name, spec)));
    }

    List<PDNameTreeNode<PDComplexFileSpecification>> children = efTree.getKids();
    if (children != null) {
        for (PDNameTreeNode<PDComplexFileSpecification> child : children) {
            extractFilesfromEFTree(child, embeddedFileNames, depth + 1);
        }
    }
}
/**
 * Processes the action and the additional actions attached to a widget
 * annotation. A {@code null} widget is ignored.
 *
 * @param widget widget annotation to inspect; may be {@code null}
 */
private void handleWidget(PDAnnotationWidget widget)
        throws TikaException, SAXException, IOException {
    if (widget != null) {
        handleDestinationOrAction(widget.getAction(), ActionTrigger.ANNOTATION_WIDGET);
        handlePDAnnotationAdditionalActions(widget.getActions());
    }
}
/**
 * Passes each additional action of an annotation to
 * {@code handleDestinationOrAction}, tagging it with the matching
 * {@link ActionTrigger}. A {@code null} argument is ignored.
 *
 * @param annotationActions additional-actions wrapper; may be {@code null}
 */
private void handlePDAnnotationAdditionalActions(PDAnnotationAdditionalActions annotationActions)
        throws TikaException, IOException, SAXException {
    if (annotationActions == null) {
        return;
    }
    final PDAnnotationAdditionalActions aa = annotationActions;
    // focus
    handleDestinationOrAction(aa.getBl(), ActionTrigger.ANNOTATION_LOSE_INPUT_FOCUS);
    // mouse button pressed / cursor enters
    handleDestinationOrAction(aa.getD(), ActionTrigger.ANNOTATION_MOUSE_CLICK);
    handleDestinationOrAction(aa.getE(), ActionTrigger.ANNOTATION_CURSOR_ENTERS);
    handleDestinationOrAction(aa.getFo(), ActionTrigger.ANNOTATION_RECEIVES_FOCUS);
    // page lifecycle
    handleDestinationOrAction(aa.getPC(), ActionTrigger.ANNOTATION_PAGE_CLOSED);
    handleDestinationOrAction(aa.getPI(), ActionTrigger.ANNOTATION_PAGE_NO_LONGER_VISIBLE);
    handleDestinationOrAction(aa.getPO(), ActionTrigger.ANNOTATION_PAGE_OPENED);
    handleDestinationOrAction(aa.getPV(), ActionTrigger.ANNOTATION_PAGE_VISIBLE);
    // mouse button released / cursor exits
    handleDestinationOrAction(aa.getU(), ActionTrigger.ANNOTATION_MOUSE_RELEASED);
    handleDestinationOrAction(aa.getX(), ActionTrigger.ANNOTATION_CURSOR_EXIT);
}
/**
 * Opens the XHTML document, extracts JavaScript from the names tree and
 * processes the document's open action.
 * <p>
 * An {@link IOException} raised while reading or handling the open action is
 * deliberately ignored (see PDFBOX-3773).
 *
 * @param pdf the document being processed
 * @throws IOException wrapping any {@code TikaException} or {@code SAXException}
 */
@Override
protected void startDocument(PDDocument pdf) throws IOException {
    try {
        xhtml.startDocument();
        extractJavaScriptFromNameTreeNode(pdf);
        try {
            PDDestinationOrAction openAction = pdf.getDocumentCatalog().getOpenAction();
            handleDestinationOrAction(openAction, ActionTrigger.DOCUMENT_OPEN);
        } catch (IOException ignored) {
            //See PDFBOX-3773
            //nothing useful to report here, so carry on
        }
    } catch (TikaException | SAXException cause) {
        throw new IOException("Unable to start a document", cause);
    }
}
/**
 * Extracts the JavaScript actions stored in the document-level JavaScript
 * names tree, including those of its kids.
 * <p>
 * Does nothing when action extraction is disabled or when the catalog, its
 * names dictionary or the JavaScript tree is missing. An {@link IOException}
 * while reading the tree is ignored.
 *
 * @param pdf the document being processed
 * @throws SAXException if writing to the XHTML handler fails
 */
private void extractJavaScriptFromNameTreeNode(PDDocument pdf) throws SAXException {
    if (!config.isExtractActions()) {
        return;
    }
    PDDocumentCatalog catalog = pdf.getDocumentCatalog();
    if (catalog == null || catalog.getNames() == null) {
        return;
    }
    PDJavascriptNameTreeNode jsRoot = catalog.getNames().getJavaScript();
    if (jsRoot == null) {
        return;
    }
    try {
        addJavaScript(jsRoot.getNames());
        // the root is depth 0, so its kids start at depth 1
        processJavascriptNameTreeNodeKids(jsRoot.getKids(), 1);
    } catch (IOException ignored) {
        //swallow
    }
}
/**
 * Processes every JavaScript action in a names-tree map.
 * <p>
 * Entries whose action object is {@code null} or whose script text is blank
 * are skipped individually; the remaining entries are still processed. Each
 * processed entry is reported with trigger "namesTree" and the map key as the
 * script name.
 *
 * @param pdActionJavaScriptMap name-to-action map of one tree node; may be {@code null}
 * @throws IOException  if processing a script fails
 * @throws SAXException if writing to the XHTML handler fails
 */
private void addJavaScript(Map<String, PDActionJavaScript> pdActionJavaScriptMap)
        throws IOException, SAXException {
    if (pdActionJavaScriptMap == null) {
        return;
    }
    for (Map.Entry<String, PDActionJavaScript> e : pdActionJavaScriptMap.entrySet()) {
        PDActionJavaScript jsAction = e.getValue();
        // skip only this entry: one empty script must not hide the scripts after it
        if (jsAction == null || StringUtils.isBlank(jsAction.getAction())) {
            continue;
        }
        AttributesImpl attributes = new AttributesImpl();
        addNonNullAttribute("trigger", "namesTree", attributes);
        addNonNullAttribute("type", jsAction.getClass().getSimpleName(), attributes);
        processJavaScriptAction("NAMES_TREE", e.getKey(), jsAction, attributes);
    }
}
/**
 * Recursively processes the kids of a JavaScript names-tree node: the scripts
 * of each kid are added, then its own kids are visited one level deeper.
 * <p>
 * Returns silently when {@code kids} is {@code null} or when {@code depth}
 * exceeds {@code MAX_RECURSION_DEPTH}.
 *
 * @param kids  child nodes to process; may be {@code null}
 * @param depth recursion depth of these kids
 */
private void processJavascriptNameTreeNodeKids(List<PDNameTreeNode<PDActionJavaScript>> kids,
                                               int depth) throws IOException, SAXException {
    //hitting max recursion is not reported for now...maybe throw Exception?
    if (kids == null || depth > MAX_RECURSION_DEPTH) {
        return;
    }
    final int childDepth = depth + 1;
    for (PDNameTreeNode<PDActionJavaScript> kid : kids) {
        addJavaScript(kid.getNames());
        processJavascriptNameTreeNodeKids(kid.getKids(), childDepth);
    }
}
/**
 * Records and emits a single destination or action.
 * <p>
 * When action extraction is enabled and {@code action} is non-null, the trigger
 * name is added to {@code triggers} and, for {@code PDAction} instances with a
 * non-blank type, the type is added to {@code actionTypes}. The action is then
 * dispatched by concrete class:
 * <ul>
 * <li>import-data, launch and remote go-to actions hand their file
 * specification to {@code processDocOnAction};</li>
 * <li>JavaScript actions go to {@code processJavaScriptAction};</li>
 * <li>anything else is written as an empty {@code div} carrying the
 * class/type/trigger attributes.</li>
 * </ul>
 *
 * @param action        destination or action to handle; may be {@code null}
 * @param actionTrigger the event this action is attached to
 */
private void handleDestinationOrAction(PDDestinationOrAction action,
                                       ActionTrigger actionTrigger)
        throws IOException, SAXException, TikaException {
    if (action == null || !config.isExtractActions()) {
        return;
    }
    final String triggerName = actionTrigger.name();
    triggers.add(triggerName);

    final boolean isAction = action instanceof PDAction;
    if (isAction) {
        String actionType = ((PDAction) action).getType();
        if (!StringUtils.isBlank(actionType)) {
            actionTypes.add(actionType);
        }
    }

    AttributesImpl attributes = new AttributesImpl();
    addNonNullAttribute("class", isAction ? "action" : "destination", attributes);
    addNonNullAttribute("type", action.getClass().getSimpleName(), attributes);
    addNonNullAttribute("trigger", triggerName, attributes);

    if (action instanceof PDActionImportData) {
        processDocOnAction("", "", ((PDActionImportData) action).getFile(), attributes);
        return;
    }
    if (action instanceof PDActionLaunch) {
        PDActionLaunch launch = (PDActionLaunch) action;
        addNonNullAttribute("id", launch.getF(), attributes);
        addNonNullAttribute("defaultDirectory", launch.getD(), attributes);
        addNonNullAttribute("operation", launch.getO(), attributes);
        addNonNullAttribute("parameters", launch.getP(), attributes);
        processDocOnAction(launch.getF(), "", launch.getFile(), attributes);
        return;
    }
    if (action instanceof PDActionRemoteGoTo) {
        processDocOnAction("", "", ((PDActionRemoteGoTo) action).getFile(), attributes);
        return;
    }
    if (action instanceof PDActionJavaScript) {
        processJavaScriptAction(triggerName, null, (PDActionJavaScript) action, attributes);
        return;
    }
    // Submit-form actions are not treated as file references: their targets are
    // typically urls, not actual file specifications. They fall through to the
    // generic marker element below.
    xhtml.startElement("div", attributes);
    xhtml.endElement("div");
}
/**
 * Hands the script of a JavaScript action to the embedded document extractor
 * (encoded as UTF-8) and writes an empty marker {@code div} for it.
 * <p>
 * A {@code null} script is treated as the empty string. The marker is written
 * whether or not the extractor chose to parse the script.
 *
 * @param trigger      trigger name stored in the script's metadata
 * @param jsActionName name of the script, or {@code null} if it has none
 * @param jsAction     the action holding the script
 * @param attrs        attributes for the marker element; "class", "type" and
 *                     "subtype" are added to it here
 */
private void processJavaScriptAction(String trigger, String jsActionName,
                                     PDActionJavaScript jsAction, AttributesImpl attrs)
        throws IOException, SAXException {
    final Charset scriptCharset = StandardCharsets.UTF_8;
    Metadata scriptMetadata = getJavascriptMetadata(trigger, jsActionName, scriptCharset);

    String script = jsAction.getAction();
    if (script == null) {
        script = "";
    }
    if (embeddedDocumentExtractor.shouldParseEmbedded(scriptMetadata)) {
        try (TikaInputStream tis = TikaInputStream.get(script.getBytes(scriptCharset))) {
            embeddedDocumentExtractor.parseEmbedded(tis, xhtml, scriptMetadata, context, true);
        }
    }

    addNonNullAttribute("class", "javascript", attrs);
    addNonNullAttribute("type", jsAction.getType(), attrs);
    addNonNullAttribute("subtype", jsAction.getSubType(), attrs);
    xhtml.startElement("div", attrs);
    xhtml.endElement("div");
}
/**
 * Builds the metadata object that accompanies an extracted script.
 * <p>
 * The content type is always "application/javascript" and the embedded
 * resource type is always MACRO. The script name and the content encoding are
 * set only when supplied.
 *
 * @param trigger      value stored under {@code PDF.ACTION_TRIGGER}
 * @param jsActionName script name; skipped when blank
 * @param charset      encoding of the script bytes; skipped when {@code null}
 * @return a new metadata instance populated as described
 */
private Metadata getJavascriptMetadata(String trigger, String jsActionName, Charset charset) {
    final Metadata scriptMetadata = Metadata.newInstance(context);
    scriptMetadata.set(Metadata.CONTENT_TYPE, "application/javascript");
    scriptMetadata.set(PDF.ACTION_TRIGGER, trigger);
    scriptMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
            TikaCoreProperties.EmbeddedResourceType.MACRO.name());

    final boolean hasName = !StringUtils.isBlank(jsActionName);
    if (hasName) {
        scriptMetadata.set(PDF.JS_NAME, jsActionName);
    }
    if (charset != null) {
        scriptMetadata.set(Metadata.CONTENT_ENCODING, charset.toString());
    }
    return scriptMetadata;
}
/**
 * Finishes processing of a document.
 * <p>
 * In order: bookmark text (if enabled), embedded documents, incremental
 * updates, XMP/XFA, AcroForm content (if enabled), the catalog-level
 * additional actions, and the collected annotation/action metadata, followed by
 * closing the XHTML document. Font and unmapped-character statistics are
 * written to the metadata afterwards.
 * <p>
 * {@link IOException}s from the embedded-document, incremental-update and
 * AcroForm steps are routed through {@code handleCatchableIOE}.
 *
 * @param pdf the document being processed
 * @throws IOException wrapping any {@code TikaException} or {@code SAXException}
 */
@Override
protected void endDocument(PDDocument pdf) throws IOException {
    try {
        // Extract text for any bookmarks:
        if (config.isExtractBookmarksText()) {
            extractBookmarkText();
        }

        try {
            extractEmbeddedDocuments(pdf);
        } catch (IOException ioe) {
            handleCatchableIOE(ioe);
        }

        try {
            extractIncrementalUpdates();
        } catch (IOException ioe) {
            handleCatchableIOE(ioe);
        }

        extractXMPXFA(pdf, metadata, context);

        //extract acroform data at end of doc
        if (config.isExtractAcroFormContent()) {
            try {
                extractAcroForm(pdf);
            } catch (IOException ioe) {
                handleCatchableIOE(ioe);
            }
        }

        PDDocumentCatalogAdditionalActions docActions = pdf.getDocumentCatalog().getActions();
        handleDestinationOrAction(docActions.getDP(), ActionTrigger.AFTER_DOCUMENT_PRINT);
        handleDestinationOrAction(docActions.getDS(), ActionTrigger.AFTER_DOCUMENT_SAVE);
        handleDestinationOrAction(docActions.getWC(), ActionTrigger.BEFORE_DOCUMENT_CLOSE);
        handleDestinationOrAction(docActions.getWP(), ActionTrigger.BEFORE_DOCUMENT_PRINT);
        handleDestinationOrAction(docActions.getWS(), ActionTrigger.BEFORE_DOCUMENT_SAVE);

        //now record annotation types/subtypes and action triggers/types
        for (String type : annotationTypes) {
            metadata.add(PDF.ANNOTATION_TYPES, type);
        }
        for (String subtype : annotationSubtypes) {
            metadata.add(PDF.ANNOTATION_SUBTYPES, subtype);
        }
        for (String triggerName : triggers) {
            metadata.add(PDF.ACTION_TRIGGERS, triggerName);
        }
        for (String type : actionTypes) {
            metadata.add(PDF.ACTION_TYPES, type);
        }
        xhtml.endDocument();
    } catch (TikaException | SAXException cause) {
        throw new IOException("Unable to end a document", cause);
    }

    // iterating an empty collection adds nothing, so no size check is needed
    for (String name : fontNames) {
        metadata.add(Font.FONT_NAME, name);
    }

    metadata.set(PDF.TOTAL_UNMAPPED_UNICODE_CHARS, totalUnmappedUnicodeCharacters);
    if (totalCharacters > 0) {
        // the ratio is only defined when at least one glyph was seen
        metadata.set(PDF.OVERALL_PERCENTAGE_UNMAPPED_UNICODE_CHARS,
                (float) totalUnmappedUnicodeCharacters / (float) totalCharacters);
    }
    metadata.set(PDF.CONTAINS_DAMAGED_FONT, containsDamagedFont);
    metadata.set(PDF.CONTAINS_NON_EMBEDDED_FONT, containsNonEmbeddedFont);
    metadata.set(PDF.NUM_3D_ANNOTATIONS, num3DAnnotations);
}
/**
 * Parses earlier versions of the file that are described by the
 * {@code IncrementalUpdateRecord} found in the parse context.
 * <p>
 * Does nothing when incremental-update parsing is disabled or no record is
 * available. The last offset is never parsed because it corresponds to the
 * full PDF. Offsets whose {@code startxref} is 0 are skipped and do not
 * consume an update number. An {@link IOException} from a single update is
 * routed through {@code handleCatchableIOE}.
 */
private void extractIncrementalUpdates() throws SAXException, IOException {
    if (!config.isParseIncrementalUpdates()) {
        return;
    }
    IncrementalUpdateRecord updateRecord = context.get(IncrementalUpdateRecord.class);
    if (updateRecord == null) {
        //should log
        return;
    }

    int updateNumber = 0;
    int index = 0;
    //stop before the last xref (coz that's the full pdf)
    while (index < updateRecord.getOffsets().size() - 1
            && index < config.getMaxIncrementalUpdates()) {
        StartXRefOffset offset = updateRecord.getOffsets().get(index);
        index++;
        //a linearized dummy xref offset has startxref 0; only real ones are parsed
        //TODO figure out better way of managing this
        if (offset.getStartxref() != 0) {
            try {
                parseIncrementalUpdate(updateNumber, updateRecord.getPath(), offset);
                updateNumber++;
            } catch (IOException ioe) {
                handleCatchableIOE(ioe);
            }
        }
    }
}
/**
 * Parses one earlier version of the file as an embedded document.
 * <p>
 * The leading {@code xRefOffset.getEndEofOffset()} bytes of {@code path} are
 * copied to a temporary file. If the embedded document extractor accepts it,
 * the context is marked with {@code IsIncrementalUpdate.IS_INCREMENTAL_UPDATE}
 * and the temporary file is parsed. Temporary resources are always released.
 *
 * @param count      update number stored under {@code PDF.INCREMENTAL_UPDATE_NUMBER}
 * @param path       path of the complete PDF
 * @param xRefOffset offset record supplying where this version ends
 */
private void parseIncrementalUpdate(int count, Path path, StartXRefOffset xRefOffset)
        throws SAXException, IOException {
    final TemporaryResources tempResources = new TemporaryResources();
    try {
        final Path truncatedCopy = tempResources.createTempFile();
        try (InputStream source = Files.newInputStream(path);
                OutputStream sink =
                        Files.newOutputStream(truncatedCopy, StandardOpenOption.WRITE)) {
            IOUtils.copyLarge(source, sink, 0, xRefOffset.getEndEofOffset());
        }

        final Metadata versionMetadata = Metadata.newInstance(context);
        versionMetadata.set(PDF.INCREMENTAL_UPDATE_NUMBER, count);
        versionMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                TikaCoreProperties.EmbeddedResourceType.VERSION.toString());

        if (embeddedDocumentExtractor.shouldParseEmbedded(versionMetadata)) {
            try (TikaInputStream tis = TikaInputStream.get(truncatedCopy)) {
                context.set(IsIncrementalUpdate.class,
                        IsIncrementalUpdate.IS_INCREMENTAL_UPDATE);
                embeddedDocumentExtractor.parseEmbedded(tis, xhtml, versionMetadata, context,
                        false);
            }
        }
    } finally {
        tempResources.close();
    }
}
/**
 * Writes the document outline (bookmarks) to the XHTML output, starting at
 * the outline root with an empty "seen" set and an item count of 0. Does
 * nothing when the document has no outline.
 */
void extractBookmarkText() throws SAXException, IOException, TikaException {
    PDDocumentOutline outlineRoot = document.getDocumentCatalog().getDocumentOutline();
    if (outlineRoot == null) {
        return;
    }
    extractBookmarkText(outlineRoot, new HashSet<>(), 0);
}
/**
 * Writes the children of an outline node as a {@code ul} of {@code li} titles
 * and recurses into each child.
 * <p>
 * Nothing is written when {@code itemCount} already exceeds
 * {@code MAX_BOOKMARK_ITEMS}, when the node has no first child, or when that
 * child was visited before. Iteration over siblings stops at the first
 * already-seen item or once the running count exceeds the limit. The count is
 * local to each call: a nested call starts at the current count plus one and
 * does not report back how many items it consumed.
 *
 * @param bookmark  node whose children are written
 * @param seen      items already visited; used to break cycles
 * @param itemCount item count at which this level starts
 */
void extractBookmarkText(PDOutlineNode bookmark, Set<COSObjectable> seen, int itemCount)
        throws SAXException, IOException, TikaException {
    PDOutlineItem item = bookmark.getFirstChild();
    if (itemCount > MAX_BOOKMARK_ITEMS || item == null || seen.contains(item)) {
        return;
    }

    int running = itemCount;
    xhtml.startElement("ul");
    while (item != null && !seen.contains(item) && running <= MAX_BOOKMARK_ITEMS) {
        seen.add(item);

        xhtml.startElement("li");
        xhtml.characters(item.getTitle());
        xhtml.endElement("li");

        handleDestinationOrAction(item.getAction(), ActionTrigger.BOOKMARK);

        // descend into this item's own children before moving to its sibling
        extractBookmarkText(item, seen, running + 1);

        item = item.getNextSibling();
        running++;
    }
    xhtml.endElement("ul");
}
/**
 * Extracts form content from the document.
 * <p>
 * If the form has an XFA resource, XFA extraction is attempted first and, on
 * success, the method returns. A failure to read the XFA bytes or an
 * {@link XMLStreamException} during XFA extraction is recorded in the metadata
 * and extraction falls back to the traditional AcroForm fields, which are
 * written as an ordered list inside a {@code div} of class "acroform".
 * <p>
 * This code derives from Ben Litchfield's
 * org.apache.pdfbox.examples.fdf.PrintFields.
 *
 * @param pdf the document being processed
 */
void extractAcroForm(PDDocument pdf) throws IOException, SAXException, TikaException {
    PDDocumentCatalog catalog = pdf.getDocumentCatalog();
    PDAcroForm form = (catalog == null) ? null : catalog.getAcroForm(null);
    if (form == null) {
        return;
    }

    //XFA takes precedence; the AcroForm path below is the fallback when XFA
    //is absent or could not be read/parsed
    PDXFAResource xfaResource = form.getXFA();
    if (xfaResource != null) {
        XFAExtractor xfaExtractor = new XFAExtractor();
        InputStream xfaStream = null;
        try {
            xfaStream = new BufferedInputStream(
                    UnsynchronizedByteArrayInputStream.builder()
                            .setByteArray(xfaResource.getBytes()).get());
        } catch (IOException ioe) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(ioe, metadata);
        }
        if (xfaStream != null) {
            try {
                xfaExtractor.extract(xfaStream, xhtml, metadata, context);
                //XFA succeeded, nothing more to do
                return;
            } catch (XMLStreamException xmlProblem) {
                //xml parse exception in xfa: fall through to the AcroForm
                EmbeddedDocumentUtil.recordException(xmlProblem, metadata);
            } finally {
                IOUtils.closeQuietly(xfaStream);
            }
        }
    }

    @SuppressWarnings("rawtypes") List fields = form.getFields();
    if (fields == null) {
        return;
    }
    @SuppressWarnings("rawtypes") ListIterator fieldIterator = fields.listIterator();
    if (fieldIterator == null) {
        return;
    }

    xhtml.startElement("div", "class", "acroform");
    xhtml.startElement("ol");
    while (fieldIterator.hasNext()) {
        Object candidate = fieldIterator.next();
        // instanceof is false for null, so no separate null check is needed
        if (candidate instanceof PDField) {
            processAcroField((PDField) candidate, 0);
        }
    }
    xhtml.endElement("ol");
    xhtml.endElement("div");
}
/**
 * Processes one form field: its additional actions, its widgets, its textual
 * representation and, for non-terminal fields, its children in a nested
 * {@code ol}.
 * <p>
 * Returns without doing anything once {@code currentRecursiveDepth} reaches
 * {@code MAX_RECURSION_DEPTH}.
 *
 * @param field                 field to process
 * @param currentRecursiveDepth nesting depth of {@code field}; top-level fields use 0
 */
private void processAcroField(PDField field, final int currentRecursiveDepth)
        throws SAXException, IOException, TikaException {
    if (currentRecursiveDepth >= MAX_RECURSION_DEPTH) {
        return;
    }

    PDFormFieldAdditionalActions fieldActions = field.getActions();
    if (fieldActions != null) {
        handleDestinationOrAction(fieldActions.getC(), ActionTrigger.FORM_FIELD_RECALCULATE);
        handleDestinationOrAction(fieldActions.getF(), ActionTrigger.FORM_FIELD_FORMATTED);
        handleDestinationOrAction(fieldActions.getK(), ActionTrigger.FORM_FIELD_KEYSTROKE);
        handleDestinationOrAction(fieldActions.getV(), ActionTrigger.FORM_FIELD_VALUE_CHANGE);
    }

    List<PDAnnotationWidget> widgets = field.getWidgets();
    if (widgets != null) {
        for (PDAnnotationWidget widget : widgets) {
            handleWidget(widget);
        }
    }

    addFieldString(field);

    if (!(field instanceof PDNonTerminalField)) {
        return;
    }
    final int childDepth = currentRecursiveDepth + 1;
    xhtml.startElement("ol");
    for (PDField child : ((PDNonTerminalField) field).getChildren()) {
        processAcroField(child, childDepth);
    }
    xhtml.endElement("ol");
}
/**
 * Writes a form field as an {@code li} element.
 * <p>
 * The element text is "partialName: value"; the alternate field name, if any,
 * becomes the "altName" attribute. The fully qualified name is not used.
 * Signature fields are delegated to {@code handleSignature} instead. A value
 * equal to the literal string "null" is treated as absent. Nothing is written
 * when there is neither an attribute nor any text.
 *
 * @param field field to write
 * @throws SAXException if writing to the XHTML handler fails
 */
private void addFieldString(PDField field) throws SAXException {
    AttributesImpl attrs = new AttributesImpl();
    String altName = field.getAlternateFieldName();
    if (altName != null) {
        attrs.addAttribute("", "altName", "altName", "CDATA", altName);
    }

    //signature fields have their own representation
    if (field instanceof PDSignatureField) {
        handleSignature(attrs, (PDSignatureField) field);
        return;
    }

    StringBuilder text = new StringBuilder();
    String partName = field.getPartialName();
    if (partName != null) {
        text.append(partName).append(": ");
    }
    String value = field.getValueAsString();
    if (value != null && !"null".equals(value)) {
        text.append(value);
    }

    boolean nothingToWrite = attrs.getLength() == 0 && text.length() == 0;
    if (nothingToWrite) {
        return;
    }
    xhtml.startElement("li", attrs);
    xhtml.characters(text.toString());
    xhtml.endElement("li");
}
/**
 * Writes the descriptive data of a signature field.
 * <p>
 * When the field holds a signature, {@code TikaCoreProperties.HAS_SIGNATURE}
 * is set to "true". The signature's name, contact info, location, reason and
 * sign date are then written as a nested {@code ol} (type "signaturedata")
 * inside an {@code li} carrying {@code parentAttributes}; entries that are
 * {@code null} or empty are omitted. If every entry is {@code null} or empty,
 * no element is written at all.
 *
 * @param parentAttributes attributes for the enclosing {@code li}
 * @param sigField         signature field to describe
 * @throws SAXException if writing to the XHTML handler fails
 */
private void handleSignature(AttributesImpl parentAttributes, PDSignatureField sigField)
        throws SAXException {
    PDSignature sig = sigField.getSignature();
    if (sig == null) {
        return;
    }
    // TreeMap keeps the output order stable (sorted by key)
    Map<String, String> vals = new TreeMap<>();
    vals.put("name", sig.getName());
    vals.put("contactInfo", sig.getContactInfo());
    vals.put("location", sig.getLocation());
    vals.put("reason", sig.getReason());

    Calendar cal = sig.getSignDate();
    if (cal != null) {
        dateFormat.setTimeZone(cal.getTimeZone());
        vals.put("date", dateFormat.format(cal.getTime()));
    }

    // a signature is present, so record that even if it carries no
    // descriptive data
    metadata.set(TikaCoreProperties.HAS_SIGNATURE, "true");

    //see if there is any data; the values must be inspected here, the keys
    //are fixed and never empty
    boolean hasData = false;
    for (String val : vals.values()) {
        if (val != null && !val.isEmpty()) {
            hasData = true;
            break;
        }
    }
    if (!hasData) {
        return;
    }

    xhtml.startElement("li", parentAttributes);

    AttributesImpl attrs = new AttributesImpl();
    attrs.addAttribute("", "type", "type", "CDATA", "signaturedata");
    xhtml.startElement("ol", attrs);
    for (Map.Entry<String, String> e : vals.entrySet()) {
        String val = e.getValue();
        if (val == null || val.isEmpty()) {
            continue;
        }
        attrs = new AttributesImpl();
        attrs.addAttribute("", "signdata", "signdata", "CDATA", e.getKey());
        xhtml.startElement("li", attrs);
        xhtml.characters(val);
        xhtml.endElement("li");
    }
    xhtml.endElement("ol");
    xhtml.endElement("li");
}
/**
 * We need to override this because we are overriding {@link #processPages(PDPageTree)},
 * which tracks the position in the document through {@code pageIndex}.
 *
 * @return {@code pageIndex + 1}
 */
@Override
public int getCurrentPageNo() {
    final int pageNumber = pageIndex + 1;
    return pageNumber;
}
/**
 * Processes the pages whose number lies between {@code getStartPage()} and
 * {@code getEndPage()}, both inclusive. {@code pageIndex} is advanced for
 * every page, including the ones that are skipped.
 * <p>
 * See TIKA-2845 for why we need to override this.
 *
 * @param pages page tree to iterate
 * @throws IOException if processing a page fails
 */
@Override
protected void processPages(PDPageTree pages) throws IOException {
    for (PDPage page : pages) {
        final int pageNo = getCurrentPageNo();
        final boolean inRange = pageNo >= getStartPage() && pageNo <= getEndPage();
        if (inRange) {
            processPage(page);
        }
        pageIndex++;
    }
}
/**
 * Not supported by this class.
 *
 * @param pdOutlineItem ignored
 * @throws UnsupportedOperationException always
 */
@Override
public void setStartBookmark(PDOutlineItem pdOutlineItem) {
    final String message =
            "We don't currently support this -- See PDFTextStripper's processPages() for how to implement this.";
    throw new UnsupportedOperationException(message);
}
/**
 * Not supported by this class.
 *
 * @param pdOutlineItem ignored
 * @throws UnsupportedOperationException always
 */
@Override
public void setEndBookmark(PDOutlineItem pdOutlineItem) {
    final String message =
            "We don't currently support this -- See PDFTextStripper's processPages() for how to implement this.";
    throw new UnsupportedOperationException(message);
}
/**
 * Delegates to the superclass and then updates the glyph statistics: the
 * per-page and total character counters, the per-page and total counters of
 * glyphs with no Unicode mapping, and the flags for damaged and non-embedded
 * fonts.
 *
 * @param textRenderingMatrix passed through to the superclass
 * @param font                font of the glyph
 * @param code                character code of the glyph within {@code font}
 * @param displacement        passed through to the superclass
 * @throws IOException if the superclass or the Unicode lookup fails
 */
@Override
protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code,
                         Vector displacement) throws IOException {
    super.showGlyph(textRenderingMatrix, font, code, displacement);

    final String mapped = font.toUnicode(code);
    final boolean unmapped = (mapped == null) || mapped.isEmpty();
    if (unmapped) {
        unmappedUnicodeCharsPerPage++;
        totalUnmappedUnicodeCharacters++;
    }
    // every glyph counts towards the totals, mapped or not
    totalCharsPerPage++;
    totalCharacters++;

    // the flags are sticky: once set they stay set for the document
    if (font.isDamaged()) {
        containsDamagedFont = true;
    }
    if (!font.isEmbedded()) {
        containsNonEmbeddedFont = true;
    }
}
/**
 * Creates a file specification from a COS object via
 * {@code PDFileSpecification.createFS}.
 *
 * @param cosBase COS object describing the file specification
 * @return the file specification, or {@code null} if creation threw an
 *         {@link IOException}
 */
private PDFileSpecification createFileSpecification(COSBase cosBase) {
    PDFileSpecification fileSpec;
    try {
        fileSpec = PDFileSpecification.createFS(cosBase);
    } catch (IOException ignored) {
        //swallow for now
        fileSpec = null;
    }
    return fileSpec;
}
/**
 * Immutable pair of a name-tree key and the file specification stored under it.
 */
private static class NameSpecTuple {

    /** Key under which the specification was found. */
    private final String name;

    /** File specification associated with {@link #name}. */
    private final PDComplexFileSpecification spec;

    /**
     * @param name key under which the specification was found
     * @param spec the file specification
     */
    public NameSpecTuple(String name, PDComplexFileSpecification spec) {
        this.spec = spec;
        this.name = name;
    }

    /**
     * @return the key passed to the constructor
     */
    public String getName() {
        return this.name;
    }

    /**
     * @return the file specification passed to the constructor
     */
    public PDComplexFileSpecification getSpec() {
        return this.spec;
    }
}
/**
 * Events to which a PDF action or destination can be attached. The constant
 * names are recorded verbatim in metadata and in XHTML "trigger" attributes.
 */
enum ActionTrigger {
    // document-level, after the event
    AFTER_DOCUMENT_PRINT,
    AFTER_DOCUMENT_SAVE,
    // annotation-level
    ANNOTATION_CURSOR_ENTERS,
    ANNOTATION_CURSOR_EXIT,
    ANNOTATION_LOSE_INPUT_FOCUS,
    ANNOTATION_MOUSE_CLICK,
    ANNOTATION_MOUSE_RELEASED,
    ANNOTATION_PAGE_CLOSED,
    ANNOTATION_PAGE_NO_LONGER_VISIBLE,
    ANNOTATION_PAGE_OPENED,
    ANNOTATION_PAGE_VISIBLE,
    ANNOTATION_RECEIVES_FOCUS,
    ANNOTATION_WIDGET,
    // document-level, before the event
    BEFORE_DOCUMENT_CLOSE,
    BEFORE_DOCUMENT_PRINT,
    BEFORE_DOCUMENT_SAVE,
    DOCUMENT_OPEN,
    // form fields
    FORM_FIELD,
    FORM_FIELD_FORMATTED,
    FORM_FIELD_KEYSTROKE,
    FORM_FIELD_RECALCULATE,
    FORM_FIELD_VALUE_CHANGE,
    // pages and bookmarks
    PAGE_CLOSE,
    PAGE_OPEN,
    BOOKMARK
}
}