PDFBoxRenderer.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.renderer.pdf.pdfbox;

import java.awt.image.BufferedImage;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Collections;
import java.util.Set;

import org.apache.pdfbox.Loader;
import org.apache.pdfbox.io.RandomAccessReadBuffer;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.tools.imageio.ImageIOUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.tika.config.TikaComponent;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.Rendering;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.TikaPagedText;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.renderer.PageBasedRenderResults;
import org.apache.tika.renderer.PageRangeRequest;
import org.apache.tika.renderer.RenderRequest;
import org.apache.tika.renderer.RenderResult;
import org.apache.tika.renderer.RenderResults;
import org.apache.tika.renderer.RenderingTracker;

@TikaComponent(name = "pdfbox-renderer")
public class PDFBoxRenderer implements PDDocumentRenderer {

    Set<MediaType> SUPPORTED_TYPES = Collections.singleton(PDFParser.MEDIA_TYPE);

    protected static final Logger LOG = LoggerFactory.getLogger(PDFBoxRenderer.class);

    /**
     * This is the amount of time it takes for PDFBox to render the page
     * to a BufferedImage
     */
    public static Property PDFBOX_RENDERING_TIME_MS =
            Property.externalReal(Rendering.RENDERING_PREFIX + "pdfbox-rendering-ms");

    /**
     * This is the amount of time it takes for PDFBox/java to write the image after
     * it has been rendered into a BufferedImage.  Some formats take much longer
     * to encode than others.
     */
    public static Property PDFBOX_IMAGE_WRITING_TIME_MS =
            Property.externalReal(Rendering.RENDERING_PREFIX + "pdfbox-image-writing-ms");

    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    private int defaultDPI = 300;
    private ImageType defaultImageType = ImageType.GRAY;
    private String defaultImageFormatName = "png";


    @Override
    public RenderResults render(TikaInputStream tis, Metadata metadata, ParseContext parseContext,
                                RenderRequest... requests) throws IOException, TikaException {


        PDDocument pdDocument;
        boolean mustClose = false;
        if (tis.getOpenContainer() != null) {
            pdDocument = (PDDocument) tis.getOpenContainer();
        } else {
            pdDocument = Loader.loadPDF(new RandomAccessReadBuffer(tis));
            mustClose = true;
        }
        PageBasedRenderResults results = new PageBasedRenderResults(new TemporaryResources());
        try {
            for (RenderRequest renderRequest : requests) {
                processRequest(renderRequest, pdDocument, metadata, parseContext, results);
            }
        } finally {
            if (mustClose) {
                pdDocument.close();
            }
        }
        return results;
    }

    private void processRequest(RenderRequest renderRequest, PDDocument pdDocument,
                                Metadata metadata, ParseContext parseContext,
                                PageBasedRenderResults results) {
        if (renderRequest == PageRangeRequest.RENDER_ALL || renderRequest.equals(PageRangeRequest.RENDER_ALL)) {
            renderRange(pdDocument, 1, pdDocument.getNumberOfPages(),
                    metadata, parseContext, results);
        } else if (renderRequest instanceof PageRangeRequest) {
            int start = ((PageRangeRequest)renderRequest).getFrom();
            int toInclusive = ((PageRangeRequest)renderRequest).getTo();
            renderRange(pdDocument, start, toInclusive, metadata, parseContext, results);
        }
    }

    private void renderRange(PDDocument pdDocument, int start, int endInclusive, Metadata metadata,
                             ParseContext parseContext, PageBasedRenderResults results) {
        PDFRenderer renderer = new PDFRenderer(pdDocument);
        RenderingTracker tracker = parseContext.get(RenderingTracker.class);
        if (tracker == null) {
            tracker = new RenderingTracker();
            parseContext.set(RenderingTracker.class, tracker);
        }
        for (int i = start; i <= endInclusive; i++) {
            int id = tracker.getNextId();
            Metadata m = Metadata.newInstance(parseContext);
            m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                    TikaCoreProperties.EmbeddedResourceType.RENDERING.name());
            try {
                m.set(TikaPagedText.PAGE_NUMBER, i);
                m.set(TikaPagedText.PAGE_ROTATION, (double)pdDocument.getPage(i - 1).getRotation());
                results.add(renderPage(renderer, id, i, m, parseContext));
            } catch (IOException e) {
                EmbeddedDocumentUtil.recordException(e, m);
                results.add(new RenderResult(RenderResult.STATUS.EXCEPTION, id, null, m));
            }
        }
    }

    protected RenderResult renderPage(PDFRenderer renderer, int id, int pageNumber,
                                      Metadata metadata, ParseContext parseContext)
            throws IOException {

        Path tmpFile = Files.createTempFile("tika-pdfbox-rendering-",
                "-" + id + "-" + pageNumber + "." + getImageFormatName(parseContext));
        try {
            long start = System.currentTimeMillis();
            //TODO: parameterize whether or not to un-rotate page?
            BufferedImage image = renderer.renderImageWithDPI(
                    pageNumber - 1,
                    getDPI(parseContext),
                    getImageType(parseContext));
            long renderingElapsed = System.currentTimeMillis() - start;
            metadata.set(PDFBOX_RENDERING_TIME_MS, renderingElapsed);
            start = System.currentTimeMillis();
            try (OutputStream os = Files.newOutputStream(tmpFile)) {
                ImageIOUtil.writeImage(image, getImageFormatName(parseContext), os, getDPI(parseContext));
            }
            long elapsedWrite = System.currentTimeMillis() - start;
            metadata.set(PDFBOX_IMAGE_WRITING_TIME_MS, elapsedWrite);
            metadata.set(Rendering.RENDERED_MS, renderingElapsed + elapsedWrite);
        } catch (SecurityException e) {
            //throw SecurityExceptions immediately
            throw e;
        } catch (Exception e) {
            try {
                Files.delete(tmpFile);
            } catch (IOException ex) {
                LOG.warn("couldn't delete " + tmpFile, ex);
            }
            throw new IOException(e);
        }
        return new RenderResult(RenderResult.STATUS.SUCCESS, id, tmpFile, metadata);
    }

    public void setDPI(int dpi) {
        this.defaultDPI = dpi;
    }

    public void setImageType(ImageType imageType) {
        this.defaultImageType = imageType;
    }

    public void setImageFormatName(String imageFormatName) {
        this.defaultImageFormatName = imageFormatName;
    }

    protected int getDPI(ParseContext parseContext) {
        PDFParserConfig pdfParserConfig = parseContext.get(PDFParserConfig.class);
        if (pdfParserConfig == null) {
            return defaultDPI;
        }
        return pdfParserConfig.getOcrDPI();
    }

    protected ImageType getImageType(ParseContext parseContext) {
        PDFParserConfig pdfParserConfig = parseContext.get(PDFParserConfig.class);
        if (pdfParserConfig == null) {
            return defaultImageType;
        }
        return pdfParserConfig.getOcrImageType().getPdfBoxImageType();
    }

    protected String getImageFormatName(ParseContext parseContext) {
        PDFParserConfig pdfParserConfig = parseContext.get(PDFParserConfig.class);
        if (pdfParserConfig == null) {
            return defaultImageFormatName;
        }
        return pdfParserConfig.getOcrImageFormat().getFormatName();
    }
}