// PopplerRenderer.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.renderer.pdf.poppler;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.tika.config.TikaComponent;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.TikaPagedText;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.renderer.PageBasedRenderResults;
import org.apache.tika.renderer.PageRangeRequest;
import org.apache.tika.renderer.RenderRequest;
import org.apache.tika.renderer.RenderResult;
import org.apache.tika.renderer.RenderResults;
import org.apache.tika.renderer.Renderer;
import org.apache.tika.renderer.RenderingTracker;
import org.apache.tika.utils.FileProcessResult;
import org.apache.tika.utils.ProcessUtils;
/**
 * Renderer that uses Poppler's {@code pdftoppm} command to convert PDF
 * pages to PNG images.
 * <p>
 * Poppler is pre-installed on most Linux distributions and is the
 * fastest widely-available PDF renderer. On macOS it can be installed
 * via {@code brew install poppler}; on Windows via MSYS2 or Chocolatey.
 * <p>
 * Each render request runs {@code pdftoppm} once, writing its PNG output
 * into a freshly created temporary directory. Every output file becomes a
 * {@link RenderResult}; the temporary directory is registered with the
 * {@link TemporaryResources} backing the returned {@link RenderResults}
 * for later cleanup.
 * <p>
 * Configuration key: {@code "poppler-renderer"}
 *
 * @since Apache Tika 4.0
 */
@TikaComponent(name = "poppler-renderer", spi = false)
public class PopplerRenderer implements Renderer {

    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.singleton(MediaType.application("pdf"));

    /**
     * File-name prefix handed to pdftoppm as the output root. Shared by the
     * command line and {@link #PAGE_FILE_PATTERN} so the two cannot drift.
     */
    private static final String OUTPUT_PREFIX = "tika-poppler";

    /**
     * Matches the Poppler output pattern: {@code prefix-01.png},
     * {@code prefix-02.png}, etc. Group 1 is the page number.
     */
    private static final Pattern PAGE_FILE_PATTERN =
            Pattern.compile(Pattern.quote(OUTPUT_PREFIX) + "-(\\d+)\\.png");

    private String pdftoppmPath = "pdftoppm";

    private int dpi = 300;

    private boolean gray = true;

    private int timeoutMs = 120000;

    /**
     * Maximum pixel dimension (in pixels) for the longest edge of a rendered
     * page image. Maps to pdftoppm's {@code -scale-to} flag.
     * <p>
     * If a PDF page would render larger than this value (in pixels) at the
     * configured DPI, pdftoppm scales the output image down so that its
     * longest edge equals {@code maxScaleTo} pixels, preserving the aspect
     * ratio. For example, with {@code maxScaleTo=4096}, a landscape page
     * that would normally render to 6000×4000 pixels is scaled to
     * 4096×2731 pixels instead.
     * <p>
     * If the rendered image is already smaller than {@code maxScaleTo}
     * on both edges, no scaling is applied — the image is not enlarged.
     * <p>
     * NOTE(review): the "not enlarged" behaviour is a property of pdftoppm,
     * not of this class, which only passes {@code -scale-to} alongside
     * {@code -r}. Confirm against the pdftoppm documentation that pages
     * smaller than the cap are left alone and that {@code -r} is still
     * honoured when {@code -scale-to} is present.
     * <p>
     * This is the primary defense against pathologically large PDF pages
     * (e.g., architectural drawings, maps, posters) that would otherwise
     * produce multi-gigabyte images and cause OOM.
     * <p>
     * Default is 4096 pixels. Set to {@code -1} to disable scaling
     * (not recommended).
     */
    private int maxScaleTo = 4096;

    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Renders every request against the PDF backing {@code tis}.
     * <p>
     * On success the caller owns the returned results. If any request
     * fails, everything rendered so far (including the temporary
     * directories) is released before the exception propagates, because
     * the caller never receives a handle it could close.
     *
     * @param tis          stream for the PDF; its backing file path is
     *                     passed to pdftoppm
     * @param metadata     metadata of the document being rendered
     * @param parseContext context used to look up or register the
     *                     {@link RenderingTracker} and to create per-page
     *                     metadata
     * @param requests     the requests to render; each must be a
     *                     {@link PageRangeRequest}
     * @return the accumulated render results
     * @throws IOException   if the temporary directory cannot be created
     *                       or listed, or the stream's path is unavailable
     * @throws TikaException if a request is not a {@link PageRangeRequest}
     *                       or pdftoppm exits with a non-zero status
     */
    @Override
    public RenderResults render(TikaInputStream tis, Metadata metadata,
                                ParseContext parseContext,
                                RenderRequest... requests)
            throws IOException, TikaException {
        TemporaryResources tmp = new TemporaryResources();
        PageBasedRenderResults results = new PageBasedRenderResults(tmp);
        try {
            Path path = tis.getPath();
            for (RenderRequest request : requests) {
                renderRequest(path, metadata, parseContext, request, results, tmp);
            }
        } catch (IOException | TikaException | RuntimeException e) {
            // The caller will never see 'results', so clean up here; keep the
            // original failure as the primary exception.
            try {
                results.close();
            } catch (IOException | RuntimeException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
        return results;
    }

    /**
     * Runs pdftoppm for a single request and adds one {@link RenderResult}
     * per produced page image to {@code results}, in ascending page order.
     */
    private void renderRequest(Path pdf, Metadata metadata,
                               ParseContext parseContext,
                               RenderRequest request,
                               PageBasedRenderResults results,
                               TemporaryResources tmp)
            throws TikaException, IOException {
        if (!(request instanceof PageRangeRequest)) {
            throw new TikaException(
                    "I regret that this renderer can only handle "
                            + "PageRangeRequests, not " + request.getClass());
        }
        PageRangeRequest rangeRequest = (PageRangeRequest) request;

        // Reuse the tracker already in the context so render ids stay unique
        // across renderers; install one if this is the first renderer to run.
        RenderingTracker tracker = parseContext.get(RenderingTracker.class);
        if (tracker == null) {
            tracker = new RenderingTracker();
            parseContext.set(RenderingTracker.class, tracker);
        }

        Path dir = Files.createTempDirectory("tika-render-");
        // Registered before any RenderResult is added.
        // NOTE(review): this relies on TemporaryResources closing resources
        // in reverse registration order so page files go first — confirm.
        tmp.addResource(new Closeable() {
            @Override
            public void close() throws IOException {
                deleteDirectory(dir);
            }
        });

        String[] args = createCommandLine(pdf, dir, rangeRequest);
        ProcessBuilder builder = new ProcessBuilder();
        builder.command(args);
        // NOTE(review): 10 and 1000 are presumably the stdout/stderr capture
        // limits — confirm against ProcessUtils.execute.
        FileProcessResult result = ProcessUtils.execute(
                builder, timeoutMs, 10, 1000);
        if (result.getExitValue() != 0) {
            throw new TikaException(
                    "pdftoppm failed (exit " + result.getExitValue()
                            + "): " + result.getStderr());
        }

        File[] files = dir.toFile().listFiles();
        if (files == null) {
            // listFiles() returns null on an I/O error; do not report that as
            // "zero pages rendered".
            throw new IOException(
                    "Could not list pdftoppm output directory: " + dir);
        }

        List<File> pageFiles = new ArrayList<>();
        for (File candidate : files) {
            if (pageNumberOf(candidate.getName()) >= 0) {
                pageFiles.add(candidate);
            }
        }
        // listFiles() order is unspecified; sort so render ids are handed out
        // in page order and results are deterministic.
        pageFiles.sort((a, b) -> Integer.compare(
                pageNumberOf(a.getName()), pageNumberOf(b.getName())));

        for (File pageFile : pageFiles) {
            Metadata renderMetadata = Metadata.newInstance(parseContext);
            renderMetadata.set(TikaPagedText.PAGE_NUMBER,
                    pageNumberOf(pageFile.getName()));
            renderMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                    TikaCoreProperties.EmbeddedResourceType.RENDERING
                            .name());
            results.add(new RenderResult(
                    RenderResult.STATUS.SUCCESS,
                    tracker.getNextId(),
                    pageFile.toPath(),
                    renderMetadata));
        }
    }

    /**
     * Extracts the page number from a pdftoppm output file name.
     *
     * @return the page number, or {@code -1} if the whole name does not
     *         match {@link #PAGE_FILE_PATTERN}
     */
    private static int pageNumberOf(String fileName) {
        Matcher m = PAGE_FILE_PATTERN.matcher(fileName);
        // matches(), not find(): the entire name must be a page file name.
        return m.matches() ? Integer.parseInt(m.group(1)) : -1;
    }

    /**
     * Deletes {@code dir} and any files still directly inside it.
     * Leftovers exist when pdftoppm wrote output that was never wrapped in
     * a {@link RenderResult}, e.g. after a failed or partial run.
     */
    private static void deleteDirectory(Path dir) throws IOException {
        File[] leftovers = dir.toFile().listFiles();
        if (leftovers != null) {
            for (File leftover : leftovers) {
                Files.deleteIfExists(leftover.toPath());
            }
        }
        Files.deleteIfExists(dir);
    }

    /**
     * Builds the pdftoppm argument vector for one request.
     *
     * @param pdf     the PDF to render
     * @param dir     directory that receives the output images
     * @param request page range; {@link PageRangeRequest#RENDER_ALL} omits
     *                the {@code -f}/{@code -l} flags
     * @return the executable followed by its arguments
     */
    String[] createCommandLine(Path pdf, Path dir,
                               PageRangeRequest request) {
        List<String> args = new ArrayList<>();
        args.add(pdftoppmPath);
        // Output format
        args.add("-png");
        // Resolution
        args.add("-r");
        args.add(String.valueOf(dpi));
        // Scale cap — prevents OOM on huge pages
        if (maxScaleTo > 0) {
            args.add("-scale-to");
            args.add(String.valueOf(maxScaleTo));
        }
        // Colorspace
        if (gray) {
            args.add("-gray");
        }
        // Page range
        if (request != PageRangeRequest.RENDER_ALL) {
            args.add("-f");
            args.add(String.valueOf(request.getFrom()));
            args.add("-l");
            args.add(String.valueOf(request.getTo()));
        }
        // Input PDF
        args.add(ProcessUtils.escapeCommandLine(
                pdf.toAbsolutePath().toString()));
        // Output prefix (pdftoppm appends -NN.png); resolve() uses the
        // platform separator instead of a hard-coded "/".
        args.add(ProcessUtils.escapeCommandLine(
                dir.toAbsolutePath().resolve(OUTPUT_PREFIX).toString()));
        return args.toArray(new String[0]);
    }

    // ---- config getters/setters -------------------------------------------

    public String getPdftoppmPath() {
        return pdftoppmPath;
    }

    /**
     * Set the path to the {@code pdftoppm} executable. Defaults to
     * {@code "pdftoppm"} (assumes it is on the system path).
     *
     * @throws IllegalArgumentException if the path is null or blank
     */
    public void setPdftoppmPath(String pdftoppmPath) {
        if (pdftoppmPath == null || pdftoppmPath.trim().isEmpty()) {
            throw new IllegalArgumentException(
                    "pdftoppmPath must not be null or blank");
        }
        this.pdftoppmPath = pdftoppmPath;
    }

    public int getDpi() {
        return dpi;
    }

    /**
     * Set the rendering resolution in DPI. Defaults to 300.
     *
     * @throws IllegalArgumentException if {@code dpi} is less than 1
     */
    public void setDpi(int dpi) {
        if (dpi < 1) {
            throw new IllegalArgumentException(
                    "dpi must be at least 1, got: " + dpi);
        }
        this.dpi = dpi;
    }

    public boolean isGray() {
        return gray;
    }

    /**
     * If true (the default), render in grayscale. Set to false for
     * full-color rendering.
     */
    public void setGray(boolean gray) {
        this.gray = gray;
    }

    public int getTimeoutMs() {
        return timeoutMs;
    }

    /**
     * Set the timeout in milliseconds for the pdftoppm process.
     * Defaults to 120000 (2 minutes).
     *
     * @throws IllegalArgumentException if {@code timeoutMs} is less than 1
     */
    public void setTimeoutMs(int timeoutMs) {
        if (timeoutMs < 1) {
            throw new IllegalArgumentException(
                    "timeoutMs must be at least 1, got: " + timeoutMs);
        }
        this.timeoutMs = timeoutMs;
    }

    public int getMaxScaleTo() {
        return maxScaleTo;
    }

    /**
     * Set the maximum pixel dimension (in pixels) for the longest edge
     * of rendered page images. Maps to pdftoppm's {@code -scale-to} flag.
     * Pages that would render smaller than this are not enlarged
     * (NOTE(review): a pdftoppm property — confirm against its docs).
     * <p>
     * Default is 4096 pixels. Set to {@code -1} to disable (not recommended).
     *
     * @throws IllegalArgumentException if the value is neither {@code -1}
     *                                  nor at least 1
     */
    public void setMaxScaleTo(int maxScaleTo) {
        if (maxScaleTo < 1 && maxScaleTo != -1) {
            throw new IllegalArgumentException(
                    "maxScaleTo must be -1 (disabled) or at least 1, got: "
                            + maxScaleTo);
        }
        this.maxScaleTo = maxScaleTo;
    }
}