ExtractImages.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.tools;
import java.awt.geom.Point2D;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.Callable;
import org.apache.commons.io.FilenameUtils;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import org.apache.pdfbox.pdmodel.graphics.color.PDPattern;
import org.apache.pdfbox.pdmodel.graphics.form.PDTransparencyGroup;
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.pdmodel.graphics.pattern.PDAbstractPattern;
import org.apache.pdfbox.pdmodel.graphics.pattern.PDTilingPattern;
import org.apache.pdfbox.pdmodel.graphics.state.PDExtendedGraphicsState;
import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState;
import org.apache.pdfbox.pdmodel.graphics.state.PDSoftMask;
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
import org.apache.pdfbox.tools.imageio.ImageIOUtil;
import org.apache.pdfbox.util.Matrix;
import org.apache.pdfbox.util.Vector;
import picocli.CommandLine;
import picocli.CommandLine.Command;
import picocli.CommandLine.Option;
/**
* Extracts the images from a PDF file.
*
* @author Ben Litchfield
*/
@Command(name = "extractimages", header = "Extracts the images from a PDF document", versionProvider = Version.class, mixinStandardHelpOptions = true)
public final class ExtractImages implements Callable<Integer>
{
// Expected for CLI app to write to System.out/System.err
@SuppressWarnings("squid:S106")
private final PrintStream SYSOUT;
@SuppressWarnings("squid:S106")
private final PrintStream SYSERR;
private static final List<String> JPEG = Arrays.asList(
COSName.DCT_DECODE.getName(),
COSName.DCT_DECODE_ABBREVIATION.getName());
@Option(names = "-password", description = "the password for the PDF or certificate in keystore.", arity = "0..1", interactive = true)
private String password;
@Option(names = "-prefix", description = "the image prefix (default to pdf name).")
private String prefix;
@Option(names = "-useDirectJPEG", description = "Forces the direct extraction of JPEG/JPX images " +
"regardless of colorspace or masking.")
private boolean useDirectJPEG;
@Option(names = "-noColorConvert", description = "Images are extracted with their " +
"original colorspace if possible.")
private boolean noColorConvert;
@Option(names = {"-i", "--input"}, description = "the PDF file", required = true)
private File infile;
private final Set<COSStream> seen = new HashSet<>();
private int imageCounter = 1;
/**
* Constructor.
*/
public ExtractImages()
{
SYSOUT = System.out;
SYSERR = System.err;
}
/**
* Entry point for the application.
*
* @param args The command-line arguments.
*/
public static void main(String[] args)
{
// suppress the Dock icon on OS X
System.setProperty("apple.awt.UIElement", "true");
int exitCode = new CommandLine(new ExtractImages()).execute(args);
System.exit(exitCode);
}
public Integer call()
{
try (PDDocument document = Loader.loadPDF(infile, password))
{
AccessPermission ap = document.getCurrentAccessPermission();
if (!ap.canExtractContent())
{
SYSERR.println("You do not have permission to extract images");
return 1;
}
if (prefix == null)
{
prefix = FilenameUtils.removeExtension(infile.getAbsolutePath());
}
for (PDPage page : document.getPages())
{
ImageGraphicsEngine extractor = new ImageGraphicsEngine(page);
extractor.run();
}
}
catch (IOException ioe)
{
SYSERR.println("Error extracting images [" + ioe.getClass().getSimpleName() + "]: " + ioe.getMessage());
return 4;
}
return 0;
}
private class ImageGraphicsEngine extends PDFGraphicsStreamEngine
{
protected ImageGraphicsEngine(PDPage page)
{
super(page);
}
public void run() throws IOException
{
PDPage page = getPage();
processPage(page);
PDResources res = page.getResources();
if (res == null)
{
return;
}
for (COSName name : res.getExtGStateNames())
{
PDExtendedGraphicsState extGState = res.getExtGState(name);
if (extGState == null)
{
// can happen if key exists but no value
continue;
}
PDSoftMask softMask = extGState.getSoftMask();
if (softMask != null)
{
PDTransparencyGroup group = softMask.getGroup();
if (group != null)
{
// PDFBOX-4327: without this line NPEs will occur
res.getExtGState(name).copyIntoGraphicsState(getGraphicsState());
processSoftMask(group);
}
}
}
}
@Override
public void drawImage(PDImage pdImage) throws IOException
{
if (pdImage instanceof PDImageXObject)
{
if (pdImage.isStencil())
{
processColor(getGraphicsState().getNonStrokingColor());
}
PDImageXObject xobject = (PDImageXObject)pdImage;
if (seen.contains(xobject.getCOSObject()))
{
// skip duplicate image
return;
}
seen.add(xobject.getCOSObject());
}
// save image
String name = prefix + "-" + imageCounter;
imageCounter++;
write2file(pdImage, name, useDirectJPEG, noColorConvert);
}
@Override
public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3)
throws IOException
{
// Empty: add special handling if needed
}
@Override
public void clip(int windingRule) throws IOException
{
// Empty: add special handling if needed
}
@Override
public void moveTo(float x, float y) throws IOException
{
// Empty: add special handling if needed
}
@Override
public void lineTo(float x, float y) throws IOException
{
// Empty: add special handling if needed
}
@Override
public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3)
throws IOException
{
// Empty: add special handling if needed
}
@Override
public Point2D getCurrentPoint() throws IOException
{
return new Point2D.Float(0, 0);
}
@Override
public void closePath() throws IOException
{
// Empty: add special handling if needed
}
@Override
public void endPath() throws IOException
{
// Empty: add special handling if needed
}
@Override
protected void showGlyph(Matrix textRenderingMatrix,
PDFont font,
int code,
Vector displacement) throws IOException
{
PDGraphicsState graphicsState = getGraphicsState();
RenderingMode renderingMode = graphicsState.getTextState().getRenderingMode();
if (renderingMode.isFill())
{
processColor(graphicsState.getNonStrokingColor());
}
if (renderingMode.isStroke())
{
processColor(graphicsState.getStrokingColor());
}
}
@Override
public void strokePath() throws IOException
{
processColor(getGraphicsState().getStrokingColor());
}
@Override
public void fillPath(int windingRule) throws IOException
{
processColor(getGraphicsState().getNonStrokingColor());
}
@Override
public void fillAndStrokePath(int windingRule) throws IOException
{
processColor(getGraphicsState().getNonStrokingColor());
}
@Override
public void shadingFill(COSName shadingName) throws IOException
{
// Empty: add special handling if needed
}
// find out if it is a tiling pattern, then process that one
private void processColor(PDColor color) throws IOException
{
if (color.getColorSpace() instanceof PDPattern)
{
PDPattern pattern = (PDPattern) color.getColorSpace();
PDAbstractPattern abstractPattern = pattern.getPattern(color);
if (abstractPattern instanceof PDTilingPattern)
{
processTilingPattern((PDTilingPattern) abstractPattern, null, null);
}
}
}
/**
* Writes the image to a file with the filename prefix + an appropriate suffix, like
* "Image.jpg". The suffix is automatically set depending on the image compression in the
* PDF.
*
* @param pdImage the image.
* @param prefix the filename prefix.
* @param directJPEG if true, force saving JPEG/JPX streams as they are in the PDF file.
* @param noColorConvert if true, images are extracted with their original colorspace if
* possible.
* @throws IOException When something is wrong with the corresponding file.
*/
private void write2file(PDImage pdImage, String prefix, boolean directJPEG,
boolean noColorConvert) throws IOException
{
String suffix = pdImage.getSuffix();
if (suffix == null || "jb2".equals(suffix))
{
suffix = "png";
}
else if ("jpx".equals(suffix))
{
// use jp2 suffix for file because jpx not known by windows
suffix = "jp2";
}
if (hasMasks(pdImage))
{
// TIKA-3040, PDFBOX-4771: can't save ARGB as JPEG
suffix = "png";
}
if (noColorConvert)
{
// We write the raw image if in any way possible.
// But we have no alpha information here.
BufferedImage image = pdImage.getRawImage();
if (image != null)
{
int elements = image.getRaster().getNumDataElements();
suffix = "png";
if (elements > 3)
{
// More then 3 channels: Thats likely CMYK. We use tiff here,
// but a TIFF codec must be in the class path for this to work.
suffix = "tiff";
}
try (FileOutputStream imageOutput = new FileOutputStream(prefix + "." + suffix))
{
SYSOUT.println("Writing image: " + prefix + "." + suffix);
ImageIOUtil.writeImage(image, suffix, imageOutput);
imageOutput.flush();
}
return;
}
}
try (FileOutputStream imageOutput = new FileOutputStream(prefix + "." + suffix))
{
SYSOUT.println("Writing image: " + prefix + "." + suffix);
if ("jpg".equals(suffix))
{
String colorSpaceName = pdImage.getColorSpace().getName();
if (directJPEG ||
(PDDeviceGray.INSTANCE.getName().equals(colorSpaceName) ||
PDDeviceRGB.INSTANCE.getName().equals(colorSpaceName)))
{
// RGB or Gray colorspace: get and write the unmodified JPEG stream
InputStream data = pdImage.createInputStream(JPEG);
data.transferTo(imageOutput);
IOUtils.closeQuietly(data);
}
else
{
// for CMYK and other "unusual" colorspaces, the JPEG will be converted
BufferedImage image = pdImage.getImage();
if (image != null)
{
ImageIOUtil.writeImage(image, suffix, imageOutput);
}
}
}
else if ("jp2".equals(suffix))
{
String colorSpaceName = pdImage.getColorSpace().getName();
if (directJPEG
|| (PDDeviceGray.INSTANCE.getName().equals(colorSpaceName)
|| PDDeviceRGB.INSTANCE.getName().equals(colorSpaceName)))
{
// RGB or Gray colorspace: get and write the unmodified JPEG2000 stream
InputStream data = pdImage.createInputStream(
Collections.singletonList(COSName.JPX_DECODE.getName()));
data.transferTo(imageOutput);
IOUtils.closeQuietly(data);
}
else
{
// for CMYK and other "unusual" colorspaces, the image will be converted
BufferedImage image = pdImage.getImage();
if (image != null)
{
ImageIOUtil.writeImage(image, "jpeg2000", imageOutput);
}
}
}
else if ("tiff".equals(suffix) && pdImage.getColorSpace().equals(PDDeviceGray.INSTANCE))
{
BufferedImage image = pdImage.getImage();
if (image == null)
{
return;
}
// CCITT compressed images can have a different colorspace, but this one is B/W
// This is a bitonal image, so copy to TYPE_BYTE_BINARY
// so that a G4 compressed TIFF image is created by ImageIOUtil.writeImage()
int w = image.getWidth();
int h = image.getHeight();
BufferedImage bitonalImage = new BufferedImage(w, h, BufferedImage.TYPE_BYTE_BINARY);
// copy image the old fashioned way - ColorConvertOp is slower!
for (int y = 0; y < h; y++)
{
for (int x = 0; x < w; x++)
{
bitonalImage.setRGB(x, y, image.getRGB(x, y));
}
}
ImageIOUtil.writeImage(bitonalImage, suffix, imageOutput);
}
else
{
BufferedImage image = pdImage.getImage();
if (image != null)
{
ImageIOUtil.writeImage(image, suffix, imageOutput);
}
}
imageOutput.flush();
}
}
private boolean hasMasks(PDImage pdImage) throws IOException
{
if (pdImage instanceof PDImageXObject)
{
PDImageXObject ximg = (PDImageXObject) pdImage;
return ximg.getMask() != null || ximg.getSoftMask() != null;
}
return false;
}
}
}