ExtractText.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.tools;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.io.Writer;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.Callable;

import org.apache.commons.io.FilenameUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.io.RandomAccessReadBuffer;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.Matrix;

import picocli.CommandLine;
import picocli.CommandLine.Command;
import picocli.CommandLine.Option;

/**
 * This is the main program that simply parses the pdf document and transforms it
 * into text.
 *
 * @author Ben Litchfield
 * @author Tilman Hausherr
 */
@Command(name = "extracttext", header = "Extracts the text from a PDF document", versionProvider = Version.class, mixinStandardHelpOptions = true)
public final class ExtractText  implements Callable<Integer>
{
    private static final Logger LOG = LogManager.getLogger(ExtractText.class);

    private static final String STD_ENCODING = "UTF-8";

    // Expected for CLI app to write to System.out/System.err
    @SuppressWarnings("squid:S106")
    private final PrintStream SYSOUT;
    @SuppressWarnings("squid:S106")
    private final PrintStream SYSERR;

    @Option(names = "-alwaysNext", description = "Process next page (if applicable) despite IOException " + 
        "(ignored when -html)")
    private boolean alwaysNext = false;

    @Option(names = "-console", description = "Send text to console instead of file")
    private boolean toConsole = false;

    @Option(names = "-debug", description = "Enables debug output about the time consumption of every stage")
    private boolean debug = false;

    @Option(names = "-encoding", description = "UTF-8 or ISO-8859-1, UTF-16BE, UTF-16LE, etc. (default: ${DEFAULT-VALUE})")
    private String encoding = STD_ENCODING;

    @Option(names = "-endPage", description = "The last page to extract (1 based, inclusive)")
    private int endPage = Integer.MAX_VALUE;

    @Option(names = "-html", description = "Output in HTML format instead of raw text")
    private boolean toHTML = false;

    @Option(names = "-md", description = "Output in Markdown format instead of raw text")
    private boolean toMD = false;

    @Option(names = "-ignoreBeads", description = "Disables the separation by beads")
    private boolean ignoreBeads = false;

    @Option(names = "-password", description = "the password for the PDF or certificate in keystore.", arity = "0..1", interactive = true)    
    private String password = "";

    @Option(names = "-rotationMagic", description = "Analyze each page for rotated/skewed text, rotate to 0�� " +
        "and extract separately (slower, and ignored when -html)" )
    private boolean rotationMagic = false;

    @Option(names = "-sort", description = "Sort the text before writing of every stage")
    private boolean sort = false;

    @Option(names = "-startPage", description = "The first page to start extraction (1 based)")
    private int startPage = 1;

    @Option(names = {"-i", "--input"}, description = "the PDF file", required = true)
    private File infile;

    @Option(names = {"-o", "--output"}, description = "the exported text file")
    private File outfile;

    @Option(names = "-addFileName", description = "Print PDF file name to the output text")
    private boolean addFileName = false;

    @Option(names = "-append", description = "Use append mode for output file")
    private boolean append = false;

    /**
     * Constructor.
     */
    public ExtractText()
    {
        SYSOUT = System.out;
        SYSERR = System.err;
    }

    /**
     * Infamous main method.
     *
     * @param args Command line arguments, should be one and a reference to a file.
     */
    public static void main( String[] args )
    {
        // suppress the Dock icon on OS X
        System.setProperty("apple.awt.UIElement", "true");

        int exitCode = new CommandLine(new ExtractText()).execute(args);
        System.exit(exitCode);
    }

    /**
     * Starts the text extraction.
     *  
     */
    public Integer call()
    {
        // set file extension
        if (toHTML && toMD)
        {
            SYSERR.println( "You can't set md and html at the same time");
            return 1;
        }
        String ext = toHTML ? ".html" : ".txt";
        ext = toMD ? ".md" : ext;

        if (outfile == null)
        {
            String outPath = FilenameUtils.removeExtension(infile.getAbsolutePath()) + ext;
            outfile = new File(outPath);
        }

        if (toHTML && !STD_ENCODING.equals(encoding))
        {
            encoding = STD_ENCODING;
            SYSOUT.println("The encoding parameter is ignored when writing html output.");
        }

        if (toConsole && encoding != null)
        {
            SYSOUT.println("The encoding parameter is ignored when writing to the console.");
        }

        try (PDDocument document = Loader.loadPDF(infile, password);
                Writer output = createOutputWriter())
        {
            long startTime = startProcessing("Loading PDF " + infile);

            AccessPermission ap = document.getCurrentAccessPermission();
            if( ! ap.canExtractContent() )
            {
                SYSERR.println( "You do not have permission to extract text");
                return 1;
            }
            
            stopProcessing("Time for loading: ", startTime);

            startTime = startProcessing("Starting text extraction");

            if (addFileName)
            {
                output.write("PDF file: " + infile);
                output.write(System.lineSeparator());
            }

            if (debug)
            {
                SYSERR.println("Writing to " + outfile.getAbsolutePath());
            }

            PDFTextStripper stripper;
            if(toHTML)
            {
                // HTML stripper can't work page by page because of startDocument() callback
                stripper = new PDFText2HTML();
                stripper.setSortByPosition(sort);
                stripper.setShouldSeparateByBeads(!ignoreBeads);
                stripper.setStartPage(startPage);
                stripper.setEndPage(endPage);

                // Extract text for main document:
                stripper.writeText(document, output);
            }
            else
            {
                if (toMD)
                {
                    if (rotationMagic)
                    {
                        stripper = new FilteredText2Markdown();
                    }
                    else
                    {
                        stripper = new PDFText2Markdown();
                    }
                }
                else
                {
                    if (rotationMagic)
                    {
                        stripper = new FilteredTextStripper();
                    }
                    else
                    {
                        stripper = new PDFTextStripper();
                    }
                }
                stripper.setSortByPosition(sort);
                stripper.setShouldSeparateByBeads(!ignoreBeads);

                // Extract text for main document:
                extractPages(startPage, Math.min(endPage, document.getNumberOfPages()), 
                             stripper, document, output, rotationMagic, alwaysNext);
            }

            // ... also for any embedded PDFs:
            PDDocumentCatalog catalog = document.getDocumentCatalog();
            PDDocumentNameDictionary names = catalog.getNames();    
            if (names != null)
            {
                PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
                if (embeddedFiles != null)
                {
                    Map<String, PDComplexFileSpecification> embeddedFileNames = embeddedFiles.getNames();
                    if (embeddedFileNames != null)
                    {
                        for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet()) 
                        {
                            if (debug)
                            {
                                SYSERR.println("Processing embedded file " + ent.getKey() + ":");
                            }
                            PDComplexFileSpecification spec = ent.getValue();
                            PDEmbeddedFile file = spec.getEmbeddedFile();
                            if (file != null && "application/pdf".equals(file.getSubtype()))
                            {
                                if (debug)
                                {
                                    SYSERR.println("  is PDF (size=" + file.getSize() + ")");
                                }
                                try (PDDocument subDoc = Loader.loadPDF(RandomAccessReadBuffer
                                                .createBufferFromStream(file.createInputStream())))
                                {
                                    if (toHTML)
                                    {
                                        // will not really work because of HTML header + footer
                                        stripper.writeText( subDoc, output );
                                    }
                                    else
                                    {
                                        extractPages(1, subDoc.getNumberOfPages(),
                                                     stripper, subDoc, output, rotationMagic, alwaysNext);
                                    }
                                } 
                            }
                        } 
                    }
                }
            }
            output.flush();
            stopProcessing("Time for extraction: ", startTime);
        }
        catch (IOException ioe)
        {
            SYSERR.println( "Error extracting text for document [" + ioe.getClass().getSimpleName() + "]: " + ioe.getMessage());
            return 4;
        }

        return 0;
    }

    private Writer createOutputWriter() throws IOException
    {
        if (toConsole)
        {
            return new PrintWriter(SYSOUT)
            {
                @Override
                public void close()
                {
                    // don't close the console
                }
            };
        }
        else
        {
            return new OutputStreamWriter(new FileOutputStream(outfile, append), encoding);
        }
    }

    private void extractPages(int startPage, int endPage,
            PDFTextStripper stripper, PDDocument document, Writer output,
            boolean rotationMagic, boolean alwaysNext) throws IOException
    {
        for (int p = startPage; p <= endPage; ++p)
        {
            stripper.setStartPage(p);
            stripper.setEndPage(p);
            try
            {
                if (rotationMagic)
                {
                    PDPage page = document.getPage(p - 1);
                    int rotation = page.getRotation();
                    page.setRotation(0);
                    AngleCollector angleCollector = new AngleCollector();
                    angleCollector.setStartPage(p);
                    angleCollector.setEndPage(p);
                    angleCollector.writeText(document, new NullWriter());
                    // rotation magic
                    for (int angle : angleCollector.getAngles())
                    {
                        // prepend a transformation
                        // (we could skip these parts for angle 0, but it doesn't matter much)
                        try (PDPageContentStream cs = new PDPageContentStream(document, page, 
                                PDPageContentStream.AppendMode.PREPEND, false))
                        {
                            cs.transform(Matrix.getRotateInstance(-Math.toRadians(angle), 0, 0));
                        }

                        stripper.writeText(document, output);

                        // remove prepended transformation
                        page.getCOSObject().getCOSArray(COSName.CONTENTS).remove(0);
                    }
                    page.setRotation(rotation);
                }
                else
                {
                    stripper.writeText(document, output);
                }
            }
            catch (IOException ex)
            {
                if (!alwaysNext)
                {
                    throw ex;
                }
                LOG.error("Failed to process page " + p, ex);
            }
        }
    }

    private long startProcessing(String message) 
    {
        if (debug) 
        {
            SYSERR.println(message);
        }
        return System.currentTimeMillis();
    }
    
    private void stopProcessing(String message, long startTime) 
    {
        if (debug)
        {
            long stopTime = System.currentTimeMillis();
            float elapsedTime = ((float)(stopTime - startTime))/1000;
            SYSERR.println(message + elapsedTime + " seconds");
        }
    }

    static int getAngle(TextPosition text)
    {
        // should this become a part of TextPosition?
        Matrix m = text.getTextMatrix().clone();
        m.concatenate(text.getFont().getFontMatrix());
        return (int) Math.round(Math.toDegrees(Math.atan2(m.getShearY(), m.getScaleY())));
    }
}

/**
 * Text stripper that records the angle of every glyph it encounters instead of collecting text.
 * Angles are whole degrees (rounding keeps tiny floating point differences from splitting
 * glyphs that share a direction into separate groups) and are normalised to a non-negative
 * value. Use a fresh instance for each page so that the set starts out empty.
 */
class AngleCollector extends PDFTextStripper
{
    // sorted, duplicate-free set of the angles seen so far
    private final Set<Integer> angles = new TreeSet<>();

    AngleCollector() throws IOException
    {
    }

    @Override
    protected void processTextPosition(TextPosition text)
    {
        // shift negative angles into the non-negative range before storing them
        int normalized = (ExtractText.getAngle(text) + 360) % 360;
        angles.add(normalized);
    }

    /**
     * @return the angles collected so far, in ascending order
     */
    Set<Integer> getAngles()
    {
        return angles;
    }
}

/**
 * Plain text stripper that drops every glyph whose angle is not 0.
 */
class FilteredTextStripper extends PDFTextStripper
{
    @Override
    protected void processTextPosition(TextPosition text)
    {
        if (ExtractText.getAngle(text) != 0)
        {
            // rotated glyph: leave it out of the extracted text
            return;
        }
        super.processTextPosition(text);
    }
}

/**
 * Markdown stripper that drops every glyph whose angle is not 0.
 */
class FilteredText2Markdown extends PDFText2Markdown
{
    @Override
    protected void processTextPosition(TextPosition text)
    {
        if (ExtractText.getAngle(text) != 0)
        {
            // rotated glyph: leave it out of the extracted text
            return;
        }
        super.processTextPosition(text);
    }
}

/**
 * Writer that discards everything written to it.
 */
class NullWriter extends Writer
{
    @Override
    public void write(char[] buffer, int offset, int length) throws IOException
    {
        // intentionally empty: the characters are dropped
    }

    @Override
    public void flush() throws IOException
    {
        // intentionally empty: nothing is buffered
    }

    @Override
    public void close() throws IOException
    {
        // intentionally empty: there is no resource to release
    }
}