TestTextStripper.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.text;
import difflib.ChangeDelta;
import difflib.DeleteDelta;
import difflib.DiffUtils;
import difflib.InsertDelta;
import difflib.Patch;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.io.Writer;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.LogManager;
import org.apache.fontbox.util.BoundingBox;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDFontDescriptor;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.PDType3Font;
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
/**
* Test suite for PDFTextStripper.
*
* FILE SET VALIDATION
*
* This test suite is designed to test PDFTextStripper using a set of PDF
* files and known good output for each. The default mode of testExtract()
* is to process each *.pdf file in "src/test/resources/input". An output
* file is created in "target/test-output" with the same name as the PDF file,
* plus an additional ".txt" suffix.
*
* The output file is then tested against a known good result file from
* the input directory (again, with the same name as the tested PDF file,
* but with the additional ".txt" suffix). The process is performed both
* with and without sorting enabled. The sorted files have a "-sorted.txt"
* suffix.
*
* So for the file "src/test/resources/input/hello.pdf", an output file will
* be generated named "target/test-output/hello.pdf.txt". Then that file
* will be compared to the known good file
* "src/test/resources/input/hello.pdf.txt", if it exists.
*
* To support testing with files that are not officially distributed
* with PDFBox, this test will also look in the "target/test-input-ext"
* directory.
*
* Any errors are logged, and at the end of processing all *.pdf files, if
* there were any errors, the test fails. The logging is at INFO, as the
* general goal is overall validation, and on failure, the indication of
* which file or files failed.
*
* When processing new PDF files, you may use testExtract() to generate output,
* verify the output manually, then move the output file to the test input
* directory to use as the basis for future validations.
*
* SINGLE FILE VALIDATION
*
* To further research individual failures, the org.apache.pdfbox.util.TextStripper.file
* system property may be set with the name of a single file in the
* "src/test/resources/input" directory. In this mode, testExtract() will evaluate
* only that file, and will also log the extracted text (at INFO level).
*
* @author Robert Dickinson
* @author Ben Litchfield
*/
class TestTextStripper
{
/**
 * Logger instance.
 */
private static final Logger LOG = LogManager.getLogger(TestTextStripper.class);
// Set to true by doTestFile() and compareResult() when a verification fails;
// read by testExtract() and testTabula() after processing to decide the test outcome.
private boolean bFail = false;
// Shared stripper instance, created once in init().
private static PDFTextStripper stripper;
// Charset name used when writing the extracted text and when reading text files back.
private static final String ENCODING = "UTF-8";
/**
 * One-time set-up: creates the shared stripper and configures it to use
 * a line feed as line separator.
 *
 * @throws IOException If there is an error initializing the test.
 */
@BeforeAll
static void init() throws IOException
{
    // If you want to test a single file using DEBUG logging, from an IDE,
    // you can do something like this:
    // System.setProperty("org.apache.pdfbox.util.TextStripper.file", "FVS318Ref.pdf");
    PDFTextStripper sharedStripper = new PDFTextStripper();
    sharedStripper.setLineSeparator("\n");
    stripper = sharedStripper;
}
/**
 * Determine whether an expected and an actual line are considered equal.
 * <p>
 * Two <code>null</code> lines are equal, and a <code>null</code> line is equal to a
 * line that is empty after trimming. Otherwise both lines are trimmed and compared
 * character by character. When the characters at the current positions match and
 * that character is a space or has a code above 256, the rest of that run of such
 * characters is skipped on each side without being compared. The trimmed lines must
 * also have the same length. Every difference found is logged as a warning.
 *
 * @param expected Expected string
 * @param actual Actual String
 * @return <code>true</code> if the strings are both null, or if they are considered
 * equal by the rules above, otherwise <code>false</code>.
 */
private boolean stringsEqual(String expected, String actual)
{
    if (expected == null && actual == null)
    {
        return true;
    }
    if (expected == null || actual == null)
    {
        // exactly one side is missing: acceptable only if the other side is blank
        String present = expected == null ? actual : expected;
        return present.trim().isEmpty();
    }
    char[] expectedArray = expected.trim().toCharArray();
    char[] actualArray = actual.trim().toCharArray();
    int expectedIndex = 0;
    int actualIndex = 0;
    while (expectedIndex < expectedArray.length && actualIndex < actualArray.length)
    {
        if (expectedArray[expectedIndex] != actualArray[actualIndex])
        {
            // four "{}" placeholders for the four arguments (index and char code per side)
            LOG.warn("Lines differ at index expected: {}-{} actual: {}-{}", expectedIndex,
                    (int) expectedArray[expectedIndex], actualIndex,
                    (int) actualArray[actualIndex]);
            return false;
        }
        // skipWhitespace() stops on the last character of a run, the increments
        // below then move both sides to the first character after it
        expectedIndex = skipWhitespace(expectedArray, expectedIndex);
        actualIndex = skipWhitespace(actualArray, actualIndex);
        expectedIndex++;
        actualIndex++;
    }
    boolean equals = true;
    if (expectedIndex != expectedArray.length)
    {
        equals = false;
        LOG.warn("Expected line is longer at: {}", expectedIndex);
    }
    if (actualIndex != actualArray.length)
    {
        equals = false;
        LOG.warn("Actual line is longer at: {}", actualIndex);
    }
    if (expectedArray.length != actualArray.length)
    {
        equals = false;
        LOG.warn("Expected lines: {}, actual lines: {}", expectedArray.length,
                actualArray.length);
    }
    return equals;
}
/**
 * If the character at the given index is a space or has a code above 256, move to
 * the last character of the run of such characters that starts there.
 *
 * @param array the characters of the line
 * @param index the current position, must be a valid index of the array
 * @return the index of the last character of the run, or the unchanged index if the
 * character at the given index does not start a run. The caller is expected to
 * advance by one afterwards.
 */
private int skipWhitespace( char[] array, int index )
{
    char current = array[index];
    if (current != ' ' && current <= 256)
    {
        // not at a skippable character: nothing to do
        return index;
    }
    // look for the first character after the run (or the end of the array) ...
    int end = index + 1;
    while (end < array.length && (array[end] == ' ' || array[end] > 256))
    {
        end++;
    }
    // ... and step back by one, because stringsEqual will roll forward by one
    return end - 1;
}
/**
 * Validate text extraction on a single file.
 * <p>
 * The text is written (preceded by a UTF-8 byte order mark) to a file in the output
 * directory and then compared with the known good file located next to the PDF. If
 * the known good file does not exist, the failure flag is set and an error is logged.
 *
 * @param inFile The PDF file to validate
 * @param outDir The directory to store the output in
 * @param bLogResult Whether to log the extracted text
 * @param bSort Whether or not the extracted text is sorted
 * @throws Exception when there is an exception
 */
private void doTestFile(File inFile, File outDir, boolean bLogResult, boolean bSort)
        throws Exception
{
    LOG.info(bSort
            ? "Preparing to parse {} for sorted test"
            : "Preparing to parse {} for standard test", inFile.getName());
    Files.createDirectories(outDir.toPath());
    try (PDDocument document = Loader.loadPDF(inFile))
    {
        // sorted runs use the same names with an additional "-sorted" part
        String baseName = inFile.getName() + (bSort ? "-sorted" : "");
        File outFile = new File(outDir, baseName + ".txt");
        File diffFile = new File(outDir, baseName + "-diff.txt");
        File expectedFile = new File(inFile.getParentFile(), baseName + ".txt");

        // delete possible leftover
        diffFile.delete();

        try (OutputStream os = new FileOutputStream(outFile))
        {
            // UTF-8 byte order mark
            os.write(new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF });
            try (Writer writer = new BufferedWriter(new OutputStreamWriter(os, ENCODING)))
            {
                //Allows for sorted tests
                stripper.setSortByPosition(bSort);
                stripper.writeText(document, writer);
            }
            // the written file is closed here, before it is read again
        }

        if (bLogResult)
        {
            LOG.info("Text for {}:", inFile.getName());
            LOG.info(stripper.getText(document));
        }

        if (expectedFile.exists())
        {
            compareResult(expectedFile, outFile, inFile, bSort, diffFile);
        }
        else
        {
            this.bFail = true;
            LOG.error("FAILURE: Input verification file: {} does not exist",
                    expectedFile.getAbsolutePath());
        }
    }
}
/**
 * Compare an extracted text file with its known good counterpart, ignoring blank
 * lines. Every mismatching line is logged and sets the failure flag. If there was
 * no mismatch the output file is deleted, otherwise a diff of both files is
 * written to the diff file.
 *
 * @param expectedFile the known good text file
 * @param outFile the text file that was just created
 * @param inFile the PDF file the text was extracted from (not used by this method)
 * @param bSort whether the text was extracted with sorting, used for logging
 * @param diffFile the file that receives the diff if the files differ
 * @throws IOException if reading or writing one of the files fails
 */
private void compareResult(File expectedFile, File outFile, File inFile, boolean bSort, File diffFile)
        throws IOException
{
    boolean mismatchFound = false;
    try (LineNumberReader expectedReader = new LineNumberReader(
                 new InputStreamReader(new FileInputStream(expectedFile), ENCODING));
         LineNumberReader actualReader = new LineNumberReader(
                 new InputStreamReader(new FileInputStream(outFile), ENCODING)))
    {
        String expectedLine;
        String actualLine;
        do
        {
            expectedLine = readNonBlankLine(expectedReader);
            actualLine = readNonBlankLine(actualReader);
            if (!stringsEqual(expectedLine, actualLine))
            {
                // lets report all lines, even though this might produce some verbose logging
                this.bFail = true;
                mismatchFound = true;
                LOG.error(
                        "FAILURE: Line mismatch for file {} (sort = {}) at expected line: {} at actual line: {}\nexpected line was: \"{}\"\nactual line was: \"{}\"\n",
                        expectedFile.getAbsolutePath(), bSort, expectedReader.getLineNumber(),
                        actualReader.getLineNumber(), expectedLine, actualLine);
            }
        }
        while (expectedLine != null && actualLine != null);
    }

    if (!mismatchFound)
    {
        outFile.delete();
        return;
    }

    // https://code.google.com/p/java-diff-utils/wiki/SampleUsage
    // Compute diff. Get the Patch object. Patch is the container for computed deltas.
    Patch<String> patch = DiffUtils.diff(fileToLines(expectedFile), fileToLines(outFile));
    try (PrintStream diffPS = new PrintStream(diffFile, ENCODING))
    {
        patch.getDeltas().forEach(delta ->
        {
            boolean knownDelta = delta instanceof ChangeDelta
                    || delta instanceof DeleteDelta
                    || delta instanceof InsertDelta;
            if (knownDelta)
            {
                // all three kinds are reported the same way
                diffPS.println("Org: " + delta.getOriginal());
                diffPS.println("New: " + delta.getRevised());
                diffPS.println();
            }
            else
            {
                diffPS.println(delta);
            }
        });
    }
}

/**
 * Read the next line that is not empty after trimming.
 *
 * @param reader the reader to read from
 * @return the next non blank line, or <code>null</code> at the end of the input
 * @throws IOException if reading fails
 */
private static String readNonBlankLine(LineNumberReader reader) throws IOException
{
    String line = reader.readLine();
    while (line != null && line.trim().isEmpty())
    {
        line = reader.readLine();
    }
    return line;
}
/**
 * Helper method to get the content of a text file.
 *
 * @param file the file to read, decoded with {@link #ENCODING}
 * @return the lines of the file, in file order
 * @throws IOException if the file cannot be read
 */
private static List<String> fileToLines(File file) throws IOException
{
    List<String> content = new LinkedList<>();
    try (BufferedReader reader = new BufferedReader(
            new InputStreamReader(new FileInputStream(file), ENCODING)))
    {
        for (String current = reader.readLine(); current != null; current = reader.readLine())
        {
            content.add(current);
        }
    }
    return content;
}
/**
 * Find the 0-based number of the page an outline item points to. The number is
 * determined in two ways, and both results are asserted to be identical.
 *
 * @param doc the document that contains the outline item
 * @param oi the outline item; its destination must be a page destination
 * @return the page number retrieved from the destination of the outline item
 * @throws IOException if the destination of the outline item cannot be determined
 */
private int findOutlineItemDestPageNum(PDDocument doc, PDOutlineItem oi) throws IOException
{
    PDPageDestination destination = (PDPageDestination) oi.getDestination();

    // first way: position of the destination page in the page tree
    PDPage destinationPage = oi.findDestinationPage(doc);
    int indexInPageTree = doc.getPages().indexOf(destinationPage);

    // second way: ask the destination itself
    int numberFromDestination = destination.retrievePageNumber();

    assertEquals(indexInPageTree, numberFromDestination);
    return numberFromDestination;
}
/**
 * Test whether stripping controlled by outline items works properly. The test file has 4
 * outline items at the top level, that point to 0-based pages 0, 2, 3 and 4. We are testing
 * text stripping by outlines pointing to 0-based pages 2 and 3, and also text stripping of the
 * 0-based page 2. The test makes sure that the output is different to a complete strip, not
 * empty, different to each other when different bookmark intervals are used, but identical from
 * bookmark intervals to strips with page intervals. When fed with orphan bookmarks, stripping
 * must be empty.
 * <p>
 * The test uses its own stripper, so that the bookmarks and page ranges set here cannot
 * influence tests that use the shared stripper.
 *
 * @throws IOException if the test file cannot be loaded or stripped
 * @throws URISyntaxException if the URL of the test file cannot be converted to a URI
 */
@Test
void testStripByOutlineItems() throws IOException, URISyntaxException
{
    File pdfFile = new File(this.getClass().getResource("../pdmodel/with_outline.pdf").toURI());

    // private stripper: bookmark and page range settings must not leak into the shared one
    PDFTextStripper outlineStripper = new PDFTextStripper();
    outlineStripper.setLineSeparator("\n");

    try (PDDocument doc = Loader.loadPDF(pdfFile))
    {
        PDDocumentOutline outline = doc.getDocumentCatalog().getDocumentOutline();
        Iterable<PDOutlineItem> children = outline.children();
        Iterator<PDOutlineItem> it = children.iterator();
        PDOutlineItem oi0 = it.next();
        PDOutlineItem oi2 = it.next();
        PDOutlineItem oi3 = it.next();
        PDOutlineItem oi4 = it.next();
        assertEquals(0, findOutlineItemDestPageNum(doc, oi0));
        assertEquals(2, findOutlineItemDestPageNum(doc, oi2));
        assertEquals(3, findOutlineItemDestPageNum(doc, oi3));
        assertEquals(4, findOutlineItemDestPageNum(doc, oi4));

        String textFull = outlineStripper.getText(doc);
        assertFalse(textFull.isEmpty());
        String expectedTextFull =
                "First level 1\n"
                + "First level 2\n"
                + "Fist level 3\n"
                + "Some content\n"
                + "Some other content\n"
                + "Second at level 1\n"
                + "Second level 2\n"
                + "Content\n"
                + "Third level 1\n"
                + "Third level 2\n"
                + "Third level 3\n"
                + "Content\n"
                + "Fourth level 1\n"
                + "Content\n"
                + "Content\n";
        assertEquals(expectedTextFull, textFull.replaceAll("\r", ""));

        // this should grab 0-based pages 2 and 3, i.e. 1-based pages 3 and 4
        // by their bookmarks
        outlineStripper.setStartBookmark(oi2);
        outlineStripper.setEndBookmark(oi3);
        String textoi23 = outlineStripper.getText(doc);
        assertFalse(textoi23.isEmpty());
        assertNotEquals(textoi23, textFull);
        String expectedTextoi23 =
                "Second at level 1\n"
                + "Second level 2\n"
                + "Content\n"
                + "Third level 1\n"
                + "Third level 2\n"
                + "Third level 3\n"
                + "Content\n";
        assertEquals(expectedTextoi23, textoi23.replaceAll("\r", ""));

        // this should grab 0-based pages 2 and 3, i.e. 1-based pages 3 and 4
        // by their page numbers
        outlineStripper.setStartBookmark(null);
        outlineStripper.setEndBookmark(null);
        outlineStripper.setStartPage(3);
        outlineStripper.setEndPage(4);
        String textp34 = outlineStripper.getText(doc);
        assertFalse(textp34.isEmpty());
        assertNotEquals(textp34, textFull);
        assertEquals(textoi23, textp34);

        // this should grab 0-based page 2, i.e. 1-based page 3
        // by the bookmark
        outlineStripper.setStartBookmark(oi2);
        outlineStripper.setEndBookmark(oi2);
        String textoi2 = outlineStripper.getText(doc);
        assertFalse(textoi2.isEmpty());
        assertNotEquals(textoi2, textoi23);
        assertNotEquals(textoi2, textFull);
        String expectedTextoi2 =
                "Second at level 1\n"
                + "Second level 2\n"
                + "Content\n";
        assertEquals(expectedTextoi2, textoi2.replaceAll("\r", ""));

        // this should grab 0-based page 2, i.e. 1-based page 3
        // by the page number
        outlineStripper.setStartBookmark(null);
        outlineStripper.setEndBookmark(null);
        outlineStripper.setStartPage(3);
        outlineStripper.setEndPage(3);
        String textp3 = outlineStripper.getText(doc);
        assertFalse(textp3.isEmpty());
        assertNotEquals(textp3, textp34);
        assertNotEquals(textp3, textFull);
        assertEquals(textoi2, textp3);

        // Test with orphan bookmark
        PDOutlineItem oiOrphan = new PDOutlineItem();
        outlineStripper.setStartBookmark(oiOrphan);
        outlineStripper.setEndBookmark(oiOrphan);
        String textOiOrphan = outlineStripper.getText(doc);
        assertTrue(textOiOrphan.isEmpty());
    }
}
/**
 * Process each file in the specified directory whose name ends with ".pdf". Each
 * file is tested twice, without and with sorting.
 *
 * @param inDir Input directory search for PDF files in.
 * @param outDir Output directory where the temp files will be created.
 * @throws IOException if the content of the input directory cannot be listed
 * @throws Exception when there is an exception while testing a file
 */
private void doTestDir(File inDir, File outDir) throws Exception
{
    File[] testFiles = inDir.listFiles((File dir, String name) -> name.endsWith(".pdf"));
    if (testFiles == null)
    {
        // listFiles() returns null if inDir is not a directory or if an I/O error occurs;
        // fail with a clear message instead of a NullPointerException in the loop below
        throw new IOException("Cannot list PDF files in directory: " + inDir.getAbsolutePath());
    }
    for (File testFile : testFiles)
    {
        //Test without sorting
        doTestFile(testFile, outDir, false, false);
        //Test with sorting
        doTestFile(testFile, outDir, false, true);
    }
}
/**
 * Test to validate text extraction of file set.
 * <p>
 * If the system property "org.apache.pdfbox.util.TextStripper.file" is set to a
 * non empty value, only that file of the input directory is tested and its text is
 * logged. Otherwise all PDF files of the input directory are tested, and also those
 * of the "target/test-input-ext" directory if it exists.
 *
 * @throws Exception when there is an exception
 */
@Test
void testExtract() throws Exception
{
    File inDir = new File("src/test/resources/input");
    File outDir = new File("target/test-output");
    String singleFileName = System.getProperty("org.apache.pdfbox.util.TextStripper.file");

    if (singleFileName != null && !singleFileName.isEmpty())
    {
        File singleFile = new File(inDir, singleFileName);
        //Test without sorting
        doTestFile(singleFile, outDir, true, false);
        //Test with sorting
        doTestFile(singleFile, outDir, true, true);
    }
    else
    {
        doTestDir(inDir, outDir);
        File inDirExt = new File("target/test-input-ext");
        if (inDirExt.exists())
        {
            doTestDir(inDirExt, new File("target/test-output-ext"));
        }
    }

    if (this.bFail)
    {
        fail("One or more failures, see test log for details");
    }
}
/**
 * Extract the text of eu-001.pdf with the {@link PDFTabulaTextStripper} and compare
 * the result with the known good file eu-001.pdf-tabula.txt.
 *
 * @throws IOException if a file cannot be read or written
 */
@Test
void testTabula() throws IOException
{
    File pdfFile = new File("src/test/resources/input","eu-001.pdf");
    File outFile = new File("target/test-output","eu-001.pdf-tabula.txt");
    File expectedOutFile = new File("src/test/resources/input","eu-001.pdf-tabula.txt");
    File diffFile = new File("target/test-output","eu-001.pdf-tabula-diff.txt");

    // the output directory does not exist yet if this test runs before any other
    // test that creates it
    Files.createDirectories(outFile.getParentFile().toPath());
    // delete possible leftover
    diffFile.delete();

    try (PDDocument tabulaDocument = Loader.loadPDF(pdfFile))
    {
        PDFTextStripper tabulaStripper = new PDFTabulaTextStripper();
        try (OutputStream os = new FileOutputStream(outFile))
        {
            // UTF-8 byte order mark
            os.write(0xEF);
            os.write(0xBB);
            os.write(0xBF);
            try (Writer writer = new BufferedWriter(new OutputStreamWriter(os, ENCODING)))
            {
                tabulaStripper.writeText(tabulaDocument, writer);
            }
        }
    }

    compareResult(expectedOutFile, outFile, pdfFile, false, diffFile);
    assertFalse(bFail);
}
private class PDFTabulaTextStripper extends PDFTextStripper
{
PDFTabulaTextStripper() throws IOException
{
// empty
}
@Override
protected float computeFontHeight(PDFont font) throws IOException
{
BoundingBox bbox = font.getBoundingBox();
if (bbox.getLowerLeftY() < Short.MIN_VALUE)
{
// PDFBOX-2158 and PDFBOX-3130
// files by Salmat eSolutions / ClibPDF Library
bbox.setLowerLeftY(-(bbox.getLowerLeftY() + 65536));
}
// 1/2 the bbox is used as the height todo: why?
float glyphHeight = bbox.getHeight() / 2;
// sometimes the bbox has very high values, but CapHeight is OK
PDFontDescriptor fontDescriptor = font.getFontDescriptor();
if (fontDescriptor != null)
{
float capHeight = fontDescriptor.getCapHeight();
if (Float.compare(capHeight, 0) != 0
&& (capHeight < glyphHeight || Float.compare(glyphHeight, 0) == 0))
{
glyphHeight = capHeight;
}
// PDFBOX-3464, PDFBOX-448:
// sometimes even CapHeight has very high value, but Ascent and Descent are ok
float ascent = fontDescriptor.getAscent();
float descent = fontDescriptor.getDescent();
if (ascent > 0 && descent < 0
&& ((ascent - descent) / 2 < glyphHeight || Float.compare(glyphHeight, 0) == 0))
{
glyphHeight = (ascent - descent) / 2;
}
}
// transformPoint from glyph space -> text space
float height;
if (font instanceof PDType3Font)
{
height = font.getFontMatrix().transformPoint(0, glyphHeight).y;
}
else
{
height = glyphHeight / 1000;
}
return height;
}
}
/**
 * Check that setting start and end pages work properly: only the second page of
 * the test file is extracted, and the beginning, the end and the length of the
 * extracted text are verified.
 *
 * @throws IOException if the test file cannot be loaded or stripped
 */
@Test
void testStartEndPage() throws IOException
{
    File input = new File("src/test/resources/input", "eu-001.pdf");
    try (PDDocument document = Loader.loadPDF(input))
    {
        PDFTextStripper pageStripper = new PDFTextStripper();
        pageStripper.setStartPage(2);
        pageStripper.setEndPage(2);

        String pageText = pageStripper.getText(document).trim();
        assertTrue(pageText.startsWith("Pesticides"));
        assertTrue(pageText.endsWith("1 000 10 10"));

        // the length is checked without carriage returns
        String withoutCarriageReturns = pageText.replaceAll("\r", "");
        assertEquals(1378, withoutCarriageReturns.length());
    }
}
/**
 * PDFBOX-3774: test the IgnoreContentStreamSpaceGlyphs option.
 * <p>
 * A page is created with the text "( )" and, starting at an offset to the right of
 * it on the same line, the text "overlap" in a second font at twice the font size.
 * With sorting by position and the option enabled, the stripper must return
 * "( overlap )".
 *
 * @throws Exception if the document cannot be created or stripped
 */
@Test
void testIgnoreContentStreamSpaceGlyphs() throws Exception
{
    try (PDDocument document = new PDDocument())
    {
        PDPage page = new PDPage();
        try (PDPageContentStream contents = new PDPageContentStream(document, page))
        {
            float fontHeight = 8;
            float x = 50;
            float y = page.getMediaBox().getHeight() - 50;

            // the parentheses with the spaces in between
            PDFont baseFont = new PDType1Font(Standard14Fonts.FontName.HELVETICA);
            contents.beginText();
            contents.setFont(baseFont, fontHeight);
            contents.newLineAtOffset(x, y);
            contents.showText("( )");
            contents.endText();

            // the second text, shifted to the right by a multiple of the average glyph width
            int indent = 6;
            float overlapX = x + indent * baseFont.getAverageFontWidth() / 1000f * fontHeight;
            PDFont secondFont = new PDType1Font(Standard14Fonts.FontName.TIMES_ROMAN);
            contents.beginText();
            contents.setFont(secondFont, fontHeight * 2f);
            contents.newLineAtOffset(overlapX, y);
            contents.showText("overlap");
            contents.endText();
        }
        document.addPage(page);

        PDFTextStripper spaceStripper = new PDFTextStripper();
        spaceStripper.setLineSeparator("\n");
        spaceStripper.setPageEnd("\n");
        spaceStripper.setStartPage(1);
        spaceStripper.setEndPage(1);
        spaceStripper.setSortByPosition(true);
        spaceStripper.setIgnoreContentStreamSpaceGlyphs(true);

        String extracted = spaceStripper.getText(document);
        assertEquals("( overlap )\n", extracted);
    }
}
}