// BidiTest.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.text;
import static org.junit.jupiter.api.Assertions.fail;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
/**
 * Regression test that extracts the text of {@code BidiSample.pdf} with
 * {@link PDFTextStripper} and compares it, line by line, with a stored
 * expected-output file. The extraction is checked once with sorting by
 * position enabled and once with it disabled.
 */
class BidiTest
{
    /**
     * Logger instance.
     */
    private static final Logger LOG = LogManager.getLogger(BidiTest.class);

    private static final File IN_DIR = new File("src/test/resources/org/apache/pdfbox/text/");
    private static final File OUT_DIR = new File("target/test-output");
    private static final String NAME_OF_PDF = "BidiSample.pdf";

    /** Document under test; loaded in {@link #setUp()} and closed in {@link #tearDown()}. */
    private PDDocument document;

    /** Stripper used for the extraction; recreated for every test. */
    private PDFTextStripper stripper;

    /**
     * Creates the output directory, loads the sample PDF and prepares a
     * stripper that uses {@code "\n"} as line separator.
     *
     * @throws IOException if the directory cannot be created or the PDF cannot be loaded
     */
    @BeforeEach
    public void setUp() throws IOException
    {
        Files.createDirectories(OUT_DIR.toPath());
        document = Loader.loadPDF(new File(IN_DIR, NAME_OF_PDF));
        stripper = new PDFTextStripper();
        stripper.setLineSeparator("\n");
    }

    /**
     * Extracts the sample with sorting by position and compares the result
     * with the stored {@code -sorted.txt} file.
     */
    @Test
    void testSorted() throws IOException
    {
        File testFile = new File(IN_DIR, NAME_OF_PDF);
        doTestFile(testFile, OUT_DIR, false, true);
    }

    /**
     * Extracts the sample without sorting by position and compares the result
     * with the stored {@code .txt} file.
     */
    @Test
    void testNotSorted() throws IOException
    {
        File testFile = new File(IN_DIR, NAME_OF_PDF);
        doTestFile(testFile, OUT_DIR, false, false);
    }

    /**
     * Closes the document opened in {@link #setUp()}.
     *
     * @throws IOException if closing the document fails
     */
    @AfterEach
    public void tearDown() throws IOException
    {
        // setUp() may have failed before the document was assigned; do not
        // hide that failure behind a NullPointerException.
        if (document != null)
        {
            document.close();
        }
    }

    /**
     * Validate text extraction on a single file.
     * <p>
     * The extracted text is written to {@code outDir} as
     * {@code <pdf name>.txt} (or {@code <pdf name>-sorted.txt} when sorting) and
     * then compared with the file of the same name located next to
     * {@code inFile}. Blank lines are ignored on both sides and the test fails
     * at the first pair of lines that {@link #stringsEqual(String, String)}
     * reports as different.
     *
     * @param inFile The PDF file to validate
     * @param outDir The directory to store the output in
     * @param bLogResult Whether to log the extracted text
     * @param bSort Whether or not the extracted text is sorted
     * @throws IOException when writing the output or reading either text file fails
     */
    private void doTestFile(File inFile, File outDir, boolean bLogResult, boolean bSort)
            throws IOException
    {
        LOG.info("Preparing to parse {} for {} test", inFile.getName(),
                bSort ? "sorted" : "standard");

        // Output and expected file share the same name; only the directory differs.
        String textFileName = inFile.getName() + (bSort ? "-sorted.txt" : ".txt");
        File outFile = new File(outDir, textFileName);
        File expectedFile = new File(inFile.getParentFile(), textFileName);

        try (OutputStream os = new FileOutputStream(outFile);
             Writer writer = new OutputStreamWriter(os, StandardCharsets.UTF_8))
        {
            //Allows for sorted tests
            stripper.setSortByPosition(bSort);
            stripper.writeText(document, writer);
            // leaving this block closes the written file before it is read again
        }

        if (bLogResult)
        {
            LOG.info("Text for {}:", inFile.getName());
            LOG.info(stripper.getText(document));
        }

        if (!expectedFile.exists())
        {
            // fail() always throws, so nothing below runs in this case
            fail("FAILURE: Input verification file: " + expectedFile.getAbsolutePath() +
                    " did not exist");
        }

        try (LineNumberReader expectedReader = openReader(expectedFile);
             LineNumberReader actualReader = openReader(outFile))
        {
            String expectedLine;
            String actualLine;
            do
            {
                expectedLine = readNextNonBlankLine(expectedReader);
                actualLine = readNextNonBlankLine(actualReader);
                if (!stringsEqual(expectedLine, actualLine))
                {
                    fail("FAILURE: Line mismatch for file " + inFile.getName() +
                            " (sort = "+bSort+")" +
                            " at expected line: " + expectedReader.getLineNumber() +
                            " at actual line: " + actualReader.getLineNumber() +
                            "\nexpected line was: \"" + expectedLine + "\"" +
                            "\nactual line was: \"" + actualLine + "\"" + "\n");
                }
            }
            // stop as soon as either file is exhausted
            while (expectedLine != null && actualLine != null);
        }
    }

    /**
     * Opens a text file as UTF-8 for line-by-line reading.
     *
     * @param file the file to read
     * @return a reader that tracks line numbers; the caller must close it
     * @throws IOException if the file cannot be opened
     */
    private static LineNumberReader openReader(File file) throws IOException
    {
        return new LineNumberReader(
                new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8));
    }

    /**
     * Reads lines until one is found that is not empty after trimming.
     *
     * @param reader the reader to advance
     * @return the next non-blank line, or <code>null</code> at end of input
     * @throws IOException if reading fails
     */
    private static String readNextNonBlankLine(LineNumberReader reader) throws IOException
    {
        String line = reader.readLine();
        while (line != null && line.trim().isEmpty())
        {
            line = reader.readLine();
        }
        return line;
    }

    /**
     * Determine whether two strings are equal, where two null strings are
     * considered equal.
     * <p>
     * Both strings are trimmed first. After every matching character pair,
     * {@link #skipWhitespace(char[], int)} is applied on both sides, so a run
     * of skippable characters (spaces and characters with a code above 256)
     * that starts at a matched position is not compared character by
     * character.
     *
     * @param expected Expected string
     * @param actual Actual String
     * @return <code>true</code> if the strings are both null, if exactly one
     * is null and the other is blank, or if their contents match under the
     * rules above, otherwise <code>false</code>.
     */
    private boolean stringsEqual(String expected, String actual)
    {
        if (expected == null && actual == null)
        {
            return true;
        }
        if (expected == null || actual == null)
        {
            // exactly one side is missing: acceptable only if the other one is blank
            String present = expected != null ? expected : actual;
            return present.trim().isEmpty();
        }

        char[] expectedArray = expected.trim().toCharArray();
        char[] actualArray = actual.trim().toCharArray();
        int expectedIndex = 0;
        int actualIndex = 0;
        while (expectedIndex < expectedArray.length && actualIndex < actualArray.length)
        {
            if (expectedArray[expectedIndex] != actualArray[actualIndex])
            {
                LOG.warn("Lines differ at index expected: {}-{} actual: {}-{}", expectedIndex,
                        (int) expectedArray[expectedIndex], actualIndex,
                        (int) actualArray[actualIndex]);
                return false;
            }
            // skipWhitespace() stops on the last skipped character, hence the + 1
            expectedIndex = skipWhitespace(expectedArray, expectedIndex) + 1;
            actualIndex = skipWhitespace(actualArray, actualIndex) + 1;
        }

        // every compared character matched; any leftover on one side is a mismatch
        boolean equals = true;
        if (expectedIndex != expectedArray.length)
        {
            equals = false;
            LOG.warn("Expected line is longer at: {}", expectedIndex);
        }
        if (actualIndex != actualArray.length)
        {
            equals = false;
            LOG.warn("Actual line is longer at: {}", actualIndex);
        }
        return equals;
    }

    /**
     * If the character at the current index is skippable then skip any
     * subsequent skippable characters.
     *
     * @param array the characters of the line
     * @param index a valid index into <code>array</code>
     * @return <code>index</code> unchanged if the character there is not
     * skippable, otherwise the index of the last character of the skippable
     * run (the caller advances by one afterwards).
     */
    private int skipWhitespace( char[] array, int index )
    {
        if (!isSkippable(array[index]))
        {
            return index;
        }
        int end = index;
        while (end < array.length && isSkippable(array[end]))
        {
            end++;
        }
        // step back onto the last skipped character because stringsEqual
        // will roll forward by one
        return end - 1;
    }

    /**
     * Tells whether a character is ignored by the line comparison: a space or
     * any character with a code above 256.
     * NOTE(review): the reason for the 256 threshold is not visible in this
     * file; presumably it tolerates differences in non-Latin characters - confirm.
     */
    private static boolean isSkippable(char c)
    {
        return c == ' ' || c > 256;
    }
}