TestExtractText.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.tools;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.nio.file.Path;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.InvalidPathException;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.io.TempDir;
import picocli.CommandLine;
/**
* Test suite for ExtractText.
*/
class TestExtractText
{
final PrintStream originalOut = System.out;
final ByteArrayOutputStream out = new ByteArrayOutputStream();
PrintStream printStream = null;
static final String TESTFILE1 = "src/test/resources/org/apache/pdfbox/testPDFPackage.pdf";
static final String TESTFILE2 = "src/test/resources/org/apache/pdfbox/hello3.pdf";
static final String TESTFILE3 = "src/test/resources/org/apache/pdfbox/AngledExample.pdf";
static String filename1 = null;
static String filename2 = null;
@BeforeAll
public static void setupFilenames()
{
// the filename representation is platform dependent
filename1 = Paths.get(TESTFILE1).toString();
filename2 = Paths.get(TESTFILE2).toString();
}
@BeforeEach
public void setUpStreams()
{
out.reset();
try
{
printStream = new PrintStream(out, true, "utf-8");
System.setOut(printStream);
}
catch (UnsupportedEncodingException e)
{
// shouldn't happen at all
e.printStackTrace();
}
}
@AfterEach
public void restoreStreams()
{
System.setOut(originalOut);
if (printStream != null)
{
printStream.close();
}
}
/**
* Run the text extraction test using a pdf with embedded pdfs.
*
* @throws Exception if something went wrong
*/
@Test
void testEmbeddedPDFs() throws Exception
{
ExtractText app = new ExtractText();
CommandLine cmd = new CommandLine(app);
int exitCode = cmd.execute("-i", TESTFILE1, "-console");
assertEquals(0, exitCode);
String result = out.toString("UTF-8");
assertTrue(result.contains("PDF1"));
assertTrue(result.contains("PDF2"));
assertFalse(result.contains("PDF file: " + filename1));
assertFalse(result.contains("Hello"));
assertFalse(result.contains("World."));
assertFalse(result.contains("PDF file: " + filename2));
}
/**
* Run the text extraction with -addFileName
*
* @throws Exception if something went wrong
*/
@Test
void testAddFileName() throws Exception
{
ExtractText app = new ExtractText();
CommandLine cmd = new CommandLine(app);
int exitCode = cmd.execute("-i", TESTFILE1, "-console", "-addFileName");
assertEquals(0, exitCode);
String result = out.toString("UTF-8");
assertTrue(result.contains("PDF1"));
assertTrue(result.contains("PDF2"));
assertTrue(result.contains("PDF file: " + filename1));
assertFalse(result.contains("Hello"));
assertFalse(result.contains("World."));
assertFalse(result.contains("PDF file: " + filename2));
}
/**
* Run the text extraction as a PDFBox repeatable subcommand
*
* @throws Exception if something went wrong
*/
@Test
void testPDFBoxRepeatableSubcommand() throws Exception
{
PDFBox.main(new String[] { "export:text", "-i", TESTFILE1, "-console", //
"export:text", "-i", TESTFILE2, "-console" });
String result = out.toString("UTF-8");
assertTrue(result.contains("PDF1"));
assertTrue(result.contains("PDF2"));
assertFalse(result.contains("PDF file: " + filename1));
assertTrue(result.contains("Hello"));
assertTrue(result.contains("World."));
assertFalse(result.contains("PDF file: " + filename2));
}
/**
* Run the text extraction as a PDFBox repeatable subcommand with -addFileName
*
* @throws Exception if something went wrong
*/
@Test
void testPDFBoxRepeatableSubcommandAddFileName() throws Exception
{
PDFBox.main(new String[] { "export:text", "-i", TESTFILE1, "-console", "-addFileName",
"export:text", "-i", TESTFILE2, "-console", "-addFileName" });
String result = out.toString("UTF-8");
assertTrue(result.contains("PDF1"));
assertTrue(result.contains("PDF2"));
assertTrue(result.contains("PDF file: " + filename1));
assertTrue(result.contains("Hello"));
assertTrue(result.contains("World."));
assertTrue(result.contains("PDF file: " + filename2));
}
/**
* Run the text extraction as a PDFBox repeatable subcommand with -addFileName, with -o <outfile> and without
* -append
*
* @throws Exception if something went wrong
*/
@Test
void testPDFBoxRepeatableSubcommandAddFileNameOutfile(@TempDir Path tempDir) throws Exception
{
Path path = null;
try
{
path = tempDir.resolve("outfile.txt");
Files.deleteIfExists(path);
}
catch (InvalidPathException ipe)
{
System.err.println(
"Error creating temporary test file in " + this.getClass().getSimpleName());
}
assertNotNull(path);
PDFBox.main(new String[] { "export:text", "-i", TESTFILE1, "-encoding", "UTF-8",
"-addFileName", "-o", path.toString(), //
"export:text", "-i", TESTFILE2, "-encoding", "UTF-8", //
"-addFileName", "-o", path.toString() });
String result = new String(Files.readAllBytes(path), "UTF-8");
assertFalse(result.contains("PDF1"));
assertFalse(result.contains("PDF2"));
assertFalse(result.contains("PDF file: " + filename1));
assertTrue(result.contains("Hello"));
assertTrue(result.contains("World."));
assertTrue(result.contains("PDF file: " + filename2));
}
/**
* Run the text extraction as a PDFBox repeatable subcommand with -addFileName, -o <outfile> and -append
*
* @throws Exception if something went wrong
*/
@Test
void testPDFBoxRepeatableSubcommandAddFileNameOutfileAppend(@TempDir Path tempDir)
throws Exception
{
Path path = null;
try
{
path = tempDir.resolve("outfile.txt");
Files.deleteIfExists(path);
}
catch (InvalidPathException ipe)
{
System.err.println(
"Error creating temporary test file in " + this.getClass().getSimpleName());
}
assertNotNull(path);
PDFBox.main(new String[] { "export:text", "-i", TESTFILE1, "-encoding", "UTF-8",
"-addFileName", "-o", path.toString(), //
"export:text", "-i", TESTFILE2, "-encoding", "UTF-8",
"-addFileName", "-o", path.toString(), "-append" });
String result = new String(Files.readAllBytes(path), "UTF-8");
assertTrue(result.contains("PDF1"));
assertTrue(result.contains("PDF2"));
assertTrue(result.contains("PDF file: " + filename1));
assertTrue(result.contains("Hello"));
assertTrue(result.contains("World."));
assertTrue(result.contains("PDF file: " + filename2));
}
/**
* Simple test to check that the rotationMagic feature works.
*
* @param tempDir
* @throws Exception
*/
@Test
void testRotationMagic(@TempDir Path tempDir) throws Exception
{
Path path = null;
try
{
path = tempDir.resolve("outfile.txt");
Files.deleteIfExists(path);
}
catch (InvalidPathException ipe)
{
System.err.println(
"Error creating temporary test file in " + this.getClass().getSimpleName());
}
assertNotNull(path);
PDFBox.main(new String[] { "export:text", "-rotationMagic", "-i", TESTFILE3,
"-o", path.toString() });
String result = new String(Files.readAllBytes(path), "UTF-8");
assertTrue(result.contains("Horizontal Text"));
assertTrue(result.contains("Vertical Text"));
}
}