// PdfTextExtractorTest.java
/*
This file is part of the iText (R) project.
Copyright (c) 1998-2025 Apryse Group NV
Authors: Apryse Software.
This program is offered under a commercial and under the AGPL license.
For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
AGPL licensing:
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package com.itextpdf.kernel.pdf.canvas.parser;
import com.itextpdf.io.logs.IoLogMessageConstant;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfReader;
import com.itextpdf.test.ExtendedITextTest;
import com.itextpdf.test.annotations.LogMessage;
import com.itextpdf.test.annotations.LogMessages;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.Tag;
import java.io.IOException;
/**
 * Integration tests for {@code PdfTextExtractor.getTextFromPage}: every test extracts the text of
 * page 1 of a sample PDF located in {@code SOURCE_FOLDER} and checks the extracted string.
 */
@Tag("IntegrationTest")
public class PdfTextExtractorTest extends ExtendedITextTest {

    /** Folder that holds the sample PDF files read by the tests of this class. */
    private static final String SOURCE_FOLDER =
            "./src/test/resources/com/itextpdf/kernel/parser/PdfTextExtractorTest/";

    /**
     * Checks that extraction does not fail with an NPE and yields an empty string for
     * {@code noSpecifiedDictionaryInProperties.pdf}.
     *
     * @throws IOException if the sample PDF cannot be read
     */
    @Test
    @LogMessages(messages = @LogMessage(messageTemplate = IoLogMessageConstant.PDF_REFERS_TO_NOT_EXISTING_PROPERTY_DICTIONARY))
    public void noSpecifiedDictionaryInPropertiesTest() throws IOException {
        String text = extractFirstPageText("noSpecifiedDictionaryInProperties.pdf");
        // Here we check that no NPE was thrown. There is no text on the page so the extracted string should be empty.
        Assertions.assertEquals("", text);
    }

    /**
     * Checks that extraction does not fail with an NPE and yields an empty string for
     * {@code noPropertiesInResources.pdf}.
     *
     * @throws IOException if the sample PDF cannot be read
     */
    @Test
    @LogMessages(messages = @LogMessage(messageTemplate = IoLogMessageConstant.PDF_REFERS_TO_NOT_EXISTING_PROPERTY_DICTIONARY))
    public void noPropertiesInResourcesTest() throws IOException {
        String text = extractFirstPageText("noPropertiesInResources.pdf");
        // Here we check that no NPE was thrown. There is no text on the page so the extracted string should be empty.
        Assertions.assertEquals("", text);
    }

    /**
     * Expects {@code "*0*"} to be extracted from {@code type3NoCMap.pdf}.
     *
     * @throws IOException if the sample PDF cannot be read
     */
    @Test
    public void type3FontNoCMapTest() throws IOException {
        Assertions.assertEquals("*0*", extractFirstPageText("type3NoCMap.pdf"));
    }

    /**
     * Expects {@code "HELLO WORLD"} to be extracted from {@code noBaseEncoding.pdf}.
     *
     * @throws IOException if the sample PDF cannot be read
     */
    @Test
    public void noBaseEncodingTest() throws IOException {
        Assertions.assertEquals("HELLO WORLD", extractFirstPageText("noBaseEncoding.pdf"));
    }

    /**
     * Expects {@code "MyriadPro-Bold font."} to be extracted from
     * {@code simpleFontWithoutEncodingToUnicode.pdf}.
     *
     * @throws IOException if the sample PDF cannot be read
     */
    @Test
    public void simpleFontWithoutEncodingToUnicodeTest() throws IOException {
        Assertions.assertEquals("MyriadPro-Bold font.",
                extractFirstPageText("simpleFontWithoutEncodingToUnicode.pdf"));
    }

    /**
     * Expects {@code "Registered"} to be extracted from {@code simpleFontWithPartialToUnicode.pdf}.
     *
     * @throws IOException if the sample PDF cannot be read
     */
    @Test
    public void simpleFontWithPartialToUnicodeTest() throws IOException {
        Assertions.assertEquals("Registered", extractFirstPageText("simpleFontWithPartialToUnicode.pdf"));
    }

    /**
     * Compares the text extracted from {@code type0FontToUnicode.pdf} with the expected literal.
     *
     * @throws IOException if the sample PDF cannot be read
     */
    @Test
    public void type0FontToUnicodeTest() throws IOException {
        // NOTE(review): the expected literal starts with U+FFFD replacement characters, i.e. its
        // original non-ASCII prefix was lost to an encoding error. Restore it from version control,
        // preferably as a Unicode escape like the Devanagari tests below.
        Assertions.assertEquals("��� 390", extractFirstPageText("type0FontToUnicode.pdf"));
    }

    /**
     * Compares the text extracted from {@code diacriticShiftedLessThanTwo.pdf} with a single line of
     * Devanagari text.
     *
     * @throws IOException if the sample PDF cannot be read
     */
    @Test
    public void parseTextDiacriticShiftedLessThanTwo() throws IOException {
        // संस्कृत म्
        String expected = "\u0938\u0902\u0938\u094d\u0915\u0943\u0924 \u092e\u094d";
        Assertions.assertEquals(expected, extractFirstPageText("diacriticShiftedLessThanTwo.pdf"));
    }

    /**
     * Compares the text extracted from {@code diacriticShiftedMoreThanTwo.pdf} with two lines of
     * Devanagari text: the expected string has the diacritic on a line of its own, followed by a
     * line break and the remaining text.
     *
     * @throws IOException if the sample PDF cannot be read
     */
    @Test
    public void parseTextDiacriticShiftedMoreThanTwo() throws IOException {
        // ृ
        // संस्कृत म्
        String expected = "\u0943\n\u0938\u0902\u0938\u094d\u0915\u0943\u0924 \u092e\u094d";
        Assertions.assertEquals(expected, extractFirstPageText("diacriticShiftedMoreThanTwo.pdf"));
    }

    /**
     * Expects {@code "EC"} to be extracted from {@code shortOctalDataAsText.pdf}.
     *
     * @throws IOException if the sample PDF cannot be read
     */
    @Test
    public void shortOctalDataAsTextTest() throws IOException {
        Assertions.assertEquals("EC", extractFirstPageText("shortOctalDataAsText.pdf"));
    }

    /**
     * Checks that the text extracted from {@code notDefaultCodespacesCyrillic.pdf} contains both
     * expected fragments.
     *
     * @throws IOException if the sample PDF cannot be read
     */
    @Test
    public void notDefaultCodespacesCyrillicTest() throws IOException {
        String extractedText = extractFirstPageText("notDefaultCodespacesCyrillic.pdf");
        // NOTE(review): this literal consists solely of U+FFFD replacement characters; the original
        // text (presumably Cyrillic, judging by the test name) was lost to an encoding error and
        // has to be restored from version control.
        Assertions.assertTrue(extractedText.contains("������������������������"));
        Assertions.assertTrue(extractedText.contains("From"));
    }

    /**
     * Checks that the text extracted from {@code notDefaultCodespacesChinese.pdf} contains the
     * expected fragment.
     *
     * @throws IOException if the sample PDF cannot be read
     */
    @Test
    public void notDefaultCodespacesChineseTest() throws IOException {
        String extractedText = extractFirstPageText("notDefaultCodespacesChinese.pdf");
        // NOTE(review): the non-ASCII parts of this literal are U+FFFD replacement characters; the
        // original text (presumably Chinese, judging by the test name) was lost to an encoding
        // error and has to be restored from version control.
        Assertions.assertTrue(extractedText.contains("L3B ������ ��������������������������������������������������� 1 ���"));
    }

    /**
     * Checks that the text extracted from {@code SameCidForDifferentCodes.pdf} contains one fragment
     * and does not contain another one.
     *
     * @throws IOException if the sample PDF cannot be read
     */
    @Test
    public void mixedCharacterCodes() throws IOException {
        String extractedText = extractFirstPageText("SameCidForDifferentCodes.pdf");
        // NOTE(review): both literals below read identically because their non-ASCII characters
        // were replaced by U+FFFD; as written, the two assertions cannot hold at the same time.
        // The two originally distinct strings have to be restored from version control.
        Assertions.assertTrue(extractedText.contains("18������"));
        Assertions.assertFalse(extractedText.contains("18������"));
    }

    /**
     * Opens the given sample PDF from {@code SOURCE_FOLDER}, extracts the text of its first page and
     * closes the document again.
     *
     * @param fileName name of the PDF file, relative to {@code SOURCE_FOLDER}
     * @return the text extracted from page 1
     * @throws IOException if the file cannot be read
     */
    private static String extractFirstPageText(String fileName) throws IOException {
        try (PdfDocument pdfDocument = new PdfDocument(new PdfReader(SOURCE_FOLDER + fileName))) {
            return PdfTextExtractor.getTextFromPage(pdfDocument.getPage(1));
        }
    }
}