TextRenderInfoTest.java
/*
This file is part of the iText (R) project.
Copyright (c) 1998-2025 Apryse Group NV
Authors: Apryse Software.
This program is offered under a commercial and under the AGPL license.
For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
AGPL licensing:
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package com.itextpdf.kernel.pdf.canvas.parser;
import com.itextpdf.kernel.geom.LineSegment;
import com.itextpdf.kernel.geom.Vector;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfReader;
import com.itextpdf.kernel.pdf.canvas.parser.data.IEventData;
import com.itextpdf.kernel.pdf.canvas.parser.data.TextRenderInfo;
import com.itextpdf.kernel.pdf.canvas.parser.listener.IEventListener;
import com.itextpdf.kernel.pdf.canvas.parser.listener.ITextExtractionStrategy;
import com.itextpdf.kernel.pdf.canvas.parser.listener.SimpleTextExtractionStrategy;
import com.itextpdf.test.ExtendedITextTest;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.Tag;
@Tag("IntegrationTest")
public class TextRenderInfoTest extends ExtendedITextTest {
private static final String SOURCE_FOLDER = "./src/test/resources/com/itextpdf/kernel/parser/TextRenderInfoTest/";
public static final int FIRST_PAGE = 1;
public static final int FIRST_ELEMENT_INDEX = 0;
@Test
public void testCharacterRenderInfos() throws Exception {
PdfCanvasProcessor parser = new PdfCanvasProcessor(new CharacterPositionEventListener());
parser.processPageContent(new PdfDocument(new PdfReader(SOURCE_FOLDER + "simple_text.pdf")).getPage(FIRST_PAGE));
}
/**
* Test introduced to exclude a bug related to a Unicode quirk for
* Japanese. TextRenderInfo threw an AIOOBE for some characters.
*/
@Test
public void testUnicodeEmptyString() throws Exception {
StringBuilder sb = new StringBuilder();
String inFile = "japanese_text.pdf";
PdfDocument pdfDocument = new PdfDocument(new PdfReader(SOURCE_FOLDER + inFile));
ITextExtractionStrategy start = new SimpleTextExtractionStrategy();
sb.append(PdfTextExtractor.getTextFromPage(pdfDocument.getPage(FIRST_PAGE), start));
String result = sb.substring(0, sb.toString().indexOf("\n"));
String origText =
"\u76f4\u8fd1\u306e\u0053\uff06\u0050\u0035\u0030\u0030"
+ "\u914d\u5f53\u8cb4\u65cf\u6307\u6570\u306e\u30d1\u30d5"
+ "\u30a9\u30fc\u30de\u30f3\u30b9\u306f\u0053\uff06\u0050"
+ "\u0035\u0030\u0030\u6307\u6570\u3092\u4e0a\u56de\u308b";
Assertions.assertEquals(origText, result);
}
@Test
public void testType3FontWidth() throws Exception {
String inFile = "type3font_text.pdf";
LineSegment origLineSegment = new LineSegment(new Vector(20.3246f, 769.4974f, 1.0f), new Vector(151.22923f, 769.4974f, 1.0f));
PdfDocument pdfDocument = new PdfDocument(new PdfReader(SOURCE_FOLDER + inFile));
TextPositionEventListener renderListener = new TextPositionEventListener();
PdfCanvasProcessor processor = new PdfCanvasProcessor(renderListener);
processor.processPageContent(pdfDocument.getPage(FIRST_PAGE));
Assertions.assertEquals(renderListener.getLineSegments().get(FIRST_ELEMENT_INDEX).getStartPoint().get(FIRST_ELEMENT_INDEX),
origLineSegment.getStartPoint().get(FIRST_ELEMENT_INDEX), 1 / 2f);
Assertions.assertEquals(renderListener.getLineSegments().get(FIRST_ELEMENT_INDEX).getEndPoint().get(FIRST_ELEMENT_INDEX),
origLineSegment.getEndPoint().get(FIRST_ELEMENT_INDEX), 1 / 2f);
}
@Test
public void testDoubleMappedCharacterExtraction() throws IOException {
String inFile = "double_cmap_mapping.pdf";
String expectedResult = "Regular hyphen [\u002D] and non-breaking hyphen [\u002D] (both CID 14)\n"
+ "Turtle kyuujitai [\u9f9c] and turtle radical [\u9f9c] (both CID 7472)";
PdfDocument pdfDocument = new PdfDocument(new PdfReader(SOURCE_FOLDER + inFile));
ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
String result = PdfTextExtractor.getTextFromPage(pdfDocument.getPage(FIRST_PAGE), strategy).trim();
Assertions.assertEquals(expectedResult, result);
}
@Test
public void testEmbeddedIdentityToUnicodeTest() throws IOException {
String inFile = "embedded_identity_to_unicode.pdf";
String expectedResult = "Regular hyphen [\u002d] and non-breaking hyphen [\u2011] (both CID 14)\n"
+ "Turtle kyuujitai [\u9f9c] and turtle radical [\u2fd4] (both CID 7472)";
PdfDocument pdfDocument = new PdfDocument(new PdfReader(SOURCE_FOLDER + inFile));
ITextExtractionStrategy start = new SimpleTextExtractionStrategy();
String result = PdfTextExtractor.getTextFromPage(pdfDocument.getPage(FIRST_PAGE), start).trim();
Assertions.assertEquals(expectedResult, result);
}
private static class TextPositionEventListener implements IEventListener {
List<LineSegment> lineSegments = new ArrayList<>();
@Override
public void eventOccurred(IEventData data, EventType type) {
if (type.equals(EventType.RENDER_TEXT)) {
lineSegments.add(((TextRenderInfo) data).getBaseline());
}
}
@Override
public Set<EventType> getSupportedEvents() {
return new LinkedHashSet<>(Collections.singletonList(EventType.RENDER_TEXT));
}
public List<LineSegment> getLineSegments() {
return lineSegments;
}
}
private static class CharacterPositionEventListener implements ITextExtractionStrategy {
@Override
public String getResultantText() {
return null;
}
@Override
public void eventOccurred(IEventData data, EventType type) {
if (type.equals(EventType.RENDER_TEXT)) {
TextRenderInfo renderInfo = (TextRenderInfo) data;
List<TextRenderInfo> subs = renderInfo.getCharacterRenderInfos();
TextRenderInfo previousCharInfo = subs.get(0);
for (int i = 1; i < subs.size(); i++) {
TextRenderInfo charInfo = subs.get(i);
Vector previousEndPoint = previousCharInfo.getBaseline().getEndPoint();
Vector currentStartPoint = charInfo.getBaseline().getStartPoint();
assertVectorsEqual(charInfo.getText(), previousEndPoint, currentStartPoint);
previousCharInfo = charInfo;
}
}
}
private void assertVectorsEqual(String message, Vector v1, Vector v2) {
Assertions.assertEquals(v1.get(0), v2.get(0), 1 / 72f, message);
Assertions.assertEquals(v1.get(1), v2.get(1), 1 / 72f, message);
}
@Override
public Set<EventType> getSupportedEvents() {
return new LinkedHashSet<>(Collections.singletonList(EventType.RENDER_TEXT));
}
}
}