LocationExtractTest.java
/*
This file is part of the iText (R) project.
Copyright (c) 1998-2025 Apryse Group NV
Authors: Apryse Software.
This program is offered under a commercial and under the AGPL license.
For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
AGPL licensing:
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package com.itextpdf.kernel.pdf.canvas.parser;
import com.itextpdf.kernel.geom.Rectangle;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfPage;
import com.itextpdf.kernel.pdf.PdfReader;
import com.itextpdf.kernel.pdf.canvas.parser.listener.ILocationExtractionStrategy;
import com.itextpdf.kernel.pdf.canvas.parser.listener.IPdfTextLocation;
import com.itextpdf.kernel.pdf.canvas.parser.listener.RegexBasedLocationExtractionStrategy;
import com.itextpdf.test.ExtendedITextTest;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.Tag;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * This class tests the LocationExtractionStrategy framework.
 * It uses RegexBasedLocationExtractionStrategy to search for the word "Alice" on page 1 of
 * the book "Alice in Wonderland" by Lewis Carroll.
 */
@Tag("IntegrationTest")
public class LocationExtractTest extends ExtendedITextTest {

    private static final String SOURCE_FOLDER = "./src/test/resources/com/itextpdf/kernel/parser/LocationExtractionTest/";

    /**
     * Maximum absolute difference allowed between corresponding rectangle components
     * (x, y, width, height) for two rectangles to be considered the same.
     * <p>
     * A tolerance is used instead of comparing integer-truncated values because truncation
     * reports two almost identical floats as different when they fall on opposite sides of a
     * whole number (e.g. 83.99999 vs 84.0). For the non-negative coordinates used in this test,
     * every pair that has equal truncated values also differs by less than this tolerance.
     */
    private static final float TOLERANCE = 1.0f;

    /**
     * Extracts the locations of the text "Alice" on the first page of the sample document and
     * checks them against the expected rectangles.
     *
     * @throws IOException if the input file cannot be read
     */
    @Test
    public void testLocationExtraction() throws IOException {
        String inputFile = SOURCE_FOLDER + "aliceInWonderland.pdf";

        // calculate marked areas; try-with-resources closes the document even if extraction throws
        Collection<Rectangle> rectangleCollection;
        try (PdfDocument pdfDocument = new PdfDocument(new PdfReader(inputFile))) {
            PdfPage page = pdfDocument.getPage(1);
            rectangleCollection = processPage(new RegexBasedLocationExtractionStrategy("Alice"), page);
        }

        // compare rectangles
        Set<Rectangle> expectedRectangles = new HashSet<>();
        expectedRectangles.add(new Rectangle(174.67166f, 150.19658f, 29.191528f, 14.982529f));
        expectedRectangles.add(new Rectangle(200.95114f, 326.95657f, 29.297531f, 14.982544f));
        expectedRectangles.add(new Rectangle(250.17247f, 376.51657f, 29.191544f, 14.982544f));
        expectedRectangles.add(new Rectangle(434.33588f, 457.1566f, 29.191467f, 14.982544f));
        expectedRectangles.add(new Rectangle(374.3493f, 519.1966f, 29.191528f, 14.982483f));
        expectedRectangles.add(new Rectangle(510.3833f, 618.4366f, 29.380737f, 14.982483f));
        expectedRectangles.add(new Rectangle(84.0f, 649.3966f, 29.297523f, 14.982483f));

        // assertEquals (rather than assertTrue on ==) reports both sizes on failure
        Assertions.assertEquals(expectedRectangles.size(), rectangleCollection.size());
        Assertions.assertTrue(fuzzyContainsAll(rectangleCollection, expectedRectangles));
    }

    /**
     * Runs the given strategy over the content of a page and collects the rectangle of every
     * location the strategy reports.
     *
     * @param strategy the location extraction strategy to feed with the page content
     * @param page     the page whose content is processed
     * @return the rectangles of all resultant locations, in the order the strategy returns them
     */
    private static Collection<Rectangle> processPage(ILocationExtractionStrategy strategy, PdfPage page) {
        PdfCanvasProcessor parser = new PdfCanvasProcessor(strategy);
        parser.processPageContent(page);
        List<Rectangle> retval = new ArrayList<>();
        for (IPdfTextLocation l : strategy.getResultantLocations()) {
            retval.add(l.getRectangle());
        }
        return retval;
    }

    /**
     * Checks whether a collection contains a rectangle approximately equal to the given one.
     * Comparing floats for exact equality does not usually yield proper results, so each of
     * x, y, width and height is compared using {@link #TOLERANCE}.
     *
     * @param rs the rectangles to search
     * @param r  the rectangle to look for
     * @return true if some element of {@code rs} matches {@code r} within the tolerance
     */
    private static boolean fuzzyContains(Collection<Rectangle> rs, Rectangle r) {
        for (Rectangle r0 : rs) {
            if (fuzzyEquals(r0.getX(), r.getX())
                    && fuzzyEquals(r0.getY(), r.getY())
                    && fuzzyEquals(r0.getWidth(), r.getWidth())
                    && fuzzyEquals(r0.getHeight(), r.getHeight())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Compares two values using {@link #TOLERANCE}.
     *
     * @param a first value
     * @param b second value
     * @return true if the absolute difference of {@code a} and {@code b} is below the tolerance
     */
    private static boolean fuzzyEquals(float a, float b) {
        return Math.abs(a - b) < TOLERANCE;
    }

    /**
     * Tests whether a first collection contains (approximately) all elements of a second collection.
     * This method does not perform its job fast, but is only used for testing.
     *
     * @param rs0 the collection that is searched
     * @param rs1 the collection whose elements must all be found in {@code rs0}
     * @return true iff rs0 contains all elements of rs1
     */
    private static boolean fuzzyContainsAll(Collection<Rectangle> rs0, Collection<Rectangle> rs1) {
        for (Rectangle r1 : rs1) {
            if (!fuzzyContains(rs0, r1)) {
                return false;
            }
        }
        return true;
    }
}