RegexBasedLocationExtractionStrategyTest.java

/*
    This file is part of the iText (R) project.
    Copyright (c) 1998-2025 Apryse Group NV
    Authors: Apryse Software.

    This program is offered under a commercial and under the AGPL license.
    For commercial licensing, contact us at https://itextpdf.com/sales.  For AGPL licensing, see below.

    AGPL licensing:
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */
package com.itextpdf.kernel.pdf.canvas.parser.listener;

import com.itextpdf.kernel.geom.Rectangle;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfReader;
import com.itextpdf.kernel.pdf.canvas.parser.PdfCanvasProcessor;
import com.itextpdf.test.ExtendedITextTest;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.Tag;

@Tag("IntegrationTest")
public class RegexBasedLocationExtractionStrategyTest extends ExtendedITextTest {

    private static final String sourceFolder = "./src/test/resources/com/itextpdf/kernel/parser/RegexBasedLocationExtractionStrategyTest/";

    /**
     * Searches every page of {@code in00.pdf} for the pattern {@code cillum} and verifies that two
     * matches are reported. Both matches are expected to have the same bounding box; coordinates are
     * compared after truncation to {@code int}.
     *
     * @throws IOException if the source document cannot be read
     */
    @Test
    public void test00() throws IOException {
        // try-with-resources releases the document even if processing or an assertion fails
        try (PdfDocument document = new PdfDocument(new PdfReader(sourceFolder + "in00.pdf"))) {
            // get locations
            List<IPdfTextLocation> locationList = new ArrayList<>();
            for (int i = 1; i <= document.getNumberOfPages(); ++i) {
                // a fresh strategy is created for each page, so results are collected page by page
                RegexBasedLocationExtractionStrategy extractionStrategy =
                        new RegexBasedLocationExtractionStrategy("cillum");
                PdfCanvasProcessor processor = new PdfCanvasProcessor(extractionStrategy);
                processor.processPageContent(document.getPage(i));
                locationList.addAll(extractionStrategy.getResultantLocations());
            }

            // compare
            Assertions.assertEquals(2, locationList.size());

            IPdfTextLocation loc = locationList.get(0);

            Assertions.assertEquals("cillum", loc.getText());
            Assertions.assertEquals(64, (int) loc.getRectangle().getX());
            Assertions.assertEquals(732, (int) loc.getRectangle().getY());
            Assertions.assertEquals(30, (int) loc.getRectangle().getWidth());
            Assertions.assertEquals(11, (int) loc.getRectangle().getHeight());

            IPdfTextLocation loc2 = locationList.get(1);
            Assertions.assertEquals(64, (int) loc2.getRectangle().getX());
            Assertions.assertEquals(732, (int) loc2.getRectangle().getY());
            Assertions.assertEquals(30, (int) loc2.getRectangle().getWidth());
            Assertions.assertEquals(11, (int) loc2.getRectangle().getHeight());
        }
    }

    /**
     * Searches every page of {@code in01.pdf} for the literal placeholder <code>{{Signature}}</code>
     * (braces escaped in the regular expression) and verifies that exactly one match is reported
     * with the expected text and bounding box. Coordinates are compared after truncation to
     * {@code int}.
     *
     * @throws IOException if the source document cannot be read
     */
    @Test
    public void test02() throws IOException {
        // try-with-resources releases the document even if processing or an assertion fails
        try (PdfDocument document = new PdfDocument(new PdfReader(sourceFolder + "in01.pdf"))) {
            // get locations
            List<IPdfTextLocation> locationList = new ArrayList<>();
            for (int i = 1; i <= document.getNumberOfPages(); ++i) {
                // a fresh strategy is created for each page, so results are collected page by page
                RegexBasedLocationExtractionStrategy extractionStrategy =
                        new RegexBasedLocationExtractionStrategy("\\{\\{Signature\\}\\}");
                PdfCanvasProcessor processor = new PdfCanvasProcessor(extractionStrategy);
                processor.processPageContent(document.getPage(i));
                locationList.addAll(extractionStrategy.getResultantLocations());
            }

            // compare
            Assertions.assertEquals(1, locationList.size());

            IPdfTextLocation loc = locationList.get(0);

            Assertions.assertEquals("{{Signature}}", loc.getText());
            Assertions.assertEquals(23, (int) loc.getRectangle().getX());
            Assertions.assertEquals(375, (int) loc.getRectangle().getY());
            Assertions.assertEquals(55, (int) loc.getRectangle().getWidth());
            Assertions.assertEquals(11, (int) loc.getRectangle().getHeight());
        }
    }


    // https://jira.itextsupport.com/browse/DEVSIX-1940
    // The text is 'calligraphy', in which 'll' forms a ligature.

    /**
     * Searches every page of {@code ligature.pdf} for the pattern {@code ca} and verifies that
     * exactly one match is reported with the expected text and bounding box (tolerance 0.0001).
     *
     * @throws IOException if the source document cannot be read
     */
    @Test
    public void testLigatureBeforeLigature() throws IOException {
        // try-with-resources releases the document even if processing or an assertion fails
        try (PdfDocument pdfDocument = new PdfDocument(new PdfReader(sourceFolder + "ligature.pdf"))) {
            // get locations
            List<IPdfTextLocation> locationList = new ArrayList<>();
            for (int x = 1; x <= pdfDocument.getNumberOfPages(); x++) {
                // build strategy
                RegexBasedLocationExtractionStrategy extractionStrategy =
                        new RegexBasedLocationExtractionStrategy("ca");
                new PdfCanvasProcessor(extractionStrategy).processPageContent(pdfDocument.getPage(x));
                for (IPdfTextLocation location : extractionStrategy.getResultantLocations()) {
                    // null entries are skipped so they do not count as matches
                    if (location != null) {
                        locationList.add(location);
                    }
                }
            }

            // compare
            Assertions.assertEquals(1, locationList.size());

            IPdfTextLocation loc = locationList.get(0);

            Assertions.assertEquals("ca", loc.getText());
            Rectangle rect = loc.getRectangle();
            Assertions.assertEquals(36, rect.getX(), 0.0001);
            Assertions.assertEquals(655.4600, rect.getY(), 0.0001);
            Assertions.assertEquals(25.1000, rect.getWidth(), 0.0001);
            Assertions.assertEquals(20, rect.getHeight(), 0.0001);
        }
    }

    /**
     * Searches every page of {@code ligature.pdf} for the pattern {@code al} and verifies that
     * exactly one match is reported with the expected text and bounding box (tolerance 0.0001).
     *
     * @throws IOException if the source document cannot be read
     */
    @Test
    public void testLigatureCrossLigature() throws IOException {
        // try-with-resources releases the document even if processing or an assertion fails
        try (PdfDocument pdfDocument = new PdfDocument(new PdfReader(sourceFolder + "ligature.pdf"))) {
            // get locations
            List<IPdfTextLocation> locationList = new ArrayList<>();
            for (int x = 1; x <= pdfDocument.getNumberOfPages(); x++) {
                // build strategy
                RegexBasedLocationExtractionStrategy extractionStrategy =
                        new RegexBasedLocationExtractionStrategy("al");
                new PdfCanvasProcessor(extractionStrategy).processPageContent(pdfDocument.getPage(x));
                for (IPdfTextLocation location : extractionStrategy.getResultantLocations()) {
                    // null entries are skipped so they do not count as matches
                    if (location != null) {
                        locationList.add(location);
                    }
                }
            }

            // compare
            Assertions.assertEquals(1, locationList.size());

            IPdfTextLocation loc = locationList.get(0);

            Assertions.assertEquals("al", loc.getText());
            Rectangle rect = loc.getRectangle();
            Assertions.assertEquals(48.7600, rect.getX(), 0.0001);
            Assertions.assertEquals(655.4600, rect.getY(), 0.0001);
            Assertions.assertEquals(25.9799, rect.getWidth(), 0.0001);
            Assertions.assertEquals(20, rect.getHeight(), 0.0001);
        }
    }

    /**
     * Searches every page of {@code ligature.pdf} for the pattern {@code l} and verifies that
     * exactly one match is reported with the expected text and bounding box (tolerance 0.0001).
     *
     * @throws IOException if the source document cannot be read
     */
    @Test
    public void testLigatureInLigature() throws IOException {
        // try-with-resources releases the document even if processing or an assertion fails
        try (PdfDocument pdfDocument = new PdfDocument(new PdfReader(sourceFolder + "ligature.pdf"))) {
            // get locations
            List<IPdfTextLocation> locationList = new ArrayList<>();
            for (int x = 1; x <= pdfDocument.getNumberOfPages(); x++) {
                // build strategy
                RegexBasedLocationExtractionStrategy extractionStrategy =
                        new RegexBasedLocationExtractionStrategy("l");
                new PdfCanvasProcessor(extractionStrategy).processPageContent(pdfDocument.getPage(x));
                for (IPdfTextLocation location : extractionStrategy.getResultantLocations()) {
                    // null entries are skipped so they do not count as matches
                    if (location != null) {
                        locationList.add(location);
                    }
                }
            }

            // compare
            Assertions.assertEquals(1, locationList.size());

            IPdfTextLocation loc = locationList.get(0);

            Assertions.assertEquals("l", loc.getText());
            Rectangle rect = loc.getRectangle();
            Assertions.assertEquals(61.0999, rect.getX(), 0.0001);
            Assertions.assertEquals(655.4600, rect.getY(), 0.0001);
            Assertions.assertEquals(13.6399, rect.getWidth(), 0.0001);
            Assertions.assertEquals(20, rect.getHeight(), 0.0001);
        }
    }

    /**
     * Searches every page of {@code rotatedText.pdf} for the pattern {@code abc} and verifies that
     * two matches are reported whose rectangles equal the expected ones within the epsilon used by
     * {@code Rectangle.equalsWithEpsilon}.
     *
     * @throws IOException if the source document cannot be read
     */
    @Test
    public void testRotatedText() throws IOException {
        // try-with-resources releases the document even if processing or an assertion fails
        try (PdfDocument pdfDocument = new PdfDocument(new PdfReader(sourceFolder + "rotatedText.pdf"))) {
            // get locations
            List<IPdfTextLocation> locationList = new ArrayList<>();
            for (int x = 1; x <= pdfDocument.getNumberOfPages(); x++) {
                // build strategy
                RegexBasedLocationExtractionStrategy extractionStrategy =
                        new RegexBasedLocationExtractionStrategy("abc");
                new PdfCanvasProcessor(extractionStrategy).processPageContent(pdfDocument.getPage(x));
                for (IPdfTextLocation location : extractionStrategy.getResultantLocations()) {
                    // null entries are skipped so they do not count as matches
                    if (location != null) {
                        locationList.add(location);
                    }
                }
            }

            // compare
            Assertions.assertEquals(2, locationList.size());
            Assertions.assertTrue(locationList.get(0).getRectangle()
                    .equalsWithEpsilon(new Rectangle(188.512f, 450f, 14.800003f, 25.791992f)));
            Assertions.assertTrue(locationList.get(1).getRectangle()
                    .equalsWithEpsilon(new Rectangle(36f, 746.688f, 25.792f, 14.799988f)));
        }
    }

    /**
     * Runs the pattern {@code \sstart} against page 1 of {@code regexStartedWithWhiteSpaceTest.pdf}
     * and verifies that a single match with text {@code " start"} and the expected rectangle is
     * reported.
     *
     * @throws IOException if the source document cannot be read
     */
    @Test
    public void regexStartedWithWhiteSpaceTest() throws IOException {
        List<IPdfTextLocation> locations;
        // try-with-resources releases the document even if page processing throws;
        // results are copied out so they can be checked after the document is closed
        try (PdfDocument pdfDocument = new PdfDocument(
                new PdfReader(sourceFolder + "regexStartedWithWhiteSpaceTest.pdf"))) {
            RegexBasedLocationExtractionStrategy extractionStrategy =
                    new RegexBasedLocationExtractionStrategy("\\sstart");
            new PdfCanvasProcessor(extractionStrategy).processPageContent(pdfDocument.getPage(1));
            locations = new ArrayList<>(extractionStrategy.getResultantLocations());
        }

        Assertions.assertEquals(1, locations.size());
        Assertions.assertEquals(" start", locations.get(0).getText());
        Assertions.assertTrue(
                new Rectangle(92.3f, 743.3970f, 20.6159f, 13.2839f).equalsWithEpsilon(locations.get(0).getRectangle()));
    }

    /**
     * Runs the pattern {@code \nstart} against page 1 of {@code regexStartedWithNewLineTest.pdf}
     * and verifies that a single match with text {@code "\nstart"} and the expected rectangle is
     * reported.
     *
     * @throws IOException if the source document cannot be read
     */
    @Test
    public void regexStartedWithNewLineTest() throws IOException {
        List<IPdfTextLocation> locations;
        // try-with-resources releases the document even if page processing throws;
        // results are copied out so they can be checked after the document is closed
        try (PdfDocument pdfDocument = new PdfDocument(
                new PdfReader(sourceFolder + "regexStartedWithNewLineTest.pdf"))) {
            RegexBasedLocationExtractionStrategy extractionStrategy =
                    new RegexBasedLocationExtractionStrategy("\\nstart");
            new PdfCanvasProcessor(extractionStrategy).processPageContent(pdfDocument.getPage(1));
            locations = new ArrayList<>(extractionStrategy.getResultantLocations());
        }

        Assertions.assertEquals(1, locations.size());
        Assertions.assertEquals("\nstart", locations.get(0).getText());
        Assertions.assertTrue(
                new Rectangle(56.8f, 729.5970f, 20.6159f, 13.2839f).equalsWithEpsilon(locations.get(0).getRectangle()));
    }

    /**
     * Runs the pattern {@code \sstart\s} against page 1 of {@code regexWithWhiteSpacesTest.pdf}
     * and verifies that a single match with text {@code " start "} and the expected rectangle is
     * reported.
     *
     * @throws IOException if the source document cannot be read
     */
    @Test
    public void regexWithWhiteSpacesTest() throws IOException {
        List<IPdfTextLocation> locations;
        // try-with-resources releases the document even if page processing throws;
        // results are copied out so they can be checked after the document is closed
        try (PdfDocument pdfDocument = new PdfDocument(
                new PdfReader(sourceFolder + "regexWithWhiteSpacesTest.pdf"))) {
            RegexBasedLocationExtractionStrategy extractionStrategy =
                    new RegexBasedLocationExtractionStrategy("\\sstart\\s");
            new PdfCanvasProcessor(extractionStrategy).processPageContent(pdfDocument.getPage(1));
            locations = new ArrayList<>(extractionStrategy.getResultantLocations());
        }

        Assertions.assertEquals(1, locations.size());
        Assertions.assertEquals(" start ", locations.get(0).getText());
        Assertions.assertTrue(
                new Rectangle(92.3f, 743.3970f, 20.6159f, 13.2839f).equalsWithEpsilon(locations.get(0).getRectangle()));
    }

    /**
     * Runs the pattern {@code \nstart\n} against page 1 of {@code regexWithNewLinesTest.pdf}
     * and verifies that a single match with text {@code "\nstart\n"} and the expected rectangle is
     * reported.
     *
     * @throws IOException if the source document cannot be read
     */
    @Test
    public void regexWithNewLinesTest() throws IOException {
        List<IPdfTextLocation> locations;
        // try-with-resources releases the document even if page processing throws;
        // results are copied out so they can be checked after the document is closed
        try (PdfDocument pdfDocument = new PdfDocument(
                new PdfReader(sourceFolder + "regexWithNewLinesTest.pdf"))) {
            RegexBasedLocationExtractionStrategy extractionStrategy =
                    new RegexBasedLocationExtractionStrategy("\\nstart\\n");
            new PdfCanvasProcessor(extractionStrategy).processPageContent(pdfDocument.getPage(1));
            locations = new ArrayList<>(extractionStrategy.getResultantLocations());
        }

        Assertions.assertEquals(1, locations.size());
        Assertions.assertEquals("\nstart\n", locations.get(0).getText());
        Assertions.assertTrue(
                new Rectangle(56.8f, 729.5970f, 20.6159f, 13.2839f).equalsWithEpsilon(locations.get(0).getRectangle()));
    }


    /**
     * Runs the pattern {@code hello\nworld} against page 1 of
     * {@code regexWithNewLineBetweenWordsTest.pdf} and verifies that two locations are reported,
     * both carrying the full matched text {@code "hello\nworld"} and each with its own expected
     * rectangle.
     *
     * @throws IOException if the source document cannot be read
     */
    @Test
    public void regexWithNewLineBetweenWordsTest() throws IOException {
        List<IPdfTextLocation> locations;
        // try-with-resources releases the document even if page processing throws;
        // results are copied out so they can be checked after the document is closed
        try (PdfDocument pdfDocument = new PdfDocument(
                new PdfReader(sourceFolder + "regexWithNewLineBetweenWordsTest.pdf"))) {
            RegexBasedLocationExtractionStrategy extractionStrategy =
                    new RegexBasedLocationExtractionStrategy("hello\\nworld");
            new PdfCanvasProcessor(extractionStrategy).processPageContent(pdfDocument.getPage(1));
            locations = new ArrayList<>(extractionStrategy.getResultantLocations());
        }

        Assertions.assertEquals(2, locations.size());
        Assertions.assertEquals("hello\nworld", locations.get(0).getText());
        Assertions.assertEquals("hello\nworld", locations.get(1).getText());
        Assertions.assertTrue(
                new Rectangle(56.8f, 729.5970f, 27.8999f, 13.2839f).equalsWithEpsilon(locations.get(0).getRectangle()));
        Assertions.assertTrue(
                new Rectangle(56.8f, 743.3970f, 23.9039f, 13.2839f).equalsWithEpsilon(locations.get(1).getRectangle()));
    }


    /**
     * Runs the pattern {@code \n} against page 1 of {@code regexWithNewLinesTest.pdf} and verifies
     * that no locations are reported.
     *
     * @throws IOException if the source document cannot be read
     */
    @Test
    public void regexWithOnlyNewLine() throws IOException {
        List<IPdfTextLocation> locations;
        // try-with-resources releases the document even if page processing throws;
        // results are copied out so they can be checked after the document is closed
        try (PdfDocument pdfDocument = new PdfDocument(
                new PdfReader(sourceFolder + "regexWithNewLinesTest.pdf"))) {
            RegexBasedLocationExtractionStrategy extractionStrategy =
                    new RegexBasedLocationExtractionStrategy("\\n");
            new PdfCanvasProcessor(extractionStrategy).processPageContent(pdfDocument.getPage(1));
            locations = new ArrayList<>(extractionStrategy.getResultantLocations());
        }

        Assertions.assertEquals(0, locations.size());
    }

    /**
     * Runs a pattern consisting of a single space against page 1 of
     * {@code regexWithWhiteSpacesTest.pdf} and verifies that no locations are reported.
     *
     * @throws IOException if the source document cannot be read
     */
    @Test
    public void regexWithOnlyWhiteSpace() throws IOException {
        List<IPdfTextLocation> locations;
        // try-with-resources releases the document even if page processing throws;
        // results are copied out so they can be checked after the document is closed
        try (PdfDocument pdfDocument = new PdfDocument(
                new PdfReader(sourceFolder + "regexWithWhiteSpacesTest.pdf"))) {
            RegexBasedLocationExtractionStrategy extractionStrategy =
                    new RegexBasedLocationExtractionStrategy(" ");
            new PdfCanvasProcessor(extractionStrategy).processPageContent(pdfDocument.getPage(1));
            locations = new ArrayList<>(extractionStrategy.getResultantLocations());
        }

        Assertions.assertEquals(0, locations.size());
    }

    /**
     * Feeds pages 1 and 2 of {@code sortCompare.pdf} through one shared extraction strategy for the
     * pattern {@code a} and checks that 13 locations are reported in total.
     *
     * @throws IOException if the source document cannot be read
     */
    @Test
    public void sortCompareTest() throws IOException {
        final String inputPath = sourceFolder + "sortCompare.pdf";
        try (PdfDocument doc = new PdfDocument(new PdfReader(inputPath))) {
            RegexBasedLocationExtractionStrategy strategy = new RegexBasedLocationExtractionStrategy("a");
            PdfCanvasProcessor canvasProcessor = new PdfCanvasProcessor(strategy);
            // The same processor, and therefore the same strategy, receives both pages in order.
            for (int pageNumber = 1; pageNumber <= 2; pageNumber++) {
                canvasProcessor.processPageContent(doc.getPage(pageNumber));
            }
            List<IPdfTextLocation> found = new ArrayList<>(strategy.getResultantLocations());
            Assertions.assertEquals(13, found.size());
        }
    }
}