RegexBasedLocationExtractionStrategy.java
/*
This file is part of the iText (R) project.
Copyright (c) 1998-2025 Apryse Group NV
Authors: Apryse Software.
This program is offered under a commercial and under the AGPL license.
For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
AGPL licensing:
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package com.itextpdf.kernel.pdf.canvas.parser.listener;
import com.itextpdf.kernel.colors.Color;
import com.itextpdf.kernel.geom.Rectangle;
import com.itextpdf.kernel.pdf.canvas.parser.EventType;
import com.itextpdf.kernel.pdf.canvas.parser.data.IEventData;
import com.itextpdf.kernel.pdf.canvas.parser.data.TextRenderInfo;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * This class is designed to search for the occurrences of a regular expression and return the resultant rectangles.
 * Do note that this class holds all text locations it has received and can't be used for processing multiple pages.
 * If you want to extract text from several pages of a PDF document you have to create a new instance
 * of {@link RegexBasedLocationExtractionStrategy} for each page.
 * <p>
 * Here is an example of usage with a new instance per each page:
 * <pre>{@code
 * PdfDocument document = new PdfDocument(new PdfReader("..."));
 * for (int i = 1; i <= document.getNumberOfPages(); ++i) {
 *     RegexBasedLocationExtractionStrategy extractionStrategy = new RegexBasedLocationExtractionStrategy("");
 *     PdfCanvasProcessor processor = new PdfCanvasProcessor(extractionStrategy);
 *     processor.processPageContent(document.getPage(i));
 *     for (IPdfTextLocation location : extractionStrategy.getResultantLocations()) {
 *         //process locations ...
 *     }
 * }
 * }</pre>
 */
public class RegexBasedLocationExtractionStrategy implements ILocationExtractionStrategy {
    /** Tolerance used when comparing rectangle coordinates while ordering the resultant locations. */
    private static final float EPS = 1.0E-4F;

    /** Regular expression that is matched against the text assembled from the collected characters. */
    private final Pattern pattern;

    /** Per-character render information accumulated from every text render event received so far. */
    private final List<CharacterRenderInfo> parseResult = new ArrayList<>();

    /**
     * Creates a new {@link RegexBasedLocationExtractionStrategy} for the given regular expression.
     *
     * @param regex the regular expression to compile and search for
     */
    public RegexBasedLocationExtractionStrategy(String regex) {
        this.pattern = Pattern.compile(regex);
    }

    /**
     * Creates a new {@link RegexBasedLocationExtractionStrategy} for an already compiled pattern.
     *
     * @param pattern the {@link Pattern} to search for
     */
    public RegexBasedLocationExtractionStrategy(Pattern pattern) {
        this.pattern = pattern;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public Collection<IPdfTextLocation> getResultantLocations() {
        // align characters in "logical" order
        Collections.sort(parseResult, new TextChunkLocationBasedComparator(new DefaultTextChunkLocationComparator()));

        // process parse results
        List<IPdfTextLocation> retval = new ArrayList<>();

        CharacterRenderInfo.StringConversionInfo txt = CharacterRenderInfo.mapString(parseResult);

        Matcher mat = pattern.matcher(txt.text);
        while (mat.find()) {
            // Translate the match boundaries (positions in txt.text) into indices of parseResult.
            Integer startIndex = getStartIndex(txt.indexMap, mat.start(), txt.text);
            Integer endIndex = getEndIndex(txt.indexMap, mat.end() - 1);
            // Matches that cannot be mapped to at least one collected character are skipped
            // (this includes zero-length matches, for which startIndex ends up greater than endIndex).
            if (startIndex != null && endIndex != null && startIndex <= endIndex) {
                List<CharacterRenderInfo> matchedCharacters =
                        parseResult.subList(startIndex.intValue(), endIndex.intValue() + 1);
                for (Rectangle r : toRectangles(matchedCharacters)) {
                    retval.add(new DefaultPdfTextLocation(r, mat.group(0)));
                }
            }
        }

        /* sort
         * even though the return type is Collection<IPdfTextLocation>, we apply a sorting algorithm here
         * This is to ensure that tests that use this functionality (for instance to generate pdf with
         * areas of interest highlighted) will not break when compared.
         */
        Collections.sort(retval, new PdfTextLocationComparator());

        // ligatures can produce the same rectangle more than once
        removeDuplicates(retval);

        return retval;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public void eventOccurred(IEventData data, EventType type) {
        // Only text render data can be converted; anything else (including null) is ignored
        // instead of failing with a ClassCastException.
        if (data instanceof TextRenderInfo) {
            parseResult.addAll(toCRI((TextRenderInfo) data));
        }
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public Set<EventType> getSupportedEvents() {
        return Collections.singleton(EventType.RENDER_TEXT);
    }

    /**
     * Convert {@link TextRenderInfo} to {@link CharacterRenderInfo}
     * This method is protected and not final so that custom implementations can choose to override it.
     * Other implementations of {@code CharacterRenderInfo} may choose to store different properties than
     * merely the {@link Rectangle} describing the bounding box. E.g. a custom implementation might choose to
     * store {@link Color} information as well, to better match the content surrounding the redaction {@link Rectangle}.
     *
     * @param tri {@link TextRenderInfo} object
     *
     * @return a list of {@link CharacterRenderInfo}s which represents the passed {@link TextRenderInfo}
     */
    protected List<CharacterRenderInfo> toCRI(TextRenderInfo tri) {
        List<CharacterRenderInfo> cris = new ArrayList<>();
        for (TextRenderInfo subTri : tri.getCharacterRenderInfos()) {
            cris.add(new CharacterRenderInfo(subTri));
        }
        return cris;
    }

    /**
     * Converts {@link CharacterRenderInfo} objects to {@link Rectangle}s
     * This method is protected and not final so that custom implementations can choose to override it.
     * E.g. other implementations may choose to add padding/margin to the Rectangles.
     * This method also offers a convenient access point to the mapping of {@link CharacterRenderInfo} to {@link Rectangle}.
     * This mapping enables (custom implementations) to match color of text in redacted Rectangles,
     * or match color of background, by the mere virtue of offering access to the {@link CharacterRenderInfo} objects
     * that generated the {@link Rectangle}.
     *
     * @param cris list of {@link CharacterRenderInfo} objects
     *
     * @return a list of {@link Rectangle}s, one for each run of consecutive characters reported by
     * {@link CharacterRenderInfo#sameLine} to be on the same line as the first character of the run;
     * the list is empty if {@code cris} is empty
     */
    protected List<Rectangle> toRectangles(List<CharacterRenderInfo> cris) {
        List<Rectangle> retval = new ArrayList<>();
        if (cris.isEmpty()) {
            return retval;
        }
        int prev = 0;
        while (prev < cris.size()) {
            // The character at 'prev' always belongs to its own run, so scanning starts at the next one.
            // This guarantees progress even if sameLine() were to return false for a character compared
            // with itself, which would otherwise make this loop run forever.
            int curr = prev + 1;
            while (curr < cris.size() && cris.get(curr).sameLine(cris.get(prev))) {
                curr++;
            }
            Rectangle resultRectangle = null;
            for (CharacterRenderInfo cri : cris.subList(prev, curr)) {
                // in case letters are rotated (imagine text being written with an angle of 90 degrees)
                resultRectangle = Rectangle.getCommonRectangle(resultRectangle, cri.getBoundingBox());
            }
            retval.add(resultRectangle);
            prev = curr;
        }

        return retval;
    }

    /**
     * Removes every location whose rectangle equals (within epsilon) the rectangle of the element that
     * directly follows it in the given list.
     *
     * @param sortedList list of locations, expected to be sorted so that duplicates are adjacent;
     *                   it is modified in place
     */
    private void removeDuplicates(List<IPdfTextLocation> sortedList) {
        IPdfTextLocation lastItem = null;
        // Iterate backwards so that removing the current element does not shift the indices still to be visited.
        for (int i = sortedList.size() - 1; i >= 0; i--) {
            IPdfTextLocation currItem = sortedList.get(i);
            Rectangle currRect = currItem.getRectangle();
            if (lastItem != null && currRect.equalsWithEpsilon(lastItem.getRectangle())) {
                // Remove by position: remove(Object) would search the list using equals()
                // and drop the first equal element rather than the one at this index.
                sortedList.remove(i);
            }
            lastItem = currItem;
        }
    }

    /**
     * Finds the mapped value for the first position at or after {@code index} that is present in the map.
     *
     * @param indexMap map whose keys are positions in {@code txt}
     * @param index    position in {@code txt} to start searching from (moving forward)
     * @param txt      the text the positions refer to; only its length is used, as the upper search bound
     *
     * @return the value mapped to the first key found, or {@code null} if the search reached the end of
     * {@code txt} without finding a mapped position
     */
    private static Integer getStartIndex(Map<Integer, Integer> indexMap, int index,
            String txt) {
        while (!indexMap.containsKey(index) && index < txt.length()) {
            index++;
        }
        return indexMap.get(index);
    }

    /**
     * Finds the mapped value for the first position at or before {@code index} that is present in the map.
     *
     * @param indexMap map whose keys are text positions
     * @param index    position to start searching from (moving backward)
     *
     * @return the value mapped to the first key found, or {@code null} if the search went below zero
     * without finding a mapped position
     */
    private static Integer getEndIndex(Map<Integer, Integer> indexMap, int index) {
        while (!indexMap.containsKey(index) && index >= 0) {
            index--;
        }
        return indexMap.get(index);
    }

    /**
     * Orders locations by the Y coordinate of their rectangle and then by the X coordinate, both ascending.
     * Coordinates that differ by less than {@link #EPS} are treated as equal.
     */
    private static final class PdfTextLocationComparator implements Comparator<IPdfTextLocation> {
        @Override
        public int compare(IPdfTextLocation l1, IPdfTextLocation l2) {
            Rectangle o1 = l1.getRectangle();
            Rectangle o2 = l2.getRectangle();
            // Once two coordinates are known to differ by at least EPS, they are ordered by direct comparison.
            // Testing the signed difference against EPS again would return 1 for both argument orders
            // when the difference is exactly EPS, violating the Comparator contract.
            if (Math.abs(o1.getY() - o2.getY()) < EPS) {
                if (Math.abs(o1.getX() - o2.getX()) < EPS) {
                    return 0;
                }
                return o1.getX() < o2.getX() ? -1 : 1;
            } else {
                return o1.getY() < o2.getY() ? -1 : 1;
            }
        }
    }
}