TaggedPdfReaderTool.java
/*
This file is part of the iText (R) project.
Copyright (c) 1998-2025 Apryse Group NV
Authors: Apryse Software.
This program is offered under a commercial and under the AGPL license.
For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
AGPL licensing:
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package com.itextpdf.kernel.utils;
import com.itextpdf.io.exceptions.IoExceptionMessageConstant;
import com.itextpdf.kernel.exceptions.PdfException;
import com.itextpdf.kernel.exceptions.KernelExceptionMessageConstant;
import com.itextpdf.kernel.pdf.PdfArray;
import com.itextpdf.kernel.pdf.PdfDictionary;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfName;
import com.itextpdf.kernel.pdf.PdfObject;
import com.itextpdf.kernel.pdf.PdfPage;
import com.itextpdf.kernel.pdf.PdfString;
import com.itextpdf.kernel.pdf.canvas.parser.EventType;
import com.itextpdf.kernel.pdf.canvas.parser.PdfCanvasProcessor;
import com.itextpdf.kernel.pdf.canvas.parser.data.IEventData;
import com.itextpdf.kernel.pdf.canvas.parser.data.TextRenderInfo;
import com.itextpdf.kernel.pdf.canvas.parser.listener.IEventListener;
import com.itextpdf.kernel.pdf.canvas.parser.listener.ITextExtractionStrategy;
import com.itextpdf.kernel.pdf.canvas.parser.listener.LocationTextExtractionStrategy;
import com.itextpdf.kernel.pdf.tagging.IStructureNode;
import com.itextpdf.kernel.pdf.tagging.PdfMcr;
import com.itextpdf.kernel.pdf.tagging.PdfObjRef;
import com.itextpdf.kernel.pdf.tagging.PdfStructElem;
import com.itextpdf.kernel.pdf.tagging.PdfStructTreeRoot;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* Converts a tagged PDF document into an XML file.
*/
public class TaggedPdfReaderTool {
protected PdfDocument document;
protected OutputStreamWriter out;
protected String rootTag;
// key - page dictionary; value - a mapping of mcids to text in them
protected Map<PdfDictionary, Map<Integer, String>> parsedTags = new HashMap<>();
private final Set<PdfObject> inspectedStructTreeElems = new HashSet<>();
/**
* Constructs a {@link TaggedPdfReaderTool} via a given {@link PdfDocument}.
*
* @param document the document to read tag structure from
*/
public TaggedPdfReaderTool(PdfDocument document) {
this.document = document;
}
/**
* Checks if a character value should be escaped/unescaped.
*
* @param c a character value
*
* @return true if it's OK to escape or unescape this value.
*/
public static boolean isValidCharacterValue(int c) {
return (c == 0x9 || c == 0xA || c == 0xD
|| c >= 0x20 && c <= 0xD7FF
|| c >= 0xE000 && c <= 0xFFFD
|| c >= 0x10000 && c <= 0x10FFFF);
}
/**
* Converts the current tag structure into an XML file with default encoding (UTF-8).
* @param os the output stream to save XML file to
* @throws java.io.IOException in case of any I/O error
*/
public void convertToXml(OutputStream os)
throws IOException {
convertToXml(os, "UTF-8");
}
/**
* Converts the current tag structure into an XML file with provided encoding.
* @param os the output stream to save XML file to
* @param charset the charset of the resultant XML file
* @throws java.io.IOException in case of any I/O error
*/
public void convertToXml(OutputStream os, String charset)
throws IOException {
out = new OutputStreamWriter(os, Charset.forName(charset));
if (rootTag != null) {
out.write("<" + rootTag + ">" + System.lineSeparator());
}
// get the StructTreeRoot from the document
PdfStructTreeRoot structTreeRoot = document.getStructTreeRoot();
if (structTreeRoot == null)
throw new PdfException(KernelExceptionMessageConstant.DOCUMENT_DOES_NOT_CONTAIN_STRUCT_TREE_ROOT);
// Inspect the child or children of the StructTreeRoot
inspectKids(structTreeRoot.getKids());
if (rootTag != null) {
out.write("</" + rootTag + ">");
}
out.flush();
out.close();
}
/**
* Sets the name of the root tag of the resultant XML file
* @param rootTagName the name of the root tag
* @return this object
*/
public TaggedPdfReaderTool setRootTag(String rootTagName) {
this.rootTag = rootTagName;
return this;
}
/**
* Inspect the children of the StructTreeRoot.
*
* @param kids list of the direct kids of the StructTreeRoot
*/
protected void inspectKids(List<IStructureNode> kids) {
if (kids == null)
return;
for (IStructureNode kid : kids) {
inspectKid(kid);
}
}
/**
* Inspect the child of the StructTreeRoot.
*
* @param kid the direct kid of the StructTreeRoot
*/
protected void inspectKid(IStructureNode kid) {
try {
if (kid instanceof PdfStructElem) {
PdfStructElem structElemKid = (PdfStructElem) kid;
if (inspectedStructTreeElems.contains(structElemKid.getPdfObject())) {
return;
}
inspectedStructTreeElems.add(structElemKid.getPdfObject());
PdfName s = structElemKid.getRole();
String tagN = s.getValue();
String tag = fixTagName(tagN);
out.write("<");
out.write(tag);
inspectAttributes(structElemKid);
out.write(">" + System.lineSeparator());
PdfString alt = (structElemKid).getAlt();
if (alt != null) {
out.write("<alt><![CDATA[");
out.write(alt.getValue().replaceAll("[\\000]*", ""));
out.write("]]></alt>" + System.lineSeparator());
}
inspectKids(structElemKid.getKids());
out.write("</");
out.write(tag);
out.write(">" + System.lineSeparator());
} else if (kid instanceof PdfMcr) {
parseTag((PdfMcr) kid);
} else {
out.write(" <flushedKid/> ");
}
} catch (java.io.IOException e) {
throw new com.itextpdf.io.exceptions.IOException(IoExceptionMessageConstant.UNKNOWN_IO_EXCEPTION, e);
}
}
/**
* Inspects attributes dictionary of the StructTreeRoot child.
*
* @param kid the direct kid of the StructTreeRoot
*/
protected void inspectAttributes(PdfStructElem kid) {
PdfObject attrObj = kid.getAttributes(false);
if (attrObj != null) {
PdfDictionary attrDict;
if (attrObj instanceof PdfArray) {
attrDict = ((PdfArray) attrObj).getAsDictionary(0);
} else {
attrDict = (PdfDictionary) attrObj;
}
try {
for (PdfName key : attrDict.keySet()) {
out.write(' ');
String attrName = key.getValue();
out.write(Character.toLowerCase(attrName.charAt(0)) + attrName.substring(1));
out.write("=\"");
out.write(attrDict.get(key, false).toString());
out.write("\"");
}
} catch (java.io.IOException e) {
throw new com.itextpdf.io.exceptions.IOException(IoExceptionMessageConstant.UNKNOWN_IO_EXCEPTION, e);
}
}
}
/**
* Parses tag of the Marked Content Reference (MCR) kid of the StructTreeRoot.
*
* @param kid the direct {@link PdfMcr} kid of the StructTreeRoot
*/
protected void parseTag(PdfMcr kid) {
int mcid = kid.getMcid();
PdfDictionary pageDic = kid.getPageObject();
String tagContent = "";
if (mcid != -1) {
if (!parsedTags.containsKey(pageDic)) {
MarkedContentEventListener listener = new MarkedContentEventListener();
PdfCanvasProcessor processor = new PdfCanvasProcessor(listener);
PdfPage page = document.getPage(pageDic);
processor.processContent(page.getContentBytes(), page.getResources());
parsedTags.put(pageDic, listener.getMcidContent());
}
if (parsedTags.get(pageDic).containsKey(mcid))
tagContent = parsedTags.get(pageDic).get(mcid);
} else {
PdfObjRef objRef = (PdfObjRef) kid;
PdfObject object = objRef.getReferencedObject();
if (object.isDictionary()) {
PdfName subtype = ((PdfDictionary) object).getAsName(PdfName.Subtype);
tagContent = subtype.toString();
}
}
try {
out.write(escapeXML(tagContent, true));
} catch (java.io.IOException e) {
throw new com.itextpdf.io.exceptions.IOException(IoExceptionMessageConstant.UNKNOWN_IO_EXCEPTION, e);
}
}
/**
* Fixes specified tag name to be valid XML tag.
*
* @param tag tag name to fix
*
* @return fixed tag name.
*/
protected static String fixTagName(String tag) {
StringBuilder sb = new StringBuilder();
for (int k = 0; k < tag.length(); ++k) {
char c = tag.charAt(k);
boolean nameStart =
c == ':'
|| (c >= 'A' && c <= 'Z')
|| c == '_'
|| (c >= 'a' && c <= 'z')
|| (c >= '\u00c0' && c <= '\u00d6')
|| (c >= '\u00d8' && c <= '\u00f6')
|| (c >= '\u00f8' && c <= '\u02ff')
|| (c >= '\u0370' && c <= '\u037d')
|| (c >= '\u037f' && c <= '\u1fff')
|| (c >= '\u200c' && c <= '\u200d')
|| (c >= '\u2070' && c <= '\u218f')
|| (c >= '\u2c00' && c <= '\u2fef')
|| (c >= '\u3001' && c <= '\ud7ff')
|| (c >= '\uf900' && c <= '\ufdcf')
|| (c >= '\ufdf0' && c <= '\ufffd');
boolean nameMiddle =
c == '-'
|| c == '.'
|| (c >= '0' && c <= '9')
|| c == '\u00b7'
|| (c >= '\u0300' && c <= '\u036f')
|| (c >= '\u203f' && c <= '\u2040')
|| nameStart;
if (k == 0) {
if (!nameStart)
c = '_';
} else {
if (!nameMiddle)
c = '-';
}
sb.append(c);
}
return sb.toString();
}
/**
* NOTE: copied from itext5 XMLUtils class
*
* Escapes a string with the appropriated XML codes.
*
* @param s the string to be escaped
* @param onlyASCII codes above 127 will always be escaped with &#nn; if <CODE>true</CODE>
* @return the escaped string
*/
protected static String escapeXML(String s, boolean onlyASCII) {
char[] cc = s.toCharArray();
int len = cc.length;
StringBuilder sb = new StringBuilder();
for (int k = 0; k < len; ++k) {
int c = cc[k];
switch (c) {
case '<':
sb.append("<");
break;
case '>':
sb.append(">");
break;
case '&':
sb.append("&");
break;
case '"':
sb.append(""");
break;
case '\'':
sb.append("'");
break;
default:
if (isValidCharacterValue(c)) {
if (onlyASCII && c > 127)
sb.append("&#").append(c).append(';');
else
sb.append((char) c);
}
}
}
return sb.toString();
}
private class MarkedContentEventListener implements IEventListener {
private Map<Integer, ITextExtractionStrategy> contentByMcid = new HashMap<>();
public Map<Integer, String> getMcidContent() {
Map<Integer, String> content = new HashMap<>();
for (int id : contentByMcid.keySet()) {
content.put(id, contentByMcid.get(id).getResultantText());
}
return content;
}
@Override
public void eventOccurred(IEventData data, EventType type) {
switch (type) {
case RENDER_TEXT:
TextRenderInfo textInfo = (TextRenderInfo) data;
int mcid = textInfo.getMcid();
if (mcid != -1) {
ITextExtractionStrategy textExtractionStrategy = contentByMcid.get(mcid);
if (textExtractionStrategy == null) {
textExtractionStrategy = new LocationTextExtractionStrategy();
contentByMcid.put(mcid, textExtractionStrategy);
}
textExtractionStrategy.eventOccurred(data, type);
}
break;
default:
break;
}
}
@Override
public Set<EventType> getSupportedEvents() {
return null;
}
}
}