XrefParser.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.pdfparser;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Optional;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSObjectKey;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.pdfparser.XrefTrailerResolver.XRefType;
/**
* Parser to be used to read the cross reference table of a pdf. It is either a simple table or a stream.
*/
public class XrefParser
{
private static final Logger LOG = LogManager.getLogger(XrefParser.class);
private static final int X = 'x';
private static final char[] XREF_TABLE = { 'x', 'r', 'e', 'f' };
private static final char[] STARTXREF = { 's', 't', 'a', 'r', 't', 'x', 'r', 'e', 'f' };
/**
* Collects all Xref/trailer objects and resolves them into single
* object using startxref reference.
*/
private XrefTrailerResolver xrefTrailerResolver = new XrefTrailerResolver();
private final COSParser parser;
private final RandomAccessRead source;
/**
* Default constructor.
*
* @param cosParser the parser to be used to read the pdf.
*
*/
public XrefParser(COSParser cosParser)
{
parser = cosParser;
source = parser.source;
}
/**
* Returns the resulting cross reference table.
*
* @return
*/
public Map<COSObjectKey, Long> getXrefTable()
{
return xrefTrailerResolver.getXrefTable();
}
/**
* Parses cross reference tables.
*
* @param document the corresponding COS document of the pdf.
* @param startXRefOffset start offset of the first table
*
* @return the trailer dictionary
*
* @throws IOException if something went wrong
*/
public COSDictionary parseXref(COSDocument document, long startXRefOffset) throws IOException
{
source.seek(startXRefOffset);
long startXrefOffset = Math.max(0, parseStartXref());
// check the startxref offset
long fixedOffset = checkXRefOffset(startXrefOffset);
if (fixedOffset > -1)
{
startXrefOffset = fixedOffset;
}
document.setStartXref(startXrefOffset);
long prev = startXrefOffset;
// ---- parse whole chain of xref tables/object streams using PREV reference
Set<Long> prevSet = new HashSet<>();
COSDictionary trailer = null;
while (prev > 0)
{
// save expected position for loop detection
prevSet.add(prev);
// seek to xref table
source.seek(prev);
// skip white spaces
parser.skipSpaces();
// save current position as well due to skipped spaces
prevSet.add(source.getPosition());
// -- parse xref
if (source.peek() == X)
{
// xref table and trailer
// use existing parser to parse xref table
if (!parseXrefTable(prev) || !parseTrailer())
{
throw new IOException("Expected trailer object at offset "
+ source.getPosition());
}
trailer = xrefTrailerResolver.getCurrentTrailer();
// check for a XRef stream, it may contain some object ids of compressed objects
if(trailer.containsKey(COSName.XREF_STM))
{
int streamOffset = trailer.getInt(COSName.XREF_STM);
// check the xref stream reference
fixedOffset = checkXRefOffset(streamOffset);
if (fixedOffset > -1 && fixedOffset != streamOffset)
{
LOG.warn("/XRefStm offset {} is incorrect, corrected to {}", streamOffset,
fixedOffset);
streamOffset = (int)fixedOffset;
trailer.setInt(COSName.XREF_STM, streamOffset);
}
if (streamOffset > 0)
{
source.seek(streamOffset);
parser.skipSpaces();
try
{
parseXrefObjStream(prev, false);
document.setHasHybridXRef();
}
catch (IOException ex)
{
LOG.error("Failed to parse /XRefStm at offset " + streamOffset, ex);
}
}
else
{
LOG.error("Skipped XRef stream due to a corrupt offset: {}", streamOffset);
}
}
prev = trailer.getLong(COSName.PREV);
}
else
{
// parse xref stream
prev = parseXrefObjStream(prev, true);
trailer = xrefTrailerResolver.getCurrentTrailer();
}
if (prev > 0)
{
// check the xref table reference
fixedOffset = checkXRefOffset(prev);
if (fixedOffset > -1 && fixedOffset != prev)
{
prev = fixedOffset;
trailer.setLong(COSName.PREV, prev);
}
}
if (prevSet.contains(prev))
{
throw new IOException("/Prev loop at offset " + prev);
}
}
// ---- build valid xrefs out of the xref chain
xrefTrailerResolver.setStartxref(startXrefOffset);
trailer = xrefTrailerResolver.getTrailer();
document.setTrailer(trailer);
document.setIsXRefStream(XRefType.STREAM == xrefTrailerResolver.getXrefType());
// check the offsets of all referenced objects
checkXrefOffsets();
// copy xref table
document.addXRefTable(xrefTrailerResolver.getXrefTable());
// remember the highest XRef object number to avoid it being reused in incremental saving
Optional<Long> maxValue = document.getXrefTable().keySet().stream() //
.map(COSObjectKey::getNumber) //
.reduce(Long::max);
document.setHighestXRefObjectNumber(maxValue.isPresent() ? maxValue.get() : 0);
return trailer;
}
/**
* This will parse the trailer from the stream and add it to the state.
*
* @return false on parsing error
* @throws IOException If an IO error occurs.
*/
private boolean parseTrailer() throws IOException
{
// parse the last trailer.
long trailerOffset = source.getPosition();
// PDFBOX-1739 skip extra xref entries in RegisSTAR documents
int nextCharacter = source.peek();
while (nextCharacter != 't' && BaseParser.isDigit(nextCharacter))
{
if (source.getPosition() == trailerOffset)
{
// warn only the first time
LOG.warn("Expected trailer object at offset {}, keep trying", trailerOffset);
}
parser.readLine();
nextCharacter = source.peek();
}
if (source.peek() != 't')
{
return false;
}
// read "trailer"
long currentOffset = source.getPosition();
String nextLine = parser.readLine();
if (!nextLine.trim().equals("trailer"))
{
// in some cases the EOL is missing and the trailer immediately
// continues with "<<" or with a blank character
// even if this does not comply with PDF reference we want to support as many PDFs as possible
// Acrobat reader can also deal with this.
if (nextLine.startsWith("trailer"))
{
// we can't just unread a portion of the read data as we don't know if the EOL consist of 1 or 2 bytes
int len = "trailer".length();
// jump back right after "trailer"
source.seek(currentOffset + len);
}
else
{
return false;
}
}
// in some cases the EOL is missing and the trailer continues with " <<"
// even if this does not comply with PDF reference we want to support as many PDFs as possible
// Acrobat reader can also deal with this.
parser.skipSpaces();
COSDictionary parsedTrailer = parser.parseCOSDictionary(true);
xrefTrailerResolver.setTrailer(parsedTrailer);
parser.skipSpaces();
return true;
}
/**
* Parses an xref object stream starting with indirect object id.
*
* @return value of PREV item in dictionary or <code>-1</code> if no such item exists
*/
private long parseXrefObjStream(long objByteOffset, boolean isStandalone) throws IOException
{
// ---- parse indirect object head
parser.readObjectNumber();
parser.readGenerationNumber();
parser.readObjectMarker();
COSDictionary dict = parser.parseCOSDictionary(false);
try (COSStream xrefStream = parser.parseCOSStream(dict))
{
// the cross reference stream of a hybrid xref table will be added to the existing one
// and we must not override the offset and the trailer
if ( isStandalone )
{
xrefTrailerResolver.nextXrefObj( objByteOffset, XRefType.STREAM );
xrefTrailerResolver.setTrailer(xrefStream);
}
new PDFXrefStreamParser(xrefStream).parse(xrefTrailerResolver);
}
return dict.getLong(COSName.PREV);
}
/**
* Check if the cross reference table/stream can be found at the current offset.
*
* @param startXRefOffset
* @return the revised offset
* @throws IOException
*/
private long checkXRefOffset(long startXRefOffset) throws IOException
{
source.seek(startXRefOffset);
parser.skipSpaces();
if (parser.isString(XREF_TABLE))
{
return startXRefOffset;
}
if (startXRefOffset > 0)
{
if (checkXRefStreamOffset(startXRefOffset))
{
return startXRefOffset;
}
else
{
return calculateXRefFixedOffset(startXRefOffset);
}
}
// can't find a valid offset
return -1;
}
/**
* Try to find a fixed offset for the given xref table/stream.
*
* @param objectOffset the given offset where to look at
* @return the fixed offset
*
* @throws IOException if something went wrong
*/
private long calculateXRefFixedOffset(long objectOffset) throws IOException
{
if (objectOffset < 0)
{
LOG.error("Invalid object offset {} when searching for a xref table/stream",
objectOffset);
return 0;
}
// search for the offset of the given xref table/stream among those found by a brute force search.
long newOffset = parser.getBruteForceParser().bfSearchForXRef(objectOffset);
if (newOffset > -1)
{
LOG.debug("Fixed reference for xref table/stream {} -> {}", objectOffset, newOffset);
return newOffset;
}
LOG.error("Can't find the object xref table/stream at offset {}", objectOffset);
return 0;
}
/**
* Check if the cross reference stream can be found at the current offset.
*
* @param startXRefOffset the expected start offset of the XRef stream
* @return the revised offset
* @throws IOException if something went wrong
*/
private boolean checkXRefStreamOffset(long startXRefOffset) throws IOException
{
if (startXRefOffset == 0)
{
return true;
}
// seek to offset-1
source.seek(startXRefOffset - 1);
int nextValue = source.read();
// the first character has to be a whitespace, and then a digit
if (BaseParser.isWhitespace(nextValue))
{
parser.skipSpaces();
if (parser.isDigit())
{
try
{
// it's a XRef stream
parser.readObjectNumber();
parser.readGenerationNumber();
parser.readObjectMarker();
// check the dictionary to avoid false positives
COSDictionary dict = parser.parseCOSDictionary(false);
source.seek(startXRefOffset);
if ("XRef".equals(dict.getNameAsString(COSName.TYPE)))
{
return true;
}
}
catch (IOException exception)
{
// there wasn't an object of a xref stream
LOG.debug("No Xref stream at given location {}", startXRefOffset, exception);
source.seek(startXRefOffset);
}
}
}
return false;
}
private boolean validateXrefOffsets(Map<COSObjectKey, Long> xrefOffset) throws IOException
{
if (xrefOffset == null)
{
return true;
}
Map<COSObjectKey, COSObjectKey> correctedKeys = new HashMap<>();
HashSet<COSObjectKey> validKeys = new HashSet<>();
for (Entry<COSObjectKey, Long> objectEntry : xrefOffset.entrySet())
{
COSObjectKey objectKey = objectEntry.getKey();
Long objectOffset = objectEntry.getValue();
// a negative offset number represents an object number itself
// see type 2 entry in xref stream
if (objectOffset != null && objectOffset >= 0)
{
COSObjectKey foundObjectKey = findObjectKey(objectKey, objectOffset, xrefOffset);
if (foundObjectKey == null)
{
LOG.debug(
"Stop checking xref offsets as at least one ({}) couldn't be dereferenced",
objectKey);
return false;
}
else if (foundObjectKey != objectKey)
{
// Generation was fixed - need to update map later, after iteration
correctedKeys.put(objectKey, foundObjectKey);
}
else
{
validKeys.add(objectKey);
}
}
}
Map<COSObjectKey, Long> correctedPointers = new HashMap<>();
for (Entry<COSObjectKey, COSObjectKey> correctedKeyEntry : correctedKeys.entrySet())
{
if (!validKeys.contains(correctedKeyEntry.getValue()))
{
// Only replace entries, if the original entry does not point to a valid object
correctedPointers.put(correctedKeyEntry.getValue(),
xrefOffset.get(correctedKeyEntry.getKey()));
}
}
// remove old invalid, as some might not be replaced
correctedKeys.forEach((key, value) -> xrefOffset.remove(key));
xrefOffset.putAll(correctedPointers);
return true;
}
/**
* Check the XRef table by dereferencing all objects and fixing the offset if necessary.
*
* @throws IOException if something went wrong.
*/
private void checkXrefOffsets() throws IOException
{
Map<COSObjectKey, Long> xrefOffset = xrefTrailerResolver.getXrefTable();
if (!validateXrefOffsets(xrefOffset))
{
Map<COSObjectKey, Long> bfCOSObjectKeyOffsets = parser.getBruteForceParser()
.getBFCOSObjectOffsets();
if (!bfCOSObjectKeyOffsets.isEmpty())
{
LOG.debug("Replaced read xref table with the results of a brute force search");
xrefOffset.clear();
xrefOffset.putAll(bfCOSObjectKeyOffsets);
}
}
}
/**
* Check if the given object can be found at the given offset. Returns the provided object key if everything is ok.
* If the generation number differs it will be fixed and a new object key is returned.
*
* @param objectKey the key of object we are looking for
* @param offset the offset where to look
* @param xrefOffset a map with with all known xref entries
* @return returns the found/fixed object key
*
* @throws IOException if something went wrong
*/
private COSObjectKey findObjectKey(COSObjectKey objectKey, long offset,
Map<COSObjectKey, Long> xrefOffset) throws IOException
{
// there can't be any object at the very beginning of a pdf
if (offset < COSParser.MINIMUM_SEARCH_OFFSET)
{
return null;
}
try
{
source.seek(offset);
parser.skipWhiteSpaces();
if (source.getPosition() == offset)
{
// ensure that at least one whitespace is skipped in front of the object number
source.seek(offset - 1);
if (source.getPosition() < offset)
{
if (!parser.isDigit())
{
// anything else but a digit may be some garbage of the previous object -> just ignore it
source.read();
}
else
{
long current = source.getPosition();
source.seek(--current);
while (parser.isDigit())
source.seek(--current);
long newObjNr = parser.readObjectNumber();
int newGenNr = parser.readGenerationNumber();
COSObjectKey newObjKey = new COSObjectKey(newObjNr, newGenNr);
Long existingOffset = xrefOffset.get(newObjKey);
// the found object number belongs to another uncompressed object at the same or nearby offset
// something has to be wrong
if (existingOffset != null && existingOffset > 0
&& Math.abs(offset - existingOffset) < 10)
{
LOG.debug("Found the object {} instead of {} at offset {} - ignoring",
newObjKey, objectKey, offset);
return null;
}
// something seems to be wrong but it's hard to determine what exactly -> simply continue
source.seek(offset);
}
}
}
// try to read the given object/generation number
long foundObjectNumber = parser.readObjectNumber();
if (objectKey.getNumber() != foundObjectNumber)
{
LOG.warn("found wrong object number. expected [{}] found [{}]",
objectKey.getNumber(), foundObjectNumber);
objectKey = new COSObjectKey(foundObjectNumber, objectKey.getGeneration());
}
int genNumber = parser.readGenerationNumber();
// finally try to read the object marker
parser.readObjectMarker();
if (genNumber == objectKey.getGeneration())
{
return objectKey;
}
else if (genNumber > objectKey.getGeneration())
{
return new COSObjectKey(objectKey.getNumber(), genNumber);
}
}
catch (IOException exception)
{
// Swallow the exception, obviously there isn't any valid object number
LOG.debug("No valid object at given location {} - ignoring", offset, exception);
}
return null;
}
/**
* This will parse the startxref section from the stream. The startxref value is ignored.
*
* @return the startxref value or -1 on parsing error
* @throws IOException If an IO error occurs.
*/
private long parseStartXref() throws IOException
{
long startXref = -1;
if (parser.isString(STARTXREF))
{
parser.readString();
parser.skipSpaces();
// This integer is the byte offset of the first object referenced by the xref or xref stream
startXref = parser.readLong();
}
return startXref;
}
/**
* This will parse the xref table from the stream and add it to the state
* The XrefTable contents are ignored.
* @param startByteOffset the offset to start at
* @return false on parsing error
* @throws IOException If an IO error occurs.
*/
private boolean parseXrefTable(long startByteOffset) throws IOException
{
if (source.peek() != 'x')
{
return false;
}
String xref = parser.readString();
if( !xref.trim().equals( "xref" ) )
{
return false;
}
// check for trailer after xref
String str = parser.readString();
byte[] b = str.getBytes(StandardCharsets.ISO_8859_1);
source.seek(source.getPosition() - b.length);
// signal start of new XRef
xrefTrailerResolver.nextXrefObj( startByteOffset, XRefType.TABLE );
if (str.startsWith("trailer"))
{
LOG.warn("skipping empty xref table");
return false;
}
// Xref tables can have multiple sections. Each starts with a starting object id and a count.
while(true)
{
String currentLine = parser.readLine();
String[] splitString = currentLine.split("\\s");
if (splitString.length != 2)
{
LOG.warn("Unexpected XRefTable Entry: {}", currentLine);
return false;
}
// first obj id
long currObjID;
try
{
currObjID = Long.parseLong(splitString[0]);
}
catch (NumberFormatException exception)
{
LOG.warn("XRefTable: invalid ID for the first object: {}", currentLine);
return false;
}
// the number of objects in the xref table
int count = 0;
try
{
count = Integer.parseInt(splitString[1]);
}
catch (NumberFormatException exception)
{
LOG.warn("XRefTable: invalid number of objects: {}", currentLine);
return false;
}
parser.skipSpaces();
for(int i = 0; i < count; i++)
{
if (parser.isEOF() )
{
break;
}
int nextChar = source.peek();
if (nextChar == 't' || BaseParser.isEndOfName(nextChar))
{
break;
}
//Ignore table contents
currentLine = parser.readLine();
splitString = currentLine.split("\\s");
if (splitString.length < 3)
{
LOG.warn("invalid xref line: {}", currentLine);
break;
}
/* This supports the corrupt table as reported in
* PDFBOX-474 (XXXX XXX XX n) */
if(splitString[splitString.length-1].equals("n"))
{
try
{
long currOffset = Long.parseLong(splitString[0]);
// skip 0 offsets
if (currOffset > 0)
{
int currGenID = Integer.parseInt(splitString[1]);
COSObjectKey objKey = new COSObjectKey(currObjID, currGenID);
xrefTrailerResolver.setXRef(objKey, currOffset);
}
}
catch (IllegalArgumentException e)
{
throw new IOException(e);
}
}
else if(!splitString[2].equals("f"))
{
throw new IOException("Corrupt XRefTable Entry - ObjID:" + currObjID);
}
currObjID++;
parser.skipSpaces();
}
parser.skipSpaces();
if (!parser.isDigit())
{
break;
}
}
return true;
}
}