StartXRefScanner.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.pdf.updates;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.io.RandomAccessRead;
/**
* This is a first draft of a scanner to extract incremental updates
* out of PDFs. It effectively scans the bytestream looking
* for startxref\\s*(\\d+)\\s*(%%EOF\n?)? It does not validate that the
* startxrefs point to actual xrefs.
* <p>
* If the number component ends at the literal end of the file
* (e.g. the file is truncated or malformed), the startxref
* will not be reported.
* <p>
* There may be false positives, especially in adversarial settings.
* For example, there may be a startxref string in a comment
* or inside a stream or object.
*
* <p>
* The good parts come directly from PDFBox.
*/
public class StartXRefScanner {
static final int MAX_LENGTH_LONG = Long.toString(Long.MAX_VALUE).length();
private static final char[] STARTXREF = new char[]{'s', 't', 'a', 'r', 't', 'x', 'r', 'e', 'f'};
private static final char[] EOF_MARKER = new char[]{'%', '%', 'E', 'O', 'F'};
/**
* ASCII code for line feed.
*/
private static final byte ASCII_LF = 10;
/**
* ASCII code for carriage return.
*/
private static final byte ASCII_CR = 13;
private static final byte ASCII_SPACE = 32;
private final RandomAccessRead source;
public StartXRefScanner(RandomAccessRead source) {
this.source = source;
}
public List<StartXRefOffset> scan() throws IOException {
List<StartXRefOffset> offsets = new ArrayList<>();
try {
int b = source.read();
while (b > -1) {
if (b == STARTXREF[0]) {
tryStartXRef(offsets);
}
b = source.read();
}
} finally {
if (source.getPosition() >= Integer.MAX_VALUE) {
throw new IOException("read more than " + Integer.MAX_VALUE + " bytes");
}
//TODO: if we're opening a new file for the source
//we shouldn't bother with this.
source.rewind((int) source.getPosition());
}
return offsets;
}
private void tryStartXRef(List<StartXRefOffset> offsets) throws IOException {
int match = 1;
int read = 0;
int b = source.read();
while (b > -1) {
if (b == STARTXREF[match]) {
++match;
if (match == STARTXREF.length) {
try {
long startXREFOffset = source.getPosition() - STARTXREF.length;
long startxref = readLong();
boolean hasEof = readEOF();
long endOfEOFOffset = source.getPosition();
offsets.add(new StartXRefOffset(startxref, startXREFOffset, endOfEOFOffset,
hasEof));
return;
} catch (IOException e) {
//swallow
return;
}
}
} else {
source.rewind(1);
return;
}
b = source.read();
}
}
private boolean readEOF() throws IOException {
//this expects %%EOF, with possibly some white space before it
//it will fail if there's a comment before %%EOF
//TODO -- make this more robust
skipWhiteSpaces();
int c = source.read();
int i = 0;
while (c > -1 && c == EOF_MARKER[i] && ++i < EOF_MARKER.length) {
c = source.read();
}
if (i == EOF_MARKER.length) {
//now look for a single new line following the eof
c = source.read();
if (c == -1) {
//do nothing
} else if (isEOL(c)) {
//do nothing
} else {
source.rewind(1);
}
return true;
}
//did not match, we need to rewind some
//read = i+1
i++;
if (c == -1) {
source.rewind(i - 1);
} else {
source.rewind(i);
}
return false;
}
protected void skipWhiteSpaces() throws IOException {
int whitespace = source.read();
while (whitespace > -1 && isWhitespace(whitespace)) {
whitespace = source.read();
}
if (whitespace > -1) {
source.rewind(1);
}
}
protected boolean isWhitespace(int c) {
return c == 0 || c == 9 || c == 12 || c == ASCII_LF || c == ASCII_CR || c == ASCII_SPACE;
}
protected long readLong() throws IOException {
skipSpaces();
long retval = 0;
StringBuilder longBuffer = readStringNumber();
try {
retval = Long.parseLong(longBuffer.toString());
} catch (NumberFormatException e) {
source.rewind(longBuffer.toString().getBytes(StandardCharsets.ISO_8859_1).length);
throw new IOException("Error: Expected a long type at offset " + source.getPosition() +
", instead got '" + longBuffer + "'", e);
}
return retval;
}
/**
* This will skip all spaces and comments that are present.
*
* @throws IOException If there is an error reading from the stream.
*/
protected void skipSpaces() throws IOException {
int c = source.read();
// 37 is the % character, a comment
while (isWhitespace(c) || c == 37) {
if (c == 37) {
// skip past the comment section
c = source.read();
while (!isEOL(c) && c != -1) {
c = source.read();
}
} else {
c = source.read();
}
}
if (c != -1) {
source.rewind(1);
}
}
/**
* This method is used to read a token by the {@linkplain #readLong()} method. Valid
* delimiters are any non digit values.
*
* @return the token to parse as integer or long by the calling method.
* @throws IOException throws by the {@link #source} methods.
*/
protected final StringBuilder readStringNumber() throws IOException {
int lastByte;
StringBuilder buffer = new StringBuilder();
while ((lastByte = source.read()) >= '0' && lastByte <= '9') {
buffer.append((char) lastByte);
if (buffer.length() > MAX_LENGTH_LONG) {
throw new IOException(
"Number '" + buffer + "' is getting too long, stop reading at offset " +
source.getPosition());
}
}
if (lastByte == -1) {
throw new IOException("number ended at EOF");
}
if (lastByte != -1) {
source.rewind(1);
}
return buffer;
}
/**
* This will tell if the next byte to be read is an end of line byte.
*
* @param c The character to check against end of line
* @return true if the next byte is 0x0A or 0x0D.
*/
protected boolean isEOL(int c) {
return isLF(c) || isCR(c);
}
private boolean isLF(int c) {
return ASCII_LF == c;
}
private boolean isCR(int c) {
return ASCII_CR == c;
}
}