BaseParser.java
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.pdfparser;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.pdfbox.io.RandomAccessRead;
/**
* This class is used to contain parsing logic that will be used by all parsers.
*
* @author Ben Litchfield
*/
public abstract class BaseParser
{
private static final int MAX_LENGTH_LONG = Long.toString(Long.MAX_VALUE).length();
/**
* ASCII code for Null.
*/
private static final byte ASCII_NULL = 0;
/**
* ASCII code for horizontal tab.
*/
private static final byte ASCII_TAB = 9;
/**
* ASCII code for line feed.
*/
private static final byte ASCII_LF = 10;
/**
* ASCII code for form feed.
*/
private static final byte ASCII_FF = 12;
/**
* ASCII code for carriage return.
*/
private static final byte ASCII_CR = 13;
private static final byte ASCII_ZERO = 48;
private static final byte ASCII_NINE = 57;
private static final byte ASCII_SPACE = 32;
/**
* This is the stream that will be read from.
*/
protected final RandomAccessRead source;
/**
* Default constructor.
*/
BaseParser(RandomAccessRead pdfSource)
{
this.source = pdfSource;
}
/**
* Skip the upcoming CRLF or LF which are supposed to follow a stream. Trailing spaces are removed as well.
*
* @throws IOException if something went wrong
*/
protected void skipWhiteSpaces() throws IOException
{
//PDF Ref 3.2.7 A stream must be followed by either
//a CRLF or LF but nothing else.
int whitespace = source.read();
//see brother_scan_cover.pdf, it adds whitespaces
//after the stream but before the start of the
//data, so just read those first
while (isSpace(whitespace))
{
whitespace = source.read();
}
if (!skipLinebreak(whitespace))
{
source.rewind(1);
}
}
/**
* Skip one line break, such as CR, LF or CRLF.
*
* @return true if a line break was found and removed.
*
* @throws IOException if something went wrong
*/
protected boolean skipLinebreak() throws IOException
{
// a line break is a CR, or LF or CRLF
if (!skipLinebreak(source.read()))
{
source.rewind(1);
return false;
}
return true;
}
/**
* Skip one line break, such as CR, LF or CRLF.
*
* @param linebreak the first character to be checked.
*
* @return true if a line break was found and removed.
*
* @throws IOException if something went wrong
*/
private boolean skipLinebreak(int linebreak) throws IOException
{
// a line break is a CR, or LF or CRLF
if (isCR(linebreak))
{
int next = source.read();
if (!isLF(next))
{
source.rewind(1);
}
}
else if (!isLF(linebreak))
{
return false;
}
return true;
}
/**
* This is really a bug in the Document creators code, but it caused a crash in PDFBox, the first bug was in this
* format: /Title ( (5) /Creator which was patched in 1 place.
*
* However it missed the case where the number of opening and closing parenthesis isn't balanced
*
* The second bug was in this format /Title (c:\) /Producer
*
* @param bracesParameter the number of braces currently open.
*
* @return the corrected value of the brace counter
* @throws IOException
*/
private int checkForEndOfString(final int bracesParameter) throws IOException
{
if (bracesParameter == 0)
{
return 0;
}
// Check the next 3 bytes if available
byte[] nextThreeBytes = new byte[3];
int amountRead = source.read(nextThreeBytes);
if (amountRead > 0)
{
source.rewind(amountRead);
}
if (amountRead < 3)
{
return bracesParameter;
}
// The following cases are valid indicators for the end of the string
// 1. Next line contains another COSObject: CR + LF + '/'
// 2. COSDictionary ends in the next line: CR + LF + '>'
// 3. Next line contains another COSObject: LF + '/'
// 4. COSDictionary ends in the next line: LF + '>'
// 5. Next line contains another COSObject: CR + '/'
// 6. COSDictionary ends in the next line: CR + '>'
if (((isCR(nextThreeBytes[0]) || isLF(nextThreeBytes[0]))
&& (nextThreeBytes[1] == '/' || nextThreeBytes[1] == '>')) //
|| //
(isCR(nextThreeBytes[0]) && isLF(nextThreeBytes[1])
&& (nextThreeBytes[2] == '/' || nextThreeBytes[2] == '>')) //
)
{
return 0;
}
return bracesParameter;
}
/**
* Determine if a character terminates a PDF name.
*
* @param ch The character
* @return true if the character terminates a PDF name, otherwise false.
*/
protected static boolean isEndOfName(int ch)
{
switch (ch)
{
case ASCII_SPACE:
case ASCII_CR:
case ASCII_LF:
case ASCII_TAB:
case '>':
case '<':
case '[':
case '/':
case ']':
case ')':
case '(':
case ASCII_NULL:
case '\f':
case '%':
case -1:
return true;
default:
return false;
}
}
/**
* This will read the next string from the stream.
*
* @return The string that was read from the stream, never null.
*
* @throws IOException If there is an error reading from the stream.
*/
protected String readString() throws IOException
{
skipSpaces();
StringBuilder buffer = new StringBuilder();
int c = source.read();
while (!isEndOfName(c))
{
buffer.append( (char)c );
c = source.read();
}
if (c != -1)
{
source.rewind(1);
}
return buffer.toString();
}
/**
* This will parse a PDF string.
*
* @return The parsed PDF string.
*
* @throws IOException If there is an error reading from the stream.
*/
protected byte[] readLiteralString() throws IOException
{
readExpectedChar('(');
ByteArrayOutputStream out = new ByteArrayOutputStream();
// This is the number of braces read
int braces = 1;
int c = source.read();
while( braces > 0 && c != -1)
{
char ch = (char)c;
int nextc = -2; // not yet read
if (ch == ')')
{
braces--;
braces = checkForEndOfString(braces);
if (braces != 0)
{
out.write(ch);
}
}
else if (ch == '(')
{
braces++;
out.write(ch);
}
else if( ch == '\\' )
{
//patched by ram
char next = (char) source.read();
switch (next)
{
case 'n':
out.write('\n');
break;
case 'r':
out.write('\r');
break;
case 't':
out.write('\t');
break;
case 'b':
out.write('\b');
break;
case 'f':
out.write('\f');
break;
case ')':
// PDFBox 276 /Title (c:\)
braces = checkForEndOfString(braces);
if (braces != 0)
{
out.write(next);
}
else
{
out.write('\\');
}
break;
case '(':
case '\\':
out.write(next);
break;
case ASCII_LF:
case ASCII_CR:
// this is a break in the line so ignore it and the newline and continue
c = source.read();
while (isEOL(c) && c != -1)
{
c = source.read();
}
nextc = c;
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
StringBuilder octal = new StringBuilder();
octal.append(next);
c = source.read();
char digit = (char) c;
if (digit >= '0' && digit <= '7')
{
octal.append(digit);
c = source.read();
digit = (char) c;
if (digit >= '0' && digit <= '7')
{
octal.append(digit);
}
else
{
nextc = c;
}
}
else
{
nextc = c;
}
int character = 0;
try
{
character = Integer.parseInt(octal.toString(), 8);
}
catch (NumberFormatException e)
{
throw new IOException(
"Error: Expected octal character, actual='" + octal + "'", e);
}
out.write(character);
break;
default:
// dropping the backslash
// see 7.3.4.2 Literal Strings for further information
out.write(next);
}
}
else
{
out.write(ch);
}
if (nextc != -2)
{
c = nextc;
}
else
{
c = source.read();
}
}
if (c != -1)
{
source.rewind(1);
}
return out.toByteArray();
}
/**
* Reads given pattern from {@link #source}. Skipping whitespace at start and end if wanted.
*
* @param expectedString pattern to be skipped
* @param skipSpaces if set to true spaces before and after the string will be skipped
* @throws IOException if pattern could not be read
*/
protected final void readExpectedString(final char[] expectedString, boolean skipSpaces) throws IOException
{
if (skipSpaces)
{
skipSpaces();
}
for (char c : expectedString)
{
if (source.read() != c)
{
throw new IOException("Expected string '" + new String(expectedString)
+ "' but missed at character '" + c + "' at offset "
+ source.getPosition());
}
}
if (skipSpaces)
{
skipSpaces();
}
}
/**
* Read one char and throw an exception if it is not the expected value.
*
* @param ec the char value that is expected.
* @throws IOException if the read char is not the expected value or if an
* I/O error occurs.
*/
protected void readExpectedChar(char ec) throws IOException
{
char c = (char) source.read();
if (c != ec)
{
throw new IOException(
"expected='" + ec + "' actual='" + c + "' at offset " + source.getPosition());
}
}
/**
* This will tell if the end of the data is reached.
*
* @return true if the end of the data is reached.
* @throws IOException If there is an error reading from the stream.
*/
protected boolean isEOF() throws IOException
{
return source.isEOF();
}
/**
* This will tell if the next byte to be read is an end of line byte.
*
* @param c The character to check against end of line
* @return true if the next byte is 0x0A or 0x0D.
*/
protected static boolean isEOL(int c)
{
return isLF(c) || isCR(c);
}
/**
* This will tell if the next byte to be read is a line feed.
*
* @param c The character to check against line feed
* @return true if the next byte is 0x0A.
*/
protected static boolean isLF(int c)
{
return ASCII_LF == c;
}
/**
* This will tell if the next byte to be read is a carriage return.
*
* @param c The character to check against carriage return
* @return true if the next byte is 0x0D.
*/
protected static boolean isCR(int c)
{
return ASCII_CR == c;
}
/**
* This will tell if the next byte is whitespace or not.
*
* @return true if the next byte in the stream is a whitespace character.
*
* @throws IOException If there is an error reading from the stream.
*/
protected boolean isWhitespace() throws IOException
{
return isWhitespace(source.peek());
}
/**
* This will tell if a character is whitespace or not. These values are
* specified in table 1 (page 12) of ISO 32000-1:2008.
* @param c The character to check against whitespace
* @return true if the character is a whitespace character.
*/
protected static boolean isWhitespace( int c )
{
switch (c)
{
case ASCII_NULL:
case ASCII_TAB:
case ASCII_FF:
case ASCII_LF:
case ASCII_CR:
case ASCII_SPACE:
return true;
default:
return false;
}
}
/**
* This will tell if the next byte is a space or not.
*
* @return true if the next byte in the stream is a space character.
*
* @throws IOException If there is an error reading from the stream.
*/
protected boolean isSpace() throws IOException
{
return isSpace(source.peek());
}
/**
* This will tell if the given value is a space or not.
*
* @param c The character to check against space
* @return true if the next byte in the stream is a space character.
*/
private static boolean isSpace(int c)
{
return ASCII_SPACE == c;
}
/**
* This will tell if the next byte is a digit or not.
*
* @return true if the next byte in the stream is a digit.
*
* @throws IOException If there is an error reading from the stream.
*/
protected boolean isDigit() throws IOException
{
return isDigit(source.peek());
}
/**
* This will tell if the given value is a digit or not.
*
* @param c The character to be checked
* @return true if the next byte in the stream is a digit.
*/
protected static boolean isDigit(int c)
{
return c >= ASCII_ZERO && c <= ASCII_NINE;
}
/**
* This will skip all spaces and comments that are present.
*
* @throws IOException If there is an error reading from the stream.
*/
protected void skipSpaces() throws IOException
{
int c = source.read();
// 37 is the % character, a comment
while( isWhitespace(c) || c == 37)
{
if ( c == 37 )
{
// skip past the comment section
c = source.read();
while(!isEOL(c) && c != -1)
{
c = source.read();
}
}
else
{
c = source.read();
}
}
if (c != -1)
{
source.rewind(1);
}
}
/**
* This will read an integer from the stream.
*
* @return The integer that was read from the stream.
*
* @throws IOException If there is an error reading from the stream.
*/
protected int readInt() throws IOException
{
skipSpaces();
int retval = 0;
StringBuilder intBuffer = readStringNumber();
try
{
retval = Integer.parseInt( intBuffer.toString() );
}
catch( NumberFormatException e )
{
source.rewind(intBuffer.toString().getBytes(StandardCharsets.ISO_8859_1).length);
throw new IOException("Error: Expected an integer type at offset " +
source.getPosition() +
", instead got '" + intBuffer + "'", e);
}
return retval;
}
/**
* This will read an long from the stream.
*
* @return The long that was read from the stream.
*
* @throws IOException If there is an error reading from the stream.
*/
protected long readLong() throws IOException
{
skipSpaces();
long retval = 0;
StringBuilder longBuffer = readStringNumber();
try
{
retval = Long.parseLong( longBuffer.toString() );
}
catch( NumberFormatException e )
{
source.rewind(longBuffer.toString().getBytes(StandardCharsets.ISO_8859_1).length);
throw new IOException( "Error: Expected a long type at offset "
+ source.getPosition() + ", instead got '" + longBuffer + "'", e);
}
return retval;
}
/**
* This method is used to read a token by the {@linkplain #readInt()} and the {@linkplain #readLong()} method. Valid
* delimiters are any non digit values.
*
* @return the token to parse as integer or long by the calling method.
* @throws IOException throws by the {@link #source} methods.
*/
protected final StringBuilder readStringNumber() throws IOException
{
int lastByte;
StringBuilder buffer = new StringBuilder();
while (isDigit(lastByte = source.read()))
{
buffer.append( (char)lastByte );
if (buffer.length() > MAX_LENGTH_LONG)
{
throw new IOException("Number '" + buffer +
"' is getting too long, stop reading at offset " + source.getPosition());
}
}
if( lastByte != -1 )
{
source.rewind(1);
}
return buffer;
}
}