PdfTokenizerTest.java
/*
This file is part of the iText (R) project.
Copyright (c) 1998-2025 Apryse Group NV
Authors: Apryse Software.
This program is offered under a commercial and under the AGPL license.
For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
AGPL licensing:
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package com.itextpdf.kernel.pdf;
import com.itextpdf.io.logs.IoLogMessageConstant;
import com.itextpdf.io.font.PdfEncodings;
import com.itextpdf.io.source.PdfTokenizer;
import com.itextpdf.io.source.RandomAccessFileOrArray;
import com.itextpdf.io.source.RandomAccessSourceFactory;
import com.itextpdf.kernel.exceptions.KernelExceptionMessageConstant;
import com.itextpdf.test.ExtendedITextTest;
import com.itextpdf.test.annotations.LogMessage;
import com.itextpdf.test.annotations.LogMessages;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.Tag;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
@Tag("IntegrationTest")
public class PdfTokenizerTest extends ExtendedITextTest {
public static final String sourceFolder = "./src/test/resources/com/itextpdf/kernel/pdf/PdfTokeniserTest/";
@Test
public void encodingTest() throws IOException {
RandomAccessSourceFactory factory;
PdfTokenizer tok;
PdfString pdfString;
// hex string parse and check
String testHexString = "<0D0A09557365729073204775696465>";
factory = new RandomAccessSourceFactory();
tok = new PdfTokenizer(new RandomAccessFileOrArray(factory.createSource(testHexString.getBytes(StandardCharsets.ISO_8859_1))));
tok.nextToken();
pdfString = new PdfString(tok.getByteContent(), tok.isHexString());
Assertions.assertEquals("\r\n\tUser\u0090s Guide", pdfString.getValue());
String testUnicodeString = "��������������������";
pdfString = new PdfString(PdfEncodings.convertToBytes(testUnicodeString, PdfEncodings.UNICODE_BIG), false);
Assertions.assertEquals(testUnicodeString, pdfString.toUnicodeString());
pdfString = new PdfString("FEFF041F04400438043204350442".getBytes(StandardCharsets.ISO_8859_1), true);
Assertions.assertEquals("\u041F\u0440\u0438\u0432\u0435\u0442", pdfString.toUnicodeString());
pdfString = new PdfString("FEFF041F04400438043204350442".getBytes(StandardCharsets.ISO_8859_1), false);
Assertions.assertEquals("FEFF041F04400438043204350442", pdfString.toUnicodeString());
String specialCharacter = "\r\n\t\\n\\r\\t\\f";
pdfString = new PdfString(specialCharacter.getBytes(StandardCharsets.ISO_8859_1), false);
Assertions.assertEquals("\n\t\n\r\t\f", pdfString.toUnicodeString());
String symbol = "\u0001\u0004\u0006\u000E\u001F";
pdfString = new PdfString(symbol.getBytes(StandardCharsets.ISO_8859_1), false);
Assertions.assertEquals(symbol, pdfString.toUnicodeString());
String testString1 ="These\\\n two\\\r strings\\\n are the same";
pdfString = new PdfString(testString1.getBytes(StandardCharsets.ISO_8859_1), false);
Assertions.assertEquals("These two strings are the same", pdfString.getValue());
String testString2 ="This string contains \\245two octal characters\\307";
pdfString = new PdfString(testString2.getBytes(StandardCharsets.ISO_8859_1), false);
Assertions.assertEquals("This string contains \u00A5two octal characters\u00C7", pdfString.getValue());
String testString3 ="\\0053";
pdfString = new PdfString(testString3.getBytes(StandardCharsets.ISO_8859_1), false);
Assertions.assertEquals("\u00053", pdfString.getValue());
String testString4 ="\\053";
pdfString = new PdfString(testString4.getBytes(StandardCharsets.ISO_8859_1), false);
Assertions.assertEquals("+", pdfString.getValue());
byte[] b = new byte[]{(byte)46,(byte)56,(byte)40};
pdfString = new PdfString(b,false);
Assertions.assertEquals(new String(b),pdfString.getValue());
}
@Test
@LogMessages(messages = @LogMessage(messageTemplate =
IoLogMessageConstant.XREF_ERROR_WHILE_READING_TABLE_WILL_BE_REBUILT_WITH_CAUSE))
public void readPdfStringTest() throws IOException {
final String author = "This string9078 contains \u00A5two octal characters\u00C7";
final String creator = "iText\r 6\n";
final String title = "\u00DF\u00E3\u00EB\u00F0";
final String subject = "+";
String filename = sourceFolder + "writePdfString.pdf";
PdfReader reader = new PdfReader(filename);
PdfDocument d = new PdfDocument(reader);
// text in pdf: int array ( 223,227, 235,240)
Assertions.assertEquals(d.getDocumentInfo().getTitle(), title);
// text in pdf: This string\9078 contains \245two octal characters\307
Assertions.assertEquals(d.getDocumentInfo().getAuthor(), author);
// text in pdf: iText\r 6\n
Assertions.assertEquals(d.getDocumentInfo().getCreator(), creator);
// text in pdf: \053
Assertions.assertEquals(d.getDocumentInfo().getSubject(), subject);
}
@Test
public void primitivesTest() throws Exception {
String data = "<</Size 70." +
"/Value#20 .1" +
"/Root 46 0 R" +
"/Info 44 0 R" +
"/ID[<736f6d652068657820737472696e672>(some simple string )<8C2547D58D4BD2C6F3D32B830BE3259D2>-70.1--0.2]" +
"/Name1 --15" +
"/Prev ---116.23 >>";
RandomAccessSourceFactory factory = new RandomAccessSourceFactory();
PdfTokenizer tok = new PdfTokenizer(new RandomAccessFileOrArray(factory.createSource(data.getBytes(StandardCharsets.ISO_8859_1))));
tok.nextValidToken();
Assertions.assertEquals(tok.getTokenType(), PdfTokenizer.TokenType.StartDic);
tok.nextValidToken();
Assertions.assertEquals(tok.getTokenType(), PdfTokenizer.TokenType.Name);
PdfName name = new PdfName(tok.getByteContent());
Assertions.assertEquals("Size", name.getValue());
tok.nextValidToken();
Assertions.assertEquals(tok.getTokenType(), PdfTokenizer.TokenType.Number);
PdfNumber num = new PdfNumber(tok.getByteContent());
Assertions.assertEquals("70.", num.toString());
tok.nextValidToken();
Assertions.assertEquals(tok.getTokenType(), PdfTokenizer.TokenType.Name);
name = new PdfName(tok.getByteContent());
Assertions.assertEquals("Value ", name.getValue());
tok.nextValidToken();
Assertions.assertEquals(tok.getTokenType(), PdfTokenizer.TokenType.Number);
num = new PdfNumber(tok.getByteContent());
Assertions.assertNotSame("0.1", num.toString());
tok.nextValidToken();
Assertions.assertEquals(tok.getTokenType(), PdfTokenizer.TokenType.Name);
name = new PdfName(tok.getByteContent());
Assertions.assertEquals("Root", name.getValue());
tok.nextValidToken();
Assertions.assertEquals(tok.getTokenType(), PdfTokenizer.TokenType.Ref);
PdfIndirectReference ref = new PdfIndirectReference(null, tok.getObjNr(), tok.getGenNr());
Assertions.assertEquals("46 0 R", ref.toString());
tok.nextValidToken();
Assertions.assertEquals(tok.getTokenType(), PdfTokenizer.TokenType.Name);
name = new PdfName(tok.getByteContent());
Assertions.assertEquals("Info", name.getValue());
tok.nextValidToken();
Assertions.assertEquals(tok.getTokenType(), PdfTokenizer.TokenType.Ref);
ref = new PdfIndirectReference(null, tok.getObjNr(), tok.getGenNr());
Assertions.assertEquals("44 0 R", ref.toString());
tok.nextValidToken();
Assertions.assertEquals(tok.getTokenType(), PdfTokenizer.TokenType.Name);
name = new PdfName(tok.getByteContent());
Assertions.assertEquals("ID", name.getValue());
tok.nextValidToken();
Assertions.assertEquals(tok.getTokenType(), PdfTokenizer.TokenType.StartArray);
tok.nextValidToken();
Assertions.assertEquals(tok.getTokenType(), PdfTokenizer.TokenType.String);
Assertions.assertTrue(tok.isHexString());
PdfString str = new PdfString(tok.getByteContent(), tok.isHexString());
Assertions.assertEquals("some hex string ", str.getValue());
tok.nextValidToken();
Assertions.assertEquals(tok.getTokenType(), PdfTokenizer.TokenType.String);
Assertions.assertFalse(tok.isHexString());
str = new PdfString(tok.getByteContent(), tok.isHexString());
Assertions.assertEquals("some simple string ", str.getValue());
tok.nextValidToken();
Assertions.assertEquals(tok.getTokenType(), PdfTokenizer.TokenType.String);
Assertions.assertTrue(tok.isHexString());
str = new PdfString(tok.getByteContent(), tok.isHexString());
Assertions.assertEquals("\u008C%G\u00D5\u008DK\u00D2\u00C6\u00F3\u00D3+\u0083\u000B\u00E3%\u009D ", str.getValue());
tok.nextValidToken();
Assertions.assertEquals(tok.getTokenType(), PdfTokenizer.TokenType.Number);
num = new PdfNumber(tok.getByteContent());
Assertions.assertEquals("-70.1", num.toString());
tok.nextValidToken();
Assertions.assertEquals(tok.getTokenType(), PdfTokenizer.TokenType.Number);
num = new PdfNumber(tok.getByteContent());
Assertions.assertEquals("-0.2", num.toString());
tok.nextValidToken();
Assertions.assertEquals(tok.getTokenType(), PdfTokenizer.TokenType.EndArray);
tok.nextValidToken();
Assertions.assertEquals(tok.getTokenType(), PdfTokenizer.TokenType.Name);
name = new PdfName(tok.getByteContent());
Assertions.assertEquals("Name1", name.getValue());
tok.nextValidToken();
Assertions.assertEquals(tok.getTokenType(), PdfTokenizer.TokenType.Number);
num = new PdfNumber(tok.getByteContent());
Assertions.assertEquals("0", num.toString());
tok.nextValidToken();
Assertions.assertEquals(tok.getTokenType(), PdfTokenizer.TokenType.Name);
name = new PdfName(tok.getByteContent());
Assertions.assertEquals("Prev", name.getValue());
tok.nextValidToken();
Assertions.assertEquals(tok.getTokenType(), PdfTokenizer.TokenType.Number);
num = new PdfNumber(tok.getByteContent());
Assertions.assertEquals("-116.23", num.toString());
}
}