InlineImageExtractionTest.java

/*
    This file is part of the iText (R) project.
    Copyright (c) 1998-2025 Apryse Group NV
    Authors: Apryse Software.

    This program is offered under a commercial and under the AGPL license.
    For commercial licensing, contact us at https://itextpdf.com/sales.  For AGPL licensing, see below.

    AGPL licensing:
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */
package com.itextpdf.kernel.pdf.canvas.parser;

import com.itextpdf.kernel.pdf.PdfArray;
import com.itextpdf.kernel.pdf.PdfDictionary;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfName;
import com.itextpdf.kernel.pdf.PdfNumber;
import com.itextpdf.kernel.pdf.PdfReader;
import com.itextpdf.kernel.pdf.PdfStream;
import com.itextpdf.kernel.pdf.PdfString;
import com.itextpdf.kernel.pdf.canvas.parser.data.IEventData;
import com.itextpdf.kernel.pdf.canvas.parser.data.ImageRenderInfo;
import com.itextpdf.kernel.pdf.canvas.parser.listener.IEventListener;
import com.itextpdf.kernel.pdf.canvas.parser.listener.SimpleTextExtractionStrategy;
import com.itextpdf.kernel.pdf.colorspace.PdfColorSpace;
import com.itextpdf.kernel.pdf.colorspace.PdfSpecialCs.Indexed;
import com.itextpdf.kernel.pdf.xobject.PdfImageXObject;
import com.itextpdf.kernel.utils.CompareTool;
import com.itextpdf.test.ExtendedITextTest;
import com.itextpdf.test.TestUtil;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;

@Tag("IntegrationTest")
public class InlineImageExtractionTest extends ExtendedITextTest {
    // Output directory for this test class, rooted at TestUtil.getOutputPath().
    public static final String destinationFolder = TestUtil.getOutputPath() + "/kernel/pdf/canvas/parser/InlineImageExtractionTest/";
    // Directory holding the input PDFs and the reference .dat files read by the tests below.
    public static final String sourceFolder = "./src/test/resources/com/itextpdf/kernel/pdf/canvas/parser/InlineImageExtractionTest/";

    /**
     * Runs once before all tests in this class and hands {@link #destinationFolder}
     * to {@code createOrClearDestinationFolder}.
     */
    @BeforeAll
    public static void beforeClass() {
        // Helper is not defined in this file; presumably inherited from ExtendedITextTest.
        createOrClearDestinationFolder(destinationFolder);
    }

    /**
     * Processes the first page of {@code inlineImageExtraction.pdf}, collects the inline images
     * reported by the canvas processor and verifies that exactly one was found, that its raw
     * bytes equal the contents of {@code imgtest.dat}, and that its dictionary matches an
     * expected 50x50, 8 bits-per-component image with an Indexed color space based on DeviceRGB.
     *
     * @throws IOException if the source PDF or the reference data file cannot be read
     */
    @Test
    public void extractSingleInlineImageWithIndexedColorSpaceTest() throws IOException {
        PdfDocument pdf = new PdfDocument(new PdfReader(sourceFolder + "inlineImageExtraction.pdf"));
        InlineImageEventListener eventListener = new InlineImageEventListener();

        PdfCanvasProcessor canvasProcessor = new PdfCanvasProcessor(eventListener);
        canvasProcessor.processPageContent(pdf.getFirstPage());
        // The document is closed before the collected streams are inspected below.
        pdf.close();

        List<PdfStream> inlineImages = eventListener.getInlineImages();

        Assertions.assertEquals(1, inlineImages.size());

        // Compare the stream bytes of the extracted image with the stored reference bytes.
        byte[] imgBytes = inlineImages.get(0).getBytes();
        byte[] cmpImgBytes = Files.readAllBytes(Paths.get(sourceFolder, "imgtest.dat"));
        Assertions.assertArrayEquals(cmpImgBytes, imgBytes);

        // Build the dictionary the extracted inline image is expected to carry.
        PdfDictionary expectedDict = new PdfDictionary();
        expectedDict.put(PdfName.BitsPerComponent, new PdfNumber(8));
        expectedDict.put(PdfName.Height, new PdfNumber(50));
        expectedDict.put(PdfName.Width, new PdfNumber(50));
        // Lookup table string passed to the Indexed color space below.
        // NOTE(review): this literal contains U+FFFD replacement characters in this copy of the
        // file - presumably the original non-ASCII palette characters were lost to an encoding
        // mismatch; confirm against the repository version before relying on this comparison.
        String indexedCsLookupData = "\u007F\u007F\u007F������\u000F\u000F\u000F???������������///\u001F\u001F\u001F___������"
                + "\u009F\u009F\u009FOOO������ooo\u008F\u008F\u008F������::<������uuy,,-������\u000E\u000E\u000F������\u001D\u001D\u001E"
                + "������XXZ::?\u0004\u0004\u0004226!!$IIK\u0019\u0019\u001B������\u0092\u0092\u0097������\f\f\r"
                + "))-������������������66;\b\b\t\u0084\u0084\u0088������������\u0014\u0014\u0016������\u0010\u0010\u0012������ffi������������..2������"
                + "������%%)������\u001D\u001D\u001F>>D������������\u000E\u000E\u000E������\u008D\u008D\u008C������CCI������\u009B\u009B\u009A"
                + "������������zz|888������������������������bbb\u0095\u0095\u0098������������������\u0089\u0089\u008B������������==>������***qqp������ZZ\\������"
                + "\u007F\u007F~\u008E\u008E\u008E\u001E\u001E\u001F������������������������������\u001C\u001C\u001C]]^������TTT������������FFF����"
                + "��������yy{������������^^^vvy������\u0087\u0087\u008A}}}xxz������jjl--.������\u0000\u0000\u0000������{{{|||}}}~~~\u007F\u007F"
                + "\u007F\u0080\u0080\u0080\u0081\u0081\u0081\u0082\u0082\u0082\u0083\u0083\u0083\u0084\u0084\u0084\u0085"
                + "\u0085\u0085\u0086\u0086\u0086\u0087\u0087\u0087\u0088\u0088\u0088\u0089\u0089\u0089\u008A\u008A\u008A"
                + "\u008B\u008B\u008B\u008C\u008C\u008C\u008D\u008D\u008D\u008E\u008E\u008E\u008F\u008F\u008F\u0090\u0090"
                + "\u0090\u0091\u0091\u0091\u0092\u0092\u0092\u0093\u0093\u0093\u0094\u0094\u0094\u0095\u0095\u0095\u0096"
                + "\u0096\u0096\u0097\u0097\u0097\u0098\u0098\u0098\u0099\u0099\u0099\u009A\u009A\u009A\u009B\u009B\u009B"
                + "\u009C\u009C\u009C\u009D\u009D\u009D\u009E\u009E\u009E\u009F\u009F\u009F������������������������������������������������������������"
                + "������������������\u00AD\u00AD\u00AD������������������������������������������������������������������������������������������������������������������������������������������������������"
                + "������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������"
                + "������������������������������������������������������������������������������������������������������������������������������������������";
        // Indexed color space over DeviceRGB with hival 255 and the lookup string above.
        Indexed expectedIndexedCs = new Indexed(PdfName.DeviceRGB, 255, new PdfString(indexedCsLookupData));
        expectedDict.put(PdfName.ColorSpace, expectedIndexedCs.getPdfObject());

        Assertions.assertTrue(new CompareTool().compareDictionaries(inlineImages.get(0), expectedDict));
    }

    /**
     * Processes the first page of {@code docWithInlineImage.pdf}, wraps the first reported inline
     * image in a {@link PdfImageXObject} and verifies that its image bytes equal the contents of
     * {@code docWithInlineImageBytes.dat}.
     *
     * @throws IOException if the source PDF or the reference data file cannot be read
     */
    @Test
    public void parseInlineImageTest() throws IOException {
        byte[] data;
        // try-with-resources guarantees the document is closed even if parsing fails;
        // previously the document was never closed at all.
        try (PdfDocument pdfDocument = new PdfDocument(new PdfReader(sourceFolder + "docWithInlineImage.pdf"))) {
            InlineImageEventListener listener = new InlineImageEventListener();
            new PdfCanvasProcessor(listener).processPageContent(pdfDocument.getFirstPage());
            List<PdfStream> inlineImages = listener.getInlineImages();

            // Fail with a readable message instead of an IndexOutOfBoundsException on get(0).
            Assertions.assertFalse(inlineImages.isEmpty(), "No inline image was reported for the first page");

            // Bytes are read while the document is still open, as in the original flow.
            data = new PdfImageXObject(inlineImages.get(0)).getImageBytes();
        }

        byte[] cmpImgBytes = Files.readAllBytes(Paths.get(sourceFolder, "docWithInlineImageBytes.dat"));

        Assertions.assertArrayEquals(cmpImgBytes, data);
    }

    /**
     * Processes page 1 of {@code inlineImageCalRGBColorSpace.pdf} and verifies that the page
     * resource color space named {@code Cs1} is an array whose first element is {@code CalRGB}.
     *
     * @throws IOException if the source PDF cannot be read
     */
    @Test
    public void parseInlineImageCalRGBColorSpaceTest() throws IOException {
        String inputPdf = sourceFolder + "inlineImageCalRGBColorSpace.pdf";
        try (PdfDocument document = new PdfDocument(new PdfReader(inputPdf))) {
            // Parsing has to complete before the resources are inspected.
            new PdfCanvasProcessor(new SimpleTextExtractionStrategy()).processPageContent(document.getPage(1));

            PdfColorSpace cs1 = document.getPage(1).getResources().getColorSpace(new PdfName("Cs1"));
            PdfArray csArray = (PdfArray) cs1.getPdfObject();

            Assertions.assertEquals(PdfName.CalRGB, (PdfName) csArray.get(0));
        }
    }

    /**
     * Processes page 1 of {@code inlineImageCalGrayColorSpace.pdf} and verifies that the page
     * resource color space named {@code Cs1} is an array whose first element is {@code CalGray}.
     *
     * @throws IOException if the source PDF cannot be read
     */
    @Test
    public void parseInlineImageCalGrayColorSpaceTest() throws IOException {
        String inputPdf = sourceFolder + "inlineImageCalGrayColorSpace.pdf";
        try (PdfDocument document = new PdfDocument(new PdfReader(inputPdf))) {
            // Parsing has to complete before the resources are inspected.
            new PdfCanvasProcessor(new SimpleTextExtractionStrategy()).processPageContent(document.getPage(1));

            PdfColorSpace cs1 = document.getPage(1).getResources().getColorSpace(new PdfName("Cs1"));
            PdfArray csArray = (PdfArray) cs1.getPdfObject();

            Assertions.assertEquals(PdfName.CalGray, (PdfName) csArray.get(0));
        }
    }

    /**
     * Processes page 1 of {@code inlineImageLabColorSpace.pdf} and verifies that the page
     * resource color space named {@code Cs1} is an array whose first element is {@code Lab}.
     *
     * @throws IOException if the source PDF cannot be read
     */
    @Test
    public void parseInlineImageLabColorSpaceTest() throws IOException {
        String inputPdf = sourceFolder + "inlineImageLabColorSpace.pdf";
        try (PdfDocument document = new PdfDocument(new PdfReader(inputPdf))) {
            // Parsing has to complete before the resources are inspected.
            new PdfCanvasProcessor(new SimpleTextExtractionStrategy()).processPageContent(document.getPage(1));

            PdfColorSpace cs1 = document.getPage(1).getResources().getColorSpace(new PdfName("Cs1"));
            PdfArray csArray = (PdfArray) cs1.getPdfObject();

            Assertions.assertEquals(PdfName.Lab, (PdfName) csArray.get(0));
        }
    }

    /**
     * Processes page 1 of {@code inlineImageICCBasedColorSpace.pdf} and verifies that the page
     * resource color space named {@code Cs1} is an array whose first element is {@code ICCBased}.
     *
     * @throws IOException if the source PDF cannot be read
     */
    @Test
    public void parseInlineImageICCBasedColorSpaceTest() throws IOException {
        String inputPdf = sourceFolder + "inlineImageICCBasedColorSpace.pdf";
        try (PdfDocument document = new PdfDocument(new PdfReader(inputPdf))) {
            // Parsing has to complete before the resources are inspected.
            new PdfCanvasProcessor(new SimpleTextExtractionStrategy()).processPageContent(document.getPage(1));

            PdfColorSpace cs1 = document.getPage(1).getResources().getColorSpace(new PdfName("Cs1"));
            PdfArray csArray = (PdfArray) cs1.getPdfObject();

            Assertions.assertEquals(PdfName.ICCBased, (PdfName) csArray.get(0));
        }
    }

    /**
     * Processes page 1 of {@code inlineImageDeviceRGBColorSpace.pdf} and verifies that the text of
     * the page's first content stream contains the value of {@link PdfName#DeviceRGB}.
     *
     * @throws IOException if the source PDF cannot be read
     */
    @Test
    public void parseInlineImageDeviceRGBColorSpaceTest() throws IOException {
        try (PdfDocument pdf = new PdfDocument(new PdfReader(sourceFolder + "inlineImageDeviceRGBColorSpace.pdf"))) {
            PdfCanvasProcessor pdfCanvasProcessor = new PdfCanvasProcessor(new SimpleTextExtractionStrategy());
            pdfCanvasProcessor.processPageContent(pdf.getPage(1));

            PdfStream stream = pdf.getPage(1).getContentStream(0);
            // Decode with an explicit single-byte charset so the result does not depend on the
            // platform default; ISO-8859-1 maps every byte to exactly one character.
            String firstPageData = new String(stream.getBytes(), StandardCharsets.ISO_8859_1);

            Assertions.assertTrue(firstPageData.contains(PdfName.DeviceRGB.getValue()));
        }
    }

    /**
     * Processes page 1 of {@code inlineImageDeviceCMYKColorSpace.pdf} and verifies that the text of
     * the page's first content stream contains the value of {@link PdfName#DeviceCMYK}.
     *
     * @throws IOException if the source PDF cannot be read
     */
    @Test
    public void parseInlineImageDeviceCMYKColorSpaceTest() throws IOException {
        try (PdfDocument pdf = new PdfDocument(new PdfReader(sourceFolder + "inlineImageDeviceCMYKColorSpace.pdf"))) {
            PdfCanvasProcessor pdfCanvasProcessor = new PdfCanvasProcessor(new SimpleTextExtractionStrategy());
            pdfCanvasProcessor.processPageContent(pdf.getPage(1));

            PdfStream stream = pdf.getPage(1).getContentStream(0);
            // Decode with an explicit single-byte charset so the result does not depend on the
            // platform default; ISO-8859-1 maps every byte to exactly one character.
            String firstPageData = new String(stream.getBytes(), StandardCharsets.ISO_8859_1);

            Assertions.assertTrue(firstPageData.contains(PdfName.DeviceCMYK.getValue()));
        }
    }

    /**
     * Processes page 1 of {@code inlineImageDeviceGrayColorSpace.pdf} and verifies that the text of
     * the page's first content stream contains the value of {@link PdfName#DeviceGray}.
     *
     * @throws IOException if the source PDF cannot be read
     */
    @Test
    public void parseInlineImageDeviceGrayColorSpaceTest() throws IOException {
        try (PdfDocument pdf = new PdfDocument(new PdfReader(sourceFolder + "inlineImageDeviceGrayColorSpace.pdf"))) {
            PdfCanvasProcessor pdfCanvasProcessor = new PdfCanvasProcessor(new SimpleTextExtractionStrategy());
            pdfCanvasProcessor.processPageContent(pdf.getPage(1));

            PdfStream stream = pdf.getPage(1).getContentStream(0);
            // Decode with an explicit single-byte charset so the result does not depend on the
            // platform default; ISO-8859-1 maps every byte to exactly one character.
            String firstPageData = new String(stream.getBytes(), StandardCharsets.ISO_8859_1);

            Assertions.assertTrue(firstPageData.contains(PdfName.DeviceGray.getValue()));
        }
    }

    /**
     * Processes page 1 of {@code inlineImageSeparationColorSpace.pdf} and verifies that the page
     * resource color space named {@code Cs1} is an array whose first element is {@code Separation}.
     *
     * @throws IOException if the source PDF cannot be read
     */
    @Test
    public void parseInlineImageSeparationColorSpaceTest() throws IOException {
        String inputPdf = sourceFolder + "inlineImageSeparationColorSpace.pdf";
        try (PdfDocument document = new PdfDocument(new PdfReader(inputPdf))) {
            // Parsing has to complete before the resources are inspected.
            new PdfCanvasProcessor(new SimpleTextExtractionStrategy()).processPageContent(document.getPage(1));

            PdfColorSpace cs1 = document.getPage(1).getResources().getColorSpace(new PdfName("Cs1"));
            PdfArray csArray = (PdfArray) cs1.getPdfObject();

            Assertions.assertEquals(PdfName.Separation, (PdfName) csArray.get(0));
        }
    }

    /**
     * Processes page 1 of {@code inlineImageDeviceNColorSpace.pdf} and verifies that the page
     * resource color space named {@code Cs1} is an array whose first element is {@code DeviceN}.
     *
     * @throws IOException if the source PDF cannot be read
     */
    @Test
    public void parseInlineImageDeviceNColorSpaceTest() throws IOException {
        String inputPdf = sourceFolder + "inlineImageDeviceNColorSpace.pdf";
        try (PdfDocument document = new PdfDocument(new PdfReader(inputPdf))) {
            // Parsing has to complete before the resources are inspected.
            new PdfCanvasProcessor(new SimpleTextExtractionStrategy()).processPageContent(document.getPage(1));

            PdfColorSpace cs1 = document.getPage(1).getResources().getColorSpace(new PdfName("Cs1"));
            PdfArray csArray = (PdfArray) cs1.getPdfObject();

            Assertions.assertEquals(PdfName.DeviceN, (PdfName) csArray.get(0));
        }
    }

    /**
     * Processes page 1 of {@code inlineImageIndexedColorSpace.pdf} and verifies that the page
     * resource color space named {@code Cs1} is an array whose first element is {@code Indexed}.
     *
     * @throws IOException if the source PDF cannot be read
     */
    @Test
    public void parseInlineImageIndexedColorSpaceTest() throws IOException {
        String inputPdf = sourceFolder + "inlineImageIndexedColorSpace.pdf";
        try (PdfDocument document = new PdfDocument(new PdfReader(inputPdf))) {
            // Parsing has to complete before the resources are inspected.
            new PdfCanvasProcessor(new SimpleTextExtractionStrategy()).processPageContent(document.getPage(1));

            PdfColorSpace cs1 = document.getPage(1).getResources().getColorSpace(new PdfName("Cs1"));
            PdfArray csArray = (PdfArray) cs1.getPdfObject();

            Assertions.assertEquals(PdfName.Indexed, (PdfName) csArray.get(0));
        }
    }

    /**
     * Event listener that records the underlying PDF stream of every inline image reported
     * through a {@code RENDER_IMAGE} event.
     */
    private static class InlineImageEventListener implements IEventListener {
        private final List<PdfStream> inlineImages = new ArrayList<>();

        /**
         * Returns the streams collected so far (the listener's own list, not a copy).
         */
        public List<PdfStream> getInlineImages() {
            return inlineImages;
        }

        /**
         * Stores the image stream when the event is a {@code RENDER_IMAGE} event for an inline image;
         * every other event is ignored.
         */
        public void eventOccurred(IEventData data, EventType type) {
            if (type != EventType.RENDER_IMAGE) {
                return;
            }
            ImageRenderInfo renderInfo = (ImageRenderInfo) data;
            if (renderInfo.isInline()) {
                PdfStream imageStream = renderInfo.getImage().getPdfObject();
                inlineImages.add(imageStream);
            }
        }

        /**
         * Returns a fresh set containing only {@code RENDER_IMAGE}.
         */
        public Set<EventType> getSupportedEvents() {
            List<EventType> onlyImages = Collections.singletonList(EventType.RENDER_IMAGE);
            return new LinkedHashSet<>(onlyImages);
        }
    }
}