// PDFontTest.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.pdfbox.pdmodel.font;

import java.awt.geom.Area;
import java.awt.geom.GeneralPath;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.List;

import org.apache.fontbox.ttf.TTFParser;
import org.apache.fontbox.ttf.TrueTypeCollection;
import org.apache.fontbox.ttf.TrueTypeFont;
import org.apache.fontbox.util.autodetect.FontFileFinder;

import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.io.RandomAccessReadBuffer;
import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.font.Standard14Fonts.FontName;
import org.apache.pdfbox.pdmodel.font.encoding.WinAnsiEncoding;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import org.junit.jupiter.api.Assumptions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.parallel.Execution;
import org.junit.jupiter.api.parallel.ExecutionMode;

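/**
 * Tests for {@link PDFont} and its subclasses: loading, embedding, encoding, rendering and text
 * extraction.
 *
 * @author adam
 * @author Tilman Hausherr
 */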
@Execution(ExecutionMode.CONCURRENT)
class PDFontTest
{
    // Directory that receives the files written by the tests; it is created in setUp().
    private static final File OUT_DIR = new File("target/test-output");

    /**
     * Creates the output directory used by the tests.
     *
     * @throws Exception if the directory cannot be created
     */
    @BeforeAll
    static void setUp() throws Exception
    {
        // Unlike File.mkdirs(), whose boolean result is easily ignored, this throws when the
        // directory cannot be created and does nothing when it already exists.
        Files.createDirectories(OUT_DIR.toPath());
    }

    /**
     * PDFBOX-988: rendering the first page of the test document must return normally.
     *
     * @throws IOException if the document cannot be loaded or rendered
     * @throws URISyntaxException if the resource URL cannot be converted to a URI
     */
    @Test
    void testPDFBox988() throws IOException, URISyntaxException
    {
        URL resource = PDFontTest.class.getResource("F001u_3_7j.pdf");
        File pdfFile = new File(resource.toURI());
        try (PDDocument document = Loader.loadPDF(pdfFile))
        {
            // the allegation is that renderImage() will crash the JVM or hang
            new PDFRenderer(document).renderImage(0);
        }
    }

    /**
     * PDFBOX-5486: a TrueType font loaded from a stream with WinAnsiEncoding must report the
     * glyph "A", and requesting its path must not throw.
     *
     * @throws IOException if the font cannot be loaded
     */
    @Test
    void testPDFBOX5486() throws IOException
    {
        // the font stream is closed together with the document instead of being leaked
        try (InputStream is = 
                PDFontTest.class.getResourceAsStream("/org/apache/pdfbox/resources/ttf/LiberationSans-Regular.ttf");
                PDDocument doc = new PDDocument())
        {
            PDTrueTypeFont ttf = PDTrueTypeFont.load(doc, is, WinAnsiEncoding.INSTANCE);
            assertTrue(ttf.hasGlyph("A"));
            ttf.getPath("A");
        }
    }

    /**
     * PDFBOX-3747: text written with Calibri (Windows only) that contains "-" must be extracted
     * with "-" and not with \u2010. The wrong result was caused by a wrong ToUnicode mapping,
     * because prior to the bugfix CmapSubtable#getCharCodes provided values in random order.
     * The test is skipped when the font file is not present.
     *
     * @throws IOException if the document cannot be created, saved or read back
     */
    @Test
    void testPDFBox3747() throws IOException
    {
        File calibri = new File("c:/windows/fonts", "calibri.ttf");
        Assumptions.assumeTrue(calibri.exists(), "testPDFBox3747 skipped");

        String expected = "PDFBOX-3747";
        ByteArrayOutputStream pdfBytes = new ByteArrayOutputStream();

        // write a single page showing the text in Calibri
        try (PDDocument document = new PDDocument())
        {
            PDPage firstPage = new PDPage();
            document.addPage(firstPage);
            PDFont calibriFont = PDType0Font.load(document, calibri);
            try (PDPageContentStream content = new PDPageContentStream(document, firstPage))
            {
                content.beginText();
                content.setFont(calibriFont, 10);
                content.showText(expected);
                content.endText();
            }
            document.save(pdfBytes);
        }

        // read it back and compare the extracted text
        try (PDDocument reloaded = Loader.loadPDF(pdfBytes.toByteArray()))
        {
            String extracted = new PDFTextStripper().getText(reloaded);
            assertEquals(expected, extracted.trim());
        }
    }

    /**
     * PDFBOX-3826: create a PDF from an already parsed TrueTypeFont, and check that full or
     * partial embedding is done and that rendering and text extraction work. The font file is
     * parsed twice and each parsed font is used for one document.
     *
     * @throws IOException if parsing, creating or checking a document fails
     * @throws URISyntaxException if the font resource URL cannot be converted to a URI
     */
    @Test
    void testPDFBox3826() throws IOException, URISyntaxException
    {
        URL fontUrl = PDFont.class.getResource(
                "/org/apache/pdfbox/resources/ttf/LiberationSans-Regular.ttf");
        File fontFile = new File(fontUrl.toURI());

        for (int pass = 0; pass < 2; pass++)
        {
            try (TrueTypeFont parsedFont =
                    new TTFParser().parse(new RandomAccessReadBufferedFile(fontFile)))
            {
                byte[] pdf = testPDFBox3826createDoc(parsedFont);
                testPDFBox3826checkFonts(pdf, fontFile);
            }
        }
    }

    /**
     * PDFBOX-4115: Test ability to create PDF with German umlaut glyphs with a type 1 font.
     * Test for everything that went wrong before this was fixed: the encoding of the reloaded
     * font, the glyph names, non-empty glyph paths and text extraction.
     *
     * @throws IOException if the font cannot be read or the document cannot be saved or loaded
     */
    @Test
    void testPDFBOX4115() throws IOException
    {
        File fontFile = new File("target/fonts", "n019003l.pfb");
        File outputFile = new File(OUT_DIR, "FontType1.pdf");
        // a, o, u with dieresis in lower and upper case; written as escapes so that the test
        // does not depend on the encoding used to read this source file
        String text = "\u00E4\u00F6\u00FC\u00C4\u00D6\u00DC";

        try (PDDocument doc = new PDDocument())
        {
            PDPage page = new PDPage();
            // the font stream is closed here instead of being leaked
            try (InputStream fontStream = new FileInputStream(fontFile);
                    PDPageContentStream contentStream = new PDPageContentStream(doc, page))
            {
                PDType1Font font = new PDType1Font(doc, fontStream, WinAnsiEncoding.INSTANCE);

                contentStream.beginText();
                contentStream.setFont(font, 10);
                contentStream.newLineAtOffset(10, 700);
                contentStream.showText(text);
                contentStream.endText();
            }

            doc.addPage(page);

            doc.save(outputFile);
        }
        try (PDDocument doc = Loader.loadPDF(outputFile))
        {
            PDType1Font font = (PDType1Font) doc.getPage(0).getResources().getFont(COSName.getPDFName("F1"));
            assertEquals(WinAnsiEncoding.INSTANCE, font.getEncoding());
            
            for (char c : text.toCharArray())
            {
                // every glyph name is a base letter followed by "dieresis", e.g. "adieresis"
                String name = font.getEncoding().getName(c);
                assertEquals("dieresis", name.substring(1));
                assertFalse(font.getPath(name).getBounds2D().isEmpty());
            }

            PDFTextStripper stripper = new PDFTextStripper();
            assertEquals(text, stripper.getText(doc).trim());
        }
    }

    /**
     * Test whether bug from PDFBOX-4318 is fixed, which had the wrong cache key: encoding
     * U+0080 must fail both before and after another string has been encoded.
     *
     * @throws java.io.IOException if encoding fails unexpectedly
     */
    @Test
    void testPDFox4318() throws IOException
    {
        PDType1Font helveticaBold = new PDType1Font(FontName.HELVETICA_BOLD);
        assertThrows(IllegalArgumentException.class,
                () -> helveticaBold.encode("\u0080"),
                "should have thrown IllegalArgumentException");
        // Euro sign, written as an escape so that the test does not depend on the encoding
        // used to read this source file
        helveticaBold.encode("\u20AC");
        // the successful call above must not make U+0080 encodable
        assertThrows(IllegalArgumentException.class,
                () -> helveticaBold.encode("\u0080"),
                "should have thrown IllegalArgumentException");
    }

    /**
     * Full embedding of a font taken from a TrueType collection must be rejected with an
     * IOException. The test uses the first .ttc file found by {@link FontFileFinder} and is
     * skipped when none is available.
     *
     * @throws IOException if the collection cannot be read
     */
    @Test
    void testFullEmbeddingTTC() throws IOException
    {
        FontFileFinder fff = new FontFileFinder();
        File ttcFile = null;
        for (URI uri : fff.find())
        {
            if (uri.getPath().endsWith(".ttc"))
            {
                ttcFile = new File(uri);
                break;
            }
        }
        Assumptions.assumeTrue(ttcFile != null, "testFullEmbeddingTTC skipped, no .ttc files available");
        System.out.println("TrueType collection file: " + ttcFile);

        // both the collection and the document are closed when the test is done
        try (TrueTypeCollection ttc = new TrueTypeCollection(ttcFile);
                PDDocument doc = new PDDocument())
        {
            final List<String> names = new ArrayList<>();
            ttc.processAllFonts((TrueTypeFont font) ->
            {
                System.out.println("TrueType font in collection: " + font.getName());
                names.add(font.getName());
            });
            assertFalse(names.isEmpty(), "TrueType collection contains no fonts");

            TrueTypeFont ttf = ttc.getFontByName(names.get(0)); // take the first one
            System.out.println("TrueType font used for test: " + ttf.getName());

            IOException ex = assertThrows(IOException.class,
                    () -> PDType0Font.load(doc, ttf, false),
                    "should have thrown IOException");
            assertEquals("Full embedding of TrueType font collections not supported", ex.getMessage());
        }
    }

    /**
     * Test using broken Type1C font: the font must be reported as damaged and must return 0
     * for height and string width. The test file is downloaded from the issue tracker, so
     * network access is needed.
     *
     * @throws IOException if the file cannot be downloaded or loaded
     * @throws URISyntaxException if the URL of the test file is malformed
     */
    @Test
    void testPDFox5048() throws IOException, URISyntaxException
    {
        URL pdfUrl = new URI("https://issues.apache.org/jira/secure/attachment/13017227/stringwidth.pdf")
                .toURL();
        // Close the network stream explicitly; closing a stream that is already closed has
        // no effect, so this is safe whatever the buffer factory does with it.
        try (InputStream is = pdfUrl.openStream();
                PDDocument doc = Loader.loadPDF(RandomAccessReadBuffer.createBufferFromStream(is)))
        {
            PDPage page = doc.getPage(0);
            PDFont font = page.getResources().getFont(COSName.getPDFName("F70"));
            assertTrue(font.isDamaged());
            assertEquals(0, font.getHeight(0));
            assertEquals(0, font.getStringWidth("Pa"));
        }
    }

    /**
     * Checks the three fonts of a document created by testPDFBox3826createDoc(): F1 must be a
     * subset, F2 and F3 must contain the complete font file. The page is also rendered and its
     * text extracted.
     *
     * @param byteArray the PDF to check
     * @param fontFile the font file that was embedded
     * @throws IOException if the PDF cannot be loaded, rendered or read
     */
    private void testPDFBox3826checkFonts(byte[] byteArray, File fontFile) throws IOException
    {
        try (PDDocument document = Loader.loadPDF(byteArray))
        {
            PDPage firstPage = document.getPage(0);

            // F1 = type0 subset: tagged name and smaller than the font file
            PDType0Font subsetFont =
                    (PDType0Font) firstPage.getResources().getFont(COSName.getPDFName("F1"));
            assertTrue(subsetFont.getName().contains("+"));
            int subsetSize = subsetFont.getFontDescriptor().getFontFile2().toByteArray().length;
            assertTrue(fontFile.length() > subsetSize);

            // F2 = type0 full embed: untagged name and same size as the font file
            PDType0Font fullType0Font =
                    (PDType0Font) firstPage.getResources().getFont(COSName.getPDFName("F2"));
            assertFalse(fullType0Font.getName().contains("+"));
            int fullType0Size = fullType0Font.getFontDescriptor().getFontFile2().toByteArray().length;
            assertEquals(fontFile.length(), fullType0Size);

            // F3 = tt full embed: untagged name and same size as the font file
            PDTrueTypeFont fullTrueTypeFont =
                    (PDTrueTypeFont) firstPage.getResources().getFont(COSName.getPDFName("F3"));
            assertFalse(fullTrueTypeFont.getName().contains("+"));
            int fullTrueTypeSize = fullTrueTypeFont.getFontDescriptor().getFontFile2().toByteArray().length;
            assertEquals(fontFile.length(), fullTrueTypeSize);

            PDFRenderer renderer = new PDFRenderer(document);
            renderer.renderImage(0);

            PDFTextStripper textStripper = new PDFTextStripper();
            textStripper.setLineSeparator("\n");
            String extracted = textStripper.getText(document).trim();
            assertEquals("testMultipleFontFileReuse1\ntestMultipleFontFileReuse2\ntestMultipleFontFileReuse3", extracted);
        }
    }

    /**
     * Creates a one-page PDF with three lines of text, each using the given font in a different
     * way: type 0 subset, type 0 full embedding and TrueType with WinAnsiEncoding.
     *
     * @param ttf the parsed font to use
     * @return the saved PDF
     * @throws IOException if the fonts cannot be loaded or the PDF cannot be written
     */
    private byte[] testPDFBox3826createDoc(TrueTypeFont ttf) throws IOException
    {
        ByteArrayOutputStream pdfBytes = new ByteArrayOutputStream();
        try (PDDocument document = new PDDocument())
        {
            PDPage firstPage = new PDPage();
            document.addPage(firstPage);

            // type 0 subset embedding
            PDFont subsetFont = PDType0Font.load(document, ttf, true);
            try (PDPageContentStream content = new PDPageContentStream(document, firstPage))
            {
                content.beginText();
                content.newLineAtOffset(10, 700);
                content.setFont(subsetFont, 10);
                content.showText("testMultipleFontFileReuse1");
                content.endText();

                // type 0 full embedding
                PDFont fullType0Font = PDType0Font.load(document, ttf, false);
                content.beginText();
                content.newLineAtOffset(10, 650);
                content.setFont(fullType0Font, 10);
                content.showText("testMultipleFontFileReuse2");
                content.endText();

                // tt full embedding but only WinAnsiEncoding
                PDFont trueTypeFont = PDTrueTypeFont.load(document, ttf, WinAnsiEncoding.INSTANCE);
                content.beginText();
                content.newLineAtOffset(10, 600);
                content.setFont(trueTypeFont, 10);
                content.showText("testMultipleFontFileReuse3");
                content.endText();
            }

            document.save(pdfBytes);
        }
        return pdfBytes.toByteArray();
    }

    /**
     * Check that a font file can be deleted after it has been used to create a PDF, and that
     * the text of the saved PDF can still be extracted.
     *
     * @throws IOException if copying, saving, deleting or loading fails
     */
    @Test
    void testDeleteFont() throws IOException
    {
        String expectedText = "Test PDFBOX-4823";
        File fontCopy = new File(OUT_DIR, "LiberationSans-Regular.ttf");
        File pdfFile = new File(OUT_DIR, "testDeleteFont.pdf");

        // copy the font from the resources to a file that can be deleted
        try (InputStream fontResource = PDFont.class.getResourceAsStream(
                "/org/apache/pdfbox/resources/ttf/LiberationSans-Regular.ttf"))
        {
            Files.copy(fontResource, fontCopy.toPath(), StandardCopyOption.REPLACE_EXISTING);
        }

        // create a PDF that uses the copied font
        try (PDDocument document = new PDDocument())
        {
            PDPage firstPage = new PDPage();
            document.addPage(firstPage);
            try (PDPageContentStream content = new PDPageContentStream(document, firstPage))
            {
                PDFont copiedFont = PDType0Font.load(document, fontCopy);
                content.beginText();
                content.setFont(copiedFont, 50);
                content.newLineAtOffset(50, 700);
                content.showText(expectedText);
                content.endText();
            }
            document.save(pdfFile);
        }

        // throws if the font file cannot be deleted
        Files.delete(fontCopy.toPath());

        try (PDDocument reloaded = Loader.loadPDF(pdfFile))
        {
            String extracted = new PDFTextStripper().getText(reloaded);
            assertEquals(expectedText, extracted.trim());
        }

        Files.delete(pdfFile.toPath());
    }

    /**
     * PDFBOX-5115: U+00AD (soft hyphen) should work with WinAnsiEncoding. For a standard 14
     * font and for an embedded type 0 font, the soft hyphen must be as wide as "-" and must
     * come back unchanged from text extraction.
     *
     * @throws IOException if the font cannot be loaded or the document cannot be saved or read
     */
    @Test
    void testSoftHyphen() throws IOException
    {
        String text = "- \u00AD";
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        // the font stream is closed together with the document instead of being leaked
        try (InputStream is = 
                PDFontTest.class.getResourceAsStream("/org/apache/pdfbox/resources/ttf/LiberationSans-Regular.ttf");
                PDDocument doc = new PDDocument())
        {
            PDPage page = new PDPage();
            doc.addPage(page);
            PDFont font1 = new PDType1Font(FontName.HELVETICA);
            PDFont font2 = PDType0Font.load(doc, is);

            assertEquals(font1.getStringWidth("-"), font1.getStringWidth("\u00AD"));
            assertEquals(font2.getStringWidth("-"), font2.getStringWidth("\u00AD"));

            try (PDPageContentStream cs = new PDPageContentStream(doc, page))
            {
                cs.beginText();
                cs.newLineAtOffset(100, 500);
                cs.setFont(font1, 10);
                cs.showText(text);
                cs.newLineAtOffset(0, 100);
                cs.setFont(font2, 10);
                cs.showText(text);
                cs.endText();
            }
            doc.save(baos);
        }
        
        try (PDDocument doc = Loader.loadPDF(baos.toByteArray()))
        {
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setLineSeparator("\n");
            String extractedText = stripper.getText(doc);
            assertEquals(text + "\n" + text, extractedText.trim());
        }
    }

    /**
     * Test font with an unusual cmap table combination (0, 3): the glyph path requested by
     * name ("oslash") and by code (248) must be non-empty and cover the same area.
     *
     * @throws IOException if the font cannot be parsed or loaded
     */
    @Test
    void testPDFBox5484() throws IOException
    {
        File fontFile = new File("target/fonts", "PDFBOX-5484.ttf");
        // the parsed font is closed after the document, as in testPDFBox3826()
        try (TrueTypeFont ttf = new TTFParser().parse(new RandomAccessReadBufferedFile(fontFile));
                PDDocument doc = new PDDocument())
        {
            PDTrueTypeFont tr = PDTrueTypeFont.load(doc, ttf, WinAnsiEncoding.INSTANCE);
            GeneralPath path1 = tr.getPath("oslash");
            GeneralPath path2 = tr.getPath(248);
            assertFalse(path2.getPathIterator(null).isDone()); // not empty
            assertTrue(new Area(path1).equals(new Area(path2))); // assertEquals does not test equals()
        }
    }

    /**
     * PDFBOX-5920: check the width of a sample sentence and of the space character for a fully
     * embedded type 0 font.
     *
     * @throws IOException if the font cannot be loaded
     */
    @Test
    void PDFBOX5920Type0() throws IOException
    {
        String sentence = "The quick brown fox jumps over the lazy dog.";
        try (InputStream fontStream = PDFontTest.class.getResourceAsStream(
                "/org/apache/pdfbox/resources/ttf/LiberationSans-Regular.ttf");
                PDDocument doc = new PDDocument())
        {
            PDFont type0Font = PDType0Font.load(doc, fontStream, false);
            float sentenceWidth = type0Font.getStringWidth(sentence);
            assertEquals(20064.0f, sentenceWidth);
            float spaceWidth = type0Font.getSpaceWidth();
            assertEquals(278.0f, spaceWidth);
        }
    }

    /**
     * PDFBOX-5920: check the width of a sample sentence and of the space character for a
     * TrueType font with WinAnsiEncoding.
     *
     * @throws IOException if the font cannot be loaded
     */
    @Test
    void PDFBOX5920TrueType() throws IOException
    {
        String sentence = "The quick brown fox jumps over the lazy dog.";
        try (InputStream fontStream = PDFontTest.class.getResourceAsStream(
                "/org/apache/pdfbox/resources/ttf/LiberationSans-Regular.ttf");
                PDDocument doc = new PDDocument())
        {
            PDFont trueTypeFont = PDTrueTypeFont.load(doc, fontStream, WinAnsiEncoding.INSTANCE);
            float sentenceWidth = trueTypeFont.getStringWidth(sentence);
            assertEquals(20064.0f, sentenceWidth);
            float spaceWidth = trueTypeFont.getSpaceWidth();
            assertEquals(278.0f, spaceWidth);
        }
    }
}