/src/leptonica/src/pdfio1.c

Source (jump to first uncovered line)
/*====================================================================*
 -  Copyright (C) 2001 Leptonica.  All rights reserved.
 -
 -  Redistribution and use in source and binary forms, with or without
 -  modification, are permitted provided that the following conditions
 -  are met:
 -  1. Redistributions of source code must retain the above copyright
 -     notice, this list of conditions and the following disclaimer.
 -  2. Redistributions in binary form must reproduce the above
 -     copyright notice, this list of conditions and the following
 -     disclaimer in the documentation and/or other materials
 -     provided with the distribution.
 -
 -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
 -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *====================================================================*/

/*!
 * \file pdfio1.c
 * <pre>
 *
 *    Higher-level operations for generating pdf from images.
 *    Use poppler's pdftoppm or pdfimages to invert the process,
 *    extracting raster images from pdf.
 *
 *    |=============================================================|
 *    |                        Important notes                      |
 *    |=============================================================|
 *    | Some of these functions require I/O libraries such as       |
 *    | libtiff, libjpeg, libpng, libz and libopenjp2.  If you do   |
 *    | not have these libraries, some calls will fail.  For        |
 *    | example, if you do not have libopenjp2, you cannot write a  |
 *    | pdf where transcoding is required to incorporate a          |
 *    | jp2k image.                                                 |
 *    |                                                             |
 *    | You can manually deactivate all pdf writing by setting      |
 *    | this in environ.h:                                          |
 *    | \code                                                       |
 *    |      #define  USE_PDFIO     0                               |
 *    | \endcode                                                    |
 *    | This will link the stub file pdfiostub.c.                   |
 *    |=============================================================|
 *
 *     Set 1. These functions convert a set of image files
 *     to a multi-page pdf file, with one image on each page.
 *     All images are rendered at the same (input) resolution.
 *     The images can be specified as being in a directory, or they
 *     can be in an sarray.  The output pdf can be either a file
 *     or an array of bytes in memory.
 *
 *     Set 2. These functions are a special case of set 1, where
 *     no scaling or change in quality is required.  For jpeg, jp2k and
 *     tiffg4 images, the bytes in each file can be directly incorporated
 *     into the output pdf, and the wrapping up of multiple image
 *     files is very fast.  For non-interlaced png, the data bytes
 *     including the predictors can also be written directly into the
 *     flate pdf data.  For other image formats transcoding is required,
 *     where the image data is first decompressed and then flate (gzip),
 *     DCT (jpeg) or tiffg4 (1 bpp) encodings are generated.
 *
 *     Set 3. These functions convert a set of images in memory
 *     to a multi-page pdf, with one image on each page.  The pdf
 *     output can be either a file or an array of bytes in memory.
 *
 *     Set 4. These functions implement a pdf output "device driver"
 *     for wrapping (encoding) any number of images on a single page
 *     in pdf.  The input can be either an image file or a Pix;
 *     the pdf output can be either a file or an array of bytes in memory.
 *
 *     Set 5. These "segmented" functions take a set of image
 *     files, along with optional segmentation information, and
 *     generate a multi-page pdf file, where each page consists
 *     in general of a mixed raster pdf of image and non-image regions.
 *     The segmentation information for each page can be input as
 *     either a mask over the image parts, or as a Boxa of those
 *     regions.
 *
 *     Set 6. These "segmented" functions convert an image and
 *     an optional Boxa of image regions into a mixed raster pdf file
 *     for the page.  The input image can be either a file or a Pix.
 *
 *     Set 7. These functions take a set of single-page pdf files
 *     and concatenates it into a multi-page pdf.  The input can be
 *     a set of either single page pdf files or pdf 'strings' in memory.
 *     The output can be either a file or an array of bytes in memory.
 *
 *     The images in the pdf file can be rendered using a pdf viewer,
 *     such as evince, gv, xpdf or acroread.
 *
 *     Reference on the pdf file format:
 *         http://www.adobe.com/devnet/pdf/pdf_reference_archive.html
 *
 *     1. Convert specified image files to pdf (one image file per page)
 *          l_int32             convertFilesToPdf()
 *          l_int32             saConvertFilesToPdf()
 *          l_int32             saConvertFilesToPdfData()
 *          l_int32             selectDefaultPdfEncoding()
 *
 *     2. Convert specified image files to pdf without scaling
 *          l_int32             convertUnscaledFilesToPdf()
 *          l_int32             saConvertUnscaledFilesToPdf()
 *          l_int32             saConvertUnscaledFilesToPdfData()
 *          l_int32             convertUnscaledToPdfData()
 *
 *     3. Convert multiple images to pdf (one image per page)
 *          l_int32             pixaConvertToPdf()
 *          l_int32             pixaConvertToPdfData()
 *
 *     4. Single page, multi-image converters
 *          l_int32             convertToPdf()
 *          l_int32             convertImageDataToPdf()
 *          l_int32             convertToPdfData()
 *          l_int32             convertImageDataToPdfData()
 *          l_int32             pixConvertToPdf()
 *          l_int32             pixWriteStreamPdf()
 *          l_int32             pixWriteMemPdf()
 *
 *     5. Segmented multi-page, multi-image converter
 *          l_int32             convertSegmentedFilesToPdf()
 *          BOXAA              *convertNumberedMasksToBoxaa()
 *
 *     6. Segmented single page, multi-image converters
 *          l_int32             convertToPdfSegmented()
 *          l_int32             pixConvertToPdfSegmented()
 *          l_int32             convertToPdfDataSegmented()
 *          l_int32             pixConvertToPdfDataSegmented()
 *
 *     7. Multipage concatenation
 *          l_int32             concatenatePdf()
 *          l_int32             saConcatenatePdf()
 *          l_int32             ptraConcatenatePdf()
 *          l_int32             concatenatePdfToData()
 *          l_int32             saConcatenatePdfToData()
 *
 *     The top-level multi-image functions can be visualized as follows:
 *          Output pdf data to file:
 *             convertToPdf()  and  convertImageDataToPdf()
 *                     --> pixConvertToPdf()
 *                           --> pixConvertToPdfData()
 *
 *          Output pdf data to array in memory:
 *             convertToPdfData()  and  convertImageDataToPdfData()
 *                     --> pixConvertToPdfData()
 *
 *     The top-level segmented image functions can be visualized as follows:
 *          Output pdf data to file:
 *             convertToPdfSegmented()
 *                     --> pixConvertToPdfSegmented()
 *                           --> pixConvertToPdfDataSegmented()
 *
 *          Output pdf data to array in memory:
 *             convertToPdfDataSegmented()
 *                     --> pixConvertToPdfDataSegmented()
 *
 *     For multi-page concatenation, there are three different types of input
 *        (1) directory and optional filename filter
 *        (2) sarray of filenames
 *        (3) ptra of byte arrays of pdf data
 *     and two types of output for the concatenated pdf data
 *        (1) filename
 *        (2) data array and size
 *     High-level interfaces are given for each of the six combinations.
 *
 *     Note: When wrapping small images into pdf, it is useful to give
 *     them a relatively low resolution value, to avoid rounding errors
 *     when rendering the images.  For example, if you want an image
 *     of width w pixels to be 5 inches wide on a screen, choose a
 *     resolution w/5.
 *
 *     The very fast functions in section (2) require neither transcoding
 *     nor parsing of the compressed jpeg file.  With three types of image
 *     compression, the compressed strings can be incorporated into
 *     the pdf data without decompression and re-encoding: jpeg, jp2k
 *     and png.  The DCTDecode and JPXDecode filters can handle the
 *     entire jpeg and jp2k encoded string as a byte array in the pdf file.
 *     The FlateDecode filter can handle the png compressed image data,
 *     including predictors that occur as the first byte in each
 *     raster line, but it is necessary to store only the png IDAT chunk
 *     data in the pdf array.  The alternative for wrapping png images
 *     is to transcode them: uncompress into a raster (a pix) and then
 *     gzip the raster data.  This typically results in a larger pdf file
 *     because it doesn't use the two-dimensional png predictor.
 *     Colormaps, which are found in png PLTE chunks, must always be
 *     pulled out and included separately in the pdf.  For CCITT-G4
 *     compression, you can not simply include a tiff G4 file -- you must
 *     either parse it and extract the G4 compressed data within it,
 *     or uncompress to a raster and G4 compress again.
 * </pre>
 */

#ifdef HAVE_CONFIG_H
#include <config_auto.h>
#endif  /* HAVE_CONFIG_H */

#include <string.h>
#include <math.h>
#include "allheaders.h"

/* --------------------------------------------*/
#if  USE_PDFIO   /* defined in environ.h */
 /* --------------------------------------------*/

    /* Typical scan resolution in ppi (pixels/inch) */
static const l_int32  DefaultInputRes = 300;

/*---------------------------------------------------------------------*
 *    Convert specified image files to pdf (one image file per page)   *
 *---------------------------------------------------------------------*/
/*!
 * \brief   convertFilesToPdf()
 *
 * \param[in]    dirname       directory name containing images
 * \param[in]    substr        [optional] substring filter on filenames;
 *                             can be null
 * \param[in]    res           input resolution of all images
 * \param[in]    scalefactor   scaling factor applied to each image; > 0.0
 * \param[in]    type          encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
 *                             L_FLATE_ENCODE, L_JP2K_ENCODE or
 *                             L_DEFAULT_ENCODE for default)
 * \param[in]    quality       for jpeg: 1-100; 0 for default (75)
 *                             for jp2k: 27-45; 0 for default (34)
 * \param[in]    title         [optional] pdf title; can be null
 * \param[in]    fileout       pdf file of all images
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) If %substr is not NULL, only image filenames that contain
 *          the substring can be used.  If %substr == NULL, all files
 *          in the directory are used.
 *      (2) The files in the directory, after optional filtering by
 *          the substring, are lexically sorted in increasing order
 *          before concatenation.
 *      (3) The scalefactor is applied to each image before encoding.
 *          If you enter a value <= 0.0, it will be set to 1.0.
 *      (4) Specifying one of the four encoding types for %type forces
 *          all images to be compressed with that type.  Use 0 to have
 *          the type determined for each image based on depth and whether
 *          or not it has a colormap.
 * </pre>
 */
l_ok
convertFilesToPdf(const char  *dirname,
                  const char  *substr,
                  l_int32      res,
                  l_float32    scalefactor,
                  l_int32      type,
                  l_int32      quality,
                  const char  *title,
                  const char  *fileout)
{
l_int32  ret;
SARRAY  *sa;

    if (!dirname)
        return ERROR_INT("dirname not defined", __func__, 1);
    if (!fileout)
        return ERROR_INT("fileout not defined", __func__, 1);

    if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL)
        return ERROR_INT("sa not made", __func__, 1);
    ret = saConvertFilesToPdf(sa, res, scalefactor, type, quality,
                              title, fileout);
    sarrayDestroy(&sa);
    return ret;
}


/*!
 * \brief   saConvertFilesToPdf()
 *
 * \param[in]    sa            string array of pathnames for images
 * \param[in]    res           input resolution of all images
 * \param[in]    scalefactor   scaling factor applied to each image; > 0.0
 * \param[in]    type          encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
 *                             L_FLATE_ENCODE, L_JP2K_ENCODE or
 *                             L_DEFAULT_ENCODE for default)
 * \param[in]    quality       for jpeg: 1-100; 0 for default (75)
 *                             for jp2k: 27-45; 0 for default (34)
 * \param[in]    title         [optional] pdf title; can be null
 * \param[in]    fileout       pdf file of all images
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) See convertFilesToPdf().
 * </pre>
 */
l_ok
saConvertFilesToPdf(SARRAY      *sa,
                    l_int32      res,
                    l_float32    scalefactor,
                    l_int32      type,
                    l_int32      quality,
                    const char  *title,
                    const char  *fileout)
{
l_uint8  *data;
l_int32   ret;
size_t    nbytes;

    if (!sa)
        return ERROR_INT("sa not defined", __func__, 1);

    ret = saConvertFilesToPdfData(sa, res, scalefactor, type, quality,
                                  title, &data, &nbytes);
    if (ret) {
        if (data) LEPT_FREE(data);
        return ERROR_INT("pdf data not made", __func__, 1);
    }

    ret = l_binaryWrite(fileout, "w", data, nbytes);
    LEPT_FREE(data);
    if (ret)
        L_ERROR("pdf data not written to file\n", __func__);
    return ret;
}


/*!
 * \brief   saConvertFilesToPdfData()
 *
 * \param[in]    sa            string array of pathnames for images
 * \param[in]    res           input resolution of all images
 * \param[in]    scalefactor   scaling factor applied to each image; > 0.0
 * \param[in]    type          encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
 *                             L_FLATE_ENCODE, L_JP2K_ENCODE or
 *                             L_DEFAULT_ENCODE for default)
 * \param[in]    quality       for jpeg: 1-100; 0 for default (75)
 *                             for jp2k: 27-45; 0 for default (34)
 * \param[in]    title         [optional] pdf title; can be null
 * \param[out]   pdata         output pdf data (of all images
 * \param[out]   pnbytes       size of output pdf data
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) See convertFilesToPdf().
 * </pre>
 */
l_ok
saConvertFilesToPdfData(SARRAY      *sa,
                        l_int32      res,
                        l_float32    scalefactor,
                        l_int32      type,
                        l_int32      quality,
                        const char  *title,
                        l_uint8    **pdata,
                        size_t      *pnbytes)
{
char     *fname;
l_uint8  *imdata;
l_int32   i, n, ret, pagetype, npages, scaledres;
size_t    imbytes;
L_BYTEA  *ba;
PIX      *pixs, *pix;
L_PTRA   *pa_data;

    if (!pdata)
        return ERROR_INT("&data not defined", __func__, 1);
    *pdata = NULL;
    if (!pnbytes)
        return ERROR_INT("&nbytes not defined", __func__, 1);
    *pnbytes = 0;
    if (!sa)
        return ERROR_INT("sa not defined", __func__, 1);
    if (scalefactor <= 0.0) scalefactor = 1.0;
    if (type != L_JPEG_ENCODE && type != L_G4_ENCODE &&
        type != L_FLATE_ENCODE && type != L_JP2K_ENCODE) {
        type = L_DEFAULT_ENCODE;
    }

        /* Generate all the encoded pdf strings */
    n = sarrayGetCount(sa);
    pa_data = ptraCreate(n);
    for (i = 0; i < n; i++) {
        if (i && (i % 10 == 0)) lept_stderr(".. %d ", i);
        fname = sarrayGetString(sa, i, L_NOCOPY);
        if ((pixs = pixRead(fname)) == NULL) {
            L_ERROR("image not readable from file %s\n", __func__, fname);
            continue;
        }
        if (scalefactor != 1.0)
            pix = pixScale(pixs, scalefactor, scalefactor);
        else
            pix = pixClone(pixs);
        pixDestroy(&pixs);
        scaledres = (l_int32)(res * scalefactor);

            /* Select the encoding type */
        if (type != L_DEFAULT_ENCODE) {
            pagetype = type;
        } else if (selectDefaultPdfEncoding(pix, &pagetype) != 0) {
            pixDestroy(&pix);
            L_ERROR("encoding type selection failed for file %s\n",
                    __func__, fname);
            continue;
        }

        ret = pixConvertToPdfData(pix, pagetype, quality, &imdata, &imbytes,
                                  0, 0, scaledres, title, NULL, 0);
        pixDestroy(&pix);
        if (ret) {
            LEPT_FREE(imdata);
            L_ERROR("pdf encoding failed for %s\n", __func__, fname);
            continue;
        }
        ba = l_byteaInitFromMem(imdata, imbytes);
        LEPT_FREE(imdata);
        ptraAdd(pa_data, ba);
    }
    ptraGetActualCount(pa_data, &npages);
    if (npages == 0) {
        L_ERROR("no pdf files made\n", __func__);
        ptraDestroy(&pa_data, FALSE, FALSE);
        return 1;
    }

        /* Concatenate them */
    lept_stderr("\nconcatenating ... ");
    ret = ptraConcatenatePdfToData(pa_data, NULL, pdata, pnbytes);
    lept_stderr("done\n");

    ptraGetActualCount(pa_data, &npages);  /* recalculate in case it changes */
    for (i = 0; i < npages; i++) {
        ba = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
        l_byteaDestroy(&ba);
    }
    ptraDestroy(&pa_data, FALSE, FALSE);
    return ret;
}


/*!
 * \brief   selectDefaultPdfEncoding()
 *
 * \param[in]    pix
 * \param[out]   ptype     L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) This attempts to choose an encoding for the pix that results
 *          in the smallest file, assuming that if jpeg encoded, it will
 *          use quality = 75.  The decision is approximate, in that
 *          (a) all colormapped images will be losslessly encoded with
 *          gzip (flate), and (b) an image with less than about 20 colors
 *          is likely to be smaller if flate encoded than if encoded
 *          as a jpeg (dct).  For example, an image made by pixScaleToGray3()
 *          will have 10 colors, and flate encoding will give about
 *          twice the compression as jpeg with quality = 75.
 *      (2) We could have used L_JP2K_ENCODE instead of L_JPEG_ENCODE.
 *          However, the jp2k compression is not much better than jpeg, and
 *          the jpeg library is more commonly available than the jp2k library.
 * </pre>
 */
l_ok
selectDefaultPdfEncoding(PIX      *pix,
                         l_int32  *ptype)
{
l_int32   w, h, d, factor, ncolors;
PIXCMAP  *cmap;

    if (!ptype)
        return ERROR_INT("&type not defined", __func__, 1);
    *ptype = L_FLATE_ENCODE;  /* default universal encoding */
    if (!pix)
        return ERROR_INT("pix not defined", __func__, 1);
    pixGetDimensions(pix, &w, &h, &d);
    cmap = pixGetColormap(pix);
    if (d == 8 && !cmap) {
        factor = L_MAX(1, (l_int32)sqrt((l_float64)(w * h) / 20000.));
        pixNumColors(pix, factor, &ncolors);
        if (ncolors < 20)
            *ptype = L_FLATE_ENCODE;
        else
            *ptype = L_JPEG_ENCODE;
    } else if (d == 1) {
        *ptype = L_G4_ENCODE;
    } else if (cmap || d == 2 || d == 4) {
        *ptype = L_FLATE_ENCODE;
    } else if (d == 8 || d == 32) {
        *ptype = L_JPEG_ENCODE;
    } else if (d == 16) {
        *ptype = L_FLATE_ENCODE;
    } else {
        return ERROR_INT("type selection failure", __func__, 1);
    }

    return 0;
}


/*---------------------------------------------------------------------*
 *          Convert specified image files to pdf without scaling       *
 *---------------------------------------------------------------------*/
/*!
 * \brief   convertUnscaledFilesToPdf()
 *
 * \param[in]    dirname   directory name containing images
 * \param[in]    substr    [optional] substring filter on filenames;
 *                         can be null
 * \param[in]    title     [optional] pdf title; can be null
 * \param[in]    fileout   pdf file of all images
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) If %substr is not NULL, only image filenames that contain
 *          the substring can be used.  If %substr == NULL, all files
 *          in the directory are used.
 *      (2) The files in the directory, after optional filtering by
 *          the substring, are lexically sorted in increasing order
 *          before concatenation.
 *      (3) This is very fast for jpeg, jp2k and some png files,
 *          because the compressed data is wrapped up and concatenated.
 *          For other types of png, the images must be read and recompressed.
 * </pre>
 */
l_ok
convertUnscaledFilesToPdf(const char  *dirname,
                          const char  *substr,
                          const char  *title,
                          const char  *fileout)
{
l_int32  ret;
SARRAY  *sa;

    if (!dirname)
        return ERROR_INT("dirname not defined", __func__, 1);
    if (!fileout)
        return ERROR_INT("fileout not defined", __func__, 1);

    if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL)
        return ERROR_INT("sa not made", __func__, 1);
    ret = saConvertUnscaledFilesToPdf(sa, title, fileout);
    sarrayDestroy(&sa);
    return ret;
}


/*!
 * \brief   saConvertUnscaledFilesToPdf()
 *
 * \param[in]    sa        string array of pathnames for images
 * \param[in]    title     [optional] pdf title; can be null
 * \param[in]    fileout   pdf file of all images
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) See convertUnscaledFilesToPdf().
 * </pre>
 */
l_ok
saConvertUnscaledFilesToPdf(SARRAY      *sa,
                            const char  *title,
                            const char  *fileout)
{
l_uint8  *data;
l_int32   ret;
size_t    nbytes;

    if (!sa)
        return ERROR_INT("sa not defined", __func__, 1);

    ret = saConvertUnscaledFilesToPdfData(sa, title, &data, &nbytes);
    if (ret) {
        if (data) LEPT_FREE(data);
        return ERROR_INT("pdf data not made", __func__, 1);
    }

    ret = l_binaryWrite(fileout, "w", data, nbytes);
    LEPT_FREE(data);
    if (ret)
        L_ERROR("pdf data not written to file\n", __func__);
    return ret;
}


/*!
 * \brief   saConvertUnscaledFilesToPdfData()
 *
 * \param[in]    sa        string array of pathnames for image files
 * \param[in]    title     [optional] pdf title; can be null
 * \param[out]   pdata     output pdf data (of all images)
 * \param[out]   pnbytes   size of output pdf data
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) This is very fast for jpeg, jp2k and some png files,
 *          because the compressed data is wrapped up and concatenated.
 *          For other types of png, the images must be read and recompressed.
 * </pre>
 */
l_ok
saConvertUnscaledFilesToPdfData(SARRAY      *sa,
                                const char  *title,
                                l_uint8    **pdata,
                                size_t      *pnbytes)
{
char         *fname;
l_uint8      *imdata;
l_int32       i, n, ret, npages;
size_t        imbytes;
L_BYTEA      *ba;
L_PTRA       *pa_data;

    if (!pdata)
        return ERROR_INT("&data not defined", __func__, 1);
    *pdata = NULL;
    if (!pnbytes)
        return ERROR_INT("&nbytes not defined", __func__, 1);
    *pnbytes = 0;
    if (!sa)
        return ERROR_INT("sa not defined", __func__, 1);

        /* Generate all the encoded pdf strings */
    n = sarrayGetCount(sa);
    pa_data = ptraCreate(n);
    for (i = 0; i < n; i++) {
        if (i && (i % 10 == 0)) lept_stderr(".. %d ", i);
        fname = sarrayGetString(sa, i, L_NOCOPY);

            /* Generate the pdf data */
        if (convertUnscaledToPdfData(fname, title, &imdata, &imbytes))
            continue;

            /* ... and add it to the array of single page data */
        ba = l_byteaInitFromMem(imdata, imbytes);
        if (imdata) LEPT_FREE(imdata);
        ptraAdd(pa_data, ba);
    }
    ptraGetActualCount(pa_data, &npages);
    if (npages == 0) {
        L_ERROR("no pdf files made\n", __func__);
        ptraDestroy(&pa_data, FALSE, FALSE);
        return 1;
    }

        /* Concatenate to generate a multipage pdf */
    lept_stderr("\nconcatenating ... ");
    ret = ptraConcatenatePdfToData(pa_data, NULL, pdata, pnbytes);
    lept_stderr("done\n");

        /* Clean up */
    ptraGetActualCount(pa_data, &npages);  /* maybe failed to read some files */
    for (i = 0; i < npages; i++) {
        ba = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
        l_byteaDestroy(&ba);
    }
    ptraDestroy(&pa_data, FALSE, FALSE);
    return ret;
}


/*!
 * \brief   convertUnscaledToPdfData()
 *
 * \param[in]    fname      of image file in all formats
 * \param[in]    title      [optional] pdf title; can be null
 * \param[out]   pdata      output pdf data for image
 * \param[out]   pnbytes    size of output pdf data
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) This is very fast for jpeg, jp2k and some png files,
 *          because the compressed data is wrapped up and concatenated.
 *          For other types of png, the images must be read and recompressed.
 * </pre>
 */
l_ok
convertUnscaledToPdfData(const char  *fname,
                         const char  *title,
                         l_uint8    **pdata,
                         size_t      *pnbytes)
{
l_int32       format;
L_COMP_DATA  *cid;

    if (!pdata)
        return ERROR_INT("&data not defined", __func__, 1);
    *pdata = NULL;
    if (!pnbytes)
        return ERROR_INT("&nbytes not defined", __func__, 1);
    *pnbytes = 0;
    if (!fname)
        return ERROR_INT("fname not defined", __func__, 1);

    findFileFormat(fname, &format);
    if (format == IFF_UNKNOWN) {
        L_WARNING("file %s format is unknown; skip\n", __func__, fname);
        return 1;
    }
    if (format == IFF_PS || format == IFF_LPDF) {
        L_WARNING("file %s format is %d; skip\n", __func__, fname, format);
        return 1;
    }

        /* Generate the image data required for pdf generation, always
         * in binary (not ascii85) coding.  Note that jpeg, jp2k and some
         * png files are not transcoded.  */
    l_generateCIDataForPdf(fname, NULL, 0, &cid);
    if (!cid) {
        L_ERROR("file %s format is %d; unreadable\n", __func__, fname, format);
        return 1;
    }

        /* Generate the pdf string for this page (image).  This destroys
         * the cid by attaching it to an lpd and destroying the lpd. */
    cidConvertToPdfData(cid, title, pdata, pnbytes);
    return 0;
}


/*---------------------------------------------------------------------*
 *          Convert multiple images to pdf (one image per page)        *
 *---------------------------------------------------------------------*/
/*!
 * \brief   pixaConvertToPdf()
 *
 * \param[in]    pixa          containing images all at the same resolution
 * \param[in]    res           override the resolution of each input image,
 *                             in ppi; use 0 to respect the resolution
 *                             embedded in the input images
 * \param[in]    scalefactor   scaling factor applied to each image; > 0.0
 * \param[in]    type          encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
 *                             L_FLATE_ENCODE, L_JP2K_ENCODE, or
 *                             L_DEFAULT_ENCODE for default)
 * \param[in]    quality       for jpeg: 1-100; 0 for default (75)
 *                             for jp2k: 27-45; 0 for default (34)
 * \param[in]    title         [optional] pdf title; can be null
 * \param[in]    fileout       pdf file of all images
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) The images are encoded with G4 if 1 bpp; JPEG if 8 bpp without
 *          colormap and many colors, or 32 bpp; FLATE for anything else.
 *      (2) The scalefactor must be > 0.0; otherwise it is set to 1.0.
 *      (3) Specifying one of the three encoding types for %type forces
 *          all images to be compressed with that type.  Use 0 to have
 *          the type determined for each image based on depth and whether
 *          or not it has a colormap.
 * </pre>
 */
l_ok
pixaConvertToPdf(PIXA        *pixa,
                 l_int32      res,
                 l_float32    scalefactor,
                 l_int32      type,
                 l_int32      quality,
                 const char  *title,
                 const char  *fileout)
{
l_uint8  *data;
l_int32   ret;
size_t    nbytes;

    if (!pixa)
        return ERROR_INT("pixa not defined", __func__, 1);

    ret = pixaConvertToPdfData(pixa, res, scalefactor, type, quality,
                               title, &data, &nbytes);
    if (ret) {
        LEPT_FREE(data);
        return ERROR_INT("conversion to pdf failed", __func__, 1);
    }

    ret = l_binaryWrite(fileout, "w", data, nbytes);
    LEPT_FREE(data);
    if (ret)
        L_ERROR("pdf data not written to file\n", __func__);
    return ret;
}


/*!
 * \brief   pixaConvertToPdfData()
 *
 * \param[in]    pixa           containing images all at the same resolution
 * \param[in]    res            input resolution of all images
 * \param[in]    scalefactor    scaling factor applied to each image; > 0.0; <50
 * \param[in]    type           encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
 *                              L_FLATE_ENCODE, L_JP2K_ENCODE, or
 *                              L_DEFAULT_ENCODE for default)
 * \param[in]    quality        for jpeg: 1-100; 0 for default (75)
 *                              for jp2k: 27-45; 0 for default (34)
 * \param[in]    title          [optional] pdf title; can be null
 * \param[out]   pdata          output pdf data of all images
 * \param[out]   pnbytes        size of output pdf data
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) See pixaConvertToPdf().
 * </pre>
 */
l_ok
pixaConvertToPdfData(PIXA        *pixa,
                     l_int32      res,
                     l_float32    scalefactor,
                     l_int32      type,
                     l_int32      quality,
                     const char  *title,
                     l_uint8    **pdata,
                     size_t      *pnbytes)
{
l_uint8  *imdata;
l_int32   i, n, ret, scaledres, pagetype;
size_t    imbytes;
L_BYTEA  *ba;
PIX      *pixs, *pix;
L_PTRA   *pa_data;

    if (!pdata)
        return ERROR_INT("&data not defined", __func__, 1);
    *pdata = NULL;
    if (!pnbytes)
        return ERROR_INT("&nbytes not defined", __func__, 1);
    *pnbytes = 0;
    if (!pixa)
        return ERROR_INT("pixa not defined", __func__, 1);
    if (scalefactor <= 0.0) scalefactor = 1.0;
    if (scalefactor >= 50.0)
        return ERROR_INT("scalefactor too large", __func__, 1);
    if (type != L_DEFAULT_ENCODE && type != L_JPEG_ENCODE &&
        type != L_G4_ENCODE && type != L_FLATE_ENCODE &&
        type != L_JP2K_ENCODE) {
        L_WARNING("invalid compression type; using per-page default\n",
                  __func__);
        type = L_DEFAULT_ENCODE;
    }
    if (quality < 0 || quality > 100)
        return ERROR_INT("invalid quality", __func__, 1);

        /* Generate all the encoded pdf strings */
    n = pixaGetCount(pixa);
    pa_data = ptraCreate(n);
    for (i = 0; i < n; i++) {
        if ((pixs = pixaGetPix(pixa, i, L_CLONE)) == NULL) {
            L_ERROR("pixs[%d] not retrieved\n", __func__, i);
            continue;
        }
        if (scalefactor != 1.0)
            pix = pixScale(pixs, scalefactor, scalefactor);
        else
            pix = pixClone(pixs);
        pixDestroy(&pixs);
        if (!pix) {
            L_ERROR("pix[%d] not made\n", __func__, i);
            continue;
        }
        scaledres = (l_int32)(res * scalefactor);

            /* Select the encoding type */
        if (type != L_DEFAULT_ENCODE) {
            pagetype = type;
        } else if (selectDefaultPdfEncoding(pix, &pagetype) != 0) {
            L_ERROR("encoding type selection failed for pix[%d]\n",
                        __func__, i);
            pixDestroy(&pix);
            continue;
        }

        ret = pixConvertToPdfData(pix, pagetype, quality, &imdata, &imbytes,
                                  0, 0, scaledres, title, NULL, 0);
        pixDestroy(&pix);
        if (ret) {
            LEPT_FREE(imdata);
            L_ERROR("pdf encoding failed for pix[%d]\n", __func__, i);
            continue;
        }
        ba = l_byteaInitFromMem(imdata, imbytes);
        LEPT_FREE(imdata);
        ptraAdd(pa_data, ba);
    }
    ptraGetActualCount(pa_data, &n);
    if (n == 0) {
        L_ERROR("no pdf files made\n", __func__);
        ptraDestroy(&pa_data, FALSE, FALSE);
        return 1;
    }

        /* Concatenate them */
    ret = ptraConcatenatePdfToData(pa_data, NULL, pdata, pnbytes);

    ptraGetActualCount(pa_data, &n);  /* recalculate in case it changes */
    for (i = 0; i < n; i++) {
        ba = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
        l_byteaDestroy(&ba);
    }
    ptraDestroy(&pa_data, FALSE, FALSE);
    return ret;
}


/*---------------------------------------------------------------------*
 *                Single page, multi-image converters                  *
 *---------------------------------------------------------------------*/
/*!
 * \brief   convertToPdf()
 *
 * \param[in]      filein       input image file -- any format
 * \param[in]      type         encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
 *                              L_FLATE_ENCODE, or L_JP2K_ENCODE)
 * \param[in]      quality      for jpeg: 1-100; 0 for default (75)
 *                              for jp2k: 27-45; 0 for default (34)
 * \param[in]      fileout      output pdf file; only required on last
 *                              image on page
 * \param[in]      x, y         location of lower-left corner of image,
 *                              in pixels, relative to the PostScript origin
 *                              (0,0) at the lower-left corner of the page
 * \param[in]      res          override the resolution of the input image,
 *                              in ppi; use 0 to respect the resolution
 *                              embedded in the input images
 * \param[in]      title        [optional] pdf title; can be null
 * \param[in,out]  plpd         ptr to lpd, which is created on the first
 *                              invocation and returned until last image is
 *                              processed, at which time it is destroyed
 * \param[in]      position     in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
 *                              L_LAST_IMAGE
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) To wrap only one image in pdf, input %plpd = NULL, and
 *          the value of %position will be ignored:
 *            convertToPdf(...  type, quality, x, y, res, NULL, 0);
 *      (2) To wrap multiple images on a single pdf page, this is called
 *          once for each successive image.  Do it this way:
 *            L_PDF_DATA   *lpd;
 *            convertToPdf(...  type, quality, x, y, res, &lpd, L_FIRST_IMAGE);
 *            convertToPdf(...  type, quality, x, y, res, &lpd, L_NEXT_IMAGE);
 *            ...
 *            convertToPdf(...  type, quality, x, y, res, &lpd, L_LAST_IMAGE);
 *          This will write the result to the value of %fileout specified
 *          in the first call; succeeding values of %fileout are ignored.
 *          On the last call: the pdf data bytes are computed and written
 *          to %fileout, lpd is destroyed internally, and the returned
 *          value of lpd is null.  So the client has nothing to clean up.
 *      (3) (a) Set %res == 0 to respect the resolution embedded in the
 *              image file.  If no resolution is embedded, it will be set
 *              to the default value.
 *          (b) Set %res to some other value to override the file resolution.
 *      (4) (a) If the input %res and the resolution of the output device
 *              are equal, the image will be "displayed" at the same size
 *              as the original.
 *          (b) If the input %res is 72, the output device will render
 *              the image at 1 pt/pixel.
 *          (c) Some possible choices for the default input pix resolution are:
 *                 72 ppi     Render pix on any output device at one pt/pixel
 *                 96 ppi     Windows default for generated display images
 *                300 ppi     Typical default for scanned images.
 *              We choose 300, which is sensible for rendering page images.
 *              However,  images come from a variety of sources, and
 *              some are explicitly created for viewing on a display.
 * </pre>
 */
l_ok
convertToPdf(const char   *filein,
             l_int32       type,
             l_int32       quality,
             const char   *fileout,
             l_int32       x,
             l_int32       y,
             l_int32       res,
             const char   *title,
             L_PDF_DATA  **plpd,
             l_int32       position)
{
l_uint8  *data;
l_int32   ret;
size_t    nbytes;

    if (!filein)
        return ERROR_INT("filein not defined", __func__, 1);
    if (!plpd || (position == L_LAST_IMAGE)) {
        if (!fileout)
            return ERROR_INT("fileout not defined", __func__, 1);
    }

    if (convertToPdfData(filein, type, quality, &data, &nbytes, x, y,
                         res, title, plpd, position))
        return ERROR_INT("pdf data not made", __func__, 1);

    if (!plpd || (position == L_LAST_IMAGE)) {
        ret = l_binaryWrite(fileout, "w", data, nbytes);
        LEPT_FREE(data);
        if (ret)
            return ERROR_INT("pdf data not written to file", __func__, 1);
    }

    return 0;
}


/*!
 * \brief   convertImageDataToPdf()
 *
 * \param[in]      imdata       array of formatted image data; e.g., png, jpeg
 * \param[in]      size         size of image data
 * \param[in]      type         encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
 *                              L_FLATE_ENCODE, or L_JP2K_ENCODE)
 * \param[in]      quality      for jpeg: 1-100; 0 for default (75)
 *                              for jp2k: 27-45; 0 for default (34)
 * \param[in]      fileout      output pdf file; only required on last
 *                              image on page
 * \param[in]      x, y         location of lower-left corner of image,
 *                              in pixels, relative to the PostScript origin
 *                              (0,0) at the lower-left corner of the page
 * \param[in]      res          override the resolution of the input image,
 *                              in ppi; use 0 to respect the resolution
 *                              embedded in the input images
 * \param[in]      title        [optional] pdf title; can be null
 * \param[in,out]  plpd         ptr to lpd, which is created on the first
 *                              invocation and returned until last image is
 *                              processed, at which time it is destroyed
 * \param[in]      position     in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
 *                              L_LAST_IMAGE
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) If %res == 0 and the input resolution field is 0,
 *          this will use DefaultInputRes.
 *      (2) See comments in convertToPdf().
 * </pre>
 */
l_ok
convertImageDataToPdf(l_uint8      *imdata,
                      size_t        size,
                      l_int32       type,
                      l_int32       quality,
                      const char   *fileout,
                      l_int32       x,
                      l_int32       y,
                      l_int32       res,
                      const char   *title,
                      L_PDF_DATA  **plpd,
                      l_int32       position)
{
l_int32  ret;
PIX     *pix;

    if (!imdata)
        return ERROR_INT("image data not defined", __func__, 1);
    if (!plpd || (position == L_LAST_IMAGE)) {
        if (!fileout)
            return ERROR_INT("fileout not defined", __func__, 1);
    }

    if ((pix = pixReadMem(imdata, size)) == NULL)
        return ERROR_INT("pix not read", __func__, 1);
    if (type != L_JPEG_ENCODE && type != L_G4_ENCODE &&
        type != L_FLATE_ENCODE && type != L_JP2K_ENCODE) {
        selectDefaultPdfEncoding(pix, &type);
    }
    ret = pixConvertToPdf(pix, type, quality, fileout, x, y, res,
                          title, plpd, position);
    pixDestroy(&pix);
    return ret;
}


/*!
 * \brief   convertToPdfData()
 *
 * \param[in]      filein       input image file -- any format
 * \param[in]      type         encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
 *                              L_FLATE_ENCODE, or L_JP2K_ENCODE)
 * \param[in]      quality      for jpeg: 1-100; 0 for default (75)
 *                              for jp2k: 27-45; 0 for default (34)
 * \param[out]     pdata        pdf data in memory
 * \param[out]     pnbytes      number of bytes in pdf data
 * \param[in]      x, y         location of lower-left corner of image,
 *                              in pixels, relative to the PostScript origin
 *                              (0,0) at the lower-left corner of the page
 * \param[in]      res          override the resolution of the input image,
 *                              in ppi; use 0 to respect the resolution
 *                              embedded in the input images
 * \param[in]      title        [optional] pdf title; can be null
 * \param[in,out]  plpd         ptr to lpd, which is created on the first
 *                              invocation and returned until last image is
 *                              processed, at which time it is destroyed
 * \param[in]      position     in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
 *                              L_LAST_IMAGE
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) If %res == 0 and the input resolution field is 0,
 *          this will use DefaultInputRes.
 *      (2) See comments in convertToPdf().
 * </pre>
 */
l_ok
convertToPdfData(const char   *filein,
                 l_int32       type,
                 l_int32       quality,
                 l_uint8     **pdata,
                 size_t       *pnbytes,
                 l_int32       x,
                 l_int32       y,
                 l_int32       res,
                 const char   *title,
                 L_PDF_DATA  **plpd,
                 l_int32       position)
{
PIX  *pix;

    if (!pdata)
        return ERROR_INT("&data not defined", __func__, 1);
    *pdata = NULL;
    if (!pnbytes)
        return ERROR_INT("&nbytes not defined", __func__, 1);
    *pnbytes = 0;
    if (!filein)
        return ERROR_INT("filein not defined", __func__, 1);

    if ((pix = pixRead(filein)) == NULL)
        return ERROR_INT("pix not made", __func__, 1);

    pixConvertToPdfData(pix, type, quality, pdata, pnbytes,
                        x, y, res, title, plpd, position);
    pixDestroy(&pix);
    return 0;
}


/*!
 * \brief   convertImageDataToPdfData()
 *
 * \param[in]    imdata       array of formatted image data; e.g., png, jpeg
 * \param[in]    size         size of image data
 * \param[in]    type         encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
 *                            L_FLATE_ENCODE, or L_JP2K_ENCODE)
 * \param[in]    quality      for jpeg: 1-100; 0 for default (75)
 *                            for jp2k: 27-45; 0 for default (34)
 * \param[out]   pdata        pdf data in memory
 * \param[out]   pnbytes      number of bytes in pdf data
 * \param[in]    x, y         location of lower-left corner of image,
 *                            in pixels, relative to the PostScript origin
 *                            (0,0) at the lower-left corner of the page
 * \param[in]    res          override the resolution of the input image,
 *                            in ppi; use 0 to respect the resolution
 *                            embedded in the input images
 * \param[in]    title        [optional] pdf title; can be null
 * \param[out]   plpd         ptr to lpd, which is created on the first
 *                            invocation and returned until last image is
 *                            processed, at which time it is destroyed
 * \param[in]    position     in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
 *                            L_LAST_IMAGE
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) If %res == 0 and the input resolution field is 0,
 *          this will use DefaultInputRes.
 *      (2) See comments in convertToPdf().
 * </pre>
 */
l_ok
convertImageDataToPdfData(l_uint8      *imdata,
                          size_t        size,
                          l_int32       type,
                          l_int32       quality,
                          l_uint8     **pdata,
                          size_t       *pnbytes,
                          l_int32       x,
                          l_int32       y,
                          l_int32       res,
                          const char   *title,
                          L_PDF_DATA  **plpd,
                          l_int32       position)
{
l_int32  ret;
PIX     *pix;

    if (!pdata)
        return ERROR_INT("&data not defined", __func__, 1);
    *pdata = NULL;
    if (!pnbytes)
        return ERROR_INT("&nbytes not defined", __func__, 1);
    *pnbytes = 0;
    if (!imdata)
        return ERROR_INT("image data not defined", __func__, 1);
    if (plpd) {  /* part of multi-page invocation */
        if (position == L_FIRST_IMAGE)
            *plpd = NULL;
    }

    if ((pix = pixReadMem(imdata, size)) == NULL)
        return ERROR_INT("pix not read", __func__, 1);
    if (type != L_JPEG_ENCODE && type != L_G4_ENCODE &&
        type != L_FLATE_ENCODE && type != L_JP2K_ENCODE) {
        selectDefaultPdfEncoding(pix, &type);
    }
    ret = pixConvertToPdfData(pix, type, quality, pdata, pnbytes,
                              x, y, res, title, plpd, position);
    pixDestroy(&pix);
    return ret;
}


/*!
 * \brief   pixConvertToPdf()
 *
 * \param[in]      pix
 * \param[in]      type         encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
 *                              L_FLATE_ENCODE, L_JP2K_ENCODE)
 * \param[in]      quality      for jpeg: 1-100; 0 for default (75)
 *                              for jp2k: 27-45; 0 for default (34)
 * \param[in]      fileout      output pdf file; only required on last
 *                              image on page
 * \param[in]      x, y         location of lower-left corner of image,
 *                              in pixels, relative to the PostScript origin
 *                              (0,0) at the lower-left corner of the page
 * \param[in]      res          override the resolution of the input image,
 *                              in ppi; use 0 to respect the resolution
 *                              embedded in the input images
 * \param[in]      title        [optional] pdf title; can be null
 * \param[in,out]  plpd         ptr to lpd, which is created on the first
 *                              invocation and returned until last image is
 *                              processed, at which time it is destroyed
 * \param[in]      position     in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
 *                              L_LAST_IMAGE
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) If %res == 0 and the input resolution field is 0,
 *          this will use DefaultInputRes.
 *      (2) This only writes data to fileout if it is the last
 *          image to be written on the page.
 *      (3) See comments in convertToPdf().
 * </pre>
 */
l_ok
pixConvertToPdf(PIX          *pix,
                l_int32       type,
                l_int32       quality,
                const char   *fileout,
                l_int32       x,
                l_int32       y,
                l_int32       res,
                const char   *title,
                L_PDF_DATA  **plpd,
                l_int32       position)
{
l_uint8  *data;
l_int32   ret;
size_t    nbytes;

    if (!pix)
        return ERROR_INT("pix not defined", __func__, 1);
    if (!plpd || (position == L_LAST_IMAGE)) {
        if (!fileout)
            return ERROR_INT("fileout not defined", __func__, 1);
    }

    if (pixConvertToPdfData(pix, type, quality, &data, &nbytes,
                            x, y, res, title, plpd, position)) {
        LEPT_FREE(data);
        return ERROR_INT("pdf data not made", __func__, 1);
    }

    if (!plpd || (position == L_LAST_IMAGE)) {
        ret = l_binaryWrite(fileout, "w", data, nbytes);
        LEPT_FREE(data);
        if (ret)
            return ERROR_INT("pdf data not written to file", __func__, 1);
    }
    return 0;
}


/*!
 * \brief   pixWriteStreamPdf()
 *
 * \param[in]    fp       file stream opened for writing
 * \param[in]    pix      all depths, cmap OK
 * \param[in]    res      override the resolution of the input image, in ppi;
 *                        use 0 to respect the resolution embedded in the input
 * \param[in]    title    [optional] pdf title; can be null
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) This is the simplest interface for writing a single image
 *          with pdf encoding to a stream.  It uses G4 encoding for 1 bpp,
 *          JPEG encoding for 8 bpp (no cmap) and 32 bpp, and FLATE
 *          encoding for everything else.
 * </pre>
 */
l_ok
pixWriteStreamPdf(FILE        *fp,
                  PIX         *pix,
                  l_int32      res,
                  const char  *title)
{
l_uint8  *data;
size_t    nbytes, nbytes_written;

    if (!fp)
        return ERROR_INT("stream not opened", __func__, 1);
    if (!pix)
        return ERROR_INT("pix not defined", __func__, 1);

    if (pixWriteMemPdf(&data, &nbytes, pix, res, title) != 0) {
        LEPT_FREE(data);
        return ERROR_INT("pdf data not made", __func__, 1);
    }

    nbytes_written = fwrite(data, 1, nbytes, fp);
    LEPT_FREE(data);
    if (nbytes != nbytes_written)
        return ERROR_INT("failure writing pdf data to stream", __func__, 1);
    return 0;
}


/*!
 * \brief   pixWriteMemPdf()
 *
 * \param[out]   pdata      pdf as byte array
 * \param[out]   pnbytes    number of bytes in pdf array
 * \param[in]    pix        all depths, cmap OK
 * \param[in]    res        override the resolution of the input image, in ppi;
 *                          use 0 to respect the res embedded in the input
 * \param[in]    title      [optional] pdf title; can be null
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) This is the simplest interface for writing a single image
 *          with pdf encoding to memory.  It uses G4 encoding for 1 bpp,
 *          and makes a guess whether to use JPEG or FLATE encoding for
 *          everything else.
 * </pre>
 */
l_ok
pixWriteMemPdf(l_uint8    **pdata,
               size_t      *pnbytes,
               PIX         *pix,
               l_int32      res,
               const char  *title)
{
l_int32  ret, type;

    if (pdata) *pdata = NULL;
    if (pnbytes) *pnbytes = 0;
    if (!pdata || !pnbytes)
        return ERROR_INT("&data or &nbytes not defined", __func__, 1);
    if (!pix)
        return ERROR_INT("pix not defined", __func__, 1);

    selectDefaultPdfEncoding(pix, &type);
    ret = pixConvertToPdfData(pix, type, 75, pdata, pnbytes,
                              0, 0, res, title, NULL, 0);
    if (ret)
        return ERROR_INT("pdf data not made", __func__, 1);
    return 0;
}


/*---------------------------------------------------------------------*
 *            Segmented multi-page, multi-image converter              *
 *---------------------------------------------------------------------*/
/*!
 * \brief   convertSegmentedFilesToPdf()
 *
 * \param[in]    dirname       directory name containing images
 * \param[in]    substr        [optional] substring filter on filenames;
 *                             can be null
 * \param[in]    res           input resolution of all images
 * \param[in]    type          compression type for non-image regions; the
 *                             image regions are always compressed with
 *                             L_JPEG_ENCODE
 * \param[in]    thresh        used for converting gray --> 1 bpp with
 *                             L_G4_ENCODE
 * \param[in]    baa           [optional] boxaa of image regions
 * \param[in]    quality       used for JPEG only; 0 for default (75)
 * \param[in]    scalefactor   scaling factor applied to each image region
 * \param[in]    title         [optional] pdf title; can be null
 * \param[in]    fileout       pdf file of all images
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) If %substr is not NULL, only image filenames that contain
 *          the substring can be used.  If %substr == NULL, all files
 *          in the directory are used.
 *      (2) The files in the directory, after optional filtering by
 *          the substring, are lexically sorted in increasing order
 *          before concatenation.
 *      (3) The images are encoded with G4 if 1 bpp; JPEG if 8 bpp without
 *          colormap and many colors, or 32 bpp; FLATE for anything else.
 *      (4) The boxaa, if it exists, contains one boxa of "image regions"
 *          for each image file.  The boxa must be aligned with the
 *          sorted set of images.
 *      (5) The scalefactor is applied to each image region.  It is
 *          typically < 1.0, to save bytes in the final pdf, because
 *          the resolution is often not critical in non-text regions.
 *      (6) If the non-image regions have pixel depth > 1 and the encoding
 *          type is G4, they are automatically scaled up by 2x and
 *          thresholded.  Otherwise, no scaling is performed on them.
 *      (7) Note that this function can be used to generate multipage
 *          G4 compressed pdf from any input, by using %boxaa == NULL
 *          and %type == L_G4_ENCODE.
 * </pre>
 */
l_ok
convertSegmentedFilesToPdf(const char  *dirname,
                           const char  *substr,
                           l_int32      res,
                           l_int32      type,
                           l_int32      thresh,
                           BOXAA       *baa,
                           l_int32      quality,
                           l_float32    scalefactor,
                           const char  *title,
                           const char  *fileout)
{
char     *fname;
l_uint8  *imdata, *data;
l_int32   i, npages, nboxa, nboxes, ret;
size_t    imbytes, databytes;
BOXA     *boxa;
L_BYTEA  *ba;
L_PTRA   *pa_data;
SARRAY   *sa;

    if (!dirname)
        return ERROR_INT("dirname not defined", __func__, 1);
    if (!fileout)
        return ERROR_INT("fileout not defined", __func__, 1);

    if ((sa = getNumberedPathnamesInDirectory(dirname, substr, 0, 0, 10000))
            == NULL)
        return ERROR_INT("sa not made", __func__, 1);

    npages = sarrayGetCount(sa);
        /* If necessary, extend the boxaa, which is page-aligned with
         * the image files, to be as large as the set of images. */
    if (baa) {
        nboxa = boxaaGetCount(baa);
        if (nboxa < npages) {
            boxa = boxaCreate(1);
            boxaaExtendWithInit(baa, npages, boxa);
            boxaDestroy(&boxa);
        }
    }

        /* Generate and save all the encoded pdf strings */
    pa_data = ptraCreate(npages);
    for (i = 0; i < npages; i++) {
        fname = sarrayGetString(sa, i, L_NOCOPY);
        if (!strcmp(fname, "")) continue;
        boxa = NULL;
        if (baa) {
            boxa = boxaaGetBoxa(baa, i, L_CLONE);
            nboxes = boxaGetCount(boxa);
            if (nboxes == 0)
                boxaDestroy(&boxa);
        }
        ret = convertToPdfDataSegmented(fname, res, type, thresh, boxa,
                                        quality, scalefactor, title,
                                        &imdata, &imbytes);
        boxaDestroy(&boxa);  /* safe; in case nboxes > 0 */
        if (ret) {
            L_ERROR("pdf encoding failed for %s\n", __func__, fname);
            continue;
        }
        ba = l_byteaInitFromMem(imdata, imbytes);
        if (imdata) LEPT_FREE(imdata);
        ptraAdd(pa_data, ba);
    }
    sarrayDestroy(&sa);

    ptraGetActualCount(pa_data, &npages);
    if (npages == 0) {
        L_ERROR("no pdf files made\n", __func__);
        ptraDestroy(&pa_data, FALSE, FALSE);
        return 1;
    }

        /* Concatenate */
    ret = ptraConcatenatePdfToData(pa_data, NULL, &data, &databytes);

        /* Clean up */
    ptraGetActualCount(pa_data, &npages);  /* recalculate in case it changes */
    for (i = 0; i < npages; i++) {
        ba = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
        l_byteaDestroy(&ba);
    }
    ptraDestroy(&pa_data, FALSE, FALSE);

    if (ret) {
        if (data) LEPT_FREE(data);
        return ERROR_INT("pdf data not made", __func__, 1);
    }

    ret = l_binaryWrite(fileout, "w", data, databytes);
    LEPT_FREE(data);
    if (ret)
        L_ERROR("pdf data not written to file\n", __func__);
    return ret;
}


/*!
 * \brief   convertNumberedMasksToBoxaa()
 *
 * \param[in]    dirname   directory name containing mask images
 * \param[in]    substr    [optional] substring filter on filenames;
 *                         can be null 
 * \param[in]    numpre    number of characters in name before number
 * \param[in]    numpost   number of characters in name after number,
 *                         up to a dot before an extension
 * \return  boxaa of mask regions, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This is conveniently used to generate the input boxaa
 *          for convertSegmentedFilesToPdf().  It guarantees that the
 *          boxa will be aligned with the page images, even if some
 *          of the boxa are empty.
 * </pre>
 */
BOXAA *
convertNumberedMasksToBoxaa(const char  *dirname,
                            const char  *substr,
                            l_int32      numpre,
                            l_int32      numpost)
{
char    *fname;
l_int32  i, n;
BOXA    *boxa;
BOXAA   *baa;
PIX     *pix;
SARRAY  *sa;

    if (!dirname)
        return (BOXAA *)ERROR_PTR("dirname not defined", __func__, NULL);

    if ((sa = getNumberedPathnamesInDirectory(dirname, substr, numpre,
                                              numpost, 10000)) == NULL)
        return (BOXAA *)ERROR_PTR("sa not made", __func__, NULL);

        /* Generate and save all the encoded pdf strings */
    n = sarrayGetCount(sa);
    baa = boxaaCreate(n);
    boxa = boxaCreate(1);
    boxaaInitFull(baa, boxa);
    boxaDestroy(&boxa);
    for (i = 0; i < n; i++) {
        fname = sarrayGetString(sa, i, L_NOCOPY);
        if (!strcmp(fname, "")) continue;
        if ((pix = pixRead(fname)) == NULL) {
            L_WARNING("invalid image on page %d\n", __func__, i);
            continue;
        }
        boxa = pixConnComp(pix, NULL, 8);
        boxaaReplaceBoxa(baa, i, boxa);
        pixDestroy(&pix);
    }

    sarrayDestroy(&sa);
    return baa;
}


/*---------------------------------------------------------------------*
 *            Segmented single page, multi-image converters            *
 *---------------------------------------------------------------------*/
/*!
 * \brief   convertToPdfSegmented()
 *
 * \param[in]    filein        input image file -- any format
 * \param[in]    res           input image resolution; typ. 300 ppi;
 *                             use 0 for default
 * \param[in]    type          compression type for non-image regions; image
 *                             regions are always compressed with L_JPEG_ENCODE
 * \param[in]    thresh        for converting gray --> 1 bpp with L_G4_ENCODE
 * \param[in]    boxa          [optional] of image regions; can be null
 * \param[in]    quality       used for jpeg image regions; 0 for default
 * \param[in]    scalefactor   used for jpeg regions; must be <= 1.0
 * \param[in]    title         [optional] pdf title; can be null
 * \param[in]    fileout       output pdf file
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) If there are no image regions, set %boxa == NULL;
 *          %quality and %scalefactor are ignored.
 *      (2) Typically, %scalefactor is < 1.0, because the image regions
 *          can be rendered at a lower resolution (for better compression)
 *          than the text regions.  If %scalefactor == 0, we use 1.0.
 *          If the input image is 1 bpp and scalefactor < 1.0, we
 *          use scaleToGray() to downsample the image regions to gray
 *          before compressing them.
 *      (3) If the compression type for non-image regions is L_G4_ENCODE
 *          and bpp > 1, the image is upscaled 2x and thresholded
 *          to 1 bpp.  That is the only situation where %thresh is used.
 *      (4) The parameter %quality is only used for image regions.
 *          If %type == L_JPEG_ENCODE, default jpeg quality (75) is
 *          used for the non-image regions.
 *      (5) Processing matrix for non-image regions.
 *
 *          Input           G4              JPEG                FLATE
 *          ----------|---------------------------------------------------
 *          1 bpp     |  1x, 1 bpp       1x flate, 1 bpp     1x, 1 bpp
 *                    |
 *          cmap      |  2x, 1 bpp       1x flate, cmap      1x, cmap
 *                    |
 *          2,4 bpp   |  2x, 1 bpp       1x flate            1x, 2,4 bpp
 *          no cmap   |                  2,4 bpp
 *                    |
 *          8,32 bpp  |  2x, 1 bpp       1x (jpeg)           1x, 8,32 bpp
 *          no cmap   |                  8,32 bpp
 *
 *          Summary:
 *          (a) if G4 is requested, G4 is used, with 2x upscaling
 *              for all cases except 1 bpp.
 *          (b) if JPEG is requested, use flate encoding for all cases
 *              except 8 bpp without cmap and 32 bpp (rgb).
 *          (c) if FLATE is requested, use flate with no transformation
 *              of the raster data.
 *      (6) Calling options/sequence for these functions:
 *              file  -->  file      (convertToPdfSegmented)
 *                  pix  -->  file      (pixConvertToPdfSegmented)
 *                      pix  -->  data      (pixConvertToPdfDataSegmented)
 *              file  -->  data      (convertToPdfDataSegmented)
 *                      pix  -->  data      (pixConvertToPdfDataSegmented)
 * </pre>
 */
l_ok
convertToPdfSegmented(const char  *filein,
                      l_int32      res,
                      l_int32      type,
                      l_int32      thresh,
                      BOXA        *boxa,
                      l_int32      quality,
                      l_float32    scalefactor,
                      const char  *title,
                      const char  *fileout)
{
l_int32  ret;
PIX     *pixs;

    if (!filein)
        return ERROR_INT("filein not defined", __func__, 1);
    if (!fileout)
        return ERROR_INT("fileout not defined", __func__, 1);
    if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
        type != L_FLATE_ENCODE)
        return ERROR_INT("invalid conversion type", __func__, 1);
    if (boxa && scalefactor > 1.0) {
        L_WARNING("setting scalefactor to 1.0\n", __func__);
        scalefactor = 1.0;
    }

    if ((pixs = pixRead(filein)) == NULL)
        return ERROR_INT("pixs not made", __func__, 1);

    ret = pixConvertToPdfSegmented(pixs, res, type, thresh, boxa, quality,
                                   scalefactor, title, fileout);
    pixDestroy(&pixs);
    return ret;
}


/*!
 * \brief   pixConvertToPdfSegmented()
 *
 * \param[in]    pixs          any depth, cmap OK
 * \param[in]    res           input image resolution; typ. 300 ppi;
 *                             use 0 for default
 * \param[in]    type          compression type for non-image regions; image
 *                             regions are always compressed with L_JPEG_ENCODE
 * \param[in]    thresh        for converting gray --> 1 bpp with L_G4_ENCODE
 * \param[in]    boxa          [optional] of image regions; can be null
 * \param[in]    quality       used for jpeg image regions; 0 for default
 * \param[in]    scalefactor   used for jpeg regions; must be <= 1.0
 * \param[in]    title         [optional] pdf title; can be null
 * \param[in]    fileout       output pdf file
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) See convertToPdfSegmented() for details.
 * </pre>
 */
l_ok
pixConvertToPdfSegmented(PIX         *pixs,
                         l_int32      res,
                         l_int32      type,
                         l_int32      thresh,
                         BOXA        *boxa,
                         l_int32      quality,
                         l_float32    scalefactor,
                         const char  *title,
                         const char  *fileout)
{
l_uint8  *data;
l_int32   ret;
size_t    nbytes;

    if (!pixs)
        return ERROR_INT("pixs not defined", __func__, 1);
    if (!fileout)
        return ERROR_INT("fileout not defined", __func__, 1);
    if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
        type != L_FLATE_ENCODE)
        return ERROR_INT("invalid conversion type", __func__, 1);
    if (boxa && scalefactor > 1.0) {
        L_WARNING("setting scalefactor to 1.0\n", __func__);
        scalefactor = 1.0;
    }

    ret = pixConvertToPdfDataSegmented(pixs, res, type, thresh, boxa, quality,
                                       scalefactor, title, &data, &nbytes);
    if (ret)
        return ERROR_INT("pdf generation failure", __func__, 1);

    ret = l_binaryWrite(fileout, "w", data, nbytes);
    if (data) LEPT_FREE(data);
    return ret;
}


/*!
 * \brief   convertToPdfDataSegmented()
 *
 * \param[in]    filein        input image file -- any format
 * \param[in]    res           input image resolution; typ. 300 ppi;
 *                             use 0 for default
 * \param[in]    type          compression type for non-image regions; image
 *                             regions are always compressed with L_JPEG_ENCODE
 * \param[in]    thresh        for converting gray --> 1 bpp with L_G4_ENCODE
 * \param[in]    boxa          [optional] image regions; can be null
 * \param[in]    quality       used for jpeg image regions; 0 for default
 * \param[in]    scalefactor   used for jpeg regions; must be <= 1.0
 * \param[in]    title         [optional] pdf title; can be null
 * \param[out]   pdata         pdf data in memory
 * \param[out]   pnbytes       number of bytes in pdf data
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) If there are no image regions, set %boxa == NULL;
 *          %quality and %scalefactor are ignored.
 *      (2) Typically, %scalefactor is < 1.0.  The image regions are
 * </pre>
 */
l_ok
convertToPdfDataSegmented(const char  *filein,
                          l_int32      res,
                          l_int32      type,
                          l_int32      thresh,
                          BOXA        *boxa,
                          l_int32      quality,
                          l_float32    scalefactor,
                          const char  *title,
                          l_uint8    **pdata,
                          size_t      *pnbytes)
{
l_int32  ret;
PIX     *pixs;

    if (!pdata)
        return ERROR_INT("&data not defined", __func__, 1);
    *pdata = NULL;
    if (!pnbytes)
        return ERROR_INT("&nbytes not defined", __func__, 1);
    *pnbytes = 0;
    if (!filein)
        return ERROR_INT("filein not defined", __func__, 1);
    if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
        type != L_FLATE_ENCODE)
        return ERROR_INT("invalid conversion type", __func__, 1);
    if (boxa && scalefactor > 1.0) {
        L_WARNING("setting scalefactor to 1.0\n", __func__);
        scalefactor = 1.0;
    }

    if ((pixs = pixRead(filein)) == NULL)
        return ERROR_INT("pixs not made", __func__, 1);

    ret = pixConvertToPdfDataSegmented(pixs, res, type, thresh, boxa,
                                       quality, scalefactor, title,
                                       pdata, pnbytes);
    pixDestroy(&pixs);
    return ret;
}


/*!
 * \brief   pixConvertToPdfDataSegmented()
 *
 * \param[in]    pixs          any depth, cmap OK
 * \param[in]    res           input image resolution; typ. 300 ppi;
 *                             use 0 for default
 * \param[in]    type          compression type for non-image regions; image
 *                             regions are always compressed with L_JPEG_ENCODE
 * \param[in]    thresh        for converting gray --> 1 bpp with L_G4_ENCODE
 * \param[in]    boxa          [optional] of image regions; can be null
 * \param[in]    quality       used for jpeg image regions; 0 for default
 * \param[in]    scalefactor   used for jpeg regions; must be <= 1.0
 * \param[in]    title         [optional] pdf title; can be null
 * \param[out]   pdata         pdf data in memory
 * \param[out]   pnbytes       number of bytes in pdf data
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) See convertToPdfSegmented() for details.
 * </pre>
 */
l_ok
pixConvertToPdfDataSegmented(PIX         *pixs,
                             l_int32      res,
                             l_int32      type,
                             l_int32      thresh,
                             BOXA        *boxa,
                             l_int32      quality,
                             l_float32    scalefactor,
                             const char  *title,
                             l_uint8    **pdata,
                             size_t      *pnbytes)
{
l_int32      i, nbox, seq, bx, by, bw, bh, upscale;
l_float32    scale;
BOX         *box, *boxc, *box2;
PIX         *pix, *pixt1, *pixt2, *pixt3, *pixt4, *pixt5, *pixt6;
PIXCMAP     *cmap;
L_PDF_DATA  *lpd;

    if (!pdata)
        return ERROR_INT("&data not defined", __func__, 1);
    *pdata = NULL;
    if (!pnbytes)
        return ERROR_INT("&nbytes not defined", __func__, 1);
    *pnbytes = 0;
    if (!pixs)
        return ERROR_INT("pixs not defined", __func__, 1);
    if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
        type != L_FLATE_ENCODE)
        return ERROR_INT("invalid conversion type", __func__, 1);
    if (boxa && (scalefactor <= 0.0 || scalefactor > 1.0)) {
        L_WARNING("setting scalefactor to 1.0\n", __func__);
        scalefactor = 1.0;
    }

        /* Adjust scalefactor so that the product with res gives an integer */
    if (res <= 0)
        res = DefaultInputRes;
    scale = (l_float32)((l_int32)(scalefactor * res + 0.5)) / (l_float32)res;
    cmap = pixGetColormap(pixs);

        /* Simple case: single image to be encoded */
    if (!boxa || boxaGetCount(boxa) == 0) {
        if (pixGetDepth(pixs) > 1 && type == L_G4_ENCODE) {
            if (cmap)
                pixt1 = pixRemoveColormap(pixs, REMOVE_CMAP_TO_GRAYSCALE);
            else
                pixt1 = pixConvertTo8(pixs, FALSE);
            pixt2 = pixScaleGray2xLIThresh(pixt1, thresh);
            pixConvertToPdfData(pixt2, type, quality, pdata, pnbytes,
                                0, 0, 2 * res, title, NULL, 0);
            pixDestroy(&pixt1);
            pixDestroy(&pixt2);
        } else {
            pixConvertToPdfData(pixs, type, quality, pdata, pnbytes,
                                0, 0, res, title, NULL, 0);
        }
        return 0;
    }

        /* Multiple images to be encoded.  If %type == L_G4_ENCODE,
         * jpeg encode a version of pixs that is blanked in the non-image
         * regions, and paint the scaled non-image part onto it through a mask.
         * Otherwise, we must put the non-image part down first and
         * then render all the image regions separately on top of it,
         * at their own resolution. */
    pixt1 = pixSetBlackOrWhiteBoxa(pixs, boxa, L_SET_WHITE);  /* non-image */
    nbox = boxaGetCount(boxa);
    if (type == L_G4_ENCODE) {
        pixt2 = pixCreateTemplate(pixs);  /* only image regions */
        pixSetBlackOrWhite(pixt2, L_SET_WHITE);
        for (i = 0; i < nbox; i++) {
             box = boxaGetBox(boxa, i, L_CLONE);
             pix = pixClipRectangle(pixs, box, &boxc);
             boxGetGeometry(boxc, &bx, &by, &bw, &bh);
             pixRasterop(pixt2, bx, by, bw, bh, PIX_SRC, pix, 0, 0);
             pixDestroy(&pix);
             boxDestroy(&box);
             boxDestroy(&boxc);
        }
        pixt3 = pixRemoveColormap(pixt2, REMOVE_CMAP_BASED_ON_SRC);
        if (pixGetDepth(pixt3) == 1)
            pixt4 = pixScaleToGray(pixt3, scale);
        else
            pixt4 = pixScale(pixt3, scale, scale);
        pixConvertToPdfData(pixt4, L_JPEG_ENCODE, quality, pdata, pnbytes,
                            0, 0, (l_int32)(scale * res), title,
                            &lpd, L_FIRST_IMAGE);

        if (pixGetDepth(pixt1) == 1) {
            pixt5 = pixClone(pixt1);
            upscale = 1;
        } else {
            pixt6 = pixConvertTo8(pixt1, 0);
            pixt5 = pixScaleGray2xLIThresh(pixt6, thresh);
            pixDestroy(&pixt6);
            upscale = 2;
        }
        pixConvertToPdfData(pixt5, L_G4_ENCODE, quality, pdata, pnbytes,
                            0, 0, upscale * res, title, &lpd, L_LAST_IMAGE);
        pixDestroy(&pixt2);
        pixDestroy(&pixt3);
        pixDestroy(&pixt4);
        pixDestroy(&pixt5);
    } else {
            /* Put the non-image part down first.  This is the full
               size of the page, so we can use it to find the page
               height in pixels, which is required for determining
               the LL corner of the image relative to the LL corner
               of the page. */
        pixConvertToPdfData(pixt1, type, quality, pdata, pnbytes, 0, 0,
                            res, title, &lpd, L_FIRST_IMAGE);
        for (i = 0; i < nbox; i++) {
            box = boxaGetBox(boxa, i, L_CLONE);
            pixt2 = pixClipRectangle(pixs, box, &boxc);
            pixt3 = pixRemoveColormap(pixt2, REMOVE_CMAP_BASED_ON_SRC);
            if (pixGetDepth(pixt3) == 1)
                pixt4 = pixScaleToGray(pixt3, scale);
            else
                pixt4 = pixScale(pixt3, scale, scale);
            box2 = boxTransform(boxc, 0, 0, scale, scale);
            boxGetGeometry(box2, &bx, &by, NULL, &bh);
            seq = (i == nbox - 1) ? L_LAST_IMAGE : L_NEXT_IMAGE;
            pixConvertToPdfData(pixt4, L_JPEG_ENCODE, quality, pdata, pnbytes,
                                bx, by, (l_int32)(scale * res), title,
                                &lpd, seq);
            pixDestroy(&pixt2);
            pixDestroy(&pixt3);
            pixDestroy(&pixt4);
            boxDestroy(&box);
            boxDestroy(&boxc);
            boxDestroy(&box2);
        }
    }

    pixDestroy(&pixt1);
    return 0;
}


/*---------------------------------------------------------------------*
 *                         Multi-page concatenation                    *
 *---------------------------------------------------------------------*/
/*!
 * \brief   concatenatePdf()
 *
 * \param[in]    dirname   directory name containing single-page pdf files
 * \param[in]    substr    [optional] substring filter on filenames;
 *                         can be null
 * \param[in]    fileout   concatenated pdf file
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) This only works with leptonica-formatted single-page pdf files.
 *      (2) If %substr is not NULL, only filenames that contain
 *          the substring can be returned.  If %substr == NULL,
 *          none of the filenames are filtered out.
 *      (3) The files in the directory, after optional filtering by
 *          the substring, are lexically sorted in increasing order
 *          before concatenation.
 * </pre>
 */
l_ok
concatenatePdf(const char  *dirname,
               const char  *substr,
               const char  *fileout)
{
l_int32  ret;
SARRAY  *sa;

    if (!dirname)
        return ERROR_INT("dirname not defined", __func__, 1);
    if (!fileout)
        return ERROR_INT("fileout not defined", __func__, 1);

    if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL)
        return ERROR_INT("sa not made", __func__, 1);
    ret = saConcatenatePdf(sa, fileout);
    sarrayDestroy(&sa);
    return ret;
}


/*!
 * \brief   saConcatenatePdf()
 *
 * \param[in]    sa        string array of pathnames for single-page pdf files
 * \param[in]    fileout   concatenated pdf file
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) This only works with leptonica-formatted single-page pdf files.
 * </pre>
 */
l_ok
saConcatenatePdf(SARRAY      *sa,
                 const char  *fileout)
{
l_uint8  *data;
l_int32   ret;
size_t    nbytes;

    if (!sa)
        return ERROR_INT("sa not defined", __func__, 1);
    if (!fileout)
        return ERROR_INT("fileout not defined", __func__, 1);

    ret = saConcatenatePdfToData(sa, &data, &nbytes);
    if (ret)
        return ERROR_INT("pdf data not made", __func__, 1);
    ret = l_binaryWrite(fileout, "w", data, nbytes);
    LEPT_FREE(data);
    return ret;
}


/*!
 * \brief   ptraConcatenatePdf()
 *
 * \param[in]    pa       array of pdf strings, each for a single-page pdf file
 * \param[in]    fileout  concatenated pdf file
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) This only works with leptonica-formatted single-page pdf files.
 * </pre>
 */
l_ok
ptraConcatenatePdf(L_PTRA      *pa,
                   const char  *fileout)
{
l_uint8  *data;
l_int32   ret;
size_t    nbytes;

    if (!pa)
        return ERROR_INT("pa not defined", __func__, 1);
    if (!fileout)
        return ERROR_INT("fileout not defined", __func__, 1);

    ret = ptraConcatenatePdfToData(pa, NULL, &data, &nbytes);
    if (ret)
        return ERROR_INT("pdf data not made", __func__, 1);
    ret = l_binaryWrite(fileout, "w", data, nbytes);
    LEPT_FREE(data);
    return ret;
}


/*!
 * \brief   concatenatePdfToData()
 *
 * \param[in]    dirname   directory name containing single-page pdf files
 * \param[in]    substr    [optional] substring filter on filenames;
 *                         can be null
 * \param[out]   pdata     concatenated pdf data in memory
 * \param[out]   pnbytes   number of bytes in pdf data
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) This only works with leptonica-formatted single-page pdf files.
 *      (2) If %substr is not NULL, only filenames that contain
 *          the substring can be returned.  If %substr == NULL,
 *          none of the filenames are filtered out.
 *      (3) The files in the directory, after optional filtering by
 *          the substring, are lexically sorted in increasing order
 *          before concatenation.
 * </pre>
 */
l_ok
concatenatePdfToData(const char  *dirname,
                     const char  *substr,
                     l_uint8    **pdata,
                     size_t      *pnbytes)
{
l_int32  ret;
SARRAY  *sa;

    if (!pdata)
        return ERROR_INT("&data not defined", __func__, 1);
    *pdata = NULL;
    if (!pnbytes)
        return ERROR_INT("&nbytes not defined", __func__, 1);
    *pnbytes = 0;
    if (!dirname)
        return ERROR_INT("dirname not defined", __func__, 1);

    if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL)
        return ERROR_INT("sa not made", __func__, 1);
    ret = saConcatenatePdfToData(sa, pdata, pnbytes);
    sarrayDestroy(&sa);
    return ret;
}


/*!
 * \brief   saConcatenatePdfToData()
 *
 * \param[in]    sa        string array of pathnames for single-page pdf files
 * \param[out]   pdata     concatenated pdf data in memory
 * \param[out]   pnbytes   number of bytes in pdf data
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) This only works with leptonica-formatted single-page pdf files.
 * </pre>
 */
l_ok
saConcatenatePdfToData(SARRAY    *sa,
                       l_uint8  **pdata,
                       size_t    *pnbytes)
{
char     *fname;
l_int32   i, npages, ret;
L_BYTEA  *bas;
L_PTRA   *pa_data;  /* input pdf data for each page */

    if (!pdata)
        return ERROR_INT("&data not defined", __func__, 1);
    *pdata = NULL;
    if (!pnbytes)
        return ERROR_INT("&nbytes not defined", __func__, 1);
    *pnbytes = 0;
    if (!sa)
        return ERROR_INT("sa not defined", __func__, 1);

        /* Read the pdf files into memory */
    if ((npages = sarrayGetCount(sa)) == 0)
        return ERROR_INT("no filenames found", __func__, 1);
    pa_data = ptraCreate(npages);
    for (i = 0; i < npages; i++) {
        fname = sarrayGetString(sa, i, L_NOCOPY);
        bas = l_byteaInitFromFile(fname);
        ptraAdd(pa_data, bas);
    }

    ret = ptraConcatenatePdfToData(pa_data, sa, pdata, pnbytes);

        /* Cleanup: some pages could have been removed */
    ptraGetActualCount(pa_data, &npages);
    for (i = 0; i < npages; i++) {
        bas = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
        l_byteaDestroy(&bas);
    }
    ptraDestroy(&pa_data, FALSE, FALSE);
    return ret;
}

/* --------------------------------------------*/
#endif  /* USE_PDFIO */
/* --------------------------------------------*/

Coverage Report

Created: 2024-02-28 06:46