Coverage Report

Created: 2024-02-11 06:47

/src/leptonica/src/pdfio2.c
Line
Count
Source (jump to first uncovered line)
1
/*====================================================================*
2
 -  Copyright (C) 2001 Leptonica.  All rights reserved.
3
 -
4
 -  Redistribution and use in source and binary forms, with or without
5
 -  modification, are permitted provided that the following conditions
6
 -  are met:
7
 -  1. Redistributions of source code must retain the above copyright
8
 -     notice, this list of conditions and the following disclaimer.
9
 -  2. Redistributions in binary form must reproduce the above
10
 -     copyright notice, this list of conditions and the following
11
 -     disclaimer in the documentation and/or other materials
12
 -     provided with the distribution.
13
 -
14
 -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15
 -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16
 -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17
 -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
18
 -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19
 -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20
 -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21
 -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22
 -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23
 -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24
 -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
 *====================================================================*/
26
27
/*!
28
 * \file pdfio2.c
29
 * <pre>
30
 *
31
 *    Lower-level operations for generating pdf.
32
 *
33
 *     Intermediate function for single page, multi-image conversion
34
 *          l_int32              pixConvertToPdfData()
35
 *
36
 *     Intermediate function for generating multipage pdf output
37
 *          l_int32              ptraConcatenatePdfToData()
38
 *
39
 *     Convert tiff multipage to pdf file
40
 *          l_int32              convertTiffMultipageToPdf()
41
 *
42
 *     Generates the CID, transcoding under some conditions
43
 *          l_int32              l_generateCIDataForPdf()
44
 *          l_int32              l_generateCIData()
45
 *
46
 *       Lower-level CID generation without transcoding
47
 *          L_COMP_DATA         *l_generateFlateDataPdf()
48
 *          L_COMP_DATA         *l_generateJpegData()
49
 *          L_COMP_DATA         *l_generateJpegDataMem()
50
 *          static L_COMP_DATA  *l_generateJp2kData()
51
 *          L_COMP_DATA         *l_generateG4Data()
52
 *
53
 *       Lower-level CID generation with transcoding
54
 *          l_int32              pixGenerateCIData()
55
 *          L_COMP_DATA         *l_generateFlateData()
56
 *          static L_COMP_DATA  *pixGenerateFlateData()
57
 *          static L_COMP_DATA  *pixGenerateJpegData()
58
 *          static L_COMP_DATA  *pixGenerateJp2kData()
59
 *          static L_COMP_DATA  *pixGenerateG4Data()
60
 *
61
 *       Other CID operations
62
 *          l_int32              cidConvertToPdfData()
63
 *          void                 l_CIDataDestroy()
64
 *
65
 *     Helper functions for generating the output pdf string
66
 *          static l_int32       l_generatePdf()
67
 *          static void          generateFixedStringsPdf()
68
 *          static char         *generateEscapeString()
69
 *          static void          generateMediaboxPdf()
70
 *          static l_int32       generatePageStringPdf()
71
 *          static l_int32       generateContentStringPdf()
72
 *          static l_int32       generatePreXStringsPdf()
73
 *          static l_int32       generateColormapStringsPdf()
74
 *          static void          generateTrailerPdf()
75
 *          static char         *makeTrailerStringPdf()
76
 *          static l_int32       generateOutputDataPdf()
77
 *
78
 *     Helper functions for generating multipage pdf output
79
 *          static l_int32       parseTrailerPdf()
80
 *          static char         *generatePagesObjStringPdf()
81
 *          static L_BYTEA      *substituteObjectNumbers()
82
 *
83
 *     Create/destroy/access pdf data
84
 *          static L_PDF_DATA   *pdfdataCreate()
85
 *          static void          pdfdataDestroy()
86
 *          static L_COMP_DATA  *pdfdataGetCid()
87
 *
88
 *     Find number of pages in a pdf
89
 *          l_int32              getPdfPageCount()
90
 *
91
 *     Find widths and heights of pages and media boxes in a pdf
92
 *          l_int32              getPdfPageSizes()
93
 *          l_int32              getPdfMediaBoxSizes()
94
 *
95
 *     Find effective resolution of images rendered from a pdf
96
 *          l_int32              getPdfRendererResolution()
97
 *
98
 *     Set flags for special modes
99
 *          void                 l_pdfSetG4ImageMask()
100
 *          void                 l_pdfSetDateAndVersion()
101
 *
102
 * </pre>
103
 */
104
105
#ifdef HAVE_CONFIG_H
106
#include <config_auto.h>
107
#endif  /* HAVE_CONFIG_H */
108
109
#include <string.h>
110
#include <math.h>
111
#include "allheaders.h"
112
113
/* --------------------------------------------*/
114
#if  USE_PDFIO   /* defined in environ.h */
115
 /* --------------------------------------------*/
116
117
    /* Typical scan resolution in ppi (pixels/inch) */
118
static const l_int32  DefaultInputRes = 300;
119
120
    /* Static helpers */
121
static L_COMP_DATA  *l_generateJp2kData(const char *fname);
122
static L_COMP_DATA  *pixGenerateFlateData(PIX *pixs, l_int32 ascii85flag);
123
static L_COMP_DATA  *pixGenerateJpegData(PIX *pixs, l_int32 ascii85flag,
124
                                         l_int32 quality);
125
static L_COMP_DATA  *pixGenerateJp2kData(PIX *pixs, l_int32 quality);
126
static L_COMP_DATA  *pixGenerateG4Data(PIX *pixs, l_int32 ascii85flag);
127
128
static l_int32       l_generatePdf(l_uint8 **pdata, size_t *pnbytes,
129
                                   L_PDF_DATA  *lpd);
130
static void          generateFixedStringsPdf(L_PDF_DATA *lpd);
131
static char         *generateEscapeString(const char  *str);
132
static void          generateMediaboxPdf(L_PDF_DATA *lpd);
133
static l_int32       generatePageStringPdf(L_PDF_DATA *lpd);
134
static l_int32       generateContentStringPdf(L_PDF_DATA *lpd);
135
static l_int32       generatePreXStringsPdf(L_PDF_DATA *lpd);
136
static l_int32       generateColormapStringsPdf(L_PDF_DATA *lpd);
137
static void          generateTrailerPdf(L_PDF_DATA *lpd);
138
static char         *makeTrailerStringPdf(L_DNA *daloc);
139
static l_int32       generateOutputDataPdf(l_uint8 **pdata, size_t *pnbytes,
140
                                       L_PDF_DATA *lpd);
141
142
static l_int32       parseTrailerPdf(L_BYTEA *bas, L_DNA **pda);
143
static char         *generatePagesObjStringPdf(NUMA *napage);
144
static L_BYTEA      *substituteObjectNumbers(L_BYTEA *bas, NUMA *na_objs);
145
146
static L_PDF_DATA   *pdfdataCreate(const char *title);
147
static void          pdfdataDestroy(L_PDF_DATA **plpd);
148
static L_COMP_DATA  *pdfdataGetCid(L_PDF_DATA *lpd, l_int32 index);
149
150
151
/* ---------------- Defaults for rendering options ----------------- */
152
    /* Output G4 as writing through image mask; this is the default */
153
static l_int32   var_WRITE_G4_IMAGE_MASK = 1;
154
    /* Write date/time and lib version into pdf; this is the default */
155
static l_int32   var_WRITE_DATE_AND_VERSION = 1;
156
157
#define L_SMALLBUF   256
158
#define L_BIGBUF    2048   /* must be able to hold hex colormap */
159
160
161
#ifndef  NO_CONSOLE_IO
162
#define  DEBUG_MULTIPAGE      0
163
#endif  /* ~NO_CONSOLE_IO */
164
165
166
/*---------------------------------------------------------------------*
167
 *    Intermediate function for single page, multi-image conversion    *
168
 *---------------------------------------------------------------------*/
169
/*!
170
 * \brief   pixConvertToPdfData()
171
 *
172
 * \param[in]      pix       all depths; cmap OK
173
 * \param[in]      type      L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE,
174
 *                           L_JP2K_ENCODE
175
 * \param[in]      quality   for jpeg: 1-100; 0 for default (75)
176
 *                           for jp2k: 27-45; 0 for default (34)
177
 * \param[out]     pdata     pdf array
178
 * \param[out]     pnbytes   number of bytes in pdf array
179
 * \param[in]      x, y      location of lower-left corner of image, in pixels,
180
 *                           relative to the PostScript origin (0,0) at
181
 *                           the lower-left corner of the page)
182
 * \param[in]      res       override the resolution of the input image, in ppi;
183
 *                           use 0 to respect resolution embedded in the input
184
 * \param[in]      title     [optional] pdf title; can be null
185
 * \param[in,out]  plpd      ptr to lpd; created on the first invocation and
186
 *                           returned until last image is processed
187
 * \param[in]      position  in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
188
 *                           L_LAST_IMAGE
189
 * \return  0 if OK, 1 on error
190
 *
191
 * <pre>
192
 * Notes:
193
 *      (1) If %res == 0 and the input resolution field from the pix is 0,
194
 *          this will use DefaultInputRes.
195
 *      (2) This only writes %data if it is the last image to be
196
 *          written on the page.
197
 *      (3) See comments in convertToPdf().
198
 * </pre>
199
 */
200
l_ok
201
pixConvertToPdfData(PIX          *pix,
202
                    l_int32       type,
203
                    l_int32       quality,
204
                    l_uint8     **pdata,
205
                    size_t       *pnbytes,
206
                    l_int32       x,
207
                    l_int32       y,
208
                    l_int32       res,
209
                    const char   *title,
210
                    L_PDF_DATA  **plpd,
211
                    l_int32       position)
212
0
{
213
0
l_int32       pixres, w, h, ret;
214
0
l_float32     xpt, ypt, wpt, hpt;
215
0
L_COMP_DATA  *cid = NULL;
216
0
L_PDF_DATA   *lpd = NULL;
217
218
0
    if (!pdata)
219
0
        return ERROR_INT("&data not defined", __func__, 1);
220
0
    *pdata = NULL;
221
0
    if (!pnbytes)
222
0
        return ERROR_INT("&nbytes not defined", __func__, 1);
223
0
    *pnbytes = 0;
224
0
    if (!pix)
225
0
        return ERROR_INT("pix not defined", __func__, 1);
226
0
    if (type != L_JPEG_ENCODE && type != L_G4_ENCODE &&
227
0
        type != L_FLATE_ENCODE && type != L_JP2K_ENCODE) {
228
0
        selectDefaultPdfEncoding(pix, &type);
229
0
    }
230
0
    if (quality < 0 || quality > 100)
231
0
        return ERROR_INT("invalid quality", __func__, 1);
232
233
0
    if (plpd) {  /* part of multi-page invocation */
234
0
        if (position == L_FIRST_IMAGE)
235
0
            *plpd = NULL;
236
0
    }
237
238
        /* Generate the compressed image data.  It must NOT
239
         * be ascii85 encoded. */
240
0
    pixGenerateCIData(pix, type, quality, 0, &cid);
241
0
    if (!cid)
242
0
        return ERROR_INT("cid not made", __func__, 1);
243
244
        /* Get media box in pts.  Guess the input image resolution
245
         * based on the input parameter %res, the resolution data in
246
         * the pix, and the size of the image. */
247
0
    pixres = cid->res;
248
0
    w = cid->w;
249
0
    h = cid->h;
250
0
    if (res <= 0.0)
251
0
        res = (pixres > 0) ? pixres : DefaultInputRes;
252
0
    xpt = x * 72.f / res;
253
0
    ypt = y * 72.f / res;
254
0
    wpt = w * 72.f / res;
255
0
    hpt = h * 72.f / res;
256
257
        /* Set up lpd */
258
0
    if (!plpd) {  /* single image */
259
0
        if ((lpd = pdfdataCreate(title)) == NULL)
260
0
            return ERROR_INT("lpd not made", __func__, 1);
261
0
    } else if (position == L_FIRST_IMAGE) {  /* first of multiple images */
262
0
        if ((lpd = pdfdataCreate(title)) == NULL)
263
0
            return ERROR_INT("lpd not made", __func__, 1);
264
0
        *plpd = lpd;
265
0
    } else {  /* not the first of multiple images */
266
0
        lpd = *plpd;
267
0
    }
268
269
        /* Add the data to the lpd */
270
0
    ptraAdd(lpd->cida, cid);
271
0
    lpd->n++;
272
0
    ptaAddPt(lpd->xy, xpt, ypt);
273
0
    ptaAddPt(lpd->wh, wpt, hpt);
274
275
        /* If a single image or the last of multiple images,
276
         * generate the pdf and destroy the lpd */
277
0
    if (!plpd || (position == L_LAST_IMAGE)) {
278
0
        ret = l_generatePdf(pdata, pnbytes, lpd);
279
0
        pdfdataDestroy(&lpd);
280
0
        if (plpd) *plpd = NULL;
281
0
        if (ret)
282
0
            return ERROR_INT("pdf output not made", __func__, 1);
283
0
    }
284
285
0
    return 0;
286
0
}
287
288
289
/*---------------------------------------------------------------------*
290
 *      Intermediate function for generating multipage pdf output      *
291
 *---------------------------------------------------------------------*/
292
/*!
293
 * \brief   ptraConcatenatePdfToData()
294
 *
295
 * \param[in]    pa_data    ptra array of pdf strings, each for a
296
 *                          single-page pdf file
297
 * \param[in]    sa         [optional] string array of pathnames for
298
 *                          input pdf files; can be null
299
 * \param[out]   pdata      concatenated pdf data in memory
300
 * \param[out]   pnbytes    number of bytes in pdf data
301
 * \return  0 if OK, 1 on error
302
 *
303
 * <pre>
304
 * Notes:
305
 *      (1) This only works with leptonica-formatted single-page pdf files.
306
 *          pdf files generated by other programs will have unpredictable
307
 *          (and usually bad) results.  The requirements for each pdf file:
308
 *            (a) The Catalog and Info objects are the first two.
309
 *            (b) Object 3 is Pages
310
 *            (c) Object 4 is Page
311
 *            (d) The remaining objects are Contents, XObjects, and ColorSpace
312
 *      (2) We remove trailers from each page, and append the full trailer
313
 *          for all pages at the end.
314
 *      (3) For all but the first file, remove the ID and the first 3
315
 *          objects (catalog, info, pages), so that each subsequent
316
 *          file has only objects of these classes:
317
 *              Page, Contents, XObject, ColorSpace (Indexed RGB).
318
 *          For those objects, we substitute these refs to objects
319
 *          in the local file:
320
 *              Page:  Parent(object 3), Contents, XObject(typically multiple)
321
 *              XObject:  [ColorSpace if indexed]
322
 *          The Pages object on the first page (object 3) has a Kids array
323
 *          of references to all the Page objects, with a Count equal
324
 *          to the number of pages.  Each Page object refers back to
325
 *          this parent.
326
 * </pre>
327
 */
328
l_ok
329
ptraConcatenatePdfToData(L_PTRA    *pa_data,
330
                         SARRAY    *sa,
331
                         l_uint8  **pdata,
332
                         size_t    *pnbytes)
333
0
{
334
0
char     *fname, *str_pages, *str_trailer;
335
0
l_uint8  *pdfdata, *data;
336
0
l_int32   i, j, index, nobj, npages;
337
0
l_int32  *sizes, *locs;
338
0
size_t    size;
339
0
L_BYTEA  *bas, *bad, *bat1, *bat2;
340
0
L_DNA    *da_locs, *da_sizes, *da_outlocs, *da;
341
0
L_DNAA   *daa_locs;  /* object locations on each page */
342
0
NUMA     *na_objs, *napage;
343
0
NUMAA    *naa_objs;  /* object mapping numbers to new values */
344
345
0
    if (!pdata)
346
0
        return ERROR_INT("&data not defined", __func__, 1);
347
0
    *pdata = NULL;
348
0
    if (!pnbytes)
349
0
        return ERROR_INT("&nbytes not defined", __func__, 1);
350
0
    *pnbytes = 0;
351
0
    if (!pa_data)
352
0
        return ERROR_INT("pa_data not defined", __func__, 1);
353
354
        /* Parse the files and find the object locations.
355
         * Remove file data that cannot be parsed. */
356
0
    ptraGetActualCount(pa_data, &npages);
357
0
    daa_locs = l_dnaaCreate(npages);
358
0
    for (i = 0; i < npages; i++) {
359
0
        bas = (L_BYTEA *)ptraGetPtrToItem(pa_data, i);
360
0
        if (parseTrailerPdf(bas, &da_locs) != 0) {
361
0
            bas = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
362
0
            l_byteaDestroy(&bas);
363
0
            if (sa) {
364
0
                fname = sarrayGetString(sa, i, L_NOCOPY);
365
0
                L_ERROR("can't parse file %s; skipping\n", __func__, fname);
366
0
            } else {
367
0
                L_ERROR("can't parse file %d; skipping\n", __func__, i);
368
0
            }
369
0
        } else {
370
0
            l_dnaaAddDna(daa_locs, da_locs, L_INSERT);
371
0
        }
372
0
    }
373
374
        /* Recompute npages in case some of the files were not pdf */
375
0
    ptraCompactArray(pa_data);
376
0
    ptraGetActualCount(pa_data, &npages);
377
0
    if (npages == 0) {
378
0
        l_dnaaDestroy(&daa_locs);
379
0
        return ERROR_INT("no parsable pdf files found", __func__, 1);
380
0
    }
381
382
        /* Find the mapping from initial to final object numbers */
383
0
    naa_objs = numaaCreate(npages);  /* stores final object numbers */
384
0
    napage = numaCreate(npages);  /* stores "Page" object numbers */
385
0
    index = 0;
386
0
    for (i = 0; i < npages; i++) {
387
0
        da = l_dnaaGetDna(daa_locs, i, L_CLONE);
388
0
        nobj = l_dnaGetCount(da);
389
0
        if (i == 0) {
390
0
            numaAddNumber(napage, 4);  /* object 4 on first page */
391
0
            na_objs = numaMakeSequence(0.0, 1.0, nobj - 1);
392
0
            index = nobj - 1;
393
0
        } else {  /* skip the first 3 objects in each file */
394
0
            numaAddNumber(napage, index);  /* Page object is first we add */
395
0
            na_objs = numaMakeConstant(0.0, nobj - 1);
396
0
            numaReplaceNumber(na_objs, 3, 3);  /* refers to parent of all */
397
0
            for (j = 4; j < nobj - 1; j++)
398
0
                numaSetValue(na_objs, j, index++);
399
0
        }
400
0
        numaaAddNuma(naa_objs, na_objs, L_INSERT);
401
0
        l_dnaDestroy(&da);
402
0
    }
403
404
        /* Make the Pages object (#3) */
405
0
    str_pages = generatePagesObjStringPdf(napage);
406
407
        /* Build the output */
408
0
    bad = l_byteaCreate(5000);
409
0
    da_outlocs = l_dnaCreate(0);  /* locations of all output objects */
410
0
    for (i = 0; i < npages; i++) {
411
0
        bas = (L_BYTEA *)ptraGetPtrToItem(pa_data, i);
412
0
        pdfdata = l_byteaGetData(bas, &size);
413
0
        da_locs = l_dnaaGetDna(daa_locs, i, L_CLONE);  /* locs on this page */
414
0
        na_objs = numaaGetNuma(naa_objs, i, L_CLONE);  /* obj # on this page */
415
0
        nobj = l_dnaGetCount(da_locs) - 1;
416
0
        da_sizes = l_dnaDiffAdjValues(da_locs);  /* object sizes on this page */
417
0
        sizes = l_dnaGetIArray(da_sizes);
418
0
        locs = l_dnaGetIArray(da_locs);
419
0
        if (i == 0) {
420
0
            l_byteaAppendData(bad, pdfdata, sizes[0]);
421
0
            l_byteaAppendData(bad, pdfdata + locs[1], sizes[1]);
422
0
            l_byteaAppendData(bad, pdfdata + locs[2], sizes[2]);
423
0
            l_byteaAppendString(bad, str_pages);
424
0
            for (j = 0; j < 4; j++)
425
0
                l_dnaAddNumber(da_outlocs, locs[j]);
426
0
        }
427
0
        for (j = 4; j < nobj; j++) {
428
0
            l_dnaAddNumber(da_outlocs, l_byteaGetSize(bad));
429
0
            bat1 = l_byteaInitFromMem(pdfdata + locs[j], sizes[j]);
430
0
            bat2 = substituteObjectNumbers(bat1, na_objs);
431
0
            data = l_byteaGetData(bat2, &size);
432
0
            l_byteaAppendData(bad, data, size);
433
0
            l_byteaDestroy(&bat1);
434
0
            l_byteaDestroy(&bat2);
435
0
        }
436
0
        if (i == npages - 1)  /* last one */
437
0
            l_dnaAddNumber(da_outlocs, l_byteaGetSize(bad));
438
0
        LEPT_FREE(sizes);
439
0
        LEPT_FREE(locs);
440
0
        l_dnaDestroy(&da_locs);
441
0
        numaDestroy(&na_objs);
442
0
        l_dnaDestroy(&da_sizes);
443
0
    }
444
445
        /* Add the trailer */
446
0
    str_trailer = makeTrailerStringPdf(da_outlocs);
447
0
    l_byteaAppendString(bad, str_trailer);
448
449
        /* Transfer the output data */
450
0
    *pdata = l_byteaCopyData(bad, pnbytes);
451
0
    l_byteaDestroy(&bad);
452
453
#if  DEBUG_MULTIPAGE
454
    lept_stderr("******** object mapper **********");
455
    numaaWriteStream(stderr, naa_objs);
456
457
    lept_stderr("******** Page object numbers ***********");
458
    numaWriteStderr(napage);
459
460
    lept_stderr("******** Pages object ***********\n");
461
    lept_stderr("%s\n", str_pages);
462
#endif  /* DEBUG_MULTIPAGE */
463
464
0
    numaDestroy(&napage);
465
0
    numaaDestroy(&naa_objs);
466
0
    l_dnaDestroy(&da_outlocs);
467
0
    l_dnaaDestroy(&daa_locs);
468
0
    LEPT_FREE(str_pages);
469
0
    LEPT_FREE(str_trailer);
470
0
    return 0;
471
0
}
472
473
474
/*---------------------------------------------------------------------*
475
 *                  Convert tiff multipage to pdf file                 *
476
 *---------------------------------------------------------------------*/
477
/*!
478
 * \brief   convertTiffMultipageToPdf()
479
 *
480
 * \param[in]    filein    (tiff)
481
 * \param[in]    fileout   (pdf)
482
 * \return  0 if OK, 1 on error
483
 *
484
 * <pre>
485
 * Notes:
486
 *      (1) A multipage tiff file can also be converted to PS, using
487
 *          convertTiffMultipageToPS()
488
 * </pre>
489
 */
490
l_ok
491
convertTiffMultipageToPdf(const char  *filein,
492
                          const char  *fileout)
493
0
{
494
0
l_int32  istiff;
495
0
PIXA    *pixa;
496
0
FILE    *fp;
497
498
0
    if ((fp = fopenReadStream(filein)) == NULL)
499
0
        return ERROR_INT_1("file not found", filein, __func__, 1);
500
0
    istiff = fileFormatIsTiff(fp);
501
0
    fclose(fp);
502
0
    if (!istiff)
503
0
        return ERROR_INT_1("file not tiff format", filein, __func__, 1);
504
505
0
    pixa = pixaReadMultipageTiff(filein);
506
0
    pixaConvertToPdf(pixa, 0, 1.0, 0, 0, "weasel2", fileout);
507
0
    pixaDestroy(&pixa);
508
0
    return 0;
509
0
}
510
511
512
/*---------------------------------------------------------------------*
513
 *                          CID-based operations                       *
514
 *---------------------------------------------------------------------*/
515
/*!
516
 * \brief   l_generateCIDataForPdf()
517
 *
518
 * \param[in]    fname      [optional] can be null
519
 * \param[in]    pix        [optional] can be null
520
 * \param[in]    quality    for jpeg if transcoded: 1-100; 0 for default (75)
521
 *                          for jp2k if transcoded: 27-45; 0 for default (34)
522
 * \param[out]   pcid       compressed data
523
 * \return  0 if OK, 1 on error
524
 *
525
 * <pre>
526
 * Notes:
527
 *      (1) You must set either filename or pix.
528
 *      (2) Given an image file and optionally a pix raster of that data,
529
 *          this provides a CID that is compatible with PDF, preferably
530
 *          without transcoding.
531
 *      (3) The pix is included for efficiency, in case transcoding
532
 *          is required and the pix is available to the caller.
533
 *      (4) We don't try to open files named "stdin" or "-" for Tesseract
534
 *          compatibility reasons. We may remove this restriction
535
 *          in the future.
536
 *      (5) Note that tiff-g4 must be transcoded to properly handle byte
537
 *          order and perhaps photometry (e.g., min-is-black).  For a
538
 *          multipage tiff file, data will only be extracted from the
539
 *          first page, so this should not be invoked.
540
 * </pre>
541
 */
542
l_ok
543
l_generateCIDataForPdf(const char    *fname,
544
                       PIX           *pix,
545
                       l_int32        quality,
546
                       L_COMP_DATA  **pcid)
547
0
{
548
0
l_int32       format, type;
549
0
L_COMP_DATA  *cid;
550
0
PIX          *pixt;
551
552
0
    if (!pcid)
553
0
        return ERROR_INT("&cid not defined", __func__, 1);
554
0
    *pcid = cid = NULL;
555
0
    if (!fname && !pix)
556
0
        return ERROR_INT("neither fname nor pix are defined", __func__, 1);
557
558
        /* If a compressed file is given that is not 'stdin', see if we
559
         * can generate the pdf output without transcoding. */
560
0
    if (fname && strcmp(fname, "-") != 0 && strcmp(fname, "stdin") != 0) {
561
0
        findFileFormat(fname, &format);
562
0
        if (format == IFF_UNKNOWN)
563
0
            L_WARNING("file %s format is unknown\n", __func__, fname);
564
0
        if (format == IFF_PS || format == IFF_LPDF) {
565
0
            L_ERROR("file %s is unsupported format %d\n",
566
0
                  __func__, fname, format);
567
0
            return 1;
568
0
        }
569
0
        if (format == IFF_JFIF_JPEG) {
570
0
            cid = l_generateJpegData(fname, 0);
571
0
        } else if (format == IFF_JP2) {
572
0
            cid = l_generateJp2kData(fname);
573
0
        } else if (format == IFF_PNG) {
574
0
            cid = l_generateFlateDataPdf(fname, pix);
575
0
        }
576
0
    }
577
578
        /* Otherwise, use the pix to generate the pdf output */
579
0
    if  (!cid) {
580
0
        if (!pix)
581
0
            pixt = pixRead(fname);
582
0
        else
583
0
            pixt = pixClone(pix);
584
0
        if (!pixt)
585
0
            return ERROR_INT("pixt not made", __func__, 1);
586
0
        if (selectDefaultPdfEncoding(pixt, &type)) {
587
0
            pixDestroy(&pixt);
588
0
            return 1;
589
0
        }
590
0
        pixGenerateCIData(pixt, type, quality, 0, &cid);
591
0
        pixDestroy(&pixt);
592
0
        if (!cid)
593
0
            return ERROR_INT("cid not made from pix", __func__, 1);
594
0
    }
595
0
    *pcid = cid;
596
0
    return 0;
597
0
}
598
599
600
/*!
601
 * \brief   l_generateCIData()
602
 *
603
 * \param[in]    fname
604
 * \param[in]    type       L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE,
605
 *                          L_JP2K_ENCODE
606
 * \param[in]    quality    for jpeg if transcoded: 1-100; 0 for default (75)
607
 *                          for jp2k if transcoded: 27-45; 0 for default (34)
608
 * \param[in]    ascii85    0 for binary; 1 for ascii85-encoded
609
 * \param[out]   pcid       compressed data
610
 * \return  0 if OK, 1 on error
611
 *
612
 * <pre>
613
 * Notes:
614
 *      (1) This can be used for both PostScript and pdf.
615
 *      (1) Set ascii85:
616
 *           ~ 0 for binary data (PDF only)
617
 *           ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
618
 *      (2) This attempts to compress according to the requested type.
619
 *          If this can't be done, it falls back to ordinary flate encoding.
620
 *      (3) This differs from l_generateCIDataForPdf(), which determines
621
 *          the file format and only works for pdf.
622
 * </pre>
623
 */
624
l_ok
625
l_generateCIData(const char    *fname,
626
                 l_int32        type,
627
                 l_int32        quality,
628
                 l_int32        ascii85,
629
                 L_COMP_DATA  **pcid)
630
0
{
631
0
l_int32       format, d, bps, spp, iscmap;
632
0
L_COMP_DATA  *cid;
633
0
PIX          *pix;
634
635
0
    if (!pcid)
636
0
        return ERROR_INT("&cid not defined", __func__, 1);
637
0
    *pcid = NULL;
638
0
    if (!fname)
639
0
        return ERROR_INT("fname not defined", __func__, 1);
640
0
    if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
641
0
        type != L_FLATE_ENCODE && type != L_JP2K_ENCODE)
642
0
        return ERROR_INT("invalid conversion type", __func__, 1);
643
0
    if (ascii85 != 0 && ascii85 != 1)
644
0
        return ERROR_INT("invalid ascii85", __func__, 1);
645
646
        /* Sanity check on requested encoding */
647
0
    pixReadHeader(fname, &format, NULL, NULL, &bps, &spp, &iscmap);
648
0
    d = bps * spp;
649
0
    if (d == 24) d = 32;
650
0
    if (iscmap && type != L_FLATE_ENCODE) {
651
0
        L_WARNING("pixs has cmap; using flate encoding\n", __func__);
652
0
        type = L_FLATE_ENCODE;
653
0
    } else if (d < 8 && type == L_JPEG_ENCODE) {
654
0
        L_WARNING("pixs has < 8 bpp; using flate encoding\n", __func__);
655
0
        type = L_FLATE_ENCODE;
656
0
    } else if (d < 8 && type == L_JP2K_ENCODE) {
657
0
        L_WARNING("pixs has < 8 bpp; using flate encoding\n", __func__);
658
0
        type = L_FLATE_ENCODE;
659
0
    } else if (d > 1 && type == L_G4_ENCODE) {
660
0
        L_WARNING("pixs has > 1 bpp; using flate encoding\n", __func__);
661
0
        type = L_FLATE_ENCODE;
662
0
    }
663
664
0
    if (type == L_JPEG_ENCODE) {
665
0
        if (format == IFF_JFIF_JPEG) {  /* do not transcode */
666
0
            cid = l_generateJpegData(fname, ascii85);
667
0
        } else {
668
0
            if ((pix = pixRead(fname)) == NULL)
669
0
                return ERROR_INT("pix not returned for JPEG", __func__, 1);
670
0
            cid = pixGenerateJpegData(pix, ascii85, quality);
671
0
            pixDestroy(&pix);
672
0
        }
673
0
        if (!cid)
674
0
            return ERROR_INT("jpeg data not made", __func__, 1);
675
0
    } else if (type == L_JP2K_ENCODE) {
676
0
        if (format == IFF_JP2) {  /* do not transcode */
677
0
            cid = l_generateJp2kData(fname);
678
0
        } else {
679
0
            if ((pix = pixRead(fname)) == NULL)
680
0
                return ERROR_INT("pix not returned for JP2K", __func__, 1);
681
0
            cid = pixGenerateJp2kData(pix, quality);
682
0
            pixDestroy(&pix);
683
0
        }
684
0
        if (!cid)
685
0
            return ERROR_INT("jp2k data not made", __func__, 1);
686
0
    } else if (type == L_G4_ENCODE) {
687
0
        if ((pix = pixRead(fname)) == NULL)
688
0
            return ERROR_INT("pix not returned for G4", __func__, 1);
689
0
        cid = pixGenerateG4Data(pix, ascii85);
690
0
        pixDestroy(&pix);
691
0
        if (!cid)
692
0
            return ERROR_INT("g4 data not made", __func__, 1);
693
0
    } else if (type == L_FLATE_ENCODE) {
694
0
        if ((cid = l_generateFlateData(fname, ascii85)) == NULL)
695
0
            return ERROR_INT("flate data not made", __func__, 1);
696
0
    } else {
697
0
        return ERROR_INT("invalid conversion type", __func__, 1);
698
0
    }
699
0
    *pcid = cid;
700
701
0
    return 0;
702
0
}
703
704
705
/*---------------------------------------------------------------------*
706
 *                     Low-level CID-based operations                  *
707
 *---------------------------------------------------------------------*/
708
/*!
709
 * \brief   l_generateFlateDataPdf()
710
 *
711
 * \param[in]    fname     preferably png
712
 * \param[in]    pixs      [optional] can be null
713
 * \return  cid containing png data, or NULL on error
714
 *
715
 * <pre>
716
 * Notes:
717
 *      (1) If you hand this a png file, you are going to get
718
 *          png predictors embedded in the flate data. So it has
719
 *          come to this. http://xkcd.com/1022/
720
 *      (2) Exception: if the png is interlaced or if it is RGBA,
721
 *          it will be transcoded.
722
 *      (3) If transcoding is required, this will not have to read from
723
 *          file if a pix is input.
724
 * </pre>
725
 */
726
L_COMP_DATA *
727
l_generateFlateDataPdf(const char  *fname,
728
                       PIX         *pixs)
729
0
{
730
0
l_uint8      *pngcomp = NULL;  /* entire PNG compressed file */
731
0
l_uint8      *datacomp = NULL;  /* gzipped raster data */
732
0
l_uint8      *cmapdata = NULL;  /* uncompressed colormap */
733
0
char         *cmapdatahex = NULL;  /* hex ascii uncompressed colormap */
734
0
l_uint32      i, j, n;
735
0
l_int32       format, interlaced;
736
0
l_int32       ncolors;  /* in colormap */
737
0
l_int32       bps;  /* bits/sample: usually 8 */
738
0
l_int32       spp;  /* samples/pixel: 1-grayscale/cmap); 3-rgb; 4-rgba */
739
0
l_int32       w, h, cmapflag;
740
0
l_int32       xres, yres;
741
0
size_t        nbytescomp = 0, nbytespng = 0;
742
0
FILE         *fp;
743
0
L_COMP_DATA  *cid;
744
0
PIX          *pix;
745
0
PIXCMAP      *cmap = NULL;
746
747
0
    if (!fname)
748
0
        return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL);
749
750
0
    findFileFormat(fname, &format);
751
0
    spp = 0;  /* init to spp != 4 if not png */
752
0
    interlaced = 0;  /* initialize to no interlacing */
753
0
    bps = 0;  /* initialize to a nonsense value */
754
0
    if (format == IFF_PNG) {
755
0
        isPngInterlaced(fname, &interlaced);
756
0
        if (readHeaderPng(fname, NULL, NULL, &bps, &spp, NULL))
757
0
            return (L_COMP_DATA *)ERROR_PTR("bad png input", __func__, NULL);
758
0
    }
759
760
        /* PDF is capable of inlining some types of PNG files, but not all
761
           of them. We need to transcode anything with interlacing, an
762
           alpha channel, or 1 bpp (which would otherwise be photo-inverted).
763
764
           Note: any PNG image file with an alpha channel is converted on
765
           reading to RGBA (spp == 4). This includes the (gray + alpha) format
766
           with spp == 2.  Because of the conversion, readHeaderPng() gives
767
           spp = 2, whereas pixGetSpp() gives spp = 4 on the converted pix. */
768
0
    if (format != IFF_PNG ||
769
0
       (format == IFF_PNG && (interlaced || bps == 1 || spp == 4 || spp == 2)))
770
0
    {  /* lgtm+ analyzer needed the logic expanded */
771
0
        if (!pixs)
772
0
            pix = pixRead(fname);
773
0
        else
774
0
            pix = pixClone(pixs);
775
0
        if (!pix)
776
0
            return (L_COMP_DATA *)ERROR_PTR("pix not made", __func__, NULL);
777
0
        cid = pixGenerateFlateData(pix, 0);
778
0
        pixDestroy(&pix);
779
0
        return cid;
780
0
    }
781
782
        /* It's png.  Generate the pdf data without transcoding.
783
         * Implementation by Jeff Breidenbach.
784
         * First, read the metadata */
785
0
    if ((fp = fopenReadStream(fname)) == NULL)
786
0
        return (L_COMP_DATA *)ERROR_PTR_1("stream not opened",
787
0
                                          fname, __func__, NULL);
788
0
    freadHeaderPng(fp, &w, &h, &bps, &spp, &cmapflag);
789
0
    fgetPngResolution(fp, &xres, &yres);
790
0
    fclose(fp);
791
792
        /* We get pdf corruption when inlining the data from 16 bpp png. */
793
0
    if (bps == 16)
794
0
        return l_generateFlateData(fname, 0);
795
796
        /* Read the entire png file */
797
0
    if ((pngcomp = l_binaryRead(fname, &nbytespng)) == NULL)
798
0
        return (L_COMP_DATA *)ERROR_PTR_1("unable to read file",
799
0
                                          fname, __func__, NULL);
800
801
        /* Extract flate data, copying portions of it to memory, including
802
         * the predictor information in a byte at the beginning of each
803
         * raster line.  The flate data makes up the vast majority of
804
         * the png file, so after extraction we expect datacomp to
805
         * be nearly full (i.e., nbytescomp will be only slightly less
806
         * than nbytespng).  Also extract the colormap if present. */
807
0
    if ((datacomp = (l_uint8 *)LEPT_CALLOC(1, nbytespng)) == NULL) {
808
0
        LEPT_FREE(pngcomp);
809
0
        return (L_COMP_DATA *)ERROR_PTR("unable to allocate memory",
810
0
                                        __func__, NULL);
811
0
    }
812
813
        /* Parse the png file.  Each chunk consists of:
814
         *    length: 4 bytes
815
         *    name:   4 bytes (e.g., "IDAT")
816
         *    data:   n bytes
817
         *    CRC:    4 bytes
818
         * Start at the beginning of the data section of the first chunk,
819
         * byte 16, because the png file begins with 8 bytes of header,
820
         * followed by the first 8 bytes of the first chunk
821
         * (length and name).  On each loop, increment by 12 bytes to
822
         * skip over the CRC, length and name of the next chunk. */
823
0
    for (i = 16; i < nbytespng; i += 12) {  /* do each successive chunk */
824
            /* Get the chunk length */
825
0
        n  = pngcomp[i - 8] << 24;
826
0
        n += pngcomp[i - 7] << 16;
827
0
        n += pngcomp[i - 6] << 8;
828
0
        n += pngcomp[i - 5] << 0;
829
0
        if (n >= nbytespng - i) {  /* "n + i" can overflow */
830
0
            LEPT_FREE(pngcomp);
831
0
            LEPT_FREE(datacomp);
832
0
            pixcmapDestroy(&cmap);
833
0
            L_ERROR("invalid png: i = %d, n = %d, nbytes = %zu\n", __func__,
834
0
                    i, n, nbytespng);
835
0
            return NULL;
836
0
        }
837
838
            /* Is it a data chunk? */
839
0
        if (memcmp(pngcomp + i - 4, "IDAT", 4) == 0) {
840
0
            memcpy(datacomp + nbytescomp, pngcomp + i, n);
841
0
            nbytescomp += n;
842
0
        }
843
844
            /* Is it a palette chunk? */
845
0
        if (cmapflag && !cmap &&
846
0
            memcmp(pngcomp + i - 4, "PLTE", 4) == 0) {
847
0
            if ((n / 3) > (1 << bps)) {
848
0
                LEPT_FREE(pngcomp);
849
0
                LEPT_FREE(datacomp);
850
0
                pixcmapDestroy(&cmap);
851
0
                L_ERROR("invalid png: i = %d, n = %d, cmapsize = %d\n",
852
0
                        __func__, i, n, (1 << bps));
853
0
                return NULL;
854
0
            }
855
0
            cmap = pixcmapCreate(bps);
856
0
            for (j = i; j < i + n; j += 3) {
857
0
                pixcmapAddColor(cmap, pngcomp[j], pngcomp[j + 1],
858
0
                                pngcomp[j + 2]);
859
0
            }
860
0
        }
861
0
        i += n;  /* move to the end of the data chunk */
862
0
    }
863
0
    LEPT_FREE(pngcomp);
864
865
0
    if (nbytescomp == 0) {
866
0
        LEPT_FREE(datacomp);
867
0
        pixcmapDestroy(&cmap);
868
0
        return (L_COMP_DATA *)ERROR_PTR("invalid PNG file", __func__, NULL);
869
0
    }
870
871
        /* Extract and encode the colormap data as hexascii  */
872
0
    ncolors = 0;
873
0
    if (cmap) {
874
0
        pixcmapSerializeToMemory(cmap, 3, &ncolors, &cmapdata);
875
0
        pixcmapDestroy(&cmap);
876
0
        if (!cmapdata) {
877
0
            LEPT_FREE(datacomp);
878
0
            return (L_COMP_DATA *)ERROR_PTR("cmapdata not made",
879
0
                                            __func__, NULL);
880
0
        }
881
0
        cmapdatahex = pixcmapConvertToHex(cmapdata, ncolors);
882
0
        LEPT_FREE(cmapdata);
883
0
    }
884
885
        /* Note that this is the only situation where the predictor
886
         * field of the CID is set to 1.  Adobe's predictor values on
887
         * p. 76 of pdf_reference_1-7.pdf give 1 for no predictor and
888
         * 10-14 for inline predictors, the specifics of which are
889
         * ignored by the pdf interpreter, which just needs to know that
890
         * the first byte on each compressed scanline is some predictor
891
         * whose type can be inferred from the byte itself.  */
892
0
    cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA));
893
0
    cid->datacomp = datacomp;
894
0
    cid->type = L_FLATE_ENCODE;
895
0
    cid->cmapdatahex = cmapdatahex;
896
0
    cid->nbytescomp = nbytescomp;
897
0
    cid->ncolors = ncolors;
898
0
    cid->predictor = TRUE;
899
0
    cid->w = w;
900
0
    cid->h = h;
901
0
    cid->bps = bps;
902
0
    cid->spp = spp;
903
0
    cid->res = xres;
904
0
    return cid;
905
0
}
906
907
908
/*!
909
 * \brief   l_generateJpegData()
910
 *
911
 * \param[in]    fname           of jpeg file
912
 * \param[in]    ascii85flag     0 for jpeg; 1 for ascii85-encoded jpeg
913
 * \return  cid containing jpeg data, or NULL on error
914
 *
915
 * <pre>
916
 * Notes:
917
 *      (1) Set ascii85flag:
918
 *           ~ 0 for binary data (PDF only)
919
 *           ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
920
 *      (2) Most of this function is repeated in l_generateJpegMemData(),
921
 *          which is required in pixacompFastConvertToPdfData().
922
 * </pre>
923
 */
924
L_COMP_DATA *
925
l_generateJpegData(const char  *fname,
926
                   l_int32      ascii85flag)
927
0
{
928
0
char         *data85 = NULL;  /* ascii85 encoded jpeg compressed file */
929
0
l_uint8      *data = NULL;
930
0
l_int32       w, h, xres, yres, bps, spp;
931
0
size_t        nbytes, nbytes85;
932
0
L_COMP_DATA  *cid;
933
0
FILE         *fp;
934
935
0
    if (!fname)
936
0
        return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL);
937
938
0
    if (ascii85flag != 0 && ascii85flag != 1)
939
0
        return (L_COMP_DATA *)ERROR_PTR("wrong ascii85flags", __func__, NULL);
940
941
        /* Read the metadata */
942
0
    if (readHeaderJpeg(fname, &w, &h, &spp, NULL, NULL))
943
0
        return (L_COMP_DATA *)ERROR_PTR("bad jpeg metadata", __func__, NULL);
944
0
    bps = 8;
945
0
    if ((fp = fopenReadStream(fname)) == NULL)
946
0
        return (L_COMP_DATA *)ERROR_PTR_1("stream not opened",
947
0
                                          fname, __func__, NULL);
948
0
    fgetJpegResolution(fp, &xres, &yres);
949
0
    fclose(fp);
950
951
        /* Read the entire jpeg file.  The returned jpeg data in memory
952
         * starts with ffd8 and ends with ffd9 */
953
0
    if ((data = l_binaryRead(fname, &nbytes)) == NULL)
954
0
        return (L_COMP_DATA *)ERROR_PTR_1("data not extracted",
955
0
                                          fname, __func__, NULL);
956
957
        /* Optionally, encode the compressed data */
958
0
    if (ascii85flag == 1) {
959
0
        data85 = encodeAscii85(data, nbytes, &nbytes85);
960
0
        LEPT_FREE(data);
961
0
        if (!data85)
962
0
            return (L_COMP_DATA *)ERROR_PTR_1("data85 not made",
963
0
                                              fname, __func__, NULL);
964
0
        else
965
0
            data85[nbytes85 - 1] = '\0';  /* remove the newline */
966
0
    }
967
968
0
    cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA));
969
0
    if (ascii85flag == 0) {
970
0
        cid->datacomp = data;
971
0
    } else {  /* ascii85 */
972
0
        cid->data85 = data85;
973
0
        cid->nbytes85 = nbytes85;
974
0
    }
975
0
    cid->type = L_JPEG_ENCODE;
976
0
    cid->nbytescomp = nbytes;
977
0
    cid->w = w;
978
0
    cid->h = h;
979
0
    cid->bps = bps;
980
0
    cid->spp = spp;
981
0
    cid->res = xres;
982
0
    return cid;
983
0
}
984
985
986
/*!
987
 * \brief   l_generateJpegDataMem()
988
 *
989
 * \param[in]    data           of jpeg-encoded file
990
 * \param[in]    nbytes         size of jpeg-encoded file
991
 * \param[in]    ascii85flag    0 for jpeg; 1 for ascii85-encoded jpeg
992
 * \return  cid containing jpeg data, or NULL on error
993
 *
994
 * <pre>
995
 * Notes:
996
 *      (1) Set ascii85flag:
997
 *           ~ 0 for binary data (PDF only)
998
 *           ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
999
 * </pre>
1000
 */
1001
L_COMP_DATA *
1002
l_generateJpegDataMem(l_uint8  *data,
1003
                      size_t    nbytes,
1004
                      l_int32   ascii85flag)
1005
0
{
1006
0
char         *data85 = NULL;  /* ascii85 encoded jpeg compressed file */
1007
0
l_int32       w, h, xres, yres, bps, spp;
1008
0
size_t        nbytes85;
1009
0
L_COMP_DATA  *cid;
1010
1011
0
    if (!data)
1012
0
        return (L_COMP_DATA *)ERROR_PTR("data not defined", __func__, NULL);
1013
1014
        /* Read the metadata */
1015
0
    if (readHeaderMemJpeg(data, nbytes, &w, &h, &spp, NULL, NULL)) {
1016
0
        LEPT_FREE(data);
1017
0
        return (L_COMP_DATA *)ERROR_PTR("bad jpeg metadata", __func__, NULL);
1018
0
    }
1019
0
    bps = 8;
1020
0
    readResolutionMemJpeg(data, nbytes, &xres, &yres);
1021
1022
        /* Optionally, encode the compressed data */
1023
0
    if (ascii85flag == 1) {
1024
0
        data85 = encodeAscii85(data, nbytes, &nbytes85);
1025
0
        LEPT_FREE(data);
1026
0
        if (!data85)
1027
0
            return (L_COMP_DATA *)ERROR_PTR("data85 not made", __func__, NULL);
1028
0
        else
1029
0
            data85[nbytes85 - 1] = '\0';  /* remove the newline */
1030
0
    }
1031
1032
0
    cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA));
1033
0
    if (ascii85flag == 0) {
1034
0
        cid->datacomp = data;
1035
0
    } else {  /* ascii85 */
1036
0
        cid->data85 = data85;
1037
0
        cid->nbytes85 = nbytes85;
1038
0
    }
1039
0
    cid->type = L_JPEG_ENCODE;
1040
0
    cid->nbytescomp = nbytes;
1041
0
    cid->w = w;
1042
0
    cid->h = h;
1043
0
    cid->bps = bps;
1044
0
    cid->spp = spp;
1045
0
    cid->res = xres;
1046
0
    return cid;
1047
0
}
1048
1049
1050
/*!
1051
 * \brief   l_generateJp2kData()
1052
 *
1053
 * \param[in]    fname     of jp2k file
1054
 * \return  cid containing jp2k data, or NULL on error
1055
 *
1056
 * <pre>
1057
 * Notes:
1058
 *      (1) This is only called after the file is verified to be jp2k.
1059
 * </pre>
1060
 */
1061
static L_COMP_DATA *
1062
l_generateJp2kData(const char  *fname)
1063
0
{
1064
0
l_int32       w, h, bps, spp, xres, yres;
1065
0
size_t        nbytes;
1066
0
L_COMP_DATA  *cid;
1067
0
FILE         *fp;
1068
1069
0
    if (!fname)
1070
0
        return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL);
1071
1072
0
    if (readHeaderJp2k(fname, &w, &h, &bps, &spp, NULL))
1073
0
        return (L_COMP_DATA *)ERROR_PTR("bad jp2k metadata", __func__, NULL);
1074
1075
        /* The returned jp2k data in memory is the entire jp2k file */
1076
0
    cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA));
1077
0
    if ((cid->datacomp = l_binaryRead(fname, &nbytes)) == NULL) {
1078
0
        l_CIDataDestroy(&cid);
1079
0
        return (L_COMP_DATA *)ERROR_PTR("data not extracted", __func__, NULL);
1080
0
    }
1081
1082
0
    xres = yres = 0;
1083
0
    if ((fp = fopenReadStream(fname)) != NULL) {
1084
0
        fgetJp2kResolution(fp, &xres, &yres);
1085
0
        fclose(fp);
1086
0
    }
1087
0
    cid->type = L_JP2K_ENCODE;
1088
0
    cid->nbytescomp = nbytes;
1089
0
    cid->w = w;
1090
0
    cid->h = h;
1091
0
    cid->bps = bps;
1092
0
    cid->spp = spp;
1093
0
    cid->res = xres;
1094
0
    return cid;
1095
0
}
1096
1097
1098
/*!
1099
 * \brief   l_generateG4Data()
1100
 *
1101
 * \param[in]    fname          of g4 compressed file
1102
 * \param[in]    ascii85flag    0 for g4 compressed; 1 for ascii85-encoded g4
1103
 * \return  cid g4 compressed image data, or NULL on error
1104
 *
1105
 * <pre>
1106
 * Notes:
1107
 *      (1) Set ascii85flag:
1108
 *           ~ 0 for binary data (PDF only)
1109
 *           ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
1110
 *      (2) This does not work for multipage tiff files.
1111
 * </pre>
1112
 */
1113
L_COMP_DATA *
1114
l_generateG4Data(const char  *fname,
1115
                 l_int32      ascii85flag)
1116
0
{
1117
0
l_uint8      *datacomp = NULL;  /* g4 compressed raster data */
1118
0
char         *data85 = NULL;  /* ascii85 encoded g4 compressed data */
1119
0
l_int32       w, h, xres, yres, npages;
1120
0
l_int32       minisblack;  /* TRUE or FALSE */
1121
0
size_t        nbytes85, nbytescomp;
1122
0
L_COMP_DATA  *cid;
1123
0
FILE         *fp;
1124
1125
0
    if (!fname)
1126
0
        return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL);
1127
1128
        /* Make sure this is a single page tiff file */
1129
0
    if ((fp = fopenReadStream(fname)) == NULL)
1130
0
        return (L_COMP_DATA *)ERROR_PTR_1("stream not opened",
1131
0
                                          fname, __func__, NULL);
1132
0
    tiffGetCount(fp, &npages);
1133
0
    fclose(fp);
1134
0
    if (npages != 1) {
1135
0
        L_ERROR(" %d page tiff; only works with 1 page (file: %s)\n", __func__, npages, fname);
1136
0
        return NULL;
1137
0
    }
1138
1139
        /* Read the resolution */
1140
0
    if ((fp = fopenReadStream(fname)) == NULL)
1141
0
        return (L_COMP_DATA *)ERROR_PTR_1("stream not opened",
1142
0
                                          fname, __func__, NULL);
1143
0
    getTiffResolution(fp, &xres, &yres);
1144
0
    fclose(fp);
1145
1146
        /* The returned ccitt g4 data in memory is the block of
1147
         * bytes in the tiff file, starting after 8 bytes and
1148
         * ending before the directory. */
1149
0
    if (extractG4DataFromFile(fname, &datacomp, &nbytescomp,
1150
0
                              &w, &h, &minisblack)) {
1151
0
        return (L_COMP_DATA *)ERROR_PTR_1("datacomp not extracted",
1152
0
                                          fname, __func__, NULL);
1153
0
    }
1154
1155
        /* Optionally, encode the compressed data */
1156
0
    if (ascii85flag == 1) {
1157
0
        data85 = encodeAscii85(datacomp, nbytescomp, &nbytes85);
1158
0
        LEPT_FREE(datacomp);
1159
0
        if (!data85)
1160
0
            return (L_COMP_DATA *)ERROR_PTR_1("data85 not made",
1161
0
                                              fname, __func__, NULL);
1162
0
        else
1163
0
            data85[nbytes85 - 1] = '\0';  /* remove the newline */
1164
0
    }
1165
1166
0
    cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA));
1167
0
    if (ascii85flag == 0) {
1168
0
        cid->datacomp = datacomp;
1169
0
    } else {  /* ascii85 */
1170
0
        cid->data85 = data85;
1171
0
        cid->nbytes85 = nbytes85;
1172
0
    }
1173
0
    cid->type = L_G4_ENCODE;
1174
0
    cid->nbytescomp = nbytescomp;
1175
0
    cid->w = w;
1176
0
    cid->h = h;
1177
0
    cid->bps = 1;
1178
0
    cid->spp = 1;
1179
0
    cid->minisblack = minisblack;
1180
0
    cid->res = xres;
1181
0
    return cid;
1182
0
}
1183
1184
1185
/*!
1186
 * \brief   pixGenerateCIData()
1187
 *
1188
 * \param[in]    pixs       8 or 32 bpp, no colormap
1189
 * \param[in]    type       L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE or
1190
 *                          L_JP2K_ENCODE
1191
 * \param[in]    quality    for jpeg if transcoded: 1-100; 0 for default (75)
1192
 *                          for jp2k if transcoded: 27-45; 0 for default (34)
1193
 * \param[in]    ascii85    0 for binary; 1 for ascii85-encoded
1194
 * \param[out]   pcid       compressed data
1195
 * \return  0 if OK, 1 on error
1196
 *
1197
 * <pre>
1198
 * Notes:
1199
 *      (1) Set ascii85:
1200
 *           ~ 0 for binary data (PDF only)
1201
 *           ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
1202
 *      (2) Do not accept images with an asperity ratio greater than 10.
1203
 * </pre>
1204
 */
1205
l_ok
1206
pixGenerateCIData(PIX           *pixs,
1207
                  l_int32        type,
1208
                  l_int32        quality,
1209
                  l_int32        ascii85,
1210
                  L_COMP_DATA  **pcid)
1211
0
{
1212
0
l_int32   w, h, d, maxAsp;
1213
0
PIXCMAP  *cmap;
1214
1215
0
    if (!pcid)
1216
0
        return ERROR_INT("&cid not defined", __func__, 1);
1217
0
    *pcid = NULL;
1218
0
    if (!pixs)
1219
0
        return ERROR_INT("pixs not defined", __func__, 1);
1220
0
    if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
1221
0
        type != L_FLATE_ENCODE && type != L_JP2K_ENCODE) {
1222
0
        selectDefaultPdfEncoding(pixs, &type);
1223
0
    }
1224
0
    if (ascii85 != 0 && ascii85 != 1)
1225
0
        return ERROR_INT("invalid ascii85", __func__, 1);
1226
0
    pixGetDimensions(pixs, &w, &h, NULL);
1227
0
    if (w == 0 || h == 0)
1228
0
        return ERROR_INT("invalid w or h", __func__, 1);
1229
0
    maxAsp = L_MAX(w / h, h / w);
1230
0
    if (maxAsp > 10)
1231
0
        return ERROR_INT("max asperity > 10", __func__, 1);
1232
1233
        /* Conditionally modify the encoding type if libz is
1234
         * available and the requested library is missing. */
1235
0
#if defined(HAVE_LIBZ)
1236
# if !defined(HAVE_LIBJPEG)
1237
    if (type == L_JPEG_ENCODE) {
1238
        L_WARNING("no libjpeg; using flate encoding\n", __func__);
1239
        type = L_FLATE_ENCODE;
1240
    }
1241
# endif /* !defined(HAVE_LIBJPEG) */
1242
0
# if !defined(HAVE_LIBJP2K)
1243
0
    if (type == L_JP2K_ENCODE) {
1244
0
        L_WARNING("no libjp2k; using flate encoding\n", __func__);
1245
0
        type = L_FLATE_ENCODE;
1246
0
    }
1247
0
# endif /* !defined(HAVE_LIBJP2K) */
1248
# if !defined(HAVE_LIBTIFF)
1249
    if (type == L_G4_ENCODE) {
1250
        L_WARNING("no libtiff; using flate encoding\n", __func__);
1251
        type = L_FLATE_ENCODE;
1252
    }
1253
# endif /* !defined(HAVE_LIBTIFF) */
1254
0
#endif /* defined(HAVE_LIBZ) */
1255
1256
        /* Sanity check on requested encoding */
1257
0
    d = pixGetDepth(pixs);
1258
0
    cmap = pixGetColormap(pixs);
1259
0
    if (cmap && type != L_FLATE_ENCODE) {
1260
0
        L_WARNING("pixs has cmap; using flate encoding\n", __func__);
1261
0
        type = L_FLATE_ENCODE;
1262
0
    } else if (d < 8 && (type == L_JPEG_ENCODE || type == L_JP2K_ENCODE)) {
1263
0
        L_WARNING("pixs has < 8 bpp; using flate encoding\n", __func__);
1264
0
        type = L_FLATE_ENCODE;
1265
0
    } else if (d > 1 && type == L_G4_ENCODE) {
1266
0
        L_WARNING("pixs has > 1 bpp; using flate encoding\n", __func__);
1267
0
        type = L_FLATE_ENCODE;
1268
0
    }
1269
1270
0
    if (type == L_JPEG_ENCODE) {
1271
0
        if ((*pcid = pixGenerateJpegData(pixs, ascii85, quality)) == NULL)
1272
0
            return ERROR_INT("jpeg data not made", __func__, 1);
1273
0
    } else if (type == L_JP2K_ENCODE) {
1274
0
        if ((*pcid = pixGenerateJp2kData(pixs, quality)) == NULL)
1275
0
            return ERROR_INT("jp2k data not made", __func__, 1);
1276
0
    } else if (type == L_G4_ENCODE) {
1277
0
        if ((*pcid = pixGenerateG4Data(pixs, ascii85)) == NULL)
1278
0
            return ERROR_INT("g4 data not made", __func__, 1);
1279
0
    } else {  /* type == L_FLATE_ENCODE */
1280
0
        if ((*pcid = pixGenerateFlateData(pixs, ascii85)) == NULL)
1281
0
            return ERROR_INT("flate data not made", __func__, 1);
1282
0
    }
1283
0
    return 0;
1284
0
}
1285
1286
1287
/*!
1288
 * \brief   l_generateFlateData()
1289
 *
1290
 * \param[in]    fname
1291
 * \param[in]    ascii85flag    0 for gzipped; 1 for ascii85-encoded gzipped
1292
 * \return  cid flate compressed image data, or NULL on error
1293
 *
1294
 * <pre>
1295
 * Notes:
1296
 *      (1) The input image is converted to one of these 4 types:
1297
 *           ~ 1 bpp
1298
 *           ~ 8 bpp, no colormap
1299
 *           ~ 8 bpp, colormap
1300
 *           ~ 32 bpp rgb
1301
 *      (2) Set ascii85flag:
1302
 *           ~ 0 for binary data (PDF only)
1303
 *           ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
1304
 *      (3) Always transcodes (i.e., first decodes the png file)
1305
 * </pre>
1306
 */
1307
L_COMP_DATA *
1308
l_generateFlateData(const char  *fname,
1309
                    l_int32      ascii85flag)
1310
0
{
1311
0
L_COMP_DATA  *cid;
1312
0
PIX          *pixs;
1313
1314
0
    if (!fname)
1315
0
        return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL);
1316
1317
0
    if ((pixs = pixRead(fname)) == NULL)
1318
0
        return (L_COMP_DATA *)ERROR_PTR("pixs not made", __func__, NULL);
1319
0
    cid = pixGenerateFlateData(pixs, ascii85flag);
1320
0
    pixDestroy(&pixs);
1321
0
    return cid;
1322
0
}
1323
1324
1325
/*!
1326
 * \brief   pixGenerateFlateData()
1327
 *
1328
 * \param[in]    pixs
1329
 * \param[in]    ascii85flag 0    for gzipped; 1 for ascii85-encoded gzipped
1330
 * \return  cid flate compressed image data, or NULL on error
1331
 *
1332
 * <pre>
1333
 * Notes:
1334
 *     (1) If called with an RGBA pix (spp == 4), the alpha channel
1335
 *         will be removed, projecting a white backgrouond through
1336
 *         any transparency.
1337
 *     (2) If called with a colormapped pix, any transparency in the
1338
 *         alpha component in the colormap will be ignored, as it is
1339
 *         for all leptonica operations on colormapped pix.
1340
 * </pre>
1341
 */
1342
static L_COMP_DATA *
1343
pixGenerateFlateData(PIX     *pixs,
1344
                     l_int32  ascii85flag)
1345
0
{
1346
0
l_uint8      *data = NULL;  /* uncompressed raster data in required format */
1347
0
l_uint8      *datacomp = NULL;  /* gzipped raster data */
1348
0
char         *data85 = NULL;  /* ascii85 encoded gzipped raster data */
1349
0
l_uint8      *cmapdata = NULL;  /* uncompressed colormap */
1350
0
char         *cmapdata85 = NULL;  /* ascii85 encoded uncompressed colormap */
1351
0
char         *cmapdatahex = NULL;  /* hex ascii uncompressed colormap */
1352
0
l_int32       ncolors;  /* in colormap; not used if cmapdata85 is null */
1353
0
l_int32       bps;  /* bits/sample: usually 8 */
1354
0
l_int32       spp;  /* samples/pixel: 1-grayscale/cmap); 3-rgb */
1355
0
l_int32       w, h, d, cmapflag;
1356
0
size_t        ncmapbytes85 = 0;
1357
0
size_t        nbytes85 = 0;
1358
0
size_t        nbytes, nbytescomp;
1359
0
L_COMP_DATA  *cid;
1360
0
PIX          *pixt;
1361
0
PIXCMAP      *cmap;
1362
1363
0
    if (!pixs)
1364
0
        return (L_COMP_DATA *)ERROR_PTR("pixs not defined", __func__, NULL);
1365
1366
        /* Convert the image to one of these 4 types:
1367
         *     1 bpp
1368
         *     8 bpp, no colormap
1369
         *     8 bpp, colormap
1370
         *     32 bpp rgb    */
1371
0
    pixGetDimensions(pixs, &w, &h, &d);
1372
0
    cmap = pixGetColormap(pixs);
1373
0
    cmapflag = (cmap) ? 1 : 0;
1374
0
    if (d == 2 || d == 4 || d == 16) {
1375
0
        pixt = pixConvertTo8(pixs, cmapflag);
1376
0
        cmap = pixGetColormap(pixt);
1377
0
        d = pixGetDepth(pixt);
1378
0
    } else if (d == 32 && pixGetSpp(pixs) == 4) {  /* remove alpha */
1379
0
        pixt = pixAlphaBlendUniform(pixs, 0xffffff00);
1380
0
    } else {
1381
0
        pixt = pixClone(pixs);
1382
0
    }
1383
0
    if (!pixt)
1384
0
        return (L_COMP_DATA *)ERROR_PTR("pixt not made", __func__, NULL);
1385
0
    spp = (d == 32) ? 3 : 1;
1386
0
    bps = (d == 32) ? 8 : d;
1387
1388
        /* Extract and encode the colormap data as both ascii85 and hexascii  */
1389
0
    ncolors = 0;
1390
0
    if (cmap) {
1391
0
        pixcmapSerializeToMemory(cmap, 3, &ncolors, &cmapdata);
1392
0
        if (!cmapdata) {
1393
0
            pixDestroy(&pixt);
1394
0
            return (L_COMP_DATA *)ERROR_PTR("cmapdata not made",
1395
0
                                            __func__, NULL);
1396
0
        }
1397
1398
0
        cmapdata85 = encodeAscii85(cmapdata, 3 * ncolors, &ncmapbytes85);
1399
0
        cmapdatahex = pixcmapConvertToHex(cmapdata, ncolors);
1400
0
        LEPT_FREE(cmapdata);
1401
0
    }
1402
1403
        /* Extract and compress the raster data */
1404
0
    pixGetRasterData(pixt, &data, &nbytes);
1405
0
    pixDestroy(&pixt);
1406
0
    if (!data) {
1407
0
        LEPT_FREE(cmapdata85);
1408
0
        LEPT_FREE(cmapdatahex);
1409
0
        return (L_COMP_DATA *)ERROR_PTR("data not returned", __func__, NULL);
1410
0
    }
1411
0
    datacomp = zlibCompress(data, nbytes, &nbytescomp);
1412
0
    LEPT_FREE(data);
1413
0
    if (!datacomp) {
1414
0
        LEPT_FREE(cmapdata85);
1415
0
        LEPT_FREE(cmapdatahex);
1416
0
        return (L_COMP_DATA *)ERROR_PTR("datacomp not made", __func__, NULL);
1417
0
    }
1418
1419
        /* Optionally, encode the compressed data */
1420
0
    if (ascii85flag == 1) {
1421
0
        data85 = encodeAscii85(datacomp, nbytescomp, &nbytes85);
1422
0
        LEPT_FREE(datacomp);
1423
0
        if (!data85) {
1424
0
            LEPT_FREE(cmapdata85);
1425
0
            LEPT_FREE(cmapdatahex);
1426
0
            return (L_COMP_DATA *)ERROR_PTR("data85 not made", __func__, NULL);
1427
0
        } else {
1428
0
            data85[nbytes85 - 1] = '\0';  /* remove the newline */
1429
0
        }
1430
0
    }
1431
1432
0
    cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA));
1433
0
    if (ascii85flag == 0) {
1434
0
        cid->datacomp = datacomp;
1435
0
    } else {  /* ascii85 */
1436
0
        cid->data85 = data85;
1437
0
        cid->nbytes85 = nbytes85;
1438
0
    }
1439
0
    cid->type = L_FLATE_ENCODE;
1440
0
    cid->cmapdatahex = cmapdatahex;
1441
0
    cid->cmapdata85 = cmapdata85;
1442
0
    cid->nbytescomp = nbytescomp;
1443
0
    cid->ncolors = ncolors;
1444
0
    cid->w = w;
1445
0
    cid->h = h;
1446
0
    cid->bps = bps;
1447
0
    cid->spp = spp;
1448
0
    cid->res = pixGetXRes(pixs);
1449
0
    cid->nbytes = nbytes;  /* only for debugging */
1450
0
    return cid;
1451
0
}
1452
1453
1454
/*!
1455
 * \brief   pixGenerateJpegData()
1456
 *
1457
 * \param[in]    pixs           8, 16 or 32 bpp, no colormap
1458
 * \param[in]    ascii85flag    0 for jpeg; 1 for ascii85-encoded jpeg
1459
 * \param[in]    quality        0 for default, which is 75
1460
 * \return  cid jpeg compressed data, or NULL on error
1461
 *
1462
 * <pre>
1463
 * Notes:
1464
 *      (1) Set ascii85flag:
1465
 *           ~ 0 for binary data (PDF only)
1466
 *           ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
1467
 *      (2) If 16 bpp, convert first to 8 bpp, using the MSB
1468
 * </pre>
1469
 */
1470
static L_COMP_DATA *
1471
pixGenerateJpegData(PIX     *pixs,
1472
                    l_int32  ascii85flag,
1473
                    l_int32  quality)
1474
0
{
1475
0
l_int32       d;
1476
0
char         *fname;
1477
0
L_COMP_DATA  *cid;
1478
1479
0
    if (!pixs)
1480
0
        return (L_COMP_DATA *)ERROR_PTR("pixs not defined", __func__, NULL);
1481
0
    if (pixGetColormap(pixs))
1482
0
        return (L_COMP_DATA *)ERROR_PTR("pixs has colormap", __func__, NULL);
1483
0
    d = pixGetDepth(pixs);
1484
0
    if (d != 8 && d != 16 && d != 32)
1485
0
        return (L_COMP_DATA *)ERROR_PTR("pixs not 8, 16 or 32 bpp",
1486
0
                __func__, NULL);
1487
1488
        /* Compress to a temp jpeg file */
1489
0
    fname = l_makeTempFilename();
1490
0
    if (pixWriteJpeg(fname, pixs, quality, 0)) {
1491
0
        LEPT_FREE(fname);
1492
0
        return NULL;
1493
0
    }
1494
1495
        /* Generate the data */
1496
0
    cid = l_generateJpegData(fname, ascii85flag);
1497
0
    if (lept_rmfile(fname) != 0)
1498
0
        L_ERROR("temp file %s was not deleted\n", __func__, fname);
1499
0
    LEPT_FREE(fname);
1500
0
    return cid;
1501
0
}
1502
1503
1504
/*!
1505
 * \brief   pixGenerateJp2kData()
1506
 *
1507
 * \param[in]    pixs           8 or 32 bpp, no colormap
1508
 * \param[in]    quality        0 for default, which is 34
1509
 * \return  cid jp2k compressed data, or NULL on error
1510
 *
1511
 * <pre>
1512
 * Notes:
1513
 *      (1) The quality can be set between 27 (very poor) and 45
1514
 *          (nearly perfect).  Use 0 for default (34). Use 100 for lossless,
1515
 *          but this is very expensive and not recommended.
1516
 * </pre>
1517
 */
1518
static L_COMP_DATA *
1519
pixGenerateJp2kData(PIX     *pixs,
1520
                    l_int32  quality)
1521
0
{
1522
0
l_int32       d;
1523
0
char         *fname;
1524
0
L_COMP_DATA  *cid;
1525
1526
0
    if (!pixs)
1527
0
        return (L_COMP_DATA *)ERROR_PTR("pixs not defined", __func__, NULL);
1528
0
    if (pixGetColormap(pixs))
1529
0
        return (L_COMP_DATA *)ERROR_PTR("pixs has colormap", __func__, NULL);
1530
0
    d = pixGetDepth(pixs);
1531
0
    if (d != 8 && d != 32)
1532
0
        return (L_COMP_DATA *)ERROR_PTR("pixs not 8 or 32 bpp", __func__, NULL);
1533
1534
        /* Compress to a temp jp2k file */
1535
0
    fname = l_makeTempFilename();
1536
0
    if (pixWriteJp2k(fname, pixs, quality, 5, 0, 0)) {
1537
0
        LEPT_FREE(fname);
1538
0
        return NULL;
1539
0
    }
1540
1541
        /* Generate the data */
1542
0
    cid = l_generateJp2kData(fname);
1543
0
    if (lept_rmfile(fname) != 0)
1544
0
        L_ERROR("temp file %s was not deleted\n", __func__, fname);
1545
0
    LEPT_FREE(fname);
1546
0
    return cid;
1547
0
}
1548
1549
1550
/*!
1551
 * \brief   pixGenerateG4Data()
1552
 *
1553
 * \param[in]    pixs           1 bpp, no colormap
1554
 * \param[in]    ascii85flag    0 for gzipped; 1 for ascii85-encoded gzipped
1555
 * \return  cid g4 compressed image data, or NULL on error
1556
 *
1557
 * <pre>
1558
 * Notes:
1559
 *      (1) Set ascii85flag:
1560
 *           ~ 0 for binary data (PDF only)
1561
 *           ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
1562
 * </pre>
1563
 */
1564
static L_COMP_DATA *
1565
pixGenerateG4Data(PIX     *pixs,
1566
                  l_int32  ascii85flag)
1567
0
{
1568
0
char         *fname;
1569
0
L_COMP_DATA  *cid;
1570
1571
0
    if (!pixs)
1572
0
        return (L_COMP_DATA *)ERROR_PTR("pixs not defined", __func__, NULL);
1573
0
    if (pixGetDepth(pixs) != 1)
1574
0
        return (L_COMP_DATA *)ERROR_PTR("pixs not 1 bpp", __func__, NULL);
1575
0
    if (pixGetColormap(pixs))
1576
0
        return (L_COMP_DATA *)ERROR_PTR("pixs has colormap", __func__, NULL);
1577
1578
        /* Compress to a temp tiff g4 file */
1579
0
    fname = l_makeTempFilename();
1580
0
    if (pixWrite(fname, pixs, IFF_TIFF_G4)) {
1581
0
        LEPT_FREE(fname);
1582
0
        return NULL;
1583
0
    }
1584
1585
0
    cid = l_generateG4Data(fname, ascii85flag);
1586
0
    if (lept_rmfile(fname) != 0)
1587
0
        L_ERROR("temp file %s was not deleted\n", __func__, fname);
1588
0
    LEPT_FREE(fname);
1589
0
    return cid;
1590
0
}
1591
1592
1593
/*!
1594
 * \brief   cidConvertToPdfData()
1595
 *
1596
 * \param[in]    cid       compressed image data
1597
 * \param[in]    title     [optional] pdf title; can be null
1598
 * \param[out]   pdata     output pdf data for image
1599
 * \param[out]   pnbytes   size of output pdf data
1600
 * \return  0 if OK, 1 on error
1601
 *
1602
 * <pre>
1603
 * Notes:
1604
 *      (1) Caller must not destroy the cid.  It is absorbed in the
1605
 *          lpd and destroyed by this function.
1606
 * </pre>
1607
 */
1608
l_ok
1609
cidConvertToPdfData(L_COMP_DATA  *cid,
1610
                    const char   *title,
1611
                    l_uint8     **pdata,
1612
                    size_t       *pnbytes)
1613
0
{
1614
0
l_int32      res, ret;
1615
0
l_float32    wpt, hpt;
1616
0
L_PDF_DATA  *lpd = NULL;
1617
1618
0
    if (!pdata || !pnbytes)
1619
0
        return ERROR_INT("&data and &nbytes not both defined", __func__, 1);
1620
0
    *pdata = NULL;
1621
0
    *pnbytes = 0;
1622
0
    if (!cid)
1623
0
        return ERROR_INT("cid not defined", __func__, 1);
1624
1625
        /* Get media box parameters, in pts */
1626
0
    res = cid->res;
1627
0
    if (res <= 0)
1628
0
        res = DefaultInputRes;
1629
0
    wpt = cid->w * 72.f / res;
1630
0
    hpt = cid->h * 72.f / res;
1631
1632
        /* Set up the pdf data struct (lpd) */
1633
0
    if ((lpd = pdfdataCreate(title)) == NULL)
1634
0
        return ERROR_INT("lpd not made", __func__, 1);
1635
0
    ptraAdd(lpd->cida, cid);
1636
0
    lpd->n++;
1637
0
    ptaAddPt(lpd->xy, 0, 0);   /* xpt = ypt = 0 */
1638
0
    ptaAddPt(lpd->wh, wpt, hpt);
1639
1640
        /* Generate the pdf string and destroy the lpd */
1641
0
    ret = l_generatePdf(pdata, pnbytes, lpd);
1642
0
    pdfdataDestroy(&lpd);
1643
0
    if (ret)
1644
0
        return ERROR_INT("pdf output not made", __func__, 1);
1645
0
    return 0;
1646
0
}
1647
1648
1649
/*!
1650
 * \brief   l_CIDataDestroy()
1651
 *
1652
 * \param[in,out]   pcid     will be set to null before returning
1653
 * \return  void
1654
 */
1655
void
1656
l_CIDataDestroy(L_COMP_DATA  **pcid)
1657
0
{
1658
0
L_COMP_DATA  *cid;
1659
1660
0
    if (pcid == NULL) {
1661
0
        L_WARNING("ptr address is null!\n", __func__);
1662
0
        return;
1663
0
    }
1664
0
    if ((cid = *pcid) == NULL)
1665
0
        return;
1666
1667
0
    if (cid->datacomp) LEPT_FREE(cid->datacomp);
1668
0
    if (cid->data85) LEPT_FREE(cid->data85);
1669
0
    if (cid->cmapdata85) LEPT_FREE(cid->cmapdata85);
1670
0
    if (cid->cmapdatahex) LEPT_FREE(cid->cmapdatahex);
1671
0
    LEPT_FREE(cid);
1672
0
    *pcid = NULL;
1673
0
}
1674
1675
1676
/*---------------------------------------------------------------------*
1677
 *         Helper functions for generating the output pdf string       *
1678
 *---------------------------------------------------------------------*/
1679
/*!
1680
 * \brief   l_generatePdf()
1681
 *
1682
 * \param[out]   pdata     pdf array
1683
 * \param[out]   pnbytes   number of bytes in pdf array
1684
 * \param[in]    lpd       all the required input image data
1685
 * \return  0 if OK, 1 on error
1686
 *
1687
 * <pre>
1688
 * Notes:
1689
 *      (1) On error, no data is returned.
1690
 *      (2) The objects are:
1691
 *            1: Catalog
1692
 *            2: Info
1693
 *            3: Pages
1694
 *            4: Page
1695
 *            5: Contents  (rendering command)
1696
 *            6 to 6+n-1: n XObjects
1697
 *            6+n to 6+n+m-1: m colormaps
1698
 * </pre>
1699
 */
1700
static l_int32
1701
l_generatePdf(l_uint8    **pdata,
1702
              size_t      *pnbytes,
1703
              L_PDF_DATA  *lpd)
1704
0
{
1705
0
    if (!pdata)
1706
0
        return ERROR_INT("&data not defined", __func__, 1);
1707
0
    *pdata = NULL;
1708
0
    if (!pnbytes)
1709
0
        return ERROR_INT("&nbytes not defined", __func__, 1);
1710
0
    *pnbytes = 0;
1711
0
    if (!lpd)
1712
0
        return ERROR_INT("lpd not defined", __func__, 1);
1713
1714
0
    generateFixedStringsPdf(lpd);
1715
0
    generateMediaboxPdf(lpd);
1716
0
    generatePageStringPdf(lpd);
1717
0
    generateContentStringPdf(lpd);
1718
0
    generatePreXStringsPdf(lpd);
1719
0
    generateColormapStringsPdf(lpd);
1720
0
    generateTrailerPdf(lpd);
1721
0
    return generateOutputDataPdf(pdata, pnbytes, lpd);
1722
0
}
1723
1724
1725
static void
1726
generateFixedStringsPdf(L_PDF_DATA  *lpd)
1727
0
{
1728
0
char     buf[L_SMALLBUF];
1729
0
char    *version, *datestr;
1730
0
SARRAY  *sa;
1731
1732
        /* Accumulate data for the header and objects 1-3 */
1733
0
    lpd->id = stringNew("%PDF-1.5\n");
1734
0
    l_dnaAddNumber(lpd->objsize, strlen(lpd->id));
1735
1736
0
    lpd->obj1 = stringNew("1 0 obj\n"
1737
0
                          "<<\n"
1738
0
                          "/Type /Catalog\n"
1739
0
                          "/Pages 3 0 R\n"
1740
0
                          ">>\n"
1741
0
                          "endobj\n");
1742
0
    l_dnaAddNumber(lpd->objsize, strlen(lpd->obj1));
1743
1744
0
    sa = sarrayCreate(0);
1745
0
    sarrayAddString(sa, "2 0 obj\n"
1746
0
                        "<<\n", L_COPY);
1747
0
    if (var_WRITE_DATE_AND_VERSION) {
1748
0
        datestr = l_getFormattedDate();
1749
0
        snprintf(buf, sizeof(buf), "/CreationDate (D:%s)\n", datestr);
1750
0
        sarrayAddString(sa, buf, L_COPY);
1751
0
        LEPT_FREE(datestr);
1752
0
        version = getLeptonicaVersion();
1753
0
        snprintf(buf, sizeof(buf),
1754
0
                 "/Producer (leptonica: %s)\n", version);
1755
0
        LEPT_FREE(version);
1756
0
    } else {
1757
0
        snprintf(buf, sizeof(buf), "/Producer (leptonica)\n");
1758
0
    }
1759
0
    sarrayAddString(sa, buf, L_COPY);
1760
0
    if (lpd->title) {
1761
0
        char *hexstr;
1762
0
        if ((hexstr = generateEscapeString(lpd->title)) != NULL) {
1763
0
            snprintf(buf, sizeof(buf), "/Title %s\n", hexstr);
1764
0
            sarrayAddString(sa, buf, L_COPY);
1765
0
        } else {
1766
0
            L_ERROR("title string is not ascii\n", __func__);
1767
0
        }
1768
0
        LEPT_FREE(hexstr);
1769
0
    }
1770
0
    sarrayAddString(sa, ">>\n"
1771
0
                                "endobj\n", L_COPY);
1772
0
    lpd->obj2 = sarrayToString(sa, 0);
1773
0
    l_dnaAddNumber(lpd->objsize, strlen(lpd->obj2));
1774
0
    sarrayDestroy(&sa);
1775
1776
0
    lpd->obj3 = stringNew("3 0 obj\n"
1777
0
                          "<<\n"
1778
0
                          "/Type /Pages\n"
1779
0
                          "/Kids [ 4 0 R ]\n"
1780
0
                          "/Count 1\n"
1781
0
                          ">>\n");
1782
0
    l_dnaAddNumber(lpd->objsize, strlen(lpd->obj3));
1783
1784
        /* Do the post-datastream string */
1785
0
    lpd->poststream = stringNew("\n"
1786
0
                                "endstream\n"
1787
0
                                "endobj\n");
1788
0
}
1789
1790
1791
/*!
1792
 * \brief   generateEscapeString()
1793
 *
1794
 * \param[in]   str      input string
1795
 * \return   hex escape string, or null on error
1796
 *
1797
 * <pre>
1798
 * Notes:
1799
 *      (1) If the input string is not ascii, returns null.
1800
 *      (2) This takes an input ascii string and generates a hex
1801
 *          ascii output string with 4 bytes out for each byte in.
1802
 *          The feff code at the beginning tells the pdf interpreter
1803
 *          that the data is to be interpreted as big-endian, 4 bytes
1804
 *          at a time.  For ascii, the first two bytes are 0 and the
1805
 *          last two bytes are less than 0x80.
1806
 * </pre>
1807
 */
1808
static char  *
1809
generateEscapeString(const char  *str)
1810
0
{
1811
0
char     smallbuf[8];
1812
0
char    *buffer;
1813
0
l_int32  i, nchar, buflen;
1814
1815
0
    if (!str)
1816
0
        return (char *)ERROR_PTR("str not defined", __func__, NULL);
1817
0
    nchar = strlen(str);
1818
0
    for (i = 0; i < nchar; i++) {
1819
0
        if (str[i] < 0)
1820
0
            return (char *)ERROR_PTR("str not all ascii", __func__, NULL);
1821
0
    }
1822
1823
0
    buflen = 4 * nchar + 10;
1824
0
    buffer = (char *)LEPT_CALLOC(buflen, sizeof(char));
1825
0
    stringCat(buffer, buflen, "<feff");
1826
0
    for (i = 0; i < nchar; i++) {
1827
0
        snprintf(smallbuf, sizeof(smallbuf), "%04x", str[i]);
1828
0
        stringCat(buffer, buflen, smallbuf);
1829
0
    }
1830
0
    stringCat(buffer, buflen, ">");
1831
0
    return buffer;
1832
0
}
1833
1834
1835
static void
1836
generateMediaboxPdf(L_PDF_DATA  *lpd)
1837
0
{
1838
0
l_int32    i;
1839
0
l_float32  xpt, ypt, wpt, hpt, maxx, maxy;
1840
1841
        /* First get the full extent of all the images.
1842
         * This is the mediabox, in pts. */
1843
0
    maxx = maxy = 0;
1844
0
    for (i = 0; i < lpd->n; i++) {
1845
0
        ptaGetPt(lpd->xy, i, &xpt, &ypt);
1846
0
        ptaGetPt(lpd->wh, i, &wpt, &hpt);
1847
0
        maxx = L_MAX(maxx, xpt + wpt);
1848
0
        maxy = L_MAX(maxy, ypt + hpt);
1849
0
    }
1850
1851
0
    lpd->mediabox = boxCreate(0, 0, (l_int32)(maxx + 0.5),
1852
0
                              (l_int32)(maxy + 0.5));
1853
1854
        /* ypt is in standard image coordinates: the location of
1855
         * the UL image corner with respect to the UL media box corner.
1856
         * Rewrite each ypt for PostScript coordinates: the location of
1857
         * the LL image corner with respect to the LL media box corner. */
1858
0
    for (i = 0; i < lpd->n; i++) {
1859
0
        ptaGetPt(lpd->xy, i, &xpt, &ypt);
1860
0
        ptaGetPt(lpd->wh, i, &wpt, &hpt);
1861
0
        ptaSetPt(lpd->xy, i, xpt, maxy - ypt - hpt);
1862
0
    }
1863
0
}
1864
1865
1866
static l_int32
1867
generatePageStringPdf(L_PDF_DATA  *lpd)
1868
0
{
1869
0
char    *buf;
1870
0
char    *xstr;
1871
0
l_int32  bufsize, i, wpt, hpt;
1872
0
SARRAY  *sa;
1873
1874
        /* Allocate 1000 bytes for the boilerplate text, and
1875
         * 50 bytes for each reference to an image in the
1876
         * ProcSet array.  */
1877
0
    bufsize = 1000 + 50 * lpd->n;
1878
0
    if ((buf = (char *)LEPT_CALLOC(bufsize, sizeof(char))) == NULL)
1879
0
        return ERROR_INT("calloc fail for buf", __func__, 1);
1880
1881
0
    boxGetGeometry(lpd->mediabox, NULL, NULL, &wpt, &hpt);
1882
0
    sa = sarrayCreate(lpd->n);
1883
0
    for (i = 0; i < lpd->n; i++) {
1884
0
        snprintf(buf, bufsize, "/Im%d %d 0 R   ", i + 1, 6 + i);
1885
0
        sarrayAddString(sa, buf, L_COPY);
1886
0
    }
1887
0
    xstr = sarrayToString(sa, 0);
1888
0
    sarrayDestroy(&sa);
1889
0
    if (!xstr) {
1890
0
        LEPT_FREE(buf);
1891
0
        return ERROR_INT("xstr not made", __func__, 1);
1892
0
    }
1893
1894
0
    snprintf(buf, bufsize, "4 0 obj\n"
1895
0
                           "<<\n"
1896
0
                           "/Type /Page\n"
1897
0
                           "/Parent 3 0 R\n"
1898
0
                           "/MediaBox [%d %d %d %d]\n"
1899
0
                           "/Contents 5 0 R\n"
1900
0
                           "/Resources\n"
1901
0
                           "<<\n"
1902
0
                           "/XObject << %s >>\n"
1903
0
                           "/ProcSet [ /ImageB /ImageI /ImageC ]\n"
1904
0
                           ">>\n"
1905
0
                           ">>\n"
1906
0
                           "endobj\n",
1907
0
                           0, 0, wpt, hpt, xstr);
1908
1909
0
    lpd->obj4 = stringNew(buf);
1910
0
    l_dnaAddNumber(lpd->objsize, strlen(lpd->obj4));
1911
0
    sarrayDestroy(&sa);
1912
0
    LEPT_FREE(buf);
1913
0
    LEPT_FREE(xstr);
1914
0
    return 0;
1915
0
}
1916
1917
1918
static l_int32
1919
generateContentStringPdf(L_PDF_DATA  *lpd)
1920
0
{
1921
0
char      *buf;
1922
0
char      *cstr;
1923
0
l_int32    i, bufsize;
1924
0
l_float32  xpt, ypt, wpt, hpt;
1925
0
SARRAY    *sa;
1926
1927
0
    bufsize = 1000 + 200 * lpd->n;
1928
0
    if ((buf = (char *)LEPT_CALLOC(bufsize, sizeof(char))) == NULL)
1929
0
        return ERROR_INT("calloc fail for buf", __func__, 1);
1930
1931
0
    sa = sarrayCreate(lpd->n);
1932
0
    for (i = 0; i < lpd->n; i++) {
1933
0
        ptaGetPt(lpd->xy, i, &xpt, &ypt);
1934
0
        ptaGetPt(lpd->wh, i, &wpt, &hpt);
1935
0
        snprintf(buf, bufsize,
1936
0
                 "q %.4f %.4f %.4f %.4f %.4f %.4f cm /Im%d Do Q\n",
1937
0
                 wpt, 0.0, 0.0, hpt, xpt, ypt, i + 1);
1938
0
        sarrayAddString(sa, buf, L_COPY);
1939
0
    }
1940
0
    cstr = sarrayToString(sa, 0);
1941
0
    sarrayDestroy(&sa);
1942
0
    if (!cstr) {
1943
0
        LEPT_FREE(buf);
1944
0
        return ERROR_INT("cstr not made", __func__, 1);
1945
0
    }
1946
1947
0
    snprintf(buf, bufsize, "5 0 obj\n"
1948
0
                           "<< /Length %d >>\n"
1949
0
                           "stream\n"
1950
0
                           "%s"
1951
0
                           "endstream\n"
1952
0
                           "endobj\n",
1953
0
                           (l_int32)strlen(cstr), cstr);
1954
1955
0
    lpd->obj5 = stringNew(buf);
1956
0
    l_dnaAddNumber(lpd->objsize, strlen(lpd->obj5));
1957
0
    sarrayDestroy(&sa);
1958
0
    LEPT_FREE(buf);
1959
0
    LEPT_FREE(cstr);
1960
0
    return 0;
1961
0
}
1962
1963
1964
static l_int32
1965
generatePreXStringsPdf(L_PDF_DATA  *lpd)
1966
0
{
1967
0
char          buff[256];
1968
0
char          buf[L_BIGBUF];
1969
0
char         *cstr, *bstr, *fstr, *pstr, *xstr, *photometry;
1970
0
l_int32       i, cmindex;
1971
0
L_COMP_DATA  *cid;
1972
0
SARRAY       *sa;
1973
1974
0
    sa = lpd->saprex;
1975
0
    cmindex = 6 + lpd->n;  /* starting value */
1976
0
    for (i = 0; i < lpd->n; i++) {
1977
0
        pstr = cstr = NULL;
1978
0
        if ((cid = pdfdataGetCid(lpd, i)) == NULL)
1979
0
            return ERROR_INT("cid not found", __func__, 1);
1980
1981
0
        if (cid->type == L_G4_ENCODE) {
1982
0
            if (var_WRITE_G4_IMAGE_MASK) {
1983
0
                cstr = stringNew("/ImageMask true\n"
1984
0
                                 "/ColorSpace /DeviceGray");
1985
0
            } else {
1986
0
                cstr = stringNew("/ColorSpace /DeviceGray");
1987
0
            }
1988
0
            bstr = stringNew("/BitsPerComponent 1\n"
1989
0
                             "/Interpolate true");
1990
                /* Note: the reversal is deliberate.  The BlackIs1 flag
1991
                 * is misleadingly named: it says whether to invert the
1992
                 * image on decoding because the black pixels are 0,
1993
                 * not whether the black pixels are 1!  The default for
1994
                 * BlackIs1 is "false", which means "don't invert because
1995
                 * black is 1."  Yikes. */
1996
0
            photometry = (cid->minisblack) ? stringNew("true")
1997
0
                                           : stringNew("false");
1998
0
            snprintf(buff, sizeof(buff),
1999
0
                     "/Filter /CCITTFaxDecode\n"
2000
0
                     "/DecodeParms\n"
2001
0
                     "<<\n"
2002
0
                     "/BlackIs1 %s\n"
2003
0
                     "/K -1\n"
2004
0
                     "/Columns %d\n"
2005
0
                     ">>", photometry, cid->w);
2006
0
            fstr = stringNew(buff);
2007
0
            LEPT_FREE(photometry);
2008
0
        } else if (cid->type == L_JPEG_ENCODE) {
2009
0
            if (cid->spp == 1)
2010
0
                cstr = stringNew("/ColorSpace /DeviceGray");
2011
0
            else if (cid->spp == 3)
2012
0
                cstr = stringNew("/ColorSpace /DeviceRGB");
2013
0
            else if (cid->spp == 4)   /* pdf supports cmyk */
2014
0
                cstr = stringNew("/ColorSpace /DeviceCMYK");
2015
0
            else
2016
0
                L_ERROR("in jpeg: spp != 1, 3 or 4\n", __func__);
2017
0
            bstr = stringNew("/BitsPerComponent 8");
2018
0
            fstr = stringNew("/Filter /DCTDecode");
2019
0
        } else if (cid->type == L_JP2K_ENCODE) {
2020
0
            if (cid->spp == 1)
2021
0
                cstr = stringNew("/ColorSpace /DeviceGray");
2022
0
            else if (cid->spp == 3)
2023
0
                cstr = stringNew("/ColorSpace /DeviceRGB");
2024
0
            else
2025
0
                L_ERROR("in jp2k: spp != 1 && spp != 3\n", __func__);
2026
0
            bstr = stringNew("/BitsPerComponent 8");
2027
0
            fstr = stringNew("/Filter /JPXDecode");
2028
0
        } else {  /* type == L_FLATE_ENCODE */
2029
0
            if (cid->ncolors > 0) {  /* cmapped */
2030
0
                snprintf(buff, sizeof(buff), "/ColorSpace %d 0 R", cmindex++);
2031
0
                cstr = stringNew(buff);
2032
0
            } else {
2033
0
                if (cid->spp == 1 && cid->bps == 1)
2034
0
                    cstr = stringNew("/ColorSpace /DeviceGray\n"
2035
0
                                     "/Decode [1 0]");
2036
0
                else if (cid->spp == 1)  /* 8 bpp */
2037
0
                    cstr = stringNew("/ColorSpace /DeviceGray");
2038
0
                else if (cid->spp == 3)
2039
0
                    cstr = stringNew("/ColorSpace /DeviceRGB");
2040
0
                else
2041
0
                    L_ERROR("unknown colorspace: spp = %d\n",
2042
0
                            __func__, cid->spp);
2043
0
            }
2044
0
            snprintf(buff, sizeof(buff), "/BitsPerComponent %d", cid->bps);
2045
0
            bstr = stringNew(buff);
2046
0
            fstr = stringNew("/Filter /FlateDecode");
2047
0
            if (cid->predictor == TRUE) {
2048
0
                snprintf(buff, sizeof(buff),
2049
0
                         "/DecodeParms\n"
2050
0
                         "<<\n"
2051
0
                         "  /Columns %d\n"
2052
0
                         "  /Predictor 14\n"
2053
0
                         "  /Colors %d\n"
2054
0
                         "  /BitsPerComponent %d\n"
2055
0
                         ">>\n", cid->w, cid->spp, cid->bps);
2056
0
                pstr = stringNew(buff);
2057
0
            }
2058
0
        }
2059
0
        if (!pstr)  /* no decode parameters */
2060
0
            pstr = stringNew("");
2061
2062
0
        snprintf(buf, sizeof(buf),
2063
0
                 "%d 0 obj\n"
2064
0
                 "<<\n"
2065
0
                 "/Length %zu\n"
2066
0
                 "/Subtype /Image\n"
2067
0
                 "%s\n"  /* colorspace */
2068
0
                 "/Width %d\n"
2069
0
                 "/Height %d\n"
2070
0
                 "%s\n"  /* bits/component */
2071
0
                 "%s\n"  /* filter */
2072
0
                 "%s"   /* decode parms; can be empty */
2073
0
                 ">>\n"
2074
0
                 "stream\n",
2075
0
                 6 + i, cid->nbytescomp, cstr,
2076
0
                 cid->w, cid->h, bstr, fstr, pstr);
2077
0
        xstr = stringNew(buf);
2078
0
        sarrayAddString(sa, xstr, L_INSERT);
2079
0
        l_dnaAddNumber(lpd->objsize,
2080
0
                      strlen(xstr) + cid->nbytescomp + strlen(lpd->poststream));
2081
0
        LEPT_FREE(cstr);
2082
0
        LEPT_FREE(bstr);
2083
0
        LEPT_FREE(fstr);
2084
0
        LEPT_FREE(pstr);
2085
0
    }
2086
2087
0
    return 0;
2088
0
}
2089
2090
2091
static l_int32
2092
generateColormapStringsPdf(L_PDF_DATA  *lpd)
2093
0
{
2094
0
char          buf[L_BIGBUF];
2095
0
char         *cmstr;
2096
0
l_int32       i, cmindex, ncmap;
2097
0
L_COMP_DATA  *cid;
2098
0
SARRAY       *sa;
2099
2100
        /* In our canonical format, we have 5 objects, followed
2101
         * by n XObjects, followed by m colormaps, so the index of
2102
         * the first colormap object is 6 + n. */
2103
0
    sa = lpd->sacmap;
2104
0
    cmindex = 6 + lpd->n;  /* starting value */
2105
0
    ncmap = 0;
2106
0
    for (i = 0; i < lpd->n; i++) {
2107
0
        if ((cid = pdfdataGetCid(lpd, i)) == NULL)
2108
0
            return ERROR_INT("cid not found", __func__, 1);
2109
0
        if (cid->ncolors == 0) continue;
2110
2111
0
        ncmap++;
2112
0
        snprintf(buf, sizeof(buf), "%d 0 obj\n"
2113
0
                                   "[ /Indexed /DeviceRGB\n"
2114
0
                                   "%d\n"
2115
0
                                   "%s\n"
2116
0
                                   "]\n"
2117
0
                                   "endobj\n",
2118
0
                                   cmindex, cid->ncolors - 1, cid->cmapdatahex);
2119
0
        cmindex++;
2120
0
        cmstr = stringNew(buf);
2121
0
        l_dnaAddNumber(lpd->objsize, strlen(cmstr));
2122
0
        sarrayAddString(sa, cmstr, L_INSERT);
2123
0
    }
2124
2125
0
    lpd->ncmap = ncmap;
2126
0
    return 0;
2127
0
}
2128
2129
2130
static void
2131
generateTrailerPdf(L_PDF_DATA  *lpd)
2132
0
{
2133
0
l_int32  i, n, size, linestart;
2134
0
L_DNA   *daloc, *dasize;
2135
2136
        /* Let nobj be the number of numbered objects.  These numbered
2137
         * objects are indexed by their pdf number in arrays naloc[]
2138
         * and nasize[].  The 0th object is the 9 byte header.  Then
2139
         * the number of objects in nasize, which includes the header,
2140
         * is n = nobj + 1.  The array naloc[] has n + 1 elements,
2141
         * because it includes as the last element the starting
2142
         * location of xref.  The indexing of these objects, their
2143
         * starting locations and sizes are:
2144
         *
2145
         *     Object number         Starting location         Size
2146
         *     -------------         -----------------     --------------
2147
         *          0                   daloc[0] = 0       dasize[0] = 9
2148
         *          1                   daloc[1] = 9       dasize[1] = 49
2149
         *          n                   daloc[n]           dasize[n]
2150
         *          xref                daloc[n+1]
2151
         *
2152
         * We first generate daloc.
2153
         */
2154
0
    dasize = lpd->objsize;
2155
0
    daloc = lpd->objloc;
2156
0
    linestart = 0;
2157
0
    l_dnaAddNumber(daloc, linestart);  /* header */
2158
0
    n = l_dnaGetCount(dasize);
2159
0
    for (i = 0; i < n; i++) {
2160
0
        l_dnaGetIValue(dasize, i, &size);
2161
0
        linestart += size;
2162
0
        l_dnaAddNumber(daloc, linestart);
2163
0
    }
2164
0
    l_dnaGetIValue(daloc, n, &lpd->xrefloc);  /* save it */
2165
2166
        /* Now make the actual trailer string */
2167
0
    lpd->trailer = makeTrailerStringPdf(daloc);
2168
0
}
2169
2170
2171
static char *
2172
makeTrailerStringPdf(L_DNA  *daloc)
2173
0
{
2174
0
char    *outstr;
2175
0
char     buf[L_BIGBUF];
2176
0
l_int32  i, n, linestart, xrefloc;
2177
0
SARRAY  *sa;
2178
2179
0
    if (!daloc)
2180
0
        return (char *)ERROR_PTR("daloc not defined", __func__, NULL);
2181
0
    n = l_dnaGetCount(daloc) - 1;  /* numbered objects + 1 (yes, +1) */
2182
2183
0
    sa = sarrayCreate(0);
2184
0
    snprintf(buf, sizeof(buf), "xref\n"
2185
0
                               "0 %d\n"
2186
0
                               "0000000000 65535 f \n", n);
2187
0
    sarrayAddString(sa, buf, L_COPY);
2188
0
    for (i = 1; i < n; i++) {
2189
0
        l_dnaGetIValue(daloc, i, &linestart);
2190
0
        snprintf(buf, sizeof(buf), "%010d 00000 n \n", linestart);
2191
0
        sarrayAddString(sa, buf, L_COPY);
2192
0
    }
2193
2194
0
    l_dnaGetIValue(daloc, n, &xrefloc);
2195
0
    snprintf(buf, sizeof(buf), "trailer\n"
2196
0
                               "<<\n"
2197
0
                               "/Size %d\n"
2198
0
                               "/Root 1 0 R\n"
2199
0
                               "/Info 2 0 R\n"
2200
0
                               ">>\n"
2201
0
                               "startxref\n"
2202
0
                               "%d\n"
2203
0
                               "%%%%EOF\n", n, xrefloc);
2204
0
    sarrayAddString(sa, buf, L_COPY);
2205
0
    outstr = sarrayToString(sa, 0);
2206
0
    sarrayDestroy(&sa);
2207
0
    return outstr;
2208
0
}
2209
2210
2211
/*!
2212
 * \brief   generateOutputDataPdf()
2213
 *
2214
 * \param[out]   pdata      pdf data array
2215
 * \param[out]   pnbytes    size of pdf data array
2216
 * \param[in]    lpd        input data used to make pdf
2217
 * \return  0 if OK, 1 on error
2218
 *
2219
 * <pre>
2220
 * Notes:
2221
 *      (1) Only called from l_generatePdf().  On error, no data is returned.
2222
 * </pre>
2223
 */
2224
static l_int32
2225
generateOutputDataPdf(l_uint8    **pdata,
2226
                      size_t      *pnbytes,
2227
                      L_PDF_DATA  *lpd)
2228
0
{
2229
0
char         *str;
2230
0
l_uint8      *data;
2231
0
l_int32       nimages, i, len;
2232
0
l_int32      *sizes, *locs;
2233
0
size_t        nbytes;
2234
0
L_COMP_DATA  *cid;
2235
2236
0
    if (!pdata)
2237
0
        return ERROR_INT("&data not defined", __func__, 1);
2238
0
    *pdata = NULL;
2239
0
    if (!pnbytes)
2240
0
        return ERROR_INT("&nbytes not defined", __func__, 1);
2241
0
    nbytes = lpd->xrefloc + strlen(lpd->trailer);
2242
0
    *pnbytes = nbytes;
2243
0
    if ((data = (l_uint8 *)LEPT_CALLOC(nbytes, sizeof(l_uint8))) == NULL)
2244
0
        return ERROR_INT("calloc fail for data", __func__, 1);
2245
0
    *pdata = data;
2246
2247
0
    sizes = l_dnaGetIArray(lpd->objsize);
2248
0
    locs = l_dnaGetIArray(lpd->objloc);
2249
0
    memcpy(data, lpd->id, sizes[0]);
2250
0
    memcpy(data + locs[1], lpd->obj1, sizes[1]);
2251
0
    memcpy(data + locs[2], lpd->obj2, sizes[2]);
2252
0
    memcpy(data + locs[3], lpd->obj3, sizes[3]);
2253
0
    memcpy(data + locs[4], lpd->obj4, sizes[4]);
2254
0
    memcpy(data + locs[5], lpd->obj5, sizes[5]);
2255
2256
        /* Each image has 3 parts: variable preamble, the compressed
2257
         * data stream, and the fixed poststream. */
2258
0
    nimages = lpd->n;
2259
0
    for (i = 0; i < nimages; i++) {
2260
0
        if ((cid = pdfdataGetCid(lpd, i)) == NULL) {  /* should not happen */
2261
0
            LEPT_FREE(sizes);
2262
0
            LEPT_FREE(locs);
2263
0
            return ERROR_INT("cid not found", __func__, 1);
2264
0
        }
2265
0
        str = sarrayGetString(lpd->saprex, i, L_NOCOPY);
2266
0
        len = strlen(str);
2267
0
        memcpy(data + locs[6 + i], str, len);
2268
0
        memcpy(data + locs[6 + i] + len,
2269
0
               cid->datacomp, cid->nbytescomp);
2270
0
        memcpy(data + locs[6 + i] + len + cid->nbytescomp,
2271
0
               lpd->poststream, strlen(lpd->poststream));
2272
0
    }
2273
2274
        /* Each colormap is simply a stored string */
2275
0
    for (i = 0; i < lpd->ncmap; i++) {
2276
0
        str = sarrayGetString(lpd->sacmap, i, L_NOCOPY);
2277
0
        memcpy(data + locs[6 + nimages + i], str, strlen(str));
2278
0
    }
2279
2280
        /* And finally the trailer */
2281
0
    memcpy(data + lpd->xrefloc, lpd->trailer, strlen(lpd->trailer));
2282
0
    LEPT_FREE(sizes);
2283
0
    LEPT_FREE(locs);
2284
0
    return 0;
2285
0
}
2286
2287
2288
/*---------------------------------------------------------------------*
2289
 *          Helper functions for generating multipage pdf output       *
2290
 *---------------------------------------------------------------------*/
2291
/*!
2292
 * \brief   parseTrailerPdf()
2293
 *
2294
 * \param[in]    bas     lba of a pdf file
2295
 * \param[out]   pda     byte locations of the beginning of each object
2296
 * \return  0 if OK, 1 on error
2297
 */
2298
static l_int32
2299
parseTrailerPdf(L_BYTEA  *bas,
2300
                L_DNA   **pda)
2301
0
{
2302
0
char     *str;
2303
0
l_uint8   nl = '\n';
2304
0
l_uint8  *data;
2305
0
l_int32   i, j, start, startloc, xrefloc, found, loc, nobj, objno, trailer_ok;
2306
0
size_t    size;
2307
0
L_DNA    *da, *daobj, *daxref;
2308
0
SARRAY   *sa;
2309
2310
0
    if (!pda)
2311
0
        return ERROR_INT("&da not defined", __func__, 1);
2312
0
    *pda = NULL;
2313
0
    if (!bas)
2314
0
        return ERROR_INT("bas not defined", __func__, 1);
2315
0
    data = l_byteaGetData(bas, &size);
2316
0
    if (memcmp(data, "%PDF-1.", 7) != 0)
2317
0
        return ERROR_INT("PDF header signature not found", __func__, 1);
2318
2319
        /* Search for "startxref" starting 50 bytes from the EOF */
2320
0
    start = 0;
2321
0
    if (size > 50)
2322
0
        start = size - 50;
2323
0
    arrayFindSequence(data + start, size - start,
2324
0
                      (l_uint8 *)"startxref\n", 10, &loc, &found);
2325
0
    if (!found)
2326
0
        return ERROR_INT("startxref not found!", __func__, 1);
2327
0
    if (sscanf((char *)(data + start + loc + 10), "%d\n", &xrefloc) != 1)
2328
0
        return ERROR_INT("xrefloc not found!", __func__, 1);
2329
0
    if (xrefloc < 0 || xrefloc >= size)
2330
0
        return ERROR_INT("invalid xrefloc!", __func__, 1);
2331
0
    sa = sarrayCreateLinesFromString((char *)(data + xrefloc), 0);
2332
0
    str = sarrayGetString(sa, 1, L_NOCOPY);
2333
0
    if ((sscanf(str, "0 %d", &nobj)) != 1) {
2334
0
        sarrayDestroy(&sa);
2335
0
        return ERROR_INT("nobj not found", __func__, 1);
2336
0
    }
2337
2338
        /* Get starting locations.  The numa index is the
2339
         * object number.  loc[0] is the ID; loc[nobj + 1] is xrefloc.  */
2340
0
    da = l_dnaCreate(nobj + 1);
2341
0
    *pda = da;
2342
0
    for (i = 0; i < nobj; i++) {
2343
0
        str = sarrayGetString(sa, i + 2, L_NOCOPY);
2344
0
        sscanf(str, "%d", &startloc);
2345
0
        l_dnaAddNumber(da, startloc);
2346
0
    }
2347
0
    l_dnaAddNumber(da, xrefloc);
2348
2349
#if  DEBUG_MULTIPAGE
2350
    lept_stderr("************** Trailer string ************\n");
2351
    lept_stderr("xrefloc = %d", xrefloc);
2352
    sarrayWriteStderr(sa);
2353
2354
    lept_stderr("************** Object locations ************");
2355
    l_dnaWriteStderr(da);
2356
#endif  /* DEBUG_MULTIPAGE */
2357
0
    sarrayDestroy(&sa);
2358
2359
        /* Verify correct parsing */
2360
0
    trailer_ok = TRUE;
2361
0
    for (i = 1; i < nobj; i++) {
2362
0
        l_dnaGetIValue(da, i, &startloc);
2363
0
        if ((sscanf((char *)(data + startloc), "%d 0 obj", &objno)) != 1) {
2364
0
            L_ERROR("bad trailer for object %d\n", __func__, i);
2365
0
            trailer_ok = FALSE;
2366
0
            break;
2367
0
        }
2368
0
    }
2369
2370
        /* If the trailer is broken, reconstruct the correct obj locations */
2371
0
    if (!trailer_ok) {
2372
0
        L_INFO("rebuilding pdf trailer\n", __func__);
2373
0
        l_dnaEmpty(da);
2374
0
        l_dnaAddNumber(da, 0);
2375
0
        l_byteaFindEachSequence(bas, (l_uint8 *)" 0 obj\n", 7, &daobj);
2376
0
        nobj = l_dnaGetCount(daobj);
2377
0
        for (i = 0; i < nobj; i++) {
2378
0
            l_dnaGetIValue(daobj, i, &loc);
2379
0
            for (j = loc - 1; j > 0; j--) {
2380
0
                if (data[j] == nl)
2381
0
                    break;
2382
0
            }
2383
0
            l_dnaAddNumber(da, j + 1);
2384
0
        }
2385
0
        l_byteaFindEachSequence(bas, (l_uint8 *)"xref", 4, &daxref);
2386
0
        l_dnaGetIValue(daxref, 0, &loc);
2387
0
        l_dnaAddNumber(da, loc);
2388
0
        l_dnaDestroy(&daobj);
2389
0
        l_dnaDestroy(&daxref);
2390
0
    }
2391
2392
0
    return 0;
2393
0
}
2394
2395
2396
static char *
2397
generatePagesObjStringPdf(NUMA  *napage)
2398
0
{
2399
0
char    *str;
2400
0
char    *buf;
2401
0
l_int32  i, n, index, bufsize;
2402
0
SARRAY  *sa;
2403
2404
0
    if (!napage)
2405
0
        return (char *)ERROR_PTR("napage not defined", __func__, NULL);
2406
2407
0
    n = numaGetCount(napage);
2408
0
    bufsize = 100 + 16 * n;  /* large enough to hold the output string */
2409
0
    buf = (char *)LEPT_CALLOC(bufsize, sizeof(char));
2410
0
    sa = sarrayCreate(n);
2411
0
    for (i = 0; i < n; i++) {
2412
0
        numaGetIValue(napage, i, &index);
2413
0
        snprintf(buf, bufsize, " %d 0 R ", index);
2414
0
        sarrayAddString(sa, buf, L_COPY);
2415
0
    }
2416
2417
0
    str = sarrayToString(sa, 0);
2418
0
    snprintf(buf, bufsize - 1, "3 0 obj\n"
2419
0
                               "<<\n"
2420
0
                               "/Type /Pages\n"
2421
0
                               "/Kids [%s]\n"
2422
0
                               "/Count %d\n"
2423
0
                               ">>\n"
2424
0
                               "endobj\n",
2425
0
                               str, n);
2426
0
    sarrayDestroy(&sa);
2427
0
    LEPT_FREE(str);
2428
0
    return buf;
2429
0
}
2430
2431
2432
/*!
2433
 * \brief   substituteObjectNumbers()
2434
 *
2435
 * \param[in]   bas        lba of a pdf object
2436
 * \param[in]   na_objs    object number mapping array
2437
 * \return    bad   lba of rewritten pdf for the object
2438
 *
2439
 * <pre>
2440
 * Notes:
2441
 *      (1) Interpret the first set of bytes as the object number,
2442
 *          map to the new number, and write it out.
2443
 *      (2) Find all occurrences of this 4-byte sequence: " 0 R"
2444
 *      (3) Find the location and value of the integer preceding this,
2445
 *          and map it to the new value.
2446
 *      (4) Rewrite the object with new object numbers.
2447
 * </pre>
2448
 */
2449
static L_BYTEA *
2450
substituteObjectNumbers(L_BYTEA  *bas,
2451
                        NUMA     *na_objs)
2452
0
{
2453
0
l_uint8   space = ' ';
2454
0
l_uint8  *datas;
2455
0
l_uint8   buf[32];  /* only needs to hold one integer in ascii format */
2456
0
l_int32   start, nrepl, i, j, nobjs, objin, objout, found;
2457
0
l_int32  *objs, *matches;
2458
0
size_t    size;
2459
0
L_BYTEA  *bad;
2460
0
L_DNA    *da_match;
2461
2462
0
    if (!bas)
2463
0
        return (L_BYTEA *)ERROR_PTR("bas not defined", __func__, NULL);
2464
0
    if (!na_objs)
2465
0
        return (L_BYTEA *)ERROR_PTR("na_objs not defined", __func__, NULL);
2466
2467
0
    datas = l_byteaGetData(bas, &size);
2468
0
    bad = l_byteaCreate(100);
2469
0
    objs = numaGetIArray(na_objs);  /* object number mapper */
2470
0
    nobjs = numaGetCount(na_objs);  /* use for sanity checking */
2471
2472
        /* Substitute the object number on the first line */
2473
0
    sscanf((char *)datas, "%d", &objin);
2474
0
    if (objin < 0 || objin >= nobjs) {
2475
0
        L_ERROR("index %d into array of size %d\n", __func__, objin, nobjs);
2476
0
        LEPT_FREE(objs);
2477
0
        return bad;
2478
0
    }
2479
0
    objout = objs[objin];
2480
0
    snprintf((char *)buf, 32, "%d", objout);
2481
0
    l_byteaAppendString(bad, (char *)buf);
2482
2483
        /* Find the set of matching locations for object references */
2484
0
    arrayFindSequence(datas, size, &space, 1, &start, &found);
2485
0
    da_match = arrayFindEachSequence(datas, size, (l_uint8 *)" 0 R", 4);
2486
0
    if (!da_match) {
2487
0
        l_byteaAppendData(bad, datas + start, size - start);
2488
0
        LEPT_FREE(objs);
2489
0
        return bad;
2490
0
    }
2491
2492
        /* Substitute all the object reference numbers */
2493
0
    nrepl = l_dnaGetCount(da_match);
2494
0
    matches = l_dnaGetIArray(da_match);
2495
0
    for (i = 0; i < nrepl; i++) {
2496
            /* Find the first space before the object number */
2497
0
        for (j = matches[i] - 1; j > 0; j--) {
2498
0
            if (datas[j] == space)
2499
0
                break;
2500
0
        }
2501
            /* Copy bytes from 'start' up to the object number */
2502
0
        l_byteaAppendData(bad, datas + start, j - start + 1);
2503
0
        sscanf((char *)(datas + j + 1), "%d", &objin);
2504
0
        if (objin < 0 || objin >= nobjs) {
2505
0
            L_ERROR("index %d into array of size %d\n", __func__, objin, nobjs);
2506
0
            LEPT_FREE(objs);
2507
0
            LEPT_FREE(matches);
2508
0
            l_dnaDestroy(&da_match);
2509
0
            return bad;
2510
0
        }
2511
0
        objout = objs[objin];
2512
0
        snprintf((char *)buf, 32, "%d", objout);
2513
0
        l_byteaAppendString(bad, (char *)buf);
2514
0
        start = matches[i];
2515
0
    }
2516
0
    l_byteaAppendData(bad, datas + start, size - start);
2517
2518
0
    LEPT_FREE(objs);
2519
0
    LEPT_FREE(matches);
2520
0
    l_dnaDestroy(&da_match);
2521
0
    return bad;
2522
0
}
2523
2524
2525
/*---------------------------------------------------------------------*
2526
 *                     Create/destroy/access pdf data                  *
2527
 *---------------------------------------------------------------------*/
2528
static L_PDF_DATA *
2529
pdfdataCreate(const char  *title)
2530
0
{
2531
0
L_PDF_DATA *lpd;
2532
2533
0
    lpd = (L_PDF_DATA *)LEPT_CALLOC(1, sizeof(L_PDF_DATA));
2534
0
    if (title) lpd->title = stringNew(title);
2535
0
    lpd->cida = ptraCreate(10);
2536
0
    lpd->xy = ptaCreate(10);
2537
0
    lpd->wh = ptaCreate(10);
2538
0
    lpd->saprex = sarrayCreate(10);
2539
0
    lpd->sacmap = sarrayCreate(10);
2540
0
    lpd->objsize = l_dnaCreate(20);
2541
0
    lpd->objloc = l_dnaCreate(20);
2542
0
    return lpd;
2543
0
}
2544
2545
static void
2546
pdfdataDestroy(L_PDF_DATA  **plpd)
2547
0
{
2548
0
l_int32       i;
2549
0
L_COMP_DATA  *cid;
2550
0
L_PDF_DATA   *lpd;
2551
2552
0
    if (plpd== NULL) {
2553
0
        L_WARNING("ptr address is null!\n", __func__);
2554
0
        return;
2555
0
    }
2556
0
    if ((lpd = *plpd) == NULL)
2557
0
        return;
2558
2559
0
    if (lpd->title) LEPT_FREE(lpd->title);
2560
0
    for (i = 0; i < lpd->n; i++) {
2561
0
        cid = (L_COMP_DATA *)ptraRemove(lpd->cida, i, L_NO_COMPACTION);
2562
0
        l_CIDataDestroy(&cid);
2563
0
    }
2564
2565
0
    ptraDestroy(&lpd->cida, 0, 0);
2566
0
    if (lpd->id) LEPT_FREE(lpd->id);
2567
0
    if (lpd->obj1) LEPT_FREE(lpd->obj1);
2568
0
    if (lpd->obj2) LEPT_FREE(lpd->obj2);
2569
0
    if (lpd->obj3) LEPT_FREE(lpd->obj3);
2570
0
    if (lpd->obj4) LEPT_FREE(lpd->obj4);
2571
0
    if (lpd->obj5) LEPT_FREE(lpd->obj5);
2572
0
    if (lpd->poststream) LEPT_FREE(lpd->poststream);
2573
0
    if (lpd->trailer) LEPT_FREE(lpd->trailer);
2574
0
    if (lpd->xy) ptaDestroy(&lpd->xy);
2575
0
    if (lpd->wh) ptaDestroy(&lpd->wh);
2576
0
    if (lpd->mediabox) boxDestroy(&lpd->mediabox);
2577
0
    if (lpd->saprex) sarrayDestroy(&lpd->saprex);
2578
0
    if (lpd->sacmap) sarrayDestroy(&lpd->sacmap);
2579
0
    if (lpd->objsize) l_dnaDestroy(&lpd->objsize);
2580
0
    if (lpd->objloc) l_dnaDestroy(&lpd->objloc);
2581
0
    LEPT_FREE(lpd);
2582
0
    *plpd = NULL;
2583
0
}
2584
2585
2586
static L_COMP_DATA *
2587
pdfdataGetCid(L_PDF_DATA  *lpd,
2588
              l_int32      index)
2589
0
{
2590
0
    if (!lpd)
2591
0
        return (L_COMP_DATA *)ERROR_PTR("lpd not defined", __func__, NULL);
2592
0
    if (index < 0 || index >= lpd->n)
2593
0
        return (L_COMP_DATA *)ERROR_PTR("invalid image index", __func__, NULL);
2594
2595
0
    return (L_COMP_DATA *)ptraGetPtrToItem(lpd->cida, index);
2596
0
}
2597
2598
2599
/*---------------------------------------------------------------------*
2600
 *                     Find number of pages in a pdf                   *
2601
 *---------------------------------------------------------------------*/
2602
/*!
2603
 * \brief   getPdfPageCount()
2604
 *
2605
 * \param[in]    fname      filename
2606
 * \param[out]   pnpages    number of pages
2607
 * \return  0 if OK, 1 on error
2608
 *
2609
 * <pre>
2610
 * Notes:
2611
 *      (1) Looks for the argument of the first instance of /Count in the file.
2612
 *      (2) This first reads 10000 bytes from the beginning of the file.
2613
 *          If "/Count" is not in that string, it reads the entire file
2614
 *          and looks for "/Count".
2615
 *      (3) This will not work on encrypted pdf files or on files where
2616
 *          the "/Count" field is binary compressed.  Not finding the
2617
 *          "/Count" field is not an error, but a warning is given.
2618
 * </pre>
2619
 */
2620
l_ok
2621
getPdfPageCount(const char  *fname,
2622
                l_int32     *pnpages)
2623
0
{
2624
0
l_uint8  *data;
2625
0
l_int32   format, loc, ret, npages, found;
2626
0
size_t    nread;
2627
2628
0
    if (!pnpages)
2629
0
        return ERROR_INT("&npages not defined", __func__, 1);
2630
0
    *pnpages = 0;
2631
0
    if (!fname)
2632
0
        return ERROR_INT("fname not defined", __func__, 1);
2633
2634
        /* Make sure this a pdf file */
2635
0
    findFileFormat(fname, &format);
2636
0
    if (format != IFF_LPDF)
2637
0
        return ERROR_INT("file is not pdf", __func__, 1);
2638
2639
        /* Read 10000 bytes from the beginning of the file */
2640
0
    if ((data = l_binaryReadSelect(fname, 0, 10000, &nread))
2641
0
                 == NULL)
2642
0
        return ERROR_INT("partial data not read", __func__, 1);
2643
2644
        /* Find the location of the first instance of "/Count".
2645
         * If it is not found, try reading the entire file and
2646
         * looking again. */
2647
0
    arrayFindSequence(data, nread, (const l_uint8 *)"/Count",
2648
0
          strlen("/Count"), &loc, &found);
2649
0
    if (!found) {
2650
0
        lept_stderr("Reading entire file looking for '/Count'\n");
2651
0
        LEPT_FREE(data);
2652
0
        if ((data = l_binaryRead(fname, &nread)) == NULL)
2653
0
            return ERROR_INT("full data not read", __func__, 1);
2654
0
        arrayFindSequence(data, nread, (const l_uint8 *)"/Count",
2655
0
             strlen("/Count"), &loc, &found);
2656
0
        if (!found) {
2657
0
            LEPT_FREE(data);
2658
0
            L_WARNING("/Count not found\n", __func__);
2659
0
            return 0;
2660
0
        }
2661
0
    }
2662
2663
        /* Unlikely: make sure we can read the count field */
2664
0
    if (nread - loc < 12)  { /* haven't read enough to capture page count */
2665
0
        LEPT_FREE(data);
2666
0
        return ERROR_INT("data may not include page count field", __func__, 1);
2667
0
    }
2668
2669
        /* Read the page count; if not found, puts garbage in npages */
2670
0
    ret = sscanf((char *)&data[loc], "/Count %d", &npages);
2671
0
    LEPT_FREE(data);
2672
0
    if (ret != 1)
2673
0
        return ERROR_INT("npages not found", __func__, 1);
2674
0
    *pnpages = npages;
2675
/*    lept_stderr("bytes read = %d, loc = %d, npages = %d\n",
2676
                nread, loc, *pnpages);  */
2677
0
    return 0;
2678
0
}
2679
2680
2681
/*---------------------------------------------------------------------*
2682
 *      Find widths and heights of pages and media boxes in a pdf      *
2683
 *---------------------------------------------------------------------*/
2684
/*!
2685
 * \brief   getPdfPageSizes()
2686
 *
2687
 * \param[in]    fname        filename
2688
 * \param[out]   pnaw         [optional] array of page widths
2689
 * \param[out]   pnah         [optional] array of page heights
2690
 * \param[out]   pmedw        [optional] median page width
2691
 * \param[out]   pmedh        [optional] median page height
2692
 * \return  0 if OK, 1 on error
2693
 *
2694
 * <pre>
2695
 * Notes:
2696
 *      (1) Finds the arguments of each instance of '/Width' and '/Height'
2697
 *          in the file.
2698
 *      (2) This will not work on encrypted pdf files or on files where
2699
 *          the "/Width" and "/Height" fields are binary compressed.
2700
 *          Not finding the "/Width" and /Height" fields is not an error,
2701
 *          but a warning is given.
2702
 * </pre>
2703
 */
2704
l_ok
2705
getPdfPageSizes(const char  *fname,
2706
                NUMA       **pnaw,
2707
                NUMA       **pnah,
2708
                l_int32     *pmedw,
2709
                l_int32     *pmedh)
2710
0
{
2711
0
l_uint8   *data;
2712
0
l_int32    i, nw, nh, format, ret, loc, width, height;
2713
0
l_float32  fval;
2714
0
size_t     nread;
2715
0
L_DNA     *dnaw;  /* width locations */
2716
0
L_DNA     *dnah;  /* height locations */
2717
0
NUMA      *naw;   /* widths */
2718
0
NUMA      *nah;   /* heights */
2719
2720
0
    if (pnaw) *pnaw = NULL;
2721
0
    if (pnah) *pnah = NULL;
2722
0
    if (pmedw) *pmedw = 0;
2723
0
    if (pmedh) *pmedh = 0;
2724
0
    if (!pnaw && !pnah && !pmedw && !pmedh)
2725
0
        return ERROR_INT("no output requested", __func__, 1);
2726
0
    if (!fname)
2727
0
        return ERROR_INT("fname not defined", __func__, 1);
2728
2729
        /* Make sure this a pdf file */
2730
0
    findFileFormat(fname, &format);
2731
0
    if (format != IFF_LPDF)
2732
0
        return ERROR_INT("file is not pdf", __func__, 1);
2733
2734
        /* Read the file into memory and find all locations of
2735
         * '/Width' and '/Height' */
2736
0
    if ((data = l_binaryRead(fname, &nread)) == NULL)
2737
0
        return ERROR_INT("full data not read", __func__, 1);
2738
0
    dnaw = arrayFindEachSequence(data, nread, (const l_uint8 *)"/Width",
2739
0
                                 strlen("/Width"));
2740
0
    dnah = arrayFindEachSequence(data, nread, (const l_uint8 *)"/Height",
2741
0
                                 strlen("/Height"));
2742
0
    if (!dnaw)
2743
0
        L_WARNING("unable to find widths\n", __func__);
2744
0
    if (!dnah)
2745
0
        L_WARNING("unable to find heights\n", __func__);
2746
0
    if (!dnaw && !dnah) {
2747
0
        LEPT_FREE(data);
2748
0
        L_WARNING("no fields found\n", __func__);
2749
0
        return 0;
2750
0
    }
2751
2752
        /* Find the page widths and heights */
2753
0
    nw = l_dnaGetCount(dnaw);
2754
0
    naw = numaCreate(nw);
2755
0
    for (i = 0; i < nw; i++) {
2756
0
        l_dnaGetIValue(dnaw, i, &loc);
2757
0
        ret = sscanf((char *)&data[loc], "/Width %d", &width);
2758
0
        if (ret != 1) {
2759
0
            L_ERROR("width not found for item %d at loc %d\n",
2760
0
                    __func__, i, loc);
2761
0
            continue;
2762
0
        }
2763
0
        numaAddNumber(naw, width);
2764
0
    }
2765
0
    nh = l_dnaGetCount(dnah);
2766
0
    nah = numaCreate(nh);
2767
0
    for (i = 0; i < nh; i++) {
2768
0
        l_dnaGetIValue(dnah, i, &loc);
2769
0
        ret = sscanf((char *)&data[loc], "/Height %d", &height);
2770
0
        if (ret != 1) {
2771
0
            L_ERROR("height not found for item %d at loc %d\n",
2772
0
                    __func__, i, loc);
2773
0
            continue;
2774
0
        }
2775
0
        numaAddNumber(nah, height);
2776
0
    }
2777
2778
0
    LEPT_FREE(data);
2779
0
    l_dnaDestroy(&dnaw);
2780
0
    l_dnaDestroy(&dnah);
2781
0
    if (pmedw) {
2782
0
        numaGetMedian(naw, &fval);
2783
0
        *pmedw = lept_roundftoi(fval);
2784
0
    }
2785
0
    if (pnaw)
2786
0
        *pnaw = naw;
2787
0
    else
2788
0
        numaDestroy(&naw);
2789
0
    if (pmedh) {
2790
0
        numaGetMedian(nah, &fval);
2791
0
        *pmedh = lept_roundftoi(fval);
2792
0
    }
2793
0
    if (pnah)
2794
0
        *pnah = nah;
2795
0
    else
2796
0
        numaDestroy(&nah);
2797
0
    return 0;
2798
0
}
2799
2800
2801
/*!
2802
 * \brief   getPdfMediaBoxSizes()
2803
 *
2804
 * \param[in]    fname        filename
2805
 * \param[out]   pnaw         [optional] array of mediabox widths
2806
 * \param[out]   pnah         [optional] array of mediabox heights
2807
 * \param[out]   pmedw        [optional] median mediabox width
2808
 * \param[out]   pmedh        [optional] median mediabox height
2809
 * \return  0 if OK, 1 on error
2810
 *
2811
 * <pre>
2812
 * Notes:
2813
 *      (1) Finds the arguments of each instance of '/MediaBox' in the file.
2814
 *      (2) This will not work on encrypted pdf files or on files where
2815
 *          the "/MediaBoxes" field is binary compressed.  Not finding
2816
 *          the "/MediaBoxes" field is not an error, but a warning is given.
2817
 *      (3) This is useful for determining if the media boxes are
2818
 *          incorrectly assigned, such as assuming the resolution is 72 ppi.
2819
 *          If that happens and the input the the renderer assumes the
2820
 *          resolution is 300 ppi, the rendered images will be over 4x too
2821
 *          large in each dimension.
2822
 *      (4) An image dimension of 11 inches corresponds to a MediaBox
2823
 *          parameter of 792.  We consider a value > 850 to be oversized
2824
 *          and not to be taken literally.
2825
 * </pre>
2826
 */
2827
l_ok
2828
getPdfMediaBoxSizes(const char  *fname,
2829
                    NUMA       **pnaw,
2830
                    NUMA       **pnah,
2831
                    l_int32     *pmedw,
2832
                    l_int32     *pmedh)
2833
0
{
2834
0
l_uint8   *data;
2835
0
l_int32    i, n, format, ret, loc;
2836
0
l_float32  fval, ignore1, ignore2, w, h;
2837
0
size_t     nread;
2838
0
L_DNA     *dna;   /* mediabox locations */
2839
0
NUMA      *naw;   /* mediabox widths */
2840
0
NUMA      *nah;   /* mediabox heights */
2841
2842
0
    if (pnaw) *pnaw = NULL;
2843
0
    if (pnah) *pnah = NULL;
2844
0
    if (pmedw) *pmedw = 0;
2845
0
    if (pmedh) *pmedh = 0;
2846
0
    if (!pnaw && !pnah && !pmedw && !pmedh)
2847
0
        return ERROR_INT("no output requested", __func__, 1);
2848
0
    if (!fname)
2849
0
        return ERROR_INT("fname not defined", __func__, 1);
2850
2851
        /* Make sure this a pdf file */
2852
0
    findFileFormat(fname, &format);
2853
0
    if (format != IFF_LPDF)
2854
0
        return ERROR_INT("file is not pdf", __func__, 1);
2855
2856
        /* Read the file into memory and find all locations of '/MediaBox' */
2857
0
    if ((data = l_binaryRead(fname, &nread)) == NULL)
2858
0
        return ERROR_INT("full data not read", __func__, 1);
2859
0
    dna = arrayFindEachSequence(data, nread, (const l_uint8 *)"/MediaBox",
2860
0
                                strlen("/MediaBox"));
2861
0
    if (!dna) {
2862
0
        LEPT_FREE(data);
2863
0
        L_WARNING("no mediaboxes found\n", __func__);
2864
0
        return 1;
2865
0
    }
2866
2867
        /* Find the mediabox widths and heights */
2868
0
    n = l_dnaGetCount(dna);
2869
0
    naw = numaCreate(n);
2870
0
    nah = numaCreate(n);
2871
0
    for (i = 0; i < n; i++) {
2872
0
        l_dnaGetIValue(dna, i, &loc);
2873
0
        ret = sscanf((char *)&data[loc], "/MediaBox [ %f %f %f %f",
2874
0
                     &ignore1, &ignore2, &w, &h);
2875
0
        if (ret != 4) {
2876
0
            L_ERROR("mediabox sizes not found for item %d at loc %d\n",
2877
0
                    __func__, i, loc);
2878
0
            continue;
2879
0
        }
2880
0
        numaAddNumber(naw, w);
2881
0
        numaAddNumber(nah, h);
2882
0
    }
2883
0
    LEPT_FREE(data);
2884
0
    l_dnaDestroy(&dna);
2885
2886
0
    if (pmedw) {
2887
0
        numaGetMedian(naw, &fval);
2888
0
        *pmedw = lept_roundftoi(fval);
2889
0
        if (*pmedw > 850) lept_stderr("oversize width: %d\n", *pmedw);
2890
0
    }
2891
0
    if (pnaw)
2892
0
        *pnaw = naw;
2893
0
    else
2894
0
        numaDestroy(&naw);
2895
0
    if (pmedh) {
2896
0
        numaGetMedian(nah, &fval);
2897
0
        *pmedh = lept_roundftoi(fval);
2898
0
        if (*pmedh > 850) lept_stderr("oversize height: %d\n", *pmedh);
2899
0
    }
2900
0
    if (pnah)
2901
0
        *pnah = nah;
2902
0
    else
2903
0
        numaDestroy(&nah);
2904
0
    return 0;
2905
0
}
2906
2907
2908
/*---------------------------------------------------------------------*
2909
 *       Find effective resolution of images rendered from a pdf       *
2910
 *---------------------------------------------------------------------*/
2911
/*!
2912
 * \brief   getPdfRendererResolution()
2913
 *
2914
 * \param[in]    infile       filename of input pdf file
2915
 * \param[in]    outdir       directory of rendered output images
2916
 * \param[out]   pres         desired resolution to use with renderer
2917
 * \return  0 if OK, 1 on error
2918
 *
2919
 * <pre>
2920
 * Notes:
2921
 *      (1) Finds the input resolution to pdftoppm that will generate
2922
 *          images with a maximum dimension of about 3300 pixels,
2923
 *          representing a full page at 300 ppi.
2924
 *      (2) It is most important is to make sure the renderer does
2925
 *          not make huge images because of an error in /MediaBox.
2926
 *          An image dimension of 11 inches corresponds to a MediaBox
2927
 *          parameter of 792.  We consider a value > 850 to be oversized
2928
 *          and not to be taken literally.  If the mediaboxes are
2929
 *          oversized, choose an appropriate lower resolution.
2930
 *      (3) If the mediaboxes are not accessible, render an image at
2931
 *          a low known resolution (say, 72 ppi) and based on the image
2932
 *          size, determine the resolution necessary to make an image
2933
 *          with 3300 pixels in the largest dimension.
2934
 *      (4) Requires pdftoppm, so this is disabled on windows for now.
2935
 *      (5) Requires the ability to call an external program, so it is
2936
 *          necessary to call setLeptDebugOK(1) before this function.
2937
 * </pre>
2938
 */
2939
l_ok
2940
getPdfRendererResolution(const char  *infile,
2941
                         const char  *outdir,
2942
                         l_int32     *pres)
2943
0
{
2944
0
char      buf[256];
2945
0
char     *tail, *basename, *fname;
2946
0
l_int32   ret, res, medw, medh, medmax, npages, pageno, w, h;
2947
0
SARRAY   *sa;
2948
2949
0
    if (!pres)
2950
0
        return ERROR_INT("&res not defined", __func__, 1);
2951
0
    *pres = 300;  /* default */
2952
2953
#ifdef _WIN32
2954
    L_INFO("Requires pdftoppm, so this is disabled on windows.\n"
2955
           "Returns default resolution 300 ppi", __func__);
2956
    return 0;
2957
#endif  /* _WIN32 */
2958
2959
0
    if (!LeptDebugOK) {
2960
0
        L_INFO("Running pdftoppm is disabled; "
2961
0
               "use setLeptDebugOK(1) to enable\n",
2962
0
               "returns default resolution 300 ppi\n", __func__);
2963
0
        return 1;
2964
0
    }
2965
2966
0
    if (!infile)
2967
0
        return ERROR_INT("infile not defined", __func__, 1);
2968
0
    if (!outdir)
2969
0
        return ERROR_INT("outdir not defined", __func__, 1);
2970
2971
0
    res = 300;  /* default value */
2972
0
    ret = getPdfMediaBoxSizes(infile, NULL, NULL, &medw, &medh);
2973
0
    if (ret == 0) {  /* Check for oversize mediaboxes */
2974
0
        lept_stderr("Media Box medians: medw = %d, medh = %d\n", medw, medh);
2975
0
        medmax = L_MAX(medw, medh);
2976
0
        if (medmax > 850) {
2977
0
            res = 300 * ((l_float32)792 / (l_float32)medmax);
2978
0
            lept_stderr(" Oversize media box; use resolution = %d\n", res);
2979
0
            *pres = res;
2980
0
        }
2981
0
        return 0;
2982
0
    }
2983
2984
        /* No mediaboxes; render one page and measure the max dimension */
2985
0
    lept_stderr("Media Box dimensions not found\n");
2986
0
    getPdfPageCount(infile, &npages);
2987
0
    pageno = (npages > 0) ? (npages + 1) / 2 : 1;
2988
0
    splitPathAtDirectory(infile, NULL, &tail);
2989
0
    splitPathAtExtension(tail, &basename, NULL);
2990
0
    snprintf(buf, sizeof(buf), "pdftoppm -f %d -l %d -r 72 %s %s/%s",
2991
0
             pageno, pageno, infile, outdir, basename);
2992
0
    LEPT_FREE(tail);
2993
0
    LEPT_FREE(basename);
2994
0
    callSystemDebug(buf);  /* pdftoppm */
2995
2996
        /* Get the page size */
2997
0
    sa = getSortedPathnamesInDirectory(outdir, NULL, 0, 0);
2998
0
    fname = sarrayGetString(sa, 0, L_NOCOPY);
2999
0
    pixReadHeader(fname, NULL, &w, &h, NULL, NULL, NULL);
3000
0
    sarrayDestroy(&sa);
3001
0
    if (w > 0 && h > 0) {
3002
0
        res = L_MIN((72 * 3300 / L_MAX(w, h)), 600);
3003
0
        *pres = res;
3004
0
        lept_stderr("Use resolution = %d\n", res);
3005
0
    } else {
3006
0
        L_ERROR("page size not found; assuming res = 300\n", __func__);
3007
0
    }
3008
3009
0
    return 0;
3010
0
}
3011
3012
3013
/*---------------------------------------------------------------------*
3014
 *                      Set flags for special modes                    *
3015
 *---------------------------------------------------------------------*/
3016
/*!
3017
 * \brief   l_pdfSetG4ImageMask()
3018
 *
3019
 * \param[in]    flag    1 for writing g4 data as fg only through a mask;
3020
 *                       0 for writing fg and bg
3021
 * \return  void
3022
 *
3023
 * <pre>
3024
 * Notes:
3025
 *      (1) The default is for writing only the fg (through the mask).
3026
 *          That way when you write a 1 bpp image, the bg is transparent,
3027
 *          so any previously written image remains visible behind it.
3028
 * </pre>
3029
 */
3030
void
3031
l_pdfSetG4ImageMask(l_int32  flag)
3032
0
{
3033
0
    var_WRITE_G4_IMAGE_MASK = flag;
3034
0
}
3035
3036
3037
/*!
3038
 * \brief   l_pdfSetDateAndVersion()
3039
 *
3040
 * \param[in]    flag    1 for writing date/time and leptonica version;
3041
 *                       0 for omitting this from the metadata
3042
 * \return  void
3043
 *
3044
 * <pre>
3045
 * Notes:
3046
 *      (1) The default is for writing this data.  For regression tests
3047
 *          that compare output against golden files, it is useful to omit.
3048
 * </pre>
3049
 */
3050
void
3051
l_pdfSetDateAndVersion(l_int32  flag)
3052
0
{
3053
0
    var_WRITE_DATE_AND_VERSION = flag;
3054
0
}
3055
3056
/* --------------------------------------------*/
3057
#endif  /* USE_PDFIO */
3058
/* --------------------------------------------*/