Coverage Report

Created: 2023-11-19 06:40

/src/leptonica/src/pdfio2.c
Line
Count
Source (jump to first uncovered line)
1
/*====================================================================*
2
 -  Copyright (C) 2001 Leptonica.  All rights reserved.
3
 -
4
 -  Redistribution and use in source and binary forms, with or without
5
 -  modification, are permitted provided that the following conditions
6
 -  are met:
7
 -  1. Redistributions of source code must retain the above copyright
8
 -     notice, this list of conditions and the following disclaimer.
9
 -  2. Redistributions in binary form must reproduce the above
10
 -     copyright notice, this list of conditions and the following
11
 -     disclaimer in the documentation and/or other materials
12
 -     provided with the distribution.
13
 -
14
 -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15
 -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16
 -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17
 -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
18
 -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19
 -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20
 -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21
 -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22
 -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23
 -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24
 -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
 *====================================================================*/
26
27
/*!
28
 * \file pdfio2.c
29
 * <pre>
30
 *
31
 *    Lower-level operations for generating pdf.
32
 *
33
 *     Intermediate function for single page, multi-image conversion
34
 *          l_int32              pixConvertToPdfData()
35
 *
36
 *     Intermediate function for generating multipage pdf output
37
 *          l_int32              ptraConcatenatePdfToData()
38
 *
39
 *     Convert tiff multipage to pdf file
40
 *          l_int32              convertTiffMultipageToPdf()
41
 *
42
 *     Generates the CID, transcoding under some conditions
43
 *          l_int32              l_generateCIDataForPdf()
44
 *          l_int32              l_generateCIData()
45
 *
46
 *       Lower-level CID generation without transcoding
47
 *          L_COMP_DATA         *l_generateFlateDataPdf()
48
 *          L_COMP_DATA         *l_generateJpegData()
49
 *          L_COMP_DATA         *l_generateJpegDataMem()
50
 *          static L_COMP_DATA  *l_generateJp2kData()
51
 *          L_COMP_DATA         *l_generateG4Data()
52
 *
53
 *       Lower-level CID generation with transcoding
54
 *          l_int32              pixGenerateCIData()
55
 *          L_COMP_DATA         *l_generateFlateData()
56
 *          static L_COMP_DATA  *pixGenerateFlateData()
57
 *          static L_COMP_DATA  *pixGenerateJpegData()
58
 *          static L_COMP_DATA  *pixGenerateJp2kData()
59
 *          static L_COMP_DATA  *pixGenerateG4Data()
60
 *
61
 *       Other CID operations
62
 *          l_int32              cidConvertToPdfData()
63
 *          void                 l_CIDataDestroy()
64
 *
65
 *     Helper functions for generating the output pdf string
66
 *          static l_int32       l_generatePdf()
67
 *          static void          generateFixedStringsPdf()
68
 *          static char         *generateEscapeString()
69
 *          static void          generateMediaboxPdf()
70
 *          static l_int32       generatePageStringPdf()
71
 *          static l_int32       generateContentStringPdf()
72
 *          static l_int32       generatePreXStringsPdf()
73
 *          static l_int32       generateColormapStringsPdf()
74
 *          static void          generateTrailerPdf()
75
 *          static char         *makeTrailerStringPdf()
76
 *          static l_int32       generateOutputDataPdf()
77
 *
78
 *     Helper functions for generating multipage pdf output
79
 *          static l_int32       parseTrailerPdf()
80
 *          static char         *generatePagesObjStringPdf()
81
 *          static L_BYTEA      *substituteObjectNumbers()
82
 *
83
 *     Create/destroy/access pdf data
84
 *          static L_PDF_DATA   *pdfdataCreate()
85
 *          static void          pdfdataDestroy()
86
 *          static L_COMP_DATA  *pdfdataGetCid()
87
 *
88
 *     Find number of pages in a pdf
89
 *          l_int32              getPdfPageCount()
90
 *
91
 *     Find widths and heights of pages and media boxes in a pdf
92
 *          l_int32              getPdfPageSizes()
93
 *          l_int32              getPdfMediaBoxSizes()
94
 *
95
 *     Find effective resolution of images rendered from a pdf
96
 *          l_int32              getPdfRendererResolution()
97
 *
98
 *     Set flags for special modes
99
 *          void                 l_pdfSetG4ImageMask()
100
 *          void                 l_pdfSetDateAndVersion()
101
 *
102
 * </pre>
103
 */
104
105
#ifdef HAVE_CONFIG_H
106
#include <config_auto.h>
107
#endif  /* HAVE_CONFIG_H */
108
109
#include <string.h>
110
#include <math.h>
111
#include "allheaders.h"
112
113
/* --------------------------------------------*/
114
#if  USE_PDFIO   /* defined in environ.h */
115
 /* --------------------------------------------*/
116
117
    /* Typical scan resolution in ppi (pixels/inch).  Used by
     * pixConvertToPdfData() as the fallback when neither the %res
     * argument nor the resolution stored with the image is positive. */
static const l_int32  DefaultInputRes = 300;

    /* Static helpers: lower-level CID generation */
static L_COMP_DATA  *l_generateJp2kData(const char *fname);
static L_COMP_DATA  *pixGenerateFlateData(PIX *pixs, l_int32 ascii85flag);
static L_COMP_DATA  *pixGenerateJpegData(PIX *pixs, l_int32 ascii85flag,
                                         l_int32 quality);
static L_COMP_DATA  *pixGenerateJp2kData(PIX *pixs, l_int32 quality);
static L_COMP_DATA  *pixGenerateG4Data(PIX *pixs, l_int32 ascii85flag);

    /* Static helpers: generating the output pdf string */
static l_int32       l_generatePdf(l_uint8 **pdata, size_t *pnbytes,
                                   L_PDF_DATA  *lpd);
static void          generateFixedStringsPdf(L_PDF_DATA *lpd);
static char         *generateEscapeString(const char  *str);
static void          generateMediaboxPdf(L_PDF_DATA *lpd);
static l_int32       generatePageStringPdf(L_PDF_DATA *lpd);
static l_int32       generateContentStringPdf(L_PDF_DATA *lpd);
static l_int32       generatePreXStringsPdf(L_PDF_DATA *lpd);
static l_int32       generateColormapStringsPdf(L_PDF_DATA *lpd);
static void          generateTrailerPdf(L_PDF_DATA *lpd);
    /* Returns an allocated string; the caller in
     * ptraConcatenatePdfToData() releases it with LEPT_FREE() */
static char         *makeTrailerStringPdf(L_DNA *daloc);
static l_int32       generateOutputDataPdf(l_uint8 **pdata, size_t *pnbytes,
                                       L_PDF_DATA *lpd);

    /* Static helpers: generating multipage pdf output */
static l_int32       parseTrailerPdf(L_BYTEA *bas, L_DNA **pda);
static char         *generatePagesObjStringPdf(NUMA *napage);
static L_BYTEA      *substituteObjectNumbers(L_BYTEA *bas, NUMA *na_objs);

    /* Static helpers: create/destroy/access pdf data */
static L_PDF_DATA   *pdfdataCreate(const char *title);
static void          pdfdataDestroy(L_PDF_DATA **plpd);
static L_COMP_DATA  *pdfdataGetCid(L_PDF_DATA *lpd, l_int32 index);


/* ---------------- Defaults for rendering options ----------------- */
    /* Output G4 as writing through image mask; this is the default.
     * NOTE(review): presumably toggled by l_pdfSetG4ImageMask(), which
     * is listed in the file header but not shown here -- confirm. */
static l_int32   var_WRITE_G4_IMAGE_MASK = 1;
    /* Write date/time and lib version into pdf; this is the default.
     * NOTE(review): presumably toggled by l_pdfSetDateAndVersion(),
     * which is listed in the file header but not shown here -- confirm. */
static l_int32   var_WRITE_DATE_AND_VERSION = 1;

    /* Sizes, in bytes, of the scratch buffers used in this file */
#define L_SMALLBUF   256
#define L_BIGBUF    2048   /* must be able to hold hex colormap */


    /* Set DEBUG_MULTIPAGE to 1 to have ptraConcatenatePdfToData() dump
     * the object mapping, the Page object numbers and the Pages object
     * to stderr.  It is left undefined when console I/O is disabled. */
#ifndef  NO_CONSOLE_IO
#define  DEBUG_MULTIPAGE      0
#endif  /* ~NO_CONSOLE_IO */
164
165
166
/*---------------------------------------------------------------------*
167
 *     Intermediate function for single page, multi-image conversion   *
168
 *---------------------------------------------------------------------*/
169
/*!
170
 * \brief   pixConvertToPdfData()
171
 *
172
 * \param[in]      pix       all depths; cmap OK
173
 * \param[in]      type      L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE,
174
 *                           L_JP2K_ENCODE
175
 * \param[in]      quality   for jpeg: 1-100; 0 for default (75)
176
 *                           for jp2k: 27-45; 0 for default (34)
177
 * \param[out]     pdata     pdf array
178
 * \param[out]     pnbytes   number of bytes in pdf array
179
 * \param[in]      x, y      location of lower-left corner of image, in pixels,
180
 *                           relative to the PostScript origin (0,0) at
181
 *                           the lower-left corner of the page)
182
 * \param[in]      res       override the resolution of the input image, in ppi;
183
 *                           use 0 to respect resolution embedded in the input
184
 * \param[in]      title     [optional] pdf title; can be null
185
 * \param[in,out]  plpd      ptr to lpd; created on the first invocation and
186
 *                           returned until last image is processed
187
 * \param[in]      position  in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
188
 *                           L_LAST_IMAGE
189
 * \return  0 if OK, 1 on error
190
 *
191
 * <pre>
192
 * Notes:
193
 *      (1) If %res == 0 and the input resolution field from the pix is 0,
194
 *          this will use DefaultInputRes.
195
 *      (2) This only writes %data if it is the last image to be
196
 *          written on the page.
197
 *      (3) See comments in convertToPdf().
198
 * </pre>
199
 */
200
l_ok
201
pixConvertToPdfData(PIX          *pix,
202
                    l_int32       type,
203
                    l_int32       quality,
204
                    l_uint8     **pdata,
205
                    size_t       *pnbytes,
206
                    l_int32       x,
207
                    l_int32       y,
208
                    l_int32       res,
209
                    const char   *title,
210
                    L_PDF_DATA  **plpd,
211
                    l_int32       position)
212
0
{
213
0
l_int32       pixres, w, h, ret;
214
0
l_float32     xpt, ypt, wpt, hpt;
215
0
L_COMP_DATA  *cid = NULL;
216
0
L_PDF_DATA   *lpd = NULL;
217
218
0
    if (!pdata)
219
0
        return ERROR_INT("&data not defined", __func__, 1);
220
0
    *pdata = NULL;
221
0
    if (!pnbytes)
222
0
        return ERROR_INT("&nbytes not defined", __func__, 1);
223
0
    *pnbytes = 0;
224
0
    if (!pix)
225
0
        return ERROR_INT("pix not defined", __func__, 1);
226
0
    if (type != L_JPEG_ENCODE && type != L_G4_ENCODE &&
227
0
        type != L_FLATE_ENCODE && type != L_JP2K_ENCODE) {
228
0
        selectDefaultPdfEncoding(pix, &type);
229
0
    }
230
0
    if (quality < 0 || quality > 100)
231
0
        return ERROR_INT("invalid quality", __func__, 1);
232
233
0
    if (plpd) {  /* part of multi-page invocation */
234
0
        if (position == L_FIRST_IMAGE)
235
0
            *plpd = NULL;
236
0
    }
237
238
        /* Generate the compressed image data.  It must NOT
239
         * be ascii85 encoded. */
240
0
    pixGenerateCIData(pix, type, quality, 0, &cid);
241
0
    if (!cid)
242
0
        return ERROR_INT("cid not made", __func__, 1);
243
244
        /* Get media box in pts.  Guess the input image resolution
245
         * based on the input parameter %res, the resolution data in
246
         * the pix, and the size of the image. */
247
0
    pixres = cid->res;
248
0
    w = cid->w;
249
0
    h = cid->h;
250
0
    if (res <= 0.0)
251
0
        res = (pixres > 0) ? pixres : DefaultInputRes;
252
0
    xpt = x * 72.f / res;
253
0
    ypt = y * 72.f / res;
254
0
    wpt = w * 72.f / res;
255
0
    hpt = h * 72.f / res;
256
257
        /* Set up lpd */
258
0
    if (!plpd) {  /* single image */
259
0
        if ((lpd = pdfdataCreate(title)) == NULL)
260
0
            return ERROR_INT("lpd not made", __func__, 1);
261
0
    } else if (position == L_FIRST_IMAGE) {  /* first of multiple images */
262
0
        if ((lpd = pdfdataCreate(title)) == NULL)
263
0
            return ERROR_INT("lpd not made", __func__, 1);
264
0
        *plpd = lpd;
265
0
    } else {  /* not the first of multiple images */
266
0
        lpd = *plpd;
267
0
    }
268
269
        /* Add the data to the lpd */
270
0
    ptraAdd(lpd->cida, cid);
271
0
    lpd->n++;
272
0
    ptaAddPt(lpd->xy, xpt, ypt);
273
0
    ptaAddPt(lpd->wh, wpt, hpt);
274
275
        /* If a single image or the last of multiple images,
276
         * generate the pdf and destroy the lpd */
277
0
    if (!plpd || (position == L_LAST_IMAGE)) {
278
0
        ret = l_generatePdf(pdata, pnbytes, lpd);
279
0
        pdfdataDestroy(&lpd);
280
0
        if (plpd) *plpd = NULL;
281
0
        if (ret)
282
0
            return ERROR_INT("pdf output not made", __func__, 1);
283
0
    }
284
285
0
    return 0;
286
0
}
287
288
289
/*---------------------------------------------------------------------*
290
 *      Intermediate function for generating multipage pdf output      *
291
 *---------------------------------------------------------------------*/
292
/*!
293
 * \brief   ptraConcatenatePdfToData()
294
 *
295
 * \param[in]    pa_data    ptra array of pdf strings, each for a
296
 *                          single-page pdf file
297
 * \param[in]    sa         [optional] string array of pathnames for
298
 *                          input pdf files; can be null
299
 * \param[out]   pdata      concatenated pdf data in memory
300
 * \param[out]   pnbytes    number of bytes in pdf data
301
 * \return  0 if OK, 1 on error
302
 *
303
 * <pre>
304
 * Notes:
305
 *      (1) This only works with leptonica-formatted single-page pdf files.
306
 *          pdf files generated by other programs will have unpredictable
307
 *          (and usually bad) results.  The requirements for each pdf file:
308
 *            (a) The Catalog and Info objects are the first two.
309
 *            (b) Object 3 is Pages
310
 *            (c) Object 4 is Page
311
 *            (d) The remaining objects are Contents, XObjects, and ColorSpace
312
 *      (2) We remove trailers from each page, and append the full trailer
313
 *          for all pages at the end.
314
 *      (3) For all but the first file, remove the ID and the first 3
315
 *          objects (catalog, info, pages), so that each subsequent
316
 *          file has only objects of these classes:
317
 *              Page, Contents, XObject, ColorSpace (Indexed RGB).
318
 *          For those objects, we substitute these refs to objects
319
 *          in the local file:
320
 *              Page:  Parent(object 3), Contents, XObject(typically multiple)
321
 *              XObject:  [ColorSpace if indexed]
322
 *          The Pages object on the first page (object 3) has a Kids array
323
 *          of references to all the Page objects, with a Count equal
324
 *          to the number of pages.  Each Page object refers back to
325
 *          this parent.
326
 * </pre>
327
 */
328
l_ok
329
ptraConcatenatePdfToData(L_PTRA    *pa_data,
330
                         SARRAY    *sa,
331
                         l_uint8  **pdata,
332
                         size_t    *pnbytes)
333
0
{
334
0
char     *fname, *str_pages, *str_trailer;
335
0
l_uint8  *pdfdata, *data;
336
0
l_int32   i, j, index, nobj, npages;
337
0
l_int32  *sizes, *locs;
338
0
size_t    size;
339
0
L_BYTEA  *bas, *bad, *bat1, *bat2;
340
0
L_DNA    *da_locs, *da_sizes, *da_outlocs, *da;
341
0
L_DNAA   *daa_locs;  /* object locations on each page */
342
0
NUMA     *na_objs, *napage;
343
0
NUMAA    *naa_objs;  /* object mapping numbers to new values */
344
345
0
    if (!pdata)
346
0
        return ERROR_INT("&data not defined", __func__, 1);
347
0
    *pdata = NULL;
348
0
    if (!pnbytes)
349
0
        return ERROR_INT("&nbytes not defined", __func__, 1);
350
0
    *pnbytes = 0;
351
0
    if (!pa_data)
352
0
        return ERROR_INT("pa_data not defined", __func__, 1);
353
354
        /* Parse the files and find the object locations.
355
         * Remove file data that cannot be parsed. */
356
0
    ptraGetActualCount(pa_data, &npages);
357
0
    daa_locs = l_dnaaCreate(npages);
358
0
    for (i = 0; i < npages; i++) {
359
0
        bas = (L_BYTEA *)ptraGetPtrToItem(pa_data, i);
360
0
        if (parseTrailerPdf(bas, &da_locs) != 0) {
361
0
            bas = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
362
0
            l_byteaDestroy(&bas);
363
0
            if (sa) {
364
0
                fname = sarrayGetString(sa, i, L_NOCOPY);
365
0
                L_ERROR("can't parse file %s; skipping\n", __func__, fname);
366
0
            } else {
367
0
                L_ERROR("can't parse file %d; skipping\n", __func__, i);
368
0
            }
369
0
        } else {
370
0
            l_dnaaAddDna(daa_locs, da_locs, L_INSERT);
371
0
        }
372
0
    }
373
374
        /* Recompute npages in case some of the files were not pdf */
375
0
    ptraCompactArray(pa_data);
376
0
    ptraGetActualCount(pa_data, &npages);
377
0
    if (npages == 0) {
378
0
        l_dnaaDestroy(&daa_locs);
379
0
        return ERROR_INT("no parsable pdf files found", __func__, 1);
380
0
    }
381
382
        /* Find the mapping from initial to final object numbers */
383
0
    naa_objs = numaaCreate(npages);  /* stores final object numbers */
384
0
    napage = numaCreate(npages);  /* stores "Page" object numbers */
385
0
    index = 0;
386
0
    for (i = 0; i < npages; i++) {
387
0
        da = l_dnaaGetDna(daa_locs, i, L_CLONE);
388
0
        nobj = l_dnaGetCount(da);
389
0
        if (i == 0) {
390
0
            numaAddNumber(napage, 4);  /* object 4 on first page */
391
0
            na_objs = numaMakeSequence(0.0, 1.0, nobj - 1);
392
0
            index = nobj - 1;
393
0
        } else {  /* skip the first 3 objects in each file */
394
0
            numaAddNumber(napage, index);  /* Page object is first we add */
395
0
            na_objs = numaMakeConstant(0.0, nobj - 1);
396
0
            numaReplaceNumber(na_objs, 3, 3);  /* refers to parent of all */
397
0
            for (j = 4; j < nobj - 1; j++)
398
0
                numaSetValue(na_objs, j, index++);
399
0
        }
400
0
        numaaAddNuma(naa_objs, na_objs, L_INSERT);
401
0
        l_dnaDestroy(&da);
402
0
    }
403
404
        /* Make the Pages object (#3) */
405
0
    str_pages = generatePagesObjStringPdf(napage);
406
407
        /* Build the output */
408
0
    bad = l_byteaCreate(5000);
409
0
    da_outlocs = l_dnaCreate(0);  /* locations of all output objects */
410
0
    for (i = 0; i < npages; i++) {
411
0
        bas = (L_BYTEA *)ptraGetPtrToItem(pa_data, i);
412
0
        pdfdata = l_byteaGetData(bas, &size);
413
0
        da_locs = l_dnaaGetDna(daa_locs, i, L_CLONE);  /* locs on this page */
414
0
        na_objs = numaaGetNuma(naa_objs, i, L_CLONE);  /* obj # on this page */
415
0
        nobj = l_dnaGetCount(da_locs) - 1;
416
0
        da_sizes = l_dnaDiffAdjValues(da_locs);  /* object sizes on this page */
417
0
        sizes = l_dnaGetIArray(da_sizes);
418
0
        locs = l_dnaGetIArray(da_locs);
419
0
        if (i == 0) {
420
0
            l_byteaAppendData(bad, pdfdata, sizes[0]);
421
0
            l_byteaAppendData(bad, pdfdata + locs[1], sizes[1]);
422
0
            l_byteaAppendData(bad, pdfdata + locs[2], sizes[2]);
423
0
            l_byteaAppendString(bad, str_pages);
424
0
            for (j = 0; j < 4; j++)
425
0
                l_dnaAddNumber(da_outlocs, locs[j]);
426
0
        }
427
0
        for (j = 4; j < nobj; j++) {
428
0
            l_dnaAddNumber(da_outlocs, l_byteaGetSize(bad));
429
0
            bat1 = l_byteaInitFromMem(pdfdata + locs[j], sizes[j]);
430
0
            bat2 = substituteObjectNumbers(bat1, na_objs);
431
0
            data = l_byteaGetData(bat2, &size);
432
0
            l_byteaAppendData(bad, data, size);
433
0
            l_byteaDestroy(&bat1);
434
0
            l_byteaDestroy(&bat2);
435
0
        }
436
0
        if (i == npages - 1)  /* last one */
437
0
            l_dnaAddNumber(da_outlocs, l_byteaGetSize(bad));
438
0
        LEPT_FREE(sizes);
439
0
        LEPT_FREE(locs);
440
0
        l_dnaDestroy(&da_locs);
441
0
        numaDestroy(&na_objs);
442
0
        l_dnaDestroy(&da_sizes);
443
0
    }
444
445
        /* Add the trailer */
446
0
    str_trailer = makeTrailerStringPdf(da_outlocs);
447
0
    l_byteaAppendString(bad, str_trailer);
448
449
        /* Transfer the output data */
450
0
    *pdata = l_byteaCopyData(bad, pnbytes);
451
0
    l_byteaDestroy(&bad);
452
453
#if  DEBUG_MULTIPAGE
454
    lept_stderr("******** object mapper **********");
455
    numaaWriteStream(stderr, naa_objs);
456
457
    lept_stderr("******** Page object numbers ***********");
458
    numaWriteStderr(napage);
459
460
    lept_stderr("******** Pages object ***********\n");
461
    lept_stderr("%s\n", str_pages);
462
#endif  /* DEBUG_MULTIPAGE */
463
464
0
    numaDestroy(&napage);
465
0
    numaaDestroy(&naa_objs);
466
0
    l_dnaDestroy(&da_outlocs);
467
0
    l_dnaaDestroy(&daa_locs);
468
0
    LEPT_FREE(str_pages);
469
0
    LEPT_FREE(str_trailer);
470
0
    return 0;
471
0
}
472
473
474
/*---------------------------------------------------------------------*
475
 *                  Convert tiff multipage to pdf file                 *
476
 *---------------------------------------------------------------------*/
477
/*!
478
 * \brief   convertTiffMultipageToPdf()
479
 *
480
 * \param[in]    filein    (tiff)
481
 * \param[in]    fileout   (pdf)
482
 * \return  0 if OK, 1 on error
483
 *
484
 * <pre>
485
 * Notes:
486
 *      (1) A multipage tiff file can also be converted to PS, using
487
 *          convertTiffMultipageToPS()
488
 * </pre>
489
 */
490
l_ok
491
convertTiffMultipageToPdf(const char  *filein,
492
                          const char  *fileout)
493
0
{
494
0
l_int32  istiff;
495
0
PIXA    *pixa;
496
0
FILE    *fp;
497
498
0
    if ((fp = fopenReadStream(filein)) == NULL)
499
0
        return ERROR_INT_1("file not found", filein, __func__, 1);
500
0
    istiff = fileFormatIsTiff(fp);
501
0
    fclose(fp);
502
0
    if (!istiff)
503
0
        return ERROR_INT_1("file not tiff format", filein, __func__, 1);
504
505
0
    pixa = pixaReadMultipageTiff(filein);
506
0
    pixaConvertToPdf(pixa, 0, 1.0, 0, 0, "weasel2", fileout);
507
0
    pixaDestroy(&pixa);
508
0
    return 0;
509
0
}
510
511
512
/*---------------------------------------------------------------------*
513
 *                          CID-based operations                       *
514
 *---------------------------------------------------------------------*/
515
/*!
516
 * \brief   l_generateCIDataForPdf()
517
 *
518
 * \param[in]    fname      [optional] can be null
519
 * \param[in]    pix        [optional] can be null
520
 * \param[in]    quality    for jpeg if transcoded: 1-100; 0 for default (75)
521
 *                          for jp2k if transcoded: 27-45; 0 for default (34)
522
 * \param[out]   pcid       compressed data
523
 * \return  0 if OK, 1 on error
524
 *
525
 * <pre>
526
 * Notes:
527
 *      (1) You must set either filename or pix.
528
 *      (2) Given an image file and optionally a pix raster of that data,
529
 *          this provides a CID that is compatible with PDF, preferably
530
 *          without transcoding.
531
 *      (3) The pix is included for efficiency, in case transcoding
532
 *          is required and the pix is available to the caller.
533
 *      (4) We don't try to open files named "stdin" or "-" for Tesseract
534
 *          compatibility reasons. We may remove this restriction
535
 *          in the future.
536
 *      (5) Note that tiff-g4 must be transcoded to properly handle byte
537
 *          order and perhaps photometry (e.g., min-is-black).  For a
538
 *          multipage tiff file, data will only be extracted from the
539
 *          first page, so this should not be invoked.
540
 * </pre>
541
 */
542
l_ok
543
l_generateCIDataForPdf(const char    *fname,
544
                       PIX           *pix,
545
                       l_int32        quality,
546
                       L_COMP_DATA  **pcid)
547
0
{
548
0
l_int32       format, type;
549
0
L_COMP_DATA  *cid;
550
0
PIX          *pixt;
551
552
0
    if (!pcid)
553
0
        return ERROR_INT("&cid not defined", __func__, 1);
554
0
    *pcid = cid = NULL;
555
0
    if (!fname && !pix)
556
0
        return ERROR_INT("neither fname nor pix are defined", __func__, 1);
557
558
        /* If a compressed file is given that is not 'stdin', see if we
559
         * can generate the pdf output without transcoding. */
560
0
    if (fname && strcmp(fname, "-") != 0 && strcmp(fname, "stdin") != 0) {
561
0
        findFileFormat(fname, &format);
562
0
        if (format == IFF_UNKNOWN)
563
0
            L_WARNING("file %s format is unknown\n", __func__, fname);
564
0
        if (format == IFF_PS || format == IFF_LPDF) {
565
0
            L_ERROR("file %s is unsupported format %d\n",
566
0
                  __func__, fname, format);
567
0
            return 1;
568
0
        }
569
0
        if (format == IFF_JFIF_JPEG) {
570
0
            cid = l_generateJpegData(fname, 0);
571
0
        } else if (format == IFF_JP2) {
572
0
            cid = l_generateJp2kData(fname);
573
0
        } else if (format == IFF_PNG) {
574
0
            cid = l_generateFlateDataPdf(fname, pix);
575
0
        }
576
0
    }
577
578
        /* Otherwise, use the pix to generate the pdf output */
579
0
    if  (!cid) {
580
0
        if (!pix)
581
0
            pixt = pixRead(fname);
582
0
        else
583
0
            pixt = pixClone(pix);
584
0
        if (!pixt)
585
0
            return ERROR_INT("pixt not made", __func__, 1);
586
0
        if (selectDefaultPdfEncoding(pixt, &type)) {
587
0
            pixDestroy(&pixt);
588
0
            return 1;
589
0
        }
590
0
        pixGenerateCIData(pixt, type, quality, 0, &cid);
591
0
        pixDestroy(&pixt);
592
0
        if (!cid)
593
0
            return ERROR_INT("cid not made from pix", __func__, 1);
594
0
    }
595
0
    *pcid = cid;
596
0
    return 0;
597
0
}
598
599
600
/*!
601
 * \brief   l_generateCIData()
602
 *
603
 * \param[in]    fname
604
 * \param[in]    type       L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE,
605
 *                          L_JP2K_ENCODE
606
 * \param[in]    quality    for jpeg if transcoded: 1-100; 0 for default (75)
607
 *                          for jp2k if transcoded: 27-45; 0 for default (34)
608
 * \param[in]    ascii85    0 for binary; 1 for ascii85-encoded
609
 * \param[out]   pcid       compressed data
610
 * \return  0 if OK, 1 on error
611
 *
612
 * <pre>
613
 * Notes:
614
 *      (1) This can be used for both PostScript and pdf.
615
 *      (1) Set ascii85:
616
 *           ~ 0 for binary data (PDF only)
617
 *           ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
618
 *      (2) This attempts to compress according to the requested type.
619
 *          If this can't be done, it falls back to ordinary flate encoding.
620
 *      (3) This differs from l_generateCIDataForPdf(), which determines
621
 *          the file format and only works for pdf.
622
 * </pre>
623
 */
624
l_ok
625
l_generateCIData(const char    *fname,
626
                 l_int32        type,
627
                 l_int32        quality,
628
                 l_int32        ascii85,
629
                 L_COMP_DATA  **pcid)
630
0
{
631
0
l_int32       format, d, bps, spp, iscmap;
632
0
L_COMP_DATA  *cid;
633
0
PIX          *pix;
634
635
0
    if (!pcid)
636
0
        return ERROR_INT("&cid not defined", __func__, 1);
637
0
    *pcid = NULL;
638
0
    if (!fname)
639
0
        return ERROR_INT("fname not defined", __func__, 1);
640
0
    if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
641
0
        type != L_FLATE_ENCODE && type != L_JP2K_ENCODE)
642
0
        return ERROR_INT("invalid conversion type", __func__, 1);
643
0
    if (ascii85 != 0 && ascii85 != 1)
644
0
        return ERROR_INT("invalid ascii85", __func__, 1);
645
646
        /* Sanity check on requested encoding */
647
0
    pixReadHeader(fname, &format, NULL, NULL, &bps, &spp, &iscmap);
648
0
    d = bps * spp;
649
0
    if (d == 24) d = 32;
650
0
    if (iscmap && type != L_FLATE_ENCODE) {
651
0
        L_WARNING("pixs has cmap; using flate encoding\n", __func__);
652
0
        type = L_FLATE_ENCODE;
653
0
    } else if (d < 8 && type == L_JPEG_ENCODE) {
654
0
        L_WARNING("pixs has < 8 bpp; using flate encoding\n", __func__);
655
0
        type = L_FLATE_ENCODE;
656
0
    } else if (d < 8 && type == L_JP2K_ENCODE) {
657
0
        L_WARNING("pixs has < 8 bpp; using flate encoding\n", __func__);
658
0
        type = L_FLATE_ENCODE;
659
0
    } else if (d > 1 && type == L_G4_ENCODE) {
660
0
        L_WARNING("pixs has > 1 bpp; using flate encoding\n", __func__);
661
0
        type = L_FLATE_ENCODE;
662
0
    }
663
664
0
    if (type == L_JPEG_ENCODE) {
665
0
        if (format == IFF_JFIF_JPEG) {  /* do not transcode */
666
0
            cid = l_generateJpegData(fname, ascii85);
667
0
        } else {
668
0
            if ((pix = pixRead(fname)) == NULL)
669
0
                return ERROR_INT("pix not returned for JPEG", __func__, 1);
670
0
            cid = pixGenerateJpegData(pix, ascii85, quality);
671
0
            pixDestroy(&pix);
672
0
        }
673
0
        if (!cid)
674
0
            return ERROR_INT("jpeg data not made", __func__, 1);
675
0
    } else if (type == L_JP2K_ENCODE) {
676
0
        if (format == IFF_JP2) {  /* do not transcode */
677
0
            cid = l_generateJp2kData(fname);
678
0
        } else {
679
0
            if ((pix = pixRead(fname)) == NULL)
680
0
                return ERROR_INT("pix not returned for JP2K", __func__, 1);
681
0
            cid = pixGenerateJp2kData(pix, quality);
682
0
            pixDestroy(&pix);
683
0
        }
684
0
        if (!cid)
685
0
            return ERROR_INT("jp2k data not made", __func__, 1);
686
0
    } else if (type == L_G4_ENCODE) {
687
0
        if ((pix = pixRead(fname)) == NULL)
688
0
            return ERROR_INT("pix not returned for G4", __func__, 1);
689
0
        cid = pixGenerateG4Data(pix, ascii85);
690
0
        pixDestroy(&pix);
691
0
        if (!cid)
692
0
            return ERROR_INT("g4 data not made", __func__, 1);
693
0
    } else if (type == L_FLATE_ENCODE) {
694
0
        if ((cid = l_generateFlateData(fname, ascii85)) == NULL)
695
0
            return ERROR_INT("flate data not made", __func__, 1);
696
0
    } else {
697
0
        return ERROR_INT("invalid conversion type", __func__, 1);
698
0
    }
699
0
    *pcid = cid;
700
701
0
    return 0;
702
0
}
703
704
705
/*---------------------------------------------------------------------*
706
 *                     Low-level CID-based operations                  *
707
 *---------------------------------------------------------------------*/
708
/*!
709
 * \brief   l_generateFlateDataPdf()
710
 *
711
 * \param[in]    fname     preferably png
712
 * \param[in]    pixs      [optional] can be null
713
 * \return  cid containing png data, or NULL on error
714
 *
715
 * <pre>
716
 * Notes:
717
 *      (1) If you hand this a png file, you are going to get
718
 *          png predictors embedded in the flate data. So it has
719
 *          come to this. http://xkcd.com/1022/
720
 *      (2) Exception: if the png is interlaced or if it is RGBA,
721
 *          it will be transcoded.
722
 *      (3) If transcoding is required, this will not have to read from
723
 *          file if a pix is input.
724
 * </pre>
725
 */
726
L_COMP_DATA *
727
l_generateFlateDataPdf(const char  *fname,
728
                       PIX         *pixs)
729
0
{
730
0
l_uint8      *pngcomp = NULL;  /* entire PNG compressed file */
731
0
l_uint8      *datacomp = NULL;  /* gzipped raster data */
732
0
l_uint8      *cmapdata = NULL;  /* uncompressed colormap */
733
0
char         *cmapdatahex = NULL;  /* hex ascii uncompressed colormap */
734
0
l_uint32      i, j, n;
735
0
l_int32       format, interlaced;
736
0
l_int32       ncolors;  /* in colormap */
737
0
l_int32       bps;  /* bits/sample: usually 8 */
738
0
l_int32       spp;  /* samples/pixel: 1-grayscale/cmap); 3-rgb; 4-rgba */
739
0
l_int32       w, h, cmapflag;
740
0
l_int32       xres, yres;
741
0
size_t        nbytescomp = 0, nbytespng = 0;
742
0
FILE         *fp;
743
0
L_COMP_DATA  *cid;
744
0
PIX          *pix;
745
0
PIXCMAP      *cmap = NULL;
746
747
0
    if (!fname)
748
0
        return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL);
749
750
0
    findFileFormat(fname, &format);
751
0
    spp = 0;  /* init to spp != 4 if not png */
752
0
    interlaced = 0;  /* initialize to no interlacing */
753
0
    bps = 0;  /* initialize to a nonsense value */
754
0
    if (format == IFF_PNG) {
755
0
        isPngInterlaced(fname, &interlaced);
756
0
        if (readHeaderPng(fname, NULL, NULL, &bps, &spp, NULL))
757
0
            return (L_COMP_DATA *)ERROR_PTR("bad png input", __func__, NULL);
758
0
    }
759
760
        /* PDF is capable of inlining some types of PNG files, but not all
761
           of them. We need to transcode anything with interlacing, an
762
           alpha channel, or 1 bpp (which would otherwise be photo-inverted).
763
764
           Note: any PNG image file with an alpha channel is converted on
765
           reading to RGBA (spp == 4). This includes the (gray + alpha) format
766
           with spp == 2.  Because of the conversion, readHeaderPng() gives
767
           spp = 2, whereas pixGetSpp() gives spp = 4 on the converted pix. */
768
0
    if (format != IFF_PNG ||
769
0
       (format == IFF_PNG && (interlaced || bps == 1 || spp == 4 || spp == 2)))
770
0
    {  /* lgtm+ analyzer needed the logic expanded */
771
0
        if (!pixs)
772
0
            pix = pixRead(fname);
773
0
        else
774
0
            pix = pixClone(pixs);
775
0
        if (!pix)
776
0
            return (L_COMP_DATA *)ERROR_PTR("pix not made", __func__, NULL);
777
0
        cid = pixGenerateFlateData(pix, 0);
778
0
        pixDestroy(&pix);
779
0
        return cid;
780
0
    }
781
782
        /* It's png.  Generate the pdf data without transcoding.
783
         * Implementation by Jeff Breidenbach.
784
         * First, read the metadata */
785
0
    if ((fp = fopenReadStream(fname)) == NULL)
786
0
        return (L_COMP_DATA *)ERROR_PTR_1("stream not opened",
787
0
                                          fname, __func__, NULL);
788
0
    freadHeaderPng(fp, &w, &h, &bps, &spp, &cmapflag);
789
0
    fgetPngResolution(fp, &xres, &yres);
790
0
    fclose(fp);
791
792
        /* We get pdf corruption when inlining the data from 16 bpp png. */
793
0
    if (bps == 16)
794
0
        return l_generateFlateData(fname, 0);
795
796
        /* Read the entire png file */
797
0
    if ((pngcomp = l_binaryRead(fname, &nbytespng)) == NULL)
798
0
        return (L_COMP_DATA *)ERROR_PTR_1("unable to read file",
799
0
                                          fname, __func__, NULL);
800
801
        /* Extract flate data, copying portions of it to memory, including
802
         * the predictor information in a byte at the beginning of each
803
         * raster line.  The flate data makes up the vast majority of
804
         * the png file, so after extraction we expect datacomp to
805
         * be nearly full (i.e., nbytescomp will be only slightly less
806
         * than nbytespng).  Also extract the colormap if present. */
807
0
    if ((datacomp = (l_uint8 *)LEPT_CALLOC(1, nbytespng)) == NULL) {
808
0
        LEPT_FREE(pngcomp);
809
0
        return (L_COMP_DATA *)ERROR_PTR("unable to allocate memory",
810
0
                                        __func__, NULL);
811
0
    }
812
813
        /* Parse the png file.  Each chunk consists of:
814
         *    length: 4 bytes
815
         *    name:   4 bytes (e.g., "IDAT")
816
         *    data:   n bytes
817
         *    CRC:    4 bytes
818
         * Start at the beginning of the data section of the first chunk,
819
         * byte 16, because the png file begins with 8 bytes of header,
820
         * followed by the first 8 bytes of the first chunk
821
         * (length and name).  On each loop, increment by 12 bytes to
822
         * skip over the CRC, length and name of the next chunk. */
823
0
    for (i = 16; i < nbytespng; i += 12) {  /* do each successive chunk */
824
            /* Get the chunk length */
825
0
        n  = pngcomp[i - 8] << 24;
826
0
        n += pngcomp[i - 7] << 16;
827
0
        n += pngcomp[i - 6] << 8;
828
0
        n += pngcomp[i - 5] << 0;
829
0
        if (n >= nbytespng - i) {  /* "n + i" can overflow */
830
0
            LEPT_FREE(pngcomp);
831
0
            LEPT_FREE(datacomp);
832
0
            pixcmapDestroy(&cmap);
833
0
            L_ERROR("invalid png: i = %d, n = %d, nbytes = %zu\n", __func__,
834
0
                    i, n, nbytespng);
835
0
            return NULL;
836
0
        }
837
838
            /* Is it a data chunk? */
839
0
        if (memcmp(pngcomp + i - 4, "IDAT", 4) == 0) {
840
0
            memcpy(datacomp + nbytescomp, pngcomp + i, n);
841
0
            nbytescomp += n;
842
0
        }
843
844
            /* Is it a palette chunk? */
845
0
        if (cmapflag && !cmap &&
846
0
            memcmp(pngcomp + i - 4, "PLTE", 4) == 0) {
847
0
            if ((n / 3) > (1 << bps)) {
848
0
                LEPT_FREE(pngcomp);
849
0
                LEPT_FREE(datacomp);
850
0
                pixcmapDestroy(&cmap);
851
0
                L_ERROR("invalid png: i = %d, n = %d, cmapsize = %d\n",
852
0
                        __func__, i, n, (1 << bps));
853
0
                return NULL;
854
0
            }
855
0
            cmap = pixcmapCreate(bps);
856
0
            for (j = i; j < i + n; j += 3) {
857
0
                pixcmapAddColor(cmap, pngcomp[j], pngcomp[j + 1],
858
0
                                pngcomp[j + 2]);
859
0
            }
860
0
        }
861
0
        i += n;  /* move to the end of the data chunk */
862
0
    }
863
0
    LEPT_FREE(pngcomp);
864
865
0
    if (nbytescomp == 0) {
866
0
        LEPT_FREE(datacomp);
867
0
        pixcmapDestroy(&cmap);
868
0
        return (L_COMP_DATA *)ERROR_PTR("invalid PNG file", __func__, NULL);
869
0
    }
870
871
        /* Extract and encode the colormap data as hexascii  */
872
0
    ncolors = 0;
873
0
    if (cmap) {
874
0
        pixcmapSerializeToMemory(cmap, 3, &ncolors, &cmapdata);
875
0
        pixcmapDestroy(&cmap);
876
0
        if (!cmapdata) {
877
0
            LEPT_FREE(datacomp);
878
0
            return (L_COMP_DATA *)ERROR_PTR("cmapdata not made",
879
0
                                            __func__, NULL);
880
0
        }
881
0
        cmapdatahex = pixcmapConvertToHex(cmapdata, ncolors);
882
0
        LEPT_FREE(cmapdata);
883
0
    }
884
885
        /* Note that this is the only situation where the predictor
886
         * field of the CID is set to 1.  Adobe's predictor values on
887
         * p. 76 of pdf_reference_1-7.pdf give 1 for no predictor and
888
         * 10-14 for inline predictors, the specifics of which are
889
         * ignored by the pdf interpreter, which just needs to know that
890
         * the first byte on each compressed scanline is some predictor
891
         * whose type can be inferred from the byte itself.  */
892
0
    cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA));
893
0
    cid->datacomp = datacomp;
894
0
    cid->type = L_FLATE_ENCODE;
895
0
    cid->cmapdatahex = cmapdatahex;
896
0
    cid->nbytescomp = nbytescomp;
897
0
    cid->ncolors = ncolors;
898
0
    cid->predictor = TRUE;
899
0
    cid->w = w;
900
0
    cid->h = h;
901
0
    cid->bps = bps;
902
0
    cid->spp = spp;
903
0
    cid->res = xres;
904
0
    return cid;
905
0
}
906
907
908
/*!
909
 * \brief   l_generateJpegData()
910
 *
911
 * \param[in]    fname           of jpeg file
912
 * \param[in]    ascii85flag     0 for jpeg; 1 for ascii85-encoded jpeg
913
 * \return  cid containing jpeg data, or NULL on error
914
 *
915
 * <pre>
916
 * Notes:
917
 *      (1) Set ascii85flag:
918
 *           ~ 0 for binary data (PDF only)
919
 *           ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
920
 *      (2) Most of this function is repeated in l_generateJpegMemData(),
921
 *          which is required in pixacompFastConvertToPdfData().
922
 * </pre>
923
 */
924
L_COMP_DATA *
925
l_generateJpegData(const char  *fname,
926
                   l_int32      ascii85flag)
927
0
{
928
0
char         *data85 = NULL;  /* ascii85 encoded jpeg compressed file */
929
0
l_uint8      *data = NULL;
930
0
l_int32       w, h, xres, yres, bps, spp;
931
0
size_t        nbytes, nbytes85;
932
0
L_COMP_DATA  *cid;
933
0
FILE         *fp;
934
935
0
    if (!fname)
936
0
        return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL);
937
938
0
    if (ascii85flag != 0 && ascii85flag != 1)
939
0
        return (L_COMP_DATA *)ERROR_PTR("wrong ascii85flags", __func__, NULL);
940
941
        /* Read the metadata */
942
0
    if (readHeaderJpeg(fname, &w, &h, &spp, NULL, NULL))
943
0
        return (L_COMP_DATA *)ERROR_PTR("bad jpeg metadata", __func__, NULL);
944
0
    bps = 8;
945
0
    if ((fp = fopenReadStream(fname)) == NULL)
946
0
        return (L_COMP_DATA *)ERROR_PTR_1("stream not opened",
947
0
                                          fname, __func__, NULL);
948
0
    fgetJpegResolution(fp, &xres, &yres);
949
0
    fclose(fp);
950
951
        /* Read the entire jpeg file.  The returned jpeg data in memory
952
         * starts with ffd8 and ends with ffd9 */
953
0
    if ((data = l_binaryRead(fname, &nbytes)) == NULL)
954
0
        return (L_COMP_DATA *)ERROR_PTR_1("data not extracted",
955
0
                                          fname, __func__, NULL);
956
957
        /* Optionally, encode the compressed data */
958
0
    if (ascii85flag == 1) {
959
0
        data85 = encodeAscii85(data, nbytes, &nbytes85);
960
0
        LEPT_FREE(data);
961
0
        if (!data85)
962
0
            return (L_COMP_DATA *)ERROR_PTR("data85 not made", __func__, NULL);
963
0
        else
964
0
            data85[nbytes85 - 1] = '\0';  /* remove the newline */
965
0
    }
966
967
0
    cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA));
968
0
    if (ascii85flag == 0) {
969
0
        cid->datacomp = data;
970
0
    } else {  /* ascii85 */
971
0
        cid->data85 = data85;
972
0
        cid->nbytes85 = nbytes85;
973
0
    }
974
0
    cid->type = L_JPEG_ENCODE;
975
0
    cid->nbytescomp = nbytes;
976
0
    cid->w = w;
977
0
    cid->h = h;
978
0
    cid->bps = bps;
979
0
    cid->spp = spp;
980
0
    cid->res = xres;
981
0
    return cid;
982
0
}
983
984
985
/*!
986
 * \brief   l_generateJpegDataMem()
987
 *
988
 * \param[in]    data           of jpeg-encoded file
989
 * \param[in]    nbytes         size of jpeg-encoded file
990
 * \param[in]    ascii85flag    0 for jpeg; 1 for ascii85-encoded jpeg
991
 * \return  cid containing jpeg data, or NULL on error
992
 *
993
 * <pre>
994
 * Notes:
995
 *      (1) Set ascii85flag:
996
 *           ~ 0 for binary data (PDF only)
997
 *           ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
998
 * </pre>
999
 */
1000
L_COMP_DATA *
1001
l_generateJpegDataMem(l_uint8  *data,
1002
                      size_t    nbytes,
1003
                      l_int32   ascii85flag)
1004
0
{
1005
0
char         *data85 = NULL;  /* ascii85 encoded jpeg compressed file */
1006
0
l_int32       w, h, xres, yres, bps, spp;
1007
0
size_t        nbytes85;
1008
0
L_COMP_DATA  *cid;
1009
1010
0
    if (!data)
1011
0
        return (L_COMP_DATA *)ERROR_PTR("data not defined", __func__, NULL);
1012
1013
        /* Read the metadata */
1014
0
    if (readHeaderMemJpeg(data, nbytes, &w, &h, &spp, NULL, NULL)) {
1015
0
        LEPT_FREE(data);
1016
0
        return (L_COMP_DATA *)ERROR_PTR("bad jpeg metadata", __func__, NULL);
1017
0
    }
1018
0
    bps = 8;
1019
0
    readResolutionMemJpeg(data, nbytes, &xres, &yres);
1020
1021
        /* Optionally, encode the compressed data */
1022
0
    if (ascii85flag == 1) {
1023
0
        data85 = encodeAscii85(data, nbytes, &nbytes85);
1024
0
        LEPT_FREE(data);
1025
0
        if (!data85)
1026
0
            return (L_COMP_DATA *)ERROR_PTR("data85 not made", __func__, NULL);
1027
0
        else
1028
0
            data85[nbytes85 - 1] = '\0';  /* remove the newline */
1029
0
    }
1030
1031
0
    cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA));
1032
0
    if (ascii85flag == 0) {
1033
0
        cid->datacomp = data;
1034
0
    } else {  /* ascii85 */
1035
0
        cid->data85 = data85;
1036
0
        cid->nbytes85 = nbytes85;
1037
0
    }
1038
0
    cid->type = L_JPEG_ENCODE;
1039
0
    cid->nbytescomp = nbytes;
1040
0
    cid->w = w;
1041
0
    cid->h = h;
1042
0
    cid->bps = bps;
1043
0
    cid->spp = spp;
1044
0
    cid->res = xres;
1045
0
    return cid;
1046
0
}
1047
1048
1049
/*!
1050
 * \brief   l_generateJp2kData()
1051
 *
1052
 * \param[in]    fname     of jp2k file
1053
 * \return  cid containing jp2k data, or NULL on error
1054
 *
1055
 * <pre>
1056
 * Notes:
1057
 *      (1) This is only called after the file is verified to be jp2k.
1058
 * </pre>
1059
 */
1060
static L_COMP_DATA *
1061
l_generateJp2kData(const char  *fname)
1062
0
{
1063
0
l_int32       w, h, bps, spp, xres, yres;
1064
0
size_t        nbytes;
1065
0
L_COMP_DATA  *cid;
1066
0
FILE         *fp;
1067
1068
0
    if (!fname)
1069
0
        return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL);
1070
1071
0
    if (readHeaderJp2k(fname, &w, &h, &bps, &spp, NULL))
1072
0
        return (L_COMP_DATA *)ERROR_PTR("bad jp2k metadata", __func__, NULL);
1073
1074
        /* The returned jp2k data in memory is the entire jp2k file */
1075
0
    cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA));
1076
0
    if ((cid->datacomp = l_binaryRead(fname, &nbytes)) == NULL) {
1077
0
        l_CIDataDestroy(&cid);
1078
0
        return (L_COMP_DATA *)ERROR_PTR("data not extracted", __func__, NULL);
1079
0
    }
1080
1081
0
    xres = yres = 0;
1082
0
    if ((fp = fopenReadStream(fname)) != NULL) {
1083
0
        fgetJp2kResolution(fp, &xres, &yres);
1084
0
        fclose(fp);
1085
0
    }
1086
0
    cid->type = L_JP2K_ENCODE;
1087
0
    cid->nbytescomp = nbytes;
1088
0
    cid->w = w;
1089
0
    cid->h = h;
1090
0
    cid->bps = bps;
1091
0
    cid->spp = spp;
1092
0
    cid->res = xres;
1093
0
    return cid;
1094
0
}
1095
1096
1097
/*!
1098
 * \brief   l_generateG4Data()
1099
 *
1100
 * \param[in]    fname          of g4 compressed file
1101
 * \param[in]    ascii85flag    0 for g4 compressed; 1 for ascii85-encoded g4
1102
 * \return  cid g4 compressed image data, or NULL on error
1103
 *
1104
 * <pre>
1105
 * Notes:
1106
 *      (1) Set ascii85flag:
1107
 *           ~ 0 for binary data (PDF only)
1108
 *           ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
1109
 *      (2) This does not work for multipage tiff files.
1110
 * </pre>
1111
 */
1112
L_COMP_DATA *
1113
l_generateG4Data(const char  *fname,
1114
                 l_int32      ascii85flag)
1115
0
{
1116
0
l_uint8      *datacomp = NULL;  /* g4 compressed raster data */
1117
0
char         *data85 = NULL;  /* ascii85 encoded g4 compressed data */
1118
0
l_int32       w, h, xres, yres, npages;
1119
0
l_int32       minisblack;  /* TRUE or FALSE */
1120
0
size_t        nbytes85, nbytescomp;
1121
0
L_COMP_DATA  *cid;
1122
0
FILE         *fp;
1123
1124
0
    if (!fname)
1125
0
        return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL);
1126
1127
        /* Make sure this is a single page tiff file */
1128
0
    if ((fp = fopenReadStream(fname)) == NULL)
1129
0
        return (L_COMP_DATA *)ERROR_PTR_1("stream not opened",
1130
0
                                          fname, __func__, NULL);
1131
0
    tiffGetCount(fp, &npages);
1132
0
    fclose(fp);
1133
0
    if (npages != 1) {
1134
0
        L_ERROR(" %d page tiff; only works with 1 page (file: %s)\n", __func__, npages, fname);
1135
0
        return NULL;
1136
0
    }
1137
1138
        /* Read the resolution */
1139
0
    if ((fp = fopenReadStream(fname)) == NULL)
1140
0
        return (L_COMP_DATA *)ERROR_PTR_1("stream not opened",
1141
0
                                          fname, __func__, NULL);
1142
0
    getTiffResolution(fp, &xres, &yres);
1143
0
    fclose(fp);
1144
1145
        /* The returned ccitt g4 data in memory is the block of
1146
         * bytes in the tiff file, starting after 8 bytes and
1147
         * ending before the directory. */
1148
0
    if (extractG4DataFromFile(fname, &datacomp, &nbytescomp,
1149
0
                              &w, &h, &minisblack)) {
1150
0
        return (L_COMP_DATA *)ERROR_PTR_1("datacomp not extracted",
1151
0
                                          fname, __func__, NULL);
1152
0
    }
1153
1154
        /* Optionally, encode the compressed data */
1155
0
    if (ascii85flag == 1) {
1156
0
        data85 = encodeAscii85(datacomp, nbytescomp, &nbytes85);
1157
0
        LEPT_FREE(datacomp);
1158
0
        if (!data85)
1159
0
            return (L_COMP_DATA *)ERROR_PTR("data85 not made", __func__, NULL);
1160
0
        else
1161
0
            data85[nbytes85 - 1] = '\0';  /* remove the newline */
1162
0
    }
1163
1164
0
    cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA));
1165
0
    if (ascii85flag == 0) {
1166
0
        cid->datacomp = datacomp;
1167
0
    } else {  /* ascii85 */
1168
0
        cid->data85 = data85;
1169
0
        cid->nbytes85 = nbytes85;
1170
0
    }
1171
0
    cid->type = L_G4_ENCODE;
1172
0
    cid->nbytescomp = nbytescomp;
1173
0
    cid->w = w;
1174
0
    cid->h = h;
1175
0
    cid->bps = 1;
1176
0
    cid->spp = 1;
1177
0
    cid->minisblack = minisblack;
1178
0
    cid->res = xres;
1179
0
    return cid;
1180
0
}
1181
1182
1183
/*!
1184
 * \brief   pixGenerateCIData()
1185
 *
1186
 * \param[in]    pixs       8 or 32 bpp, no colormap
1187
 * \param[in]    type       L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE or
1188
 *                          L_JP2K_ENCODE
1189
 * \param[in]    quality    for jpeg if transcoded: 1-100; 0 for default (75)
1190
 *                          for jp2k if transcoded: 27-45; 0 for default (34)
1191
 * \param[in]    ascii85    0 for binary; 1 for ascii85-encoded
1192
 * \param[out]   pcid       compressed data
1193
 * \return  0 if OK, 1 on error
1194
 *
1195
 * <pre>
1196
 * Notes:
1197
 *      (1) Set ascii85:
1198
 *           ~ 0 for binary data (PDF only)
1199
 *           ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
1200
 *      (2) Do not accept images with an asperity ratio greater than 10.
1201
 * </pre>
1202
 */
1203
l_ok
1204
pixGenerateCIData(PIX           *pixs,
1205
                  l_int32        type,
1206
                  l_int32        quality,
1207
                  l_int32        ascii85,
1208
                  L_COMP_DATA  **pcid)
1209
0
{
1210
0
l_int32   w, h, d, maxAsp;
1211
0
PIXCMAP  *cmap;
1212
1213
0
    if (!pcid)
1214
0
        return ERROR_INT("&cid not defined", __func__, 1);
1215
0
    *pcid = NULL;
1216
0
    if (!pixs)
1217
0
        return ERROR_INT("pixs not defined", __func__, 1);
1218
0
    if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
1219
0
        type != L_FLATE_ENCODE && type != L_JP2K_ENCODE) {
1220
0
        selectDefaultPdfEncoding(pixs, &type);
1221
0
    }
1222
0
    if (ascii85 != 0 && ascii85 != 1)
1223
0
        return ERROR_INT("invalid ascii85", __func__, 1);
1224
0
    pixGetDimensions(pixs, &w, &h, NULL);
1225
0
    if (w == 0 || h == 0)
1226
0
        return ERROR_INT("invalid w or h", __func__, 1);
1227
0
    maxAsp = L_MAX(w / h, h / w);
1228
0
    if (maxAsp > 10)
1229
0
        return ERROR_INT("max asperity > 10", __func__, 1);
1230
1231
        /* Conditionally modify the encoding type if libz is
1232
         * available and the requested library is missing. */
1233
0
#if defined(HAVE_LIBZ)
1234
# if !defined(HAVE_LIBJPEG)
1235
    if (type == L_JPEG_ENCODE) {
1236
        L_WARNING("no libjpeg; using flate encoding\n", __func__);
1237
        type = L_FLATE_ENCODE;
1238
    }
1239
# endif /* !defined(HAVE_LIBJPEG) */
1240
0
# if !defined(HAVE_LIBJP2K)
1241
0
    if (type == L_JP2K_ENCODE) {
1242
0
        L_WARNING("no libjp2k; using flate encoding\n", __func__);
1243
0
        type = L_FLATE_ENCODE;
1244
0
    }
1245
0
# endif /* !defined(HAVE_LIBJP2K) */
1246
# if !defined(HAVE_LIBTIFF)
1247
    if (type == L_G4_ENCODE) {
1248
        L_WARNING("no libtiff; using flate encoding\n", __func__);
1249
        type = L_FLATE_ENCODE;
1250
    }
1251
# endif /* !defined(HAVE_LIBTIFF) */
1252
0
#endif /* defined(HAVE_LIBZ) */
1253
1254
        /* Sanity check on requested encoding */
1255
0
    d = pixGetDepth(pixs);
1256
0
    cmap = pixGetColormap(pixs);
1257
0
    if (cmap && type != L_FLATE_ENCODE) {
1258
0
        L_WARNING("pixs has cmap; using flate encoding\n", __func__);
1259
0
        type = L_FLATE_ENCODE;
1260
0
    } else if (d < 8 && (type == L_JPEG_ENCODE || type == L_JP2K_ENCODE)) {
1261
0
        L_WARNING("pixs has < 8 bpp; using flate encoding\n", __func__);
1262
0
        type = L_FLATE_ENCODE;
1263
0
    } else if (d > 1 && type == L_G4_ENCODE) {
1264
0
        L_WARNING("pixs has > 1 bpp; using flate encoding\n", __func__);
1265
0
        type = L_FLATE_ENCODE;
1266
0
    }
1267
1268
0
    if (type == L_JPEG_ENCODE) {
1269
0
        if ((*pcid = pixGenerateJpegData(pixs, ascii85, quality)) == NULL)
1270
0
            return ERROR_INT("jpeg data not made", __func__, 1);
1271
0
    } else if (type == L_JP2K_ENCODE) {
1272
0
        if ((*pcid = pixGenerateJp2kData(pixs, quality)) == NULL)
1273
0
            return ERROR_INT("jp2k data not made", __func__, 1);
1274
0
    } else if (type == L_G4_ENCODE) {
1275
0
        if ((*pcid = pixGenerateG4Data(pixs, ascii85)) == NULL)
1276
0
            return ERROR_INT("g4 data not made", __func__, 1);
1277
0
    } else {  /* type == L_FLATE_ENCODE */
1278
0
        if ((*pcid = pixGenerateFlateData(pixs, ascii85)) == NULL)
1279
0
            return ERROR_INT("flate data not made", __func__, 1);
1280
0
    }
1281
0
    return 0;
1282
0
}
1283
1284
1285
/*!
1286
 * \brief   l_generateFlateData()
1287
 *
1288
 * \param[in]    fname
1289
 * \param[in]    ascii85flag    0 for gzipped; 1 for ascii85-encoded gzipped
1290
 * \return  cid flate compressed image data, or NULL on error
1291
 *
1292
 * <pre>
1293
 * Notes:
1294
 *      (1) The input image is converted to one of these 4 types:
1295
 *           ~ 1 bpp
1296
 *           ~ 8 bpp, no colormap
1297
 *           ~ 8 bpp, colormap
1298
 *           ~ 32 bpp rgb
1299
 *      (2) Set ascii85flag:
1300
 *           ~ 0 for binary data (PDF only)
1301
 *           ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
1302
 *      (3) Always transcodes (i.e., first decodes the png file)
1303
 * </pre>
1304
 */
1305
L_COMP_DATA *
1306
l_generateFlateData(const char  *fname,
1307
                    l_int32      ascii85flag)
1308
0
{
1309
0
L_COMP_DATA  *cid;
1310
0
PIX          *pixs;
1311
1312
0
    if (!fname)
1313
0
        return (L_COMP_DATA *)ERROR_PTR("fname not defined", __func__, NULL);
1314
1315
0
    if ((pixs = pixRead(fname)) == NULL)
1316
0
        return (L_COMP_DATA *)ERROR_PTR("pixs not made", __func__, NULL);
1317
0
    cid = pixGenerateFlateData(pixs, ascii85flag);
1318
0
    pixDestroy(&pixs);
1319
0
    return cid;
1320
0
}
1321
1322
1323
/*!
1324
 * \brief   pixGenerateFlateData()
1325
 *
1326
 * \param[in]    pixs
1327
 * \param[in]    ascii85flag 0    for gzipped; 1 for ascii85-encoded gzipped
1328
 * \return  cid flate compressed image data, or NULL on error
1329
 *
1330
 * <pre>
1331
 * Notes:
1332
 *     (1) If called with an RGBA pix (spp == 4), the alpha channel
1333
 *         will be removed, projecting a white backgrouond through
1334
 *         any transparency.
1335
 *     (2) If called with a colormapped pix, any transparency in the
1336
 *         alpha component in the colormap will be ignored, as it is
1337
 *         for all leptonica operations on colormapped pix.
1338
 * </pre>
1339
 */
1340
static L_COMP_DATA *
1341
pixGenerateFlateData(PIX     *pixs,
1342
                     l_int32  ascii85flag)
1343
0
{
1344
0
l_uint8      *data = NULL;  /* uncompressed raster data in required format */
1345
0
l_uint8      *datacomp = NULL;  /* gzipped raster data */
1346
0
char         *data85 = NULL;  /* ascii85 encoded gzipped raster data */
1347
0
l_uint8      *cmapdata = NULL;  /* uncompressed colormap */
1348
0
char         *cmapdata85 = NULL;  /* ascii85 encoded uncompressed colormap */
1349
0
char         *cmapdatahex = NULL;  /* hex ascii uncompressed colormap */
1350
0
l_int32       ncolors;  /* in colormap; not used if cmapdata85 is null */
1351
0
l_int32       bps;  /* bits/sample: usually 8 */
1352
0
l_int32       spp;  /* samples/pixel: 1-grayscale/cmap); 3-rgb */
1353
0
l_int32       w, h, d, cmapflag;
1354
0
size_t        ncmapbytes85 = 0;
1355
0
size_t        nbytes85 = 0;
1356
0
size_t        nbytes, nbytescomp;
1357
0
L_COMP_DATA  *cid;
1358
0
PIX          *pixt;
1359
0
PIXCMAP      *cmap;
1360
1361
0
    if (!pixs)
1362
0
        return (L_COMP_DATA *)ERROR_PTR("pixs not defined", __func__, NULL);
1363
1364
        /* Convert the image to one of these 4 types:
1365
         *     1 bpp
1366
         *     8 bpp, no colormap
1367
         *     8 bpp, colormap
1368
         *     32 bpp rgb    */
1369
0
    pixGetDimensions(pixs, &w, &h, &d);
1370
0
    cmap = pixGetColormap(pixs);
1371
0
    cmapflag = (cmap) ? 1 : 0;
1372
0
    if (d == 2 || d == 4 || d == 16) {
1373
0
        pixt = pixConvertTo8(pixs, cmapflag);
1374
0
        cmap = pixGetColormap(pixt);
1375
0
        d = pixGetDepth(pixt);
1376
0
    } else if (d == 32 && pixGetSpp(pixs) == 4) {  /* remove alpha */
1377
0
        pixt = pixAlphaBlendUniform(pixs, 0xffffff00);
1378
0
    } else {
1379
0
        pixt = pixClone(pixs);
1380
0
    }
1381
0
    if (!pixt)
1382
0
        return (L_COMP_DATA *)ERROR_PTR("pixt not made", __func__, NULL);
1383
0
    spp = (d == 32) ? 3 : 1;
1384
0
    bps = (d == 32) ? 8 : d;
1385
1386
        /* Extract and encode the colormap data as both ascii85 and hexascii  */
1387
0
    ncolors = 0;
1388
0
    if (cmap) {
1389
0
        pixcmapSerializeToMemory(cmap, 3, &ncolors, &cmapdata);
1390
0
        if (!cmapdata) {
1391
0
            pixDestroy(&pixt);
1392
0
            return (L_COMP_DATA *)ERROR_PTR("cmapdata not made",
1393
0
                                            __func__, NULL);
1394
0
        }
1395
1396
0
        cmapdata85 = encodeAscii85(cmapdata, 3 * ncolors, &ncmapbytes85);
1397
0
        cmapdatahex = pixcmapConvertToHex(cmapdata, ncolors);
1398
0
        LEPT_FREE(cmapdata);
1399
0
    }
1400
1401
        /* Extract and compress the raster data */
1402
0
    pixGetRasterData(pixt, &data, &nbytes);
1403
0
    pixDestroy(&pixt);
1404
0
    if (!data) {
1405
0
        LEPT_FREE(cmapdata85);
1406
0
        LEPT_FREE(cmapdatahex);
1407
0
        return (L_COMP_DATA *)ERROR_PTR("data not returned", __func__, NULL);
1408
0
    }
1409
0
    datacomp = zlibCompress(data, nbytes, &nbytescomp);
1410
0
    LEPT_FREE(data);
1411
0
    if (!datacomp) {
1412
0
        LEPT_FREE(cmapdata85);
1413
0
        LEPT_FREE(cmapdatahex);
1414
0
        return (L_COMP_DATA *)ERROR_PTR("datacomp not made", __func__, NULL);
1415
0
    }
1416
1417
        /* Optionally, encode the compressed data */
1418
0
    if (ascii85flag == 1) {
1419
0
        data85 = encodeAscii85(datacomp, nbytescomp, &nbytes85);
1420
0
        LEPT_FREE(datacomp);
1421
0
        if (!data85) {
1422
0
            LEPT_FREE(cmapdata85);
1423
0
            LEPT_FREE(cmapdatahex);
1424
0
            return (L_COMP_DATA *)ERROR_PTR("data85 not made", __func__, NULL);
1425
0
        } else {
1426
0
            data85[nbytes85 - 1] = '\0';  /* remove the newline */
1427
0
        }
1428
0
    }
1429
1430
0
    cid = (L_COMP_DATA *)LEPT_CALLOC(1, sizeof(L_COMP_DATA));
1431
0
    if (ascii85flag == 0) {
1432
0
        cid->datacomp = datacomp;
1433
0
    } else {  /* ascii85 */
1434
0
        cid->data85 = data85;
1435
0
        cid->nbytes85 = nbytes85;
1436
0
    }
1437
0
    cid->type = L_FLATE_ENCODE;
1438
0
    cid->cmapdatahex = cmapdatahex;
1439
0
    cid->cmapdata85 = cmapdata85;
1440
0
    cid->nbytescomp = nbytescomp;
1441
0
    cid->ncolors = ncolors;
1442
0
    cid->w = w;
1443
0
    cid->h = h;
1444
0
    cid->bps = bps;
1445
0
    cid->spp = spp;
1446
0
    cid->res = pixGetXRes(pixs);
1447
0
    cid->nbytes = nbytes;  /* only for debugging */
1448
0
    return cid;
1449
0
}
1450
1451
1452
/*!
1453
 * \brief   pixGenerateJpegData()
1454
 *
1455
 * \param[in]    pixs           8, 16 or 32 bpp, no colormap
1456
 * \param[in]    ascii85flag    0 for jpeg; 1 for ascii85-encoded jpeg
1457
 * \param[in]    quality        0 for default, which is 75
1458
 * \return  cid jpeg compressed data, or NULL on error
1459
 *
1460
 * <pre>
1461
 * Notes:
1462
 *      (1) Set ascii85flag:
1463
 *           ~ 0 for binary data (PDF only)
1464
 *           ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
1465
 *      (2) If 16 bpp, convert first to 8 bpp, using the MSB
1466
 * </pre>
1467
 */
1468
static L_COMP_DATA *
1469
pixGenerateJpegData(PIX     *pixs,
1470
                    l_int32  ascii85flag,
1471
                    l_int32  quality)
1472
0
{
1473
0
l_int32       d;
1474
0
char         *fname;
1475
0
L_COMP_DATA  *cid;
1476
1477
0
    if (!pixs)
1478
0
        return (L_COMP_DATA *)ERROR_PTR("pixs not defined", __func__, NULL);
1479
0
    if (pixGetColormap(pixs))
1480
0
        return (L_COMP_DATA *)ERROR_PTR("pixs has colormap", __func__, NULL);
1481
0
    d = pixGetDepth(pixs);
1482
0
    if (d != 8 && d != 16 && d != 32)
1483
0
        return (L_COMP_DATA *)ERROR_PTR("pixs not 8, 16 or 32 bpp",
1484
0
                __func__, NULL);
1485
1486
        /* Compress to a temp jpeg file */
1487
0
    fname = l_makeTempFilename();
1488
0
    if (pixWriteJpeg(fname, pixs, quality, 0)) {
1489
0
        LEPT_FREE(fname);
1490
0
        return NULL;
1491
0
    }
1492
1493
        /* Generate the data */
1494
0
    cid = l_generateJpegData(fname, ascii85flag);
1495
0
    if (lept_rmfile(fname) != 0)
1496
0
        L_ERROR("temp file %s was not deleted\n", __func__, fname);
1497
0
    LEPT_FREE(fname);
1498
0
    return cid;
1499
0
}
1500
1501
1502
/*!
1503
 * \brief   pixGenerateJp2kData()
1504
 *
1505
 * \param[in]    pixs           8 or 32 bpp, no colormap
1506
 * \param[in]    quality        0 for default, which is 34
1507
 * \return  cid jp2k compressed data, or NULL on error
1508
 *
1509
 * <pre>
1510
 * Notes:
1511
 *      (1) The quality can be set between 27 (very poor) and 45
1512
 *          (nearly perfect).  Use 0 for default (34). Use 100 for lossless,
1513
 *          but this is very expensive and not recommended.
1514
 * </pre>
1515
 */
1516
static L_COMP_DATA *
1517
pixGenerateJp2kData(PIX     *pixs,
1518
                    l_int32  quality)
1519
0
{
1520
0
l_int32       d;
1521
0
char         *fname;
1522
0
L_COMP_DATA  *cid;
1523
1524
0
    if (!pixs)
1525
0
        return (L_COMP_DATA *)ERROR_PTR("pixs not defined", __func__, NULL);
1526
0
    if (pixGetColormap(pixs))
1527
0
        return (L_COMP_DATA *)ERROR_PTR("pixs has colormap", __func__, NULL);
1528
0
    d = pixGetDepth(pixs);
1529
0
    if (d != 8 && d != 32)
1530
0
        return (L_COMP_DATA *)ERROR_PTR("pixs not 8 or 32 bpp", __func__, NULL);
1531
1532
        /* Compress to a temp jp2k file */
1533
0
    fname = l_makeTempFilename();
1534
0
    if (pixWriteJp2k(fname, pixs, quality, 5, 0, 0)) {
1535
0
        LEPT_FREE(fname);
1536
0
        return NULL;
1537
0
    }
1538
1539
        /* Generate the data */
1540
0
    cid = l_generateJp2kData(fname);
1541
0
    if (lept_rmfile(fname) != 0)
1542
0
        L_ERROR("temp file %s was not deleted\n", __func__, fname);
1543
0
    LEPT_FREE(fname);
1544
0
    return cid;
1545
0
}
1546
1547
1548
/*!
1549
 * \brief   pixGenerateG4Data()
1550
 *
1551
 * \param[in]    pixs           1 bpp, no colormap
1552
 * \param[in]    ascii85flag    0 for gzipped; 1 for ascii85-encoded gzipped
1553
 * \return  cid g4 compressed image data, or NULL on error
1554
 *
1555
 * <pre>
1556
 * Notes:
1557
 *      (1) Set ascii85flag:
1558
 *           ~ 0 for binary data (PDF only)
1559
 *           ~ 1 for ascii85 (5 for 4) encoded binary data (PostScript only)
1560
 * </pre>
1561
 */
1562
static L_COMP_DATA *
1563
pixGenerateG4Data(PIX     *pixs,
1564
                  l_int32  ascii85flag)
1565
0
{
1566
0
char         *fname;
1567
0
L_COMP_DATA  *cid;
1568
1569
0
    if (!pixs)
1570
0
        return (L_COMP_DATA *)ERROR_PTR("pixs not defined", __func__, NULL);
1571
0
    if (pixGetDepth(pixs) != 1)
1572
0
        return (L_COMP_DATA *)ERROR_PTR("pixs not 1 bpp", __func__, NULL);
1573
0
    if (pixGetColormap(pixs))
1574
0
        return (L_COMP_DATA *)ERROR_PTR("pixs has colormap", __func__, NULL);
1575
1576
        /* Compress to a temp tiff g4 file */
1577
0
    fname = l_makeTempFilename();
1578
0
    if (pixWrite(fname, pixs, IFF_TIFF_G4)) {
1579
0
        LEPT_FREE(fname);
1580
0
        return NULL;
1581
0
    }
1582
1583
0
    cid = l_generateG4Data(fname, ascii85flag);
1584
0
    if (lept_rmfile(fname) != 0)
1585
0
        L_ERROR("temp file %s was not deleted\n", __func__, fname);
1586
0
    LEPT_FREE(fname);
1587
0
    return cid;
1588
0
}
1589
1590
1591
/*!
1592
 * \brief   cidConvertToPdfData()
1593
 *
1594
 * \param[in]    cid       compressed image data
1595
 * \param[in]    title     [optional] pdf title; can be null
1596
 * \param[out]   pdata     output pdf data for image
1597
 * \param[out]   pnbytes   size of output pdf data
1598
 * \return  0 if OK, 1 on error
1599
 *
1600
 * <pre>
1601
 * Notes:
1602
 *      (1) Caller must not destroy the cid.  It is absorbed in the
1603
 *          lpd and destroyed by this function.
1604
 * </pre>
1605
 */
1606
l_ok
1607
cidConvertToPdfData(L_COMP_DATA  *cid,
1608
                    const char   *title,
1609
                    l_uint8     **pdata,
1610
                    size_t       *pnbytes)
1611
0
{
1612
0
l_int32      res, ret;
1613
0
l_float32    wpt, hpt;
1614
0
L_PDF_DATA  *lpd = NULL;
1615
1616
0
    if (!pdata || !pnbytes)
1617
0
        return ERROR_INT("&data and &nbytes not both defined", __func__, 1);
1618
0
    *pdata = NULL;
1619
0
    *pnbytes = 0;
1620
0
    if (!cid)
1621
0
        return ERROR_INT("cid not defined", __func__, 1);
1622
1623
        /* Get media box parameters, in pts */
1624
0
    res = cid->res;
1625
0
    if (res <= 0)
1626
0
        res = DefaultInputRes;
1627
0
    wpt = cid->w * 72.f / res;
1628
0
    hpt = cid->h * 72.f / res;
1629
1630
        /* Set up the pdf data struct (lpd) */
1631
0
    if ((lpd = pdfdataCreate(title)) == NULL)
1632
0
        return ERROR_INT("lpd not made", __func__, 1);
1633
0
    ptraAdd(lpd->cida, cid);
1634
0
    lpd->n++;
1635
0
    ptaAddPt(lpd->xy, 0, 0);   /* xpt = ypt = 0 */
1636
0
    ptaAddPt(lpd->wh, wpt, hpt);
1637
1638
        /* Generate the pdf string and destroy the lpd */
1639
0
    ret = l_generatePdf(pdata, pnbytes, lpd);
1640
0
    pdfdataDestroy(&lpd);
1641
0
    if (ret)
1642
0
        return ERROR_INT("pdf output not made", __func__, 1);
1643
0
    return 0;
1644
0
}
1645
1646
1647
/*!
1648
 * \brief   l_CIDataDestroy()
1649
 *
1650
 * \param[in,out]   pcid     will be set to null before returning
1651
 * \return  void
1652
 */
1653
void
1654
l_CIDataDestroy(L_COMP_DATA  **pcid)
1655
0
{
1656
0
L_COMP_DATA  *cid;
1657
1658
0
    if (pcid == NULL) {
1659
0
        L_WARNING("ptr address is null!\n", __func__);
1660
0
        return;
1661
0
    }
1662
0
    if ((cid = *pcid) == NULL)
1663
0
        return;
1664
1665
0
    if (cid->datacomp) LEPT_FREE(cid->datacomp);
1666
0
    if (cid->data85) LEPT_FREE(cid->data85);
1667
0
    if (cid->cmapdata85) LEPT_FREE(cid->cmapdata85);
1668
0
    if (cid->cmapdatahex) LEPT_FREE(cid->cmapdatahex);
1669
0
    LEPT_FREE(cid);
1670
0
    *pcid = NULL;
1671
0
}
1672
1673
1674
/*---------------------------------------------------------------------*
1675
 *         Helper functions for generating the output pdf string       *
1676
 *---------------------------------------------------------------------*/
1677
/*!
1678
 * \brief   l_generatePdf()
1679
 *
1680
 * \param[out]   pdata     pdf array
1681
 * \param[out]   pnbytes   number of bytes in pdf array
1682
 * \param[in]    lpd       all the required input image data
1683
 * \return  0 if OK, 1 on error
1684
 *
1685
 * <pre>
1686
 * Notes:
1687
 *      (1) On error, no data is returned.
1688
 *      (2) The objects are:
1689
 *            1: Catalog
1690
 *            2: Info
1691
 *            3: Pages
1692
 *            4: Page
1693
 *            5: Contents  (rendering command)
1694
 *            6 to 6+n-1: n XObjects
1695
 *            6+n to 6+n+m-1: m colormaps
1696
 * </pre>
1697
 */
1698
static l_int32
1699
l_generatePdf(l_uint8    **pdata,
1700
              size_t      *pnbytes,
1701
              L_PDF_DATA  *lpd)
1702
0
{
1703
0
    if (!pdata)
1704
0
        return ERROR_INT("&data not defined", __func__, 1);
1705
0
    *pdata = NULL;
1706
0
    if (!pnbytes)
1707
0
        return ERROR_INT("&nbytes not defined", __func__, 1);
1708
0
    *pnbytes = 0;
1709
0
    if (!lpd)
1710
0
        return ERROR_INT("lpd not defined", __func__, 1);
1711
1712
0
    generateFixedStringsPdf(lpd);
1713
0
    generateMediaboxPdf(lpd);
1714
0
    generatePageStringPdf(lpd);
1715
0
    generateContentStringPdf(lpd);
1716
0
    generatePreXStringsPdf(lpd);
1717
0
    generateColormapStringsPdf(lpd);
1718
0
    generateTrailerPdf(lpd);
1719
0
    return generateOutputDataPdf(pdata, pnbytes, lpd);
1720
0
}
1721
1722
1723
static void
1724
generateFixedStringsPdf(L_PDF_DATA  *lpd)
1725
0
{
1726
0
char     buf[L_SMALLBUF];
1727
0
char    *version, *datestr;
1728
0
SARRAY  *sa;
1729
1730
        /* Accumulate data for the header and objects 1-3 */
1731
0
    lpd->id = stringNew("%PDF-1.5\n");
1732
0
    l_dnaAddNumber(lpd->objsize, strlen(lpd->id));
1733
1734
0
    lpd->obj1 = stringNew("1 0 obj\n"
1735
0
                          "<<\n"
1736
0
                          "/Type /Catalog\n"
1737
0
                          "/Pages 3 0 R\n"
1738
0
                          ">>\n"
1739
0
                          "endobj\n");
1740
0
    l_dnaAddNumber(lpd->objsize, strlen(lpd->obj1));
1741
1742
0
    sa = sarrayCreate(0);
1743
0
    sarrayAddString(sa, "2 0 obj\n"
1744
0
                        "<<\n", L_COPY);
1745
0
    if (var_WRITE_DATE_AND_VERSION) {
1746
0
        datestr = l_getFormattedDate();
1747
0
        snprintf(buf, sizeof(buf), "/CreationDate (D:%s)\n", datestr);
1748
0
        sarrayAddString(sa, buf, L_COPY);
1749
0
        LEPT_FREE(datestr);
1750
0
        version = getLeptonicaVersion();
1751
0
        snprintf(buf, sizeof(buf),
1752
0
                 "/Producer (leptonica: %s)\n", version);
1753
0
        LEPT_FREE(version);
1754
0
    } else {
1755
0
        snprintf(buf, sizeof(buf), "/Producer (leptonica)\n");
1756
0
    }
1757
0
    sarrayAddString(sa, buf, L_COPY);
1758
0
    if (lpd->title) {
1759
0
        char *hexstr;
1760
0
        if ((hexstr = generateEscapeString(lpd->title)) != NULL) {
1761
0
            snprintf(buf, sizeof(buf), "/Title %s\n", hexstr);
1762
0
            sarrayAddString(sa, buf, L_COPY);
1763
0
        } else {
1764
0
            L_ERROR("title string is not ascii\n", __func__);
1765
0
        }
1766
0
        LEPT_FREE(hexstr);
1767
0
    }
1768
0
    sarrayAddString(sa, ">>\n"
1769
0
                                "endobj\n", L_COPY);
1770
0
    lpd->obj2 = sarrayToString(sa, 0);
1771
0
    l_dnaAddNumber(lpd->objsize, strlen(lpd->obj2));
1772
0
    sarrayDestroy(&sa);
1773
1774
0
    lpd->obj3 = stringNew("3 0 obj\n"
1775
0
                          "<<\n"
1776
0
                          "/Type /Pages\n"
1777
0
                          "/Kids [ 4 0 R ]\n"
1778
0
                          "/Count 1\n"
1779
0
                          ">>\n");
1780
0
    l_dnaAddNumber(lpd->objsize, strlen(lpd->obj3));
1781
1782
        /* Do the post-datastream string */
1783
0
    lpd->poststream = stringNew("\n"
1784
0
                                "endstream\n"
1785
0
                                "endobj\n");
1786
0
}
1787
1788
1789
/*!
 * \brief   generateEscapeString()
 *
 * \param[in]   str      input string
 * \return   hex escape string, or null on error
 *
 * <pre>
 * Notes:
 *      (1) If the input string is not ascii, returns null.
 *      (2) This takes an input ascii string and generates a hex
 *          ascii output string with 4 hex digits out for each byte in,
 *          enclosed in angle brackets.  The feff code at the beginning
 *          tells the pdf interpreter that the data is to be interpreted
 *          as big-endian, 4 hex digits at a time.  For ascii, the first
 *          two digits are 0 and the last two are less than 0x80.
 *      (3) The ascii test is done on the byte value as unsigned, so
 *          it does not depend on whether plain char is signed.
 *      (4) The returned string is allocated here and must be freed
 *          by the caller.
 * </pre>
 */
static char  *
generateEscapeString(const char  *str)
{
char    *buffer;
size_t   i, nchar, buflen, pos;

    if (!str)
        return (char *)ERROR_PTR("str not defined", __func__, NULL);
    nchar = strlen(str);
    for (i = 0; i < nchar; i++) {
        if ((unsigned char)str[i] > 0x7f)
            return (char *)ERROR_PTR("str not all ascii", __func__, NULL);
    }

        /* 5 for "<feff", 4 per input char, 1 for ">", 1 for the
         * terminating nul; the rest is slack. */
    buflen = 4 * nchar + 10;
    if ((buffer = (char *)LEPT_CALLOC(buflen, sizeof(char))) == NULL)
        return (char *)ERROR_PTR("buffer not made", __func__, NULL);

        /* Write at a running offset; each piece has a known length,
         * so there is no need to rescan the buffer for its end. */
    pos = 5;
    snprintf(buffer, buflen, "<feff");
    for (i = 0; i < nchar; i++) {
        snprintf(buffer + pos, buflen - pos, "%04x",
                 (unsigned int)(unsigned char)str[i]);
        pos += 4;
    }
    snprintf(buffer + pos, buflen - pos, ">");
    return buffer;
}
1831
1832
1833
static void
1834
generateMediaboxPdf(L_PDF_DATA  *lpd)
1835
0
{
1836
0
l_int32    i;
1837
0
l_float32  xpt, ypt, wpt, hpt, maxx, maxy;
1838
1839
        /* First get the full extent of all the images.
1840
         * This is the mediabox, in pts. */
1841
0
    maxx = maxy = 0;
1842
0
    for (i = 0; i < lpd->n; i++) {
1843
0
        ptaGetPt(lpd->xy, i, &xpt, &ypt);
1844
0
        ptaGetPt(lpd->wh, i, &wpt, &hpt);
1845
0
        maxx = L_MAX(maxx, xpt + wpt);
1846
0
        maxy = L_MAX(maxy, ypt + hpt);
1847
0
    }
1848
1849
0
    lpd->mediabox = boxCreate(0, 0, (l_int32)(maxx + 0.5),
1850
0
                              (l_int32)(maxy + 0.5));
1851
1852
        /* ypt is in standard image coordinates: the location of
1853
         * the UL image corner with respect to the UL media box corner.
1854
         * Rewrite each ypt for PostScript coordinates: the location of
1855
         * the LL image corner with respect to the LL media box corner. */
1856
0
    for (i = 0; i < lpd->n; i++) {
1857
0
        ptaGetPt(lpd->xy, i, &xpt, &ypt);
1858
0
        ptaGetPt(lpd->wh, i, &wpt, &hpt);
1859
0
        ptaSetPt(lpd->xy, i, xpt, maxy - ypt - hpt);
1860
0
    }
1861
0
}
1862
1863
1864
static l_int32
1865
generatePageStringPdf(L_PDF_DATA  *lpd)
1866
0
{
1867
0
char    *buf;
1868
0
char    *xstr;
1869
0
l_int32  bufsize, i, wpt, hpt;
1870
0
SARRAY  *sa;
1871
1872
        /* Allocate 1000 bytes for the boilerplate text, and
1873
         * 50 bytes for each reference to an image in the
1874
         * ProcSet array.  */
1875
0
    bufsize = 1000 + 50 * lpd->n;
1876
0
    if ((buf = (char *)LEPT_CALLOC(bufsize, sizeof(char))) == NULL)
1877
0
        return ERROR_INT("calloc fail for buf", __func__, 1);
1878
1879
0
    boxGetGeometry(lpd->mediabox, NULL, NULL, &wpt, &hpt);
1880
0
    sa = sarrayCreate(lpd->n);
1881
0
    for (i = 0; i < lpd->n; i++) {
1882
0
        snprintf(buf, bufsize, "/Im%d %d 0 R   ", i + 1, 6 + i);
1883
0
        sarrayAddString(sa, buf, L_COPY);
1884
0
    }
1885
0
    xstr = sarrayToString(sa, 0);
1886
0
    sarrayDestroy(&sa);
1887
0
    if (!xstr) {
1888
0
        LEPT_FREE(buf);
1889
0
        return ERROR_INT("xstr not made", __func__, 1);
1890
0
    }
1891
1892
0
    snprintf(buf, bufsize, "4 0 obj\n"
1893
0
                           "<<\n"
1894
0
                           "/Type /Page\n"
1895
0
                           "/Parent 3 0 R\n"
1896
0
                           "/MediaBox [%d %d %d %d]\n"
1897
0
                           "/Contents 5 0 R\n"
1898
0
                           "/Resources\n"
1899
0
                           "<<\n"
1900
0
                           "/XObject << %s >>\n"
1901
0
                           "/ProcSet [ /ImageB /ImageI /ImageC ]\n"
1902
0
                           ">>\n"
1903
0
                           ">>\n"
1904
0
                           "endobj\n",
1905
0
                           0, 0, wpt, hpt, xstr);
1906
1907
0
    lpd->obj4 = stringNew(buf);
1908
0
    l_dnaAddNumber(lpd->objsize, strlen(lpd->obj4));
1909
0
    sarrayDestroy(&sa);
1910
0
    LEPT_FREE(buf);
1911
0
    LEPT_FREE(xstr);
1912
0
    return 0;
1913
0
}
1914
1915
1916
static l_int32
1917
generateContentStringPdf(L_PDF_DATA  *lpd)
1918
0
{
1919
0
char      *buf;
1920
0
char      *cstr;
1921
0
l_int32    i, bufsize;
1922
0
l_float32  xpt, ypt, wpt, hpt;
1923
0
SARRAY    *sa;
1924
1925
0
    bufsize = 1000 + 200 * lpd->n;
1926
0
    if ((buf = (char *)LEPT_CALLOC(bufsize, sizeof(char))) == NULL)
1927
0
        return ERROR_INT("calloc fail for buf", __func__, 1);
1928
1929
0
    sa = sarrayCreate(lpd->n);
1930
0
    for (i = 0; i < lpd->n; i++) {
1931
0
        ptaGetPt(lpd->xy, i, &xpt, &ypt);
1932
0
        ptaGetPt(lpd->wh, i, &wpt, &hpt);
1933
0
        snprintf(buf, bufsize,
1934
0
                 "q %.4f %.4f %.4f %.4f %.4f %.4f cm /Im%d Do Q\n",
1935
0
                 wpt, 0.0, 0.0, hpt, xpt, ypt, i + 1);
1936
0
        sarrayAddString(sa, buf, L_COPY);
1937
0
    }
1938
0
    cstr = sarrayToString(sa, 0);
1939
0
    sarrayDestroy(&sa);
1940
0
    if (!cstr) {
1941
0
        LEPT_FREE(buf);
1942
0
        return ERROR_INT("cstr not made", __func__, 1);
1943
0
    }
1944
1945
0
    snprintf(buf, bufsize, "5 0 obj\n"
1946
0
                           "<< /Length %d >>\n"
1947
0
                           "stream\n"
1948
0
                           "%s"
1949
0
                           "endstream\n"
1950
0
                           "endobj\n",
1951
0
                           (l_int32)strlen(cstr), cstr);
1952
1953
0
    lpd->obj5 = stringNew(buf);
1954
0
    l_dnaAddNumber(lpd->objsize, strlen(lpd->obj5));
1955
0
    sarrayDestroy(&sa);
1956
0
    LEPT_FREE(buf);
1957
0
    LEPT_FREE(cstr);
1958
0
    return 0;
1959
0
}
1960
1961
1962
static l_int32
1963
generatePreXStringsPdf(L_PDF_DATA  *lpd)
1964
0
{
1965
0
char          buff[256];
1966
0
char          buf[L_BIGBUF];
1967
0
char         *cstr, *bstr, *fstr, *pstr, *xstr, *photometry;
1968
0
l_int32       i, cmindex;
1969
0
L_COMP_DATA  *cid;
1970
0
SARRAY       *sa;
1971
1972
0
    sa = lpd->saprex;
1973
0
    cmindex = 6 + lpd->n;  /* starting value */
1974
0
    for (i = 0; i < lpd->n; i++) {
1975
0
        pstr = cstr = NULL;
1976
0
        if ((cid = pdfdataGetCid(lpd, i)) == NULL)
1977
0
            return ERROR_INT("cid not found", __func__, 1);
1978
1979
0
        if (cid->type == L_G4_ENCODE) {
1980
0
            if (var_WRITE_G4_IMAGE_MASK) {
1981
0
                cstr = stringNew("/ImageMask true\n"
1982
0
                                 "/ColorSpace /DeviceGray");
1983
0
            } else {
1984
0
                cstr = stringNew("/ColorSpace /DeviceGray");
1985
0
            }
1986
0
            bstr = stringNew("/BitsPerComponent 1\n"
1987
0
                             "/Interpolate true");
1988
                /* Note: the reversal is deliberate.  The BlackIs1 flag
1989
                 * is misleadingly named: it says whether to invert the
1990
                 * image on decoding because the black pixels are 0,
1991
                 * not whether the black pixels are 1!  The default for
1992
                 * BlackIs1 is "false", which means "don't invert because
1993
                 * black is 1."  Yikes. */
1994
0
            photometry = (cid->minisblack) ? stringNew("true")
1995
0
                                           : stringNew("false");
1996
0
            snprintf(buff, sizeof(buff),
1997
0
                     "/Filter /CCITTFaxDecode\n"
1998
0
                     "/DecodeParms\n"
1999
0
                     "<<\n"
2000
0
                     "/BlackIs1 %s\n"
2001
0
                     "/K -1\n"
2002
0
                     "/Columns %d\n"
2003
0
                     ">>", photometry, cid->w);
2004
0
            fstr = stringNew(buff);
2005
0
            LEPT_FREE(photometry);
2006
0
        } else if (cid->type == L_JPEG_ENCODE) {
2007
0
            if (cid->spp == 1)
2008
0
                cstr = stringNew("/ColorSpace /DeviceGray");
2009
0
            else if (cid->spp == 3)
2010
0
                cstr = stringNew("/ColorSpace /DeviceRGB");
2011
0
            else if (cid->spp == 4)   /* pdf supports cmyk */
2012
0
                cstr = stringNew("/ColorSpace /DeviceCMYK");
2013
0
            else
2014
0
                L_ERROR("in jpeg: spp != 1, 3 or 4\n", __func__);
2015
0
            bstr = stringNew("/BitsPerComponent 8");
2016
0
            fstr = stringNew("/Filter /DCTDecode");
2017
0
        } else if (cid->type == L_JP2K_ENCODE) {
2018
0
            if (cid->spp == 1)
2019
0
                cstr = stringNew("/ColorSpace /DeviceGray");
2020
0
            else if (cid->spp == 3)
2021
0
                cstr = stringNew("/ColorSpace /DeviceRGB");
2022
0
            else
2023
0
                L_ERROR("in jp2k: spp != 1 && spp != 3\n", __func__);
2024
0
            bstr = stringNew("/BitsPerComponent 8");
2025
0
            fstr = stringNew("/Filter /JPXDecode");
2026
0
        } else {  /* type == L_FLATE_ENCODE */
2027
0
            if (cid->ncolors > 0) {  /* cmapped */
2028
0
                snprintf(buff, sizeof(buff), "/ColorSpace %d 0 R", cmindex++);
2029
0
                cstr = stringNew(buff);
2030
0
            } else {
2031
0
                if (cid->spp == 1 && cid->bps == 1)
2032
0
                    cstr = stringNew("/ColorSpace /DeviceGray\n"
2033
0
                                     "/Decode [1 0]");
2034
0
                else if (cid->spp == 1)  /* 8 bpp */
2035
0
                    cstr = stringNew("/ColorSpace /DeviceGray");
2036
0
                else if (cid->spp == 3)
2037
0
                    cstr = stringNew("/ColorSpace /DeviceRGB");
2038
0
                else
2039
0
                    L_ERROR("unknown colorspace: spp = %d\n",
2040
0
                            __func__, cid->spp);
2041
0
            }
2042
0
            snprintf(buff, sizeof(buff), "/BitsPerComponent %d", cid->bps);
2043
0
            bstr = stringNew(buff);
2044
0
            fstr = stringNew("/Filter /FlateDecode");
2045
0
            if (cid->predictor == TRUE) {
2046
0
                snprintf(buff, sizeof(buff),
2047
0
                         "/DecodeParms\n"
2048
0
                         "<<\n"
2049
0
                         "  /Columns %d\n"
2050
0
                         "  /Predictor 14\n"
2051
0
                         "  /Colors %d\n"
2052
0
                         "  /BitsPerComponent %d\n"
2053
0
                         ">>\n", cid->w, cid->spp, cid->bps);
2054
0
                pstr = stringNew(buff);
2055
0
            }
2056
0
        }
2057
0
        if (!pstr)  /* no decode parameters */
2058
0
            pstr = stringNew("");
2059
2060
0
        snprintf(buf, sizeof(buf),
2061
0
                 "%d 0 obj\n"
2062
0
                 "<<\n"
2063
0
                 "/Length %zu\n"
2064
0
                 "/Subtype /Image\n"
2065
0
                 "%s\n"  /* colorspace */
2066
0
                 "/Width %d\n"
2067
0
                 "/Height %d\n"
2068
0
                 "%s\n"  /* bits/component */
2069
0
                 "%s\n"  /* filter */
2070
0
                 "%s"   /* decode parms; can be empty */
2071
0
                 ">>\n"
2072
0
                 "stream\n",
2073
0
                 6 + i, cid->nbytescomp, cstr,
2074
0
                 cid->w, cid->h, bstr, fstr, pstr);
2075
0
        xstr = stringNew(buf);
2076
0
        sarrayAddString(sa, xstr, L_INSERT);
2077
0
        l_dnaAddNumber(lpd->objsize,
2078
0
                      strlen(xstr) + cid->nbytescomp + strlen(lpd->poststream));
2079
0
        LEPT_FREE(cstr);
2080
0
        LEPT_FREE(bstr);
2081
0
        LEPT_FREE(fstr);
2082
0
        LEPT_FREE(pstr);
2083
0
    }
2084
2085
0
    return 0;
2086
0
}
2087
2088
2089
static l_int32
2090
generateColormapStringsPdf(L_PDF_DATA  *lpd)
2091
0
{
2092
0
char          buf[L_BIGBUF];
2093
0
char         *cmstr;
2094
0
l_int32       i, cmindex, ncmap;
2095
0
L_COMP_DATA  *cid;
2096
0
SARRAY       *sa;
2097
2098
        /* In our canonical format, we have 5 objects, followed
2099
         * by n XObjects, followed by m colormaps, so the index of
2100
         * the first colormap object is 6 + n. */
2101
0
    sa = lpd->sacmap;
2102
0
    cmindex = 6 + lpd->n;  /* starting value */
2103
0
    ncmap = 0;
2104
0
    for (i = 0; i < lpd->n; i++) {
2105
0
        if ((cid = pdfdataGetCid(lpd, i)) == NULL)
2106
0
            return ERROR_INT("cid not found", __func__, 1);
2107
0
        if (cid->ncolors == 0) continue;
2108
2109
0
        ncmap++;
2110
0
        snprintf(buf, sizeof(buf), "%d 0 obj\n"
2111
0
                                   "[ /Indexed /DeviceRGB\n"
2112
0
                                   "%d\n"
2113
0
                                   "%s\n"
2114
0
                                   "]\n"
2115
0
                                   "endobj\n",
2116
0
                                   cmindex, cid->ncolors - 1, cid->cmapdatahex);
2117
0
        cmindex++;
2118
0
        cmstr = stringNew(buf);
2119
0
        l_dnaAddNumber(lpd->objsize, strlen(cmstr));
2120
0
        sarrayAddString(sa, cmstr, L_INSERT);
2121
0
    }
2122
2123
0
    lpd->ncmap = ncmap;
2124
0
    return 0;
2125
0
}
2126
2127
2128
static void
2129
generateTrailerPdf(L_PDF_DATA  *lpd)
2130
0
{
2131
0
l_int32  i, n, size, linestart;
2132
0
L_DNA   *daloc, *dasize;
2133
2134
        /* Let nobj be the number of numbered objects.  These numbered
2135
         * objects are indexed by their pdf number in arrays naloc[]
2136
         * and nasize[].  The 0th object is the 9 byte header.  Then
2137
         * the number of objects in nasize, which includes the header,
2138
         * is n = nobj + 1.  The array naloc[] has n + 1 elements,
2139
         * because it includes as the last element the starting
2140
         * location of xref.  The indexing of these objects, their
2141
         * starting locations and sizes are:
2142
         *
2143
         *     Object number         Starting location         Size
2144
         *     -------------         -----------------     --------------
2145
         *          0                   daloc[0] = 0       dasize[0] = 9
2146
         *          1                   daloc[1] = 9       dasize[1] = 49
2147
         *          n                   daloc[n]           dasize[n]
2148
         *          xref                daloc[n+1]
2149
         *
2150
         * We first generate daloc.
2151
         */
2152
0
    dasize = lpd->objsize;
2153
0
    daloc = lpd->objloc;
2154
0
    linestart = 0;
2155
0
    l_dnaAddNumber(daloc, linestart);  /* header */
2156
0
    n = l_dnaGetCount(dasize);
2157
0
    for (i = 0; i < n; i++) {
2158
0
        l_dnaGetIValue(dasize, i, &size);
2159
0
        linestart += size;
2160
0
        l_dnaAddNumber(daloc, linestart);
2161
0
    }
2162
0
    l_dnaGetIValue(daloc, n, &lpd->xrefloc);  /* save it */
2163
2164
        /* Now make the actual trailer string */
2165
0
    lpd->trailer = makeTrailerStringPdf(daloc);
2166
0
}
2167
2168
2169
static char *
2170
makeTrailerStringPdf(L_DNA  *daloc)
2171
0
{
2172
0
char    *outstr;
2173
0
char     buf[L_BIGBUF];
2174
0
l_int32  i, n, linestart, xrefloc;
2175
0
SARRAY  *sa;
2176
2177
0
    if (!daloc)
2178
0
        return (char *)ERROR_PTR("daloc not defined", __func__, NULL);
2179
0
    n = l_dnaGetCount(daloc) - 1;  /* numbered objects + 1 (yes, +1) */
2180
2181
0
    sa = sarrayCreate(0);
2182
0
    snprintf(buf, sizeof(buf), "xref\n"
2183
0
                               "0 %d\n"
2184
0
                               "0000000000 65535 f \n", n);
2185
0
    sarrayAddString(sa, buf, L_COPY);
2186
0
    for (i = 1; i < n; i++) {
2187
0
        l_dnaGetIValue(daloc, i, &linestart);
2188
0
        snprintf(buf, sizeof(buf), "%010d 00000 n \n", linestart);
2189
0
        sarrayAddString(sa, buf, L_COPY);
2190
0
    }
2191
2192
0
    l_dnaGetIValue(daloc, n, &xrefloc);
2193
0
    snprintf(buf, sizeof(buf), "trailer\n"
2194
0
                               "<<\n"
2195
0
                               "/Size %d\n"
2196
0
                               "/Root 1 0 R\n"
2197
0
                               "/Info 2 0 R\n"
2198
0
                               ">>\n"
2199
0
                               "startxref\n"
2200
0
                               "%d\n"
2201
0
                               "%%%%EOF\n", n, xrefloc);
2202
0
    sarrayAddString(sa, buf, L_COPY);
2203
0
    outstr = sarrayToString(sa, 0);
2204
0
    sarrayDestroy(&sa);
2205
0
    return outstr;
2206
0
}
2207
2208
2209
/*!
2210
 * \brief   generateOutputDataPdf()
2211
 *
2212
 * \param[out]   pdata      pdf data array
2213
 * \param[out]   pnbytes    size of pdf data array
2214
 * \param[in]    lpd        input data used to make pdf
2215
 * \return  0 if OK, 1 on error
2216
 *
2217
 * <pre>
2218
 * Notes:
2219
 *      (1) Only called from l_generatePdf().  On error, no data is returned.
2220
 * </pre>
2221
 */
2222
static l_int32
2223
generateOutputDataPdf(l_uint8    **pdata,
2224
                      size_t      *pnbytes,
2225
                      L_PDF_DATA  *lpd)
2226
0
{
2227
0
char         *str;
2228
0
l_uint8      *data;
2229
0
l_int32       nimages, i, len;
2230
0
l_int32      *sizes, *locs;
2231
0
size_t        nbytes;
2232
0
L_COMP_DATA  *cid;
2233
2234
0
    if (!pdata)
2235
0
        return ERROR_INT("&data not defined", __func__, 1);
2236
0
    *pdata = NULL;
2237
0
    if (!pnbytes)
2238
0
        return ERROR_INT("&nbytes not defined", __func__, 1);
2239
0
    nbytes = lpd->xrefloc + strlen(lpd->trailer);
2240
0
    *pnbytes = nbytes;
2241
0
    if ((data = (l_uint8 *)LEPT_CALLOC(nbytes, sizeof(l_uint8))) == NULL)
2242
0
        return ERROR_INT("calloc fail for data", __func__, 1);
2243
0
    *pdata = data;
2244
2245
0
    sizes = l_dnaGetIArray(lpd->objsize);
2246
0
    locs = l_dnaGetIArray(lpd->objloc);
2247
0
    memcpy(data, lpd->id, sizes[0]);
2248
0
    memcpy(data + locs[1], lpd->obj1, sizes[1]);
2249
0
    memcpy(data + locs[2], lpd->obj2, sizes[2]);
2250
0
    memcpy(data + locs[3], lpd->obj3, sizes[3]);
2251
0
    memcpy(data + locs[4], lpd->obj4, sizes[4]);
2252
0
    memcpy(data + locs[5], lpd->obj5, sizes[5]);
2253
2254
        /* Each image has 3 parts: variable preamble, the compressed
2255
         * data stream, and the fixed poststream. */
2256
0
    nimages = lpd->n;
2257
0
    for (i = 0; i < nimages; i++) {
2258
0
        if ((cid = pdfdataGetCid(lpd, i)) == NULL) {  /* should not happen */
2259
0
            LEPT_FREE(sizes);
2260
0
            LEPT_FREE(locs);
2261
0
            return ERROR_INT("cid not found", __func__, 1);
2262
0
        }
2263
0
        str = sarrayGetString(lpd->saprex, i, L_NOCOPY);
2264
0
        len = strlen(str);
2265
0
        memcpy(data + locs[6 + i], str, len);
2266
0
        memcpy(data + locs[6 + i] + len,
2267
0
               cid->datacomp, cid->nbytescomp);
2268
0
        memcpy(data + locs[6 + i] + len + cid->nbytescomp,
2269
0
               lpd->poststream, strlen(lpd->poststream));
2270
0
    }
2271
2272
        /* Each colormap is simply a stored string */
2273
0
    for (i = 0; i < lpd->ncmap; i++) {
2274
0
        str = sarrayGetString(lpd->sacmap, i, L_NOCOPY);
2275
0
        memcpy(data + locs[6 + nimages + i], str, strlen(str));
2276
0
    }
2277
2278
        /* And finally the trailer */
2279
0
    memcpy(data + lpd->xrefloc, lpd->trailer, strlen(lpd->trailer));
2280
0
    LEPT_FREE(sizes);
2281
0
    LEPT_FREE(locs);
2282
0
    return 0;
2283
0
}
2284
2285
2286
/*---------------------------------------------------------------------*
2287
 *          Helper functions for generating multipage pdf output       *
2288
 *---------------------------------------------------------------------*/
2289
/*!
2290
 * \brief   parseTrailerPdf()
2291
 *
2292
 * \param[in]    bas     lba of a pdf file
2293
 * \param[out]   pda     byte locations of the beginning of each object
2294
 * \return  0 if OK, 1 on error
2295
 */
2296
static l_int32
2297
parseTrailerPdf(L_BYTEA  *bas,
2298
                L_DNA   **pda)
2299
0
{
2300
0
char     *str;
2301
0
l_uint8   nl = '\n';
2302
0
l_uint8  *data;
2303
0
l_int32   i, j, start, startloc, xrefloc, found, loc, nobj, objno, trailer_ok;
2304
0
size_t    size;
2305
0
L_DNA    *da, *daobj, *daxref;
2306
0
SARRAY   *sa;
2307
2308
0
    if (!pda)
2309
0
        return ERROR_INT("&da not defined", __func__, 1);
2310
0
    *pda = NULL;
2311
0
    if (!bas)
2312
0
        return ERROR_INT("bas not defined", __func__, 1);
2313
0
    data = l_byteaGetData(bas, &size);
2314
0
    if (memcmp(data, "%PDF-1.", 7) != 0)
2315
0
        return ERROR_INT("PDF header signature not found", __func__, 1);
2316
2317
        /* Search for "startxref" starting 50 bytes from the EOF */
2318
0
    start = 0;
2319
0
    if (size > 50)
2320
0
        start = size - 50;
2321
0
    arrayFindSequence(data + start, size - start,
2322
0
                      (l_uint8 *)"startxref\n", 10, &loc, &found);
2323
0
    if (!found)
2324
0
        return ERROR_INT("startxref not found!", __func__, 1);
2325
0
    if (sscanf((char *)(data + start + loc + 10), "%d\n", &xrefloc) != 1)
2326
0
        return ERROR_INT("xrefloc not found!", __func__, 1);
2327
0
    if (xrefloc < 0 || xrefloc >= size)
2328
0
        return ERROR_INT("invalid xrefloc!", __func__, 1);
2329
0
    sa = sarrayCreateLinesFromString((char *)(data + xrefloc), 0);
2330
0
    str = sarrayGetString(sa, 1, L_NOCOPY);
2331
0
    if ((sscanf(str, "0 %d", &nobj)) != 1) {
2332
0
        sarrayDestroy(&sa);
2333
0
        return ERROR_INT("nobj not found", __func__, 1);
2334
0
    }
2335
2336
        /* Get starting locations.  The numa index is the
2337
         * object number.  loc[0] is the ID; loc[nobj + 1] is xrefloc.  */
2338
0
    da = l_dnaCreate(nobj + 1);
2339
0
    *pda = da;
2340
0
    for (i = 0; i < nobj; i++) {
2341
0
        str = sarrayGetString(sa, i + 2, L_NOCOPY);
2342
0
        sscanf(str, "%d", &startloc);
2343
0
        l_dnaAddNumber(da, startloc);
2344
0
    }
2345
0
    l_dnaAddNumber(da, xrefloc);
2346
2347
#if  DEBUG_MULTIPAGE
2348
    lept_stderr("************** Trailer string ************\n");
2349
    lept_stderr("xrefloc = %d", xrefloc);
2350
    sarrayWriteStderr(sa);
2351
2352
    lept_stderr("************** Object locations ************");
2353
    l_dnaWriteStderr(da);
2354
#endif  /* DEBUG_MULTIPAGE */
2355
0
    sarrayDestroy(&sa);
2356
2357
        /* Verify correct parsing */
2358
0
    trailer_ok = TRUE;
2359
0
    for (i = 1; i < nobj; i++) {
2360
0
        l_dnaGetIValue(da, i, &startloc);
2361
0
        if ((sscanf((char *)(data + startloc), "%d 0 obj", &objno)) != 1) {
2362
0
            L_ERROR("bad trailer for object %d\n", __func__, i);
2363
0
            trailer_ok = FALSE;
2364
0
            break;
2365
0
        }
2366
0
    }
2367
2368
        /* If the trailer is broken, reconstruct the correct obj locations */
2369
0
    if (!trailer_ok) {
2370
0
        L_INFO("rebuilding pdf trailer\n", __func__);
2371
0
        l_dnaEmpty(da);
2372
0
        l_dnaAddNumber(da, 0);
2373
0
        l_byteaFindEachSequence(bas, (l_uint8 *)" 0 obj\n", 7, &daobj);
2374
0
        nobj = l_dnaGetCount(daobj);
2375
0
        for (i = 0; i < nobj; i++) {
2376
0
            l_dnaGetIValue(daobj, i, &loc);
2377
0
            for (j = loc - 1; j > 0; j--) {
2378
0
                if (data[j] == nl)
2379
0
                    break;
2380
0
            }
2381
0
            l_dnaAddNumber(da, j + 1);
2382
0
        }
2383
0
        l_byteaFindEachSequence(bas, (l_uint8 *)"xref", 4, &daxref);
2384
0
        l_dnaGetIValue(daxref, 0, &loc);
2385
0
        l_dnaAddNumber(da, loc);
2386
0
        l_dnaDestroy(&daobj);
2387
0
        l_dnaDestroy(&daxref);
2388
0
    }
2389
2390
0
    return 0;
2391
0
}
2392
2393
2394
static char *
2395
generatePagesObjStringPdf(NUMA  *napage)
2396
0
{
2397
0
char    *str;
2398
0
char    *buf;
2399
0
l_int32  i, n, index, bufsize;
2400
0
SARRAY  *sa;
2401
2402
0
    if (!napage)
2403
0
        return (char *)ERROR_PTR("napage not defined", __func__, NULL);
2404
2405
0
    n = numaGetCount(napage);
2406
0
    bufsize = 100 + 16 * n;  /* large enough to hold the output string */
2407
0
    buf = (char *)LEPT_CALLOC(bufsize, sizeof(char));
2408
0
    sa = sarrayCreate(n);
2409
0
    for (i = 0; i < n; i++) {
2410
0
        numaGetIValue(napage, i, &index);
2411
0
        snprintf(buf, bufsize, " %d 0 R ", index);
2412
0
        sarrayAddString(sa, buf, L_COPY);
2413
0
    }
2414
2415
0
    str = sarrayToString(sa, 0);
2416
0
    snprintf(buf, bufsize - 1, "3 0 obj\n"
2417
0
                               "<<\n"
2418
0
                               "/Type /Pages\n"
2419
0
                               "/Kids [%s]\n"
2420
0
                               "/Count %d\n"
2421
0
                               ">>\n"
2422
0
                               "endobj\n",
2423
0
                               str, n);
2424
0
    sarrayDestroy(&sa);
2425
0
    LEPT_FREE(str);
2426
0
    return buf;
2427
0
}
2428
2429
2430
/*!
2431
 * \brief   substituteObjectNumbers()
2432
 *
2433
 * \param[in]   bas        lba of a pdf object
2434
 * \param[in]   na_objs    object number mapping array
2435
 * \return    bad   lba of rewritten pdf for the object
2436
 *
2437
 * <pre>
2438
 * Notes:
2439
 *      (1) Interpret the first set of bytes as the object number,
2440
 *          map to the new number, and write it out.
2441
 *      (2) Find all occurrences of this 4-byte sequence: " 0 R"
2442
 *      (3) Find the location and value of the integer preceding this,
2443
 *          and map it to the new value.
2444
 *      (4) Rewrite the object with new object numbers.
2445
 * </pre>
2446
 */
2447
static L_BYTEA *
2448
substituteObjectNumbers(L_BYTEA  *bas,
2449
                        NUMA     *na_objs)
2450
0
{
2451
0
l_uint8   space = ' ';
2452
0
l_uint8  *datas;
2453
0
l_uint8   buf[32];  /* only needs to hold one integer in ascii format */
2454
0
l_int32   start, nrepl, i, j, nobjs, objin, objout, found;
2455
0
l_int32  *objs, *matches;
2456
0
size_t    size;
2457
0
L_BYTEA  *bad;
2458
0
L_DNA    *da_match;
2459
2460
0
    if (!bas)
2461
0
        return (L_BYTEA *)ERROR_PTR("bas not defined", __func__, NULL);
2462
0
    if (!na_objs)
2463
0
        return (L_BYTEA *)ERROR_PTR("na_objs not defined", __func__, NULL);
2464
2465
0
    datas = l_byteaGetData(bas, &size);
2466
0
    bad = l_byteaCreate(100);
2467
0
    objs = numaGetIArray(na_objs);  /* object number mapper */
2468
0
    nobjs = numaGetCount(na_objs);  /* use for sanity checking */
2469
2470
        /* Substitute the object number on the first line */
2471
0
    sscanf((char *)datas, "%d", &objin);
2472
0
    if (objin < 0 || objin >= nobjs) {
2473
0
        L_ERROR("index %d into array of size %d\n", __func__, objin, nobjs);
2474
0
        LEPT_FREE(objs);
2475
0
        return bad;
2476
0
    }
2477
0
    objout = objs[objin];
2478
0
    snprintf((char *)buf, 32, "%d", objout);
2479
0
    l_byteaAppendString(bad, (char *)buf);
2480
2481
        /* Find the set of matching locations for object references */
2482
0
    arrayFindSequence(datas, size, &space, 1, &start, &found);
2483
0
    da_match = arrayFindEachSequence(datas, size, (l_uint8 *)" 0 R", 4);
2484
0
    if (!da_match) {
2485
0
        l_byteaAppendData(bad, datas + start, size - start);
2486
0
        LEPT_FREE(objs);
2487
0
        return bad;
2488
0
    }
2489
2490
        /* Substitute all the object reference numbers */
2491
0
    nrepl = l_dnaGetCount(da_match);
2492
0
    matches = l_dnaGetIArray(da_match);
2493
0
    for (i = 0; i < nrepl; i++) {
2494
            /* Find the first space before the object number */
2495
0
        for (j = matches[i] - 1; j > 0; j--) {
2496
0
            if (datas[j] == space)
2497
0
                break;
2498
0
        }
2499
            /* Copy bytes from 'start' up to the object number */
2500
0
        l_byteaAppendData(bad, datas + start, j - start + 1);
2501
0
        sscanf((char *)(datas + j + 1), "%d", &objin);
2502
0
        if (objin < 0 || objin >= nobjs) {
2503
0
            L_ERROR("index %d into array of size %d\n", __func__, objin, nobjs);
2504
0
            LEPT_FREE(objs);
2505
0
            LEPT_FREE(matches);
2506
0
            l_dnaDestroy(&da_match);
2507
0
            return bad;
2508
0
        }
2509
0
        objout = objs[objin];
2510
0
        snprintf((char *)buf, 32, "%d", objout);
2511
0
        l_byteaAppendString(bad, (char *)buf);
2512
0
        start = matches[i];
2513
0
    }
2514
0
    l_byteaAppendData(bad, datas + start, size - start);
2515
2516
0
    LEPT_FREE(objs);
2517
0
    LEPT_FREE(matches);
2518
0
    l_dnaDestroy(&da_match);
2519
0
    return bad;
2520
0
}
2521
2522
2523
/*---------------------------------------------------------------------*
2524
 *                     Create/destroy/access pdf data                  *
2525
 *---------------------------------------------------------------------*/
2526
static L_PDF_DATA *
2527
pdfdataCreate(const char  *title)
2528
0
{
2529
0
L_PDF_DATA *lpd;
2530
2531
0
    lpd = (L_PDF_DATA *)LEPT_CALLOC(1, sizeof(L_PDF_DATA));
2532
0
    if (title) lpd->title = stringNew(title);
2533
0
    lpd->cida = ptraCreate(10);
2534
0
    lpd->xy = ptaCreate(10);
2535
0
    lpd->wh = ptaCreate(10);
2536
0
    lpd->saprex = sarrayCreate(10);
2537
0
    lpd->sacmap = sarrayCreate(10);
2538
0
    lpd->objsize = l_dnaCreate(20);
2539
0
    lpd->objloc = l_dnaCreate(20);
2540
0
    return lpd;
2541
0
}
2542
2543
static void
2544
pdfdataDestroy(L_PDF_DATA  **plpd)
2545
0
{
2546
0
l_int32       i;
2547
0
L_COMP_DATA  *cid;
2548
0
L_PDF_DATA   *lpd;
2549
2550
0
    if (plpd== NULL) {
2551
0
        L_WARNING("ptr address is null!\n", __func__);
2552
0
        return;
2553
0
    }
2554
0
    if ((lpd = *plpd) == NULL)
2555
0
        return;
2556
2557
0
    if (lpd->title) LEPT_FREE(lpd->title);
2558
0
    for (i = 0; i < lpd->n; i++) {
2559
0
        cid = (L_COMP_DATA *)ptraRemove(lpd->cida, i, L_NO_COMPACTION);
2560
0
        l_CIDataDestroy(&cid);
2561
0
    }
2562
2563
0
    ptraDestroy(&lpd->cida, 0, 0);
2564
0
    if (lpd->id) LEPT_FREE(lpd->id);
2565
0
    if (lpd->obj1) LEPT_FREE(lpd->obj1);
2566
0
    if (lpd->obj2) LEPT_FREE(lpd->obj2);
2567
0
    if (lpd->obj3) LEPT_FREE(lpd->obj3);
2568
0
    if (lpd->obj4) LEPT_FREE(lpd->obj4);
2569
0
    if (lpd->obj5) LEPT_FREE(lpd->obj5);
2570
0
    if (lpd->poststream) LEPT_FREE(lpd->poststream);
2571
0
    if (lpd->trailer) LEPT_FREE(lpd->trailer);
2572
0
    if (lpd->xy) ptaDestroy(&lpd->xy);
2573
0
    if (lpd->wh) ptaDestroy(&lpd->wh);
2574
0
    if (lpd->mediabox) boxDestroy(&lpd->mediabox);
2575
0
    if (lpd->saprex) sarrayDestroy(&lpd->saprex);
2576
0
    if (lpd->sacmap) sarrayDestroy(&lpd->sacmap);
2577
0
    if (lpd->objsize) l_dnaDestroy(&lpd->objsize);
2578
0
    if (lpd->objloc) l_dnaDestroy(&lpd->objloc);
2579
0
    LEPT_FREE(lpd);
2580
0
    *plpd = NULL;
2581
0
}
2582
2583
2584
static L_COMP_DATA *
2585
pdfdataGetCid(L_PDF_DATA  *lpd,
2586
              l_int32      index)
2587
0
{
2588
0
    if (!lpd)
2589
0
        return (L_COMP_DATA *)ERROR_PTR("lpd not defined", __func__, NULL);
2590
0
    if (index < 0 || index >= lpd->n)
2591
0
        return (L_COMP_DATA *)ERROR_PTR("invalid image index", __func__, NULL);
2592
2593
0
    return (L_COMP_DATA *)ptraGetPtrToItem(lpd->cida, index);
2594
0
}
2595
2596
2597
/*---------------------------------------------------------------------*
2598
 *                     Find number of pages in a pdf                   *
2599
 *---------------------------------------------------------------------*/
2600
/*!
2601
 * \brief   getPdfPageCount()
2602
 *
2603
 * \param[in]    fname      filename
2604
 * \param[out]   pnpages    number of pages
2605
 * \return  0 if OK, 1 on error
2606
 *
2607
 * <pre>
2608
 * Notes:
2609
 *      (1) Looks for the argument of the first instance of /Count in the file.
2610
 *      (2) This first reads 10000 bytes from the beginning of the file.
2611
 *          If "/Count" is not in that string, it reads the entire file
2612
 *          and looks for "/Count".
2613
 *      (3) This will not work on encrypted pdf files or on files where
2614
 *          the "/Count" field is binary compressed.  Not finding the
2615
 *          "/Count" field is not an error, but a warning is given.
2616
 * </pre>
2617
 */
2618
l_ok
2619
getPdfPageCount(const char  *fname,
2620
                l_int32     *pnpages)
2621
0
{
2622
0
l_uint8  *data;
2623
0
l_int32   format, loc, ret, npages, found;
2624
0
size_t    nread;
2625
2626
0
    if (!pnpages)
2627
0
        return ERROR_INT("&npages not defined", __func__, 1);
2628
0
    *pnpages = 0;
2629
0
    if (!fname)
2630
0
        return ERROR_INT("fname not defined", __func__, 1);
2631
2632
        /* Make sure this a pdf file */
2633
0
    findFileFormat(fname, &format);
2634
0
    if (format != IFF_LPDF)
2635
0
        return ERROR_INT("file is not pdf", __func__, 1);
2636
2637
        /* Read 10000 bytes from the beginning of the file */
2638
0
    if ((data = l_binaryReadSelect(fname, 0, 10000, &nread))
2639
0
                 == NULL)
2640
0
        return ERROR_INT("partial data not read", __func__, 1);
2641
2642
        /* Find the location of the first instance of "/Count".
2643
         * If it is not found, try reading the entire file and
2644
         * looking again. */
2645
0
    arrayFindSequence(data, nread, (const l_uint8 *)"/Count",
2646
0
          strlen("/Count"), &loc, &found);
2647
0
    if (!found) {
2648
0
        lept_stderr("Reading entire file looking for '/Count'\n");
2649
0
        LEPT_FREE(data);
2650
0
        if ((data = l_binaryRead(fname, &nread)) == NULL)
2651
0
            return ERROR_INT("full data not read", __func__, 1);
2652
0
        arrayFindSequence(data, nread, (const l_uint8 *)"/Count",
2653
0
             strlen("/Count"), &loc, &found);
2654
0
        if (!found) {
2655
0
            LEPT_FREE(data);
2656
0
            L_WARNING("/Count not found\n", __func__);
2657
0
            return 0;
2658
0
        }
2659
0
    }
2660
2661
        /* Unlikely: make sure we can read the count field */
2662
0
    if (nread - loc < 12)  { /* haven't read enough to capture page count */
2663
0
        LEPT_FREE(data);
2664
0
        return ERROR_INT("data may not include page count field", __func__, 1);
2665
0
    }
2666
2667
        /* Read the page count; if not found, puts garbage in npages */
2668
0
    ret = sscanf((char *)&data[loc], "/Count %d", &npages);
2669
0
    LEPT_FREE(data);
2670
0
    if (ret != 1)
2671
0
        return ERROR_INT("npages not found", __func__, 1);
2672
0
    *pnpages = npages;
2673
/*    lept_stderr("bytes read = %d, loc = %d, npages = %d\n",
2674
                nread, loc, *pnpages);  */
2675
0
    return 0;
2676
0
}
2677
2678
2679
/*---------------------------------------------------------------------*
2680
 *      Find widths and heights of pages and media boxes in a pdf      *
2681
 *---------------------------------------------------------------------*/
2682
/*!
2683
 * \brief   getPdfPageSizes()
2684
 *
2685
 * \param[in]    fname        filename
2686
 * \param[out]   pnaw         [optional] array of page widths
2687
 * \param[out]   pnah         [optional] array of page heights
2688
 * \param[out]   pmedw        [optional] median page width
2689
 * \param[out]   pmedh        [optional] median page height
2690
 * \return  0 if OK, 1 on error
2691
 *
2692
 * <pre>
2693
 * Notes:
2694
 *      (1) Finds the arguments of each instance of '/Width' and '/Height'
2695
 *          in the file.
2696
 *      (2) This will not work on encrypted pdf files or on files where
2697
 *          the "/Width" and "/Height" fields are binary compressed.
2698
 *          Not finding the "/Width" and /Height" fields is not an error,
2699
 *          but a warning is given.
2700
 * </pre>
2701
 */
2702
l_ok
2703
getPdfPageSizes(const char  *fname,
2704
                NUMA       **pnaw,
2705
                NUMA       **pnah,
2706
                l_int32     *pmedw,
2707
                l_int32     *pmedh)
2708
0
{
2709
0
l_uint8   *data;
2710
0
l_int32    i, nw, nh, format, ret, loc, width, height;
2711
0
l_float32  fval;
2712
0
size_t     nread;
2713
0
L_DNA     *dnaw;  /* width locations */
2714
0
L_DNA     *dnah;  /* height locations */
2715
0
NUMA      *naw;   /* widths */
2716
0
NUMA      *nah;   /* heights */
2717
2718
0
    if (pnaw) *pnaw = NULL;
2719
0
    if (pnah) *pnah = NULL;
2720
0
    if (pmedw) *pmedw = 0;
2721
0
    if (pmedh) *pmedh = 0;
2722
0
    if (!pnaw && !pnah && !pmedw && !pmedh)
2723
0
        return ERROR_INT("no output requested", __func__, 1);
2724
0
    if (!fname)
2725
0
        return ERROR_INT("fname not defined", __func__, 1);
2726
2727
        /* Make sure this a pdf file */
2728
0
    findFileFormat(fname, &format);
2729
0
    if (format != IFF_LPDF)
2730
0
        return ERROR_INT("file is not pdf", __func__, 1);
2731
2732
        /* Read the file into memory and find all locations of
2733
         * '/Width' and '/Height' */
2734
0
    if ((data = l_binaryRead(fname, &nread)) == NULL)
2735
0
        return ERROR_INT("full data not read", __func__, 1);
2736
0
    dnaw = arrayFindEachSequence(data, nread, (const l_uint8 *)"/Width",
2737
0
                                 strlen("/Width"));
2738
0
    dnah = arrayFindEachSequence(data, nread, (const l_uint8 *)"/Height",
2739
0
                                 strlen("/Height"));
2740
0
    if (!dnaw)
2741
0
        L_WARNING("unable to find widths\n", __func__);
2742
0
    if (!dnah)
2743
0
        L_WARNING("unable to find heights\n", __func__);
2744
0
    if (!dnaw && !dnah) {
2745
0
        LEPT_FREE(data);
2746
0
        L_WARNING("no fields found\n", __func__);
2747
0
        return 0;
2748
0
    }
2749
2750
        /* Find the page widths and heights */
2751
0
    nw = l_dnaGetCount(dnaw);
2752
0
    naw = numaCreate(nw);
2753
0
    for (i = 0; i < nw; i++) {
2754
0
        l_dnaGetIValue(dnaw, i, &loc);
2755
0
        ret = sscanf((char *)&data[loc], "/Width %d", &width);
2756
0
        if (ret != 1) {
2757
0
            L_ERROR("width not found for item %d at loc %d\n",
2758
0
                    __func__, i, loc);
2759
0
            continue;
2760
0
        }
2761
0
        numaAddNumber(naw, width);
2762
0
    }
2763
0
    nh = l_dnaGetCount(dnah);
2764
0
    nah = numaCreate(nh);
2765
0
    for (i = 0; i < nh; i++) {
2766
0
        l_dnaGetIValue(dnah, i, &loc);
2767
0
        ret = sscanf((char *)&data[loc], "/Height %d", &height);
2768
0
        if (ret != 1) {
2769
0
            L_ERROR("height not found for item %d at loc %d\n",
2770
0
                    __func__, i, loc);
2771
0
            continue;
2772
0
        }
2773
0
        numaAddNumber(nah, height);
2774
0
    }
2775
2776
0
    LEPT_FREE(data);
2777
0
    l_dnaDestroy(&dnaw);
2778
0
    l_dnaDestroy(&dnah);
2779
0
    if (pmedw) {
2780
0
        numaGetMedian(naw, &fval);
2781
0
        *pmedw = lept_roundftoi(fval);
2782
0
    }
2783
0
    if (pnaw)
2784
0
        *pnaw = naw;
2785
0
    else
2786
0
        numaDestroy(&naw);
2787
0
    if (pmedh) {
2788
0
        numaGetMedian(nah, &fval);
2789
0
        *pmedh = lept_roundftoi(fval);
2790
0
    }
2791
0
    if (pnah)
2792
0
        *pnah = nah;
2793
0
    else
2794
0
        numaDestroy(&nah);
2795
0
    return 0;
2796
0
}
2797
2798
2799
/*!
2800
 * \brief   getPdfMediaBoxSizes()
2801
 *
2802
 * \param[in]    fname        filename
2803
 * \param[out]   pnaw         [optional] array of mediabox widths
2804
 * \param[out]   pnah         [optional] array of mediabox heights
2805
 * \param[out]   pmedw        [optional] median mediabox width
2806
 * \param[out]   pmedh        [optional] median mediabox height
2807
 * \return  0 if OK, 1 on error
2808
 *
2809
 * <pre>
2810
 * Notes:
2811
 *      (1) Finds the arguments of each instance of '/MediaBox' in the file.
2812
 *      (2) This will not work on encrypted pdf files or on files where
2813
 *          the "/MediaBoxes" field is binary compressed.  Not finding
2814
 *          the "/MediaBoxes" field is not an error, but a warning is given.
2815
 *      (3) This is useful for determining if the media boxes are
2816
 *          incorrectly assigned, such as assuming the resolution is 72 ppi.
2817
 *          If that happens and the input the the renderer assumes the
2818
 *          resolution is 300 ppi, the rendered images will be over 4x too
2819
 *          large in each dimension.
2820
 *      (4) An image dimension of 11 inches corresponds to a MediaBox
2821
 *          parameter of 792.  We consider a value > 850 to be oversized
2822
 *          and not to be taken literally.
2823
 * </pre>
2824
 */
2825
l_ok
2826
getPdfMediaBoxSizes(const char  *fname,
2827
                    NUMA       **pnaw,
2828
                    NUMA       **pnah,
2829
                    l_int32     *pmedw,
2830
                    l_int32     *pmedh)
2831
0
{
2832
0
l_uint8   *data;
2833
0
l_int32    i, n, format, ret, loc;
2834
0
l_float32  fval, ignore1, ignore2, w, h;
2835
0
size_t     nread;
2836
0
L_DNA     *dna;   /* mediabox locations */
2837
0
NUMA      *naw;   /* mediabox widths */
2838
0
NUMA      *nah;   /* mediabox heights */
2839
2840
0
    if (pnaw) *pnaw = NULL;
2841
0
    if (pnah) *pnah = NULL;
2842
0
    if (pmedw) *pmedw = 0;
2843
0
    if (pmedh) *pmedh = 0;
2844
0
    if (!pnaw && !pnah && !pmedw && !pmedh)
2845
0
        return ERROR_INT("no output requested", __func__, 1);
2846
0
    if (!fname)
2847
0
        return ERROR_INT("fname not defined", __func__, 1);
2848
2849
        /* Make sure this a pdf file */
2850
0
    findFileFormat(fname, &format);
2851
0
    if (format != IFF_LPDF)
2852
0
        return ERROR_INT("file is not pdf", __func__, 1);
2853
2854
        /* Read the file into memory and find all locations of '/MediaBox' */
2855
0
    if ((data = l_binaryRead(fname, &nread)) == NULL)
2856
0
        return ERROR_INT("full data not read", __func__, 1);
2857
0
    dna = arrayFindEachSequence(data, nread, (const l_uint8 *)"/MediaBox",
2858
0
                                strlen("/MediaBox"));
2859
0
    if (!dna) {
2860
0
        LEPT_FREE(data);
2861
0
        L_WARNING("no mediaboxes found\n", __func__);
2862
0
        return 1;
2863
0
    }
2864
2865
        /* Find the mediabox widths and heights */
2866
0
    n = l_dnaGetCount(dna);
2867
0
    naw = numaCreate(n);
2868
0
    nah = numaCreate(n);
2869
0
    for (i = 0; i < n; i++) {
2870
0
        l_dnaGetIValue(dna, i, &loc);
2871
0
        ret = sscanf((char *)&data[loc], "/MediaBox [ %f %f %f %f",
2872
0
                     &ignore1, &ignore2, &w, &h);
2873
0
        if (ret != 4) {
2874
0
            L_ERROR("mediabox sizes not found for item %d at loc %d\n",
2875
0
                    __func__, i, loc);
2876
0
            continue;
2877
0
        }
2878
0
        numaAddNumber(naw, w);
2879
0
        numaAddNumber(nah, h);
2880
0
    }
2881
0
    LEPT_FREE(data);
2882
0
    l_dnaDestroy(&dna);
2883
2884
0
    if (pmedw) {
2885
0
        numaGetMedian(naw, &fval);
2886
0
        *pmedw = lept_roundftoi(fval);
2887
0
        if (*pmedw > 850) lept_stderr("oversize width: %d\n", *pmedw);
2888
0
    }
2889
0
    if (pnaw)
2890
0
        *pnaw = naw;
2891
0
    else
2892
0
        numaDestroy(&naw);
2893
0
    if (pmedh) {
2894
0
        numaGetMedian(nah, &fval);
2895
0
        *pmedh = lept_roundftoi(fval);
2896
0
        if (*pmedh > 850) lept_stderr("oversize height: %d\n", *pmedh);
2897
0
    }
2898
0
    if (pnah)
2899
0
        *pnah = nah;
2900
0
    else
2901
0
        numaDestroy(&nah);
2902
0
    return 0;
2903
0
}
2904
2905
2906
/*---------------------------------------------------------------------*
2907
 *       Find effective resolution of images rendered from a pdf       *
2908
 *---------------------------------------------------------------------*/
2909
/*!
2910
 * \brief   getPdfRendererResolution()
2911
 *
2912
 * \param[in]    infile       filename of input pdf file
2913
 * \param[in]    outdir       directory of rendered output images
2914
 * \param[out]   pres         desired resolution to use with renderer
2915
 * \return  0 if OK, 1 on error
2916
 *
2917
 * <pre>
2918
 * Notes:
2919
 *      (1) Finds the input resolution to pdftoppm that will generate
2920
 *          images with a maximum dimension of about 3300 pixels,
2921
 *          representing a full page at 300 ppi.
2922
 *      (2) It is most important is to make sure the renderer does
2923
 *          not make huge images because of an error in /MediaBox.
2924
 *          An image dimension of 11 inches corresponds to a MediaBox
2925
 *          parameter of 792.  We consider a value > 850 to be oversized
2926
 *          and not to be taken literally.  If the mediaboxes are
2927
 *          oversized, choose an appropriate lower resolution.
2928
 *      (3) If the mediaboxes are not accessible, render an image at
2929
 *          a low known resolution (say, 72 ppi) and based on the image
2930
 *          size, determine the resolution necessary to make an image
2931
 *          with 3300 pixels in the largest dimension.
2932
 *      (4) Requires pdftoppm, so this is disabled on windows for now.
2933
 *      (5) Requires the ability to call an external program, so it is
2934
 *          necessary to call setLeptDebugOK(1) before this function.
2935
 * </pre>
2936
 */
2937
l_ok
2938
getPdfRendererResolution(const char  *infile,
2939
                         const char  *outdir,
2940
                         l_int32     *pres)
2941
0
{
2942
0
char      buf[256];
2943
0
char     *tail, *basename, *fname;
2944
0
l_int32   ret, res, medw, medh, medmax, npages, pageno, w, h;
2945
0
SARRAY   *sa;
2946
2947
0
    if (!pres)
2948
0
        return ERROR_INT("&res not defined", __func__, 1);
2949
0
    *pres = 300;  /* default */
2950
2951
#ifdef _WIN32
2952
    L_INFO("Requires pdftoppm, so this is disabled on windows.\n"
2953
           "Returns default resolution 300 ppi", __func__);
2954
    return 0;
2955
#endif  /* _WIN32 */
2956
2957
0
    if (!LeptDebugOK) {
2958
0
        L_INFO("Running pdftoppm is disabled; "
2959
0
               "use setLeptDebugOK(1) to enable\n",
2960
0
               "returns default resolution 300 ppi\n", __func__);
2961
0
        return 1;
2962
0
    }
2963
2964
0
    if (!infile)
2965
0
        return ERROR_INT("infile not defined", __func__, 1);
2966
0
    if (!outdir)
2967
0
        return ERROR_INT("outdir not defined", __func__, 1);
2968
2969
0
    res = 300;  /* default value */
2970
0
    ret = getPdfMediaBoxSizes(infile, NULL, NULL, &medw, &medh);
2971
0
    if (ret == 0) {  /* Check for oversize mediaboxes */
2972
0
        lept_stderr("Media Box medians: medw = %d, medh = %d\n", medw, medh);
2973
0
        medmax = L_MAX(medw, medh);
2974
0
        if (medmax > 850) {
2975
0
            res = 300 * ((l_float32)792 / (l_float32)medmax);
2976
0
            lept_stderr(" Oversize media box; use resolution = %d\n", res);
2977
0
            *pres = res;
2978
0
        }
2979
0
        return 0;
2980
0
    }
2981
2982
        /* No mediaboxes; render one page and measure the max dimension */
2983
0
    lept_stderr("Media Box dimensions not found\n");
2984
0
    getPdfPageCount(infile, &npages);
2985
0
    pageno = (npages > 0) ? (npages + 1) / 2 : 1;
2986
0
    splitPathAtDirectory(infile, NULL, &tail);
2987
0
    splitPathAtExtension(tail, &basename, NULL);
2988
0
    snprintf(buf, sizeof(buf), "pdftoppm -f %d -l %d -r 72 %s %s/%s",
2989
0
             pageno, pageno, infile, outdir, basename);
2990
0
    LEPT_FREE(tail);
2991
0
    LEPT_FREE(basename);
2992
0
    callSystemDebug(buf);  /* pdftoppm */
2993
2994
        /* Get the page size */
2995
0
    sa = getSortedPathnamesInDirectory(outdir, NULL, 0, 0);
2996
0
    fname = sarrayGetString(sa, 0, L_NOCOPY);
2997
0
    pixReadHeader(fname, NULL, &w, &h, NULL, NULL, NULL);
2998
0
    sarrayDestroy(&sa);
2999
0
    if (w > 0 && h > 0) {
3000
0
        res = L_MIN((72 * 3300 / L_MAX(w, h)), 600);
3001
0
        *pres = res;
3002
0
        lept_stderr("Use resolution = %d\n", res);
3003
0
    } else {
3004
0
        L_ERROR("page size not found; assuming res = 300\n", __func__);
3005
0
    }
3006
3007
0
    return 0;
3008
0
}
3009
3010
3011
/*---------------------------------------------------------------------*
3012
 *                      Set flags for special modes                    *
3013
 *---------------------------------------------------------------------*/
3014
/*!
3015
 * \brief   l_pdfSetG4ImageMask()
3016
 *
3017
 * \param[in]    flag    1 for writing g4 data as fg only through a mask;
3018
 *                       0 for writing fg and bg
3019
 * \return  void
3020
 *
3021
 * <pre>
3022
 * Notes:
3023
 *      (1) The default is for writing only the fg (through the mask).
3024
 *          That way when you write a 1 bpp image, the bg is transparent,
3025
 *          so any previously written image remains visible behind it.
3026
 * </pre>
3027
 */
3028
void
3029
l_pdfSetG4ImageMask(l_int32  flag)
3030
0
{
3031
0
    var_WRITE_G4_IMAGE_MASK = flag;
3032
0
}
3033
3034
3035
/*!
3036
 * \brief   l_pdfSetDateAndVersion()
3037
 *
3038
 * \param[in]    flag    1 for writing date/time and leptonica version;
3039
 *                       0 for omitting this from the metadata
3040
 * \return  void
3041
 *
3042
 * <pre>
3043
 * Notes:
3044
 *      (1) The default is for writing this data.  For regression tests
3045
 *          that compare output against golden files, it is useful to omit.
3046
 * </pre>
3047
 */
3048
void
3049
l_pdfSetDateAndVersion(l_int32  flag)
3050
0
{
3051
0
    var_WRITE_DATE_AND_VERSION = flag;
3052
0
}
3053
3054
/* --------------------------------------------*/
3055
#endif  /* USE_PDFIO */
3056
/* --------------------------------------------*/